# Copyright 2014 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#
# This library implements the RAPPOR marginal decoding algorithms using LASSO.

library(glmnet)

# So we don't have to change pwd
source.rappor <- function(rel_path) {
  abs_path <- paste0(Sys.getenv("RAPPOR_REPO", ""), rel_path)
  source(abs_path)
}

source.rappor('analysis/R/alternative.R')

EstimateBloomCounts <- function(params, obs_counts) {
  # Estimates the number of times each bit in each cohort was set in original
  # Bloom filters.
  #
  # Input:
  #   params: a list of RAPPOR parameters:
  #     k - size of a Bloom filter
  #     h - number of hash functions
  #     m - number of cohorts
  #     p - P(IRR = 1 | PRR = 0)
  #     q - P(IRR = 1 | PRR = 1)
  #     f - Proportion of bits in the Bloom filter that are set randomly
  #         to 0 or 1 regardless of the underlying true bit value
  #   obs_counts: a matrix of size m by (k + 1). Column one contains sample
  #     sizes for each cohort. The other columns indicate how many times each
  #     bit was set in each cohort.
  #
  # Output:
  #   ests: a matrix of size m by k with estimated fractions of bits set to 1
  #     in the true Bloom filters of each cohort.
  #   stds: standard errors of the estimates, with the same dimensions.

  p <- params$p
  q <- params$q
  f <- params$f
  m <- params$m
  k <- params$k

  stopifnot(m == nrow(obs_counts), k + 1 == ncol(obs_counts))

  p11 <- q * (1 - f / 2) + p * f / 2  # probability of a true 1 reported as 1
  p01 <- p * (1 - f / 2) + q * f / 2  # probability of a true 0 reported as 1

  p2 <- p11 - p01  # == (1 - f) * (q - p)

  # When m = 1, obs_counts does not have the right dimensions. Fixing this.
  dim(obs_counts) <- c(m, k + 1)

  ests <- apply(obs_counts, 1, function(cohort_row) {
    N <- cohort_row[1]   # sample size for the cohort -- first column is total
    v <- cohort_row[-1]  # counts for individual bits
    (v - p01 * N) / p2   # unbiased estimator for individual bits'
                         # true counts. It can be negative or
                         # exceed the total.
  })

  # NOTE: When k == 1, rows of obs_counts have 2 entries. Then cohort_row[-1]
  # is a singleton vector, and apply() returns a *vector*. When rows have 3
  # entries, cohort_row[-1] is a vector of length 2 and apply() returns a
  # *matrix*.
  #
  # Fix this by explicitly setting dimensions. NOTE: It's k x m, not m x k.
  dim(ests) <- c(k, m)

  total <- sum(obs_counts[, 1])

  variances <- apply(obs_counts, 1, function(cohort_row) {
    N <- cohort_row[1]
    v <- cohort_row[-1]
    p_hats <- (v - p01 * N) / (N * p2)       # expectation of a true 1
    p_hats <- pmax(0, pmin(1, p_hats))       # clamp to [0,1]
    r <- p_hats * p11 + (1 - p_hats) * p01   # expectation of a reported 1
    N * r * (1 - r) / p2^2                   # variance of the binomial
  })

  dim(variances) <- c(k, m)

  # Transform counts from absolute values to fractional, removing bias due to
  # variability of reporting between cohorts.
  ests <- apply(ests, 1, function(x) x / obs_counts[, 1])
  stds <- apply(variances^.5, 1, function(x) x / obs_counts[, 1])

  # Some estimates may be set to infinity, e.g. if f = 1. We want to account
  # for this possibility, and set the corresponding estimates to 0.
  ests[abs(ests) == Inf] <- 0

  list(estimates = ests, stds = stds)
}
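
# Example (illustrative sketch, not part of the library): recovering true bit
# frequencies for a single cohort with hypothetical parameters. With p = 0.25,
# q = 0.75, f = 0.5, a true 1 is reported as 1 with probability
# p11 = 0.75 * 0.75 + 0.25 * 0.25 = 0.625, and a true 0 with probability
# p01 = 0.25 * 0.75 + 0.75 * 0.25 = 0.375, so a bit observed 625 times out of
# 1000 decodes to an estimated true frequency of (625 - 375) / 0.25 / 1000 = 1.
#
#   params <- list(k = 4, h = 2, m = 1, p = 0.25, q = 0.75, f = 0.5)
#   obs <- matrix(c(1000, 625, 375, 500, 450), nrow = 1)  # col 1: sample size
#   es <- EstimateBloomCounts(params, obs)
#   es$estimates  # 1.0, 0.0, 0.5, 0.3 -- fractions of truly set bits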

FitLasso <- function(X, Y, intercept = TRUE) {
  # Fits a Lasso model to select a subset of columns of X.
  #
  # Input:
  #   X: a design matrix of size km by M (the number of candidate strings).
  #   Y: a vector of size km with estimated counts from EstimateBloomCounts().
  #   intercept: whether to fit with intercept or not.
  #
  # Output:
  #   a vector of size ncol(X) of coefficients.

  # TODO(mironov): Test cv.glmnet instead of glmnet
  mod <- try(glmnet(X, Y, standardize = FALSE, intercept = intercept,
                    lower.limits = 0,  # outputs are non-negative
                    # Cap the number of non-zero coefficients to 500 or
                    # 80% of the length of Y, whichever is less. The 500 cap
                    # is for performance reasons, 80% is to avoid overfitting.
                    pmax = min(500, length(Y) * .8)),
             silent = TRUE)

  # If fitting fails, return a vector of zeros rather than failing outright.
  if (class(mod)[1] == "try-error") {
    coefs <- setNames(rep(0, ncol(X)), colnames(X))
  } else {
    coefs <- coef(mod)
    coefs <- coefs[-1, ncol(coefs), drop = FALSE]  # coefs[1] is the intercept
  }
  coefs
}
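
# Usage sketch (hypothetical data): selecting a sparse subset of candidates.
# Columns of X play the role of candidate strings; only s1 and s2 actually
# contribute to Y, so the Lasso should assign them the bulk of the weight.
#
#   set.seed(1)
#   X <- matrix(rbinom(100 * 20, 1, 0.1), nrow = 100,
#               dimnames = list(NULL, paste0("s", 1:20)))
#   Y <- as.vector(X %*% c(5, 3, rep(0, 18))) + rnorm(100, sd = 0.1)
#   coefs <- FitLasso(X, Y)
#   which(as.matrix(coefs) > 0)  # indices of the selected candidates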

PerformInference <- function(X, Y, N, mod, params, alpha, correction) {
  # Decomposes the total variance of Y into the part explained by the selected
  # candidates (RSS), the theoretical noise floor (ESS), and the unexplained
  # remainder, and packages the fitted coefficients into a data frame.
  m <- params$m
  p <- params$p
  q <- params$q
  f <- params$f
  h <- params$h

  q2 <- .5 * f * (p + q) + (1 - f) * q
  p2 <- .5 * f * (p + q) + (1 - f) * p
  resid_var <- p2 * (1 - p2) * (N / m) / (q2 - p2)^2

  # Total Sum of Squares (SS).
  TSS <- sum((Y - mean(Y))^2)
  # Error Sum of Squares (ESS).
  ESS <- resid_var * nrow(X)

  betas <- matrix(mod$coefs, ncol = 1)

#   mod_var <- summary(mod$fit)$sigma^2
#   betas_sd <- rep(sqrt(max(resid_var, mod_var) / (m * h)), length(betas))
#
#   z_values <- betas / betas_sd
#
#   # 1-sided t-test.
#   p_values <- pnorm(z_values, lower = FALSE)

  fit <- data.frame(string = colnames(X), Estimate = betas,
                    SD = mod$stds,  # z_stat = z_values, pvalue = p_values,
                    stringsAsFactors = FALSE)

#   if (correction == "FDR") {
#     fit <- fit[order(fit$pvalue, decreasing = FALSE), ]
#     ind <- which(fit$pvalue < (1:nrow(fit)) * alpha / nrow(fit))
#     if (length(ind) > 0) {
#       fit <- fit[1:max(ind), ]
#     } else {
#       fit <- fit[numeric(0), ]
#     }
#   } else {
#     fit <- fit[fit$p < alpha, ]
#   }

  fit <- fit[order(fit$Estimate, decreasing = TRUE), ]

  if (nrow(fit) > 0) {
    str_names <- fit$string
    str_names <- str_names[!is.na(str_names)]
    if (length(str_names) > 0 && length(str_names) < nrow(X)) {
      this_data <- as.data.frame(as.matrix(X[, str_names]))
      Y_hat <- predict(lm(Y ~ ., data = this_data))
      RSS <- sum((Y_hat - mean(Y))^2)
    } else {
      RSS <- NA
    }
  } else {
    RSS <- 0
  }

  USS <- TSS - ESS - RSS
  SS <- c(RSS, USS, ESS) / TSS

  list(fit = fit, SS = SS, resid_sigma = sqrt(resid_var))
}
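
# Worked example of the noise model (hypothetical parameters): with p = 0.25,
# q = 0.75, f = 0.5 the effective reporting probabilities are
#   q2 = 0.5 * 0.5 * (0.25 + 0.75) + 0.5 * 0.75 = 0.625
#   p2 = 0.5 * 0.5 * (0.25 + 0.75) + 0.5 * 0.25 = 0.375
# For N = 10000 reports over m = 10 cohorts, the per-bit residual variance is
# 0.375 * 0.625 * 1000 / (0.625 - 0.375)^2 = 3750, i.e. a theoretical noise
# standard deviation (resid_sigma) of about 61 counts.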

ComputePrivacyGuarantees <- function(params, alpha, N) {
  # Compute privacy parameters and guarantees.
  p <- params$p
  q <- params$q
  f <- params$f
  h <- params$h

  q2 <- .5 * f * (p + q) + (1 - f) * q
  p2 <- .5 * f * (p + q) + (1 - f) * p

  exp_e_one <- ((q2 * (1 - p2)) / (p2 * (1 - q2)))^h
  if (exp_e_one < 1) {
    exp_e_one <- 1 / exp_e_one
  }
  e_one <- log(exp_e_one)

  exp_e_inf <- ((1 - .5 * f) / (.5 * f))^(2 * h)
  e_inf <- log(exp_e_inf)

  std_dev_counts <- sqrt(p2 * (1 - p2) * N) / (q2 - p2)
  detection_freq <- qnorm(1 - alpha) * std_dev_counts / N

  privacy_names <- c("Effective p", "Effective q", "exp(e_1)",
                     "e_1", "exp(e_inf)", "e_inf", "Detection frequency")
  privacy_vals <- c(p2, q2, exp_e_one, e_one, exp_e_inf, e_inf, detection_freq)

  privacy <- data.frame(parameters = privacy_names,
                        values = privacy_vals)
  privacy
}
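
# Worked example (hypothetical parameters): with f = 0.5 and h = 2 the
# one-time (permanent randomized response) bound is
#   exp(e_inf) = ((1 - 0.25) / 0.25)^(2 * 2) = 3^4 = 81,
# i.e. e_inf = log(81) ~ 4.39. ComputePrivacyGuarantees() returns this along
# with the effective p and q, the analogous bound e_1 computed from the
# effective probabilities, and the detection frequency.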

FitDistribution <- function(estimates_stds, map, quiet = FALSE) {
  # Finds a distribution over the columns of map (the candidate strings) that
  # best approximates estimates_stds.
  #
  # Input:
  #   estimates_stds: a list of two m x k matrices, one for estimates, another
  #     for standard errors
  #   map: an (m * k) x S boolean matrix
  #
  # Output:
  #   a float vector of length S, so that a distribution over map's columns
  #   sampled according to this vector approximates the estimates

  S <- ncol(map)  # total number of candidates

  support_coefs <- 1:S

  if (S > length(estimates_stds$estimates) * .8) {
    # the system is close to being underdetermined
    lasso <- FitLasso(map, as.vector(t(estimates_stds$estimates)))

    # Select non-zero coefficients.
    support_coefs <- which(lasso > 0)

    if (!quiet)
      cat("LASSO selected ", length(support_coefs), " non-zero coefficients.\n")
  }

  coefs <- setNames(rep(0, S), colnames(map))

  if (length(support_coefs) > 0) {  # LASSO may return an empty list
    constrained_coefs <- ConstrainedLinModel(map[, support_coefs, drop = FALSE],
                                             estimates_stds)

    coefs[support_coefs] <- constrained_coefs
  }

  coefs
}
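
# Usage sketch (hypothetical inputs; ConstrainedLinModel comes from
# analysis/R/alternative.R, sourced above). Candidates are columns of a
# boolean map matrix with m * k rows:
#
#   set.seed(2)
#   map <- matrix(rbinom(8 * 6, 1, 0.25), nrow = 8,  # m = 2, k = 4
#                 dimnames = list(NULL, paste0("cand", 1:6)))
#   es <- list(estimates = matrix(runif(8), nrow = 2),
#              stds = matrix(0.01, nrow = 2, ncol = 4))
#   FitDistribution(es, map)  # named weight vector over the 6 candidates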

Resample <- function(e) {
  # Simulates resampling of the Bloom filter estimates by adding Gaussian
  # noise with the estimated standard deviation. Since independent noise of
  # the same magnitude is added to each estimate, the variance doubles and
  # the standard deviation grows by a factor of sqrt(2).
  estimates <- matrix(mapply(function(x, y) x + rnorm(1, 0, y),
                             e$estimates, e$stds),
                      nrow = nrow(e$estimates), ncol = ncol(e$estimates))
  stds <- e$stds * 2^.5

  list(estimates = estimates, stds = stds)
}

# Private function
# Decodes Boolean RAPPOR inputs.
# Returns a list with attribute fit only. (Inference and other aspects are
# currently not incorporated because they're unnecessary for association.)
.DecodeBoolean <- function(counts, params, num_reports) {
  # Boolean variables are reported without cohorts. To estimate counts, first
  # sum the counts across all cohorts, then run EstimateBloomCounts with the
  # number of cohorts set to 1.
  params$m <- 1                     # set number of cohorts to 1
  summed_counts <- colSums(counts)  # sum counts across cohorts
  es <- EstimateBloomCounts(params, summed_counts)  # estimate boolean counts

  ests <- es$estimates[[1]]
  std <- es$stds[[1]]

  fit <- data.frame(
    string = c("TRUE", "FALSE"),
    estimate = c(ests * num_reports,
                 num_reports - ests * num_reports),
    std_error = c(std * num_reports, std * num_reports),
    proportion = c(ests, 1 - ests),
    prop_std_error = c(std, std))

  low_95 <- fit$proportion - 1.96 * fit$prop_std_error
  high_95 <- fit$proportion + 1.96 * fit$prop_std_error

  fit$prop_low_95 <- pmax(low_95, 0.0)
  fit$prop_high_95 <- pmin(high_95, 1.0)
  rownames(fit) <- fit$string

  return(list(fit = fit))
}

CheckDecodeInputs <- function(counts, map, params) {
  # Returns an error message, or NULL if there is no error.

  if (nrow(map) != (params$m * params$k)) {
    return(sprintf(
      "Map matrix has invalid dimensions: m * k = %d, nrow(map) = %d",
      params$m * params$k, nrow(map)))
  }

  if ((ncol(counts) - 1) != params$k) {
    return(sprintf(paste0(
      "Dimensions of counts file do not match: m = %d, k = %d, ",
      "nrow(counts) = %d, ncol(counts) = %d"), params$m, params$k,
      nrow(counts), ncol(counts)))
  }

  # numerically correct comparison
  if (isTRUE(all.equal((1 - params$f) * (params$p - params$q), 0))) {
    return("Information is lost. Cannot decode.")
  }

  return(NULL)  # no error
}
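
# Example (hypothetical sizes): CheckDecodeInputs() returns NULL when the
# inputs are consistent and a diagnostic message otherwise.
#
#   params <- list(k = 4, h = 2, m = 2, p = 0.25, q = 0.75, f = 0.5)
#   counts <- matrix(0, nrow = 2, ncol = 5)        # m rows, k + 1 columns
#   map <- matrix(FALSE, nrow = 8, ncol = 3)       # (m * k) rows
#   CheckDecodeInputs(counts, map, params)         # NULL -- no error
#   CheckDecodeInputs(counts, map[1:4, ], params)  # "Map matrix has ..."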

Decode <- function(counts, map, params, alpha = 0.05,
                   correction = c("Bonferroni"), quiet = FALSE, ...)
{

  error_msg <- CheckDecodeInputs(counts, map, params)
  if (!is.null(error_msg)) {
    stop(error_msg)
  }

  k <- params$k
  p <- params$p
  q <- params$q
  f <- params$f
  h <- params$h
  m <- params$m

  S <- ncol(map)  # total number of candidates

  N <- sum(counts[, 1])
  if (k == 1) {
    return(.DecodeBoolean(counts, params, N))
  }

  filter_cohorts <- which(counts[, 1] != 0)  # exclude cohorts with zero reports

  # stretch cohorts to bits
  filter_bits <- as.vector(matrix(1:nrow(map), ncol = m)[, filter_cohorts,
                                                         drop = FALSE])

  map_filtered <- map[filter_bits, , drop = FALSE]

  es <- EstimateBloomCounts(params, counts)

  estimates_stds_filtered <-
    list(estimates = es$estimates[filter_cohorts, , drop = FALSE],
         stds = es$stds[filter_cohorts, , drop = FALSE])

  coefs_all <- vector()

  # Run the fitting procedure several times (5 seems to be sufficient and not
  # too many) to estimate the standard deviation of the output.
  for (r in 1:5) {
    if (r > 1)
      e <- Resample(estimates_stds_filtered)
    else
      e <- estimates_stds_filtered

    coefs_all <- rbind(coefs_all,
                       FitDistribution(e, map_filtered, quiet))
  }

  coefs_ssd <- N * apply(coefs_all, 2, sd)  # compute sample standard deviations
  coefs_ave <- N * apply(coefs_all, 2, mean)

  # Only select coefficients more than two standard deviations from 0. May
  # inflate empirical SD of the estimates.
  reported <- which(coefs_ave > 1E-6 + 2 * coefs_ssd)

  mod <- list(coefs = coefs_ave[reported], stds = coefs_ssd[reported])

  # Zero out everything but the reported coefficients. (setdiff() handles the
  # corner case of an empty `reported`, where negative indexing would leave
  # the vector untouched.)
  coefs_ave_zeroed <- coefs_ave
  coefs_ave_zeroed[setdiff(seq_along(coefs_ave), reported)] <- 0

  residual <- as.vector(t(estimates_stds_filtered$estimates)) -
    map_filtered %*% coefs_ave_zeroed / N

  if (correction == "Bonferroni") {
    alpha <- alpha / S
  }

  inf <- PerformInference(map_filtered[, reported, drop = FALSE],
                          as.vector(t(estimates_stds_filtered$estimates)),
                          N, mod, params, alpha,
                          correction)
  fit <- inf$fit
  # If this is a basic RAPPOR instance, just use the counts for the estimate.
  # (Check if the map is diagonal to tell if this is basic RAPPOR.)
  if (sum(map) == sum(diag(map))) {
    fit$Estimate <- colSums(counts)[-1]
  }

  # Estimates from the model are per instance so must be multiplied by h.
  # Standard errors are also adjusted.
  fit$estimate <- floor(fit$Estimate)
  fit$proportion <- fit$estimate / N

  fit$std_error <- floor(fit$SD)
  fit$prop_std_error <- fit$std_error / N

  # 1.96 standard deviations gives the 95% confidence interval.
  low_95 <- fit$proportion - 1.96 * fit$prop_std_error
  high_95 <- fit$proportion + 1.96 * fit$prop_std_error
  # Clamp the estimated proportion. pmin/pmax are vectorized min and max.
  fit$prop_low_95 <- pmax(low_95, 0.0)
  fit$prop_high_95 <- pmin(high_95, 1.0)

  fit <- fit[, c("string", "estimate", "std_error", "proportion",
                 "prop_std_error", "prop_low_95", "prop_high_95")]

  allocated_mass <- sum(fit$proportion)
  num_detected <- nrow(fit)

  ss <- round(inf$SS, digits = 3)
  explained_var <- ss[[1]]
  missing_var <- ss[[2]]
  noise_var <- ss[[3]]

  noise_std_dev <- round(inf$resid_sigma, digits = 3)

  # Compute summary of the fit.
  parameters <-
    c("Candidate strings", "Detected strings",
      "Sample size (N)", "Discovered Prop (out of N)",
      "Explained Variance", "Missing Variance", "Noise Variance",
      "Theoretical Noise Std. Dev.")
  values <- c(S, num_detected, N, allocated_mass,
              explained_var, missing_var, noise_var, noise_std_dev)

  res_summary <- data.frame(parameters = parameters, values = values)

  privacy <- ComputePrivacyGuarantees(params, alpha, N)
  params <- data.frame(parameters =
                       c("k", "h", "m", "p", "q", "f", "N", "alpha"),
                       values = c(k, h, m, p, q, f, N, alpha))

  # This is a list of decode stats in a better format than 'summary'.
  metrics <- list(sample_size = N,
                  allocated_mass = allocated_mass,
                  num_detected = num_detected,
                  explained_var = explained_var,
                  missing_var = missing_var)

  list(fit = fit, summary = res_summary, privacy = privacy, params = params,
       lasso = NULL, residual = as.vector(residual),
       counts = counts[, -1], resid = NULL, metrics = metrics,
       ests = es$estimates  # ests needed by Shiny rappor-sim app
       )
}
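
# End-to-end usage sketch (hypothetical file names, loader, and dimensions):
#
#   params <- list(k = 16, h = 2, m = 64, p = 0.25, q = 0.75, f = 0.5)
#   counts <- as.matrix(read.csv("counts.csv", header = FALSE))  # m x (k + 1)
#   map <- ReadMapFile("map.csv")  # hypothetical loader; (m * k) x S boolean
#   res <- Decode(counts, map, params)
#   res$fit      # detected strings with estimates and 95% intervals
#   res$summary  # fit diagnostics (explained/missing/noise variance)
#   res$privacy  # differential privacy guarantees for these parameters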

ComputeCounts <- function(reports, cohorts, params) {
  # Counts the number of times each bit in the Bloom filters was set for
  # each cohort.
  #
  # Args:
  #   reports: A list of N elements, each containing the
  #            report for a given user
  #   cohorts: A list of N elements, each containing the
  #            cohort number for a given report
  #   params: A list of parameters for the problem
  #
  # Returns:
  #   An m x (k + 1) array containing the number of times each bit was set
  #   in each cohort.

  # Check that the cohorts are evenly assigned. We assume that if there
  # are m cohorts, each cohort should have approximately N/m reports.
  # The constraint we impose here simply says that cohort bins should
  # each be within N/m reports of one another. Since the most popular
  # cohort is expected to have about O(logN/loglogN) excess reports (which
  # we approximate as O(logN)) for practical values of N, a discrepancy on
  # the order of N/m reports seems significant enough to alter expected
  # behavior. This threshold can be changed to be more sensitive if desired.
  N <- length(reports)
  cohort_freqs <- table(factor(cohorts, levels = 1:params$m))
  imbalance_threshold <- N / params$m
  if ((max(cohort_freqs) - min(cohort_freqs)) > imbalance_threshold) {
    cat("\nNote: You are using unbalanced cohort assignments, which can",
        "significantly degrade estimation quality!\n\n")
  }

  # Count the times each bit was set, and add cohort counts to first column
  counts <- lapply(1:params$m, function(i)
    Reduce("+", reports[which(cohorts == i)]))
  counts[which(cohort_freqs == 0)] <- data.frame(rep(0, params$k))
  cbind(cohort_freqs, do.call("rbind", counts))
}
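
# Example (illustrative): three reports across two cohorts, k = 4 bits each.
#
#   params <- list(k = 4, m = 2)
#   reports <- list(c(1, 0, 0, 1), c(0, 0, 1, 1), c(1, 1, 0, 0))
#   cohorts <- c(1, 1, 2)
#   ComputeCounts(reports, cohorts, params)
#   # cohort 1: 2 reports, bit counts 1 0 1 2
#   # cohort 2: 1 report,  bit counts 1 1 0 0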