xref: /aosp_15_r20/external/rappor/analysis/R/decode.R (revision 2abb31345f6c95944768b5222a9a5ed3fc68cc00)
1*2abb3134SXin Li# Copyright 2014 Google Inc. All rights reserved.
2*2abb3134SXin Li#
3*2abb3134SXin Li# Licensed under the Apache License, Version 2.0 (the "License");
4*2abb3134SXin Li# you may not use this file except in compliance with the License.
5*2abb3134SXin Li# You may obtain a copy of the License at
6*2abb3134SXin Li#
7*2abb3134SXin Li#     http://www.apache.org/licenses/LICENSE-2.0
8*2abb3134SXin Li#
9*2abb3134SXin Li# Unless required by applicable law or agreed to in writing, software
10*2abb3134SXin Li# distributed under the License is distributed on an "AS IS" BASIS,
11*2abb3134SXin Li# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12*2abb3134SXin Li# See the License for the specific language governing permissions and
13*2abb3134SXin Li# limitations under the License.
14*2abb3134SXin Li
15*2abb3134SXin Li#
16*2abb3134SXin Li# This library implements the RAPPOR marginal decoding algorithms using LASSO.
17*2abb3134SXin Li
18*2abb3134SXin Lilibrary(glmnet)
19*2abb3134SXin Li
20*2abb3134SXin Li# So we don't have to change pwd
21*2abb3134SXin Lisource.rappor <- function(rel_path)  {
22*2abb3134SXin Li  abs_path <- paste0(Sys.getenv("RAPPOR_REPO", ""), rel_path)
23*2abb3134SXin Li  source(abs_path)
24*2abb3134SXin Li}
25*2abb3134SXin Li
26*2abb3134SXin Lisource.rappor('analysis/R/alternative.R')
27*2abb3134SXin Li
28*2abb3134SXin LiEstimateBloomCounts <- function(params, obs_counts) {
29*2abb3134SXin Li  # Estimates the number of times each bit in each cohort was set in original
30*2abb3134SXin Li  # Bloom filters.
31*2abb3134SXin Li  #
32*2abb3134SXin Li  # Input:
33*2abb3134SXin Li  #    params: a list of RAPPOR parameters:
34*2abb3134SXin Li  #            k - size of a Bloom filter
35*2abb3134SXin Li  #            h - number of hash functions
36*2abb3134SXin Li  #            m - number of cohorts
37*2abb3134SXin Li  #            p - P(IRR = 1 | PRR = 0)
38*2abb3134SXin Li  #            q - P(IRR = 1 | PRR = 1)
39*2abb3134SXin Li  #            f - Proportion of bits in the Bloom filter that are set randomly
40*2abb3134SXin Li  #                to 0 or 1 regardless of the underlying true bit value
41*2abb3134SXin Li  #    obs_counts: a matrix of size m by (k + 1). Column one contains sample
42*2abb3134SXin Li  #                sizes for each cohort. Other counts indicated how many times
43*2abb3134SXin Li  #                each bit was set in each cohort.
44*2abb3134SXin Li  #
45*2abb3134SXin Li  # Output:
46*2abb3134SXin Li  #    ests: a matrix of size m by k with estimated counts for the probability
47*2abb3134SXin Li  #          of each bit set to 1 in the true Bloom filter.
48*2abb3134SXin Li  #    stds: standard deviation of the estimates.
49*2abb3134SXin Li
50*2abb3134SXin Li  p <- params$p
51*2abb3134SXin Li  q <- params$q
52*2abb3134SXin Li  f <- params$f
53*2abb3134SXin Li  m <- params$m
54*2abb3134SXin Li  k <- params$k
55*2abb3134SXin Li
56*2abb3134SXin Li  stopifnot(m == nrow(obs_counts), k + 1 == ncol(obs_counts))
57*2abb3134SXin Li
58*2abb3134SXin Li  p11 <- q * (1 - f/2) + p * f / 2  # probability of a true 1 reported as 1
59*2abb3134SXin Li  p01 <- p * (1 - f/2) + q * f / 2  # probability of a true 0 reported as 1
60*2abb3134SXin Li
61*2abb3134SXin Li  p2 <- p11 - p01  # == (1 - f) * (q - p)
62*2abb3134SXin Li
63*2abb3134SXin Li  # When m = 1, obs_counts does not have the right dimensions. Fixing this.
64*2abb3134SXin Li  dim(obs_counts) <- c(m, k + 1)
65*2abb3134SXin Li
66*2abb3134SXin Li  ests <- apply(obs_counts, 1, function(cohort_row) {
67*2abb3134SXin Li      N <- cohort_row[1]  # sample size for the cohort -- first column is total
68*2abb3134SXin Li      v <- cohort_row[-1] # counts for individual bits
69*2abb3134SXin Li      (v - p01 * N) / p2  # unbiased estimator for individual bits'
70*2abb3134SXin Li                          # true counts. It can be negative or
71*2abb3134SXin Li                          # exceed the total.
72*2abb3134SXin Li    })
73*2abb3134SXin Li
74*2abb3134SXin Li  # NOTE: When k == 1, rows of obs_counts have 2 entries.  Then cohort_row[-1]
75*2abb3134SXin Li  # is a singleton vector, and apply() returns a *vector*.  When rows have 3
76*2abb3134SXin Li  # entries, cohort_row[-1] is a vector of length 2 and apply() returns a
77*2abb3134SXin Li  # *matrix*.
78*2abb3134SXin Li  #
79*2abb3134SXin Li  # Fix this by explicitly setting dimensions.  NOTE: It's k x m, not m x k.
80*2abb3134SXin Li  dim(ests) <- c(k, m)
81*2abb3134SXin Li
82*2abb3134SXin Li  total <- sum(obs_counts[,1])
83*2abb3134SXin Li
84*2abb3134SXin Li  variances <- apply(obs_counts, 1, function(cohort_row) {
85*2abb3134SXin Li      N <- cohort_row[1]
86*2abb3134SXin Li      v <- cohort_row[-1]
87*2abb3134SXin Li      p_hats <- (v - p01 * N) / (N * p2)  # expectation of a true 1
88*2abb3134SXin Li      p_hats <- pmax(0, pmin(1, p_hats))  # clamp to [0,1]
89*2abb3134SXin Li      r <- p_hats * p11 + (1 - p_hats) * p01  # expectation of a reported 1
90*2abb3134SXin Li      N * r * (1 - r) / p2^2  # variance of the binomial
91*2abb3134SXin Li     })
92*2abb3134SXin Li
93*2abb3134SXin Li  dim(variances) <- c(k, m)
94*2abb3134SXin Li
95*2abb3134SXin Li  # Transform counts from absolute values to fractional, removing bias due to
96*2abb3134SXin Li  #      variability of reporting between cohorts.
97*2abb3134SXin Li  ests <- apply(ests, 1, function(x) x / obs_counts[,1])
98*2abb3134SXin Li  stds <- apply(variances^.5, 1, function(x) x / obs_counts[,1])
99*2abb3134SXin Li
100*2abb3134SXin Li  # Some estimates may be set to infinity, e.g. if f=1. We want to account for
101*2abb3134SXin Li  # this possibility, and set the corresponding counts to 0.
102*2abb3134SXin Li  ests[abs(ests) == Inf] <- 0
103*2abb3134SXin Li
104*2abb3134SXin Li  list(estimates = ests, stds = stds)
105*2abb3134SXin Li}
106*2abb3134SXin Li
107*2abb3134SXin LiFitLasso <- function(X, Y, intercept = TRUE) {
108*2abb3134SXin Li  # Fits a Lasso model to select a subset of columns of X.
109*2abb3134SXin Li  #
110*2abb3134SXin Li  # Input:
111*2abb3134SXin Li  #    X: a design matrix of size km by M (the number of candidate strings).
112*2abb3134SXin Li  #    Y: a vector of size km with estimated counts from EstimateBloomCounts().
113*2abb3134SXin Li  #    intercept: whether to fit with intercept or not.
114*2abb3134SXin Li  #
115*2abb3134SXin Li  # Output:
116*2abb3134SXin Li  #    a vector of size ncol(X) of coefficients.
117*2abb3134SXin Li
118*2abb3134SXin Li  # TODO(mironov): Test cv.glmnet instead of glmnet
119*2abb3134SXin Li  mod <- try(glmnet(X, Y, standardize = FALSE, intercept = intercept,
120*2abb3134SXin Li                    lower.limits = 0,  # outputs are non-negative
121*2abb3134SXin Li                    # Cap the number of non-zero coefficients to 500 or
122*2abb3134SXin Li                    # 80% of the length of Y, whichever is less. The 500 cap
123*2abb3134SXin Li                    # is for performance reasons, 80% is to avoid overfitting.
124*2abb3134SXin Li                    pmax = min(500, length(Y) * .8)),
125*2abb3134SXin Li             silent = TRUE)
126*2abb3134SXin Li
127*2abb3134SXin Li  # If fitting fails, return an empty data.frame.
128*2abb3134SXin Li  if (class(mod)[1] == "try-error") {
129*2abb3134SXin Li    coefs <- setNames(rep(0, ncol(X)), colnames(X))
130*2abb3134SXin Li  } else {
131*2abb3134SXin Li    coefs <- coef(mod)
132*2abb3134SXin Li    coefs <- coefs[-1, ncol(coefs), drop = FALSE]  # coefs[1] is the intercept
133*2abb3134SXin Li  }
134*2abb3134SXin Li  coefs
135*2abb3134SXin Li}
136*2abb3134SXin Li
137*2abb3134SXin LiPerformInference <- function(X, Y, N, mod, params, alpha, correction) {
138*2abb3134SXin Li  m <- params$m
139*2abb3134SXin Li  p <- params$p
140*2abb3134SXin Li  q <- params$q
141*2abb3134SXin Li  f <- params$f
142*2abb3134SXin Li  h <- params$h
143*2abb3134SXin Li
144*2abb3134SXin Li  q2 <- .5 * f * (p + q) + (1 - f) * q
145*2abb3134SXin Li  p2 <- .5 * f * (p + q) + (1 - f) * p
146*2abb3134SXin Li  resid_var <- p2 * (1 - p2) * (N / m) / (q2 - p2)^2
147*2abb3134SXin Li
148*2abb3134SXin Li  # Total Sum of Squares (SS).
149*2abb3134SXin Li  TSS <- sum((Y - mean(Y))^2)
150*2abb3134SXin Li  # Error Sum of Squares (ESS).
151*2abb3134SXin Li  ESS <- resid_var * nrow(X)
152*2abb3134SXin Li
153*2abb3134SXin Li  betas <- matrix(mod$coefs, ncol = 1)
154*2abb3134SXin Li
155*2abb3134SXin Li#   mod_var <- summary(mod$fit)$sigma^2
156*2abb3134SXin Li#   betas_sd <- rep(sqrt(max(resid_var, mod_var) / (m * h)), length(betas))
157*2abb3134SXin Li#
158*2abb3134SXin Li#   z_values <- betas / betas_sd
159*2abb3134SXin Li#
160*2abb3134SXin Li#   # 1-sided t-test.
161*2abb3134SXin Li#   p_values <- pnorm(z_values, lower = FALSE)
162*2abb3134SXin Li
163*2abb3134SXin Li  fit <- data.frame(string = colnames(X), Estimate = betas,
164*2abb3134SXin Li                    SD = mod$stds, # z_stat = z_values, pvalue = p_values,
165*2abb3134SXin Li                    stringsAsFactors = FALSE)
166*2abb3134SXin Li
167*2abb3134SXin Li#   if (correction == "FDR") {
168*2abb3134SXin Li#     fit <- fit[order(fit$pvalue, decreasing = FALSE), ]
169*2abb3134SXin Li#     ind <- which(fit$pvalue < (1:nrow(fit)) * alpha / nrow(fit))
170*2abb3134SXin Li#     if (length(ind) > 0) {
171*2abb3134SXin Li#       fit <- fit[1:max(ind), ]
172*2abb3134SXin Li#     } else {
173*2abb3134SXin Li#       fit <- fit[numeric(0), ]
174*2abb3134SXin Li#     }
175*2abb3134SXin Li#   } else {
176*2abb3134SXin Li#     fit <- fit[fit$p < alpha, ]
177*2abb3134SXin Li#   }
178*2abb3134SXin Li
179*2abb3134SXin Li  fit <- fit[order(fit$Estimate, decreasing = TRUE), ]
180*2abb3134SXin Li
181*2abb3134SXin Li  if (nrow(fit) > 0) {
182*2abb3134SXin Li    str_names <- fit$string
183*2abb3134SXin Li    str_names <- str_names[!is.na(str_names)]
184*2abb3134SXin Li    if (length(str_names) > 0 && length(str_names) < nrow(X)) {
185*2abb3134SXin Li      this_data <- as.data.frame(as.matrix(X[, str_names]))
186*2abb3134SXin Li      Y_hat <- predict(lm(Y ~ ., data = this_data))
187*2abb3134SXin Li      RSS <- sum((Y_hat - mean(Y))^2)
188*2abb3134SXin Li    } else {
189*2abb3134SXin Li      RSS <- NA
190*2abb3134SXin Li    }
191*2abb3134SXin Li  } else {
192*2abb3134SXin Li    RSS <- 0
193*2abb3134SXin Li  }
194*2abb3134SXin Li
195*2abb3134SXin Li  USS <- TSS - ESS - RSS
196*2abb3134SXin Li  SS <- c(RSS, USS, ESS) / TSS
197*2abb3134SXin Li
198*2abb3134SXin Li  list(fit = fit, SS = SS, resid_sigma = sqrt(resid_var))
199*2abb3134SXin Li}
200*2abb3134SXin Li
201*2abb3134SXin LiComputePrivacyGuarantees <- function(params, alpha, N) {
202*2abb3134SXin Li  # Compute privacy parameters and guarantees.
203*2abb3134SXin Li  p <- params$p
204*2abb3134SXin Li  q <- params$q
205*2abb3134SXin Li  f <- params$f
206*2abb3134SXin Li  h <- params$h
207*2abb3134SXin Li
208*2abb3134SXin Li  q2 <- .5 * f * (p + q) + (1 - f) * q
209*2abb3134SXin Li  p2 <- .5 * f * (p + q) + (1 - f) * p
210*2abb3134SXin Li
211*2abb3134SXin Li  exp_e_one <- ((q2 * (1 - p2)) / (p2 * (1 - q2)))^h
212*2abb3134SXin Li  if (exp_e_one < 1) {
213*2abb3134SXin Li    exp_e_one <- 1 / exp_e_one
214*2abb3134SXin Li  }
215*2abb3134SXin Li  e_one <- log(exp_e_one)
216*2abb3134SXin Li
217*2abb3134SXin Li  exp_e_inf <- ((1 - .5 * f) / (.5 * f))^(2 * h)
218*2abb3134SXin Li  e_inf <- log(exp_e_inf)
219*2abb3134SXin Li
220*2abb3134SXin Li  std_dev_counts <- sqrt(p2 * (1 - p2) * N) / (q2 - p2)
221*2abb3134SXin Li  detection_freq <- qnorm(1 - alpha) * std_dev_counts / N
222*2abb3134SXin Li
223*2abb3134SXin Li  privacy_names <- c("Effective p", "Effective q", "exp(e_1)",
224*2abb3134SXin Li                     "e_1", "exp(e_inf)", "e_inf", "Detection frequency")
225*2abb3134SXin Li  privacy_vals <- c(p2, q2, exp_e_one, e_one, exp_e_inf, e_inf, detection_freq)
226*2abb3134SXin Li
227*2abb3134SXin Li  privacy <- data.frame(parameters = privacy_names,
228*2abb3134SXin Li                        values = privacy_vals)
229*2abb3134SXin Li  privacy
230*2abb3134SXin Li}
231*2abb3134SXin Li
232*2abb3134SXin LiFitDistribution <- function(estimates_stds, map, quiet = FALSE) {
233*2abb3134SXin Li  # Find a distribution over rows of map that approximates estimates_stds best
234*2abb3134SXin Li  #
235*2abb3134SXin Li  # Input:
236*2abb3134SXin Li  #   estimates_stds: a list of two m x k matrices, one for estimates, another
237*2abb3134SXin Li  #                   for standard errors
238*2abb3134SXin Li  #   map           : an (m * k) x S boolean matrix
239*2abb3134SXin Li  #
240*2abb3134SXin Li  # Output:
241*2abb3134SXin Li  #   a float vector of length S, so that a distribution over map's rows sampled
242*2abb3134SXin Li  #   according to this vector approximates estimates
243*2abb3134SXin Li
244*2abb3134SXin Li  S <- ncol(map)  # total number of candidates
245*2abb3134SXin Li
246*2abb3134SXin Li  support_coefs <- 1:S
247*2abb3134SXin Li
248*2abb3134SXin Li  if (S > length(estimates_stds$estimates) * .8) {
249*2abb3134SXin Li    # the system is close to being underdetermined
250*2abb3134SXin Li    lasso <- FitLasso(map, as.vector(t(estimates_stds$estimates)))
251*2abb3134SXin Li
252*2abb3134SXin Li    # Select non-zero coefficients.
253*2abb3134SXin Li    support_coefs <- which(lasso > 0)
254*2abb3134SXin Li
255*2abb3134SXin Li    if(!quiet)
256*2abb3134SXin Li      cat("LASSO selected ", length(support_coefs), " non-zero coefficients.\n")
257*2abb3134SXin Li  }
258*2abb3134SXin Li
259*2abb3134SXin Li  coefs <- setNames(rep(0, S), colnames(map))
260*2abb3134SXin Li
261*2abb3134SXin Li  if(length(support_coefs) > 0) {  # LASSO may return an empty list
262*2abb3134SXin Li    constrained_coefs <- ConstrainedLinModel(map[, support_coefs, drop = FALSE],
263*2abb3134SXin Li                                             estimates_stds)
264*2abb3134SXin Li
265*2abb3134SXin Li    coefs[support_coefs] <- constrained_coefs
266*2abb3134SXin Li  }
267*2abb3134SXin Li
268*2abb3134SXin Li  coefs
269*2abb3134SXin Li}
270*2abb3134SXin Li
271*2abb3134SXin LiResample <- function(e) {
272*2abb3134SXin Li  # Simulate resampling of the Bloom filter estimates by adding Gaussian noise
273*2abb3134SXin Li  # with estimated standard deviation.
274*2abb3134SXin Li  estimates <- matrix(mapply(function(x, y) x + rnorm(1, 0, y),
275*2abb3134SXin Li                             e$estimates, e$stds),
276*2abb3134SXin Li                             nrow = nrow(e$estimates), ncol = ncol(e$estimates))
277*2abb3134SXin Li  stds <- e$stds * 2^.5
278*2abb3134SXin Li
279*2abb3134SXin Li  list(estimates = estimates, stds = stds)
280*2abb3134SXin Li}
281*2abb3134SXin Li
282*2abb3134SXin Li# Private function
283*2abb3134SXin Li# Decode for Boolean RAPPOR inputs
284*2abb3134SXin Li# Returns a list with attribute fit only. (Inference and other aspects
285*2abb3134SXin Li# currently not incorporated because they're unnecessary for association.)
286*2abb3134SXin Li.DecodeBoolean <- function(counts, params, num_reports) {
287*2abb3134SXin Li  # Boolean variables are reported without cohorts and to estimate counts,
288*2abb3134SXin Li  # first sum up counts across all cohorts and then run EstimateBloomCounts
289*2abb3134SXin Li  # with the number of cohorts set to 1.
290*2abb3134SXin Li  params$m <- 1  # set number of cohorts to 1
291*2abb3134SXin Li  summed_counts <- colSums(counts)  # sum counts across cohorts
292*2abb3134SXin Li  es <- EstimateBloomCounts(params, summed_counts)  # estimate boolean counts
293*2abb3134SXin Li
294*2abb3134SXin Li  ests <- es$estimates[[1]]
295*2abb3134SXin Li  std <- es$stds[[1]]
296*2abb3134SXin Li
297*2abb3134SXin Li  fit <- data.frame(
298*2abb3134SXin Li           string         = c("TRUE", "FALSE"),
299*2abb3134SXin Li           estimate       = c(ests * num_reports,
300*2abb3134SXin Li                              num_reports - ests * num_reports),
301*2abb3134SXin Li           std_error      = c(std * num_reports, std * num_reports),
302*2abb3134SXin Li           proportion     = c(ests, 1 - ests),
303*2abb3134SXin Li           prop_std_error = c(std, std))
304*2abb3134SXin Li
305*2abb3134SXin Li  low_95 <- fit$proportion - 1.96 * fit$prop_std_error
306*2abb3134SXin Li  high_95 <- fit$proportion + 1.96 * fit$prop_std_error
307*2abb3134SXin Li
308*2abb3134SXin Li  fit$prop_low_95 <- pmax(low_95, 0.0)
309*2abb3134SXin Li  fit$prop_high_95 <- pmin(high_95, 1.0)
310*2abb3134SXin Li  rownames(fit) <- fit$string
311*2abb3134SXin Li
312*2abb3134SXin Li  return(list(fit = fit))
313*2abb3134SXin Li}
314*2abb3134SXin Li
315*2abb3134SXin LiCheckDecodeInputs <- function(counts, map, params) {
316*2abb3134SXin Li  # Returns an error message, or NULL if there is no error.
317*2abb3134SXin Li
318*2abb3134SXin Li  if (nrow(map) != (params$m * params$k)) {
319*2abb3134SXin Li    return(sprintf(
320*2abb3134SXin Li        "Map matrix has invalid dimensions: m * k = %d, nrow(map) = %d",
321*2abb3134SXin Li        params$m * params$k, nrow(map)))
322*2abb3134SXin Li  }
323*2abb3134SXin Li
324*2abb3134SXin Li  if ((ncol(counts) - 1) != params$k) {
325*2abb3134SXin Li    return(sprintf(paste0(
326*2abb3134SXin Li        "Dimensions of counts file do not match: m = %d, k = %d, ",
327*2abb3134SXin Li        "nrow(counts) = %d, ncol(counts) = %d"), params$m, params$k,
328*2abb3134SXin Li        nrow(counts), ncol(counts)))
329*2abb3134SXin Li
330*2abb3134SXin Li  }
331*2abb3134SXin Li
332*2abb3134SXin Li  # numerically correct comparison
333*2abb3134SXin Li  if (isTRUE(all.equal((1 - params$f) * (params$p - params$q), 0))) {
334*2abb3134SXin Li    return("Information is lost. Cannot decode.")
335*2abb3134SXin Li  }
336*2abb3134SXin Li
337*2abb3134SXin Li  return(NULL)  # no error
338*2abb3134SXin Li}
339*2abb3134SXin Li
340*2abb3134SXin LiDecode <- function(counts, map, params, alpha = 0.05,
341*2abb3134SXin Li                   correction = c("Bonferroni"), quiet = FALSE, ...) {
342*2abb3134SXin Li
343*2abb3134SXin Li  error_msg <- CheckDecodeInputs(counts, map, params)
344*2abb3134SXin Li  if (!is.null(error_msg)) {
345*2abb3134SXin Li    stop(error_msg)
346*2abb3134SXin Li  }
347*2abb3134SXin Li
348*2abb3134SXin Li  k <- params$k
349*2abb3134SXin Li  p <- params$p
350*2abb3134SXin Li  q <- params$q
351*2abb3134SXin Li  f <- params$f
352*2abb3134SXin Li  h <- params$h
353*2abb3134SXin Li  m <- params$m
354*2abb3134SXin Li
355*2abb3134SXin Li  S <- ncol(map)  # total number of candidates
356*2abb3134SXin Li
357*2abb3134SXin Li  N <- sum(counts[, 1])
358*2abb3134SXin Li  if (k == 1) {
359*2abb3134SXin Li    return(.DecodeBoolean(counts, params, N))
360*2abb3134SXin Li  }
361*2abb3134SXin Li
362*2abb3134SXin Li  filter_cohorts <- which(counts[, 1] != 0)  # exclude cohorts with zero reports
363*2abb3134SXin Li
364*2abb3134SXin Li  # stretch cohorts to bits
365*2abb3134SXin Li  filter_bits <- as.vector(matrix(1:nrow(map), ncol = m)[,filter_cohorts, drop = FALSE])
366*2abb3134SXin Li
367*2abb3134SXin Li  map_filtered <- map[filter_bits, , drop = FALSE]
368*2abb3134SXin Li
369*2abb3134SXin Li  es <- EstimateBloomCounts(params, counts)
370*2abb3134SXin Li
371*2abb3134SXin Li  estimates_stds_filtered <-
372*2abb3134SXin Li    list(estimates = es$estimates[filter_cohorts, , drop = FALSE],
373*2abb3134SXin Li         stds = es$stds[filter_cohorts, , drop = FALSE])
374*2abb3134SXin Li
375*2abb3134SXin Li  coefs_all <- vector()
376*2abb3134SXin Li
377*2abb3134SXin Li  # Run the fitting procedure several times (5 seems to be sufficient and not
378*2abb3134SXin Li  # too many) to estimate standard deviation of the output.
379*2abb3134SXin Li  for(r in 1:5) {
380*2abb3134SXin Li    if(r > 1)
381*2abb3134SXin Li      e <- Resample(estimates_stds_filtered)
382*2abb3134SXin Li    else
383*2abb3134SXin Li      e <- estimates_stds_filtered
384*2abb3134SXin Li
385*2abb3134SXin Li    coefs_all <- rbind(coefs_all,
386*2abb3134SXin Li                       FitDistribution(e, map_filtered,  quiet))
387*2abb3134SXin Li  }
388*2abb3134SXin Li
389*2abb3134SXin Li  coefs_ssd <- N * apply(coefs_all, 2, sd)  # compute sample standard deviations
390*2abb3134SXin Li  coefs_ave <- N * apply(coefs_all, 2, mean)
391*2abb3134SXin Li
392*2abb3134SXin Li  # Only select coefficients more than two standard deviations from 0. May
393*2abb3134SXin Li  # inflate empirical SD of the estimates.
394*2abb3134SXin Li  reported <- which(coefs_ave > 1E-6 + 2 * coefs_ssd)
395*2abb3134SXin Li
396*2abb3134SXin Li  mod <- list(coefs = coefs_ave[reported], stds = coefs_ssd[reported])
397*2abb3134SXin Li
398*2abb3134SXin Li  coefs_ave_zeroed <- coefs_ave
399*2abb3134SXin Li  coefs_ave_zeroed[-reported] <- 0
400*2abb3134SXin Li
401*2abb3134SXin Li  residual <- as.vector(t(estimates_stds_filtered$estimates)) -
402*2abb3134SXin Li    map_filtered %*% coefs_ave_zeroed / N
403*2abb3134SXin Li
404*2abb3134SXin Li  if (correction == "Bonferroni") {
405*2abb3134SXin Li    alpha <- alpha / S
406*2abb3134SXin Li  }
407*2abb3134SXin Li
408*2abb3134SXin Li  inf <- PerformInference(map_filtered[, reported, drop = FALSE],
409*2abb3134SXin Li                          as.vector(t(estimates_stds_filtered$estimates)),
410*2abb3134SXin Li                          N, mod, params, alpha,
411*2abb3134SXin Li                          correction)
412*2abb3134SXin Li  fit <- inf$fit
413*2abb3134SXin Li  # If this is a basic RAPPOR instance, just use the counts for the estimate
414*2abb3134SXin Li  #     (Check if the map is diagonal to tell if this is basic RAPPOR.)
415*2abb3134SXin Li  if (sum(map) == sum(diag(map))) {
416*2abb3134SXin Li    fit$Estimate <- colSums(counts)[-1]
417*2abb3134SXin Li  }
418*2abb3134SXin Li
419*2abb3134SXin Li  # Estimates from the model are per instance so must be multipled by h.
420*2abb3134SXin Li  # Standard errors are also adjusted.
421*2abb3134SXin Li  fit$estimate <- floor(fit$Estimate)
422*2abb3134SXin Li  fit$proportion <- fit$estimate / N
423*2abb3134SXin Li
424*2abb3134SXin Li  fit$std_error <- floor(fit$SD)
425*2abb3134SXin Li  fit$prop_std_error <- fit$std_error / N
426*2abb3134SXin Li
427*2abb3134SXin Li  # 1.96 standard deviations gives 95% confidence interval.
428*2abb3134SXin Li  low_95 <- fit$proportion - 1.96 * fit$prop_std_error
429*2abb3134SXin Li  high_95 <- fit$proportion + 1.96 * fit$prop_std_error
430*2abb3134SXin Li  # Clamp estimated proportion.  pmin/max: vectorized min and max
431*2abb3134SXin Li  fit$prop_low_95 <- pmax(low_95, 0.0)
432*2abb3134SXin Li  fit$prop_high_95 <- pmin(high_95, 1.0)
433*2abb3134SXin Li
434*2abb3134SXin Li  fit <- fit[, c("string", "estimate", "std_error", "proportion",
435*2abb3134SXin Li                 "prop_std_error", "prop_low_95", "prop_high_95")]
436*2abb3134SXin Li
437*2abb3134SXin Li  allocated_mass <- sum(fit$proportion)
438*2abb3134SXin Li  num_detected <- nrow(fit)
439*2abb3134SXin Li
440*2abb3134SXin Li  ss <- round(inf$SS, digits = 3)
441*2abb3134SXin Li  explained_var <- ss[[1]]
442*2abb3134SXin Li  missing_var <- ss[[2]]
443*2abb3134SXin Li  noise_var <- ss[[3]]
444*2abb3134SXin Li
445*2abb3134SXin Li  noise_std_dev <- round(inf$resid_sigma, digits = 3)
446*2abb3134SXin Li
447*2abb3134SXin Li  # Compute summary of the fit.
448*2abb3134SXin Li  parameters <-
449*2abb3134SXin Li      c("Candidate strings", "Detected strings",
450*2abb3134SXin Li        "Sample size (N)", "Discovered Prop (out of N)",
451*2abb3134SXin Li        "Explained Variance", "Missing Variance", "Noise Variance",
452*2abb3134SXin Li        "Theoretical Noise Std. Dev.")
453*2abb3134SXin Li  values <- c(S, num_detected, N, allocated_mass,
454*2abb3134SXin Li              explained_var, missing_var, noise_var, noise_std_dev)
455*2abb3134SXin Li
456*2abb3134SXin Li  res_summary <- data.frame(parameters = parameters, values = values)
457*2abb3134SXin Li
458*2abb3134SXin Li  privacy <- ComputePrivacyGuarantees(params, alpha, N)
459*2abb3134SXin Li  params <- data.frame(parameters =
460*2abb3134SXin Li                       c("k", "h", "m", "p", "q", "f", "N", "alpha"),
461*2abb3134SXin Li                       values = c(k, h, m, p, q, f, N, alpha))
462*2abb3134SXin Li
463*2abb3134SXin Li  # This is a list of decode stats in a better format than 'summary'.
464*2abb3134SXin Li  metrics <- list(sample_size = N,
465*2abb3134SXin Li                  allocated_mass = allocated_mass,
466*2abb3134SXin Li                  num_detected = num_detected,
467*2abb3134SXin Li                  explained_var = explained_var,
468*2abb3134SXin Li                  missing_var = missing_var)
469*2abb3134SXin Li
470*2abb3134SXin Li  list(fit = fit, summary = res_summary, privacy = privacy, params = params,
471*2abb3134SXin Li       lasso = NULL, residual = as.vector(residual),
472*2abb3134SXin Li       counts = counts[, -1], resid = NULL, metrics = metrics,
473*2abb3134SXin Li       ests = es$estimates  # ests needed by Shiny rappor-sim app
474*2abb3134SXin Li  )
475*2abb3134SXin Li}
476*2abb3134SXin Li
477*2abb3134SXin LiComputeCounts <- function(reports, cohorts, params) {
478*2abb3134SXin Li  # Counts the number of times each bit in the Bloom filters was set for
479*2abb3134SXin Li  #     each cohort.
480*2abb3134SXin Li  #
481*2abb3134SXin Li  # Args:
482*2abb3134SXin Li  #   reports: A list of N elements, each containing the
483*2abb3134SXin Li  #       report for a given report
484*2abb3134SXin Li  #   cohorts: A list of N elements, each containing the
485*2abb3134SXin Li  #       cohort number for a given report
486*2abb3134SXin Li  #   params: A list of parameters for the problem
487*2abb3134SXin Li  #
488*2abb3134SXin Li  # Returns:
489*2abb3134SXin Li  #   An mx(k+1) array containing the number of times each bit was set
490*2abb3134SXin Li  #       in each cohort.
491*2abb3134SXin Li
492*2abb3134SXin Li  # Check that the cohorts are evenly assigned. We assume that if there
493*2abb3134SXin Li  #     are m cohorts, each cohort should have approximately N/m reports.
494*2abb3134SXin Li  #     The constraint we impose here simply says that cohort bins should
495*2abb3134SXin Li  #     each have within N/m reports of one another. Since the most popular
496*2abb3134SXin Li  #     cohort is expected to have about O(logN/loglogN) reports (which we )
497*2abb3134SXin Li  #     approximate as O(logN) bins for practical values of N, a discrepancy of
498*2abb3134SXin Li  #     O(N) bins seems significant enough to alter expected behavior. This
499*2abb3134SXin Li  #     threshold can be changed to be more sensitive if desired.
500*2abb3134SXin Li  N <- length(reports)
501*2abb3134SXin Li  cohort_freqs <- table(factor(cohorts, levels = 1:params$m))
502*2abb3134SXin Li  imbalance_threshold <- N / params$m
503*2abb3134SXin Li  if ((max(cohort_freqs) - min(cohort_freqs)) > imbalance_threshold) {
504*2abb3134SXin Li    cat("\nNote: You are using unbalanced cohort assignments, which can",
505*2abb3134SXin Li        "significantly degrade estimation quality!\n\n")
506*2abb3134SXin Li  }
507*2abb3134SXin Li
508*2abb3134SXin Li  # Count the times each bit was set, and add cohort counts to first column
509*2abb3134SXin Li  counts <- lapply(1:params$m, function(i)
510*2abb3134SXin Li                   Reduce("+", reports[which(cohorts == i)]))
511*2abb3134SXin Li  counts[which(cohort_freqs == 0)] <- data.frame(rep(0, params$k))
512*2abb3134SXin Li  cbind(cohort_freqs, do.call("rbind", counts))
513*2abb3134SXin Li}
514