xref: /aosp_15_r20/external/rappor/analysis/R/simulation.R (revision 2abb31345f6c95944768b5222a9a5ed3fc68cc00)
1*2abb3134SXin Li# Copyright 2014 Google Inc. All rights reserved.
2*2abb3134SXin Li#
3*2abb3134SXin Li# Licensed under the Apache License, Version 2.0 (the "License");
4*2abb3134SXin Li# you may not use this file except in compliance with the License.
5*2abb3134SXin Li# You may obtain a copy of the License at
6*2abb3134SXin Li#
7*2abb3134SXin Li#     http://www.apache.org/licenses/LICENSE-2.0
8*2abb3134SXin Li#
9*2abb3134SXin Li# Unless required by applicable law or agreed to in writing, software
10*2abb3134SXin Li# distributed under the License is distributed on an "AS IS" BASIS,
11*2abb3134SXin Li# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12*2abb3134SXin Li# See the License for the specific language governing permissions and
13*2abb3134SXin Li# limitations under the License.
14*2abb3134SXin Li
15*2abb3134SXin Li#
16*2abb3134SXin Li# RAPPOR simulation library. Contains code for encoding simulated data and
17*2abb3134SXin Li#     creating the map used to encode and decode reports.
18*2abb3134SXin Li
19*2abb3134SXin Lilibrary(glmnet)
20*2abb3134SXin Lilibrary(parallel)  # mclapply
21*2abb3134SXin Li
22*2abb3134SXin LiSetOfStrings <- function(num_strings = 100) {
23*2abb3134SXin Li  # Generates a set of strings for simulation purposes.
24*2abb3134SXin Li  strs <- paste0("V_", as.character(1:num_strings))
25*2abb3134SXin Li  strs
26*2abb3134SXin Li}
27*2abb3134SXin Li
28*2abb3134SXin LiGetSampleProbs <- function(params) {
29*2abb3134SXin Li  # Generate different underlying distributions for simulations purposes.
30*2abb3134SXin Li  # Args:
31*2abb3134SXin Li  #    - params: a list describing the shape of the true distribution:
32*2abb3134SXin Li  #              c(num_strings, prop_nonzero_strings, decay_type,
33*2abb3134SXin Li  #                rate_exponetial).
34*2abb3134SXin Li  nstrs <- params[[1]]
35*2abb3134SXin Li  nonzero <- params[[2]]
36*2abb3134SXin Li  decay <- params[[3]]
37*2abb3134SXin Li  expo <- params[[4]]
38*2abb3134SXin Li  background <- params[[5]]
39*2abb3134SXin Li
40*2abb3134SXin Li  probs <- rep(0, nstrs)
41*2abb3134SXin Li  ind <- floor(nstrs * nonzero)
42*2abb3134SXin Li  if (decay == "Linear") {
43*2abb3134SXin Li    probs[1:ind] <- (ind:1) / sum(1:ind)
44*2abb3134SXin Li  } else if (decay == "Constant") {
45*2abb3134SXin Li    probs[1:ind] <- 1 / ind
46*2abb3134SXin Li  } else if (decay == "Exponential") {
47*2abb3134SXin Li    temp <- seq(0, nonzero, length.out = ind)
48*2abb3134SXin Li    temp <- exp(-temp * expo)
49*2abb3134SXin Li    temp <- temp + background
50*2abb3134SXin Li    temp <- temp / sum(temp)
51*2abb3134SXin Li    probs[1:ind] <- temp
52*2abb3134SXin Li  } else {
53*2abb3134SXin Li    stop('params[[4]] must be in c("Linear", "Exponenential", "Constant")')
54*2abb3134SXin Li  }
55*2abb3134SXin Li  probs
56*2abb3134SXin Li}
57*2abb3134SXin Li
58*2abb3134SXin LiEncodeAll <- function(x, cohorts, map, params, num_cores = 1) {
59*2abb3134SXin Li  # Encodes the ground truth into RAPPOR reports.
60*2abb3134SXin Li  #
61*2abb3134SXin Li  # Args:
62*2abb3134SXin Li  #   x: Observed strings for each report, Nx1 vector
63*2abb3134SXin Li  #   cohort: Cohort assignment for each report, Nx1 vector
64*2abb3134SXin Li  #   map: list of matrices encoding locations of hashes for each
65*2abb3134SXin Li  #       string, for each cohort
66*2abb3134SXin Li  #   params: System parameters
67*2abb3134SXin Li  #
68*2abb3134SXin Li  # Returns:
69*2abb3134SXin Li  #   RAPPOR reports for each piece of data.
70*2abb3134SXin Li
71*2abb3134SXin Li  p <- params$p
72*2abb3134SXin Li  q <- params$q
73*2abb3134SXin Li  f <- params$f
74*2abb3134SXin Li  k <- params$k
75*2abb3134SXin Li
76*2abb3134SXin Li  qstar <- (1 - f / 2) * q + (f / 2) * p
77*2abb3134SXin Li  pstar <- (1 - f / 2) * p + (f / 2) * q
78*2abb3134SXin Li
79*2abb3134SXin Li  candidates <- colnames(map[[1]])
80*2abb3134SXin Li  if (!all(x %in% candidates)) {
81*2abb3134SXin Li    stop("Some strings are not in the map. set(X) - set(candidates): ",
82*2abb3134SXin Li         paste(setdiff(unique(x), candidates), collapse=" "), "\n")
83*2abb3134SXin Li  }
84*2abb3134SXin Li  bfs <- mapply(function(x, y) y[, x], x, map[cohorts], SIMPLIFY = FALSE,
85*2abb3134SXin Li                USE.NAMES = FALSE)
86*2abb3134SXin Li  reports <- mclapply(bfs, function(x) {
87*2abb3134SXin Li    noise <- sample(0:1, k, replace = TRUE, prob = c(1 - pstar, pstar))
88*2abb3134SXin Li    ind <- which(x)
89*2abb3134SXin Li    noise[ind] <- sample(0:1, length(ind), replace = TRUE,
90*2abb3134SXin Li                         prob = c(1 - qstar, qstar))
91*2abb3134SXin Li    noise
92*2abb3134SXin Li  }, mc.cores = num_cores)
93*2abb3134SXin Li
94*2abb3134SXin Li  reports
95*2abb3134SXin Li}
96*2abb3134SXin Li
97*2abb3134SXin LiCreateMap <- function(strs, params, generate_pos = TRUE, basic = FALSE) {
98*2abb3134SXin Li  # Creates a list of 0/1 matrices corresponding to mapping between the strs and
99*2abb3134SXin Li  # Bloom filters for each instance of the RAPPOR.
100*2abb3134SXin Li  # Ex. for 3 strings, 2 instances, 1 hash function and Bloom filter of size 4,
101*2abb3134SXin Li  # the result could look this:
102*2abb3134SXin Li  # [[1]]
103*2abb3134SXin Li  #   1 0 0 0
104*2abb3134SXin Li  #   0 1 0 0
105*2abb3134SXin Li  #   0 0 0 1
106*2abb3134SXin Li  # [[2]]
107*2abb3134SXin Li  #   0 1 0 0
108*2abb3134SXin Li  #   0 0 0 1
109*2abb3134SXin Li  #   0 0 1 0
110*2abb3134SXin Li  #
111*2abb3134SXin Li  # Args:
112*2abb3134SXin Li  #    strs: a vector of strings
113*2abb3134SXin Li  #    params: a list of parameters in the following format:
114*2abb3134SXin Li  #         (k, h, m, p, q, f).
115*2abb3134SXin Li  #    generate_pos: Tells whether to generate an object storing the
116*2abb3134SXin Li  #        positions of the nonzeros in the matrix
117*2abb3134SXin Li  #    basic: Tells whether to use basic RAPPOR (only works if h=1).
118*2abb3134SXin Li
119*2abb3134SXin Li  M <- length(strs)
120*2abb3134SXin Li  map_by_cohort <- list()
121*2abb3134SXin Li  k <- params$k
122*2abb3134SXin Li  h <- params$h
123*2abb3134SXin Li  m <- params$m
124*2abb3134SXin Li
125*2abb3134SXin Li  for (i in 1:m) {
126*2abb3134SXin Li    if (basic && (h == 1) && (k == M)) {
127*2abb3134SXin Li      ones <- 1:M
128*2abb3134SXin Li    } else {
129*2abb3134SXin Li      ones <- sample(1:k, M * h, replace = TRUE)
130*2abb3134SXin Li    }
131*2abb3134SXin Li    cols <- rep(1:M, each = h)
132*2abb3134SXin Li    map_by_cohort[[i]] <- sparseMatrix(ones, cols, dims = c(k, M))
133*2abb3134SXin Li    colnames(map_by_cohort[[i]]) <- strs
134*2abb3134SXin Li  }
135*2abb3134SXin Li
136*2abb3134SXin Li  all_cohorts_map <- do.call("rBind", map_by_cohort)
137*2abb3134SXin Li  if (generate_pos) {
138*2abb3134SXin Li    map_pos <- t(apply(all_cohorts_map, 2, function(x) {
139*2abb3134SXin Li      ind <- which(x == 1)
140*2abb3134SXin Li      n <- length(ind)
141*2abb3134SXin Li      if (n < h * m) {
142*2abb3134SXin Li        ind <- c(ind, rep(NA, h * m - n))
143*2abb3134SXin Li      }
144*2abb3134SXin Li      ind
145*2abb3134SXin Li    }))
146*2abb3134SXin Li  } else {
147*2abb3134SXin Li    map_pos <- NULL
148*2abb3134SXin Li  }
149*2abb3134SXin Li
150*2abb3134SXin Li  list(map_by_cohort = map_by_cohort, all_cohorts_map = all_cohorts_map,
151*2abb3134SXin Li       map_pos = map_pos)
152*2abb3134SXin Li}
153*2abb3134SXin Li
154*2abb3134SXin LiGetSample <- function(N, strs, probs) {
155*2abb3134SXin Li  # Sample for the strs population with distribution probs.
156*2abb3134SXin Li  sample(strs, N, replace = TRUE, prob = probs)
157*2abb3134SXin Li}
158*2abb3134SXin Li
159*2abb3134SXin LiGetTrueBits <- function(samp, map, params) {
160*2abb3134SXin Li  # Convert sample generated by GetSample() to Bloom filters where mapping
161*2abb3134SXin Li  # is defined in map.
162*2abb3134SXin Li  # Output:
163*2abb3134SXin Li  #    - reports: a matrix of size [num_instances x size] where each row
164*2abb3134SXin Li  #               represents the number of times each bit in the Bloom filter
165*2abb3134SXin Li  #               was set for a particular instance.
166*2abb3134SXin Li  # Note: reports[, 1] contains the same size for each instance.
167*2abb3134SXin Li
168*2abb3134SXin Li  N <- length(samp)
169*2abb3134SXin Li  k <- params$k
170*2abb3134SXin Li  m <- params$m
171*2abb3134SXin Li  strs <- colnames(map[[1]])
172*2abb3134SXin Li  reports <- matrix(0, m, k + 1)
173*2abb3134SXin Li  inst <- sample(1:m, N, replace = TRUE)
174*2abb3134SXin Li  for (i in 1:m) {
175*2abb3134SXin Li    tab <- table(samp[inst == i])
176*2abb3134SXin Li    tab2 <- rep(0, length(strs))
177*2abb3134SXin Li    tab2[match(names(tab), strs)] <- tab
178*2abb3134SXin Li    counts <- apply(map[[i]], 1, function(x) x * tab2)
179*2abb3134SXin Li    # cat(length(tab2), dim(map[[i]]), dim(counts), "\n")
180*2abb3134SXin Li    reports[i, ] <- c(sum(tab2), apply(counts, 2, sum))
181*2abb3134SXin Li  }
182*2abb3134SXin Li  reports
183*2abb3134SXin Li}
184*2abb3134SXin Li
185*2abb3134SXin LiGetNoisyBits <- function(truth, params) {
186*2abb3134SXin Li  # Applies RAPPOR to the Bloom filters.
187*2abb3134SXin Li  # Args:
188*2abb3134SXin Li  #     - truth: a matrix generated by GetTrueBits().
189*2abb3134SXin Li
190*2abb3134SXin Li  k <- params$k
191*2abb3134SXin Li  p <- params$p
192*2abb3134SXin Li  q <- params$q
193*2abb3134SXin Li  f <- params$f
194*2abb3134SXin Li
195*2abb3134SXin Li  rappors <- apply(truth, 1, function(x) {
196*2abb3134SXin Li    # The following samples considering 4 cases:
197*2abb3134SXin Li    # 1. Signal and we lie on the bit.
198*2abb3134SXin Li    # 2. Signal and we tell the truth.
199*2abb3134SXin Li    # 3. Noise and we lie.
200*2abb3134SXin Li    # 4. Noise and we tell the truth.
201*2abb3134SXin Li
202*2abb3134SXin Li    # Lies when signal sampled from the binomial distribution.
203*2abb3134SXin Li    lied_signal <- rbinom(k, x[-1], f)
204*2abb3134SXin Li
205*2abb3134SXin Li    # Remaining must be the non-lying bits when signal. Sampled with q.
206*2abb3134SXin Li    truth_signal <- x[-1] - lied_signal
207*2abb3134SXin Li
208*2abb3134SXin Li    # Lies when there is no signal which happens x[1] - x[-1] times.
209*2abb3134SXin Li    lied_nosignal <- rbinom(k, x[1] - x[-1], f)
210*2abb3134SXin Li
211*2abb3134SXin Li    # Trtuh when there's no signal. These are sampled with p.
212*2abb3134SXin Li    truth_nosignal <- x[1] - x[-1] - lied_nosignal
213*2abb3134SXin Li
214*2abb3134SXin Li    # Total lies and sampling lies with 50/50 for either p or q.
215*2abb3134SXin Li    lied <- lied_signal + lied_nosignal
216*2abb3134SXin Li    lied_p <- rbinom(k, lied, .5)
217*2abb3134SXin Li    lied_q <- lied - lied_p
218*2abb3134SXin Li
219*2abb3134SXin Li    # Generating the report where sampling of either p or q occurs.
220*2abb3134SXin Li    rbinom(k, lied_q + truth_signal, q) + rbinom(k, lied_p + truth_nosignal, p)
221*2abb3134SXin Li  })
222*2abb3134SXin Li
223*2abb3134SXin Li  cbind(truth[, 1], t(rappors))
224*2abb3134SXin Li}
225*2abb3134SXin Li
226*2abb3134SXin LiGenerateSamples <- function(N = 10^5, params, pop_params, alpha = .05,
227*2abb3134SXin Li                            prop_missing = 0,
228*2abb3134SXin Li                            correction = "Bonferroni") {
229*2abb3134SXin Li  # Simulate N reports with pop_params describing the population and
230*2abb3134SXin Li  # params describing the RAPPOR configuration.
231*2abb3134SXin Li  num_strings = pop_params[[1]]
232*2abb3134SXin Li
233*2abb3134SXin Li  strs <- SetOfStrings(num_strings)
234*2abb3134SXin Li  probs <- GetSampleProbs(pop_params)
235*2abb3134SXin Li  samp <- GetSample(N, strs, probs)
236*2abb3134SXin Li  map <- CreateMap(strs, params)
237*2abb3134SXin Li  truth <- GetTrueBits(samp, map$map_by_cohort, params)
238*2abb3134SXin Li  rappors <- GetNoisyBits(truth, params)
239*2abb3134SXin Li
240*2abb3134SXin Li  strs_apprx <- strs
241*2abb3134SXin Li  map_apprx <- map$all_cohorts_map
242*2abb3134SXin Li  # Remove % of strings to simulate missing variables.
243*2abb3134SXin Li  if (prop_missing > 0) {
244*2abb3134SXin Li    ind <- which(probs > 0)
245*2abb3134SXin Li    removed <- sample(ind, ceiling(prop_missing * length(ind)))
246*2abb3134SXin Li    map_apprx <- map$all_cohorts_map[, -removed]
247*2abb3134SXin Li    strs_apprx <- strs[-removed]
248*2abb3134SXin Li  }
249*2abb3134SXin Li
250*2abb3134SXin Li  # Randomize the columns.
251*2abb3134SXin Li  ind <- sample(1:length(strs_apprx), length(strs_apprx))
252*2abb3134SXin Li  map_apprx <- map_apprx[, ind]
253*2abb3134SXin Li  strs_apprx <- strs_apprx[ind]
254*2abb3134SXin Li
255*2abb3134SXin Li  fit <- Decode(rappors, map_apprx, params, alpha = alpha,
256*2abb3134SXin Li                correction = correction)
257*2abb3134SXin Li
258*2abb3134SXin Li  # Add truth column.
259*2abb3134SXin Li  fit$fit$Truth <- table(samp)[fit$fit$string]
260*2abb3134SXin Li  fit$fit$Truth[is.na(fit$fit$Truth)] <- 0
261*2abb3134SXin Li
262*2abb3134SXin Li  fit$map <- map$map_by_cohort
263*2abb3134SXin Li  fit$truth <- truth
264*2abb3134SXin Li  fit$strs <- strs
265*2abb3134SXin Li  fit$probs <- probs
266*2abb3134SXin Li
267*2abb3134SXin Li  fit
268*2abb3134SXin Li}
269