xref: /aosp_15_r20/external/rappor/analysis/R/association_test.R (revision 2abb31345f6c95944768b5222a9a5ed3fc68cc00)
1*2abb3134SXin Li# Copyright 2014 Google Inc. All rights reserved.
2*2abb3134SXin Li#
3*2abb3134SXin Li# Licensed under the Apache License, Version 2.0 (the "License");
4*2abb3134SXin Li# you may not use this file except in compliance with the License.
5*2abb3134SXin Li# You may obtain a copy of the License at
6*2abb3134SXin Li#
7*2abb3134SXin Li#     http://www.apache.org/licenses/LICENSE-2.0
8*2abb3134SXin Li#
9*2abb3134SXin Li# Unless required by applicable law or agreed to in writing, software
10*2abb3134SXin Li# distributed under the License is distributed on an "AS IS" BASIS,
11*2abb3134SXin Li# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12*2abb3134SXin Li# See the License for the specific language governing permissions and
13*2abb3134SXin Li# limitations under the License.
14*2abb3134SXin Li
15*2abb3134SXin Li# Authors: [email protected] (Vasyl Pihur), [email protected] (Giulia Fanti)
16*2abb3134SXin Li
17*2abb3134SXin Lilibrary(RUnit)
18*2abb3134SXin Lisource("analysis/R/encode.R")
19*2abb3134SXin Lisource("analysis/R/decode.R")
20*2abb3134SXin Lisource("analysis/R/simulation.R")
21*2abb3134SXin Lisource("analysis/R/association.R")
22*2abb3134SXin Lisource("analysis/R/fast_em.R")
23*2abb3134SXin Lisource("analysis/R/util.R")
24*2abb3134SXin Li
25*2abb3134SXin LiSamplePopulations <- function(N, num_variables = 1, params,
26*2abb3134SXin Li                              variable_opts) {
27*2abb3134SXin Li  # Samples a number of variables. User specifies the number of variables
28*2abb3134SXin Li  #     and some desired properties of those variables.
29*2abb3134SXin Li  #
30*2abb3134SXin Li  # Args:
31*2abb3134SXin Li  #   N: Number of reports to generate.
32*2abb3134SXin Li  #   params: RAPPOR parameters, like Bloom filter size, number of
33*2abb3134SXin Li  #       hash bits, etc.
34*2abb3134SXin Li  #   variable_opts: List of options for generating the ground truth:
35*2abb3134SXin Li  #       independent = whether distinct variables should be independently drawn
36*2abb3134SXin Li  #       deterministic = whether the variables should be drawn from a
37*2abb3134SXin Li  #           Poisson distribution or uniformly assigned across the range
38*2abb3134SXin Li  #           of 1:num_strings
39*2abb3134SXin Li  #       num_strings: Only does something if deterministic == TRUE, and
40*2abb3134SXin Li  #           specifies how many strings to use in the uniform assignment
41*2abb3134SXin Li  #           of ground truth strings.
42*2abb3134SXin Li  #
43*2abb3134SXin Li  # Returns:
44*2abb3134SXin Li  #   RAPPOR simulated ground truth for each piece of data.
45*2abb3134SXin Li
46*2abb3134SXin Li  m <- params$m
47*2abb3134SXin Li  num_strings <- variable_opts$num_strings
48*2abb3134SXin Li
49*2abb3134SXin Li  if (variable_opts$deterministic) {
50*2abb3134SXin Li    # If a deterministic assignment is desired, evenly distribute
51*2abb3134SXin Li    #     strings across all cohorts.
52*2abb3134SXin Li
53*2abb3134SXin Li    reps <- ceiling(N / num_strings)
54*2abb3134SXin Li    variables <- lapply(1:num_variables,
55*2abb3134SXin Li                        function(i)
56*2abb3134SXin Li                        as.vector(sapply(1:num_strings, function(x)
57*2abb3134SXin Li                                         rep(x, reps)))[1:N])
58*2abb3134SXin Li    cohorts <- lapply(1:num_variables,
59*2abb3134SXin Li                      function(i) rep(1:m, ceiling(N / m))[1:N])
60*2abb3134SXin Li  } else {
61*2abb3134SXin Li    # Otherwise, draw from a Poisson random variable
62*2abb3134SXin Li    variables <- lapply(1:num_variables, function(i) rpois(N, 1) + 1)
63*2abb3134SXin Li
64*2abb3134SXin Li    # Randomly assign cohorts in each dimension
65*2abb3134SXin Li    cohorts <- lapply(1:num_variables,
66*2abb3134SXin Li                      function(i) sample(1:params$m, N, replace = TRUE))
67*2abb3134SXin Li
68*2abb3134SXin Li    if (!variable_opts$independent) {
69*2abb3134SXin Li      # If user wants dependent RVs, subsequent variables are closely correlated
70*2abb3134SXin Li      # with the first variable in the foll. manner:
71*2abb3134SXin Li      #   variable_i ~ variable_1 + (i-1) Bernoulli(0.5)
72*2abb3134SXin Li
73*2abb3134SXin Li      bernoulli_corr <- function(x) {
74*2abb3134SXin Li        variables[[1]] + (x - 1) * sample(c(0, 1), N, replace = TRUE)}
75*2abb3134SXin Li
76*2abb3134SXin Li      variables[2:num_variables] <- lapply(2:num_variables,
77*2abb3134SXin Li                                           function(x) bernoulli_corr(x))
78*2abb3134SXin Li    }
79*2abb3134SXin Li  }
80*2abb3134SXin Li  list(variables = variables, cohorts = cohorts)
81*2abb3134SXin Li}
82*2abb3134SXin Li
83*2abb3134SXin LiSimulate <- function(N, num_variables, params, variable_opts = NULL,
84*2abb3134SXin Li                     truth = NULL, basic = FALSE) {
85*2abb3134SXin Li  if (is.null(truth)) {
86*2abb3134SXin Li    truth <- SamplePopulations(N, num_variables, params,
87*2abb3134SXin Li                               variable_opts)
88*2abb3134SXin Li  }
89*2abb3134SXin Li  strs <- lapply(truth$variables, function(x) sort(seq(max(x))))
90*2abb3134SXin Li  # strs <- lapply(truth$variables, function(x) sort(unique(x)))
91*2abb3134SXin Li  # strs <- lapply(truth$variables, function(x) 1:length(unique(x)))
92*2abb3134SXin Li
93*2abb3134SXin Li  # Construct lists of maps and reports
94*2abb3134SXin Li  if (variable_opts$deterministic) {
95*2abb3134SXin Li    # Build the maps
96*2abb3134SXin Li    map <- CreateMap(strs[[1]], params, FALSE, basic = basic)
97*2abb3134SXin Li    maps <- lapply(1:num_variables, function(x) map)
98*2abb3134SXin Li    # Build the reports
99*2abb3134SXin Li    report <- EncodeAll(truth$variables[[1]], truth$cohorts[[1]],
100*2abb3134SXin Li                        map$map_by_cohort, params)
101*2abb3134SXin Li    reports <- lapply(1:num_variables, function(x) report)
102*2abb3134SXin Li  } else {
103*2abb3134SXin Li    # Build the maps
104*2abb3134SXin Li    maps <- lapply(1:num_variables, function(x)
105*2abb3134SXin Li                   CreateMap(strs[[x]], params, FALSE,
106*2abb3134SXin Li                             basic = basic))
107*2abb3134SXin Li    # Build the reports
108*2abb3134SXin Li    reports <- lapply(1:num_variables, function(x)
109*2abb3134SXin Li                      EncodeAll(truth$variables[[x]], truth$cohorts[[x]],
110*2abb3134SXin Li                                maps[[x]]$map_by_cohort, params))
111*2abb3134SXin Li  }
112*2abb3134SXin Li
113*2abb3134SXin Li  list(reports = reports, cohorts = truth$cohorts,
114*2abb3134SXin Li       truth = truth$variables, maps = maps, strs = strs)
115*2abb3134SXin Li
116*2abb3134SXin Li}
117*2abb3134SXin Li
118*2abb3134SXin Li# ----------------Actual testing starts here--------------- #
119*2abb3134SXin LiTestComputeDistributionEM <- function() {
120*2abb3134SXin Li  # Test various aspects of ComputeDistributionEM in association.R.
121*2abb3134SXin Li  #     Tests include:
122*2abb3134SXin Li  #     Test 1: Compute a joint distribution of uniformly distributed,
123*2abb3134SXin Li  #         perfectly correlated strings
124*2abb3134SXin Li  #     Test 2: Compute a marginal distribution of uniformly distributed strings
125*2abb3134SXin Li  #     Test 3: Check the "other" category estimation works by removing
126*2abb3134SXin Li  #          a string from the known map.
127*2abb3134SXin Li  #     Test 4: Test that the variance from EM algorithm is 1/N when there
128*2abb3134SXin Li  #          is no noise in the system.
129*2abb3134SXin Li  #     Test 5: Check that the right answer is still obtained when f = 0.2.
130*2abb3134SXin Li
131*2abb3134SXin Li  num_variables <- 3
132*2abb3134SXin Li  N <- 100
133*2abb3134SXin Li
134*2abb3134SXin Li  # Initialize the parameters
135*2abb3134SXin Li  params <- list(k = 12, h = 2, m = 4, p = 0, q = 1, f = 0)
136*2abb3134SXin Li  variable_opts <- list(deterministic = TRUE, num_strings = 2,
137*2abb3134SXin Li                        independent = FALSE)
138*2abb3134SXin Li  sim <- Simulate(N, num_variables, params, variable_opts)
139*2abb3134SXin Li
140*2abb3134SXin Li  # Test 1: Delta function pmf
141*2abb3134SXin Li  joint_dist <- ComputeDistributionEM(sim$reports,
142*2abb3134SXin Li                                      sim$cohorts, sim$maps,
143*2abb3134SXin Li                                      ignore_other = TRUE,
144*2abb3134SXin Li                                      params = params,
145*2abb3134SXin Li                                      marginals = NULL,
146*2abb3134SXin Li                                      estimate_var = FALSE)
147*2abb3134SXin Li  # The recovered distribution should be close to the delta function.
148*2abb3134SXin Li  checkTrue(abs(joint_dist$fit["1", "1", "1"] - 0.5) < 0.01)
149*2abb3134SXin Li  checkTrue(abs(joint_dist$fit["2", "2", "2"] - 0.5) < 0.01)
150*2abb3134SXin Li
151*2abb3134SXin Li  # Test 2: Now compute a marginal using EM
152*2abb3134SXin Li  dist <- ComputeDistributionEM(list(sim$reports[[1]]),
153*2abb3134SXin Li                                list(sim$cohorts[[1]]),
154*2abb3134SXin Li                                list(sim$maps[[1]]),
155*2abb3134SXin Li                                ignore_other = TRUE,
156*2abb3134SXin Li                                params = params,
157*2abb3134SXin Li                                marginals = NULL,
158*2abb3134SXin Li                                estimate_var = FALSE)
159*2abb3134SXin Li  checkTrue(abs(dist$fit["1"] - 0.5) < 0.01)
160*2abb3134SXin Li
161*2abb3134SXin Li  # Test 3: Check that the "other" category is correctly computed
162*2abb3134SXin Li  # Build a modified map with no column 2 (i.e. we only know that string
163*2abb3134SXin Li  #     "1" is a valid string
164*2abb3134SXin Li  map <- sim$maps[[1]]
165*2abb3134SXin Li  small_map <- map
166*2abb3134SXin Li
167*2abb3134SXin Li  for (i in 1:params$m) {
168*2abb3134SXin Li    locs <- which(map$map_by_cohort[[i]][, 1])
169*2abb3134SXin Li    small_map$map_by_cohort[[i]] <- sparseMatrix(locs, rep(1, length(locs)),
170*2abb3134SXin Li                                                 dims = c(params$k, 1))
171*2abb3134SXin Li    locs <- which(map$all_cohorts_map[, 1])
172*2abb3134SXin Li    colnames(small_map$map_by_cohort[[i]]) <- sim$strs[1]
173*2abb3134SXin Li  }
174*2abb3134SXin Li  small_map$all_cohorts_map <- do.call("rBind", small_map$map_by_cohort)
175*2abb3134SXin Li
176*2abb3134SXin Li  dist <- ComputeDistributionEM(list(sim$reports[[1]]),
177*2abb3134SXin Li                                list(sim$cohorts[[1]]),
178*2abb3134SXin Li                                list(small_map),
179*2abb3134SXin Li                                ignore_other = FALSE,
180*2abb3134SXin Li                                params = params,
181*2abb3134SXin Li                                marginals = NULL,
182*2abb3134SXin Li                                estimate_var = FALSE)
183*2abb3134SXin Li
184*2abb3134SXin Li  # The recovered distribution should be uniform over 2 strings.
185*2abb3134SXin Li  checkTrue(abs(dist$fit[1] - 0.5) < 0.1)
186*2abb3134SXin Li
187*2abb3134SXin Li  # Test 4: Test the variance is 1/N
188*2abb3134SXin Li  variable_opts <- list(deterministic = TRUE, num_strings = 1)
189*2abb3134SXin Li  sim <- Simulate(N, num_variables = 1, params, variable_opts)
190*2abb3134SXin Li  dist <- ComputeDistributionEM(sim$reports, sim$cohorts,
191*2abb3134SXin Li                                sim$maps, ignore_other = TRUE,
192*2abb3134SXin Li                                params = params, marginals = NULL,
193*2abb3134SXin Li                                estimate_var = TRUE)
194*2abb3134SXin Li
195*2abb3134SXin Li  checkEqualsNumeric(dist$em$var_cov[1, 1], 1 / N)
196*2abb3134SXin Li
197*2abb3134SXin Li  # Test 5: Check that when f=0.2, we still get a good estimate
198*2abb3134SXin Li  params <- list(k = 12, h = 2, m = 2, p = 0, q = 1, f = 0.2)
199*2abb3134SXin Li  variable_opts <- list(deterministic = TRUE, num_strings = 2)
200*2abb3134SXin Li  sim <- Simulate(N, num_variables = 2, params, variable_opts)
201*2abb3134SXin Li  dist <- ComputeDistributionEM(sim$reports, sim$cohorts,
202*2abb3134SXin Li                                sim$maps, ignore_other = TRUE,
203*2abb3134SXin Li                                params = params, marginals = NULL,
204*2abb3134SXin Li                                estimate_var = FALSE)
205*2abb3134SXin Li
206*2abb3134SXin Li  checkTrue(abs(dist$fit["1", "1"] - 0.5) < 0.15)
207*2abb3134SXin Li  checkTrue(abs(dist$fit["2", "2"] - 0.5) < 0.15)
208*2abb3134SXin Li
209*2abb3134SXin Li  # Test 6: Check the computed joint distribution with randomized
210*2abb3134SXin Li  # correlated inputs from the Poisson distribution
211*2abb3134SXin Li  # Expect to have correlation between strings n and n + 1
212*2abb3134SXin Li  N <- 1000
213*2abb3134SXin Li  params <- list(k = 16, h = 2, m = 4, p = 0.1, q = 0.9, f = 0.1)
214*2abb3134SXin Li  variable_opts <- list(deterministic = FALSE, independent = FALSE)
215*2abb3134SXin Li  sim <- Simulate(N, num_variables = 2, params, variable_opts)
216*2abb3134SXin Li  dist <- ComputeDistributionEM(sim$reports, sim$cohorts,
217*2abb3134SXin Li                                sim$maps, ignore_other = TRUE,
218*2abb3134SXin Li                                params = params, marginals = NULL,
219*2abb3134SXin Li                                estimate_var = FALSE)
220*2abb3134SXin Li
221*2abb3134SXin Li  print_dist <- TRUE  # to print joint distribution, set to TRUE
222*2abb3134SXin Li
223*2abb3134SXin Li  if (print_dist) {
224*2abb3134SXin Li    # dist$fit[dist$fit<1e-4] <- 0
225*2abb3134SXin Li    # Sort by row names and column names to visually see correlation
226*2abb3134SXin Li    print(dist$fit[sort(rownames(dist$fit)), sort(colnames(dist$fit))])
227*2abb3134SXin Li  }
228*2abb3134SXin Li
229*2abb3134SXin Li  # Check for correlations (constants chosen heuristically to get good
230*2abb3134SXin Li  # test confidence with small # of samples)
231*2abb3134SXin Li  # Should have mass roughly 1/2e and 1/2e each
232*2abb3134SXin Li  checkTrue(abs(dist$fit["1", "1"] - dist$fit["1", "2"]) < 0.1)
233*2abb3134SXin Li  checkTrue(abs(dist$fit["2", "2"] - dist$fit["2", "3"]) < 0.1)
234*2abb3134SXin Li
235*2abb3134SXin Li  # Should have mass roughly 1/4e and 1/4e each
236*2abb3134SXin Li  checkTrue(abs(dist$fit["3", "3"] - dist$fit["3", "4"]) < 0.06)
237*2abb3134SXin Li
238*2abb3134SXin Li  # Check for lack of probability mass
239*2abb3134SXin Li  checkTrue(dist$fit["1", "3"] < 0.02)
240*2abb3134SXin Li  checkTrue(dist$fit["1", "4"] < 0.02)
241*2abb3134SXin Li  checkTrue(dist$fit["2", "1"] < 0.02)
242*2abb3134SXin Li  checkTrue(dist$fit["2", "4"] < 0.02)
243*2abb3134SXin Li  checkTrue(dist$fit["3", "1"] < 0.02)
244*2abb3134SXin Li  checkTrue(dist$fit["3", "2"] < 0.02)
245*2abb3134SXin Li}
246*2abb3134SXin Li
247*2abb3134SXin LiMakeCondProb <- function() {
248*2abb3134SXin Li  d = matrix(c(1,1,2,2,3,3), nrow=3, ncol=2)
249*2abb3134SXin Li  d = d / sum(d)
250*2abb3134SXin Li
251*2abb3134SXin Li  e = matrix(c(3,3,2,2,1,1), nrow=3, ncol=2)
252*2abb3134SXin Li  e = e / sum(e)
253*2abb3134SXin Li
254*2abb3134SXin Li  list(d, e, d)  # 3 reports
255*2abb3134SXin Li}
256*2abb3134SXin Li
257*2abb3134SXin Li# Test the slow version in R.
258*2abb3134SXin LiRunEmFunction <- function(cond_prob, max_em_iters) {
259*2abb3134SXin Li  cond_prob <- MakeCondProb()
260*2abb3134SXin Li
261*2abb3134SXin Li  # Mechanical test of 4 iterations.  em$hist has 5 elements.
262*2abb3134SXin Li  result <- EM(cond_prob, max_em_iters=max_em_iters)
263*2abb3134SXin Li  result$est
264*2abb3134SXin Li}
265*2abb3134SXin Li
266*2abb3134SXin Li# Run a test of the EM executable
267*2abb3134SXin LiRunEmExecutable <- function(em_executable, cond_prob, max_em_iters) {
268*2abb3134SXin Li  print(cond_prob)
269*2abb3134SXin Li
270*2abb3134SXin Li  if (!file.exists(em_executable)) {
271*2abb3134SXin Li    stop(sprintf("EM executable %s doesn't exist (build it?)", em_executable))
272*2abb3134SXin Li  }
273*2abb3134SXin Li  em_iter_func <- ConstructFastEM(em_executable, "/tmp")
274*2abb3134SXin Li
275*2abb3134SXin Li  result <- em_iter_func(cond_prob, max_em_iters=max_em_iters)
276*2abb3134SXin Li  result$est
277*2abb3134SXin Li}
278*2abb3134SXin Li
279*2abb3134SXin LiTestCppImplementation <- function() {
280*2abb3134SXin Li  cond_prob <- MakeCondProb()
281*2abb3134SXin Li  max_em_iters <- 10
282*2abb3134SXin Li  fit1 <- RunEmFunction(cond_prob, max_em_iters)
283*2abb3134SXin Li
284*2abb3134SXin Li  # Assume we're in the repo root
285*2abb3134SXin Li  em_cpp <- file.path(getwd(), "analysis/cpp/_tmp/fast_em")
286*2abb3134SXin Li  fit2 <- RunEmExecutable(em_cpp, cond_prob, max_em_iters)
287*2abb3134SXin Li
288*2abb3134SXin Li  cpp_diff <- abs(fit1 - fit2)
289*2abb3134SXin Li  print(cpp_diff)
290*2abb3134SXin Li  Log("C++ implementation difference after %d iterations: %e", max_em_iters,
291*2abb3134SXin Li      sum(cpp_diff))
292*2abb3134SXin Li
293*2abb3134SXin Li  # After 10 iterations they should be almost indistinguishable.
294*2abb3134SXin Li  checkTrue(sum(cpp_diff) < 1e-10)
295*2abb3134SXin Li}
296*2abb3134SXin Li
297*2abb3134SXin LiTestTensorFlowImplementation <- function() {
298*2abb3134SXin Li  cond_prob <- MakeCondProb()
299*2abb3134SXin Li  max_em_iters <- 10
300*2abb3134SXin Li  fit1 <- RunEmFunction(cond_prob, max_em_iters)
301*2abb3134SXin Li
302*2abb3134SXin Li  em_tf <- file.path(getwd(), "analysis/tensorflow/fast_em.sh")
303*2abb3134SXin Li  fit2 <- RunEmExecutable(em_tf, cond_prob, max_em_iters)
304*2abb3134SXin Li
305*2abb3134SXin Li  tf_diff <- abs(fit1 - fit2)
306*2abb3134SXin Li  print(tf_diff)
307*2abb3134SXin Li  Log("TensorFlow implementation difference after %d iterations: %e",
308*2abb3134SXin Li      max_em_iters, sum(tf_diff))
309*2abb3134SXin Li
310*2abb3134SXin Li  checkTrue(sum(tf_diff) < 1e-10)
311*2abb3134SXin Li}
312