1*2abb3134SXin Li# Copyright 2014 Google Inc. All rights reserved. 2*2abb3134SXin Li# 3*2abb3134SXin Li# Licensed under the Apache License, Version 2.0 (the "License"); 4*2abb3134SXin Li# you may not use this file except in compliance with the License. 5*2abb3134SXin Li# You may obtain a copy of the License at 6*2abb3134SXin Li# 7*2abb3134SXin Li# http://www.apache.org/licenses/LICENSE-2.0 8*2abb3134SXin Li# 9*2abb3134SXin Li# Unless required by applicable law or agreed to in writing, software 10*2abb3134SXin Li# distributed under the License is distributed on an "AS IS" BASIS, 11*2abb3134SXin Li# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12*2abb3134SXin Li# See the License for the specific language governing permissions and 13*2abb3134SXin Li# limitations under the License. 14*2abb3134SXin Li 15*2abb3134SXin Li# Authors: [email protected] (Vasyl Pihur), [email protected] (Giulia Fanti) 16*2abb3134SXin Li 17*2abb3134SXin Lilibrary(RUnit) 18*2abb3134SXin Lisource("analysis/R/encode.R") 19*2abb3134SXin Lisource("analysis/R/decode.R") 20*2abb3134SXin Lisource("analysis/R/simulation.R") 21*2abb3134SXin Lisource("analysis/R/association.R") 22*2abb3134SXin Lisource("analysis/R/fast_em.R") 23*2abb3134SXin Lisource("analysis/R/util.R") 24*2abb3134SXin Li 25*2abb3134SXin LiSamplePopulations <- function(N, num_variables = 1, params, 26*2abb3134SXin Li variable_opts) { 27*2abb3134SXin Li # Samples a number of variables. User specifies the number of variables 28*2abb3134SXin Li # and some desired properties of those variables. 29*2abb3134SXin Li # 30*2abb3134SXin Li # Args: 31*2abb3134SXin Li # N: Number of reports to generate. 32*2abb3134SXin Li # params: RAPPOR parameters, like Bloom filter size, number of 33*2abb3134SXin Li # hash bits, etc. 34*2abb3134SXin Li # variable_opts: List of options for generating the ground truth: 35*2abb3134SXin Li # independent = whether distinct variables should be independently drawn 36*2abb3134SXin Li # deterministic = whether the variables should be drawn from a 37*2abb3134SXin Li # Poisson distribution or uniformly assigned across the range 38*2abb3134SXin Li # of 1:num_strings 39*2abb3134SXin Li # num_strings: Only does something if deterministic == TRUE, and 40*2abb3134SXin Li # specifies how many strings to use in the uniform assignment 41*2abb3134SXin Li # of ground truth strings. 42*2abb3134SXin Li # 43*2abb3134SXin Li # Returns: 44*2abb3134SXin Li # RAPPOR simulated ground truth for each piece of data. 45*2abb3134SXin Li 46*2abb3134SXin Li m <- params$m 47*2abb3134SXin Li num_strings <- variable_opts$num_strings 48*2abb3134SXin Li 49*2abb3134SXin Li if (variable_opts$deterministic) { 50*2abb3134SXin Li # If a deterministic assignment is desired, evenly distribute 51*2abb3134SXin Li # strings across all cohorts. 52*2abb3134SXin Li 53*2abb3134SXin Li reps <- ceiling(N / num_strings) 54*2abb3134SXin Li variables <- lapply(1:num_variables, 55*2abb3134SXin Li function(i) 56*2abb3134SXin Li as.vector(sapply(1:num_strings, function(x) 57*2abb3134SXin Li rep(x, reps)))[1:N]) 58*2abb3134SXin Li cohorts <- lapply(1:num_variables, 59*2abb3134SXin Li function(i) rep(1:m, ceiling(N / m))[1:N]) 60*2abb3134SXin Li } else { 61*2abb3134SXin Li # Otherwise, draw from a Poisson random variable 62*2abb3134SXin Li variables <- lapply(1:num_variables, function(i) rpois(N, 1) + 1) 63*2abb3134SXin Li 64*2abb3134SXin Li # Randomly assign cohorts in each dimension 65*2abb3134SXin Li cohorts <- lapply(1:num_variables, 66*2abb3134SXin Li function(i) sample(1:params$m, N, replace = TRUE)) 67*2abb3134SXin Li 68*2abb3134SXin Li if (!variable_opts$independent) { 69*2abb3134SXin Li # If user wants dependent RVs, subsequent variables are closely correlated 70*2abb3134SXin Li # with the first variable in the foll. manner: 71*2abb3134SXin Li # variable_i ~ variable_1 + (i-1) Bernoulli(0.5) 72*2abb3134SXin Li 73*2abb3134SXin Li bernoulli_corr <- function(x) { 74*2abb3134SXin Li variables[[1]] + (x - 1) * sample(c(0, 1), N, replace = TRUE)} 75*2abb3134SXin Li 76*2abb3134SXin Li variables[2:num_variables] <- lapply(2:num_variables, 77*2abb3134SXin Li function(x) bernoulli_corr(x)) 78*2abb3134SXin Li } 79*2abb3134SXin Li } 80*2abb3134SXin Li list(variables = variables, cohorts = cohorts) 81*2abb3134SXin Li} 82*2abb3134SXin Li 83*2abb3134SXin LiSimulate <- function(N, num_variables, params, variable_opts = NULL, 84*2abb3134SXin Li truth = NULL, basic = FALSE) { 85*2abb3134SXin Li if (is.null(truth)) { 86*2abb3134SXin Li truth <- SamplePopulations(N, num_variables, params, 87*2abb3134SXin Li variable_opts) 88*2abb3134SXin Li } 89*2abb3134SXin Li strs <- lapply(truth$variables, function(x) sort(seq(max(x)))) 90*2abb3134SXin Li # strs <- lapply(truth$variables, function(x) sort(unique(x))) 91*2abb3134SXin Li # strs <- lapply(truth$variables, function(x) 1:length(unique(x))) 92*2abb3134SXin Li 93*2abb3134SXin Li # Construct lists of maps and reports 94*2abb3134SXin Li if (variable_opts$deterministic) { 95*2abb3134SXin Li # Build the maps 96*2abb3134SXin Li map <- CreateMap(strs[[1]], params, FALSE, basic = basic) 97*2abb3134SXin Li maps <- lapply(1:num_variables, function(x) map) 98*2abb3134SXin Li # Build the reports 99*2abb3134SXin Li report <- EncodeAll(truth$variables[[1]], truth$cohorts[[1]], 100*2abb3134SXin Li map$map_by_cohort, params) 101*2abb3134SXin Li reports <- lapply(1:num_variables, function(x) report) 102*2abb3134SXin Li } else { 103*2abb3134SXin Li # Build the maps 104*2abb3134SXin Li maps <- lapply(1:num_variables, function(x) 105*2abb3134SXin Li CreateMap(strs[[x]], params, FALSE, 106*2abb3134SXin Li basic = basic)) 107*2abb3134SXin Li # Build the reports 108*2abb3134SXin Li reports <- lapply(1:num_variables, function(x) 109*2abb3134SXin Li EncodeAll(truth$variables[[x]], truth$cohorts[[x]], 110*2abb3134SXin Li maps[[x]]$map_by_cohort, params)) 111*2abb3134SXin Li } 112*2abb3134SXin Li 113*2abb3134SXin Li list(reports = reports, cohorts = truth$cohorts, 114*2abb3134SXin Li truth = truth$variables, maps = maps, strs = strs) 115*2abb3134SXin Li 116*2abb3134SXin Li} 117*2abb3134SXin Li 118*2abb3134SXin Li# ----------------Actual testing starts here--------------- # 119*2abb3134SXin LiTestComputeDistributionEM <- function() { 120*2abb3134SXin Li # Test various aspects of ComputeDistributionEM in association.R. 121*2abb3134SXin Li # Tests include: 122*2abb3134SXin Li # Test 1: Compute a joint distribution of uniformly distributed, 123*2abb3134SXin Li # perfectly correlated strings 124*2abb3134SXin Li # Test 2: Compute a marginal distribution of uniformly distributed strings 125*2abb3134SXin Li # Test 3: Check the "other" category estimation works by removing 126*2abb3134SXin Li # a string from the known map. 127*2abb3134SXin Li # Test 4: Test that the variance from EM algorithm is 1/N when there 128*2abb3134SXin Li # is no noise in the system. 129*2abb3134SXin Li # Test 5: Check that the right answer is still obtained when f = 0.2. 130*2abb3134SXin Li 131*2abb3134SXin Li num_variables <- 3 132*2abb3134SXin Li N <- 100 133*2abb3134SXin Li 134*2abb3134SXin Li # Initialize the parameters 135*2abb3134SXin Li params <- list(k = 12, h = 2, m = 4, p = 0, q = 1, f = 0) 136*2abb3134SXin Li variable_opts <- list(deterministic = TRUE, num_strings = 2, 137*2abb3134SXin Li independent = FALSE) 138*2abb3134SXin Li sim <- Simulate(N, num_variables, params, variable_opts) 139*2abb3134SXin Li 140*2abb3134SXin Li # Test 1: Delta function pmf 141*2abb3134SXin Li joint_dist <- ComputeDistributionEM(sim$reports, 142*2abb3134SXin Li sim$cohorts, sim$maps, 143*2abb3134SXin Li ignore_other = TRUE, 144*2abb3134SXin Li params = params, 145*2abb3134SXin Li marginals = NULL, 146*2abb3134SXin Li estimate_var = FALSE) 147*2abb3134SXin Li # The recovered distribution should be close to the delta function. 148*2abb3134SXin Li checkTrue(abs(joint_dist$fit["1", "1", "1"] - 0.5) < 0.01) 149*2abb3134SXin Li checkTrue(abs(joint_dist$fit["2", "2", "2"] - 0.5) < 0.01) 150*2abb3134SXin Li 151*2abb3134SXin Li # Test 2: Now compute a marginal using EM 152*2abb3134SXin Li dist <- ComputeDistributionEM(list(sim$reports[[1]]), 153*2abb3134SXin Li list(sim$cohorts[[1]]), 154*2abb3134SXin Li list(sim$maps[[1]]), 155*2abb3134SXin Li ignore_other = TRUE, 156*2abb3134SXin Li params = params, 157*2abb3134SXin Li marginals = NULL, 158*2abb3134SXin Li estimate_var = FALSE) 159*2abb3134SXin Li checkTrue(abs(dist$fit["1"] - 0.5) < 0.01) 160*2abb3134SXin Li 161*2abb3134SXin Li # Test 3: Check that the "other" category is correctly computed 162*2abb3134SXin Li # Build a modified map with no column 2 (i.e. we only know that string 163*2abb3134SXin Li # "1" is a valid string 164*2abb3134SXin Li map <- sim$maps[[1]] 165*2abb3134SXin Li small_map <- map 166*2abb3134SXin Li 167*2abb3134SXin Li for (i in 1:params$m) { 168*2abb3134SXin Li locs <- which(map$map_by_cohort[[i]][, 1]) 169*2abb3134SXin Li small_map$map_by_cohort[[i]] <- sparseMatrix(locs, rep(1, length(locs)), 170*2abb3134SXin Li dims = c(params$k, 1)) 171*2abb3134SXin Li locs <- which(map$all_cohorts_map[, 1]) 172*2abb3134SXin Li colnames(small_map$map_by_cohort[[i]]) <- sim$strs[1] 173*2abb3134SXin Li } 174*2abb3134SXin Li small_map$all_cohorts_map <- do.call("rBind", small_map$map_by_cohort) 175*2abb3134SXin Li 176*2abb3134SXin Li dist <- ComputeDistributionEM(list(sim$reports[[1]]), 177*2abb3134SXin Li list(sim$cohorts[[1]]), 178*2abb3134SXin Li list(small_map), 179*2abb3134SXin Li ignore_other = FALSE, 180*2abb3134SXin Li params = params, 181*2abb3134SXin Li marginals = NULL, 182*2abb3134SXin Li estimate_var = FALSE) 183*2abb3134SXin Li 184*2abb3134SXin Li # The recovered distribution should be uniform over 2 strings. 185*2abb3134SXin Li checkTrue(abs(dist$fit[1] - 0.5) < 0.1) 186*2abb3134SXin Li 187*2abb3134SXin Li # Test 4: Test the variance is 1/N 188*2abb3134SXin Li variable_opts <- list(deterministic = TRUE, num_strings = 1) 189*2abb3134SXin Li sim <- Simulate(N, num_variables = 1, params, variable_opts) 190*2abb3134SXin Li dist <- ComputeDistributionEM(sim$reports, sim$cohorts, 191*2abb3134SXin Li sim$maps, ignore_other = TRUE, 192*2abb3134SXin Li params = params, marginals = NULL, 193*2abb3134SXin Li estimate_var = TRUE) 194*2abb3134SXin Li 195*2abb3134SXin Li checkEqualsNumeric(dist$em$var_cov[1, 1], 1 / N) 196*2abb3134SXin Li 197*2abb3134SXin Li # Test 5: Check that when f=0.2, we still get a good estimate 198*2abb3134SXin Li params <- list(k = 12, h = 2, m = 2, p = 0, q = 1, f = 0.2) 199*2abb3134SXin Li variable_opts <- list(deterministic = TRUE, num_strings = 2) 200*2abb3134SXin Li sim <- Simulate(N, num_variables = 2, params, variable_opts) 201*2abb3134SXin Li dist <- ComputeDistributionEM(sim$reports, sim$cohorts, 202*2abb3134SXin Li sim$maps, ignore_other = TRUE, 203*2abb3134SXin Li params = params, marginals = NULL, 204*2abb3134SXin Li estimate_var = FALSE) 205*2abb3134SXin Li 206*2abb3134SXin Li checkTrue(abs(dist$fit["1", "1"] - 0.5) < 0.15) 207*2abb3134SXin Li checkTrue(abs(dist$fit["2", "2"] - 0.5) < 0.15) 208*2abb3134SXin Li 209*2abb3134SXin Li # Test 6: Check the computed joint distribution with randomized 210*2abb3134SXin Li # correlated inputs from the Poisson distribution 211*2abb3134SXin Li # Expect to have correlation between strings n and n + 1 212*2abb3134SXin Li N <- 1000 213*2abb3134SXin Li params <- list(k = 16, h = 2, m = 4, p = 0.1, q = 0.9, f = 0.1) 214*2abb3134SXin Li variable_opts <- list(deterministic = FALSE, independent = FALSE) 215*2abb3134SXin Li sim <- Simulate(N, num_variables = 2, params, variable_opts) 216*2abb3134SXin Li dist <- ComputeDistributionEM(sim$reports, sim$cohorts, 217*2abb3134SXin Li sim$maps, ignore_other = TRUE, 218*2abb3134SXin Li params = params, marginals = NULL, 219*2abb3134SXin Li estimate_var = FALSE) 220*2abb3134SXin Li 221*2abb3134SXin Li print_dist <- TRUE # to print joint distribution, set to TRUE 222*2abb3134SXin Li 223*2abb3134SXin Li if (print_dist) { 224*2abb3134SXin Li # dist$fit[dist$fit<1e-4] <- 0 225*2abb3134SXin Li # Sort by row names and column names to visually see correlation 226*2abb3134SXin Li print(dist$fit[sort(rownames(dist$fit)), sort(colnames(dist$fit))]) 227*2abb3134SXin Li } 228*2abb3134SXin Li 229*2abb3134SXin Li # Check for correlations (constants chosen heuristically to get good 230*2abb3134SXin Li # test confidence with small # of samples) 231*2abb3134SXin Li # Should have mass roughly 1/2e and 1/2e each 232*2abb3134SXin Li checkTrue(abs(dist$fit["1", "1"] - dist$fit["1", "2"]) < 0.1) 233*2abb3134SXin Li checkTrue(abs(dist$fit["2", "2"] - dist$fit["2", "3"]) < 0.1) 234*2abb3134SXin Li 235*2abb3134SXin Li # Should have mass roughly 1/4e and 1/4e each 236*2abb3134SXin Li checkTrue(abs(dist$fit["3", "3"] - dist$fit["3", "4"]) < 0.06) 237*2abb3134SXin Li 238*2abb3134SXin Li # Check for lack of probability mass 239*2abb3134SXin Li checkTrue(dist$fit["1", "3"] < 0.02) 240*2abb3134SXin Li checkTrue(dist$fit["1", "4"] < 0.02) 241*2abb3134SXin Li checkTrue(dist$fit["2", "1"] < 0.02) 242*2abb3134SXin Li checkTrue(dist$fit["2", "4"] < 0.02) 243*2abb3134SXin Li checkTrue(dist$fit["3", "1"] < 0.02) 244*2abb3134SXin Li checkTrue(dist$fit["3", "2"] < 0.02) 245*2abb3134SXin Li} 246*2abb3134SXin Li 247*2abb3134SXin LiMakeCondProb <- function() { 248*2abb3134SXin Li d = matrix(c(1,1,2,2,3,3), nrow=3, ncol=2) 249*2abb3134SXin Li d = d / sum(d) 250*2abb3134SXin Li 251*2abb3134SXin Li e = matrix(c(3,3,2,2,1,1), nrow=3, ncol=2) 252*2abb3134SXin Li e = e / sum(e) 253*2abb3134SXin Li 254*2abb3134SXin Li list(d, e, d) # 3 reports 255*2abb3134SXin Li} 256*2abb3134SXin Li 257*2abb3134SXin Li# Test the slow version in R. 258*2abb3134SXin LiRunEmFunction <- function(cond_prob, max_em_iters) { 259*2abb3134SXin Li cond_prob <- MakeCondProb() 260*2abb3134SXin Li 261*2abb3134SXin Li # Mechanical test of 4 iterations. em$hist has 5 elements. 262*2abb3134SXin Li result <- EM(cond_prob, max_em_iters=max_em_iters) 263*2abb3134SXin Li result$est 264*2abb3134SXin Li} 265*2abb3134SXin Li 266*2abb3134SXin Li# Run a test of the EM executable 267*2abb3134SXin LiRunEmExecutable <- function(em_executable, cond_prob, max_em_iters) { 268*2abb3134SXin Li print(cond_prob) 269*2abb3134SXin Li 270*2abb3134SXin Li if (!file.exists(em_executable)) { 271*2abb3134SXin Li stop(sprintf("EM executable %s doesn't exist (build it?)", em_executable)) 272*2abb3134SXin Li } 273*2abb3134SXin Li em_iter_func <- ConstructFastEM(em_executable, "/tmp") 274*2abb3134SXin Li 275*2abb3134SXin Li result <- em_iter_func(cond_prob, max_em_iters=max_em_iters) 276*2abb3134SXin Li result$est 277*2abb3134SXin Li} 278*2abb3134SXin Li 279*2abb3134SXin LiTestCppImplementation <- function() { 280*2abb3134SXin Li cond_prob <- MakeCondProb() 281*2abb3134SXin Li max_em_iters <- 10 282*2abb3134SXin Li fit1 <- RunEmFunction(cond_prob, max_em_iters) 283*2abb3134SXin Li 284*2abb3134SXin Li # Assume we're in the repo root 285*2abb3134SXin Li em_cpp <- file.path(getwd(), "analysis/cpp/_tmp/fast_em") 286*2abb3134SXin Li fit2 <- RunEmExecutable(em_cpp, cond_prob, max_em_iters) 287*2abb3134SXin Li 288*2abb3134SXin Li cpp_diff <- abs(fit1 - fit2) 289*2abb3134SXin Li print(cpp_diff) 290*2abb3134SXin Li Log("C++ implementation difference after %d iterations: %e", max_em_iters, 291*2abb3134SXin Li sum(cpp_diff)) 292*2abb3134SXin Li 293*2abb3134SXin Li # After 10 iterations they should be almost indistinguishable. 294*2abb3134SXin Li checkTrue(sum(cpp_diff) < 1e-10) 295*2abb3134SXin Li} 296*2abb3134SXin Li 297*2abb3134SXin LiTestTensorFlowImplementation <- function() { 298*2abb3134SXin Li cond_prob <- MakeCondProb() 299*2abb3134SXin Li max_em_iters <- 10 300*2abb3134SXin Li fit1 <- RunEmFunction(cond_prob, max_em_iters) 301*2abb3134SXin Li 302*2abb3134SXin Li em_tf <- file.path(getwd(), "analysis/tensorflow/fast_em.sh") 303*2abb3134SXin Li fit2 <- RunEmExecutable(em_tf, cond_prob, max_em_iters) 304*2abb3134SXin Li 305*2abb3134SXin Li tf_diff <- abs(fit1 - fit2) 306*2abb3134SXin Li print(tf_diff) 307*2abb3134SXin Li Log("TensorFlow implementation difference after %d iterations: %e", 308*2abb3134SXin Li max_em_iters, sum(tf_diff)) 309*2abb3134SXin Li 310*2abb3134SXin Li checkTrue(sum(tf_diff) < 1e-10) 311*2abb3134SXin Li} 312