1*2abb3134SXin Li#!/usr/bin/Rscript 2*2abb3134SXin Li# Copyright 2014 Google Inc. All rights reserved. 3*2abb3134SXin Li# 4*2abb3134SXin Li# Licensed under the Apache License, Version 2.0 (the "License"); 5*2abb3134SXin Li# you may not use this file except in compliance with the License. 6*2abb3134SXin Li# You may obtain a copy of the License at 7*2abb3134SXin Li# 8*2abb3134SXin Li# http://www.apache.org/licenses/LICENSE-2.0 9*2abb3134SXin Li# 10*2abb3134SXin Li# Unless required by applicable law or agreed to in writing, software 11*2abb3134SXin Li# distributed under the License is distributed on an "AS IS" BASIS, 12*2abb3134SXin Li# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13*2abb3134SXin Li# See the License for the specific language governing permissions and 14*2abb3134SXin Li# limitations under the License. 15*2abb3134SXin Li 16*2abb3134SXin Lilibrary(RUnit) 17*2abb3134SXin Lilibrary(abind) 18*2abb3134SXin Li 19*2abb3134SXin Lisource('analysis/R/decode.R') 20*2abb3134SXin Lisource('tests/gen_counts.R') 21*2abb3134SXin Li 22*2abb3134SXin LiL1Distance <- function(X, Y) { 23*2abb3134SXin Li # Computes the L1 distance between two named vectors 24*2abb3134SXin Li common <- intersect(names(X), names(Y)) 25*2abb3134SXin Li 26*2abb3134SXin Li L1_intersect <- sum(abs(X[common] - Y[common])) 27*2abb3134SXin Li L1_X_minus_Y <- sum(X[!names(X) %in% common]) 28*2abb3134SXin Li L1_Y_minus_X <- sum(Y[!names(Y) %in% common]) 29*2abb3134SXin Li 30*2abb3134SXin Li (L1_intersect + L1_X_minus_Y + L1_Y_minus_X) / 2 31*2abb3134SXin Li} 32*2abb3134SXin Li 33*2abb3134SXin LiLInfDistance <- function(X, Y) { 34*2abb3134SXin Li # Computes the L_infinity distance between two named vectors 35*2abb3134SXin Li common <- intersect(names(X), names(Y)) 36*2abb3134SXin Li 37*2abb3134SXin Li max(abs(X[common] - Y[common]), 38*2abb3134SXin Li abs(X[!names(X) %in% common]), 39*2abb3134SXin Li abs(Y[!names(Y) %in% common])) 40*2abb3134SXin Li} 41*2abb3134SXin Li 42*2abb3134SXin LiMatrixVectorMerge <- function(mat, vec) { 43*2abb3134SXin Li # Attaches a vector to a matrix, matching corresponding column names 44*2abb3134SXin Li 45*2abb3134SXin Li mat_only <- setdiff(colnames(mat), names(vec)) 46*2abb3134SXin Li vec_only <- setdiff(names(vec), colnames(mat)) 47*2abb3134SXin Li 48*2abb3134SXin Li # extend the vector with missing columns 49*2abb3134SXin Li vec_long <- c(vec, setNames(rep(NA, length(mat_only)), mat_only)) 50*2abb3134SXin Li 51*2abb3134SXin Li # extend the matrix with missing columns 52*2abb3134SXin Li newcols <- matrix(NA, nrow = nrow(mat), ncol = length(vec_only)) 53*2abb3134SXin Li colnames(newcols) <- vec_only 54*2abb3134SXin Li mat_long <- cbind(mat, newcols) 55*2abb3134SXin Li 56*2abb3134SXin Li # Now vec and mat have the same columns, but in the wrong order. Sort the 57*2abb3134SXin Li # columns lexicographically. 58*2abb3134SXin Li if(length(vec_long) > 0) { 59*2abb3134SXin Li mat_long <- mat_long[, order(colnames(mat_long)), drop = FALSE] 60*2abb3134SXin Li vec_long <- vec_long[order(names(vec_long))] 61*2abb3134SXin Li } 62*2abb3134SXin Li 63*2abb3134SXin Li rbind(mat_long, vec_long) 64*2abb3134SXin Li} 65*2abb3134SXin Li 66*2abb3134SXin LiRunMultipleTests <- function(title, fun, repetitions, ...) { 67*2abb3134SXin Li # Run a function with an annotated progress indicator. The function's outputs 68*2abb3134SXin Li # are concatenated and returned as a list of length repetitions. 69*2abb3134SXin Li cat(title, ": ") 70*2abb3134SXin Li 71*2abb3134SXin Li if(repetitions == 1) { 72*2abb3134SXin Li # only run once 73*2abb3134SXin Li results <- list(fun(...)) 74*2abb3134SXin Li 75*2abb3134SXin Li cat(" Done.\n") 76*2abb3134SXin Li } else { # run multiple times 77*2abb3134SXin Li pb <- txtProgressBar(min = 0, max = repetitions, 78*2abb3134SXin Li width = getOption("width") - 20 - nchar(title)) 79*2abb3134SXin Li 80*2abb3134SXin Li results <- vector(mode = "list", repetitions) 81*2abb3134SXin Li for(i in 1:repetitions) { 82*2abb3134SXin Li setTxtProgressBar(pb, i) 83*2abb3134SXin Li results[[i]] <- fun(...) 84*2abb3134SXin Li } 85*2abb3134SXin Li cat(" Done.") 86*2abb3134SXin Li close(pb) 87*2abb3134SXin Li } 88*2abb3134SXin Li 89*2abb3134SXin Li results 90*2abb3134SXin Li} 91*2abb3134SXin Li 92*2abb3134SXin LiCheckEstimatesAndStdsHelper <- function(params, map, pdf, total) { 93*2abb3134SXin Li # Helper function for TestEstimateBloomCounts. 94*2abb3134SXin Li partition <- RandomPartition(total, pdf) 95*2abb3134SXin Li counts <- GenerateCounts(params, map, partition, 1) 96*2abb3134SXin Li 97*2abb3134SXin Li EstimateBloomCounts(params, counts) 98*2abb3134SXin Li} 99*2abb3134SXin Li 100*2abb3134SXin LiCheckEstimatesAndStds <- function(repetitions, title, params, map, pdf, total) { 101*2abb3134SXin Li # Checks that the expectations returned by EstimateBloomCounts on simulated 102*2abb3134SXin Li # inputs match the ground truth and the empirical standard deviation matches 103*2abb3134SXin Li # EstimateBloomCounts outputs. 104*2abb3134SXin Li # 105*2abb3134SXin Li # Input: 106*2abb3134SXin Li # repetitions: the number of runs ofEstimateBloomCounts 107*2abb3134SXin Li # title: label 108*2abb3134SXin Li # params: params vector 109*2abb3134SXin Li # map: the map table 110*2abb3134SXin Li # pdf: probability density function of the distribution from which simulated 111*2abb3134SXin Li # clients are sampled 112*2abb3134SXin Li # total: number of reports 113*2abb3134SXin Li 114*2abb3134SXin Li results <- RunMultipleTests(title, CheckEstimatesAndStdsHelper, repetitions, 115*2abb3134SXin Li params, map, pdf, total) 116*2abb3134SXin Li 117*2abb3134SXin Li estimates <- abind(lapply(results, function(r) r$estimates), along = 3) 118*2abb3134SXin Li stds <- abind(lapply(results, function(r) r$stds), along = 3) 119*2abb3134SXin Li 120*2abb3134SXin Li ave_e <- apply(estimates, 1:2, mean) 121*2abb3134SXin Li observed_stds <- apply(estimates, 1:2, sd) 122*2abb3134SXin Li ave_stds <- apply(stds, 1:2, mean) 123*2abb3134SXin Li 124*2abb3134SXin Li ground_truth <- matrix(map %*% pdf, nrow = params$m, byrow = TRUE) 125*2abb3134SXin Li 126*2abb3134SXin Li checkTrue(!any(abs(ave_e - ground_truth) > 1E-9 + # tolerance level 127*2abb3134SXin Li (ave_stds / repetitions^.5) * 5), 128*2abb3134SXin Li "Averages deviate too much from expectations.") 129*2abb3134SXin Li 130*2abb3134SXin Li checkTrue(!any(observed_stds > ave_stds * (1 + 5 * repetitions^.5)), 131*2abb3134SXin Li "Expected standard deviations are too high") 132*2abb3134SXin Li 133*2abb3134SXin Li checkTrue(!any(observed_stds < ave_stds * (1 - 5 * repetitions^.5)), 134*2abb3134SXin Li "Expected standard deviations are too low") 135*2abb3134SXin Li} 136*2abb3134SXin Li 137*2abb3134SXin LiTestEstimateBloomCounts <- function() { 138*2abb3134SXin Li # Unit tests for the EstimateBloomCounts function. 139*2abb3134SXin Li 140*2abb3134SXin Li report4x2 <- list(k = 4, m = 2) # 2 cohorts, 4 bits each 141*2abb3134SXin Li map0 <- Matrix(0, nrow = 8, ncol = 3, sparse = TRUE) # 3 possible values 142*2abb3134SXin Li map0[1,] <- c(1, 0, 0) 143*2abb3134SXin Li map0[2,] <- c(0, 1, 0) 144*2abb3134SXin Li map0[3,] <- c(0, 0, 1) 145*2abb3134SXin Li map0[4,] <- c(1, 1, 1) # 4th bit of the first cohort gets signal from all 146*2abb3134SXin Li map0[5,] <- c(0, 0, 1) # 1st bit of the second cohort gets signal from v3 147*2abb3134SXin Li 148*2abb3134SXin Li colnames(map0) <- c('v1', 'v2', 'v3') 149*2abb3134SXin Li 150*2abb3134SXin Li pdf0 <- c(1/2, 1/3, 1/6) 151*2abb3134SXin Li names(pdf0) <- colnames(map0) 152*2abb3134SXin Li 153*2abb3134SXin Li noise0 <- list(p = 0, q = 1, f = 0) # no noise at all 154*2abb3134SXin Li 155*2abb3134SXin Li CheckEstimatesAndStds(repetitions = 1000, "Testing estimates and stds (1/3)", 156*2abb3134SXin Li c(report4x2, noise0), map0, pdf0, 100) 157*2abb3134SXin Li 158*2abb3134SXin Li noise1 <- list(p = 0.4, q = .6, f = 0.5) 159*2abb3134SXin Li CheckEstimatesAndStds(repetitions = 1000, "Testing estimates and stds (2/3)", 160*2abb3134SXin Li c(report4x2, noise1), map0, pdf0, 100) 161*2abb3134SXin Li 162*2abb3134SXin Li # MEDIUM TEST: 100 values, 32 cohorts, 8 bits each, 10^6 reports 163*2abb3134SXin Li values <- 100 164*2abb3134SXin Li 165*2abb3134SXin Li report8x32 <- list(k = 8, m = 32) # 32 cohorts, 8 bits each 166*2abb3134SXin Li 167*2abb3134SXin Li map1 <- matrix(rbinom(32 * 8 * values, 1, .25), nrow = 32 * 8, ncol = values) 168*2abb3134SXin Li 169*2abb3134SXin Li colnames(map1) <- sprintf("v%d", 1:values) 170*2abb3134SXin Li 171*2abb3134SXin Li pdf1 <- ComputePdf("zipf1", values) 172*2abb3134SXin Li 173*2abb3134SXin Li CheckEstimatesAndStds(repetitions = 100, "Testing estimates and stds (3/3)", 174*2abb3134SXin Li c(report8x32, noise1), map1, pdf1, 10^9) 175*2abb3134SXin Li} 176*2abb3134SXin Li 177*2abb3134SXin LiCheckDecodeHelper <- function(params, map, pdf, num_clients, 178*2abb3134SXin Li tolerance_l1, tolerance_linf) { 179*2abb3134SXin Li # Helper function for TestDecode. Simulates a RAPPOR run and checks results of 180*2abb3134SXin Li # Decode's output against the ground truth. Output is returned as a list. 181*2abb3134SXin Li 182*2abb3134SXin Li partition <- RandomPartition(num_clients, pdf) 183*2abb3134SXin Li counts <- GenerateCounts(params, map, partition, 1) 184*2abb3134SXin Li total <- sum(partition) 185*2abb3134SXin Li 186*2abb3134SXin Li decoded <- Decode(counts, map, params, quiet = TRUE) 187*2abb3134SXin Li 188*2abb3134SXin Li decoded_partition <- setNames(decoded$fit$estimate, decoded$fit$string) 189*2abb3134SXin Li 190*2abb3134SXin Li checkTrue(L1Distance(decoded_partition, partition) < total^.5 * tolerance_l1, 191*2abb3134SXin Li sprintf("L1 distance is too large: \ 192*2abb3134SXin Li L1Distance = %f, total^0.5 * tolerance_l1 = %f", 193*2abb3134SXin Li L1Distance(decoded_partition, partition), 194*2abb3134SXin Li total^0.5 * tolerance_l1)) 195*2abb3134SXin Li 196*2abb3134SXin Li checkTrue(LInfDistance(decoded_partition, partition) < 197*2abb3134SXin Li max(partition)^.5 * tolerance_linf, 198*2abb3134SXin Li sprintf("L_inf distance is too large: \ 199*2abb3134SXin Li L1Distance = %f, max(partition)^0.5 * tolerance_linf = %f", 200*2abb3134SXin Li L1Distance(decoded_partition, partition), 201*2abb3134SXin Li max(partition)^0.5 * tolerance_linf)) 202*2abb3134SXin Li 203*2abb3134SXin Li list(estimates = decoded_partition, 204*2abb3134SXin Li stds = setNames(decoded$fit$std_error, decoded$fit$string)) 205*2abb3134SXin Li} 206*2abb3134SXin Li 207*2abb3134SXin LiCheckDecodeAveAndStds <- function(...) { 208*2abb3134SXin Li # Runs Decode multiple times (specified by the repetition argument), checks 209*2abb3134SXin Li # individuals runs against the ground truth, and the estimates of the standard 210*2abb3134SXin Li # error against empirical observations. 211*2abb3134SXin Li 212*2abb3134SXin Li results <- RunMultipleTests(...) 213*2abb3134SXin Li 214*2abb3134SXin Li estimates <- matrix(nrow = 0, ncol = 0) 215*2abb3134SXin Li lapply(results, function(r) MatrixVectorMerge(estimates, r$estimates)) 216*2abb3134SXin Li 217*2abb3134SXin Li stds <- matrix(nrow = 0, ncol = 0) 218*2abb3134SXin Li lapply(results, function(r) MatrixVectorMerge(stds, r$stds)) 219*2abb3134SXin Li 220*2abb3134SXin Li empirical_stds <- apply(estimates, 2, sd, na.rm = TRUE) 221*2abb3134SXin Li estimated_stds <- apply(stds, 2, mean, na.rm = TRUE) 222*2abb3134SXin Li 223*2abb3134SXin Li if(dim(estimates)[1] > 1) { 224*2abb3134SXin Li checkTrue(any(estimated_stds > empirical_stds / 2), 225*2abb3134SXin Li "Our estimate for the standard deviation is too low") 226*2abb3134SXin Li 227*2abb3134SXin Li checkTrue(any(estimated_stds < empirical_stds * 3), 228*2abb3134SXin Li "Our estimate for the standard deviation is too high") 229*2abb3134SXin Li } 230*2abb3134SXin Li} 231*2abb3134SXin Li 232*2abb3134SXin LiTestDecode <- function() { 233*2abb3134SXin Li # Unit tests for the Decode function. 234*2abb3134SXin Li 235*2abb3134SXin Li # TOY TESTS: three values, 2 cohorts, 4 bits each 236*2abb3134SXin Li 237*2abb3134SXin Li params_4x2 <- list(k = 4, m = 2, h = 2) # 2 cohorts, 4 bits each 238*2abb3134SXin Li map0 <- Matrix(0, nrow = 8, ncol = 3, sparse = TRUE) # 3 possible values 239*2abb3134SXin Li map0[1,] <- c(1, 0, 0) 240*2abb3134SXin Li map0[2,] <- c(0, 1, 0) 241*2abb3134SXin Li map0[3,] <- c(0, 0, 1) 242*2abb3134SXin Li map0[4,] <- c(1, 1, 1) # 4th bit of the first cohort gets signal from all 243*2abb3134SXin Li map0[5,] <- c(0, 0, 1) # 1st bit of the second cohort gets signal from v3 244*2abb3134SXin Li 245*2abb3134SXin Li colnames(map0) <- c('v1', 'v2', 'v3') 246*2abb3134SXin Li distribution0 <- setNames(c(1/2, 1/3, 1/6), colnames(map0)) 247*2abb3134SXin Li 248*2abb3134SXin Li # Even in the absence of noise, the inferred counts won't necessarily 249*2abb3134SXin Li # match the ground truth. Must be close enough though. 250*2abb3134SXin Li noise0 <- list(p = 0, q = 1, f = 0) # no noise whatsoever 251*2abb3134SXin Li 252*2abb3134SXin Li # Args are: message str, test function, # repetitions, 253*2abb3134SXin Li # params, map, true pdf, # clients, 254*2abb3134SXin Li # tolerances 255*2abb3134SXin Li CheckDecodeAveAndStds("Testing Decode (1/5)", CheckDecodeHelper, 100, 256*2abb3134SXin Li c(params_4x2, noise0), map0, distribution0, 100, 257*2abb3134SXin Li tolerance_l1 = 5, 258*2abb3134SXin Li tolerance_linf = 3) 259*2abb3134SXin Li 260*2abb3134SXin Li noise1 <- list(p = .4, q = .6, f = .5) # substantial noise, very few reports 261*2abb3134SXin Li CheckDecodeAveAndStds("Testing Decode (2/5)", CheckDecodeHelper, 100, 262*2abb3134SXin Li c(params_4x2, noise1), map0, distribution0, 100, 263*2abb3134SXin Li tolerance_l1 = 20, 264*2abb3134SXin Li tolerance_linf = 20) 265*2abb3134SXin Li 266*2abb3134SXin Li # substantial noise, many reports 267*2abb3134SXin Li CheckDecodeAveAndStds("Testing Decode (3/5)", CheckDecodeHelper, 100, 268*2abb3134SXin Li c(params_4x2, noise1), map0, distribution0, 100000, 269*2abb3134SXin Li tolerance_l1 = 50, 270*2abb3134SXin Li tolerance_linf = 40) 271*2abb3134SXin Li 272*2abb3134SXin Li # MEDIUM TEST: 100 values, 32 cohorts, 8 bits each, 10^6 reports 273*2abb3134SXin Li num_values <- 100 274*2abb3134SXin Li 275*2abb3134SXin Li params_8x32 <- list(k = 8, m = 32, h = 2) # 32 cohorts, 8 bits each 276*2abb3134SXin Li 277*2abb3134SXin Li map1 <- matrix(rbinom(32 * 8 * num_values, 1, .25), nrow = 32 * 8, ncol = 278*2abb3134SXin Li num_values) 279*2abb3134SXin Li 280*2abb3134SXin Li colnames(map1) <- sprintf("v%d", 1:num_values) 281*2abb3134SXin Li 282*2abb3134SXin Li distribution1 <- ComputePdf("zipf1", num_values) 283*2abb3134SXin Li names(distribution1) <- colnames(map1) 284*2abb3134SXin Li CheckDecodeAveAndStds("Testing Decode (4/5)", CheckDecodeHelper, 100, 285*2abb3134SXin Li c(params_8x32, noise1), map1, distribution1, 10^6, 286*2abb3134SXin Li tolerance_l1 = num_values * 3, 287*2abb3134SXin Li tolerance_linf = 100) 288*2abb3134SXin Li 289*2abb3134SXin Li # Testing LASSO: 500 values, 32 cohorts, 8 bits each, 10^6 reports 290*2abb3134SXin Li num_values <- 500 291*2abb3134SXin Li 292*2abb3134SXin Li params_8x32 <- list(k = 8, m = 32, h = 2) # 32 cohorts, 8 bits each 293*2abb3134SXin Li 294*2abb3134SXin Li map2 <- matrix(rbinom(32 * 8 * num_values, 1, .25), nrow = 32 * 8, ncol = 295*2abb3134SXin Li num_values) 296*2abb3134SXin Li 297*2abb3134SXin Li colnames(map2) <- sprintf("v%d", 1:num_values) 298*2abb3134SXin Li 299*2abb3134SXin Li distribution2 <- ComputePdf("zipf1.5", num_values) 300*2abb3134SXin Li names(distribution2) <- colnames(map2) 301*2abb3134SXin Li 302*2abb3134SXin Li CheckDecodeAveAndStds("Testing Decode (5/5)", CheckDecodeHelper, 1, 303*2abb3134SXin Li c(params_8x32, noise1), map2, distribution2, 10^6, 304*2abb3134SXin Li tolerance_l1 = num_values * 3, 305*2abb3134SXin Li tolerance_linf = 80) 306*2abb3134SXin Li 307*2abb3134SXin Li} 308*2abb3134SXin Li 309*2abb3134SXin LiTestDecodeBool <- function() { 310*2abb3134SXin Li # Testing Boolean Decode 311*2abb3134SXin Li num_values <- 2 312*2abb3134SXin Li # 1 bit; rest of the params don't matter 313*2abb3134SXin Li params_bool <- list(k = 1, m = 128, h = 2) 314*2abb3134SXin Li # setting up map_bool to be consistent with the Decode API and for 315*2abb3134SXin Li # GenerateCounts() 316*2abb3134SXin Li map_bool <- matrix(c(0, 1), nrow = 128 * 1, ncol = num_values, byrow = TRUE) 317*2abb3134SXin Li 318*2abb3134SXin Li colnames(map_bool) <- c("FALSE", "TRUE") 319*2abb3134SXin Li distribution_bool <- ComputePdf("zipf1.5", num_values) 320*2abb3134SXin Li names(distribution_bool) <- colnames(map_bool) 321*2abb3134SXin Li noise2 <- list(p = 0.25, q = 0.75, f = 0.5) 322*2abb3134SXin Li 323*2abb3134SXin Li # tolerance_l1 set to four standard deviations to avoid any flakiness in 324*2abb3134SXin Li # tests 325*2abb3134SXin Li CheckDecodeAveAndStds("Testing .DecodeBoolean (1/3)", CheckDecodeHelper, 100, 326*2abb3134SXin Li c(params_bool, noise2), map_bool, distribution_bool, 327*2abb3134SXin Li 10^6, 328*2abb3134SXin Li tolerance_l1 = 4 * num_values, 329*2abb3134SXin Li tolerance_linf = 80) 330*2abb3134SXin Li 331*2abb3134SXin Li noise1 <- list(p = .4, q = .6, f = .5) # substantial noise => 7 stddevs error 332*2abb3134SXin Li CheckDecodeAveAndStds("Testing .DecodeBoolean (2/3)", CheckDecodeHelper, 100, 333*2abb3134SXin Li c(params_bool, noise1), map_bool, distribution_bool, 334*2abb3134SXin Li 10^6, 335*2abb3134SXin Li tolerance_l1 = 7 * num_values, 336*2abb3134SXin Li tolerance_linf = 80) 337*2abb3134SXin Li 338*2abb3134SXin Li distribution_near_zero <- c(0.999, 0.001) 339*2abb3134SXin Li names(distribution_near_zero) <- colnames(map_bool) 340*2abb3134SXin Li 341*2abb3134SXin Li CheckDecodeAveAndStds("Testing .DecodeBoolean (3/3)", CheckDecodeHelper, 100, 342*2abb3134SXin Li c(params_bool, noise2), map_bool, 343*2abb3134SXin Li distribution_near_zero, 10^6, 344*2abb3134SXin Li tolerance_l1 = 4 * num_values, 345*2abb3134SXin Li tolerance_linf = 80) 346*2abb3134SXin Li} 347*2abb3134SXin Li 348*2abb3134SXin LiRunAll <- function() { 349*2abb3134SXin Li TestEstimateBloomCounts() 350*2abb3134SXin Li TestDecode() 351*2abb3134SXin Li TestDecodeBool() 352*2abb3134SXin Li} 353*2abb3134SXin Li 354*2abb3134SXin LiRunAll() 355