1*2abb3134SXin Li# Copyright 2014 Google Inc. All rights reserved. 2*2abb3134SXin Li# 3*2abb3134SXin Li# Licensed under the Apache License, Version 2.0 (the "License"); 4*2abb3134SXin Li# you may not use this file except in compliance with the License. 5*2abb3134SXin Li# You may obtain a copy of the License at 6*2abb3134SXin Li# 7*2abb3134SXin Li# http://www.apache.org/licenses/LICENSE-2.0 8*2abb3134SXin Li# 9*2abb3134SXin Li# Unless required by applicable law or agreed to in writing, software 10*2abb3134SXin Li# distributed under the License is distributed on an "AS IS" BASIS, 11*2abb3134SXin Li# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12*2abb3134SXin Li# See the License for the specific language governing permissions and 13*2abb3134SXin Li# limitations under the License. 14*2abb3134SXin Li 15*2abb3134SXin Li# 16*2abb3134SXin Li# RAPPOR simulation library. Contains code for encoding simulated data and 17*2abb3134SXin Li# creating the map used to encode and decode reports. 18*2abb3134SXin Li 19*2abb3134SXin Lilibrary(glmnet) 20*2abb3134SXin Lilibrary(parallel) # mclapply 21*2abb3134SXin Li 22*2abb3134SXin LiSetOfStrings <- function(num_strings = 100) { 23*2abb3134SXin Li # Generates a set of strings for simulation purposes. 24*2abb3134SXin Li strs <- paste0("V_", as.character(1:num_strings)) 25*2abb3134SXin Li strs 26*2abb3134SXin Li} 27*2abb3134SXin Li 28*2abb3134SXin LiGetSampleProbs <- function(params) { 29*2abb3134SXin Li # Generate different underlying distributions for simulations purposes. 30*2abb3134SXin Li # Args: 31*2abb3134SXin Li # - params: a list describing the shape of the true distribution: 32*2abb3134SXin Li # c(num_strings, prop_nonzero_strings, decay_type, 33*2abb3134SXin Li # rate_exponetial). 34*2abb3134SXin Li nstrs <- params[[1]] 35*2abb3134SXin Li nonzero <- params[[2]] 36*2abb3134SXin Li decay <- params[[3]] 37*2abb3134SXin Li expo <- params[[4]] 38*2abb3134SXin Li background <- params[[5]] 39*2abb3134SXin Li 40*2abb3134SXin Li probs <- rep(0, nstrs) 41*2abb3134SXin Li ind <- floor(nstrs * nonzero) 42*2abb3134SXin Li if (decay == "Linear") { 43*2abb3134SXin Li probs[1:ind] <- (ind:1) / sum(1:ind) 44*2abb3134SXin Li } else if (decay == "Constant") { 45*2abb3134SXin Li probs[1:ind] <- 1 / ind 46*2abb3134SXin Li } else if (decay == "Exponential") { 47*2abb3134SXin Li temp <- seq(0, nonzero, length.out = ind) 48*2abb3134SXin Li temp <- exp(-temp * expo) 49*2abb3134SXin Li temp <- temp + background 50*2abb3134SXin Li temp <- temp / sum(temp) 51*2abb3134SXin Li probs[1:ind] <- temp 52*2abb3134SXin Li } else { 53*2abb3134SXin Li stop('params[[4]] must be in c("Linear", "Exponenential", "Constant")') 54*2abb3134SXin Li } 55*2abb3134SXin Li probs 56*2abb3134SXin Li} 57*2abb3134SXin Li 58*2abb3134SXin LiEncodeAll <- function(x, cohorts, map, params, num_cores = 1) { 59*2abb3134SXin Li # Encodes the ground truth into RAPPOR reports. 60*2abb3134SXin Li # 61*2abb3134SXin Li # Args: 62*2abb3134SXin Li # x: Observed strings for each report, Nx1 vector 63*2abb3134SXin Li # cohort: Cohort assignment for each report, Nx1 vector 64*2abb3134SXin Li # map: list of matrices encoding locations of hashes for each 65*2abb3134SXin Li # string, for each cohort 66*2abb3134SXin Li # params: System parameters 67*2abb3134SXin Li # 68*2abb3134SXin Li # Returns: 69*2abb3134SXin Li # RAPPOR reports for each piece of data. 70*2abb3134SXin Li 71*2abb3134SXin Li p <- params$p 72*2abb3134SXin Li q <- params$q 73*2abb3134SXin Li f <- params$f 74*2abb3134SXin Li k <- params$k 75*2abb3134SXin Li 76*2abb3134SXin Li qstar <- (1 - f / 2) * q + (f / 2) * p 77*2abb3134SXin Li pstar <- (1 - f / 2) * p + (f / 2) * q 78*2abb3134SXin Li 79*2abb3134SXin Li candidates <- colnames(map[[1]]) 80*2abb3134SXin Li if (!all(x %in% candidates)) { 81*2abb3134SXin Li stop("Some strings are not in the map. set(X) - set(candidates): ", 82*2abb3134SXin Li paste(setdiff(unique(x), candidates), collapse=" "), "\n") 83*2abb3134SXin Li } 84*2abb3134SXin Li bfs <- mapply(function(x, y) y[, x], x, map[cohorts], SIMPLIFY = FALSE, 85*2abb3134SXin Li USE.NAMES = FALSE) 86*2abb3134SXin Li reports <- mclapply(bfs, function(x) { 87*2abb3134SXin Li noise <- sample(0:1, k, replace = TRUE, prob = c(1 - pstar, pstar)) 88*2abb3134SXin Li ind <- which(x) 89*2abb3134SXin Li noise[ind] <- sample(0:1, length(ind), replace = TRUE, 90*2abb3134SXin Li prob = c(1 - qstar, qstar)) 91*2abb3134SXin Li noise 92*2abb3134SXin Li }, mc.cores = num_cores) 93*2abb3134SXin Li 94*2abb3134SXin Li reports 95*2abb3134SXin Li} 96*2abb3134SXin Li 97*2abb3134SXin LiCreateMap <- function(strs, params, generate_pos = TRUE, basic = FALSE) { 98*2abb3134SXin Li # Creates a list of 0/1 matrices corresponding to mapping between the strs and 99*2abb3134SXin Li # Bloom filters for each instance of the RAPPOR. 100*2abb3134SXin Li # Ex. for 3 strings, 2 instances, 1 hash function and Bloom filter of size 4, 101*2abb3134SXin Li # the result could look this: 102*2abb3134SXin Li # [[1]] 103*2abb3134SXin Li # 1 0 0 0 104*2abb3134SXin Li # 0 1 0 0 105*2abb3134SXin Li # 0 0 0 1 106*2abb3134SXin Li # [[2]] 107*2abb3134SXin Li # 0 1 0 0 108*2abb3134SXin Li # 0 0 0 1 109*2abb3134SXin Li # 0 0 1 0 110*2abb3134SXin Li # 111*2abb3134SXin Li # Args: 112*2abb3134SXin Li # strs: a vector of strings 113*2abb3134SXin Li # params: a list of parameters in the following format: 114*2abb3134SXin Li # (k, h, m, p, q, f). 115*2abb3134SXin Li # generate_pos: Tells whether to generate an object storing the 116*2abb3134SXin Li # positions of the nonzeros in the matrix 117*2abb3134SXin Li # basic: Tells whether to use basic RAPPOR (only works if h=1). 118*2abb3134SXin Li 119*2abb3134SXin Li M <- length(strs) 120*2abb3134SXin Li map_by_cohort <- list() 121*2abb3134SXin Li k <- params$k 122*2abb3134SXin Li h <- params$h 123*2abb3134SXin Li m <- params$m 124*2abb3134SXin Li 125*2abb3134SXin Li for (i in 1:m) { 126*2abb3134SXin Li if (basic && (h == 1) && (k == M)) { 127*2abb3134SXin Li ones <- 1:M 128*2abb3134SXin Li } else { 129*2abb3134SXin Li ones <- sample(1:k, M * h, replace = TRUE) 130*2abb3134SXin Li } 131*2abb3134SXin Li cols <- rep(1:M, each = h) 132*2abb3134SXin Li map_by_cohort[[i]] <- sparseMatrix(ones, cols, dims = c(k, M)) 133*2abb3134SXin Li colnames(map_by_cohort[[i]]) <- strs 134*2abb3134SXin Li } 135*2abb3134SXin Li 136*2abb3134SXin Li all_cohorts_map <- do.call("rBind", map_by_cohort) 137*2abb3134SXin Li if (generate_pos) { 138*2abb3134SXin Li map_pos <- t(apply(all_cohorts_map, 2, function(x) { 139*2abb3134SXin Li ind <- which(x == 1) 140*2abb3134SXin Li n <- length(ind) 141*2abb3134SXin Li if (n < h * m) { 142*2abb3134SXin Li ind <- c(ind, rep(NA, h * m - n)) 143*2abb3134SXin Li } 144*2abb3134SXin Li ind 145*2abb3134SXin Li })) 146*2abb3134SXin Li } else { 147*2abb3134SXin Li map_pos <- NULL 148*2abb3134SXin Li } 149*2abb3134SXin Li 150*2abb3134SXin Li list(map_by_cohort = map_by_cohort, all_cohorts_map = all_cohorts_map, 151*2abb3134SXin Li map_pos = map_pos) 152*2abb3134SXin Li} 153*2abb3134SXin Li 154*2abb3134SXin LiGetSample <- function(N, strs, probs) { 155*2abb3134SXin Li # Sample for the strs population with distribution probs. 156*2abb3134SXin Li sample(strs, N, replace = TRUE, prob = probs) 157*2abb3134SXin Li} 158*2abb3134SXin Li 159*2abb3134SXin LiGetTrueBits <- function(samp, map, params) { 160*2abb3134SXin Li # Convert sample generated by GetSample() to Bloom filters where mapping 161*2abb3134SXin Li # is defined in map. 162*2abb3134SXin Li # Output: 163*2abb3134SXin Li # - reports: a matrix of size [num_instances x size] where each row 164*2abb3134SXin Li # represents the number of times each bit in the Bloom filter 165*2abb3134SXin Li # was set for a particular instance. 166*2abb3134SXin Li # Note: reports[, 1] contains the same size for each instance. 167*2abb3134SXin Li 168*2abb3134SXin Li N <- length(samp) 169*2abb3134SXin Li k <- params$k 170*2abb3134SXin Li m <- params$m 171*2abb3134SXin Li strs <- colnames(map[[1]]) 172*2abb3134SXin Li reports <- matrix(0, m, k + 1) 173*2abb3134SXin Li inst <- sample(1:m, N, replace = TRUE) 174*2abb3134SXin Li for (i in 1:m) { 175*2abb3134SXin Li tab <- table(samp[inst == i]) 176*2abb3134SXin Li tab2 <- rep(0, length(strs)) 177*2abb3134SXin Li tab2[match(names(tab), strs)] <- tab 178*2abb3134SXin Li counts <- apply(map[[i]], 1, function(x) x * tab2) 179*2abb3134SXin Li # cat(length(tab2), dim(map[[i]]), dim(counts), "\n") 180*2abb3134SXin Li reports[i, ] <- c(sum(tab2), apply(counts, 2, sum)) 181*2abb3134SXin Li } 182*2abb3134SXin Li reports 183*2abb3134SXin Li} 184*2abb3134SXin Li 185*2abb3134SXin LiGetNoisyBits <- function(truth, params) { 186*2abb3134SXin Li # Applies RAPPOR to the Bloom filters. 187*2abb3134SXin Li # Args: 188*2abb3134SXin Li # - truth: a matrix generated by GetTrueBits(). 189*2abb3134SXin Li 190*2abb3134SXin Li k <- params$k 191*2abb3134SXin Li p <- params$p 192*2abb3134SXin Li q <- params$q 193*2abb3134SXin Li f <- params$f 194*2abb3134SXin Li 195*2abb3134SXin Li rappors <- apply(truth, 1, function(x) { 196*2abb3134SXin Li # The following samples considering 4 cases: 197*2abb3134SXin Li # 1. Signal and we lie on the bit. 198*2abb3134SXin Li # 2. Signal and we tell the truth. 199*2abb3134SXin Li # 3. Noise and we lie. 200*2abb3134SXin Li # 4. Noise and we tell the truth. 201*2abb3134SXin Li 202*2abb3134SXin Li # Lies when signal sampled from the binomial distribution. 203*2abb3134SXin Li lied_signal <- rbinom(k, x[-1], f) 204*2abb3134SXin Li 205*2abb3134SXin Li # Remaining must be the non-lying bits when signal. Sampled with q. 206*2abb3134SXin Li truth_signal <- x[-1] - lied_signal 207*2abb3134SXin Li 208*2abb3134SXin Li # Lies when there is no signal which happens x[1] - x[-1] times. 209*2abb3134SXin Li lied_nosignal <- rbinom(k, x[1] - x[-1], f) 210*2abb3134SXin Li 211*2abb3134SXin Li # Trtuh when there's no signal. These are sampled with p. 212*2abb3134SXin Li truth_nosignal <- x[1] - x[-1] - lied_nosignal 213*2abb3134SXin Li 214*2abb3134SXin Li # Total lies and sampling lies with 50/50 for either p or q. 215*2abb3134SXin Li lied <- lied_signal + lied_nosignal 216*2abb3134SXin Li lied_p <- rbinom(k, lied, .5) 217*2abb3134SXin Li lied_q <- lied - lied_p 218*2abb3134SXin Li 219*2abb3134SXin Li # Generating the report where sampling of either p or q occurs. 220*2abb3134SXin Li rbinom(k, lied_q + truth_signal, q) + rbinom(k, lied_p + truth_nosignal, p) 221*2abb3134SXin Li }) 222*2abb3134SXin Li 223*2abb3134SXin Li cbind(truth[, 1], t(rappors)) 224*2abb3134SXin Li} 225*2abb3134SXin Li 226*2abb3134SXin LiGenerateSamples <- function(N = 10^5, params, pop_params, alpha = .05, 227*2abb3134SXin Li prop_missing = 0, 228*2abb3134SXin Li correction = "Bonferroni") { 229*2abb3134SXin Li # Simulate N reports with pop_params describing the population and 230*2abb3134SXin Li # params describing the RAPPOR configuration. 231*2abb3134SXin Li num_strings = pop_params[[1]] 232*2abb3134SXin Li 233*2abb3134SXin Li strs <- SetOfStrings(num_strings) 234*2abb3134SXin Li probs <- GetSampleProbs(pop_params) 235*2abb3134SXin Li samp <- GetSample(N, strs, probs) 236*2abb3134SXin Li map <- CreateMap(strs, params) 237*2abb3134SXin Li truth <- GetTrueBits(samp, map$map_by_cohort, params) 238*2abb3134SXin Li rappors <- GetNoisyBits(truth, params) 239*2abb3134SXin Li 240*2abb3134SXin Li strs_apprx <- strs 241*2abb3134SXin Li map_apprx <- map$all_cohorts_map 242*2abb3134SXin Li # Remove % of strings to simulate missing variables. 243*2abb3134SXin Li if (prop_missing > 0) { 244*2abb3134SXin Li ind <- which(probs > 0) 245*2abb3134SXin Li removed <- sample(ind, ceiling(prop_missing * length(ind))) 246*2abb3134SXin Li map_apprx <- map$all_cohorts_map[, -removed] 247*2abb3134SXin Li strs_apprx <- strs[-removed] 248*2abb3134SXin Li } 249*2abb3134SXin Li 250*2abb3134SXin Li # Randomize the columns. 251*2abb3134SXin Li ind <- sample(1:length(strs_apprx), length(strs_apprx)) 252*2abb3134SXin Li map_apprx <- map_apprx[, ind] 253*2abb3134SXin Li strs_apprx <- strs_apprx[ind] 254*2abb3134SXin Li 255*2abb3134SXin Li fit <- Decode(rappors, map_apprx, params, alpha = alpha, 256*2abb3134SXin Li correction = correction) 257*2abb3134SXin Li 258*2abb3134SXin Li # Add truth column. 259*2abb3134SXin Li fit$fit$Truth <- table(samp)[fit$fit$string] 260*2abb3134SXin Li fit$fit$Truth[is.na(fit$fit$Truth)] <- 0 261*2abb3134SXin Li 262*2abb3134SXin Li fit$map <- map$map_by_cohort 263*2abb3134SXin Li fit$truth <- truth 264*2abb3134SXin Li fit$strs <- strs 265*2abb3134SXin Li fit$probs <- probs 266*2abb3134SXin Li 267*2abb3134SXin Li fit 268*2abb3134SXin Li} 269