1*2abb3134SXin Li#!/usr/bin/env Rscript 2*2abb3134SXin Li# 3*2abb3134SXin Li# Copyright 2015 Google Inc. All rights reserved. 4*2abb3134SXin Li# 5*2abb3134SXin Li# Licensed under the Apache License, Version 2.0 (the "License"); 6*2abb3134SXin Li# you may not use this file except in compliance with the License. 7*2abb3134SXin Li# You may obtain a copy of the License at 8*2abb3134SXin Li# 9*2abb3134SXin Li# http://www.apache.org/licenses/LICENSE-2.0 10*2abb3134SXin Li# 11*2abb3134SXin Li# Unless required by applicable law or agreed to in writing, software 12*2abb3134SXin Li# distributed under the License is distributed on an "AS IS" BASIS, 13*2abb3134SXin Li# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14*2abb3134SXin Li# See the License for the specific language governing permissions and 15*2abb3134SXin Li# limitations under the License. 16*2abb3134SXin Li 17*2abb3134SXin Lisource('analysis/R/read_input.R') 18*2abb3134SXin Li 19*2abb3134SXin LiRandomPartition <- function(total, weights) { 20*2abb3134SXin Li # Outputs a random partition according to a specified distribution 21*2abb3134SXin Li # Args: 22*2abb3134SXin Li # total - number of samples 23*2abb3134SXin Li # weights - weights that are proportional to the probability density 24*2abb3134SXin Li # function of the target distribution 25*2abb3134SXin Li # Returns: 26*2abb3134SXin Li # a histogram sampled according to the pdf 27*2abb3134SXin Li # Example: 28*2abb3134SXin Li # > RandomPartition(100, c(3, 2, 1, 0, 1)) 29*2abb3134SXin Li # [1] 47 24 15 0 14 30*2abb3134SXin Li if (any(weights < 0)) 31*2abb3134SXin Li stop("Probabilities cannot be negative") 32*2abb3134SXin Li 33*2abb3134SXin Li if (sum(weights) == 0) 34*2abb3134SXin Li stop("Probabilities cannot sum up to 0") 35*2abb3134SXin Li 36*2abb3134SXin Li bins <- length(weights) 37*2abb3134SXin Li result <- rep(0, bins) 38*2abb3134SXin Li 39*2abb3134SXin Li # idiomatic way: 40*2abb3134SXin Li # rnd_list <- sample(strs, total, replace = TRUE, weights) 41*2abb3134SXin Li # apply(as.array(strs), 1, function(x) length(rnd_list[rnd_list == x])) 42*2abb3134SXin Li # 43*2abb3134SXin Li # The following is much faster for larger totals. We can replace a loop with 44*2abb3134SXin Li # (tail) recusion, but R chokes with the recursion depth > 850. 45*2abb3134SXin Li 46*2abb3134SXin Li w <- sum(weights) 47*2abb3134SXin Li 48*2abb3134SXin Li for (i in 1:bins) 49*2abb3134SXin Li if (total > 0) { # if total == 0, nothing else to do 50*2abb3134SXin Li # invariant: w = sum(weights[i:bins]) 51*2abb3134SXin Li # rather than computing sum every time leading to quadratic time, keep 52*2abb3134SXin Li # updating it 53*2abb3134SXin Li 54*2abb3134SXin Li # The probability p is clamped to [0, 1] to avoid under/overflow errors. 55*2abb3134SXin Li p <- min(max(weights[i] / w, 0), 1) 56*2abb3134SXin Li # draw the number of balls falling into the current bin 57*2abb3134SXin Li rnd_draw <- rbinom(n = 1, size = total, prob = p) 58*2abb3134SXin Li result[i] <- rnd_draw # push rnd_draw balls from total to result[i] 59*2abb3134SXin Li total <- total - rnd_draw 60*2abb3134SXin Li w <- w - weights[i] 61*2abb3134SXin Li } 62*2abb3134SXin Li 63*2abb3134SXin Li names(result) <- names(weights) 64*2abb3134SXin Li 65*2abb3134SXin Li return(result) 66*2abb3134SXin Li} 67*2abb3134SXin Li 68*2abb3134SXin LiGenerateCounts <- function(params, true_map, partition, reports_per_client) { 69*2abb3134SXin Li # Fast simulation of the marginal table for RAPPOR reports 70*2abb3134SXin Li # Args: 71*2abb3134SXin Li # params - parameters of the RAPPOR reporting process 72*2abb3134SXin Li # true_map - hashed true inputs 73*2abb3134SXin Li # partition - allocation of clients between true values 74*2abb3134SXin Li # reports_per_client - number of reports (IRRs) per client 75*2abb3134SXin Li if (nrow(true_map) != (params$m * params$k)) { 76*2abb3134SXin Li stop(cat("Map does not match the params file!", 77*2abb3134SXin Li "mk =", params$m * params$k, 78*2abb3134SXin Li "nrow(map):", nrow(true_map), 79*2abb3134SXin Li sep = " ")) 80*2abb3134SXin Li } 81*2abb3134SXin Li 82*2abb3134SXin Li # For each reporting type computes its allocation to cohorts. 83*2abb3134SXin Li # Output is an m x strs matrix. 84*2abb3134SXin Li cohorts <- as.matrix( 85*2abb3134SXin Li apply(as.data.frame(partition), 1, 86*2abb3134SXin Li function(count) RandomPartition(count, rep(1, params$m)))) 87*2abb3134SXin Li 88*2abb3134SXin Li # Expands to (m x k) x strs matrix, where each element (corresponding to the 89*2abb3134SXin Li # bit in the aggregate Bloom filter) is repeated k times. 90*2abb3134SXin Li expanded <- apply(cohorts, 2, function(vec) rep(vec, each = params$k)) 91*2abb3134SXin Li 92*2abb3134SXin Li # For each bit, the number of clients reporting this bit: 93*2abb3134SXin Li clients_per_bit <- rep(apply(cohorts, 1, sum), each = params$k) 94*2abb3134SXin Li 95*2abb3134SXin Li # Computes the true number of bits set to one BEFORE PRR. 96*2abb3134SXin Li true_ones <- apply(expanded * true_map, 1, sum) 97*2abb3134SXin Li 98*2abb3134SXin Li ones_in_prr <- 99*2abb3134SXin Li unlist(lapply(true_ones, 100*2abb3134SXin Li function(x) rbinom(n = 1, size = x, prob = 1 - params$f / 2))) + 101*2abb3134SXin Li unlist(lapply(clients_per_bit - true_ones, # clients where the bit is 0 102*2abb3134SXin Li function(x) rbinom(n = 1, size = x, prob = params$f / 2))) 103*2abb3134SXin Li 104*2abb3134SXin Li # Number of IRRs where each bit is reported (either as 0 or as 1) 105*2abb3134SXin Li reports_per_bit <- clients_per_bit * reports_per_client 106*2abb3134SXin Li 107*2abb3134SXin Li ones_before_irr <- ones_in_prr * reports_per_client 108*2abb3134SXin Li 109*2abb3134SXin Li ones_after_irr <- 110*2abb3134SXin Li unlist(lapply(ones_before_irr, 111*2abb3134SXin Li function(x) rbinom(n = 1, size = x, prob = params$q))) + 112*2abb3134SXin Li unlist(lapply(reports_per_bit - ones_before_irr, 113*2abb3134SXin Li function(x) rbinom(n = 1, size = x, prob = params$p))) 114*2abb3134SXin Li 115*2abb3134SXin Li counts <- cbind(apply(cohorts, 1, sum) * reports_per_client, 116*2abb3134SXin Li matrix(ones_after_irr, nrow = params$m, ncol = params$k, byrow = TRUE)) 117*2abb3134SXin Li 118*2abb3134SXin Li if(any(is.na(counts))) 119*2abb3134SXin Li stop("Failed to generate bit counts. Likely due to integer overflow.") 120*2abb3134SXin Li 121*2abb3134SXin Li counts 122*2abb3134SXin Li} 123*2abb3134SXin Li 124*2abb3134SXin LiComputePdf <- function(distr, range) { 125*2abb3134SXin Li # Outputs discrete probability density function for a given distribution 126*2abb3134SXin Li 127*2abb3134SXin Li # These are the five distributions in gen_sim_input.py 128*2abb3134SXin Li if (distr == 'exp') { 129*2abb3134SXin Li pdf <- dexp(1:range, rate = 5 / range) 130*2abb3134SXin Li } else if (distr == 'gauss') { 131*2abb3134SXin Li half <- range / 2 132*2abb3134SXin Li left <- -half + 1 133*2abb3134SXin Li pdf <- dnorm(left : half, sd = range / 6) 134*2abb3134SXin Li } else if (distr == 'unif') { 135*2abb3134SXin Li # e.g. for N = 4, weights are [0.25, 0.25, 0.25, 0.25] 136*2abb3134SXin Li pdf <- dunif(1:range, max = range) 137*2abb3134SXin Li } else if (distr == 'zipf1') { 138*2abb3134SXin Li # Since the distrubition defined over a finite set, we allow the parameter 139*2abb3134SXin Li # of the Zipf distribution to be 1. 140*2abb3134SXin Li pdf <- sapply(1:range, function(x) 1 / x) 141*2abb3134SXin Li } else if (distr == 'zipf1.5') { 142*2abb3134SXin Li pdf <- sapply(1:range, function(x) 1 / x^1.5) 143*2abb3134SXin Li } 144*2abb3134SXin Li else { 145*2abb3134SXin Li stop(sprintf("Invalid distribution '%s'", distr)) 146*2abb3134SXin Li } 147*2abb3134SXin Li 148*2abb3134SXin Li pdf <- pdf / sum(pdf) # normalize 149*2abb3134SXin Li 150*2abb3134SXin Li pdf 151*2abb3134SXin Li} 152*2abb3134SXin Li 153*2abb3134SXin Li# Usage: 154*2abb3134SXin Li# 155*2abb3134SXin Li# $ ./gen_counts.R exp 10000 1 foo_params.csv foo_true_map.csv foo 156*2abb3134SXin Li# 157*2abb3134SXin Li# Inputs: 158*2abb3134SXin Li# distribution name 159*2abb3134SXin Li# number of clients 160*2abb3134SXin Li# reports per client 161*2abb3134SXin Li# parameters file 162*2abb3134SXin Li# map file 163*2abb3134SXin Li# prefix for output files 164*2abb3134SXin Li# Outputs: 165*2abb3134SXin Li# foo_counts.csv 166*2abb3134SXin Li# foo_hist.csv 167*2abb3134SXin Li# 168*2abb3134SXin Li# Warning: the number of reports in any cohort must be less than 169*2abb3134SXin Li# .Machine$integer.max 170*2abb3134SXin Li 171*2abb3134SXin Limain <- function(argv) { 172*2abb3134SXin Li distr <- argv[[1]] 173*2abb3134SXin Li num_clients <- as.integer(argv[[2]]) 174*2abb3134SXin Li reports_per_client <- as.integer(argv[[3]]) 175*2abb3134SXin Li params_file <- argv[[4]] 176*2abb3134SXin Li true_map_file <- argv[[5]] 177*2abb3134SXin Li out_prefix <- argv[[6]] 178*2abb3134SXin Li 179*2abb3134SXin Li params <- ReadParameterFile(params_file) 180*2abb3134SXin Li 181*2abb3134SXin Li true_map <- ReadMapFile(true_map_file, params) 182*2abb3134SXin Li 183*2abb3134SXin Li num_unique_values <- length(true_map$strs) 184*2abb3134SXin Li 185*2abb3134SXin Li pdf <- ComputePdf(distr, num_unique_values) 186*2abb3134SXin Li 187*2abb3134SXin Li # Computes the number of clients reporting each string 188*2abb3134SXin Li # according to the pre-specified distribution. 189*2abb3134SXin Li partition <- RandomPartition(num_clients, pdf) 190*2abb3134SXin Li 191*2abb3134SXin Li # Histogram 192*2abb3134SXin Li true_hist <- data.frame(string = true_map$strs, count = partition) 193*2abb3134SXin Li 194*2abb3134SXin Li counts <- GenerateCounts(params, true_map$map, partition, reports_per_client) 195*2abb3134SXin Li 196*2abb3134SXin Li # Now create a CSV file 197*2abb3134SXin Li 198*2abb3134SXin Li # Opposite of ReadCountsFile in read_input.R 199*2abb3134SXin Li # http://stackoverflow.com/questions/6750546/export-csv-without-col-names 200*2abb3134SXin Li counts_path <- paste0(out_prefix, '_counts.csv') 201*2abb3134SXin Li write.table(counts, file = counts_path, 202*2abb3134SXin Li row.names = FALSE, col.names = FALSE, sep = ',') 203*2abb3134SXin Li cat(sprintf('Wrote %s\n', counts_path)) 204*2abb3134SXin Li 205*2abb3134SXin Li # TODO: Don't write strings that appear 0 times? 206*2abb3134SXin Li hist_path <- paste0(out_prefix, '_hist.csv') 207*2abb3134SXin Li write.csv(true_hist, file = hist_path, row.names = FALSE) 208*2abb3134SXin Li cat(sprintf('Wrote %s\n', hist_path)) 209*2abb3134SXin Li} 210*2abb3134SXin Li 211*2abb3134SXin Liif (length(sys.frames()) == 0) { 212*2abb3134SXin Li main(commandArgs(TRUE)) 213*2abb3134SXin Li} 214