1*2abb3134SXin Li#!/usr/bin/env Rscript 2*2abb3134SXin Li# 3*2abb3134SXin Li# Copyright 2015 Google Inc. All rights reserved. 4*2abb3134SXin Li# 5*2abb3134SXin Li# Licensed under the Apache License, Version 2.0 (the "License"); 6*2abb3134SXin Li# you may not use this file except in compliance with the License. 7*2abb3134SXin Li# You may obtain a copy of the License at 8*2abb3134SXin Li# 9*2abb3134SXin Li# http://www.apache.org/licenses/LICENSE-2.0 10*2abb3134SXin Li# 11*2abb3134SXin Li# Unless required by applicable law or agreed to in writing, software 12*2abb3134SXin Li# distributed under the License is distributed on an "AS IS" BASIS, 13*2abb3134SXin Li# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14*2abb3134SXin Li# See the License for the specific language governing permissions and 15*2abb3134SXin Li# limitations under the License. 16*2abb3134SXin Li 17*2abb3134SXin Li# Simulates inputs on which association analysis can be run. 18*2abb3134SXin Li# Currently assoc_sim.R only supports 2 variables but can 19*2abb3134SXin Li# be easily extended to support more. 20*2abb3134SXin Li# 21*2abb3134SXin Li# Usage: 22*2abb3134SXin Li# $ ./assoc_sim.R -n 1000 23*2abb3134SXin Li# Inputs: uvals, params, reports, map, num, unif 24*2abb3134SXin Li# see how options are parsed below for more information 25*2abb3134SXin Li# Outputs: 26*2abb3134SXin Li# reports.csv file containing reports 27*2abb3134SXin Li# map_{1, 2, ...}.csv file(s) containing maps of variables 28*2abb3134SXin Li 29*2abb3134SXin Lilibrary("optparse") 30*2abb3134SXin Li 31*2abb3134SXin Lioptions(stringsAsFactors = FALSE) 32*2abb3134SXin Li 33*2abb3134SXin Liif(!interactive()) { 34*2abb3134SXin Li option_list <- list( 35*2abb3134SXin Li make_option(c("--uvals", "-v"), default = "uvals.csv", 36*2abb3134SXin Li help = "Filename for list of values over which 37*2abb3134SXin Li distributions are simulated. The file is a list of 38*2abb3134SXin Li comma-separated strings each line of which refers 39*2abb3134SXin Li to a new variable."), 40*2abb3134SXin Li make_option(c("--params", "-p"), default = "params.csv", 41*2abb3134SXin Li help = "Filename for RAPPOR parameters"), 42*2abb3134SXin Li make_option(c("--reports", "-r"), default = "reports.csv", 43*2abb3134SXin Li help = "Filename for reports"), 44*2abb3134SXin Li make_option(c("--map", "-m"), default = "map", 45*2abb3134SXin Li help = "Filename *prefix* for map(s)"), 46*2abb3134SXin Li make_option(c("--num", "-n"), default = 1e05, 47*2abb3134SXin Li help = "Number of reports"), 48*2abb3134SXin Li make_option(c("--unif", "-u"), default = FALSE, 49*2abb3134SXin Li help = "Run simulation with uniform distribution") 50*2abb3134SXin Li ) 51*2abb3134SXin Li opts <- parse_args(OptionParser(option_list = option_list)) 52*2abb3134SXin Li} 53*2abb3134SXin Li 54*2abb3134SXin Lisource("../analysis/R/encode.R") 55*2abb3134SXin Lisource("../analysis/R/decode.R") 56*2abb3134SXin Lisource("../analysis/R/simulation.R") 57*2abb3134SXin Lisource("../analysis/R/read_input.R") 58*2abb3134SXin Lisource("../analysis/R/association.R") 59*2abb3134SXin Li 60*2abb3134SXin Li# Read unique values of reports from a csv file 61*2abb3134SXin Li# Inputs: filename. The file is expected to contain two rows of strings 62*2abb3134SXin Li# (one for each variable): 63*2abb3134SXin Li# "google.com", "apple.com", ... 64*2abb3134SXin Li# "ssl", "nossl", ... 65*2abb3134SXin Li# Returns: a list containing strings 66*2abb3134SXin LiGetUniqueValsFromFile <- function(filename) { 67*2abb3134SXin Li contents <- read.csv(filename, header = FALSE) 68*2abb3134SXin Li # Expect 2 rows of unique vals 69*2abb3134SXin Li if(nrow(contents) != 2) { 70*2abb3134SXin Li stop(paste("Unique vals file", filename, "expected to have 71*2abb3134SXin Li two rows of strings.")) 72*2abb3134SXin Li } 73*2abb3134SXin Li # Removes superfluous "" entries if the lists of unique values 74*2abb3134SXin Li # differ in length 75*2abb3134SXin Li strip_empty <- function(vec) { 76*2abb3134SXin Li vec[!vec %in% c("")] 77*2abb3134SXin Li } 78*2abb3134SXin Li list(var1 = strip_empty(as.vector(t(contents[1,]))), 79*2abb3134SXin Li var2 = strip_empty(as.vector(t(contents[2,])))) 80*2abb3134SXin Li} 81*2abb3134SXin Li 82*2abb3134SXin Li# Simulate correlated reports and write into reportsfile 83*2abb3134SXin Li# Inputs: N = number of reports 84*2abb3134SXin Li# uvals = list containing a list of unique values 85*2abb3134SXin Li# params = list with RAPPOR parameters 86*2abb3134SXin Li# unif = whether to replace poisson with uniform 87*2abb3134SXin Li# mapfile = file to write maps into (with .csv suffixes) 88*2abb3134SXin Li# reportsfile = file to write reports into (with .csv suffix) 89*2abb3134SXin LiSimulateReports <- function(N, uvals, params, unif, 90*2abb3134SXin Li mapfile, reportsfile) { 91*2abb3134SXin Li # Compute true distribution 92*2abb3134SXin Li m <- params$m 93*2abb3134SXin Li 94*2abb3134SXin Li if (unif) { 95*2abb3134SXin Li # Draw uniformly from 1 to 10 96*2abb3134SXin Li v1_samples <- as.integer(runif(N, 1, 10)) 97*2abb3134SXin Li } else { 98*2abb3134SXin Li # Draw from a Poisson random variable 99*2abb3134SXin Li v1_samples <- rpois(N, 1) + rep(1, N) 100*2abb3134SXin Li } 101*2abb3134SXin Li 102*2abb3134SXin Li # Pr[var2 = N + 1 | var1 = N] = 0.5 103*2abb3134SXin Li # Pr[var2 = N | var1 = N] = 0.5 104*2abb3134SXin Li v2_samples <- v1_samples + sample(c(0, 1), N, replace = TRUE) 105*2abb3134SXin Li 106*2abb3134SXin Li tmp_samples <- list(v1_samples, v2_samples) 107*2abb3134SXin Li 108*2abb3134SXin Li # Function to pad strings to uval_vec if sample_vec has 109*2abb3134SXin Li # larger support than the number of strings in uval_vec 110*2abb3134SXin Li # For e.g., if samples have support {1, 2, 3, 4, ...} and uvals 111*2abb3134SXin Li # only have "value1", "value2", and "value3", samples now 112*2abb3134SXin Li # over support {"value1", "value2", "value3", "str4", ...} 113*2abb3134SXin Li PadStrings <- function(sample_vec, uval_vec) { 114*2abb3134SXin Li if (max(sample_vec) > length(uval_vec)) { 115*2abb3134SXin Li # Padding uvals to required length 116*2abb3134SXin Li len <- length(uval_vec) 117*2abb3134SXin Li max_of_samples <- max(sample_vec) 118*2abb3134SXin Li uval_vec[(len + 1):max_of_samples] <- apply( 119*2abb3134SXin Li as.matrix((len + 1):max_of_samples), 120*2abb3134SXin Li 1, 121*2abb3134SXin Li function(i) sprintf("str%d", i)) 122*2abb3134SXin Li } 123*2abb3134SXin Li uval_vec 124*2abb3134SXin Li } 125*2abb3134SXin Li 126*2abb3134SXin Li # Pad and update uvals 127*2abb3134SXin Li uvals <- lapply(1:2, function(i) PadStrings(tmp_samples[[i]], 128*2abb3134SXin Li uvals[[i]])) 129*2abb3134SXin Li 130*2abb3134SXin Li # Replace integers in tmp_samples with actual sample strings 131*2abb3134SXin Li samples <- lapply(1:2, function(i) uvals[[i]][tmp_samples[[i]]]) 132*2abb3134SXin Li 133*2abb3134SXin Li # Randomly assign cohorts in each dimension 134*2abb3134SXin Li cohorts <- sample(1:m, N, replace = TRUE) 135*2abb3134SXin Li 136*2abb3134SXin Li # Create and write map into mapfile_1.csv and mapfile_2.csv 137*2abb3134SXin Li map <- lapply(uvals, function(u) CreateMap(u, params)) 138*2abb3134SXin Li write.table(map[[1]]$map_pos, file = paste(mapfile, "_1.csv", sep = ""), 139*2abb3134SXin Li sep = ",", col.names = FALSE, na = "", quote = FALSE) 140*2abb3134SXin Li write.table(map[[2]]$map_pos, file = paste(mapfile, "_2.csv", sep = ""), 141*2abb3134SXin Li sep = ",", col.names = FALSE, na = "", quote = FALSE) 142*2abb3134SXin Li 143*2abb3134SXin Li # Write reports into a csv file 144*2abb3134SXin Li # Format: 145*2abb3134SXin Li # cohort, bloom filter var1, bloom filter var2 146*2abb3134SXin Li reports <- lapply(1:2, function(i) 147*2abb3134SXin Li EncodeAll(samples[[i]], cohorts, map[[i]]$map, params)) 148*2abb3134SXin Li # Organize cohorts and reports into format 149*2abb3134SXin Li write_matrix <- cbind(as.matrix(cohorts), 150*2abb3134SXin Li as.matrix(lapply(reports[[1]], 151*2abb3134SXin Li function(x) paste(x, collapse = ""))), 152*2abb3134SXin Li as.matrix(lapply(reports[[2]], 153*2abb3134SXin Li function(x) paste(x, collapse = "")))) 154*2abb3134SXin Li write.table(write_matrix, file = reportsfile, quote = FALSE, 155*2abb3134SXin Li row.names = FALSE, col.names = FALSE, sep = ",") 156*2abb3134SXin Li} 157*2abb3134SXin Li 158*2abb3134SXin Limain <- function(opts) { 159*2abb3134SXin Li ptm <- proc.time() 160*2abb3134SXin Li 161*2abb3134SXin Li uvals <- GetUniqueValsFromFile(opts$uvals) 162*2abb3134SXin Li params <- ReadParameterFile(opts$params) 163*2abb3134SXin Li SimulateReports(opts$num, uvals, params, opts$unif, # inputs 164*2abb3134SXin Li opts$map, opts$reports) # outputs 165*2abb3134SXin Li 166*2abb3134SXin Li print("PROC.TIME") 167*2abb3134SXin Li print(proc.time() - ptm) 168*2abb3134SXin Li} 169*2abb3134SXin Li 170*2abb3134SXin Liif(!interactive()) { 171*2abb3134SXin Li main(opts) 172*2abb3134SXin Li} 173