#!/usr/bin/env Rscript
#
# Copyright 2015 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Reads map files, report files, and RAPPOR parameters to run
# an EM algorithm to estimate joint distribution over two or more variables
#
# Usage:
#       $ ./analyze_assoc.R --map1 map_1.csv --map2 map_2.csv \
#                           --reports reports.csv --params params.csv
# Inputs: map1, map2, reports, params
#         see how options are parsed below for more information
# Outputs:
#         prints a table with estimated joint probability masses
#         over candidate strings
#         Ex.
#                    ssl   nossl
#         intel      0.1     0.3
#         google     0.5     0.1

library("optparse")

options(stringsAsFactors = FALSE)

if (!interactive()) {
  option_list <- list(
    # Flags
    # NOTE: map1/map2 only have long flags. optparse short flags must be a
    # "-" followed by a single letter, so "-m1"/"-m2" are not valid short
    # flags (and both would reduce to the same letter "m").
    make_option("--map1", default = "map_1.csv",
                help = "Hashed candidates for 1st variable"),
    make_option("--map2", default = "map_2.csv",
                help = "Hashed candidates for 2nd variable"),
    make_option(c("--reports", "-r"), default = "reports.csv",
                help = "File with raw reports as <cohort, report1, report2>"),
    make_option(c("--params", "-p"), default = "params.csv",
                help = "Filename for RAPPOR parameters")
  )
  opts <- parse_args(OptionParser(option_list = option_list))
}

# NOTE(review): these paths are relative to the current working directory,
# not to the location of this script -- confirm how the script is launched.
source("../analysis/R/encode.R")
source("../analysis/R/decode.R")
source("../analysis/R/simulation.R")
source("../analysis/R/read_input.R")
source("../analysis/R/association.R")

# This function processes the maps loaded using ReadMapFile
# Association analysis requires a map object with a map
# field that has the map split into cohorts and an rmap field
# that has all the cohorts combined
# Arguments:
#       map    = map object with cohorts as sparse matrix in
#                object map$map
#                This is the expected object from ReadMapFile
#       params = data field with parameters
#                (params$k rows per cohort, params$m cohorts are read here)
# Returns:
#       the map object with
#         map$rmap = the original combined matrix (previously map$map)
#         map$map  = list of params$m sparse matrices, one per cohort, each
#                    with params$k rows and as many columns as map$rmap
# TODO(pseudorandom): move this functionality to ReadMapFile
ProcessMap <- function(map, params) {
  map$rmap <- map$map
  num_bits <- params$k
  # Every cohort slice keeps the full candidate (column) count of the
  # combined map, so the width does not depend on which bits happen to be set.
  num_candidates <- ncol(map$rmap)

  split_map <- function(i, map_struct) {
    # Rows (i - 1) * k + 1 .. i * k of the combined map belong to cohort i.
    rows <- ((i - 1) * num_bits + 1):(i * num_bits)
    # drop = FALSE keeps a one-row slice (k == 1) as a matrix.
    indices <- which(
      as.matrix(map_struct[rows, , drop = FALSE]) == TRUE,
      arr.ind = TRUE)
    # sparseMatrix is not loaded by this file directly; presumably it is
    # attached (Matrix package) by the sourced analysis files -- TODO confirm.
    sparseMatrix(indices[, "row"], indices[, "col"],
                 dims = c(num_bits, num_candidates))
  }

  # seq_len() yields an empty sequence (not c(1, 0)) when params$m is 0.
  map$map <- lapply(seq_len(params$m), function(i) split_map(i, map$rmap))
  map
}

# Runs the association analysis end to end and prints the result.
# Arguments:
#       opts = list with fields map1, map2, reports and params, each a
#              filename (as produced by parse_args above)
# Side effects:
#       reads the four input files and prints joint_dist$fit and the
#       elapsed time
main <- function(opts) {
  ptm <- proc.time()

  params <- ReadParameterFile(opts$params)
  opts_map <- list(opts$map1, opts$map2)
  map <- lapply(opts_map, function(o) {
    ProcessMap(ReadMapFile(o, params = params), params = params)
  })

  # Reports must be of the format
  #     cohort no, rappor bitstring 1, rappor bitstring 2
  reportsObj <- read.csv(opts$reports,
                         colClasses = c("integer", "character", "character"),
                         header = FALSE)

  # Parsing reportsObj
  # ComputeDistributionEM allows for different sets of cohorts
  # for each variable. Here, both sets of cohorts are identical
  co <- reportsObj[[1]]
  cohorts <- list(co, co)

  # Split a bitstring such as "0110" into a numeric vector c(0, 1, 1, 0)
  # (bit arrays are required by assoc analysis)
  string_to_bits <- function(x) {
    as.numeric(strsplit(x, split = "")[[1]])
  }
  # Columns 2 and 3 hold the reports for the 1st and 2nd variable.
  reports <- lapply(2:3, function(col) {
    lapply(reportsObj[[col]], string_to_bits)
  })

  joint_dist <- ComputeDistributionEM(reports, cohorts, map,
                                      ignore_other = TRUE,
                                      params, marginals = NULL,
                                      estimate_var = FALSE)
  # TODO(pseudorandom): Export the results to a file for further analysis
  print("JOINT_DIST$FIT")
  print(joint_dist$fit)
  print("PROC.TIME")
  print(proc.time() - ptm)
}

if (!interactive()) {
  main(opts)
}