1*2abb3134SXin Li#!/usr/bin/env Rscript 2*2abb3134SXin Li# 3*2abb3134SXin Li# Copyright 2015 Google Inc. All rights reserved. 4*2abb3134SXin Li# 5*2abb3134SXin Li# Licensed under the Apache License, Version 2.0 (the "License"); 6*2abb3134SXin Li# you may not use this file except in compliance with the License. 7*2abb3134SXin Li# You may obtain a copy of the License at 8*2abb3134SXin Li# 9*2abb3134SXin Li# http://www.apache.org/licenses/LICENSE-2.0 10*2abb3134SXin Li# 11*2abb3134SXin Li# Unless required by applicable law or agreed to in writing, software 12*2abb3134SXin Li# distributed under the License is distributed on an "AS IS" BASIS, 13*2abb3134SXin Li# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14*2abb3134SXin Li# See the License for the specific language governing permissions and 15*2abb3134SXin Li# limitations under the License. 16*2abb3134SXin Li 17*2abb3134SXin Li# Reads map files, report files, and RAPPOR parameters to run 18*2abb3134SXin Li# an EM algorithm to estimate joint distribution over two or more variables 19*2abb3134SXin Li# 20*2abb3134SXin Li# Usage: 21*2abb3134SXin Li# $ ./analyze_assoc.R -map1 map_1.csv -map2 map_2.csv \ 22*2abb3134SXin Li# -reports reports.csv \ 23*2abb3134SXin Li# Inputs: map1, map2, reports, params 24*2abb3134SXin Li# see how options are parsed below for more information 25*2abb3134SXin Li# Outputs: 26*2abb3134SXin Li# prints a table with estimated joint probability masses 27*2abb3134SXin Li# over candidate strings 28*2abb3134SXin Li# Ex. 29*2abb3134SXin Li# ssl nossl 30*2abb3134SXin Li# intel 0.1 0.3 31*2abb3134SXin Li# google 0.5 0.1 32*2abb3134SXin Li 33*2abb3134SXin Lilibrary("optparse") 34*2abb3134SXin Li 35*2abb3134SXin Lioptions(stringsAsFactors = FALSE) 36*2abb3134SXin Li 37*2abb3134SXin Liif(!interactive()) { 38*2abb3134SXin Li option_list <- list( 39*2abb3134SXin Li # Flags 40*2abb3134SXin Li make_option(c("--map1", "-m1"), default = "map_1.csv", 41*2abb3134SXin Li help = "Hashed candidates for 1st variable"), 42*2abb3134SXin Li make_option(c("--map2", "-m2"), default = "map_2.csv", 43*2abb3134SXin Li help = "Hashed candidates for 2nd variable"), 44*2abb3134SXin Li make_option(c("--reports", "-r"), default = "reports.csv", 45*2abb3134SXin Li help = "File with raw reports as <cohort, report1, report2>"), 46*2abb3134SXin Li make_option(c("--params", "-p"), default = "params.csv", 47*2abb3134SXin Li help = "Filename for RAPPOR parameters") 48*2abb3134SXin Li ) 49*2abb3134SXin Li opts <- parse_args(OptionParser(option_list = option_list)) 50*2abb3134SXin Li} 51*2abb3134SXin Li 52*2abb3134SXin Lisource("../analysis/R/encode.R") 53*2abb3134SXin Lisource("../analysis/R/decode.R") 54*2abb3134SXin Lisource("../analysis/R/simulation.R") 55*2abb3134SXin Lisource("../analysis/R/read_input.R") 56*2abb3134SXin Lisource("../analysis/R/association.R") 57*2abb3134SXin Li 58*2abb3134SXin Li# This function processes the maps loaded using ReadMapFile 59*2abb3134SXin Li# Association analysis requires a map object with a map 60*2abb3134SXin Li# field that has the map split into cohorts and an rmap field 61*2abb3134SXin Li# that has all the cohorts combined 62*2abb3134SXin Li# Arguments: 63*2abb3134SXin Li# map = map object with cohorts as sparse matrix in 64*2abb3134SXin Li# object map$map 65*2abb3134SXin Li# This is the expected object from ReadMapFile 66*2abb3134SXin Li# params = data field with parameters 67*2abb3134SXin Li# TODO(pseudorandom): move this functionality to ReadMapFile 68*2abb3134SXin LiProcessMap <- function(map, params) { 69*2abb3134SXin Li map$rmap <- map$map 70*2abb3134SXin Li split_map <- function(i, map_struct) { 71*2abb3134SXin Li numbits <- params$k 72*2abb3134SXin Li indices <- which(as.matrix( 73*2abb3134SXin Li map_struct[((i - 1) * numbits + 1):(i * numbits),]) == TRUE, 74*2abb3134SXin Li arr.ind = TRUE) 75*2abb3134SXin Li sparseMatrix(indices[, "row"], indices[, "col"], 76*2abb3134SXin Li dims = c(numbits, max(indices[, "col"]))) 77*2abb3134SXin Li } 78*2abb3134SXin Li map$map <- lapply(1:params$m, function(i) split_map(i, map$rmap)) 79*2abb3134SXin Li map 80*2abb3134SXin Li} 81*2abb3134SXin Li 82*2abb3134SXin Limain <- function(opts) { 83*2abb3134SXin Li ptm <- proc.time() 84*2abb3134SXin Li 85*2abb3134SXin Li params <- ReadParameterFile(opts$params) 86*2abb3134SXin Li opts_map <- list(opts$map1, opts$map2) 87*2abb3134SXin Li map <- lapply(opts_map, function(o) 88*2abb3134SXin Li ProcessMap(ReadMapFile(o, params = params), 89*2abb3134SXin Li params = params)) 90*2abb3134SXin Li # Reports must be of the format 91*2abb3134SXin Li # cohort no, rappor bitstring 1, rappor bitstring 2 92*2abb3134SXin Li reportsObj <- read.csv(opts$reports, 93*2abb3134SXin Li colClasses = c("integer", "character", "character"), 94*2abb3134SXin Li header = FALSE) 95*2abb3134SXin Li 96*2abb3134SXin Li # Parsing reportsObj 97*2abb3134SXin Li # ComputeDistributionEM allows for different sets of cohorts 98*2abb3134SXin Li # for each variable. Here, both sets of cohorts are identical 99*2abb3134SXin Li co <- as.list(reportsObj[1])[[1]] 100*2abb3134SXin Li cohorts <- list(co, co) 101*2abb3134SXin Li # Parse reports from reportObj cols 2 and 3 102*2abb3134SXin Li reports <- lapply(1:2, function(x) as.list(reportsObj[x + 1])) 103*2abb3134SXin Li 104*2abb3134SXin Li # Split strings into bit arrays (as required by assoc analysis) 105*2abb3134SXin Li reports <- lapply(1:2, function(i) { 106*2abb3134SXin Li # apply the following function to each of reports[[1]] and reports[[2]] 107*2abb3134SXin Li lapply(reports[[i]][[1]], function(x) { 108*2abb3134SXin Li # function splits strings and converts them to numeric values 109*2abb3134SXin Li as.numeric(strsplit(x, split = "")[[1]]) 110*2abb3134SXin Li }) 111*2abb3134SXin Li }) 112*2abb3134SXin Li 113*2abb3134SXin Li joint_dist <- ComputeDistributionEM(reports, cohorts, map, 114*2abb3134SXin Li ignore_other = TRUE, 115*2abb3134SXin Li params, marginals = NULL, 116*2abb3134SXin Li estimate_var = FALSE) 117*2abb3134SXin Li # TODO(pseudorandom): Export the results to a file for further analysis 118*2abb3134SXin Li print("JOINT_DIST$FIT") 119*2abb3134SXin Li print(joint_dist$fit) 120*2abb3134SXin Li print("PROC.TIME") 121*2abb3134SXin Li print(proc.time() - ptm) 122*2abb3134SXin Li} 123*2abb3134SXin Li 124*2abb3134SXin Liif(!interactive()) { 125*2abb3134SXin Li main(opts) 126*2abb3134SXin Li}