xref: /aosp_15_r20/external/rappor/tests/analyze_assoc.R (revision 2abb31345f6c95944768b5222a9a5ed3fc68cc00)
1*2abb3134SXin Li#!/usr/bin/env Rscript
2*2abb3134SXin Li#
3*2abb3134SXin Li# Copyright 2015 Google Inc. All rights reserved.
4*2abb3134SXin Li#
5*2abb3134SXin Li# Licensed under the Apache License, Version 2.0 (the "License");
6*2abb3134SXin Li# you may not use this file except in compliance with the License.
7*2abb3134SXin Li# You may obtain a copy of the License at
8*2abb3134SXin Li#
9*2abb3134SXin Li#     http://www.apache.org/licenses/LICENSE-2.0
10*2abb3134SXin Li#
11*2abb3134SXin Li# Unless required by applicable law or agreed to in writing, software
12*2abb3134SXin Li# distributed under the License is distributed on an "AS IS" BASIS,
13*2abb3134SXin Li# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14*2abb3134SXin Li# See the License for the specific language governing permissions and
15*2abb3134SXin Li# limitations under the License.
16*2abb3134SXin Li
17*2abb3134SXin Li# Reads map files, report files, and RAPPOR parameters to run
18*2abb3134SXin Li# an EM algorithm to estimate joint distribution over two or more variables
19*2abb3134SXin Li#
20*2abb3134SXin Li# Usage:
21*2abb3134SXin Li#       $ ./analyze_assoc.R -map1 map_1.csv -map2 map_2.csv \
22*2abb3134SXin Li#                                 -reports reports.csv \
23*2abb3134SXin Li# Inputs: map1, map2, reports, params
24*2abb3134SXin Li#         see how options are parsed below for more information
25*2abb3134SXin Li# Outputs:
26*2abb3134SXin Li#         prints a table with estimated joint probability masses
27*2abb3134SXin Li#         over candidate strings
28*2abb3134SXin Li#         Ex.
29*2abb3134SXin Li#                 ssl   nossl
30*2abb3134SXin Li#         intel   0.1   0.3
31*2abb3134SXin Li#         google  0.5   0.1
32*2abb3134SXin Li
33*2abb3134SXin Lilibrary("optparse")
34*2abb3134SXin Li
35*2abb3134SXin Lioptions(stringsAsFactors = FALSE)
36*2abb3134SXin Li
37*2abb3134SXin Liif(!interactive()) {
38*2abb3134SXin Li  option_list <- list(
39*2abb3134SXin Li    # Flags
40*2abb3134SXin Li    make_option(c("--map1", "-m1"), default = "map_1.csv",
41*2abb3134SXin Li                help = "Hashed candidates for 1st variable"),
42*2abb3134SXin Li    make_option(c("--map2", "-m2"), default = "map_2.csv",
43*2abb3134SXin Li                help = "Hashed candidates for 2nd variable"),
44*2abb3134SXin Li    make_option(c("--reports", "-r"), default = "reports.csv",
45*2abb3134SXin Li                help = "File with raw reports as <cohort, report1, report2>"),
46*2abb3134SXin Li    make_option(c("--params", "-p"), default = "params.csv",
47*2abb3134SXin Li                help = "Filename for RAPPOR parameters")
48*2abb3134SXin Li  )
49*2abb3134SXin Li  opts <- parse_args(OptionParser(option_list = option_list))
50*2abb3134SXin Li}
51*2abb3134SXin Li
52*2abb3134SXin Lisource("../analysis/R/encode.R")
53*2abb3134SXin Lisource("../analysis/R/decode.R")
54*2abb3134SXin Lisource("../analysis/R/simulation.R")
55*2abb3134SXin Lisource("../analysis/R/read_input.R")
56*2abb3134SXin Lisource("../analysis/R/association.R")
57*2abb3134SXin Li
58*2abb3134SXin Li# This function processes the maps loaded using ReadMapFile
59*2abb3134SXin Li# Association analysis requires a map object with a map
60*2abb3134SXin Li# field that has the map split into cohorts and an rmap field
61*2abb3134SXin Li# that has all the cohorts combined
62*2abb3134SXin Li# Arguments:
63*2abb3134SXin Li#       map = map object with cohorts as sparse matrix in
64*2abb3134SXin Li#             object map$map
65*2abb3134SXin Li#             This is the expected object from ReadMapFile
66*2abb3134SXin Li#       params = data field with parameters
67*2abb3134SXin Li# TODO(pseudorandom): move this functionality to ReadMapFile
68*2abb3134SXin LiProcessMap <- function(map, params) {
69*2abb3134SXin Li  map$rmap <- map$map
70*2abb3134SXin Li  split_map <- function(i, map_struct) {
71*2abb3134SXin Li    numbits <- params$k
72*2abb3134SXin Li    indices <- which(as.matrix(
73*2abb3134SXin Li      map_struct[((i - 1) * numbits + 1):(i * numbits),]) == TRUE,
74*2abb3134SXin Li      arr.ind = TRUE)
75*2abb3134SXin Li    sparseMatrix(indices[, "row"], indices[, "col"],
76*2abb3134SXin Li                 dims = c(numbits, max(indices[, "col"])))
77*2abb3134SXin Li  }
78*2abb3134SXin Li  map$map <- lapply(1:params$m, function(i) split_map(i, map$rmap))
79*2abb3134SXin Li  map
80*2abb3134SXin Li}
81*2abb3134SXin Li
82*2abb3134SXin Limain <- function(opts) {
83*2abb3134SXin Li  ptm <- proc.time()
84*2abb3134SXin Li
85*2abb3134SXin Li  params <- ReadParameterFile(opts$params)
86*2abb3134SXin Li  opts_map <- list(opts$map1, opts$map2)
87*2abb3134SXin Li  map <- lapply(opts_map, function(o)
88*2abb3134SXin Li                  ProcessMap(ReadMapFile(o, params = params),
89*2abb3134SXin Li                             params = params))
90*2abb3134SXin Li  # Reports must be of the format
91*2abb3134SXin Li  #     cohort no, rappor bitstring 1, rappor bitstring 2
92*2abb3134SXin Li  reportsObj <- read.csv(opts$reports,
93*2abb3134SXin Li                         colClasses = c("integer", "character", "character"),
94*2abb3134SXin Li                         header = FALSE)
95*2abb3134SXin Li
96*2abb3134SXin Li  # Parsing reportsObj
97*2abb3134SXin Li  # ComputeDistributionEM allows for different sets of cohorts
98*2abb3134SXin Li  # for each variable. Here, both sets of cohorts are identical
99*2abb3134SXin Li  co <- as.list(reportsObj[1])[[1]]
100*2abb3134SXin Li  cohorts <- list(co, co)
101*2abb3134SXin Li  # Parse reports from reportObj cols 2 and 3
102*2abb3134SXin Li  reports <- lapply(1:2, function(x) as.list(reportsObj[x + 1]))
103*2abb3134SXin Li
104*2abb3134SXin Li  # Split strings into bit arrays (as required by assoc analysis)
105*2abb3134SXin Li  reports <- lapply(1:2, function(i) {
106*2abb3134SXin Li    # apply the following function to each of reports[[1]] and reports[[2]]
107*2abb3134SXin Li    lapply(reports[[i]][[1]], function(x) {
108*2abb3134SXin Li      # function splits strings and converts them to numeric values
109*2abb3134SXin Li      as.numeric(strsplit(x, split = "")[[1]])
110*2abb3134SXin Li    })
111*2abb3134SXin Li  })
112*2abb3134SXin Li
113*2abb3134SXin Li  joint_dist <- ComputeDistributionEM(reports, cohorts, map,
114*2abb3134SXin Li                                      ignore_other = TRUE,
115*2abb3134SXin Li                                      params, marginals = NULL,
116*2abb3134SXin Li                                      estimate_var = FALSE)
117*2abb3134SXin Li  # TODO(pseudorandom): Export the results to a file for further analysis
118*2abb3134SXin Li  print("JOINT_DIST$FIT")
119*2abb3134SXin Li  print(joint_dist$fit)
120*2abb3134SXin Li  print("PROC.TIME")
121*2abb3134SXin Li  print(proc.time() - ptm)
122*2abb3134SXin Li}
123*2abb3134SXin Li
124*2abb3134SXin Liif(!interactive()) {
125*2abb3134SXin Li  main(opts)
126*2abb3134SXin Li}