xref: /aosp_15_r20/external/rappor/tests/assoc_sim.R (revision 2abb31345f6c95944768b5222a9a5ed3fc68cc00)
1*2abb3134SXin Li#!/usr/bin/env Rscript
2*2abb3134SXin Li#
3*2abb3134SXin Li# Copyright 2015 Google Inc. All rights reserved.
4*2abb3134SXin Li#
5*2abb3134SXin Li# Licensed under the Apache License, Version 2.0 (the "License");
6*2abb3134SXin Li# you may not use this file except in compliance with the License.
7*2abb3134SXin Li# You may obtain a copy of the License at
8*2abb3134SXin Li#
9*2abb3134SXin Li#     http://www.apache.org/licenses/LICENSE-2.0
10*2abb3134SXin Li#
11*2abb3134SXin Li# Unless required by applicable law or agreed to in writing, software
12*2abb3134SXin Li# distributed under the License is distributed on an "AS IS" BASIS,
13*2abb3134SXin Li# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14*2abb3134SXin Li# See the License for the specific language governing permissions and
15*2abb3134SXin Li# limitations under the License.
16*2abb3134SXin Li
17*2abb3134SXin Li# Simulates inputs on which association analysis can be run.
18*2abb3134SXin Li# Currently assoc_sim.R only supports 2 variables but can
19*2abb3134SXin Li# be easily extended to support more.
20*2abb3134SXin Li#
21*2abb3134SXin Li# Usage:
22*2abb3134SXin Li#       $ ./assoc_sim.R -n 1000
23*2abb3134SXin Li# Inputs: uvals, params, reports, map, num, unif
24*2abb3134SXin Li#         see how options are parsed below for more information
25*2abb3134SXin Li# Outputs:
26*2abb3134SXin Li#         reports.csv file containing reports
27*2abb3134SXin Li#         map_{1, 2, ...}.csv file(s) containing maps of variables
28*2abb3134SXin Li
29*2abb3134SXin Lilibrary("optparse")
30*2abb3134SXin Li
31*2abb3134SXin Lioptions(stringsAsFactors = FALSE)
32*2abb3134SXin Li
33*2abb3134SXin Liif(!interactive()) {
34*2abb3134SXin Li  option_list <- list(
35*2abb3134SXin Li    make_option(c("--uvals", "-v"), default = "uvals.csv",
36*2abb3134SXin Li                help = "Filename for list of values over which
37*2abb3134SXin Li                distributions are simulated. The file is a list of
38*2abb3134SXin Li                comma-separated strings each line of which refers
39*2abb3134SXin Li                to a new variable."),
40*2abb3134SXin Li    make_option(c("--params", "-p"), default = "params.csv",
41*2abb3134SXin Li                help = "Filename for RAPPOR parameters"),
42*2abb3134SXin Li    make_option(c("--reports", "-r"), default = "reports.csv",
43*2abb3134SXin Li                help = "Filename for reports"),
44*2abb3134SXin Li    make_option(c("--map", "-m"), default = "map",
45*2abb3134SXin Li                help = "Filename *prefix* for map(s)"),
46*2abb3134SXin Li    make_option(c("--num", "-n"), default = 1e05,
47*2abb3134SXin Li                help = "Number of reports"),
48*2abb3134SXin Li    make_option(c("--unif", "-u"), default = FALSE,
49*2abb3134SXin Li                help = "Run simulation with uniform distribution")
50*2abb3134SXin Li  )
51*2abb3134SXin Li  opts <- parse_args(OptionParser(option_list = option_list))
52*2abb3134SXin Li}
53*2abb3134SXin Li
54*2abb3134SXin Lisource("../analysis/R/encode.R")
55*2abb3134SXin Lisource("../analysis/R/decode.R")
56*2abb3134SXin Lisource("../analysis/R/simulation.R")
57*2abb3134SXin Lisource("../analysis/R/read_input.R")
58*2abb3134SXin Lisource("../analysis/R/association.R")
59*2abb3134SXin Li
60*2abb3134SXin Li# Read unique values of reports from a csv file
61*2abb3134SXin Li# Inputs: filename. The file is expected to contain two rows of strings
62*2abb3134SXin Li#         (one for each variable):
63*2abb3134SXin Li#         "google.com", "apple.com", ...
64*2abb3134SXin Li#         "ssl", "nossl", ...
65*2abb3134SXin Li# Returns: a list containing strings
66*2abb3134SXin LiGetUniqueValsFromFile <- function(filename) {
67*2abb3134SXin Li  contents <- read.csv(filename, header = FALSE)
68*2abb3134SXin Li  # Expect 2 rows of unique vals
69*2abb3134SXin Li  if(nrow(contents) != 2) {
70*2abb3134SXin Li    stop(paste("Unique vals file", filename, "expected to have
71*2abb3134SXin Li               two rows of strings."))
72*2abb3134SXin Li  }
73*2abb3134SXin Li  # Removes superfluous "" entries if the lists of unique values
74*2abb3134SXin Li  # differ in length
75*2abb3134SXin Li  strip_empty <- function(vec) {
76*2abb3134SXin Li    vec[!vec %in% c("")]
77*2abb3134SXin Li  }
78*2abb3134SXin Li  list(var1 = strip_empty(as.vector(t(contents[1,]))),
79*2abb3134SXin Li       var2 = strip_empty(as.vector(t(contents[2,]))))
80*2abb3134SXin Li}
81*2abb3134SXin Li
82*2abb3134SXin Li# Simulate correlated reports and write into reportsfile
83*2abb3134SXin Li# Inputs: N = number of reports
84*2abb3134SXin Li#         uvals = list containing a list of unique values
85*2abb3134SXin Li#         params = list with RAPPOR parameters
86*2abb3134SXin Li#         unif = whether to replace poisson with uniform
87*2abb3134SXin Li#         mapfile = file to write maps into (with .csv suffixes)
88*2abb3134SXin Li#         reportsfile = file to write reports into (with .csv suffix)
89*2abb3134SXin LiSimulateReports <- function(N, uvals, params, unif,
90*2abb3134SXin Li                            mapfile, reportsfile) {
91*2abb3134SXin Li  # Compute true distribution
92*2abb3134SXin Li  m <- params$m
93*2abb3134SXin Li
94*2abb3134SXin Li  if (unif) {
95*2abb3134SXin Li    # Draw uniformly from 1 to 10
96*2abb3134SXin Li    v1_samples <- as.integer(runif(N, 1, 10))
97*2abb3134SXin Li  } else {
98*2abb3134SXin Li    # Draw from a Poisson random variable
99*2abb3134SXin Li    v1_samples <- rpois(N, 1) + rep(1, N)
100*2abb3134SXin Li  }
101*2abb3134SXin Li
102*2abb3134SXin Li  # Pr[var2 = N + 1 | var1 = N] = 0.5
103*2abb3134SXin Li  # Pr[var2 = N     | var1 = N] = 0.5
104*2abb3134SXin Li  v2_samples <- v1_samples + sample(c(0, 1), N, replace = TRUE)
105*2abb3134SXin Li
106*2abb3134SXin Li  tmp_samples <- list(v1_samples, v2_samples)
107*2abb3134SXin Li
108*2abb3134SXin Li  # Function to pad strings to uval_vec if sample_vec has
109*2abb3134SXin Li  # larger support than the number of strings in uval_vec
110*2abb3134SXin Li  # For e.g., if samples have support {1, 2, 3, 4, ...} and uvals
111*2abb3134SXin Li  # only have "value1", "value2", and "value3", samples now
112*2abb3134SXin Li  # over support {"value1", "value2", "value3", "str4", ...}
113*2abb3134SXin Li  PadStrings <- function(sample_vec, uval_vec) {
114*2abb3134SXin Li    if (max(sample_vec) > length(uval_vec)) {
115*2abb3134SXin Li      # Padding uvals to required length
116*2abb3134SXin Li      len <- length(uval_vec)
117*2abb3134SXin Li      max_of_samples <- max(sample_vec)
118*2abb3134SXin Li      uval_vec[(len + 1):max_of_samples] <- apply(
119*2abb3134SXin Li        as.matrix((len + 1):max_of_samples),
120*2abb3134SXin Li        1,
121*2abb3134SXin Li        function(i) sprintf("str%d", i))
122*2abb3134SXin Li    }
123*2abb3134SXin Li    uval_vec
124*2abb3134SXin Li  }
125*2abb3134SXin Li
126*2abb3134SXin Li  # Pad and update uvals
127*2abb3134SXin Li  uvals <- lapply(1:2, function(i) PadStrings(tmp_samples[[i]],
128*2abb3134SXin Li                                              uvals[[i]]))
129*2abb3134SXin Li
130*2abb3134SXin Li  # Replace integers in tmp_samples with actual sample strings
131*2abb3134SXin Li  samples <- lapply(1:2, function(i) uvals[[i]][tmp_samples[[i]]])
132*2abb3134SXin Li
133*2abb3134SXin Li  # Randomly assign cohorts in each dimension
134*2abb3134SXin Li  cohorts <- sample(1:m, N, replace = TRUE)
135*2abb3134SXin Li
136*2abb3134SXin Li  # Create and write map into mapfile_1.csv and mapfile_2.csv
137*2abb3134SXin Li  map <- lapply(uvals, function(u) CreateMap(u, params))
138*2abb3134SXin Li  write.table(map[[1]]$map_pos, file = paste(mapfile, "_1.csv", sep = ""),
139*2abb3134SXin Li              sep = ",", col.names = FALSE, na = "", quote = FALSE)
140*2abb3134SXin Li  write.table(map[[2]]$map_pos, file = paste(mapfile, "_2.csv", sep = ""),
141*2abb3134SXin Li              sep = ",", col.names = FALSE, na = "", quote = FALSE)
142*2abb3134SXin Li
143*2abb3134SXin Li  # Write reports into a csv file
144*2abb3134SXin Li  # Format:
145*2abb3134SXin Li  #     cohort, bloom filter var1, bloom filter var2
146*2abb3134SXin Li  reports <- lapply(1:2, function(i)
147*2abb3134SXin Li    EncodeAll(samples[[i]], cohorts, map[[i]]$map, params))
148*2abb3134SXin Li  # Organize cohorts and reports into format
149*2abb3134SXin Li  write_matrix <- cbind(as.matrix(cohorts),
150*2abb3134SXin Li                        as.matrix(lapply(reports[[1]],
151*2abb3134SXin Li                            function(x) paste(x, collapse = ""))),
152*2abb3134SXin Li                        as.matrix(lapply(reports[[2]],
153*2abb3134SXin Li                            function(x) paste(x, collapse = ""))))
154*2abb3134SXin Li  write.table(write_matrix, file = reportsfile, quote = FALSE,
155*2abb3134SXin Li              row.names = FALSE, col.names = FALSE, sep = ",")
156*2abb3134SXin Li}
157*2abb3134SXin Li
158*2abb3134SXin Limain <- function(opts) {
159*2abb3134SXin Li  ptm <- proc.time()
160*2abb3134SXin Li
161*2abb3134SXin Li  uvals <- GetUniqueValsFromFile(opts$uvals)
162*2abb3134SXin Li  params <- ReadParameterFile(opts$params)
163*2abb3134SXin Li  SimulateReports(opts$num, uvals, params,  opts$unif, # inputs
164*2abb3134SXin Li                  opts$map, opts$reports)              # outputs
165*2abb3134SXin Li
166*2abb3134SXin Li  print("PROC.TIME")
167*2abb3134SXin Li  print(proc.time() - ptm)
168*2abb3134SXin Li}
169*2abb3134SXin Li
170*2abb3134SXin Liif(!interactive()) {
171*2abb3134SXin Li  main(opts)
172*2abb3134SXin Li}
173