xref: /aosp_15_r20/external/rappor/bin/decode_assoc.R (revision 2abb31345f6c95944768b5222a9a5ed3fc68cc00)
1#!/usr/bin/env Rscript
2#
3# Command line tool to decode multidimensional reports.  It's a simple wrapper
4# around functions in association.R.
5
6library(optparse)
7
8#
9# Command line parsing.  Do this first before loading libraries to catch errors
10# quickly.  Loading libraries in R is slow.
11#
12
13# Display an error string and quit.
14UsageError <- function(...) {
15  cat(sprintf(...))
16  cat('\n')
17  quit(status = 1)
18}
19
20option_list <- list(
21    make_option(
22        "--metric-name", dest="metric_name", default="",
23        help="Name of the metric; metrics contain variables (required)"),
24    make_option(
25        "--reports", default="",
26        help="CSV file with reports; each variable is a column (required)"),
27    make_option(
28        "--schema", default="",
29        help="CSV file with variable types and metadata (required)"),
30    make_option(
31        "--params-dir", dest="params_dir", default="",
32        help="Directory where parameter CSV files are stored (required)"),
33
34    make_option(
35        "--var1", default="",
36        help="Name of first variable (required)"),
37    make_option(
38        "--var2", default="",
39        help="Name of second variable (required)"),
40
41    make_option(
42        "--map1", default="",
43        help="Path to map file, if var1 is a string"),
44    make_option(
45        "--map2", default="",
46        help="Path to map file, if var2 is a string"),
47
48    make_option(
49        "--output-dir", dest="output_dir", default=".",
50        help="Output directory (default .)"),
51
52    make_option(
53        "--create-bool-map", dest="create_bool_map", default=FALSE,
54        action="store_true",
55        help="Hack to use string RAPPOR to analyze boolean variables."),
56    make_option(
57        "--remove-bad-rows", dest="remove_bad_rows", default=FALSE,
58        action="store_true",
59        help="Whether we should remove rows where any value is missing (by
60             default, the program aborts with an error)"),
61
62    # Options that speed it up
63    make_option(
64        "--reports-sample-size", dest="reports_sample_size", default=-1,
65        help="Only analyze a random sample of this size.  This is for
66              limiting the execution time at the expense of accuracy."),
67    make_option(
68        "--num-cores", dest="num_cores", default=1,
69        help="Number of cores for mclapply to use.  Speeds up the parts
70              of the computation proportional to the number of reports,
71              EXCEPT the EM step, which can be sped up by native code."),
72    make_option(
73        "--max-em-iters", dest="max_em_iters", default=1000,
74        help="Maximum number of EM iterations"),
75    make_option(
76        "--em-executable", dest="em_executable", default="",
77        help="Shell out to this executable for an accelerated implementation
78             of EM."),
79    make_option(
80        "--tmp-dir", dest="tmp_dir", default="/tmp",
81        help="Use this tmp dir to communicate with the EM executable")
82)
83
84ParseOptions <- function() {
85  # NOTE: This API is bad; if you add positional_arguments, the return value
86  # changes!
87  parser <- OptionParser(option_list = option_list)
88  opts <- parse_args(parser)
89
90  if (opts$metric_name == "") {
91    UsageError("--metric-name is required.")
92  }
93  if (opts$reports== "") {
94    UsageError("--reports is required.")
95  }
96  if (opts$schema == "") {
97    UsageError("--schema is required.")
98  }
99  if (opts$params_dir == "") {
100    UsageError("--params-dir is required.")
101  }
102  if (opts$var1 == "") {
103    UsageError("--var1 is required.")
104  }
105  if (opts$var2 == "") {
106    UsageError("--var2 is required.")
107  }
108
109  return(opts)
110}
111
112if (!interactive()) {
113  opts <- ParseOptions()
114}
115
116#
117# Load libraries and source our own code.
118#
119
120library(RJSONIO)  # toJSON()
121
122# So we don't have to change pwd
123source.rappor <- function(rel_path)  {
124  abs_path <- paste0(Sys.getenv("RAPPOR_REPO", ""), rel_path)
125  source(abs_path)
126}
127
128source.rappor("analysis/R/association.R")
129source.rappor("analysis/R/fast_em.R")
130source.rappor("analysis/R/read_input.R")
131source.rappor("analysis/R/util.R")
132
133options(stringsAsFactors = FALSE)
134options(max.print = 100)  # So our structure() debug calls look better
135
136CreateAssocStringMap <- function(all_cohorts_map, params) {
137  # Processes the maps loaded using ReadMapFile and turns it into something
138  # that association.R can use.  Namely, we want a map per cohort.
139  #
140  # Arguments:
141  #   all_cohorts_map: map matrix, as for single variable analysis
142  #   params: encoding parameters
143
144  if (nrow(all_cohorts_map) != (params$m * params$k)) {
145    stop(sprintf(
146        "Map matrix has invalid dimensions: m * k = %d, nrow(map) = %d",
147        params$m * params$k, nrow(all_cohorts_map)))
148  }
149
150  k <- params$k
151  map_by_cohort <- lapply(0 : (params$m-1), function(cohort) {
152    begin <- cohort * k
153    end <- (cohort + 1) * k
154    all_cohorts_map[(begin+1) : end, ]
155  })
156
157  list(all_cohorts_map = all_cohorts_map, map_by_cohort = map_by_cohort)
158}
159
160# Hack to create a map for booleans.  We should use closed-form formulas instead.
161CreateAssocBoolMap <- function(params) {
162  names <- c("FALSE", "TRUE")
163
164  map_by_cohort <- lapply(1:params$m, function(unused_cohort) {
165    # The (1,1) cell is false and the (1,2) cell is true.
166    m <- sparseMatrix(c(1), c(2), dims = c(1, 2))
167    colnames(m) <- names
168    m
169  })
170
171  all_cohorts_map <- sparseMatrix(1:params$m, rep(2, params$m))
172  colnames(all_cohorts_map) <- names
173
174  list(map_by_cohort = map_by_cohort, all_cohorts_map = all_cohorts_map)
175}
176
177ResultMatrixToDataFrame <- function(m, string_var_name, bool_var_name) {
178  # Args:
179  #   m: A 2D matrix as output by ComputeDistributionEM, e.g.
180  #          bing.com yahoo.com google.com       Other
181  #   TRUE  0.2718526 0.1873424 0.19637704 0.003208933
182  #   Other 0.1404581 0.1091826 0.08958427 0.001994163
183  # Returns:
184  #   A flattened data frame, e.g.
185
186  # Name the dimensions of the matrix.
187  dim_names <- list()
188  # TODO: generalize this.  Right now we're assuming the first dimension is
189  # boolean.
190  dim_names[[bool_var_name]] <- c('TRUE', 'FALSE')
191  dim_names[[string_var_name]] <- dimnames(m)[[2]]
192
193  dimnames(m) <- dim_names
194
195  # http://stackoverflow.com/questions/15885111/create-data-frame-from-a-matrix-in-r
196  fit_df <- as.data.frame(as.table(m))
197
198  # The as.table conversion gives you a Freq column.  Call it "proportion" to
199  # be consistent with single variable analysis.
200  colnames(fit_df)[colnames(fit_df) == "Freq"] <- "proportion"
201
202  fit_df
203}
204
205main <- function(opts) {
206  Log("decode-assoc")
207  Log("argv:")
208  print(commandArgs(TRUE))
209
210  schema <- read.csv(opts$schema)
211  Log("Read %d vars from schema", nrow(schema))
212
213  schema1 <- schema[schema$metric == opts$metric_name &
214                    schema$var == opts$var1, ]
215  if (nrow(schema1) == 0) {
216    UsageError("Couldn't find metric '%s', field '%s' in schema",
217               opts$metric_name, opts$var1)
218  }
219  schema2 <- schema[schema$metric == opts$metric_name &
220                    schema$var== opts$var2, ]
221  if (nrow(schema2) == 0) {
222    UsageError("Couldn't find metric '%s', field '%s' in schema",
223               opts$metric_name, opts$var2)
224  }
225
226  if (schema1$params != schema2$params) {
227    UsageError('var1 and var2 should have the same params (%s != %s)',
228               schema1$params, schema2$params)
229  }
230  params_name <- schema1$params
231  params_path <- file.path(opts$params_dir, paste0(params_name, '.csv'))
232  params <- ReadParameterFile(params_path)
233
234  var1_type <- schema1$var_type
235  var2_type <- schema2$var_type
236
237  # Right now we're assuming that --var1 is a string and --var2 is a boolean.
238  # TODO: Remove these limitations.
239  if (var1_type != "string") {
240    UsageError("Variable 1 should be a string (%s is of type %s)", opts$var1,
241               var1_type)
242  }
243  if (var2_type != "boolean") {
244    UsageError("Variable 2 should be a boolean (%s is of type %s)", opts$var2,
245               var2_type)
246  }
247
248  if (opts$map1 == "") {
249    UsageError("--map1 must be provided when --var1 is a string (var = %s)",
250               opts$var1)
251  }
252
253  # Example cache speedup for 100k map file: 31 seconds to load map and write
254  # cache; vs 2.2 seconds to read cache.
255  string_params <- params
256  map <- LoadMapFile(opts$map1, string_params)
257
258  # Important: first column is cohort (integer); the rest are variables, which
259  # are ASCII bit strings.
260  reports <- read.csv(opts$reports, colClasses=c("character"), as.is = TRUE)
261
262  Log("Read %d reports.  Preview:", nrow(reports))
263  print(head(reports))
264  cat('\n')
265
266  # Filter bad reports first
267  is_empty1 <- reports[[opts$var1]] == ""
268  is_empty2 <- reports[[opts$var2]] == ""
269  Log('Found %d blank values in %s', sum(is_empty1), opts$var1)
270  Log('Found %d blank values in %s', sum(is_empty2), opts$var2)
271
272  is_empty <- is_empty1 | is_empty2 # boolean vectors
273  Log('%d bad rows', sum(is_empty))
274  if (sum(is_empty) > 0) {
275    if (opts$remove_bad_rows) {
276      reports <- reports[!is_empty, ]
277      Log('Removed %d rows, giving %d rows', sum(is_empty), nrow(reports))
278    } else {
279      stop("Found bad rows and --remove-bad-rows wasn't passed")
280    }
281  }
282
283  N <- nrow(reports)
284
285  if (N == 0) {
286    # Use an arbitrary error code when there is nothing to analyze, so we can
287    # distinguish this from more serious failures.
288    Log("No reports to analyze.  Exiting with code 9.")
289    quit(status = 9)
290  }
291
292  # Sample reports if specified.
293  if (opts$reports_sample_size != -1) {
294    if (N > opts$reports_sample_size) {
295      indices <- sample(1:N, opts$reports_sample_size)
296      reports <- reports[indices, ]
297      Log("Created a sample of %d reports", nrow(reports))
298    } else {
299      Log("Got less than %d reports, not sampling", opts$reports_sample_size)
300    }
301  }
302
303  num_vars <- 2  # hard-coded for now, since there is --var1 and --var2.
304
305  # Convert strings to integers
306  cohorts <- as.integer(reports$cohort)
307
308  # Hack for Chrome: like AdjustCounts in decode_dist.R.
309  cohorts <- cohorts %% params$m
310
311  # Assume the input has 0-based cohorts, and change to 1-based cohorts.
312  cohorts <- cohorts + 1
313
314  # i.e. create a list of length 2, with identical cohorts.
315  # NOTE: Basic RAPPOR doesn't need cohorts.
316  cohorts_list <- rep(list(cohorts), num_vars)
317
318  # TODO: We should use the closed-form formulas rather than calling the
319  # solver, and not require this flag.
320  if (!opts$create_bool_map) {
321    stop("ERROR: pass --create-bool-map to analyze booleans.")
322  }
323
324  bool_params <- params
325  # HACK: Make this the boolean.  The Decode() step uses k.  (Note that R makes
326  # a copy here)
327  bool_params$k <- 1
328
329  params_list <- list(bool_params, string_params)
330
331  Log('CreateAssocStringMap')
332  string_map <- CreateAssocStringMap(map$map, params)
333
334  Log('CreateAssocBoolMap')
335  bool_map <- CreateAssocBoolMap(params)
336
337  map_list <- list(bool_map, string_map)
338
339  string_var <- reports[[opts$var1]]
340  bool_var <- reports[[opts$var2]]
341
342  Log('Preview of string var:')
343  print(head(table(string_var)))
344  cat('\n')
345
346  Log('Preview of bool var:')
347  print(head(table(bool_var)))
348  cat('\n')
349
350  # Split ASCII strings into array of numerics (as required by association.R)
351
352  Log('Splitting string reports (%d cores)', opts$num_cores)
353  string_reports <- mclapply(string_var, function(x) {
354    # function splits strings and converts them to numeric values
355    # rev needed for endianness
356    rev(as.integer(strsplit(x, split = "")[[1]]))
357  }, mc.cores = opts$num_cores)
358
359  Log('Splitting bool reports (%d cores)', opts$num_cores)
360  # Has to be an list of length 1 integer vectors
361  bool_reports <- mclapply(bool_var, function(x) {
362    as.integer(x)
363  }, mc.cores = opts$num_cores)
364
365  reports_list <- list(bool_reports, string_reports)
366
367  Log('Association for %d vars', length(reports_list))
368
369  if (opts$em_executable != "") {
370    Log('Will shell out to %s for native EM implementation', opts$em_executable)
371    em_iter_func <- ConstructFastEM(opts$em_executable, opts$tmp_dir)
372  } else {
373    Log('Will use R implementation of EM (slow)')
374    em_iter_func <- EM
375  }
376
377  assoc_result <- ComputeDistributionEM(reports_list, cohorts_list, map_list,
378                                        ignore_other = FALSE,
379                                        params_list = params_list,
380                                        marginals = NULL,
381                                        estimate_var = FALSE,
382                                        num_cores = opts$num_cores,
383                                        em_iter_func = em_iter_func,
384                                        max_em_iters = opts$max_em_iters)
385
386  # This happens if the marginal can't be decoded.
387  if (is.null(assoc_result)) {
388    stop("ComputeDistributionEM failed.")
389  }
390
391  # NOTE: It would be nicer if reports_list, cohorts_list, etc. were indexed by
392  # names like 'domain' rather than numbers, and the result assoc_result$fit
393  # matrix had corresponding named dimensions.  Instead we call
394  # ResultMatrixToDataFrame to do this.
395
396  fit <- assoc_result$fit
397  fit_df <- ResultMatrixToDataFrame(fit, opts$var1, opts$var2)
398
399  Log("Association results:")
400  print(fit_df)
401  cat('\n')
402
403  results_csv_path <- file.path(opts$output_dir, 'assoc-results.csv')
404  write.csv(fit_df, file = results_csv_path, row.names = FALSE)
405  Log("Wrote %s", results_csv_path)
406
407  # Measure elapsed time as close to the end as possible
408  total_elapsed_time <- proc.time()[['elapsed']]
409
410  metrics <- list(num_reports = N,
411                  reports_sample_size = opts$reports_sample_size,
412                  # fit is a matrix
413                  estimate_dimensions = dim(fit),
414                  # should sum to near 1.0
415                  sum_estimates = sum(fit),
416                  total_elapsed_time = total_elapsed_time,
417                  em_elapsed_time = assoc_result$em_elapsed_time,
418                  num_em_iters = assoc_result$num_em_iters)
419
420  metrics_json_path <- file.path(opts$output_dir, 'assoc-metrics.json')
421  writeLines(toJSON(metrics), con = metrics_json_path)
422  Log("Wrote %s", metrics_json_path)
423
424  Log('DONE decode-assoc')
425}
426
427if (!interactive()) {
428  main(opts)
429}
430