xref: /aosp_15_r20/external/rappor/bin/decode_assoc.R (revision 2abb31345f6c95944768b5222a9a5ed3fc68cc00)
1*2abb3134SXin Li#!/usr/bin/env Rscript
2*2abb3134SXin Li#
3*2abb3134SXin Li# Command line tool to decode multidimensional reports.  It's a simple wrapper
4*2abb3134SXin Li# around functions in association.R.
5*2abb3134SXin Li
6*2abb3134SXin Lilibrary(optparse)
7*2abb3134SXin Li
8*2abb3134SXin Li#
9*2abb3134SXin Li# Command line parsing.  Do this first before loading libraries to catch errors
10*2abb3134SXin Li# quickly.  Loading libraries in R is slow.
11*2abb3134SXin Li#
12*2abb3134SXin Li
13*2abb3134SXin Li# Display an error string and quit.
14*2abb3134SXin LiUsageError <- function(...) {
15*2abb3134SXin Li  cat(sprintf(...))
16*2abb3134SXin Li  cat('\n')
17*2abb3134SXin Li  quit(status = 1)
18*2abb3134SXin Li}
19*2abb3134SXin Li
20*2abb3134SXin Lioption_list <- list(
21*2abb3134SXin Li    make_option(
22*2abb3134SXin Li        "--metric-name", dest="metric_name", default="",
23*2abb3134SXin Li        help="Name of the metric; metrics contain variables (required)"),
24*2abb3134SXin Li    make_option(
25*2abb3134SXin Li        "--reports", default="",
26*2abb3134SXin Li        help="CSV file with reports; each variable is a column (required)"),
27*2abb3134SXin Li    make_option(
28*2abb3134SXin Li        "--schema", default="",
29*2abb3134SXin Li        help="CSV file with variable types and metadata (required)"),
30*2abb3134SXin Li    make_option(
31*2abb3134SXin Li        "--params-dir", dest="params_dir", default="",
32*2abb3134SXin Li        help="Directory where parameter CSV files are stored (required)"),
33*2abb3134SXin Li
34*2abb3134SXin Li    make_option(
35*2abb3134SXin Li        "--var1", default="",
36*2abb3134SXin Li        help="Name of first variable (required)"),
37*2abb3134SXin Li    make_option(
38*2abb3134SXin Li        "--var2", default="",
39*2abb3134SXin Li        help="Name of second variable (required)"),
40*2abb3134SXin Li
41*2abb3134SXin Li    make_option(
42*2abb3134SXin Li        "--map1", default="",
43*2abb3134SXin Li        help="Path to map file, if var1 is a string"),
44*2abb3134SXin Li    make_option(
45*2abb3134SXin Li        "--map2", default="",
46*2abb3134SXin Li        help="Path to map file, if var2 is a string"),
47*2abb3134SXin Li
48*2abb3134SXin Li    make_option(
49*2abb3134SXin Li        "--output-dir", dest="output_dir", default=".",
50*2abb3134SXin Li        help="Output directory (default .)"),
51*2abb3134SXin Li
52*2abb3134SXin Li    make_option(
53*2abb3134SXin Li        "--create-bool-map", dest="create_bool_map", default=FALSE,
54*2abb3134SXin Li        action="store_true",
55*2abb3134SXin Li        help="Hack to use string RAPPOR to analyze boolean variables."),
56*2abb3134SXin Li    make_option(
57*2abb3134SXin Li        "--remove-bad-rows", dest="remove_bad_rows", default=FALSE,
58*2abb3134SXin Li        action="store_true",
59*2abb3134SXin Li        help="Whether we should remove rows where any value is missing (by
60*2abb3134SXin Li             default, the program aborts with an error)"),
61*2abb3134SXin Li
62*2abb3134SXin Li    # Options that speed it up
63*2abb3134SXin Li    make_option(
64*2abb3134SXin Li        "--reports-sample-size", dest="reports_sample_size", default=-1,
65*2abb3134SXin Li        help="Only analyze a random sample of this size.  This is for
66*2abb3134SXin Li              limiting the execution time at the expense of accuracy."),
67*2abb3134SXin Li    make_option(
68*2abb3134SXin Li        "--num-cores", dest="num_cores", default=1,
69*2abb3134SXin Li        help="Number of cores for mclapply to use.  Speeds up the parts
70*2abb3134SXin Li              of the computation proportional to the number of reports,
71*2abb3134SXin Li              EXCEPT the EM step, which can be sped up by native code."),
72*2abb3134SXin Li    make_option(
73*2abb3134SXin Li        "--max-em-iters", dest="max_em_iters", default=1000,
74*2abb3134SXin Li        help="Maximum number of EM iterations"),
75*2abb3134SXin Li    make_option(
76*2abb3134SXin Li        "--em-executable", dest="em_executable", default="",
77*2abb3134SXin Li        help="Shell out to this executable for an accelerated implementation
78*2abb3134SXin Li             of EM."),
79*2abb3134SXin Li    make_option(
80*2abb3134SXin Li        "--tmp-dir", dest="tmp_dir", default="/tmp",
81*2abb3134SXin Li        help="Use this tmp dir to communicate with the EM executable")
82*2abb3134SXin Li)
83*2abb3134SXin Li
84*2abb3134SXin LiParseOptions <- function() {
85*2abb3134SXin Li  # NOTE: This API is bad; if you add positional_arguments, the return value
86*2abb3134SXin Li  # changes!
87*2abb3134SXin Li  parser <- OptionParser(option_list = option_list)
88*2abb3134SXin Li  opts <- parse_args(parser)
89*2abb3134SXin Li
90*2abb3134SXin Li  if (opts$metric_name == "") {
91*2abb3134SXin Li    UsageError("--metric-name is required.")
92*2abb3134SXin Li  }
93*2abb3134SXin Li  if (opts$reports== "") {
94*2abb3134SXin Li    UsageError("--reports is required.")
95*2abb3134SXin Li  }
96*2abb3134SXin Li  if (opts$schema == "") {
97*2abb3134SXin Li    UsageError("--schema is required.")
98*2abb3134SXin Li  }
99*2abb3134SXin Li  if (opts$params_dir == "") {
100*2abb3134SXin Li    UsageError("--params-dir is required.")
101*2abb3134SXin Li  }
102*2abb3134SXin Li  if (opts$var1 == "") {
103*2abb3134SXin Li    UsageError("--var1 is required.")
104*2abb3134SXin Li  }
105*2abb3134SXin Li  if (opts$var2 == "") {
106*2abb3134SXin Li    UsageError("--var2 is required.")
107*2abb3134SXin Li  }
108*2abb3134SXin Li
109*2abb3134SXin Li  return(opts)
110*2abb3134SXin Li}
111*2abb3134SXin Li
112*2abb3134SXin Liif (!interactive()) {
113*2abb3134SXin Li  opts <- ParseOptions()
114*2abb3134SXin Li}
115*2abb3134SXin Li
116*2abb3134SXin Li#
117*2abb3134SXin Li# Load libraries and source our own code.
118*2abb3134SXin Li#
119*2abb3134SXin Li
120*2abb3134SXin Lilibrary(RJSONIO)  # toJSON()
121*2abb3134SXin Li
122*2abb3134SXin Li# So we don't have to change pwd
123*2abb3134SXin Lisource.rappor <- function(rel_path)  {
124*2abb3134SXin Li  abs_path <- paste0(Sys.getenv("RAPPOR_REPO", ""), rel_path)
125*2abb3134SXin Li  source(abs_path)
126*2abb3134SXin Li}
127*2abb3134SXin Li
128*2abb3134SXin Lisource.rappor("analysis/R/association.R")
129*2abb3134SXin Lisource.rappor("analysis/R/fast_em.R")
130*2abb3134SXin Lisource.rappor("analysis/R/read_input.R")
131*2abb3134SXin Lisource.rappor("analysis/R/util.R")
132*2abb3134SXin Li
133*2abb3134SXin Lioptions(stringsAsFactors = FALSE)
134*2abb3134SXin Lioptions(max.print = 100)  # So our structure() debug calls look better
135*2abb3134SXin Li
136*2abb3134SXin LiCreateAssocStringMap <- function(all_cohorts_map, params) {
137*2abb3134SXin Li  # Processes the maps loaded using ReadMapFile and turns it into something
138*2abb3134SXin Li  # that association.R can use.  Namely, we want a map per cohort.
139*2abb3134SXin Li  #
140*2abb3134SXin Li  # Arguments:
141*2abb3134SXin Li  #   all_cohorts_map: map matrix, as for single variable analysis
142*2abb3134SXin Li  #   params: encoding parameters
143*2abb3134SXin Li
144*2abb3134SXin Li  if (nrow(all_cohorts_map) != (params$m * params$k)) {
145*2abb3134SXin Li    stop(sprintf(
146*2abb3134SXin Li        "Map matrix has invalid dimensions: m * k = %d, nrow(map) = %d",
147*2abb3134SXin Li        params$m * params$k, nrow(all_cohorts_map)))
148*2abb3134SXin Li  }
149*2abb3134SXin Li
150*2abb3134SXin Li  k <- params$k
151*2abb3134SXin Li  map_by_cohort <- lapply(0 : (params$m-1), function(cohort) {
152*2abb3134SXin Li    begin <- cohort * k
153*2abb3134SXin Li    end <- (cohort + 1) * k
154*2abb3134SXin Li    all_cohorts_map[(begin+1) : end, ]
155*2abb3134SXin Li  })
156*2abb3134SXin Li
157*2abb3134SXin Li  list(all_cohorts_map = all_cohorts_map, map_by_cohort = map_by_cohort)
158*2abb3134SXin Li}
159*2abb3134SXin Li
160*2abb3134SXin Li# Hack to create a map for booleans.  We should use closed-form formulas instead.
161*2abb3134SXin LiCreateAssocBoolMap <- function(params) {
162*2abb3134SXin Li  names <- c("FALSE", "TRUE")
163*2abb3134SXin Li
164*2abb3134SXin Li  map_by_cohort <- lapply(1:params$m, function(unused_cohort) {
165*2abb3134SXin Li    # The (1,1) cell is false and the (1,2) cell is true.
166*2abb3134SXin Li    m <- sparseMatrix(c(1), c(2), dims = c(1, 2))
167*2abb3134SXin Li    colnames(m) <- names
168*2abb3134SXin Li    m
169*2abb3134SXin Li  })
170*2abb3134SXin Li
171*2abb3134SXin Li  all_cohorts_map <- sparseMatrix(1:params$m, rep(2, params$m))
172*2abb3134SXin Li  colnames(all_cohorts_map) <- names
173*2abb3134SXin Li
174*2abb3134SXin Li  list(map_by_cohort = map_by_cohort, all_cohorts_map = all_cohorts_map)
175*2abb3134SXin Li}
176*2abb3134SXin Li
177*2abb3134SXin LiResultMatrixToDataFrame <- function(m, string_var_name, bool_var_name) {
178*2abb3134SXin Li  # Args:
179*2abb3134SXin Li  #   m: A 2D matrix as output by ComputeDistributionEM, e.g.
180*2abb3134SXin Li  #          bing.com yahoo.com google.com       Other
181*2abb3134SXin Li  #   TRUE  0.2718526 0.1873424 0.19637704 0.003208933
182*2abb3134SXin Li  #   Other 0.1404581 0.1091826 0.08958427 0.001994163
183*2abb3134SXin Li  # Returns:
184*2abb3134SXin Li  #   A flattened data frame, e.g.
185*2abb3134SXin Li
186*2abb3134SXin Li  # Name the dimensions of the matrix.
187*2abb3134SXin Li  dim_names <- list()
188*2abb3134SXin Li  # TODO: generalize this.  Right now we're assuming the first dimension is
189*2abb3134SXin Li  # boolean.
190*2abb3134SXin Li  dim_names[[bool_var_name]] <- c('TRUE', 'FALSE')
191*2abb3134SXin Li  dim_names[[string_var_name]] <- dimnames(m)[[2]]
192*2abb3134SXin Li
193*2abb3134SXin Li  dimnames(m) <- dim_names
194*2abb3134SXin Li
195*2abb3134SXin Li  # http://stackoverflow.com/questions/15885111/create-data-frame-from-a-matrix-in-r
196*2abb3134SXin Li  fit_df <- as.data.frame(as.table(m))
197*2abb3134SXin Li
198*2abb3134SXin Li  # The as.table conversion gives you a Freq column.  Call it "proportion" to
199*2abb3134SXin Li  # be consistent with single variable analysis.
200*2abb3134SXin Li  colnames(fit_df)[colnames(fit_df) == "Freq"] <- "proportion"
201*2abb3134SXin Li
202*2abb3134SXin Li  fit_df
203*2abb3134SXin Li}
204*2abb3134SXin Li
205*2abb3134SXin Limain <- function(opts) {
206*2abb3134SXin Li  Log("decode-assoc")
207*2abb3134SXin Li  Log("argv:")
208*2abb3134SXin Li  print(commandArgs(TRUE))
209*2abb3134SXin Li
210*2abb3134SXin Li  schema <- read.csv(opts$schema)
211*2abb3134SXin Li  Log("Read %d vars from schema", nrow(schema))
212*2abb3134SXin Li
213*2abb3134SXin Li  schema1 <- schema[schema$metric == opts$metric_name &
214*2abb3134SXin Li                    schema$var == opts$var1, ]
215*2abb3134SXin Li  if (nrow(schema1) == 0) {
216*2abb3134SXin Li    UsageError("Couldn't find metric '%s', field '%s' in schema",
217*2abb3134SXin Li               opts$metric_name, opts$var1)
218*2abb3134SXin Li  }
219*2abb3134SXin Li  schema2 <- schema[schema$metric == opts$metric_name &
220*2abb3134SXin Li                    schema$var== opts$var2, ]
221*2abb3134SXin Li  if (nrow(schema2) == 0) {
222*2abb3134SXin Li    UsageError("Couldn't find metric '%s', field '%s' in schema",
223*2abb3134SXin Li               opts$metric_name, opts$var2)
224*2abb3134SXin Li  }
225*2abb3134SXin Li
226*2abb3134SXin Li  if (schema1$params != schema2$params) {
227*2abb3134SXin Li    UsageError('var1 and var2 should have the same params (%s != %s)',
228*2abb3134SXin Li               schema1$params, schema2$params)
229*2abb3134SXin Li  }
230*2abb3134SXin Li  params_name <- schema1$params
231*2abb3134SXin Li  params_path <- file.path(opts$params_dir, paste0(params_name, '.csv'))
232*2abb3134SXin Li  params <- ReadParameterFile(params_path)
233*2abb3134SXin Li
234*2abb3134SXin Li  var1_type <- schema1$var_type
235*2abb3134SXin Li  var2_type <- schema2$var_type
236*2abb3134SXin Li
237*2abb3134SXin Li  # Right now we're assuming that --var1 is a string and --var2 is a boolean.
238*2abb3134SXin Li  # TODO: Remove these limitations.
239*2abb3134SXin Li  if (var1_type != "string") {
240*2abb3134SXin Li    UsageError("Variable 1 should be a string (%s is of type %s)", opts$var1,
241*2abb3134SXin Li               var1_type)
242*2abb3134SXin Li  }
243*2abb3134SXin Li  if (var2_type != "boolean") {
244*2abb3134SXin Li    UsageError("Variable 2 should be a boolean (%s is of type %s)", opts$var2,
245*2abb3134SXin Li               var2_type)
246*2abb3134SXin Li  }
247*2abb3134SXin Li
248*2abb3134SXin Li  if (opts$map1 == "") {
249*2abb3134SXin Li    UsageError("--map1 must be provided when --var1 is a string (var = %s)",
250*2abb3134SXin Li               opts$var1)
251*2abb3134SXin Li  }
252*2abb3134SXin Li
253*2abb3134SXin Li  # Example cache speedup for 100k map file: 31 seconds to load map and write
254*2abb3134SXin Li  # cache; vs 2.2 seconds to read cache.
255*2abb3134SXin Li  string_params <- params
256*2abb3134SXin Li  map <- LoadMapFile(opts$map1, string_params)
257*2abb3134SXin Li
258*2abb3134SXin Li  # Important: first column is cohort (integer); the rest are variables, which
259*2abb3134SXin Li  # are ASCII bit strings.
260*2abb3134SXin Li  reports <- read.csv(opts$reports, colClasses=c("character"), as.is = TRUE)
261*2abb3134SXin Li
262*2abb3134SXin Li  Log("Read %d reports.  Preview:", nrow(reports))
263*2abb3134SXin Li  print(head(reports))
264*2abb3134SXin Li  cat('\n')
265*2abb3134SXin Li
266*2abb3134SXin Li  # Filter bad reports first
267*2abb3134SXin Li  is_empty1 <- reports[[opts$var1]] == ""
268*2abb3134SXin Li  is_empty2 <- reports[[opts$var2]] == ""
269*2abb3134SXin Li  Log('Found %d blank values in %s', sum(is_empty1), opts$var1)
270*2abb3134SXin Li  Log('Found %d blank values in %s', sum(is_empty2), opts$var2)
271*2abb3134SXin Li
272*2abb3134SXin Li  is_empty <- is_empty1 | is_empty2 # boolean vectors
273*2abb3134SXin Li  Log('%d bad rows', sum(is_empty))
274*2abb3134SXin Li  if (sum(is_empty) > 0) {
275*2abb3134SXin Li    if (opts$remove_bad_rows) {
276*2abb3134SXin Li      reports <- reports[!is_empty, ]
277*2abb3134SXin Li      Log('Removed %d rows, giving %d rows', sum(is_empty), nrow(reports))
278*2abb3134SXin Li    } else {
279*2abb3134SXin Li      stop("Found bad rows and --remove-bad-rows wasn't passed")
280*2abb3134SXin Li    }
281*2abb3134SXin Li  }
282*2abb3134SXin Li
283*2abb3134SXin Li  N <- nrow(reports)
284*2abb3134SXin Li
285*2abb3134SXin Li  if (N == 0) {
286*2abb3134SXin Li    # Use an arbitrary error code when there is nothing to analyze, so we can
287*2abb3134SXin Li    # distinguish this from more serious failures.
288*2abb3134SXin Li    Log("No reports to analyze.  Exiting with code 9.")
289*2abb3134SXin Li    quit(status = 9)
290*2abb3134SXin Li  }
291*2abb3134SXin Li
292*2abb3134SXin Li  # Sample reports if specified.
293*2abb3134SXin Li  if (opts$reports_sample_size != -1) {
294*2abb3134SXin Li    if (N > opts$reports_sample_size) {
295*2abb3134SXin Li      indices <- sample(1:N, opts$reports_sample_size)
296*2abb3134SXin Li      reports <- reports[indices, ]
297*2abb3134SXin Li      Log("Created a sample of %d reports", nrow(reports))
298*2abb3134SXin Li    } else {
299*2abb3134SXin Li      Log("Got less than %d reports, not sampling", opts$reports_sample_size)
300*2abb3134SXin Li    }
301*2abb3134SXin Li  }
302*2abb3134SXin Li
303*2abb3134SXin Li  num_vars <- 2  # hard-coded for now, since there is --var1 and --var2.
304*2abb3134SXin Li
305*2abb3134SXin Li  # Convert strings to integers
306*2abb3134SXin Li  cohorts <- as.integer(reports$cohort)
307*2abb3134SXin Li
308*2abb3134SXin Li  # Hack for Chrome: like AdjustCounts in decode_dist.R.
309*2abb3134SXin Li  cohorts <- cohorts %% params$m
310*2abb3134SXin Li
311*2abb3134SXin Li  # Assume the input has 0-based cohorts, and change to 1-based cohorts.
312*2abb3134SXin Li  cohorts <- cohorts + 1
313*2abb3134SXin Li
314*2abb3134SXin Li  # i.e. create a list of length 2, with identical cohorts.
315*2abb3134SXin Li  # NOTE: Basic RAPPOR doesn't need cohorts.
316*2abb3134SXin Li  cohorts_list <- rep(list(cohorts), num_vars)
317*2abb3134SXin Li
318*2abb3134SXin Li  # TODO: We should use the closed-form formulas rather than calling the
319*2abb3134SXin Li  # solver, and not require this flag.
320*2abb3134SXin Li  if (!opts$create_bool_map) {
321*2abb3134SXin Li    stop("ERROR: pass --create-bool-map to analyze booleans.")
322*2abb3134SXin Li  }
323*2abb3134SXin Li
324*2abb3134SXin Li  bool_params <- params
325*2abb3134SXin Li  # HACK: Make this the boolean.  The Decode() step uses k.  (Note that R makes
326*2abb3134SXin Li  # a copy here)
327*2abb3134SXin Li  bool_params$k <- 1
328*2abb3134SXin Li
329*2abb3134SXin Li  params_list <- list(bool_params, string_params)
330*2abb3134SXin Li
331*2abb3134SXin Li  Log('CreateAssocStringMap')
332*2abb3134SXin Li  string_map <- CreateAssocStringMap(map$map, params)
333*2abb3134SXin Li
334*2abb3134SXin Li  Log('CreateAssocBoolMap')
335*2abb3134SXin Li  bool_map <- CreateAssocBoolMap(params)
336*2abb3134SXin Li
337*2abb3134SXin Li  map_list <- list(bool_map, string_map)
338*2abb3134SXin Li
339*2abb3134SXin Li  string_var <- reports[[opts$var1]]
340*2abb3134SXin Li  bool_var <- reports[[opts$var2]]
341*2abb3134SXin Li
342*2abb3134SXin Li  Log('Preview of string var:')
343*2abb3134SXin Li  print(head(table(string_var)))
344*2abb3134SXin Li  cat('\n')
345*2abb3134SXin Li
346*2abb3134SXin Li  Log('Preview of bool var:')
347*2abb3134SXin Li  print(head(table(bool_var)))
348*2abb3134SXin Li  cat('\n')
349*2abb3134SXin Li
350*2abb3134SXin Li  # Split ASCII strings into array of numerics (as required by association.R)
351*2abb3134SXin Li
352*2abb3134SXin Li  Log('Splitting string reports (%d cores)', opts$num_cores)
353*2abb3134SXin Li  string_reports <- mclapply(string_var, function(x) {
354*2abb3134SXin Li    # function splits strings and converts them to numeric values
355*2abb3134SXin Li    # rev needed for endianness
356*2abb3134SXin Li    rev(as.integer(strsplit(x, split = "")[[1]]))
357*2abb3134SXin Li  }, mc.cores = opts$num_cores)
358*2abb3134SXin Li
359*2abb3134SXin Li  Log('Splitting bool reports (%d cores)', opts$num_cores)
360*2abb3134SXin Li  # Has to be an list of length 1 integer vectors
361*2abb3134SXin Li  bool_reports <- mclapply(bool_var, function(x) {
362*2abb3134SXin Li    as.integer(x)
363*2abb3134SXin Li  }, mc.cores = opts$num_cores)
364*2abb3134SXin Li
365*2abb3134SXin Li  reports_list <- list(bool_reports, string_reports)
366*2abb3134SXin Li
367*2abb3134SXin Li  Log('Association for %d vars', length(reports_list))
368*2abb3134SXin Li
369*2abb3134SXin Li  if (opts$em_executable != "") {
370*2abb3134SXin Li    Log('Will shell out to %s for native EM implementation', opts$em_executable)
371*2abb3134SXin Li    em_iter_func <- ConstructFastEM(opts$em_executable, opts$tmp_dir)
372*2abb3134SXin Li  } else {
373*2abb3134SXin Li    Log('Will use R implementation of EM (slow)')
374*2abb3134SXin Li    em_iter_func <- EM
375*2abb3134SXin Li  }
376*2abb3134SXin Li
377*2abb3134SXin Li  assoc_result <- ComputeDistributionEM(reports_list, cohorts_list, map_list,
378*2abb3134SXin Li                                        ignore_other = FALSE,
379*2abb3134SXin Li                                        params_list = params_list,
380*2abb3134SXin Li                                        marginals = NULL,
381*2abb3134SXin Li                                        estimate_var = FALSE,
382*2abb3134SXin Li                                        num_cores = opts$num_cores,
383*2abb3134SXin Li                                        em_iter_func = em_iter_func,
384*2abb3134SXin Li                                        max_em_iters = opts$max_em_iters)
385*2abb3134SXin Li
386*2abb3134SXin Li  # This happens if the marginal can't be decoded.
387*2abb3134SXin Li  if (is.null(assoc_result)) {
388*2abb3134SXin Li    stop("ComputeDistributionEM failed.")
389*2abb3134SXin Li  }
390*2abb3134SXin Li
391*2abb3134SXin Li  # NOTE: It would be nicer if reports_list, cohorts_list, etc. were indexed by
392*2abb3134SXin Li  # names like 'domain' rather than numbers, and the result assoc_result$fit
393*2abb3134SXin Li  # matrix had corresponding named dimensions.  Instead we call
394*2abb3134SXin Li  # ResultMatrixToDataFrame to do this.
395*2abb3134SXin Li
396*2abb3134SXin Li  fit <- assoc_result$fit
397*2abb3134SXin Li  fit_df <- ResultMatrixToDataFrame(fit, opts$var1, opts$var2)
398*2abb3134SXin Li
399*2abb3134SXin Li  Log("Association results:")
400*2abb3134SXin Li  print(fit_df)
401*2abb3134SXin Li  cat('\n')
402*2abb3134SXin Li
403*2abb3134SXin Li  results_csv_path <- file.path(opts$output_dir, 'assoc-results.csv')
404*2abb3134SXin Li  write.csv(fit_df, file = results_csv_path, row.names = FALSE)
405*2abb3134SXin Li  Log("Wrote %s", results_csv_path)
406*2abb3134SXin Li
407*2abb3134SXin Li  # Measure elapsed time as close to the end as possible
408*2abb3134SXin Li  total_elapsed_time <- proc.time()[['elapsed']]
409*2abb3134SXin Li
410*2abb3134SXin Li  metrics <- list(num_reports = N,
411*2abb3134SXin Li                  reports_sample_size = opts$reports_sample_size,
412*2abb3134SXin Li                  # fit is a matrix
413*2abb3134SXin Li                  estimate_dimensions = dim(fit),
414*2abb3134SXin Li                  # should sum to near 1.0
415*2abb3134SXin Li                  sum_estimates = sum(fit),
416*2abb3134SXin Li                  total_elapsed_time = total_elapsed_time,
417*2abb3134SXin Li                  em_elapsed_time = assoc_result$em_elapsed_time,
418*2abb3134SXin Li                  num_em_iters = assoc_result$num_em_iters)
419*2abb3134SXin Li
420*2abb3134SXin Li  metrics_json_path <- file.path(opts$output_dir, 'assoc-metrics.json')
421*2abb3134SXin Li  writeLines(toJSON(metrics), con = metrics_json_path)
422*2abb3134SXin Li  Log("Wrote %s", metrics_json_path)
423*2abb3134SXin Li
424*2abb3134SXin Li  Log('DONE decode-assoc')
425*2abb3134SXin Li}
426*2abb3134SXin Li
427*2abb3134SXin Liif (!interactive()) {
428*2abb3134SXin Li  main(opts)
429*2abb3134SXin Li}
430