1#!/usr/bin/env Rscript 2# 3# Command line tool to decode multidimensional reports. It's a simple wrapper 4# around functions in association.R. 5 6library(optparse) 7 8# 9# Command line parsing. Do this first before loading libraries to catch errors 10# quickly. Loading libraries in R is slow. 11# 12 13# Display an error string and quit. 14UsageError <- function(...) { 15 cat(sprintf(...)) 16 cat('\n') 17 quit(status = 1) 18} 19 20option_list <- list( 21 make_option( 22 "--metric-name", dest="metric_name", default="", 23 help="Name of the metric; metrics contain variables (required)"), 24 make_option( 25 "--reports", default="", 26 help="CSV file with reports; each variable is a column (required)"), 27 make_option( 28 "--schema", default="", 29 help="CSV file with variable types and metadata (required)"), 30 make_option( 31 "--params-dir", dest="params_dir", default="", 32 help="Directory where parameter CSV files are stored (required)"), 33 34 make_option( 35 "--var1", default="", 36 help="Name of first variable (required)"), 37 make_option( 38 "--var2", default="", 39 help="Name of second variable (required)"), 40 41 make_option( 42 "--map1", default="", 43 help="Path to map file, if var1 is a string"), 44 make_option( 45 "--map2", default="", 46 help="Path to map file, if var2 is a string"), 47 48 make_option( 49 "--output-dir", dest="output_dir", default=".", 50 help="Output directory (default .)"), 51 52 make_option( 53 "--create-bool-map", dest="create_bool_map", default=FALSE, 54 action="store_true", 55 help="Hack to use string RAPPOR to analyze boolean variables."), 56 make_option( 57 "--remove-bad-rows", dest="remove_bad_rows", default=FALSE, 58 action="store_true", 59 help="Whether we should remove rows where any value is missing (by 60 default, the program aborts with an error)"), 61 62 # Options that speed it up 63 make_option( 64 "--reports-sample-size", dest="reports_sample_size", default=-1, 65 help="Only analyze a random sample of this size. This is for 66 limiting the execution time at the expense of accuracy."), 67 make_option( 68 "--num-cores", dest="num_cores", default=1, 69 help="Number of cores for mclapply to use. Speeds up the parts 70 of the computation proportional to the number of reports, 71 EXCEPT the EM step, which can be sped up by native code."), 72 make_option( 73 "--max-em-iters", dest="max_em_iters", default=1000, 74 help="Maximum number of EM iterations"), 75 make_option( 76 "--em-executable", dest="em_executable", default="", 77 help="Shell out to this executable for an accelerated implementation 78 of EM."), 79 make_option( 80 "--tmp-dir", dest="tmp_dir", default="/tmp", 81 help="Use this tmp dir to communicate with the EM executable") 82) 83 84ParseOptions <- function() { 85 # NOTE: This API is bad; if you add positional_arguments, the return value 86 # changes! 87 parser <- OptionParser(option_list = option_list) 88 opts <- parse_args(parser) 89 90 if (opts$metric_name == "") { 91 UsageError("--metric-name is required.") 92 } 93 if (opts$reports== "") { 94 UsageError("--reports is required.") 95 } 96 if (opts$schema == "") { 97 UsageError("--schema is required.") 98 } 99 if (opts$params_dir == "") { 100 UsageError("--params-dir is required.") 101 } 102 if (opts$var1 == "") { 103 UsageError("--var1 is required.") 104 } 105 if (opts$var2 == "") { 106 UsageError("--var2 is required.") 107 } 108 109 return(opts) 110} 111 112if (!interactive()) { 113 opts <- ParseOptions() 114} 115 116# 117# Load libraries and source our own code. 118# 119 120library(RJSONIO) # toJSON() 121 122# So we don't have to change pwd 123source.rappor <- function(rel_path) { 124 abs_path <- paste0(Sys.getenv("RAPPOR_REPO", ""), rel_path) 125 source(abs_path) 126} 127 128source.rappor("analysis/R/association.R") 129source.rappor("analysis/R/fast_em.R") 130source.rappor("analysis/R/read_input.R") 131source.rappor("analysis/R/util.R") 132 133options(stringsAsFactors = FALSE) 134options(max.print = 100) # So our structure() debug calls look better 135 136CreateAssocStringMap <- function(all_cohorts_map, params) { 137 # Processes the maps loaded using ReadMapFile and turns it into something 138 # that association.R can use. Namely, we want a map per cohort. 139 # 140 # Arguments: 141 # all_cohorts_map: map matrix, as for single variable analysis 142 # params: encoding parameters 143 144 if (nrow(all_cohorts_map) != (params$m * params$k)) { 145 stop(sprintf( 146 "Map matrix has invalid dimensions: m * k = %d, nrow(map) = %d", 147 params$m * params$k, nrow(all_cohorts_map))) 148 } 149 150 k <- params$k 151 map_by_cohort <- lapply(0 : (params$m-1), function(cohort) { 152 begin <- cohort * k 153 end <- (cohort + 1) * k 154 all_cohorts_map[(begin+1) : end, ] 155 }) 156 157 list(all_cohorts_map = all_cohorts_map, map_by_cohort = map_by_cohort) 158} 159 160# Hack to create a map for booleans. We should use closed-form formulas instead. 161CreateAssocBoolMap <- function(params) { 162 names <- c("FALSE", "TRUE") 163 164 map_by_cohort <- lapply(1:params$m, function(unused_cohort) { 165 # The (1,1) cell is false and the (1,2) cell is true. 166 m <- sparseMatrix(c(1), c(2), dims = c(1, 2)) 167 colnames(m) <- names 168 m 169 }) 170 171 all_cohorts_map <- sparseMatrix(1:params$m, rep(2, params$m)) 172 colnames(all_cohorts_map) <- names 173 174 list(map_by_cohort = map_by_cohort, all_cohorts_map = all_cohorts_map) 175} 176 177ResultMatrixToDataFrame <- function(m, string_var_name, bool_var_name) { 178 # Args: 179 # m: A 2D matrix as output by ComputeDistributionEM, e.g. 180 # bing.com yahoo.com google.com Other 181 # TRUE 0.2718526 0.1873424 0.19637704 0.003208933 182 # Other 0.1404581 0.1091826 0.08958427 0.001994163 183 # Returns: 184 # A flattened data frame, e.g. 185 186 # Name the dimensions of the matrix. 187 dim_names <- list() 188 # TODO: generalize this. Right now we're assuming the first dimension is 189 # boolean. 190 dim_names[[bool_var_name]] <- c('TRUE', 'FALSE') 191 dim_names[[string_var_name]] <- dimnames(m)[[2]] 192 193 dimnames(m) <- dim_names 194 195 # http://stackoverflow.com/questions/15885111/create-data-frame-from-a-matrix-in-r 196 fit_df <- as.data.frame(as.table(m)) 197 198 # The as.table conversion gives you a Freq column. Call it "proportion" to 199 # be consistent with single variable analysis. 200 colnames(fit_df)[colnames(fit_df) == "Freq"] <- "proportion" 201 202 fit_df 203} 204 205main <- function(opts) { 206 Log("decode-assoc") 207 Log("argv:") 208 print(commandArgs(TRUE)) 209 210 schema <- read.csv(opts$schema) 211 Log("Read %d vars from schema", nrow(schema)) 212 213 schema1 <- schema[schema$metric == opts$metric_name & 214 schema$var == opts$var1, ] 215 if (nrow(schema1) == 0) { 216 UsageError("Couldn't find metric '%s', field '%s' in schema", 217 opts$metric_name, opts$var1) 218 } 219 schema2 <- schema[schema$metric == opts$metric_name & 220 schema$var== opts$var2, ] 221 if (nrow(schema2) == 0) { 222 UsageError("Couldn't find metric '%s', field '%s' in schema", 223 opts$metric_name, opts$var2) 224 } 225 226 if (schema1$params != schema2$params) { 227 UsageError('var1 and var2 should have the same params (%s != %s)', 228 schema1$params, schema2$params) 229 } 230 params_name <- schema1$params 231 params_path <- file.path(opts$params_dir, paste0(params_name, '.csv')) 232 params <- ReadParameterFile(params_path) 233 234 var1_type <- schema1$var_type 235 var2_type <- schema2$var_type 236 237 # Right now we're assuming that --var1 is a string and --var2 is a boolean. 238 # TODO: Remove these limitations. 239 if (var1_type != "string") { 240 UsageError("Variable 1 should be a string (%s is of type %s)", opts$var1, 241 var1_type) 242 } 243 if (var2_type != "boolean") { 244 UsageError("Variable 2 should be a boolean (%s is of type %s)", opts$var2, 245 var2_type) 246 } 247 248 if (opts$map1 == "") { 249 UsageError("--map1 must be provided when --var1 is a string (var = %s)", 250 opts$var1) 251 } 252 253 # Example cache speedup for 100k map file: 31 seconds to load map and write 254 # cache; vs 2.2 seconds to read cache. 255 string_params <- params 256 map <- LoadMapFile(opts$map1, string_params) 257 258 # Important: first column is cohort (integer); the rest are variables, which 259 # are ASCII bit strings. 260 reports <- read.csv(opts$reports, colClasses=c("character"), as.is = TRUE) 261 262 Log("Read %d reports. Preview:", nrow(reports)) 263 print(head(reports)) 264 cat('\n') 265 266 # Filter bad reports first 267 is_empty1 <- reports[[opts$var1]] == "" 268 is_empty2 <- reports[[opts$var2]] == "" 269 Log('Found %d blank values in %s', sum(is_empty1), opts$var1) 270 Log('Found %d blank values in %s', sum(is_empty2), opts$var2) 271 272 is_empty <- is_empty1 | is_empty2 # boolean vectors 273 Log('%d bad rows', sum(is_empty)) 274 if (sum(is_empty) > 0) { 275 if (opts$remove_bad_rows) { 276 reports <- reports[!is_empty, ] 277 Log('Removed %d rows, giving %d rows', sum(is_empty), nrow(reports)) 278 } else { 279 stop("Found bad rows and --remove-bad-rows wasn't passed") 280 } 281 } 282 283 N <- nrow(reports) 284 285 if (N == 0) { 286 # Use an arbitrary error code when there is nothing to analyze, so we can 287 # distinguish this from more serious failures. 288 Log("No reports to analyze. Exiting with code 9.") 289 quit(status = 9) 290 } 291 292 # Sample reports if specified. 293 if (opts$reports_sample_size != -1) { 294 if (N > opts$reports_sample_size) { 295 indices <- sample(1:N, opts$reports_sample_size) 296 reports <- reports[indices, ] 297 Log("Created a sample of %d reports", nrow(reports)) 298 } else { 299 Log("Got less than %d reports, not sampling", opts$reports_sample_size) 300 } 301 } 302 303 num_vars <- 2 # hard-coded for now, since there is --var1 and --var2. 304 305 # Convert strings to integers 306 cohorts <- as.integer(reports$cohort) 307 308 # Hack for Chrome: like AdjustCounts in decode_dist.R. 309 cohorts <- cohorts %% params$m 310 311 # Assume the input has 0-based cohorts, and change to 1-based cohorts. 312 cohorts <- cohorts + 1 313 314 # i.e. create a list of length 2, with identical cohorts. 315 # NOTE: Basic RAPPOR doesn't need cohorts. 316 cohorts_list <- rep(list(cohorts), num_vars) 317 318 # TODO: We should use the closed-form formulas rather than calling the 319 # solver, and not require this flag. 320 if (!opts$create_bool_map) { 321 stop("ERROR: pass --create-bool-map to analyze booleans.") 322 } 323 324 bool_params <- params 325 # HACK: Make this the boolean. The Decode() step uses k. (Note that R makes 326 # a copy here) 327 bool_params$k <- 1 328 329 params_list <- list(bool_params, string_params) 330 331 Log('CreateAssocStringMap') 332 string_map <- CreateAssocStringMap(map$map, params) 333 334 Log('CreateAssocBoolMap') 335 bool_map <- CreateAssocBoolMap(params) 336 337 map_list <- list(bool_map, string_map) 338 339 string_var <- reports[[opts$var1]] 340 bool_var <- reports[[opts$var2]] 341 342 Log('Preview of string var:') 343 print(head(table(string_var))) 344 cat('\n') 345 346 Log('Preview of bool var:') 347 print(head(table(bool_var))) 348 cat('\n') 349 350 # Split ASCII strings into array of numerics (as required by association.R) 351 352 Log('Splitting string reports (%d cores)', opts$num_cores) 353 string_reports <- mclapply(string_var, function(x) { 354 # function splits strings and converts them to numeric values 355 # rev needed for endianness 356 rev(as.integer(strsplit(x, split = "")[[1]])) 357 }, mc.cores = opts$num_cores) 358 359 Log('Splitting bool reports (%d cores)', opts$num_cores) 360 # Has to be an list of length 1 integer vectors 361 bool_reports <- mclapply(bool_var, function(x) { 362 as.integer(x) 363 }, mc.cores = opts$num_cores) 364 365 reports_list <- list(bool_reports, string_reports) 366 367 Log('Association for %d vars', length(reports_list)) 368 369 if (opts$em_executable != "") { 370 Log('Will shell out to %s for native EM implementation', opts$em_executable) 371 em_iter_func <- ConstructFastEM(opts$em_executable, opts$tmp_dir) 372 } else { 373 Log('Will use R implementation of EM (slow)') 374 em_iter_func <- EM 375 } 376 377 assoc_result <- ComputeDistributionEM(reports_list, cohorts_list, map_list, 378 ignore_other = FALSE, 379 params_list = params_list, 380 marginals = NULL, 381 estimate_var = FALSE, 382 num_cores = opts$num_cores, 383 em_iter_func = em_iter_func, 384 max_em_iters = opts$max_em_iters) 385 386 # This happens if the marginal can't be decoded. 387 if (is.null(assoc_result)) { 388 stop("ComputeDistributionEM failed.") 389 } 390 391 # NOTE: It would be nicer if reports_list, cohorts_list, etc. were indexed by 392 # names like 'domain' rather than numbers, and the result assoc_result$fit 393 # matrix had corresponding named dimensions. Instead we call 394 # ResultMatrixToDataFrame to do this. 395 396 fit <- assoc_result$fit 397 fit_df <- ResultMatrixToDataFrame(fit, opts$var1, opts$var2) 398 399 Log("Association results:") 400 print(fit_df) 401 cat('\n') 402 403 results_csv_path <- file.path(opts$output_dir, 'assoc-results.csv') 404 write.csv(fit_df, file = results_csv_path, row.names = FALSE) 405 Log("Wrote %s", results_csv_path) 406 407 # Measure elapsed time as close to the end as possible 408 total_elapsed_time <- proc.time()[['elapsed']] 409 410 metrics <- list(num_reports = N, 411 reports_sample_size = opts$reports_sample_size, 412 # fit is a matrix 413 estimate_dimensions = dim(fit), 414 # should sum to near 1.0 415 sum_estimates = sum(fit), 416 total_elapsed_time = total_elapsed_time, 417 em_elapsed_time = assoc_result$em_elapsed_time, 418 num_em_iters = assoc_result$num_em_iters) 419 420 metrics_json_path <- file.path(opts$output_dir, 'assoc-metrics.json') 421 writeLines(toJSON(metrics), con = metrics_json_path) 422 Log("Wrote %s", metrics_json_path) 423 424 Log('DONE decode-assoc') 425} 426 427if (!interactive()) { 428 main(opts) 429} 430