1*2abb3134SXin Li#!/usr/bin/env Rscript 2*2abb3134SXin Li# 3*2abb3134SXin Li# Copyright 2014 Google Inc. All rights reserved. 4*2abb3134SXin Li# 5*2abb3134SXin Li# Licensed under the Apache License, Version 2.0 (the "License"); 6*2abb3134SXin Li# you may not use this file except in compliance with the License. 7*2abb3134SXin Li# You may obtain a copy of the License at 8*2abb3134SXin Li# 9*2abb3134SXin Li# http://www.apache.org/licenses/LICENSE-2.0 10*2abb3134SXin Li# 11*2abb3134SXin Li# Unless required by applicable law or agreed to in writing, software 12*2abb3134SXin Li# distributed under the License is distributed on an "AS IS" BASIS, 13*2abb3134SXin Li# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14*2abb3134SXin Li# See the License for the specific language governing permissions and 15*2abb3134SXin Li# limitations under the License. 16*2abb3134SXin Li 17*2abb3134SXin Li# Simple tool that wraps the analysis/R library. 18*2abb3134SXin Li# 19*2abb3134SXin Li# To run this you need: 20*2abb3134SXin Li# - ggplot 21*2abb3134SXin Li# - optparse 22*2abb3134SXin Li# - glmnet -- dependency of analysis library 23*2abb3134SXin Li 24*2abb3134SXin Lilibrary(optparse) 25*2abb3134SXin Li 26*2abb3134SXin Li# For unit tests 27*2abb3134SXin Liis_main <- (length(sys.frames()) == 0) 28*2abb3134SXin Li 29*2abb3134SXin Li# Do command line parsing first to catch errors. Loading libraries in R is 30*2abb3134SXin Li# slow. 31*2abb3134SXin Liif (is_main) { 32*2abb3134SXin Li option_list <- list( 33*2abb3134SXin Li make_option(c("-t", "--title"), help="Plot Title") 34*2abb3134SXin Li ) 35*2abb3134SXin Li parsed <- parse_args(OptionParser(option_list = option_list), 36*2abb3134SXin Li positional_arguments = 3) # input and output 37*2abb3134SXin Li} 38*2abb3134SXin Li 39*2abb3134SXin Lilibrary(ggplot2) 40*2abb3134SXin Li 41*2abb3134SXin Li# Use CairoPNG if available. Useful for headless R. 42*2abb3134SXin Liif (library(Cairo, quietly = TRUE, logical.return = TRUE)) { 43*2abb3134SXin Li png_func = CairoPNG 44*2abb3134SXin Li cat('Using CairoPNG\n') 45*2abb3134SXin Li} else { 46*2abb3134SXin Li png_func = png 47*2abb3134SXin Li cat('Using png\n') 48*2abb3134SXin Li} 49*2abb3134SXin Li 50*2abb3134SXin Lisource("analysis/R/read_input.R") 51*2abb3134SXin Lisource("analysis/R/decode.R") 52*2abb3134SXin Lisource("analysis/R/util.R") 53*2abb3134SXin Li 54*2abb3134SXin Lisource("analysis/R/alternative.R") # temporary 55*2abb3134SXin Li 56*2abb3134SXin LiLoadContext <- function(prefix_case) { 57*2abb3134SXin Li # Creates the context, filling it with privacy parameters 58*2abb3134SXin Li # Arg: 59*2abb3134SXin Li # prefix_case: path prefix to the test case, e.g. '_tmp/exp' 60*2abb3134SXin Li 61*2abb3134SXin Li p <- paste0(prefix_case, '_params.csv') 62*2abb3134SXin Li 63*2abb3134SXin Li params <- ReadParameterFile(p) 64*2abb3134SXin Li 65*2abb3134SXin Li ctx <- new.env() 66*2abb3134SXin Li 67*2abb3134SXin Li ctx$params <- params # so we can write it out later 68*2abb3134SXin Li 69*2abb3134SXin Li ctx 70*2abb3134SXin Li} 71*2abb3134SXin Li 72*2abb3134SXin LiRunRappor <- function(prefix_case, prefix_instance, ctx) { 73*2abb3134SXin Li # Reads counts, map files, runs RAPPOR analysis engine. 74*2abb3134SXin Li # Args: 75*2abb3134SXin Li # prefix_case: path prefix to the test case, e.g., '_tmp/exp' 76*2abb3134SXin Li # prefix_instance: path prefix to the test instance, e.g., '_tmp/exp/1' 77*2abb3134SXin Li # ctx: context file with params field filled in 78*2abb3134SXin Li 79*2abb3134SXin Li c <- paste0(prefix_instance, '_counts.csv') 80*2abb3134SXin Li counts <- ReadCountsFile(c, ctx$params) 81*2abb3134SXin Li 82*2abb3134SXin Li m <- paste0(prefix_case, '_map.csv') 83*2abb3134SXin Li 84*2abb3134SXin Li # Switch to LoadMapFile if want to cache the result 85*2abb3134SXin Li map <- ReadMapFile(m, ctx$params) 86*2abb3134SXin Li 87*2abb3134SXin Li # Main decode.R API 88*2abb3134SXin Li timing <- system.time({ 89*2abb3134SXin Li res <- Decode(counts, map$map, ctx$params) 90*2abb3134SXin Li }) 91*2abb3134SXin Li 92*2abb3134SXin Li # The line is searched for, and elapsed time is extracted, by make_summary.py. 93*2abb3134SXin Li # Should the formating or wording change, make_summary must be updated too. 94*2abb3134SXin Li Log("Inference took %.3f seconds", timing[["elapsed"]]) 95*2abb3134SXin Li 96*2abb3134SXin Li if (is.null(res)) { 97*2abb3134SXin Li stop("RAPPOR analysis failed.") 98*2abb3134SXin Li } 99*2abb3134SXin Li 100*2abb3134SXin Li Log("Decoded results:") 101*2abb3134SXin Li str(res$fit) 102*2abb3134SXin Li 103*2abb3134SXin Li res$fit 104*2abb3134SXin Li} 105*2abb3134SXin Li 106*2abb3134SXin LiLoadActual <- function(prefix_instance) { 107*2abb3134SXin Li hist_path <- paste0(prefix_instance, '_hist.csv') # case.csv 108*2abb3134SXin Li 109*2abb3134SXin Li # gen_counts.R (fast_counts mode) outputs this, since we never have true 110*2abb3134SXin Li # client values. 111*2abb3134SXin Li if (file.exists(hist_path)) { 112*2abb3134SXin Li return(read.csv(hist_path)) 113*2abb3134SXin Li } 114*2abb3134SXin Li 115*2abb3134SXin Li # Load ground truth into context 116*2abb3134SXin Li input_path <- paste0(prefix_instance, '_true_values.csv') # case.csv 117*2abb3134SXin Li client_values <- read.csv(input_path) 118*2abb3134SXin Li 119*2abb3134SXin Li # Create a histogram, or R "table". Column 2 is the true value. 120*2abb3134SXin Li t <- table(client_values$value) 121*2abb3134SXin Li 122*2abb3134SXin Li d <- as.data.frame(t) # convert it to a data frame with 'string' and 'count' columns 123*2abb3134SXin Li colnames(d) <- c('string', 'count') 124*2abb3134SXin Li 125*2abb3134SXin Li d # return this data frame 126*2abb3134SXin Li} 127*2abb3134SXin Li 128*2abb3134SXin LiCompareRapporVsActual <- function(ctx) { 129*2abb3134SXin Li # Prepare input data to be plotted 130*2abb3134SXin Li 131*2abb3134SXin Li actual <- ctx$actual # from the ground truth file 132*2abb3134SXin Li rappor <- ctx$rappor # from output of AnalyzeRAPPOR 133*2abb3134SXin Li 134*2abb3134SXin Li # "s12" -> 12, for graphing 135*2abb3134SXin Li StringToInt <- function(x) as.integer(substring(x, 2)) 136*2abb3134SXin Li 137*2abb3134SXin Li actual_values <- StringToInt(actual$string) 138*2abb3134SXin Li rappor_values <- StringToInt(rappor$string) 139*2abb3134SXin Li 140*2abb3134SXin Li # False negatives: AnalyzeRAPPOR failed to find this value (e.g. because it 141*2abb3134SXin Li # occurs too rarely) 142*2abb3134SXin Li actual_only <- setdiff(actual_values, rappor_values) 143*2abb3134SXin Li 144*2abb3134SXin Li # False positives: AnalyzeRAPPOR attributed a proportion to a string in the 145*2abb3134SXin Li # map that wasn't in the true input. 146*2abb3134SXin Li rappor_only <- setdiff(rappor_values, actual_values) 147*2abb3134SXin Li 148*2abb3134SXin Li total <- sum(actual$count) 149*2abb3134SXin Li a <- data.frame(index = actual_values, 150*2abb3134SXin Li # Calculate the true proportion 151*2abb3134SXin Li proportion = actual$count / total, 152*2abb3134SXin Li dist = "actual") 153*2abb3134SXin Li 154*2abb3134SXin Li r <- data.frame(index = rappor_values, 155*2abb3134SXin Li proportion = rappor$proportion, 156*2abb3134SXin Li dist = rep("rappor", length(rappor_values))) 157*2abb3134SXin Li 158*2abb3134SXin Li # Extend a and r with the values that they are missing. 159*2abb3134SXin Li if (length(rappor_only) > 0) { 160*2abb3134SXin Li z <- data.frame(index = rappor_only, 161*2abb3134SXin Li proportion = 0.0, 162*2abb3134SXin Li dist = "actual") 163*2abb3134SXin Li a <- rbind(a, z) 164*2abb3134SXin Li } 165*2abb3134SXin Li if (length(actual_only) > 0) { 166*2abb3134SXin Li z <- data.frame(index = actual_only, 167*2abb3134SXin Li proportion = 0.0, 168*2abb3134SXin Li dist = "rappor") 169*2abb3134SXin Li r <- rbind(r, z) 170*2abb3134SXin Li } 171*2abb3134SXin Li 172*2abb3134SXin Li # IMPORTANT: Now a and r have the same rows, but in the wrong order. Sort by index. 173*2abb3134SXin Li a <- a[order(a$index), ] 174*2abb3134SXin Li r <- r[order(r$index), ] 175*2abb3134SXin Li 176*2abb3134SXin Li # L1 distance between actual and rappor distributions 177*2abb3134SXin Li l1 <- sum(abs(a$proportion - r$proportion)) 178*2abb3134SXin Li # The max L1 distance between two distributions is 2; the max total variation 179*2abb3134SXin Li # distance is 1. 180*2abb3134SXin Li total_variation <- l1 / 2 181*2abb3134SXin Li 182*2abb3134SXin Li # Choose false positive strings and their proportion from rappor estimates 183*2abb3134SXin Li false_pos <- r[r$index %in% rappor_only, c('index', 'proportion')] 184*2abb3134SXin Li false_neg <- a[a$index %in% actual_only, c('index', 'proportion')] 185*2abb3134SXin Li 186*2abb3134SXin Li Log("False positives:") 187*2abb3134SXin Li str(false_pos) 188*2abb3134SXin Li 189*2abb3134SXin Li Log("False negatives:") 190*2abb3134SXin Li str(false_neg) 191*2abb3134SXin Li 192*2abb3134SXin Li # NOTE: We should call Decode() directly, and then num_rappor is 193*2abb3134SXin Li # metrics$num_detected, and sum_proportion is metrics$allocated_mass. 194*2abb3134SXin Li metrics <- list( 195*2abb3134SXin Li num_actual = nrow(actual), # data frames 196*2abb3134SXin Li num_rappor = nrow(rappor), 197*2abb3134SXin Li num_false_pos = nrow(false_pos), 198*2abb3134SXin Li num_false_neg = nrow(false_neg), 199*2abb3134SXin Li total_variation = total_variation, 200*2abb3134SXin Li sum_proportion = sum(rappor$proportion) 201*2abb3134SXin Li ) 202*2abb3134SXin Li 203*2abb3134SXin Li Log("Metrics:") 204*2abb3134SXin Li str(metrics) 205*2abb3134SXin Li 206*2abb3134SXin Li # Return plot data and metrics 207*2abb3134SXin Li list(plot_data = rbind(r, a), metrics = metrics) 208*2abb3134SXin Li} 209*2abb3134SXin Li 210*2abb3134SXin Li# Colors selected to be friendly to the color blind: 211*2abb3134SXin Li# http://www.cookbook-r.com/Graphs/Colors_%28ggplot2%29/ 212*2abb3134SXin Lipalette <- c("#E69F00", "#56B4E9") 213*2abb3134SXin Li 214*2abb3134SXin LiPlotAll <- function(d, title) { 215*2abb3134SXin Li # NOTE: geom_bar makes a histogram by default; need stat = "identity" 216*2abb3134SXin Li g <- ggplot(d, aes(x = index, y = proportion, fill = factor(dist))) 217*2abb3134SXin Li b <- geom_bar(stat = "identity", width = 0.7, 218*2abb3134SXin Li position = position_dodge(width = 0.8)) 219*2abb3134SXin Li t <- ggtitle(title) 220*2abb3134SXin Li g + b + t + scale_fill_manual(values=palette) 221*2abb3134SXin Li} 222*2abb3134SXin Li 223*2abb3134SXin LiWritePlot <- function(p, outdir, width = 800, height = 600) { 224*2abb3134SXin Li filename <- file.path(outdir, 'dist.png') 225*2abb3134SXin Li png_func(filename, width=width, height=height) 226*2abb3134SXin Li plot(p) 227*2abb3134SXin Li dev.off() 228*2abb3134SXin Li Log('Wrote %s', filename) 229*2abb3134SXin Li} 230*2abb3134SXin Li 231*2abb3134SXin LiWriteSummary <- function(metrics, outdir) { 232*2abb3134SXin Li filename <- file.path(outdir, 'metrics.csv') 233*2abb3134SXin Li write.csv(metrics, file = filename, row.names = FALSE) 234*2abb3134SXin Li Log('Wrote %s', filename) 235*2abb3134SXin Li} 236*2abb3134SXin Li 237*2abb3134SXin Limain <- function(parsed) { 238*2abb3134SXin Li args <- parsed$args 239*2abb3134SXin Li options <- parsed$options 240*2abb3134SXin Li 241*2abb3134SXin Li input_case_prefix <- args[[1]] 242*2abb3134SXin Li input_instance_prefix <- args[[2]] 243*2abb3134SXin Li output_dir <- args[[3]] 244*2abb3134SXin Li 245*2abb3134SXin Li # increase ggplot font size globally 246*2abb3134SXin Li theme_set(theme_grey(base_size = 16)) 247*2abb3134SXin Li 248*2abb3134SXin Li # NOTE: It takes more than 2000+ ms to get here, while the analysis only 249*2abb3134SXin Li # takes 500 ms or so (as measured by system.time). 250*2abb3134SXin Li 251*2abb3134SXin Li ctx <- LoadContext(input_case_prefix) 252*2abb3134SXin Li ctx$rappor <- RunRappor(input_case_prefix, input_instance_prefix, ctx) 253*2abb3134SXin Li ctx$actual <- LoadActual(input_instance_prefix) 254*2abb3134SXin Li 255*2abb3134SXin Li d <- CompareRapporVsActual(ctx) 256*2abb3134SXin Li p <- PlotAll(d$plot_data, options$title) 257*2abb3134SXin Li 258*2abb3134SXin Li WriteSummary(d$metrics, output_dir) 259*2abb3134SXin Li WritePlot(p, output_dir) 260*2abb3134SXin Li} 261*2abb3134SXin Li 262*2abb3134SXin Liif (is_main) { 263*2abb3134SXin Li main(parsed) 264*2abb3134SXin Li} 265