xref: /aosp_15_r20/external/rappor/tests/compare_dist.R (revision 2abb31345f6c95944768b5222a9a5ed3fc68cc00)
1*2abb3134SXin Li#!/usr/bin/env Rscript
2*2abb3134SXin Li#
3*2abb3134SXin Li# Copyright 2014 Google Inc. All rights reserved.
4*2abb3134SXin Li#
5*2abb3134SXin Li# Licensed under the Apache License, Version 2.0 (the "License");
6*2abb3134SXin Li# you may not use this file except in compliance with the License.
7*2abb3134SXin Li# You may obtain a copy of the License at
8*2abb3134SXin Li#
9*2abb3134SXin Li#     http://www.apache.org/licenses/LICENSE-2.0
10*2abb3134SXin Li#
11*2abb3134SXin Li# Unless required by applicable law or agreed to in writing, software
12*2abb3134SXin Li# distributed under the License is distributed on an "AS IS" BASIS,
13*2abb3134SXin Li# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14*2abb3134SXin Li# See the License for the specific language governing permissions and
15*2abb3134SXin Li# limitations under the License.
16*2abb3134SXin Li
17*2abb3134SXin Li# Simple tool that wraps the analysis/R library.
18*2abb3134SXin Li#
19*2abb3134SXin Li# To run this you need:
20*2abb3134SXin Li# - ggplot
21*2abb3134SXin Li# - optparse
22*2abb3134SXin Li# - glmnet -- dependency of analysis library
23*2abb3134SXin Li
24*2abb3134SXin Lilibrary(optparse)
25*2abb3134SXin Li
26*2abb3134SXin Li# For unit tests
27*2abb3134SXin Liis_main <- (length(sys.frames()) == 0)
28*2abb3134SXin Li
29*2abb3134SXin Li# Do command line parsing first to catch errors.  Loading libraries in R is
30*2abb3134SXin Li# slow.
31*2abb3134SXin Liif (is_main) {
32*2abb3134SXin Li  option_list <- list(
33*2abb3134SXin Li     make_option(c("-t", "--title"), help="Plot Title")
34*2abb3134SXin Li     )
35*2abb3134SXin Li  parsed <- parse_args(OptionParser(option_list = option_list),
36*2abb3134SXin Li                       positional_arguments = 3)  # input and output
37*2abb3134SXin Li}
38*2abb3134SXin Li
39*2abb3134SXin Lilibrary(ggplot2)
40*2abb3134SXin Li
41*2abb3134SXin Li# Use CairoPNG if available.  Useful for headless R.
42*2abb3134SXin Liif (library(Cairo, quietly = TRUE, logical.return = TRUE)) {
43*2abb3134SXin Li  png_func = CairoPNG
44*2abb3134SXin Li  cat('Using CairoPNG\n')
45*2abb3134SXin Li} else {
46*2abb3134SXin Li  png_func = png
47*2abb3134SXin Li  cat('Using png\n')
48*2abb3134SXin Li}
49*2abb3134SXin Li
50*2abb3134SXin Lisource("analysis/R/read_input.R")
51*2abb3134SXin Lisource("analysis/R/decode.R")
52*2abb3134SXin Lisource("analysis/R/util.R")
53*2abb3134SXin Li
54*2abb3134SXin Lisource("analysis/R/alternative.R")  # temporary
55*2abb3134SXin Li
56*2abb3134SXin LiLoadContext <- function(prefix_case) {
57*2abb3134SXin Li  # Creates the context, filling it with privacy parameters
58*2abb3134SXin Li  # Arg:
59*2abb3134SXin Li  #    prefix_case: path prefix to the test case, e.g. '_tmp/exp'
60*2abb3134SXin Li
61*2abb3134SXin Li  p <- paste0(prefix_case, '_params.csv')
62*2abb3134SXin Li
63*2abb3134SXin Li  params <- ReadParameterFile(p)
64*2abb3134SXin Li
65*2abb3134SXin Li  ctx <- new.env()
66*2abb3134SXin Li
67*2abb3134SXin Li  ctx$params <- params  # so we can write it out later
68*2abb3134SXin Li
69*2abb3134SXin Li  ctx
70*2abb3134SXin Li}
71*2abb3134SXin Li
72*2abb3134SXin LiRunRappor <- function(prefix_case, prefix_instance, ctx) {
73*2abb3134SXin Li  # Reads counts, map files, runs RAPPOR analysis engine.
74*2abb3134SXin Li  # Args:
75*2abb3134SXin Li  #    prefix_case: path prefix to the test case, e.g., '_tmp/exp'
76*2abb3134SXin Li  #    prefix_instance: path prefix to the test instance, e.g., '_tmp/exp/1'
77*2abb3134SXin Li  #    ctx: context file with params field filled in
78*2abb3134SXin Li
79*2abb3134SXin Li  c <- paste0(prefix_instance, '_counts.csv')
80*2abb3134SXin Li  counts <- ReadCountsFile(c, ctx$params)
81*2abb3134SXin Li
82*2abb3134SXin Li  m <- paste0(prefix_case, '_map.csv')
83*2abb3134SXin Li
84*2abb3134SXin Li  # Switch to LoadMapFile if want to cache the result
85*2abb3134SXin Li  map <- ReadMapFile(m, ctx$params)
86*2abb3134SXin Li
87*2abb3134SXin Li  # Main decode.R API
88*2abb3134SXin Li  timing <- system.time({
89*2abb3134SXin Li    res <- Decode(counts, map$map, ctx$params)
90*2abb3134SXin Li  })
91*2abb3134SXin Li
92*2abb3134SXin Li  # The line is searched for, and elapsed time is extracted, by make_summary.py.
93*2abb3134SXin Li  # Should the formating or wording change, make_summary must be updated too.
94*2abb3134SXin Li  Log("Inference took %.3f seconds", timing[["elapsed"]])
95*2abb3134SXin Li
96*2abb3134SXin Li  if (is.null(res)) {
97*2abb3134SXin Li    stop("RAPPOR analysis failed.")
98*2abb3134SXin Li  }
99*2abb3134SXin Li
100*2abb3134SXin Li  Log("Decoded results:")
101*2abb3134SXin Li  str(res$fit)
102*2abb3134SXin Li
103*2abb3134SXin Li  res$fit
104*2abb3134SXin Li}
105*2abb3134SXin Li
106*2abb3134SXin LiLoadActual <- function(prefix_instance) {
107*2abb3134SXin Li  hist_path <- paste0(prefix_instance, '_hist.csv')  # case.csv
108*2abb3134SXin Li
109*2abb3134SXin Li  # gen_counts.R (fast_counts mode) outputs this, since we never have true
110*2abb3134SXin Li  # client values.
111*2abb3134SXin Li  if (file.exists(hist_path)) {
112*2abb3134SXin Li    return(read.csv(hist_path))
113*2abb3134SXin Li  }
114*2abb3134SXin Li
115*2abb3134SXin Li  # Load ground truth into context
116*2abb3134SXin Li  input_path <- paste0(prefix_instance, '_true_values.csv')  # case.csv
117*2abb3134SXin Li  client_values <- read.csv(input_path)
118*2abb3134SXin Li
119*2abb3134SXin Li  # Create a histogram, or R "table".  Column 2 is the true value.
120*2abb3134SXin Li  t <- table(client_values$value)
121*2abb3134SXin Li
122*2abb3134SXin Li  d <- as.data.frame(t)  # convert it to a data frame with 'string' and 'count' columns
123*2abb3134SXin Li  colnames(d) <- c('string', 'count')
124*2abb3134SXin Li
125*2abb3134SXin Li  d  # return this data frame
126*2abb3134SXin Li}
127*2abb3134SXin Li
128*2abb3134SXin LiCompareRapporVsActual <- function(ctx) {
129*2abb3134SXin Li  # Prepare input data to be plotted
130*2abb3134SXin Li
131*2abb3134SXin Li  actual <- ctx$actual  # from the ground truth file
132*2abb3134SXin Li  rappor <- ctx$rappor  # from output of AnalyzeRAPPOR
133*2abb3134SXin Li
134*2abb3134SXin Li  # "s12" -> 12, for graphing
135*2abb3134SXin Li  StringToInt <- function(x) as.integer(substring(x, 2))
136*2abb3134SXin Li
137*2abb3134SXin Li  actual_values <- StringToInt(actual$string)
138*2abb3134SXin Li  rappor_values <- StringToInt(rappor$string)
139*2abb3134SXin Li
140*2abb3134SXin Li  # False negatives: AnalyzeRAPPOR failed to find this value (e.g. because it
141*2abb3134SXin Li  # occurs too rarely)
142*2abb3134SXin Li  actual_only <- setdiff(actual_values, rappor_values)
143*2abb3134SXin Li
144*2abb3134SXin Li  # False positives: AnalyzeRAPPOR attributed a proportion to a string in the
145*2abb3134SXin Li  # map that wasn't in the true input.
146*2abb3134SXin Li  rappor_only <- setdiff(rappor_values, actual_values)
147*2abb3134SXin Li
148*2abb3134SXin Li  total <- sum(actual$count)
149*2abb3134SXin Li  a <- data.frame(index = actual_values,
150*2abb3134SXin Li                  # Calculate the true proportion
151*2abb3134SXin Li                  proportion = actual$count / total,
152*2abb3134SXin Li                  dist = "actual")
153*2abb3134SXin Li
154*2abb3134SXin Li  r <- data.frame(index = rappor_values,
155*2abb3134SXin Li                  proportion = rappor$proportion,
156*2abb3134SXin Li                  dist = rep("rappor", length(rappor_values)))
157*2abb3134SXin Li
158*2abb3134SXin Li  # Extend a and r with the values that they are missing.
159*2abb3134SXin Li  if (length(rappor_only) > 0) {
160*2abb3134SXin Li    z <- data.frame(index = rappor_only,
161*2abb3134SXin Li                    proportion = 0.0,
162*2abb3134SXin Li                    dist = "actual")
163*2abb3134SXin Li    a <- rbind(a, z)
164*2abb3134SXin Li  }
165*2abb3134SXin Li  if (length(actual_only) > 0) {
166*2abb3134SXin Li    z <- data.frame(index = actual_only,
167*2abb3134SXin Li                    proportion = 0.0,
168*2abb3134SXin Li                    dist = "rappor")
169*2abb3134SXin Li    r <- rbind(r, z)
170*2abb3134SXin Li  }
171*2abb3134SXin Li
172*2abb3134SXin Li  # IMPORTANT: Now a and r have the same rows, but in the wrong order.  Sort by index.
173*2abb3134SXin Li  a <- a[order(a$index), ]
174*2abb3134SXin Li  r <- r[order(r$index), ]
175*2abb3134SXin Li
176*2abb3134SXin Li  # L1 distance between actual and rappor distributions
177*2abb3134SXin Li  l1 <- sum(abs(a$proportion - r$proportion))
178*2abb3134SXin Li  # The max L1 distance between two distributions is 2; the max total variation
179*2abb3134SXin Li  # distance is 1.
180*2abb3134SXin Li  total_variation <- l1 / 2
181*2abb3134SXin Li
182*2abb3134SXin Li  # Choose false positive strings and their proportion from rappor estimates
183*2abb3134SXin Li  false_pos <- r[r$index %in% rappor_only, c('index', 'proportion')]
184*2abb3134SXin Li  false_neg <- a[a$index %in% actual_only, c('index', 'proportion')]
185*2abb3134SXin Li
186*2abb3134SXin Li  Log("False positives:")
187*2abb3134SXin Li  str(false_pos)
188*2abb3134SXin Li
189*2abb3134SXin Li  Log("False negatives:")
190*2abb3134SXin Li  str(false_neg)
191*2abb3134SXin Li
192*2abb3134SXin Li  # NOTE: We should call Decode() directly, and then num_rappor is
193*2abb3134SXin Li  # metrics$num_detected, and sum_proportion is metrics$allocated_mass.
194*2abb3134SXin Li  metrics <- list(
195*2abb3134SXin Li      num_actual = nrow(actual),  # data frames
196*2abb3134SXin Li      num_rappor = nrow(rappor),
197*2abb3134SXin Li      num_false_pos = nrow(false_pos),
198*2abb3134SXin Li      num_false_neg = nrow(false_neg),
199*2abb3134SXin Li      total_variation = total_variation,
200*2abb3134SXin Li      sum_proportion = sum(rappor$proportion)
201*2abb3134SXin Li      )
202*2abb3134SXin Li
203*2abb3134SXin Li  Log("Metrics:")
204*2abb3134SXin Li  str(metrics)
205*2abb3134SXin Li
206*2abb3134SXin Li  # Return plot data and metrics
207*2abb3134SXin Li  list(plot_data = rbind(r, a), metrics = metrics)
208*2abb3134SXin Li}
209*2abb3134SXin Li
210*2abb3134SXin Li# Colors selected to be friendly to the color blind:
211*2abb3134SXin Li# http://www.cookbook-r.com/Graphs/Colors_%28ggplot2%29/
212*2abb3134SXin Lipalette <- c("#E69F00", "#56B4E9")
213*2abb3134SXin Li
214*2abb3134SXin LiPlotAll <- function(d, title) {
215*2abb3134SXin Li  # NOTE: geom_bar makes a histogram by default; need stat = "identity"
216*2abb3134SXin Li  g <- ggplot(d, aes(x = index, y = proportion, fill = factor(dist)))
217*2abb3134SXin Li  b <- geom_bar(stat = "identity", width = 0.7,
218*2abb3134SXin Li                position = position_dodge(width = 0.8))
219*2abb3134SXin Li  t <- ggtitle(title)
220*2abb3134SXin Li  g + b + t + scale_fill_manual(values=palette)
221*2abb3134SXin Li}
222*2abb3134SXin Li
223*2abb3134SXin LiWritePlot <- function(p, outdir, width = 800, height = 600) {
224*2abb3134SXin Li  filename <- file.path(outdir, 'dist.png')
225*2abb3134SXin Li  png_func(filename, width=width, height=height)
226*2abb3134SXin Li  plot(p)
227*2abb3134SXin Li  dev.off()
228*2abb3134SXin Li  Log('Wrote %s', filename)
229*2abb3134SXin Li}
230*2abb3134SXin Li
231*2abb3134SXin LiWriteSummary <- function(metrics, outdir) {
232*2abb3134SXin Li  filename <- file.path(outdir, 'metrics.csv')
233*2abb3134SXin Li  write.csv(metrics, file = filename, row.names = FALSE)
234*2abb3134SXin Li  Log('Wrote %s', filename)
235*2abb3134SXin Li}
236*2abb3134SXin Li
237*2abb3134SXin Limain <- function(parsed) {
238*2abb3134SXin Li  args <- parsed$args
239*2abb3134SXin Li  options <- parsed$options
240*2abb3134SXin Li
241*2abb3134SXin Li  input_case_prefix <- args[[1]]
242*2abb3134SXin Li  input_instance_prefix <- args[[2]]
243*2abb3134SXin Li  output_dir <- args[[3]]
244*2abb3134SXin Li
245*2abb3134SXin Li  # increase ggplot font size globally
246*2abb3134SXin Li  theme_set(theme_grey(base_size = 16))
247*2abb3134SXin Li
248*2abb3134SXin Li  # NOTE: It takes more than 2000+ ms to get here, while the analysis only
249*2abb3134SXin Li  # takes 500 ms or so (as measured by system.time).
250*2abb3134SXin Li
251*2abb3134SXin Li  ctx <- LoadContext(input_case_prefix)
252*2abb3134SXin Li  ctx$rappor <- RunRappor(input_case_prefix, input_instance_prefix, ctx)
253*2abb3134SXin Li  ctx$actual <- LoadActual(input_instance_prefix)
254*2abb3134SXin Li
255*2abb3134SXin Li  d <- CompareRapporVsActual(ctx)
256*2abb3134SXin Li  p <- PlotAll(d$plot_data, options$title)
257*2abb3134SXin Li
258*2abb3134SXin Li  WriteSummary(d$metrics, output_dir)
259*2abb3134SXin Li  WritePlot(p, output_dir)
260*2abb3134SXin Li}
261*2abb3134SXin Li
262*2abb3134SXin Liif (is_main) {
263*2abb3134SXin Li  main(parsed)
264*2abb3134SXin Li}
265