xref: /aosp_15_r20/external/rappor/tests/gen_counts.R (revision 2abb31345f6c95944768b5222a9a5ed3fc68cc00)
1*2abb3134SXin Li#!/usr/bin/env Rscript
2*2abb3134SXin Li#
3*2abb3134SXin Li# Copyright 2015 Google Inc. All rights reserved.
4*2abb3134SXin Li#
5*2abb3134SXin Li# Licensed under the Apache License, Version 2.0 (the "License");
6*2abb3134SXin Li# you may not use this file except in compliance with the License.
7*2abb3134SXin Li# You may obtain a copy of the License at
8*2abb3134SXin Li#
9*2abb3134SXin Li#     http://www.apache.org/licenses/LICENSE-2.0
10*2abb3134SXin Li#
11*2abb3134SXin Li# Unless required by applicable law or agreed to in writing, software
12*2abb3134SXin Li# distributed under the License is distributed on an "AS IS" BASIS,
13*2abb3134SXin Li# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14*2abb3134SXin Li# See the License for the specific language governing permissions and
15*2abb3134SXin Li# limitations under the License.
16*2abb3134SXin Li
17*2abb3134SXin Lisource('analysis/R/read_input.R')
18*2abb3134SXin Li
19*2abb3134SXin LiRandomPartition <- function(total, weights) {
20*2abb3134SXin Li  # Outputs a random partition according to a specified distribution
21*2abb3134SXin Li  # Args:
22*2abb3134SXin Li  #   total - number of samples
23*2abb3134SXin Li  #   weights - weights that are proportional to the probability density
24*2abb3134SXin Li  #              function of the target distribution
25*2abb3134SXin Li  # Returns:
26*2abb3134SXin Li  #   a histogram sampled according to the pdf
27*2abb3134SXin Li  # Example:
28*2abb3134SXin Li  #   > RandomPartition(100, c(3, 2, 1, 0, 1))
29*2abb3134SXin Li  #   [1] 47 24 15  0 14
30*2abb3134SXin Li  if (any(weights < 0))
31*2abb3134SXin Li    stop("Probabilities cannot be negative")
32*2abb3134SXin Li
33*2abb3134SXin Li  if (sum(weights) == 0)
34*2abb3134SXin Li    stop("Probabilities cannot sum up to 0")
35*2abb3134SXin Li
36*2abb3134SXin Li  bins <- length(weights)
37*2abb3134SXin Li  result <- rep(0, bins)
38*2abb3134SXin Li
39*2abb3134SXin Li  # idiomatic way:
40*2abb3134SXin Li  #   rnd_list <- sample(strs, total, replace = TRUE, weights)
41*2abb3134SXin Li  #   apply(as.array(strs), 1, function(x) length(rnd_list[rnd_list == x]))
42*2abb3134SXin Li  #
43*2abb3134SXin Li  # The following is much faster for larger totals. We can replace a loop with
44*2abb3134SXin Li  # (tail) recusion, but R chokes with the recursion depth > 850.
45*2abb3134SXin Li
46*2abb3134SXin Li  w <- sum(weights)
47*2abb3134SXin Li
48*2abb3134SXin Li  for (i in 1:bins)
49*2abb3134SXin Li    if (total > 0) {  # if total == 0, nothing else to do
50*2abb3134SXin Li      # invariant: w = sum(weights[i:bins])
51*2abb3134SXin Li      # rather than computing sum every time leading to quadratic time, keep
52*2abb3134SXin Li      # updating it
53*2abb3134SXin Li
54*2abb3134SXin Li      # The probability p is clamped to [0, 1] to avoid under/overflow errors.
55*2abb3134SXin Li      p <- min(max(weights[i] / w, 0), 1)
56*2abb3134SXin Li      # draw the number of balls falling into the current bin
57*2abb3134SXin Li      rnd_draw <- rbinom(n = 1, size = total, prob = p)
58*2abb3134SXin Li      result[i] <- rnd_draw  # push rnd_draw balls from total to result[i]
59*2abb3134SXin Li      total <- total - rnd_draw
60*2abb3134SXin Li      w <- w - weights[i]
61*2abb3134SXin Li  }
62*2abb3134SXin Li
63*2abb3134SXin Li  names(result) <- names(weights)
64*2abb3134SXin Li
65*2abb3134SXin Li  return(result)
66*2abb3134SXin Li}
67*2abb3134SXin Li
68*2abb3134SXin LiGenerateCounts <- function(params, true_map, partition, reports_per_client) {
69*2abb3134SXin Li  # Fast simulation of the marginal table for RAPPOR reports
70*2abb3134SXin Li  # Args:
71*2abb3134SXin Li  #   params - parameters of the RAPPOR reporting process
72*2abb3134SXin Li  #   true_map - hashed true inputs
73*2abb3134SXin Li  #   partition - allocation of clients between true values
74*2abb3134SXin Li  #   reports_per_client - number of reports (IRRs) per client
75*2abb3134SXin Li  if (nrow(true_map) != (params$m * params$k)) {
76*2abb3134SXin Li    stop(cat("Map does not match the params file!",
77*2abb3134SXin Li                 "mk =", params$m * params$k,
78*2abb3134SXin Li                 "nrow(map):", nrow(true_map),
79*2abb3134SXin Li                 sep = " "))
80*2abb3134SXin Li  }
81*2abb3134SXin Li
82*2abb3134SXin Li  # For each reporting type computes its allocation to cohorts.
83*2abb3134SXin Li  # Output is an m x strs matrix.
84*2abb3134SXin Li  cohorts <- as.matrix(
85*2abb3134SXin Li                apply(as.data.frame(partition), 1,
86*2abb3134SXin Li                      function(count) RandomPartition(count, rep(1, params$m))))
87*2abb3134SXin Li
88*2abb3134SXin Li  # Expands to (m x k) x strs matrix, where each element (corresponding to the
89*2abb3134SXin Li  # bit in the aggregate Bloom filter) is repeated k times.
90*2abb3134SXin Li  expanded <- apply(cohorts, 2, function(vec) rep(vec, each = params$k))
91*2abb3134SXin Li
92*2abb3134SXin Li  # For each bit, the number of clients reporting this bit:
93*2abb3134SXin Li  clients_per_bit <- rep(apply(cohorts, 1, sum), each = params$k)
94*2abb3134SXin Li
95*2abb3134SXin Li  # Computes the true number of bits set to one BEFORE PRR.
96*2abb3134SXin Li  true_ones <- apply(expanded * true_map, 1, sum)
97*2abb3134SXin Li
98*2abb3134SXin Li  ones_in_prr <-
99*2abb3134SXin Li    unlist(lapply(true_ones,
100*2abb3134SXin Li                  function(x) rbinom(n = 1, size = x, prob = 1 - params$f / 2))) +
101*2abb3134SXin Li    unlist(lapply(clients_per_bit - true_ones,  # clients where the bit is 0
102*2abb3134SXin Li                  function(x) rbinom(n = 1, size = x, prob =  params$f / 2)))
103*2abb3134SXin Li
104*2abb3134SXin Li  # Number of IRRs where each bit is reported (either as 0 or as 1)
105*2abb3134SXin Li  reports_per_bit <- clients_per_bit * reports_per_client
106*2abb3134SXin Li
107*2abb3134SXin Li  ones_before_irr <- ones_in_prr * reports_per_client
108*2abb3134SXin Li
109*2abb3134SXin Li  ones_after_irr <-
110*2abb3134SXin Li    unlist(lapply(ones_before_irr,
111*2abb3134SXin Li                  function(x) rbinom(n = 1, size = x, prob = params$q))) +
112*2abb3134SXin Li    unlist(lapply(reports_per_bit - ones_before_irr,
113*2abb3134SXin Li                  function(x) rbinom(n = 1, size = x, prob = params$p)))
114*2abb3134SXin Li
115*2abb3134SXin Li  counts <- cbind(apply(cohorts, 1, sum) * reports_per_client,
116*2abb3134SXin Li        matrix(ones_after_irr, nrow = params$m, ncol = params$k, byrow = TRUE))
117*2abb3134SXin Li
118*2abb3134SXin Li  if(any(is.na(counts)))
119*2abb3134SXin Li    stop("Failed to generate bit counts. Likely due to integer overflow.")
120*2abb3134SXin Li
121*2abb3134SXin Li  counts
122*2abb3134SXin Li}
123*2abb3134SXin Li
124*2abb3134SXin LiComputePdf <- function(distr, range) {
125*2abb3134SXin Li  # Outputs discrete probability density function for a given distribution
126*2abb3134SXin Li
127*2abb3134SXin Li  # These are the five distributions in gen_sim_input.py
128*2abb3134SXin Li  if (distr == 'exp') {
129*2abb3134SXin Li    pdf <- dexp(1:range, rate = 5 / range)
130*2abb3134SXin Li  } else if (distr == 'gauss') {
131*2abb3134SXin Li    half <- range / 2
132*2abb3134SXin Li    left <- -half + 1
133*2abb3134SXin Li    pdf <- dnorm(left : half, sd = range / 6)
134*2abb3134SXin Li  } else if (distr == 'unif') {
135*2abb3134SXin Li    # e.g. for N = 4, weights are [0.25, 0.25, 0.25, 0.25]
136*2abb3134SXin Li    pdf <- dunif(1:range, max = range)
137*2abb3134SXin Li  } else if (distr == 'zipf1') {
138*2abb3134SXin Li    # Since the distrubition defined over a finite set, we allow the parameter
139*2abb3134SXin Li    # of the Zipf distribution to be 1.
140*2abb3134SXin Li    pdf <- sapply(1:range, function(x) 1 / x)
141*2abb3134SXin Li  } else if (distr == 'zipf1.5') {
142*2abb3134SXin Li    pdf <- sapply(1:range, function(x) 1 / x^1.5)
143*2abb3134SXin Li  }
144*2abb3134SXin Li  else {
145*2abb3134SXin Li    stop(sprintf("Invalid distribution '%s'", distr))
146*2abb3134SXin Li  }
147*2abb3134SXin Li
148*2abb3134SXin Li  pdf <- pdf / sum(pdf)  # normalize
149*2abb3134SXin Li
150*2abb3134SXin Li  pdf
151*2abb3134SXin Li}
152*2abb3134SXin Li
153*2abb3134SXin Li# Usage:
154*2abb3134SXin Li#
155*2abb3134SXin Li# $ ./gen_counts.R exp 10000 1 foo_params.csv foo_true_map.csv foo
156*2abb3134SXin Li#
157*2abb3134SXin Li# Inputs:
158*2abb3134SXin Li#   distribution name
159*2abb3134SXin Li#   number of clients
160*2abb3134SXin Li#   reports per client
161*2abb3134SXin Li#   parameters file
162*2abb3134SXin Li#   map file
163*2abb3134SXin Li#   prefix for output files
164*2abb3134SXin Li# Outputs:
165*2abb3134SXin Li#   foo_counts.csv
166*2abb3134SXin Li#   foo_hist.csv
167*2abb3134SXin Li#
168*2abb3134SXin Li# Warning: the number of reports in any cohort must be less than
169*2abb3134SXin Li#          .Machine$integer.max
170*2abb3134SXin Li
171*2abb3134SXin Limain <- function(argv) {
172*2abb3134SXin Li  distr <- argv[[1]]
173*2abb3134SXin Li  num_clients <- as.integer(argv[[2]])
174*2abb3134SXin Li  reports_per_client <- as.integer(argv[[3]])
175*2abb3134SXin Li  params_file <- argv[[4]]
176*2abb3134SXin Li  true_map_file <- argv[[5]]
177*2abb3134SXin Li  out_prefix <- argv[[6]]
178*2abb3134SXin Li
179*2abb3134SXin Li  params <- ReadParameterFile(params_file)
180*2abb3134SXin Li
181*2abb3134SXin Li  true_map <- ReadMapFile(true_map_file, params)
182*2abb3134SXin Li
183*2abb3134SXin Li  num_unique_values <- length(true_map$strs)
184*2abb3134SXin Li
185*2abb3134SXin Li  pdf <- ComputePdf(distr, num_unique_values)
186*2abb3134SXin Li
187*2abb3134SXin Li  # Computes the number of clients reporting each string
188*2abb3134SXin Li  # according to the pre-specified distribution.
189*2abb3134SXin Li  partition <- RandomPartition(num_clients, pdf)
190*2abb3134SXin Li
191*2abb3134SXin Li  # Histogram
192*2abb3134SXin Li  true_hist <- data.frame(string = true_map$strs, count = partition)
193*2abb3134SXin Li
194*2abb3134SXin Li  counts <- GenerateCounts(params, true_map$map, partition, reports_per_client)
195*2abb3134SXin Li
196*2abb3134SXin Li  # Now create a CSV file
197*2abb3134SXin Li
198*2abb3134SXin Li  # Opposite of ReadCountsFile in read_input.R
199*2abb3134SXin Li  # http://stackoverflow.com/questions/6750546/export-csv-without-col-names
200*2abb3134SXin Li  counts_path <- paste0(out_prefix, '_counts.csv')
201*2abb3134SXin Li  write.table(counts, file = counts_path,
202*2abb3134SXin Li              row.names = FALSE, col.names = FALSE, sep = ',')
203*2abb3134SXin Li  cat(sprintf('Wrote %s\n', counts_path))
204*2abb3134SXin Li
205*2abb3134SXin Li  # TODO: Don't write strings that appear 0 times?
206*2abb3134SXin Li  hist_path <- paste0(out_prefix, '_hist.csv')
207*2abb3134SXin Li  write.csv(true_hist, file = hist_path, row.names = FALSE)
208*2abb3134SXin Li  cat(sprintf('Wrote %s\n', hist_path))
209*2abb3134SXin Li}
210*2abb3134SXin Li
211*2abb3134SXin Liif (length(sys.frames()) == 0) {
212*2abb3134SXin Li  main(commandArgs(TRUE))
213*2abb3134SXin Li}
214