rappor/tests/gen_true_values.R

*2abb3134SXin Li#!/usr/bin/env Rscript
*2abb3134SXin Li#
*2abb3134SXin Li# Copyright 2015 Google Inc. All rights reserved.
*2abb3134SXin Li#
*2abb3134SXin Li# Licensed under the Apache License, Version 2.0 (the "License");
*2abb3134SXin Li# you may not use this file except in compliance with the License.
*2abb3134SXin Li# You may obtain a copy of the License at
*2abb3134SXin Li#
*2abb3134SXin Li#     http://www.apache.org/licenses/LICENSE-2.0
*2abb3134SXin Li#
*2abb3134SXin Li# Unless required by applicable law or agreed to in writing, software
*2abb3134SXin Li# distributed under the License is distributed on an "AS IS" BASIS,
*2abb3134SXin Li# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*2abb3134SXin Li# See the License for the specific language governing permissions and
*2abb3134SXin Li# limitations under the License.
*2abb3134SXin Li
*2abb3134SXin Lisource('tests/gen_counts.R')
*2abb3134SXin Li
# Usage:
#
# $ ./gen_true_values.R exp 100 10000 1 64 foo.csv
#
# Inputs:
#   distribution name
#   size of the distribution's support
#   number of clients
#   reports per client
#   number of cohorts
#   name of the output file
# Output:
#   csv file with reports sampled according to the specified distribution
#   (columns: client, cohort, value).
*2abb3134SXin Li
# Generates simulated client reports drawn from a named distribution.
#
# Args:
#   distr: distribution name, passed through to ComputePdf.
#   distr_range: size of the distribution's support; reported values are the
#     strings "v1" .. "v<distr_range>".
#   num_clients: number of clients; clients are named "c1" .. "c<num_clients>".
#   reports_per_client: number of reports generated for every client.
#   num_cohorts: number of cohorts; client i gets cohort i %% num_cohorts.
#
# Returns:
#   A data frame with one row per report and the columns client, cohort and
#   value.
GenerateTrueValues <- function(distr, distr_range, num_clients,
                               reports_per_client, num_cohorts) {

  # Sums to 1.0, e.g. [0.2 0.2 0.2 0.2 0.2] for uniform distribution of 5.
  pdf <- ComputePdf(distr, distr_range)

  num_reports <- num_clients * reports_per_client

  # Computes the number of clients reporting each value, where the numbers are
  # sampled according to pdf.  (sums to num_reports)
  partition <- RandomPartition(num_reports, pdf)

  # Expand the partition into one integer per report.  seq_len() (rather than
  # 1:n) keeps the sequence empty when distr_range is 0.
  value_ints <- rep(seq_len(distr_range), partition)

  stopifnot(length(value_ints) == num_reports)

  # Shuffle values randomly (may take a few sec for > 10^8 inputs).
  # Index with sample.int() instead of calling sample(value_ints): sample()
  # treats a single number x >= 1 as "sample from 1:x", which would turn one
  # report into x reports.  For longer vectors both forms are equivalent.
  value_ints <- value_ints[sample.int(length(value_ints))]

  # Reported values are strings, so prefix integers "v". Even slower than
  # shuffling.
  values <- sprintf("v%d", value_ints)

  # e.g. [1 1 2 2 3 3] if num_clients is 3 and reports_per_client is 2.
  # seq_len() yields an empty vector (not c(1, 0)) when num_clients is 0.
  client_ints <- rep(seq_len(num_clients), each = reports_per_client)

  # Cohorts are assigned to clients. Cohorts are 0-based.
  cohorts <- client_ints %% num_cohorts  # %% is integer modulus

  clients <- sprintf("c%d", client_ints)

  data.frame(client = clients, cohort = cohorts, value = values)
}
*2abb3134SXin Li
# Command-line entry point: parses the arguments, generates the reports and
# writes them to a CSV file.
#
# Args:
#   argv: character vector of command-line arguments, in order:
#     distribution name, size of the distribution's support, number of
#     clients, reports per client, number of cohorts, output file name.
#     Arguments beyond the sixth are ignored.
main <- function(argv) {
  # Fail with a readable usage message instead of "subscript out of bounds".
  if (length(argv) < 6) {
    stop("Usage: gen_true_values.R <distr> <distr_range> <num_clients> ",
         "<reports_per_client> <num_cohorts> <out_file>", call. = FALSE)
  }

  distr <- argv[[1]]
  distr_range <- as.integer(argv[[2]])
  num_clients <- as.integer(argv[[3]])
  reports_per_client <- as.integer(argv[[4]])
  num_cohorts <- as.integer(argv[[5]])
  out_file <- argv[[6]]

  # as.integer() turns unparsable input into NA; reject it up front.
  numeric_args <- c(distr_range, num_clients, reports_per_client, num_cohorts)
  if (any(is.na(numeric_args))) {
    stop("distr_range, num_clients, reports_per_client and num_cohorts ",
         "must be integers", call. = FALSE)
  }

  reports <- GenerateTrueValues(distr, distr_range, num_clients,
                                reports_per_client, num_cohorts)

  # Plain CSV: header row, no row names, no quoting.
  write.csv(reports, file = out_file, row.names = FALSE, quote = FALSE)
}
*2abb3134SXin Li
# Run main() only when this file is executed directly (e.g. via Rscript): at
# top level there are no active function frames.  When the file is source()d,
# frames exist and main() is not invoked.
if (sys.nframe() == 0L) {
  main(commandArgs(trailingOnly = TRUE))
}