#!/usr/bin/env Rscript
#
# Copyright 2015 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

source('tests/gen_counts.R')

# Usage:
#
# $ ./gen_true_values.R exp 100 10000 1 64 foo.csv
#
# Inputs:
#   distribution name
#   size of the distribution's support
#   number of clients
#   reports per client
#   number of cohorts
#   name of the output file
# Output:
#   csv file with reports sampled according to the specified distribution.

# Builds a data frame of simulated reports, one row per report.
#
# Args:
#   distr: distribution name, passed through to ComputePdf.
#   distr_range: size of the distribution's support; values are "v1" ..
#     "v<distr_range>".
#   num_clients: number of clients; clients are "c1" .. "c<num_clients>".
#   reports_per_client: number of reports generated for each client.
#   num_cohorts: number of cohorts (must be >= 1); a client's cohort is its
#     1-based index modulo num_cohorts, so cohorts are 0-based.
#
# Returns:
#   A data frame with columns client, cohort and value, holding
#   num_clients * reports_per_client rows.
GenerateTrueValues <- function(distr, distr_range, num_clients,
                               reports_per_client, num_cohorts) {
  # A modulus of 0 would yield NA cohorts, and a negative one negative cohorts.
  stopifnot(num_cohorts >= 1)

  # Sums to 1.0, e.g. [0.2 0.2 0.2 0.2 0.2] for uniform distribution of 5.
  pdf <- ComputePdf(distr, distr_range)

  num_reports <- num_clients * reports_per_client

  # Computes the number of clients reporting each value, where the numbers are
  # sampled according to pdf.  (sums to num_reports)
  partition <- RandomPartition(num_reports, pdf)

  # Expand partition.  seq_len() rather than 1:n, because 1:0 is c(1, 0).
  value_ints <- rep(seq_len(distr_range), partition)

  stopifnot(length(value_ints) == num_reports)

  # Shuffle values randomly (may take a few sec for > 10^8 inputs).
  # Permute indices instead of calling sample(value_ints): for a single numeric
  # element k, sample(k) draws from 1:k and would return k values instead of 1.
  value_ints <- value_ints[sample.int(length(value_ints))]

  # Reported values are strings, so prefix integers "v".  Even slower than
  # shuffling.
  values <- sprintf("v%d", value_ints)

  # e.g. [1 1 2 2 3 3] if num_clients is 3 and reports_per_client is 2
  client_ints <- rep(seq_len(num_clients), each = reports_per_client)

  # Cohorts are assigned to clients.  Cohorts are 0-based.
  cohorts <- client_ints %% num_cohorts  # %% is integer modulus

  clients <- sprintf("c%d", client_ints)

  data.frame(client = clients, cohort = cohorts, value = values)
}

# Command-line entry point: parses the arguments, generates the reports and
# writes them to a CSV file.
#
# Args:
#   argv: character vector of command-line arguments, in the order
#     distr, distr_range, num_clients, reports_per_client, num_cohorts,
#     out_file.  Arguments beyond the sixth are ignored.
main <- function(argv) {
  if (length(argv) < 6) {
    stop("Usage: gen_true_values.R <distr> <distr_range> <num_clients> ",
         "<reports_per_client> <num_cohorts> <out_file>", call. = FALSE)
  }

  distr <- argv[[1]]
  distr_range <- as.integer(argv[[2]])
  num_clients <- as.integer(argv[[3]])
  reports_per_client <- as.integer(argv[[4]])
  num_cohorts <- as.integer(argv[[5]])
  out_file <- argv[[6]]

  # as.integer() yields NA for text that is not a number; fail here with a
  # clear message instead of deep inside the generator.
  int_args <- c(distr_range, num_clients, reports_per_client, num_cohorts)
  if (any(is.na(int_args))) {
    stop("distr_range, num_clients, reports_per_client and num_cohorts ",
         "must be integers", call. = FALSE)
  }

  reports <- GenerateTrueValues(distr, distr_range, num_clients,
                                reports_per_client, num_cohorts)

  write.csv(reports, file = out_file, row.names = FALSE, quote = FALSE)
}

# Run main() only when executed as a script (no enclosing call frames), not
# when this file is source()d from inside a function.
if (length(sys.frames()) == 0) {
  main(commandArgs(TRUE))
}