1#!/usr/bin/python 2# 3# Copyright 2014 Google Inc. All rights reserved. 4# 5# Licensed under the Apache License, Version 2.0 (the "License"); 6# you may not use this file except in compliance with the License. 7# You may obtain a copy of the License at 8# 9# http://www.apache.org/licenses/LICENSE-2.0 10# 11# Unless required by applicable law or agreed to in writing, software 12# distributed under the License is distributed on an "AS IS" BASIS, 13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14# See the License for the specific language governing permissions and 15# limitations under the License. 16 17"""Run the RAPPOR Python client on simulated input. 18 19It takes a 3-column CSV file as generated by gen_reports.R, and outputs a 5 20column CSV of RAPPOR'd data. 21 22Input columns: client,true_value 23Output coumns: client,cohort,bloom,prr,rappor 24 25TODO: 26- cohort should be in the input _input.csv file. 27 28See http://google.github.io/rappor/doc/data-flow.html for details. 29""" 30 31import csv 32import collections 33import optparse 34import os 35import random 36import sys 37import time 38 39import rappor # client library 40try: 41 import fastrand 42except ImportError: 43 print >>sys.stderr, ( 44 "Native fastrand module not imported; see README for speedups") 45 fastrand = None 46 47 48def log(msg, *args): 49 if args: 50 msg = msg % args 51 print >>sys.stderr, msg 52 53 54def CreateOptionsParser(): 55 p = optparse.OptionParser() 56 57 p.add_option( 58 '--num-bits', type='int', metavar='INT', dest='num_bits', default=16, 59 help='Number of bloom filter bits.') 60 p.add_option( 61 '--num-hashes', type='int', metavar='INT', dest='num_hashes', default=2, 62 help='Number of hashes.') 63 p.add_option( 64 '--num-cohorts', type='int', metavar='INT', dest='num_cohorts', 65 default=64, help='Number of cohorts.') 66 67 p.add_option( 68 '-p', type='float', metavar='FLOAT', dest='prob_p', default=1, 69 help='Probability p') 70 p.add_option( 71 '-q', type='float', metavar='FLOAT', dest='prob_q', default=1, 72 help='Probability q') 73 p.add_option( 74 '-f', type='float', metavar='FLOAT', dest='prob_f', default=1, 75 help='Probability f') 76 p.add_option( 77 '--assoc-testdata', type='int', dest='assoc_testdata', default=0, 78 help='Generate association testdata from true values on stdin.') 79 80 choices = ['simple', 'fast'] 81 p.add_option( 82 '-r', type='choice', metavar='STR', 83 dest='random_mode', default='fast', choices=choices, 84 help='Random algorithm (%s)' % '|'.join(choices)) 85 86 return p 87 88 89def GenAssocTestdata(params1, params2, irr_rand, assoc_testdata_count, 90 csv_in, csv_out): 91 """Read true values from csv_in and output encoded values on csv_out. 92 93 Replicate assoc_testdata_count times. First value is a string, second is a 94 bool. TODO: Generalize this. 95 """ 96 rows = [] 97 for i, (true_value1, true_value2) in enumerate(csv_in): 98 if i == 0: 99 v1_name = true_value1 100 v2_name = true_value2 101 continue # skip header row 102 103 rows.append((true_value1, true_value2)) 104 105 # Use the same column names 106 header = ('client', 'cohort', v1_name, v2_name) 107 csv_out.writerow(header) 108 109 n = assoc_testdata_count 110 report_index = 0 111 for i in xrange(n): 112 for v1, v2 in rows: 113 client_str = 'c%d' % report_index 114 115 # randint(a, b) gives i such that a <= i <= b 116 cohort = random.randint(0, params1.num_cohorts - 1) 117 118 string_encoder = rappor.Encoder(params1, cohort, client_str, irr_rand) 119 bool_encoder = rappor.Encoder(params2, cohort, client_str, irr_rand) 120 121 # Real users should call e.encode(). For testing purposes, we also want 122 # the PRR. 123 irr1 = string_encoder.encode(v1) 124 125 # TODO: Convert to bool and encode with basic RAPPOR 126 v2_int = int(v2) 127 #print v2_int 128 irr2 = bool_encoder.encode_bits(v2_int) 129 130 irr1_str = rappor.bit_string(irr1, params1.num_bloombits) 131 irr2_str = rappor.bit_string(irr2, params2.num_bloombits) 132 133 csv_out.writerow((client_str, cohort, irr1_str, irr2_str)) 134 135 report_index += 1 136 137 138def RapporClientSim(params, irr_rand, csv_in, csv_out): 139 """Read true values from csv_in and output encoded values on csv_out.""" 140 header = ('client', 'cohort', 'bloom', 'prr', 'irr') 141 csv_out.writerow(header) 142 143 # TODO: It would be more instructive/efficient to construct an encoder 144 # instance up front per client, rather than one per row below. 145 start_time = time.time() 146 147 for i, (client_str, cohort_str, true_value) in enumerate(csv_in): 148 if i == 0: 149 if client_str != 'client': 150 raise RuntimeError('Expected client header, got %s' % client_str) 151 if cohort_str != 'cohort': 152 raise RuntimeError('Expected cohort header, got %s' % cohort_str) 153 if true_value != 'value': 154 raise RuntimeError('Expected value header, got %s' % value) 155 continue # skip header row 156 157 #if i == 30: # EARLY STOP 158 # break 159 160 if i % 10000 == 0: 161 elapsed = time.time() - start_time 162 log('Processed %d inputs in %.2f seconds', i, elapsed) 163 164 cohort = int(cohort_str) 165 secret = client_str 166 e = rappor.Encoder(params, cohort, secret, irr_rand) 167 168 # Real users should call e.encode(). For testing purposes, we also want 169 # the PRR. 170 bloom, prr, irr = e._internal_encode(true_value) 171 172 bloom_str = rappor.bit_string(bloom, params.num_bloombits) 173 prr_str = rappor.bit_string(prr, params.num_bloombits) 174 irr_str = rappor.bit_string(irr, params.num_bloombits) 175 176 out_row = (client_str, cohort_str, bloom_str, prr_str, irr_str) 177 csv_out.writerow(out_row) 178 179 180def main(argv): 181 (opts, argv) = CreateOptionsParser().parse_args(argv) 182 183 # Copy flags into params 184 params = rappor.Params() 185 params.num_bloombits = opts.num_bits 186 params.num_hashes = opts.num_hashes 187 params.num_cohorts = opts.num_cohorts 188 params.prob_p = opts.prob_p 189 params.prob_q = opts.prob_q 190 params.prob_f = opts.prob_f 191 192 if opts.random_mode == 'simple': 193 irr_rand = rappor.SecureIrrRand(params) 194 elif opts.random_mode == 'fast': 195 if fastrand: 196 log('Using fastrand extension') 197 # NOTE: This doesn't take 'rand'. It's seeded in C with srand(). 198 irr_rand = fastrand.FastIrrRand(params) 199 else: 200 log('Warning: fastrand module not importable; see README for build ' 201 'instructions. Falling back to simple randomness.') 202 irr_rand = rappor.SecureIrrRand(params) 203 else: 204 raise AssertionError 205 # Other possible implementations: 206 # - random.SystemRandom (probably uses /dev/urandom on Linux) 207 # - HMAC-SHA256 with another secret? This could match C++ byte for byte. 208 # - or srand(0) might do it. 209 210 csv_in = csv.reader(sys.stdin) 211 csv_out = csv.writer(sys.stdout) 212 213 if opts.assoc_testdata: 214 # Copy flags into params 215 params1 = rappor.Params() 216 params1.num_bloombits = opts.num_bits 217 params1.num_hashes = opts.num_hashes 218 params1.num_cohorts = opts.num_cohorts 219 params1.prob_p = opts.prob_p 220 params1.prob_q = opts.prob_q 221 params1.prob_f = opts.prob_f 222 223 # Second one is boolean 224 params2 = rappor.Params() 225 params2.num_bloombits = 1 # 1 bit for boolean 226 params2.num_hashes = opts.num_hashes 227 params2.num_cohorts = opts.num_cohorts 228 params2.prob_p = opts.prob_p 229 params2.prob_q = opts.prob_q 230 params2.prob_f = opts.prob_f 231 232 GenAssocTestdata( 233 params1, params2, irr_rand, opts.assoc_testdata, csv_in, csv_out) 234 else: 235 RapporClientSim(params, irr_rand, csv_in, csv_out) 236 237 238if __name__ == "__main__": 239 try: 240 main(sys.argv) 241 except RuntimeError, e: 242 log('rappor_sim.py: FATAL: %s', e) 243