#!/usr/bin/python
#
# Copyright 2014 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Run the RAPPOR Python client on simulated input.

It takes a 3-column CSV file as generated by gen_reports.R, and outputs a 5
column CSV of RAPPOR'd data.

Input columns: client,cohort,value
Output columns: client,cohort,bloom,prr,irr

With --assoc-testdata N, the input is instead a 2-column CSV (with a header
row) of true values, and the output columns are client,cohort followed by the
two input column names.

See http://google.github.io/rappor/doc/data-flow.html for details.
"""

import csv
import collections
import optparse
import os
import random
import sys
import time

import rappor  # client library
try:
  import fastrand
except ImportError:
  sys.stderr.write(
      "Native fastrand module not imported; see README for speedups\n")
  fastrand = None


def log(msg, *args):
  """Write a message to stderr, %-formatting it with args if any are given."""
  if args:
    msg = msg % args
  sys.stderr.write('%s\n' % (msg,))


def CreateOptionsParser():
  """Return an optparse.OptionParser describing this script's flags."""
  p = optparse.OptionParser()

  p.add_option(
      '--num-bits', type='int', metavar='INT', dest='num_bits', default=16,
      help='Number of bloom filter bits.')
  p.add_option(
      '--num-hashes', type='int', metavar='INT', dest='num_hashes', default=2,
      help='Number of hashes.')
  p.add_option(
      '--num-cohorts', type='int', metavar='INT', dest='num_cohorts',
      default=64, help='Number of cohorts.')

  p.add_option(
      '-p', type='float', metavar='FLOAT', dest='prob_p', default=1,
      help='Probability p')
  p.add_option(
      '-q', type='float', metavar='FLOAT', dest='prob_q', default=1,
      help='Probability q')
  p.add_option(
      '-f', type='float', metavar='FLOAT', dest='prob_f', default=1,
      help='Probability f')
  p.add_option(
      '--assoc-testdata', type='int', dest='assoc_testdata', default=0,
      help='Generate association testdata from true values on stdin.')

  choices = ['simple', 'fast']
  p.add_option(
      '-r', type='choice', metavar='STR',
      dest='random_mode', default='fast', choices=choices,
      help='Random algorithm (%s)' % '|'.join(choices))

  return p


def GenAssocTestdata(params1, params2, irr_rand, assoc_testdata_count,
                     csv_in, csv_out):
  """Read true values from csv_in and output encoded values on csv_out.

  Replicate assoc_testdata_count times.  First value is a string, second is a
  bool.  TODO: Generalize this.

  Args:
    params1: rappor.Params used to encode the first (string) column.
    params2: rappor.Params used to encode the second (bool) column.
    irr_rand: randomness object passed through to rappor.Encoder.
    assoc_testdata_count: number of times each input row is replicated.
    csv_in: iterable of 2-element rows; the first row is the header.
    csv_out: object with a writerow() method (e.g. csv.writer).

  Raises:
    RuntimeError: if csv_in yields no rows at all (no header to copy).
  """
  v1_name = None
  v2_name = None
  rows = []
  for i, (true_value1, true_value2) in enumerate(csv_in):
    if i == 0:
      v1_name = true_value1
      v2_name = true_value2
      continue  # skip header row

    rows.append((true_value1, true_value2))

  if v1_name is None:
    # Without this check the header construction below would fail with an
    # unbound local variable, which the entry point does not report cleanly.
    raise RuntimeError('Expected a header row on input, got no rows')

  # Use the same column names
  header = ('client', 'cohort', v1_name, v2_name)
  csv_out.writerow(header)

  report_index = 0
  for _ in range(assoc_testdata_count):
    for v1, v2 in rows:
      client_str = 'c%d' % report_index

      # randint(a, b) gives i such that a <= i <= b
      cohort = random.randint(0, params1.num_cohorts - 1)

      string_encoder = rappor.Encoder(params1, cohort, client_str, irr_rand)
      bool_encoder = rappor.Encoder(params2, cohort, client_str, irr_rand)

      irr1 = string_encoder.encode(v1)

      # TODO: Convert to bool and encode with basic RAPPOR
      v2_int = int(v2)
      irr2 = bool_encoder.encode_bits(v2_int)

      irr1_str = rappor.bit_string(irr1, params1.num_bloombits)
      irr2_str = rappor.bit_string(irr2, params2.num_bloombits)

      csv_out.writerow((client_str, cohort, irr1_str, irr2_str))

      report_index += 1


def RapporClientSim(params, irr_rand, csv_in, csv_out):
  """Read true values from csv_in and output encoded values on csv_out.

  Args:
    params: rappor.Params used for every encoder.
    irr_rand: randomness object passed through to rappor.Encoder.
    csv_in: iterable of 3-element rows; the first row must be the header
      ('client', 'cohort', 'value').
    csv_out: object with a writerow() method (e.g. csv.writer).

  Raises:
    RuntimeError: if the header row does not match the expected names.
  """
  header = ('client', 'cohort', 'bloom', 'prr', 'irr')
  csv_out.writerow(header)

  # TODO: It would be more instructive/efficient to construct an encoder
  # instance up front per client, rather than one per row below.
  start_time = time.time()

  for i, (client_str, cohort_str, true_value) in enumerate(csv_in):
    if i == 0:
      if client_str != 'client':
        raise RuntimeError('Expected client header, got %s' % client_str)
      if cohort_str != 'cohort':
        raise RuntimeError('Expected cohort header, got %s' % cohort_str)
      if true_value != 'value':
        raise RuntimeError('Expected value header, got %s' % true_value)
      continue  # skip header row

    if i % 10000 == 0:
      elapsed = time.time() - start_time
      log('Processed %d inputs in %.2f seconds', i, elapsed)

    cohort = int(cohort_str)
    secret = client_str
    e = rappor.Encoder(params, cohort, secret, irr_rand)

    # Real users should call e.encode().  For testing purposes, we also want
    # the PRR.
    bloom, prr, irr = e._internal_encode(true_value)

    bloom_str = rappor.bit_string(bloom, params.num_bloombits)
    prr_str = rappor.bit_string(prr, params.num_bloombits)
    irr_str = rappor.bit_string(irr, params.num_bloombits)

    out_row = (client_str, cohort_str, bloom_str, prr_str, irr_str)
    csv_out.writerow(out_row)


def _ParamsFromOptions(opts, num_bloombits=None):
  """Copy parsed flags into a fresh rappor.Params.

  Args:
    opts: option values returned by CreateOptionsParser().parse_args().
    num_bloombits: if not None, used instead of opts.num_bits.
  """
  params = rappor.Params()
  if num_bloombits is None:
    num_bloombits = opts.num_bits
  params.num_bloombits = num_bloombits
  params.num_hashes = opts.num_hashes
  params.num_cohorts = opts.num_cohorts
  params.prob_p = opts.prob_p
  params.prob_q = opts.prob_q
  params.prob_f = opts.prob_f
  return params


def main(argv):
  """Parse flags, then encode CSV rows from stdin onto stdout.

  Args:
    argv: full argument vector, including the program name.

  Raises:
    RuntimeError: propagated from the simulation functions on bad input.
  """
  (opts, argv) = CreateOptionsParser().parse_args(argv)

  # Copy flags into params
  params = _ParamsFromOptions(opts)

  if opts.random_mode == 'simple':
    irr_rand = rappor.SecureIrrRand(params)
  elif opts.random_mode == 'fast':
    if fastrand:
      log('Using fastrand extension')
      # NOTE: This doesn't take 'rand'.  It's seeded in C with srand().
      irr_rand = fastrand.FastIrrRand(params)
    else:
      log('Warning: fastrand module not importable; see README for build '
          'instructions.  Falling back to simple randomness.')
      irr_rand = rappor.SecureIrrRand(params)
  else:
    raise AssertionError
  # Other possible implementations:
  # - random.SystemRandom (probably uses /dev/urandom on Linux)
  # - HMAC-SHA256 with another secret?  This could match C++ byte for byte.
  #   - or srand(0) might do it.

  csv_in = csv.reader(sys.stdin)
  csv_out = csv.writer(sys.stdout)

  if opts.assoc_testdata:
    params1 = _ParamsFromOptions(opts)
    # Second one is boolean: 1 bit for boolean
    params2 = _ParamsFromOptions(opts, num_bloombits=1)

    GenAssocTestdata(
        params1, params2, irr_rand, opts.assoc_testdata, csv_in, csv_out)
  else:
    RapporClientSim(params, irr_rand, csv_in, csv_out)


if __name__ == "__main__":
  try:
    main(sys.argv)
  except RuntimeError as e:
    log('rappor_sim.py: FATAL: %s', e)
    # A fatal error must be visible to calling scripts via the exit status.
    sys.exit(1)