xref: /aosp_15_r20/external/rappor/tests/rappor_sim.py (revision 2abb31345f6c95944768b5222a9a5ed3fc68cc00)
1*2abb3134SXin Li#!/usr/bin/python
2*2abb3134SXin Li#
3*2abb3134SXin Li# Copyright 2014 Google Inc. All rights reserved.
4*2abb3134SXin Li#
5*2abb3134SXin Li# Licensed under the Apache License, Version 2.0 (the "License");
6*2abb3134SXin Li# you may not use this file except in compliance with the License.
7*2abb3134SXin Li# You may obtain a copy of the License at
8*2abb3134SXin Li#
9*2abb3134SXin Li#     http://www.apache.org/licenses/LICENSE-2.0
10*2abb3134SXin Li#
11*2abb3134SXin Li# Unless required by applicable law or agreed to in writing, software
12*2abb3134SXin Li# distributed under the License is distributed on an "AS IS" BASIS,
13*2abb3134SXin Li# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14*2abb3134SXin Li# See the License for the specific language governing permissions and
15*2abb3134SXin Li# limitations under the License.
16*2abb3134SXin Li
17*2abb3134SXin Li"""Run the RAPPOR Python client on simulated input.
18*2abb3134SXin Li
19*2abb3134SXin LiIt takes a 3-column CSV file as generated by gen_reports.R, and outputs a 5
20*2abb3134SXin Licolumn CSV of RAPPOR'd data.
21*2abb3134SXin Li
Input columns: client,cohort,value
Output columns: client,cohort,bloom,prr,irr
24*2abb3134SXin Li
25*2abb3134SXin LiTODO:
26*2abb3134SXin Li- cohort should be in the input _input.csv file.
27*2abb3134SXin Li
28*2abb3134SXin LiSee http://google.github.io/rappor/doc/data-flow.html for details.
29*2abb3134SXin Li"""
30*2abb3134SXin Li
31*2abb3134SXin Liimport csv
32*2abb3134SXin Liimport collections
33*2abb3134SXin Liimport optparse
34*2abb3134SXin Liimport os
35*2abb3134SXin Liimport random
36*2abb3134SXin Liimport sys
37*2abb3134SXin Liimport time
38*2abb3134SXin Li
39*2abb3134SXin Liimport rappor  # client library
try:
  import fastrand
except ImportError:
  # fastrand is an optional native extension.  When it is missing we only warn
  # here; main() checks the None sentinel and falls back to
  # rappor.SecureIrrRand.  sys.stderr.write() is used instead of the
  # Python 2-only "print >>" statement so this path works on Python 2 and 3.
  sys.stderr.write(
      "Native fastrand module not imported; see README for speedups\n")
  fastrand = None
46*2abb3134SXin Li
47*2abb3134SXin Li
def log(msg, *args):
  """Write one diagnostic line to stderr.

  Args:
    msg: Message text.  It is used as a %-format string only when args are
        given, so a literal '%' is safe when no args are passed.
    *args: Optional values interpolated into msg with the % operator.
  """
  if args:
    msg = msg % args
  # sys.stderr.write() emits the same bytes as the Python 2-only
  # "print >>sys.stderr, msg" statement, but also works on Python 3.
  sys.stderr.write('%s\n' % (msg,))
52*2abb3134SXin Li
53*2abb3134SXin Li
def CreateOptionsParser():
  """Build the command line parser for the simulator.

  Returns:
    An optparse.OptionParser that knows the Bloom filter flags (--num-bits,
    --num-hashes, --num-cohorts), the probability flags (-p, -q, -f),
    --assoc-testdata, and the randomness selector -r.
  """
  parser = optparse.OptionParser()

  # (flag, dest, default, help) for the integer Bloom filter parameters.
  int_flags = (
      ('--num-bits', 'num_bits', 16, 'Number of bloom filter bits.'),
      ('--num-hashes', 'num_hashes', 2, 'Number of hashes.'),
      ('--num-cohorts', 'num_cohorts', 64, 'Number of cohorts.'),
  )
  for flag, dest, default, help_text in int_flags:
    parser.add_option(
        flag, type='int', metavar='INT', dest=dest, default=default,
        help=help_text)

  # (flag, dest, help) for the probability parameters; all default to 1.
  float_flags = (
      ('-p', 'prob_p', 'Probability p'),
      ('-q', 'prob_q', 'Probability q'),
      ('-f', 'prob_f', 'Probability f'),
  )
  for flag, dest, help_text in float_flags:
    parser.add_option(
        flag, type='float', metavar='FLOAT', dest=dest, default=1,
        help=help_text)

  parser.add_option(
      '--assoc-testdata', type='int', dest='assoc_testdata', default=0,
      help='Generate association testdata from true values on stdin.')

  random_modes = ['simple', 'fast']
  random_help = 'Random algorithm (%s)' % '|'.join(random_modes)
  parser.add_option(
      '-r', type='choice', metavar='STR', dest='random_mode', default='fast',
      choices=random_modes, help=random_help)

  return parser
87*2abb3134SXin Li
88*2abb3134SXin Li
def GenAssocTestdata(params1, params2, irr_rand, assoc_testdata_count,
                     csv_in, csv_out):
  """Read pairs of true values from csv_in and write encoded pairs to csv_out.

  The first input row is a header naming the two variables; its names are
  reused as the last two output column names.  Every following row is
  replicated assoc_testdata_count times, each replica as a new client
  ('c0', 'c1', ...) in a randomly chosen cohort.  The first value is encoded
  as a string, the second is parsed with int() and encoded as raw bits.
  TODO: Generalize this.

  Args:
    params1: rappor.Params used to encode the first (string) value.
    params2: rappor.Params used to encode the second (boolean) value.
    irr_rand: Randomness object handed to both rappor.Encoder instances.
    assoc_testdata_count: Number of times to replicate the input rows.
    csv_in: Iterable of 2-element rows, header row first.
    csv_out: Object with a writerow() method, e.g. a csv.writer.

  Raises:
    RuntimeError: If csv_in yields no rows at all (not even a header).
    ValueError: If a row does not have exactly two columns, or a second
        column value cannot be parsed by int().
  """
  row_iter = iter(csv_in)
  try:
    v1_name, v2_name = next(row_iter)  # header row
  except StopIteration:
    # Without this check the header names would be unbound further down.
    raise RuntimeError('Expected a header row, got empty input')

  # Materialize the data rows because they are replayed once per replica.
  rows = [(first, second) for first, second in row_iter]

  # Use the same column names
  csv_out.writerow(('client', 'cohort', v1_name, v2_name))

  report_index = 0
  for _ in range(assoc_testdata_count):
    for v1, v2 in rows:
      client_str = 'c%d' % report_index

      # randint(a, b) gives i such that a <= i <= b
      cohort = random.randint(0, params1.num_cohorts - 1)

      # Both encoders share the cohort and client secret so the two reports
      # belong to the same simulated client.
      string_encoder = rappor.Encoder(params1, cohort, client_str, irr_rand)
      bool_encoder = rappor.Encoder(params2, cohort, client_str, irr_rand)

      irr1 = string_encoder.encode(v1)

      # TODO: Convert to bool and encode with basic RAPPOR
      irr2 = bool_encoder.encode_bits(int(v2))

      irr1_str = rappor.bit_string(irr1, params1.num_bloombits)
      irr2_str = rappor.bit_string(irr2, params2.num_bloombits)

      csv_out.writerow((client_str, cohort, irr1_str, irr2_str))

      report_index += 1
136*2abb3134SXin Li
137*2abb3134SXin Li
def RapporClientSim(params, irr_rand, csv_in, csv_out):
  """Read true values from csv_in and output encoded values on csv_out.

  Args:
    params: rappor.Params shared by every encoder.
    irr_rand: Randomness object handed to each rappor.Encoder.
    csv_in: Iterable of (client, cohort, value) rows.  The first row must be
        the literal header client,cohort,value.
    csv_out: Object with a writerow() method, e.g. a csv.writer.  It receives
        a header row followed by one client,cohort,bloom,prr,irr row per
        input row.

  Raises:
    RuntimeError: If the header row does not match client,cohort,value.
    ValueError: If a row does not have exactly three columns, or a cohort is
        not an integer.
  """
  header = ('client', 'cohort', 'bloom', 'prr', 'irr')
  csv_out.writerow(header)

  # TODO: It would be more instructive/efficient to construct an encoder
  # instance up front per client, rather than one per row below.
  start_time = time.time()

  for i, (client_str, cohort_str, true_value) in enumerate(csv_in):
    if i == 0:
      if client_str != 'client':
        raise RuntimeError('Expected client header, got %s' % client_str)
      if cohort_str != 'cohort':
        raise RuntimeError('Expected cohort header, got %s' % cohort_str)
      if true_value != 'value':
        # Report the offending column; the name 'value' is not defined here.
        raise RuntimeError('Expected value header, got %s' % true_value)
      continue  # skip header row

    # Periodic progress report for long inputs.
    if i % 10000 == 0:
      elapsed = time.time() - start_time
      log('Processed %d inputs in %.2f seconds', i, elapsed)

    cohort = int(cohort_str)
    secret = client_str  # the client name doubles as the encoder secret
    e = rappor.Encoder(params, cohort, secret, irr_rand)

    # Real users should call e.encode().  For testing purposes, we also want
    # the PRR.
    bloom, prr, irr = e._internal_encode(true_value)

    bloom_str = rappor.bit_string(bloom, params.num_bloombits)
    prr_str = rappor.bit_string(prr, params.num_bloombits)
    irr_str = rappor.bit_string(irr, params.num_bloombits)

    out_row = (client_str, cohort_str, bloom_str, prr_str, irr_str)
    csv_out.writerow(out_row)
178*2abb3134SXin Li
179*2abb3134SXin Li
def _ParamsFromFlags(opts, num_bloombits=None):
  """Return a new rappor.Params filled in from parsed command line flags.

  Args:
    opts: Option values produced by CreateOptionsParser().parse_args().
    num_bloombits: Optional override for the number of Bloom filter bits;
        when None, opts.num_bits is used.
  """
  params = rappor.Params()
  params.num_bloombits = (
      opts.num_bits if num_bloombits is None else num_bloombits)
  params.num_hashes = opts.num_hashes
  params.num_cohorts = opts.num_cohorts
  params.prob_p = opts.prob_p
  params.prob_q = opts.prob_q
  params.prob_f = opts.prob_f
  return params


def main(argv):
  """Encode the CSV on stdin with RAPPOR and write the result to stdout.

  Args:
    argv: Full argument vector (the caller passes sys.argv).  Positional
        arguments left over after flag parsing are ignored.

  Raises:
    RuntimeError: Propagated from RapporClientSim / GenAssocTestdata when the
        input is malformed.
  """
  (opts, argv) = CreateOptionsParser().parse_args(argv)

  # Copy flags into params
  params = _ParamsFromFlags(opts)

  if opts.random_mode == 'simple':
    irr_rand = rappor.SecureIrrRand(params)
  elif opts.random_mode == 'fast':
    if fastrand:
      log('Using fastrand extension')
      # NOTE: This doesn't take 'rand'.  It's seeded in C with srand().
      irr_rand = fastrand.FastIrrRand(params)
    else:
      log('Warning: fastrand module not importable; see README for build '
          'instructions.  Falling back to simple randomness.')
      irr_rand = rappor.SecureIrrRand(params)
  else:
    # The parser restricts -r to the choices above, so this is a programming
    # error rather than a user error.
    raise AssertionError('Unexpected random mode %r' % opts.random_mode)
  # Other possible implementations:
  # - random.SystemRandom (probably uses /dev/urandom on Linux)
  # - HMAC-SHA256 with another secret?  This could match C++ byte for byte.
  #   - or srand(0) might do it.

  csv_in = csv.reader(sys.stdin)
  csv_out = csv.writer(sys.stdout)

  if opts.assoc_testdata:
    # First variable uses the flag values as-is.
    params1 = _ParamsFromFlags(opts)
    # Second one is boolean: 1 bit, everything else taken from the flags.
    params2 = _ParamsFromFlags(opts, num_bloombits=1)

    GenAssocTestdata(
        params1, params2, irr_rand, opts.assoc_testdata, csv_in, csv_out)
  else:
    RapporClientSim(params, irr_rand, csv_in, csv_out)
236*2abb3134SXin Li
237*2abb3134SXin Li
if __name__ == "__main__":
  try:
    main(sys.argv)
  except RuntimeError as e:
    # "except X as e" is valid on Python 2.6+ and Python 3, unlike the old
    # "except X, e" form.  Exit non-zero so callers can detect the failure.
    log('rappor_sim.py: FATAL: %s', e)
    sys.exit(1)
243