xref: /aosp_15_r20/external/rappor/tests/rappor_sim.py (revision 2abb31345f6c95944768b5222a9a5ed3fc68cc00)
1#!/usr/bin/python
2#
3# Copyright 2014 Google Inc. All rights reserved.
4#
5# Licensed under the Apache License, Version 2.0 (the "License");
6# you may not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9#     http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16
17"""Run the RAPPOR Python client on simulated input.
18
19It takes a 3-column CSV file as generated by gen_reports.R, and outputs a 5
20column CSV of RAPPOR'd data.
21
22Input columns: client,cohort,value
23Output columns: client,cohort,bloom,prr,irr
24
25TODO:
26- cohort should be in the input _input.csv file.
27
28See http://google.github.io/rappor/doc/data-flow.html for details.
29"""
30
31import csv
32import collections
33import optparse
34import os
35import random
36import sys
37import time
38
39import rappor  # client library
40try:
41  import fastrand
42except ImportError:
43  print >>sys.stderr, (
44      "Native fastrand module not imported; see README for speedups")
45  fastrand = None
46
47
def log(msg, *args):
  """Write a diagnostic message, followed by a newline, to stderr.

  Args:
    msg: Message text.  It is treated as a %-style format string only when
      args are given, so a literal '%' is safe in a message logged without
      args.
    *args: Optional values interpolated into msg.
  """
  if args:
    msg = msg % args
  # Write to the stream directly rather than with the 'print >>' statement
  # form, which is Python 2 only (on Python 3 it raises TypeError).
  sys.stderr.write('%s\n' % (msg,))
52
53
def CreateOptionsParser():
  """Build the command line parser for the simulation.

  Returns:
    An optparse.OptionParser that understands the encoding size flags
    (--num-bits, --num-hashes, --num-cohorts), the probability flags
    (-p, -q, -f), --assoc-testdata, and the random algorithm selector -r.
  """
  parser = optparse.OptionParser()

  # Integer encoding parameters: (flag, destination, default, help text).
  int_flags = (
      ('--num-bits', 'num_bits', 16, 'Number of bloom filter bits.'),
      ('--num-hashes', 'num_hashes', 2, 'Number of hashes.'),
      ('--num-cohorts', 'num_cohorts', 64, 'Number of cohorts.'),
  )
  for flag, dest, default, help_text in int_flags:
    parser.add_option(
        flag, type='int', metavar='INT', dest=dest, default=default,
        help=help_text)

  # The three probabilities share everything except their one-letter name.
  for letter in ('p', 'q', 'f'):
    parser.add_option(
        '-' + letter, type='float', metavar='FLOAT', dest='prob_' + letter,
        default=1, help='Probability ' + letter)

  parser.add_option(
      '--assoc-testdata', type='int', dest='assoc_testdata', default=0,
      help='Generate association testdata from true values on stdin.')

  random_modes = ['simple', 'fast']
  mode_help = 'Random algorithm (%s)' % '|'.join(random_modes)
  parser.add_option(
      '-r', type='choice', metavar='STR', dest='random_mode',
      default='fast', choices=random_modes, help=mode_help)

  return parser
87
88
def GenAssocTestdata(params1, params2, irr_rand, assoc_testdata_count,
                     csv_in, csv_out):
  """Read pairs of true values from csv_in and write encoded pairs to csv_out.

  The first input row is a header naming the two variables; those names are
  reused for the last two output columns.  Every later row is replicated
  assoc_testdata_count times, each copy reported by a new client 'c<N>' in a
  randomly chosen cohort.  The first value is encoded as a string with
  params1; the second is parsed as an int and encoded as raw bits with
  params2.  TODO: Generalize this.

  Args:
    params1: rappor.Params for the first (string) variable.
    params2: rappor.Params for the second (boolean) variable.
    irr_rand: Randomness source handed to each rappor.Encoder.
    assoc_testdata_count: Number of times to replicate the input rows.
    csv_in: Iterable of 2-element rows, header row first.
    csv_out: Object with a writerow() method, e.g. a csv.writer.

  Raises:
    RuntimeError: If csv_in contains no header row.
    ValueError: If a row does not have exactly two columns, or the second
      value is not an integer literal.
  """
  rows_iter = iter(csv_in)
  try:
    header_row = next(rows_iter)
  except StopIteration:
    # Without this check the column names below would be unbound.
    raise RuntimeError('Expected a header row, got empty input')
  v1_name, v2_name = header_row

  # Materialize the data rows because they are replayed once per replication.
  rows = [(v1, v2) for v1, v2 in rows_iter]

  # Use the same column names
  csv_out.writerow(('client', 'cohort', v1_name, v2_name))

  report_index = 0
  for _ in range(assoc_testdata_count):
    for v1, v2 in rows:
      client_str = 'c%d' % report_index

      # randint(a, b) gives i such that a <= i <= b
      cohort = random.randint(0, params1.num_cohorts - 1)

      # Both variables are reported by the same client in the same cohort.
      string_encoder = rappor.Encoder(params1, cohort, client_str, irr_rand)
      bool_encoder = rappor.Encoder(params2, cohort, client_str, irr_rand)

      irr1 = string_encoder.encode(v1)

      # TODO: Convert to bool and encode with basic RAPPOR
      irr2 = bool_encoder.encode_bits(int(v2))

      irr1_str = rappor.bit_string(irr1, params1.num_bloombits)
      irr2_str = rappor.bit_string(irr2, params2.num_bloombits)

      csv_out.writerow((client_str, cohort, irr1_str, irr2_str))

      report_index += 1
136
137
def RapporClientSim(params, irr_rand, csv_in, csv_out):
  """Read true values from csv_in and output encoded values on csv_out.

  The first input row must be the header client,cohort,value.  For every
  following row one output row is written holding the client, the cohort and
  the bit strings of the bloom filter, the PRR and the IRR.

  Args:
    params: rappor.Params describing the encoding.
    irr_rand: Randomness source handed to each rappor.Encoder.
    csv_in: Iterable of 3-element rows (client, cohort, value), header first.
    csv_out: Object with a writerow() method, e.g. a csv.writer.

  Raises:
    RuntimeError: If the header row does not name the expected columns.
    ValueError: If a row does not have exactly three columns, or a cohort is
      not an integer literal.
  """
  csv_out.writerow(('client', 'cohort', 'bloom', 'prr', 'irr'))

  expected_header = ('client', 'cohort', 'value')

  # TODO: It would be more instructive/efficient to construct an encoder
  # instance up front per client, rather than one per row below.
  start_time = time.time()

  for i, (client_str, cohort_str, true_value) in enumerate(csv_in):
    if i == 0:
      # Validate the header column by column, then skip it.
      actual_header = (client_str, cohort_str, true_value)
      for expected, actual in zip(expected_header, actual_header):
        if actual != expected:
          raise RuntimeError(
              'Expected %s header, got %s' % (expected, actual))
      continue

    if i % 10000 == 0:
      elapsed = time.time() - start_time
      log('Processed %d inputs in %.2f seconds', i, elapsed)

    cohort = int(cohort_str)
    secret = client_str  # the client name is used as the encoder's secret
    e = rappor.Encoder(params, cohort, secret, irr_rand)

    # Real users should call e.encode().  For testing purposes, we also want
    # the PRR.
    bloom, prr, irr = e._internal_encode(true_value)

    bloom_str = rappor.bit_string(bloom, params.num_bloombits)
    prr_str = rappor.bit_string(prr, params.num_bloombits)
    irr_str = rappor.bit_string(irr, params.num_bloombits)

    csv_out.writerow((client_str, cohort_str, bloom_str, prr_str, irr_str))
178
179
def _ParamsFromFlags(opts, num_bloombits):
  """Return a new rappor.Params filled in from the parsed flags."""
  result = rappor.Params()
  result.num_bloombits = num_bloombits
  result.num_hashes = opts.num_hashes
  result.num_cohorts = opts.num_cohorts
  result.prob_p = opts.prob_p
  result.prob_q = opts.prob_q
  result.prob_f = opts.prob_f
  return result


def main(argv):
  """Encode the CSV read from stdin and write the result CSV to stdout.

  Args:
    argv: Full argument vector, program name included.  Only the flags are
      used; leftover positional arguments are ignored.

  Raises:
    RuntimeError: Propagated from RapporClientSim on a malformed header.
  """
  opts, _ = CreateOptionsParser().parse_args(argv)

  # Copy flags into params
  params = _ParamsFromFlags(opts, opts.num_bits)

  mode = opts.random_mode
  if mode not in ('simple', 'fast'):
    raise AssertionError

  if mode == 'fast' and fastrand:
    log('Using fastrand extension')
    # NOTE: This doesn't take 'rand'.  It's seeded in C with srand().
    irr_rand = fastrand.FastIrrRand(params)
  else:
    if mode == 'fast':
      log('Warning: fastrand module not importable; see README for build '
          'instructions.  Falling back to simple randomness.')
    irr_rand = rappor.SecureIrrRand(params)
  # Other possible implementations:
  # - random.SystemRandom (probably uses /dev/urandom on Linux)
  # - HMAC-SHA256 with another secret?  This could match C++ byte for byte.
  #   - or srand(0) might do it.

  csv_in = csv.reader(sys.stdin)
  csv_out = csv.writer(sys.stdout)

  if not opts.assoc_testdata:
    RapporClientSim(params, irr_rand, csv_in, csv_out)
    return

  # First variable is a string encoded with the bloom filter size from flags.
  params1 = _ParamsFromFlags(opts, opts.num_bits)
  # Second one is boolean, so 1 bit is enough.
  params2 = _ParamsFromFlags(opts, 1)

  GenAssocTestdata(
      params1, params2, irr_rand, opts.assoc_testdata, csv_in, csv_out)
236
237
if __name__ == "__main__":
  try:
    main(sys.argv)
  except RuntimeError as e:
    # 'except X as e' is accepted by Python 2.6+ and Python 3; the older
    # 'except X, e' spelling is a syntax error on Python 3.
    log('rappor_sim.py: FATAL: %s', e)
    # Exit non-zero so calling scripts can tell that the run failed.
    sys.exit(1)
242    log('rappor_sim.py: FATAL: %s', e)
243