#!/bin/bash
#
# Smoke tests for the decode-dist and decode-assoc tools.  Each function below
# is a test entry point; see usage() for how to invoke them.

# Print the help text to stdout.
usage() {
  cat <<'EOF'


  Simple smoke test for the decode-dist tool.  This will fail if your machine
  doesn't have the right R libraries.

  Usage:
    ./test.sh <function name>

  Example:
    ./test.sh decode-assoc-R-smoke  # test pure R implementation
    ./test.sh decode-assoc-cpp-smoke  # test with analysis/cpp/fast_em.cc
    ./test.sh decode-assoc-cpp-converge  # run for longer with C++
    ./test.sh decode-assoc-tensorflow

EOF
}

set -o nounset
set -o pipefail
set -o errexit

# Assign first, then mark readonly: 'readonly X=$(cmd)' would hide a failure of
# cmd from errexit.
THIS_DIR=$(dirname -- "$0")
readonly THIS_DIR
RAPPOR_SRC=$(cd -- "$THIS_DIR/.." && pwd)
readonly RAPPOR_SRC
readonly EM_CPP_EXECUTABLE=$RAPPOR_SRC/analysis/cpp/_tmp/fast_em

# Provides 'banner', used by write-assoc-testdata.
source "$RAPPOR_SRC/util.sh"

readonly ASSOC_TESTDATA_DIR=_tmp/decode-assoc-test
readonly DIST_TESTDATA_DIR=_tmp/decode-dist-test

# Clear the R cache for the map files.
#
# Args:
#   $1: directory to search (recursively) for '*.rda' files to delete.
clear-cached-files() {
  local dir=$1
  # -exec ... + runs nothing when there are no matches and is safe for file
  # names containing whitespace.
  find "$dir" -name '*.rda' -exec rm --verbose -- {} +
}

# Populate $DIST_TESTDATA_DIR/input with counts, map and params CSV files
# copied from an existing demo case.
write-dist-testdata() {
  local input_dir=$DIST_TESTDATA_DIR/input

  mkdir -p "$input_dir"

  clear-cached-files "$DIST_TESTDATA_DIR"

  # Right now, we copy a case from regtest.sh.  (./demo.sh quick-python creates
  # just this case)
  local case_dir=$RAPPOR_SRC/_tmp/python/demo3

  cp --verbose "$case_dir/1/case_counts.csv" "$input_dir/counts.csv"
  cp --verbose "$case_dir/case_map.csv" "$input_dir/map.csv"
  cp --verbose "$case_dir/case_params.csv" "$input_dir/params.csv"
}

# Write the decode-dist test data, run bin/decode-dist on it, then print the
# head of results.csv and all of metrics.json from the output directory.
decode-dist() {
  write-dist-testdata

  local output_dir=$DIST_TESTDATA_DIR

  local input_dir=$DIST_TESTDATA_DIR/input

  # Uses the ./demo.sh regtest files
  time "$RAPPOR_SRC/bin/decode-dist" \
    --counts "$input_dir/counts.csv" \
    --map "$input_dir/map.csv" \
    --params "$input_dir/params.csv" \
    --output-dir "$output_dir"

  echo
  head "$output_dir/results.csv"
  echo
  cat "$output_dir/metrics.json"
}

# Generate every input file used by the decode-assoc tests under
# $ASSOC_TESTDATA_DIR: simulated reports, files with malformed rows, the
# variable schema, the params file and the hashed candidate map.
write-assoc-testdata() {
  # 'build' has intermediate build files, 'input' is the final input to the
  # decode-assoc tool.
  local build_dir=$ASSOC_TESTDATA_DIR/build
  local input_dir=$ASSOC_TESTDATA_DIR/input

  mkdir -p "$build_dir" "$input_dir"

  clear-cached-files "$ASSOC_TESTDATA_DIR"

  cat >"$build_dir/true_values.csv" <<EOF
domain,flag..HTTPS
google.com,1
google.com,1
google.com,1
google.com,1
google.com,0
yahoo.com,1
yahoo.com,0
bing.com,1
bing.com,1
bing.com,0
EOF

  local num_bits=8
  local num_hashes=1
  local num_cohorts=128

  local prob_p=0.25
  local prob_q=0.75
  local prob_f=0.5

  # 10 items in the input.
  # NOTE(review): this comment used to say "50,000 items is enough to eyeball
  # accuracy of results", but the value passed below is 5000 -- confirm which
  # one is intended.
  local assoc_testdata_count=5000

  PYTHONPATH=$RAPPOR_SRC/client/python \
    "$RAPPOR_SRC/tests/rappor_sim.py" \
    --assoc-testdata "$assoc_testdata_count" \
    --num-bits "$num_bits" \
    --num-hashes "$num_hashes" \
    --num-cohorts "$num_cohorts" \
    -p "$prob_p" \
    -q "$prob_q" \
    -f "$prob_f" \
    < "$build_dir/true_values.csv" \
    > "$input_dir/reports.csv"

  # Output two bad rows: each row is missing one of the columns.
  cat >"$build_dir/bad_rows.txt" <<EOF
c0,0,10101010,
c0,0,,0
EOF

  # Make CSV file with the header
  cat - "$build_dir/bad_rows.txt" > "$input_dir/bad_rows.csv" <<EOF
client,cohort,domain,flag..HTTPS
EOF

  # Make reports file with bad rows
  cat "$input_dir/reports.csv" "$build_dir/bad_rows.txt" \
    > "$input_dir/reports_bad_rows.csv"

  # Define a string variable and a boolean variable.
  cat >"$input_dir/rappor-vars.csv" <<EOF
metric, var, var_type, params
m,domain,string,m_params
m,flag..HTTPS,boolean,m_params
EOF

  cat >"$input_dir/m_params.csv" <<EOF
k,h,m,p,q,f
$num_bits,$num_hashes,$num_cohorts,$prob_p,$prob_q,$prob_f
EOF

  # Add a string with a double quote to test quoting behavior
  cat >"$build_dir/domain_candidates.csv" <<EOF
google.com
yahoo.com
bing.com
q"q
EOF

  # Hash candidates to create map.
  "$RAPPOR_SRC/bin/hash-candidates" "$input_dir/m_params.csv" \
    < "$build_dir/domain_candidates.csv" \
    > "$input_dir/domain_map.csv"

  banner "Wrote testdata in $input_dir (intermediate files in $build_dir)"
}

# Helper function to run decode-assoc with testdata.
# Regenerates the test data, runs bin/decode-assoc on it, then prints the head
# of the 'assoc-*' output files followed by the true values for comparison.
#
# Globals:
#   ASSOC_TESTDATA_DIR, RAPPOR_SRC (read)
# Args:
#   $1: output directory (also passed as --tmp-dir)
#   $2...: extra flags appended after the default decode-assoc flags
decode-assoc-helper() {
  # Validate the required argument before doing the test-data generation.
  local output_dir=${1:?usage: decode-assoc-helper OUTPUT_DIR [FLAGS...]}
  shift

  write-assoc-testdata

  local build_dir=$ASSOC_TESTDATA_DIR/build
  local input_dir=$ASSOC_TESTDATA_DIR/input

  time "$RAPPOR_SRC/bin/decode-assoc" \
    --metric-name m \
    --schema "$input_dir/rappor-vars.csv" \
    --reports "$input_dir/reports.csv" \
    --params-dir "$input_dir" \
    --var1 domain \
    --var2 flag..HTTPS \
    --map1 "$input_dir/domain_map.csv" \
    --create-bool-map \
    --max-em-iters 10 \
    --num-cores 2 \
    --output-dir "$output_dir" \
    --tmp-dir "$output_dir" \
    "$@"

  # The glob stays outside the quotes so it still expands.
  head "$output_dir"/assoc-*

  # Print true values for comparison
  echo
  echo "$build_dir/true_values.csv:"
  cat "$build_dir/true_values.csv"
}

# Quick smoke test for R version.
decode-assoc-R-smoke() {
  local output_dir=_tmp/R
  mkdir -p "$output_dir"
  decode-assoc-helper "$output_dir"
}

# Test what happens when there are bad rows.
# Runs decode-assoc twice with --remove-bad-rows: once on the reports file
# that has bad rows appended, once on a file that contains only bad rows.
#
# Args:
#   $@: extra flags forwarded to decode-assoc-helper
decode-assoc-bad-rows() {
  local output_dir=_tmp/bad
  # write-assoc-testdata (invoked by decode-assoc-helper) writes the bad-row
  # files into this directory, so the --reports paths must point here.
  local input_dir=$ASSOC_TESTDATA_DIR/input
  mkdir -p "$output_dir"

  # Later flags override earlier ones

  # Reports + bad rows
  decode-assoc-helper "$output_dir" \
    --reports "$input_dir/reports_bad_rows.csv" \
    --remove-bad-rows \
    "$@"

  # ONLY bad rows
  decode-assoc-helper "$output_dir" \
    --reports "$input_dir/bad_rows.csv" \
    --remove-bad-rows \
    "$@"
}

# Build the C++ EM executable by running './run.sh build-fast-em' inside
# analysis/cpp.
build-em-executable() {
  pushd "$RAPPOR_SRC/analysis/cpp" >/dev/null
  ./run.sh build-fast-em
  popd >/dev/null
}

# Smoke test using the C++ EM executable.
#
# Args:
#   $@: extra flags forwarded to decode-assoc-helper
decode-assoc-cpp-smoke() {
  local output_dir=_tmp/cpp
  mkdir -p "$output_dir"

  build-em-executable

  decode-assoc-helper "$output_dir" \
    --em-executable "$EM_CPP_EXECUTABLE" "$@"
}

decode-assoc-cpp-converge() {
  # With the data we have, this converges and exits before 1000 iterations.
  decode-assoc-cpp-smoke --max-em-iters 1000
}

# Smoke test using analysis/tensorflow/fast_em.sh as the EM executable.
#
# Args:
#   $@: extra flags forwarded to decode-assoc-helper
decode-assoc-tensorflow() {
  local output_dir=_tmp/tensorflow
  mkdir -p "$output_dir"

  decode-assoc-helper "$output_dir" \
    --em-executable "$RAPPOR_SRC/analysis/tensorflow/fast_em.sh" "$@"
}

# Same as decode-assoc-tensorflow, but with --max-em-iters 1000.
decode-assoc-tensorflow-converge() {
  decode-assoc-tensorflow --max-em-iters 1000
}

# Dispatch: with no arguments print the help text, otherwise run the named
# function with the remaining arguments.
if [[ $# -eq 0 ]]; then
  usage
else
  "$@"
fi