#!/bin/bash
#
# Usage:
#   ./assoc.sh <function name>

set -o nounset
set -o pipefail
set -o errexit

# Resolve the directory of this script and the root of the source tree (its
# parent).  Assignment is kept separate from 'readonly' so that a failing
# command substitution trips errexit instead of being masked by the exit
# status of 'readonly'.
THIS_DIR=$(dirname "$0")
readonly THIS_DIR
RAPPOR_SRC=$(cd "$THIS_DIR/.." && pwd)
readonly RAPPOR_SRC

source "$RAPPOR_SRC/util.sh"  # log, banner
source "$RAPPOR_SRC/pipeline/tools-lib.sh"
source "$RAPPOR_SRC/pipeline/alarm-lib.sh"

# Change the default location of these tools by setting DEP_*
readonly DECODE_ASSOC=${DEP_DECODE_ASSOC:-$RAPPOR_SRC/bin/decode-assoc}
readonly FAST_EM=${DEP_FAST_EM:-$RAPPOR_SRC/analysis/cpp/_tmp/fast_em}

# Run a single decode-assoc process, to analyze one variable pair for one
# metric.  The arguments to this function are the job constants followed by
# one row of the task spec.
decode-one() {
  # Positional arguments (13 total): five job constants followed by the eight
  # task-spec tokens.  Several are not used in this function body; they are
  # still bound to names to document the positional layout.
  #
  # Outputs (all under $output_dir):
  #   assoc-spec.txt    the arguments this function was called with
  #   assoc-log.txt     stdout/stderr of the decode command, plus 'time' output
  #   assoc-status.txt  path handed to alarm-status as its first argument
  #
  # Returns the exit status of the alarm-status invocation.

  # Job constants, from decode-many
  local rappor_src=$1
  local timeout_secs=$2
  local min_reports=$3
  local job_dir=$4
  local sample_size=$5

  # Task spec variables, from task_spec.py
  local num_reports=$6
  local metric_name=$7
  local date=$8  # for output naming only
  local reports=$9  # file with reports
  local var1=${10}
  local var2=${11}
  local map1=${12}
  local output_dir=${13}

  local log_file=$output_dir/assoc-log.txt
  local status_file=$output_dir/assoc-status.txt
  mkdir --verbose -p -- "$output_dir"

  # Flags derived from job constants
  local schema=$job_dir/config/rappor-vars.csv
  local params_dir=$job_dir/config
  local em_executable=$FAST_EM

  # TODO:
  # - Skip jobs with few reports, like ./backfill.sh analyze-one.

  # Output the spec for combine_status.py.
  echo "$@" > "$output_dir/assoc-spec.txt"

  # NOTE: Not passing --num-cores since we're parallelizing already.

  # NOTE: --tmp-dir is the output dir.  Then we just delete all the .bin files
  # afterward so we don't copy them to x20 (they are big).

  # Every expansion is quoted so that paths containing whitespace or glob
  # characters reach the tool as single, literal arguments.
  { time \
    alarm-status "$status_file" "$timeout_secs" \
      "$DECODE_ASSOC" \
        --create-bool-map \
        --remove-bad-rows \
        --em-executable "$em_executable" \
        --schema "$schema" \
        --params-dir "$params_dir" \
        --metric-name "$metric_name" \
        --reports "$reports" \
        --var1 "$var1" \
        --var2 "$var2" \
        --map1 "$map1" \
        --reports-sample-size "$sample_size" \
        --tmp-dir "$output_dir" \
        --output-dir "$output_dir"
  } >"$log_file" 2>&1
}

# NOTE(review): this looks like an unfinished smoke test -- decode-one reads 13
# positional arguments and only one is supplied here, so the call cannot get
# past the first missing argument when nounset is enabled.
test-decode-one() {
  decode-one "$RAPPOR_SRC"
}

readonly DEFAULT_MIN_REPORTS=5000

#readonly DEFAULT_TIMEOUT_SECONDS=300  # 5 minutes as a quick test.
readonly DEFAULT_TIMEOUT_SECONDS=3600  # 1 hour

readonly DEFAULT_MAX_PROCS=6  # TODO: Share with backfill.sh

# Limit to 1M for now.  Raise it when we have a full run.
readonly DEFAULT_SAMPLE_SIZE=1000000

readonly NUM_ARGS=8  # number of tokens in the task spec, used for xargs

# Run many decode-assoc processes in parallel.
decode-many() {
  # Arguments:
  #   $1  job_dir       job directory, forwarded to every decode-one call
  #   $2  spec_list     file of task-spec tokens, consumed NUM_ARGS at a time
  #   $3  timeout_secs  optional, defaults to DEFAULT_TIMEOUT_SECONDS
  #   $4  sample_size   optional, defaults to DEFAULT_SAMPLE_SIZE
  #   $5  max_procs     optional, defaults to DEFAULT_MAX_PROCS
  #   $6  rappor_src    optional, defaults to RAPPOR_SRC
  #   $7  min_reports   optional, defaults to DEFAULT_MIN_REPORTS
  local job_dir=$1
  local spec_list=$2

  # These 3 params affect speed
  local timeout_secs=${3:-$DEFAULT_TIMEOUT_SECONDS}
  local sample_size=${4:-$DEFAULT_SAMPLE_SIZE}
  local max_procs=${5:-$DEFAULT_MAX_PROCS}

  local rappor_src=${6:-$RAPPOR_SRC}
  local min_reports=${7:-$DEFAULT_MIN_REPORTS}

  # Re-invoke this script once per task: the five job constants come first,
  # then xargs appends NUM_ARGS tokens read from the spec list.
  time xargs --verbose -n "$NUM_ARGS" -P "$max_procs" --no-run-if-empty -- \
    "$0" decode-one \
    "$rappor_src" "$timeout_secs" "$min_reports" "$job_dir" "$sample_size" \
    < "$spec_list"
}

# Combine assoc results and render HTML.
#
# Arguments:
#   $1  jobs_base_dir  passed to the two "combine" steps
#   $2  job_dir        passed to every step
combine-and-render-html() {
  local jobs_base_dir=$1
  local job_dir=$2

  banner "Combining assoc task status"
  TOOLS-cook combine-assoc-task-status "$jobs_base_dir" "$job_dir"

  banner "Combining assoc results"
  TOOLS-cook combine-assoc-results "$jobs_base_dir" "$job_dir"

  banner "Splitting out status per metric, and writing overview"
  TOOLS-cook assoc-metric-status "$job_dir"

  TOOLS-gen-ui symlink-static assoc "$job_dir"

  banner "Building overview .part.html from CSV"
  TOOLS-gen-ui assoc-overview-part-html "$job_dir"

  banner "Building metric .part.html from CSV"
  TOOLS-gen-ui assoc-metric-part-html "$job_dir"

  banner "Building pair .part.html from CSV"
  TOOLS-gen-ui assoc-pair-part-html "$job_dir"

  banner "Building day .part.html from CSV"
  TOOLS-gen-ui assoc-day-part-html "$job_dir"
}

# Temp files left over by the fast_em R <-> C++.
#
# Lists, then deletes, every *.bin file below the given directory.
#
# Arguments:
#   $1  job_dir  directory tree to search
list-and-remove-bin() {
  local job_dir=$1
  # If everything failed, we might not have anything to list/delete.
  # NUL-delimited so that paths containing whitespace are handled as one name.
  find "$job_dir" -name '*.bin' -print0 \
    | xargs -0 --no-run-if-empty -- ls -l --si --
  find "$job_dir" -name '*.bin' -print0 \
    | xargs -0 --no-run-if-empty -- rm -f --verbose --
}

# Dispatch: run the function named by the first command-line argument, passing
# it the remaining arguments.
"$@"