#!/bin/bash
#
# Usage:
#   ./assoc.sh <function name>
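#
# Example invocations (paths are hypothetical; real job directories and task
# spec files are produced by the rest of the pipeline, e.g. task_spec.py):
#
#   ./assoc.sh decode-many _tmp/assoc-job _tmp/assoc-job/assoc-task-spec.txt
#   ./assoc.sh combine-and-render-html _tmp _tmp/assoc-job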

set -o nounset
set -o pipefail
set -o errexit

readonly THIS_DIR=$(dirname $0)
readonly RAPPOR_SRC=$(cd $THIS_DIR/.. && pwd)

source $RAPPOR_SRC/util.sh  # log, banner
source $RAPPOR_SRC/pipeline/tools-lib.sh
source $RAPPOR_SRC/pipeline/alarm-lib.sh

# Change the default location of these tools by setting DEP_*
readonly DECODE_ASSOC=${DEP_DECODE_ASSOC:-$RAPPOR_SRC/bin/decode-assoc}
readonly FAST_EM=${DEP_FAST_EM:-$RAPPOR_SRC/analysis/cpp/_tmp/fast_em}

# Run a single decode-assoc process, to analyze one variable pair for one
# metric.  The arguments to this function are one row of the task spec.
decode-one() {
  # Job constants, from decode-many
  local rappor_src=$1
  local timeout_secs=$2
  local min_reports=$3  # not yet used (see TODO below)
  local job_dir=$4
  local sample_size=$5

  # Task spec variables, from task_spec.py
  local num_reports=$6
  local metric_name=$7
  local date=$8  # for output naming only
  local reports=$9  # file with reports
  local var1=${10}
  local var2=${11}
  local map1=${12}
  local output_dir=${13}

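  # For illustration only: a hypothetical task spec row (arguments 6-13, as
  # split off by xargs in decode-many) might look like
  #
  #   120000 Settings 2015-12-01 /path/to/reports.csv domain flag /path/to/map1.csv /path/to/output_dir
  #
  # where all names and paths are placeholders, not real pipeline locations.
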
  local log_file=$output_dir/assoc-log.txt
  local status_file=$output_dir/assoc-status.txt
  mkdir --verbose -p $output_dir

  # Flags derived from job constants
  local schema=$job_dir/config/rappor-vars.csv
  local params_dir=$job_dir/config
  local em_executable=$FAST_EM

  # TODO:
  # - Skip jobs with few reports, like ./backfill.sh analyze-one.

  # Output the spec for combine_status.py.
  echo "$@" > $output_dir/assoc-spec.txt

  # NOTE: Not passing --num-cores since we're parallelizing already.

  # NOTE: --tmp-dir is the output dir.  Then we just delete all the .bin files
  # afterward so we don't copy them to x20 (they are big).

  { time \
      alarm-status $status_file $timeout_secs \
        $DECODE_ASSOC \
          --create-bool-map \
          --remove-bad-rows \
          --em-executable $em_executable \
          --schema $schema \
          --params-dir $params_dir \
          --metric-name $metric_name \
          --reports $reports \
          --var1 $var1 \
          --var2 $var2 \
          --map1 $map1 \
          --reports-sample-size $sample_size \
          --tmp-dir $output_dir \
          --output-dir $output_dir
  } >$log_file 2>&1
}

test-decode-one() {
  # NOTE: Incomplete smoke test.  decode-one expects 13 positional arguments
  # (5 job constants + 8 task spec tokens); with only $RAPPOR_SRC supplied it
  # will fail under 'set -o nounset'.
  decode-one $RAPPOR_SRC
}

readonly DEFAULT_MIN_REPORTS=5000

#readonly DEFAULT_TIMEOUT_SECONDS=300  # 5 minutes as a quick test.
readonly DEFAULT_TIMEOUT_SECONDS=3600  # 1 hour

readonly DEFAULT_MAX_PROCS=6  # TODO: Share with backfill.sh

# Limit to 1M for now.  Raise it when we have a full run.
readonly DEFAULT_SAMPLE_SIZE=1000000

readonly NUM_ARGS=8  # number of tokens in the task spec, used for xargs

# Run many decode-assoc processes in parallel.
decode-many() {
  local job_dir=$1
  local spec_list=$2

  # These 3 params affect speed
  local timeout_secs=${3:-$DEFAULT_TIMEOUT_SECONDS}
  local sample_size=${4:-$DEFAULT_SAMPLE_SIZE}
  local max_procs=${5:-$DEFAULT_MAX_PROCS}

  local rappor_src=${6:-$RAPPOR_SRC}
  local min_reports=${7:-$DEFAULT_MIN_REPORTS}

  time cat $spec_list \
    | xargs --verbose -n $NUM_ARGS -P $max_procs --no-run-if-empty -- \
      $0 decode-one $rappor_src $timeout_secs $min_reports $job_dir $sample_size
}
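
# For each group of NUM_ARGS (8) task spec tokens, xargs above re-invokes this
# script ($0) with 'decode-one' plus the 5 job constants, giving decode-one its
# 13 positional arguments.  A hypothetical expansion with the default settings:
#
#   ./assoc.sh decode-one $RAPPOR_SRC 3600 5000 _tmp/assoc-job 1000000 \
#     120000 Settings 2015-12-01 /path/to/reports.csv domain flag /path/to/map1.csv /path/to/output_dir
#
# Everything after the sample size is a placeholder spec row, as in decode-one.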

# Combine assoc results and render HTML.

combine-and-render-html() {
  local jobs_base_dir=$1
  local job_dir=$2

  banner "Combining assoc task status"
  TOOLS-cook combine-assoc-task-status $jobs_base_dir $job_dir

  banner "Combining assoc results"
  TOOLS-cook combine-assoc-results $jobs_base_dir $job_dir

  banner "Splitting out status per metric, and writing overview"
  TOOLS-cook assoc-metric-status $job_dir

  TOOLS-gen-ui symlink-static assoc $job_dir

  banner "Building overview .part.html from CSV"
  TOOLS-gen-ui assoc-overview-part-html $job_dir

  banner "Building metric .part.html from CSV"
  TOOLS-gen-ui assoc-metric-part-html $job_dir

  banner "Building pair .part.html from CSV"
  TOOLS-gen-ui assoc-pair-part-html $job_dir

  banner "Building day .part.html from CSV"
  TOOLS-gen-ui assoc-day-part-html $job_dir
}
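
# Typically run after decode-many has finished, e.g. (hypothetical paths):
#
#   ./assoc.sh combine-and-render-html _tmp _tmp/assoc-job
#
# where the second argument is the job_dir that decode-many populated.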

# Clean up temp .bin files left over by the fast_em R <-> C++ handoff.
list-and-remove-bin() {
  local job_dir=$1
  # If everything failed, we might not have anything to list/delete.
  find $job_dir -name \*.bin | xargs --no-run-if-empty -- ls -l --si
  find $job_dir -name \*.bin | xargs --no-run-if-empty -- rm -f --verbose
}
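
# Dispatch: the first command-line argument names a function defined in this
# file; any remaining arguments are passed to it.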
"$@"