#!/bin/bash
#
# Usage:
#   ./dist.sh <function name>

set -o nounset
set -o pipefail
set -o errexit

readonly THIS_DIR=$(dirname "$0")
readonly RAPPOR_SRC=$(cd "$THIS_DIR/.." && pwd)

source "$RAPPOR_SRC/util.sh"  # log, banner
source "$RAPPOR_SRC/pipeline/tools-lib.sh"
source "$RAPPOR_SRC/pipeline/alarm-lib.sh"

# Path to the decode-dist binary; DEP_DECODE_DIST overrides it (e.g. for tests).
readonly DECODE_DIST=${DEP_DECODE_DIST:-$RAPPOR_SRC/bin/decode-dist}

readonly NUM_ARGS=7  # used for xargs: number of spec fields consumed per task

# Decode the distribution for a single (metric, date) task.
#
# Arguments:
#   $1 rappor_src   - root of the RAPPOR tree (job constant; currently unused here)
#   $2 timeout_secs - per-task timeout enforced by alarm-status
#   $3 min_reports  - tasks with fewer reports than this are skipped
#   $4..$10         - the 7-field task spec:
#                     num_reports metric_name date counts params map results_dir
# Outputs:
#   Writes spec.txt, log.txt, STATUS.txt and the decoder's result files under
#   $results_dir/$metric_name/$date.
decode-dist-one() {
  # Job constants
  local rappor_src=$1
  local timeout_secs=$2
  local min_reports=$3
  shift 3  # job constants do not vary per task and are not part of the spec

  # 7 spec variables
  local num_reports=$1  # unused, only for filtering
  local metric_name=$2
  local date=$3
  local counts=$4
  local params=$5
  local map=$6
  local results_dir=$7

  local task_dir=$results_dir/$metric_name/$date
  mkdir --verbose -p "$task_dir"

  local log_file=$task_dir/log.txt
  local status_file=$task_dir/STATUS.txt

  # Record the spec so we know params, counts, etc.
  # ($@ is now only the 7 spec fields, thanks to the shift above.)
  echo "$@" > "$task_dir/spec.txt"

  if test "$num_reports" -lt "$min_reports"; then
    local msg="SKIPPED because $num_reports reports is less than $min_reports"
    # Duplicate this message in both the status file and the log file.
    echo "$msg" > "$status_file"
    echo "$msg" > "$log_file"
    return
  fi

  # Run it with a timeout, and record status in the task dir.
  { time \
      alarm-status "$status_file" "$timeout_secs" \
        "$DECODE_DIST" \
          --counts "$counts" \
          --params "$params" \
          --map "$map" \
          --output-dir "$task_dir" \
          --adjust-counts-hack
  } > "$log_file" 2>&1

  # TODO: Don't pass --adjust-counts-hack unless the user asks for it.
}

# Print the number of processes to use.
# NOTE: This is copied from google/rappor regtest.sh.
# It also doesn't take into account the fact that we are memory-bound.
#
# 128 GiB / 4GiB would also imply about 32 processes though.
num-processes() {
  local processors
  # Fall back to 4 if /proc/cpuinfo is unavailable (e.g. non-Linux hosts).
  processors=$(grep -c ^processor /proc/cpuinfo || echo 4)
  if test "$processors" -gt 1; then  # leave one CPU for the OS
    processors=$((processors - 1))
  fi
  echo "$processors"
}

#readonly DEFAULT_MAX_PROCS=6  # for andychu2.hot, to avoid locking up UI
#readonly DEFAULT_MAX_PROCS=16  # for rappor-ac.hot, to avoid thrashing
readonly DEFAULT_MAX_PROCS=$(num-processes)

#readonly DEFAULT_MAX_TASKS=12
readonly DEFAULT_MAX_TASKS=10000  # more than the max

# NOTE: Since we have 125 GB RAM, and processes can take up to 12 gigs of RAM,
# only use parallelism of 10, even though we have 31 cores.

readonly DEFAULT_MIN_REPORTS=5000


# Fan out decode-dist-one over every task spec in a list file, in parallel.
#
# Arguments:
#   $1 job_dir      - directory for job-level metadata (pids/, system-mem.csv)
#   $2 spec_list    - file of whitespace-separated 7-field task specs
#   $3 timeout_secs - optional per-task timeout (default 1200)
#   $4 max_procs    - optional parallelism (default: num-processes)
#   $5 rappor_src   - optional source root (default $RAPPOR_SRC)
#   $6 min_reports  - optional skip threshold (default $DEFAULT_MIN_REPORTS)
decode-dist-many() {
  local job_dir=$1
  local spec_list=$2
  local timeout_secs=${3:-1200}  # default timeout
  local max_procs=${4:-$DEFAULT_MAX_PROCS}
  local rappor_src=${5:-$RAPPOR_SRC}
  local min_reports=${6:-$DEFAULT_MIN_REPORTS}

  local interval_secs=5
  local pid_dir="$job_dir/pids"
  local sys_mem="$job_dir/system-mem.csv"  # NOTE(review): set but not used below
  mkdir --verbose -p "$pid_dir"

  # Each xargs invocation re-enters this script ($0) with the job constants
  # prepended, followed by NUM_ARGS spec fields read from the list.
  time cat "$spec_list" \
    | xargs --verbose -n "$NUM_ARGS" -P "$max_procs" --no-run-if-empty -- \
        "$0" decode-dist-one "$rappor_src" "$timeout_secs" "$min_reports"
}

# Combine/summarize results and task metadata from the parallel decode-dist
# processes. Render them as HTML.
#
# Arguments:
#   $1 jobs_base_dir - directory containing all job directories
#   $2 job_dir       - directory for this job's combined output
combine-and-render-html() {
  local jobs_base_dir=$1
  local job_dir=$2

  banner "Combining dist task status"
  TOOLS-cook combine-dist-task-status "$jobs_base_dir" "$job_dir"

  banner "Combining dist results"
  TOOLS-cook combine-dist-results "$jobs_base_dir" "$job_dir"

  banner "Splitting out status per metric, and writing overview"
  TOOLS-cook dist-metric-status "$job_dir"

  # The task-status.csv file should have a JOB ID.
  banner "Building overview.html and per-metric HTML"
  TOOLS-gen-ui build-html1 "$job_dir"

  banner "Building individual results.html (for ONE day)"
  TOOLS-gen-ui results-html "$job_dir"
}

# Dispatch: run the function named by the first argument.
"$@"