#!/usr/bin/Rscript
#
# Write an overview of task status, per-metric task status, task histograms.

library(data.table)
library(ggplot2)

options(stringsAsFactors = FALSE)  # get rid of annoying behavior

# Print a sprintf()-style formatted message, followed by a newline.
#
# Args:
#   fmt: sprintf() format string.
#   ...: values substituted into fmt.
Log <- function(fmt, ...) {
  cat(sprintf(fmt, ...))
  cat('\n')
}

# max of non-NA values; NA if there are none
#
# The result is always a double (NA_real_ when empty), because data.table
# requires a consistent column type across groups; otherwise we get type
# errors.
MaybeMax <- function(values) {
  v <- values[!is.na(values)]
  if (length(v) == 0) {
    return(NA_real_)
  }
  as.numeric(max(v))
}

# mean of non-NA values; NA if there are none
#
# The result is always a double (NA_real_ when empty), because data.table
# requires a consistent column type across groups; otherwise we get type
# errors.
MaybeMean <- function(values) {
  v <- values[!is.na(values)]
  if (length(v) == 0) {
    return(NA_real_)
  }
  as.numeric(mean(v))
}

# Aggregate the task summary by metric and write it to
# <output_dir>/overview.csv.
#
# Args:
#   summary: data frame with (at least) the columns metric, params_file,
#     map_file, date, num_reports, status, seconds, allocated_mass.
#   output_dir: directory that receives overview.csv.
#
# Returns:
#   The aggregated data.table, sorted case-insensitively by metric name.
WriteDistOverview <- function(summary, output_dir) {
  s <- data.table(summary)  # data.table syntax is easier here

  by_metric <- s[ , list(
      params_file = unique(params_file),
      map_file = unique(map_file),
      days = length(date),
      max_num_reports = MaybeMax(num_reports),

      # summarize status
      ok = sum(status == 'OK'),
      fail = sum(status == 'FAIL'),
      timeout = sum(status == 'TIMEOUT'),
      skipped = sum(status == 'SKIPPED'),

      # TODO: Need to document the meaning of these metrics.
      # All could be NA
      # KiB -> MB
      #max_vm5_peak_mb = MaybeMax(vm5_peak_kib * 1024 / 1e6),
      #mean_vm5_mean_mb = MaybeMean(vm5_mean_kib * 1024 / 1e6),

      mean_secs = MaybeMean(seconds),
      mean_allocated_mass = MaybeMean(allocated_mass)

      # unique failure reasons
      # This can be used when there are different call stacks.
      #fail_reasons = length(unique(fail_reason[fail_reason != ""]))
      ), by=metric]

  # Case insensitive sort by metric name
  by_metric <- by_metric[order(tolower(by_metric$metric)), ]

  overview_path <- file.path(output_dir, 'overview.csv')
  write.csv(by_metric, file = overview_path, row.names = FALSE)
  Log("Wrote %s", overview_path)

  by_metric
}

# Write status.csv, num_reports.csv, and mass.csv for each metric, each under
# <output_dir>/<metric>/.
#
# Args:
#   summary: data frame with (at least) the columns metric, job_id, date,
#     status, num_reports, seconds, allocated_mass, num_rappor.
#   output_dir: root directory; files go into one subdirectory per metric.
WriteDistMetricStatus <- function(summary, output_dir) {
  s <- data.table(summary)

  # loop over unique metrics, and write a CSV for each one
  for (m in unique(s$metric)) {
    # Select cols, and convert units.  Don't need params / map / metric.
    subframe <- s[s$metric == m,
                  list(job_id, date, status,
                       #vm5_peak_mb = vm5_peak_kib * 1024 / 1e6,
                       #vm5_mean_mb = vm5_mean_kib * 1024 / 1e6,
                       num_reports,
                       seconds,
                       allocated_mass, num_rappor)]

    # Sort by descending date.  Alphabetical sort works fine for YYYY-MM-DD.
    subframe <- subframe[order(subframe$date, decreasing = TRUE), ]

    out_path <- file.path(output_dir, m, 'status.csv')
    write.csv(subframe, file = out_path, row.names = FALSE)
    Log("Wrote %s", out_path)
  }

  # This one is just for plotting with dygraphs.  TODO: can dygraphs do
  # something smarter?  Maybe you need to select the column in JavaScript, and
  # pass it an array, rather than CSV text.
  for (m in unique(s$metric)) {
    f1 <- s[s$metric == m, list(date, num_reports)]
    path1 <- file.path(output_dir, m, 'num_reports.csv')
    # NOTE: dygraphs (only in Firefox?) doesn't like the quotes around
    # "2015-04-03".  In general, we can't turn off quotes, because strings with
    # double quotes will be invalid CSV files.  But in this case, we only have
    # date and number columns, so we can.  dygraphs is mistaken here.
    write.csv(f1, file = path1, row.names = FALSE, quote = FALSE)
    Log("Wrote %s", path1)

    # Write unallocated mass.  TODO: Write the other 2 vars too?
    f2 <- s[s$metric == m,
            list(date,
                 unallocated_mass = 1.0 - allocated_mass)]

    path2 <- file.path(output_dir, m, 'mass.csv')
    write.csv(f2, file = path2, row.names = FALSE, quote = FALSE)
    Log("Wrote %s", path2)
  }
}

# Render plot object p to a PNG file at <outdir>/<filename>.
#
# Args:
#   p: object to draw with plot().
#   outdir: directory that receives the image.
#   filename: name of the PNG file inside outdir.
#   width, height: image size passed to png().
WritePlot <- function(p, outdir, filename, width = 800, height = 600) {
  path <- file.path(outdir, filename)
  png(path, width = width, height = height)
  # Close the device even when plot() raises, so a failed plot doesn't leave
  # the png device open.
  tryCatch(plot(p), finally = dev.off())
  Log('Wrote %s', path)
}

# Make sure the histogram has some valid input.  If we don't do this, ggplot
# blows up with an unintuitive error message.
#
# Logs a FATAL message naming the argument expression and exits the R process
# with status 1 when every value in v is NA.
CheckHistogramInput <- function(v) {
  if (all(is.na(v))) {
    arg_name <- deparse(substitute(v))  # R idiom to get name
    Log('FATAL: All values in %s are NA (no successful runs?)', arg_name)
    quit(status = 1)
  }
}

# Helper for WriteDistHistograms: draw one histogram of values and write it to
# <output_dir>/<filename>.  The y axis is always labeled "number of tasks".
WriteHistogram <- function(values, title, x_label, output_dir, filename) {
  p <- qplot(values, geom = "histogram")
  t <- ggtitle(title)
  x <- xlab(x_label)
  y <- ylab("number of tasks")
  WritePlot(p + t + x + y, output_dir, filename)
}

# Write one histogram PNG per task measurement (allocated mass, detected
# strings, raw reports, duration, and -- when available -- peak memory).
#
# Args:
#   s: data frame with columns allocated_mass, num_rappor, num_reports,
#     seconds, and optionally vm5_peak_kib.
#   output_dir: directory that receives the PNG files.
#
# Each input is checked right before its plot; CheckHistogramInput is called
# with the s$column expression directly so its FATAL message names the column.
WriteDistHistograms <- function(s, output_dir) {
  CheckHistogramInput(s$allocated_mass)
  WriteHistogram(s$allocated_mass, "Allocated Mass by Task",
                 "allocated mass", output_dir, 'allocated_mass.png')

  CheckHistogramInput(s$num_rappor)
  WriteHistogram(s$num_rappor, "Detected Strings by Task",
                 "detected strings", output_dir, 'num_rappor.png')

  CheckHistogramInput(s$num_reports)
  WriteHistogram(s$num_reports / 1e6, "Raw Reports by Task",
                 "millions of reports", output_dir, 'num_reports.png')

  CheckHistogramInput(s$seconds)
  WriteHistogram(s$seconds, "Analysis Duration by Task",
                 "seconds", output_dir, 'seconds.png')

  # NOTE: Skipping this for 'series' jobs.
  if (sum(!is.na(s$vm5_peak_kib)) > 0) {
    WriteHistogram(s$vm5_peak_kib * 1024 / 1e6, "Peak Memory Usage by Task",
                   "Peak megabytes (1e6 bytes) of memory", output_dir,
                   'memory.png')
  }
}

# Run all 'dist' outputs: per-metric CSVs, histograms, then overview.csv.
# Returns the value of WriteDistOverview (the aggregated data.table).
ProcessAllDist <- function(s, output_dir) {
  Log('dist: Writing per-metric status.csv')
  WriteDistMetricStatus(s, output_dir)

  Log('dist: Writing histograms')
  WriteDistHistograms(s, output_dir)

  Log('dist: Writing aggregated overview.csv')
  WriteDistOverview(s, output_dir)
}

# Write the single CSV file loaded by assoc-overview.html.
WriteAssocOverview <- function(summary, output_dir) {
  # Aggregates the association task summary by metric and writes it to
  # <output_dir>/assoc-overview.csv.
  #
  # Args:
  #   summary: data frame with (at least) the columns metric, date,
  #     num_reports, status, total_elapsed_seconds, em_elapsed_seconds.
  #   output_dir: directory that receives assoc-overview.csv.
  #
  # Returns:
  #   The aggregated data.table, sorted case-insensitively by metric name.
  s <- data.table(summary)  # data.table syntax is easier here

  by_metric <- s[ , list(
      #params_file = unique(params_file),
      #map_file = unique(map_file),

      days = length(date),
      max_num_reports = MaybeMax(num_reports),

      # summarize status
      ok = sum(status == 'OK'),
      fail = sum(status == 'FAIL'),
      timeout = sum(status == 'TIMEOUT'),
      skipped = sum(status == 'SKIPPED'),

      mean_total_secs = MaybeMean(total_elapsed_seconds),
      mean_em_secs = MaybeMean(em_elapsed_seconds)

      ), by=list(metric)]

  # Case insensitive sort by metric name
  by_metric <- by_metric[order(tolower(by_metric$metric)), ]

  overview_path <- file.path(output_dir, 'assoc-overview.csv')
  write.csv(by_metric, file = overview_path, row.names = FALSE)
  Log("Wrote %s", overview_path)

  by_metric
}

# Write the CSV files loaded by assoc-metric.html -- that is, one
# metric-status.csv for each metric name.
#
# Args:
#   summary: data frame with (at least) the columns metric, var1, var2, date,
#     num_reports, status, total_elapsed_seconds, em_elapsed_seconds.
#   output_dir: root directory; each file goes to
#     <output_dir>/<metric>/metric-status.csv.
WriteAssocMetricStatus <- function(summary, output_dir) {
  s <- data.table(summary)
  csv_list <- unique(s[, list(metric)])
  # seq_len() rather than 1:nrow(): an empty summary must produce zero
  # iterations, not the sequence c(1, 0).
  for (i in seq_len(nrow(csv_list))) {
    u <- csv_list[i, ]
    # Aggregate this metric's rows per (var1, var2) pair.
    by_pair <- s[s$metric == u$metric,
                 list(days = length(date),
                      max_num_reports = MaybeMax(num_reports),

                      # summarize status
                      ok = sum(status == 'OK'),
                      fail = sum(status == 'FAIL'),
                      timeout = sum(status == 'TIMEOUT'),
                      skipped = sum(status == 'SKIPPED'),

                      mean_total_secs = MaybeMean(total_elapsed_seconds),
                      mean_em_secs = MaybeMean(em_elapsed_seconds)
                      ),
                 by=list(var1, var2)]

    # Case insensitive sort by var1 name
    by_pair <- by_pair[order(tolower(by_pair$var1)), ]

    csv_path <- file.path(output_dir, u$metric, 'metric-status.csv')
    write.csv(by_pair, file = csv_path, row.names = FALSE)
    Log("Wrote %s", csv_path)
  }
}

# This naming convention is in task_spec.py AssocTaskSpec.
#
# Returns the relative path <metric>/<var1>_X_<var2>, where every literal '..'
# in var2 is replaced by '_'.
FormatAssocRelPath <- function(metric, var1, var2) {
  v2 <- gsub('..', '_', var2, fixed = TRUE)
  var_dir <- sprintf('%s_X_%s', var1, v2)
  file.path(metric, var_dir)
}

# Write the CSV files loaded by assoc-pair.html -- that is, one pair-status.csv
# for each (metric, var1, var2) pair.
WriteAssocPairStatus <- function(summary, output_dir) {
  # For every unique (metric, var1, var2) triple, writes pair-status.csv and
  # pair-metadata.txt under <output_dir>/<FormatAssocRelPath(...)>/.
  #
  # Args:
  #   summary: data frame with (at least) the columns metric, var1, var2,
  #     job_id, date, status, num_reports, d1, d2, total_elapsed_seconds,
  #     em_elapsed_seconds.
  #   output_dir: root directory for the per-pair output files.

  s <- data.table(summary)

  csv_list <- unique(s[, list(metric, var1, var2)])
  Log('CSV list:')
  print(csv_list)

  # Loop over unique (metric, var1, var2) triples, and write a CSV for each
  # one.  seq_len() rather than 1:nrow(): an empty summary must produce zero
  # iterations, not the sequence c(1, 0).
  for (i in seq_len(nrow(csv_list))) {
    u <- csv_list[i, ]

    # Select cols.  Don't need params / map / metric.
    subframe <- s[s$metric == u$metric & s$var1 == u$var1 & s$var2 == u$var2,
                  list(job_id, date, status,
                       num_reports, d1, d2,
                       total_elapsed_seconds,
                       em_elapsed_seconds)]

    # Sort by descending date.  Alphabetical sort works fine for YYYY-MM-DD.
    subframe <- subframe[order(subframe$date, decreasing = TRUE), ]

    pair_rel_path <- FormatAssocRelPath(u$metric, u$var1, u$var2)

    csv_path <- file.path(output_dir, pair_rel_path, 'pair-status.csv')
    write.csv(subframe, file = csv_path, row.names = FALSE)
    Log("Wrote %s", csv_path)

    # Write a file with the raw variable names.  Parsed by ui.sh, to pass to
    # csv_to_html.py.
    meta_path <- file.path(output_dir, pair_rel_path, 'pair-metadata.txt')

    # NOTE: The conversion from data.table to character vector requires
    # stringsAsFactors to work correctly!
    lines <- as.character(u)
    writeLines(lines, con = meta_path)
    Log("Wrote %s", meta_path)
  }
}

# Run all 'assoc' outputs: per-pair files, per-metric CSVs, then the overview.
# Returns the value of WriteAssocOverview (the aggregated data.table).
ProcessAllAssoc <- function(s, output_dir) {
  Log('assoc: Writing pair-status.csv for each variable pair in each metric')
  WriteAssocPairStatus(s, output_dir)

  Log('assoc: Writing metric-status.csv for each metric')
  WriteAssocMetricStatus(s, output_dir)

  Log('assoc: Writing aggregated overview.csv')
  WriteAssocOverview(s, output_dir)
}

# Script entry point.
#
# Args:
#   argv: character vector: action ('dist' or 'assoc'), path of the input
#     summary CSV, output directory.
#
# Stops with an error when fewer than 3 arguments are given or the action is
# not recognized; both are checked before any file is read.
main <- function(argv) {
  if (length(argv) < 3) {
    stop(sprintf(
        'Expected 3 arguments (action, input CSV, output dir); got %d',
        length(argv)))
  }

  action <- argv[[1]]
  input <- argv[[2]]
  output_dir <- argv[[3]]

  if (!(action %in% c('dist', 'assoc'))) {
    stop(sprintf('Invalid action %s', action))
  }

  # increase ggplot font size globally
  theme_set(theme_grey(base_size = 16))

  summary <- read.csv(input)
  if (action == 'dist') {
    ProcessAllDist(summary, output_dir)
  } else {
    ProcessAllAssoc(summary, output_dir)
  }

  Log('Done')
}

# Only run main() when executed as a script (no enclosing call frames), not
# when this file is source()d from another function.
if (length(sys.frames()) == 0) {
  main(commandArgs(TRUE))
}