xref: /aosp_15_r20/external/rappor/pipeline/metric_status.R (revision 2abb31345f6c95944768b5222a9a5ed3fc68cc00)
1*2abb3134SXin Li#!/usr/bin/Rscript
2*2abb3134SXin Li#
3*2abb3134SXin Li# Write an overview of task status, per-metric task status, task histograms.
4*2abb3134SXin Li
5*2abb3134SXin Lilibrary(data.table)
6*2abb3134SXin Lilibrary(ggplot2)
7*2abb3134SXin Li
8*2abb3134SXin Lioptions(stringsAsFactors = FALSE)  # get rid of annoying behavior
9*2abb3134SXin Li
# Print a sprintf()-style formatted message to stdout, terminated by a newline.
#
# Args:
#   fmt: Format string understood by sprintf().
#   ...: Values substituted into fmt.
Log <- function(fmt, ...) {
  formatted <- sprintf(fmt, ...)
  # If sprintf() produced several strings, they are joined with one space.
  cat(paste(formatted, collapse = " "), "\n", sep = "")
}
14*2abb3134SXin Li
# Largest non-NA element of 'values', or NA when every element is NA (or the
# vector is empty).
#
# The result is always returned as a double: the data.table group-by
# aggregations in this file get type errors otherwise.
MaybeMax <- function(values) {
  present <- values[!is.na(values)]
  if (length(present) > 0) {
    return(as.numeric(max(present)))
  }
  NA_real_
}
25*2abb3134SXin Li
# Arithmetic mean of the non-NA elements of 'values', or NA when every element
# is NA (or the vector is empty).
#
# The result is always returned as a double: the data.table group-by
# aggregations in this file get type errors otherwise.
MaybeMean <- function(values) {
  present <- values[!is.na(values)]
  if (length(present) > 0) {
    return(as.numeric(mean(present)))
  }
  NA_real_
}
36*2abb3134SXin Li
# Aggregate the per-task 'dist' summary by metric and write the result to
# <output_dir>/overview.csv.
#
# Args:
#   summary: Data frame of task results.  The columns read here are metric,
#            params_file, map_file, date, num_reports, status, seconds and
#            allocated_mass.
#   output_dir: Directory that receives overview.csv.
#
# Returns:
#   The aggregated data.table, sorted case-insensitively by metric name.
WriteDistOverview <- function(summary, output_dir) {
  tasks <- data.table(summary)  # the grouped aggregation is terser this way

  overview <- tasks[, list(
    params_file = unique(params_file),
    map_file = unique(map_file),
    days = length(date),
    max_num_reports = MaybeMax(num_reports),

    # Number of tasks in each final state.
    ok = sum(status == "OK"),
    fail = sum(status == "FAIL"),
    timeout = sum(status == "TIMEOUT"),
    skipped = sum(status == "SKIPPED"),

    # TODO: Need to document the meaning of these metrics.
    # Any of them can be NA.
    # Disabled memory columns (KiB -> MB):
    #max_vm5_peak_mb = MaybeMax(vm5_peak_kib * 1024 / 1e6),
    #mean_vm5_mean_mb = MaybeMean(vm5_mean_kib * 1024 / 1e6),

    mean_secs = MaybeMean(seconds),
    mean_allocated_mass = MaybeMean(allocated_mass)

    # Disabled: count of distinct failure reasons, useful when there are
    # different call stacks.
    #fail_reasons = length(unique(fail_reason[fail_reason != ""]))
  ), by = metric]

  # Order rows by metric name, ignoring case.
  overview <- overview[order(tolower(overview$metric)), ]

  csv_path <- file.path(output_dir, "overview.csv")
  write.csv(overview, file = csv_path, row.names = FALSE)
  Log("Wrote %s", csv_path)

  overview
}
75*2abb3134SXin Li
# For every metric in 'summary', write three CSV files under
# <output_dir>/<metric>/: status.csv, num_reports.csv and mass.csv.
#
# Args:
#   summary: Data frame of task results.  The columns read here are metric,
#            job_id, date, status, num_reports, seconds, allocated_mass and
#            num_rappor.
#   output_dir: Root directory; the per-metric subdirectories are not created
#               here (presumably an earlier pipeline step makes them --
#               TODO confirm).
WriteDistMetricStatus <- function(summary, output_dir) {
  tasks <- data.table(summary)
  metric_names <- unique(tasks$metric)

  # Pass 1: one status.csv per metric.
  for (metric_name in metric_names) {
    # Keep only the columns shown per task; params / map / metric are dropped.
    # (The vm5_peak_kib / vm5_mean_kib memory columns are currently left out.)
    task_rows <- tasks[tasks$metric == metric_name,
                       list(job_id, date, status,
                            num_reports,
                            seconds,
                            allocated_mass, num_rappor)]

    # Newest first.  A plain string sort is enough for YYYY-MM-DD dates.
    task_rows <- task_rows[order(task_rows$date, decreasing = TRUE), ]

    status_path <- file.path(output_dir, metric_name, "status.csv")
    write.csv(task_rows, file = status_path, row.names = FALSE)
    Log("Wrote %s", status_path)
  }

  # Pass 2: narrow two-column CSVs that exist only to be plotted by dygraphs.
  # TODO: can dygraphs do something smarter?  Maybe the column should be
  # selected in JavaScript and passed as an array rather than as CSV text.
  for (metric_name in metric_names) {
    reports_by_date <- tasks[tasks$metric == metric_name,
                             list(date, num_reports)]
    reports_path <- file.path(output_dir, metric_name, "num_reports.csv")
    # NOTE: dygraphs (only in Firefox?) rejects quoted dates such as
    # "2015-04-03".  Quoting cannot be disabled in general, since strings
    # containing double quotes would then produce invalid CSV, but these files
    # hold only dates and numbers, so it is safe here.
    write.csv(reports_by_date, file = reports_path, row.names = FALSE,
              quote = FALSE)
    Log("Wrote %s", reports_path)

    # Unallocated mass.  TODO: Write the other 2 vars too?
    mass_by_date <- tasks[tasks$metric == metric_name,
                          list(date,
                               unallocated_mass = 1.0 - allocated_mass)]

    mass_path <- file.path(output_dir, metric_name, "mass.csv")
    write.csv(mass_by_date, file = mass_path, row.names = FALSE,
              quote = FALSE)
    Log("Wrote %s", mass_path)
  }
}
123*2abb3134SXin Li
# Render a plot object to a PNG file and log the path that was written.
#
# Args:
#   p: Object to render with plot() (the callers in this file pass ggplot
#      objects).
#   outdir: Directory that receives the image.
#   filename: File name of the image inside outdir.
#   width: Image width in pixels.
#   height: Image height in pixels.
WritePlot <- function(p, outdir, filename, width = 800, height = 600) {
  out_path <- file.path(outdir, filename)
  png(out_path, width = width, height = height)
  # Close the device even if plotting fails, so that an error does not leave a
  # dangling graphics device (and open file handle) behind.
  tryCatch(plot(p), finally = dev.off())
  Log('Wrote %s', out_path)
}
131*2abb3134SXin Li
# Abort the script unless 'v' holds at least one non-NA value.
#
# ggplot fails with an unintuitive error when a histogram is given nothing but
# NAs, so the condition is reported explicitly and the process exits with
# status 1.
CheckHistogramInput <- function(v) {
  if (any(!is.na(v))) {
    return(invisible(NULL))
  }
  # Recover the expression the caller passed, for use in the message.
  arg_name <- deparse(substitute(v))
  Log('FATAL: All values in %s are NA (no successful runs?)', arg_name)
  quit(status = 1)
}
141*2abb3134SXin Li
# Write one histogram PNG per task-level quantity into output_dir:
# allocated_mass.png, num_rappor.png, num_reports.png, seconds.png and, when
# memory data is present, memory.png.
#
# Args:
#   s: Data frame of task results.  The columns read here are allocated_mass,
#      num_rappor, num_reports, seconds and vm5_peak_kib.
#   output_dir: Directory that receives the PNG files.
#
# The script exits (via CheckHistogramInput) if any of the first four columns
# contains only NA values.
WriteDistHistograms <- function(s, output_dir) {
  # Every histogram counts tasks on the y axis.
  task_count_label <- ylab("number of tasks")

  CheckHistogramInput(s$allocated_mass)
  mass_plot <- qplot(s$allocated_mass, geom = "histogram") +
    ggtitle("Allocated Mass by Task") +
    xlab("allocated mass") +
    task_count_label
  WritePlot(mass_plot, output_dir, "allocated_mass.png")

  CheckHistogramInput(s$num_rappor)
  strings_plot <- qplot(s$num_rappor, geom = "histogram") +
    ggtitle("Detected Strings by Task") +
    xlab("detected strings") +
    task_count_label
  WritePlot(strings_plot, output_dir, "num_rappor.png")

  CheckHistogramInput(s$num_reports)
  reports_plot <- qplot(s$num_reports / 1e6, geom = "histogram") +
    ggtitle("Raw Reports by Task") +
    xlab("millions of reports") +
    task_count_label
  WritePlot(reports_plot, output_dir, "num_reports.png")

  CheckHistogramInput(s$seconds)
  duration_plot <- qplot(s$seconds, geom = "histogram") +
    ggtitle("Analysis Duration by Task") +
    xlab("seconds") +
    task_count_label
  WritePlot(duration_plot, output_dir, "seconds.png")

  # NOTE: Skipped for 'series' jobs, which report no peak memory.
  if (any(!is.na(s$vm5_peak_kib))) {
    # KiB -> bytes -> megabytes (1e6 bytes).
    memory_plot <- qplot(s$vm5_peak_kib * 1024 / 1e6, geom = "histogram") +
      ggtitle("Peak Memory Usage by Task") +
      xlab("Peak megabytes (1e6 bytes) of memory") +
      task_count_label
    WritePlot(memory_plot, output_dir, "memory.png")
  }
}
184*2abb3134SXin Li
# Produce every 'dist' output: the per-metric CSV files, the histogram PNGs,
# and the aggregated overview.csv, in that order.
#
# Args:
#   s: Data frame of 'dist' task results.
#   output_dir: Root directory for all generated files.
#
# Returns:
#   The per-metric overview table produced by WriteDistOverview().
ProcessAllDist <- function(s, output_dir) {
  Log("dist: Writing per-metric status.csv")
  WriteDistMetricStatus(s, output_dir)

  Log("dist: Writing histograms")
  WriteDistHistograms(s, output_dir)

  Log("dist: Writing aggregated overview.csv")
  WriteDistOverview(s, output_dir)
}
195*2abb3134SXin Li
# Aggregate the per-task 'assoc' summary by metric and write the result to
# <output_dir>/assoc-overview.csv, the single CSV file loaded by
# assoc-overview.html.
#
# Args:
#   summary: Data frame of task results.  The columns read here are metric,
#            date, num_reports, status, total_elapsed_seconds and
#            em_elapsed_seconds.
#   output_dir: Directory that receives assoc-overview.csv.
#
# Returns:
#   The aggregated data.table, sorted case-insensitively by metric name.
WriteAssocOverview <- function(summary, output_dir) {
  tasks <- data.table(summary)  # the grouped aggregation is terser this way

  overview <- tasks[, list(
    # Disabled columns:
    #params_file = unique(params_file),
    #map_file = unique(map_file),

    days = length(date),
    max_num_reports = MaybeMax(num_reports),

    # Number of tasks in each final state.
    ok = sum(status == "OK"),
    fail = sum(status == "FAIL"),
    timeout = sum(status == "TIMEOUT"),
    skipped = sum(status == "SKIPPED"),

    mean_total_secs = MaybeMean(total_elapsed_seconds),
    mean_em_secs = MaybeMean(em_elapsed_seconds)
  ), by = list(metric)]

  # Order rows by metric name, ignoring case.
  overview <- overview[order(tolower(overview$metric)), ]

  csv_path <- file.path(output_dir, "assoc-overview.csv")
  write.csv(overview, file = csv_path, row.names = FALSE)
  Log("Wrote %s", csv_path)

  overview
}
227*2abb3134SXin Li
# Write the CSV files loaded by assoc-metric.html -- that is, one
# <output_dir>/<metric>/metric-status.csv for each metric name.  Each file has
# one row per (var1, var2) pair of that metric.
#
# Args:
#   summary: Data frame of task results.  The columns read here are metric,
#            var1, var2, date, num_reports, status, total_elapsed_seconds and
#            em_elapsed_seconds.
#   output_dir: Root directory; the per-metric subdirectories are not created
#               here.
WriteAssocMetricStatus <- function(summary, output_dir) {
  s <- data.table(summary)
  csv_list <- unique(s[, list(metric)])
  # seq_len() gives an empty sequence when there are no metrics; 1:nrow()
  # would iterate over c(1, 0) and try to process rows that do not exist.
  for (i in seq_len(nrow(csv_list))) {
    u <- csv_list[i, ]
    # Aggregate this metric's tasks per (var1, var2) pair.
    by_pair <- s[s$metric == u$metric,
                 list(days = length(date),
                      max_num_reports = MaybeMax(num_reports),

                      # summarize status
                      ok = sum(status == 'OK'),
                      fail = sum(status == 'FAIL'),
                      timeout = sum(status == 'TIMEOUT'),
                      skipped = sum(status == 'SKIPPED'),

                      mean_total_secs = MaybeMean(total_elapsed_seconds),
                      mean_em_secs = MaybeMean(em_elapsed_seconds)
                      ),
                 by = list(var1, var2)]

    # Case insensitive sort by var1 name
    by_pair <- by_pair[order(tolower(by_pair$var1)), ]

    csv_path <- file.path(output_dir, u$metric, 'metric-status.csv')
    write.csv(by_pair, file = csv_path, row.names = FALSE)
    Log("Wrote %s", csv_path)
  }
}
259*2abb3134SXin Li
# Relative directory of an association pair: <metric>/<var1>_X_<var2>, with
# every literal ".." in var2 replaced by "_".
#
# This naming convention is in task_spec.py AssocTaskSpec.
FormatAssocRelPath <- function(metric, var1, var2) {
  # fixed = TRUE: ".." is matched literally, not as a regular expression.
  pair_dir <- sprintf("%s_X_%s", var1, gsub("..", "_", var2, fixed = TRUE))
  file.path(metric, pair_dir)
}
266*2abb3134SXin Li
# Write the files loaded by assoc-pair.html -- that is, for each
# (metric, var1, var2) triple, a pair-status.csv and a pair-metadata.txt in
# the directory <output_dir>/<FormatAssocRelPath(metric, var1, var2)>.
#
# Args:
#   summary: Data frame of task results.  The columns read here are metric,
#            var1, var2, job_id, date, status, num_reports, d1, d2,
#            total_elapsed_seconds and em_elapsed_seconds.
#   output_dir: Root directory; the per-pair subdirectories are not created
#               here.
WriteAssocPairStatus <- function(summary, output_dir) {

  s <- data.table(summary)

  csv_list <- unique(s[, list(metric, var1, var2)])
  Log('CSV list:')
  print(csv_list)

  # Loop over the unique (metric, var1, var2) triples and write the files for
  # each one.  seq_len() gives an empty sequence when there are no rows;
  # 1:nrow() would iterate over c(1, 0) and process rows that do not exist.
  for (i in seq_len(nrow(csv_list))) {
    u <- csv_list[i, ]

    # Select the per-task columns for this triple.
    subframe <- s[s$metric == u$metric & s$var1 == u$var1 & s$var2 == u$var2,
                  list(job_id, date, status,
                       num_reports, d1, d2,
                       total_elapsed_seconds,
                       em_elapsed_seconds)]

    # Sort by descending date.  Alphabetical sort works fine for YYYY-MM-DD.
    subframe <- subframe[order(subframe$date, decreasing = TRUE), ]

    pair_rel_path <- FormatAssocRelPath(u$metric, u$var1, u$var2)

    csv_path <- file.path(output_dir, pair_rel_path, 'pair-status.csv')
    write.csv(subframe, file = csv_path, row.names = FALSE)
    Log("Wrote %s", csv_path)

    # Write a file with the raw variable names.  Parsed by ui.sh, to pass to
    # csv_to_html.py.
    meta_path <- file.path(output_dir, pair_rel_path, 'pair-metadata.txt')

    # One line per column of the triple: metric, var1, var2.
    # NOTE: The conversion from data.table to character vector relies on
    # options(stringsAsFactors = FALSE) to work correctly!
    lines <- as.character(u)
    writeLines(lines, con = meta_path)
    Log("Wrote %s", meta_path)
  }
}
308*2abb3134SXin Li
# Produce every 'assoc' output: the per-pair files, the per-metric
# metric-status.csv files, and the aggregated overview, in that order.
#
# Args:
#   s: Data frame of 'assoc' task results.
#   output_dir: Root directory for all generated files.
#
# Returns:
#   The per-metric overview table produced by WriteAssocOverview().
ProcessAllAssoc <- function(s, output_dir) {
  Log("assoc: Writing pair-status.csv for each variable pair in each metric")
  WriteAssocPairStatus(s, output_dir)

  Log("assoc: Writing metric-status.csv for each metric")
  WriteAssocMetricStatus(s, output_dir)

  Log("assoc: Writing aggregated overview.csv")
  WriteAssocOverview(s, output_dir)
}
319*2abb3134SXin Li
# Script entry point.
#
# Args:
#   argv: Character vector of command line arguments:
#         argv[[1]] action, either 'dist' or 'assoc';
#         argv[[2]] path of the input summary CSV;
#         argv[[3]] output directory.
#
# Stops with an error if fewer than three arguments are given or the action
# is not recognized.
main <- function(argv) {
  # Validate up front so a bad invocation gets a readable message instead of
  # "subscript out of bounds".
  if (length(argv) < 3) {
    stop("Usage: metric_status.R <dist|assoc> <input.csv> <output_dir>",
         call. = FALSE)
  }

  action <- argv[[1]]
  input <- argv[[2]]
  output_dir <- argv[[3]]

  if (!(action %in% c('dist', 'assoc'))) {
    stop(sprintf('Invalid action %s', action))
  }

  # increase ggplot font size globally
  theme_set(theme_grey(base_size = 16))

  summary <- read.csv(input)
  if (action == 'dist') {
    ProcessAllDist(summary, output_dir)
  } else {
    ProcessAllAssoc(summary, output_dir)
  }

  Log('Done')
}
340*2abb3134SXin Li
# Run main() only when this file is executed at top level (no active call
# frames), not when it is loaded from inside another function call.
if (length(sys.frames()) == 0L) {
  main(commandArgs(trailingOnly = TRUE))
}
344