xref: /aosp_15_r20/external/rappor/pipeline/csv_to_html.py (revision 2abb31345f6c95944768b5222a9a5ed3fc68cc00)
1#!/usr/bin/python
"""Reads a CSV file on stdin, and prints an HTML table on stdout.
3
The static HTML can then be made dynamic with JavaScript, e.g. jQuery
DataTable.
6
7Use Cases:
8
9  - overview.csv -- each row is a metric
10    - links: to metric page
11
12  - status.csv -- each row is a day
13    - links: to log.txt, to results.html
14"""
15
16import cgi
17import csv
18import optparse
19import sys
20
21import util
22
23
def CreateOptionsParser():
  """Build the command-line option parser for this script.

  Returns:
    An optparse.OptionParser.  --col-format, --def and --as-percent may each
    be given several times; their values accumulate into lists (dests
    col_formats, defs and percent_cols).  --table is a boolean flag.
  """
  p = optparse.OptionParser()

  # The CSV itself arrives on stdin; the flags only control formatting.
  p.add_option(
      '--col-format', dest='col_formats', metavar="'COLNAME FMT'", type='str',
      default=[], action='append',
      help='Add HTML links to the named column, using the given Python '
           '.format() string')

  p.add_option(
      '--def', dest='defs', metavar="'NAME VALUE'", type='str',
      default=[], action='append',
      help='Define variables for use in format strings')

  p.add_option(
      '--as-percent', dest='percent_cols', metavar="COLNAME", type='str',
      default=[], action='append',
      help='Format this floating point column as a percentage string')

  # TODO: We could include this by default, and then change all the HTML to
  # have <div> placeholders instead of <table>.
  p.add_option(
      '--table', dest='table', default=False, action='store_true',
      help='Add <table></table> tags (useful for testing)')

  return p
51
52
def ParseSpec(arg_list):
  """Turn a list of 'NAME VALUE' strings into a {name: value} dictionary.

  Each entry is split at its first space: the text before it is the key and
  everything after it (which may contain more spaces) is the value.  When a
  name appears more than once, the last entry wins.

  Raises:
    RuntimeError: if an entry contains no space at all.
  """
  spec = {}
  for entry in arg_list:
    key, sep, rest = entry.partition(' ')
    if not sep:  # no space found, so there is no value part
      raise RuntimeError('Invalid column format %r' % entry)
    spec[key] = rest
  return spec
64
65
def PrintRow(row, col_names, col_formats, defs, percent_cols):
  """Print a CSV row as HTML <td> cells on stdout, using the given formatting.

  Args:
    row: list of cell strings for one CSV data row.
    col_names: list of header names; cell i belongs to column col_names[i].
    col_formats: dict of column name -> Python .format() string used to
      render cells of that column (e.g. to wrap them in a link).  The format
      string may refer to any name in defs and to any column name.
    defs: dict of extra name -> value bindings for the format strings.
    percent_cols: column names whose numeric cells are shown as percentages.

  Returns:
    An array of booleans indicating whether each cell is a number.  It has
    one entry per column; columns the row has no cell for stay False.

  Raises:
    RuntimeError: if the row has more cells than there are columns, or if a
      format string refers to a name that is not bound.
  """
  # cgi.escape() was removed in Python 3.8; html.escape() replaces it.
  try:
    from html import escape
  except ImportError:
    from cgi import escape

  if len(row) > len(col_names):
    raise RuntimeError(
        'Row has %d cells but there are only %d columns: %r'
        % (len(row), len(col_names), row))

  is_number_flags = [False] * len(col_names)
  out = sys.stdout.write

  # Bindings shared by every format string in this row (defs plus the escaped
  # cells keyed by column name).  Built lazily, at most once per row.
  row_bindings = None

  for i, cell in enumerate(row):
    # The cell as a string.  By default we leave it as is; it may be replaced
    # by a formatted number below.
    cell_str = cell
    css_class = ''  # CSS class for the cell.
    col_name = col_names[i]  # column that the cell is under
    is_percent = col_name in percent_cols

    # Does the cell look like a float?
    try:
      cell_float = float(cell)
    except ValueError:
      pass
    else:
      if is_percent:  # Floats can be formatted as percentages.
        cell_str = '{:.1f}%'.format(cell_float * 100)
      else:
        # Arbitrarily use 3 digits of precision for display
        cell_str = '{:.3f}'.format(cell_float)
      css_class = 'num'
      is_number_flags[i] = True

    # Does it look like an int?  Then show it with thousands separators
    # instead of decimals.  Percent columns are skipped so that a cell such
    # as "1" is shown as "100.0%" rather than "1".
    if not is_percent:
      try:
        cell_int = int(cell)
      except ValueError:
        pass
      else:
        cell_str = '{:,}'.format(cell_int)
        css_class = 'num'
        is_number_flags[i] = True

    # Special CSS class for R NA values.
    if cell_str.strip() == 'NA':
      css_class = 'num na'  # num should right justify; na should make it red
      is_number_flags[i] = True

    if css_class:
      td_open = '    <td class="{}">'.format(css_class)
    else:
      td_open = '    <td>'

    # Quotes are escaped too, because format strings usually place the value
    # inside an attribute such as href="...".
    cell_safe = escape(cell_str, quote=True)

    # If the column has a format string, render the cell through it.
    fmt = col_formats.get(col_name)  # e.g. "../{date}.html"
    if fmt:
      if row_bindings is None:
        row_bindings = dict(defs)
        # Also let the format string use other column names.
        row_bindings.update(
            zip(col_names, [escape(c, quote=True) for c in row]))

      bindings = dict(row_bindings)
      bindings[col_name] = cell_safe  # this column uses the formatted value

      try:
        content = fmt.format(**bindings)
      except (KeyError, IndexError) as e:
        raise RuntimeError(
            'Format string %r for column %r uses undefined name %s'
            % (fmt, col_name, e))
    else:
      content = cell_safe

    # Single spaces around the content match what the original Python 2
    # "print x," statements emitted between items.
    out('%s %s </td>\n' % (td_open, content))

  return is_number_flags
135
136
def ReadCsv(f):
  """Parse CSV from f and split it into a header and the data rows.

  Args:
    f: an iterable of CSV lines, e.g. an open file.

  Returns:
    A (col_names, rows) pair.  col_names is the first CSV record (assumed to
    be the header) and rows is a list of all remaining records.  Both are
    empty lists when the input has no records.
  """
  reader = csv.reader(f)
  header = next(reader, [])  # empty input -> no column names
  return header, list(reader)
150
151
def PrintColGroup(col_names, col_is_numeric):
  """Print HTML colgroup element, used for JavaScript sorting.

  Args:
    col_names: list of column names, one <col> is printed for each.
    col_is_numeric: list of booleans parallel to col_names; True selects the
      'number' type, False the 'case-insensitive' type.
  """
  # cgi.escape() was removed in Python 3.8; html.escape() replaces it.
  try:
    from html import escape
  except ImportError:
    from cgi import escape

  out = sys.stdout.write
  out('<colgroup>\n')
  for i, col in enumerate(col_names):
    # The type attribute is used for sorting
    if col_is_numeric[i]:
      css_class = 'number'
    else:
      css_class = 'case-insensitive'

    # NOTE: id is a comment only; not used.  The name comes straight from the
    # CSV header, so escape it (quotes included) before putting it into an
    # attribute.
    out('  <col id="{}" type="{}" />\n'.format(
        escape(col, quote=True), css_class))
  out('</colgroup>\n')
165
166
def main(argv):
  """Read a CSV from stdin and print it on stdout as HTML table markup.

  Args:
    argv: command-line argument list, parsed with CreateOptionsParser().

  Raises:
    RuntimeError: if an option value is malformed or an --as-percent column
      does not exist in the CSV header.
  """
  # cgi.escape() was removed in Python 3.8; html.escape() replaces it.
  try:
    from html import escape
  except ImportError:
    from cgi import escape

  (opts, argv) = CreateOptionsParser().parse_args(argv)

  col_formats = ParseSpec(opts.col_formats)
  defs = ParseSpec(opts.defs)

  col_names, rows = ReadCsv(sys.stdin)

  for col in opts.percent_cols:
    if col not in col_names:
      raise RuntimeError('--as-percent %s is not a valid column' % col)

  out = sys.stdout.write

  # By default, we don't print the <table> bit -- that's up to the host page
  if opts.table:
    out('<table>\n')

  out('<thead>\n')
  for col in col_names:
    # change _ to space so long column names can wrap
    out('  <td>%s</td>\n' % escape(col.replace('_', ' '), quote=False))
  out('</thead>\n')

  # Assume all columns are numeric at first.  Look at each row for non-numeric
  # values.
  col_is_numeric = [True] * len(col_names)

  out('<tbody>\n')
  for row in rows:
    out('  <tr>\n')
    is_number_flags = PrintRow(row, col_names, col_formats, defs,
                               opts.percent_cols)

    # If one cell in a column is not a number, then the whole column isn't.
    col_is_numeric = [
        still_numeric and is_number
        for still_numeric, is_number in zip(col_is_numeric, is_number_flags)]

    out('  </tr>\n')
  out('</tbody>\n')

  # The colgroup is printed last because the column types are only known
  # after every row has been examined.
  PrintColGroup(col_names, col_is_numeric)

  if opts.table:
    out('</table>\n')
211
212
if __name__ == '__main__':
  # Report expected failures (bad options, unknown columns) as a one-line
  # message on stderr and a non-zero exit status instead of a traceback.
  try:
    main(sys.argv)
  except RuntimeError as e:
    sys.stderr.write('FATAL: %s\n' % e)
    sys.exit(1)
219