xref: /aosp_15_r20/external/cronet/testing/libfuzzer/dictionary_generator.py (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1*6777b538SAndroid Build Coastguard Worker#!/usr/bin/env python3
2*6777b538SAndroid Build Coastguard Worker#
3*6777b538SAndroid Build Coastguard Worker# Copyright 2016 The Chromium Authors
4*6777b538SAndroid Build Coastguard Worker# Use of this source code is governed by a BSD-style license that can be
5*6777b538SAndroid Build Coastguard Worker# found in the LICENSE file.
6*6777b538SAndroid Build Coastguard Worker
7*6777b538SAndroid Build Coastguard Worker"""Generate a dictionary for libFuzzer or AFL-based fuzzer.
8*6777b538SAndroid Build Coastguard Worker
Invoked manually using a fuzzer binary and a target format/protocol
specification. Works better for text formats or protocols; for binary ones it
may be useless.
11*6777b538SAndroid Build Coastguard Worker"""
12*6777b538SAndroid Build Coastguard Worker
13*6777b538SAndroid Build Coastguard Workerimport argparse
14*6777b538SAndroid Build Coastguard Workerimport HTMLParser
15*6777b538SAndroid Build Coastguard Workerimport io
16*6777b538SAndroid Build Coastguard Workerimport logging
17*6777b538SAndroid Build Coastguard Workerimport os
18*6777b538SAndroid Build Coastguard Workerimport re
19*6777b538SAndroid Build Coastguard Workerimport shutil
20*6777b538SAndroid Build Coastguard Workerimport string
21*6777b538SAndroid Build Coastguard Workerimport subprocess
22*6777b538SAndroid Build Coastguard Workerimport sys
23*6777b538SAndroid Build Coastguard Workerimport tempfile
24*6777b538SAndroid Build Coastguard Worker
25*6777b538SAndroid Build Coastguard Worker
# Encodings tried, one after another, when decoding the data read from the
# binary in search of embedded strings.
ENCODING_TYPES = ['ascii', 'utf_16_be', 'utf_16_le', 'utf_32_be', 'utf_32_le']
# Default minimum length of a character run for it to be extracted as a string.
MIN_STRING_LENGTH = 4
28*6777b538SAndroid Build Coastguard Worker
29*6777b538SAndroid Build Coastguard Worker
def DecodeHTML(html_data):
  """Decode HTML entities and drop every non-ASCII character.

  Args:
    html_data: HTML text, either as `bytes` or as `str`.

  Returns:
    An ASCII-only `str` in which entities such as `&amp;` are replaced by the
    characters they stand for.
  """
  # Imported locally: the Python 2 `HTMLParser` module and its `unescape()`
  # method are not available under Python 3, `html.unescape()` replaces them.
  import html

  if isinstance(html_data, bytes):
    html_data = html_data.decode('ascii', 'ignore')

  unescaped = html.unescape(html_data)
  # Entities may expand to non-ASCII characters; remove those (together with
  # any non-ASCII characters of the input) while staying in text form.
  return unescaped.encode('ascii', 'ignore').decode('ascii')
35*6777b538SAndroid Build Coastguard Worker
36*6777b538SAndroid Build Coastguard Worker
def EscapeDictionaryElement(element):
  """Escape an element so that it can be written as a quoted dictionary entry.

  Backslash and double quote are escaped with a backslash, the remaining
  printable ASCII characters (single quote included) are kept verbatim, and
  every other byte is written as a two-digit hexadecimal escape.

  Args:
    element: the word to escape, either `bytes` or `str`; a `str` is escaped
      through its UTF-8 encoded bytes.

  Returns:
    The escaped element as an ASCII-only `str`.
  """
  # The 'string_escape' codec used previously exists only in Python 2, so the
  # escaping is done explicitly, byte by byte.
  if not isinstance(element, bytes):
    element = element.encode('utf-8')

  escaped = []
  for byte in bytearray(element):
    char = chr(byte)
    if char in ('\\', '"'):
      escaped.append('\\' + char)
    elif 0x20 <= byte < 0x7F:
      # Printable ASCII. The single quote deliberately stays unescaped because
      # an escaped one breaks libFuzzer.
      escaped.append(char)
    else:
      escaped.append('\\x%02x' % byte)

  return ''.join(escaped)
45*6777b538SAndroid Build Coastguard Worker
46*6777b538SAndroid Build Coastguard Worker
def ExtractWordsFromBinary(filepath, min_length=MIN_STRING_LENGTH):
  """Extract words (whitespace-separated strings) from a binary executable.

  Args:
    filepath: path to the binary executable to scan.
    min_length: minimum length of a run of printable characters for the run to
      be treated as a string.

  Returns:
    A set of words (`str`) found in the data returned by
    PreprocessAndReadRodata().
  """
  rodata = PreprocessAndReadRodata(filepath)
  words = set()

  strings_re = re.compile(r'[^\x00-\x1F\x7F-\xFF]{%d,}' % min_length)
  # Use different encodings for strings extraction.
  for encoding in ENCODING_TYPES:
    # Drop everything that is not ASCII but stay in text form: under Python 3 a
    # `str` pattern cannot be matched against `bytes`, and `bytes` words would
    # never compare equal to the `str` words taken from a specification.
    text = rodata.decode(encoding, 'ignore')
    text = text.encode('ascii', 'ignore').decode('ascii')
    for raw_string in strings_re.findall(text):
      words.update(raw_string.split())

  return words
61*6777b538SAndroid Build Coastguard Worker
62*6777b538SAndroid Build Coastguard Worker
def ExtractWordsFromLines(lines):
  """Collect the unique whitespace-separated words found in |lines|."""
  return {token for text in lines for token in text.split()}
71*6777b538SAndroid Build Coastguard Worker
72*6777b538SAndroid Build Coastguard Worker
def ExtractWordsFromSpec(filepath, is_html):
  """Return the set of whitespace-separated words of a specification file."""
  return set(ReadSpecification(filepath, is_html).split())
78*6777b538SAndroid Build Coastguard Worker
79*6777b538SAndroid Build Coastguard Worker
def FindIndentedText(text):
  """Find space-indented text blocks, e.g. code or data samples in RFCs.

  A block starts at a line that has more leading whitespace than the previous
  non-empty line and grows while the following non-empty lines keep exactly
  the same indentation. It is saved when a less indented line is met or when
  the text ends. A line indented deeper than the current block starts a new
  block in its place.

  Args:
    text: the text to scan, as a single string.

  Returns:
    A list of the blocks found, in order of appearance. The lines of a block
    are stored without their leading whitespace and joined with newlines.
  """
  indented_blocks = []
  current_block = ''
  previous_number_of_spaces = 0

  # Go through every line and concatenate space-indented blocks into lines.
  # (Plain iteration instead of xrange(), which does not exist in Python 3.)
  for line in text.split('\n'):
    if not line:
      # Ignore empty lines.
      continue

    # Space-indented text blocks have more leading spaces than regular text.
    n = FindNumberOfLeadingSpaces(line)

    if n > previous_number_of_spaces:
      # Beginning of a space-indented text block, start concatenation.
      current_block = line[n:]
    elif n == previous_number_of_spaces and current_block:
      # Or continuation of a space-indented text block, concatenate lines.
      current_block += '\n' + line[n:]
    elif n < previous_number_of_spaces and current_block:
      # Current line is less indented, save previously concatenated lines.
      indented_blocks.append(current_block)
      current_block = ''

    previous_number_of_spaces = n

  # A block that runs until the very end of the text has no less indented line
  # after it, so it has to be saved here or it would be lost.
  if current_block:
    indented_blocks.append(current_block)

  return indented_blocks
111*6777b538SAndroid Build Coastguard Worker
112*6777b538SAndroid Build Coastguard Worker
def FindNumberOfLeadingSpaces(line):
  """Count the whitespace characters at the very beginning of |line|."""
  for index, char in enumerate(line):
    if not char.isspace():
      return index

  # Empty line, or a line made of whitespace only.
  return len(line)
120*6777b538SAndroid Build Coastguard Worker
121*6777b538SAndroid Build Coastguard Worker
def GenerateDictionary(path_to_binary, path_to_spec, strategy, is_html=False):
  """Build the set of dictionary words for a fuzzer binary and its spec.

  |strategy| is a string of one-letter flags which may be combined:
    i - words common to the binary and the specification;
    q - words from the space-indented text blocks of the specification;
    u - uppercase words from the specification.

  Logs an error and exits with status 1 if one of the two paths doesn't exist.
  """
  for required_path in (path_to_binary, path_to_spec):
    if os.path.exists(required_path):
      continue
    logging.error('%s doesn\'t exist. Exit.', required_path)
    sys.exit(1)

  binary_words = ExtractWordsFromBinary(path_to_binary)
  spec_words = ExtractWordsFromSpec(path_to_spec, is_html)

  if 'i' in strategy:
    # Strategy i: only words which are common for binary and for specification.
    result = binary_words & spec_words
  else:
    result = set()

  if 'q' in strategy:
    # Strategy q: add words from all quoted strings from specification.
    # TODO(mmoroz): experimental and very noisy. Not recommended to use.
    specification = ReadSpecification(path_to_spec, is_html)
    indented_blocks = FindIndentedText(specification)
    result = result | ExtractWordsFromLines(indented_blocks)

  if 'u' in strategy:
    # Strategy u: add all uppercase words from specification.
    result = result | {word for word in spec_words if word.isupper()}

  return result
152*6777b538SAndroid Build Coastguard Worker
153*6777b538SAndroid Build Coastguard Worker
def PreprocessAndReadRodata(filepath):
  """Create a stripped copy of the binary and extract .rodata section.

  Both steps are best effort: if `strip` reports a failure the original binary
  is used, and if `objcopy` reports a failure the whole (possibly stripped)
  file is read instead of the extracted section.

  Args:
    filepath: path to the binary executable.

  Returns:
    The contents of the extracted .rodata section, or of the fallback file,
    as `bytes`.
  """
  # The context managers close both temporary files even when one of the steps
  # below raises.
  with tempfile.NamedTemporaryFile(prefix='.stripped_') as stripped_file, \
       tempfile.NamedTemporaryFile(prefix='.rodata_') as rodata_file:
    stripped_filepath = stripped_file.name
    shutil.copyfile(filepath, stripped_filepath)

    # Strip all symbols to reduce amount of redundant strings.
    strip_cmd = ['strip', '--strip-all', stripped_filepath]
    if subprocess.call(strip_cmd):
      logging.warning(
          'Failed to strip the binary. Using the original version.')
      stripped_filepath = filepath

    # Extract .rodata section to reduce amount of redundant strings.
    rodata_filepath = rodata_file.name
    objcopy_cmd = ['objcopy', '-j', '.rodata', stripped_filepath,
                   rodata_filepath]

    # Hide output from stderr since objcopy prints a warning.
    with open(os.devnull, 'w') as devnull:
      result = subprocess.call(objcopy_cmd, stderr=devnull)

    if result:
      logging.warning(
          'Failed to extract .rodata section. Using the whole file.')
      rodata_filepath = stripped_filepath

    # Binary mode: the data is arbitrary bytes of an executable, reading it as
    # text would try to decode it.
    with open(rodata_filepath, 'rb') as file_handle:
      return file_handle.read()
187*6777b538SAndroid Build Coastguard Worker
188*6777b538SAndroid Build Coastguard Worker
def ReadSpecification(filepath, is_html):
  """Read a specification file and return its contents.

  Args:
    filepath: path to the specification file.
    is_html: if true, the contents are passed through DecodeHTML() before
      being returned.

  Returns:
    The text of the file, HTML-decoded when |is_html| is true.
  """
  # Decode explicitly as UTF-8 and skip undecodable bytes: the default encoding
  # depends on the platform, and a single stray byte in the specification
  # would otherwise abort the whole run with UnicodeDecodeError.
  with open(filepath, 'r', encoding='utf-8', errors='ignore') as file_handle:
    data = file_handle.read()

  if is_html:
    data = DecodeHTML(data)

  return data
198*6777b538SAndroid Build Coastguard Worker
199*6777b538SAndroid Build Coastguard Worker
def WriteDictionary(dictionary_path, dictionary):
  """Write given dictionary to a file, one double-quoted element per line.

  Args:
    dictionary_path: path of the file to create or overwrite.
    dictionary: iterable of words; empty words are skipped.
  """
  # Sort the entries: a set has no stable iteration order, sorting makes the
  # generated file reproducible from run to run.
  lines = sorted('"%s"\n' % EscapeDictionaryElement(word)
                 for word in dictionary if word)

  # The file is opened in binary mode, so everything written to it has to be
  # bytes; the escaped lines are encoded right before being written.
  with open(dictionary_path, 'wb') as file_handle:
    file_handle.write(b'# This is an automatically generated dictionary.\n')
    for line in lines:
      file_handle.write(line.encode('ascii'))
209*6777b538SAndroid Build Coastguard Worker
210*6777b538SAndroid Build Coastguard Worker
def main():
  """Parse the command line, generate the dictionary and write it to a file."""
  parser = argparse.ArgumentParser(description="Generate fuzzer dictionary.")
  parser.add_argument('--fuzzer', required=True,
                      help='Path to a fuzzer binary executable. It is '
                      'recommended to use a binary built with '
                      '"use_libfuzzer=false is_asan=false" to get a better '
                      'dictionary with fewer number of redundant elements.')
  parser.add_argument('--spec', required=True,
                      help='Path to a target specification (in textual form).')
  # The value is parsed as an integer limited to 0 and 1. Left as a string, any
  # value given on the command line -- "0" included -- is truthy and would
  # switch HTML decoding on.
  parser.add_argument('--html', default=0, type=int, choices=[0, 1],
                      help='Decode HTML [01] (0 is default value): '
                      '1 - if specification has HTML entities to be decoded.')
  parser.add_argument('--out', required=True,
                      help='Path to a file to write a dictionary into.')
  parser.add_argument('--strategy', default='iu',
                      help='Generation strategy [iqu] ("iu" is default value): '
                      'i - intersection, q - quoted, u - uppercase.')
  args = parser.parse_args()

  dictionary = GenerateDictionary(args.fuzzer, args.spec, args.strategy,
                                  is_html=bool(args.html))
  WriteDictionary(args.out, dictionary)


if __name__ == '__main__':
  main()
237