#!/usr/bin/env python3
#
# Copyright 2016 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Generate a dictionary for libFuzzer or AFL-based fuzzer.

Invoked manually using a fuzzer binary and target format/protocol specification.
Works better for text formats or protocols. For binary ones may be useless.
"""

import argparse
import html
import io
import logging
import os
import re
import shutil
import string
import subprocess
import sys
import tempfile


ENCODING_TYPES = ['ascii', 'utf_16_be', 'utf_16_le', 'utf_32_be', 'utf_32_le']
MIN_STRING_LENGTH = 4

# Byte values that are written with a short backslash escape. Every other
# byte outside the printable ASCII range is written as "\xNN".
_NAMED_ESCAPES = {
    ord('\\'): '\\\\',
    ord('"'): '\\"',
    ord('\t'): '\\t',
    ord('\n'): '\\n',
    ord('\r'): '\\r',
}


def _StripNonAscii(text):
  """Return |text| (a str) with every non-ASCII character removed."""
  return text.encode('ascii', 'ignore').decode('ascii')


def _RunTool(command, **kwargs):
  """Run an external command and report whether it succeeded.

  Args:
    command: Command line as a list of strings.
    **kwargs: Extra keyword arguments forwarded to subprocess.call().

  Returns:
    True if the command could be started and exited with status 0.
  """
  try:
    return subprocess.call(command, **kwargs) == 0
  except OSError as error:
    # The tool is not installed or cannot be executed; callers fall back to
    # a less precise input instead of aborting.
    logging.warning('Failed to run %s: %s', command[0], error)
    return False


def DecodeHTML(html_data):
  """Decode HTML character references and drop non-ASCII characters.

  Args:
    html_data: Specification contents, as str or bytes.

  Returns:
    An ASCII-only str in which references such as "&amp;" or "&#65;" are
    replaced by the characters they stand for.
  """
  if isinstance(html_data, bytes):
    text = html_data.decode('ascii', 'ignore')
  else:
    text = _StripNonAscii(html_data)
  return _StripNonAscii(html.unescape(text))


def EscapeDictionaryElement(element):
  """Escape all unprintable and control characters in an element.

  Args:
    element: Dictionary word, as str or bytes. A str is converted to UTF-8
        first; undecodable bytes preserved with the "surrogateescape" error
        handler are restored to their original values.

  Returns:
    A printable ASCII str. Backslash and double quote are backslash-escaped,
    tab/newline/carriage return become "\\t", "\\n", "\\r", and any other
    byte outside 0x20..0x7e becomes "\\xNN". Single quotes are left
    unescaped because escaping them breaks libFuzzer.
  """
  if isinstance(element, str):
    element = element.encode('utf-8', 'surrogateescape')

  escaped = []
  for byte in bytearray(element):
    if byte in _NAMED_ESCAPES:
      escaped.append(_NAMED_ESCAPES[byte])
    elif 0x20 <= byte < 0x7F:
      escaped.append(chr(byte))
    else:
      escaped.append('\\x%02x' % byte)
  return ''.join(escaped)


def ExtractWordsFromBinary(filepath, min_length=MIN_STRING_LENGTH):
  """Extract words (whitespace-separated strings) from a binary executable.

  Args:
    filepath: Path to the fuzzer binary.
    min_length: Minimum length of a printable run to be considered a string.

  Returns:
    A set of ASCII str words found in the data returned by
    PreprocessAndReadRodata().
  """
  rodata = PreprocessAndReadRodata(filepath)
  strings_re = re.compile(r'[^\x00-\x1F\x7F-\xFF]{%d,}' % min_length)

  words = set()
  # Use different encodings for strings extraction.
  for encoding in ENCODING_TYPES:
    data = _StripNonAscii(rodata.decode(encoding, 'ignore'))
    for raw_string in strings_re.findall(data):
      words.update(raw_string.split())

  return words


def ExtractWordsFromLines(lines):
  """Extract all whitespace-separated words from an iterable of strings."""
  return {word for line in lines for word in line.split()}


def ExtractWordsFromSpec(filepath, is_html):
  """Extract the set of whitespace-separated words from a specification.

  Args:
    filepath: Path to the specification file.
    is_html: Whether HTML character references should be decoded first.
  """
  return set(ReadSpecification(filepath, is_html).split())


def FindIndentedText(text):
  """Find space-indented text blocks, e.g. code or data samples in RFCs.

  A block starts on a line that has more leading whitespace than the previous
  non-empty line, continues while the indentation stays the same, and ends
  when the indentation decreases or the text ends.

  Args:
    text: Specification contents as a single str.

  Returns:
    A list of blocks; each block is the block's lines joined with newlines,
    with the leading whitespace of every line removed.
  """
  indented_blocks = []
  current_block = ''
  previous_number_of_spaces = 0

  # Go through every line and concatenate space-indented blocks into lines.
  for line in text.split('\n'):
    if not line:
      # Ignore empty lines.
      continue

    # Space-indented text blocks have more leading spaces than regular text.
    n = FindNumberOfLeadingSpaces(line)

    if n > previous_number_of_spaces:
      # Beginning of a space-indented text block, start concatenation.
      current_block = line[n:]
    elif n == previous_number_of_spaces and current_block:
      # Or continuation of a space-indented text block, concatenate lines.
      current_block += '\n' + line[n:]

    if n < previous_number_of_spaces and current_block:
      # Current line is not indented, save previously concatenated lines.
      indented_blocks.append(current_block)
      current_block = ''

    previous_number_of_spaces = n

  if current_block:
    # The text ended inside an indented block; do not lose it.
    indented_blocks.append(current_block)

  return indented_blocks


def FindNumberOfLeadingSpaces(line):
  """Calculate number of leading whitespace characters in the string."""
  return len(line) - len(line.lstrip())


def GenerateDictionary(path_to_binary, path_to_spec, strategy, is_html=False):
  """Generate a dictionary for given pair of fuzzer binary and specification.

  Args:
    path_to_binary: Path to the fuzzer binary.
    path_to_spec: Path to the specification in textual form.
    strategy: String of strategy letters; any combination of "i"
        (words common to binary and specification), "q" (words from
        indented blocks of the specification) and "u" (uppercase words from
        the specification).
    is_html: Whether HTML character references in the specification should
        be decoded.

  Returns:
    A set of str words. Exits the process with status 1 if either input
    path does not exist.
  """
  for filepath in [path_to_binary, path_to_spec]:
    if not os.path.exists(filepath):
      logging.error('%s doesn\'t exist. Exit.', filepath)
      sys.exit(1)

  words_from_binary = ExtractWordsFromBinary(path_to_binary)
  words_from_spec = ExtractWordsFromSpec(path_to_spec, is_html)

  dictionary_words = set()

  if 'i' in strategy:
    # Strategy i: only words which are common for binary and for specification.
    dictionary_words = words_from_binary & words_from_spec

  if 'q' in strategy:
    # Strategy q: add words from all quoted strings from specification.
    # TODO(mmoroz): experimental and very noisy. Not recommended to use.
    spec_data = ReadSpecification(path_to_spec, is_html)
    quoted_strings = FindIndentedText(spec_data)
    dictionary_words |= ExtractWordsFromLines(quoted_strings)

  if 'u' in strategy:
    # Strategy u: add all uppercase words from specification.
    dictionary_words |= {w for w in words_from_spec if w.isupper()}

  return dictionary_words


def PreprocessAndReadRodata(filepath):
  """Create a stripped copy of the binary and extract .rodata section.

  Falls back to the unstripped binary if "strip" fails, and to the whole
  (possibly stripped) file if "objcopy" fails.

  Args:
    filepath: Path to the fuzzer binary.

  Returns:
    The extracted data as bytes.
  """
  # Work inside a private directory: the external tools may replace or remove
  # the files they are given, and the directory is cleaned up regardless of
  # what is left in it, including when an exception is raised.
  with tempfile.TemporaryDirectory(prefix='.dictionary_generator_') as temp_dir:
    stripped_filepath = os.path.join(temp_dir, 'stripped')
    shutil.copyfile(filepath, stripped_filepath)

    # Strip all symbols to reduce amount of redundant strings.
    if not _RunTool(['strip', '--strip-all', stripped_filepath]):
      logging.warning('Failed to strip the binary. Using the original version.')
      stripped_filepath = filepath

    # Extract .rodata section to reduce amount of redundant strings.
    rodata_filepath = os.path.join(temp_dir, 'rodata')
    objcopy_cmd = ['objcopy', '-j', '.rodata', stripped_filepath,
                   rodata_filepath]

    # Hide output from stderr since objcopy prints a warning.
    if not _RunTool(objcopy_cmd, stderr=subprocess.DEVNULL):
      logging.warning('Failed to extract .rodata section. Using the whole file.')
      rodata_filepath = stripped_filepath

    # The data is arbitrary binary content, so it must be read as bytes.
    with open(rodata_filepath, 'rb') as file_handle:
      return file_handle.read()


def ReadSpecification(filepath, is_html):
  """Read a specification file and return its contents.

  Args:
    filepath: Path to the specification file.
    is_html: Whether HTML character references should be decoded. When set,
        the result is passed through DecodeHTML() and is ASCII-only.

  Returns:
    The contents as str.
  """
  # "surrogateescape" keeps bytes that are not valid UTF-8 so that
  # EscapeDictionaryElement() can still emit their original values.
  with open(filepath, 'r', encoding='utf-8',
            errors='surrogateescape') as file_handle:
    data = file_handle.read()

  if is_html:
    data = DecodeHTML(data)

  return data


def WriteDictionary(dictionary_path, dictionary):
  """Write given dictionary to a file.

  Args:
    dictionary_path: Path of the file to create or overwrite.
    dictionary: Iterable of words (str or bytes); empty words are skipped.
  """
  # Sort the escaped entries so that the output is reproducible between runs.
  lines = sorted('"%s"\n' % EscapeDictionaryElement(word)
                 for word in dictionary if word)

  with open(dictionary_path, 'wb') as file_handle:
    file_handle.write(b'# This is an automatically generated dictionary.\n')
    file_handle.write(''.join(lines).encode('ascii'))


def main(argv=None):
  """Parse command line arguments and generate the dictionary file.

  Args:
    argv: Optional list of command line arguments (without the program
        name). Defaults to sys.argv[1:].
  """
  parser = argparse.ArgumentParser(description="Generate fuzzer dictionary.")
  parser.add_argument('--fuzzer', required=True,
                      help='Path to a fuzzer binary executable. It is '
                      'recommended to use a binary built with '
                      '"use_libfuzzer=false is_asan=false" to get a better '
                      'dictionary with fewer number of redundant elements.')
  parser.add_argument('--spec', required=True,
                      help='Path to a target specification (in textual form).')
  # Parse the flag as an integer: without a type, "--html 0" is the non-empty
  # string "0", which is truthy and would enable HTML decoding.
  parser.add_argument('--html', default=0, type=int, choices=[0, 1],
                      help='Decode HTML [01] (0 is default value): '
                      '1 - if specification has HTML entities to be decoded.')
  parser.add_argument('--out', required=True,
                      help='Path to a file to write a dictionary into.')
  parser.add_argument('--strategy', default='iu',
                      help='Generation strategy [iqu] ("iu" is default value): '
                      'i - intersection, q - quoted, u - uppercase.')
  args = parser.parse_args(argv)

  dictionary = GenerateDictionary(args.fuzzer, args.spec, args.strategy,
                                  is_html=bool(args.html))
  WriteDictionary(args.out, dictionary)


if __name__ == '__main__':
  main()