#!/usr/bin/env python3
#
# Copyright 2016 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Generate a dictionary for a libFuzzer or AFL-based fuzzer.

Invoked manually with a fuzzer binary and a target format/protocol
specification. Works best for text-based formats or protocols; for binary
ones it may be of little use.
"""

import argparse
import html
import logging
import os
import re
import shutil
import subprocess
import sys
import tempfile


ENCODING_TYPES = ['ascii', 'utf_16_be', 'utf_16_le', 'utf_32_be', 'utf_32_le']
MIN_STRING_LENGTH = 4


def DecodeHTML(html_data):
  """HTML-decode the data and strip non-ASCII characters."""
  data = html.unescape(html_data)
  return data.encode('ascii', 'ignore').decode('ascii')


def EscapeDictionaryElement(element):
  """Escape all unprintable and control characters in an element."""
  element_escaped = element.encode('unicode_escape').decode('ascii')
  # Remove escaping for single quote because it breaks libFuzzer.
  element_escaped = element_escaped.replace("\\'", "'")
  # Add escaping for double quote.
  element_escaped = element_escaped.replace('"', '\\"')
  return element_escaped


def ExtractWordsFromBinary(filepath, min_length=MIN_STRING_LENGTH):
  """Extract words (split strings) from a binary executable file."""
  rodata = PreprocessAndReadRodata(filepath)
  words = []

  strings_re = re.compile(r'[^\x00-\x1F\x7F-\xFF]{%d,}' % min_length)
  # Try different encodings for strings extraction.
  for encoding in ENCODING_TYPES:
    data = rodata.decode(encoding, 'ignore')
    data = data.encode('ascii', 'ignore').decode('ascii')
    raw_strings = strings_re.findall(data)
    for raw_string in raw_strings:
      words += raw_string.split()

  return set(words)


def ExtractWordsFromLines(lines):
  """Extract all words from a list of strings."""
  words = set()
  for line in lines:
    for word in line.split():
      words.add(word)

  return words


def ExtractWordsFromSpec(filepath, is_html):
  """Extract words from a specification."""
  data = ReadSpecification(filepath, is_html)
  words = data.split()
  return set(words)


def FindIndentedText(text):
  """Find space-indented text blocks, e.g. code or data samples in RFCs."""
  lines = text.split('\n')
  indented_blocks = []
  current_block = ''
  previous_number_of_spaces = 0

  # Go through every line and concatenate space-indented blocks into lines.
  for line in lines:
    if not line:
      # Ignore empty lines.
      continue

    # Space-indented text blocks have more leading spaces than regular text.
    n = FindNumberOfLeadingSpaces(line)

    if n > previous_number_of_spaces:
      # Beginning of a space-indented text block, start concatenation.
      current_block = line[n:]
    elif n == previous_number_of_spaces and current_block:
      # Or continuation of a space-indented text block, concatenate lines.
      current_block += '\n' + line[n:]

    if n < previous_number_of_spaces and current_block:
      # Current line is not indented, save previously concatenated lines.
      indented_blocks.append(current_block)
      current_block = ''

    previous_number_of_spaces = n

  return indented_blocks


def FindNumberOfLeadingSpaces(line):
  """Calculate the number of leading whitespace characters in the string."""
  n = 0
  while n < len(line) and line[n].isspace():
    n += 1

  return n


def GenerateDictionary(path_to_binary, path_to_spec, strategy, is_html=False):
  """Generate a dictionary for a given fuzzer binary and specification."""
  for filepath in [path_to_binary, path_to_spec]:
    if not os.path.exists(filepath):
      logging.error('%s does not exist. Exiting.', filepath)
      sys.exit(1)

  words_from_binary = ExtractWordsFromBinary(path_to_binary)
  words_from_spec = ExtractWordsFromSpec(path_to_spec, is_html)

  dictionary_words = set()

  if 'i' in strategy:
    # Strategy i: only words which are common to the binary and the spec.
    dictionary_words = words_from_binary.intersection(words_from_spec)

  if 'q' in strategy:
    # Strategy q: add words from all quoted strings in the specification.
    # TODO(mmoroz): experimental and very noisy. Not recommended to use.
    spec_data = ReadSpecification(path_to_spec, is_html)
    quoted_strings = FindIndentedText(spec_data)
    quoted_words = ExtractWordsFromLines(quoted_strings)
    dictionary_words = dictionary_words.union(quoted_words)

  if 'u' in strategy:
    # Strategy u: add all uppercase words from the specification.
    uppercase_words = set(w for w in words_from_spec if w.isupper())
    dictionary_words = dictionary_words.union(uppercase_words)

  return dictionary_words


def PreprocessAndReadRodata(filepath):
  """Create a stripped copy of the binary and extract its .rodata section."""
  stripped_file = tempfile.NamedTemporaryFile(prefix='.stripped_')
  stripped_filepath = stripped_file.name
  shutil.copyfile(filepath, stripped_filepath)

  # Strip all symbols to reduce the number of redundant strings.
  strip_cmd = ['strip', '--strip-all', stripped_filepath]
  result = subprocess.call(strip_cmd)
  if result:
    logging.warning('Failed to strip the binary. Using the original version.')
    stripped_filepath = filepath

  # Extract the .rodata section to reduce the number of redundant strings.
  rodata_file = tempfile.NamedTemporaryFile(prefix='.rodata_')
  rodata_filepath = rodata_file.name
  objcopy_cmd = ['objcopy', '-j', '.rodata', stripped_filepath, rodata_filepath]

  # Hide output from stderr since objcopy prints a warning.
  with open(os.devnull, 'w') as devnull:
    result = subprocess.call(objcopy_cmd, stderr=devnull)

  if result:
    logging.warning('Failed to extract .rodata section. Using the whole file.')
    rodata_filepath = stripped_filepath

  with open(rodata_filepath, 'rb') as file_handle:
    data = file_handle.read()

  stripped_file.close()
  rodata_file.close()

  return data


def ReadSpecification(filepath, is_html):
  """Read a specification file and return its contents."""
  with open(filepath, 'r', errors='ignore') as file_handle:
    data = file_handle.read()

  if is_html:
    data = DecodeHTML(data)

  return data


def WriteDictionary(dictionary_path, dictionary):
  """Write the given dictionary to a file."""
  with open(dictionary_path, 'w') as file_handle:
    file_handle.write('# This is an automatically generated dictionary.\n')
    for word in dictionary:
      if not word:
        continue
      line = '"%s"\n' % EscapeDictionaryElement(word)
      file_handle.write(line)


def main():
  parser = argparse.ArgumentParser(description='Generate fuzzer dictionary.')
  parser.add_argument('--fuzzer', required=True,
                      help='Path to a fuzzer binary executable. It is '
                           'recommended to use a binary built with '
                           '"use_libfuzzer=false is_asan=false" to get a '
                           'better dictionary with fewer redundant elements.')
  parser.add_argument('--spec', required=True,
                      help='Path to a target specification (in textual form).')
  parser.add_argument('--html', type=int, default=0,
                      help='Set to 1 if the specification contains HTML '
                           'entities that need to be decoded (default: 0).')
  parser.add_argument('--out', required=True,
                      help='Path to a file to write the dictionary into.')
  parser.add_argument('--strategy', default='iu',
                      help='Generation strategy [iqu] (default: "iu"): '
                           'i - intersection, q - quoted, u - uppercase.')
  args = parser.parse_args()

  dictionary = GenerateDictionary(args.fuzzer, args.spec, args.strategy,
                                  is_html=bool(args.html))
  WriteDictionary(args.out, dictionary)


if __name__ == '__main__':
  main()
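
# A hypothetical example invocation, for reference only. The script name and
# all paths below are illustrative placeholders, not files that ship with
# this tool; only the flags come from the argument parser above:
#
#   ./dictionary_generator.py \
#       --fuzzer=out/fuzz/target_fuzzer \
#       --spec=target_format_spec.txt \
#       --strategy=iu \
#       --out=target_fuzzer.dict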