#!/usr/bin/env python3
# Copyright 2019 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Utilities for optimistically parsing dex files.

This file is not meant to provide a generic tool for analyzing dex files.
A DexFile class that exposes access to several memory items in the dex format
is provided, but it does not include error handling or validation.
"""

import argparse
import collections
import errno
import os
import re
import struct
import sys
import zipfile

# https://source.android.com/devices/tech/dalvik/dex-format#header-item
# Ordered (field name, struct format code) pairs; the order defines both the
# DexHeader field order and the binary layout unpacked by ReadHeader().
_DEX_HEADER_FMT = (
    ('magic', '8s'),
    ('checksum', 'I'),
    ('signature', '20s'),
    ('file_size', 'I'),
    ('header_size', 'I'),
    ('endian_tag', 'I'),
    ('link_size', 'I'),
    ('link_off', 'I'),
    ('map_off', 'I'),
    ('string_ids_size', 'I'),
    ('string_ids_off', 'I'),
    ('type_ids_size', 'I'),
    ('type_ids_off', 'I'),
    ('proto_ids_size', 'I'),
    ('proto_ids_off', 'I'),
    ('field_ids_size', 'I'),
    ('field_ids_off', 'I'),
    ('method_ids_size', 'I'),
    ('method_ids_off', 'I'),
    ('class_defs_size', 'I'),
    ('class_defs_off', 'I'),
    ('data_size', 'I'),
    ('data_off', 'I'),
)

DexHeader = collections.namedtuple('DexHeader',
                                   ','.join(t[0] for t in _DEX_HEADER_FMT))

# Simple memory items.
_TypeIdItem = collections.namedtuple('TypeIdItem', 'descriptor_idx')
_ProtoIdItem = collections.namedtuple(
    'ProtoIdItem', 'shorty_idx,return_type_idx,parameters_off')
_MethodIdItem = collections.namedtuple('MethodIdItem',
                                       'type_idx,proto_idx,name_idx')
_TypeItem = collections.namedtuple('TypeItem', 'type_idx')
_StringDataItem = collections.namedtuple('StringItem', 'utf16_size,data')
_ClassDefItem = collections.namedtuple(
    'ClassDefItem',
    'class_idx,access_flags,superclass_idx,interfaces_off,source_file_idx,'
    'annotations_off,class_data_off,static_values_off')


class _MemoryItemList:
  """Base class for repeated memory items."""

  def __init__(self,
               reader,
               offset,
               size,
               factory,
               alignment=None,
               first_item_offset=None):
    """Creates the item list using the specific item factory.

    Args:
      reader: _DexReader used for decoding the memory item.
      offset: Offset from start of the file to the item list, serving as the
        key for some item types.
      size: Number of memory items in the list.
      factory: Function to extract each memory item from a _DexReader.
      alignment: Optional integer specifying the alignment for the memory
        section represented by this list.
      first_item_offset: Optional, specifies a different offset to use for
        extracting memory items (default is to use offset).
    """
    self.offset = offset
    self.size = size
    # Compare against None so that an explicit offset of 0 is honoured rather
    # than being treated as "not provided".
    start = offset if first_item_offset is None else first_item_offset
    reader.Seek(start)
    self._items = [factory(reader) for _ in range(size)]

    if alignment:
      reader.AlignUpTo(alignment)

  def __iter__(self):
    return iter(self._items)

  def __getitem__(self, key):
    return self._items[key]

  def __len__(self):
    return len(self._items)

  def __repr__(self):
    item_type_part = ''
    if self._items:
      item_type = type(self._items[0])
      item_type_part = ', item type={}'.format(item_type.__name__)

    return '{}(offset={:#x}, size={}{})'.format(
        type(self).__name__, self.offset, self.size, item_type_part)


class _TypeIdItemList(_MemoryItemList):
  """List of _TypeIdItem, each read as a single unsigned int."""

  def __init__(self, reader, offset, size):
    factory = lambda x: _TypeIdItem(x.ReadUInt())
    super().__init__(reader, offset, size, factory)


class _ProtoIdItemList(_MemoryItemList):
  """List of _ProtoIdItem, each read as three unsigned ints."""

  def __init__(self, reader, offset, size):
    factory = lambda x: _ProtoIdItem(x.ReadUInt(), x.ReadUInt(), x.ReadUInt())
    super().__init__(reader, offset, size, factory)


class _MethodIdItemList(_MemoryItemList):
  """List of _MethodIdItem, each read as two unsigned shorts and one uint."""

  def __init__(self, reader, offset, size):
    factory = (
        lambda x: _MethodIdItem(x.ReadUShort(), x.ReadUShort(), x.ReadUInt()))
    super().__init__(reader, offset, size, factory)


class _StringItemList(_MemoryItemList):
  """List of _StringDataItem decoded through the string id offset table."""

  def __init__(self, reader, offset, size):
    # The section at |offset| only holds offsets to the string data, so read
    # all of them up front and then decode each string from its own offset.
    reader.Seek(offset)
    string_item_offsets = iter([reader.ReadUInt() for _ in range(size)])

    def factory(x):
      data_offset = next(string_item_offsets)
      string = x.ReadString(data_offset)
      return _StringDataItem(len(string), string)

    super().__init__(reader, offset, size, factory)


class _TypeListItem(_MemoryItemList):
  """A single type list: a uint count followed by ushort type indices."""

  def __init__(self, reader):
    offset = reader.Tell()
    size = reader.ReadUInt()
    factory = lambda x: _TypeItem(x.ReadUShort())
    # This is necessary because we need to extract the size of the type list
    # (in other cases the list size is provided in the header).
    first_item_offset = reader.Tell()
    super().__init__(reader,
                     offset,
                     size,
                     factory,
                     alignment=4,
                     first_item_offset=first_item_offset)


class _TypeListItemList(_MemoryItemList):
  """List of consecutive _TypeListItem objects."""

  def __init__(self, reader, offset, size):
    super().__init__(reader, offset, size, _TypeListItem)


class _ClassDefItemList(_MemoryItemList):
  """List of _ClassDefItem, each read as one uint per namedtuple field."""

  def __init__(self, reader, offset, size):
    field_count = len(_ClassDefItem._fields)

    def factory(x):
      return _ClassDefItem(*(x.ReadUInt() for _ in range(field_count)))

    super().__init__(reader, offset, size, factory)


class _DexMapItem:
  """One map list entry: a type code, an item count and a file offset."""

  def __init__(self, reader):
    self.type = reader.ReadUShort()
    reader.ReadUShort()  # Skip the 16 bits between type and size.
    self.size = reader.ReadUInt()
    self.offset = reader.ReadUInt()

  def __repr__(self):
    return '_DexMapItem(type={}, size={}, offset={:#x})'.format(
        self.type, self.size, self.offset)


class _DexMapList:
  """Map list keyed by type code.

  If a type code occurs more than once, the last entry read wins.
  """
  # Full list of type codes:
  # https://source.android.com/devices/tech/dalvik/dex-format#type-codes
  TYPE_TYPE_LIST = 0x1001

  def __init__(self, reader, offset):
    self._map = {}
    reader.Seek(offset)
    self._size = reader.ReadUInt()
    for _ in range(self._size):
      item = _DexMapItem(reader)
      self._map[item.type] = item

  def __getitem__(self, key):
    return self._map[key]

  def __contains__(self, key):
    return key in self._map

  def __repr__(self):
    return '_DexMapList(size={}, items={})'.format(self._size, self._map)


class _DexReader:
  """Cursor over the raw dex bytes that decodes little-endian primitives."""

  def __init__(self, data):
    self._data = data
    self._pos = 0

  def Seek(self, offset):
    """Moves the cursor to the absolute |offset|."""
    self._pos = offset

  def Tell(self):
    """Returns the current absolute cursor position."""
    return self._pos

  def ReadUByte(self):
    """Reads an unsigned 8-bit value and advances the cursor."""
    return self._ReadData('<B')

  def ReadUShort(self):
    """Reads a little-endian unsigned 16-bit value and advances the cursor."""
    return self._ReadData('<H')

  def ReadUInt(self):
    """Reads a little-endian unsigned 32-bit value and advances the cursor."""
    return self._ReadData('<I')

  def ReadString(self, data_offset):
    """Decodes the string stored at |data_offset|.

    The data starts with a ULEB128 length, followed by the MUTF-8 bytes.
    Note that this moves the cursor (see _DecodeMUtf8).
    """
    string_length, string_offset = self._ReadULeb128(data_offset)
    string_data_offset = string_offset + data_offset
    return self._DecodeMUtf8(string_length, string_data_offset)

  def AlignUpTo(self, align_unit):
    """Advances the cursor to the next multiple of |align_unit|, if needed."""
    off_by = self._pos % align_unit
    if off_by:
      self.Seek(self._pos + align_unit - off_by)

  def ReadHeader(self):
    """Returns the DexHeader unpacked from the start of the data.

    The cursor is not moved.
    """
    header_fmt = '<' + ''.join(t[1] for t in _DEX_HEADER_FMT)
    return DexHeader._make(struct.unpack_from(header_fmt, self._data))

  def _ReadData(self, fmt):
    """Unpacks one value with struct format |fmt| and advances the cursor."""
    ret = struct.unpack_from(fmt, self._data, self._pos)[0]
    self._pos += struct.calcsize(fmt)
    return ret

  def _ReadULeb128(self, data_offset):
    """Returns a tuple of (uleb128 value, number of bytes occupied).

    From DWARF3 spec: http://dwarfstd.org/doc/Dwarf3.pdf

    Args:
      data_offset: Location of the unsigned LEB128.
    """
    value = 0
    shift = 0
    cur_offset = data_offset
    while True:
      byte = self._data[cur_offset]
      cur_offset += 1
      # The low 7 bits carry payload; the high bit flags a continuation.
      value |= (byte & 0b01111111) << shift
      if (byte & 0b10000000) == 0:
        break
      shift += 7

    return value, cur_offset - data_offset

  def _DecodeMUtf8(self, string_length, offset):
    """Returns the string located at the specified offset.

    See https://source.android.com/devices/tech/dalvik/dex-format#mutf-8

    Ported from the Android Java implementation:
    https://android.googlesource.com/platform/dalvik/+/fe107fb6e3f308ac5174ebdc5a794ee880c741d9/dx/src/com/android/dex/Mutf8.java#34

    Each 1-3 byte sequence becomes exactly one character of the result, so
    surrogate code units are returned as separate characters.

    Args:
      string_length: The length of the decoded string.
      offset: Offset to the beginning of the string.

    Raises:
      _MUTf8DecodeError: If the bytes are not valid for this decoder or the
        string is not terminated by a zero byte after |string_length|
        characters.
    """
    self.Seek(offset)
    # Collect characters and join once instead of repeated concatenation.
    chars = []

    for _ in range(string_length):
      a = self.ReadUByte()
      if a == 0:
        raise _MUTf8DecodeError('Early string termination encountered',
                                string_length, offset)
      if (a & 0x80) == 0x00:
        code = a
      elif (a & 0xe0) == 0xc0:
        b = self.ReadUByte()
        if (b & 0xc0) != 0x80:
          raise _MUTf8DecodeError('Error in byte 2', string_length, offset)
        code = ((a & 0x1f) << 6) | (b & 0x3f)
      elif (a & 0xf0) == 0xe0:
        b = self.ReadUByte()
        c = self.ReadUByte()
        if (b & 0xc0) != 0x80 or (c & 0xc0) != 0x80:
          raise _MUTf8DecodeError('Error in byte 3 or 4', string_length, offset)
        code = ((a & 0x0f) << 12) | ((b & 0x3f) << 6) | (c & 0x3f)
      else:
        raise _MUTf8DecodeError('Bad byte', string_length, offset)
      chars.append(chr(code))

    if self.ReadUByte() != 0x00:
      raise _MUTf8DecodeError('Expected string termination', string_length,
                              offset)

    return ''.join(chars)


class _MUTf8DecodeError(Exception):
  """Raised by _DexReader._DecodeMUtf8 when string data cannot be decoded."""

  def __init__(self, message, length, offset):
    message += ' (decoded string length: {}, string data offset: {:#x})'.format(
        length, offset)
    super().__init__(message)


class DexFile:
  """Represents a single dex file.

  Parses and exposes access to dex file structure and contents, as described
  at https://source.android.com/devices/tech/dalvik/dex-format

  Fields:
    reader: _DexReader object used to decode dex file contents.
    header: DexHeader for this dex file.
    map_list: _DexMapList object containing list of dex file contents.
    type_item_list: _TypeIdItemList containing type_id_items.
    proto_item_list: _ProtoIdItemList containing proto_id_items.
    method_item_list: _MethodIdItemList containing method_id_items.
    string_item_list: _StringItemList containing string_data_items that are
      referenced by index in other sections.
    type_list_item_list: _TypeListItemList containing _TypeListItems.
      _TypeListItems are referenced by their offsets from other dex items.
    class_def_item_list: _ClassDefItemList containing _ClassDefItems.
  """
  # Index value the dex format uses to mean "no value", e.g. the
  # superclass_idx of a class that has no superclass.
  NO_INDEX = 0xffffffff

  _CLASS_ACCESS_FLAGS = {
      0x1: 'public',
      0x2: 'private',
      0x4: 'protected',
      0x8: 'static',
      0x10: 'final',
      0x200: 'interface',
      0x400: 'abstract',
      0x1000: 'synthetic',
      0x2000: 'annotation',
      0x4000: 'enum',
  }

  def __init__(self, data):
    """Decodes dex file memory sections.

    Args:
      data: bytearray containing the contents of a dex file.
    """
    self.reader = _DexReader(data)
    self.header = self.reader.ReadHeader()
    self.map_list = _DexMapList(self.reader, self.header.map_off)
    self.type_item_list = _TypeIdItemList(self.reader, self.header.type_ids_off,
                                          self.header.type_ids_size)
    self.proto_item_list = _ProtoIdItemList(self.reader,
                                            self.header.proto_ids_off,
                                            self.header.proto_ids_size)
    self.method_item_list = _MethodIdItemList(self.reader,
                                              self.header.method_ids_off,
                                              self.header.method_ids_size)
    self.string_item_list = _StringItemList(self.reader,
                                            self.header.string_ids_off,
                                            self.header.string_ids_size)
    self.class_def_item_list = _ClassDefItemList(self.reader,
                                                 self.header.class_defs_off,
                                                 self.header.class_defs_size)

    # Type lists are not described by the header; they are located through
    # the map list, which may not contain an entry for them.
    type_list_key = _DexMapList.TYPE_TYPE_LIST
    if type_list_key in self.map_list:
      map_list_item = self.map_list[type_list_key]
      self.type_list_item_list = _TypeListItemList(self.reader,
                                                   map_list_item.offset,
                                                   map_list_item.size)
    else:
      self.type_list_item_list = _TypeListItemList(self.reader, 0, 0)
    self._type_lists_by_offset = {
        type_list.offset: type_list
        for type_list in self.type_list_item_list
    }

  def GetString(self, string_item_idx):
    """Returns the decoded string at |string_item_idx|."""
    string_item = self.string_item_list[string_item_idx]
    return string_item.data

  def GetTypeString(self, type_item_idx):
    """Returns the descriptor string of the type at |type_item_idx|."""
    type_item = self.type_item_list[type_item_idx]
    return self.GetString(type_item.descriptor_idx)

  def GetTypeListStringsByOffset(self, offset):
    """Returns the type strings of the type list starting at |offset|.

    A falsy |offset| (0) yields an empty tuple.
    """
    if not offset:
      return ()
    type_list = self._type_lists_by_offset[offset]
    return tuple(self.GetTypeString(item.type_idx) for item in type_list)

  @staticmethod
  def ResolveClassAccessFlags(access_flags):
    """Returns a tuple with the name of every known flag set in the mask."""
    return tuple(flag_string
                 for flag, flag_string in DexFile._CLASS_ACCESS_FLAGS.items()
                 if flag & access_flags)

  def IterMethodSignatureParts(self):
    """Yields the string components of dex methods in a dex file.

    Yields:
      Tuples that look like:
        (class name, return type, method name, (parameter type, ...)).
    """
    for method_item in self.method_item_list:
      class_name_string = self.GetTypeString(method_item.type_idx)
      method_name_string = self.GetString(method_item.name_idx)
      proto_item = self.proto_item_list[method_item.proto_idx]
      return_type_string = self.GetTypeString(proto_item.return_type_idx)
      parameter_types = self.GetTypeListStringsByOffset(
          proto_item.parameters_off)
      yield (class_name_string, return_type_string, method_name_string,
             parameter_types)

  def __repr__(self):
    items = [
        self.header,
        self.map_list,
        self.type_item_list,
        self.proto_item_list,
        self.method_item_list,
        self.string_item_list,
        self.type_list_item_list,
        self.class_def_item_list,
    ]
    return '\n'.join(str(item) for item in items)


def _EncodeStringForDump(string):
  """Returns UTF-8 bytes for a string produced by _DexReader._DecodeMUtf8.

  The decoder emits one character per UTF-16 code unit, so a character
  outside the BMP arrives as two surrogate characters, which a strict UTF-8
  encode rejects. Round-tripping through UTF-16 joins valid surrogate pairs;
  'surrogatepass' keeps unpaired surrogates instead of raising.
  """
  utf16 = string.encode('utf-16-le', 'surrogatepass')
  joined = utf16.decode('utf-16-le', 'surrogatepass')
  return joined.encode('utf-8', 'surrogatepass')


class _DumpCommand:
  """Base class for commands that print part of a DexFile to stdout."""

  def __init__(self, dexfile):
    self._dexfile = dexfile

  def Run(self):
    raise NotImplementedError()


class _DumpMethods(_DumpCommand):
  """Prints one line per method id."""

  def Run(self):
    for parts in self._dexfile.IterMethodSignatureParts():
      class_type, return_type, method_name, parameter_types = parts
      print('{} {} (return type={}, parameters={})'.format(
          class_type, method_name, return_type, parameter_types))


class _DumpStrings(_DumpCommand):
  """Prints every string as a UTF-8 bytes literal."""

  def Run(self):
    for string_item in self._dexfile.string_item_list:
      # Some strings are likely to be non-ascii (vs. methods/classes).
      print(_EncodeStringForDump(string_item.data))


class _DumpClasses(_DumpCommand):
  """Prints one line per class definition."""

  def Run(self):
    dexfile = self._dexfile
    for class_item in dexfile.class_def_item_list:
      class_string = dexfile.GetTypeString(class_item.class_idx)
      # A class without a superclass stores NO_INDEX, which is not a valid
      # index into the type list.
      if class_item.superclass_idx == DexFile.NO_INDEX:
        superclass_string = None
      else:
        superclass_string = dexfile.GetTypeString(class_item.superclass_idx)
      interfaces = dexfile.GetTypeListStringsByOffset(
          class_item.interfaces_off)
      access_flags = DexFile.ResolveClassAccessFlags(class_item.access_flags)
      print('{} (superclass={}, interfaces={}, access_flags={})'.format(
          class_string, superclass_string, interfaces, access_flags))


class _DumpSummary(_DumpCommand):
  """Prints the DexFile repr (header, map list and section summaries)."""

  def Run(self):
    print(self._dexfile)


def _DumpDexItems(dexfile_data, name, item):
  """Parses |dexfile_data| and prints the requested |item| kind to stdout.

  Args:
    dexfile_data: Bytes of a single dex file.
    name: Name used in the banner line printed before the dump.
    item: One of 'summary', 'methods', 'strings' or 'classes'.

  Raises:
    IOError: If printing fails for any reason other than a broken pipe.
  """
  dexfile = DexFile(bytearray(dexfile_data))
  print('dex_parser: Dumping {} for {}'.format(item, name))
  cmds = {
      'summary': _DumpSummary,
      'methods': _DumpMethods,
      'strings': _DumpStrings,
      'classes': _DumpClasses,
  }
  try:
    cmds[item](dexfile).Run()
  except IOError as e:
    # Only a broken pipe is expected here; anything else is a real failure
    # and must not be hidden.
    if e.errno != errno.EPIPE:
      raise
    # Assume we're piping to "less", do nothing.


def main():
  """Command-line entry point: dumps dex contents of the input to stdout."""
  parser = argparse.ArgumentParser(description='Dump dex contents to stdout.')
  parser.add_argument('input',
                      help='Input (.dex, .jar, .zip, .aab, .apk) file path.')
  parser.add_argument('item',
                      choices=('methods', 'strings', 'classes', 'summary'),
                      help='Item to dump',
                      nargs='?',
                      default='summary')
  args = parser.parse_args()

  if os.path.splitext(args.input)[1] in ('.apk', '.jar', '.zip', '.aab'):
    with zipfile.ZipFile(args.input) as z:
      dex_file_paths = [
          f for f in z.namelist() if re.match(r'.*classes[0-9]*\.dex$', f)
      ]
      if not dex_file_paths:
        print('Error: {} does not contain any classes.dex files'.format(
            args.input))
        sys.exit(1)

      for path in dex_file_paths:
        _DumpDexItems(z.read(path), path, args.item)

  else:
    with open(args.input, 'rb') as f:
      _DumpDexItems(f.read(), args.input, args.item)


if __name__ == '__main__':
  main()