xref: /aosp_15_r20/external/cronet/build/android/pylib/dex/dex_parser.py (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1#!/usr/bin/env python3
2# Copyright 2019 The Chromium Authors
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5"""Utilities for optimistically parsing dex files.
6
7This file is not meant to provide a generic tool for analyzing dex files.
8A DexFile class that exposes access to several memory items in the dex format
9is provided, but it does not include error handling or validation.
10"""
11
12import argparse
13import collections
14import errno
15import os
16import re
17import struct
18import sys
19import zipfile
20
# Layout of the dex header as (field name, struct format code) pairs, listed
# in on-disk order. See:
# https://source.android.com/devices/tech/dalvik/dex-format#header-item
_DEX_HEADER_FMT = (
    ('magic', '8s'),
    ('checksum', 'I'),
    ('signature', '20s'),
) + tuple(
    # Every remaining header field uses the 'I' format code.
    (uint_field, 'I') for uint_field in (
        'file_size',
        'header_size',
        'endian_tag',
        'link_size',
        'link_off',
        'map_off',
        'string_ids_size',
        'string_ids_off',
        'type_ids_size',
        'type_ids_off',
        'proto_ids_size',
        'proto_ids_off',
        'field_ids_size',
        'field_ids_off',
        'method_ids_size',
        'method_ids_off',
        'class_defs_size',
        'class_defs_off',
        'data_size',
        'data_off',
    ))

# Record type for a decoded header; its fields follow _DEX_HEADER_FMT order.
DexHeader = collections.namedtuple(
    'DexHeader', [field_name for field_name, _ in _DEX_HEADER_FMT])
50
# Simple memory items: one lightweight record type per fixed-layout dex item.
_TypeIdItem = collections.namedtuple('TypeIdItem', ['descriptor_idx'])
_ProtoIdItem = collections.namedtuple(
    'ProtoIdItem', ['shorty_idx', 'return_type_idx', 'parameters_off'])
_MethodIdItem = collections.namedtuple(
    'MethodIdItem', ['type_idx', 'proto_idx', 'name_idx'])
_TypeItem = collections.namedtuple('TypeItem', ['type_idx'])
_StringDataItem = collections.namedtuple('StringItem', ['utf16_size', 'data'])
_ClassDefItem = collections.namedtuple('ClassDefItem', [
    'class_idx',
    'access_flags',
    'superclass_idx',
    'interfaces_off',
    'source_file_idx',
    'annotations_off',
    'class_data_off',
    'static_values_off',
])
63
64
class _MemoryItemList:
  """Base class for repeated memory items.

  Every item is decoded eagerly by the constructor. Afterwards the list
  supports iteration, indexing and len() over the decoded items.
  """

  def __init__(self,
               reader,
               offset,
               size,
               factory,
               alignment=None,
               first_item_offset=None):
    """Creates the item list using the specific item factory.

    Args:
      reader: _DexReader used for decoding the memory item.
      offset: Offset from start of the file to the item list, serving as the
        key for some item types.
      size: Number of memory items in the list.
      factory: Function to extract each memory item from a _DexReader.
      alignment: Optional integer specifying the alignment for the memory
        section represented by this list.
      first_item_offset: Optional, specifies a different offset to use for
        extracting memory items (default is to use offset).
    """
    self.offset = offset
    self.size = size
    # Compare against None rather than relying on truthiness, so that an
    # explicit first_item_offset of 0 is honoured instead of silently falling
    # back to offset.
    if first_item_offset is None:
      first_item_offset = offset
    reader.Seek(first_item_offset)
    self._items = [factory(reader) for _ in range(size)]

    # A falsy alignment (None or 0) means no realignment after the last item.
    if alignment:
      reader.AlignUpTo(alignment)

  def __iter__(self):
    return iter(self._items)

  def __getitem__(self, key):
    return self._items[key]

  def __len__(self):
    return len(self._items)

  def __repr__(self):
    # The item type is only reported when there is a first item to inspect.
    item_type_part = ''
    if self.size != 0:
      item_type = type(self._items[0])
      item_type_part = ', item type={}'.format(item_type.__name__)

    return '{}(offset={:#x}, size={}{})'.format(
        type(self).__name__, self.offset, self.size, item_type_part)
113
114
class _TypeIdItemList(_MemoryItemList):
  """Item list whose entries are _TypeIdItem records."""

  def __init__(self, reader, offset, size):
    """Decodes size entries starting at offset, one uint per entry."""

    def read_type_id(item_reader):
      return _TypeIdItem(item_reader.ReadUInt())

    super().__init__(reader, offset, size, read_type_id)
119
120
class _ProtoIdItemList(_MemoryItemList):
  """Item list whose entries are _ProtoIdItem records."""

  def __init__(self, reader, offset, size):
    """Decodes size entries starting at offset, three uints per entry."""

    def read_proto_id(item_reader):
      # The three values are read in the record's field order.
      shorty_idx = item_reader.ReadUInt()
      return_type_idx = item_reader.ReadUInt()
      parameters_off = item_reader.ReadUInt()
      return _ProtoIdItem(shorty_idx, return_type_idx, parameters_off)

    super().__init__(reader, offset, size, read_proto_id)
125
126
class _MethodIdItemList(_MemoryItemList):
  """Item list whose entries are _MethodIdItem records."""

  def __init__(self, reader, offset, size):
    """Decodes size entries: two ushorts followed by one uint per entry."""

    def read_method_id(item_reader):
      # The values are read in the record's field order.
      type_idx = item_reader.ReadUShort()
      proto_idx = item_reader.ReadUShort()
      name_idx = item_reader.ReadUInt()
      return _MethodIdItem(type_idx, proto_idx, name_idx)

    super().__init__(reader, offset, size, read_method_id)
132
133
class _StringItemList(_MemoryItemList):
  """Item list of decoded strings, stored as _StringDataItem records."""

  def __init__(self, reader, offset, size):
    """Reads size string data offsets at offset, then decodes each string."""
    # First pass: collect the uint offset of every string's data.
    reader.Seek(offset)
    data_offsets = iter([reader.ReadUInt() for _ in range(size)])

    def read_string_item(item_reader):
      # Second pass: each call consumes the next collected offset, in order.
      text = item_reader.ReadString(next(data_offsets))
      return _StringDataItem(len(text), text)

    super().__init__(reader, offset, size, read_string_item)
145
146
class _TypeListItem(_MemoryItemList):
  """A single type list: a uint entry count followed by ushort entries."""

  def __init__(self, reader):
    """Decodes one type list starting at the reader's current position."""
    list_start = reader.Tell()
    # Unlike the other lists, the entry count is stored inline in front of
    # the entries rather than being provided by the header.
    entry_count = reader.ReadUInt()
    entries_start = reader.Tell()

    def read_type_item(item_reader):
      return _TypeItem(item_reader.ReadUShort())

    super().__init__(reader,
                     list_start,
                     entry_count,
                     read_type_item,
                     alignment=4,
                     first_item_offset=entries_start)
161
162
class _TypeListItemList(_MemoryItemList):
  """Item list whose entries are themselves _TypeListItem lists."""

  def __init__(self, reader, offset, size):
    """Decodes size consecutive _TypeListItems starting at offset."""
    super().__init__(reader, offset, size, factory=_TypeListItem)
166
167
class _ClassDefItemList(_MemoryItemList):
  """Item list whose entries are _ClassDefItem records."""

  def __init__(self, reader, offset, size):
    """Decodes size entries at offset, one uint per _ClassDefItem field."""
    reader.Seek(offset)
    field_count = len(_ClassDefItem._fields)

    def read_class_def(item_reader):
      values = [item_reader.ReadUInt() for _ in range(field_count)]
      return _ClassDefItem(*values)

    super().__init__(reader, offset, size, read_class_def)
177
178
class _DexMapItem:
  """One map list entry: a type code, an item count and a file offset."""

  def __init__(self, reader):
    """Decodes a map item from the reader's current position."""
    type_code = reader.ReadUShort()
    # The second ushort of the entry is read only to move past it.
    reader.ReadUShort()
    item_count = reader.ReadUInt()
    item_offset = reader.ReadUInt()

    self.type = type_code
    self.size = item_count
    self.offset = item_offset

  def __repr__(self):
    template = '_DexMapItem(type={}, size={}, offset={:#x})'
    return template.format(self.type, self.size, self.offset)
189
190
class _DexMapList:
  """Map list of a dex file, indexed by item type code."""

  # Full list of type codes:
  # https://source.android.com/devices/tech/dalvik/dex-format#type-codes
  TYPE_TYPE_LIST = 0x1001

  def __init__(self, reader, offset):
    """Decodes the map list located at offset.

    Args:
      reader: _DexReader used for decoding.
      offset: Offset from the start of the file to the map list.
    """
    reader.Seek(offset)
    entry_count = reader.ReadUInt()
    entries = (_DexMapItem(reader) for _ in range(entry_count))

    self._size = entry_count
    # When a type code repeats, the entry decoded last is the one kept.
    self._map = {entry.type: entry for entry in entries}

  def __getitem__(self, key):
    return self._map[key]

  def __contains__(self, key):
    return key in self._map

  def __repr__(self):
    template = '_DexMapList(size={}, items={})'
    return template.format(self._size, self._map)
212
213
class _DexReader:
  """Cursor-based little-endian reader over the raw bytes of a dex file.

  The reader keeps a current position that Seek() sets and the Read*()
  methods advance. No bounds checking or validation is performed.
  """

  def __init__(self, data):
    """Wraps data with the cursor positioned at offset 0.

    Args:
      data: Buffer holding the dex file contents. It must be usable with
        struct.unpack_from and yield ints when indexed (e.g. a bytearray).
    """
    self._data = data
    self._pos = 0

  def Seek(self, offset):
    """Moves the cursor to the absolute position offset."""
    self._pos = offset

  def Tell(self):
    """Returns the current cursor position."""
    return self._pos

  def ReadUByte(self):
    """Reads an unsigned 1-byte integer and advances the cursor."""
    return self._ReadData('<B')

  def ReadUShort(self):
    """Reads an unsigned little-endian 2-byte integer; advances the cursor."""
    return self._ReadData('<H')

  def ReadUInt(self):
    """Reads an unsigned little-endian 4-byte integer; advances the cursor."""
    return self._ReadData('<I')

  def ReadString(self, data_offset):
    """Decodes the string whose uleb128 length prefix sits at data_offset.

    The cursor is moved; it ends up just past the string's terminating byte.

    Args:
      data_offset: Absolute offset of the string's uleb128 length prefix.

    Returns:
      The decoded string.

    Raises:
      _MUTf8DecodeError: If the encoded string data is malformed.
    """
    # string_offset is the number of bytes taken by the length prefix, i.e.
    # the distance from data_offset to the first encoded character.
    string_length, string_offset = self._ReadULeb128(data_offset)
    string_data_offset = string_offset + data_offset
    return self._DecodeMUtf8(string_length, string_data_offset)

  def AlignUpTo(self, align_unit):
    """Advances the cursor to the next multiple of align_unit, if needed."""
    off_by = self._pos % align_unit
    if off_by:
      self.Seek(self._pos + align_unit - off_by)

  def ReadHeader(self):
    """Returns the DexHeader unpacked from the very start of the data.

    The cursor is neither consulted nor moved.
    """
    header_fmt = '<' + ''.join(t[1] for t in _DEX_HEADER_FMT)
    return DexHeader._make(struct.unpack_from(header_fmt, self._data))

  def _ReadData(self, fmt):
    """Unpacks one value of struct format fmt at the cursor and advances it."""
    ret = struct.unpack_from(fmt, self._data, self._pos)[0]
    self._pos += struct.calcsize(fmt)
    return ret

  def _ReadULeb128(self, data_offset):
    """Returns a tuple of (uleb128 value, number of bytes occupied).

    From DWARF3 spec: http://dwarfstd.org/doc/Dwarf3.pdf

    The cursor is neither consulted nor moved.

    Args:
      data_offset: Location of the unsigned LEB128.
    """
    value = 0
    shift = 0
    cur_offset = data_offset
    while True:
      byte = self._data[cur_offset]
      cur_offset += 1
      # The low 7 bits carry payload, least significant group first.
      value |= (byte & 0b01111111) << shift
      # A clear high bit marks the final byte of the encoding.
      if (byte & 0b10000000) == 0:
        break
      shift += 7

    return value, cur_offset - data_offset

  def _DecodeMUtf8(self, string_length, offset):
    """Returns the string located at the specified offset.

    See https://source.android.com/devices/tech/dalvik/dex-format#mutf-8

    Ported from the Android Java implementation:
    https://android.googlesource.com/platform/dalvik/+/fe107fb6e3f308ac5174ebdc5a794ee880c741d9/dx/src/com/android/dex/Mutf8.java#34

    Each encoded sequence of one to three bytes becomes exactly one character
    of the result, so the result always has string_length characters.

    Args:
      string_length: The length of the decoded string.
      offset: Offset to the beginning of the string.

    Raises:
      _MUTf8DecodeError: On an embedded zero byte, a malformed continuation
        byte, an unsupported lead byte, or a missing terminating zero byte.
    """
    self.Seek(offset)
    # Collect the characters and join once at the end; growing a str with
    # repeated += is quadratic for long strings.
    chars = []

    for _ in range(string_length):
      a = self.ReadUByte()
      if a == 0:
        raise _MUTf8DecodeError('Early string termination encountered',
                                string_length, offset)
      if (a & 0x80) == 0x00:
        # Single-byte sequence: 0xxxxxxx.
        code = a
      elif (a & 0xe0) == 0xc0:
        # Two-byte sequence: 110xxxxx 10xxxxxx.
        b = self.ReadUByte()
        if (b & 0xc0) != 0x80:
          raise _MUTf8DecodeError('Error in byte 2', string_length, offset)
        code = ((a & 0x1f) << 6) | (b & 0x3f)
      elif (a & 0xf0) == 0xe0:
        # Three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx.
        b = self.ReadUByte()
        c = self.ReadUByte()
        if (b & 0xc0) != 0x80 or (c & 0xc0) != 0x80:
          raise _MUTf8DecodeError('Error in byte 3 or 4', string_length, offset)
        code = ((a & 0x0f) << 12) | ((b & 0x3f) << 6) | (c & 0x3f)
      else:
        raise _MUTf8DecodeError('Bad byte', string_length, offset)
      chars.append(chr(code))

    if self.ReadUByte() != 0x00:
      raise _MUTf8DecodeError('Expected string termination', string_length,
                              offset)

    return ''.join(chars)
316
317
class _MUTf8DecodeError(Exception):
  """Raised when MUTF-8 string data cannot be decoded."""

  def __init__(self, message, length, offset):
    """Builds the error text from message plus the string's location.

    Args:
      message: Description of what went wrong.
      length: Expected length of the decoded string.
      offset: Offset of the string data; rendered in hexadecimal.
    """
    location = ' (decoded string length: {}, string data offset: {:#x})'.format(
        length, offset)
    super().__init__(message + location)
323
324
class DexFile:
  """In-memory view of one dex file.

  All supported sections are decoded when the object is constructed. The
  format is described at
  https://source.android.com/devices/tech/dalvik/dex-format

  Fields:
    reader: _DexReader object used to decode dex file contents.
    header: DexHeader for this dex file.
    map_list: _DexMapList object containing list of dex file contents.
    type_item_list: _TypeIdItemList containing type_id_items.
    proto_item_list: _ProtoIdItemList containing proto_id_items.
    method_item_list: _MethodIdItemList containing method_id_items.
    string_item_list: _StringItemList containing string_data_items that are
      referenced by index in other sections.
    type_list_item_list: _TypeListItemList containing _TypeListItems.
      _TypeListItems are referenced by their offsets from other dex items.
    class_def_item_list: _ClassDefItemList containing _ClassDefItems.
  """
  # Access flag bit -> flag name. ResolveClassAccessFlags reports names in
  # this insertion order.
  _CLASS_ACCESS_FLAGS = {
      0x1: 'public',
      0x2: 'private',
      0x4: 'protected',
      0x8: 'static',
      0x10: 'final',
      0x200: 'interface',
      0x400: 'abstract',
      0x1000: 'synthetic',
      0x2000: 'annotation',
      0x4000: 'enum',
  }

  def __init__(self, data):
    """Decodes dex file memory sections.

    Args:
      data: bytearray containing the contents of a dex file.
    """
    reader = _DexReader(data)
    header = reader.ReadHeader()
    self.reader = reader
    self.header = header

    self.map_list = _DexMapList(reader, header.map_off)
    self.type_item_list = _TypeIdItemList(reader, header.type_ids_off,
                                          header.type_ids_size)
    self.proto_item_list = _ProtoIdItemList(reader, header.proto_ids_off,
                                            header.proto_ids_size)
    self.method_item_list = _MethodIdItemList(reader, header.method_ids_off,
                                              header.method_ids_size)
    self.string_item_list = _StringItemList(reader, header.string_ids_off,
                                            header.string_ids_size)
    self.class_def_item_list = _ClassDefItemList(reader, header.class_defs_off,
                                                 header.class_defs_size)

    # The type list section is located through the map list; when the map
    # has no such entry, an empty list is built instead.
    if _DexMapList.TYPE_TYPE_LIST in self.map_list:
      type_list_entry = self.map_list[_DexMapList.TYPE_TYPE_LIST]
      type_lists_offset = type_list_entry.offset
      type_lists_count = type_list_entry.size
    else:
      type_lists_offset = 0
      type_lists_count = 0
    self.type_list_item_list = _TypeListItemList(reader, type_lists_offset,
                                                 type_lists_count)

    # Other items refer to a type list by its file offset, so index by that.
    self._type_lists_by_offset = dict(
        (type_list.offset, type_list)
        for type_list in self.type_list_item_list)

  def GetString(self, string_item_idx):
    """Returns the decoded string stored at index string_item_idx."""
    return self.string_item_list[string_item_idx].data

  def GetTypeString(self, type_item_idx):
    """Returns the descriptor string of the type at index type_item_idx."""
    descriptor_idx = self.type_item_list[type_item_idx].descriptor_idx
    return self.GetString(descriptor_idx)

  def GetTypeListStringsByOffset(self, offset):
    """Returns the type strings of the type list stored at offset.

    A falsy offset (0 or None) stands for "no type list" and yields an empty
    tuple.
    """
    if offset:
      members = self._type_lists_by_offset[offset]
      return tuple(self.GetTypeString(member.type_idx) for member in members)
    return ()

  @staticmethod
  def ResolveClassAccessFlags(access_flags):
    """Returns a tuple of the names of all flags set in access_flags."""
    names = []
    for bit, flag_name in DexFile._CLASS_ACCESS_FLAGS.items():
      if bit & access_flags:
        names.append(flag_name)
    return tuple(names)

  def IterMethodSignatureParts(self):
    """Yields the string components of dex methods in a dex file.

    Yields:
      Tuples that look like:
        (class name, return type, method name, (parameter type, ...)).
    """
    for method in self.method_item_list:
      owner = self.GetTypeString(method.type_idx)
      method_name = self.GetString(method.name_idx)
      proto = self.proto_item_list[method.proto_idx]
      return_type = self.GetTypeString(proto.return_type_idx)
      parameters = self.GetTypeListStringsByOffset(proto.parameters_off)
      yield (owner, return_type, method_name, parameters)

  def __repr__(self):
    sections = (
        self.header,
        self.map_list,
        self.type_item_list,
        self.proto_item_list,
        self.method_item_list,
        self.string_item_list,
        self.type_list_item_list,
        self.class_def_item_list,
    )
    return '\n'.join(map(str, sections))
443
444
class _DumpCommand:
  """Base class for commands that dump part of a dex file."""

  def __init__(self, dexfile):
    """Stores dexfile for use by Run()."""
    self._dexfile = dexfile

  def Run(self):
    """Performs the dump. Not implemented here; subclasses override it."""
    raise NotImplementedError
451
452
class _DumpMethods(_DumpCommand):
  """Dump command that prints one line per method in the dex file."""

  def Run(self):
    """Prints class, method name, return type and parameters per method."""
    line_template = '{} {} (return type={}, parameters={})'
    for signature in self._dexfile.IterMethodSignatureParts():
      class_type, return_type, method_name, parameter_types = signature
      line = line_template.format(class_type, method_name, return_type,
                                  parameter_types)
      print(line)
459
460
class _DumpStrings(_DumpCommand):
  """Dump command that prints every string in the dex file as UTF-8 bytes."""

  def Run(self):
    """Prints the UTF-8 encoding of each string, one bytes literal per line."""
    for string_item in self._dexfile.string_item_list:
      # Some strings are likely to be non-ascii (vs. methods/classes).
      #
      # MUTF-8 stores characters above U+FFFF as surrogate pairs, which the
      # decoder surfaces as two separate surrogate code points. A plain
      # encode('utf-8') raises UnicodeEncodeError on those. Round-tripping
      # through UTF-16 fuses each valid pair into a single character, and
      # 'surrogatepass' lets any unpaired surrogate through instead of
      # aborting the whole dump.
      text = string_item.data.encode('utf-16-le', 'surrogatepass').decode(
          'utf-16-le', 'surrogatepass')
      print(text.encode('utf-8', 'surrogatepass'))
466
467
class _DumpClasses(_DumpCommand):
  """Dump command that prints one line per class definition."""

  # Value the dex format uses in index fields to mean "no index". See
  # https://source.android.com/devices/tech/dalvik/dex-format#class-def-item
  _NO_INDEX = 0xffffffff

  def Run(self):
    """Prints name, superclass, interfaces and access flags of each class."""
    for class_item in self._dexfile.class_def_item_list:
      class_string = self._dexfile.GetTypeString(class_item.class_idx)
      # A root class such as java.lang.Object has no superclass and stores
      # NO_INDEX here. Resolving that as a type index would raise
      # IndexError, so report None instead.
      if class_item.superclass_idx == self._NO_INDEX:
        superclass_string = None
      else:
        superclass_string = self._dexfile.GetTypeString(
            class_item.superclass_idx)
      interfaces = self._dexfile.GetTypeListStringsByOffset(
          class_item.interfaces_off)
      access_flags = DexFile.ResolveClassAccessFlags(class_item.access_flags)
      print('{} (superclass={}, interfaces={}, access_flags={})'.format(
          class_string, superclass_string, interfaces, access_flags))
478
479
class _DumpSummary(_DumpCommand):
  """Dump command that prints the dex file's own string representation."""

  def Run(self):
    """Prints str() of the wrapped dex file."""
    summary = str(self._dexfile)
    print(summary)
483
484
def _DumpDexItems(dexfile_data, name, item):
  """Parses one dex file and dumps the requested kind of item to stdout.

  Args:
    dexfile_data: Raw contents of the dex file.
    name: Name of the dex file, used only in the announcement line.
    item: What to dump; one of 'summary', 'methods', 'strings', 'classes'.

  Raises:
    KeyError: If item is not one of the supported values.
    IOError: If writing the output fails for any reason other than a closed
      pipe.
  """
  dexfile = DexFile(bytearray(dexfile_data))
  print('dex_parser: Dumping {} for {}'.format(item, name))
  cmds = {
      'summary': _DumpSummary,
      'methods': _DumpMethods,
      'strings': _DumpStrings,
      'classes': _DumpClasses,
  }
  try:
    cmds[item](dexfile).Run()
  except IOError as e:
    # A closed pipe most likely means the output is piped to "less" and the
    # user quit early, so it is ignored. Every other I/O error is a real
    # failure and must not be silently swallowed.
    if e.errno != errno.EPIPE:
      raise
500
501
def main():
  """Command line entry point: dumps dex contents of the input to stdout.

  The input is treated as an archive when its extension is .apk, .jar, .zip
  or .aab, in which case every contained classes*.dex entry is dumped.
  Otherwise the input is read as a single dex file. Exits with status 1 when
  an archive contains no dex entries.
  """
  parser = argparse.ArgumentParser(description='Dump dex contents to stdout.')
  parser.add_argument(
      'input', help='Input (.dex, .jar, .zip, .aab, .apk) file path.')
  parser.add_argument(
      'item',
      nargs='?',
      default='summary',
      choices=('methods', 'strings', 'classes', 'summary'),
      help='Item to dump')
  args = parser.parse_args()

  extension = os.path.splitext(args.input)[1]
  if extension not in ('.apk', '.jar', '.zip', '.aab'):
    # Not an archive: the input itself is the dex file.
    with open(args.input, 'rb') as f:
      _DumpDexItems(f.read(), args.input, args.item)
    return

  dex_entry_pattern = re.compile(r'.*classes[0-9]*\.dex$')
  with zipfile.ZipFile(args.input) as z:
    dex_file_paths = [
        entry for entry in z.namelist() if dex_entry_pattern.match(entry)
    ]
    if not dex_file_paths:
      print('Error: {} does not contain any classes.dex files'.format(
          args.input))
      sys.exit(1)

    for path in dex_file_paths:
      _DumpDexItems(z.read(path), path, args.item)


if __name__ == '__main__':
  main()
533