xref: /aosp_15_r20/external/cronet/build/android/gyp/create_unwind_table.py (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1#!/usr/bin/env python3
2# Copyright 2021 The Chromium Authors
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5"""Creates a table of unwind information in Android Chrome's bespoke format."""
6
7import abc
8import argparse
9import collections
10import enum
11import json
12import logging
13import re
14import struct
15import subprocess
16import sys
17from typing import (Dict, Iterable, List, NamedTuple, Sequence, TextIO, Tuple,
18                    Union)
19
20from util import build_utils
21
22_STACK_CFI_INIT_REGEX = re.compile(
23    r'^STACK CFI INIT ([0-9a-f]+) ([0-9a-f]+) (.+)$')
24_STACK_CFI_REGEX = re.compile(r'^STACK CFI ([0-9a-f]+) (.+)$')
25
26
27class AddressCfi(NamedTuple):
28  """Record representing CFI for an address within a function.
29
30  Represents the Call Frame Information required to unwind from an address in a
31  function.
32
33  Attributes:
34      address: The address.
35      unwind_instructions: The unwind instructions for the address.
36
37  """
38  address: int
39  unwind_instructions: str
40
41
42class FunctionCfi(NamedTuple):
43  """Record representing CFI for a function.
44
45  Note: address_cfi[0].address is the start address of the function.
46
47  Attributes:
48      size: The function size in bytes.
49      address_cfi: The CFI at each address in the function.
50
51  """
52  size: int
53  address_cfi: Tuple[AddressCfi, ...]
54
55
56def FilterToNonTombstoneCfi(stream: TextIO) -> Iterable[str]:
57  """Generates non-tombstone STACK CFI lines from the stream.
58
59  STACK CFI functions with address 0 correspond are a 'tombstone' record
60  associated with dead code and can be ignored. See
61  https://bugs.llvm.org/show_bug.cgi?id=47148#c2.
62
63  Args:
64      stream: A file object.
65
66  Returns:
67      An iterable over the non-tombstone STACK CFI lines in the stream.
68  """
69  in_tombstone_function = False
70  for line in stream:
71    if not line.startswith('STACK CFI '):
72      continue
73
74    if line.startswith('STACK CFI INIT 0 '):
75      in_tombstone_function = True
76    elif line.startswith('STACK CFI INIT '):
77      in_tombstone_function = False
78
79    if not in_tombstone_function:
80      yield line
81
82
83def ReadFunctionCfi(stream: TextIO) -> Iterable[FunctionCfi]:
84  """Generates FunctionCfi records from the stream.
85
86  Args:
87      stream: A file object.
88
89  Returns:
90      An iterable over FunctionCfi corresponding to the non-tombstone STACK CFI
91      lines in the stream.
92  """
93  current_function_address = None
94  current_function_size = None
95  current_function_address_cfi = []
96  for line in FilterToNonTombstoneCfi(stream):
97    cfi_init_match = _STACK_CFI_INIT_REGEX.search(line)
98    if cfi_init_match:
99      # Function CFI with address 0 are tombstone entries per
100      # https://bugs.llvm.org/show_bug.cgi?id=47148#c2 and should have been
101      # filtered in `FilterToNonTombstoneCfi`.
102      assert current_function_address != 0
103      if (current_function_address is not None
104          and current_function_size is not None):
105        yield FunctionCfi(current_function_size,
106                          tuple(current_function_address_cfi))
107      current_function_address = int(cfi_init_match.group(1), 16)
108      current_function_size = int(cfi_init_match.group(2), 16)
109      current_function_address_cfi = [
110          AddressCfi(int(cfi_init_match.group(1), 16), cfi_init_match.group(3))
111      ]
112    else:
113      cfi_match = _STACK_CFI_REGEX.search(line)
114      assert cfi_match
115      current_function_address_cfi.append(
116          AddressCfi(int(cfi_match.group(1), 16), cfi_match.group(2)))
117
118  assert current_function_address is not None
119  assert current_function_size is not None
120  yield FunctionCfi(current_function_size, tuple(current_function_address_cfi))
121
122
123def EncodeAsBytes(*values: int) -> bytes:
124  """Encodes the argument ints as bytes.
125
126  This function validates that the inputs are within the range that can be
127  represented as bytes.
128
129  Args:
130    values: Integers in range [0, 255].
131
132  Returns:
133    The values encoded as bytes.
134  """
135  for i, value in enumerate(values):
136    if not 0 <= value <= 255:
137      raise ValueError('value = %d out of bounds at byte %d' % (value, i))
138  return bytes(values)
139
140
141def Uleb128Encode(value: int) -> bytes:
142  """Encodes the argument int to ULEB128 format.
143
144  Args:
145    value: Unsigned integer.
146
147  Returns:
148    The values encoded as ULEB128 bytes.
149  """
150  if value < 0:
151    raise ValueError(f'Cannot uleb128 encode negative value ({value}).')
152
153  uleb128_bytes = []
154  done = False
155  while not done:
156    value, lowest_seven_bits = divmod(value, 0x80)
157    done = value == 0
158    uleb128_bytes.append(lowest_seven_bits | (0x80 if not done else 0x00))
159  return EncodeAsBytes(*uleb128_bytes)
160
161
162def EncodeStackPointerUpdate(offset: int) -> bytes:
163  """Encodes a stack pointer update as arm unwind instructions.
164
165  Args:
166    offset: Offset to apply on stack pointer. Should be in range [-0x204, inf).
167
168  Returns:
169    A list of arm unwind instructions as bytes.
170  """
171  assert offset % 4 == 0
172
173  abs_offset = abs(offset)
174  instruction_code = 0b01000000 if offset < 0 else 0b00000000
175  if 0x04 <= abs_offset <= 0x200:
176    instructions = [
177        # vsp = vsp + (xxxxxx << 2) + 4. Covers range 0x04-0x100 inclusive.
178        instruction_code | ((min(abs_offset, 0x100) - 4) >> 2)
179    ]
180    # For vsp increments of 0x104-0x200 we use 00xxxxxx twice.
181    if abs_offset >= 0x104:
182      instructions.append(instruction_code | ((abs_offset - 0x100 - 4) >> 2))
183    try:
184      return EncodeAsBytes(*instructions)
185    except ValueError as e:
186      raise RuntimeError('offset = %d produced out of range value' %
187                         offset) from e
188  else:
189    # This only encodes positive sp movement.
190    assert offset > 0, offset
191    return EncodeAsBytes(0b10110010  # vsp = vsp + 0x204 + (uleb128 << 2)
192                         ) + Uleb128Encode((offset - 0x204) >> 2)
193
194
195def EncodePop(registers: Sequence[int]) -> bytes:
196  """Encodes popping of a sequence of registers as arm unwind instructions.
197
198  Args:
199    registers: Collection of target registers to accept values popped from
200      stack. Register value order in the sequence does not matter. Values are
201      popped based on register index order.
202
203  Returns:
204    A list of arm unwind instructions as bytes.
205  """
206  assert all(
207      r in range(4, 16)
208      for r in registers), f'Can only pop r4 ~ r15. Registers:\n{registers}.'
209  assert len(registers) > 0, 'Register sequence cannot be empty.'
210
211  instructions: List[int] = []
212
213  # Check if the pushed registers are continuous set starting from r4 (and
214  # ending prior to r12). This scenario has its own encoding.
215  pop_lr = 14 in registers
216  non_lr_registers = [r for r in registers if r != 14]
217  non_lr_registers_continuous_from_r4 = \
218    sorted(non_lr_registers) == list(range(4, 4 + len(non_lr_registers)))
219
220  if (pop_lr and 0 < len(non_lr_registers) <= 8
221      and non_lr_registers_continuous_from_r4):
222    instructions.append(0b10101000
223                        | (len(non_lr_registers) - 1)  # Pop r4-r[4+nnn], r14.
224                        )
225  else:
226    register_bits = 0
227    for register in registers:
228      register_bits |= 1 << register
229    register_bits = register_bits >> 4  # Skip r0 ~ r3.
230    instructions.extend([
231        # Pop up to 12 integer registers under masks {r15-r12}, {r11-r4}.
232        0b10000000 | (register_bits >> 8),
233        register_bits & 0xff
234    ])
235
236  return EncodeAsBytes(*instructions)
237
238
239class UnwindType(enum.Enum):
240  """
241  The type of unwind action to perform.
242  """
243
244  # Use lr as the return address.
245  RETURN_TO_LR = 1
246
247  # Increment or decrement the stack pointer and/or pop registers (r4 ~ r15).
248  # If both, the increment/decrement occurs first.
249  UPDATE_SP_AND_OR_POP_REGISTERS = 2
250
251  # Restore the stack pointer from a register then increment/decrement the stack
252  # pointer.
253  RESTORE_SP_FROM_REGISTER = 3
254
255  # No action necessary. Used for floating point register pops.
256  NO_ACTION = 4
257
258
259class AddressUnwind(NamedTuple):
260  """Record representing unwind information for an address within a function.
261
262  Attributes:
263      address_offset: The offset of the address from the start of the function.
264      unwind_type: The type of unwind to perform from the address.
265      sp_offset: The offset to apply to the stack pointer.
266      registers: The registers involved in the unwind.
267  """
268  address_offset: int
269  unwind_type: UnwindType
270  sp_offset: int
271  registers: Tuple[int, ...]
272
273
274class FunctionUnwind(NamedTuple):
275  """Record representing unwind information for a function.
276
277  Attributes:
278      address: The address of the function.
279      size: The function size in bytes.
280      address_unwind_info: The unwind info at each address in the function.
281  """
282
283  address: int
284  size: int
285  address_unwinds: Tuple[AddressUnwind, ...]
286
287
288def EncodeAddressUnwind(address_unwind: AddressUnwind) -> bytes:
289  """Encodes an `AddressUnwind` object as arm unwind instructions.
290
291  Args:
292    address_unwind: Record representing unwind information for an address within
293      a function.
294
295  Returns:
296    A list of arm unwind instructions as bytes.
297  """
298  if address_unwind.unwind_type == UnwindType.RETURN_TO_LR:
299    return EncodeAsBytes(0b10110000)  # Finish.
300  if address_unwind.unwind_type == UnwindType.UPDATE_SP_AND_OR_POP_REGISTERS:
301    return ((EncodeStackPointerUpdate(address_unwind.sp_offset)
302             if address_unwind.sp_offset else b'') +
303            (EncodePop(address_unwind.registers)
304             if address_unwind.registers else b''))
305
306  if address_unwind.unwind_type == UnwindType.RESTORE_SP_FROM_REGISTER:
307    assert len(address_unwind.registers) == 1
308    return (EncodeAsBytes(0b10010000
309                          | address_unwind.registers[0]  # Set vsp = r[nnnn].
310                          ) +
311            (EncodeStackPointerUpdate(address_unwind.sp_offset)
312             if address_unwind.sp_offset else b''))
313
314  if address_unwind.unwind_type == UnwindType.NO_ACTION:
315    return b''
316
317  assert False, 'unknown unwind type'
318  return b''
319
320
321class UnwindInstructionsParser(abc.ABC):
322  """Base class for parsers of breakpad unwind instruction sequences.
323
324  Provides regexes matching breakpad instruction sequences understood by the
325  parser, and parsing of the sequences from the regex match.
326  """
327
328  @abc.abstractmethod
329  def GetBreakpadInstructionsRegex(self) -> re.Pattern:
330    pass
331
332  @abc.abstractmethod
333  def ParseFromMatch(self, address_offset: int, cfa_sp_offset: int,
334                     match: re.Match) -> Tuple[AddressUnwind, int]:
335    """Returns the regex matching the breakpad instructions.
336
337    Args:
338      address_offset: Offset from function start address.
339      cfa_sp_offset: CFA stack pointer offset.
340
341    Returns:
342      The unwind info for the address plus the new cfa_sp_offset.
343    """
344
345
346class NullParser(UnwindInstructionsParser):
347  """Translates the state before any instruction has been executed."""
348
349  regex = re.compile(r'^\.cfa: sp 0 \+ \.ra: lr$')
350
351  def GetBreakpadInstructionsRegex(self) -> re.Pattern:
352    return self.regex
353
354  def ParseFromMatch(self, address_offset: int, cfa_sp_offset: int,
355                     match: re.Match) -> Tuple[AddressUnwind, int]:
356    return AddressUnwind(address_offset, UnwindType.RETURN_TO_LR, 0, ()), 0
357
358
359class PushOrSubSpParser(UnwindInstructionsParser):
360  """Translates unwinds from push or sub sp, #constant instructions."""
361
362  # We expect at least one of the three outer groups to be non-empty. Cases:
363  #
364  # Standard prologue pushes.
365  #   Match the first two and optionally the third.
366  #
367  # Standard prologue sub sp, #constant.
368  #   Match only the first.
369  #
370  # Pushes in dynamic stack allocation functions after saving sp.
371  #   Match only the third since they don't alter the stack pointer or store the
372  #   return address.
373  #
374  # Leaf functions that use callee-save registers.
375  #   Match the first and third but not the second.
376  regex = re.compile(r'^(?:\.cfa: sp (\d+) \+ ?)?'
377                     r'(?:\.ra: \.cfa (-\d+) \+ \^ ?)?'
378                     r'((?:r\d+: \.cfa -\d+ \+ \^ ?)*)$')
379
380  # 'r' followed by digits, with 'r' matched via positive lookbehind so only the
381  # number appears in the match.
382  register_regex = re.compile('(?<=r)(\d+)')
383
384  def GetBreakpadInstructionsRegex(self) -> re.Pattern:
385    return self.regex
386
387  def ParseFromMatch(self, address_offset: int, cfa_sp_offset: int,
388                     match: re.Match) -> Tuple[AddressUnwind, int]:
389    # The group will be None if the outer non-capturing groups for the(\d+) and
390    # (-\d+) expressions are not matched.
391    new_cfa_sp_offset, ra_cfa_offset = (int(group) if group else None
392                                        for group in match.groups()[:2])
393
394    # Registers are pushed in reverse order by register number so are popped in
395    # order. Sort them to ensure the proper order.
396    registers = sorted([
397        int(register)
398        for register in self.register_regex.findall(match.group(3))
399        # `UpdateSpAndOrPopRegisters` only supports popping of register
400        # r4 ~ r15. The ignored registers are translated to sp increments by
401        # the following calculation on `sp_offset`.
402        if int(register) in range(4, 16)
403    ] +
404                       # Also pop lr (ra in breakpad terms) if it was stored.
405                       ([14] if ra_cfa_offset is not None else []))
406
407    sp_offset = 0
408    if new_cfa_sp_offset is not None:
409      sp_offset = new_cfa_sp_offset - cfa_sp_offset
410      assert sp_offset % 4 == 0
411      if sp_offset >= len(registers) * 4:
412        # Handles the sub sp, #constant case, and push instructions that push
413        # caller-save registers r0-r3 which don't get encoded in the unwind
414        # instructions. In the latter case we need to move the stack pointer up
415        # to the first pushed register.
416        sp_offset -= len(registers) * 4
417
418    return AddressUnwind(address_offset,
419                         UnwindType.UPDATE_SP_AND_OR_POP_REGISTERS, sp_offset,
420                         tuple(registers)), new_cfa_sp_offset or cfa_sp_offset
421
422
423class VPushParser(UnwindInstructionsParser):
424  # VPushes that occur in dynamic stack allocation functions after storing the
425  # stack pointer don't change the stack pointer or push any register that we
426  # care about. The first group will not match in those cases.
427  #
428  # Breakpad doesn't seem to understand how to name the floating point
429  # registers so calls them unnamed_register.
430  regex = re.compile(r'^(?:\.cfa: sp (\d+) \+ )?'
431                     r'(?:unnamed_register\d+: \.cfa -\d+ \+ \^ ?)+$')
432
433  def GetBreakpadInstructionsRegex(self) -> re.Pattern:
434    return self.regex
435
436  def ParseFromMatch(self, address_offset: int, cfa_sp_offset: int,
437                     match: re.Match) -> Tuple[AddressUnwind, int]:
438    # `match.group(1)`, which corresponds to the (\d+) expression, will be None
439    # if the first outer non-capturing group is not matched.
440    new_cfa_sp_offset = int(match.group(1)) if match.group(1) else None
441    if new_cfa_sp_offset is None:
442      return (AddressUnwind(address_offset, UnwindType.NO_ACTION, 0,
443                            ()), cfa_sp_offset)
444
445    sp_offset = new_cfa_sp_offset - cfa_sp_offset
446    assert sp_offset % 4 == 0
447    return AddressUnwind(address_offset,
448                         UnwindType.UPDATE_SP_AND_OR_POP_REGISTERS, sp_offset,
449                         ()), new_cfa_sp_offset
450
451
452class StoreSpParser(UnwindInstructionsParser):
453  regex = re.compile(r'^\.cfa: r(\d+) (\d+) \+$')
454
455  def GetBreakpadInstructionsRegex(self) -> re.Pattern:
456    return self.regex
457
458  def ParseFromMatch(self, address_offset: int, cfa_sp_offset: int,
459                     match: re.Match) -> Tuple[AddressUnwind, int]:
460    register = int(match.group(1))
461    new_cfa_sp_offset = int(match.group(2))
462    sp_offset = new_cfa_sp_offset - cfa_sp_offset
463    assert sp_offset % 4 == 0
464    return AddressUnwind(address_offset, UnwindType.RESTORE_SP_FROM_REGISTER,
465                         sp_offset, (register, )), new_cfa_sp_offset
466
467
468def EncodeUnwindInstructionTable(complete_instruction_sequences: Iterable[bytes]
469                                 ) -> Tuple[bytes, Dict[bytes, int]]:
470  """Encodes the unwind instruction table.
471
472  Deduplicates the encoded unwind instruction sequences. Generates the table and
473  a dictionary mapping a function to its starting index in the table.
474
475  The instruction table is used by the unwinder to provide the sequence of
476  unwind instructions to execute for each function, separated by offset
477  into the function.
478
479  Args:
480    complete_instruction_sequences: An iterable of encoded unwind instruction
481      sequences. The sequences represent the series of unwind instructions to
482      execute corresponding to offsets within each function.
483
484  Returns:
485    A tuple containing:
486    - The unwind instruction table as bytes.
487    - The mapping from the instruction sequence to the offset in the unwind
488      instruction table. This mapping is used to construct the function offset
489      table, which references entries in the unwind instruction table.
490  """
491  # As the function offset table uses variable length number encoding (uleb128),
492  # which means smaller number uses fewer bytes to represent, we should sort
493  # the unwind instruction table by number of references from the function
494  # offset table in order to minimize the size of the function offset table.
495  ref_counts: Dict[bytes, int] = collections.defaultdict(int)
496  for sequence in complete_instruction_sequences:
497    ref_counts[sequence] += 1
498
499  def ComputeScore(sequence):
500    """ Score for each sequence is computed as  ref_count / size_of_sequence.
501
502    According to greedy algorithm, items with higher value / space cost ratio
503    should be prioritized. Here value is bytes saved in the function offset
504    table, represetned by ref_count. Space cost is the space taken in the
505    unwind instruction table, represented by size_of_sequence.
506
507    Note: In order to ensure build-time determinism, `sequence` is also returned
508    to resolve sorting order when scores are the same.
509    """
510    return ref_counts[sequence] / len(sequence), sequence
511
512  ordered_sequences = sorted(ref_counts.keys(), key=ComputeScore, reverse=True)
513  offsets: Dict[bytes, int] = {}
514  current_offset = 0
515  for sequence in ordered_sequences:
516    offsets[sequence] = current_offset
517    current_offset += len(sequence)
518  return b''.join(ordered_sequences), offsets
519
520
521class EncodedAddressUnwind(NamedTuple):
522  """Record representing unwind information for an address within a function.
523
524  This structure represents the same concept as `AddressUnwind`. The only
525  difference is that how to unwind from the address is represented as
526  encoded ARM unwind instructions.
527
528  Attributes:
529    address_offset: The offset of the address from the start address of the
530      function.
531    complete_instruction_sequence: The full ARM unwind instruction sequence to
532      unwind from the `address_offset`.
533  """
534  address_offset: int
535  complete_instruction_sequence: bytes
536
537
538def EncodeAddressUnwinds(address_unwinds: Tuple[AddressUnwind, ...]
539                         ) -> Tuple[EncodedAddressUnwind, ...]:
540  """Encodes the unwind instructions and offset for the addresses within a
541  function.
542
543  Args:
544    address_unwinds: A tuple of unwind state for addresses within a function.
545
546  Returns:
547    The encoded unwind instructions and offsets for the addresses within a
548    function, ordered by decreasing offset.
549  """
550  sorted_address_unwinds: List[AddressUnwind] = sorted(
551      address_unwinds,
552      key=lambda address_unwind: address_unwind.address_offset,
553      reverse=True)
554  unwind_instructions: List[bytes] = [
555      EncodeAddressUnwind(address_unwind)
556      for address_unwind in sorted_address_unwinds
557  ]
558
559  # A complete instruction sequence contains all the unwind instructions
560  # necessary to unwind from an offset within a function. For a given offset
561  # this includes the offset's instructions plus the instructions for all
562  # earlier offsets. The offsets are stored in reverse order, hence the i:
563  # range rather than :i+1.
564  complete_instruction_sequences = [
565      b''.join(unwind_instructions[i:]) for i in range(len(unwind_instructions))
566  ]
567
568  encoded_unwinds: List[EncodedAddressUnwind] = []
569  for address_unwind, sequence in zip(sorted_address_unwinds,
570                                      complete_instruction_sequences):
571    encoded_unwinds.append(
572        EncodedAddressUnwind(address_unwind.address_offset, sequence))
573  return tuple(encoded_unwinds)
574
575
576class EncodedFunctionUnwind(NamedTuple):
577  """Record representing unwind information for a function.
578
579  This structure represents the same concept as `FunctionUnwind`, but with
580  some differences:
581  - Attribute `address` is split into 2 attributes: `page_number` and
582    `page_offset`.
583  - Attribute `size` is dropped.
584  - Attribute `address_unwinds` becomes a collection of `EncodedAddressUnwind`s,
585    instead of a collection of `AddressUnwind`s.
586
587  Attributes:
588    page_number: The upper bits (17 ~ 31bits) of byte offset from text section
589      start.
590    page_offset: The lower bits (1 ~ 16bits) of instruction offset from text
591      section start.
592    address_unwinds: A collection of `EncodedAddressUnwind`s.
593
594  """
595
596  page_number: int
597  page_offset: int
598  address_unwinds: Tuple[EncodedAddressUnwind, ...]
599
600
601# The trivial unwind is defined as a single `RETURN_TO_LR` instruction
602# at the start of the function.
603TRIVIAL_UNWIND: Tuple[EncodedAddressUnwind, ...] = EncodeAddressUnwinds(
604    (AddressUnwind(address_offset=0,
605                   unwind_type=UnwindType.RETURN_TO_LR,
606                   sp_offset=0,
607                   registers=()), ))
608
609# The refuse to unwind filler unwind is used to fill the invalid space
610# before the first function in the first page and after the last function
611# in the last page.
612REFUSE_TO_UNWIND: Tuple[EncodedAddressUnwind, ...] = (EncodedAddressUnwind(
613    address_offset=0,
614    complete_instruction_sequence=bytes([0b10000000, 0b00000000])), )
615
616
617def EncodeFunctionUnwinds(function_unwinds: Iterable[FunctionUnwind],
618                          text_section_start_address: int
619                          ) -> Iterable[EncodedFunctionUnwind]:
620  """Encodes the unwind state for all functions defined in the binary.
621
622  This function
623  - sorts the collection of `FunctionUnwind`s by address.
624  - fills in gaps between functions with trivial unwind.
625  - fills the space in the last page after last function with refuse to unwind.
626  - fills the space in the first page before the first function with refuse
627    to unwind.
628
629  Args:
630    function_unwinds: An iterable of function unwind states.
631    text_section_start_address: The address of .text section in ELF file.
632
633  Returns:
634    The encoded function unwind states with no gaps between functions, ordered
635    by ascending address.
636  """
637
638  def GetPageNumber(address: int) -> int:
639    """Calculates the page number.
640
641    Page number is calculated as byte_offset_from_text_section_start >> 17,
642    i.e. the upper bits (17 ~ 31bits) of byte offset from text section start.
643    """
644    return (address - text_section_start_address) >> 17
645
646  def GetPageOffset(address: int) -> int:
647    """Calculates the page offset.
648
649    Page offset is calculated as (byte_offset_from_text_section_start >> 1)
650    & 0xffff, i.e. the lower bits (1 ~ 16bits) of instruction offset from
651    text section start.
652    """
653    return ((address - text_section_start_address) >> 1) & 0xffff
654
655  sorted_function_unwinds: List[FunctionUnwind] = sorted(
656      function_unwinds, key=lambda function_unwind: function_unwind.address)
657
658  if sorted_function_unwinds[0].address > text_section_start_address:
659    yield EncodedFunctionUnwind(page_number=0,
660                                page_offset=0,
661                                address_unwinds=REFUSE_TO_UNWIND)
662
663  prev_func_end_address: int = sorted_function_unwinds[0].address
664
665  gaps = 0
666  for unwind in sorted_function_unwinds:
667    assert prev_func_end_address <= unwind.address, (
668        'Detected overlap between functions.')
669
670    if prev_func_end_address < unwind.address:
671      # Gaps between functions are typically filled by regions of thunks which
672      # do not alter the stack pointer. Filling these gaps with TRIVIAL_UNWIND
673      # is the appropriate unwind strategy.
674      gaps += 1
675      yield EncodedFunctionUnwind(GetPageNumber(prev_func_end_address),
676                                  GetPageOffset(prev_func_end_address),
677                                  TRIVIAL_UNWIND)
678
679    yield EncodedFunctionUnwind(GetPageNumber(unwind.address),
680                                GetPageOffset(unwind.address),
681                                EncodeAddressUnwinds(unwind.address_unwinds))
682
683    prev_func_end_address = unwind.address + unwind.size
684
685  if GetPageOffset(prev_func_end_address) != 0:
686    yield EncodedFunctionUnwind(GetPageNumber(prev_func_end_address),
687                                GetPageOffset(prev_func_end_address),
688                                REFUSE_TO_UNWIND)
689
690  logging.info('%d/%d gaps between functions filled with trivial unwind.', gaps,
691               len(sorted_function_unwinds))
692
693
694def EncodeFunctionOffsetTable(
695    encoded_address_unwind_sequences: Iterable[
696        Tuple[EncodedAddressUnwind, ...]],
697    unwind_instruction_table_offsets: Dict[bytes, int]
698) -> Tuple[bytes, Dict[Tuple[EncodedAddressUnwind, ...], int]]:
699  """Encodes the function offset table.
700
701  The function offset table maps local instruction offset from function
702  start to the location in the unwind instruction table.
703
704  Args:
705    encoded_address_unwind_sequences: An iterable of encoded address unwind
706      sequences.
707    unwind_instruction_table_offsets: The offset mapping returned from
708      `EncodeUnwindInstructionTable`.
709
710  Returns:
711    A tuple containing:
712    - The function offset table as bytes.
713    - The mapping from the `EncodedAddressUnwind`s to the offset in the function
714      offset table. This mapping is used to construct the function table, which
715      references entries in the function offset table.
716  """
717  function_offset_table = bytearray()
718  offsets: Dict[Tuple[EncodedAddressUnwind, ...], int] = {}
719
720  for sequence in encoded_address_unwind_sequences:
721    if sequence in offsets:
722      continue
723
724    offsets[sequence] = len(function_offset_table)
725    for address_offset, complete_instruction_sequence in sequence:
726      # Note: address_offset is the number of bytes from one address to another,
727      # while the instruction_offset is the number of 2-byte instructions
728      # from one address to another.
729      instruction_offset = address_offset >> 1
730      function_offset_table += (
731          Uleb128Encode(instruction_offset) + Uleb128Encode(
732              unwind_instruction_table_offsets[complete_instruction_sequence]))
733
734  return bytes(function_offset_table), offsets
735
736
737def EncodePageTableAndFunctionTable(
738    function_unwinds: Iterable[EncodedFunctionUnwind],
739    function_offset_table_offsets: Dict[Tuple[EncodedAddressUnwind, ...], int]
740) -> Tuple[bytes, bytes]:
741  """Encode page table and function table as bytes.
742
743  Page table:
744  A table that contains the mapping from page_number to the location of the
745  entry for the first function on the page in the function table.
746
747  Function table:
748  A table that contains the mapping from page_offset to the location of an entry
749  in the function offset table.
750
751  Args:
752    function_unwinds: All encoded function unwinds in the module.
753    function_offset_table_offsets: The offset mapping returned from
754      `EncodeFunctionOffsetTable`.
755
756  Returns:
757    A tuple containing:
758    - The page table as bytes.
759    - The function table as bytes.
760  """
761  page_function_unwinds: Dict[
762      int, List[EncodedFunctionUnwind]] = collections.defaultdict(list)
763  for function_unwind in function_unwinds:
764    page_function_unwinds[function_unwind.page_number].append(function_unwind)
765
766  raw_page_table: List[int] = []
767  function_table = bytearray()
768
769  for page_number, same_page_function_unwinds in sorted(
770      page_function_unwinds.items(), key=lambda item: item[0]):
771    # Pad empty pages.
772    # Empty pages can occur when a function spans over multiple pages.
773    # Example:
774    # A page table with a starting function that spans 3 over pages.
775    # page_table:
776    # [0, 1, 1, 1]
777    # function_table:
778    # [
779    #   # Page 0
780    #   (0, 20) # This function spans from page 0 offset 0 to page 3 offset 5.
781    #   # Page 1 is empty.
782    #   # Page 2 is empty.
783    #   # Page 3
784    #   (6, 70)
785    # ]
786    assert page_number > len(raw_page_table) - 1
787    number_of_empty_pages = page_number - len(raw_page_table)
788    # The function table is represented as `base::FunctionTableEntry[]`,
789    # where `base::FunctionTableEntry` is 4 bytes.
790    function_table_index = len(function_table) // 4
791    raw_page_table.extend([function_table_index] * (number_of_empty_pages + 1))
792    assert page_number == len(raw_page_table) - 1
793
794    for function_unwind in sorted(
795        same_page_function_unwinds,
796        key=lambda function_unwind: function_unwind.page_offset):
797      function_table += struct.pack(
798          'HH', function_unwind.page_offset,
799          function_offset_table_offsets[function_unwind.address_unwinds])
800
801  page_table = struct.pack(f'{len(raw_page_table)}I', *raw_page_table)
802
803  return page_table, bytes(function_table)
804
805
806ALL_PARSERS: Tuple[UnwindInstructionsParser, ...] = (
807    NullParser(),
808    PushOrSubSpParser(),
809    StoreSpParser(),
810    VPushParser(),
811)
812
813
814def ParseAddressCfi(address_cfi: AddressCfi, function_start_address: int,
815                    parsers: Tuple[UnwindInstructionsParser, ...],
816                    prev_cfa_sp_offset: int
817                    ) -> Tuple[Union[AddressUnwind, None], bool, int]:
818  """Parses address CFI with given parsers.
819
820  Args:
821    address_cfi: The CFI for an address in the function.
822    function_start_address: The start address of the function.
823    parsers: Available parsers to try on CFI data.
824    prev_cfa_sp_offset: Previous CFA stack pointer offset.
825
826  Returns:
827    A tuple containing:
828    - An `AddressUnwind` object when the parse is successful, None otherwise.
829    - Whether the address is in function epilogue.
830    - The new cfa_sp_offset.
831  """
832  for parser in parsers:
833    match = parser.GetBreakpadInstructionsRegex().search(
834        address_cfi.unwind_instructions)
835    if not match:
836      continue
837
838    address_unwind, cfa_sp_offset = parser.ParseFromMatch(
839        address_cfi.address - function_start_address, prev_cfa_sp_offset, match)
840
841    in_epilogue = (
842        prev_cfa_sp_offset > cfa_sp_offset
843        and address_unwind.unwind_type != UnwindType.RESTORE_SP_FROM_REGISTER)
844
845    return (address_unwind if not in_epilogue else None, in_epilogue,
846            cfa_sp_offset)
847
848  return None, False, prev_cfa_sp_offset
849
850
851def GenerateUnwinds(function_cfis: Iterable[FunctionCfi],
852                    parsers: Tuple[UnwindInstructionsParser, ...]
853                    ) -> Iterable[FunctionUnwind]:
854  """Generates parsed function unwind states from breakpad CFI data.
855
856  This function parses `FunctionCfi`s to `FunctionUnwind`s using
857  `UnwindInstructionParser`.
858
859  Args:
860    function_cfis: An iterable of function CFI data.
861    parsers: Available parsers to try on CFI address data.
862
863  Returns:
864    An iterable of parsed function unwind states.
865  """
866  functions = 0
867  addresses = 0
868  handled_addresses = 0
869  epilogues_seen = 0
870
871  for function_cfi in function_cfis:
872    functions += 1
873    address_unwinds: List[AddressUnwind] = []
874    cfa_sp_offset = 0
875    for address_cfi in function_cfi.address_cfi:
876      addresses += 1
877
878      address_unwind, in_epilogue, cfa_sp_offset = ParseAddressCfi(
879          address_cfi, function_cfi.address_cfi[0].address, parsers,
880          cfa_sp_offset)
881
882      if address_unwind:
883        handled_addresses += 1
884        address_unwinds.append(address_unwind)
885        continue
886
887      if in_epilogue:
888        epilogues_seen += 1
889        break
890
891      logging.info('unrecognized CFI: %x %s.', address_cfi.address,
892                   address_cfi.unwind_instructions)
893
894    if address_unwinds:
895      # We expect that the unwind information for every function starts with a
896      # trivial unwind (RETURN_TO_LR) prior to the execution of any code in the
897      # function. This is required by the arm calling convention which involves
898      # setting lr to the return address on calling into a function.
899      assert address_unwinds[0].address_offset == 0
900      assert address_unwinds[0].unwind_type == UnwindType.RETURN_TO_LR
901
902      yield FunctionUnwind(function_cfi.address_cfi[0].address,
903                           function_cfi.size, tuple(address_unwinds))
904
905  logging.info('%d functions.', functions)
906  logging.info('%d/%d addresses handled.', handled_addresses, addresses)
907  logging.info('epilogues_seen: %d.', epilogues_seen)
908
909
910def EncodeUnwindInfo(page_table: bytes, function_table: bytes,
911                     function_offset_table: bytes,
912                     unwind_instruction_table: bytes) -> bytes:
913  """Encodes all unwind tables as a single binary.
914
915  Concats all unwind table binaries together and attach a header at the start
916  with a offset-size pair for each table.
917
918  offset: The offset to the target table from the start of the unwind info
919    binary in bytes.
920  size: The declared size of the target table.
921
922  Both offset and size are represented as 32bit integers.
923  See `base::ChromeUnwindInfoHeaderAndroid` for more details.
924
925  Args:
926    page_table: The page table as bytes.
927    function_table: The function table as bytes.
928    function_offset_table: The function offset table as bytes.
929    unwind_instruction_table: The unwind instruction table as bytes.
930
931  Returns:
932    A single binary containing
933    - A header that points to the location of each table.
934    - All unwind tables.
935  """
936  unwind_info_header = bytearray()
937  # Each table is represented as (offset, size) pair, both offset and size
938  # are represented as 4 byte integer.
939  unwind_info_header_size = 4 * 2 * 4
940  unwind_info_body = bytearray()
941
942  # Both the page_table and the function table need to be aligned because their
943  # contents are interpreted as multi-byte integers. However, the byte size of
944  # the header, the page table, the function table are all multiples of 4 and
945  # the resource will be memory mapped at a 4 byte boundary, so no extra care
946  # is required to align the page table and the function table.
947  #
948  # The function offset table and the unwind instruction table are accessed
949  # byte by byte, so they only need 1 byte alignment.
950
951  assert len(page_table) % 4 == 0, (
952      'Each entry in the page table should be 4-byte integer.')
953  assert len(function_table) % 4 == 0, (
954      'Each entry in the function table should be a pair of 2 2-byte integers.')
955
956  for table in page_table, function_table:
957    offset = unwind_info_header_size + len(unwind_info_body)
958    # For the page table and the function_table, declared size is the number of
959    # entries in each table. The tables will be aligned to a 4 byte boundary
960    # because the resource will be memory mapped at a 4 byte boundary and the
961    # header is a multiple of 4 bytes.
962    declared_size = len(table) // 4
963    unwind_info_header += struct.pack('II', offset, declared_size)
964    unwind_info_body += table
965
966  for table in function_offset_table, unwind_instruction_table:
967    offset = unwind_info_header_size + len(unwind_info_body)
968    # Because both the function offset table and the unwind instruction table
969    # contain variable length encoded numbers, the declared size is simply the
970    # number of bytes in each table. The tables only require 1 byte alignment.
971    declared_size = len(table)
972    unwind_info_header += struct.pack('II', offset, declared_size)
973    unwind_info_body += table
974
975  return bytes(unwind_info_header + unwind_info_body)
976
977
978def GenerateUnwindTables(
979    encoded_function_unwinds_iterable: Iterable[EncodedFunctionUnwind]
980) -> Tuple[bytes, bytes, bytes, bytes]:
981  """Generates all unwind tables as bytes.
982
983  Args:
984    encoded_function_unwinds_iterable: Encoded function unwinds for all
985      functions in the ELF binary.
986
987  Returns:
988    A tuple containing:
989    - The page table as bytes.
990    - The function table as bytes.
991    - The function offset table as bytes.
992    - The unwind instruction table as bytes.
993  """
994  encoded_function_unwinds: List[EncodedFunctionUnwind] = list(
995      encoded_function_unwinds_iterable)
996  complete_instruction_sequences: List[bytes] = []
997  encoded_address_unwind_sequences: List[Tuple[EncodedAddressUnwind, ...]] = []
998
999  for encoded_function_unwind in encoded_function_unwinds:
1000    encoded_address_unwind_sequences.append(
1001        encoded_function_unwind.address_unwinds)
1002    for address_unwind in encoded_function_unwind.address_unwinds:
1003      complete_instruction_sequences.append(
1004          address_unwind.complete_instruction_sequence)
1005
1006  unwind_instruction_table, unwind_instruction_table_offsets = (
1007      EncodeUnwindInstructionTable(complete_instruction_sequences))
1008
1009  function_offset_table, function_offset_table_offsets = (
1010      EncodeFunctionOffsetTable(encoded_address_unwind_sequences,
1011                                unwind_instruction_table_offsets))
1012
1013  page_table, function_table = EncodePageTableAndFunctionTable(
1014      encoded_function_unwinds, function_offset_table_offsets)
1015
1016  return (page_table, function_table, function_offset_table,
1017          unwind_instruction_table)
1018
1019
1020def ReadTextSectionStartAddress(readobj_path: str, libchrome_path: str) -> int:
1021  """Reads the .text section start address of libchrome ELF.
1022
1023  Arguments:
1024    readobj_path: Path to llvm-obj binary.
1025    libchrome_path: Path to libchrome binary.
1026
1027  Returns:
1028    The text section start address as a number.
1029  """
1030  def GetSectionName(section) -> str:
1031    # See crbug.com/1426287 for context on different JSON names.
1032    if 'Name' in section['Section']['Name']:
1033      return section['Section']['Name']['Name']
1034    return section['Section']['Name']['Value']
1035
1036  proc = subprocess.Popen(
1037      [readobj_path, '--sections', '--elf-output-style=JSON', libchrome_path],
1038      stdout=subprocess.PIPE,
1039      encoding='ascii')
1040
1041  elfs = json.loads(proc.stdout.read())[0]
1042  sections = elfs['Sections']
1043
1044  return next(s['Section']['Address'] for s in sections
1045              if GetSectionName(s) == '.text')
1046
1047
1048def main():
1049  build_utils.InitLogging('CREATE_UNWIND_TABLE_DEBUG')
1050  parser = argparse.ArgumentParser(description=__doc__)
1051  parser.add_argument('--input_path',
1052                      help='Path to the unstripped binary.',
1053                      required=True,
1054                      metavar='FILE')
1055  parser.add_argument('--output_path',
1056                      help='Path to unwind info binary output.',
1057                      required=True,
1058                      metavar='FILE')
1059  parser.add_argument('--dump_syms_path',
1060                      required=True,
1061                      help='The path of the dump_syms binary.',
1062                      metavar='FILE')
1063  parser.add_argument('--readobj_path',
1064                      required=True,
1065                      help='The path of the llvm-readobj binary.',
1066                      metavar='FILE')
1067
1068  args = parser.parse_args()
1069  proc = subprocess.Popen(['./' + args.dump_syms_path, args.input_path, '-v'],
1070                          stdout=subprocess.PIPE,
1071                          encoding='ascii')
1072
1073  function_cfis = ReadFunctionCfi(proc.stdout)
1074  function_unwinds = GenerateUnwinds(function_cfis, parsers=ALL_PARSERS)
1075  encoded_function_unwinds = EncodeFunctionUnwinds(
1076      function_unwinds,
1077      ReadTextSectionStartAddress(args.readobj_path, args.input_path))
1078  (page_table, function_table, function_offset_table,
1079   unwind_instruction_table) = GenerateUnwindTables(encoded_function_unwinds)
1080  unwind_info: bytes = EncodeUnwindInfo(page_table, function_table,
1081                                        function_offset_table,
1082                                        unwind_instruction_table)
1083
1084  if proc.wait():
1085    logging.critical('dump_syms exited with return code %d', proc.returncode)
1086    sys.exit(proc.returncode)
1087
1088  with open(args.output_path, 'wb') as f:
1089    f.write(unwind_info)
1090
1091  return 0
1092
1093
1094if __name__ == '__main__':
1095  sys.exit(main())
1096