1#!/usr/bin/env python3 2# Copyright 2021 The Chromium Authors 3# Use of this source code is governed by a BSD-style license that can be 4# found in the LICENSE file. 5"""Creates a table of unwind information in Android Chrome's bespoke format.""" 6 7import abc 8import argparse 9import collections 10import enum 11import json 12import logging 13import re 14import struct 15import subprocess 16import sys 17from typing import (Dict, Iterable, List, NamedTuple, Sequence, TextIO, Tuple, 18 Union) 19 20from util import build_utils 21 22_STACK_CFI_INIT_REGEX = re.compile( 23 r'^STACK CFI INIT ([0-9a-f]+) ([0-9a-f]+) (.+)$') 24_STACK_CFI_REGEX = re.compile(r'^STACK CFI ([0-9a-f]+) (.+)$') 25 26 27class AddressCfi(NamedTuple): 28 """Record representing CFI for an address within a function. 29 30 Represents the Call Frame Information required to unwind from an address in a 31 function. 32 33 Attributes: 34 address: The address. 35 unwind_instructions: The unwind instructions for the address. 36 37 """ 38 address: int 39 unwind_instructions: str 40 41 42class FunctionCfi(NamedTuple): 43 """Record representing CFI for a function. 44 45 Note: address_cfi[0].address is the start address of the function. 46 47 Attributes: 48 size: The function size in bytes. 49 address_cfi: The CFI at each address in the function. 50 51 """ 52 size: int 53 address_cfi: Tuple[AddressCfi, ...] 54 55 56def FilterToNonTombstoneCfi(stream: TextIO) -> Iterable[str]: 57 """Generates non-tombstone STACK CFI lines from the stream. 58 59 STACK CFI functions with address 0 correspond are a 'tombstone' record 60 associated with dead code and can be ignored. See 61 https://bugs.llvm.org/show_bug.cgi?id=47148#c2. 62 63 Args: 64 stream: A file object. 65 66 Returns: 67 An iterable over the non-tombstone STACK CFI lines in the stream. 68 """ 69 in_tombstone_function = False 70 for line in stream: 71 if not line.startswith('STACK CFI '): 72 continue 73 74 if line.startswith('STACK CFI INIT 0 '): 75 in_tombstone_function = True 76 elif line.startswith('STACK CFI INIT '): 77 in_tombstone_function = False 78 79 if not in_tombstone_function: 80 yield line 81 82 83def ReadFunctionCfi(stream: TextIO) -> Iterable[FunctionCfi]: 84 """Generates FunctionCfi records from the stream. 85 86 Args: 87 stream: A file object. 88 89 Returns: 90 An iterable over FunctionCfi corresponding to the non-tombstone STACK CFI 91 lines in the stream. 92 """ 93 current_function_address = None 94 current_function_size = None 95 current_function_address_cfi = [] 96 for line in FilterToNonTombstoneCfi(stream): 97 cfi_init_match = _STACK_CFI_INIT_REGEX.search(line) 98 if cfi_init_match: 99 # Function CFI with address 0 are tombstone entries per 100 # https://bugs.llvm.org/show_bug.cgi?id=47148#c2 and should have been 101 # filtered in `FilterToNonTombstoneCfi`. 102 assert current_function_address != 0 103 if (current_function_address is not None 104 and current_function_size is not None): 105 yield FunctionCfi(current_function_size, 106 tuple(current_function_address_cfi)) 107 current_function_address = int(cfi_init_match.group(1), 16) 108 current_function_size = int(cfi_init_match.group(2), 16) 109 current_function_address_cfi = [ 110 AddressCfi(int(cfi_init_match.group(1), 16), cfi_init_match.group(3)) 111 ] 112 else: 113 cfi_match = _STACK_CFI_REGEX.search(line) 114 assert cfi_match 115 current_function_address_cfi.append( 116 AddressCfi(int(cfi_match.group(1), 16), cfi_match.group(2))) 117 118 assert current_function_address is not None 119 assert current_function_size is not None 120 yield FunctionCfi(current_function_size, tuple(current_function_address_cfi)) 121 122 123def EncodeAsBytes(*values: int) -> bytes: 124 """Encodes the argument ints as bytes. 125 126 This function validates that the inputs are within the range that can be 127 represented as bytes. 128 129 Args: 130 values: Integers in range [0, 255]. 131 132 Returns: 133 The values encoded as bytes. 134 """ 135 for i, value in enumerate(values): 136 if not 0 <= value <= 255: 137 raise ValueError('value = %d out of bounds at byte %d' % (value, i)) 138 return bytes(values) 139 140 141def Uleb128Encode(value: int) -> bytes: 142 """Encodes the argument int to ULEB128 format. 143 144 Args: 145 value: Unsigned integer. 146 147 Returns: 148 The values encoded as ULEB128 bytes. 149 """ 150 if value < 0: 151 raise ValueError(f'Cannot uleb128 encode negative value ({value}).') 152 153 uleb128_bytes = [] 154 done = False 155 while not done: 156 value, lowest_seven_bits = divmod(value, 0x80) 157 done = value == 0 158 uleb128_bytes.append(lowest_seven_bits | (0x80 if not done else 0x00)) 159 return EncodeAsBytes(*uleb128_bytes) 160 161 162def EncodeStackPointerUpdate(offset: int) -> bytes: 163 """Encodes a stack pointer update as arm unwind instructions. 164 165 Args: 166 offset: Offset to apply on stack pointer. Should be in range [-0x204, inf). 167 168 Returns: 169 A list of arm unwind instructions as bytes. 170 """ 171 assert offset % 4 == 0 172 173 abs_offset = abs(offset) 174 instruction_code = 0b01000000 if offset < 0 else 0b00000000 175 if 0x04 <= abs_offset <= 0x200: 176 instructions = [ 177 # vsp = vsp + (xxxxxx << 2) + 4. Covers range 0x04-0x100 inclusive. 178 instruction_code | ((min(abs_offset, 0x100) - 4) >> 2) 179 ] 180 # For vsp increments of 0x104-0x200 we use 00xxxxxx twice. 181 if abs_offset >= 0x104: 182 instructions.append(instruction_code | ((abs_offset - 0x100 - 4) >> 2)) 183 try: 184 return EncodeAsBytes(*instructions) 185 except ValueError as e: 186 raise RuntimeError('offset = %d produced out of range value' % 187 offset) from e 188 else: 189 # This only encodes positive sp movement. 190 assert offset > 0, offset 191 return EncodeAsBytes(0b10110010 # vsp = vsp + 0x204 + (uleb128 << 2) 192 ) + Uleb128Encode((offset - 0x204) >> 2) 193 194 195def EncodePop(registers: Sequence[int]) -> bytes: 196 """Encodes popping of a sequence of registers as arm unwind instructions. 197 198 Args: 199 registers: Collection of target registers to accept values popped from 200 stack. Register value order in the sequence does not matter. Values are 201 popped based on register index order. 202 203 Returns: 204 A list of arm unwind instructions as bytes. 205 """ 206 assert all( 207 r in range(4, 16) 208 for r in registers), f'Can only pop r4 ~ r15. Registers:\n{registers}.' 209 assert len(registers) > 0, 'Register sequence cannot be empty.' 210 211 instructions: List[int] = [] 212 213 # Check if the pushed registers are continuous set starting from r4 (and 214 # ending prior to r12). This scenario has its own encoding. 215 pop_lr = 14 in registers 216 non_lr_registers = [r for r in registers if r != 14] 217 non_lr_registers_continuous_from_r4 = \ 218 sorted(non_lr_registers) == list(range(4, 4 + len(non_lr_registers))) 219 220 if (pop_lr and 0 < len(non_lr_registers) <= 8 221 and non_lr_registers_continuous_from_r4): 222 instructions.append(0b10101000 223 | (len(non_lr_registers) - 1) # Pop r4-r[4+nnn], r14. 224 ) 225 else: 226 register_bits = 0 227 for register in registers: 228 register_bits |= 1 << register 229 register_bits = register_bits >> 4 # Skip r0 ~ r3. 230 instructions.extend([ 231 # Pop up to 12 integer registers under masks {r15-r12}, {r11-r4}. 232 0b10000000 | (register_bits >> 8), 233 register_bits & 0xff 234 ]) 235 236 return EncodeAsBytes(*instructions) 237 238 239class UnwindType(enum.Enum): 240 """ 241 The type of unwind action to perform. 242 """ 243 244 # Use lr as the return address. 245 RETURN_TO_LR = 1 246 247 # Increment or decrement the stack pointer and/or pop registers (r4 ~ r15). 248 # If both, the increment/decrement occurs first. 249 UPDATE_SP_AND_OR_POP_REGISTERS = 2 250 251 # Restore the stack pointer from a register then increment/decrement the stack 252 # pointer. 253 RESTORE_SP_FROM_REGISTER = 3 254 255 # No action necessary. Used for floating point register pops. 256 NO_ACTION = 4 257 258 259class AddressUnwind(NamedTuple): 260 """Record representing unwind information for an address within a function. 261 262 Attributes: 263 address_offset: The offset of the address from the start of the function. 264 unwind_type: The type of unwind to perform from the address. 265 sp_offset: The offset to apply to the stack pointer. 266 registers: The registers involved in the unwind. 267 """ 268 address_offset: int 269 unwind_type: UnwindType 270 sp_offset: int 271 registers: Tuple[int, ...] 272 273 274class FunctionUnwind(NamedTuple): 275 """Record representing unwind information for a function. 276 277 Attributes: 278 address: The address of the function. 279 size: The function size in bytes. 280 address_unwind_info: The unwind info at each address in the function. 281 """ 282 283 address: int 284 size: int 285 address_unwinds: Tuple[AddressUnwind, ...] 286 287 288def EncodeAddressUnwind(address_unwind: AddressUnwind) -> bytes: 289 """Encodes an `AddressUnwind` object as arm unwind instructions. 290 291 Args: 292 address_unwind: Record representing unwind information for an address within 293 a function. 294 295 Returns: 296 A list of arm unwind instructions as bytes. 297 """ 298 if address_unwind.unwind_type == UnwindType.RETURN_TO_LR: 299 return EncodeAsBytes(0b10110000) # Finish. 300 if address_unwind.unwind_type == UnwindType.UPDATE_SP_AND_OR_POP_REGISTERS: 301 return ((EncodeStackPointerUpdate(address_unwind.sp_offset) 302 if address_unwind.sp_offset else b'') + 303 (EncodePop(address_unwind.registers) 304 if address_unwind.registers else b'')) 305 306 if address_unwind.unwind_type == UnwindType.RESTORE_SP_FROM_REGISTER: 307 assert len(address_unwind.registers) == 1 308 return (EncodeAsBytes(0b10010000 309 | address_unwind.registers[0] # Set vsp = r[nnnn]. 310 ) + 311 (EncodeStackPointerUpdate(address_unwind.sp_offset) 312 if address_unwind.sp_offset else b'')) 313 314 if address_unwind.unwind_type == UnwindType.NO_ACTION: 315 return b'' 316 317 assert False, 'unknown unwind type' 318 return b'' 319 320 321class UnwindInstructionsParser(abc.ABC): 322 """Base class for parsers of breakpad unwind instruction sequences. 323 324 Provides regexes matching breakpad instruction sequences understood by the 325 parser, and parsing of the sequences from the regex match. 326 """ 327 328 @abc.abstractmethod 329 def GetBreakpadInstructionsRegex(self) -> re.Pattern: 330 pass 331 332 @abc.abstractmethod 333 def ParseFromMatch(self, address_offset: int, cfa_sp_offset: int, 334 match: re.Match) -> Tuple[AddressUnwind, int]: 335 """Returns the regex matching the breakpad instructions. 336 337 Args: 338 address_offset: Offset from function start address. 339 cfa_sp_offset: CFA stack pointer offset. 340 341 Returns: 342 The unwind info for the address plus the new cfa_sp_offset. 343 """ 344 345 346class NullParser(UnwindInstructionsParser): 347 """Translates the state before any instruction has been executed.""" 348 349 regex = re.compile(r'^\.cfa: sp 0 \+ \.ra: lr$') 350 351 def GetBreakpadInstructionsRegex(self) -> re.Pattern: 352 return self.regex 353 354 def ParseFromMatch(self, address_offset: int, cfa_sp_offset: int, 355 match: re.Match) -> Tuple[AddressUnwind, int]: 356 return AddressUnwind(address_offset, UnwindType.RETURN_TO_LR, 0, ()), 0 357 358 359class PushOrSubSpParser(UnwindInstructionsParser): 360 """Translates unwinds from push or sub sp, #constant instructions.""" 361 362 # We expect at least one of the three outer groups to be non-empty. Cases: 363 # 364 # Standard prologue pushes. 365 # Match the first two and optionally the third. 366 # 367 # Standard prologue sub sp, #constant. 368 # Match only the first. 369 # 370 # Pushes in dynamic stack allocation functions after saving sp. 371 # Match only the third since they don't alter the stack pointer or store the 372 # return address. 373 # 374 # Leaf functions that use callee-save registers. 375 # Match the first and third but not the second. 376 regex = re.compile(r'^(?:\.cfa: sp (\d+) \+ ?)?' 377 r'(?:\.ra: \.cfa (-\d+) \+ \^ ?)?' 378 r'((?:r\d+: \.cfa -\d+ \+ \^ ?)*)$') 379 380 # 'r' followed by digits, with 'r' matched via positive lookbehind so only the 381 # number appears in the match. 382 register_regex = re.compile('(?<=r)(\d+)') 383 384 def GetBreakpadInstructionsRegex(self) -> re.Pattern: 385 return self.regex 386 387 def ParseFromMatch(self, address_offset: int, cfa_sp_offset: int, 388 match: re.Match) -> Tuple[AddressUnwind, int]: 389 # The group will be None if the outer non-capturing groups for the(\d+) and 390 # (-\d+) expressions are not matched. 391 new_cfa_sp_offset, ra_cfa_offset = (int(group) if group else None 392 for group in match.groups()[:2]) 393 394 # Registers are pushed in reverse order by register number so are popped in 395 # order. Sort them to ensure the proper order. 396 registers = sorted([ 397 int(register) 398 for register in self.register_regex.findall(match.group(3)) 399 # `UpdateSpAndOrPopRegisters` only supports popping of register 400 # r4 ~ r15. The ignored registers are translated to sp increments by 401 # the following calculation on `sp_offset`. 402 if int(register) in range(4, 16) 403 ] + 404 # Also pop lr (ra in breakpad terms) if it was stored. 405 ([14] if ra_cfa_offset is not None else [])) 406 407 sp_offset = 0 408 if new_cfa_sp_offset is not None: 409 sp_offset = new_cfa_sp_offset - cfa_sp_offset 410 assert sp_offset % 4 == 0 411 if sp_offset >= len(registers) * 4: 412 # Handles the sub sp, #constant case, and push instructions that push 413 # caller-save registers r0-r3 which don't get encoded in the unwind 414 # instructions. In the latter case we need to move the stack pointer up 415 # to the first pushed register. 416 sp_offset -= len(registers) * 4 417 418 return AddressUnwind(address_offset, 419 UnwindType.UPDATE_SP_AND_OR_POP_REGISTERS, sp_offset, 420 tuple(registers)), new_cfa_sp_offset or cfa_sp_offset 421 422 423class VPushParser(UnwindInstructionsParser): 424 # VPushes that occur in dynamic stack allocation functions after storing the 425 # stack pointer don't change the stack pointer or push any register that we 426 # care about. The first group will not match in those cases. 427 # 428 # Breakpad doesn't seem to understand how to name the floating point 429 # registers so calls them unnamed_register. 430 regex = re.compile(r'^(?:\.cfa: sp (\d+) \+ )?' 431 r'(?:unnamed_register\d+: \.cfa -\d+ \+ \^ ?)+$') 432 433 def GetBreakpadInstructionsRegex(self) -> re.Pattern: 434 return self.regex 435 436 def ParseFromMatch(self, address_offset: int, cfa_sp_offset: int, 437 match: re.Match) -> Tuple[AddressUnwind, int]: 438 # `match.group(1)`, which corresponds to the (\d+) expression, will be None 439 # if the first outer non-capturing group is not matched. 440 new_cfa_sp_offset = int(match.group(1)) if match.group(1) else None 441 if new_cfa_sp_offset is None: 442 return (AddressUnwind(address_offset, UnwindType.NO_ACTION, 0, 443 ()), cfa_sp_offset) 444 445 sp_offset = new_cfa_sp_offset - cfa_sp_offset 446 assert sp_offset % 4 == 0 447 return AddressUnwind(address_offset, 448 UnwindType.UPDATE_SP_AND_OR_POP_REGISTERS, sp_offset, 449 ()), new_cfa_sp_offset 450 451 452class StoreSpParser(UnwindInstructionsParser): 453 regex = re.compile(r'^\.cfa: r(\d+) (\d+) \+$') 454 455 def GetBreakpadInstructionsRegex(self) -> re.Pattern: 456 return self.regex 457 458 def ParseFromMatch(self, address_offset: int, cfa_sp_offset: int, 459 match: re.Match) -> Tuple[AddressUnwind, int]: 460 register = int(match.group(1)) 461 new_cfa_sp_offset = int(match.group(2)) 462 sp_offset = new_cfa_sp_offset - cfa_sp_offset 463 assert sp_offset % 4 == 0 464 return AddressUnwind(address_offset, UnwindType.RESTORE_SP_FROM_REGISTER, 465 sp_offset, (register, )), new_cfa_sp_offset 466 467 468def EncodeUnwindInstructionTable(complete_instruction_sequences: Iterable[bytes] 469 ) -> Tuple[bytes, Dict[bytes, int]]: 470 """Encodes the unwind instruction table. 471 472 Deduplicates the encoded unwind instruction sequences. Generates the table and 473 a dictionary mapping a function to its starting index in the table. 474 475 The instruction table is used by the unwinder to provide the sequence of 476 unwind instructions to execute for each function, separated by offset 477 into the function. 478 479 Args: 480 complete_instruction_sequences: An iterable of encoded unwind instruction 481 sequences. The sequences represent the series of unwind instructions to 482 execute corresponding to offsets within each function. 483 484 Returns: 485 A tuple containing: 486 - The unwind instruction table as bytes. 487 - The mapping from the instruction sequence to the offset in the unwind 488 instruction table. This mapping is used to construct the function offset 489 table, which references entries in the unwind instruction table. 490 """ 491 # As the function offset table uses variable length number encoding (uleb128), 492 # which means smaller number uses fewer bytes to represent, we should sort 493 # the unwind instruction table by number of references from the function 494 # offset table in order to minimize the size of the function offset table. 495 ref_counts: Dict[bytes, int] = collections.defaultdict(int) 496 for sequence in complete_instruction_sequences: 497 ref_counts[sequence] += 1 498 499 def ComputeScore(sequence): 500 """ Score for each sequence is computed as ref_count / size_of_sequence. 501 502 According to greedy algorithm, items with higher value / space cost ratio 503 should be prioritized. Here value is bytes saved in the function offset 504 table, represetned by ref_count. Space cost is the space taken in the 505 unwind instruction table, represented by size_of_sequence. 506 507 Note: In order to ensure build-time determinism, `sequence` is also returned 508 to resolve sorting order when scores are the same. 509 """ 510 return ref_counts[sequence] / len(sequence), sequence 511 512 ordered_sequences = sorted(ref_counts.keys(), key=ComputeScore, reverse=True) 513 offsets: Dict[bytes, int] = {} 514 current_offset = 0 515 for sequence in ordered_sequences: 516 offsets[sequence] = current_offset 517 current_offset += len(sequence) 518 return b''.join(ordered_sequences), offsets 519 520 521class EncodedAddressUnwind(NamedTuple): 522 """Record representing unwind information for an address within a function. 523 524 This structure represents the same concept as `AddressUnwind`. The only 525 difference is that how to unwind from the address is represented as 526 encoded ARM unwind instructions. 527 528 Attributes: 529 address_offset: The offset of the address from the start address of the 530 function. 531 complete_instruction_sequence: The full ARM unwind instruction sequence to 532 unwind from the `address_offset`. 533 """ 534 address_offset: int 535 complete_instruction_sequence: bytes 536 537 538def EncodeAddressUnwinds(address_unwinds: Tuple[AddressUnwind, ...] 539 ) -> Tuple[EncodedAddressUnwind, ...]: 540 """Encodes the unwind instructions and offset for the addresses within a 541 function. 542 543 Args: 544 address_unwinds: A tuple of unwind state for addresses within a function. 545 546 Returns: 547 The encoded unwind instructions and offsets for the addresses within a 548 function, ordered by decreasing offset. 549 """ 550 sorted_address_unwinds: List[AddressUnwind] = sorted( 551 address_unwinds, 552 key=lambda address_unwind: address_unwind.address_offset, 553 reverse=True) 554 unwind_instructions: List[bytes] = [ 555 EncodeAddressUnwind(address_unwind) 556 for address_unwind in sorted_address_unwinds 557 ] 558 559 # A complete instruction sequence contains all the unwind instructions 560 # necessary to unwind from an offset within a function. For a given offset 561 # this includes the offset's instructions plus the instructions for all 562 # earlier offsets. The offsets are stored in reverse order, hence the i: 563 # range rather than :i+1. 564 complete_instruction_sequences = [ 565 b''.join(unwind_instructions[i:]) for i in range(len(unwind_instructions)) 566 ] 567 568 encoded_unwinds: List[EncodedAddressUnwind] = [] 569 for address_unwind, sequence in zip(sorted_address_unwinds, 570 complete_instruction_sequences): 571 encoded_unwinds.append( 572 EncodedAddressUnwind(address_unwind.address_offset, sequence)) 573 return tuple(encoded_unwinds) 574 575 576class EncodedFunctionUnwind(NamedTuple): 577 """Record representing unwind information for a function. 578 579 This structure represents the same concept as `FunctionUnwind`, but with 580 some differences: 581 - Attribute `address` is split into 2 attributes: `page_number` and 582 `page_offset`. 583 - Attribute `size` is dropped. 584 - Attribute `address_unwinds` becomes a collection of `EncodedAddressUnwind`s, 585 instead of a collection of `AddressUnwind`s. 586 587 Attributes: 588 page_number: The upper bits (17 ~ 31bits) of byte offset from text section 589 start. 590 page_offset: The lower bits (1 ~ 16bits) of instruction offset from text 591 section start. 592 address_unwinds: A collection of `EncodedAddressUnwind`s. 593 594 """ 595 596 page_number: int 597 page_offset: int 598 address_unwinds: Tuple[EncodedAddressUnwind, ...] 599 600 601# The trivial unwind is defined as a single `RETURN_TO_LR` instruction 602# at the start of the function. 603TRIVIAL_UNWIND: Tuple[EncodedAddressUnwind, ...] = EncodeAddressUnwinds( 604 (AddressUnwind(address_offset=0, 605 unwind_type=UnwindType.RETURN_TO_LR, 606 sp_offset=0, 607 registers=()), )) 608 609# The refuse to unwind filler unwind is used to fill the invalid space 610# before the first function in the first page and after the last function 611# in the last page. 612REFUSE_TO_UNWIND: Tuple[EncodedAddressUnwind, ...] = (EncodedAddressUnwind( 613 address_offset=0, 614 complete_instruction_sequence=bytes([0b10000000, 0b00000000])), ) 615 616 617def EncodeFunctionUnwinds(function_unwinds: Iterable[FunctionUnwind], 618 text_section_start_address: int 619 ) -> Iterable[EncodedFunctionUnwind]: 620 """Encodes the unwind state for all functions defined in the binary. 621 622 This function 623 - sorts the collection of `FunctionUnwind`s by address. 624 - fills in gaps between functions with trivial unwind. 625 - fills the space in the last page after last function with refuse to unwind. 626 - fills the space in the first page before the first function with refuse 627 to unwind. 628 629 Args: 630 function_unwinds: An iterable of function unwind states. 631 text_section_start_address: The address of .text section in ELF file. 632 633 Returns: 634 The encoded function unwind states with no gaps between functions, ordered 635 by ascending address. 636 """ 637 638 def GetPageNumber(address: int) -> int: 639 """Calculates the page number. 640 641 Page number is calculated as byte_offset_from_text_section_start >> 17, 642 i.e. the upper bits (17 ~ 31bits) of byte offset from text section start. 643 """ 644 return (address - text_section_start_address) >> 17 645 646 def GetPageOffset(address: int) -> int: 647 """Calculates the page offset. 648 649 Page offset is calculated as (byte_offset_from_text_section_start >> 1) 650 & 0xffff, i.e. the lower bits (1 ~ 16bits) of instruction offset from 651 text section start. 652 """ 653 return ((address - text_section_start_address) >> 1) & 0xffff 654 655 sorted_function_unwinds: List[FunctionUnwind] = sorted( 656 function_unwinds, key=lambda function_unwind: function_unwind.address) 657 658 if sorted_function_unwinds[0].address > text_section_start_address: 659 yield EncodedFunctionUnwind(page_number=0, 660 page_offset=0, 661 address_unwinds=REFUSE_TO_UNWIND) 662 663 prev_func_end_address: int = sorted_function_unwinds[0].address 664 665 gaps = 0 666 for unwind in sorted_function_unwinds: 667 assert prev_func_end_address <= unwind.address, ( 668 'Detected overlap between functions.') 669 670 if prev_func_end_address < unwind.address: 671 # Gaps between functions are typically filled by regions of thunks which 672 # do not alter the stack pointer. Filling these gaps with TRIVIAL_UNWIND 673 # is the appropriate unwind strategy. 674 gaps += 1 675 yield EncodedFunctionUnwind(GetPageNumber(prev_func_end_address), 676 GetPageOffset(prev_func_end_address), 677 TRIVIAL_UNWIND) 678 679 yield EncodedFunctionUnwind(GetPageNumber(unwind.address), 680 GetPageOffset(unwind.address), 681 EncodeAddressUnwinds(unwind.address_unwinds)) 682 683 prev_func_end_address = unwind.address + unwind.size 684 685 if GetPageOffset(prev_func_end_address) != 0: 686 yield EncodedFunctionUnwind(GetPageNumber(prev_func_end_address), 687 GetPageOffset(prev_func_end_address), 688 REFUSE_TO_UNWIND) 689 690 logging.info('%d/%d gaps between functions filled with trivial unwind.', gaps, 691 len(sorted_function_unwinds)) 692 693 694def EncodeFunctionOffsetTable( 695 encoded_address_unwind_sequences: Iterable[ 696 Tuple[EncodedAddressUnwind, ...]], 697 unwind_instruction_table_offsets: Dict[bytes, int] 698) -> Tuple[bytes, Dict[Tuple[EncodedAddressUnwind, ...], int]]: 699 """Encodes the function offset table. 700 701 The function offset table maps local instruction offset from function 702 start to the location in the unwind instruction table. 703 704 Args: 705 encoded_address_unwind_sequences: An iterable of encoded address unwind 706 sequences. 707 unwind_instruction_table_offsets: The offset mapping returned from 708 `EncodeUnwindInstructionTable`. 709 710 Returns: 711 A tuple containing: 712 - The function offset table as bytes. 713 - The mapping from the `EncodedAddressUnwind`s to the offset in the function 714 offset table. This mapping is used to construct the function table, which 715 references entries in the function offset table. 716 """ 717 function_offset_table = bytearray() 718 offsets: Dict[Tuple[EncodedAddressUnwind, ...], int] = {} 719 720 for sequence in encoded_address_unwind_sequences: 721 if sequence in offsets: 722 continue 723 724 offsets[sequence] = len(function_offset_table) 725 for address_offset, complete_instruction_sequence in sequence: 726 # Note: address_offset is the number of bytes from one address to another, 727 # while the instruction_offset is the number of 2-byte instructions 728 # from one address to another. 729 instruction_offset = address_offset >> 1 730 function_offset_table += ( 731 Uleb128Encode(instruction_offset) + Uleb128Encode( 732 unwind_instruction_table_offsets[complete_instruction_sequence])) 733 734 return bytes(function_offset_table), offsets 735 736 737def EncodePageTableAndFunctionTable( 738 function_unwinds: Iterable[EncodedFunctionUnwind], 739 function_offset_table_offsets: Dict[Tuple[EncodedAddressUnwind, ...], int] 740) -> Tuple[bytes, bytes]: 741 """Encode page table and function table as bytes. 742 743 Page table: 744 A table that contains the mapping from page_number to the location of the 745 entry for the first function on the page in the function table. 746 747 Function table: 748 A table that contains the mapping from page_offset to the location of an entry 749 in the function offset table. 750 751 Args: 752 function_unwinds: All encoded function unwinds in the module. 753 function_offset_table_offsets: The offset mapping returned from 754 `EncodeFunctionOffsetTable`. 755 756 Returns: 757 A tuple containing: 758 - The page table as bytes. 759 - The function table as bytes. 760 """ 761 page_function_unwinds: Dict[ 762 int, List[EncodedFunctionUnwind]] = collections.defaultdict(list) 763 for function_unwind in function_unwinds: 764 page_function_unwinds[function_unwind.page_number].append(function_unwind) 765 766 raw_page_table: List[int] = [] 767 function_table = bytearray() 768 769 for page_number, same_page_function_unwinds in sorted( 770 page_function_unwinds.items(), key=lambda item: item[0]): 771 # Pad empty pages. 772 # Empty pages can occur when a function spans over multiple pages. 773 # Example: 774 # A page table with a starting function that spans 3 over pages. 775 # page_table: 776 # [0, 1, 1, 1] 777 # function_table: 778 # [ 779 # # Page 0 780 # (0, 20) # This function spans from page 0 offset 0 to page 3 offset 5. 781 # # Page 1 is empty. 782 # # Page 2 is empty. 783 # # Page 3 784 # (6, 70) 785 # ] 786 assert page_number > len(raw_page_table) - 1 787 number_of_empty_pages = page_number - len(raw_page_table) 788 # The function table is represented as `base::FunctionTableEntry[]`, 789 # where `base::FunctionTableEntry` is 4 bytes. 790 function_table_index = len(function_table) // 4 791 raw_page_table.extend([function_table_index] * (number_of_empty_pages + 1)) 792 assert page_number == len(raw_page_table) - 1 793 794 for function_unwind in sorted( 795 same_page_function_unwinds, 796 key=lambda function_unwind: function_unwind.page_offset): 797 function_table += struct.pack( 798 'HH', function_unwind.page_offset, 799 function_offset_table_offsets[function_unwind.address_unwinds]) 800 801 page_table = struct.pack(f'{len(raw_page_table)}I', *raw_page_table) 802 803 return page_table, bytes(function_table) 804 805 806ALL_PARSERS: Tuple[UnwindInstructionsParser, ...] = ( 807 NullParser(), 808 PushOrSubSpParser(), 809 StoreSpParser(), 810 VPushParser(), 811) 812 813 814def ParseAddressCfi(address_cfi: AddressCfi, function_start_address: int, 815 parsers: Tuple[UnwindInstructionsParser, ...], 816 prev_cfa_sp_offset: int 817 ) -> Tuple[Union[AddressUnwind, None], bool, int]: 818 """Parses address CFI with given parsers. 819 820 Args: 821 address_cfi: The CFI for an address in the function. 822 function_start_address: The start address of the function. 823 parsers: Available parsers to try on CFI data. 824 prev_cfa_sp_offset: Previous CFA stack pointer offset. 825 826 Returns: 827 A tuple containing: 828 - An `AddressUnwind` object when the parse is successful, None otherwise. 829 - Whether the address is in function epilogue. 830 - The new cfa_sp_offset. 831 """ 832 for parser in parsers: 833 match = parser.GetBreakpadInstructionsRegex().search( 834 address_cfi.unwind_instructions) 835 if not match: 836 continue 837 838 address_unwind, cfa_sp_offset = parser.ParseFromMatch( 839 address_cfi.address - function_start_address, prev_cfa_sp_offset, match) 840 841 in_epilogue = ( 842 prev_cfa_sp_offset > cfa_sp_offset 843 and address_unwind.unwind_type != UnwindType.RESTORE_SP_FROM_REGISTER) 844 845 return (address_unwind if not in_epilogue else None, in_epilogue, 846 cfa_sp_offset) 847 848 return None, False, prev_cfa_sp_offset 849 850 851def GenerateUnwinds(function_cfis: Iterable[FunctionCfi], 852 parsers: Tuple[UnwindInstructionsParser, ...] 853 ) -> Iterable[FunctionUnwind]: 854 """Generates parsed function unwind states from breakpad CFI data. 855 856 This function parses `FunctionCfi`s to `FunctionUnwind`s using 857 `UnwindInstructionParser`. 858 859 Args: 860 function_cfis: An iterable of function CFI data. 861 parsers: Available parsers to try on CFI address data. 862 863 Returns: 864 An iterable of parsed function unwind states. 865 """ 866 functions = 0 867 addresses = 0 868 handled_addresses = 0 869 epilogues_seen = 0 870 871 for function_cfi in function_cfis: 872 functions += 1 873 address_unwinds: List[AddressUnwind] = [] 874 cfa_sp_offset = 0 875 for address_cfi in function_cfi.address_cfi: 876 addresses += 1 877 878 address_unwind, in_epilogue, cfa_sp_offset = ParseAddressCfi( 879 address_cfi, function_cfi.address_cfi[0].address, parsers, 880 cfa_sp_offset) 881 882 if address_unwind: 883 handled_addresses += 1 884 address_unwinds.append(address_unwind) 885 continue 886 887 if in_epilogue: 888 epilogues_seen += 1 889 break 890 891 logging.info('unrecognized CFI: %x %s.', address_cfi.address, 892 address_cfi.unwind_instructions) 893 894 if address_unwinds: 895 # We expect that the unwind information for every function starts with a 896 # trivial unwind (RETURN_TO_LR) prior to the execution of any code in the 897 # function. This is required by the arm calling convention which involves 898 # setting lr to the return address on calling into a function. 899 assert address_unwinds[0].address_offset == 0 900 assert address_unwinds[0].unwind_type == UnwindType.RETURN_TO_LR 901 902 yield FunctionUnwind(function_cfi.address_cfi[0].address, 903 function_cfi.size, tuple(address_unwinds)) 904 905 logging.info('%d functions.', functions) 906 logging.info('%d/%d addresses handled.', handled_addresses, addresses) 907 logging.info('epilogues_seen: %d.', epilogues_seen) 908 909 910def EncodeUnwindInfo(page_table: bytes, function_table: bytes, 911 function_offset_table: bytes, 912 unwind_instruction_table: bytes) -> bytes: 913 """Encodes all unwind tables as a single binary. 914 915 Concats all unwind table binaries together and attach a header at the start 916 with a offset-size pair for each table. 917 918 offset: The offset to the target table from the start of the unwind info 919 binary in bytes. 920 size: The declared size of the target table. 921 922 Both offset and size are represented as 32bit integers. 923 See `base::ChromeUnwindInfoHeaderAndroid` for more details. 924 925 Args: 926 page_table: The page table as bytes. 927 function_table: The function table as bytes. 928 function_offset_table: The function offset table as bytes. 929 unwind_instruction_table: The unwind instruction table as bytes. 930 931 Returns: 932 A single binary containing 933 - A header that points to the location of each table. 934 - All unwind tables. 935 """ 936 unwind_info_header = bytearray() 937 # Each table is represented as (offset, size) pair, both offset and size 938 # are represented as 4 byte integer. 939 unwind_info_header_size = 4 * 2 * 4 940 unwind_info_body = bytearray() 941 942 # Both the page_table and the function table need to be aligned because their 943 # contents are interpreted as multi-byte integers. However, the byte size of 944 # the header, the page table, the function table are all multiples of 4 and 945 # the resource will be memory mapped at a 4 byte boundary, so no extra care 946 # is required to align the page table and the function table. 947 # 948 # The function offset table and the unwind instruction table are accessed 949 # byte by byte, so they only need 1 byte alignment. 950 951 assert len(page_table) % 4 == 0, ( 952 'Each entry in the page table should be 4-byte integer.') 953 assert len(function_table) % 4 == 0, ( 954 'Each entry in the function table should be a pair of 2 2-byte integers.') 955 956 for table in page_table, function_table: 957 offset = unwind_info_header_size + len(unwind_info_body) 958 # For the page table and the function_table, declared size is the number of 959 # entries in each table. The tables will be aligned to a 4 byte boundary 960 # because the resource will be memory mapped at a 4 byte boundary and the 961 # header is a multiple of 4 bytes. 962 declared_size = len(table) // 4 963 unwind_info_header += struct.pack('II', offset, declared_size) 964 unwind_info_body += table 965 966 for table in function_offset_table, unwind_instruction_table: 967 offset = unwind_info_header_size + len(unwind_info_body) 968 # Because both the function offset table and the unwind instruction table 969 # contain variable length encoded numbers, the declared size is simply the 970 # number of bytes in each table. The tables only require 1 byte alignment. 971 declared_size = len(table) 972 unwind_info_header += struct.pack('II', offset, declared_size) 973 unwind_info_body += table 974 975 return bytes(unwind_info_header + unwind_info_body) 976 977 978def GenerateUnwindTables( 979 encoded_function_unwinds_iterable: Iterable[EncodedFunctionUnwind] 980) -> Tuple[bytes, bytes, bytes, bytes]: 981 """Generates all unwind tables as bytes. 982 983 Args: 984 encoded_function_unwinds_iterable: Encoded function unwinds for all 985 functions in the ELF binary. 986 987 Returns: 988 A tuple containing: 989 - The page table as bytes. 990 - The function table as bytes. 991 - The function offset table as bytes. 992 - The unwind instruction table as bytes. 993 """ 994 encoded_function_unwinds: List[EncodedFunctionUnwind] = list( 995 encoded_function_unwinds_iterable) 996 complete_instruction_sequences: List[bytes] = [] 997 encoded_address_unwind_sequences: List[Tuple[EncodedAddressUnwind, ...]] = [] 998 999 for encoded_function_unwind in encoded_function_unwinds: 1000 encoded_address_unwind_sequences.append( 1001 encoded_function_unwind.address_unwinds) 1002 for address_unwind in encoded_function_unwind.address_unwinds: 1003 complete_instruction_sequences.append( 1004 address_unwind.complete_instruction_sequence) 1005 1006 unwind_instruction_table, unwind_instruction_table_offsets = ( 1007 EncodeUnwindInstructionTable(complete_instruction_sequences)) 1008 1009 function_offset_table, function_offset_table_offsets = ( 1010 EncodeFunctionOffsetTable(encoded_address_unwind_sequences, 1011 unwind_instruction_table_offsets)) 1012 1013 page_table, function_table = EncodePageTableAndFunctionTable( 1014 encoded_function_unwinds, function_offset_table_offsets) 1015 1016 return (page_table, function_table, function_offset_table, 1017 unwind_instruction_table) 1018 1019 1020def ReadTextSectionStartAddress(readobj_path: str, libchrome_path: str) -> int: 1021 """Reads the .text section start address of libchrome ELF. 1022 1023 Arguments: 1024 readobj_path: Path to llvm-obj binary. 1025 libchrome_path: Path to libchrome binary. 1026 1027 Returns: 1028 The text section start address as a number. 1029 """ 1030 def GetSectionName(section) -> str: 1031 # See crbug.com/1426287 for context on different JSON names. 1032 if 'Name' in section['Section']['Name']: 1033 return section['Section']['Name']['Name'] 1034 return section['Section']['Name']['Value'] 1035 1036 proc = subprocess.Popen( 1037 [readobj_path, '--sections', '--elf-output-style=JSON', libchrome_path], 1038 stdout=subprocess.PIPE, 1039 encoding='ascii') 1040 1041 elfs = json.loads(proc.stdout.read())[0] 1042 sections = elfs['Sections'] 1043 1044 return next(s['Section']['Address'] for s in sections 1045 if GetSectionName(s) == '.text') 1046 1047 1048def main(): 1049 build_utils.InitLogging('CREATE_UNWIND_TABLE_DEBUG') 1050 parser = argparse.ArgumentParser(description=__doc__) 1051 parser.add_argument('--input_path', 1052 help='Path to the unstripped binary.', 1053 required=True, 1054 metavar='FILE') 1055 parser.add_argument('--output_path', 1056 help='Path to unwind info binary output.', 1057 required=True, 1058 metavar='FILE') 1059 parser.add_argument('--dump_syms_path', 1060 required=True, 1061 help='The path of the dump_syms binary.', 1062 metavar='FILE') 1063 parser.add_argument('--readobj_path', 1064 required=True, 1065 help='The path of the llvm-readobj binary.', 1066 metavar='FILE') 1067 1068 args = parser.parse_args() 1069 proc = subprocess.Popen(['./' + args.dump_syms_path, args.input_path, '-v'], 1070 stdout=subprocess.PIPE, 1071 encoding='ascii') 1072 1073 function_cfis = ReadFunctionCfi(proc.stdout) 1074 function_unwinds = GenerateUnwinds(function_cfis, parsers=ALL_PARSERS) 1075 encoded_function_unwinds = EncodeFunctionUnwinds( 1076 function_unwinds, 1077 ReadTextSectionStartAddress(args.readobj_path, args.input_path)) 1078 (page_table, function_table, function_offset_table, 1079 unwind_instruction_table) = GenerateUnwindTables(encoded_function_unwinds) 1080 unwind_info: bytes = EncodeUnwindInfo(page_table, function_table, 1081 function_offset_table, 1082 unwind_instruction_table) 1083 1084 if proc.wait(): 1085 logging.critical('dump_syms exited with return code %d', proc.returncode) 1086 sys.exit(proc.returncode) 1087 1088 with open(args.output_path, 'wb') as f: 1089 f.write(unwind_info) 1090 1091 return 0 1092 1093 1094if __name__ == '__main__': 1095 sys.exit(main()) 1096