# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import dataclasses
import struct
from collections import OrderedDict
from enum import Enum

from typing import Dict, List, Tuple

from prettytable import PrettyTable

# This version number should match the one defined in profiler.h
ET_PROF_VER = 0x00000001

# These strings define the layout of the profiling structs (e.g. prof_result_t)
# defined in executorch/profiler/profiler.h. They are used to unpack the binary
# data to derive the profiling results.
# To align the end of a structure to the alignment requirement of a particular
# type, end the format with the code for that type with a repeat count of zero.
# Adding 0Q at the end ensures that the struct is aligned to 8 bytes, which is
# the alignment we impose in the runtime.
PROF_HEADER_STRUCT_FMT = "32s7I0Q"
PROF_RESULT_STRUCT_FMT = "32siIQQ0Q"
ALLOCATOR_STRUCT_FMT = "32sQ0Q"
ALLOCATION_STRUCT_FMT = "2I0Q"
CHAIN_IDX_NO_CHAIN = -1
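# Illustrative sizing note (not used by the parser): with native alignment the
# trailing "0Q" pads struct.calcsize() out to a multiple of 8 bytes, so on a
# typical 64-bit build, for example:
#   struct.calcsize("32s7I")   == 60
#   struct.calcsize("32s7I0Q") == 64   # PROF_HEADER_STRUCT_FMT
# which keeps back-to-back records in the dump 8-byte aligned, matching the
# alignment the runtime imposes.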

class TimeScale(Enum):
    TIME_IN_NS = 0
    TIME_IN_US = 1
    TIME_IN_MS = 2
    CPU_CYCLES = 3


# These data classes represent the structures used on device to
# log various forms of profiling data.
@dataclasses.dataclass
class ProfileData:
    name: str
    chain_idx: int
    instruction_idx: int
    start_time: int
    end_time: int


@dataclasses.dataclass
class ProfilerHeader:
    name: str
    prof_ver: int
    max_prof_entries: int
    prof_entries: int
    max_allocator_entries: int
    allocator_entries: int
    max_mem_prof_entries: int
    mem_prof_entries: int


@dataclasses.dataclass
class Allocator:
    name: str
    allocator_id: int


@dataclasses.dataclass
class MemAllocation:
    allocator_id: int
    allocation_size: int


"""
These data classes are derived from the post-processing of the profiling data
retrieved from the runtime. If there are multiple blocks of profiling data
resulting from iterations of the same code segment, the corresponding entries
are consolidated, with each entry in the consolidated list representing one
iteration.
"""


@dataclasses.dataclass
class ProfileEvent:
    name: str
    ts: List[float]
    duration: List[float]
    chain_idx: int = -1
    instruction_idx: int = -1
    # pyre-ignore[8]: Incompatible attribute type
    stacktrace: str = None


@dataclasses.dataclass
class ProfileEventFrameworkTax:
    exec_time: List[int]
    kernel_and_delegate_time: List[int]
    framework_tax: List[float]


@dataclasses.dataclass
class MemEvent:
    allocator_name: str
    total_allocations_done: int


def adjust_time_scale(event: ProfileData, time_scale: TimeScale):
    # Convert raw on-device timestamps to milliseconds; CPU cycle counts and
    # timestamps already in milliseconds are left untouched.
    time_div_factor = {
        TimeScale.CPU_CYCLES: 1,
        TimeScale.TIME_IN_MS: 1,
        TimeScale.TIME_IN_US: 1000,
        TimeScale.TIME_IN_NS: 1000000,
    }
    div_factor = time_div_factor[time_scale]
    if div_factor != 1:
        duration = round((event.end_time - event.start_time) / div_factor, 4)
        start_time = round(event.start_time / div_factor, 4)
    else:
        duration = event.end_time - event.start_time
        start_time = event.start_time
    return start_time, duration
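
# Worked example of adjust_time_scale (illustrative values only): for an event
# recorded in nanoseconds with start_time=2_500_000 and end_time=4_000_000,
# TimeScale.TIME_IN_NS divides by 1,000,000, yielding start_time=2.5 and
# duration=1.5 (both in milliseconds).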

def parse_prof_blocks(
    prof_blocks: Dict[str, List[Tuple[List[ProfileData], List[MemAllocation]]]],
    allocator_dict: Dict[int, str],
    time_scale: TimeScale,
) -> Tuple[Dict[str, List[ProfileEvent]], Dict[str, List[MemEvent]]]:

    prof_data = OrderedDict()
    mem_prof_data = OrderedDict()

    # Iterate through all the profiling blocks data that have been grouped by name.
    for name, data_list in prof_blocks.items():
        prof_data_list = []
        mem_prof_data_list = []
        # Each entry in data_list is a tuple in which the first entry is profiling data
        # and the second entry is memory allocation data; each entry in data_list
        # represents one iteration of a code block.
        for i in range(len(data_list)):
            for idx, event in enumerate(data_list[i][0]):
                # If the event represented by the index idx already exists in the list
                # then just append the new time entry to the duration list present in
                # the event object. If it doesn't exist then create a new entry and add
                # it to the list.
                start_time, duration = adjust_time_scale(event, time_scale)
                if idx < len(prof_data_list):
                    prof_data_list[idx].ts.append(start_time)
                    prof_data_list[idx].duration.append(duration)
                else:
                    prof_data_list.append(
                        ProfileEvent(
                            event.name,
                            [start_time],
                            [duration],
                            event.chain_idx,
                            event.instruction_idx,
                        )
                    )

            # Collect all the memory allocation events of this iteration of the code block.
            for idx, event in enumerate(data_list[i][1]):
                if idx >= len(mem_prof_data_list):
                    mem_prof_data_list.append(event)

        # Group all the memory allocation events based on the allocator they were
        # allocated from.
        alloc_sum_dict = OrderedDict()
        for alloc in mem_prof_data_list:
            alloc_sum_dict[alloc.allocator_id] = (
                alloc_sum_dict.get(alloc.allocator_id, 0) + alloc.allocation_size
            )

        mem_prof_sum_list = []
        for allocator_id, allocation_size in alloc_sum_dict.items():
            mem_prof_sum_list.append(
                MemEvent(allocator_dict[allocator_id], allocation_size)
            )

        prof_data[name] = prof_data_list
        mem_prof_data[name] = mem_prof_sum_list

    return prof_data, mem_prof_data


def sanity_check_prof_outputs(
    prof_blocks: Dict[str, List[Tuple[List[ProfileData], List[MemAllocation]]]]
):
    for _, prof_block_vals in prof_blocks.items():
        for i in range(len(prof_block_vals) - 1):
            prof_data_list_base = prof_block_vals[i][0]
            prof_data_list_cmp = prof_block_vals[i + 1][0]

            # Profiling blocks corresponding to the same name should always be of the same
            # size as they essentially just represent one iteration of a code block that has been
            # run multiple times.
            if len(prof_data_list_base) != len(prof_data_list_cmp):
                raise ValueError(
                    "Profiling blocks corresponding to the same name shouldn't be of different lengths."
                )

            for idx in range(len(prof_data_list_base)):
                if prof_data_list_base[idx].name != prof_data_list_cmp[idx].name:
                    raise ValueError(
                        "Corresponding entries in different iterations of the "
                        "profiling block do not match"
                    )

            mem_prof_data_list_base = prof_block_vals[i][1]
            mem_prof_data_list_cmp = prof_block_vals[i + 1][1]

            if len(mem_prof_data_list_base) != len(mem_prof_data_list_cmp):
                raise ValueError(
                    "Memory profiling blocks corresponding to the same name shouldn't be of different lengths."
                )

            for idx in range(len(mem_prof_data_list_base)):
                if (
                    mem_prof_data_list_base[idx].allocator_id
                    != mem_prof_data_list_cmp[idx].allocator_id
                ):
                    raise ValueError(
                        "Corresponding entries in different iterations of the memory "
                        "profiling blocks do not have the same allocator id"
                    )
                if (
                    mem_prof_data_list_base[idx].allocation_size
                    != mem_prof_data_list_cmp[idx].allocation_size
                ):
                    raise ValueError(
                        "Corresponding entries in different iterations of the memory "
                        "profiling blocks do not have the same allocation size."
                    )
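
# A sketch of one serialized profiling block, as implied by the parsing below
# (regions are laid out back to back and blocks repeat until the end of the
# buffer; sizes come from struct.calcsize of the corresponding format string):
#
#   ProfilerHeader                    1 x calcsize(PROF_HEADER_STRUCT_FMT)
#   profile results (perf events)     max_prof_entries x calcsize(PROF_RESULT_STRUCT_FMT)
#   allocator name table              max_allocator_entries x calcsize(ALLOCATOR_STRUCT_FMT)
#   memory allocation records         max_mem_prof_entries x calcsize(ALLOCATION_STRUCT_FMT)
#
# Only the first prof_entries / allocator_entries / mem_prof_entries slots of
# each region hold valid data; the remaining reserved slots are skipped over.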

def deserialize_profile_results(
    buff: bytes, time_scale: TimeScale = TimeScale.TIME_IN_NS
) -> Tuple[Dict[str, List[ProfileEvent]], Dict[str, List[MemEvent]]]:

    prof_header_struct_size = struct.calcsize(PROF_HEADER_STRUCT_FMT)
    prof_allocator_struct_size = struct.calcsize(ALLOCATOR_STRUCT_FMT)
    prof_allocation_struct_size = struct.calcsize(ALLOCATION_STRUCT_FMT)
    prof_result_struct_size = struct.calcsize(PROF_RESULT_STRUCT_FMT)
    prof_blocks = OrderedDict()
    allocator_dict = {}
    base_offset = 0

    while base_offset < len(buff):
        # Unpack the header for this profiling block, from which we can figure
        # out how many profiling entries are present in this block.
        prof_header_args = list(
            struct.unpack_from(PROF_HEADER_STRUCT_FMT, buff, offset=base_offset)
        )
        # Decode the name in the profiler header.
        prof_header_args[0] = prof_header_args[0].decode("utf-8").replace("\u0000", "")
        prof_header = ProfilerHeader(*prof_header_args)
        base_offset += prof_header_struct_size

        assert prof_header.prof_ver == ET_PROF_VER, (
            "Mismatch in version between profile dump and post-processing tool"
        )

        # Get all the profiling (perf events) entries.
        prof_data = []
        for i in range(prof_header.prof_entries):
            (
                name_bytes,
                chain_idx,
                instruction_idx,
                start_time,
                end_time,
            ) = struct.unpack_from(
                PROF_RESULT_STRUCT_FMT,
                buff,
                offset=base_offset + i * prof_result_struct_size,
            )
            prof_data.append(
                ProfileData(
                    # name_bytes is a 32-byte string; if the real log event name is
                    # shorter than 32 characters it is padded with 0 chars => trim them.
                    name_bytes.decode("utf-8").replace("\u0000", ""),
                    chain_idx,
                    instruction_idx,
                    start_time,
                    end_time,
                )
            )

        # Move forward in the profiling block to start parsing memory allocation events.
        base_offset += prof_result_struct_size * prof_header.max_prof_entries

        # Parse the allocator entries table; this table maps the allocator id to the
        # string containing the name designated to this allocator.
        for i in range(0, prof_header.allocator_entries):
            allocator_name, allocator_id = struct.unpack_from(
                ALLOCATOR_STRUCT_FMT,
                buff,
                offset=base_offset + i * prof_allocator_struct_size,
            )
            allocator_dict[allocator_id] = allocator_name.decode("utf-8").replace(
                "\u0000", ""
            )

        base_offset += prof_allocator_struct_size * prof_header.max_allocator_entries

        # Get all the profiling (memory allocation events) entries.
        mem_prof_data = []
        for i in range(0, prof_header.mem_prof_entries):
            mem_prof_entry = list(
                struct.unpack_from(
                    ALLOCATION_STRUCT_FMT,
                    buff,
                    offset=base_offset + i * prof_allocation_struct_size,
                )
            )
            mem_prof_data.append(MemAllocation(*mem_prof_entry))

        base_offset += prof_allocation_struct_size * prof_header.max_mem_prof_entries

        # Get the name of this profiling block and append the profiling data and memory
        # allocation data we just parsed to the list that maps to this block name.
        prof_blocks[prof_header.name] = prof_blocks.get(prof_header.name, []) + [
            (prof_data, mem_prof_data)
        ]

    sanity_check_prof_outputs(prof_blocks)
    return parse_prof_blocks(prof_blocks, allocator_dict, time_scale)
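
# Shape sketch of the parsed results consumed by the table builders below
# (names and numbers are illustrative, not taken from a real dump): after a
# block named "forward" has run three times, prof_data might look like
#
#   {"forward": [ProfileEvent(name="Method::execute", ts=[...],
#                             duration=[t0, t1, t2], chain_idx=-1,
#                             instruction_idx=-1), ...]}
#
# with one ts/duration entry per iteration, while mem_allocations maps the same
# block name to one MemEvent per allocator with the summed allocation size.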

def profile_table(
    profile_data: Dict[str, List[ProfileEvent]], model_buffer=None
) -> List[PrettyTable]:

    results = []
    max_len = 0

    for name, prof_entries_list in profile_data.items():
        table = PrettyTable()
        table.title = name
        table.add_rows(
            [
                (
                    entry.name,
                    entry.chain_idx,
                    entry.instruction_idx,
                    None,
                )
                + tuple(val for val in entry.duration)
                for entry in prof_entries_list
            ]
        )
        max_len = max(max_len, len(prof_entries_list[0].duration))
        table.field_names = [
            "Name",
            "Chain",
            "Instr",
            "Frame",
        ] + ["Iteration " + str(i) for i in range(max_len)]
        results.append(table)
    return results


def mem_profile_table(mem_allocations: Dict[str, List[MemEvent]]) -> List[PrettyTable]:
    tables = []
    for name, prof_data_list in mem_allocations.items():
        table = PrettyTable()
        table.title = name
        table_rows = []
        for mem_event in prof_data_list:
            table_rows += [(mem_event.allocator_name, mem_event.total_allocations_done)]
        table.add_rows(table_rows)
        table.field_names = ["Allocator name", "Total size of allocations done"]
        tables.append(table)
    return tables


def profile_aggregate_framework_tax(
    prof_data: Dict[str, List[ProfileEvent]]
) -> Dict[str, ProfileEventFrameworkTax]:
    prof_framework_tax = OrderedDict()

    for name, prof_data_list in prof_data.items():
        execute_max = []
        kernel_and_delegate_sum = []

        for d in prof_data_list:
            if "Method::execute" in d.name:
                execute_max = max(execute_max, d.duration)

            if "native_call" in d.name or "delegate_execute" in d.name:
                for idx in range(len(d.duration)):
                    if idx < len(kernel_and_delegate_sum):
                        kernel_and_delegate_sum[idx] += d.duration[idx]
                    else:
                        kernel_and_delegate_sum.append(d.duration[idx])

        if len(execute_max) == 0 or len(kernel_and_delegate_sum) == 0:
            continue

        framework_tax_list = [
            round((execute_time - kernel_delegate_call) / execute_time, 4) * 100
            for execute_time, kernel_delegate_call in zip(
                execute_max, kernel_and_delegate_sum
            )
        ]

        prof_framework_tax[name] = ProfileEventFrameworkTax(
            execute_max, kernel_and_delegate_sum, framework_tax_list
        )

    return prof_framework_tax
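
# Worked example of the framework tax computed above (illustrative numbers
# only): if one iteration reports execute_max = [1000] and
# kernel_and_delegate_sum = [900], then
#   round((1000 - 900) / 1000, 4) * 100 == 10.0
# i.e. 10% of the Method::execute time was spent in the framework rather than
# in kernels or delegates.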

def profile_framework_tax_table(
    prof_framework_tax_data: Dict[str, ProfileEventFrameworkTax]
):
    tables = []
    for name, prof_data_list in prof_framework_tax_data.items():
        table_agg = PrettyTable()
        table_agg.title = name + " framework tax calculations"

        table_agg.add_rows(
            [
                ("Model execution time", *prof_data_list.exec_time),
                (
                    "Time spent in kernels and delegates",
                    *prof_data_list.kernel_and_delegate_time,
                ),
                ("Framework tax (%)", *prof_data_list.framework_tax),
            ]
        )
        table_agg.field_names = [""] + [
            "Iteration " + str(i) for i in range(len(prof_data_list.exec_time))
        ]
        tables.append(table_agg)
    return tables


def deserialize_profile_results_files(
    profile_results_path: str,
    bundled_program_ff_path: str,
    time_scale: TimeScale = TimeScale.TIME_IN_NS,
):
    with open(profile_results_path, "rb") as prof_res_file, open(
        bundled_program_ff_path, "rb"
    ) as model_ff_file:
        prof_res_buf = prof_res_file.read()
        bundled_program_ff_buf = model_ff_file.read()

    prof_data, mem_allocations = deserialize_profile_results(prof_res_buf, time_scale)
    framework_tax_data = profile_aggregate_framework_tax(prof_data)

    prof_tables = profile_table(prof_data, bundled_program_ff_buf)
    for table in prof_tables:
        print(table)

    prof_tables_agg = profile_framework_tax_table(framework_tax_data)
    for table in prof_tables_agg:
        print(table)

    mem_prof_table = mem_profile_table(mem_allocations)
    for table in mem_prof_table:
        print(table)

    return prof_data, mem_allocations
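

# Illustrative entry point (an assumption added for convenience, not a
# documented CLI): pass the profile dump produced by the runtime and the
# bundled program file on the command line to print all tables.
if __name__ == "__main__":
    import sys

    if len(sys.argv) != 3:
        print(f"Usage: {sys.argv[0]} <profile_results_file> <bundled_program_file>")
        sys.exit(1)
    deserialize_profile_results_files(sys.argv[1], sys.argv[2])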