xref: /aosp_15_r20/external/executorch/profiler/parse_profiler_results.py (revision 523fa7a60841cd1ecfb9cc4201f1ca8b03ed023a)
1# Copyright (c) Meta Platforms, Inc. and affiliates.
2# All rights reserved.
3#
4# This source code is licensed under the BSD-style license found in the
5# LICENSE file in the root directory of this source tree.
6
import dataclasses
import struct
from collections import OrderedDict
from enum import Enum
from typing import Dict, List, Optional, Tuple

from prettytable import PrettyTable
15
16# This version number should match the one defined in profiler.h
17ET_PROF_VER = 0x00000001
18
19# This string defines the layout of the prof_result_t struct
20# defined in executorch/profiler/profiler.h. This is used to
21# unpack the binary data to derive the profiling results.
22# To align the end of a structure to the alignment requirement
23# of a particular type, end the format with the code for that
24# type with a repeat count of zero. Adding 0Q at the ending ensures
25# that the struct is aligned by 8 bytes which is the alignment we
26# impose in the runtime.
27PROF_HEADER_STRUCT_FMT = "32s7I0Q"
28PROF_RESULT_STRUCT_FMT = "32siIQQ0Q"
29ALLOCATOR_STRUCT_FMT = "32sQ0Q"
30ALLOCATION_STRUCT_FMT = "2I0Q"
31CHAIN_IDX_NO_CHAIN = -1
32
33
class TimeScale(Enum):
    """Unit of the raw timestamps recorded in a profiling dump.

    Used by adjust_time_scale: NS and US values are divided down to
    milliseconds, while MS and CPU_CYCLES values pass through unchanged.
    """

    TIME_IN_NS = 0
    TIME_IN_US = 1
    TIME_IN_MS = 2
    CPU_CYCLES = 3
39
40
41# These data classes represent the structures used on device to
42# log various forms of profiling data.
@dataclasses.dataclass
class ProfileData:
    """One perf event parsed from a prof_result_t entry in the binary dump."""

    name: str
    # Chain index of the event (CHAIN_IDX_NO_CHAIN, i.e. -1, presumably marks
    # events not tied to any chain — matches the signed "i" in the struct fmt).
    chain_idx: int
    instruction_idx: int
    # Raw timestamps; units are whatever TimeScale the caller passes to the
    # parser (converted later by adjust_time_scale).
    start_time: int
    end_time: int
50
51
@dataclasses.dataclass
class ProfilerHeader:
    """Header of one on-device profiling block (layout: PROF_HEADER_STRUCT_FMT).

    The max_* fields are the fixed capacities of the tables that follow the
    header in the dump; the matching non-max fields are how many entries were
    actually filled in. The parser reads the filled entries but advances its
    offset by the full capacity.
    """

    name: str
    # Must equal ET_PROF_VER for this tool to parse the block.
    prof_ver: int
    max_prof_entries: int
    prof_entries: int
    max_allocator_entries: int
    allocator_entries: int
    max_mem_prof_entries: int
    mem_prof_entries: int
62
63
@dataclasses.dataclass
class Allocator:
    """An on-device allocator: its designated name and numeric id.

    Mirrors the allocator table entry (ALLOCATOR_STRUCT_FMT) in the dump.
    """

    name: str
    allocator_id: int
68
69
@dataclasses.dataclass
class MemAllocation:
    """One memory-allocation event logged on device (ALLOCATION_STRUCT_FMT)."""

    # Id of the allocator this allocation was made from; resolved to a name
    # via the allocator table in the same dump.
    allocator_id: int
    # Size of the allocation, in whatever unit the runtime logged (bytes —
    # presumably; confirm against profiler.h).
    allocation_size: int
74
75
76"""
77These data classes are derived from the post-processing of the
78profiling data retrieved from the runtime. If there are multiple
79blocks of profiling data resulting from iterations of the same code
80segment then corresponding entries will be consolidated with each
81entry in the consolidated list representing one iteration.
82"""
83
84
@dataclasses.dataclass
class ProfileEvent:
    """A profiled event consolidated across iterations of a code block.

    ts and duration hold one entry per iteration of the block (see
    parse_prof_blocks, which appends to them for each repeated run).
    """

    name: str
    ts: List[float]
    duration: List[float]
    chain_idx: int = -1
    instruction_idx: int = -1
    # Fix: was annotated `str` with a pyre-ignore even though the default is
    # None; Optional[str] states the actual contract.
    stacktrace: Optional[str] = None
94
95
@dataclasses.dataclass
class ProfileEventFrameworkTax:
    """Per-iteration framework overhead, built by profile_aggregate_framework_tax."""

    # Method::execute time for each iteration.
    exec_time: List[int]
    # Summed kernel + delegate execution time for each iteration.
    kernel_and_delegate_time: List[int]
    # Percentage of exec_time NOT spent in kernels/delegates.
    framework_tax: List[float]
101
102
@dataclasses.dataclass
class MemEvent:
    """Total memory allocated from one allocator during a profiling block."""

    allocator_name: str
    # Sum of allocation_size over all MemAllocation events for this allocator.
    total_allocations_done: int
107
108
def adjust_time_scale(event: "ProfileData", time_scale: "TimeScale"):
    """Convert an event's raw start time and duration into display units.

    Nanosecond and microsecond timestamps are divided down to milliseconds
    (rounded to 4 decimal places); millisecond and CPU-cycle values are
    returned untouched.

    Returns a (start_time, duration) tuple.
    """
    divisors = {
        TimeScale.CPU_CYCLES: 1,
        TimeScale.TIME_IN_MS: 1,
        TimeScale.TIME_IN_US: 1000,
        TimeScale.TIME_IN_NS: 1000000,
    }
    divisor = divisors[time_scale]
    raw_duration = event.end_time - event.start_time
    if divisor == 1:
        return event.start_time, raw_duration
    return round(event.start_time / divisor, 4), round(raw_duration / divisor, 4)
124
125
def parse_prof_blocks(
    prof_blocks: "Dict[str, List[Tuple[List[ProfileData], List[MemAllocation]]]]",
    allocator_dict: "Dict[int, str]",
    time_scale: "TimeScale",
) -> "Tuple[Dict[str, List[ProfileEvent]], Dict[str, List[MemEvent]]]":
    """Consolidate per-iteration profiling blocks into per-name summaries.

    Each value in prof_blocks is a list of (perf events, memory allocations)
    pairs, one pair per iteration of that code block. Perf events at the same
    position across iterations are merged into a single ProfileEvent whose
    ts/duration lists carry one entry per iteration. Memory allocations are
    taken from the first iteration only (later iterations are duplicates, as
    enforced by sanity_check_prof_outputs) and summed per allocator.

    Returns (perf events by block name, memory events by block name).
    """
    prof_data = OrderedDict()
    mem_prof_data = OrderedDict()

    for block_name, iterations in prof_blocks.items():
        consolidated_events = []
        mem_events = []

        for perf_events, mem_allocs in iterations:
            for idx, raw_event in enumerate(perf_events):
                start_time, duration = adjust_time_scale(raw_event, time_scale)
                if idx < len(consolidated_events):
                    # Event already seen in a previous iteration: extend its
                    # per-iteration time lists.
                    consolidated_events[idx].ts.append(start_time)
                    consolidated_events[idx].duration.append(duration)
                else:
                    consolidated_events.append(
                        ProfileEvent(
                            raw_event.name,
                            [start_time],
                            [duration],
                            raw_event.chain_idx,
                            raw_event.instruction_idx,
                        )
                    )

            # Keep only the first occurrence of each allocation slot across
            # iterations.
            for idx, alloc in enumerate(mem_allocs):
                if idx >= len(mem_events):
                    mem_events.append(alloc)

        # Total the allocations per allocator id, preserving first-seen order.
        totals = OrderedDict()
        for alloc in mem_events:
            totals[alloc.allocator_id] = (
                totals.get(alloc.allocator_id, 0) + alloc.allocation_size
            )

        prof_data[block_name] = consolidated_events
        mem_prof_data[block_name] = [
            MemEvent(allocator_dict[allocator_id], size)
            for allocator_id, size in totals.items()
        ]

    return prof_data, mem_prof_data
186
187
def sanity_check_prof_outputs(
    prof_blocks: "Dict[str, List[Tuple[List[ProfileData], List[MemAllocation]]]]"
):
    """Verify that all iterations of each profiling block are consistent.

    Consecutive iterations of a block must contain the same number of perf
    events with matching names, and identical memory-allocation entries
    (same allocator id and size at each position). Raises ValueError on any
    mismatch.
    """
    for block_iterations in prof_blocks.values():
        # Compare each iteration against the next one.
        for (base_prof, base_mem), (cmp_prof, cmp_mem) in zip(
            block_iterations, block_iterations[1:]
        ):
            if len(base_prof) != len(cmp_prof):
                raise ValueError(
                    "Profiling blocks corresponding to the same name shouldn't be of different lengths."
                )
            for base_entry, cmp_entry in zip(base_prof, cmp_prof):
                if base_entry.name != cmp_entry.name:
                    raise ValueError(
                        "Corresponding entries in different iterations of the "
                        "profiling block do not match"
                    )

            if len(base_mem) != len(cmp_mem):
                raise ValueError(
                    "Memory profiling blocks corresponding to the same name shouldn't be of different lengths."
                )
            for base_alloc, cmp_alloc in zip(base_mem, cmp_mem):
                if base_alloc.allocator_id != cmp_alloc.allocator_id:
                    raise ValueError(
                        "Corresponding entries in different iterations of the memory "
                        "profiling blocks do not have the same allocator id"
                    )
                if base_alloc.allocation_size != cmp_alloc.allocation_size:
                    raise ValueError(
                        "Corresponding entries in different iterations of the memory "
                        "profiling blocks do not have the same allocation size."
                    )
236
237
def _decode_name(name_bytes: bytes) -> str:
    """Decode a fixed-width 32-byte name field, stripping its NUL padding."""
    return name_bytes.decode("utf-8").replace("\u0000", "")


def deserialize_profile_results(
    buff: bytes, time_scale: TimeScale = TimeScale.TIME_IN_NS
) -> Tuple[Dict[str, List[ProfileEvent]], Dict[str, List[MemEvent]]]:
    """Deserialize a binary profiling dump retrieved from the runtime.

    buff holds one or more profiling blocks laid out back to back. Each block
    is a header (PROF_HEADER_STRUCT_FMT) followed by fixed-capacity tables of
    perf events, allocator descriptions and memory-allocation events; only
    the filled entries are parsed but the offset always advances by the full
    capacity recorded in the header.

    time_scale tells the post-processing step what unit the raw timestamps
    are in. Returns (perf events, memory events), each keyed by block name.
    Raises AssertionError on a profiler version mismatch and ValueError (via
    sanity_check_prof_outputs) on inconsistent iterations.
    """
    prof_header_struct_size = struct.calcsize(PROF_HEADER_STRUCT_FMT)
    prof_allocator_struct_size = struct.calcsize(ALLOCATOR_STRUCT_FMT)
    prof_allocation_struct_size = struct.calcsize(ALLOCATION_STRUCT_FMT)
    prof_result_struct_size = struct.calcsize(PROF_RESULT_STRUCT_FMT)
    prof_blocks = OrderedDict()
    # Allocator ids are shared across blocks, so one map covers the dump.
    allocator_dict = {}
    base_offset = 0

    while base_offset < len(buff):
        # Unpack the header for this profiling block, which tells us how many
        # entries of each kind are present in the block.
        prof_header_args = list(
            struct.unpack_from(PROF_HEADER_STRUCT_FMT, buff, offset=base_offset)
        )
        prof_header_args[0] = _decode_name(prof_header_args[0])
        prof_header = ProfilerHeader(*prof_header_args)
        base_offset += prof_header_struct_size

        # Fixed the previously garbled message ("dumpand") here.
        assert (
            prof_header.prof_ver == ET_PROF_VER
        ), "Mismatch in version between profile dump and post-processing tool"

        # Get all the profiling (perf events) entries.
        prof_data = []
        for i in range(prof_header.prof_entries):
            (
                name_bytes,
                chain_idx,
                instruction_idx,
                start_time,
                end_time,
            ) = struct.unpack_from(
                PROF_RESULT_STRUCT_FMT,
                buff,
                offset=base_offset + i * prof_result_struct_size,
            )
            prof_data.append(
                ProfileData(
                    _decode_name(name_bytes),
                    chain_idx,
                    instruction_idx,
                    start_time,
                    end_time,
                )
            )

        # Skip the table's reserved capacity, not just the filled entries.
        base_offset += prof_result_struct_size * prof_header.max_prof_entries

        # Parse the allocator table mapping allocator id -> designated name.
        for i in range(prof_header.allocator_entries):
            allocator_name, allocator_id = struct.unpack_from(
                ALLOCATOR_STRUCT_FMT,
                buff,
                offset=base_offset + i * prof_allocator_struct_size,
            )
            allocator_dict[allocator_id] = _decode_name(allocator_name)

        base_offset += prof_allocator_struct_size * prof_header.max_allocator_entries

        # Get all the profiling (memory allocation events) entries.
        mem_prof_data = []
        for i in range(prof_header.mem_prof_entries):
            mem_prof_data.append(
                MemAllocation(
                    *struct.unpack_from(
                        ALLOCATION_STRUCT_FMT,
                        buff,
                        offset=base_offset + i * prof_allocation_struct_size,
                    )
                )
            )

        base_offset += prof_allocation_struct_size * prof_header.max_mem_prof_entries

        # Blocks sharing a name are iterations of the same code segment; group
        # them under that name.
        prof_blocks.setdefault(prof_header.name, []).append(
            (prof_data, mem_prof_data)
        )

    sanity_check_prof_outputs(prof_blocks)
    return parse_prof_blocks(prof_blocks, allocator_dict, time_scale)
323
324
def profile_table(
    profile_data: Dict[str, List[ProfileEvent]], model_buffer=None
) -> List[PrettyTable]:
    """Render one PrettyTable per profiling block.

    Each row is one profiled event; the "Iteration i" columns hold that
    event's duration in each iteration of the block. The "Frame" column is a
    placeholder (always None). model_buffer is currently unused and kept for
    interface compatibility.
    """
    results = []

    for name, prof_entries_list in profile_data.items():
        table = PrettyTable()
        table.title = name
        table.add_rows(
            [
                (
                    entry.name,
                    entry.chain_idx,
                    entry.instruction_idx,
                    None,
                )
                + tuple(entry.duration)
                for entry in prof_entries_list
            ]
        )
        # Bug fix: the iteration-column count must match THIS table's rows.
        # A running max across all tables was used before, which made
        # field_names longer than the rows whenever a later block had fewer
        # iterations than an earlier one.
        num_iterations = len(prof_entries_list[0].duration)
        table.field_names = [
            "Name",
            "Chain",
            "Instr",
            "Frame",
        ] + ["Iteration " + str(i) for i in range(num_iterations)]
        results.append(table)
    return results
356
357
def mem_profile_table(mem_allocations: Dict[str, List[MemEvent]]) -> List[PrettyTable]:
    """Render one memory-usage PrettyTable per profiling block.

    Each row pairs an allocator's name with the total size allocated from it.
    """
    tables = []
    for block_name, mem_events in mem_allocations.items():
        table = PrettyTable()
        table.title = block_name
        table.add_rows(
            [
                (event.allocator_name, event.total_allocations_done)
                for event in mem_events
            ]
        )
        table.field_names = ["Allocator name"] + ["Total size of allocations done"]
        tables.append(table)
    return tables
370
371
def profile_aggregate_framework_tax(
    prof_data: "Dict[str, List[ProfileEvent]]"
) -> "Dict[str, ProfileEventFrameworkTax]":
    """Compute per-iteration framework overhead for each profiling block.

    For every block that contains both a "Method::execute" event and at least
    one kernel/delegate event ("native_call" or "delegate_execute"), the
    framework tax per iteration is the share of execute time not spent in
    kernels or delegates, as a percentage. Blocks missing either side are
    skipped.
    """
    prof_framework_tax = OrderedDict()

    for block_name, events in prof_data.items():
        execute_max = []
        kernel_and_delegate_sum = []

        for event in events:
            if "Method::execute" in event.name:
                # NOTE(review): max() on lists compares lexicographically, not
                # element-wise — relevant only if a block logs several
                # Method::execute events.
                execute_max = max(execute_max, event.duration)

            if "native_call" in event.name or "delegate_execute" in event.name:
                # Accumulate per-iteration kernel/delegate time.
                for i, dur in enumerate(event.duration):
                    if i < len(kernel_and_delegate_sum):
                        kernel_and_delegate_sum[i] += dur
                    else:
                        kernel_and_delegate_sum.append(dur)

        if not execute_max or not kernel_and_delegate_sum:
            continue

        framework_tax_list = [
            round((exec_t - kernel_delegate_t) / exec_t, 4) * 100
            for exec_t, kernel_delegate_t in zip(execute_max, kernel_and_delegate_sum)
        ]

        prof_framework_tax[block_name] = ProfileEventFrameworkTax(
            execute_max, kernel_and_delegate_sum, framework_tax_list
        )

    return prof_framework_tax
407
408
def profile_framework_tax_table(
    prof_framework_tax_data: Dict[str, ProfileEventFrameworkTax]
):
    """Render one framework-tax PrettyTable per profiling block.

    Rows are execution time, kernel/delegate time and framework tax (%), with
    one "Iteration i" column per iteration. Returns the list of tables.
    """
    # Bug fix: `tables` was re-initialized inside the loop below, so only the
    # table for the LAST block was ever returned.
    tables = []
    for name, tax_data in prof_framework_tax_data.items():
        table_agg = PrettyTable()
        table_agg.title = name + " framework tax calculations"

        table_agg.add_rows(
            [
                ("Model execution time", *tax_data.exec_time),
                (
                    "Time spent in kernels and delegates",
                    *tax_data.kernel_and_delegate_time,
                ),
                ("Framework tax (%)", *tax_data.framework_tax),
            ]
        )
        table_agg.field_names = [""] + [
            "Iteration " + str(i) for i in range(len(tax_data.exec_time))
        ]
        tables.append(table_agg)
    return tables
433
434
def deserialize_profile_results_files(
    profile_results_path: str,
    bundled_program_ff_path: str,
    time_scale: TimeScale = TimeScale.TIME_IN_NS,
):
    """Parse a profiling dump from disk and print all result tables.

    Reads the raw profiling dump and the bundled program flatbuffer, prints
    the perf, framework-tax and memory tables, and returns the parsed
    (prof_data, mem_allocations) pair.
    """
    with open(profile_results_path, "rb") as prof_res_file, open(
        bundled_program_ff_path, "rb"
    ) as model_ff_file:
        prof_res_buf = prof_res_file.read()
        bundled_program_ff_buf = model_ff_file.read()

    prof_data, mem_allocations = deserialize_profile_results(prof_res_buf, time_scale)
    framework_tax_data = profile_aggregate_framework_tax(prof_data)

    for table in profile_table(prof_data, bundled_program_ff_buf):
        print(table)

    for table in profile_framework_tax_table(framework_tax_data):
        print(table)

    for table in mem_profile_table(mem_allocations):
        print(table)

    return prof_data, mem_allocations
462