xref: /aosp_15_r20/external/bcc/tools/compactsnoop.py (revision 387f9dfdfa2baef462e92476d413c7bc2470293e)
1#!/usr/bin/env python
2# @lint-avoid-python-3-compatibility-imports
3#
4# compactsnoop  Trace compact zone and print details including issuing PID.
5#       For Linux, uses BCC, eBPF.
6#
7# This uses in-kernel eBPF maps to cache process details (PID and comm) by
8# compact zone begin, as well as a starting timestamp for calculating
9# latency.
10#
11# Copyright (c) 2019 Wenbo Zhang
12# Licensed under the Apache License, Version 2.0 (the "License")
13#
14# 11-NOV-2019   Wenbo Zhang   Created this.
15
16from __future__ import print_function
17from bcc import BPF
18import argparse
19import platform
20from datetime import datetime, timedelta
21import sys
22
# arguments
examples = """examples:
    ./compactsnoop          # trace all compact stall
    ./compactsnoop -T       # include timestamps
    ./compactsnoop -d 10    # trace for 10 seconds only
    ./compactsnoop -K       # output kernel stack trace
    ./compactsnoop -e       # show extended fields
"""


def _positive_int(text):
    """argparse type callback: parse *text* as an integer greater than zero.

    Raises argparse.ArgumentTypeError (reported by argparse as a normal usage
    error) when *text* is not an integer or is not positive.
    """
    try:
        value = int(text)
    except ValueError:
        raise argparse.ArgumentTypeError("invalid integer value: %r" % text)
    if value <= 0:
        raise argparse.ArgumentTypeError(
            "value must be greater than zero: %r" % text)
    return value


parser = argparse.ArgumentParser(
    description="Trace compact zone",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples,
)
parser.add_argument("-T", "--timestamp", action="store_true",
        help="include timestamp on output")
# The PID is later substituted into the BPF C source (PID_FILTER), so it is
# validated as a number here instead of being accepted as arbitrary text.
parser.add_argument("-p", "--pid", type=_positive_int,
        help="trace this PID only")
# Validated up front so that a bad value is a usage error, not a traceback.
parser.add_argument("-d", "--duration", type=_positive_int,
        help="total duration of trace in seconds")
parser.add_argument("-K", "--kernel-stack", action="store_true",
        help="output kernel stack trace")
parser.add_argument("-e", "--extended_fields", action="store_true",
        help="show system memory state")
parser.add_argument("--ebpf", action="store_true", help=argparse.SUPPRESS)
args = parser.parse_args()
debug = 0
if args.duration:
    # kept as a timedelta so it can be compared with elapsed wall-clock time
    args.duration = timedelta(seconds=args.duration)
51
# C preprocessor prologue prepended to bpf_text when -e is NOT given: it makes
# sure the extended-fields macro is undefined, so every #ifdef'd section of the
# BPF program is compiled out.
# NOTE: the macro name is misspelled ("EXTNEDED"), but the same spelling is
# used by every #ifdef in bpf_text, so it can only be renamed everywhere at
# once.
NO_EXTENDED = """
#ifdef EXTNEDED_FIELDS
#undef EXTNEDED_FIELDS
#endif
"""

# C preprocessor prologue prepended to bpf_text when -e is given: it defines
# the macro that enables the extended-fields sections of the BPF program.
EXTENDED = """
#define EXTNEDED_FIELDS    1
"""
61
62bpf_text = """
63#include <uapi/linux/ptrace.h>
64#include <linux/sched.h>
65#include <linux/mmzone.h>
66#include <linux/compaction.h>
67
// Pending state of one compaction attempt, kept in the start map (keyed by
// the bpf_get_current_pid_tgid() value of the thread) until
// mm_compaction_end is seen.
struct val_t {
    int nid;    // result of zone_to_nid_() for the zone
    int idx;    // result of zone_idx_() for the zone
    int order;  // order argument of mm_compaction_suitable
    int sync;   // sync flag recorded by mm_compaction_begin
#ifdef EXTNEDED_FIELDS
    int fragindex;  // return value of fragmentation_index(), or -1000 when
                    // that call was not observed for this attempt
    int low;        // _watermark[WMARK_LOW] + watermark_boost
    int min;        // _watermark[WMARK_MIN] + watermark_boost
    int high;       // _watermark[WMARK_HIGH] + watermark_boost
    int free;       // read from zone->vm_stat[NR_FREE_PAGES]
#endif
    u64 ts;    // compaction begin time
};
82
// Event record sent to user space through the events perf buffer, one per
// completed compaction (built in submit_event()).
struct data_t {
    u32 pid;    // upper 32 bits of bpf_get_current_pid_tgid()
    u32 tid;    // lower 32 bits of bpf_get_current_pid_tgid()
    int nid;    // copied from val_t
    int idx;    // copied from val_t
    int order;  // copied from val_t
    u64 delta;  // end time minus begin time, in nanoseconds
    u64 ts;    // compaction end time
               // (bpf_ktime_get_ns() / 1000, i.e. microseconds)
    int sync;   // copied from val_t
#ifdef EXTNEDED_FIELDS
    int fragindex;  // copied from val_t
    int low;        // copied from val_t
    int min;        // copied from val_t
    int high;       // copied from val_t
    int free;       // copied from val_t
#endif
    int status;     // status argument of mm_compaction_end
    int stack_id;   // result of stack_traces.get_stackid(); see submit_event()
    char comm[TASK_COMM_LEN];   // filled by bpf_get_current_comm()
};
103
// start: pending compaction state per thread, keyed by the value returned by
// bpf_get_current_pid_tgid()
BPF_HASH(start, u64, struct val_t);
// events: perf buffer carrying struct data_t records to user space
BPF_PERF_OUTPUT(events);
// stack_traces: stack storage used by submit_event(); data_t.stack_id refers
// to entries of this table
BPF_STACK_TRACE(stack_traces, 2048);
107
// Return the node id recorded in *zone. Without CONFIG_NUMA the zone has no
// node field to read, so the answer is always 0.
static inline int zone_to_nid_(struct zone *zone)
{
#ifdef CONFIG_NUMA
    int nid;

    // zone is a kernel pointer, so the field is fetched with a probe read
    bpf_probe_read_kernel(&nid, sizeof(nid), &zone->node);
    return nid;
#else
    return 0;
#endif
}
121
// Return the position of *zone inside the node_zones array of the
// pglist_data it points to. Open-coded form of the kernel macro quoted
// below, with zone->zone_pgdat fetched through bpf_probe_read_kernel().
// #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones)
static inline int zone_idx_(struct zone *zone)
{
    struct pglist_data *zone_pgdat = NULL;
    bpf_probe_read_kernel(&zone_pgdat, sizeof(zone_pgdat), &zone->zone_pgdat);
    // byte distance from the start of node_zones divided by the zone size
    return ((u64)zone - (u64)zone_pgdat->node_zones)/sizeof(struct zone);
}
129
#ifdef EXTNEDED_FIELDS
// Fill valp->min, valp->low, valp->high and valp->free from *zone.
// Each watermark is stored as the raw _watermark[] entry plus
// watermark_boost; the u64 sums are narrowed to the int fields of val_t.
static inline void get_all_wmark_pages(struct zone *zone, struct val_t *valp)
{
    u64 _watermark[NR_WMARK] = {};
    u64 watermark_boost = 0;

    bpf_probe_read_kernel(&_watermark, sizeof(_watermark), &zone->_watermark);
    bpf_probe_read_kernel(&watermark_boost, sizeof(watermark_boost),
                    &zone->watermark_boost);
    valp->min = _watermark[WMARK_MIN] + watermark_boost;
    valp->low = _watermark[WMARK_LOW] + watermark_boost;
    valp->high = _watermark[WMARK_HIGH] + watermark_boost;
    // NOTE(review): only sizeof(int) bytes of the vm_stat entry are copied;
    // confirm this matches the width of the kernel counter that is wanted.
    bpf_probe_read_kernel(&valp->free, sizeof(valp->free),
                    &zone->vm_stat[NR_FREE_PAGES]);
}
#endif
146
// Build a data_t record for the compaction that just finished on the current
// thread, send it to user space and drop the pending entry from the start
// map. Does nothing when the thread has no pending entry.
//   ctx:    probe context, passed on to get_stackid() and perf_submit()
//   status: value stored in data_t.status
static inline void submit_event(void *ctx, int status)
{
    struct data_t data = {};
    u64 ts = bpf_ktime_get_ns();
    u64 id = bpf_get_current_pid_tgid();
    struct val_t *valp = start.lookup(&id);
    if (valp == NULL) {
        // missed entry
        return;
    }

    // latency in nanoseconds, measured from the time stored by
    // mm_compaction_begin
    data.delta = ts - valp->ts;
    // end time in microseconds
    data.ts = ts / 1000;
    data.pid = id >> 32;
    data.tid = id;
    bpf_get_current_comm(&data.comm, sizeof(data.comm));
    data.nid = valp->nid;
    data.idx = valp->idx;
    data.order = valp->order;
    data.sync = valp->sync;

#ifdef EXTNEDED_FIELDS
    data.fragindex = valp->fragindex;
    data.min = valp->min;
    data.low = valp->low;
    data.high = valp->high;
    data.free = valp->free;
#endif

    data.status = status;
    // the stack is recorded for every event; user space only prints it
    // when the kernel-stack option is given
    data.stack_id = stack_traces.get_stackid(ctx, 0);

    events.perf_submit(ctx, &data, sizeof(data));

    // the attempt is finished, so its pending state is no longer needed
    start.delete(&id);
}
183
#ifdef EXTNEDED_FIELDS
// Return-probe handler for fragmentation_index(); the Python side attaches
// it only when extended fields are requested. Stores an otherwise zeroed
// entry for the current thread that carries the returned fragmentation
// index; the remaining fields are filled in by the later probes.
int trace_fragmentation_index_return(struct pt_regs *ctx)
{
    struct val_t val = { };
    int ret = PT_REGS_RC(ctx);
    u64 id = bpf_get_current_pid_tgid();
    // PID_FILTER is a placeholder that the Python script replaces with an
    // early return for other processes (or with nothing) before compiling
    PID_FILTER
    val.fragindex = ret;
    start.update(&id, &val);
    return 0;
}
#endif
196
197static inline void fill_compact_info(struct val_t *valp,
198                                     struct zone *zone,
199                                     int order)
200{
201    valp->nid = zone_to_nid_(zone);
202    valp->idx = zone_idx_(zone);
203    valp->order = order;
204}
205
206RAW_TRACEPOINT_PROBE(mm_compaction_suitable)
207{
208    // TP_PROTO(struct zone *zone, int order, int ret)
209    struct zone *zone = (struct zone *)ctx->args[0];
210    int order = (int)ctx->args[1];
211    int ret = (int)ctx->args[2];
212    u64 id;
213
214    if(ret != COMPACT_CONTINUE)
215        return 0;
216
217    id = bpf_get_current_pid_tgid();
218    PID_FILTER
219
220#ifdef EXTNEDED_FIELDS
221    struct val_t *valp = start.lookup(&id);
222    if (valp == NULL) {
223        // missed entry or order <= PAGE_ALLOC_COSTLY_ORDER, eg:
224        // manual trigger echo 1 > /proc/sys/vm/compact_memory
225        struct val_t val = { .fragindex = -1000 };
226        valp = &val;
227        start.update(&id, valp);
228    }
229    fill_compact_info(valp, zone, order);
230    get_all_wmark_pages(zone, valp);
231#else
232    struct val_t val = { };
233    fill_compact_info(&val, zone, order);
234    start.update(&id, &val);
235#endif
236
237    return 0;
238}
239
240TRACEPOINT_PROBE(compaction, mm_compaction_begin)
241{
242    bool sync = args->sync;
243
244    u64 id = bpf_get_current_pid_tgid();
245    struct val_t *valp = start.lookup(&id);
246    if (valp == NULL) {
247        // missed entry
248        return 0;
249    }
250
251    valp->ts = bpf_ktime_get_ns();
252    valp->sync = sync;
253    return 0;
254}
255
// Tracepoint handler for compaction:mm_compaction_end: reports the finished
// compaction of the current thread, passing on the status of the tracepoint.
TRACEPOINT_PROBE(compaction, mm_compaction_end)
{
    submit_event(args, args->status);
    return 0;
}
261"""
262
# zone_idx_to_str() only knows the x86_64 zone layout, so refuse to run on any
# other machine type and report the failure through the exit status.
if platform.machine() != 'x86_64':
    print("""
          Currently only support x86_64 servers, if you want to use it on
          other platforms, please refer include/linux/mmzone.h to modify
          zone_idx_to_str to get the right zone type
    """)
    sys.exit(1)

# select whether the extended-field sections of the BPF program are compiled
if args.extended_fields:
    bpf_text = EXTENDED + bpf_text
else:
    bpf_text = NO_EXTENDED + bpf_text

# PID_FILTER is a placeholder in the BPF source: expand it to an early return
# for every other process, or to nothing when all processes are traced.
if args.pid:
    # Only a number may ever be inserted into the C source text.
    try:
        filter_pid = int(args.pid)
    except ValueError:
        sys.exit("compactsnoop: invalid PID: %r" % args.pid)
    bpf_text = bpf_text.replace("PID_FILTER",
                                "if (id >> 32 != %d) { return 0; }" %
                                filter_pid)
else:
    bpf_text = bpf_text.replace("PID_FILTER", "")
if debug or args.ebpf:
    print(bpf_text)
    if args.ebpf:
        sys.exit()

# load BPF program
b = BPF(text=bpf_text)
if args.extended_fields:
    # the fragmentation index is only collected for the extended output
    b.attach_kretprobe(event="fragmentation_index",
                       fn_name="trace_fragmentation_index_return")

stack_traces = b.get_table("stack_traces")
# timestamp of the first event seen; print_event() reports times relative to it
initial_ts = 0
294
def zone_idx_to_str(idx):
    """Return the name of the memory zone with index *idx*.

    The names follow include/linux/mmzone.h and consider only x86_64 servers.
    An index without a known name is returned as its string form.
    """
    names = {0: "ZONE_DMA", 1: "ZONE_DMA32", 2: "ZONE_NORMAL"}
    return names.get(idx, str(idx))
308
def compact_result_to_str(status):
    """Return a short lower-case name for the compaction result *status*.

    The names mirror include/trace/events/mmflags.h and
    include/linux/compaction.h. A status without a known name is returned as
    its string form.
    """
    # The position of each name equals the numeric value it stands for.
    names = (
        # COMPACT_NOT_SUITABLE_ZONE: for more detailed tracepoint output,
        # internal to compaction
        "not_suitable_zone",
        # COMPACT_SKIPPED: compaction didn't start as it was not possible
        # or direct reclaim was more suitable
        "skipped",
        # COMPACT_DEFERRED: compaction didn't start as it was deferred due
        # to past failures
        "deferred",
        # COMPACT_NO_SUITABLE_PAGE: for more detailed tracepoint output,
        # internal to compaction
        "no_suitable_page",
        # COMPACT_CONTINUE: compaction should continue to another pageblock
        "continue",
        # COMPACT_COMPLETE: the full zone was scanned but compaction did not
        # succeed in producing suitable pages
        "complete",
        # COMPACT_PARTIAL_SKIPPED: direct compaction scanned part of the
        # zone but did not succeed in producing suitable pages
        "partial_skipped",
        # COMPACT_CONTENDED: compaction terminated prematurely due to lock
        # contentions
        "contended",
        # COMPACT_SUCCESS: direct compaction terminated after concluding
        # that the allocation should now succeed
        "success",
    )
    return dict(enumerate(names)).get(status, str(status))
345
# header: collect the column groups that are enabled, then emit them as one
# line separated by single spaces
header_cols = []
if args.timestamp:
    header_cols.append("%-14s" % ("TIME(s)"))
header_cols.append("%-14s %-6s %-4s %-12s %-5s %-7s" %
                   ("COMM", "PID", "NODE", "ZONE", "ORDER", "MODE"))
if args.extended_fields:
    header_cols.append("%-8s %-8s %-8s %-8s %-8s" %
                       ("FRAGIDX", "MIN", "LOW", "HIGH", "FREE"))
header_cols.append("%9s %16s" % ("LAT(ms)", "STATUS"))
print(" ".join(header_cols))
355
356# process event
def print_event(cpu, data, size):
    """Perf buffer callback: print one completed compaction as a table row.

    cpu and size are supplied by the perf buffer and are not used; data is
    the raw record that is decoded into the event structure. The optional
    timestamp, extended-field and kernel-stack parts follow the command line
    options.
    """
    global initial_ts

    event = b["events"].event(data)

    # the first event defines time zero for the TIME(s) column
    if not initial_ts:
        initial_ts = event.ts

    if args.timestamp:
        # event.ts is in microseconds, the column is in seconds
        delta = event.ts - initial_ts
        print("%-14.9f" % (float(delta) / 1000000), end=" ")

    print("%-14.14s %-6s %-4s %-12s %-5s %-7s" % (
            event.comm.decode("utf-8", "replace"),
            event.pid,
            event.nid,
            zone_idx_to_str(event.idx),
            event.order,
            "SYNC" if event.sync else "ASYNC"), end=" ")
    if args.extended_fields:
        print("%-8.3f %-8s %-8s %-8s %-8s" % (
            (float(event.fragindex) / 1000),
            event.min, event.low, event.high, event.free
            ), end=" ")
    # event.delta is in nanoseconds, the column is in milliseconds
    print("%9.3f %16s" % (
        float(event.delta) / 1000000, compact_result_to_str(event.status)))
    if args.kernel_stack:
        if event.stack_id < 0:
            # a negative id means no stack was stored; there is nothing to
            # look up in the stack table for it
            print("\t[Missed Kernel Stack]")
        else:
            for addr in stack_traces.walk(event.stack_id):
                sym = b.ksym(addr, show_offset=True)
                # symbol names arrive as bytes; decode them so that the
                # output does not show a bytes literal
                if isinstance(sym, bytes):
                    sym = sym.decode("utf-8", "replace")
                print("\t%s" % sym)
        print("")

    sys.stdout.flush()
390
# loop with callback to print_event
b["events"].open_perf_buffer(print_event, page_cnt=64)
start_time = datetime.now()
# With a duration the poll must wake up periodically: a poll without timeout
# only returns when an event arrives, so the duration check below would not
# run while the system is not compacting. -1 keeps the blocking behaviour.
poll_timeout_ms = 1000 if args.duration else -1
while not args.duration or datetime.now() - start_time < args.duration:
    try:
        b.perf_buffer_poll(timeout=poll_timeout_ms)
    except KeyboardInterrupt:
        sys.exit()
399