xref: /aosp_15_r20/external/bcc/tools/biosnoop.py (revision 387f9dfdfa2baef462e92476d413c7bc2470293e)
1#!/usr/bin/env python
2# @lint-avoid-python-3-compatibility-imports
3#
4# biosnoop  Trace block device I/O and print details including issuing PID.
5#           For Linux, uses BCC, eBPF.
6#
7# This uses in-kernel eBPF maps to cache process details (PID and comm) by I/O
8# request, as well as a starting timestamp for calculating I/O latency.
9#
10# Copyright (c) 2015 Brendan Gregg.
11# Licensed under the Apache License, Version 2.0 (the "License")
12#
13# 16-Sep-2015   Brendan Gregg   Created this.
14# 11-Feb-2016   Allan McAleavy  updated for BPF_PERF_OUTPUT
15# 21-Jun-2022   Rocky Xing      Added disk filter support.
16# 13-Oct-2022   Rocky Xing      Added support for displaying block I/O pattern.
17# 01-Aug-2023   Jerome Marchand Added support for block tracepoints
18
19from __future__ import print_function
20from bcc import BPF
21import argparse
22import os
23
24# arguments
25examples = """examples:
26    ./biosnoop           # trace all block I/O
27    ./biosnoop -Q        # include OS queued time
28    ./biosnoop -d sdc    # trace sdc only
29    ./biosnoop -P        # display block I/O pattern
30"""
31parser = argparse.ArgumentParser(
32    description="Trace block I/O",
33    formatter_class=argparse.RawDescriptionHelpFormatter,
34    epilog=examples)
35parser.add_argument("-Q", "--queue", action="store_true",
36    help="include OS queued time")
37parser.add_argument("-d", "--disk", type=str,
38    help="trace this disk only")
39parser.add_argument("-P", "--pattern", action="store_true",
40    help="display block I/O pattern (sequential or random)")
41parser.add_argument("--ebpf", action="store_true",
42    help=argparse.SUPPRESS)
43args = parser.parse_args()
44debug = 0
45
46# define BPF program
47bpf_text = """
48#include <uapi/linux/ptrace.h>
49#include <linux/blk-mq.h>
50"""
51
52if args.pattern:
53    bpf_text += "#define INCLUDE_PATTERN\n"
54
55bpf_text += """
56// for saving the timestamp and __data_len of each request
57struct start_req_t {
58    u64 ts;
59    u64 data_len;
60};
61
62struct val_t {
63    u64 ts;
64    u32 pid;
65    char name[TASK_COMM_LEN];
66};
67
68struct tp_args {
69    u64 __unused__;
70    dev_t dev;
71    sector_t sector;
72    unsigned int nr_sector;
73    unsigned int bytes;
74    char rwbs[8];
75    char comm[16];
76    char cmd[];
77};
78
79struct hash_key {
80    dev_t dev;
81    u32 rwflag;
82    sector_t sector;
83};
84
85
86#ifdef INCLUDE_PATTERN
87struct sector_key_t {
88    u32 dev_major;
89    u32 dev_minor;
90};
91
92enum bio_pattern {
93    UNKNOWN,
94    SEQUENTIAL,
95    RANDOM,
96};
97#endif
98
99struct data_t {
100    u32 pid;
101    u32 dev;
102    u64 rwflag;
103    u64 delta;
104    u64 qdelta;
105    u64 sector;
106    u64 len;
107#ifdef INCLUDE_PATTERN
108    enum bio_pattern pattern;
109#endif
110    u64 ts;
111    char name[TASK_COMM_LEN];
112};
113
114#ifdef INCLUDE_PATTERN
115BPF_HASH(last_sectors, struct sector_key_t, u64);
116#endif
117
118BPF_HASH(start, struct hash_key, struct start_req_t);
119BPF_HASH(infobyreq, struct hash_key, struct val_t);
120BPF_PERF_OUTPUT(events);
121
122static dev_t ddevt(struct gendisk *disk) {
123    return (disk->major  << 20) | disk->first_minor;
124}
125
126/*
127 * The following deals with a kernel version change (in mainline 4.7, although
128 * it may be backported to earlier kernels) with how block request write flags
129 * are tested. We handle both pre- and post-change versions here. Please avoid
130 * kernel version tests like this as much as possible: they inflate the code,
131 * test, and maintenance burden.
132 */
133static int get_rwflag(u32 cmd_flags) {
134#ifdef REQ_WRITE
135    return !!(cmd_flags & REQ_WRITE);
136#elif defined(REQ_OP_SHIFT)
137    return !!((cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE);
138#else
139    return !!((cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE);
140#endif
141}
142
143#define RWBS_LEN	8
144
145static int get_rwflag_tp(char *rwbs) {
146    for (int i = 0; i < RWBS_LEN; i++) {
147        if (rwbs[i] == 'W')
148            return 1;
149        if (rwbs[i] == '\\0')
150            return 0;
151    }
152    return 0;
153}
154
155// cache PID and comm by-req
156static int __trace_pid_start(struct hash_key key)
157{
158    DISK_FILTER
159
160    struct val_t val = {};
161    u64 ts;
162
163    if (bpf_get_current_comm(&val.name, sizeof(val.name)) == 0) {
164        val.pid = bpf_get_current_pid_tgid() >> 32;
165        if (##QUEUE##) {
166            val.ts = bpf_ktime_get_ns();
167        }
168        infobyreq.update(&key, &val);
169    }
170    return 0;
171}
172
173
174int trace_pid_start(struct pt_regs *ctx, struct request *req)
175{
176    struct hash_key key = {
177        .dev = ddevt(req->__RQ_DISK__),
178        .rwflag = get_rwflag(req->cmd_flags),
179        .sector = req->__sector
180    };
181
182    return __trace_pid_start(key);
183}
184
185int trace_pid_start_tp(struct tp_args *args)
186{
187    struct hash_key key = {
188        .dev = args->dev,
189        .rwflag = get_rwflag_tp(args->rwbs),
190        .sector = args->sector
191    };
192
193    return __trace_pid_start(key);
194}
195
196// time block I/O
197int trace_req_start(struct pt_regs *ctx, struct request *req)
198{
199    struct hash_key key = {
200        .dev = ddevt(req->__RQ_DISK__),
201        .rwflag = get_rwflag(req->cmd_flags),
202        .sector = req->__sector
203    };
204
205    DISK_FILTER
206
207    struct start_req_t start_req = {
208        .ts = bpf_ktime_get_ns(),
209        .data_len = req->__data_len
210    };
211    start.update(&key, &start_req);
212    return 0;
213}
214
215// output
216static int __trace_req_completion(void *ctx, struct hash_key key)
217{
218    struct start_req_t *startp;
219    struct val_t *valp;
220    struct data_t data = {};
221    //struct gendisk *rq_disk;
222    u64 ts;
223
224    // fetch timestamp and calculate delta
225    startp = start.lookup(&key);
226    if (startp == 0) {
227        // missed tracing issue
228        return 0;
229    }
230    ts = bpf_ktime_get_ns();
231    //rq_disk = req->__RQ_DISK__;
232    data.delta = ts - startp->ts;
233    data.ts = ts / 1000;
234    data.qdelta = 0;
235    data.len = startp->data_len;
236
237    valp = infobyreq.lookup(&key);
238    if (valp == 0) {
239        data.name[0] = '?';
240        data.name[1] = 0;
241    } else {
242        if (##QUEUE##) {
243            data.qdelta = startp->ts - valp->ts;
244        }
245        data.pid = valp->pid;
246        data.sector = key.sector;
247        data.dev = key.dev;
248        bpf_probe_read_kernel(&data.name, sizeof(data.name), valp->name);
249    }
250
251#ifdef INCLUDE_PATTERN
252    data.pattern = UNKNOWN;
253
254    u64 *sector, last_sector;
255
256    struct sector_key_t sector_key = {
257        .dev_major = key.dev >> 20,
258        .dev_minor = key.dev & ((1 << 20) - 1)
259    };
260
261    sector = last_sectors.lookup(&sector_key);
262    if (sector != 0) {
263        data.pattern = req->__sector == *sector ? SEQUENTIAL : RANDOM;
264    }
265
266    last_sector = req->__sector + data.len / 512;
267    last_sectors.update(&sector_key, &last_sector);
268#endif
269
270    data.rwflag = key.rwflag;
271
272    events.perf_submit(ctx, &data, sizeof(data));
273    start.delete(&key);
274    infobyreq.delete(&key);
275
276    return 0;
277}
278
279int trace_req_completion(struct pt_regs *ctx, struct request *req)
280{
281    struct hash_key key = {
282        .dev = ddevt(req->__RQ_DISK__),
283        .rwflag = get_rwflag(req->cmd_flags),
284        .sector = req->__sector
285    };
286
287    return __trace_req_completion(ctx, key);
288}
289
290int trace_req_completion_tp(struct tp_args *args)
291{
292    struct hash_key key = {
293        .dev = args->dev,
294        .rwflag = get_rwflag_tp(args->rwbs),
295        .sector = args->sector
296    };
297
298    return __trace_req_completion(args, key);
299}
300"""
301if args.queue:
302    bpf_text = bpf_text.replace('##QUEUE##', '1')
303else:
304    bpf_text = bpf_text.replace('##QUEUE##', '0')
305if BPF.kernel_struct_has_field(b'request', b'rq_disk') == 1:
306    bpf_text = bpf_text.replace('__RQ_DISK__', 'rq_disk')
307else:
308    bpf_text = bpf_text.replace('__RQ_DISK__', 'q->disk')
309
310if args.disk is not None:
311    disk_path = os.path.join('/dev', args.disk)
312    if not os.path.exists(disk_path):
313        print("no such disk '%s'" % args.disk)
314        exit(1)
315
316    stat_info = os.stat(disk_path)
317    dev = os.major(stat_info.st_rdev) << 20 | os.minor(stat_info.st_rdev)
318
319    disk_filter_str = """
320    if(key.dev != %s) {
321        return 0;
322    }
323    """ % (dev)
324
325    bpf_text = bpf_text.replace('DISK_FILTER', disk_filter_str)
326else:
327    bpf_text = bpf_text.replace('DISK_FILTER', '')
328
329if debug or args.ebpf:
330    print(bpf_text)
331    if args.ebpf:
332        exit()
333
334# initialize BPF
335b = BPF(text=bpf_text)
336if BPF.get_kprobe_functions(b'__blk_account_io_start'):
337    b.attach_kprobe(event="__blk_account_io_start", fn_name="trace_pid_start")
338elif BPF.get_kprobe_functions(b'blk_account_io_start'):
339    b.attach_kprobe(event="blk_account_io_start", fn_name="trace_pid_start")
340else:
341    b.attach_tracepoint(tp="block:block_io_start", fn_name="trace_pid_start_tp")
342if BPF.get_kprobe_functions(b'blk_start_request'):
343    b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
344b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
345if BPF.get_kprobe_functions(b'__blk_account_io_done'):
346    b.attach_kprobe(event="__blk_account_io_done", fn_name="trace_req_completion")
347elif BPF.get_kprobe_functions(b'blk_account_io_done'):
348    b.attach_kprobe(event="blk_account_io_done", fn_name="trace_req_completion")
349else:
350    b.attach_tracepoint(tp="block:block_io_done", fn_name="trace_req_completion_tp")
351
352# header
353print("%-11s %-14s %-7s %-9s %-1s %-10s %-7s" % ("TIME(s)", "COMM", "PID",
354    "DISK", "T", "SECTOR", "BYTES"), end="")
355if args.pattern:
356    print("%-1s " % ("P"), end="")
357if args.queue:
358    print("%7s " % ("QUE(ms)"), end="")
359print("%7s" % "LAT(ms)")
360
361
362# cache disk major,minor -> diskname
363diskstats = "/proc/diskstats"
364disklookup = {}
365with open(diskstats) as stats:
366    for line in stats:
367        a = line.split()
368        disklookup[a[0] + "," + a[1]] = a[2]
369
370def disk_print(d):
371    major = d >> 20
372    minor = d & ((1 << 20) - 1)
373
374    disk = str(major) + "," + str(minor)
375    if disk in disklookup:
376        diskname = disklookup[disk]
377    else:
378        diskname = "<unknown>"
379
380    return diskname
381
382rwflg = ""
383pattern = ""
384start_ts = 0
385prev_ts = 0
386delta = 0
387
388P_SEQUENTIAL = 1
389P_RANDOM = 2
390
391# process event
392def print_event(cpu, data, size):
393    event = b["events"].event(data)
394
395    global start_ts
396    if start_ts == 0:
397        start_ts = event.ts
398
399    if event.rwflag == 1:
400        rwflg = "W"
401    else:
402        rwflg = "R"
403
404    delta = float(event.ts) - start_ts
405
406    disk_name = disk_print(event.dev)
407
408    print("%-11.6f %-14.14s %-7s %-9s %-1s %-10s %-7s" % (
409        delta / 1000000, event.name.decode('utf-8', 'replace'), event.pid,
410        disk_name, rwflg, event.sector, event.len), end="")
411    if args.pattern:
412        if event.pattern == P_SEQUENTIAL:
413            pattern = "S"
414        elif event.pattern == P_RANDOM:
415            pattern = "R"
416        else:
417            pattern = "?"
418        print("%-1s " % pattern, end="")
419    if args.queue:
420        print("%7.2f " % (float(event.qdelta) / 1000000), end="")
421    print("%7.2f" % (float(event.delta) / 1000000))
422
423# loop with callback to print_event
424b["events"].open_perf_buffer(print_event, page_cnt=64)
425while 1:
426    try:
427        b.perf_buffer_poll()
428    except KeyboardInterrupt:
429        exit()
430