1#!/usr/bin/env python 2# @lint-avoid-python-3-compatibility-imports 3# 4# biosnoop Trace block device I/O and print details including issuing PID. 5# For Linux, uses BCC, eBPF. 6# 7# This uses in-kernel eBPF maps to cache process details (PID and comm) by I/O 8# request, as well as a starting timestamp for calculating I/O latency. 9# 10# Copyright (c) 2015 Brendan Gregg. 11# Licensed under the Apache License, Version 2.0 (the "License") 12# 13# 16-Sep-2015 Brendan Gregg Created this. 14# 11-Feb-2016 Allan McAleavy updated for BPF_PERF_OUTPUT 15# 21-Jun-2022 Rocky Xing Added disk filter support. 16# 13-Oct-2022 Rocky Xing Added support for displaying block I/O pattern. 17# 01-Aug-2023 Jerome Marchand Added support for block tracepoints 18 19from __future__ import print_function 20from bcc import BPF 21import argparse 22import os 23 24# arguments 25examples = """examples: 26 ./biosnoop # trace all block I/O 27 ./biosnoop -Q # include OS queued time 28 ./biosnoop -d sdc # trace sdc only 29 ./biosnoop -P # display block I/O pattern 30""" 31parser = argparse.ArgumentParser( 32 description="Trace block I/O", 33 formatter_class=argparse.RawDescriptionHelpFormatter, 34 epilog=examples) 35parser.add_argument("-Q", "--queue", action="store_true", 36 help="include OS queued time") 37parser.add_argument("-d", "--disk", type=str, 38 help="trace this disk only") 39parser.add_argument("-P", "--pattern", action="store_true", 40 help="display block I/O pattern (sequential or random)") 41parser.add_argument("--ebpf", action="store_true", 42 help=argparse.SUPPRESS) 43args = parser.parse_args() 44debug = 0 45 46# define BPF program 47bpf_text = """ 48#include <uapi/linux/ptrace.h> 49#include <linux/blk-mq.h> 50""" 51 52if args.pattern: 53 bpf_text += "#define INCLUDE_PATTERN\n" 54 55bpf_text += """ 56// for saving the timestamp and __data_len of each request 57struct start_req_t { 58 u64 ts; 59 u64 data_len; 60}; 61 62struct val_t { 63 u64 ts; 64 u32 pid; 65 char name[TASK_COMM_LEN]; 66}; 67 68struct tp_args { 69 u64 __unused__; 70 dev_t dev; 71 sector_t sector; 72 unsigned int nr_sector; 73 unsigned int bytes; 74 char rwbs[8]; 75 char comm[16]; 76 char cmd[]; 77}; 78 79struct hash_key { 80 dev_t dev; 81 u32 rwflag; 82 sector_t sector; 83}; 84 85 86#ifdef INCLUDE_PATTERN 87struct sector_key_t { 88 u32 dev_major; 89 u32 dev_minor; 90}; 91 92enum bio_pattern { 93 UNKNOWN, 94 SEQUENTIAL, 95 RANDOM, 96}; 97#endif 98 99struct data_t { 100 u32 pid; 101 u32 dev; 102 u64 rwflag; 103 u64 delta; 104 u64 qdelta; 105 u64 sector; 106 u64 len; 107#ifdef INCLUDE_PATTERN 108 enum bio_pattern pattern; 109#endif 110 u64 ts; 111 char name[TASK_COMM_LEN]; 112}; 113 114#ifdef INCLUDE_PATTERN 115BPF_HASH(last_sectors, struct sector_key_t, u64); 116#endif 117 118BPF_HASH(start, struct hash_key, struct start_req_t); 119BPF_HASH(infobyreq, struct hash_key, struct val_t); 120BPF_PERF_OUTPUT(events); 121 122static dev_t ddevt(struct gendisk *disk) { 123 return (disk->major << 20) | disk->first_minor; 124} 125 126/* 127 * The following deals with a kernel version change (in mainline 4.7, although 128 * it may be backported to earlier kernels) with how block request write flags 129 * are tested. We handle both pre- and post-change versions here. Please avoid 130 * kernel version tests like this as much as possible: they inflate the code, 131 * test, and maintenance burden. 132 */ 133static int get_rwflag(u32 cmd_flags) { 134#ifdef REQ_WRITE 135 return !!(cmd_flags & REQ_WRITE); 136#elif defined(REQ_OP_SHIFT) 137 return !!((cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE); 138#else 139 return !!((cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE); 140#endif 141} 142 143#define RWBS_LEN 8 144 145static int get_rwflag_tp(char *rwbs) { 146 for (int i = 0; i < RWBS_LEN; i++) { 147 if (rwbs[i] == 'W') 148 return 1; 149 if (rwbs[i] == '\\0') 150 return 0; 151 } 152 return 0; 153} 154 155// cache PID and comm by-req 156static int __trace_pid_start(struct hash_key key) 157{ 158 DISK_FILTER 159 160 struct val_t val = {}; 161 u64 ts; 162 163 if (bpf_get_current_comm(&val.name, sizeof(val.name)) == 0) { 164 val.pid = bpf_get_current_pid_tgid() >> 32; 165 if (##QUEUE##) { 166 val.ts = bpf_ktime_get_ns(); 167 } 168 infobyreq.update(&key, &val); 169 } 170 return 0; 171} 172 173 174int trace_pid_start(struct pt_regs *ctx, struct request *req) 175{ 176 struct hash_key key = { 177 .dev = ddevt(req->__RQ_DISK__), 178 .rwflag = get_rwflag(req->cmd_flags), 179 .sector = req->__sector 180 }; 181 182 return __trace_pid_start(key); 183} 184 185int trace_pid_start_tp(struct tp_args *args) 186{ 187 struct hash_key key = { 188 .dev = args->dev, 189 .rwflag = get_rwflag_tp(args->rwbs), 190 .sector = args->sector 191 }; 192 193 return __trace_pid_start(key); 194} 195 196// time block I/O 197int trace_req_start(struct pt_regs *ctx, struct request *req) 198{ 199 struct hash_key key = { 200 .dev = ddevt(req->__RQ_DISK__), 201 .rwflag = get_rwflag(req->cmd_flags), 202 .sector = req->__sector 203 }; 204 205 DISK_FILTER 206 207 struct start_req_t start_req = { 208 .ts = bpf_ktime_get_ns(), 209 .data_len = req->__data_len 210 }; 211 start.update(&key, &start_req); 212 return 0; 213} 214 215// output 216static int __trace_req_completion(void *ctx, struct hash_key key) 217{ 218 struct start_req_t *startp; 219 struct val_t *valp; 220 struct data_t data = {}; 221 //struct gendisk *rq_disk; 222 u64 ts; 223 224 // fetch timestamp and calculate delta 225 startp = start.lookup(&key); 226 if (startp == 0) { 227 // missed tracing issue 228 return 0; 229 } 230 ts = bpf_ktime_get_ns(); 231 //rq_disk = req->__RQ_DISK__; 232 data.delta = ts - startp->ts; 233 data.ts = ts / 1000; 234 data.qdelta = 0; 235 data.len = startp->data_len; 236 237 valp = infobyreq.lookup(&key); 238 if (valp == 0) { 239 data.name[0] = '?'; 240 data.name[1] = 0; 241 } else { 242 if (##QUEUE##) { 243 data.qdelta = startp->ts - valp->ts; 244 } 245 data.pid = valp->pid; 246 data.sector = key.sector; 247 data.dev = key.dev; 248 bpf_probe_read_kernel(&data.name, sizeof(data.name), valp->name); 249 } 250 251#ifdef INCLUDE_PATTERN 252 data.pattern = UNKNOWN; 253 254 u64 *sector, last_sector; 255 256 struct sector_key_t sector_key = { 257 .dev_major = key.dev >> 20, 258 .dev_minor = key.dev & ((1 << 20) - 1) 259 }; 260 261 sector = last_sectors.lookup(§or_key); 262 if (sector != 0) { 263 data.pattern = req->__sector == *sector ? SEQUENTIAL : RANDOM; 264 } 265 266 last_sector = req->__sector + data.len / 512; 267 last_sectors.update(§or_key, &last_sector); 268#endif 269 270 data.rwflag = key.rwflag; 271 272 events.perf_submit(ctx, &data, sizeof(data)); 273 start.delete(&key); 274 infobyreq.delete(&key); 275 276 return 0; 277} 278 279int trace_req_completion(struct pt_regs *ctx, struct request *req) 280{ 281 struct hash_key key = { 282 .dev = ddevt(req->__RQ_DISK__), 283 .rwflag = get_rwflag(req->cmd_flags), 284 .sector = req->__sector 285 }; 286 287 return __trace_req_completion(ctx, key); 288} 289 290int trace_req_completion_tp(struct tp_args *args) 291{ 292 struct hash_key key = { 293 .dev = args->dev, 294 .rwflag = get_rwflag_tp(args->rwbs), 295 .sector = args->sector 296 }; 297 298 return __trace_req_completion(args, key); 299} 300""" 301if args.queue: 302 bpf_text = bpf_text.replace('##QUEUE##', '1') 303else: 304 bpf_text = bpf_text.replace('##QUEUE##', '0') 305if BPF.kernel_struct_has_field(b'request', b'rq_disk') == 1: 306 bpf_text = bpf_text.replace('__RQ_DISK__', 'rq_disk') 307else: 308 bpf_text = bpf_text.replace('__RQ_DISK__', 'q->disk') 309 310if args.disk is not None: 311 disk_path = os.path.join('/dev', args.disk) 312 if not os.path.exists(disk_path): 313 print("no such disk '%s'" % args.disk) 314 exit(1) 315 316 stat_info = os.stat(disk_path) 317 dev = os.major(stat_info.st_rdev) << 20 | os.minor(stat_info.st_rdev) 318 319 disk_filter_str = """ 320 if(key.dev != %s) { 321 return 0; 322 } 323 """ % (dev) 324 325 bpf_text = bpf_text.replace('DISK_FILTER', disk_filter_str) 326else: 327 bpf_text = bpf_text.replace('DISK_FILTER', '') 328 329if debug or args.ebpf: 330 print(bpf_text) 331 if args.ebpf: 332 exit() 333 334# initialize BPF 335b = BPF(text=bpf_text) 336if BPF.get_kprobe_functions(b'__blk_account_io_start'): 337 b.attach_kprobe(event="__blk_account_io_start", fn_name="trace_pid_start") 338elif BPF.get_kprobe_functions(b'blk_account_io_start'): 339 b.attach_kprobe(event="blk_account_io_start", fn_name="trace_pid_start") 340else: 341 b.attach_tracepoint(tp="block:block_io_start", fn_name="trace_pid_start_tp") 342if BPF.get_kprobe_functions(b'blk_start_request'): 343 b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start") 344b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start") 345if BPF.get_kprobe_functions(b'__blk_account_io_done'): 346 b.attach_kprobe(event="__blk_account_io_done", fn_name="trace_req_completion") 347elif BPF.get_kprobe_functions(b'blk_account_io_done'): 348 b.attach_kprobe(event="blk_account_io_done", fn_name="trace_req_completion") 349else: 350 b.attach_tracepoint(tp="block:block_io_done", fn_name="trace_req_completion_tp") 351 352# header 353print("%-11s %-14s %-7s %-9s %-1s %-10s %-7s" % ("TIME(s)", "COMM", "PID", 354 "DISK", "T", "SECTOR", "BYTES"), end="") 355if args.pattern: 356 print("%-1s " % ("P"), end="") 357if args.queue: 358 print("%7s " % ("QUE(ms)"), end="") 359print("%7s" % "LAT(ms)") 360 361 362# cache disk major,minor -> diskname 363diskstats = "/proc/diskstats" 364disklookup = {} 365with open(diskstats) as stats: 366 for line in stats: 367 a = line.split() 368 disklookup[a[0] + "," + a[1]] = a[2] 369 370def disk_print(d): 371 major = d >> 20 372 minor = d & ((1 << 20) - 1) 373 374 disk = str(major) + "," + str(minor) 375 if disk in disklookup: 376 diskname = disklookup[disk] 377 else: 378 diskname = "<unknown>" 379 380 return diskname 381 382rwflg = "" 383pattern = "" 384start_ts = 0 385prev_ts = 0 386delta = 0 387 388P_SEQUENTIAL = 1 389P_RANDOM = 2 390 391# process event 392def print_event(cpu, data, size): 393 event = b["events"].event(data) 394 395 global start_ts 396 if start_ts == 0: 397 start_ts = event.ts 398 399 if event.rwflag == 1: 400 rwflg = "W" 401 else: 402 rwflg = "R" 403 404 delta = float(event.ts) - start_ts 405 406 disk_name = disk_print(event.dev) 407 408 print("%-11.6f %-14.14s %-7s %-9s %-1s %-10s %-7s" % ( 409 delta / 1000000, event.name.decode('utf-8', 'replace'), event.pid, 410 disk_name, rwflg, event.sector, event.len), end="") 411 if args.pattern: 412 if event.pattern == P_SEQUENTIAL: 413 pattern = "S" 414 elif event.pattern == P_RANDOM: 415 pattern = "R" 416 else: 417 pattern = "?" 418 print("%-1s " % pattern, end="") 419 if args.queue: 420 print("%7.2f " % (float(event.qdelta) / 1000000), end="") 421 print("%7.2f" % (float(event.delta) / 1000000)) 422 423# loop with callback to print_event 424b["events"].open_perf_buffer(print_event, page_cnt=64) 425while 1: 426 try: 427 b.perf_buffer_poll() 428 except KeyboardInterrupt: 429 exit() 430