1#!/usr/bin/env python 2# @lint-avoid-python-3-compatibility-imports 3# 4# ext4slower Trace slow ext4 operations. 5# For Linux, uses BCC, eBPF. 6# 7# USAGE: ext4slower [-h] [-j] [-p PID] [min_ms] 8# 9# This script traces common ext4 file operations: reads, writes, opens, and 10# syncs. It measures the time spent in these operations, and prints details 11# for each that exceeded a threshold. 12# 13# WARNING: This adds low-overhead instrumentation to these ext4 operations, 14# including reads and writes from the file system cache. Such reads and writes 15# can be very frequent (depending on the workload; eg, 1M/sec), at which 16# point the overhead of this tool (even if it prints no "slower" events) can 17# begin to become significant. 18# 19# By default, a minimum millisecond threshold of 10 is used. 20# 21# Copyright 2016 Netflix, Inc. 22# Licensed under the Apache License, Version 2.0 (the "License") 23# 24# 11-Feb-2016 Brendan Gregg Created this. 25# 15-Oct-2016 Dina Goldshtein -p to filter by process ID. 26# 13-Jun-2018 Joe Yin modify generic_file_read_iter to ext4_file_read_iter. 27 28from __future__ import print_function 29from bcc import BPF 30import argparse 31from time import strftime 32 33# symbols 34kallsyms = "/proc/kallsyms" 35 36# arguments 37examples = """examples: 38 ./ext4slower # trace operations slower than 10 ms (default) 39 ./ext4slower 1 # trace operations slower than 1 ms 40 ./ext4slower -j 1 # ... 1 ms, parsable output (csv) 41 ./ext4slower 0 # trace all operations (warning: verbose) 42 ./ext4slower -p 185 # trace PID 185 only 43""" 44parser = argparse.ArgumentParser( 45 description="Trace common ext4 file operations slower than a threshold", 46 formatter_class=argparse.RawDescriptionHelpFormatter, 47 epilog=examples) 48parser.add_argument("-j", "--csv", action="store_true", 49 help="just print fields: comma-separated values") 50parser.add_argument("-p", "--pid", 51 help="trace this PID only") 52parser.add_argument("min_ms", nargs="?", default='10', 53 help="minimum I/O duration to trace, in ms (default 10)") 54parser.add_argument("--ebpf", action="store_true", 55 help=argparse.SUPPRESS) 56args = parser.parse_args() 57min_ms = int(args.min_ms) 58pid = args.pid 59csv = args.csv 60debug = 0 61 62# define BPF program 63bpf_text = """ 64#include <uapi/linux/ptrace.h> 65#include <linux/fs.h> 66#include <linux/sched.h> 67#include <linux/dcache.h> 68 69// XXX: switch these to char's when supported 70#define TRACE_READ 0 71#define TRACE_WRITE 1 72#define TRACE_OPEN 2 73#define TRACE_FSYNC 3 74 75struct val_t { 76 u64 ts; 77 u64 offset; 78 struct file *fp; 79}; 80 81struct data_t { 82 // XXX: switch some to u32's when supported 83 u64 ts_us; 84 u64 type; 85 u32 size; 86 u64 offset; 87 u64 delta_us; 88 u32 pid; 89 char task[TASK_COMM_LEN]; 90 char file[DNAME_INLINE_LEN]; 91}; 92 93BPF_HASH(entryinfo, u64, struct val_t); 94BPF_PERF_OUTPUT(events); 95 96// 97// Store timestamp and size on entry 98// 99 100// The current ext4 (Linux 4.5) uses generic_file_read_iter(), instead of it's 101// own function, for reads. So we need to trace that and then filter on ext4, 102// which I do by checking file->f_op. 103// The new Linux version (since form 4.10) uses ext4_file_read_iter(), And if the 'CONFIG_FS_DAX' 104// is not set, then ext4_file_read_iter() will call generic_file_read_iter(), else it will call 105// ext4_dax_read_iter(), and trace generic_file_read_iter() will fail. 106int trace_read_entry(struct pt_regs *ctx, struct kiocb *iocb) 107{ 108 u64 id = bpf_get_current_pid_tgid(); 109 u32 pid = id >> 32; // PID is higher part 110 111 if (FILTER_PID) 112 return 0; 113 114 // ext4 filter on file->f_op == ext4_file_operations 115 struct file *fp = iocb->ki_filp; 116 if ((u64)fp->f_op != EXT4_FILE_OPERATIONS) 117 return 0; 118 119 // store filep and timestamp by id 120 struct val_t val = {}; 121 val.ts = bpf_ktime_get_ns(); 122 val.fp = fp; 123 val.offset = iocb->ki_pos; 124 if (val.fp) 125 entryinfo.update(&id, &val); 126 127 return 0; 128} 129 130// ext4_file_write_iter(): 131int trace_write_entry(struct pt_regs *ctx, struct kiocb *iocb) 132{ 133 u64 id = bpf_get_current_pid_tgid(); 134 u32 pid = id >> 32; // PID is higher part 135 136 if (FILTER_PID) 137 return 0; 138 139 // store filep and timestamp by id 140 struct val_t val = {}; 141 val.ts = bpf_ktime_get_ns(); 142 val.fp = iocb->ki_filp; 143 val.offset = iocb->ki_pos; 144 if (val.fp) 145 entryinfo.update(&id, &val); 146 147 return 0; 148} 149 150// ext4_file_open(): 151int trace_open_entry(struct pt_regs *ctx, struct inode *inode, 152 struct file *file) 153{ 154 u64 id = bpf_get_current_pid_tgid(); 155 u32 pid = id >> 32; // PID is higher part 156 157 if (FILTER_PID) 158 return 0; 159 160 // store filep and timestamp by id 161 struct val_t val = {}; 162 val.ts = bpf_ktime_get_ns(); 163 val.fp = file; 164 val.offset = 0; 165 if (val.fp) 166 entryinfo.update(&id, &val); 167 168 return 0; 169} 170 171// ext4_sync_file(): 172int trace_fsync_entry(struct pt_regs *ctx, struct file *file) 173{ 174 u64 id = bpf_get_current_pid_tgid(); 175 u32 pid = id >> 32; // PID is higher part 176 177 if (FILTER_PID) 178 return 0; 179 180 // store filep and timestamp by id 181 struct val_t val = {}; 182 val.ts = bpf_ktime_get_ns(); 183 val.fp = file; 184 val.offset = 0; 185 if (val.fp) 186 entryinfo.update(&id, &val); 187 188 return 0; 189} 190 191// 192// Output 193// 194 195static int trace_return(struct pt_regs *ctx, int type) 196{ 197 struct val_t *valp; 198 u64 id = bpf_get_current_pid_tgid(); 199 u32 pid = id >> 32; // PID is higher part 200 201 valp = entryinfo.lookup(&id); 202 if (valp == 0) { 203 // missed tracing issue or filtered 204 return 0; 205 } 206 207 // calculate delta 208 u64 ts = bpf_ktime_get_ns(); 209 u64 delta_us = (ts - valp->ts) / 1000; 210 entryinfo.delete(&id); 211 if (FILTER_US) 212 return 0; 213 214 // populate output struct 215 struct data_t data = {}; 216 data.type = type; 217 data.size = PT_REGS_RC(ctx); 218 data.delta_us = delta_us; 219 data.pid = pid; 220 data.ts_us = ts / 1000; 221 data.offset = valp->offset; 222 bpf_get_current_comm(&data.task, sizeof(data.task)); 223 224 // workaround (rewriter should handle file to d_name in one step): 225 struct dentry *de = NULL; 226 struct qstr qs = {}; 227 de = valp->fp->f_path.dentry; 228 qs = de->d_name; 229 if (qs.len == 0) 230 return 0; 231 bpf_probe_read_kernel(&data.file, sizeof(data.file), (void *)qs.name); 232 233 // output 234 events.perf_submit(ctx, &data, sizeof(data)); 235 236 return 0; 237} 238 239int trace_read_return(struct pt_regs *ctx) 240{ 241 return trace_return(ctx, TRACE_READ); 242} 243 244int trace_write_return(struct pt_regs *ctx) 245{ 246 return trace_return(ctx, TRACE_WRITE); 247} 248 249int trace_open_return(struct pt_regs *ctx) 250{ 251 return trace_return(ctx, TRACE_OPEN); 252} 253 254int trace_fsync_return(struct pt_regs *ctx) 255{ 256 return trace_return(ctx, TRACE_FSYNC); 257} 258 259""" 260 261# code replacements 262with open(kallsyms) as syms: 263 ops = '' 264 for line in syms: 265 (addr, size, name) = line.rstrip().split(" ", 2) 266 name = name.split("\t")[0] 267 if name == "ext4_file_operations": 268 ops = "0x" + addr 269 break 270 if ops == '': 271 print("ERROR: no ext4_file_operations in /proc/kallsyms. Exiting.") 272 print("HINT: the kernel should be built with CONFIG_KALLSYMS_ALL.") 273 exit() 274 bpf_text = bpf_text.replace('EXT4_FILE_OPERATIONS', ops) 275if min_ms == 0: 276 bpf_text = bpf_text.replace('FILTER_US', '0') 277else: 278 bpf_text = bpf_text.replace('FILTER_US', 279 'delta_us <= %s' % str(min_ms * 1000)) 280if args.pid: 281 bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % pid) 282else: 283 bpf_text = bpf_text.replace('FILTER_PID', '0') 284if debug or args.ebpf: 285 print(bpf_text) 286 if args.ebpf: 287 exit() 288 289# process event 290def print_event(cpu, data, size): 291 event = b["events"].event(data) 292 293 type = 'R' 294 if event.type == 1: 295 type = 'W' 296 elif event.type == 2: 297 type = 'O' 298 elif event.type == 3: 299 type = 'S' 300 301 if (csv): 302 print("%d,%s,%d,%s,%d,%d,%d,%s" % ( 303 event.ts_us, event.task.decode('utf-8', 'replace'), event.pid, 304 type, event.size, event.offset, event.delta_us, 305 event.file.decode('utf-8', 'replace'))) 306 return 307 print("%-8s %-14.14s %-6s %1s %-7s %-8d %7.2f %s" % (strftime("%H:%M:%S"), 308 event.task.decode('utf-8', 'replace'), event.pid, type, event.size, 309 event.offset / 1024, float(event.delta_us) / 1000, 310 event.file.decode('utf-8', 'replace'))) 311 312# initialize BPF 313b = BPF(text=bpf_text) 314 315# Common file functions. See earlier comment about generic_file_read_iter(). 316if BPF.get_kprobe_functions(b'ext4_file_read_iter'): 317 b.attach_kprobe(event="ext4_file_read_iter", fn_name="trace_read_entry") 318else: 319 b.attach_kprobe(event="generic_file_read_iter", fn_name="trace_read_entry") 320b.attach_kprobe(event="ext4_file_write_iter", fn_name="trace_write_entry") 321b.attach_kprobe(event="ext4_file_open", fn_name="trace_open_entry") 322b.attach_kprobe(event="ext4_sync_file", fn_name="trace_fsync_entry") 323if BPF.get_kprobe_functions(b'ext4_file_read_iter'): 324 b.attach_kretprobe(event="ext4_file_read_iter", fn_name="trace_read_return") 325else: 326 b.attach_kretprobe(event="generic_file_read_iter", fn_name="trace_read_return") 327b.attach_kretprobe(event="ext4_file_write_iter", fn_name="trace_write_return") 328b.attach_kretprobe(event="ext4_file_open", fn_name="trace_open_return") 329b.attach_kretprobe(event="ext4_sync_file", fn_name="trace_fsync_return") 330 331# header 332if (csv): 333 print("ENDTIME_us,TASK,PID,TYPE,BYTES,OFFSET_b,LATENCY_us,FILE") 334else: 335 if min_ms == 0: 336 print("Tracing ext4 operations") 337 else: 338 print("Tracing ext4 operations slower than %d ms" % min_ms) 339 print("%-8s %-14s %-6s %1s %-7s %-8s %7s %s" % ("TIME", "COMM", "PID", "T", 340 "BYTES", "OFF_KB", "LAT(ms)", "FILENAME")) 341 342# read events 343b["events"].open_perf_buffer(print_event, page_cnt=64) 344while 1: 345 try: 346 b.perf_buffer_poll() 347 except KeyboardInterrupt: 348 exit() 349