1#!/usr/bin/env python 2# @lint-avoid-python-3-compatibility-imports 3# 4# btrfsslower Trace slow btrfs operations. 5# For Linux, uses BCC, eBPF. 6# 7# USAGE: btrfsslower [-h] [-j] [-p PID] [min_ms] [-d DURATION] 8# 9# This script traces common btrfs file operations: reads, writes, opens, and 10# syncs. It measures the time spent in these operations, and prints details 11# for each that exceeded a threshold. 12# 13# WARNING: This adds low-overhead instrumentation to these btrfs operations, 14# including reads and writes from the file system cache. Such reads and writes 15# can be very frequent (depending on the workload; eg, 1M/sec), at which 16# point the overhead of this tool (even if it prints no "slower" events) can 17# begin to become significant. 18# 19# By default, a minimum millisecond threshold of 10 is used. 20# 21# Copyright 2016 Netflix, Inc. 22# Licensed under the Apache License, Version 2.0 (the "License") 23# 24# 15-Feb-2016 Brendan Gregg Created this. 25# 16-Oct-2016 Dina Goldshtein -p to filter by process ID. 26 27from __future__ import print_function 28from bcc import BPF 29import argparse 30from datetime import datetime, timedelta 31from time import strftime 32 33# symbols 34kallsyms = "/proc/kallsyms" 35 36# arguments 37examples = """examples: 38 ./btrfsslower # trace operations slower than 10 ms (default) 39 ./btrfsslower 1 # trace operations slower than 1 ms 40 ./btrfsslower -j 1 # ... 1 ms, parsable output (csv) 41 ./btrfsslower 0 # trace all operations (warning: verbose) 42 ./btrfsslower -p 185 # trace PID 185 only 43 ./btrfsslower -d 10 # trace for 10 seconds only 44""" 45parser = argparse.ArgumentParser( 46 description="Trace common btrfs file operations slower than a threshold", 47 formatter_class=argparse.RawDescriptionHelpFormatter, 48 epilog=examples) 49parser.add_argument("-j", "--csv", action="store_true", 50 help="just print fields: comma-separated values") 51parser.add_argument("-p", "--pid", 52 help="trace this PID only") 53parser.add_argument("min_ms", nargs="?", default='10', 54 help="minimum I/O duration to trace, in ms (default 10)") 55parser.add_argument("--ebpf", action="store_true", 56 help=argparse.SUPPRESS) 57parser.add_argument("-d", "--duration", 58 help="total duration of trace in seconds") 59args = parser.parse_args() 60min_ms = int(args.min_ms) 61pid = args.pid 62csv = args.csv 63debug = 0 64if args.duration: 65 args.duration = timedelta(seconds=int(args.duration)) 66 67# define BPF program 68bpf_text = """ 69#include <uapi/linux/ptrace.h> 70#include <linux/fs.h> 71#include <linux/sched.h> 72#include <linux/dcache.h> 73 74// XXX: switch these to char's when supported 75#define TRACE_READ 0 76#define TRACE_WRITE 1 77#define TRACE_OPEN 2 78#define TRACE_FSYNC 3 79 80struct val_t { 81 u64 ts; 82 u64 offset; 83 struct file *fp; 84}; 85 86struct data_t { 87 // XXX: switch some to u32's when supported 88 u64 ts_us; 89 u64 type; 90 u32 size; 91 u64 offset; 92 u64 delta_us; 93 u32 pid; 94 char task[TASK_COMM_LEN]; 95 char file[DNAME_INLINE_LEN]; 96}; 97 98BPF_HASH(entryinfo, u64, struct val_t); 99BPF_PERF_OUTPUT(events); 100 101// 102// Store timestamp and size on entry 103// 104 105// The current btrfs (Linux 4.5) uses generic_file_read_iter() instead of it's 106// own read function. So we need to trace that and then filter on btrfs, which 107// I do by checking file->f_op. 108int trace_read_entry(struct pt_regs *ctx, struct kiocb *iocb) 109{ 110 u64 id = bpf_get_current_pid_tgid(); 111 u32 pid = id >> 32; // PID is higher part 112 113 if (FILTER_PID) 114 return 0; 115 116 // btrfs filter on file->f_op == btrfs_file_operations 117 struct file *fp = iocb->ki_filp; 118 if ((u64)fp->f_op != BTRFS_FILE_OPERATIONS) 119 return 0; 120 121 // store filep and timestamp by pid 122 struct val_t val = {}; 123 val.ts = bpf_ktime_get_ns(); 124 val.fp = fp; 125 val.offset = iocb->ki_pos; 126 if (val.fp) 127 entryinfo.update(&id, &val); 128 129 return 0; 130} 131 132// btrfs_file_write_iter(): 133int trace_write_entry(struct pt_regs *ctx, struct kiocb *iocb) 134{ 135 u64 id = bpf_get_current_pid_tgid(); 136 u32 pid = id >> 32; // PID is higher part 137 138 if (FILTER_PID) 139 return 0; 140 141 // store filep and timestamp by id 142 struct val_t val = {}; 143 val.ts = bpf_ktime_get_ns(); 144 val.fp = iocb->ki_filp; 145 val.offset = iocb->ki_pos; 146 if (val.fp) 147 entryinfo.update(&id, &val); 148 149 return 0; 150} 151 152// The current btrfs (Linux 4.5) uses generic_file_open(), instead of it's own 153// function. Same as with reads. Trace the generic path and filter: 154int trace_open_entry(struct pt_regs *ctx, struct inode *inode, 155 struct file *file) 156{ 157 u64 id = bpf_get_current_pid_tgid(); 158 u32 pid = id >> 32; // PID is higher part 159 160 if (FILTER_PID) 161 return 0; 162 163 // btrfs filter on file->f_op == btrfs_file_operations 164 if ((u64)file->f_op != BTRFS_FILE_OPERATIONS) 165 return 0; 166 167 // store filep and timestamp by id 168 struct val_t val = {}; 169 val.ts = bpf_ktime_get_ns(); 170 val.fp = file; 171 val.offset = 0; 172 if (val.fp) 173 entryinfo.update(&id, &val); 174 175 return 0; 176} 177 178// btrfs_sync_file(): 179int trace_fsync_entry(struct pt_regs *ctx, struct file *file) 180{ 181 u64 id = bpf_get_current_pid_tgid(); 182 u32 pid = id >> 32; // PID is higher part 183 184 if (FILTER_PID) 185 return 0; 186 187 // store filep and timestamp by id 188 struct val_t val = {}; 189 val.ts = bpf_ktime_get_ns(); 190 val.fp = file; 191 val.offset = 0; 192 if (val.fp) 193 entryinfo.update(&id, &val); 194 195 return 0; 196} 197 198// 199// Output 200// 201 202static int trace_return(struct pt_regs *ctx, int type) 203{ 204 struct val_t *valp; 205 u64 id = bpf_get_current_pid_tgid(); 206 u32 pid = id >> 32; // PID is higher part 207 208 valp = entryinfo.lookup(&id); 209 if (valp == 0) { 210 // missed tracing issue or filtered 211 return 0; 212 } 213 214 // calculate delta 215 u64 ts = bpf_ktime_get_ns(); 216 u64 delta_us = (ts - valp->ts) / 1000; 217 entryinfo.delete(&id); 218 if (FILTER_US) 219 return 0; 220 221 // populate output struct 222 struct data_t data = {}; 223 data.type = type; 224 data.size = PT_REGS_RC(ctx); 225 data.delta_us = delta_us; 226 data.pid = pid; 227 data.ts_us = ts / 1000; 228 data.offset = valp->offset; 229 bpf_get_current_comm(&data.task, sizeof(data.task)); 230 231 // workaround (rewriter should handle file to d_name in one step): 232 struct dentry *de = NULL; 233 struct qstr qs = {}; 234 de = valp->fp->f_path.dentry; 235 qs = de->d_name; 236 if (qs.len == 0) 237 return 0; 238 bpf_probe_read_kernel(&data.file, sizeof(data.file), (void *)qs.name); 239 240 // output 241 events.perf_submit(ctx, &data, sizeof(data)); 242 243 return 0; 244} 245 246int trace_read_return(struct pt_regs *ctx) 247{ 248 return trace_return(ctx, TRACE_READ); 249} 250 251int trace_write_return(struct pt_regs *ctx) 252{ 253 return trace_return(ctx, TRACE_WRITE); 254} 255 256int trace_open_return(struct pt_regs *ctx) 257{ 258 return trace_return(ctx, TRACE_OPEN); 259} 260 261int trace_fsync_return(struct pt_regs *ctx) 262{ 263 return trace_return(ctx, TRACE_FSYNC); 264} 265 266""" 267 268# code replacements 269with open(kallsyms) as syms: 270 ops = '' 271 for line in syms: 272 a = line.rstrip().split() 273 (addr, name) = (a[0], a[2]) 274 name = name.split("\t")[0] 275 if name == "btrfs_file_operations": 276 ops = "0x" + addr 277 break 278 if ops == '': 279 print("ERROR: no btrfs_file_operations in /proc/kallsyms. Exiting.") 280 print("HINT: the kernel should be built with CONFIG_KALLSYMS_ALL.") 281 exit() 282 bpf_text = bpf_text.replace('BTRFS_FILE_OPERATIONS', ops) 283if min_ms == 0: 284 bpf_text = bpf_text.replace('FILTER_US', '0') 285else: 286 bpf_text = bpf_text.replace('FILTER_US', 287 'delta_us <= %s' % str(min_ms * 1000)) 288if args.pid: 289 bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % pid) 290else: 291 bpf_text = bpf_text.replace('FILTER_PID', '0') 292if debug or args.ebpf: 293 print(bpf_text) 294 if args.ebpf: 295 exit() 296 297# process event 298def print_event(cpu, data, size): 299 event = b["events"].event(data) 300 301 type = 'R' 302 if event.type == 1: 303 type = 'W' 304 elif event.type == 2: 305 type = 'O' 306 elif event.type == 3: 307 type = 'S' 308 309 if (csv): 310 print("%d,%s,%d,%s,%d,%d,%d,%s" % ( 311 event.ts_us, event.task.decode('utf-8', 'replace'), event.pid, 312 type, event.size, event.offset, event.delta_us, 313 event.file.decode('utf-8', 'replace'))) 314 return 315 print("%-8s %-14.14s %-7d %1s %-7s %-8d %7.2f %s" % (strftime("%H:%M:%S"), 316 event.task.decode('utf-8', 'replace'), event.pid, type, event.size, 317 event.offset / 1024, float(event.delta_us) / 1000, 318 event.file.decode('utf-8', 'replace'))) 319 320# initialize BPF 321b = BPF(text=bpf_text) 322 323# Common file functions. See earlier comment about generic_*(). 324b.attach_kprobe(event="generic_file_read_iter", fn_name="trace_read_entry") 325b.attach_kprobe(event="btrfs_file_write_iter", fn_name="trace_write_entry") 326b.attach_kprobe(event="generic_file_open", fn_name="trace_open_entry") 327b.attach_kprobe(event="btrfs_sync_file", fn_name="trace_fsync_entry") 328b.attach_kretprobe(event="generic_file_read_iter", fn_name="trace_read_return") 329b.attach_kretprobe(event="btrfs_file_write_iter", fn_name="trace_write_return") 330b.attach_kretprobe(event="generic_file_open", fn_name="trace_open_return") 331b.attach_kretprobe(event="btrfs_sync_file", fn_name="trace_fsync_return") 332 333# header 334if (csv): 335 print("ENDTIME_us,TASK,PID,TYPE,BYTES,OFFSET_b,LATENCY_us,FILE") 336else: 337 if min_ms == 0: 338 print("Tracing btrfs operations") 339 else: 340 print("Tracing btrfs operations slower than %d ms" % min_ms) 341 print("%-8s %-14s %-7s %1s %-7s %-8s %7s %s" % ("TIME", "COMM", "PID", "T", 342 "BYTES", "OFF_KB", "LAT(ms)", "FILENAME")) 343 344# read events 345b["events"].open_perf_buffer(print_event, page_cnt=64) 346start_time = datetime.now() 347while not args.duration or datetime.now() - start_time < args.duration: 348 try: 349 b.perf_buffer_poll(timeout=1000) 350 except KeyboardInterrupt: 351 exit() 352