1#!/usr/bin/env python 2# @lint-avoid-python-3-compatibility-imports 3# 4# fileslower Trace slow synchronous file reads and writes. 5# For Linux, uses BCC, eBPF. 6# 7# USAGE: fileslower [-h] [-p PID] [-a] [min_ms] 8# 9# This script uses kernel dynamic tracing of synchronous reads and writes 10# at the VFS interface, to identify slow file reads and writes for any file 11# system. 12# 13# This works by tracing __vfs_read() and __vfs_write(), and filtering for 14# synchronous I/O (the path to new_sync_read() and new_sync_write()), and 15# for I/O with filenames. This approach provides a view of just two file 16# system request types. There are typically many others: asynchronous I/O, 17# directory operations, file handle operations, etc, that this tool does not 18# instrument. 19# 20# WARNING: This traces VFS reads and writes, which can be extremely frequent, 21# and so the overhead of this tool can become severe depending on the 22# workload. 23# 24# By default, a minimum millisecond threshold of 10 is used. 25# 26# Copyright 2016 Netflix, Inc. 27# Licensed under the Apache License, Version 2.0 (the "License") 28# 29# 06-Feb-2016 Brendan Gregg Created this. 30 31from __future__ import print_function 32from bcc import BPF 33import argparse 34import time 35 36# arguments 37examples = """examples: 38 ./fileslower # trace sync file I/O slower than 10 ms (default) 39 ./fileslower 1 # trace sync file I/O slower than 1 ms 40 ./fileslower -p 185 # trace PID 185 only 41""" 42parser = argparse.ArgumentParser( 43 description="Trace slow synchronous file reads and writes", 44 formatter_class=argparse.RawDescriptionHelpFormatter, 45 epilog=examples) 46parser.add_argument("-p", "--pid", type=int, metavar="PID", dest="tgid", 47 help="trace this PID only") 48parser.add_argument("-a", "--all-files", action="store_true", 49 help="include non-regular file types (sockets, FIFOs, etc)") 50parser.add_argument("min_ms", nargs="?", default='10', 51 help="minimum I/O duration to trace, in ms (default 10)") 52parser.add_argument("--ebpf", action="store_true", 53 help=argparse.SUPPRESS) 54args = parser.parse_args() 55min_ms = int(args.min_ms) 56tgid = args.tgid 57debug = 0 58 59# define BPF program 60bpf_text = """ 61#include <uapi/linux/ptrace.h> 62#include <linux/fs.h> 63#include <linux/sched.h> 64 65enum trace_mode { 66 MODE_READ, 67 MODE_WRITE 68}; 69 70struct val_t { 71 u32 sz; 72 u64 ts; 73 u32 name_len; 74 // de->d_name.name may point to de->d_iname so limit len accordingly 75 char name[DNAME_INLINE_LEN]; 76 char comm[TASK_COMM_LEN]; 77}; 78 79struct data_t { 80 enum trace_mode mode; 81 u32 pid; 82 u32 sz; 83 u64 delta_us; 84 u32 name_len; 85 char name[DNAME_INLINE_LEN]; 86 char comm[TASK_COMM_LEN]; 87}; 88 89BPF_HASH(entryinfo, pid_t, struct val_t); 90BPF_PERF_OUTPUT(events); 91 92// store timestamp and size on entry 93static int trace_rw_entry(struct pt_regs *ctx, struct file *file, 94 char __user *buf, size_t count) 95{ 96 u32 tgid = bpf_get_current_pid_tgid() >> 32; 97 if (TGID_FILTER) 98 return 0; 99 100 u32 pid = bpf_get_current_pid_tgid(); 101 102 // skip I/O lacking a filename 103 struct dentry *de = file->f_path.dentry; 104 int mode = file->f_inode->i_mode; 105 if (de->d_name.len == 0 || TYPE_FILTER) 106 return 0; 107 108 // store size and timestamp by pid 109 struct val_t val = {}; 110 val.sz = count; 111 val.ts = bpf_ktime_get_ns(); 112 113 struct qstr d_name = de->d_name; 114 val.name_len = d_name.len; 115 bpf_probe_read_kernel(&val.name, sizeof(val.name), d_name.name); 116 bpf_get_current_comm(&val.comm, sizeof(val.comm)); 117 entryinfo.update(&pid, &val); 118 119 return 0; 120} 121 122int trace_read_entry(struct pt_regs *ctx, struct file *file, 123 char __user *buf, size_t count) 124{ 125 // skip non-sync I/O; see kernel code for __vfs_read() 126 if (!(file->f_op->read_iter)) 127 return 0; 128 return trace_rw_entry(ctx, file, buf, count); 129} 130 131int trace_write_entry(struct pt_regs *ctx, struct file *file, 132 char __user *buf, size_t count) 133{ 134 // skip non-sync I/O; see kernel code for __vfs_write() 135 if (!(file->f_op->write_iter)) 136 return 0; 137 return trace_rw_entry(ctx, file, buf, count); 138} 139 140// output 141static int trace_rw_return(struct pt_regs *ctx, int type) 142{ 143 struct val_t *valp; 144 u32 pid = bpf_get_current_pid_tgid(); 145 146 valp = entryinfo.lookup(&pid); 147 if (valp == 0) { 148 // missed tracing issue or filtered 149 return 0; 150 } 151 u64 delta_us = (bpf_ktime_get_ns() - valp->ts) / 1000; 152 entryinfo.delete(&pid); 153 if (delta_us < MIN_US) 154 return 0; 155 156 struct data_t data = {}; 157 data.mode = type; 158 data.pid = pid; 159 data.sz = valp->sz; 160 data.delta_us = delta_us; 161 data.name_len = valp->name_len; 162 bpf_probe_read_kernel(&data.name, sizeof(data.name), valp->name); 163 bpf_probe_read_kernel(&data.comm, sizeof(data.comm), valp->comm); 164 events.perf_submit(ctx, &data, sizeof(data)); 165 166 return 0; 167} 168 169int trace_read_return(struct pt_regs *ctx) 170{ 171 return trace_rw_return(ctx, MODE_READ); 172} 173 174int trace_write_return(struct pt_regs *ctx) 175{ 176 return trace_rw_return(ctx, MODE_WRITE); 177} 178 179""" 180bpf_text = bpf_text.replace('MIN_US', str(min_ms * 1000)) 181if args.tgid: 182 bpf_text = bpf_text.replace('TGID_FILTER', 'tgid != %d' % tgid) 183else: 184 bpf_text = bpf_text.replace('TGID_FILTER', '0') 185if args.all_files: 186 bpf_text = bpf_text.replace('TYPE_FILTER', '0') 187else: 188 bpf_text = bpf_text.replace('TYPE_FILTER', '!S_ISREG(mode)') 189 190if debug or args.ebpf: 191 print(bpf_text) 192 if args.ebpf: 193 exit() 194 195# initialize BPF 196b = BPF(text=bpf_text) 197 198# I'd rather trace these via new_sync_read/new_sync_write (which used to be 199# do_sync_read/do_sync_write), but those became static. So trace these from 200# the parent functions, at the cost of more overhead, instead. 201# Ultimately, we should be using [V]FS tracepoints. 202try: 203 b.attach_kprobe(event="__vfs_read", fn_name="trace_read_entry") 204 b.attach_kretprobe(event="__vfs_read", fn_name="trace_read_return") 205except Exception: 206 print('Current kernel does not have __vfs_read, try vfs_read instead') 207 b.attach_kprobe(event="vfs_read", fn_name="trace_read_entry") 208 b.attach_kretprobe(event="vfs_read", fn_name="trace_read_return") 209try: 210 b.attach_kprobe(event="__vfs_write", fn_name="trace_write_entry") 211 b.attach_kretprobe(event="__vfs_write", fn_name="trace_write_return") 212except Exception: 213 print('Current kernel does not have __vfs_write, try vfs_write instead') 214 b.attach_kprobe(event="vfs_write", fn_name="trace_write_entry") 215 b.attach_kretprobe(event="vfs_write", fn_name="trace_write_return") 216 217mode_s = { 218 0: 'R', 219 1: 'W', 220} 221 222# header 223print("Tracing sync read/writes slower than %d ms" % min_ms) 224print("%-8s %-14s %-6s %1s %-7s %7s %s" % ("TIME(s)", "COMM", "TID", "D", 225 "BYTES", "LAT(ms)", "FILENAME")) 226 227start_ts = time.time() 228DNAME_INLINE_LEN = 32 229def print_event(cpu, data, size): 230 event = b["events"].event(data) 231 232 ms = float(event.delta_us) / 1000 233 name = event.name.decode('utf-8', 'replace') 234 if event.name_len > DNAME_INLINE_LEN: 235 name = name[:-3] + "..." 236 237 print("%-8.3f %-14.14s %-6s %1s %-7s %7.2f %s" % ( 238 time.time() - start_ts, event.comm.decode('utf-8', 'replace'), 239 event.pid, mode_s[event.mode], event.sz, ms, name)) 240 241b["events"].open_perf_buffer(print_event, page_cnt=64) 242while 1: 243 try: 244 b.perf_buffer_poll() 245 except KeyboardInterrupt: 246 exit() 247