1#!/usr/bin/env python 2# @lint-avoid-python-3-compatibility-imports 3# 4# zfsdist Summarize ZFS operation latency. 5# For Linux, uses BCC, eBPF. 6# 7# USAGE: zfsdist [-h] [-T] [-m] [-p PID] [interval] [count] 8# 9# Copyright 2016 Netflix, Inc. 10# Licensed under the Apache License, Version 2.0 (the "License") 11# 12# 14-Feb-2016 Brendan Gregg Created this. 13 14from __future__ import print_function 15from bcc import BPF 16from time import sleep, strftime 17import argparse 18 19# arguments 20examples = """examples: 21 ./zfsdist # show operation latency as a histogram 22 ./zfsdist -p 181 # trace PID 181 only 23 ./zfsdist 1 10 # print 1 second summaries, 10 times 24 ./zfsdist -m 5 # 5s summaries, milliseconds 25""" 26parser = argparse.ArgumentParser( 27 description="Summarize ZFS operation latency", 28 formatter_class=argparse.RawDescriptionHelpFormatter, 29 epilog=examples) 30parser.add_argument("-T", "--notimestamp", action="store_true", 31 help="don't include timestamp on interval output") 32parser.add_argument("-m", "--milliseconds", action="store_true", 33 help="output in milliseconds") 34parser.add_argument("-p", "--pid", 35 help="trace this PID only") 36parser.add_argument("interval", nargs="?", 37 help="output interval, in seconds") 38parser.add_argument("count", nargs="?", default=99999999, 39 help="number of outputs") 40parser.add_argument("--ebpf", action="store_true", 41 help=argparse.SUPPRESS) 42args = parser.parse_args() 43pid = args.pid 44countdown = int(args.count) 45if args.milliseconds: 46 factor = 1000000 47 label = "msecs" 48else: 49 factor = 1000 50 label = "usecs" 51if args.interval and int(args.interval) == 0: 52 print("ERROR: interval 0. Exiting.") 53 exit() 54debug = 0 55 56# define BPF program 57bpf_text = """ 58#include <uapi/linux/ptrace.h> 59#include <linux/fs.h> 60#include <linux/sched.h> 61 62#define OP_NAME_LEN 8 63typedef struct dist_key { 64 char op[OP_NAME_LEN]; 65 u64 slot; 66} dist_key_t; 67BPF_HASH(start, u32); 68BPF_HISTOGRAM(dist, dist_key_t); 69 70// time operation 71int trace_entry(struct pt_regs *ctx) 72{ 73 u64 pid_tgid = bpf_get_current_pid_tgid(); 74 u32 pid = pid_tgid >> 32; 75 u32 tid = (u32)pid_tgid; 76 77 if (FILTER_PID) 78 return 0; 79 u64 ts = bpf_ktime_get_ns(); 80 start.update(&tid, &ts); 81 return 0; 82} 83 84static int trace_return(struct pt_regs *ctx, const char *op) 85{ 86 u64 *tsp; 87 u64 pid_tgid = bpf_get_current_pid_tgid(); 88 u32 pid = pid_tgid >> 32; 89 u32 tid = (u32)pid_tgid; 90 91 // fetch timestamp and calculate delta 92 tsp = start.lookup(&tid); 93 if (tsp == 0) { 94 return 0; // missed start or filtered 95 } 96 u64 delta = (bpf_ktime_get_ns() - *tsp) / FACTOR; 97 98 // store as histogram 99 dist_key_t key = {.slot = bpf_log2l(delta)}; 100 __builtin_memcpy(&key.op, op, sizeof(key.op)); 101 dist.atomic_increment(key); 102 103 start.delete(&tid); 104 return 0; 105} 106 107int trace_read_return(struct pt_regs *ctx) 108{ 109 char *op = "read"; 110 return trace_return(ctx, op); 111} 112 113int trace_write_return(struct pt_regs *ctx) 114{ 115 char *op = "write"; 116 return trace_return(ctx, op); 117} 118 119int trace_open_return(struct pt_regs *ctx) 120{ 121 char *op = "open"; 122 return trace_return(ctx, op); 123} 124 125int trace_fsync_return(struct pt_regs *ctx) 126{ 127 char *op = "fsync"; 128 return trace_return(ctx, op); 129} 130""" 131bpf_text = bpf_text.replace('FACTOR', str(factor)) 132if args.pid: 133 bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % pid) 134else: 135 bpf_text = bpf_text.replace('FILTER_PID', '0') 136if debug or args.ebpf: 137 print(bpf_text) 138 if args.ebpf: 139 exit() 140 141# load BPF program 142b = BPF(text=bpf_text) 143 144# common file functions 145if BPF.get_kprobe_functions(b'zpl_iter'): 146 b.attach_kprobe(event="zpl_iter_read", fn_name="trace_entry") 147 b.attach_kprobe(event="zpl_iter_write", fn_name="trace_entry") 148elif BPF.get_kprobe_functions(b'zpl_aio'): 149 b.attach_kprobe(event="zpl_aio_read", fn_name="trace_entry") 150 b.attach_kprobe(event="zpl_aio_write", fn_name="trace_entry") 151else: 152 b.attach_kprobe(event="zpl_read", fn_name="trace_entry") 153 b.attach_kprobe(event="zpl_write", fn_name="trace_entry") 154b.attach_kprobe(event="zpl_open", fn_name="trace_entry") 155b.attach_kprobe(event="zpl_fsync", fn_name="trace_entry") 156if BPF.get_kprobe_functions(b'zpl_iter'): 157 b.attach_kretprobe(event="zpl_iter_read", fn_name="trace_read_return") 158 b.attach_kretprobe(event="zpl_iter_write", fn_name="trace_write_return") 159elif BPF.get_kprobe_functions(b'zpl_aio'): 160 b.attach_kretprobe(event="zpl_aio_read", fn_name="trace_read_return") 161 b.attach_kretprobe(event="zpl_aio_write", fn_name="trace_write_return") 162else: 163 b.attach_kretprobe(event="zpl_read", fn_name="trace_read_return") 164 b.attach_kretprobe(event="zpl_write", fn_name="trace_write_return") 165b.attach_kretprobe(event="zpl_open", fn_name="trace_open_return") 166b.attach_kretprobe(event="zpl_fsync", fn_name="trace_fsync_return") 167 168print("Tracing ZFS operation latency... Hit Ctrl-C to end.") 169 170# output 171exiting = 0 172dist = b.get_table("dist") 173while (1): 174 try: 175 if args.interval: 176 sleep(int(args.interval)) 177 else: 178 sleep(99999999) 179 except KeyboardInterrupt: 180 exiting = 1 181 182 print() 183 if args.interval and (not args.notimestamp): 184 print(strftime("%H:%M:%S:")) 185 186 dist.print_log2_hist(label, "operation", section_print_fn=bytes.decode) 187 dist.clear() 188 189 countdown -= 1 190 if exiting or countdown == 0: 191 exit() 192