1#!/usr/bin/python 2# 3# offwaketime Summarize blocked time by kernel off-CPU stack + waker stack 4# For Linux, uses BCC, eBPF. 5# 6# USAGE: offwaketime [-h] [-u] [-p PID] [-T] [duration] 7# 8# The current implementation uses an unrolled loop for x86_64, and was written 9# as a proof of concept. This implementation should be replaced in the future 10# with an appropriate bpf_ call, when available. 11# 12# The Off-CPU stack is currently limited to a stack trace depth of 20 13# (maxtdepth), and the waker stack limited to 10 (maxwdepth). This is also 14# limited to kernel stacks, and x86_64 only. Check for future versions, where 15# these limitations should be removed. 16# 17# Copyright 2016 Netflix, Inc. 18# Licensed under the Apache License, Version 2.0 (the "License") 19# 20# 20-Jan-2016 Brendan Gregg Created this. 21 22from __future__ import print_function 23from bcc import BPF 24from time import sleep 25import argparse 26import signal 27 28# arguments 29examples = """examples: 30 ./offwaketime # trace off-CPU + waker stack time until Ctrl-C 31 ./offwaketime 5 # trace for 5 seconds only 32 ./offwaketime -f 5 # 5 seconds, and output in folded format 33 ./offwaketime -u # don't include kernel threads (user only) 34 ./offwaketime -p 185 # trace fo PID 185 only 35""" 36parser = argparse.ArgumentParser( 37 description="Summarize blocked time by kernel stack trace + waker stack", 38 formatter_class=argparse.RawDescriptionHelpFormatter, 39 epilog=examples) 40parser.add_argument("-u", "--useronly", action="store_true", 41 help="user threads only (no kernel threads)") 42parser.add_argument("-p", "--pid", 43 help="trace this PID only") 44parser.add_argument("-v", "--verbose", action="store_true", 45 help="show raw addresses") 46parser.add_argument("-f", "--folded", action="store_true", 47 help="output folded format") 48parser.add_argument("duration", nargs="?", default=99999999, 49 help="duration of trace, in seconds") 50args = parser.parse_args() 51folded = args.folded 52duration = int(args.duration) 53debug = 0 54maxwdepth = 10 # and MAXWDEPTH 55maxtdepth = 20 # and MAXTDEPTH 56if args.pid and args.useronly: 57 print("ERROR: use either -p or -u.") 58 exit() 59 60# signal handler 61def signal_ignore(signal, frame): 62 print() 63 64# define BPF program 65bpf_text = """ 66#include <uapi/linux/ptrace.h> 67#include <linux/sched.h> 68 69#define MAXWDEPTH 10 70#define MAXTDEPTH 20 71#define MINBLOCK_US 1 72 73struct key_t { 74 char waker[TASK_COMM_LEN]; 75 char target[TASK_COMM_LEN]; 76 u64 wret[MAXWDEPTH]; 77 u64 tret[MAXTDEPTH]; 78}; 79BPF_HASH(counts, struct key_t); 80BPF_HASH(start, u32); 81struct wokeby_t { 82 char name[TASK_COMM_LEN]; 83 u64 ret[MAXWDEPTH]; 84}; 85BPF_HASH(wokeby, u32, struct wokeby_t); 86 87static u64 get_frame(u64 *bp) { 88 if (*bp) { 89 // The following stack walker is x86_64 specific 90 u64 ret = 0; 91 if (bpf_probe_read(&ret, sizeof(ret), (void *)(*bp+8))) 92 return 0; 93 if (bpf_probe_read(bp, sizeof(*bp), (void *)*bp)) 94 *bp = 0; 95 if (ret < __START_KERNEL_map) 96 return 0; 97 return ret; 98 } 99 return 0; 100} 101 102int waker(struct pt_regs *ctx, struct task_struct *p) { 103 u32 pid = p->pid; 104 105 if (!(FILTER)) 106 return 0; 107 108 u64 bp = 0; 109 struct wokeby_t woke = {}; 110 int depth = 0; 111 bpf_get_current_comm(&woke.name, sizeof(woke.name)); 112 bp = ctx->bp; 113 114 // unrolled loop (MAXWDEPTH): 115 if (!(woke.ret[depth++] = get_frame(&bp))) goto out; 116 if (!(woke.ret[depth++] = get_frame(&bp))) goto out; 117 if (!(woke.ret[depth++] = get_frame(&bp))) goto out; 118 if (!(woke.ret[depth++] = get_frame(&bp))) goto out; 119 if (!(woke.ret[depth++] = get_frame(&bp))) goto out; 120 if (!(woke.ret[depth++] = get_frame(&bp))) goto out; 121 if (!(woke.ret[depth++] = get_frame(&bp))) goto out; 122 if (!(woke.ret[depth++] = get_frame(&bp))) goto out; 123 if (!(woke.ret[depth++] = get_frame(&bp))) goto out; 124 woke.ret[depth] = get_frame(&bp); 125 126out: 127 wokeby.update(&pid, &woke); 128 return 0; 129} 130 131int oncpu(struct pt_regs *ctx, struct task_struct *p) { 132 u32 pid = p->pid; 133 u64 ts, *tsp; 134 135 // record previous thread sleep time 136 if (FILTER) { 137 ts = bpf_ktime_get_ns(); 138 start.update(&pid, &ts); 139 } 140 141 // calculate current thread's delta time 142 pid = bpf_get_current_pid_tgid(); 143 tsp = start.lookup(&pid); 144 if (tsp == 0) 145 return 0; // missed start or filtered 146 u64 delta = bpf_ktime_get_ns() - *tsp; 147 start.delete(&pid); 148 delta = delta / 1000; 149 if (delta < MINBLOCK_US) 150 return 0; 151 152 // create map key 153 u64 zero = 0, *val, bp = 0; 154 int depth = 0; 155 struct key_t key = {}; 156 struct wokeby_t *woke; 157 bpf_get_current_comm(&key.target, sizeof(key.target)); 158 bp = ctx->bp; 159 160 // unrolled loop (MAXTDEPTH): 161 if (!(key.tret[depth++] = get_frame(&bp))) goto out; 162 if (!(key.tret[depth++] = get_frame(&bp))) goto out; 163 if (!(key.tret[depth++] = get_frame(&bp))) goto out; 164 if (!(key.tret[depth++] = get_frame(&bp))) goto out; 165 if (!(key.tret[depth++] = get_frame(&bp))) goto out; 166 if (!(key.tret[depth++] = get_frame(&bp))) goto out; 167 if (!(key.tret[depth++] = get_frame(&bp))) goto out; 168 if (!(key.tret[depth++] = get_frame(&bp))) goto out; 169 if (!(key.tret[depth++] = get_frame(&bp))) goto out; 170 if (!(key.tret[depth++] = get_frame(&bp))) goto out; 171 172 if (!(key.tret[depth++] = get_frame(&bp))) goto out; 173 if (!(key.tret[depth++] = get_frame(&bp))) goto out; 174 if (!(key.tret[depth++] = get_frame(&bp))) goto out; 175 if (!(key.tret[depth++] = get_frame(&bp))) goto out; 176 if (!(key.tret[depth++] = get_frame(&bp))) goto out; 177 if (!(key.tret[depth++] = get_frame(&bp))) goto out; 178 if (!(key.tret[depth++] = get_frame(&bp))) goto out; 179 if (!(key.tret[depth++] = get_frame(&bp))) goto out; 180 if (!(key.tret[depth++] = get_frame(&bp))) goto out; 181 key.tret[depth] = get_frame(&bp); 182 183out: 184 woke = wokeby.lookup(&pid); 185 if (woke) { 186 __builtin_memcpy(&key.wret, woke->ret, sizeof(key.wret)); 187 __builtin_memcpy(&key.waker, woke->name, TASK_COMM_LEN); 188 wokeby.delete(&pid); 189 } 190 191 val = counts.lookup_or_init(&key, &zero); 192 if (val) { 193 (*val) += delta; 194 } 195 return 0; 196} 197""" 198if args.pid: 199 filter = 'pid == %s' % args.pid 200elif args.useronly: 201 filter = '!(p->flags & PF_KTHREAD)' 202else: 203 filter = '1' 204bpf_text = bpf_text.replace('FILTER', filter) 205if debug: 206 print(bpf_text) 207 208# initialize BPF 209b = BPF(text=bpf_text) 210b.attach_kprobe(event="finish_task_switch", fn_name="oncpu") 211b.attach_kprobe(event="try_to_wake_up", fn_name="waker") 212matched = b.num_open_kprobes() 213if matched == 0: 214 print("0 functions traced. Exiting.") 215 exit() 216 217# header 218if not folded: 219 print("Tracing blocked time (us) by kernel off-CPU and waker stack", 220 end="") 221 if duration < 99999999: 222 print(" for %d secs." % duration) 223 else: 224 print("... Hit Ctrl-C to end.") 225 226# output 227while (1): 228 try: 229 sleep(duration) 230 except KeyboardInterrupt: 231 # as cleanup can take many seconds, trap Ctrl-C: 232 signal.signal(signal.SIGINT, signal_ignore) 233 234 if not folded: 235 print() 236 counts = b.get_table("counts") 237 for k, v in sorted(counts.items(), key=lambda counts: counts[1].value): 238 if folded: 239 # fold target stack 240 line = k.target + ";" 241 for i in reversed(range(0, maxtdepth)): 242 if k.tret[i] == 0: 243 continue 244 line = line + b.ksym(k.tret[i]) 245 if i != 0: 246 line = line + ";" 247 248 # add delimiter 249 line = line + ";-" 250 251 # fold waker stack 252 for i in range(0, maxwdepth): 253 line = line + ";" 254 if k.wret[i] == 0: 255 break 256 line = line + b.ksym(k.wret[i]) 257 if i != 0: 258 line = line + ";" + k.waker 259 260 # print as a line 261 print("%s %d" % (line, v.value)) 262 else: 263 # print wakeup name then stack in reverse order 264 print(" %-16s %s" % ("waker:", k.waker)) 265 for i in reversed(range(0, maxwdepth)): 266 if k.wret[i] == 0: 267 continue 268 print(" %-16x %s" % (k.wret[i], 269 b.ksym(k.wret[i]))) 270 271 # print delimiter 272 print(" %-16s %s" % ("-", "-")) 273 274 # print default multi-line stack output 275 for i in range(0, maxtdepth): 276 if k.tret[i] == 0: 277 break 278 print(" %-16x %s" % (k.tret[i], 279 b.ksym(k.tret[i]))) 280 print(" %-16s %s" % ("target:", k.target)) 281 print(" %d\n" % v.value) 282 counts.clear() 283 284 if not folded: 285 print("Detaching...") 286 exit() 287