#!/usr/bin/env python
# @lint-avoid-python-3-compatibility-imports
#
# cpuunclaimed   Sample CPU run queues and calculate unclaimed idle CPU.
#                For Linux, uses BCC, eBPF.
#
# This samples the length of the run queues and determines when there are idle
# CPUs, yet queued threads waiting their turn. Report the amount of idle
# (yet unclaimed by waiting threads) CPU as a system-wide percentage.
#
# This situation can happen for a number of reasons:
#
# - An application has been bound to some, but not all, CPUs, and has runnable
#   threads that cannot migrate to other CPUs due to this configuration.
# - CPU affinity: an optimization that leaves threads on CPUs where the CPU
#   caches are warm, even if this means short periods of waiting while other
#   CPUs are idle. The wait period is tunable (see sysctl, kernel.sched*).
# - Scheduler bugs.
#
# An unclaimed idle of < 1% is likely to be CPU affinity, and not usually a
# cause for concern. By leaving the CPU idle, overall throughput of the system
# may be improved. This tool is best for identifying larger issues, > 2%, due
# to the coarseness of its 99 Hertz samples.
#
# This is an experimental tool that currently works by use of sampling to
# keep overheads low. Tool assumptions:
#
# - CPU samples consistently fire around the same offset. There will sometimes
#   be a lag as a sample is delayed by higher-priority interrupts, but it is
#   assumed the subsequent samples will catch up to the expected offsets (as
#   is seen in practice). You can use -J to inspect sample offsets. Some
#   systems can power down CPUs when idle, and when they wake up again they
#   may begin firing at a skewed offset: this tool will detect the skew, print
#   an error, and exit.
# - All CPUs are online (see ncpu).
#
# If this identifies unclaimed CPU, you can double check it by dumping raw
# samples (-j), as well as using other tracing tools to instrument scheduler
# events (although this latter approach has much higher overhead).
#
# This tool passes all sampled events to user space for post processing.
# I originally wrote this to do the calculations entirely in kernel context,
# and only pass a summary. That involves a number of challenges, and the
# overhead savings may not outweigh the caveats. You can see my WIP here:
# https://gist.github.com/brendangregg/731cf2ce54bf1f9a19d4ccd397625ad9
#
# USAGE: cpuunclaimed [-h] [-j] [-J] [-T] [interval] [count]
#
# If you see "Lost 1881 samples" warnings, try increasing wakeup_hz.
#
# REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support). Under tools/old is
# a version of this tool that may work on Linux 4.6 - 4.8.
#
# Copyright 2016 Netflix, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 20-Dec-2016   Brendan Gregg   Created this.
58 59from __future__ import print_function 60from bcc import BPF, PerfType, PerfSWConfig 61from time import sleep, strftime 62import argparse 63import multiprocessing 64from os import getpid, system, open, close, dup, unlink, O_WRONLY 65from tempfile import NamedTemporaryFile 66 67# arguments 68examples = """examples: 69 ./cpuunclaimed # sample and calculate unclaimed idle CPUs, 70 # output every 1 second (default) 71 ./cpuunclaimed 5 10 # print 5 second summaries, 10 times 72 ./cpuunclaimed -T 1 # 1s summaries and timestamps 73 ./cpuunclaimed -j # raw dump of all samples (verbose), CSV 74""" 75parser = argparse.ArgumentParser( 76 description="Sample CPU run queues and calculate unclaimed idle CPU", 77 formatter_class=argparse.RawDescriptionHelpFormatter, 78 epilog=examples) 79parser.add_argument("-j", "--csv", action="store_true", 80 help="print sample summaries (verbose) as comma-separated values") 81parser.add_argument("-J", "--fullcsv", action="store_true", 82 help="print sample summaries with extra fields: CPU sample offsets") 83parser.add_argument("-T", "--timestamp", action="store_true", 84 help="include timestamp on output") 85parser.add_argument("interval", nargs="?", default=-1, 86 help="output interval, in seconds") 87parser.add_argument("count", nargs="?", default=99999999, 88 help="number of outputs") 89parser.add_argument("--ebpf", action="store_true", 90 help=argparse.SUPPRESS) 91args = parser.parse_args() 92countdown = int(args.count) 93frequency = 99 94dobind = 1 95wakeup_hz = 10 # frequency to read buffers 96wakeup_s = float(1) / wakeup_hz 97ncpu = multiprocessing.cpu_count() # assume all are online 98debug = 0 99 100# Linux 4.15 introduced a new field runnable_weight 101# in linux_src:kernel/sched/sched.h as 102# struct cfs_rq { 103# struct load_weight load; 104# unsigned long runnable_weight; 105# unsigned int nr_running, h_nr_running; 106# ...... 107# } 108# and this tool requires to access nr_running to get 109# runqueue len information. 
110# 111# The commit which introduces cfs_rq->runnable_weight 112# field also introduces the field sched_entity->runnable_weight 113# where sched_entity is defined in linux_src:include/linux/sched.h. 114# 115# To cope with pre-4.15 and 4.15/post-4.15 releases, 116# we run a simple BPF program to detect whether 117# field sched_entity->runnable_weight exists. The existence of 118# this field should infer the existence of cfs_rq->runnable_weight. 119# 120# This will need maintenance as the relationship between these 121# two fields may change in the future. 122# 123def check_runnable_weight_field(): 124 # Define the bpf program for checking purpose 125 bpf_check_text = """ 126#include <linux/sched.h> 127unsigned long dummy(struct sched_entity *entity) 128{ 129 return entity->runnable_weight; 130} 131""" 132 133 # Get a temporary file name 134 tmp_file = NamedTemporaryFile(delete=False) 135 tmp_file.close(); 136 137 # Duplicate and close stderr (fd = 2) 138 old_stderr = dup(2) 139 close(2) 140 141 # Open a new file, should get fd number 2 142 # This will avoid printing llvm errors on the screen 143 fd = open(tmp_file.name, O_WRONLY) 144 try: 145 t = BPF(text=bpf_check_text) 146 success_compile = True 147 except: 148 success_compile = False 149 150 # Release the fd 2, and next dup should restore old stderr 151 close(fd) 152 dup(old_stderr) 153 close(old_stderr) 154 155 # remove the temporary file and return 156 unlink(tmp_file.name) 157 return success_compile 158 159 160# process arguments 161if args.fullcsv: 162 args.csv = True 163if args.csv: 164 interval = 0.2 165if args.interval != -1 and (args.fullcsv or args.csv): 166 print("ERROR: cannot use interval with either -j or -J. 
Exiting.") 167 exit() 168if args.interval == -1: 169 args.interval = "1" 170interval = float(args.interval) 171 172# define BPF program 173bpf_text = """ 174#include <uapi/linux/ptrace.h> 175#include <uapi/linux/bpf_perf_event.h> 176#include <linux/sched.h> 177 178struct data_t { 179 u64 ts; 180 u64 cpu; 181 u64 len; 182}; 183 184BPF_PERF_OUTPUT(events); 185 186// Declare enough of cfs_rq to find nr_running, since we can't #import the 187// header. This will need maintenance. It is from kernel/sched/sched.h: 188// The runnable_weight field is removed from Linux 5.7.0 189struct cfs_rq_partial { 190 struct load_weight load; 191#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 7, 0) 192 RUNNABLE_WEIGHT_FIELD 193#endif 194 unsigned int nr_running, h_nr_running; 195}; 196 197int do_perf_event(struct bpf_perf_event_data *ctx) 198{ 199 int cpu = bpf_get_smp_processor_id(); 200 u64 now = bpf_ktime_get_ns(); 201 202 /* 203 * Fetch the run queue length from task->se.cfs_rq->nr_running. This is an 204 * unstable interface and may need maintenance. Perhaps a future version 205 * of BPF will support task_rq(p) or something similar as a more reliable 206 * interface. 207 */ 208 unsigned int len = 0; 209 struct task_struct *task = NULL; 210 struct cfs_rq_partial *my_q = NULL; 211 task = (struct task_struct *)bpf_get_current_task(); 212 my_q = (struct cfs_rq_partial *)task->se.cfs_rq; 213 len = my_q->nr_running; 214 215 struct data_t data = {.ts = now, .cpu = cpu, .len = len}; 216 events.perf_submit(ctx, &data, sizeof(data)); 217 218 return 0; 219} 220""" 221 222# If target has BTF enabled, use BTF to check runnable_weight field exists in 223# cfs_rq first, otherwise fallback to use check_runnable_weight_field(). 
224if BPF.kernel_struct_has_field(b'cfs_rq', b'runnable_weight') == 1 \ 225 or check_runnable_weight_field(): 226 bpf_text = bpf_text.replace('RUNNABLE_WEIGHT_FIELD', 'unsigned long runnable_weight;') 227else: 228 bpf_text = bpf_text.replace('RUNNABLE_WEIGHT_FIELD', '') 229 230# code substitutions 231if debug or args.ebpf: 232 print(bpf_text) 233 if args.ebpf: 234 exit() 235 236# initialize BPF & perf_events 237b = BPF(text=bpf_text) 238# TODO: check for HW counters first and use if more accurate 239b.attach_perf_event(ev_type=PerfType.SOFTWARE, 240 ev_config=PerfSWConfig.TASK_CLOCK, fn_name="do_perf_event", 241 sample_period=0, sample_freq=frequency) 242 243if args.csv: 244 if args.timestamp: 245 print("TIME", end=",") 246 print("TIMESTAMP_ns", end=",") 247 print(",".join("CPU" + str(c) for c in range(ncpu)), end="") 248 if args.fullcsv: 249 print(",", end="") 250 print(",".join("OFFSET_ns_CPU" + str(c) for c in range(ncpu)), end="") 251 print() 252else: 253 print(("Sampling run queues... Output every %s seconds. " + 254 "Hit Ctrl-C to end.") % args.interval) 255 256samples = {} 257group = {} 258last = 0 259 260# process event 261def print_event(cpu, data, size): 262 event = b["events"].event(data) 263 samples[event.ts] = {} 264 samples[event.ts]['cpu'] = event.cpu 265 samples[event.ts]['len'] = event.len 266 267exiting = 0 if args.interval else 1 268slept = float(0) 269 270# Choose the elapsed time from one sample group to the next that identifies a 271# new sample group (a group being a set of samples from all CPUs). The 272# earliest timestamp is compared in each group. This trigger is also used 273# for sanity testing, if a group's samples exceed half this value. 274trigger = int(0.8 * (1000000000 / frequency)) 275 276# read events 277b["events"].open_perf_buffer(print_event, page_cnt=64) 278while 1: 279 # allow some buffering by calling sleep(), to reduce the context switch 280 # rate and lower overhead. 
281 try: 282 if not exiting: 283 sleep(wakeup_s) 284 except KeyboardInterrupt: 285 exiting = 1 286 b.perf_buffer_poll() 287 slept += wakeup_s 288 289 if slept < 0.999 * interval: # floating point workaround 290 continue 291 slept = 0 292 293 positive = 0 # number of samples where an idle CPU could have run work 294 running = 0 295 idle = 0 296 if debug >= 2: 297 print("DEBUG: begin samples loop, count %d" % len(samples)) 298 for e in sorted(samples): 299 if debug >= 2: 300 print("DEBUG: ts %d cpu %d len %d delta %d trig %d" % (e, 301 samples[e]['cpu'], samples[e]['len'], e - last, 302 e - last > trigger)) 303 304 # look for time jumps to identify a new sample group 305 if e - last > trigger: 306 307 # first first group timestamp, and sanity test 308 g_time = 0 309 g_max = 0 310 for ge in sorted(group): 311 if g_time == 0: 312 g_time = ge 313 g_max = ge 314 315 # process previous sample group 316 if args.csv: 317 lens = [0] * ncpu 318 offs = [0] * ncpu 319 for ge in sorted(group): 320 lens[samples[ge]['cpu']] = samples[ge]['len'] 321 if args.fullcsv: 322 offs[samples[ge]['cpu']] = ge - g_time 323 if g_time > 0: # else first sample 324 if args.timestamp: 325 print("%-8s" % strftime("%H:%M:%S"), end=",") 326 print("%d" % g_time, end=",") 327 print(",".join(str(lens[c]) for c in range(ncpu)), end="") 328 if args.fullcsv: 329 print(",", end="") 330 print(",".join(str(offs[c]) for c in range(ncpu))) 331 else: 332 print() 333 else: 334 # calculate stats 335 g_running = 0 336 g_queued = 0 337 for ge in group: 338 if samples[ge]['len'] > 0: 339 g_running += 1 340 if samples[ge]['len'] > 1: 341 g_queued += samples[ge]['len'] - 1 342 g_idle = ncpu - g_running 343 344 # calculate the number of threads that could have run as the 345 # minimum of idle and queued 346 if g_idle > 0 and g_queued > 0: 347 if g_queued > g_idle: 348 i = g_idle 349 else: 350 i = g_queued 351 positive += i 352 running += g_running 353 idle += g_idle 354 355 # now sanity test, after -J output 356 g_range 
= g_max - g_time 357 if g_range > trigger / 2: 358 # if a sample group exceeds half the interval, we can no 359 # longer draw conclusions about some CPUs idle while others 360 # have queued work. Error and exit. This can happen when 361 # CPUs power down, then start again on different offsets. 362 # TODO: Since this is a sampling tool, an error margin should 363 # be anticipated, so an improvement may be to bump a counter 364 # instead of exiting, and only exit if this counter shows 365 # a skewed sample rate of over, say, 1%. Such an approach 366 # would allow a small rate of outliers (sampling error), 367 # and, we could tighten the trigger to be, say, trigger / 5. 368 # In the case of a power down, if it's detectable, perhaps 369 # the tool could reinitialize the timers (although exiting 370 # is simple and works). 371 print(("ERROR: CPU samples arrived at skewed offsets " + 372 "(CPUs may have powered down when idle), " + 373 "spanning %d ns (expected < %d ns). Debug with -J, " + 374 "and see the man page. As output may begin to be " + 375 "unreliable, exiting.") % (g_range, trigger / 2)) 376 exit() 377 378 # these are done, remove 379 for ge in sorted(group): 380 del samples[ge] 381 382 # begin next group 383 group = {} 384 last = e 385 386 # stash this timestamp in a sample group dict 387 group[e] = 1 388 389 if not args.csv: 390 total = running + idle 391 unclaimed = util = 0 392 393 if debug: 394 print("DEBUG: hit %d running %d idle %d total %d buffered %d" % ( 395 positive, running, idle, total, len(samples))) 396 397 if args.timestamp: 398 print("%-8s " % strftime("%H:%M:%S"), end="") 399 400 # output 401 if total: 402 unclaimed = float(positive) / total 403 util = float(running) / total 404 print("%%CPU %6.2f%%, unclaimed idle %0.2f%%" % (100 * util, 405 100 * unclaimed)) 406 407 countdown -= 1 408 if exiting or countdown == 0: 409 exit() 410