1#!/usr/bin/env python 2# @lint-avoid-python-3-compatibility-imports 3# 4# runqlen Summarize scheduler run queue length as a histogram. 5# For Linux, uses BCC, eBPF. 6# 7# This counts the length of the run queue, excluding the currently running 8# thread, and shows it as a histogram. 9# 10# Also answers run queue occupancy. 11# 12# USAGE: runqlen [-h] [-T] [-Q] [-m] [-D] [interval] [count] 13# 14# REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support). Under tools/old is 15# a version of this tool that may work on Linux 4.6 - 4.8. 16# 17# Copyright 2016 Netflix, Inc. 18# Licensed under the Apache License, Version 2.0 (the "License") 19# 20# 12-Dec-2016 Brendan Gregg Created this. 21 22from __future__ import print_function 23from bcc import BPF, PerfType, PerfSWConfig, utils 24from time import sleep, strftime 25from tempfile import NamedTemporaryFile 26from os import open, close, dup, unlink, O_WRONLY 27import argparse 28 29# arguments 30examples = """examples: 31 ./runqlen # summarize run queue length as a histogram 32 ./runqlen 1 10 # print 1 second summaries, 10 times 33 ./runqlen -T 1 # 1s summaries and timestamps 34 ./runqlen -O # report run queue occupancy 35 ./runqlen -C # show each CPU separately 36""" 37parser = argparse.ArgumentParser( 38 description="Summarize scheduler run queue length as a histogram", 39 formatter_class=argparse.RawDescriptionHelpFormatter, 40 epilog=examples) 41parser.add_argument("-T", "--timestamp", action="store_true", 42 help="include timestamp on output") 43parser.add_argument("-O", "--runqocc", action="store_true", 44 help="report run queue occupancy") 45parser.add_argument("-C", "--cpus", action="store_true", 46 help="print output for each CPU separately") 47parser.add_argument("interval", nargs="?", default=99999999, 48 help="output interval, in seconds") 49parser.add_argument("count", nargs="?", default=99999999, 50 help="number of outputs") 51parser.add_argument("--ebpf", action="store_true", 52 help=argparse.SUPPRESS) 
args = parser.parse_args()
countdown = int(args.count)     # number of summaries left to print
debug = 0                       # set non-zero to dump the BPF program text
frequency = 99                  # sample_freq used when attaching the event

# Linux 4.15 introduced a new field runnable_weight
# in linux_src:kernel/sched/sched.h as
#     struct cfs_rq {
#         struct load_weight load;
#         unsigned long runnable_weight;
#         unsigned int nr_running, h_nr_running;
#         ......
#     }
# and this tool needs to access nr_running to get
# runqueue len information.
#
# The commit which introduces cfs_rq->runnable_weight
# field also introduces the field sched_entity->runnable_weight
# where sched_entity is defined in linux_src:include/linux/sched.h.
#
# To cope with pre-4.15 and 4.15/post-4.15 releases,
# we run a simple BPF program to detect whether
# field sched_entity->runnable_weight exists. The existence of
# this field should imply the existence of cfs_rq->runnable_weight.
#
# This will need maintenance as the relationship between these
# two fields may change in the future.
#
def check_runnable_weight_field():
    """Return True if a BPF program reading sched_entity->runnable_weight
    compiles, False if the compile fails.

    stderr (fd 2) is pointed at a temporary file for the duration of the
    compile so that LLVM errors from a failed probe are not printed on the
    screen. The original stderr is restored and the temporary file removed
    on every exit path, including when an exception such as
    KeyboardInterrupt propagates to the caller.
    """
    # Local import: the file-level "from os import ..." list has no dup2.
    from os import dup2

    # Define the bpf program for checking purpose
    bpf_check_text = """
#include <linux/sched.h>
unsigned long dummy(struct sched_entity *entity)
{
    return entity->runnable_weight;
}
"""

    # Get a temporary file name
    tmp_file = NamedTemporaryFile(delete=False)
    tmp_file.close()

    # Keep a duplicate of the real stderr so it can be put back afterwards
    old_stderr = dup(2)
    try:
        # open here is os.open (see the file-level imports). dup2 targets
        # fd 2 explicitly instead of relying on the lowest free descriptor.
        fd = open(tmp_file.name, O_WRONLY)
        try:
            dup2(fd, 2)
        finally:
            close(fd)

        try:
            BPF(text=bpf_check_text)
        except Exception:
            # A compile failure means the field does not exist. Ctrl-C and
            # SystemExit are not Exception subclasses and are not mistaken
            # for a probe result.
            return False
        return True
    finally:
        # Restore the original stderr, then remove the temporary file
        dup2(old_stderr, 2)
        close(old_stderr)
        unlink(tmp_file.name)


# define BPF program
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>

// Declare enough of cfs_rq to find nr_running, since we can't #import the
// header. This will need maintenance. It is from kernel/sched/sched.h:
// The runnable_weight field is removed from Linux 5.7.0
struct cfs_rq_partial {
    struct load_weight load;
#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 7, 0)
    RUNNABLE_WEIGHT_FIELD
#endif
    unsigned int nr_running, h_nr_running;
};

typedef struct cpu_key {
    int cpu;
    unsigned int slot;
} cpu_key_t;
STORAGE

int do_perf_event()
{
    unsigned int len = 0;
    pid_t pid = 0;
    struct task_struct *task = NULL;
    struct cfs_rq_partial *my_q = NULL;

    // Fetch the run queue length from task->se.cfs_rq->nr_running. This is an
    // unstable interface and may need maintenance. Perhaps a future version
    // of BPF will support task_rq(p) or something similar as a more reliable
    // interface.
    task = (struct task_struct *)bpf_get_current_task();
    my_q = (struct cfs_rq_partial *)task->se.cfs_rq;
    len = my_q->nr_running;

    // Calculate run queue length by subtracting the currently running task,
    // if present. len 0 == idle, len 1 == one running task.
    if (len > 0)
        len--;

    STORE

    return 0;
}
"""

# code substitutions
# STORAGE declares the "dist" histogram and STORE records one sample in it:
# keyed by (cpu, slot) with -C, otherwise keyed by the queue length alone.
if args.cpus:
    bpf_text = bpf_text.replace('STORAGE',
        'BPF_HISTOGRAM(dist, cpu_key_t, MAX_CPUS);')
    bpf_text = bpf_text.replace('STORE', 'cpu_key_t key = {.slot = len}; ' +
        'key.cpu = bpf_get_smp_processor_id(); ' +
        'dist.increment(key);')
else:
    bpf_text = bpf_text.replace('STORAGE',
        'BPF_HISTOGRAM(dist, unsigned int);')
    bpf_text = bpf_text.replace('STORE', 'dist.atomic_increment(len);')

# If target has BTF enabled, use BTF to check runnable_weight field exists in
# cfs_rq first, otherwise fallback to use check_runnable_weight_field().
if BPF.kernel_struct_has_field(b'cfs_rq', b'runnable_weight') == 1 \
        or check_runnable_weight_field():
    bpf_text = bpf_text.replace('RUNNABLE_WEIGHT_FIELD',
                                'unsigned long runnable_weight;')
else:
    bpf_text = bpf_text.replace('RUNNABLE_WEIGHT_FIELD', '')

if debug or args.ebpf:
    print(bpf_text)
    if args.ebpf:
        exit()

num_cpus = len(utils.get_online_cpus())

# initialize BPF & perf_events
# NOTE(review): MAX_CPUS is the size given to the per-CPU histogram table,
# whose keys are (cpu, slot) pairs -- confirm the number of online CPUs is
# enough capacity when several queue lengths are seen per CPU.
b = BPF(text=bpf_text, cflags=['-DMAX_CPUS=%s' % str(num_cpus)])
b.attach_perf_event(ev_type=PerfType.SOFTWARE,
    ev_config=PerfSWConfig.CPU_CLOCK, fn_name="do_perf_event",
    sample_period=0, sample_freq=frequency)

print("Sampling run queue length... Hit Ctrl-C to end.")

# output
exiting = 0 if args.interval else 1
dist = b.get_table("dist")
while (1):
    try:
        sleep(int(args.interval))
    except KeyboardInterrupt:
        # print one last summary, then leave the loop below
        exiting = 1

    print()
    if args.timestamp:
        print("%-8s\n" % strftime("%H:%M:%S"), end="")

    if args.runqocc:
        if args.cpus:
            # run queue occupancy, per-CPU summary
            #
            # Read the table once. Sampling keeps running while we print,
            # so deriving cpumax and the totals from two separate reads
            # could meet a CPU in the second read that the first one never
            # saw.
            snapshot = list(dist.items())
            cpumax = max([k.cpu for k, _ in snapshot] + [0])
            idle = [0] * (cpumax + 1)
            queued = [0] * (cpumax + 1)
            for k, v in snapshot:
                if k.slot == 0:
                    idle[k.cpu] += v.value
                else:
                    queued[k.cpu] += v.value
            for c in range(cpumax + 1):
                samples = idle[c] + queued[c]
                if samples:
                    runqocc = float(queued[c]) / samples
                else:
                    runqocc = 0
                print("runqocc, CPU %-3d %6.2f%%" % (c, 100 * runqocc))

        else:
            # run queue occupancy, system-wide summary
            idle = 0
            queued = 0
            for k, v in dist.items():
                if k.value == 0:
                    idle += v.value
                else:
                    queued += v.value
            samples = idle + queued
            if samples:
                runqocc = float(queued) / samples
            else:
                runqocc = 0
            print("runqocc: %0.2f%%" % (100 * runqocc))

    else:
        # run queue length histograms
        dist.print_linear_hist("runqlen", "cpu")

    # start the next interval from an empty table
    dist.clear()

    countdown -= 1
    if exiting or countdown == 0:
        exit()