#!/usr/bin/env python
# @lint-avoid-python-3-compatibility-imports
#
# compactsnoop  Trace compact zone and print details including issuing PID.
#               For Linux, uses BCC, eBPF.
#
# This uses in-kernel eBPF maps to cache process details (PID and comm) by
# compact zone begin, as well as a starting timestamp for calculating
# latency.
#
# Copyright (c) 2019 Wenbo Zhang
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 11-NOV-2019  Wenbo Zhang  Created this.

from __future__ import print_function

import argparse
import platform
import sys
from datetime import datetime, timedelta

from bcc import BPF

# Upper bound (milliseconds) for a single perf buffer poll.  Polling without
# a timeout blocks until the next event, which would keep "-d" from ever
# expiring on a system where no compaction happens.
POLL_TIMEOUT_MS = 1000

# arguments
examples = """examples:
    ./compactsnoop          # trace all compact stall
    ./compactsnoop -T       # include timestamps
    ./compactsnoop -d 10    # trace for 10 seconds only
    ./compactsnoop -K       # output kernel stack trace
    ./compactsnoop -e       # show extended fields
"""

parser = argparse.ArgumentParser(
    description="Trace compact zone",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples,
)
parser.add_argument("-T", "--timestamp", action="store_true",
                    help="include timestamp on output")
# The PID is substituted into the BPF C source below, so only accept integers.
parser.add_argument("-p", "--pid", type=int, help="trace this PID only")
parser.add_argument("-d", "--duration", type=int,
                    help="total duration of trace in seconds")
parser.add_argument("-K", "--kernel-stack", action="store_true",
                    help="output kernel stack trace")
parser.add_argument("-e", "--extended_fields", action="store_true",
                    help="show system memory state")
parser.add_argument("--ebpf", action="store_true", help=argparse.SUPPRESS)
args = parser.parse_args()
debug = 0
if args.duration:
    args.duration = timedelta(seconds=args.duration)

# One of the two snippets below is prepended to bpf_text to switch the
# optional fields on or off.  The macro spelling (sic) is used consistently
# throughout the BPF program.
NO_EXTENDED = """
#ifdef EXTNEDED_FIELDS
#undef EXTNEDED_FIELDS
#endif
"""

EXTENDED = """
#define EXTNEDED_FIELDS 1
"""

bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/compaction.h>

struct val_t {
    int nid;
    int idx;
    int order;
    int sync;
#ifdef EXTNEDED_FIELDS
    int fragindex;
    int low;
    int min;
    int high;
    int free;
#endif
    u64 ts; // compaction begin time
};

struct data_t {
    u32 pid;
    u32 tid;
    int nid;
    int idx;
    int order;
    u64 delta;
    u64 ts; // compaction end time
    int sync;
#ifdef EXTNEDED_FIELDS
    int fragindex;
    int low;
    int min;
    int high;
    int free;
#endif
    int status;
    int stack_id;
    char comm[TASK_COMM_LEN];
};

BPF_HASH(start, u64, struct val_t);
BPF_PERF_OUTPUT(events);
BPF_STACK_TRACE(stack_traces, 2048);

#ifdef CONFIG_NUMA
static inline int zone_to_nid_(struct zone *zone)
{
    int node;
    bpf_probe_read_kernel(&node, sizeof(node), &zone->node);
    return node;
}
#else
static inline int zone_to_nid_(struct zone *zone)
{
    return 0;
}
#endif

// #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones)
static inline int zone_idx_(struct zone *zone)
{
    struct pglist_data *zone_pgdat = NULL;
    bpf_probe_read_kernel(&zone_pgdat, sizeof(zone_pgdat), &zone->zone_pgdat);
    return ((u64)zone - (u64)zone_pgdat->node_zones)/sizeof(struct zone);
}

#ifdef EXTNEDED_FIELDS
static inline void get_all_wmark_pages(struct zone *zone, struct val_t *valp)
{
    u64 _watermark[NR_WMARK] = {};
    u64 watermark_boost = 0;

    bpf_probe_read_kernel(&_watermark, sizeof(_watermark), &zone->_watermark);
    bpf_probe_read_kernel(&watermark_boost, sizeof(watermark_boost),
                          &zone->watermark_boost);
    valp->min = _watermark[WMARK_MIN] + watermark_boost;
    valp->low = _watermark[WMARK_LOW] + watermark_boost;
    valp->high = _watermark[WMARK_HIGH] + watermark_boost;
    bpf_probe_read_kernel(&valp->free, sizeof(valp->free),
                          &zone->vm_stat[NR_FREE_PAGES]);
}
#endif

static inline void submit_event(void *ctx, int status)
{
    struct data_t data = {};
    u64 ts = bpf_ktime_get_ns();
    u64 id = bpf_get_current_pid_tgid();
    struct val_t *valp = start.lookup(&id);
    if (valp == NULL) {
        // missed entry
        return;
    }

    data.delta = ts - valp->ts;
    data.ts = ts / 1000;
    data.pid = id >> 32;
    data.tid = id;
    bpf_get_current_comm(&data.comm, sizeof(data.comm));
    data.nid = valp->nid;
    data.idx = valp->idx;
    data.order = valp->order;
    data.sync = valp->sync;

#ifdef EXTNEDED_FIELDS
    data.fragindex = valp->fragindex;
    data.min = valp->min;
    data.low = valp->low;
    data.high = valp->high;
    data.free = valp->free;
#endif

    data.status = status;
    data.stack_id = stack_traces.get_stackid(ctx, 0);

    events.perf_submit(ctx, &data, sizeof(data));

    start.delete(&id);
}

#ifdef EXTNEDED_FIELDS
int trace_fragmentation_index_return(struct pt_regs *ctx)
{
    struct val_t val = { };
    int ret = PT_REGS_RC(ctx);
    u64 id = bpf_get_current_pid_tgid();
    PID_FILTER
    val.fragindex = ret;
    start.update(&id, &val);
    return 0;
}
#endif

static inline void fill_compact_info(struct val_t *valp,
                                     struct zone *zone,
                                     int order)
{
    valp->nid = zone_to_nid_(zone);
    valp->idx = zone_idx_(zone);
    valp->order = order;
}

RAW_TRACEPOINT_PROBE(mm_compaction_suitable)
{
    // TP_PROTO(struct zone *zone, int order, int ret)
    struct zone *zone = (struct zone *)ctx->args[0];
    int order = (int)ctx->args[1];
    int ret = (int)ctx->args[2];
    u64 id;

    if(ret != COMPACT_CONTINUE)
        return 0;

    id = bpf_get_current_pid_tgid();
    PID_FILTER

#ifdef EXTNEDED_FIELDS
    struct val_t *valp = start.lookup(&id);
    if (valp == NULL) {
        // missed entry or order <= PAGE_ALLOC_COSTLY_ORDER, eg:
        // manual trigger echo 1 > /proc/sys/vm/compact_memory
        //
        // Fill the new value completely before inserting it: the map
        // stores a copy, so fields written to the stack value after
        // update() would never reach the entry read by submit_event().
        struct val_t val = { .fragindex = -1000 };
        fill_compact_info(&val, zone, order);
        get_all_wmark_pages(zone, &val);
        start.update(&id, &val);
        return 0;
    }
    // Entry created by trace_fragmentation_index_return: update in place.
    fill_compact_info(valp, zone, order);
    get_all_wmark_pages(zone, valp);
#else
    struct val_t val = { };
    fill_compact_info(&val, zone, order);
    start.update(&id, &val);
#endif

    return 0;
}

TRACEPOINT_PROBE(compaction, mm_compaction_begin)
{
    bool sync = args->sync;

    u64 id = bpf_get_current_pid_tgid();
    struct val_t *valp = start.lookup(&id);
    if (valp == NULL) {
        // missed entry
        return 0;
    }

    valp->ts = bpf_ktime_get_ns();
    valp->sync = sync;
    return 0;
}

TRACEPOINT_PROBE(compaction, mm_compaction_end)
{
    submit_event(args, args->status);
    return 0;
}
"""

if platform.machine() != 'x86_64':
    print("""
          Currently only support x86_64 servers, if you want to use it on
          other platforms, please refer include/linux/mmzone.h to modify
          zone_idx_to_str to get the right zone type
    """)
    sys.exit()

if args.extended_fields:
    bpf_text = EXTENDED + bpf_text
else:
    bpf_text = NO_EXTENDED + bpf_text

# "is not None" keeps an explicit "-p 0" filter active.
if args.pid is not None:
    bpf_text = bpf_text.replace("PID_FILTER",
                                "if (id >> 32 != %d) { return 0; }" % args.pid)
else:
    bpf_text = bpf_text.replace("PID_FILTER", "")
if debug or args.ebpf:
    print(bpf_text)
    if args.ebpf:
        sys.exit()

# load BPF program
b = BPF(text=bpf_text)
if args.extended_fields:
    b.attach_kretprobe(event="fragmentation_index",
                       fn_name="trace_fragmentation_index_return")

stack_traces = b.get_table("stack_traces")
# Timestamp (microseconds) of the first event seen; TIME(s) is relative to it.
initial_ts = 0

# from include/linux/mmzone.h
# NOTICE: consider only x86_64 servers
ZONE_TYPE_NAMES = {
    0: "ZONE_DMA",
    1: "ZONE_DMA32",
    2: "ZONE_NORMAL",
}

# from include/trace/events/mmflags.h
# from include/linux/compaction.h
COMPACT_STATUS_NAMES = {
    # COMPACT_NOT_SUITABLE_ZONE: For more detailed tracepoint
    # output - internal to compaction
    0: "not_suitable_zone",
    # COMPACT_SKIPPED: compaction didn't start as it was not
    # possible or direct reclaim was more suitable
    1: "skipped",
    # COMPACT_DEFERRED: compaction didn't start as it was
    # deferred due to past failures
    2: "deferred",
    # COMPACT_NOT_SUITABLE_PAGE: For more detailed tracepoint
    # output - internal to compaction
    3: "no_suitable_page",
    # COMPACT_CONTINUE: compaction should continue to another pageblock
    4: "continue",
    # COMPACT_COMPLETE: The full zone was compacted scanned but wasn't
    # successful to compact suitable pages.
    5: "complete",
    # COMPACT_PARTIAL_SKIPPED: direct compaction has scanned part of the
    # zone but wasn't successful to compact suitable pages.
    6: "partial_skipped",
    # COMPACT_CONTENDED: compaction terminated prematurely due to lock
    # contentions
    7: "contended",
    # COMPACT_SUCCESS: direct compaction terminated after concluding
    # that the allocation should now succeed
    8: "success",
}


def zone_idx_to_str(idx):
    """Return the zone name for zone index *idx*, or str(idx) if unknown."""
    return ZONE_TYPE_NAMES.get(idx, str(idx))


def compact_result_to_str(status):
    """Return the name of compaction result *status*, or str(status)."""
    return COMPACT_STATUS_NAMES.get(status, str(status))


# header
if args.timestamp:
    print("%-14s" % ("TIME(s)"), end=" ")
print("%-14s %-6s %-4s %-12s %-5s %-7s" %
      ("COMM", "PID", "NODE", "ZONE", "ORDER", "MODE"), end=" ")
if args.extended_fields:
    print("%-8s %-8s %-8s %-8s %-8s" %
          ("FRAGIDX", "MIN", "LOW", "HIGH", "FREE"), end=" ")
print("%9s %16s" % ("LAT(ms)", "STATUS"))


# process event
def print_event(cpu, data, size):
    """Perf buffer callback: print one output row for a compaction event.

    cpu and size are supplied by the perf buffer and are not used; data is
    the raw event, decoded through the "events" table.
    """
    global initial_ts

    event = b["events"].event(data)

    if not initial_ts:
        initial_ts = event.ts

    if args.timestamp:
        # event.ts is in microseconds (the BPF side divides ns by 1000).
        delta = event.ts - initial_ts
        print("%-14.9f" % (float(delta) / 1000000), end=" ")

    print("%-14.14s %-6s %-4s %-12s %-5s %-7s" % (
        event.comm.decode("utf-8", "replace"),
        event.pid,
        event.nid,
        zone_idx_to_str(event.idx),
        event.order,
        "SYNC" if event.sync else "ASYNC"), end=" ")
    if args.extended_fields:
        print("%-8.3f %-8s %-8s %-8s %-8s" % (
            (float(event.fragindex) / 1000),
            event.min, event.low, event.high, event.free
        ), end=" ")
    # event.delta is in nanoseconds; report milliseconds.
    print("%9.3f %16s" % (
        float(event.delta) / 1000000, compact_result_to_str(event.status)))
    if args.kernel_stack:
        if event.stack_id < 0:
            # get_stackid() failed in the kernel; there is nothing to walk.
            print("\t[Missed Kernel Stack]")
        else:
            for addr in stack_traces.walk(event.stack_id):
                sym = b.ksym(addr, show_offset=True)
                print("\t%s" % sym)
        print("")

    sys.stdout.flush()


# loop with callback to print_event
b["events"].open_perf_buffer(print_event, page_cnt=64)
start_time = datetime.now()
while not args.duration or datetime.now() - start_time < args.duration:
    try:
        # Bounded wait so the duration check above is re-evaluated even
        # when no events arrive.
        b.perf_buffer_poll(timeout=POLL_TIMEOUT_MS)
    except KeyboardInterrupt:
        sys.exit()