#!/usr/bin/env python
# @lint-avoid-python-3-compatibility-imports
#
# tcplife   Trace the lifespan of TCP sessions and summarize.
#           For Linux, uses BCC, BPF. Embedded C.
#
# USAGE: tcplife [-h] [-T] [-t] [-w] [-s] [-p PID] [-L LOCALPORT]
#                [-D REMOTEPORT] [-4 | -6]
#
# This uses the sock:inet_sock_set_state tracepoint if it exists (added to
# Linux 4.16, and replacing the earlier tcp:tcp_set_state), else it uses
# kernel dynamic tracing of tcp_set_state().
#
# While throughput counters are emitted, they are fetched in a low-overhead
# manner: reading members of the tcp_info struct on TCP close. i.e., we do
# not trace send/receive.
#
# Copyright 2016 Netflix, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# IDEA: Julia Evans
#
# 18-Oct-2016   Brendan Gregg   Created this.
# 29-Dec-2017      "      "     Added tracepoint support.

from __future__ import print_function
from bcc import BPF
import argparse
from socket import inet_ntop, AF_INET, AF_INET6
from struct import pack
from time import strftime

# arguments
examples = """examples:
    ./tcplife           # trace all TCP connect()s
    ./tcplife -T        # include time column (HH:MM:SS)
    ./tcplife -w        # wider columns (fit IPv6)
    ./tcplife -stT      # csv output, with times & timestamps
    ./tcplife -p 181    # only trace PID 181
    ./tcplife -L 80     # only trace local port 80
    ./tcplife -L 80,81  # only trace local ports 80 and 81
    ./tcplife -D 80     # only trace remote port 80
    ./tcplife -4        # only trace IPv4 family
    ./tcplife -6        # only trace IPv6 family
"""
parser = argparse.ArgumentParser(
    description="Trace the lifespan of TCP sessions and summarize",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
parser.add_argument("-T", "--time", action="store_true",
    help="include time column on output (HH:MM:SS)")
parser.add_argument("-t", "--timestamp", action="store_true",
    help="include timestamp on output (seconds)")
parser.add_argument("-w", "--wide", action="store_true",
    help="wide column output (fits IPv6 addresses)")
parser.add_argument("-s", "--csv", action="store_true",
    help="comma separated values output")
parser.add_argument("-p", "--pid",
    help="trace this PID only")
parser.add_argument("-L", "--localport",
    help="comma-separated list of local ports to trace")
parser.add_argument("-D", "--remoteport",
    help="comma-separated list of remote ports to trace")
group = parser.add_mutually_exclusive_group()
group.add_argument("-4", "--ipv4", action="store_true",
    help="trace IPv4 family only")
group.add_argument("-6", "--ipv6", action="store_true",
    help="trace IPv6 family only")
parser.add_argument("--ebpf", action="store_true",
    help=argparse.SUPPRESS)
args = parser.parse_args()
debug = 0

# define BPF program
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/tcp.h>
#include <net/sock.h>
#include <bcc/proto.h>

BPF_HASH(birth, struct sock *, u64);

// separate data structs for ipv4 and ipv6
struct ipv4_data_t {
    u64 ts_us;
    u32 pid;
    u32 saddr;
    u32 daddr;
    u64 ports;
    u64 rx_b;
    u64 tx_b;
    u64 span_us;
    char task[TASK_COMM_LEN];
};
BPF_PERF_OUTPUT(ipv4_events);

struct ipv6_data_t {
    u64 ts_us;
    u32 pid;
    unsigned __int128 saddr;
    unsigned __int128 daddr;
    u64 ports;
    u64 rx_b;
    u64 tx_b;
    u64 span_us;
    char task[TASK_COMM_LEN];
};
BPF_PERF_OUTPUT(ipv6_events);

struct id_t {
    u32 pid;
    char task[TASK_COMM_LEN];
};
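// whoami caches the PID and comm captured at connection setup (SYN_SENT or
// LAST_ACK), keyed by socket pointer, so they can be reported at TCP_CLOSE,
// when the current task may not be the one that owned the connection.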
BPF_HASH(whoami, struct sock *, struct id_t);
"""

#
# XXX: The following is temporary code for older kernels, Linux 4.14 and
# older. It uses kprobes to instrument tcp_set_state(). On Linux 4.16 and
# later, the sock:inet_sock_set_state tracepoint should be used instead, as
# is done by the code that follows this. In the distant future (2021?), this
# kprobe code can be removed. This is why there is so much code
# duplication: to make removal easier.
#
bpf_text_kprobe = """
int kprobe__tcp_set_state(struct pt_regs *ctx, struct sock *sk, int state)
{
    u32 pid = bpf_get_current_pid_tgid() >> 32;

    // lport is either used in a filter here, or later
    u16 lport = sk->__sk_common.skc_num;
    FILTER_LPORT

    // dport is either used in a filter here, or later
    u16 dport = sk->__sk_common.skc_dport;
    dport = ntohs(dport);
    FILTER_DPORT

    /*
     * This tool includes PID and comm context. It's best effort, and may
     * be wrong in some situations. It currently works like this:
     * - record timestamp on any state < TCP_FIN_WAIT1
     * - cache task context on:
     *       TCP_SYN_SENT: tracing from client
     *       TCP_LAST_ACK: client-closed from server
     * - do output on TCP_CLOSE:
     *       fetch task context if cached, or use current task
     */

    // capture birth time
    if (state < TCP_FIN_WAIT1) {
        /*
         * Matching just ESTABLISHED may be sufficient, provided no code-path
         * sets ESTABLISHED without a tcp_set_state() call. Until we know
         * that for sure, match all early states to increase chances a
         * timestamp is set.
         * Note that this needs to be set before the PID filter later on,
         * since the PID isn't reliable for these early stages, so we must
         * save all timestamps and do the PID filter later when we can.
         */
        u64 ts = bpf_ktime_get_ns();
        birth.update(&sk, &ts);
    }

    // record PID & comm on SYN_SENT
    if (state == TCP_SYN_SENT || state == TCP_LAST_ACK) {
        // now we can PID filter, both here and a little later on for CLOSE
        FILTER_PID
        struct id_t me = {.pid = pid};
        bpf_get_current_comm(&me.task, sizeof(me.task));
        whoami.update(&sk, &me);
    }

    if (state != TCP_CLOSE)
        return 0;

    // calculate lifespan
    u64 *tsp, delta_us;
    tsp = birth.lookup(&sk);
    if (tsp == 0) {
        whoami.delete(&sk);     // may not exist
        return 0;               // missed create
    }
    delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;
    birth.delete(&sk);

    // fetch possible cached data, and filter
    struct id_t *mep;
    mep = whoami.lookup(&sk);
    if (mep != 0)
        pid = mep->pid;
    FILTER_PID

    // get throughput stats. see tcp_get_info().
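    // rx_b/tx_b are cumulative counters kept by the kernel in struct
    // tcp_sock, so a single read here at close is enough; individual
    // send/receive calls are never traced (see the header comment).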
    u64 rx_b = 0, tx_b = 0;
    struct tcp_sock *tp = (struct tcp_sock *)sk;
    rx_b = tp->bytes_received;
    tx_b = tp->bytes_acked;

    u16 family = sk->__sk_common.skc_family;

    FILTER_FAMILY

    if (family == AF_INET) {
        struct ipv4_data_t data4 = {};
        data4.span_us = delta_us;
        data4.rx_b = rx_b;
        data4.tx_b = tx_b;
        data4.ts_us = bpf_ktime_get_ns() / 1000;
        data4.saddr = sk->__sk_common.skc_rcv_saddr;
        data4.daddr = sk->__sk_common.skc_daddr;
        // a workaround until data4 compiles with separate lport/dport
        data4.pid = pid;
        data4.ports = dport + ((0ULL + lport) << 32);
        if (mep == 0) {
            bpf_get_current_comm(&data4.task, sizeof(data4.task));
        } else {
            bpf_probe_read_kernel(&data4.task, sizeof(data4.task), (void *)mep->task);
        }
        ipv4_events.perf_submit(ctx, &data4, sizeof(data4));

    } else /* 6 */ {
        struct ipv6_data_t data6 = {};
        data6.span_us = delta_us;
        data6.rx_b = rx_b;
        data6.tx_b = tx_b;
        data6.ts_us = bpf_ktime_get_ns() / 1000;
        bpf_probe_read_kernel(&data6.saddr, sizeof(data6.saddr),
            sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
        bpf_probe_read_kernel(&data6.daddr, sizeof(data6.daddr),
            sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
        // a workaround until data6 compiles with separate lport/dport
        data6.ports = dport + ((0ULL + lport) << 32);
        data6.pid = pid;
        if (mep == 0) {
            bpf_get_current_comm(&data6.task, sizeof(data6.task));
        } else {
            bpf_probe_read_kernel(&data6.task, sizeof(data6.task), (void *)mep->task);
        }
        ipv6_events.perf_submit(ctx, &data6, sizeof(data6));
    }

    if (mep != 0)
        whoami.delete(&sk);

    return 0;
}
"""

bpf_text_tracepoint = """
TRACEPOINT_PROBE(sock, inet_sock_set_state)
{
    if (args->protocol != IPPROTO_TCP)
        return 0;

    u32 pid = bpf_get_current_pid_tgid() >> 32;
    // sk is mostly used as a UUID, and for two tcp stats:
    struct sock *sk = (struct sock *)args->skaddr;

    // lport is either used in a filter here, or later
    u16 lport = args->sport;
    FILTER_LPORT

    // dport is either used in a filter here, or later
    u16 dport = args->dport;
    FILTER_DPORT

    /*
     * This tool includes PID and comm context. It's best effort, and may
     * be wrong in some situations. It currently works like this:
     * - record timestamp on any state < TCP_FIN_WAIT1
     * - cache task context on:
     *       TCP_SYN_SENT: tracing from client
     *       TCP_LAST_ACK: client-closed from server
     * - do output on TCP_CLOSE:
     *       fetch task context if cached, or use current task
     */

    // capture birth time
    if (args->newstate < TCP_FIN_WAIT1) {
        /*
         * Matching just ESTABLISHED may be sufficient, provided no code-path
         * sets ESTABLISHED without a tcp_set_state() call. Until we know
         * that for sure, match all early states to increase chances a
         * timestamp is set.
         * Note that this needs to be set before the PID filter later on,
         * since the PID isn't reliable for these early stages, so we must
         * save all timestamps and do the PID filter later when we can.
         */
        u64 ts = bpf_ktime_get_ns();
        birth.update(&sk, &ts);
    }

    // record PID & comm on SYN_SENT
    if (args->newstate == TCP_SYN_SENT || args->newstate == TCP_LAST_ACK) {
        // now we can PID filter, both here and a little later on for CLOSE
        FILTER_PID
        struct id_t me = {.pid = pid};
        bpf_get_current_comm(&me.task, sizeof(me.task));
        whoami.update(&sk, &me);
    }

    if (args->newstate != TCP_CLOSE)
        return 0;

    // calculate lifespan
    u64 *tsp, delta_us;
    tsp = birth.lookup(&sk);
    if (tsp == 0) {
        whoami.delete(&sk);     // may not exist
        return 0;               // missed create
    }
    delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;
    birth.delete(&sk);

    // fetch possible cached data, and filter
    struct id_t *mep;
    mep = whoami.lookup(&sk);
    if (mep != 0)
        pid = mep->pid;
    FILTER_PID

    u16 family = args->family;
    FILTER_FAMILY

    // get throughput stats. see tcp_get_info().
    u64 rx_b = 0, tx_b = 0;
    struct tcp_sock *tp = (struct tcp_sock *)sk;
    rx_b = tp->bytes_received;
    tx_b = tp->bytes_acked;

    if (args->family == AF_INET) {
        struct ipv4_data_t data4 = {};
        data4.span_us = delta_us;
        data4.rx_b = rx_b;
        data4.tx_b = tx_b;
        data4.ts_us = bpf_ktime_get_ns() / 1000;
        __builtin_memcpy(&data4.saddr, args->saddr, sizeof(data4.saddr));
        __builtin_memcpy(&data4.daddr, args->daddr, sizeof(data4.daddr));
        // a workaround until data4 compiles with separate lport/dport
        data4.ports = dport + ((0ULL + lport) << 32);
        data4.pid = pid;

        if (mep == 0) {
            bpf_get_current_comm(&data4.task, sizeof(data4.task));
        } else {
            bpf_probe_read_kernel(&data4.task, sizeof(data4.task), (void *)mep->task);
        }
        ipv4_events.perf_submit(args, &data4, sizeof(data4));

    } else /* 6 */ {
        struct ipv6_data_t data6 = {};
        data6.span_us = delta_us;
        data6.rx_b = rx_b;
        data6.tx_b = tx_b;
        data6.ts_us = bpf_ktime_get_ns() / 1000;
        __builtin_memcpy(&data6.saddr, args->saddr_v6, sizeof(data6.saddr));
        __builtin_memcpy(&data6.daddr, args->daddr_v6, sizeof(data6.daddr));
        // a workaround until data6 compiles with separate lport/dport
        data6.ports = dport + ((0ULL + lport) << 32);
        data6.pid = pid;
        if (mep == 0) {
            bpf_get_current_comm(&data6.task, sizeof(data6.task));
        } else {
            bpf_probe_read_kernel(&data6.task, sizeof(data6.task), (void *)mep->task);
        }
        ipv6_events.perf_submit(args, &data6, sizeof(data6));
    }

    if (mep != 0)
        whoami.delete(&sk);

    return 0;
}
"""

if (BPF.tracepoint_exists("sock", "inet_sock_set_state")):
    bpf_text += bpf_text_tracepoint
else:
    bpf_text += bpf_text_kprobe

# code substitutions
if args.pid:
    bpf_text = bpf_text.replace('FILTER_PID',
        'if (pid != %s) { return 0; }' % args.pid)
if args.remoteport:
    dports = [int(dport) for dport in args.remoteport.split(',')]
    dports_if = ' && '.join(['dport != %d' % dport for dport in dports])
    bpf_text = bpf_text.replace('FILTER_DPORT',
        'if (%s) { birth.delete(&sk); return 0; }' % dports_if)
if args.localport:
    lports = [int(lport) for lport in args.localport.split(',')]
    lports_if = ' && '.join(['lport != %d' % lport for lport in lports])
    bpf_text = bpf_text.replace('FILTER_LPORT',
        'if (%s) { birth.delete(&sk); return 0; }' % lports_if)
if args.ipv4:
    bpf_text = bpf_text.replace('FILTER_FAMILY',
        'if (family != AF_INET) { return 0; }')
elif args.ipv6:
    bpf_text = bpf_text.replace('FILTER_FAMILY',
        'if (family != AF_INET6) { return 0; }')
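# clear any remaining filter placeholders so the embedded C still compiles
# when the corresponding filter was not requested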
bpf_text = bpf_text.replace('FILTER_PID', '')
bpf_text = bpf_text.replace('FILTER_DPORT', '')
bpf_text = bpf_text.replace('FILTER_LPORT', '')
bpf_text = bpf_text.replace('FILTER_FAMILY', '')

if debug or args.ebpf:
    print(bpf_text)
    if args.ebpf:
        exit()

#
# Set up output formats
#
# Don't change the default output (next 2 lines): this fits in 80 chars. I
# know it doesn't have NS or UIDs etc. I know. If you really, really, really
# need to add columns, columns that solve real actual problems, I'd start by
# adding an extended mode (-x) to include those columns.
#
header_string = "%-5s %-10.10s %s%-15s %-5s %-15s %-5s %5s %5s %s"
format_string = "%-5d %-10.10s %s%-15s %-5d %-15s %-5d %5d %5d %.2f"
if args.wide:
    header_string = "%-5s %-16.16s %-2s %-39s %-5s %-39s %-5s %6s %6s %s"
    format_string = "%-5d %-16.16s %-2s %-39s %-5s %-39s %-5d %6d %6d %.2f"
if args.csv:
    header_string = "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s"
    format_string = "%d,%s,%s,%s,%s,%s,%d,%d,%d,%.2f"

# process event
def print_ipv4_event(cpu, data, size):
    event = b["ipv4_events"].event(data)
    global start_ts
    if args.time:
        if args.csv:
            print("%s," % strftime("%H:%M:%S"), end="")
        else:
            print("%-8s " % strftime("%H:%M:%S"), end="")
    if args.timestamp:
        if start_ts == 0:
            start_ts = event.ts_us
        delta_s = (float(event.ts_us) - start_ts) / 1000000
        if args.csv:
            print("%.6f," % delta_s, end="")
        else:
            print("%-9.6f " % delta_s, end="")
    print(format_string % (event.pid, event.task.decode('utf-8', 'replace'),
        "4" if args.wide or args.csv else "",
        inet_ntop(AF_INET, pack("I", event.saddr)), event.ports >> 32,
        inet_ntop(AF_INET, pack("I", event.daddr)), event.ports & 0xffffffff,
        event.tx_b / 1024, event.rx_b / 1024, float(event.span_us) / 1000))

def print_ipv6_event(cpu, data, size):
    event = b["ipv6_events"].event(data)
    global start_ts
    if args.time:
        if args.csv:
            print("%s," % strftime("%H:%M:%S"), end="")
        else:
            print("%-8s " % strftime("%H:%M:%S"), end="")
    if args.timestamp:
        if start_ts == 0:
            start_ts = event.ts_us
        delta_s = (float(event.ts_us) - start_ts) / 1000000
        if args.csv:
            print("%.6f," % delta_s, end="")
        else:
            print("%-9.6f " % delta_s, end="")
    print(format_string % (event.pid, event.task.decode('utf-8', 'replace'),
        "6" if args.wide or args.csv else "",
        inet_ntop(AF_INET6, event.saddr), event.ports >> 32,
        inet_ntop(AF_INET6, event.daddr), event.ports & 0xffffffff,
        event.tx_b / 1024, event.rx_b / 1024, float(event.span_us) / 1000))

# initialize BPF
b = BPF(text=bpf_text)

# header
if args.time:
    if args.csv:
        print("%s," % ("TIME"), end="")
    else:
        print("%-8s " % ("TIME"), end="")
if args.timestamp:
    if args.csv:
        print("%s," % ("TIME(s)"), end="")
    else:
        print("%-9s " % ("TIME(s)"), end="")
print(header_string % ("PID", "COMM",
    "IP" if args.wide or args.csv else "", "LADDR",
    "LPORT", "RADDR", "RPORT", "TX_KB", "RX_KB", "MS"))

start_ts = 0

# read events
b["ipv4_events"].open_perf_buffer(print_ipv4_event, page_cnt=64)
b["ipv6_events"].open_perf_buffer(print_ipv6_event, page_cnt=64)
while 1:
    try:
        b.perf_buffer_poll()
    except KeyboardInterrupt:
        exit()