1#!/usr/bin/env python 2# @lint-avoid-python-3-compatibility-imports 3# 4# tcpconnlat Trace TCP active connection latency (connect). 5# For Linux, uses BCC, eBPF. Embedded C. 6# 7# USAGE: tcpconnlat [-h] [-t] [-p PID] [-4 | -6] 8# 9# This uses dynamic tracing of kernel functions, and will need to be updated 10# to match kernel changes. 11# 12# Copyright 2016 Netflix, Inc. 13# Licensed under the Apache License, Version 2.0 (the "License") 14# 15# 19-Feb-2016 Brendan Gregg Created this. 16# 15-Mar-2021 Suresh Kumar Added LPORT option 17 18from __future__ import print_function 19from bcc import BPF 20from socket import inet_ntop, AF_INET, AF_INET6 21from struct import pack 22import argparse 23 24# arg validation 25def positive_float(val): 26 try: 27 ival = float(val) 28 except ValueError: 29 raise argparse.ArgumentTypeError("must be a float") 30 31 if ival < 0: 32 raise argparse.ArgumentTypeError("must be positive") 33 return ival 34 35# arguments 36examples = """examples: 37 ./tcpconnlat # trace all TCP connect()s 38 ./tcpconnlat 1 # trace connection latency slower than 1 ms 39 ./tcpconnlat 0.1 # trace connection latency slower than 100 us 40 ./tcpconnlat -t # include timestamps 41 ./tcpconnlat -p 181 # only trace PID 181 42 ./tcpconnlat -L # include LPORT while printing outputs 43 ./tcpconnlat -4 # trace IPv4 family only 44 ./tcpconnlat -6 # trace IPv6 family only 45""" 46parser = argparse.ArgumentParser( 47 description="Trace TCP connects and show connection latency", 48 formatter_class=argparse.RawDescriptionHelpFormatter, 49 epilog=examples) 50parser.add_argument("-t", "--timestamp", action="store_true", 51 help="include timestamp on output") 52parser.add_argument("-p", "--pid", 53 help="trace this PID only") 54parser.add_argument("-L", "--lport", action="store_true", 55 help="include LPORT on output") 56group = parser.add_mutually_exclusive_group() 57group.add_argument("-4", "--ipv4", action="store_true", 58 help="trace IPv4 family only") 59group.add_argument("-6", "--ipv6", action="store_true", 60 help="trace IPv6 family only") 61parser.add_argument("duration_ms", nargs="?", default=0, 62 type=positive_float, 63 help="minimum duration to trace (ms)") 64parser.add_argument("-v", "--verbose", action="store_true", 65 help="print the BPF program for debugging purposes") 66parser.add_argument("--ebpf", action="store_true", 67 help=argparse.SUPPRESS) 68args = parser.parse_args() 69 70if args.duration_ms: 71 # support fractions but round to nearest microsecond 72 duration_us = int(args.duration_ms * 1000) 73else: 74 duration_us = 0 # default is show all 75 76debug = 0 77 78# define BPF program 79bpf_text = """ 80#include <uapi/linux/ptrace.h> 81#include <net/sock.h> 82#include <net/tcp_states.h> 83#include <bcc/proto.h> 84 85struct info_t { 86 u64 ts; 87 u32 pid; 88 char task[TASK_COMM_LEN]; 89}; 90BPF_HASH(start, struct sock *, struct info_t); 91 92// separate data structs for ipv4 and ipv6 93struct ipv4_data_t { 94 u64 ts_us; 95 u32 pid; 96 u32 saddr; 97 u32 daddr; 98 u64 ip; 99 u16 lport; 100 u16 dport; 101 u64 delta_us; 102 char task[TASK_COMM_LEN]; 103}; 104BPF_PERF_OUTPUT(ipv4_events); 105 106struct ipv6_data_t { 107 u64 ts_us; 108 u32 pid; 109 unsigned __int128 saddr; 110 unsigned __int128 daddr; 111 u64 ip; 112 u16 lport; 113 u16 dport; 114 u64 delta_us; 115 char task[TASK_COMM_LEN]; 116}; 117BPF_PERF_OUTPUT(ipv6_events); 118 119int trace_connect(struct pt_regs *ctx, struct sock *sk) 120{ 121 u32 pid = bpf_get_current_pid_tgid() >> 32; 122 FILTER 123 struct info_t info = {.pid = pid}; 124 info.ts = bpf_ktime_get_ns(); 125 bpf_get_current_comm(&info.task, sizeof(info.task)); 126 start.update(&sk, &info); 127 return 0; 128}; 129 130// See tcp_v4_do_rcv() and tcp_v6_do_rcv(). So TCP_ESTBALISHED and TCP_LISTEN 131// are fast path and processed elsewhere, and leftovers are processed by 132// tcp_rcv_state_process(). We can trace this for handshake completion. 133// This should all be switched to static tracepoints when available. 134int trace_tcp_rcv_state_process(struct pt_regs *ctx, struct sock *skp) 135{ 136 // will be in TCP_SYN_SENT for handshake 137 if (skp->__sk_common.skc_state != TCP_SYN_SENT) 138 return 0; 139 140 // check start and calculate delta 141 struct info_t *infop = start.lookup(&skp); 142 if (infop == 0) { 143 return 0; // missed entry or filtered 144 } 145 146 u64 ts = infop->ts; 147 u64 now = bpf_ktime_get_ns(); 148 149 u64 delta_us = (now - ts) / 1000ul; 150 151#ifdef MIN_LATENCY 152 if ( delta_us < DURATION_US ) { 153 return 0; // connect latency is below latency filter minimum 154 } 155#endif 156 157 // pull in details 158 u16 family = 0, lport = 0, dport = 0; 159 family = skp->__sk_common.skc_family; 160 lport = skp->__sk_common.skc_num; 161 dport = skp->__sk_common.skc_dport; 162 163 // emit to appropriate data path 164 if (family == AF_INET) { 165 struct ipv4_data_t data4 = {.pid = infop->pid, .ip = 4}; 166 data4.ts_us = now / 1000; 167 data4.saddr = skp->__sk_common.skc_rcv_saddr; 168 data4.daddr = skp->__sk_common.skc_daddr; 169 data4.lport = lport; 170 data4.dport = ntohs(dport); 171 data4.delta_us = delta_us; 172 __builtin_memcpy(&data4.task, infop->task, sizeof(data4.task)); 173 ipv4_events.perf_submit(ctx, &data4, sizeof(data4)); 174 175 } else /* AF_INET6 */ { 176 struct ipv6_data_t data6 = {.pid = infop->pid, .ip = 6}; 177 data6.ts_us = now / 1000; 178 bpf_probe_read_kernel(&data6.saddr, sizeof(data6.saddr), 179 skp->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32); 180 bpf_probe_read_kernel(&data6.daddr, sizeof(data6.daddr), 181 skp->__sk_common.skc_v6_daddr.in6_u.u6_addr32); 182 data6.lport = lport; 183 data6.dport = ntohs(dport); 184 data6.delta_us = delta_us; 185 __builtin_memcpy(&data6.task, infop->task, sizeof(data6.task)); 186 ipv6_events.perf_submit(ctx, &data6, sizeof(data6)); 187 } 188 189 start.delete(&skp); 190 191 return 0; 192} 193""" 194 195if duration_us > 0: 196 bpf_text = "#define MIN_LATENCY\n" + bpf_text 197 bpf_text = bpf_text.replace('DURATION_US', str(duration_us)) 198 199# code substitutions 200if args.pid: 201 bpf_text = bpf_text.replace('FILTER', 202 'if (pid != %s) { return 0; }' % args.pid) 203else: 204 bpf_text = bpf_text.replace('FILTER', '') 205if debug or args.verbose or args.ebpf: 206 print(bpf_text) 207 if args.ebpf: 208 exit() 209 210# initialize BPF 211b = BPF(text=bpf_text) 212if args.ipv4: 213 b.attach_kprobe(event="tcp_v4_connect", fn_name="trace_connect") 214elif args.ipv6: 215 b.attach_kprobe(event="tcp_v6_connect", fn_name="trace_connect") 216else: 217 b.attach_kprobe(event="tcp_v4_connect", fn_name="trace_connect") 218 b.attach_kprobe(event="tcp_v6_connect", fn_name="trace_connect") 219 220b.attach_kprobe(event="tcp_rcv_state_process", 221 fn_name="trace_tcp_rcv_state_process") 222 223# process event 224start_ts = 0 225 226def print_ipv4_event(cpu, data, size): 227 event = b["ipv4_events"].event(data) 228 global start_ts 229 if args.timestamp: 230 if start_ts == 0: 231 start_ts = event.ts_us 232 print("%-9.3f" % ((float(event.ts_us) - start_ts) / 1000000), end="") 233 if args.lport: 234 print("%-7d %-12.12s %-2d %-16s %-6d %-16s %-5d %.2f" % (event.pid, 235 event.task.decode('utf-8', 'replace'), event.ip, 236 inet_ntop(AF_INET, pack("I", event.saddr)), event.lport, 237 inet_ntop(AF_INET, pack("I", event.daddr)), event.dport, 238 float(event.delta_us) / 1000)) 239 else: 240 print("%-7d %-12.12s %-2d %-16s %-16s %-5d %.2f" % (event.pid, 241 event.task.decode('utf-8', 'replace'), event.ip, 242 inet_ntop(AF_INET, pack("I", event.saddr)), 243 inet_ntop(AF_INET, pack("I", event.daddr)), event.dport, 244 float(event.delta_us) / 1000)) 245 246def print_ipv6_event(cpu, data, size): 247 event = b["ipv6_events"].event(data) 248 global start_ts 249 if args.timestamp: 250 if start_ts == 0: 251 start_ts = event.ts_us 252 print("%-9.3f" % ((float(event.ts_us) - start_ts) / 1000000), end="") 253 if args.lport: 254 print("%-7d %-12.12s %-2d %-16s %-6d %-16s %-5d %.2f" % (event.pid, 255 event.task.decode('utf-8', 'replace'), event.ip, 256 inet_ntop(AF_INET6, event.saddr), event.lport, 257 inet_ntop(AF_INET6, event.daddr), 258 event.dport, float(event.delta_us) / 1000)) 259 else: 260 print("%-7d %-12.12s %-2d %-16s %-16s %-5d %.2f" % (event.pid, 261 event.task.decode('utf-8', 'replace'), event.ip, 262 inet_ntop(AF_INET6, event.saddr), inet_ntop(AF_INET6, event.daddr), 263 event.dport, float(event.delta_us) / 1000)) 264 265# header 266if args.timestamp: 267 print("%-9s" % ("TIME(s)"), end="") 268if args.lport: 269 print("%-7s %-12s %-2s %-16s %-6s %-16s %-5s %s" % ("PID", "COMM", 270 "IP", "SADDR", "LPORT", "DADDR", "DPORT", "LAT(ms)")) 271else: 272 print("%-7s %-12s %-2s %-16s %-16s %-5s %s" % ("PID", "COMM", "IP", 273 "SADDR", "DADDR", "DPORT", "LAT(ms)")) 274 275# read events 276if args.ipv4: 277 b["ipv4_events"].open_perf_buffer(print_ipv4_event) 278elif args.ipv6: 279 b["ipv6_events"].open_perf_buffer(print_ipv6_event) 280else: 281 b["ipv4_events"].open_perf_buffer(print_ipv4_event) 282 b["ipv6_events"].open_perf_buffer(print_ipv6_event) 283 284while 1: 285 try: 286 b.perf_buffer_poll() 287 except KeyboardInterrupt: 288 exit() 289