#!/usr/bin/env bcc-lua
--[[
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

18-Mar-2017	Simon Liu Created this.
--]]

local ffi = require("ffi")
local bit = require("bit")

ffi.cdef[[
const char *inet_ntop(int af, const void *src, char *dst, int size);
uint16_t ntohs(uint16_t netshort);
]]

-- BPF C program. The FILTER_PID / FILTER_LPORT / FILTER_DPORT placeholders
-- are replaced by parse_arg() with C snippets (or with nothing) before the
-- program text is handed to BPF:new.
local program = [[
#include <uapi/linux/ptrace.h>
#include <linux/tcp.h>
#include <net/sock.h>
#include <bcc/proto.h>

BPF_HASH(birth, struct sock *, u64);

// separate data structs for ipv4 and ipv6
struct ipv4_data_t {
    // XXX: switch some to u32's when supported
    u64 ts_us;
    u64 pid;
    u64 saddr;
    u64 daddr;
    u64 ports;
    u64 rx_b;
    u64 tx_b;
    u64 span_us;
    char task[TASK_COMM_LEN];
};
BPF_PERF_OUTPUT(ipv4_events);

struct ipv6_data_t {
    u64 ts_us;
    u64 pid;
    u64 saddr[2];
    u64 daddr[2];
    u64 ports;
    u64 rx_b;
    u64 tx_b;
    u64 span_us;
    char task[TASK_COMM_LEN];
};
BPF_PERF_OUTPUT(ipv6_events);

struct id_t {
    u32 pid;
    char task[TASK_COMM_LEN];
};
BPF_HASH(whoami, struct sock *, struct id_t);

int trace_tcp_set_state(struct pt_regs *ctx, struct sock *sk, int state)
{
    u32 pid = bpf_get_current_pid_tgid() >> 32;

    // lport is either used in a filter here, or later
    u16 lport = sk->__sk_common.skc_num;
    FILTER_LPORT

    // dport is either used in a filter here, or later
    u16 dport = sk->__sk_common.skc_dport;
    FILTER_DPORT

    /*
     * This tool includes PID and comm context. It's best effort, and may
     * be wrong in some situations. It currently works like this:
     * - record timestamp on any state < TCP_FIN_WAIT1
     * - cache task context on:
     *       TCP_SYN_SENT: tracing from client
     *       TCP_LAST_ACK: client-closed from server
     * - do output on TCP_CLOSE:
     *       fetch task context if cached, or use current task
     */

    // capture birth time
    if (state < TCP_FIN_WAIT1) {
        /*
         * Matching just ESTABLISHED may be sufficient, provided no code-path
         * sets ESTABLISHED without a tcp_set_state() call. Until we know
         * that for sure, match all early states to increase chances a
         * timestamp is set.
         * Note that this needs to be set before the PID filter later on,
         * since the PID isn't reliable for these early stages, so we must
         * save all timestamps and do the PID filter later when we can.
         */
        u64 ts = bpf_ktime_get_ns();
        birth.update(&sk, &ts);
    }

    // record PID & comm on SYN_SENT
    if (state == TCP_SYN_SENT || state == TCP_LAST_ACK) {
        // now we can PID filter, both here and a little later on for CLOSE
        FILTER_PID
        struct id_t me = {.pid = pid};
        bpf_get_current_comm(&me.task, sizeof(me.task));
        whoami.update(&sk, &me);
    }

    if (state != TCP_CLOSE)
        return 0;

    // calculate lifespan
    u64 *tsp, delta_us;
    tsp = birth.lookup(&sk);
    if (tsp == 0) {
        whoami.delete(&sk);     // may not exist
        return 0;               // missed create
    }
    delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;
    birth.delete(&sk);

    // fetch possible cached data, and filter
    struct id_t *mep;
    mep = whoami.lookup(&sk);
    if (mep != 0)
        pid = mep->pid;
    FILTER_PID

    // get throughput stats. see tcp_get_info().
    u64 rx_b = 0, tx_b = 0, sport = 0;
    struct tcp_sock *tp = (struct tcp_sock *)sk;
    rx_b = tp->bytes_received;
    tx_b = tp->bytes_acked;

    u16 family = sk->__sk_common.skc_family;

    if (family == AF_INET) {
        struct ipv4_data_t data4 = {.span_us = delta_us,
            .rx_b = rx_b, .tx_b = tx_b};
        data4.ts_us = bpf_ktime_get_ns() / 1000;
        data4.saddr = sk->__sk_common.skc_rcv_saddr;
        data4.daddr = sk->__sk_common.skc_daddr;
        // a workaround until data4 compiles with separate lport/dport
        data4.pid = pid;
        data4.ports = ntohs(dport) + ((0ULL + lport) << 32);
        if (mep == 0) {
            bpf_get_current_comm(&data4.task, sizeof(data4.task));
        } else {
            bpf_probe_read_kernel(&data4.task, sizeof(data4.task), (void *)mep->task);
        }
        ipv4_events.perf_submit(ctx, &data4, sizeof(data4));

    } else /* 6 */ {
        struct ipv6_data_t data6 = {.span_us = delta_us,
            .rx_b = rx_b, .tx_b = tx_b};
        data6.ts_us = bpf_ktime_get_ns() / 1000;
        bpf_probe_read_kernel(&data6.saddr, sizeof(data6.saddr),
            sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
        bpf_probe_read_kernel(&data6.daddr, sizeof(data6.daddr),
            sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
        // a workaround until data6 compiles with separate lport/dport
        data6.ports = ntohs(dport) + ((0ULL + lport) << 32);
        data6.pid = pid;
        if (mep == 0) {
            bpf_get_current_comm(&data6.task, sizeof(data6.task));
        } else {
            bpf_probe_read_kernel(&data6.task, sizeof(data6.task), (void *)mep->task);
        }
        ipv6_events.perf_submit(ctx, &data6, sizeof(data6));
    }

    if (mep != 0)
        whoami.delete(&sk);

    return 0;
}
]]

-- Set to true to print the final BPF program text before loading it.
-- (Deliberately not named `debug`, which would shadow the standard library.)
local debug_mode = false
-- Timestamp (us) of the first event seen; 0 until then. Used for -t output.
local start_ts = 0

-- Buffer sizes for inet_ntop(): longest textual address plus the NUL byte.
local inet_addresslen = #"255.255.255.255" + 1
local inet6_addresslen = #"ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255" + 1
local AF_INET = 2
local AF_INET6 = 10

-- Column layouts; parse_arg() swaps these for the wide / csv variants.
local header_string = "%-5s %-10.10s %s%-15s %-5s %-15s %-5s %5s %5s %s"
local format_string = "%-5d %-10.10s %s%-15s %-5d %-15s %-5d %5d %5d %.2f"
local ip_string = ""
local ip_version = false
local arg_timestamp = false
local arg_csv = false
local arg_time = false

local examples = [[examples:
    ./tcplife           # trace all TCP connect()s
    ./tcplife -T        # include time column (HH:MM:SS)
    ./tcplife -w        # wider columns (fit IPv6)
    ./tcplife -stT      # csv output, with times & timestamps
    ./tcplife -p 181    # only trace PID 181
    ./tcplife -L 80     # only trace local port 80
    ./tcplife -L 80,81  # only trace local ports 80 and 81
    ./tcplife -D 80     # only trace remote port 80
]]

--- Split a string into its non-empty fields.
-- @tparam string str text to split
-- @tparam[opt=","] string sep set of separator characters
-- @treturn table sequence of the fields, in order
local function split(str, sep)
  sep = sep or ","
  -- Escape the characters that are special inside a pattern character class.
  local class = sep:gsub("[%^%]%-%%]", "%%%0")
  local fields = {}
  for field in string.gmatch(str, "([^" .. class .. "]+)") do
    fields[#fields + 1] = field
  end
  return fields
end

--- Render a binary address as text using the C library's inet_ntop().
-- @tparam number af AF_INET or AF_INET6
-- @param addr for AF_INET a single 64-bit value whose first four bytes in
--   memory are the address; for AF_INET6 a table of two 64-bit values
-- @tparam number len size of the output buffer, including the NUL byte
-- @treturn string textual address, or "" if the conversion failed
local function inet_ntop(af, addr, len)
  local addr_dst = ffi.new("char[?]", len)
  local addr_src
  if af == AF_INET then
    addr_src = ffi.new("uint64_t[1]", addr)
  else
    addr_src = ffi.new("uint64_t[2]", addr)
  end
  if ffi.C.inet_ntop(af, addr_src, addr_dst, len) == nil then
    return ""
  end
  -- Stop at the NUL terminator rather than copying the whole buffer.
  return ffi.string(addr_dst)
end

--- Parse a command-line port number.
-- @tparam string text port as typed by the user
-- @treturn number integer port in 0..65535
-- Raises an error for anything else.
local function parse_port(text)
  local port = tonumber(text)
  if not port or port % 1 ~= 0 or port < 0 or port > 65535 then
    error("invalid port: " .. tostring(text), 0)
  end
  return port
end

--- Byte-swap a 16-bit port between host and network order.
-- @param port port number (string or number)
-- @treturn number swapped value as returned by ntohs()
local function inet_ntohs(port)
  return ffi.C.ntohs(parse_port(port))
end

--- Build the C snippet that drops sockets whose port is not in `ports`.
-- @tparam string var name of the C variable to test ("lport" or "dport")
-- @tparam table ports sequence of numbers, already in the byte order of `var`
-- @treturn string C `if` statement
local function port_filter(var, ports)
  if #ports == 0 then
    error("empty port list", 0)
  end
  local conds = {}
  for i, port in ipairs(ports) do
    conds[i] = string.format("%s != %d", var, port)
  end
  return string.format("if (%s) { birth.delete(&sk); return 0; }",
    table.concat(conds, " && "))
end

--- Print one output row (shared by the IPv4 and IPv6 callbacks).
-- @param event perf event record (see the struct layouts passed to
--   open_perf_buffer)
-- @tparam string version "4" or "6", shown only when the IP column is enabled
-- @tparam string saddr textual local address
-- @tparam string daddr textual remote address
local function print_event(event, version, saddr, daddr)
  local event_pid = tonumber(event.pid)
  local event_task = ffi.string(event.task)
  local event_ports = tonumber(event.ports)
  local event_tx_b = tonumber(event.tx_b)
  local event_rx_b = tonumber(event.rx_b)
  local event_span_us = tonumber(event.span_us)
  local event_ts_us = tonumber(event.ts_us)

  -- The BPF side packs the ports as (lport << 32) + ntohs(dport). LuaJIT's
  -- bit library is 32-bit and masks shift counts to 5 bits, so the high
  -- half has to be extracted arithmetically.
  local lport = math.floor(event_ports / 0x100000000)
  local dport = bit.band(event_ports, 0xffff)

  if arg_time then
    local now = os.date("%H:%M:%S")
    if arg_csv then
      io.write(string.format("%s,", now))
    else
      io.write(string.format("%-8s ", now))
    end
  end

  if arg_timestamp then
    if start_ts == 0 then
      start_ts = event_ts_us
    end
    local delta_s = (event_ts_us - start_ts) / 1000000
    if arg_csv then
      io.write(string.format("%.6f,", delta_s))
    else
      io.write(string.format("%-9.6f ", delta_s))
    end
  end

  local iv = ""
  if ip_version then
    iv = version
  end

  print(string.format(format_string, event_pid, event_task, iv,
    saddr, lport, daddr, dport,
    (event_tx_b / 1024), (event_rx_b / 1024), event_span_us / 1000))
end

--- Perf-buffer callback for IPv4 session records.
-- @param cpu CPU the event came from (unused)
-- @param event ipv4 event record
local function print_ipv4_event(cpu, event)
  local saddr = inet_ntop(AF_INET, event.saddr, inet_addresslen)
  local daddr = inet_ntop(AF_INET, event.daddr, inet_addresslen)
  print_event(event, "4", saddr, daddr)
end

--- Perf-buffer callback for IPv6 session records.
-- @param cpu CPU the event came from (unused)
-- @param event ipv6 event record
local function print_ipv6_event(cpu, event)
  -- Keep the halves as 64-bit cdata: converting them to Lua numbers would
  -- lose the low bits of the address.
  local saddr = inet_ntop(AF_INET6,
    {event.saddr[0], event.saddr[1]}, inet6_addresslen)
  local daddr = inet_ntop(AF_INET6,
    {event.daddr[0], event.daddr[1]}, inet6_addresslen)
  print_event(event, "6", saddr, daddr)
end

--- Parse the command line, specialise the BPF program and output formats,
-- and write the optional TIME / TIME(s) header prefixes.
-- @param utils bcc-lua utilities object; only `utils.argparse` is used
local function parse_arg(utils)
  local parser = utils.argparse("tcplife",
    "Trace the lifespan of TCP sessions and summarize", examples)

  parser:flag("-T --time", "include time column on output (HH:MM:SS)")
  parser:flag("-t --timestamp", "include timestamp on output (seconds)")
  parser:flag("-w --wide", "wide column output (fits IPv6 addresses)")
  parser:flag("-s --csv", "comma separated values output")
  parser:option("-p --pid", "trace this PID only"):convert(tonumber)
  parser:option("-L --localport", "comma-separated list of local ports to trace.")
  parser:option("-D --remoteport", "comma-separated list of remote ports to trace.")

  local args = parser:parse()

  -- gsub returns (string, count); the parentheses keep only the string.
  local function substitute(placeholder, code)
    program = (program:gsub(placeholder, code))
  end

  if args.pid then
    substitute('FILTER_PID',
      string.format('if (pid != %d) { return 0; }', args.pid))
  end

  if args.remoteport then
    -- dport is compared in network byte order inside the BPF program.
    local dports = {}
    for i, d in ipairs(split(args.remoteport, ",")) do
      dports[i] = inet_ntohs(d)
    end
    substitute('FILTER_DPORT', port_filter("dport", dports))
  end

  if args.localport then
    -- lport comes from skc_num, which is already in host byte order.
    local lports = {}
    for i, l in ipairs(split(args.localport, ",")) do
      lports[i] = parse_port(l)
    end
    substitute('FILTER_LPORT', port_filter("lport", lports))
  end

  -- Remove whichever placeholders were not requested.
  substitute('FILTER_PID', '')
  substitute('FILTER_DPORT', '')
  substitute('FILTER_LPORT', '')

  if args.wide then
    header_string = "%-5s %-16.16s %-2s %-39s %-5s %-39s %-5s %6s %6s %s"
    format_string = "%-5d %-16.16s %-2s %-39s %-5s %-39s %-5d %6d %6d %.2f"
    ip_string = "IP"
    ip_version = true
  end
  if args.csv then
    header_string = "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s"
    format_string = "%d,%s,%s,%s,%s,%s,%d,%d,%d,%.2f"
    ip_string = "IP"
    ip_version = true
    arg_csv = true
  end

  if args.time then
    arg_time = true
    if args.csv then
      io.write(string.format("%s,", "TIME"))
    else
      io.write(string.format("%-8s ", "TIME"))
    end
  end

  if args.timestamp then
    arg_timestamp = true
    if args.csv then
      io.write(string.format("%s,", "TIME(s)"))
    else
      io.write(string.format("%-9s ", "TIME(s)"))
    end
  end
end

--- Tool entry point invoked by bcc-lua.
-- Loads the BPF program, attaches it to tcp_set_state, prints the header and
-- then polls the perf buffers, printing one row per closed TCP session.
-- @param BPF bcc-lua BPF class
-- @param utils bcc-lua utilities object
return function(BPF, utils)
  parse_arg(utils)
  if debug_mode then
    print(program)
  end

  local bpf = BPF:new{text=program}
  bpf:attach_kprobe{event="tcp_set_state", fn_name="trace_tcp_set_state"}
  print(string.format(header_string, "PID", "COMM",
    ip_string, "LADDR",
    "LPORT", "RADDR", "RPORT", "TX_KB", "RX_KB", "MS"))
  local TASK_COMM_LEN = 16 -- linux/sched.h
  bpf:get_table("ipv4_events"):open_perf_buffer(print_ipv4_event, [[
    struct {
      uint64_t ts_us;
      uint64_t pid;
      uint64_t saddr;
      uint64_t daddr;
      uint64_t ports;
      uint64_t rx_b;
      uint64_t tx_b;
      uint64_t span_us;
      char task[$];
    }
  ]], {TASK_COMM_LEN}, 64)
  bpf:get_table("ipv6_events"):open_perf_buffer(print_ipv6_event, [[
    struct {
      uint64_t ts_us;
      uint64_t pid;
      uint64_t saddr[2];
      uint64_t daddr[2];
      uint64_t ports;
      uint64_t rx_b;
      uint64_t tx_b;
      uint64_t span_us;
      char task[$];
    }
  ]], {TASK_COMM_LEN}, 64)

  bpf:perf_buffer_poll_loop()
end