#!/usr/bin/env python
# @lint-avoid-python-3-compatibility-imports
#
# tcplife   Trace the lifespan of TCP sessions and summarize.
#           For Linux, uses BCC, BPF. Embedded C.
#
# USAGE: tcplife [-h] [-T] [-t] [-w] [-s] [-p PID] [-L LOCALPORT]
#                [-D REMOTEPORT] [-4 | -6]
#
# This uses the sock:inet_sock_set_state tracepoint if it exists (added to
# Linux 4.16, and replacing the earlier tcp:tcp_set_state), else it uses
# kernel dynamic tracing of tcp_set_state().
#
# While throughput counters are emitted, they are fetched in a low-overhead
# manner: reading members of the tcp_info struct on TCP close. That is, we do
# not trace send/receive.
#
# Copyright 2016 Netflix, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# IDEA: Julia Evans
#
# 18-Oct-2016   Brendan Gregg   Created this.
# 29-Dec-2017      "      "     Added tracepoint support.

from __future__ import print_function
from bcc import BPF
import argparse
from socket import inet_ntop, AF_INET, AF_INET6
from struct import pack
from time import strftime

# arguments
examples = """examples:
    ./tcplife           # trace all TCP session lifespans
    ./tcplife -T        # include time column (HH:MM:SS)
    ./tcplife -w        # wider columns (fit IPv6)
    ./tcplife -stT      # csv output, with times & timestamps
    ./tcplife -p 181    # only trace PID 181
    ./tcplife -L 80     # only trace local port 80
    ./tcplife -L 80,81  # only trace local ports 80 and 81
    ./tcplife -D 80     # only trace remote port 80
    ./tcplife -4        # only trace IPv4 family
    ./tcplife -6        # only trace IPv6 family
"""
parser = argparse.ArgumentParser(
    description="Trace the lifespan of TCP sessions and summarize",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
parser.add_argument("-T", "--time", action="store_true",
    help="include time column on output (HH:MM:SS)")
parser.add_argument("-t", "--timestamp", action="store_true",
    help="include timestamp on output (seconds)")
parser.add_argument("-w", "--wide", action="store_true",
    help="wide column output (fits IPv6 addresses)")
parser.add_argument("-s", "--csv", action="store_true",
    help="comma-separated values output")
parser.add_argument("-p", "--pid",
    help="trace this PID only")
parser.add_argument("-L", "--localport",
    help="comma-separated list of local ports to trace")
parser.add_argument("-D", "--remoteport",
    help="comma-separated list of remote ports to trace")
group = parser.add_mutually_exclusive_group()
group.add_argument("-4", "--ipv4", action="store_true",
    help="trace IPv4 family only")
group.add_argument("-6", "--ipv6", action="store_true",
    help="trace IPv6 family only")
parser.add_argument("--ebpf", action="store_true",
    help=argparse.SUPPRESS)
args = parser.parse_args()
debug = 0

# define BPF program
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/tcp.h>
#include <net/sock.h>
#include <bcc/proto.h>

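// birth records the session start timestamp (in ns), keyed by struct sock *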
BPF_HASH(birth, struct sock *, u64);

// separate data structs for ipv4 and ipv6
struct ipv4_data_t {
    u64 ts_us;
    u32 pid;
    u32 saddr;
    u32 daddr;
    u64 ports;
    u64 rx_b;
    u64 tx_b;
    u64 span_us;
    char task[TASK_COMM_LEN];
};
BPF_PERF_OUTPUT(ipv4_events);

struct ipv6_data_t {
    u64 ts_us;
    u32 pid;
    unsigned __int128 saddr;
    unsigned __int128 daddr;
    u64 ports;
    u64 rx_b;
    u64 tx_b;
    u64 span_us;
    char task[TASK_COMM_LEN];
};
BPF_PERF_OUTPUT(ipv6_events);

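// whoami caches the PID and comm captured at SYN_SENT or LAST_ACK, keyed by
// struct sock *, so the task context is still available at close time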
struct id_t {
    u32 pid;
    char task[TASK_COMM_LEN];
};
BPF_HASH(whoami, struct sock *, struct id_t);
"""

#
# XXX: The following is temporary code for older kernels, Linux 4.14 and
# older. It uses kprobes to instrument tcp_set_state(). On Linux 4.16 and
# later, the sock:inet_sock_set_state tracepoint should be used instead, as
# is done by the code that follows this. In the distant future (2021?), this
# kprobe code can be removed. This is why there is so much code
# duplication: to make removal easier.
#
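# (the choice between the kprobe and tracepoint programs is made below via
# BPF.tracepoint_exists("sock", "inet_sock_set_state").)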
bpf_text_kprobe = """
int kprobe__tcp_set_state(struct pt_regs *ctx, struct sock *sk, int state)
{
    u32 pid = bpf_get_current_pid_tgid() >> 32;

    // lport is either used in a filter here, or later
    u16 lport = sk->__sk_common.skc_num;
    FILTER_LPORT

    // dport is either used in a filter here, or later
    u16 dport = sk->__sk_common.skc_dport;
    dport = ntohs(dport);
    FILTER_DPORT

    /*
     * This tool includes PID and comm context. It's best effort, and may
     * be wrong in some situations. It currently works like this:
     * - record timestamp on any state < TCP_FIN_WAIT1
     * - cache task context on:
     *       TCP_SYN_SENT: tracing from client
     *       TCP_LAST_ACK: client-closed from server
     * - do output on TCP_CLOSE:
     *       fetch task context if cached, or use current task
     */

    // capture birth time
    if (state < TCP_FIN_WAIT1) {
        /*
         * Matching just ESTABLISHED may be sufficient, provided no code path
         * sets ESTABLISHED without a tcp_set_state() call. Until we know
         * that for sure, match all early states to increase the chances
         * that a timestamp is set.
         * Note that the timestamp must be saved before the PID filter
         * below: the PID isn't reliable during these early states, so we
         * save all timestamps now and apply the PID filter later, when
         * we can.
         */
        u64 ts = bpf_ktime_get_ns();
        birth.update(&sk, &ts);
    }

    // record PID & comm on SYN_SENT or LAST_ACK
    if (state == TCP_SYN_SENT || state == TCP_LAST_ACK) {
        // now we can PID filter, both here and a little later on for CLOSE
        FILTER_PID
        struct id_t me = {.pid = pid};
        bpf_get_current_comm(&me.task, sizeof(me.task));
        whoami.update(&sk, &me);
    }

    if (state != TCP_CLOSE)
        return 0;

    // calculate lifespan
    u64 *tsp, delta_us;
    tsp = birth.lookup(&sk);
    if (tsp == 0) {
        whoami.delete(&sk);     // may not exist
        return 0;               // missed create
    }
    delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;
    birth.delete(&sk);

    // fetch possible cached data, and filter
    struct id_t *mep;
    mep = whoami.lookup(&sk);
    if (mep != 0)
        pid = mep->pid;
    FILTER_PID

    // get throughput stats. see tcp_get_info().
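    // (bytes_received and bytes_acked are the same counters that
    // tcp_get_info() exports as tcpi_bytes_received and tcpi_bytes_acked;
    // reading them once at close avoids tracing every send/receive.)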
    u64 rx_b = 0, tx_b = 0;
    struct tcp_sock *tp = (struct tcp_sock *)sk;
    rx_b = tp->bytes_received;
    tx_b = tp->bytes_acked;

    u16 family = sk->__sk_common.skc_family;

    FILTER_FAMILY

    if (family == AF_INET) {
        struct ipv4_data_t data4 = {};
        data4.span_us = delta_us;
        data4.rx_b = rx_b;
        data4.tx_b = tx_b;
        data4.ts_us = bpf_ktime_get_ns() / 1000;
        data4.saddr = sk->__sk_common.skc_rcv_saddr;
        data4.daddr = sk->__sk_common.skc_daddr;
        // a workaround until data4 compiles with separate lport/dport
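        // lport is packed into the upper 32 bits of data4.ports and dport
        // into the lower 32; user space splits them back out as
        // ports >> 32 and ports & 0xffffffff.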
        data4.pid = pid;
        data4.ports = dport + ((0ULL + lport) << 32);
        if (mep == 0) {
            bpf_get_current_comm(&data4.task, sizeof(data4.task));
        } else {
            bpf_probe_read_kernel(&data4.task, sizeof(data4.task), (void *)mep->task);
        }
        ipv4_events.perf_submit(ctx, &data4, sizeof(data4));

    } else /* 6 */ {
        struct ipv6_data_t data6 = {};
        data6.span_us = delta_us;
        data6.rx_b = rx_b;
        data6.tx_b = tx_b;
        data6.ts_us = bpf_ktime_get_ns() / 1000;
        bpf_probe_read_kernel(&data6.saddr, sizeof(data6.saddr),
            sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
        bpf_probe_read_kernel(&data6.daddr, sizeof(data6.daddr),
            sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
        // a workaround until data6 compiles with separate lport/dport
        data6.ports = dport + ((0ULL + lport) << 32);
        data6.pid = pid;
        if (mep == 0) {
            bpf_get_current_comm(&data6.task, sizeof(data6.task));
        } else {
            bpf_probe_read_kernel(&data6.task, sizeof(data6.task), (void *)mep->task);
        }
        ipv6_events.perf_submit(ctx, &data6, sizeof(data6));
    }

    if (mep != 0)
        whoami.delete(&sk);

    return 0;
}
"""

bpf_text_tracepoint = """
TRACEPOINT_PROBE(sock, inet_sock_set_state)
{
    if (args->protocol != IPPROTO_TCP)
        return 0;

    u32 pid = bpf_get_current_pid_tgid() >> 32;
    // sk is mostly used as a UUID, and for two tcp stats:
    struct sock *sk = (struct sock *)args->skaddr;

    // lport is either used in a filter here, or later
    u16 lport = args->sport;
    FILTER_LPORT

    // dport is either used in a filter here, or later
    u16 dport = args->dport;
    FILTER_DPORT

    /*
     * This tool includes PID and comm context. It's best effort, and may
     * be wrong in some situations. It currently works like this:
     * - record timestamp on any state < TCP_FIN_WAIT1
     * - cache task context on:
     *       TCP_SYN_SENT: tracing from client
     *       TCP_LAST_ACK: client-closed from server
     * - do output on TCP_CLOSE:
     *       fetch task context if cached, or use current task
     */

    // capture birth time
    if (args->newstate < TCP_FIN_WAIT1) {
        /*
         * Matching just ESTABLISHED may be sufficient, provided no code path
         * sets ESTABLISHED without a tcp_set_state() call. Until we know
         * that for sure, match all early states to increase the chances
         * that a timestamp is set.
         * Note that the timestamp must be saved before the PID filter
         * below: the PID isn't reliable during these early states, so we
         * save all timestamps now and apply the PID filter later, when
         * we can.
         */
        u64 ts = bpf_ktime_get_ns();
        birth.update(&sk, &ts);
    }

    // record PID & comm on SYN_SENT or LAST_ACK
    if (args->newstate == TCP_SYN_SENT || args->newstate == TCP_LAST_ACK) {
        // now we can PID filter, both here and a little later on for CLOSE
        FILTER_PID
        struct id_t me = {.pid = pid};
        bpf_get_current_comm(&me.task, sizeof(me.task));
        whoami.update(&sk, &me);
    }

    if (args->newstate != TCP_CLOSE)
        return 0;

    // calculate lifespan
    u64 *tsp, delta_us;
    tsp = birth.lookup(&sk);
    if (tsp == 0) {
        whoami.delete(&sk);     // may not exist
        return 0;               // missed create
    }
    delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;
    birth.delete(&sk);

    // fetch possible cached data, and filter
    struct id_t *mep;
    mep = whoami.lookup(&sk);
    if (mep != 0)
        pid = mep->pid;
    FILTER_PID

    u16 family = args->family;
    FILTER_FAMILY

    // get throughput stats. see tcp_get_info().
    u64 rx_b = 0, tx_b = 0;
    struct tcp_sock *tp = (struct tcp_sock *)sk;
    rx_b = tp->bytes_received;
    tx_b = tp->bytes_acked;

    if (args->family == AF_INET) {
        struct ipv4_data_t data4 = {};
        data4.span_us = delta_us;
        data4.rx_b = rx_b;
        data4.tx_b = tx_b;
        data4.ts_us = bpf_ktime_get_ns() / 1000;
        __builtin_memcpy(&data4.saddr, args->saddr, sizeof(data4.saddr));
        __builtin_memcpy(&data4.daddr, args->daddr, sizeof(data4.daddr));
        // a workaround until data4 compiles with separate lport/dport
        data4.ports = dport + ((0ULL + lport) << 32);
        data4.pid = pid;

        if (mep == 0) {
            bpf_get_current_comm(&data4.task, sizeof(data4.task));
        } else {
            bpf_probe_read_kernel(&data4.task, sizeof(data4.task), (void *)mep->task);
        }
        ipv4_events.perf_submit(args, &data4, sizeof(data4));

    } else /* 6 */ {
        struct ipv6_data_t data6 = {};
        data6.span_us = delta_us;
        data6.rx_b = rx_b;
        data6.tx_b = tx_b;
        data6.ts_us = bpf_ktime_get_ns() / 1000;
        __builtin_memcpy(&data6.saddr, args->saddr_v6, sizeof(data6.saddr));
        __builtin_memcpy(&data6.daddr, args->daddr_v6, sizeof(data6.daddr));
        // a workaround until data6 compiles with separate lport/dport
        data6.ports = dport + ((0ULL + lport) << 32);
        data6.pid = pid;
        if (mep == 0) {
            bpf_get_current_comm(&data6.task, sizeof(data6.task));
        } else {
            bpf_probe_read_kernel(&data6.task, sizeof(data6.task), (void *)mep->task);
        }
        ipv6_events.perf_submit(args, &data6, sizeof(data6));
    }

    if (mep != 0)
        whoami.delete(&sk);

    return 0;
}
"""

if BPF.tracepoint_exists("sock", "inet_sock_set_state"):
    bpf_text += bpf_text_tracepoint
else:
    bpf_text += bpf_text_kprobe

# code substitutions
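# each FILTER_* placeholder below either becomes a small C filter or is
# removed; e.g., -D 80 expands FILTER_DPORT to:
#     if (dport != 80) { birth.delete(&sk); return 0; }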
if args.pid:
    bpf_text = bpf_text.replace('FILTER_PID',
        'if (pid != %s) { return 0; }' % args.pid)
if args.remoteport:
    dports = [int(dport) for dport in args.remoteport.split(',')]
    dports_if = ' && '.join(['dport != %d' % dport for dport in dports])
    bpf_text = bpf_text.replace('FILTER_DPORT',
        'if (%s) { birth.delete(&sk); return 0; }' % dports_if)
if args.localport:
    lports = [int(lport) for lport in args.localport.split(',')]
    lports_if = ' && '.join(['lport != %d' % lport for lport in lports])
    bpf_text = bpf_text.replace('FILTER_LPORT',
        'if (%s) { birth.delete(&sk); return 0; }' % lports_if)
if args.ipv4:
    bpf_text = bpf_text.replace('FILTER_FAMILY',
        'if (family != AF_INET) { return 0; }')
elif args.ipv6:
    bpf_text = bpf_text.replace('FILTER_FAMILY',
        'if (family != AF_INET6) { return 0; }')
bpf_text = bpf_text.replace('FILTER_PID', '')
bpf_text = bpf_text.replace('FILTER_DPORT', '')
bpf_text = bpf_text.replace('FILTER_LPORT', '')
bpf_text = bpf_text.replace('FILTER_FAMILY', '')

if debug or args.ebpf:
    print(bpf_text)
    if args.ebpf:
        exit()

#
# Set up output formats
#
# Don't change the default output (next 2 lines): this fits in 80 chars. I
# know it doesn't have NS or UIDs etc. I know. If you really, really, really
# need to add columns, columns that solve real actual problems, I'd start by
# adding an extended mode (-x) to include those columns.
#
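# column order: PID, COMM, [IP,] LADDR, LPORT, RADDR, RPORT, TX_KB, RX_KB, MS
# (the IP column is only populated in wide and csv output)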
header_string = "%-5s %-10.10s %s%-15s %-5s %-15s %-5s %5s %5s %s"
format_string = "%-5d %-10.10s %s%-15s %-5d %-15s %-5d %5d %5d %.2f"
if args.wide:
    header_string = "%-5s %-16.16s %-2s %-39s %-5s %-39s %-5s %6s %6s %s"
    format_string = "%-5d %-16.16s %-2s %-39s %-5s %-39s %-5d %6d %6d %.2f"
if args.csv:
    header_string = "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s"
    format_string = "%d,%s,%s,%s,%s,%s,%d,%d,%d,%.2f"

# process event
def print_ipv4_event(cpu, data, size):
    event = b["ipv4_events"].event(data)
    global start_ts
    if args.time:
        if args.csv:
            print("%s," % strftime("%H:%M:%S"), end="")
        else:
            print("%-8s " % strftime("%H:%M:%S"), end="")
    if args.timestamp:
        if start_ts == 0:
            start_ts = event.ts_us
        delta_s = (float(event.ts_us) - start_ts) / 1000000
        if args.csv:
            print("%.6f," % delta_s, end="")
        else:
            print("%-9.6f " % delta_s, end="")
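    # ports was packed BPF-side as (lport << 32) | dport; split it back out here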
    print(format_string % (event.pid, event.task.decode('utf-8', 'replace'),
        "4" if args.wide or args.csv else "",
        inet_ntop(AF_INET, pack("I", event.saddr)), event.ports >> 32,
        inet_ntop(AF_INET, pack("I", event.daddr)), event.ports & 0xffffffff,
        event.tx_b / 1024, event.rx_b / 1024, float(event.span_us) / 1000))

def print_ipv6_event(cpu, data, size):
    event = b["ipv6_events"].event(data)
    global start_ts
    if args.time:
        if args.csv:
            print("%s," % strftime("%H:%M:%S"), end="")
        else:
            print("%-8s " % strftime("%H:%M:%S"), end="")
    if args.timestamp:
        if start_ts == 0:
            start_ts = event.ts_us
        delta_s = (float(event.ts_us) - start_ts) / 1000000
        if args.csv:
            print("%.6f," % delta_s, end="")
        else:
            print("%-9.6f " % delta_s, end="")
    print(format_string % (event.pid, event.task.decode('utf-8', 'replace'),
        "6" if args.wide or args.csv else "",
        inet_ntop(AF_INET6, event.saddr), event.ports >> 32,
        inet_ntop(AF_INET6, event.daddr), event.ports & 0xffffffff,
        event.tx_b / 1024, event.rx_b / 1024, float(event.span_us) / 1000))

# initialize BPF
b = BPF(text=bpf_text)

# header
if args.time:
    if args.csv:
        print("%s," % ("TIME"), end="")
    else:
        print("%-8s " % ("TIME"), end="")
if args.timestamp:
    if args.csv:
        print("%s," % ("TIME(s)"), end="")
    else:
        print("%-9s " % ("TIME(s)"), end="")
print(header_string % ("PID", "COMM",
    "IP" if args.wide or args.csv else "", "LADDR",
    "LPORT", "RADDR", "RPORT", "TX_KB", "RX_KB", "MS"))

start_ts = 0

# read events
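# a larger perf buffer (64 pages per CPU) reduces the chance of lost events
# under heavy connection churn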
b["ipv4_events"].open_perf_buffer(print_ipv4_event, page_cnt=64)
b["ipv6_events"].open_perf_buffer(print_ipv6_event, page_cnt=64)
while True:
    try:
        b.perf_buffer_poll()
    except KeyboardInterrupt:
        exit()