Add TCP_INFO support to getsockopt for tcp sessions.
author claudio <claudio@openbsd.org>
Thu, 11 Aug 2022 09:13:21 +0000 (09:13 +0000)
committer claudio <claudio@openbsd.org>
Thu, 11 Aug 2022 09:13:21 +0000 (09:13 +0000)
TCP_INFO provides a lot of information about the TCP session of this socket.
Many processes like to peek at the RTT of a connection, but TCP_INFO also
provides much more specialized information for use by e.g. tcpbench(1).
While the basic, minimal information is always available, the more specific
data is only populated for privileged processes. This is done to avoid handing
data back to userland that might allow a session to be attacked.
TCP_INFO is available to pledge "inet" since pledged processes like chrome
tend to use TCP_INFO when available.
OK bluhm@

sys/kern/kern_pledge.c
sys/netinet/tcp.h
sys/netinet/tcp_input.c
sys/netinet/tcp_output.c
sys/netinet/tcp_usrreq.c
sys/netinet/tcp_var.h

index f2378fc..57ebe45 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: kern_pledge.c,v 1.292 2022/08/08 01:53:01 deraadt Exp $       */
+/*     $OpenBSD: kern_pledge.c,v 1.293 2022/08/11 09:13:21 claudio Exp $       */
 
 /*
  * Copyright (c) 2015 Nicholas Marriott <nicm@openbsd.org>
@@ -1370,7 +1370,7 @@ pledge_sockopt(struct proc *p, int set, int level, int optname)
                switch (optname) {
                case SO_RCVBUF:
                case SO_ERROR:
-                       return 0;
+                       return (0);
                }
                break;
        }
@@ -1392,7 +1392,7 @@ pledge_sockopt(struct proc *p, int set, int level, int optname)
        case SOL_SOCKET:
                switch (optname) {
                case SO_TIMESTAMP:
-                       return 0;
+                       return (0);
                }
                break;
        }
@@ -1430,6 +1430,7 @@ pledge_sockopt(struct proc *p, int set, int level, int optname)
                case TCP_SACK_ENABLE:
                case TCP_MAXSEG:
                case TCP_NOPUSH:
+               case TCP_INFO:
                        return (0);
                }
                break;
index 085624e..b9cf23d 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: tcp.h,v 1.22 2021/02/08 19:37:15 jan Exp $    */
+/*     $OpenBSD: tcp.h,v 1.23 2022/08/11 09:13:21 claudio Exp $        */
 /*     $NetBSD: tcp.h,v 1.8 1995/04/17 05:32:58 cgd Exp $      */
 
 /*
@@ -126,6 +126,98 @@ struct tcphdr {
 #define        TCP_MAXSEG              0x02   /* set maximum segment size */
 #define        TCP_MD5SIG              0x04   /* enable TCP MD5 signature option */
 #define        TCP_SACK_ENABLE         0x08   /* enable SACKs (if disabled by def.) */
+#define        TCP_INFO                0x09   /* retrieve tcp_info structure */
 #define        TCP_NOPUSH              0x10   /* don't push last block of write */
 
+#define        TCPI_OPT_TIMESTAMPS     0x01
+#define        TCPI_OPT_SACK           0x02
+#define        TCPI_OPT_WSCALE         0x04
+#define        TCPI_OPT_ECN            0x08
+#define        TCPI_OPT_TOE            0x10
+
+/*
+ * The TCP_INFO socket option comes from the Linux 2.6 TCP API, and permits
+ * the caller to query certain information about the state of a TCP
+ * connection.  Provide an overlapping set of fields with the Linux
+ * implementation, but at the same time add a lot of OpenBSD specific
+ * extra information.
+ */
+struct tcp_info {
+       uint8_t         tcpi_state;             /* TCP FSM state. */
+       uint8_t         __tcpi_ca_state;        /* Not filled in; 0. */
+       uint8_t         __tcpi_retransmits;     /* Not filled in; 0. */
+       uint8_t         __tcpi_probes;          /* Not filled in; 0. */
+       uint8_t         __tcpi_backoff;         /* Not filled in; 0. */
+       uint8_t         tcpi_options;           /* Options enabled on conn. */
+       uint8_t         tcpi_snd_wscale;        /* RFC1323 send shift value. */
+       uint8_t         tcpi_rcv_wscale;        /* RFC1323 recv shift value. */
+
+       uint32_t        tcpi_rto;          /* Retransmission timeout (usec). */
+       uint32_t        __tcpi_ato;             /* Not filled in; 0. */
+       uint32_t        tcpi_snd_mss;           /* Max segment size for send. */
+       uint32_t        tcpi_rcv_mss;           /* Max segment size for recv. */
+
+       uint32_t        __tcpi_unacked;         /* Not filled in; 0. */
+       uint32_t        __tcpi_sacked;          /* Not filled in; 0. */
+       uint32_t        __tcpi_lost;            /* Not filled in; 0. */
+       uint32_t        __tcpi_retrans;         /* Not filled in; 0. */
+       uint32_t        __tcpi_fackets;         /* Not filled in; 0. */
+
+       /* Times; measurements in usecs. */
+       uint32_t        tcpi_last_data_sent;    /* since last sent data. */
+       uint32_t        tcpi_last_ack_sent;     /* since last sent ack. */
+       uint32_t        tcpi_last_data_recv;    /* since last recv data. */
+       uint32_t        tcpi_last_ack_recv;     /* since last recv ack. */
+
+       /* Metrics; variable units. */
+       uint32_t        __tcpi_pmtu;            /* Not filled in; 0. */
+       uint32_t        __tcpi_rcv_ssthresh;    /* Not filled in; 0. */
+       uint32_t        tcpi_rtt;               /* Smoothed RTT in usecs. */
+       uint32_t        tcpi_rttvar;            /* RTT variance in usecs. */
+       uint32_t        tcpi_snd_ssthresh;      /* Slow start threshold. */
+       uint32_t        tcpi_snd_cwnd;          /* Send congestion window. */
+       uint32_t        __tcpi_advmss;          /* Not filled in; 0. */
+       uint32_t        __tcpi_reordering;      /* Not filled in; 0. */
+
+       uint32_t        __tcpi_rcv_rtt;         /* Not filled in; 0. */
+       uint32_t        tcpi_rcv_space;         /* Advertised recv window. */
+
+       /*
+        * Members below this point are only set if process is privileged,
+        * otherwise values will be 0.
+        */
+
+       /* FreeBSD/NetBSD extensions to tcp_info. */
+       uint32_t        tcpi_snd_wnd;           /* Advertised send window. */
+       uint32_t        tcpi_snd_nxt;           /* Next egress seqno, rel. to iss */
+       uint32_t        tcpi_rcv_nxt;           /* Next ingress seqno, rel. to irs */
+       uint32_t        tcpi_toe_tid;           /* HWTID for TOE endpoints; not set */
+       uint32_t        tcpi_snd_rexmitpack;    /* Retransmitted packets */
+       uint32_t        tcpi_rcv_ooopack;       /* Out-of-order packets */
+       uint32_t        tcpi_snd_zerowin;       /* Zero-sized windows sent */
+
+       /* OpenBSD extensions */
+       uint32_t        tcpi_rttmin;            /* t_rttmin scaled to usecs */
+       uint32_t        tcpi_max_sndwnd;        /* tcpcb max_sndwnd */
+       uint32_t        tcpi_rcv_adv;           /* rcv_adv relative to irs */
+       uint32_t        tcpi_rcv_up;            /* rcv_up relative to irs */
+       uint32_t        tcpi_snd_una;           /* snd_una relative to iss */
+       uint32_t        tcpi_snd_up;            /* snd_up relative to iss */
+       uint32_t        tcpi_snd_wl1;           /* snd_wl1 relative to iss */
+       uint32_t        tcpi_snd_wl2;           /* snd_wl2 relative to iss */
+       uint32_t        tcpi_snd_max;           /* snd_max relative to iss */
+       uint32_t        tcpi_ts_recent;         /* ts_recent, value from the wire */
+       uint32_t        tcpi_ts_recent_age;     /* usecs since ts_recent_age */
+       uint32_t        tcpi_rfbuf_cnt;         /* tcpcb rfbuf_cnt */
+       uint32_t        tcpi_rfbuf_ts;          /* usecs since rfbuf_ts */
+       uint32_t        tcpi_so_rcv_sb_cc;      /* so_rcv.sb_cc */
+       uint32_t        tcpi_so_rcv_sb_hiwat;   /* so_rcv.sb_hiwat */
+       uint32_t        tcpi_so_rcv_sb_lowat;   /* so_rcv.sb_lowat */
+       uint32_t        tcpi_so_rcv_sb_wat;     /* so_rcv.sb_wat */
+       uint32_t        tcpi_so_snd_sb_cc;      /* so_snd.sb_cc */
+       uint32_t        tcpi_so_snd_sb_hiwat;   /* so_snd.sb_hiwat */
+       uint32_t        tcpi_so_snd_sb_lowat;   /* so_snd.sb_lowat */
+       uint32_t        tcpi_so_snd_sb_wat;     /* so_snd.sb_wat */
+};
+
 #endif /* _NETINET_TCP_H_ */
index b5c9be1..7df100f 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: tcp_input.c,v 1.376 2022/08/08 12:06:30 bluhm Exp $   */
+/*     $OpenBSD: tcp_input.c,v 1.377 2022/08/11 09:13:21 claudio Exp $ */
 /*     $NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $  */
 
 /*
@@ -275,6 +275,7 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, struct mbuf *m, int *tlen)
                }
        }
        tcpstat_pkt(tcps_rcvoopack, tcps_rcvoobyte, *tlen);
+       tp->t_rcvoopack++;
 
        /*
         * While we overlap succeeding segments trim them or,
@@ -947,6 +948,7 @@ findpcb:
                                acked = th->th_ack - tp->snd_una;
                                tcpstat_pkt(tcps_rcvackpack, tcps_rcvackbyte,
                                    acked);
+                               tp->t_rcvacktime = tcp_now;
                                ND6_HINT(tp);
                                sbdrop(so, &so->so_snd, acked);
 
@@ -1681,6 +1683,7 @@ trimthenstep6:
                }
                acked = th->th_ack - tp->snd_una;
                tcpstat_pkt(tcps_rcvackpack, tcps_rcvackbyte, acked);
+               tp->t_rcvacktime = tcp_now;
 
                /*
                 * If we have a timestamp reply, update smoothed
@@ -3620,6 +3623,9 @@ syn_cache_get(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
        tcp_rcvseqinit(tp);
        tp->t_state = TCPS_SYN_RECEIVED;
        tp->t_rcvtime = tcp_now;
+       tp->t_sndtime = tcp_now;
+       tp->t_rcvacktime = tcp_now;
+       tp->t_sndacktime = tcp_now;
        TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init);
        tcpstat_inc(tcps_accepts);
 
index faaec77..dacfd7c 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: tcp_output.c,v 1.131 2021/11/25 13:46:02 bluhm Exp $  */
+/*     $OpenBSD: tcp_output.c,v 1.132 2022/08/11 09:13:21 claudio Exp $        */
 /*     $NetBSD: tcp_output.c,v 1.16 1997/06/03 16:17:09 kml Exp $      */
 
 /*
@@ -636,6 +636,7 @@ send:
                else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
                        tcpstat_pkt(tcps_sndrexmitpack, tcps_sndrexmitbyte,
                            len);
+                       tp->t_sndrexmitpack++;
                } else {
                        tcpstat_pkt(tcps_sndpack, tcps_sndbyte, len);
                }
@@ -690,6 +691,7 @@ send:
                 */
                if (off + len == so->so_snd.sb_cc && !soissending(so))
                        flags |= TH_PUSH;
+               tp->t_sndtime = tcp_now;
        } else {
                if (tp->t_flags & TF_ACKNOW)
                        tcpstat_inc(tcps_sndacks);
@@ -821,6 +823,8 @@ send:
        if (flags & TH_RST)
                win = 0;
        th->th_win = htons((u_int16_t) (win>>tp->rcv_scale));
+       if (th->th_win == 0)
+               tp->t_sndzerowin++;
        if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
                u_int32_t urp = tp->snd_up - tp->snd_nxt;
                if (urp > IP_MAXPACKET)
@@ -1119,6 +1123,7 @@ out:
        if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv))
                tp->rcv_adv = tp->rcv_nxt + win;
        tp->last_ack_sent = tp->rcv_nxt;
+       tp->t_sndacktime = tcp_now;
        tp->t_flags &= ~TF_ACKNOW;
        TCP_TIMER_DISARM(tp, TCPT_DELACK);
        if (sendalot)
index 6185187..0f588bb 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: tcp_usrreq.c,v 1.184 2022/08/08 12:06:30 bluhm Exp $  */
+/*     $OpenBSD: tcp_usrreq.c,v 1.185 2022/08/11 09:13:21 claudio Exp $        */
 /*     $NetBSD: tcp_usrreq.c,v 1.20 1996/02/13 23:44:16 christos Exp $ */
 
 /*
@@ -78,7 +78,9 @@
 #include <sys/sysctl.h>
 #include <sys/domain.h>
 #include <sys/kernel.h>
+#include <sys/pledge.h>
 #include <sys/pool.h>
+#include <sys/proc.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
@@ -132,7 +134,8 @@ const struct sysctl_bounded_args tcpctl_vars[] = {
 
 struct inpcbtable tcbtable;
 
-int tcp_ident(void *, size_t *, void *, size_t, int);
+int    tcp_fill_info(struct tcpcb *, struct socket *, struct mbuf *);
+int    tcp_ident(void *, size_t *, void *, size_t, int);
 
 /*
  * Process a TCP user request for TCP tb.  If this is a send request
@@ -425,6 +428,103 @@ tcp_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
        return (error);
 }
 
+/*
+ * Fill the mbuf m with a struct tcp_info describing tp and so.  Returns 0,
+ * or ENOMEM if no cluster could be attached.  Fields past tcpi_rcv_space
+ * stay 0 unless suser() succeeds; sequence numbers are relative to iss/irs.
+ */
+int
+tcp_fill_info(struct tcpcb *tp, struct socket *so, struct mbuf *m)
+{
+       struct proc *p = curproc;
+       struct tcp_info *ti;
+       u_int t = 1000000 / PR_SLOWHZ;  /* scale factor: timer ticks to usecs */
+
+       if (sizeof(*ti) > MLEN) {       /* ask for external storage */
+               MCLGETL(m, M_WAITOK, sizeof(*ti));
+               if (!ISSET(m->m_flags, M_EXT))
+                       return ENOMEM;
+       }
+       ti = mtod(m, struct tcp_info *);
+       m->m_len = sizeof(*ti);
+       memset(ti, 0, sizeof(*ti));     /* fields not assigned below stay 0 */
+
+       ti->tcpi_state = tp->t_state;
+       if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP))
+               ti->tcpi_options |= TCPI_OPT_TIMESTAMPS;
+       if (tp->t_flags & TF_SACK_PERMIT)
+               ti->tcpi_options |= TCPI_OPT_SACK;
+       if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) {
+               ti->tcpi_options |= TCPI_OPT_WSCALE;
+               ti->tcpi_snd_wscale = tp->snd_scale;
+               ti->tcpi_rcv_wscale = tp->rcv_scale;
+       }
+#ifdef TCP_ECN
+       if (tp->t_flags & TF_ECN_PERMIT)
+               ti->tcpi_options |= TCPI_OPT_ECN;
+#endif
+
+       ti->tcpi_rto = tp->t_rxtcur * t;
+       ti->tcpi_snd_mss = tp->t_maxseg;
+       ti->tcpi_rcv_mss = tp->t_peermss;
+
+       ti->tcpi_last_data_sent = (tcp_now - tp->t_sndtime) * t;
+       ti->tcpi_last_ack_sent = (tcp_now - tp->t_sndacktime) * t;
+       ti->tcpi_last_data_recv = (tcp_now - tp->t_rcvtime) * t;
+       ti->tcpi_last_ack_recv = (tcp_now - tp->t_rcvacktime) * t;
+
+       ti->tcpi_rtt = ((uint64_t)tp->t_srtt * t) >>
+           (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT);
+       ti->tcpi_rttvar = ((uint64_t)tp->t_rttvar * t) >>
+           (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT);
+       ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
+       ti->tcpi_snd_cwnd = tp->snd_cwnd;
+
+       ti->tcpi_rcv_space = tp->rcv_wnd;
+
+       /*
+        * Unprivileged callers get only the fields set above; the rest stay 0.
+        */
+       if (suser(p) != 0)
+               return 0;
+
+       /* FreeBSD/NetBSD extension fields for tcp_info.  */
+       ti->tcpi_snd_wnd = tp->snd_wnd;
+       ti->tcpi_snd_nxt = tp->snd_nxt - tp->iss;
+       ti->tcpi_rcv_nxt = tp->rcv_nxt - tp->irs;
+       /* tcpi_toe_tid is not filled in and stays 0 */
+       ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack;
+       ti->tcpi_rcv_ooopack = tp->t_rcvoopack;
+       ti->tcpi_snd_zerowin = tp->t_sndzerowin;
+
+       /* OpenBSD extensions */
+       ti->tcpi_rttmin = tp->t_rttmin * t;
+       ti->tcpi_max_sndwnd = tp->max_sndwnd;
+       ti->tcpi_rcv_adv = tp->rcv_adv - tp->irs;
+       ti->tcpi_rcv_up = tp->rcv_up - tp->irs;
+       ti->tcpi_snd_una = tp->snd_una - tp->iss;
+       ti->tcpi_snd_up = tp->snd_up - tp->iss;
+       ti->tcpi_snd_wl1 = tp->snd_wl1 - tp->iss;
+       ti->tcpi_snd_wl2 = tp->snd_wl2 - tp->iss;
+       ti->tcpi_snd_max = tp->snd_max - tp->iss;
+
+       ti->tcpi_ts_recent = tp->ts_recent; /* XXX value from the wire */
+       ti->tcpi_ts_recent_age = (tcp_now - tp->ts_recent_age) * t;
+       ti->tcpi_rfbuf_cnt = tp->rfbuf_cnt;
+       ti->tcpi_rfbuf_ts = (tcp_now - tp->rfbuf_ts) * t;
+
+       ti->tcpi_so_rcv_sb_cc = so->so_rcv.sb_cc;
+       ti->tcpi_so_rcv_sb_hiwat = so->so_rcv.sb_hiwat;
+       ti->tcpi_so_rcv_sb_lowat = so->so_rcv.sb_lowat;
+       ti->tcpi_so_rcv_sb_wat = so->so_rcv.sb_wat;
+       ti->tcpi_so_snd_sb_cc = so->so_snd.sb_cc;
+       ti->tcpi_so_snd_sb_hiwat = so->so_snd.sb_hiwat;
+       ti->tcpi_so_snd_sb_lowat = so->so_snd.sb_lowat;
+       ti->tcpi_so_snd_sb_wat = so->so_snd.sb_wat;
+
+       return 0;
+}
+
 int
 tcp_ctloutput(int op, struct socket *so, int level, int optname,
     struct mbuf *m)
@@ -541,23 +641,29 @@ tcp_ctloutput(int op, struct socket *so, int level, int optname,
                break;
 
        case PRCO_GETOPT:
-               m->m_len = sizeof(int);
-
                switch (optname) {
                case TCP_NODELAY:
+                       m->m_len = sizeof(int);
                        *mtod(m, int *) = tp->t_flags & TF_NODELAY;
                        break;
                case TCP_NOPUSH:
+                       m->m_len = sizeof(int);
                        *mtod(m, int *) = tp->t_flags & TF_NOPUSH;
                        break;
                case TCP_MAXSEG:
+                       m->m_len = sizeof(int);
                        *mtod(m, int *) = tp->t_maxseg;
                        break;
                case TCP_SACK_ENABLE:
+                       m->m_len = sizeof(int);
                        *mtod(m, int *) = tp->sack_enable;
                        break;
+               case TCP_INFO:
+                       error = tcp_fill_info(tp, so, m);
+                       break;
 #ifdef TCP_SIGNATURE
                case TCP_MD5SIG:
+                       m->m_len = sizeof(int);
                        *mtod(m, int *) = tp->t_flags & TF_SIGNATURE;
                        break;
 #endif
index 7d8f615..29dcff8 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: tcp_var.h,v 1.139 2022/02/25 23:51:03 guenther Exp $  */
+/*     $OpenBSD: tcp_var.h,v 1.140 2022/08/11 09:13:21 claudio Exp $   */
 /*     $NetBSD: tcp_var.h,v 1.17 1996/02/13 23:44:24 christos Exp $    */
 
 /*
@@ -161,6 +161,9 @@ struct tcpcb {
  * "Variance" is actually smoothed difference.
  */
        uint32_t t_rcvtime;             /* time last segment received */
+       uint32_t t_rcvacktime;          /* time last ack received */
+       uint32_t t_sndtime;             /* time last segment sent */
+       uint32_t t_sndacktime;          /* time last ack sent */
        uint32_t t_rtttime;             /* time we started measuring rtt */
        tcp_seq t_rtseq;                /* sequence number being timed */
        short   t_srtt;                 /* smoothed round-trip time */
@@ -182,7 +185,7 @@ struct tcpcb {
        u_char  requested_s_scale;
        u_int32_t ts_recent;            /* timestamp echo data */
        u_int32_t ts_modulate;          /* modulation on timestamp */
-       u_int32_t ts_recent_age;                /* when last updated */
+       u_int32_t ts_recent_age;        /* when last updated */
        tcp_seq last_ack_sent;
 
 /* pointer for syn cache entries*/
@@ -197,6 +200,11 @@ struct tcpcb {
        u_short t_pmtud_ip_hl;          /* IP header length from ICMP payload */
 
        int pf;
+
+/* maintain a few stats per connection: */
+       u_int   t_rcvoopack;            /* out-of-order packets received */
+       u_int   t_sndrexmitpack;        /* retransmit packets sent */
+       u_int   t_sndzerowin;           /* zero-window updates sent */
 };
 
 #define        intotcpcb(ip)   ((struct tcpcb *)(ip)->inp_ppcb)