From: bluhm Date: Wed, 10 May 2023 12:07:16 +0000 (+0000) Subject: Implement TCP send offloading, for now in software only. This is X-Git-Url: http://artulab.com/gitweb/?a=commitdiff_plain;h=c06845b1c3ca93e4e363133d3ad803fb7ba7a70b;p=openbsd Implement TCP send offloading, for now in software only. This is meant as a fallback if network hardware does not support TSO. Driver support is still work in progress. TCP output generates large packets. In IP output the packet is chopped to TCP maximum segment size. This reduces the CPU cycles used by pf. The regular output could be assisted by hardware later, but pf route-to and IPsec needs the software fallback in general. For performance comparison or to workaround possible bugs, sysctl net.inet.tcp.tso=0 disables the feature. netstat -s -p tcp shows TSO counter with chopped and generated packets. based on work from jan@ tested by jmc@ jan@ Hrvoje Popovski OK jan@ claudio@ --- diff --git a/sys/net/pf.c b/sys/net/pf.c index 8ea53be3c2d..ee979d9e8e5 100644 --- a/sys/net/pf.c +++ b/sys/net/pf.c @@ -1,4 +1,4 @@ -/* $OpenBSD: pf.c,v 1.1177 2023/05/08 13:22:13 bluhm Exp $ */ +/* $OpenBSD: pf.c,v 1.1178 2023/05/10 12:07:16 bluhm Exp $ */ /* * Copyright (c) 2001 Daniel Hartmeier @@ -6561,6 +6561,16 @@ pf_route(struct pf_pdesc *pd, struct pf_state *st) goto done; } + if (ISSET(m0->m_pkthdr.csum_flags, M_TCP_TSO) && + m0->m_pkthdr.ph_mss <= ifp->if_mtu) { + if (tcp_chopper(m0, &ml, ifp, m0->m_pkthdr.ph_mss) || + if_output_ml(ifp, &ml, sintosa(dst), rt)) + goto done; + tcpstat_inc(tcps_outswtso); + goto done; + } + CLR(m0->m_pkthdr.csum_flags, M_TCP_TSO); + /* * Too large for interface; fragment if possible. * Must be able to put at least 8 bytes per fragment. 
@@ -6594,6 +6604,7 @@ void pf_route6(struct pf_pdesc *pd, struct pf_state *st) { struct mbuf *m0; + struct mbuf_list ml; struct sockaddr_in6 *dst, sin6; struct rtentry *rt = NULL; struct ip6_hdr *ip6; @@ -6685,12 +6696,22 @@ pf_route6(struct pf_pdesc *pd, struct pf_state *st) goto done; } - if ((u_long)m0->m_pkthdr.len <= ifp->if_mtu) { + if (m0->m_pkthdr.len <= ifp->if_mtu) { in6_proto_cksum_out(m0, ifp); ifp->if_output(ifp, m0, sin6tosa(dst), rt); goto done; } + if (ISSET(m0->m_pkthdr.csum_flags, M_TCP_TSO) && + m0->m_pkthdr.ph_mss <= ifp->if_mtu) { + if (tcp_chopper(m0, &ml, ifp, m0->m_pkthdr.ph_mss) || + if_output_ml(ifp, &ml, sin6tosa(dst), rt)) + goto done; + tcpstat_inc(tcps_outswtso); + goto done; + } + CLR(m0->m_pkthdr.csum_flags, M_TCP_TSO); + ip6stat_inc(ip6s_cantfrag); if (st->rt != PF_DUPTO) pf_send_icmp(m0, ICMP6_PACKET_TOO_BIG, 0, diff --git a/sys/netinet/in.h b/sys/netinet/in.h index 4fc6f3a58bc..16dcdb24ca0 100644 --- a/sys/netinet/in.h +++ b/sys/netinet/in.h @@ -1,4 +1,4 @@ -/* $OpenBSD: in.h,v 1.142 2023/04/11 00:45:09 jsg Exp $ */ +/* $OpenBSD: in.h,v 1.143 2023/05/10 12:07:16 bluhm Exp $ */ /* $NetBSD: in.h,v 1.20 1996/02/13 23:41:47 christos Exp $ */ /* @@ -780,6 +780,7 @@ int in_canforward(struct in_addr); int in_cksum(struct mbuf *, int); int in4_cksum(struct mbuf *, u_int8_t, int, int); void in_proto_cksum_out(struct mbuf *, struct ifnet *); +int in_ifcap_cksum(struct mbuf *, struct ifnet *, int); void in_ifdetach(struct ifnet *); int in_mask2len(struct in_addr *); void in_len2mask(struct in_addr *, int); diff --git a/sys/netinet/ip_output.c b/sys/netinet/ip_output.c index 76746568ff6..f5b1173a299 100644 --- a/sys/netinet/ip_output.c +++ b/sys/netinet/ip_output.c @@ -1,4 +1,4 @@ -/* $OpenBSD: ip_output.c,v 1.384 2023/05/08 13:22:13 bluhm Exp $ */ +/* $OpenBSD: ip_output.c,v 1.385 2023/05/10 12:07:16 bluhm Exp $ */ /* $NetBSD: ip_output.c,v 1.28 1996/02/13 23:43:07 christos Exp $ */ /* @@ -84,7 +84,6 @@ void ip_mloopback(struct ifnet *, 
struct mbuf *, struct sockaddr_in *); static __inline u_int16_t __attribute__((__unused__)) in_cksum_phdr(u_int32_t, u_int32_t, u_int32_t); void in_delayed_cksum(struct mbuf *); -int in_ifcap_cksum(struct mbuf *, struct ifnet *, int); int ip_output_ipsec_lookup(struct mbuf *m, int hlen, struct inpcb *inp, struct tdb **, int ipsecflowinfo); @@ -468,6 +467,16 @@ sendit: goto done; } + if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO) && + m->m_pkthdr.ph_mss <= mtu) { + if ((error = tcp_chopper(m, &ml, ifp, m->m_pkthdr.ph_mss)) || + (error = if_output_ml(ifp, &ml, sintosa(dst), ro->ro_rt))) + goto done; + tcpstat_inc(tcps_outswtso); + goto done; + } + CLR(m->m_pkthdr.csum_flags, M_TCP_TSO); + /* * Too large for interface; fragment if possible. * Must be able to put at least 8 bytes per fragment. @@ -597,12 +606,12 @@ ip_output_ipsec_pmtu_update(struct tdb *tdb, struct route *ro, int ip_output_ipsec_send(struct tdb *tdb, struct mbuf *m, struct route *ro, int fwd) { -#if NPF > 0 - struct ifnet *encif; -#endif + struct mbuf_list ml; + struct ifnet *encif = NULL; struct ip *ip; struct in_addr dst; - int error, rtableid; + u_int len; + int error, rtableid, tso = 0; #if NPF > 0 /* @@ -622,16 +631,22 @@ ip_output_ipsec_send(struct tdb *tdb, struct mbuf *m, struct route *ro, int fwd) * Until now the change was not reconsidered. * What's the behaviour? 
*/ - in_proto_cksum_out(m, encif); #endif - /* Check if we are allowed to fragment */ + /* Check if we can chop the TCP packet */ ip = mtod(m, struct ip *); + if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO) && + m->m_pkthdr.ph_mss <= tdb->tdb_mtu) { + tso = 1; + len = m->m_pkthdr.ph_mss; + } else + len = ntohs(ip->ip_len); + + /* Check if we are allowed to fragment */ dst = ip->ip_dst; rtableid = m->m_pkthdr.ph_rtableid; if (ip_mtudisc && (ip->ip_off & htons(IP_DF)) && tdb->tdb_mtu && - ntohs(ip->ip_len) > tdb->tdb_mtu && - tdb->tdb_mtutimeout > gettime()) { + len > tdb->tdb_mtu && tdb->tdb_mtutimeout > gettime()) { int transportmode; transportmode = (tdb->tdb_dst.sa.sa_family == AF_INET) && @@ -652,14 +667,33 @@ ip_output_ipsec_send(struct tdb *tdb, struct mbuf *m, struct route *ro, int fwd) */ m->m_flags &= ~(M_MCAST | M_BCAST); - /* Callee frees mbuf */ + if (tso) { + error = tcp_chopper(m, &ml, encif, len); + if (error) + goto done; + } else { + CLR(m->m_pkthdr.csum_flags, M_TCP_TSO); + in_proto_cksum_out(m, encif); + ml_init(&ml); + ml_enqueue(&ml, m); + } + KERNEL_LOCK(); - error = ipsp_process_packet(m, tdb, AF_INET, 0); + while ((m = ml_dequeue(&ml)) != NULL) { + /* Callee frees mbuf */ + error = ipsp_process_packet(m, tdb, AF_INET, 0); + if (error) + break; + } KERNEL_UNLOCK(); + done: if (error) { + ml_purge(&ml); ipsecstat_inc(ipsec_odrops); tdbstat_inc(tdb, tdb_odrops); } + if (!error && tso) + tcpstat_inc(tcps_outswtso); if (ip_mtudisc && error == EMSGSIZE) ip_output_ipsec_pmtu_update(tdb, ro, dst, rtableid, 0); return error; diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c index 19e3bbd10d3..84adc729bf8 100644 --- a/sys/netinet/tcp_output.c +++ b/sys/netinet/tcp_output.c @@ -1,4 +1,4 @@ -/* $OpenBSD: tcp_output.c,v 1.135 2023/04/25 22:56:28 bluhm Exp $ */ +/* $OpenBSD: tcp_output.c,v 1.136 2023/05/10 12:07:16 bluhm Exp $ */ /* $NetBSD: tcp_output.c,v 1.16 1997/06/03 16:17:09 kml Exp $ */ /* @@ -210,6 +210,7 @@ tcp_output(struct tcpcb *tp) 
#ifdef TCP_ECN int needect; #endif + int tso; if (tp->t_flags & TF_BLOCKOUTPUT) { tp->t_flags |= TF_NEEDOUTPUT; @@ -279,6 +280,7 @@ again: } sendalot = 0; + tso = 0; /* * If in persist timeout with window of 0, send 1 byte. * Otherwise, if window is small but nonzero @@ -346,8 +348,25 @@ again: txmaxseg = ulmin(so->so_snd.sb_hiwat / 2, tp->t_maxseg); if (len > txmaxseg) { - len = txmaxseg; - sendalot = 1; + if (tcp_do_tso && + tp->t_inpcb->inp_options == NULL && + tp->t_inpcb->inp_outputopts6 == NULL && +#ifdef TCP_SIGNATURE + ((tp->t_flags & TF_SIGNATURE) == 0) && +#endif + len >= 2 * tp->t_maxseg && + tp->rcv_numsacks == 0 && sack_rxmit == 0 && + !(flags & (TH_SYN|TH_RST|TH_FIN))) { + tso = 1; + /* avoid small chopped packets */ + if (len > (len / tp->t_maxseg) * tp->t_maxseg) { + len = (len / tp->t_maxseg) * tp->t_maxseg; + sendalot = 1; + } + } else { + len = txmaxseg; + sendalot = 1; + } } if (off + len < so->so_snd.sb_cc) flags &= ~TH_FIN; @@ -365,7 +384,7 @@ again: * to send into a small window), then must resend. */ if (len) { - if (len == txmaxseg) + if (len >= txmaxseg) goto send; if ((idle || (tp->t_flags & TF_NODELAY)) && len + off >= so->so_snd.sb_cc && !soissending(so) && @@ -616,10 +635,19 @@ send: /* * Adjust data length if insertion of options will * bump the packet length beyond the t_maxopd length. + * Clear the FIN bit because we cut off the tail of + * the segment. */ if (len > tp->t_maxopd - optlen) { - len = tp->t_maxopd - optlen; - sendalot = 1; + if (tso) { + if (len + hdrlen + max_linkhdr > MAXMCLBYTES) { + len = MAXMCLBYTES - hdrlen - max_linkhdr; + sendalot = 1; + } + } else { + len = tp->t_maxopd - optlen; + sendalot = 1; + } flags &= ~TH_FIN; } @@ -723,6 +751,12 @@ send: m->m_pkthdr.ph_ifidx = 0; m->m_pkthdr.len = hdrlen + len; + /* Enable TSO and specify the size of the resulting segments. 
*/ + if (tso) { + m->m_pkthdr.csum_flags |= M_TCP_TSO; + m->m_pkthdr.ph_mss = tp->t_maxseg; + } + if (!tp->t_template) panic("tcp_output"); #ifdef DIAGNOSTIC @@ -1153,3 +1187,191 @@ tcp_setpersist(struct tcpcb *tp) if (tp->t_rxtshift < TCP_MAXRXTSHIFT) tp->t_rxtshift++; } + +/* + * Chop the TSO packet m0 into TCP segments carrying at most mss bytes + * of payload and collect them, m0 first, on the list ml. + * The IPv4 header checksum is deferred to ifp when in_ifcap_cksum() + * allows it, otherwise it is computed here; ifp may be NULL. + * Returns 0 on success. On failure the list is purged, tcps_outbadtso + * is counted and an errno value is returned. + */ +int +tcp_chopper(struct mbuf *m0, struct mbuf_list *ml, struct ifnet *ifp, + u_int mss) +{ + struct ip *ip = NULL; +#ifdef INET6 + struct ip6_hdr *ip6 = NULL; +#endif + struct tcphdr *th; + int firstlen, iphlen, hlen, tlen, off; + int error; + + ml_init(ml); + ml_enqueue(ml, m0); + + ip = mtod(m0, struct ip *); + switch (ip->ip_v) { + case 4: + iphlen = ip->ip_hl << 2; + if (ISSET(ip->ip_off, htons(IP_OFFMASK | IP_MF)) || + iphlen != sizeof(struct ip) || ip->ip_p != IPPROTO_TCP) { + /* only TCP without fragment or IP option supported */ + error = EPROTOTYPE; + goto bad; + } + break; +#ifdef INET6 + case 6: + ip = NULL; + ip6 = mtod(m0, struct ip6_hdr *); + iphlen = sizeof(struct ip6_hdr); + if (ip6->ip6_nxt != IPPROTO_TCP) { + /* only TCP without IPv6 header chain supported */ + error = EPROTOTYPE; + goto bad; + } + break; +#endif + default: + panic("%s: unknown ip version %d", __func__, ip->ip_v); + } + + tlen = m0->m_pkthdr.len; + if (tlen < iphlen + sizeof(struct tcphdr)) { + error = ENOPROTOOPT; + goto bad; + } + /* IP and TCP header should be contiguous, this check is paranoia */ + if (m0->m_len < iphlen + sizeof(*th)) { + ml_dequeue(ml); + if ((m0 = m_pullup(m0, iphlen + sizeof(*th))) == NULL) { + error = ENOBUFS; + goto bad; + } + ml_enqueue(ml, m0); + /* m_pullup() may have replaced the first mbuf */ + if (ip != NULL) + ip = mtod(m0, struct ip *); +#ifdef INET6 + if (ip6 != NULL) + ip6 = mtod(m0, struct ip6_hdr *); +#endif + } + th = (struct tcphdr *)(mtod(m0, caddr_t) + iphlen); + hlen = iphlen + (th->th_off << 2); + if (tlen < hlen) { + error = ENOPROTOOPT; + goto bad; + } + firstlen = MIN(tlen - hlen, mss); + + CLR(m0->m_pkthdr.csum_flags, M_TCP_TSO); + + /* + * Loop through length of payload after first segment, + * make new header and copy data of each part and link onto chain. 
+ */ + for (off = hlen + firstlen; off < tlen; off += mss) { + struct mbuf *m; + struct tcphdr *mhth; + int len; + + len = MIN(tlen - off, mss); + + MGETHDR(m, M_DONTWAIT, MT_HEADER); + if (m == NULL) { + error = ENOBUFS; + goto bad; + } + ml_enqueue(ml, m); + if ((error = m_dup_pkthdr(m, m0, M_DONTWAIT)) != 0) + goto bad; + + /* IP and TCP header to the end, space for link layer header */ + m->m_len = hlen; + m_align(m, hlen); + + /* copy and adjust TCP header */ + mhth = (struct tcphdr *)(mtod(m, caddr_t) + iphlen); + memcpy(mhth, th, hlen - iphlen); + mhth->th_seq = htonl(ntohl(th->th_seq) + (off - hlen)); + if (off + len < tlen) + CLR(mhth->th_flags, TH_PUSH|TH_FIN); + + /* add mbuf chain with payload */ + m->m_pkthdr.len = hlen + len; + if ((m->m_next = m_copym(m0, off, len, M_DONTWAIT)) == NULL) { + error = ENOBUFS; + goto bad; + } + + /* copy and adjust IP header, calculate checksum */ + SET(m->m_pkthdr.csum_flags, M_TCP_CSUM_OUT); + mhth->th_sum = 0; + if (ip) { + struct ip *mhip; + + mhip = mtod(m, struct ip *); + *mhip = *ip; + mhip->ip_len = htons(hlen + len); + mhip->ip_id = htons(ip_randomid()); + mhip->ip_sum = 0; + if (ifp && in_ifcap_cksum(m, ifp, IFCAP_CSUM_IPv4)) { + m->m_pkthdr.csum_flags |= M_IPV4_CSUM_OUT; + } else { + ipstat_inc(ips_outswcsum); + mhip->ip_sum = in_cksum(m, iphlen); + } + in_proto_cksum_out(m, ifp); + } +#ifdef INET6 + if (ip6) { + struct ip6_hdr *mhip6; + + mhip6 = mtod(m, struct ip6_hdr *); + *mhip6 = *ip6; + mhip6->ip6_plen = htons(hlen - iphlen + len); + in6_proto_cksum_out(m, ifp); + } +#endif + } + + /* + * Update first segment by trimming what's been copied out + * and updating header, sending the list is left to the caller. 
+ */ + if (hlen + firstlen < tlen) { + m_adj(m0, hlen + firstlen - tlen); + CLR(th->th_flags, TH_PUSH|TH_FIN); + } + /* adjust IP header, calculate checksum */ + SET(m0->m_pkthdr.csum_flags, M_TCP_CSUM_OUT); + th->th_sum = 0; + if (ip) { + ip->ip_len = htons(m0->m_pkthdr.len); + ip->ip_sum = 0; + if (ifp && in_ifcap_cksum(m0, ifp, IFCAP_CSUM_IPv4)) { + m0->m_pkthdr.csum_flags |= M_IPV4_CSUM_OUT; + } else { + ipstat_inc(ips_outswcsum); + ip->ip_sum = in_cksum(m0, iphlen); + } + in_proto_cksum_out(m0, ifp); + } +#ifdef INET6 + if (ip6) { + ip6->ip6_plen = htons(m0->m_pkthdr.len - iphlen); + in6_proto_cksum_out(m0, ifp); + } +#endif + + tcpstat_add(tcps_outpkttso, ml_len(ml)); + return 0; + + bad: + tcpstat_inc(tcps_outbadtso); + ml_purge(ml); + return error; +} diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c index b08f55f00e2..ad8f7eae1b0 100644 --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -1,4 +1,4 @@ -/* $OpenBSD: tcp_subr.c,v 1.190 2022/11/07 11:22:55 yasuoka Exp $ */ +/* $OpenBSD: tcp_subr.c,v 1.191 2023/05/10 12:07:16 bluhm Exp $ */ /* $NetBSD: tcp_subr.c,v 1.22 1996/02/13 23:44:00 christos Exp $ */ /* @@ -119,6 +119,7 @@ int tcp_ack_on_push = 0; /* set to enable immediate ACK-on-PUSH */ int tcp_do_ecn = 0; /* RFC3168 ECN enabled/disabled? 
*/ #endif int tcp_do_rfc3390 = 2; /* Increase TCP's Initial Window to 10*mss */ +int tcp_do_tso = 1; /* TCP segmentation offload for output */ #ifndef TCB_INITIAL_HASH_SIZE #define TCB_INITIAL_HASH_SIZE 128 diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c index c676a5782b2..120e3cc5ea7 100644 --- a/sys/netinet/tcp_usrreq.c +++ b/sys/netinet/tcp_usrreq.c @@ -1,4 +1,4 @@ -/* $OpenBSD: tcp_usrreq.c,v 1.217 2023/03/14 00:24:05 yasuoka Exp $ */ +/* $OpenBSD: tcp_usrreq.c,v 1.218 2023/05/10 12:07:16 bluhm Exp $ */ /* $NetBSD: tcp_usrreq.c,v 1.20 1996/02/13 23:44:16 christos Exp $ */ /* @@ -166,6 +166,7 @@ const struct sysctl_bounded_args tcpctl_vars[] = { { TCPCTL_SYN_BUCKET_LIMIT, &tcp_syn_bucket_limit, 1, INT_MAX }, { TCPCTL_RFC3390, &tcp_do_rfc3390, 0, 2 }, { TCPCTL_ALWAYS_KEEPALIVE, &tcp_always_keepalive, 0, 1 }, + { TCPCTL_TSO, &tcp_do_tso, 0, 1 }, }; struct inpcbtable tcbtable; @@ -1335,6 +1336,10 @@ tcp_sysctl_tcpstat(void *oldp, size_t *oldlenp, void *newp) ASSIGN(tcps_sack_rcv_opts); ASSIGN(tcps_sack_snd_opts); ASSIGN(tcps_sack_drop_opts); + ASSIGN(tcps_outswtso); + ASSIGN(tcps_outhwtso); + ASSIGN(tcps_outpkttso); + ASSIGN(tcps_outbadtso); #undef ASSIGN @@ -1494,8 +1499,8 @@ tcp_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, default: NET_LOCK(); - error = sysctl_bounded_arr(tcpctl_vars, nitems(tcpctl_vars), name, - namelen, oldp, oldlenp, newp, newlen); + error = sysctl_bounded_arr(tcpctl_vars, nitems(tcpctl_vars), + name, namelen, oldp, oldlenp, newp, newlen); NET_UNLOCK(); return (error); } diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h index 72424e0eff0..d8cfc12c517 100644 --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -1,4 +1,4 @@ -/* $OpenBSD: tcp_var.h,v 1.163 2023/03/14 00:24:05 yasuoka Exp $ */ +/* $OpenBSD: tcp_var.h,v 1.164 2023/05/10 12:07:16 bluhm Exp $ */ /* $NetBSD: tcp_var.h,v 1.17 1996/02/13 23:44:24 christos Exp $ */ /* @@ -442,6 +442,11 @@ struct tcpstat { u_int64_t 
tcps_sack_rcv_opts; /* SACK options received */ u_int64_t tcps_sack_snd_opts; /* SACK options sent */ u_int64_t tcps_sack_drop_opts; /* SACK options dropped */ + + u_int32_t tcps_outswtso; /* output tso chopped in software */ + u_int32_t tcps_outhwtso; /* output tso processed by hardware */ + u_int32_t tcps_outpkttso; /* packets generated by tso */ + u_int32_t tcps_outbadtso; /* output tso failed, packet dropped */ }; /* @@ -473,7 +478,8 @@ struct tcpstat { #define TCPCTL_SYN_USE_LIMIT 23 /* number of uses before reseeding hash */ #define TCPCTL_ROOTONLY 24 /* return root only port bitmap */ #define TCPCTL_SYN_HASH_SIZE 25 /* number of buckets in the hash */ -#define TCPCTL_MAXID 26 +#define TCPCTL_TSO 26 /* enable TCP segmentation offload */ +#define TCPCTL_MAXID 27 #define TCPCTL_NAMES { \ { 0, 0 }, \ @@ -500,8 +506,9 @@ struct tcpstat { { "stats", CTLTYPE_STRUCT }, \ { "always_keepalive", CTLTYPE_INT }, \ { "synuselimit", CTLTYPE_INT }, \ - { "rootonly", CTLTYPE_STRUCT }, \ + { "rootonly", CTLTYPE_STRUCT }, \ { "synhashsize", CTLTYPE_INT }, \ + { "tso", CTLTYPE_INT }, \ } struct tcp_ident_mapping { @@ -614,6 +621,10 @@ enum tcpstat_counters { tcps_sack_rcv_opts, tcps_sack_snd_opts, tcps_sack_drop_opts, + tcps_outswtso, + tcps_outhwtso, + tcps_outpkttso, + tcps_outbadtso, tcps_ncounters, }; @@ -665,6 +676,7 @@ extern struct pool sackhl_pool; extern int tcp_sackhole_limit; /* max entries for tcp sack queues */ extern int tcp_do_ecn; /* RFC3168 ECN enabled/disabled? 
*/ extern int tcp_do_rfc3390; /* RFC3390 Increasing TCP's Initial Window */ +extern int tcp_do_tso; /* enable TSO for TCP output packets */ extern struct pool tcpqe_pool; extern int tcp_reass_limit; /* max entries for tcp reass queues */ @@ -706,6 +718,7 @@ struct tcpcb * tcp_newtcpcb(struct inpcb *, int); void tcp_notify(struct inpcb *, int); int tcp_output(struct tcpcb *); +int tcp_chopper(struct mbuf *, struct mbuf_list *, struct ifnet *, u_int); void tcp_pulloutofband(struct socket *, u_int, struct mbuf *, int); int tcp_reass(struct tcpcb *, struct tcphdr *, struct mbuf *, int *); void tcp_rscale(struct tcpcb *, u_long); diff --git a/sys/netinet6/ip6_output.c b/sys/netinet6/ip6_output.c index dfa2a3c5245..b14ffbe4106 100644 --- a/sys/netinet6/ip6_output.c +++ b/sys/netinet6/ip6_output.c @@ -1,4 +1,4 @@ -/* $OpenBSD: ip6_output.c,v 1.274 2023/05/08 13:22:13 bluhm Exp $ */ +/* $OpenBSD: ip6_output.c,v 1.275 2023/05/10 12:07:17 bluhm Exp $ */ /* $KAME: ip6_output.c,v 1.172 2001/03/25 09:55:56 itojun Exp $ */ /* @@ -686,7 +686,9 @@ reroute: dontfrag = 1; else dontfrag = 0; - if (dontfrag && tlen > ifp->if_mtu) { /* case 2-b */ + if (dontfrag && /* case 2-b, TSO is judged by ph_mss */ + (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO) ? + m->m_pkthdr.ph_mss : tlen) > ifp->if_mtu) { #ifdef IPSEC if (ip_mtudisc) ipsec_adjust_mtu(m, mtu); @@ -698,12 +700,22 @@ reroute: /* * transmit packet without fragmentation */ - if (dontfrag || (tlen <= mtu)) { /* case 1-a and 2-a */ + if (dontfrag || tlen <= mtu) { /* case 1-a and 2-a */ in6_proto_cksum_out(m, ifp); error = ifp->if_output(ifp, m, sin6tosa(dst), ro->ro_rt); goto done; } + if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO) && + m->m_pkthdr.ph_mss <= mtu) { + if ((error = tcp_chopper(m, &ml, ifp, m->m_pkthdr.ph_mss)) || + (error = if_output_ml(ifp, &ml, sin6tosa(dst), ro->ro_rt))) + goto done; + tcpstat_inc(tcps_outswtso); + goto done; + } + CLR(m->m_pkthdr.csum_flags, M_TCP_TSO); + /* * try to fragment the packet. 
case 1-b */ @@ -2829,12 +2841,12 @@ int ip6_output_ipsec_send(struct tdb *tdb, struct mbuf *m, struct route_in6 *ro, int tunalready, int fwd) { -#if NPF > 0 - struct ifnet *encif; -#endif + struct mbuf_list ml; + struct ifnet *encif = NULL; struct ip6_hdr *ip6; struct in6_addr dst; - int error, ifidx, rtableid; + u_int len; + int error, ifidx, rtableid, tso = 0; #if NPF > 0 /* @@ -2854,17 +2866,23 @@ ip6_output_ipsec_send(struct tdb *tdb, struct mbuf *m, struct route_in6 *ro, * Until now the change was not reconsidered. * What's the behaviour? */ - in6_proto_cksum_out(m, encif); #endif - /* Check if we are allowed to fragment */ + /* Check if we can chop the TCP packet */ ip6 = mtod(m, struct ip6_hdr *); + if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO) && + m->m_pkthdr.ph_mss <= tdb->tdb_mtu) { + tso = 1; + len = m->m_pkthdr.ph_mss; + } else + len = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen); + + /* Check if we are allowed to fragment */ dst = ip6->ip6_dst; ifidx = m->m_pkthdr.ph_ifidx; rtableid = m->m_pkthdr.ph_rtableid; if (ip_mtudisc && tdb->tdb_mtu && - sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen) > tdb->tdb_mtu && - tdb->tdb_mtutimeout > gettime()) { + len > tdb->tdb_mtu && tdb->tdb_mtutimeout > gettime()) { int transportmode; transportmode = (tdb->tdb_dst.sa.sa_family == AF_INET6) && @@ -2891,14 +2909,33 @@ ip6_output_ipsec_send(struct tdb *tdb, struct mbuf *m, struct route_in6 *ro, */ m->m_flags &= ~(M_BCAST | M_MCAST); - /* Callee frees mbuf */ + if (tso) { + error = tcp_chopper(m, &ml, encif, len); + if (error) + goto done; + } else { + CLR(m->m_pkthdr.csum_flags, M_TCP_TSO); + in6_proto_cksum_out(m, encif); + ml_init(&ml); + ml_enqueue(&ml, m); + } + KERNEL_LOCK(); - error = ipsp_process_packet(m, tdb, AF_INET6, tunalready); + while ((m = ml_dequeue(&ml)) != NULL) { + /* Callee frees mbuf */ + error = ipsp_process_packet(m, tdb, AF_INET6, tunalready); + if (error) + break; + } KERNEL_UNLOCK(); + done: if (error) { + ml_purge(&ml); 
ipsecstat_inc(ipsec_odrops); tdbstat_inc(tdb, tdb_odrops); } + if (!error && tso) + tcpstat_inc(tcps_outswtso); if (ip_mtudisc && error == EMSGSIZE) ip6_output_ipsec_pmtu_update(tdb, ro, &dst, ifidx, rtableid, 0); return error; diff --git a/sys/sys/mbuf.h b/sys/sys/mbuf.h index 65d5c5b5897..2e479b715d9 100644 --- a/sys/sys/mbuf.h +++ b/sys/sys/mbuf.h @@ -1,4 +1,4 @@ -/* $OpenBSD: mbuf.h,v 1.256 2023/05/05 01:19:51 bluhm Exp $ */ +/* $OpenBSD: mbuf.h,v 1.257 2023/05/10 12:07:17 bluhm Exp $ */ /* $NetBSD: mbuf.h,v 1.19 1996/02/09 18:25:14 christos Exp $ */ /* @@ -129,12 +129,13 @@ struct pkthdr { SLIST_HEAD(, m_tag) ph_tags; /* list of packet tags */ int64_t ph_timestamp; /* packet timestamp */ int len; /* total packet length */ + u_int ph_rtableid; /* routing table id */ + u_int ph_ifidx; /* rcv interface index */ u_int16_t ph_tagsset; /* mtags attached */ u_int16_t ph_flowid; /* pseudo unique flow id */ u_int16_t csum_flags; /* checksum flags */ u_int16_t ether_vtag; /* Ethernet 802.1p+Q vlan tag */ - u_int ph_rtableid; /* routing table id */ - u_int ph_ifidx; /* rcv interface index */ + u_int16_t ph_mss; /* TCP max segment size */ u_int8_t ph_loopcnt; /* mbuf is looping in kernel */ u_int8_t ph_family; /* af, used when queueing */ struct pkthdr_pf pf; @@ -226,6 +227,7 @@ struct mbuf { #define M_IPV6_DF_OUT 0x1000 /* don't fragment outgoing IPv6 */ #define M_TIMESTAMP 0x2000 /* ph_timestamp is set */ #define M_FLOWID 0x4000 /* ph_flowid is set */ +#define M_TCP_TSO 0x8000 /* TCP Segmentation Offload needed */ #ifdef _KERNEL #define MCS_BITS \ diff --git a/usr.bin/netstat/inet.c b/usr.bin/netstat/inet.c index 2157946fc19..e04355ed078 100644 --- a/usr.bin/netstat/inet.c +++ b/usr.bin/netstat/inet.c @@ -1,4 +1,4 @@ -/* $OpenBSD: inet.c,v 1.174 2022/08/12 14:49:15 bluhm Exp $ */ +/* $OpenBSD: inet.c,v 1.175 2023/05/10 12:07:17 bluhm Exp $ */ /* $NetBSD: inet.c,v 1.14 1995/10/03 21:42:37 thorpej Exp $ */ /* @@ -408,6 +408,10 @@ tcp_stats(char *name) p(tcps_sndwinup, 
"\t\t%u window update packet%s\n"); p(tcps_sndctrl, "\t\t%u control packet%s\n"); p(tcps_outswcsum, "\t\t%u packet%s software-checksummed\n"); + p(tcps_outswtso, "\t\t%u output TSO packet%s software chopped\n"); + p(tcps_outhwtso, "\t\t%u output TSO packet%s hardware processed\n"); + p(tcps_outpkttso, "\t\t%u output TSO packet%s generated\n"); + p(tcps_outbadtso, "\t\t%u output TSO packet%s dropped\n"); p(tcps_rcvtotal, "\t%u packet%s received\n"); p2(tcps_rcvackpack, tcps_rcvackbyte, "\t\t%u ack%s (for %llu byte%s)\n"); p(tcps_rcvdupack, "\t\t%u duplicate ack%s\n");