From 5ebaba9d2927ab6dd81457cc6db99291a68437ff Mon Sep 17 00:00:00 2001 From: bluhm Date: Fri, 7 Jul 2023 08:05:02 +0000 Subject: [PATCH] Fix path MTU discovery for TCP LRO/TSO when forwarding. When doing LRO (Large Receive Offload), the drivers, currently ix(4) and lo(4) only, record an upper bound of the size of the original packets in ph_mss. When sending, either stack or hardware must chop the packets with TSO (TCP Segmentation Offload) to that size. That means we have to call tcp_if_output_tso() before ifp->if_output(). Put that logic into if_output_tso() to avoid code duplication. As TCP packets on the wire do not get larger that way, path MTU discovery should still work. tested by and OK jan@ --- sys/net/if.c | 53 +++++++++++++++++++++++++++++++++++++- sys/net/if_var.h | 4 ++- sys/net/pf.c | 23 ++++------------- sys/netinet/ip_output.c | 14 +++------- sys/netinet6/ip6_forward.c | 16 ++---------- sys/netinet6/ip6_output.c | 23 +++++++---------- 6 files changed, 75 insertions(+), 58 deletions(-) diff --git a/sys/net/if.c b/sys/net/if.c index 1cecfdf0224..fd54cc813f3 100644 --- a/sys/net/if.c +++ b/sys/net/if.c @@ -1,4 +1,4 @@ -/* $OpenBSD: if.c,v 1.704 2023/07/06 04:55:04 dlg Exp $ */ +/* $OpenBSD: if.c,v 1.705 2023/07/07 08:05:02 bluhm Exp $ */ /* $NetBSD: if.c,v 1.35 1996/05/07 05:26:04 thorpej Exp $ */ /* @@ -885,6 +885,57 @@ if_output_ml(struct ifnet *ifp, struct mbuf_list *ml, return error; } +int +if_output_tso(struct ifnet *ifp, struct mbuf **mp, struct sockaddr *dst, + struct rtentry *rt, u_int mtu) +{ + uint32_t ifcap; + int error; + + switch (dst->sa_family) { + case AF_INET: + ifcap = IFCAP_TSOv4; + break; +#ifdef INET6 + case AF_INET6: + ifcap = IFCAP_TSOv6; + break; +#endif + default: + unhandled_af(dst->sa_family); + } + + /* + * Try to send with TSO first. When forwarding, LRO may set + * maximum segment size in mbuf header. Chop TCP segment + * even if it would fit interface MTU to preserve maximum + * path MTU. 
+ */ + error = tcp_if_output_tso(ifp, mp, dst, rt, ifcap, mtu); + if (error || *mp == NULL) + return error; + + if ((*mp)->m_pkthdr.len <= mtu) { + switch (dst->sa_family) { + case AF_INET: + in_hdr_cksum_out(*mp, ifp); + in_proto_cksum_out(*mp, ifp); + break; +#ifdef INET6 + case AF_INET6: + in6_proto_cksum_out(*mp, ifp); + break; +#endif + } + error = ifp->if_output(ifp, *mp, dst, rt); + *mp = NULL; + return error; + } + + /* mp still contains mbuf that has to be fragmented or dropped. */ + return 0; +} + int if_output_mq(struct ifnet *ifp, struct mbuf_queue *mq, unsigned int *total, struct sockaddr *dst, struct rtentry *rt) diff --git a/sys/net/if_var.h b/sys/net/if_var.h index a4eabc52ca0..73d6e8ea323 100644 --- a/sys/net/if_var.h +++ b/sys/net/if_var.h @@ -1,4 +1,4 @@ -/* $OpenBSD: if_var.h,v 1.128 2023/06/28 11:49:49 kn Exp $ */ +/* $OpenBSD: if_var.h,v 1.129 2023/07/07 08:05:02 bluhm Exp $ */ /* $NetBSD: if.h,v 1.23 1996/05/07 02:40:27 thorpej Exp $ */ /* @@ -329,6 +329,8 @@ int if_output_ml(struct ifnet *, struct mbuf_list *, struct sockaddr *, struct rtentry *); int if_output_mq(struct ifnet *, struct mbuf_queue *, unsigned int *, struct sockaddr *, struct rtentry *); +int if_output_tso(struct ifnet *, struct mbuf **, struct sockaddr *, + struct rtentry *, u_int); int if_output_local(struct ifnet *, struct mbuf *, sa_family_t); void if_rtrequest_dummy(struct ifnet *, int, struct rtentry *); void p2p_rtrequest(struct ifnet *, int, struct rtentry *); diff --git a/sys/net/pf.c b/sys/net/pf.c index f5ad04dfa8d..eb233a8af37 100644 --- a/sys/net/pf.c +++ b/sys/net/pf.c @@ -1,4 +1,4 @@ -/* $OpenBSD: pf.c,v 1.1182 2023/07/06 04:55:05 dlg Exp $ */ +/* $OpenBSD: pf.c,v 1.1183 2023/07/07 08:05:02 bluhm Exp $ */ /* * Copyright (c) 2001 Daniel Hartmeier @@ -6610,15 +6610,8 @@ pf_route(struct pf_pdesc *pd, struct pf_state *st) ip = mtod(m0, struct ip *); } - if (ntohs(ip->ip_len) <= ifp->if_mtu) { - in_hdr_cksum_out(m0, ifp); - in_proto_cksum_out(m0, ifp); - 
ifp->if_output(ifp, m0, sintosa(dst), rt); - goto done; - } - - if (tcp_if_output_tso(ifp, &m0, sintosa(dst), rt, - IFCAP_TSOv4, ifp->if_mtu) || m0 == NULL) + if (if_output_tso(ifp, &m0, sintosa(dst), rt, ifp->if_mtu) || + m0 == NULL) goto done; /* @@ -6745,14 +6738,8 @@ pf_route6(struct pf_pdesc *pd, struct pf_state *st) goto done; } - if (m0->m_pkthdr.len <= ifp->if_mtu) { - in6_proto_cksum_out(m0, ifp); - ifp->if_output(ifp, m0, sin6tosa(dst), rt); - goto done; - } - - if (tcp_if_output_tso(ifp, &m0, sin6tosa(dst), rt, - IFCAP_TSOv6, ifp->if_mtu) || m0 == NULL) + if (if_output_tso(ifp, &m0, sin6tosa(dst), rt, ifp->if_mtu) || + m0 == NULL) goto done; ip6stat_inc(ip6s_cantfrag); diff --git a/sys/netinet/ip_output.c b/sys/netinet/ip_output.c index 0e56ffb3f11..1094c495ae0 100644 --- a/sys/netinet/ip_output.c +++ b/sys/netinet/ip_output.c @@ -1,4 +1,4 @@ -/* $OpenBSD: ip_output.c,v 1.389 2023/07/04 10:48:19 bluhm Exp $ */ +/* $OpenBSD: ip_output.c,v 1.390 2023/07/07 08:05:02 bluhm Exp $ */ /* $NetBSD: ip_output.c,v 1.28 1996/02/13 23:43:07 christos Exp $ */ /* @@ -451,17 +451,9 @@ sendit: #endif /* - * If small enough for interface, can just send directly. + * If TSO or small enough for interface, can just send directly. 
*/ - if (ntohs(ip->ip_len) <= mtu) { - in_hdr_cksum_out(m, ifp); - in_proto_cksum_out(m, ifp); - error = ifp->if_output(ifp, m, sintosa(dst), ro->ro_rt); - goto done; - } - - error = tcp_if_output_tso(ifp, &m, sintosa(dst), ro->ro_rt, - IFCAP_TSOv4, mtu); + error = if_output_tso(ifp, &m, sintosa(dst), ro->ro_rt, mtu); if (error || m == NULL) goto done; diff --git a/sys/netinet6/ip6_forward.c b/sys/netinet6/ip6_forward.c index db68b32eca8..7de1e0ec701 100644 --- a/sys/netinet6/ip6_forward.c +++ b/sys/netinet6/ip6_forward.c @@ -1,4 +1,4 @@ -/* $OpenBSD: ip6_forward.c,v 1.111 2023/06/16 19:18:56 bluhm Exp $ */ +/* $OpenBSD: ip6_forward.c,v 1.112 2023/07/07 08:05:02 bluhm Exp $ */ /* $KAME: ip6_forward.c,v 1.75 2001/06/29 12:42:13 jinmei Exp $ */ /* @@ -319,8 +319,7 @@ reroute: } #endif - error = tcp_if_output_tso(ifp, &m, sin6tosa(sin6), rt, IFCAP_TSOv6, - ifp->if_mtu); + error = if_output_tso(ifp, &m, sin6tosa(sin6), rt, ifp->if_mtu); if (error) ip6stat_inc(ip6s_cantforward); else if (m == NULL) @@ -328,17 +327,6 @@ reroute: if (error || m == NULL) goto senderr; - /* Check the size after pf_test to give pf a chance to refragment. 
*/ - if (m->m_pkthdr.len <= ifp->if_mtu) { - in6_proto_cksum_out(m, ifp); - error = ifp->if_output(ifp, m, sin6tosa(sin6), rt); - if (error) - ip6stat_inc(ip6s_cantforward); - else - ip6stat_inc(ip6s_forward); - goto senderr; - } - if (mcopy != NULL) icmp6_error(mcopy, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu); m_freem(m); diff --git a/sys/netinet6/ip6_output.c b/sys/netinet6/ip6_output.c index f2fa3511943..8057f26e8aa 100644 --- a/sys/netinet6/ip6_output.c +++ b/sys/netinet6/ip6_output.c @@ -1,4 +1,4 @@ -/* $OpenBSD: ip6_output.c,v 1.278 2023/06/13 19:34:12 bluhm Exp $ */ +/* $OpenBSD: ip6_output.c,v 1.279 2023/07/07 08:05:02 bluhm Exp $ */ /* $KAME: ip6_output.c,v 1.172 2001/03/25 09:55:56 itojun Exp $ */ /* @@ -677,7 +677,8 @@ reroute: * 2-a: send as is if tlen <= interface mtu * 2-b: error if tlen > interface mtu */ - tlen = m->m_pkthdr.len; + tlen = ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO) ? + m->m_pkthdr.ph_mss : m->m_pkthdr.len; if (ISSET(m->m_pkthdr.csum_flags, M_IPV6_DF_OUT)) { CLR(m->m_pkthdr.csum_flags, M_IPV6_DF_OUT); @@ -686,9 +687,8 @@ reroute: dontfrag = 1; else dontfrag = 0; - if (dontfrag && /* case 2-b */ - (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO) ? - m->m_pkthdr.ph_mss : tlen) > ifp->if_mtu) { + + if (dontfrag && tlen > ifp->if_mtu) { /* case 2-b */ #ifdef IPSEC if (ip_mtudisc) ipsec_adjust_mtu(m, mtu); @@ -701,16 +701,13 @@ reroute: * transmit packet without fragmentation */ if (dontfrag || tlen <= mtu) { /* case 1-a and 2-a */ - in6_proto_cksum_out(m, ifp); - error = ifp->if_output(ifp, m, sin6tosa(dst), ro->ro_rt); - goto done; + error = if_output_tso(ifp, &m, sin6tosa(dst), ro->ro_rt, + ifp->if_mtu); + if (error || m == NULL) + goto done; + goto bad; /* should not happen */ } - error = tcp_if_output_tso(ifp, &m, sin6tosa(dst), ro->ro_rt, - IFCAP_TSOv6, mtu); - if (error || m == NULL) - goto done; - /* * try to fragment the packet. case 1-b */ -- 2.20.1