Use TCP Large Receive Offload in vmx(4).
authorjan <jan@openbsd.org>
Fri, 7 Jun 2024 08:44:25 +0000 (08:44 +0000)
committerjan <jan@openbsd.org>
Fri, 7 Jun 2024 08:44:25 +0000 (08:44 +0000)
tested by Hrvoje Popovski and bluhm@

ok bluhm@

sys/dev/pci/if_vmx.c
sys/dev/pci/if_vmxreg.h

index 0c5883a..e70d9b5 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: if_vmx.c,v 1.86 2024/05/21 19:49:06 jan Exp $ */
+/*     $OpenBSD: if_vmx.c,v 1.87 2024/06/07 08:44:25 jan Exp $ */
 
 /*
  * Copyright (c) 2013 Tsubai Masanari
@@ -114,6 +114,8 @@ struct vmxnet3_comp_ring {
        };
        u_int next;
        u_int32_t gen;
+       struct mbuf *sendmp;
+       struct mbuf *lastmp;
 };
 
 struct vmxnet3_txqueue {
@@ -160,6 +162,7 @@ struct vmxnet3_softc {
        struct vmxnet3_queue *sc_q;
        struct intrmap *sc_intrmap;
 
+       u_int sc_vrrs;
        struct vmxnet3_driver_shared *sc_ds;
        u_int8_t *sc_mcast;
        struct vmxnet3_upt1_rss_conf *sc_rss;
@@ -170,7 +173,7 @@ struct vmxnet3_softc {
 #endif
 };
 
-#define JUMBO_LEN (1024 * 9)
+#define JUMBO_LEN ((16 * 1024) - 1)
 #define DMAADDR(map) ((map)->dm_segs[0].ds_addr)
 
 #define READ_BAR0(sc, reg) bus_space_read_4((sc)->sc_iot0, (sc)->sc_ioh0, reg)
@@ -273,15 +276,21 @@ vmxnet3_attach(struct device *parent, struct device *self, void *aux)
                return;
        }
 
+       /* Vmxnet3 Revision Report and Selection */
        ver = READ_BAR1(sc, VMXNET3_BAR1_VRRS);
-       if ((ver & 0x1) == 0) {
+       if (ISSET(ver, 0x2)) {
+               sc->sc_vrrs = 2;
+       } else if (ISSET(ver, 0x1)) {
+               sc->sc_vrrs = 1;
+       } else {
                printf(": unsupported hardware version 0x%x\n", ver);
                return;
        }
-       WRITE_BAR1(sc, VMXNET3_BAR1_VRRS, 1);
+       WRITE_BAR1(sc, VMXNET3_BAR1_VRRS, sc->sc_vrrs);
 
+       /* UPT Version Report and Selection */
        ver = READ_BAR1(sc, VMXNET3_BAR1_UVRS);
-       if ((ver & 0x1) == 0) {
+       if (!ISSET(ver, 0x1)) {
                printf(": incompatible UPT version 0x%x\n", ver);
                return;
        }
@@ -410,6 +419,11 @@ vmxnet3_attach(struct device *parent, struct device *self, void *aux)
 
        ifp->if_capabilities |= IFCAP_TSOv4 | IFCAP_TSOv6;
 
+       if (sc->sc_vrrs == 2) {
+               ifp->if_xflags |= IFXF_LRO;
+               ifp->if_capabilities |= IFCAP_LRO;
+       }
+
 #if NVLAN > 0
        if (sc->sc_ds->upt_features & UPT1_F_VLAN)
                ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING;
@@ -704,6 +718,10 @@ vmxnet3_rxfill(struct vmxnet3_rxring *ring)
        uint32_t rgen;
        uint32_t type = htole32(VMXNET3_BTYPE_HEAD << VMXNET3_RX_BTYPE_S);
 
+       /* Second ring just contains packet bodies. */
+       if (ring->rid == 1)
+               type = htole32(VMXNET3_BTYPE_BODY << VMXNET3_RX_BTYPE_S);
+
        MUTEX_ASSERT_LOCKED(&ring->mtx);
 
        slots = if_rxr_get(&ring->rxr, NRXDESC);
@@ -781,17 +799,17 @@ vmxnet3_rxinit(struct vmxnet3_softc *sc, struct vmxnet3_rxqueue *rq)
                    VMX_DMA_LEN(&ring->dmamem));
                bus_dmamap_sync(sc->sc_dmat, VMX_DMA_MAP(&ring->dmamem),
                    0, VMX_DMA_LEN(&ring->dmamem), BUS_DMASYNC_PREWRITE);
-       }
 
-       /* XXX only fill ring 0 */
-       ring = &rq->cmd_ring[0];
-       mtx_enter(&ring->mtx);
-       vmxnet3_rxfill(ring);
-       mtx_leave(&ring->mtx);
+               mtx_enter(&ring->mtx);
+               vmxnet3_rxfill(ring);
+               mtx_leave(&ring->mtx);
+       }
 
        comp_ring = &rq->comp_ring;
        comp_ring->next = 0;
        comp_ring->gen = VMX_RXC_GEN;
+       comp_ring->sendmp = NULL;
+       comp_ring->lastmp = NULL;
 
        memset(VMX_DMA_KVA(&comp_ring->dmamem), 0,
            VMX_DMA_LEN(&comp_ring->dmamem));
@@ -1074,9 +1092,9 @@ vmxnet3_rxintr(struct vmxnet3_softc *sc, struct vmxnet3_rxqueue *rq)
        struct mbuf_list ml = MBUF_LIST_INITIALIZER();
        struct mbuf *m;
        bus_dmamap_t map;
-       unsigned int idx, len;
+       unsigned int idx;
        unsigned int next, rgen;
-       unsigned int done = 0;
+       unsigned int rid, done[2] = {0, 0};
 
        next = comp_ring->next;
        rgen = comp_ring->gen;
@@ -1096,11 +1114,14 @@ vmxnet3_rxintr(struct vmxnet3_softc *sc, struct vmxnet3_rxqueue *rq)
 
                idx = letoh32((rxcd->rxc_word0 >> VMXNET3_RXC_IDX_S) &
                    VMXNET3_RXC_IDX_M);
+
                if (letoh32((rxcd->rxc_word0 >> VMXNET3_RXC_QID_S) &
                    VMXNET3_RXC_QID_M) < sc->sc_nqueues)
-                       ring = &rq->cmd_ring[0];
+                       rid = 0;
                else
-                       ring = &rq->cmd_ring[1];
+                       rid = 1;
+
+               ring = &rq->cmd_ring[rid];
 
                m = ring->m[idx];
                KASSERT(m != NULL);
@@ -1111,31 +1132,62 @@ vmxnet3_rxintr(struct vmxnet3_softc *sc, struct vmxnet3_rxqueue *rq)
                    BUS_DMASYNC_POSTREAD);
                bus_dmamap_unload(sc->sc_dmat, map);
 
-               done++;
+               done[rid]++;
+
+               /*
+                * A receive descriptor of type 4 which is flaged as start of
+                * packet, contains the number of TCP segment of an LRO packet.
+                */
+               if (letoh32((rxcd->rxc_word3 & VMXNET3_RXC_TYPE_M) >>
+                   VMXNET3_RXC_TYPE_S) == 4 &&
+                   ISSET(rxcd->rxc_word0, VMXNET3_RXC_SOP)) {
+                       m->m_pkthdr.ph_mss = letoh32(rxcd->rxc_word1 &
+                           VMXNET3_RXC_SEG_CNT_M);
+               }
+
+               m->m_len = letoh32((rxcd->rxc_word2 >> VMXNET3_RXC_LEN_S) &
+                   VMXNET3_RXC_LEN_M);
+
+               if (comp_ring->sendmp == NULL) {
+                       comp_ring->sendmp = comp_ring->lastmp = m;
+                       comp_ring->sendmp->m_pkthdr.len = 0;
+               } else {
+                       CLR(m->m_flags, M_PKTHDR);
+                       comp_ring->lastmp->m_next = m;
+                       comp_ring->lastmp = m;
+               }
+               comp_ring->sendmp->m_pkthdr.len += m->m_len;
+
+               if (!ISSET(rxcd->rxc_word0, VMXNET3_RXC_EOP))
+                       continue;
+
+               /*
+                * End of Packet
+                */
 
                if (letoh32(rxcd->rxc_word2 & VMXNET3_RXC_ERROR)) {
                        ifp->if_ierrors++;
-                       m_freem(m);
+                       m_freem(comp_ring->sendmp);
+                       comp_ring->sendmp = comp_ring->lastmp = NULL;
                        continue;
                }
 
-               len = letoh32((rxcd->rxc_word2 >> VMXNET3_RXC_LEN_S) &
-                   VMXNET3_RXC_LEN_M);
-               if (len < VMXNET3_MIN_MTU) {
-                       m_freem(m);
+               if (comp_ring->sendmp->m_pkthdr.len < VMXNET3_MIN_MTU) {
+                       m_freem(comp_ring->sendmp);
+                       comp_ring->sendmp = comp_ring->lastmp = NULL;
                        continue;
                }
-               m->m_pkthdr.len = m->m_len = len;
-
-               vmxnet3_rx_offload(rxcd, m);
 
                if (((letoh32(rxcd->rxc_word0) >> VMXNET3_RXC_RSSTYPE_S) &
                    VMXNET3_RXC_RSSTYPE_M) != VMXNET3_RXC_RSSTYPE_NONE) {
-                       m->m_pkthdr.ph_flowid = letoh32(rxcd->rxc_word1);
-                       SET(m->m_pkthdr.csum_flags, M_FLOWID);
+                       comp_ring->sendmp->m_pkthdr.ph_flowid =
+                           letoh32(rxcd->rxc_word1);
+                       SET(comp_ring->sendmp->m_pkthdr.csum_flags, M_FLOWID);
                }
 
-               ml_enqueue(&ml, m);
+               vmxnet3_rx_offload(rxcd, comp_ring->sendmp);
+               ml_enqueue(&ml, comp_ring->sendmp);
+               comp_ring->sendmp = comp_ring->lastmp = NULL;
        }
 
        bus_dmamap_sync(sc->sc_dmat, VMX_DMA_MAP(&comp_ring->dmamem),
@@ -1144,19 +1196,20 @@ vmxnet3_rxintr(struct vmxnet3_softc *sc, struct vmxnet3_rxqueue *rq)
        comp_ring->next = next;
        comp_ring->gen = rgen;
 
-       if (done == 0)
-               return;
+       for (int i = 0; i < 2; i++) {
+               if (done[i] == 0)
+                       continue;
 
-       ring = &rq->cmd_ring[0];
+               ring = &rq->cmd_ring[i];
 
-       if (ifiq_input(rq->ifiq, &ml))
-               if_rxr_livelocked(&ring->rxr);
+               if (ifiq_input(rq->ifiq, &ml))
+                       if_rxr_livelocked(&ring->rxr);
 
-       /* XXX Should we (try to) allocate buffers for ring 2 too? */
-       mtx_enter(&ring->mtx);
-       if_rxr_put(&ring->rxr, done);
-       vmxnet3_rxfill(ring);
-       mtx_leave(&ring->mtx);
+               mtx_enter(&ring->mtx);
+               if_rxr_put(&ring->rxr, done[i]);
+               vmxnet3_rxfill(ring);
+               mtx_leave(&ring->mtx);
+       }
 }
 
 void
@@ -1211,6 +1264,8 @@ vmxnet3_iff(struct vmxnet3_softc *sc)
 void
 vmxnet3_rx_offload(struct vmxnet3_rxcompdesc *rxcd, struct mbuf *m)
 {
+       uint32_t pkts;
+
        /*
         * VLAN Offload
         */
@@ -1243,6 +1298,45 @@ vmxnet3_rx_offload(struct vmxnet3_rxcompdesc *rxcd, struct mbuf *m)
                else if (ISSET(rxcd->rxc_word3, VMXNET3_RXC_UDP))
                        SET(m->m_pkthdr.csum_flags, M_UDP_CSUM_IN_OK);
        }
+
+       /*
+        * TCP Large Receive Offload
+        */
+
+       pkts = m->m_pkthdr.ph_mss;
+       m->m_pkthdr.ph_mss = 0;
+
+       if (pkts > 1) {
+               struct ether_extracted ext;
+               uint32_t paylen;
+
+               ether_extract_headers(m, &ext);
+
+               paylen = ext.iplen;
+               if (ext.ip4 || ext.ip6)
+                       paylen -= ext.iphlen;
+
+               if (ext.tcp) {
+                       paylen -= ext.tcphlen;
+                       tcpstat_inc(tcps_inhwlro);
+                       tcpstat_add(tcps_inpktlro, pkts);
+               } else {
+                       tcpstat_inc(tcps_inbadlro);
+               }
+
+               /*
+                * If we gonna forward this packet, we have to mark it as TSO,
+                * set a correct mss, and recalculate the TCP checksum.
+                */
+               if (ext.tcp && paylen >= pkts) {
+                       SET(m->m_pkthdr.csum_flags, M_TCP_TSO);
+                       m->m_pkthdr.ph_mss = paylen / pkts;
+               }
+               if (ext.tcp &&
+                   ISSET(m->m_pkthdr.csum_flags, M_TCP_CSUM_IN_OK)) {
+                       SET(m->m_pkthdr.csum_flags, M_TCP_CSUM_OUT);
+               }
+       }
 }
 
 void
@@ -1309,6 +1403,13 @@ vmxnet3_init(struct vmxnet3_softc *sc)
                return EIO;
        }
 
+       /* TCP Large Receive Offload */
+       if (ISSET(ifp->if_xflags, IFXF_LRO))
+               SET(sc->sc_ds->upt_features, UPT1_F_LRO);
+       else
+               CLR(sc->sc_ds->upt_features, UPT1_F_LRO);
+       WRITE_CMD(sc, VMXNET3_CMD_SET_FEATURE);
+
        /* Program promiscuous mode and multicast filters. */
        vmxnet3_iff(sc);
 
index 44f5e43..a697856 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: if_vmxreg.h,v 1.9 2020/07/07 01:36:49 dlg Exp $       */
+/*     $OpenBSD: if_vmxreg.h,v 1.10 2024/06/07 08:44:25 jan Exp $      */
 
 /*
  * Copyright (c) 2013 Tsubai Masanari
@@ -76,6 +76,7 @@ enum UPT1_RxStats {
 #define VMXNET3_CMD_RESET      0xcafe0002      /* reset device */
 #define VMXNET3_CMD_SET_RXMODE 0xcafe0003      /* set interface flags */
 #define VMXNET3_CMD_SET_FILTER 0xcafe0004      /* set address filter */
+#define VMXNET3_CMD_SET_FEATURE        0xcafe0009      /* set features */
 #define VMXNET3_CMD_GET_STATUS 0xf00d0000      /* get queue errors */
 #define VMXNET3_CMD_GET_STATS  0xf00d0001
 #define VMXNET3_CMD_GET_LINK   0xf00d0002      /* get link status */
@@ -189,6 +190,7 @@ struct vmxnet3_rxcompdesc {
        u_int32_t               rxc_word1;
 #define VMXNET3_RXC_RSSHASH_M  0xffffffff      /* RSS hash value */
 #define VMXNET3_RXC_RSSHASH_S  0
+#define VMXNET3_RXC_SEG_CNT_M  0x000000ff      /* No. of seg. in LRO pkt */
 
        u_int32_t               rxc_word2;
 #define VMXNET3_RXC_LEN_M      0x00003fff
@@ -210,6 +212,7 @@ struct vmxnet3_rxcompdesc {
 #define VMXNET3_RXC_FRAGMENT   0x00400000      /* IP fragment */
 #define VMXNET3_RXC_FCS                0x00800000      /* frame CRC correct */
 #define VMXNET3_RXC_TYPE_M     0x7f000000
+#define VMXNET3_RXC_TYPE_S     24
 #define VMXNET3_RXC_GEN_M      0x00000001U
 #define VMXNET3_RXC_GEN_S      31
 } __packed;