make if_pfsync.c a better friend with PF_LOCK
author sashan <sashan@openbsd.org>
Thu, 4 Feb 2021 00:55:41 +0000 (00:55 +0000)
committer sashan <sashan@openbsd.org>
Thu, 4 Feb 2021 00:55:41 +0000 (00:55 +0000)
The code delivered in this change is currently disabled. Brave souls
may enable it by adding -DWITH_PF_LOCK when building a customized
kernel. Big thanks go to Hrvoje@ for providing test equipment and
testing.

As soon as we enter the next release cycle, WITH_PF_LOCK will be
defined as a default option for MP kernels.

OK dlg@
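
For reference, a minimal sketch of how one of those brave souls might
enable the new code in a custom MP kernel config. The option line mirrors
the commented-out entry this commit adds to GENERIC.MP; the config file
name CUSTOM.MP is only an example, not part of the commit:

	# /sys/arch/amd64/conf/CUSTOM.MP (hypothetical custom config)
	include "arch/amd64/conf/GENERIC"

	option	WITH_PF_LOCK		# build pf/pfsync with the new locking code
	option	MULTIPROCESSOR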

sys/arch/amd64/conf/GENERIC.MP
sys/net/if_pfsync.c
sys/net/if_pfsync.h
sys/net/pf.c

diff --git a/sys/arch/amd64/conf/GENERIC.MP b/sys/arch/amd64/conf/GENERIC.MP
index 7f195a5..5575ffe 100644
--- a/sys/arch/amd64/conf/GENERIC.MP
+++ b/sys/arch/amd64/conf/GENERIC.MP
@@ -1,7 +1,8 @@
-#      $OpenBSD: GENERIC.MP,v 1.14 2018/07/13 05:25:24 tb Exp $
+#      $OpenBSD: GENERIC.MP,v 1.15 2021/02/04 00:55:41 sashan Exp $
 
 include "arch/amd64/conf/GENERIC"
 
+#option        WITH_PF_LOCK
 option MULTIPROCESSOR
 #option        MP_LOCKDEBUG
 #option        WITNESS
diff --git a/sys/net/if_pfsync.c b/sys/net/if_pfsync.c
index 2b094dd..b5fe9fb 100644
--- a/sys/net/if_pfsync.c
+++ b/sys/net/if_pfsync.c
@@ -1,4 +1,4 @@
-/*     $OpenBSD: if_pfsync.c,v 1.282 2021/02/01 00:31:05 dlg Exp $     */
+/*     $OpenBSD: if_pfsync.c,v 1.283 2021/02/04 00:55:41 sashan Exp $  */
 
 /*
  * Copyright (c) 2002 Michael Shalayeff
@@ -210,9 +210,11 @@ struct pfsync_softc {
        struct ip                sc_template;
 
        struct pf_state_queue    sc_qs[PFSYNC_S_COUNT];
+       struct mutex             sc_mtx[PFSYNC_S_COUNT];
        size_t                   sc_len;
 
        struct pfsync_upd_reqs   sc_upd_req_list;
+       struct mutex             sc_upd_req_mtx;
 
        int                      sc_initial_bulk;
        int                      sc_link_demoted;
@@ -220,6 +222,7 @@ struct pfsync_softc {
        int                      sc_defer;
        struct pfsync_deferrals  sc_deferrals;
        u_int                    sc_deferred;
+       struct mutex             sc_deferrals_mtx;
 
        void                    *sc_plus;
        size_t                   sc_pluslen;
@@ -234,6 +237,7 @@ struct pfsync_softc {
        struct timeout           sc_bulk_tmo;
 
        TAILQ_HEAD(, tdb)        sc_tdb_q;
+       struct mutex             sc_tdb_mtx;
 
        struct task              sc_ltask;
        struct task              sc_dtask;
@@ -241,6 +245,16 @@ struct pfsync_softc {
        struct timeout           sc_tmo;
 };
 
+struct pfsync_snapshot {
+       struct pfsync_softc     *sn_sc;
+       struct pf_state_queue    sn_qs[PFSYNC_S_COUNT];
+       struct pfsync_upd_reqs   sn_upd_req_list;
+       TAILQ_HEAD(, tdb)        sn_tdb_q;
+       size_t                   sn_len;
+       void                    *sn_plus;
+       size_t                   sn_pluslen;
+};
+
 struct pfsync_softc    *pfsyncif = NULL;
 struct cpumem          *pfsynccounters;
 
@@ -276,6 +290,10 @@ void       pfsync_bulk_start(void);
 void   pfsync_bulk_status(u_int8_t);
 void   pfsync_bulk_update(void *);
 void   pfsync_bulk_fail(void *);
+
+void   pfsync_grab_snapshot(struct pfsync_snapshot *, struct pfsync_softc *);
+void   pfsync_drop_snapshot(struct pfsync_snapshot *);
+
 #ifdef WITH_PF_LOCK
 void   pfsync_send_dispatch(void *);
 void   pfsync_send_pkt(struct mbuf *);
@@ -307,6 +325,13 @@ pfsync_clone_create(struct if_clone *ifc, int unit)
        struct pfsync_softc *sc;
        struct ifnet *ifp;
        int q;
+       static const char *mtx_names[] = {
+               "iack_mtx",
+               "upd_c_mtx",
+               "del_mtx",
+               "ins_mtx",
+               "upd_mtx",
+               "" };
 
        if (unit != 0)
                return (EINVAL);
@@ -314,18 +339,23 @@ pfsync_clone_create(struct if_clone *ifc, int unit)
        pfsync_sync_ok = 1;
 
        sc = malloc(sizeof(*pfsyncif), M_DEVBUF, M_WAITOK|M_ZERO);
-       for (q = 0; q < PFSYNC_S_COUNT; q++)
+       for (q = 0; q < PFSYNC_S_COUNT; q++) {
                TAILQ_INIT(&sc->sc_qs[q]);
+               mtx_init_flags(&sc->sc_mtx[q], IPL_SOFTNET, mtx_names[q], 0);
+       }
 
        pool_init(&sc->sc_pool, PFSYNC_PLSIZE, 0, IPL_SOFTNET, 0, "pfsync",
            NULL);
        TAILQ_INIT(&sc->sc_upd_req_list);
+       mtx_init(&sc->sc_upd_req_mtx, IPL_SOFTNET);
        TAILQ_INIT(&sc->sc_deferrals);
+       mtx_init(&sc->sc_deferrals_mtx, IPL_SOFTNET);
        task_set(&sc->sc_ltask, pfsync_syncdev_state, sc);
        task_set(&sc->sc_dtask, pfsync_ifdetach, sc);
        sc->sc_deferred = 0;
 
        TAILQ_INIT(&sc->sc_tdb_q);
+       mtx_init(&sc->sc_tdb_mtx, IPL_SOFTNET);
 
        sc->sc_len = PFSYNC_MINPKT;
        sc->sc_maxupdates = 128;
@@ -370,6 +400,7 @@ pfsync_clone_destroy(struct ifnet *ifp)
        struct pfsync_softc *sc = ifp->if_softc;
        struct ifnet *ifp0;
        struct pfsync_deferral *pd;
+       struct pfsync_deferrals  deferrals;
 
        NET_LOCK();
 
@@ -392,10 +423,18 @@ pfsync_clone_destroy(struct ifnet *ifp)
 
        pfsync_drop(sc);
 
-       while (sc->sc_deferred > 0) {
-               pd = TAILQ_FIRST(&sc->sc_deferrals);
-               timeout_del(&pd->pd_tmo);
-               pfsync_undefer(pd, 0);
+       if (sc->sc_deferred > 0) {
+               TAILQ_INIT(&deferrals);
+               mtx_enter(&sc->sc_deferrals_mtx);
+               TAILQ_CONCAT(&deferrals, &sc->sc_deferrals, pd_entry);
+               sc->sc_deferred = 0;
+               mtx_leave(&sc->sc_deferrals_mtx);
+
+               while (!TAILQ_EMPTY(&deferrals)) {
+                       pd = TAILQ_FIRST(&deferrals);
+                       TAILQ_REMOVE(&deferrals, pd, pd_entry);
+                       pfsync_undefer(pd, 0);
+               }
        }
 
        pfsyncif = NULL;
@@ -786,10 +825,8 @@ pfsync_input(struct mbuf **mp, int *offp, int proto, int af)
                        return IPPROTO_DONE;
                }
 
-               PF_LOCK();
                e = pfsync_acts[subh.action].in(n->m_data + noff, mlen, count,
                    flags);
-               PF_UNLOCK();
                if (e != 0)
                        goto done;
 
@@ -810,6 +847,7 @@ pfsync_in_clr(caddr_t buf, int len, int count, int flags)
        u_int32_t creatorid;
        int i;
 
+       PF_LOCK();
        for (i = 0; i < count; i++) {
                clr = (struct pfsync_clr *)buf + len * i;
                kif = NULL;
@@ -818,6 +856,7 @@ pfsync_in_clr(caddr_t buf, int len, int count, int flags)
                    (kif = pfi_kif_find(clr->ifname)) == NULL)
                        continue;
 
+               PF_STATE_ENTER_WRITE();
                for (st = RB_MIN(pf_state_tree_id, &tree_id); st; st = nexts) {
                        nexts = RB_NEXT(pf_state_tree_id, &tree_id, st);
                        if (st->creatorid == creatorid &&
@@ -826,7 +865,9 @@ pfsync_in_clr(caddr_t buf, int len, int count, int flags)
                                pf_remove_state(st);
                        }
                }
+               PF_STATE_EXIT_WRITE();
        }
+       PF_UNLOCK();
 
        return (0);
 }
@@ -838,6 +879,7 @@ pfsync_in_ins(caddr_t buf, int len, int count, int flags)
        sa_family_t af1, af2;
        int i;
 
+       PF_LOCK();
        for (i = 0; i < count; i++) {
                sp = (struct pfsync_state *)(buf + len * i);
                af1 = sp->key[0].af;
@@ -863,6 +905,7 @@ pfsync_in_ins(caddr_t buf, int len, int count, int flags)
                        break;
                }
        }
+       PF_UNLOCK();
 
        return (0);
 }
@@ -881,12 +924,17 @@ pfsync_in_iack(caddr_t buf, int len, int count, int flags)
                id_key.id = ia->id;
                id_key.creatorid = ia->creatorid;
 
+               PF_STATE_ENTER_READ();
                st = pf_find_state_byid(&id_key);
+               pf_state_ref(st);
+               PF_STATE_EXIT_READ();
                if (st == NULL)
                        continue;
 
                if (ISSET(st->state_flags, PFSTATE_ACK))
                        pfsync_deferred(st, 0);
+
+               pf_state_unref(st);
        }
 
        return (0);
@@ -930,8 +978,7 @@ pfsync_in_upd(caddr_t buf, int len, int count, int flags)
        struct pfsync_state *sp;
        struct pf_state_cmp id_key;
        struct pf_state *st;
-       int sync;
-
+       int sync, error;
        int i;
 
        for (i = 0; i < count; i++) {
@@ -950,11 +997,17 @@ pfsync_in_upd(caddr_t buf, int len, int count, int flags)
                id_key.id = sp->id;
                id_key.creatorid = sp->creatorid;
 
+               PF_STATE_ENTER_READ();
                st = pf_find_state_byid(&id_key);
+               pf_state_ref(st);
+               PF_STATE_EXIT_READ();
                if (st == NULL) {
                        /* insert the update */
-                       if (pfsync_state_import(sp, flags))
+                       PF_LOCK();
+                       error = pfsync_state_import(sp, flags);
+                       if (error)
                                pfsyncstat_inc(pfsyncs_badstate);
+                       PF_UNLOCK();
                        continue;
                }
 
@@ -992,9 +1045,11 @@ pfsync_in_upd(caddr_t buf, int len, int count, int flags)
                if (sync) {
                        pfsyncstat_inc(pfsyncs_stale);
 
-                       pfsync_update_state_locked(st);
+                       pfsync_update_state(st);
                        schednetisr(NETISR_PFSYNC);
                }
+
+               pf_state_unref(st);
        }
 
        return (0);
@@ -1027,7 +1082,10 @@ pfsync_in_upd_c(caddr_t buf, int len, int count, int flags)
                id_key.id = up->id;
                id_key.creatorid = up->creatorid;
 
+               PF_STATE_ENTER_READ();
                st = pf_find_state_byid(&id_key);
+               pf_state_ref(st);
+               PF_STATE_EXIT_READ();
                if (st == NULL) {
                        /* We don't have this state. Ask for it. */
                        pfsync_request_update(id_key.creatorid, id_key.id);
@@ -1066,9 +1124,11 @@ pfsync_in_upd_c(caddr_t buf, int len, int count, int flags)
                if (sync) {
                        pfsyncstat_inc(pfsyncs_stale);
 
-                       pfsync_update_state_locked(st);
+                       pfsync_update_state(st);
                        schednetisr(NETISR_PFSYNC);
                }
+
+               pf_state_unref(st);
        }
 
        return (0);
@@ -1092,15 +1152,21 @@ pfsync_in_ureq(caddr_t buf, int len, int count, int flags)
                if (id_key.id == 0 && id_key.creatorid == 0)
                        pfsync_bulk_start();
                else {
+                       PF_STATE_ENTER_READ();
                        st = pf_find_state_byid(&id_key);
+                       pf_state_ref(st);
+                       PF_STATE_EXIT_READ();
                        if (st == NULL) {
                                pfsyncstat_inc(pfsyncs_badstate);
                                continue;
                        }
-                       if (ISSET(st->state_flags, PFSTATE_NOSYNC))
+                       if (ISSET(st->state_flags, PFSTATE_NOSYNC)) {
+                               pf_state_unref(st);
                                continue;
+                       }
 
                        pfsync_update_state_req(st);
+                       pf_state_unref(st);
                }
        }
 
@@ -1115,6 +1181,7 @@ pfsync_in_del(caddr_t buf, int len, int count, int flags)
        struct pf_state *st;
        int i;
 
+       PF_STATE_ENTER_WRITE();
        for (i = 0; i < count; i++) {
                sp = (struct pfsync_state *)(buf + len * i);
 
@@ -1129,6 +1196,7 @@ pfsync_in_del(caddr_t buf, int len, int count, int flags)
                SET(st->state_flags, PFSTATE_NOSYNC);
                pf_remove_state(st);
        }
+       PF_STATE_EXIT_WRITE();
 
        return (0);
 }
@@ -1141,6 +1209,8 @@ pfsync_in_del_c(caddr_t buf, int len, int count, int flags)
        struct pf_state *st;
        int i;
 
+       PF_LOCK();
+       PF_STATE_ENTER_WRITE();
        for (i = 0; i < count; i++) {
                sp = (struct pfsync_del_c *)(buf + len * i);
 
@@ -1156,6 +1226,8 @@ pfsync_in_del_c(caddr_t buf, int len, int count, int flags)
                SET(st->state_flags, PFSTATE_NOSYNC);
                pf_remove_state(st);
        }
+       PF_STATE_EXIT_WRITE();
+       PF_UNLOCK();
 
        return (0);
 }
@@ -1505,19 +1577,59 @@ pfsync_out_del(struct pf_state *st, void *buf)
 }
 
 void
-pfsync_drop(struct pfsync_softc *sc)
+pfsync_grab_snapshot(struct pfsync_snapshot *sn, struct pfsync_softc *sc)
+{
+       int q;
+
+       sn->sn_sc = sc;
+
+       for (q = 0; q < PFSYNC_S_COUNT; q++)
+               mtx_enter(&sc->sc_mtx[q]);
+
+       mtx_enter(&sc->sc_upd_req_mtx);
+       mtx_enter(&sc->sc_tdb_mtx);
+
+       for (q = 0; q < PFSYNC_S_COUNT; q++) {
+               TAILQ_INIT(&sn->sn_qs[q]);
+               TAILQ_CONCAT(&sn->sn_qs[q], &sc->sc_qs[q], sync_list);
+       }
+
+       TAILQ_INIT(&sn->sn_upd_req_list);
+       TAILQ_CONCAT(&sn->sn_upd_req_list, &sc->sc_upd_req_list, ur_entry);
+
+       TAILQ_INIT(&sn->sn_tdb_q);
+       TAILQ_CONCAT(&sn->sn_tdb_q, &sc->sc_tdb_q, tdb_sync_entry);
+
+       sn->sn_len = sc->sc_len;
+       sc->sc_len = PFSYNC_MINPKT;
+
+       sn->sn_plus = sc->sc_plus;
+       sc->sc_plus = NULL;
+       sn->sn_pluslen = sc->sc_pluslen;
+       sc->sc_pluslen = 0;
+
+       mtx_leave(&sc->sc_tdb_mtx);
+       mtx_leave(&sc->sc_upd_req_mtx);
+
+       for (q = (PFSYNC_S_COUNT - 1); q >= 0; q--)
+               mtx_leave(&sc->sc_mtx[q]);
+}
+
+void
+pfsync_drop_snapshot(struct pfsync_snapshot *sn)
 {
        struct pf_state *st;
        struct pfsync_upd_req_item *ur;
        struct tdb *t;
        int q;
 
+
        for (q = 0; q < PFSYNC_S_COUNT; q++) {
-               if (TAILQ_EMPTY(&sc->sc_qs[q]))
+               if (TAILQ_EMPTY(&sn->sn_qs[q]))
                        continue;
 
-               while ((st = TAILQ_FIRST(&sc->sc_qs[q])) != NULL) {
-                       TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list);
+               while ((st = TAILQ_FIRST(&sn->sn_qs[q])) != NULL) {
+                       TAILQ_REMOVE(&sn->sn_qs[q], st, sync_list);
 #ifdef PFSYNC_DEBUG
                        KASSERT(st->sync_state == q);
 #endif
@@ -1526,19 +1638,42 @@ pfsync_drop(struct pfsync_softc *sc)
                }
        }
 
-       while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) {
-               TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry);
-               pool_put(&sc->sc_pool, ur);
+       while ((ur = TAILQ_FIRST(&sn->sn_upd_req_list)) != NULL) {
+               TAILQ_REMOVE(&sn->sn_upd_req_list, ur, ur_entry);
+               pool_put(&sn->sn_sc->sc_pool, ur);
        }
 
-       sc->sc_plus = NULL;
-
-       while ((t = TAILQ_FIRST(&sc->sc_tdb_q)) != NULL) {
-               TAILQ_REMOVE(&sc->sc_tdb_q, t, tdb_sync_entry);
+       while ((t = TAILQ_FIRST(&sn->sn_tdb_q)) != NULL) {
+               TAILQ_REMOVE(&sn->sn_tdb_q, t, tdb_sync_entry);
                CLR(t->tdb_flags, TDBF_PFSYNC);
        }
+}
 
-       sc->sc_len = PFSYNC_MINPKT;
+int
+pfsync_is_snapshot_empty(struct pfsync_snapshot *sn)
+{
+       int     q;
+
+       for (q = 0; q < PFSYNC_S_COUNT; q++)
+               if (!TAILQ_EMPTY(&sn->sn_qs[q]))
+                       return (0);
+
+       if (!TAILQ_EMPTY(&sn->sn_upd_req_list))
+               return (0);
+
+       if (!TAILQ_EMPTY(&sn->sn_tdb_q))
+               return (0);
+
+       return (sn->sn_plus == NULL);
+}
+
+void
+pfsync_drop(struct pfsync_softc *sc)
+{
+       struct pfsync_snapshot  sn;
+
+       pfsync_grab_snapshot(&sn, sc);
+       pfsync_drop_snapshot(&sn);
 }
 
 #ifdef WITH_PF_LOCK
@@ -1591,6 +1726,7 @@ pfsync_send_pkt(struct mbuf *m)
 void
 pfsync_sendout(void)
 {
+       struct pfsync_snapshot sn;
        struct pfsync_softc *sc = pfsyncif;
 #if NBPFILTER > 0
        struct ifnet *ifp = &sc->sc_if;
@@ -1602,12 +1738,9 @@ pfsync_sendout(void)
        struct pf_state *st;
        struct pfsync_upd_req_item *ur;
        struct tdb *t;
-
        int offset;
        int q, count = 0;
 
-       PF_ASSERT_LOCKED();
-
        if (sc == NULL || sc->sc_len == PFSYNC_MINPKT)
                return;
 
@@ -1621,26 +1754,35 @@ pfsync_sendout(void)
                return;
        }
 
+       pfsync_grab_snapshot(&sn, sc);
+
+       /*
+        * Check below is sufficient to prevent us from sending empty packets,
+        * but it does not stop us from sending short packets.
+        */
+       if (pfsync_is_snapshot_empty(&sn))
+               return;
+
        MGETHDR(m, M_DONTWAIT, MT_DATA);
        if (m == NULL) {
                sc->sc_if.if_oerrors++;
                pfsyncstat_inc(pfsyncs_onomem);
-               pfsync_drop(sc);
+               pfsync_drop_snapshot(&sn);
                return;
        }
 
-       if (max_linkhdr + sc->sc_len > MHLEN) {
-               MCLGETL(m, M_DONTWAIT, max_linkhdr + sc->sc_len);
+       if (max_linkhdr + sn.sn_len > MHLEN) {
+               MCLGETL(m, M_DONTWAIT, max_linkhdr + sn.sn_len);
                if (!ISSET(m->m_flags, M_EXT)) {
                        m_free(m);
                        sc->sc_if.if_oerrors++;
                        pfsyncstat_inc(pfsyncs_onomem);
-                       pfsync_drop(sc);
+                       pfsync_drop_snapshot(&sn);
                        return;
                }
        }
        m->m_data += max_linkhdr;
-       m->m_len = m->m_pkthdr.len = sc->sc_len;
+       m->m_len = m->m_pkthdr.len = sn.sn_len;
 
        /* build the ip header */
        ip = mtod(m, struct ip *);
@@ -1656,16 +1798,16 @@ pfsync_sendout(void)
        offset += sizeof(*ph);
 
        ph->version = PFSYNC_VERSION;
-       ph->len = htons(sc->sc_len - sizeof(*ip));
+       ph->len = htons(sn.sn_len - sizeof(*ip));
        bcopy(pf_status.pf_chksum, ph->pfcksum, PF_MD5_DIGEST_LENGTH);
 
-       if (!TAILQ_EMPTY(&sc->sc_upd_req_list)) {
+       if (!TAILQ_EMPTY(&sn.sn_upd_req_list)) {
                subh = (struct pfsync_subheader *)(m->m_data + offset);
                offset += sizeof(*subh);
 
                count = 0;
-               while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) {
-                       TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry);
+               while ((ur = TAILQ_FIRST(&sn.sn_upd_req_list)) != NULL) {
+                       TAILQ_REMOVE(&sn.sn_upd_req_list, ur, ur_entry);
 
                        bcopy(&ur->ur_msg, m->m_data + offset,
                            sizeof(ur->ur_msg));
@@ -1683,20 +1825,19 @@ pfsync_sendout(void)
        }
 
        /* has someone built a custom region for us to add? */
-       if (sc->sc_plus != NULL) {
-               bcopy(sc->sc_plus, m->m_data + offset, sc->sc_pluslen);
-               offset += sc->sc_pluslen;
-
-               sc->sc_plus = NULL;
+       if (sn.sn_plus != NULL) {
+               bcopy(sn.sn_plus, m->m_data + offset, sn.sn_pluslen);
+               offset += sn.sn_pluslen;
+               sn.sn_plus = NULL;      /* XXX memory leak ? */
        }
 
-       if (!TAILQ_EMPTY(&sc->sc_tdb_q)) {
+       if (!TAILQ_EMPTY(&sn.sn_tdb_q)) {
                subh = (struct pfsync_subheader *)(m->m_data + offset);
                offset += sizeof(*subh);
 
                count = 0;
-               while ((t = TAILQ_FIRST(&sc->sc_tdb_q)) != NULL) {
-                       TAILQ_REMOVE(&sc->sc_tdb_q, t, tdb_sync_entry);
+               while ((t = TAILQ_FIRST(&sn.sn_tdb_q)) != NULL) {
+                       TAILQ_REMOVE(&sn.sn_tdb_q, t, tdb_sync_entry);
                        pfsync_out_tdb(t, m->m_data + offset);
                        offset += sizeof(struct pfsync_tdb);
                        CLR(t->tdb_flags, TDBF_PFSYNC);
@@ -1711,15 +1852,15 @@ pfsync_sendout(void)
 
        /* walk the queues */
        for (q = 0; q < PFSYNC_S_COUNT; q++) {
-               if (TAILQ_EMPTY(&sc->sc_qs[q]))
+               if (TAILQ_EMPTY(&sn.sn_qs[q]))
                        continue;
 
                subh = (struct pfsync_subheader *)(m->m_data + offset);
                offset += sizeof(*subh);
 
                count = 0;
-               while ((st = TAILQ_FIRST(&sc->sc_qs[q])) != NULL) {
-                       TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list);
+               while ((st = TAILQ_FIRST(&sn.sn_qs[q])) != NULL) {
+                       TAILQ_REMOVE(&sn.sn_qs[q], st, sync_list);
 #ifdef PFSYNC_DEBUG
                        KASSERT(st->sync_state == q);
 #endif
@@ -1741,10 +1882,10 @@ pfsync_sendout(void)
 #if NBPFILTER > 0
        if (ifp->if_bpf) {
                m->m_data += sizeof(*ip);
-               m->m_len = m->m_pkthdr.len = sc->sc_len - sizeof(*ip);
+               m->m_len = m->m_pkthdr.len = sn.sn_len - sizeof(*ip);
                bpf_mtap(ifp->if_bpf, m, BPF_DIRECTION_OUT);
                m->m_data -= sizeof(*ip);
-               m->m_len = m->m_pkthdr.len = sc->sc_len;
+               m->m_len = m->m_pkthdr.len = sn.sn_len;
        }
 
        if (sc->sc_sync_ifidx == 0) {
@@ -1754,9 +1895,6 @@ pfsync_sendout(void)
        }
 #endif
 
-       /* start again */
-       sc->sc_len = PFSYNC_MINPKT;
-
        sc->sc_if.if_opackets++;
        sc->sc_if.if_obytes += m->m_pkthdr.len;
 
@@ -1815,7 +1953,11 @@ pfsync_defer(struct pf_state *st, struct mbuf *m)
                return (0);
 
        if (sc->sc_deferred >= 128) {
+               mtx_enter(&sc->sc_deferrals_mtx);
                pd = TAILQ_FIRST(&sc->sc_deferrals);
+               TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
+               sc->sc_deferred--;
+               mtx_leave(&sc->sc_deferrals_mtx);
                if (timeout_del(&pd->pd_tmo))
                        pfsync_undefer(pd, 0);
        }
@@ -1830,8 +1972,10 @@ pfsync_defer(struct pf_state *st, struct mbuf *m)
        pd->pd_st = pf_state_ref(st);
        pd->pd_m = m;
 
+       mtx_enter(&sc->sc_deferrals_mtx);
        sc->sc_deferred++;
        TAILQ_INSERT_TAIL(&sc->sc_deferrals, pd, pd_entry);
+       mtx_leave(&sc->sc_deferrals_mtx);
 
        timeout_set_proc(&pd->pd_tmo, pfsync_defer_tmo, pd);
        timeout_add_msec(&pd->pd_tmo, 20);
@@ -1842,71 +1986,93 @@ pfsync_defer(struct pf_state *st, struct mbuf *m)
 }
 
 void
-pfsync_undefer(struct pfsync_deferral *pd, int drop)
+pfsync_undefer_notify(struct pfsync_deferral *pd)
 {
-       struct pfsync_softc *sc = pfsyncif;
        struct pf_pdesc pdesc;
        struct pf_state *st = pd->pd_st;
 
-       NET_ASSERT_LOCKED();
-
-       if (sc == NULL)
-               return;
-
-       TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
-       sc->sc_deferred--;
-
-       CLR(st->state_flags, PFSTATE_ACK);
-       if (drop)
-               m_freem(pd->pd_m);
-       else {
-               if (st->rt == PF_ROUTETO) {
-                       if (pf_setup_pdesc(&pdesc, st->key[PF_SK_WIRE]->af,
-                           st->direction, st->kif, pd->pd_m, NULL) !=
-                           PF_PASS) {
-                               m_freem(pd->pd_m);
-                               goto out;
-                       }
-                       switch (st->key[PF_SK_WIRE]->af) {
-                       case AF_INET:
-                               pf_route(&pdesc, st);
-                               break;
+       if (st->rule.ptr->rt == PF_ROUTETO) {
+               if (pf_setup_pdesc(&pdesc, st->key[PF_SK_WIRE]->af,
+                   st->direction, st->kif, pd->pd_m, NULL) != PF_PASS) {
+                       m_freem(pd->pd_m);
+                       return;
+               }
+               switch (st->key[PF_SK_WIRE]->af) {
+               case AF_INET:
+                       pf_route(&pdesc, st);
+                       break;
 #ifdef INET6
-                       case AF_INET6:
-                               pf_route6(&pdesc, st);
-                               break;
+               case AF_INET6:
+                       pf_route6(&pdesc, st);
+                       break;
 #endif /* INET6 */
-                       default:
-                               unhandled_af(st->key[PF_SK_WIRE]->af);
-                       }
-                       pd->pd_m = pdesc.m;
-               } else {
-                       switch (st->key[PF_SK_WIRE]->af) {
-                       case AF_INET:
-                               ip_output(pd->pd_m, NULL, NULL, 0, NULL, NULL,
-                                   0);
-                               break;
+               default:
+                       unhandled_af(st->key[PF_SK_WIRE]->af);
+               }
+               pd->pd_m = pdesc.m;
+       } else {
+               switch (st->key[PF_SK_WIRE]->af) {
+               case AF_INET:
+                       ip_output(pd->pd_m, NULL, NULL, 0, NULL, NULL,
+                           0);
+                       break;
 #ifdef INET6
-                       case AF_INET6:
-                               ip6_output(pd->pd_m, NULL, NULL, 0,
-                                   NULL, NULL);
-                               break;
+               case AF_INET6:
+                       ip6_output(pd->pd_m, NULL, NULL, 0,
+                           NULL, NULL);
+                       break;
 #endif /* INET6 */
-                       default:
-                               unhandled_af(st->key[PF_SK_WIRE]->af);
-                       }
+               default:
+                       unhandled_af(st->key[PF_SK_WIRE]->af);
                }
+
+               pd->pd_m = NULL;
        }
- out:
-       pf_state_unref(st);
+}
+
+void
+pfsync_free_deferral(struct pfsync_deferral *pd)
+{
+       struct pfsync_softc *sc = pfsyncif;
+
+       pf_state_unref(pd->pd_st);
+       if (pd->pd_m != NULL)
+               m_freem(pd->pd_m);
        pool_put(&sc->sc_pool, pd);
 }
 
+void
+pfsync_undefer(struct pfsync_deferral *pd, int drop)
+{
+       struct pfsync_softc *sc = pfsyncif;
+
+       NET_ASSERT_LOCKED();
+
+       if (sc == NULL)
+               return;
+
+       CLR(pd->pd_st->state_flags, PFSTATE_ACK);
+       if (drop) {
+               m_freem(pd->pd_m);
+               pd->pd_m = NULL;
+       } else
+               pfsync_undefer_notify(pd);
+
+       pfsync_free_deferral(pd);
+}
+
 void
 pfsync_defer_tmo(void *arg)
 {
+       struct pfsync_softc *sc = pfsyncif;
+       struct pfsync_deferral *pd = arg;
+
+       mtx_enter(&sc->sc_deferrals_mtx);
+       TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
+       sc->sc_deferred--;
+       mtx_leave(&sc->sc_deferrals_mtx);
        NET_LOCK();
-       pfsync_undefer(arg, 0);
+       pfsync_undefer(pd, 0);
        NET_UNLOCK();
 }
 
@@ -1918,25 +2084,29 @@ pfsync_deferred(struct pf_state *st, int drop)
 
        NET_ASSERT_LOCKED();
 
+       mtx_enter(&sc->sc_deferrals_mtx);
        TAILQ_FOREACH(pd, &sc->sc_deferrals, pd_entry) {
                 if (pd->pd_st == st) {
-                       if (timeout_del(&pd->pd_tmo))
+                       if (timeout_del(&pd->pd_tmo)) {
+                               TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
+                               sc->sc_deferred--;
+                               mtx_leave(&sc->sc_deferrals_mtx);
                                pfsync_undefer(pd, drop);
+                       } else
+                               mtx_leave(&sc->sc_deferrals_mtx);
                        return;
                }
        }
-
-       panic("pfsync_deferred: unable to find deferred state");
+       mtx_leave(&sc->sc_deferrals_mtx);
 }
 
 void
-pfsync_update_state_locked(struct pf_state *st)
+pfsync_update_state(struct pf_state *st)
 {
        struct pfsync_softc *sc = pfsyncif;
        int sync = 0;
 
        NET_ASSERT_LOCKED();
-       PF_ASSERT_LOCKED();
 
        if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING))
                return;
@@ -1981,22 +2151,6 @@ pfsync_update_state_locked(struct pf_state *st)
                schednetisr(NETISR_PFSYNC);
 }
 
-void
-pfsync_update_state(struct pf_state *st, int *have_pf_lock)
-{
-       struct pfsync_softc *sc = pfsyncif;
-
-       if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING))
-               return;
-
-       if (*have_pf_lock == 0) {
-               PF_LOCK();
-               *have_pf_lock = 1;
-       }
-
-       pfsync_update_state_locked(st);
-}
-
 void
 pfsync_cancel_full_update(struct pfsync_softc *sc)
 {
@@ -2049,7 +2203,7 @@ pfsync_request_update(u_int32_t creatorid, u_int64_t id)
 {
        struct pfsync_softc *sc = pfsyncif;
        struct pfsync_upd_req_item *item;
-       size_t nlen = sizeof(struct pfsync_upd_req);
+       size_t nlen, sc_len;
 
        /*
         * this code does nothing to prevent multiple update requests for the
@@ -2065,18 +2219,24 @@ pfsync_request_update(u_int32_t creatorid, u_int64_t id)
        item->ur_msg.id = id;
        item->ur_msg.creatorid = creatorid;
 
-       if (TAILQ_EMPTY(&sc->sc_upd_req_list))
-               nlen += sizeof(struct pfsync_subheader);
+       do {
+               mtx_enter(&sc->sc_upd_req_mtx);
 
-       if (sc->sc_len + nlen > sc->sc_if.if_mtu) {
-               pfsync_sendout();
+               nlen = sizeof(struct pfsync_upd_req);
+               if (TAILQ_EMPTY(&sc->sc_upd_req_list))
+                       nlen += sizeof(struct pfsync_subheader);
 
-               nlen = sizeof(struct pfsync_subheader) +
-                   sizeof(struct pfsync_upd_req);
-       }
+               sc_len = atomic_add_long_nv(&sc->sc_len, nlen);
+               if (sc_len > sc->sc_if.if_mtu) {
+                       atomic_sub_long(&sc->sc_len, nlen);
+                       mtx_leave(&sc->sc_upd_req_mtx);
+                       pfsync_sendout();
+                       continue;
+               }
 
-       TAILQ_INSERT_TAIL(&sc->sc_upd_req_list, item, ur_entry);
-       sc->sc_len += nlen;
+               TAILQ_INSERT_TAIL(&sc->sc_upd_req_list, item, ur_entry);
+               mtx_leave(&sc->sc_upd_req_mtx);
+       } while (0);
 
        schednetisr(NETISR_PFSYNC);
 }
@@ -2202,7 +2362,7 @@ void
 pfsync_q_ins(struct pf_state *st, int q)
 {
        struct pfsync_softc *sc = pfsyncif;
-       size_t nlen = pfsync_qs[q].len;
+       size_t nlen, sc_len;
 
        KASSERT(st->sync_state == PFSYNC_S_NONE);
 
@@ -2210,19 +2370,37 @@ pfsync_q_ins(struct pf_state *st, int q)
        if (sc->sc_len < PFSYNC_MINPKT)
                panic("pfsync pkt len is too low %zd", sc->sc_len);
 #endif
-       if (TAILQ_EMPTY(&sc->sc_qs[q]))
-               nlen += sizeof(struct pfsync_subheader);
+       do {
+               mtx_enter(&sc->sc_mtx[q]);
 
-       if (sc->sc_len + nlen > sc->sc_if.if_mtu) {
-               pfsync_sendout();
+               /*
+                * If two threads are competing to insert the same state, then
+                * there must be just single winner.
+                */
+               if (st->sync_state != PFSYNC_S_NONE) {
+                       mtx_leave(&sc->sc_mtx[q]);
+                       break;
+               }
 
-               nlen = sizeof(struct pfsync_subheader) + pfsync_qs[q].len;
-       }
+               nlen = pfsync_qs[q].len;
 
-       sc->sc_len += nlen;
-       pf_state_ref(st);
-       TAILQ_INSERT_TAIL(&sc->sc_qs[q], st, sync_list);
-       st->sync_state = q;
+               if (TAILQ_EMPTY(&sc->sc_qs[q]))
+                       nlen += sizeof(struct pfsync_subheader);
+
+               sc_len = atomic_add_long_nv(&sc->sc_len, nlen);
+               if (sc_len > sc->sc_if.if_mtu) {
+                       atomic_sub_long(&sc->sc_len, nlen);
+                       mtx_leave(&sc->sc_mtx[q]);
+                       pfsync_sendout();
+                       continue;
+               }
+
+               pf_state_ref(st);
+
+               TAILQ_INSERT_TAIL(&sc->sc_qs[q], st, sync_list);
+               st->sync_state = q;
+               mtx_leave(&sc->sc_mtx[q]);
+       } while (0);
 }
 
 void
@@ -2233,39 +2411,48 @@ pfsync_q_del(struct pf_state *st)
 
        KASSERT(st->sync_state != PFSYNC_S_NONE);
 
-       sc->sc_len -= pfsync_qs[q].len;
+       mtx_enter(&sc->sc_mtx[q]);
+       atomic_sub_long(&sc->sc_len, pfsync_qs[q].len);
        TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list);
+       if (TAILQ_EMPTY(&sc->sc_qs[q]))
+               atomic_sub_long(&sc->sc_len, sizeof (struct pfsync_subheader));
+       mtx_leave(&sc->sc_mtx[q]);
+
        st->sync_state = PFSYNC_S_NONE;
        pf_state_unref(st);
-
-       if (TAILQ_EMPTY(&sc->sc_qs[q]))
-               sc->sc_len -= sizeof(struct pfsync_subheader);
 }
 
 void
 pfsync_update_tdb(struct tdb *t, int output)
 {
        struct pfsync_softc *sc = pfsyncif;
-       size_t nlen = sizeof(struct pfsync_tdb);
+       size_t nlen, sc_len;
 
        if (sc == NULL)
                return;
 
        if (!ISSET(t->tdb_flags, TDBF_PFSYNC)) {
-               if (TAILQ_EMPTY(&sc->sc_tdb_q))
-                       nlen += sizeof(struct pfsync_subheader);
-
-               if (sc->sc_len + nlen > sc->sc_if.if_mtu) {
-                       pfsync_sendout();
+               do {
+                       mtx_enter(&sc->sc_tdb_mtx);
+                       nlen = sizeof(struct pfsync_tdb);
+
+                       if (TAILQ_EMPTY(&sc->sc_tdb_q))
+                               nlen += sizeof(struct pfsync_subheader);
+
+                       sc_len = atomic_add_long_nv(&sc->sc_len, nlen);
+                       if (sc_len > sc->sc_if.if_mtu) {
+                               atomic_sub_long(&sc->sc_len, nlen);
+                               mtx_leave(&sc->sc_tdb_mtx);
+                               pfsync_sendout();
+                               continue;
+                       }
 
-                       nlen = sizeof(struct pfsync_subheader) +
-                           sizeof(struct pfsync_tdb);
-               }
+                       TAILQ_INSERT_TAIL(&sc->sc_tdb_q, t, tdb_sync_entry);
+                       mtx_leave(&sc->sc_tdb_mtx);
 
-               sc->sc_len += nlen;
-               TAILQ_INSERT_TAIL(&sc->sc_tdb_q, t, tdb_sync_entry);
-               SET(t->tdb_flags, TDBF_PFSYNC);
-               t->tdb_updates = 0;
+                       SET(t->tdb_flags, TDBF_PFSYNC);
+                       t->tdb_updates = 0;
+               } while (0);
        } else {
                if (++t->tdb_updates >= sc->sc_maxupdates)
                        schednetisr(NETISR_PFSYNC);
@@ -2281,16 +2468,22 @@ void
 pfsync_delete_tdb(struct tdb *t)
 {
        struct pfsync_softc *sc = pfsyncif;
+       size_t nlen;
 
        if (sc == NULL || !ISSET(t->tdb_flags, TDBF_PFSYNC))
                return;
 
-       sc->sc_len -= sizeof(struct pfsync_tdb);
+       mtx_enter(&sc->sc_tdb_mtx);
+
        TAILQ_REMOVE(&sc->sc_tdb_q, t, tdb_sync_entry);
        CLR(t->tdb_flags, TDBF_PFSYNC);
 
+       nlen = sizeof(struct pfsync_tdb);
        if (TAILQ_EMPTY(&sc->sc_tdb_q))
-               sc->sc_len -= sizeof(struct pfsync_subheader);
+               nlen += sizeof(struct pfsync_subheader);
+       atomic_sub_long(&sc->sc_len, nlen);
+
+       mtx_leave(&sc->sc_tdb_mtx);
 }
 
 void
@@ -2338,9 +2531,14 @@ pfsync_bulk_start(void)
        else {
                sc->sc_ureq_received = getuptime();
 
-               if (sc->sc_bulk_next == NULL)
+               if (sc->sc_bulk_next == NULL) {
+                       PF_STATE_ENTER_READ();
                        sc->sc_bulk_next = TAILQ_FIRST(&state_list);
+                       pf_state_ref(sc->sc_bulk_next);
+                       PF_STATE_EXIT_READ();
+               }
                sc->sc_bulk_last = sc->sc_bulk_next;
+               pf_state_ref(sc->sc_bulk_last);
 
                pfsync_bulk_status(PFSYNC_BUS_START);
                timeout_add(&sc->sc_bulk_tmo, 0);
@@ -2351,7 +2549,7 @@ void
 pfsync_bulk_update(void *arg)
 {
        struct pfsync_softc *sc;
-       struct pf_state *st;
+       struct pf_state *st, *st_next;
        int i = 0;
 
        NET_LOCK();
@@ -2359,6 +2557,7 @@ pfsync_bulk_update(void *arg)
        if (sc == NULL)
                goto out;
        st = sc->sc_bulk_next;
+       sc->sc_bulk_next = NULL;
 
        for (;;) {
                if (st->sync_state == PFSYNC_S_NONE &&
@@ -2368,13 +2567,23 @@ pfsync_bulk_update(void *arg)
                        i++;
                }
 
-               st = TAILQ_NEXT(st, entry_list);
+               /*
+                * I wonder how we prevent infinite bulk update.  IMO it can
+                * happen when sc_bulk_last state expires before we iterate
+                * through the whole list.
+                */
+               PF_STATE_ENTER_READ();
+               st_next = TAILQ_NEXT(st, entry_list);
+               pf_state_unref(st);
+               st = st_next;
                if (st == NULL)
                        st = TAILQ_FIRST(&state_list);
+               pf_state_ref(st);
+               PF_STATE_EXIT_READ();
 
-               if (st == sc->sc_bulk_last) {
+               if ((st == NULL) || (st == sc->sc_bulk_last)) {
                        /* we're done */
-                       sc->sc_bulk_next = NULL;
+                       pf_state_unref(sc->sc_bulk_last);
                        sc->sc_bulk_last = NULL;
                        pfsync_bulk_status(PFSYNC_BUS_END);
                        break;
@@ -2497,9 +2706,7 @@ void
 pfsync_timeout(void *arg)
 {
        NET_LOCK();
-       PF_LOCK();
        pfsync_sendout();
-       PF_UNLOCK();
        NET_UNLOCK();
 }
 
@@ -2507,9 +2714,7 @@ pfsync_timeout(void *arg)
 void
 pfsyncintr(void)
 {
-       PF_LOCK();
        pfsync_sendout();
-       PF_UNLOCK();
 }
 
 int
diff --git a/sys/net/if_pfsync.h b/sys/net/if_pfsync.h
index 3c3b7dd..2c9bedc 100644
--- a/sys/net/if_pfsync.h
+++ b/sys/net/if_pfsync.h
@@ -1,4 +1,4 @@
-/*     $OpenBSD: if_pfsync.h,v 1.54 2018/09/11 07:53:38 sashan Exp $   */
+/*     $OpenBSD: if_pfsync.h,v 1.55 2021/02/04 00:55:41 sashan Exp $   */
 
 /*
  * Copyright (c) 2001 Michael Shalayeff
@@ -328,7 +328,7 @@ void                        pfsync_state_export(struct pfsync_state *,
                            struct pf_state *);
 
 void                   pfsync_insert_state(struct pf_state *);
-void                   pfsync_update_state(struct pf_state *, int *);
+void                   pfsync_update_state(struct pf_state *);
 void                   pfsync_delete_state(struct pf_state *);
 void                   pfsync_clear_states(u_int32_t, const char *);
 
diff --git a/sys/net/pf.c b/sys/net/pf.c
index 44e0e17..74ec0aa 100644
--- a/sys/net/pf.c
+++ b/sys/net/pf.c
@@ -1,4 +1,4 @@
-/*     $OpenBSD: pf.c,v 1.1107 2021/02/03 07:41:12 dlg Exp $ */
+/*     $OpenBSD: pf.c,v 1.1108 2021/02/04 00:55:41 sashan Exp $ */
 
 /*
  * Copyright (c) 2001 Daniel Hartmeier
@@ -6935,7 +6935,7 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp, struct mbuf **m0)
                PF_STATE_EXIT_READ();
                if (action == PF_PASS || action == PF_AFRT) {
 #if NPFSYNC > 0
-                       pfsync_update_state(s, &have_pf_lock);
+                       pfsync_update_state(s);
 #endif /* NPFSYNC > 0 */
                        r = s->rule.ptr;
                        a = s->anchor.ptr;
@@ -6967,7 +6967,7 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp, struct mbuf **m0)
                PF_STATE_EXIT_READ();
                if (action == PF_PASS || action == PF_AFRT) {
 #if NPFSYNC > 0
-                       pfsync_update_state(s, &have_pf_lock);
+                       pfsync_update_state(s);
 #endif /* NPFSYNC > 0 */
                        r = s->rule.ptr;
                        a = s->anchor.ptr;
@@ -7043,7 +7043,7 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp, struct mbuf **m0)
 
                if (action == PF_PASS || action == PF_AFRT) {
 #if NPFSYNC > 0
-                       pfsync_update_state(s, &have_pf_lock);
+                       pfsync_update_state(s);
 #endif /* NPFSYNC > 0 */
                        r = s->rule.ptr;
                        a = s->anchor.ptr;