big update to pfsync to try and clean up locking in particular.
author: dlg <dlg@openbsd.org>
Thu, 6 Jul 2023 04:55:04 +0000 (04:55 +0000)
committer: dlg <dlg@openbsd.org>
Thu, 6 Jul 2023 04:55:04 +0000 (04:55 +0000)
moving pf forward has been a real struggle, and pfsync has been a
constant source of pain. we have been papering over the problems
for a while now, but it reached the point that it needed a fundamental
restructure, which is what this diff is.

the big headliner changes in this diff are:

- pfsync specific locks

this is the whole reason for this diff.

rather than rely on NET_LOCK or KERNEL_LOCK or whatever, pfsync now
has its own locks to protect its internal data structures. this
is important because pfsync runs a bunch of timeouts and tasks to
push pfsync packets out on the wire, and handles requests
generated by incoming pfsync packets, both of which happen outside
of pf itself running. having pfsync specific locks around pfsync data
structures makes the mutations of these data structures a lot more
explicit and auditable.
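
as a rough sketch of the pattern (simplified; struct pfsync_slice and
its s_mtx/s_qs members are in the diff below, but the sync_list entry
name and the exact length accounting here are assumptions):

    static void
    pfsync_q_ins_sketch(struct pfsync_slice *s, struct pf_state *st, int q)
    {
            mtx_enter(&s->s_mtx);   /* per-slice IPL_SOFTNET mutex */
            TAILQ_INSERT_TAIL(&s->s_qs[q], st, sync_list);
            s->s_len += sizeof(struct pfsync_state);
            mtx_leave(&s->s_mtx);
    }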

- partitioning

to enable future parallelisation of the network stack, this rewrite
includes support for pfsync to partition states into different "slices".
these slices run independently, ie, the states collected by one slice
are serialised into a separate packet from the states collected and
serialised by another slice.

states are mapped to pfsync slices based on the pf state hash, which
is the same hash that the rest of the network stack and multiq
hardware use.
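
a hedged sketch of that mapping (PFSYNC_NSLICES is a power of two, per
the diff below; which key/hash field gets used is an assumption here):

    static struct pfsync_slice *
    pfsync_slice_enter_sketch(struct pfsync_softc *sc,
        const struct pf_state *st)
    {
            /* assumed: the stack's state hash lives on the wire-side key */
            unsigned int idx = st->key[PF_SK_WIRE]->hash &
                (PFSYNC_NSLICES - 1);

            return (&sc->sc_slices[idx]);
    }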

- no more pfsync called from netisr

pfsync used to be called from netisr to try and bundle packets, but now
that there are multiple pfsync slices this doesn't make sense. instead it
uses tasks in softnet tqs.
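
roughly, each slice owns a task and a pointer to its softnet taskq
(both set up in pfsync_clone_create() in the diff below), so scheduling
work looks something like this sketch:

    static void
    pfsync_slice_sched_sketch(struct pfsync_slice *s)
    {
            s->s_stat_task_add++;                   /* stat, under s_mtx */
            task_add(s->s_softnet, &s->s_task);     /* this slice's softnet tq */
    }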

- improved bulk transfer handling

there are shiny new state machines around both the bulk transmit and
receive handling. pfsync used to do horrible things to carp demotion
counters, but now it is very predictable and returns the counters back
to where they started.
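
the receive side is driven by events against an explicit state enum
(both appear in the diff below); the transitions and timeout in this
sketch are illustrative only, the real handler does more:

    static void
    pfsync_bulk_req_evt_sketch(struct pfsync_softc *sc,
        enum pfsync_bulk_req_event evt)
    {
            switch (sc->sc_bulk_req.req_state) {
            case PFSYNC_BREQ_S_SENT:
                    /* our request was answered, bulk update has begun */
                    if (evt == PFSYNC_BREQ_EVT_BUS_START)
                            pfsync_bulk_req_nstate(sc, PFSYNC_BREQ_S_BULK, 4);
                    break;
            case PFSYNC_BREQ_S_BULK:
                    /* bulk update finished, stop the timeout */
                    if (evt == PFSYNC_BREQ_EVT_BUS_END)
                            pfsync_bulk_req_nstate(sc, PFSYNC_BREQ_S_DONE, 0);
                    break;
            default:
                    break;
            }
    }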

- better tdb handling

the tdb handling was pretty hairy, but hrvoje has kicked this around
a lot with ipsec and sasyncd and we've found and fixed a bunch of
issues as a result of that testing.

- mpsafe pf state purges

this was committed previously, but because the locks pfsync relied on
weren't clear, this just caused a ton of bugs. as part of this diff it's
now reliable, and moves a big chunk of work out from under KERNEL_LOCK,
which in turn improves the responsiveness and throughput of a firewall
even if you're not using pfsync.

there's a bunch of other little changes along the way, but the above are
the big ones.

hrvoje has done performance testing with this diff and notes a big
improvement when pfsync is not in use. performance when pfsync is
enabled is about the same, but i'm hoping the slices mean we can scale
along with pf as it improves.

lots (months) of testing by me and hrvoje on pfsync boxes
tests and ok sashan@
deraadt@ says this is a good time to put it in

sys/net/if.c
sys/net/if_pfsync.c
sys/net/if_pfsync.h
sys/net/netisr.h
sys/net/pf.c
sys/net/pf_ioctl.c
sys/net/pf_norm.c
sys/net/pfvar.h
sys/net/pfvar_priv.h
sys/netinet/in_proto.c
sys/netinet/ip_ipsp.h

index 30a36d8..1cecfdf 100644
@@ -1,4 +1,4 @@
-/*     $OpenBSD: if.c,v 1.703 2023/07/04 13:37:47 jan Exp $    */
+/*     $OpenBSD: if.c,v 1.704 2023/07/06 04:55:04 dlg Exp $    */
 /*     $NetBSD: if.c,v 1.35 1996/05/07 05:26:04 thorpej Exp $  */
 
 /*
@@ -1034,14 +1034,6 @@ if_netisr(void *unused)
                t |= n;
        }
 
-#if NPFSYNC > 0
-       if (t & (1 << NETISR_PFSYNC)) {
-               KERNEL_LOCK();
-               pfsyncintr();
-               KERNEL_UNLOCK();
-       }
-#endif
-
        NET_UNLOCK();
 }
 
index 2457796..bf68571 100644
@@ -1,4 +1,4 @@
-/*     $OpenBSD: if_pfsync.c,v 1.317 2023/06/05 08:45:20 sashan Exp $  */
+/*     $OpenBSD: if_pfsync.c,v 1.318 2023/07/06 04:55:05 dlg Exp $     */
 
 /*
  * Copyright (c) 2002 Michael Shalayeff
@@ -27,7 +27,7 @@
  */
 
 /*
- * Copyright (c) 2009 David Gwynne <dlg@openbsd.org>
+ * Copyright (c) 2009, 2022, 2023 David Gwynne <dlg@openbsd.org>
  *
  * Permission to use, copy, modify, and distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  */
 
+#include "bpfilter.h"
+#include "pfsync.h"
+#include "kstat.h"
+
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/time.h>
 #include <sys/sysctl.h>
 #include <sys/pool.h>
 #include <sys/syslog.h>
+#include <sys/tree.h>
+#include <sys/smr.h>
+#include <sys/percpu.h>
+#include <sys/refcnt.h>
+#include <sys/kstat.h>
+#include <sys/stdarg.h>
 
 #include <net/if.h>
 #include <net/if_types.h>
 #include <netinet/ip_carp.h>
 #endif
 
-#define PF_DEBUGNAME   "pfsync: "
 #include <net/pfvar.h>
 #include <net/pfvar_priv.h>
 #include <net/if_pfsync.h>
 
-#include "bpfilter.h"
-#include "pfsync.h"
-
-#define PFSYNC_DEFER_NSEC 20000000ULL
-
 #define PFSYNC_MINPKT ( \
        sizeof(struct ip) + \
        sizeof(struct pfsync_header))
 
-int    pfsync_upd_tcp(struct pf_state *, struct pfsync_state_peer *,
-           struct pfsync_state_peer *);
-
-int    pfsync_in_clr(caddr_t, int, int, int);
-int    pfsync_in_iack(caddr_t, int, int, int);
-int    pfsync_in_upd_c(caddr_t, int, int, int);
-int    pfsync_in_ureq(caddr_t, int, int, int);
-int    pfsync_in_del(caddr_t, int, int, int);
-int    pfsync_in_del_c(caddr_t, int, int, int);
-int    pfsync_in_bus(caddr_t, int, int, int);
-int    pfsync_in_tdb(caddr_t, int, int, int);
-int    pfsync_in_ins(caddr_t, int, int, int);
-int    pfsync_in_upd(caddr_t, int, int, int);
-int    pfsync_in_eof(caddr_t, int, int, int);
-
-int    pfsync_in_error(caddr_t, int, int, int);
-
-void   pfsync_update_state_locked(struct pf_state *);
-
-const struct {
-       int     (*in)(caddr_t, int, int, int);
-       size_t  len;
-} pfsync_acts[] = {
-       /* PFSYNC_ACT_CLR */
-       { pfsync_in_clr,        sizeof(struct pfsync_clr) },
-        /* PFSYNC_ACT_OINS */
-       { pfsync_in_error,      0 },
-       /* PFSYNC_ACT_INS_ACK */
-       { pfsync_in_iack,       sizeof(struct pfsync_ins_ack) },
-       /* PFSYNC_ACT_OUPD */
-       { pfsync_in_error,      0 },
-       /* PFSYNC_ACT_UPD_C */
-       { pfsync_in_upd_c,      sizeof(struct pfsync_upd_c) },
-       /* PFSYNC_ACT_UPD_REQ */
-       { pfsync_in_ureq,       sizeof(struct pfsync_upd_req) },
-       /* PFSYNC_ACT_DEL */
-       { pfsync_in_del,        sizeof(struct pfsync_state) },
-       /* PFSYNC_ACT_DEL_C */
-       { pfsync_in_del_c,      sizeof(struct pfsync_del_c) },
-       /* PFSYNC_ACT_INS_F */
-       { pfsync_in_error,      0 },
-       /* PFSYNC_ACT_DEL_F */
-       { pfsync_in_error,      0 },
-       /* PFSYNC_ACT_BUS */
-       { pfsync_in_bus,        sizeof(struct pfsync_bus) },
-       /* PFSYNC_ACT_OTDB */
-       { pfsync_in_error,      0 },
-       /* PFSYNC_ACT_EOF */
-       { pfsync_in_error,      0 },
-       /* PFSYNC_ACT_INS */
-       { pfsync_in_ins,        sizeof(struct pfsync_state) },
-       /* PFSYNC_ACT_UPD */
-       { pfsync_in_upd,        sizeof(struct pfsync_state) },
-       /* PFSYNC_ACT_TDB */
-       { pfsync_in_tdb,        sizeof(struct pfsync_tdb) },
-};
+struct pfsync_softc;
 
-struct pfsync_q {
-       void            (*write)(struct pf_state *, void *);
-       size_t          len;
-       u_int8_t        action;
+struct pfsync_deferral {
+       TAILQ_ENTRY(pfsync_deferral)             pd_entry;
+       struct pf_state                         *pd_st;
+       struct mbuf                             *pd_m;
+       uint64_t                                 pd_deadline;
 };
+TAILQ_HEAD(pfsync_deferrals, pfsync_deferral);
 
-/* we have one of these for every PFSYNC_S_ */
-void   pfsync_out_state(struct pf_state *, void *);
-void   pfsync_out_iack(struct pf_state *, void *);
-void   pfsync_out_upd_c(struct pf_state *, void *);
-void   pfsync_out_del(struct pf_state *, void *);
+#define PFSYNC_DEFER_NSEC      20000000ULL
+#define PFSYNC_DEFER_LIMIT     128
+#define PFSYNC_BULK_SND_IVAL_MS        20
 
-struct pfsync_q pfsync_qs[] = {
-       { pfsync_out_iack,  sizeof(struct pfsync_ins_ack), PFSYNC_ACT_INS_ACK },
-       { pfsync_out_upd_c, sizeof(struct pfsync_upd_c),   PFSYNC_ACT_UPD_C },
-       { pfsync_out_del,   sizeof(struct pfsync_del_c),   PFSYNC_ACT_DEL_C },
-       { pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_INS },
-       { pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_UPD }
+static struct pool pfsync_deferrals_pool;
+
+enum pfsync_bulk_req_state {
+       PFSYNC_BREQ_S_NONE,
+       PFSYNC_BREQ_S_START,
+       PFSYNC_BREQ_S_SENT,
+       PFSYNC_BREQ_S_BULK,
+       PFSYNC_BREQ_S_DONE,
 };
 
-void   pfsync_q_ins(struct pf_state *, int);
-void   pfsync_q_del(struct pf_state *);
+static const char *pfsync_bulk_req_state_names[] = {
+       [PFSYNC_BREQ_S_NONE]            = "none",
+       [PFSYNC_BREQ_S_START]           = "start",
+       [PFSYNC_BREQ_S_SENT]            = "sent",
+       [PFSYNC_BREQ_S_BULK]            = "bulk",
+       [PFSYNC_BREQ_S_DONE]            = "done",
+};
 
-struct pfsync_upd_req_item {
-       TAILQ_ENTRY(pfsync_upd_req_item)        ur_entry;
-       TAILQ_ENTRY(pfsync_upd_req_item)        ur_snap;
-       struct pfsync_upd_req                   ur_msg;
+enum pfsync_bulk_req_event {
+       PFSYNC_BREQ_EVT_UP,
+       PFSYNC_BREQ_EVT_DOWN,
+       PFSYNC_BREQ_EVT_TMO,
+       PFSYNC_BREQ_EVT_LINK,
+       PFSYNC_BREQ_EVT_BUS_START,
+       PFSYNC_BREQ_EVT_BUS_END,
 };
-TAILQ_HEAD(pfsync_upd_reqs, pfsync_upd_req_item);
 
-struct pfsync_deferral {
-       TAILQ_ENTRY(pfsync_deferral)             pd_entry;
-       struct pf_state                         *pd_st;
-       struct mbuf                             *pd_m;
-       uint64_t                                 pd_deadline;
+static const char *pfsync_bulk_req_event_names[] = {
+       [PFSYNC_BREQ_EVT_UP]            = "up",
+       [PFSYNC_BREQ_EVT_DOWN]          = "down",
+       [PFSYNC_BREQ_EVT_TMO]           = "timeout",
+       [PFSYNC_BREQ_EVT_LINK]          = "link",
+       [PFSYNC_BREQ_EVT_BUS_START]     = "bus-start",
+       [PFSYNC_BREQ_EVT_BUS_END]       = "bus-end",
 };
-TAILQ_HEAD(pfsync_deferrals, pfsync_deferral);
 
-#define PFSYNC_PLSIZE  MAX(sizeof(struct pfsync_upd_req_item), \
-                           sizeof(struct pfsync_deferral))
+struct pfsync_slice {
+       struct pfsync_softc     *s_pfsync;
+       struct mutex             s_mtx;
 
-void   pfsync_out_tdb(struct tdb *, void *);
+       struct pf_state_queue    s_qs[PFSYNC_S_COUNT];
+       TAILQ_HEAD(, tdb)        s_tdb_q;
+       size_t                   s_len;
+       struct mbuf_list         s_ml;
+
+       struct taskq            *s_softnet;
+       struct task              s_task;
+       struct timeout           s_tmo;
+
+       struct mbuf_queue        s_sendq;
+       struct task              s_send;
+
+       struct pfsync_deferrals  s_deferrals;
+       unsigned int             s_deferred;
+       struct task              s_deferrals_task;
+       struct timeout           s_deferrals_tmo;
+
+       uint64_t                 s_stat_locks;
+       uint64_t                 s_stat_contended;
+       uint64_t                 s_stat_write_nop;
+       uint64_t                 s_stat_task_add;
+       uint64_t                 s_stat_task_run;
+       uint64_t                 s_stat_enqueue;
+       uint64_t                 s_stat_dequeue;
+
+       uint64_t                 s_stat_defer_add;
+       uint64_t                 s_stat_defer_ack;
+       uint64_t                 s_stat_defer_run;
+       uint64_t                 s_stat_defer_overlimit;
+
+       struct kstat            *s_kstat;
+} __aligned(CACHELINESIZE);
+
+#define PFSYNC_SLICE_BITS       1
+#define PFSYNC_NSLICES          (1 << PFSYNC_SLICE_BITS)
 
 struct pfsync_softc {
        struct ifnet             sc_if;
+       unsigned int             sc_dead;
+       unsigned int             sc_up;
+       struct refcnt            sc_refs;
+
+       /* config */
+       struct in_addr           sc_syncpeer;
+       unsigned int             sc_maxupdates;
+       unsigned int             sc_defer;
+
+       /* operation */
        unsigned int             sc_sync_ifidx;
+       unsigned int             sc_sync_if_down;
+       void                    *sc_inm;
+       struct task              sc_ltask;
+       struct task              sc_dtask;
+       struct ip                sc_template;
 
-       struct pool              sc_pool;
+       struct pfsync_slice      sc_slices[PFSYNC_NSLICES];
 
-       struct ip_moptions       sc_imo;
+       struct {
+               struct rwlock                    req_lock;
+               struct timeout                   req_tmo;
+               enum pfsync_bulk_req_state       req_state;
+               unsigned int                     req_tries;
+               unsigned int                     req_demoted;
+       }                        sc_bulk_req;
 
-       struct in_addr           sc_sync_peer;
-       u_int8_t                 sc_maxupdates;
+       struct {
+               struct rwlock                    snd_lock;
+               struct timeout                   snd_tmo;
+               time_t                           snd_requested;
+
+               struct pf_state                 *snd_next;
+               struct pf_state                 *snd_tail;
+               unsigned int                     snd_again;
+       }                        sc_bulk_snd;
+};
 
-       struct ip                sc_template;
+static struct pfsync_softc     *pfsyncif = NULL;
+static struct cpumem           *pfsynccounters;
 
-       struct pf_state_queue    sc_qs[PFSYNC_S_COUNT];
-       struct mutex             sc_st_mtx;
-       size_t                   sc_len;
+static inline void
+pfsyncstat_inc(enum pfsync_counters c)
+{
+       counters_inc(pfsynccounters, c);
+}
 
-       struct pfsync_upd_reqs   sc_upd_req_list;
-       struct mutex             sc_upd_req_mtx;
+static int     pfsync_clone_create(struct if_clone *, int);
+static int     pfsync_clone_destroy(struct ifnet *);
 
-       int                      sc_initial_bulk;
-       int                      sc_link_demoted;
+static int     pfsync_output(struct ifnet *, struct mbuf *, struct sockaddr *,
+                   struct rtentry *);
+static void    pfsync_start(struct ifqueue *);
 
-       int                      sc_defer;
-       struct pfsync_deferrals  sc_deferrals;
-       u_int                    sc_deferred;
-       struct mutex             sc_deferrals_mtx;
-       struct timeout           sc_deferrals_tmo;
+static int     pfsync_ioctl(struct ifnet *, u_long, caddr_t);
+static int     pfsync_up(struct pfsync_softc *);
+static int     pfsync_down(struct pfsync_softc *);
 
-       void                    *sc_plus;
-       size_t                   sc_pluslen;
+static int     pfsync_set_mtu(struct pfsync_softc *, unsigned int);
+static int     pfsync_set_parent(struct pfsync_softc *,
+                   const struct if_parent *);
+static int     pfsync_get_parent(struct pfsync_softc *, struct if_parent *);
+static int     pfsync_del_parent(struct pfsync_softc *);
 
-       u_int32_t                sc_ureq_sent;
-       int                      sc_bulk_tries;
-       struct timeout           sc_bulkfail_tmo;
+static int     pfsync_get_ioc(struct pfsync_softc *, struct ifreq *);
+static int     pfsync_set_ioc(struct pfsync_softc *, struct ifreq *);
 
-       u_int32_t                sc_ureq_received;
-       struct pf_state         *sc_bulk_next;
-       struct pf_state         *sc_bulk_last;
-       struct timeout           sc_bulk_tmo;
+static void    pfsync_syncif_link(void *);
+static void    pfsync_syncif_detach(void *);
 
-       TAILQ_HEAD(, tdb)        sc_tdb_q;
-       struct mutex             sc_tdb_mtx;
+static void    pfsync_sendout(struct pfsync_softc *, struct mbuf *);
+static void    pfsync_slice_drop(struct pfsync_softc *, struct pfsync_slice *);
 
-       struct task              sc_ltask;
-       struct task              sc_dtask;
+static void    pfsync_slice_tmo(void *);
+static void    pfsync_slice_task(void *);
+static void    pfsync_slice_sendq(void *);
+
+static void    pfsync_deferrals_tmo(void *);
+static void    pfsync_deferrals_task(void *);
+static void    pfsync_defer_output(struct pfsync_deferral *);
+
+static void    pfsync_bulk_req_evt(struct pfsync_softc *,
+                   enum pfsync_bulk_req_event);
+static void    pfsync_bulk_req_tmo(void *);
+
+static void    pfsync_bulk_snd_tmo(void *);
+
+#if NKSTAT > 0
+struct pfsync_kstat_data {
+       struct kstat_kv pd_locks;
+       struct kstat_kv pd_contended;
+       struct kstat_kv pd_write_nop;
+       struct kstat_kv pd_task_add;
+       struct kstat_kv pd_task_run;
+       struct kstat_kv pd_enqueue;
+       struct kstat_kv pd_dequeue;
+       struct kstat_kv pd_qdrop;
 
-       struct timeout           sc_tmo;
+       struct kstat_kv pd_defer_len;
+       struct kstat_kv pd_defer_add;
+       struct kstat_kv pd_defer_ack;
+       struct kstat_kv pd_defer_run;
+       struct kstat_kv pd_defer_overlimit;
 };
 
-struct pfsync_snapshot {
-       struct pfsync_softc     *sn_sc;
-       struct pf_state_queue    sn_qs[PFSYNC_S_COUNT];
-       struct pfsync_upd_reqs   sn_upd_req_list;
-       TAILQ_HEAD(, tdb)        sn_tdb_q;
-       size_t                   sn_len;
-       void                    *sn_plus;
-       size_t                   sn_pluslen;
+static const struct pfsync_kstat_data pfsync_kstat_tpl = {
+       KSTAT_KV_INITIALIZER("locks",           KSTAT_KV_T_COUNTER64),
+       KSTAT_KV_INITIALIZER("contended",       KSTAT_KV_T_COUNTER64),
+       KSTAT_KV_INITIALIZER("write-nops",      KSTAT_KV_T_COUNTER64),
+       KSTAT_KV_INITIALIZER("send-sched",      KSTAT_KV_T_COUNTER64),
+       KSTAT_KV_INITIALIZER("send-run",        KSTAT_KV_T_COUNTER64),
+       KSTAT_KV_INITIALIZER("enqueues",        KSTAT_KV_T_COUNTER64),
+       KSTAT_KV_INITIALIZER("dequeues",        KSTAT_KV_T_COUNTER64),
+       KSTAT_KV_UNIT_INITIALIZER("qdrops",
+           KSTAT_KV_T_COUNTER32, KSTAT_KV_U_PACKETS),
+
+       KSTAT_KV_UNIT_INITIALIZER("defer-len",
+           KSTAT_KV_T_COUNTER32, KSTAT_KV_U_PACKETS),
+       KSTAT_KV_INITIALIZER("defer-add",       KSTAT_KV_T_COUNTER64),
+       KSTAT_KV_INITIALIZER("defer-ack",       KSTAT_KV_T_COUNTER64),
+       KSTAT_KV_INITIALIZER("defer-run",       KSTAT_KV_T_COUNTER64),
+       KSTAT_KV_INITIALIZER("defer-over",      KSTAT_KV_T_COUNTER64),
 };
 
-struct pfsync_softc    *pfsyncif = NULL;
-struct cpumem          *pfsynccounters;
-
-void   pfsyncattach(int);
-int    pfsync_clone_create(struct if_clone *, int);
-int    pfsync_clone_destroy(struct ifnet *);
-void   pfsync_update_net_tdb(struct pfsync_tdb *);
-int    pfsyncoutput(struct ifnet *, struct mbuf *, struct sockaddr *,
-           struct rtentry *);
-int    pfsyncioctl(struct ifnet *, u_long, caddr_t);
-void   pfsyncstart(struct ifqueue *);
-void   pfsync_syncdev_state(void *);
-void   pfsync_ifdetach(void *);
-
-void   pfsync_deferred(struct pf_state *, int);
-void   pfsync_undefer(struct pfsync_deferral *, int);
-void   pfsync_deferrals_tmo(void *);
-
-void   pfsync_cancel_full_update(struct pfsync_softc *);
-void   pfsync_request_full_update(struct pfsync_softc *);
-void   pfsync_request_update(u_int32_t, u_int64_t);
-void   pfsync_update_state_req(struct pf_state *);
-
-void   pfsync_drop(struct pfsync_softc *);
-void   pfsync_sendout(void);
-void   pfsync_send_plus(void *, size_t);
-void   pfsync_timeout(void *);
-void   pfsync_tdb_timeout(void *);
-
-void   pfsync_bulk_start(void);
-void   pfsync_bulk_status(u_int8_t);
-void   pfsync_bulk_update(void *);
-void   pfsync_bulk_fail(void *);
-
-void   pfsync_grab_snapshot(struct pfsync_snapshot *, struct pfsync_softc *);
-void   pfsync_drop_snapshot(struct pfsync_snapshot *);
-
-void   pfsync_send_dispatch(void *);
-void   pfsync_send_pkt(struct mbuf *);
-
-static struct mbuf_queue       pfsync_mq;
-static struct task     pfsync_task =
-    TASK_INITIALIZER(pfsync_send_dispatch, &pfsync_mq);
+static int
+pfsync_kstat_copy(struct kstat *ks, void *dst)
+{
+       struct pfsync_slice *s = ks->ks_softc;
+       struct pfsync_kstat_data *pd = dst;
+
+       *pd = pfsync_kstat_tpl;
+       kstat_kv_u64(&pd->pd_locks) = s->s_stat_locks;
+       kstat_kv_u64(&pd->pd_contended) = s->s_stat_contended;
+       kstat_kv_u64(&pd->pd_write_nop) = s->s_stat_write_nop;
+       kstat_kv_u64(&pd->pd_task_add) = s->s_stat_task_add;
+       kstat_kv_u64(&pd->pd_task_run) = s->s_stat_task_run;
+       kstat_kv_u64(&pd->pd_enqueue) = s->s_stat_enqueue;
+       kstat_kv_u64(&pd->pd_dequeue) = s->s_stat_dequeue;
+       kstat_kv_u32(&pd->pd_qdrop) = mq_drops(&s->s_sendq);
+
+       kstat_kv_u32(&pd->pd_defer_len) = s->s_deferred;
+       kstat_kv_u64(&pd->pd_defer_add) = s->s_stat_defer_add;
+       kstat_kv_u64(&pd->pd_defer_ack) = s->s_stat_defer_ack;
+       kstat_kv_u64(&pd->pd_defer_run) = s->s_stat_defer_run;
+       kstat_kv_u64(&pd->pd_defer_overlimit) = s->s_stat_defer_overlimit;
+
+       return (0);
+}
+#endif /* NKSTAT > 0 */
 
 #define PFSYNC_MAX_BULKTRIES   12
-int    pfsync_sync_ok;
 
 struct if_clone        pfsync_cloner =
     IF_CLONE_INITIALIZER("pfsync", pfsync_clone_create, pfsync_clone_destroy);
@@ -312,63 +353,98 @@ struct if_clone   pfsync_cloner =
 void
 pfsyncattach(int npfsync)
 {
-       if_clone_attach(&pfsync_cloner);
        pfsynccounters = counters_alloc(pfsyncs_ncounters);
-       mq_init(&pfsync_mq, 4096, IPL_MPFLOOR);
+       if_clone_attach(&pfsync_cloner);
 }
 
-int
+static int
 pfsync_clone_create(struct if_clone *ifc, int unit)
 {
        struct pfsync_softc *sc;
        struct ifnet *ifp;
-       int q;
+       size_t i, q;
 
        if (unit != 0)
-               return (EINVAL);
-
-       pfsync_sync_ok = 1;
+               return (ENXIO);
 
-       sc = malloc(sizeof(*pfsyncif), M_DEVBUF, M_WAITOK|M_ZERO);
-       for (q = 0; q < PFSYNC_S_COUNT; q++)
-               TAILQ_INIT(&sc->sc_qs[q]);
-       mtx_init(&sc->sc_st_mtx, IPL_MPFLOOR);
+       if (pfsync_deferrals_pool.pr_size == 0) {
+               pool_init(&pfsync_deferrals_pool,
+                   sizeof(struct pfsync_deferral), 0,
+                   IPL_MPFLOOR, 0, "pfdefer", NULL);
+               /* pool_cache_init(&pfsync_deferrals_pool); */
+       }
 
-       pool_init(&sc->sc_pool, PFSYNC_PLSIZE, 0, IPL_MPFLOOR, 0, "pfsync",
-           NULL);
-       TAILQ_INIT(&sc->sc_upd_req_list);
-       mtx_init(&sc->sc_upd_req_mtx, IPL_MPFLOOR);
-       TAILQ_INIT(&sc->sc_deferrals);
-       mtx_init(&sc->sc_deferrals_mtx, IPL_MPFLOOR);
-       timeout_set_proc(&sc->sc_deferrals_tmo, pfsync_deferrals_tmo, sc);
-       task_set(&sc->sc_ltask, pfsync_syncdev_state, sc);
-       task_set(&sc->sc_dtask, pfsync_ifdetach, sc);
-       sc->sc_deferred = 0;
+       sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO|M_CANFAIL);
+       if (sc == NULL)
+               return (ENOMEM);
 
-       TAILQ_INIT(&sc->sc_tdb_q);
-       mtx_init(&sc->sc_tdb_mtx, IPL_MPFLOOR);
+       /* sc_refs is "owned" by IFF_RUNNING */
 
-       sc->sc_len = PFSYNC_MINPKT;
+       sc->sc_syncpeer.s_addr = INADDR_PFSYNC_GROUP;
        sc->sc_maxupdates = 128;
+       sc->sc_defer = 0;
+
+       task_set(&sc->sc_ltask, pfsync_syncif_link, sc);
+       task_set(&sc->sc_dtask, pfsync_syncif_detach, sc);
+
+       rw_init(&sc->sc_bulk_req.req_lock, "pfsyncbreq");
+       /* need process context to take net lock to call ip_output */
+       timeout_set_proc(&sc->sc_bulk_req.req_tmo, pfsync_bulk_req_tmo, sc);
 
-       sc->sc_imo.imo_membership = mallocarray(IP_MIN_MEMBERSHIPS,
-           sizeof(struct in_multi *), M_IPMOPTS, M_WAITOK|M_ZERO);
-       sc->sc_imo.imo_max_memberships = IP_MIN_MEMBERSHIPS;
+       rw_init(&sc->sc_bulk_snd.snd_lock, "pfsyncbsnd");
+       /* need process context to take net lock to call ip_output */
+       timeout_set_proc(&sc->sc_bulk_snd.snd_tmo, pfsync_bulk_snd_tmo, sc);
 
        ifp = &sc->sc_if;
-       snprintf(ifp->if_xname, sizeof ifp->if_xname, "pfsync%d", unit);
+       snprintf(ifp->if_xname, sizeof ifp->if_xname, "%s%d",
+           ifc->ifc_name, unit);
        ifp->if_softc = sc;
-       ifp->if_ioctl = pfsyncioctl;
-       ifp->if_output = pfsyncoutput;
-       ifp->if_qstart = pfsyncstart;
+       ifp->if_ioctl = pfsync_ioctl;
+       ifp->if_output = pfsync_output;
+       ifp->if_qstart = pfsync_start;
        ifp->if_type = IFT_PFSYNC;
        ifp->if_hdrlen = sizeof(struct pfsync_header);
        ifp->if_mtu = ETHERMTU;
        ifp->if_xflags = IFXF_CLONED | IFXF_MPSAFE;
-       timeout_set_proc(&sc->sc_tmo, pfsync_timeout, NULL);
-       timeout_set_proc(&sc->sc_bulk_tmo, pfsync_bulk_update, NULL);
-       timeout_set_proc(&sc->sc_bulkfail_tmo, pfsync_bulk_fail, NULL);
 
+       for (i = 0; i < nitems(sc->sc_slices); i++) {
+               struct pfsync_slice *s = &sc->sc_slices[i];
+
+               s->s_pfsync = sc;
+
+               mtx_init_flags(&s->s_mtx, IPL_SOFTNET, "pfslice", 0);
+               s->s_softnet = net_tq(i);
+               timeout_set(&s->s_tmo, pfsync_slice_tmo, s);
+               task_set(&s->s_task, pfsync_slice_task, s);
+
+               mq_init(&s->s_sendq, 16, IPL_SOFTNET);
+               task_set(&s->s_send, pfsync_slice_sendq, s);
+
+               s->s_len = PFSYNC_MINPKT;
+               ml_init(&s->s_ml);
+
+               for (q = 0; q < nitems(s->s_qs); q++)
+                       TAILQ_INIT(&s->s_qs[q]);
+               TAILQ_INIT(&s->s_tdb_q);
+
+               /* stupid NET_LOCK */
+               timeout_set(&s->s_deferrals_tmo, pfsync_deferrals_tmo, s);
+               task_set(&s->s_deferrals_task, pfsync_deferrals_task, s);
+               TAILQ_INIT(&s->s_deferrals);
+
+#if NKSTAT > 0
+               s->s_kstat = kstat_create(ifp->if_xname, 0, "pfsync-slice", i,
+                   KSTAT_T_KV, 0);
+
+               kstat_set_mutex(s->s_kstat, &s->s_mtx);
+               s->s_kstat->ks_softc = s;
+               s->s_kstat->ks_datalen = sizeof(pfsync_kstat_tpl);
+               s->s_kstat->ks_copy = pfsync_kstat_copy;
+               kstat_install(s->s_kstat);
+#endif
+       }
+
+       if_counters_alloc(ifp);
        if_attach(ifp);
        if_alloc_sadl(ifp);
 
@@ -380,937 +456,861 @@ pfsync_clone_create(struct if_clone *ifc, int unit)
        bpfattach(&sc->sc_if.if_bpf, ifp, DLT_PFSYNC, PFSYNC_HDRLEN);
 #endif
 
-       pfsyncif = sc;
-
        return (0);
 }
 
-int
+static int
 pfsync_clone_destroy(struct ifnet *ifp)
 {
        struct pfsync_softc *sc = ifp->if_softc;
-       struct ifnet *ifp0;
-       struct pfsync_deferral *pd;
-       struct pfsync_deferrals  deferrals;
+#if NKSTAT > 0
+       size_t i;
+#endif
 
        NET_LOCK();
+       sc->sc_dead = 1;
 
-#if NCARP > 0
-       if (!pfsync_sync_ok)
-               carp_group_demote_adj(&sc->sc_if, -1, "pfsync destroy");
-       if (sc->sc_link_demoted)
-               carp_group_demote_adj(&sc->sc_if, -1, "pfsync destroy");
-#endif
-       if ((ifp0 = if_get(sc->sc_sync_ifidx)) != NULL) {
-               if_linkstatehook_del(ifp0, &sc->sc_ltask);
-               if_detachhook_del(ifp0, &sc->sc_dtask);
-       }
-       if_put(ifp0);
-
-       /* XXXSMP breaks atomicity */
+       if (ISSET(ifp->if_flags, IFF_RUNNING))
+               pfsync_down(sc);
        NET_UNLOCK();
-       if_detach(ifp);
-       NET_LOCK();
 
-       pfsync_drop(sc);
+       if_detach(ifp);
 
-       if (sc->sc_deferred > 0) {
-               TAILQ_INIT(&deferrals);
-               mtx_enter(&sc->sc_deferrals_mtx);
-               TAILQ_CONCAT(&deferrals, &sc->sc_deferrals, pd_entry);
-               sc->sc_deferred = 0;
-               mtx_leave(&sc->sc_deferrals_mtx);
+#if NKSTAT > 0
+       for (i = 0; i < nitems(sc->sc_slices); i++) {
+               struct pfsync_slice *s = &sc->sc_slices[i];
 
-               while ((pd = TAILQ_FIRST(&deferrals)) != NULL) {
-                       TAILQ_REMOVE(&deferrals, pd, pd_entry);
-                       pfsync_undefer(pd, 0);
-               }
+               kstat_destroy(s->s_kstat);
        }
+#endif
 
-       pfsyncif = NULL;
-       timeout_del(&sc->sc_bulkfail_tmo);
-       timeout_del(&sc->sc_bulk_tmo);
-       timeout_del(&sc->sc_tmo);
-
-       NET_UNLOCK();
-
-       pool_destroy(&sc->sc_pool);
-       free(sc->sc_imo.imo_membership, M_IPMOPTS,
-           sc->sc_imo.imo_max_memberships * sizeof(struct in_multi *));
        free(sc, M_DEVBUF, sizeof(*sc));
 
        return (0);
 }
 
-/*
- * Start output on the pfsync interface.
- */
-void
-pfsyncstart(struct ifqueue *ifq)
+static void
+pfsync_dprintf(struct pfsync_softc *sc, const char *fmt, ...)
 {
-       ifq_purge(ifq);
+       struct ifnet *ifp = &sc->sc_if;
+       va_list ap;
+
+       if (!ISSET(ifp->if_flags, IFF_DEBUG))
+               return;
+
+       printf("%s: ", ifp->if_xname);
+       va_start(ap, fmt);
+       vprintf(fmt, ap);
+       va_end(ap);
+       printf("\n");
 }
 
-void
-pfsync_syncdev_state(void *arg)
+static void
+pfsync_syncif_link(void *arg)
 {
        struct pfsync_softc *sc = arg;
-       struct ifnet *ifp;
+       struct ifnet *ifp0;
+       unsigned int sync_if_down = 1;
 
-       if ((sc->sc_if.if_flags & IFF_UP) == 0)
-               return;
-       if ((ifp = if_get(sc->sc_sync_ifidx)) == NULL)
-               return;
+       ifp0 = if_get(sc->sc_sync_ifidx);
+       if (ifp0 != NULL && LINK_STATE_IS_UP(ifp0->if_link_state)) {
+               pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_LINK);
+               sync_if_down = 0;
+       }
+       if_put(ifp0);
 
-       if (ifp->if_link_state == LINK_STATE_DOWN) {
-               sc->sc_if.if_flags &= ~IFF_RUNNING;
-               if (!sc->sc_link_demoted) {
 #if NCARP > 0
-                       carp_group_demote_adj(&sc->sc_if, 1,
-                           "pfsync link state down");
-#endif
-                       sc->sc_link_demoted = 1;
-               }
-
-               /* drop everything */
-               timeout_del(&sc->sc_tmo);
-               pfsync_drop(sc);
-
-               pfsync_cancel_full_update(sc);
-       } else if (sc->sc_link_demoted) {
-               sc->sc_if.if_flags |= IFF_RUNNING;
-
-               pfsync_request_full_update(sc);
+       if (sc->sc_sync_if_down != sync_if_down) {
+               carp_group_demote_adj(&sc->sc_if,
+                   sync_if_down ? 1 : -1, "pfsync link");
        }
+#endif
 
-       if_put(ifp);
+       sc->sc_sync_if_down = sync_if_down;
 }
 
-void
-pfsync_ifdetach(void *arg)
+static void
+pfsync_syncif_detach(void *arg)
 {
        struct pfsync_softc *sc = arg;
-       struct ifnet *ifp;
+       struct ifnet *ifp = &sc->sc_if;
 
-       if ((ifp = if_get(sc->sc_sync_ifidx)) != NULL) {
-               if_linkstatehook_del(ifp, &sc->sc_ltask);
-               if_detachhook_del(ifp, &sc->sc_dtask);
+       if (ISSET(ifp->if_flags, IFF_RUNNING)) {
+               pfsync_down(sc);
+               if_down(ifp);
        }
-       if_put(ifp);
 
        sc->sc_sync_ifidx = 0;
 }
 
-int
-pfsync_input(struct mbuf **mp, int *offp, int proto, int af)
+static int
+pfsync_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
+    struct rtentry *rt)
 {
-       struct mbuf *n, *m = *mp;
-       struct pfsync_softc *sc = pfsyncif;
-       struct ip *ip = mtod(m, struct ip *);
-       struct pfsync_header *ph;
-       struct pfsync_subheader subh;
-       int offset, noff, len, count, mlen, flags = 0;
-       int e;
-
-       NET_ASSERT_LOCKED();
+       m_freem(m);     /* drop packet */
+       return (EAFNOSUPPORT);
+}
 
-       pfsyncstat_inc(pfsyncs_ipackets);
+static int
+pfsync_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
+{
+       struct pfsync_softc *sc = ifp->if_softc;
+       struct ifreq *ifr = (struct ifreq *)data;
+       int error = ENOTTY;
 
-       /* verify that we have a sync interface configured */
-       if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING) ||
-           sc->sc_sync_ifidx == 0 || !pf_status.running)
-               goto done;
+       switch (cmd) {
+       case SIOCSIFADDR:
+               error = EOPNOTSUPP;
+               break;
 
-       /* verify that the packet came in on the right interface */
-       if (sc->sc_sync_ifidx != m->m_pkthdr.ph_ifidx) {
-               pfsyncstat_inc(pfsyncs_badif);
-               goto done;
-       }
+       case SIOCSIFFLAGS:
+               if (ISSET(ifp->if_flags, IFF_UP)) {
+                       if (!ISSET(ifp->if_flags, IFF_RUNNING))
+                               error = pfsync_up(sc);
+                       else
+                               error = ENETRESET;
+               } else {
+                       if (ISSET(ifp->if_flags, IFF_RUNNING))
+                               error = pfsync_down(sc);
+               }
+               break;
 
-       sc->sc_if.if_ipackets++;
-       sc->sc_if.if_ibytes += m->m_pkthdr.len;
+       case SIOCSIFMTU:
+               error = pfsync_set_mtu(sc, ifr->ifr_mtu);
+               break;
 
-       /* verify that the IP TTL is 255. */
-       if (ip->ip_ttl != PFSYNC_DFLTTL) {
-               pfsyncstat_inc(pfsyncs_badttl);
-               goto done;
-       }
+       case SIOCSIFPARENT:
+               error = pfsync_set_parent(sc, (struct if_parent *)data);
+               break;
+       case SIOCGIFPARENT:
+               error = pfsync_get_parent(sc, (struct if_parent *)data);
+               break;
+       case SIOCDIFPARENT:
+               error = pfsync_del_parent(sc);
+               break;
 
-       offset = ip->ip_hl << 2;
-       n = m_pulldown(m, offset, sizeof(*ph), &noff);
-       if (n == NULL) {
-               pfsyncstat_inc(pfsyncs_hdrops);
-               return IPPROTO_DONE;
-       }
-       ph = (struct pfsync_header *)(n->m_data + noff);
+       case SIOCSETPFSYNC:
+               error = pfsync_set_ioc(sc, ifr);
+               break;
+       case SIOCGETPFSYNC:
+               error = pfsync_get_ioc(sc, ifr);
+               break;
 
-       /* verify the version */
-       if (ph->version != PFSYNC_VERSION) {
-               pfsyncstat_inc(pfsyncs_badver);
-               goto done;
-       }
-       len = ntohs(ph->len) + offset;
-       if (m->m_pkthdr.len < len) {
-               pfsyncstat_inc(pfsyncs_badlen);
-               goto done;
+       default:
+               break;
        }
 
-       if (!bcmp(&ph->pfcksum, &pf_status.pf_chksum, PF_MD5_DIGEST_LENGTH))
-               flags = PFSYNC_SI_CKSUM;
-
-       offset += sizeof(*ph);
-       while (offset <= len - sizeof(subh)) {
-               m_copydata(m, offset, sizeof(subh), &subh);
-               offset += sizeof(subh);
-
-               mlen = subh.len << 2;
-               count = ntohs(subh.count);
-
-               if (subh.action >= PFSYNC_ACT_MAX ||
-                   subh.action >= nitems(pfsync_acts) ||
-                   mlen < pfsync_acts[subh.action].len) {
-                       /*
-                        * subheaders are always followed by at least one
-                        * message, so if the peer is new
-                        * enough to tell us how big its messages are then we
-                        * know enough to skip them.
-                        */
-                       if (count > 0 && mlen > 0) {
-                               offset += count * mlen;
-                               continue;
-                       }
-                       pfsyncstat_inc(pfsyncs_badact);
-                       goto done;
-               }
-
-               n = m_pulldown(m, offset, mlen * count, &noff);
-               if (n == NULL) {
-                       pfsyncstat_inc(pfsyncs_badlen);
-                       return IPPROTO_DONE;
-               }
-
-               e = pfsync_acts[subh.action].in(n->m_data + noff, mlen, count,
-                   flags);
-               if (e != 0)
-                       goto done;
-
-               offset += mlen * count;
-       }
+       if (error == ENETRESET)
+               error = 0;
 
-done:
-       m_freem(m);
-       return IPPROTO_DONE;
+       return (error);
 }
 
-int
-pfsync_in_clr(caddr_t buf, int len, int count, int flags)
+static int
+pfsync_set_mtu(struct pfsync_softc *sc, unsigned int mtu)
 {
-       struct pfsync_clr *clr;
-       struct pf_state *st, *nexts;
-       struct pfi_kif *kif;
-       u_int32_t creatorid;
-       int i;
+       struct ifnet *ifp = &sc->sc_if;
+       struct ifnet *ifp0;
+       int error = 0;
 
-       PF_LOCK();
-       for (i = 0; i < count; i++) {
-               clr = (struct pfsync_clr *)buf + len * i;
-               kif = NULL;
-               creatorid = clr->creatorid;
-               if (strlen(clr->ifname) &&
-                   (kif = pfi_kif_find(clr->ifname)) == NULL)
-                       continue;
+       ifp0 = if_get(sc->sc_sync_ifidx);
+       if (ifp0 == NULL)
+               return (EINVAL);
 
-               PF_STATE_ENTER_WRITE();
-               RBT_FOREACH_SAFE(st, pf_state_tree_id, &tree_id, nexts) {
-                       if (st->creatorid == creatorid &&
-                           ((kif && st->kif == kif) || !kif)) {
-                               SET(st->state_flags, PFSTATE_NOSYNC);
-                               pf_remove_state(st);
-                       }
-               }
-               PF_STATE_EXIT_WRITE();
+       if (mtu <= PFSYNC_MINPKT || mtu > ifp0->if_mtu) {
+               error = EINVAL;
+               goto put;
        }
-       PF_UNLOCK();
 
-       return (0);
+       /* commit */
+       ifp->if_mtu = mtu;
+
+put:
+       if_put(ifp0);
+       return (error);
 }
 
-int
-pfsync_in_ins(caddr_t buf, int len, int count, int flags)
+static int
+pfsync_set_parent(struct pfsync_softc *sc, const struct if_parent *p)
 {
-       struct pfsync_state *sp;
-       sa_family_t af1, af2;
-       int i;
+       struct ifnet *ifp = &sc->sc_if;
+       struct ifnet *ifp0;
+       int error = 0;
 
-       PF_LOCK();
-       for (i = 0; i < count; i++) {
-               sp = (struct pfsync_state *)(buf + len * i);
-               af1 = sp->key[0].af;
-               af2 = sp->key[1].af;
+       ifp0 = if_unit(p->ifp_parent);
+       if (ifp0 == NULL)
+               return (ENXIO);
 
-               /* check for invalid values */
-               if (sp->timeout >= PFTM_MAX ||
-                   sp->src.state > PF_TCPS_PROXY_DST ||
-                   sp->dst.state > PF_TCPS_PROXY_DST ||
-                   sp->direction > PF_OUT ||
-                   (((af1 || af2) &&
-                    ((af1 != AF_INET && af1 != AF_INET6) ||
-                     (af2 != AF_INET && af2 != AF_INET6))) ||
-                   (sp->af != AF_INET && sp->af != AF_INET6))) {
-                       DPFPRINTF(LOG_NOTICE,
-                           "pfsync_input: PFSYNC5_ACT_INS: invalid value");
-                       pfsyncstat_inc(pfsyncs_badval);
-                       continue;
-               }
+       if (ifp0->if_index == sc->sc_sync_ifidx)
+               goto put;
 
-               if (pf_state_import(sp, flags) == ENOMEM) {
-                       /* drop out, but process the rest of the actions */
-                       break;
-               }
+       if (ISSET(ifp->if_flags, IFF_RUNNING)) {
+               error = EBUSY;
+               goto put;
        }
-       PF_UNLOCK();
 
-       return (0);
+       /* commit */
+       sc->sc_sync_ifidx = ifp0->if_index;
+
+put:
+       if_put(ifp0);
+       return (error);
 }
 
-int
-pfsync_in_iack(caddr_t buf, int len, int count, int flags)
+static int
+pfsync_get_parent(struct pfsync_softc *sc, struct if_parent *p)
 {
-       struct pfsync_ins_ack *ia;
-       struct pf_state_cmp id_key;
-       struct pf_state *st;
-       int i;
+       struct ifnet *ifp0;
+       int error = 0;
 
-       for (i = 0; i < count; i++) {
-               ia = (struct pfsync_ins_ack *)(buf + len * i);
+       ifp0 = if_get(sc->sc_sync_ifidx);
+       if (ifp0 == NULL)
+               error = EADDRNOTAVAIL;
+       else
+               strlcpy(p->ifp_parent, ifp0->if_xname, sizeof(p->ifp_parent));
+       if_put(ifp0);
 
-               id_key.id = ia->id;
-               id_key.creatorid = ia->creatorid;
+       return (error);
+}
 
-               PF_STATE_ENTER_READ();
-               st = pf_find_state_byid(&id_key);
-               pf_state_ref(st);
-               PF_STATE_EXIT_READ();
-               if (st == NULL)
-                       continue;
+static int
+pfsync_del_parent(struct pfsync_softc *sc)
+{
+       struct ifnet *ifp = &sc->sc_if;
 
-               if (ISSET(st->state_flags, PFSTATE_ACK))
-                       pfsync_deferred(st, 0);
+       if (ISSET(ifp->if_flags, IFF_RUNNING))
+               return (EBUSY);
 
-               pf_state_unref(st);
-       }
+       /* commit */
+       sc->sc_sync_ifidx = 0;
 
        return (0);
 }
 
-int
-pfsync_upd_tcp(struct pf_state *st, struct pfsync_state_peer *src,
-    struct pfsync_state_peer *dst)
+static int
+pfsync_get_ioc(struct pfsync_softc *sc, struct ifreq *ifr)
 {
-       int sync = 0;
-
-       /*
-        * The state should never go backwards except
-        * for syn-proxy states.  Neither should the
-        * sequence window slide backwards.
-        */
-       if ((st->src.state > src->state &&
-           (st->src.state < PF_TCPS_PROXY_SRC ||
-           src->state >= PF_TCPS_PROXY_SRC)) ||
+       struct pfsyncreq pfsyncr;
+       struct ifnet *ifp0;
 
-           (st->src.state == src->state &&
-           SEQ_GT(st->src.seqlo, ntohl(src->seqlo))))
-               sync++;
-       else
-               pf_state_peer_ntoh(src, &st->src);
+       memset(&pfsyncr, 0, sizeof(pfsyncr));
 
-       if ((st->dst.state > dst->state) ||
+       ifp0 = if_get(sc->sc_sync_ifidx);
+       if (ifp0 != NULL) {
+               strlcpy(pfsyncr.pfsyncr_syncdev, ifp0->if_xname,
+                   sizeof(pfsyncr.pfsyncr_syncdev));
+       }
+       if_put(ifp0);
 
-           (st->dst.state >= TCPS_SYN_SENT &&
-           SEQ_GT(st->dst.seqlo, ntohl(dst->seqlo))))
-               sync++;
-       else
-               pf_state_peer_ntoh(dst, &st->dst);
+       pfsyncr.pfsyncr_syncpeer = sc->sc_syncpeer;
+       pfsyncr.pfsyncr_maxupdates = sc->sc_maxupdates;
+       pfsyncr.pfsyncr_defer = sc->sc_defer;
 
-       return (sync);
+       return (copyout(&pfsyncr, ifr->ifr_data, sizeof(pfsyncr)));
 }
 
-int
-pfsync_in_upd(caddr_t buf, int len, int count, int flags)
+static int
+pfsync_set_ioc(struct pfsync_softc *sc, struct ifreq *ifr)
 {
-       struct pfsync_state *sp;
-       struct pf_state_cmp id_key;
-       struct pf_state *st;
-       int sync, error;
-       int i;
-
-       for (i = 0; i < count; i++) {
-               sp = (struct pfsync_state *)(buf + len * i);
-
-               /* check for invalid values */
-               if (sp->timeout >= PFTM_MAX ||
-                   sp->src.state > PF_TCPS_PROXY_DST ||
-                   sp->dst.state > PF_TCPS_PROXY_DST) {
-                       DPFPRINTF(LOG_NOTICE,
-                           "pfsync_input: PFSYNC_ACT_UPD: invalid value");
-                       pfsyncstat_inc(pfsyncs_badval);
-                       continue;
-               }
-
-               id_key.id = sp->id;
-               id_key.creatorid = sp->creatorid;
+       struct ifnet *ifp = &sc->sc_if;
+       struct pfsyncreq pfsyncr;
+       unsigned int sync_ifidx = sc->sc_sync_ifidx;
+       int wantdown = 0;
+       int error;
 
-               PF_STATE_ENTER_READ();
-               st = pf_find_state_byid(&id_key);
-               pf_state_ref(st);
-               PF_STATE_EXIT_READ();
-               if (st == NULL) {
-                       /* insert the update */
-                       PF_LOCK();
-                       error = pf_state_import(sp, flags);
-                       if (error)
-                               pfsyncstat_inc(pfsyncs_badstate);
-                       PF_UNLOCK();
-                       continue;
-               }
+       error = suser(curproc);
+       if (error != 0)
+               return (error);
 
-               if (ISSET(st->state_flags, PFSTATE_ACK))
-                       pfsync_deferred(st, 1);
+       error = copyin(ifr->ifr_data, &pfsyncr, sizeof(pfsyncr));
+       if (error != 0)
+               return (error);
 
-               if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP)
-                       sync = pfsync_upd_tcp(st, &sp->src, &sp->dst);
-               else {
-                       sync = 0;
+       if (pfsyncr.pfsyncr_maxupdates > 255)
+               return (EINVAL);
 
-                       /*
-                        * Non-TCP protocol state machine always go
-                        * forwards
-                        */
-                       if (st->src.state > sp->src.state)
-                               sync++;
-                       else
-                               pf_state_peer_ntoh(&sp->src, &st->src);
+       if (pfsyncr.pfsyncr_syncdev[0] != '\0') { /* set */
+               struct ifnet *ifp0 = if_unit(pfsyncr.pfsyncr_syncdev);
+               if (ifp0 == NULL)
+                       return (ENXIO);
 
-                       if (st->dst.state > sp->dst.state)
-                               sync++;
-                       else
-                               pf_state_peer_ntoh(&sp->dst, &st->dst);
-               }
+               if (ifp0->if_index != sync_ifidx)
+                       wantdown = 1;
 
-               if (sync < 2) {
-                       pf_state_alloc_scrub_memory(&sp->dst, &st->dst);
-                       pf_state_peer_ntoh(&sp->dst, &st->dst);
-                       st->expire = getuptime();
-                       st->timeout = sp->timeout;
-               }
-               st->pfsync_time = getuptime();
+               sync_ifidx = ifp0->if_index;
+               if_put(ifp0);
+       } else { /* del */
+               wantdown = 1;
+               sync_ifidx = 0;
+       }
 
-               if (sync) {
-                       pfsyncstat_inc(pfsyncs_stale);
+       if (pfsyncr.pfsyncr_syncpeer.s_addr == INADDR_ANY)
+               pfsyncr.pfsyncr_syncpeer.s_addr = INADDR_PFSYNC_GROUP;
+       if (pfsyncr.pfsyncr_syncpeer.s_addr != sc->sc_syncpeer.s_addr)
+               wantdown = 1;
 
-                       pfsync_update_state(st);
-                       schednetisr(NETISR_PFSYNC);
-               }
+       if (wantdown && ISSET(ifp->if_flags, IFF_RUNNING))
+               return (EBUSY);
 
-               pf_state_unref(st);
-       }
+       /* commit */
+       sc->sc_sync_ifidx = sync_ifidx;
+       sc->sc_syncpeer = pfsyncr.pfsyncr_syncpeer;
+       sc->sc_maxupdates = pfsyncr.pfsyncr_maxupdates;
+       sc->sc_defer = pfsyncr.pfsyncr_defer;
 
        return (0);
 }
 
-int
-pfsync_in_upd_c(caddr_t buf, int len, int count, int flags)
+static int
+pfsync_up(struct pfsync_softc *sc)
 {
-       struct pfsync_upd_c *up;
-       struct pf_state_cmp id_key;
-       struct pf_state *st;
-
-       int sync;
-
-       int i;
-
-       for (i = 0; i < count; i++) {
-               up = (struct pfsync_upd_c *)(buf + len * i);
+       struct ifnet *ifp = &sc->sc_if;
+       struct ifnet *ifp0;
+       void *inm = NULL;
+       int error = 0;
+       struct ip *ip;
 
-               /* check for invalid values */
-               if (up->timeout >= PFTM_MAX ||
-                   up->src.state > PF_TCPS_PROXY_DST ||
-                   up->dst.state > PF_TCPS_PROXY_DST) {
-                       DPFPRINTF(LOG_NOTICE,
-                           "pfsync_input: PFSYNC_ACT_UPD_C: invalid value");
-                       pfsyncstat_inc(pfsyncs_badval);
-                       continue;
-               }
+       NET_ASSERT_LOCKED();
+       KASSERT(!ISSET(ifp->if_flags, IFF_RUNNING));
 
-               id_key.id = up->id;
-               id_key.creatorid = up->creatorid;
+       if (sc->sc_dead)
+               return (ENXIO);
 
-               PF_STATE_ENTER_READ();
-               st = pf_find_state_byid(&id_key);
-               pf_state_ref(st);
-               PF_STATE_EXIT_READ();
-               if (st == NULL) {
-                       /* We don't have this state. Ask for it. */
-                       pfsync_request_update(id_key.creatorid, id_key.id);
-                       continue;
-               }
+       /*
+        * coordinate with pfsync_down(). if sc_up is still up and
+        * we're here then something else is tearing pfsync down.
+        */
+       if (sc->sc_up)
+               return (EBUSY);
 
-               if (ISSET(st->state_flags, PFSTATE_ACK))
-                       pfsync_deferred(st, 1);
+       if (sc->sc_syncpeer.s_addr == INADDR_ANY ||
+           sc->sc_syncpeer.s_addr == INADDR_BROADCAST)
+               return (EDESTADDRREQ);
 
-               if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP)
-                       sync = pfsync_upd_tcp(st, &up->src, &up->dst);
-               else {
-                       sync = 0;
-                       /*
-                        * Non-TCP protocol state machine always go
-                        * forwards
-                        */
-                       if (st->src.state > up->src.state)
-                               sync++;
-                       else
-                               pf_state_peer_ntoh(&up->src, &st->src);
+       ifp0 = if_get(sc->sc_sync_ifidx);
+       if (ifp0 == NULL)
+               return (ENXIO);
 
-                       if (st->dst.state > up->dst.state)
-                               sync++;
-                       else
-                               pf_state_peer_ntoh(&up->dst, &st->dst);
+       if (IN_MULTICAST(sc->sc_syncpeer.s_addr)) {
+               if (!ISSET(ifp0->if_flags, IFF_MULTICAST)) {
+                       error = ENODEV;
+                       goto put;
                }
-               if (sync < 2) {
-                       pf_state_alloc_scrub_memory(&up->dst, &st->dst);
-                       pf_state_peer_ntoh(&up->dst, &st->dst);
-                       st->expire = getuptime();
-                       st->timeout = up->timeout;
+               inm = in_addmulti(&sc->sc_syncpeer, ifp0);
+               if (inm == NULL) {
+                       error = ECONNABORTED;
+                       goto put;
                }
-               st->pfsync_time = getuptime();
-
-               if (sync) {
-                       pfsyncstat_inc(pfsyncs_stale);
+       }
 
-                       pfsync_update_state(st);
-                       schednetisr(NETISR_PFSYNC);
-               }
+       sc->sc_up = 1;
 
-               pf_state_unref(st);
-       }
+       ip = &sc->sc_template;
+       memset(ip, 0, sizeof(*ip));
+       ip->ip_v = IPVERSION;
+       ip->ip_hl = sizeof(*ip) >> 2;
+       ip->ip_tos = IPTOS_LOWDELAY;
+       /* len and id are set later */
+       ip->ip_off = htons(IP_DF);
+       ip->ip_ttl = PFSYNC_DFLTTL;
+       ip->ip_p = IPPROTO_PFSYNC;
+       ip->ip_src.s_addr = INADDR_ANY;
+       ip->ip_dst.s_addr = sc->sc_syncpeer.s_addr;
 
-       return (0);
-}
+       /* commit */
+       refcnt_init(&sc->sc_refs); /* IFF_RUNNING kind of owns this */
 
-int
-pfsync_in_ureq(caddr_t buf, int len, int count, int flags)
-{
-       struct pfsync_upd_req *ur;
-       int i;
+#if NCARP > 0
+       sc->sc_sync_if_down = 1;
+       carp_group_demote_adj(&sc->sc_if, 1, "pfsync up");
+#endif
 
-       struct pf_state_cmp id_key;
-       struct pf_state *st;
+       if_linkstatehook_add(ifp0, &sc->sc_ltask);
+       if_detachhook_add(ifp0, &sc->sc_dtask);
 
-       for (i = 0; i < count; i++) {
-               ur = (struct pfsync_upd_req *)(buf + len * i);
+       sc->sc_inm = inm;
+       SET(ifp->if_flags, IFF_RUNNING);
 
-               id_key.id = ur->id;
-               id_key.creatorid = ur->creatorid;
+       pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_UP);
 
-               if (id_key.id == 0 && id_key.creatorid == 0)
-                       pfsync_bulk_start();
-               else {
-                       PF_STATE_ENTER_READ();
-                       st = pf_find_state_byid(&id_key);
-                       pf_state_ref(st);
-                       PF_STATE_EXIT_READ();
-                       if (st == NULL) {
-                               pfsyncstat_inc(pfsyncs_badstate);
-                               continue;
-                       }
-                       if (ISSET(st->state_flags, PFSTATE_NOSYNC)) {
-                               pf_state_unref(st);
-                               continue;
-                       }
+       refcnt_take(&sc->sc_refs); /* give one to SMR */
+       SMR_PTR_SET_LOCKED(&pfsyncif, sc);
 
-                       pfsync_update_state_req(st);
-                       pf_state_unref(st);
-               }
-       }
+       pfsync_syncif_link(sc); /* try and push the bulk req state forward */
 
-       return (0);
+put:
+       if_put(ifp0);
+       return (error);
 }
 
-int
-pfsync_in_del(caddr_t buf, int len, int count, int flags)
+static struct mbuf *
+pfsync_encap(struct pfsync_softc *sc, struct mbuf *m)
 {
-       struct pfsync_state *sp;
-       struct pf_state_cmp id_key;
-       struct pf_state *st;
-       int i;
+       struct {
+               struct ip               ip;
+               struct pfsync_header    ph;
+       } __packed __aligned(4) *h;
+       unsigned int mlen = m->m_pkthdr.len;
 
-       PF_STATE_ENTER_WRITE();
-       for (i = 0; i < count; i++) {
-               sp = (struct pfsync_state *)(buf + len * i);
+       m = m_prepend(m, sizeof(*h), M_DONTWAIT);
+       if (m == NULL)
+               return (NULL);
 
-               id_key.id = sp->id;
-               id_key.creatorid = sp->creatorid;
+       h = mtod(m, void *);
+       memset(h, 0, sizeof(*h));
 
-               st = pf_find_state_byid(&id_key);
-               if (st == NULL) {
-                       pfsyncstat_inc(pfsyncs_badstate);
-                       continue;
-               }
-               SET(st->state_flags, PFSTATE_NOSYNC);
-               pf_remove_state(st);
-       }
-       PF_STATE_EXIT_WRITE();
+       mlen += sizeof(h->ph);
+       h->ph.version = PFSYNC_VERSION;
+       h->ph.len = htons(mlen);
+       /* h->ph.pfcksum */
 
-       return (0);
+       mlen += sizeof(h->ip);
+       h->ip = sc->sc_template;
+       h->ip.ip_len = htons(mlen);
+       h->ip.ip_id = htons(ip_randomid());
+
+       return (m);
 }
 
-int
-pfsync_in_del_c(caddr_t buf, int len, int count, int flags)
+static void
+pfsync_bulk_req_send(struct pfsync_softc *sc)
 {
-       struct pfsync_del_c *sp;
-       struct pf_state_cmp id_key;
-       struct pf_state *st;
-       int i;
-
-       PF_LOCK();
-       PF_STATE_ENTER_WRITE();
-       for (i = 0; i < count; i++) {
-               sp = (struct pfsync_del_c *)(buf + len * i);
-
-               id_key.id = sp->id;
-               id_key.creatorid = sp->creatorid;
+       struct {
+               struct pfsync_subheader subh;
+               struct pfsync_upd_req   ur;
+       } __packed __aligned(4) *h;
+       unsigned mlen = max_linkhdr +
+           sizeof(struct ip) + sizeof(struct pfsync_header) + sizeof(*h);
+       struct mbuf *m;
 
-               st = pf_find_state_byid(&id_key);
-               if (st == NULL) {
-                       pfsyncstat_inc(pfsyncs_badstate);
-                       continue;
-               }
+       m = m_gethdr(M_DONTWAIT, MT_DATA);
+       if (m == NULL)
+               goto fail;
 
-               SET(st->state_flags, PFSTATE_NOSYNC);
-               pf_remove_state(st);
+       if (mlen > MHLEN) {
+               MCLGETL(m, M_DONTWAIT, mlen);
+               if (!ISSET(m->m_flags, M_EXT))
+                       goto drop;
        }
-       PF_STATE_EXIT_WRITE();
-       PF_UNLOCK();
 
-       return (0);
-}
+       m_align(m, sizeof(*h));
+       m->m_len = m->m_pkthdr.len = sizeof(*h);
 
-int
-pfsync_in_bus(caddr_t buf, int len, int count, int flags)
-{
-       struct pfsync_softc *sc = pfsyncif;
-       struct pfsync_bus *bus;
+       h = mtod(m, void *);
+       memset(h, 0, sizeof(*h));
 
-       /* If we're not waiting for a bulk update, who cares. */
-       if (sc->sc_ureq_sent == 0)
-               return (0);
+       h->subh.action = PFSYNC_ACT_UPD_REQ;
+       h->subh.len = sizeof(h->ur) >> 2;
+       h->subh.count = htons(1);
 
-       bus = (struct pfsync_bus *)buf;
+       h->ur.id = htobe64(0);
+       h->ur.creatorid = htobe32(0);
 
-       switch (bus->status) {
-       case PFSYNC_BUS_START:
-               PF_LOCK();
-               timeout_add(&sc->sc_bulkfail_tmo, 4 * hz +
-                   pf_pool_limits[PF_LIMIT_STATES].limit /
-                   ((sc->sc_if.if_mtu - PFSYNC_MINPKT) /
-                   sizeof(struct pfsync_state)));
-               PF_UNLOCK();
-               DPFPRINTF(LOG_INFO, "received bulk update start");
-               break;
+       m = pfsync_encap(sc, m);
+       if (m == NULL)
+               goto fail;
 
-       case PFSYNC_BUS_END:
-               if (getuptime() - ntohl(bus->endtime) >=
-                   sc->sc_ureq_sent) {
-                       /* that's it, we're happy */
-                       sc->sc_ureq_sent = 0;
-                       sc->sc_bulk_tries = 0;
-                       timeout_del(&sc->sc_bulkfail_tmo);
-#if NCARP > 0
-                       if (!pfsync_sync_ok)
-                               carp_group_demote_adj(&sc->sc_if, -1,
-                                   sc->sc_link_demoted ?
-                                   "pfsync link state up" :
-                                   "pfsync bulk done");
-                       if (sc->sc_initial_bulk) {
-                               carp_group_demote_adj(&sc->sc_if, -32,
-                                   "pfsync init");
-                               sc->sc_initial_bulk = 0;
-                       }
-#endif
-                       pfsync_sync_ok = 1;
-                       sc->sc_link_demoted = 0;
-                       DPFPRINTF(LOG_INFO, "received valid bulk update end");
-               } else {
-                       DPFPRINTF(LOG_WARNING, "received invalid "
-                           "bulk update end: bad timestamp");
-               }
-               break;
-       }
+       pfsync_sendout(sc, m);
+       return;
 
-       return (0);
+drop:
+       m_freem(m);
+fail:
+       printf("%s: unable to request bulk update\n", sc->sc_if.if_xname);
 }
 
-int
-pfsync_in_tdb(caddr_t buf, int len, int count, int flags)
+static void
+pfsync_bulk_req_nstate(struct pfsync_softc *sc,
+    enum pfsync_bulk_req_state nstate, int seconds)
 {
-#if defined(IPSEC)
-       struct pfsync_tdb *tp;
-       int i;
-
-       for (i = 0; i < count; i++) {
-               tp = (struct pfsync_tdb *)(buf + len * i);
-               pfsync_update_net_tdb(tp);
-       }
-#endif
-
-       return (0);
+       sc->sc_bulk_req.req_state = nstate;
+       if (seconds > 0)
+               timeout_add_sec(&sc->sc_bulk_req.req_tmo, seconds);
+       else
+               timeout_del(&sc->sc_bulk_req.req_tmo);
 }
 
-#if defined(IPSEC)
-/* Update an in-kernel tdb. Silently fail if no tdb is found. */
-void
-pfsync_update_net_tdb(struct pfsync_tdb *pt)
+static void
+pfsync_bulk_req_invstate(struct pfsync_softc *sc,
+    enum pfsync_bulk_req_event evt)
 {
-       struct tdb              *tdb;
-
-       NET_ASSERT_LOCKED();
-
-       /* check for invalid values */
-       if (ntohl(pt->spi) <= SPI_RESERVED_MAX ||
-           (pt->dst.sa.sa_family != AF_INET &&
-            pt->dst.sa.sa_family != AF_INET6))
-               goto bad;
+       panic("%s: unexpected event %s in state %s", sc->sc_if.if_xname,
+           pfsync_bulk_req_event_names[evt],
+           pfsync_bulk_req_state_names[sc->sc_bulk_req.req_state]);
+}
 
-       tdb = gettdb(ntohs(pt->rdomain), pt->spi,
-           (union sockaddr_union *)&pt->dst, pt->sproto);
-       if (tdb) {
-               pt->rpl = betoh64(pt->rpl);
-               pt->cur_bytes = betoh64(pt->cur_bytes);
+static void
+pfsync_bulk_req_nstate_bulk(struct pfsync_softc *sc)
+{
+       /* calculate the number of packets we expect */
+       int t = pf_pool_limits[PF_LIMIT_STATES].limit /
+           ((sc->sc_if.if_mtu - PFSYNC_MINPKT) /
+            sizeof(struct pfsync_state));
 
-               /* Neither replay nor byte counter should ever decrease. */
-               if (pt->rpl < tdb->tdb_rpl ||
-                   pt->cur_bytes < tdb->tdb_cur_bytes) {
-                       tdb_unref(tdb);
-                       goto bad;
-               }
+       /* turn it into seconds */
+       t /= 1000 / PFSYNC_BULK_SND_IVAL_MS;
 
-               tdb->tdb_rpl = pt->rpl;
-               tdb->tdb_cur_bytes = pt->cur_bytes;
-               tdb_unref(tdb);
-       }
-       return;
+       if (t == 0)
+               t = 1;
 
- bad:
-       DPFPRINTF(LOG_WARNING, "pfsync_insert: PFSYNC_ACT_TDB_UPD: "
-           "invalid value");
-       pfsyncstat_inc(pfsyncs_badstate);
-       return;
+       pfsync_bulk_req_nstate(sc, PFSYNC_BREQ_S_BULK, t * 4);
 }
-#endif
 
-
-int
-pfsync_in_eof(caddr_t buf, int len, int count, int flags)
+static inline void
+pfsync_bulk_req_nstate_done(struct pfsync_softc *sc)
 {
-       if (len > 0 || count > 0)
-               pfsyncstat_inc(pfsyncs_badact);
+       pfsync_bulk_req_nstate(sc, PFSYNC_BREQ_S_DONE, 0);
 
-       /* we're done. let the caller return */
-       return (1);
-}
+       KASSERT(sc->sc_bulk_req.req_demoted == 1);
+       sc->sc_bulk_req.req_demoted = 0;
 
-int
-pfsync_in_error(caddr_t buf, int len, int count, int flags)
-{
-       pfsyncstat_inc(pfsyncs_badact);
-       return (-1);
+#if NCARP > 0
+       carp_group_demote_adj(&sc->sc_if, -32, "pfsync done");
+#endif
 }
 
-int
-pfsyncoutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
-       struct rtentry *rt)
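+/*
+ * the bulk request state machine, roughly:
+ *
+ *   NONE -UP-> START -LINK-> SENT -BUS_START-> BULK -BUS_END-> DONE
+ *
+ * timeouts in SENT and BULK resend the request (up to
+ * PFSYNC_MAX_BULKTRIES), and EVT_DOWN drops any state back to NONE.
+ */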
+static void
+pfsync_bulk_req_evt(struct pfsync_softc *sc, enum pfsync_bulk_req_event evt)
 {
-       m_freem(m);     /* drop packet */
-       return (EAFNOSUPPORT);
-}
+       struct ifnet *ifp = &sc->sc_if;
 
-int
-pfsyncioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
-{
-       struct proc *p = curproc;
-       struct pfsync_softc *sc = ifp->if_softc;
-       struct ifreq *ifr = (struct ifreq *)data;
-       struct ip_moptions *imo = &sc->sc_imo;
-       struct pfsyncreq pfsyncr;
-       struct ifnet *ifp0, *sifp;
-       struct ip *ip;
-       int error;
+       rw_enter_write(&sc->sc_bulk_req.req_lock);
+       pfsync_dprintf(sc, "%s state %s evt %s", __func__,
+           pfsync_bulk_req_state_names[sc->sc_bulk_req.req_state],
+           pfsync_bulk_req_event_names[evt]);
 
-       switch (cmd) {
-       case SIOCSIFFLAGS:
-               if ((ifp->if_flags & IFF_RUNNING) == 0 &&
-                   (ifp->if_flags & IFF_UP)) {
-                       ifp->if_flags |= IFF_RUNNING;
+       if (evt == PFSYNC_BREQ_EVT_DOWN) {
+               /* unconditionally move down */
+               sc->sc_bulk_req.req_tries = 0;
+               pfsync_bulk_req_nstate(sc, PFSYNC_BREQ_S_NONE, 0);
 
+               if (sc->sc_bulk_req.req_demoted) {
+                       sc->sc_bulk_req.req_demoted = 0;
 #if NCARP > 0
-                       sc->sc_initial_bulk = 1;
-                       carp_group_demote_adj(&sc->sc_if, 32, "pfsync init");
+                       carp_group_demote_adj(&sc->sc_if, -32,
+                           "pfsync down");
 #endif
-
-                       pfsync_request_full_update(sc);
                }
-               if ((ifp->if_flags & IFF_RUNNING) &&
-                   (ifp->if_flags & IFF_UP) == 0) {
-                       ifp->if_flags &= ~IFF_RUNNING;
-
-                       /* drop everything */
-                       timeout_del(&sc->sc_tmo);
-                       pfsync_drop(sc);
-
-                       pfsync_cancel_full_update(sc);
-               }
-               break;
-       case SIOCSIFMTU:
-               if ((ifp0 = if_get(sc->sc_sync_ifidx)) == NULL)
-                       return (EINVAL);
-               error = 0;
-               if (ifr->ifr_mtu <= PFSYNC_MINPKT ||
-                   ifr->ifr_mtu > ifp0->if_mtu) {
-                       error = EINVAL;
+       } else switch (sc->sc_bulk_req.req_state) {
+       case PFSYNC_BREQ_S_NONE:
+               switch (evt) {
+               case PFSYNC_BREQ_EVT_UP:
+                       KASSERT(sc->sc_bulk_req.req_demoted == 0);
+                       sc->sc_bulk_req.req_demoted = 1;
+#if NCARP > 0
+                       carp_group_demote_adj(&sc->sc_if, 32,
+                           "pfsync start");
+#endif
+                       pfsync_bulk_req_nstate(sc, PFSYNC_BREQ_S_START, 30);
+                       break;
+               default:
+                       pfsync_bulk_req_invstate(sc, evt);
                }
-               if_put(ifp0);
-               if (error)
-                       return error;
-               if (ifr->ifr_mtu < ifp->if_mtu)
-                       pfsync_sendout();
-               ifp->if_mtu = ifr->ifr_mtu;
+
                break;
-       case SIOCGETPFSYNC:
-               bzero(&pfsyncr, sizeof(pfsyncr));
-               if ((ifp0 = if_get(sc->sc_sync_ifidx)) != NULL) {
-                       strlcpy(pfsyncr.pfsyncr_syncdev,
-                           ifp0->if_xname, IFNAMSIZ);
+
+       case PFSYNC_BREQ_S_START:
+               switch (evt) {
+               case PFSYNC_BREQ_EVT_LINK:
+                       pfsync_bulk_req_send(sc);
+                       pfsync_bulk_req_nstate(sc, PFSYNC_BREQ_S_SENT, 2);
+                       break;
+               case PFSYNC_BREQ_EVT_TMO:
+                       pfsync_dprintf(sc, "timeout waiting for link");
+                       pfsync_bulk_req_nstate_done(sc);
+                       break;
+               case PFSYNC_BREQ_EVT_BUS_START:
+                       pfsync_bulk_req_nstate_bulk(sc);
+                       break;
+               case PFSYNC_BREQ_EVT_BUS_END:
+                       /* ignore this */
+                       break;
+               default:
+                       pfsync_bulk_req_invstate(sc, evt);
                }
-               if_put(ifp0);
-               pfsyncr.pfsyncr_syncpeer = sc->sc_sync_peer;
-               pfsyncr.pfsyncr_maxupdates = sc->sc_maxupdates;
-               pfsyncr.pfsyncr_defer = sc->sc_defer;
-               return (copyout(&pfsyncr, ifr->ifr_data, sizeof(pfsyncr)));
+               break;
 
-       case SIOCSETPFSYNC:
-               if ((error = suser(p)) != 0)
-                       return (error);
-               if ((error = copyin(ifr->ifr_data, &pfsyncr, sizeof(pfsyncr))))
-                       return (error);
+       case PFSYNC_BREQ_S_SENT:
+               switch (evt) {
+               case PFSYNC_BREQ_EVT_BUS_START:
+                       pfsync_bulk_req_nstate_bulk(sc);
+                       break;
+               case PFSYNC_BREQ_EVT_BUS_END:
+               case PFSYNC_BREQ_EVT_LINK:
+                       /* ignore this */
+                       break;
+               case PFSYNC_BREQ_EVT_TMO:
+                       if (++sc->sc_bulk_req.req_tries <
+                           PFSYNC_MAX_BULKTRIES) {
+                               pfsync_bulk_req_send(sc);
+                               pfsync_bulk_req_nstate(sc,
+                                   PFSYNC_BREQ_S_SENT, 2);
+                               break;
+                       }
 
-               if (pfsyncr.pfsyncr_syncpeer.s_addr == 0)
-                       sc->sc_sync_peer.s_addr = INADDR_PFSYNC_GROUP;
-               else
-                       sc->sc_sync_peer.s_addr =
-                           pfsyncr.pfsyncr_syncpeer.s_addr;
+                       pfsync_dprintf(sc,
+                           "timeout waiting for bulk transfer start");
+                       pfsync_bulk_req_nstate_done(sc);
+                       break;
+               default:
+                       pfsync_bulk_req_invstate(sc, evt);
+               }
+               break;
 
-               if (pfsyncr.pfsyncr_maxupdates > 255)
-                       return (EINVAL);
-               sc->sc_maxupdates = pfsyncr.pfsyncr_maxupdates;
+       case PFSYNC_BREQ_S_BULK:
+               switch (evt) {
+               case PFSYNC_BREQ_EVT_BUS_START:
+               case PFSYNC_BREQ_EVT_LINK:
+                       /* ignore this */
+                       break;
+               case PFSYNC_BREQ_EVT_BUS_END:
+                       pfsync_bulk_req_nstate_done(sc);
+                       break;
+               case PFSYNC_BREQ_EVT_TMO:
+                       if (++sc->sc_bulk_req.req_tries <
+                           PFSYNC_MAX_BULKTRIES) {
+                               pfsync_bulk_req_send(sc);
+                               pfsync_bulk_req_nstate(sc,
+                                   PFSYNC_BREQ_S_SENT, 2);
+                               break;
+                       }
 
-               sc->sc_defer = pfsyncr.pfsyncr_defer;
+                       pfsync_dprintf(sc,
+                           "timeout waiting for bulk transfer end");
+                       pfsync_bulk_req_nstate_done(sc);
+                       break;
+               default:
+                       pfsync_bulk_req_invstate(sc, evt);
+               }
+               break;
 
-               if (pfsyncr.pfsyncr_syncdev[0] == 0) {
-                       if ((ifp0 = if_get(sc->sc_sync_ifidx)) != NULL) {
-                               if_linkstatehook_del(ifp0, &sc->sc_ltask);
-                               if_detachhook_del(ifp0, &sc->sc_dtask);
-                       }
-                       if_put(ifp0);
-                       sc->sc_sync_ifidx = 0;
-                       if (imo->imo_num_memberships > 0) {
-                               in_delmulti(imo->imo_membership[
-                                   --imo->imo_num_memberships]);
-                               imo->imo_ifidx = 0;
-                       }
+       case PFSYNC_BREQ_S_DONE: /* pfsync is up and running */
+               switch (evt) {
+               case PFSYNC_BREQ_EVT_BUS_START:
+               case PFSYNC_BREQ_EVT_BUS_END:
+               case PFSYNC_BREQ_EVT_LINK:
+                       /* nops */
                        break;
+               default:
+                       pfsync_bulk_req_invstate(sc, evt);
                }
+               break;
 
-               if ((sifp = if_unit(pfsyncr.pfsyncr_syncdev)) == NULL)
-                       return (EINVAL);
+       default:
+               panic("%s: unknown event %d", ifp->if_xname, evt);
+               /* NOTREACHED */
+       }
+       rw_exit_write(&sc->sc_bulk_req.req_lock);
+}
 
-               ifp0 = if_get(sc->sc_sync_ifidx);
+static void
+pfsync_bulk_req_tmo(void *arg)
+{
+       struct pfsync_softc *sc = arg;
 
-               if (sifp->if_mtu < sc->sc_if.if_mtu || (ifp0 != NULL &&
-                   sifp->if_mtu < ifp0->if_mtu) ||
-                   sifp->if_mtu < MCLBYTES - sizeof(struct ip))
-                       pfsync_sendout();
+       NET_LOCK();
+       pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_TMO);
+       NET_UNLOCK();
+}
 
-               if (ifp0) {
-                       if_linkstatehook_del(ifp0, &sc->sc_ltask);
-                       if_detachhook_del(ifp0, &sc->sc_dtask);
-               }
-               if_put(ifp0);
-               sc->sc_sync_ifidx = sifp->if_index;
+static int
+pfsync_down(struct pfsync_softc *sc)
+{
+       struct ifnet *ifp = &sc->sc_if;
+       struct ifnet *ifp0;
+       struct smr_entry smr;
+       size_t i;
+       void *inm = NULL;
+       unsigned int sndbar = 0;
+       struct pfsync_deferrals pds = TAILQ_HEAD_INITIALIZER(pds);
+       struct pfsync_deferral *pd;
 
-               if (imo->imo_num_memberships > 0) {
-                       in_delmulti(imo->imo_membership[--imo->imo_num_memberships]);
-                       imo->imo_ifidx = 0;
-               }
+       NET_ASSERT_LOCKED();
+       KASSERT(ISSET(ifp->if_flags, IFF_RUNNING));
 
-               if (sc->sc_sync_peer.s_addr == INADDR_PFSYNC_GROUP) {
-                       struct in_addr addr;
+       /*
+        * tearing down pfsync involves waiting for pfsync to stop
+        * running in various contexts including softnet taskqs.
+        * this thread cannot hold netlock while waiting for a
+        * barrier in softnet because softnet might be waiting for
+        * the netlock. sc->sc_up is used to coordinate with
+        * pfsync_up.
+        */
 
-                       if (!(sifp->if_flags & IFF_MULTICAST)) {
-                               sc->sc_sync_ifidx = 0;
-                               if_put(sifp);
-                               return (EADDRNOTAVAIL);
-                       }
+       CLR(ifp->if_flags, IFF_RUNNING);
 
-                       addr.s_addr = INADDR_PFSYNC_GROUP;
+       ifp0 = if_get(sc->sc_sync_ifidx);
+       if (ifp0 != NULL) {
+               if_linkstatehook_del(ifp0, &sc->sc_ltask);
+               if_detachhook_del(ifp0, &sc->sc_dtask);
+       }
+       if_put(ifp0);
 
-                       if ((imo->imo_membership[0] =
-                           in_addmulti(&addr, sifp)) == NULL) {
-                               sc->sc_sync_ifidx = 0;
-                               if_put(sifp);
-                               return (ENOBUFS);
-                       }
-                       imo->imo_num_memberships++;
-                       imo->imo_ifidx = sc->sc_sync_ifidx;
-                       imo->imo_ttl = PFSYNC_DFLTTL;
-                       imo->imo_loop = 0;
-               }
+#if NCARP > 0
+       if (sc->sc_sync_if_down)
+               carp_group_demote_adj(&sc->sc_if, -1, "pfsync down");
+#endif
 
-               ip = &sc->sc_template;
-               bzero(ip, sizeof(*ip));
-               ip->ip_v = IPVERSION;
-               ip->ip_hl = sizeof(sc->sc_template) >> 2;
-               ip->ip_tos = IPTOS_LOWDELAY;
-               /* len and id are set later */
-               ip->ip_off = htons(IP_DF);
-               ip->ip_ttl = PFSYNC_DFLTTL;
-               ip->ip_p = IPPROTO_PFSYNC;
-               ip->ip_src.s_addr = INADDR_ANY;
-               ip->ip_dst.s_addr = sc->sc_sync_peer.s_addr;
+       NET_UNLOCK();
 
-               if_linkstatehook_add(sifp, &sc->sc_ltask);
-               if_detachhook_add(sifp, &sc->sc_dtask);
-               if_put(sifp);
+       KASSERTMSG(SMR_PTR_GET_LOCKED(&pfsyncif) == sc,
+          "pfsyncif %p != sc %p", pfsyncif, sc);
+       SMR_PTR_SET_LOCKED(&pfsyncif, NULL);
+       smr_init(&smr);
+       smr_call(&smr, (void (*)(void *))refcnt_rele_wake, &sc->sc_refs);
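+
+       /*
+        * pfsyncif no longer points at sc. once the current smr read
+        * sections have drained, smr_call releases the reference that
+        * pointer held, letting refcnt_finalize below run to completion.
+        */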
 
-               pfsync_request_full_update(sc);
+       /* stop pf producing work before cleaning up the timeouts and tasks */
+       refcnt_finalize(&sc->sc_refs, "pfsyncfini");
 
-               break;
+       pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_DOWN);
 
-       default:
-               return (ENOTTY);
+       rw_enter_read(&pf_state_list.pfs_rwl);
+       rw_enter_write(&sc->sc_bulk_snd.snd_lock);
+       if (sc->sc_bulk_snd.snd_tail != NULL) {
+               sndbar = !timeout_del(&sc->sc_bulk_snd.snd_tmo);
+
+               sc->sc_bulk_snd.snd_again = 0;
+               sc->sc_bulk_snd.snd_next = NULL;
+               sc->sc_bulk_snd.snd_tail = NULL;
+       }
+       rw_exit_write(&sc->sc_bulk_snd.snd_lock);
+       rw_exit_read(&pf_state_list.pfs_rwl);
+
+       /*
+        * do a single barrier for all the timeouts. because the
+        * timeouts in each slice are configured the same way, the
+        * barrier for one will work for all of them.
+        */
+       for (i = 0; i < nitems(sc->sc_slices); i++) {
+               struct pfsync_slice *s = &sc->sc_slices[i];
+
+               timeout_del(&s->s_tmo);
+               task_del(s->s_softnet, &s->s_task);
+               task_del(s->s_softnet, &s->s_send);
+
+               timeout_del(&s->s_deferrals_tmo);
+               task_del(s->s_softnet, &s->s_deferrals_task);
+       }
+       timeout_barrier(&sc->sc_slices[0].s_tmo);
+       timeout_barrier(&sc->sc_bulk_req.req_tmo); /* XXX proc */
+       if (sndbar) {
+               /* technically the preceding barrier does the same job */
+               timeout_barrier(&sc->sc_bulk_snd.snd_tmo);
+       }
+       net_tq_barriers("pfsyncbar");
+
+       /* pfsync is no longer running */
+
+       if (sc->sc_inm != NULL) {
+               inm = sc->sc_inm;
+               sc->sc_inm = NULL;
+       }
+
+       for (i = 0; i < nitems(sc->sc_slices); i++) {
+               struct pfsync_slice *s = &sc->sc_slices[i];
+               struct pf_state *st;
+
+               pfsync_slice_drop(sc, s);
+               mq_purge(&s->s_sendq);
+
+               while ((pd = TAILQ_FIRST(&s->s_deferrals)) != NULL) {
+                       TAILQ_REMOVE(&s->s_deferrals, pd, pd_entry);
+
+                       st = pd->pd_st;
+                       st->sync_defer = NULL;
+
+                       TAILQ_INSERT_TAIL(&pds, pd, pd_entry);
+               }
+               s->s_deferred = 0;
+       }
+
+       NET_LOCK();
+       sc->sc_up = 0;
+
+       if (inm != NULL)
+               in_delmulti(inm);
+
+       while ((pd = TAILQ_FIRST(&pds)) != NULL) {
+               TAILQ_REMOVE(&pds, pd, pd_entry);
+
+               pfsync_defer_output(pd);
        }
 
        return (0);
 }
 
-void
+int
+pfsync_is_up(void)
+{
+       int rv;
+
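+       /* cheap existence check under smr; no reference is taken */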
+       smr_read_enter();
+       rv = SMR_PTR_GET(&pfsyncif) != NULL;
+       smr_read_leave();
+
+       return (rv);
+}
+
+static void
+pfsync_start(struct ifqueue *ifq)
+{
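+       /* pfsync never transmits packets queued on its own interface */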
+       ifq_purge(ifq);
+}
+
+struct pfsync_q {
+       void            (*write)(struct pf_state *, void *);
+       size_t          len;
+       u_int8_t        action;
+};
+
+static struct pfsync_slice *
+pfsync_slice_enter(struct pfsync_softc *sc, const struct pf_state *st)
+{
+       unsigned int idx = st->key[0]->hash % nitems(sc->sc_slices);
+       struct pfsync_slice *s = &sc->sc_slices[idx];
+
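+       /* try-lock first so contention shows up in the stats */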
+       if (!mtx_enter_try(&s->s_mtx)) {
+               mtx_enter(&s->s_mtx);
+               s->s_stat_contended++;
+       }
+       s->s_stat_locks++;
+
+       return (s);
+}
+
+static void
+pfsync_slice_leave(struct pfsync_softc *sc, struct pfsync_slice *s)
+{
+       mtx_leave(&s->s_mtx);
+}
+
+/* we have one of these for every PFSYNC_S_ */
+static void    pfsync_out_state(struct pf_state *, void *);
+static void    pfsync_out_iack(struct pf_state *, void *);
+static void    pfsync_out_upd_c(struct pf_state *, void *);
+static void    pfsync_out_del(struct pf_state *, void *);
+#if defined(IPSEC)
+static void    pfsync_out_tdb(struct tdb *, void *);
+#endif
+
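+/* indexed by PFSYNC_S_* queue values, so the order of entries matters */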
+static const struct pfsync_q pfsync_qs[] = {
+       { pfsync_out_iack,  sizeof(struct pfsync_ins_ack), PFSYNC_ACT_INS_ACK },
+       { pfsync_out_upd_c, sizeof(struct pfsync_upd_c),   PFSYNC_ACT_UPD_C },
+       { pfsync_out_del,   sizeof(struct pfsync_del_c),   PFSYNC_ACT_DEL_C },
+       { pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_INS },
+       { pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_UPD }
+};
+
+static void
 pfsync_out_state(struct pf_state *st, void *buf)
 {
        struct pfsync_state *sp = buf;
 
+       mtx_enter(&st->mtx);
        pf_state_export(sp, st);
+       mtx_leave(&st->mtx);
 }
 
-void
+static void
 pfsync_out_iack(struct pf_state *st, void *buf)
 {
        struct pfsync_ins_ack *iack = buf;
@@ -1319,20 +1319,23 @@ pfsync_out_iack(struct pf_state *st, void *buf)
        iack->creatorid = st->creatorid;
 }
 
-void
+static void
 pfsync_out_upd_c(struct pf_state *st, void *buf)
 {
        struct pfsync_upd_c *up = buf;
 
-       bzero(up, sizeof(*up));
+       memset(up, 0, sizeof(*up));
        up->id = st->id;
+       up->creatorid = st->creatorid;
+
+       mtx_enter(&st->mtx);
        pf_state_peer_hton(&st->src, &up->src);
        pf_state_peer_hton(&st->dst, &up->dst);
-       up->creatorid = st->creatorid;
        up->timeout = st->timeout;
+       mtx_leave(&st->mtx);
 }
 
-void
+static void
 pfsync_out_del(struct pf_state *st, void *buf)
 {
        struct pfsync_del_c *dp = buf;
@@ -1340,1070 +1343,1045 @@ pfsync_out_del(struct pf_state *st, void *buf)
        dp->id = st->id;
        dp->creatorid = st->creatorid;
 
-       SET(st->state_flags, PFSTATE_NOSYNC);
+       st->sync_state = PFSYNC_S_DEAD;
 }
 
-void
-pfsync_grab_snapshot(struct pfsync_snapshot *sn, struct pfsync_softc *sc)
-{
-       int q;
-       struct pf_state *st;
-       struct pfsync_upd_req_item *ur;
-#if defined(IPSEC)
-       struct tdb *tdb;
-#endif
-
-       sn->sn_sc = sc;
-
-       mtx_enter(&sc->sc_st_mtx);
-       mtx_enter(&sc->sc_upd_req_mtx);
-       mtx_enter(&sc->sc_tdb_mtx);
-
-       for (q = 0; q < PFSYNC_S_COUNT; q++) {
-               TAILQ_INIT(&sn->sn_qs[q]);
-
-               while ((st = TAILQ_FIRST(&sc->sc_qs[q])) != NULL) {
-                       TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list);
-                       mtx_enter(&st->mtx);
-                       if (st->snapped == 0) {
-                               TAILQ_INSERT_TAIL(&sn->sn_qs[q], st, sync_snap);
-                               st->snapped = 1;
-                               mtx_leave(&st->mtx);
-                       } else {
-                               /*
-                                * item is on snapshot list already, so we can
-                                * skip it now.
-                                */
-                               mtx_leave(&st->mtx);
-                               pf_state_unref(st);
-                       }
-               }
-       }
-
-       TAILQ_INIT(&sn->sn_upd_req_list);
-       while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) {
-               TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry);
-               TAILQ_INSERT_TAIL(&sn->sn_upd_req_list, ur, ur_snap);
-       }
-
-       TAILQ_INIT(&sn->sn_tdb_q);
 #if defined(IPSEC)
-       while ((tdb = TAILQ_FIRST(&sc->sc_tdb_q)) != NULL) {
-               TAILQ_REMOVE(&sc->sc_tdb_q, tdb, tdb_sync_entry);
-               TAILQ_INSERT_TAIL(&sn->sn_tdb_q, tdb, tdb_sync_snap);
-
-               mtx_enter(&tdb->tdb_mtx);
-               KASSERT(!ISSET(tdb->tdb_flags, TDBF_PFSYNC_SNAPPED));
-               SET(tdb->tdb_flags, TDBF_PFSYNC_SNAPPED);
-               mtx_leave(&tdb->tdb_mtx);
-       }
-#endif
-
-       sn->sn_len = sc->sc_len;
-       sc->sc_len = PFSYNC_MINPKT;
-
-       sn->sn_plus = sc->sc_plus;
-       sc->sc_plus = NULL;
-       sn->sn_pluslen = sc->sc_pluslen;
-       sc->sc_pluslen = 0;
+static inline void
+pfsync_tdb_enter(struct tdb *tdb)
+{
+       mtx_enter(&tdb->tdb_mtx);
+}
 
-       mtx_leave(&sc->sc_tdb_mtx);
-       mtx_leave(&sc->sc_upd_req_mtx);
-       mtx_leave(&sc->sc_st_mtx);
+static inline void
+pfsync_tdb_leave(struct tdb *tdb)
+{
+       unsigned int snapped = ISSET(tdb->tdb_flags, TDBF_PFSYNC_SNAPPED);
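+       /* read the flag under the mutex, but wake up after releasing it */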
+       mtx_leave(&tdb->tdb_mtx);
+       if (snapped)
+               wakeup_one(&tdb->tdb_updates);
 }
+#endif /* defined(IPSEC) */
 
-void
-pfsync_drop_snapshot(struct pfsync_snapshot *sn)
+static void
+pfsync_slice_drop(struct pfsync_softc *sc, struct pfsync_slice *s)
 {
        struct pf_state *st;
-       struct pfsync_upd_req_item *ur;
+       int q;
 #if defined(IPSEC)
-       struct tdb *t;
+       struct tdb *tdb;
 #endif
-       int q;
 
-       for (q = 0; q < PFSYNC_S_COUNT; q++) {
-               if (TAILQ_EMPTY(&sn->sn_qs[q]))
+       for (q = 0; q < nitems(s->s_qs); q++) {
+               if (TAILQ_EMPTY(&s->s_qs[q]))
                        continue;
 
-               while ((st = TAILQ_FIRST(&sn->sn_qs[q])) != NULL) {
-                       mtx_enter(&st->mtx);
+               while ((st = TAILQ_FIRST(&s->s_qs[q])) != NULL) {
+                       TAILQ_REMOVE(&s->s_qs[q], st, sync_list);
+#ifdef PFSYNC_DEBUG
                        KASSERT(st->sync_state == q);
-                       KASSERT(st->snapped == 1);
-                       TAILQ_REMOVE(&sn->sn_qs[q], st, sync_snap);
+#endif
                        st->sync_state = PFSYNC_S_NONE;
-                       st->snapped = 0;
-                       mtx_leave(&st->mtx);
                        pf_state_unref(st);
                }
        }
 
-       while ((ur = TAILQ_FIRST(&sn->sn_upd_req_list)) != NULL) {
-               TAILQ_REMOVE(&sn->sn_upd_req_list, ur, ur_snap);
-               pool_put(&sn->sn_sc->sc_pool, ur);
-       }
-
 #if defined(IPSEC)
-       while ((t = TAILQ_FIRST(&sn->sn_tdb_q)) != NULL) {
-               TAILQ_REMOVE(&sn->sn_tdb_q, t, tdb_sync_snap);
-               mtx_enter(&t->tdb_mtx);
-               KASSERT(ISSET(t->tdb_flags, TDBF_PFSYNC_SNAPPED));
-               CLR(t->tdb_flags, TDBF_PFSYNC_SNAPPED);
-               CLR(t->tdb_flags, TDBF_PFSYNC);
-               mtx_leave(&t->tdb_mtx);
-       }
-#endif
-}
-
-int
-pfsync_is_snapshot_empty(struct pfsync_snapshot *sn)
-{
-       int     q;
-
-       for (q = 0; q < PFSYNC_S_COUNT; q++)
-               if (!TAILQ_EMPTY(&sn->sn_qs[q]))
-                       return (0);
-
-       if (!TAILQ_EMPTY(&sn->sn_upd_req_list))
-               return (0);
-
-       if (!TAILQ_EMPTY(&sn->sn_tdb_q))
-               return (0);
-
-       return (sn->sn_plus == NULL);
-}
-
-void
-pfsync_drop(struct pfsync_softc *sc)
-{
-       struct pfsync_snapshot  sn;
-
-       pfsync_grab_snapshot(&sn, sc);
-       pfsync_drop_snapshot(&sn);
-}
-
-void
-pfsync_send_dispatch(void *xmq)
-{
-       struct mbuf_queue *mq = xmq;
-       struct pfsync_softc *sc;
-       struct mbuf *m;
-       struct mbuf_list ml;
-       int error;
-
-       mq_delist(mq, &ml);
-       if (ml_empty(&ml))
-               return;
-
-       NET_LOCK();
-       sc = pfsyncif;
-       if (sc == NULL) {
-               ml_purge(&ml);
-               goto done;
-       }
+       while ((tdb = TAILQ_FIRST(&s->s_tdb_q)) != NULL) {
+               TAILQ_REMOVE(&s->s_tdb_q, tdb, tdb_sync_entry);
 
-       while ((m = ml_dequeue(&ml)) != NULL) {
-               if ((error = ip_output(m, NULL, NULL, IP_RAWOUTPUT,
-                   &sc->sc_imo, NULL, 0)) == 0)
-                       pfsyncstat_inc(pfsyncs_opackets);
-               else {
-                       DPFPRINTF(LOG_DEBUG,
-                           "ip_output() @ %s failed (%d)\n", __func__, error);
-                       pfsyncstat_inc(pfsyncs_oerrors);
-               }
+               pfsync_tdb_enter(tdb);
+               KASSERT(ISSET(tdb->tdb_flags, TDBF_PFSYNC));
+               CLR(tdb->tdb_flags, TDBF_PFSYNC);
+               pfsync_tdb_leave(tdb);
        }
-done:
-       NET_UNLOCK();
-}
+#endif /* defined(IPSEC) */
 
-void
-pfsync_send_pkt(struct mbuf *m)
-{
-       if (mq_enqueue(&pfsync_mq, m) != 0) {
-               pfsyncstat_inc(pfsyncs_oerrors);
-               DPFPRINTF(LOG_DEBUG, "mq_enqueue() @ %s failed, queue full\n",
-                   __func__);
-       } else
-               task_add(net_tq(0), &pfsync_task);
+       timeout_del(&s->s_tmo);
+       s->s_len = PFSYNC_MINPKT;
 }
 
-void
-pfsync_sendout(void)
+static struct mbuf *
+pfsync_slice_write(struct pfsync_slice *s)
 {
-       struct pfsync_snapshot sn;
-       struct pfsync_softc *sc = pfsyncif;
-#if NBPFILTER > 0
-       struct ifnet *ifp = &sc->sc_if;
-#endif
+       struct pfsync_softc *sc = s->s_pfsync;
        struct mbuf *m;
+
        struct ip *ip;
        struct pfsync_header *ph;
        struct pfsync_subheader *subh;
-       struct pf_state *st;
-       struct pfsync_upd_req_item *ur;
-       int offset;
-       int q, count = 0;
 
-       if (sc == NULL || sc->sc_len == PFSYNC_MINPKT)
-               return;
+       unsigned int mlen = max_linkhdr + s->s_len;
+       unsigned int q, count;
+       caddr_t ptr;
+       size_t off;
 
-       if (!ISSET(sc->sc_if.if_flags, IFF_RUNNING) ||
-#if NBPFILTER > 0
-           (ifp->if_bpf == NULL && sc->sc_sync_ifidx == 0)) {
-#else
-           sc->sc_sync_ifidx == 0) {
-#endif
-               pfsync_drop(sc);
-               return;
+       MUTEX_ASSERT_LOCKED(&s->s_mtx);
+       if (s->s_len == PFSYNC_MINPKT) {
+               s->s_stat_write_nop++;
+               return (NULL);
        }
 
-       pfsync_grab_snapshot(&sn, sc);
+       task_del(s->s_softnet, &s->s_task);
 
-       /*
-        * Check below is sufficient to prevent us from sending empty packets,
-        * but it does not stop us from sending short packets.
-        */
-       if (pfsync_is_snapshot_empty(&sn))
-               return;
+       m = m_gethdr(M_DONTWAIT, MT_DATA);
+       if (m == NULL)
+               goto drop;
 
-       MGETHDR(m, M_DONTWAIT, MT_DATA);
-       if (m == NULL) {
-               sc->sc_if.if_oerrors++;
-               pfsyncstat_inc(pfsyncs_onomem);
-               pfsync_drop_snapshot(&sn);
-               return;
+       if (mlen > MHLEN) {
+               MCLGETL(m, M_DONTWAIT, mlen);
+               if (!ISSET(m->m_flags, M_EXT))
+                       goto drop;
        }
 
-       if (max_linkhdr + sn.sn_len > MHLEN) {
-               MCLGETL(m, M_DONTWAIT, max_linkhdr + sn.sn_len);
-               if (!ISSET(m->m_flags, M_EXT)) {
-                       m_free(m);
-                       sc->sc_if.if_oerrors++;
-                       pfsyncstat_inc(pfsyncs_onomem);
-                       pfsync_drop_snapshot(&sn);
-                       return;
-               }
-       }
-       m->m_data += max_linkhdr;
-       m->m_len = m->m_pkthdr.len = sn.sn_len;
+       m_align(m, s->s_len);
+       m->m_len = m->m_pkthdr.len = s->s_len;
 
-       /* build the ip header */
-       ip = mtod(m, struct ip *);
-       bcopy(&sc->sc_template, ip, sizeof(*ip));
-       offset = sizeof(*ip);
+       ptr = mtod(m, caddr_t);
+       off = 0;
 
+       ip = (struct ip *)(ptr + off);
+       off += sizeof(*ip);
+       *ip = sc->sc_template;
        ip->ip_len = htons(m->m_pkthdr.len);
        ip->ip_id = htons(ip_randomid());
 
-       /* build the pfsync header */
-       ph = (struct pfsync_header *)(m->m_data + offset);
-       bzero(ph, sizeof(*ph));
-       offset += sizeof(*ph);
-
+       ph = (struct pfsync_header *)(ptr + off);
+       off += sizeof(*ph);
+       memset(ph, 0, sizeof(*ph));
        ph->version = PFSYNC_VERSION;
-       ph->len = htons(sn.sn_len - sizeof(*ip));
-       bcopy(pf_status.pf_chksum, ph->pfcksum, PF_MD5_DIGEST_LENGTH);
+       ph->len = htons(m->m_pkthdr.len - sizeof(*ip));
+
+       for (q = 0; q < nitems(s->s_qs); q++) {
+               struct pf_state_queue *psq = &s->s_qs[q];
+               struct pf_state *st;
 
-       if (!TAILQ_EMPTY(&sn.sn_upd_req_list)) {
-               subh = (struct pfsync_subheader *)(m->m_data + offset);
-               offset += sizeof(*subh);
+               if (TAILQ_EMPTY(psq))
+                       continue;
+
+               subh = (struct pfsync_subheader *)(ptr + off);
+               off += sizeof(*subh);
 
                count = 0;
-               while ((ur = TAILQ_FIRST(&sn.sn_upd_req_list)) != NULL) {
-                       TAILQ_REMOVE(&sn.sn_upd_req_list, ur, ur_snap);
+               while ((st = TAILQ_FIRST(psq)) != NULL) {
+                       TAILQ_REMOVE(psq, st, sync_list);
+                       count++;
 
-                       bcopy(&ur->ur_msg, m->m_data + offset,
-                           sizeof(ur->ur_msg));
-                       offset += sizeof(ur->ur_msg);
+                       KASSERT(st->sync_state == q);
+                       /* the write handler below may override this */
+                       st->sync_state = PFSYNC_S_NONE;
 
-                       pool_put(&sc->sc_pool, ur);
+                       pfsync_qs[q].write(st, ptr + off);
+                       off += pfsync_qs[q].len;
 
-                       count++;
+                       pf_state_unref(st);
                }
 
-               bzero(subh, sizeof(*subh));
-               subh->len = sizeof(ur->ur_msg) >> 2;
-               subh->action = PFSYNC_ACT_UPD_REQ;
+               subh->action = pfsync_qs[q].action;
+               subh->len = pfsync_qs[q].len >> 2;
                subh->count = htons(count);
        }
 
-       /* has someone built a custom region for us to add? */
-       if (sn.sn_plus != NULL) {
-               bcopy(sn.sn_plus, m->m_data + offset, sn.sn_pluslen);
-               offset += sn.sn_pluslen;
-               sn.sn_plus = NULL;      /* XXX memory leak ? */
-       }
-
 #if defined(IPSEC)
-       if (!TAILQ_EMPTY(&sn.sn_tdb_q)) {
-               struct tdb *t;
+       if (!TAILQ_EMPTY(&s->s_tdb_q)) {
+               struct tdb *tdb;
 
-               subh = (struct pfsync_subheader *)(m->m_data + offset);
-               offset += sizeof(*subh);
+               subh = (struct pfsync_subheader *)(ptr + off);
+               off += sizeof(*subh);
 
                count = 0;
-               while ((t = TAILQ_FIRST(&sn.sn_tdb_q)) != NULL) {
-                       TAILQ_REMOVE(&sn.sn_tdb_q, t, tdb_sync_snap);
-                       pfsync_out_tdb(t, m->m_data + offset);
-                       offset += sizeof(struct pfsync_tdb);
-                       mtx_enter(&t->tdb_mtx);
-                       KASSERT(ISSET(t->tdb_flags, TDBF_PFSYNC_SNAPPED));
-                       CLR(t->tdb_flags, TDBF_PFSYNC_SNAPPED);
-                       CLR(t->tdb_flags, TDBF_PFSYNC);
-                       mtx_leave(&t->tdb_mtx);
-                       tdb_unref(t);
+               while ((tdb = TAILQ_FIRST(&s->s_tdb_q)) != NULL) {
+                       TAILQ_REMOVE(&s->s_tdb_q, tdb, tdb_sync_entry);
                        count++;
+
+                       pfsync_tdb_enter(tdb);
+                       KASSERT(ISSET(tdb->tdb_flags, TDBF_PFSYNC));
+
+                       /* get a consistent view of the counters */
+                       pfsync_out_tdb(tdb, ptr + off);
+
+                       CLR(tdb->tdb_flags, TDBF_PFSYNC);
+                       pfsync_tdb_leave(tdb);
+
+                       off += sizeof(struct pfsync_tdb);
                }
 
-               bzero(subh, sizeof(*subh));
                subh->action = PFSYNC_ACT_TDB;
                subh->len = sizeof(struct pfsync_tdb) >> 2;
                subh->count = htons(count);
        }
 #endif
 
-       /* walk the queues */
-       for (q = 0; q < PFSYNC_S_COUNT; q++) {
-               if (TAILQ_EMPTY(&sn.sn_qs[q]))
-                       continue;
-
-               subh = (struct pfsync_subheader *)(m->m_data + offset);
-               offset += sizeof(*subh);
-
-               count = 0;
-               while ((st = TAILQ_FIRST(&sn.sn_qs[q])) != NULL) {
-                       mtx_enter(&st->mtx);
-                       TAILQ_REMOVE(&sn.sn_qs[q], st, sync_snap);
-                       KASSERT(st->sync_state == q);
-                       KASSERT(st->snapped == 1);
-                       st->sync_state = PFSYNC_S_NONE;
-                       st->snapped = 0;
-                       pfsync_qs[q].write(st, m->m_data + offset);
-                       offset += pfsync_qs[q].len;
-                       mtx_leave(&st->mtx);
+       timeout_del(&s->s_tmo);
+       s->s_len = PFSYNC_MINPKT;
 
-                       pf_state_unref(st);
-                       count++;
-               }
+       return (m);
+drop:
+       m_freem(m);
+       pfsyncstat_inc(pfsyncs_onomem);
+       pfsync_slice_drop(sc, s);
+       return (NULL);
+}
 
-               bzero(subh, sizeof(*subh));
-               subh->action = pfsync_qs[q].action;
-               subh->len = pfsync_qs[q].len >> 2;
-               subh->count = htons(count);
-       }
+static void
+pfsync_sendout(struct pfsync_softc *sc, struct mbuf *m)
+{
+       struct ip_moptions imo;
+       unsigned int len = m->m_pkthdr.len;
+#if NBPFILTER > 0
+       caddr_t if_bpf = sc->sc_if.if_bpf;
+       if (if_bpf)
+               bpf_mtap(if_bpf, m, BPF_DIRECTION_OUT);
+#endif
 
-       /* we're done, let's put it on the wire */
-#if NBPFILTER > 0
-       if (ifp->if_bpf) {
-               m->m_data += sizeof(*ip);
-               m->m_len = m->m_pkthdr.len = sn.sn_len - sizeof(*ip);
-               bpf_mtap(ifp->if_bpf, m, BPF_DIRECTION_OUT);
-               m->m_data -= sizeof(*ip);
-               m->m_len = m->m_pkthdr.len = sn.sn_len;
-       }
+       imo.imo_ifidx = sc->sc_sync_ifidx;
+       imo.imo_ttl = PFSYNC_DFLTTL;
+       imo.imo_loop = 0;
+       m->m_pkthdr.ph_rtableid = sc->sc_if.if_rdomain;
 
-       if (sc->sc_sync_ifidx == 0) {
-               sc->sc_len = PFSYNC_MINPKT;
-               m_freem(m);
-               return;
+       if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &imo, NULL, 0) == 0) {
+               counters_pkt(sc->sc_if.if_counters, ifc_opackets,
+                   ifc_obytes, len);
+               pfsyncstat_inc(pfsyncs_opackets);
+       } else {
+               counters_inc(sc->sc_if.if_counters, ifc_oerrors);
+               pfsyncstat_inc(pfsyncs_oerrors);
        }
-#endif
+}
 
-       sc->sc_if.if_opackets++;
-       sc->sc_if.if_obytes += m->m_pkthdr.len;
+static void
+pfsync_slice_tmo(void *arg)
+{
+       struct pfsync_slice *s = arg;
 
-       m->m_pkthdr.ph_rtableid = sc->sc_if.if_rdomain;
+       task_add(s->s_softnet, &s->s_task);
+}
 
-       pfsync_send_pkt(m);
+static void
+pfsync_slice_sched(struct pfsync_slice *s)
+{
+       s->s_stat_task_add++;
+       task_add(s->s_softnet, &s->s_task);
 }
 
-void
-pfsync_insert_state(struct pf_state *st)
+static void
+pfsync_slice_task(void *arg)
 {
-       struct pfsync_softc *sc = pfsyncif;
+       struct pfsync_slice *s = arg;
+       struct mbuf *m;
 
-       NET_ASSERT_LOCKED();
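+       /* build the packet under the slice mutex, send under the netlock */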
+       mtx_enter(&s->s_mtx);
+       s->s_stat_task_run++;
 
-       if (ISSET(st->rule.ptr->rule_flag, PFRULE_NOSYNC) ||
-           st->key[PF_SK_WIRE]->proto == IPPROTO_PFSYNC) {
-               SET(st->state_flags, PFSTATE_NOSYNC);
-               return;
+       m = pfsync_slice_write(s);
+       mtx_leave(&s->s_mtx);
+       if (m != NULL) {
+               NET_LOCK();
+               pfsync_sendout(s->s_pfsync, m);
+               NET_UNLOCK();
        }
+}
 
-       if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING) ||
-           ISSET(st->state_flags, PFSTATE_NOSYNC))
-               return;
+static void
+pfsync_slice_sendq(void *arg)
+{
+       struct pfsync_slice *s = arg;
+       struct mbuf_list ml;
+       struct mbuf *m;
 
-       if (sc->sc_len == PFSYNC_MINPKT)
-               timeout_add_sec(&sc->sc_tmo, 1);
+       mq_delist(&s->s_sendq, &ml);
+       if (ml_empty(&ml))
+               return;
 
-       pfsync_q_ins(st, PFSYNC_S_INS);
+       mtx_enter(&s->s_mtx);
+       s->s_stat_dequeue++;
+       mtx_leave(&s->s_mtx);
 
-       st->sync_updates = 0;
+       NET_LOCK();
+       while ((m = ml_dequeue(&ml)) != NULL)
+               pfsync_sendout(s->s_pfsync, m);
+       NET_UNLOCK();
 }
 
-int
-pfsync_defer(struct pf_state *st, struct mbuf *m, struct pfsync_deferral **ppd)
+static void
+pfsync_q_ins(struct pfsync_slice *s, struct pf_state *st, unsigned int q)
 {
-       struct pfsync_softc *sc = pfsyncif;
-       struct pfsync_deferral *pd;
-       unsigned int sched;
-
-       NET_ASSERT_LOCKED();
+       size_t nlen = pfsync_qs[q].len;
+       struct mbuf *m = NULL;
 
-       if (!sc->sc_defer ||
-           ISSET(st->state_flags, PFSTATE_NOSYNC) ||
-           m->m_flags & (M_BCAST|M_MCAST))
-               return (0);
+       MUTEX_ASSERT_LOCKED(&s->s_mtx);
+       KASSERT(st->sync_state == PFSYNC_S_NONE);
+       KASSERT(s->s_len >= PFSYNC_MINPKT);
 
-       pd = pool_get(&sc->sc_pool, M_NOWAIT);
-       if (pd == NULL)
-               return (0);
+       if (TAILQ_EMPTY(&s->s_qs[q]))
+               nlen += sizeof(struct pfsync_subheader);
 
-       /*
-        * deferral queue grows faster, than timeout can consume,
-        * we have to ask packet (caller) to help timer and dispatch
-        * one deferral for us.
-        *
-        * We wish to call pfsync_undefer() here. Unfortunately we can't,
-        * because pfsync_undefer() will be calling to ip_output(),
-        * which in turn will call to pf_test(), which would then attempt
-        * to grab PF_LOCK() we currently hold.
-        */
-       if (sc->sc_deferred >= 128) {
-               mtx_enter(&sc->sc_deferrals_mtx);
-               *ppd = TAILQ_FIRST(&sc->sc_deferrals);
-               if (*ppd != NULL) {
-                       TAILQ_REMOVE(&sc->sc_deferrals, *ppd, pd_entry);
-                       sc->sc_deferred--;
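+       /* flush the slice first if this state would overflow the packet */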
+       if (s->s_len + nlen > s->s_pfsync->sc_if.if_mtu) {
+               m = pfsync_slice_write(s);
+               if (m != NULL) {
+                       s->s_stat_enqueue++;
+                       if (mq_enqueue(&s->s_sendq, m) == 0)
+                               task_add(s->s_softnet, &s->s_send);
                }
-               mtx_leave(&sc->sc_deferrals_mtx);
-       } else
-               *ppd = NULL;
-
-       m->m_pkthdr.pf.flags |= PF_TAG_GENERATED;
-       SET(st->state_flags, PFSTATE_ACK);
 
-       pd->pd_st = pf_state_ref(st);
-       pd->pd_m = m;
+               nlen = sizeof(struct pfsync_subheader) + pfsync_qs[q].len;
+       }
 
-       pd->pd_deadline = getnsecuptime() + PFSYNC_DEFER_NSEC;
+       s->s_len += nlen;
+       pf_state_ref(st);
+       TAILQ_INSERT_TAIL(&s->s_qs[q], st, sync_list);
+       st->sync_state = q;
 
-       mtx_enter(&sc->sc_deferrals_mtx);
-       sched = TAILQ_EMPTY(&sc->sc_deferrals);
+       if (!timeout_pending(&s->s_tmo))
+               timeout_add_sec(&s->s_tmo, 1);
+}
 
-       TAILQ_INSERT_TAIL(&sc->sc_deferrals, pd, pd_entry);
-       sc->sc_deferred++;
-       mtx_leave(&sc->sc_deferrals_mtx);
+static void
+pfsync_q_del(struct pfsync_slice *s, struct pf_state *st)
+{
+       unsigned int q = st->sync_state;
 
-       if (sched)
-               timeout_add_nsec(&sc->sc_deferrals_tmo, PFSYNC_DEFER_NSEC);
+       MUTEX_ASSERT_LOCKED(&s->s_mtx);
+       KASSERT(st->sync_state < PFSYNC_S_NONE);
 
-       schednetisr(NETISR_PFSYNC);
+       st->sync_state = PFSYNC_S_NONE;
+       TAILQ_REMOVE(&s->s_qs[q], st, sync_list);
+       pf_state_unref(st);
+       s->s_len -= pfsync_qs[q].len;
 
-       return (1);
+       if (TAILQ_EMPTY(&s->s_qs[q]))
+               s->s_len -= sizeof(struct pfsync_subheader);
 }
 
+/*
+ * the pfsync hooks that pf calls
+ */
+
 void
-pfsync_undefer_notify(struct pfsync_deferral *pd)
+pfsync_init_state(struct pf_state *st, const struct pf_state_key *skw,
+    const struct pf_state_key *sks, int flags)
 {
-       struct pf_pdesc pdesc;
-       struct pf_state *st = pd->pd_st;
+       /* this is called before pf_state_insert */
 
-       /*
-        * pf_remove_state removes the state keys and sets st->timeout
-        * to PFTM_UNLINKED. this is done under NET_LOCK which should
-        * be held here, so we can use PFTM_UNLINKED as a test for
-        * whether the state keys are set for the address family
-        * lookup.
-        */
+       if (skw->proto == IPPROTO_PFSYNC)
+               SET(st->state_flags, PFSTATE_NOSYNC);
 
-       if (st->timeout == PFTM_UNLINKED)
+       if (ISSET(st->state_flags, PFSTATE_NOSYNC)) {
+               st->sync_state = PFSYNC_S_DEAD;
                return;
+       }
 
-       if (st->rt == PF_ROUTETO) {
-               if (pf_setup_pdesc(&pdesc, st->key[PF_SK_WIRE]->af,
-                   st->direction, st->kif, pd->pd_m, NULL) != PF_PASS)
-                       return;
-               switch (st->key[PF_SK_WIRE]->af) {
-               case AF_INET:
-                       pf_route(&pdesc, st);
-                       break;
-#ifdef INET6
-               case AF_INET6:
-                       pf_route6(&pdesc, st);
-                       break;
-#endif /* INET6 */
-               default:
-                       unhandled_af(st->key[PF_SK_WIRE]->af);
-               }
-               pd->pd_m = pdesc.m;
-       } else {
-               switch (st->key[PF_SK_WIRE]->af) {
-               case AF_INET:
-                       ip_output(pd->pd_m, NULL, NULL, 0, NULL, NULL, 0);
-                       break;
-#ifdef INET6
-               case AF_INET6:
-                       ip6_output(pd->pd_m, NULL, NULL, 0, NULL, NULL);
-                       break;
-#endif /* INET6 */
-               default:
-                       unhandled_af(st->key[PF_SK_WIRE]->af);
-               }
-
-               pd->pd_m = NULL;
+       if (ISSET(flags, PFSYNC_SI_IOCTL)) {
+               /* all good */
+               return;
        }
-}
 
-void
-pfsync_free_deferral(struct pfsync_deferral *pd)
-{
-       struct pfsync_softc *sc = pfsyncif;
+       /* state came off the wire */
+       if (ISSET(st->state_flags, PFSTATE_ACK)) {
+               CLR(st->state_flags, PFSTATE_ACK);
 
-       pf_state_unref(pd->pd_st);
-       m_freem(pd->pd_m);
-       pool_put(&sc->sc_pool, pd);
+               /* peer wants an iack, not an insert */
+               st->sync_state = PFSYNC_S_SYNC;
+       }
 }
 
 void
-pfsync_undefer(struct pfsync_deferral *pd, int drop)
+pfsync_insert_state(struct pf_state *st)
 {
-       struct pfsync_softc *sc = pfsyncif;
+       struct pfsync_softc *sc;
 
-       NET_ASSERT_LOCKED();
+       MUTEX_ASSERT_UNLOCKED(&st->mtx);
 
-       if (sc == NULL)
+       if (ISSET(st->state_flags, PFSTATE_NOSYNC) ||
+           st->sync_state == PFSYNC_S_DEAD)
                return;
 
-       CLR(pd->pd_st->state_flags, PFSTATE_ACK);
-       if (!drop)
-               pfsync_undefer_notify(pd);
+       smr_read_enter();
+       sc = SMR_PTR_GET(&pfsyncif);
+       if (sc != NULL) {
+               struct pfsync_slice *s = pfsync_slice_enter(sc, st);
+
+               switch (st->sync_state) {
+               case PFSYNC_S_UPD_C:
+                       /* we must have lost a race after insert */
+                       pfsync_q_del(s, st);
+                       /* FALLTHROUGH */
+               case PFSYNC_S_NONE:
+                       pfsync_q_ins(s, st, PFSYNC_S_INS);
+                       break;
+               case PFSYNC_S_SYNC:
+                       st->sync_state = PFSYNC_S_NONE; /* gross */
+                       pfsync_q_ins(s, st, PFSYNC_S_IACK);
+                       pfsync_slice_sched(s); /* the peer is waiting */
+                       break;
+               default:
+                       panic("%s: state %p unexpected sync_state %d",
+                           __func__, st, st->sync_state);
+                       /* NOTREACHED */
+               }
 
-       pfsync_free_deferral(pd);
+               pfsync_slice_leave(sc, s);
+       }
+       smr_read_leave();
 }
 
 void
-pfsync_deferrals_tmo(void *arg)
+pfsync_update_state(struct pf_state *st)
 {
-       struct pfsync_softc *sc = arg;
-       struct pfsync_deferral *pd;
-       uint64_t now, nsec = 0;
-       struct pfsync_deferrals pds = TAILQ_HEAD_INITIALIZER(pds);
+       struct pfsync_softc *sc;
 
-       now = getnsecuptime();
+       MUTEX_ASSERT_UNLOCKED(&st->mtx);
 
-       mtx_enter(&sc->sc_deferrals_mtx);
-       for (;;) {
-               pd = TAILQ_FIRST(&sc->sc_deferrals);
-               if (pd == NULL)
+       if (ISSET(st->state_flags, PFSTATE_NOSYNC) ||
+           st->sync_state == PFSYNC_S_DEAD)
+               return;
+
+       smr_read_enter();
+       sc = SMR_PTR_GET(&pfsyncif);
+       if (sc != NULL) {
+               struct pfsync_slice *s = pfsync_slice_enter(sc, st);
+               int sync = 0;
+
+               switch (st->sync_state) {
+               case PFSYNC_S_UPD_C:
+               case PFSYNC_S_UPD:
+                       /* we're already handling it */
+                       if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) {
+                               st->sync_updates++;
+                               if (st->sync_updates >= sc->sc_maxupdates)
+                                       sync = 1;
+                       }
+                       /* FALLTHROUGH */
+               case PFSYNC_S_INS:
+               case PFSYNC_S_DEL:
+               case PFSYNC_S_DEAD:
                        break;
 
-               if (now < pd->pd_deadline) {
-                       nsec = pd->pd_deadline - now;
+               case PFSYNC_S_IACK:
+                       pfsync_q_del(s, st);
+                       /* FALLTHROUGH */
+               case PFSYNC_S_NONE:
+                       pfsync_q_ins(s, st, PFSYNC_S_UPD_C);
+                       st->sync_updates = 0;
                        break;
+               default:
+                       panic("%s: state %p unexpected sync_state %d",
+                           __func__, st, st->sync_state);
+                       /* NOTREACHED */
                }
 
-               TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
-               sc->sc_deferred--;
-               TAILQ_INSERT_TAIL(&pds, pd, pd_entry);
-       }
-       mtx_leave(&sc->sc_deferrals_mtx);
-
-       if (nsec > 0) {
-               /* we were looking at a pd, but it wasn't old enough */
-               timeout_add_nsec(&sc->sc_deferrals_tmo, nsec);
-       }
-
-       if (TAILQ_EMPTY(&pds))
-               return;
-
-       NET_LOCK();
-       while ((pd = TAILQ_FIRST(&pds)) != NULL) {
-               TAILQ_REMOVE(&pds, pd, pd_entry);
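+               /* states that are changing quickly get pushed out promptly */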
+               if (!sync && (getuptime() - st->pfsync_time) < 2)
+                       sync = 1;
 
-               pfsync_undefer(pd, 0);
+               if (sync)
+                       pfsync_slice_sched(s);
+               pfsync_slice_leave(sc, s);
        }
-       NET_UNLOCK();
+       smr_read_leave();
 }
 
 void
-pfsync_deferred(struct pf_state *st, int drop)
+pfsync_delete_state(struct pf_state *st)
 {
-       struct pfsync_softc *sc = pfsyncif;
-       struct pfsync_deferral *pd;
+       struct pfsync_softc *sc;
 
-       NET_ASSERT_LOCKED();
+       MUTEX_ASSERT_UNLOCKED(&st->mtx);
+
+       if (ISSET(st->state_flags, PFSTATE_NOSYNC) ||
+           st->sync_state == PFSYNC_S_DEAD)
+               return;
+
+       smr_read_enter();
+       sc = SMR_PTR_GET(&pfsyncif);
+       if (sc != NULL) {
+               struct pfsync_slice *s = pfsync_slice_enter(sc, st);
+
+               switch (st->sync_state) {
+               case PFSYNC_S_INS:
+                       /* let's pretend this never happened */
+                       pfsync_q_del(s, st);
+                       break;
 
-       mtx_enter(&sc->sc_deferrals_mtx);
-       TAILQ_FOREACH(pd, &sc->sc_deferrals, pd_entry) {
-                if (pd->pd_st == st) {
-                       TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
-                       sc->sc_deferred--;
+               case PFSYNC_S_UPD_C:
+               case PFSYNC_S_UPD:
+               case PFSYNC_S_IACK:
+                       pfsync_q_del(s, st);
+                       /* FALLTHROUGH */
+               case PFSYNC_S_NONE:
+                       pfsync_q_ins(s, st, PFSYNC_S_DEL);
+                       st->sync_updates = 0;
                        break;
+               case PFSYNC_S_DEL:
+               case PFSYNC_S_DEAD:
+                       /* XXX we should count this */
+                       break;
+               default:
+                       panic("%s: state %p unexpected sync_state %d",
+                           __func__, st, st->sync_state);
+                       /* NOTREACHED */
                }
-       }
-       mtx_leave(&sc->sc_deferrals_mtx);
 
-       if (pd != NULL)
-               pfsync_undefer(pd, drop);
+               pfsync_slice_leave(sc, s);
+       }
+       smr_read_leave();
 }
 
+struct pfsync_subh_clr {
+       struct pfsync_subheader subh;
+       struct pfsync_clr       clr;
+} __packed __aligned(4);
+
 void
-pfsync_update_state(struct pf_state *st)
+pfsync_clear_states(u_int32_t creatorid, const char *ifname)
 {
-       struct pfsync_softc *sc = pfsyncif;
-       int sync = 0;
+       struct pfsync_softc *sc;
+       struct pfsync_subh_clr *h;
+       struct mbuf *m;
+       unsigned int hlen, mlen;
 
-       NET_ASSERT_LOCKED();
+       smr_read_enter();
+       sc = SMR_PTR_GET(&pfsyncif);
+       if (sc != NULL)
+               refcnt_take(&sc->sc_refs);
+       smr_read_leave();
 
-       if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING))
+       if (sc == NULL)
                return;
 
-       if (ISSET(st->state_flags, PFSTATE_ACK))
-               pfsync_deferred(st, 0);
-       if (ISSET(st->state_flags, PFSTATE_NOSYNC)) {
-               if (st->sync_state != PFSYNC_S_NONE)
-                       pfsync_q_del(st);
-               return;
-       }
+       hlen = sizeof(sc->sc_template) +
+           sizeof(struct pfsync_header) +
+           sizeof(*h);
 
-       if (sc->sc_len == PFSYNC_MINPKT)
-               timeout_add_sec(&sc->sc_tmo, 1);
+       mlen = max_linkhdr + hlen;
 
-       switch (st->sync_state) {
-       case PFSYNC_S_UPD_C:
-       case PFSYNC_S_UPD:
-       case PFSYNC_S_INS:
-               /* we're already handling it */
+       m = m_gethdr(M_DONTWAIT, MT_DATA);
+       if (m == NULL) {
+               /* count error */
+               goto leave;
+       }
 
-               if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) {
-                       st->sync_updates++;
-                       if (st->sync_updates >= sc->sc_maxupdates)
-                               sync = 1;
+       if (mlen > MHLEN) {
+               MCLGETL(m, M_DONTWAIT, mlen);
+               if (!ISSET(m->m_flags, M_EXT)) {
+                       m_freem(m);
+                       goto leave;
                }
-               break;
+       }
 
-       case PFSYNC_S_IACK:
-               pfsync_q_del(st);
-       case PFSYNC_S_NONE:
-               pfsync_q_ins(st, PFSYNC_S_UPD_C);
-               st->sync_updates = 0;
-               break;
+       m_align(m, sizeof(*h));
+       h = mtod(m, struct pfsync_subh_clr *);
 
-       case PFSYNC_S_DEL:
-       case PFSYNC_S_COUNT:
-       case PFSYNC_S_DEFER:
-               break;
+       h->subh.action = PFSYNC_ACT_CLR;
+       h->subh.len = sizeof(h->clr) >> 2;
+       h->subh.count = htons(1);
 
-       default:
-               panic("pfsync_update_state: unexpected sync state %d",
-                   st->sync_state);
-       }
+       strlcpy(h->clr.ifname, ifname, sizeof(h->clr.ifname));
+       h->clr.creatorid = creatorid;
 
-       if (sync || (getuptime() - st->pfsync_time) < 2)
-               schednetisr(NETISR_PFSYNC);
-}
+       m->m_pkthdr.len = m->m_len = sizeof(*h);
+       m = pfsync_encap(sc, m);
+       if (m == NULL)
+               goto leave;
 
-void
-pfsync_cancel_full_update(struct pfsync_softc *sc)
-{
-       if (timeout_pending(&sc->sc_bulkfail_tmo) ||
-           timeout_pending(&sc->sc_bulk_tmo)) {
-#if NCARP > 0
-               if (!pfsync_sync_ok)
-                       carp_group_demote_adj(&sc->sc_if, -1,
-                           "pfsync bulk cancelled");
-               if (sc->sc_initial_bulk) {
-                       carp_group_demote_adj(&sc->sc_if, -32,
-                           "pfsync init");
-                       sc->sc_initial_bulk = 0;
-               }
-#endif
-               pfsync_sync_ok = 1;
-               DPFPRINTF(LOG_INFO, "cancelling bulk update");
-       }
-       timeout_del(&sc->sc_bulkfail_tmo);
-       timeout_del(&sc->sc_bulk_tmo);
-       sc->sc_bulk_next = NULL;
-       sc->sc_bulk_last = NULL;
-       sc->sc_ureq_sent = 0;
-       sc->sc_bulk_tries = 0;
+       pfsync_sendout(sc, m);
+leave:
+       refcnt_rele_wake(&sc->sc_refs);
 }
 
-void
-pfsync_request_full_update(struct pfsync_softc *sc)
+int
+pfsync_state_in_use(struct pf_state *st)
 {
-       if (sc->sc_sync_ifidx != 0 && ISSET(sc->sc_if.if_flags, IFF_RUNNING)) {
-               /* Request a full state table update. */
-               sc->sc_ureq_sent = getuptime();
-#if NCARP > 0
-               if (!sc->sc_link_demoted && pfsync_sync_ok)
-                       carp_group_demote_adj(&sc->sc_if, 1,
-                           "pfsync bulk start");
-#endif
-               pfsync_sync_ok = 0;
-               DPFPRINTF(LOG_INFO, "requesting bulk update");
-               PF_LOCK();
-               timeout_add(&sc->sc_bulkfail_tmo, 4 * hz +
-                   pf_pool_limits[PF_LIMIT_STATES].limit /
-                   ((sc->sc_if.if_mtu - PFSYNC_MINPKT) /
-                   sizeof(struct pfsync_state)));
-               PF_UNLOCK();
-               pfsync_request_update(0, 0);
+       struct pfsync_softc *sc;
+       int rv = 0;
+
+       smr_read_enter();
+       sc = SMR_PTR_GET(&pfsyncif);
+       if (sc != NULL) {
+               /*
+                * pfsync bulk sends run inside
+                * rw_enter_read(&pf_state_list.pfs_rwl), and this
+                * code (pfsync_state_in_use) is only called from the
+                * purge code inside
+                * rw_enter_write(&pf_state_list.pfs_rwl). therefore,
+                * those two sections are exclusive so we can safely
+                * look at the bulk send pointers.
+                */
+               /* rw_assert_wrlock(&pf_state_list.pfs_rwl); */
+               if (sc->sc_bulk_snd.snd_next == st ||
+                   sc->sc_bulk_snd.snd_tail == st)
+                       rv = 1;
        }
+       smr_read_leave();
+
+       return (rv);
 }
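
a note on the inverted rwlock trick described in the comment above: the
bulk send path holds pf_state_list.pfs_rwl as a *reader* while it mutates
snd_next/snd_tail (bulk sends are serialised against each other by
snd_lock), and the purge path takes it as a *writer* purely to exclude any
bulk send while it inspects those pointers. a minimal userspace sketch of
that exclusion, with POSIX rwlocks standing in for the kernel's (names are
illustrative, not from the tree):

    #include <pthread.h>

    static pthread_rwlock_t pfs_rwl = PTHREAD_RWLOCK_INITIALIZER;
    static void *snd_next, *snd_tail;   /* stand-ins for the bulk cursors */

    void
    bulk_send_side(void *st)
    {
            /* shared hold: excludes the writer, not other readers */
            pthread_rwlock_rdlock(&pfs_rwl);
            snd_next = st;  /* safe: bulk sends serialise on another lock */
            pthread_rwlock_unlock(&pfs_rwl);
    }

    int
    purge_side(const void *st)
    {
            int in_use;

            /* exclusive hold: no bulk send can be in its read section */
            pthread_rwlock_wrlock(&pfs_rwl);
            in_use = (snd_next == st || snd_tail == st);
            pthread_rwlock_unlock(&pfs_rwl);

            return (in_use);
    }
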
 
-void
-pfsync_request_update(u_int32_t creatorid, u_int64_t id)
+int
+pfsync_defer(struct pf_state *st, struct mbuf *m)
 {
-       struct pfsync_softc *sc = pfsyncif;
-       struct pfsync_upd_req_item *item;
-       size_t nlen, sclen;
-       int retry;
-
-       /*
-        * this code does nothing to prevent multiple update requests for the
-        * same state being generated.
-        */
+       struct pfsync_softc *sc;
+       struct pfsync_slice *s;
+       struct pfsync_deferral *pd;
+       int sched = 0;
+       int rv = 0;
 
-       item = pool_get(&sc->sc_pool, PR_NOWAIT);
-       if (item == NULL) {
-               /* XXX stats */
-               return;
-       }
+       if (ISSET(st->state_flags, PFSTATE_NOSYNC) ||
+           ISSET(m->m_flags, M_BCAST|M_MCAST))
+               return (0);
 
-       item->ur_msg.id = id;
-       item->ur_msg.creatorid = creatorid;
+       smr_read_enter();
+       sc = SMR_PTR_GET(&pfsyncif);
+       if (sc == NULL || !sc->sc_defer)
+               goto leave;
 
-       for (;;) {
-               mtx_enter(&sc->sc_upd_req_mtx);
+       pd = pool_get(&pfsync_deferrals_pool, PR_NOWAIT);
+       if (pd == NULL)
+               goto leave;
 
-               nlen = sizeof(struct pfsync_upd_req);
-               if (TAILQ_EMPTY(&sc->sc_upd_req_list))
-                       nlen += sizeof(struct pfsync_subheader);
+       s = pfsync_slice_enter(sc, st);
+       s->s_stat_defer_add++;
 
-               sclen = atomic_add_long_nv(&sc->sc_len, nlen);
-               retry = (sclen > sc->sc_if.if_mtu);
-               if (retry)
-                       atomic_sub_long(&sc->sc_len, nlen);
-               else
-                       TAILQ_INSERT_TAIL(&sc->sc_upd_req_list, item, ur_entry);
+       pd->pd_st = pf_state_ref(st);
+       pd->pd_m = m;
+       pd->pd_deadline = getnsecuptime() + PFSYNC_DEFER_NSEC;
 
-               mtx_leave(&sc->sc_upd_req_mtx);
+       m->m_pkthdr.pf.flags |= PF_TAG_GENERATED;
+       st->sync_defer = pd;
 
-               if (!retry)
-                       break;
+       sched = s->s_deferred++;
+       TAILQ_INSERT_TAIL(&s->s_deferrals, pd, pd_entry);
 
-               pfsync_sendout();
+       if (sched == 0)
+               timeout_add_nsec(&s->s_deferrals_tmo, PFSYNC_DEFER_NSEC);
+       else if (sched >= PFSYNC_DEFER_LIMIT) {
+               s->s_stat_defer_overlimit++;
+               timeout_del(&s->s_deferrals_tmo);
+               task_add(s->s_softnet, &s->s_deferrals_task);
        }
 
-       schednetisr(NETISR_PFSYNC);
+       pfsync_slice_sched(s);
+       pfsync_slice_leave(sc, s);
+       rv = 1;
+leave:
+       smr_read_leave();
+
+       return (rv);
 }
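
note the scheduling dance above: the deferral timeout is only armed when
the queue goes from empty to non-empty (sched == 0), and once
PFSYNC_DEFER_LIMIT entries pile up the timeout is dropped in favour of an
immediate task. a tiny sketch of that arm-on-first-enqueue pattern, where
timer_arm()/timer_cancel()/task_queue() are hypothetical stand-ins for
timeout_add_nsec/timeout_del/task_add:

    /* hypothetical stand-ins for the kernel timeout/task primitives */
    void timer_arm(void);
    void timer_cancel(void);
    void task_queue(void);

    struct defer_q {
            unsigned int depth;
            unsigned int limit;
    };

    void
    defer_enqueue(struct defer_q *q)
    {
            unsigned int sched = q->depth++;

            if (sched == 0)
                    timer_arm();            /* first entry: arm deadline */
            else if (sched >= q->limit) {
                    timer_cancel();         /* backlog: flush right now */
                    task_queue();
            }
    }
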
 
-void
-pfsync_update_state_req(struct pf_state *st)
+static void
+pfsync_deferred(struct pfsync_softc *sc, struct pf_state *st)
 {
-       struct pfsync_softc *sc = pfsyncif;
-
-       if (sc == NULL)
-               panic("pfsync_update_state_req: nonexistent instance");
+       struct pfsync_slice *s;
+       struct pfsync_deferral *pd;
 
-       if (ISSET(st->state_flags, PFSTATE_NOSYNC)) {
-               if (st->sync_state != PFSYNC_S_NONE)
-                       pfsync_q_del(st);
-               return;
-       }
+       s = pfsync_slice_enter(sc, st);
 
-       switch (st->sync_state) {
-       case PFSYNC_S_UPD_C:
-       case PFSYNC_S_IACK:
-               pfsync_q_del(st);
-       case PFSYNC_S_NONE:
-               pfsync_q_ins(st, PFSYNC_S_UPD);
-               schednetisr(NETISR_PFSYNC);
-               return;
+       pd = st->sync_defer;
+       if (pd != NULL) {
+               s->s_stat_defer_ack++;
 
-       case PFSYNC_S_INS:
-       case PFSYNC_S_UPD:
-       case PFSYNC_S_DEL:
-               /* we're already handling it */
-               return;
+               TAILQ_REMOVE(&s->s_deferrals, pd, pd_entry);
+               s->s_deferred--;
 
-       default:
-               panic("pfsync_update_state_req: unexpected sync state %d",
-                   st->sync_state);
+               st = pd->pd_st;
+               st->sync_defer = NULL;
        }
+       pfsync_slice_leave(sc, s);
+
+       if (pd != NULL)
+               pfsync_defer_output(pd);
 }
 
-void
-pfsync_delete_state(struct pf_state *st)
+static void
+pfsync_deferrals_tmo(void *arg)
 {
-       struct pfsync_softc *sc = pfsyncif;
+       struct pfsync_slice *s = arg;
 
-       NET_ASSERT_LOCKED();
+       if (READ_ONCE(s->s_deferred) > 0)
+               task_add(s->s_softnet, &s->s_deferrals_task);
+}
 
-       if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING))
-               return;
+static void
+pfsync_deferrals_task(void *arg)
+{
+       struct pfsync_slice *s = arg;
+       struct pfsync_deferral *pd;
+       struct pf_state *st;
+       uint64_t now, nsec = 0;
+       struct pfsync_deferrals pds = TAILQ_HEAD_INITIALIZER(pds);
 
-       if (ISSET(st->state_flags, PFSTATE_ACK))
-               pfsync_deferred(st, 1);
-       if (ISSET(st->state_flags, PFSTATE_NOSYNC)) {
-               if (st->sync_state != PFSYNC_S_NONE)
-                       pfsync_q_del(st);
-               return;
-       }
+       now = getnsecuptime();
+
+       mtx_enter(&s->s_mtx);
+       s->s_stat_defer_run++; /* maybe move this into the loop */
+       for (;;) {
+               pd = TAILQ_FIRST(&s->s_deferrals);
+               if (pd == NULL)
+                       break;
 
-       if (sc->sc_len == PFSYNC_MINPKT)
-               timeout_add_sec(&sc->sc_tmo, 1);
+               if (s->s_deferred < PFSYNC_DEFER_LIMIT &&
+                   now < pd->pd_deadline) {
+                       nsec = pd->pd_deadline - now;
+                       break;
+               }
 
-       switch (st->sync_state) {
-       case PFSYNC_S_INS:
-               /* we never got to tell the world so just forget about it */
-               pfsync_q_del(st);
-               return;
+               TAILQ_REMOVE(&s->s_deferrals, pd, pd_entry);
+               s->s_deferred--;
 
-       case PFSYNC_S_UPD_C:
-       case PFSYNC_S_UPD:
-       case PFSYNC_S_IACK:
-               pfsync_q_del(st);
                /*
-                * FALLTHROUGH to putting it on the del list
-                * Note on reference count bookkeeping:
-                *      pfsync_q_del() drops reference for queue
-                *      ownership. But the st entry survives, because
-                *      our caller still holds a reference.
+                * detach the pd from the state. the pd still refers
+                * to the state though.
                 */
+               st = pd->pd_st;
+               st->sync_defer = NULL;
 
-       case PFSYNC_S_NONE:
-               /*
-                * We either fall through here, or there is no reference to
-                * st owned by pfsync queues at this point.
-                *
-                * Calling pfsync_q_ins() puts st to del queue. The pfsync_q_ins()
-                * grabs a reference for delete queue.
-                */
-               pfsync_q_ins(st, PFSYNC_S_DEL);
+               TAILQ_INSERT_TAIL(&pds, pd, pd_entry);
+       }
+       mtx_leave(&s->s_mtx);
+
+       if (nsec > 0) {
+               /* we were looking at a pd, but it wasn't old enough */
+               timeout_add_nsec(&s->s_deferrals_tmo, nsec);
+       }
+
+       if (TAILQ_EMPTY(&pds))
                return;
 
-       default:
-               panic("pfsync_delete_state: unexpected sync state %d",
-                   st->sync_state);
+       NET_LOCK();
+       while ((pd = TAILQ_FIRST(&pds)) != NULL) {
+               TAILQ_REMOVE(&pds, pd, pd_entry);
+
+               pfsync_defer_output(pd);
        }
+       NET_UNLOCK();
 }
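
the shape of pfsync_deferrals_task is the classic two-phase drain: only
list surgery happens under the slice mutex, expired entries are collected
onto a local list, and the real work runs after the mutex is dropped
(pfsync_defer_output needs NET_LOCK, which must not be taken while holding
a mutex). a cut-down userspace sketch of the idiom; unlike the real code
it moves everything rather than only the expired entries:

    #include <pthread.h>
    #include <sys/queue.h>

    struct ent {
            TAILQ_ENTRY(ent) entry;
    };
    TAILQ_HEAD(entlist, ent);

    static pthread_mutex_t q_mtx = PTHREAD_MUTEX_INITIALIZER;
    static struct entlist q = TAILQ_HEAD_INITIALIZER(q);

    void process(struct ent *);         /* expensive; must run unlocked */

    void
    drain(void)
    {
            struct entlist work = TAILQ_HEAD_INITIALIZER(work);
            struct ent *e;

            pthread_mutex_lock(&q_mtx);
            TAILQ_CONCAT(&work, &q, entry);     /* O(1) move, under lock */
            pthread_mutex_unlock(&q_mtx);

            while ((e = TAILQ_FIRST(&work)) != NULL) {
                    TAILQ_REMOVE(&work, e, entry);
                    process(e);                 /* mutex not held here */
            }
    }
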
 
-void
-pfsync_clear_states(u_int32_t creatorid, const char *ifname)
+static void
+pfsync_defer_output(struct pfsync_deferral *pd)
 {
-       struct pfsync_softc *sc = pfsyncif;
-       struct {
-               struct pfsync_subheader subh;
-               struct pfsync_clr clr;
-       } __packed r;
+       struct pf_pdesc pdesc;
+       struct pf_state *st = pd->pd_st;
 
-       NET_ASSERT_LOCKED();
+       if (st->rt == PF_ROUTETO) {
+               if (pf_setup_pdesc(&pdesc, st->key[PF_SK_WIRE]->af,
+                   st->direction, NULL, pd->pd_m, NULL) != PF_PASS)
+                       return;
+               switch (st->key[PF_SK_WIRE]->af) {
+               case AF_INET:
+                       pf_route(&pdesc, st);
+                       break;
+#ifdef INET6
+               case AF_INET6:
+                       pf_route6(&pdesc, st);
+                       break;
+#endif /* INET6 */
+               default:
+                       unhandled_af(st->key[PF_SK_WIRE]->af);
+               }
+               pd->pd_m = pdesc.m;
+       } else {
+               switch (st->key[PF_SK_WIRE]->af) {
+               case AF_INET:
+                       ip_output(pd->pd_m, NULL, NULL, 0, NULL, NULL, 0);
+                       break;
+#ifdef INET6
+               case AF_INET6:
+                       ip6_output(pd->pd_m, NULL, NULL, 0, NULL, NULL);
+                       break;
+#endif /* INET6 */
+               default:
+                       unhandled_af(st->key[PF_SK_WIRE]->af);
+               }
 
-       if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING))
-               return;
+               pd->pd_m = NULL;
+       }
 
-       bzero(&r, sizeof(r));
+       pf_state_unref(st);
+       m_freem(pd->pd_m);
+       pool_put(&pfsync_deferrals_pool, pd);
+}
 
-       r.subh.action = PFSYNC_ACT_CLR;
-       r.subh.len = sizeof(struct pfsync_clr) >> 2;
-       r.subh.count = htons(1);
+struct pfsync_subh_bus {
+       struct pfsync_subheader subh;
+       struct pfsync_bus       bus;
+} __packed __aligned(4);
 
-       strlcpy(r.clr.ifname, ifname, sizeof(r.clr.ifname));
-       r.clr.creatorid = creatorid;
+static unsigned int
+pfsync_bulk_snd_bus(struct pfsync_softc *sc,
+    struct mbuf *m, const unsigned int space,
+    uint32_t endtime, uint8_t status)
+{
+       struct pfsync_subh_bus *h;
+       unsigned int nlen;
 
-       pfsync_send_plus(&r, sizeof(r));
-}
+       nlen = m->m_len + sizeof(*h);
+       if (space < nlen)
+               return (0);
 
-void
-pfsync_iack(struct pf_state *st)
-{
-       pfsync_q_ins(st, PFSYNC_S_IACK);
-       schednetisr(NETISR_PFSYNC);
+       h = (struct pfsync_subh_bus *)(mtod(m, caddr_t) + m->m_len);
+       memset(h, 0, sizeof(*h));
+
+       h->subh.action = PFSYNC_ACT_BUS;
+       h->subh.len = sizeof(h->bus) >> 2;
+       h->subh.count = htons(1);
+
+       h->bus.creatorid = pf_status.hostid;
+       h->bus.endtime = htonl(endtime);
+       h->bus.status = status;
+
+       m->m_len = nlen;
+
+       return (1);
 }
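
the wire framing is worth spelling out: subh.len carries the size of one
message in 32-bit words (hence the >> 2, and why these structs are
__packed __aligned(4)), and subh.count is the message count in network
byte order. a standalone sketch of the same framing into a flat buffer
(userspace and illustrative, not the kernel's mbuf handling):

    #include <arpa/inet.h>
    #include <stdint.h>
    #include <string.h>

    struct subheader {
            uint8_t  action;
            uint8_t  len;       /* one message, in 32-bit words */
            uint16_t count;     /* messages that follow, network order */
    };

    /*
     * append a subheader plus count messages at off; returns the new
     * end of packet, or 0 when the caller must flush and retry.
     * assumes msglen is a multiple of 4 and msglen / 4 fits in a byte.
     */
    size_t
    frame(uint8_t *buf, size_t off, size_t space, uint8_t action,
        const void *msgs, size_t msglen, uint16_t count)
    {
            struct subheader sh;
            size_t nlen = off + sizeof(sh) + msglen * count;

            if (msglen % 4 != 0 || space < nlen)
                    return (0);

            sh.action = action;
            sh.len = msglen >> 2;
            sh.count = htons(count);

            memcpy(buf + off, &sh, sizeof(sh));
            memcpy(buf + off + sizeof(sh), msgs, msglen * count);
            return (nlen);
    }
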
 
-void
-pfsync_q_ins(struct pf_state *st, int q)
+static unsigned int
+pfsync_bulk_snd_states(struct pfsync_softc *sc,
+    struct mbuf *m, const unsigned int space, unsigned int len)
 {
-       struct pfsync_softc *sc = pfsyncif;
-       size_t nlen, sclen;
+       struct pf_state *st;
+       struct pfsync_state *sp;
+       unsigned int nlen;
+       unsigned int count = 0;
 
-       if (sc->sc_len < PFSYNC_MINPKT)
-               panic("pfsync pkt len is too low %zd", sc->sc_len);
-       do {
-               mtx_enter(&sc->sc_st_mtx);
-               mtx_enter(&st->mtx);
+       st = sc->sc_bulk_snd.snd_next;
 
-               /*
-                * There are either two threads trying to update the
-                * the same state, or the state is just being processed
-                * (is on snapshot queue).
-                */
-               if (st->sync_state != PFSYNC_S_NONE) {
-                       mtx_leave(&st->mtx);
-                       mtx_leave(&sc->sc_st_mtx);
+       for (;;) {
+               nlen = len + sizeof(*sp);
+               sp = (struct pfsync_state *)(mtod(m, caddr_t) + len);
+               if (space < nlen)
                        break;
-               }
 
-               nlen = pfsync_qs[q].len;
+               mtx_enter(&st->mtx);
+               pf_state_export(sp, st);
+               mtx_leave(&st->mtx);
+
+               /* commit */
+               count++;
+               m->m_len = len = nlen;
 
-               if (TAILQ_EMPTY(&sc->sc_qs[q]))
-                       nlen += sizeof(struct pfsync_subheader);
+               if (st == sc->sc_bulk_snd.snd_tail) {
+                       if (pfsync_bulk_snd_bus(sc, m, space,
+                           0, PFSYNC_BUS_END) == 0) {
+                               /* couldn't fit the BUS */
+                               st = NULL;
+                               break;
+                       }
 
-               sclen = atomic_add_long_nv(&sc->sc_len, nlen);
-               if (sclen > sc->sc_if.if_mtu) {
-                       atomic_sub_long(&sc->sc_len, nlen);
-                       mtx_leave(&st->mtx);
-                       mtx_leave(&sc->sc_st_mtx);
-                       pfsync_sendout();
-                       continue;
+                       /* this BUS is done */
+                       pfsync_dprintf(sc, "bulk send done (%s)", __func__);
+                       sc->sc_bulk_snd.snd_again = 0; /* XXX */
+                       sc->sc_bulk_snd.snd_next = NULL;
+                       sc->sc_bulk_snd.snd_tail = NULL;
+                       return (count);
                }
 
-               pf_state_ref(st);
+               st = TAILQ_NEXT(st, entry_list);
+       }
 
-               TAILQ_INSERT_TAIL(&sc->sc_qs[q], st, sync_list);
-               st->sync_state = q;
-               mtx_leave(&st->mtx);
-               mtx_leave(&sc->sc_st_mtx);
-       } while (0);
+       /* there's still work to do */
+       sc->sc_bulk_snd.snd_next = st;
+       timeout_add_msec(&sc->sc_bulk_snd.snd_tmo, PFSYNC_BULK_SND_IVAL_MS);
+
+       return (count);
 }
 
-void
-pfsync_q_del(struct pf_state *st)
+static unsigned int
+pfsync_bulk_snd_sub(struct pfsync_softc *sc,
+    struct mbuf *m, const unsigned int space)
 {
-       struct pfsync_softc *sc = pfsyncif;
-       int q;
+       struct pfsync_subheader *subh;
+       unsigned int count;
+       unsigned int len, nlen;
+
+       len = m->m_len;
+       nlen = len + sizeof(*subh);
+       if (nlen > space)
+               return (0);
+
+       subh = (struct pfsync_subheader *)(mtod(m, caddr_t) + len);
 
-       mtx_enter(&sc->sc_st_mtx);
-       mtx_enter(&st->mtx);
-       q = st->sync_state;
        /*
-        * re-check under mutex
-        * if state is snapped already, then just bail out, because we came
-        * too late, the state is being just processed/dispatched to peer.
+        * pfsync_bulk_snd_states only updates m->m_len after
+        * filling in a state at the offset we gave it.
         */
-       if ((q == PFSYNC_S_NONE) || (st->snapped)) {
-               mtx_leave(&st->mtx);
-               mtx_leave(&sc->sc_st_mtx);
-               return;
-       }
-       atomic_sub_long(&sc->sc_len, pfsync_qs[q].len);
-       TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list);
-       if (TAILQ_EMPTY(&sc->sc_qs[q]))
-               atomic_sub_long(&sc->sc_len, sizeof (struct pfsync_subheader));
-       st->sync_state = PFSYNC_S_NONE;
-       mtx_leave(&st->mtx);
-       mtx_leave(&sc->sc_st_mtx);
+       count = pfsync_bulk_snd_states(sc, m, space, nlen);
+       if (count == 0)
+               return (0);
 
-       pf_state_unref(st);
+       subh->action = PFSYNC_ACT_UPD;
+       subh->len = sizeof(struct pfsync_state) >> 2;
+       subh->count = htons(count);
+
+       return (count);
 }
 
-#if defined(IPSEC)
-void
-pfsync_update_tdb(struct tdb *t, int output)
+static void
+pfsync_bulk_snd_start(struct pfsync_softc *sc)
 {
-       struct pfsync_softc *sc = pfsyncif;
-       size_t nlen, sclen;
+       const unsigned int space = sc->sc_if.if_mtu -
+           (sizeof(struct ip) + sizeof(struct pfsync_header));
+       struct mbuf *m;
 
-       if (sc == NULL)
-               return;
+       rw_enter_read(&pf_state_list.pfs_rwl);
 
-       if (!ISSET(t->tdb_flags, TDBF_PFSYNC)) {
-               do {
-                       mtx_enter(&sc->sc_tdb_mtx);
-                       nlen = sizeof(struct pfsync_tdb);
+       rw_enter_write(&sc->sc_bulk_snd.snd_lock);
+       if (sc->sc_bulk_snd.snd_next != NULL) {
+               sc->sc_bulk_snd.snd_again = 1;
+               goto leave;
+       }
 
-                       mtx_enter(&t->tdb_mtx);
-                       if (ISSET(t->tdb_flags, TDBF_PFSYNC)) {
-                               /* we've lost race, no action for us then */
-                               mtx_leave(&t->tdb_mtx);
-                               mtx_leave(&sc->sc_tdb_mtx);
-                               break;
-                       }
+       mtx_enter(&pf_state_list.pfs_mtx);
+       sc->sc_bulk_snd.snd_next = TAILQ_FIRST(&pf_state_list.pfs_list);
+       sc->sc_bulk_snd.snd_tail = TAILQ_LAST(&pf_state_list.pfs_list,
+           pf_state_queue);
+       mtx_leave(&pf_state_list.pfs_mtx);
 
-                       if (TAILQ_EMPTY(&sc->sc_tdb_q))
-                               nlen += sizeof(struct pfsync_subheader);
+       m = m_gethdr(M_DONTWAIT, MT_DATA);
+       if (m == NULL)
+               goto leave;
 
-                       sclen = atomic_add_long_nv(&sc->sc_len, nlen);
-                       if (sclen > sc->sc_if.if_mtu) {
-                               atomic_sub_long(&sc->sc_len, nlen);
-                               mtx_leave(&t->tdb_mtx);
-                               mtx_leave(&sc->sc_tdb_mtx);
-                               pfsync_sendout();
-                               continue;
-                       }
+       MCLGETL(m, M_DONTWAIT, max_linkhdr + sc->sc_if.if_mtu);
+       if (!ISSET(m->m_flags, M_EXT)) {
+               /* some error++ */
+               m_freem(m); /* drop */
+               goto leave;
+       }
 
-                       TAILQ_INSERT_TAIL(&sc->sc_tdb_q, t, tdb_sync_entry);
-                       tdb_ref(t);
-                       SET(t->tdb_flags, TDBF_PFSYNC);
-                       mtx_leave(&t->tdb_mtx);
+       m_align(m, space);
+       m->m_len = 0;
 
-                       mtx_leave(&sc->sc_tdb_mtx);
-                       t->tdb_updates = 0;
-               } while (0);
-       } else {
-               if (++t->tdb_updates >= sc->sc_maxupdates)
-                       schednetisr(NETISR_PFSYNC);
+       if (sc->sc_bulk_snd.snd_tail == NULL) {
+               pfsync_dprintf(sc, "bulk send empty (%s)", __func__);
+
+               /* list is empty */
+               if (pfsync_bulk_snd_bus(sc, m, space, 0, PFSYNC_BUS_END) == 0)
+                       panic("%s: mtu is too low", __func__);
+               goto encap;
        }
 
-       mtx_enter(&t->tdb_mtx);
-       if (output)
-               SET(t->tdb_flags, TDBF_PFSYNC_RPL);
-       else
-               CLR(t->tdb_flags, TDBF_PFSYNC_RPL);
-       mtx_leave(&t->tdb_mtx);
+       pfsync_dprintf(sc, "bulk send start (%s)", __func__);
+
+       /* start a bulk update. */
+       if (pfsync_bulk_snd_bus(sc, m, space, 0, PFSYNC_BUS_START) == 0)
+               panic("%s: mtu is too low", __func__);
+
+       /* fill it up with state updates. */
+       pfsync_bulk_snd_sub(sc, m, space);
+
+encap:
+       m->m_pkthdr.len = m->m_len;
+       m = pfsync_encap(sc, m);
+       if (m == NULL)
+               goto leave;
+
+       pfsync_sendout(sc, m);
+
+leave:
+       rw_exit_write(&sc->sc_bulk_snd.snd_lock);
+
+       rw_exit_read(&pf_state_list.pfs_rwl);
 }
-#endif
 
-#if defined(IPSEC)
-void
-pfsync_delete_tdb(struct tdb *t)
+static void
+pfsync_bulk_snd_tmo(void *arg)
 {
-       struct pfsync_softc *sc = pfsyncif;
-       size_t nlen;
+       struct pfsync_softc *sc = arg;
+       const unsigned int space = sc->sc_if.if_mtu -
+           (sizeof(struct ip) + sizeof(struct pfsync_header));
+       struct mbuf *m;
 
-       if (sc == NULL || !ISSET(t->tdb_flags, TDBF_PFSYNC))
+       m = m_gethdr(M_DONTWAIT, MT_DATA);
+       if (m == NULL) {
+               /* some error++ */
+               /* retry later */
+               timeout_add_msec(&sc->sc_bulk_snd.snd_tmo,
+                   PFSYNC_BULK_SND_IVAL_MS);
                return;
+       }
 
-       mtx_enter(&sc->sc_tdb_mtx);
-
-       /*
-        * if tdb entry is just being processed (found in snapshot),
-        * then it can not be deleted. we just came too late
-        */
-       if (ISSET(t->tdb_flags, TDBF_PFSYNC_SNAPPED)) {
-               mtx_leave(&sc->sc_tdb_mtx);
+       MCLGETL(m, M_DONTWAIT, max_linkhdr + sc->sc_if.if_mtu);
+       if (!ISSET(m->m_flags, M_EXT)) {
+               /* some error++ */
+               m_freem(m);
+               /* retry later */
+               timeout_add_msec(&sc->sc_bulk_snd.snd_tmo,
+                   PFSYNC_BULK_SND_IVAL_MS);
                return;
        }
 
-       TAILQ_REMOVE(&sc->sc_tdb_q, t, tdb_sync_entry);
+       m_align(m, space);
+       m->m_len = 0;
+
+       rw_enter_read(&pf_state_list.pfs_rwl);
+       rw_enter_write(&sc->sc_bulk_snd.snd_lock);
 
-       mtx_enter(&t->tdb_mtx);
-       CLR(t->tdb_flags, TDBF_PFSYNC);
-       mtx_leave(&t->tdb_mtx);
+       if (sc->sc_bulk_snd.snd_next == NULL) {
+               /* there was no space in the previous packet for a BUS END */
 
-       nlen = sizeof(struct pfsync_tdb);
-       if (TAILQ_EMPTY(&sc->sc_tdb_q))
-               nlen += sizeof(struct pfsync_subheader);
-       atomic_sub_long(&sc->sc_len, nlen);
+               if (pfsync_bulk_snd_bus(sc, m, space, 0, PFSYNC_BUS_END) == 0)
+                       panic("%s: mtu is too low", __func__);
 
-       mtx_leave(&sc->sc_tdb_mtx);
+               /* this bulk is done */
+               pfsync_dprintf(sc, "bulk send done (%s)", __func__);
+               sc->sc_bulk_snd.snd_again = 0; /* XXX */
+               sc->sc_bulk_snd.snd_tail = NULL;
+       } else {
+               pfsync_dprintf(sc, "bulk send again (%s)", __func__);
+
+               /* fill it up with state updates. */
+               pfsync_bulk_snd_sub(sc, m, space);
+       }
+
+       m->m_pkthdr.len = m->m_len;
+       m = pfsync_encap(sc, m);
 
-       tdb_unref(t);
+       rw_exit_write(&sc->sc_bulk_snd.snd_lock);
+       rw_exit_read(&pf_state_list.pfs_rwl);
+
+       if (m != NULL) {
+               NET_LOCK();
+               pfsync_sendout(sc, m);
+               NET_UNLOCK();
+       }
 }
-#endif
 
-void
-pfsync_out_tdb(struct tdb *t, void *buf)
+static void
+pfsync_update_state_req(struct pfsync_softc *sc, struct pf_state *st)
+{
+       struct pfsync_slice *s = pfsync_slice_enter(sc, st);
+
+       switch (st->sync_state) {
+       case PFSYNC_S_UPD_C:
+       case PFSYNC_S_IACK:
+               pfsync_q_del(s, st);
+               /* FALLTHROUGH */
+       case PFSYNC_S_NONE:
+               pfsync_q_ins(s, st, PFSYNC_S_UPD);
+               break;
+
+       case PFSYNC_S_INS:
+       case PFSYNC_S_UPD:
+       case PFSYNC_S_DEL:
+               /* we're already handling it */
+               break;
+       default:
+               panic("%s: state %p unexpected sync_state %d",
+                   __func__, st, st->sync_state);
+       }
+
+       pfsync_slice_sched(s);
+       pfsync_slice_leave(sc, s);
+}
+
+#if defined(IPSEC)
+static void
+pfsync_out_tdb(struct tdb *tdb, void *buf)
 {
        struct pfsync_tdb *ut = buf;
 
-       bzero(ut, sizeof(*ut));
-       ut->spi = t->tdb_spi;
-       bcopy(&t->tdb_dst, &ut->dst, sizeof(ut->dst));
+       memset(ut, 0, sizeof(*ut));
+       ut->spi = tdb->tdb_spi;
+       memcpy(&ut->dst, &tdb->tdb_dst, sizeof(ut->dst));
        /*
         * When a failover happens, the master's rpl is probably above
         * what we see here (we may be up to a second late), so
@@ -2422,219 +2400,934 @@ pfsync_out_tdb(struct tdb *t, void *buf)
         * this edge case.
         */
 #define RPL_INCR 16384
-       ut->rpl = htobe64(t->tdb_rpl + (ISSET(t->tdb_flags, TDBF_PFSYNC_RPL) ?
-           RPL_INCR : 0));
-       ut->cur_bytes = htobe64(t->tdb_cur_bytes);
-       ut->sproto = t->tdb_sproto;
-       ut->rdomain = htons(t->tdb_rdomain);
+       ut->rpl = htobe64(tdb->tdb_rpl +
+           (ISSET(tdb->tdb_flags, TDBF_PFSYNC_RPL) ? RPL_INCR : 0));
+       ut->cur_bytes = htobe64(tdb->tdb_cur_bytes);
+       ut->sproto = tdb->tdb_sproto;
+       ut->rdomain = htons(tdb->tdb_rdomain);
 }
 
-void
-pfsync_bulk_start(void)
+static struct pfsync_slice *
+pfsync_slice_enter_tdb(struct pfsync_softc *sc, const struct tdb *t)
 {
-       struct pfsync_softc *sc = pfsyncif;
-
-       NET_ASSERT_LOCKED();
-
        /*
-        * pf gc via pfsync_state_in_use reads sc_bulk_next and
-        * sc_bulk_last while exclusively holding the pf_state_list
-        * rwlock. make sure it can't race with us setting these
-        * pointers. they basically act as hazards, and borrow the
-        * lists state reference count.
+        * just use the first slice for all ipsec (for now) until
+        * it's more obvious what property (eg, spi) we can distribute
+        * tdbs over slices with.
         */
-       rw_enter_read(&pf_state_list.pfs_rwl);
-
-       /* get a consistent view of the list pointers */
-       mtx_enter(&pf_state_list.pfs_mtx);
-       if (sc->sc_bulk_next == NULL)
-               sc->sc_bulk_next = TAILQ_FIRST(&pf_state_list.pfs_list);
-
-       sc->sc_bulk_last = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue);
-       mtx_leave(&pf_state_list.pfs_mtx);
-
-       rw_exit_read(&pf_state_list.pfs_rwl);
-
-       DPFPRINTF(LOG_INFO, "received bulk update request");
+       struct pfsync_slice *s = &sc->sc_slices[0];
 
-       if (sc->sc_bulk_last == NULL)
-               pfsync_bulk_status(PFSYNC_BUS_END);
-       else {
-               sc->sc_ureq_received = getuptime();
-
-               pfsync_bulk_status(PFSYNC_BUS_START);
-               timeout_add(&sc->sc_bulk_tmo, 0);
+       if (!mtx_enter_try(&s->s_mtx)) {
+               mtx_enter(&s->s_mtx);
+               s->s_stat_contended++;
        }
+       s->s_stat_locks++;
+
+       return (s);
 }
 
-void
-pfsync_bulk_update(void *arg)
+static void
+pfsync_tdb_ins(struct pfsync_slice *s, struct tdb *tdb)
 {
-       struct pfsync_softc *sc;
-       struct pf_state *st;
-       int i = 0;
-
-       NET_LOCK();
-       sc = pfsyncif;
-       if (sc == NULL)
-               goto out;
+       size_t nlen = sizeof(struct pfsync_tdb);
+       struct mbuf *m = NULL;
 
-       rw_enter_read(&pf_state_list.pfs_rwl);
-       st = sc->sc_bulk_next;
-       sc->sc_bulk_next = NULL;
+       KASSERT(s->s_len >= PFSYNC_MINPKT);
 
-       if (st == NULL) {
-               rw_exit_read(&pf_state_list.pfs_rwl);
-               goto out;
-       }
+       MUTEX_ASSERT_LOCKED(&s->s_mtx);
+       MUTEX_ASSERT_UNLOCKED(&tdb->tdb_mtx);
 
-       for (;;) {
-               if (st->sync_state == PFSYNC_S_NONE &&
-                   st->timeout < PFTM_MAX &&
-                   st->pfsync_time <= sc->sc_ureq_received) {
-                       pfsync_update_state_req(st);
-                       i++;
-               }
+       if (TAILQ_EMPTY(&s->s_tdb_q))
+               nlen += sizeof(struct pfsync_subheader);
 
-               st = TAILQ_NEXT(st, entry_list);
-               if ((st == NULL) || (st == sc->sc_bulk_last)) {
-                       /* we're done */
-                       sc->sc_bulk_last = NULL;
-                       pfsync_bulk_status(PFSYNC_BUS_END);
-                       break;
+       if (s->s_len + nlen > s->s_pfsync->sc_if.if_mtu) {
+               m = pfsync_slice_write(s);
+               if (m != NULL) {
+                       s->s_stat_enqueue++;
+                       if (mq_enqueue(&s->s_sendq, m) == 0)
+                               task_add(s->s_softnet, &s->s_send);
                }
 
-               if (i > 1 && (sc->sc_if.if_mtu - sc->sc_len) <
-                   sizeof(struct pfsync_state)) {
-                       /* we've filled a packet */
-                       sc->sc_bulk_next = st;
-                       timeout_add(&sc->sc_bulk_tmo, 1);
-                       break;
-               }
+               nlen = sizeof(struct pfsync_subheader) +
+                   sizeof(struct pfsync_tdb);
        }
 
-       rw_exit_read(&pf_state_list.pfs_rwl);
- out:
-       NET_UNLOCK();
+       s->s_len += nlen;
+       TAILQ_INSERT_TAIL(&s->s_tdb_q, tdb, tdb_sync_entry);
+       tdb->tdb_updates = 0;
+
+       if (!timeout_pending(&s->s_tmo))
+               timeout_add_sec(&s->s_tmo, 1);
 }
 
-void
-pfsync_bulk_status(u_int8_t status)
+static void
+pfsync_tdb_del(struct pfsync_slice *s, struct tdb *tdb)
 {
-       struct {
-               struct pfsync_subheader subh;
-               struct pfsync_bus bus;
-       } __packed r;
-
-       struct pfsync_softc *sc = pfsyncif;
+       MUTEX_ASSERT_LOCKED(&s->s_mtx);
+       MUTEX_ASSERT_UNLOCKED(&tdb->tdb_mtx);
 
-       bzero(&r, sizeof(r));
+       TAILQ_REMOVE(&s->s_tdb_q, tdb, tdb_sync_entry);
 
-       r.subh.action = PFSYNC_ACT_BUS;
-       r.subh.len = sizeof(struct pfsync_bus) >> 2;
-       r.subh.count = htons(1);
-
-       r.bus.creatorid = pf_status.hostid;
-       r.bus.endtime = htonl(getuptime() - sc->sc_ureq_received);
-       r.bus.status = status;
-
-       pfsync_send_plus(&r, sizeof(r));
+       s->s_len -= sizeof(struct pfsync_tdb);
+       if (TAILQ_EMPTY(&s->s_tdb_q))
+               s->s_len -= sizeof(struct pfsync_subheader);
 }
 
+/*
+ * the reference that pfsync has to a tdb is accounted for by the
+ * TDBF_PFSYNC flag, not by tdb_ref/tdb_unref. pfsync_delete_tdb() is
+ * called after all other references to a tdb are dropped (with
+ * tdb_unref) as part of tdb_free().
+ *
+ * tdb_free() needs to wait for pfsync to let go of the tdb though,
+ * which would be best handled by a reference count, but tdb_free
+ * needs the NET_LOCK which pfsync is already fighting with. instead
+ * use the TDBF_PFSYNC_SNAPPED flag to coordinate the pfsync write/drop
+ * with tdb_free.
+ */
+
 void
-pfsync_bulk_fail(void *arg)
+pfsync_update_tdb(struct tdb *tdb, int output)
 {
        struct pfsync_softc *sc;
 
-       NET_LOCK();
-       sc = pfsyncif;
-       if (sc == NULL)
-               goto out;
-       if (sc->sc_bulk_tries++ < PFSYNC_MAX_BULKTRIES) {
-               /* Try again */
-               timeout_add_sec(&sc->sc_bulkfail_tmo, 5);
-               pfsync_request_update(0, 0);
-       } else {
-               /* Pretend like the transfer was ok */
-               sc->sc_ureq_sent = 0;
-               sc->sc_bulk_tries = 0;
-#if NCARP > 0
-               if (!pfsync_sync_ok)
-                       carp_group_demote_adj(&sc->sc_if, -1,
-                           sc->sc_link_demoted ?
-                           "pfsync link state up" :
-                           "pfsync bulk fail");
-               if (sc->sc_initial_bulk) {
-                       carp_group_demote_adj(&sc->sc_if, -32,
-                           "pfsync init");
-                       sc->sc_initial_bulk = 0;
-               }
-#endif
-               pfsync_sync_ok = 1;
-               sc->sc_link_demoted = 0;
-               DPFPRINTF(LOG_ERR, "failed to receive bulk update");
+       MUTEX_ASSERT_UNLOCKED(&tdb->tdb_mtx);
+
+       smr_read_enter();
+       sc = SMR_PTR_GET(&pfsyncif);
+       if (sc != NULL) {
+               struct pfsync_slice *s = pfsync_slice_enter_tdb(sc, tdb);
+
+               /* TDBF_PFSYNC is only changed while the slice mtx is held */
+               if (!ISSET(tdb->tdb_flags, TDBF_PFSYNC)) {
+                       mtx_enter(&tdb->tdb_mtx);
+                       SET(tdb->tdb_flags, TDBF_PFSYNC);
+                       mtx_leave(&tdb->tdb_mtx);
+
+                       pfsync_tdb_ins(s, tdb);
+               } else if (++tdb->tdb_updates >= sc->sc_maxupdates)
+                       pfsync_slice_sched(s);
+
+               /* XXX no sync timestamp on tdbs to check */
+
+               pfsync_slice_leave(sc, s);
        }
- out:
-       NET_UNLOCK();
+       smr_read_leave();
 }
 
 void
-pfsync_send_plus(void *plus, size_t pluslen)
+pfsync_delete_tdb(struct tdb *tdb)
 {
-       struct pfsync_softc *sc = pfsyncif;
+       struct pfsync_softc *sc;
 
-       if (sc->sc_len + pluslen > sc->sc_if.if_mtu)
-               pfsync_sendout();
+       MUTEX_ASSERT_UNLOCKED(&tdb->tdb_mtx);
 
-       sc->sc_plus = plus;
-       sc->sc_pluslen = pluslen;
-       atomic_add_long(&sc->sc_len, pluslen);
+       smr_read_enter();
+       sc = SMR_PTR_GET(&pfsyncif);
+       if (sc != NULL) {
+               struct pfsync_slice *s = pfsync_slice_enter_tdb(sc, tdb);
 
-       pfsync_sendout();
-}
+               /* TDBF_PFSYNC is only changed while the slice mtx is held */
+               if (ISSET(tdb->tdb_flags, TDBF_PFSYNC)) {
+                       pfsync_tdb_del(s, tdb);
 
-int
-pfsync_is_up(void)
-{
-       struct pfsync_softc *sc = pfsyncif;
+                       mtx_enter(&tdb->tdb_mtx);
+                       CLR(tdb->tdb_flags, TDBF_PFSYNC);
+                       mtx_leave(&tdb->tdb_mtx);
+               }
 
-       if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING))
-               return (0);
+               pfsync_slice_leave(sc, s);
+       }
+       smr_read_leave();
 
-       return (1);
+       /*
+        * handle the case where pfsync_slice_drop is being called from
+        * pfsync_down, in which case the smr/slice access above won't work.
+        */
+
+       mtx_enter(&tdb->tdb_mtx);
+       SET(tdb->tdb_flags, TDBF_PFSYNC_SNAPPED); /* like a thanos snap */
+       while (ISSET(tdb->tdb_flags, TDBF_PFSYNC)) {
+               msleep_nsec(&tdb->tdb_updates, &tdb->tdb_mtx, PWAIT,
+                   "tdbfree", INFSLP);
+       }
+       mtx_leave(&tdb->tdb_mtx);
 }
+#endif /* defined(IPSEC) */
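
the TDBF_PFSYNC/TDBF_PFSYNC_SNAPPED dance above is a flag handshake: the
delete side marks the tdb snapped so nobody re-takes the flag, then sleeps
on &tdb->tdb_updates until whoever still holds TDBF_PFSYNC (the
pfsync_slice_drop path mentioned in the comment) clears it and wakes the
channel. the same handshake in userspace, with a condvar standing in for
msleep_nsec/wakeup (illustrative only):

    #include <pthread.h>
    #include <stdbool.h>

    struct obj {
            pthread_mutex_t mtx;
            pthread_cond_t  cv;     /* stands in for the wakeup channel */
            bool in_use;            /* stands in for TDBF_PFSYNC */
            bool snapped;           /* stands in for TDBF_PFSYNC_SNAPPED */
    };

    void
    obj_wait_released(struct obj *o)    /* the tdb_free/delete side */
    {
            pthread_mutex_lock(&o->mtx);
            o->snapped = true;          /* no new holders from here on */
            while (o->in_use)
                    pthread_cond_wait(&o->cv, &o->mtx);
            pthread_mutex_unlock(&o->mtx);
    }

    void
    obj_release(struct obj *o)          /* the pfsync drop side */
    {
            pthread_mutex_lock(&o->mtx);
            o->in_use = false;
            pthread_cond_broadcast(&o->cv);
            pthread_mutex_unlock(&o->mtx);
    }
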
 
-int
-pfsync_state_in_use(struct pf_state *st)
+struct pfsync_act {
+       void (*in)(struct pfsync_softc *, const caddr_t,
+           unsigned int, unsigned int);
+       size_t len;
+};
+
+static void    pfsync_in_clr(struct pfsync_softc *,
+                   const caddr_t, unsigned int, unsigned int);
+static void    pfsync_in_iack(struct pfsync_softc *,
+                   const caddr_t, unsigned int, unsigned int);
+static void    pfsync_in_upd_c(struct pfsync_softc *,
+                   const caddr_t, unsigned int, unsigned int);
+static void    pfsync_in_ureq(struct pfsync_softc *,
+                   const caddr_t, unsigned int, unsigned int);
+static void    pfsync_in_del(struct pfsync_softc *,
+                   const caddr_t, unsigned int, unsigned int);
+static void    pfsync_in_del_c(struct pfsync_softc *,
+                   const caddr_t, unsigned int, unsigned int);
+static void    pfsync_in_bus(struct pfsync_softc *,
+                   const caddr_t, unsigned int, unsigned int);
+static void    pfsync_in_tdb(struct pfsync_softc *,
+                   const caddr_t, unsigned int, unsigned int);
+static void    pfsync_in_ins(struct pfsync_softc *,
+                   const caddr_t, unsigned int, unsigned int);
+static void    pfsync_in_upd(struct pfsync_softc *,
+                   const caddr_t, unsigned int, unsigned int);
+
+static const struct pfsync_act pfsync_acts[] = {
+       [PFSYNC_ACT_CLR] =
+           { pfsync_in_clr,    sizeof(struct pfsync_clr) },
+       [PFSYNC_ACT_INS_ACK] =
+           { pfsync_in_iack,   sizeof(struct pfsync_ins_ack) },
+       [PFSYNC_ACT_UPD_C] =
+           { pfsync_in_upd_c,  sizeof(struct pfsync_upd_c) },
+       [PFSYNC_ACT_UPD_REQ] =
+           { pfsync_in_ureq,   sizeof(struct pfsync_upd_req) },
+       [PFSYNC_ACT_DEL] =
+           { pfsync_in_del,    sizeof(struct pfsync_state) },
+       [PFSYNC_ACT_DEL_C] =
+           { pfsync_in_del_c,  sizeof(struct pfsync_del_c) },
+       [PFSYNC_ACT_BUS] =
+           { pfsync_in_bus,    sizeof(struct pfsync_bus) },
+       [PFSYNC_ACT_INS] =
+           { pfsync_in_ins,    sizeof(struct pfsync_state) },
+       [PFSYNC_ACT_UPD] =
+           { pfsync_in_upd,    sizeof(struct pfsync_state) },
+       [PFSYNC_ACT_TDB] =
+           { pfsync_in_tdb,    sizeof(struct pfsync_tdb) },
+};
+
+static void
+pfsync_in_skip(struct pfsync_softc *sc,
+    const caddr_t buf, unsigned int mlen, unsigned int count)
 {
-       struct pfsync_softc *sc = pfsyncif;
+       /* nop */
+}
 
+static struct mbuf *
+pfsync_input(struct mbuf *m, uint8_t ttl, unsigned int hlen)
+{
+       struct pfsync_softc *sc;
+       struct pfsync_header *ph;
+       struct pfsync_subheader *subh;
+       unsigned int len;
+       void (*in)(struct pfsync_softc *,
+           const caddr_t, unsigned int, unsigned int);
+#if NBPF > 0
+       caddr_t if_bpf;
+#endif
+
+       pfsyncstat_inc(pfsyncs_ipackets);
+
+       if (!pf_status.running)
+               return (m);
+
+       /*
+        * pfsyncif is only set if it is up and running correctly.
+        */
+       smr_read_enter();
+       sc = SMR_PTR_GET(&pfsyncif);
        if (sc == NULL)
-               return (0);
+               goto leave;
 
-       rw_assert_wrlock(&pf_state_list.pfs_rwl);
+       if (sc->sc_sync_ifidx != m->m_pkthdr.ph_ifidx) {
+               pfsyncstat_inc(pfsyncs_badif);
+               goto leave;
+       }
 
-       if (st->sync_state != PFSYNC_S_NONE ||
-           st == sc->sc_bulk_next ||
-           st == sc->sc_bulk_last)
-               return (1);
+#if NBPF > 0
+#endif
 
-       return (0);
+       /* verify that the IP TTL is 255. */
+       if (ttl != PFSYNC_DFLTTL) {
+               pfsyncstat_inc(pfsyncs_badttl);
+               goto leave;
+       }
+
+       m_adj(m, hlen);
+
+       if (m->m_pkthdr.len < sizeof(*ph)) {
+               pfsyncstat_inc(pfsyncs_hdrops);
+               goto leave;
+       }
+       if (m->m_len < sizeof(*ph)) {
+               m = m_pullup(m, sizeof(*ph));
+               if (m == NULL)
+                       goto leave;
+       }
+
+       ph = mtod(m, struct pfsync_header *);
+       if (ph->version != PFSYNC_VERSION) {
+               pfsyncstat_inc(pfsyncs_badver);
+               goto leave;
+       }
+
+       len = ntohs(ph->len);
+       if (m->m_pkthdr.len < len) {
+               pfsyncstat_inc(pfsyncs_badlen);
+               goto leave;
+       }
+       if (m->m_pkthdr.len > len)
+               m->m_pkthdr.len = len;
+
+       /* ok, it's serious now */
+       refcnt_take(&sc->sc_refs);
+       smr_read_leave();
+
+       counters_pkt(sc->sc_if.if_counters, ifc_ipackets, ifc_ibytes, len);
+
+       m_adj(m, sizeof(*ph));
+
+       while (m->m_pkthdr.len >= sizeof(*subh)) {
+               unsigned int action, mlen, count;
+
+               if (m->m_len < sizeof(*subh)) {
+                       m = m_pullup(m, sizeof(*subh));
+                       if (m == NULL)
+                               goto rele;
+               }
+               subh = mtod(m, struct pfsync_subheader *);
+
+               action = subh->action;
+               mlen = subh->len << 2;
+               count = ntohs(subh->count);
+
+               if (action >= PFSYNC_ACT_MAX ||
+                   action >= nitems(pfsync_acts) ||
+                   mlen < pfsync_acts[subh->action].len) {
+                       /*
+                        * subheaders are always followed by at least one
+                        * message, so if the peer is new enough to tell us
+                        * how big its messages are then we know enough to
+                        * skip them.
+                        */
+                       if (count == 0 || mlen == 0) {
+                               pfsyncstat_inc(pfsyncs_badact);
+                               goto rele;
+                       }
+
+                       in = pfsync_in_skip;
+               } else {
+                       in = pfsync_acts[action].in;
+                       if (in == NULL)
+                               in = pfsync_in_skip;
+               }
+
+               m_adj(m, sizeof(*subh));
+               len = mlen * count;
+               if (len > m->m_pkthdr.len) {
+                       pfsyncstat_inc(pfsyncs_badlen);
+                       goto rele;
+               }
+               if (m->m_len < len) {
+                       m = m_pullup(m, len);
+                       if (m == NULL)
+                               goto rele;
+               }
+
+               (*in)(sc, mtod(m, caddr_t), mlen, count);
+               m_adj(m, len);
+       }
+
+rele:
+       refcnt_rele_wake(&sc->sc_refs);
+       return (m);
+
+leave:
+       smr_read_leave();
+       return (m);
 }
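
pfsync_input is a length-checked tlv walk: clamp the packet to the length
in the pfsync header, pull up each subheader, validate the action and
per-message size, then consume mlen * count bytes, skipping well-formed
but unknown actions so older and newer peers can interoperate. the same
walk over a flat buffer, minus the mbuf plumbing (a sketch; handle() is a
hypothetical dispatcher that may be a no-op skip):

    #include <arpa/inet.h>
    #include <stdint.h>
    #include <string.h>

    struct subheader {
            uint8_t  action;
            uint8_t  len;       /* one message, in 32-bit words */
            uint16_t count;
    };

    /* returns 0 on a clean parse, -1 on a framing error */
    int
    walk(const uint8_t *p, size_t avail,
        void (*handle)(uint8_t, const uint8_t *, size_t, unsigned int))
    {
            while (avail >= sizeof(struct subheader)) {
                    struct subheader sh;
                    size_t mlen, len;
                    unsigned int count;

                    memcpy(&sh, p, sizeof(sh));
                    p += sizeof(sh);
                    avail -= sizeof(sh);

                    mlen = (size_t)sh.len << 2;
                    count = ntohs(sh.count);
                    if (mlen == 0 || count == 0)
                            return (-1);    /* can't even skip this */

                    len = mlen * count;
                    if (len > avail)
                            return (-1);

                    handle(sh.action, p, mlen, count);
                    p += len;
                    avail -= len;
            }
            return (0);
    }
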
 
-void
-pfsync_timeout(void *arg)
+static void
+pfsync_in_clr(struct pfsync_softc *sc,
+    const caddr_t buf, unsigned int mlen, unsigned int count)
 {
-       NET_LOCK();
-       pfsync_sendout();
-       NET_UNLOCK();
+       const struct pfsync_clr *clr;
+       struct pf_state *head, *tail, *st, *next;
+       struct pfi_kif *kif;
+       uint32_t creatorid;
+       unsigned int i;
+
+       rw_enter_read(&pf_state_list.pfs_rwl);
+
+       /* get a view of the state list */
+       mtx_enter(&pf_state_list.pfs_mtx);
+       head = TAILQ_FIRST(&pf_state_list.pfs_list);
+       tail = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue);
+       mtx_leave(&pf_state_list.pfs_mtx);
+
+       PF_LOCK();
+       for (i = 0; i < count; i++) {
+               clr = (struct pfsync_clr *)(buf + i * mlen);
+
+               creatorid = clr->creatorid;
+               if (clr->ifname[0] == '\0')
+                       kif = NULL;
+               else {
+                       kif = pfi_kif_find(clr->ifname);
+                       if (kif == NULL)
+                               continue;
+               }
+
+               st = NULL;
+               next = head;
+
+               PF_STATE_ENTER_WRITE();
+               while (st != tail) {
+                       st = next;
+                       next = TAILQ_NEXT(st, entry_list);
+
+                       if (creatorid != st->creatorid)
+                               continue;
+                       if (kif != NULL && kif != st->kif)
+                               continue;
+
+                       mtx_enter(&st->mtx);
+                       SET(st->state_flags, PFSTATE_NOSYNC);
+                       mtx_leave(&st->mtx);
+                       pf_remove_state(st);
+               }
+               PF_STATE_EXIT_WRITE();
+       }
+       PF_UNLOCK();
+
+       rw_exit_read(&pf_state_list.pfs_rwl);
 }
 
-/* this is a softnet/netisr handler */
-void
-pfsyncintr(void)
+static void
+pfsync_in_ins(struct pfsync_softc *sc,
+    const caddr_t buf, unsigned int mlen, unsigned int count)
+{
+       const struct pfsync_state *sp;
+       sa_family_t af1, af2;
+       unsigned int i;
+
+       PF_LOCK();
+       for (i = 0; i < count; i++) {
+               sp = (struct pfsync_state *)(buf + mlen * i);
+               af1 = sp->key[0].af;
+               af2 = sp->key[1].af;
+
+               /* check for invalid values */
+               if (sp->timeout >= PFTM_MAX ||
+                   sp->src.state > PF_TCPS_PROXY_DST ||
+                   sp->dst.state > PF_TCPS_PROXY_DST ||
+                   sp->direction > PF_OUT ||
+                   (((af1 || af2) &&
+                    ((af1 != AF_INET && af1 != AF_INET6) ||
+                     (af2 != AF_INET && af2 != AF_INET6))) ||
+                    (sp->af != AF_INET && sp->af != AF_INET6))) {
+                       pfsyncstat_inc(pfsyncs_badval);
+                       continue;
+               }
+
+               if (pf_state_import(sp, 0) == ENOMEM) {
+                       /* drop out, but process the rest of the actions */
+                       break;
+               }
+       }
+       PF_UNLOCK();
+}
+
+static void
+pfsync_in_iack(struct pfsync_softc *sc,
+    const caddr_t buf, unsigned int mlen, unsigned int count)
+{
+       const struct pfsync_ins_ack *ia;
+       struct pf_state_cmp id_key;
+       struct pf_state *st;
+       unsigned int i;
+
+       for (i = 0; i < count; i++) {
+               ia = (struct pfsync_ins_ack *)(buf + mlen * i);
+
+               id_key.id = ia->id;
+               id_key.creatorid = ia->creatorid;
+
+               PF_STATE_ENTER_READ();
+               st = pf_find_state_byid(&id_key);
+               pf_state_ref(st);
+               PF_STATE_EXIT_READ();
+               if (st == NULL)
+                       continue;
+
+               if (READ_ONCE(st->sync_defer) != NULL)
+                       pfsync_deferred(sc, st);
+
+               pf_state_unref(st);
+       }
+}
+
+static int
+pfsync_upd_tcp(struct pf_state *st, const struct pfsync_state_peer *src,
+    const struct pfsync_state_peer *dst)
+{
+       int sync = 0;
+
+       /*
+        * The state should never go backwards except
+        * for syn-proxy states.  Neither should the
+        * sequence window slide backwards.
+        */
+       if ((st->src.state > src->state &&
+           (st->src.state < PF_TCPS_PROXY_SRC ||
+            src->state >= PF_TCPS_PROXY_SRC)) ||
+
+           (st->src.state == src->state &&
+            SEQ_GT(st->src.seqlo, ntohl(src->seqlo))))
+               sync++;
+       else
+               pf_state_peer_ntoh(src, &st->src);
+
+       if ((st->dst.state > dst->state) ||
+
+           (st->dst.state >= TCPS_SYN_SENT &&
+            SEQ_GT(st->dst.seqlo, ntohl(dst->seqlo))))
+               sync++;
+       else
+               pf_state_peer_ntoh(dst, &st->dst);
+
+       return (sync);
+}
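
SEQ_GT above is the standard wrap-safe tcp sequence comparison: subtract
in 32 bits and test the sign, so 0x00000001 counts as "after" 0xffffffff.
for reference, the textbook definition (not new code in this diff):

    #include <stdint.h>

    /* true iff a comes after b in 32-bit wrapping sequence space */
    static inline int
    seq_gt(uint32_t a, uint32_t b)
    {
            return ((int32_t)(a - b) > 0);
    }

    /* e.g. seq_gt(0x00000001, 0xffffffff) == 1 across the wrap */
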
+
+static void
+pfsync_in_updates(struct pfsync_softc *sc, struct pf_state *st,
+    const struct pfsync_state_peer *src, const struct pfsync_state_peer *dst,
+    uint8_t timeout)
 {
-       pfsync_sendout();
+       struct pf_state_scrub *sscrub = NULL;
+       struct pf_state_scrub *dscrub = NULL;
+       int sync;
+
+       if (src->scrub.scrub_flag && st->src.scrub == NULL) {
+               sscrub = pf_state_scrub_get();
+               if (sscrub == NULL) {
+                       /* inc error? */
+                       goto out;
+               }
+       }
+       if (dst->scrub.scrub_flag && st->dst.scrub == NULL) {
+               dscrub = pf_state_scrub_get();
+               if (dscrub == NULL) {
+                       /* inc error? */
+                       goto out;
+               }
+       }
+
+       if (READ_ONCE(st->sync_defer) != NULL)
+               pfsync_deferred(sc, st);
+
+       mtx_enter(&st->mtx);
+
+       /* attach the scrub memory if needed */
+       if (sscrub != NULL && st->src.scrub == NULL) {
+               st->src.scrub = sscrub;
+               sscrub = NULL;
+       }
+       if (dscrub != NULL && st->dst.scrub == NULL) {
+               st->dst.scrub = dscrub;
+               dscrub = NULL;
+       }
+
+       if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP)
+               sync = pfsync_upd_tcp(st, src, dst);
+       else {
+               sync = 0;
+
+               /*
+                * Non-TCP protocol state machines always go
+                * forward.
+                */
+               if (st->src.state > src->state)
+                       sync++;
+               else
+                       pf_state_peer_ntoh(src, &st->src);
+
+               if (st->dst.state > dst->state)
+                       sync++;
+               else
+                       pf_state_peer_ntoh(dst, &st->dst);
+       }
+
+       st->pfsync_time = getuptime();
+       if (sync < 2) {
+               st->expire = st->pfsync_time;
+               st->timeout = timeout;
+       }
+
+       mtx_leave(&st->mtx);
+
+       if (sync) {
+               pfsyncstat_inc(pfsyncs_stale);
+               pfsync_update_state(st);
+       }
+
+out:
+       if (sscrub != NULL)
+               pf_state_scrub_put(sscrub);
+       if (dscrub != NULL)
+               pf_state_scrub_put(dscrub);
+}
+
+static void
+pfsync_in_upd(struct pfsync_softc *sc,
+    const caddr_t buf, unsigned int mlen, unsigned int count)
+{
+       const struct pfsync_state *sp;
+       struct pf_state_cmp id_key;
+       struct pf_state *st;
+       int error;
+       unsigned int i;
+
+       for (i = 0; i < count; i++) {
+               sp = (struct pfsync_state *)(buf + mlen * i);
+
+               /* check for invalid values */
+               if (sp->timeout >= PFTM_MAX ||
+                   sp->src.state > PF_TCPS_PROXY_DST ||
+                   sp->dst.state > PF_TCPS_PROXY_DST) {
+                       pfsyncstat_inc(pfsyncs_badval);
+                       continue;
+               }
+
+               id_key.id = sp->id;
+               id_key.creatorid = sp->creatorid;
+
+               PF_STATE_ENTER_READ();
+               st = pf_find_state_byid(&id_key);
+               pf_state_ref(st);
+               PF_STATE_EXIT_READ();
+               if (st == NULL) {
+                       /* insert the update */
+                       PF_LOCK();
+                       error = pf_state_import(sp, 0);
+                       if (error)
+                               pfsyncstat_inc(pfsyncs_badstate);
+                       PF_UNLOCK();
+                       continue;
+               }
+
+               pfsync_in_updates(sc, st, &sp->src, &sp->dst, sp->timeout);
+
+               pf_state_unref(st);
+       }
+}
+
+static struct mbuf *
+pfsync_upd_req_init(struct pfsync_softc *sc, unsigned int count)
+{
+       struct mbuf *m;
+       unsigned int mlen;
+
+       m = m_gethdr(M_DONTWAIT, MT_DATA);
+       if (m == NULL) {
+               pfsyncstat_inc(pfsyncs_onomem);
+               return (NULL);
+       }
+
+       mlen = max_linkhdr + sizeof(sc->sc_template) +
+           sizeof(struct pfsync_header) +
+           sizeof(struct pfsync_subheader) +
+           sizeof(struct pfsync_upd_req) * count;
+
+       if (mlen > MHLEN) {
+               MCLGETL(m, M_DONTWAIT, mlen);
+               if (!ISSET(m->m_flags, M_EXT)) {
+                       m_freem(m);
+                       return (NULL);
+               }
+       }
+
+       m_align(m, 0);
+       m->m_len = 0;
+
+       return (m);
+}
+
+static void
+pfsync_in_upd_c(struct pfsync_softc *sc,
+    const caddr_t buf, unsigned int mlen, unsigned int count)
+{
+       const struct pfsync_upd_c *up;
+       struct pf_state_cmp id_key;
+       struct pf_state *st;
+       unsigned int i;
+       struct mbuf *m = NULL;
+       unsigned int rcount = 0;
+
+       for (i = 0; i < count; i++) {
+               up = (struct pfsync_upd_c *)(buf + mlen * i);
+
+               /* check for invalid values */
+               if (up->timeout >= PFTM_MAX ||
+                   up->src.state > PF_TCPS_PROXY_DST ||
+                   up->dst.state > PF_TCPS_PROXY_DST) {
+                       pfsyncstat_inc(pfsyncs_badval);
+                       continue;
+               }
+
+               id_key.id = up->id;
+               id_key.creatorid = up->creatorid;
+
+               PF_STATE_ENTER_READ();
+               st = pf_find_state_byid(&id_key);
+               pf_state_ref(st);
+               PF_STATE_EXIT_READ();
+               if (st == NULL) {
+                       /* We don't have this state. Ask for it. */
+                       struct pfsync_upd_req *ur;
+
+                       if (m == NULL) {
+                               m = pfsync_upd_req_init(sc, count);
+                               if (m == NULL) {
+                                       pfsyncstat_inc(pfsyncs_onomem);
+                                       continue;
+                               }
+                       }
+
+                       m = m_prepend(m, sizeof(*ur), M_DONTWAIT);
+                       if (m == NULL) {
+                               pfsyncstat_inc(pfsyncs_onomem);
+                               continue;
+                       }
+
+                       ur = mtod(m, struct pfsync_upd_req *);
+                       ur->id = up->id;
+                       ur->creatorid = up->creatorid;
+                       rcount++;
+
+                       continue;
+               }
+
+               pfsync_in_updates(sc, st, &up->src, &up->dst, up->timeout);
+
+               pf_state_unref(st);
+       }
+
+       if (m != NULL) {
+               struct pfsync_subheader *subh;
+
+               m = m_prepend(m, sizeof(*subh), M_DONTWAIT);
+               if (m == NULL) {
+                       pfsyncstat_inc(pfsyncs_onomem);
+                       return;
+               }
+
+               subh = mtod(m, struct pfsync_subheader *);
+               subh->action = PFSYNC_ACT_UPD_REQ;
+               subh->len = sizeof(struct pfsync_upd_req) >> 2;
+               subh->count = htons(rcount);
+
+               m = pfsync_encap(sc, m);
+               if (m == NULL) {
+                       pfsyncstat_inc(pfsyncs_onomem);
+                       return;
+               }
+
+               pfsync_sendout(sc, m);
+       }
+}
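
note the request packet above is built back to front: m_align(m, 0) parks
the data pointer at the end of the buffer, each m_prepend(sizeof(*ur))
steps backwards to make room for the next request, and a final m_prepend
drops the subheader in front of them all. the same idea against a flat
buffer (a sketch, no mbufs; prepend()/build() are illustrative names):

    #include <stddef.h>
    #include <string.h>

    struct builder {
            unsigned char *buf;     /* backing storage */
            size_t off;             /* start of data; counts down */
    };

    /* make room for len bytes in front of what's built; NULL if full */
    static void *
    prepend(struct builder *b, size_t len)
    {
            if (b->off < len)
                    return (NULL);
            b->off -= len;
            return (b->buf + b->off);
    }

    /* items go in first (back to front), the header goes in last */
    int
    build(struct builder *b, const void *item, size_t ilen,
        const void *hdr, size_t hlen, unsigned int n)
    {
            unsigned int i;
            void *p;

            for (i = 0; i < n; i++) {
                    if ((p = prepend(b, ilen)) == NULL)
                            return (-1);
                    memcpy(p, item, ilen);
            }
            if ((p = prepend(b, hlen)) == NULL)
                    return (-1);
            memcpy(p, hdr, hlen);
            return (0);             /* packet starts at buf + off */
    }
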
+
+static void
+pfsync_in_ureq(struct pfsync_softc *sc,
+    const caddr_t buf, unsigned int mlen, unsigned int count)
+{
+       const struct pfsync_upd_req *ur;
+       struct pf_state_cmp id_key;
+       struct pf_state *st;
+       unsigned int i;
+
+       for (i = 0; i < count; i++) {
+               ur = (struct pfsync_upd_req *)(buf + mlen * i);
+
+               id_key.id = ur->id;
+               id_key.creatorid = ur->creatorid;
+
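+               /* an id and creatorid of zero asks us for a bulk send */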
+               if (id_key.id == 0 && id_key.creatorid == 0) {
+                       pfsync_bulk_snd_start(sc);
+                       continue;
+               }
+
+               PF_STATE_ENTER_READ();
+               st = pf_find_state_byid(&id_key);
+               if (st != NULL && st->timeout < PFTM_MAX &&
+                   !ISSET(st->state_flags, PFSTATE_NOSYNC))
+                       pf_state_ref(st);
+               else
+                       st = NULL;
+               PF_STATE_EXIT_READ();
+               if (st == NULL) {
+                       pfsyncstat_inc(pfsyncs_badstate);
+                       continue;
+               }
+
+               pfsync_update_state_req(sc, st);
+
+               pf_state_unref(st);
+       }
+}
+
+static void
+pfsync_in_del(struct pfsync_softc *sc,
+    const caddr_t buf, unsigned int mlen, unsigned int count)
+{
+       const struct pfsync_state *sp;
+       struct pf_state_cmp id_key;
+       struct pf_state *st;
+       unsigned int i;
+
+       PF_LOCK();
+       PF_STATE_ENTER_WRITE();
+       for (i = 0; i < count; i++) {
+               sp = (struct pfsync_state *)(buf + mlen * i);
+
+               id_key.id = sp->id;
+               id_key.creatorid = sp->creatorid;
+
+               st = pf_find_state_byid(&id_key);
+               if (st == NULL) {
+                       pfsyncstat_inc(pfsyncs_badstate);
+                       continue;
+               }
+
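+               /*
+                * flag the state NOSYNC so the removal below is not
+                * synced back out to our peers.
+                */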
+               mtx_enter(&st->mtx);
+               SET(st->state_flags, PFSTATE_NOSYNC);
+               mtx_leave(&st->mtx);
+               pf_remove_state(st);
+       }
+       PF_STATE_EXIT_WRITE();
+       PF_UNLOCK();
+}
+
+static void
+pfsync_in_del_c(struct pfsync_softc *sc,
+    const caddr_t buf, unsigned int mlen, unsigned int count)
+{
+       const struct pfsync_del_c *sp;
+       struct pf_state_cmp id_key;
+       struct pf_state *st;
+       unsigned int i;
+
+       PF_LOCK();
+       PF_STATE_ENTER_WRITE();
+       for (i = 0; i < count; i++) {
+               sp = (struct pfsync_del_c *)(buf + mlen * i);
+
+               id_key.id = sp->id;
+               id_key.creatorid = sp->creatorid;
+
+               st = pf_find_state_byid(&id_key);
+               if (st == NULL) {
+                       pfsyncstat_inc(pfsyncs_badstate);
+                       continue;
+               }
+
+               mtx_enter(&st->mtx);
+               SET(st->state_flags, PFSTATE_NOSYNC);
+               mtx_leave(&st->mtx);
+               pf_remove_state(st);
+       }
+       PF_STATE_EXIT_WRITE();
+       PF_UNLOCK();
+}
+
+static void
+pfsync_in_bus(struct pfsync_softc *sc,
+    const caddr_t buf, unsigned int len, unsigned int count)
+{
+       const struct pfsync_bus *bus = (struct pfsync_bus *)buf;
+
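+       /* feed the bulk markers to the bulk receive state machine */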
+       switch (bus->status) {
+       case PFSYNC_BUS_START:
+               pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_BUS_START);
+               break;
+
+       case PFSYNC_BUS_END:
+               pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_BUS_END);
+               break;
+       }
+}
+
+#if defined(IPSEC)
+/* Update an in-kernel tdb. Silently fail if no tdb is found. */
+static void
+pfsync_update_net_tdb(const struct pfsync_tdb *pt)
+{
+       struct tdb *tdb;
+
+       NET_ASSERT_LOCKED();
+
+       /* check for invalid values */
+       if (ntohl(pt->spi) <= SPI_RESERVED_MAX ||
+           (pt->dst.sa.sa_family != AF_INET &&
+            pt->dst.sa.sa_family != AF_INET6))
+               goto bad;
+
+       tdb = gettdb(ntohs(pt->rdomain), pt->spi,
+           (union sockaddr_union *)&pt->dst, pt->sproto);
+       if (tdb) {
+               uint64_t rpl = betoh64(pt->rpl);
+               uint64_t cur_bytes = betoh64(pt->cur_bytes);
+
+               /* Neither replay nor byte counter should ever decrease. */
+               mtx_enter(&tdb->tdb_mtx);
+               if (rpl >= tdb->tdb_rpl &&
+                   cur_bytes >= tdb->tdb_cur_bytes) {
+                       tdb->tdb_rpl = rpl;
+                       tdb->tdb_cur_bytes = cur_bytes;
+               }
+               mtx_leave(&tdb->tdb_mtx);
+
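+               /* gettdb() returned a reference, give it back */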
+               tdb_unref(tdb);
+       }
+       return;
+
+ bad:
+       DPFPRINTF(LOG_WARNING, "pfsync_insert: PFSYNC_ACT_TDB_UPD: "
+           "invalid value");
+       pfsyncstat_inc(pfsyncs_badstate);
+       return;
+}
+#endif
+
+static void
+pfsync_in_tdb(struct pfsync_softc *sc,
+    const caddr_t buf, unsigned int len, unsigned int count)
+{
+#if defined(IPSEC)
+       const struct pfsync_tdb *tp;
+       unsigned int i;
+
+       for (i = 0; i < count; i++) {
+               tp = (const struct pfsync_tdb *)(buf + len * i);
+               pfsync_update_net_tdb(tp);
+       }
+#endif
+}
+
+int
+pfsync_input4(struct mbuf **mp, int *offp, int proto, int af)
+{
+       struct mbuf *m = *mp;
+       struct ip *ip;
+
+       ip = mtod(m, struct ip *);
+
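+       /*
+        * pfsync_input hands back whatever is left of the mbuf when
+        * it is finished with it; m_freem(NULL) is a no-op.
+        */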
+       m = pfsync_input(m, ip->ip_ttl, ip->ip_hl << 2);
+
+       m_freem(m);
+       *mp = NULL;
+
+       return (IPPROTO_DONE);
 }
 
 int
@@ -2651,8 +3344,8 @@ pfsync_sysctl_pfsyncstat(void *oldp, size_t *oldlenp, void *newp)
 }
 
 int
-pfsync_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
-    size_t newlen)
+pfsync_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp,
+    void *newp, size_t newlen)
 {
        /* All sysctl names at this level are terminal. */
        if (namelen != 1)
index ff26ac3..e83ddd8 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: if_pfsync.h,v 1.59 2022/11/11 11:47:13 dlg Exp $      */
+/*     $OpenBSD: if_pfsync.h,v 1.60 2023/07/06 04:55:05 dlg Exp $      */
 
 /*
  * Copyright (c) 2001 Michael Shalayeff
@@ -177,7 +177,7 @@ struct pfsync_upd_c {
 struct pfsync_upd_req {
        u_int64_t                       id;
        u_int32_t                       creatorid;
-} __packed;
+} __packed __aligned(4);
 
 /*
  * DEL_C
@@ -295,16 +295,6 @@ enum pfsync_counters {
        pfsyncs_ncounters,
 };
 
-extern struct cpumem *pfsynccounters;
-
-struct pfsync_deferral;
-
-static inline void
-pfsyncstat_inc(enum pfsync_counters c)
-{
-       counters_inc(pfsynccounters, c);
-}
-
 /*
  * this shows where a pf state is with respect to the syncing.
  */
@@ -315,10 +305,11 @@ pfsyncstat_inc(enum pfsync_counters c)
 #define PFSYNC_S_UPD   0x04
 #define PFSYNC_S_COUNT 0x05
 
-#define PFSYNC_S_DEFER 0xfe
-#define PFSYNC_S_NONE  0xff
+#define PFSYNC_S_NONE  0xd0
+#define PFSYNC_S_SYNC  0xd1
+#define PFSYNC_S_DEAD  0xde
 
-int                    pfsync_input(struct mbuf **, int *, int, int);
+int                    pfsync_input4(struct mbuf **, int *, int, int);
 int                    pfsync_sysctl(int *, u_int,  void *, size_t *,
                            void *, size_t);
 
@@ -329,6 +320,9 @@ int                 pfsync_state_import(struct pfsync_state *, int);
 void                   pfsync_state_export(struct pfsync_state *,
                            struct pf_state *);
 
+void                   pfsync_init_state(struct pf_state *,
+                           const struct pf_state_key *,
+                           const struct pf_state_key *, int);
 void                   pfsync_insert_state(struct pf_state *);
 void                   pfsync_update_state(struct pf_state *);
 void                   pfsync_delete_state(struct pf_state *);
@@ -337,14 +331,10 @@ void                      pfsync_clear_states(u_int32_t, const char *);
 void                   pfsync_update_tdb(struct tdb *, int);
 void                   pfsync_delete_tdb(struct tdb *);
 
-int                    pfsync_defer(struct pf_state *, struct mbuf *,
-                           struct pfsync_deferral **);
-void                   pfsync_undefer(struct pfsync_deferral *, int);
+int                    pfsync_defer(struct pf_state *, struct mbuf *);
 
 int                    pfsync_is_up(void);
 int                    pfsync_state_in_use(struct pf_state *);
-
-void                   pfsync_iack(struct pf_state *);
 #endif /* _KERNEL */
 
 #endif /* _NET_IF_PFSYNC_H_ */
index d79d697..0e9f51b 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: netisr.h,v 1.60 2022/07/14 10:52:21 mvs Exp $ */
+/*     $OpenBSD: netisr.h,v 1.61 2023/07/06 04:55:05 dlg Exp $ */
 /*     $NetBSD: netisr.h,v 1.12 1995/08/12 23:59:24 mycroft Exp $      */
 
 /*
@@ -42,7 +42,6 @@
  * on the lowest level routine of each protocol.
  */
 #define NETISR_IP      2               /* same as AF_INET */
-#define NETISR_PFSYNC  5               /* for pfsync "immediate" tx */
 #define NETISR_ARP     18              /* same as AF_LINK */
 #define NETISR_IPV6    24              /* same as AF_INET6 */
 #define NETISR_PIPEX   27              /* for pipex processing */
@@ -64,7 +63,6 @@ void  ipintr(void);
 void   ip6intr(void);
 void   pppintr(void);
 void   bridgeintr(void);
-void   pfsyncintr(void);
 void   pipexintr(void);
 void   pppoeintr(void);
 
index b3a655b..f5ad04d 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: pf.c,v 1.1181 2023/06/05 08:37:27 sashan Exp $ */
+/*     $OpenBSD: pf.c,v 1.1182 2023/07/06 04:55:05 dlg Exp $ */
 
 /*
  * Copyright (c) 2001 Daniel Hartmeier
 
 #if NPFSYNC > 0
 #include <net/if_pfsync.h>
-#else
-struct pfsync_deferral;
 #endif /* NPFSYNC > 0 */
 
 /*
@@ -121,10 +119,6 @@ u_char                      pf_tcp_secret[16];
 int                     pf_tcp_secret_init;
 int                     pf_tcp_iss_off;
 
-int             pf_npurge;
-struct task     pf_purge_task = TASK_INITIALIZER(pf_purge, &pf_npurge);
-struct timeout  pf_purge_to = TIMEOUT_INITIALIZER(pf_purge_timeout, NULL);
-
 enum pf_test_status {
        PF_TEST_FAIL = -1,
        PF_TEST_OK,
@@ -190,8 +184,7 @@ void                         pf_rule_to_actions(struct pf_rule *,
                            struct pf_rule_actions *);
 int                     pf_test_rule(struct pf_pdesc *, struct pf_rule **,
                            struct pf_state **, struct pf_rule **,
-                           struct pf_ruleset **, u_short *,
-                           struct pfsync_deferral **);
+                           struct pf_ruleset **, u_short *);
 static __inline int     pf_create_state(struct pf_pdesc *, struct pf_rule *,
                            struct pf_rule *, struct pf_rule *,
                            struct pf_state_key **, struct pf_state_key **,
@@ -250,6 +243,10 @@ void                        pf_counters_inc(int, struct pf_pdesc *,
                            struct pf_state *, struct pf_rule *,
                            struct pf_rule *);
 
+int                     pf_state_insert(struct pfi_kif *,
+                           struct pf_state_key **, struct pf_state_key **,
+                           struct pf_state *);
+
 int                     pf_state_key_isvalid(struct pf_state_key *);
 struct pf_state_key    *pf_state_key_ref(struct pf_state_key *);
 void                    pf_state_key_unref(struct pf_state_key *);
@@ -1064,10 +1061,11 @@ pf_state_insert(struct pfi_kif *kif, struct pf_state_key **skwp,
        pf_status.fcounters[FCNT_STATE_INSERT]++;
        pf_status.states++;
        pfi_kif_ref(kif, PFI_KIF_REF_STATE);
+       PF_STATE_EXIT_WRITE();
+
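+       /* pfsync has its own locks; insert after the state lock is dropped */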
 #if NPFSYNC > 0
        pfsync_insert_state(st);
 #endif /* NPFSYNC > 0 */
-       PF_STATE_EXIT_WRITE();
 
        *skwp = skw;
        *sksp = sks;
@@ -1318,6 +1316,8 @@ pf_state_export(struct pfsync_state *sp, struct pf_state *st)
 #endif /* NPFLOG > 0 */
        sp->timeout = st->timeout;
        sp->state_flags = htons(st->state_flags);
+       if (READ_ONCE(st->sync_defer) != NULL)
+               sp->state_flags |= htons(PFSTATE_ACK);
        if (!SLIST_EMPTY(&st->src_nodes))
                sp->sync_flags |= PFSYNC_FLAG_SRCNODE;
 
@@ -1519,9 +1519,6 @@ pf_state_import(const struct pfsync_state *sp, int flags)
        st->rule.ptr = r;
        st->anchor.ptr = NULL;
 
-       st->pfsync_time = getuptime();
-       st->sync_state = PFSYNC_S_NONE;
-
        PF_REF_INIT(st->refcnt);
        mtx_init(&st->mtx, IPL_NET);
 
@@ -1529,15 +1526,12 @@ pf_state_import(const struct pfsync_state *sp, int flags)
        r->states_cur++;
        r->states_tot++;
 
+       st->sync_state = PFSYNC_S_NONE;
+       st->pfsync_time = getuptime();
 #if NPFSYNC > 0
-       if (!ISSET(flags, PFSYNC_SI_IOCTL))
-               SET(st->state_flags, PFSTATE_NOSYNC);
+       pfsync_init_state(st, skw, sks, flags);
 #endif
 
-       /*
-        * We just set PFSTATE_NOSYNC bit, which prevents
-        * pfsync_insert_state() to insert state to pfsync.
-        */
        if (pf_state_insert(kif, &skw, &sks, st) != 0) {
                /* XXX when we have anchors, use STATE_DEC_COUNTERS */
                r->states_cur--;
@@ -1545,15 +1539,6 @@ pf_state_import(const struct pfsync_state *sp, int flags)
                goto cleanup_state;
        }
 
-#if NPFSYNC > 0
-       if (!ISSET(flags, PFSYNC_SI_IOCTL)) {
-               CLR(st->state_flags, PFSTATE_NOSYNC);
-               if (ISSET(st->state_flags, PFSTATE_ACK))
-                       pfsync_iack(st);
-       }
-       CLR(st->state_flags, PFSTATE_ACK);
-#endif
-
        return (0);
 
  cleanup:
@@ -1576,47 +1561,106 @@ pf_state_import(const struct pfsync_state *sp, int flags)
 
 /* END state table stuff */
 
+void            pf_purge_states(void *);
+struct task     pf_purge_states_task =
+                    TASK_INITIALIZER(pf_purge_states, NULL);
+
+void            pf_purge_states_tick(void *);
+struct timeout  pf_purge_states_to =
+                    TIMEOUT_INITIALIZER(pf_purge_states_tick, NULL);
+
+unsigned int    pf_purge_expired_states(unsigned int, unsigned int);
+
+/*
+ * how many states to scan this interval.
+ *
+ * this is set when the timeout fires, and reduced by the task. the
+ * task will reschedule itself until the limit is reduced to zero,
+ * and then it adds the timeout again.
+ */
+unsigned int pf_purge_states_limit;
+
+/*
+ * limit how many states are processed with locks held per run of
+ * the state purge task.
+ */
+unsigned int pf_purge_states_collect = 64;
+
+void
+pf_purge_states_tick(void *null)
+{
+       unsigned int limit = pf_status.states;
+       unsigned int interval = pf_default_rule.timeout[PFTM_INTERVAL];
+
+       if (limit == 0) {
+               timeout_add_sec(&pf_purge_states_to, 1);
+               return;
+       }
+
+       /*
+        * process a fraction of the state table every second
+        */
+       if (interval > 1)
+               limit /= interval;
+
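+       /* hand this interval's scan budget over to the purge task */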
+       pf_purge_states_limit = limit;
+       task_add(systqmp, &pf_purge_states_task);
+}
+
 void
-pf_purge_timeout(void *unused)
+pf_purge_states(void *null)
 {
-       /* XXX move to systqmp to avoid KERNEL_LOCK */
-       task_add(systq, &pf_purge_task);
+       unsigned int limit;
+       unsigned int scanned;
+
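+       /* scan at least a full batch, even if the budget is smaller */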
+       limit = pf_purge_states_limit;
+       if (limit < pf_purge_states_collect)
+               limit = pf_purge_states_collect;
+
+       scanned = pf_purge_expired_states(limit, pf_purge_states_collect);
+       if (scanned >= pf_purge_states_limit) {
+               /* we've run out of states to scan this "interval" */
+               timeout_add_sec(&pf_purge_states_to, 1);
+               return;
+       }
+
+       pf_purge_states_limit -= scanned;
+       task_add(systqmp, &pf_purge_states_task);
 }
 
+void            pf_purge_tick(void *);
+struct timeout  pf_purge_to =
+                    TIMEOUT_INITIALIZER(pf_purge_tick, NULL);
+
+void            pf_purge(void *);
+struct task     pf_purge_task =
+                    TASK_INITIALIZER(pf_purge, NULL);
+
 void
-pf_purge(void *xnloops)
+pf_purge_tick(void *null)
 {
-       int *nloops = xnloops;
-
-       /*
-        * process a fraction of the state table every second
-        * Note:
-        *     we no longer need PF_LOCK() here, because
-        *     pf_purge_expired_states() uses pf_state_lock to maintain
-        *     consistency.
-        */
-       if (pf_default_rule.timeout[PFTM_INTERVAL] > 0)
-               pf_purge_expired_states(1 + (pf_status.states
-                   / pf_default_rule.timeout[PFTM_INTERVAL]));
+       task_add(systqmp, &pf_purge_task);
+}
 
-       NET_LOCK();
+void
+pf_purge(void *null)
+{
+       unsigned int interval = max(1, pf_default_rule.timeout[PFTM_INTERVAL]);
 
        PF_LOCK();
-       /* purge other expired types every PFTM_INTERVAL seconds */
-       if (++(*nloops) >= pf_default_rule.timeout[PFTM_INTERVAL])
-               pf_purge_expired_src_nodes();
-       PF_UNLOCK();
 
+       pf_purge_expired_src_nodes();
+
+       PF_UNLOCK();
        /*
         * Fragments don't require PF_LOCK(), they use their own lock.
         */
-       if ((*nloops) >= pf_default_rule.timeout[PFTM_INTERVAL]) {
-               pf_purge_expired_fragments();
-               *nloops = 0;
-       }
-       NET_UNLOCK();
-
-       timeout_add_sec(&pf_purge_to, 1);
+       pf_purge_expired_fragments();
+       /* interpret the interval as idle time between runs */
+       timeout_add_sec(&pf_purge_to, interval);
 }
 
 int32_t
@@ -1717,6 +1761,8 @@ pf_remove_state(struct pf_state *st)
        if (st->timeout == PFTM_UNLINKED)
                return;
 
+       st->timeout = PFTM_UNLINKED;
+
        /* handle load balancing related tasks */
        pf_postprocess_addr(st);
 
@@ -1741,7 +1787,6 @@ pf_remove_state(struct pf_state *st)
 #if NPFSYNC > 0
        pfsync_delete_state(st);
 #endif /* NPFSYNC > 0 */
-       st->timeout = PFTM_UNLINKED;
        pf_src_tree_remove_state(st);
        pf_detach_state(st);
 }
@@ -1795,6 +1840,7 @@ pf_free_state(struct pf_state *st)
        if (pfsync_state_in_use(st))
                return;
 #endif /* NPFSYNC > 0 */
+
        KASSERT(st->timeout == PFTM_UNLINKED);
        if (--st->rule.ptr->states_cur == 0 &&
            st->rule.ptr->src_nodes == 0)
@@ -1819,8 +1865,8 @@ pf_free_state(struct pf_state *st)
        pf_status.states--;
 }
 
-void
-pf_purge_expired_states(u_int32_t maxcheck)
+unsigned int
+pf_purge_expired_states(const unsigned int limit, const unsigned int collect)
 {
        /*
         * this task/thread/context/whatever is the only thing that
@@ -1834,6 +1880,8 @@ pf_purge_expired_states(u_int32_t maxcheck)
        struct pf_state         *st;
        SLIST_HEAD(pf_state_gcl, pf_state) gcl = SLIST_HEAD_INITIALIZER(gcl);
        time_t                   now;
+       unsigned int             scanned;
+       unsigned int             collected = 0;
 
        PF_ASSERT_UNLOCKED();
 
@@ -1847,7 +1895,7 @@ pf_purge_expired_states(u_int32_t maxcheck)
        if (head == NULL) {
                /* the list is empty */
                rw_exit_read(&pf_state_list.pfs_rwl);
-               return;
+               return (limit);
        }
 
        /* (re)start at the front of the list */
@@ -1856,13 +1904,17 @@ pf_purge_expired_states(u_int32_t maxcheck)
 
        now = getuptime();
 
-       do {
+       for (scanned = 0; scanned < limit; scanned++) {
                uint8_t stimeout = cur->timeout;
+               unsigned int limited = 0;
 
                if ((stimeout == PFTM_UNLINKED) ||
                    (pf_state_expires(cur, stimeout) <= now)) {
                        st = pf_state_ref(cur);
                        SLIST_INSERT_HEAD(&gcl, st, gc_list);
+
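+                       /* stop scanning once a full batch is collected */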
+                       if (++collected >= collect)
+                               limited = 1;
                }
 
                /* don't iterate past the end of our view of the list */
@@ -1872,14 +1924,18 @@ pf_purge_expired_states(u_int32_t maxcheck)
                }
 
                cur = TAILQ_NEXT(cur, entry_list);
-       } while (maxcheck--);
+
+               /* don't spend too much time here. */
+               if (ISSET(READ_ONCE(curcpu()->ci_schedstate.spc_schedflags),
+                    SPCF_SHOULDYIELD) || limited)
+                       break;
+       }
 
        rw_exit_read(&pf_state_list.pfs_rwl);
 
        if (SLIST_EMPTY(&gcl))
-               return;
+               return (scanned);
 
-       NET_LOCK();
        rw_enter_write(&pf_state_list.pfs_rwl);
        PF_LOCK();
        PF_STATE_ENTER_WRITE();
@@ -1892,12 +1948,13 @@ pf_purge_expired_states(u_int32_t maxcheck)
        PF_STATE_EXIT_WRITE();
        PF_UNLOCK();
        rw_exit_write(&pf_state_list.pfs_rwl);
-       NET_UNLOCK();
 
        while ((st = SLIST_FIRST(&gcl)) != NULL) {
                SLIST_REMOVE_HEAD(&gcl, gc_list);
                pf_state_unref(st);
        }
+
+       return (scanned);
 }
 
 int
@@ -4262,8 +4319,7 @@ next_rule:
 
 int
 pf_test_rule(struct pf_pdesc *pd, struct pf_rule **rm, struct pf_state **sm,
-    struct pf_rule **am, struct pf_ruleset **rsm, u_short *reason,
-    struct pfsync_deferral **pdeferral)
+    struct pf_rule **am, struct pf_ruleset **rsm, u_short *reason)
 {
        struct pf_rule          *r = NULL;
        struct pf_rule          *a = NULL;
@@ -4475,7 +4531,7 @@ pf_test_rule(struct pf_pdesc *pd, struct pf_rule **rm, struct pf_state **sm,
                 * firewall has to know about it to allow
                 * replies through it.
                 */
-               if (pfsync_defer(*sm, pd->m, pdeferral))
+               if (pfsync_defer(*sm, pd->m))
                        return (PF_DEFER);
        }
 #endif /* NPFSYNC > 0 */
@@ -4517,6 +4573,8 @@ pf_create_state(struct pf_pdesc *pd, struct pf_rule *r, struct pf_rule *a,
                st->state_flags |= PFSTATE_SLOPPY;
        if (r->rule_flag & PFRULE_PFLOW)
                st->state_flags |= PFSTATE_PFLOW;
+       if (r->rule_flag & PFRULE_NOSYNC)
+               st->state_flags |= PFSTATE_NOSYNC;
 #if NPFLOG > 0
        st->log = act->log & PF_LOG_ALL;
 #endif /* NPFLOG > 0 */
@@ -4535,6 +4593,7 @@ pf_create_state(struct pf_pdesc *pd, struct pf_rule *r, struct pf_rule *a,
        st->set_prio[1] = act->set_prio[1];
        st->delay = act->delay;
        SLIST_INIT(&st->src_nodes);
+
        /*
         * must initialize refcnt, before pf_state_insert() gets called.
         * pf_state_inserts() grabs reference for pfsync!
@@ -7462,7 +7521,6 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp, struct mbuf **m0)
        int                      dir = (fwdir == PF_FWD) ? PF_OUT : fwdir;
        u_int32_t                qid, pqid = 0;
        int                      have_pf_lock = 0;
-       struct pfsync_deferral  *deferral = NULL;
 
        if (!pf_status.running)
                return (PF_PASS);
@@ -7565,8 +7623,7 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp, struct mbuf **m0)
                 */
                PF_LOCK();
                have_pf_lock = 1;
-               action = pf_test_rule(&pd, &r, &st, &a, &ruleset, &reason,
-                   &deferral);
+               action = pf_test_rule(&pd, &r, &st, &a, &ruleset, &reason);
                st = pf_state_ref(st);
                if (action != PF_PASS)
                        REASON_SET(&reason, PFRES_FRAG);
@@ -7598,7 +7655,7 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp, struct mbuf **m0)
                        PF_LOCK();
                        have_pf_lock = 1;
                        action = pf_test_rule(&pd, &r, &st, &a, &ruleset,
-                           &reason, &deferral);
+                           &reason);
                        st = pf_state_ref(st);
                }
                break;
@@ -7630,7 +7687,7 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp, struct mbuf **m0)
                        PF_LOCK();
                        have_pf_lock = 1;
                        action = pf_test_rule(&pd, &r, &st, &a, &ruleset,
-                           &reason, &deferral);
+                           &reason);
                        st = pf_state_ref(st);
                }
                break;
@@ -7714,7 +7771,7 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp, struct mbuf **m0)
                        PF_LOCK();
                        have_pf_lock = 1;
                        action = pf_test_rule(&pd, &r, &st, &a, &ruleset,
-                           &reason, &deferral);
+                           &reason);
                        st = pf_state_ref(st);
                }
 
@@ -7854,14 +7911,6 @@ done:
                m_freem(pd.m);
                /* FALLTHROUGH */
        case PF_DEFER:
-#if NPFSYNC > 0
-               /*
-                * We no longer hold PF_LOCK() here, so we can dispatch
-                * deferral if we are asked to do so.
-                */
-               if (deferral != NULL)
-                       pfsync_undefer(deferral, 0);
-#endif /* NPFSYNC > 0 */
                pd.m = NULL;
                action = PF_PASS;
                break;
@@ -8210,7 +8259,7 @@ pf_state_unref(struct pf_state *st)
 #if NPFSYNC > 0
                KASSERT((TAILQ_NEXT(st, sync_list) == NULL) ||
                    ((TAILQ_NEXT(st, sync_list) == _Q_INVALID) &&
-                   (st->sync_state == PFSYNC_S_NONE)));
+                   (st->sync_state >= PFSYNC_S_NONE)));
 #endif /* NPFSYNC */
                KASSERT((TAILQ_NEXT(st, entry_list) == NULL) ||
                    (TAILQ_NEXT(st, entry_list) == _Q_INVALID));
index f20632d..078fa72 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: pf_ioctl.c,v 1.414 2023/07/04 14:23:38 sashan Exp $ */
+/*     $OpenBSD: pf_ioctl.c,v 1.415 2023/07/06 04:55:05 dlg Exp $ */
 
 /*
  * Copyright (c) 2001 Daniel Hartmeier
@@ -1000,13 +1000,14 @@ pf_states_clr(struct pfioc_state_kill *psk)
        }
 
        PF_STATE_EXIT_WRITE();
-#if NPFSYNC > 0
-       pfsync_clear_states(pf_status.hostid, psk->psk_ifname);
-#endif /* NPFSYNC > 0 */
        PF_UNLOCK();
        rw_exit(&pf_state_list.pfs_rwl);
 
        psk->psk_killed = killed;
+
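+       /* pfsync has its own locks; tell it after ours are released */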
+#if NPFSYNC > 0
+       pfsync_clear_states(pf_status.hostid, psk->psk_ifname);
+#endif /* NPFSYNC > 0 */
 unlock:
        NET_UNLOCK();
 
@@ -1190,6 +1191,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p)
                                pf_status.stateid = gettime();
                                pf_status.stateid = pf_status.stateid << 32;
                        }
+                       timeout_add_sec(&pf_purge_states_to, 1);
                        timeout_add_sec(&pf_purge_to, 1);
                        pf_create_queues();
                        DPFPRINTF(LOG_NOTICE, "pf: started");
@@ -2783,8 +2785,9 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p)
                        pf_default_rule.timeout[i] =
                            pf_default_rule_new.timeout[i];
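+                       /*
+                        * requeue the purge ourselves, but only if
+                        * timeout_del says the timeout was still
+                        * pending, so it cannot be run twice.
+                        */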
                        if (pf_default_rule.timeout[i] == PFTM_INTERVAL &&
-                           pf_default_rule.timeout[i] < old)
-                               task_add(net_tq(0), &pf_purge_task);
+                           pf_default_rule.timeout[i] < old &&
+                           timeout_del(&pf_purge_to))
+                               task_add(systqmp, &pf_purge_task);
                }
                pfi_xcommit();
                pf_trans_set_commit();
index 7ab4c00..ef2c884 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: pf_norm.c,v 1.227 2023/05/07 16:23:23 bluhm Exp $ */
+/*     $OpenBSD: pf_norm.c,v 1.228 2023/07/06 04:55:05 dlg Exp $ */
 
 /*
  * Copyright 2001 Niels Provos <provos@citi.umich.edu>
@@ -1098,10 +1098,22 @@ no_fragment:
 }
 #endif /* INET6 */
 
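+/*
+ * pool wrapper functions, exported so code outside pf_norm.c (ie,
+ * pfsync) can allocate and free scrub state.
+ */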
+struct pf_state_scrub *
+pf_state_scrub_get(void)
+{
+       return (pool_get(&pf_state_scrub_pl, PR_NOWAIT | PR_ZERO));
+}
+
+void
+pf_state_scrub_put(struct pf_state_scrub *scrub)
+{
+       pool_put(&pf_state_scrub_pl, scrub);
+}
+
 int
 pf_normalize_tcp_alloc(struct pf_state_peer *src)
 {
-       src->scrub = pool_get(&pf_state_scrub_pl, PR_NOWAIT | PR_ZERO);
+       src->scrub = pf_state_scrub_get();
        if (src->scrub == NULL)
                return (ENOMEM);
 
index 9e86391..27cce82 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: pfvar.h,v 1.532 2023/07/04 11:34:19 sashan Exp $ */
+/*     $OpenBSD: pfvar.h,v 1.533 2023/07/06 04:55:05 dlg Exp $ */
 
 /*
  * Copyright (c) 2001 Daniel Hartmeier
@@ -1604,15 +1604,10 @@ extern void                      pf_tbladdr_remove(struct pf_addr_wrap *);
 extern void                     pf_tbladdr_copyout(struct pf_addr_wrap *);
 extern void                     pf_calc_skip_steps(struct pf_rulequeue *);
 extern void                     pf_purge_expired_src_nodes(void);
-extern void                     pf_purge_expired_states(u_int32_t);
 extern void                     pf_purge_expired_rules(void);
 extern void                     pf_remove_state(struct pf_state *);
 extern void                     pf_remove_divert_state(struct pf_state_key *);
 extern void                     pf_free_state(struct pf_state *);
-extern int                      pf_state_insert(struct pfi_kif *,
-                                   struct pf_state_key **,
-                                   struct pf_state_key **,
-                                   struct pf_state *);
 int                             pf_insert_src_node(struct pf_src_node **,
                                    struct pf_rule *, enum pf_sn_types,
                                    sa_family_t, struct pf_addr *,
@@ -1676,6 +1671,10 @@ int      pf_match_port(u_int8_t, u_int16_t, u_int16_t, u_int16_t);
 int    pf_match_uid(u_int8_t, uid_t, uid_t, uid_t);
 int    pf_match_gid(u_int8_t, gid_t, gid_t, gid_t);
 
+struct pf_state_scrub *
+       pf_state_scrub_get(void);
+void   pf_state_scrub_put(struct pf_state_scrub *);
+
 int    pf_refragment6(struct mbuf **, struct m_tag *mtag,
            struct sockaddr_in6 *, struct ifnet *, struct rtentry *);
 void   pf_normalize_init(void);
index e9e80f6..53d9834 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: pfvar_priv.h,v 1.33 2023/05/10 22:42:51 sashan Exp $  */
+/*     $OpenBSD: pfvar_priv.h,v 1.34 2023/07/06 04:55:05 dlg Exp $     */
 
 /*
  * Copyright (c) 2001 Daniel Hartmeier
 #include <sys/mutex.h>
 #include <sys/percpu.h>
 
+struct pfsync_deferral;
+
+/*
+ * pf state items - links from pf_state_key to pf_states
+ */
+
 struct pf_state_item {
        TAILQ_ENTRY(pf_state_item)
                                 si_entry;
@@ -49,6 +55,10 @@ struct pf_state_item {
 
 TAILQ_HEAD(pf_statelisthead, pf_state_item);
 
+/*
+ * pf state keys - look up states by address
+ */
+
 struct pf_state_key {
        struct pf_addr   addr[2];
        u_int16_t        port[2];
@@ -73,11 +83,13 @@ RBT_PROTOTYPE(pf_state_tree, pf_state_key, sk_entry, pf_state_compare_key);
         (key[PF_SK_WIRE]->af != (family)))
 
 /*
+ * pf state
+ *
  * Protection/ownership of pf_state members:
  *     I       immutable after pf_state_insert()
  *     M       pf_state mtx
  *     P       PF_STATE_LOCK
- *     S       pfsync mutex
+ *     S       pfsync
  *     L       pf_state_list
  *     g       pf_purge gc
  */
@@ -89,7 +101,7 @@ struct pf_state {
        u_int8_t                 pad[3];
 
        TAILQ_ENTRY(pf_state)    sync_list;     /* [S] */
-       TAILQ_ENTRY(pf_state)    sync_snap;     /* [S] */
+       struct pfsync_deferral  *sync_defer;    /* [S] */
        TAILQ_ENTRY(pf_state)    entry_list;    /* [L] */
        SLIST_ENTRY(pf_state)    gc_list;       /* [g] */
        RB_ENTRY(pf_state)       entry_id;      /* [P] */
@@ -101,7 +113,7 @@ struct pf_state {
        union pf_rule_ptr        natrule;       /* [I] */
        struct pf_addr           rt_addr;       /* [I] */
        struct pf_sn_head        src_nodes;     /* [I] */
-       struct pf_state_key     *key[2];        /* [I] stack and wire  */
+       struct pf_state_key     *key[2];        /* [I] stack and wire */
        struct pfi_kif          *kif;           /* [I] */
        struct mutex             mtx;
        pf_refcnt_t              refcnt;
@@ -109,16 +121,16 @@ struct pf_state {
        u_int64_t                bytes[2];
        int32_t                  creation;      /* [I] */
        int32_t                  expire;
-       int32_t                  pfsync_time;
-       int                      rtableid[2];   /* [I] rtables stack and wire */
+       int32_t                  pfsync_time;   /* [S] */
+       int                      rtableid[2];   /* [I] stack and wire */
        u_int16_t                qid;           /* [I] */
        u_int16_t                pqid;          /* [I] */
        u_int16_t                tag;           /* [I] */
-       u_int16_t                state_flags;
+       u_int16_t                state_flags;   /* [M] */
        u_int8_t                 log;           /* [I] */
        u_int8_t                 timeout;
-       u_int8_t                 sync_state;    /* PFSYNC_S_x */
-       u_int8_t                 sync_updates;
+       u_int8_t                 sync_state;    /* [S] PFSYNC_S_x */
+       u_int8_t                 sync_updates;  /* [S] */
        u_int8_t                 min_ttl;       /* [I] */
        u_int8_t                 set_tos;       /* [I] */
        u_int8_t                 set_prio[2];   /* [I] */
@@ -127,7 +139,6 @@ struct pf_state {
        u_int16_t                if_index_out;  /* [I] */
        u_int16_t                delay;         /* [I] */
        u_int8_t                 rt;            /* [I] */
-       u_int8_t                 snapped;       /* [S] */
 };
 
 RBT_HEAD(pf_state_tree_id, pf_state);
@@ -345,6 +356,7 @@ struct pf_trans {
 #define pftgr_anchor   u.u_getrule.gr_anchor
 #define pftgr_rule     u.u_getrule.gr_rule
 
+extern struct timeout  pf_purge_states_to;
 extern struct task     pf_purge_task;
 extern struct timeout  pf_purge_to;
 
@@ -397,9 +409,6 @@ extern struct rwlock        pf_state_lock;
                            rw_status(&pf_state_lock), __func__);\
        } while (0)
 
-extern void                     pf_purge_timeout(void *);
-extern void                     pf_purge(void *);
-
 /* for copies to/from network byte order */
 void                   pf_state_peer_hton(const struct pf_state_peer *,
                            struct pfsync_state_peer *);
index 526596e..d2e67c6 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: in_proto.c,v 1.101 2023/05/18 09:59:43 mvs Exp $      */
+/*     $OpenBSD: in_proto.c,v 1.102 2023/07/06 04:55:05 dlg Exp $      */
 /*     $NetBSD: in_proto.c,v 1.14 1996/02/18 18:58:32 christos Exp $   */
 
 /*
@@ -343,7 +343,7 @@ const struct protosw inetsw[] = {
   .pr_domain   = &inetdomain,
   .pr_protocol = IPPROTO_PFSYNC,
   .pr_flags    = PR_ATOMIC|PR_ADDR,
-  .pr_input    = pfsync_input,
+  .pr_input    = pfsync_input4,
   .pr_ctloutput        = rip_ctloutput,
   .pr_usrreqs  = &rip_usrreqs,
   .pr_sysctl   = pfsync_sysctl
index f7e621e..5da3ad7 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: ip_ipsp.h,v 1.240 2022/07/14 13:52:10 mvs Exp $       */
+/*     $OpenBSD: ip_ipsp.h,v 1.241 2023/07/06 04:55:05 dlg Exp $       */
 /*
  * The authors of this code are John Ioannidis (ji@tla.org),
  * Angelos D. Keromytis (kermit@csd.uch.gr),
@@ -50,6 +50,7 @@
  *     P       ipo_tdb_mtx             link policy to TDB global mutex
  *     D       tdb_sadb_mtx            SA database global mutex
  *     m       tdb_mtx                 fields of struct tdb
+ *     S       pfsync                  fields of struct tdb
  */
 
 /* IPSP global definitions. */
@@ -405,7 +406,6 @@ struct tdb {                                /* tunnel descriptor block */
        u_int8_t        tdb_sproto;     /* [I] IPsec protocol */
        u_int8_t        tdb_wnd;        /* Replay window */
        u_int8_t        tdb_satype;     /* SA type (RFC2367, PF_KEY) */
-       u_int8_t        tdb_updates;    /* pfsync update counter */
 
        union sockaddr_union    tdb_dst;        /* [N] Destination address */
        union sockaddr_union    tdb_src;        /* [N] Source address */
@@ -439,8 +439,8 @@ struct tdb {                                /* tunnel descriptor block */
        struct sockaddr_encap   tdb_filtermask; /* And the mask */
 
        TAILQ_HEAD(tdb_policy_head, ipsec_policy) tdb_policy_head; /* [P] */
-       TAILQ_ENTRY(tdb)        tdb_sync_entry;
-       TAILQ_ENTRY(tdb)        tdb_sync_snap;
+       TAILQ_ENTRY(tdb)        tdb_sync_entry; /* [S] pfsync tdb queue */
+       u_int32_t       tdb_updates;    /* [S] pfsync update counter */
 };
 
 enum tdb_counters {