From cc90b7e6d535f1994553796d023b32c530e92873 Mon Sep 17 00:00:00 2001
From: dlg
Date: Thu, 6 Jul 2023 04:55:04 +0000
Subject: [PATCH] big update to pfsync to try and clean up locking in particular.

moving pf forward has been a real struggle, and pfsync has been a
constant source of pain. we have been papering over the problems for a
while now, but it reached the point that it needed a fundamental
restructure, which is what this diff is.

the big headliner changes in this diff are:

- pfsync specific locks

  this is the whole reason for this diff. rather than rely on NET_LOCK
  or KERNEL_LOCK or whatever, pfsync now has its own locks to protect
  its internal data structures. this is important because pfsync runs a
  bunch of timeouts and tasks to push pfsync packets out on the wire,
  or when it's handling requests generated by incoming pfsync packets,
  both of which happen outside pf itself running. having pfsync
  specific locks around pfsync data structures makes the mutations of
  these data structures a lot more explicit and auditable.

- partitioning

  to enable future parallelisation of the network stack, this rewrite
  includes support for pfsync to partition states into different
  "slices". these slices run independently, ie, the states collected by
  one slice are serialised into a separate packet from the states
  collected and serialised by another slice. states are mapped to
  pfsync slices based on the pf state hash, which is the same hash that
  the rest of the network stack and multiq hardware uses.

- no more pfsync called from netisr

  pfsync used to be called from netisr to try and bundle packets, but
  now that there's multiple pfsync slices this doesn't make sense.
  instead it uses tasks in softnet tqs.

- improved bulk transfer handling

  there's shiny new state machines around both the bulk transmit and
  receive handling. pfsync used to do horrible things to carp demotion
  counters, but now it is very predictable and returns the counters
  back where they started.

- better tdb handling

  the tdb handling was pretty hairy, but hrvoje has kicked this around
  a lot with ipsec and sasyncd and we've found and fixed a bunch of
  issues as a result of that testing.

- mpsafe pf state purges

  this was committed previously, but because the locks pfsync relied on
  weren't clear this just caused a ton of bugs. as part of this diff
  it's now reliable, and moves a big chunk of work out from under
  KERNEL_LOCK, which in turn improves the responsiveness and throughput
  of a firewall even if you're not using pfsync.

there's a bunch of other little changes along the way, but the above
are the big ones.

hrvoje has done performance testing with this diff and notes a big
improvement when pfsync is not in use. performance when pfsync is
enabled is about the same, but I'm hoping the slices mean we can scale
along with pf as it improves.
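
to make the partitioning concrete, this is the slice lookup as it
appears in the diff below (pfsync_slice_enter); with PFSYNC_SLICE_BITS
set to 1 there are currently two slices, each with its own mutex and
its own contention accounting in the per-slice kstats:

    static struct pfsync_slice *
    pfsync_slice_enter(struct pfsync_softc *sc, const struct pf_state *st)
    {
            /* the pf state hash picks the slice: same flow, same slice */
            unsigned int idx = st->key[0]->hash % nitems(sc->sc_slices);
            struct pfsync_slice *s = &sc->sc_slices[idx];

            /* per-slice mutex; count acquisitions and contention */
            if (!mtx_enter_try(&s->s_mtx)) {
                    mtx_enter(&s->s_mtx);
                    s->s_stat_contended++;
            }
            s->s_stat_locks++;

            return (s);
    }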
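
for reference, the receive-side bulk machinery boils down to the
states below (the enum is from the diff; the diagram is just a summary
of the transitions in pfsync_bulk_req_evt):

    /*
     * happy path:
     *
     *   NONE --up--> START --link--> SENT --bus-start--> BULK
     *                                         --bus-end--> DONE
     *
     * timeouts in SENT and BULK resend the update request, up to
     * PFSYNC_MAX_BULKTRIES times. carp demotion is adjusted once on
     * the way up and given back exactly once when DONE (or DOWN) is
     * reached, which is how the counters end up back where they
     * started.
     */
    enum pfsync_bulk_req_state {
            PFSYNC_BREQ_S_NONE,
            PFSYNC_BREQ_S_START,
            PFSYNC_BREQ_S_SENT,
            PFSYNC_BREQ_S_BULK,
            PFSYNC_BREQ_S_DONE,
    };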
lots (months) of testing by me and hrvoje on pfsync boxes tests and ok sashan@ deraadt@ says this is a good time to put it in --- sys/net/if.c | 10 +- sys/net/if_pfsync.c | 4525 +++++++++++++++++++++++----------------- sys/net/if_pfsync.h | 30 +- sys/net/netisr.h | 4 +- sys/net/pf.c | 215 +- sys/net/pf_ioctl.c | 15 +- sys/net/pf_norm.c | 16 +- sys/net/pfvar.h | 11 +- sys/net/pfvar_priv.h | 35 +- sys/netinet/in_proto.c | 4 +- sys/netinet/ip_ipsp.h | 8 +- 11 files changed, 2809 insertions(+), 2064 deletions(-) diff --git a/sys/net/if.c b/sys/net/if.c index 30a36d844a6..1cecfdf0224 100644 --- a/sys/net/if.c +++ b/sys/net/if.c @@ -1,4 +1,4 @@ -/* $OpenBSD: if.c,v 1.703 2023/07/04 13:37:47 jan Exp $ */ +/* $OpenBSD: if.c,v 1.704 2023/07/06 04:55:04 dlg Exp $ */ /* $NetBSD: if.c,v 1.35 1996/05/07 05:26:04 thorpej Exp $ */ /* @@ -1034,14 +1034,6 @@ if_netisr(void *unused) t |= n; } -#if NPFSYNC > 0 - if (t & (1 << NETISR_PFSYNC)) { - KERNEL_LOCK(); - pfsyncintr(); - KERNEL_UNLOCK(); - } -#endif - NET_UNLOCK(); } diff --git a/sys/net/if_pfsync.c b/sys/net/if_pfsync.c index 2457796b600..bf685712ec9 100644 --- a/sys/net/if_pfsync.c +++ b/sys/net/if_pfsync.c @@ -1,4 +1,4 @@ -/* $OpenBSD: if_pfsync.c,v 1.317 2023/06/05 08:45:20 sashan Exp $ */ +/* $OpenBSD: if_pfsync.c,v 1.318 2023/07/06 04:55:05 dlg Exp $ */ /* * Copyright (c) 2002 Michael Shalayeff @@ -27,7 +27,7 @@ */ /* - * Copyright (c) 2009 David Gwynne + * Copyright (c) 2009, 2022, 2023 David Gwynne * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -42,6 +42,10 @@ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ +#include "bpfilter.h" +#include "pfsync.h" +#include "kstat.h" + #include #include #include @@ -54,6 +58,12 @@ #include #include #include +#include +#include +#include +#include +#include +#include #include #include @@ -85,226 +95,257 @@ #include #endif -#define PF_DEBUGNAME "pfsync: " #include #include #include -#include "bpfilter.h" -#include "pfsync.h" - -#define PFSYNC_DEFER_NSEC 20000000ULL - #define PFSYNC_MINPKT ( \ sizeof(struct ip) + \ sizeof(struct pfsync_header)) -int pfsync_upd_tcp(struct pf_state *, struct pfsync_state_peer *, - struct pfsync_state_peer *); - -int pfsync_in_clr(caddr_t, int, int, int); -int pfsync_in_iack(caddr_t, int, int, int); -int pfsync_in_upd_c(caddr_t, int, int, int); -int pfsync_in_ureq(caddr_t, int, int, int); -int pfsync_in_del(caddr_t, int, int, int); -int pfsync_in_del_c(caddr_t, int, int, int); -int pfsync_in_bus(caddr_t, int, int, int); -int pfsync_in_tdb(caddr_t, int, int, int); -int pfsync_in_ins(caddr_t, int, int, int); -int pfsync_in_upd(caddr_t, int, int, int); -int pfsync_in_eof(caddr_t, int, int, int); - -int pfsync_in_error(caddr_t, int, int, int); - -void pfsync_update_state_locked(struct pf_state *); - -const struct { - int (*in)(caddr_t, int, int, int); - size_t len; -} pfsync_acts[] = { - /* PFSYNC_ACT_CLR */ - { pfsync_in_clr, sizeof(struct pfsync_clr) }, - /* PFSYNC_ACT_OINS */ - { pfsync_in_error, 0 }, - /* PFSYNC_ACT_INS_ACK */ - { pfsync_in_iack, sizeof(struct pfsync_ins_ack) }, - /* PFSYNC_ACT_OUPD */ - { pfsync_in_error, 0 }, - /* PFSYNC_ACT_UPD_C */ - { pfsync_in_upd_c, sizeof(struct pfsync_upd_c) }, - /* PFSYNC_ACT_UPD_REQ */ - { pfsync_in_ureq, sizeof(struct pfsync_upd_req) }, - /* PFSYNC_ACT_DEL */ - { pfsync_in_del, sizeof(struct pfsync_state) }, - /* PFSYNC_ACT_DEL_C */ - { pfsync_in_del_c, sizeof(struct pfsync_del_c) }, - /* PFSYNC_ACT_INS_F */ - { 
pfsync_in_error, 0 }, - /* PFSYNC_ACT_DEL_F */ - { pfsync_in_error, 0 }, - /* PFSYNC_ACT_BUS */ - { pfsync_in_bus, sizeof(struct pfsync_bus) }, - /* PFSYNC_ACT_OTDB */ - { pfsync_in_error, 0 }, - /* PFSYNC_ACT_EOF */ - { pfsync_in_error, 0 }, - /* PFSYNC_ACT_INS */ - { pfsync_in_ins, sizeof(struct pfsync_state) }, - /* PFSYNC_ACT_UPD */ - { pfsync_in_upd, sizeof(struct pfsync_state) }, - /* PFSYNC_ACT_TDB */ - { pfsync_in_tdb, sizeof(struct pfsync_tdb) }, -}; +struct pfsync_softc; -struct pfsync_q { - void (*write)(struct pf_state *, void *); - size_t len; - u_int8_t action; +struct pfsync_deferral { + TAILQ_ENTRY(pfsync_deferral) pd_entry; + struct pf_state *pd_st; + struct mbuf *pd_m; + uint64_t pd_deadline; }; +TAILQ_HEAD(pfsync_deferrals, pfsync_deferral); -/* we have one of these for every PFSYNC_S_ */ -void pfsync_out_state(struct pf_state *, void *); -void pfsync_out_iack(struct pf_state *, void *); -void pfsync_out_upd_c(struct pf_state *, void *); -void pfsync_out_del(struct pf_state *, void *); +#define PFSYNC_DEFER_NSEC 20000000ULL +#define PFSYNC_DEFER_LIMIT 128 +#define PFSYNC_BULK_SND_IVAL_MS 20 -struct pfsync_q pfsync_qs[] = { - { pfsync_out_iack, sizeof(struct pfsync_ins_ack), PFSYNC_ACT_INS_ACK }, - { pfsync_out_upd_c, sizeof(struct pfsync_upd_c), PFSYNC_ACT_UPD_C }, - { pfsync_out_del, sizeof(struct pfsync_del_c), PFSYNC_ACT_DEL_C }, - { pfsync_out_state, sizeof(struct pfsync_state), PFSYNC_ACT_INS }, - { pfsync_out_state, sizeof(struct pfsync_state), PFSYNC_ACT_UPD } +static struct pool pfsync_deferrals_pool; + +enum pfsync_bulk_req_state { + PFSYNC_BREQ_S_NONE, + PFSYNC_BREQ_S_START, + PFSYNC_BREQ_S_SENT, + PFSYNC_BREQ_S_BULK, + PFSYNC_BREQ_S_DONE, }; -void pfsync_q_ins(struct pf_state *, int); -void pfsync_q_del(struct pf_state *); +static const char *pfsync_bulk_req_state_names[] = { + [PFSYNC_BREQ_S_NONE] = "none", + [PFSYNC_BREQ_S_START] = "start", + [PFSYNC_BREQ_S_SENT] = "sent", + [PFSYNC_BREQ_S_BULK] = "bulk", + [PFSYNC_BREQ_S_DONE] = "done", +}; -struct pfsync_upd_req_item { - TAILQ_ENTRY(pfsync_upd_req_item) ur_entry; - TAILQ_ENTRY(pfsync_upd_req_item) ur_snap; - struct pfsync_upd_req ur_msg; +enum pfsync_bulk_req_event { + PFSYNC_BREQ_EVT_UP, + PFSYNC_BREQ_EVT_DOWN, + PFSYNC_BREQ_EVT_TMO, + PFSYNC_BREQ_EVT_LINK, + PFSYNC_BREQ_EVT_BUS_START, + PFSYNC_BREQ_EVT_BUS_END, }; -TAILQ_HEAD(pfsync_upd_reqs, pfsync_upd_req_item); -struct pfsync_deferral { - TAILQ_ENTRY(pfsync_deferral) pd_entry; - struct pf_state *pd_st; - struct mbuf *pd_m; - uint64_t pd_deadline; +static const char *pfsync_bulk_req_event_names[] = { + [PFSYNC_BREQ_EVT_UP] = "up", + [PFSYNC_BREQ_EVT_DOWN] = "down", + [PFSYNC_BREQ_EVT_TMO] = "timeout", + [PFSYNC_BREQ_EVT_LINK] = "link", + [PFSYNC_BREQ_EVT_BUS_START] = "bus-start", + [PFSYNC_BREQ_EVT_BUS_END] = "bus-end", }; -TAILQ_HEAD(pfsync_deferrals, pfsync_deferral); -#define PFSYNC_PLSIZE MAX(sizeof(struct pfsync_upd_req_item), \ - sizeof(struct pfsync_deferral)) +struct pfsync_slice { + struct pfsync_softc *s_pfsync; + struct mutex s_mtx; -void pfsync_out_tdb(struct tdb *, void *); + struct pf_state_queue s_qs[PFSYNC_S_COUNT]; + TAILQ_HEAD(, tdb) s_tdb_q; + size_t s_len; + struct mbuf_list s_ml; + + struct taskq *s_softnet; + struct task s_task; + struct timeout s_tmo; + + struct mbuf_queue s_sendq; + struct task s_send; + + struct pfsync_deferrals s_deferrals; + unsigned int s_deferred; + struct task s_deferrals_task; + struct timeout s_deferrals_tmo; + + uint64_t s_stat_locks; + uint64_t s_stat_contended; + uint64_t s_stat_write_nop; + uint64_t 
s_stat_task_add; + uint64_t s_stat_task_run; + uint64_t s_stat_enqueue; + uint64_t s_stat_dequeue; + + uint64_t s_stat_defer_add; + uint64_t s_stat_defer_ack; + uint64_t s_stat_defer_run; + uint64_t s_stat_defer_overlimit; + + struct kstat *s_kstat; +} __aligned(CACHELINESIZE); + +#define PFSYNC_SLICE_BITS 1 +#define PFSYNC_NSLICES (1 << PFSYNC_SLICE_BITS) struct pfsync_softc { struct ifnet sc_if; + unsigned int sc_dead; + unsigned int sc_up; + struct refcnt sc_refs; + + /* config */ + struct in_addr sc_syncpeer; + unsigned int sc_maxupdates; + unsigned int sc_defer; + + /* operation */ unsigned int sc_sync_ifidx; + unsigned int sc_sync_if_down; + void *sc_inm; + struct task sc_ltask; + struct task sc_dtask; + struct ip sc_template; - struct pool sc_pool; + struct pfsync_slice sc_slices[PFSYNC_NSLICES]; - struct ip_moptions sc_imo; + struct { + struct rwlock req_lock; + struct timeout req_tmo; + enum pfsync_bulk_req_state req_state; + unsigned int req_tries; + unsigned int req_demoted; + } sc_bulk_req; - struct in_addr sc_sync_peer; - u_int8_t sc_maxupdates; + struct { + struct rwlock snd_lock; + struct timeout snd_tmo; + time_t snd_requested; + + struct pf_state *snd_next; + struct pf_state *snd_tail; + unsigned int snd_again; + } sc_bulk_snd; +}; - struct ip sc_template; +static struct pfsync_softc *pfsyncif = NULL; +static struct cpumem *pfsynccounters; - struct pf_state_queue sc_qs[PFSYNC_S_COUNT]; - struct mutex sc_st_mtx; - size_t sc_len; +static inline void +pfsyncstat_inc(enum pfsync_counters c) +{ + counters_inc(pfsynccounters, c); +} - struct pfsync_upd_reqs sc_upd_req_list; - struct mutex sc_upd_req_mtx; +static int pfsync_clone_create(struct if_clone *, int); +static int pfsync_clone_destroy(struct ifnet *); - int sc_initial_bulk; - int sc_link_demoted; +static int pfsync_output(struct ifnet *, struct mbuf *, struct sockaddr *, + struct rtentry *); +static void pfsync_start(struct ifqueue *); - int sc_defer; - struct pfsync_deferrals sc_deferrals; - u_int sc_deferred; - struct mutex sc_deferrals_mtx; - struct timeout sc_deferrals_tmo; +static int pfsync_ioctl(struct ifnet *, u_long, caddr_t); +static int pfsync_up(struct pfsync_softc *); +static int pfsync_down(struct pfsync_softc *); - void *sc_plus; - size_t sc_pluslen; +static int pfsync_set_mtu(struct pfsync_softc *, unsigned int); +static int pfsync_set_parent(struct pfsync_softc *, + const struct if_parent *); +static int pfsync_get_parent(struct pfsync_softc *, struct if_parent *); +static int pfsync_del_parent(struct pfsync_softc *); - u_int32_t sc_ureq_sent; - int sc_bulk_tries; - struct timeout sc_bulkfail_tmo; +static int pfsync_get_ioc(struct pfsync_softc *, struct ifreq *); +static int pfsync_set_ioc(struct pfsync_softc *, struct ifreq *); - u_int32_t sc_ureq_received; - struct pf_state *sc_bulk_next; - struct pf_state *sc_bulk_last; - struct timeout sc_bulk_tmo; +static void pfsync_syncif_link(void *); +static void pfsync_syncif_detach(void *); - TAILQ_HEAD(, tdb) sc_tdb_q; - struct mutex sc_tdb_mtx; +static void pfsync_sendout(struct pfsync_softc *, struct mbuf *); +static void pfsync_slice_drop(struct pfsync_softc *, struct pfsync_slice *); - struct task sc_ltask; - struct task sc_dtask; +static void pfsync_slice_tmo(void *); +static void pfsync_slice_task(void *); +static void pfsync_slice_sendq(void *); + +static void pfsync_deferrals_tmo(void *); +static void pfsync_deferrals_task(void *); +static void pfsync_defer_output(struct pfsync_deferral *); + +static void pfsync_bulk_req_evt(struct pfsync_softc *, + 
enum pfsync_bulk_req_event); +static void pfsync_bulk_req_tmo(void *); + +static void pfsync_bulk_snd_tmo(void *); + +#if NKSTAT > 0 +struct pfsync_kstat_data { + struct kstat_kv pd_locks; + struct kstat_kv pd_contended; + struct kstat_kv pd_write_nop; + struct kstat_kv pd_task_add; + struct kstat_kv pd_task_run; + struct kstat_kv pd_enqueue; + struct kstat_kv pd_dequeue; + struct kstat_kv pd_qdrop; - struct timeout sc_tmo; + struct kstat_kv pd_defer_len; + struct kstat_kv pd_defer_add; + struct kstat_kv pd_defer_ack; + struct kstat_kv pd_defer_run; + struct kstat_kv pd_defer_overlimit; }; -struct pfsync_snapshot { - struct pfsync_softc *sn_sc; - struct pf_state_queue sn_qs[PFSYNC_S_COUNT]; - struct pfsync_upd_reqs sn_upd_req_list; - TAILQ_HEAD(, tdb) sn_tdb_q; - size_t sn_len; - void *sn_plus; - size_t sn_pluslen; +static const struct pfsync_kstat_data pfsync_kstat_tpl = { + KSTAT_KV_INITIALIZER("locks", KSTAT_KV_T_COUNTER64), + KSTAT_KV_INITIALIZER("contended", KSTAT_KV_T_COUNTER64), + KSTAT_KV_INITIALIZER("write-nops", KSTAT_KV_T_COUNTER64), + KSTAT_KV_INITIALIZER("send-sched", KSTAT_KV_T_COUNTER64), + KSTAT_KV_INITIALIZER("send-run", KSTAT_KV_T_COUNTER64), + KSTAT_KV_INITIALIZER("enqueues", KSTAT_KV_T_COUNTER64), + KSTAT_KV_INITIALIZER("dequeues", KSTAT_KV_T_COUNTER64), + KSTAT_KV_UNIT_INITIALIZER("qdrops", + KSTAT_KV_T_COUNTER32, KSTAT_KV_U_PACKETS), + + KSTAT_KV_UNIT_INITIALIZER("defer-len", + KSTAT_KV_T_COUNTER32, KSTAT_KV_U_PACKETS), + KSTAT_KV_INITIALIZER("defer-add", KSTAT_KV_T_COUNTER64), + KSTAT_KV_INITIALIZER("defer-ack", KSTAT_KV_T_COUNTER64), + KSTAT_KV_INITIALIZER("defer-run", KSTAT_KV_T_COUNTER64), + KSTAT_KV_INITIALIZER("defer-over", KSTAT_KV_T_COUNTER64), }; -struct pfsync_softc *pfsyncif = NULL; -struct cpumem *pfsynccounters; - -void pfsyncattach(int); -int pfsync_clone_create(struct if_clone *, int); -int pfsync_clone_destroy(struct ifnet *); -void pfsync_update_net_tdb(struct pfsync_tdb *); -int pfsyncoutput(struct ifnet *, struct mbuf *, struct sockaddr *, - struct rtentry *); -int pfsyncioctl(struct ifnet *, u_long, caddr_t); -void pfsyncstart(struct ifqueue *); -void pfsync_syncdev_state(void *); -void pfsync_ifdetach(void *); - -void pfsync_deferred(struct pf_state *, int); -void pfsync_undefer(struct pfsync_deferral *, int); -void pfsync_deferrals_tmo(void *); - -void pfsync_cancel_full_update(struct pfsync_softc *); -void pfsync_request_full_update(struct pfsync_softc *); -void pfsync_request_update(u_int32_t, u_int64_t); -void pfsync_update_state_req(struct pf_state *); - -void pfsync_drop(struct pfsync_softc *); -void pfsync_sendout(void); -void pfsync_send_plus(void *, size_t); -void pfsync_timeout(void *); -void pfsync_tdb_timeout(void *); - -void pfsync_bulk_start(void); -void pfsync_bulk_status(u_int8_t); -void pfsync_bulk_update(void *); -void pfsync_bulk_fail(void *); - -void pfsync_grab_snapshot(struct pfsync_snapshot *, struct pfsync_softc *); -void pfsync_drop_snapshot(struct pfsync_snapshot *); - -void pfsync_send_dispatch(void *); -void pfsync_send_pkt(struct mbuf *); - -static struct mbuf_queue pfsync_mq; -static struct task pfsync_task = - TASK_INITIALIZER(pfsync_send_dispatch, &pfsync_mq); +static int +pfsync_kstat_copy(struct kstat *ks, void *dst) +{ + struct pfsync_slice *s = ks->ks_softc; + struct pfsync_kstat_data *pd = dst; + + *pd = pfsync_kstat_tpl; + kstat_kv_u64(&pd->pd_locks) = s->s_stat_locks; + kstat_kv_u64(&pd->pd_contended) = s->s_stat_contended; + kstat_kv_u64(&pd->pd_write_nop) = s->s_stat_write_nop; + 
kstat_kv_u64(&pd->pd_task_add) = s->s_stat_task_add; + kstat_kv_u64(&pd->pd_task_run) = s->s_stat_task_run; + kstat_kv_u64(&pd->pd_enqueue) = s->s_stat_enqueue; + kstat_kv_u64(&pd->pd_dequeue) = s->s_stat_dequeue; + kstat_kv_u32(&pd->pd_qdrop) = mq_drops(&s->s_sendq); + + kstat_kv_u32(&pd->pd_defer_len) = s->s_deferred; + kstat_kv_u64(&pd->pd_defer_add) = s->s_stat_defer_add; + kstat_kv_u64(&pd->pd_defer_ack) = s->s_stat_defer_ack; + kstat_kv_u64(&pd->pd_defer_run) = s->s_stat_defer_run; + kstat_kv_u64(&pd->pd_defer_overlimit) = s->s_stat_defer_overlimit; + + return (0); +} +#endif /* NKSTAT > 0 */ #define PFSYNC_MAX_BULKTRIES 12 -int pfsync_sync_ok; struct if_clone pfsync_cloner = IF_CLONE_INITIALIZER("pfsync", pfsync_clone_create, pfsync_clone_destroy); @@ -312,63 +353,98 @@ struct if_clone pfsync_cloner = void pfsyncattach(int npfsync) { - if_clone_attach(&pfsync_cloner); pfsynccounters = counters_alloc(pfsyncs_ncounters); - mq_init(&pfsync_mq, 4096, IPL_MPFLOOR); + if_clone_attach(&pfsync_cloner); } -int +static int pfsync_clone_create(struct if_clone *ifc, int unit) { struct pfsync_softc *sc; struct ifnet *ifp; - int q; + size_t i, q; if (unit != 0) - return (EINVAL); - - pfsync_sync_ok = 1; + return (ENXIO); - sc = malloc(sizeof(*pfsyncif), M_DEVBUF, M_WAITOK|M_ZERO); - for (q = 0; q < PFSYNC_S_COUNT; q++) - TAILQ_INIT(&sc->sc_qs[q]); - mtx_init(&sc->sc_st_mtx, IPL_MPFLOOR); + if (pfsync_deferrals_pool.pr_size == 0) { + pool_init(&pfsync_deferrals_pool, + sizeof(struct pfsync_deferral), 0, + IPL_MPFLOOR, 0, "pfdefer", NULL); + /* pool_cache_init(&pfsync_deferrals_pool); */ + } - pool_init(&sc->sc_pool, PFSYNC_PLSIZE, 0, IPL_MPFLOOR, 0, "pfsync", - NULL); - TAILQ_INIT(&sc->sc_upd_req_list); - mtx_init(&sc->sc_upd_req_mtx, IPL_MPFLOOR); - TAILQ_INIT(&sc->sc_deferrals); - mtx_init(&sc->sc_deferrals_mtx, IPL_MPFLOOR); - timeout_set_proc(&sc->sc_deferrals_tmo, pfsync_deferrals_tmo, sc); - task_set(&sc->sc_ltask, pfsync_syncdev_state, sc); - task_set(&sc->sc_dtask, pfsync_ifdetach, sc); - sc->sc_deferred = 0; + sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO|M_CANFAIL); + if (sc == NULL) + return (ENOMEM); - TAILQ_INIT(&sc->sc_tdb_q); - mtx_init(&sc->sc_tdb_mtx, IPL_MPFLOOR); + /* sc_refs is "owned" by IFF_RUNNING */ - sc->sc_len = PFSYNC_MINPKT; + sc->sc_syncpeer.s_addr = INADDR_PFSYNC_GROUP; sc->sc_maxupdates = 128; + sc->sc_defer = 0; + + task_set(&sc->sc_ltask, pfsync_syncif_link, sc); + task_set(&sc->sc_dtask, pfsync_syncif_detach, sc); + + rw_init(&sc->sc_bulk_req.req_lock, "pfsyncbreq"); + /* need process context to take net lock to call ip_output */ + timeout_set_proc(&sc->sc_bulk_req.req_tmo, pfsync_bulk_req_tmo, sc); - sc->sc_imo.imo_membership = mallocarray(IP_MIN_MEMBERSHIPS, - sizeof(struct in_multi *), M_IPMOPTS, M_WAITOK|M_ZERO); - sc->sc_imo.imo_max_memberships = IP_MIN_MEMBERSHIPS; + rw_init(&sc->sc_bulk_snd.snd_lock, "pfsyncbsnd"); + /* need process context to take net lock to call ip_output */ + timeout_set_proc(&sc->sc_bulk_snd.snd_tmo, pfsync_bulk_snd_tmo, sc); ifp = &sc->sc_if; - snprintf(ifp->if_xname, sizeof ifp->if_xname, "pfsync%d", unit); + snprintf(ifp->if_xname, sizeof ifp->if_xname, "%s%d", + ifc->ifc_name, unit); ifp->if_softc = sc; - ifp->if_ioctl = pfsyncioctl; - ifp->if_output = pfsyncoutput; - ifp->if_qstart = pfsyncstart; + ifp->if_ioctl = pfsync_ioctl; + ifp->if_output = pfsync_output; + ifp->if_qstart = pfsync_start; ifp->if_type = IFT_PFSYNC; ifp->if_hdrlen = sizeof(struct pfsync_header); ifp->if_mtu = ETHERMTU; ifp->if_xflags = IFXF_CLONED | 
IFXF_MPSAFE; - timeout_set_proc(&sc->sc_tmo, pfsync_timeout, NULL); - timeout_set_proc(&sc->sc_bulk_tmo, pfsync_bulk_update, NULL); - timeout_set_proc(&sc->sc_bulkfail_tmo, pfsync_bulk_fail, NULL); + for (i = 0; i < nitems(sc->sc_slices); i++) { + struct pfsync_slice *s = &sc->sc_slices[i]; + + s->s_pfsync = sc; + + mtx_init_flags(&s->s_mtx, IPL_SOFTNET, "pfslice", 0); + s->s_softnet = net_tq(i); + timeout_set(&s->s_tmo, pfsync_slice_tmo, s); + task_set(&s->s_task, pfsync_slice_task, s); + + mq_init(&s->s_sendq, 16, IPL_SOFTNET); + task_set(&s->s_send, pfsync_slice_sendq, s); + + s->s_len = PFSYNC_MINPKT; + ml_init(&s->s_ml); + + for (q = 0; q < nitems(s->s_qs); q++) + TAILQ_INIT(&s->s_qs[q]); + TAILQ_INIT(&s->s_tdb_q); + + /* stupid NET_LOCK */ + timeout_set(&s->s_deferrals_tmo, pfsync_deferrals_tmo, s); + task_set(&s->s_deferrals_task, pfsync_deferrals_task, s); + TAILQ_INIT(&s->s_deferrals); + +#if NKSTAT > 0 + s->s_kstat = kstat_create(ifp->if_xname, 0, "pfsync-slice", i, + KSTAT_T_KV, 0); + + kstat_set_mutex(s->s_kstat, &s->s_mtx); + s->s_kstat->ks_softc = s; + s->s_kstat->ks_datalen = sizeof(pfsync_kstat_tpl); + s->s_kstat->ks_copy = pfsync_kstat_copy; + kstat_install(s->s_kstat); +#endif + } + + if_counters_alloc(ifp); if_attach(ifp); if_alloc_sadl(ifp); @@ -380,937 +456,861 @@ pfsync_clone_create(struct if_clone *ifc, int unit) bpfattach(&sc->sc_if.if_bpf, ifp, DLT_PFSYNC, PFSYNC_HDRLEN); #endif - pfsyncif = sc; - return (0); } -int +static int pfsync_clone_destroy(struct ifnet *ifp) { struct pfsync_softc *sc = ifp->if_softc; - struct ifnet *ifp0; - struct pfsync_deferral *pd; - struct pfsync_deferrals deferrals; +#if NKSTAT > 0 + size_t i; +#endif NET_LOCK(); + sc->sc_dead = 1; -#if NCARP > 0 - if (!pfsync_sync_ok) - carp_group_demote_adj(&sc->sc_if, -1, "pfsync destroy"); - if (sc->sc_link_demoted) - carp_group_demote_adj(&sc->sc_if, -1, "pfsync destroy"); -#endif - if ((ifp0 = if_get(sc->sc_sync_ifidx)) != NULL) { - if_linkstatehook_del(ifp0, &sc->sc_ltask); - if_detachhook_del(ifp0, &sc->sc_dtask); - } - if_put(ifp0); - - /* XXXSMP breaks atomicity */ + if (ISSET(ifp->if_flags, IFF_RUNNING)) + pfsync_down(sc); NET_UNLOCK(); - if_detach(ifp); - NET_LOCK(); - pfsync_drop(sc); + if_detach(ifp); - if (sc->sc_deferred > 0) { - TAILQ_INIT(&deferrals); - mtx_enter(&sc->sc_deferrals_mtx); - TAILQ_CONCAT(&deferrals, &sc->sc_deferrals, pd_entry); - sc->sc_deferred = 0; - mtx_leave(&sc->sc_deferrals_mtx); +#if NKSTAT > 0 + for (i = 0; i < nitems(sc->sc_slices); i++) { + struct pfsync_slice *s = &sc->sc_slices[i]; - while ((pd = TAILQ_FIRST(&deferrals)) != NULL) { - TAILQ_REMOVE(&deferrals, pd, pd_entry); - pfsync_undefer(pd, 0); - } + kstat_destroy(s->s_kstat); } +#endif - pfsyncif = NULL; - timeout_del(&sc->sc_bulkfail_tmo); - timeout_del(&sc->sc_bulk_tmo); - timeout_del(&sc->sc_tmo); - - NET_UNLOCK(); - - pool_destroy(&sc->sc_pool); - free(sc->sc_imo.imo_membership, M_IPMOPTS, - sc->sc_imo.imo_max_memberships * sizeof(struct in_multi *)); free(sc, M_DEVBUF, sizeof(*sc)); return (0); } -/* - * Start output on the pfsync interface. - */ -void -pfsyncstart(struct ifqueue *ifq) +static void +pfsync_dprintf(struct pfsync_softc *sc, const char *fmt, ...) 
{ - ifq_purge(ifq); + struct ifnet *ifp = &sc->sc_if; + va_list ap; + + if (!ISSET(ifp->if_flags, IFF_DEBUG)) + return; + + printf("%s: ", ifp->if_xname); + va_start(ap, fmt); + vprintf(fmt, ap); + va_end(ap); + printf("\n"); } -void -pfsync_syncdev_state(void *arg) +static void +pfsync_syncif_link(void *arg) { struct pfsync_softc *sc = arg; - struct ifnet *ifp; + struct ifnet *ifp0; + unsigned int sync_if_down = 1; - if ((sc->sc_if.if_flags & IFF_UP) == 0) - return; - if ((ifp = if_get(sc->sc_sync_ifidx)) == NULL) - return; + ifp0 = if_get(sc->sc_sync_ifidx); + if (ifp0 != NULL && LINK_STATE_IS_UP(ifp0->if_link_state)) { + pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_LINK); + sync_if_down = 0; + } + if_put(ifp0); - if (ifp->if_link_state == LINK_STATE_DOWN) { - sc->sc_if.if_flags &= ~IFF_RUNNING; - if (!sc->sc_link_demoted) { #if NCARP > 0 - carp_group_demote_adj(&sc->sc_if, 1, - "pfsync link state down"); -#endif - sc->sc_link_demoted = 1; - } - - /* drop everything */ - timeout_del(&sc->sc_tmo); - pfsync_drop(sc); - - pfsync_cancel_full_update(sc); - } else if (sc->sc_link_demoted) { - sc->sc_if.if_flags |= IFF_RUNNING; - - pfsync_request_full_update(sc); + if (sc->sc_sync_if_down != sync_if_down) { + carp_group_demote_adj(&sc->sc_if, + sync_if_down ? 1 : -1, "pfsync link"); } +#endif - if_put(ifp); + sc->sc_sync_if_down = sync_if_down; } -void -pfsync_ifdetach(void *arg) +static void +pfsync_syncif_detach(void *arg) { struct pfsync_softc *sc = arg; - struct ifnet *ifp; + struct ifnet *ifp = &sc->sc_if; - if ((ifp = if_get(sc->sc_sync_ifidx)) != NULL) { - if_linkstatehook_del(ifp, &sc->sc_ltask); - if_detachhook_del(ifp, &sc->sc_dtask); + if (ISSET(ifp->if_flags, IFF_RUNNING)) { + pfsync_down(sc); + if_down(ifp); } - if_put(ifp); sc->sc_sync_ifidx = 0; } -int -pfsync_input(struct mbuf **mp, int *offp, int proto, int af) +static int +pfsync_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, + struct rtentry *rt) { - struct mbuf *n, *m = *mp; - struct pfsync_softc *sc = pfsyncif; - struct ip *ip = mtod(m, struct ip *); - struct pfsync_header *ph; - struct pfsync_subheader subh; - int offset, noff, len, count, mlen, flags = 0; - int e; - - NET_ASSERT_LOCKED(); + m_freem(m); /* drop packet */ + return (EAFNOSUPPORT); +} - pfsyncstat_inc(pfsyncs_ipackets); +static int +pfsync_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) +{ + struct pfsync_softc *sc = ifp->if_softc; + struct ifreq *ifr = (struct ifreq *)data; + int error = ENOTTY; - /* verify that we have a sync interface configured */ - if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING) || - sc->sc_sync_ifidx == 0 || !pf_status.running) - goto done; + switch (cmd) { + case SIOCSIFADDR: + error = EOPNOTSUPP; + break; - /* verify that the packet came in on the right interface */ - if (sc->sc_sync_ifidx != m->m_pkthdr.ph_ifidx) { - pfsyncstat_inc(pfsyncs_badif); - goto done; - } + case SIOCSIFFLAGS: + if (ISSET(ifp->if_flags, IFF_UP)) { + if (!ISSET(ifp->if_flags, IFF_RUNNING)) + error = pfsync_up(sc); + else + error = ENETRESET; + } else { + if (ISSET(ifp->if_flags, IFF_RUNNING)) + error = pfsync_down(sc); + } + break; - sc->sc_if.if_ipackets++; - sc->sc_if.if_ibytes += m->m_pkthdr.len; + case SIOCSIFMTU: + error = pfsync_set_mtu(sc, ifr->ifr_mtu); + break; - /* verify that the IP TTL is 255. 
*/ - if (ip->ip_ttl != PFSYNC_DFLTTL) { - pfsyncstat_inc(pfsyncs_badttl); - goto done; - } + case SIOCSIFPARENT: + error = pfsync_set_parent(sc, (struct if_parent *)data); + break; + case SIOCGIFPARENT: + error = pfsync_get_parent(sc, (struct if_parent *)data); + break; + case SIOCDIFPARENT: + error = pfsync_del_parent(sc); + break; - offset = ip->ip_hl << 2; - n = m_pulldown(m, offset, sizeof(*ph), &noff); - if (n == NULL) { - pfsyncstat_inc(pfsyncs_hdrops); - return IPPROTO_DONE; - } - ph = (struct pfsync_header *)(n->m_data + noff); + case SIOCSETPFSYNC: + error = pfsync_set_ioc(sc, ifr); + break; + case SIOCGETPFSYNC: + error = pfsync_get_ioc(sc, ifr); + break; - /* verify the version */ - if (ph->version != PFSYNC_VERSION) { - pfsyncstat_inc(pfsyncs_badver); - goto done; - } - len = ntohs(ph->len) + offset; - if (m->m_pkthdr.len < len) { - pfsyncstat_inc(pfsyncs_badlen); - goto done; + default: + break; } - if (!bcmp(&ph->pfcksum, &pf_status.pf_chksum, PF_MD5_DIGEST_LENGTH)) - flags = PFSYNC_SI_CKSUM; - - offset += sizeof(*ph); - while (offset <= len - sizeof(subh)) { - m_copydata(m, offset, sizeof(subh), &subh); - offset += sizeof(subh); - - mlen = subh.len << 2; - count = ntohs(subh.count); - - if (subh.action >= PFSYNC_ACT_MAX || - subh.action >= nitems(pfsync_acts) || - mlen < pfsync_acts[subh.action].len) { - /* - * subheaders are always followed by at least one - * message, so if the peer is new - * enough to tell us how big its messages are then we - * know enough to skip them. - */ - if (count > 0 && mlen > 0) { - offset += count * mlen; - continue; - } - pfsyncstat_inc(pfsyncs_badact); - goto done; - } - - n = m_pulldown(m, offset, mlen * count, &noff); - if (n == NULL) { - pfsyncstat_inc(pfsyncs_badlen); - return IPPROTO_DONE; - } - - e = pfsync_acts[subh.action].in(n->m_data + noff, mlen, count, - flags); - if (e != 0) - goto done; - - offset += mlen * count; - } + if (error == ENETRESET) + error = 0; -done: - m_freem(m); - return IPPROTO_DONE; + return (error); } -int -pfsync_in_clr(caddr_t buf, int len, int count, int flags) +static int +pfsync_set_mtu(struct pfsync_softc *sc, unsigned int mtu) { - struct pfsync_clr *clr; - struct pf_state *st, *nexts; - struct pfi_kif *kif; - u_int32_t creatorid; - int i; + struct ifnet *ifp = &sc->sc_if; + struct ifnet *ifp0; + int error = 0; - PF_LOCK(); - for (i = 0; i < count; i++) { - clr = (struct pfsync_clr *)buf + len * i; - kif = NULL; - creatorid = clr->creatorid; - if (strlen(clr->ifname) && - (kif = pfi_kif_find(clr->ifname)) == NULL) - continue; + ifp0 = if_get(sc->sc_sync_ifidx); + if (ifp0 == NULL) + return (EINVAL); - PF_STATE_ENTER_WRITE(); - RBT_FOREACH_SAFE(st, pf_state_tree_id, &tree_id, nexts) { - if (st->creatorid == creatorid && - ((kif && st->kif == kif) || !kif)) { - SET(st->state_flags, PFSTATE_NOSYNC); - pf_remove_state(st); - } - } - PF_STATE_EXIT_WRITE(); + if (mtu <= PFSYNC_MINPKT || mtu > ifp0->if_mtu) { + error = EINVAL; + goto put; } - PF_UNLOCK(); - return (0); + /* commit */ + ifp->if_mtu = mtu; + +put: + if_put(ifp0); + return (error); } -int -pfsync_in_ins(caddr_t buf, int len, int count, int flags) +static int +pfsync_set_parent(struct pfsync_softc *sc, const struct if_parent *p) { - struct pfsync_state *sp; - sa_family_t af1, af2; - int i; + struct ifnet *ifp = &sc->sc_if; + struct ifnet *ifp0; + int error = 0; - PF_LOCK(); - for (i = 0; i < count; i++) { - sp = (struct pfsync_state *)(buf + len * i); - af1 = sp->key[0].af; - af2 = sp->key[1].af; + ifp0 = if_unit(p->ifp_parent); + if (ifp0 == NULL) 
+ return (ENXIO); - /* check for invalid values */ - if (sp->timeout >= PFTM_MAX || - sp->src.state > PF_TCPS_PROXY_DST || - sp->dst.state > PF_TCPS_PROXY_DST || - sp->direction > PF_OUT || - (((af1 || af2) && - ((af1 != AF_INET && af1 != AF_INET6) || - (af2 != AF_INET && af2 != AF_INET6))) || - (sp->af != AF_INET && sp->af != AF_INET6))) { - DPFPRINTF(LOG_NOTICE, - "pfsync_input: PFSYNC5_ACT_INS: invalid value"); - pfsyncstat_inc(pfsyncs_badval); - continue; - } + if (ifp0->if_index == sc->sc_sync_ifidx) + goto put; - if (pf_state_import(sp, flags) == ENOMEM) { - /* drop out, but process the rest of the actions */ - break; - } + if (ISSET(ifp->if_flags, IFF_RUNNING)) { + error = EBUSY; + goto put; } - PF_UNLOCK(); - return (0); + /* commit */ + sc->sc_sync_ifidx = ifp0->if_index; + +put: + if_put(ifp0); + return (error); } -int -pfsync_in_iack(caddr_t buf, int len, int count, int flags) +static int +pfsync_get_parent(struct pfsync_softc *sc, struct if_parent *p) { - struct pfsync_ins_ack *ia; - struct pf_state_cmp id_key; - struct pf_state *st; - int i; + struct ifnet *ifp0; + int error = 0; - for (i = 0; i < count; i++) { - ia = (struct pfsync_ins_ack *)(buf + len * i); + ifp0 = if_get(sc->sc_sync_ifidx); + if (ifp0 == NULL) + error = EADDRNOTAVAIL; + else + strlcpy(p->ifp_parent, ifp0->if_xname, sizeof(p->ifp_parent)); + if_put(ifp0); - id_key.id = ia->id; - id_key.creatorid = ia->creatorid; + return (error); +} - PF_STATE_ENTER_READ(); - st = pf_find_state_byid(&id_key); - pf_state_ref(st); - PF_STATE_EXIT_READ(); - if (st == NULL) - continue; +static int +pfsync_del_parent(struct pfsync_softc *sc) +{ + struct ifnet *ifp = &sc->sc_if; - if (ISSET(st->state_flags, PFSTATE_ACK)) - pfsync_deferred(st, 0); + if (ISSET(ifp->if_flags, IFF_RUNNING)) + return (EBUSY); - pf_state_unref(st); - } + /* commit */ + sc->sc_sync_ifidx = 0; return (0); } -int -pfsync_upd_tcp(struct pf_state *st, struct pfsync_state_peer *src, - struct pfsync_state_peer *dst) +static int +pfsync_get_ioc(struct pfsync_softc *sc, struct ifreq *ifr) { - int sync = 0; - - /* - * The state should never go backwards except - * for syn-proxy states. Neither should the - * sequence window slide backwards. 
- */ - if ((st->src.state > src->state && - (st->src.state < PF_TCPS_PROXY_SRC || - src->state >= PF_TCPS_PROXY_SRC)) || + struct pfsyncreq pfsyncr; + struct ifnet *ifp0; - (st->src.state == src->state && - SEQ_GT(st->src.seqlo, ntohl(src->seqlo)))) - sync++; - else - pf_state_peer_ntoh(src, &st->src); + memset(&pfsyncr, 0, sizeof(pfsyncr)); - if ((st->dst.state > dst->state) || + ifp0 = if_get(sc->sc_sync_ifidx); + if (ifp0 != NULL) { + strlcpy(pfsyncr.pfsyncr_syncdev, ifp0->if_xname, + sizeof(pfsyncr.pfsyncr_syncdev)); + } + if_put(ifp0); - (st->dst.state >= TCPS_SYN_SENT && - SEQ_GT(st->dst.seqlo, ntohl(dst->seqlo)))) - sync++; - else - pf_state_peer_ntoh(dst, &st->dst); + pfsyncr.pfsyncr_syncpeer = sc->sc_syncpeer; + pfsyncr.pfsyncr_maxupdates = sc->sc_maxupdates; + pfsyncr.pfsyncr_defer = sc->sc_defer; - return (sync); + return (copyout(&pfsyncr, ifr->ifr_data, sizeof(pfsyncr))); } -int -pfsync_in_upd(caddr_t buf, int len, int count, int flags) +static int +pfsync_set_ioc(struct pfsync_softc *sc, struct ifreq *ifr) { - struct pfsync_state *sp; - struct pf_state_cmp id_key; - struct pf_state *st; - int sync, error; - int i; - - for (i = 0; i < count; i++) { - sp = (struct pfsync_state *)(buf + len * i); - - /* check for invalid values */ - if (sp->timeout >= PFTM_MAX || - sp->src.state > PF_TCPS_PROXY_DST || - sp->dst.state > PF_TCPS_PROXY_DST) { - DPFPRINTF(LOG_NOTICE, - "pfsync_input: PFSYNC_ACT_UPD: invalid value"); - pfsyncstat_inc(pfsyncs_badval); - continue; - } - - id_key.id = sp->id; - id_key.creatorid = sp->creatorid; + struct ifnet *ifp = &sc->sc_if; + struct pfsyncreq pfsyncr; + unsigned int sync_ifidx = sc->sc_sync_ifidx; + int wantdown = 0; + int error; - PF_STATE_ENTER_READ(); - st = pf_find_state_byid(&id_key); - pf_state_ref(st); - PF_STATE_EXIT_READ(); - if (st == NULL) { - /* insert the update */ - PF_LOCK(); - error = pf_state_import(sp, flags); - if (error) - pfsyncstat_inc(pfsyncs_badstate); - PF_UNLOCK(); - continue; - } + error = suser(curproc); + if (error != 0) + return (error); - if (ISSET(st->state_flags, PFSTATE_ACK)) - pfsync_deferred(st, 1); + error = copyin(ifr->ifr_data, &pfsyncr, sizeof(pfsyncr)); + if (error != 0) + return (error); - if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) - sync = pfsync_upd_tcp(st, &sp->src, &sp->dst); - else { - sync = 0; + if (pfsyncr.pfsyncr_maxupdates > 255) + return (EINVAL); - /* - * Non-TCP protocol state machine always go - * forwards - */ - if (st->src.state > sp->src.state) - sync++; - else - pf_state_peer_ntoh(&sp->src, &st->src); + if (pfsyncr.pfsyncr_syncdev[0] != '\0') { /* set */ + struct ifnet *ifp0 = if_unit(pfsyncr.pfsyncr_syncdev); + if (ifp0 == NULL) + return (ENXIO); - if (st->dst.state > sp->dst.state) - sync++; - else - pf_state_peer_ntoh(&sp->dst, &st->dst); - } + if (ifp0->if_index != sync_ifidx) + wantdown = 1; - if (sync < 2) { - pf_state_alloc_scrub_memory(&sp->dst, &st->dst); - pf_state_peer_ntoh(&sp->dst, &st->dst); - st->expire = getuptime(); - st->timeout = sp->timeout; - } - st->pfsync_time = getuptime(); + sync_ifidx = ifp0->if_index; + if_put(ifp0); + } else { /* del */ + wantdown = 1; + sync_ifidx = 0; + } - if (sync) { - pfsyncstat_inc(pfsyncs_stale); + if (pfsyncr.pfsyncr_syncpeer.s_addr == INADDR_ANY) + pfsyncr.pfsyncr_syncpeer.s_addr = INADDR_PFSYNC_GROUP; + if (pfsyncr.pfsyncr_syncpeer.s_addr != sc->sc_syncpeer.s_addr) + wantdown = 1; - pfsync_update_state(st); - schednetisr(NETISR_PFSYNC); - } + if (wantdown && ISSET(ifp->if_flags, IFF_RUNNING)) + return (EBUSY); - pf_state_unref(st); 
- } + /* commit */ + sc->sc_sync_ifidx = sync_ifidx; + sc->sc_syncpeer = pfsyncr.pfsyncr_syncpeer; + sc->sc_maxupdates = pfsyncr.pfsyncr_maxupdates; + sc->sc_defer = pfsyncr.pfsyncr_defer; return (0); } -int -pfsync_in_upd_c(caddr_t buf, int len, int count, int flags) +static int +pfsync_up(struct pfsync_softc *sc) { - struct pfsync_upd_c *up; - struct pf_state_cmp id_key; - struct pf_state *st; - - int sync; - - int i; - - for (i = 0; i < count; i++) { - up = (struct pfsync_upd_c *)(buf + len * i); + struct ifnet *ifp = &sc->sc_if; + struct ifnet *ifp0; + void *inm = NULL; + int error = 0; + struct ip *ip; - /* check for invalid values */ - if (up->timeout >= PFTM_MAX || - up->src.state > PF_TCPS_PROXY_DST || - up->dst.state > PF_TCPS_PROXY_DST) { - DPFPRINTF(LOG_NOTICE, - "pfsync_input: PFSYNC_ACT_UPD_C: invalid value"); - pfsyncstat_inc(pfsyncs_badval); - continue; - } + NET_ASSERT_LOCKED(); + KASSERT(!ISSET(ifp->if_flags, IFF_RUNNING)); - id_key.id = up->id; - id_key.creatorid = up->creatorid; + if (sc->sc_dead) + return (ENXIO); - PF_STATE_ENTER_READ(); - st = pf_find_state_byid(&id_key); - pf_state_ref(st); - PF_STATE_EXIT_READ(); - if (st == NULL) { - /* We don't have this state. Ask for it. */ - pfsync_request_update(id_key.creatorid, id_key.id); - continue; - } + /* + * coordinate with pfsync_down(). if sc_up is still up and + * we're here then something else is tearing pfsync down. + */ + if (sc->sc_up) + return (EBUSY); - if (ISSET(st->state_flags, PFSTATE_ACK)) - pfsync_deferred(st, 1); + if (sc->sc_syncpeer.s_addr == INADDR_ANY || + sc->sc_syncpeer.s_addr == INADDR_BROADCAST) + return (EDESTADDRREQ); - if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) - sync = pfsync_upd_tcp(st, &up->src, &up->dst); - else { - sync = 0; - /* - * Non-TCP protocol state machine always go - * forwards - */ - if (st->src.state > up->src.state) - sync++; - else - pf_state_peer_ntoh(&up->src, &st->src); + ifp0 = if_get(sc->sc_sync_ifidx); + if (ifp0 == NULL) + return (ENXIO); - if (st->dst.state > up->dst.state) - sync++; - else - pf_state_peer_ntoh(&up->dst, &st->dst); + if (IN_MULTICAST(sc->sc_syncpeer.s_addr)) { + if (!ISSET(ifp0->if_flags, IFF_MULTICAST)) { + error = ENODEV; + goto put; } - if (sync < 2) { - pf_state_alloc_scrub_memory(&up->dst, &st->dst); - pf_state_peer_ntoh(&up->dst, &st->dst); - st->expire = getuptime(); - st->timeout = up->timeout; + inm = in_addmulti(&sc->sc_syncpeer, ifp0); + if (inm == NULL) { + error = ECONNABORTED; + goto put; } - st->pfsync_time = getuptime(); - - if (sync) { - pfsyncstat_inc(pfsyncs_stale); + } - pfsync_update_state(st); - schednetisr(NETISR_PFSYNC); - } + sc->sc_up = 1; - pf_state_unref(st); - } + ip = &sc->sc_template; + memset(ip, 0, sizeof(*ip)); + ip->ip_v = IPVERSION; + ip->ip_hl = sizeof(*ip) >> 2; + ip->ip_tos = IPTOS_LOWDELAY; + /* len and id are set later */ + ip->ip_off = htons(IP_DF); + ip->ip_ttl = PFSYNC_DFLTTL; + ip->ip_p = IPPROTO_PFSYNC; + ip->ip_src.s_addr = INADDR_ANY; + ip->ip_dst.s_addr = sc->sc_syncpeer.s_addr; - return (0); -} + /* commit */ + refcnt_init(&sc->sc_refs); /* IFF_RUNNING kind of owns this */ -int -pfsync_in_ureq(caddr_t buf, int len, int count, int flags) -{ - struct pfsync_upd_req *ur; - int i; +#if NCARP > 0 + sc->sc_sync_if_down = 1; + carp_group_demote_adj(&sc->sc_if, 1, "pfsync up"); +#endif - struct pf_state_cmp id_key; - struct pf_state *st; + if_linkstatehook_add(ifp0, &sc->sc_ltask); + if_detachhook_add(ifp0, &sc->sc_dtask); - for (i = 0; i < count; i++) { - ur = (struct pfsync_upd_req *)(buf + len * i); + 
sc->sc_inm = inm; + SET(ifp->if_flags, IFF_RUNNING); - id_key.id = ur->id; - id_key.creatorid = ur->creatorid; + pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_UP); - if (id_key.id == 0 && id_key.creatorid == 0) - pfsync_bulk_start(); - else { - PF_STATE_ENTER_READ(); - st = pf_find_state_byid(&id_key); - pf_state_ref(st); - PF_STATE_EXIT_READ(); - if (st == NULL) { - pfsyncstat_inc(pfsyncs_badstate); - continue; - } - if (ISSET(st->state_flags, PFSTATE_NOSYNC)) { - pf_state_unref(st); - continue; - } + refcnt_take(&sc->sc_refs); /* give one to SMR */ + SMR_PTR_SET_LOCKED(&pfsyncif, sc); - pfsync_update_state_req(st); - pf_state_unref(st); - } - } + pfsync_syncif_link(sc); /* try and push the bulk req state forward */ - return (0); +put: + if_put(ifp0); + return (error); } -int -pfsync_in_del(caddr_t buf, int len, int count, int flags) +static struct mbuf * +pfsync_encap(struct pfsync_softc *sc, struct mbuf *m) { - struct pfsync_state *sp; - struct pf_state_cmp id_key; - struct pf_state *st; - int i; + struct { + struct ip ip; + struct pfsync_header ph; + } __packed __aligned(4) *h; + unsigned int mlen = m->m_pkthdr.len; - PF_STATE_ENTER_WRITE(); - for (i = 0; i < count; i++) { - sp = (struct pfsync_state *)(buf + len * i); + m = m_prepend(m, sizeof(*h), M_DONTWAIT); + if (m == NULL) + return (NULL); - id_key.id = sp->id; - id_key.creatorid = sp->creatorid; + h = mtod(m, void *); + memset(h, 0, sizeof(*h)); - st = pf_find_state_byid(&id_key); - if (st == NULL) { - pfsyncstat_inc(pfsyncs_badstate); - continue; - } - SET(st->state_flags, PFSTATE_NOSYNC); - pf_remove_state(st); - } - PF_STATE_EXIT_WRITE(); + mlen += sizeof(h->ph); + h->ph.version = PFSYNC_VERSION; + h->ph.len = htons(mlen); + /* h->ph.pfcksum */ - return (0); + mlen += sizeof(h->ip); + h->ip = sc->sc_template; + h->ip.ip_len = htons(mlen); + h->ip.ip_id = htons(ip_randomid()); + + return (m); } -int -pfsync_in_del_c(caddr_t buf, int len, int count, int flags) +static void +pfsync_bulk_req_send(struct pfsync_softc *sc) { - struct pfsync_del_c *sp; - struct pf_state_cmp id_key; - struct pf_state *st; - int i; - - PF_LOCK(); - PF_STATE_ENTER_WRITE(); - for (i = 0; i < count; i++) { - sp = (struct pfsync_del_c *)(buf + len * i); - - id_key.id = sp->id; - id_key.creatorid = sp->creatorid; + struct { + struct pfsync_subheader subh; + struct pfsync_upd_req ur; + } __packed __aligned(4) *h; + unsigned mlen = max_linkhdr + + sizeof(struct ip) + sizeof(struct pfsync_header) + sizeof(*h); + struct mbuf *m; - st = pf_find_state_byid(&id_key); - if (st == NULL) { - pfsyncstat_inc(pfsyncs_badstate); - continue; - } + m = m_gethdr(M_DONTWAIT, MT_DATA); + if (m == NULL) + goto fail; - SET(st->state_flags, PFSTATE_NOSYNC); - pf_remove_state(st); + if (mlen > MHLEN) { + MCLGETL(m, M_DONTWAIT, mlen); + if (!ISSET(m->m_flags, M_EXT)) + goto drop; } - PF_STATE_EXIT_WRITE(); - PF_UNLOCK(); - return (0); -} + m_align(m, sizeof(*h)); + m->m_len = m->m_pkthdr.len = sizeof(*h); -int -pfsync_in_bus(caddr_t buf, int len, int count, int flags) -{ - struct pfsync_softc *sc = pfsyncif; - struct pfsync_bus *bus; + h = mtod(m, void *); + memset(h, 0, sizeof(*h)); - /* If we're not waiting for a bulk update, who cares. 
*/ - if (sc->sc_ureq_sent == 0) - return (0); + h->subh.action = PFSYNC_ACT_UPD_REQ; + h->subh.len = sizeof(h->ur) >> 2; + h->subh.count = htons(1); - bus = (struct pfsync_bus *)buf; + h->ur.id = htobe64(0); + h->ur.creatorid = htobe32(0); - switch (bus->status) { - case PFSYNC_BUS_START: - PF_LOCK(); - timeout_add(&sc->sc_bulkfail_tmo, 4 * hz + - pf_pool_limits[PF_LIMIT_STATES].limit / - ((sc->sc_if.if_mtu - PFSYNC_MINPKT) / - sizeof(struct pfsync_state))); - PF_UNLOCK(); - DPFPRINTF(LOG_INFO, "received bulk update start"); - break; + m = pfsync_encap(sc, m); + if (m == NULL) + goto fail; - case PFSYNC_BUS_END: - if (getuptime() - ntohl(bus->endtime) >= - sc->sc_ureq_sent) { - /* that's it, we're happy */ - sc->sc_ureq_sent = 0; - sc->sc_bulk_tries = 0; - timeout_del(&sc->sc_bulkfail_tmo); -#if NCARP > 0 - if (!pfsync_sync_ok) - carp_group_demote_adj(&sc->sc_if, -1, - sc->sc_link_demoted ? - "pfsync link state up" : - "pfsync bulk done"); - if (sc->sc_initial_bulk) { - carp_group_demote_adj(&sc->sc_if, -32, - "pfsync init"); - sc->sc_initial_bulk = 0; - } -#endif - pfsync_sync_ok = 1; - sc->sc_link_demoted = 0; - DPFPRINTF(LOG_INFO, "received valid bulk update end"); - } else { - DPFPRINTF(LOG_WARNING, "received invalid " - "bulk update end: bad timestamp"); - } - break; - } + pfsync_sendout(sc, m); + return; - return (0); +drop: + m_freem(m); +fail: + printf("%s: unable to request bulk update\n", sc->sc_if.if_xname); } -int -pfsync_in_tdb(caddr_t buf, int len, int count, int flags) +static void +pfsync_bulk_req_nstate(struct pfsync_softc *sc, + enum pfsync_bulk_req_state nstate, int seconds) { -#if defined(IPSEC) - struct pfsync_tdb *tp; - int i; - - for (i = 0; i < count; i++) { - tp = (struct pfsync_tdb *)(buf + len * i); - pfsync_update_net_tdb(tp); - } -#endif - - return (0); + sc->sc_bulk_req.req_state = nstate; + if (seconds > 0) + timeout_add_sec(&sc->sc_bulk_req.req_tmo, seconds); + else + timeout_del(&sc->sc_bulk_req.req_tmo); } -#if defined(IPSEC) -/* Update an in-kernel tdb. Silently fail if no tdb is found. */ -void -pfsync_update_net_tdb(struct pfsync_tdb *pt) +static void +pfsync_bulk_req_invstate(struct pfsync_softc *sc, + enum pfsync_bulk_req_event evt) { - struct tdb *tdb; - - NET_ASSERT_LOCKED(); - - /* check for invalid values */ - if (ntohl(pt->spi) <= SPI_RESERVED_MAX || - (pt->dst.sa.sa_family != AF_INET && - pt->dst.sa.sa_family != AF_INET6)) - goto bad; + panic("%s: unexpected event %s in state %s", sc->sc_if.if_xname, + pfsync_bulk_req_event_names[evt], + pfsync_bulk_req_state_names[sc->sc_bulk_req.req_state]); +} - tdb = gettdb(ntohs(pt->rdomain), pt->spi, - (union sockaddr_union *)&pt->dst, pt->sproto); - if (tdb) { - pt->rpl = betoh64(pt->rpl); - pt->cur_bytes = betoh64(pt->cur_bytes); +static void +pfsync_bulk_req_nstate_bulk(struct pfsync_softc *sc) +{ + /* calculate the number of packets we expect */ + int t = pf_pool_limits[PF_LIMIT_STATES].limit / + ((sc->sc_if.if_mtu - PFSYNC_MINPKT) / + sizeof(struct pfsync_state)); - /* Neither replay nor byte counter should ever decrease. 
*/ - if (pt->rpl < tdb->tdb_rpl || - pt->cur_bytes < tdb->tdb_cur_bytes) { - tdb_unref(tdb); - goto bad; - } + /* turn it into seconds */ + t /= 1000 / PFSYNC_BULK_SND_IVAL_MS; - tdb->tdb_rpl = pt->rpl; - tdb->tdb_cur_bytes = pt->cur_bytes; - tdb_unref(tdb); - } - return; + if (t == 0) + t = 1; - bad: - DPFPRINTF(LOG_WARNING, "pfsync_insert: PFSYNC_ACT_TDB_UPD: " - "invalid value"); - pfsyncstat_inc(pfsyncs_badstate); - return; + pfsync_bulk_req_nstate(sc, PFSYNC_BREQ_S_BULK, t * 4); } -#endif - -int -pfsync_in_eof(caddr_t buf, int len, int count, int flags) +static inline void +pfsync_bulk_req_nstate_done(struct pfsync_softc *sc) { - if (len > 0 || count > 0) - pfsyncstat_inc(pfsyncs_badact); + pfsync_bulk_req_nstate(sc, PFSYNC_BREQ_S_DONE, 0); - /* we're done. let the caller return */ - return (1); -} + KASSERT(sc->sc_bulk_req.req_demoted == 1); + sc->sc_bulk_req.req_demoted = 0; -int -pfsync_in_error(caddr_t buf, int len, int count, int flags) -{ - pfsyncstat_inc(pfsyncs_badact); - return (-1); +#if NCARP > 0 + carp_group_demote_adj(&sc->sc_if, -32, "pfsync done"); +#endif } -int -pfsyncoutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, - struct rtentry *rt) +static void +pfsync_bulk_req_evt(struct pfsync_softc *sc, enum pfsync_bulk_req_event evt) { - m_freem(m); /* drop packet */ - return (EAFNOSUPPORT); -} + struct ifnet *ifp = &sc->sc_if; -int -pfsyncioctl(struct ifnet *ifp, u_long cmd, caddr_t data) -{ - struct proc *p = curproc; - struct pfsync_softc *sc = ifp->if_softc; - struct ifreq *ifr = (struct ifreq *)data; - struct ip_moptions *imo = &sc->sc_imo; - struct pfsyncreq pfsyncr; - struct ifnet *ifp0, *sifp; - struct ip *ip; - int error; + rw_enter_write(&sc->sc_bulk_req.req_lock); + pfsync_dprintf(sc, "%s state %s evt %s", __func__, + pfsync_bulk_req_state_names[sc->sc_bulk_req.req_state], + pfsync_bulk_req_event_names[evt]); - switch (cmd) { - case SIOCSIFFLAGS: - if ((ifp->if_flags & IFF_RUNNING) == 0 && - (ifp->if_flags & IFF_UP)) { - ifp->if_flags |= IFF_RUNNING; + if (evt == PFSYNC_BREQ_EVT_DOWN) { + /* unconditionally move down */ + sc->sc_bulk_req.req_tries = 0; + pfsync_bulk_req_nstate(sc, PFSYNC_BREQ_S_NONE, 0); + if (sc->sc_bulk_req.req_demoted) { + sc->sc_bulk_req.req_demoted = 0; #if NCARP > 0 - sc->sc_initial_bulk = 1; - carp_group_demote_adj(&sc->sc_if, 32, "pfsync init"); + carp_group_demote_adj(&sc->sc_if, -32, + "pfsync down"); #endif - - pfsync_request_full_update(sc); } - if ((ifp->if_flags & IFF_RUNNING) && - (ifp->if_flags & IFF_UP) == 0) { - ifp->if_flags &= ~IFF_RUNNING; - - /* drop everything */ - timeout_del(&sc->sc_tmo); - pfsync_drop(sc); - - pfsync_cancel_full_update(sc); - } - break; - case SIOCSIFMTU: - if ((ifp0 = if_get(sc->sc_sync_ifidx)) == NULL) - return (EINVAL); - error = 0; - if (ifr->ifr_mtu <= PFSYNC_MINPKT || - ifr->ifr_mtu > ifp0->if_mtu) { - error = EINVAL; + } else switch (sc->sc_bulk_req.req_state) { + case PFSYNC_BREQ_S_NONE: + switch (evt) { + case PFSYNC_BREQ_EVT_UP: + KASSERT(sc->sc_bulk_req.req_demoted == 0); + sc->sc_bulk_req.req_demoted = 1; +#if NCARP > 0 + carp_group_demote_adj(&sc->sc_if, 32, + "pfsync start"); +#endif + pfsync_bulk_req_nstate(sc, PFSYNC_BREQ_S_START, 30); + break; + default: + pfsync_bulk_req_invstate(sc, evt); } - if_put(ifp0); - if (error) - return error; - if (ifr->ifr_mtu < ifp->if_mtu) - pfsync_sendout(); - ifp->if_mtu = ifr->ifr_mtu; + break; - case SIOCGETPFSYNC: - bzero(&pfsyncr, sizeof(pfsyncr)); - if ((ifp0 = if_get(sc->sc_sync_ifidx)) != NULL) { - strlcpy(pfsyncr.pfsyncr_syncdev, - 
ifp0->if_xname, IFNAMSIZ); + + case PFSYNC_BREQ_S_START: + switch (evt) { + case PFSYNC_BREQ_EVT_LINK: + pfsync_bulk_req_send(sc); + pfsync_bulk_req_nstate(sc, PFSYNC_BREQ_S_SENT, 2); + break; + case PFSYNC_BREQ_EVT_TMO: + pfsync_dprintf(sc, "timeout waiting for link"); + pfsync_bulk_req_nstate_done(sc); + break; + case PFSYNC_BREQ_EVT_BUS_START: + pfsync_bulk_req_nstate_bulk(sc); + break; + case PFSYNC_BREQ_EVT_BUS_END: + /* ignore this */ + break; + default: + pfsync_bulk_req_invstate(sc, evt); } - if_put(ifp0); - pfsyncr.pfsyncr_syncpeer = sc->sc_sync_peer; - pfsyncr.pfsyncr_maxupdates = sc->sc_maxupdates; - pfsyncr.pfsyncr_defer = sc->sc_defer; - return (copyout(&pfsyncr, ifr->ifr_data, sizeof(pfsyncr))); + break; - case SIOCSETPFSYNC: - if ((error = suser(p)) != 0) - return (error); - if ((error = copyin(ifr->ifr_data, &pfsyncr, sizeof(pfsyncr)))) - return (error); + case PFSYNC_BREQ_S_SENT: + switch (evt) { + case PFSYNC_BREQ_EVT_BUS_START: + pfsync_bulk_req_nstate_bulk(sc); + break; + case PFSYNC_BREQ_EVT_BUS_END: + case PFSYNC_BREQ_EVT_LINK: + /* ignore this */ + break; + case PFSYNC_BREQ_EVT_TMO: + if (++sc->sc_bulk_req.req_tries < + PFSYNC_MAX_BULKTRIES) { + pfsync_bulk_req_send(sc); + pfsync_bulk_req_nstate(sc, + PFSYNC_BREQ_S_SENT, 2); + break; + } - if (pfsyncr.pfsyncr_syncpeer.s_addr == 0) - sc->sc_sync_peer.s_addr = INADDR_PFSYNC_GROUP; - else - sc->sc_sync_peer.s_addr = - pfsyncr.pfsyncr_syncpeer.s_addr; + pfsync_dprintf(sc, + "timeout waiting for bulk transfer start"); + pfsync_bulk_req_nstate_done(sc); + break; + default: + pfsync_bulk_req_invstate(sc, evt); + } + break; - if (pfsyncr.pfsyncr_maxupdates > 255) - return (EINVAL); - sc->sc_maxupdates = pfsyncr.pfsyncr_maxupdates; + case PFSYNC_BREQ_S_BULK: + switch (evt) { + case PFSYNC_BREQ_EVT_BUS_START: + case PFSYNC_BREQ_EVT_LINK: + /* ignore this */ + break; + case PFSYNC_BREQ_EVT_BUS_END: + pfsync_bulk_req_nstate_done(sc); + break; + case PFSYNC_BREQ_EVT_TMO: + if (++sc->sc_bulk_req.req_tries < + PFSYNC_MAX_BULKTRIES) { + pfsync_bulk_req_send(sc); + pfsync_bulk_req_nstate(sc, + PFSYNC_BREQ_S_SENT, 2); + } - sc->sc_defer = pfsyncr.pfsyncr_defer; + pfsync_dprintf(sc, + "timeout waiting for bulk transfer end"); + pfsync_bulk_req_nstate_done(sc); + break; + default: + pfsync_bulk_req_invstate(sc, evt); + } + break; - if (pfsyncr.pfsyncr_syncdev[0] == 0) { - if ((ifp0 = if_get(sc->sc_sync_ifidx)) != NULL) { - if_linkstatehook_del(ifp0, &sc->sc_ltask); - if_detachhook_del(ifp0, &sc->sc_dtask); - } - if_put(ifp0); - sc->sc_sync_ifidx = 0; - if (imo->imo_num_memberships > 0) { - in_delmulti(imo->imo_membership[ - --imo->imo_num_memberships]); - imo->imo_ifidx = 0; - } + case PFSYNC_BREQ_S_DONE: /* pfsync is up and running */ + switch (evt) { + case PFSYNC_BREQ_EVT_BUS_START: + case PFSYNC_BREQ_EVT_BUS_END: + case PFSYNC_BREQ_EVT_LINK: + /* nops */ break; + default: + pfsync_bulk_req_invstate(sc, evt); } + break; - if ((sifp = if_unit(pfsyncr.pfsyncr_syncdev)) == NULL) - return (EINVAL); + default: + panic("%s: unknown event %d", ifp->if_xname, evt); + /* NOTREACHED */ + } + rw_exit_write(&sc->sc_bulk_req.req_lock); +} - ifp0 = if_get(sc->sc_sync_ifidx); +static void +pfsync_bulk_req_tmo(void *arg) +{ + struct pfsync_softc *sc = arg; - if (sifp->if_mtu < sc->sc_if.if_mtu || (ifp0 != NULL && - sifp->if_mtu < ifp0->if_mtu) || - sifp->if_mtu < MCLBYTES - sizeof(struct ip)) - pfsync_sendout(); + NET_LOCK(); + pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_TMO); + NET_UNLOCK(); +} - if (ifp0) { - if_linkstatehook_del(ifp0, &sc->sc_ltask); 
- if_detachhook_del(ifp0, &sc->sc_dtask); - } - if_put(ifp0); - sc->sc_sync_ifidx = sifp->if_index; +static int +pfsync_down(struct pfsync_softc *sc) +{ + struct ifnet *ifp = &sc->sc_if; + struct ifnet *ifp0; + struct smr_entry smr; + size_t i; + void *inm = NULL; + unsigned int sndbar = 0; + struct pfsync_deferrals pds = TAILQ_HEAD_INITIALIZER(pds); + struct pfsync_deferral *pd; - if (imo->imo_num_memberships > 0) { - in_delmulti(imo->imo_membership[--imo->imo_num_memberships]); - imo->imo_ifidx = 0; - } + NET_ASSERT_LOCKED(); + KASSERT(ISSET(ifp->if_flags, IFF_RUNNING)); - if (sc->sc_sync_peer.s_addr == INADDR_PFSYNC_GROUP) { - struct in_addr addr; + /* + * tearing down pfsync involves waiting for pfsync to stop + * running in various contexts including softnet taskqs. + * this thread cannot hold netlock while waiting for a + * barrier in softnet because softnet might be waiting for + * the netlock. sc->sc_up is used to coordinate with + * pfsync_up. + */ - if (!(sifp->if_flags & IFF_MULTICAST)) { - sc->sc_sync_ifidx = 0; - if_put(sifp); - return (EADDRNOTAVAIL); - } + CLR(ifp->if_flags, IFF_RUNNING); - addr.s_addr = INADDR_PFSYNC_GROUP; + ifp0 = if_get(sc->sc_sync_ifidx); + if (ifp0 != NULL) { + if_linkstatehook_del(ifp0, &sc->sc_ltask); + if_detachhook_del(ifp0, &sc->sc_dtask); + } + if_put(ifp0); - if ((imo->imo_membership[0] = - in_addmulti(&addr, sifp)) == NULL) { - sc->sc_sync_ifidx = 0; - if_put(sifp); - return (ENOBUFS); - } - imo->imo_num_memberships++; - imo->imo_ifidx = sc->sc_sync_ifidx; - imo->imo_ttl = PFSYNC_DFLTTL; - imo->imo_loop = 0; - } +#if NCARP > 0 + if (sc->sc_sync_if_down) + carp_group_demote_adj(&sc->sc_if, -1, "pfsync down"); +#endif - ip = &sc->sc_template; - bzero(ip, sizeof(*ip)); - ip->ip_v = IPVERSION; - ip->ip_hl = sizeof(sc->sc_template) >> 2; - ip->ip_tos = IPTOS_LOWDELAY; - /* len and id are set later */ - ip->ip_off = htons(IP_DF); - ip->ip_ttl = PFSYNC_DFLTTL; - ip->ip_p = IPPROTO_PFSYNC; - ip->ip_src.s_addr = INADDR_ANY; - ip->ip_dst.s_addr = sc->sc_sync_peer.s_addr; + NET_UNLOCK(); - if_linkstatehook_add(sifp, &sc->sc_ltask); - if_detachhook_add(sifp, &sc->sc_dtask); - if_put(sifp); + KASSERTMSG(SMR_PTR_GET_LOCKED(&pfsyncif) == sc, + "pfsyncif %p != sc %p", pfsyncif, sc); + SMR_PTR_SET_LOCKED(&pfsyncif, NULL); + smr_init(&smr); + smr_call(&smr, (void (*)(void *))refcnt_rele_wake, &sc->sc_refs); - pfsync_request_full_update(sc); + /* stop pf producing work before cleaning up the timeouts and tasks */ + refcnt_finalize(&sc->sc_refs, "pfsyncfini"); - break; + pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_DOWN); - default: - return (ENOTTY); + rw_enter_read(&pf_state_list.pfs_rwl); + rw_enter_write(&sc->sc_bulk_snd.snd_lock); + if (sc->sc_bulk_snd.snd_tail != NULL) { + sndbar = !timeout_del(&sc->sc_bulk_snd.snd_tmo); + + sc->sc_bulk_snd.snd_again = 0; + sc->sc_bulk_snd.snd_next = NULL; + sc->sc_bulk_snd.snd_tail = NULL; + } + rw_exit_write(&sc->sc_bulk_snd.snd_lock); + rw_exit_read(&pf_state_list.pfs_rwl); + + /* + * do a single barrier for all the timeouts. because the + * timeouts in each slice are configured the same way, the + * barrier for one will work for all of them. 
+ */ + for (i = 0; i < nitems(sc->sc_slices); i++) { + struct pfsync_slice *s = &sc->sc_slices[i]; + + timeout_del(&s->s_tmo); + task_del(s->s_softnet, &s->s_task); + task_del(s->s_softnet, &s->s_send); + + timeout_del(&s->s_deferrals_tmo); + task_del(s->s_softnet, &s->s_deferrals_task); + } + timeout_barrier(&sc->sc_slices[0].s_tmo); + timeout_barrier(&sc->sc_bulk_req.req_tmo); /* XXX proc */ + if (sndbar) { + /* technically the preceding barrier does the same job */ + timeout_barrier(&sc->sc_bulk_snd.snd_tmo); + } + net_tq_barriers("pfsyncbar"); + + /* pfsync is no longer running */ + + if (sc->sc_inm != NULL) { + inm = sc->sc_inm; + sc->sc_inm = NULL; + } + + for (i = 0; i < nitems(sc->sc_slices); i++) { + struct pfsync_slice *s = &sc->sc_slices[i]; + struct pf_state *st; + + pfsync_slice_drop(sc, s); + mq_purge(&s->s_sendq); + + while ((pd = TAILQ_FIRST(&s->s_deferrals)) != NULL) { + TAILQ_REMOVE(&s->s_deferrals, pd, pd_entry); + + st = pd->pd_st; + st->sync_defer = NULL; + + TAILQ_INSERT_TAIL(&pds, pd, pd_entry); + } + s->s_deferred = 0; + } + + NET_LOCK(); + sc->sc_up = 0; + + if (inm != NULL) + in_delmulti(inm); + + while ((pd = TAILQ_FIRST(&pds)) != NULL) { + TAILQ_REMOVE(&pds, pd, pd_entry); + + pfsync_defer_output(pd); } return (0); } -void +int +pfsync_is_up(void) +{ + int rv; + + smr_read_enter(); + rv = SMR_PTR_GET(&pfsyncif) != NULL; + smr_read_leave(); + + return (rv); +} + +static void +pfsync_start(struct ifqueue *ifq) +{ + ifq_purge(ifq); +} + +struct pfsync_q { + void (*write)(struct pf_state *, void *); + size_t len; + u_int8_t action; +}; + +static struct pfsync_slice * +pfsync_slice_enter(struct pfsync_softc *sc, const struct pf_state *st) +{ + unsigned int idx = st->key[0]->hash % nitems(sc->sc_slices); + struct pfsync_slice *s = &sc->sc_slices[idx]; + + if (!mtx_enter_try(&s->s_mtx)) { + mtx_enter(&s->s_mtx); + s->s_stat_contended++; + } + s->s_stat_locks++; + + return (s); +} + +static void +pfsync_slice_leave(struct pfsync_softc *sc, struct pfsync_slice *s) +{ + mtx_leave(&s->s_mtx); +} + +/* we have one of these for every PFSYNC_S_ */ +static void pfsync_out_state(struct pf_state *, void *); +static void pfsync_out_iack(struct pf_state *, void *); +static void pfsync_out_upd_c(struct pf_state *, void *); +static void pfsync_out_del(struct pf_state *, void *); +#if defined(IPSEC) +static void pfsync_out_tdb(struct tdb *, void *); +#endif + +static const struct pfsync_q pfsync_qs[] = { + { pfsync_out_iack, sizeof(struct pfsync_ins_ack), PFSYNC_ACT_INS_ACK }, + { pfsync_out_upd_c, sizeof(struct pfsync_upd_c), PFSYNC_ACT_UPD_C }, + { pfsync_out_del, sizeof(struct pfsync_del_c), PFSYNC_ACT_DEL_C }, + { pfsync_out_state, sizeof(struct pfsync_state), PFSYNC_ACT_INS }, + { pfsync_out_state, sizeof(struct pfsync_state), PFSYNC_ACT_UPD } +}; + +static void pfsync_out_state(struct pf_state *st, void *buf) { struct pfsync_state *sp = buf; + mtx_enter(&st->mtx); pf_state_export(sp, st); + mtx_leave(&st->mtx); } -void +static void pfsync_out_iack(struct pf_state *st, void *buf) { struct pfsync_ins_ack *iack = buf; @@ -1319,20 +1319,23 @@ pfsync_out_iack(struct pf_state *st, void *buf) iack->creatorid = st->creatorid; } -void +static void pfsync_out_upd_c(struct pf_state *st, void *buf) { struct pfsync_upd_c *up = buf; - bzero(up, sizeof(*up)); + memset(up, 0, sizeof(*up)); up->id = st->id; + up->creatorid = st->creatorid; + + mtx_enter(&st->mtx); pf_state_peer_hton(&st->src, &up->src); pf_state_peer_hton(&st->dst, &up->dst); - up->creatorid = st->creatorid; up->timeout = 
st->timeout; + mtx_leave(&st->mtx); } -void +static void pfsync_out_del(struct pf_state *st, void *buf) { struct pfsync_del_c *dp = buf; @@ -1340,1070 +1343,1045 @@ pfsync_out_del(struct pf_state *st, void *buf) dp->id = st->id; dp->creatorid = st->creatorid; - SET(st->state_flags, PFSTATE_NOSYNC); + st->sync_state = PFSYNC_S_DEAD; } -void -pfsync_grab_snapshot(struct pfsync_snapshot *sn, struct pfsync_softc *sc) -{ - int q; - struct pf_state *st; - struct pfsync_upd_req_item *ur; -#if defined(IPSEC) - struct tdb *tdb; -#endif - - sn->sn_sc = sc; - - mtx_enter(&sc->sc_st_mtx); - mtx_enter(&sc->sc_upd_req_mtx); - mtx_enter(&sc->sc_tdb_mtx); - - for (q = 0; q < PFSYNC_S_COUNT; q++) { - TAILQ_INIT(&sn->sn_qs[q]); - - while ((st = TAILQ_FIRST(&sc->sc_qs[q])) != NULL) { - TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list); - mtx_enter(&st->mtx); - if (st->snapped == 0) { - TAILQ_INSERT_TAIL(&sn->sn_qs[q], st, sync_snap); - st->snapped = 1; - mtx_leave(&st->mtx); - } else { - /* - * item is on snapshot list already, so we can - * skip it now. - */ - mtx_leave(&st->mtx); - pf_state_unref(st); - } - } - } - - TAILQ_INIT(&sn->sn_upd_req_list); - while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) { - TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry); - TAILQ_INSERT_TAIL(&sn->sn_upd_req_list, ur, ur_snap); - } - - TAILQ_INIT(&sn->sn_tdb_q); #if defined(IPSEC) - while ((tdb = TAILQ_FIRST(&sc->sc_tdb_q)) != NULL) { - TAILQ_REMOVE(&sc->sc_tdb_q, tdb, tdb_sync_entry); - TAILQ_INSERT_TAIL(&sn->sn_tdb_q, tdb, tdb_sync_snap); - - mtx_enter(&tdb->tdb_mtx); - KASSERT(!ISSET(tdb->tdb_flags, TDBF_PFSYNC_SNAPPED)); - SET(tdb->tdb_flags, TDBF_PFSYNC_SNAPPED); - mtx_leave(&tdb->tdb_mtx); - } -#endif - - sn->sn_len = sc->sc_len; - sc->sc_len = PFSYNC_MINPKT; - - sn->sn_plus = sc->sc_plus; - sc->sc_plus = NULL; - sn->sn_pluslen = sc->sc_pluslen; - sc->sc_pluslen = 0; +static inline void +pfsync_tdb_enter(struct tdb *tdb) +{ + mtx_enter(&tdb->tdb_mtx); +} - mtx_leave(&sc->sc_tdb_mtx); - mtx_leave(&sc->sc_upd_req_mtx); - mtx_leave(&sc->sc_st_mtx); +static inline void +pfsync_tdb_leave(struct tdb *tdb) +{ + unsigned int snapped = ISSET(tdb->tdb_flags, TDBF_PFSYNC_SNAPPED); + mtx_leave(&tdb->tdb_mtx); + if (snapped) + wakeup_one(&tdb->tdb_updates); } +#endif /* defined(IPSEC) */ -void -pfsync_drop_snapshot(struct pfsync_snapshot *sn) +static void +pfsync_slice_drop(struct pfsync_softc *sc, struct pfsync_slice *s) { struct pf_state *st; - struct pfsync_upd_req_item *ur; + int q; #if defined(IPSEC) - struct tdb *t; + struct tdb *tdb; #endif - int q; - for (q = 0; q < PFSYNC_S_COUNT; q++) { - if (TAILQ_EMPTY(&sn->sn_qs[q])) + for (q = 0; q < nitems(s->s_qs); q++) { + if (TAILQ_EMPTY(&s->s_qs[q])) continue; - while ((st = TAILQ_FIRST(&sn->sn_qs[q])) != NULL) { - mtx_enter(&st->mtx); + while ((st = TAILQ_FIRST(&s->s_qs[q])) != NULL) { + TAILQ_REMOVE(&s->s_qs[q], st, sync_list); +#ifdef PFSYNC_DEBUG KASSERT(st->sync_state == q); - KASSERT(st->snapped == 1); - TAILQ_REMOVE(&sn->sn_qs[q], st, sync_snap); +#endif st->sync_state = PFSYNC_S_NONE; - st->snapped = 0; - mtx_leave(&st->mtx); pf_state_unref(st); } } - while ((ur = TAILQ_FIRST(&sn->sn_upd_req_list)) != NULL) { - TAILQ_REMOVE(&sn->sn_upd_req_list, ur, ur_snap); - pool_put(&sn->sn_sc->sc_pool, ur); - } - #if defined(IPSEC) - while ((t = TAILQ_FIRST(&sn->sn_tdb_q)) != NULL) { - TAILQ_REMOVE(&sn->sn_tdb_q, t, tdb_sync_snap); - mtx_enter(&t->tdb_mtx); - KASSERT(ISSET(t->tdb_flags, TDBF_PFSYNC_SNAPPED)); - CLR(t->tdb_flags, TDBF_PFSYNC_SNAPPED); - CLR(t->tdb_flags, TDBF_PFSYNC); 
- mtx_leave(&t->tdb_mtx); - } -#endif -} - -int -pfsync_is_snapshot_empty(struct pfsync_snapshot *sn) -{ - int q; - - for (q = 0; q < PFSYNC_S_COUNT; q++) - if (!TAILQ_EMPTY(&sn->sn_qs[q])) - return (0); - - if (!TAILQ_EMPTY(&sn->sn_upd_req_list)) - return (0); - - if (!TAILQ_EMPTY(&sn->sn_tdb_q)) - return (0); - - return (sn->sn_plus == NULL); -} - -void -pfsync_drop(struct pfsync_softc *sc) -{ - struct pfsync_snapshot sn; - - pfsync_grab_snapshot(&sn, sc); - pfsync_drop_snapshot(&sn); -} - -void -pfsync_send_dispatch(void *xmq) -{ - struct mbuf_queue *mq = xmq; - struct pfsync_softc *sc; - struct mbuf *m; - struct mbuf_list ml; - int error; - - mq_delist(mq, &ml); - if (ml_empty(&ml)) - return; - - NET_LOCK(); - sc = pfsyncif; - if (sc == NULL) { - ml_purge(&ml); - goto done; - } + while ((tdb = TAILQ_FIRST(&s->s_tdb_q)) != NULL) { + TAILQ_REMOVE(&s->s_tdb_q, tdb, tdb_sync_entry); - while ((m = ml_dequeue(&ml)) != NULL) { - if ((error = ip_output(m, NULL, NULL, IP_RAWOUTPUT, - &sc->sc_imo, NULL, 0)) == 0) - pfsyncstat_inc(pfsyncs_opackets); - else { - DPFPRINTF(LOG_DEBUG, - "ip_output() @ %s failed (%d)\n", __func__, error); - pfsyncstat_inc(pfsyncs_oerrors); - } + pfsync_tdb_enter(tdb); + KASSERT(ISSET(tdb->tdb_flags, TDBF_PFSYNC)); + CLR(tdb->tdb_flags, TDBF_PFSYNC); + pfsync_tdb_leave(tdb); } -done: - NET_UNLOCK(); -} +#endif /* defined(IPSEC) */ -void -pfsync_send_pkt(struct mbuf *m) -{ - if (mq_enqueue(&pfsync_mq, m) != 0) { - pfsyncstat_inc(pfsyncs_oerrors); - DPFPRINTF(LOG_DEBUG, "mq_enqueue() @ %s failed, queue full\n", - __func__); - } else - task_add(net_tq(0), &pfsync_task); + timeout_del(&s->s_tmo); + s->s_len = PFSYNC_MINPKT; } -void -pfsync_sendout(void) +static struct mbuf * +pfsync_slice_write(struct pfsync_slice *s) { - struct pfsync_snapshot sn; - struct pfsync_softc *sc = pfsyncif; -#if NBPFILTER > 0 - struct ifnet *ifp = &sc->sc_if; -#endif + struct pfsync_softc *sc = s->s_pfsync; struct mbuf *m; + struct ip *ip; struct pfsync_header *ph; struct pfsync_subheader *subh; - struct pf_state *st; - struct pfsync_upd_req_item *ur; - int offset; - int q, count = 0; - if (sc == NULL || sc->sc_len == PFSYNC_MINPKT) - return; + unsigned int mlen = max_linkhdr + s->s_len; + unsigned int q, count; + caddr_t ptr; + size_t off; - if (!ISSET(sc->sc_if.if_flags, IFF_RUNNING) || -#if NBPFILTER > 0 - (ifp->if_bpf == NULL && sc->sc_sync_ifidx == 0)) { -#else - sc->sc_sync_ifidx == 0) { -#endif - pfsync_drop(sc); - return; + MUTEX_ASSERT_LOCKED(&s->s_mtx); + if (s->s_len == PFSYNC_MINPKT) { + s->s_stat_write_nop++; + return (NULL); } - pfsync_grab_snapshot(&sn, sc); + task_del(s->s_softnet, &s->s_task); - /* - * Check below is sufficient to prevent us from sending empty packets, - * but it does not stop us from sending short packets. 
- */ - if (pfsync_is_snapshot_empty(&sn)) - return; + m = m_gethdr(M_DONTWAIT, MT_DATA); + if (m == NULL) + goto drop; - MGETHDR(m, M_DONTWAIT, MT_DATA); - if (m == NULL) { - sc->sc_if.if_oerrors++; - pfsyncstat_inc(pfsyncs_onomem); - pfsync_drop_snapshot(&sn); - return; + if (mlen > MHLEN) { + MCLGETL(m, M_DONTWAIT, mlen); + if (!ISSET(m->m_flags, M_EXT)) + goto drop; } - if (max_linkhdr + sn.sn_len > MHLEN) { - MCLGETL(m, M_DONTWAIT, max_linkhdr + sn.sn_len); - if (!ISSET(m->m_flags, M_EXT)) { - m_free(m); - sc->sc_if.if_oerrors++; - pfsyncstat_inc(pfsyncs_onomem); - pfsync_drop_snapshot(&sn); - return; - } - } - m->m_data += max_linkhdr; - m->m_len = m->m_pkthdr.len = sn.sn_len; + m_align(m, s->s_len); + m->m_len = m->m_pkthdr.len = s->s_len; - /* build the ip header */ - ip = mtod(m, struct ip *); - bcopy(&sc->sc_template, ip, sizeof(*ip)); - offset = sizeof(*ip); + ptr = mtod(m, caddr_t); + off = 0; + ip = (struct ip *)(ptr + off); + off += sizeof(*ip); + *ip = sc->sc_template; ip->ip_len = htons(m->m_pkthdr.len); ip->ip_id = htons(ip_randomid()); - /* build the pfsync header */ - ph = (struct pfsync_header *)(m->m_data + offset); - bzero(ph, sizeof(*ph)); - offset += sizeof(*ph); - + ph = (struct pfsync_header *)(ptr + off); + off += sizeof(*ph); + memset(ph, 0, sizeof(*ph)); ph->version = PFSYNC_VERSION; - ph->len = htons(sn.sn_len - sizeof(*ip)); - bcopy(pf_status.pf_chksum, ph->pfcksum, PF_MD5_DIGEST_LENGTH); + ph->len = htons(m->m_pkthdr.len - sizeof(*ip)); + + for (q = 0; q < nitems(s->s_qs); q++) { + struct pf_state_queue *psq = &s->s_qs[q]; + struct pf_state *st; - if (!TAILQ_EMPTY(&sn.sn_upd_req_list)) { - subh = (struct pfsync_subheader *)(m->m_data + offset); - offset += sizeof(*subh); + if (TAILQ_EMPTY(psq)) + continue; + + subh = (struct pfsync_subheader *)(ptr + off); + off += sizeof(*subh); count = 0; - while ((ur = TAILQ_FIRST(&sn.sn_upd_req_list)) != NULL) { - TAILQ_REMOVE(&sn.sn_upd_req_list, ur, ur_snap); + while ((st = TAILQ_FIRST(psq)) != NULL) { + TAILQ_REMOVE(psq, st, sync_list); + count++; - bcopy(&ur->ur_msg, m->m_data + offset, - sizeof(ur->ur_msg)); - offset += sizeof(ur->ur_msg); + KASSERT(st->sync_state == q); + /* the write handler below may override this */ + st->sync_state = PFSYNC_S_NONE; - pool_put(&sc->sc_pool, ur); + pfsync_qs[q].write(st, ptr + off); + off += pfsync_qs[q].len; - count++; + pf_state_unref(st); } - bzero(subh, sizeof(*subh)); - subh->len = sizeof(ur->ur_msg) >> 2; - subh->action = PFSYNC_ACT_UPD_REQ; + subh->action = pfsync_qs[q].action; + subh->len = pfsync_qs[q].len >> 2; subh->count = htons(count); } - /* has someone built a custom region for us to add? */ - if (sn.sn_plus != NULL) { - bcopy(sn.sn_plus, m->m_data + offset, sn.sn_pluslen); - offset += sn.sn_pluslen; - sn.sn_plus = NULL; /* XXX memory leak ? 
*/ - } - #if defined(IPSEC) - if (!TAILQ_EMPTY(&sn.sn_tdb_q)) { - struct tdb *t; + if (!TAILQ_EMPTY(&s->s_tdb_q)) { + struct tdb *tdb; - subh = (struct pfsync_subheader *)(m->m_data + offset); - offset += sizeof(*subh); + subh = (struct pfsync_subheader *)(ptr + off); + off += sizeof(*subh); count = 0; - while ((t = TAILQ_FIRST(&sn.sn_tdb_q)) != NULL) { - TAILQ_REMOVE(&sn.sn_tdb_q, t, tdb_sync_snap); - pfsync_out_tdb(t, m->m_data + offset); - offset += sizeof(struct pfsync_tdb); - mtx_enter(&t->tdb_mtx); - KASSERT(ISSET(t->tdb_flags, TDBF_PFSYNC_SNAPPED)); - CLR(t->tdb_flags, TDBF_PFSYNC_SNAPPED); - CLR(t->tdb_flags, TDBF_PFSYNC); - mtx_leave(&t->tdb_mtx); - tdb_unref(t); + while ((tdb = TAILQ_FIRST(&s->s_tdb_q)) != NULL) { + TAILQ_REMOVE(&s->s_tdb_q, tdb, tdb_sync_entry); count++; + + pfsync_tdb_enter(tdb); + KASSERT(ISSET(tdb->tdb_flags, TDBF_PFSYNC)); + + /* get a consistent view of the counters */ + pfsync_out_tdb(tdb, ptr + off); + + CLR(tdb->tdb_flags, TDBF_PFSYNC); + pfsync_tdb_leave(tdb); + + off += sizeof(struct pfsync_tdb); } - bzero(subh, sizeof(*subh)); subh->action = PFSYNC_ACT_TDB; subh->len = sizeof(struct pfsync_tdb) >> 2; subh->count = htons(count); } #endif - /* walk the queues */ - for (q = 0; q < PFSYNC_S_COUNT; q++) { - if (TAILQ_EMPTY(&sn.sn_qs[q])) - continue; - - subh = (struct pfsync_subheader *)(m->m_data + offset); - offset += sizeof(*subh); - - count = 0; - while ((st = TAILQ_FIRST(&sn.sn_qs[q])) != NULL) { - mtx_enter(&st->mtx); - TAILQ_REMOVE(&sn.sn_qs[q], st, sync_snap); - KASSERT(st->sync_state == q); - KASSERT(st->snapped == 1); - st->sync_state = PFSYNC_S_NONE; - st->snapped = 0; - pfsync_qs[q].write(st, m->m_data + offset); - offset += pfsync_qs[q].len; - mtx_leave(&st->mtx); + timeout_del(&s->s_tmo); + s->s_len = PFSYNC_MINPKT; - pf_state_unref(st); - count++; - } + return (m); +drop: + m_freem(m); + pfsyncstat_inc(pfsyncs_onomem); + pfsync_slice_drop(sc, s); + return (NULL); +} - bzero(subh, sizeof(*subh)); - subh->action = pfsync_qs[q].action; - subh->len = pfsync_qs[q].len >> 2; - subh->count = htons(count); - } +static void +pfsync_sendout(struct pfsync_softc *sc, struct mbuf *m) +{ + struct ip_moptions imo; + unsigned int len = m->m_pkthdr.len; +#if NBPF > 0 + caddr_t if_bpf = sc->sc_if.if_bpf; + if (if_bpf) + bpf_mtap(if_bpf, m, BPF_DIRECTION_OUT); +#endif - /* we're done, let's put it on the wire */ -#if NBPFILTER > 0 - if (ifp->if_bpf) { - m->m_data += sizeof(*ip); - m->m_len = m->m_pkthdr.len = sn.sn_len - sizeof(*ip); - bpf_mtap(ifp->if_bpf, m, BPF_DIRECTION_OUT); - m->m_data -= sizeof(*ip); - m->m_len = m->m_pkthdr.len = sn.sn_len; - } + imo.imo_ifidx = sc->sc_sync_ifidx; + imo.imo_ttl = PFSYNC_DFLTTL; + imo.imo_loop = 0; - if (sc->sc_sync_ifidx == 0) { - sc->sc_len = PFSYNC_MINPKT; - m_freem(m); - return; + if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &imo, NULL, 0) == 0) { + counters_pkt(sc->sc_if.if_counters, ifc_opackets, + ifc_obytes, len); + pfsyncstat_inc(pfsyncs_opackets); + } else { + counters_inc(sc->sc_if.if_counters, ifc_oerrors); + pfsyncstat_inc(pfsyncs_oerrors); } -#endif +} - sc->sc_if.if_opackets++; - sc->sc_if.if_obytes += m->m_pkthdr.len; +static void +pfsync_slice_tmo(void *arg) +{ + struct pfsync_slice *s = arg; - m->m_pkthdr.ph_rtableid = sc->sc_if.if_rdomain; + task_add(s->s_softnet, &s->s_task); +} - pfsync_send_pkt(m); +static void +pfsync_slice_sched(struct pfsync_slice *s) +{ + s->s_stat_task_add++; + task_add(s->s_softnet, &s->s_task); } -void -pfsync_insert_state(struct pf_state *st) +static void 
+pfsync_slice_task(void *arg) { - struct pfsync_softc *sc = pfsyncif; + struct pfsync_slice *s = arg; + struct mbuf *m; - NET_ASSERT_LOCKED(); + mtx_enter(&s->s_mtx); + s->s_stat_task_run++; - if (ISSET(st->rule.ptr->rule_flag, PFRULE_NOSYNC) || - st->key[PF_SK_WIRE]->proto == IPPROTO_PFSYNC) { - SET(st->state_flags, PFSTATE_NOSYNC); - return; + m = pfsync_slice_write(s); + mtx_leave(&s->s_mtx); + if (m != NULL) { + NET_LOCK(); + pfsync_sendout(s->s_pfsync, m); + NET_UNLOCK(); } +} - if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING) || - ISSET(st->state_flags, PFSTATE_NOSYNC)) - return; +static void +pfsync_slice_sendq(void *arg) +{ + struct pfsync_slice *s = arg; + struct mbuf_list ml; + struct mbuf *m; - if (sc->sc_len == PFSYNC_MINPKT) - timeout_add_sec(&sc->sc_tmo, 1); + mq_delist(&s->s_sendq, &ml); + if (ml_empty(&ml)) + return; - pfsync_q_ins(st, PFSYNC_S_INS); + mtx_enter(&s->s_mtx); + s->s_stat_dequeue++; + mtx_leave(&s->s_mtx); - st->sync_updates = 0; + NET_LOCK(); + while ((m = ml_dequeue(&ml)) != NULL) + pfsync_sendout(s->s_pfsync, m); + NET_UNLOCK(); } -int -pfsync_defer(struct pf_state *st, struct mbuf *m, struct pfsync_deferral **ppd) +static void +pfsync_q_ins(struct pfsync_slice *s, struct pf_state *st, unsigned int q) { - struct pfsync_softc *sc = pfsyncif; - struct pfsync_deferral *pd; - unsigned int sched; - - NET_ASSERT_LOCKED(); + size_t nlen = pfsync_qs[q].len; + struct mbuf *m = NULL; - if (!sc->sc_defer || - ISSET(st->state_flags, PFSTATE_NOSYNC) || - m->m_flags & (M_BCAST|M_MCAST)) - return (0); + MUTEX_ASSERT_LOCKED(&s->s_mtx); + KASSERT(st->sync_state == PFSYNC_S_NONE); + KASSERT(s->s_len >= PFSYNC_MINPKT); - pd = pool_get(&sc->sc_pool, M_NOWAIT); - if (pd == NULL) - return (0); + if (TAILQ_EMPTY(&s->s_qs[q])) + nlen += sizeof(struct pfsync_subheader); - /* - * deferral queue grows faster, than timeout can consume, - * we have to ask packet (caller) to help timer and dispatch - * one deferral for us. - * - * We wish to call pfsync_undefer() here. Unfortunately we can't, - * because pfsync_undefer() will be calling to ip_output(), - * which in turn will call to pf_test(), which would then attempt - * to grab PF_LOCK() we currently hold. 
- */ - if (sc->sc_deferred >= 128) { - mtx_enter(&sc->sc_deferrals_mtx); - *ppd = TAILQ_FIRST(&sc->sc_deferrals); - if (*ppd != NULL) { - TAILQ_REMOVE(&sc->sc_deferrals, *ppd, pd_entry); - sc->sc_deferred--; + if (s->s_len + nlen > s->s_pfsync->sc_if.if_mtu) { + m = pfsync_slice_write(s); + if (m != NULL) { + s->s_stat_enqueue++; + if (mq_enqueue(&s->s_sendq, m) == 0) + task_add(s->s_softnet, &s->s_send); } - mtx_leave(&sc->sc_deferrals_mtx); - } else - *ppd = NULL; - - m->m_pkthdr.pf.flags |= PF_TAG_GENERATED; - SET(st->state_flags, PFSTATE_ACK); - pd->pd_st = pf_state_ref(st); - pd->pd_m = m; + nlen = sizeof(struct pfsync_subheader) + pfsync_qs[q].len; + } - pd->pd_deadline = getnsecuptime() + PFSYNC_DEFER_NSEC; + s->s_len += nlen; + pf_state_ref(st); + TAILQ_INSERT_TAIL(&s->s_qs[q], st, sync_list); + st->sync_state = q; - mtx_enter(&sc->sc_deferrals_mtx); - sched = TAILQ_EMPTY(&sc->sc_deferrals); + if (!timeout_pending(&s->s_tmo)) + timeout_add_sec(&s->s_tmo, 1); +} - TAILQ_INSERT_TAIL(&sc->sc_deferrals, pd, pd_entry); - sc->sc_deferred++; - mtx_leave(&sc->sc_deferrals_mtx); +static void +pfsync_q_del(struct pfsync_slice *s, struct pf_state *st) +{ + unsigned int q = st->sync_state; - if (sched) - timeout_add_nsec(&sc->sc_deferrals_tmo, PFSYNC_DEFER_NSEC); + MUTEX_ASSERT_LOCKED(&s->s_mtx); + KASSERT(st->sync_state < PFSYNC_S_NONE); - schednetisr(NETISR_PFSYNC); + st->sync_state = PFSYNC_S_NONE; + TAILQ_REMOVE(&s->s_qs[q], st, sync_list); + pf_state_unref(st); + s->s_len -= pfsync_qs[q].len; - return (1); + if (TAILQ_EMPTY(&s->s_qs[q])) + s->s_len -= sizeof(struct pfsync_subheader); } +/* + * the pfsync hooks that pf calls + */ + void -pfsync_undefer_notify(struct pfsync_deferral *pd) +pfsync_init_state(struct pf_state *st, const struct pf_state_key *skw, + const struct pf_state_key *sks, int flags) { - struct pf_pdesc pdesc; - struct pf_state *st = pd->pd_st; + /* this is called before pf_state_insert */ - /* - * pf_remove_state removes the state keys and sets st->timeout - * to PFTM_UNLINKED. this is done under NET_LOCK which should - * be held here, so we can use PFTM_UNLINKED as a test for - * whether the state keys are set for the address family - * lookup. 
- */ + if (skw->proto == IPPROTO_PFSYNC) + SET(st->state_flags, PFSTATE_NOSYNC); - if (st->timeout == PFTM_UNLINKED) + if (ISSET(st->state_flags, PFSTATE_NOSYNC)) { + st->sync_state = PFSYNC_S_DEAD; return; + } - if (st->rt == PF_ROUTETO) { - if (pf_setup_pdesc(&pdesc, st->key[PF_SK_WIRE]->af, - st->direction, st->kif, pd->pd_m, NULL) != PF_PASS) - return; - switch (st->key[PF_SK_WIRE]->af) { - case AF_INET: - pf_route(&pdesc, st); - break; -#ifdef INET6 - case AF_INET6: - pf_route6(&pdesc, st); - break; -#endif /* INET6 */ - default: - unhandled_af(st->key[PF_SK_WIRE]->af); - } - pd->pd_m = pdesc.m; - } else { - switch (st->key[PF_SK_WIRE]->af) { - case AF_INET: - ip_output(pd->pd_m, NULL, NULL, 0, NULL, NULL, 0); - break; -#ifdef INET6 - case AF_INET6: - ip6_output(pd->pd_m, NULL, NULL, 0, NULL, NULL); - break; -#endif /* INET6 */ - default: - unhandled_af(st->key[PF_SK_WIRE]->af); - } - - pd->pd_m = NULL; + if (ISSET(flags, PFSYNC_SI_IOCTL)) { + /* all good */ + return; } -} -void -pfsync_free_deferral(struct pfsync_deferral *pd) -{ - struct pfsync_softc *sc = pfsyncif; + /* state came off the wire */ + if (ISSET(st->state_flags, PFSTATE_ACK)) { + CLR(st->state_flags, PFSTATE_ACK); - pf_state_unref(pd->pd_st); - m_freem(pd->pd_m); - pool_put(&sc->sc_pool, pd); + /* peer wants an iack, not an insert */ + st->sync_state = PFSYNC_S_SYNC; + } } void -pfsync_undefer(struct pfsync_deferral *pd, int drop) +pfsync_insert_state(struct pf_state *st) { - struct pfsync_softc *sc = pfsyncif; + struct pfsync_softc *sc; - NET_ASSERT_LOCKED(); + MUTEX_ASSERT_UNLOCKED(&st->mtx); - if (sc == NULL) + if (ISSET(st->state_flags, PFSTATE_NOSYNC) || + st->sync_state == PFSYNC_S_DEAD) return; - CLR(pd->pd_st->state_flags, PFSTATE_ACK); - if (!drop) - pfsync_undefer_notify(pd); + smr_read_enter(); + sc = SMR_PTR_GET(&pfsyncif); + if (sc != NULL) { + struct pfsync_slice *s = pfsync_slice_enter(sc, st); + + switch (st->sync_state) { + case PFSYNC_S_UPD_C: + /* we must have lost a race after insert */ + pfsync_q_del(s, st); + /* FALLTHROUGH */ + case PFSYNC_S_NONE: + pfsync_q_ins(s, st, PFSYNC_S_INS); + break; + case PFSYNC_S_SYNC: + st->sync_state = PFSYNC_S_NONE; /* gross */ + pfsync_q_ins(s, st, PFSYNC_S_IACK); + pfsync_slice_sched(s); /* the peer is waiting */ + break; + default: + panic("%s: state %p unexpected sync_state %d", + __func__, st, st->sync_state); + /* NOTREACHED */ + } - pfsync_free_deferral(pd); + pfsync_slice_leave(sc, s); + } + smr_read_leave(); } void -pfsync_deferrals_tmo(void *arg) +pfsync_update_state(struct pf_state *st) { - struct pfsync_softc *sc = arg; - struct pfsync_deferral *pd; - uint64_t now, nsec = 0; - struct pfsync_deferrals pds = TAILQ_HEAD_INITIALIZER(pds); + struct pfsync_softc *sc; - now = getnsecuptime(); + MUTEX_ASSERT_UNLOCKED(&st->mtx); - mtx_enter(&sc->sc_deferrals_mtx); - for (;;) { - pd = TAILQ_FIRST(&sc->sc_deferrals); - if (pd == NULL) + if (ISSET(st->state_flags, PFSTATE_NOSYNC) || + st->sync_state == PFSYNC_S_DEAD) + return; + + smr_read_enter(); + sc = SMR_PTR_GET(&pfsyncif); + if (sc != NULL) { + struct pfsync_slice *s = pfsync_slice_enter(sc, st); + int sync = 0; + + switch (st->sync_state) { + case PFSYNC_S_UPD_C: + case PFSYNC_S_UPD: + /* we're already handling it */ + if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) { + st->sync_updates++; + if (st->sync_updates >= sc->sc_maxupdates) + sync = 1; + } + /* FALLTHROUGH */ + case PFSYNC_S_INS: + case PFSYNC_S_DEL: + case PFSYNC_S_DEAD: break; - if (now < pd->pd_deadline) { - nsec = pd->pd_deadline - now; + case 
PFSYNC_S_IACK: + pfsync_q_del(s, st); + /* FALLTHROUGH */ + case PFSYNC_S_NONE: + pfsync_q_ins(s, st, PFSYNC_S_UPD_C); + st->sync_updates = 0; break; + default: + panic("%s: state %p unexpected sync_state %d", + __func__, st, st->sync_state); + /* NOTREACHED */ } - TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry); - sc->sc_deferred--; - TAILQ_INSERT_TAIL(&pds, pd, pd_entry); - } - mtx_leave(&sc->sc_deferrals_mtx); - - if (nsec > 0) { - /* we were looking at a pd, but it wasn't old enough */ - timeout_add_nsec(&sc->sc_deferrals_tmo, nsec); - } - - if (TAILQ_EMPTY(&pds)) - return; - - NET_LOCK(); - while ((pd = TAILQ_FIRST(&pds)) != NULL) { - TAILQ_REMOVE(&pds, pd, pd_entry); + if (!sync && (getuptime() - st->pfsync_time) < 2) + sync = 1; - pfsync_undefer(pd, 0); + if (sync) + pfsync_slice_sched(s); + pfsync_slice_leave(sc, s); } - NET_UNLOCK(); + smr_read_leave(); } void -pfsync_deferred(struct pf_state *st, int drop) +pfsync_delete_state(struct pf_state *st) { - struct pfsync_softc *sc = pfsyncif; - struct pfsync_deferral *pd; + struct pfsync_softc *sc; - NET_ASSERT_LOCKED(); + MUTEX_ASSERT_UNLOCKED(&st->mtx); + + if (ISSET(st->state_flags, PFSTATE_NOSYNC) || + st->sync_state == PFSYNC_S_DEAD) + return; + + smr_read_enter(); + sc = SMR_PTR_GET(&pfsyncif); + if (sc != NULL) { + struct pfsync_slice *s = pfsync_slice_enter(sc, st); + + switch (st->sync_state) { + case PFSYNC_S_INS: + /* let's pretend this never happened */ + pfsync_q_del(s, st); + break; - mtx_enter(&sc->sc_deferrals_mtx); - TAILQ_FOREACH(pd, &sc->sc_deferrals, pd_entry) { - if (pd->pd_st == st) { - TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry); - sc->sc_deferred--; + case PFSYNC_S_UPD_C: + case PFSYNC_S_UPD: + case PFSYNC_S_IACK: + pfsync_q_del(s, st); + /* FALLTHROUGH */ + case PFSYNC_S_NONE: + pfsync_q_ins(s, st, PFSYNC_S_DEL); + st->sync_updates = 0; break; + case PFSYNC_S_DEL: + case PFSYNC_S_DEAD: + /* XXX we should count this */ + break; + default: + panic("%s: state %p unexpected sync_state %d", + __func__, st, st->sync_state); + /* NOTREACHED */ } - } - mtx_leave(&sc->sc_deferrals_mtx); - if (pd != NULL) - pfsync_undefer(pd, drop); + pfsync_slice_leave(sc, s); + } + smr_read_leave(); } +struct pfsync_subh_clr { + struct pfsync_subheader subh; + struct pfsync_clr clr; +} __packed __aligned(4); + void -pfsync_update_state(struct pf_state *st) +pfsync_clear_states(u_int32_t creatorid, const char *ifname) { - struct pfsync_softc *sc = pfsyncif; - int sync = 0; + struct pfsync_softc *sc; + struct pfsync_subh_clr *h; + struct mbuf *m; + unsigned int hlen, mlen; - NET_ASSERT_LOCKED(); + smr_read_enter(); + sc = SMR_PTR_GET(&pfsyncif); + if (sc != NULL) + refcnt_take(&sc->sc_refs); + smr_read_leave(); - if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING)) + if (sc == NULL) return; - if (ISSET(st->state_flags, PFSTATE_ACK)) - pfsync_deferred(st, 0); - if (ISSET(st->state_flags, PFSTATE_NOSYNC)) { - if (st->sync_state != PFSYNC_S_NONE) - pfsync_q_del(st); - return; - } + hlen = sizeof(sc->sc_template) + + sizeof(struct pfsync_header) + + sizeof(*h); - if (sc->sc_len == PFSYNC_MINPKT) - timeout_add_sec(&sc->sc_tmo, 1); + mlen = max_linkhdr + hlen; - switch (st->sync_state) { - case PFSYNC_S_UPD_C: - case PFSYNC_S_UPD: - case PFSYNC_S_INS: - /* we're already handling it */ + m = m_gethdr(M_DONTWAIT, MT_DATA); + if (m == NULL) { + /* count error */ + goto leave; + } - if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) { - st->sync_updates++; - if (st->sync_updates >= sc->sc_maxupdates) - sync = 1; + if (mlen > MHLEN) { + 
MCLGETL(m, M_DONTWAIT, mlen); + if (!ISSET(m->m_flags, M_EXT)) { + m_freem(m); + goto leave; } - break; + } - case PFSYNC_S_IACK: - pfsync_q_del(st); - case PFSYNC_S_NONE: - pfsync_q_ins(st, PFSYNC_S_UPD_C); - st->sync_updates = 0; - break; + m_align(m, sizeof(*h)); + h = mtod(m, struct pfsync_subh_clr *); - case PFSYNC_S_DEL: - case PFSYNC_S_COUNT: - case PFSYNC_S_DEFER: - break; + h->subh.action = PFSYNC_ACT_CLR; + h->subh.len = sizeof(h->clr) >> 2; + h->subh.count = htons(1); - default: - panic("pfsync_update_state: unexpected sync state %d", - st->sync_state); - } + strlcpy(h->clr.ifname, ifname, sizeof(h->clr.ifname)); + h->clr.creatorid = creatorid; - if (sync || (getuptime() - st->pfsync_time) < 2) - schednetisr(NETISR_PFSYNC); -} + m->m_pkthdr.len = m->m_len = sizeof(*h); + m = pfsync_encap(sc, m); + if (m == NULL) + goto leave; -void -pfsync_cancel_full_update(struct pfsync_softc *sc) -{ - if (timeout_pending(&sc->sc_bulkfail_tmo) || - timeout_pending(&sc->sc_bulk_tmo)) { -#if NCARP > 0 - if (!pfsync_sync_ok) - carp_group_demote_adj(&sc->sc_if, -1, - "pfsync bulk cancelled"); - if (sc->sc_initial_bulk) { - carp_group_demote_adj(&sc->sc_if, -32, - "pfsync init"); - sc->sc_initial_bulk = 0; - } -#endif - pfsync_sync_ok = 1; - DPFPRINTF(LOG_INFO, "cancelling bulk update"); - } - timeout_del(&sc->sc_bulkfail_tmo); - timeout_del(&sc->sc_bulk_tmo); - sc->sc_bulk_next = NULL; - sc->sc_bulk_last = NULL; - sc->sc_ureq_sent = 0; - sc->sc_bulk_tries = 0; + pfsync_sendout(sc, m); +leave: + refcnt_rele_wake(&sc->sc_refs); } -void -pfsync_request_full_update(struct pfsync_softc *sc) +int +pfsync_state_in_use(struct pf_state *st) { - if (sc->sc_sync_ifidx != 0 && ISSET(sc->sc_if.if_flags, IFF_RUNNING)) { - /* Request a full state table update. */ - sc->sc_ureq_sent = getuptime(); -#if NCARP > 0 - if (!sc->sc_link_demoted && pfsync_sync_ok) - carp_group_demote_adj(&sc->sc_if, 1, - "pfsync bulk start"); -#endif - pfsync_sync_ok = 0; - DPFPRINTF(LOG_INFO, "requesting bulk update"); - PF_LOCK(); - timeout_add(&sc->sc_bulkfail_tmo, 4 * hz + - pf_pool_limits[PF_LIMIT_STATES].limit / - ((sc->sc_if.if_mtu - PFSYNC_MINPKT) / - sizeof(struct pfsync_state))); - PF_UNLOCK(); - pfsync_request_update(0, 0); + struct pfsync_softc *sc; + int rv = 0; + + smr_read_enter(); + sc = SMR_PTR_GET(&pfsyncif); + if (sc != NULL) { + /* + * pfsync bulk sends run inside + * rw_enter_read(&pf_state_list.pfs_rwl), and this + * code (pfsync_state_in_use) is only called from the + * purge code inside + * rw_enter_write(&pf_state_list.pfs_rwl). therefore, + * those two sections are exclusive so we can safely + * look at the bulk send pointers. + */ + /* rw_assert_wrlock(&pf_state_list.pfs_rwl); */ + if (sc->sc_bulk_snd.snd_next == st || + sc->sc_bulk_snd.snd_tail == st) + rv = 1; } + smr_read_leave(); + + return (rv); } -void -pfsync_request_update(u_int32_t creatorid, u_int64_t id) +int +pfsync_defer(struct pf_state *st, struct mbuf *m) { - struct pfsync_softc *sc = pfsyncif; - struct pfsync_upd_req_item *item; - size_t nlen, sclen; - int retry; - - /* - * this code does nothing to prevent multiple update requests for the - * same state being generated. 
- */ + struct pfsync_softc *sc; + struct pfsync_slice *s; + struct pfsync_deferral *pd; + int sched = 0; + int rv = 0; - item = pool_get(&sc->sc_pool, PR_NOWAIT); - if (item == NULL) { - /* XXX stats */ - return; - } + if (ISSET(st->state_flags, PFSTATE_NOSYNC) || + ISSET(m->m_flags, M_BCAST|M_MCAST)) + return (0); - item->ur_msg.id = id; - item->ur_msg.creatorid = creatorid; + smr_read_enter(); + sc = SMR_PTR_GET(&pfsyncif); + if (sc == NULL || !sc->sc_defer) + goto leave; - for (;;) { - mtx_enter(&sc->sc_upd_req_mtx); + pd = pool_get(&pfsync_deferrals_pool, M_NOWAIT); + if (pd == NULL) { + goto leave; + } - nlen = sizeof(struct pfsync_upd_req); - if (TAILQ_EMPTY(&sc->sc_upd_req_list)) - nlen += sizeof(struct pfsync_subheader); + s = pfsync_slice_enter(sc, st); + s->s_stat_defer_add++; - sclen = atomic_add_long_nv(&sc->sc_len, nlen); - retry = (sclen > sc->sc_if.if_mtu); - if (retry) - atomic_sub_long(&sc->sc_len, nlen); - else - TAILQ_INSERT_TAIL(&sc->sc_upd_req_list, item, ur_entry); + pd->pd_st = pf_state_ref(st); + pd->pd_m = m; + pd->pd_deadline = getnsecuptime() + PFSYNC_DEFER_NSEC; - mtx_leave(&sc->sc_upd_req_mtx); + m->m_pkthdr.pf.flags |= PF_TAG_GENERATED; + st->sync_defer = pd; - if (!retry) - break; + sched = s->s_deferred++; + TAILQ_INSERT_TAIL(&s->s_deferrals, pd, pd_entry); - pfsync_sendout(); + if (sched == 0) + timeout_add_nsec(&s->s_deferrals_tmo, PFSYNC_DEFER_NSEC); + else if (sched >= PFSYNC_DEFER_LIMIT) { + s->s_stat_defer_overlimit++; + timeout_del(&s->s_deferrals_tmo); + task_add(s->s_softnet, &s->s_deferrals_task); } - schednetisr(NETISR_PFSYNC); + pfsync_slice_sched(s); + pfsync_slice_leave(sc, s); + rv = 1; +leave: + smr_read_leave(); + + return (rv); } -void -pfsync_update_state_req(struct pf_state *st) +static void +pfsync_deferred(struct pfsync_softc *sc, struct pf_state *st) { - struct pfsync_softc *sc = pfsyncif; - - if (sc == NULL) - panic("pfsync_update_state_req: nonexistent instance"); + struct pfsync_slice *s; + struct pfsync_deferral *pd; - if (ISSET(st->state_flags, PFSTATE_NOSYNC)) { - if (st->sync_state != PFSYNC_S_NONE) - pfsync_q_del(st); - return; - } + s = pfsync_slice_enter(sc, st); - switch (st->sync_state) { - case PFSYNC_S_UPD_C: - case PFSYNC_S_IACK: - pfsync_q_del(st); - case PFSYNC_S_NONE: - pfsync_q_ins(st, PFSYNC_S_UPD); - schednetisr(NETISR_PFSYNC); - return; + pd = st->sync_defer; + if (pd != NULL) { + s->s_stat_defer_ack++; - case PFSYNC_S_INS: - case PFSYNC_S_UPD: - case PFSYNC_S_DEL: - /* we're already handling it */ - return; + TAILQ_REMOVE(&s->s_deferrals, pd, pd_entry); + s->s_deferred--; - default: - panic("pfsync_update_state_req: unexpected sync state %d", - st->sync_state); + st = pd->pd_st; + st->sync_defer = NULL; } + pfsync_slice_leave(sc, s); + + if (pd != NULL) + pfsync_defer_output(pd); } -void -pfsync_delete_state(struct pf_state *st) +static void +pfsync_deferrals_tmo(void *arg) { - struct pfsync_softc *sc = pfsyncif; + struct pfsync_slice *s = arg; - NET_ASSERT_LOCKED(); + if (READ_ONCE(s->s_deferred) > 0) + task_add(s->s_softnet, &s->s_deferrals_task); +} - if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING)) - return; +static void +pfsync_deferrals_task(void *arg) +{ + struct pfsync_slice *s = arg; + struct pfsync_deferral *pd; + struct pf_state *st; + uint64_t now, nsec = 0; + struct pfsync_deferrals pds = TAILQ_HEAD_INITIALIZER(pds); - if (ISSET(st->state_flags, PFSTATE_ACK)) - pfsync_deferred(st, 1); - if (ISSET(st->state_flags, PFSTATE_NOSYNC)) { - if (st->sync_state != PFSYNC_S_NONE) - pfsync_q_del(st); 
- return; - } + now = getnsecuptime(); + + mtx_enter(&s->s_mtx); + s->s_stat_defer_run++; /* maybe move this into the loop */ + for (;;) { + pd = TAILQ_FIRST(&s->s_deferrals); + if (pd == NULL) + break; - if (sc->sc_len == PFSYNC_MINPKT) - timeout_add_sec(&sc->sc_tmo, 1); + if (s->s_deferred < PFSYNC_DEFER_LIMIT && + now < pd->pd_deadline) { + nsec = pd->pd_deadline - now; + break; + } - switch (st->sync_state) { - case PFSYNC_S_INS: - /* we never got to tell the world so just forget about it */ - pfsync_q_del(st); - return; + TAILQ_REMOVE(&s->s_deferrals, pd, pd_entry); + s->s_deferred--; - case PFSYNC_S_UPD_C: - case PFSYNC_S_UPD: - case PFSYNC_S_IACK: - pfsync_q_del(st); /* - * FALLTHROUGH to putting it on the del list - * Note on reference count bookkeeping: - * pfsync_q_del() drops reference for queue - * ownership. But the st entry survives, because - * our caller still holds a reference. + * detach the pd from the state. the pd still refers + * to the state though. */ + st = pd->pd_st; + st->sync_defer = NULL; - case PFSYNC_S_NONE: - /* - * We either fall through here, or there is no reference to - * st owned by pfsync queues at this point. - * - * Calling pfsync_q_ins() puts st to del queue. The pfsync_q_ins() - * grabs a reference for delete queue. - */ - pfsync_q_ins(st, PFSYNC_S_DEL); + TAILQ_INSERT_TAIL(&pds, pd, pd_entry); + } + mtx_leave(&s->s_mtx); + + if (nsec > 0) { + /* we were looking at a pd, but it wasn't old enough */ + timeout_add_nsec(&s->s_deferrals_tmo, nsec); + } + + if (TAILQ_EMPTY(&pds)) return; - default: - panic("pfsync_delete_state: unexpected sync state %d", - st->sync_state); + NET_LOCK(); + while ((pd = TAILQ_FIRST(&pds)) != NULL) { + TAILQ_REMOVE(&pds, pd, pd_entry); + + pfsync_defer_output(pd); } + NET_UNLOCK(); } -void -pfsync_clear_states(u_int32_t creatorid, const char *ifname) +static void +pfsync_defer_output(struct pfsync_deferral *pd) { - struct pfsync_softc *sc = pfsyncif; - struct { - struct pfsync_subheader subh; - struct pfsync_clr clr; - } __packed r; + struct pf_pdesc pdesc; + struct pf_state *st = pd->pd_st; - NET_ASSERT_LOCKED(); + if (st->rt == PF_ROUTETO) { + if (pf_setup_pdesc(&pdesc, st->key[PF_SK_WIRE]->af, + st->direction, NULL, pd->pd_m, NULL) != PF_PASS) + return; + switch (st->key[PF_SK_WIRE]->af) { + case AF_INET: + pf_route(&pdesc, st); + break; +#ifdef INET6 + case AF_INET6: + pf_route6(&pdesc, st); + break; +#endif /* INET6 */ + default: + unhandled_af(st->key[PF_SK_WIRE]->af); + } + pd->pd_m = pdesc.m; + } else { + switch (st->key[PF_SK_WIRE]->af) { + case AF_INET: + ip_output(pd->pd_m, NULL, NULL, 0, NULL, NULL, 0); + break; +#ifdef INET6 + case AF_INET6: + ip6_output(pd->pd_m, NULL, NULL, 0, NULL, NULL); + break; +#endif /* INET6 */ + default: + unhandled_af(st->key[PF_SK_WIRE]->af); + } - if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING)) - return; + pd->pd_m = NULL; + } - bzero(&r, sizeof(r)); + pf_state_unref(st); + m_freem(pd->pd_m); + pool_put(&pfsync_deferrals_pool, pd); +} - r.subh.action = PFSYNC_ACT_CLR; - r.subh.len = sizeof(struct pfsync_clr) >> 2; - r.subh.count = htons(1); +struct pfsync_subh_bus { + struct pfsync_subheader subh; + struct pfsync_bus bus; +} __packed __aligned(4); - strlcpy(r.clr.ifname, ifname, sizeof(r.clr.ifname)); - r.clr.creatorid = creatorid; +static unsigned int +pfsync_bulk_snd_bus(struct pfsync_softc *sc, + struct mbuf *m, const unsigned int space, + uint32_t endtime, uint8_t status) +{ + struct pfsync_subh_bus *h; + unsigned int nlen; - pfsync_send_plus(&r, sizeof(r)); -} + 
nlen = m->m_len + sizeof(*h); + if (space < nlen) + return (0); -void -pfsync_iack(struct pf_state *st) -{ - pfsync_q_ins(st, PFSYNC_S_IACK); - schednetisr(NETISR_PFSYNC); + h = (struct pfsync_subh_bus *)(mtod(m, caddr_t) + m->m_len); + memset(h, 0, sizeof(*h)); + + h->subh.action = PFSYNC_ACT_BUS; + h->subh.len = sizeof(h->bus) >> 2; + h->subh.count = htons(1); + + h->bus.creatorid = pf_status.hostid; + h->bus.endtime = htonl(endtime); + h->bus.status = status; + + m->m_len = nlen; + + return (1); } -void -pfsync_q_ins(struct pf_state *st, int q) +static unsigned int +pfsync_bulk_snd_states(struct pfsync_softc *sc, + struct mbuf *m, const unsigned int space, unsigned int len) { - struct pfsync_softc *sc = pfsyncif; - size_t nlen, sclen; + struct pf_state *st; + struct pfsync_state *sp; + unsigned int nlen; + unsigned int count = 0; - if (sc->sc_len < PFSYNC_MINPKT) - panic("pfsync pkt len is too low %zd", sc->sc_len); - do { - mtx_enter(&sc->sc_st_mtx); - mtx_enter(&st->mtx); + st = sc->sc_bulk_snd.snd_next; - /* - * There are either two threads trying to update the - * the same state, or the state is just being processed - * (is on snapshot queue). - */ - if (st->sync_state != PFSYNC_S_NONE) { - mtx_leave(&st->mtx); - mtx_leave(&sc->sc_st_mtx); + for (;;) { + nlen = len + sizeof(*sp); + sp = (struct pfsync_state *)(mtod(m, caddr_t) + len); + if (space < nlen) break; - } - nlen = pfsync_qs[q].len; + mtx_enter(&st->mtx); + pf_state_export(sp, st); + mtx_leave(&st->mtx); + + /* commit */ + count++; + m->m_len = len = nlen; - if (TAILQ_EMPTY(&sc->sc_qs[q])) - nlen += sizeof(struct pfsync_subheader); + if (st == sc->sc_bulk_snd.snd_tail) { + if (pfsync_bulk_snd_bus(sc, m, space, + 0, PFSYNC_BUS_END) == 0) { + /* couldn't fit the BUS */ + st = NULL; + break; + } - sclen = atomic_add_long_nv(&sc->sc_len, nlen); - if (sclen > sc->sc_if.if_mtu) { - atomic_sub_long(&sc->sc_len, nlen); - mtx_leave(&st->mtx); - mtx_leave(&sc->sc_st_mtx); - pfsync_sendout(); - continue; + /* this BUS is done */ + pfsync_dprintf(sc, "bulk send done (%s)", __func__); + sc->sc_bulk_snd.snd_again = 0; /* XXX */ + sc->sc_bulk_snd.snd_next = NULL; + sc->sc_bulk_snd.snd_tail = NULL; + return (count); } - pf_state_ref(st); + st = TAILQ_NEXT(st, entry_list); + } - TAILQ_INSERT_TAIL(&sc->sc_qs[q], st, sync_list); - st->sync_state = q; - mtx_leave(&st->mtx); - mtx_leave(&sc->sc_st_mtx); - } while (0); + /* there's still work to do */ + sc->sc_bulk_snd.snd_next = st; + timeout_add_msec(&sc->sc_bulk_snd.snd_tmo, PFSYNC_BULK_SND_IVAL_MS); + + return (count); } -void -pfsync_q_del(struct pf_state *st) +static unsigned int +pfsync_bulk_snd_sub(struct pfsync_softc *sc, + struct mbuf *m, const unsigned int space) { - struct pfsync_softc *sc = pfsyncif; - int q; + struct pfsync_subheader *subh; + unsigned int count; + unsigned int len, nlen; + + len = m->m_len; + nlen = len + sizeof(*subh); + if (nlen > space) + return (0); + + subh = (struct pfsync_subheader *)(mtod(m, caddr_t) + len); - mtx_enter(&sc->sc_st_mtx); - mtx_enter(&st->mtx); - q = st->sync_state; /* - * re-check under mutex - * if state is snapped already, then just bail out, because we came - * too late, the state is being just processed/dispatched to peer. + * pfsync_bulk_snd_states only updates m->m_len after + * filling in a state after the offset we gave it. 
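+	 * ie, the subheader reserved here is only committed once at
+	 * least one state fits; if none fit, m->m_len is unchanged
+	 * and we report no progress back to the caller.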
*/ - if ((q == PFSYNC_S_NONE) || (st->snapped)) { - mtx_leave(&st->mtx); - mtx_leave(&sc->sc_st_mtx); - return; - } - atomic_sub_long(&sc->sc_len, pfsync_qs[q].len); - TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list); - if (TAILQ_EMPTY(&sc->sc_qs[q])) - atomic_sub_long(&sc->sc_len, sizeof (struct pfsync_subheader)); - st->sync_state = PFSYNC_S_NONE; - mtx_leave(&st->mtx); - mtx_leave(&sc->sc_st_mtx); + count = pfsync_bulk_snd_states(sc, m, space, nlen); + if (count == 0) + return (0); - pf_state_unref(st); + subh->action = PFSYNC_ACT_UPD; + subh->len = sizeof(struct pfsync_state) >> 2; + subh->count = htons(count); + + return (count); } -#if defined(IPSEC) -void -pfsync_update_tdb(struct tdb *t, int output) +static void +pfsync_bulk_snd_start(struct pfsync_softc *sc) { - struct pfsync_softc *sc = pfsyncif; - size_t nlen, sclen; + const unsigned int space = sc->sc_if.if_mtu - + (sizeof(struct ip) + sizeof(struct pfsync_header)); + struct mbuf *m; - if (sc == NULL) - return; + rw_enter_read(&pf_state_list.pfs_rwl); - if (!ISSET(t->tdb_flags, TDBF_PFSYNC)) { - do { - mtx_enter(&sc->sc_tdb_mtx); - nlen = sizeof(struct pfsync_tdb); + rw_enter_write(&sc->sc_bulk_snd.snd_lock); + if (sc->sc_bulk_snd.snd_next != NULL) { + sc->sc_bulk_snd.snd_again = 1; + goto leave; + } - mtx_enter(&t->tdb_mtx); - if (ISSET(t->tdb_flags, TDBF_PFSYNC)) { - /* we've lost race, no action for us then */ - mtx_leave(&t->tdb_mtx); - mtx_leave(&sc->sc_tdb_mtx); - break; - } + mtx_enter(&pf_state_list.pfs_mtx); + sc->sc_bulk_snd.snd_next = TAILQ_FIRST(&pf_state_list.pfs_list); + sc->sc_bulk_snd.snd_tail = TAILQ_LAST(&pf_state_list.pfs_list, + pf_state_queue); + mtx_leave(&pf_state_list.pfs_mtx); - if (TAILQ_EMPTY(&sc->sc_tdb_q)) - nlen += sizeof(struct pfsync_subheader); + m = m_gethdr(M_DONTWAIT, MT_DATA); + if (m == NULL) + goto leave; - sclen = atomic_add_long_nv(&sc->sc_len, nlen); - if (sclen > sc->sc_if.if_mtu) { - atomic_sub_long(&sc->sc_len, nlen); - mtx_leave(&t->tdb_mtx); - mtx_leave(&sc->sc_tdb_mtx); - pfsync_sendout(); - continue; - } + MCLGETL(m, M_DONTWAIT, max_linkhdr + sc->sc_if.if_mtu); + if (!ISSET(m->m_flags, M_EXT)) { + /* some error++ */ + m_freem(m); /* drop */ + goto leave; + } - TAILQ_INSERT_TAIL(&sc->sc_tdb_q, t, tdb_sync_entry); - tdb_ref(t); - SET(t->tdb_flags, TDBF_PFSYNC); - mtx_leave(&t->tdb_mtx); + m_align(m, space); + m->m_len = 0; - mtx_leave(&sc->sc_tdb_mtx); - t->tdb_updates = 0; - } while (0); - } else { - if (++t->tdb_updates >= sc->sc_maxupdates) - schednetisr(NETISR_PFSYNC); + if (sc->sc_bulk_snd.snd_tail == NULL) { + pfsync_dprintf(sc, "bulk send empty (%s)", __func__); + + /* list is empty */ + if (pfsync_bulk_snd_bus(sc, m, space, 0, PFSYNC_BUS_END) == 0) + panic("%s: mtu is too low", __func__); + goto encap; } - mtx_enter(&t->tdb_mtx); - if (output) - SET(t->tdb_flags, TDBF_PFSYNC_RPL); - else - CLR(t->tdb_flags, TDBF_PFSYNC_RPL); - mtx_leave(&t->tdb_mtx); + pfsync_dprintf(sc, "bulk send start (%s)", __func__); + + /* start a bulk update. */ + if (pfsync_bulk_snd_bus(sc, m, space, 0, PFSYNC_BUS_START) == 0) + panic("%s: mtu is too low", __func__); + + /* fill it up with state updates. 
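if the whole list doesn't fit,
+	 * pfsync_bulk_snd_states leaves snd_next pointing at the next
+	 * state and arms snd_tmo to carry on from there.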
*/ + pfsync_bulk_snd_sub(sc, m, space); + +encap: + m->m_pkthdr.len = m->m_len; + m = pfsync_encap(sc, m); + if (m == NULL) + goto leave; + + pfsync_sendout(sc, m); + +leave: + rw_exit_write(&sc->sc_bulk_snd.snd_lock); + + rw_exit_read(&pf_state_list.pfs_rwl); } -#endif -#if defined(IPSEC) -void -pfsync_delete_tdb(struct tdb *t) +static void +pfsync_bulk_snd_tmo(void *arg) { - struct pfsync_softc *sc = pfsyncif; - size_t nlen; + struct pfsync_softc *sc = arg; + const unsigned int space = sc->sc_if.if_mtu - + (sizeof(struct ip) + sizeof(struct pfsync_header)); + struct mbuf *m; - if (sc == NULL || !ISSET(t->tdb_flags, TDBF_PFSYNC)) + m = m_gethdr(M_DONTWAIT, MT_DATA); + if (m == NULL) { + /* some error++ */ + /* retry later */ + timeout_add_msec(&sc->sc_bulk_snd.snd_tmo, + PFSYNC_BULK_SND_IVAL_MS); return; + } - mtx_enter(&sc->sc_tdb_mtx); - - /* - * if tdb entry is just being processed (found in snapshot), - * then it can not be deleted. we just came too late - */ - if (ISSET(t->tdb_flags, TDBF_PFSYNC_SNAPPED)) { - mtx_leave(&sc->sc_tdb_mtx); + MCLGETL(m, M_DONTWAIT, max_linkhdr + sc->sc_if.if_mtu); + if (!ISSET(m->m_flags, M_EXT)) { + /* some error++ */ + m_freem(m); + /* retry later */ + timeout_add_msec(&sc->sc_bulk_snd.snd_tmo, + PFSYNC_BULK_SND_IVAL_MS); return; } - TAILQ_REMOVE(&sc->sc_tdb_q, t, tdb_sync_entry); + m_align(m, space); + m->m_len = 0; + + rw_enter_read(&pf_state_list.pfs_rwl); + rw_enter_write(&sc->sc_bulk_snd.snd_lock); - mtx_enter(&t->tdb_mtx); - CLR(t->tdb_flags, TDBF_PFSYNC); - mtx_leave(&t->tdb_mtx); + if (sc->sc_bulk_snd.snd_next == NULL) { + /* there was no space in the previous packet for a BUS END */ - nlen = sizeof(struct pfsync_tdb); - if (TAILQ_EMPTY(&sc->sc_tdb_q)) - nlen += sizeof(struct pfsync_subheader); - atomic_sub_long(&sc->sc_len, nlen); + if (pfsync_bulk_snd_bus(sc, m, space, 0, PFSYNC_BUS_END) == 0) + panic("%s: mtu is too low", __func__); - mtx_leave(&sc->sc_tdb_mtx); + /* this bulk is done */ + pfsync_dprintf(sc, "bulk send done (%s)", __func__); + sc->sc_bulk_snd.snd_again = 0; /* XXX */ + sc->sc_bulk_snd.snd_tail = NULL; + } else { + pfsync_dprintf(sc, "bulk send again (%s)", __func__); + + /* fill it up with state updates. 
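this is the continuation pass,
+	 * so each timeout run emits at most one packet and the bulk
+	 * send is paced rather than dumped onto the wire all at once.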
*/ + pfsync_bulk_snd_sub(sc, m, space); + } + + m->m_pkthdr.len = m->m_len; + m = pfsync_encap(sc, m); - tdb_unref(t); + rw_exit_write(&sc->sc_bulk_snd.snd_lock); + rw_exit_read(&pf_state_list.pfs_rwl); + + if (m != NULL) { + NET_LOCK(); + pfsync_sendout(sc, m); + NET_UNLOCK(); + } } -#endif -void -pfsync_out_tdb(struct tdb *t, void *buf) +static void +pfsync_update_state_req(struct pfsync_softc *sc, struct pf_state *st) +{ + struct pfsync_slice *s = pfsync_slice_enter(sc, st); + + switch (st->sync_state) { + case PFSYNC_S_UPD_C: + case PFSYNC_S_IACK: + pfsync_q_del(s, st); + /* FALLTHROUGH */ + case PFSYNC_S_NONE: + pfsync_q_ins(s, st, PFSYNC_S_UPD); + break; + + case PFSYNC_S_INS: + case PFSYNC_S_UPD: + case PFSYNC_S_DEL: + /* we're already handling it */ + break; + default: + panic("%s: state %p unexpected sync_state %d", + __func__, st, st->sync_state); + } + + pfsync_slice_sched(s); + pfsync_slice_leave(sc, s); +} + +#if defined(IPSEC) +static void +pfsync_out_tdb(struct tdb *tdb, void *buf) { struct pfsync_tdb *ut = buf; - bzero(ut, sizeof(*ut)); - ut->spi = t->tdb_spi; - bcopy(&t->tdb_dst, &ut->dst, sizeof(ut->dst)); + memset(ut, 0, sizeof(*ut)); + ut->spi = tdb->tdb_spi; + memcpy(&ut->dst, &tdb->tdb_dst, sizeof(ut->dst)); /* * When a failover happens, the master's rpl is probably above * what we see here (we may be up to a second late), so @@ -2422,219 +2400,934 @@ pfsync_out_tdb(struct tdb *t, void *buf) * this edge case. */ #define RPL_INCR 16384 - ut->rpl = htobe64(t->tdb_rpl + (ISSET(t->tdb_flags, TDBF_PFSYNC_RPL) ? - RPL_INCR : 0)); - ut->cur_bytes = htobe64(t->tdb_cur_bytes); - ut->sproto = t->tdb_sproto; - ut->rdomain = htons(t->tdb_rdomain); + ut->rpl = htobe64(tdb->tdb_rpl + + (ISSET(tdb->tdb_flags, TDBF_PFSYNC_RPL) ? RPL_INCR : 0)); + ut->cur_bytes = htobe64(tdb->tdb_cur_bytes); + ut->sproto = tdb->tdb_sproto; + ut->rdomain = htons(tdb->tdb_rdomain); } -void -pfsync_bulk_start(void) +static struct pfsync_slice * +pfsync_slice_enter_tdb(struct pfsync_softc *sc, const struct tdb *t) { - struct pfsync_softc *sc = pfsyncif; - - NET_ASSERT_LOCKED(); - /* - * pf gc via pfsync_state_in_use reads sc_bulk_next and - * sc_bulk_last while exclusively holding the pf_state_list - * rwlock. make sure it can't race with us setting these - * pointers. they basically act as hazards, and borrow the - * lists state reference count. + * just use the first slice for all ipsec (for now) until + * it's more obvious what property (eg, spi) we can distribute + * tdbs over slices with. 
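+	 * a single slice means all tdb messages are serialised
+	 * through one queue, which is simple and correct, it just
+	 * doesn't spread the ipsec load the way the state slices do.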
*/ - rw_enter_read(&pf_state_list.pfs_rwl); - - /* get a consistent view of the list pointers */ - mtx_enter(&pf_state_list.pfs_mtx); - if (sc->sc_bulk_next == NULL) - sc->sc_bulk_next = TAILQ_FIRST(&pf_state_list.pfs_list); - - sc->sc_bulk_last = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue); - mtx_leave(&pf_state_list.pfs_mtx); - - rw_exit_read(&pf_state_list.pfs_rwl); - - DPFPRINTF(LOG_INFO, "received bulk update request"); + struct pfsync_slice *s = &sc->sc_slices[0]; - if (sc->sc_bulk_last == NULL) - pfsync_bulk_status(PFSYNC_BUS_END); - else { - sc->sc_ureq_received = getuptime(); - - pfsync_bulk_status(PFSYNC_BUS_START); - timeout_add(&sc->sc_bulk_tmo, 0); + if (!mtx_enter_try(&s->s_mtx)) { + mtx_enter(&s->s_mtx); + s->s_stat_contended++; } + s->s_stat_locks++; + + return (s); } -void -pfsync_bulk_update(void *arg) +static void +pfsync_tdb_ins(struct pfsync_slice *s, struct tdb *tdb) { - struct pfsync_softc *sc; - struct pf_state *st; - int i = 0; - - NET_LOCK(); - sc = pfsyncif; - if (sc == NULL) - goto out; + size_t nlen = sizeof(struct pfsync_tdb); + struct mbuf *m = NULL; - rw_enter_read(&pf_state_list.pfs_rwl); - st = sc->sc_bulk_next; - sc->sc_bulk_next = NULL; + KASSERT(s->s_len >= PFSYNC_MINPKT); - if (st == NULL) { - rw_exit_read(&pf_state_list.pfs_rwl); - goto out; - } + MUTEX_ASSERT_LOCKED(&s->s_mtx); + MUTEX_ASSERT_UNLOCKED(&tdb->tdb_mtx); - for (;;) { - if (st->sync_state == PFSYNC_S_NONE && - st->timeout < PFTM_MAX && - st->pfsync_time <= sc->sc_ureq_received) { - pfsync_update_state_req(st); - i++; - } + if (TAILQ_EMPTY(&s->s_tdb_q)) + nlen += sizeof(struct pfsync_subheader); - st = TAILQ_NEXT(st, entry_list); - if ((st == NULL) || (st == sc->sc_bulk_last)) { - /* we're done */ - sc->sc_bulk_last = NULL; - pfsync_bulk_status(PFSYNC_BUS_END); - break; + if (s->s_len + nlen > s->s_pfsync->sc_if.if_mtu) { + m = pfsync_slice_write(s); + if (m != NULL) { + s->s_stat_enqueue++; + if (mq_enqueue(&s->s_sendq, m) == 0) + task_add(s->s_softnet, &s->s_send); } - if (i > 1 && (sc->sc_if.if_mtu - sc->sc_len) < - sizeof(struct pfsync_state)) { - /* we've filled a packet */ - sc->sc_bulk_next = st; - timeout_add(&sc->sc_bulk_tmo, 1); - break; - } + nlen = sizeof(struct pfsync_subheader) + + sizeof(struct pfsync_tdb); } - rw_exit_read(&pf_state_list.pfs_rwl); - out: - NET_UNLOCK(); + s->s_len += nlen; + TAILQ_INSERT_TAIL(&s->s_tdb_q, tdb, tdb_sync_entry); + tdb->tdb_updates = 0; + + if (!timeout_pending(&s->s_tmo)) + timeout_add_sec(&s->s_tmo, 1); } -void -pfsync_bulk_status(u_int8_t status) +static void +pfsync_tdb_del(struct pfsync_slice *s, struct tdb *tdb) { - struct { - struct pfsync_subheader subh; - struct pfsync_bus bus; - } __packed r; - - struct pfsync_softc *sc = pfsyncif; + MUTEX_ASSERT_LOCKED(&s->s_mtx); + MUTEX_ASSERT_UNLOCKED(&tdb->tdb_mtx); - bzero(&r, sizeof(r)); + TAILQ_REMOVE(&s->s_tdb_q, tdb, tdb_sync_entry); - r.subh.action = PFSYNC_ACT_BUS; - r.subh.len = sizeof(struct pfsync_bus) >> 2; - r.subh.count = htons(1); - - r.bus.creatorid = pf_status.hostid; - r.bus.endtime = htonl(getuptime() - sc->sc_ureq_received); - r.bus.status = status; - - pfsync_send_plus(&r, sizeof(r)); + s->s_len -= sizeof(struct pfsync_tdb); + if (TAILQ_EMPTY(&s->s_tdb_q)) + s->s_len -= sizeof(struct pfsync_subheader); } +/* + * the reference that pfsync has to a tdb is accounted for by the + * TDBF_PFSYNC flag, not by tdb_ref/tdb_unref. tdb_delete_tdb() is + * called after all other references to a tdb are dropped (with + * tdb_unref) as part of the tdb_free(). 
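+ * (TDBF_PFSYNC is only set or cleared while the slice mtx is
+ * held, which is what lets a flag stand in for a refcount here.)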
+ * + * tdb_free() needs to wait for pfsync to let go of the tdb though, + * which would be best handled by a reference count, but tdb_free + * needs the NET_LOCK which pfsync is already fighting with. instead + * use the TDBF_PFSYNC_SNAPPED flag to coordinate the pfsync write/drop + * with tdb_free. + */ + void -pfsync_bulk_fail(void *arg) +pfsync_update_tdb(struct tdb *tdb, int output) { struct pfsync_softc *sc; - NET_LOCK(); - sc = pfsyncif; - if (sc == NULL) - goto out; - if (sc->sc_bulk_tries++ < PFSYNC_MAX_BULKTRIES) { - /* Try again */ - timeout_add_sec(&sc->sc_bulkfail_tmo, 5); - pfsync_request_update(0, 0); - } else { - /* Pretend like the transfer was ok */ - sc->sc_ureq_sent = 0; - sc->sc_bulk_tries = 0; -#if NCARP > 0 - if (!pfsync_sync_ok) - carp_group_demote_adj(&sc->sc_if, -1, - sc->sc_link_demoted ? - "pfsync link state up" : - "pfsync bulk fail"); - if (sc->sc_initial_bulk) { - carp_group_demote_adj(&sc->sc_if, -32, - "pfsync init"); - sc->sc_initial_bulk = 0; - } -#endif - pfsync_sync_ok = 1; - sc->sc_link_demoted = 0; - DPFPRINTF(LOG_ERR, "failed to receive bulk update"); + MUTEX_ASSERT_UNLOCKED(&tdb->tdb_mtx); + + smr_read_enter(); + sc = SMR_PTR_GET(&pfsyncif); + if (sc != NULL) { + struct pfsync_slice *s = pfsync_slice_enter_tdb(sc, tdb); + + /* TDBF_PFSYNC is only changed while the slice mtx is held */ + if (!ISSET(tdb->tdb_flags, TDBF_PFSYNC)) { + mtx_enter(&tdb->tdb_mtx); + SET(tdb->tdb_flags, TDBF_PFSYNC); + mtx_leave(&tdb->tdb_mtx); + + pfsync_tdb_ins(s, tdb); + } else if (++tdb->tdb_updates >= sc->sc_maxupdates) + pfsync_slice_sched(s); + + /* XXX no sync timestamp on tdbs to check */ + + pfsync_slice_leave(sc, s); } - out: - NET_UNLOCK(); + smr_read_leave(); } void -pfsync_send_plus(void *plus, size_t pluslen) +pfsync_delete_tdb(struct tdb *tdb) { - struct pfsync_softc *sc = pfsyncif; + struct pfsync_softc *sc; - if (sc->sc_len + pluslen > sc->sc_if.if_mtu) - pfsync_sendout(); + MUTEX_ASSERT_UNLOCKED(&tdb->tdb_mtx); - sc->sc_plus = plus; - sc->sc_pluslen = pluslen; - atomic_add_long(&sc->sc_len, pluslen); + smr_read_enter(); + sc = SMR_PTR_GET(&pfsyncif); + if (sc != NULL) { + struct pfsync_slice *s = pfsync_slice_enter_tdb(sc, tdb); - pfsync_sendout(); -} + /* TDBF_PFSYNC is only changed while the slice mtx is held */ + if (ISSET(tdb->tdb_flags, TDBF_PFSYNC)) { + pfsync_tdb_del(s, tdb); -int -pfsync_is_up(void) -{ - struct pfsync_softc *sc = pfsyncif; + mtx_enter(&tdb->tdb_mtx); + CLR(tdb->tdb_flags, TDBF_PFSYNC); + mtx_leave(&tdb->tdb_mtx); + } - if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING)) - return (0); + pfsync_slice_leave(sc, s); + } + smr_read_leave(); - return (1); + /* + * handle pfsync_slice_drop being called from pfsync_down + * and the smr/slice access above won't work. 
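+ * pfsync_tdb_leave does a wakeup on tdb_updates whenever it sees
+ * TDBF_PFSYNC_SNAPPED set, so the sleep below can't miss
+ * TDBF_PFSYNC being cleared.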
+ */ + + mtx_enter(&tdb->tdb_mtx); + SET(tdb->tdb_flags, TDBF_PFSYNC_SNAPPED); /* like a thanos snap */ + while (ISSET(tdb->tdb_flags, TDBF_PFSYNC)) { + msleep_nsec(&tdb->tdb_updates, &tdb->tdb_mtx, PWAIT, + "tdbfree", INFSLP); + } + mtx_leave(&tdb->tdb_mtx); } +#endif /* defined(IPSEC) */ -int -pfsync_state_in_use(struct pf_state *st) +struct pfsync_act { + void (*in)(struct pfsync_softc *, const caddr_t, + unsigned int, unsigned int); + size_t len; +}; + +static void pfsync_in_clr(struct pfsync_softc *, + const caddr_t, unsigned int, unsigned int); +static void pfsync_in_iack(struct pfsync_softc *, + const caddr_t, unsigned int, unsigned int); +static void pfsync_in_upd_c(struct pfsync_softc *, + const caddr_t, unsigned int, unsigned int); +static void pfsync_in_ureq(struct pfsync_softc *, + const caddr_t, unsigned int, unsigned int); +static void pfsync_in_del(struct pfsync_softc *, + const caddr_t, unsigned int, unsigned int); +static void pfsync_in_del_c(struct pfsync_softc *, + const caddr_t, unsigned int, unsigned int); +static void pfsync_in_bus(struct pfsync_softc *, + const caddr_t, unsigned int, unsigned int); +static void pfsync_in_tdb(struct pfsync_softc *, + const caddr_t, unsigned int, unsigned int); +static void pfsync_in_ins(struct pfsync_softc *, + const caddr_t, unsigned int, unsigned int); +static void pfsync_in_upd(struct pfsync_softc *, + const caddr_t, unsigned int, unsigned int); + +static const struct pfsync_act pfsync_acts[] = { + [PFSYNC_ACT_CLR] = + { pfsync_in_clr, sizeof(struct pfsync_clr) }, + [PFSYNC_ACT_INS_ACK] = + { pfsync_in_iack, sizeof(struct pfsync_ins_ack) }, + [PFSYNC_ACT_UPD_C] = + { pfsync_in_upd_c, sizeof(struct pfsync_upd_c) }, + [PFSYNC_ACT_UPD_REQ] = + { pfsync_in_ureq, sizeof(struct pfsync_upd_req) }, + [PFSYNC_ACT_DEL] = + { pfsync_in_del, sizeof(struct pfsync_state) }, + [PFSYNC_ACT_DEL_C] = + { pfsync_in_del_c, sizeof(struct pfsync_del_c) }, + [PFSYNC_ACT_BUS] = + { pfsync_in_bus, sizeof(struct pfsync_bus) }, + [PFSYNC_ACT_INS] = + { pfsync_in_ins, sizeof(struct pfsync_state) }, + [PFSYNC_ACT_UPD] = + { pfsync_in_upd, sizeof(struct pfsync_state) }, + [PFSYNC_ACT_TDB] = + { pfsync_in_tdb, sizeof(struct pfsync_tdb) }, +}; + +static void +pfsync_in_skip(struct pfsync_softc *sc, + const caddr_t buf, unsigned int mlen, unsigned int count) { - struct pfsync_softc *sc = pfsyncif; + /* nop */ +} +static struct mbuf * +pfsync_input(struct mbuf *m, uint8_t ttl, unsigned int hlen) +{ + struct pfsync_softc *sc; + struct pfsync_header *ph; + struct pfsync_subheader *subh; + unsigned int len; + void (*in)(struct pfsync_softc *, + const caddr_t, unsigned int, unsigned int); +#if NBPF > 0 + caddr_t if_bpf; +#endif + + pfsyncstat_inc(pfsyncs_ipackets); + + if (!pf_status.running) + return (m); + + /* + * pfsyncif is only set if it is up and running correctly. + */ + smr_read_enter(); + sc = SMR_PTR_GET(&pfsyncif); if (sc == NULL) - return (0); + goto leave; - rw_assert_wrlock(&pf_state_list.pfs_rwl); + if (sc->sc_sync_ifidx != m->m_pkthdr.ph_ifidx) { + pfsyncstat_inc(pfsyncs_badif); + goto leave; + } - if (st->sync_state != PFSYNC_S_NONE || - st == sc->sc_bulk_next || - st == sc->sc_bulk_last) - return (1); +#if NBPF > 0 +#endif - return (0); + /* verify that the IP TTL is 255. 
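a router always decrements
+	 * the ttl, so a forged packet from off the local network
+	 * can't arrive with the initial ttl intact.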
*/ + if (ttl != PFSYNC_DFLTTL) { + pfsyncstat_inc(pfsyncs_badttl); + goto leave; + } + + m_adj(m, hlen); + + if (m->m_pkthdr.len < sizeof(*ph)) { + pfsyncstat_inc(pfsyncs_hdrops); + goto leave; + } + if (m->m_len < sizeof(*ph)) { + m = m_pullup(m, sizeof(*ph)); + if (m == NULL) + goto leave; + } + + ph = mtod(m, struct pfsync_header *); + if (ph->version != PFSYNC_VERSION) { + pfsyncstat_inc(pfsyncs_badver); + goto leave; + } + + len = ntohs(ph->len); + if (m->m_pkthdr.len < len) { + pfsyncstat_inc(pfsyncs_badlen); + goto leave; + } + if (m->m_pkthdr.len > len) + m->m_pkthdr.len = len; + + /* ok, it's serious now */ + refcnt_take(&sc->sc_refs); + smr_read_leave(); + + counters_pkt(sc->sc_if.if_counters, ifc_ipackets, ifc_ibytes, len); + + m_adj(m, sizeof(*ph)); + + while (m->m_pkthdr.len >= sizeof(*subh)) { + unsigned int action, mlen, count; + + if (m->m_len < sizeof(*subh)) { + m = m_pullup(m, sizeof(*subh)); + if (m == NULL) + goto rele; + } + subh = mtod(m, struct pfsync_subheader *); + + action = subh->action; + mlen = subh->len << 2; + count = ntohs(subh->count); + + if (action >= PFSYNC_ACT_MAX || + action >= nitems(pfsync_acts) || + mlen < pfsync_acts[subh->action].len) { + /* + * subheaders are always followed by at least one + * message, so if the peer is new + * enough to tell us how big its messages are then we + * know enough to skip them. + */ + if (count == 0 || mlen == 0) { + pfsyncstat_inc(pfsyncs_badact); + goto rele; + } + + in = pfsync_in_skip; + } else { + in = pfsync_acts[action].in; + if (in == NULL) + in = pfsync_in_skip; + } + + m_adj(m, sizeof(*subh)); + len = mlen * count; + if (len > m->m_pkthdr.len) { + pfsyncstat_inc(pfsyncs_badlen); + goto rele; + } + if (m->m_len < len) { + m = m_pullup(m, len); + if (m == NULL) + goto rele; + } + + (*in)(sc, mtod(m, caddr_t), mlen, count); + m_adj(m, len); + } + +rele: + refcnt_rele_wake(&sc->sc_refs); + return (m); + +leave: + smr_read_leave(); + return (m); } -void -pfsync_timeout(void *arg) +static void +pfsync_in_clr(struct pfsync_softc *sc, + const caddr_t buf, unsigned int mlen, unsigned int count) { - NET_LOCK(); - pfsync_sendout(); - NET_UNLOCK(); + const struct pfsync_clr *clr; + struct pf_state *head, *tail, *st, *next; + struct pfi_kif *kif; + uint32_t creatorid; + unsigned int i; + + rw_enter_read(&pf_state_list.pfs_rwl); + + /* get a view of the state list */ + mtx_enter(&pf_state_list.pfs_mtx); + head = TAILQ_FIRST(&pf_state_list.pfs_list); + tail = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue); + mtx_leave(&pf_state_list.pfs_mtx); + + PF_LOCK(); + for (i = 0; i < count; i++) { + clr = (struct pfsync_clr *)(buf + i * mlen); + + creatorid = clr->creatorid; + if (clr->ifname[0] == '\0') + kif = NULL; + else { + kif = pfi_kif_find(clr->ifname); + if (kif == NULL) + continue; + } + + st = NULL; + next = head; + + PF_STATE_ENTER_WRITE(); + while (st != tail) { + st = next; + next = TAILQ_NEXT(st, entry_list); + + if (creatorid != st->creatorid) + continue; + if (kif != NULL && kif != st->kif) + continue; + + mtx_enter(&st->mtx); + SET(st->state_flags, PFSTATE_NOSYNC); + mtx_leave(&st->mtx); + pf_remove_state(st); + } + PF_STATE_EXIT_WRITE(); + } + PF_UNLOCK(); + + rw_exit_read(&pf_state_list.pfs_rwl); } -/* this is a softnet/netisr handler */ -void -pfsyncintr(void) +static void +pfsync_in_ins(struct pfsync_softc *sc, + const caddr_t buf, unsigned int mlen, unsigned int count) +{ + const struct pfsync_state *sp; + sa_family_t af1, af2; + unsigned int i; + + PF_LOCK(); + for (i = 0; i < count; i++) { + sp = 
(struct pfsync_state *)(buf + mlen * i); + af1 = sp->key[0].af; + af2 = sp->key[1].af; + + /* check for invalid values */ + if (sp->timeout >= PFTM_MAX || + sp->src.state > PF_TCPS_PROXY_DST || + sp->dst.state > PF_TCPS_PROXY_DST || + sp->direction > PF_OUT || + (((af1 || af2) && + ((af1 != AF_INET && af1 != AF_INET6) || + (af2 != AF_INET && af2 != AF_INET6))) || + (sp->af != AF_INET && sp->af != AF_INET6))) { + pfsyncstat_inc(pfsyncs_badval); + continue; + } + + if (pf_state_import(sp, 0) == ENOMEM) { + /* drop out, but process the rest of the actions */ + break; + } + } + PF_UNLOCK(); +} + +static void +pfsync_in_iack(struct pfsync_softc *sc, + const caddr_t buf, unsigned int mlen, unsigned int count) +{ + const struct pfsync_ins_ack *ia; + struct pf_state_cmp id_key; + struct pf_state *st; + unsigned int i; + + for (i = 0; i < count; i++) { + ia = (struct pfsync_ins_ack *)(buf + mlen * i); + + id_key.id = ia->id; + id_key.creatorid = ia->creatorid; + + PF_STATE_ENTER_READ(); + st = pf_find_state_byid(&id_key); + pf_state_ref(st); + PF_STATE_EXIT_READ(); + if (st == NULL) + continue; + + if (READ_ONCE(st->sync_defer) != NULL) + pfsync_deferred(sc, st); + + pf_state_unref(st); + } +} + +static int +pfsync_upd_tcp(struct pf_state *st, const struct pfsync_state_peer *src, + const struct pfsync_state_peer *dst) +{ + int sync = 0; + + /* + * The state should never go backwards except + * for syn-proxy states. Neither should the + * sequence window slide backwards. + */ + if ((st->src.state > src->state && + (st->src.state < PF_TCPS_PROXY_SRC || + src->state >= PF_TCPS_PROXY_SRC)) || + + (st->src.state == src->state && + SEQ_GT(st->src.seqlo, ntohl(src->seqlo)))) + sync++; + else + pf_state_peer_ntoh(src, &st->src); + + if ((st->dst.state > dst->state) || + + (st->dst.state >= TCPS_SYN_SENT && + SEQ_GT(st->dst.seqlo, ntohl(dst->seqlo)))) + sync++; + else + pf_state_peer_ntoh(dst, &st->dst); + + return (sync); +} + +static void +pfsync_in_updates(struct pfsync_softc *sc, struct pf_state *st, + const struct pfsync_state_peer *src, const struct pfsync_state_peer *dst, + uint8_t timeout) { - pfsync_sendout(); + struct pf_state_scrub *sscrub = NULL; + struct pf_state_scrub *dscrub = NULL; + int sync; + + if (src->scrub.scrub_flag && st->src.scrub == NULL) { + sscrub = pf_state_scrub_get(); + if (sscrub == NULL) { + /* inc error? */ + goto out; + } + } + if (dst->scrub.scrub_flag && st->dst.scrub == NULL) { + dscrub = pf_state_scrub_get(); + if (dscrub == NULL) { + /* inc error? 
*/ + goto out; + } + } + + if (READ_ONCE(st->sync_defer) != NULL) + pfsync_deferred(sc, st); + + mtx_enter(&st->mtx); + + /* attach the scrub memory if needed */ + if (sscrub != NULL && st->src.scrub == NULL) { + st->src.scrub = sscrub; + sscrub = NULL; + } + if (dscrub != NULL && st->dst.scrub == NULL) { + st->dst.scrub = dscrub; + dscrub = NULL; + } + + if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) + sync = pfsync_upd_tcp(st, src, dst); + else { + sync = 0; + + /* + * Non-TCP protocol state machines always go + * forwards + */ + if (st->src.state > src->state) + sync++; + else + pf_state_peer_ntoh(src, &st->src); + + if (st->dst.state > dst->state) + sync++; + else + pf_state_peer_ntoh(dst, &st->dst); + } + + st->pfsync_time = getuptime(); + if (sync < 2) { + st->expire = st->pfsync_time; + st->timeout = timeout; + } + + mtx_leave(&st->mtx); + + if (sync) { + pfsyncstat_inc(pfsyncs_stale); + pfsync_update_state(st); + } + +out: + if (sscrub != NULL) + pf_state_scrub_put(sscrub); + if (dscrub != NULL) + pf_state_scrub_put(dscrub); +} + +static void +pfsync_in_upd(struct pfsync_softc *sc, + const caddr_t buf, unsigned int mlen, unsigned int count) +{ + const struct pfsync_state *sp; + struct pf_state_cmp id_key; + struct pf_state *st; + int error; + unsigned int i; + + for (i = 0; i < count; i++) { + sp = (struct pfsync_state *)(buf + mlen * i); + + /* check for invalid values */ + if (sp->timeout >= PFTM_MAX || + sp->src.state > PF_TCPS_PROXY_DST || + sp->dst.state > PF_TCPS_PROXY_DST) { + pfsyncstat_inc(pfsyncs_badval); + continue; + } + + id_key.id = sp->id; + id_key.creatorid = sp->creatorid; + + PF_STATE_ENTER_READ(); + st = pf_find_state_byid(&id_key); + pf_state_ref(st); + PF_STATE_EXIT_READ(); + if (st == NULL) { + /* insert the update */ + PF_LOCK(); + error = pf_state_import(sp, 0); + if (error) + pfsyncstat_inc(pfsyncs_badstate); + PF_UNLOCK(); + continue; + } + + pfsync_in_updates(sc, st, &sp->src, &sp->dst, sp->timeout); + + pf_state_unref(st); + } +} + +static struct mbuf * +pfsync_upd_req_init(struct pfsync_softc *sc, unsigned int count) +{ + struct mbuf *m; + unsigned int mlen; + + m = m_gethdr(M_DONTWAIT, MT_DATA); + if (m == NULL) { + pfsyncstat_inc(pfsyncs_onomem); + return (NULL); + } + + mlen = max_linkhdr + sizeof(sc->sc_template) + + sizeof(struct pfsync_header) + + sizeof(struct pfsync_subheader) + + sizeof(struct pfsync_upd_req) * count; + + if (mlen > MHLEN) { + MCLGETL(m, M_DONTWAIT, mlen); + if (!ISSET(m->m_flags, M_EXT)) { + m_freem(m); + return (NULL); + } + } + + m_align(m, 0); + m->m_len = 0; + + return (m); +} + +static void +pfsync_in_upd_c(struct pfsync_softc *sc, + const caddr_t buf, unsigned int mlen, unsigned int count) +{ + const struct pfsync_upd_c *up; + struct pf_state_cmp id_key; + struct pf_state *st; + unsigned int i; + struct mbuf *m = NULL; + unsigned int rcount = 0; + + for (i = 0; i < count; i++) { + up = (struct pfsync_upd_c *)(buf + mlen * i); + + /* check for invalid values */ + if (up->timeout >= PFTM_MAX || + up->src.state > PF_TCPS_PROXY_DST || + up->dst.state > PF_TCPS_PROXY_DST) { + pfsyncstat_inc(pfsyncs_badval); + continue; + } + + id_key.id = up->id; + id_key.creatorid = up->creatorid; + + PF_STATE_ENTER_READ(); + st = pf_find_state_byid(&id_key); + pf_state_ref(st); + PF_STATE_EXIT_READ(); + if (st == NULL) { + /* We don't have this state. Ask for it.
*/ + struct pfsync_upd_req *ur; + + if (m == NULL) { + m = pfsync_upd_req_init(sc, count); + if (m == NULL) { + pfsyncstat_inc(pfsyncs_onomem); + continue; + } + } + + m = m_prepend(m, sizeof(*ur), M_DONTWAIT); + if (m == NULL) { + pfsyncstat_inc(pfsyncs_onomem); + continue; + } + + ur = mtod(m, struct pfsync_upd_req *); + ur->id = up->id; + ur->creatorid = up->creatorid; + rcount++; + + continue; + } + + pfsync_in_updates(sc, st, &up->src, &up->dst, up->timeout); + + pf_state_unref(st); + } + + if (m != NULL) { + struct pfsync_subheader *subh; + + m = m_prepend(m, sizeof(*subh), M_DONTWAIT); + if (m == NULL) { + pfsyncstat_inc(pfsyncs_onomem); + return; + } + + subh = mtod(m, struct pfsync_subheader *); + subh->action = PFSYNC_ACT_UPD_REQ; + subh->len = sizeof(struct pfsync_upd_req) >> 2; + subh->count = htons(rcount); + + m = pfsync_encap(sc, m); + if (m == NULL) { + pfsyncstat_inc(pfsyncs_onomem); + return; + } + + pfsync_sendout(sc, m); + } +} + +static void +pfsync_in_ureq(struct pfsync_softc *sc, + const caddr_t buf, unsigned int mlen, unsigned int count) +{ + const struct pfsync_upd_req *ur; + struct pf_state_cmp id_key; + struct pf_state *st; + unsigned int i; + + for (i = 0; i < count; i++) { + ur = (struct pfsync_upd_req *)(buf + mlen * i); + + id_key.id = ur->id; + id_key.creatorid = ur->creatorid; + + if (id_key.id == 0 && id_key.creatorid == 0) { + pfsync_bulk_snd_start(sc); + continue; + } + + PF_STATE_ENTER_READ(); + st = pf_find_state_byid(&id_key); + if (st != NULL && st->timeout < PFTM_MAX && + !ISSET(st->state_flags, PFSTATE_NOSYNC)) + pf_state_ref(st); + else + st = NULL; + PF_STATE_EXIT_READ(); + if (st == NULL) { + pfsyncstat_inc(pfsyncs_badstate); + continue; + } + + pfsync_update_state_req(sc, st); + + pf_state_unref(st); + } +} + +static void +pfsync_in_del(struct pfsync_softc *sc, + const caddr_t buf, unsigned int mlen, unsigned int count) +{ + const struct pfsync_state *sp; + struct pf_state_cmp id_key; + struct pf_state *st; + unsigned int i; + + PF_LOCK(); + PF_STATE_ENTER_WRITE(); + for (i = 0; i < count; i++) { + sp = (struct pfsync_state *)(buf + mlen * i); + + id_key.id = sp->id; + id_key.creatorid = sp->creatorid; + + st = pf_find_state_byid(&id_key); + if (st == NULL) { + pfsyncstat_inc(pfsyncs_badstate); + continue; + } + + mtx_enter(&st->mtx); + SET(st->state_flags, PFSTATE_NOSYNC); + mtx_leave(&st->mtx); + pf_remove_state(st); + } + PF_STATE_EXIT_WRITE(); + PF_UNLOCK(); +} + +static void +pfsync_in_del_c(struct pfsync_softc *sc, + const caddr_t buf, unsigned int mlen, unsigned int count) +{ + const struct pfsync_del_c *sp; + struct pf_state_cmp id_key; + struct pf_state *st; + unsigned int i; + + PF_LOCK(); + PF_STATE_ENTER_WRITE(); + for (i = 0; i < count; i++) { + sp = (struct pfsync_del_c *)(buf + mlen * i); + + id_key.id = sp->id; + id_key.creatorid = sp->creatorid; + + st = pf_find_state_byid(&id_key); + if (st == NULL) { + pfsyncstat_inc(pfsyncs_badstate); + continue; + } + + mtx_enter(&st->mtx); + SET(st->state_flags, PFSTATE_NOSYNC); + mtx_leave(&st->mtx); + pf_remove_state(st); + } + PF_STATE_EXIT_WRITE(); + PF_UNLOCK(); +} + +static void +pfsync_in_bus(struct pfsync_softc *sc, + const caddr_t buf, unsigned int len, unsigned int count) +{ + const struct pfsync_bus *bus = (struct pfsync_bus *)buf; + + switch (bus->status) { + case PFSYNC_BUS_START: + pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_BUS_START); + break; + + case PFSYNC_BUS_END: + pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_BUS_END); + break; + } +} + +#if defined(IPSEC) +/* Update an 
in-kernel tdb. Silently fail if no tdb is found. */ +static void +pfsync_update_net_tdb(const struct pfsync_tdb *pt) +{ + struct tdb *tdb; + + NET_ASSERT_LOCKED(); + + /* check for invalid values */ + if (ntohl(pt->spi) <= SPI_RESERVED_MAX || + (pt->dst.sa.sa_family != AF_INET && + pt->dst.sa.sa_family != AF_INET6)) + goto bad; + + tdb = gettdb(ntohs(pt->rdomain), pt->spi, + (union sockaddr_union *)&pt->dst, pt->sproto); + if (tdb) { + uint64_t rpl = betoh64(pt->rpl); + uint64_t cur_bytes = betoh64(pt->cur_bytes); + + /* Neither replay nor byte counter should ever decrease. */ + mtx_enter(&tdb->tdb_mtx); + if (rpl >= tdb->tdb_rpl && + cur_bytes >= tdb->tdb_cur_bytes) { + tdb->tdb_rpl = rpl; + tdb->tdb_cur_bytes = cur_bytes; + } + mtx_leave(&tdb->tdb_mtx); + + tdb_unref(tdb); + } + return; + + bad: + DPFPRINTF(LOG_WARNING, "pfsync_insert: PFSYNC_ACT_TDB_UPD: " + "invalid value"); + pfsyncstat_inc(pfsyncs_badstate); + return; +} +#endif + +static void +pfsync_in_tdb(struct pfsync_softc *sc, + const caddr_t buf, unsigned int len, unsigned int count) +{ +#if defined(IPSEC) + const struct pfsync_tdb *tp; + unsigned int i; + + for (i = 0; i < count; i++) { + tp = (const struct pfsync_tdb *)(buf + len * i); + pfsync_update_net_tdb(tp); + } +#endif +} + +int +pfsync_input4(struct mbuf **mp, int *offp, int proto, int af) +{ + struct mbuf *m = *mp; + struct ip *ip; + + ip = mtod(m, struct ip *); + + m = pfsync_input(m, ip->ip_ttl, ip->ip_hl << 2); + + m_freem(m); + *mp = NULL; + + return (IPPROTO_DONE); } int @@ -2651,8 +3344,8 @@ pfsync_sysctl_pfsyncstat(void *oldp, size_t *oldlenp, void *newp) } int -pfsync_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, - size_t newlen) +pfsync_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, + void *newp, size_t newlen) { /* All sysctl names at this level are terminal. */ if (namelen != 1) diff --git a/sys/net/if_pfsync.h b/sys/net/if_pfsync.h index ff26ac3669e..e83ddd8306c 100644 --- a/sys/net/if_pfsync.h +++ b/sys/net/if_pfsync.h @@ -1,4 +1,4 @@ -/* $OpenBSD: if_pfsync.h,v 1.59 2022/11/11 11:47:13 dlg Exp $ */ +/* $OpenBSD: if_pfsync.h,v 1.60 2023/07/06 04:55:05 dlg Exp $ */ /* * Copyright (c) 2001 Michael Shalayeff @@ -177,7 +177,7 @@ struct pfsync_upd_c { struct pfsync_upd_req { u_int64_t id; u_int32_t creatorid; -} __packed; +} __packed __aligned(4); /* * DEL_C @@ -295,16 +295,6 @@ enum pfsync_counters { pfsyncs_ncounters, }; -extern struct cpumem *pfsynccounters; - -struct pfsync_deferral; - -static inline void -pfsyncstat_inc(enum pfsync_counters c) -{ - counters_inc(pfsynccounters, c); -} - /* * this shows where a pf state is with respect to the syncing. 
*/ @@ -315,10 +305,11 @@ pfsyncstat_inc(enum pfsync_counters c) #define PFSYNC_S_UPD 0x04 #define PFSYNC_S_COUNT 0x05 -#define PFSYNC_S_DEFER 0xfe -#define PFSYNC_S_NONE 0xff +#define PFSYNC_S_NONE 0xd0 +#define PFSYNC_S_SYNC 0xd1 +#define PFSYNC_S_DEAD 0xde -int pfsync_input(struct mbuf **, int *, int, int); +int pfsync_input4(struct mbuf **, int *, int, int); int pfsync_sysctl(int *, u_int, void *, size_t *, void *, size_t); @@ -329,6 +320,9 @@ int pfsync_state_import(struct pfsync_state *, int); void pfsync_state_export(struct pfsync_state *, struct pf_state *); +void pfsync_init_state(struct pf_state *, + const struct pf_state_key *, + const struct pf_state_key *, int); void pfsync_insert_state(struct pf_state *); void pfsync_update_state(struct pf_state *); void pfsync_delete_state(struct pf_state *); @@ -337,14 +331,10 @@ void pfsync_clear_states(u_int32_t, const char *); void pfsync_update_tdb(struct tdb *, int); void pfsync_delete_tdb(struct tdb *); -int pfsync_defer(struct pf_state *, struct mbuf *, - struct pfsync_deferral **); -void pfsync_undefer(struct pfsync_deferral *, int); +int pfsync_defer(struct pf_state *, struct mbuf *); int pfsync_is_up(void); int pfsync_state_in_use(struct pf_state *); - -void pfsync_iack(struct pf_state *); #endif /* _KERNEL */ #endif /* _NET_IF_PFSYNC_H_ */ diff --git a/sys/net/netisr.h b/sys/net/netisr.h index d79d697732e..0e9f51bfe30 100644 --- a/sys/net/netisr.h +++ b/sys/net/netisr.h @@ -1,4 +1,4 @@ -/* $OpenBSD: netisr.h,v 1.60 2022/07/14 10:52:21 mvs Exp $ */ +/* $OpenBSD: netisr.h,v 1.61 2023/07/06 04:55:05 dlg Exp $ */ /* $NetBSD: netisr.h,v 1.12 1995/08/12 23:59:24 mycroft Exp $ */ /* @@ -42,7 +42,6 @@ * on the lowest level routine of each protocol. */ #define NETISR_IP 2 /* same as AF_INET */ -#define NETISR_PFSYNC 5 /* for pfsync "immediate" tx */ #define NETISR_ARP 18 /* same as AF_LINK */ #define NETISR_IPV6 24 /* same as AF_INET6 */ #define NETISR_PIPEX 27 /* for pipex processing */ @@ -64,7 +63,6 @@ void ipintr(void); void ip6intr(void); void pppintr(void); void bridgeintr(void); -void pfsyncintr(void); void pipexintr(void); void pppoeintr(void); diff --git a/sys/net/pf.c b/sys/net/pf.c index b3a655b602b..f5ad04dfa8d 100644 --- a/sys/net/pf.c +++ b/sys/net/pf.c @@ -1,4 +1,4 @@ -/* $OpenBSD: pf.c,v 1.1181 2023/06/05 08:37:27 sashan Exp $ */ +/* $OpenBSD: pf.c,v 1.1182 2023/07/06 04:55:05 dlg Exp $ */ /* * Copyright (c) 2001 Daniel Hartmeier @@ -100,8 +100,6 @@ #if NPFSYNC > 0 #include -#else -struct pfsync_deferral; #endif /* NPFSYNC > 0 */ /* @@ -121,10 +119,6 @@ u_char pf_tcp_secret[16]; int pf_tcp_secret_init; int pf_tcp_iss_off; -int pf_npurge; -struct task pf_purge_task = TASK_INITIALIZER(pf_purge, &pf_npurge); -struct timeout pf_purge_to = TIMEOUT_INITIALIZER(pf_purge_timeout, NULL); - enum pf_test_status { PF_TEST_FAIL = -1, PF_TEST_OK, @@ -190,8 +184,7 @@ void pf_rule_to_actions(struct pf_rule *, struct pf_rule_actions *); int pf_test_rule(struct pf_pdesc *, struct pf_rule **, struct pf_state **, struct pf_rule **, - struct pf_ruleset **, u_short *, - struct pfsync_deferral **); + struct pf_ruleset **, u_short *); static __inline int pf_create_state(struct pf_pdesc *, struct pf_rule *, struct pf_rule *, struct pf_rule *, struct pf_state_key **, struct pf_state_key **, @@ -250,6 +243,10 @@ void pf_counters_inc(int, struct pf_pdesc *, struct pf_state *, struct pf_rule *, struct pf_rule *); +int pf_state_insert(struct pfi_kif *, + struct pf_state_key **, struct pf_state_key **, + struct pf_state *); + int 
pf_state_key_isvalid(struct pf_state_key *); struct pf_state_key *pf_state_key_ref(struct pf_state_key *); void pf_state_key_unref(struct pf_state_key *); @@ -1064,10 +1061,11 @@ pf_state_insert(struct pfi_kif *kif, struct pf_state_key **skwp, pf_status.fcounters[FCNT_STATE_INSERT]++; pf_status.states++; pfi_kif_ref(kif, PFI_KIF_REF_STATE); + PF_STATE_EXIT_WRITE(); + #if NPFSYNC > 0 pfsync_insert_state(st); #endif /* NPFSYNC > 0 */ - PF_STATE_EXIT_WRITE(); *skwp = skw; *sksp = sks; @@ -1318,6 +1316,8 @@ pf_state_export(struct pfsync_state *sp, struct pf_state *st) #endif /* NPFLOG > 0 */ sp->timeout = st->timeout; sp->state_flags = htons(st->state_flags); + if (READ_ONCE(st->sync_defer) != NULL) + sp->state_flags |= htons(PFSTATE_ACK); if (!SLIST_EMPTY(&st->src_nodes)) sp->sync_flags |= PFSYNC_FLAG_SRCNODE; @@ -1519,9 +1519,6 @@ pf_state_import(const struct pfsync_state *sp, int flags) st->rule.ptr = r; st->anchor.ptr = NULL; - st->pfsync_time = getuptime(); - st->sync_state = PFSYNC_S_NONE; - PF_REF_INIT(st->refcnt); mtx_init(&st->mtx, IPL_NET); @@ -1529,15 +1526,12 @@ pf_state_import(const struct pfsync_state *sp, int flags) r->states_cur++; r->states_tot++; + st->sync_state = PFSYNC_S_NONE; + st->pfsync_time = getuptime(); #if NPFSYNC > 0 - if (!ISSET(flags, PFSYNC_SI_IOCTL)) - SET(st->state_flags, PFSTATE_NOSYNC); + pfsync_init_state(st, skw, sks, flags); #endif - /* - * We just set PFSTATE_NOSYNC bit, which prevents - * pfsync_insert_state() to insert state to pfsync. - */ if (pf_state_insert(kif, &skw, &sks, st) != 0) { /* XXX when we have anchors, use STATE_DEC_COUNTERS */ r->states_cur--; @@ -1545,15 +1539,6 @@ pf_state_import(const struct pfsync_state *sp, int flags) goto cleanup_state; } -#if NPFSYNC > 0 - if (!ISSET(flags, PFSYNC_SI_IOCTL)) { - CLR(st->state_flags, PFSTATE_NOSYNC); - if (ISSET(st->state_flags, PFSTATE_ACK)) - pfsync_iack(st); - } - CLR(st->state_flags, PFSTATE_ACK); -#endif - return (0); cleanup: @@ -1576,47 +1561,106 @@ pf_state_import(const struct pfsync_state *sp, int flags) /* END state table stuff */ +void pf_purge_states(void *); +struct task pf_purge_states_task = + TASK_INITIALIZER(pf_purge_states, NULL); + +void pf_purge_states_tick(void *); +struct timeout pf_purge_states_to = + TIMEOUT_INITIALIZER(pf_purge_states_tick, NULL); + +unsigned int pf_purge_expired_states(unsigned int, unsigned int); + +/* + * how many states to scan this interval. + * + * this is set when the timeout fires, and reduced by the task. the + * task will reschedule itself until the limit is reduced to zero, + * and then it adds the timeout again. + */ +unsigned int pf_purge_states_limit; + +/* + * limit how many states are processed with locks held per run of + * the state purge task. 
+ */ +unsigned int pf_purge_states_collect = 64; + +void +pf_purge_states_tick(void *null) +{ + unsigned int limit = pf_status.states; + unsigned int interval = pf_default_rule.timeout[PFTM_INTERVAL]; + + if (limit == 0) { + timeout_add_sec(&pf_purge_states_to, 1); + return; + } + + /* + * process a fraction of the state table every second + */ + + if (interval > 1) + limit /= interval; + + pf_purge_states_limit = limit; + task_add(systqmp, &pf_purge_states_task); +} + void -pf_purge_timeout(void *unused) +pf_purge_states(void *null) { - /* XXX move to systqmp to avoid KERNEL_LOCK */ - task_add(systq, &pf_purge_task); + unsigned int limit; + unsigned int scanned; + + limit = pf_purge_states_limit; + if (limit < pf_purge_states_collect) + limit = pf_purge_states_collect; + + scanned = pf_purge_expired_states(limit, pf_purge_states_collect); + if (scanned >= pf_purge_states_limit) { + /* we've run out of states to scan this "interval" */ + timeout_add_sec(&pf_purge_states_to, 1); + return; + } + + pf_purge_states_limit -= scanned; + task_add(systqmp, &pf_purge_states_task); } +void pf_purge_tick(void *); +struct timeout pf_purge_to = + TIMEOUT_INITIALIZER(pf_purge_tick, NULL); + +void pf_purge(void *); +struct task pf_purge_task = + TASK_INITIALIZER(pf_purge, NULL); + void -pf_purge(void *xnloops) +pf_purge_tick(void *null) { - int *nloops = xnloops; - - /* - * process a fraction of the state table every second - * Note: - * we no longer need PF_LOCK() here, because - * pf_purge_expired_states() uses pf_state_lock to maintain - * consistency. - */ - if (pf_default_rule.timeout[PFTM_INTERVAL] > 0) - pf_purge_expired_states(1 + (pf_status.states - / pf_default_rule.timeout[PFTM_INTERVAL])); + task_add(systqmp, &pf_purge_task); +} - NET_LOCK(); +void +pf_purge(void *null) +{ + unsigned int interval = max(1, pf_default_rule.timeout[PFTM_INTERVAL]); PF_LOCK(); - /* purge other expired types every PFTM_INTERVAL seconds */ - if (++(*nloops) >= pf_default_rule.timeout[PFTM_INTERVAL]) - pf_purge_expired_src_nodes(); - PF_UNLOCK(); + pf_purge_expired_src_nodes(); + + PF_UNLOCK(); + /* * Fragments don't require PF_LOCK(), they use their own lock.
*/ - if ((*nloops) >= pf_default_rule.timeout[PFTM_INTERVAL]) { - pf_purge_expired_fragments(); - *nloops = 0; - } - NET_UNLOCK(); - - timeout_add_sec(&pf_purge_to, 1); + pf_purge_expired_fragments(); + + /* interpret the interval as idle time between runs */ + timeout_add_sec(&pf_purge_to, interval); } int32_t @@ -1717,6 +1761,8 @@ pf_remove_state(struct pf_state *st) if (st->timeout == PFTM_UNLINKED) return; + st->timeout = PFTM_UNLINKED; + /* handle load balancing related tasks */ pf_postprocess_addr(st); @@ -1741,7 +1787,6 @@ pf_remove_state(struct pf_state *st) #if NPFSYNC > 0 pfsync_delete_state(st); #endif /* NPFSYNC > 0 */ - st->timeout = PFTM_UNLINKED; pf_src_tree_remove_state(st); pf_detach_state(st); } @@ -1795,6 +1840,7 @@ pf_free_state(struct pf_state *st) if (pfsync_state_in_use(st)) return; #endif /* NPFSYNC > 0 */ + KASSERT(st->timeout == PFTM_UNLINKED); if (--st->rule.ptr->states_cur == 0 && st->rule.ptr->src_nodes == 0) @@ -1819,8 +1865,8 @@ pf_free_state(struct pf_state *st) pf_status.states--; } -void -pf_purge_expired_states(u_int32_t maxcheck) +unsigned int +pf_purge_expired_states(const unsigned int limit, const unsigned int collect) { /* * this task/thread/context/whatever is the only thing that @@ -1834,6 +1880,8 @@ pf_purge_expired_states(u_int32_t maxcheck) struct pf_state *st; SLIST_HEAD(pf_state_gcl, pf_state) gcl = SLIST_HEAD_INITIALIZER(gcl); time_t now; + unsigned int scanned; + unsigned int collected = 0; PF_ASSERT_UNLOCKED(); @@ -1847,7 +1895,7 @@ pf_purge_expired_states(u_int32_t maxcheck) if (head == NULL) { /* the list is empty */ rw_exit_read(&pf_state_list.pfs_rwl); - return; + return (limit); } /* (re)start at the front of the list */ @@ -1856,13 +1904,17 @@ pf_purge_expired_states(u_int32_t maxcheck) now = getuptime(); - do { + for (scanned = 0; scanned < limit; scanned++) { uint8_t stimeout = cur->timeout; + unsigned int limited = 0; if ((stimeout == PFTM_UNLINKED) || (pf_state_expires(cur, stimeout) <= now)) { st = pf_state_ref(cur); SLIST_INSERT_HEAD(&gcl, st, gc_list); + + if (++collected >= collect) + limited = 1; } /* don't iterate past the end of our view of the list */ @@ -1872,14 +1924,18 @@ pf_purge_expired_states(u_int32_t maxcheck) } cur = TAILQ_NEXT(cur, entry_list); - } while (maxcheck--); + + /* don't spend too much time here. */ + if (ISSET(READ_ONCE(curcpu()->ci_schedstate.spc_schedflags), + SPCF_SHOULDYIELD) || limited) + break; + } rw_exit_read(&pf_state_list.pfs_rwl); if (SLIST_EMPTY(&gcl)) - return; + return (scanned); - NET_LOCK(); rw_enter_write(&pf_state_list.pfs_rwl); PF_LOCK(); PF_STATE_ENTER_WRITE(); @@ -1892,12 +1948,13 @@ pf_purge_expired_states(u_int32_t maxcheck) PF_STATE_EXIT_WRITE(); PF_UNLOCK(); rw_exit_write(&pf_state_list.pfs_rwl); - NET_UNLOCK(); while ((st = SLIST_FIRST(&gcl)) != NULL) { SLIST_REMOVE_HEAD(&gcl, gc_list); pf_state_unref(st); } + + return (scanned); } int @@ -4262,8 +4319,7 @@ next_rule: int pf_test_rule(struct pf_pdesc *pd, struct pf_rule **rm, struct pf_state **sm, - struct pf_rule **am, struct pf_ruleset **rsm, u_short *reason, - struct pfsync_deferral **pdeferral) + struct pf_rule **am, struct pf_ruleset **rsm, u_short *reason) { struct pf_rule *r = NULL; struct pf_rule *a = NULL; @@ -4475,7 +4531,7 @@ pf_test_rule(struct pf_pdesc *pd, struct pf_rule **rm, struct pf_state **sm, * firewall has to know about it to allow * replies through it. 
*/ - if (pfsync_defer(*sm, pd->m, pdeferral)) + if (pfsync_defer(*sm, pd->m)) return (PF_DEFER); } #endif /* NPFSYNC > 0 */ @@ -4517,6 +4573,8 @@ pf_create_state(struct pf_pdesc *pd, struct pf_rule *r, struct pf_rule *a, st->state_flags |= PFSTATE_SLOPPY; if (r->rule_flag & PFRULE_PFLOW) st->state_flags |= PFSTATE_PFLOW; + if (r->rule_flag & PFRULE_NOSYNC) + st->state_flags |= PFSTATE_NOSYNC; #if NPFLOG > 0 st->log = act->log & PF_LOG_ALL; #endif /* NPFLOG > 0 */ @@ -4535,6 +4593,7 @@ pf_create_state(struct pf_pdesc *pd, struct pf_rule *r, struct pf_rule *a, st->set_prio[1] = act->set_prio[1]; st->delay = act->delay; SLIST_INIT(&st->src_nodes); + /* * must initialize refcnt, before pf_state_insert() gets called. * pf_state_inserts() grabs reference for pfsync! @@ -7462,7 +7521,6 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp, struct mbuf **m0) int dir = (fwdir == PF_FWD) ? PF_OUT : fwdir; u_int32_t qid, pqid = 0; int have_pf_lock = 0; - struct pfsync_deferral *deferral = NULL; if (!pf_status.running) return (PF_PASS); @@ -7565,8 +7623,7 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp, struct mbuf **m0) */ PF_LOCK(); have_pf_lock = 1; - action = pf_test_rule(&pd, &r, &st, &a, &ruleset, &reason, - &deferral); + action = pf_test_rule(&pd, &r, &st, &a, &ruleset, &reason); st = pf_state_ref(st); if (action != PF_PASS) REASON_SET(&reason, PFRES_FRAG); @@ -7598,7 +7655,7 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp, struct mbuf **m0) PF_LOCK(); have_pf_lock = 1; action = pf_test_rule(&pd, &r, &st, &a, &ruleset, - &reason, &deferral); + &reason); st = pf_state_ref(st); } break; @@ -7630,7 +7687,7 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp, struct mbuf **m0) PF_LOCK(); have_pf_lock = 1; action = pf_test_rule(&pd, &r, &st, &a, &ruleset, - &reason, &deferral); + &reason); st = pf_state_ref(st); } break; @@ -7714,7 +7771,7 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp, struct mbuf **m0) PF_LOCK(); have_pf_lock = 1; action = pf_test_rule(&pd, &r, &st, &a, &ruleset, - &reason, &deferral); + &reason); st = pf_state_ref(st); } @@ -7854,14 +7911,6 @@ done: m_freem(pd.m); /* FALLTHROUGH */ case PF_DEFER: -#if NPFSYNC > 0 - /* - * We no longer hold PF_LOCK() here, so we can dispatch - * deferral if we are asked to do so. 
- */ - if (deferral != NULL) - pfsync_undefer(deferral, 0); -#endif /* NPFSYNC > 0 */ pd.m = NULL; action = PF_PASS; break; @@ -8210,7 +8259,7 @@ pf_state_unref(struct pf_state *st) #if NPFSYNC > 0 KASSERT((TAILQ_NEXT(st, sync_list) == NULL) || ((TAILQ_NEXT(st, sync_list) == _Q_INVALID) && - (st->sync_state == PFSYNC_S_NONE))); + (st->sync_state >= PFSYNC_S_NONE))); #endif /* NPFSYNC */ KASSERT((TAILQ_NEXT(st, entry_list) == NULL) || (TAILQ_NEXT(st, entry_list) == _Q_INVALID)); diff --git a/sys/net/pf_ioctl.c b/sys/net/pf_ioctl.c index f20632df590..078fa72e083 100644 --- a/sys/net/pf_ioctl.c +++ b/sys/net/pf_ioctl.c @@ -1,4 +1,4 @@ -/* $OpenBSD: pf_ioctl.c,v 1.414 2023/07/04 14:23:38 sashan Exp $ */ +/* $OpenBSD: pf_ioctl.c,v 1.415 2023/07/06 04:55:05 dlg Exp $ */ /* * Copyright (c) 2001 Daniel Hartmeier @@ -1000,13 +1000,14 @@ pf_states_clr(struct pfioc_state_kill *psk) } PF_STATE_EXIT_WRITE(); -#if NPFSYNC > 0 - pfsync_clear_states(pf_status.hostid, psk->psk_ifname); -#endif /* NPFSYNC > 0 */ PF_UNLOCK(); rw_exit(&pf_state_list.pfs_rwl); psk->psk_killed = killed; + +#if NPFSYNC > 0 + pfsync_clear_states(pf_status.hostid, psk->psk_ifname); +#endif /* NPFSYNC > 0 */ unlock: NET_UNLOCK(); @@ -1190,6 +1191,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) pf_status.stateid = gettime(); pf_status.stateid = pf_status.stateid << 32; } + timeout_add_sec(&pf_purge_states_to, 1); timeout_add_sec(&pf_purge_to, 1); pf_create_queues(); DPFPRINTF(LOG_NOTICE, "pf: started"); @@ -2783,8 +2785,9 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) pf_default_rule.timeout[i] = pf_default_rule_new.timeout[i]; if (pf_default_rule.timeout[i] == PFTM_INTERVAL && - pf_default_rule.timeout[i] < old) - task_add(net_tq(0), &pf_purge_task); + pf_default_rule.timeout[i] < old && + timeout_del(&pf_purge_to)) + task_add(systqmp, &pf_purge_task); } pfi_xcommit(); pf_trans_set_commit(); diff --git a/sys/net/pf_norm.c b/sys/net/pf_norm.c index 7ab4c00c73f..ef2c884918f 100644 --- a/sys/net/pf_norm.c +++ b/sys/net/pf_norm.c @@ -1,4 +1,4 @@ -/* $OpenBSD: pf_norm.c,v 1.227 2023/05/07 16:23:23 bluhm Exp $ */ +/* $OpenBSD: pf_norm.c,v 1.228 2023/07/06 04:55:05 dlg Exp $ */ /* * Copyright 2001 Niels Provos @@ -1098,10 +1098,22 @@ no_fragment: } #endif /* INET6 */ +struct pf_state_scrub * +pf_state_scrub_get(void) +{ + return (pool_get(&pf_state_scrub_pl, PR_NOWAIT | PR_ZERO)); +} + +void +pf_state_scrub_put(struct pf_state_scrub *scrub) +{ + pool_put(&pf_state_scrub_pl, scrub); +} + int pf_normalize_tcp_alloc(struct pf_state_peer *src) { - src->scrub = pool_get(&pf_state_scrub_pl, PR_NOWAIT | PR_ZERO); + src->scrub = pf_state_scrub_get(); if (src->scrub == NULL) return (ENOMEM); diff --git a/sys/net/pfvar.h b/sys/net/pfvar.h index 9e863915fbd..27cce82bdad 100644 --- a/sys/net/pfvar.h +++ b/sys/net/pfvar.h @@ -1,4 +1,4 @@ -/* $OpenBSD: pfvar.h,v 1.532 2023/07/04 11:34:19 sashan Exp $ */ +/* $OpenBSD: pfvar.h,v 1.533 2023/07/06 04:55:05 dlg Exp $ */ /* * Copyright (c) 2001 Daniel Hartmeier @@ -1604,15 +1604,10 @@ extern void pf_tbladdr_remove(struct pf_addr_wrap *); extern void pf_tbladdr_copyout(struct pf_addr_wrap *); extern void pf_calc_skip_steps(struct pf_rulequeue *); extern void pf_purge_expired_src_nodes(void); -extern void pf_purge_expired_states(u_int32_t); extern void pf_purge_expired_rules(void); extern void pf_remove_state(struct pf_state *); extern void pf_remove_divert_state(struct pf_state_key *); extern void pf_free_state(struct pf_state *); -extern int 
pf_state_insert(struct pfi_kif *, - struct pf_state_key **, - struct pf_state_key **, - struct pf_state *); int pf_insert_src_node(struct pf_src_node **, struct pf_rule *, enum pf_sn_types, sa_family_t, struct pf_addr *, @@ -1676,6 +1671,10 @@ int pf_match_port(u_int8_t, u_int16_t, u_int16_t, u_int16_t); int pf_match_uid(u_int8_t, uid_t, uid_t, uid_t); int pf_match_gid(u_int8_t, gid_t, gid_t, gid_t); +struct pf_state_scrub * + pf_state_scrub_get(void); +void pf_state_scrub_put(struct pf_state_scrub *); + int pf_refragment6(struct mbuf **, struct m_tag *mtag, struct sockaddr_in6 *, struct ifnet *, struct rtentry *); void pf_normalize_init(void); diff --git a/sys/net/pfvar_priv.h b/sys/net/pfvar_priv.h index e9e80f6196f..53d983432a9 100644 --- a/sys/net/pfvar_priv.h +++ b/sys/net/pfvar_priv.h @@ -1,4 +1,4 @@ -/* $OpenBSD: pfvar_priv.h,v 1.33 2023/05/10 22:42:51 sashan Exp $ */ +/* $OpenBSD: pfvar_priv.h,v 1.34 2023/07/06 04:55:05 dlg Exp $ */ /* * Copyright (c) 2001 Daniel Hartmeier @@ -41,6 +41,12 @@ #include #include +struct pfsync_deferral; + +/* + * pf state items - links from pf_state_key to pf_states + */ + struct pf_state_item { TAILQ_ENTRY(pf_state_item) si_entry; @@ -49,6 +55,10 @@ struct pf_state_item { TAILQ_HEAD(pf_statelisthead, pf_state_item); +/* + * pf state keys - look up states by address + */ + struct pf_state_key { struct pf_addr addr[2]; u_int16_t port[2]; @@ -73,11 +83,13 @@ RBT_PROTOTYPE(pf_state_tree, pf_state_key, sk_entry, pf_state_compare_key); (key[PF_SK_WIRE]->af != (family))) /* + * pf state + * * Protection/ownership of pf_state members: * I immutable after pf_state_insert() * M pf_state mtx * P PF_STATE_LOCK - * S pfsync mutex + * S pfsync * L pf_state_list * g pf_purge gc */ @@ -89,7 +101,7 @@ struct pf_state { u_int8_t pad[3]; TAILQ_ENTRY(pf_state) sync_list; /* [S] */ - TAILQ_ENTRY(pf_state) sync_snap; /* [S] */ + struct pfsync_deferral *sync_defer; /* [S] */ TAILQ_ENTRY(pf_state) entry_list; /* [L] */ SLIST_ENTRY(pf_state) gc_list; /* [g] */ RB_ENTRY(pf_state) entry_id; /* [P] */ @@ -101,7 +113,7 @@ struct pf_state { union pf_rule_ptr natrule; /* [I] */ struct pf_addr rt_addr; /* [I] */ struct pf_sn_head src_nodes; /* [I] */ - struct pf_state_key *key[2]; /* [I] stack and wire */ + struct pf_state_key *key[2]; /* [I] stack and wire */ struct pfi_kif *kif; /* [I] */ struct mutex mtx; pf_refcnt_t refcnt; @@ -109,16 +121,16 @@ struct pf_state { u_int64_t bytes[2]; int32_t creation; /* [I] */ int32_t expire; - int32_t pfsync_time; - int rtableid[2]; /* [I] rtables stack and wire */ + int32_t pfsync_time; /* [S] */ + int rtableid[2]; /* [I] stack and wire */ u_int16_t qid; /* [I] */ u_int16_t pqid; /* [I] */ u_int16_t tag; /* [I] */ - u_int16_t state_flags; + u_int16_t state_flags; /* [M] */ u_int8_t log; /* [I] */ u_int8_t timeout; - u_int8_t sync_state; /* PFSYNC_S_x */ - u_int8_t sync_updates; + u_int8_t sync_state; /* [S] PFSYNC_S_x */ + u_int8_t sync_updates; /* [S] */ u_int8_t min_ttl; /* [I] */ u_int8_t set_tos; /* [I] */ u_int8_t set_prio[2]; /* [I] */ @@ -127,7 +139,6 @@ struct pf_state { u_int16_t if_index_out; /* [I] */ u_int16_t delay; /* [I] */ u_int8_t rt; /* [I] */ - u_int8_t snapped; /* [S] */ }; RBT_HEAD(pf_state_tree_id, pf_state); @@ -345,6 +356,7 @@ struct pf_trans { #define pftgr_anchor u.u_getrule.gr_anchor #define pftgr_rule u.u_getrule.gr_rule +extern struct timeout pf_purge_states_to; extern struct task pf_purge_task; extern struct timeout pf_purge_to; @@ -397,9 +409,6 @@ extern struct rwlock pf_state_lock; rw_status(&pf_state_lock), 
__func__);\ } while (0) -extern void pf_purge_timeout(void *); -extern void pf_purge(void *); - /* for copies to/from network byte order */ void pf_state_peer_hton(const struct pf_state_peer *, struct pfsync_state_peer *); diff --git a/sys/netinet/in_proto.c b/sys/netinet/in_proto.c index 526596eb6b8..d2e67c61aae 100644 --- a/sys/netinet/in_proto.c +++ b/sys/netinet/in_proto.c @@ -1,4 +1,4 @@ -/* $OpenBSD: in_proto.c,v 1.101 2023/05/18 09:59:43 mvs Exp $ */ +/* $OpenBSD: in_proto.c,v 1.102 2023/07/06 04:55:05 dlg Exp $ */ /* $NetBSD: in_proto.c,v 1.14 1996/02/18 18:58:32 christos Exp $ */ /* @@ -343,7 +343,7 @@ const struct protosw inetsw[] = { .pr_domain = &inetdomain, .pr_protocol = IPPROTO_PFSYNC, .pr_flags = PR_ATOMIC|PR_ADDR, - .pr_input = pfsync_input, + .pr_input = pfsync_input4, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs, .pr_sysctl = pfsync_sysctl diff --git a/sys/netinet/ip_ipsp.h b/sys/netinet/ip_ipsp.h index f7e621efa3a..5da3ad7437f 100644 --- a/sys/netinet/ip_ipsp.h +++ b/sys/netinet/ip_ipsp.h @@ -1,4 +1,4 @@ -/* $OpenBSD: ip_ipsp.h,v 1.240 2022/07/14 13:52:10 mvs Exp $ */ +/* $OpenBSD: ip_ipsp.h,v 1.241 2023/07/06 04:55:05 dlg Exp $ */ /* * The authors of this code are John Ioannidis (ji@tla.org), * Angelos D. Keromytis (kermit@csd.uch.gr), @@ -50,6 +50,7 @@ * P ipo_tdb_mtx link policy to TDB global mutex * D tdb_sadb_mtx SA database global mutex * m tdb_mtx fields of struct tdb + * S pfsync fields of struct tdb */ /* IPSP global definitions. */ @@ -405,7 +406,6 @@ struct tdb { /* tunnel descriptor block */ u_int8_t tdb_sproto; /* [I] IPsec protocol */ u_int8_t tdb_wnd; /* Replay window */ u_int8_t tdb_satype; /* SA type (RFC2367, PF_KEY) */ - u_int8_t tdb_updates; /* pfsync update counter */ union sockaddr_union tdb_dst; /* [N] Destination address */ union sockaddr_union tdb_src; /* [N] Source address */ @@ -439,8 +439,8 @@ struct tdb { /* tunnel descriptor block */ struct sockaddr_encap tdb_filtermask; /* And the mask */ TAILQ_HEAD(tdb_policy_head, ipsec_policy) tdb_policy_head; /* [P] */ - TAILQ_ENTRY(tdb) tdb_sync_entry; - TAILQ_ENTRY(tdb) tdb_sync_snap; + TAILQ_ENTRY(tdb) tdb_sync_entry; /* [S] pfsync tdb queue */ + u_int32_t tdb_updates; /* [S] pfsync update counter */ }; enum tdb_counters { -- 2.20.1