-/* $OpenBSD: if_pfsync.c,v 1.294 2021/06/23 05:43:53 dlg Exp $ */
+/* $OpenBSD: if_pfsync.c,v 1.295 2021/06/23 06:53:51 dlg Exp $ */
/*
* Copyright (c) 2002 Michael Shalayeff
{
struct pfsync_softc *sc = pfsyncif;
+ NET_ASSERT_LOCKED();
+
+ /*
+ * pf gc via pfsync_state_in_use reads sc_bulk_next and
+ * sc_bulk_last while exclusively holding the pf_state_list
+ * rwlock. make sure it can't race with us setting these
+ * pointers. they basically act as hazards, and borrow the
+ * list's state reference count.
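+ *
+ * pfsync_state_in_use() below checks a state against both of
+ * these pointers under that same wrlock before the gc is
+ * allowed to free it.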
+ */
+ rw_enter_read(&pf_state_list.pfs_rwl);
+
+ /* get a consistent view of the list pointers */
+ mtx_enter(&pf_state_list.pfs_mtx);
+ if (sc->sc_bulk_next == NULL)
+ sc->sc_bulk_next = TAILQ_FIRST(&pf_state_list.pfs_list);
+
+ sc->sc_bulk_last = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue);
+ mtx_leave(&pf_state_list.pfs_mtx);
+
+ rw_exit_read(&pf_state_list.pfs_rwl);
+
DPFPRINTF(LOG_INFO, "received bulk update request");
- if (TAILQ_EMPTY(&state_list))
+ if (sc->sc_bulk_last == NULL)
pfsync_bulk_status(PFSYNC_BUS_END);
else {
sc->sc_ureq_received = getuptime();
- if (sc->sc_bulk_next == NULL) {
- PF_STATE_ENTER_READ();
- sc->sc_bulk_next = TAILQ_FIRST(&state_list);
- pf_state_ref(sc->sc_bulk_next);
- PF_STATE_EXIT_READ();
- }
- sc->sc_bulk_last = sc->sc_bulk_next;
- pf_state_ref(sc->sc_bulk_last);
-
pfsync_bulk_status(PFSYNC_BUS_START);
timeout_add(&sc->sc_bulk_tmo, 0);
}
pfsync_bulk_update(void *arg)
{
struct pfsync_softc *sc;
- struct pf_state *st, *st_next;
+ struct pf_state *st;
int i = 0;
NET_LOCK();
sc = pfsyncif;
if (sc == NULL)
goto out;
+
+ rw_enter_read(&pf_state_list.pfs_rwl);
st = sc->sc_bulk_next;
sc->sc_bulk_next = NULL;
i++;
}
- /*
- * I wonder how we prevent infinite bulk update. IMO it can
- * happen when sc_bulk_last state expires before we iterate
- * through the whole list.
- */
- PF_STATE_ENTER_READ();
- st_next = TAILQ_NEXT(st, entry_list);
- pf_state_unref(st);
- st = st_next;
- if (st == NULL)
- st = TAILQ_FIRST(&state_list);
- pf_state_ref(st);
- PF_STATE_EXIT_READ();
-
+ st = TAILQ_NEXT(st, entry_list);
if ((st == NULL) || (st == sc->sc_bulk_last)) {
/* we're done */
- pf_state_unref(sc->sc_bulk_last);
sc->sc_bulk_last = NULL;
pfsync_bulk_status(PFSYNC_BUS_END);
break;
}
}
+
+ rw_exit_read(&pf_state_list.pfs_rwl);
out:
NET_UNLOCK();
}
if (sc == NULL)
return (0);
+ rw_assert_wrlock(&pf_state_list.pfs_rwl);
+
if (st->sync_state != PFSYNC_S_NONE ||
st == sc->sc_bulk_next ||
st == sc->sc_bulk_last)
-/* $OpenBSD: pf.c,v 1.1120 2021/06/23 05:51:27 dlg Exp $ */
+/* $OpenBSD: pf.c,v 1.1121 2021/06/23 06:53:52 dlg Exp $ */
/*
* Copyright (c) 2001 Daniel Hartmeier
struct pf_src_tree tree_src_tracking;
struct pf_state_tree_id tree_id;
-struct pf_state_queue state_list;
+struct pf_state_list pf_state_list = PF_STATE_LIST_INITIALIZER(pf_state_list);
RB_GENERATE(pf_src_tree, pf_src_node, entry, pf_src_compare);
RB_GENERATE(pf_state_tree, pf_state_key, entry, pf_state_compare_key);
return (threshold->count > threshold->limit);
}
+void
+pf_state_list_insert(struct pf_state_list *pfs, struct pf_state *st)
+{
+ /*
+ * we can always put states on the end of the list.
+ *
+ * things reading the list should take a read lock, then
+ * the mutex, get the head and tail pointers, release the
+ * mutex, and then they can iterate between the head and tail.
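+ *
+ * the full traversal pattern this implies is written out in the
+ * pf_state_list comment in pfvar_priv.h.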
+ */
+
+ pf_state_ref(st); /* get a ref for the list */
+
+ mtx_enter(&pfs->pfs_mtx);
+ TAILQ_INSERT_TAIL(&pfs->pfs_list, st, entry_list);
+ mtx_leave(&pfs->pfs_mtx);
+}
+
+void
+pf_state_list_remove(struct pf_state_list *pfs, struct pf_state *st)
+{
+ /* states can only be removed when the write lock is held */
+ rw_assert_wrlock(&pfs->pfs_rwl);
+
+ mtx_enter(&pfs->pfs_mtx);
+ TAILQ_REMOVE(&pfs->pfs_list, st, entry_list);
+ mtx_leave(&pfs->pfs_mtx);
+
+ pf_state_unref(st); /* list no longer references the state */
+}
+
int
pf_src_connlimit(struct pf_state **state)
{
PF_STATE_EXIT_WRITE();
return (-1);
}
- TAILQ_INSERT_TAIL(&state_list, s, entry_list);
+ pf_state_list_insert(&pf_state_list, s);
pf_status.fcounters[FCNT_STATE_INSERT]++;
pf_status.states++;
pfi_kif_ref(kif, PFI_KIF_REF_STATE);
void
pf_purge_timeout(void *unused)
{
- task_add(net_tq(0), &pf_purge_task);
+ /* XXX move to systqmp to avoid KERNEL_LOCK */
+ task_add(systq, &pf_purge_task);
}
void
{
int *nloops = xnloops;
- KERNEL_LOCK();
- NET_LOCK();
-
/*
* process a fraction of the state table every second
* Note:
pf_purge_expired_states(1 + (pf_status.states
/ pf_default_rule.timeout[PFTM_INTERVAL]));
+ NET_LOCK();
+
PF_LOCK();
/* purge other expired types every PFTM_INTERVAL seconds */
if (++(*nloops) >= pf_default_rule.timeout[PFTM_INTERVAL]) {
*nloops = 0;
}
NET_UNLOCK();
- KERNEL_UNLOCK();
timeout_add_sec(&pf_purge_to, 1);
}
}
pf_normalize_tcp_cleanup(cur);
pfi_kif_unref(cur->kif, PFI_KIF_REF_STATE);
- TAILQ_REMOVE(&state_list, cur, entry_list);
+ pf_state_list_remove(&pf_state_list, cur);
if (cur->tag)
pf_tag_unref(cur->tag);
pf_state_unref(cur);
void
pf_purge_expired_states(u_int32_t maxcheck)
{
+ /*
+ * this task/thread/context/whatever is the only thing that
+ * removes states from the pf_state_list, so the cur reference
+ * it holds between calls is guaranteed to still be in the
+ * list.
+ */
static struct pf_state *cur = NULL;
- struct pf_state *next;
- SLIST_HEAD(pf_state_gcl, pf_state) gcl;
+
+ struct pf_state *head, *tail;
+ struct pf_state *st;
+ SLIST_HEAD(pf_state_gcl, pf_state) gcl = SLIST_HEAD_INITIALIZER(gcl);
time_t now;
PF_ASSERT_UNLOCKED();
- SLIST_INIT(&gcl);
+
+ rw_enter_read(&pf_state_list.pfs_rwl);
- PF_STATE_ENTER_READ();
+ mtx_enter(&pf_state_list.pfs_mtx);
+ head = TAILQ_FIRST(&pf_state_list.pfs_list);
+ tail = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue);
+ mtx_leave(&pf_state_list.pfs_mtx);
- now = getuptime();
+ if (head == NULL) {
+ /* the list is empty */
+ rw_exit_read(&pf_state_list.pfs_rwl);
+ return;
+ }
- while (maxcheck--) {
- uint8_t stimeout;
+ /* (re)start at the front of the list */
+ if (cur == NULL)
+ cur = head;
- /* wrap to start of list when we hit the end */
- if (cur == NULL) {
- cur = pf_state_ref(TAILQ_FIRST(&state_list));
- if (cur == NULL)
- break; /* list empty */
- }
+ now = getuptime();
- /* get next state, as cur may get deleted */
- next = TAILQ_NEXT(cur, entry_list);
+ do {
+ uint8_t stimeout = cur->timeout;
- stimeout = cur->timeout;
if ((stimeout == PFTM_UNLINKED) ||
- (pf_state_expires(cur, stimeout) <= now))
- SLIST_INSERT_HEAD(&gcl, cur, gc_list);
- else
- pf_state_unref(cur);
-
- cur = pf_state_ref(next);
+ (pf_state_expires(cur, stimeout) <= now)) {
+ st = pf_state_ref(cur);
+ SLIST_INSERT_HEAD(&gcl, st, gc_list);
+ }
- if (cur == NULL)
+ /* don't iterate past the end of our view of the list */
+ if (cur == tail) {
+ cur = NULL;
break;
- }
- PF_STATE_EXIT_READ();
+ }
+
+ cur = TAILQ_NEXT(cur, entry_list);
+ } while (maxcheck--);
+
+ rw_exit_read(&pf_state_list.pfs_rwl);
+ if (SLIST_EMPTY(&gcl))
+ return;
+
+ NET_LOCK();
+ rw_enter_write(&pf_state_list.pfs_rwl);
PF_LOCK();
PF_STATE_ENTER_WRITE();
- while ((next = SLIST_FIRST(&gcl)) != NULL) {
- SLIST_REMOVE_HEAD(&gcl, gc_list);
- if (next->timeout == PFTM_UNLINKED)
- pf_free_state(next);
- else {
- pf_remove_state(next);
- pf_free_state(next);
- }
-
- pf_state_unref(next);
+ SLIST_FOREACH(st, &gcl, gc_list) {
+ if (st->timeout != PFTM_UNLINKED)
+ pf_remove_state(st);
+
+ pf_free_state(st);
}
PF_STATE_EXIT_WRITE();
PF_UNLOCK();
+ rw_exit_write(&pf_state_list.pfs_rwl);
+ NET_UNLOCK();
+
+ while ((st = SLIST_FIRST(&gcl)) != NULL) {
+ SLIST_REMOVE_HEAD(&gcl, gc_list);
+ pf_state_unref(st);
+ }
}
int
-/* $OpenBSD: pf_ioctl.c,v 1.364 2021/06/02 07:46:22 dlg Exp $ */
+/* $OpenBSD: pf_ioctl.c,v 1.365 2021/06/23 06:53:52 dlg Exp $ */
/*
* Copyright (c) 2001 Daniel Hartmeier
u_int16_t pf_qname2qid(char *, int);
void pf_qid2qname(u_int16_t, char *);
void pf_qid_unref(u_int16_t);
+int pf_states_clr(struct pfioc_state_kill *);
+int pf_states_get(struct pfioc_states *);
struct pf_rule pf_default_rule, pf_default_rule_new;
TAILQ_INIT(&pf_queues[1]);
pf_queues_active = &pf_queues[0];
pf_queues_inactive = &pf_queues[1];
- TAILQ_INIT(&state_list);
/* default rule should never be garbage collected */
pf_default_rule.entries.tqe_prev = &pf_default_rule.entries.tqe_next;
pf_rtlabel_copyout(addr);
}
+int
+pf_states_clr(struct pfioc_state_kill *psk)
+{
+ struct pf_state *s, *nexts;
+ struct pf_state *head, *tail;
+ u_int killed = 0;
+ int error;
+
+ NET_LOCK();
+
+ /* lock against the gc removing an item from the list */
+ error = rw_enter(&pf_state_list.pfs_rwl, RW_READ|RW_INTR);
+ if (error != 0)
+ goto unlock;
+
+ /* get a snapshot view of the ends of the list to traverse between */
+ mtx_enter(&pf_state_list.pfs_mtx);
+ head = TAILQ_FIRST(&pf_state_list.pfs_list);
+ tail = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue);
+ mtx_leave(&pf_state_list.pfs_mtx);
+
+ s = NULL;
+ nexts = head;
+
+ PF_LOCK();
+ PF_STATE_ENTER_WRITE();
+
+ while (s != tail) {
+ s = nexts;
+ nexts = TAILQ_NEXT(s, entry_list);
+
+ if (s->timeout == PFTM_UNLINKED)
+ continue;
+
+ if (!psk->psk_ifname[0] || !strcmp(psk->psk_ifname,
+ s->kif->pfik_name)) {
+#if NPFSYNC > 0
+ /* don't send out individual delete messages */
+ SET(s->state_flags, PFSTATE_NOSYNC);
+#endif /* NPFSYNC > 0 */
+ pf_remove_state(s);
+ killed++;
+ }
+ }
+
+ PF_STATE_EXIT_WRITE();
+#if NPFSYNC > 0
+ pfsync_clear_states(pf_status.hostid, psk->psk_ifname);
+#endif /* NPFSYNC > 0 */
+ PF_UNLOCK();
+ rw_exit(&pf_state_list.pfs_rwl);
+
+ psk->psk_killed = killed;
+unlock:
+ NET_UNLOCK();
+
+ return (error);
+}
+
+int
+pf_states_get(struct pfioc_states *ps)
+{
+ struct pf_state *head, *tail;
+ struct pf_state *next, *state;
+ struct pfsync_state *p, pstore;
+ u_int32_t nr = 0;
+ int error;
+
+ if (ps->ps_len == 0) {
+ nr = pf_status.states;
+ ps->ps_len = sizeof(struct pfsync_state) * nr;
+ return (0);
+ }
+
+ p = ps->ps_states;
+
+ /* lock against the gc removing an item from the list */
+ error = rw_enter(&pf_state_list.pfs_rwl, RW_READ|RW_INTR);
+ if (error != 0)
+ return (error);
+
+ /* get a snapshot view of the ends of the list to traverse between */
+ mtx_enter(&pf_state_list.pfs_mtx);
+ head = TAILQ_FIRST(&pf_state_list.pfs_list);
+ tail = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue);
+ mtx_leave(&pf_state_list.pfs_mtx);
+
+ state = NULL;
+ next = head;
+
+ while (state != tail) {
+ state = next;
+ next = TAILQ_NEXT(state, entry_list);
+
+ if (state->timeout == PFTM_UNLINKED)
+ continue;
+
+ if ((nr+1) * sizeof(*p) > ps->ps_len)
+ break;
+
+ pf_state_export(&pstore, state);
+ error = copyout(&pstore, p, sizeof(*p));
+ if (error)
+ goto fail;
+
+ p++;
+ nr++;
+ }
+ ps->ps_len = sizeof(struct pfsync_state) * nr;
+
+fail:
+ rw_exit(&pf_state_list.pfs_rwl);
+
+ return (error);
+}
+
int
pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p)
{
break;
}
- case DIOCCLRSTATES: {
- struct pf_state *s, *nexts;
- struct pfioc_state_kill *psk = (struct pfioc_state_kill *)addr;
- u_int killed = 0;
-
- NET_LOCK();
- PF_LOCK();
- PF_STATE_ENTER_WRITE();
- for (s = RB_MIN(pf_state_tree_id, &tree_id); s; s = nexts) {
- nexts = RB_NEXT(pf_state_tree_id, &tree_id, s);
-
- if (!psk->psk_ifname[0] || !strcmp(psk->psk_ifname,
- s->kif->pfik_name)) {
-#if NPFSYNC > 0
- /* don't send out individual delete messages */
- SET(s->state_flags, PFSTATE_NOSYNC);
-#endif /* NPFSYNC > 0 */
- pf_remove_state(s);
- killed++;
- }
- }
- PF_STATE_EXIT_WRITE();
- psk->psk_killed = killed;
-#if NPFSYNC > 0
- pfsync_clear_states(pf_status.hostid, psk->psk_ifname);
-#endif /* NPFSYNC > 0 */
- PF_UNLOCK();
- NET_UNLOCK();
+ case DIOCCLRSTATES:
+ error = pf_states_clr((struct pfioc_state_kill *)addr);
break;
- }
case DIOCKILLSTATES: {
struct pf_state *s, *nexts;
break;
}
- case DIOCGETSTATES: {
- struct pfioc_states *ps = (struct pfioc_states *)addr;
- struct pf_state *state;
- struct pfsync_state *p, *pstore;
- u_int32_t nr = 0;
-
- if (ps->ps_len == 0) {
- nr = pf_status.states;
- ps->ps_len = sizeof(struct pfsync_state) * nr;
- break;
- }
-
- pstore = malloc(sizeof(*pstore), M_TEMP, M_WAITOK);
-
- p = ps->ps_states;
-
- NET_LOCK();
- PF_STATE_ENTER_READ();
- state = TAILQ_FIRST(&state_list);
- while (state) {
- if (state->timeout != PFTM_UNLINKED) {
- if ((nr+1) * sizeof(*p) > ps->ps_len)
- break;
- pf_state_export(pstore, state);
- error = copyout(pstore, p, sizeof(*p));
- if (error) {
- free(pstore, M_TEMP, sizeof(*pstore));
- PF_STATE_EXIT_READ();
- NET_UNLOCK();
- goto fail;
- }
- p++;
- nr++;
- }
- state = TAILQ_NEXT(state, entry_list);
- }
- PF_STATE_EXIT_READ();
- NET_UNLOCK();
-
- ps->ps_len = sizeof(struct pfsync_state) * nr;
-
- free(pstore, M_TEMP, sizeof(*pstore));
+ case DIOCGETSTATES:
+ error = pf_states_get((struct pfioc_states *)addr);
break;
- }
case DIOCGETSTATUS: {
struct pf_status *s = (struct pf_status *)addr;
-/* $OpenBSD: pfvar.h,v 1.501 2021/06/23 04:16:32 dlg Exp $ */
+/* $OpenBSD: pfvar.h,v 1.502 2021/06/23 06:53:52 dlg Exp $ */
/*
* Copyright (c) 2001 Daniel Hartmeier
RB_PROTOTYPE(pf_state_tree_id, pf_state,
entry_id, pf_state_compare_id);
extern struct pf_state_tree_id tree_id;
-extern struct pf_state_queue state_list;
+extern struct pf_state_list pf_state_list;
TAILQ_HEAD(pf_queuehead, pf_queuespec);
extern struct pf_queuehead pf_queues[2];
-/* $OpenBSD: pfvar_priv.h,v 1.6 2021/02/09 14:06:19 patrick Exp $ */
+/* $OpenBSD: pfvar_priv.h,v 1.7 2021/06/23 06:53:52 dlg Exp $ */
/*
* Copyright (c) 2001 Daniel Hartmeier
#ifdef _KERNEL
#include <sys/rwlock.h>
+#include <sys/mutex.h>
+
+/*
+ * states are linked into a global list to support the following
+ * functionality:
+ *
+ * - garbage collection
+ * - pfsync bulk send operations
+ * - bulk state fetches via the DIOCGETSTATES ioctl
+ * - bulk state clearing via the DIOCCLRSTATES ioctl
+ *
+ * a state is inserted into the global pf_state_list once it has also
+ * been successfully added to the various trees that make up the state
+ * table. states are only removed from the pf_state_list by the garbage
+ * collection process.
+ *
+ * the pf_state_list head and tail pointers (ie, the pfs_list TAILQ_HEAD
+ * structure) and the pointers between the entries on the pf_state_list
+ * are locked separately. at a high level, this allows for insertion
+ * of new states into the pf_state_list while other contexts (eg, the
+ * ioctls) are traversing the state items in the list. for garbage
+ * collection to remove items from the pf_state_list, it has to exclude
+ * both modifications to the list head and tail pointers, and traversal
+ * of the links between the states.
+ *
+ * the head and tail pointers are protected by a mutex. the pointers
+ * between states are protected by an rwlock.
+ *
+ * because insertions are only made to the end of the list, if we get
+ * a snapshot of the head and tail of the list and prevent modifications
+ * to the links between states, we can safely traverse between the
+ * head and tail entries. subsequent insertions can add entries after
+ * our view of the tail, but we don't look past our view.
+ *
+ * if both locks must be taken, the rwlock protecting the links between
+ * states is taken before the mutex protecting the head and tail
+ * pointers.
+ *
+ * insertion into the list follows this pattern:
+ *
+ * // serialise list head/tail modifications
+ * mtx_enter(&pf_state_list.pfs_mtx);
+ * TAILQ_INSERT_TAIL(&pf_state_list.pfs_list, state, entry_list);
+ * mtx_leave(&pf_state_list.pfs_mtx);
+ *
+ * traversal of the list:
+ *
+ * // lock against the gc removing an item from the list
+ * rw_enter_read(&pf_state_list.pfs_rwl);
+ *
+ * // get a snapshot view of the ends of the list
+ * mtx_enter(&pf_state_list.pfs_mtx);
+ * head = TAILQ_FIRST(&pf_state_list.pfs_list);
+ * tail = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue);
+ * mtx_leave(&pf_state_list.pfs_mtx);
+ *
+ * state = NULL;
+ * next = head;
+ *
+ * while (state != tail) {
+ * state = next;
+ * next = TAILQ_NEXT(state, entry_list);
+ *
+ * // look at the state
+ * }
+ *
+ * rw_exit_read(&pf_state_list.pfs_rwl);
+ *
+ * removing an item from the list:
+ *
+ * // wait for iterators (readers) to get out
+ * rw_enter_write(&pf_state_list.pfs_rwl);
+ *
+ * // serialise list head/tail modifications
+ * mtx_enter(&pf_state_list.pfs_mtx);
+ * TAILQ_REMOVE(&pf_state_list.pfs_list, state, entry_list);
+ * mtx_leave(&pf_state_list.pfs_mtx);
+ *
+ * rw_exit_write(&pf_state_list.pfs_rwl);
+ *
+ * the lock ordering for the pf_state_list locks and the rest of the
+ * pf locks is:
+ *
+ * 1. KERNEL_LOCK
+ * 2. NET_LOCK
+ * 3. pf_state_list.pfs_rwl
+ * 4. PF_LOCK
+ * 5. PF_STATE_LOCK
+ * 6. pf_state_list.pfs_mtx
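+ *
+ * eg, the expired-state purge in pf.c takes 2 through 5 in
+ * exactly this order when it commits its deletions, with 6
+ * then taken inside pf_state_list_remove():
+ *
+ * NET_LOCK();
+ * rw_enter_write(&pf_state_list.pfs_rwl);
+ * PF_LOCK();
+ * PF_STATE_ENTER_WRITE();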
+ */
+
+struct pf_state_list {
+ /* the list of states in the system */
+ struct pf_state_queue pfs_list;
+
+ /* serialise pfs_list head/tail access */
+ struct mutex pfs_mtx;
+
+ /* serialise access to pointers between pfs_list entries */
+ struct rwlock pfs_rwl;
+};
+
+#define PF_STATE_LIST_INITIALIZER(_pfs) { \
+ .pfs_list = TAILQ_HEAD_INITIALIZER(_pfs.pfs_list), \
+ .pfs_mtx = MUTEX_INITIALIZER(IPL_SOFTNET), \
+ .pfs_rwl = RWLOCK_INITIALIZER("pfstates"), \
+}
extern struct rwlock pf_lock;