augment the global pf state list with its own locks.
author dlg <dlg@openbsd.org>
Wed, 23 Jun 2021 06:53:51 +0000 (06:53 +0000)
committer dlg <dlg@openbsd.org>
Wed, 23 Jun 2021 06:53:51 +0000 (06:53 +0000)
before this, things that iterated over the global list of pf states
had to take the net, pf, or pf state locks. in particular, the
ioctls that dump the state table took the net and pf state locks
before iterating over the states and using copyout to export them
to userland. when we tried replacing the use of rwlocks with mutexes
under the pf locks, this blew up because you can't sleep while
holding a mutex and there's a sleeping lock used inside copyout.

this diff introduces two locks around the global state list: a mutex
that protects the head and tail of the list, and an rwlock that
protects the links between elements in the list. inserts on the
state list only occur during packet handling and can be done by
taking the mutex and putting the state on the tail before releasing
the mutex. iterating over states is only done from thread/process
contexts, so we can take a read lock, then the mutex to get a
snapshot of the head and tail pointers, and then keep the read lock
to iterate between the head and tail pointers. because it's a read
lock we can then take other sleeping locks (eg, the one inside
copyout) without (further) gymnastics. the pf state purge code takes
the rwlock exclusively and the mutex to remove elements from the
list.
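
to give a feel for how the two locks compose, here's a minimal
userland model of the scheme, using pthread primitives in place of
the kernel mutex and rwlock. the type and function names are made
up for illustration; the real code is in the diff below.

#include <sys/queue.h>
#include <pthread.h>

struct state {
	TAILQ_ENTRY(state)	 entry;
};

TAILQ_HEAD(state_queue, state);

struct state_list {
	struct state_queue	 list;
	pthread_mutex_t		 mtx;	/* protects list head/tail */
	pthread_rwlock_t	 rwl;	/* protects links between entries */
};

/* insert: the packet path only needs the mutex to append at the tail */
void
state_list_insert(struct state_list *sl, struct state *st)
{
	pthread_mutex_lock(&sl->mtx);
	TAILQ_INSERT_TAIL(&sl->list, st, entry);
	pthread_mutex_unlock(&sl->mtx);
}

/*
 * iterate: the read lock keeps the links stable; the mutex is only
 * held long enough to snapshot the ends of the list. sleeping (eg,
 * copyout in the real code) is fine under the read lock.
 */
void
state_list_iterate(struct state_list *sl, void (*visit)(struct state *))
{
	struct state *head, *tail, *st, *next;

	pthread_rwlock_rdlock(&sl->rwl);

	pthread_mutex_lock(&sl->mtx);
	head = TAILQ_FIRST(&sl->list);
	tail = TAILQ_LAST(&sl->list, state_queue);
	pthread_mutex_unlock(&sl->mtx);

	if (head != NULL) {
		st = NULL;
		next = head;
		while (st != tail) {
			st = next;
			next = TAILQ_NEXT(st, entry);
			/* don't look past our view of the tail */
			visit(st);
		}
	}

	pthread_rwlock_unlock(&sl->rwl);
}

/*
 * remove: the purge path excludes iterators with the write lock
 * and inserters with the mutex.
 */
void
state_list_remove(struct state_list *sl, struct state *st)
{
	pthread_rwlock_wrlock(&sl->rwl);
	pthread_mutex_lock(&sl->mtx);
	TAILQ_REMOVE(&sl->list, st, entry);
	pthread_mutex_unlock(&sl->mtx);
	pthread_rwlock_unlock(&sl->rwl);
}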

this allows the ioctls and purge code to loop over the list
concurrently and largely without blocking the creation of states
when pf is processing packets.

pfsync also iterates over the state list when doing bulk sends,
which the state purge code needs to be careful around.
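
in concrete terms (condensed from the if_pfsync.c hunks below):
pfsync pins sc_bulk_next/sc_bulk_last under the read lock, and the
purge side (pf_free_state() via pfsync_state_in_use()) checks them
with the write lock held, so the cursors behave like hazard pointers
that can't move underneath the purge.

	/* pfsync_bulk_start(): pin the bulk cursors under the read lock */
	rw_enter_read(&pf_state_list.pfs_rwl);
	mtx_enter(&pf_state_list.pfs_mtx);
	if (sc->sc_bulk_next == NULL)
		sc->sc_bulk_next = TAILQ_FIRST(&pf_state_list.pfs_list);
	sc->sc_bulk_last = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue);
	mtx_leave(&pf_state_list.pfs_mtx);
	rw_exit_read(&pf_state_list.pfs_rwl);

	/*
	 * pfsync_state_in_use(): pf gc asks with the write lock held,
	 * so the answer can't change while it decides whether to free st.
	 */
	rw_assert_wrlock(&pf_state_list.pfs_rwl);
	if (st->sync_state != PFSYNC_S_NONE ||
	    st == sc->sc_bulk_next ||
	    st == sc->sc_bulk_last)
		return (1);	/* still in use; don't free it yet */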

ok sashan@

sys/net/if_pfsync.c
sys/net/pf.c
sys/net/pf_ioctl.c
sys/net/pfvar.h
sys/net/pfvar_priv.h

diff --git a/sys/net/if_pfsync.c b/sys/net/if_pfsync.c
index cc8fa39..eaaaa26 100644
@@ -1,4 +1,4 @@
-/*     $OpenBSD: if_pfsync.c,v 1.294 2021/06/23 05:43:53 dlg Exp $     */
+/*     $OpenBSD: if_pfsync.c,v 1.295 2021/06/23 06:53:51 dlg Exp $     */
 
 /*
  * Copyright (c) 2002 Michael Shalayeff
@@ -2550,22 +2550,34 @@ pfsync_bulk_start(void)
 {
        struct pfsync_softc *sc = pfsyncif;
 
+       NET_ASSERT_LOCKED();
+
+       /*
+        * pf gc via pfsync_state_in_use reads sc_bulk_next and
+        * sc_bulk_last while exclusively holding the pf_state_list
+        * rwlock. make sure it can't race with us setting these
+        * pointers. they basically act as hazards, and borrow the
+        * list's state reference count.
+        */
+       rw_enter_read(&pf_state_list.pfs_rwl);
+
+       /* get a consistent view of the list pointers */
+       mtx_enter(&pf_state_list.pfs_mtx);
+       if (sc->sc_bulk_next == NULL)
+               sc->sc_bulk_next = TAILQ_FIRST(&pf_state_list.pfs_list);
+
+       sc->sc_bulk_last = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue);
+       mtx_leave(&pf_state_list.pfs_mtx);
+
+       rw_exit_read(&pf_state_list.pfs_rwl);
+
        DPFPRINTF(LOG_INFO, "received bulk update request");
 
-       if (TAILQ_EMPTY(&state_list))
+       if (sc->sc_bulk_last == NULL)
                pfsync_bulk_status(PFSYNC_BUS_END);
        else {
                sc->sc_ureq_received = getuptime();
 
-               if (sc->sc_bulk_next == NULL) {
-                       PF_STATE_ENTER_READ();
-                       sc->sc_bulk_next = TAILQ_FIRST(&state_list);
-                       pf_state_ref(sc->sc_bulk_next);
-                       PF_STATE_EXIT_READ();
-               }
-               sc->sc_bulk_last = sc->sc_bulk_next;
-               pf_state_ref(sc->sc_bulk_last);
-
                pfsync_bulk_status(PFSYNC_BUS_START);
                timeout_add(&sc->sc_bulk_tmo, 0);
        }
@@ -2575,13 +2587,15 @@ void
 pfsync_bulk_update(void *arg)
 {
        struct pfsync_softc *sc;
-       struct pf_state *st, *st_next;
+       struct pf_state *st;
        int i = 0;
 
        NET_LOCK();
        sc = pfsyncif;
        if (sc == NULL)
                goto out;
+
+       rw_enter_read(&pf_state_list.pfs_rwl);
        st = sc->sc_bulk_next;
        sc->sc_bulk_next = NULL;
 
@@ -2593,23 +2607,9 @@ pfsync_bulk_update(void *arg)
                        i++;
                }
 
-               /*
-                * I wonder how we prevent infinite bulk update.  IMO it can
-                * happen when sc_bulk_last state expires before we iterate
-                * through the whole list.
-                */
-               PF_STATE_ENTER_READ();
-               st_next = TAILQ_NEXT(st, entry_list);
-               pf_state_unref(st);
-               st = st_next;
-               if (st == NULL)
-                       st = TAILQ_FIRST(&state_list);
-               pf_state_ref(st);
-               PF_STATE_EXIT_READ();
-
+               st = TAILQ_NEXT(st, entry_list);
                if ((st == NULL) || (st == sc->sc_bulk_last)) {
                        /* we're done */
-                       pf_state_unref(sc->sc_bulk_last);
                        sc->sc_bulk_last = NULL;
                        pfsync_bulk_status(PFSYNC_BUS_END);
                        break;
@@ -2623,6 +2623,8 @@ pfsync_bulk_update(void *arg)
                        break;
                }
        }
+
+       rw_exit_read(&pf_state_list.pfs_rwl);
  out:
        NET_UNLOCK();
 }
@@ -2720,6 +2722,8 @@ pfsync_state_in_use(struct pf_state *st)
        if (sc == NULL)
                return (0);
 
+       rw_assert_wrlock(&pf_state_list.pfs_rwl);
+
        if (st->sync_state != PFSYNC_S_NONE ||
            st == sc->sc_bulk_next ||
            st == sc->sc_bulk_last)
diff --git a/sys/net/pf.c b/sys/net/pf.c
index 27f961a..672f415 100644
@@ -1,4 +1,4 @@
-/*     $OpenBSD: pf.c,v 1.1120 2021/06/23 05:51:27 dlg Exp $ */
+/*     $OpenBSD: pf.c,v 1.1121 2021/06/23 06:53:52 dlg Exp $ */
 
 /*
  * Copyright (c) 2001 Daniel Hartmeier
@@ -309,7 +309,7 @@ static __inline void pf_set_protostate(struct pf_state *, int, u_int8_t);
 struct pf_src_tree tree_src_tracking;
 
 struct pf_state_tree_id tree_id;
-struct pf_state_queue state_list;
+struct pf_state_list pf_state_list = PF_STATE_LIST_INITIALIZER(pf_state_list);
 
 RB_GENERATE(pf_src_tree, pf_src_node, entry, pf_src_compare);
 RB_GENERATE(pf_state_tree, pf_state_key, entry, pf_state_compare_key);
@@ -441,6 +441,37 @@ pf_check_threshold(struct pf_threshold *threshold)
        return (threshold->count > threshold->limit);
 }
 
+void
+pf_state_list_insert(struct pf_state_list *pfs, struct pf_state *st)
+{
+       /*
+        * we can always put states on the end of the list.
+        *
+        * things reading the list should take a read lock, then
+        * the mutex, get the head and tail pointers, release the
+        * mutex, and then they can iterate between the head and tail.
+        */
+
+       pf_state_ref(st); /* get a ref for the list */
+
+       mtx_enter(&pfs->pfs_mtx);
+       TAILQ_INSERT_TAIL(&pfs->pfs_list, st, entry_list);
+       mtx_leave(&pfs->pfs_mtx);
+}
+
+void
+pf_state_list_remove(struct pf_state_list *pfs, struct pf_state *st)
+{
+       /* states can only be removed when the write lock is held */
+       rw_assert_wrlock(&pfs->pfs_rwl);
+
+       mtx_enter(&pfs->pfs_mtx);
+       TAILQ_REMOVE(&pfs->pfs_list, st, entry_list);
+       mtx_leave(&pfs->pfs_mtx);
+
+       pf_state_unref(st); /* list no longer references the state */
+}
+
 int
 pf_src_connlimit(struct pf_state **state)
 {
@@ -987,7 +1018,7 @@ pf_state_insert(struct pfi_kif *kif, struct pf_state_key **skw,
                PF_STATE_EXIT_WRITE();
                return (-1);
        }
-       TAILQ_INSERT_TAIL(&state_list, s, entry_list);
+       pf_state_list_insert(&pf_state_list, s);
        pf_status.fcounters[FCNT_STATE_INSERT]++;
        pf_status.states++;
        pfi_kif_ref(kif, PFI_KIF_REF_STATE);
@@ -1248,7 +1279,8 @@ pf_purge_expired_rules(void)
 void
 pf_purge_timeout(void *unused)
 {
-       task_add(net_tq(0), &pf_purge_task);
+       /* XXX move to systqmp to avoid KERNEL_LOCK */
+       task_add(systq, &pf_purge_task);
 }
 
 void
@@ -1256,9 +1288,6 @@ pf_purge(void *xnloops)
 {
        int *nloops = xnloops;
 
-       KERNEL_LOCK();
-       NET_LOCK();
-
        /*
         * process a fraction of the state table every second
         * Note:
@@ -1269,6 +1298,8 @@ pf_purge(void *xnloops)
        pf_purge_expired_states(1 + (pf_status.states
            / pf_default_rule.timeout[PFTM_INTERVAL]));
 
+       NET_LOCK();
+
        PF_LOCK();
        /* purge other expired types every PFTM_INTERVAL seconds */
        if (++(*nloops) >= pf_default_rule.timeout[PFTM_INTERVAL]) {
@@ -1285,7 +1316,6 @@ pf_purge(void *xnloops)
                *nloops = 0;
        }
        NET_UNLOCK();
-       KERNEL_UNLOCK();
 
        timeout_add_sec(&pf_purge_to, 1);
 }
@@ -1463,7 +1493,7 @@ pf_free_state(struct pf_state *cur)
        }
        pf_normalize_tcp_cleanup(cur);
        pfi_kif_unref(cur->kif, PFI_KIF_REF_STATE);
-       TAILQ_REMOVE(&state_list, cur, entry_list);
+       pf_state_list_remove(&pf_state_list, cur);
        if (cur->tag)
                pf_tag_unref(cur->tag);
        pf_state_unref(cur);
@@ -1474,60 +1504,82 @@ pf_free_state(struct pf_state *cur)
 void
 pf_purge_expired_states(u_int32_t maxcheck)
 {
+       /*
+        * this task/thread/context/whatever is the only thing that
+        * removes states from the pf_state_list, so the cur reference
+        * it holds between calls is guaranteed to still be in the
+        * list.
+        */
        static struct pf_state  *cur = NULL;
-       struct pf_state         *next;
-       SLIST_HEAD(pf_state_gcl, pf_state) gcl;
+
+       struct pf_state         *head, *tail;
+       struct pf_state         *st;
+       SLIST_HEAD(pf_state_gcl, pf_state) gcl = SLIST_HEAD_INITIALIZER(gcl);
        time_t                   now;
 
        PF_ASSERT_UNLOCKED();
-       SLIST_INIT(&gcl);
+       rw_enter_read(&pf_state_list.pfs_rwl);
 
-       PF_STATE_ENTER_READ();
+       mtx_enter(&pf_state_list.pfs_mtx);
+       head = TAILQ_FIRST(&pf_state_list.pfs_list);
+       tail = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue);
+       mtx_leave(&pf_state_list.pfs_mtx);
 
-       now = getuptime();
+       if (head == NULL) {
+               /* the list is empty */
+               rw_exit_read(&pf_state_list.pfs_rwl);
+               return;
+       }
 
-       while (maxcheck--) {
-               uint8_t stimeout;
+       /* (re)start at the front of the list */
+       if (cur == NULL)
+               cur = head;
 
-               /* wrap to start of list when we hit the end */
-               if (cur == NULL) {
-                       cur = pf_state_ref(TAILQ_FIRST(&state_list));
-                       if (cur == NULL)
-                               break;  /* list empty */
-               }
+       now = getuptime();
 
-               /* get next state, as cur may get deleted */
-               next = TAILQ_NEXT(cur, entry_list);
+       do {
+               uint8_t stimeout = cur->timeout;
 
-               stimeout = cur->timeout;
                if ((stimeout == PFTM_UNLINKED) ||
-                   (pf_state_expires(cur, stimeout) <= now))
-                       SLIST_INSERT_HEAD(&gcl, cur, gc_list);
-               else
-                       pf_state_unref(cur);
-
-               cur = pf_state_ref(next);
+                   (pf_state_expires(cur, stimeout) <= now)) {
+                       st = pf_state_ref(cur);
+                       SLIST_INSERT_HEAD(&gcl, st, gc_list);
+               }
 
-               if (cur == NULL)
+               /* don't iterate past the end of our view of the list */
+               if (cur == tail) {
+                       cur = NULL;
                        break;
-       }
-       PF_STATE_EXIT_READ();
+               }
+               cur = TAILQ_NEXT(cur, entry_list);
+       } while (maxcheck--);
+
+       rw_exit_read(&pf_state_list.pfs_rwl);
 
+       if (SLIST_EMPTY(&gcl))
+               return;
+
+       NET_LOCK();
+       rw_enter_write(&pf_state_list.pfs_rwl);
        PF_LOCK();
        PF_STATE_ENTER_WRITE();
-       while ((next = SLIST_FIRST(&gcl)) != NULL) {
-               SLIST_REMOVE_HEAD(&gcl, gc_list);
-               if (next->timeout == PFTM_UNLINKED)
-                       pf_free_state(next);
-               else {
-                       pf_remove_state(next);
-                       pf_free_state(next);
-               }
-
-               pf_state_unref(next);
+       SLIST_FOREACH(st, &gcl, gc_list) {
+               if (st->timeout != PFTM_UNLINKED)
+                       pf_remove_state(st);
+               pf_free_state(st);
        }
        PF_STATE_EXIT_WRITE();
        PF_UNLOCK();
+       rw_exit_write(&pf_state_list.pfs_rwl);
+       NET_UNLOCK();
+
+       while ((st = SLIST_FIRST(&gcl)) != NULL) {
+               SLIST_REMOVE_HEAD(&gcl, gc_list);
+               pf_state_unref(st);
+       }
 }
 
 int
diff --git a/sys/net/pf_ioctl.c b/sys/net/pf_ioctl.c
index 2335fb0..d4893d7 100644
@@ -1,4 +1,4 @@
-/*     $OpenBSD: pf_ioctl.c,v 1.364 2021/06/02 07:46:22 dlg Exp $ */
+/*     $OpenBSD: pf_ioctl.c,v 1.365 2021/06/23 06:53:52 dlg Exp $ */
 
 /*
  * Copyright (c) 2001 Daniel Hartmeier
@@ -113,6 +113,8 @@ int                  pf_rule_copyin(struct pf_rule *, struct pf_rule *,
 u_int16_t               pf_qname2qid(char *, int);
 void                    pf_qid2qname(u_int16_t, char *);
 void                    pf_qid_unref(u_int16_t);
+int                     pf_states_clr(struct pfioc_state_kill *);
+int                     pf_states_get(struct pfioc_states *);
 
 struct pf_rule          pf_default_rule, pf_default_rule_new;
 
@@ -206,7 +208,6 @@ pfattach(int num)
        TAILQ_INIT(&pf_queues[1]);
        pf_queues_active = &pf_queues[0];
        pf_queues_inactive = &pf_queues[1];
-       TAILQ_INIT(&state_list);
 
        /* default rule should never be garbage collected */
        pf_default_rule.entries.tqe_prev = &pf_default_rule.entries.tqe_next;
@@ -902,6 +903,122 @@ pf_addr_copyout(struct pf_addr_wrap *addr)
        pf_rtlabel_copyout(addr);
 }
 
+int
+pf_states_clr(struct pfioc_state_kill *psk)
+{
+       struct pf_state         *s, *nexts;
+       struct pf_state         *head, *tail;
+       u_int                    killed = 0;
+       int                      error;
+
+       NET_LOCK();
+
+       /* lock against the gc removing an item from the list */
+       error = rw_enter(&pf_state_list.pfs_rwl, RW_READ|RW_INTR);
+       if (error != 0)
+               goto unlock;
+
+       /* get a snapshot view of the ends of the list to traverse between */
+       mtx_enter(&pf_state_list.pfs_mtx);
+       head = TAILQ_FIRST(&pf_state_list.pfs_list);
+       tail = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue);
+       mtx_leave(&pf_state_list.pfs_mtx);
+
+       s = NULL;
+       nexts = head;
+
+       PF_LOCK();
+       PF_STATE_ENTER_WRITE();
+
+       while (s != tail) {
+               s = nexts;
+               nexts = TAILQ_NEXT(s, entry_list);
+
+               if (s->timeout == PFTM_UNLINKED)
+                       continue;
+
+               if (!psk->psk_ifname[0] || !strcmp(psk->psk_ifname,
+                   s->kif->pfik_name)) {
+#if NPFSYNC > 0
+                       /* don't send out individual delete messages */
+                       SET(s->state_flags, PFSTATE_NOSYNC);
+#endif /* NPFSYNC > 0 */
+                       pf_remove_state(s);
+                       killed++;
+               }
+       }
+
+       PF_STATE_EXIT_WRITE();
+#if NPFSYNC > 0
+       pfsync_clear_states(pf_status.hostid, psk->psk_ifname);
+#endif /* NPFSYNC > 0 */
+       PF_UNLOCK();
+       rw_exit(&pf_state_list.pfs_rwl);
+
+       psk->psk_killed = killed;
+unlock:
+       NET_UNLOCK();
+
+       return (error);
+}
+
+int
+pf_states_get(struct pfioc_states *ps)
+{
+       struct pf_state         *head, *tail;
+       struct pf_state         *next, *state;
+       struct pfsync_state     *p, pstore;
+       u_int32_t                nr = 0;
+       int                      error;
+
+       if (ps->ps_len == 0) {
+               nr = pf_status.states;
+               ps->ps_len = sizeof(struct pfsync_state) * nr;
+               return (0);
+       }
+
+       p = ps->ps_states;
+
+       /* lock against the gc removing an item from the list */
+       error = rw_enter(&pf_state_list.pfs_rwl, RW_READ|RW_INTR);
+       if (error != 0)
+               return (error);
+
+       /* get a snapshot view of the ends of the list to traverse between */
+       mtx_enter(&pf_state_list.pfs_mtx);
+       head = TAILQ_FIRST(&pf_state_list.pfs_list);
+       tail = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue);
+       mtx_leave(&pf_state_list.pfs_mtx);
+
+       state = NULL;
+       next = head;
+
+       while (state != tail) {
+               state = next;
+               next = TAILQ_NEXT(state, entry_list);
+
+               if (state->timeout == PFTM_UNLINKED)
+                       continue;
+
+               if ((nr+1) * sizeof(*p) > ps->ps_len)
+                       break;
+
+               pf_state_export(&pstore, state);
+               error = copyout(&pstore, p, sizeof(*p));
+               if (error)
+                       goto fail;
+
+               p++;
+               nr++;
+       }
+       ps->ps_len = sizeof(struct pfsync_state) * nr;
+
+fail:
+       rw_exit(&pf_state_list.pfs_rwl);
+
+       return (error);
+}
+
 int
 pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p)
 {
@@ -1545,36 +1662,9 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p)
                break;
        }
 
-       case DIOCCLRSTATES: {
-               struct pf_state         *s, *nexts;
-               struct pfioc_state_kill *psk = (struct pfioc_state_kill *)addr;
-               u_int                    killed = 0;
-
-               NET_LOCK();
-               PF_LOCK();
-               PF_STATE_ENTER_WRITE();
-               for (s = RB_MIN(pf_state_tree_id, &tree_id); s; s = nexts) {
-                       nexts = RB_NEXT(pf_state_tree_id, &tree_id, s);
-
-                       if (!psk->psk_ifname[0] || !strcmp(psk->psk_ifname,
-                           s->kif->pfik_name)) {
-#if NPFSYNC > 0
-                               /* don't send out individual delete messages */
-                               SET(s->state_flags, PFSTATE_NOSYNC);
-#endif /* NPFSYNC > 0 */
-                               pf_remove_state(s);
-                               killed++;
-                       }
-               }
-               PF_STATE_EXIT_WRITE();
-               psk->psk_killed = killed;
-#if NPFSYNC > 0
-               pfsync_clear_states(pf_status.hostid, psk->psk_ifname);
-#endif /* NPFSYNC > 0 */
-               PF_UNLOCK();
-               NET_UNLOCK();
+       case DIOCCLRSTATES:
+               error = pf_states_clr((struct pfioc_state_kill *)addr);
                break;
-       }
 
        case DIOCKILLSTATES: {
                struct pf_state         *s, *nexts;
@@ -1757,50 +1847,9 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p)
                break;
        }
 
-       case DIOCGETSTATES: {
-               struct pfioc_states     *ps = (struct pfioc_states *)addr;
-               struct pf_state         *state;
-               struct pfsync_state     *p, *pstore;
-               u_int32_t                nr = 0;
-
-               if (ps->ps_len == 0) {
-                       nr = pf_status.states;
-                       ps->ps_len = sizeof(struct pfsync_state) * nr;
-                       break;
-               }
-
-               pstore = malloc(sizeof(*pstore), M_TEMP, M_WAITOK);
-
-               p = ps->ps_states;
-
-               NET_LOCK();
-               PF_STATE_ENTER_READ();
-               state = TAILQ_FIRST(&state_list);
-               while (state) {
-                       if (state->timeout != PFTM_UNLINKED) {
-                               if ((nr+1) * sizeof(*p) > ps->ps_len)
-                                       break;
-                               pf_state_export(pstore, state);
-                               error = copyout(pstore, p, sizeof(*p));
-                               if (error) {
-                                       free(pstore, M_TEMP, sizeof(*pstore));
-                                       PF_STATE_EXIT_READ();
-                                       NET_UNLOCK();
-                                       goto fail;
-                               }
-                               p++;
-                               nr++;
-                       }
-                       state = TAILQ_NEXT(state, entry_list);
-               }
-               PF_STATE_EXIT_READ();
-               NET_UNLOCK();
-
-               ps->ps_len = sizeof(struct pfsync_state) * nr;
-
-               free(pstore, M_TEMP, sizeof(*pstore));
+       case DIOCGETSTATES: 
+               error = pf_states_get((struct pfioc_states *)addr);
                break;
-       }
 
        case DIOCGETSTATUS: {
                struct pf_status *s = (struct pf_status *)addr;
diff --git a/sys/net/pfvar.h b/sys/net/pfvar.h
index 44a7f60..d2f2464 100644
@@ -1,4 +1,4 @@
-/*     $OpenBSD: pfvar.h,v 1.501 2021/06/23 04:16:32 dlg Exp $ */
+/*     $OpenBSD: pfvar.h,v 1.502 2021/06/23 06:53:52 dlg Exp $ */
 
 /*
  * Copyright (c) 2001 Daniel Hartmeier
@@ -1682,7 +1682,7 @@ RB_HEAD(pf_state_tree_id, pf_state);
 RB_PROTOTYPE(pf_state_tree_id, pf_state,
     entry_id, pf_state_compare_id);
 extern struct pf_state_tree_id tree_id;
-extern struct pf_state_queue state_list;
+extern struct pf_state_list pf_state_list;
 
 TAILQ_HEAD(pf_queuehead, pf_queuespec);
 extern struct pf_queuehead               pf_queues[2];
diff --git a/sys/net/pfvar_priv.h b/sys/net/pfvar_priv.h
index b1a122a..5bba87b 100644
@@ -1,4 +1,4 @@
-/*     $OpenBSD: pfvar_priv.h,v 1.6 2021/02/09 14:06:19 patrick Exp $  */
+/*     $OpenBSD: pfvar_priv.h,v 1.7 2021/06/23 06:53:52 dlg Exp $      */
 
 /*
  * Copyright (c) 2001 Daniel Hartmeier
 #ifdef _KERNEL
 
 #include <sys/rwlock.h>
+#include <sys/mutex.h>
+
+/*
+ *
+ * states are linked into a global list to support the following
+ * functionality:
+ *
+ * - garbage collection
+ * - pfsync bulk send operations
+ * - bulk state fetches via the DIOCGETSTATES ioctl
+ * - bulk state clearing via the DIOCCLRSTATES ioctl
+ * 
+ * a state is inserted into the global pf_state_list once it has also
+ * been successfully added to the various trees that make up the state
+ * table. states are only removed from the pf_state_list by the garbage
+ * collection process.
+ *
+ * the pf_state_list head and tail pointers (ie, the pfs_list TAILQ_HEAD
+ * structure) and the pointers between the entries on the pf_state_list
+ * are locked separately. at a high level, this allows for insertion
+ * of new states into the pf_state_list while other contexts (eg, the
+ * ioctls) are traversing the state items in the list. for garbage
+ * collection to remove items from the pf_state_list, it has to exclude
+ * both modifications to the list head and tail pointers, and traversal
+ * of the links between the states.
+ *
+ * the head and tail pointers are protected by a mutex. the pointers
+ * between states are protected by an rwlock.
+ *
+ * because insertions are only made to the end of the list, if we get
+ * a snapshot of the head and tail of the list and prevent modifications
+ * to the links between states, we can safely traverse between the
+ * head and tail entries. subsequent insertions can add entries after
+ * our view of the tail, but we don't look past our view.
+ *
+ * if both locks must be taken, the rwlock protecting the links between
+ * states is taken before the mutex protecting the head and tail
+ * pointer.
+ *
+ * insertion into the list follows this pattern:
+ *
+ *     // serialise list head/tail modifications
+ *     mtx_enter(&pf_state_list.pfs_mtx);
+ *     TAILQ_INSERT_TAIL(&pf_state_list.pfs_list, state, entry_list);
+ *     mtx_leave(&pf_state_list.pfs_mtx);
+ *
+ * traversal of the list:
+ *
+ *     // lock against the gc removing an item from the list
+ *     rw_enter_read(&pf_state_list.pfs_rwl);
+ *
+ *     // get a snapshot view of the ends of the list
+ *     mtx_enter(&pf_state_list.pfs_mtx);
+ *     head = TAILQ_FIRST(&pf_state_list.pfs_list);
+ *     tail = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue);
+ *     mtx_leave(&pf_state_list.pfs_mtx);
+ *
+ *     state = NULL;
+ *     next = head;
+ *
+ *     while (state != tail) {
+ *             state = next;
+ *             next = TAILQ_NEXT(state, entry_list);
+ *
+ *             // look at the state
+ *     }
+ *
+ *     rw_exit_read(&pf_state_list.pfs_rwl);
+ *
+ * removing an item from the list:
+ * 
+ *     // wait for iterators (readers) to get out
+ *     rw_enter_write(&pf_state_list.pfs_rwl);
+ *
+ *     // serialise list head/tail modifications
+ *     mtx_enter(&pf_state_list.pfs_mtx);
+ *     TAILQ_REMOVE(&pf_state_list.pfs_list, state, entry_list);
+ *     mtx_leave(&pf_state_list.pfs_mtx);
+ *
+ *     rw_exit_write(&pf_state_list.pfs_rwl);
+ *
+ * the lock ordering for pf_state_list locks and the rest of the pf
+ * locks are:
+ *
+ * 1. KERNEL_LOCK
+ * 2. NET_LOCK
+ * 3. pf_state_list.pfs_rwl
+ * 4. PF_LOCK
+ * 5. PF_STATE_LOCK
+ * 6. pf_state_list.pfs_mtx
+ */
+
+struct pf_state_list {
+       /* the list of states in the system */
+       struct pf_state_queue           pfs_list;
+
+       /* serialise pfs_list head/tail access */
+       struct mutex                    pfs_mtx;
+
+       /* serialise access to pointers between pfs_list entries */
+       struct rwlock                   pfs_rwl;
+};
+
+#define PF_STATE_LIST_INITIALIZER(_pfs) {                              \
+       .pfs_list       = TAILQ_HEAD_INITIALIZER(_pfs.pfs_list),        \
+       .pfs_mtx        = MUTEX_INITIALIZER(IPL_SOFTNET),               \
+       .pfs_rwl        = RWLOCK_INITIALIZER("pfstates"),               \
+}
 
 extern struct rwlock pf_lock;