augment the global pf state list with its own locks.
author dlg <dlg@openbsd.org>
Wed, 23 Jun 2021 06:53:51 +0000 (06:53 +0000)
committer dlg <dlg@openbsd.org>
Wed, 23 Jun 2021 06:53:51 +0000 (06:53 +0000)
before this, things that iterated over the global list of pf states
had to take the net, pf, or pf state locks. in particular, the
ioctls that dump the state table took the net and pf state locks
before iterating over the states and using copyout to export them
to userland. when we tried replacing the use of rwlocks with mutexes
under the pf locks, this blew up because you can't sleep while
holding a mutex and there's a sleeping lock used inside copyout.

this diff introduces two locks around the global state list: a mutex
that protects the head and tail of the list, and an rwlock that
protects the links between elements in the list. inserts on the
state list only occur during packet handling and can be done by
taking the mutex and putting the state on the tail before releasing
the mutex. iterating over states is only done from thread/process
contexts, so we can take a read lock, then the mutex to get a
snapshot of the head and tail pointers, and then keep the read lock
to iterate between the head and tail pointers. because it's a read
lock we can then take other sleeping locks (eg, the one inside
copyout) without (further) gymnastics. the pf state purge code takes
the rwlock exclusively and the mutex to remove elements from the
list.
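
to give a feel for how the two locks compose, here's a minimal
userland model of the scheme, using pthread primitives in place of
the kernel mutex and rwlock. the type and function names are made
up for illustration; the real code is in the diff below.

#include <sys/queue.h>
#include <pthread.h>

struct state {
	TAILQ_ENTRY(state)	 entry;
};

TAILQ_HEAD(state_queue, state);

struct state_list {
	struct state_queue	 list;
	pthread_mutex_t		 mtx;	/* protects list head/tail */
	pthread_rwlock_t	 rwl;	/* protects links between entries */
};

/* insert: the packet path only needs the mutex to append at the tail */
void
state_list_insert(struct state_list *sl, struct state *st)
{
	pthread_mutex_lock(&sl->mtx);
	TAILQ_INSERT_TAIL(&sl->list, st, entry);
	pthread_mutex_unlock(&sl->mtx);
}

/*
 * iterate: the read lock keeps the links stable; the mutex is only
 * held long enough to snapshot the ends of the list. sleeping (eg,
 * copyout in the real code) is fine under the read lock.
 */
void
state_list_iterate(struct state_list *sl, void (*visit)(struct state *))
{
	struct state *head, *tail, *st, *next;

	pthread_rwlock_rdlock(&sl->rwl);

	pthread_mutex_lock(&sl->mtx);
	head = TAILQ_FIRST(&sl->list);
	tail = TAILQ_LAST(&sl->list, state_queue);
	pthread_mutex_unlock(&sl->mtx);

	if (head != NULL) {
		st = NULL;
		next = head;
		while (st != tail) {
			st = next;
			next = TAILQ_NEXT(st, entry);
			/* don't look past our view of the tail */
			visit(st);
		}
	}

	pthread_rwlock_unlock(&sl->rwl);
}

/*
 * remove: the purge path excludes iterators with the write lock
 * and inserters with the mutex.
 */
void
state_list_remove(struct state_list *sl, struct state *st)
{
	pthread_rwlock_wrlock(&sl->rwl);
	pthread_mutex_lock(&sl->mtx);
	TAILQ_REMOVE(&sl->list, st, entry);
	pthread_mutex_unlock(&sl->mtx);
	pthread_rwlock_unlock(&sl->rwl);
}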

this allows the ioctls and purge code to loop over the list
concurrently and largely without blocking the creation of states
when pf is processing packets.

pfsync also iterates over the state list when doing bulk sends,
which the state purge code needs to be careful around.
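
in concrete terms (condensed from the if_pfsync.c hunks below):
pfsync pins sc_bulk_next/sc_bulk_last under the read lock, and the
purge side (pf_free_state() via pfsync_state_in_use()) checks them
with the write lock held, so the cursors behave like hazard pointers
that can't move underneath the purge.

	/* pfsync_bulk_start(): pin the bulk cursors under the read lock */
	rw_enter_read(&pf_state_list.pfs_rwl);
	mtx_enter(&pf_state_list.pfs_mtx);
	if (sc->sc_bulk_next == NULL)
		sc->sc_bulk_next = TAILQ_FIRST(&pf_state_list.pfs_list);
	sc->sc_bulk_last = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue);
	mtx_leave(&pf_state_list.pfs_mtx);
	rw_exit_read(&pf_state_list.pfs_rwl);

	/*
	 * pfsync_state_in_use(): pf gc asks with the write lock held,
	 * so the answer can't change while it decides whether to free st.
	 */
	rw_assert_wrlock(&pf_state_list.pfs_rwl);
	if (st->sync_state != PFSYNC_S_NONE ||
	    st == sc->sc_bulk_next ||
	    st == sc->sc_bulk_last)
		return (1);	/* still in use; don't free it yet */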

ok sashan@

sys/net/if_pfsync.c
sys/net/pf.c
sys/net/pf_ioctl.c
sys/net/pfvar.h
sys/net/pfvar_priv.h

diff --git a/sys/net/if_pfsync.c b/sys/net/if_pfsync.c
index cc8fa39..eaaaa26 100644
@@ -1,4 +1,4 @@
-/*     $OpenBSD: if_pfsync.c,v 1.294 2021/06/23 05:43:53 dlg Exp $     */
+/*     $OpenBSD: if_pfsync.c,v 1.295 2021/06/23 06:53:51 dlg Exp $     */
 
 /*
  * Copyright (c) 2002 Michael Shalayeff
@@ -2550,22 +2550,34 @@ pfsync_bulk_start(void)
 {
        struct pfsync_softc *sc = pfsyncif;
 
+       NET_ASSERT_LOCKED();
+
+       /*
+        * pf gc via pfsync_state_in_use reads sc_bulk_next and
+        * sc_bulk_last while exclusively holding the pf_state_list
+        * rwlock. make sure it can't race with us setting these
+        * pointers. they basically act as hazards, and borrow the
+        * list's state reference count.
+        */
+       rw_enter_read(&pf_state_list.pfs_rwl);
+
+       /* get a consistent view of the list pointers */
+       mtx_enter(&pf_state_list.pfs_mtx);
+       if (sc->sc_bulk_next == NULL)
+               sc->sc_bulk_next = TAILQ_FIRST(&pf_state_list.pfs_list);
+
+       sc->sc_bulk_last = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue);
+       mtx_leave(&pf_state_list.pfs_mtx);
+
+       rw_exit_read(&pf_state_list.pfs_rwl);
+
        DPFPRINTF(LOG_INFO, "received bulk update request");
 
-       if (TAILQ_EMPTY(&state_list))
+       if (sc->sc_bulk_last == NULL)
                pfsync_bulk_status(PFSYNC_BUS_END);
        else {
                sc->sc_ureq_received = getuptime();
 
-               if (sc->sc_bulk_next == NULL) {
-                       PF_STATE_ENTER_READ();
-                       sc->sc_bulk_next = TAILQ_FIRST(&state_list);
-                       pf_state_ref(sc->sc_bulk_next);
-                       PF_STATE_EXIT_READ();
-               }
-               sc->sc_bulk_last = sc->sc_bulk_next;
-               pf_state_ref(sc->sc_bulk_last);
-
                pfsync_bulk_status(PFSYNC_BUS_START);
                timeout_add(&sc->sc_bulk_tmo, 0);
        }
@@ -2575,13 +2587,15 @@ void
 pfsync_bulk_update(void *arg)
 {
        struct pfsync_softc *sc;
-       struct pf_state *st, *st_next;
+       struct pf_state *st;
        int i = 0;
 
        NET_LOCK();
        sc = pfsyncif;
        if (sc == NULL)
                goto out;
+
+       rw_enter_read(&pf_state_list.pfs_rwl);
        st = sc->sc_bulk_next;
        sc->sc_bulk_next = NULL;
 
@@ -2593,23 +2607,9 @@ pfsync_bulk_update(void *arg)
                        i++;
                }
 
-               /*
-                * I wonder how we prevent infinite bulk update.  IMO it can
-                * happen when sc_bulk_last state expires before we iterate
-                * through the whole list.
-                */
-               PF_STATE_ENTER_READ();
-               st_next = TAILQ_NEXT(st, entry_list);
-               pf_state_unref(st);
-               st = st_next;
-               if (st == NULL)
-                       st = TAILQ_FIRST(&state_list);
-               pf_state_ref(st);
-               PF_STATE_EXIT_READ();
-
+               st = TAILQ_NEXT(st, entry_list);
                if ((st == NULL) || (st == sc->sc_bulk_last)) {
                        /* we're done */
-                       pf_state_unref(sc->sc_bulk_last);
                        sc->sc_bulk_last = NULL;
                        pfsync_bulk_status(PFSYNC_BUS_END);
                        break;
@@ -2623,6 +2623,8 @@ pfsync_bulk_update(void *arg)
                        break;
                }
        }
+
+       rw_exit_read(&pf_state_list.pfs_rwl);
  out:
        NET_UNLOCK();
 }
@@ -2720,6 +2722,8 @@ pfsync_state_in_use(struct pf_state *st)
        if (sc == NULL)
                return (0);
 
+       rw_assert_wrlock(&pf_state_list.pfs_rwl);
+
        if (st->sync_state != PFSYNC_S_NONE ||
            st == sc->sc_bulk_next ||
            st == sc->sc_bulk_last)
diff --git a/sys/net/pf.c b/sys/net/pf.c
index 27f961a..672f415 100644
@@ -1,4 +1,4 @@
-/*     $OpenBSD: pf.c,v 1.1120 2021/06/23 05:51:27 dlg Exp $ */
+/*     $OpenBSD: pf.c,v 1.1121 2021/06/23 06:53:52 dlg Exp $ */
 
 /*
  * Copyright (c) 2001 Daniel Hartmeier
@@ -309,7 +309,7 @@ static __inline void pf_set_protostate(struct pf_state *, int, u_int8_t);
 struct pf_src_tree tree_src_tracking;
 
 struct pf_state_tree_id tree_id;
-struct pf_state_queue state_list;
+struct pf_state_list pf_state_list = PF_STATE_LIST_INITIALIZER(pf_state_list);
 
 RB_GENERATE(pf_src_tree, pf_src_node, entry, pf_src_compare);
 RB_GENERATE(pf_state_tree, pf_state_key, entry, pf_state_compare_key);
@@ -441,6 +441,37 @@ pf_check_threshold(struct pf_threshold *threshold)
        return (threshold->count > threshold->limit);
 }
 
+void
+pf_state_list_insert(struct pf_state_list *pfs, struct pf_state *st)
+{
+       /*
+        * we can always put states on the end of the list.
+        *
+        * things reading the list should take a read lock, then
+        * the mutex, get the head and tail pointers, release the
+        * mutex, and then they can iterate between the head and tail.
+        */
+
+       pf_state_ref(st); /* get a ref for the list */
+
+       mtx_enter(&pfs->pfs_mtx);
+       TAILQ_INSERT_TAIL(&pfs->pfs_list, st, entry_list);
+       mtx_leave(&pfs->pfs_mtx);
+}
+
+void
+pf_state_list_remove(struct pf_state_list *pfs, struct pf_state *st)
+{
+       /* states can only be removed when the write lock is held */
+       rw_assert_wrlock(&pfs->pfs_rwl);
+
+       mtx_enter(&pfs->pfs_mtx);
+       TAILQ_REMOVE(&pfs->pfs_list, st, entry_list);
+       mtx_leave(&pfs->pfs_mtx);
+
+       pf_state_unref(st); /* list no longer references the state */
+}
+
 int
 pf_src_connlimit(struct pf_state **state)
 {
@@ -987,7 +1018,7 @@ pf_state_insert(struct pfi_kif *kif, struct pf_state_key **skw,
                PF_STATE_EXIT_WRITE();
                return (-1);
        }
-       TAILQ_INSERT_TAIL(&state_list, s, entry_list);
+       pf_state_list_insert(&pf_state_list, s);
        pf_status.fcounters[FCNT_STATE_INSERT]++;
        pf_status.states++;
        pfi_kif_ref(kif, PFI_KIF_REF_STATE);
@@ -1248,7 +1279,8 @@ pf_purge_expired_rules(void)
 void
 pf_purge_timeout(void *unused)
 {
-       task_add(net_tq(0), &pf_purge_task);
+       /* XXX move to systqmp to avoid KERNEL_LOCK */
+       task_add(systq, &pf_purge_task);
 }
 
 void
@@ -1256,9 +1288,6 @@ pf_purge(void *xnloops)
 {
        int *nloops = xnloops;
 
-       KERNEL_LOCK();
-       NET_LOCK();
-
        /*
         * process a fraction of the state table every second
         * Note:
@@ -1269,6 +1298,8 @@ pf_purge(void *xnloops)
        pf_purge_expired_states(1 + (pf_status.states
            / pf_default_rule.timeout[PFTM_INTERVAL]));
 
+       NET_LOCK();
+
        PF_LOCK();
        /* purge other expired types every PFTM_INTERVAL seconds */
        if (++(*nloops) >= pf_default_rule.timeout[PFTM_INTERVAL]) {
@@ -1285,7 +1316,6 @@ pf_purge(void *xnloops)
                *nloops = 0;
        }
        NET_UNLOCK();
-       KERNEL_UNLOCK();
 
        timeout_add_sec(&pf_purge_to, 1);
 }
@@ -1463,7 +1493,7 @@ pf_free_state(struct pf_state *cur)
        }
        pf_normalize_tcp_cleanup(cur);
        pfi_kif_unref(cur->kif, PFI_KIF_REF_STATE);
-       TAILQ_REMOVE(&state_list, cur, entry_list);
+       pf_state_list_remove(&pf_state_list, cur);
        if (cur->tag)
                pf_tag_unref(cur->tag);
        pf_state_unref(cur);
@@ -1474,60 +1504,82 @@ pf_free_state(struct pf_state *cur)
 void
 pf_purge_expired_states(u_int32_t maxcheck)
 {
+       /*
+        * this task/thread/context/whatever is the only thing that
+        * removes states from the pf_state_list, so the cur reference
+        * it holds between calls is guaranteed to still be in the
+        * list.
+        */
        static struct pf_state  *cur = NULL;
-       struct pf_state         *next;
-       SLIST_HEAD(pf_state_gcl, pf_state) gcl;
+
+       struct pf_state         *head, *tail;
+       struct pf_state         *st;
+       SLIST_HEAD(pf_state_gcl, pf_state) gcl = SLIST_HEAD_INITIALIZER(gcl);
        time_t                   now;
 
        PF_ASSERT_UNLOCKED();
-       SLIST_INIT(&gcl);
+       rw_enter_read(&pf_state_list.pfs_rwl);
 
-       PF_STATE_ENTER_READ();
+       mtx_enter(&pf_state_list.pfs_mtx);
+       head = TAILQ_FIRST(&pf_state_list.pfs_list);
+       tail = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue);
+       mtx_leave(&pf_state_list.pfs_mtx);
 
-       now = getuptime();
+       if (head == NULL) {
+               /* the list is empty */
+               rw_exit_read(&pf_state_list.pfs_rwl);
+               return;
+       }
 
-       while (maxcheck--) {
-               uint8_t stimeout;
+       /* (re)start at the front of the list */
+       if (cur == NULL)
+               cur = head;
 
-               /* wrap to start of list when we hit the end */
-               if (cur == NULL) {
-                       cur = pf_state_ref(TAILQ_FIRST(&state_list));
-                       if (cur == NULL)
-                               break;  /* list empty */
-               }
+       now = getuptime();
 
-               /* get next state, as cur may get deleted */
-               next = TAILQ_NEXT(cur, entry_list);
+       do {
+               uint8_t stimeout = cur->timeout;
 
-               stimeout = cur->timeout;
                if ((stimeout == PFTM_UNLINKED) ||
-                   (pf_state_expires(cur, stimeout) <= now))
-                       SLIST_INSERT_HEAD(&gcl, cur, gc_list);
-               else
-                       pf_state_unref(cur);
-
-               cur = pf_state_ref(next);
+                   (pf_state_expires(cur, stimeout) <= now)) {
+                       st = pf_state_ref(cur);
+                       SLIST_INSERT_HEAD(&gcl, st, gc_list);
+               }
 
-               if (cur == NULL)
+               /* don't iterate past the end of our view of the list */
+               if (cur == tail) {
+                       cur = NULL;
                        break;
-       }
-       PF_STATE_EXIT_READ();
+               }
+               cur = TAILQ_NEXT(cur, entry_list);
+       } while (maxcheck--);
+
+       rw_exit_read(&pf_state_list.pfs_rwl);
 
+       if (SLIST_EMPTY(&gcl))
+               return;
+
+       NET_LOCK();
+       rw_enter_write(&pf_state_list.pfs_rwl);
        PF_LOCK();
        PF_STATE_ENTER_WRITE();
-       while ((next = SLIST_FIRST(&gcl)) != NULL) {
-               SLIST_REMOVE_HEAD(&gcl, gc_list);
-               if (next->timeout == PFTM_UNLINKED)
-                       pf_free_state(next);
-               else {
-                       pf_remove_state(next);
-                       pf_free_state(next);
-               }
-
-               pf_state_unref(next);
+       SLIST_FOREACH(st, &gcl, gc_list) {
+               if (st->timeout != PFTM_UNLINKED)
+                       pf_remove_state(st);
+               pf_free_state(st);
        }
        PF_STATE_EXIT_WRITE();
        PF_UNLOCK();
+       rw_exit_write(&pf_state_list.pfs_rwl);
+       NET_UNLOCK();
+
+       while ((st = SLIST_FIRST(&gcl)) != NULL) {
+               SLIST_REMOVE_HEAD(&gcl, gc_list);
+               pf_state_unref(st);
+       }
 }
 
 int
diff --git a/sys/net/pf_ioctl.c b/sys/net/pf_ioctl.c
index 2335fb0..d4893d7 100644
@@ -1,4 +1,4 @@
-/*     $OpenBSD: pf_ioctl.c,v 1.364 2021/06/02 07:46:22 dlg Exp $ */
+/*     $OpenBSD: pf_ioctl.c,v 1.365 2021/06/23 06:53:52 dlg Exp $ */
 
 /*
  * Copyright (c) 2001 Daniel Hartmeier
@@ -113,6 +113,8 @@ int                  pf_rule_copyin(struct pf_rule *, struct pf_rule *,
 u_int16_t               pf_qname2qid(char *, int);
 void                    pf_qid2qname(u_int16_t, char *);
 void                    pf_qid_unref(u_int16_t);
+int                     pf_states_clr(struct pfioc_state_kill *);
+int                     pf_states_get(struct pfioc_states *);
 
 struct pf_rule          pf_default_rule, pf_default_rule_new;
 
@@ -206,7 +208,6 @@ pfattach(int num)
        TAILQ_INIT(&pf_queues[1]);
        pf_queues_active = &pf_queues[0];
        pf_queues_inactive = &pf_queues[1];
-       TAILQ_INIT(&state_list);
 
        /* default rule should never be garbage collected */
        pf_default_rule.entries.tqe_prev = &pf_default_rule.entries.tqe_next;
@@ -902,6 +903,122 @@ pf_addr_copyout(struct pf_addr_wrap *addr)
        pf_rtlabel_copyout(addr);
 }
 
+int
+pf_states_clr(struct pfioc_state_kill *psk)
+{
+       struct pf_state         *s, *nexts;
+       struct pf_state         *head, *tail;
+       u_int                    killed = 0;
+       int                      error;
+
+       NET_LOCK();
+
+       /* lock against the gc removing an item from the list */
+       error = rw_enter(&pf_state_list.pfs_rwl, RW_READ|RW_INTR);
+       if (error != 0)
+               goto unlock;
+
+       /* get a snapshot view of the ends of the list to traverse between */
+       mtx_enter(&pf_state_list.pfs_mtx);
+       head = TAILQ_FIRST(&pf_state_list.pfs_list);
+       tail = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue);
+       mtx_leave(&pf_state_list.pfs_mtx);
+
+       s = NULL;
+       nexts = head;
+
+       PF_LOCK();
+       PF_STATE_ENTER_WRITE();
+
+       while (s != tail) {
+               s = nexts;
+               nexts = TAILQ_NEXT(s, entry_list);
+
+               if (s->timeout == PFTM_UNLINKED)
+                       continue;
+
+               if (!psk->psk_ifname[0] || !strcmp(psk->psk_ifname,
+                   s->kif->pfik_name)) {
+#if NPFSYNC > 0
+                       /* don't send out individual delete messages */
+                       SET(s->state_flags, PFSTATE_NOSYNC);
+#endif /* NPFSYNC > 0 */
+                       pf_remove_state(s);
+                       killed++;
+               }
+       }
+
+       PF_STATE_EXIT_WRITE();
+#if NPFSYNC > 0
+       pfsync_clear_states(pf_status.hostid, psk->psk_ifname);
+#endif /* NPFSYNC > 0 */
+       PF_UNLOCK();
+       rw_exit(&pf_state_list.pfs_rwl);
+
+       psk->psk_killed = killed;
+unlock:
+       NET_UNLOCK();
+
+       return (error);
+}
+
+int
+pf_states_get(struct pfioc_states *ps)
+{
+       struct pf_state         *head, *tail;
+       struct pf_state         *next, *state;
+       struct pfsync_state     *p, pstore;
+       u_int32_t                nr = 0;
+       int                      error;
+
+       if (ps->ps_len == 0) {
+               nr = pf_status.states;
+               ps->ps_len = sizeof(struct pfsync_state) * nr;
+               return (0);
+       }
+
+       p = ps->ps_states;
+
+       /* lock against the gc removing an item from the list */
+       error = rw_enter(&pf_state_list.pfs_rwl, RW_READ|RW_INTR);
+       if (error != 0)
+               return (error);
+
+       /* get a snapshot view of the ends of the list to traverse between */
+       mtx_enter(&pf_state_list.pfs_mtx);
+       head = TAILQ_FIRST(&pf_state_list.pfs_list);
+       tail = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue);
+       mtx_leave(&pf_state_list.pfs_mtx);
+
+       state = NULL;
+       next = head;
+
+       while (state != tail) {
+               state = next;
+               next = TAILQ_NEXT(state, entry_list);
+
+               if (state->timeout == PFTM_UNLINKED)
+                       continue;
+
+               if ((nr+1) * sizeof(*p) > ps->ps_len)
+                       break;
+
+               pf_state_export(&pstore, state);
+               error = copyout(&pstore, p, sizeof(*p));
+               if (error)
+                       goto fail;
+
+               p++;
+               nr++;
+       }
+       ps->ps_len = sizeof(struct pfsync_state) * nr;
+
+fail:
+       rw_exit(&pf_state_list.pfs_rwl);
+
+       return (error);
+}
+
 int
 pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p)
 {
@@ -1545,36 +1662,9 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p)
                break;
        }
 
-       case DIOCCLRSTATES: {
-               struct pf_state         *s, *nexts;
-               struct pfioc_state_kill *psk = (struct pfioc_state_kill *)addr;
-               u_int                    killed = 0;
-
-               NET_LOCK();
-               PF_LOCK();
-               PF_STATE_ENTER_WRITE();
-               for (s = RB_MIN(pf_state_tree_id, &tree_id); s; s = nexts) {
-                       nexts = RB_NEXT(pf_state_tree_id, &tree_id, s);
-
-                       if (!psk->psk_ifname[0] || !strcmp(psk->psk_ifname,
-                           s->kif->pfik_name)) {
-#if NPFSYNC > 0
-                               /* don't send out individual delete messages */
-                               SET(s->state_flags, PFSTATE_NOSYNC);
-#endif /* NPFSYNC > 0 */
-                               pf_remove_state(s);
-                               killed++;
-                       }
-               }
-               PF_STATE_EXIT_WRITE();
-               psk->psk_killed = killed;
-#if NPFSYNC > 0
-               pfsync_clear_states(pf_status.hostid, psk->psk_ifname);
-#endif /* NPFSYNC > 0 */
-               PF_UNLOCK();
-               NET_UNLOCK();
+       case DIOCCLRSTATES:
+               error = pf_states_clr((struct pfioc_state_kill *)addr);
                break;
-       }
 
        case DIOCKILLSTATES: {
                struct pf_state         *s, *nexts;
@@ -1757,50 +1847,9 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p)
                break;
        }
 
-       case DIOCGETSTATES: {
-               struct pfioc_states     *ps = (struct pfioc_states *)addr;
-               struct pf_state         *state;
-               struct pfsync_state     *p, *pstore;
-               u_int32_t                nr = 0;
-
-               if (ps->ps_len == 0) {
-                       nr = pf_status.states;
-                       ps->ps_len = sizeof(struct pfsync_state) * nr;
-                       break;
-               }
-
-               pstore = malloc(sizeof(*pstore), M_TEMP, M_WAITOK);
-
-               p = ps->ps_states;
-
-               NET_LOCK();
-               PF_STATE_ENTER_READ();
-               state = TAILQ_FIRST(&state_list);
-               while (state) {
-                       if (state->timeout != PFTM_UNLINKED) {
-                               if ((nr+1) * sizeof(*p) > ps->ps_len)
-                                       break;
-                               pf_state_export(pstore, state);
-                               error = copyout(pstore, p, sizeof(*p));
-                               if (error) {
-                                       free(pstore, M_TEMP, sizeof(*pstore));
-                                       PF_STATE_EXIT_READ();
-                                       NET_UNLOCK();
-                                       goto fail;
-                               }
-                               p++;
-                               nr++;
-                       }
-                       state = TAILQ_NEXT(state, entry_list);
-               }
-               PF_STATE_EXIT_READ();
-               NET_UNLOCK();
-
-               ps->ps_len = sizeof(struct pfsync_state) * nr;
-
-               free(pstore, M_TEMP, sizeof(*pstore));
+       case DIOCGETSTATES: 
+               error = pf_states_get((struct pfioc_states *)addr);
                break;
-       }
 
        case DIOCGETSTATUS: {
                struct pf_status *s = (struct pf_status *)addr;
diff --git a/sys/net/pfvar.h b/sys/net/pfvar.h
index 44a7f60..d2f2464 100644
@@ -1,4 +1,4 @@
-/*     $OpenBSD: pfvar.h,v 1.501 2021/06/23 04:16:32 dlg Exp $ */
+/*     $OpenBSD: pfvar.h,v 1.502 2021/06/23 06:53:52 dlg Exp $ */
 
 /*
  * Copyright (c) 2001 Daniel Hartmeier
@@ -1682,7 +1682,7 @@ RB_HEAD(pf_state_tree_id, pf_state);
 RB_PROTOTYPE(pf_state_tree_id, pf_state,
     entry_id, pf_state_compare_id);
 extern struct pf_state_tree_id tree_id;
-extern struct pf_state_queue state_list;
+extern struct pf_state_list pf_state_list;
 
 TAILQ_HEAD(pf_queuehead, pf_queuespec);
 extern struct pf_queuehead               pf_queues[2];
diff --git a/sys/net/pfvar_priv.h b/sys/net/pfvar_priv.h
index b1a122a..5bba87b 100644
@@ -1,4 +1,4 @@
-/*     $OpenBSD: pfvar_priv.h,v 1.6 2021/02/09 14:06:19 patrick Exp $  */
+/*     $OpenBSD: pfvar_priv.h,v 1.7 2021/06/23 06:53:52 dlg Exp $      */
 
 /*
  * Copyright (c) 2001 Daniel Hartmeier
 #ifdef _KERNEL
 
 #include <sys/rwlock.h>
+#include <sys/mutex.h>
+
+/*
+ *
+ * states are linked into a global list to support the following
+ * functionality:
+ *
+ * - garbage collection
+ * - pfsync bulk send operations
+ * - bulk state fetches via the DIOCGETSTATES ioctl
+ * - bulk state clearing via the DIOCCLRSTATES ioctl
+ * 
+ * a state is inserted into the global pf_state_list once it has also
+ * been successfully added to the various trees that make up the state
+ * table. states are only removed from the pf_state_list by the garbage
+ * collection process.
+ *
+ * the pf_state_list head and tail pointers (ie, the pfs_list TAILQ_HEAD
+ * structure) and the pointers between the entries on the pf_state_list
+ * are locked separately. at a high level, this allows for insertion
+ * of new states into the pf_state_list while other contexts (eg, the
+ * ioctls) are traversing the state items in the list. for garbage
+ * collection to remove items from the pf_state_list, it has to exclude
+ * both modifications to the list head and tail pointers, and traversal
+ * of the links between the states.
+ *
+ * the head and tail pointers are protected by a mutex. the pointers
+ * between states are protected by an rwlock.
+ *
+ * because insertions are only made to the end of the list, if we get
+ * a snapshot of the head and tail of the list and prevent modifications
+ * to the links between states, we can safely traverse between the
+ * head and tail entries. subsequent insertions can add entries after
+ * our view of the tail, but we don't look past our view.
+ *
+ * if both locks must be taken, the rwlock protecting the links between
+ * states is taken before the mutex protecting the head and tail
+ * pointer.
+ *
+ * insertion into the list follows this pattern:
+ *
+ *     // serialise list head/tail modifications
+ *     mtx_enter(&pf_state_list.pfs_mtx);
+ *     TAILQ_INSERT_TAIL(&pf_state_list.pfs_list, state, entry_list);
+ *     mtx_leave(&pf_state_list.pfs_mtx);
+ *
+ * traversal of the list:
+ *
+ *     // lock against the gc removing an item from the list
+ *     rw_enter_read(&pf_state_list.pfs_rwl);
+ *
+ *     // get a snapshot view of the ends of the list
+ *     mtx_enter(&pf_state_list.pfs_mtx);
+ *     head = TAILQ_FIRST(&pf_state_list.pfs_list);
+ *     tail = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue);
+ *     mtx_leave(&pf_state_list.pfs_mtx);
+ *
+ *     state = NULL;
+ *     next = head;
+ *
+ *     while (state != tail) {
+ *             state = next;
+ *             next = TAILQ_NEXT(state, entry_list);
+ *
+ *             // look at the state
+ *     }
+ *
+ *     rw_exit_read(&pf_state_list.pfs_rwl);
+ *
+ * removing an item from the list:
+ * 
+ *     // wait for iterators (readers) to get out
+ *     rw_enter_write(&pf_state_list.pfs_rwl);
+ *
+ *     // serialise list head/tail modifications
+ *     mtx_enter(&pf_state_list.pfs_mtx);
+ *     TAILQ_REMOVE(&pf_state_list.pfs_list, state, entry_list);
+ *     mtx_leave(&pf_state_list.pfs_mtx);
+ *
+ *     rw_exit_write(&pf_state_list.pfs_rwl);
+ *
+ * the lock ordering for pf_state_list locks and the rest of the pf
+ * locks are:
+ *
+ * 1. KERNEL_LOCK
+ * 2. NET_LOCK
+ * 3. pf_state_list.pfs_rwl
+ * 4. PF_LOCK
+ * 5. PF_STATE_LOCK
+ * 6. pf_state_list.pfs_mtx
+ */
+
+struct pf_state_list {
+       /* the list of states in the system */
+       struct pf_state_queue           pfs_list;
+
+       /* serialise pfs_list head/tail access */
+       struct mutex                    pfs_mtx;
+
+       /* serialise access to pointers between pfs_list entries */
+       struct rwlock                   pfs_rwl;
+};
+
+#define PF_STATE_LIST_INITIALIZER(_pfs) {                              \
+       .pfs_list       = TAILQ_HEAD_INITIALIZER(_pfs.pfs_list),        \
+       .pfs_mtx        = MUTEX_INITIALIZER(IPL_SOFTNET),               \
+       .pfs_rwl        = RWLOCK_INITIALIZER("pfstates"),               \
+}
 
 extern struct rwlock pf_lock;