From 3f1b5c31154a0e33fd002641cfd038dc6b15129f Mon Sep 17 00:00:00 2001 From: dlg Date: Wed, 23 Jun 2021 06:53:51 +0000 Subject: [PATCH] augment the global pf state list with its own locks. before this, things that iterated over the global list of pf states had to take the net, pf, or pf state locks. in particular, the ioctls that dump the state table took the net and pf state locks before iterating over the states and using copyout to export them to userland. when we tried replacing the use of rwlocks with mutexes under the pf locks, this blew up because you can't sleep when holding a mutex and there's a sleeping lock used inside copyout. this diff introduces two locks around the global state list: a mutex that protects the head and tail of the list, and an rwlock that protects the links between elements in the list. inserts on the state list only occur during packet handling and can be done by taking the mutex and putting the state on the tail before releasing the mutex. iterating over states is only done from thread/process contexts, so we can take a read lock, then the mutex to get a snapshot of the head and tail pointers, and then keep the read lock to iterate between the head and tail pointers. because it's a read lock we can then take other sleeping locks (eg, the one inside copyout) without (further) gymnastics. the pf state purge code takes the rwlock exclusively and the mutex to remove elements from the list. this allows the ioctls and purge code to loop over the list concurrently and largely without blocking the creation of states when pf is processing packets. pfsync also iterates over the state list when doing bulk sends, which the state purge code needs to be careful around. ok sashan@ --- sys/net/if_pfsync.c | 58 +++++++------ sys/net/pf.c | 140 ++++++++++++++++++++---------- sys/net/pf_ioctl.c | 197 +++++++++++++++++++++++++++---------------- sys/net/pfvar.h | 4 +- sys/net/pfvar_priv.h | 110 +++++++++++++++++++++++- 5 files changed, 361 insertions(+), 148 deletions(-) diff --git a/sys/net/if_pfsync.c b/sys/net/if_pfsync.c index cc8fa3953fa..eaaaa26f19d 100644 --- a/sys/net/if_pfsync.c +++ b/sys/net/if_pfsync.c @@ -1,4 +1,4 @@ -/* $OpenBSD: if_pfsync.c,v 1.294 2021/06/23 05:43:53 dlg Exp $ */ +/* $OpenBSD: if_pfsync.c,v 1.295 2021/06/23 06:53:51 dlg Exp $ */ /* * Copyright (c) 2002 Michael Shalayeff @@ -2550,22 +2550,34 @@ pfsync_bulk_start(void) { struct pfsync_softc *sc = pfsyncif; + NET_ASSERT_LOCKED(); + + /* + * pf gc via pfsync_state_in_use reads sc_bulk_next and + * sc_bulk_last while exclusively holding the pf_state_list + * rwlock. make sure it can't race with us setting these + * pointers. they basically act as hazards, and borrow the + * lists state reference count. 
+ */ + rw_enter_read(&pf_state_list.pfs_rwl); + + /* get a consistent view of the list pointers */ + mtx_enter(&pf_state_list.pfs_mtx); + if (sc->sc_bulk_next == NULL) + sc->sc_bulk_next = TAILQ_FIRST(&pf_state_list.pfs_list); + + sc->sc_bulk_last = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue); + mtx_leave(&pf_state_list.pfs_mtx); + + rw_exit_read(&pf_state_list.pfs_rwl); + DPFPRINTF(LOG_INFO, "received bulk update request"); - if (TAILQ_EMPTY(&state_list)) + if (sc->sc_bulk_last == NULL) pfsync_bulk_status(PFSYNC_BUS_END); else { sc->sc_ureq_received = getuptime(); - if (sc->sc_bulk_next == NULL) { - PF_STATE_ENTER_READ(); - sc->sc_bulk_next = TAILQ_FIRST(&state_list); - pf_state_ref(sc->sc_bulk_next); - PF_STATE_EXIT_READ(); - } - sc->sc_bulk_last = sc->sc_bulk_next; - pf_state_ref(sc->sc_bulk_last); - pfsync_bulk_status(PFSYNC_BUS_START); timeout_add(&sc->sc_bulk_tmo, 0); } @@ -2575,13 +2587,15 @@ void pfsync_bulk_update(void *arg) { struct pfsync_softc *sc; - struct pf_state *st, *st_next; + struct pf_state *st; int i = 0; NET_LOCK(); sc = pfsyncif; if (sc == NULL) goto out; + + rw_enter_read(&pf_state_list.pfs_rwl); st = sc->sc_bulk_next; sc->sc_bulk_next = NULL; @@ -2593,23 +2607,9 @@ pfsync_bulk_update(void *arg) i++; } - /* - * I wonder how we prevent infinite bulk update. IMO it can - * happen when sc_bulk_last state expires before we iterate - * through the whole list. - */ - PF_STATE_ENTER_READ(); - st_next = TAILQ_NEXT(st, entry_list); - pf_state_unref(st); - st = st_next; - if (st == NULL) - st = TAILQ_FIRST(&state_list); - pf_state_ref(st); - PF_STATE_EXIT_READ(); - + st = TAILQ_NEXT(st, entry_list); if ((st == NULL) || (st == sc->sc_bulk_last)) { /* we're done */ - pf_state_unref(sc->sc_bulk_last); sc->sc_bulk_last = NULL; pfsync_bulk_status(PFSYNC_BUS_END); break; @@ -2623,6 +2623,8 @@ pfsync_bulk_update(void *arg) break; } } + + rw_exit_read(&pf_state_list.pfs_rwl); out: NET_UNLOCK(); } @@ -2720,6 +2722,8 @@ pfsync_state_in_use(struct pf_state *st) if (sc == NULL) return (0); + rw_assert_wrlock(&pf_state_list.pfs_rwl); + if (st->sync_state != PFSYNC_S_NONE || st == sc->sc_bulk_next || st == sc->sc_bulk_last) diff --git a/sys/net/pf.c b/sys/net/pf.c index 27f961a16e3..672f4159719 100644 --- a/sys/net/pf.c +++ b/sys/net/pf.c @@ -1,4 +1,4 @@ -/* $OpenBSD: pf.c,v 1.1120 2021/06/23 05:51:27 dlg Exp $ */ +/* $OpenBSD: pf.c,v 1.1121 2021/06/23 06:53:52 dlg Exp $ */ /* * Copyright (c) 2001 Daniel Hartmeier @@ -309,7 +309,7 @@ static __inline void pf_set_protostate(struct pf_state *, int, u_int8_t); struct pf_src_tree tree_src_tracking; struct pf_state_tree_id tree_id; -struct pf_state_queue state_list; +struct pf_state_list pf_state_list = PF_STATE_LIST_INITIALIZER(pf_state_list); RB_GENERATE(pf_src_tree, pf_src_node, entry, pf_src_compare); RB_GENERATE(pf_state_tree, pf_state_key, entry, pf_state_compare_key); @@ -441,6 +441,37 @@ pf_check_threshold(struct pf_threshold *threshold) return (threshold->count > threshold->limit); } +void +pf_state_list_insert(struct pf_state_list *pfs, struct pf_state *st) +{ + /* + * we can always put states on the end of the list. + * + * things reading the list should take a read lock, then + * the mutex, get the head and tail pointers, release the + * mutex, and then they can iterate between the head and tail. 
+ */ + + pf_state_ref(st); /* get a ref for the list */ + + mtx_enter(&pfs->pfs_mtx); + TAILQ_INSERT_TAIL(&pfs->pfs_list, st, entry_list); + mtx_leave(&pfs->pfs_mtx); +} + +void +pf_state_list_remove(struct pf_state_list *pfs, struct pf_state *st) +{ + /* states can only be removed when the write lock is held */ + rw_assert_wrlock(&pfs->pfs_rwl); + + mtx_enter(&pfs->pfs_mtx); + TAILQ_REMOVE(&pfs->pfs_list, st, entry_list); + mtx_leave(&pfs->pfs_mtx); + + pf_state_unref(st); /* list no longer references the state */ +} + int pf_src_connlimit(struct pf_state **state) { @@ -987,7 +1018,7 @@ pf_state_insert(struct pfi_kif *kif, struct pf_state_key **skw, PF_STATE_EXIT_WRITE(); return (-1); } - TAILQ_INSERT_TAIL(&state_list, s, entry_list); + pf_state_list_insert(&pf_state_list, s); pf_status.fcounters[FCNT_STATE_INSERT]++; pf_status.states++; pfi_kif_ref(kif, PFI_KIF_REF_STATE); @@ -1248,7 +1279,8 @@ pf_purge_expired_rules(void) void pf_purge_timeout(void *unused) { - task_add(net_tq(0), &pf_purge_task); + /* XXX move to systqmp to avoid KERNEL_LOCK */ + task_add(systq, &pf_purge_task); } void @@ -1256,9 +1288,6 @@ pf_purge(void *xnloops) { int *nloops = xnloops; - KERNEL_LOCK(); - NET_LOCK(); - /* * process a fraction of the state table every second * Note: @@ -1269,6 +1298,8 @@ pf_purge(void *xnloops) pf_purge_expired_states(1 + (pf_status.states / pf_default_rule.timeout[PFTM_INTERVAL])); + NET_LOCK(); + PF_LOCK(); /* purge other expired types every PFTM_INTERVAL seconds */ if (++(*nloops) >= pf_default_rule.timeout[PFTM_INTERVAL]) { @@ -1285,7 +1316,6 @@ pf_purge(void *xnloops) *nloops = 0; } NET_UNLOCK(); - KERNEL_UNLOCK(); timeout_add_sec(&pf_purge_to, 1); } @@ -1463,7 +1493,7 @@ pf_free_state(struct pf_state *cur) } pf_normalize_tcp_cleanup(cur); pfi_kif_unref(cur->kif, PFI_KIF_REF_STATE); - TAILQ_REMOVE(&state_list, cur, entry_list); + pf_state_list_remove(&pf_state_list, cur); if (cur->tag) pf_tag_unref(cur->tag); pf_state_unref(cur); @@ -1474,60 +1504,82 @@ pf_free_state(struct pf_state *cur) void pf_purge_expired_states(u_int32_t maxcheck) { + /* + * this task/thread/context/whatever is the only thing that + * removes states from the pf_state_list, so the cur reference + * it holds between calls is guaranteed to still be in the + * list. 
+ */ static struct pf_state *cur = NULL; - struct pf_state *next; - SLIST_HEAD(pf_state_gcl, pf_state) gcl; + + struct pf_state *head, *tail; + struct pf_state *st; + SLIST_HEAD(pf_state_gcl, pf_state) gcl = SLIST_HEAD_INITIALIZER(gcl); time_t now; PF_ASSERT_UNLOCKED(); - SLIST_INIT(&gcl); + + rw_enter_read(&pf_state_list.pfs_rwl); - PF_STATE_ENTER_READ(); + mtx_enter(&pf_state_list.pfs_mtx); + head = TAILQ_FIRST(&pf_state_list.pfs_list); + tail = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue); + mtx_leave(&pf_state_list.pfs_mtx); - now = getuptime(); + if (head == NULL) { + /* the list is empty */ + rw_exit_read(&pf_state_list.pfs_rwl); + return; + } - while (maxcheck--) { - uint8_t stimeout; + /* (re)start at the front of the list */ + if (cur == NULL) + cur = head; - /* wrap to start of list when we hit the end */ - if (cur == NULL) { - cur = pf_state_ref(TAILQ_FIRST(&state_list)); - if (cur == NULL) - break; /* list empty */ - } + now = getuptime(); - /* get next state, as cur may get deleted */ - next = TAILQ_NEXT(cur, entry_list); + do { + uint8_t stimeout = cur->timeout; - stimeout = cur->timeout; if ((stimeout == PFTM_UNLINKED) || - (pf_state_expires(cur, stimeout) <= now)) - SLIST_INSERT_HEAD(&gcl, cur, gc_list); - else - pf_state_unref(cur); - - cur = pf_state_ref(next); + (pf_state_expires(cur, stimeout) <= now)) { + st = pf_state_ref(cur); + SLIST_INSERT_HEAD(&gcl, st, gc_list); + } - if (cur == NULL) + /* don't iterate past the end of our view of the list */ + if (cur == tail) { + cur = NULL; break; - } - PF_STATE_EXIT_READ(); + } + + cur = TAILQ_NEXT(cur, entry_list); + } while (maxcheck--); + + rw_exit_read(&pf_state_list.pfs_rwl); + if (SLIST_EMPTY(&gcl)) + return; + + NET_LOCK(); + rw_enter_write(&pf_state_list.pfs_rwl); PF_LOCK(); PF_STATE_ENTER_WRITE(); - while ((next = SLIST_FIRST(&gcl)) != NULL) { - SLIST_REMOVE_HEAD(&gcl, gc_list); - if (next->timeout == PFTM_UNLINKED) - pf_free_state(next); - else { - pf_remove_state(next); - pf_free_state(next); - } - - pf_state_unref(next); + SLIST_FOREACH(st, &gcl, gc_list) { + if (st->timeout != PFTM_UNLINKED) + pf_remove_state(st); + + pf_free_state(st); } PF_STATE_EXIT_WRITE(); PF_UNLOCK(); + rw_exit_write(&pf_state_list.pfs_rwl); + NET_UNLOCK(); + + while ((st = SLIST_FIRST(&gcl)) != NULL) { + SLIST_REMOVE_HEAD(&gcl, gc_list); + pf_state_unref(st); + } } int diff --git a/sys/net/pf_ioctl.c b/sys/net/pf_ioctl.c index 2335fb04679..d4893d70d0f 100644 --- a/sys/net/pf_ioctl.c +++ b/sys/net/pf_ioctl.c @@ -1,4 +1,4 @@ -/* $OpenBSD: pf_ioctl.c,v 1.364 2021/06/02 07:46:22 dlg Exp $ */ +/* $OpenBSD: pf_ioctl.c,v 1.365 2021/06/23 06:53:52 dlg Exp $ */ /* * Copyright (c) 2001 Daniel Hartmeier @@ -113,6 +113,8 @@ int pf_rule_copyin(struct pf_rule *, struct pf_rule *, u_int16_t pf_qname2qid(char *, int); void pf_qid2qname(u_int16_t, char *); void pf_qid_unref(u_int16_t); +int pf_states_clr(struct pfioc_state_kill *); +int pf_states_get(struct pfioc_states *); struct pf_rule pf_default_rule, pf_default_rule_new; @@ -206,7 +208,6 @@ pfattach(int num) TAILQ_INIT(&pf_queues[1]); pf_queues_active = &pf_queues[0]; pf_queues_inactive = &pf_queues[1]; - TAILQ_INIT(&state_list); /* default rule should never be garbage collected */ pf_default_rule.entries.tqe_prev = &pf_default_rule.entries.tqe_next; @@ -902,6 +903,122 @@ pf_addr_copyout(struct pf_addr_wrap *addr) pf_rtlabel_copyout(addr); } +int +pf_states_clr(struct pfioc_state_kill *psk) +{ + struct pf_state *s, *nexts; + struct pf_state *head, *tail; + u_int killed = 0; + int error; + + 
NET_LOCK(); + + /* lock against the gc removing an item from the list */ + error = rw_enter(&pf_state_list.pfs_rwl, RW_READ|RW_INTR); + if (error != 0) + goto unlock; + + /* get a snapshot view of the ends of the list to traverse between */ + mtx_enter(&pf_state_list.pfs_mtx); + head = TAILQ_FIRST(&pf_state_list.pfs_list); + tail = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue); + mtx_leave(&pf_state_list.pfs_mtx); + + s = NULL; + nexts = head; + + PF_LOCK(); + PF_STATE_ENTER_WRITE(); + + while (s != tail) { + s = nexts; + nexts = TAILQ_NEXT(s, entry_list); + + if (s->timeout == PFTM_UNLINKED) + continue; + + if (!psk->psk_ifname[0] || !strcmp(psk->psk_ifname, + s->kif->pfik_name)) { +#if NPFSYNC > 0 + /* don't send out individual delete messages */ + SET(s->state_flags, PFSTATE_NOSYNC); +#endif /* NPFSYNC > 0 */ + pf_remove_state(s); + killed++; + } + } + + PF_STATE_EXIT_WRITE(); +#if NPFSYNC > 0 + pfsync_clear_states(pf_status.hostid, psk->psk_ifname); +#endif /* NPFSYNC > 0 */ + PF_UNLOCK(); + rw_exit(&pf_state_list.pfs_rwl); + + psk->psk_killed = killed; +unlock: + NET_UNLOCK(); + + return (error); +} + +int +pf_states_get(struct pfioc_states *ps) +{ + struct pf_state *head, *tail; + struct pf_state *next, *state; + struct pfsync_state *p, pstore; + u_int32_t nr = 0; + int error; + + if (ps->ps_len == 0) { + nr = pf_status.states; + ps->ps_len = sizeof(struct pfsync_state) * nr; + return (0); + } + + p = ps->ps_states; + + /* lock against the gc removing an item from the list */ + error = rw_enter(&pf_state_list.pfs_rwl, RW_READ|RW_INTR); + if (error != 0) + return (error); + + /* get a snapshot view of the ends of the list to traverse between */ + mtx_enter(&pf_state_list.pfs_mtx); + head = TAILQ_FIRST(&pf_state_list.pfs_list); + tail = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue); + mtx_leave(&pf_state_list.pfs_mtx); + + state = NULL; + next = head; + + while (state != tail) { + state = next; + next = TAILQ_NEXT(state, entry_list); + + if (state->timeout == PFTM_UNLINKED) + continue; + + if ((nr+1) * sizeof(*p) > ps->ps_len) + break; + + pf_state_export(&pstore, state); + error = copyout(&pstore, p, sizeof(*p)); + if (error) + goto fail; + + p++; + nr++; + } + ps->ps_len = sizeof(struct pfsync_state) * nr; + +fail: + rw_exit(&pf_state_list.pfs_rwl); + + return (error); +} + int pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) { @@ -1545,36 +1662,9 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) break; } - case DIOCCLRSTATES: { - struct pf_state *s, *nexts; - struct pfioc_state_kill *psk = (struct pfioc_state_kill *)addr; - u_int killed = 0; - - NET_LOCK(); - PF_LOCK(); - PF_STATE_ENTER_WRITE(); - for (s = RB_MIN(pf_state_tree_id, &tree_id); s; s = nexts) { - nexts = RB_NEXT(pf_state_tree_id, &tree_id, s); - - if (!psk->psk_ifname[0] || !strcmp(psk->psk_ifname, - s->kif->pfik_name)) { -#if NPFSYNC > 0 - /* don't send out individual delete messages */ - SET(s->state_flags, PFSTATE_NOSYNC); -#endif /* NPFSYNC > 0 */ - pf_remove_state(s); - killed++; - } - } - PF_STATE_EXIT_WRITE(); - psk->psk_killed = killed; -#if NPFSYNC > 0 - pfsync_clear_states(pf_status.hostid, psk->psk_ifname); -#endif /* NPFSYNC > 0 */ - PF_UNLOCK(); - NET_UNLOCK(); + case DIOCCLRSTATES: + error = pf_states_clr((struct pfioc_state_kill *)addr); break; - } case DIOCKILLSTATES: { struct pf_state *s, *nexts; @@ -1757,50 +1847,9 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) break; } - case DIOCGETSTATES: { - struct pfioc_states 
*ps = (struct pfioc_states *)addr; - struct pf_state *state; - struct pfsync_state *p, *pstore; - u_int32_t nr = 0; - - if (ps->ps_len == 0) { - nr = pf_status.states; - ps->ps_len = sizeof(struct pfsync_state) * nr; - break; - } - - pstore = malloc(sizeof(*pstore), M_TEMP, M_WAITOK); - - p = ps->ps_states; - - NET_LOCK(); - PF_STATE_ENTER_READ(); - state = TAILQ_FIRST(&state_list); - while (state) { - if (state->timeout != PFTM_UNLINKED) { - if ((nr+1) * sizeof(*p) > ps->ps_len) - break; - pf_state_export(pstore, state); - error = copyout(pstore, p, sizeof(*p)); - if (error) { - free(pstore, M_TEMP, sizeof(*pstore)); - PF_STATE_EXIT_READ(); - NET_UNLOCK(); - goto fail; - } - p++; - nr++; - } - state = TAILQ_NEXT(state, entry_list); - } - PF_STATE_EXIT_READ(); - NET_UNLOCK(); - - ps->ps_len = sizeof(struct pfsync_state) * nr; - - free(pstore, M_TEMP, sizeof(*pstore)); + case DIOCGETSTATES: + error = pf_states_get((struct pfioc_states *)addr); break; - } case DIOCGETSTATUS: { struct pf_status *s = (struct pf_status *)addr; diff --git a/sys/net/pfvar.h b/sys/net/pfvar.h index 44a7f60328b..d2f2464bc25 100644 --- a/sys/net/pfvar.h +++ b/sys/net/pfvar.h @@ -1,4 +1,4 @@ -/* $OpenBSD: pfvar.h,v 1.501 2021/06/23 04:16:32 dlg Exp $ */ +/* $OpenBSD: pfvar.h,v 1.502 2021/06/23 06:53:52 dlg Exp $ */ /* * Copyright (c) 2001 Daniel Hartmeier @@ -1682,7 +1682,7 @@ RB_HEAD(pf_state_tree_id, pf_state); RB_PROTOTYPE(pf_state_tree_id, pf_state, entry_id, pf_state_compare_id); extern struct pf_state_tree_id tree_id; -extern struct pf_state_queue state_list; +extern struct pf_state_list pf_state_list; TAILQ_HEAD(pf_queuehead, pf_queuespec); extern struct pf_queuehead pf_queues[2]; diff --git a/sys/net/pfvar_priv.h b/sys/net/pfvar_priv.h index b1a122af409..5bba87b19a8 100644 --- a/sys/net/pfvar_priv.h +++ b/sys/net/pfvar_priv.h @@ -1,4 +1,4 @@ -/* $OpenBSD: pfvar_priv.h,v 1.6 2021/02/09 14:06:19 patrick Exp $ */ +/* $OpenBSD: pfvar_priv.h,v 1.7 2021/06/23 06:53:52 dlg Exp $ */ /* * Copyright (c) 2001 Daniel Hartmeier @@ -38,6 +38,114 @@ #ifdef _KERNEL #include +#include + +/* + * + * states are linked into a global list to support the following + * functionality: + * + * - garbage collection + * - pfsync bulk send operations + * - bulk state fetches via the DIOCGETSTATES ioctl + * - bulk state clearing via the DIOCCLRSTATES ioctl + * + * states are inserted into the global pf_state_list once it has also + * been successfully added to the various trees that make up the state + * table. states are only removed from the pf_state_list by the garbage + * collection process. + + * the pf_state_list head and tail pointers (ie, the pfs_list TAILQ_HEAD + * structure) and the pointers between the entries on the pf_state_list + * are locked separately. at a high level, this allows for insertion + * of new states into the pf_state_list while other contexts (eg, the + * ioctls) are traversing the state items in the list. for garbage + * collection to remove items from the pf_state_list, it has to exclude + * both modifications to the list head and tail pointers, and traversal + * of the links between the states. + * + * the head and tail pointers are protected by a mutex. the pointers + * between states are protected by an rwlock. + * + * because insertions are only made to the end of the list, if we get + * a snapshot of the head and tail of the list and prevent modifications + * to the links between states, we can safely traverse between the + * head and tail entries. 
subsequent insertions can add entries after + * our view of the tail, but we don't look past our view. + * + * if both locks must be taken, the rwlock protecting the links between + * states is taken before the mutex protecting the head and tail + * pointer. + * + * insertion into the list follows this pattern: + * + * // serialise list head/tail modifications + * mtx_enter(&pf_state_list.pfs_mtx); + * TAILQ_INSERT_TAIL(&pf_state_list.pfs_list, state, entry_list); + * mtx_leave(&pf_state_list.pfs_mtx); + * + * traversal of the list: + * + * // lock against the gc removing an item from the list + * rw_enter_read(&pf_state_list.pfs_rwl); + * + * // get a snapshot view of the ends of the list + * mtx_enter(&pf_state_list.pfs_mtx); + * head = TAILQ_FIRST(&pf_state_list.pfs_list); + * tail = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue); + * mtx_leave(&pf_state_list.pfs_mtx); + * + * state = NULL; + * next = head; + * + * while (state != tail) { + * state = next; + * next = TAILQ_NEXT(state, entry_list); + * + * // look at the state + * } + * + * rw_exit_read(&pf_state_list.pfs_rwl); + * + * removing an item from the list: + * + * // wait for iterators (readers) to get out + * rw_enter_write(&pf_state_list.pfs_rwl); + * + * // serialise list head/tail modifications + * mtx_enter(&pf_state_list.pfs_mtx); + * TAILQ_REMOVE(&pf_state_list.pfs_list, state, entry_list); + * mtx_leave(&pf_state_list.pfs_mtx); + * + * rw_exit_write(&pf_state_list.pfs_rwl); + * + * the lock ordering for pf_state_list locks and the rest of the pf + * locks are: + * + * 1. KERNEL_LOCK + * 2. NET_LOCK + * 3. pf_state_list.pfs_rwl + * 4. PF_LOCK + * 5. PF_STATE_LOCK + * 6. pf_state_list.pfs_mtx + */ + +struct pf_state_list { + /* the list of states in the system */ + struct pf_state_queue pfs_list; + + /* serialise pfs_list head/tail access */ + struct mutex pfs_mtx; + + /* serialise access to pointers between pfs_list entries */ + struct rwlock pfs_rwl; +}; + +#define PF_STATE_LIST_INITIALIZER(_pfs) { \ + .pfs_list = TAILQ_HEAD_INITIALIZER(_pfs.pfs_list), \ + .pfs_mtx = MUTEX_INITIALIZER(IPL_SOFTNET), \ + .pfs_rwl = RWLOCK_INITIALIZER("pfstates"), \ +} extern struct rwlock pf_lock; -- 2.20.1
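
The sketch below is a small userspace analogue of the locking scheme described in the commit message and in the pfvar_priv.h comment above. It is not part of the patch: the demo_* names are invented for illustration, pthread primitives stand in for the kernel's mutex(9) and rwlock(9), and the per-state reference counting is left out. It only shows the split between the mutex that guards the list head and tail and the rwlock that guards the links between entries, assuming a BSD-style <sys/queue.h>.

/*
 * Userspace sketch of the pf_state_list locking pattern above.  The
 * demo_* names are invented; pthread locks stand in for the kernel's
 * mutex(9) and rwlock(9).  Assumes a BSD-style <sys/queue.h>.
 */
#include <sys/queue.h>

#include <pthread.h>
#include <stdio.h>

struct demo_node {
	TAILQ_ENTRY(demo_node)	entry;
	int			value;
};

TAILQ_HEAD(demo_queue, demo_node);

struct demo_list {
	struct demo_queue	dl_list;	/* the nodes themselves */
	pthread_mutex_t		dl_mtx;		/* serialises head/tail access */
	pthread_rwlock_t	dl_rwl;		/* serialises links between nodes */
};

static struct demo_list demo = {
	.dl_list = TAILQ_HEAD_INITIALIZER(demo.dl_list),
	.dl_mtx = PTHREAD_MUTEX_INITIALIZER,
	.dl_rwl = PTHREAD_RWLOCK_INITIALIZER,
};

/* insertion only needs the mutex; new nodes always go on the tail */
void
demo_insert(struct demo_node *n)
{
	pthread_mutex_lock(&demo.dl_mtx);
	TAILQ_INSERT_TAIL(&demo.dl_list, n, entry);
	pthread_mutex_unlock(&demo.dl_mtx);
}

/*
 * traversal: hold the rwlock shared so nodes cannot be unlinked, take
 * a snapshot of the ends under the mutex, then walk between them.
 * blocking (eg, copyout in the real code) is safe during the walk.
 */
void
demo_walk(void)
{
	struct demo_node *head, *tail, *n, *next;

	pthread_rwlock_rdlock(&demo.dl_rwl);

	pthread_mutex_lock(&demo.dl_mtx);
	head = TAILQ_FIRST(&demo.dl_list);
	tail = TAILQ_LAST(&demo.dl_list, demo_queue);
	pthread_mutex_unlock(&demo.dl_mtx);

	n = NULL;
	next = head;	/* NULL when the snapshot is empty */

	while (n != tail) {
		n = next;
		next = TAILQ_NEXT(n, entry);

		printf("%d\n", n->value);
	}

	pthread_rwlock_unlock(&demo.dl_rwl);
}

/* removal excludes walkers with the write lock before taking the mutex */
void
demo_remove(struct demo_node *n)
{
	pthread_rwlock_wrlock(&demo.dl_rwl);
	pthread_mutex_lock(&demo.dl_mtx);
	TAILQ_REMOVE(&demo.dl_list, n, entry);
	pthread_mutex_unlock(&demo.dl_mtx);
	pthread_rwlock_unlock(&demo.dl_rwl);
}

int
main(void)
{
	static struct demo_node nodes[4];
	struct demo_node *n;
	int i;

	for (i = 0; i < 4; i++) {
		nodes[i].value = i;
		demo_insert(&nodes[i]);
	}

	demo_walk();

	while ((n = TAILQ_FIRST(&demo.dl_list)) != NULL)
		demo_remove(n);

	return (0);
}

demo_insert(), demo_walk() and demo_remove() correspond roughly to pf_state_list_insert(), the traversal loop in pf_states_get() and pf_state_list_remove() in the patch. Because walkers only hold the rwlock shared, they can block (as copyout does in the kernel) without holding up new insertions, which only need the mutex.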