move pf_purge out from under the kernel lock and avoid the hogging cpu
authordlg <dlg@openbsd.org>
Mon, 7 Nov 2022 12:56:38 +0000 (12:56 +0000)
committerdlg <dlg@openbsd.org>
Mon, 7 Nov 2022 12:56:38 +0000 (12:56 +0000)
this also avoids holding NET_LOCK too long.

the main change is done by running the purge tasks in systqmp instead
of systq. the pf state list was recently reworked so iteration over
the states can be done without blocking insertions.

however, scanning a lot of states can still take a lot of time, so
this also makes the state list scanner yield if it has spent too
much time running.

the other purge tasks for source nodes, rules, and fragments have
been moved to their own timeout/task pair to simplify the time
accounting.

in my environment, before this change pf purges often took 10 to
50ms. the softclock thread that runs next to it often took a similar
amount of time, presumably because they ended up spinning waiting
for each other. after this change the pf_purges are more like 6 to
12ms, and don't block softclock. most of the variability in the runs
now seems to come from contention on the net lock.

tested by me sthen@ chris@
ok sashan@ kn@ claudio@

sys/net/pf.c
sys/net/pf_ioctl.c
sys/net/pfvar.h
sys/net/pfvar_priv.h

index 62108ab..2c6124e 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: pf.c,v 1.1142 2022/11/06 18:05:05 dlg Exp $ */
+/*     $OpenBSD: pf.c,v 1.1143 2022/11/07 12:56:38 dlg Exp $ */
 
 /*
  * Copyright (c) 2001 Daniel Hartmeier
@@ -120,10 +120,6 @@ u_char                      pf_tcp_secret[16];
 int                     pf_tcp_secret_init;
 int                     pf_tcp_iss_off;
 
-int             pf_npurge;
-struct task     pf_purge_task = TASK_INITIALIZER(pf_purge, &pf_npurge);
-struct timeout  pf_purge_to = TIMEOUT_INITIALIZER(pf_purge_timeout, NULL);
-
 enum pf_test_status {
        PF_TEST_FAIL = -1,
        PF_TEST_OK,
@@ -1499,49 +1495,111 @@ pf_purge_expired_rules(void)
        }
 }
 
-void
-pf_purge_timeout(void *unused)
-{
-       /* XXX move to systqmp to avoid KERNEL_LOCK */
-       task_add(systq, &pf_purge_task);
-}
+void            pf_purge_states(void *);
+struct task     pf_purge_states_task =
+                    TASK_INITIALIZER(pf_purge_states, NULL);
+
+void            pf_purge_states_tick(void *);
+struct timeout  pf_purge_states_to =
+                    TIMEOUT_INITIALIZER(pf_purge_states_tick, NULL);
+
+unsigned int    pf_purge_expired_states(unsigned int, unsigned int);
+
+/*
+ * how many states to scan this interval.
+ *
+ * this is set when the timeout fires, and reduced by the task. the
+ * task will reschedule itself until the limit is reduced to zero,
+ * and then it adds the timeout again.
+ */
+unsigned int pf_purge_states_limit;
+
+/*
+ * limit how many states are processed with locks held per run of
+ * the state purge task.
+ */
+unsigned int pf_purge_states_collect = 64;
 
 void
-pf_purge(void *xnloops)
+pf_purge_states_tick(void *null)
 {
-       int *nloops = xnloops;
+       unsigned int limit = pf_status.states;
+       unsigned int interval = pf_default_rule.timeout[PFTM_INTERVAL];
+
+       if (limit == 0) {
+               timeout_add_sec(&pf_purge_states_to, 1);
+               return;
+       }
 
        /*
         * process a fraction of the state table every second
-        * Note:
-        *     we no longer need PF_LOCK() here, because
-        *     pf_purge_expired_states() uses pf_state_lock to maintain
-        *     consistency.
         */
-       if (pf_default_rule.timeout[PFTM_INTERVAL] > 0)
-               pf_purge_expired_states(1 + (pf_status.states
-                   / pf_default_rule.timeout[PFTM_INTERVAL]));
 
+       if (interval > 1)
+               limit /= interval;
+
+       pf_purge_states_limit = limit;
+       task_add(systqmp, &pf_purge_states_task);
+}
+
+void
+pf_purge_states(void *null)
+{
+       unsigned int limit;
+       unsigned int scanned;
+
+       limit = pf_purge_states_limit;
+       if (limit < pf_purge_states_collect)
+               limit = pf_purge_states_collect;
+
+       scanned = pf_purge_expired_states(limit, pf_purge_states_collect);
+       if (scanned >= pf_purge_states_limit) {
+               /* we've run out of states to scan this "interval" */
+               timeout_add_sec(&pf_purge_states_to, 1);
+               return;
+       }
+
+       pf_purge_states_limit -= scanned;
+       task_add(systqmp, &pf_purge_states_task);
+}
+
+void            pf_purge_tick(void *);
+struct timeout  pf_purge_to =
+                    TIMEOUT_INITIALIZER(pf_purge_tick, NULL);
+
+void            pf_purge(void *);
+struct task     pf_purge_task =
+                    TASK_INITIALIZER(pf_purge, NULL);
+
+void
+pf_purge_tick(void *null)
+{
+       task_add(systqmp, &pf_purge_task);
+}
+
+void
+pf_purge(void *null)
+{
+       unsigned int interval = max(1, pf_default_rule.timeout[PFTM_INTERVAL]);
+
+       /* XXX is NET_LOCK necessary? */
        NET_LOCK();
 
        PF_LOCK();
-       /* purge other expired types every PFTM_INTERVAL seconds */
-       if (++(*nloops) >= pf_default_rule.timeout[PFTM_INTERVAL]) {
-               pf_purge_expired_src_nodes();
-               pf_purge_expired_rules();
-       }
+
+       pf_purge_expired_src_nodes();
+       pf_purge_expired_rules();
+
        PF_UNLOCK();
 
        /*
         * Fragments don't require PF_LOCK(), they use their own lock.
         */
-       if ((*nloops) >= pf_default_rule.timeout[PFTM_INTERVAL]) {
-               pf_purge_expired_fragments();
-               *nloops = 0;
-       }
+       pf_purge_expired_fragments();
        NET_UNLOCK();
 
-       timeout_add_sec(&pf_purge_to, 1);
+       /* interpret the interval as idle time between runs */
+       timeout_add_sec(&pf_purge_to, interval);
 }
 
 int32_t
@@ -1741,8 +1799,8 @@ pf_free_state(struct pf_state *cur)
        pf_status.states--;
 }
 
-void
-pf_purge_expired_states(u_int32_t maxcheck)
+unsigned int
+pf_purge_expired_states(const unsigned int limit, const unsigned int collect)
 {
        /*
         * this task/thread/context/whatever is the only thing that
@@ -1756,6 +1814,8 @@ pf_purge_expired_states(u_int32_t maxcheck)
        struct pf_state         *st;
        SLIST_HEAD(pf_state_gcl, pf_state) gcl = SLIST_HEAD_INITIALIZER(gcl);
        time_t                   now;
+       unsigned int             scanned;
+       unsigned int             collected = 0;
 
        PF_ASSERT_UNLOCKED();
 
@@ -1769,7 +1829,7 @@ pf_purge_expired_states(u_int32_t maxcheck)
        if (head == NULL) {
                /* the list is empty */
                rw_exit_read(&pf_state_list.pfs_rwl);
-               return;
+               return (limit);
        }
 
        /* (re)start at the front of the list */
@@ -1778,28 +1838,38 @@ pf_purge_expired_states(u_int32_t maxcheck)
 
        now = getuptime();
 
-       do {
+       for (scanned = 0; scanned < limit; scanned++) {
                uint8_t stimeout = cur->timeout;
+               unsigned int limited = 0;
 
                if ((stimeout == PFTM_UNLINKED) ||
                    (pf_state_expires(cur, stimeout) <= now)) {
                        st = pf_state_ref(cur);
                        SLIST_INSERT_HEAD(&gcl, st, gc_list);
+
+                       if (++collected >= collect)
+                               limited = 1;
                }
 
                /* don't iterate past the end of our view of the list */
                if (cur == tail) {
+                       scanned = limit;
                        cur = NULL;
                        break;
                }
 
                cur = TAILQ_NEXT(cur, entry_list);
-       } while (maxcheck--);
+
+               /* don't spend too much time here. */
+               if (ISSET(READ_ONCE(curcpu()->ci_schedstate.spc_schedflags),
+                    SPCF_SHOULDYIELD) || limited)
+                       break;
+       }
 
        rw_exit_read(&pf_state_list.pfs_rwl);
 
        if (SLIST_EMPTY(&gcl))
-               return;
+               return (scanned);
 
        NET_LOCK();
        rw_enter_write(&pf_state_list.pfs_rwl);
@@ -1820,6 +1890,8 @@ pf_purge_expired_states(u_int32_t maxcheck)
                SLIST_REMOVE_HEAD(&gcl, gc_list);
                pf_state_unref(st);
        }
+
+       return (scanned);
 }
 
 int
index 60b11cb..b5d3261 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: pf_ioctl.c,v 1.387 2022/11/06 18:05:05 dlg Exp $ */
+/*     $OpenBSD: pf_ioctl.c,v 1.388 2022/11/07 12:56:38 dlg Exp $ */
 
 /*
  * Copyright (c) 2001 Daniel Hartmeier
@@ -1169,6 +1169,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p)
                                pf_status.stateid = gettime();
                                pf_status.stateid = pf_status.stateid << 32;
                        }
+                       timeout_add_sec(&pf_purge_states_to, 1);
                        timeout_add_sec(&pf_purge_to, 1);
                        pf_create_queues();
                        DPFPRINTF(LOG_NOTICE, "pf: started");
@@ -2768,8 +2769,9 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p)
                        pf_default_rule.timeout[i] =
                            pf_default_rule_new.timeout[i];
                        if (pf_default_rule.timeout[i] == PFTM_INTERVAL &&
-                           pf_default_rule.timeout[i] < old)
-                               task_add(net_tq(0), &pf_purge_task);
+                           pf_default_rule.timeout[i] < old &&
+                           timeout_del(&pf_purge_to))
+                               task_add(systqmp, &pf_purge_task);
                }
                pfi_xcommit();
                pf_trans_set_commit();
index 8d97289..91b309a 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: pfvar.h,v 1.512 2022/11/06 18:05:05 dlg Exp $ */
+/*     $OpenBSD: pfvar.h,v 1.513 2022/11/07 12:56:38 dlg Exp $ */
 
 /*
  * Copyright (c) 2001 Daniel Hartmeier
@@ -1716,7 +1716,6 @@ extern void                        pf_tbladdr_remove(struct pf_addr_wrap *);
 extern void                     pf_tbladdr_copyout(struct pf_addr_wrap *);
 extern void                     pf_calc_skip_steps(struct pf_rulequeue *);
 extern void                     pf_purge_expired_src_nodes(void);
-extern void                     pf_purge_expired_states(u_int32_t);
 extern void                     pf_purge_expired_rules(void);
 extern void                     pf_remove_state(struct pf_state *);
 extern void                     pf_remove_divert_state(struct pf_state_key *);
index ec8f61a..70d135b 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: pfvar_priv.h,v 1.10 2022/04/29 08:58:49 bluhm Exp $   */
+/*     $OpenBSD: pfvar_priv.h,v 1.11 2022/11/07 12:56:38 dlg Exp $     */
 
 /*
  * Copyright (c) 2001 Daniel Hartmeier
@@ -209,6 +209,7 @@ struct pf_pdesc {
        } hdr;
 };
 
+extern struct timeout  pf_purge_states_to;
 extern struct task     pf_purge_task;
 extern struct timeout  pf_purge_to;
 
@@ -262,8 +263,6 @@ extern struct rwlock        pf_state_lock;
                            rw_status(&pf_state_lock), __func__);\
        } while (0)
 
-extern void                     pf_purge_timeout(void *);
-extern void                     pf_purge(void *);
 #endif /* _KERNEL */
 
 #endif /* _NET_PFVAR_PRIV_H_ */