Add a mutex to pf_state structure. Mutex retain a consistency
authorsashan <sashan@openbsd.org>
Thu, 10 Nov 2022 14:22:43 +0000 (14:22 +0000)
committersashan <sashan@openbsd.org>
Thu, 10 Nov 2022 14:22:43 +0000 (14:22 +0000)
of structure members without using a global state lock.
The first member which uses protection by mutex is key[] array.
more will follow.

OK dlg@

sys/net/pf.c
sys/net/pfvar.h

index b0147c1..d5a6f09 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: pf.c,v 1.1146 2022/11/09 23:00:00 sashan Exp $ */
+/*     $OpenBSD: pf.c,v 1.1147 2022/11/10 14:22:43 sashan Exp $ */
 
 /*
  * Copyright (c) 2001 Daniel Hartmeier
@@ -185,7 +185,8 @@ int                  pf_translate_icmp_af(struct pf_pdesc*, int, void *);
 void                    pf_send_icmp(struct mbuf *, u_int8_t, u_int8_t, int,
                            sa_family_t, struct pf_rule *, u_int);
 void                    pf_detach_state(struct pf_state *);
-void                    pf_state_key_detach(struct pf_state *, int);
+void                    pf_state_key_detach(struct pf_state *,
+                           struct pf_state_key *);
 u_int32_t               pf_tcp_iss(struct pf_pdesc *);
 void                    pf_rule_to_actions(struct pf_rule *,
                            struct pf_rule_actions *);
@@ -776,7 +777,8 @@ pf_state_key_attach(struct pf_state_key *sk, struct pf_state *s, int idx)
                s->key[idx] = sk;
 
        if ((si = pool_get(&pf_state_item_pl, PR_NOWAIT)) == NULL) {
-               pf_state_key_detach(s, idx);
+               pf_state_key_detach(s, s->key[idx]);
+               s->key[idx] = NULL;
                return (-1);
        }
        si->s = s;
@@ -796,42 +798,50 @@ pf_state_key_attach(struct pf_state_key *sk, struct pf_state *s, int idx)
 void
 pf_detach_state(struct pf_state *s)
 {
-       if (s->key[PF_SK_WIRE] == s->key[PF_SK_STACK])
-               s->key[PF_SK_WIRE] = NULL;
+       struct pf_state_key *key[2];
+
+       mtx_enter(&s->mtx);
+       key[PF_SK_WIRE] = s->key[PF_SK_WIRE];
+       key[PF_SK_STACK] = s->key[PF_SK_STACK];
+       s->key[PF_SK_WIRE] = NULL;
+       s->key[PF_SK_STACK] = NULL;
+       mtx_leave(&s->mtx);
+
+       if (key[PF_SK_WIRE] == key[PF_SK_STACK])
+               key[PF_SK_WIRE] = NULL;
 
-       if (s->key[PF_SK_STACK] != NULL)
-               pf_state_key_detach(s, PF_SK_STACK);
+       if (key[PF_SK_STACK] != NULL)
+               pf_state_key_detach(s, key[PF_SK_STACK]);
 
-       if (s->key[PF_SK_WIRE] != NULL)
-               pf_state_key_detach(s, PF_SK_WIRE);
+       if (key[PF_SK_WIRE] != NULL)
+               pf_state_key_detach(s, key[PF_SK_WIRE]);
 }
 
 void
-pf_state_key_detach(struct pf_state *s, int idx)
+pf_state_key_detach(struct pf_state *s, struct pf_state_key *key)
 {
        struct pf_state_item    *si;
-       struct pf_state_key     *sk;
 
-       if (s->key[idx] == NULL)
+       PF_STATE_ASSERT_LOCKED();
+
+       if (key == NULL)
                return;
 
-       si = TAILQ_FIRST(&s->key[idx]->states);
+       si = TAILQ_FIRST(&key->states);
        while (si && si->s != s)
            si = TAILQ_NEXT(si, entry);
 
        if (si) {
-               TAILQ_REMOVE(&s->key[idx]->states, si, entry);
+               TAILQ_REMOVE(&key->states, si, entry);
                pool_put(&pf_state_item_pl, si);
        }
 
-       sk = s->key[idx];
-       s->key[idx] = NULL;
-       if (TAILQ_EMPTY(&sk->states)) {
-               RB_REMOVE(pf_state_tree, &pf_statetbl, sk);
-               sk->removed = 1;
-               pf_state_key_unlink_reverse(sk);
-               pf_state_key_unlink_inpcb(sk);
-               pf_state_key_unref(sk);
+       if (TAILQ_EMPTY(&key->states)) {
+               RB_REMOVE(pf_state_tree, &pf_statetbl, key);
+               key->removed = 1;
+               pf_state_key_unlink_reverse(key);
+               pf_state_key_unlink_inpcb(key);
+               pf_state_key_unref(key);
        }
 }
 
@@ -994,7 +1004,9 @@ pf_state_insert(struct pfi_kif *kif, struct pf_state_key **skw,
                }
                *skw = s->key[PF_SK_WIRE];
                if (pf_state_key_attach(*sks, s, PF_SK_STACK)) {
-                       pf_state_key_detach(s, PF_SK_WIRE);
+                       pf_state_key_detach(s, s->key[PF_SK_WIRE]);
+                       s->key[PF_SK_WIRE] = NULL;
+                       *skw = NULL;
                        PF_STATE_EXIT_WRITE();
                        return (-1);
                }
@@ -1426,6 +1438,7 @@ pf_state_import(const struct pfsync_state *sp, int flags)
        st->sync_state = PFSYNC_S_NONE;
 
        refcnt_init(&st->refcnt);
+       mtx_init(&st->mtx, IPL_NET);
 
        /* XXX when we have anchors, use STATE_INC_COUNTERS */
        r->states_cur++;
@@ -4319,6 +4332,7 @@ pf_create_state(struct pf_pdesc *pd, struct pf_rule *r, struct pf_rule *a,
         * pf_state_inserts() grabs reference for pfsync!
         */
        refcnt_init(&s->refcnt);
+       mtx_init(&s->mtx, IPL_NET);
 
        switch (pd->proto) {
        case IPPROTO_TCP:
index ced0e95..800970e 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: pfvar.h,v 1.515 2022/11/09 23:00:00 sashan Exp $ */
+/*     $OpenBSD: pfvar.h,v 1.516 2022/11/10 14:22:43 sashan Exp $ */
 
 /*
  * Copyright (c) 2001 Daniel Hartmeier
@@ -741,37 +741,49 @@ struct pf_state_cmp {
        u_int8_t                 pad[3];
 };
 
+/*
+ * pf_state structure members require protection
+ * either by state lock (a global rw-lock) or
+ * need to be protected by mutex which is part
+ * of state. The annotation goes as follows:
+ *     - stateRW: means protected by exclusive state lock
+ *     - mtx: protected by mutex (pf_state::state_mtx)
+ *     - syncMTX: protected by mutex in pfsync
+ *     - I: immutable since creation
+ *     - ?: needs to be revisited
+ */
 struct pf_state {
-       u_int64_t                id;
-       u_int32_t                creatorid;
-       u_int8_t                 direction;
+       u_int64_t                id;            /* I */
+       u_int32_t                creatorid;     /* I */
+       u_int8_t                 direction;     /* I */
        u_int8_t                 pad[3];
 
-       TAILQ_ENTRY(pf_state)    sync_list;
-       TAILQ_ENTRY(pf_state)    sync_snap;
-       TAILQ_ENTRY(pf_state)    entry_list;
-       SLIST_ENTRY(pf_state)    gc_list;
-       RB_ENTRY(pf_state)       entry_id;
+       TAILQ_ENTRY(pf_state)    sync_list;     /* syncMTX */
+       TAILQ_ENTRY(pf_state)    sync_snap;     /* syncMTX */
+       TAILQ_ENTRY(pf_state)    entry_list;    /* stateRW */
+       SLIST_ENTRY(pf_state)    gc_list;       /* syncMTX */
+       RB_ENTRY(pf_state)       entry_id;      /* stateRW */
        struct pf_state_peer     src;
        struct pf_state_peer     dst;
-       struct pf_rule_slist     match_rules;
-       union pf_rule_ptr        rule;
-       union pf_rule_ptr        anchor;
-       union pf_rule_ptr        natrule;
-       struct pf_addr           rt_addr;
-       struct pf_sn_head        src_nodes;
-       struct pf_state_key     *key[2];        /* addresses stack and wire  */
-       struct pfi_kif          *kif;
-       u_int64_t                packets[2];
-       u_int64_t                bytes[2];
-       int32_t                  creation;
-       int32_t                  expire;
-       int32_t                  pfsync_time;
-       int                      rtableid[2];   /* rtables stack and wire */
-       u_int16_t                qid;
-       u_int16_t                pqid;
-       u_int16_t                tag;
-       u_int16_t                state_flags;
+       struct pf_rule_slist     match_rules;   /* I */
+       union pf_rule_ptr        rule;          /* I */
+       union pf_rule_ptr        anchor;        /* I */
+       union pf_rule_ptr        natrule;       /* I */
+       struct pf_addr           rt_addr;       /* I */
+       struct pf_sn_head        src_nodes;     /* I */
+       struct pf_state_key     *key[2];        /* mtx, stack and wire  */
+       struct pfi_kif          *kif;           /* I */
+       struct mutex             mtx;
+       u_int64_t                packets[2];    /* ? */
+       u_int64_t                bytes[2];      /* ? */
+       int32_t                  creation;      /* I */
+       int32_t                  expire;        /* ? */
+       int32_t                  pfsync_time;   /* ? */
+       int                      rtableid[2];   /* I rtables stack and wire */
+       u_int16_t                qid;           /* I */
+       u_int16_t                pqid;          /* I */
+       u_int16_t                tag;           /* I */
+       u_int16_t                state_flags;   /* ? */
 #define        PFSTATE_ALLOWOPTS       0x0001
 #define        PFSTATE_SLOPPY          0x0002
 #define        PFSTATE_PFLOW           0x0004
@@ -785,20 +797,20 @@ struct pf_state {
 #define        PFSTATE_INP_UNLINKED    0x0400
 #define        PFSTATE_SCRUBMASK (PFSTATE_NODF|PFSTATE_RANDOMID|PFSTATE_SCRUB_TCP)
 #define        PFSTATE_SETMASK   (PFSTATE_SETTOS|PFSTATE_SETPRIO)
-       u_int8_t                 log;
-       u_int8_t                 timeout;
-       u_int8_t                 sync_state; /* PFSYNC_S_x */
-       u_int8_t                 sync_updates;
-       u_int8_t                 min_ttl;
-       u_int8_t                 set_tos;
-       u_int8_t                 set_prio[2];
-       u_int16_t                max_mss;
-       u_int16_t                if_index_in;
-       u_int16_t                if_index_out;
+       u_int8_t                 log;           /* I */
+       u_int8_t                 timeout;       /* ? */
+       u_int8_t                 sync_state;    /* ? PFSYNC_S_x */
+       u_int8_t                 sync_updates;  /* ? */
+       u_int8_t                 min_ttl;       /* I */
+       u_int8_t                 set_tos;       /* I */
+       u_int8_t                 set_prio[2];   /* I */
+       u_int16_t                max_mss;       /* I */
+       u_int16_t                if_index_in;   /* I */
+       u_int16_t                if_index_out;  /* I */
        pf_refcnt_t              refcnt;
-       u_int16_t                delay;
-       u_int8_t                 rt;
-       u_int8_t                 snapped;
+       u_int16_t                delay;         /* I */
+       u_int8_t                 rt;            /* I */
+       u_int8_t                 snapped;       /* syncMTX */
 };
 
 /*