- moving state look up outside of PF_LOCK()
author: sashan <sashan@openbsd.org>
Tue, 11 Sep 2018 07:53:38 +0000 (07:53 +0000)
committer: sashan <sashan@openbsd.org>
Tue, 11 Sep 2018 07:53:38 +0000 (07:53 +0000)
this change adds a pf_state_lock rw-lock, which protects consistency
of state table in PF. The code delivered in this change is guarded
by 'WITH_PF_LOCK', which is still undefined. People who are willing
to experiment and want to run it must do two things:

- compile kernel with -DWITH_PF_LOCK
- bump NET_TASKQ from 1 to ... sky is the limit,
  (just select some sensible value for number of tasks your
  system is able to handle)

OK bluhm@

sys/net/if_pfsync.c
sys/net/if_pfsync.h
sys/net/pf.c
sys/net/pf_ioctl.c
sys/net/pfvar.h
sys/net/pfvar_priv.h

index 3f71ffc..8d86f42 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: if_pfsync.c,v 1.258 2018/08/24 12:29:33 sashan Exp $  */
+/*     $OpenBSD: if_pfsync.c,v 1.259 2018/09/11 07:53:38 sashan Exp $  */
 
 /*
  * Copyright (c) 2002 Michael Shalayeff
@@ -114,6 +114,8 @@ int pfsync_in_eof(caddr_t, int, int, int);
 
 int    pfsync_in_error(caddr_t, int, int, int);
 
+void   pfsync_update_state_locked(struct pf_state *);
+
 struct {
        int     (*in)(caddr_t, int, int, int);
        size_t  len;
@@ -602,6 +604,8 @@ pfsync_state_import(struct pfsync_state *sp, int flags)
        st->pfsync_time = time_uptime;
        st->sync_state = PFSYNC_S_NONE;
 
+       refcnt_init(&st->refcnt);
+
        /* XXX when we have anchors, use STATE_INC_COUNTERS */
        r->states_cur++;
        r->states_tot++;
@@ -609,6 +613,10 @@ pfsync_state_import(struct pfsync_state *sp, int flags)
        if (!ISSET(flags, PFSYNC_SI_IOCTL))
                SET(st->state_flags, PFSTATE_NOSYNC);
 
+       /*
+        * We just set the PFSTATE_NOSYNC bit, which prevents
+        * pfsync_insert_state() from inserting the state into pfsync.
+        */
        if (pf_state_insert(kif, &skw, &sks, st) != 0) {
                /* XXX when we have anchors, use STATE_DEC_COUNTERS */
                r->states_cur--;
@@ -941,7 +949,7 @@ pfsync_in_upd(caddr_t buf, int len, int count, int flags)
                if (sync) {
                        pfsyncstat_inc(pfsyncs_stale);
 
-                       pfsync_update_state(st);
+                       pfsync_update_state_locked(st);
                        schednetisr(NETISR_PFSYNC);
                }
        }
@@ -1015,7 +1023,7 @@ pfsync_in_upd_c(caddr_t buf, int len, int count, int flags)
                if (sync) {
                        pfsyncstat_inc(pfsyncs_stale);
 
-                       pfsync_update_state(st);
+                       pfsync_update_state_locked(st);
                        schednetisr(NETISR_PFSYNC);
                }
        }
@@ -1470,6 +1478,7 @@ pfsync_drop(struct pfsync_softc *sc)
                        KASSERT(st->sync_state == q);
 #endif
                        st->sync_state = PFSYNC_S_NONE;
+                       pf_state_unref(st);
                }
        }
 
@@ -1618,13 +1627,14 @@ pfsync_sendout(void)
                count = 0;
                while ((st = TAILQ_FIRST(&sc->sc_qs[q])) != NULL) {
                        TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list);
 #ifdef PFSYNC_DEBUG
                        KASSERT(st->sync_state == q);
 #endif
+                       st->sync_state = PFSYNC_S_NONE;
                        pfsync_qs[q].write(st, m->m_data + offset);
                        offset += pfsync_qs[q].len;
 
-                       st->sync_state = PFSYNC_S_NONE;
+                       pf_state_unref(st);
                        count++;
                }
 
@@ -1720,7 +1730,7 @@ pfsync_defer(struct pf_state *st, struct mbuf *m)
        m->m_pkthdr.pf.flags |= PF_TAG_GENERATED;
        SET(st->state_flags, PFSTATE_ACK);
 
-       pd->pd_st = st;
+       pd->pd_st = pf_state_ref(st);
        pd->pd_m = m;
 
        sc->sc_deferred++;
@@ -1768,6 +1778,8 @@ pfsync_undefer(struct pfsync_deferral *pd, int drop)
                                    pd->pd_st->rule.ptr, pd->pd_st);
                                break;
 #endif /* INET6 */
+                       default:
+                               unhandled_af(pd->pd_st->key[PF_SK_WIRE]->af);
                        }
                        pd->pd_m = pdesc.m;
                } else {
@@ -1782,10 +1794,13 @@ pfsync_undefer(struct pfsync_deferral *pd, int drop)
                                    NULL, NULL);
                                break;
 #endif /* INET6 */
+                       default:
+                               unhandled_af(pd->pd_st->key[PF_SK_WIRE]->af);
                        }
                }
        }
  out:
+       pf_state_unref(pd->pd_st);
        pool_put(&sc->sc_pool, pd);
 }
 
@@ -1817,12 +1832,13 @@ pfsync_deferred(struct pf_state *st, int drop)
 }
 
 void
-pfsync_update_state(struct pf_state *st)
+pfsync_update_state_locked(struct pf_state *st)
 {
        struct pfsync_softc *sc = pfsyncif;
        int sync = 0;
 
        NET_ASSERT_LOCKED();
+       PF_ASSERT_LOCKED();
 
        if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING))
                return;
@@ -1867,6 +1883,22 @@ pfsync_update_state(struct pf_state *st)
                schednetisr(NETISR_PFSYNC);
 }
 
+void
+pfsync_update_state(struct pf_state *st, int *have_pf_lock)
+{
+       struct pfsync_softc *sc = pfsyncif;
+
+       if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING))
+               return;
+
+       if (*have_pf_lock == 0) {
+               PF_LOCK();
+               *have_pf_lock = 1;
+       }
+
+       pfsync_update_state_locked(st);
+}
+
 void
 pfsync_cancel_full_update(struct pfsync_softc *sc)
 {
@@ -2017,9 +2049,22 @@ pfsync_delete_state(struct pf_state *st)
        case PFSYNC_S_UPD:
        case PFSYNC_S_IACK:
                pfsync_q_del(st);
-               /* FALLTHROUGH to putting it on the del list */
+               /*
+                * FALLTHROUGH to putting it on the del list
+                * Note on reference count bookkeeping:
+                *      pfsync_q_del() drops reference for queue
+                *      ownership. But the st entry survives, because
+                *      our caller still holds a reference.
+                */
 
        case PFSYNC_S_NONE:
+               /*
+                * We either fall through here, or there is no reference to
+                * st owned by pfsync queues at this point.
+                *
+                * Calling pfsync_q_ins() puts st on the del queue. The pfsync_q_ins()
+                * grabs a reference for delete queue.
+                */
                pfsync_q_ins(st, PFSYNC_S_DEL);
                return;
 
@@ -2077,6 +2122,7 @@ pfsync_q_ins(struct pf_state *st, int q)
        }
 
        sc->sc_len += nlen;
+       pf_state_ref(st);
        TAILQ_INSERT_TAIL(&sc->sc_qs[q], st, sync_list);
        st->sync_state = q;
 }
@@ -2092,6 +2138,7 @@ pfsync_q_del(struct pf_state *st)
        sc->sc_len -= pfsync_qs[q].len;
        TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list);
        st->sync_state = PFSYNC_S_NONE;
+       pf_state_unref(st);
 
        if (TAILQ_EMPTY(&sc->sc_qs[q]))
                sc->sc_len -= sizeof(struct pfsync_subheader);
index ef24c22..3c3b7dd 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: if_pfsync.h,v 1.53 2017/04/14 20:46:31 bluhm Exp $    */
+/*     $OpenBSD: if_pfsync.h,v 1.54 2018/09/11 07:53:38 sashan Exp $   */
 
 /*
  * Copyright (c) 2001 Michael Shalayeff
@@ -328,7 +328,7 @@ void                        pfsync_state_export(struct pfsync_state *,
                            struct pf_state *);
 
 void                   pfsync_insert_state(struct pf_state *);
-void                   pfsync_update_state(struct pf_state *);
+void                   pfsync_update_state(struct pf_state *, int *);
 void                   pfsync_delete_state(struct pf_state *);
 void                   pfsync_clear_states(u_int32_t, const char *);
 
index 5f39f29..03f12f4 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: pf.c,v 1.1073 2018/07/22 09:09:18 sf Exp $ */
+/*     $OpenBSD: pf.c,v 1.1074 2018/09/11 07:53:38 sashan Exp $ */
 
 /*
  * Copyright (c) 2001 Daniel Hartmeier
@@ -1250,11 +1250,17 @@ pf_purge(void *xnloops)
        KERNEL_LOCK();
        NET_LOCK();
 
-       PF_LOCK();
-       /* process a fraction of the state table every second */
+       /*
+        * process a fraction of the state table every second
+        * Note:
+        *      we no longer need PF_LOCK() here, because
+        *      pf_purge_expired_states() uses pf_state_lock to maintain
+        *      consistency.
+        */
        pf_purge_expired_states(1 + (pf_status.states
            / pf_default_rule.timeout[PFTM_INTERVAL]));
 
+       PF_LOCK();
        /* purge other expired types every PFTM_INTERVAL seconds */
        if (++(*nloops) >= pf_default_rule.timeout[PFTM_INTERVAL]) {
                pf_purge_expired_src_nodes();
@@ -1430,7 +1436,7 @@ pf_free_state(struct pf_state *cur)
        TAILQ_REMOVE(&state_list, cur, entry_list);
        if (cur->tag)
                pf_tag_unref(cur->tag);
-       pool_put(&pf_state_pl, cur);
+       pf_state_unref(cur);
        pf_status.fcounters[FCNT_STATE_REMOVALS]++;
        pf_status.states--;
 }
@@ -1440,13 +1446,16 @@ pf_purge_expired_states(u_int32_t maxcheck)
 {
        static struct pf_state  *cur = NULL;
        struct pf_state         *next;
+       SLIST_HEAD(pf_state_gcl, pf_state) gcl;
 
-       PF_ASSERT_LOCKED();
+       PF_ASSERT_UNLOCKED();
+       SLIST_INIT(&gcl);
 
+       PF_STATE_ENTER_READ();
        while (maxcheck--) {
                /* wrap to start of list when we hit the end */
                if (cur == NULL) {
-                       cur = TAILQ_FIRST(&state_list);
+                       cur = pf_state_ref(TAILQ_FIRST(&state_list));
                        if (cur == NULL)
                                break;  /* list empty */
                }
@@ -1454,16 +1463,31 @@ pf_purge_expired_states(u_int32_t maxcheck)
                /* get next state, as cur may get deleted */
                next = TAILQ_NEXT(cur, entry_list);
 
-               if (cur->timeout == PFTM_UNLINKED) {
-                       /* free removed state */
-                       pf_free_state(cur);
-               } else if (pf_state_expires(cur) <= time_uptime) {
-                       /* remove and free expired state */
-                       pf_remove_state(cur);
-                       pf_free_state(cur);
+               if ((cur->timeout == PFTM_UNLINKED) ||
+                   (pf_state_expires(cur) <= time_uptime))
+                       SLIST_INSERT_HEAD(&gcl, cur, gc_list);
+               else
+                       pf_state_unref(cur);
+
+               cur = pf_state_ref(next);
+       }
+       PF_STATE_EXIT_READ();
+
+       PF_LOCK();
+       PF_STATE_ENTER_WRITE();
+       while ((next = SLIST_FIRST(&gcl)) != NULL) {
+               SLIST_REMOVE_HEAD(&gcl, gc_list);
+               if (next->timeout == PFTM_UNLINKED)
+                       pf_free_state(next);
+               else if (pf_state_expires(next) <= time_uptime) {
+                       pf_remove_state(next);
+                       pf_free_state(next);
                }
-               cur = next;
+
+               pf_state_unref(next);
        }
+       PF_STATE_EXIT_WRITE();
+       PF_UNLOCK();
 }
 
 int
@@ -3973,6 +3997,11 @@ pf_create_state(struct pf_pdesc *pd, struct pf_rule *r, struct pf_rule *a,
        s->set_prio[1] = act->set_prio[1];
        s->delay = act->delay;
        SLIST_INIT(&s->src_nodes);
+       /*
+        * must initialize refcnt, before pf_state_insert() gets called.
+        * pf_state_insert() grabs a reference for pfsync!
+        */
+       refcnt_init(&s->refcnt);
 
        switch (pd->proto) {
        case IPPROTO_TCP:
@@ -4774,7 +4803,7 @@ pf_test_state(struct pf_pdesc *pd, struct pf_state **state, u_short *reason,
                                        addlog("\n");
                                }
                                /* XXX make sure it's the same direction ?? */
-                               pf_remove_state(*state);
+                               (*state)->timeout = PFTM_PURGE;
                                *state = NULL;
                                pd->m->m_pkthdr.pf.inp = inp;
                                return (PF_DROP);
@@ -6770,6 +6799,7 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp, struct mbuf **m0)
        struct pf_pdesc          pd;
        int                      dir = (fwdir == PF_FWD) ? PF_OUT : fwdir;
        u_int32_t                qid, pqid = 0;
+       int                      have_pf_lock = 0;
 
        if (!pf_status.running)
                return (PF_PASS);
@@ -6859,9 +6889,6 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp, struct mbuf **m0)
                pd.lookup.pid = NO_PID;
        }
 
-       /* lock the lookup/write section of pf_test() */
-       PF_LOCK();
-
        switch (pd.virtual_proto) {
 
        case PF_VPROTO_FRAGMENT: {
@@ -6869,7 +6896,10 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp, struct mbuf **m0)
                 * handle fragments that aren't reassembled by
                 * normalization
                 */
+               PF_LOCK();
+               have_pf_lock = 1;
                action = pf_test_rule(&pd, &r, &s, &a, &ruleset, &reason);
+               s = pf_state_ref(s);
                if (action != PF_PASS)
                        REASON_SET(&reason, PFRES_FRAG);
                break;
@@ -6883,19 +6913,26 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp, struct mbuf **m0)
                            "dropping IPv6 packet with ICMPv4 payload");
                        break;
                }
+               PF_STATE_ENTER_READ();
                action = pf_test_state_icmp(&pd, &s, &reason);
+               s = pf_state_ref(s);
+               PF_STATE_EXIT_READ();
                if (action == PF_PASS || action == PF_AFRT) {
 #if NPFSYNC > 0
-                       pfsync_update_state(s);
+                       pfsync_update_state(s, &have_pf_lock);
 #endif /* NPFSYNC > 0 */
                        r = s->rule.ptr;
                        a = s->anchor.ptr;
 #if NPFLOG > 0
                        pd.pflog |= s->log;
 #endif /* NPFLOG > 0 */
-               } else if (s == NULL)
+               } else if (s == NULL) {
+                       PF_LOCK();
+                       have_pf_lock = 1;
                        action = pf_test_rule(&pd, &r, &s, &a, &ruleset,
                            &reason);
+                       s = pf_state_ref(s);
+               }
                break;
        }
 
@@ -6908,19 +6945,26 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp, struct mbuf **m0)
                            "dropping IPv4 packet with ICMPv6 payload");
                        break;
                }
+               PF_STATE_ENTER_READ();
                action = pf_test_state_icmp(&pd, &s, &reason);
+               s = pf_state_ref(s);
+               PF_STATE_EXIT_READ();
                if (action == PF_PASS || action == PF_AFRT) {
 #if NPFSYNC > 0
-                       pfsync_update_state(s);
+                       pfsync_update_state(s, &have_pf_lock);
 #endif /* NPFSYNC > 0 */
                        r = s->rule.ptr;
                        a = s->anchor.ptr;
 #if NPFLOG > 0
                        pd.pflog |= s->log;
 #endif /* NPFLOG > 0 */
-               } else if (s == NULL)
+               } else if (s == NULL) {
+                       PF_LOCK();
+                       have_pf_lock = 1;
                        action = pf_test_rule(&pd, &r, &s, &a, &ruleset,
                            &reason);
+                       s = pf_state_ref(s);
+               }
                break;
        }
 #endif /* INET6 */
@@ -6930,6 +6974,8 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp, struct mbuf **m0)
                        if (pd.dir == PF_IN && (pd.hdr.tcp.th_flags &
                            (TH_SYN|TH_ACK)) == TH_SYN &&
                            pf_synflood_check(&pd)) {
+                               PF_LOCK();
+                               have_pf_lock = 1;
                                pf_syncookie_send(&pd);
                                action = PF_DROP;
                                break;
@@ -6940,7 +6986,10 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp, struct mbuf **m0)
                        if (action == PF_DROP)
                                break;
                }
+               PF_STATE_ENTER_READ();
                action = pf_test_state(&pd, &s, &reason, 0);
+               s = pf_state_ref(s);
+               PF_STATE_EXIT_READ();
                if (s == NULL && action != PF_PASS && action != PF_AFRT &&
                    pd.dir == PF_IN && pd.virtual_proto == IPPROTO_TCP &&
                    (pd.hdr.tcp.th_flags & (TH_SYN|TH_ACK|TH_RST)) == TH_ACK &&
@@ -6948,25 +6997,27 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp, struct mbuf **m0)
                        struct mbuf     *msyn;
                        msyn = pf_syncookie_recreate_syn(&pd);
                        if (msyn) {
-                               PF_UNLOCK();
                                action = pf_test(af, fwdir, ifp, &msyn);
-                               PF_LOCK();
                                m_freem(msyn);
                                if (action == PF_PASS || action == PF_AFRT) {
+                                       PF_STATE_ENTER_READ();
                                        pf_test_state(&pd, &s, &reason, 1);
-                                       if (s == NULL) {
-                                               PF_UNLOCK();
+                                       s = pf_state_ref(s);
+                                       PF_STATE_EXIT_READ();
+                                       if (s == NULL)
                                                return (PF_DROP);
-                                       }
                                        s->src.seqhi =
                                            ntohl(pd.hdr.tcp.th_ack) - 1;
                                        s->src.seqlo =
                                            ntohl(pd.hdr.tcp.th_seq) - 1;
                                        pf_set_protostate(s, PF_PEER_SRC,
                                            PF_TCPS_PROXY_DST);
+                                       PF_LOCK();
+                                       have_pf_lock = 1;
                                        action = pf_synproxy(&pd, &s, &reason);
                                        if (action != PF_PASS) {
                                                PF_UNLOCK();
+                                               pf_state_unref(s);
                                                return (action);
                                        }
                                }
@@ -6974,19 +7025,22 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp, struct mbuf **m0)
                                action = PF_DROP;
                }
 
-
                if (action == PF_PASS || action == PF_AFRT) {
 #if NPFSYNC > 0
-                       pfsync_update_state(s);
+                       pfsync_update_state(s, &have_pf_lock);
 #endif /* NPFSYNC > 0 */
                        r = s->rule.ptr;
                        a = s->anchor.ptr;
 #if NPFLOG > 0
                        pd.pflog |= s->log;
 #endif /* NPFLOG > 0 */
-               } else if (s == NULL)
+               } else if (s == NULL) {
+                       PF_LOCK();
+                       have_pf_lock = 1;
                        action = pf_test_rule(&pd, &r, &s, &a, &ruleset,
                            &reason);
+                       s = pf_state_ref(s);
+               }
 
                if (pd.virtual_proto == IPPROTO_TCP) {
                        if (s) {
@@ -6999,12 +7053,13 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp, struct mbuf **m0)
                break;
        }
 
-       PF_UNLOCK();
+       if (have_pf_lock != 0)
+               PF_UNLOCK();
 
        /*
         * At the moment, we rely on NET_LOCK() to prevent removal of items
-        * we've collected above ('s', 'r', 'anchor' and 'ruleset').  They'll
-        * have to be refcounted when NET_LOCK() is gone.
+        * we've collected above ('r', 'anchor' and 'ruleset').  They'll have
+        * to be refcounted when NET_LOCK() is gone.
         */
 
 done:
@@ -7201,6 +7256,8 @@ done:
 
        *m0 = pd.m;
 
+       pf_state_unref(s);
+
        return (action);
 }
 
@@ -7401,6 +7458,33 @@ pf_state_key_unlink_reverse(struct pf_state_key *sk)
        }
 }
 
+struct pf_state *
+pf_state_ref(struct pf_state *s)
+{
+       if (s != NULL)
+               PF_REF_TAKE(s->refcnt);
+       return (s);
+}
+
+void
+pf_state_unref(struct pf_state *s)
+{
+       if ((s != NULL) && PF_REF_RELE(s->refcnt)) {
+               /* never inserted or removed */
+#if NPFSYNC > 0
+               KASSERT((TAILQ_NEXT(s, sync_list) == NULL) ||
+                   ((TAILQ_NEXT(s, sync_list) == _Q_INVALID) &&
+                   (s->sync_state == PFSYNC_S_NONE)));
+#endif /* NPFSYNC > 0 */
+               KASSERT((TAILQ_NEXT(s, entry_list) == NULL) ||
+                   (TAILQ_NEXT(s, entry_list) == _Q_INVALID));
+               KASSERT((s->key[PF_SK_WIRE] == NULL) &&
+                   (s->key[PF_SK_STACK] == NULL));
+
+               pool_put(&pf_state_pl, s);
+       }
+}
+
 int
 pf_delay_pkt(struct mbuf *m, u_int ifidx)
 {
index a639bd6..7277443 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: pf_ioctl.c,v 1.336 2018/07/22 09:09:18 sf Exp $ */
+/*     $OpenBSD: pf_ioctl.c,v 1.337 2018/09/11 07:53:38 sashan Exp $ */
 
 /*
  * Copyright (c) 2001 Daniel Hartmeier
@@ -132,7 +132,21 @@ TAILQ_HEAD(pf_tags, pf_tagname)    pf_tags = TAILQ_HEAD_INITIALIZER(pf_tags),
                                pf_qids = TAILQ_HEAD_INITIALIZER(pf_qids);
 
 #ifdef WITH_PF_LOCK
+/*
+ * pf_lock protects consistency of PF data structures, which don't have
+ * their dedicated lock yet. The pf_lock currently protects:
+ *     - rules,
+ *     - radix tables,
+ *     - source nodes
+ * All callers must grab pf_lock exclusively.
+ *
+ * pf_state_lock protects consistency of the state table. Packets that do a
+ * state lookup grab the lock as readers. If a packet must create state, it
+ * must grab the lock as a writer. Whenever a packet creates state, it grabs
+ * pf_lock first, then locks pf_state_lock as a writer.
+ */
 struct rwlock           pf_lock = RWLOCK_INITIALIZER("pf_lock");
+struct rwlock           pf_state_lock = RWLOCK_INITIALIZER("pf_state_lock");
 #endif /* WITH_PF_LOCK */
 
 #if (PF_QNAME_SIZE != PF_TAG_NAME_SIZE)
@@ -1501,6 +1515,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p)
                u_int                    killed = 0;
 
                PF_LOCK();
+               PF_STATE_ENTER_WRITE();
                for (s = RB_MIN(pf_state_tree_id, &tree_id); s; s = nexts) {
                        nexts = RB_NEXT(pf_state_tree_id, &tree_id, s);
 
@@ -1514,6 +1529,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p)
                                killed++;
                        }
                }
+               PF_STATE_EXIT_WRITE();
                psk->psk_killed = killed;
 #if NPFSYNC > 0
                pfsync_clear_states(pf_status.hostid, psk->psk_ifname);
@@ -1537,10 +1553,12 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p)
                        if (psk->psk_pfcmp.creatorid == 0)
                                psk->psk_pfcmp.creatorid = pf_status.hostid;
                        PF_LOCK();
+                       PF_STATE_ENTER_WRITE();
                        if ((s = pf_find_state_byid(&psk->psk_pfcmp))) {
                                pf_remove_state(s);
                                psk->psk_killed = 1;
                        }
+                       PF_STATE_EXIT_WRITE();
                        PF_UNLOCK();
                        break;
                }
@@ -1554,6 +1572,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p)
                        key.rdomain = psk->psk_rdomain;
 
                        PF_LOCK();
+                       PF_STATE_ENTER_WRITE();
                        for (i = 0; i < nitems(dirs); i++) {
                                if (dirs[i] == PF_IN) {
                                        sidx = 0;
@@ -1594,11 +1613,13 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p)
                        }
                        if (killed)
                                psk->psk_killed = killed;
+                       PF_STATE_EXIT_WRITE();
                        PF_UNLOCK();
                        break;
                }
 
                PF_LOCK();
+               PF_STATE_ENTER_WRITE();
                for (s = RB_MIN(pf_state_tree_id, &tree_id); s;
                    s = nexts) {
                        nexts = RB_NEXT(pf_state_tree_id, &tree_id, s);
@@ -1644,6 +1665,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p)
                        }
                }
                psk->psk_killed = killed;
+               PF_STATE_EXIT_WRITE();
                PF_UNLOCK();
                break;
        }
@@ -1658,7 +1680,9 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p)
                        break;
                }
                PF_LOCK();
+               PF_STATE_ENTER_WRITE();
                error = pfsync_state_import(sp, PFSYNC_SI_IOCTL);
+               PF_STATE_EXIT_WRITE();
                PF_UNLOCK();
                break;
        }
@@ -1673,16 +1697,17 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p)
                id_key.id = ps->state.id;
                id_key.creatorid = ps->state.creatorid;
 
-               PF_LOCK();
+               PF_STATE_ENTER_READ();
                s = pf_find_state_byid(&id_key);
+               s = pf_state_ref(s);
+               PF_STATE_EXIT_READ();
                if (s == NULL) {
                        error = ENOENT;
-                       PF_UNLOCK();
                        break;
                }
 
                pf_state_export(&ps->state, s);
-               PF_UNLOCK();
+               pf_state_unref(s);
                break;
        }
 
@@ -1702,7 +1727,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p)
 
                p = ps->ps_states;
 
-               PF_LOCK();
+               PF_STATE_ENTER_READ();
                state = TAILQ_FIRST(&state_list);
                while (state) {
                        if (state->timeout != PFTM_UNLINKED) {
@@ -1712,7 +1737,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p)
                                error = copyout(pstore, p, sizeof(*p));
                                if (error) {
                                        free(pstore, M_TEMP, sizeof(*pstore));
-                                       PF_UNLOCK();
+                                       PF_STATE_EXIT_READ();
                                        goto fail;
                                }
                                p++;
@@ -1720,7 +1745,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p)
                        }
                        state = TAILQ_NEXT(state, entry_list);
                }
-               PF_UNLOCK();
+               PF_STATE_EXIT_READ();
 
                ps->ps_len = sizeof(struct pfsync_state) * nr;
 
@@ -1801,8 +1826,10 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p)
                        PF_ACPY(&key.addr[didx], &pnl->daddr, pnl->af);
                        key.port[didx] = pnl->dport;
 
-                       PF_LOCK();
+                       PF_STATE_ENTER_READ();
                        state = pf_find_state_all(&key, direction, &m);
+                       state = pf_state_ref(state);
+                       PF_STATE_EXIT_READ();
 
                        if (m > 1)
                                error = E2BIG;  /* more than one state */
@@ -1815,7 +1842,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p)
                                pnl->rrdomain = sk->rdomain;
                        } else
                                error = ENOENT;
-                       PF_UNLOCK();
+                       pf_state_unref(state);
                }
                break;
        }
@@ -2576,8 +2603,10 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p)
                struct pf_state         *state;
 
                PF_LOCK();
+               PF_STATE_ENTER_WRITE();
                RB_FOREACH(state, pf_state_tree_id, &tree_id)
                        pf_src_tree_remove_state(state);
+               PF_STATE_EXIT_WRITE();
                RB_FOREACH(n, pf_src_tree, &tree_src_tracking)
                        n->expire = 1;
                pf_purge_expired_src_nodes();
@@ -2603,10 +2632,14 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p)
                                &psnk->psnk_dst.addr.v.a.mask,
                                &sn->raddr, sn->af)) {
                                /* Handle state to src_node linkage */
-                               if (sn->states != 0)
+                               if (sn->states != 0) {
+                                       PF_ASSERT_LOCKED();
+                                       PF_STATE_ENTER_WRITE();
                                        RB_FOREACH(s, pf_state_tree_id,
                                           &tree_id)
                                                pf_state_rm_src_node(s, sn);
+                                       PF_STATE_EXIT_WRITE();
+                               }
                                sn->expire = 1;
                                killed++;
                        }
index 97c13aa..fb43fe0 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: pfvar.h,v 1.484 2018/09/10 11:37:26 bluhm Exp $ */
+/*     $OpenBSD: pfvar.h,v 1.485 2018/09/11 07:53:38 sashan Exp $ */
 
 /*
  * Copyright (c) 2001 Daniel Hartmeier
@@ -761,6 +761,7 @@ struct pf_state {
 
        TAILQ_ENTRY(pf_state)    sync_list;
        TAILQ_ENTRY(pf_state)    entry_list;
+       SLIST_ENTRY(pf_state)    gc_list;
        RB_ENTRY(pf_state)       entry_id;
        struct pf_state_peer     src;
        struct pf_state_peer     dst;
@@ -805,6 +806,7 @@ struct pf_state {
        u_int16_t                max_mss;
        u_int16_t                if_index_in;
        u_int16_t                if_index_out;
+       pf_refcnt_t              refcnt;
        u_int16_t                delay;
 };
 
index 466548c..2b62cfe 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: pfvar_priv.h,v 1.4 2017/08/06 13:16:11 mpi Exp $      */
+/*     $OpenBSD: pfvar_priv.h,v 1.5 2018/09/11 07:53:38 sashan Exp $   */
 
 /*
  * Copyright (c) 2001 Daniel Hartmeier
@@ -101,8 +101,12 @@ struct pf_pdesc {
 extern struct task     pf_purge_task;
 extern struct timeout  pf_purge_to;
 
+struct pf_state                *pf_state_ref(struct pf_state *);
+void                    pf_state_unref(struct pf_state *);
+
 #ifdef WITH_PF_LOCK
 extern struct rwlock   pf_lock;
+extern struct rwlock   pf_state_lock;
 
 #define PF_LOCK()              do {                    \
                NET_ASSERT_LOCKED();                    \
@@ -124,11 +128,42 @@ extern struct rwlock      pf_lock;
                if (rw_status(&pf_lock) == RW_WRITE)    \
                        splassert_fail(0, rw_status(&pf_lock), __func__);\
        } while (0)
+
+#define PF_STATE_ENTER_READ()  do {                    \
+               rw_enter_read(&pf_state_lock);          \
+       } while (0)
+
+#define PF_STATE_EXIT_READ()   do {                    \
+               rw_exit_read(&pf_state_lock);           \
+       } while (0)
+
+#define PF_STATE_ENTER_WRITE() do {                    \
+               rw_enter_write(&pf_state_lock);         \
+       } while (0)
+
+#define PF_STATE_EXIT_WRITE()  do {                    \
+               PF_ASSERT_STATE_LOCKED();               \
+               rw_exit_write(&pf_state_lock);          \
+       } while (0)
+
+#define PF_ASSERT_STATE_LOCKED()       do {            \
+               if (rw_status(&pf_state_lock) != RW_WRITE)\
+                       splassert_fail(RW_WRITE,        \
+                           rw_status(&pf_state_lock), __func__);\
+       } while (0)
+
 #else /* !WITH_PF_LOCK */
 #define PF_LOCK()              (void)(0)
 #define PF_UNLOCK()            (void)(0)
 #define PF_ASSERT_LOCKED()     (void)(0)
 #define PF_ASSERT_UNLOCKED()   (void)(0)
+
+#define PF_STATE_ENTER_READ()  (void)(0)
+#define PF_STATE_EXIT_READ()   (void)(0)
+#define PF_STATE_ENTER_WRITE() (void)(0)
+#define PF_STATE_EXIT_WRITE()  (void)(0)
+#define PF_ASSERT_STATE_LOCKED()       (void)(0)
+
 #endif /* WITH_PF_LOCK */
 
 extern void                     pf_purge_timeout(void *);