Move UNIX domain sockets garbage collector out of `unp_lock.
authormvs <mvs@openbsd.org>
Tue, 16 Nov 2021 08:56:19 +0000 (08:56 +0000)
committermvs <mvs@openbsd.org>
Tue, 16 Nov 2021 08:56:19 +0000 (08:56 +0000)
Except `unp_ino' this leaves only per-socket data protected by
`unp_lock'. The `unp_ino' protection is not the big deal and will be
done with mutex(9) in the future diff.

The garbage collector flags moved from from `unp_flags' to unp_gcflags'.

The two new locks introduced to protect garbage collector data. The
`unp_gc_lock' rwlock(9) protects `unp_defer', `unp_gcing', `unp_gcflags'
and `unp_link' list. The `unp_df_lock' protects `ud_link' list.

We need to simultaneously lock `unp_gc_lock' and `unp_lock'. When we
perform unp_attach() or unp_detach() we link PCB to `unp_link' list with
`unp_lock' held. But when unp_gc() does `unp_link' list walkthrough with
the `unp_gc_lock' lock held it should lock socket while performs
`so_rcv' buffer scan and the lock order should be the opposite.

In the future diff `unp_lock' will be replaced by per-socket `so_lock'
so it's better to enforce `unp_gc_lock' -> `unp_lock' (solock()) lock
order and release `unp_lock' in the unp_attach() and unp_detach() paths.
The previously committed diffs made this safe.

The `unp_df_lock' introduced because the `unp_lock' and `unp_gc_lock'
state are unknown when unp_discard() called. Since it touches only
`ud_link' list the re-lock dances are unwanted in this path. Also this
keeps M_WAITOK allocation outside rwlock(9) when unp_discard() called
from unp_externalize() error path.

ok bluhm@

sys/kern/uipc_usrreq.c
sys/sys/unpcb.h

index d819f43..2db67d7 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: uipc_usrreq.c,v 1.156 2021/11/11 18:36:59 mvs Exp $   */
+/*     $OpenBSD: uipc_usrreq.c,v 1.157 2021/11/16 08:56:19 mvs Exp $   */
 /*     $NetBSD: uipc_usrreq.c,v 1.18 1996/02/09 19:00:50 christos Exp $        */
 
 /*
 /*
  * Locks used to protect global data and struct members:
  *      I       immutable after creation
+ *      D       unp_df_lock
+ *      G       unp_gc_lock
  *      U       unp_lock
  *      R       unp_rights_mtx
  *      a       atomic
  */
 
 struct rwlock unp_lock = RWLOCK_INITIALIZER("unplock");
+struct rwlock unp_df_lock = RWLOCK_INITIALIZER("unpdflk");
+struct rwlock unp_gc_lock = RWLOCK_INITIALIZER("unpgclk");
+
 struct mutex unp_rights_mtx = MUTEX_INITIALIZER(IPL_SOFTNET);
 
 /*
@@ -72,7 +77,7 @@ struct mutex unp_rights_mtx = MUTEX_INITIALIZER(IPL_SOFTNET);
  * not received and need to be closed.
  */
 struct unp_deferral {
-       SLIST_ENTRY(unp_deferral)       ud_link;        /* [U] */
+       SLIST_ENTRY(unp_deferral)       ud_link;        /* [D] */
        int                             ud_n;           /* [I] */
        /* followed by ud_n struct fdpass */
        struct fdpass                   ud_fp[];        /* [I] */
@@ -97,17 +102,17 @@ struct task unp_gc_task = TASK_INITIALIZER(unp_gc, NULL);
  */
 const struct   sockaddr sun_noname = { sizeof(sun_noname), AF_UNIX };
 
-/* [U] list of all UNIX domain sockets, for unp_gc() */
+/* [G] list of all UNIX domain sockets, for unp_gc() */
 LIST_HEAD(unp_head, unpcb)     unp_head =
        LIST_HEAD_INITIALIZER(unp_head);
-/* [U] list of sets of files that were sent over sockets that are now closed */
+/* [D] list of sets of files that were sent over sockets that are now closed */
 SLIST_HEAD(,unp_deferral)      unp_deferred =
        SLIST_HEAD_INITIALIZER(unp_deferred);
 
 ino_t  unp_ino;        /* [U] prototype for fake inode numbers */
 int    unp_rights;     /* [R] file descriptors in flight */
-int    unp_defer;      /* [U] number of deferred fp to close by the GC task */
-int    unp_gcing;      /* [U] GC task currently running */
+int    unp_defer;      /* [G] number of deferred fp to close by the GC task */
+int    unp_gcing;      /* [G] GC task currently running */
 
 void
 unp_init(void)
@@ -430,7 +435,21 @@ uipc_attach(struct socket *so, int proto)
        unp->unp_socket = so;
        so->so_pcb = unp;
        getnanotime(&unp->unp_ctime);
+
+       /*
+        * Enforce `unp_gc_lock' -> `solock()' lock order.
+        */
+       /*
+        * We also release the lock on listening socket and on our peer
+        * socket when called from unp_connect(). This is safe. The
+        * listening socket protected by vnode(9) lock. The peer socket
+        * has 'UNP_CONNECTING' flag set.
+        */
+       sounlock(so, SL_LOCKED);
+       rw_enter_write(&unp_gc_lock);
        LIST_INSERT_HEAD(&unp_head, unp, unp_link);
+       rw_exit_write(&unp_gc_lock);
+       solock(so);
        return (0);
 }
 
@@ -490,28 +509,29 @@ unp_detach(struct unpcb *unp)
 
        rw_assert_wrlock(&unp_lock);
 
-       LIST_REMOVE(unp, unp_link);
+       unp->unp_vnode = NULL;
 
-       if (vp != NULL) {
-               unp->unp_vnode = NULL;
-
-               /*
-                * Enforce `i_lock' -> `unp_lock' because fifo
-                * subsystem requires it.
-                */
+       /*
+        * Enforce `unp_gc_lock' -> `solock()' lock order.
+        * Enforce `i_lock' -> `unp_lock' lock order.
+        */
+       sounlock(so, SL_LOCKED);
 
-               sounlock(so, SL_LOCKED);
+       rw_enter_write(&unp_gc_lock);
+       LIST_REMOVE(unp, unp_link);
+       rw_exit_write(&unp_gc_lock);
 
+       if (vp != NULL) {
                VOP_LOCK(vp, LK_EXCLUSIVE);
                vp->v_socket = NULL;
 
                KERNEL_LOCK();
                vput(vp);
                KERNEL_UNLOCK();
-
-               solock(so);
        }
 
+       solock(so);
+
        if (unp->unp_conn)
                unp_disconnect(unp);
        while (!SLIST_EMPTY(&unp->unp_refs))
@@ -954,10 +974,11 @@ restart:
 
        if (error) {
                if (nfds > 0) {
+                       /*
+                        * No lock required. We are the only `cm' holder.
+                        */
                        rp = ((struct fdpass *)CMSG_DATA(cm));
-                       rw_enter_write(&unp_lock);
                        unp_discard(rp, nfds);
-                       rw_exit_write(&unp_lock);
                }
        }
 
@@ -1093,16 +1114,17 @@ unp_gc(void *arg __unused)
        struct unpcb *unp;
        int nunref, i;
 
-       rw_enter_write(&unp_lock);
-
+       rw_enter_write(&unp_gc_lock);
        if (unp_gcing)
                goto unlock;
        unp_gcing = 1;
+       rw_exit_write(&unp_gc_lock);
 
+       rw_enter_write(&unp_df_lock);
        /* close any fds on the deferred list */
        while ((defer = SLIST_FIRST(&unp_deferred)) != NULL) {
                SLIST_REMOVE_HEAD(&unp_deferred, ud_link);
-               rw_exit_write(&unp_lock);
+               rw_exit_write(&unp_df_lock);
                for (i = 0; i < defer->ud_n; i++) {
                        fp = defer->ud_fp[i].fp;
                        if (fp == NULL)
@@ -1118,25 +1140,27 @@ unp_gc(void *arg __unused)
                }
                free(defer, M_TEMP, sizeof(*defer) +
                    sizeof(struct fdpass) * defer->ud_n);
-               rw_enter_write(&unp_lock);
+               rw_enter_write(&unp_df_lock);
        }
+       rw_exit_write(&unp_df_lock);
 
+       rw_enter_write(&unp_gc_lock);
        unp_defer = 0;
        LIST_FOREACH(unp, &unp_head, unp_link)
-               unp->unp_flags &= ~(UNP_GCMARK | UNP_GCDEFER | UNP_GCDEAD);
+               unp->unp_gcflags = 0;
        do {
                nunref = 0;
                LIST_FOREACH(unp, &unp_head, unp_link) {
                        fp = unp->unp_file;
-                       if (unp->unp_flags & UNP_GCDEFER) {
+                       if (unp->unp_gcflags & UNP_GCDEFER) {
                                /*
                                 * This socket is referenced by another
                                 * socket which is known to be live,
                                 * so it's certainly live.
                                 */
-                               unp->unp_flags &= ~UNP_GCDEFER;
+                               unp->unp_gcflags &= ~UNP_GCDEFER;
                                unp_defer--;
-                       } else if (unp->unp_flags & UNP_GCMARK) {
+                       } else if (unp->unp_gcflags & UNP_GCMARK) {
                                /* marked as live in previous pass */
                                continue;
                        } else if (fp == NULL) {
@@ -1155,7 +1179,7 @@ unp_gc(void *arg __unused)
                                 */
                                if (fp->f_count == unp->unp_msgcount) {
                                        nunref++;
-                                       unp->unp_flags |= UNP_GCDEAD;
+                                       unp->unp_gcflags |= UNP_GCDEAD;
                                        continue;
                                }
                        }
@@ -1167,10 +1191,12 @@ unp_gc(void *arg __unused)
                         * sockets and note them as deferred (== referenced,
                         * but not yet marked).
                         */
-                       unp->unp_flags |= UNP_GCMARK;
+                       unp->unp_gcflags |= UNP_GCMARK;
 
                        so = unp->unp_socket;
+                       solock(so);
                        unp_scan(so->so_rcv.sb_mb, unp_mark);
+                       sounlock(so, SL_LOCKED);
                }
        } while (unp_defer);
 
@@ -1180,14 +1206,23 @@ unp_gc(void *arg __unused)
         */
        if (nunref) {
                LIST_FOREACH(unp, &unp_head, unp_link) {
-                       if (unp->unp_flags & UNP_GCDEAD)
-                               unp_scan(unp->unp_socket->so_rcv.sb_mb,
-                                   unp_discard);
+                       if (unp->unp_gcflags & UNP_GCDEAD) {
+                               /*
+                                * This socket could still be connected
+                                * and if so it's `so_rcv' is still
+                                * accessible by concurrent PRU_SEND
+                                * thread.
+                                */
+                               so = unp->unp_socket;
+                               solock(so);
+                               unp_scan(so->so_rcv.sb_mb, unp_discard);
+                               sounlock(so, SL_LOCKED);
+                       }
                }
        }
        unp_gcing = 0;
 unlock:
-       rw_exit_write(&unp_lock);
+       rw_exit_write(&unp_gc_lock);
 }
 
 void
@@ -1233,7 +1268,7 @@ unp_mark(struct fdpass *rp, int nfds)
        struct unpcb *unp;
        int i;
 
-       rw_assert_wrlock(&unp_lock);
+       rw_assert_wrlock(&unp_gc_lock);
 
        for (i = 0; i < nfds; i++) {
                if (rp[i].fp == NULL)
@@ -1243,12 +1278,12 @@ unp_mark(struct fdpass *rp, int nfds)
                if (unp == NULL)
                        continue;
 
-               if (unp->unp_flags & (UNP_GCMARK|UNP_GCDEFER))
+               if (unp->unp_gcflags & (UNP_GCMARK|UNP_GCDEFER))
                        continue;
 
                unp_defer++;
-               unp->unp_flags |= UNP_GCDEFER;
-               unp->unp_flags &= ~UNP_GCDEAD;
+               unp->unp_gcflags |= UNP_GCDEFER;
+               unp->unp_gcflags &= ~UNP_GCDEAD;
        }
 }
 
@@ -1257,14 +1292,15 @@ unp_discard(struct fdpass *rp, int nfds)
 {
        struct unp_deferral *defer;
 
-       rw_assert_wrlock(&unp_lock);
-
        /* copy the file pointers to a deferral structure */
        defer = malloc(sizeof(*defer) + sizeof(*rp) * nfds, M_TEMP, M_WAITOK);
        defer->ud_n = nfds;
        memcpy(&defer->ud_fp[0], rp, sizeof(*rp) * nfds);
        memset(rp, 0, sizeof(*rp) * nfds);
+
+       rw_enter_write(&unp_df_lock);
        SLIST_INSERT_HEAD(&unp_deferred, defer, ud_link);
+       rw_exit_write(&unp_df_lock);
 
        task_add(systqmp, &unp_gc_task);
 }
index ab195ab..7111ea9 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: unpcb.h,v 1.19 2021/11/06 17:35:14 mvs Exp $  */
+/*     $OpenBSD: unpcb.h,v 1.20 2021/11/16 08:56:20 mvs Exp $  */
 /*     $NetBSD: unpcb.h,v 1.6 1994/06/29 06:46:08 cgd Exp $    */
 
 /*
@@ -59,6 +59,7 @@
  *
  * Locks used to protect struct members:
  *      I       immutable after creation
+ *      G       unp_gc_lock
  *      U       unp_lock
  *      a       atomic
  */
@@ -75,9 +76,10 @@ struct       unpcb {
        struct  mbuf *unp_addr;         /* [U] bound address of socket */
        long    unp_msgcount;           /* [a] references from socket rcv buf */
        int     unp_flags;              /* [U] this unpcb contains peer eids */
+       int     unp_gcflags;            /* [G] garbge collector flags */
        struct  sockpeercred unp_connid;/* [U] id of peer process */
        struct  timespec unp_ctime;     /* [I] holds creation time */
-       LIST_ENTRY(unpcb) unp_link;     /* [U] link in per-AF list of sockets */
+       LIST_ENTRY(unpcb) unp_link;     /* [G] link in per-AF list of sockets */
 };
 
 /*
@@ -85,11 +87,15 @@ struct      unpcb {
  */
 #define UNP_FEIDS      0x01            /* unp_connid contains information */
 #define UNP_FEIDSBIND  0x02            /* unp_connid was set by a bind */
-#define UNP_GCMARK     0x04            /* mark during unp_gc() */
-#define UNP_GCDEFER    0x08            /* ref'd, but not marked in this pass */
-#define UNP_GCDEAD     0x10            /* unref'd in this pass */
-#define UNP_BINDING    0x20            /* unp is binding now */
-#define UNP_CONNECTING 0x40            /* unp is connecting now */
+#define UNP_BINDING    0x04            /* unp is binding now */
+#define UNP_CONNECTING 0x08            /* unp is connecting now */
+
+/*
+ * flag bits in unp_gcflags
+ */
+#define UNP_GCMARK     0x01            /* mark during unp_gc() */
+#define UNP_GCDEFER    0x02            /* ref'd, but not marked in this pass */
+#define UNP_GCDEAD     0x04            /* unref'd in this pass */
 
 #define        sotounpcb(so)   ((struct unpcb *)((so)->so_pcb))