Make unix(4) domain socket locking fine grained. Use the per-socket
author mvs <mvs@openbsd.org>
Fri, 1 Jul 2022 09:56:17 +0000 (09:56 +0000)
committer mvs <mvs@openbsd.org>
Fri, 1 Jul 2022 09:56:17 +0000 (09:56 +0000)
`so_lock' rwlock(9) instead of the global `unp_lock', which locks the
whole layer.

The PCBs of unix(4) sockets are linked to each other, so we need to lock
them both. This introduces a lock ordering problem: while thread (1)
holds the lock on `so1' and tries to lock `so2', thread (2) could hold
the lock on `so2' and try to lock `so1'. To solve this we always lock
sockets in a strict order.

For sockets which are already accessible from userland, we always lock
the socket with the smallest memory address first. Sometimes we need to
unlock a socket before locking its peer, and then lock it again.

We use reference counters to prevent destruction of the connected peer
during the relock. We also handle the case where the peer socket was
replaced by another socket.

For newly connected sockets, which are not yet exported to userland by
accept(2), we always lock the listening socket `head' first. This allows
us to avoid an unwanted relock within the accept(2) syscall.

ok claudio@

sys/kern/uipc_socket.c
sys/kern/uipc_socket2.c
sys/kern/uipc_syscalls.c
sys/kern/uipc_usrreq.c
sys/miscfs/fifofs/fifo_vnops.c
sys/sys/socketvar.h
sys/sys/unpcb.h

index fc2421e..aa856ab 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: uipc_socket.c,v 1.278 2022/06/06 14:45:41 claudio Exp $       */
+/*     $OpenBSD: uipc_socket.c,v 1.279 2022/07/01 09:56:17 mvs Exp $   */
 /*     $NetBSD: uipc_socket.c,v 1.21 1996/02/04 02:17:52 christos Exp $        */
 
 /*
@@ -52,6 +52,7 @@
 #include <sys/atomic.h>
 #include <sys/rwlock.h>
 #include <sys/time.h>
+#include <sys/refcnt.h>
 
 #ifdef DDB
 #include <machine/db_machdep.h>
@@ -146,7 +147,9 @@ soalloc(int prflags)
        so = pool_get(&socket_pool, prflags);
        if (so == NULL)
                return (NULL);
-       rw_init(&so->so_lock, "solock");
+       rw_init_flags(&so->so_lock, "solock", RWL_DUPOK);
+       refcnt_init(&so->so_refcnt);
+
        return (so);
 }
 
@@ -247,6 +250,8 @@ solisten(struct socket *so, int backlog)
 void
 sofree(struct socket *so, int keep_lock)
 {
+       int persocket = solock_persocket(so);
+
        soassertlocked(so);
 
        if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) {
@@ -255,17 +260,54 @@ sofree(struct socket *so, int keep_lock)
                return;
        }
        if (so->so_head) {
+               struct socket *head = so->so_head;
+
                /*
                 * We must not decommission a socket that's on the accept(2)
                 * queue.  If we do, then accept(2) may hang after select(2)
                 * indicated that the listening socket was ready.
                 */
-               if (!soqremque(so, 0)) {
+               if (so->so_onq == &head->so_q) {
                        if (!keep_lock)
                                sounlock(so);
                        return;
                }
+
+               if (persocket) {
+                       /*
+                        * Concurrent close of `head' could
+                        * abort `so' due to re-lock.
+                        */
+                       soref(so);
+                       soref(head);
+                       sounlock(so);
+                       solock(head);
+                       solock(so);
+
+                       if (so->so_onq != &head->so_q0) {
+                               sounlock(head);
+                               sounlock(so);
+                               sorele(head);
+                               sorele(so);
+                               return;
+                       }
+
+                       sorele(head);
+                       sorele(so);
+               }
+
+               soqremque(so, 0);
+
+               if (persocket)
+                       sounlock(head);
        }
+
+       if (persocket) {
+               sounlock(so);
+               refcnt_finalize(&so->so_refcnt, "sofinal");
+               solock(so);
+       }
+
        sigio_free(&so->so_sigio);
        klist_free(&so->so_rcv.sb_sel.si_note);
        klist_free(&so->so_snd.sb_sel.si_note);
@@ -356,13 +398,36 @@ drop:
                        error = error2;
        }
        if (so->so_options & SO_ACCEPTCONN) {
+               int persocket = solock_persocket(so);
+
+               if (persocket) {
+                       /* Wait concurrent sonewconn() threads. */
+                       while (so->so_newconn > 0) {
+                               so->so_state |= SS_NEWCONN_WAIT;
+                               sosleep_nsec(so, &so->so_newconn, PSOCK,
+                                       "netlck", INFSLP);
+                       }
+               }
+
                while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) {
+                       if (persocket)
+                               solock(so2);
                        (void) soqremque(so2, 0);
+                       if (persocket)
+                               sounlock(so);
                        (void) soabort(so2);
+                       if (persocket)
+                               solock(so);
                }
                while ((so2 = TAILQ_FIRST(&so->so_q)) != NULL) {
+                       if (persocket)
+                               solock(so2);
                        (void) soqremque(so2, 1);
+                       if (persocket)
+                               sounlock(so);
                        (void) soabort(so2);
+                       if (persocket)
+                               solock(so);
                }
        }
 discard:
@@ -430,11 +495,18 @@ soconnect(struct socket *so, struct mbuf *nam)
 int
 soconnect2(struct socket *so1, struct socket *so2)
 {
-       int error;
+       int persocket, error;
+
+       if ((persocket = solock_persocket(so1)))
+               solock_pair(so1, so2);
+       else
+               solock(so1);
 
-       solock(so1);
        error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2, NULL,
            (struct mbuf *)so2, NULL, curproc);
+
+       if (persocket)
+               sounlock(so2);
        sounlock(so1);
        return (error);
 }
index 6b0b36e..e3327e1 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: uipc_socket2.c,v 1.124 2022/06/26 05:20:42 visa Exp $ */
+/*     $OpenBSD: uipc_socket2.c,v 1.125 2022/07/01 09:56:17 mvs Exp $  */
 /*     $NetBSD: uipc_socket2.c,v 1.11 1996/02/04 02:17:55 christos Exp $       */
 
 /*
@@ -53,8 +53,6 @@ u_long        sb_max = SB_MAX;                /* patchable */
 extern struct pool mclpools[];
 extern struct pool mbpool;
 
-extern struct rwlock unp_lock;
-
 /*
  * Procedures to manipulate state flags of socket
  * and do appropriate wakeups.  Normal sequence from the
@@ -101,10 +99,37 @@ soisconnected(struct socket *so)
        soassertlocked(so);
        so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING);
        so->so_state |= SS_ISCONNECTED;
-       if (head && soqremque(so, 0)) {
+
+       if (head != NULL && so->so_onq == &head->so_q0) {
+               int persocket = solock_persocket(so);
+
+               if (persocket) {
+                       soref(so);
+                       soref(head);
+
+                       sounlock(so);
+                       solock(head);
+                       solock(so);
+
+                       if (so->so_onq != &head->so_q0) {
+                               sounlock(head);
+                               sorele(head);
+                               sorele(so);
+
+                               return;
+                       }
+
+                       sorele(head);
+                       sorele(so);
+               }
+
+               soqremque(so, 0);
                soqinsque(head, so, 1);
                sorwakeup(head);
                wakeup_one(&head->so_timeo);
+
+               if (persocket)
+                       sounlock(head);
        } else {
                wakeup(&so->so_timeo);
                sorwakeup(so);
@@ -146,7 +171,8 @@ struct socket *
 sonewconn(struct socket *head, int connstatus)
 {
        struct socket *so;
-       int soqueue = connstatus ? 1 : 0;
+       int persocket = solock_persocket(head);
+       int error;
 
        /*
         * XXXSMP as long as `so' and `head' share the same lock, we
@@ -174,10 +200,18 @@ sonewconn(struct socket *head, int connstatus)
        so->so_rgid = head->so_rgid;
        so->so_cpid = head->so_cpid;
 
+       /*
+        * Lock order will be `head' -> `so' while these sockets are linked.
+        */
+       if (persocket)
+               solock(so);
+
        /*
         * Inherit watermarks but those may get clamped in low mem situations.
         */
        if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
+               if (persocket)
+                       sounlock(so);
                pool_put(&socket_pool, so);
                return (NULL);
        }
@@ -193,20 +227,54 @@ sonewconn(struct socket *head, int connstatus)
        sigio_init(&so->so_sigio);
        sigio_copy(&so->so_sigio, &head->so_sigio);
 
-       soqinsque(head, so, soqueue);
-       if ((*so->so_proto->pr_attach)(so, 0)) {
-               (void) soqremque(so, soqueue);
+       soqinsque(head, so, 0);
+
+       /*
+        * We need to unlock `head' because PCB layer could release
+        * solock() to enforce desired lock order.
+        */
+       if (persocket) {
+               head->so_newconn++;
+               sounlock(head);
+       }
+
+       error = (*so->so_proto->pr_attach)(so, 0);
+
+       if (persocket) {
+               sounlock(so);
+               solock(head);
+               solock(so);
+
+               if ((head->so_newconn--) == 0) {
+                       if ((head->so_state & SS_NEWCONN_WAIT) != 0) {
+                               head->so_state &= ~SS_NEWCONN_WAIT;
+                               wakeup(&head->so_newconn);
+                       }
+               }
+       }
+
+       if (error) {
+               soqremque(so, 0);
+               if (persocket)
+                       sounlock(so);
                sigio_free(&so->so_sigio);
                klist_free(&so->so_rcv.sb_sel.si_note);
                klist_free(&so->so_snd.sb_sel.si_note);
                pool_put(&socket_pool, so);
                return (NULL);
        }
+
        if (connstatus) {
+               so->so_state |= connstatus;
+               soqremque(so, 0);
+               soqinsque(head, so, 1);
                sorwakeup(head);
                wakeup(&head->so_timeo);
-               so->so_state |= connstatus;
        }
+
+       if (persocket)
+               sounlock(so);
+
        return (so);
 }
 
@@ -214,6 +282,7 @@ void
 soqinsque(struct socket *head, struct socket *so, int q)
 {
        soassertlocked(head);
+       soassertlocked(so);
 
        KASSERT(so->so_onq == NULL);
 
@@ -233,6 +302,7 @@ soqremque(struct socket *so, int q)
 {
        struct socket *head = so->so_head;
 
+       soassertlocked(so);
        soassertlocked(head);
 
        if (q == 0) {
@@ -284,15 +354,40 @@ solock(struct socket *so)
        case PF_INET6:
                NET_LOCK();
                break;
-       case PF_UNIX:
-               rw_enter_write(&unp_lock);
-               break;
        default:
                rw_enter_write(&so->so_lock);
                break;
        }
 }
 
+int
+solock_persocket(struct socket *so)
+{
+       switch (so->so_proto->pr_domain->dom_family) {
+       case PF_INET:
+       case PF_INET6:
+               return 0;
+       default:
+               return 1;
+       }
+}
+
+void
+solock_pair(struct socket *so1, struct socket *so2)
+{
+       KASSERT(so1 != so2);
+       KASSERT(so1->so_type == so2->so_type);
+       KASSERT(solock_persocket(so1));
+
+       if (so1 < so2) {
+               solock(so1);
+               solock(so2);
+       } else {
+               solock(so2);
+               solock(so1);
+       }
+}
+
 void
 sounlock(struct socket *so)
 {
@@ -301,9 +396,6 @@ sounlock(struct socket *so)
        case PF_INET6:
                NET_UNLOCK();
                break;
-       case PF_UNIX:
-               rw_exit_write(&unp_lock);
-               break;
        default:
                rw_exit_write(&so->so_lock);
                break;
@@ -318,9 +410,6 @@ soassertlocked(struct socket *so)
        case PF_INET6:
                NET_ASSERT_LOCKED();
                break;
-       case PF_UNIX:
-               rw_assert_wrlock(&unp_lock);
-               break;
        default:
                rw_assert_wrlock(&so->so_lock);
                break;
@@ -338,9 +427,6 @@ sosleep_nsec(struct socket *so, void *ident, int prio, const char *wmesg,
        case PF_INET6:
                ret = rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs);
                break;
-       case PF_UNIX:
-               ret = rwsleep_nsec(ident, &unp_lock, prio, wmesg, nsecs);
-               break;
        default:
                ret = rwsleep_nsec(ident, &so->so_lock, prio, wmesg, nsecs);
                break;
index a259001..c99ee61 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: uipc_syscalls.c,v 1.195 2022/06/06 14:45:41 claudio Exp $     */
+/*     $OpenBSD: uipc_syscalls.c,v 1.196 2022/07/01 09:56:17 mvs Exp $ */
 /*     $NetBSD: uipc_syscalls.c,v 1.19 1996/02/09 19:00:48 christos Exp $      */
 
 /*
@@ -246,7 +246,7 @@ doaccept(struct proc *p, int sock, struct sockaddr *name, socklen_t *anamelen,
        socklen_t namelen;
        int error, tmpfd;
        struct socket *head, *so;
-       int cloexec, nflag;
+       int cloexec, nflag, persocket;
 
        cloexec = (flags & SOCK_CLOEXEC) ? UF_EXCLOSE : 0;
 
@@ -269,16 +269,19 @@ doaccept(struct proc *p, int sock, struct sockaddr *name, socklen_t *anamelen,
 
        head = headfp->f_data;
        solock(head);
+
+       persocket = solock_persocket(head);
+
        if (isdnssocket(head) || (head->so_options & SO_ACCEPTCONN) == 0) {
                error = EINVAL;
-               goto out;
+               goto out_unlock;
        }
        if ((headfp->f_flag & FNONBLOCK) && head->so_qlen == 0) {
                if (head->so_state & SS_CANTRCVMORE)
                        error = ECONNABORTED;
                else
                        error = EWOULDBLOCK;
-               goto out;
+               goto out_unlock;
        }
        while (head->so_qlen == 0 && head->so_error == 0) {
                if (head->so_state & SS_CANTRCVMORE) {
@@ -288,18 +291,22 @@ doaccept(struct proc *p, int sock, struct sockaddr *name, socklen_t *anamelen,
                error = sosleep_nsec(head, &head->so_timeo, PSOCK | PCATCH,
                    "netcon", INFSLP);
                if (error)
-                       goto out;
+                       goto out_unlock;
        }
        if (head->so_error) {
                error = head->so_error;
                head->so_error = 0;
-               goto out;
+               goto out_unlock;
        }
 
        /*
         * Do not sleep after we have taken the socket out of the queue.
         */
        so = TAILQ_FIRST(&head->so_q);
+
+       if (persocket)
+               solock(so);
+
        if (soqremque(so, 1) == 0)
                panic("accept");
 
@@ -310,30 +317,52 @@ doaccept(struct proc *p, int sock, struct sockaddr *name, socklen_t *anamelen,
        /* connection has been removed from the listen queue */
        KNOTE(&head->so_rcv.sb_sel.si_note, 0);
 
+       if (persocket)
+               sounlock(head);
+
        fp->f_type = DTYPE_SOCKET;
        fp->f_flag = FREAD | FWRITE | nflag;
        fp->f_ops = &socketops;
        fp->f_data = so;
+
        error = soaccept(so, nam);
-out:
-       sounlock(head);
-       if (!error && name != NULL)
+
+       if (persocket)
+               sounlock(so);
+       else
+               sounlock(head);
+
+       if (error)
+               goto out;
+
+       if (name != NULL) {
                error = copyaddrout(p, nam, name, namelen, anamelen);
-       if (!error) {
-               fdplock(fdp);
-               fdinsert(fdp, tmpfd, cloexec, fp);
-               fdpunlock(fdp);
-               FRELE(fp, p);
-               *retval = tmpfd;
-       } else {
-               fdplock(fdp);
-               fdremove(fdp, tmpfd);
-               fdpunlock(fdp);
-               closef(fp, p);
+               if (error)
+                       goto out;
        }
 
+       fdplock(fdp);
+       fdinsert(fdp, tmpfd, cloexec, fp);
+       fdpunlock(fdp);
+       FRELE(fp, p);
+       *retval = tmpfd;
+
        m_freem(nam);
        FRELE(headfp, p);
+
+       return 0;
+
+out_unlock:
+       sounlock(head);
+out:
+       fdplock(fdp);
+       fdremove(fdp, tmpfd);
+       fdpunlock(fdp);
+       closef(fp, p);
+
+       m_freem(nam);
+       FRELE(headfp, p);
+
        return (error);
 }
 
index 372a473..0710393 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: uipc_usrreq.c,v 1.165 2022/06/06 14:45:41 claudio Exp $       */
+/*     $OpenBSD: uipc_usrreq.c,v 1.166 2022/07/01 09:56:17 mvs Exp $   */
 /*     $NetBSD: uipc_usrreq.c,v 1.18 1996/02/09 19:00:50 christos Exp $        */
 
 /*
@@ -55,6 +55,7 @@
 #include <sys/mutex.h>
 #include <sys/sysctl.h>
 #include <sys/lock.h>
+#include <sys/refcnt.h>
 
 #include "kcov.h"
 #if NKCOV > 0
  *      I       immutable after creation
  *      D       unp_df_lock
  *      G       unp_gc_lock
- *      U       unp_lock
+ *      M       unp_ino_mtx
  *      R       unp_rights_mtx
  *      a       atomic
+ *      s       socket lock
  */
 
 struct rwlock unp_lock = RWLOCK_INITIALIZER("unplock");
@@ -76,6 +78,7 @@ struct rwlock unp_df_lock = RWLOCK_INITIALIZER("unpdflk");
 struct rwlock unp_gc_lock = RWLOCK_INITIALIZER("unpgclk");
 
 struct mutex unp_rights_mtx = MUTEX_INITIALIZER(IPL_SOFTNET);
+struct mutex unp_ino_mtx = MUTEX_INITIALIZER(IPL_SOFTNET);
 
 /*
  * Stack of sets of files that were passed over a socket but were
@@ -94,6 +97,9 @@ void  unp_remove_gcrefs(struct fdpass *, int);
 void   unp_restore_gcrefs(struct fdpass *, int);
 void   unp_scan(struct mbuf *, void (*)(struct fdpass *, int));
 int    unp_nam2sun(struct mbuf *, struct sockaddr_un **, size_t *);
+static inline void unp_ref(struct unpcb *);
+static inline void unp_rele(struct unpcb *);
+struct socket *unp_solock_peer(struct socket *);
 
 struct pool unpcb_pool;
 struct task unp_gc_task = TASK_INITIALIZER(unp_gc, NULL);
@@ -127,6 +133,53 @@ unp_init(void)
            IPL_SOFTNET, 0, "unpcb", NULL);
 }
 
+static inline void
+unp_ref(struct unpcb *unp)
+{
+       refcnt_take(&unp->unp_refcnt);
+}
+
+static inline void
+unp_rele(struct unpcb *unp)
+{
+       refcnt_rele_wake(&unp->unp_refcnt);
+}
+
+struct socket *
+unp_solock_peer(struct socket *so)
+{
+       struct unpcb *unp, *unp2;
+       struct socket *so2;
+
+       unp = so->so_pcb;
+
+again:
+       if ((unp2 = unp->unp_conn) == NULL)
+               return NULL;
+
+       so2 = unp2->unp_socket;
+
+       if (so < so2)
+               solock(so2);
+       else if (so > so2){
+               unp_ref(unp2);
+               sounlock(so);
+               solock(so2);
+               solock(so);
+
+               /* Datagram socket could be reconnected due to re-lock. */
+               if (unp->unp_conn != unp2) {
+                       sounlock(so2);
+                       unp_rele(unp2);
+                       goto again;
+               }
+
+               unp_rele(unp2);
+       }
+
+       return so2;
+}
+
 void
 uipc_setaddr(const struct unpcb *unp, struct mbuf *nam)
 {
@@ -201,7 +254,10 @@ uipc_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
                 * if it was bound and we are still connected
                 * (our peer may have closed already!).
                 */
+               so2 = unp_solock_peer(so);
                uipc_setaddr(unp->unp_conn, nam);
+               if (so2 != NULL && so2 != so)
+                       sounlock(so2);
                break;
 
        case PRU_SHUTDOWN:
@@ -218,9 +274,8 @@ uipc_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
 
                case SOCK_STREAM:
                case SOCK_SEQPACKET:
-                       if (unp->unp_conn == NULL)
+                       if ((so2 = unp_solock_peer(so)) == NULL)
                                break;
-                       so2 = unp->unp_conn->unp_socket;
                        /*
                         * Adjust backpressure on sender
                         * and wakeup any waiting to write.
@@ -228,6 +283,7 @@ uipc_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
                        so2->so_snd.sb_mbcnt = so->so_rcv.sb_mbcnt;
                        so2->so_snd.sb_cc = so->so_rcv.sb_cc;
                        sowwakeup(so2);
+                       sounlock(so2);
                        break;
 
                default:
@@ -256,13 +312,16 @@ uipc_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
                                error = unp_connect(so, nam, p);
                                if (error)
                                        break;
-                       } else {
-                               if (unp->unp_conn == NULL) {
+                       }
+
+                       if ((so2 = unp_solock_peer(so)) == NULL) {
+                               if (nam != NULL)
+                                       error = ECONNREFUSED;
+                               else
                                        error = ENOTCONN;
-                                       break;
-                               }
+                               break;
                        }
-                       so2 = unp->unp_conn->unp_socket;
+
                        if (unp->unp_addr)
                                from = mtod(unp->unp_addr, struct sockaddr *);
                        else
@@ -273,6 +332,10 @@ uipc_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
                                control = NULL;
                        } else
                                error = ENOBUFS;
+
+                       if (so2 != so)
+                               sounlock(so2);
+
                        if (nam)
                                unp_disconnect(unp);
                        break;
@@ -284,11 +347,11 @@ uipc_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
                                error = EPIPE;
                                break;
                        }
-                       if (unp->unp_conn == NULL) {
+                       if ((so2 = unp_solock_peer(so)) == NULL) {
                                error = ENOTCONN;
                                break;
                        }
-                       so2 = unp->unp_conn->unp_socket;
+
                        /*
                         * Send to paired receive port, and then raise
                         * send buffer counts to maintain backpressure.
@@ -310,6 +373,8 @@ uipc_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
                        so->so_snd.sb_cc = so2->so_rcv.sb_cc;
                        if (so2->so_rcv.sb_cc > 0)
                                sorwakeup(so2);
+
+                       sounlock(so2);
                        m = NULL;
                        break;
 
@@ -323,12 +388,7 @@ uipc_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
 
        case PRU_ABORT:
                unp_detach(unp);
-               /*
-                * As long as `unp_lock' is taken before entering
-                * uipc_usrreq() releasing it here would lead to a
-                * double unlock.
-                */
-               sofree(so, 1);
+               sofree(so, 0);
                break;
 
        case PRU_SENSE: {
@@ -336,8 +396,10 @@ uipc_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
 
                sb->st_blksize = so->so_snd.sb_hiwat;
                sb->st_dev = NODEV;
+               mtx_enter(&unp_ino_mtx);
                if (unp->unp_ino == 0)
                        unp->unp_ino = unp_ino++;
+               mtx_leave(&unp_ino_mtx);
                sb->st_atim.tv_sec =
                    sb->st_mtim.tv_sec =
                    sb->st_ctim.tv_sec = unp->unp_ctime.tv_sec;
@@ -358,7 +420,10 @@ uipc_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
                break;
 
        case PRU_PEERADDR:
+               so2 = unp_solock_peer(so);
                uipc_setaddr(unp->unp_conn, nam);
+               if (so2 != NULL && so2 != so)
+                       sounlock(so2);
                break;
 
        case PRU_SLOWTIMO:
@@ -410,8 +475,6 @@ uipc_attach(struct socket *so, int proto)
        struct unpcb *unp;
        int error;
 
-       rw_assert_wrlock(&unp_lock);
-
        if (so->so_pcb)
                return EISCONN;
        if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
@@ -438,6 +501,7 @@ uipc_attach(struct socket *so, int proto)
        unp = pool_get(&unpcb_pool, PR_NOWAIT|PR_ZERO);
        if (unp == NULL)
                return (ENOBUFS);
+       refcnt_init(&unp->unp_refcnt);
        unp->unp_socket = so;
        so->so_pcb = unp;
        getnanotime(&unp->unp_ctime);
@@ -445,12 +509,6 @@ uipc_attach(struct socket *so, int proto)
        /*
         * Enforce `unp_gc_lock' -> `solock()' lock order.
         */
-       /*
-        * We also release the lock on listening socket and on our peer
-        * socket when called from unp_connect(). This is safe. The
-        * listening socket protected by vnode(9) lock. The peer socket
-        * has 'UNP_CONNECTING' flag set.
-        */
        sounlock(so);
        rw_enter_write(&unp_gc_lock);
        LIST_INSERT_HEAD(&unp_head, unp, unp_link);
@@ -512,14 +570,13 @@ unp_detach(struct unpcb *unp)
 {
        struct socket *so = unp->unp_socket;
        struct vnode *vp = unp->unp_vnode;
-
-       rw_assert_wrlock(&unp_lock);
+       struct unpcb *unp2;
 
        unp->unp_vnode = NULL;
 
        /*
         * Enforce `unp_gc_lock' -> `solock()' lock order.
-        * Enforce `i_lock' -> `unp_lock' lock order.
+        * Enforce `i_lock' -> `solock()' lock order.
         */
        sounlock(so);
 
@@ -538,10 +595,47 @@ unp_detach(struct unpcb *unp)
 
        solock(so);
 
-       if (unp->unp_conn)
+       if (unp->unp_conn != NULL) {
+               /*
+                * Datagram socket could be connected to itself.
+                * Such socket will be disconnected here.
+                */
                unp_disconnect(unp);
-       while (!SLIST_EMPTY(&unp->unp_refs))
-               unp_drop(SLIST_FIRST(&unp->unp_refs), ECONNRESET);
+       }
+
+       while ((unp2 = SLIST_FIRST(&unp->unp_refs)) != NULL) {
+               struct socket *so2 = unp2->unp_socket;
+
+               if (so < so2)
+                       solock(so2);
+               else {
+                       unp_ref(unp2);
+                       sounlock(so);
+                       solock(so2);
+                       solock(so);
+
+                       if (unp2->unp_conn != unp) {
+                               /* `unp2' was disconnected due to re-lock. */
+                               sounlock(so2);
+                               unp_rele(unp2);
+                               continue;
+                       }
+
+                       unp_rele(unp2);
+               }
+
+               unp2->unp_conn = NULL;
+               SLIST_REMOVE(&unp->unp_refs, unp2, unpcb, unp_nextref);
+               so2->so_error = ECONNRESET;
+               so2->so_state &= ~SS_ISCONNECTED;
+
+               sounlock(so2);
+       }
+
+       sounlock(so);
+       refcnt_finalize(&unp->unp_refcnt, "unpfinal");
+       solock(so);
+
        soisdisconnected(so);
        so->so_pcb = NULL;
        m_freem(unp->unp_addr);
@@ -681,24 +775,42 @@ unp_connect(struct socket *so, struct mbuf *nam, struct proc *p)
        }
        if ((error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p)) != 0)
                goto put;
-       solock(so);
        so2 = vp->v_socket;
        if (so2 == NULL) {
                error = ECONNREFUSED;
-               goto put_locked;
+               goto put;
        }
        if (so->so_type != so2->so_type) {
                error = EPROTOTYPE;
-               goto put_locked;
+               goto put;
        }
+
        if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
+               solock(so2);
+
                if ((so2->so_options & SO_ACCEPTCONN) == 0 ||
                    (so3 = sonewconn(so2, 0)) == NULL) {
                        error = ECONNREFUSED;
-                       goto put_locked;
                }
+
+               sounlock(so2);
+
+               if (error != 0)
+                       goto put;
+
+               /*
+                * Since `so2' is protected by vnode(9) lock, `so3'
+                * can't be PRU_ABORT'ed here.
+                */
+               solock_pair(so, so3);
+
                unp2 = sotounpcb(so2);
                unp3 = sotounpcb(so3);
+
+               /*
+                * `unp_addr', `unp_connid' and 'UNP_FEIDSBIND' flag
+                * are immutable since we set them in unp_bind().
+                */
                if (unp2->unp_addr)
                        unp3->unp_addr =
                            m_copym(unp2->unp_addr, 0, M_COPYALL, M_NOWAIT);
@@ -706,15 +818,29 @@ unp_connect(struct socket *so, struct mbuf *nam, struct proc *p)
                unp3->unp_connid.gid = p->p_ucred->cr_gid;
                unp3->unp_connid.pid = p->p_p->ps_pid;
                unp3->unp_flags |= UNP_FEIDS;
-               so2 = so3;
+
                if (unp2->unp_flags & UNP_FEIDSBIND) {
                        unp->unp_connid = unp2->unp_connid;
                        unp->unp_flags |= UNP_FEIDS;
                }
+
+               so2 = so3;
+       } else {
+               if (so2 != so)
+                       solock_pair(so, so2);
+               else
+                       solock(so);
        }
+
        error = unp_connect2(so, so2);
-put_locked:
+
        sounlock(so);
+
+       /*
+        * `so2' can't be PRU_ABORT'ed concurrently
+        */
+       if (so2 != so)
+               sounlock(so2);
 put:
        vput(vp);
 unlock:
@@ -738,7 +864,8 @@ unp_connect2(struct socket *so, struct socket *so2)
        struct unpcb *unp = sotounpcb(so);
        struct unpcb *unp2;
 
-       rw_assert_wrlock(&unp_lock);
+       soassertlocked(so);
+       soassertlocked(so2);
 
        if (so2->so_type != so->so_type)
                return (EPROTOTYPE);
@@ -767,11 +894,15 @@ unp_connect2(struct socket *so, struct socket *so2)
 void
 unp_disconnect(struct unpcb *unp)
 {
-       struct unpcb *unp2 = unp->unp_conn;
+       struct socket *so2;
+       struct unpcb *unp2;
 
-       if (unp2 == NULL)
+       if ((so2 = unp_solock_peer(unp->unp_socket)) == NULL)
                return;
+
+       unp2 = unp->unp_conn;
        unp->unp_conn = NULL;
+
        switch (unp->unp_socket->so_type) {
 
        case SOCK_DGRAM:
@@ -790,35 +921,31 @@ unp_disconnect(struct unpcb *unp)
                soisdisconnected(unp2->unp_socket);
                break;
        }
+
+       if (so2 != unp->unp_socket)
+               sounlock(so2);
 }
 
 void
 unp_shutdown(struct unpcb *unp)
 {
-       struct socket *so;
+       struct socket *so2;
 
        switch (unp->unp_socket->so_type) {
        case SOCK_STREAM:
        case SOCK_SEQPACKET:
-               if (unp->unp_conn && (so = unp->unp_conn->unp_socket))
-                       socantrcvmore(so);
+               if ((so2 = unp_solock_peer(unp->unp_socket)) == NULL)
+                       break;
+               
+               socantrcvmore(so2);
+               sounlock(so2);
+
                break;
        default:
                break;
        }
 }
 
-void
-unp_drop(struct unpcb *unp, int errno)
-{
-       struct socket *so = unp->unp_socket;
-
-       rw_assert_wrlock(&unp_lock);
-
-       so->so_error = errno;
-       unp_disconnect(unp);
-}
-
 #ifdef notdef
 unp_drain(void)
 {
index ff62072..d4f5df4 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: fifo_vnops.c,v 1.95 2022/06/26 05:20:42 visa Exp $    */
+/*     $OpenBSD: fifo_vnops.c,v 1.96 2022/07/01 09:56:17 mvs Exp $     */
 /*     $NetBSD: fifo_vnops.c,v 1.18 1996/03/16 23:52:42 christos Exp $ */
 
 /*
@@ -176,15 +176,17 @@ fifo_open(void *v)
                solock(wso);
                wso->so_state |= SS_CANTSENDMORE;
                wso->so_snd.sb_lowat = PIPE_BUF;
+               sounlock(wso);
        } else {
                rso = fip->fi_readsock;
                wso = fip->fi_writesock;
-               solock(wso);
        }
        if (ap->a_mode & FREAD) {
                fip->fi_readers++;
                if (fip->fi_readers == 1) {
+                       solock(wso);
                        wso->so_state &= ~SS_CANTSENDMORE;
+                       sounlock(wso);
                        if (fip->fi_writers > 0)
                                wakeup(&fip->fi_writers);
                }
@@ -193,16 +195,16 @@ fifo_open(void *v)
                fip->fi_writers++;
                if ((ap->a_mode & O_NONBLOCK) && fip->fi_readers == 0) {
                        error = ENXIO;
-                       sounlock(wso);
                        goto bad;
                }
                if (fip->fi_writers == 1) {
+                       solock(rso);
                        rso->so_state &= ~(SS_CANTRCVMORE|SS_ISDISCONNECTED);
+                       sounlock(rso);
                        if (fip->fi_readers > 0)
                                wakeup(&fip->fi_readers);
                }
        }
-       sounlock(wso);
        if ((ap->a_mode & O_NONBLOCK) == 0) {
                if ((ap->a_mode & FREAD) && fip->fi_writers == 0) {
                        VOP_UNLOCK(vp);
index 7e899ae..f82c1b0 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: socketvar.h,v 1.104 2022/06/26 05:20:42 visa Exp $    */
+/*     $OpenBSD: socketvar.h,v 1.105 2022/07/01 09:56:17 mvs Exp $     */
 /*     $NetBSD: socketvar.h,v 1.18 1996/02/09 18:25:38 christos Exp $  */
 
 /*-
@@ -38,6 +38,7 @@
 #include <sys/task.h>
 #include <sys/timeout.h>
 #include <sys/rwlock.h>
+#include <sys/refcnt.h>
 
 #ifndef        _SOCKLEN_T_DEFINED_
 #define        _SOCKLEN_T_DEFINED_
@@ -55,6 +56,7 @@ TAILQ_HEAD(soqhead, socket);
 struct socket {
        const struct protosw *so_proto; /* protocol handle */
        struct rwlock so_lock;          /* this socket lock */
+       struct refcnt so_refcnt;        /* references to this socket */
        void    *so_pcb;                /* protocol control block */
        u_int   so_state;               /* internal state flags SS_*, below */
        short   so_type;                /* generic type, see socket.h */
@@ -80,6 +82,7 @@ struct socket {
        short   so_q0len;               /* partials on so_q0 */
        short   so_qlen;                /* number of connections on so_q */
        short   so_qlimit;              /* max number queued connections */
+       u_long  so_newconn;             /* # of pending sonewconn() threads */
        short   so_timeo;               /* connection timeout */
        u_long  so_oobmark;             /* chars to oob mark */
        u_int   so_error;               /* error affecting connection */
@@ -149,6 +152,7 @@ struct socket {
 #define        SS_CONNECTOUT           0x1000  /* connect, not accept, at this end */
 #define        SS_ISSENDING            0x2000  /* hint for lower layer */
 #define        SS_DNS                  0x4000  /* created using SOCK_DNS socket(2) */
+#define        SS_NEWCONN_WAIT         0x8000  /* waiting sonewconn() relock */
 
 #ifdef _KERNEL
 
@@ -156,6 +160,18 @@ struct socket {
 
 void   soassertlocked(struct socket *);
 
+static inline void
+soref(struct socket *so)
+{
+       refcnt_take(&so->so_refcnt);    /* take one more reference on `so' */
+}
+
+static inline void
+sorele(struct socket *so)
+{
+       refcnt_rele_wake(&so->so_refcnt);       /* drop a reference on `so' */
+}
+
 /*
  * Macros for sockets and socket buffering.
  */
@@ -329,6 +345,8 @@ int sockargs(struct mbuf **, const void *, size_t, int);
 
 int    sosleep_nsec(struct socket *, void *, int, const char *, uint64_t);
 void   solock(struct socket *);
+int    solock_persocket(struct socket *);
+void   solock_pair(struct socket *, struct socket *);
 void   sounlock(struct socket *);
 
 int    sendit(struct proc *, int, struct msghdr *, int, register_t *);
index b3641bd..54d083b 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: unpcb.h,v 1.25 2022/02/25 23:51:04 guenther Exp $     */
+/*     $OpenBSD: unpcb.h,v 1.26 2022/07/01 09:56:17 mvs Exp $  */
 /*     $NetBSD: unpcb.h,v 1.6 1994/06/29 06:46:08 cgd Exp $    */
 
 /*
@@ -32,6 +32,8 @@
  *     @(#)unpcb.h     8.1 (Berkeley) 6/2/93
  */
 
+#include <sys/refcnt.h>
+
 /*
  * Protocol control block for an active
  * instance of a UNIX internal protocol.
  * Locks used to protect struct members:
  *      I       immutable after creation
  *      G       unp_gc_lock
- *      U       unp_lock
+ *      s       socket lock
  */
 
 
 struct unpcb {
+       struct  refcnt unp_refcnt;      /* references to this pcb */
        struct  socket *unp_socket;     /* [I] pointer back to socket */
-       struct  vnode *unp_vnode;       /* [U] if associated with file */
+       struct  vnode *unp_vnode;       /* [s] if associated with file */
        struct  file *unp_file;         /* [G] backpointer for unp_gc() */
-       struct  unpcb *unp_conn;        /* [U] control block of connected socket */
-       ino_t   unp_ino;                /* [U] fake inode number */
-       SLIST_HEAD(,unpcb) unp_refs;    /* [U] referencing socket linked list */
-       SLIST_ENTRY(unpcb) unp_nextref; /* [U] link in unp_refs list */
-       struct  mbuf *unp_addr;         /* [U] bound address of socket */
+       struct  unpcb *unp_conn;        /* [s] control block of connected
+                                               socket */
+       ino_t   unp_ino;                /* [s] fake inode number */
+       SLIST_HEAD(,unpcb) unp_refs;    /* [s] referencing socket linked list */
+       SLIST_ENTRY(unpcb) unp_nextref; /* [s] link in unp_refs list */
+       struct  mbuf *unp_addr;         /* [s] bound address of socket */
        long    unp_msgcount;           /* [G] references from socket rcv buf */
        long    unp_gcrefs;             /* [G] references from gc */
-       int     unp_flags;              /* [U] this unpcb contains peer eids */
+       int     unp_flags;              /* [s] this unpcb contains peer eids */
        int     unp_gcflags;            /* [G] garbage collector flags */
-       struct  sockpeercred unp_connid;/* [U] id of peer process */
+       struct  sockpeercred unp_connid;/* [s] id of peer process */
        struct  timespec unp_ctime;     /* [I] holds creation time */
        LIST_ENTRY(unpcb) unp_link;     /* [G] link in per-AF list of sockets */
 };
@@ -114,7 +118,6 @@ int unp_connect(struct socket *, struct mbuf *, struct proc *);
 int    unp_connect2(struct socket *, struct socket *);
 void   unp_detach(struct unpcb *);
 void   unp_disconnect(struct unpcb *);
-void   unp_drop(struct unpcb *, int);
 void   unp_gc(void *);
 void   unp_shutdown(struct unpcb *);
 int    unp_externalize(struct mbuf *, socklen_t, int);