From: mvs Date: Wed, 10 Feb 2021 08:20:09 +0000 (+0000) Subject: Move UNIX domain sockets out of kernel lock. The new `unp_lock' rwlock(9) X-Git-Url: http://artulab.com/gitweb/?a=commitdiff_plain;h=0da27877e7afa731b975d25f3e7a1ed560f8cb84;p=openbsd Move UNIX domain sockets out of kernel lock. The new `unp_lock' rwlock(9) used as solock()'s backend to protect the whole layer. With feedback from mpi@. ok bluhm@ claudio@ --- diff --git a/sys/kern/uipc_socket2.c b/sys/kern/uipc_socket2.c index bb0caf90af2..27887c815f9 100644 --- a/sys/kern/uipc_socket2.c +++ b/sys/kern/uipc_socket2.c @@ -1,4 +1,4 @@ -/* $OpenBSD: uipc_socket2.c,v 1.104 2020/04/11 14:07:06 claudio Exp $ */ +/* $OpenBSD: uipc_socket2.c,v 1.105 2021/02/10 08:20:09 mvs Exp $ */ /* $NetBSD: uipc_socket2.c,v 1.11 1996/02/04 02:17:55 christos Exp $ */ /* @@ -53,6 +53,8 @@ u_long sb_max = SB_MAX; /* patchable */ extern struct pool mclpools[]; extern struct pool mbpool; +extern struct rwlock unp_lock; + /* * Procedures to manipulate state flags of socket * and do appropriate wakeups. Normal sequence from the @@ -282,6 +284,8 @@ solock(struct socket *so) NET_LOCK(); break; case PF_UNIX: + rw_enter_write(&unp_lock); + break; case PF_ROUTE: case PF_KEY: default: @@ -306,6 +310,8 @@ sounlock(struct socket *so, int s) NET_UNLOCK(); break; case PF_UNIX: + rw_exit_write(&unp_lock); + break; case PF_ROUTE: case PF_KEY: default: @@ -323,6 +329,8 @@ soassertlocked(struct socket *so) NET_ASSERT_LOCKED(); break; case PF_UNIX: + rw_assert_wrlock(&unp_lock); + break; case PF_ROUTE: case PF_KEY: default: @@ -335,12 +343,24 @@ int sosleep_nsec(struct socket *so, void *ident, int prio, const char *wmesg, uint64_t nsecs) { - if ((so->so_proto->pr_domain->dom_family != PF_UNIX) && - (so->so_proto->pr_domain->dom_family != PF_ROUTE) && - (so->so_proto->pr_domain->dom_family != PF_KEY)) { - return rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs); - } else - return tsleep_nsec(ident, prio, wmesg, nsecs); + int ret; + + switch (so->so_proto->pr_domain->dom_family) { + case PF_INET: + case PF_INET6: + ret = rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs); + break; + case PF_UNIX: + ret = rwsleep_nsec(ident, &unp_lock, prio, wmesg, nsecs); + break; + case PF_ROUTE: + case PF_KEY: + default: + ret = tsleep_nsec(ident, prio, wmesg, nsecs); + break; + } + + return ret; } /* diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c index 8a735f3092e..90a5e07cbe4 100644 --- a/sys/kern/uipc_usrreq.c +++ b/sys/kern/uipc_usrreq.c @@ -1,4 +1,4 @@ -/* $OpenBSD: uipc_usrreq.c,v 1.142 2019/07/16 21:41:37 bluhm Exp $ */ +/* $OpenBSD: uipc_usrreq.c,v 1.143 2021/02/10 08:20:09 mvs Exp $ */ /* $NetBSD: uipc_usrreq.c,v 1.18 1996/02/09 19:00:50 christos Exp $ */ /* @@ -51,35 +51,35 @@ #include #include #include +#include -void uipc_setaddr(const struct unpcb *, struct mbuf *); - -/* list of all UNIX domain sockets, for unp_gc() */ -LIST_HEAD(unp_head, unpcb) unp_head = LIST_HEAD_INITIALIZER(unp_head); +/* + * Locks used to protect global data and struct members: + * I immutable after creation + * U unp_lock + */ +struct rwlock unp_lock = RWLOCK_INITIALIZER("unplock"); /* * Stack of sets of files that were passed over a socket but were * not received and need to be closed. */ struct unp_deferral { - SLIST_ENTRY(unp_deferral) ud_link; - int ud_n; + SLIST_ENTRY(unp_deferral) ud_link; /* [U] */ + int ud_n; /* [I] */ /* followed by ud_n struct fdpass */ - struct fdpass ud_fp[]; + struct fdpass ud_fp[]; /* [I] */ }; +void uipc_setaddr(const struct unpcb *, struct mbuf *); void unp_discard(struct fdpass *, int); void unp_mark(struct fdpass *, int); void unp_scan(struct mbuf *, void (*)(struct fdpass *, int)); int unp_nam2sun(struct mbuf *, struct sockaddr_un **, size_t *); struct pool unpcb_pool; -/* list of sets of files that were sent over sockets that are now closed */ -SLIST_HEAD(,unp_deferral) unp_deferred = SLIST_HEAD_INITIALIZER(unp_deferred); - struct task unp_gc_task = TASK_INITIALIZER(unp_gc, NULL); - /* * Unix communications domain. * @@ -88,14 +88,25 @@ struct task unp_gc_task = TASK_INITIALIZER(unp_gc, NULL); * rethink name space problems * need a proper out-of-band */ -struct sockaddr sun_noname = { sizeof(sun_noname), AF_UNIX }; -ino_t unp_ino; /* prototype for fake inode numbers */ +const struct sockaddr sun_noname = { sizeof(sun_noname), AF_UNIX }; + +/* [U] list of all UNIX domain sockets, for unp_gc() */ +LIST_HEAD(unp_head, unpcb) unp_head = + LIST_HEAD_INITIALIZER(unp_head); +/* [U] list of sets of files that were sent over sockets that are now closed */ +SLIST_HEAD(,unp_deferral) unp_deferred = + SLIST_HEAD_INITIALIZER(unp_deferred); + +ino_t unp_ino; /* [U] prototype for fake inode numbers */ +int unp_rights; /* [U] file descriptors in flight */ +int unp_defer; /* [U] number of deferred fp to close by the GC task */ +int unp_gcing; /* [U] GC task currently running */ void unp_init(void) { pool_init(&unpcb_pool, sizeof(struct unpcb), 0, - IPL_NONE, 0, "unpcb", NULL); + IPL_SOFTNET, 0, "unpcb", NULL); } void @@ -214,7 +225,7 @@ uipc_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam, switch (so->so_type) { case SOCK_DGRAM: { - struct sockaddr *from; + const struct sockaddr *from; if (nam) { if (unp->unp_conn) { @@ -351,14 +362,14 @@ u_long unpst_recvspace = PIPSIZ; u_long unpdg_sendspace = 2*1024; /* really max datagram size */ u_long unpdg_recvspace = 4*1024; -int unp_rights; /* file descriptors in flight */ - int uipc_attach(struct socket *so, int proto) { struct unpcb *unp; int error; + rw_assert_wrlock(&unp_lock); + if (so->so_pcb) return EISCONN; if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { @@ -407,25 +418,48 @@ uipc_detach(struct socket *so) void unp_detach(struct unpcb *unp) { - struct vnode *vp; + struct socket *so = unp->unp_socket; + struct vnode *vp = NULL; + + rw_assert_wrlock(&unp_lock); LIST_REMOVE(unp, unp_link); if (unp->unp_vnode) { + /* + * `v_socket' is only read in unp_connect and + * unplock prevents concurrent access. + */ + unp->unp_vnode->v_socket = NULL; vp = unp->unp_vnode; unp->unp_vnode = NULL; - vrele(vp); } + if (unp->unp_conn) unp_disconnect(unp); while (!SLIST_EMPTY(&unp->unp_refs)) unp_drop(SLIST_FIRST(&unp->unp_refs), ECONNRESET); - soisdisconnected(unp->unp_socket); - unp->unp_socket->so_pcb = NULL; + soisdisconnected(so); + so->so_pcb = NULL; m_freem(unp->unp_addr); pool_put(&unpcb_pool, unp); if (unp_rights) task_add(systq, &unp_gc_task); + + if (vp != NULL) { + /* + * Enforce `i_lock' -> `unplock' because fifo subsystem + * requires it. The socket can't be closed concurrently + * because the file descriptor reference is + * still hold. + */ + + sounlock(so, SL_LOCKED); + KERNEL_LOCK(); + vrele(vp); + KERNEL_UNLOCK(); + solock(so); + } } int @@ -439,6 +473,8 @@ unp_bind(struct unpcb *unp, struct mbuf *nam, struct proc *p) struct nameidata nd; size_t pathlen; + if (unp->unp_flags & (UNP_BINDING | UNP_CONNECTING)) + return (EINVAL); if (unp->unp_vnode != NULL) return (EINVAL); if ((error = unp_nam2sun(nam, &soun, &pathlen))) @@ -458,10 +494,24 @@ unp_bind(struct unpcb *unp, struct mbuf *nam, struct proc *p) NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT, UIO_SYSSPACE, soun->sun_path, p); nd.ni_pledge = PLEDGE_UNIX; + + unp->unp_flags |= UNP_BINDING; + + /* + * Enforce `i_lock' -> `unplock' because fifo subsystem + * requires it. The socket can't be closed concurrently + * because the file descriptor reference is still held. + */ + + sounlock(unp->unp_socket, SL_LOCKED); + + KERNEL_LOCK(); /* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */ - if ((error = namei(&nd)) != 0) { + error = namei(&nd); + if (error != 0) { m_freem(nam2); - return (error); + solock(unp->unp_socket); + goto out; } vp = nd.ni_vp; if (vp != NULL) { @@ -472,7 +522,9 @@ unp_bind(struct unpcb *unp, struct mbuf *nam, struct proc *p) vput(nd.ni_dvp); vrele(vp); m_freem(nam2); - return (EADDRINUSE); + error = EADDRINUSE; + solock(unp->unp_socket); + goto out; } VATTR_NULL(&vattr); vattr.va_type = VSOCK; @@ -481,8 +533,10 @@ unp_bind(struct unpcb *unp, struct mbuf *nam, struct proc *p) vput(nd.ni_dvp); if (error) { m_freem(nam2); - return (error); + solock(unp->unp_socket); + goto out; } + solock(unp->unp_socket); unp->unp_addr = nam2; vp = nd.ni_vp; vp->v_socket = unp->unp_socket; @@ -492,7 +546,11 @@ unp_bind(struct unpcb *unp, struct mbuf *nam, struct proc *p) unp->unp_connid.pid = p->p_p->ps_pid; unp->unp_flags |= UNP_FEIDSBIND; VOP_UNLOCK(vp); - return (0); +out: + KERNEL_UNLOCK(); + unp->unp_flags &= ~UNP_BINDING; + + return (error); } int @@ -505,36 +563,52 @@ unp_connect(struct socket *so, struct mbuf *nam, struct proc *p) struct nameidata nd; int error; + unp = sotounpcb(so); + if (unp->unp_flags & (UNP_BINDING | UNP_CONNECTING)) + return (EISCONN); if ((error = unp_nam2sun(nam, &soun, NULL))) return (error); NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, soun->sun_path, p); nd.ni_pledge = PLEDGE_UNIX; - if ((error = namei(&nd)) != 0) - return (error); + + unp->unp_flags |= UNP_CONNECTING; + + /* + * Enforce `i_lock' -> `unplock' because fifo subsystem + * requires it. The socket can't be closed concurrently + * because the file descriptor reference is still held. + */ + + sounlock(so, SL_LOCKED); + + KERNEL_LOCK(); + error = namei(&nd); + if (error != 0) + goto unlock; vp = nd.ni_vp; if (vp->v_type != VSOCK) { error = ENOTSOCK; - goto bad; + goto put; } if ((error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p)) != 0) - goto bad; + goto put; + solock(so); so2 = vp->v_socket; if (so2 == NULL) { error = ECONNREFUSED; - goto bad; + goto put_locked; } if (so->so_type != so2->so_type) { error = EPROTOTYPE; - goto bad; + goto put_locked; } if (so->so_proto->pr_flags & PR_CONNREQUIRED) { if ((so2->so_options & SO_ACCEPTCONN) == 0 || (so3 = sonewconn(so2, 0)) == 0) { error = ECONNREFUSED; - goto bad; + goto put_locked; } - unp = sotounpcb(so); unp2 = sotounpcb(so2); unp3 = sotounpcb(so3); if (unp2->unp_addr) @@ -551,8 +625,15 @@ unp_connect(struct socket *so, struct mbuf *nam, struct proc *p) } } error = unp_connect2(so, so2); -bad: +put_locked: + sounlock(so, SL_LOCKED); +put: vput(vp); +unlock: + KERNEL_UNLOCK(); + solock(so); + unp->unp_flags &= ~UNP_CONNECTING; + return (error); } @@ -562,6 +643,8 @@ unp_connect2(struct socket *so, struct socket *so2) struct unpcb *unp = sotounpcb(so); struct unpcb *unp2; + rw_assert_wrlock(&unp_lock); + if (so2->so_type != so->so_type) return (EPROTOTYPE); unp2 = sotounpcb(so2); @@ -635,15 +718,16 @@ unp_drop(struct unpcb *unp, int errno) { struct socket *so = unp->unp_socket; - KERNEL_ASSERT_LOCKED(); + rw_assert_wrlock(&unp_lock); so->so_error = errno; unp_disconnect(unp); if (so->so_head) { so->so_pcb = NULL; /* - * As long as the KERNEL_LOCK() is the default lock for Unix - * sockets, do not release it. + * As long as `unp_lock' is taken before entering + * uipc_usrreq() releasing it here would lead to a + * double unlock. */ sofree(so, SL_NOUNLOCK); m_freem(unp->unp_addr); @@ -685,6 +769,8 @@ unp_externalize(struct mbuf *rights, socklen_t controllen, int flags) struct file *fp; int nfds, error = 0; + rw_assert_wrlock(&unp_lock); + /* * This code only works because SCM_RIGHTS is the only supported * control message type on unix sockets. Enforce this here. @@ -705,6 +791,10 @@ unp_externalize(struct mbuf *rights, socklen_t controllen, int flags) /* Make sure the recipient should be able to see the descriptors.. */ rp = (struct fdpass *)CMSG_DATA(cm); + + /* fdp->fd_rdir requires KERNEL_LOCK() */ + KERNEL_LOCK(); + for (i = 0; i < nfds; i++) { fp = rp->fp; rp++; @@ -728,6 +818,8 @@ unp_externalize(struct mbuf *rights, socklen_t controllen, int flags) } } + KERNEL_UNLOCK(); + fds = mallocarray(nfds, sizeof(int), M_TEMP, M_WAITOK); restart: @@ -825,6 +917,8 @@ unp_internalize(struct mbuf *control, struct proc *p) int i, error; int nfds, *ip, fd, neededspace; + rw_assert_wrlock(&unp_lock); + /* * Check for two potential msg_controllen values because * IETF stuck their nose in a place it does not belong. @@ -923,8 +1017,6 @@ fail: return (error); } -int unp_defer, unp_gcing; - void unp_gc(void *arg __unused) { @@ -934,8 +1026,10 @@ unp_gc(void *arg __unused) struct unpcb *unp; int nunref, i; + rw_enter_write(&unp_lock); + if (unp_gcing) - return; + goto unlock; unp_gcing = 1; /* close any fds on the deferred list */ @@ -950,7 +1044,9 @@ unp_gc(void *arg __unused) if ((unp = fptounp(fp)) != NULL) unp->unp_msgcount--; unp_rights--; + rw_exit_write(&unp_lock); (void) closef(fp, NULL); + rw_enter_write(&unp_lock); } free(defer, M_TEMP, sizeof(*defer) + sizeof(struct fdpass) * defer->ud_n); @@ -1021,6 +1117,8 @@ unp_gc(void *arg __unused) } } unp_gcing = 0; +unlock: + rw_exit_write(&unp_lock); } void @@ -1066,6 +1164,8 @@ unp_mark(struct fdpass *rp, int nfds) struct unpcb *unp; int i; + rw_assert_wrlock(&unp_lock); + for (i = 0; i < nfds; i++) { if (rp[i].fp == NULL) continue; @@ -1088,6 +1188,8 @@ unp_discard(struct fdpass *rp, int nfds) { struct unp_deferral *defer; + rw_assert_wrlock(&unp_lock); + /* copy the file pointers to a deferral structure */ defer = malloc(sizeof(*defer) + sizeof(*rp) * nfds, M_TEMP, M_WAITOK); defer->ud_n = nfds; diff --git a/sys/sys/unpcb.h b/sys/sys/unpcb.h index daa22a11d57..0d41e1437e0 100644 --- a/sys/sys/unpcb.h +++ b/sys/sys/unpcb.h @@ -1,4 +1,4 @@ -/* $OpenBSD: unpcb.h,v 1.17 2019/07/15 12:28:06 bluhm Exp $ */ +/* $OpenBSD: unpcb.h,v 1.18 2021/02/10 08:20:09 mvs Exp $ */ /* $NetBSD: unpcb.h,v 1.6 1994/06/29 06:46:08 cgd Exp $ */ /* @@ -56,22 +56,27 @@ * Stream sockets keep copies of receive sockbuf sb_cc and sb_mbcnt * so that changes in the sockbuf may be computed to modify * back pressure on the sender accordingly. + * + * Locks used to protect struct members: + * I immutable after creation + * U unp_lock */ + struct unpcb { - struct socket *unp_socket; /* pointer back to socket */ - struct vnode *unp_vnode; /* if associated with file */ - struct file *unp_file; /* backpointer for unp_gc() */ - struct unpcb *unp_conn; /* control block of connected socket */ - ino_t unp_ino; /* fake inode number */ - SLIST_HEAD(,unpcb) unp_refs; /* referencing socket linked list */ - SLIST_ENTRY(unpcb) unp_nextref; /* link in unp_refs list */ - struct mbuf *unp_addr; /* bound address of socket */ - long unp_msgcount; /* references from socket rcv buf */ - int unp_flags; /* this unpcb contains peer eids */ - struct sockpeercred unp_connid;/* id of peer process */ - struct timespec unp_ctime; /* holds creation time */ - LIST_ENTRY(unpcb) unp_link; /* link in per-AF list of sockets */ + struct socket *unp_socket; /* [I] pointer back to socket */ + struct vnode *unp_vnode; /* [U] if associated with file */ + struct file *unp_file; /* [U] backpointer for unp_gc() */ + struct unpcb *unp_conn; /* [U] control block of connected socket */ + ino_t unp_ino; /* [U] fake inode number */ + SLIST_HEAD(,unpcb) unp_refs; /* [U] referencing socket linked list */ + SLIST_ENTRY(unpcb) unp_nextref; /* [U] link in unp_refs list */ + struct mbuf *unp_addr; /* [U] bound address of socket */ + long unp_msgcount; /* [U] references from socket rcv buf */ + int unp_flags; /* [U] this unpcb contains peer eids */ + struct sockpeercred unp_connid;/* [U] id of peer process */ + struct timespec unp_ctime; /* [I] holds creation time */ + LIST_ENTRY(unpcb) unp_link; /* [U] link in per-AF list of sockets */ }; /* @@ -82,6 +87,8 @@ struct unpcb { #define UNP_GCMARK 0x04 /* mark during unp_gc() */ #define UNP_GCDEFER 0x08 /* ref'd, but not marked in this pass */ #define UNP_GCDEAD 0x10 /* unref'd in this pass */ +#define UNP_BINDING 0x20 /* unp is binding now */ +#define UNP_CONNECTING 0x40 /* unp is connecting now */ #define sotounpcb(so) ((struct unpcb *)((so)->so_pcb))