From 531d726ab58f7926899b170459479c6680cf2c97 Mon Sep 17 00:00:00 2001
From: bluhm
Date: Mon, 18 Dec 2023 13:11:20 +0000
Subject: [PATCH] Run bind(2) system call in parallel.

For protocols that care about locking, use the shared net lock to
call sobind().  Use the per socket rwlock together with the shared
net lock.  This affects protocols UDP, raw IP, and divert.  Move the
inpcb mutex locking into soreceive(); it is only used there.  Add a
comment to describe the current implementation of inpcb locking.

OK mvs@ sashan@
---
 sys/kern/uipc_socket.c   | 12 +++++++++++-
 sys/kern/uipc_socket2.c  | 10 +++++-----
 sys/kern/uipc_syscalls.c |  6 +++---
 sys/netinet/in_pcb.h     | 34 +++++++++++++++++++++++++++++++++-
 sys/sys/protosw.h        |  8 +++++---
 5 files changed, 57 insertions(+), 13 deletions(-)

diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c
index 6c2f93cda67..7a3062cd87f 100644
--- a/sys/kern/uipc_socket.c
+++ b/sys/kern/uipc_socket.c
@@ -1,4 +1,4 @@
-/*	$OpenBSD: uipc_socket.c,v 1.309 2023/08/08 22:07:25 mvs Exp $	*/
+/*	$OpenBSD: uipc_socket.c,v 1.310 2023/12/18 13:11:20 bluhm Exp $	*/
 /*	$NetBSD: uipc_socket.c,v 1.21 1996/02/04 02:17:52 christos Exp $	*/
 
 /*
@@ -832,8 +832,10 @@ bad:
 		*mp = NULL;
 
 	solock_shared(so);
+	pru_lock(so);
 restart:
 	if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0) {
+		pru_unlock(so);
 		sounlock_shared(so);
 		return (error);
 	}
@@ -900,11 +902,13 @@ restart:
 		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
 		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
 		sbunlock(so, &so->so_rcv);
+		pru_unlock(so);
 		error = sbwait(so, &so->so_rcv);
 		if (error) {
 			sounlock_shared(so);
 			return (error);
 		}
+		pru_lock(so);
 		goto restart;
 	}
 dontblock:
@@ -971,11 +975,13 @@ dontblock:
 			sbsync(&so->so_rcv, nextrecord);
 			if (controlp) {
 				if (pr->pr_domain->dom_externalize) {
+					pru_unlock(so);
 					sounlock_shared(so);
 					error =
 					    (*pr->pr_domain->dom_externalize)
 					    (cm, controllen, flags);
 					solock_shared(so);
+					pru_lock(so);
 				}
 				*controlp = cm;
 			} else {
@@ -1049,9 +1055,11 @@ dontblock:
 			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
 			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
 			resid = uio->uio_resid;
+			pru_unlock(so);
 			sounlock_shared(so);
 			uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio);
 			solock_shared(so);
+			pru_lock(so);
 			if (uio_error)
 				uio->uio_resid = resid - len;
 		} else
@@ -1136,6 +1144,7 @@ dontblock:
 			error = sbwait(so, &so->so_rcv);
 			if (error) {
 				sbunlock(so, &so->so_rcv);
+				pru_unlock(so);
 				sounlock_shared(so);
 				return (0);
 			}
@@ -1182,6 +1191,7 @@ dontblock:
 		*flagsp |= flags;
 release:
 	sbunlock(so, &so->so_rcv);
+	pru_unlock(so);
 	sounlock_shared(so);
 	return (error);
 }
diff --git a/sys/kern/uipc_socket2.c b/sys/kern/uipc_socket2.c
index f21e0e20ab8..18f7746f611 100644
--- a/sys/kern/uipc_socket2.c
+++ b/sys/kern/uipc_socket2.c
@@ -1,4 +1,4 @@
-/*	$OpenBSD: uipc_socket2.c,v 1.138 2023/10/30 13:27:53 bluhm Exp $	*/
+/*	$OpenBSD: uipc_socket2.c,v 1.139 2023/12/18 13:11:20 bluhm Exp $	*/
 /*	$NetBSD: uipc_socket2.c,v 1.11 1996/02/04 02:17:55 christos Exp $	*/
 
 /*
@@ -368,7 +368,7 @@ solock_shared(struct socket *so)
 	case PF_INET6:
 		if (so->so_proto->pr_usrreqs->pru_lock != NULL) {
 			NET_LOCK_SHARED();
-			pru_lock(so);
+			rw_enter_write(&so->so_lock);
 		} else
 			NET_LOCK();
 		break;
@@ -427,7 +427,7 @@ sounlock_shared(struct socket *so)
 	case PF_INET:
 	case PF_INET6:
 		if (so->so_proto->pr_usrreqs->pru_unlock != NULL) {
-			pru_unlock(so);
+			rw_exit_write(&so->so_lock);
 			NET_UNLOCK_SHARED();
 		} else
 			NET_UNLOCK();
 		break;
@@ -463,12 +463,12 @@ sosleep_nsec(struct socket *so, void *ident, int prio, const char *wmesg,
 	case PF_INET6:
 		if (so->so_proto->pr_usrreqs->pru_unlock != NULL &&
 		    rw_status(&netlock) == RW_READ) {
-			pru_unlock(so);
+			rw_exit_write(&so->so_lock);
 		}
 		ret = rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs);
 		if (so->so_proto->pr_usrreqs->pru_lock != NULL &&
 		    rw_status(&netlock) == RW_READ) {
-			pru_lock(so);
+			rw_enter_write(&so->so_lock);
 		}
 		break;
 	default:
diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c
index 2919c1c9686..0a58664adca 100644
--- a/sys/kern/uipc_syscalls.c
+++ b/sys/kern/uipc_syscalls.c
@@ -1,4 +1,4 @@
-/*	$OpenBSD: uipc_syscalls.c,v 1.214 2023/09/23 09:17:21 jan Exp $	*/
+/*	$OpenBSD: uipc_syscalls.c,v 1.215 2023/12/18 13:11:20 bluhm Exp $	*/
 /*	$NetBSD: uipc_syscalls.c,v 1.19 1996/02/09 19:00:48 christos Exp $	*/
 
 /*
@@ -185,9 +185,9 @@ sys_bind(struct proc *p, void *v, register_t *retval)
 	if (KTRPOINT(p, KTR_STRUCT))
 		ktrsockaddr(p, mtod(nam, caddr_t), SCARG(uap, namelen));
 #endif
-	solock(so);
+	solock_shared(so);
 	error = sobind(so, nam, p);
-	sounlock(so);
+	sounlock_shared(so);
 	m_freem(nam);
 out:
 	FRELE(fp, p);
diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h
index b618a2e804d..16d1ce324c4 100644
--- a/sys/netinet/in_pcb.h
+++ b/sys/netinet/in_pcb.h
@@ -1,4 +1,4 @@
-/*	$OpenBSD: in_pcb.h,v 1.144 2023/12/15 00:24:56 bluhm Exp $	*/
+/*	$OpenBSD: in_pcb.h,v 1.145 2023/12/18 13:11:20 bluhm Exp $	*/
 /*	$NetBSD: in_pcb.h,v 1.14 1996/02/13 23:42:00 christos Exp $	*/
 
 /*
@@ -84,6 +84,38 @@
  *	p	inpcb_mtx		pcb mutex
  */
 
+/*
+ * The pcb table mutex guarantees that all inpcb are consistent and
+ * that bind(2) and connect(2) create unique combinations of
+ * laddr/faddr/lport/fport/rtableid.  This mutex is used to protect
+ * both address consistency and inpcb lookup during protocol input.
+ * All writes to inp_[lf]addr take table mutex.  A per socket lock is
+ * needed, so that socket layer input has a consistent view of these
+ * values.
+ *
+ * In soconnect() and sosend() pcb mutex cannot be used.  They eventually
+ * can call IP output which takes the pf lock, a sleeping lock.
+ * Also connect(2) does a route lookup for source selection.  There
+ * route resolution happens, which creates a route, which sends a route
+ * message, which needs route lock, which is a rw-lock.
+ *
+ * On the other hand a mutex should be used in protocol input.  It
+ * does not make sense to do a process switch per packet.  Better spin
+ * until the packet can be processed.
+ *
+ * So there are three locks.  Table mutex is for writing inp_[lf]addr/port
+ * and lookup, socket rw-lock to separate sockets in system calls, and
+ * pcb mutex to protect socket receive buffer.  Changing inp_[lf]addr/port
+ * takes both per socket rw-lock and global table mutex.  Protocol
+ * input only reads inp_[lf]addr/port during lookup and is safe.  System
+ * call only reads when holding socket rw-lock and is safe.  The socket
+ * layer needs pcb mutex only in soreceive().
+ *
+ * Function pru_lock() grabs the pcb mutex, and its existence indicates
+ * that a protocol is MP safe.  Otherwise the exclusive net lock is
+ * used.
+ */
+
 struct pf_state_key;
 
 union inpaddru {
diff --git a/sys/sys/protosw.h b/sys/sys/protosw.h
index bf17e7b78a8..78b439b7b88 100644
--- a/sys/sys/protosw.h
+++ b/sys/sys/protosw.h
@@ -1,4 +1,4 @@
-/*	$OpenBSD: protosw.h,v 1.62 2023/05/18 09:59:44 mvs Exp $	*/
+/*	$OpenBSD: protosw.h,v 1.63 2023/12/18 13:11:20 bluhm Exp $	*/
 /*	$NetBSD: protosw.h,v 1.10 1996/04/09 20:55:32 cgd Exp $	*/
 
 /*-
@@ -284,13 +284,15 @@ pru_detach(struct socket *so)
 static inline void
 pru_lock(struct socket *so)
 {
-	(*so->so_proto->pr_usrreqs->pru_lock)(so);
+	if (so->so_proto->pr_usrreqs->pru_lock)
+		(*so->so_proto->pr_usrreqs->pru_lock)(so);
 }
 
 static inline void
 pru_unlock(struct socket *so)
 {
-	(*so->so_proto->pr_usrreqs->pru_unlock)(so);
+	if (so->so_proto->pr_usrreqs->pru_unlock)
+		(*so->so_proto->pr_usrreqs->pru_unlock)(so);
 }
 
 static inline int
-- 
2.20.1
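
The new in_pcb.h comment describes a division of labor between three locks: the global pcb table mutex for address updates and lookup, a per socket rwlock for system calls, and a per socket pcb mutex for the receive buffer.  The standalone sketch below models that split with POSIX primitives so the ordering can be seen in isolation.  The names (model_pcb, table_mtx, model_bind, model_input, model_receive) are invented for the example, the shared net lock is left out for brevity, and nothing here is the kernel API.

/*
 * Userland model of the three-lock scheme: a global table mutex for
 * address updates and lookup, a per socket rwlock for system calls,
 * and a per socket pcb mutex for the receive buffer.
 * Build: cc -pthread model.c
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t table_mtx = PTHREAD_MUTEX_INITIALIZER;

struct model_pcb {
	pthread_rwlock_t so_lock;	/* per socket rwlock, system calls */
	pthread_mutex_t	 pcb_mtx;	/* protects the receive buffer */
	int		 lport;		/* written under so_lock + table_mtx */
	int		 rcv_cc;	/* receive buffer byte count */
};

/* bind(2) path: exclude other system calls on this socket, then take
 * the table mutex so concurrent lookups see a consistent address. */
static void
model_bind(struct model_pcb *pcb, int port)
{
	pthread_rwlock_wrlock(&pcb->so_lock);
	pthread_mutex_lock(&table_mtx);
	pcb->lport = port;
	pthread_mutex_unlock(&table_mtx);
	pthread_rwlock_unlock(&pcb->so_lock);
}

/* protocol input path: spin on mutexes only, never sleep.  The table
 * mutex covers the lookup, the pcb mutex covers the buffer update. */
static void
model_input(struct model_pcb *pcb, int len)
{
	int port;

	pthread_mutex_lock(&table_mtx);
	port = pcb->lport;		/* lookup reads lport consistently */
	pthread_mutex_unlock(&table_mtx);

	if (port != 0) {
		pthread_mutex_lock(&pcb->pcb_mtx);
		pcb->rcv_cc += len;	/* "append" to the receive buffer */
		pthread_mutex_unlock(&pcb->pcb_mtx);
	}
}

/* soreceive() path: per socket rwlock against other system calls plus
 * the pcb mutex against protocol input touching the same buffer. */
static int
model_receive(struct model_pcb *pcb)
{
	int cc;

	pthread_rwlock_wrlock(&pcb->so_lock);
	pthread_mutex_lock(&pcb->pcb_mtx);
	cc = pcb->rcv_cc;
	pcb->rcv_cc = 0;
	pthread_mutex_unlock(&pcb->pcb_mtx);
	pthread_rwlock_unlock(&pcb->so_lock);
	return cc;
}

int
main(void)
{
	static struct model_pcb pcb = {
		.so_lock = PTHREAD_RWLOCK_INITIALIZER,
		.pcb_mtx = PTHREAD_MUTEX_INITIALIZER,
	};

	model_bind(&pcb, 4000);
	model_input(&pcb, 512);
	printf("received %d bytes\n", model_receive(&pcb));
	return 0;
}

In the model the bind path takes the rwlock before the table mutex, matching the comment's rule that changing inp_[lf]addr/port needs both; a sleeping rwlock cannot be taken while spinning on a mutex, so the order cannot be reversed.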
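
The soreceive() hunks release the pcb mutex via pru_unlock() before sbwait() and drop both it and the shared net lock around uiomove() and dom_externalize(), because a mutex must not be held across anything that may sleep; after relocking, the receive buffer is re-checked (the goto restart after sbwait()).  A minimal userland sketch of that release-around-blocking pattern, with stand-in names (pcb_mtx, rcv_cc, copy_out_slowly) that are not kernel interfaces:

/* Drop the spinning lock around the blocking operation, then retake it
 * and revalidate the protected state before using it again. */
#include <pthread.h>
#include <unistd.h>
#include <stdio.h>

static pthread_mutex_t pcb_mtx = PTHREAD_MUTEX_INITIALIZER;
static int rcv_cc;			/* protected by pcb_mtx */

static void
copy_out_slowly(int len)
{
	/* stands in for uiomove(): may sleep, so no mutex may be held */
	usleep(1000);
	printf("copied %d bytes to userland\n", len);
}

static void
receive_some(void)
{
	int len;

	pthread_mutex_lock(&pcb_mtx);
	for (;;) {
		len = rcv_cc;
		if (len == 0)
			break;
		rcv_cc = 0;
		/* drop the mutex around the blocking copy ... */
		pthread_mutex_unlock(&pcb_mtx);
		copy_out_slowly(len);
		pthread_mutex_lock(&pcb_mtx);
		/* ... and re-check the buffer, it may have refilled */
	}
	pthread_mutex_unlock(&pcb_mtx);
}

int
main(void)
{
	rcv_cc = 256;
	receive_some();
	return 0;
}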
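
In protosw.h the pru_lock() and pru_unlock() wrappers now tolerate a missing hook, so soreceive() can call them unconditionally while protocols without a per-pcb mutex simply fall through; solock_shared() uses the same presence test to choose between the shared and the exclusive net lock.  The short sketch below shows only that optional-hook dispatch; struct ops, lock_hook, udp_style_lock and maybe_lock are made-up names for illustration, not the kernel types.

/* Optional-hook dispatch: a NULL hook means "this protocol has no
 * per-pcb lock" and the wrapper silently does nothing. */
#include <stdio.h>

struct ops {
	void	(*lock_hook)(void *pcb);	/* may be NULL */
};

static void
udp_style_lock(void *pcb)
{
	printf("per-pcb mutex taken for %p\n", pcb);
}

/* Equivalent of the new inline pru_lock(): call the hook only if the
 * protocol registered one. */
static void
maybe_lock(const struct ops *ops, void *pcb)
{
	if (ops->lock_hook)
		(*ops->lock_hook)(pcb);
}

int
main(void)
{
	struct ops mpsafe = { .lock_hook = udp_style_lock };
	struct ops legacy = { .lock_hook = NULL };
	int pcb;

	maybe_lock(&mpsafe, &pcb);	/* takes the per-pcb lock */
	maybe_lock(&legacy, &pcb);	/* no-op, caller relies on big lock */
	return 0;
}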