From 1baa51d9481f74d21e434002fb195a48a7fc3443 Mon Sep 17 00:00:00 2001 From: mvs Date: Thu, 11 Apr 2024 13:32:51 +0000 Subject: [PATCH] Don't take solock() in soreceive() for SOCK_RAW inet sockets. For inet sockets solock() is the netlock wrapper, so soreceive() could be performed simultaneous with exclusively locked code paths. These sockets are not connection oriented, they don't call pru_rcvd(), they can't be spliced, they don't set `so_error'. Nothing to protect with solock() in soreceive() path. `so_rcv' buffer protected by `sb_mtx' mutex(9), but since it released, sblock() required to serialize concurrent soreceive() and sorflush() threads. Current sblock() is some kind of rwlock(9) implementation, so introduce `sb_lock' rwlock(9) and use it directly for that purpose. The sorflush() and callers were refactored to avoid solock() for raw inet sockets. This was done to avoid packet processing stop. Tested and ok bluhm. --- sys/kern/uipc_socket.c | 111 ++++++++++++++++++++++++++++------------ sys/kern/uipc_socket2.c | 27 +++++++++- sys/sys/socketvar.h | 7 +-- 3 files changed, 108 insertions(+), 37 deletions(-) diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c index e5f4c1c9c19..0ab76539ac2 100644 --- a/sys/kern/uipc_socket.c +++ b/sys/kern/uipc_socket.c @@ -1,4 +1,4 @@ -/* $OpenBSD: uipc_socket.c,v 1.328 2024/04/10 12:04:41 mvs Exp $ */ +/* $OpenBSD: uipc_socket.c,v 1.329 2024/04/11 13:32:51 mvs Exp $ */ /* $NetBSD: uipc_socket.c,v 1.21 1996/02/04 02:17:52 christos Exp $ */ /* @@ -66,6 +66,7 @@ void soreaper(void *); void soput(void *); int somove(struct socket *, int); void sorflush(struct socket *); +void sorflush_locked(struct socket *); void filt_sordetach(struct knote *kn); int filt_soread(struct knote *kn, long hint); @@ -143,6 +144,8 @@ soalloc(const struct protosw *prp, int wait) return (NULL); rw_init_flags(&so->so_lock, dp->dom_name, RWL_DUPOK); refcnt_init(&so->so_refcnt); + rw_init(&so->so_rcv.sb_lock, "sbufrcv"); + rw_init(&so->so_snd.sb_lock, "sbufsnd"); mtx_init(&so->so_rcv.sb_mtx, IPL_MPFLOOR); mtx_init(&so->so_snd.sb_mtx, IPL_MPFLOOR); klist_init_mutex(&so->so_rcv.sb_klist, &so->so_rcv.sb_mtx); @@ -156,15 +159,15 @@ soalloc(const struct protosw *prp, int wait) case AF_INET6: switch (prp->pr_type) { case SOCK_DGRAM: - so->so_rcv.sb_flags |= SB_OWNLOCK; - /* FALLTHROUGH */ - case SOCK_RAW: so->so_rcv.sb_flags |= SB_MTXLOCK; break; + case SOCK_RAW: + so->so_rcv.sb_flags |= SB_MTXLOCK | SB_OWNLOCK; + break; } break; case AF_UNIX: - so->so_rcv.sb_flags |= SB_MTXLOCK | SB_OWNLOCK; + so->so_rcv.sb_flags |= SB_MTXLOCK; break; } @@ -346,9 +349,22 @@ sofree(struct socket *so, int keep_lock) } #endif /* SOCKET_SPLICE */ sbrelease(so, &so->so_snd); - sorflush(so); - if (!keep_lock) + + /* + * Regardless on '_locked' postfix, must release solock() before + * call sorflush_locked() for SB_OWNLOCK marked socket. Can't + * release solock() and call sorflush() because solock() release + * is unwanted for tcp(4) socket. + */ + + if (so->so_rcv.sb_flags & SB_OWNLOCK) + sounlock(so); + + sorflush_locked(so); + + if (!((so->so_rcv.sb_flags & SB_OWNLOCK) || keep_lock)) sounlock(so); + #ifdef SOCKET_SPLICE if (so->so_sp) { /* Reuse splice idle, sounsplice() has been called before. */ @@ -807,6 +823,7 @@ soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio, const struct protosw *pr = so->so_proto; struct mbuf *nextrecord; size_t resid, orig_resid = uio->uio_resid; + int dosolock = ((so->so_rcv.sb_flags & SB_OWNLOCK) == 0); mp = mp0; if (paddr) @@ -836,12 +853,11 @@ bad: if (mp) *mp = NULL; - solock_shared(so); + if (dosolock) + solock_shared(so); restart: - if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0) { - sounlock_shared(so); - return (error); - } + if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0) + goto out; sb_mtx_lock(&so->so_rcv); m = so->so_rcv.sb_mb; @@ -906,14 +922,16 @@ restart: SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1"); SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1"); - if (so->so_rcv.sb_flags & SB_OWNLOCK) { + if (so->so_rcv.sb_flags & (SB_MTXLOCK | SB_OWNLOCK)) { sbunlock_locked(so, &so->so_rcv); - sounlock_shared(so); + if (dosolock) + sounlock_shared(so); error = sbwait_locked(so, &so->so_rcv); sb_mtx_unlock(&so->so_rcv); if (error) return (error); - solock_shared(so); + if (dosolock) + solock_shared(so); } else { sb_mtx_unlock(&so->so_rcv); sbunlock(so, &so->so_rcv); @@ -990,11 +1008,13 @@ dontblock: if (controlp) { if (pr->pr_domain->dom_externalize) { sb_mtx_unlock(&so->so_rcv); - sounlock_shared(so); + if (dosolock) + sounlock_shared(so); error = (*pr->pr_domain->dom_externalize) (cm, controllen, flags); - solock_shared(so); + if (dosolock) + solock_shared(so); sb_mtx_lock(&so->so_rcv); } *controlp = cm; @@ -1073,9 +1093,11 @@ dontblock: SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove"); resid = uio->uio_resid; sb_mtx_unlock(&so->so_rcv); - sounlock_shared(so); + if (dosolock) + sounlock_shared(so); uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio); - solock_shared(so); + if (dosolock) + solock_shared(so); sb_mtx_lock(&so->so_rcv); if (uio_error) uio->uio_resid = resid - len; @@ -1158,14 +1180,22 @@ dontblock: break; SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2"); SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2"); - sb_mtx_unlock(&so->so_rcv); - error = sbwait(so, &so->so_rcv); - if (error) { - sbunlock(so, &so->so_rcv); - sounlock_shared(so); - return (0); + if (dosolock) { + sb_mtx_unlock(&so->so_rcv); + error = sbwait(so, &so->so_rcv); + if (error) { + sbunlock(so, &so->so_rcv); + sounlock_shared(so); + return (0); + } + sb_mtx_lock(&so->so_rcv); + } else { + if (sbwait_locked(so, &so->so_rcv)) { + sb_mtx_unlock(&so->so_rcv); + sbunlock(so, &so->so_rcv); + return (0); + } } - sb_mtx_lock(&so->so_rcv); if ((m = so->so_rcv.sb_mb) != NULL) nextrecord = m->m_nextpkt; } @@ -1214,7 +1244,9 @@ dontblock: release: sb_mtx_unlock(&so->so_rcv); sbunlock(so, &so->so_rcv); - sounlock_shared(so); +out: + if (dosolock) + sounlock_shared(so); return (error); } @@ -1223,7 +1255,6 @@ soshutdown(struct socket *so, int how) { int error = 0; - solock(so); switch (how) { case SHUT_RD: sorflush(so); @@ -1232,25 +1263,29 @@ soshutdown(struct socket *so, int how) sorflush(so); /* FALLTHROUGH */ case SHUT_WR: + solock(so); error = pru_shutdown(so); + sounlock(so); break; default: error = EINVAL; break; } - sounlock(so); return (error); } void -sorflush(struct socket *so) +sorflush_locked(struct socket *so) { struct sockbuf *sb = &so->so_rcv; struct mbuf *m; const struct protosw *pr = so->so_proto; int error; + if ((sb->sb_flags & SB_OWNLOCK) == 0) + soassertlocked(so); + error = sblock(so, sb, SBL_WAIT | SBL_NOINTR); /* with SBL_WAIT and SLB_NOINTR sblock() must not fail */ KASSERT(error == 0); @@ -1267,6 +1302,16 @@ sorflush(struct socket *so) m_purge(m); } +void +sorflush(struct socket *so) +{ + if ((so->so_rcv.sb_flags & SB_OWNLOCK) == 0) + solock_shared(so); + sorflush_locked(so); + if ((so->so_rcv.sb_flags & SB_OWNLOCK) == 0) + sounlock_shared(so); +} + #ifdef SOCKET_SPLICE #define so_splicelen so_sp->ssp_len @@ -1905,7 +1950,8 @@ sosetopt(struct socket *so, int level, int optname, struct mbuf *m) if ((long)cnt <= 0) cnt = 1; - solock(so); + if (((sb->sb_flags & SB_OWNLOCK) == 0)) + solock(so); mtx_enter(&sb->sb_mtx); switch (optname) { @@ -1931,7 +1977,8 @@ sosetopt(struct socket *so, int level, int optname, struct mbuf *m) } mtx_leave(&sb->sb_mtx); - sounlock(so); + if (((sb->sb_flags & SB_OWNLOCK) == 0)) + sounlock(so); break; } diff --git a/sys/kern/uipc_socket2.c b/sys/kern/uipc_socket2.c index 9c52cf7b42a..64eb92102d1 100644 --- a/sys/kern/uipc_socket2.c +++ b/sys/kern/uipc_socket2.c @@ -1,4 +1,4 @@ -/* $OpenBSD: uipc_socket2.c,v 1.148 2024/04/10 12:04:41 mvs Exp $ */ +/* $OpenBSD: uipc_socket2.c,v 1.149 2024/04/11 13:32:51 mvs Exp $ */ /* $NetBSD: uipc_socket2.c,v 1.11 1996/02/04 02:17:55 christos Exp $ */ /* @@ -322,7 +322,9 @@ socantsendmore(struct socket *so) void socantrcvmore(struct socket *so) { - soassertlocked(so); + if ((so->so_rcv.sb_flags & SB_OWNLOCK) == 0) + soassertlocked(so); + mtx_enter(&so->so_rcv.sb_mtx); so->so_rcv.sb_state |= SS_CANTRCVMORE; mtx_leave(&so->so_rcv.sb_mtx); @@ -529,6 +531,17 @@ sblock(struct socket *so, struct sockbuf *sb, int flags) { int error = 0, prio = PSOCK; + if (sb->sb_flags & SB_OWNLOCK) { + int rwflags = RW_WRITE; + + if (!(flags & SBL_NOINTR || sb->sb_flags & SB_NOINTR)) + rwflags |= RW_INTR; + if (!(flags & SBL_WAIT)) + rwflags |= RW_NOSLEEP; + + return rw_enter(&sb->sb_lock, rwflags); + } + soassertlocked(so); mtx_enter(&sb->sb_mtx); @@ -561,6 +574,11 @@ out: void sbunlock_locked(struct socket *so, struct sockbuf *sb) { + if (sb->sb_flags & SB_OWNLOCK) { + rw_exit(&sb->sb_lock); + return; + } + MUTEX_ASSERT_LOCKED(&sb->sb_mtx); sb->sb_flags &= ~SB_LOCK; @@ -573,6 +591,11 @@ sbunlock_locked(struct socket *so, struct sockbuf *sb) void sbunlock(struct socket *so, struct sockbuf *sb) { + if (sb->sb_flags & SB_OWNLOCK) { + rw_exit(&sb->sb_lock); + return; + } + mtx_enter(&sb->sb_mtx); sbunlock_locked(so, sb); mtx_leave(&sb->sb_mtx); diff --git a/sys/sys/socketvar.h b/sys/sys/socketvar.h index cca3780c0bb..780b1ccb8dd 100644 --- a/sys/sys/socketvar.h +++ b/sys/sys/socketvar.h @@ -1,4 +1,4 @@ -/* $OpenBSD: socketvar.h,v 1.128 2024/04/10 12:04:41 mvs Exp $ */ +/* $OpenBSD: socketvar.h,v 1.129 2024/04/11 13:32:51 mvs Exp $ */ /* $NetBSD: socketvar.h,v 1.18 1996/02/09 18:25:38 christos Exp $ */ /*- @@ -105,7 +105,8 @@ struct socket { * Variables for socket buffering. */ struct sockbuf { - struct mutex sb_mtx; + struct rwlock sb_lock; + struct mutex sb_mtx; /* The following fields are all zeroed on flush. */ #define sb_startzero sb_cc u_long sb_cc; /* actual chars in buffer */ @@ -134,7 +135,7 @@ struct socket { #define SB_SPLICE 0x0020 /* buffer is splice source or drain */ #define SB_NOINTR 0x0040 /* operations not interruptible */ #define SB_MTXLOCK 0x0080 /* use sb_mtx for sockbuf protection */ -#define SB_OWNLOCK 0x0100 /* sb_mtx used standalone */ +#define SB_OWNLOCK 0x0100 /* sblock() doesn't need solock() */ void (*so_upcall)(struct socket *so, caddr_t arg, int waitf); caddr_t so_upcallarg; /* Arg for above */ -- 2.20.1