From da3acf12474752f7d54a1a606d1b4babe47c40f0 Mon Sep 17 00:00:00 2001 From: bluhm Date: Mon, 22 Aug 2022 10:37:27 +0000 Subject: [PATCH] Use rwlock per inpcb table to protect notify list. The notify function may sleep, so holding a mutex is not possible. The same list entry and rwlock are used for UDP multicast and raw IP delivery. By adding a write lock, exclusive netlock is no longer necessary for PCB notify and UDP and raw IP input. OK mvs@ --- sys/netinet/in_pcb.c | 16 +++++++++++++--- sys/netinet/in_pcb.h | 7 +++++-- sys/netinet/raw_ip.c | 10 ++++++++-- sys/netinet/udp_usrreq.c | 8 ++++++-- sys/netinet6/in6_pcb.c | 6 +++--- sys/netinet6/raw_ip6.c | 10 ++++++++-- 6 files changed, 43 insertions(+), 14 deletions(-) diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c index 903a03b6d48..d145fb69c36 100644 --- a/sys/netinet/in_pcb.c +++ b/sys/netinet/in_pcb.c @@ -1,4 +1,4 @@ -/* $OpenBSD: in_pcb.c,v 1.271 2022/08/21 11:44:53 bluhm Exp $ */ +/* $OpenBSD: in_pcb.c,v 1.272 2022/08/22 10:37:27 bluhm Exp $ */ /* $NetBSD: in_pcb.c,v 1.25 1996/02/13 23:41:53 christos Exp $ */ /* @@ -175,6 +175,7 @@ void in_pcbinit(struct inpcbtable *table, int hashsize) { mtx_init(&table->inpt_mtx, IPL_SOFTNET); + rw_init(&table->inpt_notify, "inpnotify"); TAILQ_INIT(&table->inpt_queue); table->inpt_hashtbl = hashinit(hashsize, M_PCB, M_WAITOK, &table->inpt_mask); @@ -696,8 +697,6 @@ in_pcbnotifyall(struct inpcbtable *table, struct sockaddr *dst, u_int rtable, struct in_addr faddr; u_int rdomain; - NET_ASSERT_LOCKED_EXCLUSIVE(); - if (dst->sa_family != AF_INET) return; faddr = satosin(dst)->sin_addr; @@ -706,8 +705,18 @@ in_pcbnotifyall(struct inpcbtable *table, struct sockaddr *dst, u_int rtable, if (notify == NULL) return; + /* + * Use a temporary notify list protected by rwlock to run over + * selected PCB. This is necessary as the list of all PCB is + * protected by a mutex. Notify may call ip_output() eventually + * which may sleep as pf lock is a rwlock. 
Also the SRP + * implementation of the routing table might sleep. + * The same inp_notify list entry and inpt_notify rwlock are + * used for UDP multicast and raw IP delivery. + */ SIMPLEQ_INIT(&inpcblist); rdomain = rtable_l2(rtable); + rw_enter_write(&table->inpt_notify); mtx_enter(&table->inpt_mtx); TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) { #ifdef INET6 @@ -729,6 +738,7 @@ in_pcbnotifyall(struct inpcbtable *table, struct sockaddr *dst, u_int rtable, (*notify)(inp, errno); in_pcbunref(inp); } + rw_exit_write(&table->inpt_notify); } /* diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h index 9d19225bd50..cc262a292b0 100644 --- a/sys/netinet/in_pcb.h +++ b/sys/netinet/in_pcb.h @@ -1,4 +1,4 @@ -/* $OpenBSD: in_pcb.h,v 1.130 2022/08/21 11:44:53 bluhm Exp $ */ +/* $OpenBSD: in_pcb.h,v 1.131 2022/08/22 10:37:27 bluhm Exp $ */ /* $NetBSD: in_pcb.h,v 1.14 1996/02/13 23:42:00 christos Exp $ */ /* @@ -66,6 +66,7 @@ #include #include +#include #include #include #include @@ -79,6 +80,7 @@ * I immutable after creation * N net lock * t inpt_mtx pcb table mutex + * y inpt_notify pcb table rwlock for notify * p inpcb_mtx pcb mutex */ @@ -103,7 +105,7 @@ struct inpcb { LIST_ENTRY(inpcb) inp_hash; /* [t] local and foreign hash */ LIST_ENTRY(inpcb) inp_lhash; /* [t] local port hash */ TAILQ_ENTRY(inpcb) inp_queue; /* [t] inet PCB queue */ - SIMPLEQ_ENTRY(inpcb) inp_notify; /* [N] notify or udp append */ + SIMPLEQ_ENTRY(inpcb) inp_notify; /* [y] notify or udp append */ struct inpcbtable *inp_table; /* [I] inet queue/hash table */ union inpaddru inp_faddru; /* Foreign address. */ union inpaddru inp_laddru; /* Local address. 
*/ @@ -166,6 +168,7 @@ LIST_HEAD(inpcbhead, inpcb); struct inpcbtable { struct mutex inpt_mtx; /* protect queue and hash */ + struct rwlock inpt_notify; /* protect inp_notify list */ TAILQ_HEAD(inpthead, inpcb) inpt_queue; /* [t] inet PCB queue */ struct inpcbhead *inpt_hashtbl; /* [t] local and foreign hash */ struct inpcbhead *inpt_lhashtbl; /* [t] local port hash */ diff --git a/sys/netinet/raw_ip.c b/sys/netinet/raw_ip.c index 66c096b0b01..b5368df1a5e 100644 --- a/sys/netinet/raw_ip.c +++ b/sys/netinet/raw_ip.c @@ -1,4 +1,4 @@ -/* $OpenBSD: raw_ip.c,v 1.134 2022/08/22 08:08:46 mvs Exp $ */ +/* $OpenBSD: raw_ip.c,v 1.135 2022/08/22 10:37:27 bluhm Exp $ */ /* $NetBSD: raw_ip.c,v 1.25 1996/02/18 18:58:33 christos Exp $ */ /* @@ -160,8 +160,8 @@ rip_input(struct mbuf **mp, int *offp, int proto, int af) } } #endif - NET_ASSERT_LOCKED_EXCLUSIVE(); SIMPLEQ_INIT(&inpcblist); + rw_enter_write(&rawcbtable.inpt_notify); mtx_enter(&rawcbtable.inpt_mtx); TAILQ_FOREACH(inp, &rawcbtable.inpt_queue, inp_queue) { if (inp->inp_socket->so_state & SS_CANTRCVMORE) @@ -189,6 +189,8 @@ rip_input(struct mbuf **mp, int *offp, int proto, int af) mtx_leave(&rawcbtable.inpt_mtx); if (SIMPLEQ_EMPTY(&inpcblist)) { + rw_exit_write(&rawcbtable.inpt_notify); + if (ip->ip_p != IPPROTO_ICMP) icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PROTOCOL, 0, 0); @@ -199,6 +201,8 @@ rip_input(struct mbuf **mp, int *offp, int proto, int af) counters[ips_noproto]++; counters[ips_delivered]--; counters_leave(&ref, ipcounters); + + return IPPROTO_DONE; } while ((inp = SIMPLEQ_FIRST(&inpcblist)) != NULL) { @@ -224,6 +228,8 @@ rip_input(struct mbuf **mp, int *offp, int proto, int af) } in_pcbunref(inp); } + rw_exit_write(&rawcbtable.inpt_notify); + return IPPROTO_DONE; } diff --git a/sys/netinet/udp_usrreq.c b/sys/netinet/udp_usrreq.c index e8135716a73..5a0ade0fc76 100644 --- a/sys/netinet/udp_usrreq.c +++ b/sys/netinet/udp_usrreq.c @@ -1,4 +1,4 @@ -/* $OpenBSD: udp_usrreq.c,v 1.286 2022/08/22 08:08:46 mvs Exp $ */ 
+/* $OpenBSD: udp_usrreq.c,v 1.287 2022/08/22 10:37:27 bluhm Exp $ */ /* $NetBSD: udp_usrreq.c,v 1.28 1996/03/16 23:54:03 christos Exp $ */ /* @@ -372,8 +372,8 @@ udp_input(struct mbuf **mp, int *offp, int proto, int af) * Locate pcb(s) for datagram. * (Algorithm copied from raw_intr().) */ - NET_ASSERT_LOCKED_EXCLUSIVE(); SIMPLEQ_INIT(&inpcblist); + rw_enter_write(&udbtable.inpt_notify); mtx_enter(&udbtable.inpt_mtx); TAILQ_FOREACH(inp, &udbtable.inpt_queue, inp_queue) { if (inp->inp_socket->so_state & SS_CANTRCVMORE) @@ -446,6 +446,8 @@ udp_input(struct mbuf **mp, int *offp, int proto, int af) mtx_leave(&udbtable.inpt_mtx); if (SIMPLEQ_EMPTY(&inpcblist)) { + rw_exit_write(&udbtable.inpt_notify); + /* * No matching pcb found; discard datagram. * (No need to send an ICMP Port Unreachable @@ -469,6 +471,8 @@ udp_input(struct mbuf **mp, int *offp, int proto, int af) } in_pcbunref(inp); } + rw_exit_write(&udbtable.inpt_notify); + return IPPROTO_DONE; } /* diff --git a/sys/netinet6/in6_pcb.c b/sys/netinet6/in6_pcb.c index 43a6e739e28..471a9614fea 100644 --- a/sys/netinet6/in6_pcb.c +++ b/sys/netinet6/in6_pcb.c @@ -1,4 +1,4 @@ -/* $OpenBSD: in6_pcb.c,v 1.119 2022/08/08 12:06:31 bluhm Exp $ */ +/* $OpenBSD: in6_pcb.c,v 1.120 2022/08/22 10:37:27 bluhm Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. 
@@ -387,8 +387,6 @@ in6_pcbnotify(struct inpcbtable *table, struct sockaddr_in6 *dst, u_int32_t flowinfo; u_int rdomain; - NET_ASSERT_LOCKED_EXCLUSIVE(); - if ((unsigned)cmd >= PRC_NCMDS) return; @@ -430,6 +428,7 @@ in6_pcbnotify(struct inpcbtable *table, struct sockaddr_in6 *dst, SIMPLEQ_INIT(&inpcblist); rdomain = rtable_l2(rtable); + rw_enter_write(&table->inpt_notify); mtx_enter(&table->inpt_mtx); TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) { if ((inp->inp_flags & INP_IPV6) == 0) @@ -513,6 +512,7 @@ in6_pcbnotify(struct inpcbtable *table, struct sockaddr_in6 *dst, (*notify)(inp, errno); in_pcbunref(inp); } + rw_exit_write(&table->inpt_notify); } struct inpcb * diff --git a/sys/netinet6/raw_ip6.c b/sys/netinet6/raw_ip6.c index 3809f776044..29af83ad5e4 100644 --- a/sys/netinet6/raw_ip6.c +++ b/sys/netinet6/raw_ip6.c @@ -1,4 +1,4 @@ -/* $OpenBSD: raw_ip6.c,v 1.154 2022/08/22 08:08:46 mvs Exp $ */ +/* $OpenBSD: raw_ip6.c,v 1.155 2022/08/22 10:37:27 bluhm Exp $ */ /* $KAME: raw_ip6.c,v 1.69 2001/03/04 15:55:44 itojun Exp $ */ /* @@ -172,8 +172,8 @@ rip6_input(struct mbuf **mp, int *offp, int proto, int af) } } #endif - NET_ASSERT_LOCKED_EXCLUSIVE(); SIMPLEQ_INIT(&inpcblist); + rw_enter_write(&rawin6pcbtable.inpt_notify); mtx_enter(&rawin6pcbtable.inpt_mtx); TAILQ_FOREACH(in6p, &rawin6pcbtable.inpt_queue, inp_queue) { if (in6p->inp_socket->so_state & SS_CANTRCVMORE) @@ -224,6 +224,8 @@ rip6_input(struct mbuf **mp, int *offp, int proto, int af) struct counters_ref ref; uint64_t *counters; + rw_exit_write(&rawin6pcbtable.inpt_notify); + if (proto != IPPROTO_ICMPV6) { rip6stat_inc(rip6s_nosock); if (m->m_flags & M_MCAST) @@ -240,6 +242,8 @@ rip6_input(struct mbuf **mp, int *offp, int proto, int af) counters = counters_enter(&ref, ip6counters); counters[ip6s_delivered]--; counters_leave(&ref, ip6counters); + + return IPPROTO_DONE; } while ((in6p = SIMPLEQ_FIRST(&inpcblist)) != NULL) { @@ -267,6 +271,8 @@ rip6_input(struct mbuf **mp, int *offp, int proto, int af) } 
in_pcbunref(in6p); } + rw_exit_write(&rawin6pcbtable.inpt_notify); + return IPPROTO_DONE; } -- 2.20.1