From dc28d33f8280295b5a64640946defa654096f4ce Mon Sep 17 00:00:00 2001 From: bluhm Date: Mon, 21 Mar 2022 09:12:34 +0000 Subject: [PATCH] Header netinet/in_pcb.h includes sys/mutex.h now. Recommit mutex for PCB tables. It does not break userland build anymore. pf_socket_lookup() calls in_pcbhashlookup() in the PCB layer. To run pf in parallel, make parts of the stack MP safe. Protect the list and hashes in the PCB tables with a mutex. Note that the protocol notify functions may call pf via tcp_output(). As the pf lock is a sleeping rw_lock, we must not hold a mutex. To solve this for now, collect these PCBs in inp_notify list and protect it with exclusive netlock. OK sashan@ --- sys/kern/kern_sysctl.c | 10 ++++++- sys/netinet/in_pcb.c | 62 +++++++++++++++++++++++++++++++++------- sys/netinet/in_pcb.h | 31 +++++++++++++------- sys/netinet/raw_ip.c | 5 +++- sys/netinet/udp_usrreq.c | 4 ++- sys/netinet6/in6_pcb.c | 28 ++++++++++++++---- sys/netinet6/raw_ip6.c | 9 ++++-- 7 files changed, 116 insertions(+), 33 deletions(-) diff --git a/sys/kern/kern_sysctl.c b/sys/kern/kern_sysctl.c index 0f8d053b4c5..326265d56f2 100644 --- a/sys/kern/kern_sysctl.c +++ b/sys/kern/kern_sysctl.c @@ -1,4 +1,4 @@ -/* $OpenBSD: kern_sysctl.c,v 1.401 2022/03/14 22:38:43 tb Exp $ */ +/* $OpenBSD: kern_sysctl.c,v 1.402 2022/03/21 09:12:34 bluhm Exp $ */ /* $NetBSD: kern_sysctl.c,v 1.17 1996/05/20 17:49:05 mrg Exp $ */ /*- @@ -1366,16 +1366,24 @@ sysctl_file(int *name, u_int namelen, char *where, size_t *sizep, struct inpcb *inp; NET_LOCK(); + mtx_enter(&tcbtable.inpt_mtx); TAILQ_FOREACH(inp, &tcbtable.inpt_queue, inp_queue) FILLSO(inp->inp_socket); + mtx_leave(&tcbtable.inpt_mtx); + mtx_enter(&udbtable.inpt_mtx); TAILQ_FOREACH(inp, &udbtable.inpt_queue, inp_queue) FILLSO(inp->inp_socket); + mtx_leave(&udbtable.inpt_mtx); + mtx_enter(&rawcbtable.inpt_mtx); TAILQ_FOREACH(inp, &rawcbtable.inpt_queue, inp_queue) FILLSO(inp->inp_socket); + mtx_leave(&rawcbtable.inpt_mtx); #ifdef INET6 + mtx_enter(&rawin6pcbtable.inpt_mtx); TAILQ_FOREACH(inp, &rawin6pcbtable.inpt_queue, inp_queue) FILLSO(inp->inp_socket); + mtx_leave(&rawin6pcbtable.inpt_mtx); #endif NET_UNLOCK(); } diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c index 26908d9b5fb..a2060c16fd6 100644 --- a/sys/netinet/in_pcb.c +++ b/sys/netinet/in_pcb.c @@ -1,4 +1,4 @@ -/* $OpenBSD: in_pcb.c,v 1.262 2022/03/21 03:51:09 dlg Exp $ */ +/* $OpenBSD: in_pcb.c,v 1.263 2022/03/21 09:12:34 bluhm Exp $ */ /* $NetBSD: in_pcb.c,v 1.25 1996/02/13 23:41:53 christos Exp $ */ /* @@ -120,7 +120,8 @@ struct baddynamicports baddynamicports; struct baddynamicports rootonlyports; struct pool inpcb_pool; -int in_pcbresize (struct inpcbtable *, int); +void in_pcbrehash_locked(struct inpcb *); +int in_pcbresize(struct inpcbtable *, int); #define INPCBHASH_LOADFACTOR(_x) (((_x) * 3) / 4) @@ -173,7 +174,7 @@ in_pcblhash(struct inpcbtable *table, int rdom, u_short lport) void in_pcbinit(struct inpcbtable *table, int hashsize) { - + mtx_init(&table->inpt_mtx, IPL_SOFTNET); TAILQ_INIT(&table->inpt_queue); table->inpt_hashtbl = hashinit(hashsize, M_PCB, M_WAITOK, &table->inpt_mask); @@ -252,6 +253,7 @@ in_pcballoc(struct socket *so, struct inpcbtable *table) inp->inp_cksum6 = -1; #endif /* INET6 */ + mtx_enter(&table->inpt_mtx); if (table->inpt_count++ > INPCBHASH_LOADFACTOR(table->inpt_size)) (void)in_pcbresize(table, table->inpt_size * 2); TAILQ_INSERT_HEAD(&table->inpt_queue, inp, inp_queue); @@ -268,6 +270,8 @@ in_pcballoc(struct socket *so, struct inpcbtable *table) &inp->inp_faddr, inp->inp_fport, &inp->inp_laddr, inp->inp_lport); LIST_INSERT_HEAD(head, inp, inp_hash); + mtx_leave(&table->inpt_mtx); + so->so_pcb = inp; return (0); @@ -556,6 +560,7 @@ void in_pcbdetach(struct inpcb *inp) { struct socket *so = inp->inp_socket; + struct inpcbtable *table = inp->inp_table; NET_ASSERT_LOCKED(); @@ -585,10 +590,13 @@ in_pcbdetach(struct inpcb *inp) pf_inp_unlink(inp); } #endif + mtx_enter(&table->inpt_mtx); LIST_REMOVE(inp, inp_lhash); LIST_REMOVE(inp, inp_hash); - TAILQ_REMOVE(&inp->inp_table->inpt_queue, inp, inp_queue); - inp->inp_table->inpt_count--; + TAILQ_REMOVE(&table->inpt_queue, inp, inp_queue); + table->inpt_count--; + mtx_leave(&table->inpt_mtx); + in_pcbunref(inp); } @@ -661,20 +669,25 @@ void in_pcbnotifyall(struct inpcbtable *table, struct sockaddr *dst, u_int rtable, int errno, void (*notify)(struct inpcb *, int)) { - struct inpcb *inp, *ninp; + SIMPLEQ_HEAD(, inpcb) inpcblist; + struct inpcb *inp; struct in_addr faddr; u_int rdomain; - NET_ASSERT_LOCKED(); + NET_ASSERT_WLOCKED(); if (dst->sa_family != AF_INET) return; faddr = satosin(dst)->sin_addr; if (faddr.s_addr == INADDR_ANY) return; + if (notify == NULL) + return; + SIMPLEQ_INIT(&inpcblist); rdomain = rtable_l2(rtable); - TAILQ_FOREACH_SAFE(inp, &table->inpt_queue, inp_queue, ninp) { + mtx_enter(&table->inpt_mtx); + TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) { #ifdef INET6 if (inp->inp_flags & INP_IPV6) continue; @@ -684,8 +697,15 @@ in_pcbnotifyall(struct inpcbtable *table, struct sockaddr *dst, u_int rtable, inp->inp_socket == NULL) { continue; } - if (notify) - (*notify)(inp, errno); + in_pcbref(inp); + SIMPLEQ_INSERT_TAIL(&inpcblist, inp, inp_notify); + } + mtx_leave(&table->inpt_mtx); + + while ((inp = SIMPLEQ_FIRST(&inpcblist)) != NULL) { + SIMPLEQ_REMOVE_HEAD(&inpcblist, inp_notify); + (*notify)(inp, errno); + in_pcbunref(inp); } } @@ -759,6 +779,7 @@ in_pcblookup_local(struct inpcbtable *table, void *laddrp, u_int lport_arg, u_int rdomain; rdomain = rtable_l2(rtable); + mtx_enter(&table->inpt_mtx); head = in_pcblhash(table, rdomain, lport); LIST_FOREACH(inp, head, inp_lhash) { if (rtable_l2(inp->inp_rtableid) != rdomain) @@ -809,6 +830,8 @@ in_pcblookup_local(struct inpcbtable *table, void *laddrp, u_int lport_arg, break; } } + mtx_leave(&table->inpt_mtx); + return (match); } @@ -977,11 +1000,22 @@ in_pcbselsrc(struct in_addr **insrc, struct sockaddr_in *sin, void in_pcbrehash(struct inpcb *inp) +{ + struct inpcbtable *table = inp->inp_table; + + mtx_enter(&table->inpt_mtx); + in_pcbrehash_locked(inp); + mtx_leave(&table->inpt_mtx); +} + +void +in_pcbrehash_locked(struct inpcb *inp) { struct inpcbtable *table = inp->inp_table; struct inpcbhead *head; NET_ASSERT_LOCKED(); + MUTEX_ASSERT_LOCKED(&table->inpt_mtx); LIST_REMOVE(inp, inp_lhash); head = in_pcblhash(table, inp->inp_rtableid, inp->inp_lport); @@ -1008,6 +1042,8 @@ in_pcbresize(struct inpcbtable *table, int hashsize) void *nhashtbl, *nlhashtbl, *ohashtbl, *olhashtbl; struct inpcb *inp; + MUTEX_ASSERT_LOCKED(&table->inpt_mtx); + ohashtbl = table->inpt_hashtbl; olhashtbl = table->inpt_lhashtbl; osize = table->inpt_size; @@ -1029,7 +1065,7 @@ in_pcbresize(struct inpcbtable *table, int hashsize) arc4random_buf(&table->inpt_lkey, sizeof(table->inpt_lkey)); TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) { - in_pcbrehash(inp); + in_pcbrehash_locked(inp); } hashfree(ohashtbl, osize, M_PCB); hashfree(olhashtbl, osize, M_PCB); @@ -1060,6 +1096,7 @@ in_pcbhashlookup(struct inpcbtable *table, struct in_addr faddr, u_int rdomain; rdomain = rtable_l2(rtable); + mtx_enter(&table->inpt_mtx); head = in_pcbhash(table, rdomain, &faddr, fport, &laddr, lport); LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 @@ -1082,6 +1119,7 @@ in_pcbhashlookup(struct inpcbtable *table, struct in_addr faddr, break; } } + mtx_leave(&table->inpt_mtx); #ifdef DIAGNOSTIC if (inp == NULL && in_pcbnotifymiss) { printf("%s: faddr=%08x fport=%d laddr=%08x lport=%d rdom=%u\n", @@ -1143,6 +1181,7 @@ in_pcblookup_listen(struct inpcbtable *table, struct in_addr laddr, #endif rdomain = rtable_l2(rtable); + mtx_enter(&table->inpt_mtx); head = in_pcbhash(table, rdomain, &zeroin_addr, 0, key1, lport); LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 @@ -1179,6 +1218,7 @@ in_pcblookup_listen(struct inpcbtable *table, struct in_addr laddr, LIST_REMOVE(inp, inp_hash); LIST_INSERT_HEAD(head, inp, inp_hash); } + mtx_leave(&table->inpt_mtx); #ifdef DIAGNOSTIC if (inp == NULL && in_pcbnotifymiss) { printf("%s: laddr=%08x lport=%d rdom=%u\n", diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h index 35c7d9e525c..0e749af9c3b 100644 --- a/sys/netinet/in_pcb.h +++ b/sys/netinet/in_pcb.h @@ -1,4 +1,4 @@ -/* $OpenBSD: in_pcb.h,v 1.126 2022/03/20 19:16:29 bluhm Exp $ */ +/* $OpenBSD: in_pcb.h,v 1.127 2022/03/21 09:12:34 bluhm Exp $ */ /* $NetBSD: in_pcb.h,v 1.14 1996/02/13 23:42:00 christos Exp $ */ /* @@ -74,6 +74,13 @@ #include +/* + * Locks used to protect struct members in this file: + * I immutable after creation + * N net lock + * t inpt_mtx pcb table mutex + */ + struct pf_state_key; union inpaddru { @@ -92,10 +99,11 @@ union inpaddru { * control block. */ struct inpcb { - LIST_ENTRY(inpcb) inp_hash; /* local and foreign hash */ - LIST_ENTRY(inpcb) inp_lhash; /* local port hash */ - TAILQ_ENTRY(inpcb) inp_queue; /* inet PCB queue */ - struct inpcbtable *inp_table; /* inet queue/hash table */ + LIST_ENTRY(inpcb) inp_hash; /* [t] local and foreign hash */ + LIST_ENTRY(inpcb) inp_lhash; /* [t] local port hash */ + TAILQ_ENTRY(inpcb) inp_queue; /* [t] inet PCB queue */ + SIMPLEQ_ENTRY(inpcb) inp_notify; /* [N] queue to notify PCB */ + struct inpcbtable *inp_table; /* [I] inet queue/hash table */ union inpaddru inp_faddru; /* Foreign address. */ union inpaddru inp_laddru; /* Local address. */ #define inp_faddr inp_faddru.iau_a4u.inaddr @@ -155,12 +163,13 @@ struct inpcb { LIST_HEAD(inpcbhead, inpcb); struct inpcbtable { - TAILQ_HEAD(inpthead, inpcb) inpt_queue; /* inet PCB queue */ - struct inpcbhead *inpt_hashtbl; /* local and foreign hash */ - struct inpcbhead *inpt_lhashtbl; /* local port hash */ - SIPHASH_KEY inpt_key, inpt_lkey; /* secrets for hashes */ - u_long inpt_mask, inpt_lmask; /* hash masks */ - int inpt_count, inpt_size; /* queue count, hash size */ + struct mutex inpt_mtx; /* protect queue and hash */ + TAILQ_HEAD(inpthead, inpcb) inpt_queue; /* [t] inet PCB queue */ + struct inpcbhead *inpt_hashtbl; /* [t] local and foreign hash */ + struct inpcbhead *inpt_lhashtbl; /* [t] local port hash */ + SIPHASH_KEY inpt_key, inpt_lkey; /* [t] secrets for hashes */ + u_long inpt_mask, inpt_lmask; /* [t] hash masks */ + int inpt_count, inpt_size; /* [t] queue count, hash size */ }; /* flags in inp_flags: */ diff --git a/sys/netinet/raw_ip.c b/sys/netinet/raw_ip.c index 550c8663d8c..fc7d8dff62e 100644 --- a/sys/netinet/raw_ip.c +++ b/sys/netinet/raw_ip.c @@ -1,4 +1,4 @@ -/* $OpenBSD: raw_ip.c,v 1.124 2022/03/21 04:00:56 dlg Exp $ */ +/* $OpenBSD: raw_ip.c,v 1.125 2022/03/21 09:12:34 bluhm Exp $ */ /* $NetBSD: raw_ip.c,v 1.25 1996/02/18 18:58:33 christos Exp $ */ /* @@ -151,6 +151,7 @@ rip_input(struct mbuf **mp, int *offp, int proto, int af) } #endif NET_ASSERT_LOCKED(); + mtx_enter(&rawcbtable.inpt_mtx); TAILQ_FOREACH(inp, &rawcbtable.inpt_queue, inp_queue) { if (inp->inp_socket->so_state & SS_CANTRCVMORE) continue; @@ -190,6 +191,8 @@ rip_input(struct mbuf **mp, int *offp, int proto, int af) } last = inp; } + mtx_leave(&rawcbtable.inpt_mtx); + if (last) { if (last->inp_flags & INP_CONTROLOPTS || last->inp_socket->so_options & SO_TIMESTAMP) diff --git a/sys/netinet/udp_usrreq.c b/sys/netinet/udp_usrreq.c index b60e5bc2ea7..5c5354e84ea 100644 --- a/sys/netinet/udp_usrreq.c +++ b/sys/netinet/udp_usrreq.c @@ -1,4 +1,4 @@ -/* $OpenBSD: udp_usrreq.c,v 1.274 2022/03/14 22:38:43 tb Exp $ */ +/* $OpenBSD: udp_usrreq.c,v 1.275 2022/03/21 09:12:34 bluhm Exp $ */ /* $NetBSD: udp_usrreq.c,v 1.28 1996/03/16 23:54:03 christos Exp $ */ /* @@ -365,6 +365,7 @@ udp_input(struct mbuf **mp, int *offp, int proto, int af) */ last = NULL; NET_ASSERT_LOCKED(); + mtx_enter(&udbtable.inpt_mtx); TAILQ_FOREACH(inp, &udbtable.inpt_queue, inp_queue) { if (inp->inp_socket->so_state & SS_CANTRCVMORE) continue; @@ -440,6 +441,7 @@ udp_input(struct mbuf **mp, int *offp, int proto, int af) SO_REUSEADDR)) == 0) break; } + mtx_leave(&udbtable.inpt_mtx); if (last == NULL) { /* diff --git a/sys/netinet6/in6_pcb.c b/sys/netinet6/in6_pcb.c index 65e578413dc..75fc61ef9d4 100644 --- a/sys/netinet6/in6_pcb.c +++ b/sys/netinet6/in6_pcb.c @@ -1,4 +1,4 @@ -/* $OpenBSD: in6_pcb.c,v 1.115 2022/03/14 22:38:43 tb Exp $ */ +/* $OpenBSD: in6_pcb.c,v 1.116 2022/03/21 09:12:34 bluhm Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. @@ -369,14 +369,15 @@ in6_pcbnotify(struct inpcbtable *table, struct sockaddr_in6 *dst, uint fport_arg, const struct sockaddr_in6 *src, uint lport_arg, u_int rtable, int cmd, void *cmdarg, void (*notify)(struct inpcb *, int)) { - struct inpcb *inp, *ninp; + SIMPLEQ_HEAD(, inpcb) inpcblist; + struct inpcb *inp; u_short fport = fport_arg, lport = lport_arg; struct sockaddr_in6 sa6_src; int errno; u_int32_t flowinfo; u_int rdomain; - NET_ASSERT_LOCKED(); + NET_ASSERT_WLOCKED(); if ((unsigned)cmd >= PRC_NCMDS) return; @@ -414,9 +415,13 @@ in6_pcbnotify(struct inpcbtable *table, struct sockaddr_in6 *dst, notify = in_rtchange; } errno = inet6ctlerrmap[cmd]; + if (notify == NULL) + return; + SIMPLEQ_INIT(&inpcblist); rdomain = rtable_l2(rtable); - TAILQ_FOREACH_SAFE(inp, &table->inpt_queue, inp_queue, ninp) { + mtx_enter(&table->inpt_mtx); + TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) { if ((inp->inp_flags & INP_IPV6) == 0) continue; @@ -488,8 +493,15 @@ in6_pcbnotify(struct inpcbtable *table, struct sockaddr_in6 *dst, continue; } do_notify: - if (notify) - (*notify)(inp, errno); + in_pcbref(inp); + SIMPLEQ_INSERT_TAIL(&inpcblist, inp, inp_notify); + } + mtx_leave(&table->inpt_mtx); + + while ((inp = SIMPLEQ_FIRST(&inpcblist)) != NULL) { + SIMPLEQ_REMOVE_HEAD(&inpcblist, inp_notify); + (*notify)(inp, errno); + in_pcbunref(inp); } } @@ -504,6 +516,7 @@ in6_pcbhashlookup(struct inpcbtable *table, const struct in6_addr *faddr, u_int rdomain; rdomain = rtable_l2(rtable); + mtx_enter(&table->inpt_mtx); head = in6_pcbhash(table, rdomain, faddr, fport, laddr, lport); LIST_FOREACH(inp, head, inp_hash) { if (!(inp->inp_flags & INP_IPV6)) @@ -524,6 +537,7 @@ in6_pcbhashlookup(struct inpcbtable *table, const struct in6_addr *faddr, break; } } + mtx_leave(&table->inpt_mtx); #ifdef DIAGNOSTIC if (inp == NULL && in_pcbnotifymiss) { printf("%s: faddr= fport=%d laddr= lport=%d rdom=%u\n", @@ -574,6 +588,7 @@ in6_pcblookup_listen(struct inpcbtable *table, struct in6_addr *laddr, #endif rdomain = rtable_l2(rtable); + mtx_enter(&table->inpt_mtx); head = in6_pcbhash(table, rdomain, &zeroin6_addr, 0, key1, lport); LIST_FOREACH(inp, head, inp_hash) { if (!(inp->inp_flags & INP_IPV6)) @@ -606,6 +621,7 @@ in6_pcblookup_listen(struct inpcbtable *table, struct in6_addr *laddr, LIST_REMOVE(inp, inp_hash); LIST_INSERT_HEAD(head, inp, inp_hash); } + mtx_leave(&table->inpt_mtx); #ifdef DIAGNOSTIC if (inp == NULL && in_pcbnotifymiss) { printf("%s: laddr= lport=%d rdom=%u\n", diff --git a/sys/netinet6/raw_ip6.c b/sys/netinet6/raw_ip6.c index 6b588d69450..b5bb3bf6907 100644 --- a/sys/netinet6/raw_ip6.c +++ b/sys/netinet6/raw_ip6.c @@ -1,4 +1,4 @@ -/* $OpenBSD: raw_ip6.c,v 1.144 2022/03/14 22:38:43 tb Exp $ */ +/* $OpenBSD: raw_ip6.c,v 1.145 2022/03/21 09:12:34 bluhm Exp $ */ /* $KAME: raw_ip6.c,v 1.69 2001/03/04 15:55:44 itojun Exp $ */ /* @@ -157,6 +157,7 @@ rip6_input(struct mbuf **mp, int *offp, int proto, int af) } #endif NET_ASSERT_LOCKED(); + mtx_enter(&rawin6pcbtable.inpt_mtx); TAILQ_FOREACH(in6p, &rawin6pcbtable.inpt_queue, inp_queue) { if (in6p->inp_socket->so_state & SS_CANTRCVMORE) continue; @@ -180,8 +181,10 @@ rip6_input(struct mbuf **mp, int *offp, int proto, int af) IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, *offp, sizeof(*icmp6)); - if (icmp6 == NULL) + if (icmp6 == NULL) { + mtx_leave(&rawin6pcbtable.inpt_mtx); return IPPROTO_DONE; + } if (ICMP6_FILTER_WILLBLOCK(icmp6->icmp6_type, in6p->inp_icmp6filt)) continue; @@ -224,6 +227,8 @@ rip6_input(struct mbuf **mp, int *offp, int proto, int af) } last = in6p; } + mtx_leave(&rawin6pcbtable.inpt_mtx); + if (last) { if (last->inp_flags & IN6P_CONTROLOPTS) ip6_savecontrol(last, m, &opts); -- 2.20.1