From: dlg Date: Fri, 15 Dec 2017 01:37:30 +0000 (+0000) Subject: add ifiqueues for mp safety and nics with multiple rx rings. X-Git-Url: http://artulab.com/gitweb/?a=commitdiff_plain;h=ea47e5e1b6d67781cc4f61bd0240820384903832;p=openbsd add ifiqueues for mp safety and nics with multiple rx rings. currently there is a single mbuf_queue per interface, which all rings on a nic shove packets onto. while the list inside this queue is protected by a mutex, the counters around it (ie, ipackets, ibytes, idrops) are not. this means updates can be lost, and reading the statistics is also inconsistent. having a single queue means that busy rx rings can dominate and then starve the others. ifiqueue structs are like ifqueue structs. they provide per ring queues, and independent counters for each ring. when ifdata is read for userland, these counters are aggregated. having a queue per ring now allows for per ring backpressure to be applied. MCLGETI will have it's day again. right now we assume every interface wants an input queue and unconditionally provide one. individual interfaces can opt into more. im not completely happy about the shape of this atm, but shuffling it around more makes the diff bigger. ok visa@ --- diff --git a/sys/net/if.c b/sys/net/if.c index f46f98eb213..f7a2275d8b5 100644 --- a/sys/net/if.c +++ b/sys/net/if.c @@ -1,4 +1,4 @@ -/* $OpenBSD: if.c,v 1.530 2017/11/20 10:16:25 mpi Exp $ */ +/* $OpenBSD: if.c,v 1.531 2017/12/15 01:37:30 dlg Exp $ */ /* $NetBSD: if.c,v 1.35 1996/05/07 05:26:04 thorpej Exp $ */ /* @@ -156,7 +156,6 @@ int if_group_egress_build(void); void if_watchdog_task(void *); -void if_input_process(void *); void if_netisr(void *); #ifdef DDB @@ -437,8 +436,6 @@ if_attachsetup(struct ifnet *ifp) ifidx = ifp->if_index; - mq_init(&ifp->if_inputqueue, 8192, IPL_NET); - task_set(ifp->if_inputtask, if_input_process, (void *)ifidx); task_set(ifp->if_watchdogtask, if_watchdog_task, (void *)ifidx); task_set(ifp->if_linkstatetask, if_linkstate_task, (void *)ifidx); @@ -562,6 +559,30 @@ if_attach_queues(struct ifnet *ifp, unsigned int nqs) ifp->if_nifqs = nqs; } +void +if_attach_iqueues(struct ifnet *ifp, unsigned int niqs) +{ + struct ifiqueue **map; + struct ifiqueue *ifiq; + unsigned int i; + + KASSERT(niqs != 0); + + map = mallocarray(niqs, sizeof(*map), M_DEVBUF, M_WAITOK); + + ifp->if_rcv.ifiq_softc = NULL; + map[0] = &ifp->if_rcv; + + for (i = 1; i < niqs; i++) { + ifiq = malloc(sizeof(*ifiq), M_DEVBUF, M_WAITOK|M_ZERO); + ifiq_init(ifiq, ifp, i); + map[i] = ifiq; + } + + ifp->if_iqs = map; + ifp->if_niqs = niqs; +} + void if_attach_common(struct ifnet *ifp) { @@ -587,6 +608,12 @@ if_attach_common(struct ifnet *ifp) ifp->if_ifqs = ifp->if_snd.ifq_ifqs; ifp->if_nifqs = 1; + ifiq_init(&ifp->if_rcv, ifp, 0); + + ifp->if_rcv.ifiq_ifiqs[0] = &ifp->if_rcv; + ifp->if_iqs = ifp->if_rcv.ifiq_ifiqs; + ifp->if_niqs = 1; + ifp->if_addrhooks = malloc(sizeof(*ifp->if_addrhooks), M_TEMP, M_WAITOK); TAILQ_INIT(ifp->if_addrhooks); @@ -605,8 +632,6 @@ if_attach_common(struct ifnet *ifp) M_TEMP, M_WAITOK|M_ZERO); ifp->if_linkstatetask = malloc(sizeof(*ifp->if_linkstatetask), M_TEMP, M_WAITOK|M_ZERO); - ifp->if_inputtask = malloc(sizeof(*ifp->if_inputtask), - M_TEMP, M_WAITOK|M_ZERO); ifp->if_llprio = IFQ_DEFPRIO; SRPL_INIT(&ifp->if_inputs); @@ -694,47 +719,7 @@ if_enqueue(struct ifnet *ifp, struct mbuf *m) void if_input(struct ifnet *ifp, struct mbuf_list *ml) { - struct mbuf *m; - size_t ibytes = 0; -#if NBPFILTER > 0 - caddr_t if_bpf; -#endif - - if (ml_empty(ml)) - return; - - MBUF_LIST_FOREACH(ml, m) { - m->m_pkthdr.ph_ifidx = ifp->if_index; - m->m_pkthdr.ph_rtableid = ifp->if_rdomain; - ibytes += m->m_pkthdr.len; - } - - ifp->if_ipackets += ml_len(ml); - ifp->if_ibytes += ibytes; - -#if NBPFILTER > 0 - if_bpf = ifp->if_bpf; - if (if_bpf) { - struct mbuf_list ml0; - - ml_init(&ml0); - ml_enlist(&ml0, ml); - ml_init(ml); - - while ((m = ml_dequeue(&ml0)) != NULL) { - if (bpf_mtap_ether(if_bpf, m, BPF_DIRECTION_IN)) - m_freem(m); - else - ml_enqueue(ml, m); - } - - if (ml_empty(ml)) - return; - } -#endif - - if (mq_enlist(&ifp->if_inputqueue, ml) == 0) - task_add(net_tq(ifp->if_index), ifp->if_inputtask); + ifiq_input(&ifp->if_rcv, ml, 2048); } int @@ -789,6 +774,24 @@ if_input_local(struct ifnet *ifp, struct mbuf *m, sa_family_t af) return (0); } +int +if_output_local(struct ifnet *ifp, struct mbuf *m, sa_family_t af) +{ + struct ifiqueue *ifiq; + unsigned int flow = 0; + + m->m_pkthdr.ph_family = af; + m->m_pkthdr.ph_ifidx = ifp->if_index; + m->m_pkthdr.ph_rtableid = ifp->if_rdomain; + + if (ISSET(m->m_pkthdr.ph_flowid, M_FLOWID_VALID)) + flow = m->m_pkthdr.ph_flowid & M_FLOWID_MASK; + + ifiq = ifp->if_iqs[flow % ifp->if_niqs]; + + return (ifiq_enqueue(ifiq, m) == 0 ? 0 : ENOBUFS); +} + struct ifih { SRPL_ENTRY(ifih) ifih_next; int (*ifih_input)(struct ifnet *, struct mbuf *, @@ -873,26 +876,18 @@ if_ih_remove(struct ifnet *ifp, int (*input)(struct ifnet *, struct mbuf *, } void -if_input_process(void *xifidx) +if_input_process(struct ifnet *ifp, struct mbuf_list *ml) { - unsigned int ifidx = (unsigned long)xifidx; - struct mbuf_list ml; struct mbuf *m; - struct ifnet *ifp; struct ifih *ifih; struct srp_ref sr; int s; - ifp = if_get(ifidx); - if (ifp == NULL) + if (ml_empty(ml)) return; - mq_delist(&ifp->if_inputqueue, &ml); - if (ml_empty(&ml)) - goto out; - if (!ISSET(ifp->if_xflags, IFXF_CLONED)) - add_net_randomness(ml_len(&ml)); + add_net_randomness(ml_len(ml)); /* * We grab the NET_LOCK() before processing any packet to @@ -908,7 +903,7 @@ if_input_process(void *xifidx) */ NET_RLOCK(); s = splnet(); - while ((m = ml_dequeue(&ml)) != NULL) { + while ((m = ml_dequeue(ml)) != NULL) { /* * Pass this mbuf to all input handlers of its * interface until it is consumed. @@ -924,8 +919,6 @@ if_input_process(void *xifidx) } splx(s); NET_RUNLOCK(); -out: - if_put(ifp); } void @@ -1033,10 +1026,6 @@ if_detach(struct ifnet *ifp) ifp->if_ioctl = if_detached_ioctl; ifp->if_watchdog = NULL; - /* Remove the input task */ - task_del(net_tq(ifp->if_index), ifp->if_inputtask); - mq_purge(&ifp->if_inputqueue); - /* Remove the watchdog timeout & task */ timeout_del(ifp->if_slowtimo); task_del(net_tq(ifp->if_index), ifp->if_watchdogtask); @@ -1090,7 +1079,6 @@ if_detach(struct ifnet *ifp) free(ifp->if_slowtimo, M_TEMP, sizeof(*ifp->if_slowtimo)); free(ifp->if_watchdogtask, M_TEMP, sizeof(*ifp->if_watchdogtask)); free(ifp->if_linkstatetask, M_TEMP, sizeof(*ifp->if_linkstatetask)); - free(ifp->if_inputtask, M_TEMP, sizeof(*ifp->if_inputtask)); for (i = 0; (dp = domains[i]) != NULL; i++) { if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family]) @@ -1113,6 +1101,17 @@ if_detach(struct ifnet *ifp) free(ifp->if_ifqs, M_DEVBUF, sizeof(struct ifqueue *) * ifp->if_nifqs); } + + for (i = 0; i < ifp->if_niqs; i++) + ifiq_destroy(ifp->if_iqs[i]); + if (ifp->if_iqs != ifp->if_rcv.ifiq_ifiqs) { + for (i = 1; i < ifp->if_niqs; i++) { + free(ifp->if_iqs[i], M_DEVBUF, + sizeof(struct ifiqueue)); + } + free(ifp->if_iqs, M_DEVBUF, + sizeof(struct ifiqueue *) * ifp->if_niqs); + } } /* @@ -2315,6 +2314,12 @@ if_getdata(struct ifnet *ifp, struct if_data *data) ifq_add_data(ifq, data); } + + for (i = 0; i < ifp->if_niqs; i++) { + struct ifiqueue *ifiq = ifp->if_iqs[i]; + + ifiq_add_data(ifiq, data); + } } /* diff --git a/sys/net/if_loop.c b/sys/net/if_loop.c index 5f7b1cec165..66ca33c92d7 100644 --- a/sys/net/if_loop.c +++ b/sys/net/if_loop.c @@ -1,4 +1,4 @@ -/* $OpenBSD: if_loop.c,v 1.83 2017/10/31 22:05:12 sashan Exp $ */ +/* $OpenBSD: if_loop.c,v 1.84 2017/12/15 01:37:30 dlg Exp $ */ /* $NetBSD: if_loop.c,v 1.15 1996/05/07 02:40:33 thorpej Exp $ */ /* @@ -241,12 +241,7 @@ looutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, if ((m->m_flags & M_LOOP) == 0) return (if_input_local(ifp, m, dst->sa_family)); - m->m_pkthdr.ph_family = dst->sa_family; - if (mq_enqueue(&ifp->if_inputqueue, m)) - return ENOBUFS; - task_add(net_tq(ifp->if_index), ifp->if_inputtask); - - return (0); + return (if_output_local(ifp, m, dst->sa_family)); } void diff --git a/sys/net/if_var.h b/sys/net/if_var.h index bedb61bb81d..65ce87be4d3 100644 --- a/sys/net/if_var.h +++ b/sys/net/if_var.h @@ -1,4 +1,4 @@ -/* $OpenBSD: if_var.h,v 1.84 2017/11/17 03:51:32 dlg Exp $ */ +/* $OpenBSD: if_var.h,v 1.85 2017/12/15 01:37:30 dlg Exp $ */ /* $NetBSD: if.h,v 1.23 1996/05/07 02:40:27 thorpej Exp $ */ /* @@ -140,8 +140,6 @@ struct ifnet { /* and the entries */ struct task *if_linkstatetask; /* task to do route updates */ /* procedure handles */ - struct mbuf_queue if_inputqueue; - struct task *if_inputtask; /* input task */ SRPL_HEAD(, ifih) if_inputs; /* input routines (dequeue) */ /* output routine (enqueue) */ @@ -164,6 +162,10 @@ struct ifnet { /* and the entries */ void (*if_qstart)(struct ifqueue *); unsigned int if_nifqs; + struct ifiqueue if_rcv; /* rx/input queue */ + struct ifiqueue **if_iqs; /* pointer to the array of iqs */ + unsigned int if_niqs; + struct sockaddr_dl *if_sadl; /* pointer to our sockaddr_dl */ void *if_afdata[AF_MAX]; @@ -303,7 +305,9 @@ void if_start(struct ifnet *); int if_enqueue_try(struct ifnet *, struct mbuf *); int if_enqueue(struct ifnet *, struct mbuf *); void if_input(struct ifnet *, struct mbuf_list *); +void if_input_process(struct ifnet *, struct mbuf_list *); int if_input_local(struct ifnet *, struct mbuf *, sa_family_t); +int if_output_local(struct ifnet *, struct mbuf *, sa_family_t); void if_rtrequest_dummy(struct ifnet *, int, struct rtentry *); void p2p_rtrequest(struct ifnet *, int, struct rtentry *); diff --git a/sys/net/ifq.c b/sys/net/ifq.c index 3ad81b4ba66..2ab11a9c1e1 100644 --- a/sys/net/ifq.c +++ b/sys/net/ifq.c @@ -1,4 +1,4 @@ -/* $OpenBSD: ifq.c,v 1.17 2017/12/14 02:40:51 dlg Exp $ */ +/* $OpenBSD: ifq.c,v 1.18 2017/12/15 01:37:30 dlg Exp $ */ /* * Copyright (c) 2015 David Gwynne @@ -16,6 +16,8 @@ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ +#include "bpfilter.h" + #include #include #include @@ -25,6 +27,10 @@ #include #include +#if NBPFILTER > 0 +#include +#endif + /* * priq glue */ @@ -413,6 +419,164 @@ ifq_mfreeml(struct ifqueue *ifq, struct mbuf_list *ml) ml_enlist(&ifq->ifq_free, ml); } +/* + * ifiq + */ + +static void ifiq_process(void *); + +void +ifiq_init(struct ifiqueue *ifiq, struct ifnet *ifp, unsigned int idx) +{ + ifiq->ifiq_if = ifp; + ifiq->ifiq_softnet = net_tq(ifp->if_index); /* + idx */ + ifiq->ifiq_softc = NULL; + + mtx_init(&ifiq->ifiq_mtx, IPL_NET); + ml_init(&ifiq->ifiq_ml); + task_set(&ifiq->ifiq_task, ifiq_process, ifiq); + + ifiq->ifiq_qdrops = 0; + ifiq->ifiq_packets = 0; + ifiq->ifiq_bytes = 0; + ifiq->ifiq_qdrops = 0; + ifiq->ifiq_errors = 0; + + ifiq->ifiq_idx = idx; +} + +void +ifiq_destroy(struct ifiqueue *ifiq) +{ + if (!task_del(ifiq->ifiq_softnet, &ifiq->ifiq_task)) { + int netlocked = (rw_status(&netlock) == RW_WRITE); + + if (netlocked) /* XXXSMP breaks atomicity */ + NET_UNLOCK(); + + taskq_barrier(ifiq->ifiq_softnet); + + if (netlocked) + NET_LOCK(); + } + + /* don't need to lock because this is the last use of the ifiq */ + ml_purge(&ifiq->ifiq_ml); +} + +int +ifiq_input(struct ifiqueue *ifiq, struct mbuf_list *ml, unsigned int cwm) +{ + struct ifnet *ifp = ifiq->ifiq_if; + struct mbuf *m; + uint64_t packets; + uint64_t bytes = 0; +#if NBPFILTER > 0 + caddr_t if_bpf; +#endif + int rv = 1; + + if (ml_empty(ml)) + return (0); + + MBUF_LIST_FOREACH(ml, m) { + m->m_pkthdr.ph_ifidx = ifp->if_index; + m->m_pkthdr.ph_rtableid = ifp->if_rdomain; + bytes += m->m_pkthdr.len; + } + packets = ml_len(ml); + +#if NBPFILTER > 0 + if_bpf = ifp->if_bpf; + if (if_bpf) { + struct mbuf_list ml0 = *ml; + + ml_init(ml); + + while ((m = ml_dequeue(&ml0)) != NULL) { + if (bpf_mtap_ether(if_bpf, m, BPF_DIRECTION_IN)) + m_freem(m); + else + ml_enqueue(ml, m); + } + + if (ml_empty(ml)) { + mtx_enter(&ifiq->ifiq_mtx); + ifiq->ifiq_packets += packets; + ifiq->ifiq_bytes += bytes; + mtx_leave(&ifiq->ifiq_mtx); + + return (0); + } + } +#endif + + mtx_enter(&ifiq->ifiq_mtx); + ifiq->ifiq_packets += packets; + ifiq->ifiq_bytes += bytes; + + if (ifiq_len(ifiq) >= cwm * 5) + ifiq->ifiq_qdrops += ml_len(ml); + else { + rv = (ifiq_len(ifiq) >= cwm * 3); + ml_enlist(&ifiq->ifiq_ml, ml); + } + mtx_leave(&ifiq->ifiq_mtx); + + if (ml_empty(ml)) + task_add(ifiq->ifiq_softnet, &ifiq->ifiq_task); + else + ml_purge(ml); + + return (rv); +} + +void +ifiq_add_data(struct ifiqueue *ifiq, struct if_data *data) +{ + mtx_enter(&ifiq->ifiq_mtx); + data->ifi_ipackets += ifiq->ifiq_packets; + data->ifi_ibytes += ifiq->ifiq_bytes; + data->ifi_iqdrops += ifiq->ifiq_qdrops; + mtx_leave(&ifiq->ifiq_mtx); +} + +void +ifiq_barrier(struct ifiqueue *ifiq) +{ + if (!task_del(ifiq->ifiq_softnet, &ifiq->ifiq_task)) + taskq_barrier(ifiq->ifiq_softnet); +} + +int +ifiq_enqueue(struct ifiqueue *ifiq, struct mbuf *m) +{ + mtx_enter(&ifiq->ifiq_mtx); + ml_enqueue(&ifiq->ifiq_ml, m); + mtx_leave(&ifiq->ifiq_mtx); + + task_add(ifiq->ifiq_softnet, &ifiq->ifiq_task); + + return (0); +} + +static void +ifiq_process(void *arg) +{ + struct ifiqueue *ifiq = arg; + struct mbuf_list ml; + + if (ifiq_empty(ifiq)) + return; + + mtx_enter(&ifiq->ifiq_mtx); + ml = ifiq->ifiq_ml; + ml_init(&ifiq->ifiq_ml); + mtx_leave(&ifiq->ifiq_mtx); + + if_input_process(ifiq->ifiq_if, &ml); +} + /* * priq implementation */ diff --git a/sys/net/ifq.h b/sys/net/ifq.h index b558cdcc59a..f8f520df4a2 100644 --- a/sys/net/ifq.h +++ b/sys/net/ifq.h @@ -1,4 +1,4 @@ -/* $OpenBSD: ifq.h,v 1.16 2017/11/14 08:44:11 dlg Exp $ */ +/* $OpenBSD: ifq.h,v 1.17 2017/12/15 01:37:30 dlg Exp $ */ /* * Copyright (c) 2015 David Gwynne @@ -67,6 +67,32 @@ struct ifqueue { unsigned int ifq_idx; }; +struct ifiqueue { + struct ifnet *ifiq_if; + struct taskq *ifiq_softnet; + union { + void *_ifiq_softc; + struct ifiqueue *_ifiq_ifiqs[1]; + } _ifiq_ptr; +#define ifiq_softc _ifiq_ptr._ifiq_softc +#define ifiq_ifiqs _ifiq_ptr._ifiq_ifiqs + + struct mutex ifiq_mtx; + struct mbuf_list ifiq_ml; + struct task ifiq_task; + + /* counters */ + uint64_t ifiq_packets; + uint64_t ifiq_bytes; + uint64_t ifiq_qdrops; + uint64_t ifiq_errors; + uint64_t ifiq_mcasts; + uint64_t ifiq_noproto; + + /* properties */ + unsigned int ifiq_idx; +}; + #ifdef _KERNEL #define IFQ_MAXLEN 256 @@ -436,6 +462,18 @@ ifq_idx(struct ifqueue *ifq, unsigned int nifqs, const struct mbuf *m) extern const struct ifq_ops * const ifq_priq_ops; +/* ifiq */ + +void ifiq_init(struct ifiqueue *, struct ifnet *, unsigned int); +void ifiq_destroy(struct ifiqueue *); +int ifiq_input(struct ifiqueue *, struct mbuf_list *, + unsigned int); +int ifiq_enqueue(struct ifiqueue *, struct mbuf *); +void ifiq_add_data(struct ifiqueue *, struct if_data *); + +#define ifiq_len(_ifiq) ml_len(&(_ifiq)->ifiq_ml) +#define ifiq_empty(_ifiq) ml_empty(&(_ifiq)->ifiq_ml) + #endif /* _KERNEL */ #endif /* _NET_IFQ_H_ */