Improve handling of ENOBUFS by not running in a tight loop forever.
authorflorian <florian@openbsd.org>
Mon, 11 Sep 2023 06:00:23 +0000 (06:00 +0000)
committerflorian <florian@openbsd.org>
Mon, 11 Sep 2023 06:00:23 +0000 (06:00 +0000)
This cherry-picks upstream git commit
0ee44ef384593ed0382d1ce6048d5a9c9440b45c.

Issue reported by landry@, which could be traced back to a buggy WiFi
driver that would constantly return ENOBUFS. This in turn lead
unwind(8) enter a tight loop through poll / sendto without making any
progress and burning 100% cpu.

OK sthen

sbin/unwind/resolver.c
usr.sbin/unbound/doc/Changelog
usr.sbin/unbound/util/netevent.c

index 98c5f47..9cd19b5 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: resolver.c,v 1.160 2023/04/18 09:57:51 florian Exp $  */
+/*     $OpenBSD: resolver.c,v 1.161 2023/09/11 06:00:23 florian Exp $  */
 
 
 /*
@@ -920,6 +920,8 @@ resolve_done(struct uw_resolver *res, void *arg, int rcode,
        uint8_t                 *p, *data;
        uint8_t                  answer_imsg[MAX_IMSGSIZE - IMSG_HEADER_SIZE];
 
+       log_debug("%s: %d", __func__, rcode);
+
        clock_gettime(CLOCK_MONOTONIC, &tp);
 
        query_imsg = (struct query_imsg *)arg;
@@ -1074,14 +1076,17 @@ resolve_done(struct uw_resolver *res, void *arg, int rcode,
        goto out;
 
  servfail:
+       log_debug("%s: foo: 1", __func__);
        /* try_next_resolver() might free rq */
        if (try_next_resolver(rq) != 0 && running_res == 0) {
                /* we are the last one, send SERVFAIL */
                answer_header->srvfail = 1;
+               log_debug("%s: foo: 2", __func__);
                resolver_imsg_compose_frontend(IMSG_ANSWER, 0,
                    answer_imsg, sizeof(*answer_header));
        }
  out:
+       log_debug("%s: foo: 3", __func__);
        free(query_imsg);
        sldns_buffer_free(buf);
        regional_destroy(region);
index a7c9c40..be48561 100644 (file)
@@ -1,3 +1,8 @@
+8 September 2023: Wouter
+       - Fix send of udp retries when ENOBUFS is returned. It stops looping
+         and also waits for the condition to go away. Reported by Florian
+         Obser.
+
 25 August 2023: Wouter
        - Fix compile error on NetBSD in util/netevent.h.
 
index 204e488..b9395a8 100644 (file)
 
 /** timeout in millisec to wait for write to unblock, packets dropped after.*/
 #define SEND_BLOCKED_WAIT_TIMEOUT 200
+/** max number of times to wait for write to unblock, packets dropped after.*/
+#define SEND_BLOCKED_MAX_RETRY 5
 
 /** Let's make timestamping code cleaner and redefine SO_TIMESTAMP* */
 #ifndef SO_TIMESTAMP
@@ -402,9 +404,10 @@ comm_point_send_udp_msg(struct comm_point *c, sldns_buffer* packet,
                        WSAGetLastError() == WSAENOBUFS ||
                        WSAGetLastError() == WSAEWOULDBLOCK) {
 #endif
+                       int retries = 0;
                        /* if we set the fd blocking, other threads suddenly
                         * have a blocking fd that they operate on */
-                       while(sent == -1 && (
+                       while(sent == -1 && retries < SEND_BLOCKED_MAX_RETRY && (
 #ifndef USE_WINSOCK
                                errno == EAGAIN || errno == EINTR ||
 #  ifdef EWOULDBLOCK
@@ -419,6 +422,13 @@ comm_point_send_udp_msg(struct comm_point *c, sldns_buffer* packet,
 #endif
                        )) {
 #if defined(HAVE_POLL) || defined(USE_WINSOCK)
+                               int send_nobufs = (
+#ifndef USE_WINSOCK
+                                       errno == ENOBUFS
+#else
+                                       WSAGetLastError() == WSAENOBUFS
+#endif
+                               );
                                struct pollfd p;
                                int pret;
                                memset(&p, 0, sizeof(p));
@@ -457,8 +467,48 @@ comm_point_send_udp_msg(struct comm_point *c, sldns_buffer* packet,
                                        log_err("poll udp out failed: %s",
                                                sock_strerror(errno));
                                        return 0;
+                               } else if((pret < 0 &&
+#ifndef USE_WINSOCK
+                                       errno == ENOBUFS
+#else
+                                       WSAGetLastError() == WSAENOBUFS
+#endif
+                                       ) || (send_nobufs && retries > 0)) {
+                                       /* ENOBUFS, and poll returned without
+                                        * a timeout. Or the retried send call
+                                        * returned ENOBUFS. It is good to
+                                        * wait a bit for the error to clear. */
+                                       /* The timeout is 20*(2^(retries+1)),
+                                        * it increases exponentially, starting
+                                        * at 40 msec. After 5 tries, 1240 msec
+                                        * have passed in total, when poll
+                                        * returned the error, and 1200 msec
+                                        * when send returned the errors. */
+#ifndef USE_WINSOCK
+                                       pret = poll(NULL, 0, (SEND_BLOCKED_WAIT_TIMEOUT/10)<<(retries+1));
+#else
+                                       pret = WSAPoll(NULL, 0, (SEND_BLOCKED_WAIT_TIMEOUT/10)<<(retries+1));
+#endif
+                                       if(pret < 0 &&
+#ifndef USE_WINSOCK
+                                               errno != EAGAIN && errno != EINTR &&
+#  ifdef EWOULDBLOCK
+                                               errno != EWOULDBLOCK &&
+#  endif
+                                               errno != ENOBUFS
+#else
+                                               WSAGetLastError() != WSAEINPROGRESS &&
+                                               WSAGetLastError() != WSAEINTR &&
+                                               WSAGetLastError() != WSAENOBUFS &&
+                                               WSAGetLastError() != WSAEWOULDBLOCK
+#endif
+                                       ) {
+                                               log_err("poll udp out timer failed: %s",
+                                                       sock_strerror(errno));
+                                       }
                                }
 #endif /* defined(HAVE_POLL) || defined(USE_WINSOCK) */
+                               retries++;
                                if (!is_connected) {
                                        sent = sendto(c->fd, (void*)sldns_buffer_begin(packet),
                                                sldns_buffer_remaining(packet), 0,
@@ -665,7 +715,8 @@ comm_point_send_udp_msg_if(struct comm_point *c, sldns_buffer* packet,
                        WSAGetLastError() == WSAENOBUFS ||
                        WSAGetLastError() == WSAEWOULDBLOCK) {
 #endif
-                       while(sent == -1 && (
+                       int retries = 0;
+                       while(sent == -1 && retries < SEND_BLOCKED_MAX_RETRY && (
 #ifndef USE_WINSOCK
                                errno == EAGAIN || errno == EINTR ||
 #  ifdef EWOULDBLOCK
@@ -680,6 +731,13 @@ comm_point_send_udp_msg_if(struct comm_point *c, sldns_buffer* packet,
 #endif
                        )) {
 #if defined(HAVE_POLL) || defined(USE_WINSOCK)
+                               int send_nobufs = (
+#ifndef USE_WINSOCK
+                                       errno == ENOBUFS
+#else
+                                       WSAGetLastError() == WSAENOBUFS
+#endif
+                               );
                                struct pollfd p;
                                int pret;
                                memset(&p, 0, sizeof(p));
@@ -718,8 +776,48 @@ comm_point_send_udp_msg_if(struct comm_point *c, sldns_buffer* packet,
                                        log_err("poll udp out failed: %s",
                                                sock_strerror(errno));
                                        return 0;
+                               } else if((pret < 0 &&
+#ifndef USE_WINSOCK
+                                       errno == ENOBUFS
+#else
+                                       WSAGetLastError() == WSAENOBUFS
+#endif
+                                       ) || (send_nobufs && retries > 0)) {
+                                       /* ENOBUFS, and poll returned without
+                                        * a timeout. Or the retried send call
+                                        * returned ENOBUFS. It is good to
+                                        * wait a bit for the error to clear. */
+                                       /* The timeout is 20*(2^(retries+1)),
+                                        * it increases exponentially, starting
+                                        * at 40 msec. After 5 tries, 1240 msec
+                                        * have passed in total, when poll
+                                        * returned the error, and 1200 msec
+                                        * when send returned the errors. */
+#ifndef USE_WINSOCK
+                                       pret = poll(NULL, 0, (SEND_BLOCKED_WAIT_TIMEOUT/10)<<(retries+1));
+#else
+                                       pret = WSAPoll(NULL, 0, (SEND_BLOCKED_WAIT_TIMEOUT/10)<<(retries+1));
+#endif
+                                       if(pret < 0 &&
+#ifndef USE_WINSOCK
+                                               errno != EAGAIN && errno != EINTR &&
+#  ifdef EWOULDBLOCK
+                                               errno != EWOULDBLOCK &&
+#  endif
+                                               errno != ENOBUFS
+#else
+                                               WSAGetLastError() != WSAEINPROGRESS &&
+                                               WSAGetLastError() != WSAEINTR &&
+                                               WSAGetLastError() != WSAENOBUFS &&
+                                               WSAGetLastError() != WSAEWOULDBLOCK
+#endif
+                                       ) {
+                                               log_err("poll udp out timer failed: %s",
+                                                       sock_strerror(errno));
+                                       }
                                }
 #endif /* defined(HAVE_POLL) || defined(USE_WINSOCK) */
+                               retries++;
                                sent = sendmsg(c->fd, &msg, 0);
                        }
                }