-/* $OpenBSD: tcp_input.c,v 1.391 2023/09/03 21:37:17 bluhm Exp $ */
+/* $OpenBSD: tcp_input.c,v 1.392 2023/11/16 18:27:48 bluhm Exp $ */
/* $NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $ */
/*
* state for SYN_RECEIVED.
*/
+/*
+ * Locks used to protect global data and struct members:
+ * N net lock
+ * S syn_cache_mtx tcp syn cache global mutex
+ */
+
/* syn hash parameters */
-int tcp_syn_hash_size = TCP_SYN_HASH_SIZE;
-int tcp_syn_cache_limit = TCP_SYN_HASH_SIZE*TCP_SYN_BUCKET_SIZE;
-int tcp_syn_bucket_limit = 3*TCP_SYN_BUCKET_SIZE;
-int tcp_syn_use_limit = 100000;
+int tcp_syn_hash_size = TCP_SYN_HASH_SIZE; /* [N] size of hash table */
+int tcp_syn_cache_limit = /* [N] global entry limit */
+ TCP_SYN_HASH_SIZE * TCP_SYN_BUCKET_SIZE;
+int tcp_syn_bucket_limit = /* [N] per bucket limit */
+ 3 * TCP_SYN_BUCKET_SIZE;
+int tcp_syn_use_limit = 100000;	/* [N] reseed after this many uses */
struct pool syn_cache_pool;
struct syn_cache_set tcp_syn_cache[2];
int tcp_syn_cache_active;
+struct mutex syn_cache_mtx = MUTEX_INITIALIZER(IPL_SOFTNET);
#define SYN_HASH(sa, sp, dp, rand) \
(((sa)->s_addr ^ (rand)[0]) * \
void
syn_cache_rm(struct syn_cache *sc)
{
- sc->sc_flags |= SCF_DEAD;
+ MUTEX_ASSERT_LOCKED(&syn_cache_mtx);
+
+ KASSERT(!ISSET(sc->sc_dynflags, SCF_DEAD));
+ SET(sc->sc_dynflags, SCF_DEAD);
TAILQ_REMOVE(&sc->sc_buckethead->sch_bucket, sc, sc_bucketq);
sc->sc_tp = NULL;
LIST_REMOVE(sc, sc_tpq);
if (refcnt_rele(&sc->sc_refcnt) == 0)
return;
+	/* We hold the last reference, so no lock is needed. */
m_free(sc->sc_ipopts);
- if (sc->sc_route4.ro_rt != NULL) {
- rtfree(sc->sc_route4.ro_rt);
- sc->sc_route4.ro_rt = NULL;
- }
+ rtfree(sc->sc_route4.ro_rt);
+
pool_put(&syn_cache_pool, sc);
}
int i;
NET_ASSERT_LOCKED();
+ MUTEX_ASSERT_LOCKED(&syn_cache_mtx);
/*
* If there are no entries in the hash table, reinitialize
uint64_t now;
int lastref;
- NET_LOCK();
- if (sc->sc_flags & SCF_DEAD)
+ mtx_enter(&syn_cache_mtx);
+ if (ISSET(sc->sc_dynflags, SCF_DEAD))
goto freeit;
- now = tcp_now();
-
if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) {
/* Drop it -- too many retransmissions. */
goto dropit;
if (sc->sc_rxttot >= tcptv_keep_init)
goto dropit;
- tcpstat_inc(tcps_sc_retransmitted);
- (void) syn_cache_respond(sc, NULL, now);
-
/* Advance the timer back-off. */
sc->sc_rxtshift++;
TCPT_RANGESET(sc->sc_rxtcur,
TCPTV_SRTTDFLT * tcp_backoff[sc->sc_rxtshift], TCPTV_MIN,
TCPTV_REXMTMAX);
- if (!timeout_add_msec(&sc->sc_timer, sc->sc_rxtcur))
- syn_cache_put(sc);
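+	/* Hold a reference for the rearmed timeout; the handler drops it. */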
+ if (timeout_add_msec(&sc->sc_timer, sc->sc_rxtcur))
+ refcnt_take(&sc->sc_refcnt);
+ mtx_leave(&syn_cache_mtx);
+ NET_LOCK();
+ now = tcp_now();
+ (void) syn_cache_respond(sc, NULL, now);
+ tcpstat_inc(tcps_sc_retransmitted);
NET_UNLOCK();
+
+ syn_cache_put(sc);
return;
dropit:
KASSERT(lastref == 0);
(void)lastref;
freeit:
+ mtx_leave(&syn_cache_mtx);
syn_cache_put(sc);
- NET_UNLOCK();
}
/*
NET_ASSERT_LOCKED();
+ mtx_enter(&syn_cache_mtx);
LIST_FOREACH_SAFE(sc, &tp->t_sc, sc_tpq, nsc) {
#ifdef DIAGNOSTIC
if (sc->sc_tp != tp)
syn_cache_rm(sc);
syn_cache_put(sc);
}
- /* just for safety */
- LIST_INIT(&tp->t_sc);
+ mtx_leave(&syn_cache_mtx);
+
+ KASSERT(LIST_EMPTY(&tp->t_sc));
}
/*
int i;
NET_ASSERT_LOCKED();
+ MUTEX_ASSERT_LOCKED(&syn_cache_mtx);
/* Check the active cache first, the passive cache is likely empty. */
sets[0] = &tcp_syn_cache[tcp_syn_cache_active];
NET_ASSERT_LOCKED();
+ mtx_enter(&syn_cache_mtx);
sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid);
- if (sc == NULL)
+ if (sc == NULL) {
+ mtx_leave(&syn_cache_mtx);
return (NULL);
+ }
/*
* Verify the sequence and ack numbers. Try getting the correct
if ((th->th_ack != sc->sc_iss + 1) ||
SEQ_LEQ(th->th_seq, sc->sc_irs) ||
SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) {
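+		/* Hold a reference while responding without the mutex. */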
+ refcnt_take(&sc->sc_refcnt);
+ mtx_leave(&syn_cache_mtx);
(void) syn_cache_respond(sc, m, now);
+ syn_cache_put(sc);
return ((struct socket *)(-1));
}
/* Remove this cache entry */
syn_cache_rm(sc);
+ mtx_leave(&syn_cache_mtx);
/*
* Ok, create the full blown connection, and set things up
tp->request_r_scale = sc->sc_request_r_scale;
tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
}
- if (sc->sc_flags & SCF_TIMESTAMP)
+ if (ISSET(sc->sc_fixflags, SCF_TIMESTAMP))
tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
tp->t_template = tcp_template(tp);
so = NULL;
goto abort;
}
- tp->sack_enable = sc->sc_flags & SCF_SACK_PERMIT;
+ tp->sack_enable = ISSET(sc->sc_fixflags, SCF_SACK_PERMIT);
tp->ts_modulate = sc->sc_modulate;
tp->ts_recent = sc->sc_timestamp;
tp->iss = sc->sc_iss;
tcp_sendseqinit(tp);
tp->snd_last = tp->snd_una;
#ifdef TCP_ECN
- if (sc->sc_flags & SCF_ECN_PERMIT) {
+ if (ISSET(sc->sc_fixflags, SCF_ECN_PERMIT)) {
tp->t_flags |= TF_ECN_PERMIT;
tcpstat_inc(tcps_ecn_accepts);
}
#endif
- if (sc->sc_flags & SCF_SACK_PERMIT)
+ if (ISSET(sc->sc_fixflags, SCF_SACK_PERMIT))
tp->t_flags |= TF_SACK_PERMIT;
#ifdef TCP_SIGNATURE
- if (sc->sc_flags & SCF_SIGNATURE)
+ if (ISSET(sc->sc_fixflags, SCF_SIGNATURE))
tp->t_flags |= TF_SIGNATURE;
#endif
tcp_rcvseqinit(tp);
if (sc->sc_peermaxseg)
tcp_mss_update(tp);
/* Reset initial window to 1 segment for retransmit */
- if (sc->sc_rxtshift > 0)
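+	/* sc_rxtshift is [S]; an unlocked snapshot is used here. */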
+ if (READ_ONCE(sc->sc_rxtshift) > 0)
tp->snd_cwnd = tp->t_maxseg;
tp->snd_wl1 = sc->sc_irs;
tp->rcv_up = sc->sc_irs + 1;
NET_ASSERT_LOCKED();
- if ((sc = syn_cache_lookup(src, dst, &scp, rtableid)) == NULL)
+ mtx_enter(&syn_cache_mtx);
+ sc = syn_cache_lookup(src, dst, &scp, rtableid);
+ if (sc == NULL) {
+ mtx_leave(&syn_cache_mtx);
return;
+ }
if (SEQ_LT(th->th_seq, sc->sc_irs) ||
- SEQ_GT(th->th_seq, sc->sc_irs + 1))
+ SEQ_GT(th->th_seq, sc->sc_irs + 1)) {
+ mtx_leave(&syn_cache_mtx);
return;
+ }
syn_cache_rm(sc);
+ mtx_leave(&syn_cache_mtx);
tcpstat_inc(tcps_sc_reset);
syn_cache_put(sc);
}
NET_ASSERT_LOCKED();
- if ((sc = syn_cache_lookup(src, dst, &scp, rtableid)) == NULL)
+ mtx_enter(&syn_cache_mtx);
+ sc = syn_cache_lookup(src, dst, &scp, rtableid);
+ if (sc == NULL) {
+ mtx_leave(&syn_cache_mtx);
return;
+ }
/* If the sequence number != sc_iss, then it's a bogus ICMP msg */
if (ntohl (th->th_seq) != sc->sc_iss) {
+ mtx_leave(&syn_cache_mtx);
return;
}
*
* See tcp_notify().
*/
- if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) {
- sc->sc_flags |= SCF_UNREACH;
+ if (!ISSET(sc->sc_dynflags, SCF_UNREACH) || sc->sc_rxtshift < 3) {
+ SET(sc->sc_dynflags, SCF_UNREACH);
+ mtx_leave(&syn_cache_mtx);
return;
}
syn_cache_rm(sc);
+ mtx_leave(&syn_cache_mtx);
tcpstat_inc(tcps_sc_unreach);
syn_cache_put(sc);
}
struct syn_cache_head *scp;
struct mbuf *ipopts;
+ NET_ASSERT_LOCKED();
+
tp = sototcpcb(so);
/*
* If we do, resend the SYN,ACK. We do not count this
* as a retransmission (XXX though maybe we should).
*/
+ mtx_enter(&syn_cache_mtx);
sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid);
if (sc != NULL) {
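+		/* Hold a reference across the unlocked duplicate SYN handling. */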
+ refcnt_take(&sc->sc_refcnt);
+ mtx_leave(&syn_cache_mtx);
tcpstat_inc(tcps_sc_dupesyn);
if (ipopts) {
/*
tcpstat_inc(tcps_sndacks);
tcpstat_inc(tcps_sndtotal);
}
+ syn_cache_put(sc);
return (0);
}
+ mtx_leave(&syn_cache_mtx);
sc = pool_get(&syn_cache_pool, PR_NOWAIT|PR_ZERO);
if (sc == NULL) {
memcpy(&sc->sc_src, src, src->sa_len);
memcpy(&sc->sc_dst, dst, dst->sa_len);
sc->sc_rtableid = sotoinpcb(so)->inp_rtableid;
- sc->sc_flags = 0;
sc->sc_ipopts = ipopts;
sc->sc_irs = th->th_seq;
sc->sc_timestamp = tb.ts_recent;
if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) ==
(TF_REQ_TSTMP|TF_RCVD_TSTMP)) {
- sc->sc_flags |= SCF_TIMESTAMP;
+ SET(sc->sc_fixflags, SCF_TIMESTAMP);
sc->sc_modulate = arc4random();
}
if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
*/
if (tcp_do_ecn &&
(th->th_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR))
- sc->sc_flags |= SCF_ECN_PERMIT;
+ SET(sc->sc_fixflags, SCF_ECN_PERMIT);
#endif
/*
* Set SCF_SACK_PERMIT if peer did send a SACK_PERMITTED option
* (i.e., if tcp_dooptions() did set TF_SACK_PERMIT).
*/
if (tb.sack_enable && (tb.t_flags & TF_SACK_PERMIT))
- sc->sc_flags |= SCF_SACK_PERMIT;
+ SET(sc->sc_fixflags, SCF_SACK_PERMIT);
#ifdef TCP_SIGNATURE
if (tb.t_flags & TF_SIGNATURE)
- sc->sc_flags |= SCF_SIGNATURE;
+ SET(sc->sc_fixflags, SCF_SIGNATURE);
#endif
sc->sc_tp = tp;
if (syn_cache_respond(sc, m, now) == 0) {
+ mtx_enter(&syn_cache_mtx);
+ /*
+	 * XXXSMP Currently the exclusive netlock prevents another insert
+	 * between our syn_cache_lookup() and this syn_cache_insert().
+	 * A double insert should be handled here instead of relying on
+	 * the netlock.
+ */
syn_cache_insert(sc, tp);
+ mtx_leave(&syn_cache_mtx);
tcpstat_inc(tcps_sndacks);
tcpstat_inc(tcps_sndtotal);
} else {
u_int hlen;
struct inpcb *inp;
+ NET_ASSERT_LOCKED();
+
switch (sc->sc_src.sa.sa_family) {
case AF_INET:
hlen = sizeof(struct ip);
/* Compute the size of the TCP options. */
optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) +
- ((sc->sc_flags & SCF_SACK_PERMIT) ? 4 : 0) +
+ (ISSET(sc->sc_fixflags, SCF_SACK_PERMIT) ? 4 : 0) +
#ifdef TCP_SIGNATURE
- ((sc->sc_flags & SCF_SIGNATURE) ? TCPOLEN_SIGLEN : 0) +
+ (ISSET(sc->sc_fixflags, SCF_SIGNATURE) ? TCPOLEN_SIGLEN : 0) +
#endif
- ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0);
+ (ISSET(sc->sc_fixflags, SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0);
tlen = hlen + sizeof(struct tcphdr) + optlen;
th->th_flags = TH_SYN|TH_ACK;
#ifdef TCP_ECN
/* Set ECE for SYN-ACK if peer supports ECN. */
- if (tcp_do_ecn && (sc->sc_flags & SCF_ECN_PERMIT))
+ if (tcp_do_ecn && ISSET(sc->sc_fixflags, SCF_ECN_PERMIT))
th->th_flags |= TH_ECE;
#endif
th->th_win = htons(sc->sc_win);
*optp++ = sc->sc_ourmaxseg & 0xff;
/* Include SACK_PERMIT_HDR option if peer has already done so. */
- if (sc->sc_flags & SCF_SACK_PERMIT) {
+ if (ISSET(sc->sc_fixflags, SCF_SACK_PERMIT)) {
*((u_int32_t *)optp) = htonl(TCPOPT_SACK_PERMIT_HDR);
optp += 4;
}
optp += 4;
}
- if (sc->sc_flags & SCF_TIMESTAMP) {
+ if (ISSET(sc->sc_fixflags, SCF_TIMESTAMP)) {
u_int32_t *lp = (u_int32_t *)(optp);
/* Form timestamp option as shown in appendix A of RFC 1323. */
*lp++ = htonl(TCPOPT_TSTAMP_HDR);
}
#ifdef TCP_SIGNATURE
- if (sc->sc_flags & SCF_SIGNATURE) {
+ if (ISSET(sc->sc_fixflags, SCF_SIGNATURE)) {
union sockaddr_union src, dst;
struct tdb *tdb;
SET(m->m_pkthdr.csum_flags, M_TCP_CSUM_OUT);
/* use IPsec policy and ttl from listening socket, on SYN ACK */
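+	/* sc_tp is [S]; syn_cache_rm() may clear it, so read it locked. */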
+ mtx_enter(&syn_cache_mtx);
inp = sc->sc_tp ? sc->sc_tp->t_inpcb : NULL;
+ mtx_leave(&syn_cache_mtx);
/*
* Fill in some straggling IP bits. Note the stack expects
-/* $OpenBSD: tcp_var.h,v 1.171 2023/09/04 23:00:36 bluhm Exp $ */
+/* $OpenBSD: tcp_var.h,v 1.172 2023/11/16 18:27:48 bluhm Exp $ */
/* $NetBSD: tcp_var.h,v 1.17 1996/02/13 23:44:24 christos Exp $ */
/*
* Data for the TCP compressed state engine.
*/
+/*
+ * Locks used to protect global data and struct members:
+ * I immutable after creation
+ * N net lock
+ * S syn_cache_mtx tcp syn cache global mutex
+ */
+
+extern struct mutex syn_cache_mtx;
+
#define TCP_SYN_HASH_SIZE 293
#define TCP_SYN_BUCKET_SIZE 35
};
struct syn_cache {
- TAILQ_ENTRY(syn_cache) sc_bucketq; /* link on bucket list */
+ TAILQ_ENTRY(syn_cache) sc_bucketq; /* [S] link on bucket list */
struct refcnt sc_refcnt; /* ref count list and timer */
struct timeout sc_timer; /* rexmt timer */
union { /* cached route */
struct route_in6 route6;
#endif
} sc_route_u;
-#define sc_route4 sc_route_u.route4
+#define sc_route4 sc_route_u.route4 /* [N] */
#ifdef INET6
-#define sc_route6 sc_route_u.route6
+#define sc_route6 sc_route_u.route6 /* [N] */
#endif
- long sc_win; /* advertised window */
- struct syn_cache_head *sc_buckethead; /* our bucket index */
- struct syn_cache_set *sc_set; /* our syn cache set */
- u_int64_t sc_timestamp; /* timestamp from SYN */
- u_int32_t sc_hash;
- u_int32_t sc_modulate; /* our timestamp modulator */
- union syn_cache_sa sc_src;
- union syn_cache_sa sc_dst;
- tcp_seq sc_irs;
- tcp_seq sc_iss;
- u_int sc_rtableid;
- u_int sc_rxtcur; /* current rxt timeout */
- u_int sc_rxttot; /* total time spend on queues */
- u_short sc_rxtshift; /* for computing backoff */
- u_short sc_flags;
-
-#define SCF_UNREACH 0x0001 /* we've had an unreach error */
-#define SCF_TIMESTAMP 0x0002 /* peer will do timestamps */
-#define SCF_DEAD 0x0004 /* this entry to be released */
-#define SCF_SACK_PERMIT 0x0008 /* permit sack */
-#define SCF_ECN_PERMIT 0x0010 /* permit ecn */
-#define SCF_SIGNATURE 0x0020 /* enforce tcp signatures */
-
- struct mbuf *sc_ipopts; /* IP options */
- u_int16_t sc_peermaxseg;
- u_int16_t sc_ourmaxseg;
- u_int sc_request_r_scale : 4,
- sc_requested_s_scale : 4;
-
- struct tcpcb *sc_tp; /* tcb for listening socket */
- LIST_ENTRY(syn_cache) sc_tpq; /* list of entries by same tp */
+ long sc_win; /* [I] advertised window */
+ struct syn_cache_head *sc_buckethead; /* [S] our bucket index */
+ struct syn_cache_set *sc_set; /* [S] our syn cache set */
+ u_int64_t sc_timestamp; /* [N] timestamp from SYN */
+ u_int32_t sc_hash; /* [S] */
+ u_int32_t sc_modulate; /* [I] our timestamp modulator */
+ union syn_cache_sa sc_src; /* [I] */
+ union syn_cache_sa sc_dst; /* [I] */
+ tcp_seq sc_irs; /* [I] */
+ tcp_seq sc_iss; /* [I] */
+ u_int sc_rtableid; /* [I] */
+ u_int sc_rxtcur; /* [S] current rxt timeout */
+	u_int sc_rxttot;			/* [S] total time spent on queues */
+ u_int sc_rxtshift; /* [S] for computing backoff */
+ u_int sc_dynflags; /* [S] flags accessed with mutex */
+#define SCF_UNREACH 0x0001U /* we've had an unreach error */
+#define SCF_DEAD 0x0002U /* this entry to be released */
+
+ u_short sc_fixflags; /* [I] set during initialization */
+#define SCF_TIMESTAMP 0x0010U /* peer will do timestamps */
+#define SCF_SACK_PERMIT 0x0020U /* permit sack */
+#define SCF_ECN_PERMIT 0x0040U /* permit ecn */
+#define SCF_SIGNATURE 0x0080U /* enforce tcp signatures */
+
+ struct mbuf *sc_ipopts; /* [N] IP options */
+ u_int16_t sc_peermaxseg; /* [I] */
+ u_int16_t sc_ourmaxseg; /* [I] */
+ u_int sc_request_r_scale : 4, /* [I] */
+ sc_requested_s_scale : 4; /* [I] */
+
+ struct tcpcb *sc_tp; /* [S] tcb for listening socket */
+ LIST_ENTRY(syn_cache) sc_tpq; /* [S] list of entries by same tp */
};
struct syn_cache_head {
- TAILQ_HEAD(, syn_cache) sch_bucket; /* bucket entries */
- u_short sch_length; /* # entries in bucket */
+ TAILQ_HEAD(, syn_cache) sch_bucket; /* [S] bucket entries */
+ u_short sch_length; /* [S] # entries in bucket */
};
struct syn_cache_set {
- struct syn_cache_head *scs_buckethead;
- long scs_use;
- int scs_size;
- int scs_count;
- u_int32_t scs_random[5];
+ struct syn_cache_head *scs_buckethead; /* [S] */
+ long scs_use; /* [S] */
+ int scs_size; /* [S] current size of hash table */
+ int scs_count; /* [S] */
+ u_int32_t scs_random[5]; /* [S] */
};
#endif /* _KERNEL */