From b1dddf40430065b498e8e9346d5bfcf956a86380 Mon Sep 17 00:00:00 2001 From: claudio Date: Thu, 7 Jul 2022 12:16:04 +0000 Subject: [PATCH] Introduce a decision metric (dmetric) that classifies the relation of this prefix with respect to its previous one. Currently the plan is to distinguish the best prefix (only one), ecmp prefixes (currently the same as as-wide-multipath), as-wide-multipath prefixes, valid prefixes and invalid prefixes. This information will be used to implement add-path send but also for ecmp support in bgpd. OK tb@ --- usr.sbin/bgpd/bgpd.h | 21 +++--- usr.sbin/bgpd/rde.c | 32 ++++++--- usr.sbin/bgpd/rde.h | 11 ++- usr.sbin/bgpd/rde_decide.c | 134 +++++++++++++++++++++++++++---------- 4 files changed, 142 insertions(+), 56 deletions(-) diff --git a/usr.sbin/bgpd/bgpd.h b/usr.sbin/bgpd/bgpd.h index b5ee6828b68..3d592a3679f 100644 --- a/usr.sbin/bgpd/bgpd.h +++ b/usr.sbin/bgpd/bgpd.h @@ -1,4 +1,4 @@ -/* $OpenBSD: bgpd.h,v 1.439 2022/06/30 20:33:14 claudio Exp $ */ +/* $OpenBSD: bgpd.h,v 1.440 2022/07/07 12:16:04 claudio Exp $ */ /* * Copyright (c) 2003, 2004 Henning Brauer @@ -777,14 +777,16 @@ struct ctl_neighbor { int is_group; }; -#define F_PREF_ELIGIBLE 0x01 -#define F_PREF_BEST 0x02 -#define F_PREF_INTERNAL 0x04 -#define F_PREF_ANNOUNCE 0x08 -#define F_PREF_STALE 0x10 -#define F_PREF_INVALID 0x20 -#define F_PREF_PATH_ID 0x40 -#define F_PREF_OTC_LOOP 0x80 +#define F_PREF_ELIGIBLE 0x001 +#define F_PREF_BEST 0x002 +#define F_PREF_INTERNAL 0x004 +#define F_PREF_ANNOUNCE 0x008 +#define F_PREF_STALE 0x010 +#define F_PREF_INVALID 0x020 +#define F_PREF_PATH_ID 0x040 +#define F_PREF_OTC_LOOP 0x080 +#define F_PREF_ECMP 0x100 +#define F_PREF_AS_WIDE 0x200 struct ctl_show_rib { struct bgpd_addr true_nexthop; @@ -802,6 +804,7 @@ struct ctl_show_rib { uint8_t prefixlen; uint8_t origin; uint8_t validation_state; + int8_t dmetric; /* plus an aspath */ }; diff --git a/usr.sbin/bgpd/rde.c b/usr.sbin/bgpd/rde.c index f440614919c..7fa7440d466 100644 --- a/usr.sbin/bgpd/rde.c +++ b/usr.sbin/bgpd/rde.c @@ -1,4 +1,4 @@ -/* $OpenBSD: rde.c,v 1.548 2022/07/07 10:46:54 claudio Exp $ */ +/* $OpenBSD: rde.c,v 1.549 2022/07/07 12:16:04 claudio Exp $ */ /* * Copyright (c) 2003, 2004 Henning Brauer @@ -2419,6 +2419,7 @@ rde_dump_rib_as(struct prefix *p, struct rde_aspath *asp, pid_t pid, int flags, struct attr *a; struct nexthop *nexthop; struct rib_entry *re; + struct prefix *xp; struct rde_peer *peer; void *bp; time_t staletime; @@ -2452,10 +2453,28 @@ rde_dump_rib_as(struct prefix *p, struct rde_aspath *asp, pid_t pid, int flags, rib.prefixlen = p->pt->prefixlen; rib.origin = asp->origin; rib.validation_state = p->validation_state; + rib.dmetric = p->dmetric; rib.flags = 0; re = prefix_re(p); - if (re != NULL && prefix_best(re) == p) - rib.flags |= F_PREF_BEST; + TAILQ_FOREACH(xp, &re->prefix_h, entry.list.rib) { + switch (xp->dmetric) { + case PREFIX_DMETRIC_BEST: + if (xp == p) + rib.flags |= F_PREF_BEST; + break; + case PREFIX_DMETRIC_ECMP: + if (xp == p) + rib.flags |= F_PREF_ECMP; + break; + case PREFIX_DMETRIC_AS_WIDE: + if (xp == p) + rib.flags |= F_PREF_AS_WIDE; + break; + default: + xp = NULL; /* stop loop */ + break; + } + } if (!peer->conf.ebgp) rib.flags |= F_PREF_INTERNAL; if (asp->flags & F_PREFIX_ANNOUNCED) @@ -2550,16 +2569,12 @@ static void rde_dump_filter(struct prefix *p, struct ctl_show_rib_request *req, int adjout) { struct rde_aspath *asp; - struct rib_entry *re; if (!rde_match_peer(prefix_peer(p), &req->neighbor)) return; asp = prefix_aspath(p); - re = prefix_re(p); - if (asp == NULL) /* skip pending withdraw in Adj-RIB-Out */ - return; - if ((req->flags & F_CTL_BEST) && re != NULL && prefix_best(re) != p) + if ((req->flags & F_CTL_BEST) && p->dmetric != PREFIX_DMETRIC_BEST) return; if ((req->flags & F_CTL_INVALID) && (asp->flags & F_ATTR_PARSE_ERR) == 0) @@ -3743,6 +3758,7 @@ rde_softreconfig_sync_reeval(struct rib_entry *re, void *arg) TAILQ_FOREACH(p, &re->prefix_h, entry.list.rib) { if (p->flags & PREFIX_NEXTHOP_LINKED) nexthop_unlink(p); + p->dmetric = PREFIX_DMETRIC_INVALID; } return; } diff --git a/usr.sbin/bgpd/rde.h b/usr.sbin/bgpd/rde.h index 1194a82184c..af61093f5d6 100644 --- a/usr.sbin/bgpd/rde.h +++ b/usr.sbin/bgpd/rde.h @@ -1,4 +1,4 @@ -/* $OpenBSD: rde.h,v 1.255 2022/07/07 10:46:54 claudio Exp $ */ +/* $OpenBSD: rde.h,v 1.256 2022/07/07 12:16:04 claudio Exp $ */ /* * Copyright (c) 2003, 2004 Claudio Jeker and @@ -336,7 +336,7 @@ struct prefix { uint32_t path_id_tx; uint8_t validation_state; uint8_t nhflags; - uint8_t unused; + int8_t dmetric; /* decision metric */ uint8_t flags; #define PREFIX_FLAG_WITHDRAW 0x01 /* enqueued on withdraw queue */ #define PREFIX_FLAG_UPDATE 0x02 /* enqueued on update queue */ @@ -347,6 +347,13 @@ struct prefix { #define PREFIX_FLAG_EOR 0x20 /* prefix is EoR */ #define PREFIX_NEXTHOP_LINKED 0x40 /* prefix is linked onto nexthop list */ #define PREFIX_FLAG_LOCKED 0x80 /* locked by rib walker */ + +#define PREFIX_DMETRIC_NONE 0 +#define PREFIX_DMETRIC_INVALID 1 +#define PREFIX_DMETRIC_VALID 2 +#define PREFIX_DMETRIC_AS_WIDE 3 +#define PREFIX_DMETRIC_ECMP 4 +#define PREFIX_DMETRIC_BEST 5 }; /* possible states for nhflags */ diff --git a/usr.sbin/bgpd/rde_decide.c b/usr.sbin/bgpd/rde_decide.c index 0f5057c2bca..eb92aa42b6c 100644 --- a/usr.sbin/bgpd/rde_decide.c +++ b/usr.sbin/bgpd/rde_decide.c @@ -1,4 +1,4 @@ -/* $OpenBSD: rde_decide.c,v 1.92 2022/07/07 10:46:54 claudio Exp $ */ +/* $OpenBSD: rde_decide.c,v 1.93 2022/07/07 12:16:04 claudio Exp $ */ /* * Copyright (c) 2003, 2004 Claudio Jeker @@ -27,6 +27,7 @@ #include "log.h" int prefix_cmp(struct prefix *, struct prefix *, int *); +void prefix_set_dmetric(struct prefix *, struct prefix *); void prefix_insert(struct prefix *, struct prefix *, struct rib_entry *); void prefix_remove(struct prefix *, struct rib_entry *); /* @@ -106,7 +107,14 @@ void prefix_remove(struct prefix *, struct rib_entry *); * Compare two prefixes with equal pt_entry. Returns an integer greater than or * less than 0, according to whether the prefix p1 is more or less preferred * than the prefix p2. p1 should be used for the new prefix and p2 for a - * already added prefix. + * already added prefix. The absolute value returned specifies the similarity + * of the prefixes. + * 1: prefixes differ because of validity + * 2: prefixes don't belong in any multipath set + * 3: prefixes belong only in the as-wide multipath set + * 4: prefixes belong in both the ecmp and as-wide multipath set + * TODO: maybe we also need a strict ecmp set that requires + * prefixes to e.g. equal ASPATH or equal neighbor-as (like for MED). */ int prefix_cmp(struct prefix *p1, struct prefix *p2, int *testall) @@ -116,6 +124,7 @@ prefix_cmp(struct prefix *p1, struct prefix *p2, int *testall) struct attr *a; uint32_t p1id, p2id; int p1cnt, p2cnt, i; + int rv = 1; /* * If a match happens before the MED check then the list is @@ -129,9 +138,9 @@ prefix_cmp(struct prefix *p1, struct prefix *p2, int *testall) *testall = 0; if (p1 == NULL) - return -1; + return -rv; if (p2 == NULL) - return 1; + return rv; asp1 = prefix_aspath(p1); asp2 = prefix_aspath(p2); @@ -140,15 +149,15 @@ prefix_cmp(struct prefix *p1, struct prefix *p2, int *testall) /* pathes with errors are not eligible */ if (asp1 == NULL || asp1->flags & F_ATTR_PARSE_ERR) - return -1; + return -rv; if (asp2 == NULL || asp2->flags & F_ATTR_PARSE_ERR) - return 1; + return rv; /* only loop free pathes are eligible */ if (asp1->flags & F_ATTR_LOOP) - return -1; + return -rv; if (asp2->flags & F_ATTR_LOOP) - return 1; + return rv; /* * 1. check if prefix is eligible a.k.a reachable @@ -157,24 +166,31 @@ prefix_cmp(struct prefix *p1, struct prefix *p2, int *testall) */ if (prefix_nexthop(p2) != NULL && prefix_nexthop(p2)->state != NEXTHOP_REACH) - return 1; + return rv; if (prefix_nexthop(p1) != NULL && prefix_nexthop(p1)->state != NEXTHOP_REACH) - return -1; + return -rv; + + /* bump rv, from here on prefix is considered valid */ + rv++; /* 2. local preference of prefix, bigger is better */ if (asp1->lpref > asp2->lpref) - return 1; + return rv; if (asp1->lpref < asp2->lpref) - return -1; + return -rv; /* 3. aspath count, the shorter the better */ - if ((asp2->aspath->ascnt - asp1->aspath->ascnt) != 0) - return (asp2->aspath->ascnt - asp1->aspath->ascnt); + if (asp1->aspath->ascnt < asp2->aspath->ascnt) + return rv; + if (asp1->aspath->ascnt > asp2->aspath->ascnt) + return -rv; /* 4. origin, the lower the better */ - if ((asp2->origin - asp1->origin) != 0) - return (asp2->origin - asp1->origin); + if (asp1->origin < asp2->origin) + return rv; + if (asp1->origin > asp2->origin) + return -rv; /* * 5. MED decision @@ -189,9 +205,9 @@ prefix_cmp(struct prefix *p1, struct prefix *p2, int *testall) *testall = 2; /* lowest value wins */ if (asp1->med < asp2->med) - return 1; + return rv; if (asp1->med > asp2->med) - return -1; + return -rv; } if (!(rde_decisionflags() & BGPD_FLAG_DECISION_MED_ALWAYS)) @@ -204,11 +220,14 @@ prefix_cmp(struct prefix *p1, struct prefix *p2, int *testall) */ if (peer1->conf.ebgp != peer2->conf.ebgp) { if (peer1->conf.ebgp) /* peer1 is EBGP other is lower */ - return 1; + return rv; else if (peer2->conf.ebgp) /* peer2 is EBGP */ - return -1; + return -rv; } + /* bump rv, as-wide multipath */ + rv++; + /* * 7. local tie-breaker, this weight is here to tip equal long AS * paths in one or the other direction. It happens more and more @@ -217,21 +236,24 @@ prefix_cmp(struct prefix *p1, struct prefix *p2, int *testall) * decision process. */ if (asp1->weight > asp2->weight) - return 1; + return rv; if (asp1->weight < asp2->weight) - return -1; + return -rv; /* 8. nexthop costs. NOT YET -> IGNORE */ + /* bump rv, equal cost multipath */ + rv++; + /* * 9. older route (more stable) wins but only if route-age * evaluation is enabled. */ if (rde_decisionflags() & BGPD_FLAG_DECISION_ROUTEAGE) { if (p1->lastchange < p2->lastchange) /* p1 is older */ - return 1; + return rv; if (p1->lastchange > p2->lastchange) - return -1; + return -rv; } /* 10. lowest BGP Id wins, use ORIGINATOR_ID if present */ @@ -246,9 +268,9 @@ prefix_cmp(struct prefix *p1, struct prefix *p2, int *testall) } else p2id = peer2->remote_bgpid; if (p1id < p2id) - return 1; + return rv; if (p1id > p2id) - return -1; + return -rv; /* 11. compare CLUSTER_LIST length, shorter is better */ p1cnt = p2cnt = 0; @@ -256,14 +278,16 @@ prefix_cmp(struct prefix *p1, struct prefix *p2, int *testall) p1cnt = a->len / sizeof(uint32_t); if ((a = attr_optget(asp2, ATTR_CLUSTER_LIST)) != NULL) p2cnt = a->len / sizeof(uint32_t); - if ((p2cnt - p1cnt) != 0) - return (p2cnt - p1cnt); + if (p1cnt < p2cnt) + return rv; + if (p1cnt > p2cnt) + return -rv; /* 12. lowest peer address wins (IPv4 is better than IPv6) */ if (peer1->remote_addr.aid < peer2->remote_addr.aid) - return 1; + return rv; if (peer1->remote_addr.aid > peer2->remote_addr.aid) - return -1; + return -rv; switch (peer1->remote_addr.aid) { case AID_INET: i = memcmp(&peer1->remote_addr.v4, &peer2->remote_addr.v4, @@ -277,20 +301,41 @@ prefix_cmp(struct prefix *p1, struct prefix *p2, int *testall) fatalx("%s: unknown af", __func__); } if (i < 0) - return 1; + return rv; if (i > 0) - return -1; + return -rv; /* RFC7911 does not specify this but something like this is needed. */ /* 13. lowest path identifier wins */ if (p1->path_id < p2->path_id) - return 1; + return rv; if (p1->path_id > p2->path_id) - return -1; + return -rv; fatalx("Uh, oh a politician in the decision process"); } +/* + * set the dmetric value of np based on the return value of + * prefix_evaluate(pp, np) or set it to either PREFIX_DMETRIC_BEST + * or PREFIX_DMETRIC_INVALID for the first element. + */ +void +prefix_set_dmetric(struct prefix *pp, struct prefix *np) +{ + int testall; + + if (np != NULL) { + if (pp == NULL) + np->dmetric = prefix_eligible(np) ? + PREFIX_DMETRIC_BEST : PREFIX_DMETRIC_INVALID; + else + np->dmetric = prefix_cmp(pp, np, &testall); + if (np->dmetric < 0) + fatalx("bad dmetric in decision process"); + } +} + /* * Insert a prefix keeping the total order of the list. For routes * that may depend on a MED selection the set is scanned until the @@ -321,6 +366,8 @@ prefix_insert(struct prefix *new, struct prefix *ep, struct rib_entry *re) * MED inversion, take out prefix and * put it onto redo queue. */ + prefix_set_dmetric(TAILQ_PREV(xp, prefix_queue, + entry.list.rib), np); TAILQ_REMOVE(&re->prefix_h, xp, entry.list.rib); TAILQ_INSERT_TAIL(&redo, xp, entry.list.rib); } else { @@ -348,10 +395,17 @@ prefix_insert(struct prefix *new, struct prefix *ep, struct rib_entry *re) } } - if (insertp == NULL) + if (insertp == NULL) { TAILQ_INSERT_HEAD(&re->prefix_h, new, entry.list.rib); - else + } else { TAILQ_INSERT_AFTER(&re->prefix_h, insertp, new, entry.list.rib); + new->dmetric = prefix_cmp(insertp, new, &testall); + if (new->dmetric < 0) + fatalx("bad dmetric in decision process"); + } + + prefix_set_dmetric(insertp, new); + prefix_set_dmetric(new, TAILQ_NEXT(new, entry.list.rib)); /* Fixup MED order again. All elements are < new */ while (!TAILQ_EMPTY(&redo)) { @@ -379,7 +433,9 @@ prefix_remove(struct prefix *old, struct rib_entry *re) int testall; xp = TAILQ_NEXT(old, entry.list.rib); + prefix_set_dmetric(TAILQ_PREV(old, prefix_queue, entry.list.rib), xp); TAILQ_REMOVE(&re->prefix_h, old, entry.list.rib); + /* check if a MED inversion could be possible */ prefix_cmp(old, xp, &testall); if (testall > 0) { @@ -396,6 +452,8 @@ prefix_remove(struct prefix *old, struct rib_entry *re) * possible MED inversion, take out prefix and * put it onto redo queue. */ + prefix_set_dmetric(TAILQ_PREV(xp, prefix_queue, + entry.list.rib), np); TAILQ_REMOVE(&re->prefix_h, xp, entry.list.rib); TAILQ_INSERT_TAIL(&redo, xp, entry.list.rib); } @@ -466,8 +524,10 @@ prefix_evaluate(struct rib_entry *re, struct prefix *new, struct prefix *old) /* decision process is turned off */ if (old != NULL) TAILQ_REMOVE(&re->prefix_h, old, entry.list.rib); - if (new != NULL) + if (new != NULL) { TAILQ_INSERT_HEAD(&re->prefix_h, new, entry.list.rib); + new->dmetric = PREFIX_DMETRIC_INVALID; + } return; } -- 2.20.1