/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2012-2021 Marko Zec
 * Copyright (c) 2005, 2018 University of Zagreb
 * Copyright (c) 2005 International Computer Science Institute
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * An implementation of DXR, a simple IPv4 LPM scheme with compact lookup
 * structures and a trivial search procedure.  More significant bits of
 * the search key are used to directly index a two-stage trie, while the
 * remaining bits are used for finding the next hop in a sorted array.
 * More details in:
 *
 * M. Zec, L. Rizzo, M. Mikuc, DXR: towards a billion routing lookups per
 * second in software, ACM SIGCOMM Computer Communication Review, September
 * 2012
 *
 * M. Zec, M. Mikuc, Pushing the envelope: beyond two billion IP routing
 * lookups per second on commodity CPUs, IEEE SoftCOM, September 2017, Split
 */
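
/*
 * Worked example (illustrative only, assuming the default compile-time
 * parameters defined below: DXR_TRIE_BITS == 20, hence DXR_D == 16,
 * DXR_X == 4, DXR_RANGE_SHIFT == 12, DXR_RANGE_MASK == 0xfff):
 *
 *	dst = 198.51.100.7 = 0xc6336407
 *
 *	trie index  = dst >> 12    = 0xc6336	(upper 20 bits)
 *	masked_dst  = dst & 0xfff  = 0x407	(lower 12 bits)
 *
 * With DXR2 the 20 trie bits are split further: the top bits
 * (dst >> 16 = 0xc633) index d_tbl, and the next bits
 * ((dst >> 12) & 0xf = 0x6) select an entry inside the x_tbl chunk
 * referenced by d_tbl.  The resulting direct_entry either encodes the
 * next hop index itself (FRAGS_MARK_HIT) or points to a run of range
 * entries which dxr_lookup() binary-searches using masked_dst.  Note
 * that the D/X split is re-balanced at build time (da->d_bits), so the
 * run-time shift values may differ from the DXR_D default.
 */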

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/epoch.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>

#include <vm/uma.h>

#include <netinet/in.h>
#include <netinet/in_fib.h>

#include <net/route.h>
#include <net/route/route_ctl.h>
#include <net/route/fib_algo.h>

#define DXR_TRIE_BITS 20

CTASSERT(DXR_TRIE_BITS >= 16 && DXR_TRIE_BITS <= 24);

/* DXR2: two-stage primary trie, instead of a single direct lookup table */
#define DXR2

#if DXR_TRIE_BITS > 16
#define DXR_D 16
#else
#define DXR_D (DXR_TRIE_BITS - 1)
#endif
#define DXR_X (DXR_TRIE_BITS - DXR_D)

#define D_TBL_SIZE (1 << DXR_D)
#define DIRECT_TBL_SIZE (1 << DXR_TRIE_BITS)
#define DXR_RANGE_MASK (0xffffffffU >> DXR_TRIE_BITS)
#define DXR_RANGE_SHIFT (32 - DXR_TRIE_BITS)

#define DESC_BASE_BITS 22
#define DESC_FRAGMENTS_BITS (32 - DESC_BASE_BITS)
#define BASE_MAX ((1 << DESC_BASE_BITS) - 1)
#define RTBL_SIZE_INCR (BASE_MAX / 64)

#if DXR_TRIE_BITS < 24
#define FRAGS_MASK_SHORT ((1 << (23 - DXR_TRIE_BITS)) - 1)
#else
#define FRAGS_MASK_SHORT 0
#endif
#define FRAGS_PREF_SHORT (((1 << DESC_FRAGMENTS_BITS) - 1) & \
	~FRAGS_MASK_SHORT)
#define FRAGS_MARK_XL (FRAGS_PREF_SHORT - 1)
#define FRAGS_MARK_HIT (FRAGS_PREF_SHORT - 2)

#define IS_SHORT_FORMAT(x) ((x & FRAGS_PREF_SHORT) == FRAGS_PREF_SHORT)
#define IS_LONG_FORMAT(x) ((x & FRAGS_PREF_SHORT) != FRAGS_PREF_SHORT)
#define IS_XL_FORMAT(x) (x == FRAGS_MARK_XL)

#define RE_SHORT_MAX_NH ((1 << (DXR_TRIE_BITS - 8)) - 1)

#define CHUNK_HASH_BITS 16
#define CHUNK_HASH_SIZE (1 << CHUNK_HASH_BITS)
#define CHUNK_HASH_MASK (CHUNK_HASH_SIZE - 1)

#define TRIE_HASH_BITS 16
#define TRIE_HASH_SIZE (1 << TRIE_HASH_BITS)
#define TRIE_HASH_MASK (TRIE_HASH_SIZE - 1)

#define XTBL_SIZE_INCR (DIRECT_TBL_SIZE / 16)

#define UNUSED_BUCKETS 8

/* Lookup structure elements */

struct direct_entry {
	uint32_t fragments: DESC_FRAGMENTS_BITS,
	    base: DESC_BASE_BITS;
};

struct range_entry_long {
	uint32_t start: DXR_RANGE_SHIFT,
	    nexthop: DXR_TRIE_BITS;
};

#if DXR_TRIE_BITS < 24
struct range_entry_short {
	uint16_t start: DXR_RANGE_SHIFT - 8,
	    nexthop: DXR_TRIE_BITS - 8;
};
#endif

/* Auxiliary structures */

struct heap_entry {
	uint32_t start;
	uint32_t end;
	uint32_t preflen;
	uint32_t nexthop;
};

struct chunk_desc {
	LIST_ENTRY(chunk_desc) cd_all_le;
	LIST_ENTRY(chunk_desc) cd_hash_le;
	uint32_t cd_hash;
	uint32_t cd_refcnt;
	uint32_t cd_base;
	uint32_t cd_cur_size;
	uint32_t cd_max_size;
};

struct trie_desc {
	LIST_ENTRY(trie_desc) td_all_le;
	LIST_ENTRY(trie_desc) td_hash_le;
	uint32_t td_hash;
	uint32_t td_index;
	uint32_t td_refcnt;
};

struct dxr_aux {
	/* Glue to external state */
	struct fib_data *fd;
	uint32_t fibnum;
	int refcnt;

	/* Auxiliary build-time tables */
	struct direct_entry direct_tbl[DIRECT_TBL_SIZE];
	uint16_t d_tbl[D_TBL_SIZE];
	struct direct_entry *x_tbl;
	union {
		struct range_entry_long re;
		uint32_t fragments;
	} *range_tbl;

	/* Auxiliary internal state */
	uint32_t updates_mask[DIRECT_TBL_SIZE / 32];
	struct trie_desc *trietbl[D_TBL_SIZE];
	LIST_HEAD(, chunk_desc) chunk_hashtbl[CHUNK_HASH_SIZE];
	LIST_HEAD(, chunk_desc) all_chunks;
	LIST_HEAD(, chunk_desc) unused_chunks[UNUSED_BUCKETS];
	LIST_HEAD(, trie_desc) trie_hashtbl[TRIE_HASH_SIZE];
	LIST_HEAD(, trie_desc) all_trie;
	LIST_HEAD(, trie_desc) unused_trie;	/* abuses hash link entry */
	struct sockaddr_in dst;
	struct sockaddr_in mask;
	struct heap_entry heap[33];
	uint32_t prefixes;
	uint32_t updates_low;
	uint32_t updates_high;
	uint32_t all_chunks_cnt;
	uint32_t unused_chunks_cnt;
	uint32_t xtbl_size;
	uint32_t all_trie_cnt;
	uint32_t unused_trie_cnt;
	uint32_t trie_rebuilt_prefixes;
	uint32_t heap_index;
	uint32_t d_bits;
	uint32_t rtbl_size;
	uint32_t rtbl_top;
	uint32_t rtbl_work_frags;
	uint32_t work_chunk;
};

/* Main lookup structure container */

struct dxr {
	/* Lookup tables */
	uint16_t d_shift;
	uint16_t x_shift;
	uint32_t x_mask;
	void *d;
	void *x;
	void *r;
	struct nhop_object **nh_tbl;

	/* Glue to external state */
	struct dxr_aux *aux;
	struct fib_data *fd;
	struct epoch_context epoch_ctx;
	uint32_t fibnum;
};

static MALLOC_DEFINE(M_DXRLPM, "dxr", "DXR LPM");
static MALLOC_DEFINE(M_DXRAUX, "dxr aux", "DXR auxiliary");

uma_zone_t chunk_zone;
uma_zone_t trie_zone;

SYSCTL_DECL(_net_route_algo);
SYSCTL_NODE(_net_route_algo, OID_AUTO, dxr, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "DXR tunables");

VNET_DEFINE_STATIC(int, max_trie_holes) = 8;
#define V_max_trie_holes VNET(max_trie_holes)
SYSCTL_INT(_net_route_algo_dxr, OID_AUTO, max_trie_holes,
    CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(max_trie_holes), 0,
    "Trie fragmentation threshold before triggering a full rebuild");

VNET_DEFINE_STATIC(int, max_range_holes) = 16;
#define V_max_range_holes VNET(max_range_holes)
SYSCTL_INT(_net_route_algo_dxr, OID_AUTO, max_range_holes,
    CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(max_range_holes), 0,
    "Range table fragmentation threshold before triggering a full rebuild");

/* Binary search for a matching address range */
#define DXR_LOOKUP_STAGE \
	if (masked_dst < range[middle].start) { \
		upperbound = middle; \
		middle = (middle + lowerbound) / 2; \
	} else if (masked_dst < range[middle + 1].start) \
		return (range[middle].nexthop); \
	else { \
		lowerbound = middle + 1; \
		middle = (upperbound + middle + 1) / 2; \
	} \
	if (upperbound == lowerbound) \
		return (range[lowerbound].nexthop);

static int
dxr_lookup(struct dxr *dxr, uint32_t dst)
{
#ifdef DXR2
	uint16_t *dt = dxr->d;
	struct direct_entry *xt = dxr->x;
	int xi;
#else
	struct direct_entry *dt = dxr->d;
#endif
	struct direct_entry de;
	struct range_entry_long *rt;
	uint32_t base;
	uint32_t upperbound;
	uint32_t middle;
	uint32_t lowerbound;
	uint32_t masked_dst;

#ifdef DXR2
	xi = (dt[dst >> dxr->d_shift] << dxr->x_shift) +
	    ((dst >> DXR_RANGE_SHIFT) & dxr->x_mask);
	de = xt[xi];
#else
	de = dt[dst >> DXR_RANGE_SHIFT];
#endif

	if (__predict_true(de.fragments == FRAGS_MARK_HIT))
		return (de.base);

	rt = dxr->r;
	base = de.base;
	lowerbound = 0;
	masked_dst = dst & DXR_RANGE_MASK;

#if DXR_TRIE_BITS < 24
	if (__predict_true(IS_SHORT_FORMAT(de.fragments))) {
		upperbound = de.fragments & FRAGS_MASK_SHORT;
		struct range_entry_short *range =
		    (struct range_entry_short *) &rt[base];

		masked_dst >>= 8;
		middle = upperbound;
		upperbound = upperbound * 2 + 1;

		for (;;) {
			DXR_LOOKUP_STAGE
			DXR_LOOKUP_STAGE
		}
	}
#endif

	upperbound = de.fragments;
	middle = upperbound / 2;
	struct range_entry_long *range = &rt[base];
	if (__predict_false(IS_XL_FORMAT(de.fragments))) {
		upperbound = *((uint32_t *) range);
		range++;
		middle = upperbound / 2;
	}

	for (;;) {
		DXR_LOOKUP_STAGE
		DXR_LOOKUP_STAGE
	}
}

static void
initheap(struct dxr_aux *da, uint32_t dst_u32, uint32_t chunk)
{
	struct heap_entry *fhp = &da->heap[0];
	struct rtentry *rt;
	struct route_nhop_data rnd;

	da->heap_index = 0;
	da->dst.sin_addr.s_addr = htonl(dst_u32);
	rt = fib4_lookup_rt(da->fibnum, da->dst.sin_addr, 0, NHR_UNLOCKED,
	    &rnd);
	if (rt != NULL) {
		struct in_addr addr;
		uint32_t scopeid;

		rt_get_inet_prefix_plen(rt, &addr, &fhp->preflen, &scopeid);
		fhp->start = ntohl(addr.s_addr);
		fhp->end = fhp->start;
		if (fhp->preflen < 32)
			fhp->end |= (0xffffffffU >> fhp->preflen);
		fhp->nexthop = fib_get_nhop_idx(da->fd, rnd.rnd_nhop);
	} else {
		fhp->preflen = fhp->nexthop = fhp->start = 0;
		fhp->end = 0xffffffffU;
	}
}

static uint32_t
chunk_size(struct dxr_aux *da, struct direct_entry *fdesc)
{

	if (IS_SHORT_FORMAT(fdesc->fragments))
		return ((fdesc->fragments & FRAGS_MASK_SHORT) + 1);
	else if (IS_XL_FORMAT(fdesc->fragments))
		return (da->range_tbl[fdesc->base].fragments + 2);
	else /* if (IS_LONG_FORMAT(fdesc->fragments)) */
		return (fdesc->fragments + 1);
}

static uint32_t
chunk_hash(struct dxr_aux *da, struct direct_entry *fdesc)
{
	uint32_t size = chunk_size(da, fdesc);
	uint32_t *p = (uint32_t *) &da->range_tbl[fdesc->base];
	uint32_t *l = (uint32_t *) &da->range_tbl[fdesc->base + size];
	uint32_t hash = fdesc->fragments;

	for (; p < l; p++)
		hash = (hash << 7) + (hash >> 13) + *p;

	return (hash + (hash >> 16));
}

static int
chunk_ref(struct dxr_aux *da, uint32_t chunk)
{
	struct direct_entry *fdesc = &da->direct_tbl[chunk];
	struct chunk_desc *cdp, *empty_cdp;
	uint32_t base = fdesc->base;
	uint32_t size = chunk_size(da, fdesc);
	uint32_t hash = chunk_hash(da, fdesc);
	int i;

	/* Find an existing descriptor */
	LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK],
	    cd_hash_le) {
		if (cdp->cd_hash != hash || cdp->cd_cur_size != size ||
		    memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base],
		    sizeof(struct range_entry_long) * size))
			continue;
		da->rtbl_top = fdesc->base;
		fdesc->base = cdp->cd_base;
		cdp->cd_refcnt++;
		return (0);
	}

	/* No matching chunks found. Find an empty one to recycle. */
	for (cdp = NULL, i = size; cdp == NULL && i < UNUSED_BUCKETS; i++)
		cdp = LIST_FIRST(&da->unused_chunks[i]);

	if (cdp == NULL)
		LIST_FOREACH(empty_cdp, &da->unused_chunks[0], cd_hash_le)
			if (empty_cdp->cd_max_size >= size && (cdp == NULL ||
			    empty_cdp->cd_max_size < cdp->cd_max_size)) {
				cdp = empty_cdp;
				if (empty_cdp->cd_max_size == size)
					break;
			}

	if (cdp != NULL) {
		/* Copy from heap into the recycled chunk */
		bcopy(&da->range_tbl[fdesc->base], &da->range_tbl[cdp->cd_base],
		    size * sizeof(struct range_entry_long));
		fdesc->base = cdp->cd_base;
		da->rtbl_top -= size;
		da->unused_chunks_cnt--;
		if (cdp->cd_max_size > size) {
			/* Split the range in two, need a new descriptor */
			empty_cdp = uma_zalloc(chunk_zone, M_NOWAIT);
			if (empty_cdp == NULL)
				return (1);
			LIST_INSERT_BEFORE(cdp, empty_cdp, cd_all_le);
			empty_cdp->cd_base = cdp->cd_base + size;
			empty_cdp->cd_cur_size = 0;
			empty_cdp->cd_max_size = cdp->cd_max_size - size;

			i = empty_cdp->cd_max_size;
			if (i >= UNUSED_BUCKETS)
				i = 0;
			LIST_INSERT_HEAD(&da->unused_chunks[i], empty_cdp,
			    cd_hash_le);

			da->all_chunks_cnt++;
			da->unused_chunks_cnt++;
			cdp->cd_max_size = size;
		}
		LIST_REMOVE(cdp, cd_hash_le);
	} else {
		/* Alloc a new descriptor at the top of the heap */
		cdp = uma_zalloc(chunk_zone, M_NOWAIT);
		if (cdp == NULL)
			return (1);
		cdp->cd_max_size = size;
		cdp->cd_base = fdesc->base;
		LIST_INSERT_HEAD(&da->all_chunks, cdp, cd_all_le);
		da->all_chunks_cnt++;
		KASSERT(cdp->cd_base + cdp->cd_max_size == da->rtbl_top,
		    ("dxr: %s %d", __FUNCTION__, __LINE__));
	}

	cdp->cd_hash = hash;
	cdp->cd_refcnt = 1;
	cdp->cd_cur_size = size;
	LIST_INSERT_HEAD(&da->chunk_hashtbl[hash & CHUNK_HASH_MASK], cdp,
	    cd_hash_le);
	if (da->rtbl_top >= da->rtbl_size) {
		if (da->rtbl_top >= BASE_MAX) {
			FIB_PRINTF(LOG_ERR, da->fd,
			    "structural limit exceeded at %d "
			    "range table elements", da->rtbl_top);
			return (1);
		}
		da->rtbl_size += RTBL_SIZE_INCR;
		if (da->rtbl_top >= BASE_MAX / 4)
			FIB_PRINTF(LOG_WARNING, da->fd, "range table at %d%%",
			    da->rtbl_top * 100 / BASE_MAX);
		da->range_tbl = realloc(da->range_tbl,
		    sizeof(*da->range_tbl) * da->rtbl_size + FRAGS_PREF_SHORT,
		    M_DXRAUX, M_NOWAIT);
		if (da->range_tbl == NULL)
			return (1);
	}

	return (0);
}

static void
chunk_unref(struct dxr_aux *da, uint32_t chunk)
{
	struct direct_entry *fdesc = &da->direct_tbl[chunk];
	struct chunk_desc *cdp, *cdp2;
	uint32_t base = fdesc->base;
	uint32_t size = chunk_size(da, fdesc);
	uint32_t hash = chunk_hash(da, fdesc);
	int i;

	/* Find the corresponding descriptor */
	LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK],
	    cd_hash_le)
		if (cdp->cd_hash == hash && cdp->cd_cur_size == size &&
		    memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base],
		    sizeof(struct range_entry_long) * size) == 0)
			break;

	KASSERT(cdp != NULL, ("dxr: dangling chunk"));
	if (--cdp->cd_refcnt > 0)
		return;

	LIST_REMOVE(cdp, cd_hash_le);
	da->unused_chunks_cnt++;
	cdp->cd_cur_size = 0;

	/* Attempt to merge with the preceding chunk, if empty */
	cdp2 = LIST_NEXT(cdp, cd_all_le);
	if (cdp2 != NULL && cdp2->cd_cur_size == 0) {
		KASSERT(cdp2->cd_base + cdp2->cd_max_size == cdp->cd_base,
		    ("dxr: %s %d", __FUNCTION__, __LINE__));
		LIST_REMOVE(cdp, cd_all_le);
		da->all_chunks_cnt--;
		LIST_REMOVE(cdp2, cd_hash_le);
		da->unused_chunks_cnt--;
		cdp2->cd_max_size += cdp->cd_max_size;
		uma_zfree(chunk_zone, cdp);
		cdp = cdp2;
	}

	/* Attempt to merge with the subsequent chunk, if empty */
	cdp2 = LIST_PREV(cdp, &da->all_chunks, chunk_desc, cd_all_le);
	if (cdp2 != NULL && cdp2->cd_cur_size == 0) {
		KASSERT(cdp->cd_base + cdp->cd_max_size == cdp2->cd_base,
		    ("dxr: %s %d", __FUNCTION__, __LINE__));
		LIST_REMOVE(cdp, cd_all_le);
		da->all_chunks_cnt--;
		LIST_REMOVE(cdp2, cd_hash_le);
		da->unused_chunks_cnt--;
		cdp2->cd_max_size += cdp->cd_max_size;
		cdp2->cd_base = cdp->cd_base;
		uma_zfree(chunk_zone, cdp);
		cdp = cdp2;
	}

	if (cdp->cd_base + cdp->cd_max_size == da->rtbl_top) {
		/* Free the chunk on the top of the range heap, trim the heap */
		KASSERT(cdp == LIST_FIRST(&da->all_chunks),
		    ("dxr: %s %d", __FUNCTION__, __LINE__));
		da->all_chunks_cnt--;
		da->unused_chunks_cnt--;
		da->rtbl_top -= cdp->cd_max_size;
		LIST_REMOVE(cdp, cd_all_le);
		uma_zfree(chunk_zone, cdp);
		return;
	}

	i = cdp->cd_max_size;
	if (i >= UNUSED_BUCKETS)
		i = 0;
	LIST_INSERT_HEAD(&da->unused_chunks[i], cdp, cd_hash_le);
}

#ifdef DXR2
static uint32_t
trie_hash(struct dxr_aux *da, uint32_t dxr_x, uint32_t index)
{
	uint32_t i, *val;
	uint32_t hash = 0;

	for (i = 0; i < (1 << dxr_x); i++) {
		hash = (hash << 3) ^ (hash >> 3);
		val = (uint32_t *)
		    (void *) &da->direct_tbl[(index << dxr_x) + i];
		hash += (*val << 5);
		hash += (*val >> 5);
	}

	return (hash + (hash >> 16));
}

static int
trie_ref(struct dxr_aux *da, uint32_t index)
{
	struct trie_desc *tp;
	uint32_t dxr_d = da->d_bits;
	uint32_t dxr_x = DXR_TRIE_BITS - dxr_d;
	uint32_t hash = trie_hash(da, dxr_x, index);

	/* Find an existing descriptor */
	LIST_FOREACH(tp, &da->trie_hashtbl[hash & TRIE_HASH_MASK], td_hash_le)
		if (tp->td_hash == hash &&
		    memcmp(&da->direct_tbl[index << dxr_x],
		    &da->x_tbl[tp->td_index << dxr_x],
		    sizeof(*da->x_tbl) << dxr_x) == 0) {
			tp->td_refcnt++;
			da->trietbl[index] = tp;
			return(tp->td_index);
		}

	tp = LIST_FIRST(&da->unused_trie);
	if (tp != NULL) {
		LIST_REMOVE(tp, td_hash_le);
		da->unused_trie_cnt--;
	} else {
		tp = uma_zalloc(trie_zone, M_NOWAIT);
		if (tp == NULL)
			return (-1);
		LIST_INSERT_HEAD(&da->all_trie, tp, td_all_le);
		tp->td_index = da->all_trie_cnt++;
	}

	tp->td_hash = hash;
	tp->td_refcnt = 1;
	LIST_INSERT_HEAD(&da->trie_hashtbl[hash & TRIE_HASH_MASK], tp,
	    td_hash_le);
	memcpy(&da->x_tbl[tp->td_index << dxr_x],
	    &da->direct_tbl[index << dxr_x], sizeof(*da->x_tbl) << dxr_x);
	da->trietbl[index] = tp;
	if (da->all_trie_cnt >= da->xtbl_size >> dxr_x) {
		da->xtbl_size += XTBL_SIZE_INCR;
		da->x_tbl = realloc(da->x_tbl,
		    sizeof(*da->x_tbl) * da->xtbl_size, M_DXRAUX, M_NOWAIT);
		if (da->x_tbl == NULL)
			return (-1);
	}
	return(tp->td_index);
}

static void
trie_unref(struct dxr_aux *da, uint32_t index)
{
	struct trie_desc *tp = da->trietbl[index];

	if (tp == NULL)
		return;
	da->trietbl[index] = NULL;
	if (--tp->td_refcnt > 0)
		return;

	LIST_REMOVE(tp, td_hash_le);
	da->unused_trie_cnt++;
	if (tp->td_index != da->all_trie_cnt - 1) {
		LIST_INSERT_HEAD(&da->unused_trie, tp, td_hash_le);
		return;
	}

	do {
		da->all_trie_cnt--;
		da->unused_trie_cnt--;
		LIST_REMOVE(tp, td_all_le);
		uma_zfree(trie_zone, tp);
		LIST_FOREACH(tp, &da->unused_trie, td_hash_le)
			if (tp->td_index == da->all_trie_cnt - 1) {
				LIST_REMOVE(tp, td_hash_le);
				break;
			}
	} while (tp != NULL);
}
#endif

static void
heap_inject(struct dxr_aux *da, uint32_t start, uint32_t end, uint32_t preflen,
    uint32_t nh)
{
	struct heap_entry *fhp;
	int i;

	for (i = da->heap_index; i >= 0; i--) {
		if (preflen > da->heap[i].preflen)
			break;
		else if (preflen < da->heap[i].preflen)
			da->heap[i + 1] = da->heap[i];
		else
			return;
	}

	fhp = &da->heap[i + 1];
	fhp->preflen = preflen;
	fhp->start = start;
	fhp->end = end;
	fhp->nexthop = nh;
	da->heap_index++;
}

static int
dxr_walk(struct rtentry *rt, void *arg)
{
	struct dxr_aux *da = arg;
	uint32_t chunk = da->work_chunk;
	uint32_t first = chunk << DXR_RANGE_SHIFT;
	uint32_t last = first | DXR_RANGE_MASK;
	struct range_entry_long *fp =
	    &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re;
	struct heap_entry *fhp = &da->heap[da->heap_index];
	uint32_t preflen, nh, start, end, scopeid;
	struct in_addr addr;

	rt_get_inet_prefix_plen(rt, &addr, &preflen, &scopeid);
	start = ntohl(addr.s_addr);
	if (start > last)
		return (-1);	/* Beyond chunk boundaries, we are done */
	if (start < first)
		return (0);	/* Skip this route */

	end = start;
	if (preflen < 32)
		end |= (0xffffffffU >> preflen);
	nh = fib_get_nhop_idx(da->fd, rt_get_raw_nhop(rt));

	if (start == fhp->start)
		heap_inject(da, start, end, preflen, nh);
	else {
		/* start > fhp->start */
		while (start > fhp->end) {
			uint32_t oend = fhp->end;

			if (da->heap_index > 0) {
				fhp--;
				da->heap_index--;
			} else
				initheap(da, fhp->end + 1, chunk);
			if (fhp->end > oend && fhp->nexthop != fp->nexthop) {
				fp++;
				da->rtbl_work_frags++;
				fp->start = (oend + 1) & DXR_RANGE_MASK;
				fp->nexthop = fhp->nexthop;
			}
		}
		if (start > ((chunk << DXR_RANGE_SHIFT) | fp->start) &&
		    nh != fp->nexthop) {
			fp++;
			da->rtbl_work_frags++;
			fp->start = start & DXR_RANGE_MASK;
		} else if (da->rtbl_work_frags) {
			if ((--fp)->nexthop == nh)
				da->rtbl_work_frags--;
			else
				fp++;
		}
		fp->nexthop = nh;
		heap_inject(da, start, end, preflen, nh);
	}

	return (0);
}

static int
update_chunk(struct dxr_aux *da, uint32_t chunk)
{
	struct range_entry_long *fp;
#if DXR_TRIE_BITS < 24
	struct range_entry_short *fps;
	uint32_t start, nh, i;
#endif
	struct heap_entry *fhp;
	uint32_t first = chunk << DXR_RANGE_SHIFT;
	uint32_t last = first | DXR_RANGE_MASK;

	if (da->direct_tbl[chunk].fragments != FRAGS_MARK_HIT)
		chunk_unref(da, chunk);

	initheap(da, first, chunk);

	fp = &da->range_tbl[da->rtbl_top].re;
	da->rtbl_work_frags = 0;
	fp->start = first & DXR_RANGE_MASK;
	fp->nexthop = da->heap[0].nexthop;

	da->dst.sin_addr.s_addr = htonl(first);
	da->mask.sin_addr.s_addr = htonl(~DXR_RANGE_MASK);

	da->work_chunk = chunk;
	rib_walk_from(da->fibnum, AF_INET, RIB_FLAG_LOCKED,
	    (struct sockaddr *) &da->dst, (struct sockaddr *) &da->mask,
	    dxr_walk, da);

	/* Flush any remaining objects on the heap */
	fp = &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re;
	fhp = &da->heap[da->heap_index];
	while (fhp->preflen > DXR_TRIE_BITS) {
		uint32_t oend = fhp->end;

		if (da->heap_index > 0) {
			fhp--;
			da->heap_index--;
		} else
			initheap(da, fhp->end + 1, chunk);
		if (fhp->end > oend && fhp->nexthop != fp->nexthop) {
			/* Have we crossed the upper chunk boundary? */
			if (oend >= last)
				break;
			fp++;
			da->rtbl_work_frags++;
			fp->start = (oend + 1) & DXR_RANGE_MASK;
			fp->nexthop = fhp->nexthop;
		}
	}

	/* Direct hit if the chunk contains only a single fragment */
	if (da->rtbl_work_frags == 0) {
		da->direct_tbl[chunk].base = fp->nexthop;
		da->direct_tbl[chunk].fragments = FRAGS_MARK_HIT;
		return (0);
	}

	da->direct_tbl[chunk].base = da->rtbl_top;
	da->direct_tbl[chunk].fragments = da->rtbl_work_frags;

#if DXR_TRIE_BITS < 24
	/* Check whether the chunk can be more compactly encoded */
	fp = &da->range_tbl[da->rtbl_top].re;
	for (i = 0; i <= da->rtbl_work_frags; i++, fp++)
		if ((fp->start & 0xff) != 0 || fp->nexthop > RE_SHORT_MAX_NH)
			break;
	if (i == da->rtbl_work_frags + 1) {
		fp = &da->range_tbl[da->rtbl_top].re;
		fps = (void *) fp;
		for (i = 0; i <= da->rtbl_work_frags; i++, fp++, fps++) {
			start = fp->start;
			nh = fp->nexthop;
			fps->start = start >> 8;
			fps->nexthop = nh;
		}
		fps->start = start >> 8;
		fps->nexthop = nh;
		da->rtbl_work_frags >>= 1;
		da->direct_tbl[chunk].fragments =
		    da->rtbl_work_frags | FRAGS_PREF_SHORT;
	} else
#endif
	if (da->rtbl_work_frags >= FRAGS_MARK_HIT) {
		da->direct_tbl[chunk].fragments = FRAGS_MARK_XL;
		memmove(&da->range_tbl[da->rtbl_top + 1],
		    &da->range_tbl[da->rtbl_top],
		    (da->rtbl_work_frags + 1) * sizeof(*da->range_tbl));
		da->range_tbl[da->rtbl_top].fragments = da->rtbl_work_frags;
		da->rtbl_work_frags++;
	}
	da->rtbl_top += (da->rtbl_work_frags + 1);
	return (chunk_ref(da, chunk));
}

static void
dxr_build(struct dxr *dxr)
{
	struct dxr_aux *da = dxr->aux;
	struct chunk_desc *cdp;
	struct rib_rtable_info rinfo;
	struct timeval t0, t1, t2, t3;
	uint32_t r_size, dxr_tot_size;
	uint32_t i, m, range_rebuild = 0;
#ifdef DXR2
	struct trie_desc *tp;
	uint32_t d_tbl_size, dxr_x, d_size, x_size;
	uint32_t ti, trie_rebuild = 0, prev_size = 0;
#endif

	KASSERT(dxr->d == NULL, ("dxr: d not free"));

	if (da == NULL) {
		da = malloc(sizeof(*dxr->aux), M_DXRAUX, M_NOWAIT);
		if (da == NULL)
			return;
		dxr->aux = da;
		da->fibnum = dxr->fibnum;
		da->refcnt = 1;
		LIST_INIT(&da->all_chunks);
		LIST_INIT(&da->all_trie);
		da->rtbl_size = RTBL_SIZE_INCR;
		da->range_tbl = NULL;
		da->xtbl_size = XTBL_SIZE_INCR;
		da->x_tbl = NULL;
		bzero(&da->dst, sizeof(da->dst));
		bzero(&da->mask, sizeof(da->mask));
		da->dst.sin_len = sizeof(da->dst);
		da->mask.sin_len = sizeof(da->mask);
		da->dst.sin_family = AF_INET;
		da->mask.sin_family = AF_INET;
	}
	if (da->range_tbl == NULL) {
		da->range_tbl = malloc(sizeof(*da->range_tbl) * da->rtbl_size
		    + FRAGS_PREF_SHORT, M_DXRAUX, M_NOWAIT);
		if (da->range_tbl == NULL)
			return;
		range_rebuild = 1;
	}
#ifdef DXR2
	if (da->x_tbl == NULL) {
		da->x_tbl = malloc(sizeof(*da->x_tbl) * da->xtbl_size,
		    M_DXRAUX, M_NOWAIT);
		if (da->x_tbl == NULL)
			return;
		trie_rebuild = 1;
	}
#endif
	da->fd = dxr->fd;

	microuptime(&t0);

	dxr->nh_tbl = fib_get_nhop_array(da->fd);
	fib_get_rtable_info(fib_get_rh(da->fd), &rinfo);

	if (da->updates_low > da->updates_high ||
	    da->unused_chunks_cnt > V_max_range_holes)
		range_rebuild = 1;
	if (range_rebuild) {
		/* Bulk cleanup */
		bzero(da->chunk_hashtbl, sizeof(da->chunk_hashtbl));
		while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) {
			LIST_REMOVE(cdp, cd_all_le);
			uma_zfree(chunk_zone, cdp);
		}
		for (i = 0; i < UNUSED_BUCKETS; i++)
			LIST_INIT(&da->unused_chunks[i]);
		da->all_chunks_cnt = da->unused_chunks_cnt = 0;
		da->rtbl_top = 0;
		da->updates_low = 0;
		da->updates_high = DIRECT_TBL_SIZE - 1;
		memset(da->updates_mask, 0xff, sizeof(da->updates_mask));
		for (i = 0; i < DIRECT_TBL_SIZE; i++) {
			da->direct_tbl[i].fragments = FRAGS_MARK_HIT;
			da->direct_tbl[i].base = 0;
		}
	}
	da->prefixes = rinfo.num_prefixes;

	/* DXR: construct direct & range table */
	for (i = da->updates_low; i <= da->updates_high; i++) {
		m = da->updates_mask[i >> 5] >> (i & 0x1f);
		if (m == 0)
			i |= 0x1f;
		else if (m & 1 && update_chunk(da, i) != 0)
			return;
	}
	r_size = sizeof(*da->range_tbl) * da->rtbl_top;
	microuptime(&t1);

#ifdef DXR2
	if (range_rebuild || da->unused_trie_cnt > V_max_trie_holes ||
	    abs(fls(da->prefixes) - fls(da->trie_rebuilt_prefixes)) > 1)
		trie_rebuild = 1;
	if (trie_rebuild) {
		da->trie_rebuilt_prefixes = da->prefixes;
		da->d_bits = DXR_D;
		da->updates_low = 0;
		da->updates_high = DIRECT_TBL_SIZE - 1;
	}

dxr2_try_squeeze:
	if (trie_rebuild) {
		/* Bulk cleanup */
		bzero(da->trietbl, sizeof(da->trietbl));
		bzero(da->trie_hashtbl, sizeof(da->trie_hashtbl));
		while ((tp = LIST_FIRST(&da->all_trie)) != NULL) {
			LIST_REMOVE(tp, td_all_le);
			uma_zfree(trie_zone, tp);
		}
		LIST_INIT(&da->unused_trie);
		da->all_trie_cnt = da->unused_trie_cnt = 0;
	}

	/* Populate d_tbl, x_tbl */
	dxr_x = DXR_TRIE_BITS - da->d_bits;
	d_tbl_size = (1 << da->d_bits);

	for (i = da->updates_low >> dxr_x; i <= da->updates_high >> dxr_x;
	    i++) {
		if (!trie_rebuild) {
			m = 0;
			for (int j = 0; j < (1 << dxr_x); j += 32)
				m |= da->updates_mask[((i << dxr_x) + j) >> 5];
			if (m == 0)
				continue;
			trie_unref(da, i);
		}
		ti = trie_ref(da, i);
		/*
		 * ti is unsigned, so cast before the sign check:
		 * trie_ref() returns -1 on allocation failure.
		 */
		if ((int32_t) ti < 0)
			return;
		da->d_tbl[i] = ti;
	}

	d_size = sizeof(*da->d_tbl) * d_tbl_size;
	x_size = sizeof(*da->x_tbl) * DIRECT_TBL_SIZE / d_tbl_size
	    * da->all_trie_cnt;
	dxr_tot_size = d_size + x_size + r_size;

	if (trie_rebuild == 1) {
		/* Try to find a more compact D/X split */
		if (prev_size == 0 || dxr_tot_size <= prev_size)
			da->d_bits--;
		else {
			da->d_bits++;
			trie_rebuild = 2;
		}
		prev_size = dxr_tot_size;
		goto dxr2_try_squeeze;
	}
	microuptime(&t2);
#else /* !DXR2 */
	dxr_tot_size = sizeof(da->direct_tbl) + r_size;
	t2 = t1;
#endif

	dxr->d = malloc(dxr_tot_size, M_DXRLPM, M_NOWAIT);
	if (dxr->d == NULL)
		return;
#ifdef DXR2
	memcpy(dxr->d, da->d_tbl, d_size);
	dxr->x = ((char *) dxr->d) + d_size;
	memcpy(dxr->x, da->x_tbl, x_size);
	dxr->r = ((char *) dxr->x) + x_size;
	dxr->d_shift = 32 - da->d_bits;
	dxr->x_shift = dxr_x;
	dxr->x_mask = 0xffffffffU >> (32 - dxr_x);
#else /* !DXR2 */
	memcpy(dxr->d, da->direct_tbl, sizeof(da->direct_tbl));
	dxr->r = ((char *) dxr->d) + sizeof(da->direct_tbl);
#endif
	memcpy(dxr->r, da->range_tbl, r_size);

	if (da->updates_low <= da->updates_high)
		bzero(&da->updates_mask[da->updates_low / 32],
		    (da->updates_high - da->updates_low) / 8 + 1);
	da->updates_low = DIRECT_TBL_SIZE - 1;
	da->updates_high = 0;
	microuptime(&t3);

#ifdef DXR2
	FIB_PRINTF(LOG_INFO, da->fd, "D%dX%dR, %d prefixes, %d nhops (max)",
	    da->d_bits, dxr_x, rinfo.num_prefixes, rinfo.num_nhops);
#else
	FIB_PRINTF(LOG_INFO, da->fd, "D%dR, %d prefixes, %d nhops (max)",
	    DXR_D, rinfo.num_prefixes, rinfo.num_nhops);
#endif
	i = dxr_tot_size * 100;
	if (rinfo.num_prefixes)
		i /= rinfo.num_prefixes;
	FIB_PRINTF(LOG_INFO, da->fd, "%d.%02d KBytes, %d.%02d Bytes/prefix",
	    dxr_tot_size / 1024, dxr_tot_size * 100 / 1024 % 100,
	    i / 100, i % 100);
	i = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
	FIB_PRINTF(LOG_INFO, da->fd, "range table %s in %u.%03u ms",
	    range_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000);
#ifdef DXR2
	i = (t2.tv_sec - t1.tv_sec) * 1000000 + t2.tv_usec - t1.tv_usec;
	FIB_PRINTF(LOG_INFO, da->fd, "trie %s in %u.%03u ms",
	    trie_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000);
#endif
	i = (t3.tv_sec - t2.tv_sec) * 1000000 + t3.tv_usec - t2.tv_usec;
	FIB_PRINTF(LOG_INFO, da->fd, "snapshot forked in %u.%03u ms",
	    i / 1000, i % 1000);
	FIB_PRINTF(LOG_INFO, da->fd, "range table: %d%%, %d chunks, %d holes",
	    da->rtbl_top * 100 / BASE_MAX, da->all_chunks_cnt,
	    da->unused_chunks_cnt);
}

/*
 * Glue functions for attaching to FreeBSD 13 fib_algo infrastructure.
 */

static struct nhop_object *
dxr_fib_lookup(void *algo_data, const struct flm_lookup_key key,
    uint32_t scopeid)
{
	struct dxr *dxr = algo_data;
	uint32_t nh;

	nh = dxr_lookup(dxr, ntohl(key.addr4.s_addr));

	return (dxr->nh_tbl[nh]);
}

static enum flm_op_result
dxr_init(uint32_t fibnum, struct fib_data *fd, void *old_data, void **data)
{
	struct dxr *old_dxr = old_data;
	struct dxr_aux *da = NULL;
	struct dxr *dxr;

	dxr = malloc(sizeof(*dxr), M_DXRAUX, M_NOWAIT);
	if (dxr == NULL)
		return (FLM_REBUILD);

	/* Check whether we may reuse the old auxiliary structures */
	if (old_dxr != NULL && old_dxr->aux != NULL) {
		da = old_dxr->aux;
		atomic_add_int(&da->refcnt, 1);
	}

	dxr->aux = da;
	dxr->d = NULL;
	dxr->fd = fd;
	dxr->fibnum = fibnum;
	*data = dxr;

	return (FLM_SUCCESS);
}

static void
dxr_destroy(void *data)
{
	struct dxr *dxr = data;
	struct dxr_aux *da;
	struct chunk_desc *cdp;
	struct trie_desc *tp;

	if (dxr->d != NULL)
		free(dxr->d, M_DXRLPM);

	da = dxr->aux;
	free(dxr, M_DXRAUX);

	if (da == NULL || atomic_fetchadd_int(&da->refcnt, -1) > 1)
		return;

	/* Release auxiliary structures */
	while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) {
		LIST_REMOVE(cdp, cd_all_le);
		uma_zfree(chunk_zone, cdp);
	}
	while ((tp = LIST_FIRST(&da->all_trie)) != NULL) {
		LIST_REMOVE(tp, td_all_le);
		uma_zfree(trie_zone, tp);
	}
	free(da->range_tbl, M_DXRAUX);
	free(da->x_tbl, M_DXRAUX);
	free(da, M_DXRAUX);
}

static void
epoch_dxr_destroy(epoch_context_t ctx)
{
	struct dxr *dxr = __containerof(ctx, struct dxr, epoch_ctx);

	dxr_destroy(dxr);
}

static enum flm_op_result
dxr_dump_end(void *data, struct fib_dp *dp)
{
	struct dxr *dxr = data;
	struct dxr_aux *da;

	dxr_build(dxr);

	da = dxr->aux;
	if (da == NULL)
		return (FLM_REBUILD);

	/* Structural limit exceeded, hard error */
	if (da->rtbl_top >= BASE_MAX)
		return (FLM_ERROR);

	/* A malloc(,, M_NOWAIT) failed somewhere, retry later */
	if (dxr->d == NULL)
		return (FLM_REBUILD);

	dp->f = dxr_fib_lookup;
	dp->arg = dxr;

	return (FLM_SUCCESS);
}

static enum flm_op_result
dxr_dump_rib_item(struct rtentry *rt, void *data)
{

	return (FLM_SUCCESS);
}

static enum flm_op_result
dxr_change_rib_item(struct rib_head *rnh, struct rib_cmd_info *rc,
    void *data)
{

	return (FLM_BATCH);
}

static enum flm_op_result
dxr_change_rib_batch(struct rib_head *rnh, struct fib_change_queue *q,
    void *data)
{
	struct dxr *dxr = data;
	struct dxr *new_dxr;
	struct dxr_aux *da;
	struct fib_dp new_dp;
	enum flm_op_result res;
	uint32_t ip, plen, hmask, start, end, i, ui;
#ifdef INVARIANTS
	struct rib_rtable_info rinfo;
	int update_delta = 0;
#endif

	KASSERT(data != NULL, ("%s: NULL data", __FUNCTION__));
	KASSERT(q != NULL, ("%s: NULL q", __FUNCTION__));
	KASSERT(q->count < q->size, ("%s: q->count %d q->size %d",
	    __FUNCTION__, q->count, q->size));

	da = dxr->aux;
	KASSERT(da != NULL, ("%s: NULL dxr->aux", __FUNCTION__));

	FIB_PRINTF(LOG_INFO, da->fd, "processing %d update(s)", q->count);
	for (ui = 0; ui < q->count; ui++) {
#ifdef INVARIANTS
		if (q->entries[ui].nh_new != NULL)
			update_delta++;
		if (q->entries[ui].nh_old != NULL)
			update_delta--;
#endif
		plen = q->entries[ui].plen;
		ip = ntohl(q->entries[ui].addr4.s_addr);
		if (plen < 32)
			hmask = 0xffffffffU >> plen;
		else
			hmask = 0;
		start = (ip & ~hmask) >> DXR_RANGE_SHIFT;
		end = (ip | hmask) >> DXR_RANGE_SHIFT;

		if ((start & 0x1f) == 0 && (end & 0x1f) == 0x1f)
			for (i = start >> 5; i <= end >> 5; i++)
				da->updates_mask[i] = 0xffffffffU;
		else
			for (i = start; i <= end; i++)
				da->updates_mask[i >> 5] |= (1 << (i & 0x1f));
		if (start < da->updates_low)
			da->updates_low = start;
		if (end > da->updates_high)
			da->updates_high = end;
	}

#ifdef INVARIANTS
	fib_get_rtable_info(fib_get_rh(da->fd), &rinfo);
	KASSERT(da->prefixes + update_delta == rinfo.num_prefixes,
	    ("%s: update count mismatch", __FUNCTION__));
#endif

	res = dxr_init(0, dxr->fd, data, (void **) &new_dxr);
	if (res != FLM_SUCCESS)
		return (res);

	dxr_build(new_dxr);

	/* Structural limit exceeded, hard error */
	if (da->rtbl_top >= BASE_MAX) {
		dxr_destroy(new_dxr);
		return (FLM_ERROR);
	}

	/* A malloc(,, M_NOWAIT) failed somewhere, retry later */
	if (new_dxr->d == NULL) {
		dxr_destroy(new_dxr);
		return (FLM_REBUILD);
	}

	new_dp.f = dxr_fib_lookup;
	new_dp.arg = new_dxr;
	if (fib_set_datapath_ptr(dxr->fd, &new_dp)) {
		fib_set_algo_ptr(dxr->fd, new_dxr);
		fib_epoch_call(epoch_dxr_destroy, &dxr->epoch_ctx);
		return (FLM_SUCCESS);
	}

	dxr_destroy(new_dxr);
	return (FLM_REBUILD);
}

static uint8_t
dxr_get_pref(const struct rib_rtable_info *rinfo)
{

	/* Below bsearch4 up to 10 prefixes. Always supersedes dpdk_lpm4. */
	return (251);
}

static struct fib_lookup_module fib_dxr_mod = {
	.flm_name = "dxr",
	.flm_family = AF_INET,
	.flm_init_cb = dxr_init,
	.flm_destroy_cb = dxr_destroy,
	.flm_dump_rib_item_cb = dxr_dump_rib_item,
	.flm_dump_end_cb = dxr_dump_end,
	.flm_change_rib_item_cb = dxr_change_rib_item,
	.flm_change_rib_items_cb = dxr_change_rib_batch,
	.flm_get_pref = dxr_get_pref,
};

static int
dxr_modevent(module_t mod, int type, void *unused)
{
	int error;

	switch (type) {
	case MOD_LOAD:
		chunk_zone = uma_zcreate("dxr chunk", sizeof(struct chunk_desc),
		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
		trie_zone = uma_zcreate("dxr trie", sizeof(struct trie_desc),
		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
		fib_module_register(&fib_dxr_mod);
		return(0);
	case MOD_UNLOAD:
		error = fib_module_unregister(&fib_dxr_mod);
		if (error)
			return (error);
		uma_zdestroy(chunk_zone);
		uma_zdestroy(trie_zone);
		return(0);
	default:
		return(EOPNOTSUPP);
	}
}

static moduledata_t dxr_mod = {"fib_dxr", dxr_modevent, 0};

DECLARE_MODULE(fib_dxr, dxr_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
MODULE_VERSION(fib_dxr, 1);