/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2012-2022 Marko Zec
 * Copyright (c) 2005, 2018 University of Zagreb
 * Copyright (c) 2005 International Computer Science Institute
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * An implementation of DXR, a simple IPv4 LPM scheme with compact lookup
 * structures and a trivial search procedure.  More significant bits of
 * the search key are used to directly index a two-stage trie, while the
 * remaining bits are used for finding the next hop in a sorted array.
 * More details in:
 *
 * M. Zec, L. Rizzo, M. Mikuc, DXR: towards a billion routing lookups per
 * second in software, ACM SIGCOMM Computer Communication Review, September
 * 2012
 *
 * M. Zec, M. Mikuc, Pushing the envelope: beyond two billion IP routing
 * lookups per second on commodity CPUs, IEEE SoftCOM, September 2017, Split
 */
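
/*
 * Illustrative example of the key split: with DXR_TRIE_BITS = 20, a lookup
 * for 10.1.2.3 (0x0a010203) uses the upper 20 bits (0x0a010) to index the
 * direct/trie tables, while the remaining 12 bits (0x203) are resolved by
 * a binary search over the referenced range chunk, yielding an index into
 * the next-hop array.
 */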

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/ctype.h>
#include <sys/epoch.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>

#include <vm/uma.h>

#include <netinet/in.h>
#include <netinet/in_fib.h>

#include <net/route.h>
#include <net/route/route_ctl.h>
#include <net/route/fib_algo.h>

#define	DXR_TRIE_BITS		20

CTASSERT(DXR_TRIE_BITS >= 16 && DXR_TRIE_BITS <= 24);

/* DXR2: two-stage primary trie, instead of a single direct lookup table */
#define	DXR2

#if DXR_TRIE_BITS > 16
#define	DXR_D			16
#else
#define	DXR_D			(DXR_TRIE_BITS - 1)
#endif

#define	D_TBL_SIZE		(1 << DXR_D)
#define	DIRECT_TBL_SIZE		(1 << DXR_TRIE_BITS)
#define	DXR_RANGE_MASK		(0xffffffffU >> DXR_TRIE_BITS)
#define	DXR_RANGE_SHIFT		(32 - DXR_TRIE_BITS)

#define	DESC_BASE_BITS		22
#define	DESC_FRAGMENTS_BITS	(32 - DESC_BASE_BITS)
#define	BASE_MAX		((1 << DESC_BASE_BITS) - 1)
#define	RTBL_SIZE_INCR		(BASE_MAX / 64)

#if DXR_TRIE_BITS < 24
#define	FRAGS_MASK_SHORT	((1 << (23 - DXR_TRIE_BITS)) - 1)
#else
#define	FRAGS_MASK_SHORT	0
#endif
#define	FRAGS_PREF_SHORT	(((1 << DESC_FRAGMENTS_BITS) - 1) & \
				 ~FRAGS_MASK_SHORT)
#define	FRAGS_MARK_XL		(FRAGS_PREF_SHORT - 1)
#define	FRAGS_MARK_HIT		(FRAGS_PREF_SHORT - 2)

#define	IS_SHORT_FORMAT(x)	((x & FRAGS_PREF_SHORT) == FRAGS_PREF_SHORT)
#define	IS_LONG_FORMAT(x)	((x & FRAGS_PREF_SHORT) != FRAGS_PREF_SHORT)
#define	IS_XL_FORMAT(x)		(x == FRAGS_MARK_XL)

#define	RE_SHORT_MAX_NH		((1 << (DXR_TRIE_BITS - 8)) - 1)

#define	CHUNK_HASH_BITS		16
#define	CHUNK_HASH_SIZE		(1 << CHUNK_HASH_BITS)
#define	CHUNK_HASH_MASK		(CHUNK_HASH_SIZE - 1)

#define	TRIE_HASH_BITS		16
#define	TRIE_HASH_SIZE		(1 << TRIE_HASH_BITS)
#define	TRIE_HASH_MASK		(TRIE_HASH_SIZE - 1)

#define	XTBL_SIZE_INCR		(DIRECT_TBL_SIZE / 16)

#define	UNUSED_BUCKETS		8

/* Lookup structure elements */

struct direct_entry {
	uint32_t		fragments: DESC_FRAGMENTS_BITS,
				base: DESC_BASE_BITS;
};

struct range_entry_long {
	uint32_t		start: DXR_RANGE_SHIFT,
				nexthop: DXR_TRIE_BITS;
};

#if DXR_TRIE_BITS < 24
struct range_entry_short {
	uint16_t		start: DXR_RANGE_SHIFT - 8,
				nexthop: DXR_TRIE_BITS - 8;
};
#endif

/* Auxiliary structures */

struct heap_entry {
	uint32_t		start;
	uint32_t		end;
	uint32_t		preflen;
	uint32_t		nexthop;
};

struct chunk_desc {
	LIST_ENTRY(chunk_desc)	cd_all_le;
	LIST_ENTRY(chunk_desc)	cd_hash_le;
	uint32_t		cd_hash;
	uint32_t		cd_refcnt;
	uint32_t		cd_base;
	uint32_t		cd_cur_size;
	uint32_t		cd_max_size;
};

struct trie_desc {
	LIST_ENTRY(trie_desc)	td_all_le;
	LIST_ENTRY(trie_desc)	td_hash_le;
	uint32_t		td_hash;
	uint32_t		td_index;
	uint32_t		td_refcnt;
};
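
/*
 * Auxiliary build-time state, one instance per FIB.  It is reference
 * counted and reused across rebuilds, which allows incremental updates
 * to recycle already constructed range and trie chunks.
 */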
struct dxr_aux {
	/* Glue to external state */
	struct fib_data		*fd;
	uint32_t		fibnum;
	int			refcnt;

	/* Auxiliary build-time tables */
	struct direct_entry	direct_tbl[DIRECT_TBL_SIZE];
	uint16_t		d_tbl[D_TBL_SIZE];
	struct direct_entry	*x_tbl;
	union {
		struct range_entry_long	re;
		uint32_t	fragments;
	}			*range_tbl;

	/* Auxiliary internal state */
	uint32_t		updates_mask[DIRECT_TBL_SIZE / 32];
	struct trie_desc	*trietbl[D_TBL_SIZE];
	LIST_HEAD(, chunk_desc)	chunk_hashtbl[CHUNK_HASH_SIZE];
	LIST_HEAD(, chunk_desc)	all_chunks;
	LIST_HEAD(, chunk_desc) unused_chunks[UNUSED_BUCKETS];
	LIST_HEAD(, trie_desc)	trie_hashtbl[TRIE_HASH_SIZE];
	LIST_HEAD(, trie_desc)	all_trie;
	LIST_HEAD(, trie_desc)	unused_trie; /* abuses hash link entry */
	struct sockaddr_in	dst;
	struct sockaddr_in	mask;
	struct heap_entry	heap[33];
	uint32_t		prefixes;
	uint32_t		updates_low;
	uint32_t		updates_high;
	uint32_t		unused_chunks_size;
	uint32_t		xtbl_size;
	uint32_t		all_trie_cnt;
	uint32_t		unused_trie_cnt;
	uint32_t		trie_rebuilt_prefixes;
	uint32_t		heap_index;
	uint32_t		d_bits;
	uint32_t		rtbl_size;
	uint32_t		rtbl_top;
	uint32_t		rtbl_work_frags;
	uint32_t		work_chunk;
};

/* Main lookup structure container */

struct dxr {
	/* Lookup tables */
	void			*d;
	void			*x;
	void			*r;
	struct nhop_object	**nh_tbl;

	/* Glue to external state */
	struct dxr_aux		*aux;
	struct fib_data		*fd;
	struct epoch_context	epoch_ctx;
	uint32_t		fibnum;
	uint16_t		d_shift;
	uint16_t		x_shift;
	uint32_t		x_mask;
};

static MALLOC_DEFINE(M_DXRLPM, "dxr", "DXR LPM");
static MALLOC_DEFINE(M_DXRAUX, "dxr aux", "DXR auxiliary");

uma_zone_t chunk_zone;
uma_zone_t trie_zone;

VNET_DEFINE_STATIC(int, frag_limit) = 100;
#define	V_frag_limit	VNET(frag_limit)

/* Binary search for a matching address range */
#define	DXR_LOOKUP_STAGE					\
	if (masked_dst < range[middle].start) {			\
		upperbound = middle;				\
		middle = (middle + lowerbound) / 2;		\
	} else if (masked_dst < range[middle + 1].start)	\
		return (range[middle].nexthop);			\
	else {							\
		lowerbound = middle + 1;			\
		middle = (upperbound + middle + 1) / 2;		\
	}							\
	if (upperbound == lowerbound)				\
		return (range[lowerbound].nexthop);

static int
range_lookup(struct range_entry_long *rt, struct direct_entry de, uint32_t dst)
{
	uint32_t base;
	uint32_t upperbound;
	uint32_t middle;
	uint32_t lowerbound;
	uint32_t masked_dst;

	base = de.base;
	lowerbound = 0;
	masked_dst = dst & DXR_RANGE_MASK;

#if DXR_TRIE_BITS < 24
	if (__predict_true(IS_SHORT_FORMAT(de.fragments))) {
		upperbound = de.fragments & FRAGS_MASK_SHORT;
		struct range_entry_short *range =
		    (struct range_entry_short *) &rt[base];

		masked_dst >>= 8;
		middle = upperbound;
		upperbound = upperbound * 2 + 1;

		for (;;) {
			DXR_LOOKUP_STAGE
			DXR_LOOKUP_STAGE
		}
	}
#endif

	upperbound = de.fragments;
	middle = upperbound / 2;
	struct range_entry_long *range = &rt[base];
	if (__predict_false(IS_XL_FORMAT(de.fragments))) {
		upperbound = *((uint32_t *) range);
		range++;
		middle = upperbound / 2;
	}

	for (;;) {
		DXR_LOOKUP_STAGE
		DXR_LOOKUP_STAGE
	}
}
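
/*
 * Generate lookup functions specialized for a fixed DXR_D width, so that
 * the shift and mask amounts below become compile-time constants.
 */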
#define	DXR_LOOKUP_DEFINE(D)						\
static int inline							\
dxr_lookup_##D(struct dxr *dxr, uint32_t dst)				\
{									\
	struct direct_entry de;						\
	uint16_t *dt = dxr->d;						\
	struct direct_entry *xt = dxr->x;				\
									\
	de = xt[(dt[dst >> (32 - (D))] << (DXR_TRIE_BITS - (D)))	\
	    + ((dst >> DXR_RANGE_SHIFT) &				\
	    (0xffffffffU >> (32 - DXR_TRIE_BITS + (D))))];		\
	if (__predict_true(de.fragments == FRAGS_MARK_HIT))		\
		return (de.base);					\
	return (range_lookup(dxr->r, de, dst));				\
}									\
									\
static struct nhop_object *						\
dxr_fib_lookup_##D(void *algo_data,					\
    const struct flm_lookup_key key, uint32_t scopeid __unused)	\
{									\
	struct dxr *dxr = algo_data;					\
									\
	return (dxr->nh_tbl[dxr_lookup_##D(dxr,				\
	    ntohl(key.addr4.s_addr))]);					\
}

#ifdef DXR2
#if DXR_TRIE_BITS > 16
DXR_LOOKUP_DEFINE(16)
#endif
DXR_LOOKUP_DEFINE(15)
DXR_LOOKUP_DEFINE(14)
DXR_LOOKUP_DEFINE(13)
DXR_LOOKUP_DEFINE(12)
DXR_LOOKUP_DEFINE(11)
DXR_LOOKUP_DEFINE(10)
DXR_LOOKUP_DEFINE(9)
#endif /* DXR2 */

static int inline
dxr_lookup(struct dxr *dxr, uint32_t dst)
{
	struct direct_entry de;
#ifdef DXR2
	uint16_t *dt = dxr->d;
	struct direct_entry *xt = dxr->x;

	de = xt[(dt[dst >> dxr->d_shift] << dxr->x_shift) +
	    ((dst >> DXR_RANGE_SHIFT) & dxr->x_mask)];
#else /* !DXR2 */
	struct direct_entry *dt = dxr->d;

	de = dt[dst >> DXR_RANGE_SHIFT];
#endif /* !DXR2 */
	if (__predict_true(de.fragments == FRAGS_MARK_HIT))
		return (de.base);
	return (range_lookup(dxr->r, de, dst));
}

static void
initheap(struct dxr_aux *da, uint32_t dst_u32, uint32_t chunk)
{
	struct heap_entry *fhp = &da->heap[0];
	struct rtentry *rt;
	struct route_nhop_data rnd;

	da->heap_index = 0;
	da->dst.sin_addr.s_addr = htonl(dst_u32);
	rt = fib4_lookup_rt(da->fibnum, da->dst.sin_addr, 0, NHR_UNLOCKED,
	    &rnd);
	if (rt != NULL) {
		struct in_addr addr;
		uint32_t scopeid;

		rt_get_inet_prefix_plen(rt, &addr, &fhp->preflen, &scopeid);
		fhp->start = ntohl(addr.s_addr);
		fhp->end = fhp->start;
		if (fhp->preflen < 32)
			fhp->end |= (0xffffffffU >> fhp->preflen);
		fhp->nexthop = fib_get_nhop_idx(da->fd, rnd.rnd_nhop);
	} else {
		fhp->preflen = fhp->nexthop = fhp->start = 0;
		fhp->end = 0xffffffffU;
	}
}

static uint32_t
chunk_size(struct dxr_aux *da, struct direct_entry *fdesc)
{

	if (IS_SHORT_FORMAT(fdesc->fragments))
		return ((fdesc->fragments & FRAGS_MASK_SHORT) + 1);
	else if (IS_XL_FORMAT(fdesc->fragments))
		return (da->range_tbl[fdesc->base].fragments + 2);
	else /* if (IS_LONG_FORMAT(fdesc->fragments)) */
		return (fdesc->fragments + 1);
}

static uint32_t
chunk_hash(struct dxr_aux *da, struct direct_entry *fdesc)
{
	uint32_t size = chunk_size(da, fdesc);
	uint32_t *p = (uint32_t *) &da->range_tbl[fdesc->base];
	uint32_t *l = (uint32_t *) &da->range_tbl[fdesc->base + size];
	uint32_t hash = fdesc->fragments;

	for (; p < l; p++)
		hash = (hash << 7) + (hash >> 13) + *p;

	return (hash + (hash >> 16));
}

static int
chunk_ref(struct dxr_aux *da, uint32_t chunk)
{
	struct direct_entry *fdesc = &da->direct_tbl[chunk];
	struct chunk_desc *cdp, *empty_cdp;
	uint32_t base = fdesc->base;
	uint32_t size = chunk_size(da, fdesc);
	uint32_t hash = chunk_hash(da, fdesc);
	int i;

	/* Find an existing descriptor */
	LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK],
	    cd_hash_le) {
		if (cdp->cd_hash != hash || cdp->cd_cur_size != size ||
		    memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base],
		    sizeof(struct range_entry_long) * size))
			continue;
		da->rtbl_top = fdesc->base;
		fdesc->base = cdp->cd_base;
		cdp->cd_refcnt++;
		return (0);
	}

	/* No matching chunks found.  Find an empty one to recycle. */
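	/*
	 * Unused chunks are kept on lists bucketed by size; chunks of
	 * UNUSED_BUCKETS elements or more all share bucket zero, which is
	 * scanned for the smallest one that still fits.
	 */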
	for (cdp = NULL, i = size; cdp == NULL && i < UNUSED_BUCKETS; i++)
		cdp = LIST_FIRST(&da->unused_chunks[i]);

	if (cdp == NULL)
		LIST_FOREACH(empty_cdp, &da->unused_chunks[0], cd_hash_le)
			if (empty_cdp->cd_max_size >= size && (cdp == NULL ||
			    empty_cdp->cd_max_size < cdp->cd_max_size)) {
				cdp = empty_cdp;
				if (empty_cdp->cd_max_size == size)
					break;
			}

	if (cdp != NULL) {
		/* Copy from heap into the recycled chunk */
		bcopy(&da->range_tbl[fdesc->base],
		    &da->range_tbl[cdp->cd_base],
		    size * sizeof(struct range_entry_long));
		fdesc->base = cdp->cd_base;
		da->rtbl_top -= size;
		da->unused_chunks_size -= cdp->cd_max_size;
		if (cdp->cd_max_size > size) {
			/* Split the range in two, need a new descriptor */
			empty_cdp = uma_zalloc(chunk_zone, M_NOWAIT);
			if (empty_cdp == NULL)
				return (1);
			LIST_INSERT_BEFORE(cdp, empty_cdp, cd_all_le);
			empty_cdp->cd_base = cdp->cd_base + size;
			empty_cdp->cd_cur_size = 0;
			empty_cdp->cd_max_size = cdp->cd_max_size - size;

			i = empty_cdp->cd_max_size;
			if (i >= UNUSED_BUCKETS)
				i = 0;
			LIST_INSERT_HEAD(&da->unused_chunks[i], empty_cdp,
			    cd_hash_le);

			da->unused_chunks_size += empty_cdp->cd_max_size;
			cdp->cd_max_size = size;
		}
		LIST_REMOVE(cdp, cd_hash_le);
	} else {
		/* Alloc a new descriptor at the top of the heap */
		cdp = uma_zalloc(chunk_zone, M_NOWAIT);
		if (cdp == NULL)
			return (1);
		cdp->cd_max_size = size;
		cdp->cd_base = fdesc->base;
		LIST_INSERT_HEAD(&da->all_chunks, cdp, cd_all_le);
		KASSERT(cdp->cd_base + cdp->cd_max_size == da->rtbl_top,
		    ("dxr: %s %d", __FUNCTION__, __LINE__));
	}

	cdp->cd_hash = hash;
	cdp->cd_refcnt = 1;
	cdp->cd_cur_size = size;
	LIST_INSERT_HEAD(&da->chunk_hashtbl[hash & CHUNK_HASH_MASK], cdp,
	    cd_hash_le);
	if (da->rtbl_top >= da->rtbl_size) {
		if (da->rtbl_top >= BASE_MAX) {
			FIB_PRINTF(LOG_ERR, da->fd,
			    "structural limit exceeded at %d "
			    "range table elements", da->rtbl_top);
			return (1);
		}
		da->rtbl_size += RTBL_SIZE_INCR;
		i = (BASE_MAX - da->rtbl_top) * LOG_DEBUG / BASE_MAX;
		FIB_PRINTF(i, da->fd, "range table at %d%% structural limit",
		    da->rtbl_top * 100 / BASE_MAX);
		da->range_tbl = realloc(da->range_tbl,
		    sizeof(*da->range_tbl) * da->rtbl_size + FRAGS_PREF_SHORT,
		    M_DXRAUX, M_NOWAIT);
		if (da->range_tbl == NULL)
			return (1);
	}

	return (0);
}

static void
chunk_unref(struct dxr_aux *da, uint32_t chunk)
{
	struct direct_entry *fdesc = &da->direct_tbl[chunk];
	struct chunk_desc *cdp, *cdp2;
	uint32_t base = fdesc->base;
	uint32_t size = chunk_size(da, fdesc);
	uint32_t hash = chunk_hash(da, fdesc);
	int i;

	/* Find the corresponding descriptor */
	LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK],
	    cd_hash_le)
		if (cdp->cd_hash == hash && cdp->cd_cur_size == size &&
		    memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base],
		    sizeof(struct range_entry_long) * size) == 0)
			break;

	KASSERT(cdp != NULL, ("dxr: dangling chunk"));
	if (--cdp->cd_refcnt > 0)
		return;

	LIST_REMOVE(cdp, cd_hash_le);
	da->unused_chunks_size += cdp->cd_max_size;
	cdp->cd_cur_size = 0;

	/* Attempt to merge with the preceding chunk, if empty */
	cdp2 = LIST_NEXT(cdp, cd_all_le);
	if (cdp2 != NULL && cdp2->cd_cur_size == 0) {
		KASSERT(cdp2->cd_base + cdp2->cd_max_size == cdp->cd_base,
		    ("dxr: %s %d", __FUNCTION__, __LINE__));
		LIST_REMOVE(cdp, cd_all_le);
		LIST_REMOVE(cdp2, cd_hash_le);
		cdp2->cd_max_size += cdp->cd_max_size;
		uma_zfree(chunk_zone, cdp);
		cdp = cdp2;
	}

	/* Attempt to merge with the subsequent chunk, if empty */
	cdp2 = LIST_PREV(cdp, &da->all_chunks, chunk_desc, cd_all_le);
	if (cdp2 != NULL && cdp2->cd_cur_size == 0) {
		KASSERT(cdp->cd_base + cdp->cd_max_size == cdp2->cd_base,
		    ("dxr: %s %d", __FUNCTION__, __LINE__));
		LIST_REMOVE(cdp, cd_all_le);
		LIST_REMOVE(cdp2, cd_hash_le);
		cdp2->cd_max_size += cdp->cd_max_size;
		cdp2->cd_base = cdp->cd_base;
		uma_zfree(chunk_zone, cdp);
		cdp = cdp2;
	}

	if (cdp->cd_base + cdp->cd_max_size == da->rtbl_top) {
		/* Free the chunk on the top of the range heap, trim the heap */
		KASSERT(cdp == LIST_FIRST(&da->all_chunks),
		    ("dxr: %s %d", __FUNCTION__, __LINE__));
		da->rtbl_top -= cdp->cd_max_size;
		da->unused_chunks_size -= cdp->cd_max_size;
		LIST_REMOVE(cdp, cd_all_le);
		uma_zfree(chunk_zone, cdp);
		return;
	}

	i = cdp->cd_max_size;
	if (i >= UNUSED_BUCKETS)
		i = 0;
	LIST_INSERT_HEAD(&da->unused_chunks[i], cdp, cd_hash_le);
}

#ifdef DXR2
static uint32_t
trie_hash(struct dxr_aux *da, uint32_t dxr_x, uint32_t index)
{
	uint32_t i, *val;
	uint32_t hash = 0;

	for (i = 0; i < (1 << dxr_x); i++) {
		hash = (hash << 3) ^ (hash >> 3);
		val = (uint32_t *)
		    (void *) &da->direct_tbl[(index << dxr_x) + i];
		hash += (*val << 5);
		hash += (*val >> 5);
	}

	return (hash + (hash >> 16));
}

static int
trie_ref(struct dxr_aux *da, uint32_t index)
{
	struct trie_desc *tp;
	uint32_t dxr_d = da->d_bits;
	uint32_t dxr_x = DXR_TRIE_BITS - dxr_d;
	uint32_t hash = trie_hash(da, dxr_x, index);

	/* Find an existing descriptor */
	LIST_FOREACH(tp, &da->trie_hashtbl[hash & TRIE_HASH_MASK], td_hash_le)
		if (tp->td_hash == hash &&
		    memcmp(&da->direct_tbl[index << dxr_x],
		    &da->x_tbl[tp->td_index << dxr_x],
		    sizeof(*da->x_tbl) << dxr_x) == 0) {
			tp->td_refcnt++;
			da->trietbl[index] = tp;
			return(tp->td_index);
		}

	tp = LIST_FIRST(&da->unused_trie);
	if (tp != NULL) {
		LIST_REMOVE(tp, td_hash_le);
		da->unused_trie_cnt--;
	} else {
		tp = uma_zalloc(trie_zone, M_NOWAIT);
		if (tp == NULL)
			return (-1);
		LIST_INSERT_HEAD(&da->all_trie, tp, td_all_le);
		tp->td_index = da->all_trie_cnt++;
	}

	tp->td_hash = hash;
	tp->td_refcnt = 1;
	LIST_INSERT_HEAD(&da->trie_hashtbl[hash & TRIE_HASH_MASK], tp,
	    td_hash_le);
	memcpy(&da->x_tbl[tp->td_index << dxr_x],
	    &da->direct_tbl[index << dxr_x], sizeof(*da->x_tbl) << dxr_x);
	da->trietbl[index] = tp;
	if (da->all_trie_cnt >= da->xtbl_size >> dxr_x) {
		da->xtbl_size += XTBL_SIZE_INCR;
		da->x_tbl = realloc(da->x_tbl,
		    sizeof(*da->x_tbl) * da->xtbl_size, M_DXRAUX, M_NOWAIT);
		if (da->x_tbl == NULL)
			return (-1);
	}
	return(tp->td_index);
}

static void
trie_unref(struct dxr_aux *da, uint32_t index)
{
	struct trie_desc *tp = da->trietbl[index];

	if (tp == NULL)
		return;
	da->trietbl[index] = NULL;
	if (--tp->td_refcnt > 0)
		return;

	LIST_REMOVE(tp, td_hash_le);
	da->unused_trie_cnt++;
	if (tp->td_index != da->all_trie_cnt - 1) {
		LIST_INSERT_HEAD(&da->unused_trie, tp, td_hash_le);
		return;
	}

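	/*
	 * The descriptor being freed occupies the top of the trie table.
	 * Release it and keep trimming the table for as long as the new
	 * topmost descriptor is also found on the unused list.
	 */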
	do {
		da->all_trie_cnt--;
		da->unused_trie_cnt--;
		LIST_REMOVE(tp, td_all_le);
		uma_zfree(trie_zone, tp);
		LIST_FOREACH(tp, &da->unused_trie, td_hash_le)
			if (tp->td_index == da->all_trie_cnt - 1) {
				LIST_REMOVE(tp, td_hash_le);
				break;
			}
	} while (tp != NULL);
}
#endif

static void
heap_inject(struct dxr_aux *da, uint32_t start, uint32_t end, uint32_t preflen,
    uint32_t nh)
{
	struct heap_entry *fhp;
	int i;

	for (i = da->heap_index; i >= 0; i--) {
		if (preflen > da->heap[i].preflen)
			break;
		else if (preflen < da->heap[i].preflen)
			da->heap[i + 1] = da->heap[i];
		else
			return;
	}

	fhp = &da->heap[i + 1];
	fhp->preflen = preflen;
	fhp->start = start;
	fhp->end = end;
	fhp->nexthop = nh;
	da->heap_index++;
}

static int
dxr_walk(struct rtentry *rt, void *arg)
{
	struct dxr_aux *da = arg;
	uint32_t chunk = da->work_chunk;
	uint32_t first = chunk << DXR_RANGE_SHIFT;
	uint32_t last = first | DXR_RANGE_MASK;
	struct range_entry_long *fp =
	    &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re;
	struct heap_entry *fhp = &da->heap[da->heap_index];
	uint32_t preflen, nh, start, end, scopeid;
	struct in_addr addr;

	rt_get_inet_prefix_plen(rt, &addr, &preflen, &scopeid);
	start = ntohl(addr.s_addr);
	if (start > last)
		return (-1);	/* Beyond chunk boundaries, we are done */
	if (start < first)
		return (0);	/* Skip this route */

	end = start;
	if (preflen < 32)
		end |= (0xffffffffU >> preflen);
	nh = fib_get_nhop_idx(da->fd, rt_get_raw_nhop(rt));

	if (start == fhp->start)
		heap_inject(da, start, end, preflen, nh);
	else {
		/* start > fhp->start */
		while (start > fhp->end) {
			uint32_t oend = fhp->end;

			if (da->heap_index > 0) {
				fhp--;
				da->heap_index--;
			} else
				initheap(da, fhp->end + 1, chunk);
			if (fhp->end > oend && fhp->nexthop != fp->nexthop) {
				fp++;
				da->rtbl_work_frags++;
				fp->start = (oend + 1) & DXR_RANGE_MASK;
				fp->nexthop = fhp->nexthop;
			}
		}
		if (start > ((chunk << DXR_RANGE_SHIFT) | fp->start) &&
		    nh != fp->nexthop) {
			fp++;
			da->rtbl_work_frags++;
			fp->start = start & DXR_RANGE_MASK;
		} else if (da->rtbl_work_frags) {
			if ((--fp)->nexthop == nh)
				da->rtbl_work_frags--;
			else
				fp++;
		}
		fp->nexthop = nh;
		heap_inject(da, start, end, preflen, nh);
	}

	return (0);
}

static int
update_chunk(struct dxr_aux *da, uint32_t chunk)
{
	struct range_entry_long *fp;
#if DXR_TRIE_BITS < 24
	struct range_entry_short *fps;
	uint32_t start, nh, i;
#endif
	struct heap_entry *fhp;
	uint32_t first = chunk << DXR_RANGE_SHIFT;
	uint32_t last = first | DXR_RANGE_MASK;

	if (da->direct_tbl[chunk].fragments != FRAGS_MARK_HIT)
		chunk_unref(da, chunk);

	initheap(da, first, chunk);

	fp = &da->range_tbl[da->rtbl_top].re;
	da->rtbl_work_frags = 0;
	fp->start = first & DXR_RANGE_MASK;
	fp->nexthop = da->heap[0].nexthop;

	da->dst.sin_addr.s_addr = htonl(first);
	da->mask.sin_addr.s_addr = htonl(~DXR_RANGE_MASK);

	da->work_chunk = chunk;
	rib_walk_from(da->fibnum, AF_INET, RIB_FLAG_LOCKED,
	    (struct sockaddr *) &da->dst, (struct sockaddr *) &da->mask,
	    dxr_walk, da);

	/* Flush any remaining objects on the heap */
	fp = &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re;
	fhp = &da->heap[da->heap_index];
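	/*
	 * Heap entries with preflen <= DXR_TRIE_BITS span this entire chunk
	 * (or more), so draining can stop once only such prefixes remain on
	 * the heap.
	 */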
	while (fhp->preflen > DXR_TRIE_BITS) {
		uint32_t oend = fhp->end;

		if (da->heap_index > 0) {
			fhp--;
			da->heap_index--;
		} else
			initheap(da, fhp->end + 1, chunk);
		if (fhp->end > oend && fhp->nexthop != fp->nexthop) {
			/* Have we crossed the upper chunk boundary? */
			if (oend >= last)
				break;
			fp++;
			da->rtbl_work_frags++;
			fp->start = (oend + 1) & DXR_RANGE_MASK;
			fp->nexthop = fhp->nexthop;
		}
	}

	/* Direct hit if the chunk contains only a single fragment */
	if (da->rtbl_work_frags == 0) {
		da->direct_tbl[chunk].base = fp->nexthop;
		da->direct_tbl[chunk].fragments = FRAGS_MARK_HIT;
		return (0);
	}

	da->direct_tbl[chunk].base = da->rtbl_top;
	da->direct_tbl[chunk].fragments = da->rtbl_work_frags;

#if DXR_TRIE_BITS < 24
	/* Check whether the chunk can be more compactly encoded */
	fp = &da->range_tbl[da->rtbl_top].re;
	for (i = 0; i <= da->rtbl_work_frags; i++, fp++)
		if ((fp->start & 0xff) != 0 || fp->nexthop > RE_SHORT_MAX_NH)
			break;
	if (i == da->rtbl_work_frags + 1) {
		fp = &da->range_tbl[da->rtbl_top].re;
		fps = (void *) fp;
		for (i = 0; i <= da->rtbl_work_frags; i++, fp++, fps++) {
			start = fp->start;
			nh = fp->nexthop;
			fps->start = start >> 8;
			fps->nexthop = nh;
		}
		fps->start = start >> 8;
		fps->nexthop = nh;
		da->rtbl_work_frags >>= 1;
		da->direct_tbl[chunk].fragments =
		    da->rtbl_work_frags | FRAGS_PREF_SHORT;
	} else
#endif
	if (da->rtbl_work_frags >= FRAGS_MARK_HIT) {
		da->direct_tbl[chunk].fragments = FRAGS_MARK_XL;
		memmove(&da->range_tbl[da->rtbl_top + 1],
		    &da->range_tbl[da->rtbl_top],
		    (da->rtbl_work_frags + 1) * sizeof(*da->range_tbl));
		da->range_tbl[da->rtbl_top].fragments = da->rtbl_work_frags;
		da->rtbl_work_frags++;
	}
	da->rtbl_top += (da->rtbl_work_frags + 1);
	return (chunk_ref(da, chunk));
}

static void
dxr_build(struct dxr *dxr)
{
	struct dxr_aux *da = dxr->aux;
	struct chunk_desc *cdp;
	struct rib_rtable_info rinfo;
	struct timeval t0, t1, t2, t3;
	uint32_t r_size, dxr_tot_size;
	uint32_t i, m, range_rebuild = 0;
	uint32_t range_frag;
#ifdef DXR2
	struct trie_desc *tp;
	uint32_t d_tbl_size, dxr_x, d_size, x_size;
	uint32_t trie_rebuild = 0, prev_size = 0;
	uint32_t trie_frag;
	int32_t ti;
#endif

	KASSERT(dxr->d == NULL, ("dxr: d not free"));

	if (da == NULL) {
		da = malloc(sizeof(*dxr->aux), M_DXRAUX, M_NOWAIT);
		if (da == NULL)
			return;
		dxr->aux = da;
		da->fibnum = dxr->fibnum;
		da->refcnt = 1;
		LIST_INIT(&da->all_chunks);
		LIST_INIT(&da->all_trie);
		da->rtbl_size = RTBL_SIZE_INCR;
		da->range_tbl = NULL;
		da->xtbl_size = XTBL_SIZE_INCR;
		da->x_tbl = NULL;
		bzero(&da->dst, sizeof(da->dst));
		bzero(&da->mask, sizeof(da->mask));
		da->dst.sin_len = sizeof(da->dst);
		da->mask.sin_len = sizeof(da->mask);
		da->dst.sin_family = AF_INET;
		da->mask.sin_family = AF_INET;
	}
	if (da->range_tbl == NULL) {
		da->range_tbl = malloc(sizeof(*da->range_tbl) * da->rtbl_size
		    + FRAGS_PREF_SHORT, M_DXRAUX, M_NOWAIT);
		if (da->range_tbl == NULL)
			return;
		range_rebuild = 1;
	}
#ifdef DXR2
	if (da->x_tbl == NULL) {
		da->x_tbl = malloc(sizeof(*da->x_tbl) * da->xtbl_size,
		    M_DXRAUX, M_NOWAIT);
		if (da->x_tbl == NULL)
			return;
		trie_rebuild = 1;
	}
#endif
	da->fd = dxr->fd;

	microuptime(&t0);

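	/*
	 * Fetch the current next-hop array and rtable geometry, then either
	 * rebuild the range table from scratch or update only the chunks
	 * marked dirty in updates_mask.
	 */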
	dxr->nh_tbl = fib_get_nhop_array(da->fd);
	fib_get_rtable_info(fib_get_rh(da->fd), &rinfo);

	if (da->updates_low > da->updates_high)
		range_rebuild = 1;

range_build:
	if (range_rebuild) {
		/* Bulk cleanup */
		bzero(da->chunk_hashtbl, sizeof(da->chunk_hashtbl));
		while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) {
			LIST_REMOVE(cdp, cd_all_le);
			uma_zfree(chunk_zone, cdp);
		}
		for (i = 0; i < UNUSED_BUCKETS; i++)
			LIST_INIT(&da->unused_chunks[i]);
		da->unused_chunks_size = 0;
		da->rtbl_top = 0;
		da->updates_low = 0;
		da->updates_high = DIRECT_TBL_SIZE - 1;
		memset(da->updates_mask, 0xff, sizeof(da->updates_mask));
		for (i = 0; i < DIRECT_TBL_SIZE; i++) {
			da->direct_tbl[i].fragments = FRAGS_MARK_HIT;
			da->direct_tbl[i].base = 0;
		}
	}
	da->prefixes = rinfo.num_prefixes;

	/* DXR: construct direct & range table */
	for (i = da->updates_low; i <= da->updates_high; i++) {
		m = da->updates_mask[i >> 5] >> (i & 0x1f);
		if (m == 0)
			i |= 0x1f;
		else if (m & 1 && update_chunk(da, i) != 0)
			return;
	}

	range_frag = 0;
	if (da->rtbl_top)
		range_frag = da->unused_chunks_size * 10000ULL / da->rtbl_top;
	if (range_frag > V_frag_limit) {
		range_rebuild = 1;
		goto range_build;
	}

	r_size = sizeof(*da->range_tbl) * da->rtbl_top;
	microuptime(&t1);

#ifdef DXR2
	if (range_rebuild ||
	    abs(fls(da->prefixes) - fls(da->trie_rebuilt_prefixes)) > 1)
		trie_rebuild = 1;

trie_build:
	if (trie_rebuild) {
		da->trie_rebuilt_prefixes = da->prefixes;
		da->d_bits = DXR_D;
		da->updates_low = 0;
		da->updates_high = DIRECT_TBL_SIZE - 1;
		if (!range_rebuild)
			memset(da->updates_mask, 0xff,
			    sizeof(da->updates_mask));
	}

dxr2_try_squeeze:
	if (trie_rebuild) {
		/* Bulk cleanup */
		bzero(da->trietbl, sizeof(da->trietbl));
		bzero(da->trie_hashtbl, sizeof(da->trie_hashtbl));
		while ((tp = LIST_FIRST(&da->all_trie)) != NULL) {
			LIST_REMOVE(tp, td_all_le);
			uma_zfree(trie_zone, tp);
		}
		LIST_INIT(&da->unused_trie);
		da->all_trie_cnt = da->unused_trie_cnt = 0;
	}

	/* Populate d_tbl, x_tbl */
	dxr_x = DXR_TRIE_BITS - da->d_bits;
	d_tbl_size = (1 << da->d_bits);

	for (i = da->updates_low >> dxr_x; i <= da->updates_high >> dxr_x;
	    i++) {
		if (!trie_rebuild) {
			m = 0;
			for (int j = 0; j < (1 << dxr_x); j += 32)
				m |= da->updates_mask[((i << dxr_x) + j) >> 5];
			if (m == 0)
				continue;
			trie_unref(da, i);
		}
		ti = trie_ref(da, i);
		if (ti < 0)
			return;
		da->d_tbl[i] = ti;
	}

	trie_frag = 0;
	if (da->all_trie_cnt)
		trie_frag = da->unused_trie_cnt * 10000ULL / da->all_trie_cnt;
	if (trie_frag > V_frag_limit) {
		trie_rebuild = 1;
		goto trie_build;
	}

	d_size = sizeof(*da->d_tbl) * d_tbl_size;
	x_size = sizeof(*da->x_tbl) * DIRECT_TBL_SIZE / d_tbl_size
	    * da->all_trie_cnt;
	dxr_tot_size = d_size + x_size + r_size;

	if (trie_rebuild == 1) {
		/* Try to find a more compact D/X split */
		if (prev_size == 0 || dxr_tot_size <= prev_size)
			da->d_bits--;
		else {
			da->d_bits++;
			trie_rebuild = 2;
		}
		prev_size = dxr_tot_size;
		goto dxr2_try_squeeze;
	}
	microuptime(&t2);
#else /* !DXR2 */
	dxr_tot_size = sizeof(da->direct_tbl) + r_size;
	t2 = t1;
#endif

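	/*
	 * Copy the finished tables into a single contiguous allocation,
	 * keeping the lookup-time image compact and cache friendly.
	 */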
	dxr->d = malloc(dxr_tot_size, M_DXRLPM, M_NOWAIT);
	if (dxr->d == NULL)
		return;
#ifdef DXR2
	memcpy(dxr->d, da->d_tbl, d_size);
	dxr->x = ((char *) dxr->d) + d_size;
	memcpy(dxr->x, da->x_tbl, x_size);
	dxr->r = ((char *) dxr->x) + x_size;
	dxr->d_shift = 32 - da->d_bits;
	dxr->x_shift = dxr_x;
	dxr->x_mask = 0xffffffffU >> (32 - dxr_x);
#else /* !DXR2 */
	memcpy(dxr->d, da->direct_tbl, sizeof(da->direct_tbl));
	dxr->r = ((char *) dxr->d) + sizeof(da->direct_tbl);
#endif
	memcpy(dxr->r, da->range_tbl, r_size);

	if (da->updates_low <= da->updates_high)
		bzero(&da->updates_mask[da->updates_low / 32],
		    (da->updates_high - da->updates_low) / 8 + 1);
	da->updates_low = DIRECT_TBL_SIZE - 1;
	da->updates_high = 0;
	microuptime(&t3);

#ifdef DXR2
	FIB_PRINTF(LOG_INFO, da->fd, "D%dX%dR, %d prefixes, %d nhops (max)",
	    da->d_bits, dxr_x, rinfo.num_prefixes, rinfo.num_nhops);
#else
	FIB_PRINTF(LOG_INFO, da->fd, "D%dR, %d prefixes, %d nhops (max)",
	    DXR_D, rinfo.num_prefixes, rinfo.num_nhops);
#endif
	i = dxr_tot_size * 100;
	if (rinfo.num_prefixes)
		i /= rinfo.num_prefixes;
	FIB_PRINTF(LOG_INFO, da->fd, "%d.%02d KBytes, %d.%02d Bytes/prefix",
	    dxr_tot_size / 1024, dxr_tot_size * 100 / 1024 % 100,
	    i / 100, i % 100);
#ifdef DXR2
	FIB_PRINTF(LOG_INFO, da->fd,
	    "%d.%02d%% trie, %d.%02d%% range fragmentation",
	    trie_frag / 100, trie_frag % 100,
	    range_frag / 100, range_frag % 100);
#else
	FIB_PRINTF(LOG_INFO, da->fd, "%d.%02d%% range fragmentation",
	    range_frag / 100, range_frag % 100);
#endif
	i = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
	FIB_PRINTF(LOG_INFO, da->fd, "range table %s in %u.%03u ms",
	    range_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000);
#ifdef DXR2
	i = (t2.tv_sec - t1.tv_sec) * 1000000 + t2.tv_usec - t1.tv_usec;
	FIB_PRINTF(LOG_INFO, da->fd, "trie %s in %u.%03u ms",
	    trie_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000);
#endif
	i = (t3.tv_sec - t2.tv_sec) * 1000000 + t3.tv_usec - t2.tv_usec;
	FIB_PRINTF(LOG_INFO, da->fd, "snapshot forked in %u.%03u ms",
	    i / 1000, i % 1000);
}

/*
 * Glue functions for attaching to FreeBSD 13 fib_algo infrastructure.
 */

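/*
 * Fallback lookup path using run-time shift and mask values; the
 * specialized dxr_fib_lookup_<D> variants generated above are preferred
 * whenever da->d_bits matches one of the compiled-in widths.
 */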
static struct nhop_object *
dxr_fib_lookup(void *algo_data, const struct flm_lookup_key key,
    uint32_t scopeid)
{
	struct dxr *dxr = algo_data;

	return (dxr->nh_tbl[dxr_lookup(dxr, ntohl(key.addr4.s_addr))]);
}

static enum flm_op_result
dxr_init(uint32_t fibnum, struct fib_data *fd, void *old_data, void **data)
{
	struct dxr *old_dxr = old_data;
	struct dxr_aux *da = NULL;
	struct dxr *dxr;

	dxr = malloc(sizeof(*dxr), M_DXRAUX, M_NOWAIT);
	if (dxr == NULL)
		return (FLM_REBUILD);

	/* Check whether we may reuse the old auxiliary structures */
	if (old_dxr != NULL && old_dxr->aux != NULL) {
		da = old_dxr->aux;
		atomic_add_int(&da->refcnt, 1);
	}

	dxr->aux = da;
	dxr->d = NULL;
	dxr->fd = fd;
	dxr->fibnum = fibnum;
	*data = dxr;

	return (FLM_SUCCESS);
}

static void
dxr_destroy(void *data)
{
	struct dxr *dxr = data;
	struct dxr_aux *da;
	struct chunk_desc *cdp;
	struct trie_desc *tp;

	if (dxr->d != NULL)
		free(dxr->d, M_DXRLPM);

	da = dxr->aux;
	free(dxr, M_DXRAUX);

	if (da == NULL || atomic_fetchadd_int(&da->refcnt, -1) > 1)
		return;

	/* Release auxiliary structures */
	while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) {
		LIST_REMOVE(cdp, cd_all_le);
		uma_zfree(chunk_zone, cdp);
	}
	while ((tp = LIST_FIRST(&da->all_trie)) != NULL) {
		LIST_REMOVE(tp, td_all_le);
		uma_zfree(trie_zone, tp);
	}
	free(da->range_tbl, M_DXRAUX);
	free(da->x_tbl, M_DXRAUX);
	free(da, M_DXRAUX);
}

static void
epoch_dxr_destroy(epoch_context_t ctx)
{
	struct dxr *dxr = __containerof(ctx, struct dxr, epoch_ctx);

	dxr_destroy(dxr);
}

static void *
choose_lookup_fn(struct dxr_aux *da)
{

#ifdef DXR2
	switch (da->d_bits) {
#if DXR_TRIE_BITS > 16
	case 16:
		return (dxr_fib_lookup_16);
#endif
	case 15:
		return (dxr_fib_lookup_15);
	case 14:
		return (dxr_fib_lookup_14);
	case 13:
		return (dxr_fib_lookup_13);
	case 12:
		return (dxr_fib_lookup_12);
	case 11:
		return (dxr_fib_lookup_11);
	case 10:
		return (dxr_fib_lookup_10);
	case 9:
		return (dxr_fib_lookup_9);
	}
#endif /* DXR2 */
	return (dxr_fib_lookup);
}

static enum flm_op_result
dxr_dump_end(void *data, struct fib_dp *dp)
{
	struct dxr *dxr = data;
	struct dxr_aux *da;

	dxr_build(dxr);

	da = dxr->aux;
	if (da == NULL)
		return (FLM_REBUILD);

	/* Structural limit exceeded, hard error */
	if (da->rtbl_top >= BASE_MAX)
		return (FLM_ERROR);

	/* A malloc(,, M_NOWAIT) failed somewhere, retry later */
	if (dxr->d == NULL)
		return (FLM_REBUILD);

	dp->f = choose_lookup_fn(da);
	dp->arg = dxr;

	return (FLM_SUCCESS);
}

static enum flm_op_result
dxr_dump_rib_item(struct rtentry *rt, void *data)
{

	return (FLM_SUCCESS);
}

static enum flm_op_result
dxr_change_rib_item(struct rib_head *rnh, struct rib_cmd_info *rc,
    void *data)
{

	return (FLM_BATCH);
}

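/*
 * Batched RIB change handler: mark the direct table chunks affected by
 * the queued updates in updates_mask, build a fresh lookup snapshot
 * incrementally, and atomically switch the datapath over to it.
 */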
static enum flm_op_result
dxr_change_rib_batch(struct rib_head *rnh, struct fib_change_queue *q,
    void *data)
{
	struct dxr *dxr = data;
	struct dxr *new_dxr;
	struct dxr_aux *da;
	struct fib_dp new_dp;
	enum flm_op_result res;
	uint32_t ip, plen, hmask, start, end, i, ui;
#ifdef INVARIANTS
	struct rib_rtable_info rinfo;
	int update_delta = 0;
#endif

	KASSERT(data != NULL, ("%s: NULL data", __FUNCTION__));
	KASSERT(q != NULL, ("%s: NULL q", __FUNCTION__));
	KASSERT(q->count < q->size, ("%s: q->count %d q->size %d",
	    __FUNCTION__, q->count, q->size));

	da = dxr->aux;
	KASSERT(da != NULL, ("%s: NULL dxr->aux", __FUNCTION__));

	FIB_PRINTF(LOG_INFO, da->fd, "processing %d update(s)", q->count);
	for (ui = 0; ui < q->count; ui++) {
#ifdef INVARIANTS
		if (q->entries[ui].nh_new != NULL)
			update_delta++;
		if (q->entries[ui].nh_old != NULL)
			update_delta--;
#endif
		plen = q->entries[ui].plen;
		ip = ntohl(q->entries[ui].addr4.s_addr);
		if (plen < 32)
			hmask = 0xffffffffU >> plen;
		else
			hmask = 0;
		start = (ip & ~hmask) >> DXR_RANGE_SHIFT;
		end = (ip | hmask) >> DXR_RANGE_SHIFT;

		if ((start & 0x1f) == 0 && (end & 0x1f) == 0x1f)
			for (i = start >> 5; i <= end >> 5; i++)
				da->updates_mask[i] = 0xffffffffU;
		else
			for (i = start; i <= end; i++)
				da->updates_mask[i >> 5] |= (1 << (i & 0x1f));
		if (start < da->updates_low)
			da->updates_low = start;
		if (end > da->updates_high)
			da->updates_high = end;
	}

#ifdef INVARIANTS
	fib_get_rtable_info(fib_get_rh(da->fd), &rinfo);
	KASSERT(da->prefixes + update_delta == rinfo.num_prefixes,
	    ("%s: update count mismatch", __FUNCTION__));
#endif

	res = dxr_init(0, dxr->fd, data, (void **) &new_dxr);
	if (res != FLM_SUCCESS)
		return (res);

	dxr_build(new_dxr);

	/* Structural limit exceeded, hard error */
	if (da->rtbl_top >= BASE_MAX) {
		dxr_destroy(new_dxr);
		return (FLM_ERROR);
	}

	/* A malloc(,, M_NOWAIT) failed somewhere, retry later */
	if (new_dxr->d == NULL) {
		dxr_destroy(new_dxr);
		return (FLM_REBUILD);
	}

	new_dp.f = choose_lookup_fn(da);
	new_dp.arg = new_dxr;
	if (fib_set_datapath_ptr(dxr->fd, &new_dp)) {
		fib_set_algo_ptr(dxr->fd, new_dxr);
		fib_epoch_call(epoch_dxr_destroy, &dxr->epoch_ctx);
		return (FLM_SUCCESS);
	}

	dxr_destroy(new_dxr);
	return (FLM_REBUILD);
}

static uint8_t
dxr_get_pref(const struct rib_rtable_info *rinfo)
{

	/* Below bsearch4 up to 10 prefixes.  Always supersedes dpdk_lpm4. */
	return (251);
}

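/*
 * The frag_limit tunable is exposed as a percentage in "D.DD" form, but
 * stored internally in hundredths of a percent, matching the range_frag
 * and trie_frag computations above.
 */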
SYSCTL_DECL(_net_route_algo);
SYSCTL_NODE(_net_route_algo, OID_AUTO, dxr, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "DXR tunables");

static int
sysctl_dxr_frag_limit(SYSCTL_HANDLER_ARGS)
{
	char buf[8];
	int error, new, i;

	snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100,
	    V_frag_limit % 100);
	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (!isdigit(*buf) && *buf != '.')
		return (EINVAL);
	for (i = 0, new = 0; isdigit(buf[i]) && i < sizeof(buf); i++)
		new = new * 10 + buf[i] - '0';
	new *= 100;
	if (buf[i++] == '.') {
		if (!isdigit(buf[i]))
			return (EINVAL);
		new += (buf[i++] - '0') * 10;
		if (isdigit(buf[i]))
			new += buf[i++] - '0';
	}
	if (new > 1000)
		return (EINVAL);
	V_frag_limit = new;
	snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100,
	    V_frag_limit % 100);
	return (0);
}

SYSCTL_PROC(_net_route_algo_dxr, OID_AUTO, frag_limit,
    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_VNET,
    0, 0, sysctl_dxr_frag_limit, "A",
    "Fragmentation threshold to full rebuild");

static struct fib_lookup_module fib_dxr_mod = {
	.flm_name = "dxr",
	.flm_family = AF_INET,
	.flm_init_cb = dxr_init,
	.flm_destroy_cb = dxr_destroy,
	.flm_dump_rib_item_cb = dxr_dump_rib_item,
	.flm_dump_end_cb = dxr_dump_end,
	.flm_change_rib_item_cb = dxr_change_rib_item,
	.flm_change_rib_items_cb = dxr_change_rib_batch,
	.flm_get_pref = dxr_get_pref,
};

static int
dxr_modevent(module_t mod, int type, void *unused)
{
	int error;

	switch (type) {
	case MOD_LOAD:
		chunk_zone = uma_zcreate("dxr chunk", sizeof(struct chunk_desc),
		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
		trie_zone = uma_zcreate("dxr trie", sizeof(struct trie_desc),
		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
		fib_module_register(&fib_dxr_mod);
		return(0);
	case MOD_UNLOAD:
		error = fib_module_unregister(&fib_dxr_mod);
		if (error)
			return (error);
		uma_zdestroy(chunk_zone);
		uma_zdestroy(trie_zone);
		return(0);
	default:
		return(EOPNOTSUPP);
	}
}

static moduledata_t dxr_mod = {"fib_dxr", dxr_modevent, 0};

DECLARE_MODULE(fib_dxr, dxr_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
MODULE_VERSION(fib_dxr, 1);