1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2012-2022 Marko Zec 5 * Copyright (c) 2005, 2018 University of Zagreb 6 * Copyright (c) 2005 International Computer Science Institute 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 /* 31 * An implementation of DXR, a simple IPv4 LPM scheme with compact lookup 32 * structures and a trivial search procedure. More significant bits of 33 * the search key are used to directly index a two-stage trie, while the 34 * remaining bits are used for finding the next hop in a sorted array. 35 * More details in: 36 * 37 * M. Zec, L. Rizzo, M. Mikuc, DXR: towards a billion routing lookups per 38 * second in software, ACM SIGCOMM Computer Communication Review, September 39 * 2012 40 * 41 * M. Zec, M. Mikuc, Pushing the envelope: beyond two billion IP routing 42 * lookups per second on commodity CPUs, IEEE SoftCOM, September 2017, Split 43 */ 44 45 #include <sys/cdefs.h> 46 #include "opt_inet.h" 47 48 #include <sys/param.h> 49 #include <sys/kernel.h> 50 #include <sys/ctype.h> 51 #include <sys/epoch.h> 52 #include <sys/malloc.h> 53 #include <sys/module.h> 54 #include <sys/socket.h> 55 #include <sys/sysctl.h> 56 #include <sys/syslog.h> 57 58 #include <vm/uma.h> 59 60 #include <netinet/in.h> 61 #include <netinet/in_fib.h> 62 63 #include <net/route.h> 64 #include <net/route/route_ctl.h> 65 #include <net/route/fib_algo.h> 66 67 #define DXR_TRIE_BITS 20 68 69 CTASSERT(DXR_TRIE_BITS >= 16 && DXR_TRIE_BITS <= 24); 70 71 /* DXR2: two-stage primary trie, instead of a single direct lookup table */ 72 #define DXR2 73 74 #if DXR_TRIE_BITS > 16 75 #define DXR_D 16 76 #else 77 #define DXR_D (DXR_TRIE_BITS - 1) 78 #endif 79 80 #define D_TBL_SIZE (1 << DXR_D) 81 #define DIRECT_TBL_SIZE (1 << DXR_TRIE_BITS) 82 #define DXR_RANGE_MASK (0xffffffffU >> DXR_TRIE_BITS) 83 #define DXR_RANGE_SHIFT (32 - DXR_TRIE_BITS) 84 85 #define DESC_BASE_BITS 22 86 #define DESC_FRAGMENTS_BITS (32 - DESC_BASE_BITS) 87 #define BASE_MAX ((1 << DESC_BASE_BITS) - 1) 88 #define RTBL_SIZE_INCR (BASE_MAX / 64) 89 90 #if DXR_TRIE_BITS < 24 91 #define FRAGS_MASK_SHORT ((1 << (23 - DXR_TRIE_BITS)) - 1) 92 #else 93 #define FRAGS_MASK_SHORT 0 94 #endif 95 #define FRAGS_PREF_SHORT (((1 << DESC_FRAGMENTS_BITS) - 1) & \ 96 ~FRAGS_MASK_SHORT) 97 #define FRAGS_MARK_XL (FRAGS_PREF_SHORT - 1) 98 #define FRAGS_MARK_HIT (FRAGS_PREF_SHORT - 2) 99 100 #define IS_SHORT_FORMAT(x) ((x & FRAGS_PREF_SHORT) == FRAGS_PREF_SHORT) 101 #define IS_LONG_FORMAT(x) ((x & FRAGS_PREF_SHORT) != FRAGS_PREF_SHORT) 102 #define IS_XL_FORMAT(x) (x == FRAGS_MARK_XL) 103 104 #define RE_SHORT_MAX_NH ((1 << (DXR_TRIE_BITS - 8)) - 1) 105 106 #define CHUNK_HASH_BITS 16 107 #define CHUNK_HASH_SIZE (1 << CHUNK_HASH_BITS) 108 #define CHUNK_HASH_MASK (CHUNK_HASH_SIZE - 1) 109 110 #define TRIE_HASH_BITS 16 111 #define TRIE_HASH_SIZE (1 << TRIE_HASH_BITS) 112 #define TRIE_HASH_MASK (TRIE_HASH_SIZE - 1) 113 114 #define XTBL_SIZE_INCR (DIRECT_TBL_SIZE / 16) 115 116 #define UNUSED_BUCKETS 8 117 118 /* Lookup structure elements */ 119 120 struct direct_entry { 121 uint32_t fragments: DESC_FRAGMENTS_BITS, 122 base: DESC_BASE_BITS; 123 }; 124 125 struct range_entry_long { 126 uint32_t start: DXR_RANGE_SHIFT, 127 nexthop: DXR_TRIE_BITS; 128 }; 129 130 #if DXR_TRIE_BITS < 24 131 struct range_entry_short { 132 uint16_t start: DXR_RANGE_SHIFT - 8, 133 nexthop: DXR_TRIE_BITS - 8; 134 }; 135 #endif 136 137 /* Auxiliary structures */ 138 139 struct heap_entry { 140 uint32_t start; 141 uint32_t end; 142 uint32_t preflen; 143 uint32_t nexthop; 144 }; 145 146 struct chunk_desc { 147 LIST_ENTRY(chunk_desc) cd_all_le; 148 LIST_ENTRY(chunk_desc) cd_hash_le; 149 uint32_t cd_hash; 150 uint32_t cd_refcnt; 151 uint32_t cd_base; 152 uint32_t cd_cur_size; 153 uint32_t cd_max_size; 154 }; 155 156 struct trie_desc { 157 LIST_ENTRY(trie_desc) td_all_le; 158 LIST_ENTRY(trie_desc) td_hash_le; 159 uint32_t td_hash; 160 uint32_t td_index; 161 uint32_t td_refcnt; 162 }; 163 164 struct dxr_aux { 165 /* Glue to external state */ 166 struct fib_data *fd; 167 uint32_t fibnum; 168 int refcnt; 169 170 /* Auxiliary build-time tables */ 171 struct direct_entry direct_tbl[DIRECT_TBL_SIZE]; 172 uint16_t d_tbl[D_TBL_SIZE]; 173 struct direct_entry *x_tbl; 174 union { 175 struct range_entry_long re; 176 uint32_t fragments; 177 } *range_tbl; 178 179 /* Auxiliary internal state */ 180 uint32_t updates_mask[DIRECT_TBL_SIZE / 32]; 181 struct trie_desc *trietbl[D_TBL_SIZE]; 182 LIST_HEAD(, chunk_desc) chunk_hashtbl[CHUNK_HASH_SIZE]; 183 LIST_HEAD(, chunk_desc) all_chunks; 184 LIST_HEAD(, chunk_desc) unused_chunks[UNUSED_BUCKETS]; 185 LIST_HEAD(, trie_desc) trie_hashtbl[TRIE_HASH_SIZE]; 186 LIST_HEAD(, trie_desc) all_trie; 187 LIST_HEAD(, trie_desc) unused_trie; /* abuses hash link entry */ 188 struct sockaddr_in dst; 189 struct sockaddr_in mask; 190 struct heap_entry heap[33]; 191 uint32_t prefixes; 192 uint32_t updates_low; 193 uint32_t updates_high; 194 uint32_t unused_chunks_size; 195 uint32_t xtbl_size; 196 uint32_t all_trie_cnt; 197 uint32_t unused_trie_cnt; 198 uint32_t trie_rebuilt_prefixes; 199 uint32_t heap_index; 200 uint32_t d_bits; 201 uint32_t rtbl_size; 202 uint32_t rtbl_top; 203 uint32_t rtbl_work_frags; 204 uint32_t work_chunk; 205 }; 206 207 /* Main lookup structure container */ 208 209 struct dxr { 210 /* Lookup tables */ 211 void *d; 212 void *x; 213 void *r; 214 struct nhop_object **nh_tbl; 215 216 /* Glue to external state */ 217 struct dxr_aux *aux; 218 struct fib_data *fd; 219 struct epoch_context epoch_ctx; 220 uint32_t fibnum; 221 uint16_t d_shift; 222 uint16_t x_shift; 223 uint32_t x_mask; 224 }; 225 226 static MALLOC_DEFINE(M_DXRLPM, "dxr", "DXR LPM"); 227 static MALLOC_DEFINE(M_DXRAUX, "dxr aux", "DXR auxiliary"); 228 229 uma_zone_t chunk_zone; 230 uma_zone_t trie_zone; 231 232 VNET_DEFINE_STATIC(int, frag_limit) = 100; 233 #define V_frag_limit VNET(frag_limit) 234 235 /* Binary search for a matching address range */ 236 #define DXR_LOOKUP_STAGE \ 237 if (masked_dst < range[middle].start) { \ 238 upperbound = middle; \ 239 middle = (middle + lowerbound) / 2; \ 240 } else if (masked_dst < range[middle + 1].start) \ 241 return (range[middle].nexthop); \ 242 else { \ 243 lowerbound = middle + 1; \ 244 middle = (upperbound + middle + 1) / 2; \ 245 } \ 246 if (upperbound == lowerbound) \ 247 return (range[lowerbound].nexthop); 248 249 static int 250 range_lookup(struct range_entry_long *rt, struct direct_entry de, uint32_t dst) 251 { 252 uint32_t base; 253 uint32_t upperbound; 254 uint32_t middle; 255 uint32_t lowerbound; 256 uint32_t masked_dst; 257 258 base = de.base; 259 lowerbound = 0; 260 masked_dst = dst & DXR_RANGE_MASK; 261 262 #if DXR_TRIE_BITS < 24 263 if (__predict_true(IS_SHORT_FORMAT(de.fragments))) { 264 upperbound = de.fragments & FRAGS_MASK_SHORT; 265 struct range_entry_short *range = 266 (struct range_entry_short *) &rt[base]; 267 268 masked_dst >>= 8; 269 middle = upperbound; 270 upperbound = upperbound * 2 + 1; 271 272 for (;;) { 273 DXR_LOOKUP_STAGE 274 DXR_LOOKUP_STAGE 275 } 276 } 277 #endif 278 279 upperbound = de.fragments; 280 middle = upperbound / 2; 281 struct range_entry_long *range = &rt[base]; 282 if (__predict_false(IS_XL_FORMAT(de.fragments))) { 283 upperbound = *((uint32_t *) range); 284 range++; 285 middle = upperbound / 2; 286 } 287 288 for (;;) { 289 DXR_LOOKUP_STAGE 290 DXR_LOOKUP_STAGE 291 } 292 } 293 294 #define DXR_LOOKUP_DEFINE(D) \ 295 static int inline \ 296 dxr_lookup_##D(struct dxr *dxr, uint32_t dst) \ 297 { \ 298 struct direct_entry de; \ 299 uint16_t *dt = dxr->d; \ 300 struct direct_entry *xt = dxr->x; \ 301 \ 302 de = xt[(dt[dst >> (32 - (D))] << (DXR_TRIE_BITS - (D))) \ 303 + ((dst >> DXR_RANGE_SHIFT) & \ 304 (0xffffffffU >> (32 - DXR_TRIE_BITS + (D))))]; \ 305 if (__predict_true(de.fragments == FRAGS_MARK_HIT)) \ 306 return (de.base); \ 307 return (range_lookup(dxr->r, de, dst)); \ 308 } \ 309 \ 310 static struct nhop_object * \ 311 dxr_fib_lookup_##D(void *algo_data, \ 312 const struct flm_lookup_key key, uint32_t scopeid __unused) \ 313 { \ 314 struct dxr *dxr = algo_data; \ 315 \ 316 return (dxr->nh_tbl[dxr_lookup_##D(dxr, \ 317 ntohl(key.addr4.s_addr))]); \ 318 } 319 320 #ifdef DXR2 321 #if DXR_TRIE_BITS > 16 322 DXR_LOOKUP_DEFINE(16) 323 #endif 324 DXR_LOOKUP_DEFINE(15) 325 DXR_LOOKUP_DEFINE(14) 326 DXR_LOOKUP_DEFINE(13) 327 DXR_LOOKUP_DEFINE(12) 328 DXR_LOOKUP_DEFINE(11) 329 DXR_LOOKUP_DEFINE(10) 330 DXR_LOOKUP_DEFINE(9) 331 #endif /* DXR2 */ 332 333 static int inline 334 dxr_lookup(struct dxr *dxr, uint32_t dst) 335 { 336 struct direct_entry de; 337 #ifdef DXR2 338 uint16_t *dt = dxr->d; 339 struct direct_entry *xt = dxr->x; 340 341 de = xt[(dt[dst >> dxr->d_shift] << dxr->x_shift) + 342 ((dst >> DXR_RANGE_SHIFT) & dxr->x_mask)]; 343 #else /* !DXR2 */ 344 struct direct_entry *dt = dxr->d; 345 346 de = dt[dst >> DXR_RANGE_SHIFT]; 347 #endif /* !DXR2 */ 348 if (__predict_true(de.fragments == FRAGS_MARK_HIT)) 349 return (de.base); 350 return (range_lookup(dxr->r, de, dst)); 351 } 352 353 static void 354 initheap(struct dxr_aux *da, uint32_t dst_u32, uint32_t chunk) 355 { 356 struct heap_entry *fhp = &da->heap[0]; 357 struct rtentry *rt; 358 struct route_nhop_data rnd; 359 360 da->heap_index = 0; 361 da->dst.sin_addr.s_addr = htonl(dst_u32); 362 rt = fib4_lookup_rt(da->fibnum, da->dst.sin_addr, 0, NHR_UNLOCKED, 363 &rnd); 364 if (rt != NULL) { 365 struct in_addr addr; 366 uint32_t scopeid; 367 368 rt_get_inet_prefix_plen(rt, &addr, &fhp->preflen, &scopeid); 369 fhp->start = ntohl(addr.s_addr); 370 fhp->end = fhp->start; 371 if (fhp->preflen < 32) 372 fhp->end |= (0xffffffffU >> fhp->preflen); 373 fhp->nexthop = fib_get_nhop_idx(da->fd, rnd.rnd_nhop); 374 } else { 375 fhp->preflen = fhp->nexthop = fhp->start = 0; 376 fhp->end = 0xffffffffU; 377 } 378 } 379 380 static uint32_t 381 chunk_size(struct dxr_aux *da, struct direct_entry *fdesc) 382 { 383 384 if (IS_SHORT_FORMAT(fdesc->fragments)) 385 return ((fdesc->fragments & FRAGS_MASK_SHORT) + 1); 386 else if (IS_XL_FORMAT(fdesc->fragments)) 387 return (da->range_tbl[fdesc->base].fragments + 2); 388 else /* if (IS_LONG_FORMAT(fdesc->fragments)) */ 389 return (fdesc->fragments + 1); 390 } 391 392 static uint32_t 393 chunk_hash(struct dxr_aux *da, struct direct_entry *fdesc) 394 { 395 uint32_t size = chunk_size(da, fdesc); 396 uint32_t *p = (uint32_t *) &da->range_tbl[fdesc->base]; 397 uint32_t *l = (uint32_t *) &da->range_tbl[fdesc->base + size]; 398 uint32_t hash = fdesc->fragments; 399 400 for (; p < l; p++) 401 hash = (hash << 7) + (hash >> 13) + *p; 402 403 return (hash + (hash >> 16)); 404 } 405 406 static int 407 chunk_ref(struct dxr_aux *da, uint32_t chunk) 408 { 409 struct direct_entry *fdesc = &da->direct_tbl[chunk]; 410 struct chunk_desc *cdp, *empty_cdp; 411 uint32_t base = fdesc->base; 412 uint32_t size = chunk_size(da, fdesc); 413 uint32_t hash = chunk_hash(da, fdesc); 414 int i; 415 416 /* Find an existing descriptor */ 417 LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK], 418 cd_hash_le) { 419 if (cdp->cd_hash != hash || cdp->cd_cur_size != size || 420 memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base], 421 sizeof(struct range_entry_long) * size)) 422 continue; 423 da->rtbl_top = fdesc->base; 424 fdesc->base = cdp->cd_base; 425 cdp->cd_refcnt++; 426 return (0); 427 } 428 429 /* No matching chunks found. Find an empty one to recycle. */ 430 for (cdp = NULL, i = size; cdp == NULL && i < UNUSED_BUCKETS; i++) 431 cdp = LIST_FIRST(&da->unused_chunks[i]); 432 433 if (cdp == NULL) 434 LIST_FOREACH(empty_cdp, &da->unused_chunks[0], cd_hash_le) 435 if (empty_cdp->cd_max_size >= size && (cdp == NULL || 436 empty_cdp->cd_max_size < cdp->cd_max_size)) { 437 cdp = empty_cdp; 438 if (empty_cdp->cd_max_size == size) 439 break; 440 } 441 442 if (cdp != NULL) { 443 /* Copy from heap into the recycled chunk */ 444 bcopy(&da->range_tbl[fdesc->base], &da->range_tbl[cdp->cd_base], 445 size * sizeof(struct range_entry_long)); 446 fdesc->base = cdp->cd_base; 447 da->rtbl_top -= size; 448 da->unused_chunks_size -= cdp->cd_max_size; 449 if (cdp->cd_max_size > size) { 450 /* Split the range in two, need a new descriptor */ 451 empty_cdp = uma_zalloc(chunk_zone, M_NOWAIT); 452 if (empty_cdp == NULL) 453 return (1); 454 LIST_INSERT_BEFORE(cdp, empty_cdp, cd_all_le); 455 empty_cdp->cd_base = cdp->cd_base + size; 456 empty_cdp->cd_cur_size = 0; 457 empty_cdp->cd_max_size = cdp->cd_max_size - size; 458 459 i = empty_cdp->cd_max_size; 460 if (i >= UNUSED_BUCKETS) 461 i = 0; 462 LIST_INSERT_HEAD(&da->unused_chunks[i], empty_cdp, 463 cd_hash_le); 464 465 da->unused_chunks_size += empty_cdp->cd_max_size; 466 cdp->cd_max_size = size; 467 } 468 LIST_REMOVE(cdp, cd_hash_le); 469 } else { 470 /* Alloc a new descriptor at the top of the heap*/ 471 cdp = uma_zalloc(chunk_zone, M_NOWAIT); 472 if (cdp == NULL) 473 return (1); 474 cdp->cd_max_size = size; 475 cdp->cd_base = fdesc->base; 476 LIST_INSERT_HEAD(&da->all_chunks, cdp, cd_all_le); 477 KASSERT(cdp->cd_base + cdp->cd_max_size == da->rtbl_top, 478 ("dxr: %s %d", __FUNCTION__, __LINE__)); 479 } 480 481 cdp->cd_hash = hash; 482 cdp->cd_refcnt = 1; 483 cdp->cd_cur_size = size; 484 LIST_INSERT_HEAD(&da->chunk_hashtbl[hash & CHUNK_HASH_MASK], cdp, 485 cd_hash_le); 486 if (da->rtbl_top >= da->rtbl_size) { 487 if (da->rtbl_top >= BASE_MAX) { 488 FIB_PRINTF(LOG_ERR, da->fd, 489 "structural limit exceeded at %d " 490 "range table elements", da->rtbl_top); 491 return (1); 492 } 493 da->rtbl_size += RTBL_SIZE_INCR; 494 i = (BASE_MAX - da->rtbl_top) * LOG_DEBUG / BASE_MAX; 495 FIB_PRINTF(i, da->fd, "range table at %d%% structural limit", 496 da->rtbl_top * 100 / BASE_MAX); 497 da->range_tbl = realloc(da->range_tbl, 498 sizeof(*da->range_tbl) * da->rtbl_size + FRAGS_PREF_SHORT, 499 M_DXRAUX, M_NOWAIT); 500 if (da->range_tbl == NULL) 501 return (1); 502 } 503 504 return (0); 505 } 506 507 static void 508 chunk_unref(struct dxr_aux *da, uint32_t chunk) 509 { 510 struct direct_entry *fdesc = &da->direct_tbl[chunk]; 511 struct chunk_desc *cdp, *cdp2; 512 uint32_t base = fdesc->base; 513 uint32_t size = chunk_size(da, fdesc); 514 uint32_t hash = chunk_hash(da, fdesc); 515 int i; 516 517 /* Find the corresponding descriptor */ 518 LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK], 519 cd_hash_le) 520 if (cdp->cd_hash == hash && cdp->cd_cur_size == size && 521 memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base], 522 sizeof(struct range_entry_long) * size) == 0) 523 break; 524 525 KASSERT(cdp != NULL, ("dxr: dangling chunk")); 526 if (--cdp->cd_refcnt > 0) 527 return; 528 529 LIST_REMOVE(cdp, cd_hash_le); 530 da->unused_chunks_size += cdp->cd_max_size; 531 cdp->cd_cur_size = 0; 532 533 /* Attempt to merge with the preceding chunk, if empty */ 534 cdp2 = LIST_NEXT(cdp, cd_all_le); 535 if (cdp2 != NULL && cdp2->cd_cur_size == 0) { 536 KASSERT(cdp2->cd_base + cdp2->cd_max_size == cdp->cd_base, 537 ("dxr: %s %d", __FUNCTION__, __LINE__)); 538 LIST_REMOVE(cdp, cd_all_le); 539 LIST_REMOVE(cdp2, cd_hash_le); 540 cdp2->cd_max_size += cdp->cd_max_size; 541 uma_zfree(chunk_zone, cdp); 542 cdp = cdp2; 543 } 544 545 /* Attempt to merge with the subsequent chunk, if empty */ 546 cdp2 = LIST_PREV(cdp, &da->all_chunks, chunk_desc, cd_all_le); 547 if (cdp2 != NULL && cdp2->cd_cur_size == 0) { 548 KASSERT(cdp->cd_base + cdp->cd_max_size == cdp2->cd_base, 549 ("dxr: %s %d", __FUNCTION__, __LINE__)); 550 LIST_REMOVE(cdp, cd_all_le); 551 LIST_REMOVE(cdp2, cd_hash_le); 552 cdp2->cd_max_size += cdp->cd_max_size; 553 cdp2->cd_base = cdp->cd_base; 554 uma_zfree(chunk_zone, cdp); 555 cdp = cdp2; 556 } 557 558 if (cdp->cd_base + cdp->cd_max_size == da->rtbl_top) { 559 /* Free the chunk on the top of the range heap, trim the heap */ 560 KASSERT(cdp == LIST_FIRST(&da->all_chunks), 561 ("dxr: %s %d", __FUNCTION__, __LINE__)); 562 da->rtbl_top -= cdp->cd_max_size; 563 da->unused_chunks_size -= cdp->cd_max_size; 564 LIST_REMOVE(cdp, cd_all_le); 565 uma_zfree(chunk_zone, cdp); 566 return; 567 } 568 569 i = cdp->cd_max_size; 570 if (i >= UNUSED_BUCKETS) 571 i = 0; 572 LIST_INSERT_HEAD(&da->unused_chunks[i], cdp, cd_hash_le); 573 } 574 575 #ifdef DXR2 576 static uint32_t 577 trie_hash(struct dxr_aux *da, uint32_t dxr_x, uint32_t index) 578 { 579 uint32_t i, *val; 580 uint32_t hash = 0; 581 582 for (i = 0; i < (1 << dxr_x); i++) { 583 hash = (hash << 3) ^ (hash >> 3); 584 val = (uint32_t *) 585 (void *) &da->direct_tbl[(index << dxr_x) + i]; 586 hash += (*val << 5); 587 hash += (*val >> 5); 588 } 589 590 return (hash + (hash >> 16)); 591 } 592 593 static int 594 trie_ref(struct dxr_aux *da, uint32_t index) 595 { 596 struct trie_desc *tp; 597 uint32_t dxr_d = da->d_bits; 598 uint32_t dxr_x = DXR_TRIE_BITS - dxr_d; 599 uint32_t hash = trie_hash(da, dxr_x, index); 600 601 /* Find an existing descriptor */ 602 LIST_FOREACH(tp, &da->trie_hashtbl[hash & TRIE_HASH_MASK], td_hash_le) 603 if (tp->td_hash == hash && 604 memcmp(&da->direct_tbl[index << dxr_x], 605 &da->x_tbl[tp->td_index << dxr_x], 606 sizeof(*da->x_tbl) << dxr_x) == 0) { 607 tp->td_refcnt++; 608 da->trietbl[index] = tp; 609 return(tp->td_index); 610 } 611 612 tp = LIST_FIRST(&da->unused_trie); 613 if (tp != NULL) { 614 LIST_REMOVE(tp, td_hash_le); 615 da->unused_trie_cnt--; 616 } else { 617 tp = uma_zalloc(trie_zone, M_NOWAIT); 618 if (tp == NULL) 619 return (-1); 620 LIST_INSERT_HEAD(&da->all_trie, tp, td_all_le); 621 tp->td_index = da->all_trie_cnt++; 622 } 623 624 tp->td_hash = hash; 625 tp->td_refcnt = 1; 626 LIST_INSERT_HEAD(&da->trie_hashtbl[hash & TRIE_HASH_MASK], tp, 627 td_hash_le); 628 memcpy(&da->x_tbl[tp->td_index << dxr_x], 629 &da->direct_tbl[index << dxr_x], sizeof(*da->x_tbl) << dxr_x); 630 da->trietbl[index] = tp; 631 if (da->all_trie_cnt >= da->xtbl_size >> dxr_x) { 632 da->xtbl_size += XTBL_SIZE_INCR; 633 da->x_tbl = realloc(da->x_tbl, 634 sizeof(*da->x_tbl) * da->xtbl_size, M_DXRAUX, M_NOWAIT); 635 if (da->x_tbl == NULL) 636 return (-1); 637 } 638 return(tp->td_index); 639 } 640 641 static void 642 trie_unref(struct dxr_aux *da, uint32_t index) 643 { 644 struct trie_desc *tp = da->trietbl[index]; 645 646 if (tp == NULL) 647 return; 648 da->trietbl[index] = NULL; 649 if (--tp->td_refcnt > 0) 650 return; 651 652 LIST_REMOVE(tp, td_hash_le); 653 da->unused_trie_cnt++; 654 if (tp->td_index != da->all_trie_cnt - 1) { 655 LIST_INSERT_HEAD(&da->unused_trie, tp, td_hash_le); 656 return; 657 } 658 659 do { 660 da->all_trie_cnt--; 661 da->unused_trie_cnt--; 662 LIST_REMOVE(tp, td_all_le); 663 uma_zfree(trie_zone, tp); 664 LIST_FOREACH(tp, &da->unused_trie, td_hash_le) 665 if (tp->td_index == da->all_trie_cnt - 1) { 666 LIST_REMOVE(tp, td_hash_le); 667 break; 668 } 669 } while (tp != NULL); 670 } 671 #endif 672 673 static void 674 heap_inject(struct dxr_aux *da, uint32_t start, uint32_t end, uint32_t preflen, 675 uint32_t nh) 676 { 677 struct heap_entry *fhp; 678 int i; 679 680 for (i = da->heap_index; i >= 0; i--) { 681 if (preflen > da->heap[i].preflen) 682 break; 683 else if (preflen < da->heap[i].preflen) 684 da->heap[i + 1] = da->heap[i]; 685 else 686 return; 687 } 688 689 fhp = &da->heap[i + 1]; 690 fhp->preflen = preflen; 691 fhp->start = start; 692 fhp->end = end; 693 fhp->nexthop = nh; 694 da->heap_index++; 695 } 696 697 static int 698 dxr_walk(struct rtentry *rt, void *arg) 699 { 700 struct dxr_aux *da = arg; 701 uint32_t chunk = da->work_chunk; 702 uint32_t first = chunk << DXR_RANGE_SHIFT; 703 uint32_t last = first | DXR_RANGE_MASK; 704 struct range_entry_long *fp = 705 &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re; 706 struct heap_entry *fhp = &da->heap[da->heap_index]; 707 uint32_t preflen, nh, start, end, scopeid; 708 struct in_addr addr; 709 710 rt_get_inet_prefix_plen(rt, &addr, &preflen, &scopeid); 711 start = ntohl(addr.s_addr); 712 if (start > last) 713 return (-1); /* Beyond chunk boundaries, we are done */ 714 if (start < first) 715 return (0); /* Skip this route */ 716 717 end = start; 718 if (preflen < 32) 719 end |= (0xffffffffU >> preflen); 720 nh = fib_get_nhop_idx(da->fd, rt_get_raw_nhop(rt)); 721 722 if (start == fhp->start) 723 heap_inject(da, start, end, preflen, nh); 724 else { 725 /* start > fhp->start */ 726 while (start > fhp->end) { 727 uint32_t oend = fhp->end; 728 729 if (da->heap_index > 0) { 730 fhp--; 731 da->heap_index--; 732 } else 733 initheap(da, fhp->end + 1, chunk); 734 if (fhp->end > oend && fhp->nexthop != fp->nexthop) { 735 fp++; 736 da->rtbl_work_frags++; 737 fp->start = (oend + 1) & DXR_RANGE_MASK; 738 fp->nexthop = fhp->nexthop; 739 } 740 } 741 if (start > ((chunk << DXR_RANGE_SHIFT) | fp->start) && 742 nh != fp->nexthop) { 743 fp++; 744 da->rtbl_work_frags++; 745 fp->start = start & DXR_RANGE_MASK; 746 } else if (da->rtbl_work_frags) { 747 if ((--fp)->nexthop == nh) 748 da->rtbl_work_frags--; 749 else 750 fp++; 751 } 752 fp->nexthop = nh; 753 heap_inject(da, start, end, preflen, nh); 754 } 755 756 return (0); 757 } 758 759 static int 760 update_chunk(struct dxr_aux *da, uint32_t chunk) 761 { 762 struct range_entry_long *fp; 763 #if DXR_TRIE_BITS < 24 764 struct range_entry_short *fps; 765 uint32_t start, nh, i; 766 #endif 767 struct heap_entry *fhp; 768 uint32_t first = chunk << DXR_RANGE_SHIFT; 769 uint32_t last = first | DXR_RANGE_MASK; 770 771 if (da->direct_tbl[chunk].fragments != FRAGS_MARK_HIT) 772 chunk_unref(da, chunk); 773 774 initheap(da, first, chunk); 775 776 fp = &da->range_tbl[da->rtbl_top].re; 777 da->rtbl_work_frags = 0; 778 fp->start = first & DXR_RANGE_MASK; 779 fp->nexthop = da->heap[0].nexthop; 780 781 da->dst.sin_addr.s_addr = htonl(first); 782 da->mask.sin_addr.s_addr = htonl(~DXR_RANGE_MASK); 783 784 da->work_chunk = chunk; 785 rib_walk_from(da->fibnum, AF_INET, RIB_FLAG_LOCKED, 786 (struct sockaddr *) &da->dst, (struct sockaddr *) &da->mask, 787 dxr_walk, da); 788 789 /* Flush any remaining objects on the heap */ 790 fp = &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re; 791 fhp = &da->heap[da->heap_index]; 792 while (fhp->preflen > DXR_TRIE_BITS) { 793 uint32_t oend = fhp->end; 794 795 if (da->heap_index > 0) { 796 fhp--; 797 da->heap_index--; 798 } else 799 initheap(da, fhp->end + 1, chunk); 800 if (fhp->end > oend && fhp->nexthop != fp->nexthop) { 801 /* Have we crossed the upper chunk boundary? */ 802 if (oend >= last) 803 break; 804 fp++; 805 da->rtbl_work_frags++; 806 fp->start = (oend + 1) & DXR_RANGE_MASK; 807 fp->nexthop = fhp->nexthop; 808 } 809 } 810 811 /* Direct hit if the chunk contains only a single fragment */ 812 if (da->rtbl_work_frags == 0) { 813 da->direct_tbl[chunk].base = fp->nexthop; 814 da->direct_tbl[chunk].fragments = FRAGS_MARK_HIT; 815 return (0); 816 } 817 818 da->direct_tbl[chunk].base = da->rtbl_top; 819 da->direct_tbl[chunk].fragments = da->rtbl_work_frags; 820 821 #if DXR_TRIE_BITS < 24 822 /* Check whether the chunk can be more compactly encoded */ 823 fp = &da->range_tbl[da->rtbl_top].re; 824 for (i = 0; i <= da->rtbl_work_frags; i++, fp++) 825 if ((fp->start & 0xff) != 0 || fp->nexthop > RE_SHORT_MAX_NH) 826 break; 827 if (i == da->rtbl_work_frags + 1) { 828 fp = &da->range_tbl[da->rtbl_top].re; 829 fps = (void *) fp; 830 for (i = 0; i <= da->rtbl_work_frags; i++, fp++, fps++) { 831 start = fp->start; 832 nh = fp->nexthop; 833 fps->start = start >> 8; 834 fps->nexthop = nh; 835 } 836 fps->start = start >> 8; 837 fps->nexthop = nh; 838 da->rtbl_work_frags >>= 1; 839 da->direct_tbl[chunk].fragments = 840 da->rtbl_work_frags | FRAGS_PREF_SHORT; 841 } else 842 #endif 843 if (da->rtbl_work_frags >= FRAGS_MARK_HIT) { 844 da->direct_tbl[chunk].fragments = FRAGS_MARK_XL; 845 memmove(&da->range_tbl[da->rtbl_top + 1], 846 &da->range_tbl[da->rtbl_top], 847 (da->rtbl_work_frags + 1) * sizeof(*da->range_tbl)); 848 da->range_tbl[da->rtbl_top].fragments = da->rtbl_work_frags; 849 da->rtbl_work_frags++; 850 } 851 da->rtbl_top += (da->rtbl_work_frags + 1); 852 return (chunk_ref(da, chunk)); 853 } 854 855 static void 856 dxr_build(struct dxr *dxr) 857 { 858 struct dxr_aux *da = dxr->aux; 859 struct chunk_desc *cdp; 860 struct rib_rtable_info rinfo; 861 struct timeval t0, t1, t2, t3; 862 uint32_t r_size, dxr_tot_size; 863 uint32_t i, m, range_rebuild = 0; 864 uint32_t range_frag; 865 #ifdef DXR2 866 struct trie_desc *tp; 867 uint32_t d_tbl_size, dxr_x, d_size, x_size; 868 uint32_t ti, trie_rebuild = 0, prev_size = 0; 869 uint32_t trie_frag; 870 #endif 871 872 KASSERT(dxr->d == NULL, ("dxr: d not free")); 873 874 if (da == NULL) { 875 da = malloc(sizeof(*dxr->aux), M_DXRAUX, M_NOWAIT); 876 if (da == NULL) 877 return; 878 dxr->aux = da; 879 da->fibnum = dxr->fibnum; 880 da->refcnt = 1; 881 LIST_INIT(&da->all_chunks); 882 LIST_INIT(&da->all_trie); 883 da->rtbl_size = RTBL_SIZE_INCR; 884 da->range_tbl = NULL; 885 da->xtbl_size = XTBL_SIZE_INCR; 886 da->x_tbl = NULL; 887 bzero(&da->dst, sizeof(da->dst)); 888 bzero(&da->mask, sizeof(da->mask)); 889 da->dst.sin_len = sizeof(da->dst); 890 da->mask.sin_len = sizeof(da->mask); 891 da->dst.sin_family = AF_INET; 892 da->mask.sin_family = AF_INET; 893 } 894 if (da->range_tbl == NULL) { 895 da->range_tbl = malloc(sizeof(*da->range_tbl) * da->rtbl_size 896 + FRAGS_PREF_SHORT, M_DXRAUX, M_NOWAIT); 897 if (da->range_tbl == NULL) 898 return; 899 range_rebuild = 1; 900 } 901 #ifdef DXR2 902 if (da->x_tbl == NULL) { 903 da->x_tbl = malloc(sizeof(*da->x_tbl) * da->xtbl_size, 904 M_DXRAUX, M_NOWAIT); 905 if (da->x_tbl == NULL) 906 return; 907 trie_rebuild = 1; 908 } 909 #endif 910 da->fd = dxr->fd; 911 912 microuptime(&t0); 913 914 dxr->nh_tbl = fib_get_nhop_array(da->fd); 915 fib_get_rtable_info(fib_get_rh(da->fd), &rinfo); 916 917 if (da->updates_low > da->updates_high) 918 range_rebuild = 1; 919 920 range_build: 921 if (range_rebuild) { 922 /* Bulk cleanup */ 923 bzero(da->chunk_hashtbl, sizeof(da->chunk_hashtbl)); 924 while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) { 925 LIST_REMOVE(cdp, cd_all_le); 926 uma_zfree(chunk_zone, cdp); 927 } 928 for (i = 0; i < UNUSED_BUCKETS; i++) 929 LIST_INIT(&da->unused_chunks[i]); 930 da->unused_chunks_size = 0; 931 da->rtbl_top = 0; 932 da->updates_low = 0; 933 da->updates_high = DIRECT_TBL_SIZE - 1; 934 memset(da->updates_mask, 0xff, sizeof(da->updates_mask)); 935 for (i = 0; i < DIRECT_TBL_SIZE; i++) { 936 da->direct_tbl[i].fragments = FRAGS_MARK_HIT; 937 da->direct_tbl[i].base = 0; 938 } 939 } 940 da->prefixes = rinfo.num_prefixes; 941 942 /* DXR: construct direct & range table */ 943 for (i = da->updates_low; i <= da->updates_high; i++) { 944 m = da->updates_mask[i >> 5] >> (i & 0x1f); 945 if (m == 0) 946 i |= 0x1f; 947 else if (m & 1 && update_chunk(da, i) != 0) 948 return; 949 } 950 951 range_frag = 0; 952 if (da->rtbl_top) 953 range_frag = da->unused_chunks_size * 10000ULL / da->rtbl_top; 954 if (range_frag > V_frag_limit) { 955 range_rebuild = 1; 956 goto range_build; 957 } 958 959 r_size = sizeof(*da->range_tbl) * da->rtbl_top; 960 microuptime(&t1); 961 962 #ifdef DXR2 963 if (range_rebuild || 964 abs(fls(da->prefixes) - fls(da->trie_rebuilt_prefixes)) > 1) 965 trie_rebuild = 1; 966 967 trie_build: 968 if (trie_rebuild) { 969 da->trie_rebuilt_prefixes = da->prefixes; 970 da->d_bits = DXR_D; 971 da->updates_low = 0; 972 da->updates_high = DIRECT_TBL_SIZE - 1; 973 if (!range_rebuild) 974 memset(da->updates_mask, 0xff, 975 sizeof(da->updates_mask)); 976 } 977 978 dxr2_try_squeeze: 979 if (trie_rebuild) { 980 /* Bulk cleanup */ 981 bzero(da->trietbl, sizeof(da->trietbl)); 982 bzero(da->trie_hashtbl, sizeof(da->trie_hashtbl)); 983 while ((tp = LIST_FIRST(&da->all_trie)) != NULL) { 984 LIST_REMOVE(tp, td_all_le); 985 uma_zfree(trie_zone, tp); 986 } 987 LIST_INIT(&da->unused_trie); 988 da->all_trie_cnt = da->unused_trie_cnt = 0; 989 } 990 991 /* Populate d_tbl, x_tbl */ 992 dxr_x = DXR_TRIE_BITS - da->d_bits; 993 d_tbl_size = (1 << da->d_bits); 994 995 for (i = da->updates_low >> dxr_x; i <= da->updates_high >> dxr_x; 996 i++) { 997 if (!trie_rebuild) { 998 m = 0; 999 for (int j = 0; j < (1 << dxr_x); j += 32) 1000 m |= da->updates_mask[((i << dxr_x) + j) >> 5]; 1001 if (m == 0) 1002 continue; 1003 trie_unref(da, i); 1004 } 1005 ti = trie_ref(da, i); 1006 if (ti < 0) 1007 return; 1008 da->d_tbl[i] = ti; 1009 } 1010 1011 trie_frag = 0; 1012 if (da->all_trie_cnt) 1013 trie_frag = da->unused_trie_cnt * 10000ULL / da->all_trie_cnt; 1014 if (trie_frag > V_frag_limit) { 1015 trie_rebuild = 1; 1016 goto trie_build; 1017 } 1018 1019 d_size = sizeof(*da->d_tbl) * d_tbl_size; 1020 x_size = sizeof(*da->x_tbl) * DIRECT_TBL_SIZE / d_tbl_size 1021 * da->all_trie_cnt; 1022 dxr_tot_size = d_size + x_size + r_size; 1023 1024 if (trie_rebuild == 1) { 1025 /* Try to find a more compact D/X split */ 1026 if (prev_size == 0 || dxr_tot_size <= prev_size) 1027 da->d_bits--; 1028 else { 1029 da->d_bits++; 1030 trie_rebuild = 2; 1031 } 1032 prev_size = dxr_tot_size; 1033 goto dxr2_try_squeeze; 1034 } 1035 microuptime(&t2); 1036 #else /* !DXR2 */ 1037 dxr_tot_size = sizeof(da->direct_tbl) + r_size; 1038 t2 = t1; 1039 #endif 1040 1041 dxr->d = malloc(dxr_tot_size, M_DXRLPM, M_NOWAIT); 1042 if (dxr->d == NULL) 1043 return; 1044 #ifdef DXR2 1045 memcpy(dxr->d, da->d_tbl, d_size); 1046 dxr->x = ((char *) dxr->d) + d_size; 1047 memcpy(dxr->x, da->x_tbl, x_size); 1048 dxr->r = ((char *) dxr->x) + x_size; 1049 dxr->d_shift = 32 - da->d_bits; 1050 dxr->x_shift = dxr_x; 1051 dxr->x_mask = 0xffffffffU >> (32 - dxr_x); 1052 #else /* !DXR2 */ 1053 memcpy(dxr->d, da->direct_tbl, sizeof(da->direct_tbl)); 1054 dxr->r = ((char *) dxr->d) + sizeof(da->direct_tbl); 1055 #endif 1056 memcpy(dxr->r, da->range_tbl, r_size); 1057 1058 if (da->updates_low <= da->updates_high) 1059 bzero(&da->updates_mask[da->updates_low / 32], 1060 (da->updates_high - da->updates_low) / 8 + 1); 1061 da->updates_low = DIRECT_TBL_SIZE - 1; 1062 da->updates_high = 0; 1063 microuptime(&t3); 1064 1065 #ifdef DXR2 1066 FIB_PRINTF(LOG_INFO, da->fd, "D%dX%dR, %d prefixes, %d nhops (max)", 1067 da->d_bits, dxr_x, rinfo.num_prefixes, rinfo.num_nhops); 1068 #else 1069 FIB_PRINTF(LOG_INFO, da->fd, "D%dR, %d prefixes, %d nhops (max)", 1070 DXR_D, rinfo.num_prefixes, rinfo.num_nhops); 1071 #endif 1072 i = dxr_tot_size * 100; 1073 if (rinfo.num_prefixes) 1074 i /= rinfo.num_prefixes; 1075 FIB_PRINTF(LOG_INFO, da->fd, "%d.%02d KBytes, %d.%02d Bytes/prefix", 1076 dxr_tot_size / 1024, dxr_tot_size * 100 / 1024 % 100, 1077 i / 100, i % 100); 1078 #ifdef DXR2 1079 FIB_PRINTF(LOG_INFO, da->fd, 1080 "%d.%02d%% trie, %d.%02d%% range fragmentation", 1081 trie_frag / 100, trie_frag % 100, 1082 range_frag / 100, range_frag % 100); 1083 #else 1084 FIB_PRINTF(LOG_INFO, da->fd, "%d.%01d%% range fragmentation", 1085 range_frag / 100, range_frag % 100); 1086 #endif 1087 i = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec; 1088 FIB_PRINTF(LOG_INFO, da->fd, "range table %s in %u.%03u ms", 1089 range_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000); 1090 #ifdef DXR2 1091 i = (t2.tv_sec - t1.tv_sec) * 1000000 + t2.tv_usec - t1.tv_usec; 1092 FIB_PRINTF(LOG_INFO, da->fd, "trie %s in %u.%03u ms", 1093 trie_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000); 1094 #endif 1095 i = (t3.tv_sec - t2.tv_sec) * 1000000 + t3.tv_usec - t2.tv_usec; 1096 FIB_PRINTF(LOG_INFO, da->fd, "snapshot forked in %u.%03u ms", 1097 i / 1000, i % 1000); 1098 } 1099 1100 /* 1101 * Glue functions for attaching to FreeBSD 13 fib_algo infrastructure. 1102 */ 1103 1104 static struct nhop_object * 1105 dxr_fib_lookup(void *algo_data, const struct flm_lookup_key key, 1106 uint32_t scopeid) 1107 { 1108 struct dxr *dxr = algo_data; 1109 1110 return (dxr->nh_tbl[dxr_lookup(dxr, ntohl(key.addr4.s_addr))]); 1111 } 1112 1113 static enum flm_op_result 1114 dxr_init(uint32_t fibnum, struct fib_data *fd, void *old_data, void **data) 1115 { 1116 struct dxr *old_dxr = old_data; 1117 struct dxr_aux *da = NULL; 1118 struct dxr *dxr; 1119 1120 dxr = malloc(sizeof(*dxr), M_DXRAUX, M_NOWAIT); 1121 if (dxr == NULL) 1122 return (FLM_REBUILD); 1123 1124 /* Check whether we may reuse the old auxiliary structures */ 1125 if (old_dxr != NULL && old_dxr->aux != NULL) { 1126 da = old_dxr->aux; 1127 atomic_add_int(&da->refcnt, 1); 1128 } 1129 1130 dxr->aux = da; 1131 dxr->d = NULL; 1132 dxr->fd = fd; 1133 dxr->fibnum = fibnum; 1134 *data = dxr; 1135 1136 return (FLM_SUCCESS); 1137 } 1138 1139 static void 1140 dxr_destroy(void *data) 1141 { 1142 struct dxr *dxr = data; 1143 struct dxr_aux *da; 1144 struct chunk_desc *cdp; 1145 struct trie_desc *tp; 1146 1147 if (dxr->d != NULL) 1148 free(dxr->d, M_DXRLPM); 1149 1150 da = dxr->aux; 1151 free(dxr, M_DXRAUX); 1152 1153 if (da == NULL || atomic_fetchadd_int(&da->refcnt, -1) > 1) 1154 return; 1155 1156 /* Release auxiliary structures */ 1157 while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) { 1158 LIST_REMOVE(cdp, cd_all_le); 1159 uma_zfree(chunk_zone, cdp); 1160 } 1161 while ((tp = LIST_FIRST(&da->all_trie)) != NULL) { 1162 LIST_REMOVE(tp, td_all_le); 1163 uma_zfree(trie_zone, tp); 1164 } 1165 free(da->range_tbl, M_DXRAUX); 1166 free(da->x_tbl, M_DXRAUX); 1167 free(da, M_DXRAUX); 1168 } 1169 1170 static void 1171 epoch_dxr_destroy(epoch_context_t ctx) 1172 { 1173 struct dxr *dxr = __containerof(ctx, struct dxr, epoch_ctx); 1174 1175 dxr_destroy(dxr); 1176 } 1177 1178 static void * 1179 choose_lookup_fn(struct dxr_aux *da) 1180 { 1181 1182 #ifdef DXR2 1183 switch (da->d_bits) { 1184 #if DXR_TRIE_BITS > 16 1185 case 16: 1186 return (dxr_fib_lookup_16); 1187 #endif 1188 case 15: 1189 return (dxr_fib_lookup_15); 1190 case 14: 1191 return (dxr_fib_lookup_14); 1192 case 13: 1193 return (dxr_fib_lookup_13); 1194 case 12: 1195 return (dxr_fib_lookup_12); 1196 case 11: 1197 return (dxr_fib_lookup_11); 1198 case 10: 1199 return (dxr_fib_lookup_10); 1200 case 9: 1201 return (dxr_fib_lookup_9); 1202 } 1203 #endif /* DXR2 */ 1204 return (dxr_fib_lookup); 1205 } 1206 1207 static enum flm_op_result 1208 dxr_dump_end(void *data, struct fib_dp *dp) 1209 { 1210 struct dxr *dxr = data; 1211 struct dxr_aux *da; 1212 1213 dxr_build(dxr); 1214 1215 da = dxr->aux; 1216 if (da == NULL) 1217 return (FLM_REBUILD); 1218 1219 /* Structural limit exceeded, hard error */ 1220 if (da->rtbl_top >= BASE_MAX) 1221 return (FLM_ERROR); 1222 1223 /* A malloc(,, M_NOWAIT) failed somewhere, retry later */ 1224 if (dxr->d == NULL) 1225 return (FLM_REBUILD); 1226 1227 dp->f = choose_lookup_fn(da); 1228 dp->arg = dxr; 1229 1230 return (FLM_SUCCESS); 1231 } 1232 1233 static enum flm_op_result 1234 dxr_dump_rib_item(struct rtentry *rt, void *data) 1235 { 1236 1237 return (FLM_SUCCESS); 1238 } 1239 1240 static enum flm_op_result 1241 dxr_change_rib_item(struct rib_head *rnh, struct rib_cmd_info *rc, 1242 void *data) 1243 { 1244 1245 return (FLM_BATCH); 1246 } 1247 1248 static enum flm_op_result 1249 dxr_change_rib_batch(struct rib_head *rnh, struct fib_change_queue *q, 1250 void *data) 1251 { 1252 struct dxr *dxr = data; 1253 struct dxr *new_dxr; 1254 struct dxr_aux *da; 1255 struct fib_dp new_dp; 1256 enum flm_op_result res; 1257 uint32_t ip, plen, hmask, start, end, i, ui; 1258 #ifdef INVARIANTS 1259 struct rib_rtable_info rinfo; 1260 int update_delta = 0; 1261 #endif 1262 1263 KASSERT(data != NULL, ("%s: NULL data", __FUNCTION__)); 1264 KASSERT(q != NULL, ("%s: NULL q", __FUNCTION__)); 1265 KASSERT(q->count < q->size, ("%s: q->count %d q->size %d", 1266 __FUNCTION__, q->count, q->size)); 1267 1268 da = dxr->aux; 1269 KASSERT(da != NULL, ("%s: NULL dxr->aux", __FUNCTION__)); 1270 1271 FIB_PRINTF(LOG_INFO, da->fd, "processing %d update(s)", q->count); 1272 for (ui = 0; ui < q->count; ui++) { 1273 #ifdef INVARIANTS 1274 if (q->entries[ui].nh_new != NULL) 1275 update_delta++; 1276 if (q->entries[ui].nh_old != NULL) 1277 update_delta--; 1278 #endif 1279 plen = q->entries[ui].plen; 1280 ip = ntohl(q->entries[ui].addr4.s_addr); 1281 if (plen < 32) 1282 hmask = 0xffffffffU >> plen; 1283 else 1284 hmask = 0; 1285 start = (ip & ~hmask) >> DXR_RANGE_SHIFT; 1286 end = (ip | hmask) >> DXR_RANGE_SHIFT; 1287 1288 if ((start & 0x1f) == 0 && (end & 0x1f) == 0x1f) 1289 for (i = start >> 5; i <= end >> 5; i++) 1290 da->updates_mask[i] = 0xffffffffU; 1291 else 1292 for (i = start; i <= end; i++) 1293 da->updates_mask[i >> 5] |= (1 << (i & 0x1f)); 1294 if (start < da->updates_low) 1295 da->updates_low = start; 1296 if (end > da->updates_high) 1297 da->updates_high = end; 1298 } 1299 1300 #ifdef INVARIANTS 1301 fib_get_rtable_info(fib_get_rh(da->fd), &rinfo); 1302 KASSERT(da->prefixes + update_delta == rinfo.num_prefixes, 1303 ("%s: update count mismatch", __FUNCTION__)); 1304 #endif 1305 1306 res = dxr_init(0, dxr->fd, data, (void **) &new_dxr); 1307 if (res != FLM_SUCCESS) 1308 return (res); 1309 1310 dxr_build(new_dxr); 1311 1312 /* Structural limit exceeded, hard error */ 1313 if (da->rtbl_top >= BASE_MAX) { 1314 dxr_destroy(new_dxr); 1315 return (FLM_ERROR); 1316 } 1317 1318 /* A malloc(,, M_NOWAIT) failed somewhere, retry later */ 1319 if (new_dxr->d == NULL) { 1320 dxr_destroy(new_dxr); 1321 return (FLM_REBUILD); 1322 } 1323 1324 new_dp.f = choose_lookup_fn(da); 1325 new_dp.arg = new_dxr; 1326 if (fib_set_datapath_ptr(dxr->fd, &new_dp)) { 1327 fib_set_algo_ptr(dxr->fd, new_dxr); 1328 fib_epoch_call(epoch_dxr_destroy, &dxr->epoch_ctx); 1329 return (FLM_SUCCESS); 1330 } 1331 1332 dxr_destroy(new_dxr); 1333 return (FLM_REBUILD); 1334 } 1335 1336 static uint8_t 1337 dxr_get_pref(const struct rib_rtable_info *rinfo) 1338 { 1339 1340 /* Below bsearch4 up to 10 prefixes. Always supersedes dpdk_lpm4. */ 1341 return (251); 1342 } 1343 1344 SYSCTL_DECL(_net_route_algo); 1345 SYSCTL_NODE(_net_route_algo, OID_AUTO, dxr, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1346 "DXR tunables"); 1347 1348 static int 1349 sysctl_dxr_frag_limit(SYSCTL_HANDLER_ARGS) 1350 { 1351 char buf[8]; 1352 int error, new, i; 1353 1354 snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100, 1355 V_frag_limit % 100); 1356 error = sysctl_handle_string(oidp, buf, sizeof(buf), req); 1357 if (error != 0 || req->newptr == NULL) 1358 return (error); 1359 if (!isdigit(*buf) && *buf != '.') 1360 return (EINVAL); 1361 for (i = 0, new = 0; isdigit(buf[i]) && i < sizeof(buf); i++) 1362 new = new * 10 + buf[i] - '0'; 1363 new *= 100; 1364 if (buf[i++] == '.') { 1365 if (!isdigit(buf[i])) 1366 return (EINVAL); 1367 new += (buf[i++] - '0') * 10; 1368 if (isdigit(buf[i])) 1369 new += buf[i++] - '0'; 1370 } 1371 if (new > 1000) 1372 return (EINVAL); 1373 V_frag_limit = new; 1374 snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100, 1375 V_frag_limit % 100); 1376 return (0); 1377 } 1378 1379 SYSCTL_PROC(_net_route_algo_dxr, OID_AUTO, frag_limit, 1380 CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_VNET, 1381 0, 0, sysctl_dxr_frag_limit, "A", 1382 "Fragmentation threshold to full rebuild"); 1383 1384 static struct fib_lookup_module fib_dxr_mod = { 1385 .flm_name = "dxr", 1386 .flm_family = AF_INET, 1387 .flm_init_cb = dxr_init, 1388 .flm_destroy_cb = dxr_destroy, 1389 .flm_dump_rib_item_cb = dxr_dump_rib_item, 1390 .flm_dump_end_cb = dxr_dump_end, 1391 .flm_change_rib_item_cb = dxr_change_rib_item, 1392 .flm_change_rib_items_cb = dxr_change_rib_batch, 1393 .flm_get_pref = dxr_get_pref, 1394 }; 1395 1396 static int 1397 dxr_modevent(module_t mod, int type, void *unused) 1398 { 1399 int error; 1400 1401 switch (type) { 1402 case MOD_LOAD: 1403 chunk_zone = uma_zcreate("dxr chunk", sizeof(struct chunk_desc), 1404 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); 1405 trie_zone = uma_zcreate("dxr trie", sizeof(struct trie_desc), 1406 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); 1407 fib_module_register(&fib_dxr_mod); 1408 return(0); 1409 case MOD_UNLOAD: 1410 error = fib_module_unregister(&fib_dxr_mod); 1411 if (error) 1412 return (error); 1413 uma_zdestroy(chunk_zone); 1414 uma_zdestroy(trie_zone); 1415 return(0); 1416 default: 1417 return(EOPNOTSUPP); 1418 } 1419 } 1420 1421 static moduledata_t dxr_mod = {"fib_dxr", dxr_modevent, 0}; 1422 1423 DECLARE_MODULE(fib_dxr, dxr_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); 1424 MODULE_VERSION(fib_dxr, 1); 1425