/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2012-2024 Marko Zec
 * Copyright (c) 2005, 2018 University of Zagreb
 * Copyright (c) 2005 International Computer Science Institute
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * An implementation of DXR, a simple IPv4 LPM scheme with compact lookup
 * structures and a trivial search procedure.  More significant bits of
 * the search key are used to directly index a two-stage trie, while the
 * remaining bits are used for finding the next hop in a sorted array.
 * More details in:
 *
 * M. Zec, L. Rizzo, M. Mikuc, DXR: towards a billion routing lookups per
 * second in software, ACM SIGCOMM Computer Communication Review, September
 * 2012
 *
 * M. Zec, M. Mikuc, Pushing the envelope: beyond two billion IP routing
 * lookups per second on commodity CPUs, IEEE SoftCOM, September 2017, Split
 */
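/*
 * Illustrative sketch (editorial, not part of the original sources): with
 * the default DXR_TRIE_BITS of 20 defined below, a 32-bit destination
 * address is split into a 20-bit trie index and a 12-bit residue searched
 * in the range table.  For example, for dst = 0x0a141e28 (10.20.30.40):
 *
 *	dst >> DXR_RANGE_SHIFT	= 0x0a141	(direct/extension trie index)
 *	dst & DXR_RANGE_MASK	= 0xe28		(key for the binary search)
 *
 * The concrete widths are assumptions taken from the defaults in this file;
 * a different DXR_TRIE_BITS setting changes both of them.
 */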
#include <sys/cdefs.h>
#include "opt_inet.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/ctype.h>
#include <sys/epoch.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>

#include <vm/uma.h>

#include <netinet/in.h>
#include <netinet/in_fib.h>

#include <net/route.h>
#include <net/route/route_ctl.h>
#include <net/route/fib_algo.h>

#define	DXR_TRIE_BITS		20

CTASSERT(DXR_TRIE_BITS >= 16 && DXR_TRIE_BITS <= 24);

#if DXR_TRIE_BITS > 16
#define	DXR_D			16
#else
#define	DXR_D			(DXR_TRIE_BITS - 1)
#endif

#define	D_TBL_SIZE		(1 << DXR_D)
#define	DIRECT_TBL_SIZE		(1 << DXR_TRIE_BITS)
#define	DXR_RANGE_MASK		(0xffffffffU >> DXR_TRIE_BITS)
#define	DXR_RANGE_SHIFT		(32 - DXR_TRIE_BITS)

#define	DESC_BASE_BITS		22
#define	DESC_FRAGMENTS_BITS	(32 - DESC_BASE_BITS)
#define	BASE_MAX		((1 << DESC_BASE_BITS) - 1)
#define	RTBL_SIZE_INCR		(BASE_MAX / 64)

#if DXR_TRIE_BITS < 24
#define	FRAGS_MASK_SHORT	((1 << (23 - DXR_TRIE_BITS)) - 1)
#else
#define	FRAGS_MASK_SHORT	0
#endif
#define	FRAGS_PREF_SHORT	(((1 << DESC_FRAGMENTS_BITS) - 1) & \
				    ~FRAGS_MASK_SHORT)
#define	FRAGS_MARK_XL		(FRAGS_PREF_SHORT - 1)
#define	FRAGS_MARK_HIT		(FRAGS_PREF_SHORT - 2)

#define	IS_SHORT_FORMAT(x)	((x & FRAGS_PREF_SHORT) == FRAGS_PREF_SHORT)
#define	IS_LONG_FORMAT(x)	((x & FRAGS_PREF_SHORT) != FRAGS_PREF_SHORT)
#define	IS_XL_FORMAT(x)		(x == FRAGS_MARK_XL)

#define	RE_SHORT_MAX_NH		((1 << (DXR_TRIE_BITS - 8)) - 1)

#define	CHUNK_HASH_BITS		16
#define	CHUNK_HASH_SIZE		(1 << CHUNK_HASH_BITS)
#define	CHUNK_HASH_MASK		(CHUNK_HASH_SIZE - 1)

#define	TRIE_HASH_BITS		16
#define	TRIE_HASH_SIZE		(1 << TRIE_HASH_BITS)
#define	TRIE_HASH_MASK		(TRIE_HASH_SIZE - 1)

#define	XTBL_SIZE_INCR		(DIRECT_TBL_SIZE / 16)

#define	UNUSED_BUCKETS		8

/* Lookup structure elements */

struct direct_entry {
	uint32_t		fragments: DESC_FRAGMENTS_BITS,
				base: DESC_BASE_BITS;
};

struct range_entry_long {
	uint32_t		start: DXR_RANGE_SHIFT,
				nexthop: DXR_TRIE_BITS;
};

#if DXR_TRIE_BITS < 24
struct range_entry_short {
	uint16_t		start: DXR_RANGE_SHIFT - 8,
				nexthop: DXR_TRIE_BITS - 8;
};
#endif

/* Auxiliary structures */

struct heap_entry {
	uint32_t		start;
	uint32_t		end;
	uint32_t		preflen;
	uint32_t		nexthop;
};

struct chunk_desc {
	LIST_ENTRY(chunk_desc)	cd_all_le;
	LIST_ENTRY(chunk_desc)	cd_hash_le;
	uint32_t		cd_hash;
	uint32_t		cd_refcnt;
	uint32_t		cd_base;
	uint32_t		cd_cur_size;
	uint32_t		cd_max_size;
};

struct trie_desc {
	LIST_ENTRY(trie_desc)	td_all_le;
	LIST_ENTRY(trie_desc)	td_hash_le;
	uint32_t		td_hash;
	uint32_t		td_index;
	uint32_t		td_refcnt;
};

struct dxr_aux {
	/* Glue to external state */
	struct fib_data		*fd;
	uint32_t		fibnum;
	int			refcnt;

	/* Auxiliary build-time tables */
	struct direct_entry	direct_tbl[DIRECT_TBL_SIZE];
	uint16_t		d_tbl[D_TBL_SIZE];
	struct direct_entry	*x_tbl;
	union {
		struct range_entry_long	re;
		uint32_t		fragments;
	}			*range_tbl;

	/* Auxiliary internal state */
	uint32_t		updates_mask[DIRECT_TBL_SIZE / 32];
	struct trie_desc	*trietbl[D_TBL_SIZE];
	LIST_HEAD(, chunk_desc)	chunk_hashtbl[CHUNK_HASH_SIZE];
	LIST_HEAD(, chunk_desc)	all_chunks;
	LIST_HEAD(, chunk_desc)	unused_chunks[UNUSED_BUCKETS];
	LIST_HEAD(, trie_desc)	trie_hashtbl[TRIE_HASH_SIZE];
	LIST_HEAD(, trie_desc)	all_trie;
	LIST_HEAD(, trie_desc)	unused_trie;	/* abuses hash link entry */
	struct sockaddr_in	dst;
	struct sockaddr_in	mask;
	struct heap_entry	heap[33];
	uint32_t		prefixes;
	uint32_t		updates_low;
	uint32_t		updates_high;
	uint32_t		unused_chunks_size;
	uint32_t		xtbl_size;
	uint32_t		all_trie_cnt;
	uint32_t		unused_trie_cnt;
	uint32_t		trie_rebuilt_prefixes;
	uint32_t		heap_index;
	uint32_t		d_bits;
	uint32_t		rtbl_size;
	uint32_t		rtbl_top;
	uint32_t		rtbl_work_frags;
	uint32_t		work_chunk;
};

/* Main lookup structure container */

struct dxr {
	/* Lookup tables */
	void			*d;
	void			*x;
	void			*r;
	struct nhop_object	**nh_tbl;

	/* Glue to external state */
	struct dxr_aux		*aux;
	struct fib_data		*fd;
	struct epoch_context	epoch_ctx;
	uint32_t		fibnum;
	uint16_t		d_shift;
	uint16_t		x_shift;
	uint32_t		x_mask;
};

static MALLOC_DEFINE(M_DXRLPM, "dxr", "DXR LPM");
static MALLOC_DEFINE(M_DXRAUX, "dxr aux", "DXR auxiliary");

uma_zone_t chunk_zone;
uma_zone_t trie_zone;

VNET_DEFINE_STATIC(int, frag_limit) = 100;
#define	V_frag_limit	VNET(frag_limit)

/* Binary search for a matching address range */
#define	DXR_LOOKUP_STAGE					\
	if (masked_dst < range[middle].start) {			\
		upperbound = middle;				\
		middle = (middle + lowerbound) / 2;		\
	} else if (masked_dst < range[middle + 1].start)	\
		return (range[middle].nexthop);			\
	else {							\
		lowerbound = middle + 1;			\
		middle = (upperbound + middle + 1) / 2;		\
	}							\
	if (upperbound == lowerbound)				\
		return (range[lowerbound].nexthop);

static int
range_lookup(struct range_entry_long *rt, struct direct_entry de, uint32_t dst)
{
	uint32_t base;
	uint32_t upperbound;
	uint32_t middle;
	uint32_t lowerbound;
	uint32_t masked_dst;

	base = de.base;
	lowerbound = 0;
	masked_dst = dst & DXR_RANGE_MASK;

#if DXR_TRIE_BITS < 24
	if (__predict_true(IS_SHORT_FORMAT(de.fragments))) {
		upperbound = de.fragments & FRAGS_MASK_SHORT;
		struct range_entry_short *range =
		    (struct range_entry_short *) &rt[base];

		masked_dst >>= 8;
		middle = upperbound;
		upperbound = upperbound * 2 + 1;

		for (;;) {
			DXR_LOOKUP_STAGE
			DXR_LOOKUP_STAGE
		}
	}
#endif

	upperbound = de.fragments;
	middle = upperbound / 2;
	struct range_entry_long *range = &rt[base];
	if (__predict_false(IS_XL_FORMAT(de.fragments))) {
		upperbound = *((uint32_t *) range);
		range++;
		middle = upperbound / 2;
	}

	for (;;) {
		DXR_LOOKUP_STAGE
		DXR_LOOKUP_STAGE
	}
}
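/*
 * Illustrative note (editorial, not part of the original sources):
 * range_lookup() dispatches on the encoding of the fragments field set up
 * at build time.  With the defaults above (DXR_TRIE_BITS = 20,
 * DESC_FRAGMENTS_BITS = 10) the constants work out to:
 *
 *	FRAGS_MASK_SHORT	0x007
 *	FRAGS_PREF_SHORT	0x3f8	(top bits flag 16-bit range entries)
 *	FRAGS_MARK_XL		0x3f7	(entry count stored at rt[base])
 *	FRAGS_MARK_HIT		0x3f6	(handled by the callers: de.base
 *					 already holds the nexthop index)
 *
 * Anything else is the long format, where de.fragments itself is the index
 * of the last 32-bit range entry.  The concrete values are assumptions
 * derived from the default DXR_TRIE_BITS and change if it is retuned.
 */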
#define	DXR_LOOKUP_DEFINE(D)						\
	static int inline						\
	dxr_lookup_##D(struct dxr *dxr, uint32_t dst)			\
	{								\
		struct direct_entry de;					\
		uint16_t *dt = dxr->d;					\
		struct direct_entry *xt = dxr->x;			\
									\
		de = xt[(dt[dst >> (32 - (D))] << (DXR_TRIE_BITS - (D))) \
		    + ((dst >> DXR_RANGE_SHIFT) &			\
		    (0xffffffffU >> (32 - DXR_TRIE_BITS + (D))))];	\
		if (__predict_true(de.fragments == FRAGS_MARK_HIT))	\
			return (de.base);				\
		return (range_lookup(dxr->r, de, dst));			\
	}								\
									\
	static struct nhop_object *					\
	dxr_fib_lookup_##D(void *algo_data,				\
	    const struct flm_lookup_key key, uint32_t scopeid __unused) \
	{								\
		struct dxr *dxr = algo_data;				\
									\
		return (dxr->nh_tbl[dxr_lookup_##D(dxr,			\
		    ntohl(key.addr4.s_addr))]);				\
	}

#if DXR_TRIE_BITS > 16
DXR_LOOKUP_DEFINE(16)
#endif
DXR_LOOKUP_DEFINE(15)
DXR_LOOKUP_DEFINE(14)
DXR_LOOKUP_DEFINE(13)
DXR_LOOKUP_DEFINE(12)
DXR_LOOKUP_DEFINE(11)
DXR_LOOKUP_DEFINE(10)
DXR_LOOKUP_DEFINE(9)

static int inline
dxr_lookup(struct dxr *dxr, uint32_t dst)
{
	struct direct_entry de;
	uint16_t *dt = dxr->d;
	struct direct_entry *xt = dxr->x;

	de = xt[(dt[dst >> dxr->d_shift] << dxr->x_shift) +
	    ((dst >> DXR_RANGE_SHIFT) & dxr->x_mask)];
	if (__predict_true(de.fragments == FRAGS_MARK_HIT))
		return (de.base);
	return (range_lookup(dxr->r, de, dst));
}

static void
initheap(struct dxr_aux *da, uint32_t dst_u32, uint32_t chunk)
{
	struct heap_entry *fhp = &da->heap[0];
	struct rtentry *rt;
	struct route_nhop_data rnd;

	da->heap_index = 0;
	da->dst.sin_addr.s_addr = htonl(dst_u32);
	rt = fib4_lookup_rt(da->fibnum, da->dst.sin_addr, 0, NHR_UNLOCKED,
	    &rnd);
	if (rt != NULL) {
		struct in_addr addr;
		uint32_t scopeid;

		rt_get_inet_prefix_plen(rt, &addr, &fhp->preflen, &scopeid);
		fhp->start = ntohl(addr.s_addr);
		fhp->end = fhp->start;
		if (fhp->preflen < 32)
			fhp->end |= (0xffffffffU >> fhp->preflen);
		fhp->nexthop = fib_get_nhop_idx(da->fd, rnd.rnd_nhop);
	} else {
		fhp->preflen = fhp->nexthop = fhp->start = 0;
		fhp->end = 0xffffffffU;
	}
}

static uint32_t
chunk_size(struct dxr_aux *da, struct direct_entry *fdesc)
{

	if (IS_SHORT_FORMAT(fdesc->fragments))
		return ((fdesc->fragments & FRAGS_MASK_SHORT) + 1);
	else if (IS_XL_FORMAT(fdesc->fragments))
		return (da->range_tbl[fdesc->base].fragments + 2);
	else /* if (IS_LONG_FORMAT(fdesc->fragments)) */
		return (fdesc->fragments + 1);
}

static uint32_t
chunk_hash(struct dxr_aux *da, struct direct_entry *fdesc)
{
	uint32_t size = chunk_size(da, fdesc);
	uint32_t *p = (uint32_t *) &da->range_tbl[fdesc->base];
	uint32_t *l = (uint32_t *) &da->range_tbl[fdesc->base + size];
	uint32_t hash = fdesc->fragments;

	for (; p < l; p++)
		hash = (hash << 7) + (hash >> 13) + *p;

	return (hash + (hash >> 16));
}

static int
chunk_ref(struct dxr_aux *da, uint32_t chunk)
{
	struct direct_entry *fdesc = &da->direct_tbl[chunk];
	struct chunk_desc *cdp, *empty_cdp;
	uint32_t base = fdesc->base;
	uint32_t size = chunk_size(da, fdesc);
	uint32_t hash = chunk_hash(da, fdesc);
	int i;

	/* Find an existing descriptor */
	LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK],
	    cd_hash_le) {
		if (cdp->cd_hash != hash || cdp->cd_cur_size != size ||
		    memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base],
		    sizeof(struct range_entry_long) * size))
			continue;
		da->rtbl_top = fdesc->base;
		fdesc->base = cdp->cd_base;
		cdp->cd_refcnt++;
		return (0);
	}

	/* No matching chunks found.  Find an empty one to recycle. */
	for (cdp = NULL, i = size; cdp == NULL && i < UNUSED_BUCKETS; i++)
		cdp = LIST_FIRST(&da->unused_chunks[i]);

	if (cdp == NULL)
		LIST_FOREACH(empty_cdp, &da->unused_chunks[0], cd_hash_le)
			if (empty_cdp->cd_max_size >= size && (cdp == NULL ||
			    empty_cdp->cd_max_size < cdp->cd_max_size)) {
				cdp = empty_cdp;
				if (empty_cdp->cd_max_size == size)
					break;
			}

	if (cdp != NULL) {
		/* Copy from heap into the recycled chunk */
		bcopy(&da->range_tbl[fdesc->base], &da->range_tbl[cdp->cd_base],
		    size * sizeof(struct range_entry_long));
		fdesc->base = cdp->cd_base;
		da->rtbl_top -= size;
		da->unused_chunks_size -= cdp->cd_max_size;
		if (cdp->cd_max_size > size) {
			/* Split the range in two, need a new descriptor */
			empty_cdp = uma_zalloc(chunk_zone, M_NOWAIT);
			if (empty_cdp == NULL)
				return (1);
			LIST_INSERT_BEFORE(cdp, empty_cdp, cd_all_le);
			empty_cdp->cd_base = cdp->cd_base + size;
			empty_cdp->cd_cur_size = 0;
			empty_cdp->cd_max_size = cdp->cd_max_size - size;

			i = empty_cdp->cd_max_size;
			if (i >= UNUSED_BUCKETS)
				i = 0;
			LIST_INSERT_HEAD(&da->unused_chunks[i], empty_cdp,
			    cd_hash_le);

			da->unused_chunks_size += empty_cdp->cd_max_size;
			cdp->cd_max_size = size;
		}
		LIST_REMOVE(cdp, cd_hash_le);
	} else {
		/* Alloc a new descriptor at the top of the heap */
		cdp = uma_zalloc(chunk_zone, M_NOWAIT);
		if (cdp == NULL)
			return (1);
		cdp->cd_max_size = size;
		cdp->cd_base = fdesc->base;
		LIST_INSERT_HEAD(&da->all_chunks, cdp, cd_all_le);
		MPASS(cdp->cd_base + cdp->cd_max_size == da->rtbl_top);
	}

	cdp->cd_hash = hash;
	cdp->cd_refcnt = 1;
	cdp->cd_cur_size = size;
	LIST_INSERT_HEAD(&da->chunk_hashtbl[hash & CHUNK_HASH_MASK], cdp,
	    cd_hash_le);
	if (da->rtbl_top >= da->rtbl_size) {
		if (da->rtbl_top >= BASE_MAX) {
			FIB_PRINTF(LOG_ERR, da->fd,
			    "structural limit exceeded at %d "
			    "range table elements", da->rtbl_top);
			return (1);
		}
		da->rtbl_size += RTBL_SIZE_INCR;
		i = (BASE_MAX - da->rtbl_top) * LOG_DEBUG / BASE_MAX;
		FIB_PRINTF(i, da->fd, "range table at %d%% structural limit",
		    da->rtbl_top * 100 / BASE_MAX);
		da->range_tbl = realloc(da->range_tbl,
		    sizeof(*da->range_tbl) * da->rtbl_size + FRAGS_PREF_SHORT,
		    M_DXRAUX, M_NOWAIT);
		if (da->range_tbl == NULL) {
			FIB_PRINTF(LOG_NOTICE, da->fd,
			    "Unable to allocate DXR range table");
			return (1);
		}
	}

	return (0);
}

static void
chunk_unref(struct dxr_aux *da, uint32_t chunk)
{
	struct direct_entry *fdesc = &da->direct_tbl[chunk];
	struct chunk_desc *cdp, *cdp2;
	uint32_t base = fdesc->base;
	uint32_t size = chunk_size(da, fdesc);
	uint32_t hash = chunk_hash(da, fdesc);
	int i;

	/* Find the corresponding descriptor */
	LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK],
	    cd_hash_le)
		if (cdp->cd_hash == hash && cdp->cd_cur_size == size &&
		    memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base],
		    sizeof(struct range_entry_long) * size) == 0)
			break;

	MPASS(cdp != NULL);
	if (--cdp->cd_refcnt > 0)
		return;

	LIST_REMOVE(cdp, cd_hash_le);
	da->unused_chunks_size += cdp->cd_max_size;
	cdp->cd_cur_size = 0;

	/* Attempt to merge with the preceding chunk, if empty */
	cdp2 = LIST_NEXT(cdp, cd_all_le);
	if (cdp2 != NULL && cdp2->cd_cur_size == 0) {
		MPASS(cdp2->cd_base + cdp2->cd_max_size == cdp->cd_base);
		LIST_REMOVE(cdp, cd_all_le);
		LIST_REMOVE(cdp2, cd_hash_le);
		cdp2->cd_max_size += cdp->cd_max_size;
		uma_zfree(chunk_zone, cdp);
		cdp = cdp2;
	}

	/* Attempt to merge with the subsequent chunk, if empty */
	cdp2 = LIST_PREV(cdp, &da->all_chunks, chunk_desc, cd_all_le);
	if (cdp2 != NULL && cdp2->cd_cur_size == 0) {
		MPASS(cdp->cd_base + cdp->cd_max_size == cdp2->cd_base);
		LIST_REMOVE(cdp, cd_all_le);
		LIST_REMOVE(cdp2, cd_hash_le);
		cdp2->cd_max_size += cdp->cd_max_size;
		cdp2->cd_base = cdp->cd_base;
		uma_zfree(chunk_zone, cdp);
		cdp = cdp2;
	}

	if (cdp->cd_base + cdp->cd_max_size == da->rtbl_top) {
		/* Free the chunk on the top of the range heap, trim the heap */
		MPASS(cdp == LIST_FIRST(&da->all_chunks));
		da->rtbl_top -= cdp->cd_max_size;
		da->unused_chunks_size -= cdp->cd_max_size;
		LIST_REMOVE(cdp, cd_all_le);
		uma_zfree(chunk_zone, cdp);
		return;
	}

	i = cdp->cd_max_size;
	if (i >= UNUSED_BUCKETS)
		i = 0;
	LIST_INSERT_HEAD(&da->unused_chunks[i], cdp, cd_hash_le);
}

static uint32_t
trie_hash(struct dxr_aux *da, uint32_t dxr_x, uint32_t index)
{
	uint32_t i, *val;
	uint32_t hash = 0;

	for (i = 0; i < (1 << dxr_x); i++) {
		hash = (hash << 3) ^ (hash >> 3);
		val = (uint32_t *)
		    (void *) &da->direct_tbl[(index << dxr_x) + i];
		hash += (*val << 5);
		hash += (*val >> 5);
	}

	return (hash + (hash >> 16));
}

static int
trie_ref(struct dxr_aux *da, uint32_t index)
{
	struct trie_desc *tp;
	uint32_t dxr_d = da->d_bits;
	uint32_t dxr_x = DXR_TRIE_BITS - dxr_d;
	uint32_t hash = trie_hash(da, dxr_x, index);

	/* Find an existing descriptor */
	LIST_FOREACH(tp, &da->trie_hashtbl[hash & TRIE_HASH_MASK], td_hash_le)
		if (tp->td_hash == hash &&
		    memcmp(&da->direct_tbl[index << dxr_x],
		    &da->x_tbl[tp->td_index << dxr_x],
		    sizeof(*da->x_tbl) << dxr_x) == 0) {
			tp->td_refcnt++;
			da->trietbl[index] = tp;
			return (tp->td_index);
		}

	tp = LIST_FIRST(&da->unused_trie);
	if (tp != NULL) {
		LIST_REMOVE(tp, td_hash_le);
		da->unused_trie_cnt--;
	} else {
		tp = uma_zalloc(trie_zone, M_NOWAIT);
		if (tp == NULL)
			return (-1);
		LIST_INSERT_HEAD(&da->all_trie, tp, td_all_le);
		tp->td_index = da->all_trie_cnt++;
	}

	tp->td_hash = hash;
	tp->td_refcnt = 1;
	LIST_INSERT_HEAD(&da->trie_hashtbl[hash & TRIE_HASH_MASK], tp,
	    td_hash_le);
	memcpy(&da->x_tbl[tp->td_index << dxr_x],
	    &da->direct_tbl[index << dxr_x], sizeof(*da->x_tbl) << dxr_x);
	da->trietbl[index] = tp;
	if (da->all_trie_cnt >= da->xtbl_size >> dxr_x) {
		da->xtbl_size += XTBL_SIZE_INCR;
		da->x_tbl = realloc(da->x_tbl,
		    sizeof(*da->x_tbl) * da->xtbl_size, M_DXRAUX, M_NOWAIT);
		if (da->x_tbl == NULL) {
			FIB_PRINTF(LOG_NOTICE, da->fd,
			    "Unable to allocate DXR extension table");
			return (-1);
		}
	}
	return (tp->td_index);
}

static void
trie_unref(struct dxr_aux *da, uint32_t index)
{
	struct trie_desc *tp = da->trietbl[index];

	if (tp == NULL)
		return;
	da->trietbl[index] = NULL;
	if (--tp->td_refcnt > 0)
		return;

	LIST_REMOVE(tp, td_hash_le);
	da->unused_trie_cnt++;
	if (tp->td_index != da->all_trie_cnt - 1) {
		LIST_INSERT_HEAD(&da->unused_trie, tp, td_hash_le);
		return;
	}

	do {
		da->all_trie_cnt--;
		da->unused_trie_cnt--;
		LIST_REMOVE(tp, td_all_le);
		uma_zfree(trie_zone, tp);
		LIST_FOREACH(tp, &da->unused_trie, td_hash_le)
			if (tp->td_index == da->all_trie_cnt - 1) {
				LIST_REMOVE(tp, td_hash_le);
				break;
			}
	} while (tp != NULL);
}

static void
heap_inject(struct dxr_aux *da, uint32_t start, uint32_t end, uint32_t preflen,
    uint32_t nh)
{
	struct heap_entry *fhp;
	int i;

	for (i = da->heap_index; i >= 0; i--) {
		if (preflen > da->heap[i].preflen)
			break;
		else if (preflen < da->heap[i].preflen)
			da->heap[i + 1] = da->heap[i];
		else
			return;
	}

	fhp = &da->heap[i + 1];
	fhp->preflen = preflen;
	fhp->start = start;
	fhp->end = end;
	fhp->nexthop = nh;
	da->heap_index++;
}

static int
dxr_walk(struct rtentry *rt, void *arg)
{
	struct dxr_aux *da = arg;
	uint32_t chunk = da->work_chunk;
	uint32_t first = chunk << DXR_RANGE_SHIFT;
	uint32_t last = first | DXR_RANGE_MASK;
	struct range_entry_long *fp =
	    &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re;
	struct heap_entry *fhp = &da->heap[da->heap_index];
	uint32_t preflen, nh, start, end, scopeid;
	struct in_addr addr;

	rt_get_inet_prefix_plen(rt, &addr, &preflen, &scopeid);
	start = ntohl(addr.s_addr);
	if (start > last)
		return (-1);	/* Beyond chunk boundaries, we are done */
	if (start < first)
		return (0);	/* Skip this route */

	end = start;
	if (preflen < 32)
		end |= (0xffffffffU >> preflen);
	nh = fib_get_nhop_idx(da->fd, rt_get_raw_nhop(rt));

	if (start == fhp->start)
		heap_inject(da, start, end, preflen, nh);
	else {
		/* start > fhp->start */
		while (start > fhp->end) {
			uint32_t oend = fhp->end;

			if (da->heap_index > 0) {
				fhp--;
				da->heap_index--;
			} else
				initheap(da, fhp->end + 1, chunk);
			if (fhp->end > oend && fhp->nexthop != fp->nexthop) {
				fp++;
				da->rtbl_work_frags++;
				fp->start = (oend + 1) & DXR_RANGE_MASK;
				fp->nexthop = fhp->nexthop;
			}
		}
		if (start > ((chunk << DXR_RANGE_SHIFT) | fp->start) &&
		    nh != fp->nexthop) {
			fp++;
			da->rtbl_work_frags++;
			fp->start = start & DXR_RANGE_MASK;
		} else if (da->rtbl_work_frags) {
			if ((--fp)->nexthop == nh)
				da->rtbl_work_frags--;
			else
				fp++;
		}
		fp->nexthop = nh;
		heap_inject(da, start, end, preflen, nh);
	}

	return (0);
}

static int
update_chunk(struct dxr_aux *da, uint32_t chunk)
{
	struct range_entry_long *fp;
#if DXR_TRIE_BITS < 24
	struct range_entry_short *fps;
	uint32_t start, nh, i;
#endif
	struct heap_entry *fhp;
	uint32_t first = chunk << DXR_RANGE_SHIFT;
	uint32_t last = first | DXR_RANGE_MASK;

	if (da->direct_tbl[chunk].fragments != FRAGS_MARK_HIT)
		chunk_unref(da, chunk);

	initheap(da, first, chunk);

	fp = &da->range_tbl[da->rtbl_top].re;
	da->rtbl_work_frags = 0;
	fp->start = first & DXR_RANGE_MASK;
	fp->nexthop = da->heap[0].nexthop;

	da->dst.sin_addr.s_addr = htonl(first);
	da->mask.sin_addr.s_addr = htonl(~DXR_RANGE_MASK);

	da->work_chunk = chunk;
	rib_walk_from(da->fibnum, AF_INET, RIB_FLAG_LOCKED,
	    (struct sockaddr *) &da->dst, (struct sockaddr *) &da->mask,
	    dxr_walk, da);

	/* Flush any remaining objects on the heap */
	fp = &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re;
	fhp = &da->heap[da->heap_index];
	while (fhp->preflen > DXR_TRIE_BITS) {
		uint32_t oend = fhp->end;

		if (da->heap_index > 0) {
			fhp--;
			da->heap_index--;
		} else
			initheap(da, fhp->end + 1, chunk);
		if (fhp->end > oend && fhp->nexthop != fp->nexthop) {
			/* Have we crossed the upper chunk boundary? */
			if (oend >= last)
				break;
			fp++;
			da->rtbl_work_frags++;
			fp->start = (oend + 1) & DXR_RANGE_MASK;
			fp->nexthop = fhp->nexthop;
		}
	}

	/* Direct hit if the chunk contains only a single fragment */
	if (da->rtbl_work_frags == 0) {
		da->direct_tbl[chunk].base = fp->nexthop;
		da->direct_tbl[chunk].fragments = FRAGS_MARK_HIT;
		return (0);
	}

	da->direct_tbl[chunk].base = da->rtbl_top;
	da->direct_tbl[chunk].fragments = da->rtbl_work_frags;

#if DXR_TRIE_BITS < 24
	/* Check whether the chunk can be more compactly encoded */
	fp = &da->range_tbl[da->rtbl_top].re;
	for (i = 0; i <= da->rtbl_work_frags; i++, fp++)
		if ((fp->start & 0xff) != 0 || fp->nexthop > RE_SHORT_MAX_NH)
			break;
	if (i == da->rtbl_work_frags + 1) {
		fp = &da->range_tbl[da->rtbl_top].re;
		fps = (void *) fp;
		for (i = 0; i <= da->rtbl_work_frags; i++, fp++, fps++) {
			start = fp->start;
			nh = fp->nexthop;
			fps->start = start >> 8;
			fps->nexthop = nh;
		}
		fps->start = start >> 8;
		fps->nexthop = nh;
		da->rtbl_work_frags >>= 1;
		da->direct_tbl[chunk].fragments =
		    da->rtbl_work_frags | FRAGS_PREF_SHORT;
	} else
#endif
	if (da->rtbl_work_frags >= FRAGS_MARK_HIT) {
		da->direct_tbl[chunk].fragments = FRAGS_MARK_XL;
		memmove(&da->range_tbl[da->rtbl_top + 1],
		    &da->range_tbl[da->rtbl_top],
		    (da->rtbl_work_frags + 1) * sizeof(*da->range_tbl));
		da->range_tbl[da->rtbl_top].fragments = da->rtbl_work_frags;
		da->rtbl_work_frags++;
	}
	da->rtbl_top += (da->rtbl_work_frags + 1);
	return (chunk_ref(da, chunk));
}

static void
dxr_build(struct dxr *dxr)
{
	struct dxr_aux *da = dxr->aux;
	struct chunk_desc *cdp;
	struct rib_rtable_info rinfo;
	struct timeval t0, t1, t2, t3;
	uint32_t r_size, dxr_tot_size;
	uint32_t i, m, range_rebuild = 0;
	uint32_t range_frag;
	struct trie_desc *tp;
	uint32_t d_tbl_size, dxr_x, d_size, x_size;
	int ti;		/* must be signed: trie_ref() returns -1 on failure */
	uint32_t trie_rebuild = 0, prev_size = 0;
	uint32_t trie_frag;

	MPASS(dxr->d == NULL);

	if (da == NULL) {
		da = malloc(sizeof(*dxr->aux), M_DXRAUX, M_NOWAIT);
		if (da == NULL) {
			FIB_PRINTF(LOG_NOTICE, dxr->fd,
			    "Unable to allocate DXR aux struct");
			return;
		}
		dxr->aux = da;
		da->fibnum = dxr->fibnum;
		da->fd = dxr->fd;
		da->refcnt = 1;
		LIST_INIT(&da->all_chunks);
		LIST_INIT(&da->all_trie);
		da->rtbl_size = RTBL_SIZE_INCR;
		da->range_tbl = NULL;
		da->xtbl_size = XTBL_SIZE_INCR;
		da->x_tbl = NULL;
		bzero(&da->dst, sizeof(da->dst));
		bzero(&da->mask, sizeof(da->mask));
		da->dst.sin_len = sizeof(da->dst);
		da->mask.sin_len = sizeof(da->mask);
		da->dst.sin_family = AF_INET;
		da->mask.sin_family = AF_INET;
	}
	if (da->range_tbl == NULL) {
		da->range_tbl = malloc(sizeof(*da->range_tbl) * da->rtbl_size
		    + FRAGS_PREF_SHORT, M_DXRAUX, M_NOWAIT);
		if (da->range_tbl == NULL) {
			FIB_PRINTF(LOG_NOTICE, da->fd,
			    "Unable to allocate DXR range table");
			return;
		}
		range_rebuild = 1;
	}
	if (da->x_tbl == NULL) {
		da->x_tbl = malloc(sizeof(*da->x_tbl) * da->xtbl_size,
		    M_DXRAUX, M_NOWAIT);
		if (da->x_tbl == NULL) {
			FIB_PRINTF(LOG_NOTICE, da->fd,
			    "Unable to allocate DXR extension table");
			return;
		}
		trie_rebuild = 1;
	}

	microuptime(&t0);

	dxr->nh_tbl = fib_get_nhop_array(da->fd);
	fib_get_rtable_info(fib_get_rh(da->fd), &rinfo);

	if (da->updates_low > da->updates_high)
		range_rebuild = 1;

range_build:
	if (range_rebuild) {
		/* Bulk cleanup */
		bzero(da->chunk_hashtbl, sizeof(da->chunk_hashtbl));
		while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) {
			LIST_REMOVE(cdp, cd_all_le);
			uma_zfree(chunk_zone, cdp);
		}
		for (i = 0; i < UNUSED_BUCKETS; i++)
			LIST_INIT(&da->unused_chunks[i]);
		da->unused_chunks_size = 0;
		da->rtbl_top = 0;
		da->updates_low = 0;
		da->updates_high = DIRECT_TBL_SIZE - 1;
		memset(da->updates_mask, 0xff, sizeof(da->updates_mask));
		for (i = 0; i < DIRECT_TBL_SIZE; i++) {
			da->direct_tbl[i].fragments = FRAGS_MARK_HIT;
			da->direct_tbl[i].base = 0;
		}
	}
	da->prefixes = rinfo.num_prefixes;

	/* DXR: construct direct & range table */
	for (i = da->updates_low; i <= da->updates_high; i++) {
		m = da->updates_mask[i >> 5] >> (i & 0x1f);
		if (m == 0)
			i |= 0x1f;
		else if (m & 1 && update_chunk(da, i) != 0)
			return;
	}

	range_frag = 0;
	if (da->rtbl_top)
		range_frag = da->unused_chunks_size * 10000ULL / da->rtbl_top;
	if (range_frag > V_frag_limit) {
		range_rebuild = 1;
		goto range_build;
	}

	r_size = sizeof(*da->range_tbl) * da->rtbl_top;
	microuptime(&t1);

	if (range_rebuild ||
	    abs(fls(da->prefixes) - fls(da->trie_rebuilt_prefixes)) > 1)
		trie_rebuild = 1;

trie_build:
	if (trie_rebuild) {
		da->trie_rebuilt_prefixes = da->prefixes;
		da->d_bits = DXR_D;
		da->updates_low = 0;
		da->updates_high = DIRECT_TBL_SIZE - 1;
		if (!range_rebuild)
			memset(da->updates_mask, 0xff,
			    sizeof(da->updates_mask));
	}

dxr2_try_squeeze:
	if (trie_rebuild) {
		/* Bulk cleanup */
		bzero(da->trietbl, sizeof(da->trietbl));
		bzero(da->trie_hashtbl, sizeof(da->trie_hashtbl));
		while ((tp = LIST_FIRST(&da->all_trie)) != NULL) {
			LIST_REMOVE(tp, td_all_le);
			uma_zfree(trie_zone, tp);
		}
		LIST_INIT(&da->unused_trie);
		da->all_trie_cnt = da->unused_trie_cnt = 0;
	}

	/* Populate d_tbl, x_tbl */
	dxr_x = DXR_TRIE_BITS - da->d_bits;
	d_tbl_size = (1 << da->d_bits);

	for (i = da->updates_low >> dxr_x; i <= da->updates_high >> dxr_x;
	    i++) {
		if (!trie_rebuild) {
			m = 0;
			for (int j = 0; j < (1 << dxr_x); j += 32)
				m |= da->updates_mask[((i << dxr_x) + j) >> 5];
			if (m == 0)
				continue;
			trie_unref(da, i);
		}
		ti = trie_ref(da, i);
		if (ti < 0)
			return;
		da->d_tbl[i] = ti;
	}

	trie_frag = 0;
	if (da->all_trie_cnt)
		trie_frag = da->unused_trie_cnt * 10000ULL / da->all_trie_cnt;
	if (trie_frag > V_frag_limit) {
		trie_rebuild = 1;
		goto trie_build;
	}

	d_size = sizeof(*da->d_tbl) * d_tbl_size;
	x_size = sizeof(*da->x_tbl) * DIRECT_TBL_SIZE / d_tbl_size
	    * da->all_trie_cnt;
	dxr_tot_size = d_size + x_size + r_size;

	if (trie_rebuild == 1) {
		/* Try to find a more compact D/X split */
		if (prev_size == 0 || dxr_tot_size <= prev_size)
			da->d_bits--;
		else {
			da->d_bits++;
			trie_rebuild = 2;
		}
		prev_size = dxr_tot_size;
		goto dxr2_try_squeeze;
	}
	microuptime(&t2);

	dxr->d = malloc(dxr_tot_size, M_DXRLPM, M_NOWAIT);
	if (dxr->d == NULL) {
		FIB_PRINTF(LOG_NOTICE, da->fd,
		    "Unable to allocate DXR lookup table");
		return;
	}
	memcpy(dxr->d, da->d_tbl, d_size);
	dxr->x = ((char *) dxr->d) + d_size;
	memcpy(dxr->x, da->x_tbl, x_size);
	dxr->r = ((char *) dxr->x) + x_size;
	dxr->d_shift = 32 - da->d_bits;
	dxr->x_shift = dxr_x;
	dxr->x_mask = 0xffffffffU >> (32 - dxr_x);
	memcpy(dxr->r, da->range_tbl, r_size);

	if (da->updates_low <= da->updates_high)
		bzero(&da->updates_mask[da->updates_low / 32],
		    (da->updates_high - da->updates_low) / 8 + 1);
	da->updates_low = DIRECT_TBL_SIZE - 1;
	da->updates_high = 0;
	microuptime(&t3);

	FIB_PRINTF(LOG_INFO, da->fd, "D%dX%dR, %d prefixes, %d nhops (max)",
	    da->d_bits, dxr_x, rinfo.num_prefixes, rinfo.num_nhops);
	i = dxr_tot_size * 100;
	if (rinfo.num_prefixes)
		i /= rinfo.num_prefixes;
	FIB_PRINTF(LOG_INFO, da->fd, "%d.%02d KBytes, %d.%02d Bytes/prefix",
	    dxr_tot_size / 1024, dxr_tot_size * 100 / 1024 % 100,
	    i / 100, i % 100);
	FIB_PRINTF(LOG_INFO, da->fd,
	    "%d.%02d%% trie, %d.%02d%% range fragmentation",
	    trie_frag / 100, trie_frag % 100,
	    range_frag / 100, range_frag % 100);
	i = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
	FIB_PRINTF(LOG_INFO, da->fd, "range table %s in %u.%03u ms",
	    range_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000);
	i = (t2.tv_sec - t1.tv_sec) * 1000000 + t2.tv_usec - t1.tv_usec;
	FIB_PRINTF(LOG_INFO, da->fd, "trie %s in %u.%03u ms",
	    trie_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000);
	i = (t3.tv_sec - t2.tv_sec) * 1000000 + t3.tv_usec - t2.tv_usec;
	FIB_PRINTF(LOG_INFO, da->fd, "snapshot forked in %u.%03u ms",
	    i / 1000, i % 1000);
}
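/*
 * Illustrative note (editorial, not part of the original sources):
 * dxr_build() searches for the most compact D/X split by repeatedly
 * shrinking d_bits from DXR_D and re-running the dxr2_try_squeeze pass
 * while the total size keeps improving.  With the defaults above
 * (DXR_TRIE_BITS = 20, DXR_D = 16) the candidate layouts are D16X4R,
 * D15X5R, ... and for each one
 *
 *	d_size = sizeof(uint16_t) * (1 << d_bits)
 *	x_size = sizeof(struct direct_entry) * (1 << (20 - d_bits))
 *	    * all_trie_cnt
 *
 * so a smaller d_bits trades a smaller direct table against potentially
 * more distinct extension-table chunks.  The arithmetic mirrors the
 * d_size/x_size expressions in dxr_build() above; the concrete bit widths
 * are assumptions based on the defaults in this file.
 */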
/*
 * Glue functions for attaching to the FIB_ALGO infrastructure.
 */

static struct nhop_object *
dxr_fib_lookup(void *algo_data, const struct flm_lookup_key key,
    uint32_t scopeid)
{
	struct dxr *dxr = algo_data;

	return (dxr->nh_tbl[dxr_lookup(dxr, ntohl(key.addr4.s_addr))]);
}

static enum flm_op_result
dxr_init(uint32_t fibnum, struct fib_data *fd, void *old_data, void **data)
{
	struct dxr *old_dxr = old_data;
	struct dxr_aux *da = NULL;
	struct dxr *dxr;

	dxr = malloc(sizeof(*dxr), M_DXRAUX, M_NOWAIT);
	if (dxr == NULL) {
		FIB_PRINTF(LOG_NOTICE, fd,
		    "Unable to allocate DXR container struct");
		return (FLM_REBUILD);
	}

	/* Check whether we may reuse the old auxiliary structures */
	if (old_dxr != NULL && old_dxr->aux != NULL &&
	    old_dxr->aux->fd == fd) {
		da = old_dxr->aux;
		atomic_add_int(&da->refcnt, 1);
	}

	dxr->aux = da;
	dxr->d = NULL;
	dxr->fd = fd;
	dxr->fibnum = fibnum;
	*data = dxr;

	return (FLM_SUCCESS);
}

static void
dxr_destroy(void *data)
{
	struct dxr *dxr = data;
	struct dxr_aux *da = dxr->aux;
	struct chunk_desc *cdp;
	struct trie_desc *tp;

	free(dxr->d, M_DXRLPM);
	free(dxr, M_DXRAUX);

	if (da == NULL || atomic_fetchadd_int(&da->refcnt, -1) > 1)
		return;

	/* Release auxiliary structures */
	while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) {
		LIST_REMOVE(cdp, cd_all_le);
		uma_zfree(chunk_zone, cdp);
	}
	while ((tp = LIST_FIRST(&da->all_trie)) != NULL) {
		LIST_REMOVE(tp, td_all_le);
		uma_zfree(trie_zone, tp);
	}
	free(da->range_tbl, M_DXRAUX);
	free(da->x_tbl, M_DXRAUX);
	free(da, M_DXRAUX);
}

static void
epoch_dxr_destroy(epoch_context_t ctx)
{
	struct dxr *dxr = __containerof(ctx, struct dxr, epoch_ctx);

	dxr_destroy(dxr);
}

static void *
choose_lookup_fn(struct dxr_aux *da)
{

	switch (da->d_bits) {
#if DXR_TRIE_BITS > 16
	case 16:
		return (dxr_fib_lookup_16);
#endif
	case 15:
		return (dxr_fib_lookup_15);
	case 14:
		return (dxr_fib_lookup_14);
	case 13:
		return (dxr_fib_lookup_13);
	case 12:
		return (dxr_fib_lookup_12);
	case 11:
		return (dxr_fib_lookup_11);
	case 10:
		return (dxr_fib_lookup_10);
	case 9:
		return (dxr_fib_lookup_9);
	}
	return (dxr_fib_lookup);
}

static enum flm_op_result
dxr_dump_end(void *data, struct fib_dp *dp)
{
	struct dxr *dxr = data;
	struct dxr_aux *da;

	dxr_build(dxr);

	da = dxr->aux;
	if (da == NULL || dxr->d == NULL)
		return (FLM_REBUILD);

	if (da->rtbl_top >= BASE_MAX)
		return (FLM_ERROR);

	dp->f = choose_lookup_fn(da);
	dp->arg = dxr;

	return (FLM_SUCCESS);
}

static enum flm_op_result
dxr_dump_rib_item(struct rtentry *rt, void *data)
{

	return (FLM_SUCCESS);
}

static enum flm_op_result
dxr_change_rib_item(struct rib_head *rnh, struct rib_cmd_info *rc,
    void *data)
{

	return (FLM_BATCH);
}
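/*
 * Illustrative note (editorial, not part of the original sources): the
 * batch handler below only marks the affected direct-table chunks dirty in
 * updates_mask and defers the real work to dxr_build().  With the default
 * DXR_RANGE_SHIFT of 12, a changed 10.20.30.0/24 prefix yields
 *
 *	hmask = 0x000000ff
 *	start = end = 0x0a141	(a single dirty chunk)
 *
 * while a /16 change would dirty 16 consecutive chunks.  The address and
 * prefix length are made-up values used only to show the arithmetic.
 */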
static enum flm_op_result
dxr_change_rib_batch(struct rib_head *rnh, struct fib_change_queue *q,
    void *data)
{
	struct dxr *dxr = data;
	struct dxr *new_dxr;
	struct dxr_aux *da;
	struct fib_dp new_dp;
	enum flm_op_result res;
	uint32_t ip, plen, hmask, start, end, i, ui;
#ifdef INVARIANTS
	struct rib_rtable_info rinfo;
	int update_delta = 0;
#endif

	MPASS(data != NULL);
	MPASS(q != NULL);
	MPASS(q->count < q->size);

	da = dxr->aux;
	MPASS(da != NULL);
	MPASS(da->fd == dxr->fd);
	MPASS(da->refcnt > 0);

	FIB_PRINTF(LOG_INFO, da->fd, "processing %d update(s)", q->count);
	for (ui = 0; ui < q->count; ui++) {
#ifdef INVARIANTS
		if (q->entries[ui].nh_new != NULL)
			update_delta++;
		if (q->entries[ui].nh_old != NULL)
			update_delta--;
#endif
		plen = q->entries[ui].plen;
		ip = ntohl(q->entries[ui].addr4.s_addr);
		if (plen < 32)
			hmask = 0xffffffffU >> plen;
		else
			hmask = 0;
		start = (ip & ~hmask) >> DXR_RANGE_SHIFT;
		end = (ip | hmask) >> DXR_RANGE_SHIFT;

		if ((start & 0x1f) == 0 && (end & 0x1f) == 0x1f)
			for (i = start >> 5; i <= end >> 5; i++)
				da->updates_mask[i] = 0xffffffffU;
		else
			for (i = start; i <= end; i++)
				da->updates_mask[i >> 5] |= (1 << (i & 0x1f));
		if (start < da->updates_low)
			da->updates_low = start;
		if (end > da->updates_high)
			da->updates_high = end;
	}

#ifdef INVARIANTS
	fib_get_rtable_info(fib_get_rh(da->fd), &rinfo);
	MPASS(da->prefixes + update_delta == rinfo.num_prefixes);
#endif

	res = dxr_init(0, dxr->fd, data, (void **) &new_dxr);
	if (res != FLM_SUCCESS)
		return (res);

	dxr_build(new_dxr);

	/* Structural limit exceeded, hard error */
	if (da->rtbl_top >= BASE_MAX) {
		dxr_destroy(new_dxr);
		return (FLM_ERROR);
	}

	if (new_dxr->d == NULL) {
		dxr_destroy(new_dxr);
		return (FLM_REBUILD);
	}

	new_dp.f = choose_lookup_fn(da);
	new_dp.arg = new_dxr;
	if (fib_set_datapath_ptr(dxr->fd, &new_dp)) {
		fib_set_algo_ptr(dxr->fd, new_dxr);
		fib_epoch_call(epoch_dxr_destroy, &dxr->epoch_ctx);
		return (FLM_SUCCESS);
	}

	FIB_PRINTF(LOG_NOTICE, dxr->fd, "fib_set_datapath_ptr() failed");
	dxr_destroy(new_dxr);
	return (FLM_REBUILD);
}

static uint8_t
dxr_get_pref(const struct rib_rtable_info *rinfo)
{

	/* Below bsearch4 up to 10 prefixes.  Always supersedes dpdk_lpm4. */
	return (251);
}

SYSCTL_DECL(_net_route_algo);
SYSCTL_NODE(_net_route_algo, OID_AUTO, dxr, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "DXR tunables");

static int
sysctl_dxr_frag_limit(SYSCTL_HANDLER_ARGS)
{
	char buf[8];
	int error, new, i;

	snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100,
	    V_frag_limit % 100);
	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (!isdigit(*buf) && *buf != '.')
		return (EINVAL);
	for (i = 0, new = 0; isdigit(buf[i]) && i < sizeof(buf); i++)
		new = new * 10 + buf[i] - '0';
	new *= 100;
	if (buf[i++] == '.') {
		if (!isdigit(buf[i]))
			return (EINVAL);
		new += (buf[i++] - '0') * 10;
		if (isdigit(buf[i]))
			new += buf[i++] - '0';
	}
	if (new > 1000)
		return (EINVAL);
	V_frag_limit = new;
	snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100,
	    V_frag_limit % 100);
	return (0);
}

SYSCTL_PROC(_net_route_algo_dxr, OID_AUTO, frag_limit,
    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_VNET,
    0, 0, sysctl_dxr_frag_limit, "A",
    "Fragmentation threshold to full rebuild");

static struct fib_lookup_module fib_dxr_mod = {
	.flm_name = "dxr",
	.flm_family = AF_INET,
	.flm_init_cb = dxr_init,
	.flm_destroy_cb = dxr_destroy,
	.flm_dump_rib_item_cb = dxr_dump_rib_item,
	.flm_dump_end_cb = dxr_dump_end,
	.flm_change_rib_item_cb = dxr_change_rib_item,
	.flm_change_rib_items_cb = dxr_change_rib_batch,
	.flm_get_pref = dxr_get_pref,
};

static int
dxr_modevent(module_t mod, int type, void *unused)
{
	int error;

	switch (type) {
	case MOD_LOAD:
		chunk_zone = uma_zcreate("dxr chunk", sizeof(struct chunk_desc),
		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
		trie_zone = uma_zcreate("dxr trie", sizeof(struct trie_desc),
		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
		fib_module_register(&fib_dxr_mod);
		return (0);
	case MOD_UNLOAD:
		error = fib_module_unregister(&fib_dxr_mod);
		if (error)
			return (error);
		uma_zdestroy(chunk_zone);
		uma_zdestroy(trie_zone);
		return (0);
	default:
		return (EOPNOTSUPP);
	}
}

static moduledata_t dxr_mod = {"fib_dxr", dxr_modevent, 0};

DECLARE_MODULE(fib_dxr, dxr_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
MODULE_VERSION(fib_dxr, 1);
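/*
 * Usage note (editorial, not part of the original sources): the
 * fragmentation threshold defined above is a per-VNET tunable parsed as a
 * percentage with two decimal places, so with the default V_frag_limit of
 * 100 a full rebuild is triggered above 1.00% fragmentation.  An example
 * invocation, using the sysctl tree declared in this file:
 *
 *	sysctl net.route.algo.dxr.frag_limit=0.25
 *
 * Selecting the lookup algorithm itself (e.g. setting the inet algorithm
 * to "dxr") is handled by the generic FIB_ALGO code, not by this file.
 */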