/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2012-2022 Marko Zec
 * Copyright (c) 2005, 2018 University of Zagreb
 * Copyright (c) 2005 International Computer Science Institute
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * An implementation of DXR, a simple IPv4 LPM scheme with compact lookup
 * structures and a trivial search procedure.  More significant bits of
 * the search key are used to directly index a two-stage trie, while the
 * remaining bits are used for finding the next hop in a sorted array.
 * More details in:
 *
 * M. Zec, L. Rizzo, M. Mikuc, DXR: towards a billion routing lookups per
 * second in software, ACM SIGCOMM Computer Communication Review, September
 * 2012
 *
 * M. Zec, M. Mikuc, Pushing the envelope: beyond two billion IP routing
 * lookups per second on commodity CPUs, IEEE SoftCOM, September 2017, Split
 */
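
/*
 * Worked example: with DXR_TRIE_BITS at its default of 20, a destination
 * such as 192.0.2.1 (0xc0000201) splits into a 20-bit trie index (0xc0000)
 * and a 12-bit residue (0x201).  The residue is consulted only when the
 * indexed direct_entry does not already encode a next hop, in which case it
 * is binary-searched against that chunk's sorted range fragments.
 */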

#include <sys/cdefs.h>
#include "opt_inet.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/ctype.h>
#include <sys/epoch.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>

#include <vm/uma.h>

#include <netinet/in.h>
#include <netinet/in_fib.h>

#include <net/route.h>
#include <net/route/route_ctl.h>
#include <net/route/fib_algo.h>

#define	DXR_TRIE_BITS		20

CTASSERT(DXR_TRIE_BITS >= 16 && DXR_TRIE_BITS <= 24);

/* DXR2: two-stage primary trie, instead of a single direct lookup table */
#define	DXR2

#if DXR_TRIE_BITS > 16
#define	DXR_D			16
#else
#define	DXR_D			(DXR_TRIE_BITS - 1)
#endif

#define	D_TBL_SIZE		(1 << DXR_D)
#define	DIRECT_TBL_SIZE		(1 << DXR_TRIE_BITS)
#define	DXR_RANGE_MASK		(0xffffffffU >> DXR_TRIE_BITS)
#define	DXR_RANGE_SHIFT		(32 - DXR_TRIE_BITS)

#define	DESC_BASE_BITS		22
#define	DESC_FRAGMENTS_BITS	(32 - DESC_BASE_BITS)
#define	BASE_MAX		((1 << DESC_BASE_BITS) - 1)
#define	RTBL_SIZE_INCR		(BASE_MAX / 64)

#if DXR_TRIE_BITS < 24
#define	FRAGS_MASK_SHORT	((1 << (23 - DXR_TRIE_BITS)) - 1)
#else
#define	FRAGS_MASK_SHORT	0
#endif
#define	FRAGS_PREF_SHORT	(((1 << DESC_FRAGMENTS_BITS) - 1) & \
				 ~FRAGS_MASK_SHORT)
#define	FRAGS_MARK_XL		(FRAGS_PREF_SHORT - 1)
#define	FRAGS_MARK_HIT		(FRAGS_PREF_SHORT - 2)

#define	IS_SHORT_FORMAT(x)	((x & FRAGS_PREF_SHORT) == FRAGS_PREF_SHORT)
#define	IS_LONG_FORMAT(x)	((x & FRAGS_PREF_SHORT) != FRAGS_PREF_SHORT)
#define	IS_XL_FORMAT(x)		(x == FRAGS_MARK_XL)

#define	RE_SHORT_MAX_NH		((1 << (DXR_TRIE_BITS - 8)) - 1)

#define	CHUNK_HASH_BITS		16
#define	CHUNK_HASH_SIZE		(1 << CHUNK_HASH_BITS)
#define	CHUNK_HASH_MASK		(CHUNK_HASH_SIZE - 1)

#define	TRIE_HASH_BITS		16
#define	TRIE_HASH_SIZE		(1 << TRIE_HASH_BITS)
#define	TRIE_HASH_MASK		(TRIE_HASH_SIZE - 1)

#define	XTBL_SIZE_INCR		(DIRECT_TBL_SIZE / 16)

#define	UNUSED_BUCKETS		8

/* Lookup structure elements */

struct direct_entry {
	uint32_t		fragments: DESC_FRAGMENTS_BITS,
				base: DESC_BASE_BITS;
};

struct range_entry_long {
	uint32_t		start: DXR_RANGE_SHIFT,
				nexthop: DXR_TRIE_BITS;
};

#if DXR_TRIE_BITS < 24
struct range_entry_short {
	uint16_t		start: DXR_RANGE_SHIFT - 8,
				nexthop: DXR_TRIE_BITS - 8;
};
#endif
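
/*
 * Note on the two range entry layouts above: with DXR_TRIE_BITS == 20 a
 * long entry packs a 12-bit fragment start and a 20-bit nexthop index into
 * 4 bytes, while a short entry drops the low 8 bits of the start and
 * narrows the nexthop to 12 bits, fitting into 2 bytes.  A chunk is
 * eligible for the short form only when all of its fragments begin on
 * 256-address boundaries and no nexthop index exceeds RE_SHORT_MAX_NH;
 * update_chunk() checks exactly that before compacting a chunk.
 */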

/* Auxiliary structures */

struct heap_entry {
	uint32_t		start;
	uint32_t		end;
	uint32_t		preflen;
	uint32_t		nexthop;
};

struct chunk_desc {
	LIST_ENTRY(chunk_desc)	cd_all_le;
	LIST_ENTRY(chunk_desc)	cd_hash_le;
	uint32_t		cd_hash;
	uint32_t		cd_refcnt;
	uint32_t		cd_base;
	uint32_t		cd_cur_size;
	uint32_t		cd_max_size;
};

struct trie_desc {
	LIST_ENTRY(trie_desc)	td_all_le;
	LIST_ENTRY(trie_desc)	td_hash_le;
	uint32_t		td_hash;
	uint32_t		td_index;
	uint32_t		td_refcnt;
};

struct dxr_aux {
	/* Glue to external state */
	struct fib_data		*fd;
	uint32_t		fibnum;
	int			refcnt;

	/* Auxiliary build-time tables */
	struct direct_entry	direct_tbl[DIRECT_TBL_SIZE];
	uint16_t		d_tbl[D_TBL_SIZE];
	struct direct_entry	*x_tbl;
	union {
		struct range_entry_long	re;
		uint32_t		fragments;
	}			*range_tbl;

	/* Auxiliary internal state */
	uint32_t		updates_mask[DIRECT_TBL_SIZE / 32];
	struct trie_desc	*trietbl[D_TBL_SIZE];
	LIST_HEAD(, chunk_desc)	chunk_hashtbl[CHUNK_HASH_SIZE];
	LIST_HEAD(, chunk_desc)	all_chunks;
	LIST_HEAD(, chunk_desc)	unused_chunks[UNUSED_BUCKETS];
	LIST_HEAD(, trie_desc)	trie_hashtbl[TRIE_HASH_SIZE];
	LIST_HEAD(, trie_desc)	all_trie;
	LIST_HEAD(, trie_desc)	unused_trie;	/* abuses hash link entry */
	struct sockaddr_in	dst;
	struct sockaddr_in	mask;
	struct heap_entry	heap[33];
	uint32_t		prefixes;
	uint32_t		updates_low;
	uint32_t		updates_high;
	uint32_t		unused_chunks_size;
	uint32_t		xtbl_size;
	uint32_t		all_trie_cnt;
	uint32_t		unused_trie_cnt;
	uint32_t		trie_rebuilt_prefixes;
	uint32_t		heap_index;
	uint32_t		d_bits;
	uint32_t		rtbl_size;
	uint32_t		rtbl_top;
	uint32_t		rtbl_work_frags;
	uint32_t		work_chunk;
};

/* Main lookup structure container */

struct dxr {
	/* Lookup tables */
	void			*d;
	void			*x;
	void			*r;
	struct nhop_object	**nh_tbl;

	/* Glue to external state */
	struct dxr_aux		*aux;
	struct fib_data		*fd;
	struct epoch_context	epoch_ctx;
	uint32_t		fibnum;
	uint16_t		d_shift;
	uint16_t		x_shift;
	uint32_t		x_mask;
};

static MALLOC_DEFINE(M_DXRLPM, "dxr", "DXR LPM");
static MALLOC_DEFINE(M_DXRAUX, "dxr aux", "DXR auxiliary");

uma_zone_t chunk_zone;
uma_zone_t trie_zone;

VNET_DEFINE_STATIC(int, frag_limit) = 100;
#define	V_frag_limit	VNET(frag_limit)

/* Binary search for a matching address range */
#define	DXR_LOOKUP_STAGE					\
	if (masked_dst < range[middle].start) {			\
		upperbound = middle;				\
		middle = (middle + lowerbound) / 2;		\
	} else if (masked_dst < range[middle + 1].start)	\
		return (range[middle].nexthop);			\
	else {							\
		lowerbound = middle + 1;			\
		middle = (upperbound + middle + 1) / 2;		\
	}							\
	if (upperbound == lowerbound)				\
		return (range[lowerbound].nexthop);

static int
range_lookup(struct range_entry_long *rt, struct direct_entry de, uint32_t dst)
{
	uint32_t base;
	uint32_t upperbound;
	uint32_t middle;
	uint32_t lowerbound;
	uint32_t masked_dst;

	base = de.base;
	lowerbound = 0;
	masked_dst = dst & DXR_RANGE_MASK;

#if DXR_TRIE_BITS < 24
	if (__predict_true(IS_SHORT_FORMAT(de.fragments))) {
		upperbound = de.fragments & FRAGS_MASK_SHORT;
		struct range_entry_short *range =
		    (struct range_entry_short *) &rt[base];

		masked_dst >>= 8;
		middle = upperbound;
		upperbound = upperbound * 2 + 1;

		for (;;) {
			DXR_LOOKUP_STAGE
			DXR_LOOKUP_STAGE
		}
	}
#endif

	upperbound = de.fragments;
	middle = upperbound / 2;
	struct range_entry_long *range = &rt[base];
	if (__predict_false(IS_XL_FORMAT(de.fragments))) {
		upperbound = *((uint32_t *) range);
		range++;
		middle = upperbound / 2;
	}

	for (;;) {
		DXR_LOOKUP_STAGE
		DXR_LOOKUP_STAGE
	}
}
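
/*
 * DXR_LOOKUP_STAGE assumes a range[] array sorted by ascending start and
 * maintains lowerbound <= middle < upperbound.  Each pass of the for (;;)
 * loops in range_lookup() expands the macro twice, i.e. performs two
 * binary-search steps per iteration, presumably to trim loop overhead;
 * every step may return the matching nexthop directly, so the loops need
 * no separate exit condition.
 */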

#define	DXR_LOOKUP_DEFINE(D)						\
	static int inline						\
	dxr_lookup_##D(struct dxr *dxr, uint32_t dst)			\
	{								\
		struct direct_entry de;					\
		uint16_t *dt = dxr->d;					\
		struct direct_entry *xt = dxr->x;			\
									\
		de = xt[(dt[dst >> (32 - (D))] << (DXR_TRIE_BITS - (D)))\
		    + ((dst >> DXR_RANGE_SHIFT) &			\
		    (0xffffffffU >> (32 - DXR_TRIE_BITS + (D))))];	\
		if (__predict_true(de.fragments == FRAGS_MARK_HIT))	\
			return (de.base);				\
		return (range_lookup(dxr->r, de, dst));			\
	}								\
									\
	static struct nhop_object *					\
	dxr_fib_lookup_##D(void *algo_data,				\
	    const struct flm_lookup_key key, uint32_t scopeid __unused)\
	{								\
		struct dxr *dxr = algo_data;				\
									\
		return (dxr->nh_tbl[dxr_lookup_##D(dxr,			\
		    ntohl(key.addr4.s_addr))]);				\
	}

#ifdef DXR2
#if DXR_TRIE_BITS > 16
DXR_LOOKUP_DEFINE(16)
#endif
DXR_LOOKUP_DEFINE(15)
DXR_LOOKUP_DEFINE(14)
DXR_LOOKUP_DEFINE(13)
DXR_LOOKUP_DEFINE(12)
DXR_LOOKUP_DEFINE(11)
DXR_LOOKUP_DEFINE(10)
DXR_LOOKUP_DEFINE(9)
#endif /* DXR2 */

static int inline
dxr_lookup(struct dxr *dxr, uint32_t dst)
{
	struct direct_entry de;
#ifdef DXR2
	uint16_t *dt = dxr->d;
	struct direct_entry *xt = dxr->x;

	de = xt[(dt[dst >> dxr->d_shift] << dxr->x_shift) +
	    ((dst >> DXR_RANGE_SHIFT) & dxr->x_mask)];
#else /* !DXR2 */
	struct direct_entry *dt = dxr->d;

	de = dt[dst >> DXR_RANGE_SHIFT];
#endif /* !DXR2 */
	if (__predict_true(de.fragments == FRAGS_MARK_HIT))
		return (de.base);
	return (range_lookup(dxr->r, de, dst));
}

static void
initheap(struct dxr_aux *da, uint32_t dst_u32, uint32_t chunk)
{
	struct heap_entry *fhp = &da->heap[0];
	struct rtentry *rt;
	struct route_nhop_data rnd;

	da->heap_index = 0;
	da->dst.sin_addr.s_addr = htonl(dst_u32);
	rt = fib4_lookup_rt(da->fibnum, da->dst.sin_addr, 0, NHR_UNLOCKED,
	    &rnd);
	if (rt != NULL) {
		struct in_addr addr;
		uint32_t scopeid;

		rt_get_inet_prefix_plen(rt, &addr, &fhp->preflen, &scopeid);
		fhp->start = ntohl(addr.s_addr);
		fhp->end = fhp->start;
		if (fhp->preflen < 32)
			fhp->end |= (0xffffffffU >> fhp->preflen);
		fhp->nexthop = fib_get_nhop_idx(da->fd, rnd.rnd_nhop);
	} else {
		fhp->preflen = fhp->nexthop = fhp->start = 0;
		fhp->end = 0xffffffffU;
	}
}

static uint32_t
chunk_size(struct dxr_aux *da, struct direct_entry *fdesc)
{

	if (IS_SHORT_FORMAT(fdesc->fragments))
		return ((fdesc->fragments & FRAGS_MASK_SHORT) + 1);
	else if (IS_XL_FORMAT(fdesc->fragments))
		return (da->range_tbl[fdesc->base].fragments + 2);
	else /* if (IS_LONG_FORMAT(fdesc->fragments)) */
		return (fdesc->fragments + 1);
}

static uint32_t
chunk_hash(struct dxr_aux *da, struct direct_entry *fdesc)
{
	uint32_t size = chunk_size(da, fdesc);
	uint32_t *p = (uint32_t *) &da->range_tbl[fdesc->base];
	uint32_t *l = (uint32_t *) &da->range_tbl[fdesc->base + size];
	uint32_t hash = fdesc->fragments;

	for (; p < l; p++)
		hash = (hash << 7) + (hash >> 13) + *p;

	return (hash + (hash >> 16));
}
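
/*
 * Range chunks are deduplicated by content: chunk_hash() hashes the raw
 * 32-bit words of a freshly built chunk, and chunk_ref() then either
 * points the direct_entry at an identical existing chunk and bumps its
 * refcount, recycles a spare slot from the unused_chunks[] free lists, or
 * leaves the chunk where it was built, at the top of the range-table heap.
 * chunk_unref() reverses the process and merges adjacent empty slots so
 * the heap can shrink again.
 */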

static int
chunk_ref(struct dxr_aux *da, uint32_t chunk)
{
	struct direct_entry *fdesc = &da->direct_tbl[chunk];
	struct chunk_desc *cdp, *empty_cdp;
	uint32_t base = fdesc->base;
	uint32_t size = chunk_size(da, fdesc);
	uint32_t hash = chunk_hash(da, fdesc);
	int i;

	/* Find an existing descriptor */
	LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK],
	    cd_hash_le) {
		if (cdp->cd_hash != hash || cdp->cd_cur_size != size ||
		    memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base],
		    sizeof(struct range_entry_long) * size))
			continue;
		da->rtbl_top = fdesc->base;
		fdesc->base = cdp->cd_base;
		cdp->cd_refcnt++;
		return (0);
	}

	/* No matching chunks found.  Find an empty one to recycle. */
	for (cdp = NULL, i = size; cdp == NULL && i < UNUSED_BUCKETS; i++)
		cdp = LIST_FIRST(&da->unused_chunks[i]);

	if (cdp == NULL)
		LIST_FOREACH(empty_cdp, &da->unused_chunks[0], cd_hash_le)
			if (empty_cdp->cd_max_size >= size && (cdp == NULL ||
			    empty_cdp->cd_max_size < cdp->cd_max_size)) {
				cdp = empty_cdp;
				if (empty_cdp->cd_max_size == size)
					break;
			}

	if (cdp != NULL) {
		/* Copy from heap into the recycled chunk */
		bcopy(&da->range_tbl[fdesc->base], &da->range_tbl[cdp->cd_base],
		    size * sizeof(struct range_entry_long));
		fdesc->base = cdp->cd_base;
		da->rtbl_top -= size;
		da->unused_chunks_size -= cdp->cd_max_size;
		if (cdp->cd_max_size > size) {
			/* Split the range in two, need a new descriptor */
			empty_cdp = uma_zalloc(chunk_zone, M_NOWAIT);
			if (empty_cdp == NULL)
				return (1);
			LIST_INSERT_BEFORE(cdp, empty_cdp, cd_all_le);
			empty_cdp->cd_base = cdp->cd_base + size;
			empty_cdp->cd_cur_size = 0;
			empty_cdp->cd_max_size = cdp->cd_max_size - size;

			i = empty_cdp->cd_max_size;
			if (i >= UNUSED_BUCKETS)
				i = 0;
			LIST_INSERT_HEAD(&da->unused_chunks[i], empty_cdp,
			    cd_hash_le);

			da->unused_chunks_size += empty_cdp->cd_max_size;
			cdp->cd_max_size = size;
		}
		LIST_REMOVE(cdp, cd_hash_le);
	} else {
		/* Alloc a new descriptor at the top of the heap */
		cdp = uma_zalloc(chunk_zone, M_NOWAIT);
		if (cdp == NULL)
			return (1);
		cdp->cd_max_size = size;
		cdp->cd_base = fdesc->base;
		LIST_INSERT_HEAD(&da->all_chunks, cdp, cd_all_le);
		MPASS(cdp->cd_base + cdp->cd_max_size == da->rtbl_top);
	}

	cdp->cd_hash = hash;
	cdp->cd_refcnt = 1;
	cdp->cd_cur_size = size;
	LIST_INSERT_HEAD(&da->chunk_hashtbl[hash & CHUNK_HASH_MASK], cdp,
	    cd_hash_le);
	if (da->rtbl_top >= da->rtbl_size) {
		if (da->rtbl_top >= BASE_MAX) {
			FIB_PRINTF(LOG_ERR, da->fd,
			    "structural limit exceeded at %d "
			    "range table elements", da->rtbl_top);
			return (1);
		}
		da->rtbl_size += RTBL_SIZE_INCR;
		i = (BASE_MAX - da->rtbl_top) * LOG_DEBUG / BASE_MAX;
		FIB_PRINTF(i, da->fd, "range table at %d%% structural limit",
		    da->rtbl_top * 100 / BASE_MAX);
		da->range_tbl = realloc(da->range_tbl,
		    sizeof(*da->range_tbl) * da->rtbl_size + FRAGS_PREF_SHORT,
		    M_DXRAUX, M_NOWAIT);
		if (da->range_tbl == NULL) {
			FIB_PRINTF(LOG_NOTICE, da->fd,
			    "Unable to allocate DXR range table");
			return (1);
		}
	}

	return (0);
}
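
/*
 * unused_chunks[1..UNUSED_BUCKETS - 1] hold empty slots of exactly that
 * many range entries, so small chunks can usually be recycled by probing
 * just a few buckets.  Larger empty slots all land in bucket 0 and are
 * scanned best-fit by chunk_ref(); a slot bigger than required is split,
 * with the leftover pushed back onto the appropriate bucket.
 */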

static void
chunk_unref(struct dxr_aux *da, uint32_t chunk)
{
	struct direct_entry *fdesc = &da->direct_tbl[chunk];
	struct chunk_desc *cdp, *cdp2;
	uint32_t base = fdesc->base;
	uint32_t size = chunk_size(da, fdesc);
	uint32_t hash = chunk_hash(da, fdesc);
	int i;

	/* Find the corresponding descriptor */
	LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK],
	    cd_hash_le)
		if (cdp->cd_hash == hash && cdp->cd_cur_size == size &&
		    memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base],
		    sizeof(struct range_entry_long) * size) == 0)
			break;

	MPASS(cdp != NULL);
	if (--cdp->cd_refcnt > 0)
		return;

	LIST_REMOVE(cdp, cd_hash_le);
	da->unused_chunks_size += cdp->cd_max_size;
	cdp->cd_cur_size = 0;

	/* Attempt to merge with the preceding chunk, if empty */
	cdp2 = LIST_NEXT(cdp, cd_all_le);
	if (cdp2 != NULL && cdp2->cd_cur_size == 0) {
		MPASS(cdp2->cd_base + cdp2->cd_max_size == cdp->cd_base);
		LIST_REMOVE(cdp, cd_all_le);
		LIST_REMOVE(cdp2, cd_hash_le);
		cdp2->cd_max_size += cdp->cd_max_size;
		uma_zfree(chunk_zone, cdp);
		cdp = cdp2;
	}

	/* Attempt to merge with the subsequent chunk, if empty */
	cdp2 = LIST_PREV(cdp, &da->all_chunks, chunk_desc, cd_all_le);
	if (cdp2 != NULL && cdp2->cd_cur_size == 0) {
		MPASS(cdp->cd_base + cdp->cd_max_size == cdp2->cd_base);
		LIST_REMOVE(cdp, cd_all_le);
		LIST_REMOVE(cdp2, cd_hash_le);
		cdp2->cd_max_size += cdp->cd_max_size;
		cdp2->cd_base = cdp->cd_base;
		uma_zfree(chunk_zone, cdp);
		cdp = cdp2;
	}

	if (cdp->cd_base + cdp->cd_max_size == da->rtbl_top) {
		/* Free the chunk on the top of the range heap, trim the heap */
		MPASS(cdp == LIST_FIRST(&da->all_chunks));
		da->rtbl_top -= cdp->cd_max_size;
		da->unused_chunks_size -= cdp->cd_max_size;
		LIST_REMOVE(cdp, cd_all_le);
		uma_zfree(chunk_zone, cdp);
		return;
	}

	i = cdp->cd_max_size;
	if (i >= UNUSED_BUCKETS)
		i = 0;
	LIST_INSERT_HEAD(&da->unused_chunks[i], cdp, cd_hash_le);
}

#ifdef DXR2
static uint32_t
trie_hash(struct dxr_aux *da, uint32_t dxr_x, uint32_t index)
{
	uint32_t i, *val;
	uint32_t hash = 0;

	for (i = 0; i < (1 << dxr_x); i++) {
		hash = (hash << 3) ^ (hash >> 3);
		val = (uint32_t *)
		    (void *) &da->direct_tbl[(index << dxr_x) + i];
		hash += (*val << 5);
		hash += (*val >> 5);
	}

	return (hash + (hash >> 16));
}

static int
trie_ref(struct dxr_aux *da, uint32_t index)
{
	struct trie_desc *tp;
	uint32_t dxr_d = da->d_bits;
	uint32_t dxr_x = DXR_TRIE_BITS - dxr_d;
	uint32_t hash = trie_hash(da, dxr_x, index);

	/* Find an existing descriptor */
	LIST_FOREACH(tp, &da->trie_hashtbl[hash & TRIE_HASH_MASK], td_hash_le)
		if (tp->td_hash == hash &&
		    memcmp(&da->direct_tbl[index << dxr_x],
		    &da->x_tbl[tp->td_index << dxr_x],
		    sizeof(*da->x_tbl) << dxr_x) == 0) {
			tp->td_refcnt++;
			da->trietbl[index] = tp;
			return (tp->td_index);
		}

	tp = LIST_FIRST(&da->unused_trie);
	if (tp != NULL) {
		LIST_REMOVE(tp, td_hash_le);
		da->unused_trie_cnt--;
	} else {
		tp = uma_zalloc(trie_zone, M_NOWAIT);
		if (tp == NULL)
			return (-1);
		LIST_INSERT_HEAD(&da->all_trie, tp, td_all_le);
		tp->td_index = da->all_trie_cnt++;
	}

	tp->td_hash = hash;
	tp->td_refcnt = 1;
	LIST_INSERT_HEAD(&da->trie_hashtbl[hash & TRIE_HASH_MASK], tp,
	    td_hash_le);
	memcpy(&da->x_tbl[tp->td_index << dxr_x],
	    &da->direct_tbl[index << dxr_x], sizeof(*da->x_tbl) << dxr_x);
	da->trietbl[index] = tp;
	if (da->all_trie_cnt >= da->xtbl_size >> dxr_x) {
		da->xtbl_size += XTBL_SIZE_INCR;
		da->x_tbl = realloc(da->x_tbl,
		    sizeof(*da->x_tbl) * da->xtbl_size, M_DXRAUX, M_NOWAIT);
		if (da->x_tbl == NULL) {
			FIB_PRINTF(LOG_NOTICE, da->fd,
			    "Unable to allocate DXR extension table");
			return (-1);
		}
	}
	return (tp->td_index);
}
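
/*
 * The DXR2 second stage is deduplicated much like the range chunks: each
 * 2^dxr_x-entry slice of direct_tbl[] is hashed by trie_hash(), and
 * trie_ref() either reuses an identical slice already copied into x_tbl[]
 * or appends a new one, so d_tbl[] ends up holding one 16-bit slice index
 * per group of 2^dxr_x direct table entries.  Identical slices tend to be
 * common since large spans of the address space resolve to the same next
 * hop.
 */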

static void
trie_unref(struct dxr_aux *da, uint32_t index)
{
	struct trie_desc *tp = da->trietbl[index];

	if (tp == NULL)
		return;
	da->trietbl[index] = NULL;
	if (--tp->td_refcnt > 0)
		return;

	LIST_REMOVE(tp, td_hash_le);
	da->unused_trie_cnt++;
	if (tp->td_index != da->all_trie_cnt - 1) {
		LIST_INSERT_HEAD(&da->unused_trie, tp, td_hash_le);
		return;
	}

	do {
		da->all_trie_cnt--;
		da->unused_trie_cnt--;
		LIST_REMOVE(tp, td_all_le);
		uma_zfree(trie_zone, tp);
		LIST_FOREACH(tp, &da->unused_trie, td_hash_le)
			if (tp->td_index == da->all_trie_cnt - 1) {
				LIST_REMOVE(tp, td_hash_le);
				break;
			}
	} while (tp != NULL);
}
#endif

static void
heap_inject(struct dxr_aux *da, uint32_t start, uint32_t end, uint32_t preflen,
    uint32_t nh)
{
	struct heap_entry *fhp;
	int i;

	for (i = da->heap_index; i >= 0; i--) {
		if (preflen > da->heap[i].preflen)
			break;
		else if (preflen < da->heap[i].preflen)
			da->heap[i + 1] = da->heap[i];
		else
			return;
	}

	fhp = &da->heap[i + 1];
	fhp->preflen = preflen;
	fhp->start = start;
	fhp->end = end;
	fhp->nexthop = nh;
	da->heap_index++;
}

static int
dxr_walk(struct rtentry *rt, void *arg)
{
	struct dxr_aux *da = arg;
	uint32_t chunk = da->work_chunk;
	uint32_t first = chunk << DXR_RANGE_SHIFT;
	uint32_t last = first | DXR_RANGE_MASK;
	struct range_entry_long *fp =
	    &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re;
	struct heap_entry *fhp = &da->heap[da->heap_index];
	uint32_t preflen, nh, start, end, scopeid;
	struct in_addr addr;

	rt_get_inet_prefix_plen(rt, &addr, &preflen, &scopeid);
	start = ntohl(addr.s_addr);
	if (start > last)
		return (-1);	/* Beyond chunk boundaries, we are done */
	if (start < first)
		return (0);	/* Skip this route */

	end = start;
	if (preflen < 32)
		end |= (0xffffffffU >> preflen);
	nh = fib_get_nhop_idx(da->fd, rt_get_raw_nhop(rt));

	if (start == fhp->start)
		heap_inject(da, start, end, preflen, nh);
	else {
		/* start > fhp->start */
		while (start > fhp->end) {
			uint32_t oend = fhp->end;

			if (da->heap_index > 0) {
				fhp--;
				da->heap_index--;
			} else
				initheap(da, fhp->end + 1, chunk);
			if (fhp->end > oend && fhp->nexthop != fp->nexthop) {
				fp++;
				da->rtbl_work_frags++;
				fp->start = (oend + 1) & DXR_RANGE_MASK;
				fp->nexthop = fhp->nexthop;
			}
		}
		if (start > ((chunk << DXR_RANGE_SHIFT) | fp->start) &&
		    nh != fp->nexthop) {
			fp++;
			da->rtbl_work_frags++;
			fp->start = start & DXR_RANGE_MASK;
		} else if (da->rtbl_work_frags) {
			if ((--fp)->nexthop == nh)
				da->rtbl_work_frags--;
			else
				fp++;
		}
		fp->nexthop = nh;
		heap_inject(da, start, end, preflen, nh);
	}

	return (0);
}
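
/*
 * da->heap[] behaves as a stack of the prefixes covering the address
 * currently being swept, ordered by increasing prefix length, with the
 * most specific match at da->heap[da->heap_index].  heap_inject()
 * preserves that ordering as dxr_walk() descends into more specific
 * prefixes; popping entries once the sweep passes the end of a prefix is
 * what emits new range fragments.
 */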

static int
update_chunk(struct dxr_aux *da, uint32_t chunk)
{
	struct range_entry_long *fp;
#if DXR_TRIE_BITS < 24
	struct range_entry_short *fps;
	uint32_t start, nh, i;
#endif
	struct heap_entry *fhp;
	uint32_t first = chunk << DXR_RANGE_SHIFT;
	uint32_t last = first | DXR_RANGE_MASK;

	if (da->direct_tbl[chunk].fragments != FRAGS_MARK_HIT)
		chunk_unref(da, chunk);

	initheap(da, first, chunk);

	fp = &da->range_tbl[da->rtbl_top].re;
	da->rtbl_work_frags = 0;
	fp->start = first & DXR_RANGE_MASK;
	fp->nexthop = da->heap[0].nexthop;

	da->dst.sin_addr.s_addr = htonl(first);
	da->mask.sin_addr.s_addr = htonl(~DXR_RANGE_MASK);

	da->work_chunk = chunk;
	rib_walk_from(da->fibnum, AF_INET, RIB_FLAG_LOCKED,
	    (struct sockaddr *) &da->dst, (struct sockaddr *) &da->mask,
	    dxr_walk, da);

	/* Flush any remaining objects on the heap */
	fp = &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re;
	fhp = &da->heap[da->heap_index];
	while (fhp->preflen > DXR_TRIE_BITS) {
		uint32_t oend = fhp->end;

		if (da->heap_index > 0) {
			fhp--;
			da->heap_index--;
		} else
			initheap(da, fhp->end + 1, chunk);
		if (fhp->end > oend && fhp->nexthop != fp->nexthop) {
			/* Have we crossed the upper chunk boundary? */
			if (oend >= last)
				break;
			fp++;
			da->rtbl_work_frags++;
			fp->start = (oend + 1) & DXR_RANGE_MASK;
			fp->nexthop = fhp->nexthop;
		}
	}

	/* Direct hit if the chunk contains only a single fragment */
	if (da->rtbl_work_frags == 0) {
		da->direct_tbl[chunk].base = fp->nexthop;
		da->direct_tbl[chunk].fragments = FRAGS_MARK_HIT;
		return (0);
	}

	da->direct_tbl[chunk].base = da->rtbl_top;
	da->direct_tbl[chunk].fragments = da->rtbl_work_frags;

#if DXR_TRIE_BITS < 24
	/* Check whether the chunk can be more compactly encoded */
	fp = &da->range_tbl[da->rtbl_top].re;
	for (i = 0; i <= da->rtbl_work_frags; i++, fp++)
		if ((fp->start & 0xff) != 0 || fp->nexthop > RE_SHORT_MAX_NH)
			break;
	if (i == da->rtbl_work_frags + 1) {
		fp = &da->range_tbl[da->rtbl_top].re;
		fps = (void *) fp;
		for (i = 0; i <= da->rtbl_work_frags; i++, fp++, fps++) {
			start = fp->start;
			nh = fp->nexthop;
			fps->start = start >> 8;
			fps->nexthop = nh;
		}
		fps->start = start >> 8;
		fps->nexthop = nh;
		da->rtbl_work_frags >>= 1;
		da->direct_tbl[chunk].fragments =
		    da->rtbl_work_frags | FRAGS_PREF_SHORT;
	} else
#endif
	if (da->rtbl_work_frags >= FRAGS_MARK_HIT) {
		da->direct_tbl[chunk].fragments = FRAGS_MARK_XL;
		memmove(&da->range_tbl[da->rtbl_top + 1],
		    &da->range_tbl[da->rtbl_top],
		    (da->rtbl_work_frags + 1) * sizeof(*da->range_tbl));
		da->range_tbl[da->rtbl_top].fragments = da->rtbl_work_frags;
		da->rtbl_work_frags++;
	}
	da->rtbl_top += (da->rtbl_work_frags + 1);
	return (chunk_ref(da, chunk));
}
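
/*
 * Note on the encodings chosen above: a chunk that collapses into a
 * single fragment is not stored at all -- its next-hop index goes
 * straight into direct_tbl[].base with fragments set to FRAGS_MARK_HIT.
 * A compressible multi-fragment chunk is rewritten in place from long to
 * short entries, packing two per 32-bit word, hence rtbl_work_frags >>= 1.
 * A chunk whose fragment count would collide with the FRAGS_MARK_* values
 * is given the XL layout: the true count goes into an extra leading word
 * and fragments is set to FRAGS_MARK_XL.
 */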

static void
dxr_build(struct dxr *dxr)
{
	struct dxr_aux *da = dxr->aux;
	struct chunk_desc *cdp;
	struct rib_rtable_info rinfo;
	struct timeval t0, t1, t2, t3;
	uint32_t r_size, dxr_tot_size;
	uint32_t i, m, range_rebuild = 0;
	uint32_t range_frag;
#ifdef DXR2
	struct trie_desc *tp;
	uint32_t d_tbl_size, dxr_x, d_size, x_size;
	uint32_t trie_rebuild = 0, prev_size = 0;
	uint32_t trie_frag;
	int ti;
#endif

	MPASS(dxr->d == NULL);

	if (da == NULL) {
		da = malloc(sizeof(*dxr->aux), M_DXRAUX, M_NOWAIT);
		if (da == NULL) {
			FIB_PRINTF(LOG_NOTICE, dxr->fd,
			    "Unable to allocate DXR aux struct");
			return;
		}
		dxr->aux = da;
		da->fibnum = dxr->fibnum;
		da->fd = dxr->fd;
		da->refcnt = 1;
		LIST_INIT(&da->all_chunks);
		LIST_INIT(&da->all_trie);
		da->rtbl_size = RTBL_SIZE_INCR;
		da->range_tbl = NULL;
		da->xtbl_size = XTBL_SIZE_INCR;
		da->x_tbl = NULL;
		bzero(&da->dst, sizeof(da->dst));
		bzero(&da->mask, sizeof(da->mask));
		da->dst.sin_len = sizeof(da->dst);
		da->mask.sin_len = sizeof(da->mask);
		da->dst.sin_family = AF_INET;
		da->mask.sin_family = AF_INET;
	}
	if (da->range_tbl == NULL) {
		da->range_tbl = malloc(sizeof(*da->range_tbl) * da->rtbl_size
		    + FRAGS_PREF_SHORT, M_DXRAUX, M_NOWAIT);
		if (da->range_tbl == NULL) {
			FIB_PRINTF(LOG_NOTICE, da->fd,
			    "Unable to allocate DXR range table");
			return;
		}
		range_rebuild = 1;
	}
#ifdef DXR2
	if (da->x_tbl == NULL) {
		da->x_tbl = malloc(sizeof(*da->x_tbl) * da->xtbl_size,
		    M_DXRAUX, M_NOWAIT);
		if (da->x_tbl == NULL) {
			FIB_PRINTF(LOG_NOTICE, da->fd,
			    "Unable to allocate DXR extension table");
			return;
		}
		trie_rebuild = 1;
	}
#endif

	microuptime(&t0);

	dxr->nh_tbl = fib_get_nhop_array(da->fd);
	fib_get_rtable_info(fib_get_rh(da->fd), &rinfo);

	if (da->updates_low > da->updates_high)
		range_rebuild = 1;

range_build:
	if (range_rebuild) {
		/* Bulk cleanup */
		bzero(da->chunk_hashtbl, sizeof(da->chunk_hashtbl));
		while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) {
			LIST_REMOVE(cdp, cd_all_le);
			uma_zfree(chunk_zone, cdp);
		}
		for (i = 0; i < UNUSED_BUCKETS; i++)
			LIST_INIT(&da->unused_chunks[i]);
		da->unused_chunks_size = 0;
		da->rtbl_top = 0;
		da->updates_low = 0;
		da->updates_high = DIRECT_TBL_SIZE - 1;
		memset(da->updates_mask, 0xff, sizeof(da->updates_mask));
		for (i = 0; i < DIRECT_TBL_SIZE; i++) {
			da->direct_tbl[i].fragments = FRAGS_MARK_HIT;
			da->direct_tbl[i].base = 0;
		}
	}
	da->prefixes = rinfo.num_prefixes;

	/* DXR: construct direct & range table */
	for (i = da->updates_low; i <= da->updates_high; i++) {
		m = da->updates_mask[i >> 5] >> (i & 0x1f);
		if (m == 0)
			i |= 0x1f;
		else if (m & 1 && update_chunk(da, i) != 0)
			return;
	}

	range_frag = 0;
	if (da->rtbl_top)
		range_frag = da->unused_chunks_size * 10000ULL / da->rtbl_top;
	if (range_frag > V_frag_limit) {
		range_rebuild = 1;
		goto range_build;
	}

	r_size = sizeof(*da->range_tbl) * da->rtbl_top;
	microuptime(&t1);

#ifdef DXR2
	if (range_rebuild ||
	    abs(fls(da->prefixes) - fls(da->trie_rebuilt_prefixes)) > 1)
		trie_rebuild = 1;

trie_build:
	if (trie_rebuild) {
		da->trie_rebuilt_prefixes = da->prefixes;
		da->d_bits = DXR_D;
		da->updates_low = 0;
		da->updates_high = DIRECT_TBL_SIZE - 1;
		if (!range_rebuild)
			memset(da->updates_mask, 0xff,
			    sizeof(da->updates_mask));
	}

dxr2_try_squeeze:
	if (trie_rebuild) {
		/* Bulk cleanup */
		bzero(da->trietbl, sizeof(da->trietbl));
		bzero(da->trie_hashtbl, sizeof(da->trie_hashtbl));
		while ((tp = LIST_FIRST(&da->all_trie)) != NULL) {
			LIST_REMOVE(tp, td_all_le);
			uma_zfree(trie_zone, tp);
		}
		LIST_INIT(&da->unused_trie);
		da->all_trie_cnt = da->unused_trie_cnt = 0;
	}

	/* Populate d_tbl, x_tbl */
	dxr_x = DXR_TRIE_BITS - da->d_bits;
	d_tbl_size = (1 << da->d_bits);

	for (i = da->updates_low >> dxr_x; i <= da->updates_high >> dxr_x;
	    i++) {
		if (!trie_rebuild) {
			m = 0;
			for (int j = 0; j < (1 << dxr_x); j += 32)
				m |= da->updates_mask[((i << dxr_x) + j) >> 5];
			if (m == 0)
				continue;
			trie_unref(da, i);
		}
		ti = trie_ref(da, i);
		if (ti < 0)
			return;
		da->d_tbl[i] = ti;
	}

	trie_frag = 0;
	if (da->all_trie_cnt)
		trie_frag = da->unused_trie_cnt * 10000ULL / da->all_trie_cnt;
	if (trie_frag > V_frag_limit) {
		trie_rebuild = 1;
		goto trie_build;
	}

	d_size = sizeof(*da->d_tbl) * d_tbl_size;
	x_size = sizeof(*da->x_tbl) * DIRECT_TBL_SIZE / d_tbl_size
	    * da->all_trie_cnt;
	dxr_tot_size = d_size + x_size + r_size;

	if (trie_rebuild == 1) {
		/* Try to find a more compact D/X split */
		if (prev_size == 0 || dxr_tot_size <= prev_size)
			da->d_bits--;
		else {
			da->d_bits++;
			trie_rebuild = 2;
		}
		prev_size = dxr_tot_size;
		goto dxr2_try_squeeze;
	}
	microuptime(&t2);
#else /* !DXR2 */
	dxr_tot_size = sizeof(da->direct_tbl) + r_size;
	t2 = t1;
#endif

	dxr->d = malloc(dxr_tot_size, M_DXRLPM, M_NOWAIT);
	if (dxr->d == NULL) {
		FIB_PRINTF(LOG_NOTICE, da->fd,
		    "Unable to allocate DXR lookup table");
		return;
	}
#ifdef DXR2
	memcpy(dxr->d, da->d_tbl, d_size);
	dxr->x = ((char *) dxr->d) + d_size;
	memcpy(dxr->x, da->x_tbl, x_size);
	dxr->r = ((char *) dxr->x) + x_size;
	dxr->d_shift = 32 - da->d_bits;
	dxr->x_shift = dxr_x;
	dxr->x_mask = 0xffffffffU >> (32 - dxr_x);
#else /* !DXR2 */
	memcpy(dxr->d, da->direct_tbl, sizeof(da->direct_tbl));
	dxr->r = ((char *) dxr->d) + sizeof(da->direct_tbl);
#endif
	memcpy(dxr->r, da->range_tbl, r_size);

	if (da->updates_low <= da->updates_high)
		bzero(&da->updates_mask[da->updates_low / 32],
		    (da->updates_high - da->updates_low) / 8 + 1);
	da->updates_low = DIRECT_TBL_SIZE - 1;
	da->updates_high = 0;
	microuptime(&t3);

#ifdef DXR2
	FIB_PRINTF(LOG_INFO, da->fd, "D%dX%dR, %d prefixes, %d nhops (max)",
	    da->d_bits, dxr_x, rinfo.num_prefixes, rinfo.num_nhops);
#else
	FIB_PRINTF(LOG_INFO, da->fd, "D%dR, %d prefixes, %d nhops (max)",
	    DXR_D, rinfo.num_prefixes, rinfo.num_nhops);
#endif
	i = dxr_tot_size * 100;
	if (rinfo.num_prefixes)
		i /= rinfo.num_prefixes;
	FIB_PRINTF(LOG_INFO, da->fd, "%d.%02d KBytes, %d.%02d Bytes/prefix",
	    dxr_tot_size / 1024, dxr_tot_size * 100 / 1024 % 100,
	    i / 100, i % 100);
#ifdef DXR2
	FIB_PRINTF(LOG_INFO, da->fd,
	    "%d.%02d%% trie, %d.%02d%% range fragmentation",
	    trie_frag / 100, trie_frag % 100,
	    range_frag / 100, range_frag % 100);
#else
	FIB_PRINTF(LOG_INFO, da->fd, "%d.%02d%% range fragmentation",
	    range_frag / 100, range_frag % 100);
#endif
	i = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
	FIB_PRINTF(LOG_INFO, da->fd, "range table %s in %u.%03u ms",
	    range_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000);
#ifdef DXR2
	i = (t2.tv_sec - t1.tv_sec) * 1000000 + t2.tv_usec - t1.tv_usec;
	FIB_PRINTF(LOG_INFO, da->fd, "trie %s in %u.%03u ms",
	    trie_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000);
#endif
	i = (t3.tv_sec - t2.tv_sec) * 1000000 + t3.tv_usec - t2.tv_usec;
	FIB_PRINTF(LOG_INFO, da->fd, "snapshot forked in %u.%03u ms",
	    i / 1000, i % 1000);
}
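
/*
 * dxr_build() refreshes only the chunks flagged in updates_mask[], and
 * falls back to a full rebuild when accumulated fragmentation (tracked in
 * hundredths of a percent) exceeds net.route.algo.dxr.frag_limit.  The
 * DXR2 stage then retries the d_tbl / x_tbl split with decreasing d_bits
 * while the total snapshot keeps shrinking, and finally copies the three
 * tables into a single contiguous M_DXRLPM allocation consumed by the
 * lockless lookup path.
 */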

/*
 * Glue functions for attaching to the FIB_ALGO infrastructure.
 */

static struct nhop_object *
dxr_fib_lookup(void *algo_data, const struct flm_lookup_key key,
    uint32_t scopeid)
{
	struct dxr *dxr = algo_data;

	return (dxr->nh_tbl[dxr_lookup(dxr, ntohl(key.addr4.s_addr))]);
}

static enum flm_op_result
dxr_init(uint32_t fibnum, struct fib_data *fd, void *old_data, void **data)
{
	struct dxr *old_dxr = old_data;
	struct dxr_aux *da = NULL;
	struct dxr *dxr;

	dxr = malloc(sizeof(*dxr), M_DXRAUX, M_NOWAIT);
	if (dxr == NULL) {
		FIB_PRINTF(LOG_NOTICE, fd,
		    "Unable to allocate DXR container struct");
		return (FLM_REBUILD);
	}

	/* Check whether we may reuse the old auxiliary structures */
	if (old_dxr != NULL && old_dxr->aux != NULL &&
	    old_dxr->aux->fd == fd) {
		da = old_dxr->aux;
		atomic_add_int(&da->refcnt, 1);
	}

	dxr->aux = da;
	dxr->d = NULL;
	dxr->fd = fd;
	dxr->fibnum = fibnum;
	*data = dxr;

	return (FLM_SUCCESS);
}

static void
dxr_destroy(void *data)
{
	struct dxr *dxr = data;
	struct dxr_aux *da = dxr->aux;
	struct chunk_desc *cdp;
	struct trie_desc *tp;

	free(dxr->d, M_DXRLPM);
	free(dxr, M_DXRAUX);

	if (da == NULL || atomic_fetchadd_int(&da->refcnt, -1) > 1)
		return;

	/* Release auxiliary structures */
	while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) {
		LIST_REMOVE(cdp, cd_all_le);
		uma_zfree(chunk_zone, cdp);
	}
	while ((tp = LIST_FIRST(&da->all_trie)) != NULL) {
		LIST_REMOVE(tp, td_all_le);
		uma_zfree(trie_zone, tp);
	}
	free(da->range_tbl, M_DXRAUX);
	free(da->x_tbl, M_DXRAUX);
	free(da, M_DXRAUX);
}

static void
epoch_dxr_destroy(epoch_context_t ctx)
{
	struct dxr *dxr = __containerof(ctx, struct dxr, epoch_ctx);

	dxr_destroy(dxr);
}

static void *
choose_lookup_fn(struct dxr_aux *da)
{

#ifdef DXR2
	switch (da->d_bits) {
#if DXR_TRIE_BITS > 16
	case 16:
		return (dxr_fib_lookup_16);
#endif
	case 15:
		return (dxr_fib_lookup_15);
	case 14:
		return (dxr_fib_lookup_14);
	case 13:
		return (dxr_fib_lookup_13);
	case 12:
		return (dxr_fib_lookup_12);
	case 11:
		return (dxr_fib_lookup_11);
	case 10:
		return (dxr_fib_lookup_10);
	case 9:
		return (dxr_fib_lookup_9);
	}
#endif /* DXR2 */
	return (dxr_fib_lookup);
}

static enum flm_op_result
dxr_dump_end(void *data, struct fib_dp *dp)
{
	struct dxr *dxr = data;
	struct dxr_aux *da;

	dxr_build(dxr);

	da = dxr->aux;
	if (da == NULL || dxr->d == NULL)
		return (FLM_REBUILD);

	if (da->rtbl_top >= BASE_MAX)
		return (FLM_ERROR);

	dp->f = choose_lookup_fn(da);
	dp->arg = dxr;

	return (FLM_SUCCESS);
}

static enum flm_op_result
dxr_dump_rib_item(struct rtentry *rt, void *data)
{

	return (FLM_SUCCESS);
}

static enum flm_op_result
dxr_change_rib_item(struct rib_head *rnh, struct rib_cmd_info *rc,
    void *data)
{

	return (FLM_BATCH);
}
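
/*
 * Single-route changes are deliberately not applied here: returning
 * FLM_BATCH asks the FIB_ALGO framework to queue them and hand the whole
 * set to dxr_change_rib_batch() below, where the affected chunks are
 * marked in updates_mask[] and one new snapshot is built per batch.
 */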

static enum flm_op_result
dxr_change_rib_batch(struct rib_head *rnh, struct fib_change_queue *q,
    void *data)
{
	struct dxr *dxr = data;
	struct dxr *new_dxr;
	struct dxr_aux *da;
	struct fib_dp new_dp;
	enum flm_op_result res;
	uint32_t ip, plen, hmask, start, end, i, ui;
#ifdef INVARIANTS
	struct rib_rtable_info rinfo;
	int update_delta = 0;
#endif

	MPASS(data != NULL);
	MPASS(q != NULL);
	MPASS(q->count < q->size);

	da = dxr->aux;
	MPASS(da != NULL);
	MPASS(da->fd == dxr->fd);
	MPASS(da->refcnt > 0);

	FIB_PRINTF(LOG_INFO, da->fd, "processing %d update(s)", q->count);
	for (ui = 0; ui < q->count; ui++) {
#ifdef INVARIANTS
		if (q->entries[ui].nh_new != NULL)
			update_delta++;
		if (q->entries[ui].nh_old != NULL)
			update_delta--;
#endif
		plen = q->entries[ui].plen;
		ip = ntohl(q->entries[ui].addr4.s_addr);
		if (plen < 32)
			hmask = 0xffffffffU >> plen;
		else
			hmask = 0;
		start = (ip & ~hmask) >> DXR_RANGE_SHIFT;
		end = (ip | hmask) >> DXR_RANGE_SHIFT;

		if ((start & 0x1f) == 0 && (end & 0x1f) == 0x1f)
			for (i = start >> 5; i <= end >> 5; i++)
				da->updates_mask[i] = 0xffffffffU;
		else
			for (i = start; i <= end; i++)
				da->updates_mask[i >> 5] |= (1 << (i & 0x1f));
		if (start < da->updates_low)
			da->updates_low = start;
		if (end > da->updates_high)
			da->updates_high = end;
	}

#ifdef INVARIANTS
	fib_get_rtable_info(fib_get_rh(da->fd), &rinfo);
	MPASS(da->prefixes + update_delta == rinfo.num_prefixes);
#endif

	res = dxr_init(0, dxr->fd, data, (void **) &new_dxr);
	if (res != FLM_SUCCESS)
		return (res);

	dxr_build(new_dxr);

	/* Structural limit exceeded, hard error */
	if (da->rtbl_top >= BASE_MAX)
		return (FLM_ERROR);

	if (new_dxr->d == NULL) {
		dxr_destroy(new_dxr);
		return (FLM_REBUILD);
	}

	new_dp.f = choose_lookup_fn(da);
	new_dp.arg = new_dxr;
	if (fib_set_datapath_ptr(dxr->fd, &new_dp)) {
		fib_set_algo_ptr(dxr->fd, new_dxr);
		fib_epoch_call(epoch_dxr_destroy, &dxr->epoch_ctx);
		return (FLM_SUCCESS);
	}

	FIB_PRINTF(LOG_NOTICE, dxr->fd, "fib_set_datapath_ptr() failed");
	dxr_destroy(new_dxr);
	return (FLM_REBUILD);
}
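
/*
 * Each changed prefix above is mapped to the span of DXR_TRIE_BITS-wide
 * chunks it covers.  With the default DXR_TRIE_BITS of 20, a change to
 * 10.1.0.0/16, for example, spans chunks 0x0a010 through 0x0a01f, so 16
 * bits are set in updates_mask[] and only those chunks are re-swept by
 * the subsequent dxr_build() call.
 */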

static uint8_t
dxr_get_pref(const struct rib_rtable_info *rinfo)
{

	/* Below bsearch4 up to 10 prefixes.  Always supersedes dpdk_lpm4. */
	return (251);
}

SYSCTL_DECL(_net_route_algo);
SYSCTL_NODE(_net_route_algo, OID_AUTO, dxr, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "DXR tunables");

static int
sysctl_dxr_frag_limit(SYSCTL_HANDLER_ARGS)
{
	char buf[8];
	int error, new, i;

	snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100,
	    V_frag_limit % 100);
	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (!isdigit(*buf) && *buf != '.')
		return (EINVAL);
	for (i = 0, new = 0; isdigit(buf[i]) && i < sizeof(buf); i++)
		new = new * 10 + buf[i] - '0';
	new *= 100;
	if (buf[i++] == '.') {
		if (!isdigit(buf[i]))
			return (EINVAL);
		new += (buf[i++] - '0') * 10;
		if (isdigit(buf[i]))
			new += buf[i++] - '0';
	}
	if (new > 1000)
		return (EINVAL);
	V_frag_limit = new;
	snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100,
	    V_frag_limit % 100);
	return (0);
}

SYSCTL_PROC(_net_route_algo_dxr, OID_AUTO, frag_limit,
    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_VNET,
    0, 0, sysctl_dxr_frag_limit, "A",
    "Fragmentation threshold to full rebuild");

static struct fib_lookup_module fib_dxr_mod = {
	.flm_name = "dxr",
	.flm_family = AF_INET,
	.flm_init_cb = dxr_init,
	.flm_destroy_cb = dxr_destroy,
	.flm_dump_rib_item_cb = dxr_dump_rib_item,
	.flm_dump_end_cb = dxr_dump_end,
	.flm_change_rib_item_cb = dxr_change_rib_item,
	.flm_change_rib_items_cb = dxr_change_rib_batch,
	.flm_get_pref = dxr_get_pref,
};

static int
dxr_modevent(module_t mod, int type, void *unused)
{
	int error;

	switch (type) {
	case MOD_LOAD:
		chunk_zone = uma_zcreate("dxr chunk", sizeof(struct chunk_desc),
		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
		trie_zone = uma_zcreate("dxr trie", sizeof(struct trie_desc),
		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
		fib_module_register(&fib_dxr_mod);
		return (0);
	case MOD_UNLOAD:
		error = fib_module_unregister(&fib_dxr_mod);
		if (error)
			return (error);
		uma_zdestroy(chunk_zone);
		uma_zdestroy(trie_zone);
		return (0);
	default:
		return (EOPNOTSUPP);
	}
}

static moduledata_t dxr_mod = {"fib_dxr", dxr_modevent, 0};

DECLARE_MODULE(fib_dxr, dxr_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
MODULE_VERSION(fib_dxr, 1);