1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2012-2021 Marko Zec 5 * Copyright (c) 2005, 2018 University of Zagreb 6 * Copyright (c) 2005 International Computer Science Institute 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 /* 31 * An implementation of DXR, a simple IPv4 LPM scheme with compact lookup 32 * structures and a trivial search procedure. More significant bits of 33 * the search key are used to directly index a two-stage trie, while the 34 * remaining bits are used for finding the next hop in a sorted array. 35 * More details in: 36 * 37 * M. Zec, L. Rizzo, M. Mikuc, DXR: towards a billion routing lookups per 38 * second in software, ACM SIGCOMM Computer Communication Review, September 39 * 2012 40 * 41 * M. Zec, M. Mikuc, Pushing the envelope: beyond two billion IP routing 42 * lookups per second on commodity CPUs, IEEE SoftCOM, September 2017, Split 43 */ 44 45 #include <sys/cdefs.h> 46 __FBSDID("$FreeBSD$"); 47 48 #include "opt_inet.h" 49 50 #include <sys/param.h> 51 #include <sys/kernel.h> 52 #include <sys/ctype.h> 53 #include <sys/epoch.h> 54 #include <sys/malloc.h> 55 #include <sys/module.h> 56 #include <sys/socket.h> 57 #include <sys/sysctl.h> 58 #include <sys/syslog.h> 59 60 #include <vm/uma.h> 61 62 #include <netinet/in.h> 63 #include <netinet/in_fib.h> 64 65 #include <net/route.h> 66 #include <net/route/route_ctl.h> 67 #include <net/route/fib_algo.h> 68 69 #define DXR_TRIE_BITS 20 70 71 CTASSERT(DXR_TRIE_BITS >= 16 && DXR_TRIE_BITS <= 24); 72 73 /* DXR2: two-stage primary trie, instead of a single direct lookup table */ 74 #define DXR2 75 76 #if DXR_TRIE_BITS > 16 77 #define DXR_D 16 78 #else 79 #define DXR_D (DXR_TRIE_BITS - 1) 80 #endif 81 #define DXR_X (DXR_TRIE_BITS - DXR_D) 82 83 #define D_TBL_SIZE (1 << DXR_D) 84 #define DIRECT_TBL_SIZE (1 << DXR_TRIE_BITS) 85 #define DXR_RANGE_MASK (0xffffffffU >> DXR_TRIE_BITS) 86 #define DXR_RANGE_SHIFT (32 - DXR_TRIE_BITS) 87 88 #define DESC_BASE_BITS 22 89 #define DESC_FRAGMENTS_BITS (32 - DESC_BASE_BITS) 90 #define BASE_MAX ((1 << DESC_BASE_BITS) - 1) 91 #define RTBL_SIZE_INCR (BASE_MAX / 64) 92 93 #if DXR_TRIE_BITS < 24 94 #define FRAGS_MASK_SHORT ((1 << (23 - DXR_TRIE_BITS)) - 1) 95 #else 96 #define FRAGS_MASK_SHORT 0 97 #endif 98 #define FRAGS_PREF_SHORT (((1 << DESC_FRAGMENTS_BITS) - 1) & \ 99 ~FRAGS_MASK_SHORT) 100 #define FRAGS_MARK_XL (FRAGS_PREF_SHORT - 1) 101 #define FRAGS_MARK_HIT (FRAGS_PREF_SHORT - 2) 102 103 #define IS_SHORT_FORMAT(x) ((x & FRAGS_PREF_SHORT) == FRAGS_PREF_SHORT) 104 #define IS_LONG_FORMAT(x) ((x & FRAGS_PREF_SHORT) != FRAGS_PREF_SHORT) 105 #define IS_XL_FORMAT(x) (x == FRAGS_MARK_XL) 106 107 #define RE_SHORT_MAX_NH ((1 << (DXR_TRIE_BITS - 8)) - 1) 108 109 #define CHUNK_HASH_BITS 16 110 #define CHUNK_HASH_SIZE (1 << CHUNK_HASH_BITS) 111 #define CHUNK_HASH_MASK (CHUNK_HASH_SIZE - 1) 112 113 #define TRIE_HASH_BITS 16 114 #define TRIE_HASH_SIZE (1 << TRIE_HASH_BITS) 115 #define TRIE_HASH_MASK (TRIE_HASH_SIZE - 1) 116 117 #define XTBL_SIZE_INCR (DIRECT_TBL_SIZE / 16) 118 119 #define UNUSED_BUCKETS 8 120 121 /* Lookup structure elements */ 122 123 struct direct_entry { 124 uint32_t fragments: DESC_FRAGMENTS_BITS, 125 base: DESC_BASE_BITS; 126 }; 127 128 struct range_entry_long { 129 uint32_t start: DXR_RANGE_SHIFT, 130 nexthop: DXR_TRIE_BITS; 131 }; 132 133 #if DXR_TRIE_BITS < 24 134 struct range_entry_short { 135 uint16_t start: DXR_RANGE_SHIFT - 8, 136 nexthop: DXR_TRIE_BITS - 8; 137 }; 138 #endif 139 140 /* Auxiliary structures */ 141 142 struct heap_entry { 143 uint32_t start; 144 uint32_t end; 145 uint32_t preflen; 146 uint32_t nexthop; 147 }; 148 149 struct chunk_desc { 150 LIST_ENTRY(chunk_desc) cd_all_le; 151 LIST_ENTRY(chunk_desc) cd_hash_le; 152 uint32_t cd_hash; 153 uint32_t cd_refcnt; 154 uint32_t cd_base; 155 uint32_t cd_cur_size; 156 uint32_t cd_max_size; 157 }; 158 159 struct trie_desc { 160 LIST_ENTRY(trie_desc) td_all_le; 161 LIST_ENTRY(trie_desc) td_hash_le; 162 uint32_t td_hash; 163 uint32_t td_index; 164 uint32_t td_refcnt; 165 }; 166 167 struct dxr_aux { 168 /* Glue to external state */ 169 struct fib_data *fd; 170 uint32_t fibnum; 171 int refcnt; 172 173 /* Auxiliary build-time tables */ 174 struct direct_entry direct_tbl[DIRECT_TBL_SIZE]; 175 uint16_t d_tbl[D_TBL_SIZE]; 176 struct direct_entry *x_tbl; 177 union { 178 struct range_entry_long re; 179 uint32_t fragments; 180 } *range_tbl; 181 182 /* Auxiliary internal state */ 183 uint32_t updates_mask[DIRECT_TBL_SIZE / 32]; 184 struct trie_desc *trietbl[D_TBL_SIZE]; 185 LIST_HEAD(, chunk_desc) chunk_hashtbl[CHUNK_HASH_SIZE]; 186 LIST_HEAD(, chunk_desc) all_chunks; 187 LIST_HEAD(, chunk_desc) unused_chunks[UNUSED_BUCKETS]; 188 LIST_HEAD(, trie_desc) trie_hashtbl[TRIE_HASH_SIZE]; 189 LIST_HEAD(, trie_desc) all_trie; 190 LIST_HEAD(, trie_desc) unused_trie; /* abuses hash link entry */ 191 struct sockaddr_in dst; 192 struct sockaddr_in mask; 193 struct heap_entry heap[33]; 194 uint32_t prefixes; 195 uint32_t updates_low; 196 uint32_t updates_high; 197 uint32_t unused_chunks_size; 198 uint32_t xtbl_size; 199 uint32_t all_trie_cnt; 200 uint32_t unused_trie_cnt; 201 uint32_t trie_rebuilt_prefixes; 202 uint32_t heap_index; 203 uint32_t d_bits; 204 uint32_t rtbl_size; 205 uint32_t rtbl_top; 206 uint32_t rtbl_work_frags; 207 uint32_t work_chunk; 208 }; 209 210 /* Main lookup structure container */ 211 212 struct dxr { 213 /* Lookup tables */ 214 uint16_t d_shift; 215 uint16_t x_shift; 216 uint32_t x_mask; 217 void *d; 218 void *x; 219 void *r; 220 struct nhop_object **nh_tbl; 221 222 /* Glue to external state */ 223 struct dxr_aux *aux; 224 struct fib_data *fd; 225 struct epoch_context epoch_ctx; 226 uint32_t fibnum; 227 }; 228 229 static MALLOC_DEFINE(M_DXRLPM, "dxr", "DXR LPM"); 230 static MALLOC_DEFINE(M_DXRAUX, "dxr aux", "DXR auxiliary"); 231 232 uma_zone_t chunk_zone; 233 uma_zone_t trie_zone; 234 235 VNET_DEFINE_STATIC(int, frag_limit) = 100; 236 #define V_frag_limit VNET(frag_limit) 237 238 SYSCTL_DECL(_net_route_algo); 239 SYSCTL_NODE(_net_route_algo, OID_AUTO, dxr, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 240 "DXR tunables"); 241 242 static int 243 sysctl_dxr_frag_limit(SYSCTL_HANDLER_ARGS) 244 { 245 char buf[8]; 246 int error, new, i; 247 248 snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100, 249 V_frag_limit % 100); 250 error = sysctl_handle_string(oidp, buf, sizeof(buf), req); 251 if (error != 0 || req->newptr == NULL) 252 return (error); 253 if (!isdigit(*buf) && *buf != '.') 254 return (EINVAL); 255 for (i = 0, new = 0; isdigit(buf[i]) && i < sizeof(buf); i++) 256 new = new * 10 + buf[i] - '0'; 257 new *= 100; 258 if (buf[i++] == '.') { 259 if (!isdigit(buf[i])) 260 return (EINVAL); 261 new += (buf[i++] - '0') * 10; 262 if (isdigit(buf[i])) 263 new += buf[i++] - '0'; 264 } 265 if (new > 1000) 266 return (EINVAL); 267 V_frag_limit = new; 268 snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100, 269 V_frag_limit % 100); 270 return (0); 271 } 272 273 SYSCTL_PROC(_net_route_algo_dxr, OID_AUTO, frag_limit, 274 CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_VNET, 275 0, 0, sysctl_dxr_frag_limit, "A", 276 "Fragmentation threshold to full rebuild"); 277 278 /* Binary search for a matching address range */ 279 #define DXR_LOOKUP_STAGE \ 280 if (masked_dst < range[middle].start) { \ 281 upperbound = middle; \ 282 middle = (middle + lowerbound) / 2; \ 283 } else if (masked_dst < range[middle + 1].start) \ 284 return (range[middle].nexthop); \ 285 else { \ 286 lowerbound = middle + 1; \ 287 middle = (upperbound + middle + 1) / 2; \ 288 } \ 289 if (upperbound == lowerbound) \ 290 return (range[lowerbound].nexthop); 291 292 static int 293 dxr_lookup(struct dxr *dxr, uint32_t dst) 294 { 295 #ifdef DXR2 296 uint16_t *dt = dxr->d; 297 struct direct_entry *xt = dxr->x; 298 int xi; 299 #else 300 struct direct_entry *dt = dxr->d; 301 #endif 302 struct direct_entry de; 303 struct range_entry_long *rt; 304 uint32_t base; 305 uint32_t upperbound; 306 uint32_t middle; 307 uint32_t lowerbound; 308 uint32_t masked_dst; 309 310 #ifdef DXR2 311 xi = (dt[dst >> dxr->d_shift] << dxr->x_shift) + 312 ((dst >> DXR_RANGE_SHIFT) & dxr->x_mask); 313 de = xt[xi]; 314 #else 315 de = dt[dst >> DXR_RANGE_SHIFT]; 316 #endif 317 318 if (__predict_true(de.fragments == FRAGS_MARK_HIT)) 319 return (de.base); 320 321 rt = dxr->r; 322 base = de.base; 323 lowerbound = 0; 324 masked_dst = dst & DXR_RANGE_MASK; 325 326 #if DXR_TRIE_BITS < 24 327 if (__predict_true(IS_SHORT_FORMAT(de.fragments))) { 328 upperbound = de.fragments & FRAGS_MASK_SHORT; 329 struct range_entry_short *range = 330 (struct range_entry_short *) &rt[base]; 331 332 masked_dst >>= 8; 333 middle = upperbound; 334 upperbound = upperbound * 2 + 1; 335 336 for (;;) { 337 DXR_LOOKUP_STAGE 338 DXR_LOOKUP_STAGE 339 } 340 } 341 #endif 342 343 upperbound = de.fragments; 344 middle = upperbound / 2; 345 struct range_entry_long *range = &rt[base]; 346 if (__predict_false(IS_XL_FORMAT(de.fragments))) { 347 upperbound = *((uint32_t *) range); 348 range++; 349 middle = upperbound / 2; 350 } 351 352 for (;;) { 353 DXR_LOOKUP_STAGE 354 DXR_LOOKUP_STAGE 355 } 356 } 357 358 static void 359 initheap(struct dxr_aux *da, uint32_t dst_u32, uint32_t chunk) 360 { 361 struct heap_entry *fhp = &da->heap[0]; 362 struct rtentry *rt; 363 struct route_nhop_data rnd; 364 365 da->heap_index = 0; 366 da->dst.sin_addr.s_addr = htonl(dst_u32); 367 rt = fib4_lookup_rt(da->fibnum, da->dst.sin_addr, 0, NHR_UNLOCKED, 368 &rnd); 369 if (rt != NULL) { 370 struct in_addr addr; 371 uint32_t scopeid; 372 373 rt_get_inet_prefix_plen(rt, &addr, &fhp->preflen, &scopeid); 374 fhp->start = ntohl(addr.s_addr); 375 fhp->end = fhp->start; 376 if (fhp->preflen < 32) 377 fhp->end |= (0xffffffffU >> fhp->preflen); 378 fhp->nexthop = fib_get_nhop_idx(da->fd, rnd.rnd_nhop); 379 } else { 380 fhp->preflen = fhp->nexthop = fhp->start = 0; 381 fhp->end = 0xffffffffU; 382 } 383 } 384 385 static uint32_t 386 chunk_size(struct dxr_aux *da, struct direct_entry *fdesc) 387 { 388 389 if (IS_SHORT_FORMAT(fdesc->fragments)) 390 return ((fdesc->fragments & FRAGS_MASK_SHORT) + 1); 391 else if (IS_XL_FORMAT(fdesc->fragments)) 392 return (da->range_tbl[fdesc->base].fragments + 2); 393 else /* if (IS_LONG_FORMAT(fdesc->fragments)) */ 394 return (fdesc->fragments + 1); 395 } 396 397 static uint32_t 398 chunk_hash(struct dxr_aux *da, struct direct_entry *fdesc) 399 { 400 uint32_t size = chunk_size(da, fdesc); 401 uint32_t *p = (uint32_t *) &da->range_tbl[fdesc->base]; 402 uint32_t *l = (uint32_t *) &da->range_tbl[fdesc->base + size]; 403 uint32_t hash = fdesc->fragments; 404 405 for (; p < l; p++) 406 hash = (hash << 7) + (hash >> 13) + *p; 407 408 return (hash + (hash >> 16)); 409 } 410 411 static int 412 chunk_ref(struct dxr_aux *da, uint32_t chunk) 413 { 414 struct direct_entry *fdesc = &da->direct_tbl[chunk]; 415 struct chunk_desc *cdp, *empty_cdp; 416 uint32_t base = fdesc->base; 417 uint32_t size = chunk_size(da, fdesc); 418 uint32_t hash = chunk_hash(da, fdesc); 419 int i; 420 421 /* Find an existing descriptor */ 422 LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK], 423 cd_hash_le) { 424 if (cdp->cd_hash != hash || cdp->cd_cur_size != size || 425 memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base], 426 sizeof(struct range_entry_long) * size)) 427 continue; 428 da->rtbl_top = fdesc->base; 429 fdesc->base = cdp->cd_base; 430 cdp->cd_refcnt++; 431 return (0); 432 } 433 434 /* No matching chunks found. Find an empty one to recycle. */ 435 for (cdp = NULL, i = size; cdp == NULL && i < UNUSED_BUCKETS; i++) 436 cdp = LIST_FIRST(&da->unused_chunks[i]); 437 438 if (cdp == NULL) 439 LIST_FOREACH(empty_cdp, &da->unused_chunks[0], cd_hash_le) 440 if (empty_cdp->cd_max_size >= size && (cdp == NULL || 441 empty_cdp->cd_max_size < cdp->cd_max_size)) { 442 cdp = empty_cdp; 443 if (empty_cdp->cd_max_size == size) 444 break; 445 } 446 447 if (cdp != NULL) { 448 /* Copy from heap into the recycled chunk */ 449 bcopy(&da->range_tbl[fdesc->base], &da->range_tbl[cdp->cd_base], 450 size * sizeof(struct range_entry_long)); 451 fdesc->base = cdp->cd_base; 452 da->rtbl_top -= size; 453 da->unused_chunks_size -= cdp->cd_max_size; 454 if (cdp->cd_max_size > size) { 455 /* Split the range in two, need a new descriptor */ 456 empty_cdp = uma_zalloc(chunk_zone, M_NOWAIT); 457 if (empty_cdp == NULL) 458 return (1); 459 LIST_INSERT_BEFORE(cdp, empty_cdp, cd_all_le); 460 empty_cdp->cd_base = cdp->cd_base + size; 461 empty_cdp->cd_cur_size = 0; 462 empty_cdp->cd_max_size = cdp->cd_max_size - size; 463 464 i = empty_cdp->cd_max_size; 465 if (i >= UNUSED_BUCKETS) 466 i = 0; 467 LIST_INSERT_HEAD(&da->unused_chunks[i], empty_cdp, 468 cd_hash_le); 469 470 da->unused_chunks_size += empty_cdp->cd_max_size; 471 cdp->cd_max_size = size; 472 } 473 LIST_REMOVE(cdp, cd_hash_le); 474 } else { 475 /* Alloc a new descriptor at the top of the heap*/ 476 cdp = uma_zalloc(chunk_zone, M_NOWAIT); 477 if (cdp == NULL) 478 return (1); 479 cdp->cd_max_size = size; 480 cdp->cd_base = fdesc->base; 481 LIST_INSERT_HEAD(&da->all_chunks, cdp, cd_all_le); 482 KASSERT(cdp->cd_base + cdp->cd_max_size == da->rtbl_top, 483 ("dxr: %s %d", __FUNCTION__, __LINE__)); 484 } 485 486 cdp->cd_hash = hash; 487 cdp->cd_refcnt = 1; 488 cdp->cd_cur_size = size; 489 LIST_INSERT_HEAD(&da->chunk_hashtbl[hash & CHUNK_HASH_MASK], cdp, 490 cd_hash_le); 491 if (da->rtbl_top >= da->rtbl_size) { 492 if (da->rtbl_top >= BASE_MAX) { 493 FIB_PRINTF(LOG_ERR, da->fd, 494 "structural limit exceeded at %d " 495 "range table elements", da->rtbl_top); 496 return (1); 497 } 498 da->rtbl_size += RTBL_SIZE_INCR; 499 i = (BASE_MAX - da->rtbl_top) * LOG_DEBUG / BASE_MAX; 500 FIB_PRINTF(i, da->fd, "range table at %d%% structural limit", 501 da->rtbl_top * 100 / BASE_MAX); 502 da->range_tbl = realloc(da->range_tbl, 503 sizeof(*da->range_tbl) * da->rtbl_size + FRAGS_PREF_SHORT, 504 M_DXRAUX, M_NOWAIT); 505 if (da->range_tbl == NULL) 506 return (1); 507 } 508 509 return (0); 510 } 511 512 static void 513 chunk_unref(struct dxr_aux *da, uint32_t chunk) 514 { 515 struct direct_entry *fdesc = &da->direct_tbl[chunk]; 516 struct chunk_desc *cdp, *cdp2; 517 uint32_t base = fdesc->base; 518 uint32_t size = chunk_size(da, fdesc); 519 uint32_t hash = chunk_hash(da, fdesc); 520 int i; 521 522 /* Find the corresponding descriptor */ 523 LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK], 524 cd_hash_le) 525 if (cdp->cd_hash == hash && cdp->cd_cur_size == size && 526 memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base], 527 sizeof(struct range_entry_long) * size) == 0) 528 break; 529 530 KASSERT(cdp != NULL, ("dxr: dangling chunk")); 531 if (--cdp->cd_refcnt > 0) 532 return; 533 534 LIST_REMOVE(cdp, cd_hash_le); 535 da->unused_chunks_size += cdp->cd_max_size; 536 cdp->cd_cur_size = 0; 537 538 /* Attempt to merge with the preceding chunk, if empty */ 539 cdp2 = LIST_NEXT(cdp, cd_all_le); 540 if (cdp2 != NULL && cdp2->cd_cur_size == 0) { 541 KASSERT(cdp2->cd_base + cdp2->cd_max_size == cdp->cd_base, 542 ("dxr: %s %d", __FUNCTION__, __LINE__)); 543 LIST_REMOVE(cdp, cd_all_le); 544 LIST_REMOVE(cdp2, cd_hash_le); 545 cdp2->cd_max_size += cdp->cd_max_size; 546 uma_zfree(chunk_zone, cdp); 547 cdp = cdp2; 548 } 549 550 /* Attempt to merge with the subsequent chunk, if empty */ 551 cdp2 = LIST_PREV(cdp, &da->all_chunks, chunk_desc, cd_all_le); 552 if (cdp2 != NULL && cdp2->cd_cur_size == 0) { 553 KASSERT(cdp->cd_base + cdp->cd_max_size == cdp2->cd_base, 554 ("dxr: %s %d", __FUNCTION__, __LINE__)); 555 LIST_REMOVE(cdp, cd_all_le); 556 LIST_REMOVE(cdp2, cd_hash_le); 557 cdp2->cd_max_size += cdp->cd_max_size; 558 cdp2->cd_base = cdp->cd_base; 559 uma_zfree(chunk_zone, cdp); 560 cdp = cdp2; 561 } 562 563 if (cdp->cd_base + cdp->cd_max_size == da->rtbl_top) { 564 /* Free the chunk on the top of the range heap, trim the heap */ 565 KASSERT(cdp == LIST_FIRST(&da->all_chunks), 566 ("dxr: %s %d", __FUNCTION__, __LINE__)); 567 da->rtbl_top -= cdp->cd_max_size; 568 da->unused_chunks_size -= cdp->cd_max_size; 569 LIST_REMOVE(cdp, cd_all_le); 570 uma_zfree(chunk_zone, cdp); 571 return; 572 } 573 574 i = cdp->cd_max_size; 575 if (i >= UNUSED_BUCKETS) 576 i = 0; 577 LIST_INSERT_HEAD(&da->unused_chunks[i], cdp, cd_hash_le); 578 } 579 580 #ifdef DXR2 581 static uint32_t 582 trie_hash(struct dxr_aux *da, uint32_t dxr_x, uint32_t index) 583 { 584 uint32_t i, *val; 585 uint32_t hash = 0; 586 587 for (i = 0; i < (1 << dxr_x); i++) { 588 hash = (hash << 3) ^ (hash >> 3); 589 val = (uint32_t *) 590 (void *) &da->direct_tbl[(index << dxr_x) + i]; 591 hash += (*val << 5); 592 hash += (*val >> 5); 593 } 594 595 return (hash + (hash >> 16)); 596 } 597 598 static int 599 trie_ref(struct dxr_aux *da, uint32_t index) 600 { 601 struct trie_desc *tp; 602 uint32_t dxr_d = da->d_bits; 603 uint32_t dxr_x = DXR_TRIE_BITS - dxr_d; 604 uint32_t hash = trie_hash(da, dxr_x, index); 605 606 /* Find an existing descriptor */ 607 LIST_FOREACH(tp, &da->trie_hashtbl[hash & TRIE_HASH_MASK], td_hash_le) 608 if (tp->td_hash == hash && 609 memcmp(&da->direct_tbl[index << dxr_x], 610 &da->x_tbl[tp->td_index << dxr_x], 611 sizeof(*da->x_tbl) << dxr_x) == 0) { 612 tp->td_refcnt++; 613 da->trietbl[index] = tp; 614 return(tp->td_index); 615 } 616 617 tp = LIST_FIRST(&da->unused_trie); 618 if (tp != NULL) { 619 LIST_REMOVE(tp, td_hash_le); 620 da->unused_trie_cnt--; 621 } else { 622 tp = uma_zalloc(trie_zone, M_NOWAIT); 623 if (tp == NULL) 624 return (-1); 625 LIST_INSERT_HEAD(&da->all_trie, tp, td_all_le); 626 tp->td_index = da->all_trie_cnt++; 627 } 628 629 tp->td_hash = hash; 630 tp->td_refcnt = 1; 631 LIST_INSERT_HEAD(&da->trie_hashtbl[hash & TRIE_HASH_MASK], tp, 632 td_hash_le); 633 memcpy(&da->x_tbl[tp->td_index << dxr_x], 634 &da->direct_tbl[index << dxr_x], sizeof(*da->x_tbl) << dxr_x); 635 da->trietbl[index] = tp; 636 if (da->all_trie_cnt >= da->xtbl_size >> dxr_x) { 637 da->xtbl_size += XTBL_SIZE_INCR; 638 da->x_tbl = realloc(da->x_tbl, 639 sizeof(*da->x_tbl) * da->xtbl_size, M_DXRAUX, M_NOWAIT); 640 if (da->x_tbl == NULL) 641 return (-1); 642 } 643 return(tp->td_index); 644 } 645 646 static void 647 trie_unref(struct dxr_aux *da, uint32_t index) 648 { 649 struct trie_desc *tp = da->trietbl[index]; 650 651 if (tp == NULL) 652 return; 653 da->trietbl[index] = NULL; 654 if (--tp->td_refcnt > 0) 655 return; 656 657 LIST_REMOVE(tp, td_hash_le); 658 da->unused_trie_cnt++; 659 if (tp->td_index != da->all_trie_cnt - 1) { 660 LIST_INSERT_HEAD(&da->unused_trie, tp, td_hash_le); 661 return; 662 } 663 664 do { 665 da->all_trie_cnt--; 666 da->unused_trie_cnt--; 667 LIST_REMOVE(tp, td_all_le); 668 uma_zfree(trie_zone, tp); 669 LIST_FOREACH(tp, &da->unused_trie, td_hash_le) 670 if (tp->td_index == da->all_trie_cnt - 1) { 671 LIST_REMOVE(tp, td_hash_le); 672 break; 673 } 674 } while (tp != NULL); 675 } 676 #endif 677 678 static void 679 heap_inject(struct dxr_aux *da, uint32_t start, uint32_t end, uint32_t preflen, 680 uint32_t nh) 681 { 682 struct heap_entry *fhp; 683 int i; 684 685 for (i = da->heap_index; i >= 0; i--) { 686 if (preflen > da->heap[i].preflen) 687 break; 688 else if (preflen < da->heap[i].preflen) 689 da->heap[i + 1] = da->heap[i]; 690 else 691 return; 692 } 693 694 fhp = &da->heap[i + 1]; 695 fhp->preflen = preflen; 696 fhp->start = start; 697 fhp->end = end; 698 fhp->nexthop = nh; 699 da->heap_index++; 700 } 701 702 static int 703 dxr_walk(struct rtentry *rt, void *arg) 704 { 705 struct dxr_aux *da = arg; 706 uint32_t chunk = da->work_chunk; 707 uint32_t first = chunk << DXR_RANGE_SHIFT; 708 uint32_t last = first | DXR_RANGE_MASK; 709 struct range_entry_long *fp = 710 &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re; 711 struct heap_entry *fhp = &da->heap[da->heap_index]; 712 uint32_t preflen, nh, start, end, scopeid; 713 struct in_addr addr; 714 715 rt_get_inet_prefix_plen(rt, &addr, &preflen, &scopeid); 716 start = ntohl(addr.s_addr); 717 if (start > last) 718 return (-1); /* Beyond chunk boundaries, we are done */ 719 if (start < first) 720 return (0); /* Skip this route */ 721 722 end = start; 723 if (preflen < 32) 724 end |= (0xffffffffU >> preflen); 725 nh = fib_get_nhop_idx(da->fd, rt_get_raw_nhop(rt)); 726 727 if (start == fhp->start) 728 heap_inject(da, start, end, preflen, nh); 729 else { 730 /* start > fhp->start */ 731 while (start > fhp->end) { 732 uint32_t oend = fhp->end; 733 734 if (da->heap_index > 0) { 735 fhp--; 736 da->heap_index--; 737 } else 738 initheap(da, fhp->end + 1, chunk); 739 if (fhp->end > oend && fhp->nexthop != fp->nexthop) { 740 fp++; 741 da->rtbl_work_frags++; 742 fp->start = (oend + 1) & DXR_RANGE_MASK; 743 fp->nexthop = fhp->nexthop; 744 } 745 } 746 if (start > ((chunk << DXR_RANGE_SHIFT) | fp->start) && 747 nh != fp->nexthop) { 748 fp++; 749 da->rtbl_work_frags++; 750 fp->start = start & DXR_RANGE_MASK; 751 } else if (da->rtbl_work_frags) { 752 if ((--fp)->nexthop == nh) 753 da->rtbl_work_frags--; 754 else 755 fp++; 756 } 757 fp->nexthop = nh; 758 heap_inject(da, start, end, preflen, nh); 759 } 760 761 return (0); 762 } 763 764 static int 765 update_chunk(struct dxr_aux *da, uint32_t chunk) 766 { 767 struct range_entry_long *fp; 768 #if DXR_TRIE_BITS < 24 769 struct range_entry_short *fps; 770 uint32_t start, nh, i; 771 #endif 772 struct heap_entry *fhp; 773 uint32_t first = chunk << DXR_RANGE_SHIFT; 774 uint32_t last = first | DXR_RANGE_MASK; 775 776 if (da->direct_tbl[chunk].fragments != FRAGS_MARK_HIT) 777 chunk_unref(da, chunk); 778 779 initheap(da, first, chunk); 780 781 fp = &da->range_tbl[da->rtbl_top].re; 782 da->rtbl_work_frags = 0; 783 fp->start = first & DXR_RANGE_MASK; 784 fp->nexthop = da->heap[0].nexthop; 785 786 da->dst.sin_addr.s_addr = htonl(first); 787 da->mask.sin_addr.s_addr = htonl(~DXR_RANGE_MASK); 788 789 da->work_chunk = chunk; 790 rib_walk_from(da->fibnum, AF_INET, RIB_FLAG_LOCKED, 791 (struct sockaddr *) &da->dst, (struct sockaddr *) &da->mask, 792 dxr_walk, da); 793 794 /* Flush any remaining objects on the heap */ 795 fp = &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re; 796 fhp = &da->heap[da->heap_index]; 797 while (fhp->preflen > DXR_TRIE_BITS) { 798 uint32_t oend = fhp->end; 799 800 if (da->heap_index > 0) { 801 fhp--; 802 da->heap_index--; 803 } else 804 initheap(da, fhp->end + 1, chunk); 805 if (fhp->end > oend && fhp->nexthop != fp->nexthop) { 806 /* Have we crossed the upper chunk boundary? */ 807 if (oend >= last) 808 break; 809 fp++; 810 da->rtbl_work_frags++; 811 fp->start = (oend + 1) & DXR_RANGE_MASK; 812 fp->nexthop = fhp->nexthop; 813 } 814 } 815 816 /* Direct hit if the chunk contains only a single fragment */ 817 if (da->rtbl_work_frags == 0) { 818 da->direct_tbl[chunk].base = fp->nexthop; 819 da->direct_tbl[chunk].fragments = FRAGS_MARK_HIT; 820 return (0); 821 } 822 823 da->direct_tbl[chunk].base = da->rtbl_top; 824 da->direct_tbl[chunk].fragments = da->rtbl_work_frags; 825 826 #if DXR_TRIE_BITS < 24 827 /* Check whether the chunk can be more compactly encoded */ 828 fp = &da->range_tbl[da->rtbl_top].re; 829 for (i = 0; i <= da->rtbl_work_frags; i++, fp++) 830 if ((fp->start & 0xff) != 0 || fp->nexthop > RE_SHORT_MAX_NH) 831 break; 832 if (i == da->rtbl_work_frags + 1) { 833 fp = &da->range_tbl[da->rtbl_top].re; 834 fps = (void *) fp; 835 for (i = 0; i <= da->rtbl_work_frags; i++, fp++, fps++) { 836 start = fp->start; 837 nh = fp->nexthop; 838 fps->start = start >> 8; 839 fps->nexthop = nh; 840 } 841 fps->start = start >> 8; 842 fps->nexthop = nh; 843 da->rtbl_work_frags >>= 1; 844 da->direct_tbl[chunk].fragments = 845 da->rtbl_work_frags | FRAGS_PREF_SHORT; 846 } else 847 #endif 848 if (da->rtbl_work_frags >= FRAGS_MARK_HIT) { 849 da->direct_tbl[chunk].fragments = FRAGS_MARK_XL; 850 memmove(&da->range_tbl[da->rtbl_top + 1], 851 &da->range_tbl[da->rtbl_top], 852 (da->rtbl_work_frags + 1) * sizeof(*da->range_tbl)); 853 da->range_tbl[da->rtbl_top].fragments = da->rtbl_work_frags; 854 da->rtbl_work_frags++; 855 } 856 da->rtbl_top += (da->rtbl_work_frags + 1); 857 return (chunk_ref(da, chunk)); 858 } 859 860 static void 861 dxr_build(struct dxr *dxr) 862 { 863 struct dxr_aux *da = dxr->aux; 864 struct chunk_desc *cdp; 865 struct rib_rtable_info rinfo; 866 struct timeval t0, t1, t2, t3; 867 uint32_t r_size, dxr_tot_size; 868 uint32_t i, m, range_rebuild = 0; 869 uint32_t range_frag; 870 #ifdef DXR2 871 struct trie_desc *tp; 872 uint32_t d_tbl_size, dxr_x, d_size, x_size; 873 uint32_t ti, trie_rebuild = 0, prev_size = 0; 874 uint32_t trie_frag; 875 #endif 876 877 KASSERT(dxr->d == NULL, ("dxr: d not free")); 878 879 if (da == NULL) { 880 da = malloc(sizeof(*dxr->aux), M_DXRAUX, M_NOWAIT); 881 if (da == NULL) 882 return; 883 dxr->aux = da; 884 da->fibnum = dxr->fibnum; 885 da->refcnt = 1; 886 LIST_INIT(&da->all_chunks); 887 LIST_INIT(&da->all_trie); 888 da->rtbl_size = RTBL_SIZE_INCR; 889 da->range_tbl = NULL; 890 da->xtbl_size = XTBL_SIZE_INCR; 891 da->x_tbl = NULL; 892 bzero(&da->dst, sizeof(da->dst)); 893 bzero(&da->mask, sizeof(da->mask)); 894 da->dst.sin_len = sizeof(da->dst); 895 da->mask.sin_len = sizeof(da->mask); 896 da->dst.sin_family = AF_INET; 897 da->mask.sin_family = AF_INET; 898 } 899 if (da->range_tbl == NULL) { 900 da->range_tbl = malloc(sizeof(*da->range_tbl) * da->rtbl_size 901 + FRAGS_PREF_SHORT, M_DXRAUX, M_NOWAIT); 902 if (da->range_tbl == NULL) 903 return; 904 range_rebuild = 1; 905 } 906 #ifdef DXR2 907 if (da->x_tbl == NULL) { 908 da->x_tbl = malloc(sizeof(*da->x_tbl) * da->xtbl_size, 909 M_DXRAUX, M_NOWAIT); 910 if (da->x_tbl == NULL) 911 return; 912 trie_rebuild = 1; 913 } 914 #endif 915 da->fd = dxr->fd; 916 917 microuptime(&t0); 918 919 dxr->nh_tbl = fib_get_nhop_array(da->fd); 920 fib_get_rtable_info(fib_get_rh(da->fd), &rinfo); 921 922 if (da->updates_low > da->updates_high) 923 range_rebuild = 1; 924 925 range_build: 926 if (range_rebuild) { 927 /* Bulk cleanup */ 928 bzero(da->chunk_hashtbl, sizeof(da->chunk_hashtbl)); 929 while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) { 930 LIST_REMOVE(cdp, cd_all_le); 931 uma_zfree(chunk_zone, cdp); 932 } 933 for (i = 0; i < UNUSED_BUCKETS; i++) 934 LIST_INIT(&da->unused_chunks[i]); 935 da->unused_chunks_size = 0; 936 da->rtbl_top = 0; 937 da->updates_low = 0; 938 da->updates_high = DIRECT_TBL_SIZE - 1; 939 memset(da->updates_mask, 0xff, sizeof(da->updates_mask)); 940 for (i = 0; i < DIRECT_TBL_SIZE; i++) { 941 da->direct_tbl[i].fragments = FRAGS_MARK_HIT; 942 da->direct_tbl[i].base = 0; 943 } 944 } 945 da->prefixes = rinfo.num_prefixes; 946 947 /* DXR: construct direct & range table */ 948 for (i = da->updates_low; i <= da->updates_high; i++) { 949 m = da->updates_mask[i >> 5] >> (i & 0x1f); 950 if (m == 0) 951 i |= 0x1f; 952 else if (m & 1 && update_chunk(da, i) != 0) 953 return; 954 } 955 956 range_frag = 0; 957 if (da->rtbl_top) 958 range_frag = da->unused_chunks_size * 10000ULL / da->rtbl_top; 959 if (range_frag > V_frag_limit) { 960 range_rebuild = 1; 961 goto range_build; 962 } 963 964 r_size = sizeof(*da->range_tbl) * da->rtbl_top; 965 microuptime(&t1); 966 967 #ifdef DXR2 968 if (range_rebuild || 969 abs(fls(da->prefixes) - fls(da->trie_rebuilt_prefixes)) > 1) 970 trie_rebuild = 1; 971 972 trie_build: 973 if (trie_rebuild) { 974 da->trie_rebuilt_prefixes = da->prefixes; 975 da->d_bits = DXR_D; 976 da->updates_low = 0; 977 da->updates_high = DIRECT_TBL_SIZE - 1; 978 if (!range_rebuild) 979 memset(da->updates_mask, 0xff, 980 sizeof(da->updates_mask)); 981 } 982 983 dxr2_try_squeeze: 984 if (trie_rebuild) { 985 /* Bulk cleanup */ 986 bzero(da->trietbl, sizeof(da->trietbl)); 987 bzero(da->trie_hashtbl, sizeof(da->trie_hashtbl)); 988 while ((tp = LIST_FIRST(&da->all_trie)) != NULL) { 989 LIST_REMOVE(tp, td_all_le); 990 uma_zfree(trie_zone, tp); 991 } 992 LIST_INIT(&da->unused_trie); 993 da->all_trie_cnt = da->unused_trie_cnt = 0; 994 } 995 996 /* Populate d_tbl, x_tbl */ 997 dxr_x = DXR_TRIE_BITS - da->d_bits; 998 d_tbl_size = (1 << da->d_bits); 999 1000 for (i = da->updates_low >> dxr_x; i <= da->updates_high >> dxr_x; 1001 i++) { 1002 if (!trie_rebuild) { 1003 m = 0; 1004 for (int j = 0; j < (1 << dxr_x); j += 32) 1005 m |= da->updates_mask[((i << dxr_x) + j) >> 5]; 1006 if (m == 0) 1007 continue; 1008 trie_unref(da, i); 1009 } 1010 ti = trie_ref(da, i); 1011 if (ti < 0) 1012 return; 1013 da->d_tbl[i] = ti; 1014 } 1015 1016 trie_frag = 0; 1017 if (da->all_trie_cnt) 1018 trie_frag = da->unused_trie_cnt * 10000ULL / da->all_trie_cnt; 1019 if (trie_frag > V_frag_limit) { 1020 trie_rebuild = 1; 1021 goto trie_build; 1022 } 1023 1024 d_size = sizeof(*da->d_tbl) * d_tbl_size; 1025 x_size = sizeof(*da->x_tbl) * DIRECT_TBL_SIZE / d_tbl_size 1026 * da->all_trie_cnt; 1027 dxr_tot_size = d_size + x_size + r_size; 1028 1029 if (trie_rebuild == 1) { 1030 /* Try to find a more compact D/X split */ 1031 if (prev_size == 0 || dxr_tot_size <= prev_size) 1032 da->d_bits--; 1033 else { 1034 da->d_bits++; 1035 trie_rebuild = 2; 1036 } 1037 prev_size = dxr_tot_size; 1038 goto dxr2_try_squeeze; 1039 } 1040 microuptime(&t2); 1041 #else /* !DXR2 */ 1042 dxr_tot_size = sizeof(da->direct_tbl) + r_size; 1043 t2 = t1; 1044 #endif 1045 1046 dxr->d = malloc(dxr_tot_size, M_DXRLPM, M_NOWAIT); 1047 if (dxr->d == NULL) 1048 return; 1049 #ifdef DXR2 1050 memcpy(dxr->d, da->d_tbl, d_size); 1051 dxr->x = ((char *) dxr->d) + d_size; 1052 memcpy(dxr->x, da->x_tbl, x_size); 1053 dxr->r = ((char *) dxr->x) + x_size; 1054 dxr->d_shift = 32 - da->d_bits; 1055 dxr->x_shift = dxr_x; 1056 dxr->x_mask = 0xffffffffU >> (32 - dxr_x); 1057 #else /* !DXR2 */ 1058 memcpy(dxr->d, da->direct_tbl, sizeof(da->direct_tbl)); 1059 dxr->r = ((char *) dxr->d) + sizeof(da->direct_tbl); 1060 #endif 1061 memcpy(dxr->r, da->range_tbl, r_size); 1062 1063 if (da->updates_low <= da->updates_high) 1064 bzero(&da->updates_mask[da->updates_low / 32], 1065 (da->updates_high - da->updates_low) / 8 + 1); 1066 da->updates_low = DIRECT_TBL_SIZE - 1; 1067 da->updates_high = 0; 1068 microuptime(&t3); 1069 1070 #ifdef DXR2 1071 FIB_PRINTF(LOG_INFO, da->fd, "D%dX%dR, %d prefixes, %d nhops (max)", 1072 da->d_bits, dxr_x, rinfo.num_prefixes, rinfo.num_nhops); 1073 #else 1074 FIB_PRINTF(LOG_INFO, da->fd, "D%dR, %d prefixes, %d nhops (max)", 1075 DXR_D, rinfo.num_prefixes, rinfo.num_nhops); 1076 #endif 1077 i = dxr_tot_size * 100; 1078 if (rinfo.num_prefixes) 1079 i /= rinfo.num_prefixes; 1080 FIB_PRINTF(LOG_INFO, da->fd, "%d.%02d KBytes, %d.%02d Bytes/prefix", 1081 dxr_tot_size / 1024, dxr_tot_size * 100 / 1024 % 100, 1082 i / 100, i % 100); 1083 #ifdef DXR2 1084 FIB_PRINTF(LOG_INFO, da->fd, 1085 "%d.%02d%% trie, %d.%02d%% range fragmentation", 1086 trie_frag / 100, trie_frag % 100, 1087 range_frag / 100, range_frag % 100); 1088 #else 1089 FIB_PRINTF(LOG_INFO, da->fd, "%d.%01d%% range fragmentation", 1090 range_frag / 100, range_frag % 100); 1091 #endif 1092 i = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec; 1093 FIB_PRINTF(LOG_INFO, da->fd, "range table %s in %u.%03u ms", 1094 range_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000); 1095 #ifdef DXR2 1096 i = (t2.tv_sec - t1.tv_sec) * 1000000 + t2.tv_usec - t1.tv_usec; 1097 FIB_PRINTF(LOG_INFO, da->fd, "trie %s in %u.%03u ms", 1098 trie_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000); 1099 #endif 1100 i = (t3.tv_sec - t2.tv_sec) * 1000000 + t3.tv_usec - t2.tv_usec; 1101 FIB_PRINTF(LOG_INFO, da->fd, "snapshot forked in %u.%03u ms", 1102 i / 1000, i % 1000); 1103 } 1104 1105 /* 1106 * Glue functions for attaching to FreeBSD 13 fib_algo infrastructure. 1107 */ 1108 1109 static struct nhop_object * 1110 dxr_fib_lookup(void *algo_data, const struct flm_lookup_key key, 1111 uint32_t scopeid) 1112 { 1113 struct dxr *dxr = algo_data; 1114 uint32_t nh; 1115 1116 nh = dxr_lookup(dxr, ntohl(key.addr4.s_addr)); 1117 1118 return (dxr->nh_tbl[nh]); 1119 } 1120 1121 static enum flm_op_result 1122 dxr_init(uint32_t fibnum, struct fib_data *fd, void *old_data, void **data) 1123 { 1124 struct dxr *old_dxr = old_data; 1125 struct dxr_aux *da = NULL; 1126 struct dxr *dxr; 1127 1128 dxr = malloc(sizeof(*dxr), M_DXRAUX, M_NOWAIT); 1129 if (dxr == NULL) 1130 return (FLM_REBUILD); 1131 1132 /* Check whether we may reuse the old auxiliary structures */ 1133 if (old_dxr != NULL && old_dxr->aux != NULL) { 1134 da = old_dxr->aux; 1135 atomic_add_int(&da->refcnt, 1); 1136 } 1137 1138 dxr->aux = da; 1139 dxr->d = NULL; 1140 dxr->fd = fd; 1141 dxr->fibnum = fibnum; 1142 *data = dxr; 1143 1144 return (FLM_SUCCESS); 1145 } 1146 1147 static void 1148 dxr_destroy(void *data) 1149 { 1150 struct dxr *dxr = data; 1151 struct dxr_aux *da; 1152 struct chunk_desc *cdp; 1153 struct trie_desc *tp; 1154 1155 if (dxr->d != NULL) 1156 free(dxr->d, M_DXRLPM); 1157 1158 da = dxr->aux; 1159 free(dxr, M_DXRAUX); 1160 1161 if (da == NULL || atomic_fetchadd_int(&da->refcnt, -1) > 1) 1162 return; 1163 1164 /* Release auxiliary structures */ 1165 while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) { 1166 LIST_REMOVE(cdp, cd_all_le); 1167 uma_zfree(chunk_zone, cdp); 1168 } 1169 while ((tp = LIST_FIRST(&da->all_trie)) != NULL) { 1170 LIST_REMOVE(tp, td_all_le); 1171 uma_zfree(trie_zone, tp); 1172 } 1173 free(da->range_tbl, M_DXRAUX); 1174 free(da->x_tbl, M_DXRAUX); 1175 free(da, M_DXRAUX); 1176 } 1177 1178 static void 1179 epoch_dxr_destroy(epoch_context_t ctx) 1180 { 1181 struct dxr *dxr = __containerof(ctx, struct dxr, epoch_ctx); 1182 1183 dxr_destroy(dxr); 1184 } 1185 1186 static enum flm_op_result 1187 dxr_dump_end(void *data, struct fib_dp *dp) 1188 { 1189 struct dxr *dxr = data; 1190 struct dxr_aux *da; 1191 1192 dxr_build(dxr); 1193 1194 da = dxr->aux; 1195 if (da == NULL) 1196 return (FLM_REBUILD); 1197 1198 /* Structural limit exceeded, hard error */ 1199 if (da->rtbl_top >= BASE_MAX) 1200 return (FLM_ERROR); 1201 1202 /* A malloc(,, M_NOWAIT) failed somewhere, retry later */ 1203 if (dxr->d == NULL) 1204 return (FLM_REBUILD); 1205 1206 dp->f = dxr_fib_lookup; 1207 dp->arg = dxr; 1208 1209 return (FLM_SUCCESS); 1210 } 1211 1212 static enum flm_op_result 1213 dxr_dump_rib_item(struct rtentry *rt, void *data) 1214 { 1215 1216 return (FLM_SUCCESS); 1217 } 1218 1219 static enum flm_op_result 1220 dxr_change_rib_item(struct rib_head *rnh, struct rib_cmd_info *rc, 1221 void *data) 1222 { 1223 1224 return (FLM_BATCH); 1225 } 1226 1227 static enum flm_op_result 1228 dxr_change_rib_batch(struct rib_head *rnh, struct fib_change_queue *q, 1229 void *data) 1230 { 1231 struct dxr *dxr = data; 1232 struct dxr *new_dxr; 1233 struct dxr_aux *da; 1234 struct fib_dp new_dp; 1235 enum flm_op_result res; 1236 uint32_t ip, plen, hmask, start, end, i, ui; 1237 #ifdef INVARIANTS 1238 struct rib_rtable_info rinfo; 1239 int update_delta = 0; 1240 #endif 1241 1242 KASSERT(data != NULL, ("%s: NULL data", __FUNCTION__)); 1243 KASSERT(q != NULL, ("%s: NULL q", __FUNCTION__)); 1244 KASSERT(q->count < q->size, ("%s: q->count %d q->size %d", 1245 __FUNCTION__, q->count, q->size)); 1246 1247 da = dxr->aux; 1248 KASSERT(da != NULL, ("%s: NULL dxr->aux", __FUNCTION__)); 1249 1250 FIB_PRINTF(LOG_INFO, da->fd, "processing %d update(s)", q->count); 1251 for (ui = 0; ui < q->count; ui++) { 1252 #ifdef INVARIANTS 1253 if (q->entries[ui].nh_new != NULL) 1254 update_delta++; 1255 if (q->entries[ui].nh_old != NULL) 1256 update_delta--; 1257 #endif 1258 plen = q->entries[ui].plen; 1259 ip = ntohl(q->entries[ui].addr4.s_addr); 1260 if (plen < 32) 1261 hmask = 0xffffffffU >> plen; 1262 else 1263 hmask = 0; 1264 start = (ip & ~hmask) >> DXR_RANGE_SHIFT; 1265 end = (ip | hmask) >> DXR_RANGE_SHIFT; 1266 1267 if ((start & 0x1f) == 0 && (end & 0x1f) == 0x1f) 1268 for (i = start >> 5; i <= end >> 5; i++) 1269 da->updates_mask[i] = 0xffffffffU; 1270 else 1271 for (i = start; i <= end; i++) 1272 da->updates_mask[i >> 5] |= (1 << (i & 0x1f)); 1273 if (start < da->updates_low) 1274 da->updates_low = start; 1275 if (end > da->updates_high) 1276 da->updates_high = end; 1277 } 1278 1279 #ifdef INVARIANTS 1280 fib_get_rtable_info(fib_get_rh(da->fd), &rinfo); 1281 KASSERT(da->prefixes + update_delta == rinfo.num_prefixes, 1282 ("%s: update count mismatch", __FUNCTION__)); 1283 #endif 1284 1285 res = dxr_init(0, dxr->fd, data, (void **) &new_dxr); 1286 if (res != FLM_SUCCESS) 1287 return (res); 1288 1289 dxr_build(new_dxr); 1290 1291 /* Structural limit exceeded, hard error */ 1292 if (da->rtbl_top >= BASE_MAX) { 1293 dxr_destroy(new_dxr); 1294 return (FLM_ERROR); 1295 } 1296 1297 /* A malloc(,, M_NOWAIT) failed somewhere, retry later */ 1298 if (new_dxr->d == NULL) { 1299 dxr_destroy(new_dxr); 1300 return (FLM_REBUILD); 1301 } 1302 1303 new_dp.f = dxr_fib_lookup; 1304 new_dp.arg = new_dxr; 1305 if (fib_set_datapath_ptr(dxr->fd, &new_dp)) { 1306 fib_set_algo_ptr(dxr->fd, new_dxr); 1307 fib_epoch_call(epoch_dxr_destroy, &dxr->epoch_ctx); 1308 return (FLM_SUCCESS); 1309 } 1310 1311 dxr_destroy(new_dxr); 1312 return (FLM_REBUILD); 1313 } 1314 1315 static uint8_t 1316 dxr_get_pref(const struct rib_rtable_info *rinfo) 1317 { 1318 1319 /* Below bsearch4 up to 10 prefixes. Always supersedes dpdk_lpm4. */ 1320 return (251); 1321 } 1322 1323 static struct fib_lookup_module fib_dxr_mod = { 1324 .flm_name = "dxr", 1325 .flm_family = AF_INET, 1326 .flm_init_cb = dxr_init, 1327 .flm_destroy_cb = dxr_destroy, 1328 .flm_dump_rib_item_cb = dxr_dump_rib_item, 1329 .flm_dump_end_cb = dxr_dump_end, 1330 .flm_change_rib_item_cb = dxr_change_rib_item, 1331 .flm_change_rib_items_cb = dxr_change_rib_batch, 1332 .flm_get_pref = dxr_get_pref, 1333 }; 1334 1335 static int 1336 dxr_modevent(module_t mod, int type, void *unused) 1337 { 1338 int error; 1339 1340 switch (type) { 1341 case MOD_LOAD: 1342 chunk_zone = uma_zcreate("dxr chunk", sizeof(struct chunk_desc), 1343 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); 1344 trie_zone = uma_zcreate("dxr trie", sizeof(struct trie_desc), 1345 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); 1346 fib_module_register(&fib_dxr_mod); 1347 return(0); 1348 case MOD_UNLOAD: 1349 error = fib_module_unregister(&fib_dxr_mod); 1350 if (error) 1351 return (error); 1352 uma_zdestroy(chunk_zone); 1353 uma_zdestroy(trie_zone); 1354 return(0); 1355 default: 1356 return(EOPNOTSUPP); 1357 } 1358 } 1359 1360 static moduledata_t dxr_mod = {"fib_dxr", dxr_modevent, 0}; 1361 1362 DECLARE_MODULE(fib_dxr, dxr_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); 1363 MODULE_VERSION(fib_dxr, 1); 1364