1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2012-2024 Marko Zec
5 * Copyright (c) 2005, 2018 University of Zagreb
6 * Copyright (c) 2005 International Computer Science Institute
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30 /*
31 * An implementation of DXR, a simple IPv4 LPM scheme with compact lookup
32 * structures and a trivial search procedure. More significant bits of
33 * the search key are used to directly index a two-stage trie, while the
34 * remaining bits are used for finding the next hop in a sorted array.
35 * More details in:
36 *
37 * M. Zec, L. Rizzo, M. Mikuc, DXR: towards a billion routing lookups per
38 * second in software, ACM SIGCOMM Computer Communication Review, September
39 * 2012
40 *
41 * M. Zec, M. Mikuc, Pushing the envelope: beyond two billion IP routing
42 * lookups per second on commodity CPUs, IEEE SoftCOM, September 2017, Split
43 */
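
/*
 * Worked example (illustrative): with DXR_TRIE_BITS = 20, a lookup for
 * 198.51.100.17 (0xc6336411) uses the upper 20 bits (0xc6336) as the
 * trie index, while the remaining 12 bits (0x411) become the key for
 * the binary search over that chunk's range fragments.
 */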
44
45 #include "opt_inet.h"
46
47 #include <sys/param.h>
48 #include <sys/kernel.h>
49 #include <sys/ctype.h>
50 #include <sys/epoch.h>
51 #include <sys/malloc.h>
52 #include <sys/module.h>
53 #include <sys/socket.h>
54 #include <sys/sysctl.h>
55 #include <sys/syslog.h>
56
57 #include <vm/uma.h>
58
59 #include <netinet/in.h>
60 #include <netinet/in_fib.h>
61
62 #include <net/route.h>
63 #include <net/route/route_ctl.h>
64 #include <net/route/fib_algo.h>
65
66 #define DXR_TRIE_BITS 20
67
68 CTASSERT(DXR_TRIE_BITS >= 16 && DXR_TRIE_BITS <= 24);
69
70 #if DXR_TRIE_BITS > 16
71 #define DXR_D 16
72 #else
73 #define DXR_D (DXR_TRIE_BITS - 1)
74 #endif
75
76 #define D_TBL_SIZE (1 << DXR_D)
77 #define DIRECT_TBL_SIZE (1 << DXR_TRIE_BITS)
78 #define DXR_RANGE_MASK (0xffffffffU >> DXR_TRIE_BITS)
79 #define DXR_RANGE_SHIFT (32 - DXR_TRIE_BITS)
80
81 #define DESC_BASE_BITS 22
82 #define DESC_FRAGMENTS_BITS (32 - DESC_BASE_BITS)
83 #define BASE_MAX ((1 << DESC_BASE_BITS) - 1)
84 #define RTBL_SIZE_INCR (BASE_MAX / 64)
85
86 #if DXR_TRIE_BITS < 24
87 #define FRAGS_MASK_SHORT ((1 << (23 - DXR_TRIE_BITS)) - 1)
88 #else
89 #define FRAGS_MASK_SHORT 0
90 #endif
91 #define FRAGS_PREF_SHORT (((1 << DESC_FRAGMENTS_BITS) - 1) & \
92 ~FRAGS_MASK_SHORT)
93 #define FRAGS_MARK_XL (FRAGS_PREF_SHORT - 1)
94 #define FRAGS_MARK_HIT (FRAGS_PREF_SHORT - 2)
95
96 #define IS_SHORT_FORMAT(x) ((x & FRAGS_PREF_SHORT) == FRAGS_PREF_SHORT)
97 #define IS_LONG_FORMAT(x) ((x & FRAGS_PREF_SHORT) != FRAGS_PREF_SHORT)
98 #define IS_XL_FORMAT(x) (x == FRAGS_MARK_XL)
99
100 #define RE_SHORT_MAX_NH ((1 << (DXR_TRIE_BITS - 8)) - 1)
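
/*
 * Encoding of the direct_entry "fragments" field, as consumed by
 * chunk_size() and range_lookup(): FRAGS_MARK_HIT means the whole chunk
 * resolves to a single next hop, with "base" holding the next-hop index
 * itself; a value with all FRAGS_PREF_SHORT bits set describes a chunk
 * of 16-bit range_entry_short records, its low bits giving the chunk
 * size in 32-bit words minus one; FRAGS_MARK_XL describes an oversized
 * chunk whose entry count is stored in the first 32-bit word of the
 * chunk itself; any other value is the number of range_entry_long
 * records minus one.
 */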
101
102 #define CHUNK_HASH_BITS 16
103 #define CHUNK_HASH_SIZE (1 << CHUNK_HASH_BITS)
104 #define CHUNK_HASH_MASK (CHUNK_HASH_SIZE - 1)
105
106 #define TRIE_HASH_BITS 16
107 #define TRIE_HASH_SIZE (1 << TRIE_HASH_BITS)
108 #define TRIE_HASH_MASK (TRIE_HASH_SIZE - 1)
109
110 #define XTBL_SIZE_INCR (DIRECT_TBL_SIZE / 16)
111
112 #define UNUSED_BUCKETS 8
113
114 /* Lookup structure elements */
115
116 struct direct_entry {
117 uint32_t fragments: DESC_FRAGMENTS_BITS,
118 base: DESC_BASE_BITS;
119 };
120
121 struct range_entry_long {
122 uint32_t start: DXR_RANGE_SHIFT,
123 nexthop: DXR_TRIE_BITS;
124 };
125
126 #if DXR_TRIE_BITS < 24
127 struct range_entry_short {
128 uint16_t start: DXR_RANGE_SHIFT - 8,
129 nexthop: DXR_TRIE_BITS - 8;
130 };
131 #endif
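
/*
 * Sanity checks (illustrative): the search code indexes arrays of these
 * elements directly and reinterprets a range_entry_long slot as a pair
 * of range_entry_short slots, so the bitfields are expected to pack
 * into exactly 4, 4 and 2 bytes.
 */
CTASSERT(sizeof(struct direct_entry) == 4);
CTASSERT(sizeof(struct range_entry_long) == 4);
#if DXR_TRIE_BITS < 24
CTASSERT(sizeof(struct range_entry_short) == 2);
#endif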
132
133 /* Auxiliary structures */
134
135 struct heap_entry {
136 uint32_t start;
137 uint32_t end;
138 uint32_t preflen;
139 uint32_t nexthop;
140 };
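
/*
 * A heap_entry describes one prefix (address range, length, next hop)
 * on the stack of nested prefixes maintained while scanning a chunk;
 * the heap[] array in struct dxr_aux has 33 slots, one per possible
 * prefix length, seeded by initheap() with the best match for a given
 * address.
 */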
141
142 struct chunk_desc {
143 LIST_ENTRY(chunk_desc) cd_all_le;
144 LIST_ENTRY(chunk_desc) cd_hash_le;
145 uint32_t cd_hash;
146 uint32_t cd_refcnt;
147 uint32_t cd_base;
148 uint32_t cd_cur_size;
149 uint32_t cd_max_size;
150 };
151
152 struct trie_desc {
153 LIST_ENTRY(trie_desc) td_all_le;
154 LIST_ENTRY(trie_desc) td_hash_le;
155 uint32_t td_hash;
156 uint32_t td_index;
157 uint32_t td_refcnt;
158 };
159
160 struct dxr_aux {
161 /* Glue to external state */
162 struct fib_data *fd;
163 uint32_t fibnum;
164 int refcnt;
165
166 /* Auxiliary build-time tables */
167 struct direct_entry direct_tbl[DIRECT_TBL_SIZE];
168 uint16_t d_tbl[D_TBL_SIZE];
169 struct direct_entry *x_tbl;
170 union {
171 struct range_entry_long re;
172 uint32_t fragments;
173 } *range_tbl;
174
175 /* Auxiliary internal state */
176 uint32_t updates_mask[DIRECT_TBL_SIZE / 32];
177 struct trie_desc *trietbl[D_TBL_SIZE];
178 LIST_HEAD(, chunk_desc) chunk_hashtbl[CHUNK_HASH_SIZE];
179 LIST_HEAD(, chunk_desc) all_chunks;
180 LIST_HEAD(, chunk_desc) unused_chunks[UNUSED_BUCKETS];
181 LIST_HEAD(, trie_desc) trie_hashtbl[TRIE_HASH_SIZE];
182 LIST_HEAD(, trie_desc) all_trie;
183 LIST_HEAD(, trie_desc) unused_trie; /* abuses hash link entry */
184 struct sockaddr_in dst;
185 struct sockaddr_in mask;
186 struct heap_entry heap[33];
187 uint32_t prefixes;
188 uint32_t updates_low;
189 uint32_t updates_high;
190 uint32_t unused_chunks_size;
191 uint32_t xtbl_size;
192 uint32_t all_trie_cnt;
193 uint32_t unused_trie_cnt;
194 uint32_t trie_rebuilt_prefixes;
195 uint32_t heap_index;
196 uint32_t d_bits;
197 uint32_t rtbl_size;
198 uint32_t rtbl_top;
199 uint32_t rtbl_work_frags;
200 uint32_t work_chunk;
201 };
202
203 /* Main lookup structure container */
204
205 struct dxr {
206 /* Lookup tables */
207 void *d;
208 void *x;
209 void *r;
210 struct nhop_object **nh_tbl;
211
212 /* Glue to external state */
213 struct dxr_aux *aux;
214 struct fib_data *fd;
215 struct epoch_context epoch_ctx;
216 uint32_t fibnum;
217 uint16_t d_shift;
218 uint16_t x_shift;
219 uint32_t x_mask;
220 };
221
222 static MALLOC_DEFINE(M_DXRLPM, "dxr", "DXR LPM");
223 static MALLOC_DEFINE(M_DXRAUX, "dxr aux", "DXR auxiliary");
224
225 uma_zone_t chunk_zone;
226 uma_zone_t trie_zone;
227
228 VNET_DEFINE_STATIC(int, frag_limit) = 100;
229 #define V_frag_limit VNET(frag_limit)
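
/*
 * The fragmentation limit is expressed in hundredths of a percent: the
 * default of 100 permits 1.00% of wasted range / trie table space
 * before a full rebuild is triggered.
 */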
230
231 /* Binary search for a matching address range */
232 #define DXR_LOOKUP_STAGE \
233 if (masked_dst < range[middle].start) { \
234 upperbound = middle; \
235 middle = (middle + lowerbound) / 2; \
236 } else if (masked_dst < range[middle + 1].start) \
237 return (range[middle].nexthop); \
238 else { \
239 lowerbound = middle + 1; \
240 middle = (upperbound + middle + 1) / 2; \
241 } \
242 if (upperbound == lowerbound) \
243 return (range[lowerbound].nexthop);
244
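/*
 * range_lookup() resolves the low DXR_RANGE_SHIFT bits of the key
 * inside a single chunk.  DXR_LOOKUP_STAGE is one step of a plain
 * binary search; instantiating it twice per loop iteration unrolls the
 * hot loop two steps at a time.  The search returns either on an exact
 * range match or once the bounds collapse onto a single entry.
 */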
245 static int
246 range_lookup(struct range_entry_long *rt, struct direct_entry de, uint32_t dst)
247 {
248 uint32_t base;
249 uint32_t upperbound;
250 uint32_t middle;
251 uint32_t lowerbound;
252 uint32_t masked_dst;
253
254 base = de.base;
255 lowerbound = 0;
256 masked_dst = dst & DXR_RANGE_MASK;
257
258 #if DXR_TRIE_BITS < 24
259 if (__predict_true(IS_SHORT_FORMAT(de.fragments))) {
260 upperbound = de.fragments & FRAGS_MASK_SHORT;
261 struct range_entry_short *range =
262 (struct range_entry_short *) &rt[base];
263
264 masked_dst >>= 8;
265 middle = upperbound;
266 upperbound = upperbound * 2 + 1;
267
268 for (;;) {
269 DXR_LOOKUP_STAGE
270 DXR_LOOKUP_STAGE
271 }
272 }
273 #endif
274
275 upperbound = de.fragments;
276 middle = upperbound / 2;
277 struct range_entry_long *range = &rt[base];
278 if (__predict_false(IS_XL_FORMAT(de.fragments))) {
279 upperbound = *((uint32_t *) range);
280 range++;
281 middle = upperbound / 2;
282 }
283
284 for (;;) {
285 DXR_LOOKUP_STAGE
286 DXR_LOOKUP_STAGE
287 }
288 }
289
290 #define DXR_LOOKUP_DEFINE(D) \
291 static int inline \
292 dxr_lookup_##D(struct dxr *dxr, uint32_t dst) \
293 { \
294 struct direct_entry de; \
295 uint16_t *dt = dxr->d; \
296 struct direct_entry *xt = dxr->x; \
297 \
298 de = xt[(dt[dst >> (32 - (D))] << (DXR_TRIE_BITS - (D))) \
299 + ((dst >> DXR_RANGE_SHIFT) & \
300 (0xffffffffU >> (32 - DXR_TRIE_BITS + (D))))]; \
301 if (__predict_true(de.fragments == FRAGS_MARK_HIT)) \
302 return (de.base); \
303 return (range_lookup(dxr->r, de, dst)); \
304 } \
305 \
306 static struct nhop_object * \
307 dxr_fib_lookup_##D(void *algo_data, \
308 const struct flm_lookup_key key, uint32_t scopeid __unused) \
309 { \
310 struct dxr *dxr = algo_data; \
311 \
312 return (dxr->nh_tbl[dxr_lookup_##D(dxr, \
313 ntohl(key.addr4.s_addr))]); \
314 }
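
/*
 * DXR_LOOKUP_DEFINE instantiates a lookup routine for each supported
 * d_bits value so that the shift and mask amounts become compile-time
 * constants; choose_lookup_fn() selects the matching instance and
 * falls back to the generic dxr_fib_lookup() otherwise.
 */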
315
316 #if DXR_TRIE_BITS > 16
317 DXR_LOOKUP_DEFINE(16)
318 #endif
319 DXR_LOOKUP_DEFINE(15)
320 DXR_LOOKUP_DEFINE(14)
321 DXR_LOOKUP_DEFINE(13)
322 DXR_LOOKUP_DEFINE(12)
323 DXR_LOOKUP_DEFINE(11)
324 DXR_LOOKUP_DEFINE(10)
325 DXR_LOOKUP_DEFINE(9)
326
327 static int inline
328 dxr_lookup(struct dxr *dxr, uint32_t dst)
329 {
330 struct direct_entry de;
331 uint16_t *dt = dxr->d;
332 struct direct_entry *xt = dxr->x;
333
334 de = xt[(dt[dst >> dxr->d_shift] << dxr->x_shift) +
335 ((dst >> DXR_RANGE_SHIFT) & dxr->x_mask)];
336 if (__predict_true(de.fragments == FRAGS_MARK_HIT))
337 return (de.base);
338 return (range_lookup(dxr->r, de, dst));
339 }
340
341 static void
342 initheap(struct dxr_aux *da, uint32_t dst_u32, uint32_t chunk)
343 {
344 struct heap_entry *fhp = &da->heap[0];
345 struct rtentry *rt;
346 struct route_nhop_data rnd;
347
348 da->heap_index = 0;
349 da->dst.sin_addr.s_addr = htonl(dst_u32);
350 rt = fib4_lookup_rt(da->fibnum, da->dst.sin_addr, 0, NHR_UNLOCKED,
351 &rnd);
352 if (rt != NULL) {
353 struct in_addr addr;
354 uint32_t scopeid;
355
356 rt_get_inet_prefix_plen(rt, &addr, &fhp->preflen, &scopeid);
357 fhp->start = ntohl(addr.s_addr);
358 fhp->end = fhp->start;
359 if (fhp->preflen < 32)
360 fhp->end |= (0xffffffffU >> fhp->preflen);
361 fhp->nexthop = fib_get_nhop_idx(da->fd, rnd.rnd_nhop);
362 } else {
363 fhp->preflen = fhp->nexthop = fhp->start = 0;
364 fhp->end = 0xffffffffU;
365 }
366 }
367
368 static uint32_t
369 chunk_size(struct dxr_aux *da, struct direct_entry *fdesc)
370 {
371
372 if (IS_SHORT_FORMAT(fdesc->fragments))
373 return ((fdesc->fragments & FRAGS_MASK_SHORT) + 1);
374 else if (IS_XL_FORMAT(fdesc->fragments))
375 return (da->range_tbl[fdesc->base].fragments + 2);
376 else /* if (IS_LONG_FORMAT(fdesc->fragments)) */
377 return (fdesc->fragments + 1);
378 }
379
380 static uint32_t
381 chunk_hash(struct dxr_aux *da, struct direct_entry *fdesc)
382 {
383 uint32_t size = chunk_size(da, fdesc);
384 uint32_t *p = (uint32_t *) &da->range_tbl[fdesc->base];
385 uint32_t *l = (uint32_t *) &da->range_tbl[fdesc->base + size];
386 uint32_t hash = fdesc->fragments;
387
388 for (; p < l; p++)
389 hash = (hash << 7) + (hash >> 13) + *p;
390
391 return (hash + (hash >> 16));
392 }
393
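/*
 * chunk_ref() publishes the chunk just assembled at the top of the
 * range table heap: an identical existing chunk is shared via its
 * reference count; otherwise the new chunk is either copied into a
 * suitably sized hole left by previously freed chunks or committed in
 * place at the heap top, growing the range table when necessary.
 */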
394 static int
395 chunk_ref(struct dxr_aux *da, uint32_t chunk)
396 {
397 struct direct_entry *fdesc = &da->direct_tbl[chunk];
398 struct chunk_desc *cdp, *empty_cdp;
399 uint32_t base = fdesc->base;
400 uint32_t size = chunk_size(da, fdesc);
401 uint32_t hash = chunk_hash(da, fdesc);
402 int i;
403
404 /* Find an existing descriptor */
405 LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK],
406 cd_hash_le) {
407 if (cdp->cd_hash != hash || cdp->cd_cur_size != size ||
408 memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base],
409 sizeof(struct range_entry_long) * size))
410 continue;
411 da->rtbl_top = fdesc->base;
412 fdesc->base = cdp->cd_base;
413 cdp->cd_refcnt++;
414 return (0);
415 }
416
417 /* No matching chunks found. Find an empty one to recycle. */
418 for (cdp = NULL, i = size; cdp == NULL && i < UNUSED_BUCKETS; i++)
419 cdp = LIST_FIRST(&da->unused_chunks[i]);
420
421 if (cdp == NULL)
422 LIST_FOREACH(empty_cdp, &da->unused_chunks[0], cd_hash_le)
423 if (empty_cdp->cd_max_size >= size && (cdp == NULL ||
424 empty_cdp->cd_max_size < cdp->cd_max_size)) {
425 cdp = empty_cdp;
426 if (empty_cdp->cd_max_size == size)
427 break;
428 }
429
430 if (cdp != NULL) {
431 /* Copy from heap into the recycled chunk */
432 bcopy(&da->range_tbl[fdesc->base], &da->range_tbl[cdp->cd_base],
433 size * sizeof(struct range_entry_long));
434 fdesc->base = cdp->cd_base;
435 da->rtbl_top -= size;
436 da->unused_chunks_size -= cdp->cd_max_size;
437 if (cdp->cd_max_size > size) {
438 /* Split the range in two, need a new descriptor */
439 empty_cdp = uma_zalloc(chunk_zone, M_NOWAIT);
440 if (empty_cdp == NULL)
441 return (1);
442 LIST_INSERT_BEFORE(cdp, empty_cdp, cd_all_le);
443 empty_cdp->cd_base = cdp->cd_base + size;
444 empty_cdp->cd_cur_size = 0;
445 empty_cdp->cd_max_size = cdp->cd_max_size - size;
446
447 i = empty_cdp->cd_max_size;
448 if (i >= UNUSED_BUCKETS)
449 i = 0;
450 LIST_INSERT_HEAD(&da->unused_chunks[i], empty_cdp,
451 cd_hash_le);
452
453 da->unused_chunks_size += empty_cdp->cd_max_size;
454 cdp->cd_max_size = size;
455 }
456 LIST_REMOVE(cdp, cd_hash_le);
457 } else {
458 		/* Alloc a new descriptor at the top of the heap */
459 cdp = uma_zalloc(chunk_zone, M_NOWAIT);
460 if (cdp == NULL)
461 return (1);
462 cdp->cd_max_size = size;
463 cdp->cd_base = fdesc->base;
464 LIST_INSERT_HEAD(&da->all_chunks, cdp, cd_all_le);
465 MPASS(cdp->cd_base + cdp->cd_max_size == da->rtbl_top);
466 }
467
468 cdp->cd_hash = hash;
469 cdp->cd_refcnt = 1;
470 cdp->cd_cur_size = size;
471 LIST_INSERT_HEAD(&da->chunk_hashtbl[hash & CHUNK_HASH_MASK], cdp,
472 cd_hash_le);
473 if (da->rtbl_top >= da->rtbl_size) {
474 if (da->rtbl_top >= BASE_MAX) {
475 FIB_PRINTF(LOG_ERR, da->fd,
476 "structural limit exceeded at %d "
477 "range table elements", da->rtbl_top);
478 return (1);
479 }
480 da->rtbl_size += RTBL_SIZE_INCR;
481 i = (BASE_MAX - da->rtbl_top) * LOG_DEBUG / BASE_MAX;
482 FIB_PRINTF(i, da->fd, "range table at %d%% structural limit",
483 da->rtbl_top * 100 / BASE_MAX);
484 da->range_tbl = realloc(da->range_tbl,
485 sizeof(*da->range_tbl) * da->rtbl_size + FRAGS_PREF_SHORT,
486 M_DXRAUX, M_NOWAIT);
487 if (da->range_tbl == NULL) {
488 FIB_PRINTF(LOG_NOTICE, da->fd,
489 "Unable to allocate DXR range table");
490 return (1);
491 }
492 }
493
494 return (0);
495 }
496
497 static void
498 chunk_unref(struct dxr_aux *da, uint32_t chunk)
499 {
500 struct direct_entry *fdesc = &da->direct_tbl[chunk];
501 struct chunk_desc *cdp, *cdp2;
502 uint32_t base = fdesc->base;
503 uint32_t size = chunk_size(da, fdesc);
504 uint32_t hash = chunk_hash(da, fdesc);
505 int i;
506
507 /* Find the corresponding descriptor */
508 LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK],
509 cd_hash_le)
510 if (cdp->cd_hash == hash && cdp->cd_cur_size == size &&
511 memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base],
512 sizeof(struct range_entry_long) * size) == 0)
513 break;
514
515 MPASS(cdp != NULL);
516 if (--cdp->cd_refcnt > 0)
517 return;
518
519 LIST_REMOVE(cdp, cd_hash_le);
520 da->unused_chunks_size += cdp->cd_max_size;
521 cdp->cd_cur_size = 0;
522
523 /* Attempt to merge with the preceding chunk, if empty */
524 cdp2 = LIST_NEXT(cdp, cd_all_le);
525 if (cdp2 != NULL && cdp2->cd_cur_size == 0) {
526 MPASS(cdp2->cd_base + cdp2->cd_max_size == cdp->cd_base);
527 LIST_REMOVE(cdp, cd_all_le);
528 LIST_REMOVE(cdp2, cd_hash_le);
529 cdp2->cd_max_size += cdp->cd_max_size;
530 uma_zfree(chunk_zone, cdp);
531 cdp = cdp2;
532 }
533
534 /* Attempt to merge with the subsequent chunk, if empty */
535 cdp2 = LIST_PREV(cdp, &da->all_chunks, chunk_desc, cd_all_le);
536 if (cdp2 != NULL && cdp2->cd_cur_size == 0) {
537 MPASS(cdp->cd_base + cdp->cd_max_size == cdp2->cd_base);
538 LIST_REMOVE(cdp, cd_all_le);
539 LIST_REMOVE(cdp2, cd_hash_le);
540 cdp2->cd_max_size += cdp->cd_max_size;
541 cdp2->cd_base = cdp->cd_base;
542 uma_zfree(chunk_zone, cdp);
543 cdp = cdp2;
544 }
545
546 if (cdp->cd_base + cdp->cd_max_size == da->rtbl_top) {
547 /* Free the chunk on the top of the range heap, trim the heap */
548 MPASS(cdp == LIST_FIRST(&da->all_chunks));
549 da->rtbl_top -= cdp->cd_max_size;
550 da->unused_chunks_size -= cdp->cd_max_size;
551 LIST_REMOVE(cdp, cd_all_le);
552 uma_zfree(chunk_zone, cdp);
553 return;
554 }
555
556 i = cdp->cd_max_size;
557 if (i >= UNUSED_BUCKETS)
558 i = 0;
559 LIST_INSERT_HEAD(&da->unused_chunks[i], cdp, cd_hash_le);
560 }
561
562 static uint32_t
563 trie_hash(struct dxr_aux *da, uint32_t dxr_x, uint32_t index)
564 {
565 uint32_t i, *val;
566 uint32_t hash = 0;
567
568 for (i = 0; i < (1 << dxr_x); i++) {
569 hash = (hash << 3) ^ (hash >> 3);
570 val = (uint32_t *)
571 (void *) &da->direct_tbl[(index << dxr_x) + i];
572 hash += (*val << 5);
573 hash += (*val >> 5);
574 }
575
576 return (hash + (hash >> 16));
577 }
578
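/*
 * trie_ref() deduplicates aligned blocks of 2^dxr_x direct table
 * entries: identical blocks are shared through the x_tbl extension
 * table and reference-counted trie descriptors, so d_tbl only needs to
 * hold one 16-bit index per block.
 */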
579 static int
580 trie_ref(struct dxr_aux *da, uint32_t index)
581 {
582 struct trie_desc *tp;
583 uint32_t dxr_d = da->d_bits;
584 uint32_t dxr_x = DXR_TRIE_BITS - dxr_d;
585 uint32_t hash = trie_hash(da, dxr_x, index);
586
587 /* Find an existing descriptor */
588 LIST_FOREACH(tp, &da->trie_hashtbl[hash & TRIE_HASH_MASK], td_hash_le)
589 if (tp->td_hash == hash &&
590 memcmp(&da->direct_tbl[index << dxr_x],
591 &da->x_tbl[tp->td_index << dxr_x],
592 sizeof(*da->x_tbl) << dxr_x) == 0) {
593 tp->td_refcnt++;
594 da->trietbl[index] = tp;
595 return(tp->td_index);
596 }
597
598 tp = LIST_FIRST(&da->unused_trie);
599 if (tp != NULL) {
600 LIST_REMOVE(tp, td_hash_le);
601 da->unused_trie_cnt--;
602 } else {
603 tp = uma_zalloc(trie_zone, M_NOWAIT);
604 if (tp == NULL)
605 return (-1);
606 LIST_INSERT_HEAD(&da->all_trie, tp, td_all_le);
607 tp->td_index = da->all_trie_cnt++;
608 }
609
610 tp->td_hash = hash;
611 tp->td_refcnt = 1;
612 LIST_INSERT_HEAD(&da->trie_hashtbl[hash & TRIE_HASH_MASK], tp,
613 td_hash_le);
614 memcpy(&da->x_tbl[tp->td_index << dxr_x],
615 &da->direct_tbl[index << dxr_x], sizeof(*da->x_tbl) << dxr_x);
616 da->trietbl[index] = tp;
617 if (da->all_trie_cnt >= da->xtbl_size >> dxr_x) {
618 da->xtbl_size += XTBL_SIZE_INCR;
619 da->x_tbl = realloc(da->x_tbl,
620 sizeof(*da->x_tbl) * da->xtbl_size, M_DXRAUX, M_NOWAIT);
621 if (da->x_tbl == NULL) {
622 FIB_PRINTF(LOG_NOTICE, da->fd,
623 "Unable to allocate DXR extension table");
624 return (-1);
625 }
626 }
627 return(tp->td_index);
628 }
629
630 static void
631 trie_unref(struct dxr_aux *da, uint32_t index)
632 {
633 struct trie_desc *tp = da->trietbl[index];
634
635 if (tp == NULL)
636 return;
637 da->trietbl[index] = NULL;
638 if (--tp->td_refcnt > 0)
639 return;
640
641 LIST_REMOVE(tp, td_hash_le);
642 da->unused_trie_cnt++;
643 if (tp->td_index != da->all_trie_cnt - 1) {
644 LIST_INSERT_HEAD(&da->unused_trie, tp, td_hash_le);
645 return;
646 }
647
648 do {
649 da->all_trie_cnt--;
650 da->unused_trie_cnt--;
651 LIST_REMOVE(tp, td_all_le);
652 uma_zfree(trie_zone, tp);
653 LIST_FOREACH(tp, &da->unused_trie, td_hash_le)
654 if (tp->td_index == da->all_trie_cnt - 1) {
655 LIST_REMOVE(tp, td_hash_le);
656 break;
657 }
658 } while (tp != NULL);
659 }
660
661 static void
662 heap_inject(struct dxr_aux *da, uint32_t start, uint32_t end, uint32_t preflen,
663 uint32_t nh)
664 {
665 struct heap_entry *fhp;
666 int i;
667
668 for (i = da->heap_index; i >= 0; i--) {
669 if (preflen > da->heap[i].preflen)
670 break;
671 else if (preflen < da->heap[i].preflen)
672 da->heap[i + 1] = da->heap[i];
673 else
674 return;
675 }
676
677 fhp = &da->heap[i + 1];
678 fhp->preflen = preflen;
679 fhp->start = start;
680 fhp->end = end;
681 fhp->nexthop = nh;
682 da->heap_index++;
683 }
684
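/*
 * dxr_walk() is the rib_walk_from() callback invoked for every route
 * whose destination falls inside the chunk being rebuilt.  It merges
 * the route into the stack of nested prefixes and appends a new range
 * fragment whenever the resolved next hop changes across the address
 * space covered by the chunk.
 */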
685 static int
686 dxr_walk(struct rtentry *rt, void *arg)
687 {
688 struct dxr_aux *da = arg;
689 uint32_t chunk = da->work_chunk;
690 uint32_t first = chunk << DXR_RANGE_SHIFT;
691 uint32_t last = first | DXR_RANGE_MASK;
692 struct range_entry_long *fp =
693 &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re;
694 struct heap_entry *fhp = &da->heap[da->heap_index];
695 uint32_t preflen, nh, start, end, scopeid;
696 struct in_addr addr;
697
698 rt_get_inet_prefix_plen(rt, &addr, &preflen, &scopeid);
699 start = ntohl(addr.s_addr);
700 if (start > last)
701 return (-1); /* Beyond chunk boundaries, we are done */
702 if (start < first)
703 return (0); /* Skip this route */
704
705 end = start;
706 if (preflen < 32)
707 end |= (0xffffffffU >> preflen);
708 nh = fib_get_nhop_idx(da->fd, rt_get_raw_nhop(rt));
709
710 if (start == fhp->start)
711 heap_inject(da, start, end, preflen, nh);
712 else {
713 /* start > fhp->start */
714 while (start > fhp->end) {
715 uint32_t oend = fhp->end;
716
717 if (da->heap_index > 0) {
718 fhp--;
719 da->heap_index--;
720 } else
721 initheap(da, fhp->end + 1, chunk);
722 if (fhp->end > oend && fhp->nexthop != fp->nexthop) {
723 fp++;
724 da->rtbl_work_frags++;
725 fp->start = (oend + 1) & DXR_RANGE_MASK;
726 fp->nexthop = fhp->nexthop;
727 }
728 }
729 if (start > ((chunk << DXR_RANGE_SHIFT) | fp->start) &&
730 nh != fp->nexthop) {
731 fp++;
732 da->rtbl_work_frags++;
733 fp->start = start & DXR_RANGE_MASK;
734 } else if (da->rtbl_work_frags) {
735 if ((--fp)->nexthop == nh)
736 da->rtbl_work_frags--;
737 else
738 fp++;
739 }
740 fp->nexthop = nh;
741 heap_inject(da, start, end, preflen, nh);
742 }
743
744 return (0);
745 }
746
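/*
 * update_chunk() rebuilds the range fragments for a single direct
 * table chunk: it seeds the prefix stack with the best match for the
 * chunk's first address, walks all more specific routes, flushes
 * whatever remains on the stack, then either marks the chunk as a
 * direct hit (single next hop) or stores its fragments, preferring the
 * compact 16-bit short format, and publishes them via chunk_ref().
 */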
747 static int
748 update_chunk(struct dxr_aux *da, uint32_t chunk)
749 {
750 struct range_entry_long *fp;
751 #if DXR_TRIE_BITS < 24
752 struct range_entry_short *fps;
753 uint32_t start, nh, i;
754 #endif
755 struct heap_entry *fhp;
756 uint32_t first = chunk << DXR_RANGE_SHIFT;
757 uint32_t last = first | DXR_RANGE_MASK;
758
759 if (da->direct_tbl[chunk].fragments != FRAGS_MARK_HIT)
760 chunk_unref(da, chunk);
761
762 initheap(da, first, chunk);
763
764 fp = &da->range_tbl[da->rtbl_top].re;
765 da->rtbl_work_frags = 0;
766 fp->start = first & DXR_RANGE_MASK;
767 fp->nexthop = da->heap[0].nexthop;
768
769 da->dst.sin_addr.s_addr = htonl(first);
770 da->mask.sin_addr.s_addr = htonl(~DXR_RANGE_MASK);
771
772 da->work_chunk = chunk;
773 rib_walk_from(da->fibnum, AF_INET, RIB_FLAG_LOCKED,
774 (struct sockaddr *) &da->dst, (struct sockaddr *) &da->mask,
775 dxr_walk, da);
776
777 /* Flush any remaining objects on the heap */
778 fp = &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re;
779 fhp = &da->heap[da->heap_index];
780 while (fhp->preflen > DXR_TRIE_BITS) {
781 uint32_t oend = fhp->end;
782
783 if (da->heap_index > 0) {
784 fhp--;
785 da->heap_index--;
786 } else
787 initheap(da, fhp->end + 1, chunk);
788 if (fhp->end > oend && fhp->nexthop != fp->nexthop) {
789 /* Have we crossed the upper chunk boundary? */
790 if (oend >= last)
791 break;
792 fp++;
793 da->rtbl_work_frags++;
794 fp->start = (oend + 1) & DXR_RANGE_MASK;
795 fp->nexthop = fhp->nexthop;
796 }
797 }
798
799 /* Direct hit if the chunk contains only a single fragment */
800 if (da->rtbl_work_frags == 0) {
801 da->direct_tbl[chunk].base = fp->nexthop;
802 da->direct_tbl[chunk].fragments = FRAGS_MARK_HIT;
803 return (0);
804 }
805
806 da->direct_tbl[chunk].base = da->rtbl_top;
807 da->direct_tbl[chunk].fragments = da->rtbl_work_frags;
808
809 #if DXR_TRIE_BITS < 24
810 /* Check whether the chunk can be more compactly encoded */
811 fp = &da->range_tbl[da->rtbl_top].re;
812 for (i = 0; i <= da->rtbl_work_frags; i++, fp++)
813 if ((fp->start & 0xff) != 0 || fp->nexthop > RE_SHORT_MAX_NH)
814 break;
815 if (i == da->rtbl_work_frags + 1) {
816 fp = &da->range_tbl[da->rtbl_top].re;
817 fps = (void *) fp;
818 for (i = 0; i <= da->rtbl_work_frags; i++, fp++, fps++) {
819 start = fp->start;
820 nh = fp->nexthop;
821 fps->start = start >> 8;
822 fps->nexthop = nh;
823 }
824 fps->start = start >> 8;
825 fps->nexthop = nh;
826 da->rtbl_work_frags >>= 1;
827 da->direct_tbl[chunk].fragments =
828 da->rtbl_work_frags | FRAGS_PREF_SHORT;
829 } else
830 #endif
831 if (da->rtbl_work_frags >= FRAGS_MARK_HIT) {
832 da->direct_tbl[chunk].fragments = FRAGS_MARK_XL;
833 memmove(&da->range_tbl[da->rtbl_top + 1],
834 &da->range_tbl[da->rtbl_top],
835 (da->rtbl_work_frags + 1) * sizeof(*da->range_tbl));
836 da->range_tbl[da->rtbl_top].fragments = da->rtbl_work_frags;
837 da->rtbl_work_frags++;
838 }
839 da->rtbl_top += (da->rtbl_work_frags + 1);
840 return (chunk_ref(da, chunk));
841 }
842
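/*
 * dxr_build() constructs a new lookup snapshot in three phases:
 * rebuild or incrementally update the range table for the chunks
 * flagged in updates_mask, rebuild or update the two-stage trie while
 * probing for a D/X split that minimizes the total footprint, and
 * finally copy d_tbl, x_tbl and the range table into one contiguous
 * block pointed to by dxr->d, with dxr->x and dxr->r aliasing into it.
 */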
843 static void
844 dxr_build(struct dxr *dxr)
845 {
846 struct dxr_aux *da = dxr->aux;
847 struct chunk_desc *cdp;
848 struct rib_rtable_info rinfo;
849 struct timeval t0, t1, t2, t3;
850 uint32_t r_size, dxr_tot_size;
851 uint32_t i, m, range_rebuild = 0;
852 uint32_t range_frag;
853 struct trie_desc *tp;
854 uint32_t d_tbl_size, dxr_x, d_size, x_size;
855 uint32_t ti, trie_rebuild = 0, prev_size = 0;
856 uint32_t trie_frag;
857
858 MPASS(dxr->d == NULL);
859
860 if (da == NULL) {
861 da = malloc(sizeof(*dxr->aux), M_DXRAUX, M_NOWAIT);
862 if (da == NULL) {
863 FIB_PRINTF(LOG_NOTICE, dxr->fd,
864 "Unable to allocate DXR aux struct");
865 return;
866 }
867 dxr->aux = da;
868 da->fibnum = dxr->fibnum;
869 da->fd = dxr->fd;
870 da->refcnt = 1;
871 LIST_INIT(&da->all_chunks);
872 LIST_INIT(&da->all_trie);
873 da->rtbl_size = RTBL_SIZE_INCR;
874 da->range_tbl = NULL;
875 da->xtbl_size = XTBL_SIZE_INCR;
876 da->x_tbl = NULL;
877 bzero(&da->dst, sizeof(da->dst));
878 bzero(&da->mask, sizeof(da->mask));
879 da->dst.sin_len = sizeof(da->dst);
880 da->mask.sin_len = sizeof(da->mask);
881 da->dst.sin_family = AF_INET;
882 da->mask.sin_family = AF_INET;
883 }
884 if (da->range_tbl == NULL) {
885 da->range_tbl = malloc(sizeof(*da->range_tbl) * da->rtbl_size
886 + FRAGS_PREF_SHORT, M_DXRAUX, M_NOWAIT);
887 if (da->range_tbl == NULL) {
888 FIB_PRINTF(LOG_NOTICE, da->fd,
889 "Unable to allocate DXR range table");
890 return;
891 }
892 range_rebuild = 1;
893 }
894 if (da->x_tbl == NULL) {
895 da->x_tbl = malloc(sizeof(*da->x_tbl) * da->xtbl_size,
896 M_DXRAUX, M_NOWAIT);
897 if (da->x_tbl == NULL) {
898 FIB_PRINTF(LOG_NOTICE, da->fd,
899 "Unable to allocate DXR extension table");
900 return;
901 }
902 trie_rebuild = 1;
903 }
904
905 microuptime(&t0);
906
907 dxr->nh_tbl = fib_get_nhop_array(da->fd);
908 fib_get_rtable_info(fib_get_rh(da->fd), &rinfo);
909
910 if (da->updates_low > da->updates_high)
911 range_rebuild = 1;
912
913 range_build:
914 if (range_rebuild) {
915 /* Bulk cleanup */
916 bzero(da->chunk_hashtbl, sizeof(da->chunk_hashtbl));
917 while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) {
918 LIST_REMOVE(cdp, cd_all_le);
919 uma_zfree(chunk_zone, cdp);
920 }
921 for (i = 0; i < UNUSED_BUCKETS; i++)
922 LIST_INIT(&da->unused_chunks[i]);
923 da->unused_chunks_size = 0;
924 da->rtbl_top = 0;
925 da->updates_low = 0;
926 da->updates_high = DIRECT_TBL_SIZE - 1;
927 memset(da->updates_mask, 0xff, sizeof(da->updates_mask));
928 for (i = 0; i < DIRECT_TBL_SIZE; i++) {
929 da->direct_tbl[i].fragments = FRAGS_MARK_HIT;
930 da->direct_tbl[i].base = 0;
931 }
932 }
933 da->prefixes = rinfo.num_prefixes;
934
935 /* DXR: construct direct & range table */
936 for (i = da->updates_low; i <= da->updates_high; i++) {
937 m = da->updates_mask[i >> 5] >> (i & 0x1f);
938 if (m == 0)
939 i |= 0x1f;
940 else if (m & 1 && update_chunk(da, i) != 0)
941 return;
942 }
943
944 range_frag = 0;
945 if (da->rtbl_top)
946 range_frag = da->unused_chunks_size * 10000ULL / da->rtbl_top;
947 if (range_frag > V_frag_limit) {
948 range_rebuild = 1;
949 goto range_build;
950 }
951
952 r_size = sizeof(*da->range_tbl) * da->rtbl_top;
953 microuptime(&t1);
954
955 if (range_rebuild ||
956 abs(fls(da->prefixes) - fls(da->trie_rebuilt_prefixes)) > 1)
957 trie_rebuild = 1;
958
959 trie_build:
960 if (trie_rebuild) {
961 da->trie_rebuilt_prefixes = da->prefixes;
962 da->d_bits = DXR_D;
963 da->updates_low = 0;
964 da->updates_high = DIRECT_TBL_SIZE - 1;
965 if (!range_rebuild)
966 memset(da->updates_mask, 0xff,
967 sizeof(da->updates_mask));
968 }
969
970 dxr2_try_squeeze:
971 if (trie_rebuild) {
972 /* Bulk cleanup */
973 bzero(da->trietbl, sizeof(da->trietbl));
974 bzero(da->trie_hashtbl, sizeof(da->trie_hashtbl));
975 while ((tp = LIST_FIRST(&da->all_trie)) != NULL) {
976 LIST_REMOVE(tp, td_all_le);
977 uma_zfree(trie_zone, tp);
978 }
979 LIST_INIT(&da->unused_trie);
980 da->all_trie_cnt = da->unused_trie_cnt = 0;
981 }
982
983 /* Populate d_tbl, x_tbl */
984 dxr_x = DXR_TRIE_BITS - da->d_bits;
985 d_tbl_size = (1 << da->d_bits);
986
987 for (i = da->updates_low >> dxr_x; i <= da->updates_high >> dxr_x;
988 i++) {
989 if (!trie_rebuild) {
990 m = 0;
991 for (int j = 0; j < (1 << dxr_x); j += 32)
992 m |= da->updates_mask[((i << dxr_x) + j) >> 5];
993 if (m == 0)
994 continue;
995 trie_unref(da, i);
996 }
997 ti = trie_ref(da, i);
998 		if (ti == (uint32_t) -1)
999 return;
1000 da->d_tbl[i] = ti;
1001 }
1002
1003 trie_frag = 0;
1004 if (da->all_trie_cnt)
1005 trie_frag = da->unused_trie_cnt * 10000ULL / da->all_trie_cnt;
1006 if (trie_frag > V_frag_limit) {
1007 trie_rebuild = 1;
1008 goto trie_build;
1009 }
1010
1011 d_size = sizeof(*da->d_tbl) * d_tbl_size;
1012 x_size = sizeof(*da->x_tbl) * DIRECT_TBL_SIZE / d_tbl_size
1013 * da->all_trie_cnt;
1014 dxr_tot_size = d_size + x_size + r_size;
1015
1016 if (trie_rebuild == 1) {
1017 /* Try to find a more compact D/X split */
1018 if (prev_size == 0 || dxr_tot_size <= prev_size)
1019 da->d_bits--;
1020 else {
1021 da->d_bits++;
1022 trie_rebuild = 2;
1023 }
1024 prev_size = dxr_tot_size;
1025 goto dxr2_try_squeeze;
1026 }
1027 microuptime(&t2);
1028
1029 dxr->d = malloc(dxr_tot_size, M_DXRLPM, M_NOWAIT);
1030 if (dxr->d == NULL) {
1031 FIB_PRINTF(LOG_NOTICE, da->fd,
1032 "Unable to allocate DXR lookup table");
1033 return;
1034 }
1035 memcpy(dxr->d, da->d_tbl, d_size);
1036 dxr->x = ((char *) dxr->d) + d_size;
1037 memcpy(dxr->x, da->x_tbl, x_size);
1038 dxr->r = ((char *) dxr->x) + x_size;
1039 dxr->d_shift = 32 - da->d_bits;
1040 dxr->x_shift = dxr_x;
1041 dxr->x_mask = 0xffffffffU >> (32 - dxr_x);
1042 memcpy(dxr->r, da->range_tbl, r_size);
1043
1044 if (da->updates_low <= da->updates_high)
1045 bzero(&da->updates_mask[da->updates_low / 32],
1046 (da->updates_high - da->updates_low) / 8 + 1);
1047 da->updates_low = DIRECT_TBL_SIZE - 1;
1048 da->updates_high = 0;
1049 microuptime(&t3);
1050
1051 FIB_PRINTF(LOG_INFO, da->fd, "D%dX%dR, %d prefixes, %d nhops (max)",
1052 da->d_bits, dxr_x, rinfo.num_prefixes, rinfo.num_nhops);
1053 i = dxr_tot_size * 100;
1054 if (rinfo.num_prefixes)
1055 i /= rinfo.num_prefixes;
1056 FIB_PRINTF(LOG_INFO, da->fd, "%d.%02d KBytes, %d.%02d Bytes/prefix",
1057 dxr_tot_size / 1024, dxr_tot_size * 100 / 1024 % 100,
1058 i / 100, i % 100);
1059 FIB_PRINTF(LOG_INFO, da->fd,
1060 "%d.%02d%% trie, %d.%02d%% range fragmentation",
1061 trie_frag / 100, trie_frag % 100,
1062 range_frag / 100, range_frag % 100);
1063 i = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
1064 FIB_PRINTF(LOG_INFO, da->fd, "range table %s in %u.%03u ms",
1065 range_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000);
1066 i = (t2.tv_sec - t1.tv_sec) * 1000000 + t2.tv_usec - t1.tv_usec;
1067 FIB_PRINTF(LOG_INFO, da->fd, "trie %s in %u.%03u ms",
1068 trie_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000);
1069 i = (t3.tv_sec - t2.tv_sec) * 1000000 + t3.tv_usec - t2.tv_usec;
1070 FIB_PRINTF(LOG_INFO, da->fd, "snapshot forked in %u.%03u ms",
1071 i / 1000, i % 1000);
1072 }
1073
1074 /*
1075 * Glue functions for attaching to the FIB_ALGO infrastructure.
1076 */
1077
1078 static struct nhop_object *
1079 dxr_fib_lookup(void *algo_data, const struct flm_lookup_key key,
1080 uint32_t scopeid)
1081 {
1082 struct dxr *dxr = algo_data;
1083
1084 return (dxr->nh_tbl[dxr_lookup(dxr, ntohl(key.addr4.s_addr))]);
1085 }
1086
1087 static enum flm_op_result
1088 dxr_init(uint32_t fibnum, struct fib_data *fd, void *old_data, void **data)
1089 {
1090 struct dxr *old_dxr = old_data;
1091 struct dxr_aux *da = NULL;
1092 struct dxr *dxr;
1093
1094 dxr = malloc(sizeof(*dxr), M_DXRAUX, M_NOWAIT);
1095 if (dxr == NULL) {
1096 FIB_PRINTF(LOG_NOTICE, fd,
1097 "Unable to allocate DXR container struct");
1098 return (FLM_REBUILD);
1099 }
1100
1101 /* Check whether we may reuse the old auxiliary structures */
1102 if (old_dxr != NULL && old_dxr->aux != NULL &&
1103 old_dxr->aux->fd == fd) {
1104 da = old_dxr->aux;
1105 atomic_add_int(&da->refcnt, 1);
1106 }
1107
1108 dxr->aux = da;
1109 dxr->d = NULL;
1110 dxr->fd = fd;
1111 dxr->fibnum = fibnum;
1112 *data = dxr;
1113
1114 return (FLM_SUCCESS);
1115 }
1116
1117 static void
1118 dxr_destroy(void *data)
1119 {
1120 struct dxr *dxr = data;
1121 struct dxr_aux *da = dxr->aux;
1122 struct chunk_desc *cdp;
1123 struct trie_desc *tp;
1124
1125 free(dxr->d, M_DXRLPM);
1126 free(dxr, M_DXRAUX);
1127
1128 if (da == NULL || atomic_fetchadd_int(&da->refcnt, -1) > 1)
1129 return;
1130
1131 /* Release auxiliary structures */
1132 while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) {
1133 LIST_REMOVE(cdp, cd_all_le);
1134 uma_zfree(chunk_zone, cdp);
1135 }
1136 while ((tp = LIST_FIRST(&da->all_trie)) != NULL) {
1137 LIST_REMOVE(tp, td_all_le);
1138 uma_zfree(trie_zone, tp);
1139 }
1140 free(da->range_tbl, M_DXRAUX);
1141 free(da->x_tbl, M_DXRAUX);
1142 free(da, M_DXRAUX);
1143 }
1144
1145 static void
1146 epoch_dxr_destroy(epoch_context_t ctx)
1147 {
1148 struct dxr *dxr = __containerof(ctx, struct dxr, epoch_ctx);
1149
1150 dxr_destroy(dxr);
1151 }
1152
1153 static void *
1154 choose_lookup_fn(struct dxr_aux *da)
1155 {
1156
1157 switch (da->d_bits) {
1158 #if DXR_TRIE_BITS > 16
1159 case 16:
1160 return (dxr_fib_lookup_16);
1161 #endif
1162 case 15:
1163 return (dxr_fib_lookup_15);
1164 case 14:
1165 return (dxr_fib_lookup_14);
1166 case 13:
1167 return (dxr_fib_lookup_13);
1168 case 12:
1169 return (dxr_fib_lookup_12);
1170 case 11:
1171 return (dxr_fib_lookup_11);
1172 case 10:
1173 return (dxr_fib_lookup_10);
1174 case 9:
1175 return (dxr_fib_lookup_9);
1176 }
1177 return (dxr_fib_lookup);
1178 }
1179
1180 static enum flm_op_result
1181 dxr_dump_end(void *data, struct fib_dp *dp)
1182 {
1183 struct dxr *dxr = data;
1184 struct dxr_aux *da;
1185
1186 dxr_build(dxr);
1187
1188 da = dxr->aux;
1189 if (da == NULL || dxr->d == NULL)
1190 return (FLM_REBUILD);
1191
1192 if (da->rtbl_top >= BASE_MAX)
1193 return (FLM_ERROR);
1194
1195 dp->f = choose_lookup_fn(da);
1196 dp->arg = dxr;
1197
1198 return (FLM_SUCCESS);
1199 }
1200
1201 static enum flm_op_result
1202 dxr_dump_rib_item(struct rtentry *rt, void *data)
1203 {
1204
1205 return (FLM_SUCCESS);
1206 }
1207
1208 static enum flm_op_result
1209 dxr_change_rib_item(struct rib_head *rnh, struct rib_cmd_info *rc,
1210 void *data)
1211 {
1212
1213 return (FLM_BATCH);
1214 }
1215
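/*
 * Batched RIB changes are folded into updates_mask: every affected
 * direct table chunk is flagged, a fresh dxr instance is then built
 * reusing the shared auxiliary state and atomically swapped into the
 * datapath, and the previous instance is reclaimed after an epoch.
 */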
1216 static enum flm_op_result
1217 dxr_change_rib_batch(struct rib_head *rnh, struct fib_change_queue *q,
1218 void *data)
1219 {
1220 struct dxr *dxr = data;
1221 struct dxr *new_dxr;
1222 struct dxr_aux *da;
1223 struct fib_dp new_dp;
1224 enum flm_op_result res;
1225 uint32_t ip, plen, hmask, start, end, i, ui;
1226 #ifdef INVARIANTS
1227 struct rib_rtable_info rinfo;
1228 int update_delta = 0;
1229 #endif
1230
1231 MPASS(data != NULL);
1232 MPASS(q != NULL);
1233 MPASS(q->count < q->size);
1234
1235 da = dxr->aux;
1236 MPASS(da != NULL);
1237 MPASS(da->fd == dxr->fd);
1238 MPASS(da->refcnt > 0);
1239
1240 FIB_PRINTF(LOG_INFO, da->fd, "processing %d update(s)", q->count);
1241 for (ui = 0; ui < q->count; ui++) {
1242 #ifdef INVARIANTS
1243 if (q->entries[ui].nh_new != NULL)
1244 update_delta++;
1245 if (q->entries[ui].nh_old != NULL)
1246 update_delta--;
1247 #endif
1248 plen = q->entries[ui].plen;
1249 ip = ntohl(q->entries[ui].addr4.s_addr);
1250 if (plen < 32)
1251 hmask = 0xffffffffU >> plen;
1252 else
1253 hmask = 0;
1254 start = (ip & ~hmask) >> DXR_RANGE_SHIFT;
1255 end = (ip | hmask) >> DXR_RANGE_SHIFT;
1256
1257 if ((start & 0x1f) == 0 && (end & 0x1f) == 0x1f)
1258 for (i = start >> 5; i <= end >> 5; i++)
1259 da->updates_mask[i] = 0xffffffffU;
1260 else
1261 for (i = start; i <= end; i++)
1262 da->updates_mask[i >> 5] |= (1 << (i & 0x1f));
1263 if (start < da->updates_low)
1264 da->updates_low = start;
1265 if (end > da->updates_high)
1266 da->updates_high = end;
1267 }
1268
1269 #ifdef INVARIANTS
1270 fib_get_rtable_info(fib_get_rh(da->fd), &rinfo);
1271 MPASS(da->prefixes + update_delta == rinfo.num_prefixes);
1272 #endif
1273
1274 res = dxr_init(0, dxr->fd, data, (void **) &new_dxr);
1275 if (res != FLM_SUCCESS)
1276 return (res);
1277
1278 dxr_build(new_dxr);
1279
1280 /* Structural limit exceeded, hard error */
1281 if (da->rtbl_top >= BASE_MAX) {
1282 dxr_destroy(new_dxr);
1283 return (FLM_ERROR);
1284 }
1285
1286 if (new_dxr->d == NULL) {
1287 dxr_destroy(new_dxr);
1288 return (FLM_REBUILD);
1289 }
1290
1291 new_dp.f = choose_lookup_fn(da);
1292 new_dp.arg = new_dxr;
1293 if (fib_set_datapath_ptr(dxr->fd, &new_dp)) {
1294 fib_set_algo_ptr(dxr->fd, new_dxr);
1295 fib_epoch_call(epoch_dxr_destroy, &dxr->epoch_ctx);
1296 return (FLM_SUCCESS);
1297 }
1298
1299 FIB_PRINTF(LOG_NOTICE, dxr->fd, "fib_set_datapath_ptr() failed");
1300 dxr_destroy(new_dxr);
1301 return (FLM_REBUILD);
1302 }
1303
1304 static uint8_t
1305 dxr_get_pref(const struct rib_rtable_info *rinfo)
1306 {
1307
1308 	/* Ranks below bsearch4 up to 10 prefixes; always supersedes dpdk_lpm4. */
1309 return (251);
1310 }
1311
1312 SYSCTL_DECL(_net_route_algo);
1313 SYSCTL_NODE(_net_route_algo, OID_AUTO, dxr, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1314 "DXR tunables");
1315
1316 static int
1317 sysctl_dxr_frag_limit(SYSCTL_HANDLER_ARGS)
1318 {
1319 char buf[8];
1320 int error, new, i;
1321
1322 snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100,
1323 V_frag_limit % 100);
1324 error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
1325 if (error != 0 || req->newptr == NULL)
1326 return (error);
1327 if (!isdigit(*buf) && *buf != '.')
1328 return (EINVAL);
1329 for (i = 0, new = 0; isdigit(buf[i]) && i < sizeof(buf); i++)
1330 new = new * 10 + buf[i] - '0';
1331 new *= 100;
1332 if (buf[i++] == '.') {
1333 if (!isdigit(buf[i]))
1334 return (EINVAL);
1335 new += (buf[i++] - '0') * 10;
1336 if (isdigit(buf[i]))
1337 new += buf[i++] - '0';
1338 }
1339 if (new > 1000)
1340 return (EINVAL);
1341 V_frag_limit = new;
1342 snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100,
1343 V_frag_limit % 100);
1344 return (0);
1345 }
1346
1347 SYSCTL_PROC(_net_route_algo_dxr, OID_AUTO, frag_limit,
1348 CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_VNET,
1349 0, 0, sysctl_dxr_frag_limit, "A",
1350 "Fragmentation threshold to full rebuild");
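
/*
 * Usage example (illustrative): the threshold accepts a percentage
 * with up to two decimal places, e.g. setting
 * net.route.algo.dxr.frag_limit to "0.75" permits 0.75% fragmentation
 * before a full rebuild is forced.
 */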
1351
1352 static struct fib_lookup_module fib_dxr_mod = {
1353 .flm_name = "dxr",
1354 .flm_family = AF_INET,
1355 .flm_init_cb = dxr_init,
1356 .flm_destroy_cb = dxr_destroy,
1357 .flm_dump_rib_item_cb = dxr_dump_rib_item,
1358 .flm_dump_end_cb = dxr_dump_end,
1359 .flm_change_rib_item_cb = dxr_change_rib_item,
1360 .flm_change_rib_items_cb = dxr_change_rib_batch,
1361 .flm_get_pref = dxr_get_pref,
1362 };
1363
1364 static int
1365 dxr_modevent(module_t mod, int type, void *unused)
1366 {
1367 int error;
1368
1369 switch (type) {
1370 case MOD_LOAD:
1371 chunk_zone = uma_zcreate("dxr chunk", sizeof(struct chunk_desc),
1372 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
1373 trie_zone = uma_zcreate("dxr trie", sizeof(struct trie_desc),
1374 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
1375 fib_module_register(&fib_dxr_mod);
1376 return(0);
1377 case MOD_UNLOAD:
1378 error = fib_module_unregister(&fib_dxr_mod);
1379 if (error)
1380 return (error);
1381 uma_zdestroy(chunk_zone);
1382 uma_zdestroy(trie_zone);
1383 return(0);
1384 default:
1385 return(EOPNOTSUPP);
1386 }
1387 }
1388
1389 static moduledata_t dxr_mod = {"fib_dxr", dxr_modevent, 0};
1390
1391 DECLARE_MODULE(fib_dxr, dxr_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
1392 MODULE_VERSION(fib_dxr, 1);
1393