xref: /freebsd/sys/netinet/in_fib_dxr.c (revision 1603881667360c015f6685131f2f25474fa67a72)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2012-2021 Marko Zec
5  * Copyright (c) 2005, 2018 University of Zagreb
6  * Copyright (c) 2005 International Computer Science Institute
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 /*
31  * An implementation of DXR, a simple IPv4 LPM scheme with compact lookup
32  * structures and a trivial search procedure.  More significant bits of
33  * the search key are used to directly index a two-stage trie, while the
34  * remaining bits are used for finding the next hop in a sorted array.
35  * More details in:
36  *
37  * M. Zec, L. Rizzo, M. Mikuc, DXR: towards a billion routing lookups per
38  * second in software, ACM SIGCOMM Computer Communication Review, September
39  * 2012
40  *
41  * M. Zec, M. Mikuc, Pushing the envelope: beyond two billion IP routing
42  * lookups per second on commodity CPUs, IEEE SoftCOM, September 2017, Split
43  */
44 
45 #include <sys/cdefs.h>
46 __FBSDID("$FreeBSD$");
47 
48 #include "opt_inet.h"
49 
50 #include <sys/param.h>
51 #include <sys/kernel.h>
52 #include <sys/epoch.h>
53 #include <sys/malloc.h>
54 #include <sys/module.h>
55 #include <sys/socket.h>
56 #include <sys/sysctl.h>
57 #include <sys/syslog.h>
58 
59 #include <vm/uma.h>
60 
61 #include <netinet/in.h>
62 #include <netinet/in_fib.h>
63 
64 #include <net/route.h>
65 #include <net/route/route_ctl.h>
66 #include <net/route/fib_algo.h>
67 
68 #define	DXR_TRIE_BITS		20
69 
70 CTASSERT(DXR_TRIE_BITS >= 16 && DXR_TRIE_BITS <= 24);
71 
72 /* DXR2: two-stage primary trie, instead of a single direct lookup table */
73 #define	DXR2
74 
75 #if DXR_TRIE_BITS > 16
76 #define	DXR_D			16
77 #else
78 #define	DXR_D			(DXR_TRIE_BITS - 1)
79 #endif
80 #define	DXR_X			(DXR_TRIE_BITS - DXR_D)
81 
82 #define	D_TBL_SIZE		(1 << DXR_D)
83 #define	DIRECT_TBL_SIZE		(1 << DXR_TRIE_BITS)
84 #define	DXR_RANGE_MASK		(0xffffffffU >> DXR_TRIE_BITS)
85 #define	DXR_RANGE_SHIFT		(32 - DXR_TRIE_BITS)
86 
87 #define	DESC_BASE_BITS		22
88 #define	DESC_FRAGMENTS_BITS	(32 - DESC_BASE_BITS)
89 #define	BASE_MAX		((1 << DESC_BASE_BITS) - 1)
90 #define	RTBL_SIZE_INCR		(BASE_MAX / 64)
91 
92 #if DXR_TRIE_BITS < 24
93 #define	FRAGS_MASK_SHORT	((1 << (23 - DXR_TRIE_BITS)) - 1)
94 #else
95 #define	FRAGS_MASK_SHORT	0
96 #endif
97 #define	FRAGS_PREF_SHORT	(((1 << DESC_FRAGMENTS_BITS) - 1) & \
98 				 ~FRAGS_MASK_SHORT)
99 #define	FRAGS_MARK_XL		(FRAGS_PREF_SHORT - 1)
100 #define	FRAGS_MARK_HIT		(FRAGS_PREF_SHORT - 2)
101 
102 #define	IS_SHORT_FORMAT(x)	((x & FRAGS_PREF_SHORT) == FRAGS_PREF_SHORT)
103 #define	IS_LONG_FORMAT(x)	((x & FRAGS_PREF_SHORT) != FRAGS_PREF_SHORT)
104 #define	IS_XL_FORMAT(x)		(x == FRAGS_MARK_XL)
105 
106 #define	RE_SHORT_MAX_NH		((1 << (DXR_TRIE_BITS - 8)) - 1)
107 
108 #define	CHUNK_HASH_BITS		16
109 #define	CHUNK_HASH_SIZE		(1 << CHUNK_HASH_BITS)
110 #define	CHUNK_HASH_MASK		(CHUNK_HASH_SIZE - 1)
111 
112 #define	TRIE_HASH_BITS		16
113 #define	TRIE_HASH_SIZE		(1 << TRIE_HASH_BITS)
114 #define	TRIE_HASH_MASK		(TRIE_HASH_SIZE - 1)
115 
116 #define	XTBL_SIZE_INCR		(DIRECT_TBL_SIZE / 16)
117 
118 /* Lookup structure elements */
119 
120 struct direct_entry {
121 	uint32_t		fragments: DESC_FRAGMENTS_BITS,
122 				base: DESC_BASE_BITS;
123 };
124 
125 struct range_entry_long {
126 	uint32_t		start: DXR_RANGE_SHIFT,
127 				nexthop: DXR_TRIE_BITS;
128 };
129 
130 #if DXR_TRIE_BITS < 24
131 struct range_entry_short {
132 	uint16_t		start: DXR_RANGE_SHIFT - 8,
133 				nexthop: DXR_TRIE_BITS - 8;
134 };
135 #endif
136 
137 /* Auxiliary structures */
138 
139 struct heap_entry {
140 	uint32_t		start;
141 	uint32_t		end;
142 	uint32_t		preflen;
143 	uint32_t		nexthop;
144 };
145 
146 struct chunk_desc {
147 	LIST_ENTRY(chunk_desc)	cd_all_le;
148 	LIST_ENTRY(chunk_desc)	cd_hash_le;
149 	uint32_t		cd_hash;
150 	uint32_t		cd_refcnt;
151 	uint32_t		cd_base;
152 	uint32_t		cd_cur_size;
153 	uint32_t		cd_max_size;
154 };
155 
156 struct trie_desc {
157 	LIST_ENTRY(trie_desc)	td_all_le;
158 	LIST_ENTRY(trie_desc)	td_hash_le;
159 	uint32_t		td_hash;
160 	uint32_t		td_index;
161 	uint32_t		td_refcnt;
162 };
163 
164 struct dxr_aux {
165 	/* Glue to external state */
166 	struct fib_data		*fd;
167 	uint32_t		fibnum;
168 	int			refcnt;
169 
170 	/* Auxiliary build-time tables */
171 	struct direct_entry	direct_tbl[DIRECT_TBL_SIZE];
172 	uint16_t		d_tbl[D_TBL_SIZE];
173 	struct direct_entry	*x_tbl;
174 	union {
175 		struct range_entry_long	re;
176 		uint32_t	fragments;
177 	}			*range_tbl;
178 
179 	/* Auxiliary internal state */
180 	uint32_t		updates_mask[DIRECT_TBL_SIZE / 32];
181 	struct trie_desc	*trietbl[D_TBL_SIZE];
182 	LIST_HEAD(, chunk_desc)	chunk_hashtbl[CHUNK_HASH_SIZE];
183 	LIST_HEAD(, chunk_desc)	all_chunks;
184 	LIST_HEAD(, chunk_desc) unused_chunks; /* abuses hash link entry */
185 	LIST_HEAD(, trie_desc)	trie_hashtbl[TRIE_HASH_SIZE];
186 	LIST_HEAD(, trie_desc)	all_trie;
187 	LIST_HEAD(, trie_desc)	unused_trie; /* abuses hash link entry */
188 	struct sockaddr_in	dst;
189 	struct sockaddr_in	mask;
190 	struct heap_entry	heap[33];
191 	uint32_t		prefixes;
192 	uint32_t		updates_low;
193 	uint32_t		updates_high;
194 	uint32_t		all_chunks_cnt;
195 	uint32_t		unused_chunks_cnt;
196 	uint32_t		xtbl_size;
197 	uint32_t		all_trie_cnt;
198 	uint32_t		unused_trie_cnt;
199 	uint32_t		trie_rebuilt_prefixes;
200 	uint32_t		heap_index;
201 	uint32_t		d_bits;
202 	uint32_t		rtbl_size;
203 	uint32_t		rtbl_top;
204 	uint32_t		rtbl_work_frags;
205 	uint32_t		work_chunk;
206 };
207 
208 /* Main lookup structure container */
209 
210 struct dxr {
211 	/* Lookup tables */
212 	uint16_t		d_shift;
213 	uint16_t		x_shift;
214 	uint32_t		x_mask;
215 	void			*d;
216 	void			*x;
217 	void			*r;
218 	struct nhop_object	**nh_tbl;
219 
220 	/* Glue to external state */
221 	struct dxr_aux		*aux;
222 	struct fib_data		*fd;
223 	struct epoch_context	epoch_ctx;
224 	uint32_t		fibnum;
225 };
226 
227 static MALLOC_DEFINE(M_DXRLPM, "dxr", "DXR LPM");
228 static MALLOC_DEFINE(M_DXRAUX, "dxr aux", "DXR auxiliary");
229 
230 uma_zone_t chunk_zone;
231 uma_zone_t trie_zone;
232 
233 SYSCTL_DECL(_net_route_algo);
234 SYSCTL_NODE(_net_route_algo, OID_AUTO, dxr, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
235     "DXR tunables");
236 
237 VNET_DEFINE_STATIC(int, max_trie_holes) = 8;
238 #define	V_max_trie_holes	VNET(max_trie_holes)
239 SYSCTL_INT(_net_route_algo_dxr, OID_AUTO, max_trie_holes,
240     CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(max_trie_holes), 0,
241     "Trie fragmentation threshold before triggering a full rebuild");
242 
243 VNET_DEFINE_STATIC(int, max_range_holes) = 16;
244 #define	V_max_range_holes	VNET(max_range_holes)
245 SYSCTL_INT(_net_route_algo_dxr, OID_AUTO, max_range_holes,
246     CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(max_range_holes), 0,
247     "Range table fragmentation threshold before triggering a full rebuild");
248 
249 /* Binary search for a matching address range */
250 #define	DXR_LOOKUP_STAGE					\
251 	if (masked_dst < range[middle].start) {			\
252 		upperbound = middle;				\
253 		middle = (middle + lowerbound) / 2;		\
254 	} else if (masked_dst < range[middle + 1].start)	\
255 		return (range[middle].nexthop);			\
256 	else {							\
257 		lowerbound = middle + 1;			\
258 		middle = (upperbound + middle + 1) / 2;		\
259 	}							\
260 	if (upperbound == lowerbound)				\
261 		return (range[lowerbound].nexthop);
262 
263 static int
264 dxr_lookup(struct dxr *dxr, uint32_t dst)
265 {
266 #ifdef DXR2
267 	uint16_t *dt = dxr->d;
268 	struct direct_entry *xt = dxr->x;
269 	int xi;
270 #else
271 	struct direct_entry *dt = dxr->d;
272 #endif
273 	struct direct_entry de;
274 	struct range_entry_long	*rt;
275 	uint32_t base;
276 	uint32_t upperbound;
277 	uint32_t middle;
278 	uint32_t lowerbound;
279 	uint32_t masked_dst;
280 
281 #ifdef DXR2
282 	xi = (dt[dst >> dxr->d_shift] << dxr->x_shift) +
283 	    ((dst >> DXR_RANGE_SHIFT) & dxr->x_mask);
284 	de = xt[xi];
285 #else
286 	de = dt[dst >> DXR_RANGE_SHIFT];
287 #endif
288 
289 	if (__predict_true(de.fragments == FRAGS_MARK_HIT))
290 		return (de.base);
291 
292 	rt = dxr->r;
293 	base = de.base;
294 	lowerbound = 0;
295 	masked_dst = dst & DXR_RANGE_MASK;
296 
297 #if DXR_TRIE_BITS < 24
298 	if (__predict_true(IS_SHORT_FORMAT(de.fragments))) {
299 		upperbound = de.fragments & FRAGS_MASK_SHORT;
300 		struct range_entry_short *range =
301 		    (struct range_entry_short *) &rt[base];
302 
303 		masked_dst >>= 8;
304 		middle = upperbound;
305 		upperbound = upperbound * 2 + 1;
306 
307 		for (;;) {
308 			DXR_LOOKUP_STAGE
309 			DXR_LOOKUP_STAGE
310 		}
311 	}
312 #endif
313 
314 	upperbound = de.fragments;
315 	middle = upperbound / 2;
316 	struct range_entry_long *range = &rt[base];
317 	if (__predict_false(IS_XL_FORMAT(de.fragments))) {
318 		upperbound = *((uint32_t *) range);
319 		range++;
320 		middle = upperbound / 2;
321 	}
322 
323 	for (;;) {
324 		DXR_LOOKUP_STAGE
325 		DXR_LOOKUP_STAGE
326 	}
327 }
328 
329 static void
330 initheap(struct dxr_aux *da, uint32_t dst_u32, uint32_t chunk)
331 {
332 	struct heap_entry *fhp = &da->heap[0];
333 	struct rtentry *rt;
334 	struct route_nhop_data rnd;
335 
336 	da->heap_index = 0;
337 	da->dst.sin_addr.s_addr = htonl(dst_u32);
338 	rt = fib4_lookup_rt(da->fibnum, da->dst.sin_addr, 0, NHR_UNLOCKED,
339 	    &rnd);
340 	if (rt != NULL) {
341 		struct in_addr addr;
342 		uint32_t scopeid;
343 
344 		rt_get_inet_prefix_plen(rt, &addr, &fhp->preflen, &scopeid);
345 		fhp->start = ntohl(addr.s_addr);
346 		fhp->end = fhp->start;
347 		if (fhp->preflen < 32)
348 			fhp->end |= (0xffffffffU >> fhp->preflen);
349 		fhp->nexthop = fib_get_nhop_idx(da->fd, rnd.rnd_nhop);
350 	} else {
351 		fhp->preflen = fhp->nexthop = fhp->start = 0;
352 		fhp->end = 0xffffffffU;
353 	}
354 }
355 
356 static uint32_t
357 chunk_size(struct dxr_aux *da, struct direct_entry *fdesc)
358 {
359 
360 	if (IS_SHORT_FORMAT(fdesc->fragments))
361 		return ((fdesc->fragments & FRAGS_MASK_SHORT) + 1);
362 	else if (IS_XL_FORMAT(fdesc->fragments))
363 		return (da->range_tbl[fdesc->base].fragments + 2);
364 	else /* if (IS_LONG_FORMAT(fdesc->fragments)) */
365 		return (fdesc->fragments + 1);
366 }
367 
368 static uint32_t
369 chunk_hash(struct dxr_aux *da, struct direct_entry *fdesc)
370 {
371 	uint32_t size = chunk_size(da, fdesc);
372 	uint32_t *p = (uint32_t *) &da->range_tbl[fdesc->base];
373 	uint32_t *l = (uint32_t *) &da->range_tbl[fdesc->base + size];
374 	uint32_t hash = fdesc->fragments;
375 
376 	for (; p < l; p++)
377 		hash = (hash << 7) + (hash >> 13) + *p;
378 
379 	return (hash + (hash >> 16));
380 }
381 
382 static int
383 chunk_ref(struct dxr_aux *da, uint32_t chunk)
384 {
385 	struct direct_entry *fdesc = &da->direct_tbl[chunk];
386 	struct chunk_desc *cdp, *empty_cdp;
387 	uint32_t base = fdesc->base;
388 	uint32_t size = chunk_size(da, fdesc);
389 	uint32_t hash = chunk_hash(da, fdesc);
390 
391 	/* Find an existing descriptor */
392 	LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK],
393 	    cd_hash_le) {
394 		if (cdp->cd_hash != hash || cdp->cd_cur_size != size ||
395 		    memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base],
396 		    sizeof(struct range_entry_long) * size))
397 			continue;
398 		da->rtbl_top = fdesc->base;
399 		fdesc->base = cdp->cd_base;
400 		cdp->cd_refcnt++;
401 		return (0);
402 	}
403 
404 	/* No matching chunks found. Recycle an empty or allocate a new one */
405 	cdp = NULL;
406 	LIST_FOREACH(empty_cdp, &da->unused_chunks, cd_hash_le)
407 		if (empty_cdp->cd_max_size >= size && (cdp == NULL ||
408 		    empty_cdp->cd_max_size < cdp->cd_max_size)) {
409 			cdp = empty_cdp;
410 			if (empty_cdp->cd_max_size == size)
411 				break;
412 		}
413 
414 	if (cdp != NULL) {
415 		/* Copy from heap into the recycled chunk */
416 		bcopy(&da->range_tbl[fdesc->base], &da->range_tbl[cdp->cd_base],
417 		    size * sizeof(struct range_entry_long));
418 		fdesc->base = cdp->cd_base;
419 		da->rtbl_top -= size;
420 		da->unused_chunks_cnt--;
421 		if (cdp->cd_max_size > size + 1) {
422 			/* Split the range in two, need a new descriptor */
423 			empty_cdp = uma_zalloc(chunk_zone, M_NOWAIT);
424 			if (empty_cdp == NULL)
425 				return (1);
426 			empty_cdp->cd_max_size = cdp->cd_max_size - size;
427 			empty_cdp->cd_base = cdp->cd_base + size;
428 			LIST_INSERT_AFTER(cdp, empty_cdp, cd_all_le);
429 			LIST_INSERT_AFTER(cdp, empty_cdp, cd_hash_le);
430 			da->all_chunks_cnt++;
431 			da->unused_chunks_cnt++;
432 			cdp->cd_max_size = size;
433 		}
434 		LIST_REMOVE(cdp, cd_hash_le);
435 	} else {
436 		/* Alloc a new descriptor */
437 		cdp = uma_zalloc(chunk_zone, M_NOWAIT);
438 		if (cdp == NULL)
439 			return (1);
440 		cdp->cd_max_size = size;
441 		cdp->cd_base = fdesc->base;
442 		LIST_INSERT_HEAD(&da->all_chunks, cdp, cd_all_le);
443 		da->all_chunks_cnt++;
444 	}
445 
446 	cdp->cd_hash = hash;
447 	cdp->cd_refcnt = 1;
448 	cdp->cd_cur_size = size;
449 	LIST_INSERT_HEAD(&da->chunk_hashtbl[hash & CHUNK_HASH_MASK], cdp,
450 	    cd_hash_le);
451 	if (da->rtbl_top >= da->rtbl_size) {
452 		if (da->rtbl_top >= BASE_MAX) {
453 			FIB_PRINTF(LOG_ERR, da->fd,
454 			    "structural limit exceeded at %d "
455 			    "range table elements", da->rtbl_top);
456 			return (1);
457 		}
458 		da->rtbl_size += RTBL_SIZE_INCR;
459 		if (da->rtbl_top >= BASE_MAX / 4)
460 			FIB_PRINTF(LOG_WARNING, da->fd, "range table at %d%%",
461 			    da->rtbl_top * 100 / BASE_MAX);
462 		da->range_tbl = realloc(da->range_tbl,
463 		    sizeof(*da->range_tbl) * da->rtbl_size + FRAGS_PREF_SHORT,
464 		    M_DXRAUX, M_NOWAIT);
465 		if (da->range_tbl == NULL)
466 			return (1);
467 	}
468 
469 	return (0);
470 }
471 
472 static void
473 chunk_unref(struct dxr_aux *da, uint32_t chunk)
474 {
475 	struct direct_entry *fdesc = &da->direct_tbl[chunk];
476 	struct chunk_desc *cdp;
477 	uint32_t base = fdesc->base;
478 	uint32_t size = chunk_size(da, fdesc);
479 	uint32_t hash = chunk_hash(da, fdesc);
480 
481 	/* Find an existing descriptor */
482 	LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK],
483 	    cd_hash_le)
484 		if (cdp->cd_hash == hash && cdp->cd_cur_size == size &&
485 		    memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base],
486 		    sizeof(struct range_entry_long) * size) == 0)
487 			break;
488 
489 	KASSERT(cdp != NULL, ("dxr: dangling chunk"));
490 	if (--cdp->cd_refcnt > 0)
491 		return;
492 
493 	LIST_REMOVE(cdp, cd_hash_le);
494 	da->unused_chunks_cnt++;
495 	if (cdp->cd_base + cdp->cd_max_size != da->rtbl_top) {
496 		LIST_INSERT_HEAD(&da->unused_chunks, cdp, cd_hash_le);
497 		return;
498 	}
499 
500 	do {
501 		da->all_chunks_cnt--;
502 		da->unused_chunks_cnt--;
503 		da->rtbl_top -= cdp->cd_max_size;
504 		LIST_REMOVE(cdp, cd_all_le);
505 		uma_zfree(chunk_zone, cdp);
506 		LIST_FOREACH(cdp, &da->unused_chunks, cd_hash_le)
507 			if (cdp->cd_base + cdp->cd_max_size == da->rtbl_top) {
508 				LIST_REMOVE(cdp, cd_hash_le);
509 				break;
510 			}
511 	} while (cdp != NULL);
512 }
513 
514 #ifdef DXR2
515 static uint32_t
516 trie_hash(struct dxr_aux *da, uint32_t dxr_x, uint32_t index)
517 {
518 	uint32_t i, *val;
519 	uint32_t hash = 0;
520 
521 	for (i = 0; i < (1 << dxr_x); i++) {
522 		hash = (hash << 3) ^ (hash >> 3);
523 		val = (uint32_t *)
524 		    (void *) &da->direct_tbl[(index << dxr_x) + i];
525 		hash += (*val << 5);
526 		hash += (*val >> 5);
527 	}
528 
529 	return (hash + (hash >> 16));
530 }
531 
532 static int
533 trie_ref(struct dxr_aux *da, uint32_t index)
534 {
535 	struct trie_desc *tp;
536 	uint32_t dxr_d = da->d_bits;
537 	uint32_t dxr_x = DXR_TRIE_BITS - dxr_d;
538 	uint32_t hash = trie_hash(da, dxr_x, index);
539 
540 	/* Find an existing descriptor */
541 	LIST_FOREACH(tp, &da->trie_hashtbl[hash & TRIE_HASH_MASK], td_hash_le)
542 		if (tp->td_hash == hash &&
543 		    memcmp(&da->direct_tbl[index << dxr_x],
544 		    &da->x_tbl[tp->td_index << dxr_x],
545 		    sizeof(*da->x_tbl) << dxr_x) == 0) {
546 			tp->td_refcnt++;
547 			da->trietbl[index] = tp;
548 			return(tp->td_index);
549 		}
550 
551 	tp = LIST_FIRST(&da->unused_trie);
552 	if (tp != NULL) {
553 		LIST_REMOVE(tp, td_hash_le);
554 		da->unused_trie_cnt--;
555 	} else {
556 		tp = uma_zalloc(trie_zone, M_NOWAIT);
557 		if (tp == NULL)
558 			return (-1);
559 		LIST_INSERT_HEAD(&da->all_trie, tp, td_all_le);
560 		tp->td_index = da->all_trie_cnt++;
561 	}
562 
563 	tp->td_hash = hash;
564 	tp->td_refcnt = 1;
565 	LIST_INSERT_HEAD(&da->trie_hashtbl[hash & TRIE_HASH_MASK], tp,
566 	   td_hash_le);
567 	memcpy(&da->x_tbl[tp->td_index << dxr_x],
568 	    &da->direct_tbl[index << dxr_x], sizeof(*da->x_tbl) << dxr_x);
569 	da->trietbl[index] = tp;
570 	if (da->all_trie_cnt >= da->xtbl_size >> dxr_x) {
571 		da->xtbl_size += XTBL_SIZE_INCR;
572 		da->x_tbl = realloc(da->x_tbl,
573 		    sizeof(*da->x_tbl) * da->xtbl_size, M_DXRAUX, M_NOWAIT);
574 		if (da->x_tbl == NULL)
575 			return (-1);
576 	}
577 	return(tp->td_index);
578 }
579 
580 static void
581 trie_unref(struct dxr_aux *da, uint32_t index)
582 {
583 	struct trie_desc *tp = da->trietbl[index];
584 
585 	if (tp == NULL)
586 		return;
587 	da->trietbl[index] = NULL;
588 	if (--tp->td_refcnt > 0)
589 		return;
590 
591 	LIST_REMOVE(tp, td_hash_le);
592 	da->unused_trie_cnt++;
593 	if (tp->td_index != da->all_trie_cnt - 1) {
594 		LIST_INSERT_HEAD(&da->unused_trie, tp, td_hash_le);
595 		return;
596 	}
597 
598 	do {
599 		da->all_trie_cnt--;
600 		da->unused_trie_cnt--;
601 		LIST_REMOVE(tp, td_all_le);
602 		uma_zfree(trie_zone, tp);
603 		LIST_FOREACH(tp, &da->unused_trie, td_hash_le)
604 			if (tp->td_index == da->all_trie_cnt - 1) {
605 				LIST_REMOVE(tp, td_hash_le);
606 				break;
607 			}
608 	} while (tp != NULL);
609 }
610 #endif
611 
612 static void
613 heap_inject(struct dxr_aux *da, uint32_t start, uint32_t end, uint32_t preflen,
614     uint32_t nh)
615 {
616 	struct heap_entry *fhp;
617 	int i;
618 
619 	for (i = da->heap_index; i >= 0; i--) {
620 		if (preflen > da->heap[i].preflen)
621 			break;
622 		else if (preflen < da->heap[i].preflen)
623 			da->heap[i + 1] = da->heap[i];
624 		else
625 			return;
626 	}
627 
628 	fhp = &da->heap[i + 1];
629 	fhp->preflen = preflen;
630 	fhp->start = start;
631 	fhp->end = end;
632 	fhp->nexthop = nh;
633 	da->heap_index++;
634 }
635 
636 static int
637 dxr_walk(struct rtentry *rt, void *arg)
638 {
639 	struct dxr_aux *da = arg;
640 	uint32_t chunk = da->work_chunk;
641 	uint32_t first = chunk << DXR_RANGE_SHIFT;
642 	uint32_t last = first | DXR_RANGE_MASK;
643 	struct range_entry_long *fp =
644 	    &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re;
645 	struct heap_entry *fhp = &da->heap[da->heap_index];
646 	uint32_t preflen, nh, start, end, scopeid;
647 	struct in_addr addr;
648 
649 	rt_get_inet_prefix_plen(rt, &addr, &preflen, &scopeid);
650 	start = ntohl(addr.s_addr);
651 	if (start > last)
652 		return (-1);	/* Beyond chunk boundaries, we are done */
653 	if (start < first)
654 		return (0);	/* Skip this route */
655 
656 	end = start;
657 	if (preflen < 32)
658 		end |= (0xffffffffU >> preflen);
659 	nh = fib_get_nhop_idx(da->fd, rt_get_raw_nhop(rt));
660 
661 	if (start == fhp->start)
662 		heap_inject(da, start, end, preflen, nh);
663 	else {
664 		/* start > fhp->start */
665 		while (start > fhp->end) {
666 			uint32_t oend = fhp->end;
667 
668 			if (da->heap_index > 0) {
669 				fhp--;
670 				da->heap_index--;
671 			} else
672 				initheap(da, fhp->end + 1, chunk);
673 			if (fhp->end > oend && fhp->nexthop != fp->nexthop) {
674 				fp++;
675 				da->rtbl_work_frags++;
676 				fp->start = (oend + 1) & DXR_RANGE_MASK;
677 				fp->nexthop = fhp->nexthop;
678 			}
679 		}
680 		if (start > ((chunk << DXR_RANGE_SHIFT) | fp->start) &&
681 		    nh != fp->nexthop) {
682 			fp++;
683 			da->rtbl_work_frags++;
684 			fp->start = start & DXR_RANGE_MASK;
685 		} else if (da->rtbl_work_frags) {
686 			if ((--fp)->nexthop == nh)
687 				da->rtbl_work_frags--;
688 			else
689 				fp++;
690 		}
691 		fp->nexthop = nh;
692 		heap_inject(da, start, end, preflen, nh);
693 	}
694 
695 	return (0);
696 }
697 
698 static int
699 update_chunk(struct dxr_aux *da, uint32_t chunk)
700 {
701 	struct range_entry_long *fp;
702 #if DXR_TRIE_BITS < 24
703 	struct range_entry_short *fps;
704 	uint32_t start, nh, i;
705 #endif
706 	struct heap_entry *fhp;
707 	uint32_t first = chunk << DXR_RANGE_SHIFT;
708 	uint32_t last = first | DXR_RANGE_MASK;
709 
710 	if (da->direct_tbl[chunk].fragments != FRAGS_MARK_HIT)
711 		chunk_unref(da, chunk);
712 
713 	initheap(da, first, chunk);
714 
715 	fp = &da->range_tbl[da->rtbl_top].re;
716 	da->rtbl_work_frags = 0;
717 	fp->start = first & DXR_RANGE_MASK;
718 	fp->nexthop = da->heap[0].nexthop;
719 
720 	da->dst.sin_addr.s_addr = htonl(first);
721 	da->mask.sin_addr.s_addr = htonl(~DXR_RANGE_MASK);
722 
723 	da->work_chunk = chunk;
724 	rib_walk_from(da->fibnum, AF_INET, RIB_FLAG_LOCKED,
725 	    (struct sockaddr *) &da->dst, (struct sockaddr *) &da->mask,
726 	    dxr_walk, da);
727 
728 	/* Flush any remaining objects on the heap */
729 	fp = &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re;
730 	fhp = &da->heap[da->heap_index];
731 	while (fhp->preflen > DXR_TRIE_BITS) {
732 		uint32_t oend = fhp->end;
733 
734 		if (da->heap_index > 0) {
735 			fhp--;
736 			da->heap_index--;
737 		} else
738 			initheap(da, fhp->end + 1, chunk);
739 		if (fhp->end > oend && fhp->nexthop != fp->nexthop) {
740 			/* Have we crossed the upper chunk boundary? */
741 			if (oend >= last)
742 				break;
743 			fp++;
744 			da->rtbl_work_frags++;
745 			fp->start = (oend + 1) & DXR_RANGE_MASK;
746 			fp->nexthop = fhp->nexthop;
747 		}
748 	}
749 
750 	/* Direct hit if the chunk contains only a single fragment */
751 	if (da->rtbl_work_frags == 0) {
752 		da->direct_tbl[chunk].base = fp->nexthop;
753 		da->direct_tbl[chunk].fragments = FRAGS_MARK_HIT;
754 		return (0);
755 	}
756 
757 	da->direct_tbl[chunk].base = da->rtbl_top;
758 	da->direct_tbl[chunk].fragments = da->rtbl_work_frags;
759 
760 #if DXR_TRIE_BITS < 24
761 	/* Check whether the chunk can be more compactly encoded */
762 	fp = &da->range_tbl[da->rtbl_top].re;
763 	for (i = 0; i <= da->rtbl_work_frags; i++, fp++)
764 		if ((fp->start & 0xff) != 0 || fp->nexthop > RE_SHORT_MAX_NH)
765 			break;
766 	if (i == da->rtbl_work_frags + 1) {
767 		fp = &da->range_tbl[da->rtbl_top].re;
768 		fps = (void *) fp;
769 		for (i = 0; i <= da->rtbl_work_frags; i++, fp++, fps++) {
770 			start = fp->start;
771 			nh = fp->nexthop;
772 			fps->start = start >> 8;
773 			fps->nexthop = nh;
774 		}
775 		fps->start = start >> 8;
776 		fps->nexthop = nh;
777 		da->rtbl_work_frags >>= 1;
778 		da->direct_tbl[chunk].fragments =
779 		    da->rtbl_work_frags | FRAGS_PREF_SHORT;
780 	} else
781 #endif
782 	if (da->rtbl_work_frags >= FRAGS_MARK_HIT) {
783 		da->direct_tbl[chunk].fragments = FRAGS_MARK_XL;
784 		memmove(&da->range_tbl[da->rtbl_top + 1],
785 		   &da->range_tbl[da->rtbl_top],
786 		   (da->rtbl_work_frags + 1) * sizeof(*da->range_tbl));
787 		da->range_tbl[da->rtbl_top].fragments = da->rtbl_work_frags;
788 		da->rtbl_work_frags++;
789 	}
790 	da->rtbl_top += (da->rtbl_work_frags + 1);
791 	return (chunk_ref(da, chunk));
792 }
793 
794 static void
795 dxr_build(struct dxr *dxr)
796 {
797 	struct dxr_aux *da = dxr->aux;
798 	struct chunk_desc *cdp;
799 	struct rib_rtable_info rinfo;
800 	struct timeval t0, t1, t2, t3;
801 	uint32_t r_size, dxr_tot_size;
802 	uint32_t i, m, range_rebuild = 0;
803 #ifdef DXR2
804 	struct trie_desc *tp;
805 	uint32_t d_tbl_size, dxr_x, d_size, x_size;
806 	uint32_t ti, trie_rebuild = 0, prev_size = 0;
807 #endif
808 
809 	KASSERT(dxr->d == NULL, ("dxr: d not free"));
810 
811 	if (da == NULL) {
812 		da = malloc(sizeof(*dxr->aux), M_DXRAUX, M_NOWAIT);
813 		if (da == NULL)
814 			return;
815 		dxr->aux = da;
816 		da->fibnum = dxr->fibnum;
817 		da->refcnt = 1;
818 		LIST_INIT(&da->all_chunks);
819 		LIST_INIT(&da->all_trie);
820 		da->rtbl_size = RTBL_SIZE_INCR;
821 		da->range_tbl = NULL;
822 		da->xtbl_size = XTBL_SIZE_INCR;
823 		da->x_tbl = NULL;
824 		bzero(&da->dst, sizeof(da->dst));
825 		bzero(&da->mask, sizeof(da->mask));
826 		da->dst.sin_len = sizeof(da->dst);
827 		da->mask.sin_len = sizeof(da->mask);
828 		da->dst.sin_family = AF_INET;
829 		da->mask.sin_family = AF_INET;
830 	}
831 	if (da->range_tbl == NULL) {
832 		da->range_tbl = malloc(sizeof(*da->range_tbl) * da->rtbl_size
833 		    + FRAGS_PREF_SHORT, M_DXRAUX, M_NOWAIT);
834 		if (da->range_tbl == NULL)
835 			return;
836 		range_rebuild = 1;
837 	}
838 #ifdef DXR2
839 	if (da->x_tbl == NULL) {
840 		da->x_tbl = malloc(sizeof(*da->x_tbl) * da->xtbl_size,
841 		    M_DXRAUX, M_NOWAIT);
842 		if (da->x_tbl == NULL)
843 			return;
844 		trie_rebuild = 1;
845 	}
846 #endif
847 	da->fd = dxr->fd;
848 
849 	microuptime(&t0);
850 
851 	dxr->nh_tbl = fib_get_nhop_array(da->fd);
852 	fib_get_rtable_info(fib_get_rh(da->fd), &rinfo);
853 
854 	if (da->updates_low > da->updates_high ||
855 	    da->unused_chunks_cnt > V_max_range_holes)
856 		range_rebuild = 1;
857 	if (range_rebuild) {
858 		/* Bulk cleanup */
859 		bzero(da->chunk_hashtbl, sizeof(da->chunk_hashtbl));
860 		while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) {
861 			LIST_REMOVE(cdp, cd_all_le);
862 			uma_zfree(chunk_zone, cdp);
863 		}
864 		LIST_INIT(&da->unused_chunks);
865 		da->all_chunks_cnt = da->unused_chunks_cnt = 0;
866 		da->rtbl_top = 0;
867 		da->updates_low = 0;
868 		da->updates_high = DIRECT_TBL_SIZE - 1;
869 		memset(da->updates_mask, 0xff, sizeof(da->updates_mask));
870 		for (i = 0; i < DIRECT_TBL_SIZE; i++) {
871 			da->direct_tbl[i].fragments = FRAGS_MARK_HIT;
872 			da->direct_tbl[i].base = 0;
873 		}
874 	}
875 	da->prefixes = rinfo.num_prefixes;
876 
877 	/* DXR: construct direct & range table */
878 	for (i = da->updates_low; i <= da->updates_high; i++) {
879 		m = da->updates_mask[i >> 5] >> (i & 0x1f);
880 		if (m == 0)
881 			i |= 0x1f;
882 		else if (m & 1 && update_chunk(da, i) != 0)
883 			return;
884 	}
885 	r_size = sizeof(*da->range_tbl) * da->rtbl_top;
886 	microuptime(&t1);
887 
888 #ifdef DXR2
889 	if (range_rebuild || da->unused_trie_cnt > V_max_trie_holes ||
890 	    abs(fls(da->prefixes) - fls(da->trie_rebuilt_prefixes)) > 1)
891 		trie_rebuild = 1;
892 	if (trie_rebuild) {
893 		da->trie_rebuilt_prefixes = da->prefixes;
894 		da->d_bits = DXR_D;
895 		da->updates_low = 0;
896 		da->updates_high = DIRECT_TBL_SIZE - 1;
897 	}
898 
899 dxr2_try_squeeze:
900 	if (trie_rebuild) {
901 		/* Bulk cleanup */
902 		bzero(da->trietbl, sizeof(da->trietbl));
903 		bzero(da->trie_hashtbl, sizeof(da->trie_hashtbl));
904 		while ((tp = LIST_FIRST(&da->all_trie)) != NULL) {
905 			LIST_REMOVE(tp, td_all_le);
906 			uma_zfree(trie_zone, tp);
907 		}
908 		LIST_INIT(&da->unused_trie);
909 		da->all_trie_cnt = da->unused_trie_cnt = 0;
910 	}
911 
912 	/* Populate d_tbl, x_tbl */
913 	dxr_x = DXR_TRIE_BITS - da->d_bits;
914 	d_tbl_size = (1 << da->d_bits);
915 
916 	for (i = da->updates_low >> dxr_x; i <= da->updates_high >> dxr_x;
917 	    i++) {
918 		trie_unref(da, i);
919 		ti = trie_ref(da, i);
920 		if (ti < 0)
921 			return;
922 		da->d_tbl[i] = ti;
923 	}
924 
925 	d_size = sizeof(*da->d_tbl) * d_tbl_size;
926 	x_size = sizeof(*da->x_tbl) * DIRECT_TBL_SIZE / d_tbl_size
927 	    * da->all_trie_cnt;
928 	dxr_tot_size = d_size + x_size + r_size;
929 
930 	if (trie_rebuild == 1) {
931 		/* Try to find a more compact D/X split */
932 		if (prev_size == 0 || dxr_tot_size <= prev_size)
933 			da->d_bits--;
934 		else {
935 			da->d_bits++;
936 			trie_rebuild = 2;
937 		}
938 		prev_size = dxr_tot_size;
939 		goto dxr2_try_squeeze;
940 	}
941 	microuptime(&t2);
942 #else /* !DXR2 */
943 	dxr_tot_size = sizeof(da->direct_tbl) + r_size;
944 	t2 = t1;
945 #endif
946 
947 	dxr->d = malloc(dxr_tot_size, M_DXRLPM, M_NOWAIT);
948 	if (dxr->d == NULL)
949 		return;
950 #ifdef DXR2
951 	memcpy(dxr->d, da->d_tbl, d_size);
952 	dxr->x = ((char *) dxr->d) + d_size;
953 	memcpy(dxr->x, da->x_tbl, x_size);
954 	dxr->r = ((char *) dxr->x) + x_size;
955 	dxr->d_shift = 32 - da->d_bits;
956 	dxr->x_shift = dxr_x;
957 	dxr->x_mask = 0xffffffffU >> (32 - dxr_x);
958 #else /* !DXR2 */
959 	memcpy(dxr->d, da->direct_tbl, sizeof(da->direct_tbl));
960 	dxr->r = ((char *) dxr->d) + sizeof(da->direct_tbl);
961 #endif
962 	memcpy(dxr->r, da->range_tbl, r_size);
963 
964 	if (da->updates_low <= da->updates_high)
965 		bzero(&da->updates_mask[da->updates_low / 32],
966 		    (da->updates_high - da->updates_low) / 8 + 1);
967 	da->updates_low = DIRECT_TBL_SIZE - 1;
968 	da->updates_high = 0;
969 	microuptime(&t3);
970 
971 #ifdef DXR2
972 	FIB_PRINTF(LOG_INFO, da->fd, "D%dX%dR, %d prefixes, %d nhops (max)",
973 	    da->d_bits, dxr_x, rinfo.num_prefixes, rinfo.num_nhops);
974 #else
975 	FIB_PRINTF(LOG_INFO, da->fd, "D%dR, %d prefixes, %d nhops (max)",
976 	    DXR_D, rinfo.num_prefixes, rinfo.num_nhops);
977 #endif
978 	i = dxr_tot_size * 100 / rinfo.num_prefixes;
979 	FIB_PRINTF(LOG_INFO, da->fd, "%d.%02d KBytes, %d.%02d Bytes/prefix",
980 	    dxr_tot_size / 1024, dxr_tot_size * 100 / 1024 % 100,
981 	    i / 100, i % 100);
982 	i = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
983 	FIB_PRINTF(LOG_INFO, da->fd, "range table %s in %u.%03u ms",
984 	    range_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000);
985 #ifdef DXR2
986 	i = (t2.tv_sec - t1.tv_sec) * 1000000 + t2.tv_usec - t1.tv_usec;
987 	FIB_PRINTF(LOG_INFO, da->fd, "trie %s in %u.%03u ms",
988 	    trie_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000);
989 #endif
990 	i = (t3.tv_sec - t2.tv_sec) * 1000000 + t3.tv_usec - t2.tv_usec;
991 	FIB_PRINTF(LOG_INFO, da->fd, "snapshot forked in %u.%03u ms",
992 	    i / 1000, i % 1000);
993 	FIB_PRINTF(LOG_INFO, da->fd, "range table: %d%%, %d chunks, %d holes",
994 	    da->rtbl_top * 100 / BASE_MAX, da->all_chunks_cnt,
995 	    da->unused_chunks_cnt);
996 }
997 
998 /*
999  * Glue functions for attaching to FreeBSD 13 fib_algo infrastructure.
1000  */
1001 
1002 static struct nhop_object *
1003 dxr_fib_lookup(void *algo_data, const struct flm_lookup_key key,
1004     uint32_t scopeid)
1005 {
1006 	struct dxr *dxr = algo_data;
1007 	uint32_t nh;
1008 
1009 	nh = dxr_lookup(dxr, ntohl(key.addr4.s_addr));
1010 
1011 	return (dxr->nh_tbl[nh]);
1012 }
1013 
1014 static enum flm_op_result
1015 dxr_init(uint32_t fibnum, struct fib_data *fd, void *old_data, void **data)
1016 {
1017 	struct dxr *old_dxr = old_data;
1018 	struct dxr_aux *da = NULL;
1019 	struct dxr *dxr;
1020 
1021 	dxr = malloc(sizeof(*dxr), M_DXRAUX, M_NOWAIT);
1022 	if (dxr == NULL)
1023 		return (FLM_REBUILD);
1024 
1025 	/* Check whether we may reuse the old auxiliary structures */
1026 	if (old_dxr != NULL && old_dxr->aux != NULL) {
1027 		da = old_dxr->aux;
1028 		atomic_add_int(&da->refcnt, 1);
1029 	}
1030 
1031 	dxr->aux = da;
1032 	dxr->d = NULL;
1033 	dxr->fd = fd;
1034 	dxr->fibnum = fibnum;
1035 	*data = dxr;
1036 
1037 	return (FLM_SUCCESS);
1038 }
1039 
1040 static void
1041 dxr_destroy(void *data)
1042 {
1043 	struct dxr *dxr = data;
1044 	struct dxr_aux *da;
1045 	struct chunk_desc *cdp;
1046 	struct trie_desc *tp;
1047 
1048 	if (dxr->d != NULL)
1049 		free(dxr->d, M_DXRLPM);
1050 
1051 	da = dxr->aux;
1052 	free(dxr, M_DXRAUX);
1053 
1054 	if (da == NULL || atomic_fetchadd_int(&da->refcnt, -1) > 1)
1055 		return;
1056 
1057 	/* Release auxiliary structures */
1058 	while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) {
1059 		LIST_REMOVE(cdp, cd_all_le);
1060 		uma_zfree(chunk_zone, cdp);
1061 	}
1062 	while ((tp = LIST_FIRST(&da->all_trie)) != NULL) {
1063 		LIST_REMOVE(tp, td_all_le);
1064 		uma_zfree(trie_zone, tp);
1065 	}
1066 	free(da->range_tbl, M_DXRAUX);
1067 	free(da->x_tbl, M_DXRAUX);
1068 	free(da, M_DXRAUX);
1069 }
1070 
1071 static void
1072 epoch_dxr_destroy(epoch_context_t ctx)
1073 {
1074 	struct dxr *dxr = __containerof(ctx, struct dxr, epoch_ctx);
1075 
1076 	dxr_destroy(dxr);
1077 }
1078 
1079 static enum flm_op_result
1080 dxr_dump_end(void *data, struct fib_dp *dp)
1081 {
1082 	struct dxr *dxr = data;
1083 	struct dxr_aux *da;
1084 
1085 	dxr_build(dxr);
1086 
1087 	da = dxr->aux;
1088 	if (da == NULL)
1089 		return (FLM_REBUILD);
1090 
1091 	/* Structural limit exceeded, hard error */
1092 	if (da->rtbl_top >= BASE_MAX)
1093 		return (FLM_ERROR);
1094 
1095 	/* A malloc(,, M_NOWAIT) failed somewhere, retry later */
1096 	if (dxr->d == NULL)
1097 		return (FLM_REBUILD);
1098 
1099 	dp->f = dxr_fib_lookup;
1100 	dp->arg = dxr;
1101 
1102 	return (FLM_SUCCESS);
1103 }
1104 
1105 static enum flm_op_result
1106 dxr_dump_rib_item(struct rtentry *rt, void *data)
1107 {
1108 
1109 	return (FLM_SUCCESS);
1110 }
1111 
1112 static enum flm_op_result
1113 dxr_change_rib_item(struct rib_head *rnh, struct rib_cmd_info *rc,
1114     void *data)
1115 {
1116 
1117 	return (FLM_BATCH);
1118 }
1119 
1120 static enum flm_op_result
1121 dxr_change_rib_batch(struct rib_head *rnh, struct fib_change_queue *q,
1122     void *data)
1123 {
1124 	struct dxr *dxr = data;
1125 	struct dxr *new_dxr;
1126 	struct dxr_aux *da;
1127 	struct fib_dp new_dp;
1128 	enum flm_op_result res;
1129 	uint32_t ip, plen, hmask, start, end, i, ui;
1130 #ifdef INVARIANTS
1131 	struct rib_rtable_info rinfo;
1132 	int update_delta = 0;
1133 #endif
1134 
1135 	KASSERT(data != NULL, ("%s: NULL data", __FUNCTION__));
1136 	KASSERT(q != NULL, ("%s: NULL q", __FUNCTION__));
1137 	KASSERT(q->count < q->size, ("%s: q->count %d q->size %d",
1138 	    __FUNCTION__, q->count, q->size));
1139 
1140 	da = dxr->aux;
1141 	KASSERT(da != NULL, ("%s: NULL dxr->aux", __FUNCTION__));
1142 
1143 	FIB_PRINTF(LOG_INFO, da->fd, "processing %d update(s)", q->count);
1144 	for (ui = 0; ui < q->count; ui++) {
1145 #ifdef INVARIANTS
1146 		if (q->entries[ui].nh_new != NULL)
1147 			update_delta++;
1148 		if (q->entries[ui].nh_old != NULL)
1149 			update_delta--;
1150 #endif
1151 		plen = q->entries[ui].plen;
1152 		ip = ntohl(q->entries[ui].addr4.s_addr);
1153 		hmask = 0xffffffffU >> plen;
1154 		start = (ip & ~hmask) >> DXR_RANGE_SHIFT;
1155 		end = (ip | hmask) >> DXR_RANGE_SHIFT;
1156 
1157 		if ((start & 0x1f) == 0 && (end & 0x1f) == 0x1f)
1158 			for (i = start >> 5; i <= end >> 5; i++)
1159 				da->updates_mask[i] = 0xffffffffU;
1160 		else
1161 			for (i = start; i <= end; i++)
1162 				da->updates_mask[i >> 5] |= (1 << (i & 0x1f));
1163 		if (start < da->updates_low)
1164 			da->updates_low = start;
1165 		if (end > da->updates_high)
1166 			da->updates_high = end;
1167 	}
1168 
1169 #ifdef INVARIANTS
1170 	fib_get_rtable_info(fib_get_rh(da->fd), &rinfo);
1171 	KASSERT(da->prefixes + update_delta == rinfo.num_prefixes,
1172 	    ("%s: update count mismatch", __FUNCTION__));
1173 #endif
1174 
1175 	res = dxr_init(0, dxr->fd, data, (void **) &new_dxr);
1176 	if (res != FLM_SUCCESS)
1177 		return (res);
1178 
1179 	dxr_build(new_dxr);
1180 
1181 	/* Structural limit exceeded, hard error */
1182 	if (da->rtbl_top >= BASE_MAX) {
1183 		dxr_destroy(new_dxr);
1184 		return (FLM_ERROR);
1185 	}
1186 
1187 	/* A malloc(,, M_NOWAIT) failed somewhere, retry later */
1188 	if (new_dxr->d == NULL) {
1189 		dxr_destroy(new_dxr);
1190 		return (FLM_REBUILD);
1191 	}
1192 
1193 	new_dp.f = dxr_fib_lookup;
1194 	new_dp.arg = new_dxr;
1195 	if (fib_set_datapath_ptr(dxr->fd, &new_dp)) {
1196 		fib_set_algo_ptr(dxr->fd, new_dxr);
1197 		fib_epoch_call(epoch_dxr_destroy, &dxr->epoch_ctx);
1198 		return (FLM_SUCCESS);
1199 	}
1200 
1201 	dxr_destroy(new_dxr);
1202 	return (FLM_REBUILD);
1203 }
1204 
1205 static uint8_t
1206 dxr_get_pref(const struct rib_rtable_info *rinfo)
1207 {
1208 
1209 	/* Below bsearch4 up to 10 prefixes. Always supersedes dpdk_lpm4. */
1210 	return (251);
1211 }
1212 
1213 static struct fib_lookup_module fib_dxr_mod = {
1214 	.flm_name = "dxr",
1215 	.flm_family = AF_INET,
1216 	.flm_init_cb = dxr_init,
1217 	.flm_destroy_cb = dxr_destroy,
1218 	.flm_dump_rib_item_cb = dxr_dump_rib_item,
1219 	.flm_dump_end_cb = dxr_dump_end,
1220 	.flm_change_rib_item_cb = dxr_change_rib_item,
1221 	.flm_change_rib_items_cb = dxr_change_rib_batch,
1222 	.flm_get_pref = dxr_get_pref,
1223 };
1224 
1225 static int
1226 dxr_modevent(module_t mod, int type, void *unused)
1227 {
1228 	int error;
1229 
1230 	switch (type) {
1231 	case MOD_LOAD:
1232 		chunk_zone = uma_zcreate("dxr chunk", sizeof(struct chunk_desc),
1233 		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
1234 		trie_zone = uma_zcreate("dxr trie", sizeof(struct trie_desc),
1235 		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
1236 		fib_module_register(&fib_dxr_mod);
1237 		return(0);
1238 	case MOD_UNLOAD:
1239 		error = fib_module_unregister(&fib_dxr_mod);
1240 		if (error)
1241 			return (error);
1242 		uma_zdestroy(chunk_zone);
1243 		uma_zdestroy(trie_zone);
1244 		return(0);
1245 	default:
1246 		return(EOPNOTSUPP);
1247 	}
1248 }
1249 
1250 static moduledata_t dxr_mod = {"fib_dxr", dxr_modevent, 0};
1251 
1252 DECLARE_MODULE(fib_dxr, dxr_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
1253 MODULE_VERSION(fib_dxr, 1);
1254