xref: /freebsd/sys/netinet/in_fib_dxr.c (revision 3984400149a53982f1254cfaf7b73fd2acda460a)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2012-2021 Marko Zec
5  * Copyright (c) 2005, 2018 University of Zagreb
6  * Copyright (c) 2005 International Computer Science Institute
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 /*
31  * An implementation of DXR, a simple IPv4 LPM scheme with compact lookup
32  * structures and a trivial search procedure.  The more significant bits
33  * of the search key are used to directly index a two-stage trie, while
34  * the remaining bits are used to find the next hop in a sorted array.
35  * More details in:
36  *
37  * M. Zec, L. Rizzo, M. Mikuc, DXR: towards a billion routing lookups per
38  * second in software, ACM SIGCOMM Computer Communication Review, September
39  * 2012
40  *
41  * M. Zec, M. Mikuc, Pushing the envelope: beyond two billion IP routing
42  * lookups per second on commodity CPUs, IEEE SoftCOM, September 2017, Split
43  */
44 
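/*
 * Illustrative example (not from the papers above): with DXR_TRIE_BITS == 20
 * the destination 198.51.100.37 (0xc6336425) splits into a 20-bit trie index
 * 0xc6336 and a 12-bit remainder 0x425.  The trie index selects a
 * direct_entry; on a direct hit its base field already holds the nexthop
 * index, otherwise the remainder is binary-searched within the sorted range
 * chunk that begins at base in the range table.
 */
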
45 #include <sys/cdefs.h>
46 __FBSDID("$FreeBSD$");
47 
48 #include "opt_inet.h"
49 
50 #include <sys/param.h>
51 #include <sys/kernel.h>
52 #include <sys/epoch.h>
53 #include <sys/malloc.h>
54 #include <sys/module.h>
55 #include <sys/socket.h>
56 #include <sys/sysctl.h>
57 #include <sys/syslog.h>
58 
59 #include <vm/uma.h>
60 
61 #include <netinet/in.h>
62 #include <netinet/in_fib.h>
63 
64 #include <net/route.h>
65 #include <net/route/route_ctl.h>
66 #include <net/route/fib_algo.h>
67 
68 #define	DXR_TRIE_BITS		20
69 
70 CTASSERT(DXR_TRIE_BITS >= 16 && DXR_TRIE_BITS <= 24);
71 
72 /* DXR2: two-stage primary trie, instead of a single direct lookup table */
73 #define	DXR2
74 
75 #if DXR_TRIE_BITS > 16
76 #define	DXR_D			16
77 #else
78 #define	DXR_D			(DXR_TRIE_BITS - 1)
79 #endif
80 #define	DXR_X			(DXR_TRIE_BITS - DXR_D)
81 
82 #define	D_TBL_SIZE		(1 << DXR_D)
83 #define	DIRECT_TBL_SIZE		(1 << DXR_TRIE_BITS)
84 #define	DXR_RANGE_MASK		(0xffffffffU >> DXR_TRIE_BITS)
85 #define	DXR_RANGE_SHIFT		(32 - DXR_TRIE_BITS)
86 
87 #define	DESC_BASE_BITS		22
88 #define	DESC_FRAGMENTS_BITS	(32 - DESC_BASE_BITS)
89 #define	BASE_MAX		((1 << DESC_BASE_BITS) - 1)
90 #define	RTBL_SIZE_INCR		(BASE_MAX / 64)
91 
92 #if DXR_TRIE_BITS < 24
93 #define	FRAGS_MASK_SHORT	((1 << (23 - DXR_TRIE_BITS)) - 1)
94 #else
95 #define	FRAGS_MASK_SHORT	0
96 #endif
97 #define	FRAGS_PREF_SHORT	(((1 << DESC_FRAGMENTS_BITS) - 1) & \
98 				 ~FRAGS_MASK_SHORT)
99 #define	FRAGS_MARK_XL		(FRAGS_PREF_SHORT - 1)
100 #define	FRAGS_MARK_HIT		(FRAGS_PREF_SHORT - 2)
101 
102 #define	IS_SHORT_FORMAT(x)	((x & FRAGS_PREF_SHORT) == FRAGS_PREF_SHORT)
103 #define	IS_LONG_FORMAT(x)	((x & FRAGS_PREF_SHORT) != FRAGS_PREF_SHORT)
104 #define	IS_XL_FORMAT(x)		(x == FRAGS_MARK_XL)
105 
106 #define	RE_SHORT_MAX_NH		((1 << (DXR_TRIE_BITS - 8)) - 1)
107 
108 #define	CHUNK_HASH_BITS		16
109 #define	CHUNK_HASH_SIZE		(1 << CHUNK_HASH_BITS)
110 #define	CHUNK_HASH_MASK		(CHUNK_HASH_SIZE - 1)
111 
112 #define	TRIE_HASH_BITS		16
113 #define	TRIE_HASH_SIZE		(1 << TRIE_HASH_BITS)
114 #define	TRIE_HASH_MASK		(TRIE_HASH_SIZE - 1)
115 
116 #define	XTBL_SIZE_INCR		(DIRECT_TBL_SIZE / 16)
117 
118 #define	UNUSED_BUCKETS		8
119 
120 /* Lookup structure elements */
121 
122 struct direct_entry {
123 	uint32_t		fragments: DESC_FRAGMENTS_BITS,
124 				base: DESC_BASE_BITS;
125 };
126 
127 struct range_entry_long {
128 	uint32_t		start: DXR_RANGE_SHIFT,
129 				nexthop: DXR_TRIE_BITS;
130 };
131 
132 #if DXR_TRIE_BITS < 24
133 struct range_entry_short {
134 	uint16_t		start: DXR_RANGE_SHIFT - 8,
135 				nexthop: DXR_TRIE_BITS - 8;
136 };
137 #endif
138 
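/*
 * Interpretation of a struct direct_entry, as used by chunk_size() and
 * dxr_lookup() below:
 *
 *  - fragments == FRAGS_MARK_HIT: the base field directly holds the nexthop
 *    index and the range table is not consulted;
 *  - IS_SHORT_FORMAT(fragments) (only when DXR_TRIE_BITS < 24): the chunk at
 *    base holds packed 16-bit range_entry_short entries, and the low
 *    FRAGS_MASK_SHORT bits give the number of 32-bit words they occupy,
 *    less one;
 *  - fragments == FRAGS_MARK_XL: the first 32-bit word of the chunk holds
 *    the index of the last of the range_entry_long entries that follow it;
 *  - otherwise (long format): fragments is the index of the last
 *    range_entry_long in the chunk at base.
 */
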
139 /* Auxiliary structures */
140 
141 struct heap_entry {
142 	uint32_t		start;
143 	uint32_t		end;
144 	uint32_t		preflen;
145 	uint32_t		nexthop;
146 };
147 
148 struct chunk_desc {
149 	LIST_ENTRY(chunk_desc)	cd_all_le;
150 	LIST_ENTRY(chunk_desc)	cd_hash_le;
151 	uint32_t		cd_hash;
152 	uint32_t		cd_refcnt;
153 	uint32_t		cd_base;
154 	uint32_t		cd_cur_size;
155 	uint32_t		cd_max_size;
156 };
157 
158 struct trie_desc {
159 	LIST_ENTRY(trie_desc)	td_all_le;
160 	LIST_ENTRY(trie_desc)	td_hash_le;
161 	uint32_t		td_hash;
162 	uint32_t		td_index;
163 	uint32_t		td_refcnt;
164 };
165 
166 struct dxr_aux {
167 	/* Glue to external state */
168 	struct fib_data		*fd;
169 	uint32_t		fibnum;
170 	int			refcnt;
171 
172 	/* Auxiliary build-time tables */
173 	struct direct_entry	direct_tbl[DIRECT_TBL_SIZE];
174 	uint16_t		d_tbl[D_TBL_SIZE];
175 	struct direct_entry	*x_tbl;
176 	union {
177 		struct range_entry_long	re;
178 		uint32_t	fragments;
179 	}			*range_tbl;
180 
181 	/* Auxiliary internal state */
182 	uint32_t		updates_mask[DIRECT_TBL_SIZE / 32];
183 	struct trie_desc	*trietbl[D_TBL_SIZE];
184 	LIST_HEAD(, chunk_desc)	chunk_hashtbl[CHUNK_HASH_SIZE];
185 	LIST_HEAD(, chunk_desc)	all_chunks;
186 	LIST_HEAD(, chunk_desc) unused_chunks[UNUSED_BUCKETS];
187 	LIST_HEAD(, trie_desc)	trie_hashtbl[TRIE_HASH_SIZE];
188 	LIST_HEAD(, trie_desc)	all_trie;
189 	LIST_HEAD(, trie_desc)	unused_trie; /* abuses hash link entry */
190 	struct sockaddr_in	dst;
191 	struct sockaddr_in	mask;
192 	struct heap_entry	heap[33];
193 	uint32_t		prefixes;
194 	uint32_t		updates_low;
195 	uint32_t		updates_high;
196 	uint32_t		all_chunks_cnt;
197 	uint32_t		unused_chunks_cnt;
198 	uint32_t		xtbl_size;
199 	uint32_t		all_trie_cnt;
200 	uint32_t		unused_trie_cnt;
201 	uint32_t		trie_rebuilt_prefixes;
202 	uint32_t		heap_index;
203 	uint32_t		d_bits;
204 	uint32_t		rtbl_size;
205 	uint32_t		rtbl_top;
206 	uint32_t		rtbl_work_frags;
207 	uint32_t		work_chunk;
208 };
209 
210 /* Main lookup structure container */
211 
212 struct dxr {
213 	/* Lookup tables */
214 	uint16_t		d_shift;
215 	uint16_t		x_shift;
216 	uint32_t		x_mask;
217 	void			*d;
218 	void			*x;
219 	void			*r;
220 	struct nhop_object	**nh_tbl;
221 
222 	/* Glue to external state */
223 	struct dxr_aux		*aux;
224 	struct fib_data		*fd;
225 	struct epoch_context	epoch_ctx;
226 	uint32_t		fibnum;
227 };
228 
229 static MALLOC_DEFINE(M_DXRLPM, "dxr", "DXR LPM");
230 static MALLOC_DEFINE(M_DXRAUX, "dxr aux", "DXR auxiliary");
231 
232 uma_zone_t chunk_zone;
233 uma_zone_t trie_zone;
234 
235 SYSCTL_DECL(_net_route_algo);
236 SYSCTL_NODE(_net_route_algo, OID_AUTO, dxr, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
237     "DXR tunables");
238 
239 VNET_DEFINE_STATIC(int, max_trie_holes) = 8;
240 #define	V_max_trie_holes	VNET(max_trie_holes)
241 SYSCTL_INT(_net_route_algo_dxr, OID_AUTO, max_trie_holes,
242     CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(max_trie_holes), 0,
243     "Trie fragmentation threshold before triggering a full rebuild");
244 
245 VNET_DEFINE_STATIC(int, max_range_holes) = 16;
246 #define	V_max_range_holes	VNET(max_range_holes)
247 SYSCTL_INT(_net_route_algo_dxr, OID_AUTO, max_range_holes,
248     CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(max_range_holes), 0,
249     "Range table fragmentation threshold before triggering a full rebuild");
250 
251 /* Binary search for a matching address range */
252 #define	DXR_LOOKUP_STAGE					\
253 	if (masked_dst < range[middle].start) {			\
254 		upperbound = middle;				\
255 		middle = (middle + lowerbound) / 2;		\
256 	} else if (masked_dst < range[middle + 1].start)	\
257 		return (range[middle].nexthop);			\
258 	else {							\
259 		lowerbound = middle + 1;			\
260 		middle = (upperbound + middle + 1) / 2;		\
261 	}							\
262 	if (upperbound == lowerbound)				\
263 		return (range[lowerbound].nexthop);
264 
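/*
 * dxr_lookup() resolves a destination address given in host byte order to a
 * nexthop index.  The upper DXR_TRIE_BITS of the key walk the two-stage trie
 * (d_tbl, then x_tbl) when DXR2 is defined, or index direct_tbl directly
 * otherwise.  A direct hit returns the nexthop index immediately; otherwise
 * the remaining low bits are binary-searched in the chunk's sorted range
 * array, two unrolled DXR_LOOKUP_STAGE steps per loop iteration.  The result
 * indexes dxr->nh_tbl.
 */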
265 static int
266 dxr_lookup(struct dxr *dxr, uint32_t dst)
267 {
268 #ifdef DXR2
269 	uint16_t *dt = dxr->d;
270 	struct direct_entry *xt = dxr->x;
271 	int xi;
272 #else
273 	struct direct_entry *dt = dxr->d;
274 #endif
275 	struct direct_entry de;
276 	struct range_entry_long	*rt;
277 	uint32_t base;
278 	uint32_t upperbound;
279 	uint32_t middle;
280 	uint32_t lowerbound;
281 	uint32_t masked_dst;
282 
283 #ifdef DXR2
284 	xi = (dt[dst >> dxr->d_shift] << dxr->x_shift) +
285 	    ((dst >> DXR_RANGE_SHIFT) & dxr->x_mask);
286 	de = xt[xi];
287 #else
288 	de = dt[dst >> DXR_RANGE_SHIFT];
289 #endif
290 
291 	if (__predict_true(de.fragments == FRAGS_MARK_HIT))
292 		return (de.base);
293 
294 	rt = dxr->r;
295 	base = de.base;
296 	lowerbound = 0;
297 	masked_dst = dst & DXR_RANGE_MASK;
298 
299 #if DXR_TRIE_BITS < 24
300 	if (__predict_true(IS_SHORT_FORMAT(de.fragments))) {
301 		upperbound = de.fragments & FRAGS_MASK_SHORT;
302 		struct range_entry_short *range =
303 		    (struct range_entry_short *) &rt[base];
304 
305 		masked_dst >>= 8;
306 		middle = upperbound;
307 		upperbound = upperbound * 2 + 1;
308 
309 		for (;;) {
310 			DXR_LOOKUP_STAGE
311 			DXR_LOOKUP_STAGE
312 		}
313 	}
314 #endif
315 
316 	upperbound = de.fragments;
317 	middle = upperbound / 2;
318 	struct range_entry_long *range = &rt[base];
319 	if (__predict_false(IS_XL_FORMAT(de.fragments))) {
320 		upperbound = *((uint32_t *) range);
321 		range++;
322 		middle = upperbound / 2;
323 	}
324 
325 	for (;;) {
326 		DXR_LOOKUP_STAGE
327 		DXR_LOOKUP_STAGE
328 	}
329 }
330 
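/*
 * Prime the per-chunk prefix stack (da->heap) with the most specific route
 * covering address dst_u32, or with a catch-all zero entry spanning the
 * entire address space when no route matches.
 */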
331 static void
332 initheap(struct dxr_aux *da, uint32_t dst_u32, uint32_t chunk)
333 {
334 	struct heap_entry *fhp = &da->heap[0];
335 	struct rtentry *rt;
336 	struct route_nhop_data rnd;
337 
338 	da->heap_index = 0;
339 	da->dst.sin_addr.s_addr = htonl(dst_u32);
340 	rt = fib4_lookup_rt(da->fibnum, da->dst.sin_addr, 0, NHR_UNLOCKED,
341 	    &rnd);
342 	if (rt != NULL) {
343 		struct in_addr addr;
344 		uint32_t scopeid;
345 
346 		rt_get_inet_prefix_plen(rt, &addr, &fhp->preflen, &scopeid);
347 		fhp->start = ntohl(addr.s_addr);
348 		fhp->end = fhp->start;
349 		if (fhp->preflen < 32)
350 			fhp->end |= (0xffffffffU >> fhp->preflen);
351 		fhp->nexthop = fib_get_nhop_idx(da->fd, rnd.rnd_nhop);
352 	} else {
353 		fhp->preflen = fhp->nexthop = fhp->start = 0;
354 		fhp->end = 0xffffffffU;
355 	}
356 }
357 
358 static uint32_t
359 chunk_size(struct dxr_aux *da, struct direct_entry *fdesc)
360 {
361 
362 	if (IS_SHORT_FORMAT(fdesc->fragments))
363 		return ((fdesc->fragments & FRAGS_MASK_SHORT) + 1);
364 	else if (IS_XL_FORMAT(fdesc->fragments))
365 		return (da->range_tbl[fdesc->base].fragments + 2);
366 	else /* if (IS_LONG_FORMAT(fdesc->fragments)) */
367 		return (fdesc->fragments + 1);
368 }
369 
370 static uint32_t
371 chunk_hash(struct dxr_aux *da, struct direct_entry *fdesc)
372 {
373 	uint32_t size = chunk_size(da, fdesc);
374 	uint32_t *p = (uint32_t *) &da->range_tbl[fdesc->base];
375 	uint32_t *l = (uint32_t *) &da->range_tbl[fdesc->base + size];
376 	uint32_t hash = fdesc->fragments;
377 
378 	for (; p < l; p++)
379 		hash = (hash << 7) + (hash >> 13) + *p;
380 
381 	return (hash + (hash >> 16));
382 }
383 
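/*
 * Register the chunk most recently built at the top of the range table
 * heap.  If a chunk with identical contents already exists (found via a
 * content hash and memcmp()), share it and roll rtbl_top back; otherwise try
 * to move the new chunk into a suitably sized hole taken from the
 * unused_chunks[] buckets, splitting the hole if it is larger.  As a last
 * resort the chunk stays where it is and gets a fresh descriptor.  The range
 * table itself is grown once its top approaches the currently allocated
 * size.  Returns 0 on success, 1 on allocation failure or structural
 * overflow.
 */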
384 static int
385 chunk_ref(struct dxr_aux *da, uint32_t chunk)
386 {
387 	struct direct_entry *fdesc = &da->direct_tbl[chunk];
388 	struct chunk_desc *cdp, *empty_cdp;
389 	uint32_t base = fdesc->base;
390 	uint32_t size = chunk_size(da, fdesc);
391 	uint32_t hash = chunk_hash(da, fdesc);
392 	int i;
393 
394 	/* Find an existing descriptor */
395 	LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK],
396 	    cd_hash_le) {
397 		if (cdp->cd_hash != hash || cdp->cd_cur_size != size ||
398 		    memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base],
399 		    sizeof(struct range_entry_long) * size))
400 			continue;
401 		da->rtbl_top = fdesc->base;
402 		fdesc->base = cdp->cd_base;
403 		cdp->cd_refcnt++;
404 		return (0);
405 	}
406 
407 	/* No matching chunks found. Find an empty one to recycle. */
408 	for (cdp = NULL, i = size; cdp == NULL && i < UNUSED_BUCKETS; i++)
409 		cdp = LIST_FIRST(&da->unused_chunks[i]);
410 
411 	if (cdp == NULL)
412 		LIST_FOREACH(empty_cdp, &da->unused_chunks[0], cd_hash_le)
413 			if (empty_cdp->cd_max_size >= size && (cdp == NULL ||
414 			    empty_cdp->cd_max_size < cdp->cd_max_size)) {
415 				cdp = empty_cdp;
416 				if (empty_cdp->cd_max_size == size)
417 					break;
418 			}
419 
420 	if (cdp != NULL) {
421 		/* Copy from heap into the recycled chunk */
422 		bcopy(&da->range_tbl[fdesc->base], &da->range_tbl[cdp->cd_base],
423 		    size * sizeof(struct range_entry_long));
424 		fdesc->base = cdp->cd_base;
425 		da->rtbl_top -= size;
426 		da->unused_chunks_cnt--;
427 		if (cdp->cd_max_size > size) {
428 			/* Split the range in two, need a new descriptor */
429 			empty_cdp = uma_zalloc(chunk_zone, M_NOWAIT);
430 			if (empty_cdp == NULL)
431 				return (1);
432 			LIST_INSERT_BEFORE(cdp, empty_cdp, cd_all_le);
433 			empty_cdp->cd_base = cdp->cd_base + size;
434 			empty_cdp->cd_cur_size = 0;
435 			empty_cdp->cd_max_size = cdp->cd_max_size - size;
436 
437 			i = empty_cdp->cd_max_size;
438 			if (i >= UNUSED_BUCKETS)
439 				i = 0;
440 			LIST_INSERT_HEAD(&da->unused_chunks[i], empty_cdp,
441 			    cd_hash_le);
442 
443 			da->all_chunks_cnt++;
444 			da->unused_chunks_cnt++;
445 			cdp->cd_max_size = size;
446 		}
447 		LIST_REMOVE(cdp, cd_hash_le);
448 	} else {
449 		/* Alloc a new descriptor at the top of the heap */
450 		cdp = uma_zalloc(chunk_zone, M_NOWAIT);
451 		if (cdp == NULL)
452 			return (1);
453 		cdp->cd_max_size = size;
454 		cdp->cd_base = fdesc->base;
455 		LIST_INSERT_HEAD(&da->all_chunks, cdp, cd_all_le);
456 		da->all_chunks_cnt++;
457 		KASSERT(cdp->cd_base + cdp->cd_max_size == da->rtbl_top,
458 		    ("dxr: %s %d", __FUNCTION__, __LINE__));
459 	}
460 
461 	cdp->cd_hash = hash;
462 	cdp->cd_refcnt = 1;
463 	cdp->cd_cur_size = size;
464 	LIST_INSERT_HEAD(&da->chunk_hashtbl[hash & CHUNK_HASH_MASK], cdp,
465 	    cd_hash_le);
466 	if (da->rtbl_top >= da->rtbl_size) {
467 		if (da->rtbl_top >= BASE_MAX) {
468 			FIB_PRINTF(LOG_ERR, da->fd,
469 			    "structural limit exceeded at %d "
470 			    "range table elements", da->rtbl_top);
471 			return (1);
472 		}
473 		da->rtbl_size += RTBL_SIZE_INCR;
474 		if (da->rtbl_top >= BASE_MAX / 4)
475 			FIB_PRINTF(LOG_WARNING, da->fd, "range table at %d%%",
476 			    da->rtbl_top * 100 / BASE_MAX);
477 		da->range_tbl = realloc(da->range_tbl,
478 		    sizeof(*da->range_tbl) * da->rtbl_size + FRAGS_PREF_SHORT,
479 		    M_DXRAUX, M_NOWAIT);
480 		if (da->range_tbl == NULL)
481 			return (1);
482 	}
483 
484 	return (0);
485 }
486 
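/*
 * Drop a reference on the chunk described by direct_tbl[chunk].  Once the
 * last reference goes away the chunk becomes a hole: it is merged with
 * adjacent empty chunks, trimmed off the top of the range table heap when
 * possible, or otherwise queued in the unused_chunks[] bucket matching its
 * size for later recycling by chunk_ref().
 */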
487 static void
488 chunk_unref(struct dxr_aux *da, uint32_t chunk)
489 {
490 	struct direct_entry *fdesc = &da->direct_tbl[chunk];
491 	struct chunk_desc *cdp, *cdp2;
492 	uint32_t base = fdesc->base;
493 	uint32_t size = chunk_size(da, fdesc);
494 	uint32_t hash = chunk_hash(da, fdesc);
495 	int i;
496 
497 	/* Find the corresponding descriptor */
498 	LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK],
499 	    cd_hash_le)
500 		if (cdp->cd_hash == hash && cdp->cd_cur_size == size &&
501 		    memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base],
502 		    sizeof(struct range_entry_long) * size) == 0)
503 			break;
504 
505 	KASSERT(cdp != NULL, ("dxr: dangling chunk"));
506 	if (--cdp->cd_refcnt > 0)
507 		return;
508 
509 	LIST_REMOVE(cdp, cd_hash_le);
510 	da->unused_chunks_cnt++;
511 	cdp->cd_cur_size = 0;
512 
513 	/* Attempt to merge with the preceding chunk, if empty */
514 	cdp2 = LIST_NEXT(cdp, cd_all_le);
515 	if (cdp2 != NULL && cdp2->cd_cur_size == 0) {
516 		KASSERT(cdp2->cd_base + cdp2->cd_max_size == cdp->cd_base,
517 		    ("dxr: %s %d", __FUNCTION__, __LINE__));
518 		LIST_REMOVE(cdp, cd_all_le);
519 		da->all_chunks_cnt--;
520 		LIST_REMOVE(cdp2, cd_hash_le);
521 		da->unused_chunks_cnt--;
522 		cdp2->cd_max_size += cdp->cd_max_size;
523 		uma_zfree(chunk_zone, cdp);
524 		cdp = cdp2;
525 	}
526 
527 	/* Attempt to merge with the subsequent chunk, if empty */
528 	cdp2 = LIST_PREV(cdp, &da->all_chunks, chunk_desc, cd_all_le);
529 	if (cdp2 != NULL && cdp2->cd_cur_size == 0) {
530 		KASSERT(cdp->cd_base + cdp->cd_max_size == cdp2->cd_base,
531 		    ("dxr: %s %d", __FUNCTION__, __LINE__));
532 		LIST_REMOVE(cdp, cd_all_le);
533 		da->all_chunks_cnt--;
534 		LIST_REMOVE(cdp2, cd_hash_le);
535 		da->unused_chunks_cnt--;
536 		cdp2->cd_max_size += cdp->cd_max_size;
537 		cdp2->cd_base = cdp->cd_base;
538 		uma_zfree(chunk_zone, cdp);
539 		cdp = cdp2;
540 	}
541 
542 	if (cdp->cd_base + cdp->cd_max_size == da->rtbl_top) {
543 		/* Free the chunk on the top of the range heap, trim the heap */
544 		KASSERT(cdp == LIST_FIRST(&da->all_chunks),
545 		    ("dxr: %s %d", __FUNCTION__, __LINE__));
546 		da->all_chunks_cnt--;
547 		da->unused_chunks_cnt--;
548 		da->rtbl_top -= cdp->cd_max_size;
549 		LIST_REMOVE(cdp, cd_all_le);
550 		uma_zfree(chunk_zone, cdp);
551 		return;
552 	}
553 
554 	i = cdp->cd_max_size;
555 	if (i >= UNUSED_BUCKETS)
556 		i = 0;
557 	LIST_INSERT_HEAD(&da->unused_chunks[i], cdp, cd_hash_le);
558 }
559 
560 #ifdef DXR2
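/*
 * DXR2 trie stage: slices of direct_tbl, each 2^dxr_x entries wide, are
 * deduplicated into x_tbl, while d_tbl[] maps the top dxr_d bits of a key to
 * the index of its slice.  trie_ref() and trie_unref() maintain the slice
 * descriptors and the content hash table used to detect duplicates.
 */
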
561 static uint32_t
562 trie_hash(struct dxr_aux *da, uint32_t dxr_x, uint32_t index)
563 {
564 	uint32_t i, *val;
565 	uint32_t hash = 0;
566 
567 	for (i = 0; i < (1 << dxr_x); i++) {
568 		hash = (hash << 3) ^ (hash >> 3);
569 		val = (uint32_t *)
570 		    (void *) &da->direct_tbl[(index << dxr_x) + i];
571 		hash += (*val << 5);
572 		hash += (*val >> 5);
573 	}
574 
575 	return (hash + (hash >> 16));
576 }
577 
578 static int
579 trie_ref(struct dxr_aux *da, uint32_t index)
580 {
581 	struct trie_desc *tp;
582 	uint32_t dxr_d = da->d_bits;
583 	uint32_t dxr_x = DXR_TRIE_BITS - dxr_d;
584 	uint32_t hash = trie_hash(da, dxr_x, index);
585 
586 	/* Find an existing descriptor */
587 	LIST_FOREACH(tp, &da->trie_hashtbl[hash & TRIE_HASH_MASK], td_hash_le)
588 		if (tp->td_hash == hash &&
589 		    memcmp(&da->direct_tbl[index << dxr_x],
590 		    &da->x_tbl[tp->td_index << dxr_x],
591 		    sizeof(*da->x_tbl) << dxr_x) == 0) {
592 			tp->td_refcnt++;
593 			da->trietbl[index] = tp;
594 			return(tp->td_index);
595 		}
596 
597 	tp = LIST_FIRST(&da->unused_trie);
598 	if (tp != NULL) {
599 		LIST_REMOVE(tp, td_hash_le);
600 		da->unused_trie_cnt--;
601 	} else {
602 		tp = uma_zalloc(trie_zone, M_NOWAIT);
603 		if (tp == NULL)
604 			return (-1);
605 		LIST_INSERT_HEAD(&da->all_trie, tp, td_all_le);
606 		tp->td_index = da->all_trie_cnt++;
607 	}
608 
609 	tp->td_hash = hash;
610 	tp->td_refcnt = 1;
611 	LIST_INSERT_HEAD(&da->trie_hashtbl[hash & TRIE_HASH_MASK], tp,
612 	   td_hash_le);
613 	memcpy(&da->x_tbl[tp->td_index << dxr_x],
614 	    &da->direct_tbl[index << dxr_x], sizeof(*da->x_tbl) << dxr_x);
615 	da->trietbl[index] = tp;
616 	if (da->all_trie_cnt >= da->xtbl_size >> dxr_x) {
617 		da->xtbl_size += XTBL_SIZE_INCR;
618 		da->x_tbl = realloc(da->x_tbl,
619 		    sizeof(*da->x_tbl) * da->xtbl_size, M_DXRAUX, M_NOWAIT);
620 		if (da->x_tbl == NULL)
621 			return (-1);
622 	}
623 	return(tp->td_index);
624 }
625 
626 static void
627 trie_unref(struct dxr_aux *da, uint32_t index)
628 {
629 	struct trie_desc *tp = da->trietbl[index];
630 
631 	if (tp == NULL)
632 		return;
633 	da->trietbl[index] = NULL;
634 	if (--tp->td_refcnt > 0)
635 		return;
636 
637 	LIST_REMOVE(tp, td_hash_le);
638 	da->unused_trie_cnt++;
639 	if (tp->td_index != da->all_trie_cnt - 1) {
640 		LIST_INSERT_HEAD(&da->unused_trie, tp, td_hash_le);
641 		return;
642 	}
643 
644 	do {
645 		da->all_trie_cnt--;
646 		da->unused_trie_cnt--;
647 		LIST_REMOVE(tp, td_all_le);
648 		uma_zfree(trie_zone, tp);
649 		LIST_FOREACH(tp, &da->unused_trie, td_hash_le)
650 			if (tp->td_index == da->all_trie_cnt - 1) {
651 				LIST_REMOVE(tp, td_hash_le);
652 				break;
653 			}
654 	} while (tp != NULL);
655 }
656 #endif
657 
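/*
 * Push a prefix onto the per-chunk stack in da->heap, keeping entries
 * ordered by ascending prefix length from bottom to top; a prefix with a
 * length already present on the stack is ignored.
 */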
658 static void
659 heap_inject(struct dxr_aux *da, uint32_t start, uint32_t end, uint32_t preflen,
660     uint32_t nh)
661 {
662 	struct heap_entry *fhp;
663 	int i;
664 
665 	for (i = da->heap_index; i >= 0; i--) {
666 		if (preflen > da->heap[i].preflen)
667 			break;
668 		else if (preflen < da->heap[i].preflen)
669 			da->heap[i + 1] = da->heap[i];
670 		else
671 			return;
672 	}
673 
674 	fhp = &da->heap[i + 1];
675 	fhp->preflen = preflen;
676 	fhp->start = start;
677 	fhp->end = end;
678 	fhp->nexthop = nh;
679 	da->heap_index++;
680 }
681 
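/*
 * rib_walk_from() callback, invoked for each route inside the current work
 * chunk in ascending address order.  Prefixes ending before the new route
 * starts are popped off the heap, with a new range fragment emitted whenever
 * the effective nexthop changes; the new prefix is then pushed onto the
 * heap.  Returns -1 once the walk moves past the chunk boundary.
 */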
682 static int
683 dxr_walk(struct rtentry *rt, void *arg)
684 {
685 	struct dxr_aux *da = arg;
686 	uint32_t chunk = da->work_chunk;
687 	uint32_t first = chunk << DXR_RANGE_SHIFT;
688 	uint32_t last = first | DXR_RANGE_MASK;
689 	struct range_entry_long *fp =
690 	    &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re;
691 	struct heap_entry *fhp = &da->heap[da->heap_index];
692 	uint32_t preflen, nh, start, end, scopeid;
693 	struct in_addr addr;
694 
695 	rt_get_inet_prefix_plen(rt, &addr, &preflen, &scopeid);
696 	start = ntohl(addr.s_addr);
697 	if (start > last)
698 		return (-1);	/* Beyond chunk boundaries, we are done */
699 	if (start < first)
700 		return (0);	/* Skip this route */
701 
702 	end = start;
703 	if (preflen < 32)
704 		end |= (0xffffffffU >> preflen);
705 	nh = fib_get_nhop_idx(da->fd, rt_get_raw_nhop(rt));
706 
707 	if (start == fhp->start)
708 		heap_inject(da, start, end, preflen, nh);
709 	else {
710 		/* start > fhp->start */
711 		while (start > fhp->end) {
712 			uint32_t oend = fhp->end;
713 
714 			if (da->heap_index > 0) {
715 				fhp--;
716 				da->heap_index--;
717 			} else
718 				initheap(da, fhp->end + 1, chunk);
719 			if (fhp->end > oend && fhp->nexthop != fp->nexthop) {
720 				fp++;
721 				da->rtbl_work_frags++;
722 				fp->start = (oend + 1) & DXR_RANGE_MASK;
723 				fp->nexthop = fhp->nexthop;
724 			}
725 		}
726 		if (start > ((chunk << DXR_RANGE_SHIFT) | fp->start) &&
727 		    nh != fp->nexthop) {
728 			fp++;
729 			da->rtbl_work_frags++;
730 			fp->start = start & DXR_RANGE_MASK;
731 		} else if (da->rtbl_work_frags) {
732 			if ((--fp)->nexthop == nh)
733 				da->rtbl_work_frags--;
734 			else
735 				fp++;
736 		}
737 		fp->nexthop = nh;
738 		heap_inject(da, start, end, preflen, nh);
739 	}
740 
741 	return (0);
742 }
743 
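/*
 * Rebuild the range fragments covering one direct_tbl slot: release the old
 * chunk, walk the routes overlapping the chunk, flush any prefixes left on
 * the heap, then pick the densest encoding (a direct hit for a single
 * fragment, packed short entries when all starts and nexthops fit, or the
 * XL layout when the fragment count overflows the descriptor) and register
 * the result through chunk_ref().
 */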
744 static int
745 update_chunk(struct dxr_aux *da, uint32_t chunk)
746 {
747 	struct range_entry_long *fp;
748 #if DXR_TRIE_BITS < 24
749 	struct range_entry_short *fps;
750 	uint32_t start, nh, i;
751 #endif
752 	struct heap_entry *fhp;
753 	uint32_t first = chunk << DXR_RANGE_SHIFT;
754 	uint32_t last = first | DXR_RANGE_MASK;
755 
756 	if (da->direct_tbl[chunk].fragments != FRAGS_MARK_HIT)
757 		chunk_unref(da, chunk);
758 
759 	initheap(da, first, chunk);
760 
761 	fp = &da->range_tbl[da->rtbl_top].re;
762 	da->rtbl_work_frags = 0;
763 	fp->start = first & DXR_RANGE_MASK;
764 	fp->nexthop = da->heap[0].nexthop;
765 
766 	da->dst.sin_addr.s_addr = htonl(first);
767 	da->mask.sin_addr.s_addr = htonl(~DXR_RANGE_MASK);
768 
769 	da->work_chunk = chunk;
770 	rib_walk_from(da->fibnum, AF_INET, RIB_FLAG_LOCKED,
771 	    (struct sockaddr *) &da->dst, (struct sockaddr *) &da->mask,
772 	    dxr_walk, da);
773 
774 	/* Flush any remaining objects on the heap */
775 	fp = &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re;
776 	fhp = &da->heap[da->heap_index];
777 	while (fhp->preflen > DXR_TRIE_BITS) {
778 		uint32_t oend = fhp->end;
779 
780 		if (da->heap_index > 0) {
781 			fhp--;
782 			da->heap_index--;
783 		} else
784 			initheap(da, fhp->end + 1, chunk);
785 		if (fhp->end > oend && fhp->nexthop != fp->nexthop) {
786 			/* Have we crossed the upper chunk boundary? */
787 			if (oend >= last)
788 				break;
789 			fp++;
790 			da->rtbl_work_frags++;
791 			fp->start = (oend + 1) & DXR_RANGE_MASK;
792 			fp->nexthop = fhp->nexthop;
793 		}
794 	}
795 
796 	/* Direct hit if the chunk contains only a single fragment */
797 	if (da->rtbl_work_frags == 0) {
798 		da->direct_tbl[chunk].base = fp->nexthop;
799 		da->direct_tbl[chunk].fragments = FRAGS_MARK_HIT;
800 		return (0);
801 	}
802 
803 	da->direct_tbl[chunk].base = da->rtbl_top;
804 	da->direct_tbl[chunk].fragments = da->rtbl_work_frags;
805 
806 #if DXR_TRIE_BITS < 24
807 	/* Check whether the chunk can be more compactly encoded */
808 	fp = &da->range_tbl[da->rtbl_top].re;
809 	for (i = 0; i <= da->rtbl_work_frags; i++, fp++)
810 		if ((fp->start & 0xff) != 0 || fp->nexthop > RE_SHORT_MAX_NH)
811 			break;
812 	if (i == da->rtbl_work_frags + 1) {
813 		fp = &da->range_tbl[da->rtbl_top].re;
814 		fps = (void *) fp;
815 		for (i = 0; i <= da->rtbl_work_frags; i++, fp++, fps++) {
816 			start = fp->start;
817 			nh = fp->nexthop;
818 			fps->start = start >> 8;
819 			fps->nexthop = nh;
820 		}
821 		fps->start = start >> 8;
822 		fps->nexthop = nh;
823 		da->rtbl_work_frags >>= 1;
824 		da->direct_tbl[chunk].fragments =
825 		    da->rtbl_work_frags | FRAGS_PREF_SHORT;
826 	} else
827 #endif
828 	if (da->rtbl_work_frags >= FRAGS_MARK_HIT) {
829 		da->direct_tbl[chunk].fragments = FRAGS_MARK_XL;
830 		memmove(&da->range_tbl[da->rtbl_top + 1],
831 		   &da->range_tbl[da->rtbl_top],
832 		   (da->rtbl_work_frags + 1) * sizeof(*da->range_tbl));
833 		da->range_tbl[da->rtbl_top].fragments = da->rtbl_work_frags;
834 		da->rtbl_work_frags++;
835 	}
836 	da->rtbl_top += (da->rtbl_work_frags + 1);
837 	return (chunk_ref(da, chunk));
838 }
839 
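/*
 * Construct or incrementally refresh the lookup structures.  Chunks whose
 * bits are set in updates_mask are rebuilt; a full rebuild is forced when no
 * usable incremental state exists or the configured hole thresholds are
 * exceeded.  With DXR2 the d_bits split is then varied until the combined
 * size of d_tbl, x_tbl and the range table stops shrinking, and the result
 * is finally copied into a single contiguous allocation hung off dxr->d,
 * with dxr->x and dxr->r pointing into it.
 */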
840 static void
841 dxr_build(struct dxr *dxr)
842 {
843 	struct dxr_aux *da = dxr->aux;
844 	struct chunk_desc *cdp;
845 	struct rib_rtable_info rinfo;
846 	struct timeval t0, t1, t2, t3;
847 	uint32_t r_size, dxr_tot_size;
848 	uint32_t i, m, range_rebuild = 0;
849 #ifdef DXR2
850 	struct trie_desc *tp;
851 	uint32_t d_tbl_size, dxr_x, d_size, x_size;
852 	uint32_t ti, trie_rebuild = 0, prev_size = 0;
853 #endif
854 
855 	KASSERT(dxr->d == NULL, ("dxr: d not free"));
856 
857 	if (da == NULL) {
858 		da = malloc(sizeof(*dxr->aux), M_DXRAUX, M_NOWAIT);
859 		if (da == NULL)
860 			return;
861 		dxr->aux = da;
862 		da->fibnum = dxr->fibnum;
863 		da->refcnt = 1;
864 		LIST_INIT(&da->all_chunks);
865 		LIST_INIT(&da->all_trie);
866 		da->rtbl_size = RTBL_SIZE_INCR;
867 		da->range_tbl = NULL;
868 		da->xtbl_size = XTBL_SIZE_INCR;
869 		da->x_tbl = NULL;
870 		bzero(&da->dst, sizeof(da->dst));
871 		bzero(&da->mask, sizeof(da->mask));
872 		da->dst.sin_len = sizeof(da->dst);
873 		da->mask.sin_len = sizeof(da->mask);
874 		da->dst.sin_family = AF_INET;
875 		da->mask.sin_family = AF_INET;
876 	}
877 	if (da->range_tbl == NULL) {
878 		da->range_tbl = malloc(sizeof(*da->range_tbl) * da->rtbl_size
879 		    + FRAGS_PREF_SHORT, M_DXRAUX, M_NOWAIT);
880 		if (da->range_tbl == NULL)
881 			return;
882 		range_rebuild = 1;
883 	}
884 #ifdef DXR2
885 	if (da->x_tbl == NULL) {
886 		da->x_tbl = malloc(sizeof(*da->x_tbl) * da->xtbl_size,
887 		    M_DXRAUX, M_NOWAIT);
888 		if (da->x_tbl == NULL)
889 			return;
890 		trie_rebuild = 1;
891 	}
892 #endif
893 	da->fd = dxr->fd;
894 
895 	microuptime(&t0);
896 
897 	dxr->nh_tbl = fib_get_nhop_array(da->fd);
898 	fib_get_rtable_info(fib_get_rh(da->fd), &rinfo);
899 
900 	if (da->updates_low > da->updates_high ||
901 	    da->unused_chunks_cnt > V_max_range_holes)
902 		range_rebuild = 1;
903 	if (range_rebuild) {
904 		/* Bulk cleanup */
905 		bzero(da->chunk_hashtbl, sizeof(da->chunk_hashtbl));
906 		while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) {
907 			LIST_REMOVE(cdp, cd_all_le);
908 			uma_zfree(chunk_zone, cdp);
909 		}
910 		for (i = 0; i < UNUSED_BUCKETS; i++)
911 			LIST_INIT(&da->unused_chunks[i]);
912 		da->all_chunks_cnt = da->unused_chunks_cnt = 0;
913 		da->rtbl_top = 0;
914 		da->updates_low = 0;
915 		da->updates_high = DIRECT_TBL_SIZE - 1;
916 		memset(da->updates_mask, 0xff, sizeof(da->updates_mask));
917 		for (i = 0; i < DIRECT_TBL_SIZE; i++) {
918 			da->direct_tbl[i].fragments = FRAGS_MARK_HIT;
919 			da->direct_tbl[i].base = 0;
920 		}
921 	}
922 	da->prefixes = rinfo.num_prefixes;
923 
924 	/* DXR: construct direct & range table */
925 	for (i = da->updates_low; i <= da->updates_high; i++) {
926 		m = da->updates_mask[i >> 5] >> (i & 0x1f);
927 		if (m == 0)
928 			i |= 0x1f;
929 		else if (m & 1 && update_chunk(da, i) != 0)
930 			return;
931 	}
932 	r_size = sizeof(*da->range_tbl) * da->rtbl_top;
933 	microuptime(&t1);
934 
935 #ifdef DXR2
936 	if (range_rebuild || da->unused_trie_cnt > V_max_trie_holes ||
937 	    abs(fls(da->prefixes) - fls(da->trie_rebuilt_prefixes)) > 1)
938 		trie_rebuild = 1;
939 	if (trie_rebuild) {
940 		da->trie_rebuilt_prefixes = da->prefixes;
941 		da->d_bits = DXR_D;
942 		da->updates_low = 0;
943 		da->updates_high = DIRECT_TBL_SIZE - 1;
944 	}
945 
946 dxr2_try_squeeze:
947 	if (trie_rebuild) {
948 		/* Bulk cleanup */
949 		bzero(da->trietbl, sizeof(da->trietbl));
950 		bzero(da->trie_hashtbl, sizeof(da->trie_hashtbl));
951 		while ((tp = LIST_FIRST(&da->all_trie)) != NULL) {
952 			LIST_REMOVE(tp, td_all_le);
953 			uma_zfree(trie_zone, tp);
954 		}
955 		LIST_INIT(&da->unused_trie);
956 		da->all_trie_cnt = da->unused_trie_cnt = 0;
957 	}
958 
959 	/* Populate d_tbl, x_tbl */
960 	dxr_x = DXR_TRIE_BITS - da->d_bits;
961 	d_tbl_size = (1 << da->d_bits);
962 
963 	for (i = da->updates_low >> dxr_x; i <= da->updates_high >> dxr_x;
964 	    i++) {
965 		if (!trie_rebuild) {
966 			m = 0;
967 			for (int j = 0; j < (1 << dxr_x); j += 32)
968 				m |= da->updates_mask[((i << dxr_x) + j) >> 5];
969 			if (m == 0)
970 				continue;
971 			trie_unref(da, i);
972 		}
973 		ti = trie_ref(da, i);
974 		if ((int) ti < 0)
975 			return;
976 		da->d_tbl[i] = ti;
977 	}
978 
979 	d_size = sizeof(*da->d_tbl) * d_tbl_size;
980 	x_size = sizeof(*da->x_tbl) * DIRECT_TBL_SIZE / d_tbl_size
981 	    * da->all_trie_cnt;
982 	dxr_tot_size = d_size + x_size + r_size;
983 
984 	if (trie_rebuild == 1) {
985 		/* Try to find a more compact D/X split */
986 		if (prev_size == 0 || dxr_tot_size <= prev_size)
987 			da->d_bits--;
988 		else {
989 			da->d_bits++;
990 			trie_rebuild = 2;
991 		}
992 		prev_size = dxr_tot_size;
993 		goto dxr2_try_squeeze;
994 	}
995 	microuptime(&t2);
996 #else /* !DXR2 */
997 	dxr_tot_size = sizeof(da->direct_tbl) + r_size;
998 	t2 = t1;
999 #endif
1000 
1001 	dxr->d = malloc(dxr_tot_size, M_DXRLPM, M_NOWAIT);
1002 	if (dxr->d == NULL)
1003 		return;
1004 #ifdef DXR2
1005 	memcpy(dxr->d, da->d_tbl, d_size);
1006 	dxr->x = ((char *) dxr->d) + d_size;
1007 	memcpy(dxr->x, da->x_tbl, x_size);
1008 	dxr->r = ((char *) dxr->x) + x_size;
1009 	dxr->d_shift = 32 - da->d_bits;
1010 	dxr->x_shift = dxr_x;
1011 	dxr->x_mask = 0xffffffffU >> (32 - dxr_x);
1012 #else /* !DXR2 */
1013 	memcpy(dxr->d, da->direct_tbl, sizeof(da->direct_tbl));
1014 	dxr->r = ((char *) dxr->d) + sizeof(da->direct_tbl);
1015 #endif
1016 	memcpy(dxr->r, da->range_tbl, r_size);
1017 
1018 	if (da->updates_low <= da->updates_high)
1019 		bzero(&da->updates_mask[da->updates_low / 32],
1020 		    (da->updates_high - da->updates_low) / 8 + 1);
1021 	da->updates_low = DIRECT_TBL_SIZE - 1;
1022 	da->updates_high = 0;
1023 	microuptime(&t3);
1024 
1025 #ifdef DXR2
1026 	FIB_PRINTF(LOG_INFO, da->fd, "D%dX%dR, %d prefixes, %d nhops (max)",
1027 	    da->d_bits, dxr_x, rinfo.num_prefixes, rinfo.num_nhops);
1028 #else
1029 	FIB_PRINTF(LOG_INFO, da->fd, "D%dR, %d prefixes, %d nhops (max)",
1030 	    DXR_D, rinfo.num_prefixes, rinfo.num_nhops);
1031 #endif
1032 	i = dxr_tot_size * 100;
1033 	if (rinfo.num_prefixes)
1034 		i /= rinfo.num_prefixes;
1035 	FIB_PRINTF(LOG_INFO, da->fd, "%d.%02d KBytes, %d.%02d Bytes/prefix",
1036 	    dxr_tot_size / 1024, dxr_tot_size * 100 / 1024 % 100,
1037 	    i / 100, i % 100);
1038 	i = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
1039 	FIB_PRINTF(LOG_INFO, da->fd, "range table %s in %u.%03u ms",
1040 	    range_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000);
1041 #ifdef DXR2
1042 	i = (t2.tv_sec - t1.tv_sec) * 1000000 + t2.tv_usec - t1.tv_usec;
1043 	FIB_PRINTF(LOG_INFO, da->fd, "trie %s in %u.%03u ms",
1044 	    trie_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000);
1045 #endif
1046 	i = (t3.tv_sec - t2.tv_sec) * 1000000 + t3.tv_usec - t2.tv_usec;
1047 	FIB_PRINTF(LOG_INFO, da->fd, "snapshot forked in %u.%03u ms",
1048 	    i / 1000, i % 1000);
1049 	FIB_PRINTF(LOG_INFO, da->fd, "range table: %d%%, %d chunks, %d holes",
1050 	    da->rtbl_top * 100 / BASE_MAX, da->all_chunks_cnt,
1051 	    da->unused_chunks_cnt);
1052 }
1053 
1054 /*
1055  * Glue functions for attaching to FreeBSD 13 fib_algo infrastructure.
1056  */
1057 
1058 static struct nhop_object *
1059 dxr_fib_lookup(void *algo_data, const struct flm_lookup_key key,
1060     uint32_t scopeid)
1061 {
1062 	struct dxr *dxr = algo_data;
1063 	uint32_t nh;
1064 
1065 	nh = dxr_lookup(dxr, ntohl(key.addr4.s_addr));
1066 
1067 	return (dxr->nh_tbl[nh]);
1068 }
1069 
1070 static enum flm_op_result
1071 dxr_init(uint32_t fibnum, struct fib_data *fd, void *old_data, void **data)
1072 {
1073 	struct dxr *old_dxr = old_data;
1074 	struct dxr_aux *da = NULL;
1075 	struct dxr *dxr;
1076 
1077 	dxr = malloc(sizeof(*dxr), M_DXRAUX, M_NOWAIT);
1078 	if (dxr == NULL)
1079 		return (FLM_REBUILD);
1080 
1081 	/* Check whether we may reuse the old auxiliary structures */
1082 	if (old_dxr != NULL && old_dxr->aux != NULL) {
1083 		da = old_dxr->aux;
1084 		atomic_add_int(&da->refcnt, 1);
1085 	}
1086 
1087 	dxr->aux = da;
1088 	dxr->d = NULL;
1089 	dxr->fd = fd;
1090 	dxr->fibnum = fibnum;
1091 	*data = dxr;
1092 
1093 	return (FLM_SUCCESS);
1094 }
1095 
1096 static void
1097 dxr_destroy(void *data)
1098 {
1099 	struct dxr *dxr = data;
1100 	struct dxr_aux *da;
1101 	struct chunk_desc *cdp;
1102 	struct trie_desc *tp;
1103 
1104 	if (dxr->d != NULL)
1105 		free(dxr->d, M_DXRLPM);
1106 
1107 	da = dxr->aux;
1108 	free(dxr, M_DXRAUX);
1109 
1110 	if (da == NULL || atomic_fetchadd_int(&da->refcnt, -1) > 1)
1111 		return;
1112 
1113 	/* Release auxiliary structures */
1114 	while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) {
1115 		LIST_REMOVE(cdp, cd_all_le);
1116 		uma_zfree(chunk_zone, cdp);
1117 	}
1118 	while ((tp = LIST_FIRST(&da->all_trie)) != NULL) {
1119 		LIST_REMOVE(tp, td_all_le);
1120 		uma_zfree(trie_zone, tp);
1121 	}
1122 	free(da->range_tbl, M_DXRAUX);
1123 	free(da->x_tbl, M_DXRAUX);
1124 	free(da, M_DXRAUX);
1125 }
1126 
1127 static void
1128 epoch_dxr_destroy(epoch_context_t ctx)
1129 {
1130 	struct dxr *dxr = __containerof(ctx, struct dxr, epoch_ctx);
1131 
1132 	dxr_destroy(dxr);
1133 }
1134 
1135 static enum flm_op_result
1136 dxr_dump_end(void *data, struct fib_dp *dp)
1137 {
1138 	struct dxr *dxr = data;
1139 	struct dxr_aux *da;
1140 
1141 	dxr_build(dxr);
1142 
1143 	da = dxr->aux;
1144 	if (da == NULL)
1145 		return (FLM_REBUILD);
1146 
1147 	/* Structural limit exceeded, hard error */
1148 	if (da->rtbl_top >= BASE_MAX)
1149 		return (FLM_ERROR);
1150 
1151 	/* A malloc(,, M_NOWAIT) failed somewhere, retry later */
1152 	if (dxr->d == NULL)
1153 		return (FLM_REBUILD);
1154 
1155 	dp->f = dxr_fib_lookup;
1156 	dp->arg = dxr;
1157 
1158 	return (FLM_SUCCESS);
1159 }
1160 
1161 static enum flm_op_result
1162 dxr_dump_rib_item(struct rtentry *rt, void *data)
1163 {
1164 
1165 	return (FLM_SUCCESS);
1166 }
1167 
1168 static enum flm_op_result
1169 dxr_change_rib_item(struct rib_head *rnh, struct rib_cmd_info *rc,
1170     void *data)
1171 {
1172 
1173 	return (FLM_BATCH);
1174 }
1175 
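/*
 * Apply a batch of RIB changes: every direct_tbl slot touched by a changed
 * prefix is marked dirty in updates_mask, a new struct dxr is built around
 * the shared incremental dxr_aux state, and the datapath pointer is switched
 * to it atomically; the superseded instance is reclaimed via an epoch
 * callback.
 */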
1176 static enum flm_op_result
1177 dxr_change_rib_batch(struct rib_head *rnh, struct fib_change_queue *q,
1178     void *data)
1179 {
1180 	struct dxr *dxr = data;
1181 	struct dxr *new_dxr;
1182 	struct dxr_aux *da;
1183 	struct fib_dp new_dp;
1184 	enum flm_op_result res;
1185 	uint32_t ip, plen, hmask, start, end, i, ui;
1186 #ifdef INVARIANTS
1187 	struct rib_rtable_info rinfo;
1188 	int update_delta = 0;
1189 #endif
1190 
1191 	KASSERT(data != NULL, ("%s: NULL data", __FUNCTION__));
1192 	KASSERT(q != NULL, ("%s: NULL q", __FUNCTION__));
1193 	KASSERT(q->count < q->size, ("%s: q->count %d q->size %d",
1194 	    __FUNCTION__, q->count, q->size));
1195 
1196 	da = dxr->aux;
1197 	KASSERT(da != NULL, ("%s: NULL dxr->aux", __FUNCTION__));
1198 
1199 	FIB_PRINTF(LOG_INFO, da->fd, "processing %d update(s)", q->count);
1200 	for (ui = 0; ui < q->count; ui++) {
1201 #ifdef INVARIANTS
1202 		if (q->entries[ui].nh_new != NULL)
1203 			update_delta++;
1204 		if (q->entries[ui].nh_old != NULL)
1205 			update_delta--;
1206 #endif
1207 		plen = q->entries[ui].plen;
1208 		ip = ntohl(q->entries[ui].addr4.s_addr);
1209 		if (plen < 32)
1210 			hmask = 0xffffffffU >> plen;
1211 		else
1212 			hmask = 0;
1213 		start = (ip & ~hmask) >> DXR_RANGE_SHIFT;
1214 		end = (ip | hmask) >> DXR_RANGE_SHIFT;
1215 
1216 		if ((start & 0x1f) == 0 && (end & 0x1f) == 0x1f)
1217 			for (i = start >> 5; i <= end >> 5; i++)
1218 				da->updates_mask[i] = 0xffffffffU;
1219 		else
1220 			for (i = start; i <= end; i++)
1221 				da->updates_mask[i >> 5] |= (1 << (i & 0x1f));
1222 		if (start < da->updates_low)
1223 			da->updates_low = start;
1224 		if (end > da->updates_high)
1225 			da->updates_high = end;
1226 	}
1227 
1228 #ifdef INVARIANTS
1229 	fib_get_rtable_info(fib_get_rh(da->fd), &rinfo);
1230 	KASSERT(da->prefixes + update_delta == rinfo.num_prefixes,
1231 	    ("%s: update count mismatch", __FUNCTION__));
1232 #endif
1233 
1234 	res = dxr_init(0, dxr->fd, data, (void **) &new_dxr);
1235 	if (res != FLM_SUCCESS)
1236 		return (res);
1237 
1238 	dxr_build(new_dxr);
1239 
1240 	/* Structural limit exceeded, hard error */
1241 	if (da->rtbl_top >= BASE_MAX) {
1242 		dxr_destroy(new_dxr);
1243 		return (FLM_ERROR);
1244 	}
1245 
1246 	/* A malloc(,, M_NOWAIT) failed somewhere, retry later */
1247 	if (new_dxr->d == NULL) {
1248 		dxr_destroy(new_dxr);
1249 		return (FLM_REBUILD);
1250 	}
1251 
1252 	new_dp.f = dxr_fib_lookup;
1253 	new_dp.arg = new_dxr;
1254 	if (fib_set_datapath_ptr(dxr->fd, &new_dp)) {
1255 		fib_set_algo_ptr(dxr->fd, new_dxr);
1256 		fib_epoch_call(epoch_dxr_destroy, &dxr->epoch_ctx);
1257 		return (FLM_SUCCESS);
1258 	}
1259 
1260 	dxr_destroy(new_dxr);
1261 	return (FLM_REBUILD);
1262 }
1263 
1264 static uint8_t
1265 dxr_get_pref(const struct rib_rtable_info *rinfo)
1266 {
1267 
1268 	/* Below bsearch4 for up to 10 prefixes; always supersedes dpdk_lpm4. */
1269 	return (251);
1270 }
1271 
1272 static struct fib_lookup_module fib_dxr_mod = {
1273 	.flm_name = "dxr",
1274 	.flm_family = AF_INET,
1275 	.flm_init_cb = dxr_init,
1276 	.flm_destroy_cb = dxr_destroy,
1277 	.flm_dump_rib_item_cb = dxr_dump_rib_item,
1278 	.flm_dump_end_cb = dxr_dump_end,
1279 	.flm_change_rib_item_cb = dxr_change_rib_item,
1280 	.flm_change_rib_items_cb = dxr_change_rib_batch,
1281 	.flm_get_pref = dxr_get_pref,
1282 };
1283 
1284 static int
1285 dxr_modevent(module_t mod, int type, void *unused)
1286 {
1287 	int error;
1288 
1289 	switch (type) {
1290 	case MOD_LOAD:
1291 		chunk_zone = uma_zcreate("dxr chunk", sizeof(struct chunk_desc),
1292 		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
1293 		trie_zone = uma_zcreate("dxr trie", sizeof(struct trie_desc),
1294 		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
1295 		fib_module_register(&fib_dxr_mod);
1296 		return(0);
1297 	case MOD_UNLOAD:
1298 		error = fib_module_unregister(&fib_dxr_mod);
1299 		if (error)
1300 			return (error);
1301 		uma_zdestroy(chunk_zone);
1302 		uma_zdestroy(trie_zone);
1303 		return(0);
1304 	default:
1305 		return(EOPNOTSUPP);
1306 	}
1307 }
1308 
1309 static moduledata_t dxr_mod = {"fib_dxr", dxr_modevent, 0};
1310 
1311 DECLARE_MODULE(fib_dxr, dxr_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
1312 MODULE_VERSION(fib_dxr, 1);
1313