xref: /freebsd/sys/netinet/in_fib_dxr.c (revision b4a58fbf640409a1e507d9f7b411c83a3f83a2f3)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2012-2021 Marko Zec
5  * Copyright (c) 2005, 2018 University of Zagreb
6  * Copyright (c) 2005 International Computer Science Institute
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 /*
31  * An implementation of DXR, a simple IPv4 LPM scheme with compact lookup
32  * structures and a trivial search procedure.  More significant bits of
33  * the search key are used to directly index a two-stage trie, while the
34  * remaining bits are used for finding the next hop in a sorted array.
35  * More details in:
36  *
37  * M. Zec, L. Rizzo, M. Mikuc, DXR: towards a billion routing lookups per
38  * second in software, ACM SIGCOMM Computer Communication Review, September
39  * 2012
40  *
41  * M. Zec, M. Mikuc, Pushing the envelope: beyond two billion IP routing
42  * lookups per second on commodity CPUs, IEEE SoftCOM, September 2017, Split
43  */
44 
45 #include <sys/cdefs.h>
46 __FBSDID("$FreeBSD$");
47 
48 #include "opt_inet.h"
49 
50 #include <sys/param.h>
51 #include <sys/kernel.h>
52 #include <sys/ctype.h>
53 #include <sys/epoch.h>
54 #include <sys/malloc.h>
55 #include <sys/module.h>
56 #include <sys/socket.h>
57 #include <sys/sysctl.h>
58 #include <sys/syslog.h>
59 
60 #include <vm/uma.h>
61 
62 #include <netinet/in.h>
63 #include <netinet/in_fib.h>
64 
65 #include <net/route.h>
66 #include <net/route/route_ctl.h>
67 #include <net/route/fib_algo.h>
68 
69 #define	DXR_TRIE_BITS		20
70 
71 CTASSERT(DXR_TRIE_BITS >= 16 && DXR_TRIE_BITS <= 24);
72 
73 /* DXR2: two-stage primary trie, instead of a single direct lookup table */
74 #define	DXR2
75 
76 #if DXR_TRIE_BITS > 16
77 #define	DXR_D			16
78 #else
79 #define	DXR_D			(DXR_TRIE_BITS - 1)
80 #endif
81 #define	DXR_X			(DXR_TRIE_BITS - DXR_D)
82 
83 #define	D_TBL_SIZE		(1 << DXR_D)
84 #define	DIRECT_TBL_SIZE		(1 << DXR_TRIE_BITS)
85 #define	DXR_RANGE_MASK		(0xffffffffU >> DXR_TRIE_BITS)
86 #define	DXR_RANGE_SHIFT		(32 - DXR_TRIE_BITS)
87 
88 #define	DESC_BASE_BITS		22
89 #define	DESC_FRAGMENTS_BITS	(32 - DESC_BASE_BITS)
90 #define	BASE_MAX		((1 << DESC_BASE_BITS) - 1)
91 #define	RTBL_SIZE_INCR		(BASE_MAX / 64)
92 
93 #if DXR_TRIE_BITS < 24
94 #define	FRAGS_MASK_SHORT	((1 << (23 - DXR_TRIE_BITS)) - 1)
95 #else
96 #define	FRAGS_MASK_SHORT	0
97 #endif
98 #define	FRAGS_PREF_SHORT	(((1 << DESC_FRAGMENTS_BITS) - 1) & \
99 				 ~FRAGS_MASK_SHORT)
100 #define	FRAGS_MARK_XL		(FRAGS_PREF_SHORT - 1)
101 #define	FRAGS_MARK_HIT		(FRAGS_PREF_SHORT - 2)
102 
103 #define	IS_SHORT_FORMAT(x)	((x & FRAGS_PREF_SHORT) == FRAGS_PREF_SHORT)
104 #define	IS_LONG_FORMAT(x)	((x & FRAGS_PREF_SHORT) != FRAGS_PREF_SHORT)
105 #define	IS_XL_FORMAT(x)		(x == FRAGS_MARK_XL)
106 
107 #define	RE_SHORT_MAX_NH		((1 << (DXR_TRIE_BITS - 8)) - 1)
108 
109 #define	CHUNK_HASH_BITS		16
110 #define	CHUNK_HASH_SIZE		(1 << CHUNK_HASH_BITS)
111 #define	CHUNK_HASH_MASK		(CHUNK_HASH_SIZE - 1)
112 
113 #define	TRIE_HASH_BITS		16
114 #define	TRIE_HASH_SIZE		(1 << TRIE_HASH_BITS)
115 #define	TRIE_HASH_MASK		(TRIE_HASH_SIZE - 1)
116 
117 #define	XTBL_SIZE_INCR		(DIRECT_TBL_SIZE / 16)
118 
119 #define	UNUSED_BUCKETS		8
120 
121 /* Lookup structure elements */
122 
123 struct direct_entry {
124 	uint32_t		fragments: DESC_FRAGMENTS_BITS,
125 				base: DESC_BASE_BITS;
126 };
127 
128 struct range_entry_long {
129 	uint32_t		start: DXR_RANGE_SHIFT,
130 				nexthop: DXR_TRIE_BITS;
131 };
132 
133 #if DXR_TRIE_BITS < 24
134 struct range_entry_short {
135 	uint16_t		start: DXR_RANGE_SHIFT - 8,
136 				nexthop: DXR_TRIE_BITS - 8;
137 };
138 #endif
139 
140 /* Auxiliary structures */
141 
142 struct heap_entry {
143 	uint32_t		start;
144 	uint32_t		end;
145 	uint32_t		preflen;
146 	uint32_t		nexthop;
147 };
148 
149 struct chunk_desc {
150 	LIST_ENTRY(chunk_desc)	cd_all_le;
151 	LIST_ENTRY(chunk_desc)	cd_hash_le;
152 	uint32_t		cd_hash;
153 	uint32_t		cd_refcnt;
154 	uint32_t		cd_base;
155 	uint32_t		cd_cur_size;
156 	uint32_t		cd_max_size;
157 };
158 
159 struct trie_desc {
160 	LIST_ENTRY(trie_desc)	td_all_le;
161 	LIST_ENTRY(trie_desc)	td_hash_le;
162 	uint32_t		td_hash;
163 	uint32_t		td_index;
164 	uint32_t		td_refcnt;
165 };
166 
167 struct dxr_aux {
168 	/* Glue to external state */
169 	struct fib_data		*fd;
170 	uint32_t		fibnum;
171 	int			refcnt;
172 
173 	/* Auxiliary build-time tables */
174 	struct direct_entry	direct_tbl[DIRECT_TBL_SIZE];
175 	uint16_t		d_tbl[D_TBL_SIZE];
176 	struct direct_entry	*x_tbl;
177 	union {
178 		struct range_entry_long	re;
179 		uint32_t	fragments;
180 	}			*range_tbl;
181 
182 	/* Auxiliary internal state */
183 	uint32_t		updates_mask[DIRECT_TBL_SIZE / 32];
184 	struct trie_desc	*trietbl[D_TBL_SIZE];
185 	LIST_HEAD(, chunk_desc)	chunk_hashtbl[CHUNK_HASH_SIZE];
186 	LIST_HEAD(, chunk_desc)	all_chunks;
187 	LIST_HEAD(, chunk_desc) unused_chunks[UNUSED_BUCKETS];
188 	LIST_HEAD(, trie_desc)	trie_hashtbl[TRIE_HASH_SIZE];
189 	LIST_HEAD(, trie_desc)	all_trie;
190 	LIST_HEAD(, trie_desc)	unused_trie; /* abuses hash link entry */
191 	struct sockaddr_in	dst;
192 	struct sockaddr_in	mask;
193 	struct heap_entry	heap[33];
194 	uint32_t		prefixes;
195 	uint32_t		updates_low;
196 	uint32_t		updates_high;
197 	uint32_t		unused_chunks_size;
198 	uint32_t		xtbl_size;
199 	uint32_t		all_trie_cnt;
200 	uint32_t		unused_trie_cnt;
201 	uint32_t		trie_rebuilt_prefixes;
202 	uint32_t		heap_index;
203 	uint32_t		d_bits;
204 	uint32_t		rtbl_size;
205 	uint32_t		rtbl_top;
206 	uint32_t		rtbl_work_frags;
207 	uint32_t		work_chunk;
208 };
209 
210 /* Main lookup structure container */
211 
212 struct dxr {
213 	/* Lookup tables */
214 	uint16_t		d_shift;
215 	uint16_t		x_shift;
216 	uint32_t		x_mask;
217 	void			*d;
218 	void			*x;
219 	void			*r;
220 	struct nhop_object	**nh_tbl;
221 
222 	/* Glue to external state */
223 	struct dxr_aux		*aux;
224 	struct fib_data		*fd;
225 	struct epoch_context	epoch_ctx;
226 	uint32_t		fibnum;
227 };
228 
229 static MALLOC_DEFINE(M_DXRLPM, "dxr", "DXR LPM");
230 static MALLOC_DEFINE(M_DXRAUX, "dxr aux", "DXR auxiliary");
231 
232 uma_zone_t chunk_zone;
233 uma_zone_t trie_zone;
234 
235 VNET_DEFINE_STATIC(int, frag_limit) = 100;
236 #define	V_frag_limit	VNET(frag_limit)
237 
238 SYSCTL_DECL(_net_route_algo);
239 SYSCTL_NODE(_net_route_algo, OID_AUTO, dxr, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
240     "DXR tunables");
241 
242 static int
243 sysctl_dxr_frag_limit(SYSCTL_HANDLER_ARGS)
244 {
245 	char buf[8];
246 	int error, new, i;
247 
248 	snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100,
249 	    V_frag_limit % 100);
250 	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
251 	if (error != 0 || req->newptr == NULL)
252 		return (error);
253 	if (!isdigit(*buf) && *buf != '.')
254 		return (EINVAL);
255 	for (i = 0, new = 0; isdigit(buf[i]) && i < sizeof(buf); i++)
256 		new = new * 10 + buf[i] - '0';
257 	new *= 100;
258 	if (buf[i++] == '.') {
259 		if (!isdigit(buf[i]))
260 			return (EINVAL);
261 		new += (buf[i++] - '0') * 10;
262 		if (isdigit(buf[i]))
263 			new += buf[i++] - '0';
264 	}
265 	if (new > 1000)
266 		return (EINVAL);
267 	V_frag_limit = new;
268 	snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100,
269 	    V_frag_limit % 100);
270 	return (0);
271 }
272 
273 SYSCTL_PROC(_net_route_algo_dxr, OID_AUTO, frag_limit,
274     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_VNET,
275     0, 0, sysctl_dxr_frag_limit, "A",
276     "Fragmentation threshold to full rebuild");
277 
278 /* Binary search for a matching address range */
279 #define	DXR_LOOKUP_STAGE					\
280 	if (masked_dst < range[middle].start) {			\
281 		upperbound = middle;				\
282 		middle = (middle + lowerbound) / 2;		\
283 	} else if (masked_dst < range[middle + 1].start)	\
284 		return (range[middle].nexthop);			\
285 	else {							\
286 		lowerbound = middle + 1;			\
287 		middle = (upperbound + middle + 1) / 2;		\
288 	}							\
289 	if (upperbound == lowerbound)				\
290 		return (range[lowerbound].nexthop);
291 
292 static int
293 dxr_lookup(struct dxr *dxr, uint32_t dst)
294 {
295 #ifdef DXR2
296 	uint16_t *dt = dxr->d;
297 	struct direct_entry *xt = dxr->x;
298 	int xi;
299 #else
300 	struct direct_entry *dt = dxr->d;
301 #endif
302 	struct direct_entry de;
303 	struct range_entry_long	*rt;
304 	uint32_t base;
305 	uint32_t upperbound;
306 	uint32_t middle;
307 	uint32_t lowerbound;
308 	uint32_t masked_dst;
309 
310 #ifdef DXR2
311 	xi = (dt[dst >> dxr->d_shift] << dxr->x_shift) +
312 	    ((dst >> DXR_RANGE_SHIFT) & dxr->x_mask);
313 	de = xt[xi];
314 #else
315 	de = dt[dst >> DXR_RANGE_SHIFT];
316 #endif
317 
318 	if (__predict_true(de.fragments == FRAGS_MARK_HIT))
319 		return (de.base);
320 
321 	rt = dxr->r;
322 	base = de.base;
323 	lowerbound = 0;
324 	masked_dst = dst & DXR_RANGE_MASK;
325 
326 #if DXR_TRIE_BITS < 24
327 	if (__predict_true(IS_SHORT_FORMAT(de.fragments))) {
328 		upperbound = de.fragments & FRAGS_MASK_SHORT;
329 		struct range_entry_short *range =
330 		    (struct range_entry_short *) &rt[base];
331 
332 		masked_dst >>= 8;
333 		middle = upperbound;
334 		upperbound = upperbound * 2 + 1;
335 
336 		for (;;) {
337 			DXR_LOOKUP_STAGE
338 			DXR_LOOKUP_STAGE
339 		}
340 	}
341 #endif
342 
343 	upperbound = de.fragments;
344 	middle = upperbound / 2;
345 	struct range_entry_long *range = &rt[base];
346 	if (__predict_false(IS_XL_FORMAT(de.fragments))) {
347 		upperbound = *((uint32_t *) range);
348 		range++;
349 		middle = upperbound / 2;
350 	}
351 
352 	for (;;) {
353 		DXR_LOOKUP_STAGE
354 		DXR_LOOKUP_STAGE
355 	}
356 }
357 
358 static void
359 initheap(struct dxr_aux *da, uint32_t dst_u32, uint32_t chunk)
360 {
361 	struct heap_entry *fhp = &da->heap[0];
362 	struct rtentry *rt;
363 	struct route_nhop_data rnd;
364 
365 	da->heap_index = 0;
366 	da->dst.sin_addr.s_addr = htonl(dst_u32);
367 	rt = fib4_lookup_rt(da->fibnum, da->dst.sin_addr, 0, NHR_UNLOCKED,
368 	    &rnd);
369 	if (rt != NULL) {
370 		struct in_addr addr;
371 		uint32_t scopeid;
372 
373 		rt_get_inet_prefix_plen(rt, &addr, &fhp->preflen, &scopeid);
374 		fhp->start = ntohl(addr.s_addr);
375 		fhp->end = fhp->start;
376 		if (fhp->preflen < 32)
377 			fhp->end |= (0xffffffffU >> fhp->preflen);
378 		fhp->nexthop = fib_get_nhop_idx(da->fd, rnd.rnd_nhop);
379 	} else {
380 		fhp->preflen = fhp->nexthop = fhp->start = 0;
381 		fhp->end = 0xffffffffU;
382 	}
383 }
384 
385 static uint32_t
386 chunk_size(struct dxr_aux *da, struct direct_entry *fdesc)
387 {
388 
389 	if (IS_SHORT_FORMAT(fdesc->fragments))
390 		return ((fdesc->fragments & FRAGS_MASK_SHORT) + 1);
391 	else if (IS_XL_FORMAT(fdesc->fragments))
392 		return (da->range_tbl[fdesc->base].fragments + 2);
393 	else /* if (IS_LONG_FORMAT(fdesc->fragments)) */
394 		return (fdesc->fragments + 1);
395 }
396 
397 static uint32_t
398 chunk_hash(struct dxr_aux *da, struct direct_entry *fdesc)
399 {
400 	uint32_t size = chunk_size(da, fdesc);
401 	uint32_t *p = (uint32_t *) &da->range_tbl[fdesc->base];
402 	uint32_t *l = (uint32_t *) &da->range_tbl[fdesc->base + size];
403 	uint32_t hash = fdesc->fragments;
404 
405 	for (; p < l; p++)
406 		hash = (hash << 7) + (hash >> 13) + *p;
407 
408 	return (hash + (hash >> 16));
409 }
410 
411 static int
412 chunk_ref(struct dxr_aux *da, uint32_t chunk)
413 {
414 	struct direct_entry *fdesc = &da->direct_tbl[chunk];
415 	struct chunk_desc *cdp, *empty_cdp;
416 	uint32_t base = fdesc->base;
417 	uint32_t size = chunk_size(da, fdesc);
418 	uint32_t hash = chunk_hash(da, fdesc);
419 	int i;
420 
421 	/* Find an existing descriptor */
422 	LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK],
423 	    cd_hash_le) {
424 		if (cdp->cd_hash != hash || cdp->cd_cur_size != size ||
425 		    memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base],
426 		    sizeof(struct range_entry_long) * size))
427 			continue;
428 		da->rtbl_top = fdesc->base;
429 		fdesc->base = cdp->cd_base;
430 		cdp->cd_refcnt++;
431 		return (0);
432 	}
433 
434 	/* No matching chunks found. Find an empty one to recycle. */
435 	for (cdp = NULL, i = size; cdp == NULL && i < UNUSED_BUCKETS; i++)
436 		cdp = LIST_FIRST(&da->unused_chunks[i]);
437 
438 	if (cdp == NULL)
439 		LIST_FOREACH(empty_cdp, &da->unused_chunks[0], cd_hash_le)
440 			if (empty_cdp->cd_max_size >= size && (cdp == NULL ||
441 			    empty_cdp->cd_max_size < cdp->cd_max_size)) {
442 				cdp = empty_cdp;
443 				if (empty_cdp->cd_max_size == size)
444 					break;
445 			}
446 
447 	if (cdp != NULL) {
448 		/* Copy from heap into the recycled chunk */
449 		bcopy(&da->range_tbl[fdesc->base], &da->range_tbl[cdp->cd_base],
450 		    size * sizeof(struct range_entry_long));
451 		fdesc->base = cdp->cd_base;
452 		da->rtbl_top -= size;
453 		da->unused_chunks_size -= cdp->cd_max_size;
454 		if (cdp->cd_max_size > size) {
455 			/* Split the range in two, need a new descriptor */
456 			empty_cdp = uma_zalloc(chunk_zone, M_NOWAIT);
457 			if (empty_cdp == NULL)
458 				return (1);
459 			LIST_INSERT_BEFORE(cdp, empty_cdp, cd_all_le);
460 			empty_cdp->cd_base = cdp->cd_base + size;
461 			empty_cdp->cd_cur_size = 0;
462 			empty_cdp->cd_max_size = cdp->cd_max_size - size;
463 
464 			i = empty_cdp->cd_max_size;
465 			if (i >= UNUSED_BUCKETS)
466 				i = 0;
467 			LIST_INSERT_HEAD(&da->unused_chunks[i], empty_cdp,
468 			    cd_hash_le);
469 
470 			da->unused_chunks_size += empty_cdp->cd_max_size;
471 			cdp->cd_max_size = size;
472 		}
473 		LIST_REMOVE(cdp, cd_hash_le);
474 	} else {
475 		/* Alloc a new descriptor at the top of the heap*/
476 		cdp = uma_zalloc(chunk_zone, M_NOWAIT);
477 		if (cdp == NULL)
478 			return (1);
479 		cdp->cd_max_size = size;
480 		cdp->cd_base = fdesc->base;
481 		LIST_INSERT_HEAD(&da->all_chunks, cdp, cd_all_le);
482 		KASSERT(cdp->cd_base + cdp->cd_max_size == da->rtbl_top,
483 		    ("dxr: %s %d", __FUNCTION__, __LINE__));
484 	}
485 
486 	cdp->cd_hash = hash;
487 	cdp->cd_refcnt = 1;
488 	cdp->cd_cur_size = size;
489 	LIST_INSERT_HEAD(&da->chunk_hashtbl[hash & CHUNK_HASH_MASK], cdp,
490 	    cd_hash_le);
491 	if (da->rtbl_top >= da->rtbl_size) {
492 		if (da->rtbl_top >= BASE_MAX) {
493 			FIB_PRINTF(LOG_ERR, da->fd,
494 			    "structural limit exceeded at %d "
495 			    "range table elements", da->rtbl_top);
496 			return (1);
497 		}
498 		da->rtbl_size += RTBL_SIZE_INCR;
499 		i = (BASE_MAX - da->rtbl_top) * LOG_DEBUG / BASE_MAX;
500 		FIB_PRINTF(i, da->fd, "range table at %d%% structural limit",
501 		    da->rtbl_top * 100 / BASE_MAX);
502 		da->range_tbl = realloc(da->range_tbl,
503 		    sizeof(*da->range_tbl) * da->rtbl_size + FRAGS_PREF_SHORT,
504 		    M_DXRAUX, M_NOWAIT);
505 		if (da->range_tbl == NULL)
506 			return (1);
507 	}
508 
509 	return (0);
510 }
511 
512 static void
513 chunk_unref(struct dxr_aux *da, uint32_t chunk)
514 {
515 	struct direct_entry *fdesc = &da->direct_tbl[chunk];
516 	struct chunk_desc *cdp, *cdp2;
517 	uint32_t base = fdesc->base;
518 	uint32_t size = chunk_size(da, fdesc);
519 	uint32_t hash = chunk_hash(da, fdesc);
520 	int i;
521 
522 	/* Find the corresponding descriptor */
523 	LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK],
524 	    cd_hash_le)
525 		if (cdp->cd_hash == hash && cdp->cd_cur_size == size &&
526 		    memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base],
527 		    sizeof(struct range_entry_long) * size) == 0)
528 			break;
529 
530 	KASSERT(cdp != NULL, ("dxr: dangling chunk"));
531 	if (--cdp->cd_refcnt > 0)
532 		return;
533 
534 	LIST_REMOVE(cdp, cd_hash_le);
535 	da->unused_chunks_size += cdp->cd_max_size;
536 	cdp->cd_cur_size = 0;
537 
538 	/* Attempt to merge with the preceding chunk, if empty */
539 	cdp2 = LIST_NEXT(cdp, cd_all_le);
540 	if (cdp2 != NULL && cdp2->cd_cur_size == 0) {
541 		KASSERT(cdp2->cd_base + cdp2->cd_max_size == cdp->cd_base,
542 		    ("dxr: %s %d", __FUNCTION__, __LINE__));
543 		LIST_REMOVE(cdp, cd_all_le);
544 		LIST_REMOVE(cdp2, cd_hash_le);
545 		cdp2->cd_max_size += cdp->cd_max_size;
546 		uma_zfree(chunk_zone, cdp);
547 		cdp = cdp2;
548 	}
549 
550 	/* Attempt to merge with the subsequent chunk, if empty */
551 	cdp2 = LIST_PREV(cdp, &da->all_chunks, chunk_desc, cd_all_le);
552 	if (cdp2 != NULL && cdp2->cd_cur_size == 0) {
553 		KASSERT(cdp->cd_base + cdp->cd_max_size == cdp2->cd_base,
554 		    ("dxr: %s %d", __FUNCTION__, __LINE__));
555 		LIST_REMOVE(cdp, cd_all_le);
556 		LIST_REMOVE(cdp2, cd_hash_le);
557 		cdp2->cd_max_size += cdp->cd_max_size;
558 		cdp2->cd_base = cdp->cd_base;
559 		uma_zfree(chunk_zone, cdp);
560 		cdp = cdp2;
561 	}
562 
563 	if (cdp->cd_base + cdp->cd_max_size == da->rtbl_top) {
564 		/* Free the chunk on the top of the range heap, trim the heap */
565 		KASSERT(cdp == LIST_FIRST(&da->all_chunks),
566 		    ("dxr: %s %d", __FUNCTION__, __LINE__));
567 		da->rtbl_top -= cdp->cd_max_size;
568 		da->unused_chunks_size -= cdp->cd_max_size;
569 		LIST_REMOVE(cdp, cd_all_le);
570 		uma_zfree(chunk_zone, cdp);
571 		return;
572 	}
573 
574 	i = cdp->cd_max_size;
575 	if (i >= UNUSED_BUCKETS)
576 		i = 0;
577 	LIST_INSERT_HEAD(&da->unused_chunks[i], cdp, cd_hash_le);
578 }
579 
580 #ifdef DXR2
581 static uint32_t
582 trie_hash(struct dxr_aux *da, uint32_t dxr_x, uint32_t index)
583 {
584 	uint32_t i, *val;
585 	uint32_t hash = 0;
586 
587 	for (i = 0; i < (1 << dxr_x); i++) {
588 		hash = (hash << 3) ^ (hash >> 3);
589 		val = (uint32_t *)
590 		    (void *) &da->direct_tbl[(index << dxr_x) + i];
591 		hash += (*val << 5);
592 		hash += (*val >> 5);
593 	}
594 
595 	return (hash + (hash >> 16));
596 }
597 
598 static int
599 trie_ref(struct dxr_aux *da, uint32_t index)
600 {
601 	struct trie_desc *tp;
602 	uint32_t dxr_d = da->d_bits;
603 	uint32_t dxr_x = DXR_TRIE_BITS - dxr_d;
604 	uint32_t hash = trie_hash(da, dxr_x, index);
605 
606 	/* Find an existing descriptor */
607 	LIST_FOREACH(tp, &da->trie_hashtbl[hash & TRIE_HASH_MASK], td_hash_le)
608 		if (tp->td_hash == hash &&
609 		    memcmp(&da->direct_tbl[index << dxr_x],
610 		    &da->x_tbl[tp->td_index << dxr_x],
611 		    sizeof(*da->x_tbl) << dxr_x) == 0) {
612 			tp->td_refcnt++;
613 			da->trietbl[index] = tp;
614 			return(tp->td_index);
615 		}
616 
617 	tp = LIST_FIRST(&da->unused_trie);
618 	if (tp != NULL) {
619 		LIST_REMOVE(tp, td_hash_le);
620 		da->unused_trie_cnt--;
621 	} else {
622 		tp = uma_zalloc(trie_zone, M_NOWAIT);
623 		if (tp == NULL)
624 			return (-1);
625 		LIST_INSERT_HEAD(&da->all_trie, tp, td_all_le);
626 		tp->td_index = da->all_trie_cnt++;
627 	}
628 
629 	tp->td_hash = hash;
630 	tp->td_refcnt = 1;
631 	LIST_INSERT_HEAD(&da->trie_hashtbl[hash & TRIE_HASH_MASK], tp,
632 	   td_hash_le);
633 	memcpy(&da->x_tbl[tp->td_index << dxr_x],
634 	    &da->direct_tbl[index << dxr_x], sizeof(*da->x_tbl) << dxr_x);
635 	da->trietbl[index] = tp;
636 	if (da->all_trie_cnt >= da->xtbl_size >> dxr_x) {
637 		da->xtbl_size += XTBL_SIZE_INCR;
638 		da->x_tbl = realloc(da->x_tbl,
639 		    sizeof(*da->x_tbl) * da->xtbl_size, M_DXRAUX, M_NOWAIT);
640 		if (da->x_tbl == NULL)
641 			return (-1);
642 	}
643 	return(tp->td_index);
644 }
645 
646 static void
647 trie_unref(struct dxr_aux *da, uint32_t index)
648 {
649 	struct trie_desc *tp = da->trietbl[index];
650 
651 	if (tp == NULL)
652 		return;
653 	da->trietbl[index] = NULL;
654 	if (--tp->td_refcnt > 0)
655 		return;
656 
657 	LIST_REMOVE(tp, td_hash_le);
658 	da->unused_trie_cnt++;
659 	if (tp->td_index != da->all_trie_cnt - 1) {
660 		LIST_INSERT_HEAD(&da->unused_trie, tp, td_hash_le);
661 		return;
662 	}
663 
664 	do {
665 		da->all_trie_cnt--;
666 		da->unused_trie_cnt--;
667 		LIST_REMOVE(tp, td_all_le);
668 		uma_zfree(trie_zone, tp);
669 		LIST_FOREACH(tp, &da->unused_trie, td_hash_le)
670 			if (tp->td_index == da->all_trie_cnt - 1) {
671 				LIST_REMOVE(tp, td_hash_le);
672 				break;
673 			}
674 	} while (tp != NULL);
675 }
676 #endif
677 
678 static void
679 heap_inject(struct dxr_aux *da, uint32_t start, uint32_t end, uint32_t preflen,
680     uint32_t nh)
681 {
682 	struct heap_entry *fhp;
683 	int i;
684 
685 	for (i = da->heap_index; i >= 0; i--) {
686 		if (preflen > da->heap[i].preflen)
687 			break;
688 		else if (preflen < da->heap[i].preflen)
689 			da->heap[i + 1] = da->heap[i];
690 		else
691 			return;
692 	}
693 
694 	fhp = &da->heap[i + 1];
695 	fhp->preflen = preflen;
696 	fhp->start = start;
697 	fhp->end = end;
698 	fhp->nexthop = nh;
699 	da->heap_index++;
700 }
701 
702 static int
703 dxr_walk(struct rtentry *rt, void *arg)
704 {
705 	struct dxr_aux *da = arg;
706 	uint32_t chunk = da->work_chunk;
707 	uint32_t first = chunk << DXR_RANGE_SHIFT;
708 	uint32_t last = first | DXR_RANGE_MASK;
709 	struct range_entry_long *fp =
710 	    &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re;
711 	struct heap_entry *fhp = &da->heap[da->heap_index];
712 	uint32_t preflen, nh, start, end, scopeid;
713 	struct in_addr addr;
714 
715 	rt_get_inet_prefix_plen(rt, &addr, &preflen, &scopeid);
716 	start = ntohl(addr.s_addr);
717 	if (start > last)
718 		return (-1);	/* Beyond chunk boundaries, we are done */
719 	if (start < first)
720 		return (0);	/* Skip this route */
721 
722 	end = start;
723 	if (preflen < 32)
724 		end |= (0xffffffffU >> preflen);
725 	nh = fib_get_nhop_idx(da->fd, rt_get_raw_nhop(rt));
726 
727 	if (start == fhp->start)
728 		heap_inject(da, start, end, preflen, nh);
729 	else {
730 		/* start > fhp->start */
731 		while (start > fhp->end) {
732 			uint32_t oend = fhp->end;
733 
734 			if (da->heap_index > 0) {
735 				fhp--;
736 				da->heap_index--;
737 			} else
738 				initheap(da, fhp->end + 1, chunk);
739 			if (fhp->end > oend && fhp->nexthop != fp->nexthop) {
740 				fp++;
741 				da->rtbl_work_frags++;
742 				fp->start = (oend + 1) & DXR_RANGE_MASK;
743 				fp->nexthop = fhp->nexthop;
744 			}
745 		}
746 		if (start > ((chunk << DXR_RANGE_SHIFT) | fp->start) &&
747 		    nh != fp->nexthop) {
748 			fp++;
749 			da->rtbl_work_frags++;
750 			fp->start = start & DXR_RANGE_MASK;
751 		} else if (da->rtbl_work_frags) {
752 			if ((--fp)->nexthop == nh)
753 				da->rtbl_work_frags--;
754 			else
755 				fp++;
756 		}
757 		fp->nexthop = nh;
758 		heap_inject(da, start, end, preflen, nh);
759 	}
760 
761 	return (0);
762 }
763 
764 static int
765 update_chunk(struct dxr_aux *da, uint32_t chunk)
766 {
767 	struct range_entry_long *fp;
768 #if DXR_TRIE_BITS < 24
769 	struct range_entry_short *fps;
770 	uint32_t start, nh, i;
771 #endif
772 	struct heap_entry *fhp;
773 	uint32_t first = chunk << DXR_RANGE_SHIFT;
774 	uint32_t last = first | DXR_RANGE_MASK;
775 
776 	if (da->direct_tbl[chunk].fragments != FRAGS_MARK_HIT)
777 		chunk_unref(da, chunk);
778 
779 	initheap(da, first, chunk);
780 
781 	fp = &da->range_tbl[da->rtbl_top].re;
782 	da->rtbl_work_frags = 0;
783 	fp->start = first & DXR_RANGE_MASK;
784 	fp->nexthop = da->heap[0].nexthop;
785 
786 	da->dst.sin_addr.s_addr = htonl(first);
787 	da->mask.sin_addr.s_addr = htonl(~DXR_RANGE_MASK);
788 
789 	da->work_chunk = chunk;
790 	rib_walk_from(da->fibnum, AF_INET, RIB_FLAG_LOCKED,
791 	    (struct sockaddr *) &da->dst, (struct sockaddr *) &da->mask,
792 	    dxr_walk, da);
793 
794 	/* Flush any remaining objects on the heap */
795 	fp = &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re;
796 	fhp = &da->heap[da->heap_index];
797 	while (fhp->preflen > DXR_TRIE_BITS) {
798 		uint32_t oend = fhp->end;
799 
800 		if (da->heap_index > 0) {
801 			fhp--;
802 			da->heap_index--;
803 		} else
804 			initheap(da, fhp->end + 1, chunk);
805 		if (fhp->end > oend && fhp->nexthop != fp->nexthop) {
806 			/* Have we crossed the upper chunk boundary? */
807 			if (oend >= last)
808 				break;
809 			fp++;
810 			da->rtbl_work_frags++;
811 			fp->start = (oend + 1) & DXR_RANGE_MASK;
812 			fp->nexthop = fhp->nexthop;
813 		}
814 	}
815 
816 	/* Direct hit if the chunk contains only a single fragment */
817 	if (da->rtbl_work_frags == 0) {
818 		da->direct_tbl[chunk].base = fp->nexthop;
819 		da->direct_tbl[chunk].fragments = FRAGS_MARK_HIT;
820 		return (0);
821 	}
822 
823 	da->direct_tbl[chunk].base = da->rtbl_top;
824 	da->direct_tbl[chunk].fragments = da->rtbl_work_frags;
825 
826 #if DXR_TRIE_BITS < 24
827 	/* Check whether the chunk can be more compactly encoded */
828 	fp = &da->range_tbl[da->rtbl_top].re;
829 	for (i = 0; i <= da->rtbl_work_frags; i++, fp++)
830 		if ((fp->start & 0xff) != 0 || fp->nexthop > RE_SHORT_MAX_NH)
831 			break;
832 	if (i == da->rtbl_work_frags + 1) {
833 		fp = &da->range_tbl[da->rtbl_top].re;
834 		fps = (void *) fp;
835 		for (i = 0; i <= da->rtbl_work_frags; i++, fp++, fps++) {
836 			start = fp->start;
837 			nh = fp->nexthop;
838 			fps->start = start >> 8;
839 			fps->nexthop = nh;
840 		}
841 		fps->start = start >> 8;
842 		fps->nexthop = nh;
843 		da->rtbl_work_frags >>= 1;
844 		da->direct_tbl[chunk].fragments =
845 		    da->rtbl_work_frags | FRAGS_PREF_SHORT;
846 	} else
847 #endif
848 	if (da->rtbl_work_frags >= FRAGS_MARK_HIT) {
849 		da->direct_tbl[chunk].fragments = FRAGS_MARK_XL;
850 		memmove(&da->range_tbl[da->rtbl_top + 1],
851 		   &da->range_tbl[da->rtbl_top],
852 		   (da->rtbl_work_frags + 1) * sizeof(*da->range_tbl));
853 		da->range_tbl[da->rtbl_top].fragments = da->rtbl_work_frags;
854 		da->rtbl_work_frags++;
855 	}
856 	da->rtbl_top += (da->rtbl_work_frags + 1);
857 	return (chunk_ref(da, chunk));
858 }
859 
860 static void
861 dxr_build(struct dxr *dxr)
862 {
863 	struct dxr_aux *da = dxr->aux;
864 	struct chunk_desc *cdp;
865 	struct rib_rtable_info rinfo;
866 	struct timeval t0, t1, t2, t3;
867 	uint32_t r_size, dxr_tot_size;
868 	uint32_t i, m, range_rebuild = 0;
869 	uint32_t range_frag;
870 #ifdef DXR2
871 	struct trie_desc *tp;
872 	uint32_t d_tbl_size, dxr_x, d_size, x_size;
873 	uint32_t ti, trie_rebuild = 0, prev_size = 0;
874 	uint32_t trie_frag;
875 #endif
876 
877 	KASSERT(dxr->d == NULL, ("dxr: d not free"));
878 
879 	if (da == NULL) {
880 		da = malloc(sizeof(*dxr->aux), M_DXRAUX, M_NOWAIT);
881 		if (da == NULL)
882 			return;
883 		dxr->aux = da;
884 		da->fibnum = dxr->fibnum;
885 		da->refcnt = 1;
886 		LIST_INIT(&da->all_chunks);
887 		LIST_INIT(&da->all_trie);
888 		da->rtbl_size = RTBL_SIZE_INCR;
889 		da->range_tbl = NULL;
890 		da->xtbl_size = XTBL_SIZE_INCR;
891 		da->x_tbl = NULL;
892 		bzero(&da->dst, sizeof(da->dst));
893 		bzero(&da->mask, sizeof(da->mask));
894 		da->dst.sin_len = sizeof(da->dst);
895 		da->mask.sin_len = sizeof(da->mask);
896 		da->dst.sin_family = AF_INET;
897 		da->mask.sin_family = AF_INET;
898 	}
899 	if (da->range_tbl == NULL) {
900 		da->range_tbl = malloc(sizeof(*da->range_tbl) * da->rtbl_size
901 		    + FRAGS_PREF_SHORT, M_DXRAUX, M_NOWAIT);
902 		if (da->range_tbl == NULL)
903 			return;
904 		range_rebuild = 1;
905 	}
906 #ifdef DXR2
907 	if (da->x_tbl == NULL) {
908 		da->x_tbl = malloc(sizeof(*da->x_tbl) * da->xtbl_size,
909 		    M_DXRAUX, M_NOWAIT);
910 		if (da->x_tbl == NULL)
911 			return;
912 		trie_rebuild = 1;
913 	}
914 #endif
915 	da->fd = dxr->fd;
916 
917 	microuptime(&t0);
918 
919 	dxr->nh_tbl = fib_get_nhop_array(da->fd);
920 	fib_get_rtable_info(fib_get_rh(da->fd), &rinfo);
921 
922 	if (da->updates_low > da->updates_high)
923 		range_rebuild = 1;
924 
925 range_build:
926 	if (range_rebuild) {
927 		/* Bulk cleanup */
928 		bzero(da->chunk_hashtbl, sizeof(da->chunk_hashtbl));
929 		while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) {
930 			LIST_REMOVE(cdp, cd_all_le);
931 			uma_zfree(chunk_zone, cdp);
932 		}
933 		for (i = 0; i < UNUSED_BUCKETS; i++)
934 			LIST_INIT(&da->unused_chunks[i]);
935 		da->unused_chunks_size = 0;
936 		da->rtbl_top = 0;
937 		da->updates_low = 0;
938 		da->updates_high = DIRECT_TBL_SIZE - 1;
939 		memset(da->updates_mask, 0xff, sizeof(da->updates_mask));
940 		for (i = 0; i < DIRECT_TBL_SIZE; i++) {
941 			da->direct_tbl[i].fragments = FRAGS_MARK_HIT;
942 			da->direct_tbl[i].base = 0;
943 		}
944 	}
945 	da->prefixes = rinfo.num_prefixes;
946 
947 	/* DXR: construct direct & range table */
948 	for (i = da->updates_low; i <= da->updates_high; i++) {
949 		m = da->updates_mask[i >> 5] >> (i & 0x1f);
950 		if (m == 0)
951 			i |= 0x1f;
952 		else if (m & 1 && update_chunk(da, i) != 0)
953 			return;
954 	}
955 
956 	range_frag = 0;
957 	if (da->rtbl_top)
958 		range_frag = da->unused_chunks_size * 10000ULL / da->rtbl_top;
959 	if (range_frag > V_frag_limit) {
960 		range_rebuild = 1;
961 		goto range_build;
962 	}
963 
964 	r_size = sizeof(*da->range_tbl) * da->rtbl_top;
965 	microuptime(&t1);
966 
967 #ifdef DXR2
968 	if (range_rebuild ||
969 	    abs(fls(da->prefixes) - fls(da->trie_rebuilt_prefixes)) > 1)
970 		trie_rebuild = 1;
971 
972 trie_build:
973 	if (trie_rebuild) {
974 		da->trie_rebuilt_prefixes = da->prefixes;
975 		da->d_bits = DXR_D;
976 		da->updates_low = 0;
977 		da->updates_high = DIRECT_TBL_SIZE - 1;
978 		if (!range_rebuild)
979 			memset(da->updates_mask, 0xff,
980 			    sizeof(da->updates_mask));
981 	}
982 
983 dxr2_try_squeeze:
984 	if (trie_rebuild) {
985 		/* Bulk cleanup */
986 		bzero(da->trietbl, sizeof(da->trietbl));
987 		bzero(da->trie_hashtbl, sizeof(da->trie_hashtbl));
988 		while ((tp = LIST_FIRST(&da->all_trie)) != NULL) {
989 			LIST_REMOVE(tp, td_all_le);
990 			uma_zfree(trie_zone, tp);
991 		}
992 		LIST_INIT(&da->unused_trie);
993 		da->all_trie_cnt = da->unused_trie_cnt = 0;
994 	}
995 
996 	/* Populate d_tbl, x_tbl */
997 	dxr_x = DXR_TRIE_BITS - da->d_bits;
998 	d_tbl_size = (1 << da->d_bits);
999 
1000 	for (i = da->updates_low >> dxr_x; i <= da->updates_high >> dxr_x;
1001 	    i++) {
1002 		if (!trie_rebuild) {
1003 			m = 0;
1004 			for (int j = 0; j < (1 << dxr_x); j += 32)
1005 				m |= da->updates_mask[((i << dxr_x) + j) >> 5];
1006 			if (m == 0)
1007 				continue;
1008 			trie_unref(da, i);
1009 		}
1010 		ti = trie_ref(da, i);
1011 		if (ti < 0)
1012 			return;
1013 		da->d_tbl[i] = ti;
1014 	}
1015 
1016 	trie_frag = 0;
1017 	if (da->all_trie_cnt)
1018 		trie_frag = da->unused_trie_cnt * 10000ULL / da->all_trie_cnt;
1019 	if (trie_frag > V_frag_limit) {
1020 		trie_rebuild = 1;
1021 		goto trie_build;
1022 	}
1023 
1024 	d_size = sizeof(*da->d_tbl) * d_tbl_size;
1025 	x_size = sizeof(*da->x_tbl) * DIRECT_TBL_SIZE / d_tbl_size
1026 	    * da->all_trie_cnt;
1027 	dxr_tot_size = d_size + x_size + r_size;
1028 
1029 	if (trie_rebuild == 1) {
1030 		/* Try to find a more compact D/X split */
1031 		if (prev_size == 0 || dxr_tot_size <= prev_size)
1032 			da->d_bits--;
1033 		else {
1034 			da->d_bits++;
1035 			trie_rebuild = 2;
1036 		}
1037 		prev_size = dxr_tot_size;
1038 		goto dxr2_try_squeeze;
1039 	}
1040 	microuptime(&t2);
1041 #else /* !DXR2 */
1042 	dxr_tot_size = sizeof(da->direct_tbl) + r_size;
1043 	t2 = t1;
1044 #endif
1045 
1046 	dxr->d = malloc(dxr_tot_size, M_DXRLPM, M_NOWAIT);
1047 	if (dxr->d == NULL)
1048 		return;
1049 #ifdef DXR2
1050 	memcpy(dxr->d, da->d_tbl, d_size);
1051 	dxr->x = ((char *) dxr->d) + d_size;
1052 	memcpy(dxr->x, da->x_tbl, x_size);
1053 	dxr->r = ((char *) dxr->x) + x_size;
1054 	dxr->d_shift = 32 - da->d_bits;
1055 	dxr->x_shift = dxr_x;
1056 	dxr->x_mask = 0xffffffffU >> (32 - dxr_x);
1057 #else /* !DXR2 */
1058 	memcpy(dxr->d, da->direct_tbl, sizeof(da->direct_tbl));
1059 	dxr->r = ((char *) dxr->d) + sizeof(da->direct_tbl);
1060 #endif
1061 	memcpy(dxr->r, da->range_tbl, r_size);
1062 
1063 	if (da->updates_low <= da->updates_high)
1064 		bzero(&da->updates_mask[da->updates_low / 32],
1065 		    (da->updates_high - da->updates_low) / 8 + 1);
1066 	da->updates_low = DIRECT_TBL_SIZE - 1;
1067 	da->updates_high = 0;
1068 	microuptime(&t3);
1069 
1070 #ifdef DXR2
1071 	FIB_PRINTF(LOG_INFO, da->fd, "D%dX%dR, %d prefixes, %d nhops (max)",
1072 	    da->d_bits, dxr_x, rinfo.num_prefixes, rinfo.num_nhops);
1073 #else
1074 	FIB_PRINTF(LOG_INFO, da->fd, "D%dR, %d prefixes, %d nhops (max)",
1075 	    DXR_D, rinfo.num_prefixes, rinfo.num_nhops);
1076 #endif
1077 	i = dxr_tot_size * 100;
1078 	if (rinfo.num_prefixes)
1079 		i /= rinfo.num_prefixes;
1080 	FIB_PRINTF(LOG_INFO, da->fd, "%d.%02d KBytes, %d.%02d Bytes/prefix",
1081 	    dxr_tot_size / 1024, dxr_tot_size * 100 / 1024 % 100,
1082 	    i / 100, i % 100);
1083 #ifdef DXR2
1084 	FIB_PRINTF(LOG_INFO, da->fd,
1085 	    "%d.%02d%% trie, %d.%02d%% range fragmentation",
1086 	    trie_frag / 100, trie_frag % 100,
1087 	    range_frag / 100, range_frag % 100);
1088 #else
1089 	FIB_PRINTF(LOG_INFO, da->fd, "%d.%01d%% range fragmentation",
1090 	    range_frag / 100, range_frag % 100);
1091 #endif
1092 	i = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
1093 	FIB_PRINTF(LOG_INFO, da->fd, "range table %s in %u.%03u ms",
1094 	    range_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000);
1095 #ifdef DXR2
1096 	i = (t2.tv_sec - t1.tv_sec) * 1000000 + t2.tv_usec - t1.tv_usec;
1097 	FIB_PRINTF(LOG_INFO, da->fd, "trie %s in %u.%03u ms",
1098 	    trie_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000);
1099 #endif
1100 	i = (t3.tv_sec - t2.tv_sec) * 1000000 + t3.tv_usec - t2.tv_usec;
1101 	FIB_PRINTF(LOG_INFO, da->fd, "snapshot forked in %u.%03u ms",
1102 	    i / 1000, i % 1000);
1103 }
1104 
1105 /*
1106  * Glue functions for attaching to FreeBSD 13 fib_algo infrastructure.
1107  */
1108 
1109 static struct nhop_object *
1110 dxr_fib_lookup(void *algo_data, const struct flm_lookup_key key,
1111     uint32_t scopeid)
1112 {
1113 	struct dxr *dxr = algo_data;
1114 	uint32_t nh;
1115 
1116 	nh = dxr_lookup(dxr, ntohl(key.addr4.s_addr));
1117 
1118 	return (dxr->nh_tbl[nh]);
1119 }
1120 
1121 static enum flm_op_result
1122 dxr_init(uint32_t fibnum, struct fib_data *fd, void *old_data, void **data)
1123 {
1124 	struct dxr *old_dxr = old_data;
1125 	struct dxr_aux *da = NULL;
1126 	struct dxr *dxr;
1127 
1128 	dxr = malloc(sizeof(*dxr), M_DXRAUX, M_NOWAIT);
1129 	if (dxr == NULL)
1130 		return (FLM_REBUILD);
1131 
1132 	/* Check whether we may reuse the old auxiliary structures */
1133 	if (old_dxr != NULL && old_dxr->aux != NULL) {
1134 		da = old_dxr->aux;
1135 		atomic_add_int(&da->refcnt, 1);
1136 	}
1137 
1138 	dxr->aux = da;
1139 	dxr->d = NULL;
1140 	dxr->fd = fd;
1141 	dxr->fibnum = fibnum;
1142 	*data = dxr;
1143 
1144 	return (FLM_SUCCESS);
1145 }
1146 
1147 static void
1148 dxr_destroy(void *data)
1149 {
1150 	struct dxr *dxr = data;
1151 	struct dxr_aux *da;
1152 	struct chunk_desc *cdp;
1153 	struct trie_desc *tp;
1154 
1155 	if (dxr->d != NULL)
1156 		free(dxr->d, M_DXRLPM);
1157 
1158 	da = dxr->aux;
1159 	free(dxr, M_DXRAUX);
1160 
1161 	if (da == NULL || atomic_fetchadd_int(&da->refcnt, -1) > 1)
1162 		return;
1163 
1164 	/* Release auxiliary structures */
1165 	while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) {
1166 		LIST_REMOVE(cdp, cd_all_le);
1167 		uma_zfree(chunk_zone, cdp);
1168 	}
1169 	while ((tp = LIST_FIRST(&da->all_trie)) != NULL) {
1170 		LIST_REMOVE(tp, td_all_le);
1171 		uma_zfree(trie_zone, tp);
1172 	}
1173 	free(da->range_tbl, M_DXRAUX);
1174 	free(da->x_tbl, M_DXRAUX);
1175 	free(da, M_DXRAUX);
1176 }
1177 
1178 static void
1179 epoch_dxr_destroy(epoch_context_t ctx)
1180 {
1181 	struct dxr *dxr = __containerof(ctx, struct dxr, epoch_ctx);
1182 
1183 	dxr_destroy(dxr);
1184 }
1185 
1186 static enum flm_op_result
1187 dxr_dump_end(void *data, struct fib_dp *dp)
1188 {
1189 	struct dxr *dxr = data;
1190 	struct dxr_aux *da;
1191 
1192 	dxr_build(dxr);
1193 
1194 	da = dxr->aux;
1195 	if (da == NULL)
1196 		return (FLM_REBUILD);
1197 
1198 	/* Structural limit exceeded, hard error */
1199 	if (da->rtbl_top >= BASE_MAX)
1200 		return (FLM_ERROR);
1201 
1202 	/* A malloc(,, M_NOWAIT) failed somewhere, retry later */
1203 	if (dxr->d == NULL)
1204 		return (FLM_REBUILD);
1205 
1206 	dp->f = dxr_fib_lookup;
1207 	dp->arg = dxr;
1208 
1209 	return (FLM_SUCCESS);
1210 }
1211 
1212 static enum flm_op_result
1213 dxr_dump_rib_item(struct rtentry *rt, void *data)
1214 {
1215 
1216 	return (FLM_SUCCESS);
1217 }
1218 
1219 static enum flm_op_result
1220 dxr_change_rib_item(struct rib_head *rnh, struct rib_cmd_info *rc,
1221     void *data)
1222 {
1223 
1224 	return (FLM_BATCH);
1225 }
1226 
1227 static enum flm_op_result
1228 dxr_change_rib_batch(struct rib_head *rnh, struct fib_change_queue *q,
1229     void *data)
1230 {
1231 	struct dxr *dxr = data;
1232 	struct dxr *new_dxr;
1233 	struct dxr_aux *da;
1234 	struct fib_dp new_dp;
1235 	enum flm_op_result res;
1236 	uint32_t ip, plen, hmask, start, end, i, ui;
1237 #ifdef INVARIANTS
1238 	struct rib_rtable_info rinfo;
1239 	int update_delta = 0;
1240 #endif
1241 
1242 	KASSERT(data != NULL, ("%s: NULL data", __FUNCTION__));
1243 	KASSERT(q != NULL, ("%s: NULL q", __FUNCTION__));
1244 	KASSERT(q->count < q->size, ("%s: q->count %d q->size %d",
1245 	    __FUNCTION__, q->count, q->size));
1246 
1247 	da = dxr->aux;
1248 	KASSERT(da != NULL, ("%s: NULL dxr->aux", __FUNCTION__));
1249 
1250 	FIB_PRINTF(LOG_INFO, da->fd, "processing %d update(s)", q->count);
1251 	for (ui = 0; ui < q->count; ui++) {
1252 #ifdef INVARIANTS
1253 		if (q->entries[ui].nh_new != NULL)
1254 			update_delta++;
1255 		if (q->entries[ui].nh_old != NULL)
1256 			update_delta--;
1257 #endif
1258 		plen = q->entries[ui].plen;
1259 		ip = ntohl(q->entries[ui].addr4.s_addr);
1260 		if (plen < 32)
1261 			hmask = 0xffffffffU >> plen;
1262 		else
1263 			hmask = 0;
1264 		start = (ip & ~hmask) >> DXR_RANGE_SHIFT;
1265 		end = (ip | hmask) >> DXR_RANGE_SHIFT;
1266 
1267 		if ((start & 0x1f) == 0 && (end & 0x1f) == 0x1f)
1268 			for (i = start >> 5; i <= end >> 5; i++)
1269 				da->updates_mask[i] = 0xffffffffU;
1270 		else
1271 			for (i = start; i <= end; i++)
1272 				da->updates_mask[i >> 5] |= (1 << (i & 0x1f));
1273 		if (start < da->updates_low)
1274 			da->updates_low = start;
1275 		if (end > da->updates_high)
1276 			da->updates_high = end;
1277 	}
1278 
1279 #ifdef INVARIANTS
1280 	fib_get_rtable_info(fib_get_rh(da->fd), &rinfo);
1281 	KASSERT(da->prefixes + update_delta == rinfo.num_prefixes,
1282 	    ("%s: update count mismatch", __FUNCTION__));
1283 #endif
1284 
1285 	res = dxr_init(0, dxr->fd, data, (void **) &new_dxr);
1286 	if (res != FLM_SUCCESS)
1287 		return (res);
1288 
1289 	dxr_build(new_dxr);
1290 
1291 	/* Structural limit exceeded, hard error */
1292 	if (da->rtbl_top >= BASE_MAX) {
1293 		dxr_destroy(new_dxr);
1294 		return (FLM_ERROR);
1295 	}
1296 
1297 	/* A malloc(,, M_NOWAIT) failed somewhere, retry later */
1298 	if (new_dxr->d == NULL) {
1299 		dxr_destroy(new_dxr);
1300 		return (FLM_REBUILD);
1301 	}
1302 
1303 	new_dp.f = dxr_fib_lookup;
1304 	new_dp.arg = new_dxr;
1305 	if (fib_set_datapath_ptr(dxr->fd, &new_dp)) {
1306 		fib_set_algo_ptr(dxr->fd, new_dxr);
1307 		fib_epoch_call(epoch_dxr_destroy, &dxr->epoch_ctx);
1308 		return (FLM_SUCCESS);
1309 	}
1310 
1311 	dxr_destroy(new_dxr);
1312 	return (FLM_REBUILD);
1313 }
1314 
1315 static uint8_t
1316 dxr_get_pref(const struct rib_rtable_info *rinfo)
1317 {
1318 
1319 	/* Below bsearch4 up to 10 prefixes. Always supersedes dpdk_lpm4. */
1320 	return (251);
1321 }
1322 
1323 static struct fib_lookup_module fib_dxr_mod = {
1324 	.flm_name = "dxr",
1325 	.flm_family = AF_INET,
1326 	.flm_init_cb = dxr_init,
1327 	.flm_destroy_cb = dxr_destroy,
1328 	.flm_dump_rib_item_cb = dxr_dump_rib_item,
1329 	.flm_dump_end_cb = dxr_dump_end,
1330 	.flm_change_rib_item_cb = dxr_change_rib_item,
1331 	.flm_change_rib_items_cb = dxr_change_rib_batch,
1332 	.flm_get_pref = dxr_get_pref,
1333 };
1334 
1335 static int
1336 dxr_modevent(module_t mod, int type, void *unused)
1337 {
1338 	int error;
1339 
1340 	switch (type) {
1341 	case MOD_LOAD:
1342 		chunk_zone = uma_zcreate("dxr chunk", sizeof(struct chunk_desc),
1343 		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
1344 		trie_zone = uma_zcreate("dxr trie", sizeof(struct trie_desc),
1345 		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
1346 		fib_module_register(&fib_dxr_mod);
1347 		return(0);
1348 	case MOD_UNLOAD:
1349 		error = fib_module_unregister(&fib_dxr_mod);
1350 		if (error)
1351 			return (error);
1352 		uma_zdestroy(chunk_zone);
1353 		uma_zdestroy(trie_zone);
1354 		return(0);
1355 	default:
1356 		return(EOPNOTSUPP);
1357 	}
1358 }
1359 
1360 static moduledata_t dxr_mod = {"fib_dxr", dxr_modevent, 0};
1361 
1362 DECLARE_MODULE(fib_dxr, dxr_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
1363 MODULE_VERSION(fib_dxr, 1);
1364