xref: /freebsd/sys/netinet/in_fib_dxr.c (revision 4ab122e8ef127d36d95f874e85600c36c87c8c22)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2012-2022 Marko Zec
5  * Copyright (c) 2005, 2018 University of Zagreb
6  * Copyright (c) 2005 International Computer Science Institute
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 /*
31  * An implementation of DXR, a simple IPv4 LPM scheme with compact lookup
32  * structures and a trivial search procedure.  More significant bits of
33  * the search key are used to directly index a two-stage trie, while the
34  * remaining bits are used for finding the next hop in a sorted array.
35  * More details in:
36  *
37  * M. Zec, L. Rizzo, M. Mikuc, DXR: towards a billion routing lookups per
38  * second in software, ACM SIGCOMM Computer Communication Review, September
39  * 2012
40  *
41  * M. Zec, M. Mikuc, Pushing the envelope: beyond two billion IP routing
42  * lookups per second on commodity CPUs, IEEE SoftCOM, September 2017, Split
43  */
44 
45 #include <sys/cdefs.h>
46 #include "opt_inet.h"
47 
48 #include <sys/param.h>
49 #include <sys/kernel.h>
50 #include <sys/ctype.h>
51 #include <sys/epoch.h>
52 #include <sys/malloc.h>
53 #include <sys/module.h>
54 #include <sys/socket.h>
55 #include <sys/sysctl.h>
56 #include <sys/syslog.h>
57 
58 #include <vm/uma.h>
59 
60 #include <netinet/in.h>
61 #include <netinet/in_fib.h>
62 
63 #include <net/route.h>
64 #include <net/route/route_ctl.h>
65 #include <net/route/fib_algo.h>
66 
67 #define	DXR_TRIE_BITS		20
68 
69 CTASSERT(DXR_TRIE_BITS >= 16 && DXR_TRIE_BITS <= 24);
70 
71 /* DXR2: two-stage primary trie, instead of a single direct lookup table */
72 #define	DXR2
73 
74 #if DXR_TRIE_BITS > 16
75 #define	DXR_D			16
76 #else
77 #define	DXR_D			(DXR_TRIE_BITS - 1)
78 #endif
79 
80 #define	D_TBL_SIZE		(1 << DXR_D)
81 #define	DIRECT_TBL_SIZE		(1 << DXR_TRIE_BITS)
82 #define	DXR_RANGE_MASK		(0xffffffffU >> DXR_TRIE_BITS)
83 #define	DXR_RANGE_SHIFT		(32 - DXR_TRIE_BITS)
84 
85 #define	DESC_BASE_BITS		22
86 #define	DESC_FRAGMENTS_BITS	(32 - DESC_BASE_BITS)
87 #define	BASE_MAX		((1 << DESC_BASE_BITS) - 1)
88 #define	RTBL_SIZE_INCR		(BASE_MAX / 64)
89 
90 #if DXR_TRIE_BITS < 24
91 #define	FRAGS_MASK_SHORT	((1 << (23 - DXR_TRIE_BITS)) - 1)
92 #else
93 #define	FRAGS_MASK_SHORT	0
94 #endif
95 #define	FRAGS_PREF_SHORT	(((1 << DESC_FRAGMENTS_BITS) - 1) & \
96 				 ~FRAGS_MASK_SHORT)
97 #define	FRAGS_MARK_XL		(FRAGS_PREF_SHORT - 1)
98 #define	FRAGS_MARK_HIT		(FRAGS_PREF_SHORT - 2)
99 
100 #define	IS_SHORT_FORMAT(x)	((x & FRAGS_PREF_SHORT) == FRAGS_PREF_SHORT)
101 #define	IS_LONG_FORMAT(x)	((x & FRAGS_PREF_SHORT) != FRAGS_PREF_SHORT)
102 #define	IS_XL_FORMAT(x)		(x == FRAGS_MARK_XL)
103 
104 #define	RE_SHORT_MAX_NH		((1 << (DXR_TRIE_BITS - 8)) - 1)
105 
106 #define	CHUNK_HASH_BITS		16
107 #define	CHUNK_HASH_SIZE		(1 << CHUNK_HASH_BITS)
108 #define	CHUNK_HASH_MASK		(CHUNK_HASH_SIZE - 1)
109 
110 #define	TRIE_HASH_BITS		16
111 #define	TRIE_HASH_SIZE		(1 << TRIE_HASH_BITS)
112 #define	TRIE_HASH_MASK		(TRIE_HASH_SIZE - 1)
113 
114 #define	XTBL_SIZE_INCR		(DIRECT_TBL_SIZE / 16)
115 
116 #define	UNUSED_BUCKETS		8
117 
118 /* Lookup structure elements */
119 
120 struct direct_entry {
121 	uint32_t		fragments: DESC_FRAGMENTS_BITS,
122 				base: DESC_BASE_BITS;
123 };
124 
125 struct range_entry_long {
126 	uint32_t		start: DXR_RANGE_SHIFT,
127 				nexthop: DXR_TRIE_BITS;
128 };
129 
130 #if DXR_TRIE_BITS < 24
131 struct range_entry_short {
132 	uint16_t		start: DXR_RANGE_SHIFT - 8,
133 				nexthop: DXR_TRIE_BITS - 8;
134 };
135 #endif
136 
137 /* Auxiliary structures */
138 
139 struct heap_entry {
140 	uint32_t		start;
141 	uint32_t		end;
142 	uint32_t		preflen;
143 	uint32_t		nexthop;
144 };
145 
146 struct chunk_desc {
147 	LIST_ENTRY(chunk_desc)	cd_all_le;
148 	LIST_ENTRY(chunk_desc)	cd_hash_le;
149 	uint32_t		cd_hash;
150 	uint32_t		cd_refcnt;
151 	uint32_t		cd_base;
152 	uint32_t		cd_cur_size;
153 	uint32_t		cd_max_size;
154 };
155 
156 struct trie_desc {
157 	LIST_ENTRY(trie_desc)	td_all_le;
158 	LIST_ENTRY(trie_desc)	td_hash_le;
159 	uint32_t		td_hash;
160 	uint32_t		td_index;
161 	uint32_t		td_refcnt;
162 };
163 
164 struct dxr_aux {
165 	/* Glue to external state */
166 	struct fib_data		*fd;
167 	uint32_t		fibnum;
168 	int			refcnt;
169 
170 	/* Auxiliary build-time tables */
171 	struct direct_entry	direct_tbl[DIRECT_TBL_SIZE];
172 	uint16_t		d_tbl[D_TBL_SIZE];
173 	struct direct_entry	*x_tbl;
174 	union {
175 		struct range_entry_long	re;
176 		uint32_t	fragments;
177 	}			*range_tbl;
178 
179 	/* Auxiliary internal state */
180 	uint32_t		updates_mask[DIRECT_TBL_SIZE / 32];
181 	struct trie_desc	*trietbl[D_TBL_SIZE];
182 	LIST_HEAD(, chunk_desc)	chunk_hashtbl[CHUNK_HASH_SIZE];
183 	LIST_HEAD(, chunk_desc)	all_chunks;
184 	LIST_HEAD(, chunk_desc) unused_chunks[UNUSED_BUCKETS];
185 	LIST_HEAD(, trie_desc)	trie_hashtbl[TRIE_HASH_SIZE];
186 	LIST_HEAD(, trie_desc)	all_trie;
187 	LIST_HEAD(, trie_desc)	unused_trie; /* abuses hash link entry */
188 	struct sockaddr_in	dst;
189 	struct sockaddr_in	mask;
190 	struct heap_entry	heap[33];
191 	uint32_t		prefixes;
192 	uint32_t		updates_low;
193 	uint32_t		updates_high;
194 	uint32_t		unused_chunks_size;
195 	uint32_t		xtbl_size;
196 	uint32_t		all_trie_cnt;
197 	uint32_t		unused_trie_cnt;
198 	uint32_t		trie_rebuilt_prefixes;
199 	uint32_t		heap_index;
200 	uint32_t		d_bits;
201 	uint32_t		rtbl_size;
202 	uint32_t		rtbl_top;
203 	uint32_t		rtbl_work_frags;
204 	uint32_t		work_chunk;
205 };
206 
207 /* Main lookup structure container */
208 
209 struct dxr {
210 	/* Lookup tables */
211 	void			*d;
212 	void			*x;
213 	void			*r;
214 	struct nhop_object	**nh_tbl;
215 
216 	/* Glue to external state */
217 	struct dxr_aux		*aux;
218 	struct fib_data		*fd;
219 	struct epoch_context	epoch_ctx;
220 	uint32_t		fibnum;
221 	uint16_t		d_shift;
222 	uint16_t		x_shift;
223 	uint32_t		x_mask;
224 };
225 
226 static MALLOC_DEFINE(M_DXRLPM, "dxr", "DXR LPM");
227 static MALLOC_DEFINE(M_DXRAUX, "dxr aux", "DXR auxiliary");
228 
229 uma_zone_t chunk_zone;
230 uma_zone_t trie_zone;
231 
232 VNET_DEFINE_STATIC(int, frag_limit) = 100;
233 #define	V_frag_limit	VNET(frag_limit)
234 
235 /* Binary search for a matching address range */
236 #define	DXR_LOOKUP_STAGE					\
237 	if (masked_dst < range[middle].start) {			\
238 		upperbound = middle;				\
239 		middle = (middle + lowerbound) / 2;		\
240 	} else if (masked_dst < range[middle + 1].start)	\
241 		return (range[middle].nexthop);			\
242 	else {							\
243 		lowerbound = middle + 1;			\
244 		middle = (upperbound + middle + 1) / 2;		\
245 	}							\
246 	if (upperbound == lowerbound)				\
247 		return (range[lowerbound].nexthop);
248 
249 static int
250 range_lookup(struct range_entry_long *rt, struct direct_entry de, uint32_t dst)
251 {
252 	uint32_t base;
253 	uint32_t upperbound;
254 	uint32_t middle;
255 	uint32_t lowerbound;
256 	uint32_t masked_dst;
257 
258 	base = de.base;
259 	lowerbound = 0;
260 	masked_dst = dst & DXR_RANGE_MASK;
261 
262 #if DXR_TRIE_BITS < 24
263 	if (__predict_true(IS_SHORT_FORMAT(de.fragments))) {
264 		upperbound = de.fragments & FRAGS_MASK_SHORT;
265 		struct range_entry_short *range =
266 		    (struct range_entry_short *) &rt[base];
267 
268 		masked_dst >>= 8;
269 		middle = upperbound;
270 		upperbound = upperbound * 2 + 1;
271 
272 		for (;;) {
273 			DXR_LOOKUP_STAGE
274 			DXR_LOOKUP_STAGE
275 		}
276 	}
277 #endif
278 
279 	upperbound = de.fragments;
280 	middle = upperbound / 2;
281 	struct range_entry_long *range = &rt[base];
282 	if (__predict_false(IS_XL_FORMAT(de.fragments))) {
283 		upperbound = *((uint32_t *) range);
284 		range++;
285 		middle = upperbound / 2;
286 	}
287 
288 	for (;;) {
289 		DXR_LOOKUP_STAGE
290 		DXR_LOOKUP_STAGE
291 	}
292 }
293 
294 #define DXR_LOOKUP_DEFINE(D)						\
295 	static int inline						\
296 	dxr_lookup_##D(struct dxr *dxr, uint32_t dst)			\
297 	{								\
298 		struct direct_entry de;					\
299 		uint16_t *dt = dxr->d;					\
300 		struct direct_entry *xt = dxr->x;			\
301 									\
302 		de = xt[(dt[dst >> (32 - (D))] << (DXR_TRIE_BITS - (D))) \
303 		    + ((dst >> DXR_RANGE_SHIFT) &			\
304 		    (0xffffffffU >> (32 - DXR_TRIE_BITS + (D))))];	\
305 		if (__predict_true(de.fragments == FRAGS_MARK_HIT))	\
306 			return (de.base);				\
307 		return (range_lookup(dxr->r, de, dst));			\
308 	}								\
309 									\
310 	static struct nhop_object *					\
311 	dxr_fib_lookup_##D(void *algo_data,				\
312 	    const struct flm_lookup_key key, uint32_t scopeid __unused)	\
313 	{								\
314 		struct dxr *dxr = algo_data;				\
315 									\
316 		return (dxr->nh_tbl[dxr_lookup_##D(dxr,			\
317 		    ntohl(key.addr4.s_addr))]);				\
318 	}
319 
320 #ifdef DXR2
321 #if DXR_TRIE_BITS > 16
322 DXR_LOOKUP_DEFINE(16)
323 #endif
324 DXR_LOOKUP_DEFINE(15)
325 DXR_LOOKUP_DEFINE(14)
326 DXR_LOOKUP_DEFINE(13)
327 DXR_LOOKUP_DEFINE(12)
328 DXR_LOOKUP_DEFINE(11)
329 DXR_LOOKUP_DEFINE(10)
330 DXR_LOOKUP_DEFINE(9)
331 #endif /* DXR2 */
332 
333 static int inline
334 dxr_lookup(struct dxr *dxr, uint32_t dst)
335 {
336 	struct direct_entry de;
337 #ifdef DXR2
338 	uint16_t *dt = dxr->d;
339 	struct direct_entry *xt = dxr->x;
340 
341 	de = xt[(dt[dst >> dxr->d_shift] << dxr->x_shift) +
342 	    ((dst >> DXR_RANGE_SHIFT) & dxr->x_mask)];
343 #else /* !DXR2 */
344 	struct direct_entry *dt = dxr->d;
345 
346 	de = dt[dst >> DXR_RANGE_SHIFT];
347 #endif /* !DXR2 */
348 	if (__predict_true(de.fragments == FRAGS_MARK_HIT))
349 		return (de.base);
350 	return (range_lookup(dxr->r, de, dst));
351 }
352 
353 static void
354 initheap(struct dxr_aux *da, uint32_t dst_u32, uint32_t chunk)
355 {
356 	struct heap_entry *fhp = &da->heap[0];
357 	struct rtentry *rt;
358 	struct route_nhop_data rnd;
359 
360 	da->heap_index = 0;
361 	da->dst.sin_addr.s_addr = htonl(dst_u32);
362 	rt = fib4_lookup_rt(da->fibnum, da->dst.sin_addr, 0, NHR_UNLOCKED,
363 	    &rnd);
364 	if (rt != NULL) {
365 		struct in_addr addr;
366 		uint32_t scopeid;
367 
368 		rt_get_inet_prefix_plen(rt, &addr, &fhp->preflen, &scopeid);
369 		fhp->start = ntohl(addr.s_addr);
370 		fhp->end = fhp->start;
371 		if (fhp->preflen < 32)
372 			fhp->end |= (0xffffffffU >> fhp->preflen);
373 		fhp->nexthop = fib_get_nhop_idx(da->fd, rnd.rnd_nhop);
374 	} else {
375 		fhp->preflen = fhp->nexthop = fhp->start = 0;
376 		fhp->end = 0xffffffffU;
377 	}
378 }
379 
380 static uint32_t
381 chunk_size(struct dxr_aux *da, struct direct_entry *fdesc)
382 {
383 
384 	if (IS_SHORT_FORMAT(fdesc->fragments))
385 		return ((fdesc->fragments & FRAGS_MASK_SHORT) + 1);
386 	else if (IS_XL_FORMAT(fdesc->fragments))
387 		return (da->range_tbl[fdesc->base].fragments + 2);
388 	else /* if (IS_LONG_FORMAT(fdesc->fragments)) */
389 		return (fdesc->fragments + 1);
390 }
391 
392 static uint32_t
393 chunk_hash(struct dxr_aux *da, struct direct_entry *fdesc)
394 {
395 	uint32_t size = chunk_size(da, fdesc);
396 	uint32_t *p = (uint32_t *) &da->range_tbl[fdesc->base];
397 	uint32_t *l = (uint32_t *) &da->range_tbl[fdesc->base + size];
398 	uint32_t hash = fdesc->fragments;
399 
400 	for (; p < l; p++)
401 		hash = (hash << 7) + (hash >> 13) + *p;
402 
403 	return (hash + (hash >> 16));
404 }
405 
406 static int
407 chunk_ref(struct dxr_aux *da, uint32_t chunk)
408 {
409 	struct direct_entry *fdesc = &da->direct_tbl[chunk];
410 	struct chunk_desc *cdp, *empty_cdp;
411 	uint32_t base = fdesc->base;
412 	uint32_t size = chunk_size(da, fdesc);
413 	uint32_t hash = chunk_hash(da, fdesc);
414 	int i;
415 
416 	/* Find an existing descriptor */
417 	LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK],
418 	    cd_hash_le) {
419 		if (cdp->cd_hash != hash || cdp->cd_cur_size != size ||
420 		    memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base],
421 		    sizeof(struct range_entry_long) * size))
422 			continue;
423 		da->rtbl_top = fdesc->base;
424 		fdesc->base = cdp->cd_base;
425 		cdp->cd_refcnt++;
426 		return (0);
427 	}
428 
429 	/* No matching chunks found. Find an empty one to recycle. */
430 	for (cdp = NULL, i = size; cdp == NULL && i < UNUSED_BUCKETS; i++)
431 		cdp = LIST_FIRST(&da->unused_chunks[i]);
432 
433 	if (cdp == NULL)
434 		LIST_FOREACH(empty_cdp, &da->unused_chunks[0], cd_hash_le)
435 			if (empty_cdp->cd_max_size >= size && (cdp == NULL ||
436 			    empty_cdp->cd_max_size < cdp->cd_max_size)) {
437 				cdp = empty_cdp;
438 				if (empty_cdp->cd_max_size == size)
439 					break;
440 			}
441 
442 	if (cdp != NULL) {
443 		/* Copy from heap into the recycled chunk */
444 		bcopy(&da->range_tbl[fdesc->base], &da->range_tbl[cdp->cd_base],
445 		    size * sizeof(struct range_entry_long));
446 		fdesc->base = cdp->cd_base;
447 		da->rtbl_top -= size;
448 		da->unused_chunks_size -= cdp->cd_max_size;
449 		if (cdp->cd_max_size > size) {
450 			/* Split the range in two, need a new descriptor */
451 			empty_cdp = uma_zalloc(chunk_zone, M_NOWAIT);
452 			if (empty_cdp == NULL)
453 				return (1);
454 			LIST_INSERT_BEFORE(cdp, empty_cdp, cd_all_le);
455 			empty_cdp->cd_base = cdp->cd_base + size;
456 			empty_cdp->cd_cur_size = 0;
457 			empty_cdp->cd_max_size = cdp->cd_max_size - size;
458 
459 			i = empty_cdp->cd_max_size;
460 			if (i >= UNUSED_BUCKETS)
461 				i = 0;
462 			LIST_INSERT_HEAD(&da->unused_chunks[i], empty_cdp,
463 			    cd_hash_le);
464 
465 			da->unused_chunks_size += empty_cdp->cd_max_size;
466 			cdp->cd_max_size = size;
467 		}
468 		LIST_REMOVE(cdp, cd_hash_le);
469 	} else {
470 		/* Alloc a new descriptor at the top of the heap*/
471 		cdp = uma_zalloc(chunk_zone, M_NOWAIT);
472 		if (cdp == NULL)
473 			return (1);
474 		cdp->cd_max_size = size;
475 		cdp->cd_base = fdesc->base;
476 		LIST_INSERT_HEAD(&da->all_chunks, cdp, cd_all_le);
477 		MPASS(cdp->cd_base + cdp->cd_max_size == da->rtbl_top);
478 	}
479 
480 	cdp->cd_hash = hash;
481 	cdp->cd_refcnt = 1;
482 	cdp->cd_cur_size = size;
483 	LIST_INSERT_HEAD(&da->chunk_hashtbl[hash & CHUNK_HASH_MASK], cdp,
484 	    cd_hash_le);
485 	if (da->rtbl_top >= da->rtbl_size) {
486 		if (da->rtbl_top >= BASE_MAX) {
487 			FIB_PRINTF(LOG_ERR, da->fd,
488 			    "structural limit exceeded at %d "
489 			    "range table elements", da->rtbl_top);
490 			return (1);
491 		}
492 		da->rtbl_size += RTBL_SIZE_INCR;
493 		i = (BASE_MAX - da->rtbl_top) * LOG_DEBUG / BASE_MAX;
494 		FIB_PRINTF(i, da->fd, "range table at %d%% structural limit",
495 		    da->rtbl_top * 100 / BASE_MAX);
496 		da->range_tbl = realloc(da->range_tbl,
497 		    sizeof(*da->range_tbl) * da->rtbl_size + FRAGS_PREF_SHORT,
498 		    M_DXRAUX, M_NOWAIT);
499 		if (da->range_tbl == NULL) {
500 			FIB_PRINTF(LOG_NOTICE, da->fd,
501 			    "Unable to allocate DXR range table");
502 			return (1);
503 		}
504 	}
505 
506 	return (0);
507 }
508 
509 static void
510 chunk_unref(struct dxr_aux *da, uint32_t chunk)
511 {
512 	struct direct_entry *fdesc = &da->direct_tbl[chunk];
513 	struct chunk_desc *cdp, *cdp2;
514 	uint32_t base = fdesc->base;
515 	uint32_t size = chunk_size(da, fdesc);
516 	uint32_t hash = chunk_hash(da, fdesc);
517 	int i;
518 
519 	/* Find the corresponding descriptor */
520 	LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK],
521 	    cd_hash_le)
522 		if (cdp->cd_hash == hash && cdp->cd_cur_size == size &&
523 		    memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base],
524 		    sizeof(struct range_entry_long) * size) == 0)
525 			break;
526 
527 	MPASS(cdp != NULL);
528 	if (--cdp->cd_refcnt > 0)
529 		return;
530 
531 	LIST_REMOVE(cdp, cd_hash_le);
532 	da->unused_chunks_size += cdp->cd_max_size;
533 	cdp->cd_cur_size = 0;
534 
535 	/* Attempt to merge with the preceding chunk, if empty */
536 	cdp2 = LIST_NEXT(cdp, cd_all_le);
537 	if (cdp2 != NULL && cdp2->cd_cur_size == 0) {
538 		MPASS(cdp2->cd_base + cdp2->cd_max_size == cdp->cd_base);
539 		LIST_REMOVE(cdp, cd_all_le);
540 		LIST_REMOVE(cdp2, cd_hash_le);
541 		cdp2->cd_max_size += cdp->cd_max_size;
542 		uma_zfree(chunk_zone, cdp);
543 		cdp = cdp2;
544 	}
545 
546 	/* Attempt to merge with the subsequent chunk, if empty */
547 	cdp2 = LIST_PREV(cdp, &da->all_chunks, chunk_desc, cd_all_le);
548 	if (cdp2 != NULL && cdp2->cd_cur_size == 0) {
549 		MPASS(cdp->cd_base + cdp->cd_max_size == cdp2->cd_base);
550 		LIST_REMOVE(cdp, cd_all_le);
551 		LIST_REMOVE(cdp2, cd_hash_le);
552 		cdp2->cd_max_size += cdp->cd_max_size;
553 		cdp2->cd_base = cdp->cd_base;
554 		uma_zfree(chunk_zone, cdp);
555 		cdp = cdp2;
556 	}
557 
558 	if (cdp->cd_base + cdp->cd_max_size == da->rtbl_top) {
559 		/* Free the chunk on the top of the range heap, trim the heap */
560 		MPASS(cdp == LIST_FIRST(&da->all_chunks));
561 		da->rtbl_top -= cdp->cd_max_size;
562 		da->unused_chunks_size -= cdp->cd_max_size;
563 		LIST_REMOVE(cdp, cd_all_le);
564 		uma_zfree(chunk_zone, cdp);
565 		return;
566 	}
567 
568 	i = cdp->cd_max_size;
569 	if (i >= UNUSED_BUCKETS)
570 		i = 0;
571 	LIST_INSERT_HEAD(&da->unused_chunks[i], cdp, cd_hash_le);
572 }
573 
574 #ifdef DXR2
575 static uint32_t
576 trie_hash(struct dxr_aux *da, uint32_t dxr_x, uint32_t index)
577 {
578 	uint32_t i, *val;
579 	uint32_t hash = 0;
580 
581 	for (i = 0; i < (1 << dxr_x); i++) {
582 		hash = (hash << 3) ^ (hash >> 3);
583 		val = (uint32_t *)
584 		    (void *) &da->direct_tbl[(index << dxr_x) + i];
585 		hash += (*val << 5);
586 		hash += (*val >> 5);
587 	}
588 
589 	return (hash + (hash >> 16));
590 }
591 
592 static int
593 trie_ref(struct dxr_aux *da, uint32_t index)
594 {
595 	struct trie_desc *tp;
596 	uint32_t dxr_d = da->d_bits;
597 	uint32_t dxr_x = DXR_TRIE_BITS - dxr_d;
598 	uint32_t hash = trie_hash(da, dxr_x, index);
599 
600 	/* Find an existing descriptor */
601 	LIST_FOREACH(tp, &da->trie_hashtbl[hash & TRIE_HASH_MASK], td_hash_le)
602 		if (tp->td_hash == hash &&
603 		    memcmp(&da->direct_tbl[index << dxr_x],
604 		    &da->x_tbl[tp->td_index << dxr_x],
605 		    sizeof(*da->x_tbl) << dxr_x) == 0) {
606 			tp->td_refcnt++;
607 			da->trietbl[index] = tp;
608 			return(tp->td_index);
609 		}
610 
611 	tp = LIST_FIRST(&da->unused_trie);
612 	if (tp != NULL) {
613 		LIST_REMOVE(tp, td_hash_le);
614 		da->unused_trie_cnt--;
615 	} else {
616 		tp = uma_zalloc(trie_zone, M_NOWAIT);
617 		if (tp == NULL)
618 			return (-1);
619 		LIST_INSERT_HEAD(&da->all_trie, tp, td_all_le);
620 		tp->td_index = da->all_trie_cnt++;
621 	}
622 
623 	tp->td_hash = hash;
624 	tp->td_refcnt = 1;
625 	LIST_INSERT_HEAD(&da->trie_hashtbl[hash & TRIE_HASH_MASK], tp,
626 	   td_hash_le);
627 	memcpy(&da->x_tbl[tp->td_index << dxr_x],
628 	    &da->direct_tbl[index << dxr_x], sizeof(*da->x_tbl) << dxr_x);
629 	da->trietbl[index] = tp;
630 	if (da->all_trie_cnt >= da->xtbl_size >> dxr_x) {
631 		da->xtbl_size += XTBL_SIZE_INCR;
632 		da->x_tbl = realloc(da->x_tbl,
633 		    sizeof(*da->x_tbl) * da->xtbl_size, M_DXRAUX, M_NOWAIT);
634 		if (da->x_tbl == NULL) {
635 			FIB_PRINTF(LOG_NOTICE, da->fd,
636 			    "Unable to allocate DXR extension table");
637 			return (-1);
638 		}
639 	}
640 	return(tp->td_index);
641 }
642 
643 static void
644 trie_unref(struct dxr_aux *da, uint32_t index)
645 {
646 	struct trie_desc *tp = da->trietbl[index];
647 
648 	if (tp == NULL)
649 		return;
650 	da->trietbl[index] = NULL;
651 	if (--tp->td_refcnt > 0)
652 		return;
653 
654 	LIST_REMOVE(tp, td_hash_le);
655 	da->unused_trie_cnt++;
656 	if (tp->td_index != da->all_trie_cnt - 1) {
657 		LIST_INSERT_HEAD(&da->unused_trie, tp, td_hash_le);
658 		return;
659 	}
660 
661 	do {
662 		da->all_trie_cnt--;
663 		da->unused_trie_cnt--;
664 		LIST_REMOVE(tp, td_all_le);
665 		uma_zfree(trie_zone, tp);
666 		LIST_FOREACH(tp, &da->unused_trie, td_hash_le)
667 			if (tp->td_index == da->all_trie_cnt - 1) {
668 				LIST_REMOVE(tp, td_hash_le);
669 				break;
670 			}
671 	} while (tp != NULL);
672 }
673 #endif
674 
675 static void
676 heap_inject(struct dxr_aux *da, uint32_t start, uint32_t end, uint32_t preflen,
677     uint32_t nh)
678 {
679 	struct heap_entry *fhp;
680 	int i;
681 
682 	for (i = da->heap_index; i >= 0; i--) {
683 		if (preflen > da->heap[i].preflen)
684 			break;
685 		else if (preflen < da->heap[i].preflen)
686 			da->heap[i + 1] = da->heap[i];
687 		else
688 			return;
689 	}
690 
691 	fhp = &da->heap[i + 1];
692 	fhp->preflen = preflen;
693 	fhp->start = start;
694 	fhp->end = end;
695 	fhp->nexthop = nh;
696 	da->heap_index++;
697 }
698 
699 static int
700 dxr_walk(struct rtentry *rt, void *arg)
701 {
702 	struct dxr_aux *da = arg;
703 	uint32_t chunk = da->work_chunk;
704 	uint32_t first = chunk << DXR_RANGE_SHIFT;
705 	uint32_t last = first | DXR_RANGE_MASK;
706 	struct range_entry_long *fp =
707 	    &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re;
708 	struct heap_entry *fhp = &da->heap[da->heap_index];
709 	uint32_t preflen, nh, start, end, scopeid;
710 	struct in_addr addr;
711 
712 	rt_get_inet_prefix_plen(rt, &addr, &preflen, &scopeid);
713 	start = ntohl(addr.s_addr);
714 	if (start > last)
715 		return (-1);	/* Beyond chunk boundaries, we are done */
716 	if (start < first)
717 		return (0);	/* Skip this route */
718 
719 	end = start;
720 	if (preflen < 32)
721 		end |= (0xffffffffU >> preflen);
722 	nh = fib_get_nhop_idx(da->fd, rt_get_raw_nhop(rt));
723 
724 	if (start == fhp->start)
725 		heap_inject(da, start, end, preflen, nh);
726 	else {
727 		/* start > fhp->start */
728 		while (start > fhp->end) {
729 			uint32_t oend = fhp->end;
730 
731 			if (da->heap_index > 0) {
732 				fhp--;
733 				da->heap_index--;
734 			} else
735 				initheap(da, fhp->end + 1, chunk);
736 			if (fhp->end > oend && fhp->nexthop != fp->nexthop) {
737 				fp++;
738 				da->rtbl_work_frags++;
739 				fp->start = (oend + 1) & DXR_RANGE_MASK;
740 				fp->nexthop = fhp->nexthop;
741 			}
742 		}
743 		if (start > ((chunk << DXR_RANGE_SHIFT) | fp->start) &&
744 		    nh != fp->nexthop) {
745 			fp++;
746 			da->rtbl_work_frags++;
747 			fp->start = start & DXR_RANGE_MASK;
748 		} else if (da->rtbl_work_frags) {
749 			if ((--fp)->nexthop == nh)
750 				da->rtbl_work_frags--;
751 			else
752 				fp++;
753 		}
754 		fp->nexthop = nh;
755 		heap_inject(da, start, end, preflen, nh);
756 	}
757 
758 	return (0);
759 }
760 
761 static int
762 update_chunk(struct dxr_aux *da, uint32_t chunk)
763 {
764 	struct range_entry_long *fp;
765 #if DXR_TRIE_BITS < 24
766 	struct range_entry_short *fps;
767 	uint32_t start, nh, i;
768 #endif
769 	struct heap_entry *fhp;
770 	uint32_t first = chunk << DXR_RANGE_SHIFT;
771 	uint32_t last = first | DXR_RANGE_MASK;
772 
773 	if (da->direct_tbl[chunk].fragments != FRAGS_MARK_HIT)
774 		chunk_unref(da, chunk);
775 
776 	initheap(da, first, chunk);
777 
778 	fp = &da->range_tbl[da->rtbl_top].re;
779 	da->rtbl_work_frags = 0;
780 	fp->start = first & DXR_RANGE_MASK;
781 	fp->nexthop = da->heap[0].nexthop;
782 
783 	da->dst.sin_addr.s_addr = htonl(first);
784 	da->mask.sin_addr.s_addr = htonl(~DXR_RANGE_MASK);
785 
786 	da->work_chunk = chunk;
787 	rib_walk_from(da->fibnum, AF_INET, RIB_FLAG_LOCKED,
788 	    (struct sockaddr *) &da->dst, (struct sockaddr *) &da->mask,
789 	    dxr_walk, da);
790 
791 	/* Flush any remaining objects on the heap */
792 	fp = &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re;
793 	fhp = &da->heap[da->heap_index];
794 	while (fhp->preflen > DXR_TRIE_BITS) {
795 		uint32_t oend = fhp->end;
796 
797 		if (da->heap_index > 0) {
798 			fhp--;
799 			da->heap_index--;
800 		} else
801 			initheap(da, fhp->end + 1, chunk);
802 		if (fhp->end > oend && fhp->nexthop != fp->nexthop) {
803 			/* Have we crossed the upper chunk boundary? */
804 			if (oend >= last)
805 				break;
806 			fp++;
807 			da->rtbl_work_frags++;
808 			fp->start = (oend + 1) & DXR_RANGE_MASK;
809 			fp->nexthop = fhp->nexthop;
810 		}
811 	}
812 
813 	/* Direct hit if the chunk contains only a single fragment */
814 	if (da->rtbl_work_frags == 0) {
815 		da->direct_tbl[chunk].base = fp->nexthop;
816 		da->direct_tbl[chunk].fragments = FRAGS_MARK_HIT;
817 		return (0);
818 	}
819 
820 	da->direct_tbl[chunk].base = da->rtbl_top;
821 	da->direct_tbl[chunk].fragments = da->rtbl_work_frags;
822 
823 #if DXR_TRIE_BITS < 24
824 	/* Check whether the chunk can be more compactly encoded */
825 	fp = &da->range_tbl[da->rtbl_top].re;
826 	for (i = 0; i <= da->rtbl_work_frags; i++, fp++)
827 		if ((fp->start & 0xff) != 0 || fp->nexthop > RE_SHORT_MAX_NH)
828 			break;
829 	if (i == da->rtbl_work_frags + 1) {
830 		fp = &da->range_tbl[da->rtbl_top].re;
831 		fps = (void *) fp;
832 		for (i = 0; i <= da->rtbl_work_frags; i++, fp++, fps++) {
833 			start = fp->start;
834 			nh = fp->nexthop;
835 			fps->start = start >> 8;
836 			fps->nexthop = nh;
837 		}
838 		fps->start = start >> 8;
839 		fps->nexthop = nh;
840 		da->rtbl_work_frags >>= 1;
841 		da->direct_tbl[chunk].fragments =
842 		    da->rtbl_work_frags | FRAGS_PREF_SHORT;
843 	} else
844 #endif
845 	if (da->rtbl_work_frags >= FRAGS_MARK_HIT) {
846 		da->direct_tbl[chunk].fragments = FRAGS_MARK_XL;
847 		memmove(&da->range_tbl[da->rtbl_top + 1],
848 		   &da->range_tbl[da->rtbl_top],
849 		   (da->rtbl_work_frags + 1) * sizeof(*da->range_tbl));
850 		da->range_tbl[da->rtbl_top].fragments = da->rtbl_work_frags;
851 		da->rtbl_work_frags++;
852 	}
853 	da->rtbl_top += (da->rtbl_work_frags + 1);
854 	return (chunk_ref(da, chunk));
855 }
856 
857 static void
858 dxr_build(struct dxr *dxr)
859 {
860 	struct dxr_aux *da = dxr->aux;
861 	struct chunk_desc *cdp;
862 	struct rib_rtable_info rinfo;
863 	struct timeval t0, t1, t2, t3;
864 	uint32_t r_size, dxr_tot_size;
865 	uint32_t i, m, range_rebuild = 0;
866 	uint32_t range_frag;
867 #ifdef DXR2
868 	struct trie_desc *tp;
869 	uint32_t d_tbl_size, dxr_x, d_size, x_size;
870 	uint32_t ti, trie_rebuild = 0, prev_size = 0;
871 	uint32_t trie_frag;
872 #endif
873 
874 	MPASS(dxr->d == NULL);
875 
876 	if (da == NULL) {
877 		da = malloc(sizeof(*dxr->aux), M_DXRAUX, M_NOWAIT);
878 		if (da == NULL) {
879 			FIB_PRINTF(LOG_NOTICE, dxr->fd,
880 			    "Unable to allocate DXR aux struct");
881 			return;
882 		}
883 		dxr->aux = da;
884 		da->fibnum = dxr->fibnum;
885 		da->fd = dxr->fd;
886 		da->refcnt = 1;
887 		LIST_INIT(&da->all_chunks);
888 		LIST_INIT(&da->all_trie);
889 		da->rtbl_size = RTBL_SIZE_INCR;
890 		da->range_tbl = NULL;
891 		da->xtbl_size = XTBL_SIZE_INCR;
892 		da->x_tbl = NULL;
893 		bzero(&da->dst, sizeof(da->dst));
894 		bzero(&da->mask, sizeof(da->mask));
895 		da->dst.sin_len = sizeof(da->dst);
896 		da->mask.sin_len = sizeof(da->mask);
897 		da->dst.sin_family = AF_INET;
898 		da->mask.sin_family = AF_INET;
899 	}
900 	if (da->range_tbl == NULL) {
901 		da->range_tbl = malloc(sizeof(*da->range_tbl) * da->rtbl_size
902 		    + FRAGS_PREF_SHORT, M_DXRAUX, M_NOWAIT);
903 		if (da->range_tbl == NULL) {
904 			FIB_PRINTF(LOG_NOTICE, da->fd,
905 			    "Unable to allocate DXR range table");
906 			return;
907 		}
908 		range_rebuild = 1;
909 	}
910 #ifdef DXR2
911 	if (da->x_tbl == NULL) {
912 		da->x_tbl = malloc(sizeof(*da->x_tbl) * da->xtbl_size,
913 		    M_DXRAUX, M_NOWAIT);
914 		if (da->x_tbl == NULL) {
915 			FIB_PRINTF(LOG_NOTICE, da->fd,
916 			    "Unable to allocate DXR extension table");
917 			return;
918 		}
919 		trie_rebuild = 1;
920 	}
921 #endif
922 
923 	microuptime(&t0);
924 
925 	dxr->nh_tbl = fib_get_nhop_array(da->fd);
926 	fib_get_rtable_info(fib_get_rh(da->fd), &rinfo);
927 
928 	if (da->updates_low > da->updates_high)
929 		range_rebuild = 1;
930 
931 range_build:
932 	if (range_rebuild) {
933 		/* Bulk cleanup */
934 		bzero(da->chunk_hashtbl, sizeof(da->chunk_hashtbl));
935 		while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) {
936 			LIST_REMOVE(cdp, cd_all_le);
937 			uma_zfree(chunk_zone, cdp);
938 		}
939 		for (i = 0; i < UNUSED_BUCKETS; i++)
940 			LIST_INIT(&da->unused_chunks[i]);
941 		da->unused_chunks_size = 0;
942 		da->rtbl_top = 0;
943 		da->updates_low = 0;
944 		da->updates_high = DIRECT_TBL_SIZE - 1;
945 		memset(da->updates_mask, 0xff, sizeof(da->updates_mask));
946 		for (i = 0; i < DIRECT_TBL_SIZE; i++) {
947 			da->direct_tbl[i].fragments = FRAGS_MARK_HIT;
948 			da->direct_tbl[i].base = 0;
949 		}
950 	}
951 	da->prefixes = rinfo.num_prefixes;
952 
953 	/* DXR: construct direct & range table */
954 	for (i = da->updates_low; i <= da->updates_high; i++) {
955 		m = da->updates_mask[i >> 5] >> (i & 0x1f);
956 		if (m == 0)
957 			i |= 0x1f;
958 		else if (m & 1 && update_chunk(da, i) != 0)
959 			return;
960 	}
961 
962 	range_frag = 0;
963 	if (da->rtbl_top)
964 		range_frag = da->unused_chunks_size * 10000ULL / da->rtbl_top;
965 	if (range_frag > V_frag_limit) {
966 		range_rebuild = 1;
967 		goto range_build;
968 	}
969 
970 	r_size = sizeof(*da->range_tbl) * da->rtbl_top;
971 	microuptime(&t1);
972 
973 #ifdef DXR2
974 	if (range_rebuild ||
975 	    abs(fls(da->prefixes) - fls(da->trie_rebuilt_prefixes)) > 1)
976 		trie_rebuild = 1;
977 
978 trie_build:
979 	if (trie_rebuild) {
980 		da->trie_rebuilt_prefixes = da->prefixes;
981 		da->d_bits = DXR_D;
982 		da->updates_low = 0;
983 		da->updates_high = DIRECT_TBL_SIZE - 1;
984 		if (!range_rebuild)
985 			memset(da->updates_mask, 0xff,
986 			    sizeof(da->updates_mask));
987 	}
988 
989 dxr2_try_squeeze:
990 	if (trie_rebuild) {
991 		/* Bulk cleanup */
992 		bzero(da->trietbl, sizeof(da->trietbl));
993 		bzero(da->trie_hashtbl, sizeof(da->trie_hashtbl));
994 		while ((tp = LIST_FIRST(&da->all_trie)) != NULL) {
995 			LIST_REMOVE(tp, td_all_le);
996 			uma_zfree(trie_zone, tp);
997 		}
998 		LIST_INIT(&da->unused_trie);
999 		da->all_trie_cnt = da->unused_trie_cnt = 0;
1000 	}
1001 
1002 	/* Populate d_tbl, x_tbl */
1003 	dxr_x = DXR_TRIE_BITS - da->d_bits;
1004 	d_tbl_size = (1 << da->d_bits);
1005 
1006 	for (i = da->updates_low >> dxr_x; i <= da->updates_high >> dxr_x;
1007 	    i++) {
1008 		if (!trie_rebuild) {
1009 			m = 0;
1010 			for (int j = 0; j < (1 << dxr_x); j += 32)
1011 				m |= da->updates_mask[((i << dxr_x) + j) >> 5];
1012 			if (m == 0)
1013 				continue;
1014 			trie_unref(da, i);
1015 		}
1016 		ti = trie_ref(da, i);
1017 		if (ti < 0)
1018 			return;
1019 		da->d_tbl[i] = ti;
1020 	}
1021 
1022 	trie_frag = 0;
1023 	if (da->all_trie_cnt)
1024 		trie_frag = da->unused_trie_cnt * 10000ULL / da->all_trie_cnt;
1025 	if (trie_frag > V_frag_limit) {
1026 		trie_rebuild = 1;
1027 		goto trie_build;
1028 	}
1029 
1030 	d_size = sizeof(*da->d_tbl) * d_tbl_size;
1031 	x_size = sizeof(*da->x_tbl) * DIRECT_TBL_SIZE / d_tbl_size
1032 	    * da->all_trie_cnt;
1033 	dxr_tot_size = d_size + x_size + r_size;
1034 
1035 	if (trie_rebuild == 1) {
1036 		/* Try to find a more compact D/X split */
1037 		if (prev_size == 0 || dxr_tot_size <= prev_size)
1038 			da->d_bits--;
1039 		else {
1040 			da->d_bits++;
1041 			trie_rebuild = 2;
1042 		}
1043 		prev_size = dxr_tot_size;
1044 		goto dxr2_try_squeeze;
1045 	}
1046 	microuptime(&t2);
1047 #else /* !DXR2 */
1048 	dxr_tot_size = sizeof(da->direct_tbl) + r_size;
1049 	t2 = t1;
1050 #endif
1051 
1052 	dxr->d = malloc(dxr_tot_size, M_DXRLPM, M_NOWAIT);
1053 	if (dxr->d == NULL) {
1054 		FIB_PRINTF(LOG_NOTICE, da->fd,
1055 		    "Unable to allocate DXR lookup table");
1056 		return;
1057 	}
1058 #ifdef DXR2
1059 	memcpy(dxr->d, da->d_tbl, d_size);
1060 	dxr->x = ((char *) dxr->d) + d_size;
1061 	memcpy(dxr->x, da->x_tbl, x_size);
1062 	dxr->r = ((char *) dxr->x) + x_size;
1063 	dxr->d_shift = 32 - da->d_bits;
1064 	dxr->x_shift = dxr_x;
1065 	dxr->x_mask = 0xffffffffU >> (32 - dxr_x);
1066 #else /* !DXR2 */
1067 	memcpy(dxr->d, da->direct_tbl, sizeof(da->direct_tbl));
1068 	dxr->r = ((char *) dxr->d) + sizeof(da->direct_tbl);
1069 #endif
1070 	memcpy(dxr->r, da->range_tbl, r_size);
1071 
1072 	if (da->updates_low <= da->updates_high)
1073 		bzero(&da->updates_mask[da->updates_low / 32],
1074 		    (da->updates_high - da->updates_low) / 8 + 1);
1075 	da->updates_low = DIRECT_TBL_SIZE - 1;
1076 	da->updates_high = 0;
1077 	microuptime(&t3);
1078 
1079 #ifdef DXR2
1080 	FIB_PRINTF(LOG_INFO, da->fd, "D%dX%dR, %d prefixes, %d nhops (max)",
1081 	    da->d_bits, dxr_x, rinfo.num_prefixes, rinfo.num_nhops);
1082 #else
1083 	FIB_PRINTF(LOG_INFO, da->fd, "D%dR, %d prefixes, %d nhops (max)",
1084 	    DXR_D, rinfo.num_prefixes, rinfo.num_nhops);
1085 #endif
1086 	i = dxr_tot_size * 100;
1087 	if (rinfo.num_prefixes)
1088 		i /= rinfo.num_prefixes;
1089 	FIB_PRINTF(LOG_INFO, da->fd, "%d.%02d KBytes, %d.%02d Bytes/prefix",
1090 	    dxr_tot_size / 1024, dxr_tot_size * 100 / 1024 % 100,
1091 	    i / 100, i % 100);
1092 #ifdef DXR2
1093 	FIB_PRINTF(LOG_INFO, da->fd,
1094 	    "%d.%02d%% trie, %d.%02d%% range fragmentation",
1095 	    trie_frag / 100, trie_frag % 100,
1096 	    range_frag / 100, range_frag % 100);
1097 #else
1098 	FIB_PRINTF(LOG_INFO, da->fd, "%d.%01d%% range fragmentation",
1099 	    range_frag / 100, range_frag % 100);
1100 #endif
1101 	i = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
1102 	FIB_PRINTF(LOG_INFO, da->fd, "range table %s in %u.%03u ms",
1103 	    range_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000);
1104 #ifdef DXR2
1105 	i = (t2.tv_sec - t1.tv_sec) * 1000000 + t2.tv_usec - t1.tv_usec;
1106 	FIB_PRINTF(LOG_INFO, da->fd, "trie %s in %u.%03u ms",
1107 	    trie_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000);
1108 #endif
1109 	i = (t3.tv_sec - t2.tv_sec) * 1000000 + t3.tv_usec - t2.tv_usec;
1110 	FIB_PRINTF(LOG_INFO, da->fd, "snapshot forked in %u.%03u ms",
1111 	    i / 1000, i % 1000);
1112 }
1113 
1114 /*
1115  * Glue functions for attaching to the FIB_ALGO infrastructure.
1116  */
1117 
1118 static struct nhop_object *
1119 dxr_fib_lookup(void *algo_data, const struct flm_lookup_key key,
1120     uint32_t scopeid)
1121 {
1122 	struct dxr *dxr = algo_data;
1123 
1124 	return (dxr->nh_tbl[dxr_lookup(dxr, ntohl(key.addr4.s_addr))]);
1125 }
1126 
1127 static enum flm_op_result
1128 dxr_init(uint32_t fibnum, struct fib_data *fd, void *old_data, void **data)
1129 {
1130 	struct dxr *old_dxr = old_data;
1131 	struct dxr_aux *da = NULL;
1132 	struct dxr *dxr;
1133 
1134 	dxr = malloc(sizeof(*dxr), M_DXRAUX, M_NOWAIT);
1135 	if (dxr == NULL) {
1136 		FIB_PRINTF(LOG_NOTICE, fd,
1137 		    "Unable to allocate DXR container struct");
1138 		return (FLM_REBUILD);
1139 	}
1140 
1141 	/* Check whether we may reuse the old auxiliary structures */
1142 	if (old_dxr != NULL && old_dxr->aux != NULL &&
1143 	    old_dxr->aux->fd == fd) {
1144 		da = old_dxr->aux;
1145 		atomic_add_int(&da->refcnt, 1);
1146 	}
1147 
1148 	dxr->aux = da;
1149 	dxr->d = NULL;
1150 	dxr->fd = fd;
1151 	dxr->fibnum = fibnum;
1152 	*data = dxr;
1153 
1154 	return (FLM_SUCCESS);
1155 }
1156 
1157 static void
1158 dxr_destroy(void *data)
1159 {
1160 	struct dxr *dxr = data;
1161 	struct dxr_aux *da = dxr->aux;
1162 	struct chunk_desc *cdp;
1163 	struct trie_desc *tp;
1164 
1165 	free(dxr->d, M_DXRLPM);
1166 	free(dxr, M_DXRAUX);
1167 
1168 	if (da == NULL || atomic_fetchadd_int(&da->refcnt, -1) > 1)
1169 		return;
1170 
1171 	/* Release auxiliary structures */
1172 	while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) {
1173 		LIST_REMOVE(cdp, cd_all_le);
1174 		uma_zfree(chunk_zone, cdp);
1175 	}
1176 	while ((tp = LIST_FIRST(&da->all_trie)) != NULL) {
1177 		LIST_REMOVE(tp, td_all_le);
1178 		uma_zfree(trie_zone, tp);
1179 	}
1180 	free(da->range_tbl, M_DXRAUX);
1181 	free(da->x_tbl, M_DXRAUX);
1182 	free(da, M_DXRAUX);
1183 }
1184 
1185 static void
1186 epoch_dxr_destroy(epoch_context_t ctx)
1187 {
1188 	struct dxr *dxr = __containerof(ctx, struct dxr, epoch_ctx);
1189 
1190 	dxr_destroy(dxr);
1191 }
1192 
1193 static void *
1194 choose_lookup_fn(struct dxr_aux *da)
1195 {
1196 
1197 #ifdef DXR2
1198 	switch (da->d_bits) {
1199 #if DXR_TRIE_BITS > 16
1200 	case 16:
1201 		return (dxr_fib_lookup_16);
1202 #endif
1203 	case 15:
1204 		return (dxr_fib_lookup_15);
1205 	case 14:
1206 		return (dxr_fib_lookup_14);
1207 	case 13:
1208 		return (dxr_fib_lookup_13);
1209 	case 12:
1210 		return (dxr_fib_lookup_12);
1211 	case 11:
1212 		return (dxr_fib_lookup_11);
1213 	case 10:
1214 		return (dxr_fib_lookup_10);
1215 	case 9:
1216 		return (dxr_fib_lookup_9);
1217 	}
1218 #endif /* DXR2 */
1219 	return (dxr_fib_lookup);
1220 }
1221 
1222 static enum flm_op_result
1223 dxr_dump_end(void *data, struct fib_dp *dp)
1224 {
1225 	struct dxr *dxr = data;
1226 	struct dxr_aux *da;
1227 
1228 	dxr_build(dxr);
1229 
1230 	da = dxr->aux;
1231 	if (da == NULL || dxr->d == NULL)
1232 		return (FLM_REBUILD);
1233 
1234 	if (da->rtbl_top >= BASE_MAX)
1235 		return (FLM_ERROR);
1236 
1237 	dp->f = choose_lookup_fn(da);
1238 	dp->arg = dxr;
1239 
1240 	return (FLM_SUCCESS);
1241 }
1242 
1243 static enum flm_op_result
1244 dxr_dump_rib_item(struct rtentry *rt, void *data)
1245 {
1246 
1247 	return (FLM_SUCCESS);
1248 }
1249 
1250 static enum flm_op_result
1251 dxr_change_rib_item(struct rib_head *rnh, struct rib_cmd_info *rc,
1252     void *data)
1253 {
1254 
1255 	return (FLM_BATCH);
1256 }
1257 
1258 static enum flm_op_result
1259 dxr_change_rib_batch(struct rib_head *rnh, struct fib_change_queue *q,
1260     void *data)
1261 {
1262 	struct dxr *dxr = data;
1263 	struct dxr *new_dxr;
1264 	struct dxr_aux *da;
1265 	struct fib_dp new_dp;
1266 	enum flm_op_result res;
1267 	uint32_t ip, plen, hmask, start, end, i, ui;
1268 #ifdef INVARIANTS
1269 	struct rib_rtable_info rinfo;
1270 	int update_delta = 0;
1271 #endif
1272 
1273 	MPASS(data != NULL);
1274 	MPASS(q != NULL);
1275 	MPASS(q->count < q->size);
1276 
1277 	da = dxr->aux;
1278 	MPASS(da != NULL);
1279 	MPASS(da->fd == dxr->fd);
1280 	MPASS(da->refcnt > 0);
1281 
1282 	FIB_PRINTF(LOG_INFO, da->fd, "processing %d update(s)", q->count);
1283 	for (ui = 0; ui < q->count; ui++) {
1284 #ifdef INVARIANTS
1285 		if (q->entries[ui].nh_new != NULL)
1286 			update_delta++;
1287 		if (q->entries[ui].nh_old != NULL)
1288 			update_delta--;
1289 #endif
1290 		plen = q->entries[ui].plen;
1291 		ip = ntohl(q->entries[ui].addr4.s_addr);
1292 		if (plen < 32)
1293 			hmask = 0xffffffffU >> plen;
1294 		else
1295 			hmask = 0;
1296 		start = (ip & ~hmask) >> DXR_RANGE_SHIFT;
1297 		end = (ip | hmask) >> DXR_RANGE_SHIFT;
1298 
1299 		if ((start & 0x1f) == 0 && (end & 0x1f) == 0x1f)
1300 			for (i = start >> 5; i <= end >> 5; i++)
1301 				da->updates_mask[i] = 0xffffffffU;
1302 		else
1303 			for (i = start; i <= end; i++)
1304 				da->updates_mask[i >> 5] |= (1 << (i & 0x1f));
1305 		if (start < da->updates_low)
1306 			da->updates_low = start;
1307 		if (end > da->updates_high)
1308 			da->updates_high = end;
1309 	}
1310 
1311 #ifdef INVARIANTS
1312 	fib_get_rtable_info(fib_get_rh(da->fd), &rinfo);
1313 	MPASS(da->prefixes + update_delta == rinfo.num_prefixes);
1314 #endif
1315 
1316 	res = dxr_init(0, dxr->fd, data, (void **) &new_dxr);
1317 	if (res != FLM_SUCCESS)
1318 		return (res);
1319 
1320 	dxr_build(new_dxr);
1321 
1322 	/* Structural limit exceeded, hard error */
1323 	if (da->rtbl_top >= BASE_MAX)
1324 		return (FLM_ERROR);
1325 
1326 	if (new_dxr->d == NULL) {
1327 		dxr_destroy(new_dxr);
1328 		return (FLM_REBUILD);
1329 	}
1330 
1331 	new_dp.f = choose_lookup_fn(da);
1332 	new_dp.arg = new_dxr;
1333 	if (fib_set_datapath_ptr(dxr->fd, &new_dp)) {
1334 		fib_set_algo_ptr(dxr->fd, new_dxr);
1335 		fib_epoch_call(epoch_dxr_destroy, &dxr->epoch_ctx);
1336 		return (FLM_SUCCESS);
1337 	}
1338 
1339 	FIB_PRINTF(LOG_NOTICE, dxr->fd, "fib_set_datapath_ptr() failed");
1340 	dxr_destroy(new_dxr);
1341 	return (FLM_REBUILD);
1342 }
1343 
1344 static uint8_t
1345 dxr_get_pref(const struct rib_rtable_info *rinfo)
1346 {
1347 
1348 	/* Below bsearch4 up to 10 prefixes. Always supersedes dpdk_lpm4. */
1349 	return (251);
1350 }
1351 
1352 SYSCTL_DECL(_net_route_algo);
1353 SYSCTL_NODE(_net_route_algo, OID_AUTO, dxr, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1354     "DXR tunables");
1355 
1356 static int
1357 sysctl_dxr_frag_limit(SYSCTL_HANDLER_ARGS)
1358 {
1359 	char buf[8];
1360 	int error, new, i;
1361 
1362 	snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100,
1363 	    V_frag_limit % 100);
1364 	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
1365 	if (error != 0 || req->newptr == NULL)
1366 		return (error);
1367 	if (!isdigit(*buf) && *buf != '.')
1368 		return (EINVAL);
1369 	for (i = 0, new = 0; isdigit(buf[i]) && i < sizeof(buf); i++)
1370 		new = new * 10 + buf[i] - '0';
1371 	new *= 100;
1372 	if (buf[i++] == '.') {
1373 		if (!isdigit(buf[i]))
1374 			return (EINVAL);
1375 		new += (buf[i++] - '0') * 10;
1376 		if (isdigit(buf[i]))
1377 			new += buf[i++] - '0';
1378 	}
1379 	if (new > 1000)
1380 		return (EINVAL);
1381 	V_frag_limit = new;
1382 	snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100,
1383 	    V_frag_limit % 100);
1384 	return (0);
1385 }
1386 
1387 SYSCTL_PROC(_net_route_algo_dxr, OID_AUTO, frag_limit,
1388     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_VNET,
1389     0, 0, sysctl_dxr_frag_limit, "A",
1390     "Fragmentation threshold to full rebuild");
1391 
1392 static struct fib_lookup_module fib_dxr_mod = {
1393 	.flm_name = "dxr",
1394 	.flm_family = AF_INET,
1395 	.flm_init_cb = dxr_init,
1396 	.flm_destroy_cb = dxr_destroy,
1397 	.flm_dump_rib_item_cb = dxr_dump_rib_item,
1398 	.flm_dump_end_cb = dxr_dump_end,
1399 	.flm_change_rib_item_cb = dxr_change_rib_item,
1400 	.flm_change_rib_items_cb = dxr_change_rib_batch,
1401 	.flm_get_pref = dxr_get_pref,
1402 };
1403 
1404 static int
1405 dxr_modevent(module_t mod, int type, void *unused)
1406 {
1407 	int error;
1408 
1409 	switch (type) {
1410 	case MOD_LOAD:
1411 		chunk_zone = uma_zcreate("dxr chunk", sizeof(struct chunk_desc),
1412 		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
1413 		trie_zone = uma_zcreate("dxr trie", sizeof(struct trie_desc),
1414 		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
1415 		fib_module_register(&fib_dxr_mod);
1416 		return(0);
1417 	case MOD_UNLOAD:
1418 		error = fib_module_unregister(&fib_dxr_mod);
1419 		if (error)
1420 			return (error);
1421 		uma_zdestroy(chunk_zone);
1422 		uma_zdestroy(trie_zone);
1423 		return(0);
1424 	default:
1425 		return(EOPNOTSUPP);
1426 	}
1427 }
1428 
1429 static moduledata_t dxr_mod = {"fib_dxr", dxr_modevent, 0};
1430 
1431 DECLARE_MODULE(fib_dxr, dxr_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
1432 MODULE_VERSION(fib_dxr, 1);
1433