xref: /freebsd/sys/netinet/in_fib_dxr.c (revision 7b71f57f4e514a2ab7308ce4147e14d90e099ad0)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2012-2024 Marko Zec
5  * Copyright (c) 2005, 2018 University of Zagreb
6  * Copyright (c) 2005 International Computer Science Institute
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 /*
31  * An implementation of DXR, a simple IPv4 LPM scheme with compact lookup
32  * structures and a trivial search procedure.  More significant bits of
33  * the search key are used to directly index a two-stage trie, while the
34  * remaining bits are used for finding the next hop in a sorted array.
35  * More details in:
36  *
37  * M. Zec, L. Rizzo, M. Mikuc, DXR: towards a billion routing lookups per
38  * second in software, ACM SIGCOMM Computer Communication Review, September
39  * 2012
40  *
41  * M. Zec, M. Mikuc, Pushing the envelope: beyond two billion IP routing
42  * lookups per second on commodity CPUs, IEEE SoftCOM, September 2017, Split
43  */
44 
45 #include "opt_inet.h"
46 
47 #include <sys/param.h>
48 #include <sys/kernel.h>
49 #include <sys/ctype.h>
50 #include <sys/epoch.h>
51 #include <sys/malloc.h>
52 #include <sys/module.h>
53 #include <sys/socket.h>
54 #include <sys/sysctl.h>
55 #include <sys/syslog.h>
56 
57 #include <vm/uma.h>
58 
59 #include <netinet/in.h>
60 #include <netinet/in_fib.h>
61 
62 #include <net/route.h>
63 #include <net/route/route_ctl.h>
64 #include <net/route/fib_algo.h>
65 
66 #define	DXR_TRIE_BITS		20
67 
68 CTASSERT(DXR_TRIE_BITS >= 16 && DXR_TRIE_BITS <= 24);
69 
70 #if DXR_TRIE_BITS > 16
71 #define	DXR_D			16
72 #else
73 #define	DXR_D			(DXR_TRIE_BITS - 1)
74 #endif
75 
76 #define	D_TBL_SIZE		(1 << DXR_D)
77 #define	DIRECT_TBL_SIZE		(1 << DXR_TRIE_BITS)
78 #define	DXR_RANGE_MASK		(0xffffffffU >> DXR_TRIE_BITS)
79 #define	DXR_RANGE_SHIFT		(32 - DXR_TRIE_BITS)
80 
81 #define	DESC_BASE_BITS		22
82 #define	DESC_FRAGMENTS_BITS	(32 - DESC_BASE_BITS)
83 #define	BASE_MAX		((1 << DESC_BASE_BITS) - 1)
84 #define	RTBL_SIZE_INCR		(BASE_MAX / 64)
85 
86 #if DXR_TRIE_BITS < 24
87 #define	FRAGS_MASK_SHORT	((1 << (23 - DXR_TRIE_BITS)) - 1)
88 #else
89 #define	FRAGS_MASK_SHORT	0
90 #endif
91 #define	FRAGS_PREF_SHORT	(((1 << DESC_FRAGMENTS_BITS) - 1) & \
92 				 ~FRAGS_MASK_SHORT)
93 #define	FRAGS_MARK_XL		(FRAGS_PREF_SHORT - 1)
94 #define	FRAGS_MARK_HIT		(FRAGS_PREF_SHORT - 2)
95 
96 #define	IS_SHORT_FORMAT(x)	((x & FRAGS_PREF_SHORT) == FRAGS_PREF_SHORT)
97 #define	IS_LONG_FORMAT(x)	((x & FRAGS_PREF_SHORT) != FRAGS_PREF_SHORT)
98 #define	IS_XL_FORMAT(x)		(x == FRAGS_MARK_XL)
99 
100 #define	RE_SHORT_MAX_NH		((1 << (DXR_TRIE_BITS - 8)) - 1)
101 
102 #define	CHUNK_HASH_BITS		16
103 #define	CHUNK_HASH_SIZE		(1 << CHUNK_HASH_BITS)
104 #define	CHUNK_HASH_MASK		(CHUNK_HASH_SIZE - 1)
105 
106 #define	TRIE_HASH_BITS		16
107 #define	TRIE_HASH_SIZE		(1 << TRIE_HASH_BITS)
108 #define	TRIE_HASH_MASK		(TRIE_HASH_SIZE - 1)
109 
110 #define	XTBL_SIZE_INCR		(DIRECT_TBL_SIZE / 16)
111 
112 #define	UNUSED_BUCKETS		8
113 
114 /* Lookup structure elements */
115 
116 struct direct_entry {
117 	uint32_t		fragments: DESC_FRAGMENTS_BITS,
118 				base: DESC_BASE_BITS;
119 };
120 
121 struct range_entry_long {
122 	uint32_t		start: DXR_RANGE_SHIFT,
123 				nexthop: DXR_TRIE_BITS;
124 };
125 
126 #if DXR_TRIE_BITS < 24
127 struct range_entry_short {
128 	uint16_t		start: DXR_RANGE_SHIFT - 8,
129 				nexthop: DXR_TRIE_BITS - 8;
130 };
131 #endif
132 
133 /* Auxiliary structures */
134 
135 struct heap_entry {
136 	uint32_t		start;
137 	uint32_t		end;
138 	uint32_t		preflen;
139 	uint32_t		nexthop;
140 };
141 
142 struct chunk_desc {
143 	LIST_ENTRY(chunk_desc)	cd_all_le;
144 	LIST_ENTRY(chunk_desc)	cd_hash_le;
145 	uint32_t		cd_hash;
146 	uint32_t		cd_refcnt;
147 	uint32_t		cd_base;
148 	uint32_t		cd_cur_size;
149 	uint32_t		cd_max_size;
150 };
151 
152 struct trie_desc {
153 	LIST_ENTRY(trie_desc)	td_all_le;
154 	LIST_ENTRY(trie_desc)	td_hash_le;
155 	uint32_t		td_hash;
156 	uint32_t		td_index;
157 	uint32_t		td_refcnt;
158 };
159 
160 struct dxr_aux {
161 	/* Glue to external state */
162 	struct fib_data		*fd;
163 	uint32_t		fibnum;
164 	int			refcnt;
165 
166 	/* Auxiliary build-time tables */
167 	struct direct_entry	direct_tbl[DIRECT_TBL_SIZE];
168 	uint16_t		d_tbl[D_TBL_SIZE];
169 	struct direct_entry	*x_tbl;
170 	union {
171 		struct range_entry_long	re;
172 		uint32_t	fragments;
173 	}			*range_tbl;
174 
175 	/* Auxiliary internal state */
176 	uint32_t		updates_mask[DIRECT_TBL_SIZE / 32];
177 	struct trie_desc	*trietbl[D_TBL_SIZE];
178 	LIST_HEAD(, chunk_desc)	chunk_hashtbl[CHUNK_HASH_SIZE];
179 	LIST_HEAD(, chunk_desc)	all_chunks;
180 	LIST_HEAD(, chunk_desc) unused_chunks[UNUSED_BUCKETS];
181 	LIST_HEAD(, trie_desc)	trie_hashtbl[TRIE_HASH_SIZE];
182 	LIST_HEAD(, trie_desc)	all_trie;
183 	LIST_HEAD(, trie_desc)	unused_trie; /* abuses hash link entry */
184 	struct sockaddr_in	dst;
185 	struct sockaddr_in	mask;
186 	struct heap_entry	heap[33];
187 	uint32_t		prefixes;
188 	uint32_t		updates_low;
189 	uint32_t		updates_high;
190 	uint32_t		unused_chunks_size;
191 	uint32_t		xtbl_size;
192 	uint32_t		all_trie_cnt;
193 	uint32_t		unused_trie_cnt;
194 	uint32_t		trie_rebuilt_prefixes;
195 	uint32_t		heap_index;
196 	uint32_t		d_bits;
197 	uint32_t		rtbl_size;
198 	uint32_t		rtbl_top;
199 	uint32_t		rtbl_work_frags;
200 	uint32_t		work_chunk;
201 };
202 
203 /* Main lookup structure container */
204 
205 struct dxr {
206 	/* Lookup tables */
207 	void			*d;
208 	void			*x;
209 	void			*r;
210 	struct nhop_object	**nh_tbl;
211 
212 	/* Glue to external state */
213 	struct dxr_aux		*aux;
214 	struct fib_data		*fd;
215 	struct epoch_context	epoch_ctx;
216 	uint32_t		fibnum;
217 	uint16_t		d_shift;
218 	uint16_t		x_shift;
219 	uint32_t		x_mask;
220 };
221 
222 static MALLOC_DEFINE(M_DXRLPM, "dxr", "DXR LPM");
223 static MALLOC_DEFINE(M_DXRAUX, "dxr aux", "DXR auxiliary");
224 
225 uma_zone_t chunk_zone;
226 uma_zone_t trie_zone;
227 
228 VNET_DEFINE_STATIC(int, frag_limit) = 100;
229 #define	V_frag_limit	VNET(frag_limit)
230 
231 /* Binary search for a matching address range */
232 #define	DXR_LOOKUP_STAGE					\
233 	if (masked_dst < range[middle].start) {			\
234 		upperbound = middle;				\
235 		middle = (middle + lowerbound) / 2;		\
236 	} else if (masked_dst < range[middle + 1].start)	\
237 		return (range[middle].nexthop);			\
238 	else {							\
239 		lowerbound = middle + 1;			\
240 		middle = (upperbound + middle + 1) / 2;		\
241 	}							\
242 	if (upperbound == lowerbound)				\
243 		return (range[lowerbound].nexthop);
244 
245 static int
range_lookup(struct range_entry_long * rt,struct direct_entry de,uint32_t dst)246 range_lookup(struct range_entry_long *rt, struct direct_entry de, uint32_t dst)
247 {
248 	uint32_t base;
249 	uint32_t upperbound;
250 	uint32_t middle;
251 	uint32_t lowerbound;
252 	uint32_t masked_dst;
253 
254 	base = de.base;
255 	lowerbound = 0;
256 	masked_dst = dst & DXR_RANGE_MASK;
257 
258 #if DXR_TRIE_BITS < 24
259 	if (__predict_true(IS_SHORT_FORMAT(de.fragments))) {
260 		upperbound = de.fragments & FRAGS_MASK_SHORT;
261 		struct range_entry_short *range =
262 		    (struct range_entry_short *) &rt[base];
263 
264 		masked_dst >>= 8;
265 		middle = upperbound;
266 		upperbound = upperbound * 2 + 1;
267 
268 		for (;;) {
269 			DXR_LOOKUP_STAGE
270 			DXR_LOOKUP_STAGE
271 		}
272 	}
273 #endif
274 
275 	upperbound = de.fragments;
276 	middle = upperbound / 2;
277 	struct range_entry_long *range = &rt[base];
278 	if (__predict_false(IS_XL_FORMAT(de.fragments))) {
279 		upperbound = *((uint32_t *) range);
280 		range++;
281 		middle = upperbound / 2;
282 	}
283 
284 	for (;;) {
285 		DXR_LOOKUP_STAGE
286 		DXR_LOOKUP_STAGE
287 	}
288 }
289 
290 #define DXR_LOOKUP_DEFINE(D)						\
291 	static int inline						\
292 	dxr_lookup_##D(struct dxr *dxr, uint32_t dst)			\
293 	{								\
294 		struct direct_entry de;					\
295 		uint16_t *dt = dxr->d;					\
296 		struct direct_entry *xt = dxr->x;			\
297 									\
298 		de = xt[(dt[dst >> (32 - (D))] << (DXR_TRIE_BITS - (D))) \
299 		    + ((dst >> DXR_RANGE_SHIFT) &			\
300 		    (0xffffffffU >> (32 - DXR_TRIE_BITS + (D))))];	\
301 		if (__predict_true(de.fragments == FRAGS_MARK_HIT))	\
302 			return (de.base);				\
303 		return (range_lookup(dxr->r, de, dst));			\
304 	}								\
305 									\
306 	static struct nhop_object *					\
307 	dxr_fib_lookup_##D(void *algo_data,				\
308 	    const struct flm_lookup_key key, uint32_t scopeid __unused)	\
309 	{								\
310 		struct dxr *dxr = algo_data;				\
311 									\
312 		return (dxr->nh_tbl[dxr_lookup_##D(dxr,			\
313 		    ntohl(key.addr4.s_addr))]);				\
314 	}
315 
316 #if DXR_TRIE_BITS > 16
317 DXR_LOOKUP_DEFINE(16)
318 #endif
319 DXR_LOOKUP_DEFINE(15)
320 DXR_LOOKUP_DEFINE(14)
321 DXR_LOOKUP_DEFINE(13)
322 DXR_LOOKUP_DEFINE(12)
323 DXR_LOOKUP_DEFINE(11)
324 DXR_LOOKUP_DEFINE(10)
325 DXR_LOOKUP_DEFINE(9)
326 
327 static int inline
dxr_lookup(struct dxr * dxr,uint32_t dst)328 dxr_lookup(struct dxr *dxr, uint32_t dst)
329 {
330 	struct direct_entry de;
331 	uint16_t *dt = dxr->d;
332 	struct direct_entry *xt = dxr->x;
333 
334 	de = xt[(dt[dst >> dxr->d_shift] << dxr->x_shift) +
335 	    ((dst >> DXR_RANGE_SHIFT) & dxr->x_mask)];
336 	if (__predict_true(de.fragments == FRAGS_MARK_HIT))
337 		return (de.base);
338 	return (range_lookup(dxr->r, de, dst));
339 }
340 
341 static void
initheap(struct dxr_aux * da,uint32_t dst_u32,uint32_t chunk)342 initheap(struct dxr_aux *da, uint32_t dst_u32, uint32_t chunk)
343 {
344 	struct heap_entry *fhp = &da->heap[0];
345 	struct rtentry *rt;
346 	struct route_nhop_data rnd;
347 
348 	da->heap_index = 0;
349 	da->dst.sin_addr.s_addr = htonl(dst_u32);
350 	rt = fib4_lookup_rt(da->fibnum, da->dst.sin_addr, 0, NHR_UNLOCKED,
351 	    &rnd);
352 	if (rt != NULL) {
353 		struct in_addr addr;
354 		uint32_t scopeid;
355 
356 		rt_get_inet_prefix_plen(rt, &addr, &fhp->preflen, &scopeid);
357 		fhp->start = ntohl(addr.s_addr);
358 		fhp->end = fhp->start;
359 		if (fhp->preflen < 32)
360 			fhp->end |= (0xffffffffU >> fhp->preflen);
361 		fhp->nexthop = fib_get_nhop_idx(da->fd, rnd.rnd_nhop);
362 	} else {
363 		fhp->preflen = fhp->nexthop = fhp->start = 0;
364 		fhp->end = 0xffffffffU;
365 	}
366 }
367 
368 static uint32_t
chunk_size(struct dxr_aux * da,struct direct_entry * fdesc)369 chunk_size(struct dxr_aux *da, struct direct_entry *fdesc)
370 {
371 
372 	if (IS_SHORT_FORMAT(fdesc->fragments))
373 		return ((fdesc->fragments & FRAGS_MASK_SHORT) + 1);
374 	else if (IS_XL_FORMAT(fdesc->fragments))
375 		return (da->range_tbl[fdesc->base].fragments + 2);
376 	else /* if (IS_LONG_FORMAT(fdesc->fragments)) */
377 		return (fdesc->fragments + 1);
378 }
379 
380 static uint32_t
chunk_hash(struct dxr_aux * da,struct direct_entry * fdesc)381 chunk_hash(struct dxr_aux *da, struct direct_entry *fdesc)
382 {
383 	uint32_t size = chunk_size(da, fdesc);
384 	uint32_t *p = (uint32_t *) &da->range_tbl[fdesc->base];
385 	uint32_t *l = (uint32_t *) &da->range_tbl[fdesc->base + size];
386 	uint32_t hash = fdesc->fragments;
387 
388 	for (; p < l; p++)
389 		hash = (hash << 7) + (hash >> 13) + *p;
390 
391 	return (hash + (hash >> 16));
392 }
393 
394 static int
chunk_ref(struct dxr_aux * da,uint32_t chunk)395 chunk_ref(struct dxr_aux *da, uint32_t chunk)
396 {
397 	struct direct_entry *fdesc = &da->direct_tbl[chunk];
398 	struct chunk_desc *cdp, *empty_cdp;
399 	uint32_t base = fdesc->base;
400 	uint32_t size = chunk_size(da, fdesc);
401 	uint32_t hash = chunk_hash(da, fdesc);
402 	int i;
403 
404 	/* Find an existing descriptor */
405 	LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK],
406 	    cd_hash_le) {
407 		if (cdp->cd_hash != hash || cdp->cd_cur_size != size ||
408 		    memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base],
409 		    sizeof(struct range_entry_long) * size))
410 			continue;
411 		da->rtbl_top = fdesc->base;
412 		fdesc->base = cdp->cd_base;
413 		cdp->cd_refcnt++;
414 		return (0);
415 	}
416 
417 	/* No matching chunks found. Find an empty one to recycle. */
418 	for (cdp = NULL, i = size; cdp == NULL && i < UNUSED_BUCKETS; i++)
419 		cdp = LIST_FIRST(&da->unused_chunks[i]);
420 
421 	if (cdp == NULL)
422 		LIST_FOREACH(empty_cdp, &da->unused_chunks[0], cd_hash_le)
423 			if (empty_cdp->cd_max_size >= size && (cdp == NULL ||
424 			    empty_cdp->cd_max_size < cdp->cd_max_size)) {
425 				cdp = empty_cdp;
426 				if (empty_cdp->cd_max_size == size)
427 					break;
428 			}
429 
430 	if (cdp != NULL) {
431 		/* Copy from heap into the recycled chunk */
432 		bcopy(&da->range_tbl[fdesc->base], &da->range_tbl[cdp->cd_base],
433 		    size * sizeof(struct range_entry_long));
434 		fdesc->base = cdp->cd_base;
435 		da->rtbl_top -= size;
436 		da->unused_chunks_size -= cdp->cd_max_size;
437 		if (cdp->cd_max_size > size) {
438 			/* Split the range in two, need a new descriptor */
439 			empty_cdp = uma_zalloc(chunk_zone, M_NOWAIT);
440 			if (empty_cdp == NULL)
441 				return (1);
442 			LIST_INSERT_BEFORE(cdp, empty_cdp, cd_all_le);
443 			empty_cdp->cd_base = cdp->cd_base + size;
444 			empty_cdp->cd_cur_size = 0;
445 			empty_cdp->cd_max_size = cdp->cd_max_size - size;
446 
447 			i = empty_cdp->cd_max_size;
448 			if (i >= UNUSED_BUCKETS)
449 				i = 0;
450 			LIST_INSERT_HEAD(&da->unused_chunks[i], empty_cdp,
451 			    cd_hash_le);
452 
453 			da->unused_chunks_size += empty_cdp->cd_max_size;
454 			cdp->cd_max_size = size;
455 		}
456 		LIST_REMOVE(cdp, cd_hash_le);
457 	} else {
458 		/* Alloc a new descriptor at the top of the heap*/
459 		cdp = uma_zalloc(chunk_zone, M_NOWAIT);
460 		if (cdp == NULL)
461 			return (1);
462 		cdp->cd_max_size = size;
463 		cdp->cd_base = fdesc->base;
464 		LIST_INSERT_HEAD(&da->all_chunks, cdp, cd_all_le);
465 		MPASS(cdp->cd_base + cdp->cd_max_size == da->rtbl_top);
466 	}
467 
468 	cdp->cd_hash = hash;
469 	cdp->cd_refcnt = 1;
470 	cdp->cd_cur_size = size;
471 	LIST_INSERT_HEAD(&da->chunk_hashtbl[hash & CHUNK_HASH_MASK], cdp,
472 	    cd_hash_le);
473 	if (da->rtbl_top >= da->rtbl_size) {
474 		if (da->rtbl_top >= BASE_MAX) {
475 			FIB_PRINTF(LOG_ERR, da->fd,
476 			    "structural limit exceeded at %d "
477 			    "range table elements", da->rtbl_top);
478 			return (1);
479 		}
480 		da->rtbl_size += RTBL_SIZE_INCR;
481 		i = (BASE_MAX - da->rtbl_top) * LOG_DEBUG / BASE_MAX;
482 		FIB_PRINTF(i, da->fd, "range table at %d%% structural limit",
483 		    da->rtbl_top * 100 / BASE_MAX);
484 		da->range_tbl = realloc(da->range_tbl,
485 		    sizeof(*da->range_tbl) * da->rtbl_size + FRAGS_PREF_SHORT,
486 		    M_DXRAUX, M_NOWAIT);
487 		if (da->range_tbl == NULL) {
488 			FIB_PRINTF(LOG_NOTICE, da->fd,
489 			    "Unable to allocate DXR range table");
490 			return (1);
491 		}
492 	}
493 
494 	return (0);
495 }
496 
497 static void
chunk_unref(struct dxr_aux * da,uint32_t chunk)498 chunk_unref(struct dxr_aux *da, uint32_t chunk)
499 {
500 	struct direct_entry *fdesc = &da->direct_tbl[chunk];
501 	struct chunk_desc *cdp, *cdp2;
502 	uint32_t base = fdesc->base;
503 	uint32_t size = chunk_size(da, fdesc);
504 	uint32_t hash = chunk_hash(da, fdesc);
505 	int i;
506 
507 	/* Find the corresponding descriptor */
508 	LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK],
509 	    cd_hash_le)
510 		if (cdp->cd_hash == hash && cdp->cd_cur_size == size &&
511 		    memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base],
512 		    sizeof(struct range_entry_long) * size) == 0)
513 			break;
514 
515 	MPASS(cdp != NULL);
516 	if (--cdp->cd_refcnt > 0)
517 		return;
518 
519 	LIST_REMOVE(cdp, cd_hash_le);
520 	da->unused_chunks_size += cdp->cd_max_size;
521 	cdp->cd_cur_size = 0;
522 
523 	/* Attempt to merge with the preceding chunk, if empty */
524 	cdp2 = LIST_NEXT(cdp, cd_all_le);
525 	if (cdp2 != NULL && cdp2->cd_cur_size == 0) {
526 		MPASS(cdp2->cd_base + cdp2->cd_max_size == cdp->cd_base);
527 		LIST_REMOVE(cdp, cd_all_le);
528 		LIST_REMOVE(cdp2, cd_hash_le);
529 		cdp2->cd_max_size += cdp->cd_max_size;
530 		uma_zfree(chunk_zone, cdp);
531 		cdp = cdp2;
532 	}
533 
534 	/* Attempt to merge with the subsequent chunk, if empty */
535 	cdp2 = LIST_PREV(cdp, &da->all_chunks, chunk_desc, cd_all_le);
536 	if (cdp2 != NULL && cdp2->cd_cur_size == 0) {
537 		MPASS(cdp->cd_base + cdp->cd_max_size == cdp2->cd_base);
538 		LIST_REMOVE(cdp, cd_all_le);
539 		LIST_REMOVE(cdp2, cd_hash_le);
540 		cdp2->cd_max_size += cdp->cd_max_size;
541 		cdp2->cd_base = cdp->cd_base;
542 		uma_zfree(chunk_zone, cdp);
543 		cdp = cdp2;
544 	}
545 
546 	if (cdp->cd_base + cdp->cd_max_size == da->rtbl_top) {
547 		/* Free the chunk on the top of the range heap, trim the heap */
548 		MPASS(cdp == LIST_FIRST(&da->all_chunks));
549 		da->rtbl_top -= cdp->cd_max_size;
550 		da->unused_chunks_size -= cdp->cd_max_size;
551 		LIST_REMOVE(cdp, cd_all_le);
552 		uma_zfree(chunk_zone, cdp);
553 		return;
554 	}
555 
556 	i = cdp->cd_max_size;
557 	if (i >= UNUSED_BUCKETS)
558 		i = 0;
559 	LIST_INSERT_HEAD(&da->unused_chunks[i], cdp, cd_hash_le);
560 }
561 
562 static uint32_t
trie_hash(struct dxr_aux * da,uint32_t dxr_x,uint32_t index)563 trie_hash(struct dxr_aux *da, uint32_t dxr_x, uint32_t index)
564 {
565 	uint32_t i, *val;
566 	uint32_t hash = 0;
567 
568 	for (i = 0; i < (1 << dxr_x); i++) {
569 		hash = (hash << 3) ^ (hash >> 3);
570 		val = (uint32_t *)
571 		    (void *) &da->direct_tbl[(index << dxr_x) + i];
572 		hash += (*val << 5);
573 		hash += (*val >> 5);
574 	}
575 
576 	return (hash + (hash >> 16));
577 }
578 
579 static int
trie_ref(struct dxr_aux * da,uint32_t index)580 trie_ref(struct dxr_aux *da, uint32_t index)
581 {
582 	struct trie_desc *tp;
583 	uint32_t dxr_d = da->d_bits;
584 	uint32_t dxr_x = DXR_TRIE_BITS - dxr_d;
585 	uint32_t hash = trie_hash(da, dxr_x, index);
586 
587 	/* Find an existing descriptor */
588 	LIST_FOREACH(tp, &da->trie_hashtbl[hash & TRIE_HASH_MASK], td_hash_le)
589 		if (tp->td_hash == hash &&
590 		    memcmp(&da->direct_tbl[index << dxr_x],
591 		    &da->x_tbl[tp->td_index << dxr_x],
592 		    sizeof(*da->x_tbl) << dxr_x) == 0) {
593 			tp->td_refcnt++;
594 			da->trietbl[index] = tp;
595 			return(tp->td_index);
596 		}
597 
598 	tp = LIST_FIRST(&da->unused_trie);
599 	if (tp != NULL) {
600 		LIST_REMOVE(tp, td_hash_le);
601 		da->unused_trie_cnt--;
602 	} else {
603 		tp = uma_zalloc(trie_zone, M_NOWAIT);
604 		if (tp == NULL)
605 			return (-1);
606 		LIST_INSERT_HEAD(&da->all_trie, tp, td_all_le);
607 		tp->td_index = da->all_trie_cnt++;
608 	}
609 
610 	tp->td_hash = hash;
611 	tp->td_refcnt = 1;
612 	LIST_INSERT_HEAD(&da->trie_hashtbl[hash & TRIE_HASH_MASK], tp,
613 	   td_hash_le);
614 	memcpy(&da->x_tbl[tp->td_index << dxr_x],
615 	    &da->direct_tbl[index << dxr_x], sizeof(*da->x_tbl) << dxr_x);
616 	da->trietbl[index] = tp;
617 	if (da->all_trie_cnt >= da->xtbl_size >> dxr_x) {
618 		da->xtbl_size += XTBL_SIZE_INCR;
619 		da->x_tbl = realloc(da->x_tbl,
620 		    sizeof(*da->x_tbl) * da->xtbl_size, M_DXRAUX, M_NOWAIT);
621 		if (da->x_tbl == NULL) {
622 			FIB_PRINTF(LOG_NOTICE, da->fd,
623 			    "Unable to allocate DXR extension table");
624 			return (-1);
625 		}
626 	}
627 	return(tp->td_index);
628 }
629 
630 static void
trie_unref(struct dxr_aux * da,uint32_t index)631 trie_unref(struct dxr_aux *da, uint32_t index)
632 {
633 	struct trie_desc *tp = da->trietbl[index];
634 
635 	if (tp == NULL)
636 		return;
637 	da->trietbl[index] = NULL;
638 	if (--tp->td_refcnt > 0)
639 		return;
640 
641 	LIST_REMOVE(tp, td_hash_le);
642 	da->unused_trie_cnt++;
643 	if (tp->td_index != da->all_trie_cnt - 1) {
644 		LIST_INSERT_HEAD(&da->unused_trie, tp, td_hash_le);
645 		return;
646 	}
647 
648 	do {
649 		da->all_trie_cnt--;
650 		da->unused_trie_cnt--;
651 		LIST_REMOVE(tp, td_all_le);
652 		uma_zfree(trie_zone, tp);
653 		LIST_FOREACH(tp, &da->unused_trie, td_hash_le)
654 			if (tp->td_index == da->all_trie_cnt - 1) {
655 				LIST_REMOVE(tp, td_hash_le);
656 				break;
657 			}
658 	} while (tp != NULL);
659 }
660 
661 static void
heap_inject(struct dxr_aux * da,uint32_t start,uint32_t end,uint32_t preflen,uint32_t nh)662 heap_inject(struct dxr_aux *da, uint32_t start, uint32_t end, uint32_t preflen,
663     uint32_t nh)
664 {
665 	struct heap_entry *fhp;
666 	int i;
667 
668 	for (i = da->heap_index; i >= 0; i--) {
669 		if (preflen > da->heap[i].preflen)
670 			break;
671 		else if (preflen < da->heap[i].preflen)
672 			da->heap[i + 1] = da->heap[i];
673 		else
674 			return;
675 	}
676 
677 	fhp = &da->heap[i + 1];
678 	fhp->preflen = preflen;
679 	fhp->start = start;
680 	fhp->end = end;
681 	fhp->nexthop = nh;
682 	da->heap_index++;
683 }
684 
685 static int
dxr_walk(struct rtentry * rt,void * arg)686 dxr_walk(struct rtentry *rt, void *arg)
687 {
688 	struct dxr_aux *da = arg;
689 	uint32_t chunk = da->work_chunk;
690 	uint32_t first = chunk << DXR_RANGE_SHIFT;
691 	uint32_t last = first | DXR_RANGE_MASK;
692 	struct range_entry_long *fp =
693 	    &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re;
694 	struct heap_entry *fhp = &da->heap[da->heap_index];
695 	uint32_t preflen, nh, start, end, scopeid;
696 	struct in_addr addr;
697 
698 	rt_get_inet_prefix_plen(rt, &addr, &preflen, &scopeid);
699 	start = ntohl(addr.s_addr);
700 	if (start > last)
701 		return (-1);	/* Beyond chunk boundaries, we are done */
702 	if (start < first)
703 		return (0);	/* Skip this route */
704 
705 	end = start;
706 	if (preflen < 32)
707 		end |= (0xffffffffU >> preflen);
708 	nh = fib_get_nhop_idx(da->fd, rt_get_raw_nhop(rt));
709 
710 	if (start == fhp->start)
711 		heap_inject(da, start, end, preflen, nh);
712 	else {
713 		/* start > fhp->start */
714 		while (start > fhp->end) {
715 			uint32_t oend = fhp->end;
716 
717 			if (da->heap_index > 0) {
718 				fhp--;
719 				da->heap_index--;
720 			} else
721 				initheap(da, fhp->end + 1, chunk);
722 			if (fhp->end > oend && fhp->nexthop != fp->nexthop) {
723 				fp++;
724 				da->rtbl_work_frags++;
725 				fp->start = (oend + 1) & DXR_RANGE_MASK;
726 				fp->nexthop = fhp->nexthop;
727 			}
728 		}
729 		if (start > ((chunk << DXR_RANGE_SHIFT) | fp->start) &&
730 		    nh != fp->nexthop) {
731 			fp++;
732 			da->rtbl_work_frags++;
733 			fp->start = start & DXR_RANGE_MASK;
734 		} else if (da->rtbl_work_frags) {
735 			if ((--fp)->nexthop == nh)
736 				da->rtbl_work_frags--;
737 			else
738 				fp++;
739 		}
740 		fp->nexthop = nh;
741 		heap_inject(da, start, end, preflen, nh);
742 	}
743 
744 	return (0);
745 }
746 
747 static int
update_chunk(struct dxr_aux * da,uint32_t chunk)748 update_chunk(struct dxr_aux *da, uint32_t chunk)
749 {
750 	struct range_entry_long *fp;
751 #if DXR_TRIE_BITS < 24
752 	struct range_entry_short *fps;
753 	uint32_t start, nh, i;
754 #endif
755 	struct heap_entry *fhp;
756 	uint32_t first = chunk << DXR_RANGE_SHIFT;
757 	uint32_t last = first | DXR_RANGE_MASK;
758 
759 	if (da->direct_tbl[chunk].fragments != FRAGS_MARK_HIT)
760 		chunk_unref(da, chunk);
761 
762 	initheap(da, first, chunk);
763 
764 	fp = &da->range_tbl[da->rtbl_top].re;
765 	da->rtbl_work_frags = 0;
766 	fp->start = first & DXR_RANGE_MASK;
767 	fp->nexthop = da->heap[0].nexthop;
768 
769 	da->dst.sin_addr.s_addr = htonl(first);
770 	da->mask.sin_addr.s_addr = htonl(~DXR_RANGE_MASK);
771 
772 	da->work_chunk = chunk;
773 	rib_walk_from(da->fibnum, AF_INET, RIB_FLAG_LOCKED,
774 	    (struct sockaddr *) &da->dst, (struct sockaddr *) &da->mask,
775 	    dxr_walk, da);
776 
777 	/* Flush any remaining objects on the heap */
778 	fp = &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re;
779 	fhp = &da->heap[da->heap_index];
780 	while (fhp->preflen > DXR_TRIE_BITS) {
781 		uint32_t oend = fhp->end;
782 
783 		if (da->heap_index > 0) {
784 			fhp--;
785 			da->heap_index--;
786 		} else
787 			initheap(da, fhp->end + 1, chunk);
788 		if (fhp->end > oend && fhp->nexthop != fp->nexthop) {
789 			/* Have we crossed the upper chunk boundary? */
790 			if (oend >= last)
791 				break;
792 			fp++;
793 			da->rtbl_work_frags++;
794 			fp->start = (oend + 1) & DXR_RANGE_MASK;
795 			fp->nexthop = fhp->nexthop;
796 		}
797 	}
798 
799 	/* Direct hit if the chunk contains only a single fragment */
800 	if (da->rtbl_work_frags == 0) {
801 		da->direct_tbl[chunk].base = fp->nexthop;
802 		da->direct_tbl[chunk].fragments = FRAGS_MARK_HIT;
803 		return (0);
804 	}
805 
806 	da->direct_tbl[chunk].base = da->rtbl_top;
807 	da->direct_tbl[chunk].fragments = da->rtbl_work_frags;
808 
809 #if DXR_TRIE_BITS < 24
810 	/* Check whether the chunk can be more compactly encoded */
811 	fp = &da->range_tbl[da->rtbl_top].re;
812 	for (i = 0; i <= da->rtbl_work_frags; i++, fp++)
813 		if ((fp->start & 0xff) != 0 || fp->nexthop > RE_SHORT_MAX_NH)
814 			break;
815 	if (i == da->rtbl_work_frags + 1) {
816 		fp = &da->range_tbl[da->rtbl_top].re;
817 		fps = (void *) fp;
818 		for (i = 0; i <= da->rtbl_work_frags; i++, fp++, fps++) {
819 			start = fp->start;
820 			nh = fp->nexthop;
821 			fps->start = start >> 8;
822 			fps->nexthop = nh;
823 		}
824 		fps->start = start >> 8;
825 		fps->nexthop = nh;
826 		da->rtbl_work_frags >>= 1;
827 		da->direct_tbl[chunk].fragments =
828 		    da->rtbl_work_frags | FRAGS_PREF_SHORT;
829 	} else
830 #endif
831 	if (da->rtbl_work_frags >= FRAGS_MARK_HIT) {
832 		da->direct_tbl[chunk].fragments = FRAGS_MARK_XL;
833 		memmove(&da->range_tbl[da->rtbl_top + 1],
834 		   &da->range_tbl[da->rtbl_top],
835 		   (da->rtbl_work_frags + 1) * sizeof(*da->range_tbl));
836 		da->range_tbl[da->rtbl_top].fragments = da->rtbl_work_frags;
837 		da->rtbl_work_frags++;
838 	}
839 	da->rtbl_top += (da->rtbl_work_frags + 1);
840 	return (chunk_ref(da, chunk));
841 }
842 
843 static void
dxr_build(struct dxr * dxr)844 dxr_build(struct dxr *dxr)
845 {
846 	struct dxr_aux *da = dxr->aux;
847 	struct chunk_desc *cdp;
848 	struct rib_rtable_info rinfo;
849 	struct timeval t0, t1, t2, t3;
850 	uint32_t r_size, dxr_tot_size;
851 	uint32_t i, m, range_rebuild = 0;
852 	uint32_t range_frag;
853 	struct trie_desc *tp;
854 	uint32_t d_tbl_size, dxr_x, d_size, x_size;
855 	uint32_t ti, trie_rebuild = 0, prev_size = 0;
856 	uint32_t trie_frag;
857 
858 	MPASS(dxr->d == NULL);
859 
860 	if (da == NULL) {
861 		da = malloc(sizeof(*dxr->aux), M_DXRAUX, M_NOWAIT);
862 		if (da == NULL) {
863 			FIB_PRINTF(LOG_NOTICE, dxr->fd,
864 			    "Unable to allocate DXR aux struct");
865 			return;
866 		}
867 		dxr->aux = da;
868 		da->fibnum = dxr->fibnum;
869 		da->fd = dxr->fd;
870 		da->refcnt = 1;
871 		LIST_INIT(&da->all_chunks);
872 		LIST_INIT(&da->all_trie);
873 		da->rtbl_size = RTBL_SIZE_INCR;
874 		da->range_tbl = NULL;
875 		da->xtbl_size = XTBL_SIZE_INCR;
876 		da->x_tbl = NULL;
877 		bzero(&da->dst, sizeof(da->dst));
878 		bzero(&da->mask, sizeof(da->mask));
879 		da->dst.sin_len = sizeof(da->dst);
880 		da->mask.sin_len = sizeof(da->mask);
881 		da->dst.sin_family = AF_INET;
882 		da->mask.sin_family = AF_INET;
883 	}
884 	if (da->range_tbl == NULL) {
885 		da->range_tbl = malloc(sizeof(*da->range_tbl) * da->rtbl_size
886 		    + FRAGS_PREF_SHORT, M_DXRAUX, M_NOWAIT);
887 		if (da->range_tbl == NULL) {
888 			FIB_PRINTF(LOG_NOTICE, da->fd,
889 			    "Unable to allocate DXR range table");
890 			return;
891 		}
892 		range_rebuild = 1;
893 	}
894 	if (da->x_tbl == NULL) {
895 		da->x_tbl = malloc(sizeof(*da->x_tbl) * da->xtbl_size,
896 		    M_DXRAUX, M_NOWAIT);
897 		if (da->x_tbl == NULL) {
898 			FIB_PRINTF(LOG_NOTICE, da->fd,
899 			    "Unable to allocate DXR extension table");
900 			return;
901 		}
902 		trie_rebuild = 1;
903 	}
904 
905 	microuptime(&t0);
906 
907 	dxr->nh_tbl = fib_get_nhop_array(da->fd);
908 	fib_get_rtable_info(fib_get_rh(da->fd), &rinfo);
909 
910 	if (da->updates_low > da->updates_high)
911 		range_rebuild = 1;
912 
913 range_build:
914 	if (range_rebuild) {
915 		/* Bulk cleanup */
916 		bzero(da->chunk_hashtbl, sizeof(da->chunk_hashtbl));
917 		while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) {
918 			LIST_REMOVE(cdp, cd_all_le);
919 			uma_zfree(chunk_zone, cdp);
920 		}
921 		for (i = 0; i < UNUSED_BUCKETS; i++)
922 			LIST_INIT(&da->unused_chunks[i]);
923 		da->unused_chunks_size = 0;
924 		da->rtbl_top = 0;
925 		da->updates_low = 0;
926 		da->updates_high = DIRECT_TBL_SIZE - 1;
927 		memset(da->updates_mask, 0xff, sizeof(da->updates_mask));
928 		for (i = 0; i < DIRECT_TBL_SIZE; i++) {
929 			da->direct_tbl[i].fragments = FRAGS_MARK_HIT;
930 			da->direct_tbl[i].base = 0;
931 		}
932 	}
933 	da->prefixes = rinfo.num_prefixes;
934 
935 	/* DXR: construct direct & range table */
936 	for (i = da->updates_low; i <= da->updates_high; i++) {
937 		m = da->updates_mask[i >> 5] >> (i & 0x1f);
938 		if (m == 0)
939 			i |= 0x1f;
940 		else if (m & 1 && update_chunk(da, i) != 0)
941 			return;
942 	}
943 
944 	range_frag = 0;
945 	if (da->rtbl_top)
946 		range_frag = da->unused_chunks_size * 10000ULL / da->rtbl_top;
947 	if (range_frag > V_frag_limit) {
948 		range_rebuild = 1;
949 		goto range_build;
950 	}
951 
952 	r_size = sizeof(*da->range_tbl) * da->rtbl_top;
953 	microuptime(&t1);
954 
955 	if (range_rebuild ||
956 	    abs(fls(da->prefixes) - fls(da->trie_rebuilt_prefixes)) > 1)
957 		trie_rebuild = 1;
958 
959 trie_build:
960 	if (trie_rebuild) {
961 		da->trie_rebuilt_prefixes = da->prefixes;
962 		da->d_bits = DXR_D;
963 		da->updates_low = 0;
964 		da->updates_high = DIRECT_TBL_SIZE - 1;
965 		if (!range_rebuild)
966 			memset(da->updates_mask, 0xff,
967 			    sizeof(da->updates_mask));
968 	}
969 
970 dxr2_try_squeeze:
971 	if (trie_rebuild) {
972 		/* Bulk cleanup */
973 		bzero(da->trietbl, sizeof(da->trietbl));
974 		bzero(da->trie_hashtbl, sizeof(da->trie_hashtbl));
975 		while ((tp = LIST_FIRST(&da->all_trie)) != NULL) {
976 			LIST_REMOVE(tp, td_all_le);
977 			uma_zfree(trie_zone, tp);
978 		}
979 		LIST_INIT(&da->unused_trie);
980 		da->all_trie_cnt = da->unused_trie_cnt = 0;
981 	}
982 
983 	/* Populate d_tbl, x_tbl */
984 	dxr_x = DXR_TRIE_BITS - da->d_bits;
985 	d_tbl_size = (1 << da->d_bits);
986 
987 	for (i = da->updates_low >> dxr_x; i <= da->updates_high >> dxr_x;
988 	    i++) {
989 		if (!trie_rebuild) {
990 			m = 0;
991 			for (int j = 0; j < (1 << dxr_x); j += 32)
992 				m |= da->updates_mask[((i << dxr_x) + j) >> 5];
993 			if (m == 0)
994 				continue;
995 			trie_unref(da, i);
996 		}
997 		ti = trie_ref(da, i);
998 		if (ti < 0)
999 			return;
1000 		da->d_tbl[i] = ti;
1001 	}
1002 
1003 	trie_frag = 0;
1004 	if (da->all_trie_cnt)
1005 		trie_frag = da->unused_trie_cnt * 10000ULL / da->all_trie_cnt;
1006 	if (trie_frag > V_frag_limit) {
1007 		trie_rebuild = 1;
1008 		goto trie_build;
1009 	}
1010 
1011 	d_size = sizeof(*da->d_tbl) * d_tbl_size;
1012 	x_size = sizeof(*da->x_tbl) * DIRECT_TBL_SIZE / d_tbl_size
1013 	    * da->all_trie_cnt;
1014 	dxr_tot_size = d_size + x_size + r_size;
1015 
1016 	if (trie_rebuild == 1) {
1017 		/* Try to find a more compact D/X split */
1018 		if (prev_size == 0 || dxr_tot_size <= prev_size)
1019 			da->d_bits--;
1020 		else {
1021 			da->d_bits++;
1022 			trie_rebuild = 2;
1023 		}
1024 		prev_size = dxr_tot_size;
1025 		goto dxr2_try_squeeze;
1026 	}
1027 	microuptime(&t2);
1028 
1029 	dxr->d = malloc(dxr_tot_size, M_DXRLPM, M_NOWAIT);
1030 	if (dxr->d == NULL) {
1031 		FIB_PRINTF(LOG_NOTICE, da->fd,
1032 		    "Unable to allocate DXR lookup table");
1033 		return;
1034 	}
1035 	memcpy(dxr->d, da->d_tbl, d_size);
1036 	dxr->x = ((char *) dxr->d) + d_size;
1037 	memcpy(dxr->x, da->x_tbl, x_size);
1038 	dxr->r = ((char *) dxr->x) + x_size;
1039 	dxr->d_shift = 32 - da->d_bits;
1040 	dxr->x_shift = dxr_x;
1041 	dxr->x_mask = 0xffffffffU >> (32 - dxr_x);
1042 	memcpy(dxr->r, da->range_tbl, r_size);
1043 
1044 	if (da->updates_low <= da->updates_high)
1045 		bzero(&da->updates_mask[da->updates_low / 32],
1046 		    (da->updates_high - da->updates_low) / 8 + 1);
1047 	da->updates_low = DIRECT_TBL_SIZE - 1;
1048 	da->updates_high = 0;
1049 	microuptime(&t3);
1050 
1051 	FIB_PRINTF(LOG_INFO, da->fd, "D%dX%dR, %d prefixes, %d nhops (max)",
1052 	    da->d_bits, dxr_x, rinfo.num_prefixes, rinfo.num_nhops);
1053 	i = dxr_tot_size * 100;
1054 	if (rinfo.num_prefixes)
1055 		i /= rinfo.num_prefixes;
1056 	FIB_PRINTF(LOG_INFO, da->fd, "%d.%02d KBytes, %d.%02d Bytes/prefix",
1057 	    dxr_tot_size / 1024, dxr_tot_size * 100 / 1024 % 100,
1058 	    i / 100, i % 100);
1059 	FIB_PRINTF(LOG_INFO, da->fd,
1060 	    "%d.%02d%% trie, %d.%02d%% range fragmentation",
1061 	    trie_frag / 100, trie_frag % 100,
1062 	    range_frag / 100, range_frag % 100);
1063 	i = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
1064 	FIB_PRINTF(LOG_INFO, da->fd, "range table %s in %u.%03u ms",
1065 	    range_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000);
1066 	i = (t2.tv_sec - t1.tv_sec) * 1000000 + t2.tv_usec - t1.tv_usec;
1067 	FIB_PRINTF(LOG_INFO, da->fd, "trie %s in %u.%03u ms",
1068 	    trie_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000);
1069 	i = (t3.tv_sec - t2.tv_sec) * 1000000 + t3.tv_usec - t2.tv_usec;
1070 	FIB_PRINTF(LOG_INFO, da->fd, "snapshot forked in %u.%03u ms",
1071 	    i / 1000, i % 1000);
1072 }
1073 
1074 /*
1075  * Glue functions for attaching to the FIB_ALGO infrastructure.
1076  */
1077 
1078 static struct nhop_object *
dxr_fib_lookup(void * algo_data,const struct flm_lookup_key key,uint32_t scopeid)1079 dxr_fib_lookup(void *algo_data, const struct flm_lookup_key key,
1080     uint32_t scopeid)
1081 {
1082 	struct dxr *dxr = algo_data;
1083 
1084 	return (dxr->nh_tbl[dxr_lookup(dxr, ntohl(key.addr4.s_addr))]);
1085 }
1086 
1087 static enum flm_op_result
dxr_init(uint32_t fibnum,struct fib_data * fd,void * old_data,void ** data)1088 dxr_init(uint32_t fibnum, struct fib_data *fd, void *old_data, void **data)
1089 {
1090 	struct dxr *old_dxr = old_data;
1091 	struct dxr_aux *da = NULL;
1092 	struct dxr *dxr;
1093 
1094 	dxr = malloc(sizeof(*dxr), M_DXRAUX, M_NOWAIT);
1095 	if (dxr == NULL) {
1096 		FIB_PRINTF(LOG_NOTICE, fd,
1097 		    "Unable to allocate DXR container struct");
1098 		return (FLM_REBUILD);
1099 	}
1100 
1101 	/* Check whether we may reuse the old auxiliary structures */
1102 	if (old_dxr != NULL && old_dxr->aux != NULL &&
1103 	    old_dxr->aux->fd == fd) {
1104 		da = old_dxr->aux;
1105 		atomic_add_int(&da->refcnt, 1);
1106 	}
1107 
1108 	dxr->aux = da;
1109 	dxr->d = NULL;
1110 	dxr->fd = fd;
1111 	dxr->fibnum = fibnum;
1112 	*data = dxr;
1113 
1114 	return (FLM_SUCCESS);
1115 }
1116 
1117 static void
dxr_destroy(void * data)1118 dxr_destroy(void *data)
1119 {
1120 	struct dxr *dxr = data;
1121 	struct dxr_aux *da = dxr->aux;
1122 	struct chunk_desc *cdp;
1123 	struct trie_desc *tp;
1124 
1125 	free(dxr->d, M_DXRLPM);
1126 	free(dxr, M_DXRAUX);
1127 
1128 	if (da == NULL || atomic_fetchadd_int(&da->refcnt, -1) > 1)
1129 		return;
1130 
1131 	/* Release auxiliary structures */
1132 	while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) {
1133 		LIST_REMOVE(cdp, cd_all_le);
1134 		uma_zfree(chunk_zone, cdp);
1135 	}
1136 	while ((tp = LIST_FIRST(&da->all_trie)) != NULL) {
1137 		LIST_REMOVE(tp, td_all_le);
1138 		uma_zfree(trie_zone, tp);
1139 	}
1140 	free(da->range_tbl, M_DXRAUX);
1141 	free(da->x_tbl, M_DXRAUX);
1142 	free(da, M_DXRAUX);
1143 }
1144 
1145 static void
epoch_dxr_destroy(epoch_context_t ctx)1146 epoch_dxr_destroy(epoch_context_t ctx)
1147 {
1148 	struct dxr *dxr = __containerof(ctx, struct dxr, epoch_ctx);
1149 
1150 	dxr_destroy(dxr);
1151 }
1152 
1153 static void *
choose_lookup_fn(struct dxr_aux * da)1154 choose_lookup_fn(struct dxr_aux *da)
1155 {
1156 
1157 	switch (da->d_bits) {
1158 #if DXR_TRIE_BITS > 16
1159 	case 16:
1160 		return (dxr_fib_lookup_16);
1161 #endif
1162 	case 15:
1163 		return (dxr_fib_lookup_15);
1164 	case 14:
1165 		return (dxr_fib_lookup_14);
1166 	case 13:
1167 		return (dxr_fib_lookup_13);
1168 	case 12:
1169 		return (dxr_fib_lookup_12);
1170 	case 11:
1171 		return (dxr_fib_lookup_11);
1172 	case 10:
1173 		return (dxr_fib_lookup_10);
1174 	case 9:
1175 		return (dxr_fib_lookup_9);
1176 	}
1177 	return (dxr_fib_lookup);
1178 }
1179 
1180 static enum flm_op_result
dxr_dump_end(void * data,struct fib_dp * dp)1181 dxr_dump_end(void *data, struct fib_dp *dp)
1182 {
1183 	struct dxr *dxr = data;
1184 	struct dxr_aux *da;
1185 
1186 	dxr_build(dxr);
1187 
1188 	da = dxr->aux;
1189 	if (da == NULL || dxr->d == NULL)
1190 		return (FLM_REBUILD);
1191 
1192 	if (da->rtbl_top >= BASE_MAX)
1193 		return (FLM_ERROR);
1194 
1195 	dp->f = choose_lookup_fn(da);
1196 	dp->arg = dxr;
1197 
1198 	return (FLM_SUCCESS);
1199 }
1200 
1201 static enum flm_op_result
dxr_dump_rib_item(struct rtentry * rt,void * data)1202 dxr_dump_rib_item(struct rtentry *rt, void *data)
1203 {
1204 
1205 	return (FLM_SUCCESS);
1206 }
1207 
1208 static enum flm_op_result
dxr_change_rib_item(struct rib_head * rnh,struct rib_cmd_info * rc,void * data)1209 dxr_change_rib_item(struct rib_head *rnh, struct rib_cmd_info *rc,
1210     void *data)
1211 {
1212 
1213 	return (FLM_BATCH);
1214 }
1215 
1216 static enum flm_op_result
dxr_change_rib_batch(struct rib_head * rnh,struct fib_change_queue * q,void * data)1217 dxr_change_rib_batch(struct rib_head *rnh, struct fib_change_queue *q,
1218     void *data)
1219 {
1220 	struct dxr *dxr = data;
1221 	struct dxr *new_dxr;
1222 	struct dxr_aux *da;
1223 	struct fib_dp new_dp;
1224 	enum flm_op_result res;
1225 	uint32_t ip, plen, hmask, start, end, i, ui;
1226 #ifdef INVARIANTS
1227 	struct rib_rtable_info rinfo;
1228 	int update_delta = 0;
1229 #endif
1230 
1231 	MPASS(data != NULL);
1232 	MPASS(q != NULL);
1233 	MPASS(q->count < q->size);
1234 
1235 	da = dxr->aux;
1236 	MPASS(da != NULL);
1237 	MPASS(da->fd == dxr->fd);
1238 	MPASS(da->refcnt > 0);
1239 
1240 	FIB_PRINTF(LOG_INFO, da->fd, "processing %d update(s)", q->count);
1241 	for (ui = 0; ui < q->count; ui++) {
1242 #ifdef INVARIANTS
1243 		if (q->entries[ui].nh_new != NULL)
1244 			update_delta++;
1245 		if (q->entries[ui].nh_old != NULL)
1246 			update_delta--;
1247 #endif
1248 		plen = q->entries[ui].plen;
1249 		ip = ntohl(q->entries[ui].addr4.s_addr);
1250 		if (plen < 32)
1251 			hmask = 0xffffffffU >> plen;
1252 		else
1253 			hmask = 0;
1254 		start = (ip & ~hmask) >> DXR_RANGE_SHIFT;
1255 		end = (ip | hmask) >> DXR_RANGE_SHIFT;
1256 
1257 		if ((start & 0x1f) == 0 && (end & 0x1f) == 0x1f)
1258 			for (i = start >> 5; i <= end >> 5; i++)
1259 				da->updates_mask[i] = 0xffffffffU;
1260 		else
1261 			for (i = start; i <= end; i++)
1262 				da->updates_mask[i >> 5] |= (1 << (i & 0x1f));
1263 		if (start < da->updates_low)
1264 			da->updates_low = start;
1265 		if (end > da->updates_high)
1266 			da->updates_high = end;
1267 	}
1268 
1269 #ifdef INVARIANTS
1270 	fib_get_rtable_info(fib_get_rh(da->fd), &rinfo);
1271 	MPASS(da->prefixes + update_delta == rinfo.num_prefixes);
1272 #endif
1273 
1274 	res = dxr_init(0, dxr->fd, data, (void **) &new_dxr);
1275 	if (res != FLM_SUCCESS)
1276 		return (res);
1277 
1278 	dxr_build(new_dxr);
1279 
1280 	/* Structural limit exceeded, hard error */
1281 	if (da->rtbl_top >= BASE_MAX) {
1282 		dxr_destroy(new_dxr);
1283 		return (FLM_ERROR);
1284 	}
1285 
1286 	if (new_dxr->d == NULL) {
1287 		dxr_destroy(new_dxr);
1288 		return (FLM_REBUILD);
1289 	}
1290 
1291 	new_dp.f = choose_lookup_fn(da);
1292 	new_dp.arg = new_dxr;
1293 	if (fib_set_datapath_ptr(dxr->fd, &new_dp)) {
1294 		fib_set_algo_ptr(dxr->fd, new_dxr);
1295 		fib_epoch_call(epoch_dxr_destroy, &dxr->epoch_ctx);
1296 		return (FLM_SUCCESS);
1297 	}
1298 
1299 	FIB_PRINTF(LOG_NOTICE, dxr->fd, "fib_set_datapath_ptr() failed");
1300 	dxr_destroy(new_dxr);
1301 	return (FLM_REBUILD);
1302 }
1303 
1304 static uint8_t
dxr_get_pref(const struct rib_rtable_info * rinfo)1305 dxr_get_pref(const struct rib_rtable_info *rinfo)
1306 {
1307 
1308 	/* Below bsearch4 up to 10 prefixes. Always supersedes dpdk_lpm4. */
1309 	return (251);
1310 }
1311 
1312 SYSCTL_DECL(_net_route_algo);
1313 SYSCTL_NODE(_net_route_algo, OID_AUTO, dxr, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1314     "DXR tunables");
1315 
1316 static int
sysctl_dxr_frag_limit(SYSCTL_HANDLER_ARGS)1317 sysctl_dxr_frag_limit(SYSCTL_HANDLER_ARGS)
1318 {
1319 	char buf[8];
1320 	int error, new, i;
1321 
1322 	snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100,
1323 	    V_frag_limit % 100);
1324 	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
1325 	if (error != 0 || req->newptr == NULL)
1326 		return (error);
1327 	if (!isdigit(*buf) && *buf != '.')
1328 		return (EINVAL);
1329 	for (i = 0, new = 0; isdigit(buf[i]) && i < sizeof(buf); i++)
1330 		new = new * 10 + buf[i] - '0';
1331 	new *= 100;
1332 	if (buf[i++] == '.') {
1333 		if (!isdigit(buf[i]))
1334 			return (EINVAL);
1335 		new += (buf[i++] - '0') * 10;
1336 		if (isdigit(buf[i]))
1337 			new += buf[i++] - '0';
1338 	}
1339 	if (new > 1000)
1340 		return (EINVAL);
1341 	V_frag_limit = new;
1342 	snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100,
1343 	    V_frag_limit % 100);
1344 	return (0);
1345 }
1346 
1347 SYSCTL_PROC(_net_route_algo_dxr, OID_AUTO, frag_limit,
1348     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_VNET,
1349     0, 0, sysctl_dxr_frag_limit, "A",
1350     "Fragmentation threshold to full rebuild");
1351 
1352 static struct fib_lookup_module fib_dxr_mod = {
1353 	.flm_name = "dxr",
1354 	.flm_family = AF_INET,
1355 	.flm_init_cb = dxr_init,
1356 	.flm_destroy_cb = dxr_destroy,
1357 	.flm_dump_rib_item_cb = dxr_dump_rib_item,
1358 	.flm_dump_end_cb = dxr_dump_end,
1359 	.flm_change_rib_item_cb = dxr_change_rib_item,
1360 	.flm_change_rib_items_cb = dxr_change_rib_batch,
1361 	.flm_get_pref = dxr_get_pref,
1362 };
1363 
1364 static int
dxr_modevent(module_t mod,int type,void * unused)1365 dxr_modevent(module_t mod, int type, void *unused)
1366 {
1367 	int error;
1368 
1369 	switch (type) {
1370 	case MOD_LOAD:
1371 		chunk_zone = uma_zcreate("dxr chunk", sizeof(struct chunk_desc),
1372 		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
1373 		trie_zone = uma_zcreate("dxr trie", sizeof(struct trie_desc),
1374 		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
1375 		fib_module_register(&fib_dxr_mod);
1376 		return(0);
1377 	case MOD_UNLOAD:
1378 		error = fib_module_unregister(&fib_dxr_mod);
1379 		if (error)
1380 			return (error);
1381 		uma_zdestroy(chunk_zone);
1382 		uma_zdestroy(trie_zone);
1383 		return(0);
1384 	default:
1385 		return(EOPNOTSUPP);
1386 	}
1387 }
1388 
1389 static moduledata_t dxr_mod = {"fib_dxr", dxr_modevent, 0};
1390 
1391 DECLARE_MODULE(fib_dxr, dxr_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
1392 MODULE_VERSION(fib_dxr, 1);
1393