xref: /freebsd/sys/net/route/route_helpers.c (revision 848ee2a3a8b47c9905fc51fefcf60eb371edbb98)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2020 Alexander V. Chernikov
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 #include "opt_inet.h"
31 #include "opt_inet6.h"
32 #include "opt_route.h"
33 
34 #include <sys/param.h>
35 #include <sys/jail.h>
36 #include <sys/systm.h>
37 #include <sys/malloc.h>
38 #include <sys/mbuf.h>
39 #include <sys/socket.h>
40 #include <sys/sysctl.h>
41 #include <sys/syslog.h>
42 #include <sys/sysproto.h>
43 #include <sys/proc.h>
44 #include <sys/domain.h>
45 #include <sys/kernel.h>
46 #include <sys/lock.h>
47 #include <sys/rmlock.h>
48 
49 #include <net/if.h>
50 #include <net/if_var.h>
51 #include <net/if_dl.h>
52 #include <net/route.h>
53 #include <net/route/route_ctl.h>
54 #include <net/route/route_var.h>
55 #include <net/route/nhop_utils.h>
56 #include <net/route/nhop.h>
57 #include <net/route/nhop_var.h>
58 #ifdef INET
59 #include <netinet/in_fib.h>
60 #endif
61 #ifdef INET6
62 #include <netinet6/in6_fib.h>
63 #include <netinet6/in6_var.h>
64 #endif
65 #include <net/vnet.h>
66 
67 /*
68  * RIB helper functions.
69  */
70 
71 void
72 rib_walk_ext_locked(struct rib_head *rnh, rib_walktree_f_t *wa_f,
73     rib_walk_hook_f_t *hook_f, void *arg)
74 {
75 	if (hook_f != NULL)
76 		hook_f(rnh, RIB_WALK_HOOK_PRE, arg);
77 	rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f, arg);
78 	if (hook_f != NULL)
79 		hook_f(rnh, RIB_WALK_HOOK_POST, arg);
80 }
81 
82 /*
83  * Calls @wa_f with @arg for each entry in the table specified by
84  * @af and @fibnum.
85  *
86  * @ss_t callback is called before and after the tree traversal
87  *  while holding table lock.
88  *
89  * Table is traversed under read lock unless @wlock is set.
90  */
91 void
92 rib_walk_ext_internal(struct rib_head *rnh, bool wlock, rib_walktree_f_t *wa_f,
93     rib_walk_hook_f_t *hook_f, void *arg)
94 {
95 	RIB_RLOCK_TRACKER;
96 
97 	if (wlock)
98 		RIB_WLOCK(rnh);
99 	else
100 		RIB_RLOCK(rnh);
101 	rib_walk_ext_locked(rnh, wa_f, hook_f, arg);
102 	if (wlock)
103 		RIB_WUNLOCK(rnh);
104 	else
105 		RIB_RUNLOCK(rnh);
106 }
107 
108 void
109 rib_walk_ext(uint32_t fibnum, int family, bool wlock, rib_walktree_f_t *wa_f,
110     rib_walk_hook_f_t *hook_f, void *arg)
111 {
112 	struct rib_head *rnh;
113 
114 	if ((rnh = rt_tables_get_rnh(fibnum, family)) != NULL)
115 		rib_walk_ext_internal(rnh, wlock, wa_f, hook_f, arg);
116 }
117 
118 /*
119  * Calls @wa_f with @arg for each entry in the table specified by
120  * @af and @fibnum.
121  *
122  * Table is traversed under read lock unless @wlock is set.
123  */
124 void
125 rib_walk(uint32_t fibnum, int family, bool wlock, rib_walktree_f_t *wa_f,
126     void *arg)
127 {
128 
129 	rib_walk_ext(fibnum, family, wlock, wa_f, NULL, arg);
130 }
131 
132 /*
133  * Calls @wa_f with @arg for each entry in the table matching @prefix/@mask.
134  *
135  * The following flags are supported:
136  *  RIB_FLAG_WLOCK: acquire exclusive lock
137  *  RIB_FLAG_LOCKED: Assumes the table is already locked & skip locking
138  *
139  * By default, table is traversed under read lock.
140  */
141 void
142 rib_walk_from(uint32_t fibnum, int family, uint32_t flags, struct sockaddr *prefix,
143     struct sockaddr *mask, rib_walktree_f_t *wa_f, void *arg)
144 {
145 	RIB_RLOCK_TRACKER;
146 	struct rib_head *rnh = rt_tables_get_rnh(fibnum, family);
147 
148 	if (rnh == NULL)
149 		return;
150 
151 	if (flags & RIB_FLAG_WLOCK)
152 		RIB_WLOCK(rnh);
153 	else if (!(flags & RIB_FLAG_LOCKED))
154 		RIB_RLOCK(rnh);
155 
156 	rnh->rnh_walktree_from(&rnh->head, prefix, mask, (walktree_f_t *)wa_f, arg);
157 
158 	if (flags & RIB_FLAG_WLOCK)
159 		RIB_WUNLOCK(rnh);
160 	else if (!(flags & RIB_FLAG_LOCKED))
161 		RIB_RUNLOCK(rnh);
162 }
163 
164 /*
165  * Iterates over all existing fibs in system calling
166  *  @hook_f function before/after traversing each fib.
167  *  Calls @wa_f function for each element in current fib.
168  * If af is not AF_UNSPEC, iterates over fibs in particular
169  * address family.
170  */
171 void
172 rib_foreach_table_walk(int family, bool wlock, rib_walktree_f_t *wa_f,
173     rib_walk_hook_f_t *hook_f, void *arg)
174 {
175 
176 	for (uint32_t fibnum = 0; fibnum < rt_numfibs; fibnum++) {
177 		/* Do we want some specific family? */
178 		if (family != AF_UNSPEC) {
179 			rib_walk_ext(fibnum, family, wlock, wa_f, hook_f, arg);
180 			continue;
181 		}
182 
183 		for (int i = 1; i <= AF_MAX; i++)
184 			rib_walk_ext(fibnum, i, wlock, wa_f, hook_f, arg);
185 	}
186 }
187 
188 /*
189  * Iterates over all existing fibs in system and deletes each element
190  *  for which @filter_f function returns non-zero value.
191  * If @family is not AF_UNSPEC, iterates over fibs in particular
192  * address family.
193  */
194 void
195 rib_foreach_table_walk_del(int family, rib_filter_f_t *filter_f, void *arg)
196 {
197 
198 	for (uint32_t fibnum = 0; fibnum < rt_numfibs; fibnum++) {
199 		/* Do we want some specific family? */
200 		if (family != AF_UNSPEC) {
201 			rib_walk_del(fibnum, family, filter_f, arg, 0);
202 			continue;
203 		}
204 
205 		for (int i = 1; i <= AF_MAX; i++)
206 			rib_walk_del(fibnum, i, filter_f, arg, 0);
207 	}
208 }
209 
210 
211 /*
212  * Wrapper for the control plane functions for performing af-agnostic
213  *  lookups.
214  * @fibnum: fib to perform the lookup.
215  * @dst: sockaddr with family and addr filled in. IPv6 addresses needs to be in
216  *  deembedded from.
217  * @flags: fib(9) flags.
218  * @flowid: flow id for path selection in multipath use case.
219  *
220  * Returns nhop_object or NULL.
221  *
222  * Requires NET_EPOCH.
223  *
224  */
225 struct nhop_object *
226 rib_lookup(uint32_t fibnum, const struct sockaddr *dst, uint32_t flags,
227     uint32_t flowid)
228 {
229 	struct nhop_object *nh;
230 
231 	nh = NULL;
232 
233 	switch (dst->sa_family) {
234 #ifdef INET
235 	case AF_INET:
236 	{
237 		const struct sockaddr_in *a = (const struct sockaddr_in *)dst;
238 		nh = fib4_lookup(fibnum, a->sin_addr, 0, flags, flowid);
239 		break;
240 	}
241 #endif
242 #ifdef INET6
243 	case AF_INET6:
244 	{
245 		const struct sockaddr_in6 *a = (const struct sockaddr_in6*)dst;
246 		nh = fib6_lookup(fibnum, &a->sin6_addr, a->sin6_scope_id,
247 		    flags, flowid);
248 		break;
249 	}
250 #endif
251 	}
252 
253 	return (nh);
254 }
255 
256 #ifdef ROUTE_MPATH
/*
 * Decomposes a single RTM_CHANGE notification @rc - where at least one of
 *  rc_nh_old/rc_nh_new is a nexthop group (the caller checks this) - into
 *  a sequence of single-nexthop RTM_DELETE/RTM_ADD notifications, invoking
 *  @cb with @cbdata for each one.
 * Relies on nhgrp_get_nhops() returning arrays sorted by nexthop index.
 */
static void
decompose_change_notification(struct rib_cmd_info *rc, route_notification_t *cb,
    void *cbdata)
{
	uint32_t num_old, num_new;
	uint32_t nh_idx_old, nh_idx_new;
	struct weightened_nhop *wn_old, *wn_new;
	struct weightened_nhop tmp = { NULL, 0 };
	uint32_t idx_old = 0, idx_new = 0;

	/* Template cmd infos sharing the route; nexthops filled in per call */
	struct rib_cmd_info rc_del = { .rc_cmd = RTM_DELETE, .rc_rt = rc->rc_rt };
	struct rib_cmd_info rc_add = { .rc_cmd = RTM_ADD, .rc_rt = rc->rc_rt };

	/*
	 * Normalize both sides to (nexthop, weight) arrays.  A plain nexthop
	 *  becomes a 1-element array backed by @tmp; @tmp is shared, which is
	 *  safe because at most one side is a plain nexthop here.
	 */
	if (NH_IS_NHGRP(rc->rc_nh_old)) {
		wn_old = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_old, &num_old);
	} else {
		tmp.nh = rc->rc_nh_old;
		tmp.weight = rc->rc_nh_weight;
		wn_old = &tmp;
		num_old = 1;
	}
	if (NH_IS_NHGRP(rc->rc_nh_new)) {
		wn_new = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_new, &num_new);
	} else {
		tmp.nh = rc->rc_nh_new;
		tmp.weight = rc->rc_nh_weight;
		wn_new = &tmp;
		num_new = 1;
	}

	/* Use the fact that each @wn array is sorted */
	/*
	 * Want to convert into set of add and delete operations
	 * [1] -> [1, 2] = A{2}
	 * [2] -> [1, 2] = A{1}
	 * [1, 2, 4]->[1, 3, 4] = A{2}, D{3}
	 * [1, 2, 4]->[1, 4] = D{2}
	 * [1, 2, 4] -> [3, 4] = D{1}, C{2,3} OR C{1,3}, D{2} OR D{1},D{2},A{3}
	 * [1, 2] -> [3, 4] =
	 *
	 */
	/* Two-pointer merge over the index-sorted old/new arrays */
	idx_old = 0;
	while ((idx_old < num_old) && (idx_new < num_new)) {
		nh_idx_old = wn_old[idx_old].nh->nh_priv->nh_idx;
		nh_idx_new = wn_new[idx_new].nh->nh_priv->nh_idx;

		if (nh_idx_old == nh_idx_new) {
			/* Same nexthop on both sides: notify only on weight change */
			if (wn_old[idx_old].weight != wn_new[idx_new].weight) {
				/* Update weight by providing del/add notifications */
				rc_del.rc_nh_old = wn_old[idx_old].nh;
				rc_del.rc_nh_weight = wn_old[idx_old].weight;
				cb(&rc_del, cbdata);

				rc_add.rc_nh_new = wn_new[idx_new].nh;
				rc_add.rc_nh_weight = wn_new[idx_new].weight;
				cb(&rc_add, cbdata);
			}
			idx_old++;
			idx_new++;
		} else if (nh_idx_old < nh_idx_new) {
			/*
			 * [1, ~2~, 4], [1, ~3~, 4]
			 * [1, ~2~, 5], [1, ~3~, 4]
			 * [1, ~2~], [1, ~3~, 4]
			 */
			if ((idx_old + 1 >= num_old) ||
			    (wn_old[idx_old + 1].nh->nh_priv->nh_idx > nh_idx_new)) {
				/* Add new unless the next old item is still <= new */
				rc_add.rc_nh_new = wn_new[idx_new].nh;
				rc_add.rc_nh_weight = wn_new[idx_new].weight;
				cb(&rc_add, cbdata);
				idx_new++;
			}
			/* In any case, delete current old */
			rc_del.rc_nh_old = wn_old[idx_old].nh;
			rc_del.rc_nh_weight = wn_old[idx_old].weight;
			cb(&rc_del, cbdata);
			idx_old++;
		} else {
			/*
			 * nh_idx_old > nh_idx_new
			 *
			 * [1, ~3~, 4], [1, ~2~, 4]
			 * [1, ~3~, 5], [1, ~2~, 4]
			 * [1, ~3~, 4], [1, ~2~]
			 */
			if ((idx_new + 1 >= num_new) ||
			    (wn_new[idx_new + 1].nh->nh_priv->nh_idx > nh_idx_old)) {
				/* No next item or next item is > current one */
				rc_add.rc_nh_new = wn_new[idx_new].nh;
				rc_add.rc_nh_weight = wn_new[idx_new].weight;
				cb(&rc_add, cbdata);
				idx_new++;
			}
			/* In any case, delete current old */
			rc_del.rc_nh_old = wn_old[idx_old].nh;
			rc_del.rc_nh_weight = wn_old[idx_old].weight;
			cb(&rc_del, cbdata);
			idx_old++;
		}
	}

	/* Flush the remaining old nexthops as deletions */
	while (idx_old < num_old) {
		rc_del.rc_nh_old = wn_old[idx_old].nh;
		rc_del.rc_nh_weight = wn_old[idx_old].weight;
		cb(&rc_del, cbdata);
		idx_old++;
	}

	/* Flush the remaining new nexthops as additions */
	while (idx_new < num_new) {
		rc_add.rc_nh_new = wn_new[idx_new].nh;
		rc_add.rc_nh_weight = wn_new[idx_new].weight;
		cb(&rc_add, cbdata);
		idx_new++;
	}
}
373 
374 /*
375  * Decompose multipath cmd info @rc into a list of add/del/change
376  *  single-path operations, calling @cb callback for each operation.
377  * Assumes at least one of the nexthops in @rc is multipath.
378  */
379 void
380 rib_decompose_notification(struct rib_cmd_info *rc, route_notification_t *cb,
381     void *cbdata)
382 {
383 	struct weightened_nhop *wn;
384 	uint32_t num_nhops;
385 	struct rib_cmd_info rc_new;
386 
387 	rc_new = *rc;
388 	DPRINTF("cb=%p cmd=%d nh_old=%p nh_new=%p",
389 	    cb, rc->cmd, rc->nh_old, rc->nh_new);
390 	switch (rc->rc_cmd) {
391 	case RTM_ADD:
392 		if (!NH_IS_NHGRP(rc->rc_nh_new))
393 			return;
394 		wn = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_new, &num_nhops);
395 		for (uint32_t i = 0; i < num_nhops; i++) {
396 			rc_new.rc_nh_new = wn[i].nh;
397 			rc_new.rc_nh_weight = wn[i].weight;
398 			cb(&rc_new, cbdata);
399 		}
400 		break;
401 	case RTM_DELETE:
402 		if (!NH_IS_NHGRP(rc->rc_nh_old))
403 			return;
404 		wn = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_old, &num_nhops);
405 		for (uint32_t i = 0; i < num_nhops; i++) {
406 			rc_new.rc_nh_old = wn[i].nh;
407 			rc_new.rc_nh_weight = wn[i].weight;
408 			cb(&rc_new, cbdata);
409 		}
410 		break;
411 	case RTM_CHANGE:
412 		if (!NH_IS_NHGRP(rc->rc_nh_old) && !NH_IS_NHGRP(rc->rc_nh_new))
413 			return;
414 		decompose_change_notification(rc, cb, cbdata);
415 		break;
416 	}
417 }
418 #endif
419 
420 #ifdef INET
421 /*
422  * Checks if the found key in the trie contains (<=) a prefix covering
423  *  @paddr/@plen.
424  * Returns the most specific rtentry matching the condition or NULL.
425  */
426 static struct rtentry *
427 get_inet_parent_prefix(uint32_t fibnum, struct in_addr addr, int plen)
428 {
429 	struct route_nhop_data rnd;
430 	struct rtentry *rt;
431 	struct in_addr addr4;
432 	uint32_t scopeid;
433 	int parent_plen;
434 	struct radix_node *rn;
435 
436 	rt = fib4_lookup_rt(fibnum, addr, 0, NHR_UNLOCKED, &rnd);
437 	if (rt == NULL)
438 		return (NULL);
439 
440 	rt_get_inet_prefix_plen(rt, &addr4, &parent_plen, &scopeid);
441 	if (parent_plen <= plen)
442 		return (rt);
443 
444 	/*
445 	 * There can be multiple prefixes associated with the found key:
446 	 * 10.0.0.0 -> 10.0.0.0/24, 10.0.0.0/23, 10.0.0.0/22, etc.
447 	 * All such prefixes are linked via rn_dupedkey, from most specific
448 	 *  to least specific. Iterate over them to check if any of these
449 	 *  prefixes are wider than desired plen.
450 	 */
451 	rn = (struct radix_node *)rt;
452 	while ((rn = rn_nextprefix(rn)) != NULL) {
453 		rt = RNTORT(rn);
454 		rt_get_inet_prefix_plen(rt, &addr4, &parent_plen, &scopeid);
455 		if (parent_plen <= plen)
456 			return (rt);
457 	}
458 
459 	return (NULL);
460 }
461 
462 /*
463  * Returns the most specific prefix containing (>) @paddr/plen.
464  */
465 struct rtentry *
466 rt_get_inet_parent(uint32_t fibnum, struct in_addr addr, int plen)
467 {
468 	struct in_addr lookup_addr = { .s_addr = INADDR_BROADCAST };
469 	struct in_addr addr4 = addr;
470 	struct in_addr mask4;
471 	struct rtentry *rt;
472 
473 	while (plen-- > 0) {
474 		/* Calculate wider mask & new key to lookup */
475 		mask4.s_addr = htonl(plen ? ~((1 << (32 - plen)) - 1) : 0);
476 		addr4.s_addr = htonl(ntohl(addr4.s_addr) & ntohl(mask4.s_addr));
477 		if (addr4.s_addr == lookup_addr.s_addr) {
478 			/* Skip lookup if the key is the same */
479 			continue;
480 		}
481 		lookup_addr = addr4;
482 
483 		rt = get_inet_parent_prefix(fibnum, lookup_addr, plen);
484 		if (rt != NULL)
485 			return (rt);
486 	}
487 
488 	return (NULL);
489 }
490 #endif
491 
492 #ifdef INET6
493 /*
494  * Checks if the found key in the trie contains (<=) a prefix covering
495  *  @paddr/@plen.
496  * Returns the most specific rtentry matching the condition or NULL.
497  */
498 static struct rtentry *
499 get_inet6_parent_prefix(uint32_t fibnum, const struct in6_addr *paddr, int plen)
500 {
501 	struct route_nhop_data rnd;
502 	struct rtentry *rt;
503 	struct in6_addr addr6;
504 	uint32_t scopeid;
505 	int parent_plen;
506 	struct radix_node *rn;
507 
508 	rt = fib6_lookup_rt(fibnum, paddr, 0, NHR_UNLOCKED, &rnd);
509 	if (rt == NULL)
510 		return (NULL);
511 
512 	rt_get_inet6_prefix_plen(rt, &addr6, &parent_plen, &scopeid);
513 	if (parent_plen <= plen)
514 		return (rt);
515 
516 	/*
517 	 * There can be multiple prefixes associated with the found key:
518 	 * 2001:db8:1::/64 -> 2001:db8:1::/56, 2001:db8:1::/48, etc.
519 	 * All such prefixes are linked via rn_dupedkey, from most specific
520 	 *  to least specific. Iterate over them to check if any of these
521 	 *  prefixes are wider than desired plen.
522 	 */
523 	rn = (struct radix_node *)rt;
524 	while ((rn = rn_nextprefix(rn)) != NULL) {
525 		rt = RNTORT(rn);
526 		rt_get_inet6_prefix_plen(rt, &addr6, &parent_plen, &scopeid);
527 		if (parent_plen <= plen)
528 			return (rt);
529 	}
530 
531 	return (NULL);
532 }
533 
/*
 * Writes an IPv6 network mask of @mask bits into @addr6.
 * All four 32-bit words are written, so the destination does not need
 *  to be pre-zeroed: the old version left the words past the prefix
 *  untouched, letting stale/uninitialized bits leak into the mask when
 *  the caller reused or did not clear the buffer.
 * The shift uses an unsigned constant: "1 << 31" on a signed int is
 *  undefined behavior (C11 6.5.7p4).
 */
static void
ipv6_writemask(struct in6_addr *addr6, uint8_t mask)
{
	uint32_t *cp;
	int i;

	cp = (uint32_t *)addr6;
	for (i = 0; i < 4; i++) {
		if (mask >= 32) {
			*cp++ = 0xFFFFFFFF;
			mask -= 32;
		} else if (mask > 0) {
			*cp++ = htonl(~((1U << (32 - mask)) - 1));
			mask = 0;
		} else {
			*cp++ = 0;
		}
	}
}
544 
545 /*
546  * Returns the most specific prefix containing (>) @paddr/plen.
547  */
548 struct rtentry *
549 rt_get_inet6_parent(uint32_t fibnum, const struct in6_addr *paddr, int plen)
550 {
551 	struct in6_addr lookup_addr = in6mask128;
552 	struct in6_addr addr6 = *paddr;
553 	struct in6_addr mask6;
554 	struct rtentry *rt;
555 
556 	while (plen-- > 0) {
557 		/* Calculate wider mask & new key to lookup */
558 		ipv6_writemask(&mask6, plen);
559 		IN6_MASK_ADDR(&addr6, &mask6);
560 		if (IN6_ARE_ADDR_EQUAL(&addr6, &lookup_addr)) {
561 			/* Skip lookup if the key is the same */
562 			continue;
563 		}
564 		lookup_addr = addr6;
565 
566 		rt = get_inet6_parent_prefix(fibnum, &lookup_addr, plen);
567 		if (rt != NULL)
568 			return (rt);
569 	}
570 
571 	return (NULL);
572 }
573 #endif
574