xref: /freebsd/sys/net/route/route_helpers.c (revision 2bfd8b5b9419b0ceb3dd0295fdf413d32969e5b2)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2020 Alexander V. Chernikov
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 #include "opt_inet.h"
31 #include "opt_inet6.h"
32 #include "opt_route.h"
33 
34 #include <sys/param.h>
35 #include <sys/jail.h>
36 #include <sys/systm.h>
37 #include <sys/malloc.h>
38 #include <sys/mbuf.h>
39 #include <sys/socket.h>
40 #include <sys/sysctl.h>
41 #include <sys/syslog.h>
42 #include <sys/sysproto.h>
43 #include <sys/proc.h>
44 #include <sys/domain.h>
45 #include <sys/kernel.h>
46 #include <sys/lock.h>
47 #include <sys/rmlock.h>
48 
49 #include <net/if.h>
50 #include <net/if_var.h>
51 #include <net/if_dl.h>
52 #include <net/route.h>
53 #include <net/route/route_ctl.h>
54 #include <net/route/route_var.h>
55 #include <net/route/nhop_utils.h>
56 #include <net/route/nhop.h>
57 #include <net/route/nhop_var.h>
58 #ifdef INET
59 #include <netinet/in_fib.h>
60 #endif
61 #ifdef INET6
62 #include <netinet6/in6_fib.h>
63 #include <netinet6/in6_var.h>
64 #endif
65 #include <net/vnet.h>
66 
67 #define	DEBUG_MOD_NAME	rt_helpers
68 #define	DEBUG_MAX_LEVEL	LOG_DEBUG2
69 #include <net/route/route_debug.h>
70 _DECLARE_DEBUG(LOG_INFO);
71 
72 /*
73  * RIB helper functions.
74  */
75 
76 void
77 rib_walk_ext_locked(struct rib_head *rnh, rib_walktree_f_t *wa_f,
78     rib_walk_hook_f_t *hook_f, void *arg)
79 {
80 	if (hook_f != NULL)
81 		hook_f(rnh, RIB_WALK_HOOK_PRE, arg);
82 	rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f, arg);
83 	if (hook_f != NULL)
84 		hook_f(rnh, RIB_WALK_HOOK_POST, arg);
85 }
86 
87 /*
88  * Calls @wa_f with @arg for each entry in the table specified by
89  * @af and @fibnum.
90  *
91  * @ss_t callback is called before and after the tree traversal
92  *  while holding table lock.
93  *
94  * Table is traversed under read lock unless @wlock is set.
95  */
96 void
97 rib_walk_ext_internal(struct rib_head *rnh, bool wlock, rib_walktree_f_t *wa_f,
98     rib_walk_hook_f_t *hook_f, void *arg)
99 {
100 	RIB_RLOCK_TRACKER;
101 
102 	if (wlock)
103 		RIB_WLOCK(rnh);
104 	else
105 		RIB_RLOCK(rnh);
106 	rib_walk_ext_locked(rnh, wa_f, hook_f, arg);
107 	if (wlock)
108 		RIB_WUNLOCK(rnh);
109 	else
110 		RIB_RUNLOCK(rnh);
111 }
112 
113 void
114 rib_walk_ext(uint32_t fibnum, int family, bool wlock, rib_walktree_f_t *wa_f,
115     rib_walk_hook_f_t *hook_f, void *arg)
116 {
117 	struct rib_head *rnh;
118 
119 	if ((rnh = rt_tables_get_rnh(fibnum, family)) != NULL)
120 		rib_walk_ext_internal(rnh, wlock, wa_f, hook_f, arg);
121 }
122 
123 /*
124  * Calls @wa_f with @arg for each entry in the table specified by
125  * @af and @fibnum.
126  *
127  * Table is traversed under read lock unless @wlock is set.
128  */
129 void
130 rib_walk(uint32_t fibnum, int family, bool wlock, rib_walktree_f_t *wa_f,
131     void *arg)
132 {
133 
134 	rib_walk_ext(fibnum, family, wlock, wa_f, NULL, arg);
135 }
136 
137 /*
138  * Calls @wa_f with @arg for each entry in the table matching @prefix/@mask.
139  *
140  * The following flags are supported:
141  *  RIB_FLAG_WLOCK: acquire exclusive lock
142  *  RIB_FLAG_LOCKED: Assumes the table is already locked & skip locking
143  *
144  * By default, table is traversed under read lock.
145  */
146 void
147 rib_walk_from(uint32_t fibnum, int family, uint32_t flags, struct sockaddr *prefix,
148     struct sockaddr *mask, rib_walktree_f_t *wa_f, void *arg)
149 {
150 	RIB_RLOCK_TRACKER;
151 	struct rib_head *rnh = rt_tables_get_rnh(fibnum, family);
152 
153 	if (rnh == NULL)
154 		return;
155 
156 	if (flags & RIB_FLAG_WLOCK)
157 		RIB_WLOCK(rnh);
158 	else if (!(flags & RIB_FLAG_LOCKED))
159 		RIB_RLOCK(rnh);
160 
161 	rnh->rnh_walktree_from(&rnh->head, prefix, mask, (walktree_f_t *)wa_f, arg);
162 
163 	if (flags & RIB_FLAG_WLOCK)
164 		RIB_WUNLOCK(rnh);
165 	else if (!(flags & RIB_FLAG_LOCKED))
166 		RIB_RUNLOCK(rnh);
167 }
168 
169 /*
170  * Iterates over all existing fibs in system calling
171  *  @hook_f function before/after traversing each fib.
172  *  Calls @wa_f function for each element in current fib.
173  * If af is not AF_UNSPEC, iterates over fibs in particular
174  * address family.
175  */
176 void
177 rib_foreach_table_walk(int family, bool wlock, rib_walktree_f_t *wa_f,
178     rib_walk_hook_f_t *hook_f, void *arg)
179 {
180 
181 	for (uint32_t fibnum = 0; fibnum < rt_numfibs; fibnum++) {
182 		/* Do we want some specific family? */
183 		if (family != AF_UNSPEC) {
184 			rib_walk_ext(fibnum, family, wlock, wa_f, hook_f, arg);
185 			continue;
186 		}
187 
188 		for (int i = 1; i <= AF_MAX; i++)
189 			rib_walk_ext(fibnum, i, wlock, wa_f, hook_f, arg);
190 	}
191 }
192 
193 /*
194  * Iterates over all existing fibs in system and deletes each element
195  *  for which @filter_f function returns non-zero value.
196  * If @family is not AF_UNSPEC, iterates over fibs in particular
197  * address family.
198  */
199 void
200 rib_foreach_table_walk_del(int family, rib_filter_f_t *filter_f, void *arg)
201 {
202 
203 	for (uint32_t fibnum = 0; fibnum < rt_numfibs; fibnum++) {
204 		/* Do we want some specific family? */
205 		if (family != AF_UNSPEC) {
206 			rib_walk_del(fibnum, family, filter_f, arg, 0);
207 			continue;
208 		}
209 
210 		for (int i = 1; i <= AF_MAX; i++)
211 			rib_walk_del(fibnum, i, filter_f, arg, 0);
212 	}
213 }
214 
215 
216 /*
217  * Wrapper for the control plane functions for performing af-agnostic
218  *  lookups.
219  * @fibnum: fib to perform the lookup.
220  * @dst: sockaddr with family and addr filled in. IPv6 addresses needs to be in
221  *  deembedded from.
222  * @flags: fib(9) flags.
223  * @flowid: flow id for path selection in multipath use case.
224  *
225  * Returns nhop_object or NULL.
226  *
227  * Requires NET_EPOCH.
228  *
229  */
230 struct nhop_object *
231 rib_lookup(uint32_t fibnum, const struct sockaddr *dst, uint32_t flags,
232     uint32_t flowid)
233 {
234 	struct nhop_object *nh;
235 
236 	nh = NULL;
237 
238 	switch (dst->sa_family) {
239 #ifdef INET
240 	case AF_INET:
241 	{
242 		const struct sockaddr_in *a = (const struct sockaddr_in *)dst;
243 		nh = fib4_lookup(fibnum, a->sin_addr, 0, flags, flowid);
244 		break;
245 	}
246 #endif
247 #ifdef INET6
248 	case AF_INET6:
249 	{
250 		const struct sockaddr_in6 *a = (const struct sockaddr_in6*)dst;
251 		nh = fib6_lookup(fibnum, &a->sin6_addr, a->sin6_scope_id,
252 		    flags, flowid);
253 		break;
254 	}
255 #endif
256 	}
257 
258 	return (nh);
259 }
260 
261 #ifdef ROUTE_MPATH
/*
 * Reports a single RTM_ADD operation to @cb: fills in the new
 *  nexthop/weight in @rc from @wn_src and invokes the callback
 *  with @cbdata.
 */
static void
notify_add(struct rib_cmd_info *rc, const struct weightened_nhop *wn_src,
    route_notification_t *cb, void *cbdata) {
	rc->rc_nh_new = wn_src->nh;
	rc->rc_nh_weight = wn_src->weight;
#if DEBUG_MAX_LEVEL >= LOG_DEBUG2
	char nhbuf[NHOP_PRINT_BUFSIZE];
	FIB_NH_LOG(LOG_DEBUG2, wn_src->nh, "RTM_ADD for %s @ w=%u",
	    nhop_print_buf(wn_src->nh, nhbuf, sizeof(nhbuf)), wn_src->weight);
#endif
	cb(rc, cbdata);
}
274 
/*
 * Reports a single RTM_DELETE operation to @cb: fills in the old
 *  nexthop/weight in @rc from @wn_src and invokes the callback
 *  with @cbdata.
 */
static void
notify_del(struct rib_cmd_info *rc, const struct weightened_nhop *wn_src,
    route_notification_t *cb, void *cbdata) {
	rc->rc_nh_old = wn_src->nh;
	rc->rc_nh_weight = wn_src->weight;
#if DEBUG_MAX_LEVEL >= LOG_DEBUG2
	char nhbuf[NHOP_PRINT_BUFSIZE];
	FIB_NH_LOG(LOG_DEBUG2, wn_src->nh, "RTM_DEL for %s @ w=%u",
	    nhop_print_buf(wn_src->nh, nhbuf, sizeof(nhbuf)), wn_src->weight);
#endif
	cb(rc, cbdata);
}
287 
/*
 * Decomposes an RTM_CHANGE between (possibly multipath) nexthops
 *  @rc->rc_nh_old and @rc->rc_nh_new into a series of single-nexthop
 *  RTM_DELETE/RTM_ADD notifications delivered via @cb/@cbdata.
 *
 * NOTE: @tmp below can back only one side at a time; the sole caller
 *  (rib_decompose_notification) guarantees at least one of the two
 *  nexthops is a nexthop group, so both sides never use @tmp at once.
 */
static void
decompose_change_notification(struct rib_cmd_info *rc, route_notification_t *cb,
    void *cbdata)
{
	uint32_t num_old, num_new;
	struct weightened_nhop *wn_old, *wn_new;
	struct weightened_nhop tmp = { NULL, 0 };
	uint32_t idx_old = 0, idx_new = 0;

	/* Template cmd infos reused for every emitted del/add notification */
	struct rib_cmd_info rc_del = { .rc_cmd = RTM_DELETE, .rc_rt = rc->rc_rt };
	struct rib_cmd_info rc_add = { .rc_cmd = RTM_ADD, .rc_rt = rc->rc_rt };

	/* Normalize each side to an array of weightened nexthops. */
	if (NH_IS_NHGRP(rc->rc_nh_old)) {
		wn_old = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_old, &num_old);
	} else {
		tmp.nh = rc->rc_nh_old;
		tmp.weight = rc->rc_nh_weight;
		wn_old = &tmp;
		num_old = 1;
	}
	if (NH_IS_NHGRP(rc->rc_nh_new)) {
		wn_new = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_new, &num_new);
	} else {
		tmp.nh = rc->rc_nh_new;
		tmp.weight = rc->rc_nh_weight;
		wn_new = &tmp;
		num_new = 1;
	}
#if DEBUG_MAX_LEVEL >= LOG_DEBUG
	{
		char buf_old[NHOP_PRINT_BUFSIZE], buf_new[NHOP_PRINT_BUFSIZE];
		nhop_print_buf_any(rc->rc_nh_old, buf_old, NHOP_PRINT_BUFSIZE);
		nhop_print_buf_any(rc->rc_nh_new, buf_new, NHOP_PRINT_BUFSIZE);
		FIB_NH_LOG(LOG_DEBUG, wn_old[0].nh, "change %s -> %s", buf_old, buf_new);
	}
#endif

	/* Use the fact that each @wn array is sorted */
	/*
	 * Here we have one (or two) multipath groups and transition
	 *  between them needs to be reported to the caller, using series
	 *  of primitive (RTM_DEL, RTM_ADD) operations.
	 *
	 * Leverage the fact that each nexthop group has its nexthops sorted
	 *  by their indices.
	 * [1] -> [1, 2] = A{2}
	 * [1, 2] -> [1] = D{2}
	 * [1, 2, 4] -> [1, 3, 4] = D{2}, A{3}
	 * [1, 2] -> [3, 4] = D{1}, D{2}, A{3}, A{4]
	 */
	/* Merge step: walk both sorted arrays in lockstep. */
	while ((idx_old < num_old) && (idx_new < num_new)) {
		uint32_t nh_idx_old = wn_old[idx_old].nh->nh_priv->nh_idx;
		uint32_t nh_idx_new = wn_new[idx_new].nh->nh_priv->nh_idx;

		if (nh_idx_old == nh_idx_new) {
			if (wn_old[idx_old].weight != wn_new[idx_new].weight) {
				/* Update weight by providing del/add notifications */
				notify_del(&rc_del, &wn_old[idx_old], cb, cbdata);
				notify_add(&rc_add, &wn_new[idx_new], cb, cbdata);
			}
			idx_old++;
			idx_new++;
		} else if (nh_idx_old < nh_idx_new) {
			/* [1, ~2~, 4], [1, ~3~, 4] */
			notify_del(&rc_del, &wn_old[idx_old], cb, cbdata);
			idx_old++;
		} else {
			/* nh_idx_old > nh_idx_new. */
			notify_add(&rc_add, &wn_new[idx_new], cb, cbdata);
			idx_new++;
		}
	}

	/* Report deletion of any leftover old-only nexthops. */
	while (idx_old < num_old) {
		notify_del(&rc_del, &wn_old[idx_old], cb, cbdata);
		idx_old++;
	}

	/* Report addition of any leftover new-only nexthops. */
	while (idx_new < num_new) {
		notify_add(&rc_add, &wn_new[idx_new], cb, cbdata);
		idx_new++;
	}
}
371 
372 /*
373  * Decompose multipath cmd info @rc into a list of add/del/change
374  *  single-path operations, calling @cb callback for each operation.
375  * Assumes at least one of the nexthops in @rc is multipath.
376  */
377 void
378 rib_decompose_notification(struct rib_cmd_info *rc, route_notification_t *cb,
379     void *cbdata)
380 {
381 	struct weightened_nhop *wn;
382 	uint32_t num_nhops;
383 	struct rib_cmd_info rc_new;
384 
385 	rc_new = *rc;
386 	DPRINTF("cb=%p cmd=%d nh_old=%p nh_new=%p",
387 	    cb, rc->cmd, rc->nh_old, rc->nh_new);
388 	switch (rc->rc_cmd) {
389 	case RTM_ADD:
390 		if (!NH_IS_NHGRP(rc->rc_nh_new))
391 			return;
392 		wn = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_new, &num_nhops);
393 		for (uint32_t i = 0; i < num_nhops; i++) {
394 			notify_add(&rc_new, &wn[i], cb, cbdata);
395 		}
396 		break;
397 	case RTM_DELETE:
398 		if (!NH_IS_NHGRP(rc->rc_nh_old))
399 			return;
400 		wn = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_old, &num_nhops);
401 		for (uint32_t i = 0; i < num_nhops; i++) {
402 			notify_del(&rc_new, &wn[i], cb, cbdata);
403 		}
404 		break;
405 	case RTM_CHANGE:
406 		if (!NH_IS_NHGRP(rc->rc_nh_old) && !NH_IS_NHGRP(rc->rc_nh_new))
407 			return;
408 		decompose_change_notification(rc, cb, cbdata);
409 		break;
410 	}
411 }
412 #endif
413 
414 #ifdef INET
415 /*
416  * Checks if the found key in the trie contains (<=) a prefix covering
417  *  @paddr/@plen.
418  * Returns the most specific rtentry matching the condition or NULL.
419  */
static struct rtentry *
get_inet_parent_prefix(uint32_t fibnum, struct in_addr addr, int plen)
{
	struct route_nhop_data rnd;
	struct rtentry *rt;
	struct in_addr addr4;
	uint32_t scopeid;
	int parent_plen;
	struct radix_node *rn;

	/* Longest-prefix-match lookup of the candidate key. */
	rt = fib4_lookup_rt(fibnum, addr, 0, NHR_UNLOCKED, &rnd);
	if (rt == NULL)
		return (NULL);

	/* Found prefix covers @addr/@plen only if it is not more specific. */
	rt_get_inet_prefix_plen(rt, &addr4, &parent_plen, &scopeid);
	if (parent_plen <= plen)
		return (rt);

	/*
	 * There can be multiple prefixes associated with the found key:
	 * 10.0.0.0 -> 10.0.0.0/24, 10.0.0.0/23, 10.0.0.0/22, etc.
	 * All such prefixes are linked via rn_dupedkey, from most specific
	 *  to least specific. Iterate over them to check if any of these
	 *  prefixes are wider than desired plen.
	 */
	rn = (struct radix_node *)rt;
	while ((rn = rn_nextprefix(rn)) != NULL) {
		rt = RNTORT(rn);
		rt_get_inet_prefix_plen(rt, &addr4, &parent_plen, &scopeid);
		if (parent_plen <= plen)
			return (rt);
	}

	/* No same-key prefix is wide enough to contain @addr/@plen. */
	return (NULL);
}
455 
456 /*
457  * Returns the most specific prefix containing (>) @paddr/plen.
458  */
459 struct rtentry *
460 rt_get_inet_parent(uint32_t fibnum, struct in_addr addr, int plen)
461 {
462 	struct in_addr lookup_addr = { .s_addr = INADDR_BROADCAST };
463 	struct in_addr addr4 = addr;
464 	struct in_addr mask4;
465 	struct rtentry *rt;
466 
467 	while (plen-- > 0) {
468 		/* Calculate wider mask & new key to lookup */
469 		mask4.s_addr = htonl(plen ? ~((1 << (32 - plen)) - 1) : 0);
470 		addr4.s_addr = htonl(ntohl(addr4.s_addr) & ntohl(mask4.s_addr));
471 		if (addr4.s_addr == lookup_addr.s_addr) {
472 			/* Skip lookup if the key is the same */
473 			continue;
474 		}
475 		lookup_addr = addr4;
476 
477 		rt = get_inet_parent_prefix(fibnum, lookup_addr, plen);
478 		if (rt != NULL)
479 			return (rt);
480 	}
481 
482 	return (NULL);
483 }
484 #endif
485 
486 #ifdef INET6
487 /*
488  * Checks if the found key in the trie contains (<=) a prefix covering
489  *  @paddr/@plen.
490  * Returns the most specific rtentry matching the condition or NULL.
491  */
static struct rtentry *
get_inet6_parent_prefix(uint32_t fibnum, const struct in6_addr *paddr, int plen)
{
	struct route_nhop_data rnd;
	struct rtentry *rt;
	struct in6_addr addr6;
	uint32_t scopeid;
	int parent_plen;
	struct radix_node *rn;

	/* Longest-prefix-match lookup of the candidate key. */
	rt = fib6_lookup_rt(fibnum, paddr, 0, NHR_UNLOCKED, &rnd);
	if (rt == NULL)
		return (NULL);

	/* Found prefix covers @paddr/@plen only if it is not more specific. */
	rt_get_inet6_prefix_plen(rt, &addr6, &parent_plen, &scopeid);
	if (parent_plen <= plen)
		return (rt);

	/*
	 * There can be multiple prefixes associated with the found key:
	 * 2001:db8:1::/64 -> 2001:db8:1::/56, 2001:db8:1::/48, etc.
	 * All such prefixes are linked via rn_dupedkey, from most specific
	 *  to least specific. Iterate over them to check if any of these
	 *  prefixes are wider than desired plen.
	 */
	rn = (struct radix_node *)rt;
	while ((rn = rn_nextprefix(rn)) != NULL) {
		rt = RNTORT(rn);
		rt_get_inet6_prefix_plen(rt, &addr6, &parent_plen, &scopeid);
		if (parent_plen <= plen)
			return (rt);
	}

	/* No same-key prefix is wide enough to contain @paddr/@plen. */
	return (NULL);
}
527 
/*
 * Writes the first @mask one-bits of an IPv6 netmask into @addr6.
 * Only the fully- or partially-covered 32-bit words are written; any
 *  trailing words of @addr6 are left untouched, so the caller must
 *  pre-initialize @addr6 when a complete mask is needed.
 */
static void
ipv6_writemask(struct in6_addr *addr6, uint8_t mask)
{
	uint32_t *cp;

	for (cp = (uint32_t *)addr6; mask >= 32; mask -= 32)
		*cp++ = 0xFFFFFFFF;
	if (mask > 0) {
		/*
		 * mask is in [1..31] here, so the inner "mask ?" ternary
		 * was dead code.  Shift an unsigned constant: "1 << 31"
		 * (mask == 1 case) is signed-overflow UB.
		 */
		*cp = htonl(~((1U << (32 - mask)) - 1));
	}
}
538 
539 /*
540  * Returns the most specific prefix containing (>) @paddr/plen.
541  */
542 struct rtentry *
543 rt_get_inet6_parent(uint32_t fibnum, const struct in6_addr *paddr, int plen)
544 {
545 	struct in6_addr lookup_addr = in6mask128;
546 	struct in6_addr addr6 = *paddr;
547 	struct in6_addr mask6;
548 	struct rtentry *rt;
549 
550 	while (plen-- > 0) {
551 		/* Calculate wider mask & new key to lookup */
552 		ipv6_writemask(&mask6, plen);
553 		IN6_MASK_ADDR(&addr6, &mask6);
554 		if (IN6_ARE_ADDR_EQUAL(&addr6, &lookup_addr)) {
555 			/* Skip lookup if the key is the same */
556 			continue;
557 		}
558 		lookup_addr = addr6;
559 
560 		rt = get_inet6_parent_prefix(fibnum, &lookup_addr, plen);
561 		if (rt != NULL)
562 			return (rt);
563 	}
564 
565 	return (NULL);
566 }
567 #endif
568