xref: /freebsd/sys/netlink/route/nexthop.c (revision 66fd12cf4896eb08ad8e7a2627537f84ead84dd3)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 #include "opt_netlink.h"
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 #include "opt_inet.h"
33 #include "opt_inet6.h"
34 #include "opt_route.h"
35 #include <sys/types.h>
36 #include <sys/ck.h>
37 #include <sys/epoch.h>
38 #include <sys/kernel.h>
39 #include <sys/malloc.h>
40 #include <sys/rmlock.h>
41 #include <sys/socket.h>
42 
43 #include <net/if.h>
44 #include <net/route.h>
45 #include <net/route/nhop.h>
46 #include <net/route/nhop_utils.h>
47 
48 #include <net/route/route_ctl.h>
49 #include <net/route/route_var.h>
50 #include <netinet6/scope6_var.h>
51 #include <netlink/netlink.h>
52 #include <netlink/netlink_ctl.h>
53 #include <netlink/netlink_route.h>
54 #include <netlink/route/route_var.h>
55 
56 #define	DEBUG_MOD_NAME	nl_nhop
57 #define	DEBUG_MAX_LEVEL	LOG_DEBUG3
58 #include <netlink/netlink_debug.h>
59 _DECLARE_DEBUG(LOG_INFO);
60 
61 /*
 * This file contains the logic to maintain kernel nexthops and
 *  nexthop groups based on the data provided by the user.
64  *
65  * Kernel stores (nearly) all of the routing data in the nexthops,
66  *  including the prefix-specific flags (NHF_HOST and NHF_DEFAULT).
67  *
68  * Netlink API provides higher-level abstraction for the user. Each
69  *  user-created nexthop may map to multiple kernel nexthops.
70  *
71  * The following variations require separate kernel nexthop to be
72  *  created:
73  *  * prefix flags (NHF_HOST, NHF_DEFAULT)
74  *  * using IPv6 gateway for IPv4 routes
75  *  * different fibnum
76  *
77  * These kernel nexthops have the lifetime bound to the lifetime of
78  *  the user_nhop object. They are not collected until user requests
79  *  to delete the created user_nhop.
80  *
81  */
82 struct user_nhop {
83         uint32_t                        un_idx; /* Userland-provided index */
84 	uint32_t			un_fibfam; /* fibnum+af(as highest byte) */
85 	uint8_t				un_protocol; /* protocol that install the record */
86 	struct nhop_object		*un_nhop; /* "production" nexthop */
87 	struct nhop_object		*un_nhop_src; /* nexthop to copy from */
88 	struct weightened_nhop		*un_nhgrp_src; /* nexthops for nhg */
89 	uint32_t			un_nhgrp_count; /* number of nexthops */
90         struct user_nhop		*un_next; /* next item in hash chain */
91         struct user_nhop		*un_nextchild; /* master -> children */
92 	struct epoch_context		un_epoch_ctx;	/* epoch ctl helper */
93 };
94 
/*
 * Hooks for the "unhop" hash table type declared below; presumably picked
 * up by name by the CHT_SLIST_* macros (confirm against nhop_utils.h).
 */
/* produce hash value for an object */
#define	unhop_hash_obj(_obj)	(hash_unhop(_obj))
/* compare two objects */
#define	unhop_cmp(_one, _two)	(cmp_unhop(_one, _two))
/* next object accessor */
#define	unhop_next(_obj)	(_obj)->un_next

CHT_SLIST_DEFINE(unhop, struct user_nhop);
103 
104 struct unhop_ctl {
105 	struct unhop_head	un_head;
106 	struct rmlock		un_lock;
107 };
108 #define	UN_LOCK_INIT(_ctl)	rm_init(&(_ctl)->un_lock, "unhop_ctl")
109 #define	UN_TRACKER		struct rm_priotracker un_tracker
110 #define	UN_RLOCK(_ctl)		rm_rlock(&((_ctl)->un_lock), &un_tracker)
111 #define	UN_RUNLOCK(_ctl)	rm_runlock(&((_ctl)->un_lock), &un_tracker)
112 
113 #define	UN_WLOCK(_ctl)		rm_wlock(&(_ctl)->un_lock);
114 #define	UN_WUNLOCK(_ctl)	rm_wunlock(&(_ctl)->un_lock);
115 
116 VNET_DEFINE_STATIC(struct unhop_ctl *, un_ctl) = NULL;
117 #define V_un_ctl	VNET(un_ctl)
118 
/* Forward declarations for helpers defined further down in this file */

/* Swaps in a hash bucket array of new_size buckets; no-op when new_size is 0 */
static void consider_resize(struct unhop_ctl *ctl, uint32_t new_size);
/* Hash callbacks: equality on (un_idx, un_fibfam) and the matching hash */
static int cmp_unhop(const struct user_nhop *a, const struct user_nhop *b);
static unsigned int hash_unhop(const struct user_nhop *obj);

/* Releases the nexthop references held by unhop and frees the record */
static void destroy_unhop(struct user_nhop *unhop);
/* Builds a kernel nexthop (or group) for fibnum/family from a base record */
static struct nhop_object *clone_unhop(const struct user_nhop *unhop,
    uint32_t fibnum, int family, int nh_flags);
126 
127 static int
128 cmp_unhop(const struct user_nhop *a, const struct user_nhop *b)
129 {
130         return (a->un_idx == b->un_idx && a->un_fibfam == b->un_fibfam);
131 }
132 
133 /*
134  * Hash callback: calculate hash of an object
135  */
136 static unsigned int
137 hash_unhop(const struct user_nhop *obj)
138 {
139         return (obj->un_idx ^ obj->un_fibfam);
140 }
141 
/* True for base records, i.e. those not bound to a specific fib/family */
#define	UNHOP_IS_MASTER(_unhop)	((_unhop)->un_fibfam == 0)
143 
144 /*
145  * Factory interface for creating matching kernel nexthops/nexthop groups
146  *
147  * @uidx: userland nexhop index used to create the nexthop
148  * @fibnum: fibnum nexthop will be used in
149  * @family: upper family nexthop will be used in
150  * @nh_flags: desired nexthop prefix flags
151  * @perror: pointer to store error to
152  *
153  * Returns referenced nexthop linked to @fibnum/@family rib on success.
154  */
155 struct nhop_object *
156 nl_find_nhop(uint32_t fibnum, int family, uint32_t uidx,
157     int nh_flags, int *perror)
158 {
159 	struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
160         UN_TRACKER;
161 
162 	if (__predict_false(ctl == NULL))
163 		return (NULL);
164 
165 	struct user_nhop key= {
166 		.un_idx = uidx,
167 		.un_fibfam = fibnum  | ((uint32_t)family) << 24,
168 	};
169 	struct user_nhop *unhop;
170 
171 	nh_flags = nh_flags & (NHF_HOST | NHF_DEFAULT);
172 
173 	if (__predict_false(family == 0))
174 		return (NULL);
175 
176 	UN_RLOCK(ctl);
177 	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
178 	if (unhop != NULL) {
179 		struct nhop_object *nh = unhop->un_nhop;
180 		UN_RLOCK(ctl);
181 		*perror = 0;
182 		nhop_ref_any(nh);
183 		return (nh);
184 	}
185 
186 	/*
187 	 * Exact nexthop not found. Search for template nexthop to clone from.
188 	 */
189 	key.un_fibfam = 0;
190 	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
191 	if (unhop == NULL) {
192 		UN_RUNLOCK(ctl);
193 		*perror = ESRCH;
194 		return (NULL);
195 	}
196 
197 	UN_RUNLOCK(ctl);
198 
199 	/* Create entry to insert first */
200 	struct user_nhop *un_new, *un_tmp;
201 	un_new = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO);
202 	if (un_new == NULL) {
203 		*perror = ENOMEM;
204 		return (NULL);
205 	}
206 	un_new->un_idx = uidx;
207 	un_new->un_fibfam = fibnum  | ((uint32_t)family) << 24;
208 
209 	/* Relying on epoch to protect unhop here */
210 	un_new->un_nhop = clone_unhop(unhop, fibnum, family, nh_flags);
211 	if (un_new->un_nhop == NULL) {
212 		free(un_new, M_NETLINK);
213 		*perror = ENOMEM;
214 		return (NULL);
215 	}
216 
217 	/* Insert back and report */
218 	UN_WLOCK(ctl);
219 
220 	/* First, find template record once again */
221 	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
222 	if (unhop == NULL) {
223 		/* Someone deleted the nexthop during the call */
224 		UN_WUNLOCK(ctl);
225 		*perror = ESRCH;
226 		destroy_unhop(un_new);
227 		return (NULL);
228 	}
229 
230 	/* Second, check the direct match */
231 	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, un_new, un_tmp);
232 	struct nhop_object *nh;
233 	if (un_tmp != NULL) {
234 		/* Another thread already created the desired nextop, use it */
235 		nh = un_tmp->un_nhop;
236 	} else {
237 		/* Finally, insert the new nexthop and link it to the primary */
238 		nh = un_new->un_nhop;
239 		CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, un_new);
240 		un_new->un_nextchild = unhop->un_nextchild;
241 		unhop->un_nextchild = un_new;
242 		un_new = NULL;
243 		NL_LOG(LOG_DEBUG2, "linked cloned nexthop %p", nh);
244 	}
245 
246 	UN_WUNLOCK(ctl);
247 
248 	if (un_new != NULL)
249 		destroy_unhop(un_new);
250 
251 	*perror = 0;
252 	nhop_ref_any(nh);
253 	return (nh);
254 }
255 
256 static struct user_nhop *
257 nl_find_base_unhop(struct unhop_ctl *ctl, uint32_t uidx)
258 {
259 	struct user_nhop key= { .un_idx = uidx };
260 	struct user_nhop *unhop = NULL;
261 	UN_TRACKER;
262 
263 	UN_RLOCK(ctl);
264 	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
265 	UN_RUNLOCK(ctl);
266 
267 	return (unhop);
268 }
269 
270 #define MAX_STACK_NHOPS	4
271 static struct nhop_object *
272 clone_unhop(const struct user_nhop *unhop, uint32_t fibnum, int family, int nh_flags)
273 {
274 #ifdef ROUTE_MPATH
275 	const struct weightened_nhop *wn;
276 	struct weightened_nhop *wn_new, wn_base[MAX_STACK_NHOPS];
277 	uint32_t num_nhops;
278 #endif
279 	struct nhop_object *nh = NULL;
280 	int error;
281 
282 	if (unhop->un_nhop_src != NULL) {
283 		IF_DEBUG_LEVEL(LOG_DEBUG2) {
284 			char nhbuf[NHOP_PRINT_BUFSIZE];
285 			nhop_print_buf_any(unhop->un_nhop_src, nhbuf, sizeof(nhbuf));
286 			FIB_NH_LOG(LOG_DEBUG2, unhop->un_nhop_src,
287 			    "cloning nhop %s -> %u.%u flags 0x%X", nhbuf, fibnum,
288 			    family, nh_flags);
289 		}
290 		struct nhop_object *nh;
291 		nh = nhop_alloc(fibnum, AF_UNSPEC);
292 		if (nh == NULL)
293 			return (NULL);
294 		nhop_copy(nh, unhop->un_nhop_src);
295 		/* Check that nexthop gateway is compatible with the new family */
296 		if (!nhop_set_upper_family(nh, family)) {
297 			nhop_free(nh);
298 			return (NULL);
299 		}
300 		nhop_set_uidx(nh, unhop->un_idx);
301 		nhop_set_pxtype_flag(nh, nh_flags);
302 		return (nhop_get_nhop(nh, &error));
303 	}
304 #ifdef ROUTE_MPATH
305 	wn = unhop->un_nhgrp_src;
306 	num_nhops = unhop->un_nhgrp_count;
307 
308 	if (num_nhops > MAX_STACK_NHOPS) {
309 		wn_new = malloc(num_nhops * sizeof(struct weightened_nhop), M_TEMP, M_NOWAIT);
310 		if (wn_new == NULL)
311 			return (NULL);
312 	} else
313 		wn_new = wn_base;
314 
315 	for (int i = 0; i < num_nhops; i++) {
316 		uint32_t uidx = nhop_get_uidx(wn[i].nh);
317 		MPASS(uidx != 0);
318 		wn_new[i].nh = nl_find_nhop(fibnum, family, uidx, nh_flags, &error);
319 		if (error != 0)
320 			break;
321 		wn_new[i].weight = wn[i].weight;
322 	}
323 
324 	if (error == 0) {
325 		struct rib_head *rh = nhop_get_rh(wn_new[0].nh);
326 		struct nhgrp_object *nhg;
327 
328 		error = nhgrp_get_group(rh, wn_new, num_nhops, unhop->un_idx, &nhg);
329 		nh = (struct nhop_object *)nhg;
330 	}
331 
332 	if (wn_new != wn_base)
333 		free(wn_new, M_TEMP);
334 #endif
335 	return (nh);
336 }
337 
338 static void
339 destroy_unhop(struct user_nhop *unhop)
340 {
341 	if (unhop->un_nhop != NULL)
342 		nhop_free_any(unhop->un_nhop);
343 	if (unhop->un_nhop_src != NULL)
344 		nhop_free_any(unhop->un_nhop_src);
345 	free(unhop, M_NETLINK);
346 }
347 
348 static void
349 destroy_unhop_epoch(epoch_context_t ctx)
350 {
351 	struct user_nhop *unhop;
352 
353 	unhop = __containerof(ctx, struct user_nhop, un_epoch_ctx);
354 
355 	destroy_unhop(unhop);
356 }
357 
358 static uint32_t
359 find_spare_uidx(struct unhop_ctl *ctl)
360 {
361 	struct user_nhop *unhop, key = {};
362 	uint32_t uidx = 0;
363 	UN_TRACKER;
364 
365 	UN_RLOCK(ctl);
366 	/* This should return spare uid with 75% of 65k used in ~99/100 cases */
367 	for (int i = 0; i < 16; i++) {
368 		key.un_idx = (arc4random() % 65536) + 65536 * 4;
369 		CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
370 		if (unhop == NULL) {
371 			uidx = key.un_idx;
372 			break;
373 		}
374 	}
375 	UN_RUNLOCK(ctl);
376 
377 	return (uidx);
378 }
379 
380 
381 /*
382  * Actual netlink code
383  */
384 struct netlink_walkargs {
385 	struct nl_writer *nw;
386 	struct nlmsghdr hdr;
387 	struct nlpcb *so;
388 	int family;
389 	int error;
390 	int count;
391 	int dumped;
392 };
393 #define	ENOMEM_IF_NULL(_v)	if ((_v) == NULL) goto enomem
394 
395 static bool
396 dump_nhgrp(const struct user_nhop *unhop, struct nlmsghdr *hdr,
397     struct nl_writer *nw)
398 {
399 
400 	if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg)))
401 		goto enomem;
402 
403 	struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg);
404 	nhm->nh_family = AF_UNSPEC;
405 	nhm->nh_scope = 0;
406 	nhm->nh_protocol = unhop->un_protocol;
407 	nhm->nh_flags = 0;
408 
409 	nlattr_add_u32(nw, NHA_ID, unhop->un_idx);
410 	nlattr_add_u16(nw, NHA_GROUP_TYPE, NEXTHOP_GRP_TYPE_MPATH);
411 
412 	struct weightened_nhop *wn = unhop->un_nhgrp_src;
413 	uint32_t num_nhops = unhop->un_nhgrp_count;
414 	/* TODO: a better API? */
415 	int nla_len = sizeof(struct nlattr);
416 	nla_len += NETLINK_ALIGN(num_nhops * sizeof(struct nexthop_grp));
417 	struct nlattr *nla = nlmsg_reserve_data(nw, nla_len, struct nlattr);
418 	if (nla == NULL)
419 		goto enomem;
420 	nla->nla_type = NHA_GROUP;
421 	nla->nla_len = nla_len;
422 	for (int i = 0; i < num_nhops; i++) {
423 		struct nexthop_grp *grp = &((struct nexthop_grp *)(nla + 1))[i];
424 		grp->id = nhop_get_uidx(wn[i].nh);
425 		grp->weight = wn[i].weight;
426 		grp->resvd1 = 0;
427 		grp->resvd2 = 0;
428 	}
429 
430         if (nlmsg_end(nw))
431 		return (true);
432 enomem:
433 	NL_LOG(LOG_DEBUG, "error: unable to allocate attribute memory");
434         nlmsg_abort(nw);
435 	return (false);
436 }
437 
438 static bool
439 dump_nhop(const struct nhop_object *nh, uint32_t uidx, struct nlmsghdr *hdr,
440     struct nl_writer *nw)
441 {
442 	if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg)))
443 		goto enomem;
444 
445 	struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg);
446 	ENOMEM_IF_NULL(nhm);
447 	nhm->nh_family = nhop_get_neigh_family(nh);
448 	nhm->nh_scope = 0; // XXX: what's that?
449 	nhm->nh_protocol = nhop_get_origin(nh);
450 	nhm->nh_flags = 0;
451 
452 	if (uidx != 0)
453 		nlattr_add_u32(nw, NHA_ID, uidx);
454 	if (nh->nh_flags & NHF_BLACKHOLE) {
455 		nlattr_add_flag(nw, NHA_BLACKHOLE);
456 		goto done;
457 	}
458 	nlattr_add_u32(nw, NHA_OIF, if_getindex(nh->nh_ifp));
459 
460 	switch (nh->gw_sa.sa_family) {
461 #ifdef INET
462 	case AF_INET:
463 		nlattr_add(nw, NHA_GATEWAY, 4, &nh->gw4_sa.sin_addr);
464 		break;
465 #endif
466 #ifdef INET6
467 	case AF_INET6:
468 		{
469 			struct in6_addr addr = nh->gw6_sa.sin6_addr;
470 			in6_clearscope(&addr);
471 			nlattr_add(nw, NHA_GATEWAY, 16, &addr);
472 			break;
473 		}
474 #endif
475 	}
476 
477 	int off = nlattr_add_nested(nw, NHA_FREEBSD);
478 	if (off != 0) {
479 		nlattr_add_u32(nw, NHAF_AIF, if_getindex(nh->nh_aifp));
480 
481 		if (uidx == 0) {
482 			nlattr_add_u32(nw, NHAF_KID, nhop_get_idx(nh));
483 			nlattr_add_u32(nw, NHAF_FAMILY, nhop_get_upper_family(nh));
484 			nlattr_add_u32(nw, NHAF_TABLE, nhop_get_fibnum(nh));
485 		}
486 
487 		nlattr_set_len(nw, off);
488 	}
489 
490 done:
491         if (nlmsg_end(nw))
492 		return (true);
493 enomem:
494 	nlmsg_abort(nw);
495 	return (false);
496 }
497 
498 static void
499 dump_unhop(const struct user_nhop *unhop, struct nlmsghdr *hdr,
500     struct nl_writer *nw)
501 {
502 	if (unhop->un_nhop_src != NULL)
503 		dump_nhop(unhop->un_nhop_src, unhop->un_idx, hdr, nw);
504 	else
505 		dump_nhgrp(unhop, hdr, nw);
506 }
507 
508 static int
509 delete_unhop(struct unhop_ctl *ctl, struct nlmsghdr *hdr, uint32_t uidx)
510 {
511 	struct user_nhop *unhop_ret, *unhop_base, *unhop_chain;
512 
513 	struct user_nhop key = { .un_idx = uidx };
514 
515 	UN_WLOCK(ctl);
516 
517 	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop_base);
518 
519 	if (unhop_base != NULL) {
520 		CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_base, unhop_ret);
521 		IF_DEBUG_LEVEL(LOG_DEBUG2) {
522 			char nhbuf[NHOP_PRINT_BUFSIZE];
523 			nhop_print_buf_any(unhop_base->un_nhop, nhbuf, sizeof(nhbuf));
524 			FIB_NH_LOG(LOG_DEBUG3, unhop_base->un_nhop,
525 			    "removed base nhop %u: %s", uidx, nhbuf);
526 		}
527 		/* Unlink all child nexhops as well, keeping the chain intact */
528 		unhop_chain = unhop_base->un_nextchild;
529 		while (unhop_chain != NULL) {
530 			CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_chain,
531 			    unhop_ret);
532 			MPASS(unhop_chain == unhop_ret);
533 			IF_DEBUG_LEVEL(LOG_DEBUG3) {
534 				char nhbuf[NHOP_PRINT_BUFSIZE];
535 				nhop_print_buf_any(unhop_chain->un_nhop,
536 				    nhbuf, sizeof(nhbuf));
537 				FIB_NH_LOG(LOG_DEBUG3, unhop_chain->un_nhop,
538 				    "removed child nhop %u: %s", uidx, nhbuf);
539 			}
540 			unhop_chain = unhop_chain->un_nextchild;
541 		}
542 	}
543 
544 	UN_WUNLOCK(ctl);
545 
546 	if (unhop_base == NULL) {
547 		NL_LOG(LOG_DEBUG, "unable to find unhop %u", uidx);
548 		return (ENOENT);
549 	}
550 
551 	/* Report nexthop deletion */
552 	struct netlink_walkargs wa = {
553 		.hdr.nlmsg_pid = hdr->nlmsg_pid,
554 		.hdr.nlmsg_seq = hdr->nlmsg_seq,
555 		.hdr.nlmsg_flags = hdr->nlmsg_flags,
556 		.hdr.nlmsg_type = NL_RTM_DELNEXTHOP,
557 	};
558 
559 	struct nl_writer nw = {};
560 	if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP)) {
561 		NL_LOG(LOG_DEBUG, "error allocating message writer");
562 		return (ENOMEM);
563 	}
564 
565 	dump_unhop(unhop_base, &wa.hdr, &nw);
566 	nlmsg_flush(&nw);
567 
568 	while (unhop_base != NULL) {
569 		unhop_chain = unhop_base->un_nextchild;
570 		NET_EPOCH_CALL(destroy_unhop_epoch, &unhop_base->un_epoch_ctx);
571 		unhop_base = unhop_chain;
572 	}
573 
574 	return (0);
575 }
576 
577 static void
578 consider_resize(struct unhop_ctl *ctl, uint32_t new_size)
579 {
580 	void *new_ptr = NULL;
581 	size_t alloc_size;
582 
583         if (new_size == 0)
584                 return;
585 
586 	if (new_size != 0) {
587 		alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_size);
588 		new_ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO);
589                 if (new_ptr == NULL)
590                         return;
591 	}
592 
593 	NL_LOG(LOG_DEBUG, "resizing hash: %u -> %u", ctl->un_head.hash_size, new_size);
594 	UN_WLOCK(ctl);
595 	if (new_ptr != NULL) {
596 		CHT_SLIST_RESIZE(&ctl->un_head, unhop, new_ptr, new_size);
597 	}
598 	UN_WUNLOCK(ctl);
599 
600 
601 	if (new_ptr != NULL)
602 		free(new_ptr, M_NETLINK);
603 }
604 
605 static bool __noinline
606 vnet_init_unhops(void)
607 {
608         uint32_t num_buckets = 16;
609         size_t alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets);
610 
611         struct unhop_ctl *ctl = malloc(sizeof(struct unhop_ctl), M_NETLINK,
612             M_NOWAIT | M_ZERO);
613         if (ctl == NULL)
614                 return (false);
615 
616         void *ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO);
617         if (ptr == NULL) {
618 		free(ctl, M_NETLINK);
619                 return (false);
620 	}
621         CHT_SLIST_INIT(&ctl->un_head, ptr, num_buckets);
622 	UN_LOCK_INIT(ctl);
623 
624 	if (!atomic_cmpset_ptr((uintptr_t *)&V_un_ctl, (uintptr_t)NULL, (uintptr_t)ctl)) {
625                 free(ptr, M_NETLINK);
626                 free(ctl, M_NETLINK);
627 	}
628 
629 	if (atomic_load_ptr(&V_un_ctl) == NULL)
630 		return (false);
631 
632 	NL_LOG(LOG_NOTICE, "UNHOPS init done");
633 
634         return (true);
635 }
636 
637 static void
638 vnet_destroy_unhops(const void *unused __unused)
639 {
640 	struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
641 	struct user_nhop *unhop, *tmp;
642 
643 	if (ctl == NULL)
644 		return;
645 	V_un_ctl = NULL;
646 
647 	/* Wait till all unhop users finish their reads */
648 	NET_EPOCH_WAIT();
649 
650 	UN_WLOCK(ctl);
651 	CHT_SLIST_FOREACH_SAFE(&ctl->un_head, unhop, unhop, tmp) {
652 		destroy_unhop(unhop);
653 	} CHT_SLIST_FOREACH_SAFE_END;
654 	UN_WUNLOCK(ctl);
655 
656 	free(ctl->un_head.ptr, M_NETLINK);
657 	free(ctl, M_NETLINK);
658 }
659 VNET_SYSUNINIT(vnet_destroy_unhops, SI_SUB_PROTO_IF, SI_ORDER_ANY,
660     vnet_destroy_unhops, NULL);
661 
662 static int
663 nlattr_get_nhg(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target)
664 {
665 	int error = 0;
666 
667 	/* Verify attribute correctness */
668 	struct nexthop_grp *grp = NLA_DATA(nla);
669 	int data_len = NLA_DATA_LEN(nla);
670 
671 	int count = data_len / sizeof(*grp);
672 	if (count == 0 || (count * sizeof(*grp) != data_len)) {
673 		NL_LOG(LOG_DEBUG, "Invalid length for RTA_GROUP: %d", data_len);
674 		return (EINVAL);
675 	}
676 
677 	*((struct nlattr **)target) = nla;
678 	return (error);
679 }
680 
681 static void
682 set_scope6(struct sockaddr *sa, if_t ifp)
683 {
684 #ifdef INET6
685 	if (sa != NULL && sa->sa_family == AF_INET6 && ifp != NULL) {
686 		struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)sa;
687 
688 		if (IN6_IS_ADDR_LINKLOCAL(&sa6->sin6_addr))
689 			in6_set_unicast_scopeid(&sa6->sin6_addr, if_getindex(ifp));
690 	}
691 #endif
692 }
693 
/*
 * Parsed form of a nexthop request. Filled in by nhmsg_parser: nh_*
 * fields come from the fixed struct nhmsg header, nha_* fields from the
 * top-level attributes and nhaf_* fields from the nested NHA_FREEBSD
 * attribute.
 */
struct nl_parsed_nhop {
	uint32_t	nha_id;		/* NHA_ID: user nexthop index */
	uint8_t		nha_blackhole;	/* NHA_BLACKHOLE flag */
	uint8_t		nha_groups;	/* NHA_GROUPS flag */
	uint8_t		nhaf_knhops;	/* NHAF_KNHOPS flag */
	uint8_t		nhaf_family;	/* NHAF_FAMILY */
	struct ifnet	*nha_oif;	/* NHA_OIF: resolved interface */
	struct sockaddr	*nha_gw;	/* NHA_GATEWAY */
	struct nlattr	*nha_group;	/* NHA_GROUP: raw attribute */
	uint8_t		nh_family;	/* nhmsg.nh_family */
	uint8_t		nh_protocol;	/* nhmsg.nh_protocol */
	uint32_t	nhaf_table;	/* NHAF_TABLE */
	uint32_t	nhaf_kid;	/* NHAF_KID */
	uint32_t	nhaf_aif;	/* NHAF_AIF */
};
709 
/* Shorthands for offsets into the wire header and the parsed structure */
#define	_IN(_field)	offsetof(struct nhmsg, _field)
#define	_OUT(_field)	offsetof(struct nl_parsed_nhop, _field)
/* Attributes accepted inside the nested NHA_FREEBSD attribute */
static struct nlattr_parser nla_p_nh_fbsd[] = {
	{ .type = NHAF_KNHOPS, .off = _OUT(nhaf_knhops), .cb = nlattr_get_flag },
	{ .type = NHAF_TABLE, .off = _OUT(nhaf_table), .cb = nlattr_get_uint32 },
	{ .type = NHAF_FAMILY, .off = _OUT(nhaf_family), .cb = nlattr_get_uint8 },
	{ .type = NHAF_KID, .off = _OUT(nhaf_kid), .cb = nlattr_get_uint32 },
	{ .type = NHAF_AIF, .off = _OUT(nhaf_aif), .cb = nlattr_get_uint32 },
};
NL_DECLARE_ATTR_PARSER(nh_fbsd_parser, nla_p_nh_fbsd);

/* Fixed-header fields copied out of struct nhmsg */
static const struct nlfield_parser nlf_p_nh[] = {
	{ .off_in = _IN(nh_family), .off_out = _OUT(nh_family), .cb = nlf_get_u8 },
	{ .off_in = _IN(nh_protocol), .off_out = _OUT(nh_protocol), .cb = nlf_get_u8 },
};

/*
 * Top-level nexthop attributes.
 * NOTE(review): entry order is presumably significant to
 * NL_VERIFY_PARSERS - confirm before reordering.
 */
static const struct nlattr_parser nla_p_nh[] = {
	{ .type = NHA_ID, .off = _OUT(nha_id), .cb = nlattr_get_uint32 },
	{ .type = NHA_GROUP, .off = _OUT(nha_group), .cb = nlattr_get_nhg },
	{ .type = NHA_BLACKHOLE, .off = _OUT(nha_blackhole), .cb = nlattr_get_flag },
	{ .type = NHA_OIF, .off = _OUT(nha_oif), .cb = nlattr_get_ifp },
	{ .type = NHA_GATEWAY, .off = _OUT(nha_gw), .cb = nlattr_get_ip },
	{ .type = NHA_GROUPS, .off = _OUT(nha_groups), .cb = nlattr_get_flag },
	{ .type = NHA_FREEBSD, .arg = &nh_fbsd_parser, .cb = nlattr_get_nested },
};
#undef _IN
#undef _OUT
737 
738 static bool
739 post_p_nh(void *_attrs, struct nl_pstate *npt)
740 {
741 	struct nl_parsed_nhop *attrs = (struct nl_parsed_nhop *)_attrs;
742 
743 	set_scope6(attrs->nha_gw, attrs->nha_oif);
744 	return (true);
745 }
746 NL_DECLARE_PARSER_EXT(nhmsg_parser, struct nhmsg, NULL, nlf_p_nh, nla_p_nh, post_p_nh);
747 
748 static bool
749 eligible_nhg(const struct nhop_object *nh)
750 {
751 	return (nh->nh_flags & NHF_GATEWAY);
752 }
753 
754 static int
755 newnhg(struct unhop_ctl *ctl, struct nl_parsed_nhop *attrs, struct user_nhop *unhop)
756 {
757 	struct nexthop_grp *grp = NLA_DATA(attrs->nha_group);
758 	int count = NLA_DATA_LEN(attrs->nha_group) / sizeof(*grp);
759 	struct weightened_nhop *wn;
760 
761 	wn = malloc(sizeof(*wn) * count, M_NETLINK, M_NOWAIT | M_ZERO);
762 	if (wn == NULL)
763 		return (ENOMEM);
764 
765 	for (int i = 0; i < count; i++) {
766 		struct user_nhop *unhop;
767 		unhop = nl_find_base_unhop(ctl, grp[i].id);
768 		if (unhop == NULL) {
769 			NL_LOG(LOG_DEBUG, "unable to find uidx %u", grp[i].id);
770 			free(wn, M_NETLINK);
771 			return (ESRCH);
772 		} else if (unhop->un_nhop_src == NULL) {
773 			NL_LOG(LOG_DEBUG, "uidx %u is a group, nested group unsupported",
774 			    grp[i].id);
775 			free(wn, M_NETLINK);
776 			return (ENOTSUP);
777 		} else if (!eligible_nhg(unhop->un_nhop_src)) {
778 			NL_LOG(LOG_DEBUG, "uidx %u nhop is not mpath-eligible",
779 			    grp[i].id);
780 			free(wn, M_NETLINK);
781 			return (ENOTSUP);
782 		}
783 		/*
784 		 * TODO: consider more rigid eligibility checks:
785 		 * restrict nexthops with the same gateway
786 		 */
787 		wn[i].nh = unhop->un_nhop_src;
788 		wn[i].weight = grp[i].weight;
789 	}
790 	unhop->un_nhgrp_src = wn;
791 	unhop->un_nhgrp_count = count;
792 	return (0);
793 }
794 
795 /*
796  * Sets nexthop @nh gateway specified by @gw.
797  * If gateway is IPv6 link-local, alters @gw to include scopeid equal to
798  * @ifp ifindex.
799  * Returns 0 on success or errno.
800  */
801 int
802 nl_set_nexthop_gw(struct nhop_object *nh, struct sockaddr *gw, if_t ifp,
803     struct nl_pstate *npt)
804 {
805 #ifdef INET6
806 	if (gw->sa_family == AF_INET6) {
807 		struct sockaddr_in6 *gw6 = (struct sockaddr_in6 *)gw;
808 		if (IN6_IS_ADDR_LINKLOCAL(&gw6->sin6_addr)) {
809 			if (ifp == NULL) {
810 				NLMSG_REPORT_ERR_MSG(npt, "interface not set");
811 				return (EINVAL);
812 			}
813 			in6_set_unicast_scopeid(&gw6->sin6_addr, if_getindex(ifp));
814 		}
815 	}
816 #endif
817 	nhop_set_gw(nh, gw, true);
818 	return (0);
819 }
820 
821 static int
822 newnhop(struct nl_parsed_nhop *attrs, struct user_nhop *unhop, struct nl_pstate *npt)
823 {
824 	struct ifaddr *ifa = NULL;
825 	struct nhop_object *nh;
826 	int error;
827 
828 	if (!attrs->nha_blackhole) {
829 		if (attrs->nha_gw == NULL) {
830 			NLMSG_REPORT_ERR_MSG(npt, "missing NHA_GATEWAY");
831 			return (EINVAL);
832 		}
833 		if (attrs->nha_oif == NULL) {
834 			NLMSG_REPORT_ERR_MSG(npt, "missing NHA_OIF");
835 			return (EINVAL);
836 		}
837 		if (ifa == NULL)
838 			ifa = ifaof_ifpforaddr(attrs->nha_gw, attrs->nha_oif);
839 		if (ifa == NULL) {
840 			NLMSG_REPORT_ERR_MSG(npt, "Unable to determine default source IP");
841 			return (EINVAL);
842 		}
843 	}
844 
845 	int family = attrs->nha_gw != NULL ? attrs->nha_gw->sa_family : attrs->nh_family;
846 
847 	nh = nhop_alloc(RT_DEFAULT_FIB, family);
848 	if (nh == NULL) {
849 		NL_LOG(LOG_DEBUG, "Unable to allocate nexthop");
850 		return (ENOMEM);
851 	}
852 	nhop_set_uidx(nh, attrs->nha_id);
853 	nhop_set_origin(nh, attrs->nh_protocol);
854 
855 	if (attrs->nha_blackhole)
856 		nhop_set_blackhole(nh, NHF_BLACKHOLE);
857 	else {
858 		error = nl_set_nexthop_gw(nh, attrs->nha_gw, attrs->nha_oif, npt);
859 		if (error != 0) {
860 			nhop_free(nh);
861 			return (error);
862 		}
863 		nhop_set_transmit_ifp(nh, attrs->nha_oif);
864 		nhop_set_src(nh, ifa);
865 	}
866 
867 	error = nhop_get_unlinked(nh);
868 	if (error != 0) {
869 		NL_LOG(LOG_DEBUG, "unable to finalize nexthop");
870 		return (error);
871 	}
872 
873 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
874 		char nhbuf[NHOP_PRINT_BUFSIZE];
875 		nhop_print_buf(nh, nhbuf, sizeof(nhbuf));
876 		NL_LOG(LOG_DEBUG2, "Adding unhop %u: %s", attrs->nha_id, nhbuf);
877 	}
878 
879 	unhop->un_nhop_src = nh;
880 	return (0);
881 }
882 
883 static int
884 rtnl_handle_newnhop(struct nlmsghdr *hdr, struct nlpcb *nlp,
885     struct nl_pstate *npt)
886 {
887 	struct user_nhop *unhop;
888 	int error;
889 
890         if ((__predict_false(V_un_ctl == NULL)) && (!vnet_init_unhops()))
891 		return (ENOMEM);
892 	struct unhop_ctl *ctl = V_un_ctl;
893 
894 	struct nl_parsed_nhop attrs = {};
895 	error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs);
896 	if (error != 0)
897 		return (error);
898 
899 	/*
900 	 * Get valid nha_id. Treat nha_id == 0 (auto-assignment) as a second-class
901 	 *  citizen.
902 	 */
903 	if (attrs.nha_id == 0) {
904 		attrs.nha_id = find_spare_uidx(ctl);
905 		if (attrs.nha_id == 0) {
906 			NL_LOG(LOG_DEBUG, "Unable to get spare uidx");
907 			return (ENOSPC);
908 		}
909 	}
910 
911 	NL_LOG(LOG_DEBUG, "IFINDEX %d", attrs.nha_oif ? if_getindex(attrs.nha_oif) : 0);
912 
913 	unhop = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO);
914 	if (unhop == NULL) {
915 		NL_LOG(LOG_DEBUG, "Unable to allocate user_nhop");
916 		return (ENOMEM);
917 	}
918 	unhop->un_idx = attrs.nha_id;
919 	unhop->un_protocol = attrs.nh_protocol;
920 
921 	if (attrs.nha_group)
922 		error = newnhg(ctl, &attrs, unhop);
923 	else
924 		error = newnhop(&attrs, unhop, npt);
925 
926 	if (error != 0) {
927 		free(unhop, M_NETLINK);
928 		return (error);
929 	}
930 
931 	UN_WLOCK(ctl);
932 	/* Check if uidx already exists */
933 	struct user_nhop *tmp = NULL;
934 	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, unhop, tmp);
935 	if (tmp != NULL) {
936 		UN_WUNLOCK(ctl);
937 		NL_LOG(LOG_DEBUG, "nhop idx %u already exists", attrs.nha_id);
938 		destroy_unhop(unhop);
939 		return (EEXIST);
940 	}
941 	CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, unhop);
942 	uint32_t num_buckets_new = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->un_head);
943 	UN_WUNLOCK(ctl);
944 
945 	/* Report addition of the next nexhop */
946 	struct netlink_walkargs wa = {
947 		.hdr.nlmsg_pid = hdr->nlmsg_pid,
948 		.hdr.nlmsg_seq = hdr->nlmsg_seq,
949 		.hdr.nlmsg_flags = hdr->nlmsg_flags,
950 		.hdr.nlmsg_type = NL_RTM_NEWNEXTHOP,
951 	};
952 
953 	struct nl_writer nw = {};
954 	if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP)) {
955 		NL_LOG(LOG_DEBUG, "error allocating message writer");
956 		return (ENOMEM);
957 	}
958 
959 	dump_unhop(unhop, &wa.hdr, &nw);
960 	nlmsg_flush(&nw);
961 
962 	consider_resize(ctl, num_buckets_new);
963 
964         return (0);
965 }
966 
967 static int
968 rtnl_handle_delnhop(struct nlmsghdr *hdr, struct nlpcb *nlp,
969     struct nl_pstate *npt)
970 {
971 	struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
972 	int error;
973 
974 	if (__predict_false(ctl == NULL))
975 		return (ESRCH);
976 
977 	struct nl_parsed_nhop attrs = {};
978 	error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs);
979 	if (error != 0)
980 		return (error);
981 
982 	if (attrs.nha_id == 0) {
983 		NL_LOG(LOG_DEBUG, "NHA_ID not set");
984 		return (EINVAL);
985 	}
986 
987 	error = delete_unhop(ctl, hdr, attrs.nha_id);
988 
989         return (error);
990 }
991 
992 static bool
993 match_unhop(const struct nl_parsed_nhop *attrs, struct user_nhop *unhop)
994 {
995 	if (attrs->nha_id != 0 && unhop->un_idx != attrs->nha_id)
996 		return (false);
997 	if (attrs->nha_groups != 0 && unhop->un_nhgrp_src == NULL)
998 		return (false);
999 	if (attrs->nha_oif != NULL &&
1000 	    (unhop->un_nhop_src == NULL || unhop->un_nhop_src->nh_ifp != attrs->nha_oif))
1001 		return (false);
1002 
1003 	return (true);
1004 }
1005 
1006 static int
1007 rtnl_handle_getnhop(struct nlmsghdr *hdr, struct nlpcb *nlp,
1008     struct nl_pstate *npt)
1009 {
1010 	struct user_nhop *unhop;
1011 	UN_TRACKER;
1012 	int error;
1013 
1014 	struct nl_parsed_nhop attrs = {};
1015 	error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs);
1016 	if (error != 0)
1017 		return (error);
1018 
1019 	struct netlink_walkargs wa = {
1020 		.nw = npt->nw,
1021 		.hdr.nlmsg_pid = hdr->nlmsg_pid,
1022 		.hdr.nlmsg_seq = hdr->nlmsg_seq,
1023 		.hdr.nlmsg_flags = hdr->nlmsg_flags,
1024 		.hdr.nlmsg_type = NL_RTM_NEWNEXTHOP,
1025 	};
1026 
1027 	if (attrs.nha_id != 0) {
1028 		struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
1029 		struct user_nhop key = { .un_idx = attrs.nha_id };
1030 
1031 		if (__predict_false(ctl == NULL))
1032 			return (ESRCH);
1033 
1034 		NL_LOG(LOG_DEBUG2, "searching for uidx %u", attrs.nha_id);
1035 		UN_RLOCK(ctl);
1036 		CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
1037 		UN_RUNLOCK(ctl);
1038 
1039 		if (unhop == NULL)
1040 			return (ESRCH);
1041 		dump_unhop(unhop, &wa.hdr, wa.nw);
1042 		return (0);
1043 	} else if (attrs.nhaf_kid != 0) {
1044 		struct nhop_iter iter = {
1045 			.fibnum = attrs.nhaf_table,
1046 			.family = attrs.nhaf_family,
1047 		};
1048 		int error = ESRCH;
1049 
1050 		NL_LOG(LOG_DEBUG2, "START table %u family %d", attrs.nhaf_table, attrs.nhaf_family);
1051 		for (struct nhop_object *nh = nhops_iter_start(&iter); nh;
1052 		    nh = nhops_iter_next(&iter)) {
1053 			NL_LOG(LOG_DEBUG3, "get %u", nhop_get_idx(nh));
1054 			if (nhop_get_idx(nh) == attrs.nhaf_kid) {
1055 				dump_nhop(nh, 0, &wa.hdr, wa.nw);
1056 				error = 0;
1057 				break;
1058 			}
1059 		}
1060 		nhops_iter_stop(&iter);
1061 		return (error);
1062 	} else if (attrs.nhaf_knhops) {
1063 		struct nhop_iter iter = {
1064 			.fibnum = attrs.nhaf_table,
1065 			.family = attrs.nhaf_family,
1066 		};
1067 
1068 		NL_LOG(LOG_DEBUG2, "DUMP table %u family %d", attrs.nhaf_table, attrs.nhaf_family);
1069 		wa.hdr.nlmsg_flags |= NLM_F_MULTI;
1070 		for (struct nhop_object *nh = nhops_iter_start(&iter); nh;
1071 		    nh = nhops_iter_next(&iter)) {
1072 			dump_nhop(nh, 0, &wa.hdr, wa.nw);
1073 		}
1074 		nhops_iter_stop(&iter);
1075 	} else {
1076 		struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
1077 
1078 		if (__predict_false(ctl == NULL))
1079 			return (ESRCH);
1080 
1081 		NL_LOG(LOG_DEBUG2, "DUMP unhops");
1082 		UN_RLOCK(ctl);
1083 		wa.hdr.nlmsg_flags |= NLM_F_MULTI;
1084 		CHT_SLIST_FOREACH(&ctl->un_head, unhop, unhop) {
1085 			if (UNHOP_IS_MASTER(unhop) && match_unhop(&attrs, unhop))
1086 				dump_unhop(unhop, &wa.hdr, wa.nw);
1087 		} CHT_SLIST_FOREACH_END;
1088 		UN_RUNLOCK(ctl);
1089 	}
1090 
1091 	if (wa.error == 0) {
1092 		if (!nlmsg_end_dump(wa.nw, wa.error, &wa.hdr))
1093 			return (ENOMEM);
1094 	}
1095         return (0);
1096 }
1097 
/*
 * Message handlers registered by rtnl_nexthops_init(). The two modifying
 * commands name PRIV_NET_ROUTE in .priv; RTM_GETNEXTHOP sets no .priv.
 */
static const struct rtnl_cmd_handler cmd_handlers[] = {
	{
		.cmd = NL_RTM_NEWNEXTHOP,
		.name = "RTM_NEWNEXTHOP",
		.cb = &rtnl_handle_newnhop,
		.priv = PRIV_NET_ROUTE,
	},
	{
		.cmd = NL_RTM_DELNEXTHOP,
		.name = "RTM_DELNEXTHOP",
		.cb = &rtnl_handle_delnhop,
		.priv = PRIV_NET_ROUTE,
	},
	{
		.cmd = NL_RTM_GETNEXTHOP,
		.name = "RTM_GETNEXTHOP",
		.cb = &rtnl_handle_getnhop,
	}
};

/* Every parser declared in this file, checked at init time */
static const struct nlhdr_parser *all_parsers[] = { &nhmsg_parser, &nh_fbsd_parser };
1119 
/*
 * Module entry point: verifies the parsers declared in this file and
 * registers the nexthop message handlers.
 */
void
rtnl_nexthops_init(void)
{
	NL_VERIFY_PARSERS(all_parsers);
	rtnl_register_messages(cmd_handlers, NL_ARRAY_LEN(cmd_handlers));
}
1126