xref: /freebsd/sys/netlink/route/nexthop.c (revision 397e83df75e0fcd0d3fcb95ae4d794cb7600fc89)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 #include "opt_inet.h"
30 #include "opt_inet6.h"
31 #include "opt_route.h"
32 #include <sys/types.h>
33 #include <sys/ck.h>
34 #include <sys/epoch.h>
35 #include <sys/kernel.h>
36 #include <sys/malloc.h>
37 #include <sys/rmlock.h>
38 #include <sys/socket.h>
39 
40 #include <net/if.h>
41 #include <net/route.h>
42 #include <net/route/nhop.h>
43 #include <net/route/nhop_utils.h>
44 
45 #include <net/route/route_ctl.h>
46 #include <net/route/route_var.h>
47 #include <netinet6/scope6_var.h>
48 #include <netlink/netlink.h>
49 #include <netlink/netlink_ctl.h>
50 #include <netlink/netlink_route.h>
51 #include <netlink/route/route_var.h>
52 
53 #define	DEBUG_MOD_NAME	nl_nhop
54 #define	DEBUG_MAX_LEVEL	LOG_DEBUG3
55 #include <netlink/netlink_debug.h>
56 _DECLARE_DEBUG(LOG_INFO);
57 
58 /*
59  * This file contains the logic to maintain kernel nexthops and
 *  nexthop groups based on the data provided by the user.
61  *
62  * Kernel stores (nearly) all of the routing data in the nexthops,
63  *  including the prefix-specific flags (NHF_HOST and NHF_DEFAULT).
64  *
65  * Netlink API provides higher-level abstraction for the user. Each
66  *  user-created nexthop may map to multiple kernel nexthops.
67  *
68  * The following variations require separate kernel nexthop to be
69  *  created:
70  *  * prefix flags (NHF_HOST, NHF_DEFAULT)
71  *  * using IPv6 gateway for IPv4 routes
72  *  * different fibnum
73  *
74  * These kernel nexthops have the lifetime bound to the lifetime of
75  *  the user_nhop object. They are not collected until user requests
76  *  to delete the created user_nhop.
77  *
78  */
/*
 * Userland-visible nexthop record. One such record may be backed by
 * several kernel nexthops (one per fib/family/prefix-flags combination).
 */
struct user_nhop {
        uint32_t                        un_idx; /* Userland-provided index */
	uint32_t			un_fibfam; /* fibnum+af(as highest byte); 0 marks the master record (see UNHOP_IS_MASTER) */
	uint8_t				un_protocol; /* protocol that installed the record */
	struct nhop_object		*un_nhop; /* "production" nexthop */
	struct nhop_object		*un_nhop_src; /* nexthop to copy from; NULL when the record is a group */
	struct weightened_nhop		*un_nhgrp_src; /* nexthops for nhg */
	uint32_t			un_nhgrp_count; /* number of nexthops */
        struct user_nhop		*un_next; /* next item in hash chain */
        struct user_nhop		*un_nextchild; /* master -> children */
	struct epoch_context		un_epoch_ctx;	/* epoch ctl helper */
};
91 
92 /* produce hash value for an object */
93 #define	unhop_hash_obj(_obj)	(hash_unhop(_obj))
94 /* compare two objects */
95 #define	unhop_cmp(_one, _two)	(cmp_unhop(_one, _two))
96 /* next object accessor */
97 #define	unhop_next(_obj)	(_obj)->un_next
98 
99 CHT_SLIST_DEFINE(unhop, struct user_nhop);
100 
101 struct unhop_ctl {
102 	struct unhop_head	un_head;
103 	struct rmlock		un_lock;
104 };
105 #define	UN_LOCK_INIT(_ctl)	rm_init(&(_ctl)->un_lock, "unhop_ctl")
106 #define	UN_TRACKER		struct rm_priotracker un_tracker
107 #define	UN_RLOCK(_ctl)		rm_rlock(&((_ctl)->un_lock), &un_tracker)
108 #define	UN_RUNLOCK(_ctl)	rm_runlock(&((_ctl)->un_lock), &un_tracker)
109 
110 #define	UN_WLOCK(_ctl)		rm_wlock(&(_ctl)->un_lock);
111 #define	UN_WUNLOCK(_ctl)	rm_wunlock(&(_ctl)->un_lock);
112 
113 VNET_DEFINE_STATIC(struct unhop_ctl *, un_ctl) = NULL;
114 #define V_un_ctl	VNET(un_ctl)
115 
116 static void consider_resize(struct unhop_ctl *ctl, uint32_t new_size);
117 static int cmp_unhop(const struct user_nhop *a, const struct user_nhop *b);
118 static unsigned int hash_unhop(const struct user_nhop *obj);
119 
120 static void destroy_unhop(struct user_nhop *unhop);
121 static struct nhop_object *clone_unhop(const struct user_nhop *unhop,
122     uint32_t fibnum, int family, int nh_flags);
123 
124 static int
125 cmp_unhop(const struct user_nhop *a, const struct user_nhop *b)
126 {
127         return (a->un_idx == b->un_idx && a->un_fibfam == b->un_fibfam);
128 }
129 
130 /*
131  * Hash callback: calculate hash of an object
132  */
133 static unsigned int
134 hash_unhop(const struct user_nhop *obj)
135 {
136         return (obj->un_idx ^ obj->un_fibfam);
137 }
138 
139 #define	UNHOP_IS_MASTER(_unhop)	((_unhop)->un_fibfam == 0)
140 
141 /*
142  * Factory interface for creating matching kernel nexthops/nexthop groups
143  *
144  * @uidx: userland nexhop index used to create the nexthop
145  * @fibnum: fibnum nexthop will be used in
146  * @family: upper family nexthop will be used in
147  * @nh_flags: desired nexthop prefix flags
148  * @perror: pointer to store error to
149  *
150  * Returns referenced nexthop linked to @fibnum/@family rib on success.
151  */
152 struct nhop_object *
153 nl_find_nhop(uint32_t fibnum, int family, uint32_t uidx,
154     int nh_flags, int *perror)
155 {
156 	struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
157         UN_TRACKER;
158 
159 	if (__predict_false(ctl == NULL))
160 		return (NULL);
161 
162 	struct user_nhop key= {
163 		.un_idx = uidx,
164 		.un_fibfam = fibnum  | ((uint32_t)family) << 24,
165 	};
166 	struct user_nhop *unhop;
167 
168 	nh_flags = nh_flags & (NHF_HOST | NHF_DEFAULT);
169 
170 	if (__predict_false(family == 0))
171 		return (NULL);
172 
173 	UN_RLOCK(ctl);
174 	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
175 	if (unhop != NULL) {
176 		struct nhop_object *nh = unhop->un_nhop;
177 		UN_RLOCK(ctl);
178 		*perror = 0;
179 		nhop_ref_any(nh);
180 		return (nh);
181 	}
182 
183 	/*
184 	 * Exact nexthop not found. Search for template nexthop to clone from.
185 	 */
186 	key.un_fibfam = 0;
187 	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
188 	if (unhop == NULL) {
189 		UN_RUNLOCK(ctl);
190 		*perror = ESRCH;
191 		return (NULL);
192 	}
193 
194 	UN_RUNLOCK(ctl);
195 
196 	/* Create entry to insert first */
197 	struct user_nhop *un_new, *un_tmp;
198 	un_new = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO);
199 	if (un_new == NULL) {
200 		*perror = ENOMEM;
201 		return (NULL);
202 	}
203 	un_new->un_idx = uidx;
204 	un_new->un_fibfam = fibnum  | ((uint32_t)family) << 24;
205 
206 	/* Relying on epoch to protect unhop here */
207 	un_new->un_nhop = clone_unhop(unhop, fibnum, family, nh_flags);
208 	if (un_new->un_nhop == NULL) {
209 		free(un_new, M_NETLINK);
210 		*perror = ENOMEM;
211 		return (NULL);
212 	}
213 
214 	/* Insert back and report */
215 	UN_WLOCK(ctl);
216 
217 	/* First, find template record once again */
218 	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
219 	if (unhop == NULL) {
220 		/* Someone deleted the nexthop during the call */
221 		UN_WUNLOCK(ctl);
222 		*perror = ESRCH;
223 		destroy_unhop(un_new);
224 		return (NULL);
225 	}
226 
227 	/* Second, check the direct match */
228 	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, un_new, un_tmp);
229 	struct nhop_object *nh;
230 	if (un_tmp != NULL) {
231 		/* Another thread already created the desired nextop, use it */
232 		nh = un_tmp->un_nhop;
233 	} else {
234 		/* Finally, insert the new nexthop and link it to the primary */
235 		nh = un_new->un_nhop;
236 		CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, un_new);
237 		un_new->un_nextchild = unhop->un_nextchild;
238 		unhop->un_nextchild = un_new;
239 		un_new = NULL;
240 		NL_LOG(LOG_DEBUG2, "linked cloned nexthop %p", nh);
241 	}
242 
243 	UN_WUNLOCK(ctl);
244 
245 	if (un_new != NULL)
246 		destroy_unhop(un_new);
247 
248 	*perror = 0;
249 	nhop_ref_any(nh);
250 	return (nh);
251 }
252 
253 static struct user_nhop *
254 nl_find_base_unhop(struct unhop_ctl *ctl, uint32_t uidx)
255 {
256 	struct user_nhop key= { .un_idx = uidx };
257 	struct user_nhop *unhop = NULL;
258 	UN_TRACKER;
259 
260 	UN_RLOCK(ctl);
261 	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
262 	UN_RUNLOCK(ctl);
263 
264 	return (unhop);
265 }
266 
#define MAX_STACK_NHOPS	4
/*
 * Instantiates a kernel nexthop (or nexthop group) from the template
 * user nexthop @unhop for the @fibnum/@family rib, applying the
 * prefix flags @nh_flags.
 * Returns a linked kernel nexthop object or NULL on failure.
 */
static struct nhop_object *
clone_unhop(const struct user_nhop *unhop, uint32_t fibnum, int family, int nh_flags)
{
#ifdef ROUTE_MPATH
	const struct weightened_nhop *wn;
	struct weightened_nhop *wn_new, wn_base[MAX_STACK_NHOPS];
	uint32_t num_nhops;
#endif
	struct nhop_object *nh = NULL;
	int error;

	if (unhop->un_nhop_src != NULL) {
		/* Single-nexthop template: copy it and adjust family/flags */
		IF_DEBUG_LEVEL(LOG_DEBUG2) {
			char nhbuf[NHOP_PRINT_BUFSIZE];
			nhop_print_buf_any(unhop->un_nhop_src, nhbuf, sizeof(nhbuf));
			FIB_NH_LOG(LOG_DEBUG2, unhop->un_nhop_src,
			    "cloning nhop %s -> %u.%u flags 0x%X", nhbuf, fibnum,
			    family, nh_flags);
		}
		struct nhop_object *nh;	/* NOTE(review): shadows the outer nh */
		nh = nhop_alloc(fibnum, AF_UNSPEC);
		if (nh == NULL)
			return (NULL);
		nhop_copy(nh, unhop->un_nhop_src);
		/* Check that nexthop gateway is compatible with the new family */
		if (!nhop_set_upper_family(nh, family)) {
			nhop_free(nh);
			return (NULL);
		}
		nhop_set_uidx(nh, unhop->un_idx);
		nhop_set_pxtype_flag(nh, nh_flags);
		return (nhop_get_nhop(nh, &error));
	}
#ifdef ROUTE_MPATH
	/* Group template: clone each member for the target fib/family */
	wn = unhop->un_nhgrp_src;
	num_nhops = unhop->un_nhgrp_count;

	if (num_nhops > MAX_STACK_NHOPS) {
		wn_new = malloc(num_nhops * sizeof(struct weightened_nhop), M_TEMP, M_NOWAIT);
		if (wn_new == NULL)
			return (NULL);
	} else
		wn_new = wn_base;

	/*
	 * NOTE(review): if num_nhops == 0 the loop never runs and "error"
	 * is read uninitialized below; newnhg() appears to guarantee a
	 * non-empty group — confirm.
	 */
	for (int i = 0; i < num_nhops; i++) {
		uint32_t uidx = nhop_get_uidx(wn[i].nh);
		MPASS(uidx != 0);
		wn_new[i].nh = nl_find_nhop(fibnum, family, uidx, nh_flags, &error);
		if (error != 0)
			break;
		wn_new[i].weight = wn[i].weight;
	}

	if (error == 0) {
		struct rib_head *rh = nhop_get_rh(wn_new[0].nh);
		struct nhgrp_object *nhg;

		error = nhgrp_get_group(rh, wn_new, num_nhops, unhop->un_idx, &nhg);
		nh = (struct nhop_object *)nhg;
	}

	if (wn_new != wn_base)
		free(wn_new, M_TEMP);
#endif
	return (nh);
}
334 
335 static void
336 destroy_unhop(struct user_nhop *unhop)
337 {
338 	if (unhop->un_nhop != NULL)
339 		nhop_free_any(unhop->un_nhop);
340 	if (unhop->un_nhop_src != NULL)
341 		nhop_free_any(unhop->un_nhop_src);
342 	free(unhop, M_NETLINK);
343 }
344 
345 static void
346 destroy_unhop_epoch(epoch_context_t ctx)
347 {
348 	struct user_nhop *unhop;
349 
350 	unhop = __containerof(ctx, struct user_nhop, un_epoch_ctx);
351 
352 	destroy_unhop(unhop);
353 }
354 
355 static uint32_t
356 find_spare_uidx(struct unhop_ctl *ctl)
357 {
358 	struct user_nhop *unhop, key = {};
359 	uint32_t uidx = 0;
360 	UN_TRACKER;
361 
362 	UN_RLOCK(ctl);
363 	/* This should return spare uid with 75% of 65k used in ~99/100 cases */
364 	for (int i = 0; i < 16; i++) {
365 		key.un_idx = (arc4random() % 65536) + 65536 * 4;
366 		CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
367 		if (unhop == NULL) {
368 			uidx = key.un_idx;
369 			break;
370 		}
371 	}
372 	UN_RUNLOCK(ctl);
373 
374 	return (uidx);
375 }
376 
377 
378 /*
379  * Actual netlink code
380  */
/* Per-request state shared by the dump/notification helpers below. */
struct netlink_walkargs {
	struct nl_writer *nw;	/* writer to emit replies into */
	struct nlmsghdr hdr;	/* template header copied into each reply */
	struct nlpcb *so;	/* requesting netlink socket */
	int family;
	int error;
	int count;
	int dumped;
};
/* Jump to the local "enomem" label if a writer reservation failed */
#define	ENOMEM_IF_NULL(_v)	if ((_v) == NULL) goto enomem
391 
392 static bool
393 dump_nhgrp(const struct user_nhop *unhop, struct nlmsghdr *hdr,
394     struct nl_writer *nw)
395 {
396 
397 	if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg)))
398 		goto enomem;
399 
400 	struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg);
401 	nhm->nh_family = AF_UNSPEC;
402 	nhm->nh_scope = 0;
403 	nhm->nh_protocol = unhop->un_protocol;
404 	nhm->nh_flags = 0;
405 
406 	nlattr_add_u32(nw, NHA_ID, unhop->un_idx);
407 	nlattr_add_u16(nw, NHA_GROUP_TYPE, NEXTHOP_GRP_TYPE_MPATH);
408 
409 	struct weightened_nhop *wn = unhop->un_nhgrp_src;
410 	uint32_t num_nhops = unhop->un_nhgrp_count;
411 	/* TODO: a better API? */
412 	int nla_len = sizeof(struct nlattr);
413 	nla_len += NETLINK_ALIGN(num_nhops * sizeof(struct nexthop_grp));
414 	struct nlattr *nla = nlmsg_reserve_data(nw, nla_len, struct nlattr);
415 	if (nla == NULL)
416 		goto enomem;
417 	nla->nla_type = NHA_GROUP;
418 	nla->nla_len = nla_len;
419 	for (int i = 0; i < num_nhops; i++) {
420 		struct nexthop_grp *grp = &((struct nexthop_grp *)(nla + 1))[i];
421 		grp->id = nhop_get_uidx(wn[i].nh);
422 		grp->weight = wn[i].weight;
423 		grp->resvd1 = 0;
424 		grp->resvd2 = 0;
425 	}
426 
427         if (nlmsg_end(nw))
428 		return (true);
429 enomem:
430 	NL_LOG(LOG_DEBUG, "error: unable to allocate attribute memory");
431         nlmsg_abort(nw);
432 	return (false);
433 }
434 
/*
 * Writes a single kernel nexthop @nh as an RTM_NEWNEXTHOP message.
 * @uidx is the userland index to report; 0 means a kernel-nexthop dump,
 * which switches the NHA_FREEBSD payload to kernel-specific fields.
 * Returns true on success, false if the writer ran out of space.
 */
static bool
dump_nhop(const struct nhop_object *nh, uint32_t uidx, struct nlmsghdr *hdr,
    struct nl_writer *nw)
{
	if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg)))
		goto enomem;

	struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg);
	ENOMEM_IF_NULL(nhm);
	nhm->nh_family = nhop_get_neigh_family(nh);
	nhm->nh_scope = 0; // XXX: what's that?
	nhm->nh_protocol = nhop_get_origin(nh);
	nhm->nh_flags = 0;

	if (uidx != 0)
		nlattr_add_u32(nw, NHA_ID, uidx);
	if (nh->nh_flags & NHF_BLACKHOLE) {
		/* Blackhole nexthops carry no gateway/interface data */
		nlattr_add_flag(nw, NHA_BLACKHOLE);
		goto done;
	}
	nlattr_add_u32(nw, NHA_OIF, if_getindex(nh->nh_ifp));

	switch (nh->gw_sa.sa_family) {
#ifdef INET
	case AF_INET:
		nlattr_add(nw, NHA_GATEWAY, 4, &nh->gw4_sa.sin_addr);
		break;
#endif
#ifdef INET6
	case AF_INET6:
		{
			/* Strip the embedded scope id before exporting */
			struct in6_addr addr = nh->gw6_sa.sin6_addr;
			in6_clearscope(&addr);
			nlattr_add(nw, NHA_GATEWAY, 16, &addr);
			break;
		}
#endif
	}

	/* FreeBSD-specific attributes are nested under NHA_FREEBSD */
	int off = nlattr_add_nested(nw, NHA_FREEBSD);
	if (off != 0) {
		nlattr_add_u32(nw, NHAF_AIF, if_getindex(nh->nh_aifp));

		if (uidx == 0) {
			nlattr_add_u32(nw, NHAF_KID, nhop_get_idx(nh));
			nlattr_add_u32(nw, NHAF_FAMILY, nhop_get_upper_family(nh));
			nlattr_add_u32(nw, NHAF_TABLE, nhop_get_fibnum(nh));
		}

		nlattr_set_len(nw, off);
	}

done:
        if (nlmsg_end(nw))
		return (true);
enomem:
	nlmsg_abort(nw);
	return (false);
}
494 
495 static void
496 dump_unhop(const struct user_nhop *unhop, struct nlmsghdr *hdr,
497     struct nl_writer *nw)
498 {
499 	if (unhop->un_nhop_src != NULL)
500 		dump_nhop(unhop->un_nhop_src, unhop->un_idx, hdr, nw);
501 	else
502 		dump_nhgrp(unhop, hdr, nw);
503 }
504 
/*
 * Removes user nexthop @uidx and all its fib/family-specific children
 * from the hash, notifies RTNLGRP_NEXTHOP listeners and schedules the
 * records for epoch-deferred destruction.
 * Returns 0 or ENOENT if the index is unknown.
 */
static int
delete_unhop(struct unhop_ctl *ctl, struct nlmsghdr *hdr, uint32_t uidx)
{
	struct user_nhop *unhop_ret, *unhop_base, *unhop_chain;

	struct user_nhop key = { .un_idx = uidx };

	UN_WLOCK(ctl);

	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop_base);

	if (unhop_base != NULL) {
		CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_base, unhop_ret);
		IF_DEBUG_LEVEL(LOG_DEBUG2) {
			char nhbuf[NHOP_PRINT_BUFSIZE];
			nhop_print_buf_any(unhop_base->un_nhop, nhbuf, sizeof(nhbuf));
			FIB_NH_LOG(LOG_DEBUG3, unhop_base->un_nhop,
			    "removed base nhop %u: %s", uidx, nhbuf);
		}
		/* Unlink all child nexhops as well, keeping the chain intact */
		unhop_chain = unhop_base->un_nextchild;
		while (unhop_chain != NULL) {
			CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_chain,
			    unhop_ret);
			MPASS(unhop_chain == unhop_ret);
			IF_DEBUG_LEVEL(LOG_DEBUG3) {
				char nhbuf[NHOP_PRINT_BUFSIZE];
				nhop_print_buf_any(unhop_chain->un_nhop,
				    nhbuf, sizeof(nhbuf));
				FIB_NH_LOG(LOG_DEBUG3, unhop_chain->un_nhop,
				    "removed child nhop %u: %s", uidx, nhbuf);
			}
			unhop_chain = unhop_chain->un_nextchild;
		}
	}

	UN_WUNLOCK(ctl);

	if (unhop_base == NULL) {
		NL_LOG(LOG_DEBUG, "unable to find unhop %u", uidx);
		return (ENOENT);
	}

	/* Report nexthop deletion */
	struct netlink_walkargs wa = {
		.hdr.nlmsg_pid = hdr->nlmsg_pid,
		.hdr.nlmsg_seq = hdr->nlmsg_seq,
		.hdr.nlmsg_flags = hdr->nlmsg_flags,
		.hdr.nlmsg_type = NL_RTM_DELNEXTHOP,
	};

	struct nl_writer nw = {};
	if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP)) {
		NL_LOG(LOG_DEBUG, "error allocating message writer");
		return (ENOMEM);
	}

	dump_unhop(unhop_base, &wa.hdr, &nw);
	nlmsg_flush(&nw);

	/*
	 * Walk the (still linked) child chain and defer the destruction of
	 * each record until the current epoch readers are done.
	 */
	while (unhop_base != NULL) {
		unhop_chain = unhop_base->un_nextchild;
		NET_EPOCH_CALL(destroy_unhop_epoch, &unhop_base->un_epoch_ctx);
		unhop_base = unhop_chain;
	}

	return (0);
}
573 
574 static void
575 consider_resize(struct unhop_ctl *ctl, uint32_t new_size)
576 {
577 	void *new_ptr = NULL;
578 	size_t alloc_size;
579 
580         if (new_size == 0)
581                 return;
582 
583 	if (new_size != 0) {
584 		alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_size);
585 		new_ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO);
586                 if (new_ptr == NULL)
587                         return;
588 	}
589 
590 	NL_LOG(LOG_DEBUG, "resizing hash: %u -> %u", ctl->un_head.hash_size, new_size);
591 	UN_WLOCK(ctl);
592 	if (new_ptr != NULL) {
593 		CHT_SLIST_RESIZE(&ctl->un_head, unhop, new_ptr, new_size);
594 	}
595 	UN_WUNLOCK(ctl);
596 
597 
598 	if (new_ptr != NULL)
599 		free(new_ptr, M_NETLINK);
600 }
601 
/*
 * Lazily allocates and publishes the per-vnet user nexthop hash.
 * Several handlers may race here; the atomic cmpset guarantees that
 * only one instance is published and the losers free their copy.
 * Returns true if V_un_ctl ends up set (by us or by a peer).
 */
static bool __noinline
vnet_init_unhops(void)
{
        uint32_t num_buckets = 16;
        size_t alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets);

        struct unhop_ctl *ctl = malloc(sizeof(struct unhop_ctl), M_NETLINK,
            M_NOWAIT | M_ZERO);
        if (ctl == NULL)
                return (false);

        void *ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO);
        if (ptr == NULL) {
		free(ctl, M_NETLINK);
                return (false);
	}
        CHT_SLIST_INIT(&ctl->un_head, ptr, num_buckets);
	UN_LOCK_INIT(ctl);

	/*
	 * NOTE(review): when the cmpset loses the race, the rm lock
	 * initialized above is freed without a matching rm_destroy() —
	 * confirm this is safe.
	 */
	if (!atomic_cmpset_ptr((uintptr_t *)&V_un_ctl, (uintptr_t)NULL, (uintptr_t)ctl)) {
                free(ptr, M_NETLINK);
                free(ctl, M_NETLINK);
	}

	if (atomic_load_ptr(&V_un_ctl) == NULL)
		return (false);

	NL_LOG(LOG_NOTICE, "UNHOPS init done");

        return (true);
}
633 
/*
 * VNET teardown: destroys every user nexthop record and the hash itself.
 * NET_EPOCH_WAIT() ensures no readers still reference the records after
 * V_un_ctl has been cleared.
 */
static void
vnet_destroy_unhops(const void *unused __unused)
{
	struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
	struct user_nhop *unhop, *tmp;

	if (ctl == NULL)
		return;
	V_un_ctl = NULL;

	/* Wait till all unhop users finish their reads */
	NET_EPOCH_WAIT();

	UN_WLOCK(ctl);
	CHT_SLIST_FOREACH_SAFE(&ctl->un_head, unhop, unhop, tmp) {
		destroy_unhop(unhop);
	} CHT_SLIST_FOREACH_SAFE_END;
	UN_WUNLOCK(ctl);

	/* Release the bucket array and the control structure itself */
	free(ctl->un_head.ptr, M_NETLINK);
	free(ctl, M_NETLINK);
}
VNET_SYSUNINIT(vnet_destroy_unhops, SI_SUB_PROTO_IF, SI_ORDER_ANY,
    vnet_destroy_unhops, NULL);
658 
659 static int
660 nlattr_get_nhg(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target)
661 {
662 	int error = 0;
663 
664 	/* Verify attribute correctness */
665 	struct nexthop_grp *grp = NLA_DATA(nla);
666 	int data_len = NLA_DATA_LEN(nla);
667 
668 	int count = data_len / sizeof(*grp);
669 	if (count == 0 || (count * sizeof(*grp) != data_len)) {
670 		NL_LOG(LOG_DEBUG, "Invalid length for RTA_GROUP: %d", data_len);
671 		return (EINVAL);
672 	}
673 
674 	*((struct nlattr **)target) = nla;
675 	return (error);
676 }
677 
678 static void
679 set_scope6(struct sockaddr *sa, if_t ifp)
680 {
681 #ifdef INET6
682 	if (sa != NULL && sa->sa_family == AF_INET6 && ifp != NULL) {
683 		struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)sa;
684 
685 		if (IN6_IS_ADDR_LINKLOCAL(&sa6->sin6_addr))
686 			in6_set_unicast_scopeid(&sa6->sin6_addr, if_getindex(ifp));
687 	}
688 #endif
689 }
690 
/* Parsed representation of an RTM_*NEXTHOP request. */
struct nl_parsed_nhop {
	uint32_t	nha_id;		/* NHA_ID: userland nexthop index */
	uint8_t		nha_blackhole;	/* NHA_BLACKHOLE flag present */
	uint8_t		nha_groups;	/* NHA_GROUPS: dump filter flag */
	uint8_t		nhaf_knhops;	/* NHAF_KNHOPS: dump kernel nexthops */
	uint8_t		nhaf_family;	/* NHAF_FAMILY: kernel-dump family filter */
	struct ifnet	*nha_oif;	/* NHA_OIF: transmit interface */
	struct sockaddr	*nha_gw;	/* NHA_GATEWAY: gateway address */
	struct nlattr	*nha_group;	/* NHA_GROUP: raw group attribute */
	uint8_t		nh_family;	/* nhmsg header family field */
	uint8_t		nh_protocol;	/* nhmsg header protocol field */
	uint32_t	nhaf_table;	/* NHAF_TABLE: kernel-dump fib filter */
	uint32_t	nhaf_kid;	/* NHAF_KID: kernel nexthop index */
	uint32_t	nhaf_aif;	/* NHAF_AIF attribute value */
};
706 
707 #define	_IN(_field)	offsetof(struct nhmsg, _field)
708 #define	_OUT(_field)	offsetof(struct nl_parsed_nhop, _field)
709 static struct nlattr_parser nla_p_nh_fbsd[] = {
710 	{ .type = NHAF_KNHOPS, .off = _OUT(nhaf_knhops), .cb = nlattr_get_flag },
711 	{ .type = NHAF_TABLE, .off = _OUT(nhaf_table), .cb = nlattr_get_uint32 },
712 	{ .type = NHAF_FAMILY, .off = _OUT(nhaf_family), .cb = nlattr_get_uint8 },
713 	{ .type = NHAF_KID, .off = _OUT(nhaf_kid), .cb = nlattr_get_uint32 },
714 	{ .type = NHAF_AIF, .off = _OUT(nhaf_aif), .cb = nlattr_get_uint32 },
715 };
716 NL_DECLARE_ATTR_PARSER(nh_fbsd_parser, nla_p_nh_fbsd);
717 
/* nhmsg fixed-header field parsers: copy family/protocol into the attrs struct */
static const struct nlfield_parser nlf_p_nh[] = {
	{ .off_in = _IN(nh_family), .off_out = _OUT(nh_family), .cb = nlf_get_u8 },
	{ .off_in = _IN(nh_protocol), .off_out = _OUT(nh_protocol), .cb = nlf_get_u8 },
};
722 
/* Top-level RTM_*NEXTHOP attribute parsers */
static const struct nlattr_parser nla_p_nh[] = {
	{ .type = NHA_ID, .off = _OUT(nha_id), .cb = nlattr_get_uint32 },
	{ .type = NHA_GROUP, .off = _OUT(nha_group), .cb = nlattr_get_nhg },
	{ .type = NHA_BLACKHOLE, .off = _OUT(nha_blackhole), .cb = nlattr_get_flag },
	{ .type = NHA_OIF, .off = _OUT(nha_oif), .cb = nlattr_get_ifp },
	{ .type = NHA_GATEWAY, .off = _OUT(nha_gw), .cb = nlattr_get_ip },
	{ .type = NHA_GROUPS, .off = _OUT(nha_groups), .cb = nlattr_get_flag },
	{ .type = NHA_FREEBSD, .arg = &nh_fbsd_parser, .cb = nlattr_get_nested },
};
#undef _IN
#undef _OUT
734 
735 static bool
736 post_p_nh(void *_attrs, struct nl_pstate *npt)
737 {
738 	struct nl_parsed_nhop *attrs = (struct nl_parsed_nhop *)_attrs;
739 
740 	set_scope6(attrs->nha_gw, attrs->nha_oif);
741 	return (true);
742 }
743 NL_DECLARE_PARSER_EXT(nhmsg_parser, struct nhmsg, NULL, nlf_p_nh, nla_p_nh, post_p_nh);
744 
745 static bool
746 eligible_nhg(const struct nhop_object *nh)
747 {
748 	return (nh->nh_flags & NHF_GATEWAY);
749 }
750 
751 static int
752 newnhg(struct unhop_ctl *ctl, struct nl_parsed_nhop *attrs, struct user_nhop *unhop)
753 {
754 	struct nexthop_grp *grp = NLA_DATA(attrs->nha_group);
755 	int count = NLA_DATA_LEN(attrs->nha_group) / sizeof(*grp);
756 	struct weightened_nhop *wn;
757 
758 	wn = malloc(sizeof(*wn) * count, M_NETLINK, M_NOWAIT | M_ZERO);
759 	if (wn == NULL)
760 		return (ENOMEM);
761 
762 	for (int i = 0; i < count; i++) {
763 		struct user_nhop *unhop;
764 		unhop = nl_find_base_unhop(ctl, grp[i].id);
765 		if (unhop == NULL) {
766 			NL_LOG(LOG_DEBUG, "unable to find uidx %u", grp[i].id);
767 			free(wn, M_NETLINK);
768 			return (ESRCH);
769 		} else if (unhop->un_nhop_src == NULL) {
770 			NL_LOG(LOG_DEBUG, "uidx %u is a group, nested group unsupported",
771 			    grp[i].id);
772 			free(wn, M_NETLINK);
773 			return (ENOTSUP);
774 		} else if (!eligible_nhg(unhop->un_nhop_src)) {
775 			NL_LOG(LOG_DEBUG, "uidx %u nhop is not mpath-eligible",
776 			    grp[i].id);
777 			free(wn, M_NETLINK);
778 			return (ENOTSUP);
779 		}
780 		/*
781 		 * TODO: consider more rigid eligibility checks:
782 		 * restrict nexthops with the same gateway
783 		 */
784 		wn[i].nh = unhop->un_nhop_src;
785 		wn[i].weight = grp[i].weight;
786 	}
787 	unhop->un_nhgrp_src = wn;
788 	unhop->un_nhgrp_count = count;
789 	return (0);
790 }
791 
/*
 * Sets nexthop @nh gateway specified by @gw.
 * If gateway is IPv6 link-local, alters @gw to include scopeid equal to
 * @ifp ifindex.
 * Returns 0 on success or errno (EINVAL if a link-local gateway was
 * given without an interface).
 */
int
nl_set_nexthop_gw(struct nhop_object *nh, struct sockaddr *gw, if_t ifp,
    struct nl_pstate *npt)
{
#ifdef INET6
	if (gw->sa_family == AF_INET6) {
		struct sockaddr_in6 *gw6 = (struct sockaddr_in6 *)gw;
		if (IN6_IS_ADDR_LINKLOCAL(&gw6->sin6_addr)) {
			/* Link-local gateways are only meaningful with a scope */
			if (ifp == NULL) {
				NLMSG_REPORT_ERR_MSG(npt, "interface not set");
				return (EINVAL);
			}
			in6_set_unicast_scopeid(&gw6->sin6_addr, if_getindex(ifp));
		}
	}
#endif
	nhop_set_gw(nh, gw, true);
	return (0);
}
817 
818 static int
819 newnhop(struct nl_parsed_nhop *attrs, struct user_nhop *unhop, struct nl_pstate *npt)
820 {
821 	struct ifaddr *ifa = NULL;
822 	struct nhop_object *nh;
823 	int error;
824 
825 	if (!attrs->nha_blackhole) {
826 		if (attrs->nha_gw == NULL) {
827 			NLMSG_REPORT_ERR_MSG(npt, "missing NHA_GATEWAY");
828 			return (EINVAL);
829 		}
830 		if (attrs->nha_oif == NULL) {
831 			NLMSG_REPORT_ERR_MSG(npt, "missing NHA_OIF");
832 			return (EINVAL);
833 		}
834 		if (ifa == NULL)
835 			ifa = ifaof_ifpforaddr(attrs->nha_gw, attrs->nha_oif);
836 		if (ifa == NULL) {
837 			NLMSG_REPORT_ERR_MSG(npt, "Unable to determine default source IP");
838 			return (EINVAL);
839 		}
840 	}
841 
842 	int family = attrs->nha_gw != NULL ? attrs->nha_gw->sa_family : attrs->nh_family;
843 
844 	nh = nhop_alloc(RT_DEFAULT_FIB, family);
845 	if (nh == NULL) {
846 		NL_LOG(LOG_DEBUG, "Unable to allocate nexthop");
847 		return (ENOMEM);
848 	}
849 	nhop_set_uidx(nh, attrs->nha_id);
850 	nhop_set_origin(nh, attrs->nh_protocol);
851 
852 	if (attrs->nha_blackhole)
853 		nhop_set_blackhole(nh, NHF_BLACKHOLE);
854 	else {
855 		error = nl_set_nexthop_gw(nh, attrs->nha_gw, attrs->nha_oif, npt);
856 		if (error != 0) {
857 			nhop_free(nh);
858 			return (error);
859 		}
860 		nhop_set_transmit_ifp(nh, attrs->nha_oif);
861 		nhop_set_src(nh, ifa);
862 	}
863 
864 	error = nhop_get_unlinked(nh);
865 	if (error != 0) {
866 		NL_LOG(LOG_DEBUG, "unable to finalize nexthop");
867 		return (error);
868 	}
869 
870 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
871 		char nhbuf[NHOP_PRINT_BUFSIZE];
872 		nhop_print_buf(nh, nhbuf, sizeof(nhbuf));
873 		NL_LOG(LOG_DEBUG2, "Adding unhop %u: %s", attrs->nha_id, nhbuf);
874 	}
875 
876 	unhop->un_nhop_src = nh;
877 	return (0);
878 }
879 
/*
 * Handles RTM_NEWNEXTHOP: parses the request, creates a user nexthop
 * (or nexthop group) record, inserts it into the per-vnet hash and
 * broadcasts the addition to RTNLGRP_NEXTHOP listeners.
 * Returns 0 or errno.
 */
static int
rtnl_handle_newnhop(struct nlmsghdr *hdr, struct nlpcb *nlp,
    struct nl_pstate *npt)
{
	struct user_nhop *unhop;
	int error;

	/* Lazily initialize the per-vnet hash on first use */
        if ((__predict_false(V_un_ctl == NULL)) && (!vnet_init_unhops()))
		return (ENOMEM);
	struct unhop_ctl *ctl = V_un_ctl;

	struct nl_parsed_nhop attrs = {};
	error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs);
	if (error != 0)
		return (error);

	/*
	 * Get valid nha_id. Treat nha_id == 0 (auto-assignment) as a second-class
	 *  citizen.
	 */
	if (attrs.nha_id == 0) {
		attrs.nha_id = find_spare_uidx(ctl);
		if (attrs.nha_id == 0) {
			NL_LOG(LOG_DEBUG, "Unable to get spare uidx");
			return (ENOSPC);
		}
	}

	NL_LOG(LOG_DEBUG, "IFINDEX %d", attrs.nha_oif ? if_getindex(attrs.nha_oif) : 0);

	unhop = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO);
	if (unhop == NULL) {
		NL_LOG(LOG_DEBUG, "Unable to allocate user_nhop");
		return (ENOMEM);
	}
	unhop->un_idx = attrs.nha_id;
	unhop->un_protocol = attrs.nh_protocol;

	/* NHA_GROUP presence selects group vs plain nexthop creation */
	if (attrs.nha_group)
		error = newnhg(ctl, &attrs, unhop);
	else
		error = newnhop(&attrs, unhop, npt);

	if (error != 0) {
		free(unhop, M_NETLINK);
		return (error);
	}

	UN_WLOCK(ctl);
	/* Check if uidx already exists */
	struct user_nhop *tmp = NULL;
	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, unhop, tmp);
	if (tmp != NULL) {
		UN_WUNLOCK(ctl);
		NL_LOG(LOG_DEBUG, "nhop idx %u already exists", attrs.nha_id);
		destroy_unhop(unhop);
		return (EEXIST);
	}
	CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, unhop);
	uint32_t num_buckets_new = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->un_head);
	UN_WUNLOCK(ctl);

	/* Report addition of the next nexhop */
	struct netlink_walkargs wa = {
		.hdr.nlmsg_pid = hdr->nlmsg_pid,
		.hdr.nlmsg_seq = hdr->nlmsg_seq,
		.hdr.nlmsg_flags = hdr->nlmsg_flags,
		.hdr.nlmsg_type = NL_RTM_NEWNEXTHOP,
	};

	struct nl_writer nw = {};
	/*
	 * NOTE(review): on writer allocation failure the nexthop stays
	 * inserted although ENOMEM is returned — confirm this is intended.
	 */
	if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP)) {
		NL_LOG(LOG_DEBUG, "error allocating message writer");
		return (ENOMEM);
	}

	dump_unhop(unhop, &wa.hdr, &nw);
	nlmsg_flush(&nw);

	/* Grow the hash outside the lock if the load factor calls for it */
	consider_resize(ctl, num_buckets_new);

        return (0);
}
963 
964 static int
965 rtnl_handle_delnhop(struct nlmsghdr *hdr, struct nlpcb *nlp,
966     struct nl_pstate *npt)
967 {
968 	struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
969 	int error;
970 
971 	if (__predict_false(ctl == NULL))
972 		return (ESRCH);
973 
974 	struct nl_parsed_nhop attrs = {};
975 	error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs);
976 	if (error != 0)
977 		return (error);
978 
979 	if (attrs.nha_id == 0) {
980 		NL_LOG(LOG_DEBUG, "NHA_ID not set");
981 		return (EINVAL);
982 	}
983 
984 	error = delete_unhop(ctl, hdr, attrs.nha_id);
985 
986         return (error);
987 }
988 
989 static bool
990 match_unhop(const struct nl_parsed_nhop *attrs, struct user_nhop *unhop)
991 {
992 	if (attrs->nha_id != 0 && unhop->un_idx != attrs->nha_id)
993 		return (false);
994 	if (attrs->nha_groups != 0 && unhop->un_nhgrp_src == NULL)
995 		return (false);
996 	if (attrs->nha_oif != NULL &&
997 	    (unhop->un_nhop_src == NULL || unhop->un_nhop_src->nh_ifp != attrs->nha_oif))
998 		return (false);
999 
1000 	return (true);
1001 }
1002 
/*
 * Handles RTM_GETNEXTHOP. Four request shapes are supported:
 *  1) NHA_ID set: return a single user nexthop;
 *  2) NHAF_KID set: return a single kernel nexthop by kernel index;
 *  3) NHAF_KNHOPS flag: dump all kernel nexthops for table/family;
 *  4) otherwise: dump all master user nexthops matching the filters.
 * Returns 0 or errno.
 */
static int
rtnl_handle_getnhop(struct nlmsghdr *hdr, struct nlpcb *nlp,
    struct nl_pstate *npt)
{
	struct user_nhop *unhop;
	UN_TRACKER;
	int error;

	struct nl_parsed_nhop attrs = {};
	error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs);
	if (error != 0)
		return (error);

	struct netlink_walkargs wa = {
		.nw = npt->nw,
		.hdr.nlmsg_pid = hdr->nlmsg_pid,
		.hdr.nlmsg_seq = hdr->nlmsg_seq,
		.hdr.nlmsg_flags = hdr->nlmsg_flags,
		.hdr.nlmsg_type = NL_RTM_NEWNEXTHOP,
	};

	if (attrs.nha_id != 0) {
		/* Single user nexthop lookup */
		struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
		struct user_nhop key = { .un_idx = attrs.nha_id };

		if (__predict_false(ctl == NULL))
			return (ESRCH);

		NL_LOG(LOG_DEBUG2, "searching for uidx %u", attrs.nha_id);
		UN_RLOCK(ctl);
		CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
		UN_RUNLOCK(ctl);

		if (unhop == NULL)
			return (ESRCH);
		dump_unhop(unhop, &wa.hdr, wa.nw);
		return (0);
	} else if (attrs.nhaf_kid != 0) {
		/* Single kernel nexthop lookup via linear iteration */
		struct nhop_iter iter = {
			.fibnum = attrs.nhaf_table,
			.family = attrs.nhaf_family,
		};
		int error = ESRCH;

		NL_LOG(LOG_DEBUG2, "START table %u family %d", attrs.nhaf_table, attrs.nhaf_family);
		for (struct nhop_object *nh = nhops_iter_start(&iter); nh;
		    nh = nhops_iter_next(&iter)) {
			NL_LOG(LOG_DEBUG3, "get %u", nhop_get_idx(nh));
			if (nhop_get_idx(nh) == attrs.nhaf_kid) {
				dump_nhop(nh, 0, &wa.hdr, wa.nw);
				error = 0;
				break;
			}
		}
		nhops_iter_stop(&iter);
		return (error);
	} else if (attrs.nhaf_knhops) {
		/* Full kernel nexthop dump for the given table/family */
		struct nhop_iter iter = {
			.fibnum = attrs.nhaf_table,
			.family = attrs.nhaf_family,
		};

		NL_LOG(LOG_DEBUG2, "DUMP table %u family %d", attrs.nhaf_table, attrs.nhaf_family);
		wa.hdr.nlmsg_flags |= NLM_F_MULTI;
		for (struct nhop_object *nh = nhops_iter_start(&iter); nh;
		    nh = nhops_iter_next(&iter)) {
			dump_nhop(nh, 0, &wa.hdr, wa.nw);
		}
		nhops_iter_stop(&iter);
	} else {
		/* Dump all master user nexthops matching the filters */
		struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);

		if (__predict_false(ctl == NULL))
			return (ESRCH);

		NL_LOG(LOG_DEBUG2, "DUMP unhops");
		UN_RLOCK(ctl);
		wa.hdr.nlmsg_flags |= NLM_F_MULTI;
		CHT_SLIST_FOREACH(&ctl->un_head, unhop, unhop) {
			if (UNHOP_IS_MASTER(unhop) && match_unhop(&attrs, unhop))
				dump_unhop(unhop, &wa.hdr, wa.nw);
		} CHT_SLIST_FOREACH_END;
		UN_RUNLOCK(ctl);
	}

	/*
	 * NOTE(review): wa.error is never set by the dump helpers above,
	 * so dump failures are not propagated here — confirm intended.
	 */
	if (wa.error == 0) {
		if (!nlmsg_end_dump(wa.nw, wa.error, &wa.hdr))
			return (ENOMEM);
	}
        return (0);
}
1094 
/*
 * RTM_{NEW,DEL,GET}NEXTHOP dispatch table; NEW and DEL require
 * PRIV_NET_ROUTE, GET is unprivileged.
 */
static const struct rtnl_cmd_handler cmd_handlers[] = {
	{
		.cmd = NL_RTM_NEWNEXTHOP,
		.name = "RTM_NEWNEXTHOP",
		.cb = &rtnl_handle_newnhop,
		.priv = PRIV_NET_ROUTE,
	},
	{
		.cmd = NL_RTM_DELNEXTHOP,
		.name = "RTM_DELNEXTHOP",
		.cb = &rtnl_handle_delnhop,
		.priv = PRIV_NET_ROUTE,
	},
	{
		.cmd = NL_RTM_GETNEXTHOP,
		.name = "RTM_GETNEXTHOP",
		.cb = &rtnl_handle_getnhop,
	}
};
1114 
static const struct nlhdr_parser *all_parsers[] = { &nhmsg_parser, &nh_fbsd_parser };

/*
 * Verifies the message parsers and registers the nexthop command
 * handlers with the rtnetlink dispatcher.
 */
void
rtnl_nexthops_init(void)
{
	NL_VERIFY_PARSERS(all_parsers);
	rtnl_register_messages(cmd_handlers, NL_ARRAY_LEN(cmd_handlers));
}
1123