xref: /freebsd/sys/netlink/route/nexthop.c (revision 51015e6d0f570239b0c2088dc6cf2b018928375d)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 #include "opt_inet.h"
31 #include "opt_inet6.h"
32 #include "opt_route.h"
33 #include <sys/types.h>
34 #include <sys/ck.h>
35 #include <sys/epoch.h>
36 #include <sys/kernel.h>
37 #include <sys/malloc.h>
38 #include <sys/rmlock.h>
39 #include <sys/socket.h>
40 
41 #include <net/if.h>
42 #include <net/route.h>
43 #include <net/route/nhop.h>
44 #include <net/route/nhop_utils.h>
45 
46 #include <net/route/route_ctl.h>
47 #include <net/route/route_var.h>
48 #include <netinet6/scope6_var.h>
49 #include <netlink/netlink.h>
50 #include <netlink/netlink_ctl.h>
51 #include <netlink/netlink_route.h>
52 #include <netlink/route/route_var.h>
53 
54 #define	DEBUG_MOD_NAME	nl_nhop
55 #define	DEBUG_MAX_LEVEL	LOG_DEBUG3
56 #include <netlink/netlink_debug.h>
57 _DECLARE_DEBUG(LOG_DEBUG3);
58 
59 /*
60  * This file contains the logic to maintain kernel nexthops and
61  *  nexthop groups based on the data provided by the user.
62  *
63  * Kernel stores (nearly) all of the routing data in the nexthops,
64  *  including the prefix-specific flags (NHF_HOST and NHF_DEFAULT).
65  *
66  * Netlink API provides higher-level abstraction for the user. Each
67  *  user-created nexthop may map to multiple kernel nexthops.
68  *
69  * The following variations require separate kernel nexthop to be
70  *  created:
71  *  * prefix flags (NHF_HOST, NHF_DEFAULT)
72  *  * using IPv6 gateway for IPv4 routes
73  *  * different fibnum
74  *
75  * These kernel nexthops have the lifetime bound to the lifetime of
76  *  the user_nhop object. They are not collected until user requests
77  *  to delete the created user_nhop.
78  *
79  */
80 struct user_nhop {
81         uint32_t                        un_idx; /* Userland-provided index */
82 	uint32_t			un_fibfam; /* fibnum+af(as highest byte) */
83 	uint8_t				un_protocol; /* protocol that install the record */
84 	struct nhop_object		*un_nhop; /* "production" nexthop */
85 	struct nhop_object		*un_nhop_src; /* nexthop to copy from */
86 	struct weightened_nhop		*un_nhgrp_src; /* nexthops for nhg */
87 	uint32_t			un_nhgrp_count; /* number of nexthops */
88         struct user_nhop		*un_next; /* next item in hash chain */
89         struct user_nhop		*un_nextchild; /* master -> children */
90 	struct epoch_context		un_epoch_ctx;	/* epoch ctl helper */
91 };
92 
93 /* produce hash value for an object */
94 #define	unhop_hash_obj(_obj)	(hash_unhop(_obj))
95 /* compare two objects */
96 #define	unhop_cmp(_one, _two)	(cmp_unhop(_one, _two))
97 /* next object accessor */
98 #define	unhop_next(_obj)	(_obj)->un_next
99 
100 CHT_SLIST_DEFINE(unhop, struct user_nhop);
101 
102 struct unhop_ctl {
103 	struct unhop_head	un_head;
104 	struct rmlock		un_lock;
105 };
106 #define	UN_LOCK_INIT(_ctl)	rm_init(&(_ctl)->un_lock, "unhop_ctl")
107 #define	UN_TRACKER		struct rm_priotracker un_tracker
108 #define	UN_RLOCK(_ctl)		rm_rlock(&((_ctl)->un_lock), &un_tracker)
109 #define	UN_RUNLOCK(_ctl)	rm_runlock(&((_ctl)->un_lock), &un_tracker)
110 
111 #define	UN_WLOCK(_ctl)		rm_wlock(&(_ctl)->un_lock);
112 #define	UN_WUNLOCK(_ctl)	rm_wunlock(&(_ctl)->un_lock);
113 
114 VNET_DEFINE_STATIC(struct unhop_ctl *, un_ctl) = NULL;
115 #define V_un_ctl	VNET(un_ctl)
116 
/* Forward declarations of helpers defined further down in this file. */
static unsigned int hash_unhop(const struct user_nhop *obj);
static int cmp_unhop(const struct user_nhop *a, const struct user_nhop *b);
static void consider_resize(struct unhop_ctl *ctl, uint32_t new_size);

static struct nhop_object *clone_unhop(const struct user_nhop *unhop,
    uint32_t fibnum, int family, int nh_flags);
static void destroy_unhop(struct user_nhop *unhop);
124 
125 static int
126 cmp_unhop(const struct user_nhop *a, const struct user_nhop *b)
127 {
128         return (a->un_idx == b->un_idx && a->un_fibfam == b->un_fibfam);
129 }
130 
131 /*
132  * Hash callback: calculate hash of an object
133  */
134 static unsigned int
135 hash_unhop(const struct user_nhop *obj)
136 {
137         return (obj->un_idx ^ obj->un_fibfam);
138 }
139 
140 #define	UNHOP_IS_MASTER(_unhop)	((_unhop)->un_fibfam == 0)
141 
142 /*
143  * Factory interface for creating matching kernel nexthops/nexthop groups
144  *
145  * @uidx: userland nexhop index used to create the nexthop
146  * @fibnum: fibnum nexthop will be used in
147  * @family: upper family nexthop will be used in
148  * @nh_flags: desired nexthop prefix flags
149  * @perror: pointer to store error to
150  *
151  * Returns referenced nexthop linked to @fibnum/@family rib on success.
152  */
153 struct nhop_object *
154 nl_find_nhop(uint32_t fibnum, int family, uint32_t uidx,
155     int nh_flags, int *perror)
156 {
157 	struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
158         UN_TRACKER;
159 
160 	if (__predict_false(ctl == NULL))
161 		return (NULL);
162 
163 	struct user_nhop key= {
164 		.un_idx = uidx,
165 		.un_fibfam = fibnum  | ((uint32_t)family) << 24,
166 	};
167 	struct user_nhop *unhop;
168 
169 	nh_flags = nh_flags & (NHF_HOST | NHF_DEFAULT);
170 
171 	if (__predict_false(family == 0))
172 		return (NULL);
173 
174 	UN_RLOCK(ctl);
175 	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
176 	if (unhop != NULL) {
177 		struct nhop_object *nh = unhop->un_nhop;
178 		UN_RLOCK(ctl);
179 		*perror = 0;
180 		nhop_ref_any(nh);
181 		return (nh);
182 	}
183 
184 	/*
185 	 * Exact nexthop not found. Search for template nexthop to clone from.
186 	 */
187 	key.un_fibfam = 0;
188 	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
189 	if (unhop == NULL) {
190 		UN_RUNLOCK(ctl);
191 		*perror = ESRCH;
192 		return (NULL);
193 	}
194 
195 	UN_RUNLOCK(ctl);
196 
197 	/* Create entry to insert first */
198 	struct user_nhop *un_new, *un_tmp;
199 	un_new = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO);
200 	if (un_new == NULL) {
201 		*perror = ENOMEM;
202 		return (NULL);
203 	}
204 	un_new->un_idx = uidx;
205 	un_new->un_fibfam = fibnum  | ((uint32_t)family) << 24;
206 
207 	/* Relying on epoch to protect unhop here */
208 	un_new->un_nhop = clone_unhop(unhop, fibnum, family, nh_flags);
209 	if (un_new->un_nhop == NULL) {
210 		free(un_new, M_NETLINK);
211 		*perror = ENOMEM;
212 		return (NULL);
213 	}
214 
215 	/* Insert back and report */
216 	UN_WLOCK(ctl);
217 
218 	/* First, find template record once again */
219 	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
220 	if (unhop == NULL) {
221 		/* Someone deleted the nexthop during the call */
222 		UN_WUNLOCK(ctl);
223 		*perror = ESRCH;
224 		destroy_unhop(un_new);
225 		return (NULL);
226 	}
227 
228 	/* Second, check the direct match */
229 	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, un_new, un_tmp);
230 	struct nhop_object *nh;
231 	if (un_tmp != NULL) {
232 		/* Another thread already created the desired nextop, use it */
233 		nh = un_tmp->un_nhop;
234 	} else {
235 		/* Finally, insert the new nexthop and link it to the primary */
236 		nh = un_new->un_nhop;
237 		CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, un_new);
238 		un_new->un_nextchild = unhop->un_nextchild;
239 		unhop->un_nextchild = un_new;
240 		un_new = NULL;
241 		NL_LOG(LOG_DEBUG2, "linked cloned nexthop %p", nh);
242 	}
243 
244 	UN_WUNLOCK(ctl);
245 
246 	if (un_new != NULL)
247 		destroy_unhop(un_new);
248 
249 	*perror = 0;
250 	nhop_ref_any(nh);
251 	return (nh);
252 }
253 
254 static struct user_nhop *
255 nl_find_base_unhop(struct unhop_ctl *ctl, uint32_t uidx)
256 {
257 	struct user_nhop key= { .un_idx = uidx };
258 	struct user_nhop *unhop = NULL;
259 	UN_TRACKER;
260 
261 	UN_RLOCK(ctl);
262 	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
263 	UN_RUNLOCK(ctl);
264 
265 	return (unhop);
266 }
267 
268 #define MAX_STACK_NHOPS	4
269 static struct nhop_object *
270 clone_unhop(const struct user_nhop *unhop, uint32_t fibnum, int family, int nh_flags)
271 {
272 #ifdef ROUTE_MPATH
273 	const struct weightened_nhop *wn;
274 	struct weightened_nhop *wn_new, wn_base[MAX_STACK_NHOPS];
275 	uint32_t num_nhops;
276 #endif
277 	struct nhop_object *nh = NULL;
278 	int error;
279 
280 	if (unhop->un_nhop_src != NULL) {
281 		IF_DEBUG_LEVEL(LOG_DEBUG2) {
282 			char nhbuf[NHOP_PRINT_BUFSIZE];
283 			nhop_print_buf_any(unhop->un_nhop_src, nhbuf, sizeof(nhbuf));
284 			FIB_NH_LOG(LOG_DEBUG2, unhop->un_nhop_src,
285 			    "cloning nhop %s -> %u.%u flags 0x%X", nhbuf, fibnum,
286 			    family, nh_flags);
287 		}
288 		struct nhop_object *nh;
289 		nh = nhop_alloc(fibnum, AF_UNSPEC);
290 		if (nh == NULL)
291 			return (NULL);
292 		nhop_copy(nh, unhop->un_nhop_src);
293 		/* Check that nexthop gateway is compatible with the new family */
294 		if (!nhop_set_upper_family(nh, family)) {
295 			nhop_free(nh);
296 			return (NULL);
297 		}
298 		nhop_set_uidx(nh, unhop->un_idx);
299 		nhop_set_pxtype_flag(nh, nh_flags);
300 		return (nhop_get_nhop(nh, &error));
301 	}
302 #ifdef ROUTE_MPATH
303 	wn = unhop->un_nhgrp_src;
304 	num_nhops = unhop->un_nhgrp_count;
305 
306 	if (num_nhops > MAX_STACK_NHOPS) {
307 		wn_new = malloc(num_nhops * sizeof(struct weightened_nhop), M_TEMP, M_NOWAIT);
308 		if (wn_new == NULL)
309 			return (NULL);
310 	} else
311 		wn_new = wn_base;
312 
313 	for (int i = 0; i < num_nhops; i++) {
314 		uint32_t uidx = nhop_get_uidx(wn[i].nh);
315 		MPASS(uidx != 0);
316 		wn_new[i].nh = nl_find_nhop(fibnum, family, uidx, nh_flags, &error);
317 		if (error != 0)
318 			break;
319 		wn_new[i].weight = wn[i].weight;
320 	}
321 
322 	if (error == 0) {
323 		struct rib_head *rh = nhop_get_rh(wn_new[0].nh);
324 		struct nhgrp_object *nhg;
325 
326 		error = nhgrp_get_group(rh, wn_new, num_nhops, unhop->un_idx, &nhg);
327 		nh = (struct nhop_object *)nhg;
328 	}
329 
330 	if (wn_new != wn_base)
331 		free(wn_new, M_TEMP);
332 #endif
333 	return (nh);
334 }
335 
336 static void
337 destroy_unhop(struct user_nhop *unhop)
338 {
339 	if (unhop->un_nhop != NULL)
340 		nhop_free_any(unhop->un_nhop);
341 	if (unhop->un_nhop_src != NULL)
342 		nhop_free_any(unhop->un_nhop_src);
343 	free(unhop, M_NETLINK);
344 }
345 
346 static void
347 destroy_unhop_epoch(epoch_context_t ctx)
348 {
349 	struct user_nhop *unhop;
350 
351 	unhop = __containerof(ctx, struct user_nhop, un_epoch_ctx);
352 
353 	destroy_unhop(unhop);
354 }
355 
356 static uint32_t
357 find_spare_uidx(struct unhop_ctl *ctl)
358 {
359 	struct user_nhop *unhop, key = {};
360 	uint32_t uidx = 0;
361 	UN_TRACKER;
362 
363 	UN_RLOCK(ctl);
364 	/* This should return spare uid with 75% of 65k used in ~99/100 cases */
365 	for (int i = 0; i < 16; i++) {
366 		key.un_idx = (arc4random() % 65536) + 65536 * 4;
367 		CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
368 		if (unhop == NULL) {
369 			uidx = key.un_idx;
370 			break;
371 		}
372 	}
373 	UN_RUNLOCK(ctl);
374 
375 	return (uidx);
376 }
377 
378 
379 /*
380  * Actual netlink code
381  */
382 struct netlink_walkargs {
383 	struct nl_writer *nw;
384 	struct nlmsghdr hdr;
385 	struct nlpcb *so;
386 	int family;
387 	int error;
388 	int count;
389 	int dumped;
390 };
391 #define	ENOMEM_IF_NULL(_v)	if ((_v) == NULL) goto enomem
392 
393 static bool
394 dump_nhgrp(const struct user_nhop *unhop, struct nlmsghdr *hdr,
395     struct nl_writer *nw)
396 {
397 
398 	if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg)))
399 		goto enomem;
400 
401 	struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg);
402 	nhm->nh_family = AF_UNSPEC;
403 	nhm->nh_scope = 0;
404 	nhm->nh_protocol = unhop->un_protocol;
405 	nhm->nh_flags = 0;
406 
407 	nlattr_add_u32(nw, NHA_ID, unhop->un_idx);
408 	nlattr_add_u16(nw, NHA_GROUP_TYPE, NEXTHOP_GRP_TYPE_MPATH);
409 
410 	struct weightened_nhop *wn = unhop->un_nhgrp_src;
411 	uint32_t num_nhops = unhop->un_nhgrp_count;
412 	/* TODO: a better API? */
413 	int nla_len = sizeof(struct nlattr);
414 	nla_len += NETLINK_ALIGN(num_nhops * sizeof(struct nexthop_grp));
415 	struct nlattr *nla = nlmsg_reserve_data(nw, nla_len, struct nlattr);
416 	if (nla == NULL)
417 		goto enomem;
418 	nla->nla_type = NHA_GROUP;
419 	nla->nla_len = nla_len;
420 	for (int i = 0; i < num_nhops; i++) {
421 		struct nexthop_grp *grp = &((struct nexthop_grp *)(nla + 1))[i];
422 		grp->id = nhop_get_uidx(wn[i].nh);
423 		grp->weight = wn[i].weight;
424 		grp->resvd1 = 0;
425 		grp->resvd2 = 0;
426 	}
427 
428         if (nlmsg_end(nw))
429 		return (true);
430 enomem:
431 	NL_LOG(LOG_DEBUG, "error: unable to allocate attribute memory");
432         nlmsg_abort(nw);
433 	return (false);
434 }
435 
436 static bool
437 dump_nhop(const struct user_nhop *unhop, struct nlmsghdr *hdr,
438     struct nl_writer *nw)
439 {
440 	struct nhop_object *nh = unhop->un_nhop_src;
441 
442 	if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg)))
443 		goto enomem;
444 
445 	struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg);
446 	ENOMEM_IF_NULL(nhm);
447 	nhm->nh_family = nhop_get_neigh_family(nh);
448 	nhm->nh_scope = 0; // XXX: what's that?
449 	nhm->nh_protocol = unhop->un_protocol;
450 	nhm->nh_flags = 0;
451 
452 	nlattr_add_u32(nw, NHA_ID, unhop->un_idx);
453 	if (nh->nh_flags & NHF_BLACKHOLE) {
454 		nlattr_add_flag(nw, NHA_BLACKHOLE);
455 		goto done;
456 	}
457 	nlattr_add_u32(nw, NHA_OIF, nh->nh_ifp->if_index);
458 
459 	switch (nh->gw_sa.sa_family) {
460 #ifdef INET
461 	case AF_INET:
462 		nlattr_add(nw, NHA_GATEWAY, 4, &nh->gw4_sa.sin_addr);
463 		break;
464 #endif
465 #ifdef INET6
466 	case AF_INET6:
467 		{
468 			struct in6_addr addr = nh->gw6_sa.sin6_addr;
469 			in6_clearscope(&addr);
470 			nlattr_add(nw, NHA_GATEWAY, 16, &addr);
471 			break;
472 		}
473 #endif
474 	}
475 
476 done:
477         if (nlmsg_end(nw))
478 		return (true);
479 enomem:
480 	nlmsg_abort(nw);
481 	return (false);
482 }
483 
484 static void
485 dump_unhop(const struct user_nhop *unhop, struct nlmsghdr *hdr,
486     struct nl_writer *nw)
487 {
488 	if (unhop->un_nhop_src != NULL)
489 		dump_nhop(unhop, hdr, nw);
490 	else
491 		dump_nhgrp(unhop, hdr, nw);
492 }
493 
494 static int
495 delete_unhop(struct unhop_ctl *ctl, struct nlmsghdr *hdr, uint32_t uidx)
496 {
497 	struct user_nhop *unhop_ret, *unhop_base, *unhop_chain;
498 
499 	struct user_nhop key = { .un_idx = uidx };
500 
501 	UN_WLOCK(ctl);
502 
503 	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop_base);
504 
505 	if (unhop_base != NULL) {
506 		CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_base, unhop_ret);
507 		IF_DEBUG_LEVEL(LOG_DEBUG2) {
508 			char nhbuf[NHOP_PRINT_BUFSIZE];
509 			nhop_print_buf_any(unhop_base->un_nhop, nhbuf, sizeof(nhbuf));
510 			FIB_NH_LOG(LOG_DEBUG3, unhop_base->un_nhop,
511 			    "removed base nhop %u: %s", uidx, nhbuf);
512 		}
513 		/* Unlink all child nexhops as well, keeping the chain intact */
514 		unhop_chain = unhop_base->un_nextchild;
515 		while (unhop_chain != NULL) {
516 			CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_chain,
517 			    unhop_ret);
518 			MPASS(unhop_chain == unhop_ret);
519 			IF_DEBUG_LEVEL(LOG_DEBUG3) {
520 				char nhbuf[NHOP_PRINT_BUFSIZE];
521 				nhop_print_buf_any(unhop_chain->un_nhop,
522 				    nhbuf, sizeof(nhbuf));
523 				FIB_NH_LOG(LOG_DEBUG3, unhop_chain->un_nhop,
524 				    "removed child nhop %u: %s", uidx, nhbuf);
525 			}
526 			unhop_chain = unhop_chain->un_nextchild;
527 		}
528 	}
529 
530 	UN_WUNLOCK(ctl);
531 
532 	if (unhop_base == NULL) {
533 		NL_LOG(LOG_DEBUG, "unable to find unhop %u", uidx);
534 		return (ENOENT);
535 	}
536 
537 	/* Report nexthop deletion */
538 	struct netlink_walkargs wa = {
539 		.hdr.nlmsg_pid = hdr->nlmsg_pid,
540 		.hdr.nlmsg_seq = hdr->nlmsg_seq,
541 		.hdr.nlmsg_flags = hdr->nlmsg_flags,
542 		.hdr.nlmsg_type = NL_RTM_DELNEXTHOP,
543 	};
544 
545 	struct nl_writer nw = {};
546 	if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP)) {
547 		NL_LOG(LOG_DEBUG, "error allocating message writer");
548 		return (ENOMEM);
549 	}
550 
551 	dump_unhop(unhop_base, &wa.hdr, &nw);
552 	nlmsg_flush(&nw);
553 
554 	while (unhop_base != NULL) {
555 		unhop_chain = unhop_base->un_nextchild;
556 		NET_EPOCH_CALL(destroy_unhop_epoch, &unhop_base->un_epoch_ctx);
557 		unhop_base = unhop_chain;
558 	}
559 
560 	return (0);
561 }
562 
563 static void
564 consider_resize(struct unhop_ctl *ctl, uint32_t new_size)
565 {
566 	void *new_ptr = NULL;
567 	size_t alloc_size;
568 
569         if (new_size == 0)
570                 return;
571 
572 	if (new_size != 0) {
573 		alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_size);
574 		new_ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO);
575                 if (new_ptr == NULL)
576                         return;
577 	}
578 
579 	NL_LOG(LOG_DEBUG, "resizing hash: %u -> %u", ctl->un_head.hash_size, new_size);
580 	UN_WLOCK(ctl);
581 	if (new_ptr != NULL) {
582 		CHT_SLIST_RESIZE(&ctl->un_head, unhop, new_ptr, new_size);
583 	}
584 	UN_WUNLOCK(ctl);
585 
586 
587 	if (new_ptr != NULL)
588 		free(new_ptr, M_NETLINK);
589 }
590 
591 static bool __noinline
592 vnet_init_unhops(void)
593 {
594         uint32_t num_buckets = 16;
595         size_t alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets);
596 
597         struct unhop_ctl *ctl = malloc(sizeof(struct unhop_ctl), M_NETLINK,
598             M_NOWAIT | M_ZERO);
599         if (ctl == NULL)
600                 return (false);
601 
602         void *ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO);
603         if (ptr == NULL) {
604 		free(ctl, M_NETLINK);
605                 return (false);
606 	}
607         CHT_SLIST_INIT(&ctl->un_head, ptr, num_buckets);
608 	UN_LOCK_INIT(ctl);
609 
610 	if (!atomic_cmpset_ptr((uintptr_t *)&V_un_ctl, (uintptr_t)NULL, (uintptr_t)ctl)) {
611                 free(ptr, M_NETLINK);
612                 free(ctl, M_NETLINK);
613 	}
614 
615 	if (atomic_load_ptr(&V_un_ctl) == NULL)
616 		return (false);
617 
618 	NL_LOG(LOG_NOTICE, "UNHOPS init done");
619 
620         return (true);
621 }
622 
623 static void
624 vnet_destroy_unhops(const void *unused __unused)
625 {
626 	struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
627 	struct user_nhop *unhop, *tmp;
628 
629 	if (ctl == NULL)
630 		return;
631 	V_un_ctl = NULL;
632 
633 	/* Wait till all unhop users finish their reads */
634 	NET_EPOCH_WAIT();
635 
636 	UN_WLOCK(ctl);
637 	CHT_SLIST_FOREACH_SAFE(&ctl->un_head, unhop, unhop, tmp) {
638 		destroy_unhop(unhop);
639 	} CHT_SLIST_FOREACH_SAFE_END;
640 	UN_WUNLOCK(ctl);
641 
642 	free(ctl->un_head.ptr, M_NETLINK);
643 	free(ctl, M_NETLINK);
644 }
645 VNET_SYSUNINIT(vnet_destroy_unhops, SI_SUB_PROTO_IF, SI_ORDER_ANY,
646     vnet_destroy_unhops, NULL);
647 
648 static int
649 nlattr_get_nhg(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target)
650 {
651 	int error = 0;
652 
653 	/* Verify attribute correctness */
654 	struct nexthop_grp *grp = NLA_DATA(nla);
655 	int data_len = NLA_DATA_LEN(nla);
656 
657 	int count = data_len / sizeof(*grp);
658 	if (count == 0 || (count * sizeof(*grp) != data_len)) {
659 		NL_LOG(LOG_DEBUG, "Invalid length for RTA_GROUP: %d", data_len);
660 		return (EINVAL);
661 	}
662 
663 	*((struct nlattr **)target) = nla;
664 	return (error);
665 }
666 
667 struct nl_parsed_nhop {
668 	uint32_t	nha_id;
669 	uint8_t		nha_blackhole;
670 	uint8_t		nha_groups;
671 	struct ifnet	*nha_oif;
672 	struct sockaddr	*nha_gw;
673 	struct nlattr	*nha_group;
674 	uint8_t		nh_family;
675 	uint8_t		nh_protocol;
676 };
677 
678 #define	_IN(_field)	offsetof(struct nhmsg, _field)
679 #define	_OUT(_field)	offsetof(struct nl_parsed_nhop, _field)
680 static const struct nlfield_parser nlf_p_nh[] = {
681 	{ .off_in = _IN(nh_family), .off_out = _OUT(nh_family), .cb = nlf_get_u8 },
682 	{ .off_in = _IN(nh_protocol), .off_out = _OUT(nh_protocol), .cb = nlf_get_u8 },
683 };
684 
685 static const struct nlattr_parser nla_p_nh[] = {
686 	{ .type = NHA_ID, .off = _OUT(nha_id), .cb = nlattr_get_uint32 },
687 	{ .type = NHA_GROUP, .off = _OUT(nha_group), .cb = nlattr_get_nhg },
688 	{ .type = NHA_BLACKHOLE, .off = _OUT(nha_blackhole), .cb = nlattr_get_flag },
689 	{ .type = NHA_OIF, .off = _OUT(nha_oif), .cb = nlattr_get_ifp },
690 	{ .type = NHA_GATEWAY, .off = _OUT(nha_gw), .cb = nlattr_get_ip },
691 	{ .type = NHA_GROUPS, .off = _OUT(nha_groups), .cb = nlattr_get_flag },
692 };
693 #undef _IN
694 #undef _OUT
695 NL_DECLARE_PARSER(nhmsg_parser, struct nhmsg, nlf_p_nh, nla_p_nh);
696 
697 static bool
698 eligible_nhg(const struct nhop_object *nh)
699 {
700 	return (nh->nh_flags & NHF_GATEWAY);
701 }
702 
703 static int
704 newnhg(struct unhop_ctl *ctl, struct nl_parsed_nhop *attrs, struct user_nhop *unhop)
705 {
706 	struct nexthop_grp *grp = NLA_DATA(attrs->nha_group);
707 	int count = NLA_DATA_LEN(attrs->nha_group) / sizeof(*grp);
708 	struct weightened_nhop *wn;
709 
710 	wn = malloc(sizeof(*wn) * count, M_NETLINK, M_NOWAIT | M_ZERO);
711 	if (wn == NULL)
712 		return (ENOMEM);
713 
714 	for (int i = 0; i < count; i++) {
715 		struct user_nhop *unhop;
716 		unhop = nl_find_base_unhop(ctl, grp[i].id);
717 		if (unhop == NULL) {
718 			NL_LOG(LOG_DEBUG, "unable to find uidx %u", grp[i].id);
719 			free(wn, M_NETLINK);
720 			return (ESRCH);
721 		} else if (unhop->un_nhop_src == NULL) {
722 			NL_LOG(LOG_DEBUG, "uidx %u is a group, nested group unsupported",
723 			    grp[i].id);
724 			free(wn, M_NETLINK);
725 			return (ENOTSUP);
726 		} else if (!eligible_nhg(unhop->un_nhop_src)) {
727 			NL_LOG(LOG_DEBUG, "uidx %u nhop is not mpath-eligible",
728 			    grp[i].id);
729 			free(wn, M_NETLINK);
730 			return (ENOTSUP);
731 		}
732 		/*
733 		 * TODO: consider more rigid eligibility checks:
734 		 * restrict nexthops with the same gateway
735 		 */
736 		wn[i].nh = unhop->un_nhop_src;
737 		wn[i].weight = grp[i].weight;
738 	}
739 	unhop->un_nhgrp_src = wn;
740 	unhop->un_nhgrp_count = count;
741 	return (0);
742 }
743 
/*
 * Installs @gw as the gateway of nexthop @nh.
 *
 * For an IPv6 link-local gateway, @gw is first modified in place to
 * embed the scope id derived from @ifp's interface index; @ifp must be
 * provided in that case.
 *
 * Returns 0 on success or EINVAL when the interface is required but
 * missing (the reason is reported through @npt).
 */
int
nl_set_nexthop_gw(struct nhop_object *nh, struct sockaddr *gw, struct ifnet *ifp,
    struct nl_pstate *npt)
{
#ifdef INET6
	struct sockaddr_in6 *gw6 = (struct sockaddr_in6 *)gw;

	if (gw->sa_family == AF_INET6 &&
	    IN6_IS_ADDR_LINKLOCAL(&gw6->sin6_addr)) {
		if (ifp == NULL) {
			NLMSG_REPORT_ERR_MSG(npt, "interface not set");
			return (EINVAL);
		}
		in6_set_unicast_scopeid(&gw6->sin6_addr, ifp->if_index);
	}
#endif
	nhop_set_gw(nh, gw, true);
	return (0);
}
769 
770 static int
771 newnhop(struct nl_parsed_nhop *attrs, struct user_nhop *unhop, struct nl_pstate *npt)
772 {
773 	struct ifaddr *ifa = NULL;
774 	struct nhop_object *nh;
775 	int error;
776 
777 	if (!attrs->nha_blackhole) {
778 		if (attrs->nha_gw == NULL) {
779 			NLMSG_REPORT_ERR_MSG(npt, "missing NHA_GATEWAY");
780 			return (EINVAL);
781 		}
782 		if (attrs->nha_oif == NULL) {
783 			NLMSG_REPORT_ERR_MSG(npt, "missing NHA_OIF");
784 			return (EINVAL);
785 		}
786 		if (ifa == NULL)
787 			ifa = ifaof_ifpforaddr(attrs->nha_gw, attrs->nha_oif);
788 		if (ifa == NULL) {
789 			NLMSG_REPORT_ERR_MSG(npt, "Unable to determine default source IP");
790 			return (EINVAL);
791 		}
792 	}
793 
794 	int family = attrs->nha_gw != NULL ? attrs->nha_gw->sa_family : attrs->nh_family;
795 
796 	nh = nhop_alloc(RT_DEFAULT_FIB, family);
797 	if (nh == NULL) {
798 		NL_LOG(LOG_DEBUG, "Unable to allocate nexthop");
799 		return (ENOMEM);
800 	}
801 	nhop_set_uidx(nh, attrs->nha_id);
802 
803 	if (attrs->nha_blackhole)
804 		nhop_set_blackhole(nh, NHF_BLACKHOLE);
805 	else {
806 		error = nl_set_nexthop_gw(nh, attrs->nha_gw, attrs->nha_oif, npt);
807 		if (error != 0) {
808 			nhop_free(nh);
809 			return (error);
810 		}
811 		nhop_set_transmit_ifp(nh, attrs->nha_oif);
812 		nhop_set_src(nh, ifa);
813 	}
814 
815 	error = nhop_get_unlinked(nh);
816 	if (error != 0) {
817 		NL_LOG(LOG_DEBUG, "unable to finalize nexthop");
818 		return (error);
819 	}
820 
821 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
822 		char nhbuf[NHOP_PRINT_BUFSIZE];
823 		nhop_print_buf(nh, nhbuf, sizeof(nhbuf));
824 		NL_LOG(LOG_DEBUG2, "Adding unhop %u: %s", attrs->nha_id, nhbuf);
825 	}
826 
827 	unhop->un_nhop_src = nh;
828 	return (0);
829 }
830 
831 static int
832 rtnl_handle_newnhop(struct nlmsghdr *hdr, struct nlpcb *nlp,
833     struct nl_pstate *npt)
834 {
835 	struct user_nhop *unhop;
836 	int error;
837 
838         if ((__predict_false(V_un_ctl == NULL)) && (!vnet_init_unhops()))
839 		return (ENOMEM);
840 	struct unhop_ctl *ctl = V_un_ctl;
841 
842 	struct nl_parsed_nhop attrs = {};
843 	error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs);
844 	if (error != 0)
845 		return (error);
846 
847 	/*
848 	 * Get valid nha_id. Treat nha_id == 0 (auto-assignment) as a second-class
849 	 *  citizen.
850 	 */
851 	if (attrs.nha_id == 0) {
852 		attrs.nha_id = find_spare_uidx(ctl);
853 		if (attrs.nha_id == 0) {
854 			NL_LOG(LOG_DEBUG, "Unable to get spare uidx");
855 			return (ENOSPC);
856 		}
857 	}
858 
859 	NL_LOG(LOG_DEBUG, "IFINDEX %d", attrs.nha_oif ? attrs.nha_oif->if_index : 0);
860 
861 	unhop = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO);
862 	if (unhop == NULL) {
863 		NL_LOG(LOG_DEBUG, "Unable to allocate user_nhop");
864 		return (ENOMEM);
865 	}
866 	unhop->un_idx = attrs.nha_id;
867 	unhop->un_protocol = attrs.nh_protocol;
868 
869 	if (attrs.nha_group)
870 		error = newnhg(ctl, &attrs, unhop);
871 	else
872 		error = newnhop(&attrs, unhop, npt);
873 
874 	if (error != 0) {
875 		free(unhop, M_NETLINK);
876 		return (error);
877 	}
878 
879 	UN_WLOCK(ctl);
880 	/* Check if uidx already exists */
881 	struct user_nhop *tmp = NULL;
882 	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, unhop, tmp);
883 	if (tmp != NULL) {
884 		UN_WUNLOCK(ctl);
885 		NL_LOG(LOG_DEBUG, "nhop idx %u already exists", attrs.nha_id);
886 		destroy_unhop(unhop);
887 		return (EEXIST);
888 	}
889 	CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, unhop);
890 	uint32_t num_buckets_new = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->un_head);
891 	UN_WUNLOCK(ctl);
892 
893 	/* Report addition of the next nexhop */
894 	struct netlink_walkargs wa = {
895 		.hdr.nlmsg_pid = hdr->nlmsg_pid,
896 		.hdr.nlmsg_seq = hdr->nlmsg_seq,
897 		.hdr.nlmsg_flags = hdr->nlmsg_flags,
898 		.hdr.nlmsg_type = NL_RTM_NEWNEXTHOP,
899 	};
900 
901 	struct nl_writer nw = {};
902 	if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP)) {
903 		NL_LOG(LOG_DEBUG, "error allocating message writer");
904 		return (ENOMEM);
905 	}
906 
907 	dump_unhop(unhop, &wa.hdr, &nw);
908 	nlmsg_flush(&nw);
909 
910 	consider_resize(ctl, num_buckets_new);
911 
912         return (0);
913 }
914 
915 static int
916 rtnl_handle_delnhop(struct nlmsghdr *hdr, struct nlpcb *nlp,
917     struct nl_pstate *npt)
918 {
919 	struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
920 	int error;
921 
922 	if (__predict_false(ctl == NULL))
923 		return (ESRCH);
924 
925 	struct nl_parsed_nhop attrs = {};
926 	error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs);
927 	if (error != 0)
928 		return (error);
929 
930 	if (attrs.nha_id == 0) {
931 		NL_LOG(LOG_DEBUG, "NHA_ID not set");
932 		return (EINVAL);
933 	}
934 
935 	error = delete_unhop(ctl, hdr, attrs.nha_id);
936 
937         return (error);
938 }
939 
940 static bool
941 match_unhop(const struct nl_parsed_nhop *attrs, struct user_nhop *unhop)
942 {
943 	if (attrs->nha_id != 0 && unhop->un_idx != attrs->nha_id)
944 		return (false);
945 	if (attrs->nha_groups != 0 && unhop->un_nhgrp_src == NULL)
946 		return (false);
947 	if (attrs->nha_oif != NULL &&
948 	    (unhop->un_nhop_src == NULL || unhop->un_nhop_src->nh_ifp != attrs->nha_oif))
949 		return (false);
950 
951 	return (true);
952 }
953 
954 static int
955 rtnl_handle_getnhop(struct nlmsghdr *hdr, struct nlpcb *nlp,
956     struct nl_pstate *npt)
957 {
958 	struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
959 	struct user_nhop *unhop;
960 	UN_TRACKER;
961 	int error;
962 
963 	if (__predict_false(ctl == NULL))
964 		return (ESRCH);
965 
966 	struct nl_parsed_nhop attrs = {};
967 	error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs);
968 	if (error != 0)
969 		return (error);
970 
971 	struct netlink_walkargs wa = {
972 		.nw = npt->nw,
973 		.hdr.nlmsg_pid = hdr->nlmsg_pid,
974 		.hdr.nlmsg_seq = hdr->nlmsg_seq,
975 		.hdr.nlmsg_flags = hdr->nlmsg_flags,
976 		.hdr.nlmsg_type = NL_RTM_NEWNEXTHOP,
977 	};
978 
979 	if (attrs.nha_id != 0) {
980 		NL_LOG(LOG_DEBUG2, "searching for uidx %u", attrs.nha_id);
981 		struct user_nhop key= { .un_idx = attrs.nha_id };
982 		UN_RLOCK(ctl);
983 		CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
984 		UN_RUNLOCK(ctl);
985 
986 		if (unhop == NULL)
987 			return (ESRCH);
988 		dump_unhop(unhop, &wa.hdr, wa.nw);
989 		return (0);
990 	}
991 
992 	UN_RLOCK(ctl);
993 	wa.hdr.nlmsg_flags |= NLM_F_MULTI;
994 	CHT_SLIST_FOREACH(&ctl->un_head, unhop, unhop) {
995 		if (UNHOP_IS_MASTER(unhop) && match_unhop(&attrs, unhop))
996 			dump_unhop(unhop, &wa.hdr, wa.nw);
997 	} CHT_SLIST_FOREACH_END;
998 	UN_RUNLOCK(ctl);
999 
1000 	if (wa.error == 0) {
1001 		if (!nlmsg_end_dump(wa.nw, wa.error, &wa.hdr))
1002 			return (ENOMEM);
1003 	}
1004         return (0);
1005 }
1006 
1007 static const struct rtnl_cmd_handler cmd_handlers[] = {
1008 	{
1009 		.cmd = NL_RTM_NEWNEXTHOP,
1010 		.name = "RTM_NEWNEXTHOP",
1011 		.cb = &rtnl_handle_newnhop,
1012 		.priv = PRIV_NET_ROUTE,
1013 	},
1014 	{
1015 		.cmd = NL_RTM_DELNEXTHOP,
1016 		.name = "RTM_DELNEXTHOP",
1017 		.cb = &rtnl_handle_delnhop,
1018 		.priv = PRIV_NET_ROUTE,
1019 	},
1020 	{
1021 		.cmd = NL_RTM_GETNEXTHOP,
1022 		.name = "RTM_GETNEXTHOP",
1023 		.cb = &rtnl_handle_getnhop,
1024 	}
1025 };
1026 
1027 static const struct nlhdr_parser *all_parsers[] = { &nhmsg_parser };
1028 
1029 void
1030 rtnl_nexthops_init(void)
1031 {
1032 	NL_VERIFY_PARSERS(all_parsers);
1033 	rtnl_register_messages(cmd_handlers, NL_ARRAY_LEN(cmd_handlers));
1034 }
1035