xref: /freebsd/sys/netlink/route/nexthop.c (revision 0aa2700123e22c2b0a977375e087dc2759b8e980)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 #include "opt_inet.h"
31 #include "opt_inet6.h"
32 #include <sys/types.h>
33 #include <sys/malloc.h>
34 #include <sys/rmlock.h>
35 #include <sys/socket.h>
36 #include <sys/ck.h>
37 
38 #include <net/if.h>
39 #include <net/route.h>
40 #include <net/route/nhop.h>
41 #include <net/route/nhop_utils.h>
42 
43 #include <net/route/route_ctl.h>
44 #include <net/route/route_var.h>
45 #include <netinet6/scope6_var.h>
46 #include <netlink/netlink.h>
47 #include <netlink/netlink_ctl.h>
48 #include <netlink/netlink_var.h>
49 #include <netlink/netlink_route.h>
50 #include <netlink/route/route_var.h>
51 
52 #define	DEBUG_MOD_NAME	nl_nhop
53 #define	DEBUG_MAX_LEVEL	LOG_DEBUG3
54 #include <netlink/netlink_debug.h>
55 _DECLARE_DEBUG(LOG_DEBUG3);
56 
57 /*
58  * This file contains the logic to maintain kernel nexthops and
59  *  nexhop groups based om the data provided by the user.
60  *
61  * Kernel stores (nearly) all of the routing data in the nexthops,
62  *  including the prefix-specific flags (NHF_HOST and NHF_DEFAULT).
63  *
64  * Netlink API provides higher-level abstraction for the user. Each
65  *  user-created nexthop may map to multiple kernel nexthops.
66  *
67  * The following variations require separate kernel nexthop to be
68  *  created:
69  *  * prefix flags (NHF_HOST, NHF_DEFAULT)
70  *  * using IPv6 gateway for IPv4 routes
71  *  * different fibnum
72  *
73  * These kernel nexthops have the lifetime bound to the lifetime of
74  *  the user_nhop object. They are not collected until user requests
75  *  to delete the created user_nhop.
76  *
77  */
/*
 * Represents a single userland-created nexthop or nexthop group.
 * The "master" record (un_fibfam == 0) holds the template data
 *  (un_nhop_src or un_nhgrp_src); per-fib/family clones are chained
 *  off the master via un_nextchild.
 */
struct user_nhop {
        uint32_t                        un_idx; /* Userland-provided index */
	uint32_t			un_fibfam; /* fibnum+af(as highest byte) */
	uint8_t				un_protocol; /* protocol that installed the record */
	struct nhop_object		*un_nhop; /* "production" nexthop */
	struct nhop_object		*un_nhop_src; /* nexthop to copy from */
	struct weightened_nhop		*un_nhgrp_src; /* nexthops for nhg */
	uint32_t			un_nhgrp_count; /* number of nexthops */
        struct user_nhop		*un_next; /* next item in hash chain */
        struct user_nhop		*un_nextchild; /* master -> children */
	struct epoch_context		un_epoch_ctx;	/* epoch ctl helper */
};
90 
91 /* produce hash value for an object */
92 #define	unhop_hash_obj(_obj)	(hash_unhop(_obj))
93 /* compare two objects */
94 #define	unhop_cmp(_one, _two)	(cmp_unhop(_one, _two))
95 /* next object accessor */
96 #define	unhop_next(_obj)	(_obj)->un_next
97 
98 CHT_SLIST_DEFINE(unhop, struct user_nhop);
99 
100 struct unhop_ctl {
101 	struct unhop_head	un_head;
102 	struct rmlock		un_lock;
103 };
104 #define	UN_LOCK_INIT(_ctl)	rm_init(&(_ctl)->un_lock, "unhop_ctl")
105 #define	UN_TRACKER		struct rm_priotracker un_tracker
106 #define	UN_RLOCK(_ctl)		rm_rlock(&((_ctl)->un_lock), &un_tracker)
107 #define	UN_RUNLOCK(_ctl)	rm_runlock(&((_ctl)->un_lock), &un_tracker)
108 
109 #define	UN_WLOCK(_ctl)		rm_wlock(&(_ctl)->un_lock);
110 #define	UN_WUNLOCK(_ctl)	rm_wunlock(&(_ctl)->un_lock);
111 
112 VNET_DEFINE_STATIC(struct unhop_ctl *, un_ctl) = NULL;
113 #define V_un_ctl	VNET(un_ctl)
114 
115 static void consider_resize(struct unhop_ctl *ctl, uint32_t new_size);
116 static int cmp_unhop(const struct user_nhop *a, const struct user_nhop *b);
117 static unsigned int hash_unhop(const struct user_nhop *obj);
118 
119 static void destroy_unhop(struct user_nhop *unhop);
120 static struct nhop_object *clone_unhop(const struct user_nhop *unhop,
121     uint32_t fibnum, int family, int nh_flags);
122 
123 static int
124 cmp_unhop(const struct user_nhop *a, const struct user_nhop *b)
125 {
126         return (a->un_idx == b->un_idx && a->un_fibfam == b->un_fibfam);
127 }
128 
129 /*
130  * Hash callback: calculate hash of an object
131  */
132 static unsigned int
133 hash_unhop(const struct user_nhop *obj)
134 {
135         return (obj->un_idx ^ obj->un_fibfam);
136 }
137 
138 #define	UNHOP_IS_MASTER(_unhop)	((_unhop)->un_fibfam == 0)
139 
140 /*
141  * Factory interface for creating matching kernel nexthops/nexthop groups
142  *
143  * @uidx: userland nexhop index used to create the nexthop
144  * @fibnum: fibnum nexthop will be used in
145  * @family: upper family nexthop will be used in
146  * @nh_flags: desired nexthop prefix flags
147  * @perror: pointer to store error to
148  *
149  * Returns referenced nexthop linked to @fibnum/@family rib on success.
150  */
151 struct nhop_object *
152 nl_find_nhop(uint32_t fibnum, int family, uint32_t uidx,
153     int nh_flags, int *perror)
154 {
155 	struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
156         UN_TRACKER;
157 
158 	if (__predict_false(ctl == NULL))
159 		return (NULL);
160 
161 	struct user_nhop key= {
162 		.un_idx = uidx,
163 		.un_fibfam = fibnum  | ((uint32_t)family) << 24,
164 	};
165 	struct user_nhop *unhop;
166 
167 	nh_flags = nh_flags & (NHF_HOST | NHF_DEFAULT);
168 
169 	if (__predict_false(family == 0))
170 		return (NULL);
171 
172 	UN_RLOCK(ctl);
173 	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
174 	if (unhop != NULL) {
175 		struct nhop_object *nh = unhop->un_nhop;
176 		UN_RLOCK(ctl);
177 		*perror = 0;
178 		nhop_ref_any(nh);
179 		return (nh);
180 	}
181 
182 	/*
183 	 * Exact nexthop not found. Search for template nexthop to clone from.
184 	 */
185 	key.un_fibfam = 0;
186 	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
187 	if (unhop == NULL) {
188 		UN_RUNLOCK(ctl);
189 		*perror = ESRCH;
190 		return (NULL);
191 	}
192 
193 	UN_RUNLOCK(ctl);
194 
195 	/* Create entry to insert first */
196 	struct user_nhop *un_new, *un_tmp;
197 	un_new = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO);
198 	if (un_new == NULL) {
199 		*perror = ENOMEM;
200 		return (NULL);
201 	}
202 	un_new->un_idx = uidx;
203 	un_new->un_fibfam = fibnum  | ((uint32_t)family) << 24;
204 
205 	/* Relying on epoch to protect unhop here */
206 	un_new->un_nhop = clone_unhop(unhop, fibnum, family, nh_flags);
207 	if (un_new->un_nhop == NULL) {
208 		free(un_new, M_NETLINK);
209 		*perror = ENOMEM;
210 		return (NULL);
211 	}
212 
213 	/* Insert back and report */
214 	UN_WLOCK(ctl);
215 
216 	/* First, find template record once again */
217 	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
218 	if (unhop == NULL) {
219 		/* Someone deleted the nexthop during the call */
220 		UN_WUNLOCK(ctl);
221 		*perror = ESRCH;
222 		destroy_unhop(un_new);
223 		return (NULL);
224 	}
225 
226 	/* Second, check the direct match */
227 	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, un_new, un_tmp);
228 	struct nhop_object *nh;
229 	if (un_tmp != NULL) {
230 		/* Another thread already created the desired nextop, use it */
231 		nh = un_tmp->un_nhop;
232 	} else {
233 		/* Finally, insert the new nexthop and link it to the primary */
234 		nh = un_new->un_nhop;
235 		CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, un_new);
236 		un_new->un_nextchild = unhop->un_nextchild;
237 		unhop->un_nextchild = un_new;
238 		un_new = NULL;
239 		NL_LOG(LOG_DEBUG2, "linked cloned nexthop %p", nh);
240 	}
241 
242 	UN_WUNLOCK(ctl);
243 
244 	if (un_new != NULL)
245 		destroy_unhop(un_new);
246 
247 	*perror = 0;
248 	nhop_ref_any(nh);
249 	return (nh);
250 }
251 
252 static struct user_nhop *
253 nl_find_base_unhop(struct unhop_ctl *ctl, uint32_t uidx)
254 {
255 	struct user_nhop key= { .un_idx = uidx };
256 	struct user_nhop *unhop = NULL;
257 	UN_TRACKER;
258 
259 	UN_RLOCK(ctl);
260 	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
261 	UN_RUNLOCK(ctl);
262 
263 	return (unhop);
264 }
265 
266 #define MAX_STACK_NHOPS	4
267 static struct nhop_object *
268 clone_unhop(const struct user_nhop *unhop, uint32_t fibnum, int family, int nh_flags)
269 {
270 	const struct weightened_nhop *wn;
271 	struct weightened_nhop *wn_new, wn_base[MAX_STACK_NHOPS];
272 	struct nhop_object *nh = NULL;
273 	uint32_t num_nhops;
274 	int error;
275 
276 	if (unhop->un_nhop_src != NULL) {
277 		IF_DEBUG_LEVEL(LOG_DEBUG2) {
278 			char nhbuf[NHOP_PRINT_BUFSIZE];
279 			nhop_print_buf_any(unhop->un_nhop_src, nhbuf, sizeof(nhbuf));
280 			FIB_NH_LOG(LOG_DEBUG2, unhop->un_nhop_src,
281 			    "cloning nhop %s -> %u.%u flags 0x%X", nhbuf, fibnum,
282 			    family, nh_flags);
283 		}
284 		struct nhop_object *nh;
285 		nh = nhop_alloc(fibnum, AF_UNSPEC);
286 		if (nh == NULL)
287 			return (NULL);
288 		nhop_copy(nh, unhop->un_nhop_src);
289 		/* Check that nexthop gateway is compatible with the new family */
290 		if (!nhop_set_upper_family(nh, family)) {
291 			nhop_free(nh);
292 			return (NULL);
293 		}
294 		nhop_set_uidx(nh, unhop->un_idx);
295 		nhop_set_pxtype_flag(nh, nh_flags);
296 		return (nhop_get_nhop(nh, &error));
297 	}
298 
299 	wn = unhop->un_nhgrp_src;
300 	num_nhops = unhop->un_nhgrp_count;
301 
302 	if (num_nhops > MAX_STACK_NHOPS) {
303 		wn_new = malloc(num_nhops * sizeof(struct weightened_nhop), M_TEMP, M_NOWAIT);
304 		if (wn_new == NULL)
305 			return (NULL);
306 	} else
307 		wn_new = wn_base;
308 
309 	for (int i = 0; i < num_nhops; i++) {
310 		uint32_t uidx = nhop_get_uidx(wn[i].nh);
311 		MPASS(uidx != 0);
312 		wn_new[i].nh = nl_find_nhop(fibnum, family, uidx, nh_flags, &error);
313 		if (error != 0)
314 			break;
315 		wn_new[i].weight = wn[i].weight;
316 	}
317 
318 	if (error == 0) {
319 		struct rib_head *rh = nhop_get_rh(wn_new[0].nh);
320 		struct nhgrp_object *nhg;
321 
322 		error = nhgrp_get_group(rh, wn_new, num_nhops, unhop->un_idx, &nhg);
323 		nh = (struct nhop_object *)nhg;
324 	}
325 
326 	if (wn_new != wn_base)
327 		free(wn_new, M_TEMP);
328 	return (nh);
329 }
330 
331 static void
332 destroy_unhop(struct user_nhop *unhop)
333 {
334 	if (unhop->un_nhop != NULL)
335 		nhop_free_any(unhop->un_nhop);
336 	if (unhop->un_nhop_src != NULL)
337 		nhop_free_any(unhop->un_nhop_src);
338 	free(unhop, M_NETLINK);
339 }
340 
341 static void
342 destroy_unhop_epoch(epoch_context_t ctx)
343 {
344 	struct user_nhop *unhop;
345 
346 	unhop = __containerof(ctx, struct user_nhop, un_epoch_ctx);
347 
348 	destroy_unhop(unhop);
349 }
350 
351 static uint32_t
352 find_spare_uidx(struct unhop_ctl *ctl)
353 {
354 	struct user_nhop *unhop, key = {};
355 	uint32_t uidx = 0;
356 	UN_TRACKER;
357 
358 	UN_RLOCK(ctl);
359 	/* This should return spare uid with 75% of 65k used in ~99/100 cases */
360 	for (int i = 0; i < 16; i++) {
361 		key.un_idx = (arc4random() % 65536) + 65536 * 4;
362 		CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
363 		if (unhop == NULL) {
364 			uidx = key.un_idx;
365 			break;
366 		}
367 	}
368 	UN_RUNLOCK(ctl);
369 
370 	return (uidx);
371 }
372 
373 
374 /*
375  * Actual netlink code
376  */
/*
 * Shared state for dump/notification helpers: the target writer,
 *  the reply header template and iteration bookkeeping.
 */
struct netlink_walkargs {
	struct nl_writer *nw;	/* message writer to dump into */
	struct nlmsghdr hdr;	/* template header for generated replies */
	struct nlpcb *so;	/* requesting netlink socket */
	int family;
	int error;
	int count;
	int dumped;
};
/* Jump to the local "enomem" label when an allocation returned NULL */
#define	ENOMEM_IF_NULL(_v)	if ((_v) == NULL) goto enomem
387 
388 static bool
389 dump_nhgrp(const struct user_nhop *unhop, struct nlmsghdr *hdr,
390     struct nl_writer *nw)
391 {
392 
393 	if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg)))
394 		goto enomem;
395 
396 	struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg);
397 	nhm->nh_family = AF_UNSPEC;
398 	nhm->nh_scope = 0;
399 	nhm->nh_protocol = unhop->un_protocol;
400 	nhm->nh_flags = 0;
401 
402 	nlattr_add_u32(nw, NHA_ID, unhop->un_idx);
403 	nlattr_add_u16(nw, NHA_GROUP_TYPE, NEXTHOP_GRP_TYPE_MPATH);
404 
405 	struct weightened_nhop *wn = unhop->un_nhgrp_src;
406 	uint32_t num_nhops = unhop->un_nhgrp_count;
407 	/* TODO: a better API? */
408 	int nla_len = sizeof(struct nlattr);
409 	nla_len += NETLINK_ALIGN(num_nhops * sizeof(struct nexthop_grp));
410 	struct nlattr *nla = nlmsg_reserve_data(nw, nla_len, struct nlattr);
411 	if (nla == NULL)
412 		goto enomem;
413 	nla->nla_type = NHA_GROUP;
414 	nla->nla_len = nla_len;
415 	for (int i = 0; i < num_nhops; i++) {
416 		struct nexthop_grp *grp = &((struct nexthop_grp *)(nla + 1))[i];
417 		grp->id = nhop_get_uidx(wn[i].nh);
418 		grp->weight = wn[i].weight;
419 		grp->resvd1 = 0;
420 		grp->resvd2 = 0;
421 	}
422 
423         if (nlmsg_end(nw))
424 		return (true);
425 enomem:
426 	NL_LOG(LOG_DEBUG, "error: unable to allocate attribute memory");
427         nlmsg_abort(nw);
428 	return (false);
429 }
430 
/*
 * Writes an RTM_NEWNEXTHOP message describing a single (non-group)
 *  user nexthop into @nw, using @hdr as the reply template.
 * Returns true on success, false if the writer ran out of space.
 */
static bool
dump_nhop(const struct user_nhop *unhop, struct nlmsghdr *hdr,
    struct nl_writer *nw)
{
	struct nhop_object *nh = unhop->un_nhop_src;

	if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg)))
		goto enomem;

	struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg);
	ENOMEM_IF_NULL(nhm);
	nhm->nh_family = nhop_get_neigh_family(nh);
	nhm->nh_scope = 0; // XXX: what's that?
	nhm->nh_protocol = unhop->un_protocol;
	nhm->nh_flags = 0;

	nlattr_add_u32(nw, NHA_ID, unhop->un_idx);
	if (nh->nh_flags & NHF_BLACKHOLE) {
		/* Blackhole nexthops carry no interface or gateway */
		nlattr_add_flag(nw, NHA_BLACKHOLE);
		goto done;
	}
	/* NOTE(review): assumes nh_ifp is non-NULL for every non-blackhole
	 * nexthop reaching this point — confirm against newnhop(). */
	nlattr_add_u32(nw, NHA_OIF, nh->nh_ifp->if_index);

	switch (nh->gw_sa.sa_family) {
#ifdef INET
	case AF_INET:
		nlattr_add(nw, NHA_GATEWAY, 4, &nh->gw4_sa.sin_addr);
		break;
#endif
#ifdef INET6
	case AF_INET6:
		{
			/* Strip the embedded scope id before exporting */
			struct in6_addr addr = nh->gw6_sa.sin6_addr;
			in6_clearscope(&addr);
			nlattr_add(nw, NHA_GATEWAY, 16, &addr);
			break;
		}
#endif
	}

done:
        if (nlmsg_end(nw))
		return (true);
enomem:
	nlmsg_abort(nw);
	return (false);
}
478 
479 static void
480 dump_unhop(const struct user_nhop *unhop, struct nlmsghdr *hdr,
481     struct nl_writer *nw)
482 {
483 	if (unhop->un_nhop_src != NULL)
484 		dump_nhop(unhop, hdr, nw);
485 	else
486 		dump_nhgrp(unhop, hdr, nw);
487 }
488 
/*
 * Removes the base nexthop/group with userland index @uidx together
 *  with all fib/family-specific clones chained via un_nextchild,
 *  notifies RTNLGRP_NEXTHOP listeners and defers actual destruction
 *  to an epoch callback.
 * Returns 0 on success or ENOENT if @uidx does not exist.
 */
static int
delete_unhop(struct unhop_ctl *ctl, struct nlmsghdr *hdr, uint32_t uidx)
{
	struct user_nhop *unhop_ret, *unhop_base, *unhop_chain;

	struct user_nhop key = { .un_idx = uidx };

	UN_WLOCK(ctl);

	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop_base);

	if (unhop_base != NULL) {
		CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_base, unhop_ret);
		IF_DEBUG_LEVEL(LOG_DEBUG2) {
			char nhbuf[NHOP_PRINT_BUFSIZE];
			nhop_print_buf_any(unhop_base->un_nhop, nhbuf, sizeof(nhbuf));
			FIB_NH_LOG(LOG_DEBUG3, unhop_base->un_nhop,
			    "removed base nhop %u: %s", uidx, nhbuf);
		}
		/* Unlink all child nexthops as well, keeping the chain intact */
		unhop_chain = unhop_base->un_nextchild;
		while (unhop_chain != NULL) {
			CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_chain,
			    unhop_ret);
			MPASS(unhop_chain == unhop_ret);
			IF_DEBUG_LEVEL(LOG_DEBUG3) {
				char nhbuf[NHOP_PRINT_BUFSIZE];
				nhop_print_buf_any(unhop_chain->un_nhop,
				    nhbuf, sizeof(nhbuf));
				FIB_NH_LOG(LOG_DEBUG3, unhop_chain->un_nhop,
				    "removed child nhop %u: %s", uidx, nhbuf);
			}
			unhop_chain = unhop_chain->un_nextchild;
		}
	}

	UN_WUNLOCK(ctl);

	if (unhop_base == NULL) {
		NL_LOG(LOG_DEBUG, "unable to find unhop %u", uidx);
		return (ENOENT);
	}

	/* Report nexthop deletion */
	struct netlink_walkargs wa = {
		.hdr.nlmsg_pid = hdr->nlmsg_pid,
		.hdr.nlmsg_seq = hdr->nlmsg_seq,
		.hdr.nlmsg_flags = hdr->nlmsg_flags,
		.hdr.nlmsg_type = NL_RTM_DELNEXTHOP,
	};

	struct nl_writer nw = {};
	if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP)) {
		NL_LOG(LOG_DEBUG, "error allocating message writer");
		return (ENOMEM);
	}

	dump_unhop(unhop_base, &wa.hdr, &nw);
	nlmsg_flush(&nw);

	/* Defer freeing until all current epoch readers are done */
	while (unhop_base != NULL) {
		unhop_chain = unhop_base->un_nextchild;
		epoch_call(net_epoch_preempt, destroy_unhop_epoch,
		    &unhop_base->un_epoch_ctx);
		unhop_base = unhop_chain;
	}

	return (0);
}
558 
559 static void
560 consider_resize(struct unhop_ctl *ctl, uint32_t new_size)
561 {
562 	void *new_ptr = NULL;
563 	size_t alloc_size;
564 
565         if (new_size == 0)
566                 return;
567 
568 	if (new_size != 0) {
569 		alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_size);
570 		new_ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO);
571                 if (new_ptr == NULL)
572                         return;
573 	}
574 
575 	NL_LOG(LOG_DEBUG, "resizing hash: %u -> %u", ctl->un_head.hash_size, new_size);
576 	UN_WLOCK(ctl);
577 	if (new_ptr != NULL) {
578 		CHT_SLIST_RESIZE(&ctl->un_head, unhop, new_ptr, new_size);
579 	}
580 	UN_WUNLOCK(ctl);
581 
582 
583 	if (new_ptr != NULL)
584 		free(new_ptr, M_NETLINK);
585 }
586 
587 static bool __noinline
588 vnet_init_unhops()
589 {
590         uint32_t num_buckets = 16;
591         size_t alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets);
592 
593         struct unhop_ctl *ctl = malloc(sizeof(struct unhop_ctl), M_NETLINK,
594             M_NOWAIT | M_ZERO);
595         if (ctl == NULL)
596                 return (false);
597 
598         void *ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO);
599         if (ptr == NULL) {
600 		free(ctl, M_NETLINK);
601                 return (false);
602 	}
603         CHT_SLIST_INIT(&ctl->un_head, ptr, num_buckets);
604 	UN_LOCK_INIT(ctl);
605 
606 	if (!atomic_cmpset_ptr((uintptr_t *)&V_un_ctl, (uintptr_t)NULL, (uintptr_t)ctl)) {
607                 free(ptr, M_NETLINK);
608                 free(ctl, M_NETLINK);
609 	}
610 
611 	if (atomic_load_ptr(&V_un_ctl) == NULL)
612 		return (false);
613 
614 	NL_LOG(LOG_NOTICE, "UNHOPS init done");
615 
616         return (true);
617 }
618 
/*
 * VNET teardown: detaches the table pointer, waits for epoch readers
 *  to drain and destroys every remaining user nexthop.
 */
static void
vnet_destroy_unhops(const void *unused __unused)
{
	struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
	struct user_nhop *unhop, *tmp;

	if (ctl == NULL)
		return;
	V_un_ctl = NULL;

	/* Wait till all unhop users finish their reads */
	epoch_wait_preempt(net_epoch_preempt);

	UN_WLOCK(ctl);
	CHT_SLIST_FOREACH_SAFE(&ctl->un_head, unhop, unhop, tmp) {
		destroy_unhop(unhop);
	} CHT_SLIST_FOREACH_SAFE_END;
	UN_WUNLOCK(ctl);

	/* NOTE(review): ctl is freed without rm_destroy() on its lock —
	 * confirm this does not leak witness/lock state. */
	free(ctl->un_head.ptr, M_NETLINK);
	free(ctl, M_NETLINK);
}
641 VNET_SYSUNINIT(vnet_destroy_unhops, SI_SUB_PROTO_IF, SI_ORDER_ANY,
642     vnet_destroy_unhops, NULL);
643 
644 static int
645 nlattr_get_nhg(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target)
646 {
647 	int error = 0;
648 
649 	/* Verify attribute correctness */
650 	struct nexthop_grp *grp = NLA_DATA(nla);
651 	int data_len = NLA_DATA_LEN(nla);
652 
653 	int count = data_len / sizeof(*grp);
654 	if (count == 0 || (count * sizeof(*grp) != data_len)) {
655 		NL_LOG(LOG_DEBUG, "Invalid length for RTA_GROUP: %d", data_len);
656 		return (EINVAL);
657 	}
658 
659 	*((struct nlattr **)target) = nla;
660 	return (error);
661 }
662 
/* Parsed representation of an RTM_*NEXTHOP request */
struct nl_parsed_nhop {
	uint32_t	nha_id;		/* NHA_ID: userland nexthop index */
	uint8_t		nha_blackhole;	/* NHA_BLACKHOLE flag present */
	uint8_t		nha_groups;	/* NHA_GROUPS flag (dump filter) */
	struct ifnet	*nha_oif;	/* NHA_OIF: transmit interface */
	struct sockaddr	*nha_gw;	/* NHA_GATEWAY address */
	struct nlattr	*nha_group;	/* raw NHA_GROUP attribute */
	uint8_t		nh_family;	/* nhmsg header family */
	uint8_t		nh_protocol;	/* originating routing protocol */
};
673 
#define	_IN(_field)	offsetof(struct nhmsg, _field)
#define	_OUT(_field)	offsetof(struct nl_parsed_nhop, _field)
/* Fixed-header (struct nhmsg) field extractors */
static const struct nlfield_parser nlf_p_nh[] = {
	{ .off_in = _IN(nh_family), .off_out = _OUT(nh_family), .cb = nlf_get_u8 },
	{ .off_in = _IN(nh_protocol), .off_out = _OUT(nh_protocol), .cb = nlf_get_u8 },
};

/* NHA_* attribute extractors */
static const struct nlattr_parser nla_p_nh[] = {
	{ .type = NHA_ID, .off = _OUT(nha_id), .cb = nlattr_get_uint32 },
	{ .type = NHA_GROUP, .off = _OUT(nha_group), .cb = nlattr_get_nhg },
	{ .type = NHA_BLACKHOLE, .off = _OUT(nha_blackhole), .cb = nlattr_get_flag },
	{ .type = NHA_OIF, .off = _OUT(nha_oif), .cb = nlattr_get_ifp },
	{ .type = NHA_GATEWAY, .off = _OUT(nha_gw), .cb = nlattr_get_ip },
	{ .type = NHA_GROUPS, .off = _OUT(nha_groups), .cb = nlattr_get_flag },
};
#undef _IN
#undef _OUT
NL_DECLARE_PARSER(nhmsg_parser, struct nhmsg, nlf_p_nh, nla_p_nh);
692 
693 static bool
694 eligible_nhg(const struct nhop_object *nh)
695 {
696 	return (nh->nh_flags & NHF_GATEWAY);
697 }
698 
699 static int
700 newnhg(struct unhop_ctl *ctl, struct nl_parsed_nhop *attrs, struct user_nhop *unhop)
701 {
702 	struct nexthop_grp *grp = NLA_DATA(attrs->nha_group);
703 	int count = NLA_DATA_LEN(attrs->nha_group) / sizeof(*grp);
704 	struct weightened_nhop *wn;
705 
706 	wn = malloc(sizeof(*wn) * count, M_NETLINK, M_NOWAIT | M_ZERO);
707 	if (wn == NULL)
708 		return (ENOMEM);
709 
710 	for (int i = 0; i < count; i++) {
711 		struct user_nhop *unhop;
712 		unhop = nl_find_base_unhop(ctl, grp[i].id);
713 		if (unhop == NULL) {
714 			NL_LOG(LOG_DEBUG, "unable to find uidx %u", grp[i].id);
715 			free(wn, M_NETLINK);
716 			return (ESRCH);
717 		} else if (unhop->un_nhop_src == NULL) {
718 			NL_LOG(LOG_DEBUG, "uidx %u is a group, nested group unsupported",
719 			    grp[i].id);
720 			free(wn, M_NETLINK);
721 			return (ENOTSUP);
722 		} else if (!eligible_nhg(unhop->un_nhop_src)) {
723 			NL_LOG(LOG_DEBUG, "uidx %u nhop is not mpath-eligible",
724 			    grp[i].id);
725 			free(wn, M_NETLINK);
726 			return (ENOTSUP);
727 		}
728 		/*
729 		 * TODO: consider more rigid eligibility checks:
730 		 * restrict nexthops with the same gateway
731 		 */
732 		wn[i].nh = unhop->un_nhop_src;
733 		wn[i].weight = grp[i].weight;
734 	}
735 	unhop->un_nhgrp_src = wn;
736 	unhop->un_nhgrp_count = count;
737 	return (0);
738 }
739 
740 static int
741 newnhop(struct nl_parsed_nhop *attrs, struct user_nhop *unhop)
742 {
743 	struct ifaddr *ifa = NULL;
744 	struct nhop_object *nh;
745 	int error;
746 
747 	if (!attrs->nha_blackhole) {
748 		if (attrs->nha_gw == NULL) {
749 			NL_LOG(LOG_DEBUG, "missing NHA_GATEWAY");
750 			return (EINVAL);
751 		}
752 		if (attrs->nha_oif == NULL) {
753 			NL_LOG(LOG_DEBUG, "missing NHA_OIF");
754 			return (EINVAL);
755 		}
756 		if (ifa == NULL)
757 			ifa = ifaof_ifpforaddr(attrs->nha_gw, attrs->nha_oif);
758 		if (ifa == NULL) {
759 			NL_LOG(LOG_DEBUG, "Unable to determine default source IP");
760 			return (EINVAL);
761 		}
762 	}
763 
764 	int family = attrs->nha_gw != NULL ? attrs->nha_gw->sa_family : attrs->nh_family;
765 
766 	nh = nhop_alloc(RT_DEFAULT_FIB, family);
767 	if (nh == NULL) {
768 		NL_LOG(LOG_DEBUG, "Unable to allocate nexthop");
769 		return (ENOMEM);
770 	}
771 	nhop_set_uidx(nh, attrs->nha_id);
772 
773 	if (attrs->nha_blackhole)
774 		nhop_set_blackhole(nh, NHF_BLACKHOLE);
775 	else {
776 		nhop_set_gw(nh, attrs->nha_gw, true);
777 		nhop_set_transmit_ifp(nh, attrs->nha_oif);
778 		nhop_set_src(nh, ifa);
779 	}
780 
781 	error = nhop_get_unlinked(nh);
782 	if (error != 0) {
783 		NL_LOG(LOG_DEBUG, "unable to finalize nexthop");
784 		return (error);
785 	}
786 
787 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
788 		char nhbuf[NHOP_PRINT_BUFSIZE];
789 		nhop_print_buf(nh, nhbuf, sizeof(nhbuf));
790 		NL_LOG(LOG_DEBUG2, "Adding unhop %u: %s", attrs->nha_id, nhbuf);
791 	}
792 
793 	unhop->un_nhop_src = nh;
794 	return (0);
795 }
796 
/*
 * Handler for RTM_NEWNEXTHOP: parses the request, creates the user
 *  nexthop (or group), inserts it into the per-vnet table and
 *  notifies RTNLGRP_NEXTHOP multicast listeners.
 */
static int
rtnl_handle_newnhop(struct nlmsghdr *hdr, struct nlpcb *nlp,
    struct nl_pstate *npt)
{
	struct user_nhop *unhop;
	int error;

	/* Lazily initialize the per-vnet table on first use */
        if ((__predict_false(V_un_ctl == NULL)) && (!vnet_init_unhops()))
		return (ENOMEM);
	struct unhop_ctl *ctl = V_un_ctl;

	struct nl_parsed_nhop attrs = {};
	error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs);
	if (error != 0)
		return (error);

	/*
	 * Get valid nha_id. Treat nha_id == 0 (auto-assignment) as a second-class
	 *  citizen.
	 */
	if (attrs.nha_id == 0) {
		attrs.nha_id = find_spare_uidx(ctl);
		if (attrs.nha_id == 0) {
			NL_LOG(LOG_DEBUG, "Unable to get spare uidx");
			return (ENOSPC);
		}
	}

	NL_LOG(LOG_DEBUG, "IFINDEX %d", attrs.nha_oif ? attrs.nha_oif->if_index : 0);

	unhop = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO);
	if (unhop == NULL) {
		NL_LOG(LOG_DEBUG, "Unable to allocate user_nhop");
		return (ENOMEM);
	}
	unhop->un_idx = attrs.nha_id;
	unhop->un_protocol = attrs.nh_protocol;

	/* NHA_GROUP presence decides between group and plain nexthop */
	if (attrs.nha_group)
		error = newnhg(ctl, &attrs, unhop);
	else
		error = newnhop(&attrs, unhop);

	if (error != 0) {
		free(unhop, M_NETLINK);
		return (error);
	}

	UN_WLOCK(ctl);
	/* Check if uidx already exists */
	struct user_nhop *tmp = NULL;
	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, unhop, tmp);
	if (tmp != NULL) {
		UN_WUNLOCK(ctl);
		NL_LOG(LOG_DEBUG, "nhop idx %u already exists", attrs.nha_id);
		destroy_unhop(unhop);
		return (EEXIST);
	}
	CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, unhop);
	uint32_t num_buckets_new = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->un_head);
	UN_WUNLOCK(ctl);

	/* Report addition of the new nexthop */
	struct netlink_walkargs wa = {
		.hdr.nlmsg_pid = hdr->nlmsg_pid,
		.hdr.nlmsg_seq = hdr->nlmsg_seq,
		.hdr.nlmsg_flags = hdr->nlmsg_flags,
		.hdr.nlmsg_type = NL_RTM_NEWNEXTHOP,
	};

	struct nl_writer nw = {};
	/* NOTE(review): the nexthop is already inserted at this point, yet a
	 * writer allocation failure returns ENOMEM to the caller and skips
	 * consider_resize() — confirm this is intentional. */
	if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP)) {
		NL_LOG(LOG_DEBUG, "error allocating message writer");
		return (ENOMEM);
	}

	dump_unhop(unhop, &wa.hdr, &nw);
	nlmsg_flush(&nw);

	consider_resize(ctl, num_buckets_new);

        return (0);
}
880 
881 static int
882 rtnl_handle_delnhop(struct nlmsghdr *hdr, struct nlpcb *nlp,
883     struct nl_pstate *npt)
884 {
885 	struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
886 	int error;
887 
888 	if (__predict_false(ctl == NULL))
889 		return (ESRCH);
890 
891 	struct nl_parsed_nhop attrs = {};
892 	error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs);
893 	if (error != 0)
894 		return (error);
895 
896 	if (attrs.nha_id == 0) {
897 		NL_LOG(LOG_DEBUG, "NHA_ID not set");
898 		return (EINVAL);
899 	}
900 
901 	error = delete_unhop(ctl, hdr, attrs.nha_id);
902 
903         return (error);
904 }
905 
906 static bool
907 match_unhop(const struct nl_parsed_nhop *attrs, struct user_nhop *unhop)
908 {
909 	if (attrs->nha_id != 0 && unhop->un_idx != attrs->nha_id)
910 		return (false);
911 	if (attrs->nha_groups != 0 && unhop->un_nhgrp_src == NULL)
912 		return (false);
913 	if (attrs->nha_oif != NULL &&
914 	    (unhop->un_nhop_src == NULL || unhop->un_nhop_src->nh_ifp != attrs->nha_oif))
915 		return (false);
916 
917 	return (true);
918 }
919 
/*
 * Handler for RTM_GETNEXTHOP: dumps either the single nexthop with
 *  the requested NHA_ID or, when no id is given, all master records
 *  that match the optional filters.
 */
static int
rtnl_handle_getnhop(struct nlmsghdr *hdr, struct nlpcb *nlp,
    struct nl_pstate *npt)
{
	struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
	struct user_nhop *unhop;
	UN_TRACKER;
	int error;

	if (__predict_false(ctl == NULL))
		return (ESRCH);

	struct nl_parsed_nhop attrs = {};
	error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs);
	if (error != 0)
		return (error);

	struct netlink_walkargs wa = {
		.nw = npt->nw,
		.hdr.nlmsg_pid = hdr->nlmsg_pid,
		.hdr.nlmsg_seq = hdr->nlmsg_seq,
		.hdr.nlmsg_flags = hdr->nlmsg_flags,
		.hdr.nlmsg_type = NL_RTM_NEWNEXTHOP,
	};

	if (attrs.nha_id != 0) {
		/* Single-nexthop lookup by userland index */
		NL_LOG(LOG_DEBUG2, "searching for uidx %u", attrs.nha_id);
		struct user_nhop key= { .un_idx = attrs.nha_id };
		UN_RLOCK(ctl);
		CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
		UN_RUNLOCK(ctl);

		if (unhop == NULL)
			return (ESRCH);
		dump_unhop(unhop, &wa.hdr, wa.nw);
		return (0);
	}

	/* Full dump: only master (template) records are reported */
	UN_RLOCK(ctl);
	wa.hdr.nlmsg_flags |= NLM_F_MULTI;
	CHT_SLIST_FOREACH(&ctl->un_head, unhop, unhop) {
		if (UNHOP_IS_MASTER(unhop) && match_unhop(&attrs, unhop))
			dump_unhop(unhop, &wa.hdr, wa.nw);
	} CHT_SLIST_FOREACH_END;
	UN_RUNLOCK(ctl);

	/* NOTE(review): wa.error is never set during the walk, so this check
	 * is effectively always true — dump failures are silently ignored. */
	if (wa.error == 0) {
		if (!nlmsg_end_dump(wa.nw, wa.error, &wa.hdr))
			return (ENOMEM);
	}
        return (0);
}
972 
/* RTM_*NEXTHOP command dispatch table */
static const struct rtnl_cmd_handler cmd_handlers[] = {
	{
		.cmd = NL_RTM_NEWNEXTHOP,
		.name = "RTM_NEWNEXTHOP",
		.cb = &rtnl_handle_newnhop,
		.priv = PRIV_NET_ROUTE, /* modification requires route privilege */
	},
	{
		.cmd = NL_RTM_DELNEXTHOP,
		.name = "RTM_DELNEXTHOP",
		.cb = &rtnl_handle_delnhop,
		.priv = PRIV_NET_ROUTE,
	},
	{
		/* Read-only: no privilege requirement */
		.cmd = NL_RTM_GETNEXTHOP,
		.name = "RTM_GETNEXTHOP",
		.cb = &rtnl_handle_getnhop,
	}
};

/* Parsers whose layout is verified at init time */
static const struct nlhdr_parser *all_parsers[] = { &nhmsg_parser };
994 
995 void
996 rtnl_nexthops_init()
997 {
998 	NL_VERIFY_PARSERS(all_parsers);
999 	rtnl_register_messages(cmd_handlers, NL_ARRAY_LEN(cmd_handlers));
1000 }
1001