xref: /freebsd/sys/net/route/nhgrp_ctl.c (revision edf8578117e8844e02c0121147f45e4609b30680)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2020 Alexander V. Chernikov
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 #include "opt_inet.h"
28 #include "opt_route.h"
29 
30 #include <sys/cdefs.h>
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/lock.h>
34 #include <sys/rmlock.h>
35 #include <sys/malloc.h>
36 #include <sys/mbuf.h>
37 #include <sys/refcount.h>
38 #include <sys/socket.h>
39 #include <sys/sysctl.h>
40 #include <sys/kernel.h>
41 #include <sys/epoch.h>
42 
43 #include <net/if.h>
44 #include <net/if_var.h>
45 #include <net/if_private.h>
46 #include <net/route.h>
47 #include <net/route/route_ctl.h>
48 #include <net/route/route_var.h>
49 #include <net/vnet.h>
50 
51 #include <netinet/in.h>
52 #include <netinet/in_var.h>
53 #include <netinet/in_fib.h>
54 
55 #include <net/route/nhop_utils.h>
56 #include <net/route/nhop.h>
57 #include <net/route/nhop_var.h>
58 #include <net/route/nhgrp_var.h>
59 
60 #define	DEBUG_MOD_NAME	nhgrp_ctl
61 #define	DEBUG_MAX_LEVEL	LOG_DEBUG
62 #include <net/route/route_debug.h>
63 _DECLARE_DEBUG(LOG_INFO);
64 
65 /*
66  * This file contains the supporting functions for creating multipath groups
67  *  and compiling their dataplane parts.
68  */
69 
70 /* MPF_MULTIPATH must be the same as NHF_MULTIPATH for nhop selection to work */
71 _Static_assert(MPF_MULTIPATH == NHF_MULTIPATH,
72     "MPF_MULTIPATH must be the same as NHF_MULTIPATH");
73 /* Offset and size of flags field has to be the same for nhop/nhop groups */
74 CHK_STRUCT_FIELD_GENERIC(struct nhop_object, nh_flags, struct nhgrp_object, nhg_flags);
75 /* Cap multipath to 64, as the larger values would break rib_cmd_info bmasks */
76 CTASSERT(RIB_MAX_MPATH_WIDTH <= 64);
77 
78 static int wn_cmp_idx(const void *a, const void *b);
79 static void sort_weightened_nhops(struct weightened_nhop *wn, int num_nhops);
80 
81 static struct nhgrp_priv *get_nhgrp(struct nh_control *ctl,
82     struct weightened_nhop *wn, int num_nhops, uint32_t uidx, int *perror);
83 static void destroy_nhgrp(struct nhgrp_priv *nhg_priv);
84 static void destroy_nhgrp_epoch(epoch_context_t ctx);
85 static void free_nhgrp_nhops(struct nhgrp_priv *nhg_priv);
86 
87 static int
88 wn_cmp_idx(const void *a, const void *b)
89 {
90 	const struct weightened_nhop *w_a = a;
91 	const struct weightened_nhop *w_b = b;
92 	uint32_t a_idx = w_a->nh->nh_priv->nh_idx;
93 	uint32_t b_idx = w_b->nh->nh_priv->nh_idx;
94 
95 	if (a_idx < b_idx)
96 		return (-1);
97 	else if (a_idx > b_idx)
98 		return (1);
99 	else
100 		return (0);
101 }
102 
103 /*
104  * Perform in-place sorting for array of nexthops in @wn.
105  * Sort by nexthop index ascending.
106  */
107 static void
108 sort_weightened_nhops(struct weightened_nhop *wn, int num_nhops)
109 {
110 
111 	qsort(wn, num_nhops, sizeof(struct weightened_nhop), wn_cmp_idx);
112 }
113 
114 /*
115  * In order to determine the minimum weight difference in the array
116  * of weights, create a sorted array of weights, using spare "storage"
117  * field in the `struct weightened_nhop`.
118  * Assume weights to be (mostly) the same and use insertion sort to
119  * make it sorted.
120  */
121 static void
122 sort_weightened_nhops_weights(struct weightened_nhop *wn, int num_items)
123 {
124 	wn[0].storage = wn[0].weight;
125 	for (int i = 1, j = 0; i < num_items; i++) {
126 		uint32_t weight = wn[i].weight; // read from 'weight' as it's not reordered
127 		/* Move all weights > weight 1 position right */
128 		for (j = i - 1; j >= 0 && wn[j].storage > weight; j--)
129 			wn[j + 1].storage = wn[j].storage;
130 		wn[j + 1].storage = weight;
131 	}
132 }
133 
134 /*
135  * Calculate minimum number of slots required to fit the existing
136  * set of weights in the common use case where weights are "easily"
137  * comparable.
138  * Assumes @wn is sorted by weight ascending and each weight is > 0.
139  * Returns number of slots or 0 if precise calculation failed.
140  *
141  * Some examples:
142  * note: (i, X) pair means (nhop=i, weight=X):
143  * (1, 1) (2, 2) -> 3 slots [1, 2, 2]
144  * (1, 100), (2, 200) -> 3 slots [1, 2, 2]
145  * (1, 100), (2, 200), (3, 400) -> 7 slots [1, 2, 2, 3, 3, 3]
146  */
147 static uint32_t
148 calc_min_mpath_slots_fast(struct weightened_nhop *wn, size_t num_items,
149     uint64_t *ptotal)
150 {
151 	uint32_t i, last, xmin;
152 	uint64_t total = 0;
153 
154 	// Get sorted array of weights in .storage field
155 	sort_weightened_nhops_weights(wn, num_items);
156 
157 	last = 0;
158 	xmin = wn[0].storage;
159 	for (i = 0; i < num_items; i++) {
160 		total += wn[i].storage;
161 		if ((wn[i].storage != last) &&
162 		    ((wn[i].storage - last < xmin) || xmin == 0)) {
163 			xmin = wn[i].storage - last;
164 		}
165 		last = wn[i].storage;
166 	}
167 	*ptotal = total;
168 	/* xmin is the minimum unit of desired capacity */
169 	if ((total % xmin) != 0)
170 		return (0);
171 	for (i = 0; i < num_items; i++) {
172 		if ((wn[i].weight % xmin) != 0)
173 			return (0);
174 	}
175 
176 	return ((uint32_t)(total / xmin));
177 }
178 
179 /*
180  * Calculate minimum number of slots required to fit the existing
181  * set of weights while maintaining weight coefficients.
182  *
183  * Assume @wn is sorted by weight ascending and each weight is > 0.
184  *
185  * Tries to find simple precise solution first and falls back to
186  *  RIB_MAX_MPATH_WIDTH in case of any failure.
187  */
188 static uint32_t
189 calc_min_mpath_slots(struct weightened_nhop *wn, size_t num_items)
190 {
191 	uint32_t v;
192 	uint64_t total;
193 
194 	v = calc_min_mpath_slots_fast(wn, num_items, &total);
195 	if (total == 0)
196 		return (0);
197 	if ((v == 0) || (v > RIB_MAX_MPATH_WIDTH))
198 		v = RIB_MAX_MPATH_WIDTH;
199 
200 	return (v);
201 }
202 
203 /*
204  * Nexthop group data consists of
205  * 1) dataplane part, with nhgrp_object as a header followed by an
206  *   arbitrary number of nexthop pointers.
207  * 2) control plane part, with nhgrp_priv as a header, followed by
208  *   an arbirtrary number of 'struct weightened_nhop' object.
209  *
210  * Given nexthop groups are (mostly) immutable, allocate all data
211  * in one go.
212  *
213  */
214 __noinline static size_t
215 get_nhgrp_alloc_size(uint32_t nhg_size, uint32_t num_nhops)
216 {
217 	size_t sz;
218 
219 	sz = sizeof(struct nhgrp_object);
220 	sz += nhg_size * sizeof(struct nhop_object *);
221 	sz += sizeof(struct nhgrp_priv);
222 	sz += num_nhops * sizeof(struct weightened_nhop);
223 	return (sz);
224 }
225 
226 /*
227  * Compile actual list of nexthops to be used by datapath from
228  *  the nexthop group @dst.
229  *
230  * For example, compiling control plane list of 2 nexthops
231  *  [(200, A), (100, B)] would result in the datapath array
232  *  [A, A, B]
233  */
234 static void
235 compile_nhgrp(struct nhgrp_priv *dst_priv, const struct weightened_nhop *x,
236     uint32_t num_slots)
237 {
238 	struct nhgrp_object *dst;
239 	int i, slot_idx, remaining_slots;
240 	uint64_t remaining_sum, nh_weight, nh_slots;
241 
242 	slot_idx  = 0;
243 	dst = dst_priv->nhg;
244 	/* Calculate sum of all weights */
245 	remaining_sum = 0;
246 	for (i = 0; i < dst_priv->nhg_nh_count; i++)
247 		remaining_sum += x[i].weight;
248 	remaining_slots = num_slots;
249 	FIB_NH_LOG(LOG_DEBUG3, x[0].nh, "sum: %lu, slots: %d",
250 	    remaining_sum, remaining_slots);
251 	for (i = 0; i < dst_priv->nhg_nh_count; i++) {
252 		/* Calculate number of slots for the current nexthop */
253 		if (remaining_sum > 0) {
254 			nh_weight = (uint64_t)x[i].weight;
255 			nh_slots = (nh_weight * remaining_slots / remaining_sum);
256 		} else
257 			nh_slots = 0;
258 
259 		remaining_sum -= x[i].weight;
260 		remaining_slots -= nh_slots;
261 
262 		FIB_NH_LOG(LOG_DEBUG3, x[0].nh,
263 		    " rem_sum: %lu, rem_slots: %d nh_slots: %d, slot_idx: %d",
264 		    remaining_sum, remaining_slots, (int)nh_slots, slot_idx);
265 
266 		KASSERT((slot_idx + nh_slots <= num_slots),
267 		    ("index overflow during nhg compilation"));
268 		while (nh_slots-- > 0)
269 			dst->nhops[slot_idx++] = x[i].nh;
270 	}
271 }
272 
273 /*
274  * Allocates new nexthop group for the list of weightened nexthops.
275  * Assume sorted list.
276  * Does NOT reference any nexthops in the group.
277  * Returns group with refcount=1 or NULL.
278  */
279 static struct nhgrp_priv *
280 alloc_nhgrp(struct weightened_nhop *wn, int num_nhops)
281 {
282 	uint32_t nhgrp_size;
283 	struct nhgrp_object *nhg;
284 	struct nhgrp_priv *nhg_priv;
285 
286 	nhgrp_size = calc_min_mpath_slots(wn, num_nhops);
287 	if (nhgrp_size == 0) {
288 		/* Zero weights, abort */
289 		return (NULL);
290 	}
291 
292 	size_t sz = get_nhgrp_alloc_size(nhgrp_size, num_nhops);
293 	nhg = malloc(sz, M_NHOP, M_NOWAIT | M_ZERO);
294 	if (nhg == NULL) {
295 		FIB_NH_LOG(LOG_INFO, wn[0].nh,
296 		    "unable to allocate group with num_nhops %d (compiled %u)",
297 		    num_nhops, nhgrp_size);
298 		return (NULL);
299 	}
300 
301 	/* Has to be the first to make NHGRP_PRIV() work */
302 	nhg->nhg_size = nhgrp_size;
303 	nhg->nhg_flags = MPF_MULTIPATH;
304 
305 	nhg_priv = NHGRP_PRIV(nhg);
306 	nhg_priv->nhg_nh_count = num_nhops;
307 	refcount_init(&nhg_priv->nhg_refcount, 1);
308 
309 	/* Please see nhgrp_free() comments on the initial value */
310 	refcount_init(&nhg_priv->nhg_linked, 2);
311 
312 	nhg_priv->nhg = nhg;
313 	memcpy(&nhg_priv->nhg_nh_weights[0], wn,
314 	  num_nhops * sizeof(struct weightened_nhop));
315 
316 	FIB_NH_LOG(LOG_DEBUG, wn[0].nh, "num_nhops: %d, compiled_nhop: %u",
317 	    num_nhops, nhgrp_size);
318 
319 	compile_nhgrp(nhg_priv, wn, nhg->nhg_size);
320 
321 	return (nhg_priv);
322 }
323 
324 void
325 nhgrp_ref_object(struct nhgrp_object *nhg)
326 {
327 	struct nhgrp_priv *nhg_priv;
328 	u_int old __diagused;
329 
330 	nhg_priv = NHGRP_PRIV(nhg);
331 	old = refcount_acquire(&nhg_priv->nhg_refcount);
332 	KASSERT(old > 0, ("%s: nhgrp object %p has 0 refs", __func__, nhg));
333 }
334 
335 void
336 nhgrp_free(struct nhgrp_object *nhg)
337 {
338 	struct nhgrp_priv *nhg_priv;
339 	struct nh_control *ctl;
340 	struct epoch_tracker et;
341 
342 	nhg_priv = NHGRP_PRIV(nhg);
343 
344 	if (!refcount_release(&nhg_priv->nhg_refcount))
345 		return;
346 
347 	/*
348 	 * group objects don't have an explicit lock attached to it.
349 	 * As groups are reclaimed based on reference count, it is possible
350 	 * that some groups will persist after vnet destruction callback
351 	 * called. Given that, handle scenario with nhgrp_free_group() being
352 	 * called either after or simultaneously with nhgrp_ctl_unlink_all()
353 	 * by using another reference counter: nhg_linked.
354 	 *
355 	 * There are only 2 places, where nhg_linked can be decreased:
356 	 *  rib destroy (nhgrp_ctl_unlink_all) and this function.
357 	 * nhg_link can never be increased.
358 	 *
359 	 * Hence, use initial value of 2 to make use of
360 	 *  refcount_release_if_not_last().
361 	 *
362 	 * There can be two scenarious when calling this function:
363 	 *
364 	 * 1) nhg_linked value is 2. This means that either
365 	 *  nhgrp_ctl_unlink_all() has not been called OR it is running,
366 	 *  but we are guaranteed that nh_control won't be freed in
367 	 *  this epoch. Hence, nexthop can be safely unlinked.
368 	 *
369 	 * 2) nh_linked value is 1. In that case, nhgrp_ctl_unlink_all()
370 	 *  has been called and nhgrp unlink can be skipped.
371 	 */
372 
373 	NET_EPOCH_ENTER(et);
374 	if (refcount_release_if_not_last(&nhg_priv->nhg_linked)) {
375 		ctl = nhg_priv->nh_control;
376 		if (unlink_nhgrp(ctl, nhg_priv) == NULL) {
377 			/* Do not try to reclaim */
378 			RT_LOG(LOG_INFO, "Failed to unlink nexhop group %p",
379 			    nhg_priv);
380 			NET_EPOCH_EXIT(et);
381 			return;
382 		}
383 		MPASS((nhg_priv->nhg_idx == 0));
384 		MPASS((nhg_priv->nhg_refcount == 0));
385 	}
386 	NET_EPOCH_EXIT(et);
387 
388 	NET_EPOCH_CALL(destroy_nhgrp_epoch, &nhg_priv->nhg_epoch_ctx);
389 }
390 
391 /*
392  * Destroys all local resources belonging to @nhg_priv.
393  */
394 __noinline static void
395 destroy_nhgrp_int(struct nhgrp_priv *nhg_priv)
396 {
397 
398 	free(nhg_priv->nhg, M_NHOP);
399 }
400 
401 __noinline static void
402 destroy_nhgrp(struct nhgrp_priv *nhg_priv)
403 {
404 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
405 		char nhgbuf[NHOP_PRINT_BUFSIZE] __unused;
406 		FIB_NH_LOG(LOG_DEBUG2, nhg_priv->nhg_nh_weights[0].nh,
407 		    "destroying %s", nhgrp_print_buf(nhg_priv->nhg,
408 		    nhgbuf, sizeof(nhgbuf)));
409 	}
410 
411 	free_nhgrp_nhops(nhg_priv);
412 	destroy_nhgrp_int(nhg_priv);
413 }
414 
415 /*
416  * Epoch callback indicating group is safe to destroy
417  */
418 static void
419 destroy_nhgrp_epoch(epoch_context_t ctx)
420 {
421 	struct nhgrp_priv *nhg_priv;
422 
423 	nhg_priv = __containerof(ctx, struct nhgrp_priv, nhg_epoch_ctx);
424 
425 	destroy_nhgrp(nhg_priv);
426 }
427 
428 static bool
429 ref_nhgrp_nhops(struct nhgrp_priv *nhg_priv)
430 {
431 
432 	for (int i = 0; i < nhg_priv->nhg_nh_count; i++) {
433 		if (nhop_try_ref_object(nhg_priv->nhg_nh_weights[i].nh) != 0)
434 			continue;
435 
436 		/*
437 		 * Failed to ref the nexthop, b/c it's deleted.
438 		 * Need to rollback references back.
439 		 */
440 		for (int j = 0; j < i; j++)
441 			nhop_free(nhg_priv->nhg_nh_weights[j].nh);
442 		return (false);
443 	}
444 
445 	return (true);
446 }
447 
448 static void
449 free_nhgrp_nhops(struct nhgrp_priv *nhg_priv)
450 {
451 
452 	for (int i = 0; i < nhg_priv->nhg_nh_count; i++)
453 		nhop_free(nhg_priv->nhg_nh_weights[i].nh);
454 }
455 
456 /*
457  * Allocate nexthop group of size @num_nhops with nexthops specified by
458  * @wn. Nexthops have to be unique and match the fibnum/family of the group.
459  * Returns unlinked nhgrp object on success or NULL and non-zero perror.
460  */
461 struct nhgrp_object *
462 nhgrp_alloc(uint32_t fibnum, int family, struct weightened_nhop *wn, int num_nhops,
463     int *perror)
464 {
465 	struct rib_head *rh = rt_tables_get_rnh(fibnum, family);
466 	struct nhgrp_priv *nhg_priv;
467 	struct nh_control *ctl;
468 
469 	if (rh == NULL) {
470 		*perror = E2BIG;
471 		return (NULL);
472 	}
473 
474 	ctl = rh->nh_control;
475 
476 	if (num_nhops > RIB_MAX_MPATH_WIDTH) {
477 		*perror = E2BIG;
478 		return (NULL);
479 	}
480 
481 	if (ctl->gr_head.hash_size == 0) {
482 		/* First multipath request. Bootstrap mpath datastructures. */
483 		if (nhgrp_ctl_alloc_default(ctl, M_NOWAIT) == 0) {
484 			*perror = ENOMEM;
485 			return (NULL);
486 		}
487 	}
488 
489 	/* Sort nexthops & check there are no duplicates */
490 	sort_weightened_nhops(wn, num_nhops);
491 	uint32_t last_id = 0;
492 	for (int i = 0; i < num_nhops; i++) {
493 		if (wn[i].nh->nh_priv->nh_control != ctl) {
494 			*perror = EINVAL;
495 			return (NULL);
496 		}
497 		if (wn[i].nh->nh_priv->nh_idx == last_id) {
498 			*perror = EEXIST;
499 			return (NULL);
500 		}
501 		last_id = wn[i].nh->nh_priv->nh_idx;
502 	}
503 
504 	if ((nhg_priv = alloc_nhgrp(wn, num_nhops)) == NULL) {
505 		*perror = ENOMEM;
506 		return (NULL);
507 	}
508 	nhg_priv->nh_control = ctl;
509 
510 	*perror = 0;
511 	return (nhg_priv->nhg);
512 }
513 
514 /*
515  * Finds an existing group matching @nhg or links @nhg to the tree.
516  * Returns the referenced group or NULL and non-zero @perror.
517  */
518 struct nhgrp_object *
519 nhgrp_get_nhgrp(struct nhgrp_object *nhg, int *perror)
520 {
521 	struct nhgrp_priv *nhg_priv, *key = NHGRP_PRIV(nhg);
522 	struct nh_control *ctl = key->nh_control;
523 
524 	nhg_priv = find_nhgrp(ctl, key);
525 	if (nhg_priv != NULL) {
526 		/*
527 		 * Free originally-created group. As it hasn't been linked
528 		 *  and the dependent nexhops haven't been referenced, just free
529 		 *  the group.
530 		 */
531 		destroy_nhgrp_int(key);
532 		*perror = 0;
533 		return (nhg_priv->nhg);
534 	} else {
535 		/* No existing group, try to link the new one */
536 		if (!ref_nhgrp_nhops(key)) {
537 			/*
538 			 * Some of the nexthops have been scheduled for deletion.
539 			 * As the group hasn't been linked / no nexhops have been
540 			 *  referenced, call the final destructor immediately.
541 			 */
542 			destroy_nhgrp_int(key);
543 			*perror = EAGAIN;
544 			return (NULL);
545 		}
546 		if (link_nhgrp(ctl, key) == 0) {
547 			/* Unable to allocate index? */
548 			*perror = EAGAIN;
549 			free_nhgrp_nhops(key);
550 			destroy_nhgrp_int(key);
551 			return (NULL);
552 		}
553 		*perror = 0;
554 		return (nhg);
555 	}
556 
557 	/* NOTREACHED */
558 }
559 
560 /*
561  * Creates or looks up an existing nexthop group based on @wn and @num_nhops.
562  *
563  * Returns referenced nhop group or NULL, passing error code in @perror.
564  */
565 struct nhgrp_priv *
566 get_nhgrp(struct nh_control *ctl, struct weightened_nhop *wn, int num_nhops,
567     uint32_t uidx, int *perror)
568 {
569 	struct nhgrp_object *nhg;
570 
571 	nhg = nhgrp_alloc(ctl->ctl_rh->rib_fibnum, ctl->ctl_rh->rib_family,
572 	    wn, num_nhops, perror);
573 	if (nhg == NULL)
574 		return (NULL);
575 	nhgrp_set_uidx(nhg, uidx);
576 	nhg = nhgrp_get_nhgrp(nhg, perror);
577 	if (nhg != NULL)
578 		return (NHGRP_PRIV(nhg));
579 	return (NULL);
580 }
581 
582 
583 /*
584  * Appends one or more nexthops denoted by @wm to the nexthop group @gr_orig.
585  *
586  * Returns referenced nexthop group or NULL. In the latter case, @perror is
587  *  filled with an error code.
588  * Note that function does NOT care if the next nexthops already exists
589  * in the @gr_orig. As a result, they will be added, resulting in the
590  * same nexthop being present multiple times in the new group.
591  */
592 static struct nhgrp_priv *
593 append_nhops(struct nh_control *ctl, const struct nhgrp_object *gr_orig,
594     struct weightened_nhop *wn, int num_nhops, int *perror)
595 {
596 	char storage[64];
597 	struct weightened_nhop *pnhops;
598 	struct nhgrp_priv *nhg_priv;
599 	const struct nhgrp_priv *src_priv;
600 	size_t sz;
601 	int curr_nhops;
602 
603 	src_priv = NHGRP_PRIV_CONST(gr_orig);
604 	curr_nhops = src_priv->nhg_nh_count;
605 
606 	*perror = 0;
607 
608 	sz = (src_priv->nhg_nh_count + num_nhops) * (sizeof(struct weightened_nhop));
609 	/* optimize for <= 4 paths, each path=16 bytes */
610 	if (sz <= sizeof(storage))
611 		pnhops = (struct weightened_nhop *)&storage[0];
612 	else {
613 		pnhops = malloc(sz, M_TEMP, M_NOWAIT);
614 		if (pnhops == NULL) {
615 			*perror = ENOMEM;
616 			return (NULL);
617 		}
618 	}
619 
620 	/* Copy nhops from original group first */
621 	memcpy(pnhops, src_priv->nhg_nh_weights,
622 	  curr_nhops * sizeof(struct weightened_nhop));
623 	memcpy(&pnhops[curr_nhops], wn, num_nhops * sizeof(struct weightened_nhop));
624 	curr_nhops += num_nhops;
625 
626 	nhg_priv = get_nhgrp(ctl, pnhops, curr_nhops, 0, perror);
627 
628 	if (pnhops != (struct weightened_nhop *)&storage[0])
629 		free(pnhops, M_TEMP);
630 
631 	if (nhg_priv == NULL)
632 		return (NULL);
633 
634 	return (nhg_priv);
635 }
636 
637 
638 /*
639  * Creates/finds nexthop group based on @wn and @num_nhops.
640  * Returns 0 on success with referenced group in @rnd, or
641  * errno.
642  *
643  * If the error is EAGAIN, then the operation can be retried.
644  */
645 int
646 nhgrp_get_group(struct rib_head *rh, struct weightened_nhop *wn, int num_nhops,
647     uint32_t uidx, struct nhgrp_object **pnhg)
648 {
649 	struct nh_control *ctl = rh->nh_control;
650 	struct nhgrp_priv *nhg_priv;
651 	int error;
652 
653 	nhg_priv = get_nhgrp(ctl, wn, num_nhops, uidx, &error);
654 	if (nhg_priv != NULL)
655 		*pnhg = nhg_priv->nhg;
656 
657 	return (error);
658 }
659 
660 /*
661  * Creates new nexthop group based on @src group without the nexthops
662  * chosen by @flt_func.
663  * Returns 0 on success, storring the reference nhop group/object in @rnd.
664  */
665 int
666 nhgrp_get_filtered_group(struct rib_head *rh, const struct rtentry *rt,
667     const struct nhgrp_object *src, rib_filter_f_t flt_func, void *flt_data,
668     struct route_nhop_data *rnd)
669 {
670 	char storage[64];
671 	struct nh_control *ctl = rh->nh_control;
672 	struct weightened_nhop *pnhops;
673 	const struct nhgrp_priv *mp_priv, *src_priv;
674 	size_t sz;
675 	int error, i, num_nhops;
676 
677 	src_priv = NHGRP_PRIV_CONST(src);
678 
679 	sz = src_priv->nhg_nh_count * (sizeof(struct weightened_nhop));
680 	/* optimize for <= 4 paths, each path=16 bytes */
681 	if (sz <= sizeof(storage))
682 		pnhops = (struct weightened_nhop *)&storage[0];
683 	else {
684 		if ((pnhops = malloc(sz, M_TEMP, M_NOWAIT)) == NULL)
685 			return (ENOMEM);
686 	}
687 
688 	/* Filter nexthops */
689 	error = 0;
690 	num_nhops = 0;
691 	for (i = 0; i < src_priv->nhg_nh_count; i++) {
692 		if (flt_func(rt, src_priv->nhg_nh_weights[i].nh, flt_data))
693 			continue;
694 		memcpy(&pnhops[num_nhops++], &src_priv->nhg_nh_weights[i],
695 		  sizeof(struct weightened_nhop));
696 	}
697 
698 	if (num_nhops == 0) {
699 		rnd->rnd_nhgrp = NULL;
700 		rnd->rnd_weight = 0;
701 	} else if (num_nhops == 1) {
702 		rnd->rnd_nhop = pnhops[0].nh;
703 		rnd->rnd_weight = pnhops[0].weight;
704 		if (nhop_try_ref_object(rnd->rnd_nhop) == 0)
705 			error = EAGAIN;
706 	} else {
707 		mp_priv = get_nhgrp(ctl, pnhops, num_nhops, 0, &error);
708 		if (mp_priv != NULL)
709 			rnd->rnd_nhgrp = mp_priv->nhg;
710 		rnd->rnd_weight = 0;
711 	}
712 
713 	if (pnhops != (struct weightened_nhop *)&storage[0])
714 		free(pnhops, M_TEMP);
715 
716 	return (error);
717 }
718 
719 /*
720  * Creates new multipath group based on existing group/nhop in @rnd_orig and
721  *  to-be-added nhop @wn_add.
722  * Returns 0 on success and stores result in @rnd_new.
723  */
724 int
725 nhgrp_get_addition_group(struct rib_head *rh, struct route_nhop_data *rnd_orig,
726     struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_new)
727 {
728 	struct nh_control *ctl = rh->nh_control;
729 	struct nhgrp_priv *nhg_priv;
730 	struct weightened_nhop wn[2] = {};
731 	int error;
732 
733 	if (rnd_orig->rnd_nhop == NULL) {
734 		/* No paths to add to, just reference current nhop */
735 		*rnd_new = *rnd_add;
736 		if (nhop_try_ref_object(rnd_new->rnd_nhop) == 0)
737 			return (EAGAIN);
738 		return (0);
739 	}
740 
741 	wn[0].nh = rnd_add->rnd_nhop;
742 	wn[0].weight = rnd_add->rnd_weight;
743 
744 	if (!NH_IS_NHGRP(rnd_orig->rnd_nhop)) {
745 		/* Simple merge of 2 non-multipath nexthops */
746 		wn[1].nh = rnd_orig->rnd_nhop;
747 		wn[1].weight = rnd_orig->rnd_weight;
748 		nhg_priv = get_nhgrp(ctl, wn, 2, 0, &error);
749 	} else {
750 		/* Get new nhop group with @rt->rt_nhop as an additional nhop */
751 		nhg_priv = append_nhops(ctl, rnd_orig->rnd_nhgrp, &wn[0], 1,
752 		    &error);
753 	}
754 
755 	if (nhg_priv == NULL)
756 		return (error);
757 	rnd_new->rnd_nhgrp = nhg_priv->nhg;
758 	rnd_new->rnd_weight = 0;
759 
760 	return (0);
761 }
762 
763 /*
764  * Returns pointer to array of nexthops with weights for
765  * given @nhg. Stores number of items in the array into @pnum_nhops.
766  */
767 const struct weightened_nhop *
768 nhgrp_get_nhops(const struct nhgrp_object *nhg, uint32_t *pnum_nhops)
769 {
770 	const struct nhgrp_priv *nhg_priv;
771 
772 	KASSERT(((nhg->nhg_flags & MPF_MULTIPATH) != 0), ("nhop is not mpath"));
773 
774 	nhg_priv = NHGRP_PRIV_CONST(nhg);
775 	*pnum_nhops = nhg_priv->nhg_nh_count;
776 
777 	return (nhg_priv->nhg_nh_weights);
778 }
779 
780 void
781 nhgrp_set_uidx(struct nhgrp_object *nhg, uint32_t uidx)
782 {
783 	struct nhgrp_priv *nhg_priv;
784 
785 	KASSERT(((nhg->nhg_flags & MPF_MULTIPATH) != 0), ("nhop is not mpath"));
786 
787 	nhg_priv = NHGRP_PRIV(nhg);
788 
789 	nhg_priv->nhg_uidx = uidx;
790 }
791 
792 uint32_t
793 nhgrp_get_uidx(const struct nhgrp_object *nhg)
794 {
795 	const struct nhgrp_priv *nhg_priv;
796 
797 	KASSERT(((nhg->nhg_flags & MPF_MULTIPATH) != 0), ("nhop is not mpath"));
798 
799 	nhg_priv = NHGRP_PRIV_CONST(nhg);
800 	return (nhg_priv->nhg_uidx);
801 }
802 
803 /*
804  * Prints nexhop group @nhg data in the provided @buf.
805  * Example: nhg#33/sz=3:[#1:100,#2:100,#3:100]
806  * Example: nhg#33/sz=5:[#1:100,#2:100,..]
807  */
808 char *
809 nhgrp_print_buf(const struct nhgrp_object *nhg, char *buf, size_t bufsize)
810 {
811 	const struct nhgrp_priv *nhg_priv = NHGRP_PRIV_CONST(nhg);
812 
813 	int off = snprintf(buf, bufsize, "nhg#%u/sz=%u:[", nhg_priv->nhg_idx,
814 	    nhg_priv->nhg_nh_count);
815 
816 	for (int i = 0; i < nhg_priv->nhg_nh_count; i++) {
817 		const struct weightened_nhop *wn = &nhg_priv->nhg_nh_weights[i];
818 		int len = snprintf(&buf[off], bufsize - off, "#%u:%u,",
819 		    wn->nh->nh_priv->nh_idx, wn->weight);
820 		if (len + off + 3 >= bufsize) {
821 			int len = snprintf(&buf[off], bufsize - off, "...");
822 			off += len;
823 			break;
824 		}
825 		off += len;
826 	}
827 	if (off > 0)
828 		off--; // remove last ","
829 	if (off + 1 < bufsize)
830 		snprintf(&buf[off], bufsize - off, "]");
831 	return buf;
832 }
833 
834 __noinline static int
835 dump_nhgrp_entry(struct rib_head *rh, const struct nhgrp_priv *nhg_priv,
836     char *buffer, size_t buffer_size, struct sysctl_req *w)
837 {
838 	struct rt_msghdr *rtm;
839 	struct nhgrp_external *nhge;
840 	struct nhgrp_container *nhgc;
841 	const struct nhgrp_object *nhg;
842 	struct nhgrp_nhop_external *ext;
843 	int error;
844 	size_t sz;
845 
846 	nhg = nhg_priv->nhg;
847 
848 	sz = sizeof(struct rt_msghdr) + sizeof(struct nhgrp_external);
849 	/* controlplane nexthops */
850 	sz += sizeof(struct nhgrp_container);
851 	sz += sizeof(struct nhgrp_nhop_external) * nhg_priv->nhg_nh_count;
852 	/* dataplane nexthops */
853 	sz += sizeof(struct nhgrp_container);
854 	sz += sizeof(struct nhgrp_nhop_external) * nhg->nhg_size;
855 
856 	KASSERT(sz <= buffer_size, ("increase nhgrp buffer size"));
857 
858 	bzero(buffer, sz);
859 
860 	rtm = (struct rt_msghdr *)buffer;
861 	rtm->rtm_msglen = sz;
862 	rtm->rtm_version = RTM_VERSION;
863 	rtm->rtm_type = RTM_GET;
864 
865 	nhge = (struct nhgrp_external *)(rtm + 1);
866 
867 	nhge->nhg_idx = nhg_priv->nhg_idx;
868 	nhge->nhg_refcount = nhg_priv->nhg_refcount;
869 
870 	/* fill in control plane nexthops firs */
871 	nhgc = (struct nhgrp_container *)(nhge + 1);
872 	nhgc->nhgc_type = NHG_C_TYPE_CNHOPS;
873 	nhgc->nhgc_subtype = 0;
874 	nhgc->nhgc_len = sizeof(struct nhgrp_container);
875 	nhgc->nhgc_len += sizeof(struct nhgrp_nhop_external) * nhg_priv->nhg_nh_count;
876 	nhgc->nhgc_count = nhg_priv->nhg_nh_count;
877 
878 	ext = (struct nhgrp_nhop_external *)(nhgc + 1);
879 	for (int i = 0; i < nhg_priv->nhg_nh_count; i++) {
880 		ext[i].nh_idx = nhg_priv->nhg_nh_weights[i].nh->nh_priv->nh_idx;
881 		ext[i].nh_weight = nhg_priv->nhg_nh_weights[i].weight;
882 	}
883 
884 	/* fill in dataplane nexthops */
885 	nhgc = (struct nhgrp_container *)(&ext[nhg_priv->nhg_nh_count]);
886 	nhgc->nhgc_type = NHG_C_TYPE_DNHOPS;
887 	nhgc->nhgc_subtype = 0;
888 	nhgc->nhgc_len = sizeof(struct nhgrp_container);
889 	nhgc->nhgc_len += sizeof(struct nhgrp_nhop_external) * nhg->nhg_size;
890 	nhgc->nhgc_count = nhg->nhg_size;
891 
892 	ext = (struct nhgrp_nhop_external *)(nhgc + 1);
893 	for (int i = 0; i < nhg->nhg_size; i++) {
894 		ext[i].nh_idx = nhg->nhops[i]->nh_priv->nh_idx;
895 		ext[i].nh_weight = 0;
896 	}
897 
898 	error = SYSCTL_OUT(w, buffer, sz);
899 
900 	return (error);
901 }
902 
903 uint32_t
904 nhgrp_get_idx(const struct nhgrp_object *nhg)
905 {
906 	const struct nhgrp_priv *nhg_priv;
907 
908 	nhg_priv = NHGRP_PRIV_CONST(nhg);
909 	return (nhg_priv->nhg_idx);
910 }
911 
912 uint8_t
913 nhgrp_get_origin(const struct nhgrp_object *nhg)
914 {
915 	return (NHGRP_PRIV_CONST(nhg)->nhg_origin);
916 }
917 
918 void
919 nhgrp_set_origin(struct nhgrp_object *nhg, uint8_t origin)
920 {
921 	NHGRP_PRIV(nhg)->nhg_origin = origin;
922 }
923 
924 uint32_t
925 nhgrp_get_count(struct rib_head *rh)
926 {
927 	struct nh_control *ctl;
928 	uint32_t count;
929 
930 	ctl = rh->nh_control;
931 
932 	NHOPS_RLOCK(ctl);
933 	count = ctl->gr_head.items_count;
934 	NHOPS_RUNLOCK(ctl);
935 
936 	return (count);
937 }
938 
939 int
940 nhgrp_dump_sysctl(struct rib_head *rh, struct sysctl_req *w)
941 {
942 	struct nh_control *ctl = rh->nh_control;
943 	struct epoch_tracker et;
944 	struct nhgrp_priv *nhg_priv;
945 	char *buffer;
946 	size_t sz;
947 	int error = 0;
948 
949 	if (ctl->gr_head.items_count == 0)
950 		return (0);
951 
952 	/* Calculate the maximum nhop group size in bytes */
953 	sz = sizeof(struct rt_msghdr) + sizeof(struct nhgrp_external);
954 	sz += 2 * sizeof(struct nhgrp_container);
955 	sz += 2 * sizeof(struct nhgrp_nhop_external) * RIB_MAX_MPATH_WIDTH;
956 	buffer = malloc(sz, M_TEMP, M_NOWAIT);
957 	if (buffer == NULL)
958 		return (ENOMEM);
959 
960 	NET_EPOCH_ENTER(et);
961 	NHOPS_RLOCK(ctl);
962 	CHT_SLIST_FOREACH(&ctl->gr_head, mpath, nhg_priv) {
963 		error = dump_nhgrp_entry(rh, nhg_priv, buffer, sz, w);
964 		if (error != 0)
965 			break;
966 	} CHT_SLIST_FOREACH_END;
967 	NHOPS_RUNLOCK(ctl);
968 	NET_EPOCH_EXIT(et);
969 
970 	free(buffer, M_TEMP);
971 
972 	return (error);
973 }
974