xref: /freebsd/sys/net/route/nhgrp_ctl.c (revision 911f0260390e18cf85f3dbf2c719b593efdc1e3c)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2020 Alexander V. Chernikov
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  *
27  * $FreeBSD$
28  */
29 #include "opt_inet.h"
30 #include "opt_route.h"
31 
32 #include <sys/cdefs.h>
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/lock.h>
36 #include <sys/rmlock.h>
37 #include <sys/malloc.h>
38 #include <sys/mbuf.h>
39 #include <sys/refcount.h>
40 #include <sys/socket.h>
41 #include <sys/sysctl.h>
42 #include <sys/kernel.h>
43 #include <sys/epoch.h>
44 
45 #include <net/if.h>
46 #include <net/if_var.h>
47 #include <net/if_private.h>
48 #include <net/route.h>
49 #include <net/route/route_ctl.h>
50 #include <net/route/route_var.h>
51 #include <net/vnet.h>
52 
53 #include <netinet/in.h>
54 #include <netinet/in_var.h>
55 #include <netinet/in_fib.h>
56 
57 #include <net/route/nhop_utils.h>
58 #include <net/route/nhop.h>
59 #include <net/route/nhop_var.h>
60 #include <net/route/nhgrp_var.h>
61 
62 #define	DEBUG_MOD_NAME	nhgrp_ctl
63 #define	DEBUG_MAX_LEVEL	LOG_DEBUG
64 #include <net/route/route_debug.h>
65 _DECLARE_DEBUG(LOG_INFO);
66 
67 /*
68  * This file contains the supporting functions for creating multipath groups
69  *  and compiling their dataplane parts.
70  */
71 
72 /* MPF_MULTIPATH must be the same as NHF_MULTIPATH for nhop selection to work */
73 _Static_assert(MPF_MULTIPATH == NHF_MULTIPATH,
74     "MPF_MULTIPATH must be the same as NHF_MULTIPATH");
75 /* Offset and size of flags field has to be the same for nhop/nhop groups */
76 CHK_STRUCT_FIELD_GENERIC(struct nhop_object, nh_flags, struct nhgrp_object, nhg_flags);
77 /* Cap multipath to 64, as the larger values would break rib_cmd_info bmasks */
78 CTASSERT(RIB_MAX_MPATH_WIDTH <= 64);
79 
80 static int wn_cmp_idx(const void *a, const void *b);
81 static void sort_weightened_nhops(struct weightened_nhop *wn, int num_nhops);
82 
83 static struct nhgrp_priv *get_nhgrp(struct nh_control *ctl,
84     struct weightened_nhop *wn, int num_nhops, uint32_t uidx, int *perror);
85 static void destroy_nhgrp(struct nhgrp_priv *nhg_priv);
86 static void destroy_nhgrp_epoch(epoch_context_t ctx);
87 static void free_nhgrp_nhops(struct nhgrp_priv *nhg_priv);
88 
89 static int
90 wn_cmp_idx(const void *a, const void *b)
91 {
92 	const struct weightened_nhop *w_a = a;
93 	const struct weightened_nhop *w_b = b;
94 	uint32_t a_idx = w_a->nh->nh_priv->nh_idx;
95 	uint32_t b_idx = w_b->nh->nh_priv->nh_idx;
96 
97 	if (a_idx < b_idx)
98 		return (-1);
99 	else if (a_idx > b_idx)
100 		return (1);
101 	else
102 		return (0);
103 }
104 
105 /*
106  * Perform in-place sorting for array of nexthops in @wn.
107  * Sort by nexthop index ascending.
108  */
109 static void
110 sort_weightened_nhops(struct weightened_nhop *wn, int num_nhops)
111 {
112 
113 	qsort(wn, num_nhops, sizeof(struct weightened_nhop), wn_cmp_idx);
114 }
115 
116 /*
117  * In order to determine the minimum weight difference in the array
118  * of weights, create a sorted array of weights, using spare "storage"
119  * field in the `struct weightened_nhop`.
120  * Assume weights to be (mostly) the same and use insertion sort to
121  * make it sorted.
122  */
123 static void
124 sort_weightened_nhops_weights(struct weightened_nhop *wn, int num_items)
125 {
126 	wn[0].storage = wn[0].weight;
127 	for (int i = 1, j = 0; i < num_items; i++) {
128 		uint32_t weight = wn[i].weight; // read from 'weight' as it's not reordered
129 		/* Move all weights > weight 1 position right */
130 		for (j = i - 1; j >= 0 && wn[j].storage > weight; j--)
131 			wn[j + 1].storage = wn[j].storage;
132 		wn[j + 1].storage = weight;
133 	}
134 }
135 
136 /*
137  * Calculate minimum number of slots required to fit the existing
138  * set of weights in the common use case where weights are "easily"
139  * comparable.
140  * Assumes @wn is sorted by weight ascending and each weight is > 0.
141  * Returns number of slots or 0 if precise calculation failed.
142  *
143  * Some examples:
144  * note: (i, X) pair means (nhop=i, weight=X):
145  * (1, 1) (2, 2) -> 3 slots [1, 2, 2]
146  * (1, 100), (2, 200) -> 3 slots [1, 2, 2]
147  * (1, 100), (2, 200), (3, 400) -> 7 slots [1, 2, 2, 3, 3, 3]
148  */
149 static uint32_t
150 calc_min_mpath_slots_fast(struct weightened_nhop *wn, size_t num_items,
151     uint64_t *ptotal)
152 {
153 	uint32_t i, last, xmin;
154 	uint64_t total = 0;
155 
156 	// Get sorted array of weights in .storage field
157 	sort_weightened_nhops_weights(wn, num_items);
158 
159 	last = 0;
160 	xmin = wn[0].storage;
161 	for (i = 0; i < num_items; i++) {
162 		total += wn[i].storage;
163 		if ((wn[i].storage != last) &&
164 		    ((wn[i].storage - last < xmin) || xmin == 0)) {
165 			xmin = wn[i].storage - last;
166 		}
167 		last = wn[i].storage;
168 	}
169 	*ptotal = total;
170 	/* xmin is the minimum unit of desired capacity */
171 	if ((total % xmin) != 0)
172 		return (0);
173 	for (i = 0; i < num_items; i++) {
174 		if ((wn[i].weight % xmin) != 0)
175 			return (0);
176 	}
177 
178 	return ((uint32_t)(total / xmin));
179 }
180 
181 /*
182  * Calculate minimum number of slots required to fit the existing
183  * set of weights while maintaining weight coefficients.
184  *
185  * Assume @wn is sorted by weight ascending and each weight is > 0.
186  *
187  * Tries to find simple precise solution first and falls back to
188  *  RIB_MAX_MPATH_WIDTH in case of any failure.
189  */
190 static uint32_t
191 calc_min_mpath_slots(struct weightened_nhop *wn, size_t num_items)
192 {
193 	uint32_t v;
194 	uint64_t total;
195 
196 	v = calc_min_mpath_slots_fast(wn, num_items, &total);
197 	if (total == 0)
198 		return (0);
199 	if ((v == 0) || (v > RIB_MAX_MPATH_WIDTH))
200 		v = RIB_MAX_MPATH_WIDTH;
201 
202 	return (v);
203 }
204 
205 /*
206  * Nexthop group data consists of
207  * 1) dataplane part, with nhgrp_object as a header followed by an
208  *   arbitrary number of nexthop pointers.
209  * 2) control plane part, with nhgrp_priv as a header, followed by
210  *   an arbirtrary number of 'struct weightened_nhop' object.
211  *
212  * Given nexthop groups are (mostly) immutable, allocate all data
213  * in one go.
214  *
215  */
216 __noinline static size_t
217 get_nhgrp_alloc_size(uint32_t nhg_size, uint32_t num_nhops)
218 {
219 	size_t sz;
220 
221 	sz = sizeof(struct nhgrp_object);
222 	sz += nhg_size * sizeof(struct nhop_object *);
223 	sz += sizeof(struct nhgrp_priv);
224 	sz += num_nhops * sizeof(struct weightened_nhop);
225 	return (sz);
226 }
227 
228 /*
229  * Compile actual list of nexthops to be used by datapath from
230  *  the nexthop group @dst.
231  *
232  * For example, compiling control plane list of 2 nexthops
233  *  [(200, A), (100, B)] would result in the datapath array
234  *  [A, A, B]
235  */
236 static void
237 compile_nhgrp(struct nhgrp_priv *dst_priv, const struct weightened_nhop *x,
238     uint32_t num_slots)
239 {
240 	struct nhgrp_object *dst;
241 	int i, slot_idx, remaining_slots;
242 	uint64_t remaining_sum, nh_weight, nh_slots;
243 
244 	slot_idx  = 0;
245 	dst = dst_priv->nhg;
246 	/* Calculate sum of all weights */
247 	remaining_sum = 0;
248 	for (i = 0; i < dst_priv->nhg_nh_count; i++)
249 		remaining_sum += x[i].weight;
250 	remaining_slots = num_slots;
251 	FIB_NH_LOG(LOG_DEBUG3, x[0].nh, "sum: %lu, slots: %d",
252 	    remaining_sum, remaining_slots);
253 	for (i = 0; i < dst_priv->nhg_nh_count; i++) {
254 		/* Calculate number of slots for the current nexthop */
255 		if (remaining_sum > 0) {
256 			nh_weight = (uint64_t)x[i].weight;
257 			nh_slots = (nh_weight * remaining_slots / remaining_sum);
258 		} else
259 			nh_slots = 0;
260 
261 		remaining_sum -= x[i].weight;
262 		remaining_slots -= nh_slots;
263 
264 		FIB_NH_LOG(LOG_DEBUG3, x[0].nh,
265 		    " rem_sum: %lu, rem_slots: %d nh_slots: %d, slot_idx: %d",
266 		    remaining_sum, remaining_slots, (int)nh_slots, slot_idx);
267 
268 		KASSERT((slot_idx + nh_slots <= num_slots),
269 		    ("index overflow during nhg compilation"));
270 		while (nh_slots-- > 0)
271 			dst->nhops[slot_idx++] = x[i].nh;
272 	}
273 }
274 
275 /*
276  * Allocates new nexthop group for the list of weightened nexthops.
277  * Assume sorted list.
278  * Does NOT reference any nexthops in the group.
279  * Returns group with refcount=1 or NULL.
280  */
281 static struct nhgrp_priv *
282 alloc_nhgrp(struct weightened_nhop *wn, int num_nhops)
283 {
284 	uint32_t nhgrp_size;
285 	struct nhgrp_object *nhg;
286 	struct nhgrp_priv *nhg_priv;
287 
288 	nhgrp_size = calc_min_mpath_slots(wn, num_nhops);
289 	if (nhgrp_size == 0) {
290 		/* Zero weights, abort */
291 		return (NULL);
292 	}
293 
294 	size_t sz = get_nhgrp_alloc_size(nhgrp_size, num_nhops);
295 	nhg = malloc(sz, M_NHOP, M_NOWAIT | M_ZERO);
296 	if (nhg == NULL) {
297 		FIB_NH_LOG(LOG_INFO, wn[0].nh,
298 		    "unable to allocate group with num_nhops %d (compiled %u)",
299 		    num_nhops, nhgrp_size);
300 		return (NULL);
301 	}
302 
303 	/* Has to be the first to make NHGRP_PRIV() work */
304 	nhg->nhg_size = nhgrp_size;
305 	nhg->nhg_flags = MPF_MULTIPATH;
306 
307 	nhg_priv = NHGRP_PRIV(nhg);
308 	nhg_priv->nhg_nh_count = num_nhops;
309 	refcount_init(&nhg_priv->nhg_refcount, 1);
310 
311 	/* Please see nhgrp_free() comments on the initial value */
312 	refcount_init(&nhg_priv->nhg_linked, 2);
313 
314 	nhg_priv->nhg = nhg;
315 	memcpy(&nhg_priv->nhg_nh_weights[0], wn,
316 	  num_nhops * sizeof(struct weightened_nhop));
317 
318 	FIB_NH_LOG(LOG_DEBUG, wn[0].nh, "num_nhops: %d, compiled_nhop: %u",
319 	    num_nhops, nhgrp_size);
320 
321 	compile_nhgrp(nhg_priv, wn, nhg->nhg_size);
322 
323 	return (nhg_priv);
324 }
325 
326 void
327 nhgrp_ref_object(struct nhgrp_object *nhg)
328 {
329 	struct nhgrp_priv *nhg_priv;
330 	u_int old __diagused;
331 
332 	nhg_priv = NHGRP_PRIV(nhg);
333 	old = refcount_acquire(&nhg_priv->nhg_refcount);
334 	KASSERT(old > 0, ("%s: nhgrp object %p has 0 refs", __func__, nhg));
335 }
336 
337 void
338 nhgrp_free(struct nhgrp_object *nhg)
339 {
340 	struct nhgrp_priv *nhg_priv;
341 	struct nh_control *ctl;
342 	struct epoch_tracker et;
343 
344 	nhg_priv = NHGRP_PRIV(nhg);
345 
346 	if (!refcount_release(&nhg_priv->nhg_refcount))
347 		return;
348 
349 	/*
350 	 * group objects don't have an explicit lock attached to it.
351 	 * As groups are reclaimed based on reference count, it is possible
352 	 * that some groups will persist after vnet destruction callback
353 	 * called. Given that, handle scenario with nhgrp_free_group() being
354 	 * called either after or simultaneously with nhgrp_ctl_unlink_all()
355 	 * by using another reference counter: nhg_linked.
356 	 *
357 	 * There are only 2 places, where nhg_linked can be decreased:
358 	 *  rib destroy (nhgrp_ctl_unlink_all) and this function.
359 	 * nhg_link can never be increased.
360 	 *
361 	 * Hence, use initial value of 2 to make use of
362 	 *  refcount_release_if_not_last().
363 	 *
364 	 * There can be two scenarious when calling this function:
365 	 *
366 	 * 1) nhg_linked value is 2. This means that either
367 	 *  nhgrp_ctl_unlink_all() has not been called OR it is running,
368 	 *  but we are guaranteed that nh_control won't be freed in
369 	 *  this epoch. Hence, nexthop can be safely unlinked.
370 	 *
371 	 * 2) nh_linked value is 1. In that case, nhgrp_ctl_unlink_all()
372 	 *  has been called and nhgrp unlink can be skipped.
373 	 */
374 
375 	NET_EPOCH_ENTER(et);
376 	if (refcount_release_if_not_last(&nhg_priv->nhg_linked)) {
377 		ctl = nhg_priv->nh_control;
378 		if (unlink_nhgrp(ctl, nhg_priv) == NULL) {
379 			/* Do not try to reclaim */
380 			RT_LOG(LOG_INFO, "Failed to unlink nexhop group %p",
381 			    nhg_priv);
382 			NET_EPOCH_EXIT(et);
383 			return;
384 		}
385 		MPASS((nhg_priv->nhg_idx == 0));
386 		MPASS((nhg_priv->nhg_refcount == 0));
387 	}
388 	NET_EPOCH_EXIT(et);
389 
390 	NET_EPOCH_CALL(destroy_nhgrp_epoch, &nhg_priv->nhg_epoch_ctx);
391 }
392 
393 /*
394  * Destroys all local resources belonging to @nhg_priv.
395  */
396 __noinline static void
397 destroy_nhgrp_int(struct nhgrp_priv *nhg_priv)
398 {
399 
400 	free(nhg_priv->nhg, M_NHOP);
401 }
402 
403 __noinline static void
404 destroy_nhgrp(struct nhgrp_priv *nhg_priv)
405 {
406 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
407 		char nhgbuf[NHOP_PRINT_BUFSIZE] __unused;
408 		FIB_NH_LOG(LOG_DEBUG2, nhg_priv->nhg_nh_weights[0].nh,
409 		    "destroying %s", nhgrp_print_buf(nhg_priv->nhg,
410 		    nhgbuf, sizeof(nhgbuf)));
411 	}
412 
413 	free_nhgrp_nhops(nhg_priv);
414 	destroy_nhgrp_int(nhg_priv);
415 }
416 
417 /*
418  * Epoch callback indicating group is safe to destroy
419  */
420 static void
421 destroy_nhgrp_epoch(epoch_context_t ctx)
422 {
423 	struct nhgrp_priv *nhg_priv;
424 
425 	nhg_priv = __containerof(ctx, struct nhgrp_priv, nhg_epoch_ctx);
426 
427 	destroy_nhgrp(nhg_priv);
428 }
429 
430 static bool
431 ref_nhgrp_nhops(struct nhgrp_priv *nhg_priv)
432 {
433 
434 	for (int i = 0; i < nhg_priv->nhg_nh_count; i++) {
435 		if (nhop_try_ref_object(nhg_priv->nhg_nh_weights[i].nh) != 0)
436 			continue;
437 
438 		/*
439 		 * Failed to ref the nexthop, b/c it's deleted.
440 		 * Need to rollback references back.
441 		 */
442 		for (int j = 0; j < i; j++)
443 			nhop_free(nhg_priv->nhg_nh_weights[j].nh);
444 		return (false);
445 	}
446 
447 	return (true);
448 }
449 
450 static void
451 free_nhgrp_nhops(struct nhgrp_priv *nhg_priv)
452 {
453 
454 	for (int i = 0; i < nhg_priv->nhg_nh_count; i++)
455 		nhop_free(nhg_priv->nhg_nh_weights[i].nh);
456 }
457 
458 /*
459  * Allocate nexthop group of size @num_nhops with nexthops specified by
460  * @wn. Nexthops have to be unique and match the fibnum/family of the group.
461  * Returns unlinked nhgrp object on success or NULL and non-zero perror.
462  */
463 struct nhgrp_object *
464 nhgrp_alloc(uint32_t fibnum, int family, struct weightened_nhop *wn, int num_nhops,
465     int *perror)
466 {
467 	struct rib_head *rh = rt_tables_get_rnh(fibnum, family);
468 	struct nhgrp_priv *nhg_priv;
469 	struct nh_control *ctl;
470 
471 	if (rh == NULL) {
472 		*perror = E2BIG;
473 		return (NULL);
474 	}
475 
476 	ctl = rh->nh_control;
477 
478 	if (num_nhops > RIB_MAX_MPATH_WIDTH) {
479 		*perror = E2BIG;
480 		return (NULL);
481 	}
482 
483 	if (ctl->gr_head.hash_size == 0) {
484 		/* First multipath request. Bootstrap mpath datastructures. */
485 		if (nhgrp_ctl_alloc_default(ctl, M_NOWAIT) == 0) {
486 			*perror = ENOMEM;
487 			return (NULL);
488 		}
489 	}
490 
491 	/* Sort nexthops & check there are no duplicates */
492 	sort_weightened_nhops(wn, num_nhops);
493 	uint32_t last_id = 0;
494 	for (int i = 0; i < num_nhops; i++) {
495 		if (wn[i].nh->nh_priv->nh_control != ctl) {
496 			*perror = EINVAL;
497 			return (NULL);
498 		}
499 		if (wn[i].nh->nh_priv->nh_idx == last_id) {
500 			*perror = EEXIST;
501 			return (NULL);
502 		}
503 		last_id = wn[i].nh->nh_priv->nh_idx;
504 	}
505 
506 	if ((nhg_priv = alloc_nhgrp(wn, num_nhops)) == NULL) {
507 		*perror = ENOMEM;
508 		return (NULL);
509 	}
510 	nhg_priv->nh_control = ctl;
511 
512 	*perror = 0;
513 	return (nhg_priv->nhg);
514 }
515 
516 /*
517  * Finds an existing group matching @nhg or links @nhg to the tree.
518  * Returns the referenced group or NULL and non-zero @perror.
519  */
520 struct nhgrp_object *
521 nhgrp_get_nhgrp(struct nhgrp_object *nhg, int *perror)
522 {
523 	struct nhgrp_priv *nhg_priv, *key = NHGRP_PRIV(nhg);
524 	struct nh_control *ctl = key->nh_control;
525 
526 	nhg_priv = find_nhgrp(ctl, key);
527 	if (nhg_priv != NULL) {
528 		/*
529 		 * Free originally-created group. As it hasn't been linked
530 		 *  and the dependent nexhops haven't been referenced, just free
531 		 *  the group.
532 		 */
533 		destroy_nhgrp_int(key);
534 		*perror = 0;
535 		return (nhg_priv->nhg);
536 	} else {
537 		/* No existing group, try to link the new one */
538 		if (!ref_nhgrp_nhops(key)) {
539 			/*
540 			 * Some of the nexthops have been scheduled for deletion.
541 			 * As the group hasn't been linked / no nexhops have been
542 			 *  referenced, call the final destructor immediately.
543 			 */
544 			destroy_nhgrp_int(key);
545 			*perror = EAGAIN;
546 			return (NULL);
547 		}
548 		if (link_nhgrp(ctl, key) == 0) {
549 			/* Unable to allocate index? */
550 			*perror = EAGAIN;
551 			free_nhgrp_nhops(key);
552 			destroy_nhgrp_int(key);
553 			return (NULL);
554 		}
555 		*perror = 0;
556 		return (nhg);
557 	}
558 
559 	/* NOTREACHED */
560 }
561 
562 /*
563  * Creates or looks up an existing nexthop group based on @wn and @num_nhops.
564  *
565  * Returns referenced nhop group or NULL, passing error code in @perror.
566  */
567 struct nhgrp_priv *
568 get_nhgrp(struct nh_control *ctl, struct weightened_nhop *wn, int num_nhops,
569     uint32_t uidx, int *perror)
570 {
571 	struct nhgrp_object *nhg;
572 
573 	nhg = nhgrp_alloc(ctl->ctl_rh->rib_fibnum, ctl->ctl_rh->rib_family,
574 	    wn, num_nhops, perror);
575 	if (nhg == NULL)
576 		return (NULL);
577 	nhgrp_set_uidx(nhg, uidx);
578 	nhg = nhgrp_get_nhgrp(nhg, perror);
579 	if (nhg != NULL)
580 		return (NHGRP_PRIV(nhg));
581 	return (NULL);
582 }
583 
584 
585 /*
586  * Appends one or more nexthops denoted by @wm to the nexthop group @gr_orig.
587  *
588  * Returns referenced nexthop group or NULL. In the latter case, @perror is
589  *  filled with an error code.
590  * Note that function does NOT care if the next nexthops already exists
591  * in the @gr_orig. As a result, they will be added, resulting in the
592  * same nexthop being present multiple times in the new group.
593  */
594 static struct nhgrp_priv *
595 append_nhops(struct nh_control *ctl, const struct nhgrp_object *gr_orig,
596     struct weightened_nhop *wn, int num_nhops, int *perror)
597 {
598 	char storage[64];
599 	struct weightened_nhop *pnhops;
600 	struct nhgrp_priv *nhg_priv;
601 	const struct nhgrp_priv *src_priv;
602 	size_t sz;
603 	int curr_nhops;
604 
605 	src_priv = NHGRP_PRIV_CONST(gr_orig);
606 	curr_nhops = src_priv->nhg_nh_count;
607 
608 	*perror = 0;
609 
610 	sz = (src_priv->nhg_nh_count + num_nhops) * (sizeof(struct weightened_nhop));
611 	/* optimize for <= 4 paths, each path=16 bytes */
612 	if (sz <= sizeof(storage))
613 		pnhops = (struct weightened_nhop *)&storage[0];
614 	else {
615 		pnhops = malloc(sz, M_TEMP, M_NOWAIT);
616 		if (pnhops == NULL) {
617 			*perror = ENOMEM;
618 			return (NULL);
619 		}
620 	}
621 
622 	/* Copy nhops from original group first */
623 	memcpy(pnhops, src_priv->nhg_nh_weights,
624 	  curr_nhops * sizeof(struct weightened_nhop));
625 	memcpy(&pnhops[curr_nhops], wn, num_nhops * sizeof(struct weightened_nhop));
626 	curr_nhops += num_nhops;
627 
628 	nhg_priv = get_nhgrp(ctl, pnhops, curr_nhops, 0, perror);
629 
630 	if (pnhops != (struct weightened_nhop *)&storage[0])
631 		free(pnhops, M_TEMP);
632 
633 	if (nhg_priv == NULL)
634 		return (NULL);
635 
636 	return (nhg_priv);
637 }
638 
639 
640 /*
641  * Creates/finds nexthop group based on @wn and @num_nhops.
642  * Returns 0 on success with referenced group in @rnd, or
643  * errno.
644  *
645  * If the error is EAGAIN, then the operation can be retried.
646  */
647 int
648 nhgrp_get_group(struct rib_head *rh, struct weightened_nhop *wn, int num_nhops,
649     uint32_t uidx, struct nhgrp_object **pnhg)
650 {
651 	struct nh_control *ctl = rh->nh_control;
652 	struct nhgrp_priv *nhg_priv;
653 	int error;
654 
655 	nhg_priv = get_nhgrp(ctl, wn, num_nhops, uidx, &error);
656 	if (nhg_priv != NULL)
657 		*pnhg = nhg_priv->nhg;
658 
659 	return (error);
660 }
661 
662 /*
663  * Creates new nexthop group based on @src group without the nexthops
664  * chosen by @flt_func.
665  * Returns 0 on success, storring the reference nhop group/object in @rnd.
666  */
667 int
668 nhgrp_get_filtered_group(struct rib_head *rh, const struct rtentry *rt,
669     const struct nhgrp_object *src, rib_filter_f_t flt_func, void *flt_data,
670     struct route_nhop_data *rnd)
671 {
672 	char storage[64];
673 	struct nh_control *ctl = rh->nh_control;
674 	struct weightened_nhop *pnhops;
675 	const struct nhgrp_priv *mp_priv, *src_priv;
676 	size_t sz;
677 	int error, i, num_nhops;
678 
679 	src_priv = NHGRP_PRIV_CONST(src);
680 
681 	sz = src_priv->nhg_nh_count * (sizeof(struct weightened_nhop));
682 	/* optimize for <= 4 paths, each path=16 bytes */
683 	if (sz <= sizeof(storage))
684 		pnhops = (struct weightened_nhop *)&storage[0];
685 	else {
686 		if ((pnhops = malloc(sz, M_TEMP, M_NOWAIT)) == NULL)
687 			return (ENOMEM);
688 	}
689 
690 	/* Filter nexthops */
691 	error = 0;
692 	num_nhops = 0;
693 	for (i = 0; i < src_priv->nhg_nh_count; i++) {
694 		if (flt_func(rt, src_priv->nhg_nh_weights[i].nh, flt_data))
695 			continue;
696 		memcpy(&pnhops[num_nhops++], &src_priv->nhg_nh_weights[i],
697 		  sizeof(struct weightened_nhop));
698 	}
699 
700 	if (num_nhops == 0) {
701 		rnd->rnd_nhgrp = NULL;
702 		rnd->rnd_weight = 0;
703 	} else if (num_nhops == 1) {
704 		rnd->rnd_nhop = pnhops[0].nh;
705 		rnd->rnd_weight = pnhops[0].weight;
706 		if (nhop_try_ref_object(rnd->rnd_nhop) == 0)
707 			error = EAGAIN;
708 	} else {
709 		mp_priv = get_nhgrp(ctl, pnhops, num_nhops, 0, &error);
710 		if (mp_priv != NULL)
711 			rnd->rnd_nhgrp = mp_priv->nhg;
712 		rnd->rnd_weight = 0;
713 	}
714 
715 	if (pnhops != (struct weightened_nhop *)&storage[0])
716 		free(pnhops, M_TEMP);
717 
718 	return (error);
719 }
720 
721 /*
722  * Creates new multipath group based on existing group/nhop in @rnd_orig and
723  *  to-be-added nhop @wn_add.
724  * Returns 0 on success and stores result in @rnd_new.
725  */
726 int
727 nhgrp_get_addition_group(struct rib_head *rh, struct route_nhop_data *rnd_orig,
728     struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_new)
729 {
730 	struct nh_control *ctl = rh->nh_control;
731 	struct nhgrp_priv *nhg_priv;
732 	struct weightened_nhop wn[2] = {};
733 	int error;
734 
735 	if (rnd_orig->rnd_nhop == NULL) {
736 		/* No paths to add to, just reference current nhop */
737 		*rnd_new = *rnd_add;
738 		if (nhop_try_ref_object(rnd_new->rnd_nhop) == 0)
739 			return (EAGAIN);
740 		return (0);
741 	}
742 
743 	wn[0].nh = rnd_add->rnd_nhop;
744 	wn[0].weight = rnd_add->rnd_weight;
745 
746 	if (!NH_IS_NHGRP(rnd_orig->rnd_nhop)) {
747 		/* Simple merge of 2 non-multipath nexthops */
748 		wn[1].nh = rnd_orig->rnd_nhop;
749 		wn[1].weight = rnd_orig->rnd_weight;
750 		nhg_priv = get_nhgrp(ctl, wn, 2, 0, &error);
751 	} else {
752 		/* Get new nhop group with @rt->rt_nhop as an additional nhop */
753 		nhg_priv = append_nhops(ctl, rnd_orig->rnd_nhgrp, &wn[0], 1,
754 		    &error);
755 	}
756 
757 	if (nhg_priv == NULL)
758 		return (error);
759 	rnd_new->rnd_nhgrp = nhg_priv->nhg;
760 	rnd_new->rnd_weight = 0;
761 
762 	return (0);
763 }
764 
765 /*
766  * Returns pointer to array of nexthops with weights for
767  * given @nhg. Stores number of items in the array into @pnum_nhops.
768  */
769 const struct weightened_nhop *
770 nhgrp_get_nhops(const struct nhgrp_object *nhg, uint32_t *pnum_nhops)
771 {
772 	const struct nhgrp_priv *nhg_priv;
773 
774 	KASSERT(((nhg->nhg_flags & MPF_MULTIPATH) != 0), ("nhop is not mpath"));
775 
776 	nhg_priv = NHGRP_PRIV_CONST(nhg);
777 	*pnum_nhops = nhg_priv->nhg_nh_count;
778 
779 	return (nhg_priv->nhg_nh_weights);
780 }
781 
782 void
783 nhgrp_set_uidx(struct nhgrp_object *nhg, uint32_t uidx)
784 {
785 	struct nhgrp_priv *nhg_priv;
786 
787 	KASSERT(((nhg->nhg_flags & MPF_MULTIPATH) != 0), ("nhop is not mpath"));
788 
789 	nhg_priv = NHGRP_PRIV(nhg);
790 
791 	nhg_priv->nhg_uidx = uidx;
792 }
793 
794 uint32_t
795 nhgrp_get_uidx(const struct nhgrp_object *nhg)
796 {
797 	const struct nhgrp_priv *nhg_priv;
798 
799 	KASSERT(((nhg->nhg_flags & MPF_MULTIPATH) != 0), ("nhop is not mpath"));
800 
801 	nhg_priv = NHGRP_PRIV_CONST(nhg);
802 	return (nhg_priv->nhg_uidx);
803 }
804 
805 /*
806  * Prints nexhop group @nhg data in the provided @buf.
807  * Example: nhg#33/sz=3:[#1:100,#2:100,#3:100]
808  * Example: nhg#33/sz=5:[#1:100,#2:100,..]
809  */
810 char *
811 nhgrp_print_buf(const struct nhgrp_object *nhg, char *buf, size_t bufsize)
812 {
813 	const struct nhgrp_priv *nhg_priv = NHGRP_PRIV_CONST(nhg);
814 
815 	int off = snprintf(buf, bufsize, "nhg#%u/sz=%u:[", nhg_priv->nhg_idx,
816 	    nhg_priv->nhg_nh_count);
817 
818 	for (int i = 0; i < nhg_priv->nhg_nh_count; i++) {
819 		const struct weightened_nhop *wn = &nhg_priv->nhg_nh_weights[i];
820 		int len = snprintf(&buf[off], bufsize - off, "#%u:%u,",
821 		    wn->nh->nh_priv->nh_idx, wn->weight);
822 		if (len + off + 3 >= bufsize) {
823 			int len = snprintf(&buf[off], bufsize - off, "...");
824 			off += len;
825 			break;
826 		}
827 		off += len;
828 	}
829 	if (off > 0)
830 		off--; // remove last ","
831 	if (off + 1 < bufsize)
832 		snprintf(&buf[off], bufsize - off, "]");
833 	return buf;
834 }
835 
836 __noinline static int
837 dump_nhgrp_entry(struct rib_head *rh, const struct nhgrp_priv *nhg_priv,
838     char *buffer, size_t buffer_size, struct sysctl_req *w)
839 {
840 	struct rt_msghdr *rtm;
841 	struct nhgrp_external *nhge;
842 	struct nhgrp_container *nhgc;
843 	const struct nhgrp_object *nhg;
844 	struct nhgrp_nhop_external *ext;
845 	int error;
846 	size_t sz;
847 
848 	nhg = nhg_priv->nhg;
849 
850 	sz = sizeof(struct rt_msghdr) + sizeof(struct nhgrp_external);
851 	/* controlplane nexthops */
852 	sz += sizeof(struct nhgrp_container);
853 	sz += sizeof(struct nhgrp_nhop_external) * nhg_priv->nhg_nh_count;
854 	/* dataplane nexthops */
855 	sz += sizeof(struct nhgrp_container);
856 	sz += sizeof(struct nhgrp_nhop_external) * nhg->nhg_size;
857 
858 	KASSERT(sz <= buffer_size, ("increase nhgrp buffer size"));
859 
860 	bzero(buffer, sz);
861 
862 	rtm = (struct rt_msghdr *)buffer;
863 	rtm->rtm_msglen = sz;
864 	rtm->rtm_version = RTM_VERSION;
865 	rtm->rtm_type = RTM_GET;
866 
867 	nhge = (struct nhgrp_external *)(rtm + 1);
868 
869 	nhge->nhg_idx = nhg_priv->nhg_idx;
870 	nhge->nhg_refcount = nhg_priv->nhg_refcount;
871 
872 	/* fill in control plane nexthops firs */
873 	nhgc = (struct nhgrp_container *)(nhge + 1);
874 	nhgc->nhgc_type = NHG_C_TYPE_CNHOPS;
875 	nhgc->nhgc_subtype = 0;
876 	nhgc->nhgc_len = sizeof(struct nhgrp_container);
877 	nhgc->nhgc_len += sizeof(struct nhgrp_nhop_external) * nhg_priv->nhg_nh_count;
878 	nhgc->nhgc_count = nhg_priv->nhg_nh_count;
879 
880 	ext = (struct nhgrp_nhop_external *)(nhgc + 1);
881 	for (int i = 0; i < nhg_priv->nhg_nh_count; i++) {
882 		ext[i].nh_idx = nhg_priv->nhg_nh_weights[i].nh->nh_priv->nh_idx;
883 		ext[i].nh_weight = nhg_priv->nhg_nh_weights[i].weight;
884 	}
885 
886 	/* fill in dataplane nexthops */
887 	nhgc = (struct nhgrp_container *)(&ext[nhg_priv->nhg_nh_count]);
888 	nhgc->nhgc_type = NHG_C_TYPE_DNHOPS;
889 	nhgc->nhgc_subtype = 0;
890 	nhgc->nhgc_len = sizeof(struct nhgrp_container);
891 	nhgc->nhgc_len += sizeof(struct nhgrp_nhop_external) * nhg->nhg_size;
892 	nhgc->nhgc_count = nhg->nhg_size;
893 
894 	ext = (struct nhgrp_nhop_external *)(nhgc + 1);
895 	for (int i = 0; i < nhg->nhg_size; i++) {
896 		ext[i].nh_idx = nhg->nhops[i]->nh_priv->nh_idx;
897 		ext[i].nh_weight = 0;
898 	}
899 
900 	error = SYSCTL_OUT(w, buffer, sz);
901 
902 	return (error);
903 }
904 
905 uint32_t
906 nhgrp_get_idx(const struct nhgrp_object *nhg)
907 {
908 	const struct nhgrp_priv *nhg_priv;
909 
910 	nhg_priv = NHGRP_PRIV_CONST(nhg);
911 	return (nhg_priv->nhg_idx);
912 }
913 
914 uint8_t
915 nhgrp_get_origin(const struct nhgrp_object *nhg)
916 {
917 	return (NHGRP_PRIV_CONST(nhg)->nhg_origin);
918 }
919 
920 void
921 nhgrp_set_origin(struct nhgrp_object *nhg, uint8_t origin)
922 {
923 	NHGRP_PRIV(nhg)->nhg_origin = origin;
924 }
925 
926 uint32_t
927 nhgrp_get_count(struct rib_head *rh)
928 {
929 	struct nh_control *ctl;
930 	uint32_t count;
931 
932 	ctl = rh->nh_control;
933 
934 	NHOPS_RLOCK(ctl);
935 	count = ctl->gr_head.items_count;
936 	NHOPS_RUNLOCK(ctl);
937 
938 	return (count);
939 }
940 
941 int
942 nhgrp_dump_sysctl(struct rib_head *rh, struct sysctl_req *w)
943 {
944 	struct nh_control *ctl = rh->nh_control;
945 	struct epoch_tracker et;
946 	struct nhgrp_priv *nhg_priv;
947 	char *buffer;
948 	size_t sz;
949 	int error = 0;
950 
951 	if (ctl->gr_head.items_count == 0)
952 		return (0);
953 
954 	/* Calculate the maximum nhop group size in bytes */
955 	sz = sizeof(struct rt_msghdr) + sizeof(struct nhgrp_external);
956 	sz += 2 * sizeof(struct nhgrp_container);
957 	sz += 2 * sizeof(struct nhgrp_nhop_external) * RIB_MAX_MPATH_WIDTH;
958 	buffer = malloc(sz, M_TEMP, M_NOWAIT);
959 	if (buffer == NULL)
960 		return (ENOMEM);
961 
962 	NET_EPOCH_ENTER(et);
963 	NHOPS_RLOCK(ctl);
964 	CHT_SLIST_FOREACH(&ctl->gr_head, mpath, nhg_priv) {
965 		error = dump_nhgrp_entry(rh, nhg_priv, buffer, sz, w);
966 		if (error != 0)
967 			break;
968 	} CHT_SLIST_FOREACH_END;
969 	NHOPS_RUNLOCK(ctl);
970 	NET_EPOCH_EXIT(et);
971 
972 	free(buffer, M_TEMP);
973 
974 	return (error);
975 }
976