xref: /freebsd/sys/net/route/nhgrp_ctl.c (revision c07d6445eb89d9dd3950361b065b7bd110e3a043)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2020 Alexander V. Chernikov
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  *
27  * $FreeBSD$
28  */
29 #include "opt_inet.h"
30 #include "opt_route.h"
31 
32 #include <sys/cdefs.h>
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/lock.h>
36 #include <sys/rmlock.h>
37 #include <sys/malloc.h>
38 #include <sys/mbuf.h>
39 #include <sys/refcount.h>
40 #include <sys/socket.h>
41 #include <sys/sysctl.h>
42 #include <sys/kernel.h>
43 #include <sys/epoch.h>
44 
45 #include <net/if.h>
46 #include <net/if_var.h>
47 #include <net/route.h>
48 #include <net/route/route_ctl.h>
49 #include <net/route/route_var.h>
50 #include <net/vnet.h>
51 
52 #include <netinet/in.h>
53 #include <netinet/in_var.h>
54 #include <netinet/in_fib.h>
55 
56 #include <net/route/nhop_utils.h>
57 #include <net/route/nhop.h>
58 #include <net/route/nhop_var.h>
59 #include <net/route/nhgrp_var.h>
60 
61 #define	DEBUG_MOD_NAME	nhgrp_ctl
62 #define	DEBUG_MAX_LEVEL	LOG_DEBUG
63 #include <net/route/route_debug.h>
64 _DECLARE_DEBUG(LOG_INFO);
65 
66 /*
67  * This file contains the supporting functions for creating multipath groups
68  *  and compiling their dataplane parts.
69  */
70 
/*
 * Compile-time invariants tying nexthop groups to plain nexthops.
 */

/* MPF_MULTIPATH must be the same as NHF_MULTIPATH for nhop selection to work */
_Static_assert(MPF_MULTIPATH == NHF_MULTIPATH,
    "MPF_MULTIPATH must be the same as NHF_MULTIPATH");
/* Offset and size of flags field has to be the same for nhop/nhop groups */
CHK_STRUCT_FIELD_GENERIC(struct nhop_object, nh_flags, struct nhgrp_object, nhg_flags);
/* Cap multipath to 64, as the larger values would break rib_cmd_info bmasks */
CTASSERT(RIB_MAX_MPATH_WIDTH <= 64);

/*
 * Forward declarations of file-local helpers that are used before their
 * definitions appear below.
 */

/* Comparator and sorter ordering weightened nexthops by nexthop index. */
static int wn_cmp_idx(const void *a, const void *b);
static void sort_weightened_nhops(struct weightened_nhop *wn, int num_nhops);

/*
 * Group lookup/creation and teardown helpers.
 * Note: get_nhgrp() is defined further down without repeating `static`;
 * it keeps the internal linkage established by this declaration.
 */
static struct nhgrp_priv *get_nhgrp(struct nh_control *ctl,
    struct weightened_nhop *wn, int num_nhops, uint32_t uidx, int *perror);
static void destroy_nhgrp(struct nhgrp_priv *nhg_priv);
static void destroy_nhgrp_epoch(epoch_context_t ctx);
static void free_nhgrp_nhops(struct nhgrp_priv *nhg_priv);
87 
88 static int
89 wn_cmp_idx(const void *a, const void *b)
90 {
91 	const struct weightened_nhop *w_a = a;
92 	const struct weightened_nhop *w_b = b;
93 	uint32_t a_idx = w_a->nh->nh_priv->nh_idx;
94 	uint32_t b_idx = w_b->nh->nh_priv->nh_idx;
95 
96 	if (a_idx < b_idx)
97 		return (-1);
98 	else if (a_idx > b_idx)
99 		return (1);
100 	else
101 		return (0);
102 }
103 
104 /*
105  * Perform in-place sorting for array of nexthops in @wn.
106  * Sort by nexthop index ascending.
107  */
108 static void
109 sort_weightened_nhops(struct weightened_nhop *wn, int num_nhops)
110 {
111 
112 	qsort(wn, num_nhops, sizeof(struct weightened_nhop), wn_cmp_idx);
113 }
114 
115 /*
116  * In order to determine the minimum weight difference in the array
117  * of weights, create a sorted array of weights, using spare "storage"
118  * field in the `struct weightened_nhop`.
119  * Assume weights to be (mostly) the same and use insertion sort to
120  * make it sorted.
121  */
122 static void
123 sort_weightened_nhops_weights(struct weightened_nhop *wn, int num_items)
124 {
125 	wn[0].storage = wn[0].weight;
126 	for (int i = 1, j = 0; i < num_items; i++) {
127 		uint32_t weight = wn[i].weight; // read from 'weight' as it's not reordered
128 		/* Move all weights > weight 1 position right */
129 		for (j = i - 1; j >= 0 && wn[j].storage > weight; j--)
130 			wn[j + 1].storage = wn[j].storage;
131 		wn[j + 1].storage = weight;
132 	}
133 }
134 
135 /*
136  * Calculate minimum number of slots required to fit the existing
137  * set of weights in the common use case where weights are "easily"
138  * comparable.
139  * Assumes @wn is sorted by weight ascending and each weight is > 0.
140  * Returns number of slots or 0 if precise calculation failed.
141  *
142  * Some examples:
143  * note: (i, X) pair means (nhop=i, weight=X):
144  * (1, 1) (2, 2) -> 3 slots [1, 2, 2]
145  * (1, 100), (2, 200) -> 3 slots [1, 2, 2]
146  * (1, 100), (2, 200), (3, 400) -> 7 slots [1, 2, 2, 3, 3, 3]
147  */
148 static uint32_t
149 calc_min_mpath_slots_fast(struct weightened_nhop *wn, size_t num_items,
150     uint64_t *ptotal)
151 {
152 	uint32_t i, last, xmin;
153 	uint64_t total = 0;
154 
155 	// Get sorted array of weights in .storage field
156 	sort_weightened_nhops_weights(wn, num_items);
157 
158 	last = 0;
159 	xmin = wn[0].storage;
160 	for (i = 0; i < num_items; i++) {
161 		total += wn[i].storage;
162 		if ((wn[i].storage != last) &&
163 		    ((wn[i].storage - last < xmin) || xmin == 0)) {
164 			xmin = wn[i].storage - last;
165 		}
166 		last = wn[i].storage;
167 	}
168 	*ptotal = total;
169 	/* xmin is the minimum unit of desired capacity */
170 	if ((total % xmin) != 0)
171 		return (0);
172 	for (i = 0; i < num_items; i++) {
173 		if ((wn[i].weight % xmin) != 0)
174 			return (0);
175 	}
176 
177 	return ((uint32_t)(total / xmin));
178 }
179 
180 /*
181  * Calculate minimum number of slots required to fit the existing
182  * set of weights while maintaining weight coefficients.
183  *
184  * Assume @wn is sorted by weight ascending and each weight is > 0.
185  *
186  * Tries to find simple precise solution first and falls back to
187  *  RIB_MAX_MPATH_WIDTH in case of any failure.
188  */
189 static uint32_t
190 calc_min_mpath_slots(struct weightened_nhop *wn, size_t num_items)
191 {
192 	uint32_t v;
193 	uint64_t total;
194 
195 	v = calc_min_mpath_slots_fast(wn, num_items, &total);
196 	if (total == 0)
197 		return (0);
198 	if ((v == 0) || (v > RIB_MAX_MPATH_WIDTH))
199 		v = RIB_MAX_MPATH_WIDTH;
200 
201 	return (v);
202 }
203 
204 /*
205  * Nexthop group data consists of
206  * 1) dataplane part, with nhgrp_object as a header followed by an
207  *   arbitrary number of nexthop pointers.
208  * 2) control plane part, with nhgrp_priv as a header, followed by
209  *   an arbirtrary number of 'struct weightened_nhop' object.
210  *
211  * Given nexthop groups are (mostly) immutable, allocate all data
212  * in one go.
213  *
214  */
215 __noinline static size_t
216 get_nhgrp_alloc_size(uint32_t nhg_size, uint32_t num_nhops)
217 {
218 	size_t sz;
219 
220 	sz = sizeof(struct nhgrp_object);
221 	sz += nhg_size * sizeof(struct nhop_object *);
222 	sz += sizeof(struct nhgrp_priv);
223 	sz += num_nhops * sizeof(struct weightened_nhop);
224 	return (sz);
225 }
226 
227 /*
228  * Compile actual list of nexthops to be used by datapath from
229  *  the nexthop group @dst.
230  *
231  * For example, compiling control plane list of 2 nexthops
232  *  [(200, A), (100, B)] would result in the datapath array
233  *  [A, A, B]
234  */
235 static void
236 compile_nhgrp(struct nhgrp_priv *dst_priv, const struct weightened_nhop *x,
237     uint32_t num_slots)
238 {
239 	struct nhgrp_object *dst;
240 	int i, slot_idx, remaining_slots;
241 	uint64_t remaining_sum, nh_weight, nh_slots;
242 
243 	slot_idx  = 0;
244 	dst = dst_priv->nhg;
245 	/* Calculate sum of all weights */
246 	remaining_sum = 0;
247 	for (i = 0; i < dst_priv->nhg_nh_count; i++)
248 		remaining_sum += x[i].weight;
249 	remaining_slots = num_slots;
250 	FIB_NH_LOG(LOG_DEBUG3, x[0].nh, "sum: %lu, slots: %d",
251 	    remaining_sum, remaining_slots);
252 	for (i = 0; i < dst_priv->nhg_nh_count; i++) {
253 		/* Calculate number of slots for the current nexthop */
254 		if (remaining_sum > 0) {
255 			nh_weight = (uint64_t)x[i].weight;
256 			nh_slots = (nh_weight * remaining_slots / remaining_sum);
257 		} else
258 			nh_slots = 0;
259 
260 		remaining_sum -= x[i].weight;
261 		remaining_slots -= nh_slots;
262 
263 		FIB_NH_LOG(LOG_DEBUG3, x[0].nh,
264 		    " rem_sum: %lu, rem_slots: %d nh_slots: %d, slot_idx: %d",
265 		    remaining_sum, remaining_slots, (int)nh_slots, slot_idx);
266 
267 		KASSERT((slot_idx + nh_slots <= num_slots),
268 		    ("index overflow during nhg compilation"));
269 		while (nh_slots-- > 0)
270 			dst->nhops[slot_idx++] = x[i].nh;
271 	}
272 }
273 
274 /*
275  * Allocates new nexthop group for the list of weightened nexthops.
276  * Assume sorted list.
277  * Does NOT reference any nexthops in the group.
278  * Returns group with refcount=1 or NULL.
279  */
280 static struct nhgrp_priv *
281 alloc_nhgrp(struct weightened_nhop *wn, int num_nhops)
282 {
283 	uint32_t nhgrp_size;
284 	struct nhgrp_object *nhg;
285 	struct nhgrp_priv *nhg_priv;
286 
287 	nhgrp_size = calc_min_mpath_slots(wn, num_nhops);
288 	if (nhgrp_size == 0) {
289 		/* Zero weights, abort */
290 		return (NULL);
291 	}
292 
293 	size_t sz = get_nhgrp_alloc_size(nhgrp_size, num_nhops);
294 	nhg = malloc(sz, M_NHOP, M_NOWAIT | M_ZERO);
295 	if (nhg == NULL) {
296 		FIB_NH_LOG(LOG_INFO, wn[0].nh,
297 		    "unable to allocate group with num_nhops %d (compiled %u)",
298 		    num_nhops, nhgrp_size);
299 		return (NULL);
300 	}
301 
302 	/* Has to be the first to make NHGRP_PRIV() work */
303 	nhg->nhg_size = nhgrp_size;
304 	nhg->nhg_flags = MPF_MULTIPATH;
305 
306 	nhg_priv = NHGRP_PRIV(nhg);
307 	nhg_priv->nhg_nh_count = num_nhops;
308 	refcount_init(&nhg_priv->nhg_refcount, 1);
309 
310 	/* Please see nhgrp_free() comments on the initial value */
311 	refcount_init(&nhg_priv->nhg_linked, 2);
312 
313 	nhg_priv->nhg = nhg;
314 	memcpy(&nhg_priv->nhg_nh_weights[0], wn,
315 	  num_nhops * sizeof(struct weightened_nhop));
316 
317 	FIB_NH_LOG(LOG_DEBUG, wn[0].nh, "num_nhops: %d, compiled_nhop: %u",
318 	    num_nhops, nhgrp_size);
319 
320 	compile_nhgrp(nhg_priv, wn, nhg->nhg_size);
321 
322 	return (nhg_priv);
323 }
324 
325 void
326 nhgrp_ref_object(struct nhgrp_object *nhg)
327 {
328 	struct nhgrp_priv *nhg_priv;
329 	u_int old __diagused;
330 
331 	nhg_priv = NHGRP_PRIV(nhg);
332 	old = refcount_acquire(&nhg_priv->nhg_refcount);
333 	KASSERT(old > 0, ("%s: nhgrp object %p has 0 refs", __func__, nhg));
334 }
335 
336 void
337 nhgrp_free(struct nhgrp_object *nhg)
338 {
339 	struct nhgrp_priv *nhg_priv;
340 	struct nh_control *ctl;
341 	struct epoch_tracker et;
342 
343 	nhg_priv = NHGRP_PRIV(nhg);
344 
345 	if (!refcount_release(&nhg_priv->nhg_refcount))
346 		return;
347 
348 	/*
349 	 * group objects don't have an explicit lock attached to it.
350 	 * As groups are reclaimed based on reference count, it is possible
351 	 * that some groups will persist after vnet destruction callback
352 	 * called. Given that, handle scenario with nhgrp_free_group() being
353 	 * called either after or simultaneously with nhgrp_ctl_unlink_all()
354 	 * by using another reference counter: nhg_linked.
355 	 *
356 	 * There are only 2 places, where nhg_linked can be decreased:
357 	 *  rib destroy (nhgrp_ctl_unlink_all) and this function.
358 	 * nhg_link can never be increased.
359 	 *
360 	 * Hence, use initial value of 2 to make use of
361 	 *  refcount_release_if_not_last().
362 	 *
363 	 * There can be two scenarious when calling this function:
364 	 *
365 	 * 1) nhg_linked value is 2. This means that either
366 	 *  nhgrp_ctl_unlink_all() has not been called OR it is running,
367 	 *  but we are guaranteed that nh_control won't be freed in
368 	 *  this epoch. Hence, nexthop can be safely unlinked.
369 	 *
370 	 * 2) nh_linked value is 1. In that case, nhgrp_ctl_unlink_all()
371 	 *  has been called and nhgrp unlink can be skipped.
372 	 */
373 
374 	NET_EPOCH_ENTER(et);
375 	if (refcount_release_if_not_last(&nhg_priv->nhg_linked)) {
376 		ctl = nhg_priv->nh_control;
377 		if (unlink_nhgrp(ctl, nhg_priv) == NULL) {
378 			/* Do not try to reclaim */
379 			RT_LOG(LOG_INFO, "Failed to unlink nexhop group %p",
380 			    nhg_priv);
381 			NET_EPOCH_EXIT(et);
382 			return;
383 		}
384 	}
385 	NET_EPOCH_EXIT(et);
386 
387 	KASSERT((nhg_priv->nhg_idx == 0), ("gr_idx != 0"));
388 	NET_EPOCH_CALL(destroy_nhgrp_epoch, &nhg_priv->nhg_epoch_ctx);
389 }
390 
391 /*
392  * Destroys all local resources belonging to @nhg_priv.
393  */
394 __noinline static void
395 destroy_nhgrp_int(struct nhgrp_priv *nhg_priv)
396 {
397 
398 	free(nhg_priv->nhg, M_NHOP);
399 }
400 
401 __noinline static void
402 destroy_nhgrp(struct nhgrp_priv *nhg_priv)
403 {
404 
405 	KASSERT((nhg_priv->nhg_refcount == 0), ("nhg_refcount != 0"));
406 	KASSERT((nhg_priv->nhg_idx == 0), ("gr_idx != 0"));
407 
408 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
409 		char nhgbuf[NHOP_PRINT_BUFSIZE] __unused;
410 		FIB_NH_LOG(LOG_DEBUG2, nhg_priv->nhg_nh_weights[0].nh,
411 		    "destroying %s", nhgrp_print_buf(nhg_priv->nhg,
412 		    nhgbuf, sizeof(nhgbuf)));
413 	}
414 
415 	free_nhgrp_nhops(nhg_priv);
416 	destroy_nhgrp_int(nhg_priv);
417 }
418 
419 /*
420  * Epoch callback indicating group is safe to destroy
421  */
422 static void
423 destroy_nhgrp_epoch(epoch_context_t ctx)
424 {
425 	struct nhgrp_priv *nhg_priv;
426 
427 	nhg_priv = __containerof(ctx, struct nhgrp_priv, nhg_epoch_ctx);
428 
429 	destroy_nhgrp(nhg_priv);
430 }
431 
432 static bool
433 ref_nhgrp_nhops(struct nhgrp_priv *nhg_priv)
434 {
435 
436 	for (int i = 0; i < nhg_priv->nhg_nh_count; i++) {
437 		if (nhop_try_ref_object(nhg_priv->nhg_nh_weights[i].nh) != 0)
438 			continue;
439 
440 		/*
441 		 * Failed to ref the nexthop, b/c it's deleted.
442 		 * Need to rollback references back.
443 		 */
444 		for (int j = 0; j < i; j++)
445 			nhop_free(nhg_priv->nhg_nh_weights[j].nh);
446 		return (false);
447 	}
448 
449 	return (true);
450 }
451 
452 static void
453 free_nhgrp_nhops(struct nhgrp_priv *nhg_priv)
454 {
455 
456 	for (int i = 0; i < nhg_priv->nhg_nh_count; i++)
457 		nhop_free(nhg_priv->nhg_nh_weights[i].nh);
458 }
459 
460 /*
461  * Allocate nexthop group of size @num_nhops with nexthops specified by
462  * @wn. Nexthops have to be unique and match the fibnum/family of the group.
463  * Returns unlinked nhgrp object on success or NULL and non-zero perror.
464  */
465 struct nhgrp_object *
466 nhgrp_alloc(uint32_t fibnum, int family, struct weightened_nhop *wn, int num_nhops,
467     int *perror)
468 {
469 	struct rib_head *rh = rt_tables_get_rnh(fibnum, family);
470 	struct nhgrp_priv *nhg_priv;
471 	struct nh_control *ctl;
472 
473 	if (rh == NULL) {
474 		*perror = E2BIG;
475 		return (NULL);
476 	}
477 
478 	ctl = rh->nh_control;
479 
480 	if (num_nhops > RIB_MAX_MPATH_WIDTH) {
481 		*perror = E2BIG;
482 		return (NULL);
483 	}
484 
485 	if (ctl->gr_head.hash_size == 0) {
486 		/* First multipath request. Bootstrap mpath datastructures. */
487 		if (nhgrp_ctl_alloc_default(ctl, M_NOWAIT) == 0) {
488 			*perror = ENOMEM;
489 			return (NULL);
490 		}
491 	}
492 
493 	/* Sort nexthops & check there are no duplicates */
494 	sort_weightened_nhops(wn, num_nhops);
495 	uint32_t last_id = 0;
496 	for (int i = 0; i < num_nhops; i++) {
497 		if (wn[i].nh->nh_priv->nh_control != ctl) {
498 			*perror = EINVAL;
499 			return (NULL);
500 		}
501 		if (wn[i].nh->nh_priv->nh_idx == last_id) {
502 			*perror = EEXIST;
503 			return (NULL);
504 		}
505 		last_id = wn[i].nh->nh_priv->nh_idx;
506 	}
507 
508 	if ((nhg_priv = alloc_nhgrp(wn, num_nhops)) == NULL) {
509 		*perror = ENOMEM;
510 		return (NULL);
511 	}
512 	nhg_priv->nh_control = ctl;
513 
514 	*perror = 0;
515 	return (nhg_priv->nhg);
516 }
517 
518 /*
519  * Finds an existing group matching @nhg or links @nhg to the tree.
520  * Returns the referenced group or NULL and non-zero @perror.
521  */
522 struct nhgrp_object *
523 nhgrp_get_nhgrp(struct nhgrp_object *nhg, int *perror)
524 {
525 	struct nhgrp_priv *nhg_priv, *key = NHGRP_PRIV(nhg);
526 	struct nh_control *ctl = key->nh_control;
527 
528 	nhg_priv = find_nhgrp(ctl, key);
529 	if (nhg_priv != NULL) {
530 		/*
531 		 * Free originally-created group. As it hasn't been linked
532 		 *  and the dependent nexhops haven't been referenced, just free
533 		 *  the group.
534 		 */
535 		destroy_nhgrp_int(key);
536 		*perror = 0;
537 		return (nhg_priv->nhg);
538 	} else {
539 		/* No existing group, try to link the new one */
540 		if (!ref_nhgrp_nhops(key)) {
541 			/*
542 			 * Some of the nexthops have been scheduled for deletion.
543 			 * As the group hasn't been linked / no nexhops have been
544 			 *  referenced, call the final destructor immediately.
545 			 */
546 			destroy_nhgrp_int(key);
547 			*perror = EAGAIN;
548 			return (NULL);
549 		}
550 		if (link_nhgrp(ctl, key) == 0) {
551 			/* Unable to allocate index? */
552 			*perror = EAGAIN;
553 			free_nhgrp_nhops(key);
554 			destroy_nhgrp_int(key);
555 			return (NULL);
556 		}
557 		*perror = 0;
558 		return (nhg);
559 	}
560 
561 	/* NOTREACHED */
562 }
563 
564 /*
565  * Creates or looks up an existing nexthop group based on @wn and @num_nhops.
566  *
567  * Returns referenced nhop group or NULL, passing error code in @perror.
568  */
569 struct nhgrp_priv *
570 get_nhgrp(struct nh_control *ctl, struct weightened_nhop *wn, int num_nhops,
571     uint32_t uidx, int *perror)
572 {
573 	struct nhgrp_object *nhg;
574 
575 	nhg = nhgrp_alloc(ctl->ctl_rh->rib_fibnum, ctl->ctl_rh->rib_family,
576 	    wn, num_nhops, perror);
577 	if (nhg == NULL)
578 		return (NULL);
579 	nhgrp_set_uidx(nhg, uidx);
580 	nhg = nhgrp_get_nhgrp(nhg, perror);
581 	if (nhg != NULL)
582 		return (NHGRP_PRIV(nhg));
583 	return (NULL);
584 }
585 
586 
587 /*
588  * Appends one or more nexthops denoted by @wm to the nexthop group @gr_orig.
589  *
590  * Returns referenced nexthop group or NULL. In the latter case, @perror is
591  *  filled with an error code.
592  * Note that function does NOT care if the next nexthops already exists
593  * in the @gr_orig. As a result, they will be added, resulting in the
594  * same nexthop being present multiple times in the new group.
595  */
596 static struct nhgrp_priv *
597 append_nhops(struct nh_control *ctl, const struct nhgrp_object *gr_orig,
598     struct weightened_nhop *wn, int num_nhops, int *perror)
599 {
600 	char storage[64];
601 	struct weightened_nhop *pnhops;
602 	struct nhgrp_priv *nhg_priv;
603 	const struct nhgrp_priv *src_priv;
604 	size_t sz;
605 	int curr_nhops;
606 
607 	src_priv = NHGRP_PRIV_CONST(gr_orig);
608 	curr_nhops = src_priv->nhg_nh_count;
609 
610 	*perror = 0;
611 
612 	sz = (src_priv->nhg_nh_count + num_nhops) * (sizeof(struct weightened_nhop));
613 	/* optimize for <= 4 paths, each path=16 bytes */
614 	if (sz <= sizeof(storage))
615 		pnhops = (struct weightened_nhop *)&storage[0];
616 	else {
617 		pnhops = malloc(sz, M_TEMP, M_NOWAIT);
618 		if (pnhops == NULL) {
619 			*perror = ENOMEM;
620 			return (NULL);
621 		}
622 	}
623 
624 	/* Copy nhops from original group first */
625 	memcpy(pnhops, src_priv->nhg_nh_weights,
626 	  curr_nhops * sizeof(struct weightened_nhop));
627 	memcpy(&pnhops[curr_nhops], wn, num_nhops * sizeof(struct weightened_nhop));
628 	curr_nhops += num_nhops;
629 
630 	nhg_priv = get_nhgrp(ctl, pnhops, curr_nhops, 0, perror);
631 
632 	if (pnhops != (struct weightened_nhop *)&storage[0])
633 		free(pnhops, M_TEMP);
634 
635 	if (nhg_priv == NULL)
636 		return (NULL);
637 
638 	return (nhg_priv);
639 }
640 
641 
642 /*
643  * Creates/finds nexthop group based on @wn and @num_nhops.
644  * Returns 0 on success with referenced group in @rnd, or
645  * errno.
646  *
647  * If the error is EAGAIN, then the operation can be retried.
648  */
649 int
650 nhgrp_get_group(struct rib_head *rh, struct weightened_nhop *wn, int num_nhops,
651     uint32_t uidx, struct nhgrp_object **pnhg)
652 {
653 	struct nh_control *ctl = rh->nh_control;
654 	struct nhgrp_priv *nhg_priv;
655 	int error;
656 
657 	nhg_priv = get_nhgrp(ctl, wn, num_nhops, uidx, &error);
658 	if (nhg_priv != NULL)
659 		*pnhg = nhg_priv->nhg;
660 
661 	return (error);
662 }
663 
664 /*
665  * Creates new nexthop group based on @src group without the nexthops
666  * chosen by @flt_func.
667  * Returns 0 on success, storring the reference nhop group/object in @rnd.
668  */
669 int
670 nhgrp_get_filtered_group(struct rib_head *rh, const struct rtentry *rt,
671     const struct nhgrp_object *src, rib_filter_f_t flt_func, void *flt_data,
672     struct route_nhop_data *rnd)
673 {
674 	char storage[64];
675 	struct nh_control *ctl = rh->nh_control;
676 	struct weightened_nhop *pnhops;
677 	const struct nhgrp_priv *mp_priv, *src_priv;
678 	size_t sz;
679 	int error, i, num_nhops;
680 
681 	src_priv = NHGRP_PRIV_CONST(src);
682 
683 	sz = src_priv->nhg_nh_count * (sizeof(struct weightened_nhop));
684 	/* optimize for <= 4 paths, each path=16 bytes */
685 	if (sz <= sizeof(storage))
686 		pnhops = (struct weightened_nhop *)&storage[0];
687 	else {
688 		if ((pnhops = malloc(sz, M_TEMP, M_NOWAIT)) == NULL)
689 			return (ENOMEM);
690 	}
691 
692 	/* Filter nexthops */
693 	error = 0;
694 	num_nhops = 0;
695 	for (i = 0; i < src_priv->nhg_nh_count; i++) {
696 		if (flt_func(rt, src_priv->nhg_nh_weights[i].nh, flt_data))
697 			continue;
698 		memcpy(&pnhops[num_nhops++], &src_priv->nhg_nh_weights[i],
699 		  sizeof(struct weightened_nhop));
700 	}
701 
702 	if (num_nhops == 0) {
703 		rnd->rnd_nhgrp = NULL;
704 		rnd->rnd_weight = 0;
705 	} else if (num_nhops == 1) {
706 		rnd->rnd_nhop = pnhops[0].nh;
707 		rnd->rnd_weight = pnhops[0].weight;
708 		if (nhop_try_ref_object(rnd->rnd_nhop) == 0)
709 			error = EAGAIN;
710 	} else {
711 		mp_priv = get_nhgrp(ctl, pnhops, num_nhops, 0, &error);
712 		if (mp_priv != NULL)
713 			rnd->rnd_nhgrp = mp_priv->nhg;
714 		rnd->rnd_weight = 0;
715 	}
716 
717 	if (pnhops != (struct weightened_nhop *)&storage[0])
718 		free(pnhops, M_TEMP);
719 
720 	return (error);
721 }
722 
723 /*
724  * Creates new multipath group based on existing group/nhop in @rnd_orig and
725  *  to-be-added nhop @wn_add.
726  * Returns 0 on success and stores result in @rnd_new.
727  */
728 int
729 nhgrp_get_addition_group(struct rib_head *rh, struct route_nhop_data *rnd_orig,
730     struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_new)
731 {
732 	struct nh_control *ctl = rh->nh_control;
733 	struct nhgrp_priv *nhg_priv;
734 	struct weightened_nhop wn[2] = {};
735 	int error;
736 
737 	if (rnd_orig->rnd_nhop == NULL) {
738 		/* No paths to add to, just reference current nhop */
739 		*rnd_new = *rnd_add;
740 		if (nhop_try_ref_object(rnd_new->rnd_nhop) == 0)
741 			return (EAGAIN);
742 		return (0);
743 	}
744 
745 	wn[0].nh = rnd_add->rnd_nhop;
746 	wn[0].weight = rnd_add->rnd_weight;
747 
748 	if (!NH_IS_NHGRP(rnd_orig->rnd_nhop)) {
749 		/* Simple merge of 2 non-multipath nexthops */
750 		wn[1].nh = rnd_orig->rnd_nhop;
751 		wn[1].weight = rnd_orig->rnd_weight;
752 		nhg_priv = get_nhgrp(ctl, wn, 2, 0, &error);
753 	} else {
754 		/* Get new nhop group with @rt->rt_nhop as an additional nhop */
755 		nhg_priv = append_nhops(ctl, rnd_orig->rnd_nhgrp, &wn[0], 1,
756 		    &error);
757 	}
758 
759 	if (nhg_priv == NULL)
760 		return (error);
761 	rnd_new->rnd_nhgrp = nhg_priv->nhg;
762 	rnd_new->rnd_weight = 0;
763 
764 	return (0);
765 }
766 
767 /*
768  * Returns pointer to array of nexthops with weights for
769  * given @nhg. Stores number of items in the array into @pnum_nhops.
770  */
771 const struct weightened_nhop *
772 nhgrp_get_nhops(const struct nhgrp_object *nhg, uint32_t *pnum_nhops)
773 {
774 	const struct nhgrp_priv *nhg_priv;
775 
776 	KASSERT(((nhg->nhg_flags & MPF_MULTIPATH) != 0), ("nhop is not mpath"));
777 
778 	nhg_priv = NHGRP_PRIV_CONST(nhg);
779 	*pnum_nhops = nhg_priv->nhg_nh_count;
780 
781 	return (nhg_priv->nhg_nh_weights);
782 }
783 
784 void
785 nhgrp_set_uidx(struct nhgrp_object *nhg, uint32_t uidx)
786 {
787 	struct nhgrp_priv *nhg_priv;
788 
789 	KASSERT(((nhg->nhg_flags & MPF_MULTIPATH) != 0), ("nhop is not mpath"));
790 
791 	nhg_priv = NHGRP_PRIV(nhg);
792 
793 	nhg_priv->nhg_uidx = uidx;
794 }
795 
796 uint32_t
797 nhgrp_get_uidx(const struct nhgrp_object *nhg)
798 {
799 	const struct nhgrp_priv *nhg_priv;
800 
801 	KASSERT(((nhg->nhg_flags & MPF_MULTIPATH) != 0), ("nhop is not mpath"));
802 
803 	nhg_priv = NHGRP_PRIV_CONST(nhg);
804 	return (nhg_priv->nhg_uidx);
805 }
806 
807 /*
808  * Prints nexhop group @nhg data in the provided @buf.
809  * Example: nhg#33/sz=3:[#1:100,#2:100,#3:100]
810  * Example: nhg#33/sz=5:[#1:100,#2:100,..]
811  */
812 char *
813 nhgrp_print_buf(const struct nhgrp_object *nhg, char *buf, size_t bufsize)
814 {
815 	const struct nhgrp_priv *nhg_priv = NHGRP_PRIV_CONST(nhg);
816 
817 	int off = snprintf(buf, bufsize, "nhg#%u/sz=%u:[", nhg_priv->nhg_idx,
818 	    nhg_priv->nhg_nh_count);
819 
820 	for (int i = 0; i < nhg_priv->nhg_nh_count; i++) {
821 		const struct weightened_nhop *wn = &nhg_priv->nhg_nh_weights[i];
822 		int len = snprintf(&buf[off], bufsize - off, "#%u:%u,",
823 		    wn->nh->nh_priv->nh_idx, wn->weight);
824 		if (len + off + 3 >= bufsize) {
825 			int len = snprintf(&buf[off], bufsize - off, "...");
826 			off += len;
827 			break;
828 		}
829 		off += len;
830 	}
831 	if (off > 0)
832 		off--; // remove last ","
833 	if (off + 1 < bufsize)
834 		snprintf(&buf[off], bufsize - off, "]");
835 	return buf;
836 }
837 
838 __noinline static int
839 dump_nhgrp_entry(struct rib_head *rh, const struct nhgrp_priv *nhg_priv,
840     char *buffer, size_t buffer_size, struct sysctl_req *w)
841 {
842 	struct rt_msghdr *rtm;
843 	struct nhgrp_external *nhge;
844 	struct nhgrp_container *nhgc;
845 	const struct nhgrp_object *nhg;
846 	struct nhgrp_nhop_external *ext;
847 	int error;
848 	size_t sz;
849 
850 	nhg = nhg_priv->nhg;
851 
852 	sz = sizeof(struct rt_msghdr) + sizeof(struct nhgrp_external);
853 	/* controlplane nexthops */
854 	sz += sizeof(struct nhgrp_container);
855 	sz += sizeof(struct nhgrp_nhop_external) * nhg_priv->nhg_nh_count;
856 	/* dataplane nexthops */
857 	sz += sizeof(struct nhgrp_container);
858 	sz += sizeof(struct nhgrp_nhop_external) * nhg->nhg_size;
859 
860 	KASSERT(sz <= buffer_size, ("increase nhgrp buffer size"));
861 
862 	bzero(buffer, sz);
863 
864 	rtm = (struct rt_msghdr *)buffer;
865 	rtm->rtm_msglen = sz;
866 	rtm->rtm_version = RTM_VERSION;
867 	rtm->rtm_type = RTM_GET;
868 
869 	nhge = (struct nhgrp_external *)(rtm + 1);
870 
871 	nhge->nhg_idx = nhg_priv->nhg_idx;
872 	nhge->nhg_refcount = nhg_priv->nhg_refcount;
873 
874 	/* fill in control plane nexthops firs */
875 	nhgc = (struct nhgrp_container *)(nhge + 1);
876 	nhgc->nhgc_type = NHG_C_TYPE_CNHOPS;
877 	nhgc->nhgc_subtype = 0;
878 	nhgc->nhgc_len = sizeof(struct nhgrp_container);
879 	nhgc->nhgc_len += sizeof(struct nhgrp_nhop_external) * nhg_priv->nhg_nh_count;
880 	nhgc->nhgc_count = nhg_priv->nhg_nh_count;
881 
882 	ext = (struct nhgrp_nhop_external *)(nhgc + 1);
883 	for (int i = 0; i < nhg_priv->nhg_nh_count; i++) {
884 		ext[i].nh_idx = nhg_priv->nhg_nh_weights[i].nh->nh_priv->nh_idx;
885 		ext[i].nh_weight = nhg_priv->nhg_nh_weights[i].weight;
886 	}
887 
888 	/* fill in dataplane nexthops */
889 	nhgc = (struct nhgrp_container *)(&ext[nhg_priv->nhg_nh_count]);
890 	nhgc->nhgc_type = NHG_C_TYPE_DNHOPS;
891 	nhgc->nhgc_subtype = 0;
892 	nhgc->nhgc_len = sizeof(struct nhgrp_container);
893 	nhgc->nhgc_len += sizeof(struct nhgrp_nhop_external) * nhg->nhg_size;
894 	nhgc->nhgc_count = nhg->nhg_size;
895 
896 	ext = (struct nhgrp_nhop_external *)(nhgc + 1);
897 	for (int i = 0; i < nhg->nhg_size; i++) {
898 		ext[i].nh_idx = nhg->nhops[i]->nh_priv->nh_idx;
899 		ext[i].nh_weight = 0;
900 	}
901 
902 	error = SYSCTL_OUT(w, buffer, sz);
903 
904 	return (error);
905 }
906 
907 uint32_t
908 nhgrp_get_idx(const struct nhgrp_object *nhg)
909 {
910 	const struct nhgrp_priv *nhg_priv;
911 
912 	nhg_priv = NHGRP_PRIV_CONST(nhg);
913 	return (nhg_priv->nhg_idx);
914 }
915 
916 uint8_t
917 nhgrp_get_origin(const struct nhgrp_object *nhg)
918 {
919 	return (NHGRP_PRIV_CONST(nhg)->nhg_origin);
920 }
921 
922 void
923 nhgrp_set_origin(struct nhgrp_object *nhg, uint8_t origin)
924 {
925 	NHGRP_PRIV(nhg)->nhg_origin = origin;
926 }
927 
928 uint32_t
929 nhgrp_get_count(struct rib_head *rh)
930 {
931 	struct nh_control *ctl;
932 	uint32_t count;
933 
934 	ctl = rh->nh_control;
935 
936 	NHOPS_RLOCK(ctl);
937 	count = ctl->gr_head.items_count;
938 	NHOPS_RUNLOCK(ctl);
939 
940 	return (count);
941 }
942 
943 int
944 nhgrp_dump_sysctl(struct rib_head *rh, struct sysctl_req *w)
945 {
946 	struct nh_control *ctl = rh->nh_control;
947 	struct epoch_tracker et;
948 	struct nhgrp_priv *nhg_priv;
949 	char *buffer;
950 	size_t sz;
951 	int error = 0;
952 
953 	if (ctl->gr_head.items_count == 0)
954 		return (0);
955 
956 	/* Calculate the maximum nhop group size in bytes */
957 	sz = sizeof(struct rt_msghdr) + sizeof(struct nhgrp_external);
958 	sz += 2 * sizeof(struct nhgrp_container);
959 	sz += 2 * sizeof(struct nhgrp_nhop_external) * RIB_MAX_MPATH_WIDTH;
960 	buffer = malloc(sz, M_TEMP, M_NOWAIT);
961 	if (buffer == NULL)
962 		return (ENOMEM);
963 
964 	NET_EPOCH_ENTER(et);
965 	NHOPS_RLOCK(ctl);
966 	CHT_SLIST_FOREACH(&ctl->gr_head, mpath, nhg_priv) {
967 		error = dump_nhgrp_entry(rh, nhg_priv, buffer, sz, w);
968 		if (error != 0)
969 			break;
970 	} CHT_SLIST_FOREACH_END;
971 	NHOPS_RUNLOCK(ctl);
972 	NET_EPOCH_EXIT(et);
973 
974 	free(buffer, M_TEMP);
975 
976 	return (error);
977 }
978