xref: /freebsd/sys/net/route/nhgrp_ctl.c (revision f9fd7337f63698f33239c58c07bf430198235a22)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2020 Alexander V. Chernikov
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  *
27  * $FreeBSD$
28  */
29 #define RTDEBUG
30 #include "opt_inet.h"
31 #include "opt_route.h"
32 
33 #include <sys/cdefs.h>
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/lock.h>
37 #include <sys/rmlock.h>
38 #include <sys/malloc.h>
39 #include <sys/mbuf.h>
40 #include <sys/refcount.h>
41 #include <sys/socket.h>
42 #include <sys/sysctl.h>
43 #include <sys/kernel.h>
44 #include <sys/epoch.h>
45 
46 #include <net/if.h>
47 #include <net/if_var.h>
48 #include <net/route.h>
49 #include <net/route/route_ctl.h>
50 #include <net/route/route_var.h>
51 #include <net/vnet.h>
52 
53 #include <netinet/in.h>
54 #include <netinet/in_var.h>
55 #include <netinet/in_fib.h>
56 
57 #include <net/route/nhop_utils.h>
58 #include <net/route/nhop.h>
59 #include <net/route/nhop_var.h>
60 #include <net/route/nhgrp_var.h>
61 
62 /*
63  * This file contains the supporting functions for creating multipath groups
64  *  and compiling their dataplane parts.
65  */
66 
67 /* MPF_MULTIPATH must be the same as NHF_MULTIPATH for nhop selection to work */
68 _Static_assert(MPF_MULTIPATH == NHF_MULTIPATH,
69     "MPF_MULTIPATH must be the same as NHF_MULTIPATH");
70 /* Offset and size of flags field has to be the same for nhop/nhop groups */
71 CHK_STRUCT_FIELD_GENERIC(struct nhop_object, nh_flags, struct nhgrp_object, nhg_flags);
72 /* Cap multipath to 64, as the larger values would break rib_cmd_info bmasks */
73 CTASSERT(RIB_MAX_MPATH_WIDTH <= 64);
74 
75 static int wn_cmp(const void *a, const void *b);
76 static void sort_weightened_nhops(struct weightened_nhop *wn, int num_nhops);
77 
78 static struct nhgrp_priv *get_nhgrp(struct nh_control *ctl,
79     struct weightened_nhop *wn, int num_nhops, int *perror);
80 static void destroy_nhgrp(struct nhgrp_priv *nhg_priv);
81 static void destroy_nhgrp_epoch(epoch_context_t ctx);
82 static void free_nhgrp_nhops(struct nhgrp_priv *nhg_priv);
83 
84 static int
85 wn_cmp(const void *a, const void *b)
86 {
87 	const struct weightened_nhop *wa = a;
88 	const struct weightened_nhop *wb = b;
89 
90 	if (wa->weight > wb->weight)
91 		return (1);
92 	else if (wa->weight < wb->weight)
93 		return (-1);
94 
95 	/* Compare nexthops by pointer */
96 	if (wa->nh > wb->nh)
97 		return (1);
98 	else if (wa->nh < wb->nh)
99 		return (-1);
100 	else
101 		return (0);
102 }
103 
104 /*
105  * Perform in-place sorting for array of nexthops in @wn.
106  *
107  * To avoid nh groups duplication, nexthops/weights in the
108  *   @wn need to be ordered deterministically.
109  * As this sorting is needed only for the control plane functionality,
110  *  there are no specific external requirements.
111  *
112  * Sort by weight first, to ease calculation of the slot sizes.
113  */
114 static void
115 sort_weightened_nhops(struct weightened_nhop *wn, int num_nhops)
116 {
117 
118 	qsort(wn, num_nhops, sizeof(struct weightened_nhop), wn_cmp);
119 }
120 
121 /*
122  * Calculate minimum number of slots required to fit the existing
123  * set of weights in the common use case where weights are "easily"
124  * comparable.
125  * Assumes @wn is sorted by weight ascending and each weight is > 0.
126  * Returns number of slots or 0 if precise calculation failed.
127  *
128  * Some examples:
129  * note: (i, X) pair means (nhop=i, weight=X):
130  * (1, 1) (2, 2) -> 3 slots [1, 2, 2]
131  * (1, 100), (2, 200) -> 3 slots [1, 2, 2]
132  * (1, 100), (2, 200), (3, 400) -> 7 slots [1, 2, 2, 3, 3, 3]
133  */
134 static uint32_t
135 calc_min_mpath_slots_fast(const struct weightened_nhop *wn, size_t num_items)
136 {
137 	uint32_t i, last, xmin;
138 	uint64_t total = 0;
139 
140 	last = 0;
141 	xmin = wn[0].weight;
142 	for (i = 0; i < num_items; i++) {
143 		total += wn[i].weight;
144 		if ((wn[i].weight - last < xmin) && (wn[i].weight != last))
145 			xmin = wn[i].weight - last;
146 		last = wn[i].weight;
147 	}
148 	/* xmin is the minimum unit of desired capacity */
149 	if ((total % xmin) != 0)
150 		return (0);
151 	for (i = 0; i < num_items; i++) {
152 		if ((wn[i].weight % xmin) != 0)
153 			return (0);
154 	}
155 
156 	return ((uint32_t)(total / xmin));
157 }
158 
159 /*
160  * Calculate minimum number of slots required to fit the existing
161  * set of weights while maintaining weight coefficients.
162  *
163  * Assume @wn is sorted by weight ascending and each weight is > 0.
164  *
165  * Tries to find simple precise solution first and falls back to
166  *  RIB_MAX_MPATH_WIDTH in case of any failure.
167  */
168 static uint32_t
169 calc_min_mpath_slots(const struct weightened_nhop *wn, size_t num_items)
170 {
171 	uint32_t v;
172 
173 	v = calc_min_mpath_slots_fast(wn, num_items);
174 	if ((v == 0) || (v > RIB_MAX_MPATH_WIDTH))
175 		v = RIB_MAX_MPATH_WIDTH;
176 
177 	return (v);
178 }
179 
180 /*
181  * Nexthop group data consists of
182  * 1) dataplane part, with nhgrp_object as a header followed by an
183  *   arbitrary number of nexthop pointers.
184  * 2) control plane part, with nhgrp_priv as a header, followed by
185  *   an arbirtrary number of 'struct weightened_nhop' object.
186  *
187  * Given nexthop groups are (mostly) immutable, allocate all data
188  * in one go.
189  *
190  */
191 __noinline static size_t
192 get_nhgrp_alloc_size(uint32_t nhg_size, uint32_t num_nhops)
193 {
194 	size_t sz;
195 
196 	sz = sizeof(struct nhgrp_object);
197 	sz += nhg_size * sizeof(struct nhop_object *);
198 	sz += sizeof(struct nhgrp_priv);
199 	sz += num_nhops * sizeof(struct weightened_nhop);
200 	return (sz);
201 }
202 
203 /*
204  * Compile actual list of nexthops to be used by datapath from
205  *  the nexthop group @dst.
206  *
207  * For example, compiling control plane list of 2 nexthops
208  *  [(200, A), (100, B)] would result in the datapath array
209  *  [A, A, B]
210  */
211 static void
212 compile_nhgrp(struct nhgrp_priv *dst_priv, const struct weightened_nhop *x,
213     uint32_t num_slots)
214 {
215 	struct nhgrp_object *dst;
216 	int i, slot_idx, remaining_slots;
217 	uint64_t remaining_sum, nh_weight, nh_slots;
218 
219 	slot_idx  = 0;
220 	dst = dst_priv->nhg;
221 	/* Calculate sum of all weights */
222 	remaining_sum = 0;
223 	for (i = 0; i < dst_priv->nhg_nh_count; i++)
224 		remaining_sum += x[i].weight;
225 	remaining_slots = num_slots;
226 	DPRINTF("O: %u/%u", (uint32_t)remaining_sum, remaining_slots);
227 	for (i = 0; i < dst_priv->nhg_nh_count; i++) {
228 		/* Calculate number of slots for the current nexthop */
229 		if (remaining_sum > 0) {
230 			nh_weight = (uint64_t)x[i].weight;
231 			nh_slots = (nh_weight * remaining_slots / remaining_sum);
232 		} else
233 			nh_slots = 0;
234 
235 		remaining_sum -= x[i].weight;
236 		remaining_slots -= nh_slots;
237 
238 		DPRINTF(" OO[%d]: %u/%u curr=%d slot_idx=%d", i,
239 		    (uint32_t)remaining_sum, remaining_slots,
240 		    (int)nh_slots, slot_idx);
241 
242 		KASSERT((slot_idx + nh_slots <= num_slots),
243 		    ("index overflow during nhg compilation"));
244 		while (nh_slots-- > 0)
245 			dst->nhops[slot_idx++] = x[i].nh;
246 	}
247 }
248 
249 /*
250  * Allocates new nexthop group for the list of weightened nexthops.
251  * Assume sorted list.
252  * Does NOT reference any nexthops in the group.
253  * Returns group with refcount=1 or NULL.
254  */
255 static struct nhgrp_priv *
256 alloc_nhgrp(struct weightened_nhop *wn, int num_nhops)
257 {
258 	uint32_t nhgrp_size;
259 	int flags = M_NOWAIT;
260 	struct nhgrp_object *nhg;
261 	struct nhgrp_priv *nhg_priv;
262 
263 	nhgrp_size = calc_min_mpath_slots(wn, num_nhops);
264 	if (nhgrp_size == 0) {
265 		/* Zero weights, abort */
266 		return (NULL);
267 	}
268 
269 	size_t sz = get_nhgrp_alloc_size(nhgrp_size, num_nhops);
270 	nhg = malloc(sz, M_NHOP, flags | M_ZERO);
271 	if (nhg == NULL) {
272 		return (NULL);
273 	}
274 
275 	/* Has to be the first to make NHGRP_PRIV() work */
276 	nhg->nhg_size = nhgrp_size;
277 	DPRINTF("new mpath group: num_nhops: %u", (uint32_t)nhgrp_size);
278 	nhg->nhg_flags = MPF_MULTIPATH;
279 
280 	nhg_priv = NHGRP_PRIV(nhg);
281 	nhg_priv->nhg_nh_count = num_nhops;
282 	refcount_init(&nhg_priv->nhg_refcount, 1);
283 
284 	/* Please see nhgrp_free() comments on the initial value */
285 	refcount_init(&nhg_priv->nhg_linked, 2);
286 
287 	nhg_priv->nhg = nhg;
288 	memcpy(&nhg_priv->nhg_nh_weights[0], wn,
289 	  num_nhops * sizeof(struct weightened_nhop));
290 
291 	compile_nhgrp(nhg_priv, wn, nhg->nhg_size);
292 
293 	return (nhg_priv);
294 }
295 
296 void
297 nhgrp_free(struct nhgrp_object *nhg)
298 {
299 	struct nhgrp_priv *nhg_priv;
300 	struct nh_control *ctl;
301 	struct epoch_tracker et;
302 
303 	nhg_priv = NHGRP_PRIV(nhg);
304 
305 	if (!refcount_release(&nhg_priv->nhg_refcount))
306 		return;
307 
308 	/*
309 	 * group objects don't have an explicit lock attached to it.
310 	 * As groups are reclaimed based on reference count, it is possible
311 	 * that some groups will persist after vnet destruction callback
312 	 * called. Given that, handle scenario with nhgrp_free_group() being
313 	 * called either after or simultaneously with nhgrp_ctl_unlink_all()
314 	 * by using another reference counter: nhg_linked.
315 	 *
316 	 * There are only 2 places, where nhg_linked can be decreased:
317 	 *  rib destroy (nhgrp_ctl_unlink_all) and this function.
318 	 * nhg_link can never be increased.
319 	 *
320 	 * Hence, use initial value of 2 to make use of
321 	 *  refcount_release_if_not_last().
322 	 *
323 	 * There can be two scenarious when calling this function:
324 	 *
325 	 * 1) nhg_linked value is 2. This means that either
326 	 *  nhgrp_ctl_unlink_all() has not been called OR it is running,
327 	 *  but we are guaranteed that nh_control won't be freed in
328 	 *  this epoch. Hence, nexthop can be safely unlinked.
329 	 *
330 	 * 2) nh_linked value is 1. In that case, nhgrp_ctl_unlink_all()
331 	 *  has been called and nhgrp unlink can be skipped.
332 	 */
333 
334 	NET_EPOCH_ENTER(et);
335 	if (refcount_release_if_not_last(&nhg_priv->nhg_linked)) {
336 		ctl = nhg_priv->nh_control;
337 		if (unlink_nhgrp(ctl, nhg_priv) == NULL) {
338 			/* Do not try to reclaim */
339 			DPRINTF("Failed to unlink nexhop group %p", nhg_priv);
340 			NET_EPOCH_EXIT(et);
341 			return;
342 		}
343 	}
344 	NET_EPOCH_EXIT(et);
345 
346 	epoch_call(net_epoch_preempt, destroy_nhgrp_epoch,
347 	    &nhg_priv->nhg_epoch_ctx);
348 }
349 
350 /*
351  * Destroys all local resources belonging to @nhg_priv.
352  */
353 __noinline static void
354 destroy_nhgrp_int(struct nhgrp_priv *nhg_priv)
355 {
356 
357 	free(nhg_priv->nhg, M_NHOP);
358 }
359 
360 __noinline static void
361 destroy_nhgrp(struct nhgrp_priv *nhg_priv)
362 {
363 
364 	KASSERT((nhg_priv->nhg_refcount == 0), ("nhg_refcount != 0"));
365 
366 	DPRINTF("DEL MPATH %p", nhg_priv);
367 
368 	KASSERT((nhg_priv->nhg_idx == 0), ("gr_idx != 0"));
369 
370 	free_nhgrp_nhops(nhg_priv);
371 
372 	destroy_nhgrp_int(nhg_priv);
373 }
374 
375 /*
376  * Epoch callback indicating group is safe to destroy
377  */
378 static void
379 destroy_nhgrp_epoch(epoch_context_t ctx)
380 {
381 	struct nhgrp_priv *nhg_priv;
382 
383 	nhg_priv = __containerof(ctx, struct nhgrp_priv, nhg_epoch_ctx);
384 
385 	destroy_nhgrp(nhg_priv);
386 }
387 
388 static bool
389 ref_nhgrp_nhops(struct nhgrp_priv *nhg_priv)
390 {
391 
392 	for (int i = 0; i < nhg_priv->nhg_nh_count; i++) {
393 		if (nhop_try_ref_object(nhg_priv->nhg_nh_weights[i].nh) != 0)
394 			continue;
395 
396 		/*
397 		 * Failed to ref the nexthop, b/c it's deleted.
398 		 * Need to rollback references back.
399 		 */
400 		for (int j = 0; j < i; j++)
401 			nhop_free(nhg_priv->nhg_nh_weights[j].nh);
402 		return (false);
403 	}
404 
405 	return (true);
406 }
407 
408 static void
409 free_nhgrp_nhops(struct nhgrp_priv *nhg_priv)
410 {
411 
412 	for (int i = 0; i < nhg_priv->nhg_nh_count; i++)
413 		nhop_free(nhg_priv->nhg_nh_weights[i].nh);
414 }
415 
416 /*
417  * Creates or looks up an existing nexthop group based on @wn and @num_nhops.
418  *
419  * Returns referenced nhop group or NULL, passing error code in @perror.
420  */
421 struct nhgrp_priv *
422 get_nhgrp(struct nh_control *ctl, struct weightened_nhop *wn, int num_nhops,
423     int *perror)
424 {
425 	struct nhgrp_priv *key, *nhg_priv;
426 
427 	if (num_nhops > RIB_MAX_MPATH_WIDTH) {
428 		*perror = E2BIG;
429 		return (NULL);
430 	}
431 
432 	if (ctl->gr_head.hash_size == 0) {
433 		/* First multipath request. Bootstrap mpath datastructures. */
434 		if (nhgrp_ctl_alloc_default(ctl, M_NOWAIT) == 0) {
435 			*perror = ENOMEM;
436 			return (NULL);
437 		}
438 	}
439 
440 	/* Sort nexthops & check there are no duplicates */
441 	sort_weightened_nhops(wn, num_nhops);
442 	uint32_t last_id = 0;
443 	for (int i = 0; i < num_nhops; i++) {
444 		if (wn[i].nh->nh_priv->nh_idx == last_id) {
445 			*perror = EEXIST;
446 			return (NULL);
447 		}
448 		last_id = wn[i].nh->nh_priv->nh_idx;
449 	}
450 
451 	if ((key = alloc_nhgrp(wn, num_nhops)) == NULL) {
452 		*perror = ENOMEM;
453 		return (NULL);
454 	}
455 
456 	nhg_priv = find_nhgrp(ctl, key);
457 	if (nhg_priv != NULL) {
458 		/*
459 		 * Free originally-created group. As it hasn't been linked
460 		 *  and the dependent nexhops haven't been referenced, just free
461 		 *  the group.
462 		 */
463 		destroy_nhgrp_int(key);
464 		*perror = 0;
465 		return (nhg_priv);
466 	} else {
467 		/* No existing group, try to link the new one */
468 		if (!ref_nhgrp_nhops(key)) {
469 			/*
470 			 * Some of the nexthops have been scheduled for deletion.
471 			 * As the group hasn't been linked / no nexhops have been
472 			 *  referenced, call the final destructor immediately.
473 			 */
474 			destroy_nhgrp_int(key);
475 			*perror = EAGAIN;
476 			return (NULL);
477 		}
478 		if (link_nhgrp(ctl, key) == 0) {
479 			/* Unable to allocate index? */
480 			*perror = EAGAIN;
481 			destroy_nhgrp(key);
482 		}
483 		*perror = 0;
484 		return (key);
485 	}
486 
487 	/* NOTREACHED */
488 }
489 
490 /*
491  * Appends one or more nexthops denoted by @wm to the nexthop group @gr_orig.
492  *
493  * Returns referenced nexthop group or NULL. In the latter case, @perror is
494  *  filled with an error code.
495  * Note that function does NOT care if the next nexthops already exists
496  * in the @gr_orig. As a result, they will be added, resulting in the
497  * same nexthop being present multiple times in the new group.
498  */
499 static struct nhgrp_priv *
500 append_nhops(struct nh_control *ctl, const struct nhgrp_object *gr_orig,
501     struct weightened_nhop *wn, int num_nhops, int *perror)
502 {
503 	char storage[64];
504 	struct weightened_nhop *pnhops;
505 	struct nhgrp_priv *nhg_priv;
506 	const struct nhgrp_priv *src_priv;
507 	size_t sz;
508 	int curr_nhops;
509 
510 	src_priv = NHGRP_PRIV_CONST(gr_orig);
511 	curr_nhops = src_priv->nhg_nh_count;
512 
513 	*perror = 0;
514 
515 	sz = (src_priv->nhg_nh_count + num_nhops) * (sizeof(struct weightened_nhop));
516 	/* optimize for <= 4 paths, each path=16 bytes */
517 	if (sz <= sizeof(storage))
518 		pnhops = (struct weightened_nhop *)&storage[0];
519 	else {
520 		pnhops = malloc(sz, M_TEMP, M_NOWAIT);
521 		if (pnhops == NULL) {
522 			*perror = ENOMEM;
523 			return (NULL);
524 		}
525 	}
526 
527 	/* Copy nhops from original group first */
528 	memcpy(pnhops, src_priv->nhg_nh_weights,
529 	  curr_nhops * sizeof(struct weightened_nhop));
530 	memcpy(&pnhops[curr_nhops], wn, num_nhops * sizeof(struct weightened_nhop));
531 	curr_nhops += num_nhops;
532 
533 	nhg_priv = get_nhgrp(ctl, pnhops, curr_nhops, perror);
534 
535 	if (pnhops != (struct weightened_nhop *)&storage[0])
536 		free(pnhops, M_TEMP);
537 
538 	if (nhg_priv == NULL)
539 		return (NULL);
540 
541 	return (nhg_priv);
542 }
543 
544 
545 /*
546  * Creates/finds nexthop group based on @wn and @num_nhops.
547  * Returns 0 on success with referenced group in @rnd, or
548  * errno.
549  *
550  * If the error is EAGAIN, then the operation can be retried.
551  */
552 int
553 nhgrp_get_group(struct rib_head *rh, struct weightened_nhop *wn, int num_nhops,
554     struct route_nhop_data *rnd)
555 {
556 	struct nh_control *ctl = rh->nh_control;
557 	struct nhgrp_priv *nhg_priv;
558 	int error;
559 
560 	nhg_priv = get_nhgrp(ctl, wn, num_nhops, &error);
561 	if (nhg_priv != NULL)
562 		rnd->rnd_nhgrp = nhg_priv->nhg;
563 	rnd->rnd_weight = 0;
564 
565 	return (error);
566 }
567 
568 /*
569  * Creates new nexthop group based on @src group with the nexthops defined in bitmask
570  *  @nhop_mask removed.
571  * Returns referenced nexthop group or NULL on failure.
572  */
573 int
574 nhgrp_get_filtered_group(struct rib_head *rh, const struct nhgrp_object *src,
575     nhgrp_filter_cb_t flt_func, void *flt_data, struct route_nhop_data *rnd)
576 {
577 	char storage[64];
578 	struct nh_control *ctl = rh->nh_control;
579 	struct weightened_nhop *pnhops;
580 	const struct nhgrp_priv *mp_priv, *src_priv;
581 	size_t sz;
582 	int error, i, num_nhops;
583 
584 	src_priv = NHGRP_PRIV_CONST(src);
585 
586 	sz = src_priv->nhg_nh_count * (sizeof(struct weightened_nhop));
587 	/* optimize for <= 4 paths, each path=16 bytes */
588 	if (sz <= sizeof(storage))
589 		pnhops = (struct weightened_nhop *)&storage[0];
590 	else {
591 		if ((pnhops = malloc(sz, M_TEMP, M_NOWAIT)) == NULL)
592 			return (ENOMEM);
593 	}
594 
595 	/* Filter nexthops */
596 	error = 0;
597 	num_nhops = 0;
598 	for (i = 0; i < src_priv->nhg_nh_count; i++) {
599 		if (flt_func(src_priv->nhg_nh_weights[i].nh, flt_data))
600 			continue;
601 		memcpy(&pnhops[num_nhops++], &src_priv->nhg_nh_weights[i],
602 		  sizeof(struct weightened_nhop));
603 	}
604 
605 	if (num_nhops == 0) {
606 		rnd->rnd_nhgrp = NULL;
607 		rnd->rnd_weight = 0;
608 	} else if (num_nhops == 1) {
609 		rnd->rnd_nhop = pnhops[0].nh;
610 		rnd->rnd_weight = pnhops[0].weight;
611 		if (nhop_try_ref_object(rnd->rnd_nhop) == 0)
612 			error = EAGAIN;
613 	} else {
614 		mp_priv = get_nhgrp(ctl, pnhops, num_nhops, &error);
615 		if (mp_priv != NULL)
616 			rnd->rnd_nhgrp = mp_priv->nhg;
617 		rnd->rnd_weight = 0;
618 	}
619 
620 	if (pnhops != (struct weightened_nhop *)&storage[0])
621 		free(pnhops, M_TEMP);
622 
623 	return (error);
624 }
625 
626 /*
627  * Creates new multipath group based on existing group/nhop in @rnd_orig and
628  *  to-be-added nhop @wn_add.
629  * Returns 0 on success and stores result in @rnd_new.
630  */
631 int
632 nhgrp_get_addition_group(struct rib_head *rh, struct route_nhop_data *rnd_orig,
633     struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_new)
634 {
635 	struct nh_control *ctl = rh->nh_control;
636 	struct nhgrp_priv *nhg_priv;
637 	struct weightened_nhop wn[2];
638 	int error;
639 
640 	if (rnd_orig->rnd_nhop == NULL) {
641 		/* No paths to add to, just reference current nhop */
642 		*rnd_new = *rnd_add;
643 		if (nhop_try_ref_object(rnd_new->rnd_nhop) == 0)
644 			return (EAGAIN);
645 		return (0);
646 	}
647 
648 	wn[0].nh = rnd_add->rnd_nhop;
649 	wn[0].weight = rnd_add->rnd_weight;
650 
651 	if (!NH_IS_NHGRP(rnd_orig->rnd_nhop)) {
652 		/* Simple merge of 2 non-multipath nexthops */
653 		wn[1].nh = rnd_orig->rnd_nhop;
654 		wn[1].weight = rnd_orig->rnd_weight;
655 		nhg_priv = get_nhgrp(ctl, wn, 2, &error);
656 	} else {
657 		/* Get new nhop group with @rt->rt_nhop as an additional nhop */
658 		nhg_priv = append_nhops(ctl, rnd_orig->rnd_nhgrp, &wn[0], 1,
659 		    &error);
660 	}
661 
662 	if (nhg_priv == NULL)
663 		return (error);
664 	rnd_new->rnd_nhgrp = nhg_priv->nhg;
665 	rnd_new->rnd_weight = 0;
666 
667 	return (0);
668 }
669 
670 /*
671  * Returns pointer to array of nexthops with weights for
672  * given @nhg. Stores number of items in the array into @pnum_nhops.
673  */
674 struct weightened_nhop *
675 nhgrp_get_nhops(struct nhgrp_object *nhg, uint32_t *pnum_nhops)
676 {
677 	struct nhgrp_priv *nhg_priv;
678 
679 	KASSERT(((nhg->nhg_flags & MPF_MULTIPATH) != 0), ("nhop is not mpath"));
680 
681 	nhg_priv = NHGRP_PRIV(nhg);
682 	*pnum_nhops = nhg_priv->nhg_nh_count;
683 
684 	return (nhg_priv->nhg_nh_weights);
685 }
686 
687 __noinline static int
688 dump_nhgrp_entry(struct rib_head *rh, const struct nhgrp_priv *nhg_priv,
689     char *buffer, size_t buffer_size, struct sysctl_req *w)
690 {
691 	struct rt_msghdr *rtm;
692 	struct nhgrp_external *nhge;
693 	struct nhgrp_container *nhgc;
694 	const struct nhgrp_object *nhg;
695 	struct nhgrp_nhop_external *ext;
696 	int error;
697 	size_t sz;
698 
699 	nhg = nhg_priv->nhg;
700 
701 	sz = sizeof(struct rt_msghdr) + sizeof(struct nhgrp_external);
702 	/* controlplane nexthops */
703 	sz += sizeof(struct nhgrp_container);
704 	sz += sizeof(struct nhgrp_nhop_external) * nhg_priv->nhg_nh_count;
705 	/* dataplane nexthops */
706 	sz += sizeof(struct nhgrp_container);
707 	sz += sizeof(struct nhgrp_nhop_external) * nhg->nhg_size;
708 
709 	KASSERT(sz <= buffer_size, ("increase nhgrp buffer size"));
710 
711 	bzero(buffer, sz);
712 
713 	rtm = (struct rt_msghdr *)buffer;
714 	rtm->rtm_msglen = sz;
715 	rtm->rtm_version = RTM_VERSION;
716 	rtm->rtm_type = RTM_GET;
717 
718 	nhge = (struct nhgrp_external *)(rtm + 1);
719 
720 	nhge->nhg_idx = nhg_priv->nhg_idx;
721 	nhge->nhg_refcount = nhg_priv->nhg_refcount;
722 
723 	/* fill in control plane nexthops firs */
724 	nhgc = (struct nhgrp_container *)(nhge + 1);
725 	nhgc->nhgc_type = NHG_C_TYPE_CNHOPS;
726 	nhgc->nhgc_subtype = 0;
727 	nhgc->nhgc_len = sizeof(struct nhgrp_container);
728 	nhgc->nhgc_len += sizeof(struct nhgrp_nhop_external) * nhg_priv->nhg_nh_count;
729 	nhgc->nhgc_count = nhg_priv->nhg_nh_count;
730 
731 	ext = (struct nhgrp_nhop_external *)(nhgc + 1);
732 	for (int i = 0; i < nhg_priv->nhg_nh_count; i++) {
733 		ext[i].nh_idx = nhg_priv->nhg_nh_weights[i].nh->nh_priv->nh_idx;
734 		ext[i].nh_weight = nhg_priv->nhg_nh_weights[i].weight;
735 	}
736 
737 	/* fill in dataplane nexthops */
738 	nhgc = (struct nhgrp_container *)(&ext[nhg_priv->nhg_nh_count]);
739 	nhgc->nhgc_type = NHG_C_TYPE_DNHOPS;
740 	nhgc->nhgc_subtype = 0;
741 	nhgc->nhgc_len = sizeof(struct nhgrp_container);
742 	nhgc->nhgc_len += sizeof(struct nhgrp_nhop_external) * nhg->nhg_size;
743 	nhgc->nhgc_count = nhg->nhg_size;
744 
745 	ext = (struct nhgrp_nhop_external *)(nhgc + 1);
746 	for (int i = 0; i < nhg->nhg_size; i++) {
747 		ext[i].nh_idx = nhg->nhops[i]->nh_priv->nh_idx;
748 		ext[i].nh_weight = 0;
749 	}
750 
751 	error = SYSCTL_OUT(w, buffer, sz);
752 
753 	return (error);
754 }
755 
756 int
757 nhgrp_dump_sysctl(struct rib_head *rh, struct sysctl_req *w)
758 {
759 	struct nh_control *ctl = rh->nh_control;
760 	struct epoch_tracker et;
761 	struct nhgrp_priv *nhg_priv;
762 	char *buffer;
763 	size_t sz;
764 	int error = 0;
765 
766 	if (ctl->gr_head.items_count == 0)
767 		return (0);
768 
769 	/* Calculate the maximum nhop group size in bytes */
770 	sz = sizeof(struct rt_msghdr) + sizeof(struct nhgrp_external);
771 	sz += 2 * sizeof(struct nhgrp_container);
772 	sz += 2 * sizeof(struct nhgrp_nhop_external) * RIB_MAX_MPATH_WIDTH;
773 	buffer = malloc(sz, M_TEMP, M_WAITOK);
774 
775 	NET_EPOCH_ENTER(et);
776 	NHOPS_RLOCK(ctl);
777 	CHT_SLIST_FOREACH(&ctl->gr_head, mpath, nhg_priv) {
778 		error = dump_nhgrp_entry(rh, nhg_priv, buffer, sz, w);
779 		if (error != 0)
780 			break;
781 	} CHT_SLIST_FOREACH_END;
782 	NHOPS_RUNLOCK(ctl);
783 	NET_EPOCH_EXIT(et);
784 
785 	free(buffer, M_TEMP);
786 
787 	return (error);
788 }
789