xref: /freebsd/sys/net/route/nhgrp_ctl.c (revision c0256b31efcccb6964822b5aadb183e8a6d45507)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2020 Alexander V. Chernikov
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 #include "opt_inet.h"
28 
29 #include <sys/param.h>
30 #include <sys/systm.h>
31 #include <sys/lock.h>
32 #include <sys/rmlock.h>
33 #include <sys/malloc.h>
34 #include <sys/mbuf.h>
35 #include <sys/refcount.h>
36 #include <sys/socket.h>
37 #include <sys/sysctl.h>
38 #include <sys/kernel.h>
39 #include <sys/epoch.h>
40 
41 #include <net/if.h>
42 #include <net/if_var.h>
43 #include <net/if_private.h>
44 #include <net/route.h>
45 #include <net/route/route_ctl.h>
46 #include <net/route/route_var.h>
47 #include <net/vnet.h>
48 
49 #include <netinet/in.h>
50 #include <netinet/in_var.h>
51 #include <netinet/in_fib.h>
52 
53 #include <net/route/nhop_utils.h>
54 #include <net/route/nhop.h>
55 #include <net/route/nhop_var.h>
56 #include <net/route/nhgrp_var.h>
57 
58 #define	DEBUG_MOD_NAME	nhgrp_ctl
59 #define	DEBUG_MAX_LEVEL	LOG_DEBUG
60 #include <net/route/route_debug.h>
61 _DECLARE_DEBUG(LOG_INFO);
62 
63 /*
64  * This file contains the supporting functions for creating multipath groups
65  *  and compiling their dataplane parts.
66  */
67 
68 /* MPF_MULTIPATH must be the same as NHF_MULTIPATH for nhop selection to work */
69 _Static_assert(MPF_MULTIPATH == NHF_MULTIPATH,
70     "MPF_MULTIPATH must be the same as NHF_MULTIPATH");
71 /* Offset and size of flags field has to be the same for nhop/nhop groups */
72 CHK_STRUCT_FIELD_GENERIC(struct nhop_object, nh_flags, struct nhgrp_object, nhg_flags);
73 /* Cap multipath to 64, as the larger values would break rib_cmd_info bmasks */
74 CTASSERT(RIB_MAX_MPATH_WIDTH <= 64);
75 
76 static int wn_cmp_idx(const void *a, const void *b);
77 static void sort_weightened_nhops(struct weightened_nhop *wn, int num_nhops);
78 
79 static struct nhgrp_priv *get_nhgrp(struct nh_control *ctl,
80     struct weightened_nhop *wn, int num_nhops, uint32_t uidx, int *perror);
81 static void destroy_nhgrp(struct nhgrp_priv *nhg_priv);
82 static void destroy_nhgrp_epoch(epoch_context_t ctx);
83 static void free_nhgrp_nhops(struct nhgrp_priv *nhg_priv);
84 
85 static int
wn_cmp_idx(const void * a,const void * b)86 wn_cmp_idx(const void *a, const void *b)
87 {
88 	const struct weightened_nhop *w_a = a;
89 	const struct weightened_nhop *w_b = b;
90 	uint32_t a_idx = w_a->nh->nh_priv->nh_idx;
91 	uint32_t b_idx = w_b->nh->nh_priv->nh_idx;
92 
93 	if (a_idx < b_idx)
94 		return (-1);
95 	else if (a_idx > b_idx)
96 		return (1);
97 	else
98 		return (0);
99 }
100 
101 /*
102  * Perform in-place sorting for array of nexthops in @wn.
103  * Sort by nexthop index ascending.
104  */
105 static void
sort_weightened_nhops(struct weightened_nhop * wn,int num_nhops)106 sort_weightened_nhops(struct weightened_nhop *wn, int num_nhops)
107 {
108 
109 	qsort(wn, num_nhops, sizeof(struct weightened_nhop), wn_cmp_idx);
110 }
111 
112 /*
113  * In order to determine the minimum weight difference in the array
114  * of weights, create a sorted array of weights, using spare "storage"
115  * field in the `struct weightened_nhop`.
116  * Assume weights to be (mostly) the same and use insertion sort to
117  * make it sorted.
118  */
119 static void
sort_weightened_nhops_weights(struct weightened_nhop * wn,int num_items)120 sort_weightened_nhops_weights(struct weightened_nhop *wn, int num_items)
121 {
122 	wn[0].storage = wn[0].weight;
123 	for (int i = 1, j = 0; i < num_items; i++) {
124 		uint32_t weight = wn[i].weight; // read from 'weight' as it's not reordered
125 		/* Move all weights > weight 1 position right */
126 		for (j = i - 1; j >= 0 && wn[j].storage > weight; j--)
127 			wn[j + 1].storage = wn[j].storage;
128 		wn[j + 1].storage = weight;
129 	}
130 }
131 
132 /*
133  * Calculate minimum number of slots required to fit the existing
134  * set of weights in the common use case where weights are "easily"
135  * comparable.
136  * Assumes @wn is sorted by weight ascending and each weight is > 0.
137  * Returns number of slots or 0 if precise calculation failed.
138  * Only calculate for nexthops with specified metric and ignore the rest.
139  *
140  * Some examples:
141  * note: (i, X) pair means (nhop=i, weight=X):
142  * (1, 1) (2, 2) -> 3 slots [1, 2, 2]
143  * (1, 100), (2, 200) -> 3 slots [1, 2, 2]
144  * (1, 100), (2, 200), (3, 400) -> 7 slots [1, 2, 2, 3, 3, 3]
145  */
146 static uint32_t
calc_min_mpath_slots_fast(struct weightened_nhop * wn,size_t num_items,uint32_t metric,uint64_t * ptotal)147 calc_min_mpath_slots_fast(struct weightened_nhop *wn, size_t num_items,
148     uint32_t metric, uint64_t *ptotal)
149 {
150 	uint32_t i, x, last, xmin = 0;
151 	uint64_t total = 0;
152 
153 	// Get sorted array of weights in .storage field
154 	sort_weightened_nhops_weights(wn, num_items);
155 
156 	/* start with lowest metric */
157 	for (x = 0; x < num_items; x++) {
158 		if (nhop_get_metric(wn[x].nh) == metric) {
159 			xmin = wn[x].storage;
160 			break;
161 		}
162 	}
163 	last = 0;
164 	for (i = x; i < num_items; i++) {
165 		if (nhop_get_metric(wn[i].nh) != metric)
166 			continue;
167 
168 		total += wn[i].storage;
169 		if ((wn[i].storage != last) &&
170 		    ((wn[i].storage - last < xmin) || xmin == 0)) {
171 			xmin = wn[i].storage - last;
172 		}
173 		last = wn[i].storage;
174 	}
175 	*ptotal = total;
176 	/* xmin is the minimum unit of desired capacity */
177 	if ((total % xmin) != 0)
178 		return (0);
179 	for (i = 0; i < num_items; i++) {
180 		if ((wn[i].weight % xmin) != 0)
181 			return (0);
182 	}
183 
184 	return ((uint32_t)(total / xmin));
185 }
186 
187 /*
188  * Calculate minimum number of slots required to fit the existing
189  * set of weights while maintaining weight coefficients
190  * after filtering by metric.
191  *
192  * Assume @wn is sorted by weight ascending and each weight is > 0.
193  *
194  * Tries to find simple precise solution first and falls back to
195  *  RIB_MAX_MPATH_WIDTH in case of any failure.
196  */
197 static uint32_t
calc_min_mpath_slots(struct weightened_nhop * wn,size_t num_items,uint32_t metric)198 calc_min_mpath_slots(struct weightened_nhop *wn, size_t num_items,
199     uint32_t metric)
200 {
201 	uint32_t v;
202 	uint64_t total;
203 
204 	v = calc_min_mpath_slots_fast(wn, num_items, metric, &total);
205 	if (total == 0)
206 		return (0);
207 	if ((v == 0) || (v > RIB_MAX_MPATH_WIDTH))
208 		v = RIB_MAX_MPATH_WIDTH;
209 
210 	return (v);
211 }
212 
213 /*
214  * Nexthop group data consists of
215  * 1) dataplane part, with nhgrp_object as a header followed by an
216  *   arbitrary number of nexthop pointers.
217  * 2) control plane part, with nhgrp_priv as a header, followed by
218  *   an arbirtrary number of 'struct weightened_nhop' object.
219  *
220  * Given nexthop groups are (mostly) immutable, allocate all data
221  * in one go.
222  *
223  */
224 __noinline static size_t
get_nhgrp_alloc_size(uint32_t nhg_size,uint32_t num_nhops)225 get_nhgrp_alloc_size(uint32_t nhg_size, uint32_t num_nhops)
226 {
227 	size_t sz;
228 
229 	sz = sizeof(struct nhgrp_object);
230 	sz += nhg_size * sizeof(struct nhop_object *);
231 	sz += sizeof(struct nhgrp_priv);
232 	sz += num_nhops * sizeof(struct weightened_nhop);
233 	return (sz);
234 }
235 
236 /*
237  * Compile actual list of nexthops to be used by datapath from
238  *  the nexthop group @dst.
239  * Since we only need nexthops with lowest metric, only process
240  * nexthops with specified metric. The metric argument is taken
241  * from input and is expected to be the lowest metric in weightened_nhop.
242  *
243  * For example, compiling control plane list of 2 nexthops
244  *  [(200, A), (100, B)] would result in the datapath array
245  *  [A, A, B]
246  */
247 static void
compile_nhgrp(struct nhgrp_priv * dst_priv,const struct weightened_nhop * x,uint32_t num_slots,uint32_t metric)248 compile_nhgrp(struct nhgrp_priv *dst_priv, const struct weightened_nhop *x,
249     uint32_t num_slots, uint32_t metric)
250 {
251 	struct nhgrp_object *dst;
252 	int i, slot_idx, remaining_slots;
253 	uint64_t remaining_sum, nh_weight, nh_slots;
254 
255 	slot_idx  = 0;
256 	dst = dst_priv->nhg;
257 	/* Calculate sum of all weights with lowest metric */
258 	remaining_sum = 0;
259 	for (i = 0; i < dst_priv->nhg_nh_count; i++) {
260 		if (nhop_get_metric(x[i].nh) == metric)
261 			remaining_sum += x[i].weight;
262 	}
263 
264 	remaining_slots = num_slots;
265 	FIB_NH_LOG(LOG_DEBUG3, x[0].nh, "sum: %lu, slots: %d, lowest_metric: %u",
266 	    remaining_sum, remaining_slots, metric);
267 	for (i = 0; i < dst_priv->nhg_nh_count; i++) {
268 		if (nhop_get_metric(x[i].nh) != metric)
269 			continue;
270 
271 		/* Calculate number of slots for the current nexthop */
272 		if (remaining_sum > 0) {
273 			nh_weight = (uint64_t)x[i].weight;
274 			nh_slots = (nh_weight * remaining_slots / remaining_sum);
275 		} else
276 			nh_slots = 0;
277 
278 		remaining_sum -= x[i].weight;
279 		remaining_slots -= nh_slots;
280 
281 		FIB_NH_LOG(LOG_DEBUG3, x[0].nh,
282 		    " rem_sum: %lu, rem_slots: %d nh_slots: %d, slot_idx: %d",
283 		    remaining_sum, remaining_slots, (int)nh_slots, slot_idx);
284 
285 		KASSERT((slot_idx + nh_slots <= num_slots),
286 		    ("index overflow during nhg compilation"));
287 		while (nh_slots-- > 0)
288 			dst->nhops[slot_idx++] = x[i].nh;
289 	}
290 }
291 
292 /*
293  * Allocates new nexthop group for the list of weightened nexthops.
294  * Assume sorted list.
295  * Does NOT reference any nexthops in the group.
296  * Returns group with refcount=1 or NULL.
297  */
298 static struct nhgrp_priv *
alloc_nhgrp(struct weightened_nhop * wn,int num_nhops,uint32_t min_metric)299 alloc_nhgrp(struct weightened_nhop *wn, int num_nhops, uint32_t min_metric)
300 {
301 	uint32_t nhgrp_size;
302 	struct nhgrp_object *nhg;
303 	struct nhgrp_priv *nhg_priv;
304 
305 	nhgrp_size = calc_min_mpath_slots(wn, num_nhops, min_metric);
306 	if (nhgrp_size == 0) {
307 		/* Zero weights, abort */
308 		return (NULL);
309 	}
310 
311 	size_t sz = get_nhgrp_alloc_size(nhgrp_size, num_nhops);
312 	nhg = malloc(sz, M_NHOP, M_NOWAIT | M_ZERO);
313 	if (nhg == NULL) {
314 		FIB_NH_LOG(LOG_INFO, wn[0].nh,
315 		    "unable to allocate group with num_nhops %d (compiled %u)",
316 		    num_nhops, nhgrp_size);
317 		return (NULL);
318 	}
319 
320 	/* Has to be the first to make NHGRP_PRIV() work */
321 	nhg->nhg_size = nhgrp_size;
322 	nhg->nhg_flags = MPF_MULTIPATH;
323 
324 	nhg_priv = NHGRP_PRIV(nhg);
325 	nhg_priv->nhg_nh_count = num_nhops;
326 	refcount_init(&nhg_priv->nhg_refcount, 1);
327 
328 	/* Please see nhgrp_free() comments on the initial value */
329 	refcount_init(&nhg_priv->nhg_linked, 2);
330 
331 	nhg_priv->nhg = nhg;
332 	memcpy(&nhg_priv->nhg_nh_weights[0], wn,
333 	  num_nhops * sizeof(struct weightened_nhop));
334 
335 	FIB_NH_LOG(LOG_DEBUG, wn[0].nh, "num_nhops: %d, compiled_nhop: %u",
336 	    num_nhops, nhgrp_size);
337 
338 	compile_nhgrp(nhg_priv, wn, nhg->nhg_size, min_metric);
339 
340 	return (nhg_priv);
341 }
342 
343 void
nhgrp_ref_object(struct nhgrp_object * nhg)344 nhgrp_ref_object(struct nhgrp_object *nhg)
345 {
346 	struct nhgrp_priv *nhg_priv;
347 	u_int old __diagused;
348 
349 	nhg_priv = NHGRP_PRIV(nhg);
350 	old = refcount_acquire(&nhg_priv->nhg_refcount);
351 	KASSERT(old > 0, ("%s: nhgrp object %p has 0 refs", __func__, nhg));
352 }
353 
354 void
nhgrp_free(struct nhgrp_object * nhg)355 nhgrp_free(struct nhgrp_object *nhg)
356 {
357 	struct nhgrp_priv *nhg_priv;
358 	struct nh_control *ctl;
359 	struct epoch_tracker et;
360 
361 	nhg_priv = NHGRP_PRIV(nhg);
362 
363 	if (!refcount_release(&nhg_priv->nhg_refcount))
364 		return;
365 
366 	/*
367 	 * group objects don't have an explicit lock attached to it.
368 	 * As groups are reclaimed based on reference count, it is possible
369 	 * that some groups will persist after vnet destruction callback
370 	 * called. Given that, handle scenario with nhgrp_free_group() being
371 	 * called either after or simultaneously with nhgrp_ctl_unlink_all()
372 	 * by using another reference counter: nhg_linked.
373 	 *
374 	 * There are only 2 places, where nhg_linked can be decreased:
375 	 *  rib destroy (nhgrp_ctl_unlink_all) and this function.
376 	 * nhg_link can never be increased.
377 	 *
378 	 * Hence, use initial value of 2 to make use of
379 	 *  refcount_release_if_not_last().
380 	 *
381 	 * There can be two scenarious when calling this function:
382 	 *
383 	 * 1) nhg_linked value is 2. This means that either
384 	 *  nhgrp_ctl_unlink_all() has not been called OR it is running,
385 	 *  but we are guaranteed that nh_control won't be freed in
386 	 *  this epoch. Hence, nexthop can be safely unlinked.
387 	 *
388 	 * 2) nh_linked value is 1. In that case, nhgrp_ctl_unlink_all()
389 	 *  has been called and nhgrp unlink can be skipped.
390 	 */
391 
392 	NET_EPOCH_ENTER(et);
393 	if (refcount_release_if_not_last(&nhg_priv->nhg_linked)) {
394 		ctl = nhg_priv->nh_control;
395 		if (unlink_nhgrp(ctl, nhg_priv) == NULL) {
396 			/* Do not try to reclaim */
397 			RT_LOG(LOG_INFO, "Failed to unlink nexhop group %p",
398 			    nhg_priv);
399 			NET_EPOCH_EXIT(et);
400 			return;
401 		}
402 		MPASS((nhg_priv->nhg_idx == 0));
403 		MPASS((nhg_priv->nhg_refcount == 0));
404 	}
405 	NET_EPOCH_EXIT(et);
406 
407 	NET_EPOCH_CALL(destroy_nhgrp_epoch, &nhg_priv->nhg_epoch_ctx);
408 }
409 
410 /*
411  * Destroys all local resources belonging to @nhg_priv.
412  */
413 __noinline static void
destroy_nhgrp_int(struct nhgrp_priv * nhg_priv)414 destroy_nhgrp_int(struct nhgrp_priv *nhg_priv)
415 {
416 
417 	free(nhg_priv->nhg, M_NHOP);
418 }
419 
420 __noinline static void
destroy_nhgrp(struct nhgrp_priv * nhg_priv)421 destroy_nhgrp(struct nhgrp_priv *nhg_priv)
422 {
423 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
424 		char nhgbuf[NHOP_PRINT_BUFSIZE] __unused;
425 		FIB_NH_LOG(LOG_DEBUG2, nhg_priv->nhg_nh_weights[0].nh,
426 		    "destroying %s", nhgrp_print_buf(nhg_priv->nhg,
427 		    nhgbuf, sizeof(nhgbuf)));
428 	}
429 
430 	free_nhgrp_nhops(nhg_priv);
431 	destroy_nhgrp_int(nhg_priv);
432 }
433 
434 /*
435  * Epoch callback indicating group is safe to destroy
436  */
437 static void
destroy_nhgrp_epoch(epoch_context_t ctx)438 destroy_nhgrp_epoch(epoch_context_t ctx)
439 {
440 	struct nhgrp_priv *nhg_priv;
441 
442 	nhg_priv = __containerof(ctx, struct nhgrp_priv, nhg_epoch_ctx);
443 
444 	destroy_nhgrp(nhg_priv);
445 }
446 
447 static bool
ref_nhgrp_nhops(struct nhgrp_priv * nhg_priv)448 ref_nhgrp_nhops(struct nhgrp_priv *nhg_priv)
449 {
450 
451 	for (int i = 0; i < nhg_priv->nhg_nh_count; i++) {
452 		if (nhop_try_ref_object(nhg_priv->nhg_nh_weights[i].nh) != 0)
453 			continue;
454 
455 		/*
456 		 * Failed to ref the nexthop, b/c it's deleted.
457 		 * Need to rollback references back.
458 		 */
459 		for (int j = 0; j < i; j++)
460 			nhop_free(nhg_priv->nhg_nh_weights[j].nh);
461 		return (false);
462 	}
463 
464 	return (true);
465 }
466 
467 static void
free_nhgrp_nhops(struct nhgrp_priv * nhg_priv)468 free_nhgrp_nhops(struct nhgrp_priv *nhg_priv)
469 {
470 
471 	for (int i = 0; i < nhg_priv->nhg_nh_count; i++)
472 		nhop_free(nhg_priv->nhg_nh_weights[i].nh);
473 }
474 
475 /*
476  * Allocate nexthop group of size @num_nhops with nexthops specified by
477  * @wn. Nexthops have to be unique and match the fibnum/family of the group.
478  * Returns unlinked nhgrp object on success or NULL and non-zero perror.
479  */
480 struct nhgrp_object *
nhgrp_alloc(uint32_t fibnum,int family,struct weightened_nhop * wn,int num_nhops,int * perror)481 nhgrp_alloc(uint32_t fibnum, int family, struct weightened_nhop *wn, int num_nhops,
482     int *perror)
483 {
484 	struct rib_head *rh = rt_tables_get_rnh(fibnum, family);
485 	struct nhgrp_priv *nhg_priv;
486 	struct nh_control *ctl;
487 
488 	MPASS((num_nhops != 0));
489 
490 	if (rh == NULL) {
491 		*perror = E2BIG;
492 		return (NULL);
493 	}
494 
495 	ctl = rh->nh_control;
496 
497 	if (num_nhops > RIB_MAX_MPATH_WIDTH) {
498 		*perror = E2BIG;
499 		return (NULL);
500 	}
501 
502 	if (ctl->gr_head.hash_size == 0) {
503 		/* First multipath request. Bootstrap mpath datastructures. */
504 		if (nhgrp_ctl_alloc_default(ctl, M_NOWAIT) == 0) {
505 			*perror = ENOMEM;
506 			return (NULL);
507 		}
508 	}
509 
510 	/* Sort nexthops & check there are no duplicates */
511 	sort_weightened_nhops(wn, num_nhops);
512 	uint32_t last_id = 0;
513 	uint32_t min_metric = nhop_get_metric(wn[0].nh);
514 	for (int i = 0; i < num_nhops; i++) {
515 		if (wn[i].nh->nh_priv->nh_control != ctl) {
516 			*perror = EINVAL;
517 			return (NULL);
518 		}
519 		if (wn[i].nh->nh_priv->nh_idx == last_id) {
520 			*perror = EEXIST;
521 			return (NULL);
522 		}
523 		last_id = wn[i].nh->nh_priv->nh_idx;
524 
525 		if (nhop_get_metric(wn[i].nh) < min_metric)
526 			min_metric = nhop_get_metric(wn[i].nh);
527 	}
528 
529 	if ((nhg_priv = alloc_nhgrp(wn, num_nhops, min_metric)) == NULL) {
530 		*perror = ENOMEM;
531 		return (NULL);
532 	}
533 	nhg_priv->nh_control = ctl;
534 
535 	*perror = 0;
536 	return (nhg_priv->nhg);
537 }
538 
539 /*
540  * Finds an existing group matching @nhg or links @nhg to the tree.
541  * Returns the referenced group or NULL and non-zero @perror.
542  */
543 struct nhgrp_object *
nhgrp_get_nhgrp(struct nhgrp_object * nhg,int * perror)544 nhgrp_get_nhgrp(struct nhgrp_object *nhg, int *perror)
545 {
546 	struct nhgrp_priv *nhg_priv, *key = NHGRP_PRIV(nhg);
547 	struct nh_control *ctl = key->nh_control;
548 
549 	nhg_priv = find_nhgrp(ctl, key);
550 	if (nhg_priv != NULL) {
551 		/*
552 		 * Free originally-created group. As it hasn't been linked
553 		 *  and the dependent nexhops haven't been referenced, just free
554 		 *  the group.
555 		 */
556 		destroy_nhgrp_int(key);
557 		*perror = 0;
558 		return (nhg_priv->nhg);
559 	} else {
560 		/* No existing group, try to link the new one */
561 		if (!ref_nhgrp_nhops(key)) {
562 			/*
563 			 * Some of the nexthops have been scheduled for deletion.
564 			 * As the group hasn't been linked / no nexhops have been
565 			 *  referenced, call the final destructor immediately.
566 			 */
567 			destroy_nhgrp_int(key);
568 			*perror = EAGAIN;
569 			return (NULL);
570 		}
571 		if (link_nhgrp(ctl, key) == 0) {
572 			/* Unable to allocate index? */
573 			*perror = EAGAIN;
574 			free_nhgrp_nhops(key);
575 			destroy_nhgrp_int(key);
576 			return (NULL);
577 		}
578 		*perror = 0;
579 		return (nhg);
580 	}
581 
582 	/* NOTREACHED */
583 }
584 
585 /*
586  * Creates or looks up an existing nexthop group based on @wn and @num_nhops.
587  *
588  * Returns referenced nhop group or NULL, passing error code in @perror.
589  */
590 struct nhgrp_priv *
get_nhgrp(struct nh_control * ctl,struct weightened_nhop * wn,int num_nhops,uint32_t uidx,int * perror)591 get_nhgrp(struct nh_control *ctl, struct weightened_nhop *wn, int num_nhops,
592     uint32_t uidx, int *perror)
593 {
594 	struct nhgrp_object *nhg;
595 
596 	nhg = nhgrp_alloc(ctl->ctl_rh->rib_fibnum, ctl->ctl_rh->rib_family,
597 	    wn, num_nhops, perror);
598 	if (nhg == NULL)
599 		return (NULL);
600 	nhgrp_set_uidx(nhg, uidx);
601 	nhg = nhgrp_get_nhgrp(nhg, perror);
602 	if (nhg != NULL)
603 		return (NHGRP_PRIV(nhg));
604 	return (NULL);
605 }
606 
607 
608 /*
609  * Appends one or more nexthops denoted by @wm to the nexthop group @gr_orig.
610  *
611  * Returns referenced nexthop group or NULL. In the latter case, @perror is
612  *  filled with an error code.
613  * Note that function does NOT care if the next nexthops already exists
614  * in the @gr_orig. As a result, they will be added, resulting in the
615  * same nexthop being present multiple times in the new group.
616  */
617 static struct nhgrp_priv *
append_nhops(struct nh_control * ctl,const struct nhgrp_object * gr_orig,struct weightened_nhop * wn,int num_nhops,int * perror)618 append_nhops(struct nh_control *ctl, const struct nhgrp_object *gr_orig,
619     struct weightened_nhop *wn, int num_nhops, int *perror)
620 {
621 	char storage[64];
622 	struct weightened_nhop *pnhops;
623 	struct nhgrp_priv *nhg_priv;
624 	const struct nhgrp_priv *src_priv;
625 	size_t sz;
626 	int curr_nhops;
627 
628 	src_priv = NHGRP_PRIV_CONST(gr_orig);
629 	curr_nhops = src_priv->nhg_nh_count;
630 
631 	*perror = 0;
632 
633 	sz = (src_priv->nhg_nh_count + num_nhops) * (sizeof(struct weightened_nhop));
634 	/* optimize for <= 4 paths, each path=16 bytes */
635 	if (sz <= sizeof(storage))
636 		pnhops = (struct weightened_nhop *)&storage[0];
637 	else {
638 		pnhops = malloc(sz, M_TEMP, M_NOWAIT);
639 		if (pnhops == NULL) {
640 			*perror = ENOMEM;
641 			return (NULL);
642 		}
643 	}
644 
645 	/* Copy nhops from original group first */
646 	memcpy(pnhops, src_priv->nhg_nh_weights,
647 	  curr_nhops * sizeof(struct weightened_nhop));
648 	memcpy(&pnhops[curr_nhops], wn, num_nhops * sizeof(struct weightened_nhop));
649 	curr_nhops += num_nhops;
650 
651 	nhg_priv = get_nhgrp(ctl, pnhops, curr_nhops, 0, perror);
652 
653 	if (pnhops != (struct weightened_nhop *)&storage[0])
654 		free(pnhops, M_TEMP);
655 
656 	if (nhg_priv == NULL)
657 		return (NULL);
658 
659 	return (nhg_priv);
660 }
661 
662 /*
663  * Merge nexthop group denoted by @gr_add with the nexthop group @gr_orig.
664  *
665  * Returns referenced nexthop group or NULL. In the latter case, @perror is
666  *  filled with an error code.
667  * Note that function does NOT care if the next nexthops already exists
668  * in the @gr_orig. As a result, they will be added, resulting in the
669  * same nexthop being present multiple times in the new group.
670  */
671 static struct nhgrp_priv *
merge_nhgrps(struct nh_control * ctl,const struct nhgrp_object * gr_orig,const struct nhgrp_object * gr_add,int * perror)672 merge_nhgrps(struct nh_control *ctl, const struct nhgrp_object *gr_orig,
673      const struct nhgrp_object *gr_add, int *perror)
674 {
675 	char storage[64];
676 	struct weightened_nhop *pnhops;
677 	struct nhgrp_priv *nhg_priv;
678 	const struct nhgrp_priv *orig_priv, *add_priv;
679 	size_t sz;
680 	int curr_nhops;
681 
682 	orig_priv = NHGRP_PRIV_CONST(gr_orig);
683 	add_priv = NHGRP_PRIV_CONST(gr_add);
684 	curr_nhops = orig_priv->nhg_nh_count;
685 
686 	*perror = 0;
687 
688 	sz = (orig_priv->nhg_nh_count + orig_priv->nhg_nh_count) *
689 		sizeof(struct weightened_nhop);
690 	/* optimize for <= 4 paths, each path=16 bytes */
691 	if (sz <= sizeof(storage))
692 		pnhops = (struct weightened_nhop *)&storage[0];
693 	else {
694 		pnhops = malloc(sz, M_TEMP, M_NOWAIT);
695 		if (pnhops == NULL) {
696 			*perror = ENOMEM;
697 			return (NULL);
698 		}
699 	}
700 
701 	/* First, copy nhops from first group */
702 	memcpy(pnhops, orig_priv->nhg_nh_weights,
703 	   orig_priv->nhg_nh_count * sizeof(struct weightened_nhop));
704 	memcpy(&pnhops[curr_nhops], add_priv->nhg_nh_weights,
705 	   add_priv->nhg_nh_count * sizeof(struct weightened_nhop));
706 	curr_nhops += add_priv->nhg_nh_count;
707 
708 	nhg_priv = get_nhgrp(ctl, pnhops, curr_nhops, 0, perror);
709 
710 	if (pnhops != (struct weightened_nhop *)&storage[0])
711 		free(pnhops, M_TEMP);
712 
713 	if (nhg_priv == NULL)
714 		return (NULL);
715 
716 	return (nhg_priv);
717 }
718 
719 
720 /*
721  * Creates/finds nexthop group based on @wn and @num_nhops.
722  * Returns 0 on success with referenced group in @rnd, or
723  * errno.
724  *
725  * If the error is EAGAIN, then the operation can be retried.
726  */
727 int
nhgrp_get_group(struct rib_head * rh,struct weightened_nhop * wn,int num_nhops,uint32_t uidx,struct nhgrp_object ** pnhg)728 nhgrp_get_group(struct rib_head *rh, struct weightened_nhop *wn, int num_nhops,
729     uint32_t uidx, struct nhgrp_object **pnhg)
730 {
731 	struct nh_control *ctl = rh->nh_control;
732 	struct nhgrp_priv *nhg_priv;
733 	int error;
734 
735 	nhg_priv = get_nhgrp(ctl, wn, num_nhops, uidx, &error);
736 	if (nhg_priv != NULL)
737 		*pnhg = nhg_priv->nhg;
738 
739 	return (error);
740 }
741 
742 /*
743  * Creates new nexthop group based on @src group without the nexthops
744  * chosen by @flt_func.
745  * Returns 0 on success, storring the reference nhop group/object in @rnd.
746  */
747 int
nhgrp_get_filtered_group(struct rib_head * rh,const struct rtentry * rt,const struct nhgrp_object * src,rib_filter_f_t flt_func,void * flt_data,struct route_nhop_data * rnd)748 nhgrp_get_filtered_group(struct rib_head *rh, const struct rtentry *rt,
749     const struct nhgrp_object *src, rib_filter_f_t flt_func, void *flt_data,
750     struct route_nhop_data *rnd)
751 {
752 	char storage[64];
753 	struct nh_control *ctl = rh->nh_control;
754 	struct weightened_nhop *pnhops;
755 	const struct nhgrp_priv *mp_priv, *src_priv;
756 	size_t sz;
757 	int error, i, num_nhops;
758 
759 	src_priv = NHGRP_PRIV_CONST(src);
760 
761 	sz = src_priv->nhg_nh_count * (sizeof(struct weightened_nhop));
762 	/* optimize for <= 4 paths, each path=16 bytes */
763 	if (sz <= sizeof(storage))
764 		pnhops = (struct weightened_nhop *)&storage[0];
765 	else {
766 		if ((pnhops = malloc(sz, M_TEMP, M_NOWAIT)) == NULL)
767 			return (ENOMEM);
768 	}
769 
770 	/* Filter nexthops */
771 	error = 0;
772 	num_nhops = 0;
773 	for (i = 0; i < src_priv->nhg_nh_count; i++) {
774 		if (flt_func(rt, src_priv->nhg_nh_weights[i].nh, flt_data))
775 			continue;
776 		memcpy(&pnhops[num_nhops++], &src_priv->nhg_nh_weights[i],
777 		  sizeof(struct weightened_nhop));
778 	}
779 
780 	if (num_nhops == 0) {
781 		rnd->rnd_nhgrp = NULL;
782 		rnd->rnd_weight = 0;
783 	} else if (num_nhops == 1) {
784 		rnd->rnd_nhop = pnhops[0].nh;
785 		rnd->rnd_weight = pnhops[0].weight;
786 		if (nhop_try_ref_object(rnd->rnd_nhop) == 0)
787 			error = EAGAIN;
788 	} else {
789 		mp_priv = get_nhgrp(ctl, pnhops, num_nhops, 0, &error);
790 		if (mp_priv != NULL)
791 			rnd->rnd_nhgrp = mp_priv->nhg;
792 		rnd->rnd_weight = 0;
793 	}
794 
795 	if (pnhops != (struct weightened_nhop *)&storage[0])
796 		free(pnhops, M_TEMP);
797 
798 	return (error);
799 }
800 
801 /*
802  * Creates new multipath group based on existing group/nhop in @rnd_orig and
803  *  to-be-added nhop @wn_add.
804  * Returns 0 on success and stores result in @rnd_new.
805  */
806 int
nhgrp_get_addition_group(struct rib_head * rh,struct route_nhop_data * rnd_orig,struct route_nhop_data * rnd_add,struct route_nhop_data * rnd_new)807 nhgrp_get_addition_group(struct rib_head *rh, struct route_nhop_data *rnd_orig,
808     struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_new)
809 {
810 	struct nh_control *ctl = rh->nh_control;
811 	struct nhgrp_priv *nhg_priv;
812 	struct weightened_nhop wn[2] = {};
813 	int error;
814 
815 	MPASS((!NH_IS_NHGRP(rnd_add->rnd_nhop)));
816 
817 	if (rnd_orig->rnd_nhop == NULL) {
818 		/* No paths to add to, just reference current nhop */
819 		*rnd_new = *rnd_add;
820 		if (nhop_try_ref_object(rnd_new->rnd_nhop) == 0)
821 			return (EAGAIN);
822 		return (0);
823 	}
824 
825 	wn[0].nh = rnd_add->rnd_nhop;
826 	wn[0].weight = rnd_add->rnd_weight;
827 
828 	if (!NH_IS_NHGRP(rnd_orig->rnd_nhop)) {
829 		/* Simple merge of 2 non-multipath nexthops */
830 		wn[1].nh = rnd_orig->rnd_nhop;
831 		wn[1].weight = rnd_orig->rnd_weight;
832 		nhg_priv = get_nhgrp(ctl, wn, 2, 0, &error);
833 	} else {
834 		/* Get new nhop group with @rt->rt_nhop as an additional nhop */
835 		nhg_priv = append_nhops(ctl, rnd_orig->rnd_nhgrp, &wn[0], 1,
836 		    &error);
837 	}
838 
839 	if (nhg_priv == NULL)
840 		return (error);
841 	rnd_new->rnd_nhgrp = nhg_priv->nhg;
842 	rnd_new->rnd_weight = 0;
843 
844 	return (0);
845 }
846 
847 /*
848  * Creates new multipath group based on existing group/nhop in @rnd_orig and
849  *  to-be-merged nhgrp @wn_add.
850  * Returns 0 on success and stores result in @rnd_new.
851  */
852 int
nhgrp_get_merge_group(struct rib_head * rh,struct route_nhop_data * rnd_orig,struct route_nhop_data * rnd_add,struct route_nhop_data * rnd_new)853 nhgrp_get_merge_group(struct rib_head *rh, struct route_nhop_data *rnd_orig,
854     struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_new)
855 {
856 	struct nh_control *ctl = rh->nh_control;
857 	struct nhgrp_priv *nhg_priv;
858 	struct weightened_nhop wn = {};
859 	int error;
860 
861 	MPASS((NH_IS_NHGRP(rnd_add->rnd_nhop)));
862 
863 	/* No paths to add to, Just give up */
864 	if (rnd_orig->rnd_nhop == NULL)
865 		return (EINVAL);
866 
867 	if (!NH_IS_NHGRP(rnd_orig->rnd_nhop)) {
868 		wn.nh = rnd_orig->rnd_nhop;
869 		wn.weight = rnd_orig->rnd_weight;
870 		/* Get new nhop group with addition of nhops in nhgrp */
871 		nhg_priv = append_nhops(ctl, rnd_add->rnd_nhgrp, &wn, 1,
872 		    &error);
873 	} else {
874 		/* Get new nhop group with addition of nhops in nhgrp */
875 		nhg_priv = merge_nhgrps(ctl, rnd_orig->rnd_nhgrp, rnd_add->rnd_nhgrp,
876 		    &error);
877 	}
878 
879 	if (nhg_priv == NULL)
880 		return (error);
881 	rnd_new->rnd_nhgrp = nhg_priv->nhg;
882 	rnd_new->rnd_weight = 0;
883 
884 	return (0);
885 }
886 
887 /*
888  * Returns pointer to array of nexthops with weights for
889  * given @nhg. Stores number of items in the array into @pnum_nhops.
890  */
891 const struct weightened_nhop *
nhgrp_get_nhops(const struct nhgrp_object * nhg,uint32_t * pnum_nhops)892 nhgrp_get_nhops(const struct nhgrp_object *nhg, uint32_t *pnum_nhops)
893 {
894 	const struct nhgrp_priv *nhg_priv;
895 
896 	KASSERT(((nhg->nhg_flags & MPF_MULTIPATH) != 0), ("nhop is not mpath"));
897 
898 	nhg_priv = NHGRP_PRIV_CONST(nhg);
899 	*pnum_nhops = nhg_priv->nhg_nh_count;
900 
901 	return (nhg_priv->nhg_nh_weights);
902 }
903 
904 void
nhgrp_set_uidx(struct nhgrp_object * nhg,uint32_t uidx)905 nhgrp_set_uidx(struct nhgrp_object *nhg, uint32_t uidx)
906 {
907 	struct nhgrp_priv *nhg_priv;
908 
909 	KASSERT(((nhg->nhg_flags & MPF_MULTIPATH) != 0), ("nhop is not mpath"));
910 
911 	nhg_priv = NHGRP_PRIV(nhg);
912 
913 	nhg_priv->nhg_uidx = uidx;
914 }
915 
916 uint32_t
nhgrp_get_uidx(const struct nhgrp_object * nhg)917 nhgrp_get_uidx(const struct nhgrp_object *nhg)
918 {
919 	const struct nhgrp_priv *nhg_priv;
920 
921 	KASSERT(((nhg->nhg_flags & MPF_MULTIPATH) != 0), ("nhop is not mpath"));
922 
923 	nhg_priv = NHGRP_PRIV_CONST(nhg);
924 	return (nhg_priv->nhg_uidx);
925 }
926 
927 /*
928  * Prints nexhop group @nhg data in the provided @buf.
929  * Example: nhg#33/sz=3:[#1:100,#2:100,#3:100]
930  * Example: nhg#33/sz=5:[#1:100,#2:100,..]
931  */
932 char *
nhgrp_print_buf(const struct nhgrp_object * nhg,char * buf,size_t bufsize)933 nhgrp_print_buf(const struct nhgrp_object *nhg, char *buf, size_t bufsize)
934 {
935 	const struct nhgrp_priv *nhg_priv = NHGRP_PRIV_CONST(nhg);
936 
937 	int off = snprintf(buf, bufsize, "nhg#%u/sz=%u:[", nhg_priv->nhg_idx,
938 	    nhg_priv->nhg_nh_count);
939 
940 	for (int i = 0; i < nhg_priv->nhg_nh_count; i++) {
941 		const struct weightened_nhop *wn = &nhg_priv->nhg_nh_weights[i];
942 		int len = snprintf(&buf[off], bufsize - off, "#%u:%u,",
943 		    wn->nh->nh_priv->nh_idx, wn->weight);
944 		if (len + off + 3 >= bufsize) {
945 			int len = snprintf(&buf[off], bufsize - off, "...");
946 			off += len;
947 			break;
948 		}
949 		off += len;
950 	}
951 	if (off > 0)
952 		off--; // remove last ","
953 	if (off + 1 < bufsize)
954 		snprintf(&buf[off], bufsize - off, "]");
955 	return buf;
956 }
957 
958 __noinline static int
dump_nhgrp_entry(struct rib_head * rh,const struct nhgrp_priv * nhg_priv,char * buffer,size_t buffer_size,struct sysctl_req * w)959 dump_nhgrp_entry(struct rib_head *rh, const struct nhgrp_priv *nhg_priv,
960     char *buffer, size_t buffer_size, struct sysctl_req *w)
961 {
962 	struct rt_msghdr *rtm;
963 	struct nhgrp_external *nhge;
964 	struct nhgrp_container *nhgc;
965 	const struct nhgrp_object *nhg;
966 	struct nhgrp_nhop_external *ext;
967 	int error;
968 	size_t sz;
969 
970 	nhg = nhg_priv->nhg;
971 
972 	sz = sizeof(struct rt_msghdr) + sizeof(struct nhgrp_external);
973 	/* controlplane nexthops */
974 	sz += sizeof(struct nhgrp_container);
975 	sz += sizeof(struct nhgrp_nhop_external) * nhg_priv->nhg_nh_count;
976 	/* dataplane nexthops */
977 	sz += sizeof(struct nhgrp_container);
978 	sz += sizeof(struct nhgrp_nhop_external) * nhg->nhg_size;
979 
980 	KASSERT(sz <= buffer_size, ("increase nhgrp buffer size"));
981 
982 	bzero(buffer, sz);
983 
984 	rtm = (struct rt_msghdr *)buffer;
985 	rtm->rtm_msglen = sz;
986 	rtm->rtm_version = RTM_VERSION;
987 	rtm->rtm_type = RTM_GET;
988 
989 	nhge = (struct nhgrp_external *)(rtm + 1);
990 
991 	nhge->nhg_idx = nhg_priv->nhg_idx;
992 	nhge->nhg_refcount = nhg_priv->nhg_refcount;
993 
994 	/* fill in control plane nexthops firs */
995 	nhgc = (struct nhgrp_container *)(nhge + 1);
996 	nhgc->nhgc_type = NHG_C_TYPE_CNHOPS;
997 	nhgc->nhgc_subtype = 0;
998 	nhgc->nhgc_len = sizeof(struct nhgrp_container);
999 	nhgc->nhgc_len += sizeof(struct nhgrp_nhop_external) * nhg_priv->nhg_nh_count;
1000 	nhgc->nhgc_count = nhg_priv->nhg_nh_count;
1001 
1002 	ext = (struct nhgrp_nhop_external *)(nhgc + 1);
1003 	for (int i = 0; i < nhg_priv->nhg_nh_count; i++) {
1004 		ext[i].nh_idx = nhg_priv->nhg_nh_weights[i].nh->nh_priv->nh_idx;
1005 		ext[i].nh_weight = nhg_priv->nhg_nh_weights[i].weight;
1006 	}
1007 
1008 	/* fill in dataplane nexthops */
1009 	nhgc = (struct nhgrp_container *)(&ext[nhg_priv->nhg_nh_count]);
1010 	nhgc->nhgc_type = NHG_C_TYPE_DNHOPS;
1011 	nhgc->nhgc_subtype = 0;
1012 	nhgc->nhgc_len = sizeof(struct nhgrp_container);
1013 	nhgc->nhgc_len += sizeof(struct nhgrp_nhop_external) * nhg->nhg_size;
1014 	nhgc->nhgc_count = nhg->nhg_size;
1015 
1016 	ext = (struct nhgrp_nhop_external *)(nhgc + 1);
1017 	for (int i = 0; i < nhg->nhg_size; i++) {
1018 		ext[i].nh_idx = nhg->nhops[i]->nh_priv->nh_idx;
1019 		ext[i].nh_weight = 0;
1020 	}
1021 
1022 	error = SYSCTL_OUT(w, buffer, sz);
1023 
1024 	return (error);
1025 }
1026 
1027 uint32_t
nhgrp_get_idx(const struct nhgrp_object * nhg)1028 nhgrp_get_idx(const struct nhgrp_object *nhg)
1029 {
1030 	const struct nhgrp_priv *nhg_priv;
1031 
1032 	nhg_priv = NHGRP_PRIV_CONST(nhg);
1033 	return (nhg_priv->nhg_idx);
1034 }
1035 
1036 uint8_t
nhgrp_get_origin(const struct nhgrp_object * nhg)1037 nhgrp_get_origin(const struct nhgrp_object *nhg)
1038 {
1039 	return (NHGRP_PRIV_CONST(nhg)->nhg_origin);
1040 }
1041 
1042 void
nhgrp_set_origin(struct nhgrp_object * nhg,uint8_t origin)1043 nhgrp_set_origin(struct nhgrp_object *nhg, uint8_t origin)
1044 {
1045 	NHGRP_PRIV(nhg)->nhg_origin = origin;
1046 }
1047 
1048 uint32_t
nhgrp_get_count(struct rib_head * rh)1049 nhgrp_get_count(struct rib_head *rh)
1050 {
1051 	struct nh_control *ctl;
1052 	uint32_t count;
1053 
1054 	ctl = rh->nh_control;
1055 
1056 	NHOPS_RLOCK(ctl);
1057 	count = ctl->gr_head.items_count;
1058 	NHOPS_RUNLOCK(ctl);
1059 
1060 	return (count);
1061 }
1062 
1063 int
nhgrp_dump_sysctl(struct rib_head * rh,struct sysctl_req * w)1064 nhgrp_dump_sysctl(struct rib_head *rh, struct sysctl_req *w)
1065 {
1066 	struct nh_control *ctl = rh->nh_control;
1067 	struct epoch_tracker et;
1068 	struct nhgrp_priv *nhg_priv;
1069 	char *buffer;
1070 	size_t sz;
1071 	int error = 0;
1072 
1073 	if (ctl->gr_head.items_count == 0)
1074 		return (0);
1075 
1076 	/* Calculate the maximum nhop group size in bytes */
1077 	sz = sizeof(struct rt_msghdr) + sizeof(struct nhgrp_external);
1078 	sz += 2 * sizeof(struct nhgrp_container);
1079 	sz += 2 * sizeof(struct nhgrp_nhop_external) * RIB_MAX_MPATH_WIDTH;
1080 	buffer = malloc(sz, M_TEMP, M_NOWAIT);
1081 	if (buffer == NULL)
1082 		return (ENOMEM);
1083 
1084 	NET_EPOCH_ENTER(et);
1085 	NHOPS_RLOCK(ctl);
1086 	CHT_SLIST_FOREACH(&ctl->gr_head, mpath, nhg_priv) {
1087 		error = dump_nhgrp_entry(rh, nhg_priv, buffer, sz, w);
1088 		if (error != 0)
1089 			break;
1090 	} CHT_SLIST_FOREACH_END;
1091 	NHOPS_RUNLOCK(ctl);
1092 	NET_EPOCH_EXIT(et);
1093 
1094 	free(buffer, M_TEMP);
1095 
1096 	return (error);
1097 }
1098