1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 */
27
28 #include <sys/cdefs.h>
29 #include "opt_inet.h"
30 #include "opt_inet6.h"
31 #include "opt_route.h"
32 #include <sys/types.h>
33 #include <sys/ck.h>
34 #include <sys/epoch.h>
35 #include <sys/kernel.h>
36 #include <sys/malloc.h>
37 #include <sys/rmlock.h>
38 #include <sys/socket.h>
39
40 #include <net/if.h>
41 #include <net/route.h>
42 #include <net/route/nhop.h>
43 #include <net/route/nhop_utils.h>
44
45 #include <net/route/route_ctl.h>
46 #include <net/route/route_var.h>
47 #include <netinet6/scope6_var.h>
48 #include <netlink/netlink.h>
49 #include <netlink/netlink_ctl.h>
50 #include <netlink/netlink_route.h>
51 #include <netlink/route/route_var.h>
52
53 #define DEBUG_MOD_NAME nl_nhop
54 #define DEBUG_MAX_LEVEL LOG_DEBUG3
55 #include <netlink/netlink_debug.h>
56 _DECLARE_DEBUG(LOG_INFO);
57
58 /*
 * This file contains the logic to maintain kernel nexthops and
 * nexthop groups based on the data provided by the user.
61 *
62 * Kernel stores (nearly) all of the routing data in the nexthops,
63 * including the prefix-specific flags (NHF_HOST and NHF_DEFAULT).
64 *
65 * Netlink API provides higher-level abstraction for the user. Each
66 * user-created nexthop may map to multiple kernel nexthops.
67 *
68 * The following variations require separate kernel nexthop to be
69 * created:
70 * * prefix flags (NHF_HOST, NHF_DEFAULT)
71 * * using IPv6 gateway for IPv4 routes
72 * * different fibnum
73 *
74 * These kernel nexthops have the lifetime bound to the lifetime of
75 * the user_nhop object. They are not collected until user requests
76 * to delete the created user_nhop.
77 *
78 */
79 struct user_nhop {
80 uint32_t un_idx; /* Userland-provided index */
81 uint32_t un_fibfam; /* fibnum+af(as highest byte) */
82 uint8_t un_protocol; /* protocol that install the record */
83 struct nhop_object *un_nhop; /* "production" nexthop */
84 struct nhop_object *un_nhop_src; /* nexthop to copy from */
85 struct weightened_nhop *un_nhgrp_src; /* nexthops for nhg */
86 uint32_t un_nhgrp_count; /* number of nexthops */
87 struct user_nhop *un_next; /* next item in hash chain */
88 struct user_nhop *un_nextchild; /* master -> children */
89 struct epoch_context un_epoch_ctx; /* epoch ctl helper */
90 };
91
92 /* produce hash value for an object */
93 #define unhop_hash_obj(_obj) (hash_unhop(_obj))
94 /* compare two objects */
95 #define unhop_cmp(_one, _two) (cmp_unhop(_one, _two))
96 /* next object accessor */
97 #define unhop_next(_obj) (_obj)->un_next
98
99 CHT_SLIST_DEFINE(unhop, struct user_nhop);
100
101 struct unhop_ctl {
102 struct unhop_head un_head;
103 struct rmlock un_lock;
104 };
105 #define UN_LOCK_INIT(_ctl) rm_init(&(_ctl)->un_lock, "unhop_ctl")
106 #define UN_TRACKER struct rm_priotracker un_tracker
107 #define UN_RLOCK(_ctl) rm_rlock(&((_ctl)->un_lock), &un_tracker)
108 #define UN_RUNLOCK(_ctl) rm_runlock(&((_ctl)->un_lock), &un_tracker)
109
110 #define UN_WLOCK(_ctl) rm_wlock(&(_ctl)->un_lock);
111 #define UN_WUNLOCK(_ctl) rm_wunlock(&(_ctl)->un_lock);
112
113 VNET_DEFINE_STATIC(struct unhop_ctl *, un_ctl) = NULL;
114 #define V_un_ctl VNET(un_ctl)
115
116 static void consider_resize(struct unhop_ctl *ctl, uint32_t new_size);
117 static int cmp_unhop(const struct user_nhop *a, const struct user_nhop *b);
118 static unsigned int hash_unhop(const struct user_nhop *obj);
119
120 static void destroy_unhop(struct user_nhop *unhop);
121 static struct nhop_object *clone_unhop(const struct user_nhop *unhop,
122 uint32_t fibnum, int family, int nh_flags);
123
124 static int
cmp_unhop(const struct user_nhop * a,const struct user_nhop * b)125 cmp_unhop(const struct user_nhop *a, const struct user_nhop *b)
126 {
127 return (a->un_idx == b->un_idx && a->un_fibfam == b->un_fibfam);
128 }
129
130 /*
131 * Hash callback: calculate hash of an object
132 */
133 static unsigned int
hash_unhop(const struct user_nhop * obj)134 hash_unhop(const struct user_nhop *obj)
135 {
136 return (obj->un_idx ^ obj->un_fibfam);
137 }
138
139 #define UNHOP_IS_MASTER(_unhop) ((_unhop)->un_fibfam == 0)
140
141 /*
142 * Factory interface for creating matching kernel nexthops/nexthop groups
143 *
144 * @uidx: userland nexhop index used to create the nexthop
145 * @fibnum: fibnum nexthop will be used in
146 * @family: upper family nexthop will be used in
147 * @nh_flags: desired nexthop prefix flags
148 * @perror: pointer to store error to
149 *
150 * Returns referenced nexthop linked to @fibnum/@family rib on success.
151 */
152 struct nhop_object *
nl_find_nhop(uint32_t fibnum,int family,uint32_t uidx,int nh_flags,int * perror)153 nl_find_nhop(uint32_t fibnum, int family, uint32_t uidx,
154 int nh_flags, int *perror)
155 {
156 struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
157 UN_TRACKER;
158
159 if (__predict_false(ctl == NULL))
160 return (NULL);
161
162 struct user_nhop key= {
163 .un_idx = uidx,
164 .un_fibfam = fibnum | ((uint32_t)family) << 24,
165 };
166 struct user_nhop *unhop;
167
168 nh_flags = nh_flags & (NHF_HOST | NHF_DEFAULT);
169
170 if (__predict_false(family == 0))
171 return (NULL);
172
173 UN_RLOCK(ctl);
174 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
175 if (unhop != NULL) {
176 struct nhop_object *nh = unhop->un_nhop;
177 UN_RLOCK(ctl);
178 *perror = 0;
179 nhop_ref_any(nh);
180 return (nh);
181 }
182
183 /*
184 * Exact nexthop not found. Search for template nexthop to clone from.
185 */
186 key.un_fibfam = 0;
187 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
188 if (unhop == NULL) {
189 UN_RUNLOCK(ctl);
190 *perror = ESRCH;
191 return (NULL);
192 }
193
194 UN_RUNLOCK(ctl);
195
196 /* Create entry to insert first */
197 struct user_nhop *un_new, *un_tmp;
198 un_new = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO);
199 if (un_new == NULL) {
200 *perror = ENOMEM;
201 return (NULL);
202 }
203 un_new->un_idx = uidx;
204 un_new->un_fibfam = fibnum | ((uint32_t)family) << 24;
205
206 /* Relying on epoch to protect unhop here */
207 un_new->un_nhop = clone_unhop(unhop, fibnum, family, nh_flags);
208 if (un_new->un_nhop == NULL) {
209 free(un_new, M_NETLINK);
210 *perror = ENOMEM;
211 return (NULL);
212 }
213
214 /* Insert back and report */
215 UN_WLOCK(ctl);
216
217 /* First, find template record once again */
218 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
219 if (unhop == NULL) {
220 /* Someone deleted the nexthop during the call */
221 UN_WUNLOCK(ctl);
222 *perror = ESRCH;
223 destroy_unhop(un_new);
224 return (NULL);
225 }
226
227 /* Second, check the direct match */
228 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, un_new, un_tmp);
229 struct nhop_object *nh;
230 if (un_tmp != NULL) {
231 /* Another thread already created the desired nextop, use it */
232 nh = un_tmp->un_nhop;
233 } else {
234 /* Finally, insert the new nexthop and link it to the primary */
235 nh = un_new->un_nhop;
236 CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, un_new);
237 un_new->un_nextchild = unhop->un_nextchild;
238 unhop->un_nextchild = un_new;
239 un_new = NULL;
240 NL_LOG(LOG_DEBUG2, "linked cloned nexthop %p", nh);
241 }
242
243 UN_WUNLOCK(ctl);
244
245 if (un_new != NULL)
246 destroy_unhop(un_new);
247
248 *perror = 0;
249 nhop_ref_any(nh);
250 return (nh);
251 }
252
253 static struct user_nhop *
nl_find_base_unhop(struct unhop_ctl * ctl,uint32_t uidx)254 nl_find_base_unhop(struct unhop_ctl *ctl, uint32_t uidx)
255 {
256 struct user_nhop key= { .un_idx = uidx };
257 struct user_nhop *unhop = NULL;
258 UN_TRACKER;
259
260 UN_RLOCK(ctl);
261 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
262 UN_RUNLOCK(ctl);
263
264 return (unhop);
265 }
266
267 #define MAX_STACK_NHOPS 4
268 static struct nhop_object *
clone_unhop(const struct user_nhop * unhop,uint32_t fibnum,int family,int nh_flags)269 clone_unhop(const struct user_nhop *unhop, uint32_t fibnum, int family, int nh_flags)
270 {
271 #ifdef ROUTE_MPATH
272 const struct weightened_nhop *wn;
273 struct weightened_nhop *wn_new, wn_base[MAX_STACK_NHOPS];
274 uint32_t num_nhops;
275 #endif
276 struct nhop_object *nh = NULL;
277 int error;
278
279 if (unhop->un_nhop_src != NULL) {
280 IF_DEBUG_LEVEL(LOG_DEBUG2) {
281 char nhbuf[NHOP_PRINT_BUFSIZE];
282 nhop_print_buf_any(unhop->un_nhop_src, nhbuf, sizeof(nhbuf));
283 FIB_NH_LOG(LOG_DEBUG2, unhop->un_nhop_src,
284 "cloning nhop %s -> %u.%u flags 0x%X", nhbuf, fibnum,
285 family, nh_flags);
286 }
287 struct nhop_object *nh;
288 nh = nhop_alloc(fibnum, AF_UNSPEC);
289 if (nh == NULL)
290 return (NULL);
291 nhop_copy(nh, unhop->un_nhop_src);
292 /* Check that nexthop gateway is compatible with the new family */
293 if (!nhop_set_upper_family(nh, family)) {
294 nhop_free(nh);
295 return (NULL);
296 }
297 nhop_set_uidx(nh, unhop->un_idx);
298 nhop_set_pxtype_flag(nh, nh_flags);
299 return (nhop_get_nhop(nh, &error));
300 }
301 #ifdef ROUTE_MPATH
302 wn = unhop->un_nhgrp_src;
303 num_nhops = unhop->un_nhgrp_count;
304
305 if (num_nhops > MAX_STACK_NHOPS) {
306 wn_new = malloc(num_nhops * sizeof(struct weightened_nhop), M_TEMP, M_NOWAIT);
307 if (wn_new == NULL)
308 return (NULL);
309 } else
310 wn_new = wn_base;
311
312 for (int i = 0; i < num_nhops; i++) {
313 uint32_t uidx = nhop_get_uidx(wn[i].nh);
314 MPASS(uidx != 0);
315 wn_new[i].nh = nl_find_nhop(fibnum, family, uidx, nh_flags, &error);
316 if (error != 0)
317 break;
318 wn_new[i].weight = wn[i].weight;
319 }
320
321 if (error == 0) {
322 struct rib_head *rh = nhop_get_rh(wn_new[0].nh);
323 struct nhgrp_object *nhg;
324
325 error = nhgrp_get_group(rh, wn_new, num_nhops, unhop->un_idx, &nhg);
326 nh = (struct nhop_object *)nhg;
327 }
328
329 if (wn_new != wn_base)
330 free(wn_new, M_TEMP);
331 #endif
332 return (nh);
333 }
334
335 static void
destroy_unhop(struct user_nhop * unhop)336 destroy_unhop(struct user_nhop *unhop)
337 {
338 if (unhop->un_nhop != NULL)
339 nhop_free_any(unhop->un_nhop);
340 if (unhop->un_nhop_src != NULL)
341 nhop_free_any(unhop->un_nhop_src);
342 free(unhop, M_NETLINK);
343 }
344
345 static void
destroy_unhop_epoch(epoch_context_t ctx)346 destroy_unhop_epoch(epoch_context_t ctx)
347 {
348 struct user_nhop *unhop;
349
350 unhop = __containerof(ctx, struct user_nhop, un_epoch_ctx);
351
352 destroy_unhop(unhop);
353 }
354
355 static uint32_t
find_spare_uidx(struct unhop_ctl * ctl)356 find_spare_uidx(struct unhop_ctl *ctl)
357 {
358 struct user_nhop *unhop, key = {};
359 uint32_t uidx = 0;
360 UN_TRACKER;
361
362 UN_RLOCK(ctl);
363 /* This should return spare uid with 75% of 65k used in ~99/100 cases */
364 for (int i = 0; i < 16; i++) {
365 key.un_idx = (arc4random() % 65536) + 65536 * 4;
366 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
367 if (unhop == NULL) {
368 uidx = key.un_idx;
369 break;
370 }
371 }
372 UN_RUNLOCK(ctl);
373
374 return (uidx);
375 }
376
377
378 /*
379 * Actual netlink code
380 */
381 struct netlink_walkargs {
382 struct nl_writer *nw;
383 struct nlmsghdr hdr;
384 struct nlpcb *so;
385 int family;
386 int error;
387 int count;
388 int dumped;
389 };
390 #define ENOMEM_IF_NULL(_v) if ((_v) == NULL) goto enomem
391
392 static bool
dump_nhgrp(const struct user_nhop * unhop,struct nlmsghdr * hdr,struct nl_writer * nw)393 dump_nhgrp(const struct user_nhop *unhop, struct nlmsghdr *hdr,
394 struct nl_writer *nw)
395 {
396
397 if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg)))
398 goto enomem;
399
400 struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg);
401 nhm->nh_family = AF_UNSPEC;
402 nhm->nh_scope = 0;
403 nhm->nh_protocol = unhop->un_protocol;
404 nhm->nh_flags = 0;
405
406 nlattr_add_u32(nw, NHA_ID, unhop->un_idx);
407 nlattr_add_u16(nw, NHA_GROUP_TYPE, NEXTHOP_GRP_TYPE_MPATH);
408
409 struct weightened_nhop *wn = unhop->un_nhgrp_src;
410 uint32_t num_nhops = unhop->un_nhgrp_count;
411 /* TODO: a better API? */
412 int nla_len = sizeof(struct nlattr);
413 nla_len += NETLINK_ALIGN(num_nhops * sizeof(struct nexthop_grp));
414 struct nlattr *nla = nlmsg_reserve_data(nw, nla_len, struct nlattr);
415 if (nla == NULL)
416 goto enomem;
417 nla->nla_type = NHA_GROUP;
418 nla->nla_len = nla_len;
419 for (int i = 0; i < num_nhops; i++) {
420 struct nexthop_grp *grp = &((struct nexthop_grp *)(nla + 1))[i];
421 grp->id = nhop_get_uidx(wn[i].nh);
422 grp->weight = wn[i].weight;
423 grp->resvd1 = 0;
424 grp->resvd2 = 0;
425 }
426
427 if (nlmsg_end(nw))
428 return (true);
429 enomem:
430 NL_LOG(LOG_DEBUG, "error: unable to allocate attribute memory");
431 nlmsg_abort(nw);
432 return (false);
433 }
434
435 static bool
dump_nhop(const struct nhop_object * nh,uint32_t uidx,struct nlmsghdr * hdr,struct nl_writer * nw)436 dump_nhop(const struct nhop_object *nh, uint32_t uidx, struct nlmsghdr *hdr,
437 struct nl_writer *nw)
438 {
439 if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg)))
440 goto enomem;
441
442 struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg);
443 ENOMEM_IF_NULL(nhm);
444 nhm->nh_family = nhop_get_neigh_family(nh);
445 nhm->nh_scope = 0; // XXX: what's that?
446 nhm->nh_protocol = nhop_get_origin(nh);
447 nhm->nh_flags = 0;
448
449 if (uidx != 0)
450 nlattr_add_u32(nw, NHA_ID, uidx);
451 if (nh->nh_flags & NHF_BLACKHOLE) {
452 nlattr_add_flag(nw, NHA_BLACKHOLE);
453 goto done;
454 }
455 nlattr_add_u32(nw, NHA_OIF, if_getindex(nh->nh_ifp));
456
457 switch (nh->gw_sa.sa_family) {
458 #ifdef INET
459 case AF_INET:
460 nlattr_add(nw, NHA_GATEWAY, 4, &nh->gw4_sa.sin_addr);
461 break;
462 #endif
463 #ifdef INET6
464 case AF_INET6:
465 {
466 struct in6_addr addr = nh->gw6_sa.sin6_addr;
467 in6_clearscope(&addr);
468 nlattr_add(nw, NHA_GATEWAY, 16, &addr);
469 break;
470 }
471 #endif
472 }
473
474 int off = nlattr_add_nested(nw, NHA_FREEBSD);
475 if (off != 0) {
476 nlattr_add_u32(nw, NHAF_AIF, if_getindex(nh->nh_aifp));
477
478 if (uidx == 0) {
479 nlattr_add_u32(nw, NHAF_KID, nhop_get_idx(nh));
480 nlattr_add_u32(nw, NHAF_FAMILY, nhop_get_upper_family(nh));
481 nlattr_add_u32(nw, NHAF_TABLE, nhop_get_fibnum(nh));
482 }
483
484 nlattr_set_len(nw, off);
485 }
486
487 done:
488 if (nlmsg_end(nw))
489 return (true);
490 enomem:
491 nlmsg_abort(nw);
492 return (false);
493 }
494
495 static void
dump_unhop(const struct user_nhop * unhop,struct nlmsghdr * hdr,struct nl_writer * nw)496 dump_unhop(const struct user_nhop *unhop, struct nlmsghdr *hdr,
497 struct nl_writer *nw)
498 {
499 if (unhop->un_nhop_src != NULL)
500 dump_nhop(unhop->un_nhop_src, unhop->un_idx, hdr, nw);
501 else
502 dump_nhgrp(unhop, hdr, nw);
503 }
504
505 static int
delete_unhop(struct unhop_ctl * ctl,struct nlmsghdr * hdr,uint32_t uidx)506 delete_unhop(struct unhop_ctl *ctl, struct nlmsghdr *hdr, uint32_t uidx)
507 {
508 struct user_nhop *unhop_ret, *unhop_base, *unhop_chain;
509
510 struct user_nhop key = { .un_idx = uidx };
511
512 UN_WLOCK(ctl);
513
514 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop_base);
515
516 if (unhop_base != NULL) {
517 CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_base, unhop_ret);
518 IF_DEBUG_LEVEL(LOG_DEBUG2) {
519 char nhbuf[NHOP_PRINT_BUFSIZE];
520 nhop_print_buf_any(unhop_base->un_nhop, nhbuf, sizeof(nhbuf));
521 FIB_NH_LOG(LOG_DEBUG3, unhop_base->un_nhop,
522 "removed base nhop %u: %s", uidx, nhbuf);
523 }
524 /* Unlink all child nexhops as well, keeping the chain intact */
525 unhop_chain = unhop_base->un_nextchild;
526 while (unhop_chain != NULL) {
527 CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_chain,
528 unhop_ret);
529 MPASS(unhop_chain == unhop_ret);
530 IF_DEBUG_LEVEL(LOG_DEBUG3) {
531 char nhbuf[NHOP_PRINT_BUFSIZE];
532 nhop_print_buf_any(unhop_chain->un_nhop,
533 nhbuf, sizeof(nhbuf));
534 FIB_NH_LOG(LOG_DEBUG3, unhop_chain->un_nhop,
535 "removed child nhop %u: %s", uidx, nhbuf);
536 }
537 unhop_chain = unhop_chain->un_nextchild;
538 }
539 }
540
541 UN_WUNLOCK(ctl);
542
543 if (unhop_base == NULL) {
544 NL_LOG(LOG_DEBUG, "unable to find unhop %u", uidx);
545 return (ENOENT);
546 }
547
548 /* Report nexthop deletion */
549 struct netlink_walkargs wa = {
550 .hdr.nlmsg_pid = hdr->nlmsg_pid,
551 .hdr.nlmsg_seq = hdr->nlmsg_seq,
552 .hdr.nlmsg_flags = hdr->nlmsg_flags,
553 .hdr.nlmsg_type = NL_RTM_DELNEXTHOP,
554 };
555
556 struct nl_writer nw = {};
557 if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP)) {
558 NL_LOG(LOG_DEBUG, "error allocating message writer");
559 return (ENOMEM);
560 }
561
562 dump_unhop(unhop_base, &wa.hdr, &nw);
563 nlmsg_flush(&nw);
564
565 while (unhop_base != NULL) {
566 unhop_chain = unhop_base->un_nextchild;
567 NET_EPOCH_CALL(destroy_unhop_epoch, &unhop_base->un_epoch_ctx);
568 unhop_base = unhop_chain;
569 }
570
571 return (0);
572 }
573
574 static void
consider_resize(struct unhop_ctl * ctl,uint32_t new_size)575 consider_resize(struct unhop_ctl *ctl, uint32_t new_size)
576 {
577 void *new_ptr = NULL;
578 size_t alloc_size;
579
580 if (new_size == 0)
581 return;
582
583 if (new_size != 0) {
584 alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_size);
585 new_ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO);
586 if (new_ptr == NULL)
587 return;
588 }
589
590 NL_LOG(LOG_DEBUG, "resizing hash: %u -> %u", ctl->un_head.hash_size, new_size);
591 UN_WLOCK(ctl);
592 if (new_ptr != NULL) {
593 CHT_SLIST_RESIZE(&ctl->un_head, unhop, new_ptr, new_size);
594 }
595 UN_WUNLOCK(ctl);
596
597
598 if (new_ptr != NULL)
599 free(new_ptr, M_NETLINK);
600 }
601
602 static bool __noinline
vnet_init_unhops(void)603 vnet_init_unhops(void)
604 {
605 uint32_t num_buckets = 16;
606 size_t alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets);
607
608 struct unhop_ctl *ctl = malloc(sizeof(struct unhop_ctl), M_NETLINK,
609 M_NOWAIT | M_ZERO);
610 if (ctl == NULL)
611 return (false);
612
613 void *ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO);
614 if (ptr == NULL) {
615 free(ctl, M_NETLINK);
616 return (false);
617 }
618 CHT_SLIST_INIT(&ctl->un_head, ptr, num_buckets);
619 UN_LOCK_INIT(ctl);
620
621 if (!atomic_cmpset_ptr((uintptr_t *)&V_un_ctl, (uintptr_t)NULL, (uintptr_t)ctl)) {
622 free(ptr, M_NETLINK);
623 free(ctl, M_NETLINK);
624 }
625
626 if (atomic_load_ptr(&V_un_ctl) == NULL)
627 return (false);
628
629 NL_LOG(LOG_NOTICE, "UNHOPS init done");
630
631 return (true);
632 }
633
634 static void
vnet_destroy_unhops(const void * unused __unused)635 vnet_destroy_unhops(const void *unused __unused)
636 {
637 struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
638 struct user_nhop *unhop, *tmp;
639
640 if (ctl == NULL)
641 return;
642 V_un_ctl = NULL;
643
644 /* Wait till all unhop users finish their reads */
645 NET_EPOCH_WAIT();
646
647 UN_WLOCK(ctl);
648 CHT_SLIST_FOREACH_SAFE(&ctl->un_head, unhop, unhop, tmp) {
649 destroy_unhop(unhop);
650 } CHT_SLIST_FOREACH_SAFE_END;
651 UN_WUNLOCK(ctl);
652
653 free(ctl->un_head.ptr, M_NETLINK);
654 free(ctl, M_NETLINK);
655 }
656 VNET_SYSUNINIT(vnet_destroy_unhops, SI_SUB_PROTO_IF, SI_ORDER_ANY,
657 vnet_destroy_unhops, NULL);
658
659 static int
nlattr_get_nhg(struct nlattr * nla,struct nl_pstate * npt,const void * arg,void * target)660 nlattr_get_nhg(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target)
661 {
662 int error = 0;
663
664 /* Verify attribute correctness */
665 struct nexthop_grp *grp = NLA_DATA(nla);
666 int data_len = NLA_DATA_LEN(nla);
667
668 int count = data_len / sizeof(*grp);
669 if (count == 0 || (count * sizeof(*grp) != data_len)) {
670 NL_LOG(LOG_DEBUG, "Invalid length for RTA_GROUP: %d", data_len);
671 return (EINVAL);
672 }
673
674 *((struct nlattr **)target) = nla;
675 return (error);
676 }
677
678 static void
set_scope6(struct sockaddr * sa,if_t ifp)679 set_scope6(struct sockaddr *sa, if_t ifp)
680 {
681 #ifdef INET6
682 if (sa != NULL && sa->sa_family == AF_INET6 && ifp != NULL) {
683 struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)sa;
684
685 if (IN6_IS_ADDR_LINKLOCAL(&sa6->sin6_addr))
686 in6_set_unicast_scopeid(&sa6->sin6_addr, if_getindex(ifp));
687 }
688 #endif
689 }
690
/*
 * Parsed form of a nexthop request. The nha_* fields are filled from the
 * NHA_* attributes, the nhaf_* fields from the nested NHA_FREEBSD
 * attributes and the nh_* fields from the fixed struct nhmsg header.
 */
struct nl_parsed_nhop {
	uint32_t	nha_id;		/* NHA_ID */
	uint8_t		nha_blackhole;	/* NHA_BLACKHOLE flag */
	uint8_t		nha_groups;	/* NHA_GROUPS flag */
	uint8_t		nhaf_knhops;	/* NHAF_KNHOPS flag */
	uint8_t		nhaf_family;	/* NHAF_FAMILY */
	struct ifnet	*nha_oif;	/* NHA_OIF */
	struct sockaddr	*nha_gw;	/* NHA_GATEWAY */
	struct nlattr	*nha_group;	/* NHA_GROUP (raw attribute) */
	uint8_t		nh_family;	/* nhmsg.nh_family */
	uint8_t		nh_protocol;	/* nhmsg.nh_protocol */
	uint32_t	nhaf_table;	/* NHAF_TABLE */
	uint32_t	nhaf_kid;	/* NHAF_KID */
	uint32_t	nhaf_aif;	/* NHAF_AIF */
};
706
707 #define _IN(_field) offsetof(struct nhmsg, _field)
708 #define _OUT(_field) offsetof(struct nl_parsed_nhop, _field)
709 static struct nlattr_parser nla_p_nh_fbsd[] = {
710 { .type = NHAF_KNHOPS, .off = _OUT(nhaf_knhops), .cb = nlattr_get_flag },
711 { .type = NHAF_TABLE, .off = _OUT(nhaf_table), .cb = nlattr_get_uint32 },
712 { .type = NHAF_FAMILY, .off = _OUT(nhaf_family), .cb = nlattr_get_uint8 },
713 { .type = NHAF_KID, .off = _OUT(nhaf_kid), .cb = nlattr_get_uint32 },
714 { .type = NHAF_AIF, .off = _OUT(nhaf_aif), .cb = nlattr_get_uint32 },
715 };
716 NL_DECLARE_ATTR_PARSER(nh_fbsd_parser, nla_p_nh_fbsd);
717
718 static const struct nlfield_parser nlf_p_nh[] = {
719 { .off_in = _IN(nh_family), .off_out = _OUT(nh_family), .cb = nlf_get_u8 },
720 { .off_in = _IN(nh_protocol), .off_out = _OUT(nh_protocol), .cb = nlf_get_u8 },
721 };
722
723 static const struct nlattr_parser nla_p_nh[] = {
724 { .type = NHA_ID, .off = _OUT(nha_id), .cb = nlattr_get_uint32 },
725 { .type = NHA_GROUP, .off = _OUT(nha_group), .cb = nlattr_get_nhg },
726 { .type = NHA_BLACKHOLE, .off = _OUT(nha_blackhole), .cb = nlattr_get_flag },
727 { .type = NHA_OIF, .off = _OUT(nha_oif), .cb = nlattr_get_ifp },
728 { .type = NHA_GATEWAY, .off = _OUT(nha_gw), .cb = nlattr_get_ip },
729 { .type = NHA_GROUPS, .off = _OUT(nha_groups), .cb = nlattr_get_flag },
730 { .type = NHA_FREEBSD, .arg = &nh_fbsd_parser, .cb = nlattr_get_nested },
731 };
732 #undef _IN
733 #undef _OUT
734
735 static bool
post_p_nh(void * _attrs,struct nl_pstate * npt)736 post_p_nh(void *_attrs, struct nl_pstate *npt)
737 {
738 struct nl_parsed_nhop *attrs = (struct nl_parsed_nhop *)_attrs;
739
740 set_scope6(attrs->nha_gw, attrs->nha_oif);
741 return (true);
742 }
743 NL_DECLARE_PARSER_EXT(nhmsg_parser, struct nhmsg, NULL, nlf_p_nh, nla_p_nh, post_p_nh);
744
745 static bool
eligible_nhg(const struct nhop_object * nh)746 eligible_nhg(const struct nhop_object *nh)
747 {
748 return (nh->nh_flags & NHF_GATEWAY);
749 }
750
751 static int
newnhg(struct unhop_ctl * ctl,struct nl_parsed_nhop * attrs,struct user_nhop * unhop)752 newnhg(struct unhop_ctl *ctl, struct nl_parsed_nhop *attrs, struct user_nhop *unhop)
753 {
754 struct nexthop_grp *grp = NLA_DATA(attrs->nha_group);
755 int count = NLA_DATA_LEN(attrs->nha_group) / sizeof(*grp);
756 struct weightened_nhop *wn;
757
758 wn = malloc(sizeof(*wn) * count, M_NETLINK, M_NOWAIT | M_ZERO);
759 if (wn == NULL)
760 return (ENOMEM);
761
762 for (int i = 0; i < count; i++) {
763 struct user_nhop *unhop;
764 unhop = nl_find_base_unhop(ctl, grp[i].id);
765 if (unhop == NULL) {
766 NL_LOG(LOG_DEBUG, "unable to find uidx %u", grp[i].id);
767 free(wn, M_NETLINK);
768 return (ESRCH);
769 } else if (unhop->un_nhop_src == NULL) {
770 NL_LOG(LOG_DEBUG, "uidx %u is a group, nested group unsupported",
771 grp[i].id);
772 free(wn, M_NETLINK);
773 return (ENOTSUP);
774 } else if (!eligible_nhg(unhop->un_nhop_src)) {
775 NL_LOG(LOG_DEBUG, "uidx %u nhop is not mpath-eligible",
776 grp[i].id);
777 free(wn, M_NETLINK);
778 return (ENOTSUP);
779 }
780 /*
781 * TODO: consider more rigid eligibility checks:
782 * restrict nexthops with the same gateway
783 */
784 wn[i].nh = unhop->un_nhop_src;
785 wn[i].weight = grp[i].weight;
786 }
787 unhop->un_nhgrp_src = wn;
788 unhop->un_nhgrp_count = count;
789 return (0);
790 }
791
792 /*
793 * Sets nexthop @nh gateway specified by @gw.
794 * If gateway is IPv6 link-local, alters @gw to include scopeid equal to
795 * @ifp ifindex.
796 * Returns 0 on success or errno.
797 */
798 int
nl_set_nexthop_gw(struct nhop_object * nh,struct sockaddr * gw,if_t ifp,struct nl_pstate * npt)799 nl_set_nexthop_gw(struct nhop_object *nh, struct sockaddr *gw, if_t ifp,
800 struct nl_pstate *npt)
801 {
802 #ifdef INET6
803 if (gw->sa_family == AF_INET6) {
804 struct sockaddr_in6 *gw6 = (struct sockaddr_in6 *)gw;
805 if (IN6_IS_ADDR_LINKLOCAL(&gw6->sin6_addr)) {
806 if (ifp == NULL) {
807 NLMSG_REPORT_ERR_MSG(npt, "interface not set");
808 return (EINVAL);
809 }
810 in6_set_unicast_scopeid(&gw6->sin6_addr, if_getindex(ifp));
811 }
812 }
813 #endif
814 nhop_set_gw(nh, gw, true);
815 return (0);
816 }
817
818 static int
newnhop(struct nl_parsed_nhop * attrs,struct user_nhop * unhop,struct nl_pstate * npt)819 newnhop(struct nl_parsed_nhop *attrs, struct user_nhop *unhop, struct nl_pstate *npt)
820 {
821 struct ifaddr *ifa = NULL;
822 struct nhop_object *nh;
823 int error;
824
825 if (!attrs->nha_blackhole) {
826 if (attrs->nha_gw == NULL) {
827 NLMSG_REPORT_ERR_MSG(npt, "missing NHA_GATEWAY");
828 return (EINVAL);
829 }
830 if (attrs->nha_oif == NULL) {
831 NLMSG_REPORT_ERR_MSG(npt, "missing NHA_OIF");
832 return (EINVAL);
833 }
834 if (ifa == NULL)
835 ifa = ifaof_ifpforaddr(attrs->nha_gw, attrs->nha_oif);
836 if (ifa == NULL) {
837 NLMSG_REPORT_ERR_MSG(npt, "Unable to determine default source IP");
838 return (EINVAL);
839 }
840 }
841
842 int family = attrs->nha_gw != NULL ? attrs->nha_gw->sa_family : attrs->nh_family;
843
844 nh = nhop_alloc(RT_DEFAULT_FIB, family);
845 if (nh == NULL) {
846 NL_LOG(LOG_DEBUG, "Unable to allocate nexthop");
847 return (ENOMEM);
848 }
849 nhop_set_uidx(nh, attrs->nha_id);
850 nhop_set_origin(nh, attrs->nh_protocol);
851
852 if (attrs->nha_blackhole)
853 nhop_set_blackhole(nh, NHF_BLACKHOLE);
854 else {
855 error = nl_set_nexthop_gw(nh, attrs->nha_gw, attrs->nha_oif, npt);
856 if (error != 0) {
857 nhop_free(nh);
858 return (error);
859 }
860 nhop_set_transmit_ifp(nh, attrs->nha_oif);
861 nhop_set_src(nh, ifa);
862 }
863
864 error = nhop_get_unlinked(nh);
865 if (error != 0) {
866 NL_LOG(LOG_DEBUG, "unable to finalize nexthop");
867 return (error);
868 }
869
870 IF_DEBUG_LEVEL(LOG_DEBUG2) {
871 char nhbuf[NHOP_PRINT_BUFSIZE];
872 nhop_print_buf(nh, nhbuf, sizeof(nhbuf));
873 NL_LOG(LOG_DEBUG2, "Adding unhop %u: %s", attrs->nha_id, nhbuf);
874 }
875
876 unhop->un_nhop_src = nh;
877 return (0);
878 }
879
880 static int
rtnl_handle_newnhop(struct nlmsghdr * hdr,struct nlpcb * nlp,struct nl_pstate * npt)881 rtnl_handle_newnhop(struct nlmsghdr *hdr, struct nlpcb *nlp,
882 struct nl_pstate *npt)
883 {
884 struct user_nhop *unhop;
885 int error;
886
887 if ((__predict_false(V_un_ctl == NULL)) && (!vnet_init_unhops()))
888 return (ENOMEM);
889 struct unhop_ctl *ctl = V_un_ctl;
890
891 struct nl_parsed_nhop attrs = {};
892 error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs);
893 if (error != 0)
894 return (error);
895
896 /*
897 * Get valid nha_id. Treat nha_id == 0 (auto-assignment) as a second-class
898 * citizen.
899 */
900 if (attrs.nha_id == 0) {
901 attrs.nha_id = find_spare_uidx(ctl);
902 if (attrs.nha_id == 0) {
903 NL_LOG(LOG_DEBUG, "Unable to get spare uidx");
904 return (ENOSPC);
905 }
906 }
907
908 NL_LOG(LOG_DEBUG, "IFINDEX %d", attrs.nha_oif ? if_getindex(attrs.nha_oif) : 0);
909
910 unhop = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO);
911 if (unhop == NULL) {
912 NL_LOG(LOG_DEBUG, "Unable to allocate user_nhop");
913 return (ENOMEM);
914 }
915 unhop->un_idx = attrs.nha_id;
916 unhop->un_protocol = attrs.nh_protocol;
917
918 if (attrs.nha_group)
919 error = newnhg(ctl, &attrs, unhop);
920 else
921 error = newnhop(&attrs, unhop, npt);
922
923 if (error != 0) {
924 free(unhop, M_NETLINK);
925 return (error);
926 }
927
928 UN_WLOCK(ctl);
929 /* Check if uidx already exists */
930 struct user_nhop *tmp = NULL;
931 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, unhop, tmp);
932 if (tmp != NULL) {
933 UN_WUNLOCK(ctl);
934 NL_LOG(LOG_DEBUG, "nhop idx %u already exists", attrs.nha_id);
935 destroy_unhop(unhop);
936 return (EEXIST);
937 }
938 CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, unhop);
939 uint32_t num_buckets_new = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->un_head);
940 UN_WUNLOCK(ctl);
941
942 /* Report addition of the next nexhop */
943 struct netlink_walkargs wa = {
944 .hdr.nlmsg_pid = hdr->nlmsg_pid,
945 .hdr.nlmsg_seq = hdr->nlmsg_seq,
946 .hdr.nlmsg_flags = hdr->nlmsg_flags,
947 .hdr.nlmsg_type = NL_RTM_NEWNEXTHOP,
948 };
949
950 struct nl_writer nw = {};
951 if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP)) {
952 NL_LOG(LOG_DEBUG, "error allocating message writer");
953 return (ENOMEM);
954 }
955
956 dump_unhop(unhop, &wa.hdr, &nw);
957 nlmsg_flush(&nw);
958
959 consider_resize(ctl, num_buckets_new);
960
961 return (0);
962 }
963
964 static int
rtnl_handle_delnhop(struct nlmsghdr * hdr,struct nlpcb * nlp,struct nl_pstate * npt)965 rtnl_handle_delnhop(struct nlmsghdr *hdr, struct nlpcb *nlp,
966 struct nl_pstate *npt)
967 {
968 struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
969 int error;
970
971 if (__predict_false(ctl == NULL))
972 return (ESRCH);
973
974 struct nl_parsed_nhop attrs = {};
975 error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs);
976 if (error != 0)
977 return (error);
978
979 if (attrs.nha_id == 0) {
980 NL_LOG(LOG_DEBUG, "NHA_ID not set");
981 return (EINVAL);
982 }
983
984 error = delete_unhop(ctl, hdr, attrs.nha_id);
985
986 return (error);
987 }
988
989 static bool
match_unhop(const struct nl_parsed_nhop * attrs,struct user_nhop * unhop)990 match_unhop(const struct nl_parsed_nhop *attrs, struct user_nhop *unhop)
991 {
992 if (attrs->nha_id != 0 && unhop->un_idx != attrs->nha_id)
993 return (false);
994 if (attrs->nha_groups != 0 && unhop->un_nhgrp_src == NULL)
995 return (false);
996 if (attrs->nha_oif != NULL &&
997 (unhop->un_nhop_src == NULL || unhop->un_nhop_src->nh_ifp != attrs->nha_oif))
998 return (false);
999
1000 return (true);
1001 }
1002
1003 static int
rtnl_handle_getnhop(struct nlmsghdr * hdr,struct nlpcb * nlp,struct nl_pstate * npt)1004 rtnl_handle_getnhop(struct nlmsghdr *hdr, struct nlpcb *nlp,
1005 struct nl_pstate *npt)
1006 {
1007 struct user_nhop *unhop;
1008 UN_TRACKER;
1009 int error;
1010
1011 struct nl_parsed_nhop attrs = {};
1012 error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs);
1013 if (error != 0)
1014 return (error);
1015
1016 struct netlink_walkargs wa = {
1017 .nw = npt->nw,
1018 .hdr.nlmsg_pid = hdr->nlmsg_pid,
1019 .hdr.nlmsg_seq = hdr->nlmsg_seq,
1020 .hdr.nlmsg_flags = hdr->nlmsg_flags,
1021 .hdr.nlmsg_type = NL_RTM_NEWNEXTHOP,
1022 };
1023
1024 if (attrs.nha_id != 0) {
1025 struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
1026 struct user_nhop key = { .un_idx = attrs.nha_id };
1027
1028 if (__predict_false(ctl == NULL))
1029 return (ESRCH);
1030
1031 NL_LOG(LOG_DEBUG2, "searching for uidx %u", attrs.nha_id);
1032 UN_RLOCK(ctl);
1033 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
1034 UN_RUNLOCK(ctl);
1035
1036 if (unhop == NULL)
1037 return (ESRCH);
1038 dump_unhop(unhop, &wa.hdr, wa.nw);
1039 return (0);
1040 } else if (attrs.nhaf_kid != 0) {
1041 struct nhop_iter iter = {
1042 .fibnum = attrs.nhaf_table,
1043 .family = attrs.nhaf_family,
1044 };
1045 int error = ESRCH;
1046
1047 NL_LOG(LOG_DEBUG2, "START table %u family %d", attrs.nhaf_table, attrs.nhaf_family);
1048 for (struct nhop_object *nh = nhops_iter_start(&iter); nh;
1049 nh = nhops_iter_next(&iter)) {
1050 NL_LOG(LOG_DEBUG3, "get %u", nhop_get_idx(nh));
1051 if (nhop_get_idx(nh) == attrs.nhaf_kid) {
1052 dump_nhop(nh, 0, &wa.hdr, wa.nw);
1053 error = 0;
1054 break;
1055 }
1056 }
1057 nhops_iter_stop(&iter);
1058 return (error);
1059 } else if (attrs.nhaf_knhops) {
1060 struct nhop_iter iter = {
1061 .fibnum = attrs.nhaf_table,
1062 .family = attrs.nhaf_family,
1063 };
1064
1065 NL_LOG(LOG_DEBUG2, "DUMP table %u family %d", attrs.nhaf_table, attrs.nhaf_family);
1066 wa.hdr.nlmsg_flags |= NLM_F_MULTI;
1067 for (struct nhop_object *nh = nhops_iter_start(&iter); nh;
1068 nh = nhops_iter_next(&iter)) {
1069 dump_nhop(nh, 0, &wa.hdr, wa.nw);
1070 }
1071 nhops_iter_stop(&iter);
1072 } else {
1073 struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
1074
1075 if (__predict_false(ctl == NULL))
1076 return (ESRCH);
1077
1078 NL_LOG(LOG_DEBUG2, "DUMP unhops");
1079 UN_RLOCK(ctl);
1080 wa.hdr.nlmsg_flags |= NLM_F_MULTI;
1081 CHT_SLIST_FOREACH(&ctl->un_head, unhop, unhop) {
1082 if (UNHOP_IS_MASTER(unhop) && match_unhop(&attrs, unhop))
1083 dump_unhop(unhop, &wa.hdr, wa.nw);
1084 } CHT_SLIST_FOREACH_END;
1085 UN_RUNLOCK(ctl);
1086 }
1087
1088 if (wa.error == 0) {
1089 if (!nlmsg_end_dump(wa.nw, wa.error, &wa.hdr))
1090 return (ENOMEM);
1091 }
1092 return (0);
1093 }
1094
1095 static const struct rtnl_cmd_handler cmd_handlers[] = {
1096 {
1097 .cmd = NL_RTM_NEWNEXTHOP,
1098 .name = "RTM_NEWNEXTHOP",
1099 .cb = &rtnl_handle_newnhop,
1100 .priv = PRIV_NET_ROUTE,
1101 },
1102 {
1103 .cmd = NL_RTM_DELNEXTHOP,
1104 .name = "RTM_DELNEXTHOP",
1105 .cb = &rtnl_handle_delnhop,
1106 .priv = PRIV_NET_ROUTE,
1107 },
1108 {
1109 .cmd = NL_RTM_GETNEXTHOP,
1110 .name = "RTM_GETNEXTHOP",
1111 .cb = &rtnl_handle_getnhop,
1112 }
1113 };
1114
1115 static const struct nlhdr_parser *all_parsers[] = { &nhmsg_parser, &nh_fbsd_parser };
1116
1117 void
rtnl_nexthops_init(void)1118 rtnl_nexthops_init(void)
1119 {
1120 NL_VERIFY_PARSERS(all_parsers);
1121 rtnl_register_messages(cmd_handlers, NL_ARRAY_LEN(cmd_handlers));
1122 }
1123