1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 */
27
28 #include <sys/cdefs.h>
29 #include "opt_inet.h"
30 #include "opt_inet6.h"
31 #include <sys/types.h>
32 #include <sys/ck.h>
33 #include <sys/epoch.h>
34 #include <sys/kernel.h>
35 #include <sys/malloc.h>
36 #include <sys/rmlock.h>
37 #include <sys/socket.h>
38
39 #include <net/if.h>
40 #include <net/route.h>
41 #include <net/route/nhop.h>
42 #include <net/route/nhop_utils.h>
43
44 #include <net/route/route_ctl.h>
45 #include <net/route/route_var.h>
46 #include <netinet6/scope6_var.h>
47 #include <netlink/netlink.h>
48 #include <netlink/netlink_ctl.h>
49 #include <netlink/netlink_route.h>
50 #include <netlink/route/route_var.h>
51
52 #define DEBUG_MOD_NAME nl_nhop
53 #define DEBUG_MAX_LEVEL LOG_DEBUG3
54 #include <netlink/netlink_debug.h>
55 _DECLARE_DEBUG(LOG_INFO);
56
/*
 * This file contains the logic to maintain kernel nexthops and
 * nexthop groups based on the data provided by the user.
 *
 * Kernel stores (nearly) all of the routing data in the nexthops,
 * including the prefix-specific flags (NHF_HOST and NHF_DEFAULT).
 *
 * Netlink API provides a higher-level abstraction for the user. Each
 * user-created nexthop may map to multiple kernel nexthops.
 *
 * The following variations require a separate kernel nexthop to be
 * created:
 *  * prefix flags (NHF_HOST, NHF_DEFAULT)
 *  * using IPv6 gateway for IPv4 routes
 *  * different fibnum
 *
 * These kernel nexthops have their lifetime bound to the lifetime of
 * the user_nhop object. They are not collected until the user requests
 * deletion of the created user_nhop.
 *
 */
/*
 * Userland-created nexthop, keyed by (un_idx, un_fibfam).
 * The "master" record (un_fibfam == 0) owns the user-supplied source data;
 * per-fib/family kernel clones are linked off it via un_nextchild.
 */
struct user_nhop {
	uint32_t		un_idx;		/* Userland-provided index */
	uint32_t		un_fibfam;	/* fibnum+af(as highest byte) */
	uint8_t			un_protocol;	/* protocol that install the record */
	struct nhop_object	*un_nhop;	/* "production" nexthop */
	struct nhop_object	*un_nhop_src;	/* nexthop to copy from */
	struct weightened_nhop	*un_nhgrp_src;	/* nexthops for nhg */
	uint32_t		un_nhgrp_count;	/* number of nexthops */
	struct user_nhop	*un_next;	/* next item in hash chain */
	struct user_nhop	*un_nextchild;	/* master -> children */
	struct epoch_context	un_epoch_ctx;	/* epoch ctl helper */
};
90
/* produce hash value for an object */
#define	unhop_hash_obj(_obj)	(hash_unhop(_obj))
/* compare two objects */
#define	unhop_cmp(_one, _two)	(cmp_unhop(_one, _two))
/* next object accessor */
#define	unhop_next(_obj)	(_obj)->un_next

/* Instantiate the hash-chained list type and helpers for struct user_nhop. */
CHT_SLIST_DEFINE(unhop, struct user_nhop);

/* Per-vnet container: user nexthop hash plus the rmlock protecting it. */
struct unhop_ctl {
	struct unhop_head	un_head;
	struct rmlock		un_lock;
};
#define	UN_LOCK_INIT(_ctl)	rm_init(&(_ctl)->un_lock, "unhop_ctl")
#define	UN_TRACKER		struct rm_priotracker un_tracker
#define	UN_RLOCK(_ctl)		rm_rlock(&((_ctl)->un_lock), &un_tracker)
#define	UN_RUNLOCK(_ctl)	rm_runlock(&((_ctl)->un_lock), &un_tracker)

/* NOTE(review): trailing ';' in these two yields ";;" at call sites — harmless but untidy */
#define	UN_WLOCK(_ctl)		rm_wlock(&(_ctl)->un_lock);
#define	UN_WUNLOCK(_ctl)	rm_wunlock(&(_ctl)->un_lock);

/* Lazily initialized on the first RTM_NEWNEXTHOP; NULL until then. */
VNET_DEFINE_STATIC(struct unhop_ctl *, un_ctl) = NULL;
#define	V_un_ctl	VNET(un_ctl)
114
115 static void consider_resize(struct unhop_ctl *ctl, uint32_t new_size);
116 static int cmp_unhop(const struct user_nhop *a, const struct user_nhop *b);
117 static unsigned int hash_unhop(const struct user_nhop *obj);
118
119 static void destroy_unhop(struct user_nhop *unhop);
120 static struct nhop_object *clone_unhop(const struct user_nhop *unhop,
121 uint32_t fibnum, int family, int nh_flags);
122
123 static int
cmp_unhop(const struct user_nhop * a,const struct user_nhop * b)124 cmp_unhop(const struct user_nhop *a, const struct user_nhop *b)
125 {
126 return (a->un_idx == b->un_idx && a->un_fibfam == b->un_fibfam);
127 }
128
129 /*
130 * Hash callback: calculate hash of an object
131 */
132 static unsigned int
hash_unhop(const struct user_nhop * obj)133 hash_unhop(const struct user_nhop *obj)
134 {
135 return (obj->un_idx ^ obj->un_fibfam);
136 }
137
/* Master records (clone templates) carry no fib/family binding. */
#define UNHOP_IS_MASTER(_unhop) ((_unhop)->un_fibfam == 0)
139
/*
 * Factory interface for creating matching kernel nexthops/nexthop groups
 *
 * @uidx: userland nexthop index used to create the nexthop
 * @fibnum: fibnum nexthop will be used in
 * @family: upper family nexthop will be used in
 * @nh_flags: desired nexthop prefix flags
 * @perror: pointer to store error to
 *
 * Returns referenced nexthop linked to @fibnum/@family rib on success.
 */
151 struct nhop_object *
nl_find_nhop(uint32_t fibnum,int family,uint32_t uidx,int nh_flags,int * perror)152 nl_find_nhop(uint32_t fibnum, int family, uint32_t uidx,
153 int nh_flags, int *perror)
154 {
155 struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
156 UN_TRACKER;
157
158 if (__predict_false(ctl == NULL))
159 return (NULL);
160
161 struct user_nhop key= {
162 .un_idx = uidx,
163 .un_fibfam = fibnum | ((uint32_t)family) << 24,
164 };
165 struct user_nhop *unhop;
166
167 nh_flags = nh_flags & (NHF_HOST | NHF_DEFAULT);
168
169 if (__predict_false(family == 0))
170 return (NULL);
171
172 UN_RLOCK(ctl);
173 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
174 if (unhop != NULL) {
175 struct nhop_object *nh = unhop->un_nhop;
176 UN_RLOCK(ctl);
177 *perror = 0;
178 nhop_ref_any(nh);
179 return (nh);
180 }
181
182 /*
183 * Exact nexthop not found. Search for template nexthop to clone from.
184 */
185 key.un_fibfam = 0;
186 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
187 if (unhop == NULL) {
188 UN_RUNLOCK(ctl);
189 *perror = ESRCH;
190 return (NULL);
191 }
192
193 UN_RUNLOCK(ctl);
194
195 /* Create entry to insert first */
196 struct user_nhop *un_new, *un_tmp;
197 un_new = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO);
198 if (un_new == NULL) {
199 *perror = ENOMEM;
200 return (NULL);
201 }
202 un_new->un_idx = uidx;
203 un_new->un_fibfam = fibnum | ((uint32_t)family) << 24;
204
205 /* Relying on epoch to protect unhop here */
206 un_new->un_nhop = clone_unhop(unhop, fibnum, family, nh_flags);
207 if (un_new->un_nhop == NULL) {
208 free(un_new, M_NETLINK);
209 *perror = ENOMEM;
210 return (NULL);
211 }
212
213 /* Insert back and report */
214 UN_WLOCK(ctl);
215
216 /* First, find template record once again */
217 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
218 if (unhop == NULL) {
219 /* Someone deleted the nexthop during the call */
220 UN_WUNLOCK(ctl);
221 *perror = ESRCH;
222 destroy_unhop(un_new);
223 return (NULL);
224 }
225
226 /* Second, check the direct match */
227 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, un_new, un_tmp);
228 struct nhop_object *nh;
229 if (un_tmp != NULL) {
230 /* Another thread already created the desired nextop, use it */
231 nh = un_tmp->un_nhop;
232 } else {
233 /* Finally, insert the new nexthop and link it to the primary */
234 nh = un_new->un_nhop;
235 CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, un_new);
236 un_new->un_nextchild = unhop->un_nextchild;
237 unhop->un_nextchild = un_new;
238 un_new = NULL;
239 NL_LOG(LOG_DEBUG2, "linked cloned nexthop %p", nh);
240 }
241
242 UN_WUNLOCK(ctl);
243
244 if (un_new != NULL)
245 destroy_unhop(un_new);
246
247 *perror = 0;
248 nhop_ref_any(nh);
249 return (nh);
250 }
251
252 static struct user_nhop *
nl_find_base_unhop(struct unhop_ctl * ctl,uint32_t uidx)253 nl_find_base_unhop(struct unhop_ctl *ctl, uint32_t uidx)
254 {
255 struct user_nhop key= { .un_idx = uidx };
256 struct user_nhop *unhop = NULL;
257 UN_TRACKER;
258
259 UN_RLOCK(ctl);
260 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
261 UN_RUNLOCK(ctl);
262
263 return (unhop);
264 }
265
266 #define MAX_STACK_NHOPS 4
267 static struct nhop_object *
clone_unhop(const struct user_nhop * unhop,uint32_t fibnum,int family,int nh_flags)268 clone_unhop(const struct user_nhop *unhop, uint32_t fibnum, int family, int nh_flags)
269 {
270 const struct weightened_nhop *wn;
271 struct weightened_nhop *wn_new, wn_base[MAX_STACK_NHOPS];
272 struct nhop_object *nh = NULL;
273 uint32_t num_nhops;
274 int error;
275
276 if (unhop->un_nhop_src != NULL) {
277 IF_DEBUG_LEVEL(LOG_DEBUG2) {
278 char nhbuf[NHOP_PRINT_BUFSIZE];
279 nhop_print_buf_any(unhop->un_nhop_src, nhbuf, sizeof(nhbuf));
280 FIB_NH_LOG(LOG_DEBUG2, unhop->un_nhop_src,
281 "cloning nhop %s -> %u.%u flags 0x%X", nhbuf, fibnum,
282 family, nh_flags);
283 }
284 struct nhop_object *nh;
285 nh = nhop_alloc(fibnum, AF_UNSPEC);
286 if (nh == NULL)
287 return (NULL);
288 nhop_copy(nh, unhop->un_nhop_src);
289 /* Check that nexthop gateway is compatible with the new family */
290 if (!nhop_set_upper_family(nh, family)) {
291 nhop_free(nh);
292 return (NULL);
293 }
294 nhop_set_uidx(nh, unhop->un_idx);
295 nhop_set_pxtype_flag(nh, nh_flags);
296 return (nhop_get_nhop(nh, &error));
297 }
298
299 wn = unhop->un_nhgrp_src;
300 num_nhops = unhop->un_nhgrp_count;
301 if (num_nhops > MAX_STACK_NHOPS) {
302 wn_new = malloc(num_nhops * sizeof(struct weightened_nhop), M_TEMP, M_NOWAIT);
303 if (wn_new == NULL)
304 return (NULL);
305 } else
306 wn_new = wn_base;
307
308 for (int i = 0; i < num_nhops; i++) {
309 uint32_t uidx = nhop_get_uidx(wn[i].nh);
310 MPASS(uidx != 0);
311 wn_new[i].nh = nl_find_nhop(fibnum, family, uidx, nh_flags, &error);
312 if (error != 0)
313 break;
314 wn_new[i].weight = wn[i].weight;
315 }
316
317 if (error == 0) {
318 struct rib_head *rh = nhop_get_rh(wn_new[0].nh);
319 struct nhgrp_object *nhg;
320
321 error = nhgrp_get_group(rh, wn_new, num_nhops, unhop->un_idx, &nhg);
322 nh = (struct nhop_object *)nhg;
323 }
324
325 if (wn_new != wn_base)
326 free(wn_new, M_TEMP);
327
328 return (nh);
329 }
330
331 static void
destroy_unhop(struct user_nhop * unhop)332 destroy_unhop(struct user_nhop *unhop)
333 {
334 if (unhop->un_nhop != NULL)
335 nhop_free_any(unhop->un_nhop);
336 if (unhop->un_nhop_src != NULL)
337 nhop_free_any(unhop->un_nhop_src);
338 free(unhop, M_NETLINK);
339 }
340
341 static void
destroy_unhop_epoch(epoch_context_t ctx)342 destroy_unhop_epoch(epoch_context_t ctx)
343 {
344 struct user_nhop *unhop;
345
346 unhop = __containerof(ctx, struct user_nhop, un_epoch_ctx);
347
348 destroy_unhop(unhop);
349 }
350
351 static uint32_t
find_spare_uidx(struct unhop_ctl * ctl)352 find_spare_uidx(struct unhop_ctl *ctl)
353 {
354 struct user_nhop *unhop, key = {};
355 uint32_t uidx = 0;
356 UN_TRACKER;
357
358 UN_RLOCK(ctl);
359 /* This should return spare uid with 75% of 65k used in ~99/100 cases */
360 for (int i = 0; i < 16; i++) {
361 key.un_idx = (arc4random() % 65536) + 65536 * 4;
362 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
363 if (unhop == NULL) {
364 uidx = key.un_idx;
365 break;
366 }
367 }
368 UN_RUNLOCK(ctl);
369
370 return (uidx);
371 }
372
373
/*
 * Actual netlink code
 */

/* Per-request dump state shared by the RTM_*NEXTHOP handlers. */
struct netlink_walkargs {
	struct nl_writer *nw;	/* message writer to emit replies into */
	struct nlmsghdr hdr;	/* template header copied into each reply */
	struct nlpcb *so;	/* requesting netlink socket */
	int family;
	int error;
	int count;
	int dumped;
};
/* Jump to the local "enomem" label when a writer allocation fails. */
#define ENOMEM_IF_NULL(_v) if ((_v) == NULL) goto enomem
387
388 static bool
dump_nhgrp(const struct user_nhop * unhop,struct nlmsghdr * hdr,struct nl_writer * nw)389 dump_nhgrp(const struct user_nhop *unhop, struct nlmsghdr *hdr,
390 struct nl_writer *nw)
391 {
392
393 if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg)))
394 goto enomem;
395
396 struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg);
397 nhm->nh_family = AF_UNSPEC;
398 nhm->nh_scope = 0;
399 nhm->nh_protocol = unhop->un_protocol;
400 nhm->nh_flags = 0;
401
402 nlattr_add_u32(nw, NHA_ID, unhop->un_idx);
403 nlattr_add_u16(nw, NHA_GROUP_TYPE, NEXTHOP_GRP_TYPE_MPATH);
404
405 struct weightened_nhop *wn = unhop->un_nhgrp_src;
406 uint32_t num_nhops = unhop->un_nhgrp_count;
407 /* TODO: a better API? */
408 int nla_len = sizeof(struct nlattr);
409 nla_len += NETLINK_ALIGN(num_nhops * sizeof(struct nexthop_grp));
410 struct nlattr *nla = nlmsg_reserve_data(nw, nla_len, struct nlattr);
411 if (nla == NULL)
412 goto enomem;
413 nla->nla_type = NHA_GROUP;
414 nla->nla_len = nla_len;
415 for (int i = 0; i < num_nhops; i++) {
416 struct nexthop_grp *grp = &((struct nexthop_grp *)(nla + 1))[i];
417 grp->id = nhop_get_uidx(wn[i].nh);
418 grp->weight = wn[i].weight;
419 grp->resvd1 = 0;
420 grp->resvd2 = 0;
421 }
422
423 if (nlmsg_end(nw))
424 return (true);
425 enomem:
426 NL_LOG(LOG_DEBUG, "error: unable to allocate attribute memory");
427 nlmsg_abort(nw);
428 return (false);
429 }
430
/*
 * Writes a single RTM_NEWNEXTHOP message describing kernel nexthop @nh.
 *
 * @uidx: userland index reported via NHA_ID; 0 when dumping raw kernel
 *        nexthops, in which case the kernel index/family/table are
 *        reported inside the NHA_FREEBSD nested attribute instead.
 * Returns true on success; on writer memory shortage the message is
 * aborted and false is returned.
 */
static bool
dump_nhop(const struct nhop_object *nh, uint32_t uidx, struct nlmsghdr *hdr,
    struct nl_writer *nw)
{
	if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg)))
		goto enomem;

	struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg);
	ENOMEM_IF_NULL(nhm);
	nhm->nh_family = nhop_get_neigh_family(nh);
	nhm->nh_scope = 0; // XXX: what's that?
	nhm->nh_protocol = nhop_get_origin(nh);
	nhm->nh_flags = 0;

	if (uidx != 0)
		nlattr_add_u32(nw, NHA_ID, uidx);
	/* Blackhole nexthops carry no gateway/interface information */
	if (nh->nh_flags & NHF_BLACKHOLE) {
		nlattr_add_flag(nw, NHA_BLACKHOLE);
		goto done;
	}
	nlattr_add_u32(nw, NHA_OIF, if_getindex(nh->nh_ifp));

	switch (nh->gw_sa.sa_family) {
#ifdef INET
	case AF_INET:
		nlattr_add(nw, NHA_GATEWAY, 4, &nh->gw4_sa.sin_addr);
		break;
#endif
#ifdef INET6
	case AF_INET6:
		{
			/* Strip the embedded scope id before exporting */
			struct in6_addr addr = nh->gw6_sa.sin6_addr;
			in6_clearscope(&addr);
			nlattr_add(nw, NHA_GATEWAY, 16, &addr);
			break;
		}
#endif
	}

	int off = nlattr_add_nested(nw, NHA_FREEBSD);
	if (off != 0) {
		nlattr_add_u32(nw, NHAF_AIF, if_getindex(nh->nh_aifp));

		/* Kernel-nexthop dump: identify the object by kernel index */
		if (uidx == 0) {
			nlattr_add_u32(nw, NHAF_KID, nhop_get_idx(nh));
			nlattr_add_u32(nw, NHAF_FAMILY, nhop_get_upper_family(nh));
			nlattr_add_u32(nw, NHAF_TABLE, nhop_get_fibnum(nh));
		}

		nlattr_set_len(nw, off);
	}

done:
	if (nlmsg_end(nw))
		return (true);
enomem:
	nlmsg_abort(nw);
	return (false);
}
490
491 static void
dump_unhop(const struct user_nhop * unhop,struct nlmsghdr * hdr,struct nl_writer * nw)492 dump_unhop(const struct user_nhop *unhop, struct nlmsghdr *hdr,
493 struct nl_writer *nw)
494 {
495 if (unhop->un_nhop_src != NULL)
496 dump_nhop(unhop->un_nhop_src, unhop->un_idx, hdr, nw);
497 else
498 dump_nhgrp(unhop, hdr, nw);
499 }
500
/*
 * Removes the user nexthop with userland index @uidx together with all of
 * its per-fib/family clones, announces the deletion to RTNLGRP_NEXTHOP
 * listeners and schedules epoch-deferred destruction of the records.
 * Returns 0 on success or ENOENT when @uidx is not present.
 */
static int
delete_unhop(struct unhop_ctl *ctl, struct nlmsghdr *hdr, uint32_t uidx)
{
	struct user_nhop *unhop_ret, *unhop_base, *unhop_chain;
	struct nl_writer nw;
	struct user_nhop key = { .un_idx = uidx };

	UN_WLOCK(ctl);

	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop_base);

	if (unhop_base != NULL) {
		CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_base, unhop_ret);
		IF_DEBUG_LEVEL(LOG_DEBUG2) {
			char nhbuf[NHOP_PRINT_BUFSIZE];
			nhop_print_buf_any(unhop_base->un_nhop, nhbuf, sizeof(nhbuf));
			FIB_NH_LOG(LOG_DEBUG3, unhop_base->un_nhop,
			    "removed base nhop %u: %s", uidx, nhbuf);
		}
		/* Unlink all child nexthops as well, keeping the chain intact */
		unhop_chain = unhop_base->un_nextchild;
		while (unhop_chain != NULL) {
			CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_chain,
			    unhop_ret);
			MPASS(unhop_chain == unhop_ret);
			IF_DEBUG_LEVEL(LOG_DEBUG3) {
				char nhbuf[NHOP_PRINT_BUFSIZE];
				nhop_print_buf_any(unhop_chain->un_nhop,
				    nhbuf, sizeof(nhbuf));
				FIB_NH_LOG(LOG_DEBUG3, unhop_chain->un_nhop,
				    "removed child nhop %u: %s", uidx, nhbuf);
			}
			unhop_chain = unhop_chain->un_nextchild;
		}
	}

	UN_WUNLOCK(ctl);

	if (unhop_base == NULL) {
		NL_LOG(LOG_DEBUG, "unable to find unhop %u", uidx);
		return (ENOENT);
	}

	/* Report nexthop deletion */
	struct netlink_walkargs wa = {
		.hdr.nlmsg_pid = hdr->nlmsg_pid,
		.hdr.nlmsg_seq = hdr->nlmsg_seq,
		.hdr.nlmsg_flags = hdr->nlmsg_flags,
		.hdr.nlmsg_type = NL_RTM_DELNEXTHOP,
	};

	if (!nl_writer_group(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP,
	    0, false)) {
		NL_LOG(LOG_DEBUG, "error allocating message writer");
		/* NOTE(review): records were unlinked above but are never
		 * destroyed on this path — looks like a leak; verify. */
		return (ENOMEM);
	}

	dump_unhop(unhop_base, &wa.hdr, &nw);
	nlmsg_flush(&nw);

	/* Defer freeing until all epoch readers have left the chain */
	while (unhop_base != NULL) {
		unhop_chain = unhop_base->un_nextchild;
		NET_EPOCH_CALL(destroy_unhop_epoch, &unhop_base->un_epoch_ctx);
		unhop_base = unhop_chain;
	}

	return (0);
}
569
570 static void
consider_resize(struct unhop_ctl * ctl,uint32_t new_size)571 consider_resize(struct unhop_ctl *ctl, uint32_t new_size)
572 {
573 void *new_ptr = NULL;
574 size_t alloc_size;
575
576 if (new_size == 0)
577 return;
578
579 if (new_size != 0) {
580 alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_size);
581 new_ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO);
582 if (new_ptr == NULL)
583 return;
584 }
585
586 NL_LOG(LOG_DEBUG, "resizing hash: %u -> %u", ctl->un_head.hash_size, new_size);
587 UN_WLOCK(ctl);
588 if (new_ptr != NULL) {
589 CHT_SLIST_RESIZE(&ctl->un_head, unhop, new_ptr, new_size);
590 }
591 UN_WUNLOCK(ctl);
592
593
594 if (new_ptr != NULL)
595 free(new_ptr, M_NETLINK);
596 }
597
598 static bool __noinline
vnet_init_unhops(void)599 vnet_init_unhops(void)
600 {
601 uint32_t num_buckets = 16;
602 size_t alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets);
603
604 struct unhop_ctl *ctl = malloc(sizeof(struct unhop_ctl), M_NETLINK,
605 M_NOWAIT | M_ZERO);
606 if (ctl == NULL)
607 return (false);
608
609 void *ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO);
610 if (ptr == NULL) {
611 free(ctl, M_NETLINK);
612 return (false);
613 }
614 CHT_SLIST_INIT(&ctl->un_head, ptr, num_buckets);
615 UN_LOCK_INIT(ctl);
616
617 if (!atomic_cmpset_ptr((uintptr_t *)&V_un_ctl, (uintptr_t)NULL, (uintptr_t)ctl)) {
618 free(ptr, M_NETLINK);
619 free(ctl, M_NETLINK);
620 }
621
622 if (atomic_load_ptr(&V_un_ctl) == NULL)
623 return (false);
624
625 NL_LOG(LOG_NOTICE, "UNHOPS init done");
626
627 return (true);
628 }
629
/*
 * VNET teardown: detaches the unhop table, waits out epoch readers and
 * destroys every stored user nexthop.
 */
static void
vnet_destroy_unhops(const void *unused __unused)
{
	struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
	struct user_nhop *unhop, *tmp;

	if (ctl == NULL)
		return;
	V_un_ctl = NULL;

	/* Wait till all unhop users finish their reads */
	NET_EPOCH_WAIT();

	UN_WLOCK(ctl);
	CHT_SLIST_FOREACH_SAFE(&ctl->un_head, unhop, unhop, tmp) {
		destroy_unhop(unhop);
	} CHT_SLIST_FOREACH_SAFE_END;
	UN_WUNLOCK(ctl);

	/* NOTE(review): un_lock is freed without rm_destroy() — verify */
	free(ctl->un_head.ptr, M_NETLINK);
	free(ctl, M_NETLINK);
}
VNET_SYSUNINIT(vnet_destroy_unhops, SI_SUB_PROTO_IF, SI_ORDER_ANY,
    vnet_destroy_unhops, NULL);
654
655 static int
nlattr_get_nhg(struct nlattr * nla,struct nl_pstate * npt,const void * arg,void * target)656 nlattr_get_nhg(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target)
657 {
658 int error = 0;
659
660 /* Verify attribute correctness */
661 struct nexthop_grp *grp = NLA_DATA(nla);
662 int data_len = NLA_DATA_LEN(nla);
663
664 int count = data_len / sizeof(*grp);
665 if (count == 0 || (count * sizeof(*grp) != data_len)) {
666 NL_LOG(LOG_DEBUG, "Invalid length for RTA_GROUP: %d", data_len);
667 return (EINVAL);
668 }
669
670 *((struct nlattr **)target) = nla;
671 return (error);
672 }
673
674 static void
set_scope6(struct sockaddr * sa,if_t ifp)675 set_scope6(struct sockaddr *sa, if_t ifp)
676 {
677 #ifdef INET6
678 if (sa != NULL && sa->sa_family == AF_INET6 && ifp != NULL) {
679 struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)sa;
680
681 if (IN6_IS_ADDR_LINKLOCAL(&sa6->sin6_addr))
682 in6_set_unicast_scopeid(&sa6->sin6_addr, if_getindex(ifp));
683 }
684 #endif
685 }
686
/* Parsed representation of an RTM_*NEXTHOP request. */
struct nl_parsed_nhop {
	uint32_t	nha_id;		/* userland nexthop index (NHA_ID) */
	uint8_t		nha_blackhole;	/* NHA_BLACKHOLE flag present */
	uint8_t		nha_groups;	/* NHA_GROUPS dump filter present */
	uint8_t		nhaf_knhops;	/* dump kernel nexthops instead */
	uint8_t		nhaf_family;	/* kernel dump: family filter */
	struct ifnet	*nha_oif;	/* transmit interface (NHA_OIF) */
	struct sockaddr	*nha_gw;	/* gateway address (NHA_GATEWAY) */
	struct nlattr	*nha_group;	/* raw NHA_GROUP attribute */
	uint8_t		nh_family;	/* nhmsg header family */
	uint8_t		nh_protocol;	/* originating protocol */
	uint32_t	nhaf_table;	/* kernel dump: fib filter */
	uint32_t	nhaf_kid;	/* kernel nexthop index to fetch */
	uint32_t	nhaf_aif;
};

#define	_IN(_field)	offsetof(struct nhmsg, _field)
#define	_OUT(_field)	offsetof(struct nl_parsed_nhop, _field)
/* Parsers for FreeBSD-specific attributes nested under NHA_FREEBSD. */
static struct nlattr_parser nla_p_nh_fbsd[] = {
	{ .type = NHAF_KNHOPS, .off = _OUT(nhaf_knhops), .cb = nlattr_get_flag },
	{ .type = NHAF_TABLE, .off = _OUT(nhaf_table), .cb = nlattr_get_uint32 },
	{ .type = NHAF_FAMILY, .off = _OUT(nhaf_family), .cb = nlattr_get_uint8 },
	{ .type = NHAF_KID, .off = _OUT(nhaf_kid), .cb = nlattr_get_uint32 },
	{ .type = NHAF_AIF, .off = _OUT(nhaf_aif), .cb = nlattr_get_uint32 },
};
NL_DECLARE_ATTR_PARSER(nh_fbsd_parser, nla_p_nh_fbsd);

/* Fixed nhmsg header fields copied into the parsed structure. */
static const struct nlfield_parser nlf_p_nh[] = {
	{ .off_in = _IN(nh_family), .off_out = _OUT(nh_family), .cb = nlf_get_u8 },
	{ .off_in = _IN(nh_protocol), .off_out = _OUT(nh_protocol), .cb = nlf_get_u8 },
};

/* Top-level RTM_*NEXTHOP attribute parsers. */
static const struct nlattr_parser nla_p_nh[] = {
	{ .type = NHA_ID, .off = _OUT(nha_id), .cb = nlattr_get_uint32 },
	{ .type = NHA_GROUP, .off = _OUT(nha_group), .cb = nlattr_get_nhg },
	{ .type = NHA_BLACKHOLE, .off = _OUT(nha_blackhole), .cb = nlattr_get_flag },
	{ .type = NHA_OIF, .off = _OUT(nha_oif), .cb = nlattr_get_ifp },
	{ .type = NHA_GATEWAY, .off = _OUT(nha_gw), .cb = nlattr_get_ip },
	{ .type = NHA_GROUPS, .off = _OUT(nha_groups), .cb = nlattr_get_flag },
	{ .type = NHA_FREEBSD, .arg = &nh_fbsd_parser, .cb = nlattr_get_nested },
};
#undef _IN
#undef _OUT
730
731 static bool
post_p_nh(void * _attrs,struct nl_pstate * npt)732 post_p_nh(void *_attrs, struct nl_pstate *npt)
733 {
734 struct nl_parsed_nhop *attrs = (struct nl_parsed_nhop *)_attrs;
735
736 set_scope6(attrs->nha_gw, attrs->nha_oif);
737 return (true);
738 }
739 NL_DECLARE_PARSER_EXT(nhmsg_parser, struct nhmsg, NULL, nlf_p_nh, nla_p_nh, post_p_nh);
740
741 static bool
eligible_nhg(const struct nhop_object * nh)742 eligible_nhg(const struct nhop_object *nh)
743 {
744 return (nh->nh_flags & NHF_GATEWAY);
745 }
746
747 static int
newnhg(struct unhop_ctl * ctl,struct nl_parsed_nhop * attrs,struct user_nhop * unhop)748 newnhg(struct unhop_ctl *ctl, struct nl_parsed_nhop *attrs, struct user_nhop *unhop)
749 {
750 struct nexthop_grp *grp = NLA_DATA(attrs->nha_group);
751 int count = NLA_DATA_LEN(attrs->nha_group) / sizeof(*grp);
752 struct weightened_nhop *wn;
753
754 wn = malloc(sizeof(*wn) * count, M_NETLINK, M_NOWAIT | M_ZERO);
755 if (wn == NULL)
756 return (ENOMEM);
757
758 for (int i = 0; i < count; i++) {
759 struct user_nhop *unhop;
760 unhop = nl_find_base_unhop(ctl, grp[i].id);
761 if (unhop == NULL) {
762 NL_LOG(LOG_DEBUG, "unable to find uidx %u", grp[i].id);
763 free(wn, M_NETLINK);
764 return (ESRCH);
765 } else if (unhop->un_nhop_src == NULL) {
766 NL_LOG(LOG_DEBUG, "uidx %u is a group, nested group unsupported",
767 grp[i].id);
768 free(wn, M_NETLINK);
769 return (ENOTSUP);
770 } else if (!eligible_nhg(unhop->un_nhop_src)) {
771 NL_LOG(LOG_DEBUG, "uidx %u nhop is not mpath-eligible",
772 grp[i].id);
773 free(wn, M_NETLINK);
774 return (ENOTSUP);
775 }
776 /*
777 * TODO: consider more rigid eligibility checks:
778 * restrict nexthops with the same gateway
779 */
780 wn[i].nh = unhop->un_nhop_src;
781 wn[i].weight = grp[i].weight;
782 }
783 unhop->un_nhgrp_src = wn;
784 unhop->un_nhgrp_count = count;
785 return (0);
786 }
787
788 /*
789 * Sets nexthop @nh gateway specified by @gw.
790 * If gateway is IPv6 link-local, alters @gw to include scopeid equal to
791 * @ifp ifindex.
792 * Returns 0 on success or errno.
793 */
794 int
nl_set_nexthop_gw(struct nhop_object * nh,struct sockaddr * gw,if_t ifp,struct nl_pstate * npt)795 nl_set_nexthop_gw(struct nhop_object *nh, struct sockaddr *gw, if_t ifp,
796 struct nl_pstate *npt)
797 {
798 #ifdef INET6
799 if (gw->sa_family == AF_INET6) {
800 struct sockaddr_in6 *gw6 = (struct sockaddr_in6 *)gw;
801 if (IN6_IS_ADDR_LINKLOCAL(&gw6->sin6_addr)) {
802 if (ifp == NULL) {
803 NLMSG_REPORT_ERR_MSG(npt, "interface not set");
804 return (EINVAL);
805 }
806 in6_set_unicast_scopeid(&gw6->sin6_addr, if_getindex(ifp));
807 }
808 }
809 #endif
810 nhop_set_gw(nh, gw, true);
811 return (0);
812 }
813
/*
 * Creates an unlinked kernel nexthop from parsed RTM_NEWNEXTHOP @attrs
 * and stores it in @unhop->un_nhop_src. Returns 0 or errno.
 */
static int
newnhop(struct nl_parsed_nhop *attrs, struct user_nhop *unhop, struct nl_pstate *npt)
{
	struct ifaddr *ifa = NULL;
	struct nhop_object *nh;
	int error;

	if (!attrs->nha_blackhole) {
		/* Non-blackhole nexthops require both a gateway and an interface */
		if (attrs->nha_gw == NULL) {
			NLMSG_REPORT_ERR_MSG(npt, "missing NHA_GATEWAY");
			return (EINVAL);
		}
		if (attrs->nha_oif == NULL) {
			NLMSG_REPORT_ERR_MSG(npt, "missing NHA_OIF");
			return (EINVAL);
		}
		/* NOTE(review): ifa is always NULL here, so this guard is redundant */
		if (ifa == NULL)
			ifa = ifaof_ifpforaddr(attrs->nha_gw, attrs->nha_oif);
		if (ifa == NULL) {
			NLMSG_REPORT_ERR_MSG(npt, "Unable to determine default source IP");
			return (EINVAL);
		}
	}

	/* Prefer the gateway's family; fall back to the nhmsg header family */
	int family = attrs->nha_gw != NULL ? attrs->nha_gw->sa_family : attrs->nh_family;

	nh = nhop_alloc(RT_DEFAULT_FIB, family);
	if (nh == NULL) {
		NL_LOG(LOG_DEBUG, "Unable to allocate nexthop");
		return (ENOMEM);
	}
	nhop_set_uidx(nh, attrs->nha_id);
	nhop_set_origin(nh, attrs->nh_protocol);

	if (attrs->nha_blackhole)
		nhop_set_blackhole(nh, NHF_BLACKHOLE);
	else {
		error = nl_set_nexthop_gw(nh, attrs->nha_gw, attrs->nha_oif, npt);
		if (error != 0) {
			nhop_free(nh);
			return (error);
		}
		nhop_set_transmit_ifp(nh, attrs->nha_oif);
		nhop_set_src(nh, ifa);
	}

	/* Finalize; presumably consumes nh on failure — TODO confirm */
	error = nhop_get_unlinked(nh);
	if (error != 0) {
		NL_LOG(LOG_DEBUG, "unable to finalize nexthop");
		return (error);
	}

	IF_DEBUG_LEVEL(LOG_DEBUG2) {
		char nhbuf[NHOP_PRINT_BUFSIZE];
		nhop_print_buf(nh, nhbuf, sizeof(nhbuf));
		NL_LOG(LOG_DEBUG2, "Adding unhop %u: %s", attrs->nha_id, nhbuf);
	}

	unhop->un_nhop_src = nh;
	return (0);
}
875
/*
 * RTM_NEWNEXTHOP handler: parses @hdr, creates a user nexthop or nexthop
 * group, inserts it into the per-vnet table and broadcasts the addition
 * to RTNLGRP_NEXTHOP listeners.
 */
static int
rtnl_handle_newnhop(struct nlmsghdr *hdr, struct nlpcb *nlp,
    struct nl_pstate *npt)
{
	struct nl_writer nw;
	struct user_nhop *unhop;
	int error;

	/* Lazily create the per-vnet table on first use */
	if ((__predict_false(V_un_ctl == NULL)) && (!vnet_init_unhops()))
		return (ENOMEM);
	struct unhop_ctl *ctl = V_un_ctl;

	struct nl_parsed_nhop attrs = {};
	error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs);
	if (error != 0)
		return (error);

	/*
	 * Get valid nha_id. Treat nha_id == 0 (auto-assignment) as a second-class
	 * citizen.
	 */
	if (attrs.nha_id == 0) {
		attrs.nha_id = find_spare_uidx(ctl);
		if (attrs.nha_id == 0) {
			NL_LOG(LOG_DEBUG, "Unable to get spare uidx");
			return (ENOSPC);
		}
	}

	NL_LOG(LOG_DEBUG, "IFINDEX %d", attrs.nha_oif ? if_getindex(attrs.nha_oif) : 0);

	unhop = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO);
	if (unhop == NULL) {
		NL_LOG(LOG_DEBUG, "Unable to allocate user_nhop");
		return (ENOMEM);
	}
	unhop->un_idx = attrs.nha_id;
	unhop->un_protocol = attrs.nh_protocol;

	/* Build either a group or a single nexthop from the request */
	if (attrs.nha_group)
		error = newnhg(ctl, &attrs, unhop);
	else
		error = newnhop(&attrs, unhop, npt);

	if (error != 0) {
		free(unhop, M_NETLINK);
		return (error);
	}

	UN_WLOCK(ctl);
	/* Check if uidx already exists */
	struct user_nhop *tmp = NULL;
	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, unhop, tmp);
	if (tmp != NULL) {
		UN_WUNLOCK(ctl);
		NL_LOG(LOG_DEBUG, "nhop idx %u already exists", attrs.nha_id);
		destroy_unhop(unhop);
		return (EEXIST);
	}
	CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, unhop);
	uint32_t num_buckets_new = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->un_head);
	UN_WUNLOCK(ctl);

	/* Report addition of the new nexthop */
	struct netlink_walkargs wa = {
		.hdr.nlmsg_pid = hdr->nlmsg_pid,
		.hdr.nlmsg_seq = hdr->nlmsg_seq,
		.hdr.nlmsg_flags = hdr->nlmsg_flags,
		.hdr.nlmsg_type = NL_RTM_NEWNEXTHOP,
	};

	if (!nl_writer_group(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP,
	    0, false)) {
		NL_LOG(LOG_DEBUG, "error allocating message writer");
		return (ENOMEM);
	}

	dump_unhop(unhop, &wa.hdr, &nw);
	nlmsg_flush(&nw);

	/* Grow the hash if the insertion pushed it past the resize threshold */
	consider_resize(ctl, num_buckets_new);

	return (0);
}
960
961 static int
rtnl_handle_delnhop(struct nlmsghdr * hdr,struct nlpcb * nlp,struct nl_pstate * npt)962 rtnl_handle_delnhop(struct nlmsghdr *hdr, struct nlpcb *nlp,
963 struct nl_pstate *npt)
964 {
965 struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
966 int error;
967
968 if (__predict_false(ctl == NULL))
969 return (ESRCH);
970
971 struct nl_parsed_nhop attrs = {};
972 error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs);
973 if (error != 0)
974 return (error);
975
976 if (attrs.nha_id == 0) {
977 NL_LOG(LOG_DEBUG, "NHA_ID not set");
978 return (EINVAL);
979 }
980
981 error = delete_unhop(ctl, hdr, attrs.nha_id);
982
983 return (error);
984 }
985
986 static bool
match_unhop(const struct nl_parsed_nhop * attrs,struct user_nhop * unhop)987 match_unhop(const struct nl_parsed_nhop *attrs, struct user_nhop *unhop)
988 {
989 if (attrs->nha_id != 0 && unhop->un_idx != attrs->nha_id)
990 return (false);
991 if (attrs->nha_groups != 0 && unhop->un_nhgrp_src == NULL)
992 return (false);
993 if (attrs->nha_oif != NULL &&
994 (unhop->un_nhop_src == NULL || unhop->un_nhop_src->nh_ifp != attrs->nha_oif))
995 return (false);
996
997 return (true);
998 }
999
/*
 * RTM_GETNEXTHOP handler. Four modes:
 *  - NHA_ID set: return the single matching user nexthop;
 *  - NHAF_KID set: return one kernel nexthop by kernel index;
 *  - NHAF_KNHOPS flag: dump all kernel nexthops of a fib/family;
 *  - otherwise: dump all master user nexthops passing the filters.
 */
static int
rtnl_handle_getnhop(struct nlmsghdr *hdr, struct nlpcb *nlp,
    struct nl_pstate *npt)
{
	struct user_nhop *unhop;
	UN_TRACKER;
	int error;

	struct nl_parsed_nhop attrs = {};
	error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs);
	if (error != 0)
		return (error);

	struct netlink_walkargs wa = {
		.nw = npt->nw,
		.hdr.nlmsg_pid = hdr->nlmsg_pid,
		.hdr.nlmsg_seq = hdr->nlmsg_seq,
		.hdr.nlmsg_flags = hdr->nlmsg_flags,
		.hdr.nlmsg_type = NL_RTM_NEWNEXTHOP,
	};

	if (attrs.nha_id != 0) {
		/* Single user-nexthop lookup by userland index */
		struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
		struct user_nhop key = { .un_idx = attrs.nha_id };

		if (__predict_false(ctl == NULL))
			return (ESRCH);

		NL_LOG(LOG_DEBUG2, "searching for uidx %u", attrs.nha_id);
		UN_RLOCK(ctl);
		CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
		UN_RUNLOCK(ctl);

		if (unhop == NULL)
			return (ESRCH);
		dump_unhop(unhop, &wa.hdr, wa.nw);
		return (0);
	} else if (attrs.nhaf_kid != 0) {
		/* Single kernel-nexthop lookup by kernel index */
		struct nhop_iter iter = {
			.fibnum = attrs.nhaf_table,
			.family = attrs.nhaf_family,
		};
		int error = ESRCH;

		NL_LOG(LOG_DEBUG2, "START table %u family %d", attrs.nhaf_table, attrs.nhaf_family);
		for (struct nhop_object *nh = nhops_iter_start(&iter); nh;
		    nh = nhops_iter_next(&iter)) {
			NL_LOG(LOG_DEBUG3, "get %u", nhop_get_idx(nh));
			if (nhop_get_idx(nh) == attrs.nhaf_kid) {
				dump_nhop(nh, 0, &wa.hdr, wa.nw);
				error = 0;
				break;
			}
		}
		nhops_iter_stop(&iter);
		return (error);
	} else if (attrs.nhaf_knhops) {
		/* Multipart dump of all kernel nexthops for fib/family */
		struct nhop_iter iter = {
			.fibnum = attrs.nhaf_table,
			.family = attrs.nhaf_family,
		};

		NL_LOG(LOG_DEBUG2, "DUMP table %u family %d", attrs.nhaf_table, attrs.nhaf_family);
		wa.hdr.nlmsg_flags |= NLM_F_MULTI;
		for (struct nhop_object *nh = nhops_iter_start(&iter); nh;
		    nh = nhops_iter_next(&iter)) {
			dump_nhop(nh, 0, &wa.hdr, wa.nw);
		}
		nhops_iter_stop(&iter);
	} else {
		/* Multipart dump of matching master user nexthops */
		struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);

		if (__predict_false(ctl == NULL))
			return (ESRCH);

		NL_LOG(LOG_DEBUG2, "DUMP unhops");
		UN_RLOCK(ctl);
		wa.hdr.nlmsg_flags |= NLM_F_MULTI;
		CHT_SLIST_FOREACH(&ctl->un_head, unhop, unhop) {
			if (UNHOP_IS_MASTER(unhop) && match_unhop(&attrs, unhop))
				dump_unhop(unhop, &wa.hdr, wa.nw);
		} CHT_SLIST_FOREACH_END;
		UN_RUNLOCK(ctl);
	}

	/* Terminate multipart dumps with NLMSG_DONE */
	if (wa.error == 0) {
		if (!nlmsg_end_dump(wa.nw, wa.error, &wa.hdr))
			return (ENOMEM);
	}
	return (0);
}
1091
/*
 * RTM_{NEW,DEL,GET}NEXTHOP dispatch table.
 * Mutating commands require PRIV_NET_ROUTE; GET is unprivileged.
 */
static const struct rtnl_cmd_handler cmd_handlers[] = {
	{
		.cmd = NL_RTM_NEWNEXTHOP,
		.name = "RTM_NEWNEXTHOP",
		.cb = &rtnl_handle_newnhop,
		.priv = PRIV_NET_ROUTE,
	},
	{
		.cmd = NL_RTM_DELNEXTHOP,
		.name = "RTM_DELNEXTHOP",
		.cb = &rtnl_handle_delnhop,
		.priv = PRIV_NET_ROUTE,
	},
	{
		.cmd = NL_RTM_GETNEXTHOP,
		.name = "RTM_GETNEXTHOP",
		.cb = &rtnl_handle_getnhop,
	}
};

/* All parsers defined by this file, sanity-checked at init time. */
static const struct nlhdr_parser *all_parsers[] = { &nhmsg_parser, &nh_fbsd_parser };
1113
/* Verifies parser definitions and registers the nexthop message handlers. */
void
rtnl_nexthops_init(void)
{
	NL_VERIFY_PARSERS(all_parsers);
	rtnl_register_messages(cmd_handlers, nitems(cmd_handlers));
}
1120