xref: /freebsd/sys/netlink/route/rt.c (revision 61b95bcb42993b24633b280791438266d78f2747)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2021 Ng Peng Nam Sean
5  * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include "opt_netlink.h"
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 #include "opt_inet.h"
34 #include "opt_inet6.h"
35 #include "opt_route.h"
36 #include <sys/types.h>
37 #include <sys/malloc.h>
38 #include <sys/rmlock.h>
39 #include <sys/socket.h>
40 
41 #include <net/if.h>
42 #include <net/route.h>
43 #include <net/route/nhop.h>
44 #include <net/route/route_ctl.h>
45 #include <net/route/route_var.h>
46 #include <netinet6/scope6_var.h>
47 #include <netlink/netlink.h>
48 #include <netlink/netlink_ctl.h>
49 #include <netlink/netlink_route.h>
50 #include <netlink/route/route_var.h>
51 
52 #define	DEBUG_MOD_NAME	nl_route
53 #define	DEBUG_MAX_LEVEL	LOG_DEBUG3
54 #include <netlink/netlink_debug.h>
55 _DECLARE_DEBUG(LOG_DEBUG);
56 
57 static unsigned char
58 get_rtm_type(const struct nhop_object *nh)
59 {
60 	int nh_flags = nh->nh_flags;
61 
62 	/* Use the fact that nhg runtime flags are only NHF_MULTIPATH */
63 	if (nh_flags & NHF_BLACKHOLE)
64 		return (RTN_BLACKHOLE);
65 	else if (nh_flags & NHF_REJECT)
66 		return (RTN_PROHIBIT);
67 	return (RTN_UNICAST);
68 }
69 
70 static uint8_t
71 nl_get_rtm_protocol(const struct nhop_object *nh)
72 {
73 #ifdef ROUTE_MPATH
74 	if (NH_IS_NHGRP(nh)) {
75 		const struct nhgrp_object *nhg = (const struct nhgrp_object *)nh;
76 		uint8_t origin = nhgrp_get_origin(nhg);
77 		if (origin != RTPROT_UNSPEC)
78 			return (origin);
79 		nh = nhg->nhops[0];
80 	}
81 #endif
82 	uint8_t origin = nhop_get_origin(nh);
83 	if (origin != RTPROT_UNSPEC)
84 		return (origin);
85 	/* TODO: remove guesswork once all kernel users fill in origin */
86 	int rt_flags = nhop_get_rtflags(nh);
87 	if (rt_flags & RTF_PROTO1)
88 		return (RTPROT_ZEBRA);
89 	if (rt_flags & RTF_STATIC)
90 		return (RTPROT_STATIC);
91 	return (RTPROT_KERNEL);
92 }
93 
94 static int
95 get_rtmsg_type_from_rtsock(int cmd)
96 {
97 	switch (cmd) {
98 	case RTM_ADD:
99 	case RTM_CHANGE:
100 	case RTM_GET:
101 		return NL_RTM_NEWROUTE;
102 	case RTM_DELETE:
103 		return NL_RTM_DELROUTE;
104 	}
105 
106 	return (0);
107 }
108 
109 /*
110  * fibnum heuristics
111  *
112  * if (dump && rtm_table == 0 && !rta_table) RT_ALL_FIBS
113  * msg                rtm_table     RTA_TABLE            result
114  * RTM_GETROUTE/dump          0             -       RT_ALL_FIBS
115  * RTM_GETROUTE/dump          1             -                 1
116  * RTM_GETROUTE/get           0             -                 0
117  *
118  */
119 
120 static struct nhop_object *
121 rc_get_nhop(const struct rib_cmd_info *rc)
122 {
123 	return ((rc->rc_cmd == RTM_DELETE) ? rc->rc_nh_old : rc->rc_nh_new);
124 }
125 
126 static void
127 dump_rc_nhop_gw(struct nl_writer *nw, const struct nhop_object *nh)
128 {
129 #ifdef INET6
130 	int upper_family;
131 #endif
132 
133 	switch (nhop_get_neigh_family(nh)) {
134 	case AF_LINK:
135 		/* onlink prefix, skip */
136 		break;
137 	case AF_INET:
138 		nlattr_add(nw, NL_RTA_GATEWAY, 4, &nh->gw4_sa.sin_addr);
139 		break;
140 #ifdef INET6
141 	case AF_INET6:
142 		upper_family = nhop_get_upper_family(nh);
143 		if (upper_family == AF_INET6) {
144 			struct in6_addr gw6 = nh->gw6_sa.sin6_addr;
145 			in6_clearscope(&gw6);
146 
147 			nlattr_add(nw, NL_RTA_GATEWAY, 16, &gw6);
148 		} else if (upper_family == AF_INET) {
149 			/* IPv4 over IPv6 */
150 			struct in6_addr gw6 = nh->gw6_sa.sin6_addr;
151 			in6_clearscope(&gw6);
152 
153 			char buf[20];
154 			struct rtvia *via = (struct rtvia *)&buf[0];
155 			via->rtvia_family = AF_INET6;
156 			memcpy(via->rtvia_addr, &gw6, 16);
157 			nlattr_add(nw, NL_RTA_VIA, 17, via);
158 		}
159 		break;
160 #endif
161 	}
162 }
163 
164 static void
165 dump_rc_nhop_mtu(struct nl_writer *nw, const struct nhop_object *nh)
166 {
167 	int nla_len = sizeof(struct nlattr) * 2 + sizeof(uint32_t);
168 	struct nlattr *nla = nlmsg_reserve_data(nw, nla_len, struct nlattr);
169 
170 	if (nla == NULL)
171 		return;
172 	nla->nla_type = NL_RTA_METRICS;
173 	nla->nla_len = nla_len;
174 	nla++;
175 	nla->nla_type = NL_RTAX_MTU;
176 	nla->nla_len = sizeof(struct nlattr) + sizeof(uint32_t);
177 	*((uint32_t *)(nla + 1)) = nh->nh_mtu;
178 }
179 
180 #ifdef ROUTE_MPATH
181 static void
182 dump_rc_nhg(struct nl_writer *nw, const struct nhgrp_object *nhg, struct rtmsg *rtm)
183 {
184 	uint32_t uidx = nhgrp_get_uidx(nhg);
185 	uint32_t num_nhops;
186 	const struct weightened_nhop *wn = nhgrp_get_nhops(nhg, &num_nhops);
187 	uint32_t base_rtflags = nhop_get_rtflags(wn[0].nh);
188 
189 	if (uidx != 0)
190 		nlattr_add_u32(nw, NL_RTA_NH_ID, uidx);
191 	nlattr_add_u32(nw, NL_RTA_KNH_ID, nhgrp_get_idx(nhg));
192 
193 	nlattr_add_u32(nw, NL_RTA_RTFLAGS, base_rtflags);
194 	int off = nlattr_add_nested(nw, NL_RTA_MULTIPATH);
195 	if (off == 0)
196 		return;
197 
198 	for (int i = 0; i < num_nhops; i++) {
199 		int nh_off = nlattr_save_offset(nw);
200 		struct rtnexthop *rtnh = nlmsg_reserve_object(nw, struct rtnexthop);
201 		if (rtnh == NULL)
202 			return;
203 		rtnh->rtnh_flags = 0;
204 		rtnh->rtnh_ifindex = wn[i].nh->nh_ifp->if_index;
205 		rtnh->rtnh_hops = wn[i].weight;
206 		dump_rc_nhop_gw(nw, wn[i].nh);
207 		uint32_t rtflags = nhop_get_rtflags(wn[i].nh);
208 		if (rtflags != base_rtflags)
209 			nlattr_add_u32(nw, NL_RTA_RTFLAGS, rtflags);
210 		if (rtflags & RTF_FIXEDMTU)
211 			dump_rc_nhop_mtu(nw, wn[i].nh);
212 		rtnh = nlattr_restore_offset(nw, nh_off, struct rtnexthop);
213 		/*
214 		 * nlattr_add() allocates 4-byte aligned storage, no need to aligh
215 		 * length here
216 		 * */
217 		rtnh->rtnh_len = nlattr_save_offset(nw) - nh_off;
218 	}
219 	nlattr_set_len(nw, off);
220 }
221 #endif
222 
223 static void
224 dump_rc_nhop(struct nl_writer *nw, const struct route_nhop_data *rnd, struct rtmsg *rtm)
225 {
226 #ifdef ROUTE_MPATH
227 	if (NH_IS_NHGRP(rnd->rnd_nhop)) {
228 		dump_rc_nhg(nw, rnd->rnd_nhgrp, rtm);
229 		return;
230 	}
231 #endif
232 	const struct nhop_object *nh = rnd->rnd_nhop;
233 	uint32_t rtflags = nhop_get_rtflags(nh);
234 
235 	/*
236 	 * IPv4 over IPv6
237 	 *    ('RTA_VIA', {'family': 10, 'addr': 'fe80::20c:29ff:fe67:2dd'}), ('RTA_OIF', 2),
238 	 * IPv4 w/ gw
239 	 *    ('RTA_GATEWAY', '172.16.107.131'), ('RTA_OIF', 2)],
240 	 * Direct route:
241 	 *    ('RTA_OIF', 2)
242 	 */
243 	if (nh->nh_flags & NHF_GATEWAY)
244 		dump_rc_nhop_gw(nw, nh);
245 
246 	uint32_t uidx = nhop_get_uidx(nh);
247 	if (uidx != 0)
248 		nlattr_add_u32(nw, NL_RTA_NH_ID, uidx);
249 	nlattr_add_u32(nw, NL_RTA_KNH_ID, nhop_get_idx(nh));
250 	nlattr_add_u32(nw, NL_RTA_RTFLAGS, rtflags);
251 
252 	if (rtflags & RTF_FIXEDMTU)
253 		dump_rc_nhop_mtu(nw, nh);
254 	uint32_t nh_expire = nhop_get_expire(nh);
255 	if (nh_expire > 0)
256 		nlattr_add_u32(nw, NL_RTA_EXPIRES, nh_expire - time_uptime);
257 
258 	/* In any case, fill outgoing interface */
259 	nlattr_add_u32(nw, NL_RTA_OIF, nh->nh_ifp->if_index);
260 
261 	if (rnd->rnd_weight != RT_DEFAULT_WEIGHT)
262 		nlattr_add_u32(nw, NL_RTA_WEIGHT, rnd->rnd_weight);
263 }
264 
265 /*
266  * Dumps output from a rib command into an rtmsg
267  */
268 
269 static int
270 dump_px(uint32_t fibnum, const struct nlmsghdr *hdr,
271     const struct rtentry *rt, struct route_nhop_data *rnd,
272     struct nl_writer *nw)
273 {
274 	struct rtmsg *rtm;
275 	int error = 0;
276 
277 	NET_EPOCH_ASSERT();
278 
279 	if (!nlmsg_reply(nw, hdr, sizeof(struct rtmsg)))
280 		goto enomem;
281 
282 	int family = rt_get_family(rt);
283 	int rtm_off = nlattr_save_offset(nw);
284 	rtm = nlmsg_reserve_object(nw, struct rtmsg);
285 	rtm->rtm_family = family;
286 	rtm->rtm_dst_len = 0;
287 	rtm->rtm_src_len = 0;
288 	rtm->rtm_tos = 0;
289 	if (fibnum < 255)
290 		rtm->rtm_table = (unsigned char)fibnum;
291 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
292 	rtm->rtm_protocol = nl_get_rtm_protocol(rnd->rnd_nhop);
293 	rtm->rtm_type = get_rtm_type(rnd->rnd_nhop);
294 
295 	nlattr_add_u32(nw, NL_RTA_TABLE, fibnum);
296 
297 	int plen = 0;
298 #if defined(INET) || defined(INET6)
299 	uint32_t scopeid;
300 #endif
301 	switch (family) {
302 #ifdef INET
303 	case AF_INET:
304 		{
305 			struct in_addr addr;
306 			rt_get_inet_prefix_plen(rt, &addr, &plen, &scopeid);
307 			nlattr_add(nw, NL_RTA_DST, 4, &addr);
308 			break;
309 		}
310 #endif
311 #ifdef INET6
312 	case AF_INET6:
313 		{
314 			struct in6_addr addr;
315 			rt_get_inet6_prefix_plen(rt, &addr, &plen, &scopeid);
316 			nlattr_add(nw, NL_RTA_DST, 16, &addr);
317 			break;
318 		}
319 #endif
320 	default:
321 		FIB_LOG(LOG_NOTICE, fibnum, family, "unsupported rt family: %d", family);
322 		error = EAFNOSUPPORT;
323 		goto flush;
324 	}
325 
326 	rtm = nlattr_restore_offset(nw, rtm_off, struct rtmsg);
327 	if (plen > 0)
328 		rtm->rtm_dst_len = plen;
329 	dump_rc_nhop(nw, rnd, rtm);
330 
331 	if (nlmsg_end(nw))
332 		return (0);
333 enomem:
334 	error = ENOMEM;
335 flush:
336 	nlmsg_abort(nw);
337 	return (error);
338 }
339 
340 static int
341 family_to_group(int family)
342 {
343 	switch (family) {
344 	case AF_INET:
345 		return (RTNLGRP_IPV4_ROUTE);
346 	case AF_INET6:
347 		return (RTNLGRP_IPV6_ROUTE);
348 	}
349 	return (0);
350 }
351 
352 
353 static void
354 report_operation(uint32_t fibnum, struct rib_cmd_info *rc,
355     struct nlpcb *nlp, struct nlmsghdr *hdr)
356 {
357 	struct nl_writer nw = {};
358 	uint32_t group_id = family_to_group(rt_get_family(rc->rc_rt));
359 
360 	if (nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, group_id)) {
361 		struct route_nhop_data rnd = {
362 			.rnd_nhop = rc_get_nhop(rc),
363 			.rnd_weight = rc->rc_nh_weight,
364 		};
365 		hdr->nlmsg_flags &= ~(NLM_F_REPLACE | NLM_F_CREATE);
366 		hdr->nlmsg_flags &= ~(NLM_F_EXCL | NLM_F_APPEND);
367 		switch (rc->rc_cmd) {
368 		case RTM_ADD:
369 			hdr->nlmsg_type = NL_RTM_NEWROUTE;
370 			hdr->nlmsg_flags |= NLM_F_CREATE | NLM_F_EXCL;
371 			break;
372 		case RTM_CHANGE:
373 			hdr->nlmsg_type = NL_RTM_NEWROUTE;
374 			hdr->nlmsg_flags |= NLM_F_REPLACE;
375 			break;
376 		case RTM_DELETE:
377 			hdr->nlmsg_type = NL_RTM_DELROUTE;
378 			break;
379 		}
380 		dump_px(fibnum, hdr, rc->rc_rt, &rnd, &nw);
381 		nlmsg_flush(&nw);
382 	}
383 
384 	rtsock_callback_p->route_f(fibnum, rc);
385 }
386 
387 struct rta_mpath_nh {
388 	struct sockaddr	*gw;
389 	struct ifnet	*ifp;
390 	uint8_t		rtnh_flags;
391 	uint8_t		rtnh_weight;
392 };
393 
394 #define	_IN(_field)	offsetof(struct rtnexthop, _field)
395 #define	_OUT(_field)	offsetof(struct rta_mpath_nh, _field)
396 const static struct nlattr_parser nla_p_rtnh[] = {
397 	{ .type = NL_RTA_GATEWAY, .off = _OUT(gw), .cb = nlattr_get_ip },
398 	{ .type = NL_RTA_VIA, .off = _OUT(gw), .cb = nlattr_get_ipvia },
399 };
400 const static struct nlfield_parser nlf_p_rtnh[] = {
401 	{ .off_in = _IN(rtnh_flags), .off_out = _OUT(rtnh_flags), .cb = nlf_get_u8 },
402 	{ .off_in = _IN(rtnh_hops), .off_out = _OUT(rtnh_weight), .cb = nlf_get_u8 },
403 	{ .off_in = _IN(rtnh_ifindex), .off_out = _OUT(ifp), .cb = nlf_get_ifpz },
404 };
405 #undef _IN
406 #undef _OUT
407 NL_DECLARE_PARSER(mpath_parser, struct rtnexthop, nlf_p_rtnh, nla_p_rtnh);
408 
/*
 * Embeds the interface scope of @ifp into @sa when @sa is an IPv6
 * link-local address. Does nothing for NULL arguments, other address
 * families, or kernels built without INET6.
 */
static void
set_scope6(struct sockaddr *sa, struct ifnet *ifp)
{
#ifdef INET6
	struct sockaddr_in6 *sin6;

	if (sa == NULL || sa->sa_family != AF_INET6 || ifp == NULL)
		return;

	sin6 = (struct sockaddr_in6 *)sa;
	if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr))
		in6_set_unicast_scopeid(&sin6->sin6_addr, ifp->if_index);
#endif
}
421 
422 static void
423 post_p_mpath(struct rta_mpath_nh *mpnh)
424 {
425 	set_scope6(mpnh->gw, mpnh->ifp);
426 }
427 
428 struct rta_mpath {
429 	int num_nhops;
430 	struct rta_mpath_nh nhops[0];
431 };
432 
433 static int
434 nlattr_get_multipath(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target)
435 {
436 	int data_len = nla->nla_len - sizeof(struct nlattr);
437 	struct rtnexthop *rtnh;
438 
439 	int max_nhops = data_len / sizeof(struct rtnexthop);
440 
441 	struct rta_mpath *mp = npt_alloc(npt, (max_nhops + 2) * sizeof(struct rta_mpath_nh));
442 	mp->num_nhops = 0;
443 
444 	for (rtnh = (struct rtnexthop *)(nla + 1); data_len > 0; ) {
445 		struct rta_mpath_nh *mpnh = &mp->nhops[mp->num_nhops++];
446 
447 		int error = nl_parse_header(rtnh, rtnh->rtnh_len, &mpath_parser,
448 		    npt, mpnh);
449 		if (error != 0) {
450 			NLMSG_REPORT_ERR_MSG(npt, "RTA_MULTIPATH: nexhop %d: parse failed",
451 			    mp->num_nhops - 1);
452 			return (error);
453 		}
454 		post_p_mpath(mpnh);
455 
456 		int len = NL_ITEM_ALIGN(rtnh->rtnh_len);
457 		data_len -= len;
458 		rtnh = (struct rtnexthop *)((char *)rtnh + len);
459 	}
460 	if (data_len != 0 || mp->num_nhops == 0) {
461 		NLMSG_REPORT_ERR_MSG(npt, "invalid RTA_MULTIPATH attr");
462 		return (EINVAL);
463 	}
464 
465 	*((struct rta_mpath **)target) = mp;
466 	return (0);
467 }
468 
469 
470 struct nl_parsed_route {
471 	struct sockaddr		*rta_dst;
472 	struct sockaddr		*rta_gw;
473 	struct ifnet		*rta_oif;
474 	struct rta_mpath	*rta_multipath;
475 	uint32_t		rta_table;
476 	uint32_t		rta_rtflags;
477 	uint32_t		rta_nh_id;
478 	uint32_t		rta_weight;
479 	uint32_t		rtax_mtu;
480 	uint8_t			rtm_family;
481 	uint8_t			rtm_dst_len;
482 	uint8_t			rtm_protocol;
483 	uint8_t			rtm_type;
484 	uint32_t		rtm_flags;
485 };
486 
487 #define	_IN(_field)	offsetof(struct rtmsg, _field)
488 #define	_OUT(_field)	offsetof(struct nl_parsed_route, _field)
489 static struct nlattr_parser nla_p_rtmetrics[] = {
490 	{ .type = NL_RTAX_MTU, .off = _OUT(rtax_mtu), .cb = nlattr_get_uint32 },
491 };
492 NL_DECLARE_ATTR_PARSER(metrics_parser, nla_p_rtmetrics);
493 
494 static const struct nlattr_parser nla_p_rtmsg[] = {
495 	{ .type = NL_RTA_DST, .off = _OUT(rta_dst), .cb = nlattr_get_ip },
496 	{ .type = NL_RTA_OIF, .off = _OUT(rta_oif), .cb = nlattr_get_ifp },
497 	{ .type = NL_RTA_GATEWAY, .off = _OUT(rta_gw), .cb = nlattr_get_ip },
498 	{ .type = NL_RTA_METRICS, .arg = &metrics_parser, .cb = nlattr_get_nested },
499 	{ .type = NL_RTA_MULTIPATH, .off = _OUT(rta_multipath), .cb = nlattr_get_multipath },
500 	{ .type = NL_RTA_WEIGHT, .off = _OUT(rta_weight), .cb = nlattr_get_uint32 },
501 	{ .type = NL_RTA_RTFLAGS, .off = _OUT(rta_rtflags), .cb = nlattr_get_uint32 },
502 	{ .type = NL_RTA_TABLE, .off = _OUT(rta_table), .cb = nlattr_get_uint32 },
503 	{ .type = NL_RTA_VIA, .off = _OUT(rta_gw), .cb = nlattr_get_ipvia },
504 	{ .type = NL_RTA_NH_ID, .off = _OUT(rta_nh_id), .cb = nlattr_get_uint32 },
505 };
506 
507 static const struct nlfield_parser nlf_p_rtmsg[] = {
508 	{ .off_in = _IN(rtm_family), .off_out = _OUT(rtm_family), .cb = nlf_get_u8 },
509 	{ .off_in = _IN(rtm_dst_len), .off_out = _OUT(rtm_dst_len), .cb = nlf_get_u8 },
510 	{ .off_in = _IN(rtm_protocol), .off_out = _OUT(rtm_protocol), .cb = nlf_get_u8 },
511 	{ .off_in = _IN(rtm_type), .off_out = _OUT(rtm_type), .cb = nlf_get_u8 },
512 	{ .off_in = _IN(rtm_flags), .off_out = _OUT(rtm_flags), .cb = nlf_get_u32 },
513 };
514 #undef _IN
515 #undef _OUT
516 NL_DECLARE_PARSER(rtm_parser, struct rtmsg, nlf_p_rtmsg, nla_p_rtmsg);
517 
518 static void
519 post_p_rtmsg(struct nl_parsed_route *r)
520 {
521 	set_scope6(r->rta_dst, r->rta_oif);
522 	set_scope6(r->rta_gw, r->rta_oif);
523 }
524 
525 struct netlink_walkargs {
526 	struct nl_writer *nw;
527 	struct route_nhop_data rnd;
528 	struct nlmsghdr hdr;
529 	struct nlpcb *nlp;
530 	uint32_t fibnum;
531 	int family;
532 	int error;
533 	int count;
534 	int dumped;
535 	int dumped_tables;
536 };
537 
538 static int
539 dump_rtentry(struct rtentry *rt, void *_arg)
540 {
541 	struct netlink_walkargs *wa = (struct netlink_walkargs *)_arg;
542 	int error;
543 
544 	wa->count++;
545 	if (wa->error != 0)
546 		return (0);
547 	if (!rt_is_exportable(rt, nlp_get_cred(wa->nlp)))
548 		return (0);
549 	wa->dumped++;
550 
551 	rt_get_rnd(rt, &wa->rnd);
552 
553 	error = dump_px(wa->fibnum, &wa->hdr, rt, &wa->rnd, wa->nw);
554 
555 	IF_DEBUG_LEVEL(LOG_DEBUG3) {
556 		char rtbuf[INET6_ADDRSTRLEN + 5];
557 		FIB_LOG(LOG_DEBUG3, wa->fibnum, wa->family,
558 		    "Dump %s, offset %u, error %d",
559 		    rt_print_buf(rt, rtbuf, sizeof(rtbuf)),
560 		    wa->nw->offset, error);
561 	}
562 	wa->error = error;
563 
564 	return (0);
565 }
566 
567 static void
568 dump_rtable_one(struct netlink_walkargs *wa, uint32_t fibnum, int family)
569 {
570 	FIB_LOG(LOG_DEBUG2, fibnum, family, "Start dump");
571 	wa->count = 0;
572 	wa->dumped = 0;
573 
574 	rib_walk(fibnum, family, false, dump_rtentry, wa);
575 
576 	wa->dumped_tables++;
577 
578 	FIB_LOG(LOG_DEBUG2, fibnum, family, "End dump, iterated %d dumped %d",
579 	    wa->count, wa->dumped);
580 	NL_LOG(LOG_DEBUG2, "Current offset: %d", wa->nw->offset);
581 }
582 
583 static int
584 dump_rtable_fib(struct netlink_walkargs *wa, uint32_t fibnum, int family)
585 {
586 	wa->fibnum = fibnum;
587 
588 	if (family == AF_UNSPEC) {
589 		for (int i = 0; i < AF_MAX; i++) {
590 			if (rt_tables_get_rnh(fibnum, i) != 0) {
591 				wa->family = i;
592 				dump_rtable_one(wa, fibnum, i);
593 				if (wa->error != 0)
594 					break;
595 			}
596 		}
597 	} else {
598 		if (rt_tables_get_rnh(fibnum, family) != 0) {
599 			wa->family = family;
600 			dump_rtable_one(wa, fibnum, family);
601 		}
602 	}
603 
604 	return (wa->error);
605 }
606 
607 static int
608 handle_rtm_getroute(struct nlpcb *nlp, struct nl_parsed_route *attrs,
609     struct nlmsghdr *hdr, struct nl_pstate *npt)
610 {
611 	RIB_RLOCK_TRACKER;
612 	struct rib_head *rnh;
613 	const struct rtentry *rt;
614 	struct route_nhop_data rnd;
615 	uint32_t fibnum = attrs->rta_table;
616 	sa_family_t family = attrs->rtm_family;
617 
618 	if (attrs->rta_dst == NULL) {
619 		NLMSG_REPORT_ERR_MSG(npt, "No RTA_DST supplied");
620 			return (EINVAL);
621 	}
622 
623 	rnh = rt_tables_get_rnh(fibnum, family);
624 	if (rnh == NULL)
625 		return (EAFNOSUPPORT);
626 
627 	RIB_RLOCK(rnh);
628 
629 	struct sockaddr *dst = attrs->rta_dst;
630 
631 	if (attrs->rtm_flags & RTM_F_PREFIX)
632 		rt = rib_lookup_prefix_plen(rnh, dst, attrs->rtm_dst_len, &rnd);
633 	else
634 		rt = (const struct rtentry *)rnh->rnh_matchaddr(dst, &rnh->head);
635 	if (rt == NULL) {
636 		RIB_RUNLOCK(rnh);
637 		return (ESRCH);
638 	}
639 
640 	rt_get_rnd(rt, &rnd);
641 	rnd.rnd_nhop = nhop_select_func(rnd.rnd_nhop, 0);
642 
643 	RIB_RUNLOCK(rnh);
644 
645 	if (!rt_is_exportable(rt, nlp_get_cred(nlp)))
646 		return (ESRCH);
647 
648 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
649 		char rtbuf[NHOP_PRINT_BUFSIZE] __unused, nhbuf[NHOP_PRINT_BUFSIZE] __unused;
650 		FIB_LOG(LOG_DEBUG2, fibnum, family, "getroute completed: got %s for %s",
651 		    nhop_print_buf_any(rnd.rnd_nhop, nhbuf, sizeof(nhbuf)),
652 		    rt_print_buf(rt, rtbuf, sizeof(rtbuf)));
653 	}
654 
655 	hdr->nlmsg_type = NL_RTM_NEWROUTE;
656 	dump_px(fibnum, hdr, rt, &rnd, npt->nw);
657 
658 	return (0);
659 }
660 
661 static int
662 handle_rtm_dump(struct nlpcb *nlp, uint32_t fibnum, int family,
663     struct nlmsghdr *hdr, struct nl_writer *nw)
664 {
665 	struct netlink_walkargs wa = {
666 		.nlp = nlp,
667 		.nw = nw,
668 		.hdr.nlmsg_pid = hdr->nlmsg_pid,
669 		.hdr.nlmsg_seq = hdr->nlmsg_seq,
670 		.hdr.nlmsg_type = NL_RTM_NEWROUTE,
671 		.hdr.nlmsg_flags = hdr->nlmsg_flags | NLM_F_MULTI,
672 	};
673 
674 	if (fibnum == RT_TABLE_UNSPEC) {
675 		for (int i = 0; i < V_rt_numfibs; i++) {
676 			dump_rtable_fib(&wa, fibnum, family);
677 			if (wa.error != 0)
678 				break;
679 		}
680 	} else
681 		dump_rtable_fib(&wa, fibnum, family);
682 
683 	if (wa.error == 0 && wa.dumped_tables == 0) {
684 		FIB_LOG(LOG_DEBUG, fibnum, family, "incorrect fibnum/family");
685 		wa.error = ESRCH;
686 		// How do we propagate it?
687 	}
688 
689 	if (!nlmsg_end_dump(wa.nw, wa.error, &wa.hdr)) {
690                 NL_LOG(LOG_DEBUG, "Unable to finalize the dump");
691                 return (ENOMEM);
692         }
693 
694 	return (wa.error);
695 }
696 
697 static struct nhop_object *
698 finalize_nhop(struct nhop_object *nh, const struct sockaddr *dst, int *perror)
699 {
700 	/*
701 	 * The following MUST be filled:
702 	 *  nh_ifp, nh_ifa, nh_gw
703 	 */
704 	if (nh->gw_sa.sa_family == 0) {
705 		/*
706 		 * Empty gateway. Can be direct route with RTA_OIF set.
707 		 */
708 		if (nh->nh_ifp != NULL)
709 			nhop_set_direct_gw(nh, nh->nh_ifp);
710 		else {
711 			NL_LOG(LOG_DEBUG, "empty gateway and interface, skipping");
712 			*perror = EINVAL;
713 			return (NULL);
714 		}
715 		/* Both nh_ifp and gateway are set */
716 	} else {
717 		/* Gateway is set up, we can derive ifp if not set */
718 		if (nh->nh_ifp == NULL) {
719 			uint32_t fibnum = nhop_get_fibnum(nh);
720 			uint32_t flags = 0;
721 
722 			if (nh->nh_flags & NHF_GATEWAY)
723 				flags = RTF_GATEWAY;
724 			else if (nh->nh_flags & NHF_HOST)
725 				flags = RTF_HOST;
726 
727 			struct ifaddr *ifa = ifa_ifwithroute(flags, dst, &nh->gw_sa, fibnum);
728 			if (ifa == NULL) {
729 				NL_LOG(LOG_DEBUG, "Unable to determine ifp, skipping");
730 				*perror = EINVAL;
731 				return (NULL);
732 			}
733 			nhop_set_transmit_ifp(nh, ifa->ifa_ifp);
734 		}
735 	}
736 	/* Both nh_ifp and gateway are set */
737 	if (nh->nh_ifa == NULL) {
738 		const struct sockaddr *gw_sa = &nh->gw_sa;
739 
740 		if (gw_sa->sa_family != dst->sa_family) {
741 			/*
742 			 * Use dst as the target for determining the default
743 			 * preferred ifa IF
744 			 * 1) the gateway is link-level (e.g. direct route)
745 			 * 2) the gateway family is different (e.g. IPv4 over IPv6).
746 			 */
747 			gw_sa = dst;
748 		}
749 
750 		struct ifaddr *ifa = ifaof_ifpforaddr(gw_sa, nh->nh_ifp);
751 		if (ifa == NULL) {
752 			NL_LOG(LOG_DEBUG, "Unable to determine ifa, skipping");
753 			*perror = EINVAL;
754 			return (NULL);
755 		}
756 		nhop_set_src(nh, ifa);
757 	}
758 
759 	return (nhop_get_nhop(nh, perror));
760 }
761 
762 static int
763 get_pxflag(const struct nl_parsed_route *attrs)
764 {
765 	int pxflag = 0;
766 	switch (attrs->rtm_family) {
767 	case AF_INET:
768 		if (attrs->rtm_dst_len == 32)
769 			pxflag = NHF_HOST;
770 		else if (attrs->rtm_dst_len == 0)
771 			pxflag = NHF_DEFAULT;
772 		break;
773 	case AF_INET6:
774 		if (attrs->rtm_dst_len == 128)
775 			pxflag = NHF_HOST;
776 		else if (attrs->rtm_dst_len == 0)
777 			pxflag = NHF_DEFAULT;
778 		break;
779 	}
780 
781 	return (pxflag);
782 }
783 
784 static int
785 get_op_flags(int nlm_flags)
786 {
787 	int op_flags = 0;
788 
789 	op_flags |= (nlm_flags & NLM_F_REPLACE) ? RTM_F_REPLACE : 0;
790 	op_flags |= (nlm_flags & NLM_F_EXCL) ? RTM_F_EXCL : 0;
791 	op_flags |= (nlm_flags & NLM_F_CREATE) ? RTM_F_CREATE : 0;
792 	op_flags |= (nlm_flags & NLM_F_APPEND) ? RTM_F_APPEND : 0;
793 
794 	return (op_flags);
795 }
796 
797 #ifdef ROUTE_MPATH
798 static int
799 create_nexthop_one(struct nl_parsed_route *attrs, struct rta_mpath_nh *mpnh,
800     struct nl_pstate *npt, struct nhop_object **pnh)
801 {
802 	int error;
803 
804 	if (mpnh->gw == NULL)
805 		return (EINVAL);
806 
807 	struct nhop_object *nh = nhop_alloc(attrs->rta_table, attrs->rtm_family);
808 	if (nh == NULL)
809 		return (ENOMEM);
810 
811 	error = nl_set_nexthop_gw(nh, mpnh->gw, mpnh->ifp, npt);
812 	if (error != 0) {
813 		nhop_free(nh);
814 		return (error);
815 	}
816 	if (mpnh->ifp != NULL)
817 		nhop_set_transmit_ifp(nh, mpnh->ifp);
818 	nhop_set_pxtype_flag(nh, get_pxflag(attrs));
819 	nhop_set_rtflags(nh, attrs->rta_rtflags);
820 	if (attrs->rtm_protocol > RTPROT_STATIC)
821 		nhop_set_origin(nh, attrs->rtm_protocol);
822 
823 	*pnh = finalize_nhop(nh, attrs->rta_dst, &error);
824 
825 	return (error);
826 }
827 #endif
828 
829 static struct nhop_object *
830 create_nexthop_from_attrs(struct nl_parsed_route *attrs,
831     struct nl_pstate *npt, int *perror)
832 {
833 	struct nhop_object *nh = NULL;
834 	int error = 0;
835 
836 	if (attrs->rta_multipath != NULL) {
837 #ifdef ROUTE_MPATH
838 		/* Multipath w/o explicit nexthops */
839 		int num_nhops = attrs->rta_multipath->num_nhops;
840 		struct weightened_nhop *wn = npt_alloc(npt, sizeof(*wn) * num_nhops);
841 
842 		for (int i = 0; i < num_nhops; i++) {
843 			struct rta_mpath_nh *mpnh = &attrs->rta_multipath->nhops[i];
844 
845 			error = create_nexthop_one(attrs, mpnh, npt, &wn[i].nh);
846 			if (error != 0) {
847 				for (int j = 0; j < i; j++)
848 					nhop_free(wn[j].nh);
849 				break;
850 			}
851 			wn[i].weight = mpnh->rtnh_weight > 0 ? mpnh->rtnh_weight : 1;
852 		}
853 		if (error == 0) {
854 			struct rib_head *rh = nhop_get_rh(wn[0].nh);
855 			struct nhgrp_object *nhg;
856 
857 			nhg = nhgrp_alloc(rh->rib_fibnum, rh->rib_family,
858 			    wn, num_nhops, perror);
859 			if (nhg != NULL) {
860 				if (attrs->rtm_protocol > RTPROT_STATIC)
861 					nhgrp_set_origin(nhg, attrs->rtm_protocol);
862 				nhg = nhgrp_get_nhgrp(nhg, perror);
863 			}
864 			for (int i = 0; i < num_nhops; i++)
865 				nhop_free(wn[i].nh);
866 			if (nhg != NULL)
867 				return ((struct nhop_object *)nhg);
868 			error = *perror;
869 		}
870 #else
871 		error = ENOTSUP;
872 #endif
873 		*perror = error;
874 	} else {
875 		nh = nhop_alloc(attrs->rta_table, attrs->rtm_family);
876 		if (nh == NULL) {
877 			*perror = ENOMEM;
878 			return (NULL);
879 		}
880 		if (attrs->rta_gw != NULL) {
881 			*perror = nl_set_nexthop_gw(nh, attrs->rta_gw, attrs->rta_oif, npt);
882 			if (*perror != 0) {
883 				nhop_free(nh);
884 				return (NULL);
885 			}
886 		}
887 		if (attrs->rta_oif != NULL)
888 			nhop_set_transmit_ifp(nh, attrs->rta_oif);
889 		if (attrs->rtax_mtu != 0)
890 			nhop_set_mtu(nh, attrs->rtax_mtu, true);
891 		if (attrs->rta_rtflags & RTF_BROADCAST)
892 			nhop_set_broadcast(nh, true);
893 		if (attrs->rtm_protocol > RTPROT_STATIC)
894 			nhop_set_origin(nh, attrs->rtm_protocol);
895 		nhop_set_pxtype_flag(nh, get_pxflag(attrs));
896 		nhop_set_rtflags(nh, attrs->rta_rtflags);
897 
898 		switch (attrs->rtm_type) {
899 		case RTN_UNICAST:
900 			break;
901 		case RTN_BLACKHOLE:
902 			nhop_set_blackhole(nh, RTF_BLACKHOLE);
903 			break;
904 		case RTN_PROHIBIT:
905 		case RTN_UNREACHABLE:
906 			nhop_set_blackhole(nh, RTF_REJECT);
907 			break;
908 		/* TODO: return ENOTSUP for other types if strict option is set */
909 		}
910 
911 		nh = finalize_nhop(nh, attrs->rta_dst, perror);
912 	}
913 
914 	return (nh);
915 }
916 
917 static int
918 rtnl_handle_newroute(struct nlmsghdr *hdr, struct nlpcb *nlp,
919     struct nl_pstate *npt)
920 {
921 	struct rib_cmd_info rc = {};
922 	struct nhop_object *nh = NULL;
923 	int error;
924 
925 	struct nl_parsed_route attrs = {};
926 	error = nl_parse_nlmsg(hdr, &rtm_parser, npt, &attrs);
927 	if (error != 0)
928 		return (error);
929 	post_p_rtmsg(&attrs);
930 
931 	/* Check if we have enough data */
932 	if (attrs.rta_dst == NULL) {
933 		NL_LOG(LOG_DEBUG, "missing RTA_DST");
934 		return (EINVAL);
935 	}
936 
937 	if (attrs.rta_table >= V_rt_numfibs) {
938 		NLMSG_REPORT_ERR_MSG(npt, "invalid fib");
939 		return (EINVAL);
940 	}
941 
942 	if (attrs.rta_nh_id != 0) {
943 		/* Referenced uindex */
944 		int pxflag = get_pxflag(&attrs);
945 		nh = nl_find_nhop(attrs.rta_table, attrs.rtm_family, attrs.rta_nh_id,
946 		    pxflag, &error);
947 		if (error != 0)
948 			return (error);
949 	} else {
950 		nh = create_nexthop_from_attrs(&attrs, npt, &error);
951 		if (error != 0) {
952 			NL_LOG(LOG_DEBUG, "Error creating nexthop");
953 			return (error);
954 		}
955 	}
956 
957 	if (!NH_IS_NHGRP(nh) && attrs.rta_weight == 0)
958 		attrs.rta_weight = RT_DEFAULT_WEIGHT;
959 	struct route_nhop_data rnd = { .rnd_nhop = nh, .rnd_weight = attrs.rta_weight };
960 	int op_flags = get_op_flags(hdr->nlmsg_flags);
961 
962 	error = rib_add_route_px(attrs.rta_table, attrs.rta_dst, attrs.rtm_dst_len,
963 	    &rnd, op_flags, &rc);
964 	if (error == 0)
965 		report_operation(attrs.rta_table, &rc, nlp, hdr);
966 	return (error);
967 }
968 
969 static int
970 path_match_func(const struct rtentry *rt, const struct nhop_object *nh, void *_data)
971 {
972 	struct nl_parsed_route *attrs = (struct nl_parsed_route *)_data;
973 
974 	if ((attrs->rta_gw != NULL) && !rib_match_gw(rt, nh, attrs->rta_gw))
975 		return (0);
976 
977 	if ((attrs->rta_oif != NULL) && (attrs->rta_oif != nh->nh_ifp))
978 		return (0);
979 
980 	return (1);
981 }
982 
983 static int
984 rtnl_handle_delroute(struct nlmsghdr *hdr, struct nlpcb *nlp,
985     struct nl_pstate *npt)
986 {
987 	struct rib_cmd_info rc;
988 	int error;
989 
990 	struct nl_parsed_route attrs = {};
991 	error = nl_parse_nlmsg(hdr, &rtm_parser, npt, &attrs);
992 	if (error != 0)
993 		return (error);
994 	post_p_rtmsg(&attrs);
995 
996 	if (attrs.rta_dst == NULL) {
997 		NLMSG_REPORT_ERR_MSG(npt, "RTA_DST is not set");
998 		return (ESRCH);
999 	}
1000 
1001 	if (attrs.rta_table >= V_rt_numfibs) {
1002 		NLMSG_REPORT_ERR_MSG(npt, "invalid fib");
1003 		return (EINVAL);
1004 	}
1005 
1006 	error = rib_del_route_px(attrs.rta_table, attrs.rta_dst,
1007 	    attrs.rtm_dst_len, path_match_func, &attrs, 0, &rc);
1008 	if (error == 0)
1009 		report_operation(attrs.rta_table, &rc, nlp, hdr);
1010 	return (error);
1011 }
1012 
1013 static int
1014 rtnl_handle_getroute(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt)
1015 {
1016 	int error;
1017 
1018 	struct nl_parsed_route attrs = {};
1019 	error = nl_parse_nlmsg(hdr, &rtm_parser, npt, &attrs);
1020 	if (error != 0)
1021 		return (error);
1022 	post_p_rtmsg(&attrs);
1023 
1024 	if (attrs.rta_table >= V_rt_numfibs) {
1025 		NLMSG_REPORT_ERR_MSG(npt, "invalid fib");
1026 		return (EINVAL);
1027 	}
1028 
1029 	if (hdr->nlmsg_flags & NLM_F_DUMP)
1030 		error = handle_rtm_dump(nlp, attrs.rta_table, attrs.rtm_family, hdr, npt->nw);
1031 	else
1032 		error = handle_rtm_getroute(nlp, &attrs, hdr, npt);
1033 
1034 	return (error);
1035 }
1036 
1037 void
1038 rtnl_handle_route_event(uint32_t fibnum, const struct rib_cmd_info *rc)
1039 {
1040 	struct nl_writer nw = {};
1041 	int family, nlm_flags = 0;
1042 
1043 	family = rt_get_family(rc->rc_rt);
1044 
1045 	/* XXX: check if there are active listeners first */
1046 
1047 	/* TODO: consider passing PID/type/seq */
1048 	switch (rc->rc_cmd) {
1049 	case RTM_ADD:
1050 		nlm_flags = NLM_F_EXCL | NLM_F_CREATE;
1051 		break;
1052 	case RTM_CHANGE:
1053 		nlm_flags = NLM_F_REPLACE;
1054 		break;
1055 	case RTM_DELETE:
1056 		nlm_flags = 0;
1057 		break;
1058 	}
1059 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
1060 		char rtbuf[NHOP_PRINT_BUFSIZE] __unused;
1061 		FIB_LOG(LOG_DEBUG2, fibnum, family,
1062 		    "received event %s for %s / nlm_flags=%X",
1063 		    rib_print_cmd(rc->rc_cmd),
1064 		    rt_print_buf(rc->rc_rt, rtbuf, sizeof(rtbuf)),
1065 		    nlm_flags);
1066 	}
1067 
1068 	struct nlmsghdr hdr = {
1069 		.nlmsg_flags = nlm_flags,
1070 		.nlmsg_type = get_rtmsg_type_from_rtsock(rc->rc_cmd),
1071 	};
1072 
1073 	struct route_nhop_data rnd = {
1074 		.rnd_nhop = rc_get_nhop(rc),
1075 		.rnd_weight = rc->rc_nh_weight,
1076 	};
1077 
1078 	uint32_t group_id = family_to_group(family);
1079 	if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, group_id)) {
1080 		NL_LOG(LOG_DEBUG, "error allocating event buffer");
1081 		return;
1082 	}
1083 
1084 	dump_px(fibnum, &hdr, rc->rc_rt, &rnd, &nw);
1085 	nlmsg_flush(&nw);
1086 }
1087 
1088 static const struct rtnl_cmd_handler cmd_handlers[] = {
1089 	{
1090 		.cmd = NL_RTM_GETROUTE,
1091 		.name = "RTM_GETROUTE",
1092 		.cb = &rtnl_handle_getroute,
1093 		.flags = RTNL_F_ALLOW_NONVNET_JAIL,
1094 	},
1095 	{
1096 		.cmd = NL_RTM_DELROUTE,
1097 		.name = "RTM_DELROUTE",
1098 		.cb = &rtnl_handle_delroute,
1099 		.priv = PRIV_NET_ROUTE,
1100 	},
1101 	{
1102 		.cmd = NL_RTM_NEWROUTE,
1103 		.name = "RTM_NEWROUTE",
1104 		.cb = &rtnl_handle_newroute,
1105 		.priv = PRIV_NET_ROUTE,
1106 	}
1107 };
1108 
1109 static const struct nlhdr_parser *all_parsers[] = {&mpath_parser, &metrics_parser, &rtm_parser};
1110 
1111 void
1112 rtnl_routes_init(void)
1113 {
1114 	NL_VERIFY_PARSERS(all_parsers);
1115 	rtnl_register_messages(cmd_handlers, NL_ARRAY_LEN(cmd_handlers));
1116 }
1117