xref: /freebsd/sys/netlink/route/rt.c (revision 22cf89c938886d14f5796fc49f9f020c23ea8eaf)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2021 Ng Peng Nam Sean
5  * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include "opt_netlink.h"
30 
31 #include <sys/cdefs.h>
32 #include "opt_inet.h"
33 #include "opt_inet6.h"
34 #include "opt_route.h"
35 #include <sys/types.h>
36 #include <sys/malloc.h>
37 #include <sys/rmlock.h>
38 #include <sys/socket.h>
39 
40 #include <net/if.h>
41 #include <net/route.h>
42 #include <net/route/nhop.h>
43 #include <net/route/route_ctl.h>
44 #include <net/route/route_var.h>
45 #include <netinet6/scope6_var.h>
46 #include <netlink/netlink.h>
47 #include <netlink/netlink_ctl.h>
48 #include <netlink/netlink_route.h>
49 #include <netlink/route/route_var.h>
50 
51 #define	DEBUG_MOD_NAME	nl_route
52 #define	DEBUG_MAX_LEVEL	LOG_DEBUG3
53 #include <netlink/netlink_debug.h>
54 _DECLARE_DEBUG(LOG_INFO);
55 
56 static unsigned char
57 get_rtm_type(const struct nhop_object *nh)
58 {
59 	int nh_flags = nh->nh_flags;
60 
61 	/* Use the fact that nhg runtime flags are only NHF_MULTIPATH */
62 	if (nh_flags & NHF_BLACKHOLE)
63 		return (RTN_BLACKHOLE);
64 	else if (nh_flags & NHF_REJECT)
65 		return (RTN_PROHIBIT);
66 	return (RTN_UNICAST);
67 }
68 
69 static uint8_t
70 nl_get_rtm_protocol(const struct nhop_object *nh)
71 {
72 #ifdef ROUTE_MPATH
73 	if (NH_IS_NHGRP(nh)) {
74 		const struct nhgrp_object *nhg = (const struct nhgrp_object *)nh;
75 		uint8_t origin = nhgrp_get_origin(nhg);
76 		if (origin != RTPROT_UNSPEC)
77 			return (origin);
78 		nh = nhg->nhops[0];
79 	}
80 #endif
81 	uint8_t origin = nhop_get_origin(nh);
82 	if (origin != RTPROT_UNSPEC)
83 		return (origin);
84 	/* TODO: remove guesswork once all kernel users fill in origin */
85 	int rt_flags = nhop_get_rtflags(nh);
86 	if (rt_flags & RTF_PROTO1)
87 		return (RTPROT_ZEBRA);
88 	if (rt_flags & RTF_STATIC)
89 		return (RTPROT_STATIC);
90 	return (RTPROT_KERNEL);
91 }
92 
93 static int
94 get_rtmsg_type_from_rtsock(int cmd)
95 {
96 	switch (cmd) {
97 	case RTM_ADD:
98 	case RTM_CHANGE:
99 	case RTM_GET:
100 		return NL_RTM_NEWROUTE;
101 	case RTM_DELETE:
102 		return NL_RTM_DELROUTE;
103 	}
104 
105 	return (0);
106 }
107 
108 /*
109  * fibnum heuristics
110  *
111  * if (dump && rtm_table == 0 && !rta_table) RT_ALL_FIBS
112  * msg                rtm_table     RTA_TABLE            result
113  * RTM_GETROUTE/dump          0             -       RT_ALL_FIBS
114  * RTM_GETROUTE/dump          1             -                 1
115  * RTM_GETROUTE/get           0             -                 0
116  *
117  */
118 
119 static struct nhop_object *
120 rc_get_nhop(const struct rib_cmd_info *rc)
121 {
122 	return ((rc->rc_cmd == RTM_DELETE) ? rc->rc_nh_old : rc->rc_nh_new);
123 }
124 
125 static void
126 dump_rc_nhop_gw(struct nl_writer *nw, const struct nhop_object *nh)
127 {
128 #ifdef INET6
129 	int upper_family;
130 #endif
131 
132 	switch (nhop_get_neigh_family(nh)) {
133 	case AF_LINK:
134 		/* onlink prefix, skip */
135 		break;
136 	case AF_INET:
137 		nlattr_add(nw, NL_RTA_GATEWAY, 4, &nh->gw4_sa.sin_addr);
138 		break;
139 #ifdef INET6
140 	case AF_INET6:
141 		upper_family = nhop_get_upper_family(nh);
142 		if (upper_family == AF_INET6) {
143 			struct in6_addr gw6 = nh->gw6_sa.sin6_addr;
144 			in6_clearscope(&gw6);
145 
146 			nlattr_add(nw, NL_RTA_GATEWAY, 16, &gw6);
147 		} else if (upper_family == AF_INET) {
148 			/* IPv4 over IPv6 */
149 			struct in6_addr gw6 = nh->gw6_sa.sin6_addr;
150 			in6_clearscope(&gw6);
151 
152 			char buf[20];
153 			struct rtvia *via = (struct rtvia *)&buf[0];
154 			via->rtvia_family = AF_INET6;
155 			memcpy(via->rtvia_addr, &gw6, 16);
156 			nlattr_add(nw, NL_RTA_VIA, 17, via);
157 		}
158 		break;
159 #endif
160 	}
161 }
162 
163 static void
164 dump_rc_nhop_mtu(struct nl_writer *nw, const struct nhop_object *nh)
165 {
166 	int nla_len = sizeof(struct nlattr) * 2 + sizeof(uint32_t);
167 	struct nlattr *nla = nlmsg_reserve_data(nw, nla_len, struct nlattr);
168 
169 	if (nla == NULL)
170 		return;
171 	nla->nla_type = NL_RTA_METRICS;
172 	nla->nla_len = nla_len;
173 	nla++;
174 	nla->nla_type = NL_RTAX_MTU;
175 	nla->nla_len = sizeof(struct nlattr) + sizeof(uint32_t);
176 	*((uint32_t *)(nla + 1)) = nh->nh_mtu;
177 }
178 
#ifdef ROUTE_MPATH
/*
 * Appends the attributes describing a nexthop group.
 *
 * Writes the optional user index (NL_RTA_NH_ID), the kernel index
 * (NL_RTA_KNH_ID) and the route flags of the first member
 * (NL_RTA_RTFLAGS), followed by a nested NL_RTA_MULTIPATH attribute with
 * one struct rtnexthop record per member.  A member only gets its own
 * NL_RTA_RTFLAGS when its flags differ from those of the first member.
 */
static void
dump_rc_nhg(struct nl_writer *nw, const struct nhgrp_object *nhg, struct rtmsg *rtm)
{
	uint32_t count;
	const struct weightened_nhop *members = nhgrp_get_nhops(nhg, &count);
	const uint32_t common_rtflags = nhop_get_rtflags(members[0].nh);
	const uint32_t user_idx = nhgrp_get_uidx(nhg);

	if (user_idx != 0)
		nlattr_add_u32(nw, NL_RTA_NH_ID, user_idx);
	nlattr_add_u32(nw, NL_RTA_KNH_ID, nhgrp_get_idx(nhg));
	nlattr_add_u32(nw, NL_RTA_RTFLAGS, common_rtflags);

	int mpath_off = nlattr_add_nested(nw, NL_RTA_MULTIPATH);
	if (mpath_off == 0)
		return;

	for (int i = 0; i < count; i++) {
		const struct nhop_object *member = members[i].nh;
		int record_off = nlattr_save_offset(nw);
		struct rtnexthop *rtnh;

		rtnh = nlmsg_reserve_object(nw, struct rtnexthop);
		if (rtnh == NULL)
			return;
		rtnh->rtnh_flags = 0;
		rtnh->rtnh_ifindex = if_getindex(member->nh_ifp);
		rtnh->rtnh_hops = members[i].weight;

		dump_rc_nhop_gw(nw, member);

		uint32_t member_rtflags = nhop_get_rtflags(member);
		if (member_rtflags != common_rtflags)
			nlattr_add_u32(nw, NL_RTA_RTFLAGS, member_rtflags);
		if (member_rtflags & RTF_FIXEDMTU)
			dump_rc_nhop_mtu(nw, member);

		/*
		 * Look the record up again by offset before patching its
		 * length (presumably the writer storage may have changed
		 * while attributes were appended).
		 * nlattr_add() allocates 4-byte aligned storage, so there is
		 * no need to align the length here.
		 */
		rtnh = nlattr_restore_offset(nw, record_off, struct rtnexthop);
		rtnh->rtnh_len = nlattr_save_offset(nw) - record_off;
	}
	nlattr_set_len(nw, mpath_off);
}
#endif
221 
222 static void
223 dump_rc_nhop(struct nl_writer *nw, const struct route_nhop_data *rnd, struct rtmsg *rtm)
224 {
225 #ifdef ROUTE_MPATH
226 	if (NH_IS_NHGRP(rnd->rnd_nhop)) {
227 		dump_rc_nhg(nw, rnd->rnd_nhgrp, rtm);
228 		return;
229 	}
230 #endif
231 	const struct nhop_object *nh = rnd->rnd_nhop;
232 	uint32_t rtflags = nhop_get_rtflags(nh);
233 
234 	/*
235 	 * IPv4 over IPv6
236 	 *    ('RTA_VIA', {'family': 10, 'addr': 'fe80::20c:29ff:fe67:2dd'}), ('RTA_OIF', 2),
237 	 * IPv4 w/ gw
238 	 *    ('RTA_GATEWAY', '172.16.107.131'), ('RTA_OIF', 2)],
239 	 * Direct route:
240 	 *    ('RTA_OIF', 2)
241 	 */
242 	if (nh->nh_flags & NHF_GATEWAY)
243 		dump_rc_nhop_gw(nw, nh);
244 
245 	uint32_t uidx = nhop_get_uidx(nh);
246 	if (uidx != 0)
247 		nlattr_add_u32(nw, NL_RTA_NH_ID, uidx);
248 	nlattr_add_u32(nw, NL_RTA_KNH_ID, nhop_get_idx(nh));
249 	nlattr_add_u32(nw, NL_RTA_RTFLAGS, rtflags);
250 
251 	if (rtflags & RTF_FIXEDMTU)
252 		dump_rc_nhop_mtu(nw, nh);
253 	uint32_t nh_expire = nhop_get_expire(nh);
254 	if (nh_expire > 0)
255 		nlattr_add_u32(nw, NL_RTA_EXPIRES, nh_expire - time_uptime);
256 
257 	/* In any case, fill outgoing interface */
258 	nlattr_add_u32(nw, NL_RTA_OIF, if_getindex(nh->nh_ifp));
259 
260 	if (rnd->rnd_weight != RT_DEFAULT_WEIGHT)
261 		nlattr_add_u32(nw, NL_RTA_WEIGHT, rnd->rnd_weight);
262 }
263 
264 /*
265  * Dumps output from a rib command into an rtmsg
266  */
267 
268 static int
269 dump_px(uint32_t fibnum, const struct nlmsghdr *hdr,
270     const struct rtentry *rt, struct route_nhop_data *rnd,
271     struct nl_writer *nw)
272 {
273 	struct rtmsg *rtm;
274 	int error = 0;
275 
276 	NET_EPOCH_ASSERT();
277 
278 	if (!nlmsg_reply(nw, hdr, sizeof(struct rtmsg)))
279 		goto enomem;
280 
281 	int family = rt_get_family(rt);
282 	int rtm_off = nlattr_save_offset(nw);
283 	rtm = nlmsg_reserve_object(nw, struct rtmsg);
284 	rtm->rtm_family = family;
285 	rtm->rtm_dst_len = 0;
286 	rtm->rtm_src_len = 0;
287 	rtm->rtm_tos = 0;
288 	if (fibnum < 255)
289 		rtm->rtm_table = (unsigned char)fibnum;
290 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
291 	rtm->rtm_protocol = nl_get_rtm_protocol(rnd->rnd_nhop);
292 	rtm->rtm_type = get_rtm_type(rnd->rnd_nhop);
293 
294 	nlattr_add_u32(nw, NL_RTA_TABLE, fibnum);
295 
296 	int plen = 0;
297 #if defined(INET) || defined(INET6)
298 	uint32_t scopeid;
299 #endif
300 	switch (family) {
301 #ifdef INET
302 	case AF_INET:
303 		{
304 			struct in_addr addr;
305 			rt_get_inet_prefix_plen(rt, &addr, &plen, &scopeid);
306 			nlattr_add(nw, NL_RTA_DST, 4, &addr);
307 			break;
308 		}
309 #endif
310 #ifdef INET6
311 	case AF_INET6:
312 		{
313 			struct in6_addr addr;
314 			rt_get_inet6_prefix_plen(rt, &addr, &plen, &scopeid);
315 			nlattr_add(nw, NL_RTA_DST, 16, &addr);
316 			break;
317 		}
318 #endif
319 	default:
320 		FIB_LOG(LOG_NOTICE, fibnum, family, "unsupported rt family: %d", family);
321 		error = EAFNOSUPPORT;
322 		goto flush;
323 	}
324 
325 	rtm = nlattr_restore_offset(nw, rtm_off, struct rtmsg);
326 	if (plen > 0)
327 		rtm->rtm_dst_len = plen;
328 	dump_rc_nhop(nw, rnd, rtm);
329 
330 	if (nlmsg_end(nw))
331 		return (0);
332 enomem:
333 	error = ENOMEM;
334 flush:
335 	nlmsg_abort(nw);
336 	return (error);
337 }
338 
339 static int
340 family_to_group(int family)
341 {
342 	switch (family) {
343 	case AF_INET:
344 		return (RTNLGRP_IPV4_ROUTE);
345 	case AF_INET6:
346 		return (RTNLGRP_IPV6_ROUTE);
347 	}
348 	return (0);
349 }
350 
351 static void
352 report_operation(uint32_t fibnum, struct rib_cmd_info *rc,
353     struct nlpcb *nlp, struct nlmsghdr *hdr)
354 {
355 	struct nl_writer nw = {};
356 	uint32_t group_id = family_to_group(rt_get_family(rc->rc_rt));
357 
358 	if (nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, group_id)) {
359 		struct route_nhop_data rnd = {
360 			.rnd_nhop = rc_get_nhop(rc),
361 			.rnd_weight = rc->rc_nh_weight,
362 		};
363 		hdr->nlmsg_flags &= ~(NLM_F_REPLACE | NLM_F_CREATE);
364 		hdr->nlmsg_flags &= ~(NLM_F_EXCL | NLM_F_APPEND);
365 		switch (rc->rc_cmd) {
366 		case RTM_ADD:
367 			hdr->nlmsg_type = NL_RTM_NEWROUTE;
368 			hdr->nlmsg_flags |= NLM_F_CREATE | NLM_F_EXCL;
369 			break;
370 		case RTM_CHANGE:
371 			hdr->nlmsg_type = NL_RTM_NEWROUTE;
372 			hdr->nlmsg_flags |= NLM_F_REPLACE;
373 			break;
374 		case RTM_DELETE:
375 			hdr->nlmsg_type = NL_RTM_DELROUTE;
376 			break;
377 		}
378 		dump_px(fibnum, hdr, rc->rc_rt, &rnd, &nw);
379 		nlmsg_flush(&nw);
380 	}
381 
382 	rtsock_callback_p->route_f(fibnum, rc);
383 }
384 
/*
 * Embeds the interface index as scope id into an IPv6 link-local address.
 * Does nothing when either argument is NULL, when the address is not
 * AF_INET6 or not link-local, or when the kernel is built without INET6.
 */
static void
set_scope6(struct sockaddr *sa, struct ifnet *ifp)
{
#ifdef INET6
	struct sockaddr_in6 *sin6;

	if (sa == NULL || sa->sa_family != AF_INET6 || ifp == NULL)
		return;

	sin6 = (struct sockaddr_in6 *)sa;
	if (!IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr))
		return;

	in6_set_unicast_scopeid(&sin6->sin6_addr, if_getindex(ifp));
#endif
}
397 
/*
 * One nexthop parsed out of an RTA_MULTIPATH attribute.
 */
struct rta_mpath_nh {
	struct sockaddr	*gw;		/* from NL_RTA_GATEWAY or NL_RTA_VIA */
	struct ifnet	*ifp;		/* resolved from rtnh_ifindex */
	uint8_t		rtnh_flags;	/* copy of rtnexthop.rtnh_flags */
	uint8_t		rtnh_weight;	/* copy of rtnexthop.rtnh_hops */
};
404 
405 #define	_IN(_field)	offsetof(struct rtnexthop, _field)
406 #define	_OUT(_field)	offsetof(struct rta_mpath_nh, _field)
407 const static struct nlattr_parser nla_p_rtnh[] = {
408 	{ .type = NL_RTA_GATEWAY, .off = _OUT(gw), .cb = nlattr_get_ip },
409 	{ .type = NL_RTA_VIA, .off = _OUT(gw), .cb = nlattr_get_ipvia },
410 };
411 const static struct nlfield_parser nlf_p_rtnh[] = {
412 	{ .off_in = _IN(rtnh_flags), .off_out = _OUT(rtnh_flags), .cb = nlf_get_u8 },
413 	{ .off_in = _IN(rtnh_hops), .off_out = _OUT(rtnh_weight), .cb = nlf_get_u8 },
414 	{ .off_in = _IN(rtnh_ifindex), .off_out = _OUT(ifp), .cb = nlf_get_ifpz },
415 };
416 #undef _IN
417 #undef _OUT
418 
419 static bool
420 post_p_rtnh(void *_attrs, struct nl_pstate *npt __unused)
421 {
422 	struct rta_mpath_nh *attrs = (struct rta_mpath_nh *)_attrs;
423 
424 	set_scope6(attrs->gw, attrs->ifp);
425 	return (true);
426 }
427 NL_DECLARE_PARSER_EXT(mpath_parser, struct rtnexthop, NULL, nlf_p_rtnh, nla_p_rtnh, post_p_rtnh);
428 
429 struct rta_mpath {
430 	int num_nhops;
431 	struct rta_mpath_nh nhops[0];
432 };
433 
434 static int
435 nlattr_get_multipath(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target)
436 {
437 	int data_len = nla->nla_len - sizeof(struct nlattr);
438 	struct rtnexthop *rtnh;
439 
440 	int max_nhops = data_len / sizeof(struct rtnexthop);
441 
442 	struct rta_mpath *mp = npt_alloc(npt, (max_nhops + 2) * sizeof(struct rta_mpath_nh));
443 	mp->num_nhops = 0;
444 
445 	for (rtnh = (struct rtnexthop *)(nla + 1); data_len > 0; ) {
446 		struct rta_mpath_nh *mpnh = &mp->nhops[mp->num_nhops++];
447 
448 		int error = nl_parse_header(rtnh, rtnh->rtnh_len, &mpath_parser,
449 		    npt, mpnh);
450 		if (error != 0) {
451 			NLMSG_REPORT_ERR_MSG(npt, "RTA_MULTIPATH: nexhop %d: parse failed",
452 			    mp->num_nhops - 1);
453 			return (error);
454 		}
455 
456 		int len = NL_ITEM_ALIGN(rtnh->rtnh_len);
457 		data_len -= len;
458 		rtnh = (struct rtnexthop *)((char *)rtnh + len);
459 	}
460 	if (data_len != 0 || mp->num_nhops == 0) {
461 		NLMSG_REPORT_ERR_MSG(npt, "invalid RTA_MULTIPATH attr");
462 		return (EINVAL);
463 	}
464 
465 	*((struct rta_mpath **)target) = mp;
466 	return (0);
467 }
468 
469 
/*
 * Route request as filled in by rtm_parser: rta_* members come from
 * attributes, rtax_* from the nested metrics, rtm_* from struct rtmsg.
 */
struct nl_parsed_route {
	/* Attributes */
	struct sockaddr		*rta_dst;	/* NL_RTA_DST */
	struct sockaddr		*rta_gw;	/* NL_RTA_GATEWAY or NL_RTA_VIA */
	struct ifnet		*rta_oif;	/* NL_RTA_OIF */
	struct rta_mpath	*rta_multipath;	/* NL_RTA_MULTIPATH */
	uint32_t		rta_table;	/* NL_RTA_TABLE */
	uint32_t		rta_rtflags;	/* NL_RTA_RTFLAGS */
	uint32_t		rta_nh_id;	/* NL_RTA_NH_ID */
	uint32_t		rta_weight;	/* NL_RTA_WEIGHT */
	/* Nested NL_RTA_METRICS */
	uint32_t		rtax_mtu;	/* NL_RTAX_MTU */
	/* Fixed struct rtmsg fields */
	uint8_t			rtm_family;
	uint8_t			rtm_dst_len;
	uint8_t			rtm_protocol;
	uint8_t			rtm_type;
	uint32_t		rtm_flags;
};
486 
487 #define	_IN(_field)	offsetof(struct rtmsg, _field)
488 #define	_OUT(_field)	offsetof(struct nl_parsed_route, _field)
489 static struct nlattr_parser nla_p_rtmetrics[] = {
490 	{ .type = NL_RTAX_MTU, .off = _OUT(rtax_mtu), .cb = nlattr_get_uint32 },
491 };
492 NL_DECLARE_ATTR_PARSER(metrics_parser, nla_p_rtmetrics);
493 
494 static const struct nlattr_parser nla_p_rtmsg[] = {
495 	{ .type = NL_RTA_DST, .off = _OUT(rta_dst), .cb = nlattr_get_ip },
496 	{ .type = NL_RTA_OIF, .off = _OUT(rta_oif), .cb = nlattr_get_ifp },
497 	{ .type = NL_RTA_GATEWAY, .off = _OUT(rta_gw), .cb = nlattr_get_ip },
498 	{ .type = NL_RTA_METRICS, .arg = &metrics_parser, .cb = nlattr_get_nested },
499 	{ .type = NL_RTA_MULTIPATH, .off = _OUT(rta_multipath), .cb = nlattr_get_multipath },
500 	{ .type = NL_RTA_WEIGHT, .off = _OUT(rta_weight), .cb = nlattr_get_uint32 },
501 	{ .type = NL_RTA_RTFLAGS, .off = _OUT(rta_rtflags), .cb = nlattr_get_uint32 },
502 	{ .type = NL_RTA_TABLE, .off = _OUT(rta_table), .cb = nlattr_get_uint32 },
503 	{ .type = NL_RTA_VIA, .off = _OUT(rta_gw), .cb = nlattr_get_ipvia },
504 	{ .type = NL_RTA_NH_ID, .off = _OUT(rta_nh_id), .cb = nlattr_get_uint32 },
505 };
506 
507 static const struct nlfield_parser nlf_p_rtmsg[] = {
508 	{ .off_in = _IN(rtm_family), .off_out = _OUT(rtm_family), .cb = nlf_get_u8 },
509 	{ .off_in = _IN(rtm_dst_len), .off_out = _OUT(rtm_dst_len), .cb = nlf_get_u8 },
510 	{ .off_in = _IN(rtm_protocol), .off_out = _OUT(rtm_protocol), .cb = nlf_get_u8 },
511 	{ .off_in = _IN(rtm_type), .off_out = _OUT(rtm_type), .cb = nlf_get_u8 },
512 	{ .off_in = _IN(rtm_flags), .off_out = _OUT(rtm_flags), .cb = nlf_get_u32 },
513 };
514 #undef _IN
515 #undef _OUT
516 
517 static bool
518 post_p_rtmsg(void *_attrs, struct nl_pstate *npt __unused)
519 {
520 	struct nl_parsed_route *attrs = (struct nl_parsed_route *)_attrs;
521 
522 	set_scope6(attrs->rta_dst, attrs->rta_oif);
523 	set_scope6(attrs->rta_gw, attrs->rta_oif);
524 	return (true);
525 }
526 NL_DECLARE_PARSER_EXT(rtm_parser, struct rtmsg, NULL, nlf_p_rtmsg, nla_p_rtmsg, post_p_rtmsg);
527 
528 struct netlink_walkargs {
529 	struct nl_writer *nw;
530 	struct route_nhop_data rnd;
531 	struct nlmsghdr hdr;
532 	struct nlpcb *nlp;
533 	uint32_t fibnum;
534 	int family;
535 	int error;
536 	int count;
537 	int dumped;
538 	int dumped_tables;
539 };
540 
541 static int
542 dump_rtentry(struct rtentry *rt, void *_arg)
543 {
544 	struct netlink_walkargs *wa = (struct netlink_walkargs *)_arg;
545 	int error;
546 
547 	wa->count++;
548 	if (wa->error != 0)
549 		return (0);
550 	if (!rt_is_exportable(rt, nlp_get_cred(wa->nlp)))
551 		return (0);
552 	wa->dumped++;
553 
554 	rt_get_rnd(rt, &wa->rnd);
555 
556 	error = dump_px(wa->fibnum, &wa->hdr, rt, &wa->rnd, wa->nw);
557 
558 	IF_DEBUG_LEVEL(LOG_DEBUG3) {
559 		char rtbuf[INET6_ADDRSTRLEN + 5];
560 		FIB_LOG(LOG_DEBUG3, wa->fibnum, wa->family,
561 		    "Dump %s, offset %u, error %d",
562 		    rt_print_buf(rt, rtbuf, sizeof(rtbuf)),
563 		    wa->nw->offset, error);
564 	}
565 	wa->error = error;
566 
567 	return (0);
568 }
569 
570 static void
571 dump_rtable_one(struct netlink_walkargs *wa, uint32_t fibnum, int family)
572 {
573 	FIB_LOG(LOG_DEBUG2, fibnum, family, "Start dump");
574 	wa->count = 0;
575 	wa->dumped = 0;
576 
577 	rib_walk(fibnum, family, false, dump_rtentry, wa);
578 
579 	wa->dumped_tables++;
580 
581 	FIB_LOG(LOG_DEBUG2, fibnum, family, "End dump, iterated %d dumped %d",
582 	    wa->count, wa->dumped);
583 	NL_LOG(LOG_DEBUG2, "Current offset: %d", wa->nw->offset);
584 }
585 
586 static int
587 dump_rtable_fib(struct netlink_walkargs *wa, uint32_t fibnum, int family)
588 {
589 	wa->fibnum = fibnum;
590 
591 	if (family == AF_UNSPEC) {
592 		for (int i = 0; i < AF_MAX; i++) {
593 			if (rt_tables_get_rnh(fibnum, i) != 0) {
594 				wa->family = i;
595 				dump_rtable_one(wa, fibnum, i);
596 				if (wa->error != 0)
597 					break;
598 			}
599 		}
600 	} else {
601 		if (rt_tables_get_rnh(fibnum, family) != 0) {
602 			wa->family = family;
603 			dump_rtable_one(wa, fibnum, family);
604 		}
605 	}
606 
607 	return (wa->error);
608 }
609 
610 static int
611 handle_rtm_getroute(struct nlpcb *nlp, struct nl_parsed_route *attrs,
612     struct nlmsghdr *hdr, struct nl_pstate *npt)
613 {
614 	RIB_RLOCK_TRACKER;
615 	struct rib_head *rnh;
616 	const struct rtentry *rt;
617 	struct route_nhop_data rnd;
618 	uint32_t fibnum = attrs->rta_table;
619 	sa_family_t family = attrs->rtm_family;
620 
621 	if (attrs->rta_dst == NULL) {
622 		NLMSG_REPORT_ERR_MSG(npt, "No RTA_DST supplied");
623 			return (EINVAL);
624 	}
625 
626 	rnh = rt_tables_get_rnh(fibnum, family);
627 	if (rnh == NULL)
628 		return (EAFNOSUPPORT);
629 
630 	RIB_RLOCK(rnh);
631 
632 	struct sockaddr *dst = attrs->rta_dst;
633 
634 	if (attrs->rtm_flags & RTM_F_PREFIX)
635 		rt = rib_lookup_prefix_plen(rnh, dst, attrs->rtm_dst_len, &rnd);
636 	else
637 		rt = (const struct rtentry *)rnh->rnh_matchaddr(dst, &rnh->head);
638 	if (rt == NULL) {
639 		RIB_RUNLOCK(rnh);
640 		return (ESRCH);
641 	}
642 
643 	rt_get_rnd(rt, &rnd);
644 	rnd.rnd_nhop = nhop_select_func(rnd.rnd_nhop, 0);
645 
646 	RIB_RUNLOCK(rnh);
647 
648 	if (!rt_is_exportable(rt, nlp_get_cred(nlp)))
649 		return (ESRCH);
650 
651 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
652 		char rtbuf[NHOP_PRINT_BUFSIZE] __unused, nhbuf[NHOP_PRINT_BUFSIZE] __unused;
653 		FIB_LOG(LOG_DEBUG2, fibnum, family, "getroute completed: got %s for %s",
654 		    nhop_print_buf_any(rnd.rnd_nhop, nhbuf, sizeof(nhbuf)),
655 		    rt_print_buf(rt, rtbuf, sizeof(rtbuf)));
656 	}
657 
658 	hdr->nlmsg_type = NL_RTM_NEWROUTE;
659 	dump_px(fibnum, hdr, rt, &rnd, npt->nw);
660 
661 	return (0);
662 }
663 
664 static int
665 handle_rtm_dump(struct nlpcb *nlp, uint32_t fibnum, int family,
666     struct nlmsghdr *hdr, struct nl_writer *nw)
667 {
668 	struct netlink_walkargs wa = {
669 		.nlp = nlp,
670 		.nw = nw,
671 		.hdr.nlmsg_pid = hdr->nlmsg_pid,
672 		.hdr.nlmsg_seq = hdr->nlmsg_seq,
673 		.hdr.nlmsg_type = NL_RTM_NEWROUTE,
674 		.hdr.nlmsg_flags = hdr->nlmsg_flags | NLM_F_MULTI,
675 	};
676 
677 	if (fibnum == RT_TABLE_UNSPEC) {
678 		for (int i = 0; i < V_rt_numfibs; i++) {
679 			dump_rtable_fib(&wa, fibnum, family);
680 			if (wa.error != 0)
681 				break;
682 		}
683 	} else
684 		dump_rtable_fib(&wa, fibnum, family);
685 
686 	if (wa.error == 0 && wa.dumped_tables == 0) {
687 		FIB_LOG(LOG_DEBUG, fibnum, family, "incorrect fibnum/family");
688 		wa.error = ESRCH;
689 		// How do we propagate it?
690 	}
691 
692 	if (!nlmsg_end_dump(wa.nw, wa.error, &wa.hdr)) {
693                 NL_LOG(LOG_DEBUG, "Unable to finalize the dump");
694                 return (ENOMEM);
695         }
696 
697 	return (wa.error);
698 }
699 
700 static struct nhop_object *
701 finalize_nhop(struct nhop_object *nh, const struct sockaddr *dst, int *perror)
702 {
703 	/*
704 	 * The following MUST be filled:
705 	 *  nh_ifp, nh_ifa, nh_gw
706 	 */
707 	if (nh->gw_sa.sa_family == 0) {
708 		/*
709 		 * Empty gateway. Can be direct route with RTA_OIF set.
710 		 */
711 		if (nh->nh_ifp != NULL)
712 			nhop_set_direct_gw(nh, nh->nh_ifp);
713 		else {
714 			NL_LOG(LOG_DEBUG, "empty gateway and interface, skipping");
715 			*perror = EINVAL;
716 			return (NULL);
717 		}
718 		/* Both nh_ifp and gateway are set */
719 	} else {
720 		/* Gateway is set up, we can derive ifp if not set */
721 		if (nh->nh_ifp == NULL) {
722 			uint32_t fibnum = nhop_get_fibnum(nh);
723 			uint32_t flags = 0;
724 
725 			if (nh->nh_flags & NHF_GATEWAY)
726 				flags = RTF_GATEWAY;
727 			else if (nh->nh_flags & NHF_HOST)
728 				flags = RTF_HOST;
729 
730 			struct ifaddr *ifa = ifa_ifwithroute(flags, dst, &nh->gw_sa, fibnum);
731 			if (ifa == NULL) {
732 				NL_LOG(LOG_DEBUG, "Unable to determine ifp, skipping");
733 				*perror = EINVAL;
734 				return (NULL);
735 			}
736 			nhop_set_transmit_ifp(nh, ifa->ifa_ifp);
737 		}
738 	}
739 	/* Both nh_ifp and gateway are set */
740 	if (nh->nh_ifa == NULL) {
741 		const struct sockaddr *gw_sa = &nh->gw_sa;
742 
743 		if (gw_sa->sa_family != dst->sa_family) {
744 			/*
745 			 * Use dst as the target for determining the default
746 			 * preferred ifa IF
747 			 * 1) the gateway is link-level (e.g. direct route)
748 			 * 2) the gateway family is different (e.g. IPv4 over IPv6).
749 			 */
750 			gw_sa = dst;
751 		}
752 
753 		struct ifaddr *ifa = ifaof_ifpforaddr(gw_sa, nh->nh_ifp);
754 		if (ifa == NULL) {
755 			NL_LOG(LOG_DEBUG, "Unable to determine ifa, skipping");
756 			*perror = EINVAL;
757 			return (NULL);
758 		}
759 		nhop_set_src(nh, ifa);
760 	}
761 
762 	return (nhop_get_nhop(nh, perror));
763 }
764 
765 static int
766 get_pxflag(const struct nl_parsed_route *attrs)
767 {
768 	int pxflag = 0;
769 	switch (attrs->rtm_family) {
770 	case AF_INET:
771 		if (attrs->rtm_dst_len == 32)
772 			pxflag = NHF_HOST;
773 		else if (attrs->rtm_dst_len == 0)
774 			pxflag = NHF_DEFAULT;
775 		break;
776 	case AF_INET6:
777 		if (attrs->rtm_dst_len == 128)
778 			pxflag = NHF_HOST;
779 		else if (attrs->rtm_dst_len == 0)
780 			pxflag = NHF_DEFAULT;
781 		break;
782 	}
783 
784 	return (pxflag);
785 }
786 
787 static int
788 get_op_flags(int nlm_flags)
789 {
790 	int op_flags = 0;
791 
792 	op_flags |= (nlm_flags & NLM_F_REPLACE) ? RTM_F_REPLACE : 0;
793 	op_flags |= (nlm_flags & NLM_F_EXCL) ? RTM_F_EXCL : 0;
794 	op_flags |= (nlm_flags & NLM_F_CREATE) ? RTM_F_CREATE : 0;
795 	op_flags |= (nlm_flags & NLM_F_APPEND) ? RTM_F_APPEND : 0;
796 
797 	return (op_flags);
798 }
799 
#ifdef ROUTE_MPATH
/*
 * Builds the nexthop for one member of a multipath request and stores the
 * finalized nexthop in *pnh.
 *
 * The member must carry a gateway (EINVAL otherwise).  Route-wide
 * properties (prefix type, route flags, origin) come from `attrs`,
 * gateway and interface from `mpnh`.
 *
 * Returns 0 on success or an errno value.
 */
static int
create_nexthop_one(struct nl_parsed_route *attrs, struct rta_mpath_nh *mpnh,
    struct nl_pstate *npt, struct nhop_object **pnh)
{
	struct nhop_object *nhop;
	int error;

	if (mpnh->gw == NULL)
		return (EINVAL);

	nhop = nhop_alloc(attrs->rta_table, attrs->rtm_family);
	if (nhop == NULL)
		return (ENOMEM);

	error = nl_set_nexthop_gw(nhop, mpnh->gw, mpnh->ifp, npt);
	if (error != 0) {
		nhop_free(nhop);
		return (error);
	}

	if (mpnh->ifp != NULL)
		nhop_set_transmit_ifp(nhop, mpnh->ifp);
	nhop_set_pxtype_flag(nhop, get_pxflag(attrs));
	nhop_set_rtflags(nhop, attrs->rta_rtflags);
	/* Origins up to RTPROT_STATIC are not recorded */
	if (attrs->rtm_protocol > RTPROT_STATIC)
		nhop_set_origin(nhop, attrs->rtm_protocol);

	*pnh = finalize_nhop(nhop, attrs->rta_dst, &error);

	return (error);
}
#endif
831 
832 static struct nhop_object *
833 create_nexthop_from_attrs(struct nl_parsed_route *attrs,
834     struct nl_pstate *npt, int *perror)
835 {
836 	struct nhop_object *nh = NULL;
837 	int error = 0;
838 
839 	if (attrs->rta_multipath != NULL) {
840 #ifdef ROUTE_MPATH
841 		/* Multipath w/o explicit nexthops */
842 		int num_nhops = attrs->rta_multipath->num_nhops;
843 		struct weightened_nhop *wn = npt_alloc(npt, sizeof(*wn) * num_nhops);
844 
845 		for (int i = 0; i < num_nhops; i++) {
846 			struct rta_mpath_nh *mpnh = &attrs->rta_multipath->nhops[i];
847 
848 			error = create_nexthop_one(attrs, mpnh, npt, &wn[i].nh);
849 			if (error != 0) {
850 				for (int j = 0; j < i; j++)
851 					nhop_free(wn[j].nh);
852 				break;
853 			}
854 			wn[i].weight = mpnh->rtnh_weight > 0 ? mpnh->rtnh_weight : 1;
855 		}
856 		if (error == 0) {
857 			struct rib_head *rh = nhop_get_rh(wn[0].nh);
858 			struct nhgrp_object *nhg;
859 
860 			nhg = nhgrp_alloc(rh->rib_fibnum, rh->rib_family,
861 			    wn, num_nhops, perror);
862 			if (nhg != NULL) {
863 				if (attrs->rtm_protocol > RTPROT_STATIC)
864 					nhgrp_set_origin(nhg, attrs->rtm_protocol);
865 				nhg = nhgrp_get_nhgrp(nhg, perror);
866 			}
867 			for (int i = 0; i < num_nhops; i++)
868 				nhop_free(wn[i].nh);
869 			if (nhg != NULL)
870 				return ((struct nhop_object *)nhg);
871 			error = *perror;
872 		}
873 #else
874 		error = ENOTSUP;
875 #endif
876 		*perror = error;
877 	} else {
878 		nh = nhop_alloc(attrs->rta_table, attrs->rtm_family);
879 		if (nh == NULL) {
880 			*perror = ENOMEM;
881 			return (NULL);
882 		}
883 		if (attrs->rta_gw != NULL) {
884 			*perror = nl_set_nexthop_gw(nh, attrs->rta_gw, attrs->rta_oif, npt);
885 			if (*perror != 0) {
886 				nhop_free(nh);
887 				return (NULL);
888 			}
889 		}
890 		if (attrs->rta_oif != NULL)
891 			nhop_set_transmit_ifp(nh, attrs->rta_oif);
892 		if (attrs->rtax_mtu != 0)
893 			nhop_set_mtu(nh, attrs->rtax_mtu, true);
894 		if (attrs->rta_rtflags & RTF_BROADCAST)
895 			nhop_set_broadcast(nh, true);
896 		if (attrs->rtm_protocol > RTPROT_STATIC)
897 			nhop_set_origin(nh, attrs->rtm_protocol);
898 		nhop_set_pxtype_flag(nh, get_pxflag(attrs));
899 		nhop_set_rtflags(nh, attrs->rta_rtflags);
900 
901 		switch (attrs->rtm_type) {
902 		case RTN_UNICAST:
903 			break;
904 		case RTN_BLACKHOLE:
905 			nhop_set_blackhole(nh, RTF_BLACKHOLE);
906 			break;
907 		case RTN_PROHIBIT:
908 		case RTN_UNREACHABLE:
909 			nhop_set_blackhole(nh, RTF_REJECT);
910 			break;
911 		/* TODO: return ENOTSUP for other types if strict option is set */
912 		}
913 
914 		nh = finalize_nhop(nh, attrs->rta_dst, perror);
915 	}
916 
917 	return (nh);
918 }
919 
920 static int
921 rtnl_handle_newroute(struct nlmsghdr *hdr, struct nlpcb *nlp,
922     struct nl_pstate *npt)
923 {
924 	struct rib_cmd_info rc = {};
925 	struct nhop_object *nh = NULL;
926 	int error;
927 
928 	struct nl_parsed_route attrs = {};
929 	error = nl_parse_nlmsg(hdr, &rtm_parser, npt, &attrs);
930 	if (error != 0)
931 		return (error);
932 
933 	/* Check if we have enough data */
934 	if (attrs.rta_dst == NULL) {
935 		NL_LOG(LOG_DEBUG, "missing RTA_DST");
936 		return (EINVAL);
937 	}
938 
939 	if (attrs.rta_table >= V_rt_numfibs) {
940 		NLMSG_REPORT_ERR_MSG(npt, "invalid fib");
941 		return (EINVAL);
942 	}
943 
944 	if (attrs.rta_nh_id != 0) {
945 		/* Referenced uindex */
946 		int pxflag = get_pxflag(&attrs);
947 		nh = nl_find_nhop(attrs.rta_table, attrs.rtm_family, attrs.rta_nh_id,
948 		    pxflag, &error);
949 		if (error != 0)
950 			return (error);
951 	} else {
952 		nh = create_nexthop_from_attrs(&attrs, npt, &error);
953 		if (error != 0) {
954 			NL_LOG(LOG_DEBUG, "Error creating nexthop");
955 			return (error);
956 		}
957 	}
958 
959 	if (!NH_IS_NHGRP(nh) && attrs.rta_weight == 0)
960 		attrs.rta_weight = RT_DEFAULT_WEIGHT;
961 	struct route_nhop_data rnd = { .rnd_nhop = nh, .rnd_weight = attrs.rta_weight };
962 	int op_flags = get_op_flags(hdr->nlmsg_flags);
963 
964 	error = rib_add_route_px(attrs.rta_table, attrs.rta_dst, attrs.rtm_dst_len,
965 	    &rnd, op_flags, &rc);
966 	if (error == 0)
967 		report_operation(attrs.rta_table, &rc, nlp, hdr);
968 	return (error);
969 }
970 
971 static int
972 path_match_func(const struct rtentry *rt, const struct nhop_object *nh, void *_data)
973 {
974 	struct nl_parsed_route *attrs = (struct nl_parsed_route *)_data;
975 
976 	if ((attrs->rta_gw != NULL) && !rib_match_gw(rt, nh, attrs->rta_gw))
977 		return (0);
978 
979 	if ((attrs->rta_oif != NULL) && (attrs->rta_oif != nh->nh_ifp))
980 		return (0);
981 
982 	return (1);
983 }
984 
985 static int
986 rtnl_handle_delroute(struct nlmsghdr *hdr, struct nlpcb *nlp,
987     struct nl_pstate *npt)
988 {
989 	struct rib_cmd_info rc;
990 	int error;
991 
992 	struct nl_parsed_route attrs = {};
993 	error = nl_parse_nlmsg(hdr, &rtm_parser, npt, &attrs);
994 	if (error != 0)
995 		return (error);
996 
997 	if (attrs.rta_dst == NULL) {
998 		NLMSG_REPORT_ERR_MSG(npt, "RTA_DST is not set");
999 		return (ESRCH);
1000 	}
1001 
1002 	if (attrs.rta_table >= V_rt_numfibs) {
1003 		NLMSG_REPORT_ERR_MSG(npt, "invalid fib");
1004 		return (EINVAL);
1005 	}
1006 
1007 	error = rib_del_route_px(attrs.rta_table, attrs.rta_dst,
1008 	    attrs.rtm_dst_len, path_match_func, &attrs, 0, &rc);
1009 	if (error == 0)
1010 		report_operation(attrs.rta_table, &rc, nlp, hdr);
1011 	return (error);
1012 }
1013 
1014 static int
1015 rtnl_handle_getroute(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt)
1016 {
1017 	int error;
1018 
1019 	struct nl_parsed_route attrs = {};
1020 	error = nl_parse_nlmsg(hdr, &rtm_parser, npt, &attrs);
1021 	if (error != 0)
1022 		return (error);
1023 
1024 	if (attrs.rta_table >= V_rt_numfibs) {
1025 		NLMSG_REPORT_ERR_MSG(npt, "invalid fib");
1026 		return (EINVAL);
1027 	}
1028 
1029 	if (hdr->nlmsg_flags & NLM_F_DUMP)
1030 		error = handle_rtm_dump(nlp, attrs.rta_table, attrs.rtm_family, hdr, npt->nw);
1031 	else
1032 		error = handle_rtm_getroute(nlp, &attrs, hdr, npt);
1033 
1034 	return (error);
1035 }
1036 
1037 void
1038 rtnl_handle_route_event(uint32_t fibnum, const struct rib_cmd_info *rc)
1039 {
1040 	struct nl_writer nw = {};
1041 	int family, nlm_flags = 0;
1042 
1043 	family = rt_get_family(rc->rc_rt);
1044 
1045 	/* XXX: check if there are active listeners first */
1046 
1047 	/* TODO: consider passing PID/type/seq */
1048 	switch (rc->rc_cmd) {
1049 	case RTM_ADD:
1050 		nlm_flags = NLM_F_EXCL | NLM_F_CREATE;
1051 		break;
1052 	case RTM_CHANGE:
1053 		nlm_flags = NLM_F_REPLACE;
1054 		break;
1055 	case RTM_DELETE:
1056 		nlm_flags = 0;
1057 		break;
1058 	}
1059 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
1060 		char rtbuf[NHOP_PRINT_BUFSIZE] __unused;
1061 		FIB_LOG(LOG_DEBUG2, fibnum, family,
1062 		    "received event %s for %s / nlm_flags=%X",
1063 		    rib_print_cmd(rc->rc_cmd),
1064 		    rt_print_buf(rc->rc_rt, rtbuf, sizeof(rtbuf)),
1065 		    nlm_flags);
1066 	}
1067 
1068 	struct nlmsghdr hdr = {
1069 		.nlmsg_flags = nlm_flags,
1070 		.nlmsg_type = get_rtmsg_type_from_rtsock(rc->rc_cmd),
1071 	};
1072 
1073 	struct route_nhop_data rnd = {
1074 		.rnd_nhop = rc_get_nhop(rc),
1075 		.rnd_weight = rc->rc_nh_weight,
1076 	};
1077 
1078 	uint32_t group_id = family_to_group(family);
1079 	if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, group_id)) {
1080 		NL_LOG(LOG_DEBUG, "error allocating event buffer");
1081 		return;
1082 	}
1083 
1084 	dump_px(fibnum, &hdr, rc->rc_rt, &rnd, &nw);
1085 	nlmsg_flush(&nw);
1086 }
1087 
1088 static const struct rtnl_cmd_handler cmd_handlers[] = {
1089 	{
1090 		.cmd = NL_RTM_GETROUTE,
1091 		.name = "RTM_GETROUTE",
1092 		.cb = &rtnl_handle_getroute,
1093 		.flags = RTNL_F_ALLOW_NONVNET_JAIL,
1094 	},
1095 	{
1096 		.cmd = NL_RTM_DELROUTE,
1097 		.name = "RTM_DELROUTE",
1098 		.cb = &rtnl_handle_delroute,
1099 		.priv = PRIV_NET_ROUTE,
1100 	},
1101 	{
1102 		.cmd = NL_RTM_NEWROUTE,
1103 		.name = "RTM_NEWROUTE",
1104 		.cb = &rtnl_handle_newroute,
1105 		.priv = PRIV_NET_ROUTE,
1106 	}
1107 };
1108 
1109 static const struct nlhdr_parser *all_parsers[] = {&mpath_parser, &metrics_parser, &rtm_parser};
1110 
1111 void
1112 rtnl_routes_init(void)
1113 {
1114 	NL_VERIFY_PARSERS(all_parsers);
1115 	rtnl_register_messages(cmd_handlers, NL_ARRAY_LEN(cmd_handlers));
1116 }
1117