xref: /freebsd/sys/netlink/route/rt.c (revision f374ba41f55c1a127303d92d830dd58eef2f5243)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2021 Ng Peng Nam Sean
5  * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 #include "opt_inet.h"
32 #include "opt_inet6.h"
33 #include "opt_route.h"
34 #include <sys/types.h>
35 #include <sys/malloc.h>
36 #include <sys/rmlock.h>
37 #include <sys/socket.h>
38 
39 #include <net/if.h>
40 #include <net/route.h>
41 #include <net/route/nhop.h>
42 #include <net/route/route_ctl.h>
43 #include <net/route/route_var.h>
44 #include <netlink/netlink.h>
45 #include <netlink/netlink_ctl.h>
46 #include <netlink/netlink_route.h>
47 #include <netlink/route/route_var.h>
48 
49 #define	DEBUG_MOD_NAME	nl_route
50 #define	DEBUG_MAX_LEVEL	LOG_DEBUG3
51 #include <netlink/netlink_debug.h>
52 _DECLARE_DEBUG(LOG_DEBUG);
53 
54 static unsigned char
55 get_rtm_type(const struct nhop_object *nh)
56 {
57 	int nh_flags = nh->nh_flags;
58 
59 	/* Use the fact that nhg runtime flags are only NHF_MULTIPATH */
60 	if (nh_flags & NHF_BLACKHOLE)
61 		return (RTN_BLACKHOLE);
62 	else if (nh_flags & NHF_REJECT)
63 		return (RTN_PROHIBIT);
64 	return (RTN_UNICAST);
65 }
66 
67 static uint8_t
68 nl_get_rtm_protocol(const struct nhop_object *nh)
69 {
70 #ifdef ROUTE_MPATH
71 	if (NH_IS_NHGRP(nh)) {
72 		const struct nhgrp_object *nhg = (const struct nhgrp_object *)nh;
73 		uint8_t origin = nhgrp_get_origin(nhg);
74 		if (origin != RTPROT_UNSPEC)
75 			return (origin);
76 		nh = nhg->nhops[0];
77 	}
78 #endif
79 	uint8_t origin = nhop_get_origin(nh);
80 	if (origin != RTPROT_UNSPEC)
81 		return (origin);
82 	/* TODO: remove guesswork once all kernel users fill in origin */
83 	int rt_flags = nhop_get_rtflags(nh);
84 	if (rt_flags & RTF_PROTO1)
85 		return (RTPROT_ZEBRA);
86 	if (rt_flags & RTF_STATIC)
87 		return (RTPROT_STATIC);
88 	return (RTPROT_KERNEL);
89 }
90 
91 static int
92 get_rtmsg_type_from_rtsock(int cmd)
93 {
94 	switch (cmd) {
95 	case RTM_ADD:
96 	case RTM_CHANGE:
97 	case RTM_GET:
98 		return NL_RTM_NEWROUTE;
99 	case RTM_DELETE:
100 		return NL_RTM_DELROUTE;
101 	}
102 
103 	return (0);
104 }
105 
106 /*
107  * fibnum heuristics
108  *
109  * if (dump && rtm_table == 0 && !rta_table) RT_ALL_FIBS
110  * msg                rtm_table     RTA_TABLE            result
111  * RTM_GETROUTE/dump          0             -       RT_ALL_FIBS
112  * RTM_GETROUTE/dump          1             -                 1
113  * RTM_GETROUTE/get           0             -                 0
114  *
115  */
116 
117 static struct nhop_object *
118 rc_get_nhop(const struct rib_cmd_info *rc)
119 {
120 	return ((rc->rc_cmd == RTM_DELETE) ? rc->rc_nh_old : rc->rc_nh_new);
121 }
122 
123 static void
124 dump_rc_nhop_gw(struct nl_writer *nw, const struct nhop_object *nh)
125 {
126 	int upper_family;
127 
128 	switch (nhop_get_neigh_family(nh)) {
129 	case AF_LINK:
130 		/* onlink prefix, skip */
131 		break;
132 	case AF_INET:
133 		nlattr_add(nw, NL_RTA_GATEWAY, 4, &nh->gw4_sa.sin_addr);
134 		break;
135 	case AF_INET6:
136 		upper_family = nhop_get_upper_family(nh);
137 		if (upper_family == AF_INET6) {
138 			nlattr_add(nw, NL_RTA_GATEWAY, 16, &nh->gw6_sa.sin6_addr);
139 		} else if (upper_family == AF_INET) {
140 			/* IPv4 over IPv6 */
141 			char buf[20];
142 			struct rtvia *via = (struct rtvia *)&buf[0];
143 			via->rtvia_family = AF_INET6;
144 			memcpy(via->rtvia_addr, &nh->gw6_sa.sin6_addr, 16);
145 			nlattr_add(nw, NL_RTA_VIA, 17, via);
146 		}
147 		break;
148 	}
149 }
150 
151 static void
152 dump_rc_nhop_mtu(struct nl_writer *nw, const struct nhop_object *nh)
153 {
154 	int nla_len = sizeof(struct nlattr) * 2 + sizeof(uint32_t);
155 	struct nlattr *nla = nlmsg_reserve_data(nw, nla_len, struct nlattr);
156 
157 	if (nla == NULL)
158 		return;
159 	nla->nla_type = NL_RTA_METRICS;
160 	nla->nla_len = nla_len;
161 	nla++;
162 	nla->nla_type = NL_RTAX_MTU;
163 	nla->nla_len = sizeof(struct nlattr) + sizeof(uint32_t);
164 	*((uint32_t *)(nla + 1)) = nh->nh_mtu;
165 }
166 
#ifdef ROUTE_MPATH
/*
 * Dumps a nexthop group as NL_RTA_NH_ID (only when a user index is set),
 * NL_RTA_KNH_ID, NL_RTA_RTFLAGS (flags of the first member) and a nested
 * NL_RTA_MULTIPATH attribute with one struct rtnexthop per member.
 *
 * Each member carries its gateway, its own NL_RTA_RTFLAGS when they
 * differ from the first member's, and its MTU when RTF_FIXEDMTU is set.
 * @rtm is not used here.
 *
 * If the writer runs out of space the function returns early and leaves
 * the nested attribute unterminated.
 */
static void
dump_rc_nhg(struct nl_writer *nw, const struct nhgrp_object *nhg, struct rtmsg *rtm)
{
	const uint32_t user_idx = nhgrp_get_uidx(nhg);
	uint32_t count;
	const struct weightened_nhop *members = nhgrp_get_nhops(nhg, &count);
	const uint32_t group_rtflags = nhop_get_rtflags(members[0].nh);

	if (user_idx != 0)
		nlattr_add_u32(nw, NL_RTA_NH_ID, user_idx);
	nlattr_add_u32(nw, NL_RTA_KNH_ID, nhgrp_get_idx(nhg));
	nlattr_add_u32(nw, NL_RTA_RTFLAGS, group_rtflags);

	const int mpath_off = nlattr_add_nested(nw, NL_RTA_MULTIPATH);
	if (mpath_off == 0)
		return;

	for (uint32_t i = 0; i < count; i++) {
		const struct nhop_object *member = members[i].nh;
		const int entry_off = nlattr_save_offset(nw);
		struct rtnexthop *entry;

		entry = nlmsg_reserve_object(nw, struct rtnexthop);
		if (entry == NULL)
			return;
		entry->rtnh_flags = 0;
		entry->rtnh_ifindex = member->nh_ifp->if_index;
		entry->rtnh_hops = members[i].weight;

		dump_rc_nhop_gw(nw, member);

		const uint32_t member_rtflags = nhop_get_rtflags(member);
		if (member_rtflags != group_rtflags)
			nlattr_add_u32(nw, NL_RTA_RTFLAGS, member_rtflags);
		if ((member_rtflags & RTF_FIXEDMTU) != 0)
			dump_rc_nhop_mtu(nw, member);

		/*
		 * Re-derive the entry pointer from its saved offset before
		 * patching the length, now that attributes were appended.
		 */
		entry = nlattr_restore_offset(nw, entry_off, struct rtnexthop);
		/*
		 * nlattr_add() allocates 4-byte aligned storage, no need to
		 * align the length here.
		 */
		entry->rtnh_len = nlattr_save_offset(nw) - entry_off;
	}
	nlattr_set_len(nw, mpath_off);
}
#endif
209 
210 static void
211 dump_rc_nhop(struct nl_writer *nw, const struct route_nhop_data *rnd, struct rtmsg *rtm)
212 {
213 #ifdef ROUTE_MPATH
214 	if (NH_IS_NHGRP(rnd->rnd_nhop)) {
215 		dump_rc_nhg(nw, rnd->rnd_nhgrp, rtm);
216 		return;
217 	}
218 #endif
219 	const struct nhop_object *nh = rnd->rnd_nhop;
220 	uint32_t rtflags = nhop_get_rtflags(nh);
221 
222 	/*
223 	 * IPv4 over IPv6
224 	 *    ('RTA_VIA', {'family': 10, 'addr': 'fe80::20c:29ff:fe67:2dd'}), ('RTA_OIF', 2),
225 	 * IPv4 w/ gw
226 	 *    ('RTA_GATEWAY', '172.16.107.131'), ('RTA_OIF', 2)],
227 	 * Direct route:
228 	 *    ('RTA_OIF', 2)
229 	 */
230 	if (nh->nh_flags & NHF_GATEWAY)
231 		dump_rc_nhop_gw(nw, nh);
232 
233 	uint32_t uidx = nhop_get_uidx(nh);
234 	if (uidx != 0)
235 		nlattr_add_u32(nw, NL_RTA_NH_ID, uidx);
236 	nlattr_add_u32(nw, NL_RTA_KNH_ID, nhop_get_idx(nh));
237 	nlattr_add_u32(nw, NL_RTA_RTFLAGS, rtflags);
238 
239 	if (rtflags & RTF_FIXEDMTU)
240 		dump_rc_nhop_mtu(nw, nh);
241 	uint32_t nh_expire = nhop_get_expire(nh);
242 	if (nh_expire > 0)
243 		nlattr_add_u32(nw, NL_RTA_EXPIRES, nh_expire - time_uptime);
244 
245 	/* In any case, fill outgoing interface */
246 	nlattr_add_u32(nw, NL_RTA_OIF, nh->nh_ifp->if_index);
247 
248 	if (rnd->rnd_weight != RT_DEFAULT_WEIGHT)
249 		nlattr_add_u32(nw, NL_RTA_WEIGHT, rnd->rnd_weight);
250 }
251 
252 /*
253  * Dumps output from a rib command into an rtmsg
254  */
255 
256 static int
257 dump_px(uint32_t fibnum, const struct nlmsghdr *hdr,
258     const struct rtentry *rt, struct route_nhop_data *rnd,
259     struct nl_writer *nw)
260 {
261 	struct rtmsg *rtm;
262 	int error = 0;
263 
264 	NET_EPOCH_ASSERT();
265 
266 	if (!nlmsg_reply(nw, hdr, sizeof(struct rtmsg)))
267 		goto enomem;
268 
269 	int family = rt_get_family(rt);
270 	int rtm_off = nlattr_save_offset(nw);
271 	rtm = nlmsg_reserve_object(nw, struct rtmsg);
272 	rtm->rtm_family = family;
273 	rtm->rtm_dst_len = 0;
274 	rtm->rtm_src_len = 0;
275 	rtm->rtm_tos = 0;
276 	if (fibnum < 255)
277 		rtm->rtm_table = (unsigned char)fibnum;
278 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
279 	rtm->rtm_protocol = nl_get_rtm_protocol(rnd->rnd_nhop);
280 	rtm->rtm_type = get_rtm_type(rnd->rnd_nhop);
281 
282 	nlattr_add_u32(nw, NL_RTA_TABLE, fibnum);
283 
284 	int plen = 0;
285 #if defined(INET) || defined(INET6)
286 	uint32_t scopeid;
287 #endif
288 	switch (family) {
289 #ifdef INET
290 	case AF_INET:
291 		{
292 			struct in_addr addr;
293 			rt_get_inet_prefix_plen(rt, &addr, &plen, &scopeid);
294 			nlattr_add(nw, NL_RTA_DST, 4, &addr);
295 			break;
296 		}
297 #endif
298 #ifdef INET6
299 	case AF_INET6:
300 		{
301 			struct in6_addr addr;
302 			rt_get_inet6_prefix_plen(rt, &addr, &plen, &scopeid);
303 			nlattr_add(nw, NL_RTA_DST, 16, &addr);
304 			break;
305 		}
306 #endif
307 	default:
308 		FIB_LOG(LOG_NOTICE, fibnum, family, "unsupported rt family: %d", family);
309 		error = EAFNOSUPPORT;
310 		goto flush;
311 	}
312 
313 	rtm = nlattr_restore_offset(nw, rtm_off, struct rtmsg);
314 	if (plen > 0)
315 		rtm->rtm_dst_len = plen;
316 	dump_rc_nhop(nw, rnd, rtm);
317 
318 	if (nlmsg_end(nw))
319 		return (0);
320 enomem:
321 	error = ENOMEM;
322 flush:
323 	nlmsg_abort(nw);
324 	return (error);
325 }
326 
327 static int
328 family_to_group(int family)
329 {
330 	switch (family) {
331 	case AF_INET:
332 		return (RTNLGRP_IPV4_ROUTE);
333 	case AF_INET6:
334 		return (RTNLGRP_IPV6_ROUTE);
335 	}
336 	return (0);
337 }
338 
339 
340 static void
341 report_operation(uint32_t fibnum, struct rib_cmd_info *rc,
342     struct nlpcb *nlp, struct nlmsghdr *hdr)
343 {
344 	struct nl_writer nw = {};
345 	uint32_t group_id = family_to_group(rt_get_family(rc->rc_rt));
346 
347 	if (nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, group_id)) {
348 		struct route_nhop_data rnd = {
349 			.rnd_nhop = rc_get_nhop(rc),
350 			.rnd_weight = rc->rc_nh_weight,
351 		};
352 		hdr->nlmsg_flags &= ~(NLM_F_REPLACE | NLM_F_CREATE);
353 		hdr->nlmsg_flags &= ~(NLM_F_EXCL | NLM_F_APPEND);
354 		switch (rc->rc_cmd) {
355 		case RTM_ADD:
356 			hdr->nlmsg_type = NL_RTM_NEWROUTE;
357 			hdr->nlmsg_flags |= NLM_F_CREATE | NLM_F_EXCL;
358 			break;
359 		case RTM_CHANGE:
360 			hdr->nlmsg_type = NL_RTM_NEWROUTE;
361 			hdr->nlmsg_flags |= NLM_F_REPLACE;
362 			break;
363 		case RTM_DELETE:
364 			hdr->nlmsg_type = NL_RTM_DELROUTE;
365 			break;
366 		}
367 		dump_px(fibnum, hdr, rc->rc_rt, &rnd, &nw);
368 		nlmsg_flush(&nw);
369 	}
370 
371 	rtsock_callback_p->route_f(fibnum, rc);
372 }
373 
/*
 * Parsed form of a single struct rtnexthop entry found inside an
 * NL_RTA_MULTIPATH attribute.
 */
struct rta_mpath_nh {
	struct sockaddr	*gw;		/* from NL_RTA_GATEWAY or NL_RTA_VIA */
	struct ifnet	*ifp;		/* resolved from rtnh_ifindex */
	uint8_t		rtnh_flags;	/* copy of rtnexthop.rtnh_flags */
	uint8_t		rtnh_weight;	/* copy of rtnexthop.rtnh_hops */
};
380 
381 #define	_IN(_field)	offsetof(struct rtnexthop, _field)
382 #define	_OUT(_field)	offsetof(struct rta_mpath_nh, _field)
383 const static struct nlattr_parser nla_p_rtnh[] = {
384 	{ .type = NL_RTA_GATEWAY, .off = _OUT(gw), .cb = nlattr_get_ip },
385 	{ .type = NL_RTA_VIA, .off = _OUT(gw), .cb = nlattr_get_ipvia },
386 };
387 const static struct nlfield_parser nlf_p_rtnh[] = {
388 	{ .off_in = _IN(rtnh_flags), .off_out = _OUT(rtnh_flags), .cb = nlf_get_u8 },
389 	{ .off_in = _IN(rtnh_hops), .off_out = _OUT(rtnh_weight), .cb = nlf_get_u8 },
390 	{ .off_in = _IN(rtnh_ifindex), .off_out = _OUT(ifp), .cb = nlf_get_ifpz },
391 };
392 #undef _IN
393 #undef _OUT
394 NL_DECLARE_PARSER(mpath_parser, struct rtnexthop, nlf_p_rtnh, nla_p_rtnh);
395 
396 struct rta_mpath {
397 	int num_nhops;
398 	struct rta_mpath_nh nhops[0];
399 };
400 
401 static int
402 nlattr_get_multipath(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target)
403 {
404 	int data_len = nla->nla_len - sizeof(struct nlattr);
405 	struct rtnexthop *rtnh;
406 
407 	int max_nhops = data_len / sizeof(struct rtnexthop);
408 
409 	struct rta_mpath *mp = npt_alloc(npt, (max_nhops + 2) * sizeof(struct rta_mpath_nh));
410 	mp->num_nhops = 0;
411 
412 	for (rtnh = (struct rtnexthop *)(nla + 1); data_len > 0; ) {
413 		struct rta_mpath_nh *mpnh = &mp->nhops[mp->num_nhops++];
414 
415 		int error = nl_parse_header(rtnh, rtnh->rtnh_len, &mpath_parser,
416 		    npt, mpnh);
417 		if (error != 0) {
418 			NLMSG_REPORT_ERR_MSG(npt, "RTA_MULTIPATH: nexhop %d: parse failed",
419 			    mp->num_nhops - 1);
420 			return (error);
421 		}
422 
423 		int len = NL_ITEM_ALIGN(rtnh->rtnh_len);
424 		data_len -= len;
425 		rtnh = (struct rtnexthop *)((char *)rtnh + len);
426 	}
427 	if (data_len != 0 || mp->num_nhops == 0) {
428 		NLMSG_REPORT_ERR_MSG(npt, "invalid RTA_MULTIPATH attr");
429 		return (EINVAL);
430 	}
431 
432 	*((struct rta_mpath **)target) = mp;
433 	return (0);
434 }
435 
436 
/*
 * Result of parsing an RTM_{NEW,DEL,GET}ROUTE request with rtm_parser.
 * Fields whose attribute was absent keep the value the caller
 * initialised the structure with (callers in this file zero it).
 */
struct nl_parsed_route {
	struct sockaddr		*rta_dst;	/* NL_RTA_DST */
	struct sockaddr		*rta_gw;	/* NL_RTA_GATEWAY or NL_RTA_VIA */
	struct ifnet		*rta_oif;	/* NL_RTA_OIF */
	struct rta_mpath	*rta_multipath;	/* NL_RTA_MULTIPATH */
	uint32_t		rta_table;	/* NL_RTA_TABLE, used as fib number */
	uint32_t		rta_rtflags;	/* NL_RTA_RTFLAGS */
	uint32_t		rta_nh_id;	/* NL_RTA_NH_ID */
	uint32_t		rta_weight;	/* NL_RTA_WEIGHT */
	uint32_t		rtax_mtu;	/* NL_RTAX_MTU nested in NL_RTA_METRICS */
	uint8_t			rtm_family;	/* rtmsg.rtm_family */
	uint8_t			rtm_dst_len;	/* rtmsg.rtm_dst_len */
	uint8_t			rtm_protocol;	/* rtmsg.rtm_protocol */
};
451 
/*
 * Parser descriptions for route messages: struct rtmsg header fields and
 * the top-level / nested attributes are stored into struct nl_parsed_route.
 */
#define	_IN(_field)	offsetof(struct rtmsg, _field)
#define	_OUT(_field)	offsetof(struct nl_parsed_route, _field)
/* Attributes accepted inside the nested NL_RTA_METRICS container */
static struct nlattr_parser nla_p_rtmetrics[] = {
	{ .type = NL_RTAX_MTU, .off = _OUT(rtax_mtu), .cb = nlattr_get_uint32 },
};
NL_DECLARE_ATTR_PARSER(metrics_parser, nla_p_rtmetrics);

/*
 * Top-level route attributes.  NL_RTA_GATEWAY and NL_RTA_VIA both fill
 * rta_gw; NL_RTA_METRICS is handed to metrics_parser.
 */
static const struct nlattr_parser nla_p_rtmsg[] = {
	{ .type = NL_RTA_DST, .off = _OUT(rta_dst), .cb = nlattr_get_ip },
	{ .type = NL_RTA_OIF, .off = _OUT(rta_oif), .cb = nlattr_get_ifp },
	{ .type = NL_RTA_GATEWAY, .off = _OUT(rta_gw), .cb = nlattr_get_ip },
	{ .type = NL_RTA_METRICS, .arg = &metrics_parser, .cb = nlattr_get_nested },
	{ .type = NL_RTA_MULTIPATH, .off = _OUT(rta_multipath), .cb = nlattr_get_multipath },
	{ .type = NL_RTA_WEIGHT, .off = _OUT(rta_weight), .cb = nlattr_get_uint32 },
	{ .type = NL_RTA_RTFLAGS, .off = _OUT(rta_rtflags), .cb = nlattr_get_uint32 },
	{ .type = NL_RTA_TABLE, .off = _OUT(rta_table), .cb = nlattr_get_uint32 },
	{ .type = NL_RTA_VIA, .off = _OUT(rta_gw), .cb = nlattr_get_ipvia },
	{ .type = NL_RTA_NH_ID, .off = _OUT(rta_nh_id), .cb = nlattr_get_uint32 },
};

/* Fixed fields copied from the struct rtmsg header */
static const struct nlfield_parser nlf_p_rtmsg[] = {
	{.off_in = _IN(rtm_family), .off_out = _OUT(rtm_family), .cb = nlf_get_u8 },
	{.off_in = _IN(rtm_dst_len), .off_out = _OUT(rtm_dst_len), .cb = nlf_get_u8 },
	{.off_in = _IN(rtm_protocol), .off_out = _OUT(rtm_protocol), .cb = nlf_get_u8 },
};
#undef _IN
#undef _OUT
NL_DECLARE_PARSER(rtm_parser, struct rtmsg, nlf_p_rtmsg, nla_p_rtmsg);
480 
/*
 * State shared between handle_rtm_dump() and the per-route callback
 * dump_rtentry() while walking routing tables.
 */
struct netlink_walkargs {
	struct nl_writer *nw;		/* destination of the dumped messages */
	struct route_nhop_data rnd;	/* scratch: nexthop data of current route */
	struct nlmsghdr hdr;		/* header template for every dumped message */
	struct nlpcb *nlp;		/* requesting socket */
	uint32_t fibnum;		/* fib currently being walked */
	int family;			/* address family currently being walked */
	int error;			/* first dump error; stops further output */
	int count;			/* routes visited in the current table */
	int dumped;			/* routes attempted to dump in the current table */
	int dumped_tables;		/* number of tables walked so far */
};
493 
494 static int
495 dump_rtentry(struct rtentry *rt, void *_arg)
496 {
497 	struct netlink_walkargs *wa = (struct netlink_walkargs *)_arg;
498 	int error;
499 
500 	wa->count++;
501 	if (wa->error != 0)
502 		return (0);
503 	wa->dumped++;
504 
505 	rt_get_rnd(rt, &wa->rnd);
506 
507 	error = dump_px(wa->fibnum, &wa->hdr, rt, &wa->rnd, wa->nw);
508 
509 	IF_DEBUG_LEVEL(LOG_DEBUG3) {
510 		char rtbuf[INET6_ADDRSTRLEN + 5];
511 		FIB_LOG(LOG_DEBUG3, wa->fibnum, wa->family,
512 		    "Dump %s, offset %u, error %d",
513 		    rt_print_buf(rt, rtbuf, sizeof(rtbuf)),
514 		    wa->nw->offset, error);
515 	}
516 	wa->error = error;
517 
518 	return (0);
519 }
520 
521 static void
522 dump_rtable_one(struct netlink_walkargs *wa, uint32_t fibnum, int family)
523 {
524 	FIB_LOG(LOG_DEBUG2, fibnum, family, "Start dump");
525 	wa->count = 0;
526 	wa->dumped = 0;
527 
528 	rib_walk(fibnum, family, false, dump_rtentry, wa);
529 
530 	wa->dumped_tables++;
531 
532 	FIB_LOG(LOG_DEBUG2, fibnum, family, "End dump, iterated %d dumped %d",
533 	    wa->count, wa->dumped);
534 	NL_LOG(LOG_DEBUG2, "Current offset: %d", wa->nw->offset);
535 }
536 
537 static int
538 dump_rtable_fib(struct netlink_walkargs *wa, uint32_t fibnum, int family)
539 {
540 	wa->fibnum = fibnum;
541 
542 	if (family == AF_UNSPEC) {
543 		for (int i = 0; i < AF_MAX; i++) {
544 			if (rt_tables_get_rnh(fibnum, i) != 0) {
545 				wa->family = i;
546 				dump_rtable_one(wa, fibnum, i);
547 				if (wa->error != 0)
548 					break;
549 			}
550 		}
551 	} else {
552 		if (rt_tables_get_rnh(fibnum, family) != 0) {
553 			wa->family = family;
554 			dump_rtable_one(wa, fibnum, family);
555 		}
556 	}
557 
558 	return (wa->error);
559 }
560 
561 static int
562 handle_rtm_getroute(struct nlpcb *nlp, struct nl_parsed_route *attrs,
563     struct nlmsghdr *hdr, struct nl_pstate *npt)
564 {
565 	RIB_RLOCK_TRACKER;
566 	struct rib_head *rnh;
567 	struct rtentry *rt;
568 	uint32_t fibnum = attrs->rta_table;
569 	sa_family_t family = attrs->rtm_family;
570 
571 	if (attrs->rta_dst == NULL) {
572 		NLMSG_REPORT_ERR_MSG(npt, "No RTA_DST supplied");
573 			return (EINVAL);
574 	}
575 
576 	FIB_LOG(LOG_DEBUG, fibnum, family, "getroute called");
577 
578 	rnh = rt_tables_get_rnh(fibnum, family);
579 	if (rnh == NULL)
580 		return (EAFNOSUPPORT);
581 
582 	RIB_RLOCK(rnh);
583 
584 	rt = (struct rtentry *)rnh->rnh_matchaddr(attrs->rta_dst, &rnh->head);
585 	if (rt == NULL) {
586 		RIB_RUNLOCK(rnh);
587 		return (ESRCH);
588 	}
589 
590 	struct route_nhop_data rnd;
591 	rt_get_rnd(rt, &rnd);
592 	rnd.rnd_nhop = nhop_select_func(rnd.rnd_nhop, 0);
593 
594 	RIB_RUNLOCK(rnh);
595 
596 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
597 		char rtbuf[NHOP_PRINT_BUFSIZE] __unused, nhbuf[NHOP_PRINT_BUFSIZE] __unused;
598 		FIB_LOG(LOG_DEBUG2, fibnum, family, "getroute completed: got %s for %s",
599 		    nhop_print_buf_any(rnd.rnd_nhop, nhbuf, sizeof(nhbuf)),
600 		    rt_print_buf(rt, rtbuf, sizeof(rtbuf)));
601 	}
602 
603 	hdr->nlmsg_type = NL_RTM_NEWROUTE;
604 	dump_px(fibnum, hdr, rt, &rnd, npt->nw);
605 
606 	return (0);
607 }
608 
609 static int
610 handle_rtm_dump(struct nlpcb *nlp, uint32_t fibnum, int family,
611     struct nlmsghdr *hdr, struct nl_writer *nw)
612 {
613 	struct netlink_walkargs wa = {
614 		.nlp = nlp,
615 		.nw = nw,
616 		.hdr.nlmsg_pid = hdr->nlmsg_pid,
617 		.hdr.nlmsg_seq = hdr->nlmsg_seq,
618 		.hdr.nlmsg_type = NL_RTM_NEWROUTE,
619 		.hdr.nlmsg_flags = hdr->nlmsg_flags | NLM_F_MULTI,
620 	};
621 
622 	if (fibnum == RT_TABLE_UNSPEC) {
623 		for (int i = 0; i < V_rt_numfibs; i++) {
624 			dump_rtable_fib(&wa, fibnum, family);
625 			if (wa.error != 0)
626 				break;
627 		}
628 	} else
629 		dump_rtable_fib(&wa, fibnum, family);
630 
631 	if (wa.error == 0 && wa.dumped_tables == 0) {
632 		FIB_LOG(LOG_DEBUG, fibnum, family, "incorrect fibnum/family");
633 		wa.error = ESRCH;
634 		// How do we propagate it?
635 	}
636 
637 	if (!nlmsg_end_dump(wa.nw, wa.error, &wa.hdr)) {
638                 NL_LOG(LOG_DEBUG, "Unable to finalize the dump");
639                 return (ENOMEM);
640         }
641 
642 	return (wa.error);
643 }
644 
645 static struct nhop_object *
646 finalize_nhop(struct nhop_object *nh, int *perror)
647 {
648 	/*
649 	 * The following MUST be filled:
650 	 *  nh_ifp, nh_ifa, nh_gw
651 	 */
652 	if (nh->gw_sa.sa_family == 0) {
653 		/*
654 		 * Empty gateway. Can be direct route with RTA_OIF set.
655 		 */
656 		if (nh->nh_ifp != NULL)
657 			nhop_set_direct_gw(nh, nh->nh_ifp);
658 		else {
659 			NL_LOG(LOG_DEBUG, "empty gateway and interface, skipping");
660 			*perror = EINVAL;
661 			return (NULL);
662 		}
663 		/* Both nh_ifp and gateway are set */
664 	} else {
665 		/* Gateway is set up, we can derive ifp if not set */
666 		if (nh->nh_ifp == NULL) {
667 			struct ifaddr *ifa = ifa_ifwithnet(&nh->gw_sa, 1, nhop_get_fibnum(nh));
668 			if (ifa == NULL) {
669 				NL_LOG(LOG_DEBUG, "Unable to determine ifp, skipping");
670 				*perror = EINVAL;
671 				return (NULL);
672 			}
673 			nhop_set_transmit_ifp(nh, ifa->ifa_ifp);
674 		}
675 	}
676 	/* Both nh_ifp and gateway are set */
677 	if (nh->nh_ifa == NULL) {
678 		struct ifaddr *ifa = ifaof_ifpforaddr(&nh->gw_sa, nh->nh_ifp);
679 		if (ifa == NULL) {
680 			NL_LOG(LOG_DEBUG, "Unable to determine ifa, skipping");
681 			*perror = EINVAL;
682 			return (NULL);
683 		}
684 		nhop_set_src(nh, ifa);
685 	}
686 
687 	return (nhop_get_nhop(nh, perror));
688 }
689 
690 static int
691 get_pxflag(const struct nl_parsed_route *attrs)
692 {
693 	int pxflag = 0;
694 	switch (attrs->rtm_family) {
695 	case AF_INET:
696 		if (attrs->rtm_dst_len == 32)
697 			pxflag = NHF_HOST;
698 		else if (attrs->rtm_dst_len == 0)
699 			pxflag = NHF_DEFAULT;
700 		break;
701 	case AF_INET6:
702 		if (attrs->rtm_dst_len == 32)
703 			pxflag = NHF_HOST;
704 		else if (attrs->rtm_dst_len == 0)
705 			pxflag = NHF_DEFAULT;
706 		break;
707 	}
708 
709 	return (pxflag);
710 }
711 
712 static int
713 get_op_flags(int nlm_flags)
714 {
715 	int op_flags = 0;
716 
717 	op_flags |= (nlm_flags & NLM_F_REPLACE) ? RTM_F_REPLACE : 0;
718 	op_flags |= (nlm_flags & NLM_F_EXCL) ? RTM_F_EXCL : 0;
719 	op_flags |= (nlm_flags & NLM_F_CREATE) ? RTM_F_CREATE : 0;
720 	op_flags |= (nlm_flags & NLM_F_APPEND) ? RTM_F_APPEND : 0;
721 
722 	return (op_flags);
723 }
724 
#ifdef ROUTE_MPATH
/*
 * Builds one member nexthop of a multipath route from a parsed
 * RTA_MULTIPATH entry @mpnh, using the table, family, rtflags and
 * protocol of the enclosing request @attrs.  A gateway is mandatory; the
 * interface is optional and derived by finalize_nhop() if absent.
 *
 * On success stores the nexthop in *@pnh and returns 0.  Otherwise
 * returns EINVAL (no gateway), ENOMEM (allocation failure) or the error
 * reported while finalizing.  @npt is not used.
 */
static int
create_nexthop_one(struct nl_parsed_route *attrs, struct rta_mpath_nh *mpnh,
    struct nl_pstate *npt, struct nhop_object **pnh)
{
	/* Start from "no error" so a success path that leaves it untouched is sane */
	int error = 0;

	if (mpnh->gw == NULL)
		return (EINVAL);

	struct nhop_object *nh = nhop_alloc(attrs->rta_table, attrs->rtm_family);
	if (nh == NULL)
		return (ENOMEM);

	nhop_set_gw(nh, mpnh->gw, true);
	if (mpnh->ifp != NULL)
		nhop_set_transmit_ifp(nh, mpnh->ifp);
	nhop_set_rtflags(nh, attrs->rta_rtflags);
	/* Only protocols above RTPROT_STATIC are recorded as origin */
	if (attrs->rtm_protocol > RTPROT_STATIC)
		nhop_set_origin(nh, attrs->rtm_protocol);

	*pnh = finalize_nhop(nh, &error);

	return (error);
}
#endif
751 
752 static struct nhop_object *
753 create_nexthop_from_attrs(struct nl_parsed_route *attrs,
754     struct nl_pstate *npt, int *perror)
755 {
756 	struct nhop_object *nh = NULL;
757 	int error = 0;
758 
759 	if (attrs->rta_multipath != NULL) {
760 #ifdef ROUTE_MPATH
761 		/* Multipath w/o explicit nexthops */
762 		int num_nhops = attrs->rta_multipath->num_nhops;
763 		struct weightened_nhop *wn = npt_alloc(npt, sizeof(*wn) * num_nhops);
764 
765 		for (int i = 0; i < num_nhops; i++) {
766 			struct rta_mpath_nh *mpnh = &attrs->rta_multipath->nhops[i];
767 
768 			error = create_nexthop_one(attrs, mpnh, npt, &wn[i].nh);
769 			if (error != 0) {
770 				for (int j = 0; j < i; j++)
771 					nhop_free(wn[j].nh);
772 				break;
773 			}
774 			wn[i].weight = mpnh->rtnh_weight > 0 ? mpnh->rtnh_weight : 1;
775 		}
776 		if (error == 0) {
777 			struct rib_head *rh = nhop_get_rh(wn[0].nh);
778 			struct nhgrp_object *nhg;
779 
780 			nhg = nhgrp_alloc(rh->rib_fibnum, rh->rib_family,
781 			    wn, num_nhops, perror);
782 			if (nhg != NULL) {
783 				if (attrs->rtm_protocol > RTPROT_STATIC)
784 					nhgrp_set_origin(nhg, attrs->rtm_protocol);
785 				nhg = nhgrp_get_nhgrp(nhg, perror);
786 			}
787 			for (int i = 0; i < num_nhops; i++)
788 				nhop_free(wn[i].nh);
789 			if (nhg != NULL)
790 				return ((struct nhop_object *)nhg);
791 			error = *perror;
792 		}
793 #else
794 		error = ENOTSUP;
795 #endif
796 		*perror = error;
797 	} else {
798 		nh = nhop_alloc(attrs->rta_table, attrs->rtm_family);
799 		if (nh == NULL) {
800 			*perror = ENOMEM;
801 			return (NULL);
802 		}
803 		if (attrs->rta_gw != NULL)
804 			nhop_set_gw(nh, attrs->rta_gw, true);
805 		if (attrs->rta_oif != NULL)
806 			nhop_set_transmit_ifp(nh, attrs->rta_oif);
807 		if (attrs->rtax_mtu != 0)
808 			nhop_set_mtu(nh, attrs->rtax_mtu, true);
809 		if (attrs->rta_rtflags & RTF_BROADCAST)
810 			nhop_set_broadcast(nh, true);
811 		if (attrs->rta_rtflags & RTF_BLACKHOLE)
812 			nhop_set_blackhole(nh, NHF_BLACKHOLE);
813 		if (attrs->rta_rtflags & RTF_REJECT)
814 			nhop_set_blackhole(nh, NHF_REJECT);
815 		nhop_set_rtflags(nh, attrs->rta_rtflags);
816 		if (attrs->rtm_protocol > RTPROT_STATIC)
817 			nhop_set_origin(nh, attrs->rtm_protocol);
818 		nh = finalize_nhop(nh, perror);
819 	}
820 
821 	return (nh);
822 }
823 
824 static int
825 rtnl_handle_newroute(struct nlmsghdr *hdr, struct nlpcb *nlp,
826     struct nl_pstate *npt)
827 {
828 	struct rib_cmd_info rc = {};
829 	struct nhop_object *nh = NULL;
830 	int error;
831 
832 	struct nl_parsed_route attrs = {};
833 	error = nl_parse_nlmsg(hdr, &rtm_parser, npt, &attrs);
834 	if (error != 0)
835 		return (error);
836 
837 	/* Check if we have enough data */
838 	if (attrs.rta_dst == NULL) {
839 		NL_LOG(LOG_DEBUG, "missing RTA_DST");
840 		return (EINVAL);
841 	}
842 
843 	if (attrs.rta_nh_id != 0) {
844 		/* Referenced uindex */
845 		int pxflag = get_pxflag(&attrs);
846 		nh = nl_find_nhop(attrs.rta_table, attrs.rtm_family, attrs.rta_nh_id,
847 		    pxflag, &error);
848 		if (error != 0)
849 			return (error);
850 	} else {
851 		nh = create_nexthop_from_attrs(&attrs, npt, &error);
852 		if (error != 0) {
853 			NL_LOG(LOG_DEBUG, "Error creating nexthop");
854 			return (error);
855 		}
856 	}
857 
858 	if (!NH_IS_NHGRP(nh) && attrs.rta_weight == 0)
859 		attrs.rta_weight = RT_DEFAULT_WEIGHT;
860 	struct route_nhop_data rnd = { .rnd_nhop = nh, .rnd_weight = attrs.rta_weight };
861 	int op_flags = get_op_flags(hdr->nlmsg_flags);
862 
863 	error = rib_add_route_px(attrs.rta_table, attrs.rta_dst, attrs.rtm_dst_len,
864 	    &rnd, op_flags, &rc);
865 	if (error == 0)
866 		report_operation(attrs.rta_table, &rc, nlp, hdr);
867 	return (error);
868 }
869 
870 static int
871 path_match_func(const struct rtentry *rt, const struct nhop_object *nh, void *_data)
872 {
873 	struct nl_parsed_route *attrs = (struct nl_parsed_route *)_data;
874 
875 	if ((attrs->rta_gw != NULL) && !rib_match_gw(rt, nh, attrs->rta_gw))
876 		return (0);
877 
878 	if ((attrs->rta_oif != NULL) && (attrs->rta_oif != nh->nh_ifp))
879 		return (0);
880 
881 	return (1);
882 }
883 
884 static int
885 rtnl_handle_delroute(struct nlmsghdr *hdr, struct nlpcb *nlp,
886     struct nl_pstate *npt)
887 {
888 	struct rib_cmd_info rc;
889 	int error;
890 
891 	struct nl_parsed_route attrs = {};
892 	error = nl_parse_nlmsg(hdr, &rtm_parser, npt, &attrs);
893 	if (error != 0)
894 		return (error);
895 
896 	if (attrs.rta_dst == NULL) {
897 		NLMSG_REPORT_ERR_MSG(npt, "RTA_DST is not set");
898 		return (ESRCH);
899 	}
900 
901 	error = rib_del_route_px(attrs.rta_table, attrs.rta_dst,
902 	    attrs.rtm_dst_len, path_match_func, &attrs, 0, &rc);
903 	if (error == 0)
904 		report_operation(attrs.rta_table, &rc, nlp, hdr);
905 	return (error);
906 }
907 
908 static int
909 rtnl_handle_getroute(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt)
910 {
911 	int error;
912 
913 	struct nl_parsed_route attrs = {};
914 	error = nl_parse_nlmsg(hdr, &rtm_parser, npt, &attrs);
915 	if (error != 0)
916 		return (error);
917 
918 	if (hdr->nlmsg_flags & NLM_F_DUMP)
919 		error = handle_rtm_dump(nlp, attrs.rta_table, attrs.rtm_family, hdr, npt->nw);
920 	else
921 		error = handle_rtm_getroute(nlp, &attrs, hdr, npt);
922 
923 	return (error);
924 }
925 
926 void
927 rtnl_handle_route_event(uint32_t fibnum, const struct rib_cmd_info *rc)
928 {
929 	struct nl_writer nw = {};
930 	int family, nlm_flags = 0;
931 
932 	family = rt_get_family(rc->rc_rt);
933 
934 	/* XXX: check if there are active listeners first */
935 
936 	/* TODO: consider passing PID/type/seq */
937 	switch (rc->rc_cmd) {
938 	case RTM_ADD:
939 		nlm_flags = NLM_F_EXCL | NLM_F_CREATE;
940 		break;
941 	case RTM_CHANGE:
942 		nlm_flags = NLM_F_REPLACE;
943 		break;
944 	case RTM_DELETE:
945 		nlm_flags = 0;
946 		break;
947 	}
948 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
949 		char rtbuf[NHOP_PRINT_BUFSIZE] __unused;
950 		FIB_LOG(LOG_DEBUG2, fibnum, family,
951 		    "received event %s for %s / nlm_flags=%X",
952 		    rib_print_cmd(rc->rc_cmd),
953 		    rt_print_buf(rc->rc_rt, rtbuf, sizeof(rtbuf)),
954 		    nlm_flags);
955 	}
956 
957 	struct nlmsghdr hdr = {
958 		.nlmsg_flags = nlm_flags,
959 		.nlmsg_type = get_rtmsg_type_from_rtsock(rc->rc_cmd),
960 	};
961 
962 	struct route_nhop_data rnd = {
963 		.rnd_nhop = rc_get_nhop(rc),
964 		.rnd_weight = rc->rc_nh_weight,
965 	};
966 
967 	uint32_t group_id = family_to_group(family);
968 	if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, group_id)) {
969 		NL_LOG(LOG_DEBUG, "error allocating event buffer");
970 		return;
971 	}
972 
973 	dump_px(fibnum, &hdr, rc->rc_rt, &rnd, &nw);
974 	nlmsg_flush(&nw);
975 }
976 
/*
 * Route message handlers registered by rtnl_routes_init().
 * RTM_GETROUTE carries no .priv entry; the modifying commands
 * (RTM_DELROUTE, RTM_NEWROUTE) are tagged with PRIV_NET_ROUTE.
 */
static const struct rtnl_cmd_handler cmd_handlers[] = {
	{
		.cmd = NL_RTM_GETROUTE,
		.name = "RTM_GETROUTE",
		.cb = &rtnl_handle_getroute,
	},
	{
		.cmd = NL_RTM_DELROUTE,
		.name = "RTM_DELROUTE",
		.cb = &rtnl_handle_delroute,
		.priv = PRIV_NET_ROUTE,
	},
	{
		.cmd = NL_RTM_NEWROUTE,
		.name = "RTM_NEWROUTE",
		.cb = &rtnl_handle_newroute,
		.priv = PRIV_NET_ROUTE,
	}
};
996 
/* Every parser declared in this file; passed to NL_VERIFY_PARSERS() at init */
static const struct nlhdr_parser *all_parsers[] = {&mpath_parser, &metrics_parser, &rtm_parser};
998 
999 void
1000 rtnl_routes_init(void)
1001 {
1002 	NL_VERIFY_PARSERS(all_parsers);
1003 	rtnl_register_messages(cmd_handlers, NL_ARRAY_LEN(cmd_handlers));
1004 }
1005