xref: /freebsd/sys/netlink/route/rt.c (revision 46ac8f2e7d9601311eb9b3cd2fed138ff4a11a66)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2021 Ng Peng Nam Sean
5  * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include "opt_netlink.h"
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 #include "opt_inet.h"
34 #include "opt_inet6.h"
35 #include "opt_route.h"
36 #include <sys/types.h>
37 #include <sys/malloc.h>
38 #include <sys/rmlock.h>
39 #include <sys/socket.h>
40 
41 #include <net/if.h>
42 #include <net/route.h>
43 #include <net/route/nhop.h>
44 #include <net/route/route_ctl.h>
45 #include <net/route/route_var.h>
46 #include <netinet6/scope6_var.h>
47 #include <netlink/netlink.h>
48 #include <netlink/netlink_ctl.h>
49 #include <netlink/netlink_route.h>
50 #include <netlink/route/route_var.h>
51 
52 #define	DEBUG_MOD_NAME	nl_route
53 #define	DEBUG_MAX_LEVEL	LOG_DEBUG3
54 #include <netlink/netlink_debug.h>
55 _DECLARE_DEBUG(LOG_DEBUG);
56 
57 static unsigned char
58 get_rtm_type(const struct nhop_object *nh)
59 {
60 	int nh_flags = nh->nh_flags;
61 
62 	/* Use the fact that nhg runtime flags are only NHF_MULTIPATH */
63 	if (nh_flags & NHF_BLACKHOLE)
64 		return (RTN_BLACKHOLE);
65 	else if (nh_flags & NHF_REJECT)
66 		return (RTN_PROHIBIT);
67 	return (RTN_UNICAST);
68 }
69 
70 static uint8_t
71 nl_get_rtm_protocol(const struct nhop_object *nh)
72 {
73 #ifdef ROUTE_MPATH
74 	if (NH_IS_NHGRP(nh)) {
75 		const struct nhgrp_object *nhg = (const struct nhgrp_object *)nh;
76 		uint8_t origin = nhgrp_get_origin(nhg);
77 		if (origin != RTPROT_UNSPEC)
78 			return (origin);
79 		nh = nhg->nhops[0];
80 	}
81 #endif
82 	uint8_t origin = nhop_get_origin(nh);
83 	if (origin != RTPROT_UNSPEC)
84 		return (origin);
85 	/* TODO: remove guesswork once all kernel users fill in origin */
86 	int rt_flags = nhop_get_rtflags(nh);
87 	if (rt_flags & RTF_PROTO1)
88 		return (RTPROT_ZEBRA);
89 	if (rt_flags & RTF_STATIC)
90 		return (RTPROT_STATIC);
91 	return (RTPROT_KERNEL);
92 }
93 
94 static int
95 get_rtmsg_type_from_rtsock(int cmd)
96 {
97 	switch (cmd) {
98 	case RTM_ADD:
99 	case RTM_CHANGE:
100 	case RTM_GET:
101 		return NL_RTM_NEWROUTE;
102 	case RTM_DELETE:
103 		return NL_RTM_DELROUTE;
104 	}
105 
106 	return (0);
107 }
108 
109 /*
110  * fibnum heuristics
111  *
112  * if (dump && rtm_table == 0 && !rta_table) RT_ALL_FIBS
113  * msg                rtm_table     RTA_TABLE            result
114  * RTM_GETROUTE/dump          0             -       RT_ALL_FIBS
115  * RTM_GETROUTE/dump          1             -                 1
116  * RTM_GETROUTE/get           0             -                 0
117  *
118  */
119 
120 static struct nhop_object *
121 rc_get_nhop(const struct rib_cmd_info *rc)
122 {
123 	return ((rc->rc_cmd == RTM_DELETE) ? rc->rc_nh_old : rc->rc_nh_new);
124 }
125 
126 static void
127 dump_rc_nhop_gw(struct nl_writer *nw, const struct nhop_object *nh)
128 {
129 #ifdef INET6
130 	int upper_family;
131 #endif
132 
133 	switch (nhop_get_neigh_family(nh)) {
134 	case AF_LINK:
135 		/* onlink prefix, skip */
136 		break;
137 	case AF_INET:
138 		nlattr_add(nw, NL_RTA_GATEWAY, 4, &nh->gw4_sa.sin_addr);
139 		break;
140 #ifdef INET6
141 	case AF_INET6:
142 		upper_family = nhop_get_upper_family(nh);
143 		if (upper_family == AF_INET6) {
144 			struct in6_addr gw6 = nh->gw6_sa.sin6_addr;
145 			in6_clearscope(&gw6);
146 
147 			nlattr_add(nw, NL_RTA_GATEWAY, 16, &gw6);
148 		} else if (upper_family == AF_INET) {
149 			/* IPv4 over IPv6 */
150 			struct in6_addr gw6 = nh->gw6_sa.sin6_addr;
151 			in6_clearscope(&gw6);
152 
153 			char buf[20];
154 			struct rtvia *via = (struct rtvia *)&buf[0];
155 			via->rtvia_family = AF_INET6;
156 			memcpy(via->rtvia_addr, &gw6, 16);
157 			nlattr_add(nw, NL_RTA_VIA, 17, via);
158 		}
159 		break;
160 #endif
161 	}
162 }
163 
164 static void
165 dump_rc_nhop_mtu(struct nl_writer *nw, const struct nhop_object *nh)
166 {
167 	int nla_len = sizeof(struct nlattr) * 2 + sizeof(uint32_t);
168 	struct nlattr *nla = nlmsg_reserve_data(nw, nla_len, struct nlattr);
169 
170 	if (nla == NULL)
171 		return;
172 	nla->nla_type = NL_RTA_METRICS;
173 	nla->nla_len = nla_len;
174 	nla++;
175 	nla->nla_type = NL_RTAX_MTU;
176 	nla->nla_len = sizeof(struct nlattr) + sizeof(uint32_t);
177 	*((uint32_t *)(nla + 1)) = nh->nh_mtu;
178 }
179 
180 #ifdef ROUTE_MPATH
181 static void
182 dump_rc_nhg(struct nl_writer *nw, const struct nhgrp_object *nhg, struct rtmsg *rtm)
183 {
184 	uint32_t uidx = nhgrp_get_uidx(nhg);
185 	uint32_t num_nhops;
186 	const struct weightened_nhop *wn = nhgrp_get_nhops(nhg, &num_nhops);
187 	uint32_t base_rtflags = nhop_get_rtflags(wn[0].nh);
188 
189 	if (uidx != 0)
190 		nlattr_add_u32(nw, NL_RTA_NH_ID, uidx);
191 	nlattr_add_u32(nw, NL_RTA_KNH_ID, nhgrp_get_idx(nhg));
192 
193 	nlattr_add_u32(nw, NL_RTA_RTFLAGS, base_rtflags);
194 	int off = nlattr_add_nested(nw, NL_RTA_MULTIPATH);
195 	if (off == 0)
196 		return;
197 
198 	for (int i = 0; i < num_nhops; i++) {
199 		int nh_off = nlattr_save_offset(nw);
200 		struct rtnexthop *rtnh = nlmsg_reserve_object(nw, struct rtnexthop);
201 		if (rtnh == NULL)
202 			return;
203 		rtnh->rtnh_flags = 0;
204 		rtnh->rtnh_ifindex = wn[i].nh->nh_ifp->if_index;
205 		rtnh->rtnh_hops = wn[i].weight;
206 		dump_rc_nhop_gw(nw, wn[i].nh);
207 		uint32_t rtflags = nhop_get_rtflags(wn[i].nh);
208 		if (rtflags != base_rtflags)
209 			nlattr_add_u32(nw, NL_RTA_RTFLAGS, rtflags);
210 		if (rtflags & RTF_FIXEDMTU)
211 			dump_rc_nhop_mtu(nw, wn[i].nh);
212 		rtnh = nlattr_restore_offset(nw, nh_off, struct rtnexthop);
213 		/*
214 		 * nlattr_add() allocates 4-byte aligned storage, no need to aligh
215 		 * length here
216 		 * */
217 		rtnh->rtnh_len = nlattr_save_offset(nw) - nh_off;
218 	}
219 	nlattr_set_len(nw, off);
220 }
221 #endif
222 
223 static void
224 dump_rc_nhop(struct nl_writer *nw, const struct route_nhop_data *rnd, struct rtmsg *rtm)
225 {
226 #ifdef ROUTE_MPATH
227 	if (NH_IS_NHGRP(rnd->rnd_nhop)) {
228 		dump_rc_nhg(nw, rnd->rnd_nhgrp, rtm);
229 		return;
230 	}
231 #endif
232 	const struct nhop_object *nh = rnd->rnd_nhop;
233 	uint32_t rtflags = nhop_get_rtflags(nh);
234 
235 	/*
236 	 * IPv4 over IPv6
237 	 *    ('RTA_VIA', {'family': 10, 'addr': 'fe80::20c:29ff:fe67:2dd'}), ('RTA_OIF', 2),
238 	 * IPv4 w/ gw
239 	 *    ('RTA_GATEWAY', '172.16.107.131'), ('RTA_OIF', 2)],
240 	 * Direct route:
241 	 *    ('RTA_OIF', 2)
242 	 */
243 	if (nh->nh_flags & NHF_GATEWAY)
244 		dump_rc_nhop_gw(nw, nh);
245 
246 	uint32_t uidx = nhop_get_uidx(nh);
247 	if (uidx != 0)
248 		nlattr_add_u32(nw, NL_RTA_NH_ID, uidx);
249 	nlattr_add_u32(nw, NL_RTA_KNH_ID, nhop_get_idx(nh));
250 	nlattr_add_u32(nw, NL_RTA_RTFLAGS, rtflags);
251 
252 	if (rtflags & RTF_FIXEDMTU)
253 		dump_rc_nhop_mtu(nw, nh);
254 	uint32_t nh_expire = nhop_get_expire(nh);
255 	if (nh_expire > 0)
256 		nlattr_add_u32(nw, NL_RTA_EXPIRES, nh_expire - time_uptime);
257 
258 	/* In any case, fill outgoing interface */
259 	nlattr_add_u32(nw, NL_RTA_OIF, nh->nh_ifp->if_index);
260 
261 	if (rnd->rnd_weight != RT_DEFAULT_WEIGHT)
262 		nlattr_add_u32(nw, NL_RTA_WEIGHT, rnd->rnd_weight);
263 }
264 
265 /*
266  * Dumps output from a rib command into an rtmsg
267  */
268 
269 static int
270 dump_px(uint32_t fibnum, const struct nlmsghdr *hdr,
271     const struct rtentry *rt, struct route_nhop_data *rnd,
272     struct nl_writer *nw)
273 {
274 	struct rtmsg *rtm;
275 	int error = 0;
276 
277 	NET_EPOCH_ASSERT();
278 
279 	if (!nlmsg_reply(nw, hdr, sizeof(struct rtmsg)))
280 		goto enomem;
281 
282 	int family = rt_get_family(rt);
283 	int rtm_off = nlattr_save_offset(nw);
284 	rtm = nlmsg_reserve_object(nw, struct rtmsg);
285 	rtm->rtm_family = family;
286 	rtm->rtm_dst_len = 0;
287 	rtm->rtm_src_len = 0;
288 	rtm->rtm_tos = 0;
289 	if (fibnum < 255)
290 		rtm->rtm_table = (unsigned char)fibnum;
291 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
292 	rtm->rtm_protocol = nl_get_rtm_protocol(rnd->rnd_nhop);
293 	rtm->rtm_type = get_rtm_type(rnd->rnd_nhop);
294 
295 	nlattr_add_u32(nw, NL_RTA_TABLE, fibnum);
296 
297 	int plen = 0;
298 #if defined(INET) || defined(INET6)
299 	uint32_t scopeid;
300 #endif
301 	switch (family) {
302 #ifdef INET
303 	case AF_INET:
304 		{
305 			struct in_addr addr;
306 			rt_get_inet_prefix_plen(rt, &addr, &plen, &scopeid);
307 			nlattr_add(nw, NL_RTA_DST, 4, &addr);
308 			break;
309 		}
310 #endif
311 #ifdef INET6
312 	case AF_INET6:
313 		{
314 			struct in6_addr addr;
315 			rt_get_inet6_prefix_plen(rt, &addr, &plen, &scopeid);
316 			nlattr_add(nw, NL_RTA_DST, 16, &addr);
317 			break;
318 		}
319 #endif
320 	default:
321 		FIB_LOG(LOG_NOTICE, fibnum, family, "unsupported rt family: %d", family);
322 		error = EAFNOSUPPORT;
323 		goto flush;
324 	}
325 
326 	rtm = nlattr_restore_offset(nw, rtm_off, struct rtmsg);
327 	if (plen > 0)
328 		rtm->rtm_dst_len = plen;
329 	dump_rc_nhop(nw, rnd, rtm);
330 
331 	if (nlmsg_end(nw))
332 		return (0);
333 enomem:
334 	error = ENOMEM;
335 flush:
336 	nlmsg_abort(nw);
337 	return (error);
338 }
339 
340 static int
341 family_to_group(int family)
342 {
343 	switch (family) {
344 	case AF_INET:
345 		return (RTNLGRP_IPV4_ROUTE);
346 	case AF_INET6:
347 		return (RTNLGRP_IPV6_ROUTE);
348 	}
349 	return (0);
350 }
351 
352 
353 static void
354 report_operation(uint32_t fibnum, struct rib_cmd_info *rc,
355     struct nlpcb *nlp, struct nlmsghdr *hdr)
356 {
357 	struct nl_writer nw = {};
358 	uint32_t group_id = family_to_group(rt_get_family(rc->rc_rt));
359 
360 	if (nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, group_id)) {
361 		struct route_nhop_data rnd = {
362 			.rnd_nhop = rc_get_nhop(rc),
363 			.rnd_weight = rc->rc_nh_weight,
364 		};
365 		hdr->nlmsg_flags &= ~(NLM_F_REPLACE | NLM_F_CREATE);
366 		hdr->nlmsg_flags &= ~(NLM_F_EXCL | NLM_F_APPEND);
367 		switch (rc->rc_cmd) {
368 		case RTM_ADD:
369 			hdr->nlmsg_type = NL_RTM_NEWROUTE;
370 			hdr->nlmsg_flags |= NLM_F_CREATE | NLM_F_EXCL;
371 			break;
372 		case RTM_CHANGE:
373 			hdr->nlmsg_type = NL_RTM_NEWROUTE;
374 			hdr->nlmsg_flags |= NLM_F_REPLACE;
375 			break;
376 		case RTM_DELETE:
377 			hdr->nlmsg_type = NL_RTM_DELROUTE;
378 			break;
379 		}
380 		dump_px(fibnum, hdr, rc->rc_rt, &rnd, &nw);
381 		nlmsg_flush(&nw);
382 	}
383 
384 	rtsock_callback_p->route_f(fibnum, rc);
385 }
386 
/*
 * Parsed form of one struct rtnexthop entry of an RTA_MULTIPATH
 * attribute, as filled in by mpath_parser.
 */
struct rta_mpath_nh {
	struct sockaddr	*gw;		/* from NL_RTA_GATEWAY or NL_RTA_VIA */
	struct ifnet	*ifp;		/* resolved from rtnh_ifindex */
	uint8_t		rtnh_flags;	/* copy of rtnh_flags */
	uint8_t		rtnh_weight;	/* copy of rtnh_hops */
};
393 
/*
 * Parser description for a single multipath entry: struct rtnexthop
 * header fields and trailing attributes are stored into
 * struct rta_mpath_nh. _IN/_OUT are local shorthands for offsets into
 * the input header and the output structure.
 */
#define	_IN(_field)	offsetof(struct rtnexthop, _field)
#define	_OUT(_field)	offsetof(struct rta_mpath_nh, _field)
/* Attributes: both gateway encodings land in the same 'gw' field */
const static struct nlattr_parser nla_p_rtnh[] = {
	{ .type = NL_RTA_GATEWAY, .off = _OUT(gw), .cb = nlattr_get_ip },
	{ .type = NL_RTA_VIA, .off = _OUT(gw), .cb = nlattr_get_ipvia },
};
/* Fixed header fields; note rtnh_hops is stored as the nexthop weight */
const static struct nlfield_parser nlf_p_rtnh[] = {
	{ .off_in = _IN(rtnh_flags), .off_out = _OUT(rtnh_flags), .cb = nlf_get_u8 },
	{ .off_in = _IN(rtnh_hops), .off_out = _OUT(rtnh_weight), .cb = nlf_get_u8 },
	{ .off_in = _IN(rtnh_ifindex), .off_out = _OUT(ifp), .cb = nlf_get_ifpz },
};
#undef _IN
#undef _OUT
NL_DECLARE_PARSER(mpath_parser, struct rtnexthop, nlf_p_rtnh, nla_p_rtnh);
408 
/*
 * Parsed RTA_MULTIPATH attribute: a counted list of nexthop entries.
 * Storage for 'nhops' is allocated together with the structure by
 * nlattr_get_multipath().
 */
struct rta_mpath {
	int num_nhops;			/* number of valid entries in nhops[] */
	struct rta_mpath_nh nhops[0];	/* trailing variable-length array */
};
413 
414 static int
415 nlattr_get_multipath(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target)
416 {
417 	int data_len = nla->nla_len - sizeof(struct nlattr);
418 	struct rtnexthop *rtnh;
419 
420 	int max_nhops = data_len / sizeof(struct rtnexthop);
421 
422 	struct rta_mpath *mp = npt_alloc(npt, (max_nhops + 2) * sizeof(struct rta_mpath_nh));
423 	mp->num_nhops = 0;
424 
425 	for (rtnh = (struct rtnexthop *)(nla + 1); data_len > 0; ) {
426 		struct rta_mpath_nh *mpnh = &mp->nhops[mp->num_nhops++];
427 
428 		int error = nl_parse_header(rtnh, rtnh->rtnh_len, &mpath_parser,
429 		    npt, mpnh);
430 		if (error != 0) {
431 			NLMSG_REPORT_ERR_MSG(npt, "RTA_MULTIPATH: nexhop %d: parse failed",
432 			    mp->num_nhops - 1);
433 			return (error);
434 		}
435 
436 		int len = NL_ITEM_ALIGN(rtnh->rtnh_len);
437 		data_len -= len;
438 		rtnh = (struct rtnexthop *)((char *)rtnh + len);
439 	}
440 	if (data_len != 0 || mp->num_nhops == 0) {
441 		NLMSG_REPORT_ERR_MSG(npt, "invalid RTA_MULTIPATH attr");
442 		return (EINVAL);
443 	}
444 
445 	*((struct rta_mpath **)target) = mp;
446 	return (0);
447 }
448 
449 
/*
 * Result of parsing an RTM_{NEW,DEL,GET}ROUTE request with rtm_parser.
 * Fields keep their zero/NULL initial value when the corresponding
 * attribute is absent from the message.
 */
struct nl_parsed_route {
	struct sockaddr		*rta_dst;	/* NL_RTA_DST */
	struct sockaddr		*rta_gw;	/* NL_RTA_GATEWAY or NL_RTA_VIA */
	struct ifnet		*rta_oif;	/* NL_RTA_OIF */
	struct rta_mpath	*rta_multipath;	/* NL_RTA_MULTIPATH */
	uint32_t		rta_table;	/* NL_RTA_TABLE */
	uint32_t		rta_rtflags;	/* NL_RTA_RTFLAGS */
	uint32_t		rta_nh_id;	/* NL_RTA_NH_ID */
	uint32_t		rta_weight;	/* NL_RTA_WEIGHT */
	uint32_t		rtax_mtu;	/* NL_RTAX_MTU nested in NL_RTA_METRICS */
	uint8_t			rtm_family;	/* rtmsg header: rtm_family */
	uint8_t			rtm_dst_len;	/* rtmsg header: rtm_dst_len */
	uint8_t			rtm_protocol;	/* rtmsg header: rtm_protocol */
	uint8_t			rtm_type;	/* rtmsg header: rtm_type */
	uint32_t		rtm_flags;	/* rtmsg header: rtm_flags */
};
466 
/*
 * Parser description for route messages: struct rtmsg header fields and
 * route attributes are stored into struct nl_parsed_route. _IN/_OUT are
 * local shorthands for offsets into the input header and the output
 * structure.
 */
#define	_IN(_field)	offsetof(struct rtmsg, _field)
#define	_OUT(_field)	offsetof(struct nl_parsed_route, _field)
/* Nested attributes of NL_RTA_METRICS; only the MTU is recognized */
static struct nlattr_parser nla_p_rtmetrics[] = {
	{ .type = NL_RTAX_MTU, .off = _OUT(rtax_mtu), .cb = nlattr_get_uint32 },
};
NL_DECLARE_ATTR_PARSER(metrics_parser, nla_p_rtmetrics);

/* Top-level route attributes */
static const struct nlattr_parser nla_p_rtmsg[] = {
	{ .type = NL_RTA_DST, .off = _OUT(rta_dst), .cb = nlattr_get_ip },
	{ .type = NL_RTA_OIF, .off = _OUT(rta_oif), .cb = nlattr_get_ifp },
	{ .type = NL_RTA_GATEWAY, .off = _OUT(rta_gw), .cb = nlattr_get_ip },
	{ .type = NL_RTA_METRICS, .arg = &metrics_parser, .cb = nlattr_get_nested },
	{ .type = NL_RTA_MULTIPATH, .off = _OUT(rta_multipath), .cb = nlattr_get_multipath },
	{ .type = NL_RTA_WEIGHT, .off = _OUT(rta_weight), .cb = nlattr_get_uint32 },
	{ .type = NL_RTA_RTFLAGS, .off = _OUT(rta_rtflags), .cb = nlattr_get_uint32 },
	{ .type = NL_RTA_TABLE, .off = _OUT(rta_table), .cb = nlattr_get_uint32 },
	{ .type = NL_RTA_VIA, .off = _OUT(rta_gw), .cb = nlattr_get_ipvia },
	{ .type = NL_RTA_NH_ID, .off = _OUT(rta_nh_id), .cb = nlattr_get_uint32 },
};

/* Fixed struct rtmsg header fields copied verbatim */
static const struct nlfield_parser nlf_p_rtmsg[] = {
	{ .off_in = _IN(rtm_family), .off_out = _OUT(rtm_family), .cb = nlf_get_u8 },
	{ .off_in = _IN(rtm_dst_len), .off_out = _OUT(rtm_dst_len), .cb = nlf_get_u8 },
	{ .off_in = _IN(rtm_protocol), .off_out = _OUT(rtm_protocol), .cb = nlf_get_u8 },
	{ .off_in = _IN(rtm_type), .off_out = _OUT(rtm_type), .cb = nlf_get_u8 },
	{ .off_in = _IN(rtm_flags), .off_out = _OUT(rtm_flags), .cb = nlf_get_u32 },
};
#undef _IN
#undef _OUT
NL_DECLARE_PARSER(rtm_parser, struct rtmsg, nlf_p_rtmsg, nla_p_rtmsg);
497 
/*
 * State shared between handle_rtm_dump() and the per-route callback
 * dump_rtentry() while walking routing tables.
 */
struct netlink_walkargs {
	struct nl_writer *nw;		/* writer receiving the dumped routes */
	struct route_nhop_data rnd;	/* scratch: nexthop data of the current route */
	struct nlmsghdr hdr;		/* header template for every dumped message */
	struct nlpcb *nlp;		/* requesting socket, used for the export check */
	uint32_t fibnum;		/* routing table currently being walked */
	int family;			/* address family currently being walked */
	int error;			/* last dump_px() result; non-zero stops dumping */
	int count;			/* routes visited in the current table */
	int dumped;			/* routes that passed the export check */
	int dumped_tables;		/* number of tables walked so far */
};
510 
511 static int
512 dump_rtentry(struct rtentry *rt, void *_arg)
513 {
514 	struct netlink_walkargs *wa = (struct netlink_walkargs *)_arg;
515 	int error;
516 
517 	wa->count++;
518 	if (wa->error != 0)
519 		return (0);
520 	if (!rt_is_exportable(rt, nlp_get_cred(wa->nlp)))
521 		return (0);
522 	wa->dumped++;
523 
524 	rt_get_rnd(rt, &wa->rnd);
525 
526 	error = dump_px(wa->fibnum, &wa->hdr, rt, &wa->rnd, wa->nw);
527 
528 	IF_DEBUG_LEVEL(LOG_DEBUG3) {
529 		char rtbuf[INET6_ADDRSTRLEN + 5];
530 		FIB_LOG(LOG_DEBUG3, wa->fibnum, wa->family,
531 		    "Dump %s, offset %u, error %d",
532 		    rt_print_buf(rt, rtbuf, sizeof(rtbuf)),
533 		    wa->nw->offset, error);
534 	}
535 	wa->error = error;
536 
537 	return (0);
538 }
539 
540 static void
541 dump_rtable_one(struct netlink_walkargs *wa, uint32_t fibnum, int family)
542 {
543 	FIB_LOG(LOG_DEBUG2, fibnum, family, "Start dump");
544 	wa->count = 0;
545 	wa->dumped = 0;
546 
547 	rib_walk(fibnum, family, false, dump_rtentry, wa);
548 
549 	wa->dumped_tables++;
550 
551 	FIB_LOG(LOG_DEBUG2, fibnum, family, "End dump, iterated %d dumped %d",
552 	    wa->count, wa->dumped);
553 	NL_LOG(LOG_DEBUG2, "Current offset: %d", wa->nw->offset);
554 }
555 
556 static int
557 dump_rtable_fib(struct netlink_walkargs *wa, uint32_t fibnum, int family)
558 {
559 	wa->fibnum = fibnum;
560 
561 	if (family == AF_UNSPEC) {
562 		for (int i = 0; i < AF_MAX; i++) {
563 			if (rt_tables_get_rnh(fibnum, i) != 0) {
564 				wa->family = i;
565 				dump_rtable_one(wa, fibnum, i);
566 				if (wa->error != 0)
567 					break;
568 			}
569 		}
570 	} else {
571 		if (rt_tables_get_rnh(fibnum, family) != 0) {
572 			wa->family = family;
573 			dump_rtable_one(wa, fibnum, family);
574 		}
575 	}
576 
577 	return (wa->error);
578 }
579 
580 static int
581 handle_rtm_getroute(struct nlpcb *nlp, struct nl_parsed_route *attrs,
582     struct nlmsghdr *hdr, struct nl_pstate *npt)
583 {
584 	RIB_RLOCK_TRACKER;
585 	struct rib_head *rnh;
586 	const struct rtentry *rt;
587 	struct route_nhop_data rnd;
588 	uint32_t fibnum = attrs->rta_table;
589 	sa_family_t family = attrs->rtm_family;
590 
591 	if (attrs->rta_dst == NULL) {
592 		NLMSG_REPORT_ERR_MSG(npt, "No RTA_DST supplied");
593 			return (EINVAL);
594 	}
595 
596 	rnh = rt_tables_get_rnh(fibnum, family);
597 	if (rnh == NULL)
598 		return (EAFNOSUPPORT);
599 
600 	RIB_RLOCK(rnh);
601 
602 	struct sockaddr *dst = attrs->rta_dst;
603 
604 	if (attrs->rtm_flags & RTM_F_PREFIX)
605 		rt = rib_lookup_prefix_plen(rnh, dst, attrs->rtm_dst_len, &rnd);
606 	else
607 		rt = (const struct rtentry *)rnh->rnh_matchaddr(dst, &rnh->head);
608 	if (rt == NULL) {
609 		RIB_RUNLOCK(rnh);
610 		return (ESRCH);
611 	}
612 
613 	rt_get_rnd(rt, &rnd);
614 	rnd.rnd_nhop = nhop_select_func(rnd.rnd_nhop, 0);
615 
616 	RIB_RUNLOCK(rnh);
617 
618 	if (!rt_is_exportable(rt, nlp_get_cred(nlp)))
619 		return (ESRCH);
620 
621 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
622 		char rtbuf[NHOP_PRINT_BUFSIZE] __unused, nhbuf[NHOP_PRINT_BUFSIZE] __unused;
623 		FIB_LOG(LOG_DEBUG2, fibnum, family, "getroute completed: got %s for %s",
624 		    nhop_print_buf_any(rnd.rnd_nhop, nhbuf, sizeof(nhbuf)),
625 		    rt_print_buf(rt, rtbuf, sizeof(rtbuf)));
626 	}
627 
628 	hdr->nlmsg_type = NL_RTM_NEWROUTE;
629 	dump_px(fibnum, hdr, rt, &rnd, npt->nw);
630 
631 	return (0);
632 }
633 
634 static int
635 handle_rtm_dump(struct nlpcb *nlp, uint32_t fibnum, int family,
636     struct nlmsghdr *hdr, struct nl_writer *nw)
637 {
638 	struct netlink_walkargs wa = {
639 		.nlp = nlp,
640 		.nw = nw,
641 		.hdr.nlmsg_pid = hdr->nlmsg_pid,
642 		.hdr.nlmsg_seq = hdr->nlmsg_seq,
643 		.hdr.nlmsg_type = NL_RTM_NEWROUTE,
644 		.hdr.nlmsg_flags = hdr->nlmsg_flags | NLM_F_MULTI,
645 	};
646 
647 	if (fibnum == RT_TABLE_UNSPEC) {
648 		for (int i = 0; i < V_rt_numfibs; i++) {
649 			dump_rtable_fib(&wa, fibnum, family);
650 			if (wa.error != 0)
651 				break;
652 		}
653 	} else
654 		dump_rtable_fib(&wa, fibnum, family);
655 
656 	if (wa.error == 0 && wa.dumped_tables == 0) {
657 		FIB_LOG(LOG_DEBUG, fibnum, family, "incorrect fibnum/family");
658 		wa.error = ESRCH;
659 		// How do we propagate it?
660 	}
661 
662 	if (!nlmsg_end_dump(wa.nw, wa.error, &wa.hdr)) {
663                 NL_LOG(LOG_DEBUG, "Unable to finalize the dump");
664                 return (ENOMEM);
665         }
666 
667 	return (wa.error);
668 }
669 
670 static struct nhop_object *
671 finalize_nhop(struct nhop_object *nh, const struct sockaddr *dst, int *perror)
672 {
673 	/*
674 	 * The following MUST be filled:
675 	 *  nh_ifp, nh_ifa, nh_gw
676 	 */
677 	if (nh->gw_sa.sa_family == 0) {
678 		/*
679 		 * Empty gateway. Can be direct route with RTA_OIF set.
680 		 */
681 		if (nh->nh_ifp != NULL)
682 			nhop_set_direct_gw(nh, nh->nh_ifp);
683 		else {
684 			NL_LOG(LOG_DEBUG, "empty gateway and interface, skipping");
685 			*perror = EINVAL;
686 			return (NULL);
687 		}
688 		/* Both nh_ifp and gateway are set */
689 	} else {
690 		/* Gateway is set up, we can derive ifp if not set */
691 		if (nh->nh_ifp == NULL) {
692 			uint32_t fibnum = nhop_get_fibnum(nh);
693 			uint32_t flags = 0;
694 
695 			if (nh->nh_flags & NHF_GATEWAY)
696 				flags = RTF_GATEWAY;
697 			else if (nh->nh_flags & NHF_HOST)
698 				flags = RTF_HOST;
699 
700 			struct ifaddr *ifa = ifa_ifwithroute(flags, dst, &nh->gw_sa, fibnum);
701 			if (ifa == NULL) {
702 				NL_LOG(LOG_DEBUG, "Unable to determine ifp, skipping");
703 				*perror = EINVAL;
704 				return (NULL);
705 			}
706 			nhop_set_transmit_ifp(nh, ifa->ifa_ifp);
707 		}
708 	}
709 	/* Both nh_ifp and gateway are set */
710 	if (nh->nh_ifa == NULL) {
711 		const struct sockaddr *gw_sa = &nh->gw_sa;
712 
713 		if (gw_sa->sa_family != dst->sa_family) {
714 			/*
715 			 * Use dst as the target for determining the default
716 			 * preferred ifa IF
717 			 * 1) the gateway is link-level (e.g. direct route)
718 			 * 2) the gateway family is different (e.g. IPv4 over IPv6).
719 			 */
720 			gw_sa = dst;
721 		}
722 
723 		struct ifaddr *ifa = ifaof_ifpforaddr(gw_sa, nh->nh_ifp);
724 		if (ifa == NULL) {
725 			NL_LOG(LOG_DEBUG, "Unable to determine ifa, skipping");
726 			*perror = EINVAL;
727 			return (NULL);
728 		}
729 		nhop_set_src(nh, ifa);
730 	}
731 
732 	return (nhop_get_nhop(nh, perror));
733 }
734 
735 static int
736 get_pxflag(const struct nl_parsed_route *attrs)
737 {
738 	int pxflag = 0;
739 	switch (attrs->rtm_family) {
740 	case AF_INET:
741 		if (attrs->rtm_dst_len == 32)
742 			pxflag = NHF_HOST;
743 		else if (attrs->rtm_dst_len == 0)
744 			pxflag = NHF_DEFAULT;
745 		break;
746 	case AF_INET6:
747 		if (attrs->rtm_dst_len == 128)
748 			pxflag = NHF_HOST;
749 		else if (attrs->rtm_dst_len == 0)
750 			pxflag = NHF_DEFAULT;
751 		break;
752 	}
753 
754 	return (pxflag);
755 }
756 
757 static int
758 get_op_flags(int nlm_flags)
759 {
760 	int op_flags = 0;
761 
762 	op_flags |= (nlm_flags & NLM_F_REPLACE) ? RTM_F_REPLACE : 0;
763 	op_flags |= (nlm_flags & NLM_F_EXCL) ? RTM_F_EXCL : 0;
764 	op_flags |= (nlm_flags & NLM_F_CREATE) ? RTM_F_CREATE : 0;
765 	op_flags |= (nlm_flags & NLM_F_APPEND) ? RTM_F_APPEND : 0;
766 
767 	return (op_flags);
768 }
769 
770 #ifdef ROUTE_MPATH
771 static int
772 create_nexthop_one(struct nl_parsed_route *attrs, struct rta_mpath_nh *mpnh,
773     struct nl_pstate *npt, struct nhop_object **pnh)
774 {
775 	int error;
776 
777 	if (mpnh->gw == NULL)
778 		return (EINVAL);
779 
780 	struct nhop_object *nh = nhop_alloc(attrs->rta_table, attrs->rtm_family);
781 	if (nh == NULL)
782 		return (ENOMEM);
783 
784 	error = nl_set_nexthop_gw(nh, mpnh->gw, mpnh->ifp, npt);
785 	if (error != 0) {
786 		nhop_free(nh);
787 		return (error);
788 	}
789 	if (mpnh->ifp != NULL)
790 		nhop_set_transmit_ifp(nh, mpnh->ifp);
791 	nhop_set_pxtype_flag(nh, get_pxflag(attrs));
792 	nhop_set_rtflags(nh, attrs->rta_rtflags);
793 	if (attrs->rtm_protocol > RTPROT_STATIC)
794 		nhop_set_origin(nh, attrs->rtm_protocol);
795 
796 	*pnh = finalize_nhop(nh, attrs->rta_dst, &error);
797 
798 	return (error);
799 }
800 #endif
801 
802 static struct nhop_object *
803 create_nexthop_from_attrs(struct nl_parsed_route *attrs,
804     struct nl_pstate *npt, int *perror)
805 {
806 	struct nhop_object *nh = NULL;
807 	int error = 0;
808 
809 	if (attrs->rta_multipath != NULL) {
810 #ifdef ROUTE_MPATH
811 		/* Multipath w/o explicit nexthops */
812 		int num_nhops = attrs->rta_multipath->num_nhops;
813 		struct weightened_nhop *wn = npt_alloc(npt, sizeof(*wn) * num_nhops);
814 
815 		for (int i = 0; i < num_nhops; i++) {
816 			struct rta_mpath_nh *mpnh = &attrs->rta_multipath->nhops[i];
817 
818 			error = create_nexthop_one(attrs, mpnh, npt, &wn[i].nh);
819 			if (error != 0) {
820 				for (int j = 0; j < i; j++)
821 					nhop_free(wn[j].nh);
822 				break;
823 			}
824 			wn[i].weight = mpnh->rtnh_weight > 0 ? mpnh->rtnh_weight : 1;
825 		}
826 		if (error == 0) {
827 			struct rib_head *rh = nhop_get_rh(wn[0].nh);
828 			struct nhgrp_object *nhg;
829 
830 			nhg = nhgrp_alloc(rh->rib_fibnum, rh->rib_family,
831 			    wn, num_nhops, perror);
832 			if (nhg != NULL) {
833 				if (attrs->rtm_protocol > RTPROT_STATIC)
834 					nhgrp_set_origin(nhg, attrs->rtm_protocol);
835 				nhg = nhgrp_get_nhgrp(nhg, perror);
836 			}
837 			for (int i = 0; i < num_nhops; i++)
838 				nhop_free(wn[i].nh);
839 			if (nhg != NULL)
840 				return ((struct nhop_object *)nhg);
841 			error = *perror;
842 		}
843 #else
844 		error = ENOTSUP;
845 #endif
846 		*perror = error;
847 	} else {
848 		nh = nhop_alloc(attrs->rta_table, attrs->rtm_family);
849 		if (nh == NULL) {
850 			*perror = ENOMEM;
851 			return (NULL);
852 		}
853 		if (attrs->rta_gw != NULL) {
854 			*perror = nl_set_nexthop_gw(nh, attrs->rta_gw, attrs->rta_oif, npt);
855 			if (*perror != 0) {
856 				nhop_free(nh);
857 				return (NULL);
858 			}
859 		}
860 		if (attrs->rta_oif != NULL)
861 			nhop_set_transmit_ifp(nh, attrs->rta_oif);
862 		if (attrs->rtax_mtu != 0)
863 			nhop_set_mtu(nh, attrs->rtax_mtu, true);
864 		if (attrs->rta_rtflags & RTF_BROADCAST)
865 			nhop_set_broadcast(nh, true);
866 		if (attrs->rtm_protocol > RTPROT_STATIC)
867 			nhop_set_origin(nh, attrs->rtm_protocol);
868 		nhop_set_pxtype_flag(nh, get_pxflag(attrs));
869 		nhop_set_rtflags(nh, attrs->rta_rtflags);
870 
871 		switch (attrs->rtm_type) {
872 		case RTN_UNICAST:
873 			break;
874 		case RTN_BLACKHOLE:
875 			nhop_set_blackhole(nh, RTF_BLACKHOLE);
876 			break;
877 		case RTN_PROHIBIT:
878 		case RTN_UNREACHABLE:
879 			nhop_set_blackhole(nh, RTF_REJECT);
880 			break;
881 		/* TODO: return ENOTSUP for other types if strict option is set */
882 		}
883 
884 		nh = finalize_nhop(nh, attrs->rta_dst, perror);
885 	}
886 
887 	return (nh);
888 }
889 
890 static int
891 rtnl_handle_newroute(struct nlmsghdr *hdr, struct nlpcb *nlp,
892     struct nl_pstate *npt)
893 {
894 	struct rib_cmd_info rc = {};
895 	struct nhop_object *nh = NULL;
896 	int error;
897 
898 	struct nl_parsed_route attrs = {};
899 	error = nl_parse_nlmsg(hdr, &rtm_parser, npt, &attrs);
900 	if (error != 0)
901 		return (error);
902 
903 	/* Check if we have enough data */
904 	if (attrs.rta_dst == NULL) {
905 		NL_LOG(LOG_DEBUG, "missing RTA_DST");
906 		return (EINVAL);
907 	}
908 
909 	if (attrs.rta_table >= V_rt_numfibs) {
910 		NLMSG_REPORT_ERR_MSG(npt, "invalid fib");
911 		return (EINVAL);
912 	}
913 
914 	if (attrs.rta_nh_id != 0) {
915 		/* Referenced uindex */
916 		int pxflag = get_pxflag(&attrs);
917 		nh = nl_find_nhop(attrs.rta_table, attrs.rtm_family, attrs.rta_nh_id,
918 		    pxflag, &error);
919 		if (error != 0)
920 			return (error);
921 	} else {
922 		nh = create_nexthop_from_attrs(&attrs, npt, &error);
923 		if (error != 0) {
924 			NL_LOG(LOG_DEBUG, "Error creating nexthop");
925 			return (error);
926 		}
927 	}
928 
929 	if (!NH_IS_NHGRP(nh) && attrs.rta_weight == 0)
930 		attrs.rta_weight = RT_DEFAULT_WEIGHT;
931 	struct route_nhop_data rnd = { .rnd_nhop = nh, .rnd_weight = attrs.rta_weight };
932 	int op_flags = get_op_flags(hdr->nlmsg_flags);
933 
934 	error = rib_add_route_px(attrs.rta_table, attrs.rta_dst, attrs.rtm_dst_len,
935 	    &rnd, op_flags, &rc);
936 	if (error == 0)
937 		report_operation(attrs.rta_table, &rc, nlp, hdr);
938 	return (error);
939 }
940 
941 static int
942 path_match_func(const struct rtentry *rt, const struct nhop_object *nh, void *_data)
943 {
944 	struct nl_parsed_route *attrs = (struct nl_parsed_route *)_data;
945 
946 	if ((attrs->rta_gw != NULL) && !rib_match_gw(rt, nh, attrs->rta_gw))
947 		return (0);
948 
949 	if ((attrs->rta_oif != NULL) && (attrs->rta_oif != nh->nh_ifp))
950 		return (0);
951 
952 	return (1);
953 }
954 
955 static int
956 rtnl_handle_delroute(struct nlmsghdr *hdr, struct nlpcb *nlp,
957     struct nl_pstate *npt)
958 {
959 	struct rib_cmd_info rc;
960 	int error;
961 
962 	struct nl_parsed_route attrs = {};
963 	error = nl_parse_nlmsg(hdr, &rtm_parser, npt, &attrs);
964 	if (error != 0)
965 		return (error);
966 
967 	if (attrs.rta_dst == NULL) {
968 		NLMSG_REPORT_ERR_MSG(npt, "RTA_DST is not set");
969 		return (ESRCH);
970 	}
971 
972 	if (attrs.rta_table >= V_rt_numfibs) {
973 		NLMSG_REPORT_ERR_MSG(npt, "invalid fib");
974 		return (EINVAL);
975 	}
976 
977 	error = rib_del_route_px(attrs.rta_table, attrs.rta_dst,
978 	    attrs.rtm_dst_len, path_match_func, &attrs, 0, &rc);
979 	if (error == 0)
980 		report_operation(attrs.rta_table, &rc, nlp, hdr);
981 	return (error);
982 }
983 
984 static int
985 rtnl_handle_getroute(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt)
986 {
987 	int error;
988 
989 	struct nl_parsed_route attrs = {};
990 	error = nl_parse_nlmsg(hdr, &rtm_parser, npt, &attrs);
991 	if (error != 0)
992 		return (error);
993 
994 	if (attrs.rta_table >= V_rt_numfibs) {
995 		NLMSG_REPORT_ERR_MSG(npt, "invalid fib");
996 		return (EINVAL);
997 	}
998 
999 	if (hdr->nlmsg_flags & NLM_F_DUMP)
1000 		error = handle_rtm_dump(nlp, attrs.rta_table, attrs.rtm_family, hdr, npt->nw);
1001 	else
1002 		error = handle_rtm_getroute(nlp, &attrs, hdr, npt);
1003 
1004 	return (error);
1005 }
1006 
1007 void
1008 rtnl_handle_route_event(uint32_t fibnum, const struct rib_cmd_info *rc)
1009 {
1010 	struct nl_writer nw = {};
1011 	int family, nlm_flags = 0;
1012 
1013 	family = rt_get_family(rc->rc_rt);
1014 
1015 	/* XXX: check if there are active listeners first */
1016 
1017 	/* TODO: consider passing PID/type/seq */
1018 	switch (rc->rc_cmd) {
1019 	case RTM_ADD:
1020 		nlm_flags = NLM_F_EXCL | NLM_F_CREATE;
1021 		break;
1022 	case RTM_CHANGE:
1023 		nlm_flags = NLM_F_REPLACE;
1024 		break;
1025 	case RTM_DELETE:
1026 		nlm_flags = 0;
1027 		break;
1028 	}
1029 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
1030 		char rtbuf[NHOP_PRINT_BUFSIZE] __unused;
1031 		FIB_LOG(LOG_DEBUG2, fibnum, family,
1032 		    "received event %s for %s / nlm_flags=%X",
1033 		    rib_print_cmd(rc->rc_cmd),
1034 		    rt_print_buf(rc->rc_rt, rtbuf, sizeof(rtbuf)),
1035 		    nlm_flags);
1036 	}
1037 
1038 	struct nlmsghdr hdr = {
1039 		.nlmsg_flags = nlm_flags,
1040 		.nlmsg_type = get_rtmsg_type_from_rtsock(rc->rc_cmd),
1041 	};
1042 
1043 	struct route_nhop_data rnd = {
1044 		.rnd_nhop = rc_get_nhop(rc),
1045 		.rnd_weight = rc->rc_nh_weight,
1046 	};
1047 
1048 	uint32_t group_id = family_to_group(family);
1049 	if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, group_id)) {
1050 		NL_LOG(LOG_DEBUG, "error allocating event buffer");
1051 		return;
1052 	}
1053 
1054 	dump_px(fibnum, &hdr, rc->rc_rt, &rnd, &nw);
1055 	nlmsg_flush(&nw);
1056 }
1057 
/*
 * Route message handlers registered by rtnl_routes_init(). Reading
 * routes sets RTNL_F_ALLOW_NONVNET_JAIL and names no privilege; adding
 * and deleting routes name PRIV_NET_ROUTE.
 */
static const struct rtnl_cmd_handler cmd_handlers[] = {
	{
		.cmd = NL_RTM_GETROUTE,
		.name = "RTM_GETROUTE",
		.cb = &rtnl_handle_getroute,
		.flags = RTNL_F_ALLOW_NONVNET_JAIL,
	},
	{
		.cmd = NL_RTM_DELROUTE,
		.name = "RTM_DELROUTE",
		.cb = &rtnl_handle_delroute,
		.priv = PRIV_NET_ROUTE,
	},
	{
		.cmd = NL_RTM_NEWROUTE,
		.name = "RTM_NEWROUTE",
		.cb = &rtnl_handle_newroute,
		.priv = PRIV_NET_ROUTE,
	}
};
1078 
/* Every parser declared in this file, checked once by rtnl_routes_init() */
static const struct nlhdr_parser *all_parsers[] = {&mpath_parser, &metrics_parser, &rtm_parser};
1080 
/*
 * Module initialization for route messages: verifies the parser tables
 * declared in this file and registers the RTM_*ROUTE command handlers.
 */
void
rtnl_routes_init(void)
{
	NL_VERIFY_PARSERS(all_parsers);
	rtnl_register_messages(cmd_handlers, NL_ARRAY_LEN(cmd_handlers));
}
1087