xref: /freebsd/sys/netlink/netlink_io.c (revision 9f23cbd6cae82fd77edfad7173432fa8dccd0a95)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2021 Ng Peng Nam Sean
5  * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include "opt_netlink.h"
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 #include <sys/param.h>
34 #include <sys/ck.h>
35 #include <sys/lock.h>
36 #include <sys/malloc.h>
37 #include <sys/mbuf.h>
38 #include <sys/mutex.h>
39 #include <sys/socket.h>
40 #include <sys/socketvar.h>
41 #include <sys/syslog.h>
42 
43 #include <netlink/netlink.h>
44 #include <netlink/netlink_ctl.h>
45 #include <netlink/netlink_linux.h>
46 #include <netlink/netlink_var.h>
47 
48 #define	DEBUG_MOD_NAME	nl_io
49 #define	DEBUG_MAX_LEVEL	LOG_DEBUG3
50 #include <netlink/netlink_debug.h>
51 _DECLARE_DEBUG(LOG_INFO);
52 
/*
 * The logic below provides a point-to-point interface for receiving and
 * sending netlink data between the kernel and userland.
 */
57 
58 static const struct sockaddr_nl _nl_empty_src = {
59 	.nl_len = sizeof(struct sockaddr_nl),
60 	.nl_family = PF_NETLINK,
61 	.nl_pid = 0 /* comes from the kernel */
62 };
63 static const struct sockaddr *nl_empty_src = (const struct sockaddr *)&_nl_empty_src;
64 
65 static struct mbuf *nl_process_mbuf(struct mbuf *m, struct nlpcb *nlp);
66 
67 
68 static void
69 queue_push(struct nl_io_queue *q, struct mbuf *mq)
70 {
71 	while (mq != NULL) {
72 		struct mbuf *m = mq;
73 		mq = mq->m_nextpkt;
74 		m->m_nextpkt = NULL;
75 
76 		q->length += m_length(m, NULL);
77 		STAILQ_INSERT_TAIL(&q->head, m, m_stailqpkt);
78 	}
79 }
80 
81 static void
82 queue_push_head(struct nl_io_queue *q, struct mbuf *m)
83 {
84 	MPASS(m->m_nextpkt == NULL);
85 
86 	q->length += m_length(m, NULL);
87 	STAILQ_INSERT_HEAD(&q->head, m, m_stailqpkt);
88 }
89 
90 static struct mbuf *
91 queue_pop(struct nl_io_queue *q)
92 {
93 	if (!STAILQ_EMPTY(&q->head)) {
94 		struct mbuf *m = STAILQ_FIRST(&q->head);
95 		STAILQ_REMOVE_HEAD(&q->head, m_stailqpkt);
96 		m->m_nextpkt = NULL;
97 		q->length -= m_length(m, NULL);
98 
99 		return (m);
100 	}
101 	return (NULL);
102 }
103 
104 static struct mbuf *
105 queue_head(const struct nl_io_queue *q)
106 {
107 	return (STAILQ_FIRST(&q->head));
108 }
109 
110 static inline bool
111 queue_empty(const struct nl_io_queue *q)
112 {
113 	return (q->length == 0);
114 }
115 
116 static void
117 queue_free(struct nl_io_queue *q)
118 {
119 	while (!STAILQ_EMPTY(&q->head)) {
120 		struct mbuf *m = STAILQ_FIRST(&q->head);
121 		STAILQ_REMOVE_HEAD(&q->head, m_stailqpkt);
122 		m->m_nextpkt = NULL;
123 		m_freem(m);
124 	}
125 	q->length = 0;
126 }
127 
128 void
129 nl_add_msg_info(struct mbuf *m)
130 {
131 	struct nlpcb *nlp = nl_get_thread_nlp(curthread);
132 	NL_LOG(LOG_DEBUG2, "Trying to recover nlp from thread %p: %p",
133 	    curthread, nlp);
134 
135 	if (nlp == NULL)
136 		return;
137 
138 	/* Prepare what we want to encode - PID, socket PID & msg seq */
139 	struct {
140 		struct nlattr nla;
141 		uint32_t val;
142 	} data[] = {
143 		{
144 			.nla.nla_len = sizeof(struct nlattr) + sizeof(uint32_t),
145 			.nla.nla_type = NLMSGINFO_ATTR_PROCESS_ID,
146 			.val = nlp->nl_process_id,
147 		},
148 		{
149 			.nla.nla_len = sizeof(struct nlattr) + sizeof(uint32_t),
150 			.nla.nla_type = NLMSGINFO_ATTR_PORT_ID,
151 			.val = nlp->nl_port,
152 		},
153 	};
154 
155 
156 	while (m->m_next != NULL)
157 		m = m->m_next;
158 	m->m_next = sbcreatecontrol(data, sizeof(data),
159 	    NETLINK_MSG_INFO, SOL_NETLINK, M_NOWAIT);
160 
161 	NL_LOG(LOG_DEBUG2, "Storing %u bytes of data, ctl: %p",
162 	    (unsigned)sizeof(data), m->m_next);
163 }
164 
165 static __noinline struct mbuf *
166 extract_msg_info(struct mbuf *m)
167 {
168 	while (m->m_next != NULL) {
169 		if (m->m_next->m_type == MT_CONTROL) {
170 			struct mbuf *ctl = m->m_next;
171 			m->m_next = NULL;
172 			return (ctl);
173 		}
174 		m = m->m_next;
175 	}
176 	return (NULL);
177 }
178 
179 static void
180 nl_schedule_taskqueue(struct nlpcb *nlp)
181 {
182 	if (!nlp->nl_task_pending) {
183 		nlp->nl_task_pending = true;
184 		taskqueue_enqueue(nlp->nl_taskqueue, &nlp->nl_task);
185 		NL_LOG(LOG_DEBUG3, "taskqueue scheduled");
186 	} else {
187 		NL_LOG(LOG_DEBUG3, "taskqueue schedule skipped");
188 	}
189 }
190 
191 int
192 nl_receive_async(struct mbuf *m, struct socket *so)
193 {
194 	struct nlpcb *nlp = sotonlpcb(so);
195 	int error = 0;
196 
197 	m->m_nextpkt = NULL;
198 
199 	NLP_LOCK(nlp);
200 
201 	if ((__predict_true(nlp->nl_active))) {
202 		sbappend(&so->so_snd, m, 0);
203 		NL_LOG(LOG_DEBUG3, "enqueue %u bytes", m_length(m, NULL));
204 		nl_schedule_taskqueue(nlp);
205 	} else {
206 		NL_LOG(LOG_DEBUG, "ignoring %u bytes on non-active socket",
207 		    m_length(m, NULL));
208 		m_free(m);
209 		error = EINVAL;
210 	}
211 
212 	NLP_UNLOCK(nlp);
213 
214 	return (error);
215 }
216 
217 static bool
218 tx_check_locked(struct nlpcb *nlp)
219 {
220 	if (queue_empty(&nlp->tx_queue))
221 		return (true);
222 
223 	/*
224 	 * Check if something can be moved from the internal TX queue
225 	 * to the socket queue.
226 	 */
227 
228 	bool appended = false;
229 	struct sockbuf *sb = &nlp->nl_socket->so_rcv;
230 	SOCKBUF_LOCK(sb);
231 
232 	while (true) {
233 		struct mbuf *m = queue_head(&nlp->tx_queue);
234 		if (m != NULL) {
235 			struct mbuf *ctl = NULL;
236 			if (__predict_false(m->m_next != NULL))
237 				ctl = extract_msg_info(m);
238 			if (sbappendaddr_locked(sb, nl_empty_src, m, ctl) != 0) {
239 				/* appended successfully */
240 				queue_pop(&nlp->tx_queue);
241 				appended = true;
242 			} else
243 				break;
244 		} else
245 			break;
246 	}
247 
248 	SOCKBUF_UNLOCK(sb);
249 
250 	if (appended)
251 		sorwakeup(nlp->nl_socket);
252 
253 	return (queue_empty(&nlp->tx_queue));
254 }
255 
256 static bool
257 nl_process_received_one(struct nlpcb *nlp)
258 {
259 	bool reschedule = false;
260 
261 	NLP_LOCK(nlp);
262 	nlp->nl_task_pending = false;
263 
264 	if (!tx_check_locked(nlp)) {
265 		/* TX overflow queue still not empty, ignore RX */
266 		NLP_UNLOCK(nlp);
267 		return (false);
268 	}
269 
270 	if (queue_empty(&nlp->rx_queue)) {
271 		/*
272 		 * Grab all data we have from the socket TX queue
273 		 * and store it the internal queue, so it can be worked on
274 		 * w/o holding socket lock.
275 		 */
276 		struct sockbuf *sb = &nlp->nl_socket->so_snd;
277 
278 		SOCKBUF_LOCK(sb);
279 		unsigned int avail = sbavail(sb);
280 		if (avail > 0) {
281 			NL_LOG(LOG_DEBUG3, "grabbed %u bytes", avail);
282 			queue_push(&nlp->rx_queue, sbcut_locked(sb, avail));
283 		}
284 		SOCKBUF_UNLOCK(sb);
285 	} else {
286 		/* Schedule another pass to read from the socket queue */
287 		reschedule = true;
288 	}
289 
290 	int prev_hiwat = nlp->tx_queue.hiwat;
291 	NLP_UNLOCK(nlp);
292 
293 	while (!queue_empty(&nlp->rx_queue)) {
294 		struct mbuf *m = queue_pop(&nlp->rx_queue);
295 
296 		m = nl_process_mbuf(m, nlp);
297 		if (m != NULL) {
298 			queue_push_head(&nlp->rx_queue, m);
299 			reschedule = false;
300 			break;
301 		}
302 	}
303 	if (nlp->tx_queue.hiwat > prev_hiwat) {
304 		NLP_LOG(LOG_DEBUG, nlp, "TX override peaked to %d", nlp->tx_queue.hiwat);
305 
306 	}
307 
308 	return (reschedule);
309 }
310 
311 static void
312 nl_process_received(struct nlpcb *nlp)
313 {
314 	NL_LOG(LOG_DEBUG3, "taskqueue called");
315 
316 	if (__predict_false(nlp->nl_need_thread_setup)) {
317 		nl_set_thread_nlp(curthread, nlp);
318 		NLP_LOCK(nlp);
319 		nlp->nl_need_thread_setup = false;
320 		NLP_UNLOCK(nlp);
321 	}
322 
323 	while (nl_process_received_one(nlp))
324 		;
325 }
326 
327 void
328 nl_init_io(struct nlpcb *nlp)
329 {
330 	STAILQ_INIT(&nlp->rx_queue.head);
331 	STAILQ_INIT(&nlp->tx_queue.head);
332 }
333 
334 void
335 nl_free_io(struct nlpcb *nlp)
336 {
337 	queue_free(&nlp->rx_queue);
338 	queue_free(&nlp->tx_queue);
339 }
340 
341 /*
342  * Called after some data have been read from the socket.
343  */
344 void
345 nl_on_transmit(struct nlpcb *nlp)
346 {
347 	NLP_LOCK(nlp);
348 
349 	struct socket *so = nlp->nl_socket;
350 	if (__predict_false(nlp->nl_dropped_bytes > 0 && so != NULL)) {
351 		unsigned long dropped_bytes = nlp->nl_dropped_bytes;
352 		unsigned long dropped_messages = nlp->nl_dropped_messages;
353 		nlp->nl_dropped_bytes = 0;
354 		nlp->nl_dropped_messages = 0;
355 
356 		struct sockbuf *sb = &so->so_rcv;
357 		NLP_LOG(LOG_DEBUG, nlp,
358 		    "socket RX overflowed, %lu messages (%lu bytes) dropped. "
359 		    "bytes: [%u/%u] mbufs: [%u/%u]", dropped_messages, dropped_bytes,
360 		    sb->sb_ccc, sb->sb_hiwat, sb->sb_mbcnt, sb->sb_mbmax);
361 		/* TODO: send netlink message */
362 	}
363 
364 	nl_schedule_taskqueue(nlp);
365 	NLP_UNLOCK(nlp);
366 }
367 
368 void
369 nl_taskqueue_handler(void *_arg, int pending)
370 {
371 	struct nlpcb *nlp = (struct nlpcb *)_arg;
372 
373 	CURVNET_SET(nlp->nl_socket->so_vnet);
374 	nl_process_received(nlp);
375 	CURVNET_RESTORE();
376 }
377 
378 static __noinline void
379 queue_push_tx(struct nlpcb *nlp, struct mbuf *m)
380 {
381 	queue_push(&nlp->tx_queue, m);
382 	nlp->nl_tx_blocked = true;
383 
384 	if (nlp->tx_queue.length > nlp->tx_queue.hiwat)
385 		nlp->tx_queue.hiwat = nlp->tx_queue.length;
386 }
387 
388 /*
389  * Tries to send @m to the socket @nlp.
390  *
391  * @m: mbuf(s) to send to. Consumed in any case.
392  * @nlp: socket to send to
393  * @cnt: number of messages in @m
394  * @io_flags: combination of NL_IOF_* flags
395  *
396  * Returns true on success.
397  * If no queue overrunes happened, wakes up socket owner.
398  */
399 bool
400 nl_send_one(struct mbuf *m, struct nlpcb *nlp, int num_messages, int io_flags)
401 {
402 	bool untranslated = io_flags & NL_IOF_UNTRANSLATED;
403 	bool ignore_limits = io_flags & NL_IOF_IGNORE_LIMIT;
404 	bool result = true;
405 
406 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
407 		struct nlmsghdr *hdr = mtod(m, struct nlmsghdr *);
408 		NLP_LOG(LOG_DEBUG2, nlp,
409 		    "TX mbuf len %u msgs %u msg type %d first hdrlen %u io_flags %X",
410 		    m_length(m, NULL), num_messages, hdr->nlmsg_type, hdr->nlmsg_len,
411 		    io_flags);
412 	}
413 
414 	if (__predict_false(nlp->nl_linux && linux_netlink_p != NULL && untranslated)) {
415 		m = linux_netlink_p->mbufs_to_linux(nlp->nl_proto, m, nlp);
416 		if (m == NULL)
417 			return (false);
418 	}
419 
420 	NLP_LOCK(nlp);
421 
422 	if (__predict_false(nlp->nl_socket == NULL)) {
423 		NLP_UNLOCK(nlp);
424 		m_freem(m);
425 		return (false);
426 	}
427 
428 	if (!queue_empty(&nlp->tx_queue)) {
429 		if (ignore_limits) {
430 			queue_push_tx(nlp, m);
431 		} else {
432 			m_free(m);
433 			result = false;
434 		}
435 		NLP_UNLOCK(nlp);
436 		return (result);
437 	}
438 
439 	struct socket *so = nlp->nl_socket;
440 	struct mbuf *ctl = NULL;
441 	if (__predict_false(m->m_next != NULL))
442 		ctl = extract_msg_info(m);
443 	if (sbappendaddr(&so->so_rcv, nl_empty_src, m, ctl) != 0) {
444 		sorwakeup(so);
445 		NLP_LOG(LOG_DEBUG3, nlp, "appended data & woken up");
446 	} else {
447 		if (ignore_limits) {
448 			queue_push_tx(nlp, m);
449 		} else {
450 			/*
451 			 * Store dropped data so it can be reported
452 			 * on the next read
453 			 */
454 			nlp->nl_dropped_bytes += m_length(m, NULL);
455 			nlp->nl_dropped_messages += num_messages;
456 			NLP_LOG(LOG_DEBUG2, nlp, "RX oveflow: %lu m (+%d), %lu b (+%d)",
457 			    (unsigned long)nlp->nl_dropped_messages, num_messages,
458 			    (unsigned long)nlp->nl_dropped_bytes, m_length(m, NULL));
459 			soroverflow(so);
460 			m_freem(m);
461 			result = false;
462 		}
463 	}
464 	NLP_UNLOCK(nlp);
465 
466 	return (result);
467 }
468 
469 static int
470 nl_receive_message(struct nlmsghdr *hdr, int remaining_length,
471     struct nlpcb *nlp, struct nl_pstate *npt)
472 {
473 	nl_handler_f handler = nl_handlers[nlp->nl_proto].cb;
474 	int error = 0;
475 
476 	NLP_LOG(LOG_DEBUG2, nlp, "msg len: %u type: %d: flags: 0x%X seq: %u pid: %u",
477 	    hdr->nlmsg_len, hdr->nlmsg_type, hdr->nlmsg_flags, hdr->nlmsg_seq,
478 	    hdr->nlmsg_pid);
479 
480 	if (__predict_false(hdr->nlmsg_len > remaining_length)) {
481 		NLP_LOG(LOG_DEBUG, nlp, "message is not entirely present: want %d got %d",
482 		    hdr->nlmsg_len, remaining_length);
483 		return (EINVAL);
484 	} else if (__predict_false(hdr->nlmsg_len < sizeof(*hdr))) {
485 		NL_LOG(LOG_DEBUG, "message too short: %d", hdr->nlmsg_len);
486 		return (EINVAL);
487 	}
488 	/* Stamp each message with sender pid */
489 	hdr->nlmsg_pid = nlp->nl_port;
490 
491 	npt->hdr = hdr;
492 
493 	if (hdr->nlmsg_flags & NLM_F_REQUEST && hdr->nlmsg_type >= NLMSG_MIN_TYPE) {
494 		NL_LOG(LOG_DEBUG2, "handling message with msg type: %d",
495 		   hdr->nlmsg_type);
496 
497 		if (nlp->nl_linux && linux_netlink_p != NULL) {
498 			struct nlmsghdr *hdr_orig = hdr;
499 			hdr = linux_netlink_p->msg_from_linux(nlp->nl_proto, hdr, npt);
500 			if (hdr == NULL) {
501 				 /* Failed to translate to kernel format. Report an error back */
502 				hdr = hdr_orig;
503 				npt->hdr = hdr;
504 				if (hdr->nlmsg_flags & NLM_F_ACK)
505 					nlmsg_ack(nlp, EOPNOTSUPP, hdr, npt);
506 				return (0);
507 			}
508 		}
509 		error = handler(hdr, npt);
510 		NL_LOG(LOG_DEBUG2, "retcode: %d", error);
511 	}
512 	if ((hdr->nlmsg_flags & NLM_F_ACK) || (error != 0 && error != EINTR)) {
513 		if (!npt->nw->suppress_ack) {
514 			NL_LOG(LOG_DEBUG3, "ack");
515 			nlmsg_ack(nlp, error, hdr, npt);
516 		}
517 	}
518 
519 	return (0);
520 }
521 
522 static void
523 npt_clear(struct nl_pstate *npt)
524 {
525 	lb_clear(&npt->lb);
526 	npt->error = 0;
527 	npt->err_msg = NULL;
528 	npt->err_off = 0;
529 	npt->hdr = NULL;
530 	npt->nw->suppress_ack = false;
531 }
532 
533 /*
534  * Processes an incoming packet, which can contain multiple netlink messages
535  */
536 static struct mbuf *
537 nl_process_mbuf(struct mbuf *m, struct nlpcb *nlp)
538 {
539 	int offset, buffer_length;
540 	struct nlmsghdr *hdr;
541 	char *buffer;
542 	int error;
543 
544 	NL_LOG(LOG_DEBUG3, "RX netlink mbuf %p on %p", m, nlp->nl_socket);
545 
546 	struct nl_writer nw = {};
547 	if (!nlmsg_get_unicast_writer(&nw, NLMSG_SMALL, nlp)) {
548 		m_freem(m);
549 		NL_LOG(LOG_DEBUG, "error allocating socket writer");
550 		return (NULL);
551 	}
552 
553 	nlmsg_ignore_limit(&nw);
554 	/* TODO: alloc this buf once for nlp */
555 	int data_length = m_length(m, NULL);
556 	buffer_length = roundup2(data_length, 8) + SCRATCH_BUFFER_SIZE;
557 	if (nlp->nl_linux)
558 		buffer_length += roundup2(data_length, 8);
559 	buffer = malloc(buffer_length, M_NETLINK, M_NOWAIT | M_ZERO);
560 	if (buffer == NULL) {
561 		m_freem(m);
562 		nlmsg_flush(&nw);
563 		NL_LOG(LOG_DEBUG, "Unable to allocate %d bytes of memory",
564 		    buffer_length);
565 		return (NULL);
566 	}
567 	m_copydata(m, 0, data_length, buffer);
568 
569 	struct nl_pstate npt = {
570 		.nlp = nlp,
571 		.lb.base = &buffer[roundup2(data_length, 8)],
572 		.lb.size = buffer_length - roundup2(data_length, 8),
573 		.nw = &nw,
574 		.strict = nlp->nl_flags & NLF_STRICT,
575 	};
576 
577 	for (offset = 0; offset + sizeof(struct nlmsghdr) <= data_length;) {
578 		hdr = (struct nlmsghdr *)&buffer[offset];
579 		/* Save length prior to calling handler */
580 		int msglen = NLMSG_ALIGN(hdr->nlmsg_len);
581 		NL_LOG(LOG_DEBUG3, "parsing offset %d/%d", offset, data_length);
582 		npt_clear(&npt);
583 		error = nl_receive_message(hdr, data_length - offset, nlp, &npt);
584 		offset += msglen;
585 		if (__predict_false(error != 0 || nlp->nl_tx_blocked))
586 			break;
587 	}
588 	NL_LOG(LOG_DEBUG3, "packet parsing done");
589 	free(buffer, M_NETLINK);
590 	nlmsg_flush(&nw);
591 
592 	if (nlp->nl_tx_blocked) {
593 		NLP_LOCK(nlp);
594 		nlp->nl_tx_blocked = false;
595 		NLP_UNLOCK(nlp);
596 		m_adj(m, offset);
597 		return (m);
598 	} else {
599 		m_freem(m);
600 		return (NULL);
601 	}
602 }
603