xref: /freebsd/sys/netlink/netlink_io.c (revision c7a063741720ef81d4caa4613242579d12f1d605)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2021 Ng Peng Nam Sean
5  * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 #include <sys/param.h>
32 #include <sys/ck.h>
33 #include <sys/lock.h>
34 #include <sys/malloc.h>
35 #include <sys/mbuf.h>
36 #include <sys/mutex.h>
37 #include <sys/socket.h>
38 #include <sys/socketvar.h>
39 #include <sys/syslog.h>
40 
41 #include <netlink/netlink.h>
42 #include <netlink/netlink_ctl.h>
43 #include <netlink/netlink_linux.h>
44 #include <netlink/netlink_var.h>
45 
46 #define	DEBUG_MOD_NAME	nl_io
47 #define	DEBUG_MAX_LEVEL	LOG_DEBUG3
48 #include <netlink/netlink_debug.h>
49 _DECLARE_DEBUG(LOG_DEBUG);
50 
/*
 * The logic below provides a p2p interface for receiving and
 * sending netlink data between the kernel and userland.
 */
55 
/*
 * Source address passed to sbappendaddr()/sbappendaddr_locked() for every
 * message this file appends to a socket receive buffer: a netlink address
 * with port id 0.
 */
static const struct sockaddr_nl _nl_empty_src = {
	.nl_len = sizeof(struct sockaddr_nl),
	.nl_family = PF_NETLINK,
	.nl_pid = 0 /* comes from the kernel */
};
/* Generic-sockaddr view of the address above, as the sockbuf calls expect it */
static const struct sockaddr *nl_empty_src = (const struct sockaddr *)&_nl_empty_src;

/* Defined at the end of the file; needed earlier by nl_process_received_one() */
static struct mbuf *nl_process_mbuf(struct mbuf *m, struct nlpcb *nlp);
64 
65 
66 static void
67 queue_push(struct nl_io_queue *q, struct mbuf *mq)
68 {
69 	while (mq != NULL) {
70 		struct mbuf *m = mq;
71 		mq = mq->m_nextpkt;
72 		m->m_nextpkt = NULL;
73 
74 		q->length += m_length(m, NULL);
75 		STAILQ_INSERT_TAIL(&q->head, m, m_stailqpkt);
76 	}
77 }
78 
79 static void
80 queue_push_head(struct nl_io_queue *q, struct mbuf *m)
81 {
82 	MPASS(m->m_nextpkt == NULL);
83 
84 	q->length += m_length(m, NULL);
85 	STAILQ_INSERT_HEAD(&q->head, m, m_stailqpkt);
86 }
87 
88 static struct mbuf *
89 queue_pop(struct nl_io_queue *q)
90 {
91 	if (!STAILQ_EMPTY(&q->head)) {
92 		struct mbuf *m = STAILQ_FIRST(&q->head);
93 		STAILQ_REMOVE_HEAD(&q->head, m_stailqpkt);
94 		m->m_nextpkt = NULL;
95 		q->length -= m_length(m, NULL);
96 
97 		return (m);
98 	}
99 	return (NULL);
100 }
101 
102 static struct mbuf *
103 queue_head(const struct nl_io_queue *q)
104 {
105 	return (STAILQ_FIRST(&q->head));
106 }
107 
108 static inline bool
109 queue_empty(const struct nl_io_queue *q)
110 {
111 	return (q->length == 0);
112 }
113 
114 static void
115 queue_free(struct nl_io_queue *q)
116 {
117 	while (!STAILQ_EMPTY(&q->head)) {
118 		struct mbuf *m = STAILQ_FIRST(&q->head);
119 		STAILQ_REMOVE_HEAD(&q->head, m_stailqpkt);
120 		m->m_nextpkt = NULL;
121 		m_freem(m);
122 	}
123 	q->length = 0;
124 }
125 
126 
127 static void
128 nl_schedule_taskqueue(struct nlpcb *nlp)
129 {
130 	if (!nlp->nl_task_pending) {
131 		nlp->nl_task_pending = true;
132 		taskqueue_enqueue(nlp->nl_taskqueue, &nlp->nl_task);
133 		NL_LOG(LOG_DEBUG3, "taskqueue scheduled");
134 	} else {
135 		NL_LOG(LOG_DEBUG3, "taskqueue schedule skipped");
136 	}
137 }
138 
139 int
140 nl_receive_async(struct mbuf *m, struct socket *so)
141 {
142 	struct nlpcb *nlp = sotonlpcb(so);
143 	int error = 0;
144 
145 	m->m_nextpkt = NULL;
146 
147 	NLP_LOCK(nlp);
148 
149 	if ((__predict_true(nlp->nl_active))) {
150 		sbappend(&so->so_snd, m, 0);
151 		NL_LOG(LOG_DEBUG3, "enqueue %u bytes", m_length(m, NULL));
152 		nl_schedule_taskqueue(nlp);
153 	} else {
154 		NL_LOG(LOG_DEBUG, "ignoring %u bytes on non-active socket",
155 		    m_length(m, NULL));
156 		m_free(m);
157 		error = EINVAL;
158 	}
159 
160 	NLP_UNLOCK(nlp);
161 
162 	return (error);
163 }
164 
165 static bool
166 tx_check_locked(struct nlpcb *nlp)
167 {
168 	if (queue_empty(&nlp->tx_queue))
169 		return (true);
170 
171 	/*
172 	 * Check if something can be moved from the internal TX queue
173 	 * to the socket queue.
174 	 */
175 
176 	bool appended = false;
177 	struct sockbuf *sb = &nlp->nl_socket->so_rcv;
178 	SOCKBUF_LOCK(sb);
179 
180 	while (true) {
181 		struct mbuf *m = queue_head(&nlp->tx_queue);
182 		if (m && sbappendaddr_locked(sb, nl_empty_src, m, NULL) != 0) {
183 			/* appended successfully */
184 			queue_pop(&nlp->tx_queue);
185 			appended = true;
186 		} else
187 			break;
188 	}
189 
190 	SOCKBUF_UNLOCK(sb);
191 
192 	if (appended)
193 		sorwakeup(nlp->nl_socket);
194 
195 	return (queue_empty(&nlp->tx_queue));
196 }
197 
/*
 * Runs a single processing pass for @nlp:
 *  1) clears the task-pending flag and tries to drain the TX overflow queue;
 *  2) if the internal RX queue is empty, refills it with everything
 *     currently available in the socket send buffer;
 *  3) processes the queued RX packets one by one until the queue is empty
 *     or nl_process_mbuf() hands a packet back.
 *
 * Returns true if the caller should immediately run another pass.
 */
static bool
nl_process_received_one(struct nlpcb *nlp)
{
	bool reschedule = false;

	NLP_LOCK(nlp);
	nlp->nl_task_pending = false;

	if (!tx_check_locked(nlp)) {
		/* TX overflow queue still not empty, ignore RX */
		NLP_UNLOCK(nlp);
		return (false);
	}

	if (queue_empty(&nlp->rx_queue)) {
		/*
		 * Grab all data we have from the socket TX queue
		 * and store it in the internal queue, so it can be worked on
		 * w/o holding socket lock.
		 */
		struct sockbuf *sb = &nlp->nl_socket->so_snd;

		SOCKBUF_LOCK(sb);
		unsigned int avail = sbavail(sb);
		if (avail > 0) {
			NL_LOG(LOG_DEBUG3, "grabbed %u bytes", avail);
			queue_push(&nlp->rx_queue, sbcut_locked(sb, avail));
		}
		SOCKBUF_UNLOCK(sb);
	} else {
		/* Schedule another pass to read from the socket queue */
		reschedule = true;
	}

	/* Snapshot the high-water mark so growth during this pass can be logged */
	int prev_hiwat = nlp->tx_queue.hiwat;
	NLP_UNLOCK(nlp);

	/* From here on the RX queue is accessed without the nlp lock held */
	while (!queue_empty(&nlp->rx_queue)) {
		struct mbuf *m = queue_pop(&nlp->rx_queue);

		m = nl_process_mbuf(m, nlp);
		if (m != NULL) {
			/*
			 * The packet was handed back (nl_process_mbuf() does so
			 * when TX got blocked): keep it at the head of the
			 * queue and stop without requesting another pass.
			 */
			queue_push_head(&nlp->rx_queue, m);
			reschedule = false;
			break;
		}
	}
	if (nlp->tx_queue.hiwat > prev_hiwat) {
		NLP_LOG(LOG_DEBUG, nlp, "TX override peaked to %d", nlp->tx_queue.hiwat);

	}

	return (reschedule);
}
252 
253 static void
254 nl_process_received(struct nlpcb *nlp)
255 {
256 	NL_LOG(LOG_DEBUG3, "taskqueue called");
257 
258 	while (nl_process_received_one(nlp))
259 		;
260 }
261 
262 void
263 nl_init_io(struct nlpcb *nlp)
264 {
265 	STAILQ_INIT(&nlp->rx_queue.head);
266 	STAILQ_INIT(&nlp->tx_queue.head);
267 }
268 
269 void
270 nl_free_io(struct nlpcb *nlp)
271 {
272 	queue_free(&nlp->rx_queue);
273 	queue_free(&nlp->tx_queue);
274 }
275 
276 /*
277  * Called after some data have been read from the socket.
278  */
279 void
280 nl_on_transmit(struct nlpcb *nlp)
281 {
282 	NLP_LOCK(nlp);
283 
284 	struct socket *so = nlp->nl_socket;
285 	if (__predict_false(nlp->nl_dropped_bytes > 0 && so != NULL)) {
286 		unsigned long dropped_bytes = nlp->nl_dropped_bytes;
287 		unsigned long dropped_messages = nlp->nl_dropped_messages;
288 		nlp->nl_dropped_bytes = 0;
289 		nlp->nl_dropped_messages = 0;
290 
291 		struct sockbuf *sb = &so->so_rcv;
292 		NLP_LOG(LOG_DEBUG, nlp,
293 		    "socket RX overflowed, %lu messages (%lu bytes) dropped. "
294 		    "bytes: [%u/%u] mbufs: [%u/%u]", dropped_messages, dropped_bytes,
295 		    sb->sb_ccc, sb->sb_hiwat, sb->sb_mbcnt, sb->sb_mbmax);
296 		/* TODO: send netlink message */
297 	}
298 
299 	nl_schedule_taskqueue(nlp);
300 	NLP_UNLOCK(nlp);
301 }
302 
303 void
304 nl_taskqueue_handler(void *_arg, int pending)
305 {
306 	struct nlpcb *nlp = (struct nlpcb *)_arg;
307 
308 	CURVNET_SET(nlp->nl_socket->so_vnet);
309 	nl_process_received(nlp);
310 	CURVNET_RESTORE();
311 }
312 
313 static __noinline void
314 queue_push_tx(struct nlpcb *nlp, struct mbuf *m)
315 {
316 	queue_push(&nlp->tx_queue, m);
317 	nlp->nl_tx_blocked = true;
318 
319 	if (nlp->tx_queue.length > nlp->tx_queue.hiwat)
320 		nlp->tx_queue.hiwat = nlp->tx_queue.length;
321 }
322 
323 /*
324  * Tries to send @m to the socket @nlp.
325  *
326  * @m: mbuf(s) to send to. Consumed in any case.
327  * @nlp: socket to send to
328  * @cnt: number of messages in @m
329  * @io_flags: combination of NL_IOF_* flags
330  *
331  * Returns true on success.
332  * If no queue overrunes happened, wakes up socket owner.
333  */
334 bool
335 nl_send_one(struct mbuf *m, struct nlpcb *nlp, int num_messages, int io_flags)
336 {
337 	bool untranslated = io_flags & NL_IOF_UNTRANSLATED;
338 	bool ignore_limits = io_flags & NL_IOF_IGNORE_LIMIT;
339 	bool result = true;
340 
341 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
342 		struct nlmsghdr *hdr = mtod(m, struct nlmsghdr *);
343 		NLP_LOG(LOG_DEBUG2, nlp,
344 		    "TX mbuf len %u msgs %u msg type %d first hdrlen %u io_flags %X",
345 		    m_length(m, NULL), num_messages, hdr->nlmsg_type, hdr->nlmsg_len,
346 		    io_flags);
347 	}
348 
349 	if (__predict_false(nlp->nl_linux && linux_netlink_p != NULL && untranslated)) {
350 		m = linux_netlink_p->mbufs_to_linux(nlp->nl_proto, m, nlp);
351 		if (m == NULL)
352 			return (false);
353 	}
354 
355 	NLP_LOCK(nlp);
356 
357 	if (__predict_false(nlp->nl_socket == NULL)) {
358 		NLP_UNLOCK(nlp);
359 		m_freem(m);
360 		return (false);
361 	}
362 
363 	if (!queue_empty(&nlp->tx_queue)) {
364 		if (ignore_limits) {
365 			queue_push_tx(nlp, m);
366 		} else {
367 			m_free(m);
368 			result = false;
369 		}
370 		NLP_UNLOCK(nlp);
371 		return (result);
372 	}
373 
374 	struct socket *so = nlp->nl_socket;
375 	if (sbappendaddr(&so->so_rcv, nl_empty_src, m, NULL) != 0) {
376 		sorwakeup(so);
377 		NLP_LOG(LOG_DEBUG3, nlp, "appended data & woken up");
378 	} else {
379 		if (ignore_limits) {
380 			queue_push_tx(nlp, m);
381 		} else {
382 			/*
383 			 * Store dropped data so it can be reported
384 			 * on the next read
385 			 */
386 			nlp->nl_dropped_bytes += m_length(m, NULL);
387 			nlp->nl_dropped_messages += num_messages;
388 			NLP_LOG(LOG_DEBUG2, nlp, "RX oveflow: %lu m (+%d), %lu b (+%d)",
389 			    (unsigned long)nlp->nl_dropped_messages, num_messages,
390 			    (unsigned long)nlp->nl_dropped_bytes, m_length(m, NULL));
391 			soroverflow(so);
392 			m_freem(m);
393 			result = false;
394 		}
395 	}
396 	NLP_UNLOCK(nlp);
397 
398 	return (result);
399 }
400 
401 static int
402 nl_receive_message(struct nlmsghdr *hdr, int remaining_length,
403     struct nlpcb *nlp, struct nl_pstate *npt)
404 {
405 	nl_handler_f handler = nl_handlers[nlp->nl_proto].cb;
406 	int error = 0;
407 
408 	NLP_LOG(LOG_DEBUG2, nlp, "msg len: %u type: %d: flags: 0x%X seq: %u pid: %u",
409 	    hdr->nlmsg_len, hdr->nlmsg_type, hdr->nlmsg_flags, hdr->nlmsg_seq,
410 	    hdr->nlmsg_pid);
411 
412 	if (__predict_false(hdr->nlmsg_len > remaining_length)) {
413 		NLP_LOG(LOG_DEBUG, nlp, "message is not entirely present: want %d got %d",
414 		    hdr->nlmsg_len, remaining_length);
415 		return (EINVAL);
416 	} else if (__predict_false(hdr->nlmsg_len < sizeof(*hdr))) {
417 		NL_LOG(LOG_DEBUG, "message too short: %d", hdr->nlmsg_len);
418 		return (EINVAL);
419 	}
420 	/* Stamp each message with sender pid */
421 	hdr->nlmsg_pid = nlp->nl_port;
422 
423 	npt->hdr = hdr;
424 
425 	if (hdr->nlmsg_flags & NLM_F_REQUEST && hdr->nlmsg_type >= NLMSG_MIN_TYPE) {
426 		NL_LOG(LOG_DEBUG2, "handling message with msg type: %d",
427 		   hdr->nlmsg_type);
428 
429 		if (nlp->nl_linux && linux_netlink_p != NULL) {
430 			struct nlmsghdr *hdr_orig = hdr;
431 			hdr = linux_netlink_p->msg_from_linux(nlp->nl_proto, hdr, npt);
432 			if (hdr == NULL) {
433 				npt->hdr = hdr_orig;
434 				if (hdr->nlmsg_flags & NLM_F_ACK)
435 					nlmsg_ack(nlp, EAGAIN, hdr, npt);
436 				return (0);
437 			}
438 		}
439 		error = handler(hdr, npt);
440 		NL_LOG(LOG_DEBUG2, "retcode: %d", error);
441 	}
442 	if ((hdr->nlmsg_flags & NLM_F_ACK) || (error != 0 && error != EINTR)) {
443 		if (!npt->nw->suppress_ack) {
444 			NL_LOG(LOG_DEBUG3, "ack");
445 			nlmsg_ack(nlp, error, hdr, npt);
446 		}
447 	}
448 
449 	return (0);
450 }
451 
452 static void
453 npt_clear(struct nl_pstate *npt)
454 {
455 	lb_clear(&npt->lb);
456 	npt->error = 0;
457 	npt->err_msg = NULL;
458 	npt->err_off = 0;
459 	npt->hdr = NULL;
460 	npt->nw->suppress_ack = false;
461 }
462 
/*
 * Processes an incoming packet, which can contain multiple netlink messages
 *
 * The packet data is copied into a zeroed linear buffer; the space after
 * the 8-byte-aligned data is handed to the parsers as scratch space
 * (npt.lb). Messages are then processed one by one by nl_receive_message().
 *
 * Returns NULL when @m has been consumed (fully processed or dropped on
 * error/allocation failure). If TX got blocked during processing, returns
 * @m trimmed at the front by the number of bytes already processed, so the
 * caller can retry it later.
 */
static struct mbuf *
nl_process_mbuf(struct mbuf *m, struct nlpcb *nlp)
{
	int offset, buffer_length;
	struct nlmsghdr *hdr;
	char *buffer;
	int error;

	NL_LOG(LOG_DEBUG3, "RX netlink mbuf %p on %p", m, nlp->nl_socket);

	struct nl_writer nw = {};
	if (!nlmsg_get_unicast_writer(&nw, NLMSG_SMALL, nlp)) {
		m_freem(m);
		NL_LOG(LOG_DEBUG, "error allocating socket writer");
		return (NULL);
	}

	nlmsg_ignore_limit(&nw);
	/* TODO: alloc this buf once for nlp */
	int data_length = m_length(m, NULL);
	/* Room for the aligned data plus the scratch area */
	buffer_length = roundup2(data_length, 8) + SCRATCH_BUFFER_SIZE;
	/* Linux sockets get extra scratch space of the size of the data */
	if (nlp->nl_linux)
		buffer_length += roundup2(data_length, 8);
	buffer = malloc(buffer_length, M_NETLINK, M_NOWAIT | M_ZERO);
	if (buffer == NULL) {
		m_freem(m);
		nlmsg_flush(&nw);
		NL_LOG(LOG_DEBUG, "Unable to allocate %d bytes of memory",
		    buffer_length);
		return (NULL);
	}
	m_copydata(m, 0, data_length, buffer);

	/* Scratch area starts right after the aligned copy of the data */
	struct nl_pstate npt = {
		.nlp = nlp,
		.lb.base = &buffer[roundup2(data_length, 8)],
		.lb.size = buffer_length - roundup2(data_length, 8),
		.nw = &nw,
		.strict = nlp->nl_flags & NLF_STRICT,
	};

	/* Walk the messages while at least a full header remains */
	for (offset = 0; offset + sizeof(struct nlmsghdr) <= data_length;) {
		hdr = (struct nlmsghdr *)&buffer[offset];
		/* Save length prior to calling handler */
		int msglen = NLMSG_ALIGN(hdr->nlmsg_len);
		NL_LOG(LOG_DEBUG3, "parsing offset %d/%d", offset, data_length);
		npt_clear(&npt);
		error = nl_receive_message(hdr, data_length - offset, nlp, &npt);
		/* The offset is advanced even if the message was rejected */
		offset += msglen;
		if (__predict_false(error != 0 || nlp->nl_tx_blocked))
			break;
	}
	NL_LOG(LOG_DEBUG3, "packet parsing done");
	free(buffer, M_NETLINK);
	nlmsg_flush(&nw);

	if (nlp->nl_tx_blocked) {
		/*
		 * Replies did not fit into the socket buffer: clear the flag,
		 * cut off the already processed part and give the remainder
		 * back to the caller.
		 */
		NLP_LOCK(nlp);
		nlp->nl_tx_blocked = false;
		NLP_UNLOCK(nlp);
		m_adj(m, offset);
		return (m);
	} else {
		m_freem(m);
		return (NULL);
	}
}
533