xref: /freebsd/sys/netlink/netlink_io.c (revision 0aa2700123e22c2b0a977375e087dc2759b8e980)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2021 Ng Peng Nam Sean
5  * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 #include <sys/param.h>
32 #include <sys/malloc.h>
33 #include <sys/lock.h>
34 #include <sys/mbuf.h>
35 #include <sys/ck.h>
36 #include <sys/socket.h>
37 #include <sys/socketvar.h>
38 #include <sys/syslog.h>
39 
40 #include <netlink/netlink.h>
41 #include <netlink/netlink_ctl.h>
42 #include <netlink/netlink_linux.h>
43 #include <netlink/netlink_var.h>
44 
45 #define	DEBUG_MOD_NAME	nl_io
46 #define	DEBUG_MAX_LEVEL	LOG_DEBUG3
47 #include <netlink/netlink_debug.h>
48 _DECLARE_DEBUG(LOG_DEBUG);
49 
50 /*
51  * The logic below provide a p2p interface for receiving and
52  * sending netlink data between the kernel and userland.
53  */
54 
/*
 * Source address stamped on every kernel-originated message:
 * nl_pid 0 designates the kernel itself.
 */
static const struct sockaddr_nl _nl_empty_src = {
	.nl_len = sizeof(struct sockaddr_nl),
	.nl_family = PF_NETLINK,
	.nl_pid = 0 /* comes from the kernel */
};
/* Generic-sockaddr view of the address above, for sbappendaddr*(). */
static const struct sockaddr *nl_empty_src = (const struct sockaddr *)&_nl_empty_src;
61 
62 static struct mbuf *nl_process_mbuf(struct mbuf *m, struct nlpcb *nlp);
63 
64 
65 static void
66 queue_push(struct nl_io_queue *q, struct mbuf *mq)
67 {
68 	while (mq != NULL) {
69 		struct mbuf *m = mq;
70 		mq = mq->m_nextpkt;
71 		m->m_nextpkt = NULL;
72 
73 		q->length += m_length(m, NULL);
74 		STAILQ_INSERT_TAIL(&q->head, m, m_stailqpkt);
75 	}
76 }
77 
78 static void
79 queue_push_head(struct nl_io_queue *q, struct mbuf *m)
80 {
81 	MPASS(m->m_nextpkt == NULL);
82 
83 	q->length += m_length(m, NULL);
84 	STAILQ_INSERT_HEAD(&q->head, m, m_stailqpkt);
85 }
86 
87 static struct mbuf *
88 queue_pop(struct nl_io_queue *q)
89 {
90 	if (!STAILQ_EMPTY(&q->head)) {
91 		struct mbuf *m = STAILQ_FIRST(&q->head);
92 		STAILQ_REMOVE_HEAD(&q->head, m_stailqpkt);
93 		m->m_nextpkt = NULL;
94 		q->length -= m_length(m, NULL);
95 
96 		return (m);
97 	}
98 	return (NULL);
99 }
100 
/* Returns the first packet in @q without removing it (NULL if empty). */
static struct mbuf *
queue_head(const struct nl_io_queue *q)
{
	return (STAILQ_FIRST(&q->head));
}
106 
/*
 * Returns true if @q holds no data.  Relies on the byte counter
 * rather than the list head, so push/pop must keep them in sync.
 */
static inline bool
queue_empty(const struct nl_io_queue *q)
{
	return (q->length == 0);
}
112 
113 static void
114 queue_free(struct nl_io_queue *q)
115 {
116 	while (!STAILQ_EMPTY(&q->head)) {
117 		struct mbuf *m = STAILQ_FIRST(&q->head);
118 		STAILQ_REMOVE_HEAD(&q->head, m_stailqpkt);
119 		m->m_nextpkt = NULL;
120 		m_freem(m);
121 	}
122 	q->length = 0;
123 }
124 
125 
126 static void
127 nl_schedule_taskqueue(struct nlpcb *nlp)
128 {
129 	if (!nlp->nl_task_pending) {
130 		nlp->nl_task_pending = true;
131 		taskqueue_enqueue(nlp->nl_taskqueue, &nlp->nl_task);
132 		NL_LOG(LOG_DEBUG3, "taskqueue scheduled");
133 	} else {
134 		NL_LOG(LOG_DEBUG3, "taskqueue schedule skipped");
135 	}
136 }
137 
138 int
139 nl_receive_async(struct mbuf *m, struct socket *so)
140 {
141 	struct nlpcb *nlp = sotonlpcb(so);
142 	int error = 0;
143 
144 	m->m_nextpkt = NULL;
145 
146 	NLP_LOCK(nlp);
147 
148 	if ((__predict_true(nlp->nl_active))) {
149 		sbappend(&so->so_snd, m, 0);
150 		NL_LOG(LOG_DEBUG3, "enqueue %u bytes", m_length(m, NULL));
151 		nl_schedule_taskqueue(nlp);
152 	} else {
153 		NL_LOG(LOG_DEBUG, "ignoring %u bytes on non-active socket",
154 		    m_length(m, NULL));
155 		m_free(m);
156 		error = EINVAL;
157 	}
158 
159 	NLP_UNLOCK(nlp);
160 
161 	return (error);
162 }
163 
/*
 * Attempts to drain the internal TX overflow queue into the socket's
 * receive buffer.  Called with the NLP lock held (see
 * nl_process_received_one()).
 *
 * Returns true if the TX queue is empty afterwards, meaning RX
 * processing may proceed.
 */
static bool
tx_check_locked(struct nlpcb *nlp)
{
	if (queue_empty(&nlp->tx_queue))
		return (true);

	/*
	 * Check if something can be moved from the internal TX queue
	 * to the socket queue.
	 */

	bool appended = false;
	struct sockbuf *sb = &nlp->nl_socket->so_rcv;
	SOCKBUF_LOCK(sb);

	while (true) {
		struct mbuf *m = queue_head(&nlp->tx_queue);
		if (m && sbappendaddr_locked(sb, nl_empty_src, m, NULL) != 0) {
			/* appended successfully */
			queue_pop(&nlp->tx_queue);
			appended = true;
		} else
			break;
	}

	SOCKBUF_UNLOCK(sb);

	if (appended)
		sorwakeup(nlp->nl_socket);

	return (queue_empty(&nlp->tx_queue));
}
196 
/*
 * Performs one RX processing pass for @nlp:
 *  1. tries to flush the TX overflow queue, bailing out if it stays full;
 *  2. if the internal RX queue is empty, moves all pending data from the
 *     socket send buffer into it so parsing can proceed without the
 *     socket lock; otherwise requests another pass;
 *  3. parses packets from the RX queue; a partially-processed packet
 *     (TX blocked) is pushed back to the queue head for a later pass.
 *
 * Returns true if another pass should be scheduled by the caller.
 */
static bool
nl_process_received_one(struct nlpcb *nlp)
{
	bool reschedule = false;

	NLP_LOCK(nlp);
	nlp->nl_task_pending = false;

	if (!tx_check_locked(nlp)) {
		/* TX overflow queue still not empty, ignore RX */
		NLP_UNLOCK(nlp);
		return (false);
	}

	if (queue_empty(&nlp->rx_queue)) {
		/*
		 * Grab all data we have from the socket TX queue
		 * and store it the internal queue, so it can be worked on
		 * w/o holding socket lock.
		 */
		struct sockbuf *sb = &nlp->nl_socket->so_snd;

		SOCKBUF_LOCK(sb);
		unsigned int avail = sbavail(sb);
		if (avail > 0) {
			NL_LOG(LOG_DEBUG3, "grabbed %u bytes", avail);
			queue_push(&nlp->rx_queue, sbcut_locked(sb, avail));
		}
		SOCKBUF_UNLOCK(sb);
	} else {
		/* Schedule another pass to read from the socket queue */
		reschedule = true;
	}

	/* Snapshot the TX high-water mark to detect growth during this pass. */
	int prev_hiwat = nlp->tx_queue.hiwat;
	NLP_UNLOCK(nlp);

	while (!queue_empty(&nlp->rx_queue)) {
		struct mbuf *m = queue_pop(&nlp->rx_queue);

		m = nl_process_mbuf(m, nlp);
		if (m != NULL) {
			/* TX blocked mid-packet: keep the remainder for later. */
			queue_push_head(&nlp->rx_queue, m);
			reschedule = false;
			break;
		}
	}
	if (nlp->tx_queue.hiwat > prev_hiwat) {
		NLP_LOG(LOG_DEBUG, nlp, "TX override peaked to %d", nlp->tx_queue.hiwat);

	}

	return (reschedule);
}
251 
252 static void
253 nl_process_received(struct nlpcb *nlp)
254 {
255 	NL_LOG(LOG_DEBUG3, "taskqueue called");
256 
257 	while (nl_process_received_one(nlp))
258 		;
259 }
260 
261 void
262 nl_init_io(struct nlpcb *nlp)
263 {
264 	STAILQ_INIT(&nlp->rx_queue.head);
265 	STAILQ_INIT(&nlp->tx_queue.head);
266 }
267 
268 void
269 nl_free_io(struct nlpcb *nlp)
270 {
271 	queue_free(&nlp->rx_queue);
272 	queue_free(&nlp->tx_queue);
273 }
274 
275 /*
276  * Called after some data have been read from the socket.
277  */
278 void
279 nl_on_transmit(struct nlpcb *nlp)
280 {
281 	NLP_LOCK(nlp);
282 
283 	struct socket *so = nlp->nl_socket;
284 	if (__predict_false(nlp->nl_dropped_bytes > 0 && so != NULL)) {
285 		unsigned long dropped_bytes = nlp->nl_dropped_bytes;
286 		unsigned long dropped_messages = nlp->nl_dropped_messages;
287 		nlp->nl_dropped_bytes = 0;
288 		nlp->nl_dropped_messages = 0;
289 
290 		struct sockbuf *sb = &so->so_rcv;
291 		NLP_LOG(LOG_DEBUG, nlp,
292 		    "socket RX overflowed, %lu messages (%lu bytes) dropped. "
293 		    "bytes: [%u/%u] mbufs: [%u/%u]", dropped_messages, dropped_bytes,
294 		    sb->sb_ccc, sb->sb_hiwat, sb->sb_mbcnt, sb->sb_mbmax);
295 		/* TODO: send netlink message */
296 	}
297 
298 	nl_schedule_taskqueue(nlp);
299 	NLP_UNLOCK(nlp);
300 }
301 
302 void
303 nl_taskqueue_handler(void *_arg, int pending)
304 {
305 	struct nlpcb *nlp = (struct nlpcb *)_arg;
306 
307 	CURVNET_SET(nlp->nl_socket->so_vnet);
308 	nl_process_received(nlp);
309 	CURVNET_RESTORE();
310 }
311 
312 static __noinline void
313 queue_push_tx(struct nlpcb *nlp, struct mbuf *m)
314 {
315 	queue_push(&nlp->tx_queue, m);
316 	nlp->nl_tx_blocked = true;
317 
318 	if (nlp->tx_queue.length > nlp->tx_queue.hiwat)
319 		nlp->tx_queue.hiwat = nlp->tx_queue.length;
320 }
321 
322 /*
323  * Tries to send @m to the socket @nlp.
324  *
325  * @m: mbuf(s) to send to. Consumed in any case.
326  * @nlp: socket to send to
327  * @cnt: number of messages in @m
328  * @io_flags: combination of NL_IOF_* flags
329  *
330  * Returns true on success.
331  * If no queue overrunes happened, wakes up socket owner.
332  */
333 bool
334 nl_send_one(struct mbuf *m, struct nlpcb *nlp, int num_messages, int io_flags)
335 {
336 	bool untranslated = io_flags & NL_IOF_UNTRANSLATED;
337 	bool ignore_limits = io_flags & NL_IOF_IGNORE_LIMIT;
338 	bool result = true;
339 
340 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
341 		struct nlmsghdr *hdr = mtod(m, struct nlmsghdr *);
342 		NLP_LOG(LOG_DEBUG2, nlp,
343 		    "TX mbuf len %u msgs %u msg type %d first hdrlen %u io_flags %X",
344 		    m_length(m, NULL), num_messages, hdr->nlmsg_type, hdr->nlmsg_len,
345 		    io_flags);
346 	}
347 
348 	if (__predict_false(nlp->nl_linux && linux_netlink_p != NULL && untranslated)) {
349 		m = linux_netlink_p->mbufs_to_linux(nlp->nl_proto, m, nlp);
350 		if (m == NULL)
351 			return (false);
352 	}
353 
354 	NLP_LOCK(nlp);
355 
356 	if (__predict_false(nlp->nl_socket == NULL)) {
357 		NLP_UNLOCK(nlp);
358 		m_freem(m);
359 		return (false);
360 	}
361 
362 	if (!queue_empty(&nlp->tx_queue)) {
363 		if (ignore_limits) {
364 			queue_push_tx(nlp, m);
365 		} else {
366 			m_free(m);
367 			result = false;
368 		}
369 		NLP_UNLOCK(nlp);
370 		return (result);
371 	}
372 
373 	struct socket *so = nlp->nl_socket;
374 	if (sbappendaddr(&so->so_rcv, nl_empty_src, m, NULL) != 0) {
375 		sorwakeup(so);
376 		NLP_LOG(LOG_DEBUG3, nlp, "appended data & woken up");
377 	} else {
378 		if (ignore_limits) {
379 			queue_push_tx(nlp, m);
380 		} else {
381 			/*
382 			 * Store dropped data so it can be reported
383 			 * on the next read
384 			 */
385 			nlp->nl_dropped_bytes += m_length(m, NULL);
386 			nlp->nl_dropped_messages += num_messages;
387 			NLP_LOG(LOG_DEBUG2, nlp, "RX oveflow: %lu m (+%d), %lu b (+%d)",
388 			    (unsigned long)nlp->nl_dropped_messages, num_messages,
389 			    (unsigned long)nlp->nl_dropped_bytes, m_length(m, NULL));
390 			soroverflow(so);
391 			m_freem(m);
392 			result = false;
393 		}
394 	}
395 	NLP_UNLOCK(nlp);
396 
397 	return (result);
398 }
399 
400 static int
401 nl_receive_message(struct nlmsghdr *hdr, int remaining_length,
402     struct nlpcb *nlp, struct nl_pstate *npt)
403 {
404 	nl_handler_f handler = nl_handlers[nlp->nl_proto].cb;
405 	int error = 0;
406 
407 	NL_LOG(LOG_DEBUG2, "msg len: %d type: %d", hdr->nlmsg_len,
408 	    hdr->nlmsg_type);
409 
410 	if (__predict_false(hdr->nlmsg_len > remaining_length)) {
411 		NLP_LOG(LOG_DEBUG, nlp, "message is not entirely present: want %d got %d",
412 		    hdr->nlmsg_len, remaining_length);
413 		return (EINVAL);
414 	} else if (__predict_false(hdr->nlmsg_len < sizeof(*hdr))) {
415 		NL_LOG(LOG_DEBUG, "message too short: %d", hdr->nlmsg_len);
416 		return (EINVAL);
417 	}
418 	/* Stamp each message with sender pid */
419 	hdr->nlmsg_pid = nlp->nl_port;
420 
421 	npt->hdr = hdr;
422 
423 	if (hdr->nlmsg_flags & NLM_F_REQUEST && hdr->nlmsg_type >= NLMSG_MIN_TYPE) {
424 		NL_LOG(LOG_DEBUG2, "handling message with msg type: %d",
425 		   hdr->nlmsg_type);
426 
427 		if (nlp->nl_linux && linux_netlink_p != NULL) {
428 			struct nlmsghdr *hdr_orig = hdr;
429 			hdr = linux_netlink_p->msg_from_linux(nlp->nl_proto, hdr, npt);
430 			if (hdr == NULL) {
431 				npt->hdr = hdr_orig;
432 				if (hdr->nlmsg_flags & NLM_F_ACK)
433 					nlmsg_ack(nlp, EAGAIN, hdr, npt);
434 				return (0);
435 			}
436 		}
437 		error = handler(hdr, npt);
438 		NL_LOG(LOG_DEBUG2, "retcode: %d", error);
439 	}
440 	if ((hdr->nlmsg_flags & NLM_F_ACK) || (error != 0 && error != EINTR)) {
441 		NL_LOG(LOG_DEBUG3, "ack");
442 		nlmsg_ack(nlp, error, hdr, npt);
443 		NL_LOG(LOG_DEBUG3, "done");
444 	}
445 
446 	return (0);
447 }
448 
449 static void
450 npt_clear(struct nl_pstate *npt)
451 {
452 	lb_clear(&npt->lb);
453 	npt->error = 0;
454 	npt->err_msg = NULL;
455 	npt->err_off = 0;
456 	npt->hdr = NULL;
457 }
458 
459 /*
460  * Processes an incoming packet, which can contain multiple netlink messages
461  */
462 static struct mbuf *
463 nl_process_mbuf(struct mbuf *m, struct nlpcb *nlp)
464 {
465 	int offset, buffer_length;
466 	struct nlmsghdr *hdr;
467 	char *buffer;
468 	int error;
469 
470 	NL_LOG(LOG_DEBUG3, "RX netlink mbuf %p on %p", m, nlp->nl_socket);
471 
472 	struct nl_writer nw = {};
473 	if (!nlmsg_get_unicast_writer(&nw, NLMSG_SMALL, nlp)) {
474 		m_freem(m);
475 		NL_LOG(LOG_DEBUG, "error allocating socket writer");
476 		return (NULL);
477 	}
478 
479 	nlmsg_ignore_limit(&nw);
480 	/* TODO: alloc this buf once for nlp */
481 	int data_length = m_length(m, NULL);
482 	buffer_length = roundup2(data_length, 8) + SCRATCH_BUFFER_SIZE;
483 	if (nlp->nl_linux)
484 		buffer_length += roundup2(data_length, 8);
485 	buffer = malloc(buffer_length, M_NETLINK, M_NOWAIT | M_ZERO);
486 	if (buffer == NULL) {
487 		m_freem(m);
488 		nlmsg_flush(&nw);
489 		NL_LOG(LOG_DEBUG, "Unable to allocate %d bytes of memory",
490 		    buffer_length);
491 		return (NULL);
492 	}
493 	m_copydata(m, 0, data_length, buffer);
494 
495 	struct nl_pstate npt = {
496 		.nlp = nlp,
497 		.lb.base = &buffer[roundup2(data_length, 8)],
498 		.lb.size = buffer_length - roundup2(data_length, 8),
499 		.nw = &nw,
500 		.strict = nlp->nl_flags & NLF_STRICT,
501 	};
502 
503 	for (offset = 0; offset + sizeof(struct nlmsghdr) <= data_length;) {
504 		hdr = (struct nlmsghdr *)&buffer[offset];
505 		/* Save length prior to calling handler */
506 		int msglen = NLMSG_ALIGN(hdr->nlmsg_len);
507 		NL_LOG(LOG_DEBUG3, "parsing offset %d/%d", offset, data_length);
508 		npt_clear(&npt);
509 		error = nl_receive_message(hdr, data_length - offset, nlp, &npt);
510 		offset += msglen;
511 		if (__predict_false(error != 0 || nlp->nl_tx_blocked))
512 			break;
513 	}
514 	NL_LOG(LOG_DEBUG3, "packet parsing done");
515 	free(buffer, M_NETLINK);
516 	nlmsg_flush(&nw);
517 
518 	if (nlp->nl_tx_blocked) {
519 		NLP_LOCK(nlp);
520 		nlp->nl_tx_blocked = false;
521 		NLP_UNLOCK(nlp);
522 		m_adj(m, offset);
523 		return (m);
524 	} else {
525 		m_freem(m);
526 		return (NULL);
527 	}
528 }
529