xref: /freebsd/sys/netlink/netlink_io.c (revision 0d9ef08e099f6837de5a40fd582d9ffb01fd31a4)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2021 Ng Peng Nam Sean
5  * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/param.h>
30 #include <sys/ck.h>
31 #include <sys/lock.h>
32 #include <sys/malloc.h>
33 #include <sys/mbuf.h>
34 #include <sys/mutex.h>
35 #include <sys/socket.h>
36 #include <sys/socketvar.h>
37 #include <sys/syslog.h>
38 
39 #include <netlink/netlink.h>
40 #include <netlink/netlink_ctl.h>
41 #include <netlink/netlink_linux.h>
42 #include <netlink/netlink_var.h>
43 
44 #define	DEBUG_MOD_NAME	nl_io
45 #define	DEBUG_MAX_LEVEL	LOG_DEBUG3
46 #include <netlink/netlink_debug.h>
47 _DECLARE_DEBUG(LOG_INFO);
48 
/*
 * The logic below provides a p2p interface for receiving and
 * sending netlink data between the kernel and userland.
 */
53 
54 static bool nl_process_nbuf(struct nl_buf *nb, struct nlpcb *nlp);
55 
56 struct nl_buf *
nl_buf_alloc(size_t len,int mflag)57 nl_buf_alloc(size_t len, int mflag)
58 {
59 	struct nl_buf *nb;
60 
61 	KASSERT(len > 0 && len <= UINT_MAX, ("%s: invalid length %zu",
62 	    __func__, len));
63 
64 	nb = malloc(sizeof(struct nl_buf) + len, M_NETLINK, mflag);
65 	if (__predict_true(nb != NULL)) {
66 		nb->buflen = len;
67 		nb->datalen = nb->offset = 0;
68 	}
69 
70 	return (nb);
71 }
72 
73 void
nl_buf_free(struct nl_buf * nb)74 nl_buf_free(struct nl_buf *nb)
75 {
76 
77 	free(nb, M_NETLINK);
78 }
79 
80 void
nl_schedule_taskqueue(struct nlpcb * nlp)81 nl_schedule_taskqueue(struct nlpcb *nlp)
82 {
83 	if (!nlp->nl_task_pending) {
84 		nlp->nl_task_pending = true;
85 		taskqueue_enqueue(nlp->nl_taskqueue, &nlp->nl_task);
86 		NL_LOG(LOG_DEBUG3, "taskqueue scheduled");
87 	} else {
88 		NL_LOG(LOG_DEBUG3, "taskqueue schedule skipped");
89 	}
90 }
91 
92 static bool
nl_process_received_one(struct nlpcb * nlp)93 nl_process_received_one(struct nlpcb *nlp)
94 {
95 	struct socket *so = nlp->nl_socket;
96 	struct sockbuf *sb;
97 	struct nl_buf *nb;
98 	bool reschedule = false;
99 
100 	NLP_LOCK(nlp);
101 	nlp->nl_task_pending = false;
102 	NLP_UNLOCK(nlp);
103 
104 	/*
105 	 * Do not process queued up requests if there is no space to queue
106 	 * replies.
107 	 */
108 	sb = &so->so_rcv;
109 	SOCK_RECVBUF_LOCK(so);
110 	if (sb->sb_hiwat <= sb->sb_ccc) {
111 		SOCK_RECVBUF_UNLOCK(so);
112 		NL_LOG(LOG_DEBUG3, "socket %p stuck", so);
113 		return (false);
114 	}
115 	SOCK_RECVBUF_UNLOCK(so);
116 
117 	sb = &so->so_snd;
118 	SOCK_SENDBUF_LOCK(so);
119 	while ((nb = TAILQ_FIRST(&sb->nl_queue)) != NULL) {
120 		TAILQ_REMOVE(&sb->nl_queue, nb, tailq);
121 		SOCK_SENDBUF_UNLOCK(so);
122 		reschedule = nl_process_nbuf(nb, nlp);
123 		SOCK_SENDBUF_LOCK(so);
124 		if (reschedule) {
125 			sb->sb_acc -= nb->datalen;
126 			sb->sb_ccc -= nb->datalen;
127 			/* XXXGL: potentially can reduce lock&unlock count. */
128 			sowwakeup_locked(so);
129 			nl_buf_free(nb);
130 			SOCK_SENDBUF_LOCK(so);
131 		} else {
132 			TAILQ_INSERT_HEAD(&sb->nl_queue, nb, tailq);
133 			break;
134 		}
135 	}
136 	SOCK_SENDBUF_UNLOCK(so);
137 
138 	return (reschedule);
139 }
140 
141 static void
nl_process_received(struct nlpcb * nlp)142 nl_process_received(struct nlpcb *nlp)
143 {
144 	NL_LOG(LOG_DEBUG3, "taskqueue called");
145 
146 	if (__predict_false(nlp->nl_need_thread_setup)) {
147 		nl_set_thread_nlp(curthread, nlp);
148 		NLP_LOCK(nlp);
149 		nlp->nl_need_thread_setup = false;
150 		NLP_UNLOCK(nlp);
151 	}
152 
153 	while (nl_process_received_one(nlp))
154 		;
155 }
156 
157 /*
158  * Called after some data have been read from the socket.
159  */
160 void
nl_on_transmit(struct nlpcb * nlp)161 nl_on_transmit(struct nlpcb *nlp)
162 {
163 	NLP_LOCK(nlp);
164 
165 	struct socket *so = nlp->nl_socket;
166 	if (__predict_false(nlp->nl_dropped_bytes > 0 && so != NULL)) {
167 		unsigned long dropped_bytes = nlp->nl_dropped_bytes;
168 		unsigned long dropped_messages = nlp->nl_dropped_messages;
169 		nlp->nl_dropped_bytes = 0;
170 		nlp->nl_dropped_messages = 0;
171 
172 		struct sockbuf *sb = &so->so_rcv;
173 		NLP_LOG(LOG_DEBUG, nlp,
174 		    "socket RX overflowed, %lu messages (%lu bytes) dropped. "
175 		    "bytes: [%u/%u]", dropped_messages, dropped_bytes,
176 		    sb->sb_ccc, sb->sb_hiwat);
177 		/* TODO: send netlink message */
178 	}
179 
180 	nl_schedule_taskqueue(nlp);
181 	NLP_UNLOCK(nlp);
182 }
183 
184 void
nl_taskqueue_handler(void * _arg,int pending)185 nl_taskqueue_handler(void *_arg, int pending)
186 {
187 	struct nlpcb *nlp = (struct nlpcb *)_arg;
188 
189 	CURVNET_SET(nlp->nl_socket->so_vnet);
190 	nl_process_received(nlp);
191 	CURVNET_RESTORE();
192 }
193 
194 /*
195  * Tries to send current data buffer from writer.
196  *
197  * Returns true on success.
198  * If no queue overrunes happened, wakes up socket owner.
199  */
200 bool
nl_send(struct nl_writer * nw,struct nlpcb * nlp)201 nl_send(struct nl_writer *nw, struct nlpcb *nlp)
202 {
203 	struct socket *so = nlp->nl_socket;
204 	struct sockbuf *sb = &so->so_rcv;
205 	struct nl_buf *nb;
206 
207 	MPASS(nw->hdr == NULL);
208 	MPASS(nw->buf != NULL);
209 	MPASS(nw->buf->datalen > 0);
210 
211 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
212 		struct nlmsghdr *hdr = (struct nlmsghdr *)nw->buf->data;
213 		NLP_LOG(LOG_DEBUG2, nlp,
214 		    "TX len %u msgs %u msg type %d first hdrlen %u",
215 		    nw->buf->datalen, nw->num_messages, hdr->nlmsg_type,
216 		    hdr->nlmsg_len);
217 	}
218 
219 	if (nlp->nl_linux && linux_netlink_p != NULL) {
220 		nb = linux_netlink_p->msgs_to_linux(nw->buf, nlp);
221 		nl_buf_free(nw->buf);
222 		nw->buf = NULL;
223 		if (nb == NULL)
224 			return (false);
225 	} else {
226 		nb = nw->buf;
227 		nw->buf = NULL;
228 	}
229 
230 	SOCK_RECVBUF_LOCK(so);
231 	if (!nw->ignore_limit && __predict_false(sb->sb_hiwat <= sb->sb_ccc)) {
232 		SOCK_RECVBUF_UNLOCK(so);
233 		NLP_LOCK(nlp);
234 		nlp->nl_dropped_bytes += nb->datalen;
235 		nlp->nl_dropped_messages += nw->num_messages;
236 		NLP_LOG(LOG_DEBUG2, nlp, "RX oveflow: %lu m (+%d), %lu b (+%d)",
237 		    (unsigned long)nlp->nl_dropped_messages, nw->num_messages,
238 		    (unsigned long)nlp->nl_dropped_bytes, nb->datalen);
239 		NLP_UNLOCK(nlp);
240 		nl_buf_free(nb);
241 		return (false);
242 	} else {
243 		bool full;
244 
245 		TAILQ_INSERT_TAIL(&sb->nl_queue, nb, tailq);
246 		sb->sb_acc += nb->datalen;
247 		sb->sb_ccc += nb->datalen;
248 		full = sb->sb_hiwat <= sb->sb_ccc;
249 		sorwakeup_locked(so);
250 		if (full) {
251 			NLP_LOCK(nlp);
252 			nlp->nl_tx_blocked = true;
253 			NLP_UNLOCK(nlp);
254 		}
255 		return (true);
256 	}
257 }
258 
259 static int
nl_receive_message(struct nlmsghdr * hdr,int remaining_length,struct nlpcb * nlp,struct nl_pstate * npt)260 nl_receive_message(struct nlmsghdr *hdr, int remaining_length,
261     struct nlpcb *nlp, struct nl_pstate *npt)
262 {
263 	nl_handler_f handler = nl_handlers[nlp->nl_proto].cb;
264 	int error = 0;
265 
266 	NLP_LOG(LOG_DEBUG2, nlp, "msg len: %u type: %d: flags: 0x%X seq: %u pid: %u",
267 	    hdr->nlmsg_len, hdr->nlmsg_type, hdr->nlmsg_flags, hdr->nlmsg_seq,
268 	    hdr->nlmsg_pid);
269 
270 	if (__predict_false(hdr->nlmsg_len > remaining_length)) {
271 		NLP_LOG(LOG_DEBUG, nlp, "message is not entirely present: want %d got %d",
272 		    hdr->nlmsg_len, remaining_length);
273 		return (EINVAL);
274 	} else if (__predict_false(hdr->nlmsg_len < sizeof(*hdr))) {
275 		NL_LOG(LOG_DEBUG, "message too short: %d", hdr->nlmsg_len);
276 		return (EINVAL);
277 	}
278 	/* Stamp each message with sender pid */
279 	hdr->nlmsg_pid = nlp->nl_port;
280 
281 	npt->hdr = hdr;
282 
283 	if (hdr->nlmsg_flags & NLM_F_REQUEST &&
284 	    hdr->nlmsg_type >= NLMSG_MIN_TYPE) {
285 		NL_LOG(LOG_DEBUG2, "handling message with msg type: %d",
286 		   hdr->nlmsg_type);
287 		if (nlp->nl_linux) {
288 			MPASS(linux_netlink_p != NULL);
289 			error = linux_netlink_p->msg_from_linux(nlp->nl_proto,
290 			    &hdr, npt);
291 			if (error)
292 				goto ack;
293 		}
294 		error = handler(hdr, npt);
295 		NL_LOG(LOG_DEBUG2, "retcode: %d", error);
296 	}
297 ack:
298 	if ((hdr->nlmsg_flags & NLM_F_ACK) || (error != 0 && error != EINTR)) {
299 		if (!npt->nw->suppress_ack) {
300 			NL_LOG(LOG_DEBUG3, "ack");
301 			nlmsg_ack(nlp, error, hdr, npt);
302 		}
303 	}
304 
305 	return (0);
306 }
307 
308 static void
npt_clear(struct nl_pstate * npt)309 npt_clear(struct nl_pstate *npt)
310 {
311 	lb_clear(&npt->lb);
312 	npt->cookie = NULL;
313 	npt->error = 0;
314 	npt->err_msg = NULL;
315 	npt->err_off = 0;
316 	npt->hdr = NULL;
317 	npt->nw->suppress_ack = false;
318 }
319 
320 /*
321  * Processes an incoming packet, which can contain multiple netlink messages
322  */
323 static bool
nl_process_nbuf(struct nl_buf * nb,struct nlpcb * nlp)324 nl_process_nbuf(struct nl_buf *nb, struct nlpcb *nlp)
325 {
326 	struct nl_writer nw;
327 	struct nlmsghdr *hdr;
328 	int error;
329 
330 	NL_LOG(LOG_DEBUG3, "RX netlink buf %p on %p", nb, nlp->nl_socket);
331 
332 	if (!nl_writer_unicast(&nw, NLMSG_SMALL, nlp, false)) {
333 		NL_LOG(LOG_DEBUG, "error allocating socket writer");
334 		return (true);
335 	}
336 
337 	nlmsg_ignore_limit(&nw);
338 
339 	struct nl_pstate npt = {
340 		.nlp = nlp,
341 		.lb.base = &nb->data[roundup2(nb->datalen, 8)],
342 		.lb.size = nb->buflen - roundup2(nb->datalen, 8),
343 		.nw = &nw,
344 		.strict = nlp->nl_flags & NLF_STRICT,
345 	};
346 
347 	for (; nb->offset + sizeof(struct nlmsghdr) <= nb->datalen;) {
348 		hdr = (struct nlmsghdr *)&nb->data[nb->offset];
349 		/* Save length prior to calling handler */
350 		int msglen = NLMSG_ALIGN(hdr->nlmsg_len);
351 		NL_LOG(LOG_DEBUG3, "parsing offset %d/%d",
352 		    nb->offset, nb->datalen);
353 		npt_clear(&npt);
354 		error = nl_receive_message(hdr, nb->datalen - nb->offset, nlp,
355 		    &npt);
356 		nb->offset += msglen;
357 		if (__predict_false(error != 0 || nlp->nl_tx_blocked))
358 			break;
359 	}
360 	NL_LOG(LOG_DEBUG3, "packet parsing done");
361 	nlmsg_flush(&nw);
362 
363 	if (nlp->nl_tx_blocked) {
364 		NLP_LOCK(nlp);
365 		nlp->nl_tx_blocked = false;
366 		NLP_UNLOCK(nlp);
367 		return (false);
368 	} else
369 		return (true);
370 }
371