xref: /freebsd/sys/netlink/netlink_io.c (revision 17083b94a91563aba15ba03d1c74796a35bb1c26)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2021 Ng Peng Nam Sean
5  * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/param.h>
30 #include <sys/ck.h>
31 #include <sys/lock.h>
32 #include <sys/malloc.h>
33 #include <sys/mbuf.h>
34 #include <sys/mutex.h>
35 #include <sys/socket.h>
36 #include <sys/socketvar.h>
37 #include <sys/syslog.h>
38 
39 #include <netlink/netlink.h>
40 #include <netlink/netlink_ctl.h>
41 #include <netlink/netlink_linux.h>
42 #include <netlink/netlink_var.h>
43 
/*
 * Per-file debug configuration, defined before including
 * <netlink/netlink_debug.h>.
 * NOTE(review): presumably DEBUG_MOD_NAME selects this module's log-level
 * knob, DEBUG_MAX_LEVEL the most verbose level compiled in, and LOG_INFO
 * the default level -- confirm against netlink_debug.h.
 */
#define	DEBUG_MOD_NAME	nl_io
#define	DEBUG_MAX_LEVEL	LOG_DEBUG3
#include <netlink/netlink_debug.h>
_DECLARE_DEBUG(LOG_INFO);

/*
 * The logic below provides a p2p interface for receiving and
 * sending netlink data between the kernel and userland.
 */

/* Defined further below; needed earlier by nl_process_received_one(). */
static bool nl_process_nbuf(struct nl_buf *nb, struct nlpcb *nlp);
55 
56 struct nl_buf *
57 nl_buf_alloc(size_t len, int mflag)
58 {
59 	struct nl_buf *nb;
60 
61 	nb = malloc(sizeof(struct nl_buf) + len, M_NETLINK, mflag);
62 	if (__predict_true(nb != NULL)) {
63 		nb->buflen = len;
64 		nb->datalen = nb->offset = 0;
65 		nb->control = NULL;
66 	}
67 
68 	return (nb);
69 }
70 
71 void
72 nl_buf_free(struct nl_buf *nb)
73 {
74 
75 	if (nb->control)
76 		m_freem(nb->control);
77 	free(nb, M_NETLINK);
78 }
79 
80 void
81 nl_add_msg_info(struct nl_buf *nb)
82 {
83 	/* XXXGL pass nlp as arg? */
84 	struct nlpcb *nlp = nl_get_thread_nlp(curthread);
85 	NL_LOG(LOG_DEBUG2, "Trying to recover nlp from thread %p: %p",
86 	    curthread, nlp);
87 
88 	if (nlp == NULL)
89 		return;
90 
91 	/* Prepare what we want to encode - PID, socket PID & msg seq */
92 	struct {
93 		struct nlattr nla;
94 		uint32_t val;
95 	} data[] = {
96 		{
97 			.nla.nla_len = sizeof(struct nlattr) + sizeof(uint32_t),
98 			.nla.nla_type = NLMSGINFO_ATTR_PROCESS_ID,
99 			.val = nlp->nl_process_id,
100 		},
101 		{
102 			.nla.nla_len = sizeof(struct nlattr) + sizeof(uint32_t),
103 			.nla.nla_type = NLMSGINFO_ATTR_PORT_ID,
104 			.val = nlp->nl_port,
105 		},
106 	};
107 
108 
109 	nb->control = sbcreatecontrol(data, sizeof(data),
110 	    NETLINK_MSG_INFO, SOL_NETLINK, M_NOWAIT);
111 
112 	if (__predict_true(nb->control != NULL))
113 		NL_LOG(LOG_DEBUG2, "Storing %u bytes of control data, ctl: %p",
114 		    (unsigned)sizeof(data), nb->control);
115 	else
116 		NL_LOG(LOG_DEBUG2, "Failed to allocate %u bytes of control",
117 		    (unsigned)sizeof(data));
118 }
119 
120 void
121 nl_schedule_taskqueue(struct nlpcb *nlp)
122 {
123 	if (!nlp->nl_task_pending) {
124 		nlp->nl_task_pending = true;
125 		taskqueue_enqueue(nlp->nl_taskqueue, &nlp->nl_task);
126 		NL_LOG(LOG_DEBUG3, "taskqueue scheduled");
127 	} else {
128 		NL_LOG(LOG_DEBUG3, "taskqueue schedule skipped");
129 	}
130 }
131 
132 static bool
133 nl_process_received_one(struct nlpcb *nlp)
134 {
135 	struct socket *so = nlp->nl_socket;
136 	struct sockbuf *sb;
137 	struct nl_buf *nb;
138 	bool reschedule = false;
139 
140 	NLP_LOCK(nlp);
141 	nlp->nl_task_pending = false;
142 	NLP_UNLOCK(nlp);
143 
144 	/*
145 	 * Do not process queued up requests if there is no space to queue
146 	 * replies.
147 	 */
148 	sb = &so->so_rcv;
149 	SOCK_RECVBUF_LOCK(so);
150 	if (sb->sb_hiwat <= sb->sb_ccc) {
151 		SOCK_RECVBUF_UNLOCK(so);
152 		return (false);
153 	}
154 	SOCK_RECVBUF_UNLOCK(so);
155 
156 	sb = &so->so_snd;
157 	SOCK_SENDBUF_LOCK(so);
158 	while ((nb = TAILQ_FIRST(&sb->nl_queue)) != NULL) {
159 		TAILQ_REMOVE(&sb->nl_queue, nb, tailq);
160 		SOCK_SENDBUF_UNLOCK(so);
161 		reschedule = nl_process_nbuf(nb, nlp);
162 		SOCK_SENDBUF_LOCK(so);
163 		if (reschedule) {
164 			sb->sb_acc -= nb->datalen;
165 			sb->sb_ccc -= nb->datalen;
166 			/* XXXGL: potentially can reduce lock&unlock count. */
167 			sowwakeup_locked(so);
168 			nl_buf_free(nb);
169 			SOCK_SENDBUF_LOCK(so);
170 		} else {
171 			TAILQ_INSERT_HEAD(&sb->nl_queue, nb, tailq);
172 			break;
173 		}
174 	}
175 	SOCK_SENDBUF_UNLOCK(so);
176 
177 	return (reschedule);
178 }
179 
180 static void
181 nl_process_received(struct nlpcb *nlp)
182 {
183 	NL_LOG(LOG_DEBUG3, "taskqueue called");
184 
185 	if (__predict_false(nlp->nl_need_thread_setup)) {
186 		nl_set_thread_nlp(curthread, nlp);
187 		NLP_LOCK(nlp);
188 		nlp->nl_need_thread_setup = false;
189 		NLP_UNLOCK(nlp);
190 	}
191 
192 	while (nl_process_received_one(nlp))
193 		;
194 }
195 
196 /*
197  * Called after some data have been read from the socket.
198  */
199 void
200 nl_on_transmit(struct nlpcb *nlp)
201 {
202 	NLP_LOCK(nlp);
203 
204 	struct socket *so = nlp->nl_socket;
205 	if (__predict_false(nlp->nl_dropped_bytes > 0 && so != NULL)) {
206 		unsigned long dropped_bytes = nlp->nl_dropped_bytes;
207 		unsigned long dropped_messages = nlp->nl_dropped_messages;
208 		nlp->nl_dropped_bytes = 0;
209 		nlp->nl_dropped_messages = 0;
210 
211 		struct sockbuf *sb = &so->so_rcv;
212 		NLP_LOG(LOG_DEBUG, nlp,
213 		    "socket RX overflowed, %lu messages (%lu bytes) dropped. "
214 		    "bytes: [%u/%u]", dropped_messages, dropped_bytes,
215 		    sb->sb_ccc, sb->sb_hiwat);
216 		/* TODO: send netlink message */
217 	}
218 
219 	nl_schedule_taskqueue(nlp);
220 	NLP_UNLOCK(nlp);
221 }
222 
223 void
224 nl_taskqueue_handler(void *_arg, int pending)
225 {
226 	struct nlpcb *nlp = (struct nlpcb *)_arg;
227 
228 	CURVNET_SET(nlp->nl_socket->so_vnet);
229 	nl_process_received(nlp);
230 	CURVNET_RESTORE();
231 }
232 
233 /*
234  * Tries to send current data buffer from writer.
235  *
236  * Returns true on success.
237  * If no queue overrunes happened, wakes up socket owner.
238  */
239 bool
240 nl_send_one(struct nl_writer *nw)
241 {
242 	struct nlpcb *nlp = nw->nlp;
243 	struct socket *so = nlp->nl_socket;
244 	struct sockbuf *sb = &so->so_rcv;
245 	struct nl_buf *nb;
246 
247 	MPASS(nw->hdr == NULL);
248 
249 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
250 		struct nlmsghdr *hdr = (struct nlmsghdr *)nw->buf->data;
251 		NLP_LOG(LOG_DEBUG2, nlp,
252 		    "TX len %u msgs %u msg type %d first hdrlen %u",
253 		    nw->buf->datalen, nw->num_messages, hdr->nlmsg_type,
254 		    hdr->nlmsg_len);
255 	}
256 
257 	if (nlp->nl_linux && linux_netlink_p != NULL &&
258 	    __predict_false(!linux_netlink_p->msgs_to_linux(nw, nlp))) {
259 		nl_buf_free(nw->buf);
260 		nw->buf = NULL;
261 		return (false);
262 	}
263 
264 	nb = nw->buf;
265 	nw->buf = NULL;
266 
267 	SOCK_RECVBUF_LOCK(so);
268 	if (!nw->ignore_limit && __predict_false(sb->sb_hiwat <= sb->sb_ccc)) {
269 		SOCK_RECVBUF_UNLOCK(so);
270 		NLP_LOCK(nlp);
271 		nlp->nl_dropped_bytes += nb->datalen;
272 		nlp->nl_dropped_messages += nw->num_messages;
273 		NLP_LOG(LOG_DEBUG2, nlp, "RX oveflow: %lu m (+%d), %lu b (+%d)",
274 		    (unsigned long)nlp->nl_dropped_messages, nw->num_messages,
275 		    (unsigned long)nlp->nl_dropped_bytes, nb->datalen);
276 		NLP_UNLOCK(nlp);
277 		nl_buf_free(nb);
278 		return (false);
279 	} else {
280 		bool full;
281 
282 		TAILQ_INSERT_TAIL(&sb->nl_queue, nb, tailq);
283 		sb->sb_acc += nb->datalen;
284 		sb->sb_ccc += nb->datalen;
285 		full = sb->sb_hiwat <= sb->sb_ccc;
286 		sorwakeup_locked(so);
287 		if (full) {
288 			NLP_LOCK(nlp);
289 			nlp->nl_tx_blocked = true;
290 			NLP_UNLOCK(nlp);
291 		}
292 		return (true);
293 	}
294 }
295 
296 static int
297 nl_receive_message(struct nlmsghdr *hdr, int remaining_length,
298     struct nlpcb *nlp, struct nl_pstate *npt)
299 {
300 	nl_handler_f handler = nl_handlers[nlp->nl_proto].cb;
301 	int error = 0;
302 
303 	NLP_LOG(LOG_DEBUG2, nlp, "msg len: %u type: %d: flags: 0x%X seq: %u pid: %u",
304 	    hdr->nlmsg_len, hdr->nlmsg_type, hdr->nlmsg_flags, hdr->nlmsg_seq,
305 	    hdr->nlmsg_pid);
306 
307 	if (__predict_false(hdr->nlmsg_len > remaining_length)) {
308 		NLP_LOG(LOG_DEBUG, nlp, "message is not entirely present: want %d got %d",
309 		    hdr->nlmsg_len, remaining_length);
310 		return (EINVAL);
311 	} else if (__predict_false(hdr->nlmsg_len < sizeof(*hdr))) {
312 		NL_LOG(LOG_DEBUG, "message too short: %d", hdr->nlmsg_len);
313 		return (EINVAL);
314 	}
315 	/* Stamp each message with sender pid */
316 	hdr->nlmsg_pid = nlp->nl_port;
317 
318 	npt->hdr = hdr;
319 
320 	if (hdr->nlmsg_flags & NLM_F_REQUEST && hdr->nlmsg_type >= NLMSG_MIN_TYPE) {
321 		NL_LOG(LOG_DEBUG2, "handling message with msg type: %d",
322 		   hdr->nlmsg_type);
323 
324 		if (nlp->nl_linux && linux_netlink_p != NULL) {
325 			struct nlmsghdr *hdr_orig = hdr;
326 			hdr = linux_netlink_p->msg_from_linux(nlp->nl_proto, hdr, npt);
327 			if (hdr == NULL) {
328 				 /* Failed to translate to kernel format. Report an error back */
329 				hdr = hdr_orig;
330 				npt->hdr = hdr;
331 				if (hdr->nlmsg_flags & NLM_F_ACK)
332 					nlmsg_ack(nlp, EOPNOTSUPP, hdr, npt);
333 				return (0);
334 			}
335 		}
336 		error = handler(hdr, npt);
337 		NL_LOG(LOG_DEBUG2, "retcode: %d", error);
338 	}
339 	if ((hdr->nlmsg_flags & NLM_F_ACK) || (error != 0 && error != EINTR)) {
340 		if (!npt->nw->suppress_ack) {
341 			NL_LOG(LOG_DEBUG3, "ack");
342 			nlmsg_ack(nlp, error, hdr, npt);
343 		}
344 	}
345 
346 	return (0);
347 }
348 
349 static void
350 npt_clear(struct nl_pstate *npt)
351 {
352 	lb_clear(&npt->lb);
353 	npt->error = 0;
354 	npt->err_msg = NULL;
355 	npt->err_off = 0;
356 	npt->hdr = NULL;
357 	npt->nw->suppress_ack = false;
358 }
359 
360 /*
361  * Processes an incoming packet, which can contain multiple netlink messages
362  */
363 static bool
364 nl_process_nbuf(struct nl_buf *nb, struct nlpcb *nlp)
365 {
366 	struct nlmsghdr *hdr;
367 	int error;
368 
369 	NL_LOG(LOG_DEBUG3, "RX netlink buf %p on %p", nb, nlp->nl_socket);
370 
371 	struct nl_writer nw = {};
372 	if (!nlmsg_get_unicast_writer(&nw, NLMSG_SMALL, nlp)) {
373 		NL_LOG(LOG_DEBUG, "error allocating socket writer");
374 		return (true);
375 	}
376 
377 	nlmsg_ignore_limit(&nw);
378 
379 	struct nl_pstate npt = {
380 		.nlp = nlp,
381 		.lb.base = &nb->data[roundup2(nb->datalen, 8)],
382 		.lb.size = nb->buflen - roundup2(nb->datalen, 8),
383 		.nw = &nw,
384 		.strict = nlp->nl_flags & NLF_STRICT,
385 	};
386 
387 	for (; nb->offset + sizeof(struct nlmsghdr) <= nb->datalen;) {
388 		hdr = (struct nlmsghdr *)&nb->data[nb->offset];
389 		/* Save length prior to calling handler */
390 		int msglen = NLMSG_ALIGN(hdr->nlmsg_len);
391 		NL_LOG(LOG_DEBUG3, "parsing offset %d/%d",
392 		    nb->offset, nb->datalen);
393 		npt_clear(&npt);
394 		error = nl_receive_message(hdr, nb->datalen - nb->offset, nlp,
395 		    &npt);
396 		nb->offset += msglen;
397 		if (__predict_false(error != 0 || nlp->nl_tx_blocked))
398 			break;
399 	}
400 	NL_LOG(LOG_DEBUG3, "packet parsing done");
401 	nlmsg_flush(&nw);
402 
403 	if (nlp->nl_tx_blocked) {
404 		NLP_LOCK(nlp);
405 		nlp->nl_tx_blocked = false;
406 		NLP_UNLOCK(nlp);
407 		return (false);
408 	} else
409 		return (true);
410 }
411