1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2021 Ng Peng Nam Sean
5 * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29 #include <sys/param.h>
30 #include <sys/ck.h>
31 #include <sys/lock.h>
32 #include <sys/malloc.h>
33 #include <sys/mbuf.h>
34 #include <sys/mutex.h>
35 #include <sys/socket.h>
36 #include <sys/socketvar.h>
37 #include <sys/syslog.h>
38
39 #include <netlink/netlink.h>
40 #include <netlink/netlink_ctl.h>
41 #include <netlink/netlink_linux.h>
42 #include <netlink/netlink_var.h>
43
/*
 * Debug logging setup for this file.  DEBUG_MOD_NAME and DEBUG_MAX_LEVEL
 * are defined ahead of the <netlink/netlink_debug.h> include.
 * NOTE(review): presumably the header consumes them to name this module's
 * log source and to cap the most verbose level compiled in -- confirm in
 * netlink_debug.h.
 */
#define DEBUG_MOD_NAME nl_io
#define DEBUG_MAX_LEVEL LOG_DEBUG3
#include <netlink/netlink_debug.h>
/* NOTE(review): LOG_INFO looks like the default runtime level -- confirm. */
_DECLARE_DEBUG(LOG_INFO);
48
/*
 * The logic below provides a point-to-point interface for receiving and
 * sending netlink data between the kernel and userland.
 */
53
/*
 * Forward declaration: nl_process_nbuf() is defined further down in this
 * file but is called earlier, from nl_process_received_one().
 */
static bool nl_process_nbuf(struct nl_buf *nb, struct nlpcb *nlp);
55
56 struct nl_buf *
nl_buf_alloc(size_t len,int mflag)57 nl_buf_alloc(size_t len, int mflag)
58 {
59 struct nl_buf *nb;
60
61 KASSERT(len > 0 && len <= UINT_MAX, ("%s: invalid length %zu",
62 __func__, len));
63
64 nb = malloc(sizeof(struct nl_buf) + len, M_NETLINK, mflag);
65 if (__predict_true(nb != NULL)) {
66 nb->buflen = len;
67 nb->datalen = nb->offset = 0;
68 }
69
70 return (nb);
71 }
72
73 void
nl_buf_free(struct nl_buf * nb)74 nl_buf_free(struct nl_buf *nb)
75 {
76
77 free(nb, M_NETLINK);
78 }
79
80 void
nl_schedule_taskqueue(struct nlpcb * nlp)81 nl_schedule_taskqueue(struct nlpcb *nlp)
82 {
83 if (!nlp->nl_task_pending) {
84 nlp->nl_task_pending = true;
85 taskqueue_enqueue(nlp->nl_taskqueue, &nlp->nl_task);
86 NL_LOG(LOG_DEBUG3, "taskqueue scheduled");
87 } else {
88 NL_LOG(LOG_DEBUG3, "taskqueue schedule skipped");
89 }
90 }
91
92 static bool
nl_process_received_one(struct nlpcb * nlp)93 nl_process_received_one(struct nlpcb *nlp)
94 {
95 struct socket *so = nlp->nl_socket;
96 struct sockbuf *sb;
97 struct nl_buf *nb;
98 bool reschedule = false;
99
100 NLP_LOCK(nlp);
101 nlp->nl_task_pending = false;
102 NLP_UNLOCK(nlp);
103
104 /*
105 * Do not process queued up requests if there is no space to queue
106 * replies.
107 */
108 sb = &so->so_rcv;
109 SOCK_RECVBUF_LOCK(so);
110 if (sb->sb_hiwat <= sb->sb_ccc) {
111 SOCK_RECVBUF_UNLOCK(so);
112 NL_LOG(LOG_DEBUG3, "socket %p stuck", so);
113 return (false);
114 }
115 SOCK_RECVBUF_UNLOCK(so);
116
117 sb = &so->so_snd;
118 SOCK_SENDBUF_LOCK(so);
119 while ((nb = TAILQ_FIRST(&sb->nl_queue)) != NULL) {
120 TAILQ_REMOVE(&sb->nl_queue, nb, tailq);
121 SOCK_SENDBUF_UNLOCK(so);
122 reschedule = nl_process_nbuf(nb, nlp);
123 SOCK_SENDBUF_LOCK(so);
124 if (reschedule) {
125 sb->sb_acc -= nb->datalen;
126 sb->sb_ccc -= nb->datalen;
127 /* XXXGL: potentially can reduce lock&unlock count. */
128 sowwakeup_locked(so);
129 nl_buf_free(nb);
130 SOCK_SENDBUF_LOCK(so);
131 } else {
132 TAILQ_INSERT_HEAD(&sb->nl_queue, nb, tailq);
133 break;
134 }
135 }
136 SOCK_SENDBUF_UNLOCK(so);
137
138 return (reschedule);
139 }
140
141 static void
nl_process_received(struct nlpcb * nlp)142 nl_process_received(struct nlpcb *nlp)
143 {
144 NL_LOG(LOG_DEBUG3, "taskqueue called");
145
146 if (__predict_false(nlp->nl_need_thread_setup)) {
147 nl_set_thread_nlp(curthread, nlp);
148 NLP_LOCK(nlp);
149 nlp->nl_need_thread_setup = false;
150 NLP_UNLOCK(nlp);
151 }
152
153 while (nl_process_received_one(nlp))
154 ;
155 }
156
157 /*
158 * Called after some data have been read from the socket.
159 */
160 void
nl_on_transmit(struct nlpcb * nlp)161 nl_on_transmit(struct nlpcb *nlp)
162 {
163 NLP_LOCK(nlp);
164
165 struct socket *so = nlp->nl_socket;
166 if (__predict_false(nlp->nl_dropped_bytes > 0 && so != NULL)) {
167 unsigned long dropped_bytes = nlp->nl_dropped_bytes;
168 unsigned long dropped_messages = nlp->nl_dropped_messages;
169 nlp->nl_dropped_bytes = 0;
170 nlp->nl_dropped_messages = 0;
171
172 struct sockbuf *sb = &so->so_rcv;
173 NLP_LOG(LOG_DEBUG, nlp,
174 "socket RX overflowed, %lu messages (%lu bytes) dropped. "
175 "bytes: [%u/%u]", dropped_messages, dropped_bytes,
176 sb->sb_ccc, sb->sb_hiwat);
177 /* TODO: send netlink message */
178 }
179
180 nl_schedule_taskqueue(nlp);
181 NLP_UNLOCK(nlp);
182 }
183
184 void
nl_taskqueue_handler(void * _arg,int pending)185 nl_taskqueue_handler(void *_arg, int pending)
186 {
187 struct nlpcb *nlp = (struct nlpcb *)_arg;
188
189 CURVNET_SET(nlp->nl_socket->so_vnet);
190 nl_process_received(nlp);
191 CURVNET_RESTORE();
192 }
193
194 /*
195 * Tries to send current data buffer from writer.
196 *
197 * Returns true on success.
198 * If no queue overrunes happened, wakes up socket owner.
199 */
200 bool
nl_send(struct nl_writer * nw,struct nlpcb * nlp)201 nl_send(struct nl_writer *nw, struct nlpcb *nlp)
202 {
203 struct socket *so = nlp->nl_socket;
204 struct sockbuf *sb = &so->so_rcv;
205 struct nl_buf *nb;
206
207 MPASS(nw->hdr == NULL);
208 MPASS(nw->buf != NULL);
209 MPASS(nw->buf->datalen > 0);
210
211 IF_DEBUG_LEVEL(LOG_DEBUG2) {
212 struct nlmsghdr *hdr = (struct nlmsghdr *)nw->buf->data;
213 NLP_LOG(LOG_DEBUG2, nlp,
214 "TX len %u msgs %u msg type %d first hdrlen %u",
215 nw->buf->datalen, nw->num_messages, hdr->nlmsg_type,
216 hdr->nlmsg_len);
217 }
218
219 if (nlp->nl_linux && linux_netlink_p != NULL &&
220 __predict_false(!linux_netlink_p->msgs_to_linux(nw, nlp))) {
221 nl_buf_free(nw->buf);
222 nw->buf = NULL;
223 return (false);
224 }
225
226 nb = nw->buf;
227 nw->buf = NULL;
228
229 SOCK_RECVBUF_LOCK(so);
230 if (!nw->ignore_limit && __predict_false(sb->sb_hiwat <= sb->sb_ccc)) {
231 SOCK_RECVBUF_UNLOCK(so);
232 NLP_LOCK(nlp);
233 nlp->nl_dropped_bytes += nb->datalen;
234 nlp->nl_dropped_messages += nw->num_messages;
235 NLP_LOG(LOG_DEBUG2, nlp, "RX oveflow: %lu m (+%d), %lu b (+%d)",
236 (unsigned long)nlp->nl_dropped_messages, nw->num_messages,
237 (unsigned long)nlp->nl_dropped_bytes, nb->datalen);
238 NLP_UNLOCK(nlp);
239 nl_buf_free(nb);
240 return (false);
241 } else {
242 bool full;
243
244 TAILQ_INSERT_TAIL(&sb->nl_queue, nb, tailq);
245 sb->sb_acc += nb->datalen;
246 sb->sb_ccc += nb->datalen;
247 full = sb->sb_hiwat <= sb->sb_ccc;
248 sorwakeup_locked(so);
249 if (full) {
250 NLP_LOCK(nlp);
251 nlp->nl_tx_blocked = true;
252 NLP_UNLOCK(nlp);
253 }
254 return (true);
255 }
256 }
257
258 static int
nl_receive_message(struct nlmsghdr * hdr,int remaining_length,struct nlpcb * nlp,struct nl_pstate * npt)259 nl_receive_message(struct nlmsghdr *hdr, int remaining_length,
260 struct nlpcb *nlp, struct nl_pstate *npt)
261 {
262 nl_handler_f handler = nl_handlers[nlp->nl_proto].cb;
263 int error = 0;
264
265 NLP_LOG(LOG_DEBUG2, nlp, "msg len: %u type: %d: flags: 0x%X seq: %u pid: %u",
266 hdr->nlmsg_len, hdr->nlmsg_type, hdr->nlmsg_flags, hdr->nlmsg_seq,
267 hdr->nlmsg_pid);
268
269 if (__predict_false(hdr->nlmsg_len > remaining_length)) {
270 NLP_LOG(LOG_DEBUG, nlp, "message is not entirely present: want %d got %d",
271 hdr->nlmsg_len, remaining_length);
272 return (EINVAL);
273 } else if (__predict_false(hdr->nlmsg_len < sizeof(*hdr))) {
274 NL_LOG(LOG_DEBUG, "message too short: %d", hdr->nlmsg_len);
275 return (EINVAL);
276 }
277 /* Stamp each message with sender pid */
278 hdr->nlmsg_pid = nlp->nl_port;
279
280 npt->hdr = hdr;
281
282 if (hdr->nlmsg_flags & NLM_F_REQUEST &&
283 hdr->nlmsg_type >= NLMSG_MIN_TYPE) {
284 NL_LOG(LOG_DEBUG2, "handling message with msg type: %d",
285 hdr->nlmsg_type);
286 if (nlp->nl_linux) {
287 MPASS(linux_netlink_p != NULL);
288 error = linux_netlink_p->msg_from_linux(nlp->nl_proto,
289 &hdr, npt);
290 if (error)
291 goto ack;
292 }
293 error = handler(hdr, npt);
294 NL_LOG(LOG_DEBUG2, "retcode: %d", error);
295 }
296 ack:
297 if ((hdr->nlmsg_flags & NLM_F_ACK) || (error != 0 && error != EINTR)) {
298 if (!npt->nw->suppress_ack) {
299 NL_LOG(LOG_DEBUG3, "ack");
300 nlmsg_ack(nlp, error, hdr, npt);
301 }
302 }
303
304 return (0);
305 }
306
307 static void
npt_clear(struct nl_pstate * npt)308 npt_clear(struct nl_pstate *npt)
309 {
310 lb_clear(&npt->lb);
311 npt->error = 0;
312 npt->err_msg = NULL;
313 npt->err_off = 0;
314 npt->hdr = NULL;
315 npt->nw->suppress_ack = false;
316 }
317
318 /*
319 * Processes an incoming packet, which can contain multiple netlink messages
320 */
321 static bool
nl_process_nbuf(struct nl_buf * nb,struct nlpcb * nlp)322 nl_process_nbuf(struct nl_buf *nb, struct nlpcb *nlp)
323 {
324 struct nl_writer nw;
325 struct nlmsghdr *hdr;
326 int error;
327
328 NL_LOG(LOG_DEBUG3, "RX netlink buf %p on %p", nb, nlp->nl_socket);
329
330 if (!nl_writer_unicast(&nw, NLMSG_SMALL, nlp, false)) {
331 NL_LOG(LOG_DEBUG, "error allocating socket writer");
332 return (true);
333 }
334
335 nlmsg_ignore_limit(&nw);
336
337 struct nl_pstate npt = {
338 .nlp = nlp,
339 .lb.base = &nb->data[roundup2(nb->datalen, 8)],
340 .lb.size = nb->buflen - roundup2(nb->datalen, 8),
341 .nw = &nw,
342 .strict = nlp->nl_flags & NLF_STRICT,
343 };
344
345 for (; nb->offset + sizeof(struct nlmsghdr) <= nb->datalen;) {
346 hdr = (struct nlmsghdr *)&nb->data[nb->offset];
347 /* Save length prior to calling handler */
348 int msglen = NLMSG_ALIGN(hdr->nlmsg_len);
349 NL_LOG(LOG_DEBUG3, "parsing offset %d/%d",
350 nb->offset, nb->datalen);
351 npt_clear(&npt);
352 error = nl_receive_message(hdr, nb->datalen - nb->offset, nlp,
353 &npt);
354 nb->offset += msglen;
355 if (__predict_false(error != 0 || nlp->nl_tx_blocked))
356 break;
357 }
358 NL_LOG(LOG_DEBUG3, "packet parsing done");
359 nlmsg_flush(&nw);
360
361 if (nlp->nl_tx_blocked) {
362 NLP_LOCK(nlp);
363 nlp->nl_tx_blocked = false;
364 NLP_UNLOCK(nlp);
365 return (false);
366 } else
367 return (true);
368 }
369