1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2021 Ng Peng Nam Sean 5 * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/ck.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syslog.h>

#include <netlink/netlink.h>
#include <netlink/netlink_ctl.h>
#include <netlink/netlink_linux.h>
#include <netlink/netlink_var.h>

#define	DEBUG_MOD_NAME	nl_io
#define	DEBUG_MAX_LEVEL	LOG_DEBUG3
#include <netlink/netlink_debug.h>
_DECLARE_DEBUG(LOG_DEBUG);

/*
 * The logic below provides a point-to-point interface for receiving and
 * sending netlink data between the kernel and userland.
 */

/* Source address stamped on kernel-originated messages (nl_pid 0 == kernel). */
static const struct sockaddr_nl _nl_empty_src = {
	.nl_len = sizeof(struct sockaddr_nl),
	.nl_family = PF_NETLINK,
	.nl_pid = 0 /* comes from the kernel */
};
static const struct sockaddr *nl_empty_src = (const struct sockaddr *)&_nl_empty_src;

static struct mbuf *nl_process_mbuf(struct mbuf *m, struct nlpcb *nlp);


/*
 * Appends the packet chain @mq (linked via m_nextpkt) to the tail of @q,
 * unlinking each packet and adding its byte length to q->length.
 */
static void
queue_push(struct nl_io_queue *q, struct mbuf *mq)
{
	while (mq != NULL) {
		struct mbuf *m = mq;
		mq = mq->m_nextpkt;
		m->m_nextpkt = NULL;

		q->length += m_length(m, NULL);
		STAILQ_INSERT_TAIL(&q->head, m, m_stailqpkt);
	}
}

/*
 * Prepends the single packet @m to @q, used to return a partially
 * processed packet to the queue for a later retry.
 */
static void
queue_push_head(struct nl_io_queue *q, struct mbuf *m)
{
	MPASS(m->m_nextpkt == NULL);

	q->length += m_length(m, NULL);
	STAILQ_INSERT_HEAD(&q->head, m, m_stailqpkt);
}

/*
 * Removes and returns the first packet of @q, subtracting its byte
 * length from q->length.  Returns NULL if the queue is empty.
 */
static struct mbuf *
queue_pop(struct nl_io_queue *q)
{
	if (!STAILQ_EMPTY(&q->head)) {
		struct mbuf *m = STAILQ_FIRST(&q->head);
		STAILQ_REMOVE_HEAD(&q->head, m_stailqpkt);
		m->m_nextpkt = NULL;
		q->length -= m_length(m, NULL);

		return (m);
	}
	return (NULL);
}

/* Returns the first packet of @q without removing it (NULL if empty). */
static struct mbuf *
queue_head(const struct nl_io_queue *q)
{
	return (STAILQ_FIRST(&q->head));
}

/* Returns true if @q holds no data (byte count maintained by push/pop). */
static inline bool
queue_empty(const struct nl_io_queue *q)
{
	return (q->length == 0);
}

/* Frees every packet queued on @q and resets its byte count. */
static void
queue_free(struct nl_io_queue *q)
{
	while (!STAILQ_EMPTY(&q->head)) {
		struct mbuf *m = STAILQ_FIRST(&q->head);
		STAILQ_REMOVE_HEAD(&q->head, m_stailqpkt);
		m->m_nextpkt = NULL;
		m_freem(m);
	}
	q->length = 0;
}


/*
 * Schedules the per-socket processing task unless a run is already
 * pending.  nl_task_pending is written here and cleared by
 * nl_process_received_one(); callers hold NLP_LOCK(nlp).
 */
static void
nl_schedule_taskqueue(struct nlpcb *nlp)
{
	if (!nlp->nl_task_pending) {
		nlp->nl_task_pending = true;
		taskqueue_enqueue(nlp->nl_taskqueue, &nlp->nl_task);
		NL_LOG(LOG_DEBUG3, "taskqueue scheduled");
	} else {
		NL_LOG(LOG_DEBUG3, "taskqueue schedule skipped");
	}
}

/*
 * Accepts data @m written by userland to socket @so: appends it to the
 * socket send buffer and kicks the taskqueue for asynchronous parsing.
 * Consumes @m in all cases.  Returns 0 on success or EINVAL if the
 * socket is not active.
 */
int
nl_receive_async(struct mbuf *m, struct socket *so)
{
	struct nlpcb *nlp = sotonlpcb(so);
	int error = 0;

	m->m_nextpkt = NULL;

	NLP_LOCK(nlp);

	if ((__predict_true(nlp->nl_active))) {
		sbappend(&so->so_snd, m, 0);
		NL_LOG(LOG_DEBUG3, "enqueue %u bytes", m_length(m, NULL));
		nl_schedule_taskqueue(nlp);
	} else {
		NL_LOG(LOG_DEBUG, "ignoring %u bytes on non-active socket",
		    m_length(m, NULL));
		m_free(m);
		error = EINVAL;
	}

	NLP_UNLOCK(nlp);

	return (error);
}

/*
 * Attempts to drain the internal TX (overflow) queue into the socket
 * receive buffer.  Returns true when the TX queue ends up empty, which
 * tells the caller it may proceed with RX processing.  Called with
 * NLP_LOCK held; takes/drops the sockbuf lock internally.
 */
static bool
tx_check_locked(struct nlpcb *nlp)
{
	if (queue_empty(&nlp->tx_queue))
		return (true);

	/*
	 * Check if something can be moved from the internal TX queue
	 * to the socket queue.
	 */

	bool appended = false;
	struct sockbuf *sb = &nlp->nl_socket->so_rcv;
	SOCKBUF_LOCK(sb);

	while (true) {
		struct mbuf *m = queue_head(&nlp->tx_queue);
		if (m && sbappendaddr_locked(sb, nl_empty_src, m, NULL) != 0) {
			/* appended successfully */
			queue_pop(&nlp->tx_queue);
			appended = true;
		} else
			break;
	}

	SOCKBUF_UNLOCK(sb);

	if (appended)
		sorwakeup(nlp->nl_socket);

	return (queue_empty(&nlp->tx_queue));
}

/*
 * One taskqueue pass: first retries the TX overflow queue, then moves
 * pending data from the socket send buffer onto rx_queue and parses it
 * message by message.  Returns true if another pass should follow.
 */
static bool
nl_process_received_one(struct nlpcb *nlp)
{
	bool reschedule = false;

	NLP_LOCK(nlp);
	nlp->nl_task_pending = false;

	if (!tx_check_locked(nlp)) {
		/* TX overflow queue still not empty, ignore RX */
		NLP_UNLOCK(nlp);
		return (false);
	}

	if (queue_empty(&nlp->rx_queue)) {
		/*
		 * Grab all data we have from the socket TX queue
		 * and store it the internal queue, so it can be worked on
		 * w/o holding socket lock.
		 */
		struct sockbuf *sb = &nlp->nl_socket->so_snd;

		SOCKBUF_LOCK(sb);
		unsigned int avail = sbavail(sb);
		if (avail > 0) {
			NL_LOG(LOG_DEBUG3, "grabbed %u bytes", avail);
			queue_push(&nlp->rx_queue, sbcut_locked(sb, avail));
		}
		SOCKBUF_UNLOCK(sb);
	} else {
		/* Schedule another pass to read from the socket queue */
		reschedule = true;
	}

	int prev_hiwat = nlp->tx_queue.hiwat;
	NLP_UNLOCK(nlp);

	/*
	 * NOTE(review): rx_queue is walked without NLP_LOCK below;
	 * presumably safe because only this taskqueue handler touches
	 * it after the grab above -- confirm against nlp locking rules.
	 */
	while (!queue_empty(&nlp->rx_queue)) {
		struct mbuf *m = queue_pop(&nlp->rx_queue);

		m = nl_process_mbuf(m, nlp);
		if (m != NULL) {
			/* Partially consumed: park the rest for retry. */
			queue_push_head(&nlp->rx_queue, m);
			reschedule = false;
			break;
		}
	}
	if (nlp->tx_queue.hiwat > prev_hiwat) {
		NLP_LOG(LOG_DEBUG, nlp, "TX override peaked to %d", nlp->tx_queue.hiwat);

	}

	return (reschedule);
}

/* Taskqueue worker body: loops until no more passes are requested. */
static void
nl_process_received(struct nlpcb *nlp)
{
	NL_LOG(LOG_DEBUG3, "taskqueue called");

	while (nl_process_received_one(nlp))
		;
}

/* Initializes the per-socket RX and TX packet queues. */
void
nl_init_io(struct nlpcb *nlp)
{
	STAILQ_INIT(&nlp->rx_queue.head);
	STAILQ_INIT(&nlp->tx_queue.head);
}

/* Releases all data queued on the per-socket RX and TX queues. */
void
nl_free_io(struct nlpcb *nlp)
{
	queue_free(&nlp->rx_queue);
	queue_free(&nlp->tx_queue);
}

/*
 * Called after some data have been read from the socket.
 * Logs (and resets) drop counters accumulated while the socket receive
 * buffer was full, then reschedules processing in case the freed space
 * lets queued TX data through.
 */
void
nl_on_transmit(struct nlpcb *nlp)
{
	NLP_LOCK(nlp);

	struct socket *so = nlp->nl_socket;
	if (__predict_false(nlp->nl_dropped_bytes > 0 && so != NULL)) {
		unsigned long dropped_bytes = nlp->nl_dropped_bytes;
		unsigned long dropped_messages = nlp->nl_dropped_messages;
		nlp->nl_dropped_bytes = 0;
		nlp->nl_dropped_messages = 0;

		struct sockbuf *sb = &so->so_rcv;
		NLP_LOG(LOG_DEBUG, nlp,
		    "socket RX overflowed, %lu messages (%lu bytes) dropped. "
		    "bytes: [%u/%u] mbufs: [%u/%u]", dropped_messages, dropped_bytes,
		    sb->sb_ccc, sb->sb_hiwat, sb->sb_mbcnt, sb->sb_mbmax);
		/* TODO: send netlink message */
	}

	nl_schedule_taskqueue(nlp);
	NLP_UNLOCK(nlp);
}

/* Taskqueue entry point; establishes the socket's vnet context. */
void
nl_taskqueue_handler(void *_arg, int pending)
{
	struct nlpcb *nlp = (struct nlpcb *)_arg;

	CURVNET_SET(nlp->nl_socket->so_vnet);
	nl_process_received(nlp);
	CURVNET_RESTORE();
}

/*
 * Queues @m on the TX overflow queue, marks the writer blocked and
 * updates the queue high-water mark.  Called with NLP_LOCK held.
 */
static __noinline void
queue_push_tx(struct nlpcb *nlp, struct mbuf *m)
{
	queue_push(&nlp->tx_queue, m);
	nlp->nl_tx_blocked = true;

	if (nlp->tx_queue.length > nlp->tx_queue.hiwat)
		nlp->tx_queue.hiwat = nlp->tx_queue.length;
}

/*
 * Tries to send @m to the socket @nlp.
 *
 * @m: mbuf(s) to send to. Consumed in any case.
 * @nlp: socket to send to
 * @num_messages: number of messages in @m
 * @io_flags: combination of NL_IOF_* flags
 *
 * Returns true on success.
 * If no queue overruns happened, wakes up socket owner.
332 */ 333 bool 334 nl_send_one(struct mbuf *m, struct nlpcb *nlp, int num_messages, int io_flags) 335 { 336 bool untranslated = io_flags & NL_IOF_UNTRANSLATED; 337 bool ignore_limits = io_flags & NL_IOF_IGNORE_LIMIT; 338 bool result = true; 339 340 IF_DEBUG_LEVEL(LOG_DEBUG2) { 341 struct nlmsghdr *hdr = mtod(m, struct nlmsghdr *); 342 NLP_LOG(LOG_DEBUG2, nlp, 343 "TX mbuf len %u msgs %u msg type %d first hdrlen %u io_flags %X", 344 m_length(m, NULL), num_messages, hdr->nlmsg_type, hdr->nlmsg_len, 345 io_flags); 346 } 347 348 if (__predict_false(nlp->nl_linux && linux_netlink_p != NULL && untranslated)) { 349 m = linux_netlink_p->mbufs_to_linux(nlp->nl_proto, m, nlp); 350 if (m == NULL) 351 return (false); 352 } 353 354 NLP_LOCK(nlp); 355 356 if (__predict_false(nlp->nl_socket == NULL)) { 357 NLP_UNLOCK(nlp); 358 m_freem(m); 359 return (false); 360 } 361 362 if (!queue_empty(&nlp->tx_queue)) { 363 if (ignore_limits) { 364 queue_push_tx(nlp, m); 365 } else { 366 m_free(m); 367 result = false; 368 } 369 NLP_UNLOCK(nlp); 370 return (result); 371 } 372 373 struct socket *so = nlp->nl_socket; 374 if (sbappendaddr(&so->so_rcv, nl_empty_src, m, NULL) != 0) { 375 sorwakeup(so); 376 NLP_LOG(LOG_DEBUG3, nlp, "appended data & woken up"); 377 } else { 378 if (ignore_limits) { 379 queue_push_tx(nlp, m); 380 } else { 381 /* 382 * Store dropped data so it can be reported 383 * on the next read 384 */ 385 nlp->nl_dropped_bytes += m_length(m, NULL); 386 nlp->nl_dropped_messages += num_messages; 387 NLP_LOG(LOG_DEBUG2, nlp, "RX oveflow: %lu m (+%d), %lu b (+%d)", 388 (unsigned long)nlp->nl_dropped_messages, num_messages, 389 (unsigned long)nlp->nl_dropped_bytes, m_length(m, NULL)); 390 soroverflow(so); 391 m_freem(m); 392 result = false; 393 } 394 } 395 NLP_UNLOCK(nlp); 396 397 return (result); 398 } 399 400 static int 401 nl_receive_message(struct nlmsghdr *hdr, int remaining_length, 402 struct nlpcb *nlp, struct nl_pstate *npt) 403 { 404 nl_handler_f handler = 
nl_handlers[nlp->nl_proto].cb; 405 int error = 0; 406 407 NL_LOG(LOG_DEBUG2, "msg len: %d type: %d", hdr->nlmsg_len, 408 hdr->nlmsg_type); 409 410 if (__predict_false(hdr->nlmsg_len > remaining_length)) { 411 NLP_LOG(LOG_DEBUG, nlp, "message is not entirely present: want %d got %d", 412 hdr->nlmsg_len, remaining_length); 413 return (EINVAL); 414 } else if (__predict_false(hdr->nlmsg_len < sizeof(*hdr))) { 415 NL_LOG(LOG_DEBUG, "message too short: %d", hdr->nlmsg_len); 416 return (EINVAL); 417 } 418 /* Stamp each message with sender pid */ 419 hdr->nlmsg_pid = nlp->nl_port; 420 421 npt->hdr = hdr; 422 423 if (hdr->nlmsg_flags & NLM_F_REQUEST && hdr->nlmsg_type >= NLMSG_MIN_TYPE) { 424 NL_LOG(LOG_DEBUG2, "handling message with msg type: %d", 425 hdr->nlmsg_type); 426 427 if (nlp->nl_linux && linux_netlink_p != NULL) { 428 struct nlmsghdr *hdr_orig = hdr; 429 hdr = linux_netlink_p->msg_from_linux(nlp->nl_proto, hdr, npt); 430 if (hdr == NULL) { 431 npt->hdr = hdr_orig; 432 if (hdr->nlmsg_flags & NLM_F_ACK) 433 nlmsg_ack(nlp, EAGAIN, hdr, npt); 434 return (0); 435 } 436 } 437 error = handler(hdr, npt); 438 NL_LOG(LOG_DEBUG2, "retcode: %d", error); 439 } 440 if ((hdr->nlmsg_flags & NLM_F_ACK) || (error != 0 && error != EINTR)) { 441 NL_LOG(LOG_DEBUG3, "ack"); 442 nlmsg_ack(nlp, error, hdr, npt); 443 NL_LOG(LOG_DEBUG3, "done"); 444 } 445 446 return (0); 447 } 448 449 static void 450 npt_clear(struct nl_pstate *npt) 451 { 452 lb_clear(&npt->lb); 453 npt->error = 0; 454 npt->err_msg = NULL; 455 npt->err_off = 0; 456 npt->hdr = NULL; 457 } 458 459 /* 460 * Processes an incoming packet, which can contain multiple netlink messages 461 */ 462 static struct mbuf * 463 nl_process_mbuf(struct mbuf *m, struct nlpcb *nlp) 464 { 465 int offset, buffer_length; 466 struct nlmsghdr *hdr; 467 char *buffer; 468 int error; 469 470 NL_LOG(LOG_DEBUG3, "RX netlink mbuf %p on %p", m, nlp->nl_socket); 471 472 struct nl_writer nw = {}; 473 if (!nlmsg_get_unicast_writer(&nw, NLMSG_SMALL, nlp)) { 
474 m_freem(m); 475 NL_LOG(LOG_DEBUG, "error allocating socket writer"); 476 return (NULL); 477 } 478 479 nlmsg_ignore_limit(&nw); 480 /* TODO: alloc this buf once for nlp */ 481 int data_length = m_length(m, NULL); 482 buffer_length = roundup2(data_length, 8) + SCRATCH_BUFFER_SIZE; 483 if (nlp->nl_linux) 484 buffer_length += roundup2(data_length, 8); 485 buffer = malloc(buffer_length, M_NETLINK, M_NOWAIT | M_ZERO); 486 if (buffer == NULL) { 487 m_freem(m); 488 nlmsg_flush(&nw); 489 NL_LOG(LOG_DEBUG, "Unable to allocate %d bytes of memory", 490 buffer_length); 491 return (NULL); 492 } 493 m_copydata(m, 0, data_length, buffer); 494 495 struct nl_pstate npt = { 496 .nlp = nlp, 497 .lb.base = &buffer[roundup2(data_length, 8)], 498 .lb.size = buffer_length - roundup2(data_length, 8), 499 .nw = &nw, 500 .strict = nlp->nl_flags & NLF_STRICT, 501 }; 502 503 for (offset = 0; offset + sizeof(struct nlmsghdr) <= data_length;) { 504 hdr = (struct nlmsghdr *)&buffer[offset]; 505 /* Save length prior to calling handler */ 506 int msglen = NLMSG_ALIGN(hdr->nlmsg_len); 507 NL_LOG(LOG_DEBUG3, "parsing offset %d/%d", offset, data_length); 508 npt_clear(&npt); 509 error = nl_receive_message(hdr, data_length - offset, nlp, &npt); 510 offset += msglen; 511 if (__predict_false(error != 0 || nlp->nl_tx_blocked)) 512 break; 513 } 514 NL_LOG(LOG_DEBUG3, "packet parsing done"); 515 free(buffer, M_NETLINK); 516 nlmsg_flush(&nw); 517 518 if (nlp->nl_tx_blocked) { 519 NLP_LOCK(nlp); 520 nlp->nl_tx_blocked = false; 521 NLP_UNLOCK(nlp); 522 m_adj(m, offset); 523 return (m); 524 } else { 525 m_freem(m); 526 return (NULL); 527 } 528 } 529