1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2021 Ng Peng Nam Sean 5 * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 
 */

#include "opt_netlink.h"

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/ck.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syslog.h>

#include <netlink/netlink.h>
#include <netlink/netlink_ctl.h>
#include <netlink/netlink_linux.h>
#include <netlink/netlink_var.h>

#define	DEBUG_MOD_NAME	nl_io
#define	DEBUG_MAX_LEVEL	LOG_DEBUG3
#include <netlink/netlink_debug.h>
_DECLARE_DEBUG(LOG_INFO);

/*
 * The logic below provides a p2p interface for receiving and
 * sending netlink data between the kernel and userland.
 */

/*
 * Source address stamped on every kernel-originated message delivered to
 * userland: nl_pid 0 conventionally identifies the kernel.
 */
static const struct sockaddr_nl _nl_empty_src = {
	.nl_len = sizeof(struct sockaddr_nl),
	.nl_family = PF_NETLINK,
	.nl_pid = 0 /* comes from the kernel */
};
static const struct sockaddr *nl_empty_src = (const struct sockaddr *)&_nl_empty_src;

static struct mbuf *nl_process_mbuf(struct mbuf *m, struct nlpcb *nlp);


/*
 * Appends the packet chain @mq (packets linked via m_nextpkt) to the tail
 * of queue @q, clearing each packet's m_nextpkt link and adding its byte
 * length to the queue's length accounting.
 */
static void
queue_push(struct nl_io_queue *q, struct mbuf *mq)
{
	while (mq != NULL) {
		struct mbuf *m = mq;
		mq = mq->m_nextpkt;
		m->m_nextpkt = NULL;

		q->length += m_length(m, NULL);
		STAILQ_INSERT_TAIL(&q->head, m, m_stailqpkt);
	}
}

/*
 * Prepends a single packet @m to the head of queue @q.
 * Used to return a partially-processed packet for a later retry.
 */
static void
queue_push_head(struct nl_io_queue *q, struct mbuf *m)
{
	MPASS(m->m_nextpkt == NULL);

	q->length += m_length(m, NULL);
	STAILQ_INSERT_HEAD(&q->head, m, m_stailqpkt);
}

/*
 * Removes and returns the packet at the head of @q, updating the length
 * accounting; returns NULL if the queue is empty.
 */
static struct mbuf *
queue_pop(struct nl_io_queue *q)
{
	if (!STAILQ_EMPTY(&q->head)) {
		struct mbuf *m = STAILQ_FIRST(&q->head);
		STAILQ_REMOVE_HEAD(&q->head, m_stailqpkt);
		m->m_nextpkt = NULL;
		q->length -= m_length(m, NULL);

		return (m);
	}
	return (NULL);
}

/* Returns the packet at the head of @q without removing it (NULL if empty). */
static struct mbuf *
queue_head(const struct nl_io_queue *q)
{
	return (STAILQ_FIRST(&q->head));
}

/*
 * Returns true when @q holds no data.
 * Note this tests the byte-length counter, not STAILQ_EMPTY(); the two stay
 * in sync because every push/pop adjusts q->length.
 */
static inline bool
queue_empty(const struct nl_io_queue *q)
{
	return (q->length == 0);
}

/* Frees all packets remaining in @q and resets its length accounting. */
static void
queue_free(struct nl_io_queue *q)
{
	while (!STAILQ_EMPTY(&q->head)) {
		struct mbuf *m = STAILQ_FIRST(&q->head);
		STAILQ_REMOVE_HEAD(&q->head, m_stailqpkt);
		m->m_nextpkt = NULL;
		m_freem(m);
	}
	q->length = 0;
}

/*
 * Appends a MT_CONTROL mbuf carrying NETLINK_MSG_INFO attributes
 * (originator process ID and netlink port ID) to the end of chain @m.
 * No-op when the current thread has no netlink socket association
 * (nl_get_thread_nlp() returns NULL).
 *
 * NOTE(review): sbcreatecontrol(..., M_NOWAIT) may fail and return NULL;
 * the NULL is stored in m->m_next and only logged — presumably receivers
 * tolerate a missing control mbuf. Confirm with consumers of
 * NETLINK_MSG_INFO.
 */
void
nl_add_msg_info(struct mbuf *m)
{
	struct nlpcb *nlp = nl_get_thread_nlp(curthread);
	NL_LOG(LOG_DEBUG2, "Trying to recover nlp from thread %p: %p",
	    curthread, nlp);

	if (nlp == NULL)
		return;

	/* Prepare what we want to encode - PID, socket PID & msg seq */
	struct {
		struct nlattr nla;
		uint32_t val;
	} data[] = {
		{
			.nla.nla_len = sizeof(struct nlattr) + sizeof(uint32_t),
			.nla.nla_type = NLMSGINFO_ATTR_PROCESS_ID,
			.val = nlp->nl_process_id,
		},
		{
			.nla.nla_len = sizeof(struct nlattr) + sizeof(uint32_t),
			.nla.nla_type = NLMSGINFO_ATTR_PORT_ID,
			.val = nlp->nl_port,
		},
	};

	/* Walk to the last mbuf of the chain and hang the control mbuf off it. */
	while (m->m_next != NULL)
		m = m->m_next;
	m->m_next = sbcreatecontrol(data, sizeof(data),
	    NETLINK_MSG_INFO, SOL_NETLINK, M_NOWAIT);

	NL_LOG(LOG_DEBUG2, "Storing %u bytes of data, ctl: %p",
	    (unsigned)sizeof(data), m->m_next);
}

/*
 * Detaches and returns the first MT_CONTROL mbuf found in chain @m
 * (the one attached by nl_add_msg_info()), severing the chain just
 * before it; returns NULL if the chain carries no control mbuf.
 */
static __noinline struct mbuf *
extract_msg_info(struct mbuf *m)
{
	while (m->m_next != NULL) {
		if (m->m_next->m_type == MT_CONTROL) {
			struct mbuf *ctl = m->m_next;
			m->m_next = NULL;
			return (ctl);
		}
		m = m->m_next;
	}
	return (NULL);
}

/*
 * Schedules the per-socket processing task unless one is already pending.
 * Both callers (nl_receive_async(), nl_on_transmit()) invoke this with the
 * NLP lock held, which serializes the nl_task_pending test-and-set.
 */
static void
nl_schedule_taskqueue(struct nlpcb *nlp)
{
	if (!nlp->nl_task_pending) {
		nlp->nl_task_pending = true;
		taskqueue_enqueue(nlp->nl_taskqueue, &nlp->nl_task);
		NL_LOG(LOG_DEBUG3, "taskqueue scheduled");
	} else {
		NL_LOG(LOG_DEBUG3, "taskqueue schedule skipped");
	}
}

/*
 * Userland->kernel input path: queues mbuf @m written to socket @so onto
 * the socket's send buffer and schedules asynchronous processing by the
 * socket's taskqueue.
 *
 * Returns 0 on success or EINVAL if the socket is no longer active
 * (in which case @m is freed here).
 *
 * NOTE(review): the failure path uses m_free(), which releases only the
 * first mbuf of a chain; if @m can be a multi-mbuf chain this leaks the
 * remainder — m_freem() would free the whole chain. Confirm whether
 * callers ever pass chains.
 */
int
nl_receive_async(struct mbuf *m, struct socket *so)
{
	struct nlpcb *nlp = sotonlpcb(so);
	int error = 0;

	m->m_nextpkt = NULL;

	NLP_LOCK(nlp);

	if ((__predict_true(nlp->nl_active))) {
		sbappend(&so->so_snd, m, 0);
		NL_LOG(LOG_DEBUG3, "enqueue %u bytes", m_length(m, NULL));
		nl_schedule_taskqueue(nlp);
	} else {
		NL_LOG(LOG_DEBUG, "ignoring %u bytes on non-active socket",
		    m_length(m, NULL));
		m_free(m);
		error = EINVAL;
	}

	NLP_UNLOCK(nlp);

	return (error);
}

/*
 * Attempts to drain the internal TX overflow queue into the socket's
 * receive buffer, stopping at the first packet the sockbuf refuses.
 * Wakes up the socket reader if anything was moved.
 *
 * Returns true when the TX queue is empty afterwards (i.e. RX processing
 * may proceed). Called with the NLP lock held; takes the SOCKBUF lock
 * internally.
 */
static bool
tx_check_locked(struct nlpcb *nlp)
{
	if (queue_empty(&nlp->tx_queue))
		return (true);

	/*
	 * Check if something can be moved from the internal TX queue
	 * to the socket queue.
	 */

	bool appended = false;
	struct sockbuf *sb = &nlp->nl_socket->so_rcv;
	SOCKBUF_LOCK(sb);

	while (true) {
		struct mbuf *m = queue_head(&nlp->tx_queue);
		if (m != NULL) {
			struct mbuf *ctl = NULL;
			/* Control mbuf (msg info) rides as part of the chain. */
			if (__predict_false(m->m_next != NULL))
				ctl = extract_msg_info(m);
			if (sbappendaddr_locked(sb, nl_empty_src, m, ctl) != 0) {
				/* appended successfully */
				queue_pop(&nlp->tx_queue);
				appended = true;
			} else
				break;
		} else
			break;
	}

	SOCKBUF_UNLOCK(sb);

	if (appended)
		sorwakeup(nlp->nl_socket);

	return (queue_empty(&nlp->tx_queue));
}

/*
 * One pass of the taskqueue handler: tries to flush the TX overflow queue,
 * then pulls pending input from the socket send buffer into the internal
 * RX queue and processes it packet by packet.
 *
 * Returns true when another pass should be scheduled (more socket data
 * remained while the RX queue was non-empty); returns false when done or
 * when the TX overflow queue is still blocked.
 */
static bool
nl_process_received_one(struct nlpcb *nlp)
{
	bool reschedule = false;

	NLP_LOCK(nlp);
	nlp->nl_task_pending = false;

	if (!tx_check_locked(nlp)) {
		/* TX overflow queue still not empty, ignore RX */
		NLP_UNLOCK(nlp);
		return (false);
	}

	if (queue_empty(&nlp->rx_queue)) {
		/*
		 * Grab all data we have from the socket TX queue
		 * and store it in the internal queue, so it can be worked on
		 * w/o holding socket lock.
		 */
		struct sockbuf *sb = &nlp->nl_socket->so_snd;

		SOCKBUF_LOCK(sb);
		unsigned int avail = sbavail(sb);
		if (avail > 0) {
			NL_LOG(LOG_DEBUG3, "grabbed %u bytes", avail);
			queue_push(&nlp->rx_queue, sbcut_locked(sb, avail));
		}
		SOCKBUF_UNLOCK(sb);
	} else {
		/* Schedule another pass to read from the socket queue */
		reschedule = true;
	}

	int prev_hiwat = nlp->tx_queue.hiwat;
	NLP_UNLOCK(nlp);

	/* Process queued packets without holding any lock. */
	while (!queue_empty(&nlp->rx_queue)) {
		struct mbuf *m = queue_pop(&nlp->rx_queue);

		m = nl_process_mbuf(m, nlp);
		if (m != NULL) {
			/*
			 * Partially-consumed packet: TX became blocked.
			 * Re-queue the remainder and stop for now.
			 */
			queue_push_head(&nlp->rx_queue, m);
			reschedule = false;
			break;
		}
	}
	/* NOTE(review): hiwat re-read without the NLP lock — benign for logging. */
	if (nlp->tx_queue.hiwat > prev_hiwat) {
		NLP_LOG(LOG_DEBUG, nlp, "TX override peaked to %d", nlp->tx_queue.hiwat);

	}

	return (reschedule);
}

/*
 * Taskqueue worker body: binds this taskqueue thread to @nlp on first run
 * (so nl_get_thread_nlp() works from handlers), then loops processing
 * until nl_process_received_one() reports no more work.
 */
static void
nl_process_received(struct nlpcb *nlp)
{
	NL_LOG(LOG_DEBUG3, "taskqueue called");

	if (__predict_false(nlp->nl_need_thread_setup)) {
		nl_set_thread_nlp(curthread, nlp);
		NLP_LOCK(nlp);
		nlp->nl_need_thread_setup = false;
		NLP_UNLOCK(nlp);
	}

	while (nl_process_received_one(nlp))
		;
}

/* Initializes the per-socket RX and TX internal queues. */
void
nl_init_io(struct nlpcb *nlp)
{
	STAILQ_INIT(&nlp->rx_queue.head);
	STAILQ_INIT(&nlp->tx_queue.head);
}

/* Releases all packets still held in the per-socket internal queues. */
void
nl_free_io(struct nlpcb *nlp)
{
	queue_free(&nlp->rx_queue);
	queue_free(&nlp->tx_queue);
}

/*
 * Called after some data have been read from the socket.
 * Reports (via debug log) any messages dropped due to receive-buffer
 * overflow since the last read, then reschedules the processing task so
 * the freed buffer space can be refilled from the TX overflow queue.
 */
void
nl_on_transmit(struct nlpcb *nlp)
{
	NLP_LOCK(nlp);

	struct socket *so = nlp->nl_socket;
	if (__predict_false(nlp->nl_dropped_bytes > 0 && so != NULL)) {
		unsigned long dropped_bytes = nlp->nl_dropped_bytes;
		unsigned long dropped_messages = nlp->nl_dropped_messages;
		nlp->nl_dropped_bytes = 0;
		nlp->nl_dropped_messages = 0;

		struct sockbuf *sb = &so->so_rcv;
		NLP_LOG(LOG_DEBUG, nlp,
		    "socket RX overflowed, %lu messages (%lu bytes) dropped. "
		    "bytes: [%u/%u] mbufs: [%u/%u]", dropped_messages, dropped_bytes,
		    sb->sb_ccc, sb->sb_hiwat, sb->sb_mbcnt, sb->sb_mbmax);
		/* TODO: send netlink message */
	}

	nl_schedule_taskqueue(nlp);
	NLP_UNLOCK(nlp);
}

/*
 * Taskqueue callback: sets the socket's vnet context and runs the
 * processing loop. @pending is unused (required by the taskqueue API).
 */
void
nl_taskqueue_handler(void *_arg, int pending)
{
	struct nlpcb *nlp = (struct nlpcb *)_arg;

	CURVNET_SET(nlp->nl_socket->so_vnet);
	nl_process_received(nlp);
	CURVNET_RESTORE();
}

/*
 * Places @m on the internal TX overflow queue, marks TX as blocked so
 * message parsing pauses, and tracks the queue's high-water mark.
 * Called with the NLP lock held (both callers are in nl_send_one()).
 */
static __noinline void
queue_push_tx(struct nlpcb *nlp, struct mbuf *m)
{
	queue_push(&nlp->tx_queue, m);
	nlp->nl_tx_blocked = true;

	if (nlp->tx_queue.length > nlp->tx_queue.hiwat)
		nlp->tx_queue.hiwat = nlp->tx_queue.length;
}

/*
 * Tries to send @m to the socket @nlp.
 *
 * @m: mbuf(s) to send to. Consumed in any case.
 * @nlp: socket to send to
 * @num_messages: number of messages in @m
 * @io_flags: combination of NL_IOF_* flags
 *
 * Returns true on success.
 * If no queue overruns happened, wakes up socket owner.
 */
bool
nl_send_one(struct mbuf *m, struct nlpcb *nlp, int num_messages, int io_flags)
{
	bool untranslated = io_flags & NL_IOF_UNTRANSLATED;
	bool ignore_limits = io_flags & NL_IOF_IGNORE_LIMIT;
	bool result = true;

	IF_DEBUG_LEVEL(LOG_DEBUG2) {
		struct nlmsghdr *hdr = mtod(m, struct nlmsghdr *);
		NLP_LOG(LOG_DEBUG2, nlp,
		    "TX mbuf len %u msgs %u msg type %d first hdrlen %u io_flags %X",
		    m_length(m, NULL), num_messages, hdr->nlmsg_type, hdr->nlmsg_len,
		    io_flags);
	}

	/* Translate to Linux wire format first if the peer is a Linux process. */
	if (__predict_false(nlp->nl_linux && linux_netlink_p != NULL && untranslated)) {
		m = linux_netlink_p->mbufs_to_linux(nlp->nl_proto, m, nlp);
		if (m == NULL)
			return (false);
	}

	NLP_LOCK(nlp);

	if (__predict_false(nlp->nl_socket == NULL)) {
		NLP_UNLOCK(nlp);
		m_freem(m);
		return (false);
	}

	/*
	 * Preserve ordering: if the overflow queue already holds data,
	 * either append behind it (ignore_limits) or drop.
	 */
	if (!queue_empty(&nlp->tx_queue)) {
		if (ignore_limits) {
			queue_push_tx(nlp, m);
		} else {
			/*
			 * NOTE(review): m_free() releases only the first mbuf;
			 * m_freem() (used in the other drop paths of this
			 * function) would free the whole chain. Verify @m
			 * cannot be a chain here.
			 */
			m_free(m);
			result = false;
		}
		NLP_UNLOCK(nlp);
		return (result);
	}

	struct socket *so = nlp->nl_socket;
	struct mbuf *ctl = NULL;
	if (__predict_false(m->m_next != NULL))
		ctl = extract_msg_info(m);
	if (sbappendaddr(&so->so_rcv, nl_empty_src, m, ctl) != 0) {
		sorwakeup(so);
		NLP_LOG(LOG_DEBUG3, nlp, "appended data & woken up");
	} else {
		if (ignore_limits) {
			queue_push_tx(nlp, m);
		} else {
			/*
			 * Store dropped data so it can be reported
			 * on the next read
			 */
			nlp->nl_dropped_bytes += m_length(m, NULL);
			nlp->nl_dropped_messages += num_messages;
			NLP_LOG(LOG_DEBUG2, nlp, "RX oveflow: %lu m (+%d), %lu b (+%d)",
			    (unsigned long)nlp->nl_dropped_messages, num_messages,
			    (unsigned long)nlp->nl_dropped_bytes, m_length(m, NULL));
			soroverflow(so);
			m_freem(m);
			result = false;
		}
	}
	NLP_UNLOCK(nlp);

	return (result);
}

/*
 * Validates and dispatches a single netlink message @hdr to the protocol
 * handler, stamping it with the sender port and sending an ack/error
 * (NLMSG_ERROR) back when requested or when the handler failed.
 *
 * Returns 0 unless the message itself is malformed (truncated or shorter
 * than a header), in which case EINVAL aborts parsing of the packet.
 *
 * NOTE(review): hdr->nlmsg_len (uint32_t) > remaining_length (int)
 * promotes the int to unsigned; safe as long as remaining_length is
 * non-negative, which the caller's loop bound guarantees.
 */
static int
nl_receive_message(struct nlmsghdr *hdr, int remaining_length,
    struct nlpcb *nlp, struct nl_pstate *npt)
{
	nl_handler_f handler = nl_handlers[nlp->nl_proto].cb;
	int error = 0;

	NLP_LOG(LOG_DEBUG2, nlp, "msg len: %u type: %d: flags: 0x%X seq: %u pid: %u",
	    hdr->nlmsg_len, hdr->nlmsg_type, hdr->nlmsg_flags, hdr->nlmsg_seq,
	    hdr->nlmsg_pid);

	if (__predict_false(hdr->nlmsg_len > remaining_length)) {
		NLP_LOG(LOG_DEBUG, nlp, "message is not entirely present: want %d got %d",
		    hdr->nlmsg_len, remaining_length);
		return (EINVAL);
	} else if (__predict_false(hdr->nlmsg_len < sizeof(*hdr))) {
		NL_LOG(LOG_DEBUG, "message too short: %d", hdr->nlmsg_len);
		return (EINVAL);
	}
	/* Stamp each message with sender pid */
	hdr->nlmsg_pid = nlp->nl_port;

	npt->hdr = hdr;

	/* Only requests with a real (non-control) type reach the handler. */
	if (hdr->nlmsg_flags & NLM_F_REQUEST && hdr->nlmsg_type >= NLMSG_MIN_TYPE) {
		NL_LOG(LOG_DEBUG2, "handling message with msg type: %d",
		    hdr->nlmsg_type);

		if (nlp->nl_linux && linux_netlink_p != NULL) {
			struct nlmsghdr *hdr_orig = hdr;
			hdr = linux_netlink_p->msg_from_linux(nlp->nl_proto, hdr, npt);
			if (hdr == NULL) {
				/* Failed to translate to kernel format. Report an error back */
				hdr = hdr_orig;
				npt->hdr = hdr;
				if (hdr->nlmsg_flags & NLM_F_ACK)
					nlmsg_ack(nlp, EOPNOTSUPP, hdr, npt);
				return (0);
			}
		}
		error = handler(hdr, npt);
		NL_LOG(LOG_DEBUG2, "retcode: %d", error);
	}
	/* Ack on request, or report any handler error except EINTR. */
	if ((hdr->nlmsg_flags & NLM_F_ACK) || (error != 0 && error != EINTR)) {
		if (!npt->nw->suppress_ack) {
			NL_LOG(LOG_DEBUG3, "ack");
			nlmsg_ack(nlp, error, hdr, npt);
		}
	}

	return (0);
}

/*
 * Resets per-message parsing state in @npt before each message:
 * clears the scratch line buffer, error bookkeeping, and the
 * suppress_ack flag a handler may have set.
 */
static void
npt_clear(struct nl_pstate *npt)
{
	lb_clear(&npt->lb);
	npt->error = 0;
	npt->err_msg = NULL;
	npt->err_off = 0;
	npt->hdr = NULL;
	npt->nw->suppress_ack = false;
}

/*
 * Processes an incoming packet, which can contain multiple netlink messages.
 *
 * Copies the packet into a linear buffer (messages + trailing scratch space
 * for the parser, doubled for Linux-translated sockets) and feeds each
 * message to nl_receive_message().
 *
 * Returns NULL when the packet was fully consumed (and freed), or the
 * remaining unconsumed part of @m when processing stopped because the
 * TX path became blocked — the caller re-queues it for a later pass.
 */
static struct mbuf *
nl_process_mbuf(struct mbuf *m, struct nlpcb *nlp)
{
	int offset, buffer_length;
	struct nlmsghdr *hdr;
	char *buffer;
	int error;

	NL_LOG(LOG_DEBUG3, "RX netlink mbuf %p on %p", m, nlp->nl_socket);

	struct nl_writer nw = {};
	if (!nlmsg_get_unicast_writer(&nw, NLMSG_SMALL, nlp)) {
		m_freem(m);
		NL_LOG(LOG_DEBUG, "error allocating socket writer");
		return (NULL);
	}

	/* Replies (acks/errors) must not be dropped on sockbuf overflow. */
	nlmsg_ignore_limit(&nw);
	/* TODO: alloc this buf once for nlp */
	int data_length = m_length(m, NULL);
	buffer_length = roundup2(data_length, 8) + SCRATCH_BUFFER_SIZE;
	if (nlp->nl_linux)
		buffer_length += roundup2(data_length, 8);
	buffer = malloc(buffer_length, M_NETLINK, M_NOWAIT | M_ZERO);
	if (buffer == NULL) {
		m_freem(m);
		nlmsg_flush(&nw);
		NL_LOG(LOG_DEBUG, "Unable to allocate %d bytes of memory",
		    buffer_length);
		return (NULL);
	}
	m_copydata(m, 0, data_length, buffer);

	/* Scratch area starts right after the 8-byte-aligned message data. */
	struct nl_pstate npt = {
		.nlp = nlp,
		.lb.base = &buffer[roundup2(data_length, 8)],
		.lb.size = buffer_length - roundup2(data_length, 8),
		.nw = &nw,
		.strict = nlp->nl_flags & NLF_STRICT,
	};

	for (offset = 0; offset + sizeof(struct nlmsghdr) <= data_length;) {
		hdr = (struct nlmsghdr *)&buffer[offset];
		/* Save length prior to calling handler */
		int msglen = NLMSG_ALIGN(hdr->nlmsg_len);
		NL_LOG(LOG_DEBUG3, "parsing offset %d/%d", offset, data_length);
		npt_clear(&npt);
		error = nl_receive_message(hdr, data_length - offset, nlp, &npt);
		offset += msglen;
		if (__predict_false(error != 0 || nlp->nl_tx_blocked))
			break;
	}
	NL_LOG(LOG_DEBUG3, "packet parsing done");
	free(buffer, M_NETLINK);
	nlmsg_flush(&nw);

	if (nlp->nl_tx_blocked) {
		NLP_LOCK(nlp);
		nlp->nl_tx_blocked = false;
		NLP_UNLOCK(nlp);
		/* Trim the consumed prefix; the caller retries the rest. */
		m_adj(m, offset);
		return (m);
	} else {
		m_freem(m);
		return (NULL);
	}
}