1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2021 Ng Peng Nam Sean 5 * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 
 */

#include "opt_netlink.h"

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/ck.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syslog.h>

#include <netlink/netlink.h>
#include <netlink/netlink_ctl.h>
#include <netlink/netlink_linux.h>
#include <netlink/netlink_var.h>

#define	DEBUG_MOD_NAME	nl_io
#define	DEBUG_MAX_LEVEL	LOG_DEBUG3
#include <netlink/netlink_debug.h>
_DECLARE_DEBUG(LOG_INFO);

/*
 * The logic below provides a p2p interface for receiving and
 * sending netlink data between the kernel and userland.
 */

/*
 * Source address stamped on every message delivered to userland:
 * PF_NETLINK with pid 0 means "from the kernel".
 */
static const struct sockaddr_nl _nl_empty_src = {
	.nl_len = sizeof(struct sockaddr_nl),
	.nl_family = PF_NETLINK,
	.nl_pid = 0 /* comes from the kernel */
};
static const struct sockaddr *nl_empty_src = (const struct sockaddr *)&_nl_empty_src;

static struct mbuf *nl_process_mbuf(struct mbuf *m, struct nlpcb *nlp);

/*
 * Appends the packet chain @mq (linked via m_nextpkt) to the tail of
 * queue @q, accounting each packet's byte length in q->length.
 * The m_nextpkt linkage is cleared; packets are kept on the STAILQ instead.
 */
static void
queue_push(struct nl_io_queue *q, struct mbuf *mq)
{
	while (mq != NULL) {
		struct mbuf *m = mq;
		mq = mq->m_nextpkt;
		m->m_nextpkt = NULL;

		q->length += m_length(m, NULL);
		STAILQ_INSERT_TAIL(&q->head, m, m_stailqpkt);
	}
}

/*
 * Puts a single packet @m back at the head of queue @q
 * (used to re-queue a partially-processed packet).
 */
static void
queue_push_head(struct nl_io_queue *q, struct mbuf *m)
{
	MPASS(m->m_nextpkt == NULL);

	q->length += m_length(m, NULL);
	STAILQ_INSERT_HEAD(&q->head, m, m_stailqpkt);
}

/*
 * Removes and returns the packet at the head of @q, updating the
 * queued byte count.  Returns NULL if the queue is empty.
 */
static struct mbuf *
queue_pop(struct nl_io_queue *q)
{
	if (!STAILQ_EMPTY(&q->head)) {
		struct mbuf *m = STAILQ_FIRST(&q->head);
		STAILQ_REMOVE_HEAD(&q->head, m_stailqpkt);
		m->m_nextpkt = NULL;
		q->length -= m_length(m, NULL);

		return (m);
	}
	return (NULL);
}

/* Returns the packet at the head of @q without dequeueing it (or NULL). */
static struct mbuf *
queue_head(const struct nl_io_queue *q)
{
	return (STAILQ_FIRST(&q->head));
}

static inline bool 111 queue_empty(const struct nl_io_queue *q) 112 { 113 return (q->length == 0); 114 } 115 116 static void 117 queue_free(struct nl_io_queue *q) 118 { 119 while (!STAILQ_EMPTY(&q->head)) { 120 struct mbuf *m = STAILQ_FIRST(&q->head); 121 STAILQ_REMOVE_HEAD(&q->head, m_stailqpkt); 122 m->m_nextpkt = NULL; 123 m_freem(m); 124 } 125 q->length = 0; 126 } 127 128 void 129 nl_add_msg_info(struct mbuf *m) 130 { 131 struct nlpcb *nlp = nl_get_thread_nlp(curthread); 132 NL_LOG(LOG_DEBUG2, "Trying to recover nlp from thread %p: %p", 133 curthread, nlp); 134 135 if (nlp == NULL) 136 return; 137 138 /* Prepare what we want to encode - PID, socket PID & msg seq */ 139 struct { 140 struct nlattr nla; 141 uint32_t val; 142 } data[] = { 143 { 144 .nla.nla_len = sizeof(struct nlattr) + sizeof(uint32_t), 145 .nla.nla_type = NLMSGINFO_ATTR_PROCESS_ID, 146 .val = nlp->nl_process_id, 147 }, 148 { 149 .nla.nla_len = sizeof(struct nlattr) + sizeof(uint32_t), 150 .nla.nla_type = NLMSGINFO_ATTR_PORT_ID, 151 .val = nlp->nl_port, 152 }, 153 }; 154 155 156 while (m->m_next != NULL) 157 m = m->m_next; 158 m->m_next = sbcreatecontrol(data, sizeof(data), 159 NETLINK_MSG_INFO, SOL_NETLINK, M_NOWAIT); 160 161 NL_LOG(LOG_DEBUG2, "Storing %u bytes of data, ctl: %p", 162 (unsigned)sizeof(data), m->m_next); 163 } 164 165 static __noinline struct mbuf * 166 extract_msg_info(struct mbuf *m) 167 { 168 while (m->m_next != NULL) { 169 if (m->m_next->m_type == MT_CONTROL) { 170 struct mbuf *ctl = m->m_next; 171 m->m_next = NULL; 172 return (ctl); 173 } 174 m = m->m_next; 175 } 176 return (NULL); 177 } 178 179 static void 180 nl_schedule_taskqueue(struct nlpcb *nlp) 181 { 182 if (!nlp->nl_task_pending) { 183 nlp->nl_task_pending = true; 184 taskqueue_enqueue(nlp->nl_taskqueue, &nlp->nl_task); 185 NL_LOG(LOG_DEBUG3, "taskqueue scheduled"); 186 } else { 187 NL_LOG(LOG_DEBUG3, "taskqueue schedule skipped"); 188 } 189 } 190 191 int 192 nl_receive_async(struct mbuf *m, struct socket *so) 193 { 
194 struct nlpcb *nlp = sotonlpcb(so); 195 int error = 0; 196 197 m->m_nextpkt = NULL; 198 199 NLP_LOCK(nlp); 200 201 if ((__predict_true(nlp->nl_active))) { 202 sbappend(&so->so_snd, m, 0); 203 NL_LOG(LOG_DEBUG3, "enqueue %u bytes", m_length(m, NULL)); 204 nl_schedule_taskqueue(nlp); 205 } else { 206 NL_LOG(LOG_DEBUG, "ignoring %u bytes on non-active socket", 207 m_length(m, NULL)); 208 m_free(m); 209 error = EINVAL; 210 } 211 212 NLP_UNLOCK(nlp); 213 214 return (error); 215 } 216 217 static bool 218 tx_check_locked(struct nlpcb *nlp) 219 { 220 if (queue_empty(&nlp->tx_queue)) 221 return (true); 222 223 /* 224 * Check if something can be moved from the internal TX queue 225 * to the socket queue. 226 */ 227 228 bool appended = false; 229 struct sockbuf *sb = &nlp->nl_socket->so_rcv; 230 SOCKBUF_LOCK(sb); 231 232 while (true) { 233 struct mbuf *m = queue_head(&nlp->tx_queue); 234 if (m != NULL) { 235 struct mbuf *ctl = NULL; 236 if (__predict_false(m->m_next != NULL)) 237 ctl = extract_msg_info(m); 238 if (sbappendaddr_locked(sb, nl_empty_src, m, ctl) != 0) { 239 /* appended successfully */ 240 queue_pop(&nlp->tx_queue); 241 appended = true; 242 } else 243 break; 244 } else 245 break; 246 } 247 248 SOCKBUF_UNLOCK(sb); 249 250 if (appended) 251 sorwakeup(nlp->nl_socket); 252 253 return (queue_empty(&nlp->tx_queue)); 254 } 255 256 static bool 257 nl_process_received_one(struct nlpcb *nlp) 258 { 259 bool reschedule = false; 260 261 NLP_LOCK(nlp); 262 nlp->nl_task_pending = false; 263 264 if (!tx_check_locked(nlp)) { 265 /* TX overflow queue still not empty, ignore RX */ 266 NLP_UNLOCK(nlp); 267 return (false); 268 } 269 270 if (queue_empty(&nlp->rx_queue)) { 271 /* 272 * Grab all data we have from the socket TX queue 273 * and store it the internal queue, so it can be worked on 274 * w/o holding socket lock. 
275 */ 276 struct sockbuf *sb = &nlp->nl_socket->so_snd; 277 278 SOCKBUF_LOCK(sb); 279 unsigned int avail = sbavail(sb); 280 if (avail > 0) { 281 NL_LOG(LOG_DEBUG3, "grabbed %u bytes", avail); 282 queue_push(&nlp->rx_queue, sbcut_locked(sb, avail)); 283 } 284 SOCKBUF_UNLOCK(sb); 285 } else { 286 /* Schedule another pass to read from the socket queue */ 287 reschedule = true; 288 } 289 290 int prev_hiwat = nlp->tx_queue.hiwat; 291 NLP_UNLOCK(nlp); 292 293 while (!queue_empty(&nlp->rx_queue)) { 294 struct mbuf *m = queue_pop(&nlp->rx_queue); 295 296 m = nl_process_mbuf(m, nlp); 297 if (m != NULL) { 298 queue_push_head(&nlp->rx_queue, m); 299 reschedule = false; 300 break; 301 } 302 } 303 if (nlp->tx_queue.hiwat > prev_hiwat) { 304 NLP_LOG(LOG_DEBUG, nlp, "TX override peaked to %d", nlp->tx_queue.hiwat); 305 306 } 307 308 return (reschedule); 309 } 310 311 static void 312 nl_process_received(struct nlpcb *nlp) 313 { 314 NL_LOG(LOG_DEBUG3, "taskqueue called"); 315 316 if (__predict_false(nlp->nl_need_thread_setup)) { 317 nl_set_thread_nlp(curthread, nlp); 318 NLP_LOCK(nlp); 319 nlp->nl_need_thread_setup = false; 320 NLP_UNLOCK(nlp); 321 } 322 323 while (nl_process_received_one(nlp)) 324 ; 325 } 326 327 void 328 nl_init_io(struct nlpcb *nlp) 329 { 330 STAILQ_INIT(&nlp->rx_queue.head); 331 STAILQ_INIT(&nlp->tx_queue.head); 332 } 333 334 void 335 nl_free_io(struct nlpcb *nlp) 336 { 337 queue_free(&nlp->rx_queue); 338 queue_free(&nlp->tx_queue); 339 } 340 341 /* 342 * Called after some data have been read from the socket. 
343 */ 344 void 345 nl_on_transmit(struct nlpcb *nlp) 346 { 347 NLP_LOCK(nlp); 348 349 struct socket *so = nlp->nl_socket; 350 if (__predict_false(nlp->nl_dropped_bytes > 0 && so != NULL)) { 351 unsigned long dropped_bytes = nlp->nl_dropped_bytes; 352 unsigned long dropped_messages = nlp->nl_dropped_messages; 353 nlp->nl_dropped_bytes = 0; 354 nlp->nl_dropped_messages = 0; 355 356 struct sockbuf *sb = &so->so_rcv; 357 NLP_LOG(LOG_DEBUG, nlp, 358 "socket RX overflowed, %lu messages (%lu bytes) dropped. " 359 "bytes: [%u/%u] mbufs: [%u/%u]", dropped_messages, dropped_bytes, 360 sb->sb_ccc, sb->sb_hiwat, sb->sb_mbcnt, sb->sb_mbmax); 361 /* TODO: send netlink message */ 362 } 363 364 nl_schedule_taskqueue(nlp); 365 NLP_UNLOCK(nlp); 366 } 367 368 void 369 nl_taskqueue_handler(void *_arg, int pending) 370 { 371 struct nlpcb *nlp = (struct nlpcb *)_arg; 372 373 CURVNET_SET(nlp->nl_socket->so_vnet); 374 nl_process_received(nlp); 375 CURVNET_RESTORE(); 376 } 377 378 static __noinline void 379 queue_push_tx(struct nlpcb *nlp, struct mbuf *m) 380 { 381 queue_push(&nlp->tx_queue, m); 382 nlp->nl_tx_blocked = true; 383 384 if (nlp->tx_queue.length > nlp->tx_queue.hiwat) 385 nlp->tx_queue.hiwat = nlp->tx_queue.length; 386 } 387 388 /* 389 * Tries to send @m to the socket @nlp. 390 * 391 * @m: mbuf(s) to send to. Consumed in any case. 392 * @nlp: socket to send to 393 * @cnt: number of messages in @m 394 * @io_flags: combination of NL_IOF_* flags 395 * 396 * Returns true on success. 397 * If no queue overrunes happened, wakes up socket owner. 
398 */ 399 bool 400 nl_send_one(struct mbuf *m, struct nlpcb *nlp, int num_messages, int io_flags) 401 { 402 bool untranslated = io_flags & NL_IOF_UNTRANSLATED; 403 bool ignore_limits = io_flags & NL_IOF_IGNORE_LIMIT; 404 bool result = true; 405 406 IF_DEBUG_LEVEL(LOG_DEBUG2) { 407 struct nlmsghdr *hdr = mtod(m, struct nlmsghdr *); 408 NLP_LOG(LOG_DEBUG2, nlp, 409 "TX mbuf len %u msgs %u msg type %d first hdrlen %u io_flags %X", 410 m_length(m, NULL), num_messages, hdr->nlmsg_type, hdr->nlmsg_len, 411 io_flags); 412 } 413 414 if (__predict_false(nlp->nl_linux && linux_netlink_p != NULL && untranslated)) { 415 m = linux_netlink_p->mbufs_to_linux(nlp->nl_proto, m, nlp); 416 if (m == NULL) 417 return (false); 418 } 419 420 NLP_LOCK(nlp); 421 422 if (__predict_false(nlp->nl_socket == NULL)) { 423 NLP_UNLOCK(nlp); 424 m_freem(m); 425 return (false); 426 } 427 428 if (!queue_empty(&nlp->tx_queue)) { 429 if (ignore_limits) { 430 queue_push_tx(nlp, m); 431 } else { 432 m_free(m); 433 result = false; 434 } 435 NLP_UNLOCK(nlp); 436 return (result); 437 } 438 439 struct socket *so = nlp->nl_socket; 440 struct mbuf *ctl = NULL; 441 if (__predict_false(m->m_next != NULL)) 442 ctl = extract_msg_info(m); 443 if (sbappendaddr(&so->so_rcv, nl_empty_src, m, ctl) != 0) { 444 sorwakeup(so); 445 NLP_LOG(LOG_DEBUG3, nlp, "appended data & woken up"); 446 } else { 447 if (ignore_limits) { 448 queue_push_tx(nlp, m); 449 } else { 450 /* 451 * Store dropped data so it can be reported 452 * on the next read 453 */ 454 nlp->nl_dropped_bytes += m_length(m, NULL); 455 nlp->nl_dropped_messages += num_messages; 456 NLP_LOG(LOG_DEBUG2, nlp, "RX oveflow: %lu m (+%d), %lu b (+%d)", 457 (unsigned long)nlp->nl_dropped_messages, num_messages, 458 (unsigned long)nlp->nl_dropped_bytes, m_length(m, NULL)); 459 soroverflow(so); 460 m_freem(m); 461 result = false; 462 } 463 } 464 NLP_UNLOCK(nlp); 465 466 return (result); 467 } 468 469 static int 470 nl_receive_message(struct nlmsghdr *hdr, int 
remaining_length, 471 struct nlpcb *nlp, struct nl_pstate *npt) 472 { 473 nl_handler_f handler = nl_handlers[nlp->nl_proto].cb; 474 int error = 0; 475 476 NLP_LOG(LOG_DEBUG2, nlp, "msg len: %u type: %d: flags: 0x%X seq: %u pid: %u", 477 hdr->nlmsg_len, hdr->nlmsg_type, hdr->nlmsg_flags, hdr->nlmsg_seq, 478 hdr->nlmsg_pid); 479 480 if (__predict_false(hdr->nlmsg_len > remaining_length)) { 481 NLP_LOG(LOG_DEBUG, nlp, "message is not entirely present: want %d got %d", 482 hdr->nlmsg_len, remaining_length); 483 return (EINVAL); 484 } else if (__predict_false(hdr->nlmsg_len < sizeof(*hdr))) { 485 NL_LOG(LOG_DEBUG, "message too short: %d", hdr->nlmsg_len); 486 return (EINVAL); 487 } 488 /* Stamp each message with sender pid */ 489 hdr->nlmsg_pid = nlp->nl_port; 490 491 npt->hdr = hdr; 492 493 if (hdr->nlmsg_flags & NLM_F_REQUEST && hdr->nlmsg_type >= NLMSG_MIN_TYPE) { 494 NL_LOG(LOG_DEBUG2, "handling message with msg type: %d", 495 hdr->nlmsg_type); 496 497 if (nlp->nl_linux && linux_netlink_p != NULL) { 498 struct nlmsghdr *hdr_orig = hdr; 499 hdr = linux_netlink_p->msg_from_linux(nlp->nl_proto, hdr, npt); 500 if (hdr == NULL) { 501 /* Failed to translate to kernel format. 
Report an error back */ 502 hdr = hdr_orig; 503 npt->hdr = hdr; 504 if (hdr->nlmsg_flags & NLM_F_ACK) 505 nlmsg_ack(nlp, EOPNOTSUPP, hdr, npt); 506 return (0); 507 } 508 } 509 error = handler(hdr, npt); 510 NL_LOG(LOG_DEBUG2, "retcode: %d", error); 511 } 512 if ((hdr->nlmsg_flags & NLM_F_ACK) || (error != 0 && error != EINTR)) { 513 if (!npt->nw->suppress_ack) { 514 NL_LOG(LOG_DEBUG3, "ack"); 515 nlmsg_ack(nlp, error, hdr, npt); 516 } 517 } 518 519 return (0); 520 } 521 522 static void 523 npt_clear(struct nl_pstate *npt) 524 { 525 lb_clear(&npt->lb); 526 npt->error = 0; 527 npt->err_msg = NULL; 528 npt->err_off = 0; 529 npt->hdr = NULL; 530 npt->nw->suppress_ack = false; 531 } 532 533 /* 534 * Processes an incoming packet, which can contain multiple netlink messages 535 */ 536 static struct mbuf * 537 nl_process_mbuf(struct mbuf *m, struct nlpcb *nlp) 538 { 539 int offset, buffer_length; 540 struct nlmsghdr *hdr; 541 char *buffer; 542 int error; 543 544 NL_LOG(LOG_DEBUG3, "RX netlink mbuf %p on %p", m, nlp->nl_socket); 545 546 struct nl_writer nw = {}; 547 if (!nlmsg_get_unicast_writer(&nw, NLMSG_SMALL, nlp)) { 548 m_freem(m); 549 NL_LOG(LOG_DEBUG, "error allocating socket writer"); 550 return (NULL); 551 } 552 553 nlmsg_ignore_limit(&nw); 554 /* TODO: alloc this buf once for nlp */ 555 int data_length = m_length(m, NULL); 556 buffer_length = roundup2(data_length, 8) + SCRATCH_BUFFER_SIZE; 557 if (nlp->nl_linux) 558 buffer_length += roundup2(data_length, 8); 559 buffer = malloc(buffer_length, M_NETLINK, M_NOWAIT | M_ZERO); 560 if (buffer == NULL) { 561 m_freem(m); 562 nlmsg_flush(&nw); 563 NL_LOG(LOG_DEBUG, "Unable to allocate %d bytes of memory", 564 buffer_length); 565 return (NULL); 566 } 567 m_copydata(m, 0, data_length, buffer); 568 569 struct nl_pstate npt = { 570 .nlp = nlp, 571 .lb.base = &buffer[roundup2(data_length, 8)], 572 .lb.size = buffer_length - roundup2(data_length, 8), 573 .nw = &nw, 574 .strict = nlp->nl_flags & NLF_STRICT, 575 }; 576 577 for 
(offset = 0; offset + sizeof(struct nlmsghdr) <= data_length;) { 578 hdr = (struct nlmsghdr *)&buffer[offset]; 579 /* Save length prior to calling handler */ 580 int msglen = NLMSG_ALIGN(hdr->nlmsg_len); 581 NL_LOG(LOG_DEBUG3, "parsing offset %d/%d", offset, data_length); 582 npt_clear(&npt); 583 error = nl_receive_message(hdr, data_length - offset, nlp, &npt); 584 offset += msglen; 585 if (__predict_false(error != 0 || nlp->nl_tx_blocked)) 586 break; 587 } 588 NL_LOG(LOG_DEBUG3, "packet parsing done"); 589 free(buffer, M_NETLINK); 590 nlmsg_flush(&nw); 591 592 if (nlp->nl_tx_blocked) { 593 NLP_LOCK(nlp); 594 nlp->nl_tx_blocked = false; 595 NLP_UNLOCK(nlp); 596 m_adj(m, offset); 597 return (m); 598 } else { 599 m_freem(m); 600 return (NULL); 601 } 602 } 603