1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2021 Ng Peng Nam Sean 5 * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include "opt_netlink.h" 30 31 #include <sys/cdefs.h> 32 __FBSDID("$FreeBSD$"); 33 #include <sys/param.h> 34 #include <sys/ck.h> 35 #include <sys/lock.h> 36 #include <sys/malloc.h> 37 #include <sys/mbuf.h> 38 #include <sys/mutex.h> 39 #include <sys/socket.h> 40 #include <sys/socketvar.h> 41 #include <sys/syslog.h> 42 43 #include <netlink/netlink.h> 44 #include <netlink/netlink_ctl.h> 45 #include <netlink/netlink_linux.h> 46 #include <netlink/netlink_var.h> 47 48 #define DEBUG_MOD_NAME nl_io 49 #define DEBUG_MAX_LEVEL LOG_DEBUG3 50 #include <netlink/netlink_debug.h> 51 _DECLARE_DEBUG(LOG_DEBUG); 52 53 /* 54 * The logic below provide a p2p interface for receiving and 55 * sending netlink data between the kernel and userland. 56 */ 57 58 static const struct sockaddr_nl _nl_empty_src = { 59 .nl_len = sizeof(struct sockaddr_nl), 60 .nl_family = PF_NETLINK, 61 .nl_pid = 0 /* comes from the kernel */ 62 }; 63 static const struct sockaddr *nl_empty_src = (const struct sockaddr *)&_nl_empty_src; 64 65 static struct mbuf *nl_process_mbuf(struct mbuf *m, struct nlpcb *nlp); 66 67 68 static void 69 queue_push(struct nl_io_queue *q, struct mbuf *mq) 70 { 71 while (mq != NULL) { 72 struct mbuf *m = mq; 73 mq = mq->m_nextpkt; 74 m->m_nextpkt = NULL; 75 76 q->length += m_length(m, NULL); 77 STAILQ_INSERT_TAIL(&q->head, m, m_stailqpkt); 78 } 79 } 80 81 static void 82 queue_push_head(struct nl_io_queue *q, struct mbuf *m) 83 { 84 MPASS(m->m_nextpkt == NULL); 85 86 q->length += m_length(m, NULL); 87 STAILQ_INSERT_HEAD(&q->head, m, m_stailqpkt); 88 } 89 90 static struct mbuf * 91 queue_pop(struct nl_io_queue *q) 92 { 93 if (!STAILQ_EMPTY(&q->head)) { 94 struct mbuf *m = STAILQ_FIRST(&q->head); 95 STAILQ_REMOVE_HEAD(&q->head, m_stailqpkt); 96 m->m_nextpkt = NULL; 97 q->length -= m_length(m, NULL); 98 99 return (m); 100 } 101 return (NULL); 102 } 103 104 static struct mbuf * 105 queue_head(const struct nl_io_queue *q) 106 { 107 return (STAILQ_FIRST(&q->head)); 108 } 109 110 static inline bool 111 queue_empty(const struct nl_io_queue *q) 112 { 113 return (q->length == 0); 114 } 115 116 static void 117 queue_free(struct nl_io_queue *q) 118 { 119 while (!STAILQ_EMPTY(&q->head)) { 120 struct mbuf *m = STAILQ_FIRST(&q->head); 121 STAILQ_REMOVE_HEAD(&q->head, m_stailqpkt); 122 m->m_nextpkt = NULL; 123 m_freem(m); 124 } 125 q->length = 0; 126 } 127 128 129 static void 130 nl_schedule_taskqueue(struct nlpcb *nlp) 131 { 132 if (!nlp->nl_task_pending) { 133 nlp->nl_task_pending = true; 134 taskqueue_enqueue(nlp->nl_taskqueue, &nlp->nl_task); 135 NL_LOG(LOG_DEBUG3, "taskqueue scheduled"); 136 } else { 137 NL_LOG(LOG_DEBUG3, "taskqueue schedule skipped"); 138 } 139 } 140 141 int 142 nl_receive_async(struct mbuf *m, struct socket *so) 143 { 144 struct nlpcb *nlp = sotonlpcb(so); 145 int error = 0; 146 147 m->m_nextpkt = NULL; 148 149 NLP_LOCK(nlp); 150 151 if ((__predict_true(nlp->nl_active))) { 152 sbappend(&so->so_snd, m, 0); 153 NL_LOG(LOG_DEBUG3, "enqueue %u bytes", m_length(m, NULL)); 154 nl_schedule_taskqueue(nlp); 155 } else { 156 NL_LOG(LOG_DEBUG, "ignoring %u bytes on non-active socket", 157 m_length(m, NULL)); 158 m_free(m); 159 error = EINVAL; 160 } 161 162 NLP_UNLOCK(nlp); 163 164 return (error); 165 } 166 167 static bool 168 tx_check_locked(struct nlpcb *nlp) 169 { 170 if (queue_empty(&nlp->tx_queue)) 171 return (true); 172 173 /* 174 * Check if something can be moved from the internal TX queue 175 * to the socket queue. 176 */ 177 178 bool appended = false; 179 struct sockbuf *sb = &nlp->nl_socket->so_rcv; 180 SOCKBUF_LOCK(sb); 181 182 while (true) { 183 struct mbuf *m = queue_head(&nlp->tx_queue); 184 if (m && sbappendaddr_locked(sb, nl_empty_src, m, NULL) != 0) { 185 /* appended successfully */ 186 queue_pop(&nlp->tx_queue); 187 appended = true; 188 } else 189 break; 190 } 191 192 SOCKBUF_UNLOCK(sb); 193 194 if (appended) 195 sorwakeup(nlp->nl_socket); 196 197 return (queue_empty(&nlp->tx_queue)); 198 } 199 200 static bool 201 nl_process_received_one(struct nlpcb *nlp) 202 { 203 bool reschedule = false; 204 205 NLP_LOCK(nlp); 206 nlp->nl_task_pending = false; 207 208 if (!tx_check_locked(nlp)) { 209 /* TX overflow queue still not empty, ignore RX */ 210 NLP_UNLOCK(nlp); 211 return (false); 212 } 213 214 if (queue_empty(&nlp->rx_queue)) { 215 /* 216 * Grab all data we have from the socket TX queue 217 * and store it the internal queue, so it can be worked on 218 * w/o holding socket lock. 219 */ 220 struct sockbuf *sb = &nlp->nl_socket->so_snd; 221 222 SOCKBUF_LOCK(sb); 223 unsigned int avail = sbavail(sb); 224 if (avail > 0) { 225 NL_LOG(LOG_DEBUG3, "grabbed %u bytes", avail); 226 queue_push(&nlp->rx_queue, sbcut_locked(sb, avail)); 227 } 228 SOCKBUF_UNLOCK(sb); 229 } else { 230 /* Schedule another pass to read from the socket queue */ 231 reschedule = true; 232 } 233 234 int prev_hiwat = nlp->tx_queue.hiwat; 235 NLP_UNLOCK(nlp); 236 237 while (!queue_empty(&nlp->rx_queue)) { 238 struct mbuf *m = queue_pop(&nlp->rx_queue); 239 240 m = nl_process_mbuf(m, nlp); 241 if (m != NULL) { 242 queue_push_head(&nlp->rx_queue, m); 243 reschedule = false; 244 break; 245 } 246 } 247 if (nlp->tx_queue.hiwat > prev_hiwat) { 248 NLP_LOG(LOG_DEBUG, nlp, "TX override peaked to %d", nlp->tx_queue.hiwat); 249 250 } 251 252 return (reschedule); 253 } 254 255 static void 256 nl_process_received(struct nlpcb *nlp) 257 { 258 NL_LOG(LOG_DEBUG3, "taskqueue called"); 259 260 while (nl_process_received_one(nlp)) 261 ; 262 } 263 264 void 265 nl_init_io(struct nlpcb *nlp) 266 { 267 STAILQ_INIT(&nlp->rx_queue.head); 268 STAILQ_INIT(&nlp->tx_queue.head); 269 } 270 271 void 272 nl_free_io(struct nlpcb *nlp) 273 { 274 queue_free(&nlp->rx_queue); 275 queue_free(&nlp->tx_queue); 276 } 277 278 /* 279 * Called after some data have been read from the socket. 280 */ 281 void 282 nl_on_transmit(struct nlpcb *nlp) 283 { 284 NLP_LOCK(nlp); 285 286 struct socket *so = nlp->nl_socket; 287 if (__predict_false(nlp->nl_dropped_bytes > 0 && so != NULL)) { 288 unsigned long dropped_bytes = nlp->nl_dropped_bytes; 289 unsigned long dropped_messages = nlp->nl_dropped_messages; 290 nlp->nl_dropped_bytes = 0; 291 nlp->nl_dropped_messages = 0; 292 293 struct sockbuf *sb = &so->so_rcv; 294 NLP_LOG(LOG_DEBUG, nlp, 295 "socket RX overflowed, %lu messages (%lu bytes) dropped. " 296 "bytes: [%u/%u] mbufs: [%u/%u]", dropped_messages, dropped_bytes, 297 sb->sb_ccc, sb->sb_hiwat, sb->sb_mbcnt, sb->sb_mbmax); 298 /* TODO: send netlink message */ 299 } 300 301 nl_schedule_taskqueue(nlp); 302 NLP_UNLOCK(nlp); 303 } 304 305 void 306 nl_taskqueue_handler(void *_arg, int pending) 307 { 308 struct nlpcb *nlp = (struct nlpcb *)_arg; 309 310 CURVNET_SET(nlp->nl_socket->so_vnet); 311 nl_process_received(nlp); 312 CURVNET_RESTORE(); 313 } 314 315 static __noinline void 316 queue_push_tx(struct nlpcb *nlp, struct mbuf *m) 317 { 318 queue_push(&nlp->tx_queue, m); 319 nlp->nl_tx_blocked = true; 320 321 if (nlp->tx_queue.length > nlp->tx_queue.hiwat) 322 nlp->tx_queue.hiwat = nlp->tx_queue.length; 323 } 324 325 /* 326 * Tries to send @m to the socket @nlp. 327 * 328 * @m: mbuf(s) to send to. Consumed in any case. 329 * @nlp: socket to send to 330 * @cnt: number of messages in @m 331 * @io_flags: combination of NL_IOF_* flags 332 * 333 * Returns true on success. 334 * If no queue overrunes happened, wakes up socket owner. 335 */ 336 bool 337 nl_send_one(struct mbuf *m, struct nlpcb *nlp, int num_messages, int io_flags) 338 { 339 bool untranslated = io_flags & NL_IOF_UNTRANSLATED; 340 bool ignore_limits = io_flags & NL_IOF_IGNORE_LIMIT; 341 bool result = true; 342 343 IF_DEBUG_LEVEL(LOG_DEBUG2) { 344 struct nlmsghdr *hdr = mtod(m, struct nlmsghdr *); 345 NLP_LOG(LOG_DEBUG2, nlp, 346 "TX mbuf len %u msgs %u msg type %d first hdrlen %u io_flags %X", 347 m_length(m, NULL), num_messages, hdr->nlmsg_type, hdr->nlmsg_len, 348 io_flags); 349 } 350 351 if (__predict_false(nlp->nl_linux && linux_netlink_p != NULL && untranslated)) { 352 m = linux_netlink_p->mbufs_to_linux(nlp->nl_proto, m, nlp); 353 if (m == NULL) 354 return (false); 355 } 356 357 NLP_LOCK(nlp); 358 359 if (__predict_false(nlp->nl_socket == NULL)) { 360 NLP_UNLOCK(nlp); 361 m_freem(m); 362 return (false); 363 } 364 365 if (!queue_empty(&nlp->tx_queue)) { 366 if (ignore_limits) { 367 queue_push_tx(nlp, m); 368 } else { 369 m_free(m); 370 result = false; 371 } 372 NLP_UNLOCK(nlp); 373 return (result); 374 } 375 376 struct socket *so = nlp->nl_socket; 377 if (sbappendaddr(&so->so_rcv, nl_empty_src, m, NULL) != 0) { 378 sorwakeup(so); 379 NLP_LOG(LOG_DEBUG3, nlp, "appended data & woken up"); 380 } else { 381 if (ignore_limits) { 382 queue_push_tx(nlp, m); 383 } else { 384 /* 385 * Store dropped data so it can be reported 386 * on the next read 387 */ 388 nlp->nl_dropped_bytes += m_length(m, NULL); 389 nlp->nl_dropped_messages += num_messages; 390 NLP_LOG(LOG_DEBUG2, nlp, "RX oveflow: %lu m (+%d), %lu b (+%d)", 391 (unsigned long)nlp->nl_dropped_messages, num_messages, 392 (unsigned long)nlp->nl_dropped_bytes, m_length(m, NULL)); 393 soroverflow(so); 394 m_freem(m); 395 result = false; 396 } 397 } 398 NLP_UNLOCK(nlp); 399 400 return (result); 401 } 402 403 static int 404 nl_receive_message(struct nlmsghdr *hdr, int remaining_length, 405 struct nlpcb *nlp, struct nl_pstate *npt) 406 { 407 nl_handler_f handler = nl_handlers[nlp->nl_proto].cb; 408 int error = 0; 409 410 NLP_LOG(LOG_DEBUG2, nlp, "msg len: %u type: %d: flags: 0x%X seq: %u pid: %u", 411 hdr->nlmsg_len, hdr->nlmsg_type, hdr->nlmsg_flags, hdr->nlmsg_seq, 412 hdr->nlmsg_pid); 413 414 if (__predict_false(hdr->nlmsg_len > remaining_length)) { 415 NLP_LOG(LOG_DEBUG, nlp, "message is not entirely present: want %d got %d", 416 hdr->nlmsg_len, remaining_length); 417 return (EINVAL); 418 } else if (__predict_false(hdr->nlmsg_len < sizeof(*hdr))) { 419 NL_LOG(LOG_DEBUG, "message too short: %d", hdr->nlmsg_len); 420 return (EINVAL); 421 } 422 /* Stamp each message with sender pid */ 423 hdr->nlmsg_pid = nlp->nl_port; 424 425 npt->hdr = hdr; 426 427 if (hdr->nlmsg_flags & NLM_F_REQUEST && hdr->nlmsg_type >= NLMSG_MIN_TYPE) { 428 NL_LOG(LOG_DEBUG2, "handling message with msg type: %d", 429 hdr->nlmsg_type); 430 431 if (nlp->nl_linux && linux_netlink_p != NULL) { 432 struct nlmsghdr *hdr_orig = hdr; 433 hdr = linux_netlink_p->msg_from_linux(nlp->nl_proto, hdr, npt); 434 if (hdr == NULL) { 435 /* Failed to translate to kernel format. Report an error back */ 436 hdr = hdr_orig; 437 npt->hdr = hdr; 438 if (hdr->nlmsg_flags & NLM_F_ACK) 439 nlmsg_ack(nlp, EOPNOTSUPP, hdr, npt); 440 return (0); 441 } 442 } 443 error = handler(hdr, npt); 444 NL_LOG(LOG_DEBUG2, "retcode: %d", error); 445 } 446 if ((hdr->nlmsg_flags & NLM_F_ACK) || (error != 0 && error != EINTR)) { 447 if (!npt->nw->suppress_ack) { 448 NL_LOG(LOG_DEBUG3, "ack"); 449 nlmsg_ack(nlp, error, hdr, npt); 450 } 451 } 452 453 return (0); 454 } 455 456 static void 457 npt_clear(struct nl_pstate *npt) 458 { 459 lb_clear(&npt->lb); 460 npt->error = 0; 461 npt->err_msg = NULL; 462 npt->err_off = 0; 463 npt->hdr = NULL; 464 npt->nw->suppress_ack = false; 465 } 466 467 /* 468 * Processes an incoming packet, which can contain multiple netlink messages 469 */ 470 static struct mbuf * 471 nl_process_mbuf(struct mbuf *m, struct nlpcb *nlp) 472 { 473 int offset, buffer_length; 474 struct nlmsghdr *hdr; 475 char *buffer; 476 int error; 477 478 NL_LOG(LOG_DEBUG3, "RX netlink mbuf %p on %p", m, nlp->nl_socket); 479 480 struct nl_writer nw = {}; 481 if (!nlmsg_get_unicast_writer(&nw, NLMSG_SMALL, nlp)) { 482 m_freem(m); 483 NL_LOG(LOG_DEBUG, "error allocating socket writer"); 484 return (NULL); 485 } 486 487 nlmsg_ignore_limit(&nw); 488 /* TODO: alloc this buf once for nlp */ 489 int data_length = m_length(m, NULL); 490 buffer_length = roundup2(data_length, 8) + SCRATCH_BUFFER_SIZE; 491 if (nlp->nl_linux) 492 buffer_length += roundup2(data_length, 8); 493 buffer = malloc(buffer_length, M_NETLINK, M_NOWAIT | M_ZERO); 494 if (buffer == NULL) { 495 m_freem(m); 496 nlmsg_flush(&nw); 497 NL_LOG(LOG_DEBUG, "Unable to allocate %d bytes of memory", 498 buffer_length); 499 return (NULL); 500 } 501 m_copydata(m, 0, data_length, buffer); 502 503 struct nl_pstate npt = { 504 .nlp = nlp, 505 .lb.base = &buffer[roundup2(data_length, 8)], 506 .lb.size = buffer_length - roundup2(data_length, 8), 507 .nw = &nw, 508 .strict = nlp->nl_flags & NLF_STRICT, 509 }; 510 511 for (offset = 0; offset + sizeof(struct nlmsghdr) <= data_length;) { 512 hdr = (struct nlmsghdr *)&buffer[offset]; 513 /* Save length prior to calling handler */ 514 int msglen = NLMSG_ALIGN(hdr->nlmsg_len); 515 NL_LOG(LOG_DEBUG3, "parsing offset %d/%d", offset, data_length); 516 npt_clear(&npt); 517 error = nl_receive_message(hdr, data_length - offset, nlp, &npt); 518 offset += msglen; 519 if (__predict_false(error != 0 || nlp->nl_tx_blocked)) 520 break; 521 } 522 NL_LOG(LOG_DEBUG3, "packet parsing done"); 523 free(buffer, M_NETLINK); 524 nlmsg_flush(&nw); 525 526 if (nlp->nl_tx_blocked) { 527 NLP_LOCK(nlp); 528 nlp->nl_tx_blocked = false; 529 NLP_UNLOCK(nlp); 530 m_adj(m, offset); 531 return (m); 532 } else { 533 m_freem(m); 534 return (NULL); 535 } 536 } 537