1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2021 Ng Peng Nam Sean 5 * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include <sys/cdefs.h> 30 __FBSDID("$FreeBSD$"); 31 #include <sys/param.h> 32 #include <sys/ck.h> 33 #include <sys/lock.h> 34 #include <sys/malloc.h> 35 #include <sys/mbuf.h> 36 #include <sys/mutex.h> 37 #include <sys/socket.h> 38 #include <sys/socketvar.h> 39 #include <sys/syslog.h> 40 41 #include <netlink/netlink.h> 42 #include <netlink/netlink_ctl.h> 43 #include <netlink/netlink_linux.h> 44 #include <netlink/netlink_var.h> 45 46 #define DEBUG_MOD_NAME nl_io 47 #define DEBUG_MAX_LEVEL LOG_DEBUG3 48 #include <netlink/netlink_debug.h> 49 _DECLARE_DEBUG(LOG_DEBUG); 50 51 /* 52 * The logic below provide a p2p interface for receiving and 53 * sending netlink data between the kernel and userland. 54 */ 55 56 static const struct sockaddr_nl _nl_empty_src = { 57 .nl_len = sizeof(struct sockaddr_nl), 58 .nl_family = PF_NETLINK, 59 .nl_pid = 0 /* comes from the kernel */ 60 }; 61 static const struct sockaddr *nl_empty_src = (const struct sockaddr *)&_nl_empty_src; 62 63 static struct mbuf *nl_process_mbuf(struct mbuf *m, struct nlpcb *nlp); 64 65 66 static void 67 queue_push(struct nl_io_queue *q, struct mbuf *mq) 68 { 69 while (mq != NULL) { 70 struct mbuf *m = mq; 71 mq = mq->m_nextpkt; 72 m->m_nextpkt = NULL; 73 74 q->length += m_length(m, NULL); 75 STAILQ_INSERT_TAIL(&q->head, m, m_stailqpkt); 76 } 77 } 78 79 static void 80 queue_push_head(struct nl_io_queue *q, struct mbuf *m) 81 { 82 MPASS(m->m_nextpkt == NULL); 83 84 q->length += m_length(m, NULL); 85 STAILQ_INSERT_HEAD(&q->head, m, m_stailqpkt); 86 } 87 88 static struct mbuf * 89 queue_pop(struct nl_io_queue *q) 90 { 91 if (!STAILQ_EMPTY(&q->head)) { 92 struct mbuf *m = STAILQ_FIRST(&q->head); 93 STAILQ_REMOVE_HEAD(&q->head, m_stailqpkt); 94 m->m_nextpkt = NULL; 95 q->length -= m_length(m, NULL); 96 97 return (m); 98 } 99 return (NULL); 100 } 101 102 static struct mbuf * 103 queue_head(const struct nl_io_queue *q) 104 { 105 return (STAILQ_FIRST(&q->head)); 106 } 107 108 static inline bool 109 queue_empty(const struct nl_io_queue *q) 110 { 111 return (q->length == 0); 112 } 113 114 static void 115 queue_free(struct nl_io_queue *q) 116 { 117 while (!STAILQ_EMPTY(&q->head)) { 118 struct mbuf *m = STAILQ_FIRST(&q->head); 119 STAILQ_REMOVE_HEAD(&q->head, m_stailqpkt); 120 m->m_nextpkt = NULL; 121 m_freem(m); 122 } 123 q->length = 0; 124 } 125 126 127 static void 128 nl_schedule_taskqueue(struct nlpcb *nlp) 129 { 130 if (!nlp->nl_task_pending) { 131 nlp->nl_task_pending = true; 132 taskqueue_enqueue(nlp->nl_taskqueue, &nlp->nl_task); 133 NL_LOG(LOG_DEBUG3, "taskqueue scheduled"); 134 } else { 135 NL_LOG(LOG_DEBUG3, "taskqueue schedule skipped"); 136 } 137 } 138 139 int 140 nl_receive_async(struct mbuf *m, struct socket *so) 141 { 142 struct nlpcb *nlp = sotonlpcb(so); 143 int error = 0; 144 145 m->m_nextpkt = NULL; 146 147 NLP_LOCK(nlp); 148 149 if ((__predict_true(nlp->nl_active))) { 150 sbappend(&so->so_snd, m, 0); 151 NL_LOG(LOG_DEBUG3, "enqueue %u bytes", m_length(m, NULL)); 152 nl_schedule_taskqueue(nlp); 153 } else { 154 NL_LOG(LOG_DEBUG, "ignoring %u bytes on non-active socket", 155 m_length(m, NULL)); 156 m_free(m); 157 error = EINVAL; 158 } 159 160 NLP_UNLOCK(nlp); 161 162 return (error); 163 } 164 165 static bool 166 tx_check_locked(struct nlpcb *nlp) 167 { 168 if (queue_empty(&nlp->tx_queue)) 169 return (true); 170 171 /* 172 * Check if something can be moved from the internal TX queue 173 * to the socket queue. 174 */ 175 176 bool appended = false; 177 struct sockbuf *sb = &nlp->nl_socket->so_rcv; 178 SOCKBUF_LOCK(sb); 179 180 while (true) { 181 struct mbuf *m = queue_head(&nlp->tx_queue); 182 if (m && sbappendaddr_locked(sb, nl_empty_src, m, NULL) != 0) { 183 /* appended successfully */ 184 queue_pop(&nlp->tx_queue); 185 appended = true; 186 } else 187 break; 188 } 189 190 SOCKBUF_UNLOCK(sb); 191 192 if (appended) 193 sorwakeup(nlp->nl_socket); 194 195 return (queue_empty(&nlp->tx_queue)); 196 } 197 198 static bool 199 nl_process_received_one(struct nlpcb *nlp) 200 { 201 bool reschedule = false; 202 203 NLP_LOCK(nlp); 204 nlp->nl_task_pending = false; 205 206 if (!tx_check_locked(nlp)) { 207 /* TX overflow queue still not empty, ignore RX */ 208 NLP_UNLOCK(nlp); 209 return (false); 210 } 211 212 if (queue_empty(&nlp->rx_queue)) { 213 /* 214 * Grab all data we have from the socket TX queue 215 * and store it the internal queue, so it can be worked on 216 * w/o holding socket lock. 217 */ 218 struct sockbuf *sb = &nlp->nl_socket->so_snd; 219 220 SOCKBUF_LOCK(sb); 221 unsigned int avail = sbavail(sb); 222 if (avail > 0) { 223 NL_LOG(LOG_DEBUG3, "grabbed %u bytes", avail); 224 queue_push(&nlp->rx_queue, sbcut_locked(sb, avail)); 225 } 226 SOCKBUF_UNLOCK(sb); 227 } else { 228 /* Schedule another pass to read from the socket queue */ 229 reschedule = true; 230 } 231 232 int prev_hiwat = nlp->tx_queue.hiwat; 233 NLP_UNLOCK(nlp); 234 235 while (!queue_empty(&nlp->rx_queue)) { 236 struct mbuf *m = queue_pop(&nlp->rx_queue); 237 238 m = nl_process_mbuf(m, nlp); 239 if (m != NULL) { 240 queue_push_head(&nlp->rx_queue, m); 241 reschedule = false; 242 break; 243 } 244 } 245 if (nlp->tx_queue.hiwat > prev_hiwat) { 246 NLP_LOG(LOG_DEBUG, nlp, "TX override peaked to %d", nlp->tx_queue.hiwat); 247 248 } 249 250 return (reschedule); 251 } 252 253 static void 254 nl_process_received(struct nlpcb *nlp) 255 { 256 NL_LOG(LOG_DEBUG3, "taskqueue called"); 257 258 while (nl_process_received_one(nlp)) 259 ; 260 } 261 262 void 263 nl_init_io(struct nlpcb *nlp) 264 { 265 STAILQ_INIT(&nlp->rx_queue.head); 266 STAILQ_INIT(&nlp->tx_queue.head); 267 } 268 269 void 270 nl_free_io(struct nlpcb *nlp) 271 { 272 queue_free(&nlp->rx_queue); 273 queue_free(&nlp->tx_queue); 274 } 275 276 /* 277 * Called after some data have been read from the socket. 278 */ 279 void 280 nl_on_transmit(struct nlpcb *nlp) 281 { 282 NLP_LOCK(nlp); 283 284 struct socket *so = nlp->nl_socket; 285 if (__predict_false(nlp->nl_dropped_bytes > 0 && so != NULL)) { 286 unsigned long dropped_bytes = nlp->nl_dropped_bytes; 287 unsigned long dropped_messages = nlp->nl_dropped_messages; 288 nlp->nl_dropped_bytes = 0; 289 nlp->nl_dropped_messages = 0; 290 291 struct sockbuf *sb = &so->so_rcv; 292 NLP_LOG(LOG_DEBUG, nlp, 293 "socket RX overflowed, %lu messages (%lu bytes) dropped. " 294 "bytes: [%u/%u] mbufs: [%u/%u]", dropped_messages, dropped_bytes, 295 sb->sb_ccc, sb->sb_hiwat, sb->sb_mbcnt, sb->sb_mbmax); 296 /* TODO: send netlink message */ 297 } 298 299 nl_schedule_taskqueue(nlp); 300 NLP_UNLOCK(nlp); 301 } 302 303 void 304 nl_taskqueue_handler(void *_arg, int pending) 305 { 306 struct nlpcb *nlp = (struct nlpcb *)_arg; 307 308 CURVNET_SET(nlp->nl_socket->so_vnet); 309 nl_process_received(nlp); 310 CURVNET_RESTORE(); 311 } 312 313 static __noinline void 314 queue_push_tx(struct nlpcb *nlp, struct mbuf *m) 315 { 316 queue_push(&nlp->tx_queue, m); 317 nlp->nl_tx_blocked = true; 318 319 if (nlp->tx_queue.length > nlp->tx_queue.hiwat) 320 nlp->tx_queue.hiwat = nlp->tx_queue.length; 321 } 322 323 /* 324 * Tries to send @m to the socket @nlp. 325 * 326 * @m: mbuf(s) to send to. Consumed in any case. 327 * @nlp: socket to send to 328 * @cnt: number of messages in @m 329 * @io_flags: combination of NL_IOF_* flags 330 * 331 * Returns true on success. 332 * If no queue overrunes happened, wakes up socket owner. 333 */ 334 bool 335 nl_send_one(struct mbuf *m, struct nlpcb *nlp, int num_messages, int io_flags) 336 { 337 bool untranslated = io_flags & NL_IOF_UNTRANSLATED; 338 bool ignore_limits = io_flags & NL_IOF_IGNORE_LIMIT; 339 bool result = true; 340 341 IF_DEBUG_LEVEL(LOG_DEBUG2) { 342 struct nlmsghdr *hdr = mtod(m, struct nlmsghdr *); 343 NLP_LOG(LOG_DEBUG2, nlp, 344 "TX mbuf len %u msgs %u msg type %d first hdrlen %u io_flags %X", 345 m_length(m, NULL), num_messages, hdr->nlmsg_type, hdr->nlmsg_len, 346 io_flags); 347 } 348 349 if (__predict_false(nlp->nl_linux && linux_netlink_p != NULL && untranslated)) { 350 m = linux_netlink_p->mbufs_to_linux(nlp->nl_proto, m, nlp); 351 if (m == NULL) 352 return (false); 353 } 354 355 NLP_LOCK(nlp); 356 357 if (__predict_false(nlp->nl_socket == NULL)) { 358 NLP_UNLOCK(nlp); 359 m_freem(m); 360 return (false); 361 } 362 363 if (!queue_empty(&nlp->tx_queue)) { 364 if (ignore_limits) { 365 queue_push_tx(nlp, m); 366 } else { 367 m_free(m); 368 result = false; 369 } 370 NLP_UNLOCK(nlp); 371 return (result); 372 } 373 374 struct socket *so = nlp->nl_socket; 375 if (sbappendaddr(&so->so_rcv, nl_empty_src, m, NULL) != 0) { 376 sorwakeup(so); 377 NLP_LOG(LOG_DEBUG3, nlp, "appended data & woken up"); 378 } else { 379 if (ignore_limits) { 380 queue_push_tx(nlp, m); 381 } else { 382 /* 383 * Store dropped data so it can be reported 384 * on the next read 385 */ 386 nlp->nl_dropped_bytes += m_length(m, NULL); 387 nlp->nl_dropped_messages += num_messages; 388 NLP_LOG(LOG_DEBUG2, nlp, "RX oveflow: %lu m (+%d), %lu b (+%d)", 389 (unsigned long)nlp->nl_dropped_messages, num_messages, 390 (unsigned long)nlp->nl_dropped_bytes, m_length(m, NULL)); 391 soroverflow(so); 392 m_freem(m); 393 result = false; 394 } 395 } 396 NLP_UNLOCK(nlp); 397 398 return (result); 399 } 400 401 static int 402 nl_receive_message(struct nlmsghdr *hdr, int remaining_length, 403 struct nlpcb *nlp, struct nl_pstate *npt) 404 { 405 nl_handler_f handler = nl_handlers[nlp->nl_proto].cb; 406 int error = 0; 407 408 NL_LOG(LOG_DEBUG2, "msg len: %d type: %d", hdr->nlmsg_len, 409 hdr->nlmsg_type); 410 411 if (__predict_false(hdr->nlmsg_len > remaining_length)) { 412 NLP_LOG(LOG_DEBUG, nlp, "message is not entirely present: want %d got %d", 413 hdr->nlmsg_len, remaining_length); 414 return (EINVAL); 415 } else if (__predict_false(hdr->nlmsg_len < sizeof(*hdr))) { 416 NL_LOG(LOG_DEBUG, "message too short: %d", hdr->nlmsg_len); 417 return (EINVAL); 418 } 419 /* Stamp each message with sender pid */ 420 hdr->nlmsg_pid = nlp->nl_port; 421 422 npt->hdr = hdr; 423 424 if (hdr->nlmsg_flags & NLM_F_REQUEST && hdr->nlmsg_type >= NLMSG_MIN_TYPE) { 425 NL_LOG(LOG_DEBUG2, "handling message with msg type: %d", 426 hdr->nlmsg_type); 427 428 if (nlp->nl_linux && linux_netlink_p != NULL) { 429 struct nlmsghdr *hdr_orig = hdr; 430 hdr = linux_netlink_p->msg_from_linux(nlp->nl_proto, hdr, npt); 431 if (hdr == NULL) { 432 npt->hdr = hdr_orig; 433 if (hdr->nlmsg_flags & NLM_F_ACK) 434 nlmsg_ack(nlp, EAGAIN, hdr, npt); 435 return (0); 436 } 437 } 438 error = handler(hdr, npt); 439 NL_LOG(LOG_DEBUG2, "retcode: %d", error); 440 } 441 if ((hdr->nlmsg_flags & NLM_F_ACK) || (error != 0 && error != EINTR)) { 442 NL_LOG(LOG_DEBUG3, "ack"); 443 nlmsg_ack(nlp, error, hdr, npt); 444 NL_LOG(LOG_DEBUG3, "done"); 445 } 446 447 return (0); 448 } 449 450 static void 451 npt_clear(struct nl_pstate *npt) 452 { 453 lb_clear(&npt->lb); 454 npt->error = 0; 455 npt->err_msg = NULL; 456 npt->err_off = 0; 457 npt->hdr = NULL; 458 } 459 460 /* 461 * Processes an incoming packet, which can contain multiple netlink messages 462 */ 463 static struct mbuf * 464 nl_process_mbuf(struct mbuf *m, struct nlpcb *nlp) 465 { 466 int offset, buffer_length; 467 struct nlmsghdr *hdr; 468 char *buffer; 469 int error; 470 471 NL_LOG(LOG_DEBUG3, "RX netlink mbuf %p on %p", m, nlp->nl_socket); 472 473 struct nl_writer nw = {}; 474 if (!nlmsg_get_unicast_writer(&nw, NLMSG_SMALL, nlp)) { 475 m_freem(m); 476 NL_LOG(LOG_DEBUG, "error allocating socket writer"); 477 return (NULL); 478 } 479 480 nlmsg_ignore_limit(&nw); 481 /* TODO: alloc this buf once for nlp */ 482 int data_length = m_length(m, NULL); 483 buffer_length = roundup2(data_length, 8) + SCRATCH_BUFFER_SIZE; 484 if (nlp->nl_linux) 485 buffer_length += roundup2(data_length, 8); 486 buffer = malloc(buffer_length, M_NETLINK, M_NOWAIT | M_ZERO); 487 if (buffer == NULL) { 488 m_freem(m); 489 nlmsg_flush(&nw); 490 NL_LOG(LOG_DEBUG, "Unable to allocate %d bytes of memory", 491 buffer_length); 492 return (NULL); 493 } 494 m_copydata(m, 0, data_length, buffer); 495 496 struct nl_pstate npt = { 497 .nlp = nlp, 498 .lb.base = &buffer[roundup2(data_length, 8)], 499 .lb.size = buffer_length - roundup2(data_length, 8), 500 .nw = &nw, 501 .strict = nlp->nl_flags & NLF_STRICT, 502 }; 503 504 for (offset = 0; offset + sizeof(struct nlmsghdr) <= data_length;) { 505 hdr = (struct nlmsghdr *)&buffer[offset]; 506 /* Save length prior to calling handler */ 507 int msglen = NLMSG_ALIGN(hdr->nlmsg_len); 508 NL_LOG(LOG_DEBUG3, "parsing offset %d/%d", offset, data_length); 509 npt_clear(&npt); 510 error = nl_receive_message(hdr, data_length - offset, nlp, &npt); 511 offset += msglen; 512 if (__predict_false(error != 0 || nlp->nl_tx_blocked)) 513 break; 514 } 515 NL_LOG(LOG_DEBUG3, "packet parsing done"); 516 free(buffer, M_NETLINK); 517 nlmsg_flush(&nw); 518 519 if (nlp->nl_tx_blocked) { 520 NLP_LOCK(nlp); 521 nlp->nl_tx_blocked = false; 522 NLP_UNLOCK(nlp); 523 m_adj(m, offset); 524 return (m); 525 } else { 526 m_freem(m); 527 return (NULL); 528 } 529 } 530