/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket2.c	8.1 (Berkeley) 6/10/93
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_kern_tls.h"
#include "opt_param.h"

#include <sys/param.h>
#include <sys/aio.h> /* for aio_swake proto */
#include <sys/kernel.h>
#include <sys/ktls.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/msan.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sx.h>
#include <sys/sysctl.h>

#include <netinet/in.h>

/*
 * Function pointer set by the AIO routines so that the socket buffer code
 * can call back into the AIO module if it is loaded.
 */
void	(*aio_swake)(struct socket *, struct sockbuf *);

/*
 * Primitive routines for operating on socket buffers
 */

#define	BUF_MAX_ADJ(_sz)	(((u_quad_t)(_sz)) * MCLBYTES / (MSIZE + MCLBYTES))

u_long	sb_max = SB_MAX;
u_long	sb_max_adj = BUF_MAX_ADJ(SB_MAX);

static	u_long sb_efficiency = 8;	/* parameter for sbreserve() */

#ifdef KERN_TLS
static void	sbcompress_ktls_rx(struct sockbuf *sb, struct mbuf *m,
    struct mbuf *n);
#endif
static struct mbuf	*sbcut_internal(struct sockbuf *sb, int len);
static void	sbflush_internal(struct sockbuf *sb);

/*
 * Our own version of m_clrprotoflags(), that can preserve M_NOTREADY.
 */
static void
sbm_clrprotoflags(struct mbuf *m, int flags)
{
	int mask;

	mask = ~M_PROTOFLAGS;
	if (flags & PRUS_NOTREADY)
		mask |= M_NOTREADY;
	while (m) {
		m->m_flags &= mask;
		m = m->m_next;
	}
}

/*
 * Compress M_NOTREADY mbufs after they have been readied by sbready().
 *
 * sbcompress() skips M_NOTREADY mbufs since the data is not available to
 * be copied at the time of sbcompress().  This function combines small
 * mbufs similar to sbcompress() once mbufs are ready.  'm0' is the first
 * mbuf sbready() marked ready, and 'end' is the first mbuf still not
 * ready.
 */
static void
sbready_compress(struct sockbuf *sb, struct mbuf *m0, struct mbuf *end)
{
	struct mbuf *m, *n;
	int ext_size;

	SOCKBUF_LOCK_ASSERT(sb);

	if ((sb->sb_flags & SB_NOCOALESCE) != 0)
		return;

	for (m = m0; m != end; m = m->m_next) {
		MPASS((m->m_flags & M_NOTREADY) == 0);
		/*
		 * NB: In sbcompress(), 'n' is the last mbuf in the
		 * socket buffer and 'm' is the new mbuf being copied
		 * into the trailing space of 'n'.  Here, the roles
		 * are reversed and 'n' is the next mbuf after 'm'
		 * that is being copied into the trailing space of
		 * 'm'.
		 */
		n = m->m_next;
#ifdef KERN_TLS
		/* Try to coalesce adjacent ktls mbuf hdr/trailers. */
		if ((n != NULL) && (n != end) && (m->m_flags & M_EOR) == 0 &&
		    (m->m_flags & M_EXTPG) &&
		    (n->m_flags & M_EXTPG) &&
		    !mbuf_has_tls_session(m) &&
		    !mbuf_has_tls_session(n)) {
			int hdr_len, trail_len;

			hdr_len = n->m_epg_hdrlen;
			trail_len = m->m_epg_trllen;
			if (trail_len != 0 && hdr_len != 0 &&
			    trail_len + hdr_len <= MBUF_PEXT_TRAIL_LEN) {
				/* copy n's header to m's trailer */
				memcpy(&m->m_epg_trail[trail_len],
				    n->m_epg_hdr, hdr_len);
				m->m_epg_trllen += hdr_len;
				m->m_len += hdr_len;
				n->m_epg_hdrlen = 0;
				n->m_len -= hdr_len;
			}
		}
#endif

		/* Compress small unmapped mbufs into plain mbufs. */
		if ((m->m_flags & M_EXTPG) && m->m_len <= MLEN &&
		    !mbuf_has_tls_session(m)) {
			ext_size = m->m_ext.ext_size;
			if (mb_unmapped_compress(m) == 0)
				sb->sb_mbcnt -= ext_size;
		}

		while ((n != NULL) && (n != end) && (m->m_flags & M_EOR) == 0 &&
		    M_WRITABLE(m) &&
		    (m->m_flags & M_EXTPG) == 0 &&
		    !mbuf_has_tls_session(n) &&
		    !mbuf_has_tls_session(m) &&
		    n->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
		    n->m_len <= M_TRAILINGSPACE(m) &&
		    m->m_type == n->m_type) {
			KASSERT(sb->sb_lastrecord != n,
			    ("%s: merging start of record (%p) into previous mbuf (%p)",
			    __func__, n, m));
			m_copydata(n, 0, n->m_len, mtodo(m, m->m_len));
			m->m_len += n->m_len;
			m->m_next = n->m_next;
			m->m_flags |= n->m_flags & M_EOR;
			if (sb->sb_mbtail == n)
				sb->sb_mbtail = m;

			sb->sb_mbcnt -= MSIZE;
			if (n->m_flags & M_EXT)
				sb->sb_mbcnt -= n->m_ext.ext_size;
			m_free(n);
			n = m->m_next;
		}
	}
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);
}

/*
 * Mark ready "count" units of I/O starting with "m".  Most mbufs
 * count as a single unit of I/O except for M_EXTPG mbufs which
 * are backed by multiple pages.
 */
int
sbready(struct sockbuf *sb, struct mbuf *m0, int count)
{
	struct mbuf *m;
	u_int blocker;

	SOCKBUF_LOCK_ASSERT(sb);
	KASSERT(sb->sb_fnrdy != NULL, ("%s: sb %p NULL fnrdy", __func__, sb));
	KASSERT(count > 0, ("%s: invalid count %d", __func__, count));

	m = m0;
	blocker = (sb->sb_fnrdy == m) ? M_BLOCKED : 0;

	while (count > 0) {
		KASSERT(m->m_flags & M_NOTREADY,
		    ("%s: m %p !M_NOTREADY", __func__, m));
		if ((m->m_flags & M_EXTPG) != 0 && m->m_epg_npgs != 0) {
			if (count < m->m_epg_nrdy) {
				m->m_epg_nrdy -= count;
				count = 0;
				break;
			}
			count -= m->m_epg_nrdy;
			m->m_epg_nrdy = 0;
		} else
			count--;

		m->m_flags &= ~(M_NOTREADY | blocker);
		if (blocker)
			sb->sb_acc += m->m_len;
		m = m->m_next;
	}

	/*
	 * If the first mbuf is still not fully ready because only
	 * some of its backing pages were readied, no further progress
	 * can be made.
	 */
	if (m0 == m) {
		MPASS(m->m_flags & M_NOTREADY);
		return (EINPROGRESS);
	}

	if (!blocker) {
		sbready_compress(sb, m0, m);
		return (EINPROGRESS);
	}

	/* This one was blocking all the queue. */
	for (; m && (m->m_flags & M_NOTREADY) == 0; m = m->m_next) {
		KASSERT(m->m_flags & M_BLOCKED,
		    ("%s: m %p !M_BLOCKED", __func__, m));
		m->m_flags &= ~M_BLOCKED;
		sb->sb_acc += m->m_len;
	}

	sb->sb_fnrdy = m;
	sbready_compress(sb, m0, m);

	return (0);
}

/*
 * Adjust sockbuf state reflecting allocation of m.
 */
void
sballoc(struct sockbuf *sb, struct mbuf *m)
{

	SOCKBUF_LOCK_ASSERT(sb);

	sb->sb_ccc += m->m_len;

	if (sb->sb_fnrdy == NULL) {
		if (m->m_flags & M_NOTREADY)
			sb->sb_fnrdy = m;
		else
			sb->sb_acc += m->m_len;
	} else
		m->m_flags |= M_BLOCKED;

	if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA)
		sb->sb_ctl += m->m_len;

	sb->sb_mbcnt += MSIZE;

	if (m->m_flags & M_EXT)
		sb->sb_mbcnt += m->m_ext.ext_size;
}

/*
 * Adjust sockbuf state reflecting freeing of m.
 */
void
sbfree(struct sockbuf *sb, struct mbuf *m)
{

#if 0	/* XXX: not yet: soclose() call path comes here w/o lock. */
	SOCKBUF_LOCK_ASSERT(sb);
#endif

	sb->sb_ccc -= m->m_len;

	if (!(m->m_flags & M_NOTAVAIL))
		sb->sb_acc -= m->m_len;

	if (m == sb->sb_fnrdy) {
		struct mbuf *n;

		KASSERT(m->m_flags & M_NOTREADY,
		    ("%s: m %p !M_NOTREADY", __func__, m));

		n = m->m_next;
		while (n != NULL && !(n->m_flags & M_NOTREADY)) {
			n->m_flags &= ~M_BLOCKED;
			sb->sb_acc += n->m_len;
			n = n->m_next;
		}
		sb->sb_fnrdy = n;
	}

	if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA)
		sb->sb_ctl -= m->m_len;

	sb->sb_mbcnt -= MSIZE;
	if (m->m_flags & M_EXT)
		sb->sb_mbcnt -= m->m_ext.ext_size;

	if (sb->sb_sndptr == m) {
		sb->sb_sndptr = NULL;
		sb->sb_sndptroff = 0;
	}
	if (sb->sb_sndptroff != 0)
		sb->sb_sndptroff -= m->m_len;
}

#ifdef KERN_TLS
/*
 * Similar to sballoc/sbfree but does not adjust state associated with
 * the sb_mb chain such as sb_fnrdy or sb_sndptr*.  Also assumes mbufs
 * are not ready.
 */
void
sballoc_ktls_rx(struct sockbuf *sb, struct mbuf *m)
{

	SOCKBUF_LOCK_ASSERT(sb);

	sb->sb_ccc += m->m_len;
	sb->sb_tlscc += m->m_len;

	sb->sb_mbcnt += MSIZE;

	if (m->m_flags & M_EXT)
		sb->sb_mbcnt += m->m_ext.ext_size;
}

void
sbfree_ktls_rx(struct sockbuf *sb, struct mbuf *m)
{

#if 0	/* XXX: not yet: soclose() call path comes here w/o lock. */
	SOCKBUF_LOCK_ASSERT(sb);
#endif

	sb->sb_ccc -= m->m_len;
	sb->sb_tlscc -= m->m_len;

	sb->sb_mbcnt -= MSIZE;

	if (m->m_flags & M_EXT)
		sb->sb_mbcnt -= m->m_ext.ext_size;
}
#endif

/*
 * Socantsendmore indicates that no more data will be sent on the socket; it
 * would normally be applied to a socket when the user informs the system
 * that no more data is to be sent, by the protocol code (in case
 * PRU_SHUTDOWN).  Socantrcvmore indicates that no more data will be
 * received, and will normally be applied to the socket by a protocol when it
 * detects that the peer will send no more data.  Data queued for reading in
 * the socket may yet be read.
 */
void
socantsendmore_locked(struct socket *so)
{

	SOCK_SENDBUF_LOCK_ASSERT(so);

	so->so_snd.sb_state |= SBS_CANTSENDMORE;
	sowwakeup_locked(so);
	SOCK_SENDBUF_UNLOCK_ASSERT(so);
}

void
socantsendmore(struct socket *so)
{

	SOCK_SENDBUF_LOCK(so);
	socantsendmore_locked(so);
	SOCK_SENDBUF_UNLOCK_ASSERT(so);
}

void
socantrcvmore_locked(struct socket *so)
{

	SOCK_RECVBUF_LOCK_ASSERT(so);

	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
#ifdef KERN_TLS
	if (so->so_rcv.sb_flags & SB_TLS_RX)
		ktls_check_rx(&so->so_rcv);
#endif
	sorwakeup_locked(so);
	SOCK_RECVBUF_UNLOCK_ASSERT(so);
}

void
socantrcvmore(struct socket *so)
{

	SOCK_RECVBUF_LOCK(so);
	socantrcvmore_locked(so);
	SOCK_RECVBUF_UNLOCK_ASSERT(so);
}

void
soroverflow_locked(struct socket *so)
{

	SOCK_RECVBUF_LOCK_ASSERT(so);

	if (so->so_options & SO_RERROR) {
		so->so_rerror = ENOBUFS;
		sorwakeup_locked(so);
	} else
		SOCK_RECVBUF_UNLOCK(so);

	SOCK_RECVBUF_UNLOCK_ASSERT(so);
}

void
soroverflow(struct socket *so)
{

	SOCK_RECVBUF_LOCK(so);
	soroverflow_locked(so);
	SOCK_RECVBUF_UNLOCK_ASSERT(so);
}

/*
 * Wait for data to arrive at/drain from a socket buffer.
 */
int
sbwait(struct socket *so, sb_which which)
{
	struct sockbuf *sb;

	SOCK_BUF_LOCK_ASSERT(so, which);

	sb = sobuf(so, which);
	sb->sb_flags |= SB_WAIT;
	return (msleep_sbt(&sb->sb_acc, soeventmtx(so, which),
	    (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait",
	    sb->sb_timeo, 0, 0));
}

/*
 * Wakeup processes waiting on a socket buffer.  Do asynchronous notification
 * via SIGIO if the socket has the SS_ASYNC flag set.
 *
 * Called with the socket buffer lock held; will release the lock by the end
 * of the function.  This allows the caller to acquire the socket buffer lock
 * while testing for the need for various sorts of wakeup and hold it through
 * to the point where it's no longer required.  We currently hold the lock
 * through calls out to other subsystems (with the exception of kqueue), and
 * then release it to avoid lock order issues.  It's not clear that's
 * correct.
 */
static __always_inline void
sowakeup(struct socket *so, const sb_which which)
{
	struct sockbuf *sb;
	int ret;

	SOCK_BUF_LOCK_ASSERT(so, which);

	sb = sobuf(so, which);
	selwakeuppri(sb->sb_sel, PSOCK);
	if (!SEL_WAITING(sb->sb_sel))
		sb->sb_flags &= ~SB_SEL;
	if (sb->sb_flags & SB_WAIT) {
		sb->sb_flags &= ~SB_WAIT;
		wakeup(&sb->sb_acc);
	}
	KNOTE_LOCKED(&sb->sb_sel->si_note, 0);
	if (sb->sb_upcall != NULL) {
		ret = sb->sb_upcall(so, sb->sb_upcallarg, M_NOWAIT);
		if (ret == SU_ISCONNECTED) {
			KASSERT(sb == &so->so_rcv,
			    ("SO_SND upcall returned SU_ISCONNECTED"));
			soupcall_clear(so, SO_RCV);
		}
	} else
		ret = SU_OK;
	if (sb->sb_flags & SB_AIO)
		sowakeup_aio(so, which);
	SOCK_BUF_UNLOCK(so, which);
	if (ret == SU_ISCONNECTED)
		soisconnected(so);
	if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL)
		pgsigio(&so->so_sigio, SIGIO, 0);
	SOCK_BUF_UNLOCK_ASSERT(so, which);
}

/*
 * Do we need to notify the other side when I/O is possible?
 */
static __always_inline bool
sb_notify(const struct sockbuf *sb)
{
	return ((sb->sb_flags & (SB_WAIT | SB_SEL | SB_ASYNC |
	    SB_UPCALL | SB_AIO | SB_KNOTE)) != 0);
}

void
sorwakeup_locked(struct socket *so)
{
	SOCK_RECVBUF_LOCK_ASSERT(so);
	if (sb_notify(&so->so_rcv))
		sowakeup(so, SO_RCV);
	else
		SOCK_RECVBUF_UNLOCK(so);
}

void
sowwakeup_locked(struct socket *so)
{
	SOCK_SENDBUF_LOCK_ASSERT(so);
	if (sb_notify(&so->so_snd))
		sowakeup(so, SO_SND);
	else
		SOCK_SENDBUF_UNLOCK(so);
}

/*
 * Socket buffer (struct sockbuf) utility routines.
 *
 * Each socket contains two socket buffers: one for sending data and one for
 * receiving data.  Each buffer contains a queue of mbufs, information about
 * the number of mbufs and amount of data in the queue, and other fields
 * allowing select() statements and notification on data availability to be
 * implemented.
 *
 * Data stored in a socket buffer is maintained as a list of records.  Each
 * record is a list of mbufs chained together with the m_next field.  Records
 * are chained together with the m_nextpkt field.  The upper level routine
 * soreceive() expects the following conventions to be observed when placing
 * information in the receive buffer:
 *
 * 1. If the protocol requires each message be preceded by the sender's name,
 *    then a record containing that name must be present before any
 *    associated data (mbuf's must be of type MT_SONAME).
 * 2. If the protocol supports the exchange of ``access rights'' (really just
 *    additional data associated with the message), and there are ``rights''
 *    to be received, then a record containing this data should be present
 *    (mbuf's must be of type MT_RIGHTS).
 * 3. If a name or rights record exists, then it must be followed by a data
 *    record, perhaps of zero length.
 *
 * Before using a new socket structure it is first necessary to reserve
 * buffer space to the socket, by calling sbreserve().  This should commit
 * some of the available buffer space in the system buffer pool for the
 * socket (currently, it does nothing but enforce limits).  The space should
 * be released by calling sbrelease() when the socket is destroyed.
 */
int
soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
{
	struct thread *td = curthread;

	SOCK_SENDBUF_LOCK(so);
	SOCK_RECVBUF_LOCK(so);
	if (sbreserve_locked(so, SO_SND, sndcc, td) == 0)
		goto bad;
	if (sbreserve_locked(so, SO_RCV, rcvcc, td) == 0)
		goto bad2;
	if (so->so_rcv.sb_lowat == 0)
		so->so_rcv.sb_lowat = 1;
	if (so->so_snd.sb_lowat == 0)
		so->so_snd.sb_lowat = MCLBYTES;
	if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
		so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
	SOCK_RECVBUF_UNLOCK(so);
	SOCK_SENDBUF_UNLOCK(so);
	return (0);
bad2:
	sbrelease_locked(so, SO_SND);
bad:
	SOCK_RECVBUF_UNLOCK(so);
	SOCK_SENDBUF_UNLOCK(so);
	return (ENOBUFS);
}

static int
sysctl_handle_sb_max(SYSCTL_HANDLER_ARGS)
{
	int error = 0;
	u_long tmp_sb_max = sb_max;

	error = sysctl_handle_long(oidp, &tmp_sb_max, arg2, req);
	if (error || !req->newptr)
		return (error);
	if (tmp_sb_max < MSIZE + MCLBYTES)
		return (EINVAL);
	sb_max = tmp_sb_max;
	sb_max_adj = BUF_MAX_ADJ(sb_max);
	return (0);
}

/*
 * Allot mbufs to a sockbuf.  Attempt to scale mbmax so that mbcnt doesn't
 * become limiting if buffering efficiency is near the normal case.
 */
bool
sbreserve_locked_limit(struct socket *so, sb_which which, u_long cc,
    u_long buf_max, struct thread *td)
{
	struct sockbuf *sb = sobuf(so, which);
	rlim_t sbsize_limit;

	SOCK_BUF_LOCK_ASSERT(so, which);

	/*
	 * When a thread is passed, we take into account the thread's socket
	 * buffer size limit.  The caller will generally pass curthread, but
	 * in the TCP input path, NULL will be passed to indicate that no
	 * appropriate thread resource limits are available.  In that case,
	 * we don't apply a process limit.
	 */
	if (cc > BUF_MAX_ADJ(buf_max))
		return (false);
	if (td != NULL) {
		sbsize_limit = lim_cur(td, RLIMIT_SBSIZE);
	} else
		sbsize_limit = RLIM_INFINITY;
	if (!chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, cc,
	    sbsize_limit))
		return (false);
	sb->sb_mbmax = min(cc * sb_efficiency, buf_max);
	if (sb->sb_lowat > sb->sb_hiwat)
		sb->sb_lowat = sb->sb_hiwat;
	return (true);
}

bool
sbreserve_locked(struct socket *so, sb_which which, u_long cc,
    struct thread *td)
{
	return (sbreserve_locked_limit(so, which, cc, sb_max, td));
}

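/*
 * Illustrative sketch (not part of this file): the block comment above
 * describes the convention that buffer space must be reserved with
 * soreserve() before a socket is used.  A hypothetical protocol attach
 * routine might apply it roughly as below; "my_attach", "my_sendspace" and
 * "my_recvspace" are placeholders for protocol-specific names and defaults.
 */
#if 0
static u_long my_sendspace = 32768;	/* hypothetical defaults */
static u_long my_recvspace = 65536;

static int
my_attach(struct socket *so, int proto, struct thread *td)
{
	int error;

	/* Commit send and receive buffer space before any I/O happens. */
	error = soreserve(so, my_sendspace, my_recvspace);
	if (error != 0)
		return (error);
	/* ... allocate and link the protocol control block here ... */
	return (0);
}
#endif
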
int
sbsetopt(struct socket *so, struct sockopt *sopt)
{
	struct sockbuf *sb;
	sb_which wh;
	short *flags;
	u_int cc, *hiwat, *lowat;
	int error, optval;

	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
	if (error != 0)
		return (error);

	/*
	 * Values < 1 make no sense for any of these options,
	 * so disallow them.
	 */
	if (optval < 1)
		return (EINVAL);
	cc = optval;

	sb = NULL;
	SOCK_LOCK(so);
	if (SOLISTENING(so)) {
		switch (sopt->sopt_name) {
		case SO_SNDLOWAT:
		case SO_SNDBUF:
			lowat = &so->sol_sbsnd_lowat;
			hiwat = &so->sol_sbsnd_hiwat;
			flags = &so->sol_sbsnd_flags;
			break;
		case SO_RCVLOWAT:
		case SO_RCVBUF:
			lowat = &so->sol_sbrcv_lowat;
			hiwat = &so->sol_sbrcv_hiwat;
			flags = &so->sol_sbrcv_flags;
			break;
		}
	} else {
		switch (sopt->sopt_name) {
		case SO_SNDLOWAT:
		case SO_SNDBUF:
			sb = &so->so_snd;
			wh = SO_SND;
			break;
		case SO_RCVLOWAT:
		case SO_RCVBUF:
			sb = &so->so_rcv;
			wh = SO_RCV;
			break;
		}
		flags = &sb->sb_flags;
		hiwat = &sb->sb_hiwat;
		lowat = &sb->sb_lowat;
		SOCK_BUF_LOCK(so, wh);
	}

	error = 0;
	switch (sopt->sopt_name) {
	case SO_SNDBUF:
	case SO_RCVBUF:
		if (SOLISTENING(so)) {
			if (cc > sb_max_adj) {
				error = ENOBUFS;
				break;
			}
			*hiwat = cc;
			if (*lowat > *hiwat)
				*lowat = *hiwat;
		} else {
			if (!sbreserve_locked(so, wh, cc, curthread))
				error = ENOBUFS;
		}
		if (error == 0)
			*flags &= ~SB_AUTOSIZE;
		break;
	case SO_SNDLOWAT:
	case SO_RCVLOWAT:
		/*
		 * Make sure the low-water is never greater than the
		 * high-water.
		 */
		*lowat = (cc > *hiwat) ? *hiwat : cc;
		break;
	}

	if (!SOLISTENING(so))
		SOCK_BUF_UNLOCK(so, wh);
	SOCK_UNLOCK(so);
	return (error);
}

/*
 * Free mbufs held by a socket, and reserved mbuf space.
 */
static void
sbrelease_internal(struct socket *so, sb_which which)
{
	struct sockbuf *sb = sobuf(so, which);

	sbflush_internal(sb);
	(void)chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, 0,
	    RLIM_INFINITY);
	sb->sb_mbmax = 0;
}

void
sbrelease_locked(struct socket *so, sb_which which)
{

	SOCK_BUF_LOCK_ASSERT(so, which);

	sbrelease_internal(so, which);
}

void
sbrelease(struct socket *so, sb_which which)
{

	SOCK_BUF_LOCK(so, which);
	sbrelease_locked(so, which);
	SOCK_BUF_UNLOCK(so, which);
}

void
sbdestroy(struct socket *so, sb_which which)
{
#ifdef KERN_TLS
	struct sockbuf *sb = sobuf(so, which);

	if (sb->sb_tls_info != NULL)
		ktls_free(sb->sb_tls_info);
	sb->sb_tls_info = NULL;
#endif
	sbrelease_internal(so, which);
}

/*
 * Routines to add and remove data from an mbuf queue.
 *
 * The routines sbappend() or sbappendrecord() are normally called to append
 * new mbufs to a socket buffer, after checking that adequate space is
 * available, comparing the function sbspace() with the amount of data to be
 * added.  sbappendrecord() differs from sbappend() in that data supplied is
 * treated as the beginning of a new record.  To place a sender's address,
 * optional access rights, and data in a socket receive buffer,
 * sbappendaddr() should be used.  To place access rights and data in a
 * socket receive buffer, sbappendrights() should be used.  In either case,
 * the new data begins a new record.  Note that unlike sbappend() and
 * sbappendrecord(), these routines check for the caller that there will be
 * enough space to store the data.  Each fails if there is not enough space,
 * or if it cannot find mbufs to store additional information in.
 *
 * Reliable protocols may use the socket send buffer to hold data awaiting
 * acknowledgement.  Data is normally copied from a socket send buffer in a
 * protocol with m_copy for output to a peer, and then removing the data from
 * the socket buffer with sbdrop() or sbdroprecord() when the data is
 * acknowledged by the peer.
 */
#ifdef SOCKBUF_DEBUG
void
sblastrecordchk(struct sockbuf *sb, const char *file, int line)
{
	struct mbuf *m = sb->sb_mb;

	SOCKBUF_LOCK_ASSERT(sb);

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	if (m != sb->sb_lastrecord) {
		printf("%s: sb_mb %p sb_lastrecord %p last %p\n",
			__func__, sb->sb_mb, sb->sb_lastrecord, m);
		printf("packet chain:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
			printf("\t%p\n", m);
		panic("%s from %s:%u", __func__, file, line);
	}
}

void
sblastmbufchk(struct sockbuf *sb, const char *file, int line)
{
	struct mbuf *m = sb->sb_mb;
	struct mbuf *n;

	SOCKBUF_LOCK_ASSERT(sb);

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	while (m && m->m_next)
		m = m->m_next;

	if (m != sb->sb_mbtail) {
		printf("%s: sb_mb %p sb_mbtail %p last %p\n",
			__func__, sb->sb_mb, sb->sb_mbtail, m);
		printf("packet tree:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
			printf("\t");
			for (n = m; n != NULL; n = n->m_next)
				printf("%p ", n);
			printf("\n");
		}
		panic("%s from %s:%u", __func__, file, line);
	}

#ifdef KERN_TLS
	m = sb->sb_mtls;
	while (m && m->m_next)
		m = m->m_next;

	if (m != sb->sb_mtlstail) {
		printf("%s: sb_mtls %p sb_mtlstail %p last %p\n",
			__func__, sb->sb_mtls, sb->sb_mtlstail, m);
		printf("TLS packet tree:\n");
		printf("\t");
		for (m = sb->sb_mtls; m != NULL; m = m->m_next) {
			printf("%p ", m);
		}
		printf("\n");
		panic("%s from %s:%u", __func__, file, line);
	}
#endif
}
#endif /* SOCKBUF_DEBUG */

#define SBLINKRECORD(sb, m0) do {					\
	SOCKBUF_LOCK_ASSERT(sb);					\
	if ((sb)->sb_lastrecord != NULL)				\
		(sb)->sb_lastrecord->m_nextpkt = (m0);			\
	else								\
		(sb)->sb_mb = (m0);					\
	(sb)->sb_lastrecord = (m0);					\
} while (/*CONSTCOND*/0)

/*
 * Append mbuf chain m to the last record in the socket buffer sb.  The
 * additional space associated with the mbuf chain is recorded in sb.
 * Empty mbufs are discarded and mbufs are compacted where possible.
 */
void
sbappend_locked(struct sockbuf *sb, struct mbuf *m, int flags)
{
	struct mbuf *n;

	SOCKBUF_LOCK_ASSERT(sb);

	if (m == NULL)
		return;
	kmsan_check_mbuf(m, "sbappend");
	sbm_clrprotoflags(m, flags);
	SBLASTRECORDCHK(sb);
	n = sb->sb_mb;
	if (n) {
		while (n->m_nextpkt)
			n = n->m_nextpkt;
		do {
			if (n->m_flags & M_EOR) {
				sbappendrecord_locked(sb, m); /* XXXXXX!!!! */
				return;
			}
		} while (n->m_next && (n = n->m_next));
	} else {
		/*
		 * XXX Would like to simply use sb_mbtail here, but
		 * XXX I need to verify that I won't miss an EOR that
		 * XXX way.
		 */
		if ((n = sb->sb_lastrecord) != NULL) {
			do {
				if (n->m_flags & M_EOR) {
					sbappendrecord_locked(sb, m); /* XXXXXX!!!! */
					return;
				}
			} while (n->m_next && (n = n->m_next));
		} else {
			/*
			 * If this is the first record in the socket buffer,
			 * it's also the last record.
			 */
			sb->sb_lastrecord = m;
		}
	}
	sbcompress(sb, m, n);
	SBLASTRECORDCHK(sb);
}

/*
 * Append mbuf chain m to the last record in the socket buffer sb.  The
 * additional space associated with the mbuf chain is recorded in sb.
 * Empty mbufs are discarded and mbufs are compacted where possible.
 */
void
sbappend(struct sockbuf *sb, struct mbuf *m, int flags)
{

	SOCKBUF_LOCK(sb);
	sbappend_locked(sb, m, flags);
	SOCKBUF_UNLOCK(sb);
}

#ifdef KERN_TLS
/*
 * Append an mbuf containing encrypted TLS data.  The data
 * is marked M_NOTREADY until it has been decrypted and
 * stored as a TLS record.
 */
static void
sbappend_ktls_rx(struct sockbuf *sb, struct mbuf *m)
{
	struct ifnet *ifp;
	struct mbuf *n;
	int flags;

	ifp = NULL;
	flags = M_NOTREADY;

	SBLASTMBUFCHK(sb);

	/* Mbuf chain must start with a packet header. */
	MPASS((m->m_flags & M_PKTHDR) != 0);

	/* Remove all packet headers and mbuf tags to get a pure data chain. */
	for (n = m; n != NULL; n = n->m_next) {
		if (n->m_flags & M_PKTHDR) {
			ifp = m->m_pkthdr.leaf_rcvif;
			if ((n->m_pkthdr.csum_flags & CSUM_TLS_MASK) ==
			    CSUM_TLS_DECRYPTED) {
				/* Mark all mbufs in this packet decrypted. */
				flags = M_NOTREADY | M_DECRYPTED;
			} else {
				flags = M_NOTREADY;
			}
			m_demote_pkthdr(n);
		}

		n->m_flags &= M_DEMOTEFLAGS;
		n->m_flags |= flags;

		MPASS((n->m_flags & M_NOTREADY) != 0);
	}

	sbcompress_ktls_rx(sb, m, sb->sb_mtlstail);
	ktls_check_rx(sb);

	/* Check for incoming packet route changes: */
	if (ifp != NULL && sb->sb_tls_info->rx_ifp != NULL &&
	    sb->sb_tls_info->rx_ifp != ifp)
		ktls_input_ifp_mismatch(sb, ifp);
}
#endif

/*
 * This version of sbappend() should only be used when the caller absolutely
 * knows that there will never be more than one record in the socket buffer,
 * that is, a stream protocol (such as TCP).
 */
void
sbappendstream_locked(struct sockbuf *sb, struct mbuf *m, int flags)
{
	SOCKBUF_LOCK_ASSERT(sb);

	KASSERT(m->m_nextpkt == NULL,("sbappendstream 0"));

	kmsan_check_mbuf(m, "sbappend");

#ifdef KERN_TLS
	/*
	 * Decrypted TLS records are appended as records via
	 * sbappendrecord().  TCP passes encrypted TLS records to this
	 * function which must be scheduled for decryption.
	 */
	if (sb->sb_flags & SB_TLS_RX) {
		sbappend_ktls_rx(sb, m);
		return;
	}
#endif

	KASSERT(sb->sb_mb == sb->sb_lastrecord,("sbappendstream 1"));

	SBLASTMBUFCHK(sb);

#ifdef KERN_TLS
	if (sb->sb_tls_info != NULL)
		ktls_seq(sb, m);
#endif

	/* Remove all packet headers and mbuf tags to get a pure data chain. */
	m_demote(m, 1, flags & PRUS_NOTREADY ? M_NOTREADY : 0);

	sbcompress(sb, m, sb->sb_mbtail);

	sb->sb_lastrecord = sb->sb_mb;
	SBLASTRECORDCHK(sb);
}

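/*
 * Illustrative sketch (not part of this file): a stream protocol's input
 * path typically appends incoming data under the receive buffer lock and
 * then wakes any reader.  The fragment below is a simplified pattern only;
 * real callers also deal with reassembly, urgent data and receive buffer
 * auto-sizing.
 */
#if 0
	SOCKBUF_LOCK(&so->so_rcv);
	if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
		m_freem(m);		/* reader has already shut down */
	else
		sbappendstream_locked(&so->so_rcv, m, 0);
	/* sorwakeup_locked() drops the receive buffer lock. */
	sorwakeup_locked(so);
#endif
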
/*
 * This version of sbappend() should only be used when the caller absolutely
 * knows that there will never be more than one record in the socket buffer,
 * that is, a stream protocol (such as TCP).
 */
void
sbappendstream(struct sockbuf *sb, struct mbuf *m, int flags)
{

	SOCKBUF_LOCK(sb);
	sbappendstream_locked(sb, m, flags);
	SOCKBUF_UNLOCK(sb);
}

#ifdef SOCKBUF_DEBUG
void
sbcheck(struct sockbuf *sb, const char *file, int line)
{
	struct mbuf *m, *n, *fnrdy;
	u_long acc, ccc, mbcnt;
#ifdef KERN_TLS
	u_long tlscc;
#endif

	SOCKBUF_LOCK_ASSERT(sb);

	acc = ccc = mbcnt = 0;
	fnrdy = NULL;

	for (m = sb->sb_mb; m; m = n) {
		n = m->m_nextpkt;
		for (; m; m = m->m_next) {
			if (m->m_len == 0) {
				printf("sb %p empty mbuf %p\n", sb, m);
				goto fail;
			}
			if ((m->m_flags & M_NOTREADY) && fnrdy == NULL) {
				if (m != sb->sb_fnrdy) {
					printf("sb %p: fnrdy %p != m %p\n",
					    sb, sb->sb_fnrdy, m);
					goto fail;
				}
				fnrdy = m;
			}
			if (fnrdy) {
				if (!(m->m_flags & M_NOTAVAIL)) {
					printf("sb %p: fnrdy %p, m %p is avail\n",
					    sb, sb->sb_fnrdy, m);
					goto fail;
				}
			} else
				acc += m->m_len;
			ccc += m->m_len;
			mbcnt += MSIZE;
			if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */
				mbcnt += m->m_ext.ext_size;
		}
	}
#ifdef KERN_TLS
	/*
	 * Account for mbufs "detached" by ktls_detach_record() while
	 * they are decrypted by ktls_decrypt().  tlsdcc gives a count
	 * of the detached bytes that are included in ccc.  The mbufs
	 * and clusters are not included in the socket buffer
	 * accounting.
	 */
	ccc += sb->sb_tlsdcc;

	tlscc = 0;
	for (m = sb->sb_mtls; m; m = m->m_next) {
		if (m->m_nextpkt != NULL) {
			printf("sb %p TLS mbuf %p with nextpkt\n", sb, m);
			goto fail;
		}
		if ((m->m_flags & M_NOTREADY) == 0) {
			printf("sb %p TLS mbuf %p ready\n", sb, m);
			goto fail;
		}
		tlscc += m->m_len;
		ccc += m->m_len;
		mbcnt += MSIZE;
		if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */
			mbcnt += m->m_ext.ext_size;
	}

	if (sb->sb_tlscc != tlscc) {
		printf("tlscc %ld/%u dcc %u\n", tlscc, sb->sb_tlscc,
		    sb->sb_tlsdcc);
		goto fail;
	}
#endif
	if (acc != sb->sb_acc || ccc != sb->sb_ccc || mbcnt != sb->sb_mbcnt) {
		printf("acc %ld/%u ccc %ld/%u mbcnt %ld/%u\n",
		    acc, sb->sb_acc, ccc, sb->sb_ccc, mbcnt, sb->sb_mbcnt);
#ifdef KERN_TLS
		printf("tlscc %ld/%u dcc %u\n", tlscc, sb->sb_tlscc,
		    sb->sb_tlsdcc);
#endif
		goto fail;
	}
	return;
fail:
	panic("%s from %s:%u", __func__, file, line);
}
#endif

/*
 * As above, except the mbuf chain begins a new record.
 */
void
sbappendrecord_locked(struct sockbuf *sb, struct mbuf *m0)
{
	struct mbuf *m;

	SOCKBUF_LOCK_ASSERT(sb);

	if (m0 == NULL)
		return;

	kmsan_check_mbuf(m0, "sbappend");
	m_clrprotoflags(m0);

	/*
	 * Put the first mbuf on the queue.  Note this permits zero length
	 * records.
	 */
	sballoc(sb, m0);
	SBLASTRECORDCHK(sb);
	SBLINKRECORD(sb, m0);
	sb->sb_mbtail = m0;
	m = m0->m_next;
	m0->m_next = 0;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	/* always call sbcompress() so it can do SBLASTMBUFCHK() */
	sbcompress(sb, m, m0);
}

/*
 * As above, except the mbuf chain begins a new record.
 */
void
sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
{

	SOCKBUF_LOCK(sb);
	sbappendrecord_locked(sb, m0);
	SOCKBUF_UNLOCK(sb);
}

/* Helper routine that appends data, control, and address to a sockbuf. */
static int
sbappendaddr_locked_internal(struct sockbuf *sb, const struct sockaddr *asa,
    struct mbuf *m0, struct mbuf *control, struct mbuf *ctrl_last)
{
	struct mbuf *m, *n, *nlast;

	if (m0 != NULL)
		kmsan_check_mbuf(m0, "sbappend");
	if (control != NULL)
		kmsan_check_mbuf(control, "sbappend");

#if MSIZE <= 256
	if (asa->sa_len > MLEN)
		return (0);
#endif
	m = m_get(M_NOWAIT, MT_SONAME);
	if (m == NULL)
		return (0);
	m->m_len = asa->sa_len;
	bcopy(asa, mtod(m, caddr_t), asa->sa_len);
	if (m0) {
		M_ASSERT_NO_SND_TAG(m0);
		m_clrprotoflags(m0);
		m_tag_delete_chain(m0, NULL);
		/*
		 * Clear some persistent info from pkthdr.
		 * We don't use m_demote(), because some netgraph consumers
		 * expect M_PKTHDR presence.
		 */
		m0->m_pkthdr.rcvif = NULL;
		m0->m_pkthdr.flowid = 0;
		m0->m_pkthdr.csum_flags = 0;
		m0->m_pkthdr.fibnum = 0;
		m0->m_pkthdr.rsstype = 0;
	}
	if (ctrl_last)
		ctrl_last->m_next = m0;	/* concatenate data to control */
	else
		control = m0;
	m->m_next = control;
	for (n = m; n->m_next != NULL; n = n->m_next)
		sballoc(sb, n);
	sballoc(sb, n);
	nlast = n;
	SBLINKRECORD(sb, m);

	sb->sb_mbtail = nlast;
	SBLASTMBUFCHK(sb);

	SBLASTRECORDCHK(sb);
	return (1);
}

/*
 * Append address and data, and optionally, control (ancillary) data to the
 * receive queue of a socket.  If present, m0 must include a packet header
 * with total length.  Returns 0 if no space in sockbuf or insufficient
 * mbufs.
 */
int
sbappendaddr_locked(struct sockbuf *sb, const struct sockaddr *asa,
    struct mbuf *m0, struct mbuf *control)
{
	struct mbuf *ctrl_last;
	int space = asa->sa_len;

	SOCKBUF_LOCK_ASSERT(sb);

	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddr_locked");
	if (m0)
		space += m0->m_pkthdr.len;
	space += m_length(control, &ctrl_last);

	if (space > sbspace(sb))
		return (0);
	return (sbappendaddr_locked_internal(sb, asa, m0, control, ctrl_last));
}

/*
 * Append address and data, and optionally, control (ancillary) data to the
 * receive queue of a socket.  If present, m0 must include a packet header
 * with total length.  Returns 0 if insufficient mbufs.  Does not validate space
 * on the receiving sockbuf.
 */
int
sbappendaddr_nospacecheck_locked(struct sockbuf *sb, const struct sockaddr *asa,
    struct mbuf *m0, struct mbuf *control)
{
	struct mbuf *ctrl_last;

	SOCKBUF_LOCK_ASSERT(sb);

	ctrl_last = (control == NULL) ? NULL : m_last(control);
	return (sbappendaddr_locked_internal(sb, asa, m0, control, ctrl_last));
}

/*
 * Append address and data, and optionally, control (ancillary) data to the
 * receive queue of a socket.  If present, m0 must include a packet header
 * with total length.  Returns 0 if no space in sockbuf or insufficient
 * mbufs.
 */
int
sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa,
    struct mbuf *m0, struct mbuf *control)
{
	int retval;

	SOCKBUF_LOCK(sb);
	retval = sbappendaddr_locked(sb, asa, m0, control);
	SOCKBUF_UNLOCK(sb);
	return (retval);
}

void
sbappendcontrol_locked(struct sockbuf *sb, struct mbuf *m0,
    struct mbuf *control, int flags)
{
	struct mbuf *m, *mlast;

	kmsan_check_mbuf(m0, "sbappend");
	kmsan_check_mbuf(control, "sbappend");

	sbm_clrprotoflags(m0, flags);
	m_last(control)->m_next = m0;

	SBLASTRECORDCHK(sb);

	for (m = control; m->m_next; m = m->m_next)
		sballoc(sb, m);
	sballoc(sb, m);
	mlast = m;
	SBLINKRECORD(sb, control);

	sb->sb_mbtail = mlast;
	SBLASTMBUFCHK(sb);

	SBLASTRECORDCHK(sb);
}

void
sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control,
    int flags)
{

	SOCKBUF_LOCK(sb);
	sbappendcontrol_locked(sb, m0, control, flags);
	SOCKBUF_UNLOCK(sb);
}

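/*
 * Illustrative sketch (not part of this file): a datagram protocol's input
 * path usually queues each packet as a separate record together with the
 * sender's address, reporting overflow when the record does not fit.
 * "fromsa" and "opts" stand in for protocol-specific values.
 */
#if 0
	SOCKBUF_LOCK(&so->so_rcv);
	if (sbappendaddr_locked(&so->so_rcv, (struct sockaddr *)&fromsa, m,
	    opts) == 0) {
		/* No room (or no mbufs): record the overflow, free the data. */
		soroverflow_locked(so);		/* drops the lock */
		m_freem(m);
		m_freem(opts);
	} else
		sorwakeup_locked(so);		/* drops the lock */
#endif
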
/*
 * Append the data in mbuf chain (m) into the socket buffer sb following mbuf
 * (n).  If (n) is NULL, the buffer is presumed empty.
 *
 * When the data is compressed, mbufs in the chain may be handled in one of
 * three ways:
 *
 * (1) The mbuf may simply be dropped, if it contributes nothing (no data, no
 *     record boundary, and no change in data type).
 *
 * (2) The mbuf may be coalesced -- i.e., data in the mbuf may be copied into
 *     an mbuf already in the socket buffer.  This can occur if an
 *     appropriate mbuf exists, there is room, both mbufs are not marked as
 *     not ready, and no merging of data types will occur.
 *
 * (3) The mbuf may be appended to the end of the existing mbuf chain.
 *
 * If any of the new mbufs is marked as M_EOR, mark the last mbuf appended as
 * end-of-record.
 */
void
sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
{
	int eor = 0;
	struct mbuf *o;

	SOCKBUF_LOCK_ASSERT(sb);

	while (m) {
		eor |= m->m_flags & M_EOR;
		if (m->m_len == 0 &&
		    (eor == 0 ||
		    (((o = m->m_next) || (o = n)) &&
		    o->m_type == m->m_type))) {
			if (sb->sb_lastrecord == m)
				sb->sb_lastrecord = m->m_next;
			m = m_free(m);
			continue;
		}
		if (n && (n->m_flags & M_EOR) == 0 &&
		    M_WRITABLE(n) &&
		    ((sb->sb_flags & SB_NOCOALESCE) == 0) &&
		    !(m->m_flags & M_NOTREADY) &&
		    !(n->m_flags & (M_NOTREADY | M_EXTPG)) &&
		    !mbuf_has_tls_session(m) &&
		    !mbuf_has_tls_session(n) &&
		    m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
		    m->m_len <= M_TRAILINGSPACE(n) &&
		    n->m_type == m->m_type) {
			m_copydata(m, 0, m->m_len, mtodo(n, n->m_len));
			n->m_len += m->m_len;
			sb->sb_ccc += m->m_len;
			if (sb->sb_fnrdy == NULL)
				sb->sb_acc += m->m_len;
			if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA)
				/* XXX: Probably don't need.*/
				sb->sb_ctl += m->m_len;
			m = m_free(m);
			continue;
		}
		if (m->m_len <= MLEN && (m->m_flags & M_EXTPG) &&
		    (m->m_flags & M_NOTREADY) == 0 &&
		    !mbuf_has_tls_session(m))
			(void)mb_unmapped_compress(m);
		if (n)
			n->m_next = m;
		else
			sb->sb_mb = m;
		sb->sb_mbtail = m;
		sballoc(sb, m);
		n = m;
		m->m_flags &= ~M_EOR;
		m = m->m_next;
		n->m_next = 0;
	}
	if (eor) {
		KASSERT(n != NULL, ("sbcompress: eor && n == NULL"));
		n->m_flags |= eor;
	}
	SBLASTMBUFCHK(sb);
}

#ifdef KERN_TLS
/*
 * A version of sbcompress() for encrypted TLS RX mbufs.  These mbufs
 * are appended to the 'sb_mtls' chain instead of 'sb_mb' and are also
 * a bit simpler (no EOR markers, always MT_DATA, etc.).
 */
static void
sbcompress_ktls_rx(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
{

	SOCKBUF_LOCK_ASSERT(sb);

	while (m) {
		KASSERT((m->m_flags & M_EOR) == 0,
		    ("TLS RX mbuf %p with EOR", m));
		KASSERT(m->m_type == MT_DATA,
		    ("TLS RX mbuf %p is not MT_DATA", m));
		KASSERT((m->m_flags & M_NOTREADY) != 0,
		    ("TLS RX mbuf %p ready", m));
		KASSERT((m->m_flags & M_EXTPG) == 0,
		    ("TLS RX mbuf %p unmapped", m));

		if (m->m_len == 0) {
			m = m_free(m);
			continue;
		}

		/*
		 * Even though both 'n' and 'm' are NOTREADY, it's ok
		 * to coalesce the data.
		 */
		if (n &&
		    M_WRITABLE(n) &&
		    ((sb->sb_flags & SB_NOCOALESCE) == 0) &&
		    !((m->m_flags ^ n->m_flags) & M_DECRYPTED) &&
		    !(n->m_flags & M_EXTPG) &&
		    m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
		    m->m_len <= M_TRAILINGSPACE(n)) {
			m_copydata(m, 0, m->m_len, mtodo(n, n->m_len));
			n->m_len += m->m_len;
			sb->sb_ccc += m->m_len;
			sb->sb_tlscc += m->m_len;
			m = m_free(m);
			continue;
		}
		if (n)
			n->m_next = m;
		else
			sb->sb_mtls = m;
		sb->sb_mtlstail = m;
		sballoc_ktls_rx(sb, m);
		n = m;
		m = m->m_next;
		n->m_next = NULL;
	}
	SBLASTMBUFCHK(sb);
}
#endif

/*
 * Free all mbufs in a sockbuf.  Check that all resources are reclaimed.
 */
static void
sbflush_internal(struct sockbuf *sb)
{

	while (sb->sb_mbcnt || sb->sb_tlsdcc) {
		/*
		 * Don't call sbcut(sb, 0) if the leading mbuf is non-empty:
		 * we would loop forever.  Panic instead.
		 */
		if (sb->sb_ccc == 0 && (sb->sb_mb == NULL || sb->sb_mb->m_len))
			break;
		m_freem(sbcut_internal(sb, (int)sb->sb_ccc));
	}
	KASSERT(sb->sb_ccc == 0 && sb->sb_mb == 0 && sb->sb_mbcnt == 0,
	    ("%s: ccc %u mb %p mbcnt %u", __func__,
	    sb->sb_ccc, (void *)sb->sb_mb, sb->sb_mbcnt));
}

void
sbflush_locked(struct sockbuf *sb)
{

	SOCKBUF_LOCK_ASSERT(sb);
	sbflush_internal(sb);
}

void
sbflush(struct sockbuf *sb)
{

	SOCKBUF_LOCK(sb);
	sbflush_locked(sb);
	SOCKBUF_UNLOCK(sb);
}

/*
 * Cut data from (the front of) a sockbuf.
 */
static struct mbuf *
sbcut_internal(struct sockbuf *sb, int len)
{
	struct mbuf *m, *next, *mfree;
	bool is_tls;

	KASSERT(len >= 0, ("%s: len is %d but it is supposed to be >= 0",
	    __func__, len));
	KASSERT(len <= sb->sb_ccc, ("%s: len: %d is > ccc: %u",
	    __func__, len, sb->sb_ccc));

	next = (m = sb->sb_mb) ? m->m_nextpkt : 0;
	is_tls = false;
	mfree = NULL;

	while (len > 0) {
		if (m == NULL) {
#ifdef KERN_TLS
			if (next == NULL && !is_tls) {
				if (sb->sb_tlsdcc != 0) {
					MPASS(len >= sb->sb_tlsdcc);
					len -= sb->sb_tlsdcc;
					sb->sb_ccc -= sb->sb_tlsdcc;
					sb->sb_tlsdcc = 0;
					if (len == 0)
						break;
				}
				next = sb->sb_mtls;
				is_tls = true;
			}
#endif
			KASSERT(next, ("%s: no next, len %d", __func__, len));
			m = next;
			next = m->m_nextpkt;
		}
		if (m->m_len > len) {
			KASSERT(!(m->m_flags & M_NOTAVAIL),
			    ("%s: m %p M_NOTAVAIL", __func__, m));
			m->m_len -= len;
			m->m_data += len;
			sb->sb_ccc -= len;
			sb->sb_acc -= len;
			if (sb->sb_sndptroff != 0)
				sb->sb_sndptroff -= len;
			if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA)
				sb->sb_ctl -= len;
			break;
		}
		len -= m->m_len;
#ifdef KERN_TLS
		if (is_tls)
			sbfree_ktls_rx(sb, m);
		else
#endif
			sbfree(sb, m);
		/*
		 * Do not put M_NOTREADY buffers to the free list, they
		 * are referenced from outside.
		 */
		if (m->m_flags & M_NOTREADY && !is_tls)
			m = m->m_next;
		else {
			struct mbuf *n;

			n = m->m_next;
			m->m_next = mfree;
			mfree = m;
			m = n;
		}
	}
	/*
	 * Free any zero-length mbufs from the buffer.
	 * For SOCK_DGRAM sockets such mbufs represent empty records.
	 * XXX: For SOCK_STREAM sockets such mbufs can appear in the buffer,
	 * when sosend_generic() needs to send only control data.
	 */
	while (m && m->m_len == 0) {
		struct mbuf *n;

		sbfree(sb, m);
		n = m->m_next;
		m->m_next = mfree;
		mfree = m;
		m = n;
	}
#ifdef KERN_TLS
	if (is_tls) {
		sb->sb_mb = NULL;
		sb->sb_mtls = m;
		if (m == NULL)
			sb->sb_mtlstail = NULL;
	} else
#endif
	if (m) {
		sb->sb_mb = m;
		m->m_nextpkt = next;
	} else
		sb->sb_mb = next;
	/*
	 * First part is an inline SB_EMPTY_FIXUP().  Second part makes sure
	 * sb_lastrecord is up-to-date if we dropped part of the last record.
	 */
	m = sb->sb_mb;
	if (m == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (m->m_nextpkt == NULL) {
		sb->sb_lastrecord = m;
	}

	return (mfree);
}

/*
 * Drop data from (the front of) a sockbuf.
 */
void
sbdrop_locked(struct sockbuf *sb, int len)
{

	SOCKBUF_LOCK_ASSERT(sb);
	m_freem(sbcut_internal(sb, len));
}

/*
 * Drop data from (the front of) a sockbuf,
 * and return it to caller.
 */
struct mbuf *
sbcut_locked(struct sockbuf *sb, int len)
{

	SOCKBUF_LOCK_ASSERT(sb);
	return (sbcut_internal(sb, len));
}

void
sbdrop(struct sockbuf *sb, int len)
{
	struct mbuf *mfree;

	SOCKBUF_LOCK(sb);
	mfree = sbcut_internal(sb, len);
	SOCKBUF_UNLOCK(sb);

	m_freem(mfree);
}

struct mbuf *
sbsndptr_noadv(struct sockbuf *sb, uint32_t off, uint32_t *moff)
{
	struct mbuf *m;

	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb is NULL", __func__));
	if (sb->sb_sndptr == NULL || sb->sb_sndptroff > off) {
		*moff = off;
		if (sb->sb_sndptr == NULL) {
			sb->sb_sndptr = sb->sb_mb;
			sb->sb_sndptroff = 0;
		}
		return (sb->sb_mb);
	} else {
		m = sb->sb_sndptr;
		off -= sb->sb_sndptroff;
	}
	*moff = off;
	return (m);
}

void
sbsndptr_adv(struct sockbuf *sb, struct mbuf *mb, uint32_t len)
{
	/*
	 * A small copy was done, advance forward the sb_sbsndptr to cover
	 * it.
	 */
	struct mbuf *m;

	if (mb != sb->sb_sndptr) {
		/* Did not copyout at the same mbuf */
		return;
	}
	m = mb;
	while (m && (len > 0)) {
		if (len >= m->m_len) {
			len -= m->m_len;
			if (m->m_next) {
				sb->sb_sndptroff += m->m_len;
				sb->sb_sndptr = m->m_next;
			}
			m = m->m_next;
		} else {
			len = 0;
		}
	}
}

/*
 * Return the first mbuf and the mbuf data offset for the provided
 * send offset without changing the "sb_sndptroff" field.
 */
struct mbuf *
sbsndmbuf(struct sockbuf *sb, u_int off, u_int *moff)
{
	struct mbuf *m;

	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb is NULL", __func__));

	/*
	 * If the "off" is below the stored offset, which happens on
	 * retransmits, just use "sb_mb":
	 */
	if (sb->sb_sndptr == NULL || sb->sb_sndptroff > off) {
		m = sb->sb_mb;
	} else {
		m = sb->sb_sndptr;
		off -= sb->sb_sndptroff;
	}
	while (off > 0 && m != NULL) {
		if (off < m->m_len)
			break;
		off -= m->m_len;
		m = m->m_next;
	}
	*moff = off;
	return (m);
}

/*
 * Drop a record off the front of a sockbuf and move the next record to the
 * front.
 */
void
sbdroprecord_locked(struct sockbuf *sb)
{
	struct mbuf *m;

	SOCKBUF_LOCK_ASSERT(sb);

	m = sb->sb_mb;
	if (m) {
		sb->sb_mb = m->m_nextpkt;
		do {
			sbfree(sb, m);
			m = m_free(m);
		} while (m);
	}
	SB_EMPTY_FIXUP(sb);
}

/*
 * Drop a record off the front of a sockbuf and move the next record to the
 * front.
 */
void
sbdroprecord(struct sockbuf *sb)
{

	SOCKBUF_LOCK(sb);
	sbdroprecord_locked(sb);
	SOCKBUF_UNLOCK(sb);
}

/*
 * Create a "control" mbuf containing the specified data with the specified
 * type for presentation on a socket buffer.
 */
struct mbuf *
sbcreatecontrol(const void *p, u_int size, int type, int level, int wait)
{
	struct cmsghdr *cp;
	struct mbuf *m;

	MBUF_CHECKSLEEP(wait);

	if (wait == M_NOWAIT) {
		if (CMSG_SPACE(size) > MCLBYTES)
			return (NULL);
	} else
		KASSERT(CMSG_SPACE(size) <= MCLBYTES,
		    ("%s: passed CMSG_SPACE(%u) > MCLBYTES", __func__, size));

	if (CMSG_SPACE(size) > MLEN)
		m = m_getcl(wait, MT_CONTROL, 0);
	else
		m = m_get(wait, MT_CONTROL);
	if (m == NULL)
		return (NULL);

	KASSERT(CMSG_SPACE(size) <= M_TRAILINGSPACE(m),
	    ("sbcreatecontrol: short mbuf"));
	/*
	 * Don't leave the padding between the msg header and the
	 * cmsg data and the padding after the cmsg data un-initialized.
	 */
	cp = mtod(m, struct cmsghdr *);
	bzero(cp, CMSG_SPACE(size));
	if (p != NULL)
		(void)memcpy(CMSG_DATA(cp), p, size);
	m->m_len = CMSG_SPACE(size);
	cp->cmsg_len = CMSG_LEN(size);
	cp->cmsg_level = level;
	cp->cmsg_type = type;
	return (m);
}

/*
 * This does the same for socket buffers that sotoxsocket does for sockets:
 * generate a user-format data structure describing the socket buffer.  Note
 * that the xsockbuf structure, since it is always embedded in a socket, does
 * not include a self pointer nor a length.  We make this entry point public
 * in case some other mechanism needs it.
 */
void
sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb)
{

	xsb->sb_cc = sb->sb_ccc;
	xsb->sb_hiwat = sb->sb_hiwat;
	xsb->sb_mbcnt = sb->sb_mbcnt;
	xsb->sb_mbmax = sb->sb_mbmax;
	xsb->sb_lowat = sb->sb_lowat;
	xsb->sb_flags = sb->sb_flags;
	xsb->sb_timeo = sb->sb_timeo;
}

/* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */
static int dummy;
SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW | CTLFLAG_SKIP, &dummy, 0, "");
SYSCTL_OID(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf,
    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, &sb_max, 0,
    sysctl_handle_sb_max, "LU",
    "Maximum socket buffer size");
SYSCTL_ULONG(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW,
    &sb_efficiency, 0, "Socket buffer size waste factor");
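
/*
 * Illustrative sketch (not part of this file): sbcreatecontrol() is how a
 * protocol builds the ancillary-data mbuf that sbappendaddr() or
 * sbappendcontrol() queues for the application.  A receive path honouring
 * a timestamp option might do roughly the following; the surrounding error
 * handling is simplified.
 */
#if 0
	struct timeval tv;
	struct mbuf *opts;

	microtime(&tv);
	opts = sbcreatecontrol(&tv, sizeof(tv), SCM_TIMESTAMP, SOL_SOCKET,
	    M_NOWAIT);
	if (opts == NULL) {
		/* Out of mbufs; deliver the data without the timestamp. */
	}
#endif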