/*-
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket2.c	8.1 (Berkeley) 6/10/93
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_param.h"

#include <sys/param.h>
#include <sys/aio.h>		/* for aio_swake proto */
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sx.h>
#include <sys/sysctl.h>

/*
 * Function pointer set by the AIO routines so that the socket buffer code
 * can call back into the AIO module if it is loaded.
 */
void	(*aio_swake)(struct socket *, struct sockbuf *);

/*
 * Primitive routines for operating on socket buffers
 */

u_long	sb_max = SB_MAX;
static u_long sb_max_adj =
    SB_MAX * MCLBYTES / (MSIZE + MCLBYTES); /* adjusted sb_max */

static	u_long sb_efficiency = 8;	/* parameter for sbreserve() */

static void	sbdrop_internal(struct sockbuf *sb, int len);
static void	sbflush_internal(struct sockbuf *sb);
static void	sbrelease_internal(struct sockbuf *sb, struct socket *so);

/*
 * Socantsendmore indicates that no more data will be sent on the socket; it
 * is normally applied to a socket by the protocol code (the PRU_SHUTDOWN
 * case) when the user informs the system that no more data is to be sent.
 * Socantrcvmore indicates that no more data will be received, and is
 * normally applied to the socket by a protocol when it detects that the
 * peer will send no more data.  Data queued for reading in the socket may
 * yet be read.
 */
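/*
 * Illustrative sketch (hypothetical, not part of this file): a stream
 * protocol that has learned its peer will send no more data (e.g. TCP
 * after receiving a FIN) would typically call
 *
 *	socantrcvmore(so);
 *
 * after which readers may drain any queued data and then observe
 * end-of-file, while local shutdown processing (the PRU_SHUTDOWN case)
 * would call socantsendmore(so), causing subsequent writes to fail.
 */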
void
socantsendmore_locked(struct socket *so)
{

	SOCKBUF_LOCK_ASSERT(&so->so_snd);

	so->so_snd.sb_state |= SBS_CANTSENDMORE;
	sowwakeup_locked(so);
	mtx_assert(SOCKBUF_MTX(&so->so_snd), MA_NOTOWNED);
}

void
socantsendmore(struct socket *so)
{

	SOCKBUF_LOCK(&so->so_snd);
	socantsendmore_locked(so);
	mtx_assert(SOCKBUF_MTX(&so->so_snd), MA_NOTOWNED);
}

void
socantrcvmore_locked(struct socket *so)
{

	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
	sorwakeup_locked(so);
	mtx_assert(SOCKBUF_MTX(&so->so_rcv), MA_NOTOWNED);
}

void
socantrcvmore(struct socket *so)
{

	SOCKBUF_LOCK(&so->so_rcv);
	socantrcvmore_locked(so);
	mtx_assert(SOCKBUF_MTX(&so->so_rcv), MA_NOTOWNED);
}

/*
 * Wait for data to arrive at/drain from a socket buffer.
 */
int
sbwait(struct sockbuf *sb)
{

	SOCKBUF_LOCK_ASSERT(sb);

	sb->sb_flags |= SB_WAIT;
	return (msleep(&sb->sb_cc, &sb->sb_mtx,
	    (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait",
	    sb->sb_timeo));
}

int
sblock(struct sockbuf *sb, int flags)
{

	if (flags == M_WAITOK) {
		sx_xlock(&sb->sb_sx);
		return (0);
	} else {
		if (sx_try_xlock(&sb->sb_sx) == 0)
			return (EWOULDBLOCK);
		return (0);
	}
}

void
sbunlock(struct sockbuf *sb)
{

	sx_xunlock(&sb->sb_sx);
}

/*
 * Wakeup processes waiting on a socket buffer.  Do asynchronous notification
 * via SIGIO if the socket has the SS_ASYNC flag set.
 *
 * Called with the socket buffer lock held; will release the lock by the end
 * of the function.  This allows the caller to acquire the socket buffer lock
 * while testing for the need for various sorts of wakeup and hold it through
 * to the point where it's no longer required.  We currently hold the lock
 * through calls out to other subsystems (with the exception of kqueue), and
 * then release it to avoid lock order issues.  It's not clear that's
 * correct.
 */
void
sowakeup(struct socket *so, struct sockbuf *sb)
{

	SOCKBUF_LOCK_ASSERT(sb);

	selwakeuppri(&sb->sb_sel, PSOCK);
	sb->sb_flags &= ~SB_SEL;
	if (sb->sb_flags & SB_WAIT) {
		sb->sb_flags &= ~SB_WAIT;
		wakeup(&sb->sb_cc);
	}
	KNOTE_LOCKED(&sb->sb_sel.si_note, 0);
	SOCKBUF_UNLOCK(sb);
	if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL)
		pgsigio(&so->so_sigio, SIGIO, 0);
	if (sb->sb_flags & SB_UPCALL)
		(*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT);
	if (sb->sb_flags & SB_AIO)
		aio_swake(so, sb);
	mtx_assert(SOCKBUF_MTX(sb), MA_NOTOWNED);
}
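/*
 * Illustrative sketch (hypothetical): a blocking reader waits for data
 * roughly the way soreceive() does, calling sbwait() with the buffer lock
 * held until data arrives or the receive side is shut down:
 *
 *	SOCKBUF_LOCK(&so->so_rcv);
 *	while (so->so_rcv.sb_cc == 0 &&
 *	    (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
 *		error = sbwait(&so->so_rcv);
 *		if (error)
 *			break;
 *	}
 *	SOCKBUF_UNLOCK(&so->so_rcv);
 *
 * sbwait() sleeps on sb_cc, which is the address passed to wakeup() by
 * sowakeup() above.
 */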
/*
 * Socket buffer (struct sockbuf) utility routines.
 *
 * Each socket contains two socket buffers: one for sending data and one for
 * receiving data.  Each buffer contains a queue of mbufs, information about
 * the number of mbufs and amount of data in the queue, and other fields
 * allowing select() statements and notification on data availability to be
 * implemented.
 *
 * Data stored in a socket buffer is maintained as a list of records.  Each
 * record is a list of mbufs chained together with the m_next field.  Records
 * are chained together with the m_nextpkt field.  The upper level routine
 * soreceive() expects the following conventions to be observed when placing
 * information in the receive buffer:
 *
 * 1. If the protocol requires each message be preceded by the sender's name,
 *    then a record containing that name must be present before any
 *    associated data (mbuf's must be of type MT_SONAME).
 * 2. If the protocol supports the exchange of ``access rights'' (really just
 *    additional data associated with the message), and there are ``rights''
 *    to be received, then a record containing this data should be present
 *    (mbuf's must be of type MT_RIGHTS).
 * 3. If a name or rights record exists, then it must be followed by a data
 *    record, perhaps of zero length.
 *
 * Before using a new socket structure it is first necessary to reserve
 * buffer space to the socket, by calling sbreserve().  This should commit
 * some of the available buffer space in the system buffer pool for the
 * socket (currently, it does nothing but enforce limits).  The space should
 * be released by calling sbrelease() when the socket is destroyed.
 */
int
soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
{
	struct thread *td = curthread;

	SOCKBUF_LOCK(&so->so_snd);
	SOCKBUF_LOCK(&so->so_rcv);
	if (sbreserve_locked(&so->so_snd, sndcc, so, td) == 0)
		goto bad;
	if (sbreserve_locked(&so->so_rcv, rcvcc, so, td) == 0)
		goto bad2;
	if (so->so_rcv.sb_lowat == 0)
		so->so_rcv.sb_lowat = 1;
	if (so->so_snd.sb_lowat == 0)
		so->so_snd.sb_lowat = MCLBYTES;
	if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
		so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
	SOCKBUF_UNLOCK(&so->so_rcv);
	SOCKBUF_UNLOCK(&so->so_snd);
	return (0);
bad2:
	sbrelease_locked(&so->so_snd, so);
bad:
	SOCKBUF_UNLOCK(&so->so_rcv);
	SOCKBUF_UNLOCK(&so->so_snd);
	return (ENOBUFS);
}

static int
sysctl_handle_sb_max(SYSCTL_HANDLER_ARGS)
{
	int error = 0;
	u_long tmp_sb_max = sb_max;

	error = sysctl_handle_long(oidp, &tmp_sb_max, arg2, req);
	if (error || !req->newptr)
		return (error);
	if (tmp_sb_max < MSIZE + MCLBYTES)
		return (EINVAL);
	sb_max = tmp_sb_max;
	sb_max_adj = (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES);
	return (0);
}

/*
 * Allot mbufs to a sockbuf.  Attempt to scale mbmax so that mbcnt doesn't
 * become limiting if buffering efficiency is near the normal case.
 */
int
sbreserve_locked(struct sockbuf *sb, u_long cc, struct socket *so,
    struct thread *td)
{
	rlim_t sbsize_limit;

	SOCKBUF_LOCK_ASSERT(sb);

	/*
	 * td will only be NULL when we're in an interrupt (e.g. in
	 * tcp_input()).
	 *
	 * XXXRW: This comment needs updating, as might the code.
	 */
	if (cc > sb_max_adj)
		return (0);
	if (td != NULL) {
		PROC_LOCK(td->td_proc);
		sbsize_limit = lim_cur(td->td_proc, RLIMIT_SBSIZE);
		PROC_UNLOCK(td->td_proc);
	} else
		sbsize_limit = RLIM_INFINITY;
	if (!chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, cc,
	    sbsize_limit))
		return (0);
	sb->sb_mbmax = min(cc * sb_efficiency, sb_max);
	if (sb->sb_lowat > sb->sb_hiwat)
		sb->sb_lowat = sb->sb_hiwat;
	return (1);
}

int
sbreserve(struct sockbuf *sb, u_long cc, struct socket *so,
    struct thread *td)
{
	int error;

	SOCKBUF_LOCK(sb);
	error = sbreserve_locked(sb, cc, so, td);
	SOCKBUF_UNLOCK(sb);
	return (error);
}
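/*
 * Illustrative sketch (hypothetical names and values): a protocol's attach
 * routine typically reserves send and receive space before the socket is
 * used:
 *
 *	static u_long xx_sendspace = 8192;
 *	static u_long xx_recvspace = 65536;
 *
 *	error = soreserve(so, xx_sendspace, xx_recvspace);
 *	if (error)
 *		return (error);
 *
 * On failure soreserve() returns ENOBUFS, e.g. when the request exceeds
 * sb_max_adj or the owning process's RLIMIT_SBSIZE limit.
 */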
/*
 * Free mbufs held by a socket, and reserved mbuf space.
 */
static void
sbrelease_internal(struct sockbuf *sb, struct socket *so)
{

	sbflush_internal(sb);
	(void)chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, 0,
	    RLIM_INFINITY);
	sb->sb_mbmax = 0;
}

void
sbrelease_locked(struct sockbuf *sb, struct socket *so)
{

	SOCKBUF_LOCK_ASSERT(sb);

	sbrelease_internal(sb, so);
}

void
sbrelease(struct sockbuf *sb, struct socket *so)
{

	SOCKBUF_LOCK(sb);
	sbrelease_locked(sb, so);
	SOCKBUF_UNLOCK(sb);
}

void
sbdestroy(struct sockbuf *sb, struct socket *so)
{

	sbrelease_internal(sb, so);
}

/*
 * Routines to add and remove data from an mbuf queue.
 *
 * The routines sbappend() or sbappendrecord() are normally called to append
 * new mbufs to a socket buffer, after checking that adequate space is
 * available, comparing the function sbspace() with the amount of data to be
 * added.  sbappendrecord() differs from sbappend() in that data supplied is
 * treated as the beginning of a new record.  To place a sender's address,
 * optional access rights, and data in a socket receive buffer,
 * sbappendaddr() should be used.  To place access rights and data in a
 * socket receive buffer, sbappendrights() should be used.  In either case,
 * the new data begins a new record.  Note that unlike sbappend() and
 * sbappendrecord(), these routines check for the caller that there will be
 * enough space to store the data.  Each fails if there is not enough space,
 * or if it cannot find mbufs to store additional information in.
 *
 * Reliable protocols may use the socket send buffer to hold data awaiting
 * acknowledgement.  Data is normally copied from a socket send buffer with
 * m_copy for output to a peer, and then removed from the socket buffer with
 * sbdrop() or sbdroprecord() when the data is acknowledged by the peer.
 */
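/*
 * Illustrative sketch (hypothetical): a reliable protocol transmits a copy
 * of queued send-buffer data and releases it once acknowledged:
 *
 *	m = m_copy(so->so_snd.sb_mb, off, len);
 *	... hand m to the output path ...
 *
 * and later, when the peer acknowledges "acked" bytes:
 *
 *	sbdrop(&so->so_snd, acked);
 *	sowwakeup(so);
 *
 * dropping the acknowledged data and waking any blocked writers.
 */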
#ifdef SOCKBUF_DEBUG
void
sblastrecordchk(struct sockbuf *sb, const char *file, int line)
{
	struct mbuf *m = sb->sb_mb;

	SOCKBUF_LOCK_ASSERT(sb);

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	if (m != sb->sb_lastrecord) {
		printf("%s: sb_mb %p sb_lastrecord %p last %p\n",
		    __func__, sb->sb_mb, sb->sb_lastrecord, m);
		printf("packet chain:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
			printf("\t%p\n", m);
		panic("%s from %s:%u", __func__, file, line);
	}
}

void
sblastmbufchk(struct sockbuf *sb, const char *file, int line)
{
	struct mbuf *m = sb->sb_mb;
	struct mbuf *n;

	SOCKBUF_LOCK_ASSERT(sb);

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	while (m && m->m_next)
		m = m->m_next;

	if (m != sb->sb_mbtail) {
		printf("%s: sb_mb %p sb_mbtail %p last %p\n",
		    __func__, sb->sb_mb, sb->sb_mbtail, m);
		printf("packet tree:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
			printf("\t");
			for (n = m; n != NULL; n = n->m_next)
				printf("%p ", n);
			printf("\n");
		}
		panic("%s from %s:%u", __func__, file, line);
	}
}
#endif /* SOCKBUF_DEBUG */

#define SBLINKRECORD(sb, m0) do {					\
	SOCKBUF_LOCK_ASSERT(sb);					\
	if ((sb)->sb_lastrecord != NULL)				\
		(sb)->sb_lastrecord->m_nextpkt = (m0);			\
	else								\
		(sb)->sb_mb = (m0);					\
	(sb)->sb_lastrecord = (m0);					\
} while (/*CONSTCOND*/0)

/*
 * Append mbuf chain m to the last record in the socket buffer sb.  The
 * additional space associated with the mbuf chain is recorded in sb.  Empty
 * mbufs are discarded and mbufs are compacted where possible.
 */
void
sbappend_locked(struct sockbuf *sb, struct mbuf *m)
{
	struct mbuf *n;

	SOCKBUF_LOCK_ASSERT(sb);

	if (m == 0)
		return;

	SBLASTRECORDCHK(sb);
	n = sb->sb_mb;
	if (n) {
		while (n->m_nextpkt)
			n = n->m_nextpkt;
		do {
			if (n->m_flags & M_EOR) {
				sbappendrecord_locked(sb, m); /* XXXXXX!!!! */
				return;
			}
		} while (n->m_next && (n = n->m_next));
	} else {
		/*
		 * XXX Would like to simply use sb_mbtail here, but
		 * XXX I need to verify that I won't miss an EOR that
		 * XXX way.
		 */
		if ((n = sb->sb_lastrecord) != NULL) {
			do {
				if (n->m_flags & M_EOR) {
					sbappendrecord_locked(sb, m); /* XXXXXX!!!! */
					return;
				}
			} while (n->m_next && (n = n->m_next));
		} else {
			/*
			 * If this is the first record in the socket buffer,
			 * it's also the last record.
			 */
			sb->sb_lastrecord = m;
		}
	}
	sbcompress(sb, m, n);
	SBLASTRECORDCHK(sb);
}

/*
 * Append mbuf chain m to the last record in the socket buffer sb.  The
 * additional space associated with the mbuf chain is recorded in sb.  Empty
 * mbufs are discarded and mbufs are compacted where possible.
 */
void
sbappend(struct sockbuf *sb, struct mbuf *m)
{

	SOCKBUF_LOCK(sb);
	sbappend_locked(sb, m);
	SOCKBUF_UNLOCK(sb);
}
/*
 * This version of sbappend() should only be used when the caller absolutely
 * knows that there will never be more than one record in the socket buffer,
 * that is, a stream protocol (such as TCP).
 */
void
sbappendstream_locked(struct sockbuf *sb, struct mbuf *m)
{
	SOCKBUF_LOCK_ASSERT(sb);

	KASSERT(m->m_nextpkt == NULL,("sbappendstream 0"));
	KASSERT(sb->sb_mb == sb->sb_lastrecord,("sbappendstream 1"));

	SBLASTMBUFCHK(sb);

	sbcompress(sb, m, sb->sb_mbtail);

	sb->sb_lastrecord = sb->sb_mb;
	SBLASTRECORDCHK(sb);
}

/*
 * This version of sbappend() should only be used when the caller absolutely
 * knows that there will never be more than one record in the socket buffer,
 * that is, a stream protocol (such as TCP).
 */
void
sbappendstream(struct sockbuf *sb, struct mbuf *m)
{

	SOCKBUF_LOCK(sb);
	sbappendstream_locked(sb, m);
	SOCKBUF_UNLOCK(sb);
}

#ifdef SOCKBUF_DEBUG
void
sbcheck(struct sockbuf *sb)
{
	struct mbuf *m;
	struct mbuf *n = 0;
	u_long len = 0, mbcnt = 0;

	SOCKBUF_LOCK_ASSERT(sb);

	for (m = sb->sb_mb; m; m = n) {
		n = m->m_nextpkt;
		for (; m; m = m->m_next) {
			len += m->m_len;
			mbcnt += MSIZE;
			if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */
				mbcnt += m->m_ext.ext_size;
		}
	}
	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
		printf("cc %ld != %u || mbcnt %ld != %u\n", len, sb->sb_cc,
		    mbcnt, sb->sb_mbcnt);
		panic("sbcheck");
	}
}
#endif

/*
 * As above, except the mbuf chain begins a new record.
 */
void
sbappendrecord_locked(struct sockbuf *sb, struct mbuf *m0)
{
	struct mbuf *m;

	SOCKBUF_LOCK_ASSERT(sb);

	if (m0 == 0)
		return;
	m = sb->sb_mb;
	if (m)
		while (m->m_nextpkt)
			m = m->m_nextpkt;
	/*
	 * Put the first mbuf on the queue.  Note this permits zero length
	 * records.
	 */
	sballoc(sb, m0);
	SBLASTRECORDCHK(sb);
	SBLINKRECORD(sb, m0);
	if (m)
		m->m_nextpkt = m0;
	else
		sb->sb_mb = m0;
	m = m0->m_next;
	m0->m_next = 0;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	sbcompress(sb, m, m0);
}

/*
 * As above, except the mbuf chain begins a new record.
 */
void
sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
{

	SOCKBUF_LOCK(sb);
	sbappendrecord_locked(sb, m0);
	SOCKBUF_UNLOCK(sb);
}
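/*
 * Illustrative sketch (hypothetical): a message-oriented protocol that must
 * preserve message boundaries checks for space itself, as noted above, and
 * starts a new record for each message delivered to the receive buffer:
 *
 *	SOCKBUF_LOCK(&so->so_rcv);
 *	if (sbspace(&so->so_rcv) < m->m_pkthdr.len) {
 *		SOCKBUF_UNLOCK(&so->so_rcv);
 *		m_freem(m);
 *	} else {
 *		sbappendrecord_locked(&so->so_rcv, m);
 *		sorwakeup_locked(so);
 *	}
 *
 * where sorwakeup_locked() consumes the receive buffer lock.
 */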
/*
 * Append address and data, and optionally, control (ancillary) data to the
 * receive queue of a socket.  If present, m0 must include a packet header
 * with total length.  Returns 0 if no space in sockbuf or insufficient
 * mbufs.
 */
int
sbappendaddr_locked(struct sockbuf *sb, const struct sockaddr *asa,
    struct mbuf *m0, struct mbuf *control)
{
	struct mbuf *m, *n, *nlast;
	int space = asa->sa_len;

	SOCKBUF_LOCK_ASSERT(sb);

	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddr_locked");
	if (m0)
		space += m0->m_pkthdr.len;
	space += m_length(control, &n);

	if (space > sbspace(sb))
		return (0);
#if MSIZE <= 256
	if (asa->sa_len > MLEN)
		return (0);
#endif
	MGET(m, M_DONTWAIT, MT_SONAME);
	if (m == 0)
		return (0);
	m->m_len = asa->sa_len;
	bcopy(asa, mtod(m, caddr_t), asa->sa_len);
	if (n)
		n->m_next = m0;		/* concatenate data to control */
	else
		control = m0;
	m->m_next = control;
	for (n = m; n->m_next != NULL; n = n->m_next)
		sballoc(sb, n);
	sballoc(sb, n);
	nlast = n;
	SBLINKRECORD(sb, m);

	sb->sb_mbtail = nlast;
	SBLASTMBUFCHK(sb);

	SBLASTRECORDCHK(sb);
	return (1);
}

/*
 * Append address and data, and optionally, control (ancillary) data to the
 * receive queue of a socket.  If present, m0 must include a packet header
 * with total length.  Returns 0 if no space in sockbuf or insufficient
 * mbufs.
 */
int
sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa,
    struct mbuf *m0, struct mbuf *control)
{
	int retval;

	SOCKBUF_LOCK(sb);
	retval = sbappendaddr_locked(sb, asa, m0, control);
	SOCKBUF_UNLOCK(sb);
	return (retval);
}

int
sbappendcontrol_locked(struct sockbuf *sb, struct mbuf *m0,
    struct mbuf *control)
{
	struct mbuf *m, *n, *mlast;
	int space;

	SOCKBUF_LOCK_ASSERT(sb);

	if (control == 0)
		panic("sbappendcontrol_locked");
	space = m_length(control, &n) + m_length(m0, NULL);

	if (space > sbspace(sb))
		return (0);
	n->m_next = m0;			/* concatenate data to control */

	SBLASTRECORDCHK(sb);

	for (m = control; m->m_next; m = m->m_next)
		sballoc(sb, m);
	sballoc(sb, m);
	mlast = m;
	SBLINKRECORD(sb, control);

	sb->sb_mbtail = mlast;
	SBLASTMBUFCHK(sb);

	SBLASTRECORDCHK(sb);
	return (1);
}

int
sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control)
{
	int retval;

	SOCKBUF_LOCK(sb);
	retval = sbappendcontrol_locked(sb, m0, control);
	SOCKBUF_UNLOCK(sb);
	return (retval);
}
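/*
 * Illustrative sketch (hypothetical, after the usual datagram input
 * pattern): deliver datagram "m" from address "from", with optional
 * ancillary data "control", to a socket's receive buffer:
 *
 *	SOCKBUF_LOCK(&so->so_rcv);
 *	if (sbappendaddr_locked(&so->so_rcv,
 *	    (struct sockaddr *)&from, m, control) == 0) {
 *		SOCKBUF_UNLOCK(&so->so_rcv);
 *		m_freem(m);
 *		if (control != NULL)
 *			m_freem(control);
 *	} else
 *		sorwakeup_locked(so);
 *
 * Note that sorwakeup_locked() consumes the receive buffer lock, per the
 * sowakeup() comment above.
 */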
/*
 * Append the data in mbuf chain (m) into the socket buffer sb following mbuf
 * (n).  If (n) is NULL, the buffer is presumed empty.
 *
 * When the data is compressed, mbufs in the chain may be handled in one of
 * three ways:
 *
 * (1) The mbuf may simply be dropped, if it contributes nothing (no data, no
 *     record boundary, and no change in data type).
 *
 * (2) The mbuf may be coalesced -- i.e., data in the mbuf may be copied into
 *     an mbuf already in the socket buffer.  This can occur if an
 *     appropriate mbuf exists, there is room, and no merging of data types
 *     will occur.
 *
 * (3) The mbuf may be appended to the end of the existing mbuf chain.
 *
 * If any of the new mbufs is marked as M_EOR, mark the last mbuf appended as
 * end-of-record.
 */
void
sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
{
	int eor = 0;
	struct mbuf *o;

	SOCKBUF_LOCK_ASSERT(sb);

	while (m) {
		eor |= m->m_flags & M_EOR;
		if (m->m_len == 0 &&
		    (eor == 0 ||
		     (((o = m->m_next) || (o = n)) &&
		      o->m_type == m->m_type))) {
			if (sb->sb_lastrecord == m)
				sb->sb_lastrecord = m->m_next;
			m = m_free(m);
			continue;
		}
		if (n && (n->m_flags & M_EOR) == 0 &&
		    M_WRITABLE(n) &&
		    m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
		    m->m_len <= M_TRAILINGSPACE(n) &&
		    n->m_type == m->m_type) {
			bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len,
			    (unsigned)m->m_len);
			n->m_len += m->m_len;
			sb->sb_cc += m->m_len;
			if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA)
				/* XXX: Probably don't need.*/
				sb->sb_ctl += m->m_len;
			m = m_free(m);
			continue;
		}
		if (n)
			n->m_next = m;
		else
			sb->sb_mb = m;
		sb->sb_mbtail = m;
		sballoc(sb, m);
		n = m;
		m->m_flags &= ~M_EOR;
		m = m->m_next;
		n->m_next = 0;
	}
	if (eor) {
		KASSERT(n != NULL, ("sbcompress: eor && n == NULL"));
		n->m_flags |= eor;
	}
	SBLASTMBUFCHK(sb);
}

/*
 * Free all mbufs in a sockbuf.  Check that all resources are reclaimed.
 */
static void
sbflush_internal(struct sockbuf *sb)
{

	while (sb->sb_mbcnt) {
		/*
		 * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty:
		 * we would loop forever.  Panic instead.
		 */
		if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len))
			break;
		sbdrop_internal(sb, (int)sb->sb_cc);
	}
	if (sb->sb_cc || sb->sb_mb || sb->sb_mbcnt)
		panic("sbflush_internal: cc %u || mb %p || mbcnt %u",
		    sb->sb_cc, (void *)sb->sb_mb, sb->sb_mbcnt);
}

void
sbflush_locked(struct sockbuf *sb)
{

	SOCKBUF_LOCK_ASSERT(sb);
	sbflush_internal(sb);
}

void
sbflush(struct sockbuf *sb)
{

	SOCKBUF_LOCK(sb);
	sbflush_locked(sb);
	SOCKBUF_UNLOCK(sb);
}

/*
 * Drop data from (the front of) a sockbuf.
 */
static void
sbdrop_internal(struct sockbuf *sb, int len)
{
	struct mbuf *m;
	struct mbuf *next;

	next = (m = sb->sb_mb) ? m->m_nextpkt : 0;
	while (len > 0) {
		if (m == 0) {
			if (next == 0)
				panic("sbdrop");
			m = next;
			next = m->m_nextpkt;
			continue;
		}
		if (m->m_len > len) {
			m->m_len -= len;
			m->m_data += len;
			sb->sb_cc -= len;
			if (sb->sb_sndptroff != 0)
				sb->sb_sndptroff -= len;
			if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA)
				sb->sb_ctl -= len;
			break;
		}
		len -= m->m_len;
		sbfree(sb, m);
		m = m_free(m);
	}
	while (m && m->m_len == 0) {
		sbfree(sb, m);
		m = m_free(m);
	}
	if (m) {
		sb->sb_mb = m;
		m->m_nextpkt = next;
	} else
		sb->sb_mb = next;
	/*
	 * First part is an inline SB_EMPTY_FIXUP().  Second part makes sure
	 * sb_lastrecord is up-to-date if we dropped part of the last record.
	 */
	m = sb->sb_mb;
	if (m == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (m->m_nextpkt == NULL) {
		sb->sb_lastrecord = m;
	}
}
/*
 * Drop data from (the front of) a sockbuf.
 */
void
sbdrop_locked(struct sockbuf *sb, int len)
{

	SOCKBUF_LOCK_ASSERT(sb);

	sbdrop_internal(sb, len);
}

void
sbdrop(struct sockbuf *sb, int len)
{

	SOCKBUF_LOCK(sb);
	sbdrop_locked(sb, len);
	SOCKBUF_UNLOCK(sb);
}

/*
 * Maintain a pointer and offset pair into the socket buffer mbuf chain to
 * avoid traversal of the entire socket buffer for larger offsets.
 */
struct mbuf *
sbsndptr(struct sockbuf *sb, u_int off, u_int len, u_int *moff)
{
	struct mbuf *m, *ret;

	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb is NULL", __func__));
	KASSERT(off + len <= sb->sb_cc, ("%s: beyond sb", __func__));
	KASSERT(sb->sb_sndptroff <= sb->sb_cc, ("%s: sndptroff broken", __func__));

	/*
	 * Is off below stored offset? Happens on retransmits.
	 * Just return, we can't help here.
	 */
	if (sb->sb_sndptroff > off) {
		*moff = off;
		return (sb->sb_mb);
	}

	/* Return closest mbuf in chain for current offset. */
	*moff = off - sb->sb_sndptroff;
	m = ret = sb->sb_sndptr ? sb->sb_sndptr : sb->sb_mb;

	/* Advance by len to be as close as possible for the next transmit. */
	for (off = off - sb->sb_sndptroff + len - 1;
	     off > 0 && off >= m->m_len;
	     m = m->m_next) {
		sb->sb_sndptroff += m->m_len;
		off -= m->m_len;
	}
	sb->sb_sndptr = m;

	return (ret);
}

/*
 * Drop a record off the front of a sockbuf and move the next record to the
 * front.
 */
void
sbdroprecord_locked(struct sockbuf *sb)
{
	struct mbuf *m;

	SOCKBUF_LOCK_ASSERT(sb);

	m = sb->sb_mb;
	if (m) {
		sb->sb_mb = m->m_nextpkt;
		do {
			sbfree(sb, m);
			m = m_free(m);
		} while (m);
	}
	SB_EMPTY_FIXUP(sb);
}

/*
 * Drop a record off the front of a sockbuf and move the next record to the
 * front.
 */
void
sbdroprecord(struct sockbuf *sb)
{

	SOCKBUF_LOCK(sb);
	sbdroprecord_locked(sb);
	SOCKBUF_UNLOCK(sb);
}

/*
 * Create a "control" mbuf containing the specified data with the specified
 * type for presentation on a socket buffer.
 */
struct mbuf *
sbcreatecontrol(caddr_t p, int size, int type, int level)
{
	struct cmsghdr *cp;
	struct mbuf *m;

	if (CMSG_SPACE((u_int)size) > MCLBYTES)
		return ((struct mbuf *) NULL);
	if (CMSG_SPACE((u_int)size) > MLEN)
		m = m_getcl(M_DONTWAIT, MT_CONTROL, 0);
	else
		m = m_get(M_DONTWAIT, MT_CONTROL);
	if (m == NULL)
		return ((struct mbuf *) NULL);
	cp = mtod(m, struct cmsghdr *);
	m->m_len = 0;
	KASSERT(CMSG_SPACE((u_int)size) <= M_TRAILINGSPACE(m),
	    ("sbcreatecontrol: short mbuf"));
	if (p != NULL)
		(void)memcpy(CMSG_DATA(cp), p, size);
	m->m_len = CMSG_SPACE(size);
	cp->cmsg_len = CMSG_LEN(size);
	cp->cmsg_level = level;
	cp->cmsg_type = type;
	return (m);
}
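/*
 * Illustrative sketch (hypothetical): a protocol delivering a packet
 * timestamp as ancillary data might build the control mbuf with
 *
 *	struct timeval tv;
 *
 *	microtime(&tv);
 *	control = sbcreatecontrol((caddr_t)&tv, sizeof(tv),
 *	    SCM_TIMESTAMP, SOL_SOCKET);
 *
 * and pass it, together with the data, to sbappendaddr() or
 * sbappendcontrol(); a NULL return means no mbuf could be allocated or the
 * data would not fit in a single control mbuf.
 */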
/*
 * This does the same for socket buffers that sotoxsocket does for sockets:
 * generate a user-format data structure describing the socket buffer.  Note
 * that the xsockbuf structure, since it is always embedded in a socket, does
 * not include a self pointer nor a length.  We make this entry point public
 * in case some other mechanism needs it.
 */
void
sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb)
{

	xsb->sb_cc = sb->sb_cc;
	xsb->sb_hiwat = sb->sb_hiwat;
	xsb->sb_mbcnt = sb->sb_mbcnt;
	xsb->sb_mbmax = sb->sb_mbmax;
	xsb->sb_lowat = sb->sb_lowat;
	xsb->sb_flags = sb->sb_flags;
	xsb->sb_timeo = sb->sb_timeo;
}

/* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */
static int dummy;
SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, "");
SYSCTL_OID(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLTYPE_ULONG|CTLFLAG_RW,
    &sb_max, 0, sysctl_handle_sb_max, "LU", "Maximum socket buffer size");
SYSCTL_ULONG(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW,
    &sb_efficiency, 0, "");
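/*
 * For illustration: the limit exported above can be inspected or changed at
 * run time with, e.g., "sysctl kern.ipc.maxsockbuf=1048576";
 * sysctl_handle_sb_max() validates the new value and recomputes sb_max_adj
 * to account for mbuf storage overhead.
 */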