1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <sys/types.h> 28 #include <sys/param.h> 29 #include <sys/systm.h> 30 #include <sys/sysmacros.h> 31 #include <sys/debug.h> 32 #include <sys/cmn_err.h> 33 #include <sys/vfs.h> 34 #include <sys/policy.h> 35 #include <sys/modctl.h> 36 37 #include <sys/sunddi.h> 38 39 #include <sys/strsun.h> 40 #include <sys/stropts.h> 41 #include <sys/strsubr.h> 42 #include <sys/socket.h> 43 #include <sys/socketvar.h> 44 #include <sys/sodirect.h> 45 #include <sys/uio.h> 46 47 #include <inet/ipclassifier.h> 48 #include <fs/sockfs/sockcommon.h> 49 #include <fs/sockfs/nl7c.h> 50 #include <inet/ip.h> 51 52 extern int xnet_skip_checks, xnet_check_print, xnet_truncate_print; 53 54 static struct kmem_cache *sock_sod_cache; 55 56 /* 57 * Common socket access functions. 58 * 59 * Instead of accessing the sonode switch directly (i.e., SOP_xxx()), 60 * the socket_xxx() function should be used. 61 */ 62 63 /* 64 * Try to create a new sonode of the requested <family, type, protocol>. 65 */ 66 /* ARGSUSED */ 67 struct sonode * 68 socket_create(int family, int type, int protocol, char *devpath, char *mod, 69 int flags, int version, struct cred *cr, int *errorp) 70 { 71 struct sonode *so; 72 struct sockparams *sp = NULL; 73 74 /* 75 * Look for a sockparams entry that match the given criteria. 76 * solookup() returns with the entry held. 77 */ 78 *errorp = solookup(family, type, protocol, &sp); 79 if (sp == NULL) { 80 int kmflags = (flags == SOCKET_SLEEP) ? KM_SLEEP : KM_NOSLEEP; 81 /* 82 * There is no matching sockparams entry. An ephemeral entry is 83 * created if the caller specifies a device or a socket module. 84 */ 85 if (devpath != NULL) { 86 sp = sockparams_hold_ephemeral_bydev(family, type, 87 protocol, devpath, kmflags, errorp); 88 } else if (mod != NULL) { 89 sp = sockparams_hold_ephemeral_bymod(family, type, 90 protocol, mod, kmflags, errorp); 91 } else { 92 return (NULL); 93 } 94 95 if (sp == NULL) 96 return (NULL); 97 } 98 99 ASSERT(sp->sp_smod_info != NULL); 100 ASSERT(flags == SOCKET_SLEEP || flags == SOCKET_NOSLEEP); 101 so = sp->sp_smod_info->smod_sock_create_func(sp, family, type, 102 protocol, version, flags, errorp, cr); 103 if (so == NULL) { 104 SOCKPARAMS_DEC_REF(sp); 105 } else { 106 if ((*errorp = SOP_INIT(so, NULL, cr, flags)) == 0) { 107 /* Cannot fail, only bumps so_count */ 108 (void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, cr, NULL); 109 } else { 110 socket_destroy(so); 111 so = NULL; 112 } 113 } 114 return (so); 115 } 116 117 struct sonode * 118 socket_newconn(struct sonode *parent, sock_lower_handle_t lh, 119 sock_downcalls_t *dc, int flags, int *errorp) 120 { 121 struct sonode *so; 122 struct sockparams *sp; 123 struct cred *cr; 124 125 if ((cr = CRED()) == NULL) 126 cr = kcred; 127 128 sp = parent->so_sockparams; 129 ASSERT(sp != NULL); 130 131 so = sp->sp_smod_info->smod_sock_create_func(sp, parent->so_family, 132 parent->so_type, parent->so_protocol, parent->so_version, flags, 133 errorp, cr); 134 if (so != NULL) { 135 SOCKPARAMS_INC_REF(sp); 136 137 so->so_proto_handle = lh; 138 so->so_downcalls = dc; 139 /* 140 * This function may be called in interrupt context, and CRED() 141 * will be NULL. In this case, pass in kcred. 142 */ 143 if ((*errorp = SOP_INIT(so, parent, cr, flags)) == 0) { 144 /* Cannot fail, only bumps so_count */ 145 (void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, cr, NULL); 146 } else { 147 socket_destroy(so); 148 so = NULL; 149 } 150 } 151 152 return (so); 153 } 154 155 /* 156 * Bind local endpoint. 157 */ 158 int 159 socket_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen, 160 int flags, cred_t *cr) 161 { 162 return (SOP_BIND(so, name, namelen, flags, cr)); 163 } 164 165 /* 166 * Turn socket into a listen socket. 167 */ 168 int 169 socket_listen(struct sonode *so, int backlog, cred_t *cr) 170 { 171 if (backlog < 0) { 172 backlog = 0; 173 } 174 175 /* 176 * Use the same qlimit as in BSD. BSD checks the qlimit 177 * before queuing the next connection implying that a 178 * listen(sock, 0) allows one connection to be queued. 179 * BSD also uses 1.5 times the requested backlog. 180 * 181 * XNS Issue 4 required a strict interpretation of the backlog. 182 * This has been waived subsequently for Issue 4 and the change 183 * incorporated in XNS Issue 5. So we aren't required to do 184 * anything special for XPG apps. 185 */ 186 if (backlog >= (INT_MAX - 1) / 3) 187 backlog = INT_MAX; 188 else 189 backlog = backlog * 3 / 2 + 1; 190 191 return (SOP_LISTEN(so, backlog, cr)); 192 } 193 194 /* 195 * Accept incoming connection. 196 */ 197 int 198 socket_accept(struct sonode *lso, int fflag, cred_t *cr, struct sonode **nsop) 199 { 200 return (SOP_ACCEPT(lso, fflag, cr, nsop)); 201 } 202 203 /* 204 * Active open. 205 */ 206 int 207 socket_connect(struct sonode *so, const struct sockaddr *name, 208 socklen_t namelen, int fflag, int flags, cred_t *cr) 209 { 210 int error; 211 212 /* 213 * Handle a connect to a name parameter of type AF_UNSPEC like a 214 * connect to a null address. This is the portable method to 215 * unconnect a socket. 216 */ 217 if ((namelen >= sizeof (sa_family_t)) && 218 (name->sa_family == AF_UNSPEC)) { 219 name = NULL; 220 namelen = 0; 221 } 222 223 error = SOP_CONNECT(so, name, namelen, fflag, flags, cr); 224 225 if (error == EHOSTUNREACH && flags & _SOCONNECT_XPG4_2) { 226 /* 227 * X/Open specification contains a requirement that 228 * ENETUNREACH be returned but does not require 229 * EHOSTUNREACH. In order to keep the test suite 230 * happy we mess with the errno here. 231 */ 232 error = ENETUNREACH; 233 } 234 235 return (error); 236 } 237 238 /* 239 * Get address of remote node. 240 */ 241 int 242 socket_getpeername(struct sonode *so, struct sockaddr *addr, 243 socklen_t *addrlen, boolean_t accept, cred_t *cr) 244 { 245 ASSERT(*addrlen > 0); 246 return (SOP_GETPEERNAME(so, addr, addrlen, accept, cr)); 247 248 } 249 250 /* 251 * Get local address. 252 */ 253 int 254 socket_getsockname(struct sonode *so, struct sockaddr *addr, 255 socklen_t *addrlen, cred_t *cr) 256 { 257 return (SOP_GETSOCKNAME(so, addr, addrlen, cr)); 258 259 } 260 261 /* 262 * Called from shutdown(). 263 */ 264 int 265 socket_shutdown(struct sonode *so, int how, cred_t *cr) 266 { 267 return (SOP_SHUTDOWN(so, how, cr)); 268 } 269 270 /* 271 * Get socket options. 272 */ 273 /*ARGSUSED*/ 274 int 275 socket_getsockopt(struct sonode *so, int level, int option_name, 276 void *optval, socklen_t *optlenp, int flags, cred_t *cr) 277 { 278 return (SOP_GETSOCKOPT(so, level, option_name, optval, 279 optlenp, flags, cr)); 280 } 281 282 /* 283 * Set socket options 284 */ 285 int 286 socket_setsockopt(struct sonode *so, int level, int option_name, 287 const void *optval, t_uscalar_t optlen, cred_t *cr) 288 { 289 /* Caller allocates aligned optval, or passes null */ 290 ASSERT(((uintptr_t)optval & (sizeof (t_scalar_t) - 1)) == 0); 291 /* If optval is null optlen is 0, and vice-versa */ 292 ASSERT(optval != NULL || optlen == 0); 293 ASSERT(optlen != 0 || optval == NULL); 294 295 /* No options should be zero-length */ 296 if (optlen == 0) 297 return (EINVAL); 298 299 return (SOP_SETSOCKOPT(so, level, option_name, optval, optlen, cr)); 300 } 301 302 int 303 socket_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, 304 cred_t *cr) 305 { 306 int error = 0; 307 ssize_t orig_resid = uiop->uio_resid; 308 309 /* 310 * Do not bypass the cache if we are doing a local (AF_UNIX) write. 311 */ 312 if (so->so_family == AF_UNIX) 313 uiop->uio_extflg |= UIO_COPY_CACHED; 314 else 315 uiop->uio_extflg &= ~UIO_COPY_CACHED; 316 317 error = SOP_SENDMSG(so, msg, uiop, cr); 318 switch (error) { 319 default: 320 break; 321 case EINTR: 322 case ETIME: 323 case EWOULDBLOCK: 324 /* We did a partial send */ 325 if (uiop->uio_resid != orig_resid) 326 error = 0; 327 break; 328 case EPIPE: 329 if ((so->so_mode & SM_KERNEL) == 0) 330 tsignal(curthread, SIGPIPE); 331 break; 332 } 333 334 return (error); 335 } 336 337 int 338 socket_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag, 339 struct cred *cr, mblk_t **mpp) 340 { 341 int error = 0; 342 343 error = SOP_SENDMBLK(so, msg, fflag, cr, mpp); 344 if (error == EPIPE) { 345 tsignal(curthread, SIGPIPE); 346 } 347 return (error); 348 } 349 350 int 351 socket_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, 352 cred_t *cr) 353 { 354 int error; 355 ssize_t orig_resid = uiop->uio_resid; 356 357 /* 358 * Do not bypass the cache when reading data, as the application 359 * is likely to access the data shortly. 360 */ 361 uiop->uio_extflg |= UIO_COPY_CACHED; 362 363 error = SOP_RECVMSG(so, msg, uiop, cr); 364 365 switch (error) { 366 case EINTR: 367 case ETIME: 368 case EWOULDBLOCK: 369 /* We did a partial read */ 370 if (uiop->uio_resid != orig_resid) 371 error = 0; 372 break; 373 default: 374 break; 375 } 376 return (error); 377 } 378 379 int 380 socket_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode, 381 struct cred *cr, int32_t *rvalp) 382 { 383 return (SOP_IOCTL(so, cmd, arg, mode, cr, rvalp)); 384 } 385 386 int 387 socket_poll(struct sonode *so, short events, int anyyet, short *reventsp, 388 struct pollhead **phpp) 389 { 390 return (SOP_POLL(so, events, anyyet, reventsp, phpp)); 391 } 392 393 int 394 socket_close(struct sonode *so, int flag, struct cred *cr) 395 { 396 return (VOP_CLOSE(SOTOV(so), flag, 1, 0, cr, NULL)); 397 } 398 399 int 400 socket_close_internal(struct sonode *so, int flag, cred_t *cr) 401 { 402 ASSERT(so->so_count == 0); 403 404 return (SOP_CLOSE(so, flag, cr)); 405 } 406 407 void 408 socket_destroy(struct sonode *so) 409 { 410 vn_invalid(SOTOV(so)); 411 VN_RELE(SOTOV(so)); 412 } 413 414 /* ARGSUSED */ 415 void 416 socket_destroy_internal(struct sonode *so, cred_t *cr) 417 { 418 struct sockparams *sp = so->so_sockparams; 419 ASSERT(so->so_count == 0 && sp != NULL); 420 421 sp->sp_smod_info->smod_sock_destroy_func(so); 422 423 SOCKPARAMS_DEC_REF(sp); 424 } 425 426 /* 427 * TODO Once the common vnode ops is available, then the vnops argument 428 * should be removed. 429 */ 430 /*ARGSUSED*/ 431 int 432 sonode_constructor(void *buf, void *cdrarg, int kmflags) 433 { 434 struct sonode *so = buf; 435 struct vnode *vp; 436 437 vp = so->so_vnode = vn_alloc(kmflags); 438 if (vp == NULL) { 439 return (-1); 440 } 441 vp->v_data = so; 442 vn_setops(vp, socket_vnodeops); 443 444 so->so_priv = NULL; 445 so->so_oobmsg = NULL; 446 447 so->so_proto_handle = NULL; 448 449 so->so_peercred = NULL; 450 451 so->so_rcv_queued = 0; 452 so->so_rcv_q_head = NULL; 453 so->so_rcv_q_last_head = NULL; 454 so->so_rcv_head = NULL; 455 so->so_rcv_last_head = NULL; 456 so->so_rcv_wanted = 0; 457 so->so_rcv_timer_interval = SOCKET_NO_RCVTIMER; 458 so->so_rcv_timer_tid = 0; 459 so->so_rcv_thresh = 0; 460 461 so->so_acceptq_head = NULL; 462 so->so_acceptq_tail = &so->so_acceptq_head; 463 so->so_acceptq_next = NULL; 464 so->so_acceptq_len = 0; 465 so->so_backlog = 0; 466 467 so->so_snd_qfull = B_FALSE; 468 469 mutex_init(&so->so_lock, NULL, MUTEX_DEFAULT, NULL); 470 mutex_init(&so->so_acceptq_lock, NULL, MUTEX_DEFAULT, NULL); 471 rw_init(&so->so_fallback_rwlock, NULL, RW_DEFAULT, NULL); 472 cv_init(&so->so_state_cv, NULL, CV_DEFAULT, NULL); 473 cv_init(&so->so_want_cv, NULL, CV_DEFAULT, NULL); 474 475 cv_init(&so->so_acceptq_cv, NULL, CV_DEFAULT, NULL); 476 cv_init(&so->so_snd_cv, NULL, CV_DEFAULT, NULL); 477 cv_init(&so->so_rcv_cv, NULL, CV_DEFAULT, NULL); 478 cv_init(&so->so_copy_cv, NULL, CV_DEFAULT, NULL); 479 cv_init(&so->so_closing_cv, NULL, CV_DEFAULT, NULL); 480 481 return (0); 482 } 483 484 /*ARGSUSED*/ 485 void 486 sonode_destructor(void *buf, void *cdrarg) 487 { 488 struct sonode *so = buf; 489 struct vnode *vp = SOTOV(so); 490 491 ASSERT(so->so_priv == NULL); 492 ASSERT(so->so_peercred == NULL); 493 494 ASSERT(so->so_oobmsg == NULL); 495 496 ASSERT(so->so_rcv_q_head == NULL); 497 498 ASSERT(so->so_acceptq_head == NULL); 499 ASSERT(so->so_acceptq_tail == &so->so_acceptq_head); 500 ASSERT(so->so_acceptq_next == NULL); 501 502 ASSERT(vp->v_data == so); 503 ASSERT(vn_matchops(vp, socket_vnodeops)); 504 505 vn_free(vp); 506 507 mutex_destroy(&so->so_lock); 508 mutex_destroy(&so->so_acceptq_lock); 509 rw_destroy(&so->so_fallback_rwlock); 510 511 cv_destroy(&so->so_state_cv); 512 cv_destroy(&so->so_want_cv); 513 cv_destroy(&so->so_acceptq_cv); 514 cv_destroy(&so->so_snd_cv); 515 cv_destroy(&so->so_rcv_cv); 516 cv_destroy(&so->so_closing_cv); 517 } 518 519 void 520 sonode_init(struct sonode *so, struct sockparams *sp, int family, 521 int type, int protocol, sonodeops_t *sops) 522 { 523 vnode_t *vp; 524 525 vp = SOTOV(so); 526 527 so->so_flag = 0; 528 529 so->so_state = 0; 530 so->so_mode = 0; 531 532 so->so_count = 0; 533 534 so->so_family = family; 535 so->so_type = type; 536 so->so_protocol = protocol; 537 538 SOCK_CONNID_INIT(so->so_proto_connid); 539 540 so->so_options = 0; 541 so->so_linger.l_onoff = 0; 542 so->so_linger.l_linger = 0; 543 so->so_sndbuf = 0; 544 so->so_error = 0; 545 so->so_rcvtimeo = 0; 546 so->so_sndtimeo = 0; 547 548 ASSERT(so->so_oobmsg == NULL); 549 so->so_oobmark = 0; 550 so->so_pgrp = 0; 551 552 ASSERT(so->so_peercred == NULL); 553 554 so->so_zoneid = getzoneid(); 555 556 so->so_sockparams = sp; 557 558 so->so_ops = sops; 559 560 so->so_proto_handle = NULL; 561 562 so->so_downcalls = NULL; 563 564 so->so_copyflag = 0; 565 566 ASSERT(so->so_acceptq_head == NULL); 567 ASSERT(so->so_acceptq_tail == &so->so_acceptq_head); 568 ASSERT(so->so_acceptq_next == NULL); 569 570 vn_reinit(vp); 571 vp->v_vfsp = rootvfs; 572 vp->v_type = VSOCK; 573 vp->v_rdev = sockdev; 574 575 so->so_rcv_queued = 0; 576 so->so_rcv_q_head = NULL; 577 so->so_rcv_q_last_head = NULL; 578 so->so_rcv_head = NULL; 579 so->so_rcv_last_head = NULL; 580 581 so->so_snd_qfull = B_FALSE; 582 so->so_minpsz = 0; 583 584 so->so_rcv_wakeup = B_FALSE; 585 so->so_snd_wakeup = B_FALSE; 586 so->so_flowctrld = B_FALSE; 587 588 so->so_pollev = 0; 589 bzero(&so->so_poll_list, sizeof (so->so_poll_list)); 590 bzero(&so->so_proto_props, sizeof (struct sock_proto_props)); 591 592 bzero(&(so->so_ksock_callbacks), sizeof (ksocket_callbacks_t)); 593 so->so_ksock_cb_arg = NULL; 594 595 so->so_max_addr_len = sizeof (struct sockaddr_storage); 596 597 so->so_direct = NULL; 598 599 vn_exists(vp); 600 } 601 602 void 603 sonode_fini(struct sonode *so) 604 { 605 mblk_t *mp; 606 vnode_t *vp; 607 608 ASSERT(so->so_count == 0); 609 610 if (so->so_rcv_timer_tid) { 611 ASSERT(MUTEX_NOT_HELD(&so->so_lock)); 612 (void) untimeout(so->so_rcv_timer_tid); 613 so->so_rcv_timer_tid = 0; 614 } 615 616 so_acceptq_flush(so); 617 618 #ifdef DEBUG 619 mutex_enter(&so->so_lock); 620 ASSERT(so_verify_oobstate(so)); 621 mutex_exit(&so->so_lock); 622 #endif /* DEBUG */ 623 if ((mp = so->so_oobmsg) != NULL) { 624 freemsg(mp); 625 so->so_oobmsg = NULL; 626 so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA| 627 SS_RCVATMARK); 628 } 629 630 if (so->so_poll_list.ph_list != NULL) { 631 pollwakeup(&so->so_poll_list, POLLERR); 632 pollhead_clean(&so->so_poll_list); 633 } 634 635 if (so->so_direct != NULL) { 636 sodirect_t *sodp = so->so_direct; 637 638 ASSERT(sodp->sod_uioafh == NULL); 639 640 so->so_direct = NULL; 641 kmem_cache_free(sock_sod_cache, sodp); 642 } 643 644 vp = SOTOV(so); 645 vn_invalid(vp); 646 647 if (so->so_peercred != NULL) { 648 crfree(so->so_peercred); 649 so->so_peercred = NULL; 650 } 651 } 652 653 /* 654 * This function is called at the beginning of recvmsg(). 655 * 656 * If I/OAT is enabled on this sonode, initialize the uioa state machine 657 * with state UIOA_ALLOC. 658 */ 659 uio_t * 660 sod_rcv_init(struct sonode *so, int flags, struct uio **uiopp) 661 { 662 struct uio *suiop; 663 struct uio *uiop; 664 sodirect_t *sodp = so->so_direct; 665 666 if (sodp == NULL) 667 return (NULL); 668 669 suiop = NULL; 670 uiop = *uiopp; 671 672 mutex_enter(sodp->sod_lockp); 673 if (uiop->uio_resid >= uioasync.mincnt && 674 sodp != NULL && (sodp->sod_state & SOD_ENABLED) && 675 uioasync.enabled && !(flags & MSG_PEEK) && 676 !(so->so_state & SS_CANTRCVMORE)) { 677 /* 678 * Big enough I/O for uioa min setup and an sodirect socket 679 * and sodirect enabled and uioa enabled and I/O will be done 680 * and not EOF so initialize the sodirect_t uioa_t with "uiop". 681 */ 682 if (!uioainit(uiop, &sodp->sod_uioa)) { 683 /* 684 * Successful uioainit() so the uio_t part of the 685 * uioa_t will be used for all uio_t work to follow, 686 * we return the original "uiop" in "suiop". 687 */ 688 suiop = uiop; 689 *uiopp = (uio_t *)&sodp->sod_uioa; 690 /* 691 * Before returning to the caller the passed in uio_t 692 * "uiop" will be updated via a call to uioafini() 693 * below. 694 * 695 * Note, the uioa.uioa_state isn't set to UIOA_ENABLED 696 * here as first we have to uioamove() any currently 697 * queued M_DATA mblk_t(s) so it will be done later. 698 */ 699 } 700 /* 701 * In either uioainit() success or not case note the number 702 * of uio bytes the caller wants for sod framework and/or 703 * transport (e.g. TCP) strategy. 704 */ 705 sodp->sod_want = uiop->uio_resid; 706 } else if (sodp != NULL && (sodp->sod_state & SOD_ENABLED)) { 707 /* 708 * No uioa but still using sodirect so note the number of 709 * uio bytes the caller wants for sodirect framework and/or 710 * transport (e.g. TCP) strategy. 711 */ 712 sodp->sod_want = uiop->uio_resid; 713 } 714 mutex_exit(sodp->sod_lockp); 715 716 return (suiop); 717 } 718 719 /* 720 * This function is called at the end of recvmsg(), it finializes all the I/OAT 721 * operations, and reset the uioa state to UIOA_ALLOC. 722 */ 723 int 724 sod_rcv_done(struct sonode *so, struct uio *suiop, struct uio *uiop) 725 { 726 int error = 0; 727 sodirect_t *sodp = so->so_direct; 728 mblk_t *mp; 729 730 if (sodp == NULL) { 731 return (0); 732 } 733 734 ASSERT(MUTEX_HELD(sodp->sod_lockp)); 735 /* Finish any sodirect and uioa processing */ 736 if (suiop != NULL) { 737 /* Finish any uioa_t processing */ 738 739 ASSERT(uiop == (uio_t *)&sodp->sod_uioa); 740 error = uioafini(suiop, (uioa_t *)uiop); 741 if ((mp = sodp->sod_uioafh) != NULL) { 742 sodp->sod_uioafh = NULL; 743 sodp->sod_uioaft = NULL; 744 freemsg(mp); 745 } 746 } 747 ASSERT(sodp->sod_uioafh == NULL); 748 if (!(sodp->sod_state & SOD_WAKE_NOT)) { 749 /* Awoke */ 750 sodp->sod_state &= SOD_WAKE_CLR; 751 sodp->sod_state |= SOD_WAKE_NOT; 752 } 753 /* Last, clear sod_want value */ 754 sodp->sod_want = 0; 755 756 return (error); 757 } 758 759 /* 760 * Schedule a uioamove() on a mblk. This is ususally called from 761 * protocols (e.g. TCP) on a I/OAT enabled sonode. 762 */ 763 mblk_t * 764 sod_uioa_mblk_init(struct sodirect_s *sodp, mblk_t *mp, size_t msg_size) 765 { 766 uioa_t *uioap = &sodp->sod_uioa; 767 mblk_t *mp1 = mp; 768 mblk_t *lmp = NULL; 769 770 ASSERT(DB_TYPE(mp) == M_DATA); 771 ASSERT(msg_size == msgdsize(mp)); 772 773 /* Caller must have lock held */ 774 ASSERT(MUTEX_HELD(sodp->sod_lockp)); 775 776 if (uioap->uioa_state & UIOA_ENABLED) { 777 /* Uioa is enabled */ 778 779 if (msg_size > uioap->uio_resid) { 780 /* 781 * There isn't enough uio space for the mblk_t chain 782 * so disable uioa such that this and any additional 783 * mblk_t data is handled by the socket and schedule 784 * the socket for wakeup to finish this uioa. 785 */ 786 uioap->uioa_state &= UIOA_CLR; 787 uioap->uioa_state |= UIOA_FINI; 788 if (sodp->sod_state & SOD_WAKE_NOT) { 789 sodp->sod_state &= SOD_WAKE_CLR; 790 sodp->sod_state |= SOD_WAKE_NEED; 791 } 792 return (mp); 793 } 794 do { 795 uint32_t len = MBLKL(mp1); 796 797 if (!uioamove(mp1->b_rptr, len, UIO_READ, uioap)) { 798 /* Scheduled, mark dblk_t as such */ 799 DB_FLAGS(mp1) |= DBLK_UIOA; 800 } else { 801 /* Error, turn off async processing */ 802 uioap->uioa_state &= UIOA_CLR; 803 uioap->uioa_state |= UIOA_FINI; 804 break; 805 } 806 lmp = mp1; 807 } while ((mp1 = mp1->b_cont) != NULL); 808 809 if (mp1 != NULL || uioap->uio_resid == 0) { 810 /* 811 * Not all mblk_t(s) uioamoved (error) or all uio 812 * space has been consumed so schedule the socket 813 * for wakeup to finish this uio. 814 */ 815 sodp->sod_state &= SOD_WAKE_CLR; 816 sodp->sod_state |= SOD_WAKE_NEED; 817 818 /* Break the mblk chain if neccessary. */ 819 if (mp1 != NULL && lmp != NULL) { 820 mp->b_next = mp1; 821 lmp->b_cont = NULL; 822 } 823 } 824 } 825 return (mp1); 826 } 827 828 /* 829 * This function is called on a mblk that thas been successfully uioamoved(). 830 */ 831 void 832 sod_uioa_mblk_done(sodirect_t *sodp, mblk_t *bp) 833 { 834 if (bp != NULL && (bp->b_datap->db_flags & DBLK_UIOA)) { 835 /* 836 * A uioa flaged mblk_t chain, already uio processed, 837 * add it to the sodirect uioa pending free list. 838 * 839 * Note, a b_cont chain headed by a DBLK_UIOA enable 840 * mblk_t must have all mblk_t(s) DBLK_UIOA enabled. 841 */ 842 mblk_t *bpt = sodp->sod_uioaft; 843 844 ASSERT(sodp != NULL); 845 846 /* 847 * Add first mblk_t of "bp" chain to current sodirect uioa 848 * free list tail mblk_t, if any, else empty list so new head. 849 */ 850 if (bpt == NULL) 851 sodp->sod_uioafh = bp; 852 else 853 bpt->b_cont = bp; 854 855 /* 856 * Walk mblk_t "bp" chain to find tail and adjust rptr of 857 * each to reflect that uioamove() has consumed all data. 858 */ 859 bpt = bp; 860 for (;;) { 861 ASSERT(bpt->b_datap->db_flags & DBLK_UIOA); 862 863 bpt->b_rptr = bpt->b_wptr; 864 if (bpt->b_cont == NULL) 865 break; 866 bpt = bpt->b_cont; 867 } 868 /* New sodirect uioa free list tail */ 869 sodp->sod_uioaft = bpt; 870 871 /* Only dequeue once with data returned per uioa_t */ 872 if (sodp->sod_uioa.uioa_state & UIOA_ENABLED) { 873 sodp->sod_uioa.uioa_state &= UIOA_CLR; 874 sodp->sod_uioa.uioa_state |= UIOA_FINI; 875 } 876 } 877 } 878 879 /* 880 * When transit from UIOA_INIT state to UIOA_ENABLE state in recvmsg(), call 881 * this function on a non-STREAMS socket to schedule uioamove() on the data 882 * that has already queued in this socket. 883 */ 884 void 885 sod_uioa_so_init(struct sonode *so, struct sodirect_s *sodp, struct uio *uiop) 886 { 887 uioa_t *uioap = (uioa_t *)uiop; 888 mblk_t *lbp; 889 mblk_t *wbp; 890 mblk_t *bp; 891 int len; 892 int error; 893 boolean_t in_rcv_q = B_TRUE; 894 895 ASSERT(MUTEX_HELD(sodp->sod_lockp)); 896 ASSERT(&sodp->sod_uioa == uioap); 897 898 /* 899 * Walk first b_cont chain in sod_q 900 * and schedule any M_DATA mblk_t's for uio asynchronous move. 901 */ 902 bp = so->so_rcv_q_head; 903 904 again: 905 /* Walk the chain */ 906 lbp = NULL; 907 wbp = bp; 908 909 do { 910 if (bp == NULL) 911 break; 912 913 if (wbp->b_datap->db_type != M_DATA) { 914 /* Not M_DATA, no more uioa */ 915 goto nouioa; 916 } 917 if ((len = wbp->b_wptr - wbp->b_rptr) > 0) { 918 /* Have a M_DATA mblk_t with data */ 919 if (len > uioap->uio_resid || (so->so_oobmark > 0 && 920 len + uioap->uioa_mbytes >= so->so_oobmark)) { 921 /* Not enough uio sapce, or beyond oobmark */ 922 goto nouioa; 923 } 924 ASSERT(!(wbp->b_datap->db_flags & DBLK_UIOA)); 925 error = uioamove(wbp->b_rptr, len, 926 UIO_READ, uioap); 927 if (!error) { 928 /* Scheduled, mark dblk_t as such */ 929 wbp->b_datap->db_flags |= DBLK_UIOA; 930 } else { 931 /* Break the mblk chain */ 932 goto nouioa; 933 } 934 } 935 /* Save last wbp processed */ 936 lbp = wbp; 937 } while ((wbp = wbp->b_cont) != NULL); 938 939 if (in_rcv_q && (bp == NULL || bp->b_next == NULL)) { 940 /* 941 * We get here only once to process the sonode dump area 942 * if so_rcv_q_head is NULL or all the mblks have been 943 * successfully uioamoved()ed. 944 */ 945 in_rcv_q = B_FALSE; 946 947 /* move to dump area */ 948 bp = so->so_rcv_head; 949 goto again; 950 } 951 952 return; 953 954 nouioa: 955 /* No more uioa */ 956 uioap->uioa_state &= UIOA_CLR; 957 uioap->uioa_state |= UIOA_FINI; 958 959 /* 960 * If we processed 1 or more mblk_t(s) then we need to split the 961 * current mblk_t chain in 2 so that all the uioamove()ed mblk_t(s) 962 * are in the current chain and the rest are in the following new 963 * chain. 964 */ 965 if (lbp != NULL) { 966 /* New end of current chain */ 967 lbp->b_cont = NULL; 968 969 /* Insert new chain wbp after bp */ 970 if ((wbp->b_next = bp->b_next) == NULL) { 971 /* 972 * No need to grab so_lock, since sod_lockp 973 * points to so_lock. 974 */ 975 if (in_rcv_q) 976 so->so_rcv_q_last_head = wbp; 977 else 978 so->so_rcv_last_head = wbp; 979 } 980 bp->b_next = wbp; 981 bp->b_next->b_prev = bp->b_prev; 982 bp->b_prev = lbp; 983 } 984 } 985 986 /* 987 * Initialize sodirect data structures on a socket. 988 */ 989 void 990 sod_sock_init(struct sonode *so, struct stdata *stp, sod_enq_func enq_func, 991 sod_wakeup_func wake_func, kmutex_t *lockp) 992 { 993 sodirect_t *sodp; 994 995 ASSERT(so->so_direct == NULL); 996 997 so->so_state |= SS_SODIRECT; 998 999 sodp = kmem_cache_alloc(sock_sod_cache, KM_SLEEP); 1000 sodp->sod_state = SOD_ENABLED | SOD_WAKE_NOT; 1001 sodp->sod_want = 0; 1002 sodp->sod_q = (stp != NULL) ? RD(stp->sd_wrq) : NULL; 1003 sodp->sod_enqueue = enq_func; 1004 sodp->sod_wakeup = wake_func; 1005 sodp->sod_uioafh = NULL; 1006 sodp->sod_uioaft = NULL; 1007 sodp->sod_lockp = lockp; 1008 /* 1009 * Remainder of the sod_uioa members are left uninitialized 1010 * but will be initialized later by uioainit() before uioa 1011 * is enabled. 1012 */ 1013 sodp->sod_uioa.uioa_state = UIOA_ALLOC; 1014 so->so_direct = sodp; 1015 if (stp != NULL) 1016 stp->sd_sodirect = sodp; 1017 } 1018 1019 /* 1020 * Init the sodirect kmem cache while sockfs is loading. 1021 */ 1022 void 1023 sod_init() 1024 { 1025 /* Allocate sodirect_t kmem_cache */ 1026 sock_sod_cache = kmem_cache_create("sock_sod_cache", 1027 sizeof (sodirect_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 1028 } 1029 1030 ssize_t 1031 sod_uioa_mblk(struct sonode *so, mblk_t *mp) 1032 { 1033 sodirect_t *sodp = so->so_direct; 1034 1035 ASSERT(sodp != NULL); 1036 ASSERT(MUTEX_HELD(sodp->sod_lockp)); 1037 1038 ASSERT(sodp->sod_state & SOD_ENABLED); 1039 ASSERT(sodp->sod_uioa.uioa_state != (UIOA_ALLOC|UIOA_INIT)); 1040 1041 ASSERT(sodp->sod_uioa.uioa_state & (UIOA_ENABLED|UIOA_FINI)); 1042 1043 if (mp == NULL && so->so_rcv_q_head != NULL) { 1044 mp = so->so_rcv_q_head; 1045 ASSERT(mp->b_prev != NULL); 1046 mp->b_prev = NULL; 1047 so->so_rcv_q_head = mp->b_next; 1048 if (so->so_rcv_q_head == NULL) { 1049 so->so_rcv_q_last_head = NULL; 1050 } 1051 mp->b_next = NULL; 1052 } 1053 1054 sod_uioa_mblk_done(sodp, mp); 1055 1056 if (so->so_rcv_q_head == NULL && so->so_rcv_head != NULL && 1057 DB_TYPE(so->so_rcv_head) == M_DATA && 1058 (DB_FLAGS(so->so_rcv_head) & DBLK_UIOA)) { 1059 /* more arrived */ 1060 ASSERT(so->so_rcv_q_head == NULL); 1061 mp = so->so_rcv_head; 1062 so->so_rcv_head = mp->b_next; 1063 if (so->so_rcv_head == NULL) 1064 so->so_rcv_last_head = NULL; 1065 mp->b_prev = mp->b_next = NULL; 1066 sod_uioa_mblk_done(sodp, mp); 1067 } 1068 1069 #ifdef DEBUG 1070 if (so->so_rcv_q_head != NULL) { 1071 mblk_t *m = so->so_rcv_q_head; 1072 while (m != NULL) { 1073 if (DB_FLAGS(m) & DBLK_UIOA) { 1074 cmn_err(CE_PANIC, "Unexpected I/OAT mblk %p" 1075 " in so_rcv_q_head.\n", (void *)m); 1076 } 1077 m = m->b_next; 1078 } 1079 } 1080 if (so->so_rcv_head != NULL) { 1081 mblk_t *m = so->so_rcv_head; 1082 while (m != NULL) { 1083 if (DB_FLAGS(m) & DBLK_UIOA) { 1084 cmn_err(CE_PANIC, "Unexpected I/OAT mblk %p" 1085 " in so_rcv_head.\n", (void *)m); 1086 } 1087 m = m->b_next; 1088 } 1089 } 1090 #endif 1091 return (sodp->sod_uioa.uioa_mbytes); 1092 } 1093