/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/vfs.h>
#include <sys/policy.h>
#include <sys/modctl.h>

#include <sys/sunddi.h>

#include <sys/strsun.h>
#include <sys/stropts.h>
#include <sys/strsubr.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sodirect.h>
#include <sys/uio.h>

#include <inet/ipclassifier.h>
#include <fs/sockfs/sockcommon.h>
#include <fs/sockfs/nl7c.h>
#include <fs/sockfs/socktpi.h>
#include <inet/ip.h>

extern int xnet_skip_checks, xnet_check_print, xnet_truncate_print;

static struct kmem_cache *sock_sod_cache;

/*
 * Common socket access functions.
 *
 * Instead of accessing the sonode switch directly (i.e., SOP_xxx()),
 * the socket_xxx() functions should be used.
 */

/*
 * Try to create a new sonode of the requested <family, type, protocol>.
 */
/* ARGSUSED */
struct sonode *
socket_create(int family, int type, int protocol, char *devpath, char *mod,
    int flags, int version, struct cred *cr, int *errorp)
{
	struct sonode *so;
	struct sockparams *sp = NULL;
	int saved_error;

	/*
	 * Look for a sockparams entry that matches the given criteria.
	 * solookup() returns with the entry held.
	 */
	*errorp = solookup(family, type, protocol, &sp);
	saved_error = *errorp;
	if (sp == NULL) {
		int kmflags = (flags == SOCKET_SLEEP) ? KM_SLEEP : KM_NOSLEEP;
		/*
		 * There is no matching sockparams entry. An ephemeral entry is
		 * created if the caller specifies a device or a socket module.
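		 * Otherwise the lookup is retried below with a protocol
		 * value of zero, so that a generic <family, type> entry,
		 * if one is registered, can be used.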
		 */
		if (devpath != NULL) {
			saved_error = 0;
			sp = sockparams_hold_ephemeral_bydev(family, type,
			    protocol, devpath, kmflags, errorp);
		} else if (mod != NULL) {
			saved_error = 0;
			sp = sockparams_hold_ephemeral_bymod(family, type,
			    protocol, mod, kmflags, errorp);
		} else {
			*errorp = solookup(family, type, 0, &sp);
		}

		if (sp == NULL) {
			if (saved_error && (*errorp == EPROTONOSUPPORT ||
			    *errorp == EPROTOTYPE || *errorp == ENOPROTOOPT))
				*errorp = saved_error;
			return (NULL);
		}
	}

	ASSERT(sp->sp_smod_info != NULL);
	ASSERT(flags == SOCKET_SLEEP || flags == SOCKET_NOSLEEP);
	so = sp->sp_smod_info->smod_sock_create_func(sp, family, type,
	    protocol, version, flags, errorp, cr);
	if (so == NULL) {
		SOCKPARAMS_DEC_REF(sp);
	} else {
		if ((*errorp = SOP_INIT(so, NULL, cr, flags)) == 0) {
			/* Cannot fail, only bumps so_count */
			(void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, cr, NULL);
		} else {
			if (saved_error && (*errorp == EPROTONOSUPPORT ||
			    *errorp == EPROTOTYPE || *errorp == ENOPROTOOPT))
				*errorp = saved_error;
			socket_destroy(so);
			so = NULL;
		}
	}
	return (so);
}

struct sonode *
socket_newconn(struct sonode *parent, sock_lower_handle_t lh,
    sock_downcalls_t *dc, int flags, int *errorp)
{
	struct sonode *so;
	struct sockparams *sp;
	struct cred *cr;

	/*
	 * This function may be called in interrupt context, in which case
	 * CRED() will be NULL; pass in kcred instead.
	 */
	if ((cr = CRED()) == NULL)
		cr = kcred;

	sp = parent->so_sockparams;
	ASSERT(sp != NULL);

	so = sp->sp_smod_info->smod_sock_create_func(sp, parent->so_family,
	    parent->so_type, parent->so_protocol, parent->so_version, flags,
	    errorp, cr);
	if (so != NULL) {
		SOCKPARAMS_INC_REF(sp);

		so->so_proto_handle = lh;
		so->so_downcalls = dc;

		if ((*errorp = SOP_INIT(so, parent, cr, flags)) == 0) {
			/* Cannot fail, only bumps so_count */
			(void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, cr, NULL);
		} else {
			socket_destroy(so);
			so = NULL;
		}
	}

	return (so);
}

/*
 * Bind local endpoint.
 */
int
socket_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
    int flags, cred_t *cr)
{
	return (SOP_BIND(so, name, namelen, flags, cr));
}

/*
 * Turn socket into a listen socket.
 */
int
socket_listen(struct sonode *so, int backlog, cred_t *cr)
{
	if (backlog < 0) {
		backlog = 0;
	}

	/*
	 * Use the same qlimit as in BSD. BSD checks the qlimit
	 * before queuing the next connection implying that a
	 * listen(sock, 0) allows one connection to be queued.
	 * BSD also uses 1.5 times the requested backlog.
	 *
	 * XNS Issue 4 required a strict interpretation of the backlog.
	 * This has been waived subsequently for Issue 4 and the change
	 * incorporated in XNS Issue 5. So we aren't required to do
	 * anything special for XPG apps.
	 */
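	/*
	 * For example: a requested backlog of 0 becomes 0 * 3 / 2 + 1 = 1
	 * and a requested backlog of 5 becomes 5 * 3 / 2 + 1 = 8, while
	 * values at or above (INT_MAX - 1) / 3 are clamped to INT_MAX so
	 * the scaling below cannot overflow.
	 */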
	if (backlog >= (INT_MAX - 1) / 3)
		backlog = INT_MAX;
	else
		backlog = backlog * 3 / 2 + 1;

	return (SOP_LISTEN(so, backlog, cr));
}

/*
 * Accept incoming connection.
 */
int
socket_accept(struct sonode *lso, int fflag, cred_t *cr, struct sonode **nsop)
{
	return (SOP_ACCEPT(lso, fflag, cr, nsop));
}

/*
 * Active open.
 */
int
socket_connect(struct sonode *so, const struct sockaddr *name,
    socklen_t namelen, int fflag, int flags, cred_t *cr)
{
	int error;

	/*
	 * Handle a connect to a name parameter of type AF_UNSPEC like a
	 * connect to a null address. This is the portable method to
	 * unconnect a socket.
	 */
	if ((namelen >= sizeof (sa_family_t)) &&
	    (name->sa_family == AF_UNSPEC)) {
		name = NULL;
		namelen = 0;
	}

	error = SOP_CONNECT(so, name, namelen, fflag, flags, cr);

	if (error == EHOSTUNREACH && flags & _SOCONNECT_XPG4_2) {
		/*
		 * X/Open specification contains a requirement that
		 * ENETUNREACH be returned but does not require
		 * EHOSTUNREACH. In order to keep the test suite
		 * happy we mess with the errno here.
		 */
		error = ENETUNREACH;
	}

	return (error);
}

/*
 * Get address of remote node.
 */
int
socket_getpeername(struct sonode *so, struct sockaddr *addr,
    socklen_t *addrlen, boolean_t accept, cred_t *cr)
{
	ASSERT(*addrlen > 0);
	return (SOP_GETPEERNAME(so, addr, addrlen, accept, cr));
}

/*
 * Get local address.
 */
int
socket_getsockname(struct sonode *so, struct sockaddr *addr,
    socklen_t *addrlen, cred_t *cr)
{
	return (SOP_GETSOCKNAME(so, addr, addrlen, cr));
}

/*
 * Called from shutdown().
 */
int
socket_shutdown(struct sonode *so, int how, cred_t *cr)
{
	return (SOP_SHUTDOWN(so, how, cr));
}

/*
 * Get socket options.
 */
/*ARGSUSED*/
int
socket_getsockopt(struct sonode *so, int level, int option_name,
    void *optval, socklen_t *optlenp, int flags, cred_t *cr)
{
	return (SOP_GETSOCKOPT(so, level, option_name, optval,
	    optlenp, flags, cr));
}

/*
 * Set socket options.
 */
int
socket_setsockopt(struct sonode *so, int level, int option_name,
    const void *optval, t_uscalar_t optlen, cred_t *cr)
{
	int val = 1;

	/* Caller allocates aligned optval, or passes null */
	ASSERT(((uintptr_t)optval & (sizeof (t_scalar_t) - 1)) == 0);
	/* If optval is null optlen is 0, and vice-versa */
	ASSERT(optval != NULL || optlen == 0);
	ASSERT(optlen != 0 || optval == NULL);

	if (optval == NULL && optlen == 0)
		optval = &val;

	return (SOP_SETSOCKOPT(so, level, option_name, optval, optlen, cr));
}

int
socket_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
    cred_t *cr)
{
	int error = 0;
	ssize_t orig_resid = uiop->uio_resid;

	/*
	 * Do not bypass the cache if we are doing a local (AF_UNIX) write.
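	 * The looped-back data will be read by the local peer almost
	 * immediately, so keep the copy cache-warm; for other families
	 * the copy may bypass the CPU cache.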
	 */
	if (so->so_family == AF_UNIX)
		uiop->uio_extflg |= UIO_COPY_CACHED;
	else
		uiop->uio_extflg &= ~UIO_COPY_CACHED;

	error = SOP_SENDMSG(so, msg, uiop, cr);
	switch (error) {
	default:
		break;
	case EINTR:
	/* EAGAIN is EWOULDBLOCK */
	case EWOULDBLOCK:
		/* We did a partial send */
		if (uiop->uio_resid != orig_resid)
			error = 0;
		break;
	case EPIPE:
		if ((so->so_mode & SM_KERNEL) == 0)
			tsignal(curthread, SIGPIPE);
		break;
	}

	return (error);
}

int
socket_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
    struct cred *cr, mblk_t **mpp)
{
	int error = 0;

	error = SOP_SENDMBLK(so, msg, fflag, cr, mpp);
	if (error == EPIPE) {
		tsignal(curthread, SIGPIPE);
	}
	return (error);
}

int
socket_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
    cred_t *cr)
{
	int error;
	ssize_t orig_resid = uiop->uio_resid;

	/*
	 * Do not bypass the cache when reading data, as the application
	 * is likely to access the data shortly.
	 */
	uiop->uio_extflg |= UIO_COPY_CACHED;

	error = SOP_RECVMSG(so, msg, uiop, cr);

	switch (error) {
	case EINTR:
	/* EAGAIN is EWOULDBLOCK */
	case EWOULDBLOCK:
		/* We did a partial read */
		if (uiop->uio_resid != orig_resid)
			error = 0;
		break;
	default:
		break;
	}
	return (error);
}

int
socket_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
    struct cred *cr, int32_t *rvalp)
{
	return (SOP_IOCTL(so, cmd, arg, mode, cr, rvalp));
}

int
socket_poll(struct sonode *so, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	return (SOP_POLL(so, events, anyyet, reventsp, phpp));
}

int
socket_close(struct sonode *so, int flag, struct cred *cr)
{
	return (VOP_CLOSE(SOTOV(so), flag, 1, 0, cr, NULL));
}

int
socket_close_internal(struct sonode *so, int flag, cred_t *cr)
{
	ASSERT(so->so_count == 0);

	return (SOP_CLOSE(so, flag, cr));
}

void
socket_destroy(struct sonode *so)
{
	vn_invalid(SOTOV(so));
	VN_RELE(SOTOV(so));
}

/* ARGSUSED */
void
socket_destroy_internal(struct sonode *so, cred_t *cr)
{
	struct sockparams *sp = so->so_sockparams;
	ASSERT(so->so_count == 0 && sp != NULL);

	sp->sp_smod_info->smod_sock_destroy_func(so);

	SOCKPARAMS_DEC_REF(sp);
}

/*
 * TODO: Once the common vnode ops are available, the vnops argument
 * should be removed.
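 *
 * Note that sonode_constructor() and sonode_destructor() are written to
 * the kmem cache constructor/destructor contract: the constructor returns
 * 0 on success and -1 when vn_alloc() fails, while per-instance state is
 * (re)initialized separately in sonode_init()/sonode_fini().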
 */
/*ARGSUSED*/
int
sonode_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct sonode *so = buf;
	struct vnode *vp;

	vp = so->so_vnode = vn_alloc(kmflags);
	if (vp == NULL) {
		return (-1);
	}
	vp->v_data = so;
	vn_setops(vp, socket_vnodeops);

	so->so_priv = NULL;
	so->so_oobmsg = NULL;

	so->so_proto_handle = NULL;

	so->so_peercred = NULL;

	so->so_rcv_queued = 0;
	so->so_rcv_q_head = NULL;
	so->so_rcv_q_last_head = NULL;
	so->so_rcv_head = NULL;
	so->so_rcv_last_head = NULL;
	so->so_rcv_wanted = 0;
	so->so_rcv_timer_interval = SOCKET_NO_RCVTIMER;
	so->so_rcv_timer_tid = 0;
	so->so_rcv_thresh = 0;

	so->so_acceptq_head = NULL;
	so->so_acceptq_tail = &so->so_acceptq_head;
	so->so_acceptq_next = NULL;
	so->so_acceptq_len = 0;
	so->so_backlog = 0;

	so->so_snd_qfull = B_FALSE;

	mutex_init(&so->so_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&so->so_acceptq_lock, NULL, MUTEX_DEFAULT, NULL);
	rw_init(&so->so_fallback_rwlock, NULL, RW_DEFAULT, NULL);
	cv_init(&so->so_state_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&so->so_want_cv, NULL, CV_DEFAULT, NULL);

	cv_init(&so->so_acceptq_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&so->so_snd_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&so->so_rcv_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&so->so_copy_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&so->so_closing_cv, NULL, CV_DEFAULT, NULL);

	return (0);
}

/*ARGSUSED*/
void
sonode_destructor(void *buf, void *cdrarg)
{
	struct sonode *so = buf;
	struct vnode *vp = SOTOV(so);

	ASSERT(so->so_priv == NULL);
	ASSERT(so->so_peercred == NULL);

	ASSERT(so->so_oobmsg == NULL);

	ASSERT(so->so_rcv_q_head == NULL);

	ASSERT(so->so_acceptq_head == NULL);
	ASSERT(so->so_acceptq_tail == &so->so_acceptq_head);
	ASSERT(so->so_acceptq_next == NULL);

	ASSERT(vp->v_data == so);
	ASSERT(vn_matchops(vp, socket_vnodeops));

	vn_free(vp);

	mutex_destroy(&so->so_lock);
	mutex_destroy(&so->so_acceptq_lock);
	rw_destroy(&so->so_fallback_rwlock);

	cv_destroy(&so->so_state_cv);
	cv_destroy(&so->so_want_cv);
	cv_destroy(&so->so_acceptq_cv);
	cv_destroy(&so->so_snd_cv);
	cv_destroy(&so->so_rcv_cv);
	cv_destroy(&so->so_closing_cv);
}

void
sonode_init(struct sonode *so, struct sockparams *sp, int family,
    int type, int protocol, sonodeops_t *sops)
{
	vnode_t *vp;

	vp = SOTOV(so);

	so->so_flag = 0;

	so->so_state = 0;
	so->so_mode = 0;

	so->so_count = 0;

	so->so_family = family;
	so->so_type = type;
	so->so_protocol = protocol;

	SOCK_CONNID_INIT(so->so_proto_connid);

	so->so_options = 0;
	so->so_linger.l_onoff = 0;
	so->so_linger.l_linger = 0;
	so->so_sndbuf = 0;
	so->so_error = 0;
	so->so_rcvtimeo = 0;
	so->so_sndtimeo = 0;
	so->so_xpg_rcvbuf = 0;

	ASSERT(so->so_oobmsg == NULL);
	so->so_oobmark = 0;
	so->so_pgrp = 0;

	ASSERT(so->so_peercred == NULL);

	so->so_zoneid = getzoneid();

	so->so_sockparams = sp;

	so->so_ops = sops;

	so->so_not_str = (sops != &sotpi_sonodeops);

	so->so_proto_handle = NULL;

	so->so_downcalls = NULL;

	so->so_copyflag = 0;

	ASSERT(so->so_acceptq_head == NULL);
	ASSERT(so->so_acceptq_tail == &so->so_acceptq_head);
	ASSERT(so->so_acceptq_next == NULL);

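	/*
	 * Reset the embedded vnode for this use of the cached sonode and
	 * mark it as a socket vnode.
	 */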
	vn_reinit(vp);
	vp->v_vfsp = rootvfs;
	vp->v_type = VSOCK;
	vp->v_rdev = sockdev;

	so->so_rcv_queued = 0;
	so->so_rcv_q_head = NULL;
	so->so_rcv_q_last_head = NULL;
	so->so_rcv_head = NULL;
	so->so_rcv_last_head = NULL;

	so->so_snd_qfull = B_FALSE;
	so->so_minpsz = 0;

	so->so_rcv_wakeup = B_FALSE;
	so->so_snd_wakeup = B_FALSE;
	so->so_flowctrld = B_FALSE;

	so->so_pollev = 0;
	bzero(&so->so_poll_list, sizeof (so->so_poll_list));
	bzero(&so->so_proto_props, sizeof (struct sock_proto_props));

	bzero(&(so->so_ksock_callbacks), sizeof (ksocket_callbacks_t));
	so->so_ksock_cb_arg = NULL;

	so->so_max_addr_len = sizeof (struct sockaddr_storage);

	so->so_direct = NULL;

	vn_exists(vp);
}

void
sonode_fini(struct sonode *so)
{
	mblk_t *mp;
	vnode_t *vp;

	ASSERT(so->so_count == 0);

	if (so->so_rcv_timer_tid) {
		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
		(void) untimeout(so->so_rcv_timer_tid);
		so->so_rcv_timer_tid = 0;
	}

	so_acceptq_flush(so);

	if ((mp = so->so_oobmsg) != NULL) {
		freemsg(mp);
		so->so_oobmsg = NULL;
		so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA|
		    SS_RCVATMARK);
	}

	if (so->so_poll_list.ph_list != NULL) {
		pollwakeup(&so->so_poll_list, POLLERR);
		pollhead_clean(&so->so_poll_list);
	}

	if (so->so_direct != NULL) {
		sodirect_t *sodp = so->so_direct;

		ASSERT(sodp->sod_uioafh == NULL);

		so->so_direct = NULL;
		kmem_cache_free(sock_sod_cache, sodp);
	}

	vp = SOTOV(so);
	vn_invalid(vp);

	if (so->so_peercred != NULL) {
		crfree(so->so_peercred);
		so->so_peercred = NULL;
	}
}

/*
 * This function is called at the beginning of recvmsg().
 *
 * If I/OAT is enabled on this sonode, initialize the uioa state machine
 * with state UIOA_ALLOC.
 */
uio_t *
sod_rcv_init(struct sonode *so, int flags, struct uio **uiopp)
{
	struct uio *suiop;
	struct uio *uiop;
	sodirect_t *sodp = so->so_direct;

	if (sodp == NULL)
		return (NULL);

	suiop = NULL;
	uiop = *uiopp;

	mutex_enter(sodp->sod_lockp);
	if (uiop->uio_resid >= uioasync.mincnt &&
	    sodp != NULL && (sodp->sod_state & SOD_ENABLED) &&
	    uioasync.enabled && !(flags & MSG_PEEK) &&
	    !(so->so_state & SS_CANTRCVMORE)) {
		/*
		 * The I/O is large enough for the uioa minimum, the socket
		 * is sodirect and uioa enabled, the data will actually be
		 * consumed (not MSG_PEEK) and we are not at EOF, so
		 * initialize the sodirect_t's uioa_t with "uiop".
		 */
		if (!uioainit(uiop, &sodp->sod_uioa)) {
			/*
			 * uioainit() succeeded, so the uio_t part of the
			 * uioa_t will be used for all uio_t work to follow;
			 * we return the original "uiop" in "suiop".
			 */
			suiop = uiop;
			*uiopp = (uio_t *)&sodp->sod_uioa;
			/*
			 * Before returning to the caller the passed in uio_t
			 * "uiop" will be updated via a call to uioafini()
			 * below.
			 *
			 * Note, the uioa.uioa_state isn't set to UIOA_ENABLED
			 * here; first any currently queued M_DATA mblk_t(s)
			 * have to be uioamove()d, so that is done later.
			 */
		}
		/*
		 * Whether or not uioainit() succeeded, note the number of
		 * uio bytes the caller wants for the sod framework and/or
		 * transport (e.g. TCP) strategy.
		 */
		sodp->sod_want = uiop->uio_resid;
	} else if (sodp != NULL && (sodp->sod_state & SOD_ENABLED)) {
		/*
		 * No uioa, but still using sodirect, so note the number of
		 * uio bytes the caller wants for the sodirect framework
		 * and/or transport (e.g. TCP) strategy.
		 */
		sodp->sod_want = uiop->uio_resid;
	}
	mutex_exit(sodp->sod_lockp);

	return (suiop);
}

/*
 * This function is called at the end of recvmsg(). It finalizes all the
 * I/OAT operations, and resets the uioa state to UIOA_ALLOC.
 */
int
sod_rcv_done(struct sonode *so, struct uio *suiop, struct uio *uiop)
{
	int error = 0;
	sodirect_t *sodp = so->so_direct;
	mblk_t *mp;

	if (sodp == NULL) {
		return (0);
	}

	ASSERT(MUTEX_HELD(sodp->sod_lockp));
	/* Finish any sodirect and uioa processing */
	if (suiop != NULL) {
		/* Finish any uioa_t processing */

		ASSERT(uiop == (uio_t *)&sodp->sod_uioa);
		error = uioafini(suiop, (uioa_t *)uiop);
		if ((mp = sodp->sod_uioafh) != NULL) {
			sodp->sod_uioafh = NULL;
			sodp->sod_uioaft = NULL;
			freemsg(mp);
		}
	}
	ASSERT(sodp->sod_uioafh == NULL);
	if (!(sodp->sod_state & SOD_WAKE_NOT)) {
		/* Awoke */
		sodp->sod_state &= SOD_WAKE_CLR;
		sodp->sod_state |= SOD_WAKE_NOT;
	}
	/* Last, clear sod_want value */
	sodp->sod_want = 0;

	return (error);
}

/*
 * Schedule a uioamove() on an mblk. This is usually called from
 * protocols (e.g. TCP) on an I/OAT enabled sonode.
 */
mblk_t *
sod_uioa_mblk_init(struct sodirect_s *sodp, mblk_t *mp, size_t msg_size)
{
	uioa_t *uioap = &sodp->sod_uioa;
	mblk_t *mp1 = mp;
	mblk_t *lmp = NULL;

	ASSERT(DB_TYPE(mp) == M_DATA);
	ASSERT(msg_size == msgdsize(mp));

	/* Caller must have lock held */
	ASSERT(MUTEX_HELD(sodp->sod_lockp));

	if (uioap->uioa_state & UIOA_ENABLED) {
		/* Uioa is enabled */

		if (msg_size > uioap->uio_resid) {
			/*
			 * There isn't enough uio space for the mblk_t chain,
			 * so disable uioa such that this and any additional
			 * mblk_t data is handled by the socket, and schedule
			 * the socket for wakeup to finish this uioa.
			 */
			uioap->uioa_state &= UIOA_CLR;
			uioap->uioa_state |= UIOA_FINI;
			if (sodp->sod_state & SOD_WAKE_NOT) {
				sodp->sod_state &= SOD_WAKE_CLR;
				sodp->sod_state |= SOD_WAKE_NEED;
			}
			return (mp);
		}
		do {
			uint32_t len = MBLKL(mp1);

			if (!uioamove(mp1->b_rptr, len, UIO_READ, uioap)) {
				/* Scheduled, mark dblk_t as such */
				DB_FLAGS(mp1) |= DBLK_UIOA;
			} else {
				/* Error, turn off async processing */
				uioap->uioa_state &= UIOA_CLR;
				uioap->uioa_state |= UIOA_FINI;
				break;
			}
			lmp = mp1;
		} while ((mp1 = mp1->b_cont) != NULL);

		if (mp1 != NULL || uioap->uio_resid == 0) {
			/*
			 * Either not all mblk_t(s) were uioamove()d (error)
			 * or all uio space has been consumed, so schedule
			 * the socket for wakeup to finish this uio.
			 */
			sodp->sod_state &= SOD_WAKE_CLR;
			sodp->sod_state |= SOD_WAKE_NEED;

			/* Break the mblk chain if necessary. */
			if (mp1 != NULL && lmp != NULL) {
				mp->b_next = mp1;
				lmp->b_cont = NULL;
			}
		}
	}
	return (mp1);
}

/*
 * This function is called on an mblk that has been successfully
 * uioamove()d.
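 * The mblk is appended to the sodirect pending-free list (sod_uioafh/
 * sod_uioaft); the list is freed later by sod_rcv_done().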
 */
void
sod_uioa_mblk_done(sodirect_t *sodp, mblk_t *bp)
{
	if (bp != NULL && (bp->b_datap->db_flags & DBLK_UIOA)) {
		/*
		 * A uioa flagged mblk_t chain, already uio processed,
		 * add it to the sodirect uioa pending free list.
		 *
		 * Note, a b_cont chain headed by a DBLK_UIOA enabled
		 * mblk_t must have all of its mblk_t(s) DBLK_UIOA enabled.
		 */
		mblk_t *bpt = sodp->sod_uioaft;

		ASSERT(sodp != NULL);

		/*
		 * Add the first mblk_t of the "bp" chain to the current
		 * sodirect uioa free list tail mblk_t, if any, else the
		 * list is empty so "bp" becomes the new head.
		 */
		if (bpt == NULL)
			sodp->sod_uioafh = bp;
		else
			bpt->b_cont = bp;

		/*
		 * Walk the mblk_t "bp" chain to find the tail and adjust the
		 * rptr of each to reflect that uioamove() has consumed all
		 * data.
		 */
		bpt = bp;
		for (;;) {
			ASSERT(bpt->b_datap->db_flags & DBLK_UIOA);

			bpt->b_rptr = bpt->b_wptr;
			if (bpt->b_cont == NULL)
				break;
			bpt = bpt->b_cont;
		}
		/* New sodirect uioa free list tail */
		sodp->sod_uioaft = bpt;

		/* Only dequeue once with data returned per uioa_t */
		if (sodp->sod_uioa.uioa_state & UIOA_ENABLED) {
			sodp->sod_uioa.uioa_state &= UIOA_CLR;
			sodp->sod_uioa.uioa_state |= UIOA_FINI;
		}
	}
}

/*
 * When transitioning from the UIOA_INIT state to the UIOA_ENABLED state in
 * recvmsg(), call this function on a non-STREAMS socket to schedule
 * uioamove() on the data that has already been queued in this socket.
 */
void
sod_uioa_so_init(struct sonode *so, struct sodirect_s *sodp, struct uio *uiop)
{
	uioa_t *uioap = (uioa_t *)uiop;
	mblk_t *lbp;
	mblk_t *wbp;
	mblk_t *bp;
	int len;
	int error;
	boolean_t in_rcv_q = B_TRUE;

	ASSERT(MUTEX_HELD(sodp->sod_lockp));
	ASSERT(&sodp->sod_uioa == uioap);

	/*
	 * Walk the first b_cont chain in sod_q
	 * and schedule any M_DATA mblk_t's for uio asynchronous move.
	 */
	bp = so->so_rcv_q_head;

again:
	/* Walk the chain */
	lbp = NULL;
	wbp = bp;

	do {
		if (bp == NULL)
			break;

		if (wbp->b_datap->db_type != M_DATA) {
			/* Not M_DATA, no more uioa */
			goto nouioa;
		}
		if ((len = wbp->b_wptr - wbp->b_rptr) > 0) {
			/* Have an M_DATA mblk_t with data */
			if (len > uioap->uio_resid || (so->so_oobmark > 0 &&
			    len + uioap->uioa_mbytes >= so->so_oobmark)) {
				/* Not enough uio space, or beyond oobmark */
				goto nouioa;
			}
			ASSERT(!(wbp->b_datap->db_flags & DBLK_UIOA));
			error = uioamove(wbp->b_rptr, len,
			    UIO_READ, uioap);
			if (!error) {
				/* Scheduled, mark dblk_t as such */
				wbp->b_datap->db_flags |= DBLK_UIOA;
			} else {
				/* Break the mblk chain */
				goto nouioa;
			}
		}
		/* Save last wbp processed */
		lbp = wbp;
	} while ((wbp = wbp->b_cont) != NULL);

	if (in_rcv_q && (bp == NULL || bp->b_next == NULL)) {
		/*
		 * We get here only once, to process the sonode dump area,
		 * if so_rcv_q_head is NULL or all the mblks have been
		 * successfully uioamove()d.
		 */
		in_rcv_q = B_FALSE;

		/* move to dump area */
		bp = so->so_rcv_head;
		goto again;
	}

	return;

nouioa:
	/* No more uioa */
	uioap->uioa_state &= UIOA_CLR;
	uioap->uioa_state |= UIOA_FINI;

	/*
	 * If we processed 1 or more mblk_t(s) then we need to split the
	 * current mblk_t chain in 2 so that all the uioamove()d mblk_t(s)
	 * are in the current chain and the rest are in the following new
	 * chain.
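	 *
	 * For example, given a b_cont chain b1->b2->b3 where b1 and b2 were
	 * uioamove()d but b3 was not, b2 terminates the current message and
	 * b3 becomes a new message linked after it via b_next.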
	 */
	if (lbp != NULL) {
		/* New end of current chain */
		lbp->b_cont = NULL;

		/* Insert new chain wbp after bp */
		if ((wbp->b_next = bp->b_next) == NULL) {
			/*
			 * No need to grab so_lock, since sod_lockp
			 * points to so_lock.
			 */
			if (in_rcv_q)
				so->so_rcv_q_last_head = wbp;
			else
				so->so_rcv_last_head = wbp;
		}
		bp->b_next = wbp;
		bp->b_next->b_prev = bp->b_prev;
		bp->b_prev = lbp;
	}
}

/*
 * Initialize sodirect data structures on a socket.
 */
void
sod_sock_init(struct sonode *so, struct stdata *stp, sod_enq_func enq_func,
    sod_wakeup_func wake_func, kmutex_t *lockp)
{
	sodirect_t *sodp;

	ASSERT(so->so_direct == NULL);

	so->so_state |= SS_SODIRECT;

	sodp = kmem_cache_alloc(sock_sod_cache, KM_SLEEP);
	sodp->sod_state = SOD_ENABLED | SOD_WAKE_NOT;
	sodp->sod_want = 0;
	sodp->sod_q = (stp != NULL) ? RD(stp->sd_wrq) : NULL;
	sodp->sod_enqueue = enq_func;
	sodp->sod_wakeup = wake_func;
	sodp->sod_uioafh = NULL;
	sodp->sod_uioaft = NULL;
	sodp->sod_lockp = lockp;
	/*
	 * Remainder of the sod_uioa members are left uninitialized
	 * but will be initialized later by uioainit() before uioa
	 * is enabled.
	 */
	sodp->sod_uioa.uioa_state = UIOA_ALLOC;
	so->so_direct = sodp;
	if (stp != NULL)
		stp->sd_sodirect = sodp;
}

/*
 * Init the sodirect kmem cache while sockfs is loading.
 */
void
sod_init()
{
	/* Allocate sodirect_t kmem_cache */
	sock_sod_cache = kmem_cache_create("sock_sod_cache",
	    sizeof (sodirect_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
}

ssize_t
sod_uioa_mblk(struct sonode *so, mblk_t *mp)
{
	sodirect_t *sodp = so->so_direct;

	ASSERT(sodp != NULL);
	ASSERT(MUTEX_HELD(sodp->sod_lockp));

	ASSERT(sodp->sod_state & SOD_ENABLED);
	ASSERT(sodp->sod_uioa.uioa_state != (UIOA_ALLOC|UIOA_INIT));

	ASSERT(sodp->sod_uioa.uioa_state & (UIOA_ENABLED|UIOA_FINI));

	if (mp == NULL && so->so_rcv_q_head != NULL) {
		mp = so->so_rcv_q_head;
		ASSERT(mp->b_prev != NULL);
		mp->b_prev = NULL;
		so->so_rcv_q_head = mp->b_next;
		if (so->so_rcv_q_head == NULL) {
			so->so_rcv_q_last_head = NULL;
		}
		mp->b_next = NULL;
	}

	sod_uioa_mblk_done(sodp, mp);

	if (so->so_rcv_q_head == NULL && so->so_rcv_head != NULL &&
	    DB_TYPE(so->so_rcv_head) == M_DATA &&
	    (DB_FLAGS(so->so_rcv_head) & DBLK_UIOA)) {
		/* more arrived */
		ASSERT(so->so_rcv_q_head == NULL);
		mp = so->so_rcv_head;
		so->so_rcv_head = mp->b_next;
		if (so->so_rcv_head == NULL)
			so->so_rcv_last_head = NULL;
		mp->b_prev = mp->b_next = NULL;
		sod_uioa_mblk_done(sodp, mp);
	}

#ifdef DEBUG
	if (so->so_rcv_q_head != NULL) {
		mblk_t *m = so->so_rcv_q_head;
		while (m != NULL) {
			if (DB_FLAGS(m) & DBLK_UIOA) {
				cmn_err(CE_PANIC, "Unexpected I/OAT mblk %p"
				    " in so_rcv_q_head.\n", (void *)m);
			}
			m = m->b_next;
		}
	}
	if (so->so_rcv_head != NULL) {
		mblk_t *m = so->so_rcv_head;
		while (m != NULL) {
			if (DB_FLAGS(m) & DBLK_UIOA) {
				cmn_err(CE_PANIC, "Unexpected I/OAT mblk %p"
				    " in so_rcv_head.\n", (void *)m);
			}
			m = m->b_next;
		}
	}
#endif
	return (sodp->sod_uioa.uioa_mbytes);
}
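
/*
 * Illustrative usage sketch (not part of the original source): how an
 * in-kernel consumer might drive the socket_*() wrappers above to set up
 * a listening TCP endpoint. The flag, version and backlog values used
 * here (SOCKET_SLEEP, SOV_DEFAULT, 0, 5) are assumptions for the example,
 * and error handling and teardown (socket_close()/socket_destroy()) are
 * abbreviated.
 *
 *	int error;
 *	struct sonode *so;
 *	struct sockaddr_in sin;
 *
 *	so = socket_create(AF_INET, SOCK_STREAM, 0, NULL, NULL,
 *	    SOCKET_SLEEP, SOV_DEFAULT, kcred, &error);
 *	if (so == NULL)
 *		return (error);
 *
 *	bzero(&sin, sizeof (sin));
 *	sin.sin_family = AF_INET;
 *	sin.sin_port = htons(port);
 *	sin.sin_addr.s_addr = htonl(INADDR_ANY);
 *
 *	error = socket_bind(so, (struct sockaddr *)&sin, sizeof (sin),
 *	    0, kcred);
 *	if (error == 0)
 *		error = socket_listen(so, 5, kcred);
 */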