1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <sys/types.h> 28 #include <sys/param.h> 29 #include <sys/systm.h> 30 #include <sys/sysmacros.h> 31 #include <sys/debug.h> 32 #include <sys/cmn_err.h> 33 #include <sys/vfs.h> 34 #include <sys/policy.h> 35 #include <sys/modctl.h> 36 37 #include <sys/sunddi.h> 38 39 #include <sys/strsun.h> 40 #include <sys/stropts.h> 41 #include <sys/strsubr.h> 42 #include <sys/socket.h> 43 #include <sys/socketvar.h> 44 #include <sys/sodirect.h> 45 #include <sys/uio.h> 46 47 #include <inet/ipclassifier.h> 48 #include <fs/sockfs/sockcommon.h> 49 #include <fs/sockfs/nl7c.h> 50 #include <fs/sockfs/socktpi.h> 51 #include <inet/ip.h> 52 53 extern int xnet_skip_checks, xnet_check_print, xnet_truncate_print; 54 55 static struct kmem_cache *sock_sod_cache; 56 57 /* 58 * Common socket access functions. 59 * 60 * Instead of accessing the sonode switch directly (i.e., SOP_xxx()), 61 * the socket_xxx() function should be used. 62 */ 63 64 /* 65 * Try to create a new sonode of the requested <family, type, protocol>. 
 */
/* ARGSUSED */
struct sonode *
socket_create(int family, int type, int protocol, char *devpath, char *mod,
    int flags, int version, struct cred *cr, int *errorp)
{
	struct sonode *so;
	struct sockparams *sp = NULL;

	/*
	 * Look for a sockparams entry that match the given criteria.
	 * solookup() returns with the entry held.
	 */
	*errorp = solookup(family, type, protocol, &sp);
	if (sp == NULL) {
		int kmflags = (flags == SOCKET_SLEEP) ? KM_SLEEP : KM_NOSLEEP;
		/*
		 * There is no matching sockparams entry. An ephemeral entry is
		 * created if the caller specifies a device or a socket module.
		 */
		if (devpath != NULL) {
			sp = sockparams_hold_ephemeral_bydev(family, type,
			    protocol, devpath, kmflags, errorp);
		} else if (mod != NULL) {
			sp = sockparams_hold_ephemeral_bymod(family, type,
			    protocol, mod, kmflags, errorp);
		} else {
			/* No device/module to fall back on; *errorp set above */
			return (NULL);
		}

		if (sp == NULL)
			return (NULL);
	}

	ASSERT(sp->sp_smod_info != NULL);
	ASSERT(flags == SOCKET_SLEEP || flags == SOCKET_NOSLEEP);
	so = sp->sp_smod_info->smod_sock_create_func(sp, family, type,
	    protocol, version, flags, errorp, cr);
	if (so == NULL) {
		/* Creation failed; drop the sockparams hold taken above */
		SOCKPARAMS_DEC_REF(sp);
	} else {
		if ((*errorp = SOP_INIT(so, NULL, cr, flags)) == 0) {
			/* Cannot fail, only bumps so_count */
			(void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, cr, NULL);
		} else {
			/* SOP_INIT failed; tear the new sonode back down */
			socket_destroy(so);
			so = NULL;
		}
	}
	return (so);
}

/*
 * Create a sonode for an incoming connection on listener "parent", using
 * the parent's sockparams, family, type, protocol and version.  The lower
 * layer handle "lh" and downcall table "dc" are attached to the new sonode.
 * Returns NULL with *errorp set on failure.
 */
struct sonode *
socket_newconn(struct sonode *parent, sock_lower_handle_t lh,
    sock_downcalls_t *dc, int flags, int *errorp)
{
	struct sonode *so;
	struct sockparams *sp;
	struct cred *cr;

	/*
	 * This function may be called in interrupt context, and CRED()
	 * will be NULL. In this case, pass in kcred.
	 */
	if ((cr = CRED()) == NULL)
		cr = kcred;

	sp = parent->so_sockparams;
	ASSERT(sp != NULL);

	so = sp->sp_smod_info->smod_sock_create_func(sp, parent->so_family,
	    parent->so_type, parent->so_protocol, parent->so_version, flags,
	    errorp, cr);
	if (so != NULL) {
		/* New sonode shares the parent's sockparams; take a hold */
		SOCKPARAMS_INC_REF(sp);

		so->so_proto_handle = lh;
		so->so_downcalls = dc;
		if ((*errorp = SOP_INIT(so, parent, cr, flags)) == 0) {
			/* Cannot fail, only bumps so_count */
			(void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, cr, NULL);
		} else {
			socket_destroy(so);
			so = NULL;
		}
	}

	return (so);
}

/*
 * Bind local endpoint.
 */
int
socket_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
    int flags, cred_t *cr)
{
	return (SOP_BIND(so, name, namelen, flags, cr));
}

/*
 * Turn socket into a listen socket.
 */
int
socket_listen(struct sonode *so, int backlog, cred_t *cr)
{
	/* A negative backlog is treated as zero */
	if (backlog < 0) {
		backlog = 0;
	}

	/*
	 * Use the same qlimit as in BSD. BSD checks the qlimit
	 * before queuing the next connection implying that a
	 * listen(sock, 0) allows one connection to be queued.
	 * BSD also uses 1.5 times the requested backlog.
	 *
	 * XNS Issue 4 required a strict interpretation of the backlog.
	 * This has been waived subsequently for Issue 4 and the change
	 * incorporated in XNS Issue 5. So we aren't required to do
	 * anything special for XPG apps.
	 */
	if (backlog >= (INT_MAX - 1) / 3)
		backlog = INT_MAX;	/* clamp to avoid signed overflow below */
	else
		backlog = backlog * 3 / 2 + 1;

	return (SOP_LISTEN(so, backlog, cr));
}

/*
 * Accept incoming connection.  On success the accepted sonode is
 * returned via *nsop.
 */
int
socket_accept(struct sonode *lso, int fflag, cred_t *cr, struct sonode **nsop)
{
	return (SOP_ACCEPT(lso, fflag, cr, nsop));
}

/*
 * Active open.
 */
int
socket_connect(struct sonode *so, const struct sockaddr *name,
    socklen_t namelen, int fflag, int flags, cred_t *cr)
{
	int error;

	/*
	 * Handle a connect to a name parameter of type AF_UNSPEC like a
	 * connect to a null address. This is the portable method to
	 * unconnect a socket.
	 */
	if ((namelen >= sizeof (sa_family_t)) &&
	    (name->sa_family == AF_UNSPEC)) {
		name = NULL;
		namelen = 0;
	}

	error = SOP_CONNECT(so, name, namelen, fflag, flags, cr);

	if (error == EHOSTUNREACH && flags & _SOCONNECT_XPG4_2) {
		/*
		 * X/Open specification contains a requirement that
		 * ENETUNREACH be returned but does not require
		 * EHOSTUNREACH. In order to keep the test suite
		 * happy we mess with the errno here.
		 */
		error = ENETUNREACH;
	}

	return (error);
}

/*
 * Get address of remote node.
 */
int
socket_getpeername(struct sonode *so, struct sockaddr *addr,
    socklen_t *addrlen, boolean_t accept, cred_t *cr)
{
	/* Caller must supply a non-zero buffer length */
	ASSERT(*addrlen > 0);
	return (SOP_GETPEERNAME(so, addr, addrlen, accept, cr));

}

/*
 * Get local address.
 */
int
socket_getsockname(struct sonode *so, struct sockaddr *addr,
    socklen_t *addrlen, cred_t *cr)
{
	return (SOP_GETSOCKNAME(so, addr, addrlen, cr));

}

/*
 * Called from shutdown().
 */
int
socket_shutdown(struct sonode *so, int how, cred_t *cr)
{
	return (SOP_SHUTDOWN(so, how, cr));
}

/*
 * Get socket options.
 */
/*ARGSUSED*/
int
socket_getsockopt(struct sonode *so, int level, int option_name,
    void *optval, socklen_t *optlenp, int flags, cred_t *cr)
{
	return (SOP_GETSOCKOPT(so, level, option_name, optval,
	    optlenp, flags, cr));
}

/*
 * Set socket options
 */
int
socket_setsockopt(struct sonode *so, int level, int option_name,
    const void *optval, t_uscalar_t optlen, cred_t *cr)
{
	/* Caller allocates aligned optval, or passes null */
	ASSERT(((uintptr_t)optval & (sizeof (t_scalar_t) - 1)) == 0);
	/* If optval is null optlen is 0, and vice-versa */
	ASSERT(optval != NULL || optlen == 0);
	ASSERT(optlen != 0 || optval == NULL);

	/* No options should be zero-length */
	if (optlen == 0)
		return (EINVAL);

	return (SOP_SETSOCKOPT(so, level, option_name, optval, optlen, cr));
}

/*
 * Send data.  A partial send interrupted by EINTR/ETIME/EWOULDBLOCK is
 * reported as success (caller sees the reduced uio_resid); EPIPE posts
 * SIGPIPE to the current thread unless this is a kernel socket.
 */
int
socket_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
    cred_t *cr)
{
	int error = 0;
	ssize_t orig_resid = uiop->uio_resid;

	/*
	 * Do not bypass the cache if we are doing a local (AF_UNIX) write.
	 */
	if (so->so_family == AF_UNIX)
		uiop->uio_extflg |= UIO_COPY_CACHED;
	else
		uiop->uio_extflg &= ~UIO_COPY_CACHED;

	error = SOP_SENDMSG(so, msg, uiop, cr);
	switch (error) {
	default:
		break;
	case EINTR:
	case ETIME:
	case EWOULDBLOCK:
		/* We did a partial send */
		if (uiop->uio_resid != orig_resid)
			error = 0;
		break;
	case EPIPE:
		if ((so->so_mode & SM_KERNEL) == 0)
			tsignal(curthread, SIGPIPE);
		break;
	}

	return (error);
}

/*
 * Send an mblk chain directly.  Unlike socket_sendmsg(), EPIPE here
 * always posts SIGPIPE (no SM_KERNEL check is made in this path).
 */
int
socket_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
    struct cred *cr, mblk_t **mpp)
{
	int error = 0;

	error = SOP_SENDMBLK(so, msg, fflag, cr, mpp);
	if (error == EPIPE) {
		tsignal(curthread, SIGPIPE);
	}
	return (error);
}

/*
 * Receive data.  A partial read interrupted by EINTR/ETIME/EWOULDBLOCK
 * is reported as success, mirroring socket_sendmsg() above.
 */
int
socket_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
    cred_t *cr)
{
	int error;
	ssize_t orig_resid = uiop->uio_resid;

	/*
	 * Do not bypass the cache when reading data, as the application
	 * is likely to access the data shortly.
	 */
	uiop->uio_extflg |= UIO_COPY_CACHED;

	error = SOP_RECVMSG(so, msg, uiop, cr);

	switch (error) {
	case EINTR:
	case ETIME:
	case EWOULDBLOCK:
		/* We did a partial read */
		if (uiop->uio_resid != orig_resid)
			error = 0;
		break;
	default:
		break;
	}
	return (error);
}

int
socket_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
    struct cred *cr, int32_t *rvalp)
{
	return (SOP_IOCTL(so, cmd, arg, mode, cr, rvalp));
}

int
socket_poll(struct sonode *so, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	return (SOP_POLL(so, events, anyyet, reventsp, phpp));
}

/*
 * Close via the vnode layer; VOP_CLOSE eventually leads to SOP_CLOSE
 * when the last hold goes away.
 */
int
socket_close(struct sonode *so, int flag, struct cred *cr)
{
	return (VOP_CLOSE(SOTOV(so), flag, 1, 0, cr, NULL));
}

/*
 * Protocol-level close; only legal once no holds remain on the sonode.
 */
int
socket_close_internal(struct sonode *so, int flag, cred_t *cr)
{
	ASSERT(so->so_count == 0);

	return (SOP_CLOSE(so, flag, cr));
}

/*
 * Release a sonode by invalidating and releasing its vnode; the final
 * VN_RELE drives the vnode inactive path which tears the sonode down.
 */
void
socket_destroy(struct sonode *so)
{
	vn_invalid(SOTOV(so));
	VN_RELE(SOTOV(so));
}

/* ARGSUSED */
void
socket_destroy_internal(struct sonode *so, cred_t *cr)
{
	struct sockparams *sp = so->so_sockparams;
	ASSERT(so->so_count == 0 && sp != NULL);

	/* Hand the sonode back to its socket module, then drop our hold */
	sp->sp_smod_info->smod_sock_destroy_func(so);

	SOCKPARAMS_DEC_REF(sp);
}

/*
 * TODO Once the common vnode ops is available, then the vnops argument
 * should be removed.
 */
/*ARGSUSED*/
int
sonode_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct sonode *so = buf;
	struct vnode *vp;

	/* Each cached sonode owns a vnode for its whole cache lifetime */
	vp = so->so_vnode = vn_alloc(kmflags);
	if (vp == NULL) {
		return (-1);
	}
	vp->v_data = so;
	vn_setops(vp, socket_vnodeops);

	so->so_priv = NULL;
	so->so_oobmsg = NULL;

	so->so_proto_handle = NULL;

	so->so_peercred = NULL;

	so->so_rcv_queued = 0;
	so->so_rcv_q_head = NULL;
	so->so_rcv_q_last_head = NULL;
	so->so_rcv_head = NULL;
	so->so_rcv_last_head = NULL;
	so->so_rcv_wanted = 0;
	so->so_rcv_timer_interval = SOCKET_NO_RCVTIMER;
	so->so_rcv_timer_tid = 0;
	so->so_rcv_thresh = 0;

	/* Empty accept queue: tail points back at the head pointer */
	so->so_acceptq_head = NULL;
	so->so_acceptq_tail = &so->so_acceptq_head;
	so->so_acceptq_next = NULL;
	so->so_acceptq_len = 0;
	so->so_backlog = 0;

	so->so_snd_qfull = B_FALSE;

	mutex_init(&so->so_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&so->so_acceptq_lock, NULL, MUTEX_DEFAULT, NULL);
	rw_init(&so->so_fallback_rwlock, NULL, RW_DEFAULT, NULL);
	cv_init(&so->so_state_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&so->so_want_cv, NULL, CV_DEFAULT, NULL);

	cv_init(&so->so_acceptq_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&so->so_snd_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&so->so_rcv_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&so->so_copy_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&so->so_closing_cv, NULL, CV_DEFAULT, NULL);

	return (0);
}

/*
 * kmem cache destructor; mirror image of sonode_constructor().  The
 * ASSERTs verify that sonode_fini() left the object in its constructed
 * state before it is returned to the cache.
 */
/*ARGSUSED*/
void
sonode_destructor(void *buf, void *cdrarg)
{
	struct sonode *so = buf;
	struct vnode *vp = SOTOV(so);

	ASSERT(so->so_priv == NULL);
	ASSERT(so->so_peercred == NULL);

	ASSERT(so->so_oobmsg == NULL);

	ASSERT(so->so_rcv_q_head == NULL);

	ASSERT(so->so_acceptq_head == NULL);
	ASSERT(so->so_acceptq_tail == &so->so_acceptq_head);
	ASSERT(so->so_acceptq_next == NULL);

	ASSERT(vp->v_data == so);
	ASSERT(vn_matchops(vp, socket_vnodeops));

	vn_free(vp);

	mutex_destroy(&so->so_lock);
	mutex_destroy(&so->so_acceptq_lock);
	rw_destroy(&so->so_fallback_rwlock);

	cv_destroy(&so->so_state_cv);
	cv_destroy(&so->so_want_cv);
	cv_destroy(&so->so_acceptq_cv);
	cv_destroy(&so->so_snd_cv);
	cv_destroy(&so->so_rcv_cv);
	cv_destroy(&so->so_closing_cv);
}

/*
 * Per-allocation (re)initialization of a sonode taken from the cache:
 * reset all state that varies per socket instance and re-arm the vnode.
 */
void
sonode_init(struct sonode *so, struct sockparams *sp, int family,
    int type, int protocol, sonodeops_t *sops)
{
	vnode_t *vp;

	vp = SOTOV(so);

	so->so_flag = 0;

	so->so_state = 0;
	so->so_mode = 0;

	so->so_count = 0;

	so->so_family = family;
	so->so_type = type;
	so->so_protocol = protocol;

	SOCK_CONNID_INIT(so->so_proto_connid);

	so->so_options = 0;
	so->so_linger.l_onoff = 0;
	so->so_linger.l_linger = 0;
	so->so_sndbuf = 0;
	so->so_error = 0;
	so->so_rcvtimeo = 0;
	so->so_sndtimeo = 0;
	so->so_xpg_rcvbuf = 0;

	ASSERT(so->so_oobmsg == NULL);
	so->so_oobmark = 0;
	so->so_pgrp = 0;

	ASSERT(so->so_peercred == NULL);

	so->so_zoneid = getzoneid();

	so->so_sockparams = sp;

	so->so_ops = sops;

	/* Anything not using the TPI ops vector is a non-STREAMS socket */
	so->so_not_str = (sops != &sotpi_sonodeops);

	so->so_proto_handle = NULL;

	so->so_downcalls = NULL;

	so->so_copyflag = 0;

	ASSERT(so->so_acceptq_head == NULL);
	ASSERT(so->so_acceptq_tail == &so->so_acceptq_head);
	ASSERT(so->so_acceptq_next == NULL);

	vn_reinit(vp);
	vp->v_vfsp = rootvfs;
	vp->v_type = VSOCK;
	vp->v_rdev = sockdev;

	so->so_rcv_queued = 0;
	so->so_rcv_q_head = NULL;
	so->so_rcv_q_last_head = NULL;
	so->so_rcv_head = NULL;
	so->so_rcv_last_head = NULL;

	so->so_snd_qfull = B_FALSE;
	so->so_minpsz = 0;

	so->so_rcv_wakeup = B_FALSE;
	so->so_snd_wakeup = B_FALSE;
	so->so_flowctrld = B_FALSE;

	so->so_pollev = 0;

	bzero(&so->so_poll_list, sizeof (so->so_poll_list));
	bzero(&so->so_proto_props, sizeof (struct sock_proto_props));

	bzero(&(so->so_ksock_callbacks), sizeof (ksocket_callbacks_t));
	so->so_ksock_cb_arg = NULL;

	so->so_max_addr_len = sizeof (struct sockaddr_storage);

	so->so_direct = NULL;

	vn_exists(vp);
}

/*
 * Tear down per-instance sonode state before the object goes back to
 * the cache: cancel the receive timer, flush the accept queue, free any
 * pending OOB message, wake and clean pollers, free sodirect state,
 * invalidate the vnode and drop the peer credential.
 */
void
sonode_fini(struct sonode *so)
{
	mblk_t *mp;
	vnode_t *vp;

	ASSERT(so->so_count == 0);

	if (so->so_rcv_timer_tid) {
		/* untimeout() may block; so_lock must not be held here */
		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
		(void) untimeout(so->so_rcv_timer_tid);
		so->so_rcv_timer_tid = 0;
	}

	so_acceptq_flush(so);

	if ((mp = so->so_oobmsg) != NULL) {
		freemsg(mp);
		so->so_oobmsg = NULL;
		so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA|
		    SS_RCVATMARK);
	}

	if (so->so_poll_list.ph_list != NULL) {
		pollwakeup(&so->so_poll_list, POLLERR);
		pollhead_clean(&so->so_poll_list);
	}

	if (so->so_direct != NULL) {
		sodirect_t *sodp = so->so_direct;

		/* Pending uioa free list must already have been drained */
		ASSERT(sodp->sod_uioafh == NULL);

		so->so_direct = NULL;
		kmem_cache_free(sock_sod_cache, sodp);
	}

	vp = SOTOV(so);
	vn_invalid(vp);

	if (so->so_peercred != NULL) {
		crfree(so->so_peercred);
		so->so_peercred = NULL;
	}
}

/*
 * This function is called at the beginning of recvmsg().
 *
 * If I/OAT is enabled on this sonode, initialize the uioa state machine
 * with state UIOA_ALLOC.
 */
uio_t *
sod_rcv_init(struct sonode *so, int flags, struct uio **uiopp)
{
	struct uio *suiop;
	struct uio *uiop;
	sodirect_t *sodp = so->so_direct;

	/* Not a sodirect socket, nothing to do */
	if (sodp == NULL)
		return (NULL);

	suiop = NULL;
	uiop = *uiopp;

	mutex_enter(sodp->sod_lockp);
	/*
	 * NOTE(review): the "sodp != NULL" re-checks below are redundant
	 * given the early return above; left in place unchanged.
	 */
	if (uiop->uio_resid >= uioasync.mincnt &&
	    sodp != NULL && (sodp->sod_state & SOD_ENABLED) &&
	    uioasync.enabled && !(flags & MSG_PEEK) &&
	    !(so->so_state & SS_CANTRCVMORE)) {
		/*
		 * Big enough I/O for uioa min setup and an sodirect socket
		 * and sodirect enabled and uioa enabled and I/O will be done
		 * and not EOF so initialize the sodirect_t uioa_t with "uiop".
		 */
		if (!uioainit(uiop, &sodp->sod_uioa)) {
			/*
			 * Successful uioainit() so the uio_t part of the
			 * uioa_t will be used for all uio_t work to follow,
			 * we return the original "uiop" in "suiop".
			 */
			suiop = uiop;
			*uiopp = (uio_t *)&sodp->sod_uioa;
			/*
			 * Before returning to the caller the passed in uio_t
			 * "uiop" will be updated via a call to uioafini()
			 * below.
			 *
			 * Note, the uioa.uioa_state isn't set to UIOA_ENABLED
			 * here as first we have to uioamove() any currently
			 * queued M_DATA mblk_t(s) so it will be done later.
			 */
		}
		/*
		 * In either uioainit() success or not case note the number
		 * of uio bytes the caller wants for sod framework and/or
		 * transport (e.g. TCP) strategy.
		 */
		sodp->sod_want = uiop->uio_resid;
	} else if (sodp != NULL && (sodp->sod_state & SOD_ENABLED)) {
		/*
		 * No uioa but still using sodirect so note the number of
		 * uio bytes the caller wants for sodirect framework and/or
		 * transport (e.g. TCP) strategy.
		 */
		sodp->sod_want = uiop->uio_resid;
	}
	mutex_exit(sodp->sod_lockp);

	/* Original uio_t if uioa was armed, else NULL */
	return (suiop);
}

/*
 * This function is called at the end of recvmsg(), it finalizes all the
 * I/OAT operations, and resets the uioa state to UIOA_ALLOC.
 * Caller must hold sod_lockp.
 */
int
sod_rcv_done(struct sonode *so, struct uio *suiop, struct uio *uiop)
{
	int error = 0;
	sodirect_t *sodp = so->so_direct;
	mblk_t *mp;

	if (sodp == NULL) {
		return (0);
	}

	ASSERT(MUTEX_HELD(sodp->sod_lockp));
	/* Finish any sodirect and uioa processing */
	if (suiop != NULL) {
		/* Finish any uioa_t processing */

		ASSERT(uiop == (uio_t *)&sodp->sod_uioa);
		error = uioafini(suiop, (uioa_t *)uiop);
		if ((mp = sodp->sod_uioafh) != NULL) {
			/* Free the uioa pending-free mblk list */
			sodp->sod_uioafh = NULL;
			sodp->sod_uioaft = NULL;
			freemsg(mp);
		}
	}
	ASSERT(sodp->sod_uioafh == NULL);
	if (!(sodp->sod_state & SOD_WAKE_NOT)) {
		/* Awoke; reset wakeup state back to "no wakeup needed" */
		sodp->sod_state &= SOD_WAKE_CLR;
		sodp->sod_state |= SOD_WAKE_NOT;
	}
	/* Last, clear sod_want value */
	sodp->sod_want = 0;

	return (error);
}

/*
 * Schedule a uioamove() on a mblk. This is usually called from
 * protocols (e.g. TCP) on a I/OAT enabled sonode.
 */
mblk_t *
sod_uioa_mblk_init(struct sodirect_s *sodp, mblk_t *mp, size_t msg_size)
{
	uioa_t *uioap = &sodp->sod_uioa;
	mblk_t *mp1 = mp;
	mblk_t *lmp = NULL;

	ASSERT(DB_TYPE(mp) == M_DATA);
	ASSERT(msg_size == msgdsize(mp));

	/* Caller must have lock held */
	ASSERT(MUTEX_HELD(sodp->sod_lockp));

	if (uioap->uioa_state & UIOA_ENABLED) {
		/* Uioa is enabled */

		if (msg_size > uioap->uio_resid) {
			/*
			 * There isn't enough uio space for the mblk_t chain
			 * so disable uioa such that this and any additional
			 * mblk_t data is handled by the socket and schedule
			 * the socket for wakeup to finish this uioa.
784 */ 785 uioap->uioa_state &= UIOA_CLR; 786 uioap->uioa_state |= UIOA_FINI; 787 if (sodp->sod_state & SOD_WAKE_NOT) { 788 sodp->sod_state &= SOD_WAKE_CLR; 789 sodp->sod_state |= SOD_WAKE_NEED; 790 } 791 return (mp); 792 } 793 do { 794 uint32_t len = MBLKL(mp1); 795 796 if (!uioamove(mp1->b_rptr, len, UIO_READ, uioap)) { 797 /* Scheduled, mark dblk_t as such */ 798 DB_FLAGS(mp1) |= DBLK_UIOA; 799 } else { 800 /* Error, turn off async processing */ 801 uioap->uioa_state &= UIOA_CLR; 802 uioap->uioa_state |= UIOA_FINI; 803 break; 804 } 805 lmp = mp1; 806 } while ((mp1 = mp1->b_cont) != NULL); 807 808 if (mp1 != NULL || uioap->uio_resid == 0) { 809 /* 810 * Not all mblk_t(s) uioamoved (error) or all uio 811 * space has been consumed so schedule the socket 812 * for wakeup to finish this uio. 813 */ 814 sodp->sod_state &= SOD_WAKE_CLR; 815 sodp->sod_state |= SOD_WAKE_NEED; 816 817 /* Break the mblk chain if neccessary. */ 818 if (mp1 != NULL && lmp != NULL) { 819 mp->b_next = mp1; 820 lmp->b_cont = NULL; 821 } 822 } 823 } 824 return (mp1); 825 } 826 827 /* 828 * This function is called on a mblk that thas been successfully uioamoved(). 829 */ 830 void 831 sod_uioa_mblk_done(sodirect_t *sodp, mblk_t *bp) 832 { 833 if (bp != NULL && (bp->b_datap->db_flags & DBLK_UIOA)) { 834 /* 835 * A uioa flaged mblk_t chain, already uio processed, 836 * add it to the sodirect uioa pending free list. 837 * 838 * Note, a b_cont chain headed by a DBLK_UIOA enable 839 * mblk_t must have all mblk_t(s) DBLK_UIOA enabled. 840 */ 841 mblk_t *bpt = sodp->sod_uioaft; 842 843 ASSERT(sodp != NULL); 844 845 /* 846 * Add first mblk_t of "bp" chain to current sodirect uioa 847 * free list tail mblk_t, if any, else empty list so new head. 848 */ 849 if (bpt == NULL) 850 sodp->sod_uioafh = bp; 851 else 852 bpt->b_cont = bp; 853 854 /* 855 * Walk mblk_t "bp" chain to find tail and adjust rptr of 856 * each to reflect that uioamove() has consumed all data. 
857 */ 858 bpt = bp; 859 for (;;) { 860 ASSERT(bpt->b_datap->db_flags & DBLK_UIOA); 861 862 bpt->b_rptr = bpt->b_wptr; 863 if (bpt->b_cont == NULL) 864 break; 865 bpt = bpt->b_cont; 866 } 867 /* New sodirect uioa free list tail */ 868 sodp->sod_uioaft = bpt; 869 870 /* Only dequeue once with data returned per uioa_t */ 871 if (sodp->sod_uioa.uioa_state & UIOA_ENABLED) { 872 sodp->sod_uioa.uioa_state &= UIOA_CLR; 873 sodp->sod_uioa.uioa_state |= UIOA_FINI; 874 } 875 } 876 } 877 878 /* 879 * When transit from UIOA_INIT state to UIOA_ENABLE state in recvmsg(), call 880 * this function on a non-STREAMS socket to schedule uioamove() on the data 881 * that has already queued in this socket. 882 */ 883 void 884 sod_uioa_so_init(struct sonode *so, struct sodirect_s *sodp, struct uio *uiop) 885 { 886 uioa_t *uioap = (uioa_t *)uiop; 887 mblk_t *lbp; 888 mblk_t *wbp; 889 mblk_t *bp; 890 int len; 891 int error; 892 boolean_t in_rcv_q = B_TRUE; 893 894 ASSERT(MUTEX_HELD(sodp->sod_lockp)); 895 ASSERT(&sodp->sod_uioa == uioap); 896 897 /* 898 * Walk first b_cont chain in sod_q 899 * and schedule any M_DATA mblk_t's for uio asynchronous move. 
 */
	bp = so->so_rcv_q_head;

again:
	/* Walk the chain */
	lbp = NULL;
	wbp = bp;

	do {
		if (bp == NULL)
			break;

		if (wbp->b_datap->db_type != M_DATA) {
			/* Not M_DATA, no more uioa */
			goto nouioa;
		}
		if ((len = wbp->b_wptr - wbp->b_rptr) > 0) {
			/* Have a M_DATA mblk_t with data */
			if (len > uioap->uio_resid || (so->so_oobmark > 0 &&
			    len + uioap->uioa_mbytes >= so->so_oobmark)) {
				/* Not enough uio space, or beyond oobmark */
				goto nouioa;
			}
			ASSERT(!(wbp->b_datap->db_flags & DBLK_UIOA));
			error = uioamove(wbp->b_rptr, len,
			    UIO_READ, uioap);
			if (!error) {
				/* Scheduled, mark dblk_t as such */
				wbp->b_datap->db_flags |= DBLK_UIOA;
			} else {
				/* Break the mblk chain */
				goto nouioa;
			}
		}
		/* Save last wbp processed */
		lbp = wbp;
	} while ((wbp = wbp->b_cont) != NULL);

	if (in_rcv_q && (bp == NULL || bp->b_next == NULL)) {
		/*
		 * We get here only once to process the sonode dump area
		 * if so_rcv_q_head is NULL or all the mblks have been
		 * successfully uioamoved()ed.
		 */
		in_rcv_q = B_FALSE;

		/* move to dump area */
		bp = so->so_rcv_head;
		goto again;
	}

	return;

nouioa:
	/* No more uioa */
	uioap->uioa_state &= UIOA_CLR;
	uioap->uioa_state |= UIOA_FINI;

	/*
	 * If we processed 1 or more mblk_t(s) then we need to split the
	 * current mblk_t chain in 2 so that all the uioamove()ed mblk_t(s)
	 * are in the current chain and the rest are in the following new
	 * chain.
	 */
	if (lbp != NULL) {
		/* New end of current chain */
		lbp->b_cont = NULL;

		/* Insert new chain wbp after bp */
		if ((wbp->b_next = bp->b_next) == NULL) {
			/*
			 * No need to grab so_lock, since sod_lockp
			 * points to so_lock.
			 */
			if (in_rcv_q)
				so->so_rcv_q_last_head = wbp;
			else
				so->so_rcv_last_head = wbp;
		}
		bp->b_next = wbp;
		bp->b_next->b_prev = bp->b_prev;
		bp->b_prev = lbp;
	}
}

/*
 * Initialize sodirect data structures on a socket.
 */
void
sod_sock_init(struct sonode *so, struct stdata *stp, sod_enq_func enq_func,
    sod_wakeup_func wake_func, kmutex_t *lockp)
{
	sodirect_t *sodp;

	ASSERT(so->so_direct == NULL);

	so->so_state |= SS_SODIRECT;

	sodp = kmem_cache_alloc(sock_sod_cache, KM_SLEEP);
	sodp->sod_state = SOD_ENABLED | SOD_WAKE_NOT;
	sodp->sod_want = 0;
	/* For a STREAMS socket, sod_q is the read-side queue */
	sodp->sod_q = (stp != NULL) ? RD(stp->sd_wrq) : NULL;
	sodp->sod_enqueue = enq_func;
	sodp->sod_wakeup = wake_func;
	sodp->sod_uioafh = NULL;
	sodp->sod_uioaft = NULL;
	sodp->sod_lockp = lockp;
	/*
	 * Remainder of the sod_uioa members are left uninitialized
	 * but will be initialized later by uioainit() before uioa
	 * is enabled.
	 */
	sodp->sod_uioa.uioa_state = UIOA_ALLOC;
	so->so_direct = sodp;
	if (stp != NULL)
		stp->sd_sodirect = sodp;
}

/*
 * Init the sodirect kmem cache while sockfs is loading.
 */
void
sod_init()
{
	/* Allocate sodirect_t kmem_cache */
	sock_sod_cache = kmem_cache_create("sock_sod_cache",
	    sizeof (sodirect_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
}

/*
 * Finish uioa processing for queued receive data: dequeue the head of
 * so_rcv_q_head (or use the passed-in "mp"), mark it done, and pick up a
 * freshly arrived DBLK_UIOA chain from so_rcv_head if present.  Returns
 * the number of bytes uioamoved for this uioa_t.  Caller holds sod_lockp.
 */
ssize_t
sod_uioa_mblk(struct sonode *so, mblk_t *mp)
{
	sodirect_t *sodp = so->so_direct;

	ASSERT(sodp != NULL);
	ASSERT(MUTEX_HELD(sodp->sod_lockp));

	ASSERT(sodp->sod_state & SOD_ENABLED);
	ASSERT(sodp->sod_uioa.uioa_state != (UIOA_ALLOC|UIOA_INIT));

	ASSERT(sodp->sod_uioa.uioa_state & (UIOA_ENABLED|UIOA_FINI));

	if (mp == NULL && so->so_rcv_q_head != NULL) {
		/* No mblk passed in; dequeue the head of the receive queue */
		mp = so->so_rcv_q_head;
		ASSERT(mp->b_prev != NULL);
		mp->b_prev = NULL;
		so->so_rcv_q_head = mp->b_next;
		if (so->so_rcv_q_head == NULL) {
			so->so_rcv_q_last_head = NULL;
		}
		mp->b_next = NULL;
	}

	sod_uioa_mblk_done(sodp, mp);

	if (so->so_rcv_q_head == NULL && so->so_rcv_head != NULL &&
	    DB_TYPE(so->so_rcv_head) == M_DATA &&
	    (DB_FLAGS(so->so_rcv_head) & DBLK_UIOA)) {
		/* more arrived */
		ASSERT(so->so_rcv_q_head == NULL);
		mp = so->so_rcv_head;
		so->so_rcv_head = mp->b_next;
		if (so->so_rcv_head == NULL)
			so->so_rcv_last_head = NULL;
		mp->b_prev = mp->b_next = NULL;
		sod_uioa_mblk_done(sodp, mp);
	}

#ifdef DEBUG
	/* Neither queue may still hold a DBLK_UIOA mblk at this point */
	if (so->so_rcv_q_head != NULL) {
		mblk_t *m = so->so_rcv_q_head;
		while (m != NULL) {
			if (DB_FLAGS(m) & DBLK_UIOA) {
				cmn_err(CE_PANIC, "Unexpected I/OAT mblk %p"
				    " in so_rcv_q_head.\n", (void *)m);
			}
			m = m->b_next;
		}
	}
	if (so->so_rcv_head != NULL) {
		mblk_t *m = so->so_rcv_head;
		while (m != NULL) {
			if (DB_FLAGS(m) & DBLK_UIOA) {
				cmn_err(CE_PANIC, "Unexpected I/OAT mblk %p"
				    " in so_rcv_head.\n", (void *)m);
			}
			m = m->b_next;
		}
	}
#endif
	return (sodp->sod_uioa.uioa_mbytes);
}