1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <sys/types.h> 28 #include <sys/param.h> 29 #include <sys/systm.h> 30 #include <sys/sysmacros.h> 31 #include <sys/debug.h> 32 #include <sys/cmn_err.h> 33 #include <sys/vfs.h> 34 #include <sys/policy.h> 35 #include <sys/modctl.h> 36 37 #include <sys/sunddi.h> 38 39 #include <sys/strsun.h> 40 #include <sys/stropts.h> 41 #include <sys/strsubr.h> 42 #include <sys/socket.h> 43 #include <sys/socketvar.h> 44 #include <sys/sodirect.h> 45 #include <sys/uio.h> 46 47 #include <inet/ipclassifier.h> 48 #include <fs/sockfs/sockcommon.h> 49 #include <fs/sockfs/nl7c.h> 50 #include <fs/sockfs/socktpi.h> 51 #include <inet/ip.h> 52 53 extern int xnet_skip_checks, xnet_check_print, xnet_truncate_print; 54 55 static struct kmem_cache *sock_sod_cache; 56 57 /* 58 * Common socket access functions. 59 * 60 * Instead of accessing the sonode switch directly (i.e., SOP_xxx()), 61 * the socket_xxx() function should be used. 62 */ 63 64 /* 65 * Try to create a new sonode of the requested <family, type, protocol>. 
 */
/* ARGSUSED */
struct sonode *
socket_create(int family, int type, int protocol, char *devpath, char *mod,
    int flags, int version, struct cred *cr, int *errorp)
{
	struct sonode *so;
	struct sockparams *sp = NULL;
	int saved_error;

	/*
	 * Look for a sockparams entry that match the given criteria.
	 * solookup() returns with the entry held.
	 */
	*errorp = solookup(family, type, protocol, &sp);
	saved_error = *errorp;
	if (sp == NULL) {
		int kmflags = (flags == SOCKET_SLEEP) ? KM_SLEEP : KM_NOSLEEP;
		/*
		 * There is no matching sockparams entry. An ephemeral entry is
		 * created if the caller specifies a device or a socket module.
		 */
		if (devpath != NULL) {
			saved_error = 0;
			sp = sockparams_hold_ephemeral_bydev(family, type,
			    protocol, devpath, kmflags, errorp);
		} else if (mod != NULL) {
			saved_error = 0;
			sp = sockparams_hold_ephemeral_bymod(family, type,
			    protocol, mod, kmflags, errorp);
		} else {
			/* Retry with the wildcard (0) protocol. */
			*errorp = solookup(family, type, 0, &sp);
		}

		if (sp == NULL) {
			/*
			 * The exact <family, type, protocol> lookup failed
			 * first; prefer its error over a generic "no such
			 * protocol" error from the fallback lookup.
			 */
			if (saved_error && (*errorp == EPROTONOSUPPORT ||
			    *errorp == EPROTOTYPE || *errorp == ENOPROTOOPT))
				*errorp = saved_error;
			return (NULL);
		}
	}

	ASSERT(sp->sp_smod_info != NULL);
	ASSERT(flags == SOCKET_SLEEP || flags == SOCKET_NOSLEEP);
	sp->sp_stats.sps_ncreate.value.ui64++;
	so = sp->sp_smod_info->smod_sock_create_func(sp, family, type,
	    protocol, version, flags, errorp, cr);
	if (so == NULL) {
		/* Creation failed; drop the hold solookup() gave us. */
		SOCKPARAMS_DEC_REF(sp);
	} else {
		if ((*errorp = SOP_INIT(so, NULL, cr, flags)) == 0) {
			/* Cannot fail, only bumps so_count */
			(void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, cr, NULL);
		} else {
			/* Same error-preference rule as the lookup above. */
			if (saved_error && (*errorp == EPROTONOSUPPORT ||
			    *errorp == EPROTOTYPE || *errorp == ENOPROTOOPT))
				*errorp = saved_error;
			socket_destroy(so);
			so = NULL;
		}
	}
	return (so);
}

/*
 * Create the sonode for a new connection arriving on listener "parent",
 * wiring in the lower-layer handle "lh" and downcall table "dc".
 */
struct sonode *
socket_newconn(struct sonode *parent, sock_lower_handle_t lh,
    sock_downcalls_t *dc, int flags, int *errorp)
{
	struct sonode *so;
	struct sockparams *sp;
	struct cred *cr;

	/*
	 * This function may be called in interrupt context, and CRED()
	 * will be NULL. In this case, pass in kcred.
	 */
	if ((cr = CRED()) == NULL)
		cr = kcred;

	sp = parent->so_sockparams;
	ASSERT(sp != NULL);

	sp->sp_stats.sps_ncreate.value.ui64++;
	so = sp->sp_smod_info->smod_sock_create_func(sp, parent->so_family,
	    parent->so_type, parent->so_protocol, parent->so_version, flags,
	    errorp, cr);
	if (so != NULL) {
		/* The new sonode shares the parent's sockparams entry. */
		SOCKPARAMS_INC_REF(sp);

		so->so_proto_handle = lh;
		so->so_downcalls = dc;

		if ((*errorp = SOP_INIT(so, parent, cr, flags)) == 0) {
			/* Cannot fail, only bumps so_count */
			(void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, cr, NULL);
		} else {
			socket_destroy(so);
			so = NULL;
		}
	}

	return (so);
}

/*
 * Bind local endpoint.
 */
int
socket_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
    int flags, cred_t *cr)
{
	return (SOP_BIND(so, name, namelen, flags, cr));
}

/*
 * Turn socket into a listen socket.
 */
int
socket_listen(struct sonode *so, int backlog, cred_t *cr)
{
	if (backlog < 0) {
		backlog = 0;
	}

	/*
	 * Use the same qlimit as in BSD. BSD checks the qlimit
	 * before queuing the next connection implying that a
	 * listen(sock, 0) allows one connection to be queued.
	 * BSD also uses 1.5 times the requested backlog.
	 *
	 * XNS Issue 4 required a strict interpretation of the backlog.
	 * This has been waived subsequently for Issue 4 and the change
	 * incorporated in XNS Issue 5. So we aren't required to do
	 * anything special for XPG apps.
	 */
	if (backlog >= (INT_MAX - 1) / 3)
		backlog = INT_MAX;	/* 1.5x scaling would overflow */
	else
		backlog = backlog * 3 / 2 + 1;

	return (SOP_LISTEN(so, backlog, cr));
}

/*
 * Accept incoming connection.
 */
int
socket_accept(struct sonode *lso, int fflag, cred_t *cr, struct sonode **nsop)
{
	return (SOP_ACCEPT(lso, fflag, cr, nsop));
}

/*
 * Active open.
 */
int
socket_connect(struct sonode *so, const struct sockaddr *name,
    socklen_t namelen, int fflag, int flags, cred_t *cr)
{
	int error;

	/*
	 * Handle a connect to a name parameter of type AF_UNSPEC like a
	 * connect to a null address. This is the portable method to
	 * unconnect a socket.
	 */
	if ((namelen >= sizeof (sa_family_t)) &&
	    (name->sa_family == AF_UNSPEC)) {
		name = NULL;
		namelen = 0;
	}

	error = SOP_CONNECT(so, name, namelen, fflag, flags, cr);

	if (error == EHOSTUNREACH && flags & _SOCONNECT_XPG4_2) {
		/*
		 * X/Open specification contains a requirement that
		 * ENETUNREACH be returned but does not require
		 * EHOSTUNREACH. In order to keep the test suite
		 * happy we mess with the errno here.
		 */
		error = ENETUNREACH;
	}

	return (error);
}

/*
 * Get address of remote node.
 */
int
socket_getpeername(struct sonode *so, struct sockaddr *addr,
    socklen_t *addrlen, boolean_t accept, cred_t *cr)
{
	ASSERT(*addrlen > 0);
	return (SOP_GETPEERNAME(so, addr, addrlen, accept, cr));
}

/*
 * Get local address.
 */
int
socket_getsockname(struct sonode *so, struct sockaddr *addr,
    socklen_t *addrlen, cred_t *cr)
{
	return (SOP_GETSOCKNAME(so, addr, addrlen, cr));
}

/*
 * Called from shutdown().
 */
int
socket_shutdown(struct sonode *so, int how, cred_t *cr)
{
	return (SOP_SHUTDOWN(so, how, cr));
}

/*
 * Get socket options.
 */
/*ARGSUSED*/
int
socket_getsockopt(struct sonode *so, int level, int option_name,
    void *optval, socklen_t *optlenp, int flags, cred_t *cr)
{
	return (SOP_GETSOCKOPT(so, level, option_name, optval,
	    optlenp, flags, cr));
}

/*
 * Set socket options.
 */
int
socket_setsockopt(struct sonode *so, int level, int option_name,
    const void *optval, t_uscalar_t optlen, cred_t *cr)
{
	int val = 1;
	/* Caller allocates aligned optval, or passes null */
	ASSERT(((uintptr_t)optval & (sizeof (t_scalar_t) - 1)) == 0);
	/* If optval is null optlen is 0, and vice-versa */
	ASSERT(optval != NULL || optlen == 0);
	ASSERT(optlen != 0 || optval == NULL);

	/* Substitute a non-null dummy value when the caller passed none. */
	if (optval == NULL && optlen == 0)
		optval = &val;

	return (SOP_SETSOCKOPT(so, level, option_name, optval, optlen, cr));
}

/*
 * Send data described by "msg"/"uiop" on the socket.
 */
int
socket_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
    cred_t *cr)
{
	int error = 0;
	/* Remember the original count so a partial send can be detected. */
	ssize_t orig_resid = uiop->uio_resid;

	/*
	 * Do not bypass the cache if we are doing a local (AF_UNIX) write.
	 */
	if (so->so_family == AF_UNIX)
		uiop->uio_extflg |= UIO_COPY_CACHED;
	else
		uiop->uio_extflg &= ~UIO_COPY_CACHED;

	error = SOP_SENDMSG(so, msg, uiop, cr);
	switch (error) {
	default:
		break;
	case EINTR:
	/* EAGAIN is EWOULDBLOCK */
	case EWOULDBLOCK:
		/* We did a partial send */
		if (uiop->uio_resid != orig_resid)
			error = 0;
		break;
	case EPIPE:
		/* Non-kernel sockets get the traditional SIGPIPE. */
		if ((so->so_mode & SM_KERNEL) == 0)
			tsignal(curthread, SIGPIPE);
		break;
	}

	return (error);
}

/*
 * Send an mblk chain down the socket; *mpp is consumed on success.
 * NOTE(review): ownership of *mpp on failure is determined by the
 * protocol's sendmblk entry point - confirm against SOP_SENDMBLK users.
 */
int
socket_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
    struct cred *cr, mblk_t **mpp)
{
	int error = 0;

	error = SOP_SENDMBLK(so, msg, fflag, cr, mpp);
	if (error == EPIPE) {
		tsignal(curthread, SIGPIPE);
	}
	return (error);
}

/*
 * Receive data from the socket into "msg"/"uiop".
 */
int
socket_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
    cred_t *cr)
{
	int error;
	/* Remember the original count so a partial read can be detected. */
	ssize_t orig_resid = uiop->uio_resid;

	/*
	 * Do not bypass the cache when reading data, as the application
	 * is likely to access the data shortly.
	 */
	uiop->uio_extflg |= UIO_COPY_CACHED;

	error = SOP_RECVMSG(so, msg, uiop, cr);

	switch (error) {
	case EINTR:
	/* EAGAIN is EWOULDBLOCK */
	case EWOULDBLOCK:
		/* We did a partial read */
		if (uiop->uio_resid != orig_resid)
			error = 0;
		break;
	default:
		break;
	}
	return (error);
}

/*
 * Pass an ioctl through to the protocol.
 */
int
socket_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
    struct cred *cr, int32_t *rvalp)
{
	return (SOP_IOCTL(so, cmd, arg, mode, cr, rvalp));
}

/*
 * Poll for the requested events.
 */
int
socket_poll(struct sonode *so, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	return (SOP_POLL(so, events, anyyet, reventsp, phpp));
}

/*
 * Close the socket via its vnode; VOP_CLOSE drops the hold taken at
 * create time.
 */
int
socket_close(struct sonode *so, int flag, struct cred *cr)
{
	return (VOP_CLOSE(SOTOV(so), flag, 1, 0, cr, NULL));
}

/*
 * Protocol-level close; only legal once all references are gone.
 */
int
socket_close_internal(struct sonode *so, int flag, cred_t *cr)
{
	ASSERT(so->so_count == 0);

	return (SOP_CLOSE(so, flag, cr));
}

/*
 * Invalidate the vnode and release our reference to it.
 */
void
socket_destroy(struct sonode *so)
{
	vn_invalid(SOTOV(so));
	VN_RELE(SOTOV(so));
}

/* ARGSUSED */
void
socket_destroy_internal(struct sonode *so, cred_t *cr)
{
	struct sockparams *sp = so->so_sockparams;
	ASSERT(so->so_count == 0 && sp != NULL);

	/* Let the socket module free the sonode, then drop our sp hold. */
	sp->sp_smod_info->smod_sock_destroy_func(so);

	SOCKPARAMS_DEC_REF(sp);
}

/*
 * TODO Once the common vnode ops is available, then the vnops argument
 * should be removed.
443 */ 444 /*ARGSUSED*/ 445 int 446 sonode_constructor(void *buf, void *cdrarg, int kmflags) 447 { 448 struct sonode *so = buf; 449 struct vnode *vp; 450 451 vp = so->so_vnode = vn_alloc(kmflags); 452 if (vp == NULL) { 453 return (-1); 454 } 455 vp->v_data = so; 456 vn_setops(vp, socket_vnodeops); 457 458 so->so_priv = NULL; 459 so->so_oobmsg = NULL; 460 461 so->so_proto_handle = NULL; 462 463 so->so_peercred = NULL; 464 465 so->so_rcv_queued = 0; 466 so->so_rcv_q_head = NULL; 467 so->so_rcv_q_last_head = NULL; 468 so->so_rcv_head = NULL; 469 so->so_rcv_last_head = NULL; 470 so->so_rcv_wanted = 0; 471 so->so_rcv_timer_interval = SOCKET_NO_RCVTIMER; 472 so->so_rcv_timer_tid = 0; 473 so->so_rcv_thresh = 0; 474 475 so->so_acceptq_head = NULL; 476 so->so_acceptq_tail = &so->so_acceptq_head; 477 so->so_acceptq_next = NULL; 478 so->so_acceptq_len = 0; 479 so->so_backlog = 0; 480 481 so->so_snd_qfull = B_FALSE; 482 483 mutex_init(&so->so_lock, NULL, MUTEX_DEFAULT, NULL); 484 mutex_init(&so->so_acceptq_lock, NULL, MUTEX_DEFAULT, NULL); 485 rw_init(&so->so_fallback_rwlock, NULL, RW_DEFAULT, NULL); 486 cv_init(&so->so_state_cv, NULL, CV_DEFAULT, NULL); 487 cv_init(&so->so_want_cv, NULL, CV_DEFAULT, NULL); 488 489 cv_init(&so->so_acceptq_cv, NULL, CV_DEFAULT, NULL); 490 cv_init(&so->so_snd_cv, NULL, CV_DEFAULT, NULL); 491 cv_init(&so->so_rcv_cv, NULL, CV_DEFAULT, NULL); 492 cv_init(&so->so_copy_cv, NULL, CV_DEFAULT, NULL); 493 cv_init(&so->so_closing_cv, NULL, CV_DEFAULT, NULL); 494 495 return (0); 496 } 497 498 /*ARGSUSED*/ 499 void 500 sonode_destructor(void *buf, void *cdrarg) 501 { 502 struct sonode *so = buf; 503 struct vnode *vp = SOTOV(so); 504 505 ASSERT(so->so_priv == NULL); 506 ASSERT(so->so_peercred == NULL); 507 508 ASSERT(so->so_oobmsg == NULL); 509 510 ASSERT(so->so_rcv_q_head == NULL); 511 512 ASSERT(so->so_acceptq_head == NULL); 513 ASSERT(so->so_acceptq_tail == &so->so_acceptq_head); 514 ASSERT(so->so_acceptq_next == NULL); 515 516 ASSERT(vp->v_data == so); 
517 ASSERT(vn_matchops(vp, socket_vnodeops)); 518 519 vn_free(vp); 520 521 mutex_destroy(&so->so_lock); 522 mutex_destroy(&so->so_acceptq_lock); 523 rw_destroy(&so->so_fallback_rwlock); 524 525 cv_destroy(&so->so_state_cv); 526 cv_destroy(&so->so_want_cv); 527 cv_destroy(&so->so_acceptq_cv); 528 cv_destroy(&so->so_snd_cv); 529 cv_destroy(&so->so_rcv_cv); 530 cv_destroy(&so->so_closing_cv); 531 } 532 533 void 534 sonode_init(struct sonode *so, struct sockparams *sp, int family, 535 int type, int protocol, sonodeops_t *sops) 536 { 537 vnode_t *vp; 538 539 vp = SOTOV(so); 540 541 so->so_flag = 0; 542 543 so->so_state = 0; 544 so->so_mode = 0; 545 546 so->so_count = 0; 547 548 so->so_family = family; 549 so->so_type = type; 550 so->so_protocol = protocol; 551 552 SOCK_CONNID_INIT(so->so_proto_connid); 553 554 so->so_options = 0; 555 so->so_linger.l_onoff = 0; 556 so->so_linger.l_linger = 0; 557 so->so_sndbuf = 0; 558 so->so_error = 0; 559 so->so_rcvtimeo = 0; 560 so->so_sndtimeo = 0; 561 so->so_xpg_rcvbuf = 0; 562 563 ASSERT(so->so_oobmsg == NULL); 564 so->so_oobmark = 0; 565 so->so_pgrp = 0; 566 567 ASSERT(so->so_peercred == NULL); 568 569 so->so_zoneid = getzoneid(); 570 571 so->so_sockparams = sp; 572 573 so->so_ops = sops; 574 575 so->so_not_str = (sops != &sotpi_sonodeops); 576 577 so->so_proto_handle = NULL; 578 579 so->so_downcalls = NULL; 580 581 so->so_copyflag = 0; 582 583 ASSERT(so->so_acceptq_head == NULL); 584 ASSERT(so->so_acceptq_tail == &so->so_acceptq_head); 585 ASSERT(so->so_acceptq_next == NULL); 586 587 vn_reinit(vp); 588 vp->v_vfsp = rootvfs; 589 vp->v_type = VSOCK; 590 vp->v_rdev = sockdev; 591 592 so->so_rcv_queued = 0; 593 so->so_rcv_q_head = NULL; 594 so->so_rcv_q_last_head = NULL; 595 so->so_rcv_head = NULL; 596 so->so_rcv_last_head = NULL; 597 598 so->so_snd_qfull = B_FALSE; 599 so->so_minpsz = 0; 600 601 so->so_rcv_wakeup = B_FALSE; 602 so->so_snd_wakeup = B_FALSE; 603 so->so_flowctrld = B_FALSE; 604 605 so->so_pollev = 0; 606 
bzero(&so->so_poll_list, sizeof (so->so_poll_list)); 607 bzero(&so->so_proto_props, sizeof (struct sock_proto_props)); 608 609 bzero(&(so->so_ksock_callbacks), sizeof (ksocket_callbacks_t)); 610 so->so_ksock_cb_arg = NULL; 611 612 so->so_max_addr_len = sizeof (struct sockaddr_storage); 613 614 so->so_direct = NULL; 615 616 vn_exists(vp); 617 } 618 619 void 620 sonode_fini(struct sonode *so) 621 { 622 mblk_t *mp; 623 vnode_t *vp; 624 625 ASSERT(so->so_count == 0); 626 627 if (so->so_rcv_timer_tid) { 628 ASSERT(MUTEX_NOT_HELD(&so->so_lock)); 629 (void) untimeout(so->so_rcv_timer_tid); 630 so->so_rcv_timer_tid = 0; 631 } 632 633 so_acceptq_flush(so); 634 635 if ((mp = so->so_oobmsg) != NULL) { 636 freemsg(mp); 637 so->so_oobmsg = NULL; 638 so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA| 639 SS_RCVATMARK); 640 } 641 642 if (so->so_poll_list.ph_list != NULL) { 643 pollwakeup(&so->so_poll_list, POLLERR); 644 pollhead_clean(&so->so_poll_list); 645 } 646 647 if (so->so_direct != NULL) { 648 sodirect_t *sodp = so->so_direct; 649 650 ASSERT(sodp->sod_uioafh == NULL); 651 652 so->so_direct = NULL; 653 kmem_cache_free(sock_sod_cache, sodp); 654 } 655 656 vp = SOTOV(so); 657 vn_invalid(vp); 658 659 if (so->so_peercred != NULL) { 660 crfree(so->so_peercred); 661 so->so_peercred = NULL; 662 } 663 } 664 665 /* 666 * This function is called at the beginning of recvmsg(). 667 * 668 * If I/OAT is enabled on this sonode, initialize the uioa state machine 669 * with state UIOA_ALLOC. 
670 */ 671 uio_t * 672 sod_rcv_init(struct sonode *so, int flags, struct uio **uiopp) 673 { 674 struct uio *suiop; 675 struct uio *uiop; 676 sodirect_t *sodp = so->so_direct; 677 678 if (sodp == NULL) 679 return (NULL); 680 681 suiop = NULL; 682 uiop = *uiopp; 683 684 mutex_enter(sodp->sod_lockp); 685 if (uiop->uio_resid >= uioasync.mincnt && 686 sodp != NULL && (sodp->sod_state & SOD_ENABLED) && 687 uioasync.enabled && !(flags & MSG_PEEK) && 688 !(so->so_state & SS_CANTRCVMORE)) { 689 /* 690 * Big enough I/O for uioa min setup and an sodirect socket 691 * and sodirect enabled and uioa enabled and I/O will be done 692 * and not EOF so initialize the sodirect_t uioa_t with "uiop". 693 */ 694 if (!uioainit(uiop, &sodp->sod_uioa)) { 695 /* 696 * Successful uioainit() so the uio_t part of the 697 * uioa_t will be used for all uio_t work to follow, 698 * we return the original "uiop" in "suiop". 699 */ 700 suiop = uiop; 701 *uiopp = (uio_t *)&sodp->sod_uioa; 702 /* 703 * Before returning to the caller the passed in uio_t 704 * "uiop" will be updated via a call to uioafini() 705 * below. 706 * 707 * Note, the uioa.uioa_state isn't set to UIOA_ENABLED 708 * here as first we have to uioamove() any currently 709 * queued M_DATA mblk_t(s) so it will be done later. 710 */ 711 } 712 /* 713 * In either uioainit() success or not case note the number 714 * of uio bytes the caller wants for sod framework and/or 715 * transport (e.g. TCP) strategy. 716 */ 717 sodp->sod_want = uiop->uio_resid; 718 } else if (sodp != NULL && (sodp->sod_state & SOD_ENABLED)) { 719 /* 720 * No uioa but still using sodirect so note the number of 721 * uio bytes the caller wants for sodirect framework and/or 722 * transport (e.g. TCP) strategy. 
	 */
		sodp->sod_want = uiop->uio_resid;
	}
	mutex_exit(sodp->sod_lockp);

	return (suiop);
}

/*
 * This function is called at the end of recvmsg(); it finalizes all the
 * I/OAT operations and resets the uioa state to UIOA_ALLOC.
 */
int
sod_rcv_done(struct sonode *so, struct uio *suiop, struct uio *uiop)
{
	int error = 0;
	sodirect_t *sodp = so->so_direct;
	mblk_t *mp;

	if (sodp == NULL) {
		return (0);
	}

	ASSERT(MUTEX_HELD(sodp->sod_lockp));
	/* Finish any sodirect and uioa processing */
	if (suiop != NULL) {
		/* Finish any uioa_t processing */

		ASSERT(uiop == (uio_t *)&sodp->sod_uioa);
		error = uioafini(suiop, (uioa_t *)uiop);
		if ((mp = sodp->sod_uioafh) != NULL) {
			/* Release the pending-free mblk list. */
			sodp->sod_uioafh = NULL;
			sodp->sod_uioaft = NULL;
			freemsg(mp);
		}
	}
	ASSERT(sodp->sod_uioafh == NULL);
	if (!(sodp->sod_state & SOD_WAKE_NOT)) {
		/* Awoke */
		sodp->sod_state &= SOD_WAKE_CLR;
		sodp->sod_state |= SOD_WAKE_NOT;
	}
	/* Last, clear sod_want value */
	sodp->sod_want = 0;

	return (error);
}

/*
 * Schedule a uioamove() on a mblk. This is usually called from
 * protocols (e.g. TCP) on a I/OAT enabled sonode.
 */
mblk_t *
sod_uioa_mblk_init(struct sodirect_s *sodp, mblk_t *mp, size_t msg_size)
{
	uioa_t *uioap = &sodp->sod_uioa;
	mblk_t *mp1 = mp;
	mblk_t *lmp = NULL;

	ASSERT(DB_TYPE(mp) == M_DATA);
	ASSERT(msg_size == msgdsize(mp));

	/* Caller must have lock held */
	ASSERT(MUTEX_HELD(sodp->sod_lockp));

	if (uioap->uioa_state & UIOA_ENABLED) {
		/* Uioa is enabled */

		if (msg_size > uioap->uio_resid) {
			/*
			 * There isn't enough uio space for the mblk_t chain
			 * so disable uioa such that this and any additional
			 * mblk_t data is handled by the socket and schedule
			 * the socket for wakeup to finish this uioa.
797 */ 798 uioap->uioa_state &= UIOA_CLR; 799 uioap->uioa_state |= UIOA_FINI; 800 if (sodp->sod_state & SOD_WAKE_NOT) { 801 sodp->sod_state &= SOD_WAKE_CLR; 802 sodp->sod_state |= SOD_WAKE_NEED; 803 } 804 return (mp); 805 } 806 do { 807 uint32_t len = MBLKL(mp1); 808 809 if (!uioamove(mp1->b_rptr, len, UIO_READ, uioap)) { 810 /* Scheduled, mark dblk_t as such */ 811 DB_FLAGS(mp1) |= DBLK_UIOA; 812 } else { 813 /* Error, turn off async processing */ 814 uioap->uioa_state &= UIOA_CLR; 815 uioap->uioa_state |= UIOA_FINI; 816 break; 817 } 818 lmp = mp1; 819 } while ((mp1 = mp1->b_cont) != NULL); 820 821 if (mp1 != NULL || uioap->uio_resid == 0) { 822 /* 823 * Not all mblk_t(s) uioamoved (error) or all uio 824 * space has been consumed so schedule the socket 825 * for wakeup to finish this uio. 826 */ 827 sodp->sod_state &= SOD_WAKE_CLR; 828 sodp->sod_state |= SOD_WAKE_NEED; 829 830 /* Break the mblk chain if neccessary. */ 831 if (mp1 != NULL && lmp != NULL) { 832 mp->b_next = mp1; 833 lmp->b_cont = NULL; 834 } 835 } 836 } 837 return (mp1); 838 } 839 840 /* 841 * This function is called on a mblk that thas been successfully uioamoved(). 842 */ 843 void 844 sod_uioa_mblk_done(sodirect_t *sodp, mblk_t *bp) 845 { 846 if (bp != NULL && (bp->b_datap->db_flags & DBLK_UIOA)) { 847 /* 848 * A uioa flaged mblk_t chain, already uio processed, 849 * add it to the sodirect uioa pending free list. 850 * 851 * Note, a b_cont chain headed by a DBLK_UIOA enable 852 * mblk_t must have all mblk_t(s) DBLK_UIOA enabled. 853 */ 854 mblk_t *bpt = sodp->sod_uioaft; 855 856 ASSERT(sodp != NULL); 857 858 /* 859 * Add first mblk_t of "bp" chain to current sodirect uioa 860 * free list tail mblk_t, if any, else empty list so new head. 861 */ 862 if (bpt == NULL) 863 sodp->sod_uioafh = bp; 864 else 865 bpt->b_cont = bp; 866 867 /* 868 * Walk mblk_t "bp" chain to find tail and adjust rptr of 869 * each to reflect that uioamove() has consumed all data. 
		 */
		bpt = bp;
		for (;;) {
			ASSERT(bpt->b_datap->db_flags & DBLK_UIOA);

			bpt->b_rptr = bpt->b_wptr;
			if (bpt->b_cont == NULL)
				break;
			bpt = bpt->b_cont;
		}
		/* New sodirect uioa free list tail */
		sodp->sod_uioaft = bpt;

		/* Only dequeue once with data returned per uioa_t */
		if (sodp->sod_uioa.uioa_state & UIOA_ENABLED) {
			sodp->sod_uioa.uioa_state &= UIOA_CLR;
			sodp->sod_uioa.uioa_state |= UIOA_FINI;
		}
	}
}

/*
 * When transitioning from UIOA_INIT state to UIOA_ENABLE state in recvmsg(),
 * call this function on a non-STREAMS socket to schedule uioamove() on the
 * data that has already queued in this socket.
 */
void
sod_uioa_so_init(struct sonode *so, struct sodirect_s *sodp, struct uio *uiop)
{
	uioa_t *uioap = (uioa_t *)uiop;
	mblk_t *lbp;
	mblk_t *wbp;
	mblk_t *bp;
	int len;
	int error;
	boolean_t in_rcv_q = B_TRUE;	/* first pass walks so_rcv_q_head */

	ASSERT(MUTEX_HELD(sodp->sod_lockp));
	ASSERT(&sodp->sod_uioa == uioap);

	/*
	 * Walk first b_cont chain in sod_q
	 * and schedule any M_DATA mblk_t's for uio asynchronous move.
	 */
	bp = so->so_rcv_q_head;

again:
	/* Walk the chain */
	lbp = NULL;
	wbp = bp;

	do {
		if (bp == NULL)
			break;

		if (wbp->b_datap->db_type != M_DATA) {
			/* Not M_DATA, no more uioa */
			goto nouioa;
		}
		if ((len = wbp->b_wptr - wbp->b_rptr) > 0) {
			/* Have a M_DATA mblk_t with data */
			if (len > uioap->uio_resid || (so->so_oobmark > 0 &&
			    len + uioap->uioa_mbytes >= so->so_oobmark)) {
				/* Not enough uio space, or beyond oobmark */
				goto nouioa;
			}
			ASSERT(!(wbp->b_datap->db_flags & DBLK_UIOA));
			error = uioamove(wbp->b_rptr, len,
			    UIO_READ, uioap);
			if (!error) {
				/* Scheduled, mark dblk_t as such */
				wbp->b_datap->db_flags |= DBLK_UIOA;
			} else {
				/* Break the mblk chain */
				goto nouioa;
			}
		}
		/* Save last wbp processed */
		lbp = wbp;
	} while ((wbp = wbp->b_cont) != NULL);

	if (in_rcv_q && (bp == NULL || bp->b_next == NULL)) {
		/*
		 * We get here only once to process the sonode dump area
		 * if so_rcv_q_head is NULL or all the mblks have been
		 * successfully uioamoved()ed.
		 */
		in_rcv_q = B_FALSE;

		/* move to dump area */
		bp = so->so_rcv_head;
		goto again;
	}

	return;

nouioa:
	/* No more uioa */
	uioap->uioa_state &= UIOA_CLR;
	uioap->uioa_state |= UIOA_FINI;

	/*
	 * If we processed 1 or more mblk_t(s) then we need to split the
	 * current mblk_t chain in 2 so that all the uioamove()ed mblk_t(s)
	 * are in the current chain and the rest are in the following new
	 * chain.
	 */
	if (lbp != NULL) {
		/* New end of current chain */
		lbp->b_cont = NULL;

		/* Insert new chain wbp after bp */
		if ((wbp->b_next = bp->b_next) == NULL) {
			/*
			 * No need to grab so_lock, since sod_lockp
			 * points to so_lock.
			 */
			if (in_rcv_q)
				so->so_rcv_q_last_head = wbp;
			else
				so->so_rcv_last_head = wbp;
		}
		bp->b_next = wbp;
		bp->b_next->b_prev = bp->b_prev;
		bp->b_prev = lbp;
	}
}

/*
 * Initialize sodirect data structures on a socket.
 */
void
sod_sock_init(struct sonode *so, struct stdata *stp, sod_enq_func enq_func,
    sod_wakeup_func wake_func, kmutex_t *lockp)
{
	sodirect_t *sodp;

	ASSERT(so->so_direct == NULL);

	so->so_state |= SS_SODIRECT;

	sodp = kmem_cache_alloc(sock_sod_cache, KM_SLEEP);
	sodp->sod_state = SOD_ENABLED | SOD_WAKE_NOT;
	sodp->sod_want = 0;
	/* No read queue for a non-STREAMS (stp == NULL) socket. */
	sodp->sod_q = (stp != NULL) ? RD(stp->sd_wrq) : NULL;
	sodp->sod_enqueue = enq_func;
	sodp->sod_wakeup = wake_func;
	sodp->sod_uioafh = NULL;
	sodp->sod_uioaft = NULL;
	sodp->sod_lockp = lockp;
	/*
	 * Remainder of the sod_uioa members are left uninitialized
	 * but will be initialized later by uioainit() before uioa
	 * is enabled.
	 */
	sodp->sod_uioa.uioa_state = UIOA_ALLOC;
	so->so_direct = sodp;
	if (stp != NULL)
		stp->sd_sodirect = sodp;
}

/*
 * Init the sodirect kmem cache while sockfs is loading.
1033 */ 1034 void 1035 sod_init() 1036 { 1037 /* Allocate sodirect_t kmem_cache */ 1038 sock_sod_cache = kmem_cache_create("sock_sod_cache", 1039 sizeof (sodirect_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 1040 } 1041 1042 ssize_t 1043 sod_uioa_mblk(struct sonode *so, mblk_t *mp) 1044 { 1045 sodirect_t *sodp = so->so_direct; 1046 1047 ASSERT(sodp != NULL); 1048 ASSERT(MUTEX_HELD(sodp->sod_lockp)); 1049 1050 ASSERT(sodp->sod_state & SOD_ENABLED); 1051 ASSERT(sodp->sod_uioa.uioa_state != (UIOA_ALLOC|UIOA_INIT)); 1052 1053 ASSERT(sodp->sod_uioa.uioa_state & (UIOA_ENABLED|UIOA_FINI)); 1054 1055 if (mp == NULL && so->so_rcv_q_head != NULL) { 1056 mp = so->so_rcv_q_head; 1057 ASSERT(mp->b_prev != NULL); 1058 mp->b_prev = NULL; 1059 so->so_rcv_q_head = mp->b_next; 1060 if (so->so_rcv_q_head == NULL) { 1061 so->so_rcv_q_last_head = NULL; 1062 } 1063 mp->b_next = NULL; 1064 } 1065 1066 sod_uioa_mblk_done(sodp, mp); 1067 1068 if (so->so_rcv_q_head == NULL && so->so_rcv_head != NULL && 1069 DB_TYPE(so->so_rcv_head) == M_DATA && 1070 (DB_FLAGS(so->so_rcv_head) & DBLK_UIOA)) { 1071 /* more arrived */ 1072 ASSERT(so->so_rcv_q_head == NULL); 1073 mp = so->so_rcv_head; 1074 so->so_rcv_head = mp->b_next; 1075 if (so->so_rcv_head == NULL) 1076 so->so_rcv_last_head = NULL; 1077 mp->b_prev = mp->b_next = NULL; 1078 sod_uioa_mblk_done(sodp, mp); 1079 } 1080 1081 #ifdef DEBUG 1082 if (so->so_rcv_q_head != NULL) { 1083 mblk_t *m = so->so_rcv_q_head; 1084 while (m != NULL) { 1085 if (DB_FLAGS(m) & DBLK_UIOA) { 1086 cmn_err(CE_PANIC, "Unexpected I/OAT mblk %p" 1087 " in so_rcv_q_head.\n", (void *)m); 1088 } 1089 m = m->b_next; 1090 } 1091 } 1092 if (so->so_rcv_head != NULL) { 1093 mblk_t *m = so->so_rcv_head; 1094 while (m != NULL) { 1095 if (DB_FLAGS(m) & DBLK_UIOA) { 1096 cmn_err(CE_PANIC, "Unexpected I/OAT mblk %p" 1097 " in so_rcv_head.\n", (void *)m); 1098 } 1099 m = m->b_next; 1100 } 1101 } 1102 #endif 1103 return (sodp->sod_uioa.uioa_mbytes); 1104 } 1105