1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright 2015, Joyent, Inc. 25 * Copyright 2017 Sebastian Wiedenroth 26 * Copyright 2022 Garrett D'Amore 27 */ 28 29 #include <sys/types.h> 30 #include <sys/param.h> 31 #include <sys/systm.h> 32 #include <sys/sysmacros.h> 33 #include <sys/debug.h> 34 #include <sys/cmn_err.h> 35 #include <sys/vfs.h> 36 #include <sys/policy.h> 37 #include <sys/modctl.h> 38 39 #include <sys/sunddi.h> 40 41 #include <sys/strsun.h> 42 #include <sys/stropts.h> 43 #include <sys/strsubr.h> 44 #include <sys/socket.h> 45 #include <sys/socketvar.h> 46 #include <sys/uio.h> 47 48 #include <inet/ipclassifier.h> 49 #include <fs/sockfs/sockcommon.h> 50 #include <fs/sockfs/sockfilter_impl.h> 51 #include <fs/sockfs/socktpi.h> 52 #include <fs/sockfs/sodirect.h> 53 #include <inet/ip.h> 54 55 extern int xnet_skip_checks, xnet_check_print, xnet_truncate_print; 56 57 /* 58 * Common socket access functions. 59 * 60 * Instead of accessing the sonode switch directly (i.e., SOP_xxx()), 61 * the socket_xxx() function should be used. 62 */ 63 64 /* 65 * Try to create a new sonode of the requested <family, type, protocol>. 66 */ 67 /* ARGSUSED */ 68 struct sonode * 69 socket_create(int family, int type, int protocol, char *devpath, char *mod, 70 int flags, int version, struct cred *cr, int *errorp) 71 { 72 struct sonode *so; 73 struct sockparams *sp = NULL; 74 int saved_error; 75 76 /* 77 * Look for a sockparams entry that match the given criteria. 78 * solookup() returns with the entry held. 79 */ 80 *errorp = solookup(family, type, protocol, &sp); 81 saved_error = *errorp; 82 if (sp == NULL) { 83 int kmflags = (flags == SOCKET_SLEEP) ? KM_SLEEP : KM_NOSLEEP; 84 /* 85 * There is no matching sockparams entry. An ephemeral entry is 86 * created if the caller specifies a device or a socket module. 87 */ 88 if (devpath != NULL) { 89 saved_error = 0; 90 sp = sockparams_hold_ephemeral_bydev(family, type, 91 protocol, devpath, kmflags, errorp); 92 } else if (mod != NULL) { 93 saved_error = 0; 94 sp = sockparams_hold_ephemeral_bymod(family, type, 95 protocol, mod, kmflags, errorp); 96 } else { 97 *errorp = solookup(family, type, 0, &sp); 98 } 99 100 if (sp == NULL) { 101 if (saved_error && (*errorp == EPROTONOSUPPORT || 102 *errorp == EPROTOTYPE || *errorp == ENOPROTOOPT)) 103 *errorp = saved_error; 104 return (NULL); 105 } 106 } 107 108 ASSERT(sp->sp_smod_info != NULL); 109 ASSERT(flags == SOCKET_SLEEP || flags == SOCKET_NOSLEEP); 110 sp->sp_stats.sps_ncreate.value.ui64++; 111 so = sp->sp_smod_info->smod_sock_create_func(sp, family, type, 112 protocol, version, flags, errorp, cr); 113 if (so == NULL) { 114 SOCKPARAMS_DEC_REF(sp); 115 } else { 116 if ((*errorp = SOP_INIT(so, NULL, cr, flags)) == 0) { 117 /* Cannot fail, only bumps so_count */ 118 (void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, cr, NULL); 119 } else { 120 if (saved_error && (*errorp == EPROTONOSUPPORT || 121 *errorp == EPROTOTYPE || *errorp == ENOPROTOOPT)) 122 *errorp = saved_error; 123 socket_destroy(so); 124 so = NULL; 125 } 126 } 127 return (so); 128 } 129 130 struct sonode * 131 socket_newconn(struct sonode *parent, sock_lower_handle_t lh, 132 sock_downcalls_t *dc, int flags, int *errorp) 133 { 134 struct sonode *so; 135 struct sockparams *sp; 136 struct cred *cr; 137 138 if ((cr = CRED()) == NULL) 139 cr = kcred; 140 141 sp = parent->so_sockparams; 142 ASSERT(sp != NULL); 143 144 sp->sp_stats.sps_ncreate.value.ui64++; 145 so = sp->sp_smod_info->smod_sock_create_func(sp, parent->so_family, 146 parent->so_type, parent->so_protocol, parent->so_version, flags, 147 errorp, cr); 148 if (so != NULL) { 149 SOCKPARAMS_INC_REF(sp); 150 151 so->so_proto_handle = lh; 152 so->so_downcalls = dc; 153 /* 154 * This function may be called in interrupt context, and CRED() 155 * will be NULL. In this case, pass in kcred. 156 */ 157 if ((*errorp = SOP_INIT(so, parent, cr, flags)) == 0) { 158 /* Cannot fail, only bumps so_count */ 159 (void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, cr, NULL); 160 } else { 161 socket_destroy(so); 162 so = NULL; 163 } 164 } 165 166 return (so); 167 } 168 169 /* 170 * Bind local endpoint. 171 */ 172 int 173 socket_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen, 174 int flags, cred_t *cr) 175 { 176 return (SOP_BIND(so, name, namelen, flags, cr)); 177 } 178 179 /* 180 * Turn socket into a listen socket. 181 */ 182 int 183 socket_listen(struct sonode *so, int backlog, cred_t *cr) 184 { 185 if (backlog < 0) { 186 backlog = 0; 187 } 188 189 /* 190 * Use the same qlimit as in BSD. BSD checks the qlimit 191 * before queuing the next connection implying that a 192 * listen(sock, 0) allows one connection to be queued. 193 * BSD also uses 1.5 times the requested backlog. 194 * 195 * XNS Issue 4 required a strict interpretation of the backlog. 196 * This has been waived subsequently for Issue 4 and the change 197 * incorporated in XNS Issue 5. So we aren't required to do 198 * anything special for XPG apps. 199 */ 200 if (backlog >= (INT_MAX - 1) / 3) 201 backlog = INT_MAX; 202 else 203 backlog = backlog * 3 / 2 + 1; 204 205 return (SOP_LISTEN(so, backlog, cr)); 206 } 207 208 /* 209 * Accept incoming connection. 210 */ 211 int 212 socket_accept(struct sonode *lso, int fflag, cred_t *cr, struct sonode **nsop) 213 { 214 return (SOP_ACCEPT(lso, fflag, cr, nsop)); 215 } 216 217 /* 218 * Active open. 219 */ 220 int 221 socket_connect(struct sonode *so, struct sockaddr *name, 222 socklen_t namelen, int fflag, int flags, cred_t *cr) 223 { 224 int error; 225 226 /* 227 * Handle a connect to a name parameter of type AF_UNSPEC like a 228 * connect to a null address. This is the portable method to 229 * unconnect a socket. 230 */ 231 if ((namelen >= sizeof (sa_family_t)) && 232 (name->sa_family == AF_UNSPEC)) { 233 name = NULL; 234 namelen = 0; 235 } 236 237 error = SOP_CONNECT(so, name, namelen, fflag, flags, cr); 238 239 if (error == EHOSTUNREACH && flags & _SOCONNECT_XPG4_2) { 240 /* 241 * X/Open specification contains a requirement that 242 * ENETUNREACH be returned but does not require 243 * EHOSTUNREACH. In order to keep the test suite 244 * happy we mess with the errno here. 245 */ 246 error = ENETUNREACH; 247 } 248 249 return (error); 250 } 251 252 /* 253 * Get address of remote node. 254 */ 255 int 256 socket_getpeername(struct sonode *so, struct sockaddr *addr, 257 socklen_t *addrlen, boolean_t accept, cred_t *cr) 258 { 259 ASSERT(*addrlen > 0); 260 return (SOP_GETPEERNAME(so, addr, addrlen, accept, cr)); 261 262 } 263 264 /* 265 * Get local address. 266 */ 267 int 268 socket_getsockname(struct sonode *so, struct sockaddr *addr, 269 socklen_t *addrlen, cred_t *cr) 270 { 271 return (SOP_GETSOCKNAME(so, addr, addrlen, cr)); 272 273 } 274 275 /* 276 * Called from shutdown(). 277 */ 278 int 279 socket_shutdown(struct sonode *so, int how, cred_t *cr) 280 { 281 return (SOP_SHUTDOWN(so, how, cr)); 282 } 283 284 /* 285 * Get socket options. 286 */ 287 /*ARGSUSED*/ 288 int 289 socket_getsockopt(struct sonode *so, int level, int option_name, 290 void *optval, socklen_t *optlenp, int flags, cred_t *cr) 291 { 292 return (SOP_GETSOCKOPT(so, level, option_name, optval, 293 optlenp, flags, cr)); 294 } 295 296 /* 297 * Set socket options 298 */ 299 int 300 socket_setsockopt(struct sonode *so, int level, int option_name, 301 const void *optval, t_uscalar_t optlen, cred_t *cr) 302 { 303 int val = 1; 304 /* Caller allocates aligned optval, or passes null */ 305 ASSERT(((uintptr_t)optval & (sizeof (t_scalar_t) - 1)) == 0); 306 /* If optval is null optlen is 0, and vice-versa */ 307 ASSERT(optval != NULL || optlen == 0); 308 ASSERT(optlen != 0 || optval == NULL); 309 310 if (optval == NULL && optlen == 0) 311 optval = &val; 312 313 return (SOP_SETSOCKOPT(so, level, option_name, optval, optlen, cr)); 314 } 315 316 int 317 socket_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, 318 cred_t *cr) 319 { 320 int error = 0; 321 ssize_t orig_resid = uiop->uio_resid; 322 323 /* 324 * Do not bypass the cache if we are doing a local (AF_UNIX) write. 325 */ 326 if (so->so_family == AF_UNIX) 327 uiop->uio_extflg |= UIO_COPY_CACHED; 328 else 329 uiop->uio_extflg &= ~UIO_COPY_CACHED; 330 331 error = SOP_SENDMSG(so, msg, uiop, cr); 332 switch (error) { 333 default: 334 break; 335 case EINTR: 336 case ENOMEM: 337 /* EAGAIN is EWOULDBLOCK */ 338 case EWOULDBLOCK: 339 /* We did a partial send */ 340 if (uiop->uio_resid != orig_resid) 341 error = 0; 342 break; 343 case EPIPE: 344 if (((so->so_mode & SM_KERNEL) == 0) && 345 ((msg->msg_flags & MSG_NOSIGNAL) == 0)) { 346 tsignal(curthread, SIGPIPE); 347 } 348 break; 349 } 350 351 return (error); 352 } 353 354 int 355 socket_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag, 356 struct cred *cr, mblk_t **mpp) 357 { 358 int error = 0; 359 360 error = SOP_SENDMBLK(so, msg, fflag, cr, mpp); 361 if (error == EPIPE) { 362 tsignal(curthread, SIGPIPE); 363 } 364 return (error); 365 } 366 367 int 368 socket_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, 369 cred_t *cr) 370 { 371 int error; 372 ssize_t orig_resid = uiop->uio_resid; 373 374 /* 375 * Do not bypass the cache when reading data, as the application 376 * is likely to access the data shortly. 377 */ 378 uiop->uio_extflg |= UIO_COPY_CACHED; 379 380 error = SOP_RECVMSG(so, msg, uiop, cr); 381 382 switch (error) { 383 case EINTR: 384 /* EAGAIN is EWOULDBLOCK */ 385 case EWOULDBLOCK: 386 /* We did a partial read */ 387 if (uiop->uio_resid != orig_resid) 388 error = 0; 389 break; 390 default: 391 break; 392 } 393 return (error); 394 } 395 396 int 397 socket_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode, 398 struct cred *cr, int32_t *rvalp) 399 { 400 return (SOP_IOCTL(so, cmd, arg, mode, cr, rvalp)); 401 } 402 403 int 404 socket_poll(struct sonode *so, short events, int anyyet, short *reventsp, 405 struct pollhead **phpp) 406 { 407 return (SOP_POLL(so, events, anyyet, reventsp, phpp)); 408 } 409 410 int 411 socket_close(struct sonode *so, int flag, struct cred *cr) 412 { 413 return (VOP_CLOSE(SOTOV(so), flag, 1, 0, cr, NULL)); 414 } 415 416 int 417 socket_close_internal(struct sonode *so, int flag, cred_t *cr) 418 { 419 ASSERT(so->so_count == 0); 420 421 return (SOP_CLOSE(so, flag, cr)); 422 } 423 424 void 425 socket_destroy(struct sonode *so) 426 { 427 vn_invalid(SOTOV(so)); 428 VN_RELE(SOTOV(so)); 429 } 430 431 /* ARGSUSED */ 432 void 433 socket_destroy_internal(struct sonode *so, cred_t *cr) 434 { 435 struct sockparams *sp = so->so_sockparams; 436 ASSERT(so->so_count == 0 && sp != NULL); 437 438 sp->sp_smod_info->smod_sock_destroy_func(so); 439 440 SOCKPARAMS_DEC_REF(sp); 441 } 442 443 /* 444 * TODO Once the common vnode ops is available, then the vnops argument 445 * should be removed. 446 */ 447 /*ARGSUSED*/ 448 int 449 sonode_constructor(void *buf, void *cdrarg, int kmflags) 450 { 451 struct sonode *so = buf; 452 struct vnode *vp; 453 454 vp = so->so_vnode = vn_alloc(kmflags); 455 if (vp == NULL) { 456 return (-1); 457 } 458 vp->v_data = so; 459 vn_setops(vp, socket_vnodeops); 460 461 so->so_priv = NULL; 462 so->so_oobmsg = NULL; 463 464 so->so_proto_handle = NULL; 465 466 so->so_peercred = NULL; 467 468 so->so_rcv_queued = 0; 469 so->so_rcv_q_head = NULL; 470 so->so_rcv_q_last_head = NULL; 471 so->so_rcv_head = NULL; 472 so->so_rcv_last_head = NULL; 473 so->so_rcv_wanted = 0; 474 so->so_rcv_timer_interval = SOCKET_NO_RCVTIMER; 475 so->so_rcv_timer_tid = 0; 476 so->so_rcv_thresh = 0; 477 478 list_create(&so->so_acceptq_list, sizeof (struct sonode), 479 offsetof(struct sonode, so_acceptq_node)); 480 list_create(&so->so_acceptq_defer, sizeof (struct sonode), 481 offsetof(struct sonode, so_acceptq_node)); 482 list_link_init(&so->so_acceptq_node); 483 so->so_acceptq_len = 0; 484 so->so_backlog = 0; 485 so->so_listener = NULL; 486 487 so->so_snd_qfull = B_FALSE; 488 489 so->so_filter_active = 0; 490 so->so_filter_tx = 0; 491 so->so_filter_defertime = 0; 492 so->so_filter_top = NULL; 493 so->so_filter_bottom = NULL; 494 495 mutex_init(&so->so_lock, NULL, MUTEX_DEFAULT, NULL); 496 mutex_init(&so->so_acceptq_lock, NULL, MUTEX_DEFAULT, NULL); 497 rw_init(&so->so_fallback_rwlock, NULL, RW_DEFAULT, NULL); 498 cv_init(&so->so_state_cv, NULL, CV_DEFAULT, NULL); 499 cv_init(&so->so_single_cv, NULL, CV_DEFAULT, NULL); 500 cv_init(&so->so_read_cv, NULL, CV_DEFAULT, NULL); 501 502 cv_init(&so->so_acceptq_cv, NULL, CV_DEFAULT, NULL); 503 cv_init(&so->so_snd_cv, NULL, CV_DEFAULT, NULL); 504 cv_init(&so->so_rcv_cv, NULL, CV_DEFAULT, NULL); 505 cv_init(&so->so_copy_cv, NULL, CV_DEFAULT, NULL); 506 cv_init(&so->so_closing_cv, NULL, CV_DEFAULT, NULL); 507 508 so->so_krecv_cb = NULL; 509 so->so_krecv_arg = NULL; 510 511 return (0); 512 } 513 514 /*ARGSUSED*/ 515 void 516 sonode_destructor(void *buf, void *cdrarg) 517 { 518 struct sonode *so = buf; 519 struct vnode *vp = SOTOV(so); 520 521 ASSERT(so->so_priv == NULL); 522 ASSERT(so->so_peercred == NULL); 523 524 ASSERT(so->so_oobmsg == NULL); 525 526 ASSERT(so->so_rcv_q_head == NULL); 527 528 list_destroy(&so->so_acceptq_list); 529 list_destroy(&so->so_acceptq_defer); 530 ASSERT(!list_link_active(&so->so_acceptq_node)); 531 ASSERT(so->so_listener == NULL); 532 533 ASSERT(so->so_filter_active == 0); 534 ASSERT(so->so_filter_tx == 0); 535 ASSERT(so->so_filter_top == NULL); 536 ASSERT(so->so_filter_bottom == NULL); 537 538 ASSERT(vp->v_data == so); 539 ASSERT(vn_matchops(vp, socket_vnodeops)); 540 541 vn_free(vp); 542 543 mutex_destroy(&so->so_lock); 544 mutex_destroy(&so->so_acceptq_lock); 545 rw_destroy(&so->so_fallback_rwlock); 546 547 cv_destroy(&so->so_state_cv); 548 cv_destroy(&so->so_single_cv); 549 cv_destroy(&so->so_read_cv); 550 cv_destroy(&so->so_acceptq_cv); 551 cv_destroy(&so->so_snd_cv); 552 cv_destroy(&so->so_rcv_cv); 553 cv_destroy(&so->so_closing_cv); 554 } 555 556 void 557 sonode_init(struct sonode *so, struct sockparams *sp, int family, 558 int type, int protocol, sonodeops_t *sops) 559 { 560 vnode_t *vp; 561 562 vp = SOTOV(so); 563 564 so->so_flag = 0; 565 566 so->so_state = 0; 567 so->so_mode = 0; 568 569 so->so_count = 0; 570 571 so->so_family = family; 572 so->so_type = type; 573 so->so_protocol = protocol; 574 575 SOCK_CONNID_INIT(so->so_proto_connid); 576 577 so->so_options = 0; 578 so->so_linger.l_onoff = 0; 579 so->so_linger.l_linger = 0; 580 so->so_sndbuf = 0; 581 so->so_error = 0; 582 so->so_rcvtimeo = 0; 583 so->so_sndtimeo = 0; 584 so->so_xpg_rcvbuf = 0; 585 586 ASSERT(so->so_oobmsg == NULL); 587 so->so_oobmark = 0; 588 so->so_pgrp = 0; 589 590 ASSERT(so->so_peercred == NULL); 591 592 so->so_zoneid = getzoneid(); 593 594 so->so_sockparams = sp; 595 596 so->so_ops = sops; 597 598 so->so_not_str = (sops != &sotpi_sonodeops); 599 600 so->so_proto_handle = NULL; 601 602 so->so_downcalls = NULL; 603 604 so->so_copyflag = 0; 605 606 vn_reinit(vp); 607 vp->v_vfsp = sock_vfsp; 608 vp->v_type = VSOCK; 609 vp->v_rdev = sockdev; 610 611 so->so_snd_qfull = B_FALSE; 612 so->so_minpsz = 0; 613 614 so->so_rcv_wakeup = B_FALSE; 615 so->so_snd_wakeup = B_FALSE; 616 so->so_flowctrld = B_FALSE; 617 618 so->so_pollev = 0; 619 bzero(&so->so_poll_list, sizeof (so->so_poll_list)); 620 bzero(&so->so_proto_props, sizeof (struct sock_proto_props)); 621 622 bzero(&(so->so_ksock_callbacks), sizeof (ksocket_callbacks_t)); 623 so->so_ksock_cb_arg = NULL; 624 625 so->so_max_addr_len = sizeof (struct sockaddr_storage); 626 627 so->so_direct = NULL; 628 629 vn_exists(vp); 630 } 631 632 void 633 sonode_fini(struct sonode *so) 634 { 635 vnode_t *vp; 636 637 ASSERT(so->so_count == 0); 638 639 if (so->so_rcv_timer_tid) { 640 ASSERT(MUTEX_NOT_HELD(&so->so_lock)); 641 (void) untimeout(so->so_rcv_timer_tid); 642 so->so_rcv_timer_tid = 0; 643 } 644 645 if (so->so_poll_list.ph_list != NULL) { 646 pollwakeup(&so->so_poll_list, POLLERR); 647 pollhead_clean(&so->so_poll_list); 648 } 649 650 if (so->so_direct != NULL) 651 sod_sock_fini(so); 652 653 vp = SOTOV(so); 654 vn_invalid(vp); 655 656 if (so->so_peercred != NULL) { 657 crfree(so->so_peercred); 658 so->so_peercred = NULL; 659 } 660 /* Detach and destroy filters */ 661 if (so->so_filter_top != NULL) 662 sof_sonode_cleanup(so); 663 664 /* Clean up any remnants of krecv callbacks */ 665 so->so_krecv_cb = NULL; 666 so->so_krecv_arg = NULL; 667 668 ASSERT(list_is_empty(&so->so_acceptq_list)); 669 ASSERT(list_is_empty(&so->so_acceptq_defer)); 670 ASSERT(!list_link_active(&so->so_acceptq_node)); 671 672 ASSERT(so->so_rcv_queued == 0); 673 ASSERT(so->so_rcv_q_head == NULL); 674 ASSERT(so->so_rcv_q_last_head == NULL); 675 ASSERT(so->so_rcv_head == NULL); 676 ASSERT(so->so_rcv_last_head == NULL); 677 } 678