1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 26 /* This file contains all TCP kernel socket related functions. */ 27 28 #include <sys/types.h> 29 #include <sys/strlog.h> 30 #include <sys/policy.h> 31 #include <sys/sockio.h> 32 #include <sys/strsubr.h> 33 #include <sys/strsun.h> 34 #include <sys/squeue_impl.h> 35 #include <sys/squeue.h> 36 #define _SUN_TPI_VERSION 2 37 #include <sys/tihdr.h> 38 #include <sys/timod.h> 39 #include <sys/tpicommon.h> 40 #include <sys/socketvar.h> 41 42 #include <inet/common.h> 43 #include <inet/proto_set.h> 44 #include <inet/ip.h> 45 #include <inet/tcp.h> 46 #include <inet/tcp_impl.h> 47 48 static void tcp_activate(sock_lower_handle_t, sock_upper_handle_t, 49 sock_upcalls_t *, int, cred_t *); 50 static int tcp_accept(sock_lower_handle_t, sock_lower_handle_t, 51 sock_upper_handle_t, cred_t *); 52 static int tcp_bind(sock_lower_handle_t, struct sockaddr *, 53 socklen_t, cred_t *); 54 static int tcp_listen(sock_lower_handle_t, int, cred_t *); 55 static int tcp_connect(sock_lower_handle_t, const struct sockaddr *, 56 socklen_t, sock_connid_t *, cred_t *); 57 static int tcp_getsockopt(sock_lower_handle_t, int, int, void *, 58 socklen_t *, cred_t *); 59 static int tcp_setsockopt(sock_lower_handle_t, int, int, const void *, 60 socklen_t, cred_t *); 61 static int tcp_sendmsg(sock_lower_handle_t, mblk_t *, struct nmsghdr *, 62 cred_t *cr); 63 static int tcp_shutdown(sock_lower_handle_t, int, cred_t *); 64 static void tcp_clr_flowctrl(sock_lower_handle_t); 65 static int tcp_ioctl(sock_lower_handle_t, int, intptr_t, int, int32_t *, 66 cred_t *); 67 static int tcp_close(sock_lower_handle_t, int, cred_t *); 68 69 sock_downcalls_t sock_tcp_downcalls = { 70 tcp_activate, 71 tcp_accept, 72 tcp_bind, 73 tcp_listen, 74 tcp_connect, 75 tcp_getpeername, 76 tcp_getsockname, 77 tcp_getsockopt, 78 tcp_setsockopt, 79 tcp_sendmsg, 80 NULL, 81 NULL, 82 NULL, 83 tcp_shutdown, 84 tcp_clr_flowctrl, 85 tcp_ioctl, 86 tcp_close, 87 }; 88 89 /* ARGSUSED */ 90 static void 91 tcp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle, 92 sock_upcalls_t *sock_upcalls, int flags, cred_t *cr) 93 { 94 conn_t *connp = (conn_t *)proto_handle; 95 struct sock_proto_props sopp; 96 extern struct module_info tcp_rinfo; 97 98 ASSERT(connp->conn_upper_handle == NULL); 99 100 /* All Solaris components should pass a cred for this operation. */ 101 ASSERT(cr != NULL); 102 103 sopp.sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT | 104 SOCKOPT_MAXPSZ | SOCKOPT_MAXBLK | SOCKOPT_RCVTIMER | 105 SOCKOPT_RCVTHRESH | SOCKOPT_MAXADDRLEN | SOCKOPT_MINPSZ; 106 107 sopp.sopp_rxhiwat = SOCKET_RECVHIWATER; 108 sopp.sopp_rxlowat = SOCKET_RECVLOWATER; 109 sopp.sopp_maxpsz = INFPSZ; 110 sopp.sopp_maxblk = INFPSZ; 111 sopp.sopp_rcvtimer = SOCKET_TIMER_INTERVAL; 112 sopp.sopp_rcvthresh = SOCKET_RECVHIWATER >> 3; 113 sopp.sopp_maxaddrlen = sizeof (sin6_t); 114 sopp.sopp_minpsz = (tcp_rinfo.mi_minpsz == 1) ? 0 : 115 tcp_rinfo.mi_minpsz; 116 117 connp->conn_upcalls = sock_upcalls; 118 connp->conn_upper_handle = sock_handle; 119 120 ASSERT(connp->conn_rcvbuf != 0 && 121 connp->conn_rcvbuf == connp->conn_tcp->tcp_rwnd); 122 (*sock_upcalls->su_set_proto_props)(sock_handle, &sopp); 123 } 124 125 /*ARGSUSED*/ 126 static int 127 tcp_accept(sock_lower_handle_t lproto_handle, 128 sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle, 129 cred_t *cr) 130 { 131 conn_t *lconnp, *econnp; 132 tcp_t *listener, *eager; 133 134 /* 135 * KSSL can move a socket from one listener to another, in which 136 * case `lproto_handle' points to the new listener. To ensure that 137 * the original listener is used the information is obtained from 138 * the eager. 139 */ 140 econnp = (conn_t *)eproto_handle; 141 eager = econnp->conn_tcp; 142 ASSERT(IPCL_IS_NONSTR(econnp)); 143 ASSERT(eager->tcp_listener != NULL); 144 listener = eager->tcp_listener; 145 lconnp = (conn_t *)listener->tcp_connp; 146 ASSERT(listener->tcp_state == TCPS_LISTEN); 147 ASSERT(lconnp->conn_upper_handle != NULL); 148 149 /* 150 * It is possible for the accept thread to race with the thread that 151 * made the su_newconn upcall in tcp_newconn_notify. Both 152 * tcp_newconn_notify and tcp_accept require that conn_upper_handle 153 * and conn_upcalls be set before returning, so they both write to 154 * them. However, we're guaranteed that the value written is the same 155 * for both threads. 156 */ 157 ASSERT(econnp->conn_upper_handle == NULL || 158 econnp->conn_upper_handle == sock_handle); 159 ASSERT(econnp->conn_upcalls == NULL || 160 econnp->conn_upcalls == lconnp->conn_upcalls); 161 econnp->conn_upper_handle = sock_handle; 162 econnp->conn_upcalls = lconnp->conn_upcalls; 163 164 ASSERT(econnp->conn_netstack == 165 listener->tcp_connp->conn_netstack); 166 ASSERT(eager->tcp_tcps == listener->tcp_tcps); 167 168 /* 169 * We should have a minimum of 2 references on the conn at this 170 * point. One for TCP and one for the newconn notification 171 * (which is now taken over by IP). In the normal case we would 172 * also have another reference (making a total of 3) for the conn 173 * being in the classifier hash list. However the eager could have 174 * received an RST subsequently and tcp_closei_local could have 175 * removed the eager from the classifier hash list, hence we can't 176 * assert that reference. 177 */ 178 ASSERT(econnp->conn_ref >= 2); 179 180 /* 181 * An error is returned if this conn has been reset, which will 182 * cause the socket to be closed immediately. The eager will be 183 * unlinked from the listener during close. 184 */ 185 if (eager->tcp_state < TCPS_ESTABLISHED) 186 return (ECONNABORTED); 187 188 mutex_enter(&listener->tcp_eager_lock); 189 /* 190 * Non-STREAMS listeners never defer the notification of new 191 * connections. 192 */ 193 ASSERT(!listener->tcp_eager_prev_q0->tcp_conn_def_q0); 194 tcp_eager_unlink(eager); 195 mutex_exit(&listener->tcp_eager_lock); 196 CONN_DEC_REF(listener->tcp_connp); 197 198 return (0); 199 } 200 201 static int 202 tcp_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa, 203 socklen_t len, cred_t *cr) 204 { 205 int error; 206 conn_t *connp = (conn_t *)proto_handle; 207 208 /* All Solaris components should pass a cred for this operation. */ 209 ASSERT(cr != NULL); 210 ASSERT(connp->conn_upper_handle != NULL); 211 212 error = squeue_synch_enter(connp, NULL); 213 if (error != 0) { 214 /* failed to enter */ 215 return (ENOSR); 216 } 217 218 /* binding to a NULL address really means unbind */ 219 if (sa == NULL) { 220 if (connp->conn_tcp->tcp_state < TCPS_LISTEN) 221 error = tcp_do_unbind(connp); 222 else 223 error = EINVAL; 224 } else { 225 error = tcp_do_bind(connp, sa, len, cr, B_TRUE); 226 } 227 228 squeue_synch_exit(connp); 229 230 if (error < 0) { 231 if (error == -TOUTSTATE) 232 error = EINVAL; 233 else 234 error = proto_tlitosyserr(-error); 235 } 236 237 return (error); 238 } 239 240 /* ARGSUSED */ 241 static int 242 tcp_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr) 243 { 244 conn_t *connp = (conn_t *)proto_handle; 245 tcp_t *tcp = connp->conn_tcp; 246 int error; 247 248 ASSERT(connp->conn_upper_handle != NULL); 249 250 /* All Solaris components should pass a cred for this operation. */ 251 ASSERT(cr != NULL); 252 253 error = squeue_synch_enter(connp, NULL); 254 if (error != 0) { 255 /* failed to enter */ 256 return (ENOBUFS); 257 } 258 259 error = tcp_do_listen(connp, NULL, 0, backlog, cr, B_FALSE); 260 if (error == 0) { 261 /* 262 * sockfs needs to know what's the maximum number of socket 263 * that can be queued on the listener. 264 */ 265 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 266 SOCK_OPCTL_ENAB_ACCEPT, 267 (uintptr_t)(tcp->tcp_conn_req_max + 268 tcp->tcp_tcps->tcps_conn_req_max_q0)); 269 } else if (error < 0) { 270 if (error == -TOUTSTATE) 271 error = EINVAL; 272 else 273 error = proto_tlitosyserr(-error); 274 } 275 squeue_synch_exit(connp); 276 return (error); 277 } 278 279 static int 280 tcp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, 281 socklen_t len, sock_connid_t *id, cred_t *cr) 282 { 283 conn_t *connp = (conn_t *)proto_handle; 284 int error; 285 286 ASSERT(connp->conn_upper_handle != NULL); 287 288 /* All Solaris components should pass a cred for this operation. */ 289 ASSERT(cr != NULL); 290 291 error = proto_verify_ip_addr(connp->conn_family, sa, len); 292 if (error != 0) { 293 return (error); 294 } 295 296 error = squeue_synch_enter(connp, NULL); 297 if (error != 0) { 298 /* failed to enter */ 299 return (ENOSR); 300 } 301 302 /* 303 * TCP supports quick connect, so no need to do an implicit bind 304 */ 305 error = tcp_do_connect(connp, sa, len, cr, curproc->p_pid); 306 if (error == 0) { 307 *id = connp->conn_tcp->tcp_connid; 308 } else if (error < 0) { 309 if (error == -TOUTSTATE) { 310 switch (connp->conn_tcp->tcp_state) { 311 case TCPS_SYN_SENT: 312 error = EALREADY; 313 break; 314 case TCPS_ESTABLISHED: 315 error = EISCONN; 316 break; 317 case TCPS_LISTEN: 318 error = EOPNOTSUPP; 319 break; 320 default: 321 error = EINVAL; 322 break; 323 } 324 } else { 325 error = proto_tlitosyserr(-error); 326 } 327 } 328 329 if (connp->conn_tcp->tcp_loopback) { 330 struct sock_proto_props sopp; 331 332 sopp.sopp_flags = SOCKOPT_LOOPBACK; 333 sopp.sopp_loopback = B_TRUE; 334 335 (*connp->conn_upcalls->su_set_proto_props)( 336 connp->conn_upper_handle, &sopp); 337 } 338 done: 339 squeue_synch_exit(connp); 340 341 return ((error == 0) ? EINPROGRESS : error); 342 } 343 344 /* ARGSUSED3 */ 345 int 346 tcp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr, 347 socklen_t *addrlenp, cred_t *cr) 348 { 349 conn_t *connp = (conn_t *)proto_handle; 350 tcp_t *tcp = connp->conn_tcp; 351 352 /* All Solaris components should pass a cred for this operation. */ 353 ASSERT(cr != NULL); 354 355 ASSERT(tcp != NULL); 356 if (tcp->tcp_state < TCPS_SYN_RCVD) 357 return (ENOTCONN); 358 359 return (conn_getpeername(connp, addr, addrlenp)); 360 } 361 362 /* ARGSUSED3 */ 363 int 364 tcp_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *addr, 365 socklen_t *addrlenp, cred_t *cr) 366 { 367 conn_t *connp = (conn_t *)proto_handle; 368 369 /* All Solaris components should pass a cred for this operation. */ 370 ASSERT(cr != NULL); 371 372 return (conn_getsockname(connp, addr, addrlenp)); 373 } 374 375 /* returns UNIX error, the optlen is a value-result arg */ 376 static int 377 tcp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, 378 void *optvalp, socklen_t *optlen, cred_t *cr) 379 { 380 conn_t *connp = (conn_t *)proto_handle; 381 int error; 382 t_uscalar_t max_optbuf_len; 383 void *optvalp_buf; 384 int len; 385 386 ASSERT(connp->conn_upper_handle != NULL); 387 388 error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len, 389 tcp_opt_obj.odb_opt_des_arr, 390 tcp_opt_obj.odb_opt_arr_cnt, 391 B_FALSE, B_TRUE, cr); 392 if (error != 0) { 393 if (error < 0) { 394 error = proto_tlitosyserr(-error); 395 } 396 return (error); 397 } 398 399 optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP); 400 401 error = squeue_synch_enter(connp, NULL); 402 if (error == ENOMEM) { 403 kmem_free(optvalp_buf, max_optbuf_len); 404 return (ENOMEM); 405 } 406 407 len = tcp_opt_get(connp, level, option_name, optvalp_buf); 408 squeue_synch_exit(connp); 409 410 if (len == -1) { 411 kmem_free(optvalp_buf, max_optbuf_len); 412 return (EINVAL); 413 } 414 415 /* 416 * update optlen and copy option value 417 */ 418 t_uscalar_t size = MIN(len, *optlen); 419 420 bcopy(optvalp_buf, optvalp, size); 421 bcopy(&size, optlen, sizeof (size)); 422 423 kmem_free(optvalp_buf, max_optbuf_len); 424 return (0); 425 } 426 427 static int 428 tcp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, 429 const void *optvalp, socklen_t optlen, cred_t *cr) 430 { 431 conn_t *connp = (conn_t *)proto_handle; 432 int error; 433 434 ASSERT(connp->conn_upper_handle != NULL); 435 /* 436 * Entering the squeue synchronously can result in a context switch, 437 * which can cause a rather sever performance degradation. So we try to 438 * handle whatever options we can without entering the squeue. 439 */ 440 if (level == IPPROTO_TCP) { 441 switch (option_name) { 442 case TCP_NODELAY: 443 if (optlen != sizeof (int32_t)) 444 return (EINVAL); 445 mutex_enter(&connp->conn_tcp->tcp_non_sq_lock); 446 connp->conn_tcp->tcp_naglim = *(int *)optvalp ? 1 : 447 connp->conn_tcp->tcp_mss; 448 mutex_exit(&connp->conn_tcp->tcp_non_sq_lock); 449 return (0); 450 default: 451 break; 452 } 453 } 454 455 error = squeue_synch_enter(connp, NULL); 456 if (error == ENOMEM) { 457 return (ENOMEM); 458 } 459 460 error = proto_opt_check(level, option_name, optlen, NULL, 461 tcp_opt_obj.odb_opt_des_arr, 462 tcp_opt_obj.odb_opt_arr_cnt, 463 B_TRUE, B_FALSE, cr); 464 465 if (error != 0) { 466 if (error < 0) { 467 error = proto_tlitosyserr(-error); 468 } 469 squeue_synch_exit(connp); 470 return (error); 471 } 472 473 error = tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name, 474 optlen, (uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp, 475 NULL, cr); 476 squeue_synch_exit(connp); 477 478 ASSERT(error >= 0); 479 480 return (error); 481 } 482 483 /* ARGSUSED */ 484 static int 485 tcp_sendmsg(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, 486 cred_t *cr) 487 { 488 tcp_t *tcp; 489 uint32_t msize; 490 conn_t *connp = (conn_t *)proto_handle; 491 int32_t tcpstate; 492 493 /* All Solaris components should pass a cred for this operation. */ 494 ASSERT(cr != NULL); 495 496 ASSERT(connp->conn_ref >= 2); 497 ASSERT(connp->conn_upper_handle != NULL); 498 499 if (msg->msg_controllen != 0) { 500 freemsg(mp); 501 return (EOPNOTSUPP); 502 } 503 504 switch (DB_TYPE(mp)) { 505 case M_DATA: 506 tcp = connp->conn_tcp; 507 ASSERT(tcp != NULL); 508 509 tcpstate = tcp->tcp_state; 510 if (tcpstate < TCPS_ESTABLISHED) { 511 freemsg(mp); 512 /* 513 * We return ENOTCONN if the endpoint is trying to 514 * connect or has never been connected, and EPIPE if it 515 * has been disconnected. The connection id helps us 516 * distinguish between the last two cases. 517 */ 518 return ((tcpstate == TCPS_SYN_SENT) ? ENOTCONN : 519 ((tcp->tcp_connid > 0) ? EPIPE : ENOTCONN)); 520 } else if (tcpstate > TCPS_CLOSE_WAIT) { 521 freemsg(mp); 522 return (EPIPE); 523 } 524 525 msize = msgdsize(mp); 526 527 mutex_enter(&tcp->tcp_non_sq_lock); 528 tcp->tcp_squeue_bytes += msize; 529 /* 530 * Squeue Flow Control 531 */ 532 if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) { 533 tcp_setqfull(tcp); 534 } 535 mutex_exit(&tcp->tcp_non_sq_lock); 536 537 /* 538 * The application may pass in an address in the msghdr, but 539 * we ignore the address on connection-oriented sockets. 540 * Just like BSD this code does not generate an error for 541 * TCP (a CONNREQUIRED socket) when sending to an address 542 * passed in with sendto/sendmsg. Instead the data is 543 * delivered on the connection as if no address had been 544 * supplied. 545 */ 546 CONN_INC_REF(connp); 547 548 if (msg->msg_flags & MSG_OOB) { 549 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output_urgent, 550 connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT); 551 } else { 552 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output, 553 connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT); 554 } 555 556 return (0); 557 558 default: 559 ASSERT(0); 560 } 561 562 freemsg(mp); 563 return (0); 564 } 565 566 /* ARGSUSED */ 567 static int 568 tcp_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr) 569 { 570 conn_t *connp = (conn_t *)proto_handle; 571 tcp_t *tcp = connp->conn_tcp; 572 573 ASSERT(connp->conn_upper_handle != NULL); 574 575 /* All Solaris components should pass a cred for this operation. */ 576 ASSERT(cr != NULL); 577 578 /* 579 * X/Open requires that we check the connected state. 580 */ 581 if (tcp->tcp_state < TCPS_SYN_SENT) 582 return (ENOTCONN); 583 584 /* shutdown the send side */ 585 if (how != SHUT_RD) { 586 mblk_t *bp; 587 588 bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL); 589 CONN_INC_REF(connp); 590 SQUEUE_ENTER_ONE(connp->conn_sqp, bp, tcp_shutdown_output, 591 connp, NULL, SQ_NODRAIN, SQTAG_TCP_SHUTDOWN_OUTPUT); 592 593 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 594 SOCK_OPCTL_SHUT_SEND, 0); 595 } 596 597 /* shutdown the recv side */ 598 if (how != SHUT_WR) 599 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 600 SOCK_OPCTL_SHUT_RECV, 0); 601 602 return (0); 603 } 604 605 static void 606 tcp_clr_flowctrl(sock_lower_handle_t proto_handle) 607 { 608 conn_t *connp = (conn_t *)proto_handle; 609 tcp_t *tcp = connp->conn_tcp; 610 mblk_t *mp; 611 int error; 612 613 ASSERT(connp->conn_upper_handle != NULL); 614 615 /* 616 * If tcp->tcp_rsrv_mp == NULL, it means that tcp_clr_flowctrl() 617 * is currently running. 618 */ 619 mutex_enter(&tcp->tcp_rsrv_mp_lock); 620 if ((mp = tcp->tcp_rsrv_mp) == NULL) { 621 mutex_exit(&tcp->tcp_rsrv_mp_lock); 622 return; 623 } 624 tcp->tcp_rsrv_mp = NULL; 625 mutex_exit(&tcp->tcp_rsrv_mp_lock); 626 627 error = squeue_synch_enter(connp, mp); 628 ASSERT(error == 0); 629 630 mutex_enter(&tcp->tcp_rsrv_mp_lock); 631 tcp->tcp_rsrv_mp = mp; 632 mutex_exit(&tcp->tcp_rsrv_mp_lock); 633 634 if (tcp->tcp_fused) { 635 tcp_fuse_backenable(tcp); 636 } else { 637 tcp->tcp_rwnd = connp->conn_rcvbuf; 638 /* 639 * Send back a window update immediately if TCP is above 640 * ESTABLISHED state and the increase of the rcv window 641 * that the other side knows is at least 1 MSS after flow 642 * control is lifted. 643 */ 644 if (tcp->tcp_state >= TCPS_ESTABLISHED && 645 tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) { 646 tcp_xmit_ctl(NULL, tcp, 647 (tcp->tcp_swnd == 0) ? tcp->tcp_suna : 648 tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK); 649 } 650 } 651 652 squeue_synch_exit(connp); 653 } 654 655 /* ARGSUSED */ 656 static int 657 tcp_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg, 658 int mode, int32_t *rvalp, cred_t *cr) 659 { 660 conn_t *connp = (conn_t *)proto_handle; 661 int error; 662 663 ASSERT(connp->conn_upper_handle != NULL); 664 665 /* All Solaris components should pass a cred for this operation. */ 666 ASSERT(cr != NULL); 667 668 /* 669 * If we don't have a helper stream then create one. 670 * ip_create_helper_stream takes care of locking the conn_t, 671 * so this check for NULL is just a performance optimization. 672 */ 673 if (connp->conn_helper_info == NULL) { 674 tcp_stack_t *tcps = connp->conn_tcp->tcp_tcps; 675 676 /* 677 * Create a helper stream for non-STREAMS socket. 678 */ 679 error = ip_create_helper_stream(connp, tcps->tcps_ldi_ident); 680 if (error != 0) { 681 ip0dbg(("tcp_ioctl: create of IP helper stream " 682 "failed %d\n", error)); 683 return (error); 684 } 685 } 686 687 switch (cmd) { 688 case ND_SET: 689 case ND_GET: 690 case _SIOCSOCKFALLBACK: 691 case TCP_IOC_ABORT_CONN: 692 case TI_GETPEERNAME: 693 case TI_GETMYNAME: 694 ip1dbg(("tcp_ioctl: cmd 0x%x on non streams socket", 695 cmd)); 696 error = EINVAL; 697 break; 698 default: 699 /* 700 * If the conn is not closing, pass on to IP using 701 * helper stream. Bump the ioctlref to prevent tcp_close 702 * from closing the rq/wq out from underneath the ioctl 703 * if it ends up queued or aborted/interrupted. 704 */ 705 mutex_enter(&connp->conn_lock); 706 if (connp->conn_state_flags & (CONN_CLOSING)) { 707 mutex_exit(&connp->conn_lock); 708 error = EINVAL; 709 break; 710 } 711 CONN_INC_IOCTLREF_LOCKED(connp); 712 error = ldi_ioctl(connp->conn_helper_info->iphs_handle, 713 cmd, arg, mode, cr, rvalp); 714 CONN_DEC_IOCTLREF(connp); 715 break; 716 } 717 return (error); 718 } 719 720 /* ARGSUSED */ 721 static int 722 tcp_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr) 723 { 724 conn_t *connp = (conn_t *)proto_handle; 725 726 ASSERT(connp->conn_upper_handle != NULL); 727 728 /* All Solaris components should pass a cred for this operation. */ 729 ASSERT(cr != NULL); 730 731 tcp_close_common(connp, flags); 732 733 ip_free_helper_stream(connp); 734 735 /* 736 * Drop IP's reference on the conn. This is the last reference 737 * on the connp if the state was less than established. If the 738 * connection has gone into timewait state, then we will have 739 * one ref for the TCP and one more ref (total of two) for the 740 * classifier connected hash list (a timewait connections stays 741 * in connected hash till closed). 742 * 743 * We can't assert the references because there might be other 744 * transient reference places because of some walkers or queued 745 * packets in squeue for the timewait state. 746 */ 747 CONN_DEC_REF(connp); 748 749 /* 750 * EINPROGRESS tells sockfs to wait for a 'closed' upcall before 751 * freeing the socket. 752 */ 753 return (EINPROGRESS); 754 } 755 756 /* ARGSUSED */ 757 sock_lower_handle_t 758 tcp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, 759 uint_t *smodep, int *errorp, int flags, cred_t *credp) 760 { 761 conn_t *connp; 762 boolean_t isv6 = family == AF_INET6; 763 if (type != SOCK_STREAM || (family != AF_INET && family != AF_INET6) || 764 (proto != 0 && proto != IPPROTO_TCP)) { 765 *errorp = EPROTONOSUPPORT; 766 return (NULL); 767 } 768 769 connp = tcp_create_common(credp, isv6, B_TRUE, errorp); 770 if (connp == NULL) { 771 return (NULL); 772 } 773 774 /* 775 * Put the ref for TCP. Ref for IP was already put 776 * by ipcl_conn_create. Also Make the conn_t globally 777 * visible to walkers 778 */ 779 mutex_enter(&connp->conn_lock); 780 CONN_INC_REF_LOCKED(connp); 781 ASSERT(connp->conn_ref == 2); 782 connp->conn_state_flags &= ~CONN_INCIPIENT; 783 784 connp->conn_flags |= IPCL_NONSTR; 785 mutex_exit(&connp->conn_lock); 786 787 ASSERT(errorp != NULL); 788 *errorp = 0; 789 *sock_downcalls = &sock_tcp_downcalls; 790 *smodep = SM_CONNREQUIRED | SM_EXDATA | SM_ACCEPTSUPP | 791 SM_SENDFILESUPP; 792 793 return ((sock_lower_handle_t)connp); 794 } 795 796 /* 797 * tcp_fallback 798 * 799 * A direct socket is falling back to using STREAMS. The queue 800 * that is being passed down was created using tcp_open() with 801 * the SO_FALLBACK flag set. As a result, the queue is not 802 * associated with a conn, and the q_ptrs instead contain the 803 * dev and minor area that should be used. 804 * 805 * The 'issocket' flag indicates whether the FireEngine 806 * optimizations should be used. The common case would be that 807 * optimizations are enabled, and they might be subsequently 808 * disabled using the _SIOCSOCKFALLBACK ioctl. 809 */ 810 811 /* 812 * An active connection is falling back to TPI. Gather all the information 813 * required by the STREAM head and TPI sonode and send it up. 814 */ 815 static void 816 tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q, 817 boolean_t issocket, so_proto_quiesced_cb_t quiesced_cb, 818 sock_quiesce_arg_t *arg) 819 { 820 conn_t *connp = tcp->tcp_connp; 821 struct stroptions *stropt; 822 struct T_capability_ack tca; 823 struct sockaddr_in6 laddr, faddr; 824 socklen_t laddrlen, faddrlen; 825 short opts; 826 int error; 827 mblk_t *mp, *mpnext; 828 829 connp->conn_dev = (dev_t)RD(q)->q_ptr; 830 connp->conn_minor_arena = WR(q)->q_ptr; 831 832 RD(q)->q_ptr = WR(q)->q_ptr = connp; 833 834 connp->conn_rq = RD(q); 835 connp->conn_wq = WR(q); 836 837 WR(q)->q_qinfo = &tcp_sock_winit; 838 839 if (!issocket) 840 tcp_use_pure_tpi(tcp); 841 842 /* 843 * free the helper stream 844 */ 845 ip_free_helper_stream(connp); 846 847 /* 848 * Notify the STREAM head about options 849 */ 850 DB_TYPE(stropt_mp) = M_SETOPTS; 851 stropt = (struct stroptions *)stropt_mp->b_rptr; 852 stropt_mp->b_wptr += sizeof (struct stroptions); 853 stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK; 854 855 stropt->so_wroff = connp->conn_ht_iphc_len + (tcp->tcp_loopback ? 0 : 856 tcp->tcp_tcps->tcps_wroff_xtra); 857 if (tcp->tcp_snd_sack_ok) 858 stropt->so_wroff += TCPOPT_MAX_SACK_LEN; 859 stropt->so_hiwat = connp->conn_rcvbuf; 860 stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE); 861 862 putnext(RD(q), stropt_mp); 863 864 /* 865 * Collect the information needed to sync with the sonode 866 */ 867 tcp_do_capability_ack(tcp, &tca, TC1_INFO|TC1_ACCEPTOR_ID); 868 869 laddrlen = faddrlen = sizeof (sin6_t); 870 (void) tcp_getsockname((sock_lower_handle_t)connp, 871 (struct sockaddr *)&laddr, &laddrlen, CRED()); 872 error = tcp_getpeername((sock_lower_handle_t)connp, 873 (struct sockaddr *)&faddr, &faddrlen, CRED()); 874 if (error != 0) 875 faddrlen = 0; 876 877 opts = 0; 878 if (connp->conn_oobinline) 879 opts |= SO_OOBINLINE; 880 if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE) 881 opts |= SO_DONTROUTE; 882 883 /* 884 * Notify the socket that the protocol is now quiescent, 885 * and it's therefore safe move data from the socket 886 * to the stream head. 887 */ 888 mp = (*quiesced_cb)(connp->conn_upper_handle, arg, &tca, 889 (struct sockaddr *)&laddr, laddrlen, 890 (struct sockaddr *)&faddr, faddrlen, opts); 891 892 while (mp != NULL) { 893 mpnext = mp->b_next; 894 tcp->tcp_rcv_list = mp->b_next; 895 mp->b_next = NULL; 896 putnext(q, mp); 897 mp = mpnext; 898 } 899 ASSERT(tcp->tcp_rcv_last_head == NULL); 900 ASSERT(tcp->tcp_rcv_last_tail == NULL); 901 ASSERT(tcp->tcp_rcv_cnt == 0); 902 903 /* 904 * All eagers in q0 are marked as being non-STREAM, so they will 905 * make su_newconn upcalls when the handshake completes, which 906 * will fail (resulting in the conn being closed). So we just blow 907 * off everything in q0 instead of waiting for the inevitable. 908 */ 909 if (tcp->tcp_conn_req_cnt_q0 != 0) 910 tcp_eager_cleanup(tcp, B_TRUE); 911 } 912 913 /* 914 * An eager is falling back to TPI. All we have to do is send 915 * up a T_CONN_IND. 916 */ 917 static void 918 tcp_fallback_eager(tcp_t *eager, boolean_t issocket, 919 so_proto_quiesced_cb_t quiesced_cb, sock_quiesce_arg_t *arg) 920 { 921 conn_t *connp = eager->tcp_connp; 922 tcp_t *listener = eager->tcp_listener; 923 mblk_t *mp; 924 925 ASSERT(listener != NULL); 926 927 /* 928 * Notify the socket that the protocol is now quiescent, 929 * and it's therefore safe move data from the socket 930 * to tcp's rcv queue. 931 */ 932 mp = (*quiesced_cb)(connp->conn_upper_handle, arg, NULL, NULL, 0, 933 NULL, 0, 0); 934 935 if (mp != NULL) { 936 ASSERT(eager->tcp_rcv_cnt == 0); 937 938 eager->tcp_rcv_list = mp; 939 eager->tcp_rcv_cnt = msgdsize(mp); 940 while (mp->b_next != NULL) { 941 mp = mp->b_next; 942 eager->tcp_rcv_cnt += msgdsize(mp); 943 } 944 eager->tcp_rcv_last_head = mp; 945 while (mp->b_cont) 946 mp = mp->b_cont; 947 eager->tcp_rcv_last_tail = mp; 948 if (eager->tcp_rcv_cnt > eager->tcp_rwnd) 949 eager->tcp_rwnd = 0; 950 else 951 eager->tcp_rwnd -= eager->tcp_rcv_cnt; 952 } 953 954 if (!issocket) 955 eager->tcp_issocket = B_FALSE; 956 /* 957 * The stream for this eager does not yet exist, so mark it as 958 * being detached. 959 */ 960 eager->tcp_detached = B_TRUE; 961 eager->tcp_hard_binding = B_TRUE; 962 connp->conn_rq = listener->tcp_connp->conn_rq; 963 connp->conn_wq = listener->tcp_connp->conn_wq; 964 965 /* Send up the connection indication */ 966 mp = eager->tcp_conn.tcp_eager_conn_ind; 967 ASSERT(mp != NULL); 968 eager->tcp_conn.tcp_eager_conn_ind = NULL; 969 970 /* 971 * TLI/XTI applications will get confused by 972 * sending eager as an option since it violates 973 * the option semantics. So remove the eager as 974 * option since TLI/XTI app doesn't need it anyway. 975 */ 976 if (!issocket) { 977 struct T_conn_ind *conn_ind; 978 979 conn_ind = (struct T_conn_ind *)mp->b_rptr; 980 conn_ind->OPT_length = 0; 981 conn_ind->OPT_offset = 0; 982 } 983 984 /* 985 * Sockfs guarantees that the listener will not be closed 986 * during fallback. So we can safely use the listener's queue. 987 */ 988 putnext(listener->tcp_connp->conn_rq, mp); 989 } 990 991 992 int 993 tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q, 994 boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb, 995 sock_quiesce_arg_t *arg) 996 { 997 tcp_t *tcp; 998 conn_t *connp = (conn_t *)proto_handle; 999 int error; 1000 mblk_t *stropt_mp; 1001 mblk_t *ordrel_mp; 1002 1003 tcp = connp->conn_tcp; 1004 1005 stropt_mp = allocb_wait(sizeof (struct stroptions), BPRI_HI, STR_NOSIG, 1006 NULL); 1007 1008 /* Pre-allocate the T_ordrel_ind mblk. */ 1009 ASSERT(tcp->tcp_ordrel_mp == NULL); 1010 ordrel_mp = allocb_wait(sizeof (struct T_ordrel_ind), BPRI_HI, 1011 STR_NOSIG, NULL); 1012 ordrel_mp->b_datap->db_type = M_PROTO; 1013 ((struct T_ordrel_ind *)ordrel_mp->b_rptr)->PRIM_type = T_ORDREL_IND; 1014 ordrel_mp->b_wptr += sizeof (struct T_ordrel_ind); 1015 1016 /* 1017 * Enter the squeue so that no new packets can come in 1018 */ 1019 error = squeue_synch_enter(connp, NULL); 1020 if (error != 0) { 1021 /* failed to enter, free all the pre-allocated messages. */ 1022 freeb(stropt_mp); 1023 freeb(ordrel_mp); 1024 return (ENOMEM); 1025 } 1026 1027 /* 1028 * Both endpoints must be of the same type (either STREAMS or 1029 * non-STREAMS) for fusion to be enabled. So if we are fused, 1030 * we have to unfuse. 1031 */ 1032 if (tcp->tcp_fused) 1033 tcp_unfuse(tcp); 1034 1035 if (tcp->tcp_listener != NULL) { 1036 /* The eager will deal with opts when accept() is called */ 1037 freeb(stropt_mp); 1038 tcp_fallback_eager(tcp, direct_sockfs, quiesced_cb, arg); 1039 } else { 1040 tcp_fallback_noneager(tcp, stropt_mp, q, direct_sockfs, 1041 quiesced_cb, arg); 1042 } 1043 1044 /* 1045 * No longer a direct socket 1046 * 1047 * Note that we intentionally leave the upper_handle and upcalls 1048 * intact, since eagers may still be using them. 1049 */ 1050 connp->conn_flags &= ~IPCL_NONSTR; 1051 tcp->tcp_ordrel_mp = ordrel_mp; 1052 1053 /* 1054 * There should be atleast two ref's (IP + TCP) 1055 */ 1056 ASSERT(connp->conn_ref >= 2); 1057 squeue_synch_exit(connp); 1058 1059 return (0); 1060 } 1061 1062 /* 1063 * Notifies a non-STREAMS based listener about a new connection. This 1064 * function is executed on the *eager*'s squeue once the 3 way handshake 1065 * has completed. Note that the behavior differs from STREAMS, where the 1066 * T_CONN_IND is sent up by tcp_send_conn_ind while on the *listener*'s 1067 * squeue. 1068 * 1069 * Returns B_TRUE if the notification succeeded, in which case `tcp' will 1070 * be moved over to the ESTABLISHED list (q) of the listener. Othwerise, 1071 * B_FALSE is returned and `tcp' is killed. 1072 */ 1073 boolean_t 1074 tcp_newconn_notify(tcp_t *tcp, ip_recv_attr_t *ira) 1075 { 1076 tcp_t *listener = tcp->tcp_listener; 1077 conn_t *lconnp = listener->tcp_connp; 1078 conn_t *econnp = tcp->tcp_connp; 1079 tcp_t *tail; 1080 ipaddr_t *addr_cache; 1081 sock_upper_handle_t upper; 1082 struct sock_proto_props sopp; 1083 mblk_t *mp; 1084 1085 mutex_enter(&listener->tcp_eager_lock); 1086 /* 1087 * Take the eager out, if it is in the list of droppable eagers 1088 * as we are here because the 3W handshake is over. 1089 */ 1090 MAKE_UNDROPPABLE(tcp); 1091 /* 1092 * The eager already has an extra ref put in tcp_input_data 1093 * so that it stays till accept comes back even though it 1094 * might get into TCPS_CLOSED as a result of a TH_RST etc. 1095 */ 1096 ASSERT(listener->tcp_conn_req_cnt_q0 > 0); 1097 listener->tcp_conn_req_cnt_q0--; 1098 listener->tcp_conn_req_cnt_q++; 1099 1100 /* Move from SYN_RCVD to ESTABLISHED list */ 1101 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = tcp->tcp_eager_prev_q0; 1102 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp->tcp_eager_next_q0; 1103 tcp->tcp_eager_prev_q0 = NULL; 1104 tcp->tcp_eager_next_q0 = NULL; 1105 1106 /* 1107 * Insert at end of the queue because connections are accepted 1108 * in chronological order. Leaving the older connections at front 1109 * of the queue helps reducing search time. 1110 */ 1111 tail = listener->tcp_eager_last_q; 1112 if (tail != NULL) 1113 tail->tcp_eager_next_q = tcp; 1114 else 1115 listener->tcp_eager_next_q = tcp; 1116 listener->tcp_eager_last_q = tcp; 1117 tcp->tcp_eager_next_q = NULL; 1118 1119 /* we have timed out before */ 1120 if (tcp->tcp_syn_rcvd_timeout != 0) { 1121 tcp->tcp_syn_rcvd_timeout = 0; 1122 listener->tcp_syn_rcvd_timeout--; 1123 if (listener->tcp_syn_defense && 1124 listener->tcp_syn_rcvd_timeout <= 1125 (listener->tcp_tcps->tcps_conn_req_max_q0 >> 5) && 1126 10*MINUTES < TICK_TO_MSEC(ddi_get_lbolt64() - 1127 listener->tcp_last_rcv_lbolt)) { 1128 /* 1129 * Turn off the defense mode if we 1130 * believe the SYN attack is over. 1131 */ 1132 listener->tcp_syn_defense = B_FALSE; 1133 if (listener->tcp_ip_addr_cache) { 1134 kmem_free((void *)listener->tcp_ip_addr_cache, 1135 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t)); 1136 listener->tcp_ip_addr_cache = NULL; 1137 } 1138 } 1139 } 1140 addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache); 1141 if (addr_cache != NULL) { 1142 /* 1143 * We have finished a 3-way handshake with this 1144 * remote host. This proves the IP addr is good. 1145 * Cache it! 1146 */ 1147 addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] = 1148 tcp->tcp_connp->conn_faddr_v4; 1149 } 1150 mutex_exit(&listener->tcp_eager_lock); 1151 1152 /* 1153 * Notify the ULP about the newconn. It is guaranteed that no 1154 * tcp_accept() call will be made for the eager if the 1155 * notification fails. 1156 */ 1157 if ((upper = (*lconnp->conn_upcalls->su_newconn) 1158 (lconnp->conn_upper_handle, (sock_lower_handle_t)econnp, 1159 &sock_tcp_downcalls, ira->ira_cred, ira->ira_cpid, 1160 &econnp->conn_upcalls)) == NULL) { 1161 /* 1162 * Normally this should not happen, but the listener might 1163 * have done a fallback to TPI followed by a close(), in 1164 * which case tcp_closemp for this conn might have been 1165 * used by tcp_eager_cleanup(). 1166 */ 1167 mutex_enter(&listener->tcp_eager_lock); 1168 if (tcp->tcp_closemp_used) { 1169 mutex_exit(&listener->tcp_eager_lock); 1170 return (B_FALSE); 1171 } 1172 tcp->tcp_closemp_used = B_TRUE; 1173 TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15); 1174 mp = &tcp->tcp_closemp; 1175 mutex_exit(&listener->tcp_eager_lock); 1176 tcp_eager_kill(econnp, mp, NULL, NULL); 1177 return (B_FALSE); 1178 } 1179 econnp->conn_upper_handle = upper; 1180 1181 tcp->tcp_detached = B_FALSE; 1182 tcp->tcp_hard_binding = B_FALSE; 1183 tcp->tcp_tconnind_started = B_TRUE; 1184 1185 if (econnp->conn_keepalive) { 1186 tcp->tcp_ka_last_intrvl = 0; 1187 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer, 1188 tcp->tcp_ka_interval); 1189 } 1190 1191 /* Update the necessary parameters */ 1192 tcp_get_proto_props(tcp, &sopp); 1193 1194 (*econnp->conn_upcalls->su_set_proto_props) 1195 (econnp->conn_upper_handle, &sopp); 1196 1197 return (B_TRUE); 1198 } 1199