/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

/* This file contains all TCP kernel socket related functions.
*/ 27 28 #include <sys/types.h> 29 #include <sys/strlog.h> 30 #include <sys/policy.h> 31 #include <sys/sockio.h> 32 #include <sys/strsubr.h> 33 #include <sys/strsun.h> 34 #include <sys/squeue_impl.h> 35 #include <sys/squeue.h> 36 #define _SUN_TPI_VERSION 2 37 #include <sys/tihdr.h> 38 #include <sys/timod.h> 39 #include <sys/tpicommon.h> 40 #include <sys/socketvar.h> 41 42 #include <inet/common.h> 43 #include <inet/proto_set.h> 44 #include <inet/ip.h> 45 #include <inet/tcp.h> 46 #include <inet/tcp_impl.h> 47 48 static void tcp_activate(sock_lower_handle_t, sock_upper_handle_t, 49 sock_upcalls_t *, int, cred_t *); 50 static int tcp_accept(sock_lower_handle_t, sock_lower_handle_t, 51 sock_upper_handle_t, cred_t *); 52 static int tcp_bind(sock_lower_handle_t, struct sockaddr *, 53 socklen_t, cred_t *); 54 static int tcp_listen(sock_lower_handle_t, int, cred_t *); 55 static int tcp_connect(sock_lower_handle_t, const struct sockaddr *, 56 socklen_t, sock_connid_t *, cred_t *); 57 static int tcp_getsockopt(sock_lower_handle_t, int, int, void *, 58 socklen_t *, cred_t *); 59 static int tcp_setsockopt(sock_lower_handle_t, int, int, const void *, 60 socklen_t, cred_t *); 61 static int tcp_sendmsg(sock_lower_handle_t, mblk_t *, struct nmsghdr *, 62 cred_t *cr); 63 static int tcp_shutdown(sock_lower_handle_t, int, cred_t *); 64 static void tcp_clr_flowctrl(sock_lower_handle_t); 65 static int tcp_ioctl(sock_lower_handle_t, int, intptr_t, int, int32_t *, 66 cred_t *); 67 static int tcp_close(sock_lower_handle_t, int, cred_t *); 68 69 sock_downcalls_t sock_tcp_downcalls = { 70 tcp_activate, 71 tcp_accept, 72 tcp_bind, 73 tcp_listen, 74 tcp_connect, 75 tcp_getpeername, 76 tcp_getsockname, 77 tcp_getsockopt, 78 tcp_setsockopt, 79 tcp_sendmsg, 80 NULL, 81 NULL, 82 NULL, 83 tcp_shutdown, 84 tcp_clr_flowctrl, 85 tcp_ioctl, 86 tcp_close, 87 }; 88 89 /* ARGSUSED */ 90 static void 91 tcp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle, 92 sock_upcalls_t 
*sock_upcalls, int flags, cred_t *cr) 93 { 94 conn_t *connp = (conn_t *)proto_handle; 95 struct sock_proto_props sopp; 96 extern struct module_info tcp_rinfo; 97 98 ASSERT(connp->conn_upper_handle == NULL); 99 100 /* All Solaris components should pass a cred for this operation. */ 101 ASSERT(cr != NULL); 102 103 sopp.sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT | 104 SOCKOPT_MAXPSZ | SOCKOPT_MAXBLK | SOCKOPT_RCVTIMER | 105 SOCKOPT_RCVTHRESH | SOCKOPT_MAXADDRLEN | SOCKOPT_MINPSZ; 106 107 sopp.sopp_rxhiwat = SOCKET_RECVHIWATER; 108 sopp.sopp_rxlowat = SOCKET_RECVLOWATER; 109 sopp.sopp_maxpsz = INFPSZ; 110 sopp.sopp_maxblk = INFPSZ; 111 sopp.sopp_rcvtimer = SOCKET_TIMER_INTERVAL; 112 sopp.sopp_rcvthresh = SOCKET_RECVHIWATER >> 3; 113 sopp.sopp_maxaddrlen = sizeof (sin6_t); 114 sopp.sopp_minpsz = (tcp_rinfo.mi_minpsz == 1) ? 0 : 115 tcp_rinfo.mi_minpsz; 116 117 connp->conn_upcalls = sock_upcalls; 118 connp->conn_upper_handle = sock_handle; 119 120 ASSERT(connp->conn_rcvbuf != 0 && 121 connp->conn_rcvbuf == connp->conn_tcp->tcp_rwnd); 122 (*sock_upcalls->su_set_proto_props)(sock_handle, &sopp); 123 } 124 125 /*ARGSUSED*/ 126 static int 127 tcp_accept(sock_lower_handle_t lproto_handle, 128 sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle, 129 cred_t *cr) 130 { 131 conn_t *lconnp, *econnp; 132 tcp_t *listener, *eager; 133 134 lconnp = (conn_t *)lproto_handle; 135 listener = lconnp->conn_tcp; 136 ASSERT(listener->tcp_state == TCPS_LISTEN); 137 econnp = (conn_t *)eproto_handle; 138 eager = econnp->conn_tcp; 139 ASSERT(eager->tcp_listener != NULL); 140 ASSERT(IPCL_IS_NONSTR(econnp)); 141 ASSERT(lconnp->conn_upper_handle != NULL); 142 143 /* 144 * It is possible for the accept thread to race with the thread that 145 * made the su_newconn upcall in tcp_newconn_notify. Both 146 * tcp_newconn_notify and tcp_accept require that conn_upper_handle 147 * and conn_upcalls be set before returning, so they both write to 148 * them. 
However, we're guaranteed that the value written is the same 149 * for both threads. 150 */ 151 ASSERT(econnp->conn_upper_handle == NULL || 152 econnp->conn_upper_handle == sock_handle); 153 ASSERT(econnp->conn_upcalls == NULL || 154 econnp->conn_upcalls == lconnp->conn_upcalls); 155 econnp->conn_upper_handle = sock_handle; 156 econnp->conn_upcalls = lconnp->conn_upcalls; 157 158 ASSERT(econnp->conn_netstack == 159 listener->tcp_connp->conn_netstack); 160 ASSERT(eager->tcp_tcps == listener->tcp_tcps); 161 162 /* 163 * We should have a minimum of 2 references on the conn at this 164 * point. One for TCP and one for the newconn notification 165 * (which is now taken over by IP). In the normal case we would 166 * also have another reference (making a total of 3) for the conn 167 * being in the classifier hash list. However the eager could have 168 * received an RST subsequently and tcp_closei_local could have 169 * removed the eager from the classifier hash list, hence we can't 170 * assert that reference. 171 */ 172 ASSERT(econnp->conn_ref >= 2); 173 174 /* 175 * An error is returned if this conn has been reset, which will 176 * cause the socket to be closed immediately. The eager will be 177 * unlinked from the listener during close. 178 */ 179 if (eager->tcp_state < TCPS_ESTABLISHED) 180 return (ECONNABORTED); 181 182 mutex_enter(&listener->tcp_eager_lock); 183 /* 184 * Non-STREAMS listeners never defer the notification of new 185 * connections. 186 */ 187 ASSERT(!listener->tcp_eager_prev_q0->tcp_conn_def_q0); 188 tcp_eager_unlink(eager); 189 mutex_exit(&listener->tcp_eager_lock); 190 CONN_DEC_REF(listener->tcp_connp); 191 192 return (0); 193 } 194 195 static int 196 tcp_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa, 197 socklen_t len, cred_t *cr) 198 { 199 int error; 200 conn_t *connp = (conn_t *)proto_handle; 201 202 /* All Solaris components should pass a cred for this operation. 
*/ 203 ASSERT(cr != NULL); 204 ASSERT(connp->conn_upper_handle != NULL); 205 206 error = squeue_synch_enter(connp, NULL); 207 if (error != 0) { 208 /* failed to enter */ 209 return (ENOSR); 210 } 211 212 /* binding to a NULL address really means unbind */ 213 if (sa == NULL) { 214 if (connp->conn_tcp->tcp_state < TCPS_LISTEN) 215 error = tcp_do_unbind(connp); 216 else 217 error = EINVAL; 218 } else { 219 error = tcp_do_bind(connp, sa, len, cr, B_TRUE); 220 } 221 222 squeue_synch_exit(connp); 223 224 if (error < 0) { 225 if (error == -TOUTSTATE) 226 error = EINVAL; 227 else 228 error = proto_tlitosyserr(-error); 229 } 230 231 return (error); 232 } 233 234 /* ARGSUSED */ 235 static int 236 tcp_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr) 237 { 238 conn_t *connp = (conn_t *)proto_handle; 239 tcp_t *tcp = connp->conn_tcp; 240 int error; 241 242 ASSERT(connp->conn_upper_handle != NULL); 243 244 /* All Solaris components should pass a cred for this operation. */ 245 ASSERT(cr != NULL); 246 247 error = squeue_synch_enter(connp, NULL); 248 if (error != 0) { 249 /* failed to enter */ 250 return (ENOBUFS); 251 } 252 253 error = tcp_do_listen(connp, NULL, 0, backlog, cr, B_FALSE); 254 if (error == 0) { 255 /* 256 * sockfs needs to know what's the maximum number of socket 257 * that can be queued on the listener. 
258 */ 259 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 260 SOCK_OPCTL_ENAB_ACCEPT, 261 (uintptr_t)(tcp->tcp_conn_req_max + 262 tcp->tcp_tcps->tcps_conn_req_max_q0)); 263 } else if (error < 0) { 264 if (error == -TOUTSTATE) 265 error = EINVAL; 266 else 267 error = proto_tlitosyserr(-error); 268 } 269 squeue_synch_exit(connp); 270 return (error); 271 } 272 273 static int 274 tcp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, 275 socklen_t len, sock_connid_t *id, cred_t *cr) 276 { 277 conn_t *connp = (conn_t *)proto_handle; 278 int error; 279 280 ASSERT(connp->conn_upper_handle != NULL); 281 282 /* All Solaris components should pass a cred for this operation. */ 283 ASSERT(cr != NULL); 284 285 error = proto_verify_ip_addr(connp->conn_family, sa, len); 286 if (error != 0) { 287 return (error); 288 } 289 290 error = squeue_synch_enter(connp, NULL); 291 if (error != 0) { 292 /* failed to enter */ 293 return (ENOSR); 294 } 295 296 /* 297 * TCP supports quick connect, so no need to do an implicit bind 298 */ 299 error = tcp_do_connect(connp, sa, len, cr, curproc->p_pid); 300 if (error == 0) { 301 *id = connp->conn_tcp->tcp_connid; 302 } else if (error < 0) { 303 if (error == -TOUTSTATE) { 304 switch (connp->conn_tcp->tcp_state) { 305 case TCPS_SYN_SENT: 306 error = EALREADY; 307 break; 308 case TCPS_ESTABLISHED: 309 error = EISCONN; 310 break; 311 case TCPS_LISTEN: 312 error = EOPNOTSUPP; 313 break; 314 default: 315 error = EINVAL; 316 break; 317 } 318 } else { 319 error = proto_tlitosyserr(-error); 320 } 321 } 322 323 if (connp->conn_tcp->tcp_loopback) { 324 struct sock_proto_props sopp; 325 326 sopp.sopp_flags = SOCKOPT_LOOPBACK; 327 sopp.sopp_loopback = B_TRUE; 328 329 (*connp->conn_upcalls->su_set_proto_props)( 330 connp->conn_upper_handle, &sopp); 331 } 332 done: 333 squeue_synch_exit(connp); 334 335 return ((error == 0) ? 
EINPROGRESS : error); 336 } 337 338 /* ARGSUSED3 */ 339 int 340 tcp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr, 341 socklen_t *addrlenp, cred_t *cr) 342 { 343 conn_t *connp = (conn_t *)proto_handle; 344 tcp_t *tcp = connp->conn_tcp; 345 346 /* All Solaris components should pass a cred for this operation. */ 347 ASSERT(cr != NULL); 348 349 ASSERT(tcp != NULL); 350 if (tcp->tcp_state < TCPS_SYN_RCVD) 351 return (ENOTCONN); 352 353 return (conn_getpeername(connp, addr, addrlenp)); 354 } 355 356 /* ARGSUSED3 */ 357 int 358 tcp_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *addr, 359 socklen_t *addrlenp, cred_t *cr) 360 { 361 conn_t *connp = (conn_t *)proto_handle; 362 363 /* All Solaris components should pass a cred for this operation. */ 364 ASSERT(cr != NULL); 365 366 return (conn_getsockname(connp, addr, addrlenp)); 367 } 368 369 /* returns UNIX error, the optlen is a value-result arg */ 370 static int 371 tcp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, 372 void *optvalp, socklen_t *optlen, cred_t *cr) 373 { 374 conn_t *connp = (conn_t *)proto_handle; 375 int error; 376 t_uscalar_t max_optbuf_len; 377 void *optvalp_buf; 378 int len; 379 380 ASSERT(connp->conn_upper_handle != NULL); 381 382 error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len, 383 tcp_opt_obj.odb_opt_des_arr, 384 tcp_opt_obj.odb_opt_arr_cnt, 385 B_FALSE, B_TRUE, cr); 386 if (error != 0) { 387 if (error < 0) { 388 error = proto_tlitosyserr(-error); 389 } 390 return (error); 391 } 392 393 optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP); 394 395 error = squeue_synch_enter(connp, NULL); 396 if (error == ENOMEM) { 397 kmem_free(optvalp_buf, max_optbuf_len); 398 return (ENOMEM); 399 } 400 401 len = tcp_opt_get(connp, level, option_name, optvalp_buf); 402 squeue_synch_exit(connp); 403 404 if (len == -1) { 405 kmem_free(optvalp_buf, max_optbuf_len); 406 return (EINVAL); 407 } 408 409 /* 410 * update optlen and copy 
option value 411 */ 412 t_uscalar_t size = MIN(len, *optlen); 413 414 bcopy(optvalp_buf, optvalp, size); 415 bcopy(&size, optlen, sizeof (size)); 416 417 kmem_free(optvalp_buf, max_optbuf_len); 418 return (0); 419 } 420 421 static int 422 tcp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, 423 const void *optvalp, socklen_t optlen, cred_t *cr) 424 { 425 conn_t *connp = (conn_t *)proto_handle; 426 int error; 427 428 ASSERT(connp->conn_upper_handle != NULL); 429 /* 430 * Entering the squeue synchronously can result in a context switch, 431 * which can cause a rather sever performance degradation. So we try to 432 * handle whatever options we can without entering the squeue. 433 */ 434 if (level == IPPROTO_TCP) { 435 switch (option_name) { 436 case TCP_NODELAY: 437 if (optlen != sizeof (int32_t)) 438 return (EINVAL); 439 mutex_enter(&connp->conn_tcp->tcp_non_sq_lock); 440 connp->conn_tcp->tcp_naglim = *(int *)optvalp ? 1 : 441 connp->conn_tcp->tcp_mss; 442 mutex_exit(&connp->conn_tcp->tcp_non_sq_lock); 443 return (0); 444 default: 445 break; 446 } 447 } 448 449 error = squeue_synch_enter(connp, NULL); 450 if (error == ENOMEM) { 451 return (ENOMEM); 452 } 453 454 error = proto_opt_check(level, option_name, optlen, NULL, 455 tcp_opt_obj.odb_opt_des_arr, 456 tcp_opt_obj.odb_opt_arr_cnt, 457 B_TRUE, B_FALSE, cr); 458 459 if (error != 0) { 460 if (error < 0) { 461 error = proto_tlitosyserr(-error); 462 } 463 squeue_synch_exit(connp); 464 return (error); 465 } 466 467 error = tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name, 468 optlen, (uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp, 469 NULL, cr); 470 squeue_synch_exit(connp); 471 472 ASSERT(error >= 0); 473 474 return (error); 475 } 476 477 /* ARGSUSED */ 478 static int 479 tcp_sendmsg(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, 480 cred_t *cr) 481 { 482 tcp_t *tcp; 483 uint32_t msize; 484 conn_t *connp = (conn_t *)proto_handle; 485 int32_t tcpstate; 
486 487 /* All Solaris components should pass a cred for this operation. */ 488 ASSERT(cr != NULL); 489 490 ASSERT(connp->conn_ref >= 2); 491 ASSERT(connp->conn_upper_handle != NULL); 492 493 if (msg->msg_controllen != 0) { 494 freemsg(mp); 495 return (EOPNOTSUPP); 496 } 497 498 switch (DB_TYPE(mp)) { 499 case M_DATA: 500 tcp = connp->conn_tcp; 501 ASSERT(tcp != NULL); 502 503 tcpstate = tcp->tcp_state; 504 if (tcpstate < TCPS_ESTABLISHED) { 505 freemsg(mp); 506 /* 507 * We return ENOTCONN if the endpoint is trying to 508 * connect or has never been connected, and EPIPE if it 509 * has been disconnected. The connection id helps us 510 * distinguish between the last two cases. 511 */ 512 return ((tcpstate == TCPS_SYN_SENT) ? ENOTCONN : 513 ((tcp->tcp_connid > 0) ? EPIPE : ENOTCONN)); 514 } else if (tcpstate > TCPS_CLOSE_WAIT) { 515 freemsg(mp); 516 return (EPIPE); 517 } 518 519 msize = msgdsize(mp); 520 521 mutex_enter(&tcp->tcp_non_sq_lock); 522 tcp->tcp_squeue_bytes += msize; 523 /* 524 * Squeue Flow Control 525 */ 526 if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) { 527 tcp_setqfull(tcp); 528 } 529 mutex_exit(&tcp->tcp_non_sq_lock); 530 531 /* 532 * The application may pass in an address in the msghdr, but 533 * we ignore the address on connection-oriented sockets. 534 * Just like BSD this code does not generate an error for 535 * TCP (a CONNREQUIRED socket) when sending to an address 536 * passed in with sendto/sendmsg. Instead the data is 537 * delivered on the connection as if no address had been 538 * supplied. 
539 */ 540 CONN_INC_REF(connp); 541 542 if (msg->msg_flags & MSG_OOB) { 543 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output_urgent, 544 connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT); 545 } else { 546 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output, 547 connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT); 548 } 549 550 return (0); 551 552 default: 553 ASSERT(0); 554 } 555 556 freemsg(mp); 557 return (0); 558 } 559 560 /* ARGSUSED */ 561 static int 562 tcp_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr) 563 { 564 conn_t *connp = (conn_t *)proto_handle; 565 tcp_t *tcp = connp->conn_tcp; 566 567 ASSERT(connp->conn_upper_handle != NULL); 568 569 /* All Solaris components should pass a cred for this operation. */ 570 ASSERT(cr != NULL); 571 572 /* 573 * X/Open requires that we check the connected state. 574 */ 575 if (tcp->tcp_state < TCPS_SYN_SENT) 576 return (ENOTCONN); 577 578 /* shutdown the send side */ 579 if (how != SHUT_RD) { 580 mblk_t *bp; 581 582 bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL); 583 CONN_INC_REF(connp); 584 SQUEUE_ENTER_ONE(connp->conn_sqp, bp, tcp_shutdown_output, 585 connp, NULL, SQ_NODRAIN, SQTAG_TCP_SHUTDOWN_OUTPUT); 586 587 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 588 SOCK_OPCTL_SHUT_SEND, 0); 589 } 590 591 /* shutdown the recv side */ 592 if (how != SHUT_WR) 593 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 594 SOCK_OPCTL_SHUT_RECV, 0); 595 596 return (0); 597 } 598 599 static void 600 tcp_clr_flowctrl(sock_lower_handle_t proto_handle) 601 { 602 conn_t *connp = (conn_t *)proto_handle; 603 tcp_t *tcp = connp->conn_tcp; 604 mblk_t *mp; 605 int error; 606 607 ASSERT(connp->conn_upper_handle != NULL); 608 609 /* 610 * If tcp->tcp_rsrv_mp == NULL, it means that tcp_clr_flowctrl() 611 * is currently running. 
612 */ 613 mutex_enter(&tcp->tcp_rsrv_mp_lock); 614 if ((mp = tcp->tcp_rsrv_mp) == NULL) { 615 mutex_exit(&tcp->tcp_rsrv_mp_lock); 616 return; 617 } 618 tcp->tcp_rsrv_mp = NULL; 619 mutex_exit(&tcp->tcp_rsrv_mp_lock); 620 621 error = squeue_synch_enter(connp, mp); 622 ASSERT(error == 0); 623 624 mutex_enter(&tcp->tcp_rsrv_mp_lock); 625 tcp->tcp_rsrv_mp = mp; 626 mutex_exit(&tcp->tcp_rsrv_mp_lock); 627 628 if (tcp->tcp_fused) { 629 tcp_fuse_backenable(tcp); 630 } else { 631 tcp->tcp_rwnd = connp->conn_rcvbuf; 632 /* 633 * Send back a window update immediately if TCP is above 634 * ESTABLISHED state and the increase of the rcv window 635 * that the other side knows is at least 1 MSS after flow 636 * control is lifted. 637 */ 638 if (tcp->tcp_state >= TCPS_ESTABLISHED && 639 tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) { 640 tcp_xmit_ctl(NULL, tcp, 641 (tcp->tcp_swnd == 0) ? tcp->tcp_suna : 642 tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK); 643 } 644 } 645 646 squeue_synch_exit(connp); 647 } 648 649 /* ARGSUSED */ 650 static int 651 tcp_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg, 652 int mode, int32_t *rvalp, cred_t *cr) 653 { 654 conn_t *connp = (conn_t *)proto_handle; 655 int error; 656 657 ASSERT(connp->conn_upper_handle != NULL); 658 659 /* All Solaris components should pass a cred for this operation. */ 660 ASSERT(cr != NULL); 661 662 /* 663 * If we don't have a helper stream then create one. 664 * ip_create_helper_stream takes care of locking the conn_t, 665 * so this check for NULL is just a performance optimization. 666 */ 667 if (connp->conn_helper_info == NULL) { 668 tcp_stack_t *tcps = connp->conn_tcp->tcp_tcps; 669 670 /* 671 * Create a helper stream for non-STREAMS socket. 
672 */ 673 error = ip_create_helper_stream(connp, tcps->tcps_ldi_ident); 674 if (error != 0) { 675 ip0dbg(("tcp_ioctl: create of IP helper stream " 676 "failed %d\n", error)); 677 return (error); 678 } 679 } 680 681 switch (cmd) { 682 case ND_SET: 683 case ND_GET: 684 case _SIOCSOCKFALLBACK: 685 case TCP_IOC_ABORT_CONN: 686 case TI_GETPEERNAME: 687 case TI_GETMYNAME: 688 ip1dbg(("tcp_ioctl: cmd 0x%x on non streams socket", 689 cmd)); 690 error = EINVAL; 691 break; 692 default: 693 /* 694 * If the conn is not closing, pass on to IP using 695 * helper stream. Bump the ioctlref to prevent tcp_close 696 * from closing the rq/wq out from underneath the ioctl 697 * if it ends up queued or aborted/interrupted. 698 */ 699 mutex_enter(&connp->conn_lock); 700 if (connp->conn_state_flags & (CONN_CLOSING)) { 701 mutex_exit(&connp->conn_lock); 702 error = EINVAL; 703 break; 704 } 705 CONN_INC_IOCTLREF_LOCKED(connp); 706 error = ldi_ioctl(connp->conn_helper_info->iphs_handle, 707 cmd, arg, mode, cr, rvalp); 708 CONN_DEC_IOCTLREF(connp); 709 break; 710 } 711 return (error); 712 } 713 714 /* ARGSUSED */ 715 static int 716 tcp_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr) 717 { 718 conn_t *connp = (conn_t *)proto_handle; 719 720 ASSERT(connp->conn_upper_handle != NULL); 721 722 /* All Solaris components should pass a cred for this operation. */ 723 ASSERT(cr != NULL); 724 725 tcp_close_common(connp, flags); 726 727 ip_free_helper_stream(connp); 728 729 /* 730 * Drop IP's reference on the conn. This is the last reference 731 * on the connp if the state was less than established. If the 732 * connection has gone into timewait state, then we will have 733 * one ref for the TCP and one more ref (total of two) for the 734 * classifier connected hash list (a timewait connections stays 735 * in connected hash till closed). 
736 * 737 * We can't assert the references because there might be other 738 * transient reference places because of some walkers or queued 739 * packets in squeue for the timewait state. 740 */ 741 CONN_DEC_REF(connp); 742 743 /* 744 * EINPROGRESS tells sockfs to wait for a 'closed' upcall before 745 * freeing the socket. 746 */ 747 return (EINPROGRESS); 748 } 749 750 /* ARGSUSED */ 751 sock_lower_handle_t 752 tcp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, 753 uint_t *smodep, int *errorp, int flags, cred_t *credp) 754 { 755 conn_t *connp; 756 boolean_t isv6 = family == AF_INET6; 757 if (type != SOCK_STREAM || (family != AF_INET && family != AF_INET6) || 758 (proto != 0 && proto != IPPROTO_TCP)) { 759 *errorp = EPROTONOSUPPORT; 760 return (NULL); 761 } 762 763 connp = tcp_create_common(credp, isv6, B_TRUE, errorp); 764 if (connp == NULL) { 765 return (NULL); 766 } 767 768 /* 769 * Put the ref for TCP. Ref for IP was already put 770 * by ipcl_conn_create. Also Make the conn_t globally 771 * visible to walkers 772 */ 773 mutex_enter(&connp->conn_lock); 774 CONN_INC_REF_LOCKED(connp); 775 ASSERT(connp->conn_ref == 2); 776 connp->conn_state_flags &= ~CONN_INCIPIENT; 777 778 connp->conn_flags |= IPCL_NONSTR; 779 mutex_exit(&connp->conn_lock); 780 781 ASSERT(errorp != NULL); 782 *errorp = 0; 783 *sock_downcalls = &sock_tcp_downcalls; 784 *smodep = SM_CONNREQUIRED | SM_EXDATA | SM_ACCEPTSUPP | 785 SM_SENDFILESUPP; 786 787 return ((sock_lower_handle_t)connp); 788 } 789 790 /* 791 * tcp_fallback 792 * 793 * A direct socket is falling back to using STREAMS. The queue 794 * that is being passed down was created using tcp_open() with 795 * the SO_FALLBACK flag set. As a result, the queue is not 796 * associated with a conn, and the q_ptrs instead contain the 797 * dev and minor area that should be used. 798 * 799 * The 'issocket' flag indicates whether the FireEngine 800 * optimizations should be used. 
The common case would be that
 * optimizations are enabled, and they might be subsequently
 * disabled using the _SIOCSOCKFALLBACK ioctl.
 */

/*
 * Fallback of an active (non-eager) connection to TPI.
 *
 * Attaches the conn to the STREAMS queue pair `q', sends the pre-allocated
 * `stropt_mp' upstream as an M_SETOPTS message, gathers the capability and
 * address information the TPI sonode needs, and passes it to `quiesced_cb'.
 * Whatever message chain the callback hands back is forwarded with
 * putnext().  Finally any eagers still on q0 are cleaned up.
 *
 *	tcp		the endpoint falling back
 *	stropt_mp	mblk large enough for a struct stroptions; consumed
 *	q		queue of the new stream
 *	issocket	B_FALSE selects pure TPI behaviour
 *	quiesced_cb	socket-layer callback, invoked with `arg'
 */
static void
tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q,
    boolean_t issocket, so_proto_quiesced_cb_t quiesced_cb,
    sock_quiesce_arg_t *arg)
{
	conn_t			*connp = tcp->tcp_connp;
	queue_t			*rq = RD(q);
	queue_t			*wq = WR(q);
	struct stroptions	*sop;
	struct T_capability_ack	cap_ack;
	struct sockaddr_in6	local_sa, peer_sa;
	socklen_t		local_len, peer_len;
	short			so_opts = 0;
	mblk_t			*cur, *next;

	/* The q_ptrs carry the dev and minor arena until we claim them. */
	connp->conn_dev = (dev_t)rq->q_ptr;
	connp->conn_minor_arena = wq->q_ptr;

	wq->q_ptr = connp;
	rq->q_ptr = connp;

	connp->conn_rq = rq;
	connp->conn_wq = wq;

	wq->q_qinfo = &tcp_sock_winit;

	if (!issocket)
		tcp_use_pure_tpi(tcp);

	/* The helper stream is no longer needed. */
	ip_free_helper_stream(connp);

	/*
	 * Build the M_SETOPTS message for the STREAM head.
	 */
	DB_TYPE(stropt_mp) = M_SETOPTS;
	sop = (struct stroptions *)stropt_mp->b_rptr;
	stropt_mp->b_wptr += sizeof (struct stroptions);
	sop->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK;

	/* Write offset: header length, plus extras that may apply. */
	sop->so_wroff = connp->conn_ht_iphc_len;
	if (!tcp->tcp_loopback)
		sop->so_wroff += tcp->tcp_tcps->tcps_wroff_xtra;
	if (tcp->tcp_snd_sack_ok)
		sop->so_wroff += TCPOPT_MAX_SACK_LEN;
	sop->so_hiwat = connp->conn_rcvbuf;
	sop->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE);

	putnext(rq, stropt_mp);

	/*
	 * Gather what the sonode needs in order to sync up with us.
	 */
	tcp_do_capability_ack(tcp, &cap_ack, TC1_INFO|TC1_ACCEPTOR_ID);

	local_len = peer_len = sizeof (sin6_t);
	(void) tcp_getsockname((sock_lower_handle_t)connp,
	    (struct sockaddr *)&local_sa, &local_len, CRED());
	/* No peer address is reported if it cannot be obtained. */
	if (tcp_getpeername((sock_lower_handle_t)connp,
	    (struct sockaddr *)&peer_sa, &peer_len, CRED()) != 0)
		peer_len = 0;

	if (connp->conn_oobinline)
		so_opts |= SO_OOBINLINE;
	if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
		so_opts |= SO_DONTROUTE;

	/*
	 * Notify the socket that the protocol is now quiescent,
	 * and it's therefore safe move data from the socket
	 * to the stream head.
	 */
	cur = (*quiesced_cb)(connp->conn_upper_handle, arg, &cap_ack,
	    (struct sockaddr *)&local_sa, local_len,
	    (struct sockaddr *)&peer_sa, peer_len, so_opts);

	/* Forward each message of the returned b_next chain individually. */
	for (; cur != NULL; cur = next) {
		next = cur->b_next;
		tcp->tcp_rcv_list = next;
		cur->b_next = NULL;
		putnext(q, cur);
	}
	ASSERT(tcp->tcp_rcv_last_head == NULL);
	ASSERT(tcp->tcp_rcv_last_tail == NULL);
	ASSERT(tcp->tcp_rcv_cnt == 0);

	/*
	 * All eagers in q0 are marked as being non-STREAM, so they will
	 * make su_newconn upcalls when the handshake completes, which
	 * will fail (resulting in the conn being closed). So we just blow
	 * off everything in q0 instead of waiting for the inevitable.
	 */
	if (tcp->tcp_conn_req_cnt_q0 != 0)
		tcp_eager_cleanup(tcp, B_TRUE);
}

/*
 * An eager is falling back to TPI. All we have to do is send
 * up a T_CONN_IND.
910 */ 911 static void 912 tcp_fallback_eager(tcp_t *eager, boolean_t issocket, 913 so_proto_quiesced_cb_t quiesced_cb, sock_quiesce_arg_t *arg) 914 { 915 conn_t *connp = eager->tcp_connp; 916 tcp_t *listener = eager->tcp_listener; 917 mblk_t *mp; 918 919 ASSERT(listener != NULL); 920 921 /* 922 * Notify the socket that the protocol is now quiescent, 923 * and it's therefore safe move data from the socket 924 * to tcp's rcv queue. 925 */ 926 mp = (*quiesced_cb)(connp->conn_upper_handle, arg, NULL, NULL, 0, 927 NULL, 0, 0); 928 929 if (mp != NULL) { 930 ASSERT(eager->tcp_rcv_cnt == 0); 931 932 eager->tcp_rcv_list = mp; 933 eager->tcp_rcv_cnt = msgdsize(mp); 934 while (mp->b_next != NULL) { 935 mp = mp->b_next; 936 eager->tcp_rcv_cnt += msgdsize(mp); 937 } 938 eager->tcp_rcv_last_head = mp; 939 while (mp->b_cont) 940 mp = mp->b_cont; 941 eager->tcp_rcv_last_tail = mp; 942 if (eager->tcp_rcv_cnt > eager->tcp_rwnd) 943 eager->tcp_rwnd = 0; 944 else 945 eager->tcp_rwnd -= eager->tcp_rcv_cnt; 946 } 947 948 if (!issocket) 949 eager->tcp_issocket = B_FALSE; 950 /* 951 * The stream for this eager does not yet exist, so mark it as 952 * being detached. 953 */ 954 eager->tcp_detached = B_TRUE; 955 eager->tcp_hard_binding = B_TRUE; 956 connp->conn_rq = listener->tcp_connp->conn_rq; 957 connp->conn_wq = listener->tcp_connp->conn_wq; 958 959 /* Send up the connection indication */ 960 mp = eager->tcp_conn.tcp_eager_conn_ind; 961 ASSERT(mp != NULL); 962 eager->tcp_conn.tcp_eager_conn_ind = NULL; 963 964 /* 965 * TLI/XTI applications will get confused by 966 * sending eager as an option since it violates 967 * the option semantics. So remove the eager as 968 * option since TLI/XTI app doesn't need it anyway. 969 */ 970 if (!issocket) { 971 struct T_conn_ind *conn_ind; 972 973 conn_ind = (struct T_conn_ind *)mp->b_rptr; 974 conn_ind->OPT_length = 0; 975 conn_ind->OPT_offset = 0; 976 } 977 978 /* 979 * Sockfs guarantees that the listener will not be closed 980 * during fallback. 
So we can safely use the listener's queue. 981 */ 982 putnext(listener->tcp_connp->conn_rq, mp); 983 } 984 985 986 int 987 tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q, 988 boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb, 989 sock_quiesce_arg_t *arg) 990 { 991 tcp_t *tcp; 992 conn_t *connp = (conn_t *)proto_handle; 993 int error; 994 mblk_t *stropt_mp; 995 mblk_t *ordrel_mp; 996 997 tcp = connp->conn_tcp; 998 999 stropt_mp = allocb_wait(sizeof (struct stroptions), BPRI_HI, STR_NOSIG, 1000 NULL); 1001 1002 /* Pre-allocate the T_ordrel_ind mblk. */ 1003 ASSERT(tcp->tcp_ordrel_mp == NULL); 1004 ordrel_mp = allocb_wait(sizeof (struct T_ordrel_ind), BPRI_HI, 1005 STR_NOSIG, NULL); 1006 ordrel_mp->b_datap->db_type = M_PROTO; 1007 ((struct T_ordrel_ind *)ordrel_mp->b_rptr)->PRIM_type = T_ORDREL_IND; 1008 ordrel_mp->b_wptr += sizeof (struct T_ordrel_ind); 1009 1010 /* 1011 * Enter the squeue so that no new packets can come in 1012 */ 1013 error = squeue_synch_enter(connp, NULL); 1014 if (error != 0) { 1015 /* failed to enter, free all the pre-allocated messages. */ 1016 freeb(stropt_mp); 1017 freeb(ordrel_mp); 1018 return (ENOMEM); 1019 } 1020 1021 /* 1022 * Both endpoints must be of the same type (either STREAMS or 1023 * non-STREAMS) for fusion to be enabled. So if we are fused, 1024 * we have to unfuse. 1025 */ 1026 if (tcp->tcp_fused) 1027 tcp_unfuse(tcp); 1028 1029 if (tcp->tcp_listener != NULL) { 1030 /* The eager will deal with opts when accept() is called */ 1031 freeb(stropt_mp); 1032 tcp_fallback_eager(tcp, direct_sockfs, quiesced_cb, arg); 1033 } else { 1034 tcp_fallback_noneager(tcp, stropt_mp, q, direct_sockfs, 1035 quiesced_cb, arg); 1036 } 1037 1038 /* 1039 * No longer a direct socket 1040 * 1041 * Note that we intentionally leave the upper_handle and upcalls 1042 * intact, since eagers may still be using them. 
1043 */ 1044 connp->conn_flags &= ~IPCL_NONSTR; 1045 tcp->tcp_ordrel_mp = ordrel_mp; 1046 1047 /* 1048 * There should be atleast two ref's (IP + TCP) 1049 */ 1050 ASSERT(connp->conn_ref >= 2); 1051 squeue_synch_exit(connp); 1052 1053 return (0); 1054 } 1055 1056 /* 1057 * Notifies a non-STREAMS based listener about a new connection. This 1058 * function is executed on the *eager*'s squeue once the 3 way handshake 1059 * has completed. Note that the behavior differs from STREAMS, where the 1060 * T_CONN_IND is sent up by tcp_send_conn_ind while on the *listener*'s 1061 * squeue. 1062 * 1063 * Returns B_TRUE if the notification succeeded, in which case `tcp' will 1064 * be moved over to the ESTABLISHED list (q) of the listener. Othwerise, 1065 * B_FALSE is returned and `tcp' is killed. 1066 */ 1067 boolean_t 1068 tcp_newconn_notify(tcp_t *tcp, ip_recv_attr_t *ira) 1069 { 1070 tcp_t *listener = tcp->tcp_listener; 1071 conn_t *lconnp = listener->tcp_connp; 1072 conn_t *econnp = tcp->tcp_connp; 1073 tcp_t *tail; 1074 ipaddr_t *addr_cache; 1075 sock_upper_handle_t upper; 1076 struct sock_proto_props sopp; 1077 mblk_t *mp; 1078 1079 mutex_enter(&listener->tcp_eager_lock); 1080 /* 1081 * Take the eager out, if it is in the list of droppable eagers 1082 * as we are here because the 3W handshake is over. 1083 */ 1084 MAKE_UNDROPPABLE(tcp); 1085 /* 1086 * The eager already has an extra ref put in tcp_input_data 1087 * so that it stays till accept comes back even though it 1088 * might get into TCPS_CLOSED as a result of a TH_RST etc. 
1089 */ 1090 ASSERT(listener->tcp_conn_req_cnt_q0 > 0); 1091 listener->tcp_conn_req_cnt_q0--; 1092 listener->tcp_conn_req_cnt_q++; 1093 1094 /* Move from SYN_RCVD to ESTABLISHED list */ 1095 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = tcp->tcp_eager_prev_q0; 1096 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp->tcp_eager_next_q0; 1097 tcp->tcp_eager_prev_q0 = NULL; 1098 tcp->tcp_eager_next_q0 = NULL; 1099 1100 /* 1101 * Insert at end of the queue because connections are accepted 1102 * in chronological order. Leaving the older connections at front 1103 * of the queue helps reducing search time. 1104 */ 1105 tail = listener->tcp_eager_last_q; 1106 if (tail != NULL) 1107 tail->tcp_eager_next_q = tcp; 1108 else 1109 listener->tcp_eager_next_q = tcp; 1110 listener->tcp_eager_last_q = tcp; 1111 tcp->tcp_eager_next_q = NULL; 1112 1113 /* we have timed out before */ 1114 if (tcp->tcp_syn_rcvd_timeout != 0) { 1115 tcp->tcp_syn_rcvd_timeout = 0; 1116 listener->tcp_syn_rcvd_timeout--; 1117 if (listener->tcp_syn_defense && 1118 listener->tcp_syn_rcvd_timeout <= 1119 (listener->tcp_tcps->tcps_conn_req_max_q0 >> 5) && 1120 10*MINUTES < TICK_TO_MSEC(ddi_get_lbolt64() - 1121 listener->tcp_last_rcv_lbolt)) { 1122 /* 1123 * Turn off the defense mode if we 1124 * believe the SYN attack is over. 1125 */ 1126 listener->tcp_syn_defense = B_FALSE; 1127 if (listener->tcp_ip_addr_cache) { 1128 kmem_free((void *)listener->tcp_ip_addr_cache, 1129 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t)); 1130 listener->tcp_ip_addr_cache = NULL; 1131 } 1132 } 1133 } 1134 addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache); 1135 if (addr_cache != NULL) { 1136 /* 1137 * We have finished a 3-way handshake with this 1138 * remote host. This proves the IP addr is good. 1139 * Cache it! 1140 */ 1141 addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] = 1142 tcp->tcp_connp->conn_faddr_v4; 1143 } 1144 mutex_exit(&listener->tcp_eager_lock); 1145 1146 /* 1147 * Notify the ULP about the newconn. 
It is guaranteed that no 1148 * tcp_accept() call will be made for the eager if the 1149 * notification fails. 1150 */ 1151 if ((upper = (*lconnp->conn_upcalls->su_newconn) 1152 (lconnp->conn_upper_handle, (sock_lower_handle_t)econnp, 1153 &sock_tcp_downcalls, ira->ira_cred, ira->ira_cpid, 1154 &econnp->conn_upcalls)) == NULL) { 1155 /* 1156 * Normally this should not happen, but the listener might 1157 * have done a fallback to TPI followed by a close(), in 1158 * which case tcp_closemp for this conn might have been 1159 * used by tcp_eager_cleanup(). 1160 */ 1161 mutex_enter(&listener->tcp_eager_lock); 1162 if (tcp->tcp_closemp_used) { 1163 mutex_exit(&listener->tcp_eager_lock); 1164 return (B_FALSE); 1165 } 1166 tcp->tcp_closemp_used = B_TRUE; 1167 TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15); 1168 mp = &tcp->tcp_closemp; 1169 mutex_exit(&listener->tcp_eager_lock); 1170 tcp_eager_kill(econnp, mp, NULL, NULL); 1171 return (B_FALSE); 1172 } 1173 econnp->conn_upper_handle = upper; 1174 1175 tcp->tcp_detached = B_FALSE; 1176 tcp->tcp_hard_binding = B_FALSE; 1177 tcp->tcp_tconnind_started = B_TRUE; 1178 1179 if (econnp->conn_keepalive) { 1180 tcp->tcp_ka_last_intrvl = 0; 1181 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer, 1182 tcp->tcp_ka_interval); 1183 } 1184 1185 /* Update the necessary parameters */ 1186 tcp_get_proto_props(tcp, &sopp); 1187 1188 (*econnp->conn_upcalls->su_set_proto_props) 1189 (econnp->conn_upper_handle, &sopp); 1190 1191 return (B_TRUE); 1192 } 1193