1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 26 /* This file contains all TCP kernel socket related functions. 
*/ 27 28 #include <sys/types.h> 29 #include <sys/strlog.h> 30 #include <sys/policy.h> 31 #include <sys/sockio.h> 32 #include <sys/strsubr.h> 33 #include <sys/strsun.h> 34 #include <sys/squeue_impl.h> 35 #include <sys/squeue.h> 36 #define _SUN_TPI_VERSION 2 37 #include <sys/tihdr.h> 38 #include <sys/timod.h> 39 #include <sys/tpicommon.h> 40 #include <sys/socketvar.h> 41 42 #include <inet/common.h> 43 #include <inet/proto_set.h> 44 #include <inet/ip.h> 45 #include <inet/tcp.h> 46 #include <inet/tcp_impl.h> 47 48 static void tcp_activate(sock_lower_handle_t, sock_upper_handle_t, 49 sock_upcalls_t *, int, cred_t *); 50 static int tcp_accept(sock_lower_handle_t, sock_lower_handle_t, 51 sock_upper_handle_t, cred_t *); 52 static int tcp_bind(sock_lower_handle_t, struct sockaddr *, 53 socklen_t, cred_t *); 54 static int tcp_listen(sock_lower_handle_t, int, cred_t *); 55 static int tcp_connect(sock_lower_handle_t, const struct sockaddr *, 56 socklen_t, sock_connid_t *, cred_t *); 57 static int tcp_getsockopt(sock_lower_handle_t, int, int, void *, 58 socklen_t *, cred_t *); 59 static int tcp_setsockopt(sock_lower_handle_t, int, int, const void *, 60 socklen_t, cred_t *); 61 static int tcp_sendmsg(sock_lower_handle_t, mblk_t *, struct nmsghdr *, 62 cred_t *cr); 63 static int tcp_shutdown(sock_lower_handle_t, int, cred_t *); 64 static void tcp_clr_flowctrl(sock_lower_handle_t); 65 static int tcp_ioctl(sock_lower_handle_t, int, intptr_t, int, int32_t *, 66 cred_t *); 67 static int tcp_close(sock_lower_handle_t, int, cred_t *); 68 69 sock_downcalls_t sock_tcp_downcalls = { 70 tcp_activate, 71 tcp_accept, 72 tcp_bind, 73 tcp_listen, 74 tcp_connect, 75 tcp_getpeername, 76 tcp_getsockname, 77 tcp_getsockopt, 78 tcp_setsockopt, 79 tcp_sendmsg, 80 NULL, 81 NULL, 82 NULL, 83 tcp_shutdown, 84 tcp_clr_flowctrl, 85 tcp_ioctl, 86 tcp_close, 87 }; 88 89 /* ARGSUSED */ 90 static void 91 tcp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle, 92 sock_upcalls_t 
*sock_upcalls, int flags, cred_t *cr) 93 { 94 conn_t *connp = (conn_t *)proto_handle; 95 struct sock_proto_props sopp; 96 extern struct module_info tcp_rinfo; 97 98 ASSERT(connp->conn_upper_handle == NULL); 99 100 /* All Solaris components should pass a cred for this operation. */ 101 ASSERT(cr != NULL); 102 103 sopp.sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT | 104 SOCKOPT_MAXPSZ | SOCKOPT_MAXBLK | SOCKOPT_RCVTIMER | 105 SOCKOPT_RCVTHRESH | SOCKOPT_MAXADDRLEN | SOCKOPT_MINPSZ; 106 107 sopp.sopp_rxhiwat = SOCKET_RECVHIWATER; 108 sopp.sopp_rxlowat = SOCKET_RECVLOWATER; 109 sopp.sopp_maxpsz = INFPSZ; 110 sopp.sopp_maxblk = INFPSZ; 111 sopp.sopp_rcvtimer = SOCKET_TIMER_INTERVAL; 112 sopp.sopp_rcvthresh = SOCKET_RECVHIWATER >> 3; 113 sopp.sopp_maxaddrlen = sizeof (sin6_t); 114 sopp.sopp_minpsz = (tcp_rinfo.mi_minpsz == 1) ? 0 : 115 tcp_rinfo.mi_minpsz; 116 117 connp->conn_upcalls = sock_upcalls; 118 connp->conn_upper_handle = sock_handle; 119 120 ASSERT(connp->conn_rcvbuf != 0 && 121 connp->conn_rcvbuf == connp->conn_tcp->tcp_rwnd); 122 (*sock_upcalls->su_set_proto_props)(sock_handle, &sopp); 123 } 124 125 /*ARGSUSED*/ 126 static int 127 tcp_accept(sock_lower_handle_t lproto_handle, 128 sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle, 129 cred_t *cr) 130 { 131 conn_t *lconnp, *econnp; 132 tcp_t *listener, *eager; 133 134 /* 135 * KSSL can move a socket from one listener to another, in which 136 * case `lproto_handle' points to the new listener. To ensure that 137 * the original listener is used the information is obtained from 138 * the eager. 
139 */ 140 econnp = (conn_t *)eproto_handle; 141 eager = econnp->conn_tcp; 142 ASSERT(IPCL_IS_NONSTR(econnp)); 143 ASSERT(eager->tcp_listener != NULL); 144 listener = eager->tcp_listener; 145 lconnp = (conn_t *)listener->tcp_connp; 146 ASSERT(listener->tcp_state == TCPS_LISTEN); 147 ASSERT(lconnp->conn_upper_handle != NULL); 148 149 /* 150 * It is possible for the accept thread to race with the thread that 151 * made the su_newconn upcall in tcp_newconn_notify. Both 152 * tcp_newconn_notify and tcp_accept require that conn_upper_handle 153 * and conn_upcalls be set before returning, so they both write to 154 * them. However, we're guaranteed that the value written is the same 155 * for both threads. 156 */ 157 ASSERT(econnp->conn_upper_handle == NULL || 158 econnp->conn_upper_handle == sock_handle); 159 ASSERT(econnp->conn_upcalls == NULL || 160 econnp->conn_upcalls == lconnp->conn_upcalls); 161 econnp->conn_upper_handle = sock_handle; 162 econnp->conn_upcalls = lconnp->conn_upcalls; 163 164 ASSERT(econnp->conn_netstack == 165 listener->tcp_connp->conn_netstack); 166 ASSERT(eager->tcp_tcps == listener->tcp_tcps); 167 168 /* 169 * We should have a minimum of 2 references on the conn at this 170 * point. One for TCP and one for the newconn notification 171 * (which is now taken over by IP). In the normal case we would 172 * also have another reference (making a total of 3) for the conn 173 * being in the classifier hash list. However the eager could have 174 * received an RST subsequently and tcp_closei_local could have 175 * removed the eager from the classifier hash list, hence we can't 176 * assert that reference. 177 */ 178 ASSERT(econnp->conn_ref >= 2); 179 180 mutex_enter(&listener->tcp_eager_lock); 181 /* 182 * Non-STREAMS listeners never defer the notification of new 183 * connections. 
184 */ 185 ASSERT(!listener->tcp_eager_prev_q0->tcp_conn_def_q0); 186 tcp_eager_unlink(eager); 187 mutex_exit(&listener->tcp_eager_lock); 188 CONN_DEC_REF(listener->tcp_connp); 189 190 return ((eager->tcp_state < TCPS_ESTABLISHED) ? ECONNABORTED : 0); 191 } 192 193 static int 194 tcp_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa, 195 socklen_t len, cred_t *cr) 196 { 197 int error; 198 conn_t *connp = (conn_t *)proto_handle; 199 200 /* All Solaris components should pass a cred for this operation. */ 201 ASSERT(cr != NULL); 202 ASSERT(connp->conn_upper_handle != NULL); 203 204 error = squeue_synch_enter(connp, NULL); 205 if (error != 0) { 206 /* failed to enter */ 207 return (ENOSR); 208 } 209 210 /* binding to a NULL address really means unbind */ 211 if (sa == NULL) { 212 if (connp->conn_tcp->tcp_state < TCPS_LISTEN) 213 error = tcp_do_unbind(connp); 214 else 215 error = EINVAL; 216 } else { 217 error = tcp_do_bind(connp, sa, len, cr, B_TRUE); 218 } 219 220 squeue_synch_exit(connp); 221 222 if (error < 0) { 223 if (error == -TOUTSTATE) 224 error = EINVAL; 225 else 226 error = proto_tlitosyserr(-error); 227 } 228 229 return (error); 230 } 231 232 /* ARGSUSED */ 233 static int 234 tcp_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr) 235 { 236 conn_t *connp = (conn_t *)proto_handle; 237 tcp_t *tcp = connp->conn_tcp; 238 int error; 239 240 ASSERT(connp->conn_upper_handle != NULL); 241 242 /* All Solaris components should pass a cred for this operation. */ 243 ASSERT(cr != NULL); 244 245 error = squeue_synch_enter(connp, NULL); 246 if (error != 0) { 247 /* failed to enter */ 248 return (ENOBUFS); 249 } 250 251 error = tcp_do_listen(connp, NULL, 0, backlog, cr, B_FALSE); 252 if (error == 0) { 253 /* 254 * sockfs needs to know what's the maximum number of socket 255 * that can be queued on the listener. 
256 */ 257 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 258 SOCK_OPCTL_ENAB_ACCEPT, 259 (uintptr_t)(tcp->tcp_conn_req_max + 260 tcp->tcp_tcps->tcps_conn_req_max_q0)); 261 } else if (error < 0) { 262 if (error == -TOUTSTATE) 263 error = EINVAL; 264 else 265 error = proto_tlitosyserr(-error); 266 } 267 squeue_synch_exit(connp); 268 return (error); 269 } 270 271 static int 272 tcp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, 273 socklen_t len, sock_connid_t *id, cred_t *cr) 274 { 275 conn_t *connp = (conn_t *)proto_handle; 276 int error; 277 278 ASSERT(connp->conn_upper_handle != NULL); 279 280 /* All Solaris components should pass a cred for this operation. */ 281 ASSERT(cr != NULL); 282 283 error = proto_verify_ip_addr(connp->conn_family, sa, len); 284 if (error != 0) { 285 return (error); 286 } 287 288 error = squeue_synch_enter(connp, NULL); 289 if (error != 0) { 290 /* failed to enter */ 291 return (ENOSR); 292 } 293 294 /* 295 * TCP supports quick connect, so no need to do an implicit bind 296 */ 297 error = tcp_do_connect(connp, sa, len, cr, curproc->p_pid); 298 if (error == 0) { 299 *id = connp->conn_tcp->tcp_connid; 300 } else if (error < 0) { 301 if (error == -TOUTSTATE) { 302 switch (connp->conn_tcp->tcp_state) { 303 case TCPS_SYN_SENT: 304 error = EALREADY; 305 break; 306 case TCPS_ESTABLISHED: 307 error = EISCONN; 308 break; 309 case TCPS_LISTEN: 310 error = EOPNOTSUPP; 311 break; 312 default: 313 error = EINVAL; 314 break; 315 } 316 } else { 317 error = proto_tlitosyserr(-error); 318 } 319 } 320 321 if (connp->conn_tcp->tcp_loopback) { 322 struct sock_proto_props sopp; 323 324 sopp.sopp_flags = SOCKOPT_LOOPBACK; 325 sopp.sopp_loopback = B_TRUE; 326 327 (*connp->conn_upcalls->su_set_proto_props)( 328 connp->conn_upper_handle, &sopp); 329 } 330 done: 331 squeue_synch_exit(connp); 332 333 return ((error == 0) ? 
EINPROGRESS : error); 334 } 335 336 /* ARGSUSED3 */ 337 int 338 tcp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr, 339 socklen_t *addrlenp, cred_t *cr) 340 { 341 conn_t *connp = (conn_t *)proto_handle; 342 tcp_t *tcp = connp->conn_tcp; 343 344 /* All Solaris components should pass a cred for this operation. */ 345 ASSERT(cr != NULL); 346 347 ASSERT(tcp != NULL); 348 if (tcp->tcp_state < TCPS_SYN_RCVD) 349 return (ENOTCONN); 350 351 return (conn_getpeername(connp, addr, addrlenp)); 352 } 353 354 /* ARGSUSED3 */ 355 int 356 tcp_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *addr, 357 socklen_t *addrlenp, cred_t *cr) 358 { 359 conn_t *connp = (conn_t *)proto_handle; 360 361 /* All Solaris components should pass a cred for this operation. */ 362 ASSERT(cr != NULL); 363 364 return (conn_getsockname(connp, addr, addrlenp)); 365 } 366 367 /* returns UNIX error, the optlen is a value-result arg */ 368 static int 369 tcp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, 370 void *optvalp, socklen_t *optlen, cred_t *cr) 371 { 372 conn_t *connp = (conn_t *)proto_handle; 373 int error; 374 t_uscalar_t max_optbuf_len; 375 void *optvalp_buf; 376 int len; 377 378 ASSERT(connp->conn_upper_handle != NULL); 379 380 error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len, 381 tcp_opt_obj.odb_opt_des_arr, 382 tcp_opt_obj.odb_opt_arr_cnt, 383 B_FALSE, B_TRUE, cr); 384 if (error != 0) { 385 if (error < 0) { 386 error = proto_tlitosyserr(-error); 387 } 388 return (error); 389 } 390 391 optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP); 392 393 error = squeue_synch_enter(connp, NULL); 394 if (error == ENOMEM) { 395 kmem_free(optvalp_buf, max_optbuf_len); 396 return (ENOMEM); 397 } 398 399 len = tcp_opt_get(connp, level, option_name, optvalp_buf); 400 squeue_synch_exit(connp); 401 402 if (len == -1) { 403 kmem_free(optvalp_buf, max_optbuf_len); 404 return (EINVAL); 405 } 406 407 /* 408 * update optlen and copy 
option value 409 */ 410 t_uscalar_t size = MIN(len, *optlen); 411 412 bcopy(optvalp_buf, optvalp, size); 413 bcopy(&size, optlen, sizeof (size)); 414 415 kmem_free(optvalp_buf, max_optbuf_len); 416 return (0); 417 } 418 419 static int 420 tcp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, 421 const void *optvalp, socklen_t optlen, cred_t *cr) 422 { 423 conn_t *connp = (conn_t *)proto_handle; 424 int error; 425 426 ASSERT(connp->conn_upper_handle != NULL); 427 /* 428 * Entering the squeue synchronously can result in a context switch, 429 * which can cause a rather sever performance degradation. So we try to 430 * handle whatever options we can without entering the squeue. 431 */ 432 if (level == IPPROTO_TCP) { 433 switch (option_name) { 434 case TCP_NODELAY: 435 if (optlen != sizeof (int32_t)) 436 return (EINVAL); 437 mutex_enter(&connp->conn_tcp->tcp_non_sq_lock); 438 connp->conn_tcp->tcp_naglim = *(int *)optvalp ? 1 : 439 connp->conn_tcp->tcp_mss; 440 mutex_exit(&connp->conn_tcp->tcp_non_sq_lock); 441 return (0); 442 default: 443 break; 444 } 445 } 446 447 error = squeue_synch_enter(connp, NULL); 448 if (error == ENOMEM) { 449 return (ENOMEM); 450 } 451 452 error = proto_opt_check(level, option_name, optlen, NULL, 453 tcp_opt_obj.odb_opt_des_arr, 454 tcp_opt_obj.odb_opt_arr_cnt, 455 B_TRUE, B_FALSE, cr); 456 457 if (error != 0) { 458 if (error < 0) { 459 error = proto_tlitosyserr(-error); 460 } 461 squeue_synch_exit(connp); 462 return (error); 463 } 464 465 error = tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name, 466 optlen, (uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp, 467 NULL, cr); 468 squeue_synch_exit(connp); 469 470 ASSERT(error >= 0); 471 472 return (error); 473 } 474 475 /* ARGSUSED */ 476 static int 477 tcp_sendmsg(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, 478 cred_t *cr) 479 { 480 tcp_t *tcp; 481 uint32_t msize; 482 conn_t *connp = (conn_t *)proto_handle; 483 int32_t tcpstate; 
484 485 /* All Solaris components should pass a cred for this operation. */ 486 ASSERT(cr != NULL); 487 488 ASSERT(connp->conn_ref >= 2); 489 ASSERT(connp->conn_upper_handle != NULL); 490 491 if (msg->msg_controllen != 0) { 492 freemsg(mp); 493 return (EOPNOTSUPP); 494 } 495 496 switch (DB_TYPE(mp)) { 497 case M_DATA: 498 tcp = connp->conn_tcp; 499 ASSERT(tcp != NULL); 500 501 tcpstate = tcp->tcp_state; 502 if (tcpstate < TCPS_ESTABLISHED) { 503 freemsg(mp); 504 /* 505 * We return ENOTCONN if the endpoint is trying to 506 * connect or has never been connected, and EPIPE if it 507 * has been disconnected. The connection id helps us 508 * distinguish between the last two cases. 509 */ 510 return ((tcpstate == TCPS_SYN_SENT) ? ENOTCONN : 511 ((tcp->tcp_connid > 0) ? EPIPE : ENOTCONN)); 512 } else if (tcpstate > TCPS_CLOSE_WAIT) { 513 freemsg(mp); 514 return (EPIPE); 515 } 516 517 msize = msgdsize(mp); 518 519 mutex_enter(&tcp->tcp_non_sq_lock); 520 tcp->tcp_squeue_bytes += msize; 521 /* 522 * Squeue Flow Control 523 */ 524 if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) { 525 tcp_setqfull(tcp); 526 } 527 mutex_exit(&tcp->tcp_non_sq_lock); 528 529 /* 530 * The application may pass in an address in the msghdr, but 531 * we ignore the address on connection-oriented sockets. 532 * Just like BSD this code does not generate an error for 533 * TCP (a CONNREQUIRED socket) when sending to an address 534 * passed in with sendto/sendmsg. Instead the data is 535 * delivered on the connection as if no address had been 536 * supplied. 
537 */ 538 CONN_INC_REF(connp); 539 540 if (msg->msg_flags & MSG_OOB) { 541 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output_urgent, 542 connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT); 543 } else { 544 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output, 545 connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT); 546 } 547 548 return (0); 549 550 default: 551 ASSERT(0); 552 } 553 554 freemsg(mp); 555 return (0); 556 } 557 558 /* ARGSUSED */ 559 static int 560 tcp_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr) 561 { 562 conn_t *connp = (conn_t *)proto_handle; 563 tcp_t *tcp = connp->conn_tcp; 564 565 ASSERT(connp->conn_upper_handle != NULL); 566 567 /* All Solaris components should pass a cred for this operation. */ 568 ASSERT(cr != NULL); 569 570 /* 571 * X/Open requires that we check the connected state. 572 */ 573 if (tcp->tcp_state < TCPS_SYN_SENT) 574 return (ENOTCONN); 575 576 /* shutdown the send side */ 577 if (how != SHUT_RD) { 578 mblk_t *bp; 579 580 bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL); 581 CONN_INC_REF(connp); 582 SQUEUE_ENTER_ONE(connp->conn_sqp, bp, tcp_shutdown_output, 583 connp, NULL, SQ_NODRAIN, SQTAG_TCP_SHUTDOWN_OUTPUT); 584 585 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 586 SOCK_OPCTL_SHUT_SEND, 0); 587 } 588 589 /* shutdown the recv side */ 590 if (how != SHUT_WR) 591 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 592 SOCK_OPCTL_SHUT_RECV, 0); 593 594 return (0); 595 } 596 597 static void 598 tcp_clr_flowctrl(sock_lower_handle_t proto_handle) 599 { 600 conn_t *connp = (conn_t *)proto_handle; 601 tcp_t *tcp = connp->conn_tcp; 602 mblk_t *mp; 603 int error; 604 605 ASSERT(connp->conn_upper_handle != NULL); 606 607 /* 608 * If tcp->tcp_rsrv_mp == NULL, it means that tcp_clr_flowctrl() 609 * is currently running. 
610 */ 611 mutex_enter(&tcp->tcp_rsrv_mp_lock); 612 if ((mp = tcp->tcp_rsrv_mp) == NULL) { 613 mutex_exit(&tcp->tcp_rsrv_mp_lock); 614 return; 615 } 616 tcp->tcp_rsrv_mp = NULL; 617 mutex_exit(&tcp->tcp_rsrv_mp_lock); 618 619 error = squeue_synch_enter(connp, mp); 620 ASSERT(error == 0); 621 622 mutex_enter(&tcp->tcp_rsrv_mp_lock); 623 tcp->tcp_rsrv_mp = mp; 624 mutex_exit(&tcp->tcp_rsrv_mp_lock); 625 626 if (tcp->tcp_fused) { 627 tcp_fuse_backenable(tcp); 628 } else { 629 tcp->tcp_rwnd = connp->conn_rcvbuf; 630 /* 631 * Send back a window update immediately if TCP is above 632 * ESTABLISHED state and the increase of the rcv window 633 * that the other side knows is at least 1 MSS after flow 634 * control is lifted. 635 */ 636 if (tcp->tcp_state >= TCPS_ESTABLISHED && 637 tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) { 638 tcp_xmit_ctl(NULL, tcp, 639 (tcp->tcp_swnd == 0) ? tcp->tcp_suna : 640 tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK); 641 } 642 } 643 644 squeue_synch_exit(connp); 645 } 646 647 /* ARGSUSED */ 648 static int 649 tcp_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg, 650 int mode, int32_t *rvalp, cred_t *cr) 651 { 652 conn_t *connp = (conn_t *)proto_handle; 653 int error; 654 655 ASSERT(connp->conn_upper_handle != NULL); 656 657 /* All Solaris components should pass a cred for this operation. */ 658 ASSERT(cr != NULL); 659 660 /* 661 * If we don't have a helper stream then create one. 662 * ip_create_helper_stream takes care of locking the conn_t, 663 * so this check for NULL is just a performance optimization. 664 */ 665 if (connp->conn_helper_info == NULL) { 666 tcp_stack_t *tcps = connp->conn_tcp->tcp_tcps; 667 668 /* 669 * Create a helper stream for non-STREAMS socket. 
670 */ 671 error = ip_create_helper_stream(connp, tcps->tcps_ldi_ident); 672 if (error != 0) { 673 ip0dbg(("tcp_ioctl: create of IP helper stream " 674 "failed %d\n", error)); 675 return (error); 676 } 677 } 678 679 switch (cmd) { 680 case ND_SET: 681 case ND_GET: 682 case _SIOCSOCKFALLBACK: 683 case TCP_IOC_ABORT_CONN: 684 case TI_GETPEERNAME: 685 case TI_GETMYNAME: 686 ip1dbg(("tcp_ioctl: cmd 0x%x on non streams socket", 687 cmd)); 688 error = EINVAL; 689 break; 690 default: 691 /* 692 * If the conn is not closing, pass on to IP using 693 * helper stream. Bump the ioctlref to prevent tcp_close 694 * from closing the rq/wq out from underneath the ioctl 695 * if it ends up queued or aborted/interrupted. 696 */ 697 mutex_enter(&connp->conn_lock); 698 if (connp->conn_state_flags & (CONN_CLOSING)) { 699 mutex_exit(&connp->conn_lock); 700 error = EINVAL; 701 break; 702 } 703 CONN_INC_IOCTLREF_LOCKED(connp); 704 error = ldi_ioctl(connp->conn_helper_info->iphs_handle, 705 cmd, arg, mode, cr, rvalp); 706 CONN_DEC_IOCTLREF(connp); 707 break; 708 } 709 return (error); 710 } 711 712 /* ARGSUSED */ 713 static int 714 tcp_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr) 715 { 716 conn_t *connp = (conn_t *)proto_handle; 717 718 ASSERT(connp->conn_upper_handle != NULL); 719 720 /* All Solaris components should pass a cred for this operation. */ 721 ASSERT(cr != NULL); 722 723 tcp_close_common(connp, flags); 724 725 ip_free_helper_stream(connp); 726 727 /* 728 * Drop IP's reference on the conn. This is the last reference 729 * on the connp if the state was less than established. If the 730 * connection has gone into timewait state, then we will have 731 * one ref for the TCP and one more ref (total of two) for the 732 * classifier connected hash list (a timewait connections stays 733 * in connected hash till closed). 
734 * 735 * We can't assert the references because there might be other 736 * transient reference places because of some walkers or queued 737 * packets in squeue for the timewait state. 738 */ 739 CONN_DEC_REF(connp); 740 741 /* 742 * EINPROGRESS tells sockfs to wait for a 'closed' upcall before 743 * freeing the socket. 744 */ 745 return (EINPROGRESS); 746 } 747 748 /* ARGSUSED */ 749 sock_lower_handle_t 750 tcp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, 751 uint_t *smodep, int *errorp, int flags, cred_t *credp) 752 { 753 conn_t *connp; 754 boolean_t isv6 = family == AF_INET6; 755 if (type != SOCK_STREAM || (family != AF_INET && family != AF_INET6) || 756 (proto != 0 && proto != IPPROTO_TCP)) { 757 *errorp = EPROTONOSUPPORT; 758 return (NULL); 759 } 760 761 connp = tcp_create_common(credp, isv6, B_TRUE, errorp); 762 if (connp == NULL) { 763 return (NULL); 764 } 765 766 /* 767 * Put the ref for TCP. Ref for IP was already put 768 * by ipcl_conn_create. Also Make the conn_t globally 769 * visible to walkers 770 */ 771 mutex_enter(&connp->conn_lock); 772 CONN_INC_REF_LOCKED(connp); 773 ASSERT(connp->conn_ref == 2); 774 connp->conn_state_flags &= ~CONN_INCIPIENT; 775 776 connp->conn_flags |= IPCL_NONSTR; 777 mutex_exit(&connp->conn_lock); 778 779 ASSERT(errorp != NULL); 780 *errorp = 0; 781 *sock_downcalls = &sock_tcp_downcalls; 782 *smodep = SM_CONNREQUIRED | SM_EXDATA | SM_ACCEPTSUPP | 783 SM_SENDFILESUPP; 784 785 return ((sock_lower_handle_t)connp); 786 } 787 788 /* 789 * tcp_fallback 790 * 791 * A direct socket is falling back to using STREAMS. The queue 792 * that is being passed down was created using tcp_open() with 793 * the SO_FALLBACK flag set. As a result, the queue is not 794 * associated with a conn, and the q_ptrs instead contain the 795 * dev and minor area that should be used. 796 * 797 * The 'issocket' flag indicates whether the FireEngine 798 * optimizations should be used. 
The common case would be that
 * optimizations are enabled, and they might be subsequently
 * disabled using the _SIOCSOCKFALLBACK ioctl.
 */

/*
 * Fallback of an active (non-eager) connection to TPI.  The conn is wired
 * to the supplied queue pair, the stream head is told about the stream
 * options, and the capability/address/option state the TPI sonode needs is
 * collected and passed to `quiesced_cb'.  Any messages the callback returns
 * are then sent up the queue one at a time.
 */
static void
tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q,
    boolean_t issocket, so_proto_quiesced_cb_t quiesced_cb,
    sock_quiesce_arg_t *arg)
{
	conn_t			*connp = tcp->tcp_connp;
	queue_t			*rq = RD(q);
	queue_t			*wq = WR(q);
	struct stroptions	*sopts;
	struct T_capability_ack	cap_ack;
	struct sockaddr_in6	local, peer;
	socklen_t		locallen, peerlen;
	short			sockopts;
	mblk_t			*cur, *following;

	/* The q_ptrs still carry the dev and minor arena; save them first. */
	connp->conn_dev = (dev_t)rq->q_ptr;
	connp->conn_minor_arena = wq->q_ptr;

	/* Then point both halves of the queue pair at the conn. */
	wq->q_ptr = connp;
	rq->q_ptr = connp;

	connp->conn_rq = rq;
	connp->conn_wq = wq;

	wq->q_qinfo = &tcp_sock_winit;

	if (!issocket)
		tcp_use_pure_tpi(tcp);

	/* The helper stream is no longer needed. */
	ip_free_helper_stream(connp);

	/*
	 * Build the M_SETOPTS message for the STREAM head: write offset,
	 * high water mark and maximum block size.
	 */
	DB_TYPE(stropt_mp) = M_SETOPTS;
	sopts = (struct stroptions *)stropt_mp->b_rptr;
	stropt_mp->b_wptr += sizeof (struct stroptions);
	sopts->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK;

	/* Extra write offset is only added for non-loopback connections. */
	sopts->so_wroff = connp->conn_ht_iphc_len;
	if (!tcp->tcp_loopback)
		sopts->so_wroff += tcp->tcp_tcps->tcps_wroff_xtra;
	if (tcp->tcp_snd_sack_ok)
		sopts->so_wroff += TCPOPT_MAX_SACK_LEN;
	sopts->so_hiwat = connp->conn_rcvbuf;
	sopts->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE);

	putnext(rq, stropt_mp);

	/* Gather what the sonode needs in order to sync up with us. */
	tcp_do_capability_ack(tcp, &cap_ack, TC1_INFO|TC1_ACCEPTOR_ID);

	locallen = peerlen = sizeof (sin6_t);
	(void) tcp_getsockname((sock_lower_handle_t)connp,
	    (struct sockaddr *)&local, &locallen, CRED());
	/* No peer address available: report it with a zero length. */
	if (tcp_getpeername((sock_lower_handle_t)connp,
	    (struct sockaddr *)&peer, &peerlen, CRED()) != 0)
		peerlen = 0;

	sockopts = 0;
	if (connp->conn_oobinline)
		sockopts |= SO_OOBINLINE;
	if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
		sockopts |= SO_DONTROUTE;

	/*
	 * Tell the socket the protocol is quiescent, so it is safe to move
	 * data from the socket to the stream head.
	 */
	cur = (*quiesced_cb)(connp->conn_upper_handle, arg, &cap_ack,
	    (struct sockaddr *)&local, locallen,
	    (struct sockaddr *)&peer, peerlen, sockopts);

	/* Detach each returned message from the chain and send it up. */
	for (; cur != NULL; cur = following) {
		following = cur->b_next;
		tcp->tcp_rcv_list = following;
		cur->b_next = NULL;
		putnext(q, cur);
	}
	ASSERT(tcp->tcp_rcv_last_head == NULL);
	ASSERT(tcp->tcp_rcv_last_tail == NULL);
	ASSERT(tcp->tcp_rcv_cnt == 0);

	/*
	 * All eagers in q0 are marked as being non-STREAM, so they will
	 * make su_newconn upcalls when the handshake completes, which
	 * will fail (resulting in the conn being closed). So we just blow
	 * off everything in q0 instead of waiting for the inevitable.
	 */
	if (tcp->tcp_conn_req_cnt_q0 != 0)
		tcp_eager_cleanup(tcp, B_TRUE);
}

/*
 * An eager is falling back to TPI. All we have to do is send
 * up a T_CONN_IND.
908 */ 909 static void 910 tcp_fallback_eager(tcp_t *eager, boolean_t issocket, 911 so_proto_quiesced_cb_t quiesced_cb, sock_quiesce_arg_t *arg) 912 { 913 conn_t *connp = eager->tcp_connp; 914 tcp_t *listener = eager->tcp_listener; 915 mblk_t *mp; 916 917 ASSERT(listener != NULL); 918 919 /* 920 * Notify the socket that the protocol is now quiescent, 921 * and it's therefore safe move data from the socket 922 * to tcp's rcv queue. 923 */ 924 mp = (*quiesced_cb)(connp->conn_upper_handle, arg, NULL, NULL, 0, 925 NULL, 0, 0); 926 927 if (mp != NULL) { 928 ASSERT(eager->tcp_rcv_cnt == 0); 929 930 eager->tcp_rcv_list = mp; 931 eager->tcp_rcv_cnt = msgdsize(mp); 932 while (mp->b_next != NULL) { 933 mp = mp->b_next; 934 eager->tcp_rcv_cnt += msgdsize(mp); 935 } 936 eager->tcp_rcv_last_head = mp; 937 while (mp->b_cont) 938 mp = mp->b_cont; 939 eager->tcp_rcv_last_tail = mp; 940 if (eager->tcp_rcv_cnt > eager->tcp_rwnd) 941 eager->tcp_rwnd = 0; 942 else 943 eager->tcp_rwnd -= eager->tcp_rcv_cnt; 944 } 945 946 if (!issocket) 947 eager->tcp_issocket = B_FALSE; 948 /* 949 * The stream for this eager does not yet exist, so mark it as 950 * being detached. 951 */ 952 eager->tcp_detached = B_TRUE; 953 eager->tcp_hard_binding = B_TRUE; 954 connp->conn_rq = listener->tcp_connp->conn_rq; 955 connp->conn_wq = listener->tcp_connp->conn_wq; 956 957 /* Send up the connection indication */ 958 mp = eager->tcp_conn.tcp_eager_conn_ind; 959 ASSERT(mp != NULL); 960 eager->tcp_conn.tcp_eager_conn_ind = NULL; 961 962 /* 963 * TLI/XTI applications will get confused by 964 * sending eager as an option since it violates 965 * the option semantics. So remove the eager as 966 * option since TLI/XTI app doesn't need it anyway. 967 */ 968 if (!issocket) { 969 struct T_conn_ind *conn_ind; 970 971 conn_ind = (struct T_conn_ind *)mp->b_rptr; 972 conn_ind->OPT_length = 0; 973 conn_ind->OPT_offset = 0; 974 } 975 976 /* 977 * Sockfs guarantees that the listener will not be closed 978 * during fallback. 
So we can safely use the listener's queue. 979 */ 980 putnext(listener->tcp_connp->conn_rq, mp); 981 } 982 983 984 int 985 tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q, 986 boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb, 987 sock_quiesce_arg_t *arg) 988 { 989 tcp_t *tcp; 990 conn_t *connp = (conn_t *)proto_handle; 991 int error; 992 mblk_t *stropt_mp; 993 mblk_t *ordrel_mp; 994 995 tcp = connp->conn_tcp; 996 997 stropt_mp = allocb_wait(sizeof (struct stroptions), BPRI_HI, STR_NOSIG, 998 NULL); 999 1000 /* Pre-allocate the T_ordrel_ind mblk. */ 1001 ASSERT(tcp->tcp_ordrel_mp == NULL); 1002 ordrel_mp = allocb_wait(sizeof (struct T_ordrel_ind), BPRI_HI, 1003 STR_NOSIG, NULL); 1004 ordrel_mp->b_datap->db_type = M_PROTO; 1005 ((struct T_ordrel_ind *)ordrel_mp->b_rptr)->PRIM_type = T_ORDREL_IND; 1006 ordrel_mp->b_wptr += sizeof (struct T_ordrel_ind); 1007 1008 /* 1009 * Enter the squeue so that no new packets can come in 1010 */ 1011 error = squeue_synch_enter(connp, NULL); 1012 if (error != 0) { 1013 /* failed to enter, free all the pre-allocated messages. */ 1014 freeb(stropt_mp); 1015 freeb(ordrel_mp); 1016 return (ENOMEM); 1017 } 1018 1019 /* 1020 * Both endpoints must be of the same type (either STREAMS or 1021 * non-STREAMS) for fusion to be enabled. So if we are fused, 1022 * we have to unfuse. 1023 */ 1024 if (tcp->tcp_fused) 1025 tcp_unfuse(tcp); 1026 1027 if (tcp->tcp_listener != NULL) { 1028 /* The eager will deal with opts when accept() is called */ 1029 freeb(stropt_mp); 1030 tcp_fallback_eager(tcp, direct_sockfs, quiesced_cb, arg); 1031 } else { 1032 tcp_fallback_noneager(tcp, stropt_mp, q, direct_sockfs, 1033 quiesced_cb, arg); 1034 } 1035 1036 /* 1037 * No longer a direct socket 1038 * 1039 * Note that we intentionally leave the upper_handle and upcalls 1040 * intact, since eagers may still be using them. 
1041 */ 1042 connp->conn_flags &= ~IPCL_NONSTR; 1043 tcp->tcp_ordrel_mp = ordrel_mp; 1044 1045 /* 1046 * There should be atleast two ref's (IP + TCP) 1047 */ 1048 ASSERT(connp->conn_ref >= 2); 1049 squeue_synch_exit(connp); 1050 1051 return (0); 1052 } 1053 1054 /* 1055 * Notifies a non-STREAMS based listener about a new connection. This 1056 * function is executed on the *eager*'s squeue once the 3 way handshake 1057 * has completed. Note that the behavior differs from STREAMS, where the 1058 * T_CONN_IND is sent up by tcp_send_conn_ind while on the *listener*'s 1059 * squeue. 1060 * 1061 * Returns B_TRUE if the notification succeeded, in which case `tcp' will 1062 * be moved over to the ESTABLISHED list (q) of the listener. Othwerise, 1063 * B_FALSE is returned and `tcp' is killed. 1064 */ 1065 boolean_t 1066 tcp_newconn_notify(tcp_t *tcp, ip_recv_attr_t *ira) 1067 { 1068 tcp_t *listener = tcp->tcp_listener; 1069 conn_t *lconnp = listener->tcp_connp; 1070 conn_t *econnp = tcp->tcp_connp; 1071 tcp_t *tail; 1072 ipaddr_t *addr_cache; 1073 sock_upper_handle_t upper; 1074 struct sock_proto_props sopp; 1075 mblk_t *mp; 1076 1077 mutex_enter(&listener->tcp_eager_lock); 1078 /* 1079 * Take the eager out, if it is in the list of droppable eagers 1080 * as we are here because the 3W handshake is over. 1081 */ 1082 MAKE_UNDROPPABLE(tcp); 1083 /* 1084 * The eager already has an extra ref put in tcp_input_data 1085 * so that it stays till accept comes back even though it 1086 * might get into TCPS_CLOSED as a result of a TH_RST etc. 
1087 */ 1088 ASSERT(listener->tcp_conn_req_cnt_q0 > 0); 1089 listener->tcp_conn_req_cnt_q0--; 1090 listener->tcp_conn_req_cnt_q++; 1091 1092 /* Move from SYN_RCVD to ESTABLISHED list */ 1093 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = tcp->tcp_eager_prev_q0; 1094 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp->tcp_eager_next_q0; 1095 tcp->tcp_eager_prev_q0 = NULL; 1096 tcp->tcp_eager_next_q0 = NULL; 1097 1098 /* 1099 * Insert at end of the queue because connections are accepted 1100 * in chronological order. Leaving the older connections at front 1101 * of the queue helps reducing search time. 1102 */ 1103 tail = listener->tcp_eager_last_q; 1104 if (tail != NULL) 1105 tail->tcp_eager_next_q = tcp; 1106 else 1107 listener->tcp_eager_next_q = tcp; 1108 listener->tcp_eager_last_q = tcp; 1109 tcp->tcp_eager_next_q = NULL; 1110 1111 /* we have timed out before */ 1112 if (tcp->tcp_syn_rcvd_timeout != 0) { 1113 tcp->tcp_syn_rcvd_timeout = 0; 1114 listener->tcp_syn_rcvd_timeout--; 1115 if (listener->tcp_syn_defense && 1116 listener->tcp_syn_rcvd_timeout <= 1117 (listener->tcp_tcps->tcps_conn_req_max_q0 >> 5) && 1118 10*MINUTES < TICK_TO_MSEC(ddi_get_lbolt64() - 1119 listener->tcp_last_rcv_lbolt)) { 1120 /* 1121 * Turn off the defense mode if we 1122 * believe the SYN attack is over. 1123 */ 1124 listener->tcp_syn_defense = B_FALSE; 1125 if (listener->tcp_ip_addr_cache) { 1126 kmem_free((void *)listener->tcp_ip_addr_cache, 1127 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t)); 1128 listener->tcp_ip_addr_cache = NULL; 1129 } 1130 } 1131 } 1132 addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache); 1133 if (addr_cache != NULL) { 1134 /* 1135 * We have finished a 3-way handshake with this 1136 * remote host. This proves the IP addr is good. 1137 * Cache it! 1138 */ 1139 addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] = 1140 tcp->tcp_connp->conn_faddr_v4; 1141 } 1142 mutex_exit(&listener->tcp_eager_lock); 1143 1144 /* 1145 * Notify the ULP about the newconn. 
It is guaranteed that no 1146 * tcp_accept() call will be made for the eager if the 1147 * notification fails. 1148 */ 1149 if ((upper = (*lconnp->conn_upcalls->su_newconn) 1150 (lconnp->conn_upper_handle, (sock_lower_handle_t)econnp, 1151 &sock_tcp_downcalls, ira->ira_cred, ira->ira_cpid, 1152 &econnp->conn_upcalls)) == NULL) { 1153 /* 1154 * Normally this should not happen, but the listener might 1155 * have done a fallback to TPI followed by a close(), in 1156 * which case tcp_closemp for this conn might have been 1157 * used by tcp_eager_cleanup(). 1158 */ 1159 mutex_enter(&listener->tcp_eager_lock); 1160 if (tcp->tcp_closemp_used) { 1161 mutex_exit(&listener->tcp_eager_lock); 1162 return (B_FALSE); 1163 } 1164 tcp->tcp_closemp_used = B_TRUE; 1165 TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15); 1166 mp = &tcp->tcp_closemp; 1167 mutex_exit(&listener->tcp_eager_lock); 1168 tcp_eager_kill(econnp, mp, NULL, NULL); 1169 return (B_FALSE); 1170 } 1171 econnp->conn_upper_handle = upper; 1172 1173 tcp->tcp_detached = B_FALSE; 1174 tcp->tcp_hard_binding = B_FALSE; 1175 tcp->tcp_tconnind_started = B_TRUE; 1176 1177 if (econnp->conn_keepalive) { 1178 tcp->tcp_ka_last_intrvl = 0; 1179 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer, 1180 tcp->tcp_ka_interval); 1181 } 1182 1183 /* Update the necessary parameters */ 1184 tcp_get_proto_props(tcp, &sopp); 1185 1186 (*econnp->conn_upcalls->su_set_proto_props) 1187 (econnp->conn_upper_handle, &sopp); 1188 1189 return (B_TRUE); 1190 } 1191