1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright 2017 Joyent, Inc. 25 */ 26 27 /* This file contains all TCP kernel socket related functions. */ 28 29 #include <sys/types.h> 30 #include <sys/strlog.h> 31 #include <sys/policy.h> 32 #include <sys/sockio.h> 33 #include <sys/strsubr.h> 34 #include <sys/strsun.h> 35 #include <sys/squeue_impl.h> 36 #include <sys/squeue.h> 37 #define _SUN_TPI_VERSION 2 38 #include <sys/tihdr.h> 39 #include <sys/timod.h> 40 #include <sys/tpicommon.h> 41 #include <sys/socketvar.h> 42 43 #include <inet/common.h> 44 #include <inet/proto_set.h> 45 #include <inet/ip.h> 46 #include <inet/tcp.h> 47 #include <inet/tcp_impl.h> 48 49 static void tcp_activate(sock_lower_handle_t, sock_upper_handle_t, 50 sock_upcalls_t *, int, cred_t *); 51 static int tcp_accept(sock_lower_handle_t, sock_lower_handle_t, 52 sock_upper_handle_t, cred_t *); 53 static int tcp_bind(sock_lower_handle_t, struct sockaddr *, 54 socklen_t, cred_t *); 55 static int tcp_listen(sock_lower_handle_t, int, cred_t *); 56 static int tcp_connect(sock_lower_handle_t, const struct sockaddr *, 57 socklen_t, sock_connid_t *, cred_t *); 58 static int tcp_getpeername(sock_lower_handle_t, struct sockaddr *, 59 socklen_t *, cred_t *); 60 static int tcp_getsockname(sock_lower_handle_t, struct sockaddr *, 61 socklen_t *, cred_t *); 62 static int tcp_getsockopt(sock_lower_handle_t, int, int, void *, 63 socklen_t *, cred_t *); 64 static int tcp_setsockopt(sock_lower_handle_t, int, int, const void *, 65 socklen_t, cred_t *); 66 static int tcp_sendmsg(sock_lower_handle_t, mblk_t *, struct nmsghdr *, 67 cred_t *); 68 static int tcp_shutdown(sock_lower_handle_t, int, cred_t *); 69 static void tcp_clr_flowctrl(sock_lower_handle_t); 70 static int tcp_ioctl(sock_lower_handle_t, int, intptr_t, int, int32_t *, 71 cred_t *); 72 static int tcp_close(sock_lower_handle_t, int, cred_t *); 73 74 sock_downcalls_t sock_tcp_downcalls = { 75 tcp_activate, 76 tcp_accept, 77 tcp_bind, 78 tcp_listen, 79 tcp_connect, 80 tcp_getpeername, 81 tcp_getsockname, 82 tcp_getsockopt, 83 tcp_setsockopt, 84 tcp_sendmsg, 85 NULL, 86 NULL, 87 NULL, 88 tcp_shutdown, 89 tcp_clr_flowctrl, 90 tcp_ioctl, 91 tcp_close, 92 }; 93 94 /* ARGSUSED */ 95 static void 96 tcp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle, 97 sock_upcalls_t *sock_upcalls, int flags, cred_t *cr) 98 { 99 conn_t *connp = (conn_t *)proto_handle; 100 struct sock_proto_props sopp; 101 extern struct module_info tcp_rinfo; 102 103 ASSERT(connp->conn_upper_handle == NULL); 104 105 /* All Solaris components should pass a cred for this operation. */ 106 ASSERT(cr != NULL); 107 108 sopp.sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT | 109 SOCKOPT_MAXPSZ | SOCKOPT_MAXBLK | SOCKOPT_RCVTIMER | 110 SOCKOPT_RCVTHRESH | SOCKOPT_MAXADDRLEN | SOCKOPT_MINPSZ; 111 112 sopp.sopp_rxhiwat = SOCKET_RECVHIWATER; 113 sopp.sopp_rxlowat = SOCKET_RECVLOWATER; 114 sopp.sopp_maxpsz = INFPSZ; 115 sopp.sopp_maxblk = INFPSZ; 116 sopp.sopp_rcvtimer = SOCKET_TIMER_INTERVAL; 117 sopp.sopp_rcvthresh = SOCKET_RECVHIWATER >> 3; 118 sopp.sopp_maxaddrlen = sizeof (sin6_t); 119 sopp.sopp_minpsz = (tcp_rinfo.mi_minpsz == 1) ? 0 : 120 tcp_rinfo.mi_minpsz; 121 122 connp->conn_upcalls = sock_upcalls; 123 connp->conn_upper_handle = sock_handle; 124 125 ASSERT(connp->conn_rcvbuf != 0 && 126 connp->conn_rcvbuf == connp->conn_tcp->tcp_rwnd); 127 (*sock_upcalls->su_set_proto_props)(sock_handle, &sopp); 128 } 129 130 /*ARGSUSED*/ 131 static int 132 tcp_accept(sock_lower_handle_t lproto_handle, 133 sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle, 134 cred_t *cr) 135 { 136 conn_t *lconnp, *econnp; 137 tcp_t *listener, *eager; 138 139 /* 140 * KSSL can move a socket from one listener to another, in which 141 * case `lproto_handle' points to the new listener. To ensure that 142 * the original listener is used the information is obtained from 143 * the eager. 144 */ 145 econnp = (conn_t *)eproto_handle; 146 eager = econnp->conn_tcp; 147 ASSERT(IPCL_IS_NONSTR(econnp)); 148 ASSERT(eager->tcp_listener != NULL); 149 listener = eager->tcp_listener; 150 lconnp = (conn_t *)listener->tcp_connp; 151 ASSERT(listener->tcp_state == TCPS_LISTEN); 152 ASSERT(lconnp->conn_upper_handle != NULL); 153 154 /* 155 * It is possible for the accept thread to race with the thread that 156 * made the su_newconn upcall in tcp_newconn_notify. Both 157 * tcp_newconn_notify and tcp_accept require that conn_upper_handle 158 * and conn_upcalls be set before returning, so they both write to 159 * them. However, we're guaranteed that the value written is the same 160 * for both threads. 161 */ 162 ASSERT(econnp->conn_upper_handle == NULL || 163 econnp->conn_upper_handle == sock_handle); 164 ASSERT(econnp->conn_upcalls == NULL || 165 econnp->conn_upcalls == lconnp->conn_upcalls); 166 econnp->conn_upper_handle = sock_handle; 167 econnp->conn_upcalls = lconnp->conn_upcalls; 168 169 ASSERT(econnp->conn_netstack == 170 listener->tcp_connp->conn_netstack); 171 ASSERT(eager->tcp_tcps == listener->tcp_tcps); 172 173 /* 174 * We should have a minimum of 2 references on the conn at this 175 * point. One for TCP and one for the newconn notification 176 * (which is now taken over by IP). In the normal case we would 177 * also have another reference (making a total of 3) for the conn 178 * being in the classifier hash list. However the eager could have 179 * received an RST subsequently and tcp_closei_local could have 180 * removed the eager from the classifier hash list, hence we can't 181 * assert that reference. 182 */ 183 ASSERT(econnp->conn_ref >= 2); 184 185 mutex_enter(&listener->tcp_eager_lock); 186 /* 187 * Non-STREAMS listeners never defer the notification of new 188 * connections. 189 */ 190 ASSERT(!listener->tcp_eager_prev_q0->tcp_conn_def_q0); 191 tcp_eager_unlink(eager); 192 mutex_exit(&listener->tcp_eager_lock); 193 CONN_DEC_REF(listener->tcp_connp); 194 195 return ((eager->tcp_state < TCPS_ESTABLISHED) ? ECONNABORTED : 0); 196 } 197 198 static int 199 tcp_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa, 200 socklen_t len, cred_t *cr) 201 { 202 int error; 203 conn_t *connp = (conn_t *)proto_handle; 204 205 /* All Solaris components should pass a cred for this operation. */ 206 ASSERT(cr != NULL); 207 ASSERT(connp->conn_upper_handle != NULL); 208 209 error = squeue_synch_enter(connp, NULL); 210 if (error != 0) { 211 /* failed to enter */ 212 return (ENOSR); 213 } 214 215 /* binding to a NULL address really means unbind */ 216 if (sa == NULL) { 217 if (connp->conn_tcp->tcp_state < TCPS_LISTEN) 218 error = tcp_do_unbind(connp); 219 else 220 error = EINVAL; 221 } else { 222 error = tcp_do_bind(connp, sa, len, cr, B_TRUE); 223 } 224 225 squeue_synch_exit(connp, SQ_NODRAIN); 226 227 if (error < 0) { 228 if (error == -TOUTSTATE) 229 error = EINVAL; 230 else 231 error = proto_tlitosyserr(-error); 232 } 233 234 return (error); 235 } 236 237 /* ARGSUSED */ 238 static int 239 tcp_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr) 240 { 241 conn_t *connp = (conn_t *)proto_handle; 242 tcp_t *tcp = connp->conn_tcp; 243 int error; 244 245 ASSERT(connp->conn_upper_handle != NULL); 246 247 /* All Solaris components should pass a cred for this operation. */ 248 ASSERT(cr != NULL); 249 250 error = squeue_synch_enter(connp, NULL); 251 if (error != 0) { 252 /* failed to enter */ 253 return (ENOBUFS); 254 } 255 256 error = tcp_do_listen(connp, NULL, 0, backlog, cr, B_FALSE); 257 if (error == 0) { 258 /* 259 * sockfs needs to know what's the maximum number of socket 260 * that can be queued on the listener. 261 */ 262 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 263 SOCK_OPCTL_ENAB_ACCEPT, 264 (uintptr_t)(tcp->tcp_conn_req_max + 265 tcp->tcp_tcps->tcps_conn_req_max_q0)); 266 } else if (error < 0) { 267 if (error == -TOUTSTATE) 268 error = EINVAL; 269 else 270 error = proto_tlitosyserr(-error); 271 } 272 squeue_synch_exit(connp, SQ_NODRAIN); 273 return (error); 274 } 275 276 static int 277 tcp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, 278 socklen_t len, sock_connid_t *id, cred_t *cr) 279 { 280 conn_t *connp = (conn_t *)proto_handle; 281 int error; 282 283 ASSERT(connp->conn_upper_handle != NULL); 284 285 /* All Solaris components should pass a cred for this operation. */ 286 ASSERT(cr != NULL); 287 288 error = proto_verify_ip_addr(connp->conn_family, sa, len); 289 if (error != 0) { 290 return (error); 291 } 292 293 error = squeue_synch_enter(connp, NULL); 294 if (error != 0) { 295 /* failed to enter */ 296 return (ENOSR); 297 } 298 299 /* 300 * TCP supports quick connect, so no need to do an implicit bind 301 */ 302 error = tcp_do_connect(connp, sa, len, cr, curproc->p_pid); 303 if (error == 0) { 304 *id = connp->conn_tcp->tcp_connid; 305 } else if (error < 0) { 306 if (error == -TOUTSTATE) { 307 switch (connp->conn_tcp->tcp_state) { 308 case TCPS_SYN_SENT: 309 error = EALREADY; 310 break; 311 case TCPS_ESTABLISHED: 312 error = EISCONN; 313 break; 314 case TCPS_LISTEN: 315 error = EOPNOTSUPP; 316 break; 317 default: 318 error = EINVAL; 319 break; 320 } 321 } else { 322 error = proto_tlitosyserr(-error); 323 } 324 } 325 326 if (connp->conn_tcp->tcp_loopback) { 327 struct sock_proto_props sopp; 328 329 sopp.sopp_flags = SOCKOPT_LOOPBACK; 330 sopp.sopp_loopback = B_TRUE; 331 332 (*connp->conn_upcalls->su_set_proto_props)( 333 connp->conn_upper_handle, &sopp); 334 } 335 done: 336 /* 337 * Indicate (via SQ_PROCESS) that it is acceptable for the squeue to 338 * attempt to drain a pending request relevant to this connection when 339 * exiting the synchronous context. This can improve the performance 340 * and efficiency of TCP connect(2) operations to localhost. 341 */ 342 squeue_synch_exit(connp, SQ_PROCESS); 343 344 return ((error == 0) ? EINPROGRESS : error); 345 } 346 347 /* ARGSUSED3 */ 348 static int 349 tcp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr, 350 socklen_t *addrlenp, cred_t *cr) 351 { 352 conn_t *connp = (conn_t *)proto_handle; 353 tcp_t *tcp = connp->conn_tcp; 354 355 /* All Solaris components should pass a cred for this operation. */ 356 ASSERT(cr != NULL); 357 358 ASSERT(tcp != NULL); 359 if (tcp->tcp_state < TCPS_SYN_RCVD) 360 return (ENOTCONN); 361 362 return (conn_getpeername(connp, addr, addrlenp)); 363 } 364 365 /* ARGSUSED3 */ 366 static int 367 tcp_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *addr, 368 socklen_t *addrlenp, cred_t *cr) 369 { 370 conn_t *connp = (conn_t *)proto_handle; 371 372 /* All Solaris components should pass a cred for this operation. */ 373 ASSERT(cr != NULL); 374 375 return (conn_getsockname(connp, addr, addrlenp)); 376 } 377 378 /* returns UNIX error, the optlen is a value-result arg */ 379 static int 380 tcp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, 381 void *optvalp, socklen_t *optlen, cred_t *cr) 382 { 383 conn_t *connp = (conn_t *)proto_handle; 384 int error; 385 t_uscalar_t max_optbuf_len; 386 void *optvalp_buf; 387 int len; 388 389 ASSERT(connp->conn_upper_handle != NULL); 390 391 error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len, 392 tcp_opt_obj.odb_opt_des_arr, 393 tcp_opt_obj.odb_opt_arr_cnt, 394 B_FALSE, B_TRUE, cr); 395 if (error != 0) { 396 if (error < 0) { 397 error = proto_tlitosyserr(-error); 398 } 399 return (error); 400 } 401 402 optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP); 403 404 error = squeue_synch_enter(connp, NULL); 405 if (error == ENOMEM) { 406 kmem_free(optvalp_buf, max_optbuf_len); 407 return (ENOMEM); 408 } 409 410 len = tcp_opt_get(connp, level, option_name, optvalp_buf); 411 squeue_synch_exit(connp, SQ_NODRAIN); 412 413 if (len == -1) { 414 kmem_free(optvalp_buf, max_optbuf_len); 415 return (EINVAL); 416 } 417 418 /* 419 * update optlen and copy option value 420 */ 421 t_uscalar_t size = MIN(len, *optlen); 422 423 bcopy(optvalp_buf, optvalp, size); 424 bcopy(&size, optlen, sizeof (size)); 425 426 kmem_free(optvalp_buf, max_optbuf_len); 427 return (0); 428 } 429 430 static int 431 tcp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, 432 const void *optvalp, socklen_t optlen, cred_t *cr) 433 { 434 conn_t *connp = (conn_t *)proto_handle; 435 int error; 436 437 ASSERT(connp->conn_upper_handle != NULL); 438 /* 439 * Entering the squeue synchronously can result in a context switch, 440 * which can cause a rather sever performance degradation. So we try to 441 * handle whatever options we can without entering the squeue. 442 */ 443 if (level == IPPROTO_TCP) { 444 switch (option_name) { 445 case TCP_NODELAY: 446 if (optlen != sizeof (int32_t)) 447 return (EINVAL); 448 mutex_enter(&connp->conn_tcp->tcp_non_sq_lock); 449 connp->conn_tcp->tcp_naglim = *(int *)optvalp ? 1 : 450 connp->conn_tcp->tcp_mss; 451 mutex_exit(&connp->conn_tcp->tcp_non_sq_lock); 452 return (0); 453 default: 454 break; 455 } 456 } 457 458 error = squeue_synch_enter(connp, NULL); 459 if (error == ENOMEM) { 460 return (ENOMEM); 461 } 462 463 error = proto_opt_check(level, option_name, optlen, NULL, 464 tcp_opt_obj.odb_opt_des_arr, 465 tcp_opt_obj.odb_opt_arr_cnt, 466 B_TRUE, B_FALSE, cr); 467 468 if (error != 0) { 469 if (error < 0) { 470 error = proto_tlitosyserr(-error); 471 } 472 squeue_synch_exit(connp, SQ_NODRAIN); 473 return (error); 474 } 475 476 error = tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name, 477 optlen, (uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp, 478 NULL, cr); 479 squeue_synch_exit(connp, SQ_NODRAIN); 480 481 ASSERT(error >= 0); 482 483 return (error); 484 } 485 486 /* ARGSUSED */ 487 static int 488 tcp_sendmsg(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, 489 cred_t *cr) 490 { 491 tcp_t *tcp; 492 uint32_t msize; 493 conn_t *connp = (conn_t *)proto_handle; 494 int32_t tcpstate; 495 496 /* All Solaris components should pass a cred for this operation. */ 497 ASSERT(cr != NULL); 498 499 ASSERT(connp->conn_ref >= 2); 500 ASSERT(connp->conn_upper_handle != NULL); 501 502 if (msg->msg_controllen != 0) { 503 freemsg(mp); 504 return (EOPNOTSUPP); 505 } 506 507 switch (DB_TYPE(mp)) { 508 case M_DATA: 509 tcp = connp->conn_tcp; 510 ASSERT(tcp != NULL); 511 512 tcpstate = tcp->tcp_state; 513 if (tcpstate < TCPS_ESTABLISHED) { 514 freemsg(mp); 515 /* 516 * We return ENOTCONN if the endpoint is trying to 517 * connect or has never been connected, and EPIPE if it 518 * has been disconnected. The connection id helps us 519 * distinguish between the last two cases. 520 */ 521 return ((tcpstate == TCPS_SYN_SENT) ? ENOTCONN : 522 ((tcp->tcp_connid > 0) ? EPIPE : ENOTCONN)); 523 } else if (tcpstate > TCPS_CLOSE_WAIT) { 524 freemsg(mp); 525 return (EPIPE); 526 } 527 528 msize = msgdsize(mp); 529 530 mutex_enter(&tcp->tcp_non_sq_lock); 531 tcp->tcp_squeue_bytes += msize; 532 /* 533 * Squeue Flow Control 534 */ 535 if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) { 536 tcp_setqfull(tcp); 537 } 538 mutex_exit(&tcp->tcp_non_sq_lock); 539 540 /* 541 * The application may pass in an address in the msghdr, but 542 * we ignore the address on connection-oriented sockets. 543 * Just like BSD this code does not generate an error for 544 * TCP (a CONNREQUIRED socket) when sending to an address 545 * passed in with sendto/sendmsg. Instead the data is 546 * delivered on the connection as if no address had been 547 * supplied. 548 */ 549 CONN_INC_REF(connp); 550 551 if (msg->msg_flags & MSG_OOB) { 552 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output_urgent, 553 connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT); 554 } else { 555 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output, 556 connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT); 557 } 558 559 return (0); 560 561 default: 562 ASSERT(0); 563 } 564 565 freemsg(mp); 566 return (0); 567 } 568 569 /* ARGSUSED */ 570 static int 571 tcp_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr) 572 { 573 conn_t *connp = (conn_t *)proto_handle; 574 tcp_t *tcp = connp->conn_tcp; 575 576 ASSERT(connp->conn_upper_handle != NULL); 577 578 /* All Solaris components should pass a cred for this operation. */ 579 ASSERT(cr != NULL); 580 581 /* 582 * X/Open requires that we check the connected state. 583 */ 584 if (tcp->tcp_state < TCPS_SYN_SENT) 585 return (ENOTCONN); 586 587 /* shutdown the send side */ 588 if (how != SHUT_RD) { 589 mblk_t *bp; 590 591 bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL); 592 CONN_INC_REF(connp); 593 SQUEUE_ENTER_ONE(connp->conn_sqp, bp, tcp_shutdown_output, 594 connp, NULL, SQ_NODRAIN, SQTAG_TCP_SHUTDOWN_OUTPUT); 595 596 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 597 SOCK_OPCTL_SHUT_SEND, 0); 598 } 599 600 /* shutdown the recv side */ 601 if (how != SHUT_WR) 602 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 603 SOCK_OPCTL_SHUT_RECV, 0); 604 605 return (0); 606 } 607 608 static void 609 tcp_clr_flowctrl(sock_lower_handle_t proto_handle) 610 { 611 conn_t *connp = (conn_t *)proto_handle; 612 tcp_t *tcp = connp->conn_tcp; 613 mblk_t *mp; 614 int error; 615 616 ASSERT(connp->conn_upper_handle != NULL); 617 618 /* 619 * If tcp->tcp_rsrv_mp == NULL, it means that tcp_clr_flowctrl() 620 * is currently running. 621 */ 622 mutex_enter(&tcp->tcp_rsrv_mp_lock); 623 if ((mp = tcp->tcp_rsrv_mp) == NULL) { 624 mutex_exit(&tcp->tcp_rsrv_mp_lock); 625 return; 626 } 627 tcp->tcp_rsrv_mp = NULL; 628 mutex_exit(&tcp->tcp_rsrv_mp_lock); 629 630 error = squeue_synch_enter(connp, mp); 631 ASSERT(error == 0); 632 633 mutex_enter(&tcp->tcp_rsrv_mp_lock); 634 tcp->tcp_rsrv_mp = mp; 635 mutex_exit(&tcp->tcp_rsrv_mp_lock); 636 637 if (tcp->tcp_fused) { 638 tcp_fuse_backenable(tcp); 639 } else { 640 tcp->tcp_rwnd = connp->conn_rcvbuf; 641 /* 642 * Send back a window update immediately if TCP is above 643 * ESTABLISHED state and the increase of the rcv window 644 * that the other side knows is at least 1 MSS after flow 645 * control is lifted. 646 */ 647 if (tcp->tcp_state >= TCPS_ESTABLISHED && 648 tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) { 649 tcp_xmit_ctl(NULL, tcp, 650 (tcp->tcp_swnd == 0) ? tcp->tcp_suna : 651 tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK); 652 } 653 } 654 655 squeue_synch_exit(connp, SQ_NODRAIN); 656 } 657 658 /* ARGSUSED */ 659 static int 660 tcp_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg, 661 int mode, int32_t *rvalp, cred_t *cr) 662 { 663 conn_t *connp = (conn_t *)proto_handle; 664 int error; 665 666 ASSERT(connp->conn_upper_handle != NULL); 667 668 /* All Solaris components should pass a cred for this operation. */ 669 ASSERT(cr != NULL); 670 671 /* 672 * If we don't have a helper stream then create one. 673 * ip_create_helper_stream takes care of locking the conn_t, 674 * so this check for NULL is just a performance optimization. 675 */ 676 if (connp->conn_helper_info == NULL) { 677 tcp_stack_t *tcps = connp->conn_tcp->tcp_tcps; 678 679 /* 680 * Create a helper stream for non-STREAMS socket. 681 */ 682 error = ip_create_helper_stream(connp, tcps->tcps_ldi_ident); 683 if (error != 0) { 684 ip0dbg(("tcp_ioctl: create of IP helper stream " 685 "failed %d\n", error)); 686 return (error); 687 } 688 } 689 690 switch (cmd) { 691 case ND_SET: 692 case ND_GET: 693 case _SIOCSOCKFALLBACK: 694 case TCP_IOC_ABORT_CONN: 695 case TI_GETPEERNAME: 696 case TI_GETMYNAME: 697 ip1dbg(("tcp_ioctl: cmd 0x%x on non streams socket", 698 cmd)); 699 error = EINVAL; 700 break; 701 default: 702 /* 703 * If the conn is not closing, pass on to IP using 704 * helper stream. Bump the ioctlref to prevent tcp_close 705 * from closing the rq/wq out from underneath the ioctl 706 * if it ends up queued or aborted/interrupted. 707 */ 708 mutex_enter(&connp->conn_lock); 709 if (connp->conn_state_flags & (CONN_CLOSING)) { 710 mutex_exit(&connp->conn_lock); 711 error = EINVAL; 712 break; 713 } 714 CONN_INC_IOCTLREF_LOCKED(connp); 715 error = ldi_ioctl(connp->conn_helper_info->iphs_handle, 716 cmd, arg, mode, cr, rvalp); 717 CONN_DEC_IOCTLREF(connp); 718 break; 719 } 720 return (error); 721 } 722 723 /* ARGSUSED */ 724 static int 725 tcp_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr) 726 { 727 conn_t *connp = (conn_t *)proto_handle; 728 729 ASSERT(connp->conn_upper_handle != NULL); 730 731 /* All Solaris components should pass a cred for this operation. */ 732 ASSERT(cr != NULL); 733 734 tcp_close_common(connp, flags); 735 736 ip_free_helper_stream(connp); 737 738 /* 739 * Drop IP's reference on the conn. This is the last reference 740 * on the connp if the state was less than established. If the 741 * connection has gone into timewait state, then we will have 742 * one ref for the TCP and one more ref (total of two) for the 743 * classifier connected hash list (a timewait connections stays 744 * in connected hash till closed). 745 * 746 * We can't assert the references because there might be other 747 * transient reference places because of some walkers or queued 748 * packets in squeue for the timewait state. 749 */ 750 CONN_DEC_REF(connp); 751 752 /* 753 * EINPROGRESS tells sockfs to wait for a 'closed' upcall before 754 * freeing the socket. 755 */ 756 return (EINPROGRESS); 757 } 758 759 /* ARGSUSED */ 760 sock_lower_handle_t 761 tcp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, 762 uint_t *smodep, int *errorp, int flags, cred_t *credp) 763 { 764 conn_t *connp; 765 boolean_t isv6 = family == AF_INET6; 766 767 if (type != SOCK_STREAM || (family != AF_INET && family != AF_INET6) || 768 (proto != 0 && proto != IPPROTO_TCP)) { 769 *errorp = EPROTONOSUPPORT; 770 return (NULL); 771 } 772 773 connp = tcp_create_common(credp, isv6, B_TRUE, errorp); 774 if (connp == NULL) { 775 return (NULL); 776 } 777 778 /* 779 * Put the ref for TCP. Ref for IP was already put 780 * by ipcl_conn_create. Also make the conn_t globally 781 * visible to walkers 782 */ 783 mutex_enter(&connp->conn_lock); 784 CONN_INC_REF_LOCKED(connp); 785 ASSERT(connp->conn_ref == 2); 786 connp->conn_state_flags &= ~CONN_INCIPIENT; 787 788 connp->conn_flags |= IPCL_NONSTR; 789 mutex_exit(&connp->conn_lock); 790 791 ASSERT(errorp != NULL); 792 *errorp = 0; 793 *sock_downcalls = &sock_tcp_downcalls; 794 *smodep = SM_CONNREQUIRED | SM_EXDATA | SM_ACCEPTSUPP | 795 SM_SENDFILESUPP; 796 797 return ((sock_lower_handle_t)connp); 798 } 799 800 /* 801 * tcp_fallback 802 * 803 * A direct socket is falling back to using STREAMS. The queue 804 * that is being passed down was created using tcp_open() with 805 * the SO_FALLBACK flag set. As a result, the queue is not 806 * associated with a conn, and the q_ptrs instead contain the 807 * dev and minor area that should be used. 808 * 809 * The 'issocket' flag indicates whether the FireEngine 810 * optimizations should be used. The common case would be that 811 * optimizations are enabled, and they might be subsequently 812 * disabled using the _SIOCSOCKFALLBACK ioctl. 813 */ 814 815 /* 816 * An active connection is falling back to TPI. Gather all the information 817 * required by the STREAM head and TPI sonode and send it up. 818 */ 819 static void 820 tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q, 821 boolean_t issocket, so_proto_quiesced_cb_t quiesced_cb, 822 sock_quiesce_arg_t *arg) 823 { 824 conn_t *connp = tcp->tcp_connp; 825 struct stroptions *stropt; 826 struct T_capability_ack tca; 827 struct sockaddr_in6 laddr, faddr; 828 socklen_t laddrlen, faddrlen; 829 short opts; 830 int error; 831 mblk_t *mp, *mpnext; 832 833 connp->conn_dev = (dev_t)RD(q)->q_ptr; 834 connp->conn_minor_arena = WR(q)->q_ptr; 835 836 RD(q)->q_ptr = WR(q)->q_ptr = connp; 837 838 connp->conn_rq = RD(q); 839 connp->conn_wq = WR(q); 840 841 WR(q)->q_qinfo = &tcp_sock_winit; 842 843 if (!issocket) 844 tcp_use_pure_tpi(tcp); 845 846 /* 847 * free the helper stream 848 */ 849 ip_free_helper_stream(connp); 850 851 /* 852 * Notify the STREAM head about options 853 */ 854 DB_TYPE(stropt_mp) = M_SETOPTS; 855 stropt = (struct stroptions *)stropt_mp->b_rptr; 856 stropt_mp->b_wptr += sizeof (struct stroptions); 857 stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK; 858 859 stropt->so_wroff = connp->conn_ht_iphc_len + (tcp->tcp_loopback ? 0 : 860 tcp->tcp_tcps->tcps_wroff_xtra); 861 if (tcp->tcp_snd_sack_ok) 862 stropt->so_wroff += TCPOPT_MAX_SACK_LEN; 863 stropt->so_hiwat = connp->conn_rcvbuf; 864 stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE); 865 866 putnext(RD(q), stropt_mp); 867 868 /* 869 * Collect the information needed to sync with the sonode 870 */ 871 tcp_do_capability_ack(tcp, &tca, TC1_INFO|TC1_ACCEPTOR_ID); 872 873 laddrlen = faddrlen = sizeof (sin6_t); 874 (void) tcp_getsockname((sock_lower_handle_t)connp, 875 (struct sockaddr *)&laddr, &laddrlen, CRED()); 876 error = tcp_getpeername((sock_lower_handle_t)connp, 877 (struct sockaddr *)&faddr, &faddrlen, CRED()); 878 if (error != 0) 879 faddrlen = 0; 880 881 opts = 0; 882 if (connp->conn_oobinline) 883 opts |= SO_OOBINLINE; 884 if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE) 885 opts |= SO_DONTROUTE; 886 887 /* 888 * Notify the socket that the protocol is now quiescent, 889 * and it's therefore safe move data from the socket 890 * to the stream head. 891 */ 892 mp = (*quiesced_cb)(connp->conn_upper_handle, arg, &tca, 893 (struct sockaddr *)&laddr, laddrlen, 894 (struct sockaddr *)&faddr, faddrlen, opts); 895 896 while (mp != NULL) { 897 mpnext = mp->b_next; 898 tcp->tcp_rcv_list = mp->b_next; 899 mp->b_next = NULL; 900 putnext(q, mp); 901 mp = mpnext; 902 } 903 ASSERT(tcp->tcp_rcv_last_head == NULL); 904 ASSERT(tcp->tcp_rcv_last_tail == NULL); 905 ASSERT(tcp->tcp_rcv_cnt == 0); 906 907 /* 908 * All eagers in q0 are marked as being non-STREAM, so they will 909 * make su_newconn upcalls when the handshake completes, which 910 * will fail (resulting in the conn being closed). So we just blow 911 * off everything in q0 instead of waiting for the inevitable. 912 */ 913 if (tcp->tcp_conn_req_cnt_q0 != 0) 914 tcp_eager_cleanup(tcp, B_TRUE); 915 } 916 917 /* 918 * An eager is falling back to TPI. All we have to do is send 919 * up a T_CONN_IND. 920 */ 921 static void 922 tcp_fallback_eager(tcp_t *eager, boolean_t issocket, 923 so_proto_quiesced_cb_t quiesced_cb, sock_quiesce_arg_t *arg) 924 { 925 conn_t *connp = eager->tcp_connp; 926 tcp_t *listener = eager->tcp_listener; 927 mblk_t *mp; 928 929 ASSERT(listener != NULL); 930 931 /* 932 * Notify the socket that the protocol is now quiescent, 933 * and it's therefore safe move data from the socket 934 * to tcp's rcv queue. 935 */ 936 mp = (*quiesced_cb)(connp->conn_upper_handle, arg, NULL, NULL, 0, 937 NULL, 0, 0); 938 939 if (mp != NULL) { 940 ASSERT(eager->tcp_rcv_cnt == 0); 941 942 eager->tcp_rcv_list = mp; 943 eager->tcp_rcv_cnt = msgdsize(mp); 944 while (mp->b_next != NULL) { 945 mp = mp->b_next; 946 eager->tcp_rcv_cnt += msgdsize(mp); 947 } 948 eager->tcp_rcv_last_head = mp; 949 while (mp->b_cont) 950 mp = mp->b_cont; 951 eager->tcp_rcv_last_tail = mp; 952 if (eager->tcp_rcv_cnt > eager->tcp_rwnd) 953 eager->tcp_rwnd = 0; 954 else 955 eager->tcp_rwnd -= eager->tcp_rcv_cnt; 956 } 957 958 if (!issocket) 959 eager->tcp_issocket = B_FALSE; 960 /* 961 * The stream for this eager does not yet exist, so mark it as 962 * being detached. 963 */ 964 eager->tcp_detached = B_TRUE; 965 eager->tcp_hard_binding = B_TRUE; 966 connp->conn_rq = listener->tcp_connp->conn_rq; 967 connp->conn_wq = listener->tcp_connp->conn_wq; 968 969 /* Send up the connection indication */ 970 mp = eager->tcp_conn.tcp_eager_conn_ind; 971 ASSERT(mp != NULL); 972 eager->tcp_conn.tcp_eager_conn_ind = NULL; 973 974 /* 975 * TLI/XTI applications will get confused by 976 * sending eager as an option since it violates 977 * the option semantics. So remove the eager as 978 * option since TLI/XTI app doesn't need it anyway. 979 */ 980 if (!issocket) { 981 struct T_conn_ind *conn_ind; 982 983 conn_ind = (struct T_conn_ind *)mp->b_rptr; 984 conn_ind->OPT_length = 0; 985 conn_ind->OPT_offset = 0; 986 } 987 988 /* 989 * Sockfs guarantees that the listener will not be closed 990 * during fallback. So we can safely use the listener's queue. 991 */ 992 putnext(listener->tcp_connp->conn_rq, mp); 993 } 994 995 996 int 997 tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q, 998 boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb, 999 sock_quiesce_arg_t *arg) 1000 { 1001 tcp_t *tcp; 1002 conn_t *connp = (conn_t *)proto_handle; 1003 int error; 1004 mblk_t *stropt_mp; 1005 mblk_t *ordrel_mp; 1006 1007 tcp = connp->conn_tcp; 1008 1009 stropt_mp = allocb_wait(sizeof (struct stroptions), BPRI_HI, STR_NOSIG, 1010 NULL); 1011 1012 /* Pre-allocate the T_ordrel_ind mblk. */ 1013 ASSERT(tcp->tcp_ordrel_mp == NULL); 1014 ordrel_mp = allocb_wait(sizeof (struct T_ordrel_ind), BPRI_HI, 1015 STR_NOSIG, NULL); 1016 ordrel_mp->b_datap->db_type = M_PROTO; 1017 ((struct T_ordrel_ind *)ordrel_mp->b_rptr)->PRIM_type = T_ORDREL_IND; 1018 ordrel_mp->b_wptr += sizeof (struct T_ordrel_ind); 1019 1020 /* 1021 * Enter the squeue so that no new packets can come in 1022 */ 1023 error = squeue_synch_enter(connp, NULL); 1024 if (error != 0) { 1025 /* failed to enter, free all the pre-allocated messages. */ 1026 freeb(stropt_mp); 1027 freeb(ordrel_mp); 1028 return (ENOMEM); 1029 } 1030 1031 /* 1032 * Both endpoints must be of the same type (either STREAMS or 1033 * non-STREAMS) for fusion to be enabled. So if we are fused, 1034 * we have to unfuse. 1035 */ 1036 if (tcp->tcp_fused) 1037 tcp_unfuse(tcp); 1038 1039 if (tcp->tcp_listener != NULL) { 1040 /* The eager will deal with opts when accept() is called */ 1041 freeb(stropt_mp); 1042 tcp_fallback_eager(tcp, direct_sockfs, quiesced_cb, arg); 1043 } else { 1044 tcp_fallback_noneager(tcp, stropt_mp, q, direct_sockfs, 1045 quiesced_cb, arg); 1046 } 1047 1048 /* 1049 * No longer a direct socket 1050 * 1051 * Note that we intentionally leave the upper_handle and upcalls 1052 * intact, since eagers may still be using them. 1053 */ 1054 connp->conn_flags &= ~IPCL_NONSTR; 1055 tcp->tcp_ordrel_mp = ordrel_mp; 1056 1057 /* 1058 * There should be atleast two ref's (IP + TCP) 1059 */ 1060 ASSERT(connp->conn_ref >= 2); 1061 squeue_synch_exit(connp, SQ_NODRAIN); 1062 1063 return (0); 1064 } 1065 1066 /* 1067 * Notifies a non-STREAMS based listener about a new connection. This 1068 * function is executed on the *eager*'s squeue once the 3 way handshake 1069 * has completed. Note that the behavior differs from STREAMS, where the 1070 * T_CONN_IND is sent up by tcp_send_conn_ind() while on the *listener*'s 1071 * squeue. 1072 * 1073 * Returns B_TRUE if the notification succeeded and an upper handle was 1074 * obtained. `tcp' should be closed on failure. 1075 */ 1076 boolean_t 1077 tcp_newconn_notify(tcp_t *tcp, ip_recv_attr_t *ira) 1078 { 1079 tcp_t *listener = tcp->tcp_listener; 1080 conn_t *lconnp = listener->tcp_connp; 1081 conn_t *econnp = tcp->tcp_connp; 1082 tcp_t *tail; 1083 ipaddr_t *addr_cache; 1084 sock_upper_handle_t upper; 1085 struct sock_proto_props sopp; 1086 1087 mutex_enter(&listener->tcp_eager_lock); 1088 /* 1089 * Take the eager out, if it is in the list of droppable eagers 1090 * as we are here because the 3W handshake is over. 1091 */ 1092 MAKE_UNDROPPABLE(tcp); 1093 /* 1094 * The eager already has an extra ref put in tcp_input_data 1095 * so that it stays till accept comes back even though it 1096 * might get into TCPS_CLOSED as a result of a TH_RST etc. 1097 */ 1098 ASSERT(listener->tcp_conn_req_cnt_q0 > 0); 1099 listener->tcp_conn_req_cnt_q0--; 1100 listener->tcp_conn_req_cnt_q++; 1101 1102 /* Move from SYN_RCVD to ESTABLISHED list */ 1103 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = tcp->tcp_eager_prev_q0; 1104 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp->tcp_eager_next_q0; 1105 tcp->tcp_eager_prev_q0 = NULL; 1106 tcp->tcp_eager_next_q0 = NULL; 1107 1108 /* 1109 * Insert at end of the queue because connections are accepted 1110 * in chronological order. Leaving the older connections at front 1111 * of the queue helps reducing search time. 1112 */ 1113 tail = listener->tcp_eager_last_q; 1114 if (tail != NULL) 1115 tail->tcp_eager_next_q = tcp; 1116 else 1117 listener->tcp_eager_next_q = tcp; 1118 listener->tcp_eager_last_q = tcp; 1119 tcp->tcp_eager_next_q = NULL; 1120 1121 /* we have timed out before */ 1122 if (tcp->tcp_syn_rcvd_timeout != 0) { 1123 tcp->tcp_syn_rcvd_timeout = 0; 1124 listener->tcp_syn_rcvd_timeout--; 1125 if (listener->tcp_syn_defense && 1126 listener->tcp_syn_rcvd_timeout <= 1127 (listener->tcp_tcps->tcps_conn_req_max_q0 >> 5) && 1128 10*MINUTES < TICK_TO_MSEC(ddi_get_lbolt64() - 1129 listener->tcp_last_rcv_lbolt)) { 1130 /* 1131 * Turn off the defense mode if we 1132 * believe the SYN attack is over. 1133 */ 1134 listener->tcp_syn_defense = B_FALSE; 1135 if (listener->tcp_ip_addr_cache) { 1136 kmem_free((void *)listener->tcp_ip_addr_cache, 1137 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t)); 1138 listener->tcp_ip_addr_cache = NULL; 1139 } 1140 } 1141 } 1142 addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache); 1143 if (addr_cache != NULL) { 1144 /* 1145 * We have finished a 3-way handshake with this 1146 * remote host. This proves the IP addr is good. 1147 * Cache it! 1148 */ 1149 addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] = 1150 tcp->tcp_connp->conn_faddr_v4; 1151 } 1152 mutex_exit(&listener->tcp_eager_lock); 1153 1154 /* 1155 * Notify the ULP about the newconn. It is guaranteed that no 1156 * tcp_accept() call will be made for the eager if the 1157 * notification fails. 1158 */ 1159 if ((upper = (*lconnp->conn_upcalls->su_newconn) 1160 (lconnp->conn_upper_handle, (sock_lower_handle_t)econnp, 1161 &sock_tcp_downcalls, ira->ira_cred, ira->ira_cpid, 1162 &econnp->conn_upcalls)) == NULL) { 1163 return (B_FALSE); 1164 } 1165 econnp->conn_upper_handle = upper; 1166 1167 tcp->tcp_detached = B_FALSE; 1168 tcp->tcp_hard_binding = B_FALSE; 1169 tcp->tcp_tconnind_started = B_TRUE; 1170 1171 if (econnp->conn_keepalive) { 1172 tcp->tcp_ka_last_intrvl = 0; 1173 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer, 1174 tcp->tcp_ka_interval); 1175 } 1176 1177 /* Update the necessary parameters */ 1178 tcp_get_proto_props(tcp, &sopp); 1179 1180 (*econnp->conn_upcalls->su_set_proto_props) 1181 (econnp->conn_upper_handle, &sopp); 1182 1183 return (B_TRUE); 1184 } 1185