1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 26 /* This files contains all TCP TLI/TPI related functions */ 27 28 #include <sys/types.h> 29 #include <sys/stream.h> 30 #include <sys/strsun.h> 31 #include <sys/strsubr.h> 32 #include <sys/stropts.h> 33 #include <sys/strlog.h> 34 #define _SUN_TPI_VERSION 2 35 #include <sys/tihdr.h> 36 #include <sys/suntpi.h> 37 #include <sys/xti_inet.h> 38 #include <sys/squeue_impl.h> 39 #include <sys/squeue.h> 40 41 #include <inet/common.h> 42 #include <inet/ip.h> 43 #include <inet/tcp.h> 44 #include <inet/tcp_impl.h> 45 #include <inet/proto_set.h> 46 47 static void tcp_accept_swap(tcp_t *, tcp_t *, tcp_t *); 48 static int tcp_conprim_opt_process(tcp_t *, mblk_t *, int *, int *, int *); 49 50 void 51 tcp_use_pure_tpi(tcp_t *tcp) 52 { 53 conn_t *connp = tcp->tcp_connp; 54 55 #ifdef _ILP32 56 tcp->tcp_acceptor_id = (t_uscalar_t)connp->conn_rq; 57 #else 58 tcp->tcp_acceptor_id = connp->conn_dev; 59 #endif 60 /* 61 * Insert this socket into the acceptor hash. 
62 * We might need it for T_CONN_RES message 63 */ 64 tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp); 65 66 tcp->tcp_issocket = B_FALSE; 67 TCP_STAT(tcp->tcp_tcps, tcp_sock_fallback); 68 } 69 70 /* Shorthand to generate and send TPI error acks to our client */ 71 void 72 tcp_err_ack(tcp_t *tcp, mblk_t *mp, int t_error, int sys_error) 73 { 74 if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL) 75 putnext(tcp->tcp_connp->conn_rq, mp); 76 } 77 78 /* Shorthand to generate and send TPI error acks to our client */ 79 void 80 tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive, 81 int t_error, int sys_error) 82 { 83 struct T_error_ack *teackp; 84 85 if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack), 86 M_PCPROTO, T_ERROR_ACK)) != NULL) { 87 teackp = (struct T_error_ack *)mp->b_rptr; 88 teackp->ERROR_prim = primitive; 89 teackp->TLI_error = t_error; 90 teackp->UNIX_error = sys_error; 91 putnext(tcp->tcp_connp->conn_rq, mp); 92 } 93 } 94 95 /* 96 * TCP routine to get the values of options. 97 */ 98 int 99 tcp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr) 100 { 101 return (tcp_opt_get(Q_TO_CONN(q), level, name, ptr)); 102 } 103 104 /* ARGSUSED */ 105 int 106 tcp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name, 107 uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, 108 void *thisdg_attrs, cred_t *cr) 109 { 110 conn_t *connp = Q_TO_CONN(q); 111 112 return (tcp_opt_set(connp, optset_context, level, name, inlen, invalp, 113 outlenp, outvalp, thisdg_attrs, cr)); 114 } 115 116 static int 117 tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp, int *do_disconnectp, 118 int *t_errorp, int *sys_errorp) 119 { 120 int error; 121 int is_absreq_failure; 122 t_scalar_t *opt_lenp; 123 t_scalar_t opt_offset; 124 int prim_type; 125 struct T_conn_req *tcreqp; 126 struct T_conn_res *tcresp; 127 cred_t *cr; 128 129 /* 130 * All Solaris components should pass a db_credp 131 * for this TPI message, hence we ASSERT. 
132 * But in case there is some other M_PROTO that looks 133 * like a TPI message sent by some other kernel 134 * component, we check and return an error. 135 */ 136 cr = msg_getcred(mp, NULL); 137 ASSERT(cr != NULL); 138 if (cr == NULL) 139 return (-1); 140 141 prim_type = ((union T_primitives *)mp->b_rptr)->type; 142 ASSERT(prim_type == T_CONN_REQ || prim_type == O_T_CONN_RES || 143 prim_type == T_CONN_RES); 144 145 switch (prim_type) { 146 case T_CONN_REQ: 147 tcreqp = (struct T_conn_req *)mp->b_rptr; 148 opt_offset = tcreqp->OPT_offset; 149 opt_lenp = (t_scalar_t *)&tcreqp->OPT_length; 150 break; 151 case O_T_CONN_RES: 152 case T_CONN_RES: 153 tcresp = (struct T_conn_res *)mp->b_rptr; 154 opt_offset = tcresp->OPT_offset; 155 opt_lenp = (t_scalar_t *)&tcresp->OPT_length; 156 break; 157 } 158 159 *t_errorp = 0; 160 *sys_errorp = 0; 161 *do_disconnectp = 0; 162 163 error = tpi_optcom_buf(tcp->tcp_connp->conn_wq, mp, opt_lenp, 164 opt_offset, cr, &tcp_opt_obj, 165 NULL, &is_absreq_failure); 166 167 switch (error) { 168 case 0: /* no error */ 169 ASSERT(is_absreq_failure == 0); 170 return (0); 171 case ENOPROTOOPT: 172 *t_errorp = TBADOPT; 173 break; 174 case EACCES: 175 *t_errorp = TACCES; 176 break; 177 default: 178 *t_errorp = TSYSERR; *sys_errorp = error; 179 break; 180 } 181 if (is_absreq_failure != 0) { 182 /* 183 * The connection request should get the local ack 184 * T_OK_ACK and then a T_DISCON_IND. 185 */ 186 *do_disconnectp = 1; 187 } 188 return (-1); 189 } 190 191 void 192 tcp_tpi_bind(tcp_t *tcp, mblk_t *mp) 193 { 194 int error; 195 conn_t *connp = tcp->tcp_connp; 196 struct sockaddr *sa; 197 mblk_t *mp1; 198 struct T_bind_req *tbr; 199 int backlog; 200 socklen_t len; 201 sin_t *sin; 202 sin6_t *sin6; 203 cred_t *cr; 204 205 /* 206 * All Solaris components should pass a db_credp 207 * for this TPI message, hence we ASSERT. 
208 * But in case there is some other M_PROTO that looks 209 * like a TPI message sent by some other kernel 210 * component, we check and return an error. 211 */ 212 cr = msg_getcred(mp, NULL); 213 ASSERT(cr != NULL); 214 if (cr == NULL) { 215 tcp_err_ack(tcp, mp, TSYSERR, EINVAL); 216 return; 217 } 218 219 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); 220 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) { 221 if (connp->conn_debug) { 222 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 223 "tcp_tpi_bind: bad req, len %u", 224 (uint_t)(mp->b_wptr - mp->b_rptr)); 225 } 226 tcp_err_ack(tcp, mp, TPROTO, 0); 227 return; 228 } 229 /* Make sure the largest address fits */ 230 mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1); 231 if (mp1 == NULL) { 232 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 233 return; 234 } 235 mp = mp1; 236 tbr = (struct T_bind_req *)mp->b_rptr; 237 238 backlog = tbr->CONIND_number; 239 len = tbr->ADDR_length; 240 241 switch (len) { 242 case 0: /* request for a generic port */ 243 tbr->ADDR_offset = sizeof (struct T_bind_req); 244 if (connp->conn_family == AF_INET) { 245 tbr->ADDR_length = sizeof (sin_t); 246 sin = (sin_t *)&tbr[1]; 247 *sin = sin_null; 248 sin->sin_family = AF_INET; 249 sa = (struct sockaddr *)sin; 250 len = sizeof (sin_t); 251 mp->b_wptr = (uchar_t *)&sin[1]; 252 } else { 253 ASSERT(connp->conn_family == AF_INET6); 254 tbr->ADDR_length = sizeof (sin6_t); 255 sin6 = (sin6_t *)&tbr[1]; 256 *sin6 = sin6_null; 257 sin6->sin6_family = AF_INET6; 258 sa = (struct sockaddr *)sin6; 259 len = sizeof (sin6_t); 260 mp->b_wptr = (uchar_t *)&sin6[1]; 261 } 262 break; 263 264 case sizeof (sin_t): /* Complete IPv4 address */ 265 sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset, 266 sizeof (sin_t)); 267 break; 268 269 case sizeof (sin6_t): /* Complete IPv6 address */ 270 sa = (struct sockaddr *)mi_offset_param(mp, 271 tbr->ADDR_offset, sizeof (sin6_t)); 272 break; 273 274 default: 275 if 
(connp->conn_debug) { 276 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 277 "tcp_tpi_bind: bad address length, %d", 278 tbr->ADDR_length); 279 } 280 tcp_err_ack(tcp, mp, TBADADDR, 0); 281 return; 282 } 283 284 if (backlog > 0) { 285 error = tcp_do_listen(connp, sa, len, backlog, DB_CRED(mp), 286 tbr->PRIM_type != O_T_BIND_REQ); 287 } else { 288 error = tcp_do_bind(connp, sa, len, DB_CRED(mp), 289 tbr->PRIM_type != O_T_BIND_REQ); 290 } 291 done: 292 if (error > 0) { 293 tcp_err_ack(tcp, mp, TSYSERR, error); 294 } else if (error < 0) { 295 tcp_err_ack(tcp, mp, -error, 0); 296 } else { 297 /* 298 * Update port information as sockfs/tpi needs it for checking 299 */ 300 if (connp->conn_family == AF_INET) { 301 sin = (sin_t *)sa; 302 sin->sin_port = connp->conn_lport; 303 } else { 304 sin6 = (sin6_t *)sa; 305 sin6->sin6_port = connp->conn_lport; 306 } 307 mp->b_datap->db_type = M_PCPROTO; 308 tbr->PRIM_type = T_BIND_ACK; 309 putnext(connp->conn_rq, mp); 310 } 311 } 312 313 /* tcp_unbind is called by tcp_wput_proto to handle T_UNBIND_REQ messages. 
*/ 314 void 315 tcp_tpi_unbind(tcp_t *tcp, mblk_t *mp) 316 { 317 conn_t *connp = tcp->tcp_connp; 318 int error; 319 320 error = tcp_do_unbind(connp); 321 if (error > 0) { 322 tcp_err_ack(tcp, mp, TSYSERR, error); 323 } else if (error < 0) { 324 tcp_err_ack(tcp, mp, -error, 0); 325 } else { 326 /* Send M_FLUSH according to TPI */ 327 (void) putnextctl1(connp->conn_rq, M_FLUSH, FLUSHRW); 328 329 mp = mi_tpi_ok_ack_alloc(mp); 330 if (mp != NULL) 331 putnext(connp->conn_rq, mp); 332 } 333 } 334 335 int 336 tcp_tpi_close(queue_t *q, int flags) 337 { 338 conn_t *connp; 339 340 ASSERT(WR(q)->q_next == NULL); 341 342 if (flags & SO_FALLBACK) { 343 /* 344 * stream is being closed while in fallback 345 * simply free the resources that were allocated 346 */ 347 inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr)); 348 qprocsoff(q); 349 goto done; 350 } 351 352 connp = Q_TO_CONN(q); 353 /* 354 * We are being closed as /dev/tcp or /dev/tcp6. 355 */ 356 tcp_close_common(connp, flags); 357 358 qprocsoff(q); 359 inet_minor_free(connp->conn_minor_arena, connp->conn_dev); 360 361 /* 362 * Drop IP's reference on the conn. This is the last reference 363 * on the connp if the state was less than established. If the 364 * connection has gone into timewait state, then we will have 365 * one ref for the TCP and one more ref (total of two) for the 366 * classifier connected hash list (a timewait connections stays 367 * in connected hash till closed). 368 * 369 * We can't assert the references because there might be other 370 * transient reference places because of some walkers or queued 371 * packets in squeue for the timewait state. 
372 */ 373 CONN_DEC_REF(connp); 374 done: 375 q->q_ptr = WR(q)->q_ptr = NULL; 376 return (0); 377 } 378 379 int 380 tcp_tpi_close_accept(queue_t *q) 381 { 382 vmem_t *minor_arena; 383 dev_t conn_dev; 384 extern struct qinit tcp_acceptor_winit; 385 386 ASSERT(WR(q)->q_qinfo == &tcp_acceptor_winit); 387 388 /* 389 * We had opened an acceptor STREAM for sockfs which is 390 * now being closed due to some error. 391 */ 392 qprocsoff(q); 393 394 minor_arena = (vmem_t *)WR(q)->q_ptr; 395 conn_dev = (dev_t)RD(q)->q_ptr; 396 ASSERT(minor_arena != NULL); 397 ASSERT(conn_dev != 0); 398 inet_minor_free(minor_arena, conn_dev); 399 q->q_ptr = WR(q)->q_ptr = NULL; 400 return (0); 401 } 402 403 /* 404 * Put a connection confirmation message upstream built from the 405 * address/flowid information with the conn and iph. Report our success or 406 * failure. 407 */ 408 boolean_t 409 tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, mblk_t *idmp, 410 mblk_t **defermp, ip_recv_attr_t *ira) 411 { 412 sin_t sin; 413 sin6_t sin6; 414 mblk_t *mp; 415 char *optp = NULL; 416 int optlen = 0; 417 conn_t *connp = tcp->tcp_connp; 418 419 if (defermp != NULL) 420 *defermp = NULL; 421 422 if (tcp->tcp_conn.tcp_opts_conn_req != NULL) { 423 /* 424 * Return in T_CONN_CON results of option negotiation through 425 * the T_CONN_REQ. Note: If there is an real end-to-end option 426 * negotiation, then what is received from remote end needs 427 * to be taken into account but there is no such thing (yet?) 428 * in our TCP/IP. 429 * Note: We do not use mi_offset_param() here as 430 * tcp_opts_conn_req contents do not directly come from 431 * an application and are either generated in kernel or 432 * from user input that was already verified. 
433 */ 434 mp = tcp->tcp_conn.tcp_opts_conn_req; 435 optp = (char *)(mp->b_rptr + 436 ((struct T_conn_req *)mp->b_rptr)->OPT_offset); 437 optlen = (int) 438 ((struct T_conn_req *)mp->b_rptr)->OPT_length; 439 } 440 441 if (IPH_HDR_VERSION(iphdr) == IPV4_VERSION) { 442 443 /* packet is IPv4 */ 444 if (connp->conn_family == AF_INET) { 445 sin = sin_null; 446 sin.sin_addr.s_addr = connp->conn_faddr_v4; 447 sin.sin_port = connp->conn_fport; 448 sin.sin_family = AF_INET; 449 mp = mi_tpi_conn_con(NULL, (char *)&sin, 450 (int)sizeof (sin_t), optp, optlen); 451 } else { 452 sin6 = sin6_null; 453 sin6.sin6_addr = connp->conn_faddr_v6; 454 sin6.sin6_port = connp->conn_fport; 455 sin6.sin6_family = AF_INET6; 456 mp = mi_tpi_conn_con(NULL, (char *)&sin6, 457 (int)sizeof (sin6_t), optp, optlen); 458 459 } 460 } else { 461 ip6_t *ip6h = (ip6_t *)iphdr; 462 463 ASSERT(IPH_HDR_VERSION(iphdr) == IPV6_VERSION); 464 ASSERT(connp->conn_family == AF_INET6); 465 sin6 = sin6_null; 466 sin6.sin6_addr = connp->conn_faddr_v6; 467 sin6.sin6_port = connp->conn_fport; 468 sin6.sin6_family = AF_INET6; 469 sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; 470 mp = mi_tpi_conn_con(NULL, (char *)&sin6, 471 (int)sizeof (sin6_t), optp, optlen); 472 } 473 474 if (!mp) 475 return (B_FALSE); 476 477 mblk_copycred(mp, idmp); 478 479 if (defermp == NULL) { 480 conn_t *connp = tcp->tcp_connp; 481 if (IPCL_IS_NONSTR(connp)) { 482 (*connp->conn_upcalls->su_connected) 483 (connp->conn_upper_handle, tcp->tcp_connid, 484 ira->ira_cred, ira->ira_cpid); 485 freemsg(mp); 486 } else { 487 if (ira->ira_cred != NULL) { 488 /* So that getpeerucred works for TPI sockfs */ 489 mblk_setcred(mp, ira->ira_cred, ira->ira_cpid); 490 } 491 putnext(connp->conn_rq, mp); 492 } 493 } else { 494 *defermp = mp; 495 } 496 497 if (tcp->tcp_conn.tcp_opts_conn_req != NULL) 498 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); 499 return (B_TRUE); 500 } 501 502 /* 503 * Successful connect request processing begins when our 
client passes 504 * a T_CONN_REQ message into tcp_wput(), which performs function calls into 505 * IP and the passes a T_OK_ACK (or T_ERROR_ACK upstream). 506 * 507 * After various error checks are completed, tcp_tpi_connect() lays 508 * the target address and port into the composite header template. 509 * Then we ask IP for information, including a source address if we didn't 510 * already have one. Finally we prepare to send the SYN packet, and then 511 * send up the T_OK_ACK reply message. 512 */ 513 void 514 tcp_tpi_connect(tcp_t *tcp, mblk_t *mp) 515 { 516 sin_t *sin; 517 struct T_conn_req *tcr; 518 struct sockaddr *sa; 519 socklen_t len; 520 int error; 521 cred_t *cr; 522 pid_t cpid; 523 conn_t *connp = tcp->tcp_connp; 524 queue_t *q = connp->conn_wq; 525 526 /* 527 * All Solaris components should pass a db_credp 528 * for this TPI message, hence we ASSERT. 529 * But in case there is some other M_PROTO that looks 530 * like a TPI message sent by some other kernel 531 * component, we check and return an error. 532 */ 533 cr = msg_getcred(mp, &cpid); 534 ASSERT(cr != NULL); 535 if (cr == NULL) { 536 tcp_err_ack(tcp, mp, TSYSERR, EINVAL); 537 return; 538 } 539 540 tcr = (struct T_conn_req *)mp->b_rptr; 541 542 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); 543 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) { 544 tcp_err_ack(tcp, mp, TPROTO, 0); 545 return; 546 } 547 548 /* 549 * Pre-allocate the T_ordrel_ind mblk so that at close time, we 550 * will always have that to send up. Otherwise, we need to do 551 * special handling in case the allocation fails at that time. 552 * If the end point is TPI, the tcp_t can be reused and the 553 * tcp_ordrel_mp may be allocated already. 
554 */ 555 if (tcp->tcp_ordrel_mp == NULL) { 556 if ((tcp->tcp_ordrel_mp = mi_tpi_ordrel_ind()) == NULL) { 557 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 558 return; 559 } 560 } 561 562 /* 563 * Determine packet type based on type of address passed in 564 * the request should contain an IPv4 or IPv6 address. 565 * Make sure that address family matches the type of 566 * family of the address passed down. 567 */ 568 switch (tcr->DEST_length) { 569 default: 570 tcp_err_ack(tcp, mp, TBADADDR, 0); 571 return; 572 573 case (sizeof (sin_t) - sizeof (sin->sin_zero)): { 574 /* 575 * XXX: The check for valid DEST_length was not there 576 * in earlier releases and some buggy 577 * TLI apps (e.g Sybase) got away with not feeding 578 * in sin_zero part of address. 579 * We allow that bug to keep those buggy apps humming. 580 * Test suites require the check on DEST_length. 581 * We construct a new mblk with valid DEST_length 582 * free the original so the rest of the code does 583 * not have to keep track of this special shorter 584 * length address case. 
585 */ 586 mblk_t *nmp; 587 struct T_conn_req *ntcr; 588 sin_t *nsin; 589 590 nmp = allocb(sizeof (struct T_conn_req) + sizeof (sin_t) + 591 tcr->OPT_length, BPRI_HI); 592 if (nmp == NULL) { 593 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 594 return; 595 } 596 ntcr = (struct T_conn_req *)nmp->b_rptr; 597 bzero(ntcr, sizeof (struct T_conn_req)); /* zero fill */ 598 ntcr->PRIM_type = T_CONN_REQ; 599 ntcr->DEST_length = sizeof (sin_t); 600 ntcr->DEST_offset = sizeof (struct T_conn_req); 601 602 nsin = (sin_t *)((uchar_t *)ntcr + ntcr->DEST_offset); 603 *nsin = sin_null; 604 /* Get pointer to shorter address to copy from original mp */ 605 sin = (sin_t *)mi_offset_param(mp, tcr->DEST_offset, 606 tcr->DEST_length); /* extract DEST_length worth of sin_t */ 607 if (sin == NULL || !OK_32PTR((char *)sin)) { 608 freemsg(nmp); 609 tcp_err_ack(tcp, mp, TSYSERR, EINVAL); 610 return; 611 } 612 nsin->sin_family = sin->sin_family; 613 nsin->sin_port = sin->sin_port; 614 nsin->sin_addr = sin->sin_addr; 615 /* Note:nsin->sin_zero zero-fill with sin_null assign above */ 616 nmp->b_wptr = (uchar_t *)&nsin[1]; 617 if (tcr->OPT_length != 0) { 618 ntcr->OPT_length = tcr->OPT_length; 619 ntcr->OPT_offset = nmp->b_wptr - nmp->b_rptr; 620 bcopy((uchar_t *)tcr + tcr->OPT_offset, 621 (uchar_t *)ntcr + ntcr->OPT_offset, 622 tcr->OPT_length); 623 nmp->b_wptr += tcr->OPT_length; 624 } 625 freemsg(mp); /* original mp freed */ 626 mp = nmp; /* re-initialize original variables */ 627 tcr = ntcr; 628 } 629 /* FALLTHRU */ 630 631 case sizeof (sin_t): 632 sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset, 633 sizeof (sin_t)); 634 len = sizeof (sin_t); 635 break; 636 637 case sizeof (sin6_t): 638 sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset, 639 sizeof (sin6_t)); 640 len = sizeof (sin6_t); 641 break; 642 } 643 644 error = proto_verify_ip_addr(connp->conn_family, sa, len); 645 if (error != 0) { 646 tcp_err_ack(tcp, mp, TSYSERR, error); 647 return; 648 } 649 650 /* 651 * TODO: If 
someone in TCPS_TIME_WAIT has this dst/port we 652 * should key on their sequence number and cut them loose. 653 */ 654 655 /* 656 * If options passed in, feed it for verification and handling 657 */ 658 if (tcr->OPT_length != 0) { 659 mblk_t *ok_mp; 660 mblk_t *discon_mp; 661 mblk_t *conn_opts_mp; 662 int t_error, sys_error, do_disconnect; 663 664 conn_opts_mp = NULL; 665 666 if (tcp_conprim_opt_process(tcp, mp, 667 &do_disconnect, &t_error, &sys_error) < 0) { 668 if (do_disconnect) { 669 ASSERT(t_error == 0 && sys_error == 0); 670 discon_mp = mi_tpi_discon_ind(NULL, 671 ECONNREFUSED, 0); 672 if (!discon_mp) { 673 tcp_err_ack_prim(tcp, mp, T_CONN_REQ, 674 TSYSERR, ENOMEM); 675 return; 676 } 677 ok_mp = mi_tpi_ok_ack_alloc(mp); 678 if (!ok_mp) { 679 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ, 680 TSYSERR, ENOMEM); 681 return; 682 } 683 qreply(q, ok_mp); 684 qreply(q, discon_mp); /* no flush! */ 685 } else { 686 ASSERT(t_error != 0); 687 tcp_err_ack_prim(tcp, mp, T_CONN_REQ, t_error, 688 sys_error); 689 } 690 return; 691 } 692 /* 693 * Success in setting options, the mp option buffer represented 694 * by OPT_length/offset has been potentially modified and 695 * contains results of option processing. We copy it in 696 * another mp to save it for potentially influencing returning 697 * it in T_CONN_CONN. 698 */ 699 if (tcr->OPT_length != 0) { /* there are resulting options */ 700 conn_opts_mp = copyb(mp); 701 if (!conn_opts_mp) { 702 tcp_err_ack_prim(tcp, mp, T_CONN_REQ, 703 TSYSERR, ENOMEM); 704 return; 705 } 706 ASSERT(tcp->tcp_conn.tcp_opts_conn_req == NULL); 707 tcp->tcp_conn.tcp_opts_conn_req = conn_opts_mp; 708 /* 709 * Note: 710 * These resulting option negotiation can include any 711 * end-to-end negotiation options but there no such 712 * thing (yet?) in our TCP/IP. 
713 */ 714 } 715 } 716 717 /* call the non-TPI version */ 718 error = tcp_do_connect(tcp->tcp_connp, sa, len, cr, cpid); 719 if (error < 0) { 720 mp = mi_tpi_err_ack_alloc(mp, -error, 0); 721 } else if (error > 0) { 722 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, error); 723 } else { 724 mp = mi_tpi_ok_ack_alloc(mp); 725 } 726 727 /* 728 * Note: Code below is the "failure" case 729 */ 730 /* return error ack and blow away saved option results if any */ 731 connect_failed: 732 if (mp != NULL) 733 putnext(connp->conn_rq, mp); 734 else { 735 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ, 736 TSYSERR, ENOMEM); 737 } 738 } 739 740 /* Return the TPI/TLI equivalent of our current tcp_state */ 741 static int 742 tcp_tpistate(tcp_t *tcp) 743 { 744 switch (tcp->tcp_state) { 745 case TCPS_IDLE: 746 return (TS_UNBND); 747 case TCPS_LISTEN: 748 /* 749 * Return whether there are outstanding T_CONN_IND waiting 750 * for the matching T_CONN_RES. Therefore don't count q0. 751 */ 752 if (tcp->tcp_conn_req_cnt_q > 0) 753 return (TS_WRES_CIND); 754 else 755 return (TS_IDLE); 756 case TCPS_BOUND: 757 return (TS_IDLE); 758 case TCPS_SYN_SENT: 759 return (TS_WCON_CREQ); 760 case TCPS_SYN_RCVD: 761 /* 762 * Note: assumption: this has to the active open SYN_RCVD. 763 * The passive instance is detached in SYN_RCVD stage of 764 * incoming connection processing so we cannot get request 765 * for T_info_ack on it. 766 */ 767 return (TS_WACK_CRES); 768 case TCPS_ESTABLISHED: 769 return (TS_DATA_XFER); 770 case TCPS_CLOSE_WAIT: 771 return (TS_WREQ_ORDREL); 772 case TCPS_FIN_WAIT_1: 773 return (TS_WIND_ORDREL); 774 case TCPS_FIN_WAIT_2: 775 return (TS_WIND_ORDREL); 776 777 case TCPS_CLOSING: 778 case TCPS_LAST_ACK: 779 case TCPS_TIME_WAIT: 780 case TCPS_CLOSED: 781 /* 782 * Following TS_WACK_DREQ7 is a rendition of "not 783 * yet TS_IDLE" TPI state. 
There is no best match to any 784 * TPI state for TCPS_{CLOSING, LAST_ACK, TIME_WAIT} but we 785 * choose a value chosen that will map to TLI/XTI level 786 * state of TSTATECHNG (state is process of changing) which 787 * captures what this dummy state represents. 788 */ 789 return (TS_WACK_DREQ7); 790 default: 791 cmn_err(CE_WARN, "tcp_tpistate: strange state (%d) %s", 792 tcp->tcp_state, tcp_display(tcp, NULL, 793 DISP_PORT_ONLY)); 794 return (TS_UNBND); 795 } 796 } 797 798 static void 799 tcp_copy_info(struct T_info_ack *tia, tcp_t *tcp) 800 { 801 tcp_stack_t *tcps = tcp->tcp_tcps; 802 conn_t *connp = tcp->tcp_connp; 803 extern struct T_info_ack tcp_g_t_info_ack; 804 extern struct T_info_ack tcp_g_t_info_ack_v6; 805 806 if (connp->conn_family == AF_INET6) 807 *tia = tcp_g_t_info_ack_v6; 808 else 809 *tia = tcp_g_t_info_ack; 810 tia->CURRENT_state = tcp_tpistate(tcp); 811 tia->OPT_size = tcp_max_optsize; 812 if (tcp->tcp_mss == 0) { 813 /* Not yet set - tcp_open does not set mss */ 814 if (connp->conn_ipversion == IPV4_VERSION) 815 tia->TIDU_size = tcps->tcps_mss_def_ipv4; 816 else 817 tia->TIDU_size = tcps->tcps_mss_def_ipv6; 818 } else { 819 tia->TIDU_size = tcp->tcp_mss; 820 } 821 /* TODO: Default ETSDU is 1. Is that correct for tcp? */ 822 } 823 824 void 825 tcp_do_capability_ack(tcp_t *tcp, struct T_capability_ack *tcap, 826 t_uscalar_t cap_bits1) 827 { 828 tcap->CAP_bits1 = 0; 829 830 if (cap_bits1 & TC1_INFO) { 831 tcp_copy_info(&tcap->INFO_ack, tcp); 832 tcap->CAP_bits1 |= TC1_INFO; 833 } 834 835 if (cap_bits1 & TC1_ACCEPTOR_ID) { 836 tcap->ACCEPTOR_id = tcp->tcp_acceptor_id; 837 tcap->CAP_bits1 |= TC1_ACCEPTOR_ID; 838 } 839 840 } 841 842 /* 843 * This routine responds to T_CAPABILITY_REQ messages. It is called by 844 * tcp_wput. Much of the T_CAPABILITY_ACK information is copied from 845 * tcp_g_t_info_ack. The current state of the stream is copied from 846 * tcp_state. 
847 */ 848 void 849 tcp_capability_req(tcp_t *tcp, mblk_t *mp) 850 { 851 t_uscalar_t cap_bits1; 852 struct T_capability_ack *tcap; 853 854 if (MBLKL(mp) < sizeof (struct T_capability_req)) { 855 freemsg(mp); 856 return; 857 } 858 859 cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1; 860 861 mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack), 862 mp->b_datap->db_type, T_CAPABILITY_ACK); 863 if (mp == NULL) 864 return; 865 866 tcap = (struct T_capability_ack *)mp->b_rptr; 867 tcp_do_capability_ack(tcp, tcap, cap_bits1); 868 869 putnext(tcp->tcp_connp->conn_rq, mp); 870 } 871 872 /* 873 * This routine responds to T_INFO_REQ messages. It is called by tcp_wput. 874 * Most of the T_INFO_ACK information is copied from tcp_g_t_info_ack. 875 * The current state of the stream is copied from tcp_state. 876 */ 877 void 878 tcp_info_req(tcp_t *tcp, mblk_t *mp) 879 { 880 mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO, 881 T_INFO_ACK); 882 if (!mp) { 883 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 884 return; 885 } 886 tcp_copy_info((struct T_info_ack *)mp->b_rptr, tcp); 887 putnext(tcp->tcp_connp->conn_rq, mp); 888 } 889 890 /* Respond to the TPI addr request */ 891 void 892 tcp_addr_req(tcp_t *tcp, mblk_t *mp) 893 { 894 struct sockaddr *sa; 895 mblk_t *ackmp; 896 struct T_addr_ack *taa; 897 conn_t *connp = tcp->tcp_connp; 898 uint_t addrlen; 899 900 /* Make it large enough for worst case */ 901 ackmp = reallocb(mp, sizeof (struct T_addr_ack) + 902 2 * sizeof (sin6_t), 1); 903 if (ackmp == NULL) { 904 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 905 return; 906 } 907 908 taa = (struct T_addr_ack *)ackmp->b_rptr; 909 910 bzero(taa, sizeof (struct T_addr_ack)); 911 ackmp->b_wptr = (uchar_t *)&taa[1]; 912 913 taa->PRIM_type = T_ADDR_ACK; 914 ackmp->b_datap->db_type = M_PCPROTO; 915 916 if (connp->conn_family == AF_INET) 917 addrlen = sizeof (sin_t); 918 else 919 addrlen = sizeof (sin6_t); 920 921 /* 922 * Note: Following code assumes 32 bit alignment of basic 
923 * data structures like sin_t and struct T_addr_ack. 924 */ 925 if (tcp->tcp_state >= TCPS_BOUND) { 926 /* 927 * Fill in local address first 928 */ 929 taa->LOCADDR_offset = sizeof (*taa); 930 taa->LOCADDR_length = addrlen; 931 sa = (struct sockaddr *)&taa[1]; 932 (void) conn_getsockname(connp, sa, &addrlen); 933 ackmp->b_wptr += addrlen; 934 } 935 if (tcp->tcp_state >= TCPS_SYN_RCVD) { 936 /* 937 * Fill in Remote address 938 */ 939 taa->REMADDR_length = addrlen; 940 /* assumed 32-bit alignment */ 941 taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length; 942 sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset); 943 (void) conn_getpeername(connp, sa, &addrlen); 944 ackmp->b_wptr += addrlen; 945 } 946 ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim); 947 putnext(tcp->tcp_connp->conn_rq, ackmp); 948 } 949 950 /* 951 * Swap information between the eager and acceptor for a TLI/XTI client. 952 * The sockfs accept is done on the acceptor stream and control goes 953 * through tcp_tli_accept() and tcp_accept()/tcp_accept_swap() is not 954 * called. In either case, both the eager and listener are in their own 955 * perimeter (squeue) and the code has to deal with potential race. 956 * 957 * See the block comment on top of tcp_accept() and tcp_tli_accept(). 958 */ 959 static void 960 tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager) 961 { 962 conn_t *econnp, *aconnp; 963 964 ASSERT(eager->tcp_connp->conn_rq == listener->tcp_connp->conn_rq); 965 ASSERT(eager->tcp_detached && !acceptor->tcp_detached); 966 ASSERT(!TCP_IS_SOCKET(acceptor)); 967 ASSERT(!TCP_IS_SOCKET(eager)); 968 ASSERT(!TCP_IS_SOCKET(listener)); 969 970 /* 971 * Trusted Extensions may need to use a security label that is 972 * different from the acceptor's label on MLP and MAC-Exempt 973 * sockets. If this is the case, the required security label 974 * already exists in econnp->conn_ixa->ixa_tsl. 
Since we make the 975 * acceptor stream refer to econnp we atomatically get that label. 976 */ 977 978 acceptor->tcp_detached = B_TRUE; 979 /* 980 * To permit stream re-use by TLI/XTI, the eager needs a copy of 981 * the acceptor id. 982 */ 983 eager->tcp_acceptor_id = acceptor->tcp_acceptor_id; 984 985 /* remove eager from listen list... */ 986 mutex_enter(&listener->tcp_eager_lock); 987 tcp_eager_unlink(eager); 988 ASSERT(eager->tcp_eager_next_q == NULL && 989 eager->tcp_eager_last_q == NULL); 990 ASSERT(eager->tcp_eager_next_q0 == NULL && 991 eager->tcp_eager_prev_q0 == NULL); 992 mutex_exit(&listener->tcp_eager_lock); 993 994 econnp = eager->tcp_connp; 995 aconnp = acceptor->tcp_connp; 996 econnp->conn_rq = aconnp->conn_rq; 997 econnp->conn_wq = aconnp->conn_wq; 998 econnp->conn_rq->q_ptr = econnp; 999 econnp->conn_wq->q_ptr = econnp; 1000 1001 /* 1002 * In the TLI/XTI loopback case, we are inside the listener's squeue, 1003 * which might be a different squeue from our peer TCP instance. 1004 * For TCP Fusion, the peer expects that whenever tcp_detached is 1005 * clear, our TCP queues point to the acceptor's queues. Thus, use 1006 * membar_producer() to ensure that the assignments of conn_rq/conn_wq 1007 * above reach global visibility prior to the clearing of tcp_detached. 
1008 */ 1009 membar_producer(); 1010 eager->tcp_detached = B_FALSE; 1011 1012 ASSERT(eager->tcp_ack_tid == 0); 1013 1014 econnp->conn_dev = aconnp->conn_dev; 1015 econnp->conn_minor_arena = aconnp->conn_minor_arena; 1016 1017 ASSERT(econnp->conn_minor_arena != NULL); 1018 if (econnp->conn_cred != NULL) 1019 crfree(econnp->conn_cred); 1020 econnp->conn_cred = aconnp->conn_cred; 1021 ASSERT(!(econnp->conn_ixa->ixa_free_flags & IXA_FREE_CRED)); 1022 econnp->conn_ixa->ixa_cred = econnp->conn_cred; 1023 aconnp->conn_cred = NULL; 1024 econnp->conn_cpid = aconnp->conn_cpid; 1025 ASSERT(econnp->conn_netstack == aconnp->conn_netstack); 1026 ASSERT(eager->tcp_tcps == acceptor->tcp_tcps); 1027 1028 econnp->conn_zoneid = aconnp->conn_zoneid; 1029 econnp->conn_allzones = aconnp->conn_allzones; 1030 econnp->conn_ixa->ixa_zoneid = aconnp->conn_ixa->ixa_zoneid; 1031 1032 econnp->conn_mac_mode = aconnp->conn_mac_mode; 1033 econnp->conn_zone_is_global = aconnp->conn_zone_is_global; 1034 aconnp->conn_mac_mode = CONN_MAC_DEFAULT; 1035 1036 /* Do the IPC initialization */ 1037 CONN_INC_REF(econnp); 1038 1039 /* Done with old IPC. Drop its ref on its connp */ 1040 CONN_DEC_REF(aconnp); 1041 } 1042 1043 /* 1044 * This runs at the tail end of accept processing on the squeue of the 1045 * new connection. 1046 */ 1047 /* ARGSUSED */ 1048 static void 1049 tcp_accept_finish(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) 1050 { 1051 conn_t *connp = (conn_t *)arg; 1052 tcp_t *tcp = connp->conn_tcp; 1053 queue_t *q = connp->conn_rq; 1054 tcp_stack_t *tcps = tcp->tcp_tcps; 1055 struct stroptions *stropt; 1056 struct sock_proto_props sopp; 1057 1058 /* Should never be called for non-STREAMS sockets */ 1059 ASSERT(!IPCL_IS_NONSTR(connp)); 1060 1061 /* We should just receive a single mblk that fits a T_discon_ind */ 1062 ASSERT(mp->b_cont == NULL); 1063 1064 /* 1065 * Drop the eager's ref on the listener, that was placed when 1066 * this eager began life in tcp_input_listener. 
1067 */ 1068 CONN_DEC_REF(tcp->tcp_saved_listener->tcp_connp); 1069 1070 tcp->tcp_detached = B_FALSE; 1071 1072 if (tcp->tcp_state <= TCPS_BOUND || tcp->tcp_accept_error) { 1073 /* 1074 * Someone blewoff the eager before we could finish 1075 * the accept. 1076 * 1077 * The only reason eager exists it because we put in 1078 * a ref on it when conn ind went up. We need to send 1079 * a disconnect indication up while the last reference 1080 * on the eager will be dropped by the squeue when we 1081 * return. 1082 */ 1083 ASSERT(tcp->tcp_listener == NULL); 1084 if (tcp->tcp_issocket || tcp->tcp_send_discon_ind) { 1085 struct T_discon_ind *tdi; 1086 1087 (void) putnextctl1(q, M_FLUSH, FLUSHRW); 1088 /* 1089 * Let us reuse the incoming mblk to avoid 1090 * memory allocation failure problems. We know 1091 * that the size of the incoming mblk i.e. 1092 * stroptions is greater than sizeof 1093 * T_discon_ind. 1094 */ 1095 ASSERT(DB_REF(mp) == 1); 1096 ASSERT(MBLKSIZE(mp) >= 1097 sizeof (struct T_discon_ind)); 1098 1099 DB_TYPE(mp) = M_PROTO; 1100 ((union T_primitives *)mp->b_rptr)->type = 1101 T_DISCON_IND; 1102 tdi = (struct T_discon_ind *)mp->b_rptr; 1103 if (tcp->tcp_issocket) { 1104 tdi->DISCON_reason = ECONNREFUSED; 1105 tdi->SEQ_number = 0; 1106 } else { 1107 tdi->DISCON_reason = ENOPROTOOPT; 1108 tdi->SEQ_number = 1109 tcp->tcp_conn_req_seqnum; 1110 } 1111 mp->b_wptr = mp->b_rptr + 1112 sizeof (struct T_discon_ind); 1113 putnext(q, mp); 1114 } 1115 tcp->tcp_hard_binding = B_FALSE; 1116 return; 1117 } 1118 1119 /* 1120 * This is the first time we run on the correct 1121 * queue after tcp_accept. So fix all the q parameters 1122 * here. 1123 * 1124 * Let us reuse the incoming mblk to avoid 1125 * memory allocation failure problems. 
We know 1126 * that the size of the incoming mblk is at least 1127 * stroptions 1128 */ 1129 tcp_get_proto_props(tcp, &sopp); 1130 1131 ASSERT(DB_REF(mp) == 1); 1132 ASSERT(MBLKSIZE(mp) >= sizeof (struct stroptions)); 1133 1134 DB_TYPE(mp) = M_SETOPTS; 1135 stropt = (struct stroptions *)mp->b_rptr; 1136 mp->b_wptr = mp->b_rptr + sizeof (struct stroptions); 1137 stropt = (struct stroptions *)mp->b_rptr; 1138 ASSERT(sopp.sopp_flags & (SO_HIWAT|SO_WROFF|SO_MAXBLK)); 1139 stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK; 1140 stropt->so_hiwat = sopp.sopp_rxhiwat; 1141 stropt->so_wroff = sopp.sopp_wroff; 1142 stropt->so_maxblk = sopp.sopp_maxblk; 1143 1144 if (sopp.sopp_flags & SOCKOPT_TAIL) { 1145 ASSERT(tcp->tcp_kssl_ctx != NULL); 1146 1147 stropt->so_flags |= SO_TAIL | SO_COPYOPT; 1148 stropt->so_tail = sopp.sopp_tail; 1149 stropt->so_copyopt = sopp.sopp_zcopyflag; 1150 } 1151 1152 /* Send the options up */ 1153 putnext(q, mp); 1154 1155 /* 1156 * Pass up any data and/or a fin that has been received. 1157 * 1158 * Adjust receive window in case it had decreased 1159 * (because there is data <=> tcp_rcv_list != NULL) 1160 * while the connection was detached. Note that 1161 * in case the eager was flow-controlled, w/o this 1162 * code, the rwnd may never open up again! 1163 */ 1164 if (tcp->tcp_rcv_list != NULL) { 1165 /* We drain directly in case of fused tcp loopback */ 1166 1167 if (!tcp->tcp_fused && canputnext(q)) { 1168 tcp->tcp_rwnd = connp->conn_rcvbuf; 1169 if (tcp->tcp_state >= TCPS_ESTABLISHED && 1170 tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) { 1171 tcp_xmit_ctl(NULL, 1172 tcp, (tcp->tcp_swnd == 0) ? 1173 tcp->tcp_suna : tcp->tcp_snxt, 1174 tcp->tcp_rnxt, TH_ACK); 1175 } 1176 } 1177 1178 (void) tcp_rcv_drain(tcp); 1179 1180 /* 1181 * For fused tcp loopback, back-enable peer endpoint 1182 * if it's currently flow-controlled. 
1183 */ 1184 if (tcp->tcp_fused) { 1185 tcp_t *peer_tcp = tcp->tcp_loopback_peer; 1186 1187 ASSERT(peer_tcp != NULL); 1188 ASSERT(peer_tcp->tcp_fused); 1189 1190 mutex_enter(&peer_tcp->tcp_non_sq_lock); 1191 if (peer_tcp->tcp_flow_stopped) { 1192 tcp_clrqfull(peer_tcp); 1193 TCP_STAT(tcps, tcp_fusion_backenabled); 1194 } 1195 mutex_exit(&peer_tcp->tcp_non_sq_lock); 1196 } 1197 } 1198 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg); 1199 if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) { 1200 tcp->tcp_ordrel_done = B_TRUE; 1201 mp = tcp->tcp_ordrel_mp; 1202 tcp->tcp_ordrel_mp = NULL; 1203 putnext(q, mp); 1204 } 1205 tcp->tcp_hard_binding = B_FALSE; 1206 1207 if (connp->conn_keepalive) { 1208 tcp->tcp_ka_last_intrvl = 0; 1209 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer, 1210 tcp->tcp_ka_interval); 1211 } 1212 1213 /* 1214 * At this point, eager is fully established and will 1215 * have the following references - 1216 * 1217 * 2 references for connection to exist (1 for TCP and 1 for IP). 1218 * 1 reference for the squeue which will be dropped by the squeue as 1219 * soon as this function returns. 1220 * There will be 1 additonal reference for being in classifier 1221 * hash list provided something bad hasn't happened. 1222 */ 1223 ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) || 1224 (connp->conn_fanout == NULL && connp->conn_ref >= 3)); 1225 } 1226 1227 1228 /* 1229 * Reply to a clients T_CONN_RES TPI message. This function 1230 * is used only for TLI/XTI listener. Sockfs sends T_CONN_RES 1231 * on the acceptor STREAM and processed in tcp_accept_common(). 1232 * Read the block comment on top of tcp_input_listener(). 
 */
void
tcp_tli_accept(tcp_t *listener, mblk_t *mp)
{
	tcp_t	*acceptor;
	tcp_t	*eager;
	tcp_t   *tcp;
	struct T_conn_res	*tcr;
	t_uscalar_t	acceptor_id;
	t_scalar_t	seqnum;
	mblk_t	*discon_mp = NULL;
	mblk_t	*ok_mp;
	mblk_t	*mp1;
	tcp_stack_t	*tcps = listener->tcp_tcps;
	conn_t *econnp;

	/* Reject a T_CONN_RES too short to contain the structure. */
	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) {
		tcp_err_ack(listener, mp, TPROTO, 0);
		return;
	}
	tcr = (struct T_conn_res *)mp->b_rptr;

	/*
	 * Under ILP32 the stream head points tcr->ACCEPTOR_id at the
	 * read side queue of the streams device underneath us i.e. the
	 * read side queue of 'ip'. Since we can't dereference QUEUE_ptr we
	 * look it up in the queue_hash. Under LP64 it sends down the
	 * minor_t of the accepting endpoint.
	 *
	 * Once the acceptor/eager are modified (in tcp_accept_swap) the
	 * fanout hash lock is held.
	 * This prevents any thread from entering the acceptor queue from
	 * below (since it has not been hard bound yet i.e. any inbound
	 * packets will arrive on the listener conn_t and
	 * go through the classifier).
	 * The CONN_INC_REF will prevent the acceptor from closing.
	 *
	 * XXX It is still possible for a tli application to send down data
	 * on the accepting stream while another thread calls t_accept.
	 * This should not be a problem for well-behaved applications since
	 * the T_OK_ACK is sent after the queue swapping is completed.
	 *
	 * If the accepting fd is the same as the listening fd, avoid
	 * queue hash lookup since that will return an eager listener in an
	 * already established state.
	 */
	acceptor_id = tcr->ACCEPTOR_id;
	mutex_enter(&listener->tcp_eager_lock);
	if (listener->tcp_acceptor_id == acceptor_id) {
		/* Accepting on the listening stream itself. */
		eager = listener->tcp_eager_next_q;
		/* only count how many T_CONN_INDs so don't count q0 */
		if ((listener->tcp_conn_req_cnt_q != 1) ||
		    (eager->tcp_conn_req_seqnum != tcr->SEQ_number)) {
			mutex_exit(&listener->tcp_eager_lock);
			tcp_err_ack(listener, mp, TBADF, 0);
			return;
		}
		if (listener->tcp_conn_req_cnt_q0 != 0) {
			/* Throw away all the eagers on q0. */
			tcp_eager_cleanup(listener, 1);
		}
		if (listener->tcp_syn_defense) {
			/* Endpoint stops listening; drop SYN-defense state. */
			listener->tcp_syn_defense = B_FALSE;
			if (listener->tcp_ip_addr_cache != NULL) {
				kmem_free(listener->tcp_ip_addr_cache,
				    IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
				listener->tcp_ip_addr_cache = NULL;
			}
		}
		/*
		 * Transfer tcp_conn_req_max to the eager so that when
		 * a disconnect occurs we can revert the endpoint to the
		 * listen state.
		 */
		eager->tcp_conn_req_max = listener->tcp_conn_req_max;
		ASSERT(listener->tcp_conn_req_cnt_q0 == 0);
		/*
		 * Get a reference on the acceptor just like the
		 * tcp_acceptor_hash_lookup below.
		 */
		acceptor = listener;
		CONN_INC_REF(acceptor->tcp_connp);
	} else {
		/* Lookup returns the acceptor with a reference held. */
		acceptor = tcp_acceptor_hash_lookup(acceptor_id, tcps);
		if (acceptor == NULL) {
			if (listener->tcp_connp->conn_debug) {
				(void) strlog(TCP_MOD_ID, 0, 1,
				    SL_ERROR|SL_TRACE,
				    "tcp_accept: did not find acceptor 0x%x\n",
				    acceptor_id);
			}
			mutex_exit(&listener->tcp_eager_lock);
			tcp_err_ack(listener, mp, TPROVMISMATCH, 0);
			return;
		}
		/*
		 * Verify acceptor state. The acceptable states for an acceptor
		 * include TCPS_IDLE and TCPS_BOUND.
		 */
		switch (acceptor->tcp_state) {
		case TCPS_IDLE:
			/* FALLTHRU */
		case TCPS_BOUND:
			break;
		default:
			CONN_DEC_REF(acceptor->tcp_connp);
			mutex_exit(&listener->tcp_eager_lock);
			tcp_err_ack(listener, mp, TOUTSTATE, 0);
			return;
		}
	}

	/* The listener must be in TCPS_LISTEN */
	if (listener->tcp_state != TCPS_LISTEN) {
		CONN_DEC_REF(acceptor->tcp_connp);
		mutex_exit(&listener->tcp_eager_lock);
		tcp_err_ack(listener, mp, TOUTSTATE, 0);
		return;
	}

	/*
	 * Rendezvous with an eager connection request packet hanging off
	 * 'tcp' that has the 'seqnum' tag.  We tagged the detached open
	 * tcp structure when the connection packet arrived in
	 * tcp_input_listener().
	 */
	seqnum = tcr->SEQ_number;
	eager = listener;
	do {
		eager = eager->tcp_eager_next_q;
		if (eager == NULL) {
			CONN_DEC_REF(acceptor->tcp_connp);
			mutex_exit(&listener->tcp_eager_lock);
			tcp_err_ack(listener, mp, TBADSEQ, 0);
			return;
		}
	} while (eager->tcp_conn_req_seqnum != seqnum);
	mutex_exit(&listener->tcp_eager_lock);

	/*
	 * At this point, both acceptor and listener have 2 ref
	 * that they begin with. Acceptor has one additional ref
	 * we placed in lookup while listener has 3 additional
	 * ref for being behind the squeue (tcp_accept() is
	 * done on listener's squeue); being in classifier hash;
	 * and eager's ref on listener.
	 */
	ASSERT(listener->tcp_connp->conn_ref >= 5);
	ASSERT(acceptor->tcp_connp->conn_ref >= 3);

	/*
	 * The eager at this point is set in its own squeue and
	 * could easily have been killed (tcp_accept_finish will
	 * deal with that) because of a TH_RST so we can only
	 * ASSERT for a single ref.
	 */
	ASSERT(eager->tcp_connp->conn_ref >= 1);

	/*
	 * Pre allocate the discon_ind mblk also. tcp_accept_finish will
	 * use it if something failed.
	 */
	discon_mp = allocb(MAX(sizeof (struct T_discon_ind),
	    sizeof (struct stroptions)), BPRI_HI);
	if (discon_mp == NULL) {
		CONN_DEC_REF(acceptor->tcp_connp);
		CONN_DEC_REF(eager->tcp_connp);
		tcp_err_ack(listener, mp, TSYSERR, ENOMEM);
		return;
	}

	econnp = eager->tcp_connp;

	/* Hold a copy of mp, in case reallocb fails */
	if ((mp1 = copymsg(mp)) == NULL) {
		CONN_DEC_REF(acceptor->tcp_connp);
		CONN_DEC_REF(eager->tcp_connp);
		freemsg(discon_mp);
		tcp_err_ack(listener, mp, TSYSERR, ENOMEM);
		return;
	}

	tcr = (struct T_conn_res *)mp1->b_rptr;

	/*
	 * This is an expanded version of mi_tpi_ok_ack_alloc()
	 * which allocates a larger mblk and appends the new
	 * local address to the ok_ack.  The address is copied by
	 * soaccept() for getsockname().
	 */
	{
		int extra;

		extra = (econnp->conn_family == AF_INET) ?
		    sizeof (sin_t) : sizeof (sin6_t);

		/*
		 * Try to re-use mp, if possible.  Otherwise, allocate
		 * an mblk and return it as ok_mp.  In any case, mp
		 * is no longer usable upon return.
		 */
		if ((ok_mp = mi_tpi_ok_ack_alloc_extra(mp, extra)) == NULL) {
			CONN_DEC_REF(acceptor->tcp_connp);
			CONN_DEC_REF(eager->tcp_connp);
			freemsg(discon_mp);
			/* Original mp has been freed by now, so use mp1 */
			tcp_err_ack(listener, mp1, TSYSERR, ENOMEM);
			return;
		}

		mp = NULL;	/* We should never use mp after this point */

		switch (extra) {
		case sizeof (sin_t): {
			sin_t *sin = (sin_t *)ok_mp->b_wptr;

			ok_mp->b_wptr += extra;
			sin->sin_family = AF_INET;
			sin->sin_port = econnp->conn_lport;
			sin->sin_addr.s_addr = econnp->conn_laddr_v4;
			break;
		}
		case sizeof (sin6_t): {
			sin6_t *sin6 = (sin6_t *)ok_mp->b_wptr;

			ok_mp->b_wptr += extra;
			sin6->sin6_family = AF_INET6;
			sin6->sin6_port = econnp->conn_lport;
			sin6->sin6_addr = econnp->conn_laddr_v6;
			sin6->sin6_flowinfo = econnp->conn_flowinfo;
			if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) &&
			    (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) {
				sin6->sin6_scope_id =
				    econnp->conn_ixa->ixa_scopeid;
			} else {
				sin6->sin6_scope_id = 0;
			}
			sin6->__sin6_src_id = 0;
			break;
		}
		default:
			break;
		}
		ASSERT(ok_mp->b_wptr <= ok_mp->b_datap->db_lim);
	}

	/*
	 * If there are no options we know that the T_CONN_RES will
	 * succeed. However, we can't send the T_OK_ACK upstream until
	 * the tcp_accept_swap is done since it would be dangerous to
	 * let the application start using the new fd prior to the swap.
	 */
	tcp_accept_swap(listener, acceptor, eager);

	/*
	 * tcp_accept_swap unlinks eager from listener but does not drop
	 * the eager's reference on the listener.
	 */
	ASSERT(eager->tcp_listener == NULL);
	ASSERT(listener->tcp_connp->conn_ref >= 5);

	/*
	 * The eager is now associated with its own queue. Insert in
	 * the hash so that the connection can be reused for a future
	 * T_CONN_RES.
	 */
	tcp_acceptor_hash_insert(acceptor_id, eager);

	/*
	 * We now do the processing of options with T_CONN_RES.
	 * We delay till now since we wanted to have queue to pass to
	 * option processing routines that points back to the right
	 * instance structure which does not happen until after
	 * tcp_accept_swap().
	 *
	 * Note:
	 * The sanity of the logic here assumes that whatever options
	 * are appropriate to inherit from listener=>eager are done
	 * before this point, and whatever were to be overridden (or not)
	 * in transfer logic from eager=>acceptor in tcp_accept_swap().
	 * [ Warning: acceptor endpoint can have T_OPTMGMT_REQ done to it
	 * before its ACCEPTOR_id comes down in T_CONN_RES ]
	 * This may not be true at this point in time but can be fixed
	 * independently.  This option processing code starts with
	 * the instantiated acceptor instance and the final queue at
	 * this point.
	 */

	if (tcr->OPT_length != 0) {
		/* Options to process */
		int t_error = 0;
		int sys_error = 0;
		int do_disconnect = 0;

		if (tcp_conprim_opt_process(eager, mp1,
		    &do_disconnect, &t_error, &sys_error) < 0) {
			eager->tcp_accept_error = 1;
			if (do_disconnect) {
				/*
				 * An option failed which does not allow
				 * connection to be accepted.
				 *
				 * We allow T_CONN_RES to succeed and
				 * put a T_DISCON_IND on the eager queue.
				 */
				ASSERT(t_error == 0 && sys_error == 0);
				eager->tcp_send_discon_ind = 1;
			} else {
				ASSERT(t_error != 0);
				freemsg(ok_mp);
				/*
				 * Original mp was either freed or set
				 * to ok_mp above, so use mp1 instead.
				 */
				tcp_err_ack(listener, mp1, t_error, sys_error);
				goto finish;
			}
		}
		/*
		 * Most likely success in setting options (except if
		 * eager->tcp_send_discon_ind set).
		 * mp1 option buffer represented by OPT_length/offset
		 * potentially modified and contains results of setting
		 * options at this point
		 */
	}

	/* We no longer need mp1, since all options processing has passed */
	freemsg(mp1);

	putnext(listener->tcp_connp->conn_rq, ok_mp);

	mutex_enter(&listener->tcp_eager_lock);
	if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) {
		tcp_t	*tail;
		mblk_t	*conn_ind;

		/*
		 * This path should not be executed if listener and
		 * acceptor streams are the same.
		 */
		ASSERT(listener != acceptor);

		tcp = listener->tcp_eager_prev_q0;
		/*
		 * listener->tcp_eager_prev_q0 points to the TAIL of the
		 * deferred T_conn_ind queue. We need to get to the head of
		 * the queue in order to send up T_conn_ind the same order as
		 * how the 3WHS is completed.
		 */
		while (tcp != listener) {
			if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0)
				break;
			else
				tcp = tcp->tcp_eager_prev_q0;
		}
		ASSERT(tcp != listener);
		conn_ind = tcp->tcp_conn.tcp_eager_conn_ind;
		ASSERT(conn_ind != NULL);
		tcp->tcp_conn.tcp_eager_conn_ind = NULL;

		/* Move from q0 to q */
		ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
		listener->tcp_conn_req_cnt_q0--;
		listener->tcp_conn_req_cnt_q++;
		tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
		    tcp->tcp_eager_prev_q0;
		tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
		    tcp->tcp_eager_next_q0;
		tcp->tcp_eager_prev_q0 = NULL;
		tcp->tcp_eager_next_q0 = NULL;
		tcp->tcp_conn_def_q0 = B_FALSE;

		/* Make sure the tcp isn't in the list of droppables */
		ASSERT(tcp->tcp_eager_next_drop_q0 == NULL &&
		    tcp->tcp_eager_prev_drop_q0 == NULL);

		/*
		 * Insert at end of the queue because sockfs sends
		 * down T_CONN_RES in chronological order. Leaving
		 * the older conn indications at front of the queue
		 * helps reducing search time.
		 */
		tail = listener->tcp_eager_last_q;
		if (tail != NULL)
			tail->tcp_eager_next_q = tcp;
		else
			listener->tcp_eager_next_q = tcp;
		listener->tcp_eager_last_q = tcp;
		tcp->tcp_eager_next_q = NULL;
		mutex_exit(&listener->tcp_eager_lock);
		putnext(tcp->tcp_connp->conn_rq, conn_ind);
	} else {
		mutex_exit(&listener->tcp_eager_lock);
	}

	/*
	 * Done with the acceptor - free it
	 *
	 * Note: from this point on, no access to listener should be made
	 * as listener can be equal to acceptor.
	 */
finish:
	ASSERT(acceptor->tcp_detached);
	acceptor->tcp_connp->conn_rq = NULL;
	ASSERT(!IPCL_IS_NONSTR(acceptor->tcp_connp));
	acceptor->tcp_connp->conn_wq = NULL;
	(void) tcp_clean_death(acceptor, 0);
	CONN_DEC_REF(acceptor->tcp_connp);

	/*
	 * We pass discon_mp to tcp_accept_finish to get on the right squeue.
	 *
	 * It will update the setting for sockfs/stream head and also take
	 * care of any data that arrived before accept() was called.
	 * In case we already received a FIN then tcp_accept_finish will send up
	 * the ordrel. It will also send up a window update if the window
	 * has opened up.
	 */

	/*
	 * XXX: we currently have a problem if XTI application closes the
	 * acceptor stream in between. This problem exists in on10-gate also
	 * and is well known but nothing can be done short of major rewrite
	 * to fix it. Now it is possible to take care of it by assigning TLI/XTI
	 * eager same squeue as listener (we can distinguish non socket
	 * listeners at the time of handling a SYN in tcp_input_listener)
	 * and do most of the work that tcp_accept_finish does here itself
	 * and then get behind the acceptor squeue to access the acceptor
	 * queue.
	 */
	/*
	 * We already have a ref on tcp so no need to do one before squeue_enter
	 */
	SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, discon_mp,
	    tcp_accept_finish, eager->tcp_connp, NULL, SQ_FILL,
	    SQTAG_TCP_ACCEPT_FINISH);
}


/*
 * This is the STREAMS entry point for T_CONN_RES coming down on
 * Acceptor STREAM when sockfs listener does accept processing.
 * Read the block comment on top of tcp_input_listener().
 */
void
tcp_tpi_accept(queue_t *q, mblk_t *mp)
{
	queue_t *rq = RD(q);
	struct T_conn_res *conn_res;
	tcp_t *eager;
	tcp_t *listener;
	struct T_ok_ack *ok;
	t_scalar_t PRIM_type;
	mblk_t *discon_mp;
	conn_t *econnp;
	cred_t *cr;

	ASSERT(DB_TYPE(mp) == M_PROTO);

	/*
	 * All Solaris components should pass a db_credp
	 * for this TPI message, hence we ASSERT.
	 * But in case there is some other M_PROTO that looks
	 * like a TPI message sent by some other kernel
	 * component, we check and return an error.
	 */
	cr = msg_getcred(mp, NULL);
	ASSERT(cr != NULL);
	if (cr == NULL) {
		mp = mi_tpi_err_ack_alloc(mp, TSYSERR, EINVAL);
		if (mp != NULL)
			putnext(rq, mp);
		return;
	}
	conn_res = (struct T_conn_res *)mp->b_rptr;
	ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
	if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_res)) {
		mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0);
		if (mp != NULL)
			putnext(rq, mp);
		return;
	}
	switch (conn_res->PRIM_type) {
	case O_T_CONN_RES:
	case T_CONN_RES:
		/*
		 * We pass up an err ack if allocb fails. This will
		 * cause sockfs to issue a T_DISCON_REQ which will cause
		 * tcp_eager_blowoff to be called. sockfs will then call
		 * rq->q_qinfo->qi_qclose to cleanup the acceptor stream.
		 * we need to do the allocb up here because we have to
		 * make sure rq->q_qinfo->qi_qclose still points to the
		 * correct function (tcp_tpi_close_accept) in case allocb
		 * fails.
		 */
		/* sockfs embeds the eager's pointer in the option field */
		bcopy(mp->b_rptr + conn_res->OPT_offset,
		    &eager, conn_res->OPT_length);
		PRIM_type = conn_res->PRIM_type;
		mp->b_datap->db_type = M_PCPROTO;
		mp->b_wptr = mp->b_rptr + sizeof (struct T_ok_ack);
		ok = (struct T_ok_ack *)mp->b_rptr;
		ok->PRIM_type = T_OK_ACK;
		ok->CORRECT_prim = PRIM_type;
		econnp = eager->tcp_connp;
		econnp->conn_dev = (dev_t)RD(q)->q_ptr;
		econnp->conn_minor_arena = (vmem_t *)(WR(q)->q_ptr);
		econnp->conn_rq = rq;
		econnp->conn_wq = q;
		rq->q_ptr = econnp;
		rq->q_qinfo = &tcp_rinitv4;	/* No open - same as rinitv6 */
		q->q_ptr = econnp;
		q->q_qinfo = &tcp_winit;
		listener = eager->tcp_listener;

		/*
		 * Pre allocate the discon_ind mblk also. tcp_accept_finish will
		 * use it if something failed.
		 */
		discon_mp = allocb(MAX(sizeof (struct T_discon_ind),
		    sizeof (struct stroptions)), BPRI_HI);

		if (discon_mp == NULL) {
			mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0);
			if (mp != NULL)
				putnext(rq, mp);
			return;
		}

		eager->tcp_issocket = B_TRUE;

		ASSERT(econnp->conn_netstack ==
		    listener->tcp_connp->conn_netstack);
		ASSERT(eager->tcp_tcps == listener->tcp_tcps);

		/* Put the ref for IP */
		CONN_INC_REF(econnp);

		/*
		 * We should have minimum of 3 references on the conn
		 * at this point. One each for TCP and IP and one for
		 * the T_conn_ind that was sent up when the 3-way handshake
		 * completed. In the normal case we would also have another
		 * reference (making a total of 4) for the conn being in the
		 * classifier hash list. However the eager could have received
		 * an RST subsequently and tcp_closei_local could have removed
		 * the eager from the classifier hash list, hence we can't
		 * assert that reference.
		 */
		ASSERT(econnp->conn_ref >= 3);

		mutex_enter(&listener->tcp_eager_lock);
		if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) {

			tcp_t *tail;
			tcp_t *tcp;
			mblk_t *mp1;

			tcp = listener->tcp_eager_prev_q0;
			/*
			 * listener->tcp_eager_prev_q0 points to the TAIL of
			 * the deferred T_conn_ind queue. We need to get to
			 * the head of the queue in order to send up
			 * T_conn_ind the same order as how the 3WHS is
			 * completed.
			 */
			while (tcp != listener) {
				if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0 &&
				    !tcp->tcp_kssl_pending)
					break;
				else
					tcp = tcp->tcp_eager_prev_q0;
			}
			/* None of the pending eagers can be sent up now */
			if (tcp == listener)
				goto no_more_eagers;

			mp1 = tcp->tcp_conn.tcp_eager_conn_ind;
			tcp->tcp_conn.tcp_eager_conn_ind = NULL;
			/* Move from q0 to q */
			ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
			listener->tcp_conn_req_cnt_q0--;
			listener->tcp_conn_req_cnt_q++;
			tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
			    tcp->tcp_eager_prev_q0;
			tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
			    tcp->tcp_eager_next_q0;
			tcp->tcp_eager_prev_q0 = NULL;
			tcp->tcp_eager_next_q0 = NULL;
			tcp->tcp_conn_def_q0 = B_FALSE;

			/* Make sure the tcp isn't in the list of droppables */
			ASSERT(tcp->tcp_eager_next_drop_q0 == NULL &&
			    tcp->tcp_eager_prev_drop_q0 == NULL);

			/*
			 * Insert at end of the queue because sockfs sends
			 * down T_CONN_RES in chronological order. Leaving
			 * the older conn indications at front of the queue
			 * helps reducing search time.
			 */
			tail = listener->tcp_eager_last_q;
			if (tail != NULL) {
				tail->tcp_eager_next_q = tcp;
			} else {
				listener->tcp_eager_next_q = tcp;
			}
			listener->tcp_eager_last_q = tcp;
			tcp->tcp_eager_next_q = NULL;

			/* Need to get inside the listener perimeter */
			CONN_INC_REF(listener->tcp_connp);
			SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp, mp1,
			    tcp_send_pending, listener->tcp_connp, NULL,
			    SQ_FILL, SQTAG_TCP_SEND_PENDING);
		}
no_more_eagers:
		tcp_eager_unlink(eager);
		mutex_exit(&listener->tcp_eager_lock);

		/*
		 * At this point, the eager is detached from the listener
		 * but we still have an extra refs on eager (apart from the
		 * usual tcp references). The ref was placed in tcp_input_data
		 * before sending the conn_ind in tcp_send_conn_ind.
		 * The ref will be dropped in tcp_accept_finish().
		 */
		SQUEUE_ENTER_ONE(econnp->conn_sqp, discon_mp, tcp_accept_finish,
		    econnp, NULL, SQ_NODRAIN, SQTAG_TCP_ACCEPT_FINISH_Q0);

		/*
		 * Send the new local address also up to sockfs. There
		 * should already be enough space in the mp that came
		 * down from soaccept().
		 */
		if (econnp->conn_family == AF_INET) {
			sin_t *sin;

			ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >=
			    (sizeof (struct T_ok_ack) + sizeof (sin_t)));
			sin = (sin_t *)mp->b_wptr;
			mp->b_wptr += sizeof (sin_t);
			sin->sin_family = AF_INET;
			sin->sin_port = econnp->conn_lport;
			sin->sin_addr.s_addr = econnp->conn_laddr_v4;
		} else {
			sin6_t *sin6;

			ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >=
			    sizeof (struct T_ok_ack) + sizeof (sin6_t));
			sin6 = (sin6_t *)mp->b_wptr;
			mp->b_wptr += sizeof (sin6_t);
			sin6->sin6_family = AF_INET6;
			sin6->sin6_port = econnp->conn_lport;
			sin6->sin6_addr = econnp->conn_laddr_v6;
			if (econnp->conn_ipversion == IPV4_VERSION)
				sin6->sin6_flowinfo = 0;
			else
				sin6->sin6_flowinfo = econnp->conn_flowinfo;
			if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) &&
			    (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) {
				sin6->sin6_scope_id =
				    econnp->conn_ixa->ixa_scopeid;
			} else {
				sin6->sin6_scope_id = 0;
			}
			sin6->__sin6_src_id = 0;
		}

		putnext(rq, mp);
		return;
	default:
		mp = mi_tpi_err_ack_alloc(mp, TNOTSUPPORT, 0);
		if (mp != NULL)
			putnext(rq, mp);
		return;
	}
}

/*
 * The function called through squeue to get behind listener's perimeter to
 * send a deferred conn_ind.
 */
/* ARGSUSED */
void
tcp_send_pending(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
{
	conn_t	*lconnp = (conn_t *)arg;
	tcp_t	*listener = lconnp->conn_tcp;
	struct T_conn_ind *conn_ind;
	tcp_t *tcp;

	/* The eager's pointer rides in the T_conn_ind option field. */
	conn_ind = (struct T_conn_ind *)mp->b_rptr;
	bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp,
	    conn_ind->OPT_length);

	if (listener->tcp_state != TCPS_LISTEN) {
		/*
		 * If listener has closed, it would have caused a
		 * cleanup/blowoff to happen for the eager, so
		 * we don't need to do anything more.
		 */
		freemsg(mp);
		return;
	}

	putnext(lconnp->conn_rq, mp);
}

/*
 * Sends the T_CONN_IND to the listener. The caller calls this
 * function via squeue to get inside the listener's perimeter
 * once the 3 way hand shake is done a T_CONN_IND needs to be
 * sent. As an optimization, the caller can call this directly
 * if listener's perimeter is same as eager's.
 */
/* ARGSUSED */
void
tcp_send_conn_ind(void *arg, mblk_t *mp, void *arg2)
{
	conn_t	*lconnp = (conn_t *)arg;
	tcp_t	*listener = lconnp->conn_tcp;
	tcp_t	*tcp;
	struct T_conn_ind	*conn_ind;
	ipaddr_t 	*addr_cache;
	boolean_t	need_send_conn_ind = B_FALSE;
	tcp_stack_t	*tcps = listener->tcp_tcps;

	/* retrieve the eager */
	conn_ind = (struct T_conn_ind *)mp->b_rptr;
	ASSERT(conn_ind->OPT_offset != 0 &&
	    conn_ind->OPT_length == sizeof (intptr_t));
	bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp,
	    conn_ind->OPT_length);

	/*
	 * TLI/XTI applications will get confused by
	 * sending eager as an option since it violates
	 * the option semantics. So remove the eager as
	 * option since TLI/XTI app doesn't need it anyway.
	 */
	if (!TCP_IS_SOCKET(listener)) {
		conn_ind->OPT_length = 0;
		conn_ind->OPT_offset = 0;
	}
	if (listener->tcp_state != TCPS_LISTEN) {
		/*
		 * If listener has closed, it would have caused a
		 * cleanup/blowoff to happen for the eager. We
		 * just need to return.
		 */
		freemsg(mp);
		return;
	}


	/*
	 * if the conn_req_q is full defer passing up the
	 * T_CONN_IND until space is available after t_accept()
	 * processing
	 */
	mutex_enter(&listener->tcp_eager_lock);

	/*
	 * Take the eager out, if it is in the list of droppable eagers
	 * as we are here because the 3W handshake is over.
	 */
	MAKE_UNDROPPABLE(tcp);

	if (listener->tcp_conn_req_cnt_q < listener->tcp_conn_req_max) {
		tcp_t *tail;

		/*
		 * The eager already has an extra ref put in tcp_input_data
		 * so that it stays till accept comes back even though it
		 * might get into TCPS_CLOSED as a result of a TH_RST etc.
		 */
		ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
		listener->tcp_conn_req_cnt_q0--;
		listener->tcp_conn_req_cnt_q++;

		/* Move from SYN_RCVD to ESTABLISHED list  */
		tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
		    tcp->tcp_eager_prev_q0;
		tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
		    tcp->tcp_eager_next_q0;
		tcp->tcp_eager_prev_q0 = NULL;
		tcp->tcp_eager_next_q0 = NULL;

		/*
		 * Insert at end of the queue because sockfs
		 * sends down T_CONN_RES in chronological
		 * order. Leaving the older conn indications
		 * at front of the queue helps reducing search
		 * time.
		 */
		tail = listener->tcp_eager_last_q;
		if (tail != NULL)
			tail->tcp_eager_next_q = tcp;
		else
			listener->tcp_eager_next_q = tcp;
		listener->tcp_eager_last_q = tcp;
		tcp->tcp_eager_next_q = NULL;
		/*
		 * Delay sending up the T_conn_ind until we are
		 * done with the eager. Once we have sent up
		 * the T_conn_ind, the accept can potentially complete
		 * any time and release the refhold we have on the eager.
		 */
		need_send_conn_ind = B_TRUE;
	} else {
		/*
		 * Defer connection on q0 and set deferred
		 * connection bit true
		 */
		tcp->tcp_conn_def_q0 = B_TRUE;

		/* take tcp out of q0 ... */
		tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
		    tcp->tcp_eager_next_q0;
		tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
		    tcp->tcp_eager_prev_q0;

		/* ... and place it at the end of q0 */
		tcp->tcp_eager_prev_q0 = listener->tcp_eager_prev_q0;
		tcp->tcp_eager_next_q0 = listener;
		listener->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp;
		listener->tcp_eager_prev_q0 = tcp;
		tcp->tcp_conn.tcp_eager_conn_ind = mp;
	}

	/* we have timed out before */
	if (tcp->tcp_syn_rcvd_timeout != 0) {
		tcp->tcp_syn_rcvd_timeout = 0;
		listener->tcp_syn_rcvd_timeout--;
		if (listener->tcp_syn_defense &&
		    listener->tcp_syn_rcvd_timeout <=
		    (tcps->tcps_conn_req_max_q0 >> 5) &&
		    10*MINUTES < TICK_TO_MSEC(ddi_get_lbolt64() -
		    listener->tcp_last_rcv_lbolt)) {
			/*
			 * Turn off the defense mode if we
			 * believe the SYN attack is over.
			 */
			listener->tcp_syn_defense = B_FALSE;
			if (listener->tcp_ip_addr_cache) {
				kmem_free((void *)listener->tcp_ip_addr_cache,
				    IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
				listener->tcp_ip_addr_cache = NULL;
			}
		}
	}
	addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache);
	if (addr_cache != NULL) {
		/*
		 * We have finished a 3-way handshake with this
		 * remote host. This proves the IP addr is good.
		 * Cache it!
		 */
		addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] =
		    tcp->tcp_connp->conn_faddr_v4;
	}
	mutex_exit(&listener->tcp_eager_lock);
	if (need_send_conn_ind)
		putnext(lconnp->conn_rq, mp);
}