/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/* This file contains all TCP TLI/TPI related functions */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/stropts.h>
#include <sys/strlog.h>
#define _SUN_TPI_VERSION 2
#include <sys/tihdr.h>
#include <sys/suntpi.h>
#include <sys/xti_inet.h>
#include <sys/squeue_impl.h>
#include <sys/squeue.h>

#include <inet/common.h>
#include <inet/ip.h>
#include <inet/tcp.h>
#include <inet/tcp_impl.h>
#include <inet/proto_set.h>

static void	tcp_accept_swap(tcp_t *, tcp_t *, tcp_t *);
static int	tcp_conprim_opt_process(tcp_t *, mblk_t *, int *, int *, int *);
static void	tcp_ulp_newconn(conn_t *, conn_t *, mblk_t *);

/*
 * Switch this endpoint from direct-socket mode to pure TPI: record an
 * acceptor id (the read queue pointer on ILP32, the device number
 * otherwise), hash the endpoint under that id so a later T_CONN_RES can
 * locate it, and clear tcp_issocket.
 */
void
tcp_use_pure_tpi(tcp_t *tcp)
{
	conn_t		*connp = tcp->tcp_connp;

#ifdef	_ILP32
	tcp->tcp_acceptor_id = (t_uscalar_t)connp->conn_rq;
#else
	tcp->tcp_acceptor_id = connp->conn_dev;
#endif
	/*
	 * Insert this socket into the acceptor hash.
	 * We might need it for T_CONN_RES message
	 */
	tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp);

	tcp->tcp_issocket = B_FALSE;
	TCP_STAT(tcp->tcp_tcps, tcp_sock_fallback);
}

/* Shorthand to generate and send TPI error acks to our client */
void
tcp_err_ack(tcp_t *tcp, mblk_t *mp, int t_error, int sys_error)
{
	/* If the ack allocation fails, mp has been freed and nothing is sent */
	if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
		putnext(tcp->tcp_connp->conn_rq, mp);
}

/* Shorthand to generate and send TPI error acks to our client */
void
tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive,
    int t_error, int sys_error)
{
	struct T_error_ack	*teackp;

	if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack),
	    M_PCPROTO, T_ERROR_ACK)) != NULL) {
		teackp = (struct T_error_ack *)mp->b_rptr;
		teackp->ERROR_prim = primitive;
		teackp->TLI_error = t_error;
		teackp->UNIX_error = sys_error;
		putnext(tcp->tcp_connp->conn_rq, mp);
	}
}

/*
 * TCP routine to get the values of options.
 */
int
tcp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
{
	return (tcp_opt_get(Q_TO_CONN(q), level, name, ptr));
}

/* ARGSUSED */
int
tcp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name,
    uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
    void *thisdg_attrs, cred_t *cr)
{
	conn_t	*connp = Q_TO_CONN(q);

	return (tcp_opt_set(connp, optset_context, level, name, inlen, invalp,
	    outlenp, outvalp, thisdg_attrs, cr));
}

/*
 * Process the option buffer carried in a T_CONN_REQ/T_CONN_RES primitive.
 * On success returns 0 with *t_errorp/*sys_errorp/*do_disconnectp all zero.
 * On failure returns -1 with the TPI/errno pair filled in, and sets
 * *do_disconnectp when an absolute requirement failed and the caller must
 * ack with T_OK_ACK followed by T_DISCON_IND.
 */
static int
tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp, int *do_disconnectp,
    int *t_errorp, int *sys_errorp)
{
	int error;
	int is_absreq_failure;
	t_scalar_t *opt_lenp;
	t_scalar_t opt_offset;
	int prim_type;
	struct T_conn_req *tcreqp;
	struct T_conn_res *tcresp;
	cred_t *cr;

	/*
	 * All Solaris components should pass a db_credp
	 * for this TPI message, hence we ASSERT.
	 * But in case there is some other M_PROTO that looks
	 * like a TPI message sent by some other kernel
	 * component, we check and return an error.
	 */
	cr = msg_getcred(mp, NULL);
	ASSERT(cr != NULL);
	if (cr == NULL)
		return (-1);

	prim_type = ((union T_primitives *)mp->b_rptr)->type;
	ASSERT(prim_type == T_CONN_REQ || prim_type == O_T_CONN_RES ||
	    prim_type == T_CONN_RES);

	/*
	 * NOTE(review): no default case below; callers must only pass the
	 * three primitives ASSERTed above, otherwise opt_offset/opt_lenp
	 * would be used uninitialized on non-DEBUG kernels.
	 */
	switch (prim_type) {
	case T_CONN_REQ:
		tcreqp = (struct T_conn_req *)mp->b_rptr;
		opt_offset = tcreqp->OPT_offset;
		opt_lenp = (t_scalar_t *)&tcreqp->OPT_length;
		break;
	case O_T_CONN_RES:
	case T_CONN_RES:
		tcresp = (struct T_conn_res *)mp->b_rptr;
		opt_offset = tcresp->OPT_offset;
		opt_lenp = (t_scalar_t *)&tcresp->OPT_length;
		break;
	}

	*t_errorp = 0;
	*sys_errorp = 0;
	*do_disconnectp = 0;

	error = tpi_optcom_buf(tcp->tcp_connp->conn_wq, mp, opt_lenp,
	    opt_offset, cr, &tcp_opt_obj,
	    NULL, &is_absreq_failure);

	switch (error) {
	case 0:		/* no error */
		ASSERT(is_absreq_failure == 0);
		return (0);
	case ENOPROTOOPT:
		*t_errorp = TBADOPT;
		break;
	case EACCES:
		*t_errorp = TACCES;
		break;
	default:
		*t_errorp = TSYSERR; *sys_errorp = error;
		break;
	}
	if (is_absreq_failure != 0) {
		/*
		 * The connection request should get the local ack
		 * T_OK_ACK and then a T_DISCON_IND.
		 */
		*do_disconnectp = 1;
	}
	return (-1);
}

/*
 * Handle a T_BIND_REQ/O_T_BIND_REQ from the client: validate the request,
 * grow mp so a T_BIND_ACK with the largest address fits, then bind (and
 * listen when CONIND_number > 0).  Replies upstream with T_BIND_ACK on
 * success or an appropriate T_ERROR_ACK on failure.
 */
void
tcp_tpi_bind(tcp_t *tcp, mblk_t *mp)
{
	int	error;
	conn_t	*connp = tcp->tcp_connp;
	struct sockaddr	*sa;
	mblk_t	*mp1;
	struct T_bind_req *tbr;
	int	backlog;
	socklen_t	len;
	sin_t	*sin;
	sin6_t	*sin6;
	cred_t	*cr;

	/*
	 * All Solaris components should pass a db_credp
	 * for this TPI message, hence we ASSERT.
	 * But in case there is some other M_PROTO that looks
	 * like a TPI message sent by some other kernel
	 * component, we check and return an error.
	 */
	cr = msg_getcred(mp, NULL);
	ASSERT(cr != NULL);
	if (cr == NULL) {
		tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
		return;
	}

	ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
		if (connp->conn_debug) {
			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
			    "tcp_tpi_bind: bad req, len %u",
			    (uint_t)(mp->b_wptr - mp->b_rptr));
		}
		tcp_err_ack(tcp, mp, TPROTO, 0);
		return;
	}
	/* Make sure the largest address fits */
	mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1);
	if (mp1 == NULL) {
		tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
		return;
	}
	mp = mp1;
	tbr = (struct T_bind_req *)mp->b_rptr;

	backlog = tbr->CONIND_number;
	len = tbr->ADDR_length;

	switch (len) {
	case 0:		/* request for a generic port */
		tbr->ADDR_offset = sizeof (struct T_bind_req);
		if (connp->conn_family == AF_INET) {
			tbr->ADDR_length = sizeof (sin_t);
			sin = (sin_t *)&tbr[1];
			*sin = sin_null;
			sin->sin_family = AF_INET;
			sa = (struct sockaddr *)sin;
			len = sizeof (sin_t);
			mp->b_wptr = (uchar_t *)&sin[1];
		} else {
			ASSERT(connp->conn_family == AF_INET6);
			tbr->ADDR_length = sizeof (sin6_t);
			sin6 = (sin6_t *)&tbr[1];
			*sin6 = sin6_null;
			sin6->sin6_family = AF_INET6;
			sa = (struct sockaddr *)sin6;
			len = sizeof (sin6_t);
			mp->b_wptr = (uchar_t *)&sin6[1];
		}
		break;

	case sizeof (sin_t):	/* Complete IPv4 address */
		sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset,
		    sizeof (sin_t));
		break;

	case sizeof (sin6_t): /* Complete IPv6 address */
		sa = (struct sockaddr *)mi_offset_param(mp,
		    tbr->ADDR_offset, sizeof (sin6_t));
		break;

	default:
		if (connp->conn_debug) {
			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
			    "tcp_tpi_bind: bad address length, %d",
			    tbr->ADDR_length);
		}
		tcp_err_ack(tcp, mp, TBADADDR, 0);
		return;
	}

	if (backlog > 0) {
		error = tcp_do_listen(connp, sa, len, backlog, DB_CRED(mp),
		    tbr->PRIM_type != O_T_BIND_REQ);
	} else {
		error = tcp_do_bind(connp, sa, len, DB_CRED(mp),
		    tbr->PRIM_type != O_T_BIND_REQ);
	}
	/* NOTE(review): label reached only by fall-through; no goto uses it */
done:
	if (error > 0) {
		tcp_err_ack(tcp, mp, TSYSERR, error);
	} else if (error < 0) {
		tcp_err_ack(tcp, mp, -error, 0);
	} else {
		/*
		 * Update port information as sockfs/tpi needs it for checking
		 */
		if (connp->conn_family == AF_INET) {
			sin = (sin_t *)sa;
			sin->sin_port = connp->conn_lport;
		} else {
			sin6 = (sin6_t *)sa;
			sin6->sin6_port = connp->conn_lport;
		}
		mp->b_datap->db_type = M_PCPROTO;
		tbr->PRIM_type = T_BIND_ACK;
		putnext(connp->conn_rq, mp);
	}
}

/* tcp_unbind is called by tcp_wput_proto to handle T_UNBIND_REQ messages.
 */
void
tcp_tpi_unbind(tcp_t *tcp, mblk_t *mp)
{
	conn_t *connp = tcp->tcp_connp;
	int	error;

	error = tcp_do_unbind(connp);
	if (error > 0) {
		tcp_err_ack(tcp, mp, TSYSERR, error);
	} else if (error < 0) {
		tcp_err_ack(tcp, mp, -error, 0);
	} else {
		/* Send M_FLUSH according to TPI */
		(void) putnextctl1(connp->conn_rq, M_FLUSH, FLUSHRW);

		mp = mi_tpi_ok_ack_alloc(mp);
		if (mp != NULL)
			putnext(connp->conn_rq, mp);
	}
}

/*
 * STREAMS close routine for a /dev/tcp or /dev/tcp6 stream.  Handles the
 * SO_FALLBACK case (stream closed mid-fallback, before a conn was attached
 * to the queue) by just releasing the minor number.
 */
int
tcp_tpi_close(queue_t *q, int flags)
{
	conn_t		*connp;

	ASSERT(WR(q)->q_next == NULL);

	if (flags & SO_FALLBACK) {
		/*
		 * stream is being closed while in fallback
		 * simply free the resources that were allocated
		 */
		inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr));
		qprocsoff(q);
		goto done;
	}

	connp = Q_TO_CONN(q);
	/*
	 * We are being closed as /dev/tcp or /dev/tcp6.
	 */
	tcp_close_common(connp, flags);

	qprocsoff(q);
	inet_minor_free(connp->conn_minor_arena, connp->conn_dev);

	/*
	 * Drop IP's reference on the conn. This is the last reference
	 * on the connp if the state was less than established. If the
	 * connection has gone into timewait state, then we will have
	 * one ref for the TCP and one more ref (total of two) for the
	 * classifier connected hash list (a timewait connections stays
	 * in connected hash till closed).
	 *
	 * We can't assert the references because there might be other
	 * transient reference places because of some walkers or queued
	 * packets in squeue for the timewait state.
	 */
	CONN_DEC_REF(connp);
done:
	q->q_ptr = WR(q)->q_ptr = NULL;
	return (0);
}

/*
 * Close routine for an acceptor STREAM that was opened for sockfs but is
 * being torn down due to an error: release the minor number and detach the
 * queue pointers.
 */
int
tcp_tpi_close_accept(queue_t *q)
{
	vmem_t	*minor_arena;
	dev_t	conn_dev;
	extern struct qinit tcp_acceptor_winit;

	ASSERT(WR(q)->q_qinfo == &tcp_acceptor_winit);

	/*
	 * We had opened an acceptor STREAM for sockfs which is
	 * now being closed due to some error.
	 */
	qprocsoff(q);

	minor_arena = (vmem_t *)WR(q)->q_ptr;
	conn_dev = (dev_t)RD(q)->q_ptr;
	ASSERT(minor_arena != NULL);
	ASSERT(conn_dev != 0);
	inet_minor_free(minor_arena, conn_dev);
	q->q_ptr = WR(q)->q_ptr = NULL;
	return (0);
}

/*
 * Put a connection confirmation message upstream built from the
 * address/flowid information with the conn and iph. Report our success or
 * failure.
 */
boolean_t
tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, mblk_t *idmp,
    mblk_t **defermp, ip_recv_attr_t *ira)
{
	sin_t	sin;
	sin6_t	sin6;
	mblk_t	*mp;
	char	*optp = NULL;
	int	optlen = 0;
	conn_t	*connp = tcp->tcp_connp;

	if (defermp != NULL)
		*defermp = NULL;

	if (tcp->tcp_conn.tcp_opts_conn_req != NULL) {
		/*
		 * Return in T_CONN_CON results of option negotiation through
		 * the T_CONN_REQ. Note: If there is an real end-to-end option
		 * negotiation, then what is received from remote end needs
		 * to be taken into account but there is no such thing (yet?)
		 * in our TCP/IP.
		 * Note: We do not use mi_offset_param() here as
		 * tcp_opts_conn_req contents do not directly come from
		 * an application and are either generated in kernel or
		 * from user input that was already verified.
		 */
		mp = tcp->tcp_conn.tcp_opts_conn_req;
		optp = (char *)(mp->b_rptr +
		    ((struct T_conn_req *)mp->b_rptr)->OPT_offset);
		optlen = (int)
		    ((struct T_conn_req *)mp->b_rptr)->OPT_length;
	}

	if (IPH_HDR_VERSION(iphdr) == IPV4_VERSION) {

		/* packet is IPv4 */
		if (connp->conn_family == AF_INET) {
			sin = sin_null;
			sin.sin_addr.s_addr = connp->conn_faddr_v4;
			sin.sin_port = connp->conn_fport;
			sin.sin_family = AF_INET;
			mp = mi_tpi_conn_con(NULL, (char *)&sin,
			    (int)sizeof (sin_t), optp, optlen);
		} else {
			sin6 = sin6_null;
			sin6.sin6_addr = connp->conn_faddr_v6;
			sin6.sin6_port = connp->conn_fport;
			sin6.sin6_family = AF_INET6;
			mp = mi_tpi_conn_con(NULL, (char *)&sin6,
			    (int)sizeof (sin6_t), optp, optlen);

		}
	} else {
		ip6_t	*ip6h = (ip6_t *)iphdr;

		ASSERT(IPH_HDR_VERSION(iphdr) == IPV6_VERSION);
		ASSERT(connp->conn_family == AF_INET6);
		sin6 = sin6_null;
		sin6.sin6_addr = connp->conn_faddr_v6;
		sin6.sin6_port = connp->conn_fport;
		sin6.sin6_family = AF_INET6;
		sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
		mp = mi_tpi_conn_con(NULL, (char *)&sin6,
		    (int)sizeof (sin6_t), optp, optlen);
	}

	if (!mp)
		return (B_FALSE);

	mblk_copycred(mp, idmp);

	if (defermp == NULL) {
		/*
		 * NOTE(review): this declaration shadows the outer connp with
		 * the same value; harmless, but the inner one is redundant.
		 */
		conn_t *connp = tcp->tcp_connp;
		if (IPCL_IS_NONSTR(connp)) {
			(*connp->conn_upcalls->su_connected)
			    (connp->conn_upper_handle, tcp->tcp_connid,
			    ira->ira_cred, ira->ira_cpid);
			freemsg(mp);
		} else {
			if (ira->ira_cred != NULL) {
				/* So that getpeerucred works for TPI sockfs */
				mblk_setcred(mp, ira->ira_cred, ira->ira_cpid);
			}
			putnext(connp->conn_rq, mp);
		}
	} else {
		*defermp = mp;
	}

	if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
		tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
	return (B_TRUE);
}

/*
 * Successful connect request processing begins when our
 client passes
 * a T_CONN_REQ message into tcp_wput(), which performs function calls into
 * IP and the passes a T_OK_ACK (or T_ERROR_ACK upstream).
 *
 * After various error checks are completed, tcp_tpi_connect() lays
 * the target address and port into the composite header template.
 * Then we ask IP for information, including a source address if we didn't
 * already have one. Finally we prepare to send the SYN packet, and then
 * send up the T_OK_ACK reply message.
 */
void
tcp_tpi_connect(tcp_t *tcp, mblk_t *mp)
{
	sin_t		*sin;
	struct T_conn_req	*tcr;
	struct sockaddr	*sa;
	socklen_t	len;
	int		error;
	cred_t		*cr;
	pid_t		cpid;
	conn_t		*connp = tcp->tcp_connp;
	queue_t		*q = connp->conn_wq;

	/*
	 * All Solaris components should pass a db_credp
	 * for this TPI message, hence we ASSERT.
	 * But in case there is some other M_PROTO that looks
	 * like a TPI message sent by some other kernel
	 * component, we check and return an error.
	 */
	cr = msg_getcred(mp, &cpid);
	ASSERT(cr != NULL);
	if (cr == NULL) {
		tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
		return;
	}

	tcr = (struct T_conn_req *)mp->b_rptr;

	ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) {
		tcp_err_ack(tcp, mp, TPROTO, 0);
		return;
	}

	/*
	 * Pre-allocate the T_ordrel_ind mblk so that at close time, we
	 * will always have that to send up. Otherwise, we need to do
	 * special handling in case the allocation fails at that time.
	 * If the end point is TPI, the tcp_t can be reused and the
	 * tcp_ordrel_mp may be allocated already.
	 */
	if (tcp->tcp_ordrel_mp == NULL) {
		if ((tcp->tcp_ordrel_mp = mi_tpi_ordrel_ind()) == NULL) {
			tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
			return;
		}
	}

	/*
	 * Determine packet type based on type of address passed in
	 * the request should contain an IPv4 or IPv6 address.
	 * Make sure that address family matches the type of
	 * family of the address passed down.
	 */
	switch (tcr->DEST_length) {
	default:
		tcp_err_ack(tcp, mp, TBADADDR, 0);
		return;

	case (sizeof (sin_t) - sizeof (sin->sin_zero)): {
		/*
		 * XXX: The check for valid DEST_length was not there
		 * in earlier releases and some buggy
		 * TLI apps (e.g Sybase) got away with not feeding
		 * in sin_zero part of address.
		 * We allow that bug to keep those buggy apps humming.
		 * Test suites require the check on DEST_length.
		 * We construct a new mblk with valid DEST_length
		 * free the original so the rest of the code does
		 * not have to keep track of this special shorter
		 * length address case.
		 */
		mblk_t *nmp;
		struct T_conn_req *ntcr;
		sin_t *nsin;

		nmp = allocb(sizeof (struct T_conn_req) + sizeof (sin_t) +
		    tcr->OPT_length, BPRI_HI);
		if (nmp == NULL) {
			tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
			return;
		}
		ntcr = (struct T_conn_req *)nmp->b_rptr;
		bzero(ntcr, sizeof (struct T_conn_req)); /* zero fill */
		ntcr->PRIM_type = T_CONN_REQ;
		ntcr->DEST_length = sizeof (sin_t);
		ntcr->DEST_offset = sizeof (struct T_conn_req);

		nsin = (sin_t *)((uchar_t *)ntcr + ntcr->DEST_offset);
		*nsin = sin_null;
		/* Get pointer to shorter address to copy from original mp */
		sin = (sin_t *)mi_offset_param(mp, tcr->DEST_offset,
		    tcr->DEST_length); /* extract DEST_length worth of sin_t */
		if (sin == NULL || !OK_32PTR((char *)sin)) {
			freemsg(nmp);
			tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
			return;
		}
		nsin->sin_family = sin->sin_family;
		nsin->sin_port = sin->sin_port;
		nsin->sin_addr = sin->sin_addr;
		/* Note:nsin->sin_zero zero-fill with sin_null assign above */
		nmp->b_wptr = (uchar_t *)&nsin[1];
		if (tcr->OPT_length != 0) {
			ntcr->OPT_length = tcr->OPT_length;
			ntcr->OPT_offset = nmp->b_wptr - nmp->b_rptr;
			bcopy((uchar_t *)tcr + tcr->OPT_offset,
			    (uchar_t *)ntcr + ntcr->OPT_offset,
			    tcr->OPT_length);
			nmp->b_wptr += tcr->OPT_length;
		}
		freemsg(mp);	/* original mp freed */
		mp = nmp;	/* re-initialize original variables */
		tcr = ntcr;
	}
	/* FALLTHRU */

	case sizeof (sin_t):
		sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
		    sizeof (sin_t));
		len = sizeof (sin_t);
		break;

	case sizeof (sin6_t):
		sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
		    sizeof (sin6_t));
		len = sizeof (sin6_t);
		break;
	}

	error = proto_verify_ip_addr(connp->conn_family, sa, len);
	if (error != 0) {
		tcp_err_ack(tcp, mp, TSYSERR, error);
		return;
	}

	/*
	 * TODO: If someone in TCPS_TIME_WAIT has this dst/port we
	 * should key on their sequence number and cut them loose.
	 */

	/*
	 * If options passed in, feed it for verification and handling
	 */
	if (tcr->OPT_length != 0) {
		mblk_t	*ok_mp;
		mblk_t	*discon_mp;
		mblk_t	*conn_opts_mp;
		int	t_error, sys_error, do_disconnect;

		conn_opts_mp = NULL;

		if (tcp_conprim_opt_process(tcp, mp,
		    &do_disconnect, &t_error, &sys_error) < 0) {
			if (do_disconnect) {
				ASSERT(t_error == 0 && sys_error == 0);
				discon_mp = mi_tpi_discon_ind(NULL,
				    ECONNREFUSED, 0);
				if (!discon_mp) {
					tcp_err_ack_prim(tcp, mp, T_CONN_REQ,
					    TSYSERR, ENOMEM);
					return;
				}
				ok_mp = mi_tpi_ok_ack_alloc(mp);
				if (!ok_mp) {
					tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
					    TSYSERR, ENOMEM);
					return;
				}
				qreply(q, ok_mp);
				qreply(q, discon_mp); /* no flush! */
			} else {
				ASSERT(t_error != 0);
				tcp_err_ack_prim(tcp, mp, T_CONN_REQ, t_error,
				    sys_error);
			}
			return;
		}
		/*
		 * Success in setting options, the mp option buffer represented
		 * by OPT_length/offset has been potentially modified and
		 * contains results of option processing. We copy it in
		 * another mp to save it for potentially influencing returning
		 * it in T_CONN_CONN.
		 */
		if (tcr->OPT_length != 0) { /* there are resulting options */
			conn_opts_mp = copyb(mp);
			if (!conn_opts_mp) {
				tcp_err_ack_prim(tcp, mp, T_CONN_REQ,
				    TSYSERR, ENOMEM);
				return;
			}
			ASSERT(tcp->tcp_conn.tcp_opts_conn_req == NULL);
			tcp->tcp_conn.tcp_opts_conn_req = conn_opts_mp;
			/*
			 * Note:
			 * These resulting option negotiation can include any
			 * end-to-end negotiation options but there no such
			 * thing (yet?) in our TCP/IP.
			 */
		}
	}

	/* call the non-TPI version */
	error = tcp_do_connect(tcp->tcp_connp, sa, len, cr, cpid);
	if (error < 0) {
		mp = mi_tpi_err_ack_alloc(mp, -error, 0);
	} else if (error > 0) {
		mp = mi_tpi_err_ack_alloc(mp, TSYSERR, error);
	} else {
		mp = mi_tpi_ok_ack_alloc(mp);
	}

	/*
	 * Note: Code below is the "failure" case
	 * NOTE(review): despite the comment, success also flows through
	 * here (mp is then the T_OK_ACK); the label itself has no goto
	 * referencing it.
	 */
	/* return error ack and blow away saved option results if any */
connect_failed:
	if (mp != NULL)
		putnext(connp->conn_rq, mp);
	else {
		tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
		    TSYSERR, ENOMEM);
	}
}

/* Return the TPI/TLI equivalent of our current tcp_state */
static int
tcp_tpistate(tcp_t *tcp)
{
	switch (tcp->tcp_state) {
	case TCPS_IDLE:
		return (TS_UNBND);
	case TCPS_LISTEN:
		/*
		 * Return whether there are outstanding T_CONN_IND waiting
		 * for the matching T_CONN_RES. Therefore don't count q0.
		 */
		if (tcp->tcp_conn_req_cnt_q > 0)
			return (TS_WRES_CIND);
		else
			return (TS_IDLE);
	case TCPS_BOUND:
		return (TS_IDLE);
	case TCPS_SYN_SENT:
		return (TS_WCON_CREQ);
	case TCPS_SYN_RCVD:
		/*
		 * Note: assumption: this has to the active open SYN_RCVD.
		 * The passive instance is detached in SYN_RCVD stage of
		 * incoming connection processing so we cannot get request
		 * for T_info_ack on it.
		 */
		return (TS_WACK_CRES);
	case TCPS_ESTABLISHED:
		return (TS_DATA_XFER);
	case TCPS_CLOSE_WAIT:
		return (TS_WREQ_ORDREL);
	case TCPS_FIN_WAIT_1:
		return (TS_WIND_ORDREL);
	case TCPS_FIN_WAIT_2:
		return (TS_WIND_ORDREL);

	case TCPS_CLOSING:
	case TCPS_LAST_ACK:
	case TCPS_TIME_WAIT:
	case TCPS_CLOSED:
		/*
		 * Following TS_WACK_DREQ7 is a rendition of "not
		 * yet TS_IDLE" TPI state. There is no best match to any
		 * TPI state for TCPS_{CLOSING, LAST_ACK, TIME_WAIT} but we
		 * choose a value chosen that will map to TLI/XTI level
		 * state of TSTATECHNG (state is process of changing) which
		 * captures what this dummy state represents.
		 */
		return (TS_WACK_DREQ7);
	default:
		cmn_err(CE_WARN, "tcp_tpistate: strange state (%d) %s",
		    tcp->tcp_state, tcp_display(tcp, NULL,
		    DISP_PORT_ONLY));
		return (TS_UNBND);
	}
}

/*
 * Fill in a T_INFO_ACK from the global template for this address family,
 * then override the fields that depend on this connection's state and MSS.
 */
static void
tcp_copy_info(struct T_info_ack *tia, tcp_t *tcp)
{
	tcp_stack_t	*tcps = tcp->tcp_tcps;
	conn_t		*connp = tcp->tcp_connp;
	extern struct T_info_ack tcp_g_t_info_ack;
	extern struct T_info_ack tcp_g_t_info_ack_v6;

	if (connp->conn_family == AF_INET6)
		*tia = tcp_g_t_info_ack_v6;
	else
		*tia = tcp_g_t_info_ack;
	tia->CURRENT_state = tcp_tpistate(tcp);
	tia->OPT_size = tcp_max_optsize;
	if (tcp->tcp_mss == 0) {
		/* Not yet set - tcp_open does not set mss */
		if (connp->conn_ipversion == IPV4_VERSION)
			tia->TIDU_size = tcps->tcps_mss_def_ipv4;
		else
			tia->TIDU_size = tcps->tcps_mss_def_ipv6;
	} else {
		tia->TIDU_size = tcp->tcp_mss;
	}
	/* TODO: Default ETSDU is 1.  Is that correct for tcp? */
}

/*
 * Fill in a T_capability_ack for the capability bits requested in
 * cap_bits1 (TC1_INFO and/or TC1_ACCEPTOR_ID).
 */
static void
tcp_do_capability_ack(tcp_t *tcp, struct T_capability_ack *tcap,
    t_uscalar_t cap_bits1)
{
	tcap->CAP_bits1 = 0;

	if (cap_bits1 & TC1_INFO) {
		tcp_copy_info(&tcap->INFO_ack, tcp);
		tcap->CAP_bits1 |= TC1_INFO;
	}

	if (cap_bits1 & TC1_ACCEPTOR_ID) {
		tcap->ACCEPTOR_id = tcp->tcp_acceptor_id;
		tcap->CAP_bits1 |= TC1_ACCEPTOR_ID;
	}

}

/*
 * This routine responds to T_CAPABILITY_REQ messages.  It is called by
 * tcp_wput.  Much of the T_CAPABILITY_ACK information is copied from
 * tcp_g_t_info_ack.  The current state of the stream is copied from
 * tcp_state.
 */
void
tcp_capability_req(tcp_t *tcp, mblk_t *mp)
{
	t_uscalar_t		cap_bits1;
	struct T_capability_ack	*tcap;

	if (MBLKL(mp) < sizeof (struct T_capability_req)) {
		freemsg(mp);
		return;
	}

	cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1;

	mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
	    mp->b_datap->db_type, T_CAPABILITY_ACK);
	if (mp == NULL)
		return;

	tcap = (struct T_capability_ack *)mp->b_rptr;
	tcp_do_capability_ack(tcp, tcap, cap_bits1);

	putnext(tcp->tcp_connp->conn_rq, mp);
}

/*
 * This routine responds to T_INFO_REQ messages.  It is called by tcp_wput.
 * Most of the T_INFO_ACK information is copied from tcp_g_t_info_ack.
 * The current state of the stream is copied from tcp_state.
 */
void
tcp_info_req(tcp_t *tcp, mblk_t *mp)
{
	mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO,
	    T_INFO_ACK);
	if (!mp) {
		tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
		return;
	}
	tcp_copy_info((struct T_info_ack *)mp->b_rptr, tcp);
	putnext(tcp->tcp_connp->conn_rq, mp);
}

/* Respond to the TPI addr request */
void
tcp_addr_req(tcp_t *tcp, mblk_t *mp)
{
	struct sockaddr *sa;
	mblk_t	*ackmp;
	struct T_addr_ack *taa;
	conn_t	*connp = tcp->tcp_connp;
	uint_t	addrlen;

	/* Make it large enough for worst case */
	ackmp = reallocb(mp, sizeof (struct T_addr_ack) +
	    2 * sizeof (sin6_t), 1);
	if (ackmp == NULL) {
		tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
		return;
	}

	taa = (struct T_addr_ack *)ackmp->b_rptr;

	bzero(taa, sizeof (struct T_addr_ack));
	ackmp->b_wptr = (uchar_t *)&taa[1];

	taa->PRIM_type = T_ADDR_ACK;
	ackmp->b_datap->db_type = M_PCPROTO;

	if (connp->conn_family == AF_INET)
		addrlen = sizeof (sin_t);
	else
		addrlen = sizeof (sin6_t);

	/*
	 * Note: Following code assumes 32 bit alignment of basic
	 * data structures like sin_t and struct T_addr_ack.
	 */
	if (tcp->tcp_state >= TCPS_BOUND) {
		/*
		 * Fill in local address first
		 */
		taa->LOCADDR_offset = sizeof (*taa);
		taa->LOCADDR_length = addrlen;
		sa = (struct sockaddr *)&taa[1];
		(void) conn_getsockname(connp, sa, &addrlen);
		ackmp->b_wptr += addrlen;
	}
	if (tcp->tcp_state >= TCPS_SYN_RCVD) {
		/*
		 * Fill in Remote address
		 */
		taa->REMADDR_length = addrlen;
		/* assumed 32-bit alignment */
		taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length;
		sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset);
		(void) conn_getpeername(connp, sa, &addrlen);
		ackmp->b_wptr += addrlen;
	}
	ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim);
	putnext(tcp->tcp_connp->conn_rq, ackmp);
}

/*
 * tcp_fallback
 *
 * A direct socket is falling back to using STREAMS. The queue
 * that is being passed down was created using tcp_open() with
 * the SO_FALLBACK flag set. As a result, the queue is not
 * associated with a conn, and the q_ptrs instead contain the
 * dev and minor area that should be used.
 *
 * The 'issocket' flag indicates whether the FireEngine
 * optimizations should be used. The common case would be that
 * optimizations are enabled, and they might be subsequently
 * disabled using the _SIOCSOCKFALLBACK ioctl.
 */

/*
 * An active connection is falling back to TPI. Gather all the information
 * required by the STREAM head and TPI sonode and send it up.
 */
void
tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q,
    boolean_t issocket, so_proto_quiesced_cb_t quiesced_cb)
{
	conn_t			*connp = tcp->tcp_connp;
	struct stroptions	*stropt;
	struct T_capability_ack	tca;
	struct sockaddr_in6	laddr, faddr;
	socklen_t		laddrlen, faddrlen;
	short			opts;
	int			error;
	mblk_t			*mp;

	/* Attach the conn to the queue; q_ptrs held dev/arena until now */
	connp->conn_dev = (dev_t)RD(q)->q_ptr;
	connp->conn_minor_arena = WR(q)->q_ptr;

	RD(q)->q_ptr = WR(q)->q_ptr = connp;

	connp->conn_rq = RD(q);
	connp->conn_wq = WR(q);

	WR(q)->q_qinfo = &tcp_sock_winit;

	if (!issocket)
		tcp_use_pure_tpi(tcp);

	/*
	 * free the helper stream
	 */
	ip_free_helper_stream(connp);

	/*
	 * Notify the STREAM head about options
	 */
	DB_TYPE(stropt_mp) = M_SETOPTS;
	stropt = (struct stroptions *)stropt_mp->b_rptr;
	stropt_mp->b_wptr += sizeof (struct stroptions);
	stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK;

	stropt->so_wroff = connp->conn_ht_iphc_len + (tcp->tcp_loopback ? 0 :
	    tcp->tcp_tcps->tcps_wroff_xtra);
	if (tcp->tcp_snd_sack_ok)
		stropt->so_wroff += TCPOPT_MAX_SACK_LEN;
	stropt->so_hiwat = connp->conn_rcvbuf;
	stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE);

	putnext(RD(q), stropt_mp);

	/*
	 * Collect the information needed to sync with the sonode
	 */
	tcp_do_capability_ack(tcp, &tca, TC1_INFO|TC1_ACCEPTOR_ID);

	laddrlen = faddrlen = sizeof (sin6_t);
	(void) tcp_getsockname((sock_lower_handle_t)connp,
	    (struct sockaddr *)&laddr, &laddrlen, CRED());
	error = tcp_getpeername((sock_lower_handle_t)connp,
	    (struct sockaddr *)&faddr, &faddrlen, CRED());
	if (error != 0)
		faddrlen = 0;

	opts = 0;
	if (connp->conn_oobinline)
		opts |= SO_OOBINLINE;
	if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
		opts |= SO_DONTROUTE;

	/*
	 * Notify the socket that the protocol is now quiescent,
	 * and it's therefore safe move data from the socket
	 * to the stream head.
	 */
	(*quiesced_cb)(connp->conn_upper_handle, q, &tca,
	    (struct sockaddr *)&laddr, laddrlen,
	    (struct sockaddr *)&faddr, faddrlen, opts);

	/* Drain any data queued on the tcp_t up to the new stream head */
	while ((mp = tcp->tcp_rcv_list) != NULL) {
		tcp->tcp_rcv_list = mp->b_next;
		mp->b_next = NULL;
		/* We never do fallback for kernel RPC */
		putnext(q, mp);
	}
	tcp->tcp_rcv_last_head = NULL;
	tcp->tcp_rcv_last_tail = NULL;
	tcp->tcp_rcv_cnt = 0;
}

/*
 * An eager is falling back to TPI. All we have to do is send
 * up a T_CONN_IND.
 */
void
tcp_fallback_eager(tcp_t *eager, boolean_t direct_sockfs)
{
	tcp_t *listener = eager->tcp_listener;
	mblk_t *mp = eager->tcp_conn.tcp_eager_conn_ind;

	ASSERT(listener != NULL);
	ASSERT(mp != NULL);

	eager->tcp_conn.tcp_eager_conn_ind = NULL;

	/*
	 * TLI/XTI applications will get confused by
	 * sending eager as an option since it violates
	 * the option semantics. So remove the eager as
	 * option since TLI/XTI app doesn't need it anyway.
	 */
	if (!direct_sockfs) {
		struct T_conn_ind *conn_ind;

		conn_ind = (struct T_conn_ind *)mp->b_rptr;
		conn_ind->OPT_length = 0;
		conn_ind->OPT_offset = 0;
	}

	/*
	 * Sockfs guarantees that the listener will not be closed
	 * during fallback. So we can safely use the listener's queue.
	 */
	putnext(listener->tcp_connp->conn_rq, mp);
}

/*
 * Swap information between the eager and acceptor for a TLI/XTI client.
 * The sockfs accept is done on the acceptor stream and control goes
 * through tcp_tli_accept() and tcp_accept()/tcp_accept_swap() is not
 * called. In either case, both the eager and listener are in their own
 * perimeter (squeue) and the code has to deal with potential race.
 *
 * See the block comment on top of tcp_accept() and tcp_tli_accept().
 */
static void
tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager)
{
	conn_t *econnp, *aconnp;

	ASSERT(eager->tcp_connp->conn_rq == listener->tcp_connp->conn_rq);
	ASSERT(eager->tcp_detached && !acceptor->tcp_detached);
	ASSERT(!TCP_IS_SOCKET(acceptor));
	ASSERT(!TCP_IS_SOCKET(eager));
	ASSERT(!TCP_IS_SOCKET(listener));

	/*
	 * Trusted Extensions may need to use a security label that is
	 * different from the acceptor's label on MLP and MAC-Exempt
	 * sockets. If this is the case, the required security label
	 * already exists in econnp->conn_ixa->ixa_tsl. Since we make the
	 * acceptor stream refer to econnp we atomatically get that label.
	 */

	acceptor->tcp_detached = B_TRUE;
	/*
	 * To permit stream re-use by TLI/XTI, the eager needs a copy of
	 * the acceptor id.
	 */
	eager->tcp_acceptor_id = acceptor->tcp_acceptor_id;

	/* remove eager from listen list... */
	mutex_enter(&listener->tcp_eager_lock);
	tcp_eager_unlink(eager);
	ASSERT(eager->tcp_eager_next_q == NULL &&
	    eager->tcp_eager_last_q == NULL);
	ASSERT(eager->tcp_eager_next_q0 == NULL &&
	    eager->tcp_eager_prev_q0 == NULL);
	mutex_exit(&listener->tcp_eager_lock);

	/* Point the eager's conn at the acceptor's queues (and vice versa) */
	econnp = eager->tcp_connp;
	aconnp = acceptor->tcp_connp;
	econnp->conn_rq = aconnp->conn_rq;
	econnp->conn_wq = aconnp->conn_wq;
	econnp->conn_rq->q_ptr = econnp;
	econnp->conn_wq->q_ptr = econnp;

	/*
	 * In the TLI/XTI loopback case, we are inside the listener's squeue,
	 * which might be a different squeue from our peer TCP instance.
	 * For TCP Fusion, the peer expects that whenever tcp_detached is
	 * clear, our TCP queues point to the acceptor's queues.  Thus, use
	 * membar_producer() to ensure that the assignments of conn_rq/conn_wq
	 * above reach global visibility prior to the clearing of tcp_detached.
	 */
	membar_producer();
	eager->tcp_detached = B_FALSE;

	ASSERT(eager->tcp_ack_tid == 0);

	econnp->conn_dev = aconnp->conn_dev;
	econnp->conn_minor_arena = aconnp->conn_minor_arena;

	ASSERT(econnp->conn_minor_arena != NULL);
	/* Transfer the acceptor's credential to the eager */
	if (econnp->conn_cred != NULL)
		crfree(econnp->conn_cred);
	econnp->conn_cred = aconnp->conn_cred;
	ASSERT(!(econnp->conn_ixa->ixa_free_flags & IXA_FREE_CRED));
	econnp->conn_ixa->ixa_cred = econnp->conn_cred;
	aconnp->conn_cred = NULL;
	econnp->conn_cpid = aconnp->conn_cpid;
	ASSERT(econnp->conn_netstack == aconnp->conn_netstack);
	ASSERT(eager->tcp_tcps == acceptor->tcp_tcps);

	econnp->conn_zoneid = aconnp->conn_zoneid;
	econnp->conn_allzones = aconnp->conn_allzones;
	econnp->conn_ixa->ixa_zoneid = aconnp->conn_ixa->ixa_zoneid;

	econnp->conn_mac_mode = aconnp->conn_mac_mode;
	econnp->conn_zone_is_global = aconnp->conn_zone_is_global;
	aconnp->conn_mac_mode = CONN_MAC_DEFAULT;

	/* Do the IPC initialization */
	CONN_INC_REF(econnp);

	/* Done with old IPC. Drop its ref on its connp */
	CONN_DEC_REF(aconnp);
}

/*
 * Reply to a clients T_CONN_RES TPI message. This function
 * is used only for TLI/XTI listener. Sockfs sends T_CONN_RES
 * on the acceptor STREAM and processed in tcp_accept_common().
 * Read the block comment on top of tcp_input_listener().
1192 */ 1193 void 1194 tcp_tli_accept(tcp_t *listener, mblk_t *mp) 1195 { 1196 tcp_t *acceptor; 1197 tcp_t *eager; 1198 tcp_t *tcp; 1199 struct T_conn_res *tcr; 1200 t_uscalar_t acceptor_id; 1201 t_scalar_t seqnum; 1202 mblk_t *discon_mp = NULL; 1203 mblk_t *ok_mp; 1204 mblk_t *mp1; 1205 tcp_stack_t *tcps = listener->tcp_tcps; 1206 conn_t *econnp; 1207 1208 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) { 1209 tcp_err_ack(listener, mp, TPROTO, 0); 1210 return; 1211 } 1212 tcr = (struct T_conn_res *)mp->b_rptr; 1213 1214 /* 1215 * Under ILP32 the stream head points tcr->ACCEPTOR_id at the 1216 * read side queue of the streams device underneath us i.e. the 1217 * read side queue of 'ip'. Since we can't deference QUEUE_ptr we 1218 * look it up in the queue_hash. Under LP64 it sends down the 1219 * minor_t of the accepting endpoint. 1220 * 1221 * Once the acceptor/eager are modified (in tcp_accept_swap) the 1222 * fanout hash lock is held. 1223 * This prevents any thread from entering the acceptor queue from 1224 * below (since it has not been hard bound yet i.e. any inbound 1225 * packets will arrive on the listener conn_t and 1226 * go through the classifier). 1227 * The CONN_INC_REF will prevent the acceptor from closing. 1228 * 1229 * XXX It is still possible for a tli application to send down data 1230 * on the accepting stream while another thread calls t_accept. 1231 * This should not be a problem for well-behaved applications since 1232 * the T_OK_ACK is sent after the queue swapping is completed. 1233 * 1234 * If the accepting fd is the same as the listening fd, avoid 1235 * queue hash lookup since that will return an eager listener in a 1236 * already established state. 
1237 */ 1238 acceptor_id = tcr->ACCEPTOR_id; 1239 mutex_enter(&listener->tcp_eager_lock); 1240 if (listener->tcp_acceptor_id == acceptor_id) { 1241 eager = listener->tcp_eager_next_q; 1242 /* only count how many T_CONN_INDs so don't count q0 */ 1243 if ((listener->tcp_conn_req_cnt_q != 1) || 1244 (eager->tcp_conn_req_seqnum != tcr->SEQ_number)) { 1245 mutex_exit(&listener->tcp_eager_lock); 1246 tcp_err_ack(listener, mp, TBADF, 0); 1247 return; 1248 } 1249 if (listener->tcp_conn_req_cnt_q0 != 0) { 1250 /* Throw away all the eagers on q0. */ 1251 tcp_eager_cleanup(listener, 1); 1252 } 1253 if (listener->tcp_syn_defense) { 1254 listener->tcp_syn_defense = B_FALSE; 1255 if (listener->tcp_ip_addr_cache != NULL) { 1256 kmem_free(listener->tcp_ip_addr_cache, 1257 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t)); 1258 listener->tcp_ip_addr_cache = NULL; 1259 } 1260 } 1261 /* 1262 * Transfer tcp_conn_req_max to the eager so that when 1263 * a disconnect occurs we can revert the endpoint to the 1264 * listen state. 1265 */ 1266 eager->tcp_conn_req_max = listener->tcp_conn_req_max; 1267 ASSERT(listener->tcp_conn_req_cnt_q0 == 0); 1268 /* 1269 * Get a reference on the acceptor just like the 1270 * tcp_acceptor_hash_lookup below. 1271 */ 1272 acceptor = listener; 1273 CONN_INC_REF(acceptor->tcp_connp); 1274 } else { 1275 acceptor = tcp_acceptor_hash_lookup(acceptor_id, tcps); 1276 if (acceptor == NULL) { 1277 if (listener->tcp_connp->conn_debug) { 1278 (void) strlog(TCP_MOD_ID, 0, 1, 1279 SL_ERROR|SL_TRACE, 1280 "tcp_accept: did not find acceptor 0x%x\n", 1281 acceptor_id); 1282 } 1283 mutex_exit(&listener->tcp_eager_lock); 1284 tcp_err_ack(listener, mp, TPROVMISMATCH, 0); 1285 return; 1286 } 1287 /* 1288 * Verify acceptor state. The acceptable states for an acceptor 1289 * include TCPS_IDLE and TCPS_BOUND. 
1290 */ 1291 switch (acceptor->tcp_state) { 1292 case TCPS_IDLE: 1293 /* FALLTHRU */ 1294 case TCPS_BOUND: 1295 break; 1296 default: 1297 CONN_DEC_REF(acceptor->tcp_connp); 1298 mutex_exit(&listener->tcp_eager_lock); 1299 tcp_err_ack(listener, mp, TOUTSTATE, 0); 1300 return; 1301 } 1302 } 1303 1304 /* The listener must be in TCPS_LISTEN */ 1305 if (listener->tcp_state != TCPS_LISTEN) { 1306 CONN_DEC_REF(acceptor->tcp_connp); 1307 mutex_exit(&listener->tcp_eager_lock); 1308 tcp_err_ack(listener, mp, TOUTSTATE, 0); 1309 return; 1310 } 1311 1312 /* 1313 * Rendezvous with an eager connection request packet hanging off 1314 * 'tcp' that has the 'seqnum' tag. We tagged the detached open 1315 * tcp structure when the connection packet arrived in 1316 * tcp_input_listener(). 1317 */ 1318 seqnum = tcr->SEQ_number; 1319 eager = listener; 1320 do { 1321 eager = eager->tcp_eager_next_q; 1322 if (eager == NULL) { 1323 CONN_DEC_REF(acceptor->tcp_connp); 1324 mutex_exit(&listener->tcp_eager_lock); 1325 tcp_err_ack(listener, mp, TBADSEQ, 0); 1326 return; 1327 } 1328 } while (eager->tcp_conn_req_seqnum != seqnum); 1329 mutex_exit(&listener->tcp_eager_lock); 1330 1331 /* 1332 * At this point, both acceptor and listener have 2 ref 1333 * that they begin with. Acceptor has one additional ref 1334 * we placed in lookup while listener has 3 additional 1335 * ref for being behind the squeue (tcp_accept() is 1336 * done on listener's squeue); being in classifier hash; 1337 * and eager's ref on listener. 1338 */ 1339 ASSERT(listener->tcp_connp->conn_ref >= 5); 1340 ASSERT(acceptor->tcp_connp->conn_ref >= 3); 1341 1342 /* 1343 * The eager at this point is set in its own squeue and 1344 * could easily have been killed (tcp_accept_finish will 1345 * deal with that) because of a TH_RST so we can only 1346 * ASSERT for a single ref. 1347 */ 1348 ASSERT(eager->tcp_connp->conn_ref >= 1); 1349 1350 /* 1351 * Pre allocate the discon_ind mblk also. 
tcp_accept_finish will 1352 * use it if something failed. 1353 */ 1354 discon_mp = allocb(MAX(sizeof (struct T_discon_ind), 1355 sizeof (struct stroptions)), BPRI_HI); 1356 if (discon_mp == NULL) { 1357 CONN_DEC_REF(acceptor->tcp_connp); 1358 CONN_DEC_REF(eager->tcp_connp); 1359 tcp_err_ack(listener, mp, TSYSERR, ENOMEM); 1360 return; 1361 } 1362 1363 econnp = eager->tcp_connp; 1364 1365 /* Hold a copy of mp, in case reallocb fails */ 1366 if ((mp1 = copymsg(mp)) == NULL) { 1367 CONN_DEC_REF(acceptor->tcp_connp); 1368 CONN_DEC_REF(eager->tcp_connp); 1369 freemsg(discon_mp); 1370 tcp_err_ack(listener, mp, TSYSERR, ENOMEM); 1371 return; 1372 } 1373 1374 tcr = (struct T_conn_res *)mp1->b_rptr; 1375 1376 /* 1377 * This is an expanded version of mi_tpi_ok_ack_alloc() 1378 * which allocates a larger mblk and appends the new 1379 * local address to the ok_ack. The address is copied by 1380 * soaccept() for getsockname(). 1381 */ 1382 { 1383 int extra; 1384 1385 extra = (econnp->conn_family == AF_INET) ? 1386 sizeof (sin_t) : sizeof (sin6_t); 1387 1388 /* 1389 * Try to re-use mp, if possible. Otherwise, allocate 1390 * an mblk and return it as ok_mp. In any case, mp 1391 * is no longer usable upon return. 
1392 */ 1393 if ((ok_mp = mi_tpi_ok_ack_alloc_extra(mp, extra)) == NULL) { 1394 CONN_DEC_REF(acceptor->tcp_connp); 1395 CONN_DEC_REF(eager->tcp_connp); 1396 freemsg(discon_mp); 1397 /* Original mp has been freed by now, so use mp1 */ 1398 tcp_err_ack(listener, mp1, TSYSERR, ENOMEM); 1399 return; 1400 } 1401 1402 mp = NULL; /* We should never use mp after this point */ 1403 1404 switch (extra) { 1405 case sizeof (sin_t): { 1406 sin_t *sin = (sin_t *)ok_mp->b_wptr; 1407 1408 ok_mp->b_wptr += extra; 1409 sin->sin_family = AF_INET; 1410 sin->sin_port = econnp->conn_lport; 1411 sin->sin_addr.s_addr = econnp->conn_laddr_v4; 1412 break; 1413 } 1414 case sizeof (sin6_t): { 1415 sin6_t *sin6 = (sin6_t *)ok_mp->b_wptr; 1416 1417 ok_mp->b_wptr += extra; 1418 sin6->sin6_family = AF_INET6; 1419 sin6->sin6_port = econnp->conn_lport; 1420 sin6->sin6_addr = econnp->conn_laddr_v6; 1421 sin6->sin6_flowinfo = econnp->conn_flowinfo; 1422 if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) && 1423 (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) { 1424 sin6->sin6_scope_id = 1425 econnp->conn_ixa->ixa_scopeid; 1426 } else { 1427 sin6->sin6_scope_id = 0; 1428 } 1429 sin6->__sin6_src_id = 0; 1430 break; 1431 } 1432 default: 1433 break; 1434 } 1435 ASSERT(ok_mp->b_wptr <= ok_mp->b_datap->db_lim); 1436 } 1437 1438 /* 1439 * If there are no options we know that the T_CONN_RES will 1440 * succeed. However, we can't send the T_OK_ACK upstream until 1441 * the tcp_accept_swap is done since it would be dangerous to 1442 * let the application start using the new fd prior to the swap. 1443 */ 1444 tcp_accept_swap(listener, acceptor, eager); 1445 1446 /* 1447 * tcp_accept_swap unlinks eager from listener but does not drop 1448 * the eager's reference on the listener. 1449 */ 1450 ASSERT(eager->tcp_listener == NULL); 1451 ASSERT(listener->tcp_connp->conn_ref >= 5); 1452 1453 /* 1454 * The eager is now associated with its own queue. 
Insert in 1455 * the hash so that the connection can be reused for a future 1456 * T_CONN_RES. 1457 */ 1458 tcp_acceptor_hash_insert(acceptor_id, eager); 1459 1460 /* 1461 * We now do the processing of options with T_CONN_RES. 1462 * We delay till now since we wanted to have queue to pass to 1463 * option processing routines that points back to the right 1464 * instance structure which does not happen until after 1465 * tcp_accept_swap(). 1466 * 1467 * Note: 1468 * The sanity of the logic here assumes that whatever options 1469 * are appropriate to inherit from listner=>eager are done 1470 * before this point, and whatever were to be overridden (or not) 1471 * in transfer logic from eager=>acceptor in tcp_accept_swap(). 1472 * [ Warning: acceptor endpoint can have T_OPTMGMT_REQ done to it 1473 * before its ACCEPTOR_id comes down in T_CONN_RES ] 1474 * This may not be true at this point in time but can be fixed 1475 * independently. This option processing code starts with 1476 * the instantiated acceptor instance and the final queue at 1477 * this point. 1478 */ 1479 1480 if (tcr->OPT_length != 0) { 1481 /* Options to process */ 1482 int t_error = 0; 1483 int sys_error = 0; 1484 int do_disconnect = 0; 1485 1486 if (tcp_conprim_opt_process(eager, mp1, 1487 &do_disconnect, &t_error, &sys_error) < 0) { 1488 eager->tcp_accept_error = 1; 1489 if (do_disconnect) { 1490 /* 1491 * An option failed which does not allow 1492 * connection to be accepted. 1493 * 1494 * We allow T_CONN_RES to succeed and 1495 * put a T_DISCON_IND on the eager queue. 1496 */ 1497 ASSERT(t_error == 0 && sys_error == 0); 1498 eager->tcp_send_discon_ind = 1; 1499 } else { 1500 ASSERT(t_error != 0); 1501 freemsg(ok_mp); 1502 /* 1503 * Original mp was either freed or set 1504 * to ok_mp above, so use mp1 instead. 
1505 */ 1506 tcp_err_ack(listener, mp1, t_error, sys_error); 1507 goto finish; 1508 } 1509 } 1510 /* 1511 * Most likely success in setting options (except if 1512 * eager->tcp_send_discon_ind set). 1513 * mp1 option buffer represented by OPT_length/offset 1514 * potentially modified and contains results of setting 1515 * options at this point 1516 */ 1517 } 1518 1519 /* We no longer need mp1, since all options processing has passed */ 1520 freemsg(mp1); 1521 1522 putnext(listener->tcp_connp->conn_rq, ok_mp); 1523 1524 mutex_enter(&listener->tcp_eager_lock); 1525 if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) { 1526 tcp_t *tail; 1527 mblk_t *conn_ind; 1528 1529 /* 1530 * This path should not be executed if listener and 1531 * acceptor streams are the same. 1532 */ 1533 ASSERT(listener != acceptor); 1534 1535 tcp = listener->tcp_eager_prev_q0; 1536 /* 1537 * listener->tcp_eager_prev_q0 points to the TAIL of the 1538 * deferred T_conn_ind queue. We need to get to the head of 1539 * the queue in order to send up T_conn_ind the same order as 1540 * how the 3WHS is completed. 
1541 */ 1542 while (tcp != listener) { 1543 if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0) 1544 break; 1545 else 1546 tcp = tcp->tcp_eager_prev_q0; 1547 } 1548 ASSERT(tcp != listener); 1549 conn_ind = tcp->tcp_conn.tcp_eager_conn_ind; 1550 ASSERT(conn_ind != NULL); 1551 tcp->tcp_conn.tcp_eager_conn_ind = NULL; 1552 1553 /* Move from q0 to q */ 1554 ASSERT(listener->tcp_conn_req_cnt_q0 > 0); 1555 listener->tcp_conn_req_cnt_q0--; 1556 listener->tcp_conn_req_cnt_q++; 1557 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 1558 tcp->tcp_eager_prev_q0; 1559 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 1560 tcp->tcp_eager_next_q0; 1561 tcp->tcp_eager_prev_q0 = NULL; 1562 tcp->tcp_eager_next_q0 = NULL; 1563 tcp->tcp_conn_def_q0 = B_FALSE; 1564 1565 /* Make sure the tcp isn't in the list of droppables */ 1566 ASSERT(tcp->tcp_eager_next_drop_q0 == NULL && 1567 tcp->tcp_eager_prev_drop_q0 == NULL); 1568 1569 /* 1570 * Insert at end of the queue because sockfs sends 1571 * down T_CONN_RES in chronological order. Leaving 1572 * the older conn indications at front of the queue 1573 * helps reducing search time. 1574 */ 1575 tail = listener->tcp_eager_last_q; 1576 if (tail != NULL) 1577 tail->tcp_eager_next_q = tcp; 1578 else 1579 listener->tcp_eager_next_q = tcp; 1580 listener->tcp_eager_last_q = tcp; 1581 tcp->tcp_eager_next_q = NULL; 1582 mutex_exit(&listener->tcp_eager_lock); 1583 putnext(tcp->tcp_connp->conn_rq, conn_ind); 1584 } else { 1585 mutex_exit(&listener->tcp_eager_lock); 1586 } 1587 1588 /* 1589 * Done with the acceptor - free it 1590 * 1591 * Note: from this point on, no access to listener should be made 1592 * as listener can be equal to acceptor. 
1593 */ 1594 finish: 1595 ASSERT(acceptor->tcp_detached); 1596 acceptor->tcp_connp->conn_rq = NULL; 1597 ASSERT(!IPCL_IS_NONSTR(acceptor->tcp_connp)); 1598 acceptor->tcp_connp->conn_wq = NULL; 1599 (void) tcp_clean_death(acceptor, 0); 1600 CONN_DEC_REF(acceptor->tcp_connp); 1601 1602 /* 1603 * We pass discon_mp to tcp_accept_finish to get on the right squeue. 1604 * 1605 * It will update the setting for sockfs/stream head and also take 1606 * care of any data that arrived before accept() wad called. 1607 * In case we already received a FIN then tcp_accept_finish will send up 1608 * the ordrel. It will also send up a window update if the window 1609 * has opened up. 1610 */ 1611 1612 /* 1613 * XXX: we currently have a problem if XTI application closes the 1614 * acceptor stream in between. This problem exists in on10-gate also 1615 * and is well know but nothing can be done short of major rewrite 1616 * to fix it. Now it is possible to take care of it by assigning TLI/XTI 1617 * eager same squeue as listener (we can distinguish non socket 1618 * listeners at the time of handling a SYN in tcp_input_listener) 1619 * and do most of the work that tcp_accept_finish does here itself 1620 * and then get behind the acceptor squeue to access the acceptor 1621 * queue. 1622 */ 1623 /* 1624 * We already have a ref on tcp so no need to do one before squeue_enter 1625 */ 1626 SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, discon_mp, 1627 tcp_accept_finish, eager->tcp_connp, NULL, SQ_FILL, 1628 SQTAG_TCP_ACCEPT_FINISH); 1629 } 1630 1631 1632 /* 1633 * This is the STREAMS entry point for T_CONN_RES coming down on 1634 * Acceptor STREAM when sockfs listener does accept processing. 1635 * Read the block comment on top of tcp_input_listener(). 
1636 */ 1637 void 1638 tcp_tpi_accept(queue_t *q, mblk_t *mp) 1639 { 1640 queue_t *rq = RD(q); 1641 struct T_conn_res *conn_res; 1642 tcp_t *eager; 1643 tcp_t *listener; 1644 struct T_ok_ack *ok; 1645 t_scalar_t PRIM_type; 1646 conn_t *econnp; 1647 cred_t *cr; 1648 1649 ASSERT(DB_TYPE(mp) == M_PROTO); 1650 1651 /* 1652 * All Solaris components should pass a db_credp 1653 * for this TPI message, hence we ASSERT. 1654 * But in case there is some other M_PROTO that looks 1655 * like a TPI message sent by some other kernel 1656 * component, we check and return an error. 1657 */ 1658 cr = msg_getcred(mp, NULL); 1659 ASSERT(cr != NULL); 1660 if (cr == NULL) { 1661 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, EINVAL); 1662 if (mp != NULL) 1663 putnext(rq, mp); 1664 return; 1665 } 1666 conn_res = (struct T_conn_res *)mp->b_rptr; 1667 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); 1668 if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_res)) { 1669 mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0); 1670 if (mp != NULL) 1671 putnext(rq, mp); 1672 return; 1673 } 1674 switch (conn_res->PRIM_type) { 1675 case O_T_CONN_RES: 1676 case T_CONN_RES: 1677 /* 1678 * We pass up an err ack if allocb fails. This will 1679 * cause sockfs to issue a T_DISCON_REQ which will cause 1680 * tcp_eager_blowoff to be called. sockfs will then call 1681 * rq->q_qinfo->qi_qclose to cleanup the acceptor stream. 1682 * we need to do the allocb up here because we have to 1683 * make sure rq->q_qinfo->qi_qclose still points to the 1684 * correct function (tcp_tpi_close_accept) in case allocb 1685 * fails. 
1686 */ 1687 bcopy(mp->b_rptr + conn_res->OPT_offset, 1688 &eager, conn_res->OPT_length); 1689 PRIM_type = conn_res->PRIM_type; 1690 mp->b_datap->db_type = M_PCPROTO; 1691 mp->b_wptr = mp->b_rptr + sizeof (struct T_ok_ack); 1692 ok = (struct T_ok_ack *)mp->b_rptr; 1693 ok->PRIM_type = T_OK_ACK; 1694 ok->CORRECT_prim = PRIM_type; 1695 econnp = eager->tcp_connp; 1696 econnp->conn_dev = (dev_t)RD(q)->q_ptr; 1697 econnp->conn_minor_arena = (vmem_t *)(WR(q)->q_ptr); 1698 econnp->conn_rq = rq; 1699 econnp->conn_wq = q; 1700 rq->q_ptr = econnp; 1701 rq->q_qinfo = &tcp_rinitv4; /* No open - same as rinitv6 */ 1702 q->q_ptr = econnp; 1703 q->q_qinfo = &tcp_winit; 1704 listener = eager->tcp_listener; 1705 1706 if (tcp_accept_common(listener->tcp_connp, 1707 econnp, cr) < 0) { 1708 mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0); 1709 if (mp != NULL) 1710 putnext(rq, mp); 1711 return; 1712 } 1713 1714 /* 1715 * Send the new local address also up to sockfs. There 1716 * should already be enough space in the mp that came 1717 * down from soaccept(). 
1718 */ 1719 if (econnp->conn_family == AF_INET) { 1720 sin_t *sin; 1721 1722 ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >= 1723 (sizeof (struct T_ok_ack) + sizeof (sin_t))); 1724 sin = (sin_t *)mp->b_wptr; 1725 mp->b_wptr += sizeof (sin_t); 1726 sin->sin_family = AF_INET; 1727 sin->sin_port = econnp->conn_lport; 1728 sin->sin_addr.s_addr = econnp->conn_laddr_v4; 1729 } else { 1730 sin6_t *sin6; 1731 1732 ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >= 1733 sizeof (struct T_ok_ack) + sizeof (sin6_t)); 1734 sin6 = (sin6_t *)mp->b_wptr; 1735 mp->b_wptr += sizeof (sin6_t); 1736 sin6->sin6_family = AF_INET6; 1737 sin6->sin6_port = econnp->conn_lport; 1738 sin6->sin6_addr = econnp->conn_laddr_v6; 1739 if (econnp->conn_ipversion == IPV4_VERSION) 1740 sin6->sin6_flowinfo = 0; 1741 else 1742 sin6->sin6_flowinfo = econnp->conn_flowinfo; 1743 if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) && 1744 (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) { 1745 sin6->sin6_scope_id = 1746 econnp->conn_ixa->ixa_scopeid; 1747 } else { 1748 sin6->sin6_scope_id = 0; 1749 } 1750 sin6->__sin6_src_id = 0; 1751 } 1752 1753 putnext(rq, mp); 1754 return; 1755 default: 1756 mp = mi_tpi_err_ack_alloc(mp, TNOTSUPPORT, 0); 1757 if (mp != NULL) 1758 putnext(rq, mp); 1759 return; 1760 } 1761 } 1762 1763 /* 1764 * Send the newconn notification to ulp. The eager is blown off if the 1765 * notification fails. 1766 */ 1767 static void 1768 tcp_ulp_newconn(conn_t *lconnp, conn_t *econnp, mblk_t *mp) 1769 { 1770 if (IPCL_IS_NONSTR(lconnp)) { 1771 cred_t *cr; 1772 pid_t cpid = NOPID; 1773 1774 ASSERT(econnp->conn_tcp->tcp_listener == lconnp->conn_tcp); 1775 ASSERT(econnp->conn_tcp->tcp_saved_listener == 1776 lconnp->conn_tcp); 1777 1778 cr = msg_getcred(mp, &cpid); 1779 1780 /* Keep the message around in case of a fallback to TPI */ 1781 econnp->conn_tcp->tcp_conn.tcp_eager_conn_ind = mp; 1782 /* 1783 * Notify the ULP about the newconn. 
It is guaranteed that no 1784 * tcp_accept() call will be made for the eager if the 1785 * notification fails, so it's safe to blow it off in that 1786 * case. 1787 * 1788 * The upper handle will be assigned when tcp_accept() is 1789 * called. 1790 */ 1791 if ((*lconnp->conn_upcalls->su_newconn) 1792 (lconnp->conn_upper_handle, 1793 (sock_lower_handle_t)econnp, 1794 &sock_tcp_downcalls, cr, cpid, 1795 &econnp->conn_upcalls) == NULL) { 1796 /* Failed to allocate a socket */ 1797 TCPS_BUMP_MIB(lconnp->conn_tcp->tcp_tcps, 1798 tcpEstabResets); 1799 (void) tcp_eager_blowoff(lconnp->conn_tcp, 1800 econnp->conn_tcp->tcp_conn_req_seqnum); 1801 } 1802 } else { 1803 putnext(lconnp->conn_rq, mp); 1804 } 1805 } 1806 1807 /* 1808 * The function called through squeue to get behind listener's perimeter to 1809 * send a deferred conn_ind. 1810 */ 1811 /* ARGSUSED */ 1812 void 1813 tcp_send_pending(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) 1814 { 1815 conn_t *lconnp = (conn_t *)arg; 1816 tcp_t *listener = lconnp->conn_tcp; 1817 struct T_conn_ind *conn_ind; 1818 tcp_t *tcp; 1819 1820 conn_ind = (struct T_conn_ind *)mp->b_rptr; 1821 bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp, 1822 conn_ind->OPT_length); 1823 1824 if (listener->tcp_state != TCPS_LISTEN) { 1825 /* 1826 * If listener has closed, it would have caused a 1827 * a cleanup/blowoff to happen for the eager, so 1828 * we don't need to do anything more. 1829 */ 1830 freemsg(mp); 1831 return; 1832 } 1833 1834 tcp_ulp_newconn(lconnp, tcp->tcp_connp, mp); 1835 } 1836 1837 /* 1838 * Sends the T_CONN_IND to the listener. The caller calls this 1839 * functions via squeue to get inside the listener's perimeter 1840 * once the 3 way hand shake is done a T_CONN_IND needs to be 1841 * sent. As an optimization, the caller can call this directly 1842 * if listener's perimeter is same as eager's. 
1843 */ 1844 /* ARGSUSED */ 1845 void 1846 tcp_send_conn_ind(void *arg, mblk_t *mp, void *arg2) 1847 { 1848 conn_t *lconnp = (conn_t *)arg; 1849 tcp_t *listener = lconnp->conn_tcp; 1850 tcp_t *tcp; 1851 struct T_conn_ind *conn_ind; 1852 ipaddr_t *addr_cache; 1853 boolean_t need_send_conn_ind = B_FALSE; 1854 tcp_stack_t *tcps = listener->tcp_tcps; 1855 1856 /* retrieve the eager */ 1857 conn_ind = (struct T_conn_ind *)mp->b_rptr; 1858 ASSERT(conn_ind->OPT_offset != 0 && 1859 conn_ind->OPT_length == sizeof (intptr_t)); 1860 bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp, 1861 conn_ind->OPT_length); 1862 1863 /* 1864 * TLI/XTI applications will get confused by 1865 * sending eager as an option since it violates 1866 * the option semantics. So remove the eager as 1867 * option since TLI/XTI app doesn't need it anyway. 1868 */ 1869 if (!TCP_IS_SOCKET(listener)) { 1870 conn_ind->OPT_length = 0; 1871 conn_ind->OPT_offset = 0; 1872 } 1873 if (listener->tcp_state != TCPS_LISTEN) { 1874 /* 1875 * If listener has closed, it would have caused a 1876 * a cleanup/blowoff to happen for the eager. We 1877 * just need to return. 1878 */ 1879 freemsg(mp); 1880 return; 1881 } 1882 1883 1884 /* 1885 * if the conn_req_q is full defer passing up the 1886 * T_CONN_IND until space is availabe after t_accept() 1887 * processing 1888 */ 1889 mutex_enter(&listener->tcp_eager_lock); 1890 1891 /* 1892 * Take the eager out, if it is in the list of droppable eagers 1893 * as we are here because the 3W handshake is over. 1894 */ 1895 MAKE_UNDROPPABLE(tcp); 1896 1897 if (listener->tcp_conn_req_cnt_q < listener->tcp_conn_req_max) { 1898 tcp_t *tail; 1899 1900 /* 1901 * The eager already has an extra ref put in tcp_input_data 1902 * so that it stays till accept comes back even though it 1903 * might get into TCPS_CLOSED as a result of a TH_RST etc. 
1904 */ 1905 ASSERT(listener->tcp_conn_req_cnt_q0 > 0); 1906 listener->tcp_conn_req_cnt_q0--; 1907 listener->tcp_conn_req_cnt_q++; 1908 1909 /* Move from SYN_RCVD to ESTABLISHED list */ 1910 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 1911 tcp->tcp_eager_prev_q0; 1912 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 1913 tcp->tcp_eager_next_q0; 1914 tcp->tcp_eager_prev_q0 = NULL; 1915 tcp->tcp_eager_next_q0 = NULL; 1916 1917 /* 1918 * Insert at end of the queue because sockfs 1919 * sends down T_CONN_RES in chronological 1920 * order. Leaving the older conn indications 1921 * at front of the queue helps reducing search 1922 * time. 1923 */ 1924 tail = listener->tcp_eager_last_q; 1925 if (tail != NULL) 1926 tail->tcp_eager_next_q = tcp; 1927 else 1928 listener->tcp_eager_next_q = tcp; 1929 listener->tcp_eager_last_q = tcp; 1930 tcp->tcp_eager_next_q = NULL; 1931 /* 1932 * Delay sending up the T_conn_ind until we are 1933 * done with the eager. Once we have have sent up 1934 * the T_conn_ind, the accept can potentially complete 1935 * any time and release the refhold we have on the eager. 1936 */ 1937 need_send_conn_ind = B_TRUE; 1938 } else { 1939 /* 1940 * Defer connection on q0 and set deferred 1941 * connection bit true 1942 */ 1943 tcp->tcp_conn_def_q0 = B_TRUE; 1944 1945 /* take tcp out of q0 ... */ 1946 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 1947 tcp->tcp_eager_next_q0; 1948 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 1949 tcp->tcp_eager_prev_q0; 1950 1951 /* ... 
and place it at the end of q0 */ 1952 tcp->tcp_eager_prev_q0 = listener->tcp_eager_prev_q0; 1953 tcp->tcp_eager_next_q0 = listener; 1954 listener->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp; 1955 listener->tcp_eager_prev_q0 = tcp; 1956 tcp->tcp_conn.tcp_eager_conn_ind = mp; 1957 } 1958 1959 /* we have timed out before */ 1960 if (tcp->tcp_syn_rcvd_timeout != 0) { 1961 tcp->tcp_syn_rcvd_timeout = 0; 1962 listener->tcp_syn_rcvd_timeout--; 1963 if (listener->tcp_syn_defense && 1964 listener->tcp_syn_rcvd_timeout <= 1965 (tcps->tcps_conn_req_max_q0 >> 5) && 1966 10*MINUTES < TICK_TO_MSEC(ddi_get_lbolt64() - 1967 listener->tcp_last_rcv_lbolt)) { 1968 /* 1969 * Turn off the defense mode if we 1970 * believe the SYN attack is over. 1971 */ 1972 listener->tcp_syn_defense = B_FALSE; 1973 if (listener->tcp_ip_addr_cache) { 1974 kmem_free((void *)listener->tcp_ip_addr_cache, 1975 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t)); 1976 listener->tcp_ip_addr_cache = NULL; 1977 } 1978 } 1979 } 1980 addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache); 1981 if (addr_cache != NULL) { 1982 /* 1983 * We have finished a 3-way handshake with this 1984 * remote host. This proves the IP addr is good. 1985 * Cache it! 1986 */ 1987 addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] = 1988 tcp->tcp_connp->conn_faddr_v4; 1989 } 1990 mutex_exit(&listener->tcp_eager_lock); 1991 if (need_send_conn_ind) 1992 tcp_ulp_newconn(lconnp, tcp->tcp_connp, mp); 1993 } 1994