1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 26 /* This files contains all TCP TLI/TPI related functions */ 27 28 #include <sys/types.h> 29 #include <sys/stream.h> 30 #include <sys/strsun.h> 31 #include <sys/strsubr.h> 32 #include <sys/stropts.h> 33 #include <sys/strlog.h> 34 #define _SUN_TPI_VERSION 2 35 #include <sys/tihdr.h> 36 #include <sys/suntpi.h> 37 #include <sys/xti_inet.h> 38 #include <sys/squeue_impl.h> 39 #include <sys/squeue.h> 40 41 #include <inet/common.h> 42 #include <inet/ip.h> 43 #include <inet/tcp.h> 44 #include <inet/tcp_impl.h> 45 #include <inet/proto_set.h> 46 47 static void tcp_accept_swap(tcp_t *, tcp_t *, tcp_t *); 48 static int tcp_conprim_opt_process(tcp_t *, mblk_t *, int *, int *, int *); 49 50 void 51 tcp_use_pure_tpi(tcp_t *tcp) 52 { 53 conn_t *connp = tcp->tcp_connp; 54 55 #ifdef _ILP32 56 tcp->tcp_acceptor_id = (t_uscalar_t)connp->conn_rq; 57 #else 58 tcp->tcp_acceptor_id = connp->conn_dev; 59 #endif 60 /* 61 * Insert this socket into the acceptor hash. 62 * We might need it for T_CONN_RES message 63 */ 64 tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp); 65 66 tcp->tcp_issocket = B_FALSE; 67 TCP_STAT(tcp->tcp_tcps, tcp_sock_fallback); 68 } 69 70 /* Shorthand to generate and send TPI error acks to our client */ 71 void 72 tcp_err_ack(tcp_t *tcp, mblk_t *mp, int t_error, int sys_error) 73 { 74 if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL) 75 putnext(tcp->tcp_connp->conn_rq, mp); 76 } 77 78 /* Shorthand to generate and send TPI error acks to our client */ 79 void 80 tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive, 81 int t_error, int sys_error) 82 { 83 struct T_error_ack *teackp; 84 85 if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack), 86 M_PCPROTO, T_ERROR_ACK)) != NULL) { 87 teackp = (struct T_error_ack *)mp->b_rptr; 88 teackp->ERROR_prim = primitive; 89 teackp->TLI_error = t_error; 90 teackp->UNIX_error = sys_error; 91 putnext(tcp->tcp_connp->conn_rq, mp); 92 } 93 } 94 95 /* 96 * TCP routine to get the values of options. 97 */ 98 int 99 tcp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr) 100 { 101 return (tcp_opt_get(Q_TO_CONN(q), level, name, ptr)); 102 } 103 104 /* ARGSUSED */ 105 int 106 tcp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name, 107 uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, 108 void *thisdg_attrs, cred_t *cr) 109 { 110 conn_t *connp = Q_TO_CONN(q); 111 112 return (tcp_opt_set(connp, optset_context, level, name, inlen, invalp, 113 outlenp, outvalp, thisdg_attrs, cr)); 114 } 115 116 static int 117 tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp, int *do_disconnectp, 118 int *t_errorp, int *sys_errorp) 119 { 120 int error; 121 int is_absreq_failure; 122 t_scalar_t *opt_lenp; 123 t_scalar_t opt_offset; 124 int prim_type; 125 struct T_conn_req *tcreqp; 126 struct T_conn_res *tcresp; 127 cred_t *cr; 128 129 /* 130 * All Solaris components should pass a db_credp 131 * for this TPI message, hence we ASSERT. 132 * But in case there is some other M_PROTO that looks 133 * like a TPI message sent by some other kernel 134 * component, we check and return an error. 135 */ 136 cr = msg_getcred(mp, NULL); 137 ASSERT(cr != NULL); 138 if (cr == NULL) 139 return (-1); 140 141 prim_type = ((union T_primitives *)mp->b_rptr)->type; 142 ASSERT(prim_type == T_CONN_REQ || prim_type == O_T_CONN_RES || 143 prim_type == T_CONN_RES); 144 145 switch (prim_type) { 146 case T_CONN_REQ: 147 tcreqp = (struct T_conn_req *)mp->b_rptr; 148 opt_offset = tcreqp->OPT_offset; 149 opt_lenp = (t_scalar_t *)&tcreqp->OPT_length; 150 break; 151 case O_T_CONN_RES: 152 case T_CONN_RES: 153 tcresp = (struct T_conn_res *)mp->b_rptr; 154 opt_offset = tcresp->OPT_offset; 155 opt_lenp = (t_scalar_t *)&tcresp->OPT_length; 156 break; 157 } 158 159 *t_errorp = 0; 160 *sys_errorp = 0; 161 *do_disconnectp = 0; 162 163 error = tpi_optcom_buf(tcp->tcp_connp->conn_wq, mp, opt_lenp, 164 opt_offset, cr, &tcp_opt_obj, 165 NULL, &is_absreq_failure); 166 167 switch (error) { 168 case 0: /* no error */ 169 ASSERT(is_absreq_failure == 0); 170 return (0); 171 case ENOPROTOOPT: 172 *t_errorp = TBADOPT; 173 break; 174 case EACCES: 175 *t_errorp = TACCES; 176 break; 177 default: 178 *t_errorp = TSYSERR; *sys_errorp = error; 179 break; 180 } 181 if (is_absreq_failure != 0) { 182 /* 183 * The connection request should get the local ack 184 * T_OK_ACK and then a T_DISCON_IND. 185 */ 186 *do_disconnectp = 1; 187 } 188 return (-1); 189 } 190 191 void 192 tcp_tpi_bind(tcp_t *tcp, mblk_t *mp) 193 { 194 int error; 195 conn_t *connp = tcp->tcp_connp; 196 struct sockaddr *sa; 197 mblk_t *mp1; 198 struct T_bind_req *tbr; 199 int backlog; 200 socklen_t len; 201 sin_t *sin; 202 sin6_t *sin6; 203 cred_t *cr; 204 205 /* 206 * All Solaris components should pass a db_credp 207 * for this TPI message, hence we ASSERT. 208 * But in case there is some other M_PROTO that looks 209 * like a TPI message sent by some other kernel 210 * component, we check and return an error. 211 */ 212 cr = msg_getcred(mp, NULL); 213 ASSERT(cr != NULL); 214 if (cr == NULL) { 215 tcp_err_ack(tcp, mp, TSYSERR, EINVAL); 216 return; 217 } 218 219 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); 220 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) { 221 if (connp->conn_debug) { 222 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 223 "tcp_tpi_bind: bad req, len %u", 224 (uint_t)(mp->b_wptr - mp->b_rptr)); 225 } 226 tcp_err_ack(tcp, mp, TPROTO, 0); 227 return; 228 } 229 /* Make sure the largest address fits */ 230 mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1); 231 if (mp1 == NULL) { 232 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 233 return; 234 } 235 mp = mp1; 236 tbr = (struct T_bind_req *)mp->b_rptr; 237 238 backlog = tbr->CONIND_number; 239 len = tbr->ADDR_length; 240 241 switch (len) { 242 case 0: /* request for a generic port */ 243 tbr->ADDR_offset = sizeof (struct T_bind_req); 244 if (connp->conn_family == AF_INET) { 245 tbr->ADDR_length = sizeof (sin_t); 246 sin = (sin_t *)&tbr[1]; 247 *sin = sin_null; 248 sin->sin_family = AF_INET; 249 sa = (struct sockaddr *)sin; 250 len = sizeof (sin_t); 251 mp->b_wptr = (uchar_t *)&sin[1]; 252 } else { 253 ASSERT(connp->conn_family == AF_INET6); 254 tbr->ADDR_length = sizeof (sin6_t); 255 sin6 = (sin6_t *)&tbr[1]; 256 *sin6 = sin6_null; 257 sin6->sin6_family = AF_INET6; 258 sa = (struct sockaddr *)sin6; 259 len = sizeof (sin6_t); 260 mp->b_wptr = (uchar_t *)&sin6[1]; 261 } 262 break; 263 264 case sizeof (sin_t): /* Complete IPv4 address */ 265 sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset, 266 sizeof (sin_t)); 267 break; 268 269 case sizeof (sin6_t): /* Complete IPv6 address */ 270 sa = (struct sockaddr *)mi_offset_param(mp, 271 tbr->ADDR_offset, sizeof (sin6_t)); 272 break; 273 274 default: 275 if (connp->conn_debug) { 276 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 277 "tcp_tpi_bind: bad address length, %d", 278 tbr->ADDR_length); 279 } 280 tcp_err_ack(tcp, mp, TBADADDR, 0); 281 return; 282 } 283 284 if (backlog > 0) { 285 error = tcp_do_listen(connp, sa, len, backlog, DB_CRED(mp), 286 tbr->PRIM_type != O_T_BIND_REQ); 287 } else { 288 error = tcp_do_bind(connp, sa, len, DB_CRED(mp), 289 tbr->PRIM_type != O_T_BIND_REQ); 290 } 291 done: 292 if (error > 0) { 293 tcp_err_ack(tcp, mp, TSYSERR, error); 294 } else if (error < 0) { 295 tcp_err_ack(tcp, mp, -error, 0); 296 } else { 297 /* 298 * Update port information as sockfs/tpi needs it for checking 299 */ 300 if (connp->conn_family == AF_INET) { 301 sin = (sin_t *)sa; 302 sin->sin_port = connp->conn_lport; 303 } else { 304 sin6 = (sin6_t *)sa; 305 sin6->sin6_port = connp->conn_lport; 306 } 307 mp->b_datap->db_type = M_PCPROTO; 308 tbr->PRIM_type = T_BIND_ACK; 309 putnext(connp->conn_rq, mp); 310 } 311 } 312 313 /* tcp_unbind is called by tcp_wput_proto to handle T_UNBIND_REQ messages. */ 314 void 315 tcp_tpi_unbind(tcp_t *tcp, mblk_t *mp) 316 { 317 conn_t *connp = tcp->tcp_connp; 318 int error; 319 320 error = tcp_do_unbind(connp); 321 if (error > 0) { 322 tcp_err_ack(tcp, mp, TSYSERR, error); 323 } else if (error < 0) { 324 tcp_err_ack(tcp, mp, -error, 0); 325 } else { 326 /* Send M_FLUSH according to TPI */ 327 (void) putnextctl1(connp->conn_rq, M_FLUSH, FLUSHRW); 328 329 mp = mi_tpi_ok_ack_alloc(mp); 330 if (mp != NULL) 331 putnext(connp->conn_rq, mp); 332 } 333 } 334 335 int 336 tcp_tpi_close(queue_t *q, int flags) 337 { 338 conn_t *connp; 339 340 ASSERT(WR(q)->q_next == NULL); 341 342 if (flags & SO_FALLBACK) { 343 /* 344 * stream is being closed while in fallback 345 * simply free the resources that were allocated 346 */ 347 inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr)); 348 qprocsoff(q); 349 goto done; 350 } 351 352 connp = Q_TO_CONN(q); 353 /* 354 * We are being closed as /dev/tcp or /dev/tcp6. 355 */ 356 tcp_close_common(connp, flags); 357 358 qprocsoff(q); 359 inet_minor_free(connp->conn_minor_arena, connp->conn_dev); 360 361 /* 362 * Drop IP's reference on the conn. This is the last reference 363 * on the connp if the state was less than established. If the 364 * connection has gone into timewait state, then we will have 365 * one ref for the TCP and one more ref (total of two) for the 366 * classifier connected hash list (a timewait connections stays 367 * in connected hash till closed). 368 * 369 * We can't assert the references because there might be other 370 * transient reference places because of some walkers or queued 371 * packets in squeue for the timewait state. 372 */ 373 CONN_DEC_REF(connp); 374 done: 375 q->q_ptr = WR(q)->q_ptr = NULL; 376 return (0); 377 } 378 379 int 380 tcp_tpi_close_accept(queue_t *q) 381 { 382 vmem_t *minor_arena; 383 dev_t conn_dev; 384 extern struct qinit tcp_acceptor_winit; 385 386 ASSERT(WR(q)->q_qinfo == &tcp_acceptor_winit); 387 388 /* 389 * We had opened an acceptor STREAM for sockfs which is 390 * now being closed due to some error. 391 */ 392 qprocsoff(q); 393 394 minor_arena = (vmem_t *)WR(q)->q_ptr; 395 conn_dev = (dev_t)RD(q)->q_ptr; 396 ASSERT(minor_arena != NULL); 397 ASSERT(conn_dev != 0); 398 inet_minor_free(minor_arena, conn_dev); 399 q->q_ptr = WR(q)->q_ptr = NULL; 400 return (0); 401 } 402 403 /* 404 * Put a connection confirmation message upstream built from the 405 * address/flowid information with the conn and iph. Report our success or 406 * failure. 407 */ 408 boolean_t 409 tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, mblk_t *idmp, 410 mblk_t **defermp, ip_recv_attr_t *ira) 411 { 412 sin_t sin; 413 sin6_t sin6; 414 mblk_t *mp; 415 char *optp = NULL; 416 int optlen = 0; 417 conn_t *connp = tcp->tcp_connp; 418 419 if (defermp != NULL) 420 *defermp = NULL; 421 422 if (tcp->tcp_conn.tcp_opts_conn_req != NULL) { 423 /* 424 * Return in T_CONN_CON results of option negotiation through 425 * the T_CONN_REQ. Note: If there is an real end-to-end option 426 * negotiation, then what is received from remote end needs 427 * to be taken into account but there is no such thing (yet?) 428 * in our TCP/IP. 429 * Note: We do not use mi_offset_param() here as 430 * tcp_opts_conn_req contents do not directly come from 431 * an application and are either generated in kernel or 432 * from user input that was already verified. 433 */ 434 mp = tcp->tcp_conn.tcp_opts_conn_req; 435 optp = (char *)(mp->b_rptr + 436 ((struct T_conn_req *)mp->b_rptr)->OPT_offset); 437 optlen = (int) 438 ((struct T_conn_req *)mp->b_rptr)->OPT_length; 439 } 440 441 if (IPH_HDR_VERSION(iphdr) == IPV4_VERSION) { 442 443 /* packet is IPv4 */ 444 if (connp->conn_family == AF_INET) { 445 sin = sin_null; 446 sin.sin_addr.s_addr = connp->conn_faddr_v4; 447 sin.sin_port = connp->conn_fport; 448 sin.sin_family = AF_INET; 449 mp = mi_tpi_conn_con(NULL, (char *)&sin, 450 (int)sizeof (sin_t), optp, optlen); 451 } else { 452 sin6 = sin6_null; 453 sin6.sin6_addr = connp->conn_faddr_v6; 454 sin6.sin6_port = connp->conn_fport; 455 sin6.sin6_family = AF_INET6; 456 mp = mi_tpi_conn_con(NULL, (char *)&sin6, 457 (int)sizeof (sin6_t), optp, optlen); 458 459 } 460 } else { 461 ip6_t *ip6h = (ip6_t *)iphdr; 462 463 ASSERT(IPH_HDR_VERSION(iphdr) == IPV6_VERSION); 464 ASSERT(connp->conn_family == AF_INET6); 465 sin6 = sin6_null; 466 sin6.sin6_addr = connp->conn_faddr_v6; 467 sin6.sin6_port = connp->conn_fport; 468 sin6.sin6_family = AF_INET6; 469 sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; 470 mp = mi_tpi_conn_con(NULL, (char *)&sin6, 471 (int)sizeof (sin6_t), optp, optlen); 472 } 473 474 if (!mp) 475 return (B_FALSE); 476 477 mblk_copycred(mp, idmp); 478 479 if (defermp == NULL) { 480 conn_t *connp = tcp->tcp_connp; 481 if (IPCL_IS_NONSTR(connp)) { 482 (*connp->conn_upcalls->su_connected) 483 (connp->conn_upper_handle, tcp->tcp_connid, 484 ira->ira_cred, ira->ira_cpid); 485 freemsg(mp); 486 } else { 487 if (ira->ira_cred != NULL) { 488 /* So that getpeerucred works for TPI sockfs */ 489 mblk_setcred(mp, ira->ira_cred, ira->ira_cpid); 490 } 491 putnext(connp->conn_rq, mp); 492 } 493 } else { 494 *defermp = mp; 495 } 496 497 if (tcp->tcp_conn.tcp_opts_conn_req != NULL) 498 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); 499 return (B_TRUE); 500 } 501 502 /* 503 * Successful connect request processing begins when our client passes 504 * a T_CONN_REQ message into tcp_wput(), which performs function calls into 505 * IP and the passes a T_OK_ACK (or T_ERROR_ACK upstream). 506 * 507 * After various error checks are completed, tcp_tpi_connect() lays 508 * the target address and port into the composite header template. 509 * Then we ask IP for information, including a source address if we didn't 510 * already have one. Finally we prepare to send the SYN packet, and then 511 * send up the T_OK_ACK reply message. 512 */ 513 void 514 tcp_tpi_connect(tcp_t *tcp, mblk_t *mp) 515 { 516 sin_t *sin; 517 struct T_conn_req *tcr; 518 struct sockaddr *sa; 519 socklen_t len; 520 int error; 521 cred_t *cr; 522 pid_t cpid; 523 conn_t *connp = tcp->tcp_connp; 524 queue_t *q = connp->conn_wq; 525 526 /* 527 * All Solaris components should pass a db_credp 528 * for this TPI message, hence we ASSERT. 529 * But in case there is some other M_PROTO that looks 530 * like a TPI message sent by some other kernel 531 * component, we check and return an error. 532 */ 533 cr = msg_getcred(mp, &cpid); 534 ASSERT(cr != NULL); 535 if (cr == NULL) { 536 tcp_err_ack(tcp, mp, TSYSERR, EINVAL); 537 return; 538 } 539 540 tcr = (struct T_conn_req *)mp->b_rptr; 541 542 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); 543 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) { 544 tcp_err_ack(tcp, mp, TPROTO, 0); 545 return; 546 } 547 548 /* 549 * Pre-allocate the T_ordrel_ind mblk so that at close time, we 550 * will always have that to send up. Otherwise, we need to do 551 * special handling in case the allocation fails at that time. 552 * If the end point is TPI, the tcp_t can be reused and the 553 * tcp_ordrel_mp may be allocated already. 554 */ 555 if (tcp->tcp_ordrel_mp == NULL) { 556 if ((tcp->tcp_ordrel_mp = mi_tpi_ordrel_ind()) == NULL) { 557 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 558 return; 559 } 560 } 561 562 /* 563 * Determine packet type based on type of address passed in 564 * the request should contain an IPv4 or IPv6 address. 565 * Make sure that address family matches the type of 566 * family of the address passed down. 567 */ 568 switch (tcr->DEST_length) { 569 default: 570 tcp_err_ack(tcp, mp, TBADADDR, 0); 571 return; 572 573 case (sizeof (sin_t) - sizeof (sin->sin_zero)): { 574 /* 575 * XXX: The check for valid DEST_length was not there 576 * in earlier releases and some buggy 577 * TLI apps (e.g Sybase) got away with not feeding 578 * in sin_zero part of address. 579 * We allow that bug to keep those buggy apps humming. 580 * Test suites require the check on DEST_length. 581 * We construct a new mblk with valid DEST_length 582 * free the original so the rest of the code does 583 * not have to keep track of this special shorter 584 * length address case. 585 */ 586 mblk_t *nmp; 587 struct T_conn_req *ntcr; 588 sin_t *nsin; 589 590 nmp = allocb(sizeof (struct T_conn_req) + sizeof (sin_t) + 591 tcr->OPT_length, BPRI_HI); 592 if (nmp == NULL) { 593 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 594 return; 595 } 596 ntcr = (struct T_conn_req *)nmp->b_rptr; 597 bzero(ntcr, sizeof (struct T_conn_req)); /* zero fill */ 598 ntcr->PRIM_type = T_CONN_REQ; 599 ntcr->DEST_length = sizeof (sin_t); 600 ntcr->DEST_offset = sizeof (struct T_conn_req); 601 602 nsin = (sin_t *)((uchar_t *)ntcr + ntcr->DEST_offset); 603 *nsin = sin_null; 604 /* Get pointer to shorter address to copy from original mp */ 605 sin = (sin_t *)mi_offset_param(mp, tcr->DEST_offset, 606 tcr->DEST_length); /* extract DEST_length worth of sin_t */ 607 if (sin == NULL || !OK_32PTR((char *)sin)) { 608 freemsg(nmp); 609 tcp_err_ack(tcp, mp, TSYSERR, EINVAL); 610 return; 611 } 612 nsin->sin_family = sin->sin_family; 613 nsin->sin_port = sin->sin_port; 614 nsin->sin_addr = sin->sin_addr; 615 /* Note:nsin->sin_zero zero-fill with sin_null assign above */ 616 nmp->b_wptr = (uchar_t *)&nsin[1]; 617 if (tcr->OPT_length != 0) { 618 ntcr->OPT_length = tcr->OPT_length; 619 ntcr->OPT_offset = nmp->b_wptr - nmp->b_rptr; 620 bcopy((uchar_t *)tcr + tcr->OPT_offset, 621 (uchar_t *)ntcr + ntcr->OPT_offset, 622 tcr->OPT_length); 623 nmp->b_wptr += tcr->OPT_length; 624 } 625 freemsg(mp); /* original mp freed */ 626 mp = nmp; /* re-initialize original variables */ 627 tcr = ntcr; 628 } 629 /* FALLTHRU */ 630 631 case sizeof (sin_t): 632 sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset, 633 sizeof (sin_t)); 634 len = sizeof (sin_t); 635 break; 636 637 case sizeof (sin6_t): 638 sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset, 639 sizeof (sin6_t)); 640 len = sizeof (sin6_t); 641 break; 642 } 643 644 error = proto_verify_ip_addr(connp->conn_family, sa, len); 645 if (error != 0) { 646 tcp_err_ack(tcp, mp, TSYSERR, error); 647 return; 648 } 649 650 /* 651 * TODO: If someone in TCPS_TIME_WAIT has this dst/port we 652 * should key on their sequence number and cut them loose. 653 */ 654 655 /* 656 * If options passed in, feed it for verification and handling 657 */ 658 if (tcr->OPT_length != 0) { 659 mblk_t *ok_mp; 660 mblk_t *discon_mp; 661 mblk_t *conn_opts_mp; 662 int t_error, sys_error, do_disconnect; 663 664 conn_opts_mp = NULL; 665 666 if (tcp_conprim_opt_process(tcp, mp, 667 &do_disconnect, &t_error, &sys_error) < 0) { 668 if (do_disconnect) { 669 ASSERT(t_error == 0 && sys_error == 0); 670 discon_mp = mi_tpi_discon_ind(NULL, 671 ECONNREFUSED, 0); 672 if (!discon_mp) { 673 tcp_err_ack_prim(tcp, mp, T_CONN_REQ, 674 TSYSERR, ENOMEM); 675 return; 676 } 677 ok_mp = mi_tpi_ok_ack_alloc(mp); 678 if (!ok_mp) { 679 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ, 680 TSYSERR, ENOMEM); 681 return; 682 } 683 qreply(q, ok_mp); 684 qreply(q, discon_mp); /* no flush! */ 685 } else { 686 ASSERT(t_error != 0); 687 tcp_err_ack_prim(tcp, mp, T_CONN_REQ, t_error, 688 sys_error); 689 } 690 return; 691 } 692 /* 693 * Success in setting options, the mp option buffer represented 694 * by OPT_length/offset has been potentially modified and 695 * contains results of option processing. We copy it in 696 * another mp to save it for potentially influencing returning 697 * it in T_CONN_CONN. 698 */ 699 if (tcr->OPT_length != 0) { /* there are resulting options */ 700 conn_opts_mp = copyb(mp); 701 if (!conn_opts_mp) { 702 tcp_err_ack_prim(tcp, mp, T_CONN_REQ, 703 TSYSERR, ENOMEM); 704 return; 705 } 706 ASSERT(tcp->tcp_conn.tcp_opts_conn_req == NULL); 707 tcp->tcp_conn.tcp_opts_conn_req = conn_opts_mp; 708 /* 709 * Note: 710 * These resulting option negotiation can include any 711 * end-to-end negotiation options but there no such 712 * thing (yet?) in our TCP/IP. 713 */ 714 } 715 } 716 717 /* call the non-TPI version */ 718 error = tcp_do_connect(tcp->tcp_connp, sa, len, cr, cpid); 719 if (error < 0) { 720 mp = mi_tpi_err_ack_alloc(mp, -error, 0); 721 } else if (error > 0) { 722 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, error); 723 } else { 724 mp = mi_tpi_ok_ack_alloc(mp); 725 } 726 727 /* 728 * Note: Code below is the "failure" case 729 */ 730 /* return error ack and blow away saved option results if any */ 731 connect_failed: 732 if (mp != NULL) 733 putnext(connp->conn_rq, mp); 734 else { 735 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ, 736 TSYSERR, ENOMEM); 737 } 738 } 739 740 /* Return the TPI/TLI equivalent of our current tcp_state */ 741 static int 742 tcp_tpistate(tcp_t *tcp) 743 { 744 switch (tcp->tcp_state) { 745 case TCPS_IDLE: 746 return (TS_UNBND); 747 case TCPS_LISTEN: 748 /* 749 * Return whether there are outstanding T_CONN_IND waiting 750 * for the matching T_CONN_RES. Therefore don't count q0. 751 */ 752 if (tcp->tcp_conn_req_cnt_q > 0) 753 return (TS_WRES_CIND); 754 else 755 return (TS_IDLE); 756 case TCPS_BOUND: 757 return (TS_IDLE); 758 case TCPS_SYN_SENT: 759 return (TS_WCON_CREQ); 760 case TCPS_SYN_RCVD: 761 /* 762 * Note: assumption: this has to the active open SYN_RCVD. 763 * The passive instance is detached in SYN_RCVD stage of 764 * incoming connection processing so we cannot get request 765 * for T_info_ack on it. 766 */ 767 return (TS_WACK_CRES); 768 case TCPS_ESTABLISHED: 769 return (TS_DATA_XFER); 770 case TCPS_CLOSE_WAIT: 771 return (TS_WREQ_ORDREL); 772 case TCPS_FIN_WAIT_1: 773 return (TS_WIND_ORDREL); 774 case TCPS_FIN_WAIT_2: 775 return (TS_WIND_ORDREL); 776 777 case TCPS_CLOSING: 778 case TCPS_LAST_ACK: 779 case TCPS_TIME_WAIT: 780 case TCPS_CLOSED: 781 /* 782 * Following TS_WACK_DREQ7 is a rendition of "not 783 * yet TS_IDLE" TPI state. There is no best match to any 784 * TPI state for TCPS_{CLOSING, LAST_ACK, TIME_WAIT} but we 785 * choose a value chosen that will map to TLI/XTI level 786 * state of TSTATECHNG (state is process of changing) which 787 * captures what this dummy state represents. 788 */ 789 return (TS_WACK_DREQ7); 790 default: 791 cmn_err(CE_WARN, "tcp_tpistate: strange state (%d) %s", 792 tcp->tcp_state, tcp_display(tcp, NULL, 793 DISP_PORT_ONLY)); 794 return (TS_UNBND); 795 } 796 } 797 798 static void 799 tcp_copy_info(struct T_info_ack *tia, tcp_t *tcp) 800 { 801 tcp_stack_t *tcps = tcp->tcp_tcps; 802 conn_t *connp = tcp->tcp_connp; 803 extern struct T_info_ack tcp_g_t_info_ack; 804 extern struct T_info_ack tcp_g_t_info_ack_v6; 805 806 if (connp->conn_family == AF_INET6) 807 *tia = tcp_g_t_info_ack_v6; 808 else 809 *tia = tcp_g_t_info_ack; 810 tia->CURRENT_state = tcp_tpistate(tcp); 811 tia->OPT_size = tcp_max_optsize; 812 if (tcp->tcp_mss == 0) { 813 /* Not yet set - tcp_open does not set mss */ 814 if (connp->conn_ipversion == IPV4_VERSION) 815 tia->TIDU_size = tcps->tcps_mss_def_ipv4; 816 else 817 tia->TIDU_size = tcps->tcps_mss_def_ipv6; 818 } else { 819 tia->TIDU_size = tcp->tcp_mss; 820 } 821 /* TODO: Default ETSDU is 1. Is that correct for tcp? */ 822 } 823 824 void 825 tcp_do_capability_ack(tcp_t *tcp, struct T_capability_ack *tcap, 826 t_uscalar_t cap_bits1) 827 { 828 tcap->CAP_bits1 = 0; 829 830 if (cap_bits1 & TC1_INFO) { 831 tcp_copy_info(&tcap->INFO_ack, tcp); 832 tcap->CAP_bits1 |= TC1_INFO; 833 } 834 835 if (cap_bits1 & TC1_ACCEPTOR_ID) { 836 tcap->ACCEPTOR_id = tcp->tcp_acceptor_id; 837 tcap->CAP_bits1 |= TC1_ACCEPTOR_ID; 838 } 839 840 } 841 842 /* 843 * This routine responds to T_CAPABILITY_REQ messages. It is called by 844 * tcp_wput. Much of the T_CAPABILITY_ACK information is copied from 845 * tcp_g_t_info_ack. The current state of the stream is copied from 846 * tcp_state. 847 */ 848 void 849 tcp_capability_req(tcp_t *tcp, mblk_t *mp) 850 { 851 t_uscalar_t cap_bits1; 852 struct T_capability_ack *tcap; 853 854 if (MBLKL(mp) < sizeof (struct T_capability_req)) { 855 freemsg(mp); 856 return; 857 } 858 859 cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1; 860 861 mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack), 862 mp->b_datap->db_type, T_CAPABILITY_ACK); 863 if (mp == NULL) 864 return; 865 866 tcap = (struct T_capability_ack *)mp->b_rptr; 867 tcp_do_capability_ack(tcp, tcap, cap_bits1); 868 869 putnext(tcp->tcp_connp->conn_rq, mp); 870 } 871 872 /* 873 * This routine responds to T_INFO_REQ messages. It is called by tcp_wput. 874 * Most of the T_INFO_ACK information is copied from tcp_g_t_info_ack. 875 * The current state of the stream is copied from tcp_state. 876 */ 877 void 878 tcp_info_req(tcp_t *tcp, mblk_t *mp) 879 { 880 mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO, 881 T_INFO_ACK); 882 if (!mp) { 883 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 884 return; 885 } 886 tcp_copy_info((struct T_info_ack *)mp->b_rptr, tcp); 887 putnext(tcp->tcp_connp->conn_rq, mp); 888 } 889 890 /* Respond to the TPI addr request */ 891 void 892 tcp_addr_req(tcp_t *tcp, mblk_t *mp) 893 { 894 struct sockaddr *sa; 895 mblk_t *ackmp; 896 struct T_addr_ack *taa; 897 conn_t *connp = tcp->tcp_connp; 898 uint_t addrlen; 899 900 /* Make it large enough for worst case */ 901 ackmp = reallocb(mp, sizeof (struct T_addr_ack) + 902 2 * sizeof (sin6_t), 1); 903 if (ackmp == NULL) { 904 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 905 return; 906 } 907 908 taa = (struct T_addr_ack *)ackmp->b_rptr; 909 910 bzero(taa, sizeof (struct T_addr_ack)); 911 ackmp->b_wptr = (uchar_t *)&taa[1]; 912 913 taa->PRIM_type = T_ADDR_ACK; 914 ackmp->b_datap->db_type = M_PCPROTO; 915 916 if (connp->conn_family == AF_INET) 917 addrlen = sizeof (sin_t); 918 else 919 addrlen = sizeof (sin6_t); 920 921 /* 922 * Note: Following code assumes 32 bit alignment of basic 923 * data structures like sin_t and struct T_addr_ack. 924 */ 925 if (tcp->tcp_state >= TCPS_BOUND) { 926 /* 927 * Fill in local address first 928 */ 929 taa->LOCADDR_offset = sizeof (*taa); 930 taa->LOCADDR_length = addrlen; 931 sa = (struct sockaddr *)&taa[1]; 932 (void) conn_getsockname(connp, sa, &addrlen); 933 ackmp->b_wptr += addrlen; 934 } 935 if (tcp->tcp_state >= TCPS_SYN_RCVD) { 936 /* 937 * Fill in Remote address 938 */ 939 taa->REMADDR_length = addrlen; 940 /* assumed 32-bit alignment */ 941 taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length; 942 sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset); 943 (void) conn_getpeername(connp, sa, &addrlen); 944 ackmp->b_wptr += addrlen; 945 } 946 ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim); 947 putnext(tcp->tcp_connp->conn_rq, ackmp); 948 } 949 950 /* 951 * Swap information between the eager and acceptor for a TLI/XTI client. 952 * The sockfs accept is done on the acceptor stream and control goes 953 * through tcp_tli_accept() and tcp_accept()/tcp_accept_swap() is not 954 * called. In either case, both the eager and listener are in their own 955 * perimeter (squeue) and the code has to deal with potential race. 956 * 957 * See the block comment on top of tcp_accept() and tcp_tli_accept(). 958 */ 959 static void 960 tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager) 961 { 962 conn_t *econnp, *aconnp; 963 964 ASSERT(eager->tcp_connp->conn_rq == listener->tcp_connp->conn_rq); 965 ASSERT(eager->tcp_detached && !acceptor->tcp_detached); 966 ASSERT(!TCP_IS_SOCKET(acceptor)); 967 ASSERT(!TCP_IS_SOCKET(eager)); 968 ASSERT(!TCP_IS_SOCKET(listener)); 969 970 /* 971 * Trusted Extensions may need to use a security label that is 972 * different from the acceptor's label on MLP and MAC-Exempt 973 * sockets. If this is the case, the required security label 974 * already exists in econnp->conn_ixa->ixa_tsl. Since we make the 975 * acceptor stream refer to econnp we atomatically get that label. 976 */ 977 978 acceptor->tcp_detached = B_TRUE; 979 /* 980 * To permit stream re-use by TLI/XTI, the eager needs a copy of 981 * the acceptor id. 982 */ 983 eager->tcp_acceptor_id = acceptor->tcp_acceptor_id; 984 985 /* remove eager from listen list... */ 986 mutex_enter(&listener->tcp_eager_lock); 987 tcp_eager_unlink(eager); 988 ASSERT(eager->tcp_eager_next_q == NULL && 989 eager->tcp_eager_last_q == NULL); 990 ASSERT(eager->tcp_eager_next_q0 == NULL && 991 eager->tcp_eager_prev_q0 == NULL); 992 mutex_exit(&listener->tcp_eager_lock); 993 994 econnp = eager->tcp_connp; 995 aconnp = acceptor->tcp_connp; 996 econnp->conn_rq = aconnp->conn_rq; 997 econnp->conn_wq = aconnp->conn_wq; 998 econnp->conn_rq->q_ptr = econnp; 999 econnp->conn_wq->q_ptr = econnp; 1000 1001 /* 1002 * In the TLI/XTI loopback case, we are inside the listener's squeue, 1003 * which might be a different squeue from our peer TCP instance. 1004 * For TCP Fusion, the peer expects that whenever tcp_detached is 1005 * clear, our TCP queues point to the acceptor's queues. Thus, use 1006 * membar_producer() to ensure that the assignments of conn_rq/conn_wq 1007 * above reach global visibility prior to the clearing of tcp_detached. 1008 */ 1009 membar_producer(); 1010 eager->tcp_detached = B_FALSE; 1011 1012 ASSERT(eager->tcp_ack_tid == 0); 1013 1014 econnp->conn_dev = aconnp->conn_dev; 1015 econnp->conn_minor_arena = aconnp->conn_minor_arena; 1016 1017 ASSERT(econnp->conn_minor_arena != NULL); 1018 if (econnp->conn_cred != NULL) 1019 crfree(econnp->conn_cred); 1020 econnp->conn_cred = aconnp->conn_cred; 1021 ASSERT(!(econnp->conn_ixa->ixa_free_flags & IXA_FREE_CRED)); 1022 econnp->conn_ixa->ixa_cred = econnp->conn_cred; 1023 aconnp->conn_cred = NULL; 1024 econnp->conn_cpid = aconnp->conn_cpid; 1025 ASSERT(econnp->conn_netstack == aconnp->conn_netstack); 1026 ASSERT(eager->tcp_tcps == acceptor->tcp_tcps); 1027 1028 econnp->conn_zoneid = aconnp->conn_zoneid; 1029 econnp->conn_allzones = aconnp->conn_allzones; 1030 econnp->conn_ixa->ixa_zoneid = aconnp->conn_ixa->ixa_zoneid; 1031 1032 econnp->conn_mac_mode = aconnp->conn_mac_mode; 1033 econnp->conn_zone_is_global = aconnp->conn_zone_is_global; 1034 aconnp->conn_mac_mode = CONN_MAC_DEFAULT; 1035 1036 /* Do the IPC initialization */ 1037 CONN_INC_REF(econnp); 1038 1039 /* Done with old IPC. Drop its ref on its connp */ 1040 CONN_DEC_REF(aconnp); 1041 } 1042 1043 /* 1044 * This runs at the tail end of accept processing on the squeue of the 1045 * new connection. 1046 */ 1047 /* ARGSUSED */ 1048 static void 1049 tcp_accept_finish(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) 1050 { 1051 conn_t *connp = (conn_t *)arg; 1052 tcp_t *tcp = connp->conn_tcp; 1053 queue_t *q = connp->conn_rq; 1054 tcp_stack_t *tcps = tcp->tcp_tcps; 1055 struct stroptions *stropt; 1056 struct sock_proto_props sopp; 1057 1058 /* Should never be called for non-STREAMS sockets */ 1059 ASSERT(!IPCL_IS_NONSTR(connp)); 1060 1061 /* We should just receive a single mblk that fits a T_discon_ind */ 1062 ASSERT(mp->b_cont == NULL); 1063 1064 /* 1065 * Drop the eager's ref on the listener, that was placed when 1066 * this eager began life in tcp_input_listener. 1067 */ 1068 CONN_DEC_REF(tcp->tcp_saved_listener->tcp_connp); 1069 1070 tcp->tcp_detached = B_FALSE; 1071 1072 if (tcp->tcp_state <= TCPS_BOUND || tcp->tcp_accept_error) { 1073 /* 1074 * Someone blewoff the eager before we could finish 1075 * the accept. 1076 * 1077 * The only reason eager exists it because we put in 1078 * a ref on it when conn ind went up. We need to send 1079 * a disconnect indication up while the last reference 1080 * on the eager will be dropped by the squeue when we 1081 * return. 1082 */ 1083 ASSERT(tcp->tcp_listener == NULL); 1084 if (tcp->tcp_issocket || tcp->tcp_send_discon_ind) { 1085 struct T_discon_ind *tdi; 1086 1087 (void) putnextctl1(q, M_FLUSH, FLUSHRW); 1088 /* 1089 * Let us reuse the incoming mblk to avoid 1090 * memory allocation failure problems. We know 1091 * that the size of the incoming mblk i.e. 1092 * stroptions is greater than sizeof 1093 * T_discon_ind. 1094 */ 1095 ASSERT(DB_REF(mp) == 1); 1096 ASSERT(MBLKSIZE(mp) >= 1097 sizeof (struct T_discon_ind)); 1098 1099 DB_TYPE(mp) = M_PROTO; 1100 ((union T_primitives *)mp->b_rptr)->type = 1101 T_DISCON_IND; 1102 tdi = (struct T_discon_ind *)mp->b_rptr; 1103 if (tcp->tcp_issocket) { 1104 tdi->DISCON_reason = ECONNREFUSED; 1105 tdi->SEQ_number = 0; 1106 } else { 1107 tdi->DISCON_reason = ENOPROTOOPT; 1108 tdi->SEQ_number = 1109 tcp->tcp_conn_req_seqnum; 1110 } 1111 mp->b_wptr = mp->b_rptr + 1112 sizeof (struct T_discon_ind); 1113 putnext(q, mp); 1114 } 1115 tcp->tcp_hard_binding = B_FALSE; 1116 return; 1117 } 1118 1119 /* 1120 * This is the first time we run on the correct 1121 * queue after tcp_accept. So fix all the q parameters 1122 * here. 1123 * 1124 * Let us reuse the incoming mblk to avoid 1125 * memory allocation failure problems. We know 1126 * that the size of the incoming mblk is at least 1127 * stroptions 1128 */ 1129 tcp_get_proto_props(tcp, &sopp); 1130 1131 ASSERT(DB_REF(mp) == 1); 1132 ASSERT(MBLKSIZE(mp) >= sizeof (struct stroptions)); 1133 1134 DB_TYPE(mp) = M_SETOPTS; 1135 stropt = (struct stroptions *)mp->b_rptr; 1136 mp->b_wptr = mp->b_rptr + sizeof (struct stroptions); 1137 stropt = (struct stroptions *)mp->b_rptr; 1138 ASSERT(sopp.sopp_flags & (SO_HIWAT|SO_WROFF|SO_MAXBLK)); 1139 stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK; 1140 stropt->so_hiwat = sopp.sopp_rxhiwat; 1141 stropt->so_wroff = sopp.sopp_wroff; 1142 stropt->so_maxblk = sopp.sopp_maxblk; 1143 1144 /* Send the options up */ 1145 putnext(q, mp); 1146 1147 /* 1148 * Pass up any data and/or a fin that has been received. 1149 * 1150 * Adjust receive window in case it had decreased 1151 * (because there is data <=> tcp_rcv_list != NULL) 1152 * while the connection was detached. Note that 1153 * in case the eager was flow-controlled, w/o this 1154 * code, the rwnd may never open up again! 1155 */ 1156 if (tcp->tcp_rcv_list != NULL) { 1157 /* We drain directly in case of fused tcp loopback */ 1158 1159 if (!tcp->tcp_fused && canputnext(q)) { 1160 tcp->tcp_rwnd = connp->conn_rcvbuf; 1161 if (tcp->tcp_state >= TCPS_ESTABLISHED && 1162 tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) { 1163 tcp_xmit_ctl(NULL, 1164 tcp, (tcp->tcp_swnd == 0) ? 1165 tcp->tcp_suna : tcp->tcp_snxt, 1166 tcp->tcp_rnxt, TH_ACK); 1167 } 1168 } 1169 1170 (void) tcp_rcv_drain(tcp); 1171 1172 /* 1173 * For fused tcp loopback, back-enable peer endpoint 1174 * if it's currently flow-controlled. 1175 */ 1176 if (tcp->tcp_fused) { 1177 tcp_t *peer_tcp = tcp->tcp_loopback_peer; 1178 1179 ASSERT(peer_tcp != NULL); 1180 ASSERT(peer_tcp->tcp_fused); 1181 1182 mutex_enter(&peer_tcp->tcp_non_sq_lock); 1183 if (peer_tcp->tcp_flow_stopped) { 1184 tcp_clrqfull(peer_tcp); 1185 TCP_STAT(tcps, tcp_fusion_backenabled); 1186 } 1187 mutex_exit(&peer_tcp->tcp_non_sq_lock); 1188 } 1189 } 1190 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg); 1191 if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) { 1192 tcp->tcp_ordrel_done = B_TRUE; 1193 mp = tcp->tcp_ordrel_mp; 1194 tcp->tcp_ordrel_mp = NULL; 1195 putnext(q, mp); 1196 } 1197 tcp->tcp_hard_binding = B_FALSE; 1198 1199 if (connp->conn_keepalive) { 1200 tcp->tcp_ka_last_intrvl = 0; 1201 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer, 1202 tcp->tcp_ka_interval); 1203 } 1204 1205 /* 1206 * At this point, eager is fully established and will 1207 * have the following references - 1208 * 1209 * 2 references for connection to exist (1 for TCP and 1 for IP). 1210 * 1 reference for the squeue which will be dropped by the squeue as 1211 * soon as this function returns. 1212 * There will be 1 additonal reference for being in classifier 1213 * hash list provided something bad hasn't happened. 1214 */ 1215 ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) || 1216 (connp->conn_fanout == NULL && connp->conn_ref >= 3)); 1217 } 1218 1219 /* 1220 * Pull a deferred connection indication off of the listener. The caller 1221 * must verify that there is a deferred conn ind under eager_lock before 1222 * calling this function. 1223 */ 1224 static mblk_t * 1225 tcp_get_def_conn_ind(tcp_t *listener) 1226 { 1227 tcp_t *tail; 1228 tcp_t *tcp; 1229 mblk_t *conn_ind; 1230 1231 ASSERT(MUTEX_HELD(&listener->tcp_eager_lock)); 1232 ASSERT(listener->tcp_eager_prev_q0->tcp_conn_def_q0); 1233 1234 tcp = listener->tcp_eager_prev_q0; 1235 /* 1236 * listener->tcp_eager_prev_q0 points to the TAIL of the 1237 * deferred T_conn_ind queue. We need to get to the head 1238 * of the queue in order to send up T_conn_ind the same 1239 * order as how the 3WHS is completed. 1240 */ 1241 while (tcp != listener) { 1242 if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0) 1243 break; 1244 else 1245 tcp = tcp->tcp_eager_prev_q0; 1246 } 1247 1248 conn_ind = tcp->tcp_conn.tcp_eager_conn_ind; 1249 tcp->tcp_conn.tcp_eager_conn_ind = NULL; 1250 /* Move from q0 to q */ 1251 ASSERT(listener->tcp_conn_req_cnt_q0 > 0); 1252 listener->tcp_conn_req_cnt_q0--; 1253 listener->tcp_conn_req_cnt_q++; 1254 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 1255 tcp->tcp_eager_prev_q0; 1256 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 1257 tcp->tcp_eager_next_q0; 1258 tcp->tcp_eager_prev_q0 = NULL; 1259 tcp->tcp_eager_next_q0 = NULL; 1260 tcp->tcp_conn_def_q0 = B_FALSE; 1261 1262 /* Make sure the tcp isn't in the list of droppables */ 1263 ASSERT(tcp->tcp_eager_next_drop_q0 == NULL && 1264 tcp->tcp_eager_prev_drop_q0 == NULL); 1265 1266 /* 1267 * Insert at end of the queue because sockfs sends 1268 * down T_CONN_RES in chronological order. Leaving 1269 * the older conn indications at front of the queue 1270 * helps reducing search time. 1271 */ 1272 tail = listener->tcp_eager_last_q; 1273 if (tail != NULL) { 1274 tail->tcp_eager_next_q = tcp; 1275 } else { 1276 listener->tcp_eager_next_q = tcp; 1277 } 1278 listener->tcp_eager_last_q = tcp; 1279 tcp->tcp_eager_next_q = NULL; 1280 1281 return (conn_ind); 1282 } 1283 1284 1285 /* 1286 * Reply to a clients T_CONN_RES TPI message. This function 1287 * is used only for TLI/XTI listener. Sockfs sends T_CONN_RES 1288 * on the acceptor STREAM and processed in tcp_accept_common(). 1289 * Read the block comment on top of tcp_input_listener(). 1290 */ 1291 void 1292 tcp_tli_accept(tcp_t *listener, mblk_t *mp) 1293 { 1294 tcp_t *acceptor; 1295 tcp_t *eager; 1296 struct T_conn_res *tcr; 1297 t_uscalar_t acceptor_id; 1298 t_scalar_t seqnum; 1299 mblk_t *discon_mp = NULL; 1300 mblk_t *ok_mp; 1301 mblk_t *mp1; 1302 tcp_stack_t *tcps = listener->tcp_tcps; 1303 conn_t *econnp; 1304 1305 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) { 1306 tcp_err_ack(listener, mp, TPROTO, 0); 1307 return; 1308 } 1309 tcr = (struct T_conn_res *)mp->b_rptr; 1310 1311 /* 1312 * Under ILP32 the stream head points tcr->ACCEPTOR_id at the 1313 * read side queue of the streams device underneath us i.e. the 1314 * read side queue of 'ip'. Since we can't deference QUEUE_ptr we 1315 * look it up in the queue_hash. Under LP64 it sends down the 1316 * minor_t of the accepting endpoint. 1317 * 1318 * Once the acceptor/eager are modified (in tcp_accept_swap) the 1319 * fanout hash lock is held. 1320 * This prevents any thread from entering the acceptor queue from 1321 * below (since it has not been hard bound yet i.e. any inbound 1322 * packets will arrive on the listener conn_t and 1323 * go through the classifier). 1324 * The CONN_INC_REF will prevent the acceptor from closing. 1325 * 1326 * XXX It is still possible for a tli application to send down data 1327 * on the accepting stream while another thread calls t_accept. 1328 * This should not be a problem for well-behaved applications since 1329 * the T_OK_ACK is sent after the queue swapping is completed. 1330 * 1331 * If the accepting fd is the same as the listening fd, avoid 1332 * queue hash lookup since that will return an eager listener in a 1333 * already established state. 1334 */ 1335 acceptor_id = tcr->ACCEPTOR_id; 1336 mutex_enter(&listener->tcp_eager_lock); 1337 if (listener->tcp_acceptor_id == acceptor_id) { 1338 eager = listener->tcp_eager_next_q; 1339 /* only count how many T_CONN_INDs so don't count q0 */ 1340 if ((listener->tcp_conn_req_cnt_q != 1) || 1341 (eager->tcp_conn_req_seqnum != tcr->SEQ_number)) { 1342 mutex_exit(&listener->tcp_eager_lock); 1343 tcp_err_ack(listener, mp, TBADF, 0); 1344 return; 1345 } 1346 if (listener->tcp_conn_req_cnt_q0 != 0) { 1347 /* Throw away all the eagers on q0. */ 1348 tcp_eager_cleanup(listener, 1); 1349 } 1350 if (listener->tcp_syn_defense) { 1351 listener->tcp_syn_defense = B_FALSE; 1352 if (listener->tcp_ip_addr_cache != NULL) { 1353 kmem_free(listener->tcp_ip_addr_cache, 1354 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t)); 1355 listener->tcp_ip_addr_cache = NULL; 1356 } 1357 } 1358 /* 1359 * Transfer tcp_conn_req_max to the eager so that when 1360 * a disconnect occurs we can revert the endpoint to the 1361 * listen state. 1362 */ 1363 eager->tcp_conn_req_max = listener->tcp_conn_req_max; 1364 ASSERT(listener->tcp_conn_req_cnt_q0 == 0); 1365 /* 1366 * Get a reference on the acceptor just like the 1367 * tcp_acceptor_hash_lookup below. 1368 */ 1369 acceptor = listener; 1370 CONN_INC_REF(acceptor->tcp_connp); 1371 } else { 1372 acceptor = tcp_acceptor_hash_lookup(acceptor_id, tcps); 1373 if (acceptor == NULL) { 1374 if (listener->tcp_connp->conn_debug) { 1375 (void) strlog(TCP_MOD_ID, 0, 1, 1376 SL_ERROR|SL_TRACE, 1377 "tcp_accept: did not find acceptor 0x%x\n", 1378 acceptor_id); 1379 } 1380 mutex_exit(&listener->tcp_eager_lock); 1381 tcp_err_ack(listener, mp, TPROVMISMATCH, 0); 1382 return; 1383 } 1384 /* 1385 * Verify acceptor state. The acceptable states for an acceptor 1386 * include TCPS_IDLE and TCPS_BOUND. 1387 */ 1388 switch (acceptor->tcp_state) { 1389 case TCPS_IDLE: 1390 /* FALLTHRU */ 1391 case TCPS_BOUND: 1392 break; 1393 default: 1394 CONN_DEC_REF(acceptor->tcp_connp); 1395 mutex_exit(&listener->tcp_eager_lock); 1396 tcp_err_ack(listener, mp, TOUTSTATE, 0); 1397 return; 1398 } 1399 } 1400 1401 /* The listener must be in TCPS_LISTEN */ 1402 if (listener->tcp_state != TCPS_LISTEN) { 1403 CONN_DEC_REF(acceptor->tcp_connp); 1404 mutex_exit(&listener->tcp_eager_lock); 1405 tcp_err_ack(listener, mp, TOUTSTATE, 0); 1406 return; 1407 } 1408 1409 /* 1410 * Rendezvous with an eager connection request packet hanging off 1411 * 'tcp' that has the 'seqnum' tag. We tagged the detached open 1412 * tcp structure when the connection packet arrived in 1413 * tcp_input_listener(). 1414 */ 1415 seqnum = tcr->SEQ_number; 1416 eager = listener; 1417 do { 1418 eager = eager->tcp_eager_next_q; 1419 if (eager == NULL) { 1420 CONN_DEC_REF(acceptor->tcp_connp); 1421 mutex_exit(&listener->tcp_eager_lock); 1422 tcp_err_ack(listener, mp, TBADSEQ, 0); 1423 return; 1424 } 1425 } while (eager->tcp_conn_req_seqnum != seqnum); 1426 mutex_exit(&listener->tcp_eager_lock); 1427 1428 /* 1429 * At this point, both acceptor and listener have 2 ref 1430 * that they begin with. Acceptor has one additional ref 1431 * we placed in lookup while listener has 3 additional 1432 * ref for being behind the squeue (tcp_accept() is 1433 * done on listener's squeue); being in classifier hash; 1434 * and eager's ref on listener. 1435 */ 1436 ASSERT(listener->tcp_connp->conn_ref >= 5); 1437 ASSERT(acceptor->tcp_connp->conn_ref >= 3); 1438 1439 /* 1440 * The eager at this point is set in its own squeue and 1441 * could easily have been killed (tcp_accept_finish will 1442 * deal with that) because of a TH_RST so we can only 1443 * ASSERT for a single ref. 1444 */ 1445 ASSERT(eager->tcp_connp->conn_ref >= 1); 1446 1447 /* 1448 * Pre allocate the discon_ind mblk also. tcp_accept_finish will 1449 * use it if something failed. 1450 */ 1451 discon_mp = allocb(MAX(sizeof (struct T_discon_ind), 1452 sizeof (struct stroptions)), BPRI_HI); 1453 if (discon_mp == NULL) { 1454 CONN_DEC_REF(acceptor->tcp_connp); 1455 CONN_DEC_REF(eager->tcp_connp); 1456 tcp_err_ack(listener, mp, TSYSERR, ENOMEM); 1457 return; 1458 } 1459 1460 econnp = eager->tcp_connp; 1461 1462 /* Hold a copy of mp, in case reallocb fails */ 1463 if ((mp1 = copymsg(mp)) == NULL) { 1464 CONN_DEC_REF(acceptor->tcp_connp); 1465 CONN_DEC_REF(eager->tcp_connp); 1466 freemsg(discon_mp); 1467 tcp_err_ack(listener, mp, TSYSERR, ENOMEM); 1468 return; 1469 } 1470 1471 tcr = (struct T_conn_res *)mp1->b_rptr; 1472 1473 /* 1474 * This is an expanded version of mi_tpi_ok_ack_alloc() 1475 * which allocates a larger mblk and appends the new 1476 * local address to the ok_ack. The address is copied by 1477 * soaccept() for getsockname(). 1478 */ 1479 { 1480 int extra; 1481 1482 extra = (econnp->conn_family == AF_INET) ? 1483 sizeof (sin_t) : sizeof (sin6_t); 1484 1485 /* 1486 * Try to re-use mp, if possible. Otherwise, allocate 1487 * an mblk and return it as ok_mp. In any case, mp 1488 * is no longer usable upon return. 1489 */ 1490 if ((ok_mp = mi_tpi_ok_ack_alloc_extra(mp, extra)) == NULL) { 1491 CONN_DEC_REF(acceptor->tcp_connp); 1492 CONN_DEC_REF(eager->tcp_connp); 1493 freemsg(discon_mp); 1494 /* Original mp has been freed by now, so use mp1 */ 1495 tcp_err_ack(listener, mp1, TSYSERR, ENOMEM); 1496 return; 1497 } 1498 1499 mp = NULL; /* We should never use mp after this point */ 1500 1501 switch (extra) { 1502 case sizeof (sin_t): { 1503 sin_t *sin = (sin_t *)ok_mp->b_wptr; 1504 1505 ok_mp->b_wptr += extra; 1506 sin->sin_family = AF_INET; 1507 sin->sin_port = econnp->conn_lport; 1508 sin->sin_addr.s_addr = econnp->conn_laddr_v4; 1509 break; 1510 } 1511 case sizeof (sin6_t): { 1512 sin6_t *sin6 = (sin6_t *)ok_mp->b_wptr; 1513 1514 ok_mp->b_wptr += extra; 1515 sin6->sin6_family = AF_INET6; 1516 sin6->sin6_port = econnp->conn_lport; 1517 sin6->sin6_addr = econnp->conn_laddr_v6; 1518 sin6->sin6_flowinfo = econnp->conn_flowinfo; 1519 if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) && 1520 (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) { 1521 sin6->sin6_scope_id = 1522 econnp->conn_ixa->ixa_scopeid; 1523 } else { 1524 sin6->sin6_scope_id = 0; 1525 } 1526 sin6->__sin6_src_id = 0; 1527 break; 1528 } 1529 default: 1530 break; 1531 } 1532 ASSERT(ok_mp->b_wptr <= ok_mp->b_datap->db_lim); 1533 } 1534 1535 /* 1536 * If there are no options we know that the T_CONN_RES will 1537 * succeed. However, we can't send the T_OK_ACK upstream until 1538 * the tcp_accept_swap is done since it would be dangerous to 1539 * let the application start using the new fd prior to the swap. 1540 */ 1541 tcp_accept_swap(listener, acceptor, eager); 1542 1543 /* 1544 * tcp_accept_swap unlinks eager from listener but does not drop 1545 * the eager's reference on the listener. 1546 */ 1547 ASSERT(eager->tcp_listener == NULL); 1548 ASSERT(listener->tcp_connp->conn_ref >= 5); 1549 1550 /* 1551 * The eager is now associated with its own queue. Insert in 1552 * the hash so that the connection can be reused for a future 1553 * T_CONN_RES. 1554 */ 1555 tcp_acceptor_hash_insert(acceptor_id, eager); 1556 1557 /* 1558 * We now do the processing of options with T_CONN_RES. 1559 * We delay till now since we wanted to have queue to pass to 1560 * option processing routines that points back to the right 1561 * instance structure which does not happen until after 1562 * tcp_accept_swap(). 1563 * 1564 * Note: 1565 * The sanity of the logic here assumes that whatever options 1566 * are appropriate to inherit from listner=>eager are done 1567 * before this point, and whatever were to be overridden (or not) 1568 * in transfer logic from eager=>acceptor in tcp_accept_swap(). 1569 * [ Warning: acceptor endpoint can have T_OPTMGMT_REQ done to it 1570 * before its ACCEPTOR_id comes down in T_CONN_RES ] 1571 * This may not be true at this point in time but can be fixed 1572 * independently. This option processing code starts with 1573 * the instantiated acceptor instance and the final queue at 1574 * this point. 1575 */ 1576 1577 if (tcr->OPT_length != 0) { 1578 /* Options to process */ 1579 int t_error = 0; 1580 int sys_error = 0; 1581 int do_disconnect = 0; 1582 1583 if (tcp_conprim_opt_process(eager, mp1, 1584 &do_disconnect, &t_error, &sys_error) < 0) { 1585 eager->tcp_accept_error = 1; 1586 if (do_disconnect) { 1587 /* 1588 * An option failed which does not allow 1589 * connection to be accepted. 1590 * 1591 * We allow T_CONN_RES to succeed and 1592 * put a T_DISCON_IND on the eager queue. 1593 */ 1594 ASSERT(t_error == 0 && sys_error == 0); 1595 eager->tcp_send_discon_ind = 1; 1596 } else { 1597 ASSERT(t_error != 0); 1598 freemsg(ok_mp); 1599 /* 1600 * Original mp was either freed or set 1601 * to ok_mp above, so use mp1 instead. 1602 */ 1603 tcp_err_ack(listener, mp1, t_error, sys_error); 1604 goto finish; 1605 } 1606 } 1607 /* 1608 * Most likely success in setting options (except if 1609 * eager->tcp_send_discon_ind set). 1610 * mp1 option buffer represented by OPT_length/offset 1611 * potentially modified and contains results of setting 1612 * options at this point 1613 */ 1614 } 1615 1616 /* We no longer need mp1, since all options processing has passed */ 1617 freemsg(mp1); 1618 1619 putnext(listener->tcp_connp->conn_rq, ok_mp); 1620 1621 mutex_enter(&listener->tcp_eager_lock); 1622 if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) { 1623 mblk_t *conn_ind; 1624 1625 /* 1626 * This path should not be executed if listener and 1627 * acceptor streams are the same. 1628 */ 1629 ASSERT(listener != acceptor); 1630 conn_ind = tcp_get_def_conn_ind(listener); 1631 mutex_exit(&listener->tcp_eager_lock); 1632 putnext(listener->tcp_connp->conn_rq, conn_ind); 1633 } else { 1634 mutex_exit(&listener->tcp_eager_lock); 1635 } 1636 1637 /* 1638 * Done with the acceptor - free it 1639 * 1640 * Note: from this point on, no access to listener should be made 1641 * as listener can be equal to acceptor. 1642 */ 1643 finish: 1644 ASSERT(acceptor->tcp_detached); 1645 acceptor->tcp_connp->conn_rq = NULL; 1646 ASSERT(!IPCL_IS_NONSTR(acceptor->tcp_connp)); 1647 acceptor->tcp_connp->conn_wq = NULL; 1648 (void) tcp_clean_death(acceptor, 0); 1649 CONN_DEC_REF(acceptor->tcp_connp); 1650 1651 /* 1652 * We pass discon_mp to tcp_accept_finish to get on the right squeue. 1653 * 1654 * It will update the setting for sockfs/stream head and also take 1655 * care of any data that arrived before accept() wad called. 1656 * In case we already received a FIN then tcp_accept_finish will send up 1657 * the ordrel. It will also send up a window update if the window 1658 * has opened up. 1659 */ 1660 1661 /* 1662 * XXX: we currently have a problem if XTI application closes the 1663 * acceptor stream in between. This problem exists in on10-gate also 1664 * and is well know but nothing can be done short of major rewrite 1665 * to fix it. Now it is possible to take care of it by assigning TLI/XTI 1666 * eager same squeue as listener (we can distinguish non socket 1667 * listeners at the time of handling a SYN in tcp_input_listener) 1668 * and do most of the work that tcp_accept_finish does here itself 1669 * and then get behind the acceptor squeue to access the acceptor 1670 * queue. 1671 */ 1672 /* 1673 * We already have a ref on tcp so no need to do one before squeue_enter 1674 */ 1675 SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, discon_mp, 1676 tcp_accept_finish, eager->tcp_connp, NULL, SQ_FILL, 1677 SQTAG_TCP_ACCEPT_FINISH); 1678 } 1679 1680 1681 /* 1682 * This is the STREAMS entry point for T_CONN_RES coming down on 1683 * Acceptor STREAM when sockfs listener does accept processing. 1684 * Read the block comment on top of tcp_input_listener(). 1685 */ 1686 void 1687 tcp_tpi_accept(queue_t *q, mblk_t *mp) 1688 { 1689 queue_t *rq = RD(q); 1690 struct T_conn_res *conn_res; 1691 tcp_t *eager; 1692 tcp_t *listener; 1693 struct T_ok_ack *ok; 1694 t_scalar_t PRIM_type; 1695 mblk_t *discon_mp; 1696 conn_t *econnp; 1697 cred_t *cr; 1698 1699 ASSERT(DB_TYPE(mp) == M_PROTO); 1700 1701 /* 1702 * All Solaris components should pass a db_credp 1703 * for this TPI message, hence we ASSERT. 1704 * But in case there is some other M_PROTO that looks 1705 * like a TPI message sent by some other kernel 1706 * component, we check and return an error. 1707 */ 1708 cr = msg_getcred(mp, NULL); 1709 ASSERT(cr != NULL); 1710 if (cr == NULL) { 1711 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, EINVAL); 1712 if (mp != NULL) 1713 putnext(rq, mp); 1714 return; 1715 } 1716 conn_res = (struct T_conn_res *)mp->b_rptr; 1717 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); 1718 if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_res)) { 1719 mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0); 1720 if (mp != NULL) 1721 putnext(rq, mp); 1722 return; 1723 } 1724 switch (conn_res->PRIM_type) { 1725 case O_T_CONN_RES: 1726 case T_CONN_RES: 1727 /* 1728 * We pass up an err ack if allocb fails. This will 1729 * cause sockfs to issue a T_DISCON_REQ which will cause 1730 * tcp_eager_blowoff to be called. sockfs will then call 1731 * rq->q_qinfo->qi_qclose to cleanup the acceptor stream. 1732 * we need to do the allocb up here because we have to 1733 * make sure rq->q_qinfo->qi_qclose still points to the 1734 * correct function (tcp_tpi_close_accept) in case allocb 1735 * fails. 1736 */ 1737 bcopy(mp->b_rptr + conn_res->OPT_offset, 1738 &eager, conn_res->OPT_length); 1739 PRIM_type = conn_res->PRIM_type; 1740 mp->b_datap->db_type = M_PCPROTO; 1741 mp->b_wptr = mp->b_rptr + sizeof (struct T_ok_ack); 1742 ok = (struct T_ok_ack *)mp->b_rptr; 1743 ok->PRIM_type = T_OK_ACK; 1744 ok->CORRECT_prim = PRIM_type; 1745 econnp = eager->tcp_connp; 1746 econnp->conn_dev = (dev_t)RD(q)->q_ptr; 1747 econnp->conn_minor_arena = (vmem_t *)(WR(q)->q_ptr); 1748 econnp->conn_rq = rq; 1749 econnp->conn_wq = q; 1750 rq->q_ptr = econnp; 1751 rq->q_qinfo = &tcp_rinitv4; /* No open - same as rinitv6 */ 1752 q->q_ptr = econnp; 1753 q->q_qinfo = &tcp_winit; 1754 listener = eager->tcp_listener; 1755 1756 /* 1757 * Pre allocate the discon_ind mblk also. tcp_accept_finish will 1758 * use it if something failed. 1759 */ 1760 discon_mp = allocb(MAX(sizeof (struct T_discon_ind), 1761 sizeof (struct stroptions)), BPRI_HI); 1762 1763 if (discon_mp == NULL) { 1764 mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0); 1765 if (mp != NULL) 1766 putnext(rq, mp); 1767 return; 1768 } 1769 1770 eager->tcp_issocket = B_TRUE; 1771 1772 ASSERT(econnp->conn_netstack == 1773 listener->tcp_connp->conn_netstack); 1774 ASSERT(eager->tcp_tcps == listener->tcp_tcps); 1775 1776 /* Put the ref for IP */ 1777 CONN_INC_REF(econnp); 1778 1779 /* 1780 * We should have minimum of 3 references on the conn 1781 * at this point. One each for TCP and IP and one for 1782 * the T_conn_ind that was sent up when the 3-way handshake 1783 * completed. In the normal case we would also have another 1784 * reference (making a total of 4) for the conn being in the 1785 * classifier hash list. However the eager could have received 1786 * an RST subsequently and tcp_closei_local could have removed 1787 * the eager from the classifier hash list, hence we can't 1788 * assert that reference. 1789 */ 1790 ASSERT(econnp->conn_ref >= 3); 1791 1792 mutex_enter(&listener->tcp_eager_lock); 1793 if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) { 1794 mblk_t *conn_ind = tcp_get_def_conn_ind(listener); 1795 1796 /* Need to get inside the listener perimeter */ 1797 CONN_INC_REF(listener->tcp_connp); 1798 SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp, 1799 conn_ind, tcp_send_pending, listener->tcp_connp, 1800 NULL, SQ_FILL, SQTAG_TCP_SEND_PENDING); 1801 } 1802 tcp_eager_unlink(eager); 1803 mutex_exit(&listener->tcp_eager_lock); 1804 1805 /* 1806 * At this point, the eager is detached from the listener 1807 * but we still have an extra refs on eager (apart from the 1808 * usual tcp references). The ref was placed in tcp_input_data 1809 * before sending the conn_ind in tcp_send_conn_ind. 1810 * The ref will be dropped in tcp_accept_finish(). 1811 */ 1812 SQUEUE_ENTER_ONE(econnp->conn_sqp, discon_mp, tcp_accept_finish, 1813 econnp, NULL, SQ_NODRAIN, SQTAG_TCP_ACCEPT_FINISH_Q0); 1814 1815 /* 1816 * Send the new local address also up to sockfs. There 1817 * should already be enough space in the mp that came 1818 * down from soaccept(). 1819 */ 1820 if (econnp->conn_family == AF_INET) { 1821 sin_t *sin; 1822 1823 ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >= 1824 (sizeof (struct T_ok_ack) + sizeof (sin_t))); 1825 sin = (sin_t *)mp->b_wptr; 1826 mp->b_wptr += sizeof (sin_t); 1827 sin->sin_family = AF_INET; 1828 sin->sin_port = econnp->conn_lport; 1829 sin->sin_addr.s_addr = econnp->conn_laddr_v4; 1830 } else { 1831 sin6_t *sin6; 1832 1833 ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >= 1834 sizeof (struct T_ok_ack) + sizeof (sin6_t)); 1835 sin6 = (sin6_t *)mp->b_wptr; 1836 mp->b_wptr += sizeof (sin6_t); 1837 sin6->sin6_family = AF_INET6; 1838 sin6->sin6_port = econnp->conn_lport; 1839 sin6->sin6_addr = econnp->conn_laddr_v6; 1840 if (econnp->conn_ipversion == IPV4_VERSION) 1841 sin6->sin6_flowinfo = 0; 1842 else 1843 sin6->sin6_flowinfo = econnp->conn_flowinfo; 1844 if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) && 1845 (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) { 1846 sin6->sin6_scope_id = 1847 econnp->conn_ixa->ixa_scopeid; 1848 } else { 1849 sin6->sin6_scope_id = 0; 1850 } 1851 sin6->__sin6_src_id = 0; 1852 } 1853 1854 putnext(rq, mp); 1855 return; 1856 default: 1857 mp = mi_tpi_err_ack_alloc(mp, TNOTSUPPORT, 0); 1858 if (mp != NULL) 1859 putnext(rq, mp); 1860 return; 1861 } 1862 } 1863 1864 /* 1865 * The function called through squeue to get behind listener's perimeter to 1866 * send a deferred conn_ind. 1867 */ 1868 /* ARGSUSED */ 1869 void 1870 tcp_send_pending(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) 1871 { 1872 conn_t *lconnp = (conn_t *)arg; 1873 tcp_t *listener = lconnp->conn_tcp; 1874 struct T_conn_ind *conn_ind; 1875 tcp_t *tcp; 1876 1877 conn_ind = (struct T_conn_ind *)mp->b_rptr; 1878 bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp, 1879 conn_ind->OPT_length); 1880 1881 if (listener->tcp_state != TCPS_LISTEN) { 1882 /* 1883 * If listener has closed, it would have caused a 1884 * a cleanup/blowoff to happen for the eager, so 1885 * we don't need to do anything more. 1886 */ 1887 freemsg(mp); 1888 return; 1889 } 1890 1891 putnext(lconnp->conn_rq, mp); 1892 } 1893 1894 /* 1895 * Sends the T_CONN_IND to the listener. The caller calls this 1896 * functions via squeue to get inside the listener's perimeter 1897 * once the 3 way hand shake is done a T_CONN_IND needs to be 1898 * sent. As an optimization, the caller can call this directly 1899 * if listener's perimeter is same as eager's. 1900 */ 1901 /* ARGSUSED */ 1902 void 1903 tcp_send_conn_ind(void *arg, mblk_t *mp, void *arg2) 1904 { 1905 conn_t *lconnp = (conn_t *)arg; 1906 tcp_t *listener = lconnp->conn_tcp; 1907 tcp_t *tcp; 1908 struct T_conn_ind *conn_ind; 1909 ipaddr_t *addr_cache; 1910 boolean_t need_send_conn_ind = B_FALSE; 1911 tcp_stack_t *tcps = listener->tcp_tcps; 1912 1913 /* retrieve the eager */ 1914 conn_ind = (struct T_conn_ind *)mp->b_rptr; 1915 ASSERT(conn_ind->OPT_offset != 0 && 1916 conn_ind->OPT_length == sizeof (intptr_t)); 1917 bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp, 1918 conn_ind->OPT_length); 1919 1920 /* 1921 * TLI/XTI applications will get confused by 1922 * sending eager as an option since it violates 1923 * the option semantics. So remove the eager as 1924 * option since TLI/XTI app doesn't need it anyway. 1925 */ 1926 if (!TCP_IS_SOCKET(listener)) { 1927 conn_ind->OPT_length = 0; 1928 conn_ind->OPT_offset = 0; 1929 } 1930 if (listener->tcp_state != TCPS_LISTEN) { 1931 /* 1932 * If listener has closed, it would have caused a 1933 * a cleanup/blowoff to happen for the eager. We 1934 * just need to return. 1935 */ 1936 freemsg(mp); 1937 return; 1938 } 1939 1940 1941 /* 1942 * if the conn_req_q is full defer passing up the 1943 * T_CONN_IND until space is availabe after t_accept() 1944 * processing 1945 */ 1946 mutex_enter(&listener->tcp_eager_lock); 1947 1948 /* 1949 * Take the eager out, if it is in the list of droppable eagers 1950 * as we are here because the 3W handshake is over. 1951 */ 1952 MAKE_UNDROPPABLE(tcp); 1953 1954 if (listener->tcp_conn_req_cnt_q < listener->tcp_conn_req_max) { 1955 tcp_t *tail; 1956 1957 /* 1958 * The eager already has an extra ref put in tcp_input_data 1959 * so that it stays till accept comes back even though it 1960 * might get into TCPS_CLOSED as a result of a TH_RST etc. 1961 */ 1962 ASSERT(listener->tcp_conn_req_cnt_q0 > 0); 1963 listener->tcp_conn_req_cnt_q0--; 1964 listener->tcp_conn_req_cnt_q++; 1965 1966 /* Move from SYN_RCVD to ESTABLISHED list */ 1967 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 1968 tcp->tcp_eager_prev_q0; 1969 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 1970 tcp->tcp_eager_next_q0; 1971 tcp->tcp_eager_prev_q0 = NULL; 1972 tcp->tcp_eager_next_q0 = NULL; 1973 1974 /* 1975 * Insert at end of the queue because sockfs 1976 * sends down T_CONN_RES in chronological 1977 * order. Leaving the older conn indications 1978 * at front of the queue helps reducing search 1979 * time. 1980 */ 1981 tail = listener->tcp_eager_last_q; 1982 if (tail != NULL) 1983 tail->tcp_eager_next_q = tcp; 1984 else 1985 listener->tcp_eager_next_q = tcp; 1986 listener->tcp_eager_last_q = tcp; 1987 tcp->tcp_eager_next_q = NULL; 1988 /* 1989 * Delay sending up the T_conn_ind until we are 1990 * done with the eager. Once we have have sent up 1991 * the T_conn_ind, the accept can potentially complete 1992 * any time and release the refhold we have on the eager. 1993 */ 1994 need_send_conn_ind = B_TRUE; 1995 } else { 1996 /* 1997 * Defer connection on q0 and set deferred 1998 * connection bit true 1999 */ 2000 tcp->tcp_conn_def_q0 = B_TRUE; 2001 2002 /* take tcp out of q0 ... */ 2003 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 2004 tcp->tcp_eager_next_q0; 2005 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 2006 tcp->tcp_eager_prev_q0; 2007 2008 /* ... and place it at the end of q0 */ 2009 tcp->tcp_eager_prev_q0 = listener->tcp_eager_prev_q0; 2010 tcp->tcp_eager_next_q0 = listener; 2011 listener->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp; 2012 listener->tcp_eager_prev_q0 = tcp; 2013 tcp->tcp_conn.tcp_eager_conn_ind = mp; 2014 } 2015 2016 /* we have timed out before */ 2017 if (tcp->tcp_syn_rcvd_timeout != 0) { 2018 tcp->tcp_syn_rcvd_timeout = 0; 2019 listener->tcp_syn_rcvd_timeout--; 2020 if (listener->tcp_syn_defense && 2021 listener->tcp_syn_rcvd_timeout <= 2022 (tcps->tcps_conn_req_max_q0 >> 5) && 2023 10*MINUTES < TICK_TO_MSEC(ddi_get_lbolt64() - 2024 listener->tcp_last_rcv_lbolt)) { 2025 /* 2026 * Turn off the defense mode if we 2027 * believe the SYN attack is over. 2028 */ 2029 listener->tcp_syn_defense = B_FALSE; 2030 if (listener->tcp_ip_addr_cache) { 2031 kmem_free((void *)listener->tcp_ip_addr_cache, 2032 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t)); 2033 listener->tcp_ip_addr_cache = NULL; 2034 } 2035 } 2036 } 2037 addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache); 2038 if (addr_cache != NULL) { 2039 /* 2040 * We have finished a 3-way handshake with this 2041 * remote host. This proves the IP addr is good. 2042 * Cache it! 2043 */ 2044 addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] = 2045 tcp->tcp_connp->conn_faddr_v4; 2046 } 2047 mutex_exit(&listener->tcp_eager_lock); 2048 if (need_send_conn_ind) 2049 putnext(lconnp->conn_rq, mp); 2050 } 2051