1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 26 /* This files contains all TCP TLI/TPI related functions */ 27 28 #include <sys/types.h> 29 #include <sys/stream.h> 30 #include <sys/strsun.h> 31 #include <sys/strsubr.h> 32 #include <sys/stropts.h> 33 #include <sys/strlog.h> 34 #define _SUN_TPI_VERSION 2 35 #include <sys/tihdr.h> 36 #include <sys/suntpi.h> 37 #include <sys/xti_inet.h> 38 #include <sys/squeue_impl.h> 39 #include <sys/squeue.h> 40 41 #include <inet/common.h> 42 #include <inet/ip.h> 43 #include <inet/tcp.h> 44 #include <inet/tcp_impl.h> 45 #include <inet/proto_set.h> 46 47 static void tcp_accept_swap(tcp_t *, tcp_t *, tcp_t *); 48 static int tcp_conprim_opt_process(tcp_t *, mblk_t *, int *, int *, int *); 49 50 void 51 tcp_use_pure_tpi(tcp_t *tcp) 52 { 53 conn_t *connp = tcp->tcp_connp; 54 55 #ifdef _ILP32 56 tcp->tcp_acceptor_id = (t_uscalar_t)connp->conn_rq; 57 #else 58 tcp->tcp_acceptor_id = connp->conn_dev; 59 #endif 60 /* 61 * Insert this socket into the acceptor hash. 62 * We might need it for T_CONN_RES message 63 */ 64 tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp); 65 66 tcp->tcp_issocket = B_FALSE; 67 TCP_STAT(tcp->tcp_tcps, tcp_sock_fallback); 68 } 69 70 /* Shorthand to generate and send TPI error acks to our client */ 71 void 72 tcp_err_ack(tcp_t *tcp, mblk_t *mp, int t_error, int sys_error) 73 { 74 if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL) 75 putnext(tcp->tcp_connp->conn_rq, mp); 76 } 77 78 /* Shorthand to generate and send TPI error acks to our client */ 79 void 80 tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive, 81 int t_error, int sys_error) 82 { 83 struct T_error_ack *teackp; 84 85 if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack), 86 M_PCPROTO, T_ERROR_ACK)) != NULL) { 87 teackp = (struct T_error_ack *)mp->b_rptr; 88 teackp->ERROR_prim = primitive; 89 teackp->TLI_error = t_error; 90 teackp->UNIX_error = sys_error; 91 putnext(tcp->tcp_connp->conn_rq, mp); 92 } 93 } 94 95 /* 96 * TCP routine to get the values of options. 97 */ 98 int 99 tcp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr) 100 { 101 return (tcp_opt_get(Q_TO_CONN(q), level, name, ptr)); 102 } 103 104 /* ARGSUSED */ 105 int 106 tcp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name, 107 uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, 108 void *thisdg_attrs, cred_t *cr) 109 { 110 conn_t *connp = Q_TO_CONN(q); 111 112 return (tcp_opt_set(connp, optset_context, level, name, inlen, invalp, 113 outlenp, outvalp, thisdg_attrs, cr)); 114 } 115 116 static int 117 tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp, int *do_disconnectp, 118 int *t_errorp, int *sys_errorp) 119 { 120 int error; 121 int is_absreq_failure; 122 t_scalar_t *opt_lenp; 123 t_scalar_t opt_offset; 124 int prim_type; 125 struct T_conn_req *tcreqp; 126 struct T_conn_res *tcresp; 127 cred_t *cr; 128 129 /* 130 * All Solaris components should pass a db_credp 131 * for this TPI message, hence we ASSERT. 132 * But in case there is some other M_PROTO that looks 133 * like a TPI message sent by some other kernel 134 * component, we check and return an error. 135 */ 136 cr = msg_getcred(mp, NULL); 137 ASSERT(cr != NULL); 138 if (cr == NULL) 139 return (-1); 140 141 prim_type = ((union T_primitives *)mp->b_rptr)->type; 142 ASSERT(prim_type == T_CONN_REQ || prim_type == O_T_CONN_RES || 143 prim_type == T_CONN_RES); 144 145 switch (prim_type) { 146 case T_CONN_REQ: 147 tcreqp = (struct T_conn_req *)mp->b_rptr; 148 opt_offset = tcreqp->OPT_offset; 149 opt_lenp = (t_scalar_t *)&tcreqp->OPT_length; 150 break; 151 case O_T_CONN_RES: 152 case T_CONN_RES: 153 tcresp = (struct T_conn_res *)mp->b_rptr; 154 opt_offset = tcresp->OPT_offset; 155 opt_lenp = (t_scalar_t *)&tcresp->OPT_length; 156 break; 157 default: 158 opt_lenp = 0; 159 opt_offset = 0; 160 break; 161 } 162 163 *t_errorp = 0; 164 *sys_errorp = 0; 165 *do_disconnectp = 0; 166 167 error = tpi_optcom_buf(tcp->tcp_connp->conn_wq, mp, opt_lenp, 168 opt_offset, cr, &tcp_opt_obj, 169 NULL, &is_absreq_failure); 170 171 switch (error) { 172 case 0: /* no error */ 173 ASSERT(is_absreq_failure == 0); 174 return (0); 175 case ENOPROTOOPT: 176 *t_errorp = TBADOPT; 177 break; 178 case EACCES: 179 *t_errorp = TACCES; 180 break; 181 default: 182 *t_errorp = TSYSERR; *sys_errorp = error; 183 break; 184 } 185 if (is_absreq_failure != 0) { 186 /* 187 * The connection request should get the local ack 188 * T_OK_ACK and then a T_DISCON_IND. 189 */ 190 *do_disconnectp = 1; 191 } 192 return (-1); 193 } 194 195 void 196 tcp_tpi_bind(tcp_t *tcp, mblk_t *mp) 197 { 198 int error; 199 conn_t *connp = tcp->tcp_connp; 200 struct sockaddr *sa; 201 mblk_t *mp1; 202 struct T_bind_req *tbr; 203 int backlog; 204 socklen_t len; 205 sin_t *sin; 206 sin6_t *sin6; 207 cred_t *cr; 208 209 /* 210 * All Solaris components should pass a db_credp 211 * for this TPI message, hence we ASSERT. 212 * But in case there is some other M_PROTO that looks 213 * like a TPI message sent by some other kernel 214 * component, we check and return an error. 215 */ 216 cr = msg_getcred(mp, NULL); 217 ASSERT(cr != NULL); 218 if (cr == NULL) { 219 tcp_err_ack(tcp, mp, TSYSERR, EINVAL); 220 return; 221 } 222 223 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); 224 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) { 225 if (connp->conn_debug) { 226 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 227 "tcp_tpi_bind: bad req, len %u", 228 (uint_t)(mp->b_wptr - mp->b_rptr)); 229 } 230 tcp_err_ack(tcp, mp, TPROTO, 0); 231 return; 232 } 233 /* Make sure the largest address fits */ 234 mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1); 235 if (mp1 == NULL) { 236 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 237 return; 238 } 239 mp = mp1; 240 tbr = (struct T_bind_req *)mp->b_rptr; 241 242 backlog = tbr->CONIND_number; 243 len = tbr->ADDR_length; 244 245 switch (len) { 246 case 0: /* request for a generic port */ 247 tbr->ADDR_offset = sizeof (struct T_bind_req); 248 if (connp->conn_family == AF_INET) { 249 tbr->ADDR_length = sizeof (sin_t); 250 sin = (sin_t *)&tbr[1]; 251 *sin = sin_null; 252 sin->sin_family = AF_INET; 253 sa = (struct sockaddr *)sin; 254 len = sizeof (sin_t); 255 mp->b_wptr = (uchar_t *)&sin[1]; 256 } else { 257 ASSERT(connp->conn_family == AF_INET6); 258 tbr->ADDR_length = sizeof (sin6_t); 259 sin6 = (sin6_t *)&tbr[1]; 260 *sin6 = sin6_null; 261 sin6->sin6_family = AF_INET6; 262 sa = (struct sockaddr *)sin6; 263 len = sizeof (sin6_t); 264 mp->b_wptr = (uchar_t *)&sin6[1]; 265 } 266 break; 267 268 case sizeof (sin_t): /* Complete IPv4 address */ 269 sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset, 270 sizeof (sin_t)); 271 break; 272 273 case sizeof (sin6_t): /* Complete IPv6 address */ 274 sa = (struct sockaddr *)mi_offset_param(mp, 275 tbr->ADDR_offset, sizeof (sin6_t)); 276 break; 277 278 default: 279 if (connp->conn_debug) { 280 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 281 "tcp_tpi_bind: bad address length, %d", 282 tbr->ADDR_length); 283 } 284 tcp_err_ack(tcp, mp, TBADADDR, 0); 285 return; 286 } 287 288 if (backlog > 0) { 289 error = tcp_do_listen(connp, sa, len, backlog, DB_CRED(mp), 290 tbr->PRIM_type != O_T_BIND_REQ); 291 } else { 292 error = tcp_do_bind(connp, sa, len, DB_CRED(mp), 293 tbr->PRIM_type != O_T_BIND_REQ); 294 } 295 done: 296 if (error > 0) { 297 tcp_err_ack(tcp, mp, TSYSERR, error); 298 } else if (error < 0) { 299 tcp_err_ack(tcp, mp, -error, 0); 300 } else { 301 /* 302 * Update port information as sockfs/tpi needs it for checking 303 */ 304 if (connp->conn_family == AF_INET) { 305 sin = (sin_t *)sa; 306 sin->sin_port = connp->conn_lport; 307 } else { 308 sin6 = (sin6_t *)sa; 309 sin6->sin6_port = connp->conn_lport; 310 } 311 mp->b_datap->db_type = M_PCPROTO; 312 tbr->PRIM_type = T_BIND_ACK; 313 putnext(connp->conn_rq, mp); 314 } 315 } 316 317 /* tcp_unbind is called by tcp_wput_proto to handle T_UNBIND_REQ messages. */ 318 void 319 tcp_tpi_unbind(tcp_t *tcp, mblk_t *mp) 320 { 321 conn_t *connp = tcp->tcp_connp; 322 int error; 323 324 error = tcp_do_unbind(connp); 325 if (error > 0) { 326 tcp_err_ack(tcp, mp, TSYSERR, error); 327 } else if (error < 0) { 328 tcp_err_ack(tcp, mp, -error, 0); 329 } else { 330 /* Send M_FLUSH according to TPI */ 331 (void) putnextctl1(connp->conn_rq, M_FLUSH, FLUSHRW); 332 333 mp = mi_tpi_ok_ack_alloc(mp); 334 if (mp != NULL) 335 putnext(connp->conn_rq, mp); 336 } 337 } 338 339 /* ARGSUSED */ 340 int 341 tcp_tpi_close(queue_t *q, int flags, cred_t *credp __unused) 342 { 343 conn_t *connp; 344 345 ASSERT(WR(q)->q_next == NULL); 346 347 if (flags & SO_FALLBACK) { 348 /* 349 * stream is being closed while in fallback 350 * simply free the resources that were allocated 351 */ 352 inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr)); 353 qprocsoff(q); 354 goto done; 355 } 356 357 connp = Q_TO_CONN(q); 358 /* 359 * We are being closed as /dev/tcp or /dev/tcp6. 360 */ 361 tcp_close_common(connp, flags); 362 363 qprocsoff(q); 364 inet_minor_free(connp->conn_minor_arena, connp->conn_dev); 365 366 /* 367 * Drop IP's reference on the conn. This is the last reference 368 * on the connp if the state was less than established. If the 369 * connection has gone into timewait state, then we will have 370 * one ref for the TCP and one more ref (total of two) for the 371 * classifier connected hash list (a timewait connections stays 372 * in connected hash till closed). 373 * 374 * We can't assert the references because there might be other 375 * transient reference places because of some walkers or queued 376 * packets in squeue for the timewait state. 377 */ 378 CONN_DEC_REF(connp); 379 done: 380 q->q_ptr = WR(q)->q_ptr = NULL; 381 return (0); 382 } 383 384 /* ARGSUSED */ 385 int 386 tcp_tpi_close_accept(queue_t *q, int flags __unused, cred_t *credp __unused) 387 { 388 vmem_t *minor_arena; 389 dev_t conn_dev; 390 extern struct qinit tcp_acceptor_winit; 391 392 ASSERT(WR(q)->q_qinfo == &tcp_acceptor_winit); 393 394 /* 395 * We had opened an acceptor STREAM for sockfs which is 396 * now being closed due to some error. 397 */ 398 qprocsoff(q); 399 400 minor_arena = (vmem_t *)WR(q)->q_ptr; 401 conn_dev = (dev_t)RD(q)->q_ptr; 402 ASSERT(minor_arena != NULL); 403 ASSERT(conn_dev != 0); 404 inet_minor_free(minor_arena, conn_dev); 405 q->q_ptr = WR(q)->q_ptr = NULL; 406 return (0); 407 } 408 409 /* 410 * Put a connection confirmation message upstream built from the 411 * address/flowid information with the conn and iph. Report our success or 412 * failure. 413 */ 414 boolean_t 415 tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, mblk_t *idmp, 416 mblk_t **defermp, ip_recv_attr_t *ira) 417 { 418 sin_t sin; 419 sin6_t sin6; 420 mblk_t *mp; 421 char *optp = NULL; 422 int optlen = 0; 423 conn_t *connp = tcp->tcp_connp; 424 425 if (defermp != NULL) 426 *defermp = NULL; 427 428 if (tcp->tcp_conn.tcp_opts_conn_req != NULL) { 429 /* 430 * Return in T_CONN_CON results of option negotiation through 431 * the T_CONN_REQ. Note: If there is an real end-to-end option 432 * negotiation, then what is received from remote end needs 433 * to be taken into account but there is no such thing (yet?) 434 * in our TCP/IP. 435 * Note: We do not use mi_offset_param() here as 436 * tcp_opts_conn_req contents do not directly come from 437 * an application and are either generated in kernel or 438 * from user input that was already verified. 439 */ 440 mp = tcp->tcp_conn.tcp_opts_conn_req; 441 optp = (char *)(mp->b_rptr + 442 ((struct T_conn_req *)mp->b_rptr)->OPT_offset); 443 optlen = (int) 444 ((struct T_conn_req *)mp->b_rptr)->OPT_length; 445 } 446 447 if (IPH_HDR_VERSION(iphdr) == IPV4_VERSION) { 448 449 /* packet is IPv4 */ 450 if (connp->conn_family == AF_INET) { 451 sin = sin_null; 452 sin.sin_addr.s_addr = connp->conn_faddr_v4; 453 sin.sin_port = connp->conn_fport; 454 sin.sin_family = AF_INET; 455 mp = mi_tpi_conn_con(NULL, (char *)&sin, 456 (int)sizeof (sin_t), optp, optlen); 457 } else { 458 sin6 = sin6_null; 459 sin6.sin6_addr = connp->conn_faddr_v6; 460 sin6.sin6_port = connp->conn_fport; 461 sin6.sin6_family = AF_INET6; 462 mp = mi_tpi_conn_con(NULL, (char *)&sin6, 463 (int)sizeof (sin6_t), optp, optlen); 464 465 } 466 } else { 467 ip6_t *ip6h = (ip6_t *)iphdr; 468 469 ASSERT(IPH_HDR_VERSION(iphdr) == IPV6_VERSION); 470 ASSERT(connp->conn_family == AF_INET6); 471 sin6 = sin6_null; 472 sin6.sin6_addr = connp->conn_faddr_v6; 473 sin6.sin6_port = connp->conn_fport; 474 sin6.sin6_family = AF_INET6; 475 sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; 476 mp = mi_tpi_conn_con(NULL, (char *)&sin6, 477 (int)sizeof (sin6_t), optp, optlen); 478 } 479 480 if (!mp) 481 return (B_FALSE); 482 483 mblk_copycred(mp, idmp); 484 485 if (defermp == NULL) { 486 conn_t *connp = tcp->tcp_connp; 487 if (IPCL_IS_NONSTR(connp)) { 488 (*connp->conn_upcalls->su_connected) 489 (connp->conn_upper_handle, tcp->tcp_connid, 490 ira->ira_cred, ira->ira_cpid); 491 freemsg(mp); 492 } else { 493 if (ira->ira_cred != NULL) { 494 /* So that getpeerucred works for TPI sockfs */ 495 mblk_setcred(mp, ira->ira_cred, ira->ira_cpid); 496 } 497 putnext(connp->conn_rq, mp); 498 } 499 } else { 500 *defermp = mp; 501 } 502 503 if (tcp->tcp_conn.tcp_opts_conn_req != NULL) 504 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); 505 return (B_TRUE); 506 } 507 508 /* 509 * Successful connect request processing begins when our client passes 510 * a T_CONN_REQ message into tcp_wput(), which performs function calls into 511 * IP and the passes a T_OK_ACK (or T_ERROR_ACK upstream). 512 * 513 * After various error checks are completed, tcp_tpi_connect() lays 514 * the target address and port into the composite header template. 515 * Then we ask IP for information, including a source address if we didn't 516 * already have one. Finally we prepare to send the SYN packet, and then 517 * send up the T_OK_ACK reply message. 518 */ 519 void 520 tcp_tpi_connect(tcp_t *tcp, mblk_t *mp) 521 { 522 sin_t *sin; 523 struct T_conn_req *tcr; 524 struct sockaddr *sa; 525 socklen_t len; 526 int error; 527 cred_t *cr; 528 pid_t cpid; 529 conn_t *connp = tcp->tcp_connp; 530 queue_t *q = connp->conn_wq; 531 532 /* 533 * All Solaris components should pass a db_credp 534 * for this TPI message, hence we ASSERT. 535 * But in case there is some other M_PROTO that looks 536 * like a TPI message sent by some other kernel 537 * component, we check and return an error. 538 */ 539 cr = msg_getcred(mp, &cpid); 540 ASSERT(cr != NULL); 541 if (cr == NULL) { 542 tcp_err_ack(tcp, mp, TSYSERR, EINVAL); 543 return; 544 } 545 546 tcr = (struct T_conn_req *)mp->b_rptr; 547 548 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); 549 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) { 550 tcp_err_ack(tcp, mp, TPROTO, 0); 551 return; 552 } 553 554 /* 555 * Pre-allocate the T_ordrel_ind mblk so that at close time, we 556 * will always have that to send up. Otherwise, we need to do 557 * special handling in case the allocation fails at that time. 558 * If the end point is TPI, the tcp_t can be reused and the 559 * tcp_ordrel_mp may be allocated already. 560 */ 561 if (tcp->tcp_ordrel_mp == NULL) { 562 if ((tcp->tcp_ordrel_mp = mi_tpi_ordrel_ind()) == NULL) { 563 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 564 return; 565 } 566 } 567 568 /* 569 * Determine packet type based on type of address passed in 570 * the request should contain an IPv4 or IPv6 address. 571 * Make sure that address family matches the type of 572 * family of the address passed down. 573 */ 574 switch (tcr->DEST_length) { 575 default: 576 tcp_err_ack(tcp, mp, TBADADDR, 0); 577 return; 578 579 case (sizeof (sin_t) - sizeof (sin->sin_zero)): { 580 /* 581 * XXX: The check for valid DEST_length was not there 582 * in earlier releases and some buggy 583 * TLI apps (e.g Sybase) got away with not feeding 584 * in sin_zero part of address. 585 * We allow that bug to keep those buggy apps humming. 586 * Test suites require the check on DEST_length. 587 * We construct a new mblk with valid DEST_length 588 * free the original so the rest of the code does 589 * not have to keep track of this special shorter 590 * length address case. 591 */ 592 mblk_t *nmp; 593 struct T_conn_req *ntcr; 594 sin_t *nsin; 595 596 nmp = allocb(sizeof (struct T_conn_req) + sizeof (sin_t) + 597 tcr->OPT_length, BPRI_HI); 598 if (nmp == NULL) { 599 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 600 return; 601 } 602 ntcr = (struct T_conn_req *)nmp->b_rptr; 603 bzero(ntcr, sizeof (struct T_conn_req)); /* zero fill */ 604 ntcr->PRIM_type = T_CONN_REQ; 605 ntcr->DEST_length = sizeof (sin_t); 606 ntcr->DEST_offset = sizeof (struct T_conn_req); 607 608 nsin = (sin_t *)((uchar_t *)ntcr + ntcr->DEST_offset); 609 *nsin = sin_null; 610 /* Get pointer to shorter address to copy from original mp */ 611 sin = (sin_t *)mi_offset_param(mp, tcr->DEST_offset, 612 tcr->DEST_length); /* extract DEST_length worth of sin_t */ 613 if (sin == NULL || !OK_32PTR((char *)sin)) { 614 freemsg(nmp); 615 tcp_err_ack(tcp, mp, TSYSERR, EINVAL); 616 return; 617 } 618 nsin->sin_family = sin->sin_family; 619 nsin->sin_port = sin->sin_port; 620 nsin->sin_addr = sin->sin_addr; 621 /* Note:nsin->sin_zero zero-fill with sin_null assign above */ 622 nmp->b_wptr = (uchar_t *)&nsin[1]; 623 if (tcr->OPT_length != 0) { 624 ntcr->OPT_length = tcr->OPT_length; 625 ntcr->OPT_offset = nmp->b_wptr - nmp->b_rptr; 626 bcopy((uchar_t *)tcr + tcr->OPT_offset, 627 (uchar_t *)ntcr + ntcr->OPT_offset, 628 tcr->OPT_length); 629 nmp->b_wptr += tcr->OPT_length; 630 } 631 freemsg(mp); /* original mp freed */ 632 mp = nmp; /* re-initialize original variables */ 633 tcr = ntcr; 634 } 635 /* FALLTHRU */ 636 637 case sizeof (sin_t): 638 sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset, 639 sizeof (sin_t)); 640 len = sizeof (sin_t); 641 break; 642 643 case sizeof (sin6_t): 644 sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset, 645 sizeof (sin6_t)); 646 len = sizeof (sin6_t); 647 break; 648 } 649 650 error = proto_verify_ip_addr(connp->conn_family, sa, len); 651 if (error != 0) { 652 tcp_err_ack(tcp, mp, TSYSERR, error); 653 return; 654 } 655 656 /* 657 * TODO: If someone in TCPS_TIME_WAIT has this dst/port we 658 * should key on their sequence number and cut them loose. 659 */ 660 661 /* 662 * If options passed in, feed it for verification and handling 663 */ 664 if (tcr->OPT_length != 0) { 665 mblk_t *ok_mp; 666 mblk_t *discon_mp; 667 mblk_t *conn_opts_mp; 668 int t_error, sys_error, do_disconnect; 669 670 conn_opts_mp = NULL; 671 672 if (tcp_conprim_opt_process(tcp, mp, 673 &do_disconnect, &t_error, &sys_error) < 0) { 674 if (do_disconnect) { 675 ASSERT(t_error == 0 && sys_error == 0); 676 discon_mp = mi_tpi_discon_ind(NULL, 677 ECONNREFUSED, 0); 678 if (!discon_mp) { 679 tcp_err_ack_prim(tcp, mp, T_CONN_REQ, 680 TSYSERR, ENOMEM); 681 return; 682 } 683 ok_mp = mi_tpi_ok_ack_alloc(mp); 684 if (!ok_mp) { 685 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ, 686 TSYSERR, ENOMEM); 687 return; 688 } 689 qreply(q, ok_mp); 690 qreply(q, discon_mp); /* no flush! */ 691 } else { 692 ASSERT(t_error != 0); 693 tcp_err_ack_prim(tcp, mp, T_CONN_REQ, t_error, 694 sys_error); 695 } 696 return; 697 } 698 /* 699 * Success in setting options, the mp option buffer represented 700 * by OPT_length/offset has been potentially modified and 701 * contains results of option processing. We copy it in 702 * another mp to save it for potentially influencing returning 703 * it in T_CONN_CONN. 704 */ 705 if (tcr->OPT_length != 0) { /* there are resulting options */ 706 conn_opts_mp = copyb(mp); 707 if (!conn_opts_mp) { 708 tcp_err_ack_prim(tcp, mp, T_CONN_REQ, 709 TSYSERR, ENOMEM); 710 return; 711 } 712 ASSERT(tcp->tcp_conn.tcp_opts_conn_req == NULL); 713 tcp->tcp_conn.tcp_opts_conn_req = conn_opts_mp; 714 /* 715 * Note: 716 * These resulting option negotiation can include any 717 * end-to-end negotiation options but there no such 718 * thing (yet?) in our TCP/IP. 719 */ 720 } 721 } 722 723 /* call the non-TPI version */ 724 error = tcp_do_connect(tcp->tcp_connp, sa, len, cr, cpid); 725 if (error < 0) { 726 mp = mi_tpi_err_ack_alloc(mp, -error, 0); 727 } else if (error > 0) { 728 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, error); 729 } else { 730 mp = mi_tpi_ok_ack_alloc(mp); 731 } 732 733 /* 734 * Note: Code below is the "failure" case 735 */ 736 /* return error ack and blow away saved option results if any */ 737 connect_failed: 738 if (mp != NULL) 739 putnext(connp->conn_rq, mp); 740 else { 741 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ, 742 TSYSERR, ENOMEM); 743 } 744 } 745 746 /* Return the TPI/TLI equivalent of our current tcp_state */ 747 static int 748 tcp_tpistate(tcp_t *tcp) 749 { 750 switch (tcp->tcp_state) { 751 case TCPS_IDLE: 752 return (TS_UNBND); 753 case TCPS_LISTEN: 754 /* 755 * Return whether there are outstanding T_CONN_IND waiting 756 * for the matching T_CONN_RES. Therefore don't count q0. 757 */ 758 if (tcp->tcp_conn_req_cnt_q > 0) 759 return (TS_WRES_CIND); 760 else 761 return (TS_IDLE); 762 case TCPS_BOUND: 763 return (TS_IDLE); 764 case TCPS_SYN_SENT: 765 return (TS_WCON_CREQ); 766 case TCPS_SYN_RCVD: 767 /* 768 * Note: assumption: this has to the active open SYN_RCVD. 769 * The passive instance is detached in SYN_RCVD stage of 770 * incoming connection processing so we cannot get request 771 * for T_info_ack on it. 772 */ 773 return (TS_WACK_CRES); 774 case TCPS_ESTABLISHED: 775 return (TS_DATA_XFER); 776 case TCPS_CLOSE_WAIT: 777 return (TS_WREQ_ORDREL); 778 case TCPS_FIN_WAIT_1: 779 return (TS_WIND_ORDREL); 780 case TCPS_FIN_WAIT_2: 781 return (TS_WIND_ORDREL); 782 783 case TCPS_CLOSING: 784 case TCPS_LAST_ACK: 785 case TCPS_TIME_WAIT: 786 case TCPS_CLOSED: 787 /* 788 * Following TS_WACK_DREQ7 is a rendition of "not 789 * yet TS_IDLE" TPI state. There is no best match to any 790 * TPI state for TCPS_{CLOSING, LAST_ACK, TIME_WAIT} but we 791 * choose a value chosen that will map to TLI/XTI level 792 * state of TSTATECHNG (state is process of changing) which 793 * captures what this dummy state represents. 794 */ 795 return (TS_WACK_DREQ7); 796 default: 797 cmn_err(CE_WARN, "tcp_tpistate: strange state (%d) %s", 798 tcp->tcp_state, tcp_display(tcp, NULL, 799 DISP_PORT_ONLY)); 800 return (TS_UNBND); 801 } 802 } 803 804 static void 805 tcp_copy_info(struct T_info_ack *tia, tcp_t *tcp) 806 { 807 tcp_stack_t *tcps = tcp->tcp_tcps; 808 conn_t *connp = tcp->tcp_connp; 809 extern struct T_info_ack tcp_g_t_info_ack; 810 extern struct T_info_ack tcp_g_t_info_ack_v6; 811 812 if (connp->conn_family == AF_INET6) 813 *tia = tcp_g_t_info_ack_v6; 814 else 815 *tia = tcp_g_t_info_ack; 816 tia->CURRENT_state = tcp_tpistate(tcp); 817 tia->OPT_size = tcp_max_optsize; 818 if (tcp->tcp_mss == 0) { 819 /* Not yet set - tcp_open does not set mss */ 820 if (connp->conn_ipversion == IPV4_VERSION) 821 tia->TIDU_size = tcps->tcps_mss_def_ipv4; 822 else 823 tia->TIDU_size = tcps->tcps_mss_def_ipv6; 824 } else { 825 tia->TIDU_size = tcp->tcp_mss; 826 } 827 /* TODO: Default ETSDU is 1. Is that correct for tcp? */ 828 } 829 830 void 831 tcp_do_capability_ack(tcp_t *tcp, struct T_capability_ack *tcap, 832 t_uscalar_t cap_bits1) 833 { 834 tcap->CAP_bits1 = 0; 835 836 if (cap_bits1 & TC1_INFO) { 837 tcp_copy_info(&tcap->INFO_ack, tcp); 838 tcap->CAP_bits1 |= TC1_INFO; 839 } 840 841 if (cap_bits1 & TC1_ACCEPTOR_ID) { 842 tcap->ACCEPTOR_id = tcp->tcp_acceptor_id; 843 tcap->CAP_bits1 |= TC1_ACCEPTOR_ID; 844 } 845 846 } 847 848 /* 849 * This routine responds to T_CAPABILITY_REQ messages. It is called by 850 * tcp_wput. Much of the T_CAPABILITY_ACK information is copied from 851 * tcp_g_t_info_ack. The current state of the stream is copied from 852 * tcp_state. 853 */ 854 void 855 tcp_capability_req(tcp_t *tcp, mblk_t *mp) 856 { 857 t_uscalar_t cap_bits1; 858 struct T_capability_ack *tcap; 859 860 if (MBLKL(mp) < sizeof (struct T_capability_req)) { 861 freemsg(mp); 862 return; 863 } 864 865 cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1; 866 867 mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack), 868 mp->b_datap->db_type, T_CAPABILITY_ACK); 869 if (mp == NULL) 870 return; 871 872 tcap = (struct T_capability_ack *)mp->b_rptr; 873 tcp_do_capability_ack(tcp, tcap, cap_bits1); 874 875 putnext(tcp->tcp_connp->conn_rq, mp); 876 } 877 878 /* 879 * This routine responds to T_INFO_REQ messages. It is called by tcp_wput. 880 * Most of the T_INFO_ACK information is copied from tcp_g_t_info_ack. 881 * The current state of the stream is copied from tcp_state. 882 */ 883 void 884 tcp_info_req(tcp_t *tcp, mblk_t *mp) 885 { 886 mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO, 887 T_INFO_ACK); 888 if (!mp) { 889 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 890 return; 891 } 892 tcp_copy_info((struct T_info_ack *)mp->b_rptr, tcp); 893 putnext(tcp->tcp_connp->conn_rq, mp); 894 } 895 896 /* Respond to the TPI addr request */ 897 void 898 tcp_addr_req(tcp_t *tcp, mblk_t *mp) 899 { 900 struct sockaddr *sa; 901 mblk_t *ackmp; 902 struct T_addr_ack *taa; 903 conn_t *connp = tcp->tcp_connp; 904 uint_t addrlen; 905 906 /* Make it large enough for worst case */ 907 ackmp = reallocb(mp, sizeof (struct T_addr_ack) + 908 2 * sizeof (sin6_t), 1); 909 if (ackmp == NULL) { 910 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 911 return; 912 } 913 914 taa = (struct T_addr_ack *)ackmp->b_rptr; 915 916 bzero(taa, sizeof (struct T_addr_ack)); 917 ackmp->b_wptr = (uchar_t *)&taa[1]; 918 919 taa->PRIM_type = T_ADDR_ACK; 920 ackmp->b_datap->db_type = M_PCPROTO; 921 922 if (connp->conn_family == AF_INET) 923 addrlen = sizeof (sin_t); 924 else 925 addrlen = sizeof (sin6_t); 926 927 /* 928 * Note: Following code assumes 32 bit alignment of basic 929 * data structures like sin_t and struct T_addr_ack. 930 */ 931 if (tcp->tcp_state >= TCPS_BOUND) { 932 /* 933 * Fill in local address first 934 */ 935 taa->LOCADDR_offset = sizeof (*taa); 936 taa->LOCADDR_length = addrlen; 937 sa = (struct sockaddr *)&taa[1]; 938 (void) conn_getsockname(connp, sa, &addrlen); 939 ackmp->b_wptr += addrlen; 940 } 941 if (tcp->tcp_state >= TCPS_SYN_RCVD) { 942 /* 943 * Fill in Remote address 944 */ 945 taa->REMADDR_length = addrlen; 946 /* assumed 32-bit alignment */ 947 taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length; 948 sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset); 949 (void) conn_getpeername(connp, sa, &addrlen); 950 ackmp->b_wptr += addrlen; 951 } 952 ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim); 953 putnext(tcp->tcp_connp->conn_rq, ackmp); 954 } 955 956 /* 957 * Swap information between the eager and acceptor for a TLI/XTI client. 958 * The sockfs accept is done on the acceptor stream and control goes 959 * through tcp_tli_accept() and tcp_accept()/tcp_accept_swap() is not 960 * called. In either case, both the eager and listener are in their own 961 * perimeter (squeue) and the code has to deal with potential race. 962 * 963 * See the block comment on top of tcp_accept() and tcp_tli_accept(). 964 */ 965 static void 966 tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager) 967 { 968 conn_t *econnp, *aconnp; 969 970 ASSERT(eager->tcp_connp->conn_rq == listener->tcp_connp->conn_rq); 971 ASSERT(eager->tcp_detached && !acceptor->tcp_detached); 972 ASSERT(!TCP_IS_SOCKET(acceptor)); 973 ASSERT(!TCP_IS_SOCKET(eager)); 974 ASSERT(!TCP_IS_SOCKET(listener)); 975 976 /* 977 * Trusted Extensions may need to use a security label that is 978 * different from the acceptor's label on MLP and MAC-Exempt 979 * sockets. If this is the case, the required security label 980 * already exists in econnp->conn_ixa->ixa_tsl. Since we make the 981 * acceptor stream refer to econnp we atomatically get that label. 982 */ 983 984 acceptor->tcp_detached = B_TRUE; 985 /* 986 * To permit stream re-use by TLI/XTI, the eager needs a copy of 987 * the acceptor id. 988 */ 989 eager->tcp_acceptor_id = acceptor->tcp_acceptor_id; 990 991 /* remove eager from listen list... */ 992 mutex_enter(&listener->tcp_eager_lock); 993 tcp_eager_unlink(eager); 994 ASSERT(eager->tcp_eager_next_q == NULL && 995 eager->tcp_eager_last_q == NULL); 996 ASSERT(eager->tcp_eager_next_q0 == NULL && 997 eager->tcp_eager_prev_q0 == NULL); 998 mutex_exit(&listener->tcp_eager_lock); 999 1000 econnp = eager->tcp_connp; 1001 aconnp = acceptor->tcp_connp; 1002 econnp->conn_rq = aconnp->conn_rq; 1003 econnp->conn_wq = aconnp->conn_wq; 1004 econnp->conn_rq->q_ptr = econnp; 1005 econnp->conn_wq->q_ptr = econnp; 1006 1007 /* 1008 * In the TLI/XTI loopback case, we are inside the listener's squeue, 1009 * which might be a different squeue from our peer TCP instance. 1010 * For TCP Fusion, the peer expects that whenever tcp_detached is 1011 * clear, our TCP queues point to the acceptor's queues. Thus, use 1012 * membar_producer() to ensure that the assignments of conn_rq/conn_wq 1013 * above reach global visibility prior to the clearing of tcp_detached. 1014 */ 1015 membar_producer(); 1016 eager->tcp_detached = B_FALSE; 1017 1018 ASSERT(eager->tcp_ack_tid == 0); 1019 1020 econnp->conn_dev = aconnp->conn_dev; 1021 econnp->conn_minor_arena = aconnp->conn_minor_arena; 1022 1023 ASSERT(econnp->conn_minor_arena != NULL); 1024 if (econnp->conn_cred != NULL) 1025 crfree(econnp->conn_cred); 1026 econnp->conn_cred = aconnp->conn_cred; 1027 ASSERT(!(econnp->conn_ixa->ixa_free_flags & IXA_FREE_CRED)); 1028 econnp->conn_ixa->ixa_cred = econnp->conn_cred; 1029 aconnp->conn_cred = NULL; 1030 econnp->conn_cpid = aconnp->conn_cpid; 1031 ASSERT(econnp->conn_netstack == aconnp->conn_netstack); 1032 ASSERT(eager->tcp_tcps == acceptor->tcp_tcps); 1033 1034 econnp->conn_zoneid = aconnp->conn_zoneid; 1035 econnp->conn_allzones = aconnp->conn_allzones; 1036 econnp->conn_ixa->ixa_zoneid = aconnp->conn_ixa->ixa_zoneid; 1037 1038 econnp->conn_mac_mode = aconnp->conn_mac_mode; 1039 econnp->conn_zone_is_global = aconnp->conn_zone_is_global; 1040 aconnp->conn_mac_mode = CONN_MAC_DEFAULT; 1041 1042 /* Do the IPC initialization */ 1043 CONN_INC_REF(econnp); 1044 1045 /* Done with old IPC. Drop its ref on its connp */ 1046 CONN_DEC_REF(aconnp); 1047 } 1048 1049 /* 1050 * This runs at the tail end of accept processing on the squeue of the 1051 * new connection. 1052 */ 1053 /* ARGSUSED */ 1054 static void 1055 tcp_accept_finish(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) 1056 { 1057 conn_t *connp = (conn_t *)arg; 1058 tcp_t *tcp = connp->conn_tcp; 1059 queue_t *q = connp->conn_rq; 1060 tcp_stack_t *tcps = tcp->tcp_tcps; 1061 struct stroptions *stropt; 1062 struct sock_proto_props sopp; 1063 1064 /* Should never be called for non-STREAMS sockets */ 1065 ASSERT(!IPCL_IS_NONSTR(connp)); 1066 1067 /* We should just receive a single mblk that fits a T_discon_ind */ 1068 ASSERT(mp->b_cont == NULL); 1069 1070 /* 1071 * Drop the eager's ref on the listener, that was placed when 1072 * this eager began life in tcp_input_listener. 1073 */ 1074 CONN_DEC_REF(tcp->tcp_saved_listener->tcp_connp); 1075 1076 tcp->tcp_detached = B_FALSE; 1077 1078 if (tcp->tcp_state <= TCPS_BOUND || tcp->tcp_accept_error) { 1079 /* 1080 * Someone blewoff the eager before we could finish 1081 * the accept. 1082 * 1083 * The only reason eager exists it because we put in 1084 * a ref on it when conn ind went up. We need to send 1085 * a disconnect indication up while the last reference 1086 * on the eager will be dropped by the squeue when we 1087 * return. 1088 */ 1089 ASSERT(tcp->tcp_listener == NULL); 1090 if (tcp->tcp_issocket || tcp->tcp_send_discon_ind) { 1091 struct T_discon_ind *tdi; 1092 1093 (void) putnextctl1(q, M_FLUSH, FLUSHRW); 1094 /* 1095 * Let us reuse the incoming mblk to avoid 1096 * memory allocation failure problems. We know 1097 * that the size of the incoming mblk i.e. 1098 * stroptions is greater than sizeof 1099 * T_discon_ind. 1100 */ 1101 ASSERT(DB_REF(mp) == 1); 1102 ASSERT(MBLKSIZE(mp) >= 1103 sizeof (struct T_discon_ind)); 1104 1105 DB_TYPE(mp) = M_PROTO; 1106 ((union T_primitives *)mp->b_rptr)->type = 1107 T_DISCON_IND; 1108 tdi = (struct T_discon_ind *)mp->b_rptr; 1109 if (tcp->tcp_issocket) { 1110 tdi->DISCON_reason = ECONNREFUSED; 1111 tdi->SEQ_number = 0; 1112 } else { 1113 tdi->DISCON_reason = ENOPROTOOPT; 1114 tdi->SEQ_number = 1115 tcp->tcp_conn_req_seqnum; 1116 } 1117 mp->b_wptr = mp->b_rptr + 1118 sizeof (struct T_discon_ind); 1119 putnext(q, mp); 1120 } 1121 tcp->tcp_hard_binding = B_FALSE; 1122 return; 1123 } 1124 1125 /* 1126 * This is the first time we run on the correct 1127 * queue after tcp_accept. So fix all the q parameters 1128 * here. 1129 * 1130 * Let us reuse the incoming mblk to avoid 1131 * memory allocation failure problems. We know 1132 * that the size of the incoming mblk is at least 1133 * stroptions 1134 */ 1135 tcp_get_proto_props(tcp, &sopp); 1136 1137 ASSERT(DB_REF(mp) == 1); 1138 ASSERT(MBLKSIZE(mp) >= sizeof (struct stroptions)); 1139 1140 DB_TYPE(mp) = M_SETOPTS; 1141 stropt = (struct stroptions *)mp->b_rptr; 1142 mp->b_wptr = mp->b_rptr + sizeof (struct stroptions); 1143 stropt = (struct stroptions *)mp->b_rptr; 1144 ASSERT(sopp.sopp_flags & (SO_HIWAT|SO_WROFF|SO_MAXBLK)); 1145 stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK; 1146 stropt->so_hiwat = sopp.sopp_rxhiwat; 1147 stropt->so_wroff = sopp.sopp_wroff; 1148 stropt->so_maxblk = sopp.sopp_maxblk; 1149 1150 /* Send the options up */ 1151 putnext(q, mp); 1152 1153 /* 1154 * Pass up any data and/or a fin that has been received. 1155 * 1156 * Adjust receive window in case it had decreased 1157 * (because there is data <=> tcp_rcv_list != NULL) 1158 * while the connection was detached. Note that 1159 * in case the eager was flow-controlled, w/o this 1160 * code, the rwnd may never open up again! 1161 */ 1162 if (tcp->tcp_rcv_list != NULL) { 1163 /* We drain directly in case of fused tcp loopback */ 1164 1165 if (!tcp->tcp_fused && canputnext(q)) { 1166 tcp->tcp_rwnd = connp->conn_rcvbuf; 1167 if (tcp->tcp_state >= TCPS_ESTABLISHED && 1168 tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) { 1169 tcp_xmit_ctl(NULL, 1170 tcp, (tcp->tcp_swnd == 0) ? 1171 tcp->tcp_suna : tcp->tcp_snxt, 1172 tcp->tcp_rnxt, TH_ACK); 1173 } 1174 } 1175 1176 (void) tcp_rcv_drain(tcp); 1177 1178 /* 1179 * For fused tcp loopback, back-enable peer endpoint 1180 * if it's currently flow-controlled. 1181 */ 1182 if (tcp->tcp_fused) { 1183 tcp_t *peer_tcp = tcp->tcp_loopback_peer; 1184 1185 ASSERT(peer_tcp != NULL); 1186 ASSERT(peer_tcp->tcp_fused); 1187 1188 mutex_enter(&peer_tcp->tcp_non_sq_lock); 1189 if (peer_tcp->tcp_flow_stopped) { 1190 tcp_clrqfull(peer_tcp); 1191 TCP_STAT(tcps, tcp_fusion_backenabled); 1192 } 1193 mutex_exit(&peer_tcp->tcp_non_sq_lock); 1194 } 1195 } 1196 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg); 1197 if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) { 1198 tcp->tcp_ordrel_done = B_TRUE; 1199 mp = tcp->tcp_ordrel_mp; 1200 tcp->tcp_ordrel_mp = NULL; 1201 putnext(q, mp); 1202 } 1203 tcp->tcp_hard_binding = B_FALSE; 1204 1205 if (connp->conn_keepalive) { 1206 tcp->tcp_ka_last_intrvl = 0; 1207 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer, 1208 tcp->tcp_ka_interval); 1209 } 1210 1211 /* 1212 * At this point, eager is fully established and will 1213 * have the following references - 1214 * 1215 * 2 references for connection to exist (1 for TCP and 1 for IP). 1216 * 1 reference for the squeue which will be dropped by the squeue as 1217 * soon as this function returns. 1218 * There will be 1 additonal reference for being in classifier 1219 * hash list provided something bad hasn't happened. 1220 */ 1221 ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) || 1222 (connp->conn_fanout == NULL && connp->conn_ref >= 3)); 1223 } 1224 1225 /* 1226 * Pull a deferred connection indication off of the listener. The caller 1227 * must verify that there is a deferred conn ind under eager_lock before 1228 * calling this function. 1229 */ 1230 static mblk_t * 1231 tcp_get_def_conn_ind(tcp_t *listener) 1232 { 1233 tcp_t *tail; 1234 tcp_t *tcp; 1235 mblk_t *conn_ind; 1236 1237 ASSERT(MUTEX_HELD(&listener->tcp_eager_lock)); 1238 ASSERT(listener->tcp_eager_prev_q0->tcp_conn_def_q0); 1239 1240 tcp = listener->tcp_eager_prev_q0; 1241 /* 1242 * listener->tcp_eager_prev_q0 points to the TAIL of the 1243 * deferred T_conn_ind queue. We need to get to the head 1244 * of the queue in order to send up T_conn_ind the same 1245 * order as how the 3WHS is completed. 1246 */ 1247 while (tcp != listener) { 1248 if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0) 1249 break; 1250 else 1251 tcp = tcp->tcp_eager_prev_q0; 1252 } 1253 1254 conn_ind = tcp->tcp_conn.tcp_eager_conn_ind; 1255 tcp->tcp_conn.tcp_eager_conn_ind = NULL; 1256 /* Move from q0 to q */ 1257 ASSERT(listener->tcp_conn_req_cnt_q0 > 0); 1258 listener->tcp_conn_req_cnt_q0--; 1259 listener->tcp_conn_req_cnt_q++; 1260 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 1261 tcp->tcp_eager_prev_q0; 1262 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 1263 tcp->tcp_eager_next_q0; 1264 tcp->tcp_eager_prev_q0 = NULL; 1265 tcp->tcp_eager_next_q0 = NULL; 1266 tcp->tcp_conn_def_q0 = B_FALSE; 1267 1268 /* Make sure the tcp isn't in the list of droppables */ 1269 ASSERT(tcp->tcp_eager_next_drop_q0 == NULL && 1270 tcp->tcp_eager_prev_drop_q0 == NULL); 1271 1272 /* 1273 * Insert at end of the queue because sockfs sends 1274 * down T_CONN_RES in chronological order. Leaving 1275 * the older conn indications at front of the queue 1276 * helps reducing search time. 1277 */ 1278 tail = listener->tcp_eager_last_q; 1279 if (tail != NULL) { 1280 tail->tcp_eager_next_q = tcp; 1281 } else { 1282 listener->tcp_eager_next_q = tcp; 1283 } 1284 listener->tcp_eager_last_q = tcp; 1285 tcp->tcp_eager_next_q = NULL; 1286 1287 return (conn_ind); 1288 } 1289 1290 1291 /* 1292 * Reply to a clients T_CONN_RES TPI message. This function 1293 * is used only for TLI/XTI listener. Sockfs sends T_CONN_RES 1294 * on the acceptor STREAM and processed in tcp_accept_common(). 1295 * Read the block comment on top of tcp_input_listener(). 1296 */ 1297 void 1298 tcp_tli_accept(tcp_t *listener, mblk_t *mp) 1299 { 1300 tcp_t *acceptor; 1301 tcp_t *eager; 1302 struct T_conn_res *tcr; 1303 t_uscalar_t acceptor_id; 1304 t_scalar_t seqnum; 1305 mblk_t *discon_mp = NULL; 1306 mblk_t *ok_mp; 1307 mblk_t *mp1; 1308 tcp_stack_t *tcps = listener->tcp_tcps; 1309 conn_t *econnp; 1310 1311 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) { 1312 tcp_err_ack(listener, mp, TPROTO, 0); 1313 return; 1314 } 1315 tcr = (struct T_conn_res *)mp->b_rptr; 1316 1317 /* 1318 * Under ILP32 the stream head points tcr->ACCEPTOR_id at the 1319 * read side queue of the streams device underneath us i.e. the 1320 * read side queue of 'ip'. Since we can't deference QUEUE_ptr we 1321 * look it up in the queue_hash. Under LP64 it sends down the 1322 * minor_t of the accepting endpoint. 1323 * 1324 * Once the acceptor/eager are modified (in tcp_accept_swap) the 1325 * fanout hash lock is held. 1326 * This prevents any thread from entering the acceptor queue from 1327 * below (since it has not been hard bound yet i.e. any inbound 1328 * packets will arrive on the listener conn_t and 1329 * go through the classifier). 1330 * The CONN_INC_REF will prevent the acceptor from closing. 1331 * 1332 * XXX It is still possible for a tli application to send down data 1333 * on the accepting stream while another thread calls t_accept. 1334 * This should not be a problem for well-behaved applications since 1335 * the T_OK_ACK is sent after the queue swapping is completed. 1336 * 1337 * If the accepting fd is the same as the listening fd, avoid 1338 * queue hash lookup since that will return an eager listener in a 1339 * already established state. 1340 */ 1341 acceptor_id = tcr->ACCEPTOR_id; 1342 mutex_enter(&listener->tcp_eager_lock); 1343 if (listener->tcp_acceptor_id == acceptor_id) { 1344 eager = listener->tcp_eager_next_q; 1345 /* only count how many T_CONN_INDs so don't count q0 */ 1346 if ((listener->tcp_conn_req_cnt_q != 1) || 1347 (eager->tcp_conn_req_seqnum != tcr->SEQ_number)) { 1348 mutex_exit(&listener->tcp_eager_lock); 1349 tcp_err_ack(listener, mp, TBADF, 0); 1350 return; 1351 } 1352 if (listener->tcp_conn_req_cnt_q0 != 0) { 1353 /* Throw away all the eagers on q0. */ 1354 tcp_eager_cleanup(listener, 1); 1355 } 1356 if (listener->tcp_syn_defense) { 1357 listener->tcp_syn_defense = B_FALSE; 1358 if (listener->tcp_ip_addr_cache != NULL) { 1359 kmem_free(listener->tcp_ip_addr_cache, 1360 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t)); 1361 listener->tcp_ip_addr_cache = NULL; 1362 } 1363 } 1364 /* 1365 * Transfer tcp_conn_req_max to the eager so that when 1366 * a disconnect occurs we can revert the endpoint to the 1367 * listen state. 1368 */ 1369 eager->tcp_conn_req_max = listener->tcp_conn_req_max; 1370 ASSERT(listener->tcp_conn_req_cnt_q0 == 0); 1371 /* 1372 * Get a reference on the acceptor just like the 1373 * tcp_acceptor_hash_lookup below. 1374 */ 1375 acceptor = listener; 1376 CONN_INC_REF(acceptor->tcp_connp); 1377 } else { 1378 acceptor = tcp_acceptor_hash_lookup(acceptor_id, tcps); 1379 if (acceptor == NULL) { 1380 if (listener->tcp_connp->conn_debug) { 1381 (void) strlog(TCP_MOD_ID, 0, 1, 1382 SL_ERROR|SL_TRACE, 1383 "tcp_accept: did not find acceptor 0x%x\n", 1384 acceptor_id); 1385 } 1386 mutex_exit(&listener->tcp_eager_lock); 1387 tcp_err_ack(listener, mp, TPROVMISMATCH, 0); 1388 return; 1389 } 1390 /* 1391 * Verify acceptor state. The acceptable states for an acceptor 1392 * include TCPS_IDLE and TCPS_BOUND. 1393 */ 1394 switch (acceptor->tcp_state) { 1395 case TCPS_IDLE: 1396 /* FALLTHRU */ 1397 case TCPS_BOUND: 1398 break; 1399 default: 1400 CONN_DEC_REF(acceptor->tcp_connp); 1401 mutex_exit(&listener->tcp_eager_lock); 1402 tcp_err_ack(listener, mp, TOUTSTATE, 0); 1403 return; 1404 } 1405 } 1406 1407 /* The listener must be in TCPS_LISTEN */ 1408 if (listener->tcp_state != TCPS_LISTEN) { 1409 CONN_DEC_REF(acceptor->tcp_connp); 1410 mutex_exit(&listener->tcp_eager_lock); 1411 tcp_err_ack(listener, mp, TOUTSTATE, 0); 1412 return; 1413 } 1414 1415 /* 1416 * Rendezvous with an eager connection request packet hanging off 1417 * 'tcp' that has the 'seqnum' tag. We tagged the detached open 1418 * tcp structure when the connection packet arrived in 1419 * tcp_input_listener(). 1420 */ 1421 seqnum = tcr->SEQ_number; 1422 eager = listener; 1423 do { 1424 eager = eager->tcp_eager_next_q; 1425 if (eager == NULL) { 1426 CONN_DEC_REF(acceptor->tcp_connp); 1427 mutex_exit(&listener->tcp_eager_lock); 1428 tcp_err_ack(listener, mp, TBADSEQ, 0); 1429 return; 1430 } 1431 } while (eager->tcp_conn_req_seqnum != seqnum); 1432 mutex_exit(&listener->tcp_eager_lock); 1433 1434 /* 1435 * At this point, both acceptor and listener have 2 ref 1436 * that they begin with. Acceptor has one additional ref 1437 * we placed in lookup while listener has 3 additional 1438 * ref for being behind the squeue (tcp_accept() is 1439 * done on listener's squeue); being in classifier hash; 1440 * and eager's ref on listener. 1441 */ 1442 ASSERT(listener->tcp_connp->conn_ref >= 5); 1443 ASSERT(acceptor->tcp_connp->conn_ref >= 3); 1444 1445 /* 1446 * The eager at this point is set in its own squeue and 1447 * could easily have been killed (tcp_accept_finish will 1448 * deal with that) because of a TH_RST so we can only 1449 * ASSERT for a single ref. 1450 */ 1451 ASSERT(eager->tcp_connp->conn_ref >= 1); 1452 1453 /* 1454 * Pre allocate the discon_ind mblk also. tcp_accept_finish will 1455 * use it if something failed. 1456 */ 1457 discon_mp = allocb(MAX(sizeof (struct T_discon_ind), 1458 sizeof (struct stroptions)), BPRI_HI); 1459 if (discon_mp == NULL) { 1460 CONN_DEC_REF(acceptor->tcp_connp); 1461 CONN_DEC_REF(eager->tcp_connp); 1462 tcp_err_ack(listener, mp, TSYSERR, ENOMEM); 1463 return; 1464 } 1465 1466 econnp = eager->tcp_connp; 1467 1468 /* Hold a copy of mp, in case reallocb fails */ 1469 if ((mp1 = copymsg(mp)) == NULL) { 1470 CONN_DEC_REF(acceptor->tcp_connp); 1471 CONN_DEC_REF(eager->tcp_connp); 1472 freemsg(discon_mp); 1473 tcp_err_ack(listener, mp, TSYSERR, ENOMEM); 1474 return; 1475 } 1476 1477 tcr = (struct T_conn_res *)mp1->b_rptr; 1478 1479 /* 1480 * This is an expanded version of mi_tpi_ok_ack_alloc() 1481 * which allocates a larger mblk and appends the new 1482 * local address to the ok_ack. The address is copied by 1483 * soaccept() for getsockname(). 1484 */ 1485 { 1486 int extra; 1487 1488 extra = (econnp->conn_family == AF_INET) ? 1489 sizeof (sin_t) : sizeof (sin6_t); 1490 1491 /* 1492 * Try to re-use mp, if possible. Otherwise, allocate 1493 * an mblk and return it as ok_mp. In any case, mp 1494 * is no longer usable upon return. 1495 */ 1496 if ((ok_mp = mi_tpi_ok_ack_alloc_extra(mp, extra)) == NULL) { 1497 CONN_DEC_REF(acceptor->tcp_connp); 1498 CONN_DEC_REF(eager->tcp_connp); 1499 freemsg(discon_mp); 1500 /* Original mp has been freed by now, so use mp1 */ 1501 tcp_err_ack(listener, mp1, TSYSERR, ENOMEM); 1502 return; 1503 } 1504 1505 mp = NULL; /* We should never use mp after this point */ 1506 1507 switch (extra) { 1508 case sizeof (sin_t): { 1509 sin_t *sin = (sin_t *)ok_mp->b_wptr; 1510 1511 ok_mp->b_wptr += extra; 1512 sin->sin_family = AF_INET; 1513 sin->sin_port = econnp->conn_lport; 1514 sin->sin_addr.s_addr = econnp->conn_laddr_v4; 1515 break; 1516 } 1517 case sizeof (sin6_t): { 1518 sin6_t *sin6 = (sin6_t *)ok_mp->b_wptr; 1519 1520 ok_mp->b_wptr += extra; 1521 sin6->sin6_family = AF_INET6; 1522 sin6->sin6_port = econnp->conn_lport; 1523 sin6->sin6_addr = econnp->conn_laddr_v6; 1524 sin6->sin6_flowinfo = econnp->conn_flowinfo; 1525 if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) && 1526 (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) { 1527 sin6->sin6_scope_id = 1528 econnp->conn_ixa->ixa_scopeid; 1529 } else { 1530 sin6->sin6_scope_id = 0; 1531 } 1532 sin6->__sin6_src_id = 0; 1533 break; 1534 } 1535 default: 1536 break; 1537 } 1538 ASSERT(ok_mp->b_wptr <= ok_mp->b_datap->db_lim); 1539 } 1540 1541 /* 1542 * If there are no options we know that the T_CONN_RES will 1543 * succeed. However, we can't send the T_OK_ACK upstream until 1544 * the tcp_accept_swap is done since it would be dangerous to 1545 * let the application start using the new fd prior to the swap. 1546 */ 1547 tcp_accept_swap(listener, acceptor, eager); 1548 1549 /* 1550 * tcp_accept_swap unlinks eager from listener but does not drop 1551 * the eager's reference on the listener. 1552 */ 1553 ASSERT(eager->tcp_listener == NULL); 1554 ASSERT(listener->tcp_connp->conn_ref >= 5); 1555 1556 /* 1557 * The eager is now associated with its own queue. Insert in 1558 * the hash so that the connection can be reused for a future 1559 * T_CONN_RES. 1560 */ 1561 tcp_acceptor_hash_insert(acceptor_id, eager); 1562 1563 /* 1564 * We now do the processing of options with T_CONN_RES. 1565 * We delay till now since we wanted to have queue to pass to 1566 * option processing routines that points back to the right 1567 * instance structure which does not happen until after 1568 * tcp_accept_swap(). 1569 * 1570 * Note: 1571 * The sanity of the logic here assumes that whatever options 1572 * are appropriate to inherit from listner=>eager are done 1573 * before this point, and whatever were to be overridden (or not) 1574 * in transfer logic from eager=>acceptor in tcp_accept_swap(). 1575 * [ Warning: acceptor endpoint can have T_OPTMGMT_REQ done to it 1576 * before its ACCEPTOR_id comes down in T_CONN_RES ] 1577 * This may not be true at this point in time but can be fixed 1578 * independently. This option processing code starts with 1579 * the instantiated acceptor instance and the final queue at 1580 * this point. 1581 */ 1582 1583 if (tcr->OPT_length != 0) { 1584 /* Options to process */ 1585 int t_error = 0; 1586 int sys_error = 0; 1587 int do_disconnect = 0; 1588 1589 if (tcp_conprim_opt_process(eager, mp1, 1590 &do_disconnect, &t_error, &sys_error) < 0) { 1591 eager->tcp_accept_error = 1; 1592 if (do_disconnect) { 1593 /* 1594 * An option failed which does not allow 1595 * connection to be accepted. 1596 * 1597 * We allow T_CONN_RES to succeed and 1598 * put a T_DISCON_IND on the eager queue. 1599 */ 1600 ASSERT(t_error == 0 && sys_error == 0); 1601 eager->tcp_send_discon_ind = 1; 1602 } else { 1603 ASSERT(t_error != 0); 1604 freemsg(ok_mp); 1605 /* 1606 * Original mp was either freed or set 1607 * to ok_mp above, so use mp1 instead. 1608 */ 1609 tcp_err_ack(listener, mp1, t_error, sys_error); 1610 goto finish; 1611 } 1612 } 1613 /* 1614 * Most likely success in setting options (except if 1615 * eager->tcp_send_discon_ind set). 1616 * mp1 option buffer represented by OPT_length/offset 1617 * potentially modified and contains results of setting 1618 * options at this point 1619 */ 1620 } 1621 1622 /* We no longer need mp1, since all options processing has passed */ 1623 freemsg(mp1); 1624 1625 putnext(listener->tcp_connp->conn_rq, ok_mp); 1626 1627 mutex_enter(&listener->tcp_eager_lock); 1628 if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) { 1629 mblk_t *conn_ind; 1630 1631 /* 1632 * This path should not be executed if listener and 1633 * acceptor streams are the same. 1634 */ 1635 ASSERT(listener != acceptor); 1636 conn_ind = tcp_get_def_conn_ind(listener); 1637 mutex_exit(&listener->tcp_eager_lock); 1638 putnext(listener->tcp_connp->conn_rq, conn_ind); 1639 } else { 1640 mutex_exit(&listener->tcp_eager_lock); 1641 } 1642 1643 /* 1644 * Done with the acceptor - free it 1645 * 1646 * Note: from this point on, no access to listener should be made 1647 * as listener can be equal to acceptor. 1648 */ 1649 finish: 1650 ASSERT(acceptor->tcp_detached); 1651 acceptor->tcp_connp->conn_rq = NULL; 1652 ASSERT(!IPCL_IS_NONSTR(acceptor->tcp_connp)); 1653 acceptor->tcp_connp->conn_wq = NULL; 1654 (void) tcp_clean_death(acceptor, 0); 1655 CONN_DEC_REF(acceptor->tcp_connp); 1656 1657 /* 1658 * We pass discon_mp to tcp_accept_finish to get on the right squeue. 1659 * 1660 * It will update the setting for sockfs/stream head and also take 1661 * care of any data that arrived before accept() wad called. 1662 * In case we already received a FIN then tcp_accept_finish will send up 1663 * the ordrel. It will also send up a window update if the window 1664 * has opened up. 1665 */ 1666 1667 /* 1668 * XXX: we currently have a problem if XTI application closes the 1669 * acceptor stream in between. This problem exists in on10-gate also 1670 * and is well know but nothing can be done short of major rewrite 1671 * to fix it. Now it is possible to take care of it by assigning TLI/XTI 1672 * eager same squeue as listener (we can distinguish non socket 1673 * listeners at the time of handling a SYN in tcp_input_listener) 1674 * and do most of the work that tcp_accept_finish does here itself 1675 * and then get behind the acceptor squeue to access the acceptor 1676 * queue. 1677 */ 1678 /* 1679 * We already have a ref on tcp so no need to do one before squeue_enter 1680 */ 1681 SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, discon_mp, 1682 tcp_accept_finish, eager->tcp_connp, NULL, SQ_FILL, 1683 SQTAG_TCP_ACCEPT_FINISH); 1684 } 1685 1686 1687 /* 1688 * This is the STREAMS entry point for T_CONN_RES coming down on 1689 * Acceptor STREAM when sockfs listener does accept processing. 1690 * Read the block comment on top of tcp_input_listener(). 1691 */ 1692 int 1693 tcp_tpi_accept(queue_t *q, mblk_t *mp) 1694 { 1695 queue_t *rq = RD(q); 1696 struct T_conn_res *conn_res; 1697 tcp_t *eager; 1698 tcp_t *listener; 1699 struct T_ok_ack *ok; 1700 t_scalar_t PRIM_type; 1701 mblk_t *discon_mp; 1702 conn_t *econnp; 1703 cred_t *cr; 1704 1705 ASSERT(DB_TYPE(mp) == M_PROTO); 1706 1707 /* 1708 * All Solaris components should pass a db_credp 1709 * for this TPI message, hence we ASSERT. 1710 * But in case there is some other M_PROTO that looks 1711 * like a TPI message sent by some other kernel 1712 * component, we check and return an error. 1713 */ 1714 cr = msg_getcred(mp, NULL); 1715 ASSERT(cr != NULL); 1716 if (cr == NULL) { 1717 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, EINVAL); 1718 if (mp != NULL) 1719 putnext(rq, mp); 1720 return (0); 1721 } 1722 conn_res = (struct T_conn_res *)mp->b_rptr; 1723 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); 1724 if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_res)) { 1725 mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0); 1726 if (mp != NULL) 1727 putnext(rq, mp); 1728 return (0); 1729 } 1730 switch (conn_res->PRIM_type) { 1731 case O_T_CONN_RES: 1732 case T_CONN_RES: 1733 /* 1734 * We pass up an err ack if allocb fails. This will 1735 * cause sockfs to issue a T_DISCON_REQ which will cause 1736 * tcp_eager_blowoff to be called. sockfs will then call 1737 * rq->q_qinfo->qi_qclose to cleanup the acceptor stream. 1738 * we need to do the allocb up here because we have to 1739 * make sure rq->q_qinfo->qi_qclose still points to the 1740 * correct function (tcp_tpi_close_accept) in case allocb 1741 * fails. 1742 */ 1743 bcopy(mp->b_rptr + conn_res->OPT_offset, 1744 &eager, conn_res->OPT_length); 1745 PRIM_type = conn_res->PRIM_type; 1746 mp->b_datap->db_type = M_PCPROTO; 1747 mp->b_wptr = mp->b_rptr + sizeof (struct T_ok_ack); 1748 ok = (struct T_ok_ack *)mp->b_rptr; 1749 ok->PRIM_type = T_OK_ACK; 1750 ok->CORRECT_prim = PRIM_type; 1751 econnp = eager->tcp_connp; 1752 econnp->conn_dev = (dev_t)RD(q)->q_ptr; 1753 econnp->conn_minor_arena = (vmem_t *)(WR(q)->q_ptr); 1754 econnp->conn_rq = rq; 1755 econnp->conn_wq = q; 1756 rq->q_ptr = econnp; 1757 rq->q_qinfo = &tcp_rinitv4; /* No open - same as rinitv6 */ 1758 q->q_ptr = econnp; 1759 q->q_qinfo = &tcp_winit; 1760 listener = eager->tcp_listener; 1761 1762 /* 1763 * Pre allocate the discon_ind mblk also. tcp_accept_finish will 1764 * use it if something failed. 1765 */ 1766 discon_mp = allocb(MAX(sizeof (struct T_discon_ind), 1767 sizeof (struct stroptions)), BPRI_HI); 1768 1769 if (discon_mp == NULL) { 1770 mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0); 1771 if (mp != NULL) 1772 putnext(rq, mp); 1773 return (0); 1774 } 1775 1776 eager->tcp_issocket = B_TRUE; 1777 1778 ASSERT(econnp->conn_netstack == 1779 listener->tcp_connp->conn_netstack); 1780 ASSERT(eager->tcp_tcps == listener->tcp_tcps); 1781 1782 /* Put the ref for IP */ 1783 CONN_INC_REF(econnp); 1784 1785 /* 1786 * We should have minimum of 3 references on the conn 1787 * at this point. One each for TCP and IP and one for 1788 * the T_conn_ind that was sent up when the 3-way handshake 1789 * completed. In the normal case we would also have another 1790 * reference (making a total of 4) for the conn being in the 1791 * classifier hash list. However the eager could have received 1792 * an RST subsequently and tcp_closei_local could have removed 1793 * the eager from the classifier hash list, hence we can't 1794 * assert that reference. 1795 */ 1796 ASSERT(econnp->conn_ref >= 3); 1797 1798 mutex_enter(&listener->tcp_eager_lock); 1799 if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) { 1800 mblk_t *conn_ind = tcp_get_def_conn_ind(listener); 1801 1802 /* Need to get inside the listener perimeter */ 1803 CONN_INC_REF(listener->tcp_connp); 1804 SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp, 1805 conn_ind, tcp_send_pending, listener->tcp_connp, 1806 NULL, SQ_FILL, SQTAG_TCP_SEND_PENDING); 1807 } 1808 tcp_eager_unlink(eager); 1809 mutex_exit(&listener->tcp_eager_lock); 1810 1811 /* 1812 * At this point, the eager is detached from the listener 1813 * but we still have an extra refs on eager (apart from the 1814 * usual tcp references). The ref was placed in tcp_input_data 1815 * before sending the conn_ind in tcp_send_conn_ind. 1816 * The ref will be dropped in tcp_accept_finish(). 1817 */ 1818 SQUEUE_ENTER_ONE(econnp->conn_sqp, discon_mp, tcp_accept_finish, 1819 econnp, NULL, SQ_NODRAIN, SQTAG_TCP_ACCEPT_FINISH_Q0); 1820 1821 /* 1822 * Send the new local address also up to sockfs. There 1823 * should already be enough space in the mp that came 1824 * down from soaccept(). 1825 */ 1826 if (econnp->conn_family == AF_INET) { 1827 sin_t *sin; 1828 1829 ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >= 1830 (sizeof (struct T_ok_ack) + sizeof (sin_t))); 1831 sin = (sin_t *)mp->b_wptr; 1832 mp->b_wptr += sizeof (sin_t); 1833 sin->sin_family = AF_INET; 1834 sin->sin_port = econnp->conn_lport; 1835 sin->sin_addr.s_addr = econnp->conn_laddr_v4; 1836 } else { 1837 sin6_t *sin6; 1838 1839 ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >= 1840 sizeof (struct T_ok_ack) + sizeof (sin6_t)); 1841 sin6 = (sin6_t *)mp->b_wptr; 1842 mp->b_wptr += sizeof (sin6_t); 1843 sin6->sin6_family = AF_INET6; 1844 sin6->sin6_port = econnp->conn_lport; 1845 sin6->sin6_addr = econnp->conn_laddr_v6; 1846 if (econnp->conn_ipversion == IPV4_VERSION) 1847 sin6->sin6_flowinfo = 0; 1848 else 1849 sin6->sin6_flowinfo = econnp->conn_flowinfo; 1850 if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) && 1851 (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) { 1852 sin6->sin6_scope_id = 1853 econnp->conn_ixa->ixa_scopeid; 1854 } else { 1855 sin6->sin6_scope_id = 0; 1856 } 1857 sin6->__sin6_src_id = 0; 1858 } 1859 1860 putnext(rq, mp); 1861 break; 1862 default: 1863 mp = mi_tpi_err_ack_alloc(mp, TNOTSUPPORT, 0); 1864 if (mp != NULL) 1865 putnext(rq, mp); 1866 break; 1867 } 1868 return (0); 1869 } 1870 1871 /* 1872 * The function called through squeue to get behind listener's perimeter to 1873 * send a deferred conn_ind. 1874 */ 1875 /* ARGSUSED */ 1876 void 1877 tcp_send_pending(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) 1878 { 1879 conn_t *lconnp = (conn_t *)arg; 1880 tcp_t *listener = lconnp->conn_tcp; 1881 struct T_conn_ind *conn_ind; 1882 tcp_t *tcp; 1883 1884 conn_ind = (struct T_conn_ind *)mp->b_rptr; 1885 bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp, 1886 conn_ind->OPT_length); 1887 1888 if (listener->tcp_state != TCPS_LISTEN) { 1889 /* 1890 * If listener has closed, it would have caused a 1891 * a cleanup/blowoff to happen for the eager, so 1892 * we don't need to do anything more. 1893 */ 1894 freemsg(mp); 1895 return; 1896 } 1897 1898 putnext(lconnp->conn_rq, mp); 1899 } 1900 1901 /* 1902 * Sends the T_CONN_IND to the listener. The caller calls this 1903 * functions via squeue to get inside the listener's perimeter 1904 * once the 3 way hand shake is done a T_CONN_IND needs to be 1905 * sent. As an optimization, the caller can call this directly 1906 * if listener's perimeter is same as eager's. 1907 */ 1908 /* ARGSUSED */ 1909 void 1910 tcp_send_conn_ind(void *arg, mblk_t *mp, void *arg2) 1911 { 1912 conn_t *lconnp = (conn_t *)arg; 1913 tcp_t *listener = lconnp->conn_tcp; 1914 tcp_t *tcp; 1915 struct T_conn_ind *conn_ind; 1916 ipaddr_t *addr_cache; 1917 boolean_t need_send_conn_ind = B_FALSE; 1918 tcp_stack_t *tcps = listener->tcp_tcps; 1919 1920 /* retrieve the eager */ 1921 conn_ind = (struct T_conn_ind *)mp->b_rptr; 1922 ASSERT(conn_ind->OPT_offset != 0 && 1923 conn_ind->OPT_length == sizeof (intptr_t)); 1924 bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp, 1925 conn_ind->OPT_length); 1926 1927 /* 1928 * TLI/XTI applications will get confused by 1929 * sending eager as an option since it violates 1930 * the option semantics. So remove the eager as 1931 * option since TLI/XTI app doesn't need it anyway. 1932 */ 1933 if (!TCP_IS_SOCKET(listener)) { 1934 conn_ind->OPT_length = 0; 1935 conn_ind->OPT_offset = 0; 1936 } 1937 if (listener->tcp_state != TCPS_LISTEN) { 1938 /* 1939 * If listener has closed, it would have caused a 1940 * a cleanup/blowoff to happen for the eager. We 1941 * just need to return. 1942 */ 1943 freemsg(mp); 1944 return; 1945 } 1946 1947 1948 /* 1949 * if the conn_req_q is full defer passing up the 1950 * T_CONN_IND until space is availabe after t_accept() 1951 * processing 1952 */ 1953 mutex_enter(&listener->tcp_eager_lock); 1954 1955 /* 1956 * Take the eager out, if it is in the list of droppable eagers 1957 * as we are here because the 3W handshake is over. 1958 */ 1959 MAKE_UNDROPPABLE(tcp); 1960 1961 if (listener->tcp_conn_req_cnt_q < listener->tcp_conn_req_max) { 1962 tcp_t *tail; 1963 1964 /* 1965 * The eager already has an extra ref put in tcp_input_data 1966 * so that it stays till accept comes back even though it 1967 * might get into TCPS_CLOSED as a result of a TH_RST etc. 1968 */ 1969 ASSERT(listener->tcp_conn_req_cnt_q0 > 0); 1970 listener->tcp_conn_req_cnt_q0--; 1971 listener->tcp_conn_req_cnt_q++; 1972 1973 /* Move from SYN_RCVD to ESTABLISHED list */ 1974 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 1975 tcp->tcp_eager_prev_q0; 1976 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 1977 tcp->tcp_eager_next_q0; 1978 tcp->tcp_eager_prev_q0 = NULL; 1979 tcp->tcp_eager_next_q0 = NULL; 1980 1981 /* 1982 * Insert at end of the queue because sockfs 1983 * sends down T_CONN_RES in chronological 1984 * order. Leaving the older conn indications 1985 * at front of the queue helps reducing search 1986 * time. 1987 */ 1988 tail = listener->tcp_eager_last_q; 1989 if (tail != NULL) 1990 tail->tcp_eager_next_q = tcp; 1991 else 1992 listener->tcp_eager_next_q = tcp; 1993 listener->tcp_eager_last_q = tcp; 1994 tcp->tcp_eager_next_q = NULL; 1995 /* 1996 * Delay sending up the T_conn_ind until we are 1997 * done with the eager. Once we have have sent up 1998 * the T_conn_ind, the accept can potentially complete 1999 * any time and release the refhold we have on the eager. 2000 */ 2001 need_send_conn_ind = B_TRUE; 2002 } else { 2003 /* 2004 * Defer connection on q0 and set deferred 2005 * connection bit true 2006 */ 2007 tcp->tcp_conn_def_q0 = B_TRUE; 2008 2009 /* take tcp out of q0 ... */ 2010 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 2011 tcp->tcp_eager_next_q0; 2012 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 2013 tcp->tcp_eager_prev_q0; 2014 2015 /* ... and place it at the end of q0 */ 2016 tcp->tcp_eager_prev_q0 = listener->tcp_eager_prev_q0; 2017 tcp->tcp_eager_next_q0 = listener; 2018 listener->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp; 2019 listener->tcp_eager_prev_q0 = tcp; 2020 tcp->tcp_conn.tcp_eager_conn_ind = mp; 2021 } 2022 2023 /* we have timed out before */ 2024 if (tcp->tcp_syn_rcvd_timeout != 0) { 2025 tcp->tcp_syn_rcvd_timeout = 0; 2026 listener->tcp_syn_rcvd_timeout--; 2027 if (listener->tcp_syn_defense && 2028 listener->tcp_syn_rcvd_timeout <= 2029 (tcps->tcps_conn_req_max_q0 >> 5) && 2030 10*MINUTES < TICK_TO_MSEC(ddi_get_lbolt64() - 2031 listener->tcp_last_rcv_lbolt)) { 2032 /* 2033 * Turn off the defense mode if we 2034 * believe the SYN attack is over. 2035 */ 2036 listener->tcp_syn_defense = B_FALSE; 2037 if (listener->tcp_ip_addr_cache) { 2038 kmem_free((void *)listener->tcp_ip_addr_cache, 2039 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t)); 2040 listener->tcp_ip_addr_cache = NULL; 2041 } 2042 } 2043 } 2044 addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache); 2045 if (addr_cache != NULL) { 2046 /* 2047 * We have finished a 3-way handshake with this 2048 * remote host. This proves the IP addr is good. 2049 * Cache it! 2050 */ 2051 addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] = 2052 tcp->tcp_connp->conn_faddr_v4; 2053 } 2054 mutex_exit(&listener->tcp_eager_lock); 2055 if (need_send_conn_ind) 2056 putnext(lconnp->conn_rq, mp); 2057 } 2058