1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 26 /* This files contains all TCP TLI/TPI related functions */ 27 28 #include <sys/types.h> 29 #include <sys/stream.h> 30 #include <sys/strsun.h> 31 #include <sys/strsubr.h> 32 #include <sys/stropts.h> 33 #include <sys/strlog.h> 34 #define _SUN_TPI_VERSION 2 35 #include <sys/tihdr.h> 36 #include <sys/suntpi.h> 37 #include <sys/xti_inet.h> 38 #include <sys/squeue_impl.h> 39 #include <sys/squeue.h> 40 41 #include <inet/common.h> 42 #include <inet/ip.h> 43 #include <inet/tcp.h> 44 #include <inet/tcp_impl.h> 45 #include <inet/proto_set.h> 46 47 static void tcp_accept_swap(tcp_t *, tcp_t *, tcp_t *); 48 static int tcp_conprim_opt_process(tcp_t *, mblk_t *, int *, int *, int *); 49 50 void 51 tcp_use_pure_tpi(tcp_t *tcp) 52 { 53 conn_t *connp = tcp->tcp_connp; 54 55 #ifdef _ILP32 56 tcp->tcp_acceptor_id = (t_uscalar_t)connp->conn_rq; 57 #else 58 tcp->tcp_acceptor_id = connp->conn_dev; 59 #endif 60 /* 61 * Insert this socket into the acceptor hash. 62 * We might need it for T_CONN_RES message 63 */ 64 tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp); 65 66 tcp->tcp_issocket = B_FALSE; 67 TCP_STAT(tcp->tcp_tcps, tcp_sock_fallback); 68 } 69 70 /* Shorthand to generate and send TPI error acks to our client */ 71 void 72 tcp_err_ack(tcp_t *tcp, mblk_t *mp, int t_error, int sys_error) 73 { 74 if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL) 75 putnext(tcp->tcp_connp->conn_rq, mp); 76 } 77 78 /* Shorthand to generate and send TPI error acks to our client */ 79 void 80 tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive, 81 int t_error, int sys_error) 82 { 83 struct T_error_ack *teackp; 84 85 if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack), 86 M_PCPROTO, T_ERROR_ACK)) != NULL) { 87 teackp = (struct T_error_ack *)mp->b_rptr; 88 teackp->ERROR_prim = primitive; 89 teackp->TLI_error = t_error; 90 teackp->UNIX_error = sys_error; 91 putnext(tcp->tcp_connp->conn_rq, mp); 92 } 93 } 94 95 /* 96 * TCP routine to get the values of options. 97 */ 98 int 99 tcp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr) 100 { 101 return (tcp_opt_get(Q_TO_CONN(q), level, name, ptr)); 102 } 103 104 /* ARGSUSED */ 105 int 106 tcp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name, 107 uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, 108 void *thisdg_attrs, cred_t *cr) 109 { 110 conn_t *connp = Q_TO_CONN(q); 111 112 return (tcp_opt_set(connp, optset_context, level, name, inlen, invalp, 113 outlenp, outvalp, thisdg_attrs, cr)); 114 } 115 116 static int 117 tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp, int *do_disconnectp, 118 int *t_errorp, int *sys_errorp) 119 { 120 int error; 121 int is_absreq_failure; 122 t_scalar_t *opt_lenp; 123 t_scalar_t opt_offset; 124 int prim_type; 125 struct T_conn_req *tcreqp; 126 struct T_conn_res *tcresp; 127 cred_t *cr; 128 129 /* 130 * All Solaris components should pass a db_credp 131 * for this TPI message, hence we ASSERT. 132 * But in case there is some other M_PROTO that looks 133 * like a TPI message sent by some other kernel 134 * component, we check and return an error. 135 */ 136 cr = msg_getcred(mp, NULL); 137 ASSERT(cr != NULL); 138 if (cr == NULL) 139 return (-1); 140 141 prim_type = ((union T_primitives *)mp->b_rptr)->type; 142 ASSERT(prim_type == T_CONN_REQ || prim_type == O_T_CONN_RES || 143 prim_type == T_CONN_RES); 144 145 switch (prim_type) { 146 case T_CONN_REQ: 147 tcreqp = (struct T_conn_req *)mp->b_rptr; 148 opt_offset = tcreqp->OPT_offset; 149 opt_lenp = (t_scalar_t *)&tcreqp->OPT_length; 150 break; 151 case O_T_CONN_RES: 152 case T_CONN_RES: 153 tcresp = (struct T_conn_res *)mp->b_rptr; 154 opt_offset = tcresp->OPT_offset; 155 opt_lenp = (t_scalar_t *)&tcresp->OPT_length; 156 break; 157 } 158 159 *t_errorp = 0; 160 *sys_errorp = 0; 161 *do_disconnectp = 0; 162 163 error = tpi_optcom_buf(tcp->tcp_connp->conn_wq, mp, opt_lenp, 164 opt_offset, cr, &tcp_opt_obj, 165 NULL, &is_absreq_failure); 166 167 switch (error) { 168 case 0: /* no error */ 169 ASSERT(is_absreq_failure == 0); 170 return (0); 171 case ENOPROTOOPT: 172 *t_errorp = TBADOPT; 173 break; 174 case EACCES: 175 *t_errorp = TACCES; 176 break; 177 default: 178 *t_errorp = TSYSERR; *sys_errorp = error; 179 break; 180 } 181 if (is_absreq_failure != 0) { 182 /* 183 * The connection request should get the local ack 184 * T_OK_ACK and then a T_DISCON_IND. 185 */ 186 *do_disconnectp = 1; 187 } 188 return (-1); 189 } 190 191 void 192 tcp_tpi_bind(tcp_t *tcp, mblk_t *mp) 193 { 194 int error; 195 conn_t *connp = tcp->tcp_connp; 196 struct sockaddr *sa; 197 mblk_t *mp1; 198 struct T_bind_req *tbr; 199 int backlog; 200 socklen_t len; 201 sin_t *sin; 202 sin6_t *sin6; 203 cred_t *cr; 204 205 /* 206 * All Solaris components should pass a db_credp 207 * for this TPI message, hence we ASSERT. 208 * But in case there is some other M_PROTO that looks 209 * like a TPI message sent by some other kernel 210 * component, we check and return an error. 211 */ 212 cr = msg_getcred(mp, NULL); 213 ASSERT(cr != NULL); 214 if (cr == NULL) { 215 tcp_err_ack(tcp, mp, TSYSERR, EINVAL); 216 return; 217 } 218 219 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); 220 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) { 221 if (connp->conn_debug) { 222 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 223 "tcp_tpi_bind: bad req, len %u", 224 (uint_t)(mp->b_wptr - mp->b_rptr)); 225 } 226 tcp_err_ack(tcp, mp, TPROTO, 0); 227 return; 228 } 229 /* Make sure the largest address fits */ 230 mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1); 231 if (mp1 == NULL) { 232 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 233 return; 234 } 235 mp = mp1; 236 tbr = (struct T_bind_req *)mp->b_rptr; 237 238 backlog = tbr->CONIND_number; 239 len = tbr->ADDR_length; 240 241 switch (len) { 242 case 0: /* request for a generic port */ 243 tbr->ADDR_offset = sizeof (struct T_bind_req); 244 if (connp->conn_family == AF_INET) { 245 tbr->ADDR_length = sizeof (sin_t); 246 sin = (sin_t *)&tbr[1]; 247 *sin = sin_null; 248 sin->sin_family = AF_INET; 249 sa = (struct sockaddr *)sin; 250 len = sizeof (sin_t); 251 mp->b_wptr = (uchar_t *)&sin[1]; 252 } else { 253 ASSERT(connp->conn_family == AF_INET6); 254 tbr->ADDR_length = sizeof (sin6_t); 255 sin6 = (sin6_t *)&tbr[1]; 256 *sin6 = sin6_null; 257 sin6->sin6_family = AF_INET6; 258 sa = (struct sockaddr *)sin6; 259 len = sizeof (sin6_t); 260 mp->b_wptr = (uchar_t *)&sin6[1]; 261 } 262 break; 263 264 case sizeof (sin_t): /* Complete IPv4 address */ 265 sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset, 266 sizeof (sin_t)); 267 break; 268 269 case sizeof (sin6_t): /* Complete IPv6 address */ 270 sa = (struct sockaddr *)mi_offset_param(mp, 271 tbr->ADDR_offset, sizeof (sin6_t)); 272 break; 273 274 default: 275 if (connp->conn_debug) { 276 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 277 "tcp_tpi_bind: bad address length, %d", 278 tbr->ADDR_length); 279 } 280 tcp_err_ack(tcp, mp, TBADADDR, 0); 281 return; 282 } 283 284 if (backlog > 0) { 285 error = tcp_do_listen(connp, sa, len, backlog, DB_CRED(mp), 286 tbr->PRIM_type != O_T_BIND_REQ); 287 } else { 288 error = tcp_do_bind(connp, sa, len, DB_CRED(mp), 289 tbr->PRIM_type != O_T_BIND_REQ); 290 } 291 done: 292 if (error > 0) { 293 tcp_err_ack(tcp, mp, TSYSERR, error); 294 } else if (error < 0) { 295 tcp_err_ack(tcp, mp, -error, 0); 296 } else { 297 /* 298 * Update port information as sockfs/tpi needs it for checking 299 */ 300 if (connp->conn_family == AF_INET) { 301 sin = (sin_t *)sa; 302 sin->sin_port = connp->conn_lport; 303 } else { 304 sin6 = (sin6_t *)sa; 305 sin6->sin6_port = connp->conn_lport; 306 } 307 mp->b_datap->db_type = M_PCPROTO; 308 tbr->PRIM_type = T_BIND_ACK; 309 putnext(connp->conn_rq, mp); 310 } 311 } 312 313 /* tcp_unbind is called by tcp_wput_proto to handle T_UNBIND_REQ messages. */ 314 void 315 tcp_tpi_unbind(tcp_t *tcp, mblk_t *mp) 316 { 317 conn_t *connp = tcp->tcp_connp; 318 int error; 319 320 error = tcp_do_unbind(connp); 321 if (error > 0) { 322 tcp_err_ack(tcp, mp, TSYSERR, error); 323 } else if (error < 0) { 324 tcp_err_ack(tcp, mp, -error, 0); 325 } else { 326 /* Send M_FLUSH according to TPI */ 327 (void) putnextctl1(connp->conn_rq, M_FLUSH, FLUSHRW); 328 329 mp = mi_tpi_ok_ack_alloc(mp); 330 if (mp != NULL) 331 putnext(connp->conn_rq, mp); 332 } 333 } 334 335 /* ARGSUSED */ 336 int 337 tcp_tpi_close(queue_t *q, int flags, cred_t *credp __unused) 338 { 339 conn_t *connp; 340 341 ASSERT(WR(q)->q_next == NULL); 342 343 if (flags & SO_FALLBACK) { 344 /* 345 * stream is being closed while in fallback 346 * simply free the resources that were allocated 347 */ 348 inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr)); 349 qprocsoff(q); 350 goto done; 351 } 352 353 connp = Q_TO_CONN(q); 354 /* 355 * We are being closed as /dev/tcp or /dev/tcp6. 356 */ 357 tcp_close_common(connp, flags); 358 359 qprocsoff(q); 360 inet_minor_free(connp->conn_minor_arena, connp->conn_dev); 361 362 /* 363 * Drop IP's reference on the conn. This is the last reference 364 * on the connp if the state was less than established. If the 365 * connection has gone into timewait state, then we will have 366 * one ref for the TCP and one more ref (total of two) for the 367 * classifier connected hash list (a timewait connections stays 368 * in connected hash till closed). 369 * 370 * We can't assert the references because there might be other 371 * transient reference places because of some walkers or queued 372 * packets in squeue for the timewait state. 373 */ 374 CONN_DEC_REF(connp); 375 done: 376 q->q_ptr = WR(q)->q_ptr = NULL; 377 return (0); 378 } 379 380 /* ARGSUSED */ 381 int 382 tcp_tpi_close_accept(queue_t *q, int flags __unused, cred_t *credp __unused) 383 { 384 vmem_t *minor_arena; 385 dev_t conn_dev; 386 extern struct qinit tcp_acceptor_winit; 387 388 ASSERT(WR(q)->q_qinfo == &tcp_acceptor_winit); 389 390 /* 391 * We had opened an acceptor STREAM for sockfs which is 392 * now being closed due to some error. 393 */ 394 qprocsoff(q); 395 396 minor_arena = (vmem_t *)WR(q)->q_ptr; 397 conn_dev = (dev_t)RD(q)->q_ptr; 398 ASSERT(minor_arena != NULL); 399 ASSERT(conn_dev != 0); 400 inet_minor_free(minor_arena, conn_dev); 401 q->q_ptr = WR(q)->q_ptr = NULL; 402 return (0); 403 } 404 405 /* 406 * Put a connection confirmation message upstream built from the 407 * address/flowid information with the conn and iph. Report our success or 408 * failure. 409 */ 410 boolean_t 411 tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, mblk_t *idmp, 412 mblk_t **defermp, ip_recv_attr_t *ira) 413 { 414 sin_t sin; 415 sin6_t sin6; 416 mblk_t *mp; 417 char *optp = NULL; 418 int optlen = 0; 419 conn_t *connp = tcp->tcp_connp; 420 421 if (defermp != NULL) 422 *defermp = NULL; 423 424 if (tcp->tcp_conn.tcp_opts_conn_req != NULL) { 425 /* 426 * Return in T_CONN_CON results of option negotiation through 427 * the T_CONN_REQ. Note: If there is an real end-to-end option 428 * negotiation, then what is received from remote end needs 429 * to be taken into account but there is no such thing (yet?) 430 * in our TCP/IP. 431 * Note: We do not use mi_offset_param() here as 432 * tcp_opts_conn_req contents do not directly come from 433 * an application and are either generated in kernel or 434 * from user input that was already verified. 435 */ 436 mp = tcp->tcp_conn.tcp_opts_conn_req; 437 optp = (char *)(mp->b_rptr + 438 ((struct T_conn_req *)mp->b_rptr)->OPT_offset); 439 optlen = (int) 440 ((struct T_conn_req *)mp->b_rptr)->OPT_length; 441 } 442 443 if (IPH_HDR_VERSION(iphdr) == IPV4_VERSION) { 444 445 /* packet is IPv4 */ 446 if (connp->conn_family == AF_INET) { 447 sin = sin_null; 448 sin.sin_addr.s_addr = connp->conn_faddr_v4; 449 sin.sin_port = connp->conn_fport; 450 sin.sin_family = AF_INET; 451 mp = mi_tpi_conn_con(NULL, (char *)&sin, 452 (int)sizeof (sin_t), optp, optlen); 453 } else { 454 sin6 = sin6_null; 455 sin6.sin6_addr = connp->conn_faddr_v6; 456 sin6.sin6_port = connp->conn_fport; 457 sin6.sin6_family = AF_INET6; 458 mp = mi_tpi_conn_con(NULL, (char *)&sin6, 459 (int)sizeof (sin6_t), optp, optlen); 460 461 } 462 } else { 463 ip6_t *ip6h = (ip6_t *)iphdr; 464 465 ASSERT(IPH_HDR_VERSION(iphdr) == IPV6_VERSION); 466 ASSERT(connp->conn_family == AF_INET6); 467 sin6 = sin6_null; 468 sin6.sin6_addr = connp->conn_faddr_v6; 469 sin6.sin6_port = connp->conn_fport; 470 sin6.sin6_family = AF_INET6; 471 sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; 472 mp = mi_tpi_conn_con(NULL, (char *)&sin6, 473 (int)sizeof (sin6_t), optp, optlen); 474 } 475 476 if (!mp) 477 return (B_FALSE); 478 479 mblk_copycred(mp, idmp); 480 481 if (defermp == NULL) { 482 conn_t *connp = tcp->tcp_connp; 483 if (IPCL_IS_NONSTR(connp)) { 484 (*connp->conn_upcalls->su_connected) 485 (connp->conn_upper_handle, tcp->tcp_connid, 486 ira->ira_cred, ira->ira_cpid); 487 freemsg(mp); 488 } else { 489 if (ira->ira_cred != NULL) { 490 /* So that getpeerucred works for TPI sockfs */ 491 mblk_setcred(mp, ira->ira_cred, ira->ira_cpid); 492 } 493 putnext(connp->conn_rq, mp); 494 } 495 } else { 496 *defermp = mp; 497 } 498 499 if (tcp->tcp_conn.tcp_opts_conn_req != NULL) 500 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); 501 return (B_TRUE); 502 } 503 504 /* 505 * Successful connect request processing begins when our client passes 506 * a T_CONN_REQ message into tcp_wput(), which performs function calls into 507 * IP and the passes a T_OK_ACK (or T_ERROR_ACK upstream). 508 * 509 * After various error checks are completed, tcp_tpi_connect() lays 510 * the target address and port into the composite header template. 511 * Then we ask IP for information, including a source address if we didn't 512 * already have one. Finally we prepare to send the SYN packet, and then 513 * send up the T_OK_ACK reply message. 514 */ 515 void 516 tcp_tpi_connect(tcp_t *tcp, mblk_t *mp) 517 { 518 sin_t *sin; 519 struct T_conn_req *tcr; 520 struct sockaddr *sa; 521 socklen_t len; 522 int error; 523 cred_t *cr; 524 pid_t cpid; 525 conn_t *connp = tcp->tcp_connp; 526 queue_t *q = connp->conn_wq; 527 528 /* 529 * All Solaris components should pass a db_credp 530 * for this TPI message, hence we ASSERT. 531 * But in case there is some other M_PROTO that looks 532 * like a TPI message sent by some other kernel 533 * component, we check and return an error. 534 */ 535 cr = msg_getcred(mp, &cpid); 536 ASSERT(cr != NULL); 537 if (cr == NULL) { 538 tcp_err_ack(tcp, mp, TSYSERR, EINVAL); 539 return; 540 } 541 542 tcr = (struct T_conn_req *)mp->b_rptr; 543 544 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); 545 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) { 546 tcp_err_ack(tcp, mp, TPROTO, 0); 547 return; 548 } 549 550 /* 551 * Pre-allocate the T_ordrel_ind mblk so that at close time, we 552 * will always have that to send up. Otherwise, we need to do 553 * special handling in case the allocation fails at that time. 554 * If the end point is TPI, the tcp_t can be reused and the 555 * tcp_ordrel_mp may be allocated already. 556 */ 557 if (tcp->tcp_ordrel_mp == NULL) { 558 if ((tcp->tcp_ordrel_mp = mi_tpi_ordrel_ind()) == NULL) { 559 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 560 return; 561 } 562 } 563 564 /* 565 * Determine packet type based on type of address passed in 566 * the request should contain an IPv4 or IPv6 address. 567 * Make sure that address family matches the type of 568 * family of the address passed down. 569 */ 570 switch (tcr->DEST_length) { 571 default: 572 tcp_err_ack(tcp, mp, TBADADDR, 0); 573 return; 574 575 case (sizeof (sin_t) - sizeof (sin->sin_zero)): { 576 /* 577 * XXX: The check for valid DEST_length was not there 578 * in earlier releases and some buggy 579 * TLI apps (e.g Sybase) got away with not feeding 580 * in sin_zero part of address. 581 * We allow that bug to keep those buggy apps humming. 582 * Test suites require the check on DEST_length. 583 * We construct a new mblk with valid DEST_length 584 * free the original so the rest of the code does 585 * not have to keep track of this special shorter 586 * length address case. 587 */ 588 mblk_t *nmp; 589 struct T_conn_req *ntcr; 590 sin_t *nsin; 591 592 nmp = allocb(sizeof (struct T_conn_req) + sizeof (sin_t) + 593 tcr->OPT_length, BPRI_HI); 594 if (nmp == NULL) { 595 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 596 return; 597 } 598 ntcr = (struct T_conn_req *)nmp->b_rptr; 599 bzero(ntcr, sizeof (struct T_conn_req)); /* zero fill */ 600 ntcr->PRIM_type = T_CONN_REQ; 601 ntcr->DEST_length = sizeof (sin_t); 602 ntcr->DEST_offset = sizeof (struct T_conn_req); 603 604 nsin = (sin_t *)((uchar_t *)ntcr + ntcr->DEST_offset); 605 *nsin = sin_null; 606 /* Get pointer to shorter address to copy from original mp */ 607 sin = (sin_t *)mi_offset_param(mp, tcr->DEST_offset, 608 tcr->DEST_length); /* extract DEST_length worth of sin_t */ 609 if (sin == NULL || !OK_32PTR((char *)sin)) { 610 freemsg(nmp); 611 tcp_err_ack(tcp, mp, TSYSERR, EINVAL); 612 return; 613 } 614 nsin->sin_family = sin->sin_family; 615 nsin->sin_port = sin->sin_port; 616 nsin->sin_addr = sin->sin_addr; 617 /* Note:nsin->sin_zero zero-fill with sin_null assign above */ 618 nmp->b_wptr = (uchar_t *)&nsin[1]; 619 if (tcr->OPT_length != 0) { 620 ntcr->OPT_length = tcr->OPT_length; 621 ntcr->OPT_offset = nmp->b_wptr - nmp->b_rptr; 622 bcopy((uchar_t *)tcr + tcr->OPT_offset, 623 (uchar_t *)ntcr + ntcr->OPT_offset, 624 tcr->OPT_length); 625 nmp->b_wptr += tcr->OPT_length; 626 } 627 freemsg(mp); /* original mp freed */ 628 mp = nmp; /* re-initialize original variables */ 629 tcr = ntcr; 630 } 631 /* FALLTHRU */ 632 633 case sizeof (sin_t): 634 sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset, 635 sizeof (sin_t)); 636 len = sizeof (sin_t); 637 break; 638 639 case sizeof (sin6_t): 640 sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset, 641 sizeof (sin6_t)); 642 len = sizeof (sin6_t); 643 break; 644 } 645 646 error = proto_verify_ip_addr(connp->conn_family, sa, len); 647 if (error != 0) { 648 tcp_err_ack(tcp, mp, TSYSERR, error); 649 return; 650 } 651 652 /* 653 * TODO: If someone in TCPS_TIME_WAIT has this dst/port we 654 * should key on their sequence number and cut them loose. 655 */ 656 657 /* 658 * If options passed in, feed it for verification and handling 659 */ 660 if (tcr->OPT_length != 0) { 661 mblk_t *ok_mp; 662 mblk_t *discon_mp; 663 mblk_t *conn_opts_mp; 664 int t_error, sys_error, do_disconnect; 665 666 conn_opts_mp = NULL; 667 668 if (tcp_conprim_opt_process(tcp, mp, 669 &do_disconnect, &t_error, &sys_error) < 0) { 670 if (do_disconnect) { 671 ASSERT(t_error == 0 && sys_error == 0); 672 discon_mp = mi_tpi_discon_ind(NULL, 673 ECONNREFUSED, 0); 674 if (!discon_mp) { 675 tcp_err_ack_prim(tcp, mp, T_CONN_REQ, 676 TSYSERR, ENOMEM); 677 return; 678 } 679 ok_mp = mi_tpi_ok_ack_alloc(mp); 680 if (!ok_mp) { 681 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ, 682 TSYSERR, ENOMEM); 683 return; 684 } 685 qreply(q, ok_mp); 686 qreply(q, discon_mp); /* no flush! */ 687 } else { 688 ASSERT(t_error != 0); 689 tcp_err_ack_prim(tcp, mp, T_CONN_REQ, t_error, 690 sys_error); 691 } 692 return; 693 } 694 /* 695 * Success in setting options, the mp option buffer represented 696 * by OPT_length/offset has been potentially modified and 697 * contains results of option processing. We copy it in 698 * another mp to save it for potentially influencing returning 699 * it in T_CONN_CONN. 700 */ 701 if (tcr->OPT_length != 0) { /* there are resulting options */ 702 conn_opts_mp = copyb(mp); 703 if (!conn_opts_mp) { 704 tcp_err_ack_prim(tcp, mp, T_CONN_REQ, 705 TSYSERR, ENOMEM); 706 return; 707 } 708 ASSERT(tcp->tcp_conn.tcp_opts_conn_req == NULL); 709 tcp->tcp_conn.tcp_opts_conn_req = conn_opts_mp; 710 /* 711 * Note: 712 * These resulting option negotiation can include any 713 * end-to-end negotiation options but there no such 714 * thing (yet?) in our TCP/IP. 715 */ 716 } 717 } 718 719 /* call the non-TPI version */ 720 error = tcp_do_connect(tcp->tcp_connp, sa, len, cr, cpid); 721 if (error < 0) { 722 mp = mi_tpi_err_ack_alloc(mp, -error, 0); 723 } else if (error > 0) { 724 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, error); 725 } else { 726 mp = mi_tpi_ok_ack_alloc(mp); 727 } 728 729 /* 730 * Note: Code below is the "failure" case 731 */ 732 /* return error ack and blow away saved option results if any */ 733 connect_failed: 734 if (mp != NULL) 735 putnext(connp->conn_rq, mp); 736 else { 737 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ, 738 TSYSERR, ENOMEM); 739 } 740 } 741 742 /* Return the TPI/TLI equivalent of our current tcp_state */ 743 static int 744 tcp_tpistate(tcp_t *tcp) 745 { 746 switch (tcp->tcp_state) { 747 case TCPS_IDLE: 748 return (TS_UNBND); 749 case TCPS_LISTEN: 750 /* 751 * Return whether there are outstanding T_CONN_IND waiting 752 * for the matching T_CONN_RES. Therefore don't count q0. 753 */ 754 if (tcp->tcp_conn_req_cnt_q > 0) 755 return (TS_WRES_CIND); 756 else 757 return (TS_IDLE); 758 case TCPS_BOUND: 759 return (TS_IDLE); 760 case TCPS_SYN_SENT: 761 return (TS_WCON_CREQ); 762 case TCPS_SYN_RCVD: 763 /* 764 * Note: assumption: this has to the active open SYN_RCVD. 765 * The passive instance is detached in SYN_RCVD stage of 766 * incoming connection processing so we cannot get request 767 * for T_info_ack on it. 768 */ 769 return (TS_WACK_CRES); 770 case TCPS_ESTABLISHED: 771 return (TS_DATA_XFER); 772 case TCPS_CLOSE_WAIT: 773 return (TS_WREQ_ORDREL); 774 case TCPS_FIN_WAIT_1: 775 return (TS_WIND_ORDREL); 776 case TCPS_FIN_WAIT_2: 777 return (TS_WIND_ORDREL); 778 779 case TCPS_CLOSING: 780 case TCPS_LAST_ACK: 781 case TCPS_TIME_WAIT: 782 case TCPS_CLOSED: 783 /* 784 * Following TS_WACK_DREQ7 is a rendition of "not 785 * yet TS_IDLE" TPI state. There is no best match to any 786 * TPI state for TCPS_{CLOSING, LAST_ACK, TIME_WAIT} but we 787 * choose a value chosen that will map to TLI/XTI level 788 * state of TSTATECHNG (state is process of changing) which 789 * captures what this dummy state represents. 790 */ 791 return (TS_WACK_DREQ7); 792 default: 793 cmn_err(CE_WARN, "tcp_tpistate: strange state (%d) %s", 794 tcp->tcp_state, tcp_display(tcp, NULL, 795 DISP_PORT_ONLY)); 796 return (TS_UNBND); 797 } 798 } 799 800 static void 801 tcp_copy_info(struct T_info_ack *tia, tcp_t *tcp) 802 { 803 tcp_stack_t *tcps = tcp->tcp_tcps; 804 conn_t *connp = tcp->tcp_connp; 805 extern struct T_info_ack tcp_g_t_info_ack; 806 extern struct T_info_ack tcp_g_t_info_ack_v6; 807 808 if (connp->conn_family == AF_INET6) 809 *tia = tcp_g_t_info_ack_v6; 810 else 811 *tia = tcp_g_t_info_ack; 812 tia->CURRENT_state = tcp_tpistate(tcp); 813 tia->OPT_size = tcp_max_optsize; 814 if (tcp->tcp_mss == 0) { 815 /* Not yet set - tcp_open does not set mss */ 816 if (connp->conn_ipversion == IPV4_VERSION) 817 tia->TIDU_size = tcps->tcps_mss_def_ipv4; 818 else 819 tia->TIDU_size = tcps->tcps_mss_def_ipv6; 820 } else { 821 tia->TIDU_size = tcp->tcp_mss; 822 } 823 /* TODO: Default ETSDU is 1. Is that correct for tcp? */ 824 } 825 826 void 827 tcp_do_capability_ack(tcp_t *tcp, struct T_capability_ack *tcap, 828 t_uscalar_t cap_bits1) 829 { 830 tcap->CAP_bits1 = 0; 831 832 if (cap_bits1 & TC1_INFO) { 833 tcp_copy_info(&tcap->INFO_ack, tcp); 834 tcap->CAP_bits1 |= TC1_INFO; 835 } 836 837 if (cap_bits1 & TC1_ACCEPTOR_ID) { 838 tcap->ACCEPTOR_id = tcp->tcp_acceptor_id; 839 tcap->CAP_bits1 |= TC1_ACCEPTOR_ID; 840 } 841 842 } 843 844 /* 845 * This routine responds to T_CAPABILITY_REQ messages. It is called by 846 * tcp_wput. Much of the T_CAPABILITY_ACK information is copied from 847 * tcp_g_t_info_ack. The current state of the stream is copied from 848 * tcp_state. 849 */ 850 void 851 tcp_capability_req(tcp_t *tcp, mblk_t *mp) 852 { 853 t_uscalar_t cap_bits1; 854 struct T_capability_ack *tcap; 855 856 if (MBLKL(mp) < sizeof (struct T_capability_req)) { 857 freemsg(mp); 858 return; 859 } 860 861 cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1; 862 863 mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack), 864 mp->b_datap->db_type, T_CAPABILITY_ACK); 865 if (mp == NULL) 866 return; 867 868 tcap = (struct T_capability_ack *)mp->b_rptr; 869 tcp_do_capability_ack(tcp, tcap, cap_bits1); 870 871 putnext(tcp->tcp_connp->conn_rq, mp); 872 } 873 874 /* 875 * This routine responds to T_INFO_REQ messages. It is called by tcp_wput. 876 * Most of the T_INFO_ACK information is copied from tcp_g_t_info_ack. 877 * The current state of the stream is copied from tcp_state. 878 */ 879 void 880 tcp_info_req(tcp_t *tcp, mblk_t *mp) 881 { 882 mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO, 883 T_INFO_ACK); 884 if (!mp) { 885 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 886 return; 887 } 888 tcp_copy_info((struct T_info_ack *)mp->b_rptr, tcp); 889 putnext(tcp->tcp_connp->conn_rq, mp); 890 } 891 892 /* Respond to the TPI addr request */ 893 void 894 tcp_addr_req(tcp_t *tcp, mblk_t *mp) 895 { 896 struct sockaddr *sa; 897 mblk_t *ackmp; 898 struct T_addr_ack *taa; 899 conn_t *connp = tcp->tcp_connp; 900 uint_t addrlen; 901 902 /* Make it large enough for worst case */ 903 ackmp = reallocb(mp, sizeof (struct T_addr_ack) + 904 2 * sizeof (sin6_t), 1); 905 if (ackmp == NULL) { 906 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 907 return; 908 } 909 910 taa = (struct T_addr_ack *)ackmp->b_rptr; 911 912 bzero(taa, sizeof (struct T_addr_ack)); 913 ackmp->b_wptr = (uchar_t *)&taa[1]; 914 915 taa->PRIM_type = T_ADDR_ACK; 916 ackmp->b_datap->db_type = M_PCPROTO; 917 918 if (connp->conn_family == AF_INET) 919 addrlen = sizeof (sin_t); 920 else 921 addrlen = sizeof (sin6_t); 922 923 /* 924 * Note: Following code assumes 32 bit alignment of basic 925 * data structures like sin_t and struct T_addr_ack. 926 */ 927 if (tcp->tcp_state >= TCPS_BOUND) { 928 /* 929 * Fill in local address first 930 */ 931 taa->LOCADDR_offset = sizeof (*taa); 932 taa->LOCADDR_length = addrlen; 933 sa = (struct sockaddr *)&taa[1]; 934 (void) conn_getsockname(connp, sa, &addrlen); 935 ackmp->b_wptr += addrlen; 936 } 937 if (tcp->tcp_state >= TCPS_SYN_RCVD) { 938 /* 939 * Fill in Remote address 940 */ 941 taa->REMADDR_length = addrlen; 942 /* assumed 32-bit alignment */ 943 taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length; 944 sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset); 945 (void) conn_getpeername(connp, sa, &addrlen); 946 ackmp->b_wptr += addrlen; 947 } 948 ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim); 949 putnext(tcp->tcp_connp->conn_rq, ackmp); 950 } 951 952 /* 953 * Swap information between the eager and acceptor for a TLI/XTI client. 954 * The sockfs accept is done on the acceptor stream and control goes 955 * through tcp_tli_accept() and tcp_accept()/tcp_accept_swap() is not 956 * called. In either case, both the eager and listener are in their own 957 * perimeter (squeue) and the code has to deal with potential race. 958 * 959 * See the block comment on top of tcp_accept() and tcp_tli_accept(). 960 */ 961 static void 962 tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager) 963 { 964 conn_t *econnp, *aconnp; 965 966 ASSERT(eager->tcp_connp->conn_rq == listener->tcp_connp->conn_rq); 967 ASSERT(eager->tcp_detached && !acceptor->tcp_detached); 968 ASSERT(!TCP_IS_SOCKET(acceptor)); 969 ASSERT(!TCP_IS_SOCKET(eager)); 970 ASSERT(!TCP_IS_SOCKET(listener)); 971 972 /* 973 * Trusted Extensions may need to use a security label that is 974 * different from the acceptor's label on MLP and MAC-Exempt 975 * sockets. If this is the case, the required security label 976 * already exists in econnp->conn_ixa->ixa_tsl. Since we make the 977 * acceptor stream refer to econnp we atomatically get that label. 978 */ 979 980 acceptor->tcp_detached = B_TRUE; 981 /* 982 * To permit stream re-use by TLI/XTI, the eager needs a copy of 983 * the acceptor id. 984 */ 985 eager->tcp_acceptor_id = acceptor->tcp_acceptor_id; 986 987 /* remove eager from listen list... */ 988 mutex_enter(&listener->tcp_eager_lock); 989 tcp_eager_unlink(eager); 990 ASSERT(eager->tcp_eager_next_q == NULL && 991 eager->tcp_eager_last_q == NULL); 992 ASSERT(eager->tcp_eager_next_q0 == NULL && 993 eager->tcp_eager_prev_q0 == NULL); 994 mutex_exit(&listener->tcp_eager_lock); 995 996 econnp = eager->tcp_connp; 997 aconnp = acceptor->tcp_connp; 998 econnp->conn_rq = aconnp->conn_rq; 999 econnp->conn_wq = aconnp->conn_wq; 1000 econnp->conn_rq->q_ptr = econnp; 1001 econnp->conn_wq->q_ptr = econnp; 1002 1003 /* 1004 * In the TLI/XTI loopback case, we are inside the listener's squeue, 1005 * which might be a different squeue from our peer TCP instance. 1006 * For TCP Fusion, the peer expects that whenever tcp_detached is 1007 * clear, our TCP queues point to the acceptor's queues. Thus, use 1008 * membar_producer() to ensure that the assignments of conn_rq/conn_wq 1009 * above reach global visibility prior to the clearing of tcp_detached. 1010 */ 1011 membar_producer(); 1012 eager->tcp_detached = B_FALSE; 1013 1014 ASSERT(eager->tcp_ack_tid == 0); 1015 1016 econnp->conn_dev = aconnp->conn_dev; 1017 econnp->conn_minor_arena = aconnp->conn_minor_arena; 1018 1019 ASSERT(econnp->conn_minor_arena != NULL); 1020 if (econnp->conn_cred != NULL) 1021 crfree(econnp->conn_cred); 1022 econnp->conn_cred = aconnp->conn_cred; 1023 ASSERT(!(econnp->conn_ixa->ixa_free_flags & IXA_FREE_CRED)); 1024 econnp->conn_ixa->ixa_cred = econnp->conn_cred; 1025 aconnp->conn_cred = NULL; 1026 econnp->conn_cpid = aconnp->conn_cpid; 1027 ASSERT(econnp->conn_netstack == aconnp->conn_netstack); 1028 ASSERT(eager->tcp_tcps == acceptor->tcp_tcps); 1029 1030 econnp->conn_zoneid = aconnp->conn_zoneid; 1031 econnp->conn_allzones = aconnp->conn_allzones; 1032 econnp->conn_ixa->ixa_zoneid = aconnp->conn_ixa->ixa_zoneid; 1033 1034 econnp->conn_mac_mode = aconnp->conn_mac_mode; 1035 econnp->conn_zone_is_global = aconnp->conn_zone_is_global; 1036 aconnp->conn_mac_mode = CONN_MAC_DEFAULT; 1037 1038 /* Do the IPC initialization */ 1039 CONN_INC_REF(econnp); 1040 1041 /* Done with old IPC. Drop its ref on its connp */ 1042 CONN_DEC_REF(aconnp); 1043 } 1044 1045 /* 1046 * This runs at the tail end of accept processing on the squeue of the 1047 * new connection. 1048 */ 1049 /* ARGSUSED */ 1050 static void 1051 tcp_accept_finish(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) 1052 { 1053 conn_t *connp = (conn_t *)arg; 1054 tcp_t *tcp = connp->conn_tcp; 1055 queue_t *q = connp->conn_rq; 1056 tcp_stack_t *tcps = tcp->tcp_tcps; 1057 struct stroptions *stropt; 1058 struct sock_proto_props sopp; 1059 1060 /* Should never be called for non-STREAMS sockets */ 1061 ASSERT(!IPCL_IS_NONSTR(connp)); 1062 1063 /* We should just receive a single mblk that fits a T_discon_ind */ 1064 ASSERT(mp->b_cont == NULL); 1065 1066 /* 1067 * Drop the eager's ref on the listener, that was placed when 1068 * this eager began life in tcp_input_listener. 1069 */ 1070 CONN_DEC_REF(tcp->tcp_saved_listener->tcp_connp); 1071 1072 tcp->tcp_detached = B_FALSE; 1073 1074 if (tcp->tcp_state <= TCPS_BOUND || tcp->tcp_accept_error) { 1075 /* 1076 * Someone blewoff the eager before we could finish 1077 * the accept. 1078 * 1079 * The only reason eager exists it because we put in 1080 * a ref on it when conn ind went up. We need to send 1081 * a disconnect indication up while the last reference 1082 * on the eager will be dropped by the squeue when we 1083 * return. 1084 */ 1085 ASSERT(tcp->tcp_listener == NULL); 1086 if (tcp->tcp_issocket || tcp->tcp_send_discon_ind) { 1087 struct T_discon_ind *tdi; 1088 1089 (void) putnextctl1(q, M_FLUSH, FLUSHRW); 1090 /* 1091 * Let us reuse the incoming mblk to avoid 1092 * memory allocation failure problems. We know 1093 * that the size of the incoming mblk i.e. 1094 * stroptions is greater than sizeof 1095 * T_discon_ind. 1096 */ 1097 ASSERT(DB_REF(mp) == 1); 1098 ASSERT(MBLKSIZE(mp) >= 1099 sizeof (struct T_discon_ind)); 1100 1101 DB_TYPE(mp) = M_PROTO; 1102 ((union T_primitives *)mp->b_rptr)->type = 1103 T_DISCON_IND; 1104 tdi = (struct T_discon_ind *)mp->b_rptr; 1105 if (tcp->tcp_issocket) { 1106 tdi->DISCON_reason = ECONNREFUSED; 1107 tdi->SEQ_number = 0; 1108 } else { 1109 tdi->DISCON_reason = ENOPROTOOPT; 1110 tdi->SEQ_number = 1111 tcp->tcp_conn_req_seqnum; 1112 } 1113 mp->b_wptr = mp->b_rptr + 1114 sizeof (struct T_discon_ind); 1115 putnext(q, mp); 1116 } 1117 tcp->tcp_hard_binding = B_FALSE; 1118 return; 1119 } 1120 1121 /* 1122 * This is the first time we run on the correct 1123 * queue after tcp_accept. So fix all the q parameters 1124 * here. 1125 * 1126 * Let us reuse the incoming mblk to avoid 1127 * memory allocation failure problems. We know 1128 * that the size of the incoming mblk is at least 1129 * stroptions 1130 */ 1131 tcp_get_proto_props(tcp, &sopp); 1132 1133 ASSERT(DB_REF(mp) == 1); 1134 ASSERT(MBLKSIZE(mp) >= sizeof (struct stroptions)); 1135 1136 DB_TYPE(mp) = M_SETOPTS; 1137 stropt = (struct stroptions *)mp->b_rptr; 1138 mp->b_wptr = mp->b_rptr + sizeof (struct stroptions); 1139 stropt = (struct stroptions *)mp->b_rptr; 1140 ASSERT(sopp.sopp_flags & (SO_HIWAT|SO_WROFF|SO_MAXBLK)); 1141 stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK; 1142 stropt->so_hiwat = sopp.sopp_rxhiwat; 1143 stropt->so_wroff = sopp.sopp_wroff; 1144 stropt->so_maxblk = sopp.sopp_maxblk; 1145 1146 /* Send the options up */ 1147 putnext(q, mp); 1148 1149 /* 1150 * Pass up any data and/or a fin that has been received. 1151 * 1152 * Adjust receive window in case it had decreased 1153 * (because there is data <=> tcp_rcv_list != NULL) 1154 * while the connection was detached. Note that 1155 * in case the eager was flow-controlled, w/o this 1156 * code, the rwnd may never open up again! 1157 */ 1158 if (tcp->tcp_rcv_list != NULL) { 1159 /* We drain directly in case of fused tcp loopback */ 1160 1161 if (!tcp->tcp_fused && canputnext(q)) { 1162 tcp->tcp_rwnd = connp->conn_rcvbuf; 1163 if (tcp->tcp_state >= TCPS_ESTABLISHED && 1164 tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) { 1165 tcp_xmit_ctl(NULL, 1166 tcp, (tcp->tcp_swnd == 0) ? 1167 tcp->tcp_suna : tcp->tcp_snxt, 1168 tcp->tcp_rnxt, TH_ACK); 1169 } 1170 } 1171 1172 (void) tcp_rcv_drain(tcp); 1173 1174 /* 1175 * For fused tcp loopback, back-enable peer endpoint 1176 * if it's currently flow-controlled. 1177 */ 1178 if (tcp->tcp_fused) { 1179 tcp_t *peer_tcp = tcp->tcp_loopback_peer; 1180 1181 ASSERT(peer_tcp != NULL); 1182 ASSERT(peer_tcp->tcp_fused); 1183 1184 mutex_enter(&peer_tcp->tcp_non_sq_lock); 1185 if (peer_tcp->tcp_flow_stopped) { 1186 tcp_clrqfull(peer_tcp); 1187 TCP_STAT(tcps, tcp_fusion_backenabled); 1188 } 1189 mutex_exit(&peer_tcp->tcp_non_sq_lock); 1190 } 1191 } 1192 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg); 1193 if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) { 1194 tcp->tcp_ordrel_done = B_TRUE; 1195 mp = tcp->tcp_ordrel_mp; 1196 tcp->tcp_ordrel_mp = NULL; 1197 putnext(q, mp); 1198 } 1199 tcp->tcp_hard_binding = B_FALSE; 1200 1201 if (connp->conn_keepalive) { 1202 tcp->tcp_ka_last_intrvl = 0; 1203 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer, 1204 tcp->tcp_ka_interval); 1205 } 1206 1207 /* 1208 * At this point, eager is fully established and will 1209 * have the following references - 1210 * 1211 * 2 references for connection to exist (1 for TCP and 1 for IP). 1212 * 1 reference for the squeue which will be dropped by the squeue as 1213 * soon as this function returns. 1214 * There will be 1 additonal reference for being in classifier 1215 * hash list provided something bad hasn't happened. 1216 */ 1217 ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) || 1218 (connp->conn_fanout == NULL && connp->conn_ref >= 3)); 1219 } 1220 1221 /* 1222 * Pull a deferred connection indication off of the listener. The caller 1223 * must verify that there is a deferred conn ind under eager_lock before 1224 * calling this function. 1225 */ 1226 static mblk_t * 1227 tcp_get_def_conn_ind(tcp_t *listener) 1228 { 1229 tcp_t *tail; 1230 tcp_t *tcp; 1231 mblk_t *conn_ind; 1232 1233 ASSERT(MUTEX_HELD(&listener->tcp_eager_lock)); 1234 ASSERT(listener->tcp_eager_prev_q0->tcp_conn_def_q0); 1235 1236 tcp = listener->tcp_eager_prev_q0; 1237 /* 1238 * listener->tcp_eager_prev_q0 points to the TAIL of the 1239 * deferred T_conn_ind queue. We need to get to the head 1240 * of the queue in order to send up T_conn_ind the same 1241 * order as how the 3WHS is completed. 1242 */ 1243 while (tcp != listener) { 1244 if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0) 1245 break; 1246 else 1247 tcp = tcp->tcp_eager_prev_q0; 1248 } 1249 1250 conn_ind = tcp->tcp_conn.tcp_eager_conn_ind; 1251 tcp->tcp_conn.tcp_eager_conn_ind = NULL; 1252 /* Move from q0 to q */ 1253 ASSERT(listener->tcp_conn_req_cnt_q0 > 0); 1254 listener->tcp_conn_req_cnt_q0--; 1255 listener->tcp_conn_req_cnt_q++; 1256 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 1257 tcp->tcp_eager_prev_q0; 1258 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 1259 tcp->tcp_eager_next_q0; 1260 tcp->tcp_eager_prev_q0 = NULL; 1261 tcp->tcp_eager_next_q0 = NULL; 1262 tcp->tcp_conn_def_q0 = B_FALSE; 1263 1264 /* Make sure the tcp isn't in the list of droppables */ 1265 ASSERT(tcp->tcp_eager_next_drop_q0 == NULL && 1266 tcp->tcp_eager_prev_drop_q0 == NULL); 1267 1268 /* 1269 * Insert at end of the queue because sockfs sends 1270 * down T_CONN_RES in chronological order. Leaving 1271 * the older conn indications at front of the queue 1272 * helps reducing search time. 1273 */ 1274 tail = listener->tcp_eager_last_q; 1275 if (tail != NULL) { 1276 tail->tcp_eager_next_q = tcp; 1277 } else { 1278 listener->tcp_eager_next_q = tcp; 1279 } 1280 listener->tcp_eager_last_q = tcp; 1281 tcp->tcp_eager_next_q = NULL; 1282 1283 return (conn_ind); 1284 } 1285 1286 1287 /* 1288 * Reply to a clients T_CONN_RES TPI message. This function 1289 * is used only for TLI/XTI listener. Sockfs sends T_CONN_RES 1290 * on the acceptor STREAM and processed in tcp_accept_common(). 1291 * Read the block comment on top of tcp_input_listener(). 1292 */ 1293 void 1294 tcp_tli_accept(tcp_t *listener, mblk_t *mp) 1295 { 1296 tcp_t *acceptor; 1297 tcp_t *eager; 1298 struct T_conn_res *tcr; 1299 t_uscalar_t acceptor_id; 1300 t_scalar_t seqnum; 1301 mblk_t *discon_mp = NULL; 1302 mblk_t *ok_mp; 1303 mblk_t *mp1; 1304 tcp_stack_t *tcps = listener->tcp_tcps; 1305 conn_t *econnp; 1306 1307 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) { 1308 tcp_err_ack(listener, mp, TPROTO, 0); 1309 return; 1310 } 1311 tcr = (struct T_conn_res *)mp->b_rptr; 1312 1313 /* 1314 * Under ILP32 the stream head points tcr->ACCEPTOR_id at the 1315 * read side queue of the streams device underneath us i.e. the 1316 * read side queue of 'ip'. Since we can't deference QUEUE_ptr we 1317 * look it up in the queue_hash. Under LP64 it sends down the 1318 * minor_t of the accepting endpoint. 1319 * 1320 * Once the acceptor/eager are modified (in tcp_accept_swap) the 1321 * fanout hash lock is held. 1322 * This prevents any thread from entering the acceptor queue from 1323 * below (since it has not been hard bound yet i.e. any inbound 1324 * packets will arrive on the listener conn_t and 1325 * go through the classifier). 1326 * The CONN_INC_REF will prevent the acceptor from closing. 1327 * 1328 * XXX It is still possible for a tli application to send down data 1329 * on the accepting stream while another thread calls t_accept. 1330 * This should not be a problem for well-behaved applications since 1331 * the T_OK_ACK is sent after the queue swapping is completed. 1332 * 1333 * If the accepting fd is the same as the listening fd, avoid 1334 * queue hash lookup since that will return an eager listener in a 1335 * already established state. 1336 */ 1337 acceptor_id = tcr->ACCEPTOR_id; 1338 mutex_enter(&listener->tcp_eager_lock); 1339 if (listener->tcp_acceptor_id == acceptor_id) { 1340 eager = listener->tcp_eager_next_q; 1341 /* only count how many T_CONN_INDs so don't count q0 */ 1342 if ((listener->tcp_conn_req_cnt_q != 1) || 1343 (eager->tcp_conn_req_seqnum != tcr->SEQ_number)) { 1344 mutex_exit(&listener->tcp_eager_lock); 1345 tcp_err_ack(listener, mp, TBADF, 0); 1346 return; 1347 } 1348 if (listener->tcp_conn_req_cnt_q0 != 0) { 1349 /* Throw away all the eagers on q0. */ 1350 tcp_eager_cleanup(listener, 1); 1351 } 1352 if (listener->tcp_syn_defense) { 1353 listener->tcp_syn_defense = B_FALSE; 1354 if (listener->tcp_ip_addr_cache != NULL) { 1355 kmem_free(listener->tcp_ip_addr_cache, 1356 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t)); 1357 listener->tcp_ip_addr_cache = NULL; 1358 } 1359 } 1360 /* 1361 * Transfer tcp_conn_req_max to the eager so that when 1362 * a disconnect occurs we can revert the endpoint to the 1363 * listen state. 1364 */ 1365 eager->tcp_conn_req_max = listener->tcp_conn_req_max; 1366 ASSERT(listener->tcp_conn_req_cnt_q0 == 0); 1367 /* 1368 * Get a reference on the acceptor just like the 1369 * tcp_acceptor_hash_lookup below. 1370 */ 1371 acceptor = listener; 1372 CONN_INC_REF(acceptor->tcp_connp); 1373 } else { 1374 acceptor = tcp_acceptor_hash_lookup(acceptor_id, tcps); 1375 if (acceptor == NULL) { 1376 if (listener->tcp_connp->conn_debug) { 1377 (void) strlog(TCP_MOD_ID, 0, 1, 1378 SL_ERROR|SL_TRACE, 1379 "tcp_accept: did not find acceptor 0x%x\n", 1380 acceptor_id); 1381 } 1382 mutex_exit(&listener->tcp_eager_lock); 1383 tcp_err_ack(listener, mp, TPROVMISMATCH, 0); 1384 return; 1385 } 1386 /* 1387 * Verify acceptor state. The acceptable states for an acceptor 1388 * include TCPS_IDLE and TCPS_BOUND. 1389 */ 1390 switch (acceptor->tcp_state) { 1391 case TCPS_IDLE: 1392 /* FALLTHRU */ 1393 case TCPS_BOUND: 1394 break; 1395 default: 1396 CONN_DEC_REF(acceptor->tcp_connp); 1397 mutex_exit(&listener->tcp_eager_lock); 1398 tcp_err_ack(listener, mp, TOUTSTATE, 0); 1399 return; 1400 } 1401 } 1402 1403 /* The listener must be in TCPS_LISTEN */ 1404 if (listener->tcp_state != TCPS_LISTEN) { 1405 CONN_DEC_REF(acceptor->tcp_connp); 1406 mutex_exit(&listener->tcp_eager_lock); 1407 tcp_err_ack(listener, mp, TOUTSTATE, 0); 1408 return; 1409 } 1410 1411 /* 1412 * Rendezvous with an eager connection request packet hanging off 1413 * 'tcp' that has the 'seqnum' tag. We tagged the detached open 1414 * tcp structure when the connection packet arrived in 1415 * tcp_input_listener(). 1416 */ 1417 seqnum = tcr->SEQ_number; 1418 eager = listener; 1419 do { 1420 eager = eager->tcp_eager_next_q; 1421 if (eager == NULL) { 1422 CONN_DEC_REF(acceptor->tcp_connp); 1423 mutex_exit(&listener->tcp_eager_lock); 1424 tcp_err_ack(listener, mp, TBADSEQ, 0); 1425 return; 1426 } 1427 } while (eager->tcp_conn_req_seqnum != seqnum); 1428 mutex_exit(&listener->tcp_eager_lock); 1429 1430 /* 1431 * At this point, both acceptor and listener have 2 ref 1432 * that they begin with. Acceptor has one additional ref 1433 * we placed in lookup while listener has 3 additional 1434 * ref for being behind the squeue (tcp_accept() is 1435 * done on listener's squeue); being in classifier hash; 1436 * and eager's ref on listener. 1437 */ 1438 ASSERT(listener->tcp_connp->conn_ref >= 5); 1439 ASSERT(acceptor->tcp_connp->conn_ref >= 3); 1440 1441 /* 1442 * The eager at this point is set in its own squeue and 1443 * could easily have been killed (tcp_accept_finish will 1444 * deal with that) because of a TH_RST so we can only 1445 * ASSERT for a single ref. 1446 */ 1447 ASSERT(eager->tcp_connp->conn_ref >= 1); 1448 1449 /* 1450 * Pre allocate the discon_ind mblk also. tcp_accept_finish will 1451 * use it if something failed. 1452 */ 1453 discon_mp = allocb(MAX(sizeof (struct T_discon_ind), 1454 sizeof (struct stroptions)), BPRI_HI); 1455 if (discon_mp == NULL) { 1456 CONN_DEC_REF(acceptor->tcp_connp); 1457 CONN_DEC_REF(eager->tcp_connp); 1458 tcp_err_ack(listener, mp, TSYSERR, ENOMEM); 1459 return; 1460 } 1461 1462 econnp = eager->tcp_connp; 1463 1464 /* Hold a copy of mp, in case reallocb fails */ 1465 if ((mp1 = copymsg(mp)) == NULL) { 1466 CONN_DEC_REF(acceptor->tcp_connp); 1467 CONN_DEC_REF(eager->tcp_connp); 1468 freemsg(discon_mp); 1469 tcp_err_ack(listener, mp, TSYSERR, ENOMEM); 1470 return; 1471 } 1472 1473 tcr = (struct T_conn_res *)mp1->b_rptr; 1474 1475 /* 1476 * This is an expanded version of mi_tpi_ok_ack_alloc() 1477 * which allocates a larger mblk and appends the new 1478 * local address to the ok_ack. The address is copied by 1479 * soaccept() for getsockname(). 1480 */ 1481 { 1482 int extra; 1483 1484 extra = (econnp->conn_family == AF_INET) ? 1485 sizeof (sin_t) : sizeof (sin6_t); 1486 1487 /* 1488 * Try to re-use mp, if possible. Otherwise, allocate 1489 * an mblk and return it as ok_mp. In any case, mp 1490 * is no longer usable upon return. 1491 */ 1492 if ((ok_mp = mi_tpi_ok_ack_alloc_extra(mp, extra)) == NULL) { 1493 CONN_DEC_REF(acceptor->tcp_connp); 1494 CONN_DEC_REF(eager->tcp_connp); 1495 freemsg(discon_mp); 1496 /* Original mp has been freed by now, so use mp1 */ 1497 tcp_err_ack(listener, mp1, TSYSERR, ENOMEM); 1498 return; 1499 } 1500 1501 mp = NULL; /* We should never use mp after this point */ 1502 1503 switch (extra) { 1504 case sizeof (sin_t): { 1505 sin_t *sin = (sin_t *)ok_mp->b_wptr; 1506 1507 ok_mp->b_wptr += extra; 1508 sin->sin_family = AF_INET; 1509 sin->sin_port = econnp->conn_lport; 1510 sin->sin_addr.s_addr = econnp->conn_laddr_v4; 1511 break; 1512 } 1513 case sizeof (sin6_t): { 1514 sin6_t *sin6 = (sin6_t *)ok_mp->b_wptr; 1515 1516 ok_mp->b_wptr += extra; 1517 sin6->sin6_family = AF_INET6; 1518 sin6->sin6_port = econnp->conn_lport; 1519 sin6->sin6_addr = econnp->conn_laddr_v6; 1520 sin6->sin6_flowinfo = econnp->conn_flowinfo; 1521 if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) && 1522 (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) { 1523 sin6->sin6_scope_id = 1524 econnp->conn_ixa->ixa_scopeid; 1525 } else { 1526 sin6->sin6_scope_id = 0; 1527 } 1528 sin6->__sin6_src_id = 0; 1529 break; 1530 } 1531 default: 1532 break; 1533 } 1534 ASSERT(ok_mp->b_wptr <= ok_mp->b_datap->db_lim); 1535 } 1536 1537 /* 1538 * If there are no options we know that the T_CONN_RES will 1539 * succeed. However, we can't send the T_OK_ACK upstream until 1540 * the tcp_accept_swap is done since it would be dangerous to 1541 * let the application start using the new fd prior to the swap. 1542 */ 1543 tcp_accept_swap(listener, acceptor, eager); 1544 1545 /* 1546 * tcp_accept_swap unlinks eager from listener but does not drop 1547 * the eager's reference on the listener. 1548 */ 1549 ASSERT(eager->tcp_listener == NULL); 1550 ASSERT(listener->tcp_connp->conn_ref >= 5); 1551 1552 /* 1553 * The eager is now associated with its own queue. Insert in 1554 * the hash so that the connection can be reused for a future 1555 * T_CONN_RES. 1556 */ 1557 tcp_acceptor_hash_insert(acceptor_id, eager); 1558 1559 /* 1560 * We now do the processing of options with T_CONN_RES. 1561 * We delay till now since we wanted to have queue to pass to 1562 * option processing routines that points back to the right 1563 * instance structure which does not happen until after 1564 * tcp_accept_swap(). 1565 * 1566 * Note: 1567 * The sanity of the logic here assumes that whatever options 1568 * are appropriate to inherit from listner=>eager are done 1569 * before this point, and whatever were to be overridden (or not) 1570 * in transfer logic from eager=>acceptor in tcp_accept_swap(). 1571 * [ Warning: acceptor endpoint can have T_OPTMGMT_REQ done to it 1572 * before its ACCEPTOR_id comes down in T_CONN_RES ] 1573 * This may not be true at this point in time but can be fixed 1574 * independently. This option processing code starts with 1575 * the instantiated acceptor instance and the final queue at 1576 * this point. 1577 */ 1578 1579 if (tcr->OPT_length != 0) { 1580 /* Options to process */ 1581 int t_error = 0; 1582 int sys_error = 0; 1583 int do_disconnect = 0; 1584 1585 if (tcp_conprim_opt_process(eager, mp1, 1586 &do_disconnect, &t_error, &sys_error) < 0) { 1587 eager->tcp_accept_error = 1; 1588 if (do_disconnect) { 1589 /* 1590 * An option failed which does not allow 1591 * connection to be accepted. 1592 * 1593 * We allow T_CONN_RES to succeed and 1594 * put a T_DISCON_IND on the eager queue. 1595 */ 1596 ASSERT(t_error == 0 && sys_error == 0); 1597 eager->tcp_send_discon_ind = 1; 1598 } else { 1599 ASSERT(t_error != 0); 1600 freemsg(ok_mp); 1601 /* 1602 * Original mp was either freed or set 1603 * to ok_mp above, so use mp1 instead. 1604 */ 1605 tcp_err_ack(listener, mp1, t_error, sys_error); 1606 goto finish; 1607 } 1608 } 1609 /* 1610 * Most likely success in setting options (except if 1611 * eager->tcp_send_discon_ind set). 1612 * mp1 option buffer represented by OPT_length/offset 1613 * potentially modified and contains results of setting 1614 * options at this point 1615 */ 1616 } 1617 1618 /* We no longer need mp1, since all options processing has passed */ 1619 freemsg(mp1); 1620 1621 putnext(listener->tcp_connp->conn_rq, ok_mp); 1622 1623 mutex_enter(&listener->tcp_eager_lock); 1624 if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) { 1625 mblk_t *conn_ind; 1626 1627 /* 1628 * This path should not be executed if listener and 1629 * acceptor streams are the same. 1630 */ 1631 ASSERT(listener != acceptor); 1632 conn_ind = tcp_get_def_conn_ind(listener); 1633 mutex_exit(&listener->tcp_eager_lock); 1634 putnext(listener->tcp_connp->conn_rq, conn_ind); 1635 } else { 1636 mutex_exit(&listener->tcp_eager_lock); 1637 } 1638 1639 /* 1640 * Done with the acceptor - free it 1641 * 1642 * Note: from this point on, no access to listener should be made 1643 * as listener can be equal to acceptor. 1644 */ 1645 finish: 1646 ASSERT(acceptor->tcp_detached); 1647 acceptor->tcp_connp->conn_rq = NULL; 1648 ASSERT(!IPCL_IS_NONSTR(acceptor->tcp_connp)); 1649 acceptor->tcp_connp->conn_wq = NULL; 1650 (void) tcp_clean_death(acceptor, 0); 1651 CONN_DEC_REF(acceptor->tcp_connp); 1652 1653 /* 1654 * We pass discon_mp to tcp_accept_finish to get on the right squeue. 1655 * 1656 * It will update the setting for sockfs/stream head and also take 1657 * care of any data that arrived before accept() wad called. 1658 * In case we already received a FIN then tcp_accept_finish will send up 1659 * the ordrel. It will also send up a window update if the window 1660 * has opened up. 1661 */ 1662 1663 /* 1664 * XXX: we currently have a problem if XTI application closes the 1665 * acceptor stream in between. This problem exists in on10-gate also 1666 * and is well know but nothing can be done short of major rewrite 1667 * to fix it. Now it is possible to take care of it by assigning TLI/XTI 1668 * eager same squeue as listener (we can distinguish non socket 1669 * listeners at the time of handling a SYN in tcp_input_listener) 1670 * and do most of the work that tcp_accept_finish does here itself 1671 * and then get behind the acceptor squeue to access the acceptor 1672 * queue. 1673 */ 1674 /* 1675 * We already have a ref on tcp so no need to do one before squeue_enter 1676 */ 1677 SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, discon_mp, 1678 tcp_accept_finish, eager->tcp_connp, NULL, SQ_FILL, 1679 SQTAG_TCP_ACCEPT_FINISH); 1680 } 1681 1682 1683 /* 1684 * This is the STREAMS entry point for T_CONN_RES coming down on 1685 * Acceptor STREAM when sockfs listener does accept processing. 1686 * Read the block comment on top of tcp_input_listener(). 1687 */ 1688 int 1689 tcp_tpi_accept(queue_t *q, mblk_t *mp) 1690 { 1691 queue_t *rq = RD(q); 1692 struct T_conn_res *conn_res; 1693 tcp_t *eager; 1694 tcp_t *listener; 1695 struct T_ok_ack *ok; 1696 t_scalar_t PRIM_type; 1697 mblk_t *discon_mp; 1698 conn_t *econnp; 1699 cred_t *cr; 1700 1701 ASSERT(DB_TYPE(mp) == M_PROTO); 1702 1703 /* 1704 * All Solaris components should pass a db_credp 1705 * for this TPI message, hence we ASSERT. 1706 * But in case there is some other M_PROTO that looks 1707 * like a TPI message sent by some other kernel 1708 * component, we check and return an error. 1709 */ 1710 cr = msg_getcred(mp, NULL); 1711 ASSERT(cr != NULL); 1712 if (cr == NULL) { 1713 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, EINVAL); 1714 if (mp != NULL) 1715 putnext(rq, mp); 1716 return (0); 1717 } 1718 conn_res = (struct T_conn_res *)mp->b_rptr; 1719 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); 1720 if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_res)) { 1721 mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0); 1722 if (mp != NULL) 1723 putnext(rq, mp); 1724 return (0); 1725 } 1726 switch (conn_res->PRIM_type) { 1727 case O_T_CONN_RES: 1728 case T_CONN_RES: 1729 /* 1730 * We pass up an err ack if allocb fails. This will 1731 * cause sockfs to issue a T_DISCON_REQ which will cause 1732 * tcp_eager_blowoff to be called. sockfs will then call 1733 * rq->q_qinfo->qi_qclose to cleanup the acceptor stream. 1734 * we need to do the allocb up here because we have to 1735 * make sure rq->q_qinfo->qi_qclose still points to the 1736 * correct function (tcp_tpi_close_accept) in case allocb 1737 * fails. 1738 */ 1739 bcopy(mp->b_rptr + conn_res->OPT_offset, 1740 &eager, conn_res->OPT_length); 1741 PRIM_type = conn_res->PRIM_type; 1742 mp->b_datap->db_type = M_PCPROTO; 1743 mp->b_wptr = mp->b_rptr + sizeof (struct T_ok_ack); 1744 ok = (struct T_ok_ack *)mp->b_rptr; 1745 ok->PRIM_type = T_OK_ACK; 1746 ok->CORRECT_prim = PRIM_type; 1747 econnp = eager->tcp_connp; 1748 econnp->conn_dev = (dev_t)RD(q)->q_ptr; 1749 econnp->conn_minor_arena = (vmem_t *)(WR(q)->q_ptr); 1750 econnp->conn_rq = rq; 1751 econnp->conn_wq = q; 1752 rq->q_ptr = econnp; 1753 rq->q_qinfo = &tcp_rinitv4; /* No open - same as rinitv6 */ 1754 q->q_ptr = econnp; 1755 q->q_qinfo = &tcp_winit; 1756 listener = eager->tcp_listener; 1757 1758 /* 1759 * Pre allocate the discon_ind mblk also. tcp_accept_finish will 1760 * use it if something failed. 1761 */ 1762 discon_mp = allocb(MAX(sizeof (struct T_discon_ind), 1763 sizeof (struct stroptions)), BPRI_HI); 1764 1765 if (discon_mp == NULL) { 1766 mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0); 1767 if (mp != NULL) 1768 putnext(rq, mp); 1769 return (0); 1770 } 1771 1772 eager->tcp_issocket = B_TRUE; 1773 1774 ASSERT(econnp->conn_netstack == 1775 listener->tcp_connp->conn_netstack); 1776 ASSERT(eager->tcp_tcps == listener->tcp_tcps); 1777 1778 /* Put the ref for IP */ 1779 CONN_INC_REF(econnp); 1780 1781 /* 1782 * We should have minimum of 3 references on the conn 1783 * at this point. One each for TCP and IP and one for 1784 * the T_conn_ind that was sent up when the 3-way handshake 1785 * completed. In the normal case we would also have another 1786 * reference (making a total of 4) for the conn being in the 1787 * classifier hash list. However the eager could have received 1788 * an RST subsequently and tcp_closei_local could have removed 1789 * the eager from the classifier hash list, hence we can't 1790 * assert that reference. 1791 */ 1792 ASSERT(econnp->conn_ref >= 3); 1793 1794 mutex_enter(&listener->tcp_eager_lock); 1795 if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) { 1796 mblk_t *conn_ind = tcp_get_def_conn_ind(listener); 1797 1798 /* Need to get inside the listener perimeter */ 1799 CONN_INC_REF(listener->tcp_connp); 1800 SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp, 1801 conn_ind, tcp_send_pending, listener->tcp_connp, 1802 NULL, SQ_FILL, SQTAG_TCP_SEND_PENDING); 1803 } 1804 tcp_eager_unlink(eager); 1805 mutex_exit(&listener->tcp_eager_lock); 1806 1807 /* 1808 * At this point, the eager is detached from the listener 1809 * but we still have an extra refs on eager (apart from the 1810 * usual tcp references). The ref was placed in tcp_input_data 1811 * before sending the conn_ind in tcp_send_conn_ind. 1812 * The ref will be dropped in tcp_accept_finish(). 1813 */ 1814 SQUEUE_ENTER_ONE(econnp->conn_sqp, discon_mp, tcp_accept_finish, 1815 econnp, NULL, SQ_NODRAIN, SQTAG_TCP_ACCEPT_FINISH_Q0); 1816 1817 /* 1818 * Send the new local address also up to sockfs. There 1819 * should already be enough space in the mp that came 1820 * down from soaccept(). 1821 */ 1822 if (econnp->conn_family == AF_INET) { 1823 sin_t *sin; 1824 1825 ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >= 1826 (sizeof (struct T_ok_ack) + sizeof (sin_t))); 1827 sin = (sin_t *)mp->b_wptr; 1828 mp->b_wptr += sizeof (sin_t); 1829 sin->sin_family = AF_INET; 1830 sin->sin_port = econnp->conn_lport; 1831 sin->sin_addr.s_addr = econnp->conn_laddr_v4; 1832 } else { 1833 sin6_t *sin6; 1834 1835 ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >= 1836 sizeof (struct T_ok_ack) + sizeof (sin6_t)); 1837 sin6 = (sin6_t *)mp->b_wptr; 1838 mp->b_wptr += sizeof (sin6_t); 1839 sin6->sin6_family = AF_INET6; 1840 sin6->sin6_port = econnp->conn_lport; 1841 sin6->sin6_addr = econnp->conn_laddr_v6; 1842 if (econnp->conn_ipversion == IPV4_VERSION) 1843 sin6->sin6_flowinfo = 0; 1844 else 1845 sin6->sin6_flowinfo = econnp->conn_flowinfo; 1846 if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) && 1847 (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) { 1848 sin6->sin6_scope_id = 1849 econnp->conn_ixa->ixa_scopeid; 1850 } else { 1851 sin6->sin6_scope_id = 0; 1852 } 1853 sin6->__sin6_src_id = 0; 1854 } 1855 1856 putnext(rq, mp); 1857 break; 1858 default: 1859 mp = mi_tpi_err_ack_alloc(mp, TNOTSUPPORT, 0); 1860 if (mp != NULL) 1861 putnext(rq, mp); 1862 break; 1863 } 1864 return (0); 1865 } 1866 1867 /* 1868 * The function called through squeue to get behind listener's perimeter to 1869 * send a deferred conn_ind. 1870 */ 1871 /* ARGSUSED */ 1872 void 1873 tcp_send_pending(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) 1874 { 1875 conn_t *lconnp = (conn_t *)arg; 1876 tcp_t *listener = lconnp->conn_tcp; 1877 struct T_conn_ind *conn_ind; 1878 tcp_t *tcp; 1879 1880 conn_ind = (struct T_conn_ind *)mp->b_rptr; 1881 bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp, 1882 conn_ind->OPT_length); 1883 1884 if (listener->tcp_state != TCPS_LISTEN) { 1885 /* 1886 * If listener has closed, it would have caused a 1887 * a cleanup/blowoff to happen for the eager, so 1888 * we don't need to do anything more. 1889 */ 1890 freemsg(mp); 1891 return; 1892 } 1893 1894 putnext(lconnp->conn_rq, mp); 1895 } 1896 1897 /* 1898 * Sends the T_CONN_IND to the listener. The caller calls this 1899 * functions via squeue to get inside the listener's perimeter 1900 * once the 3 way hand shake is done a T_CONN_IND needs to be 1901 * sent. As an optimization, the caller can call this directly 1902 * if listener's perimeter is same as eager's. 1903 */ 1904 /* ARGSUSED */ 1905 void 1906 tcp_send_conn_ind(void *arg, mblk_t *mp, void *arg2) 1907 { 1908 conn_t *lconnp = (conn_t *)arg; 1909 tcp_t *listener = lconnp->conn_tcp; 1910 tcp_t *tcp; 1911 struct T_conn_ind *conn_ind; 1912 ipaddr_t *addr_cache; 1913 boolean_t need_send_conn_ind = B_FALSE; 1914 tcp_stack_t *tcps = listener->tcp_tcps; 1915 1916 /* retrieve the eager */ 1917 conn_ind = (struct T_conn_ind *)mp->b_rptr; 1918 ASSERT(conn_ind->OPT_offset != 0 && 1919 conn_ind->OPT_length == sizeof (intptr_t)); 1920 bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp, 1921 conn_ind->OPT_length); 1922 1923 /* 1924 * TLI/XTI applications will get confused by 1925 * sending eager as an option since it violates 1926 * the option semantics. So remove the eager as 1927 * option since TLI/XTI app doesn't need it anyway. 1928 */ 1929 if (!TCP_IS_SOCKET(listener)) { 1930 conn_ind->OPT_length = 0; 1931 conn_ind->OPT_offset = 0; 1932 } 1933 if (listener->tcp_state != TCPS_LISTEN) { 1934 /* 1935 * If listener has closed, it would have caused a 1936 * a cleanup/blowoff to happen for the eager. We 1937 * just need to return. 1938 */ 1939 freemsg(mp); 1940 return; 1941 } 1942 1943 1944 /* 1945 * if the conn_req_q is full defer passing up the 1946 * T_CONN_IND until space is availabe after t_accept() 1947 * processing 1948 */ 1949 mutex_enter(&listener->tcp_eager_lock); 1950 1951 /* 1952 * Take the eager out, if it is in the list of droppable eagers 1953 * as we are here because the 3W handshake is over. 1954 */ 1955 MAKE_UNDROPPABLE(tcp); 1956 1957 if (listener->tcp_conn_req_cnt_q < listener->tcp_conn_req_max) { 1958 tcp_t *tail; 1959 1960 /* 1961 * The eager already has an extra ref put in tcp_input_data 1962 * so that it stays till accept comes back even though it 1963 * might get into TCPS_CLOSED as a result of a TH_RST etc. 1964 */ 1965 ASSERT(listener->tcp_conn_req_cnt_q0 > 0); 1966 listener->tcp_conn_req_cnt_q0--; 1967 listener->tcp_conn_req_cnt_q++; 1968 1969 /* Move from SYN_RCVD to ESTABLISHED list */ 1970 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 1971 tcp->tcp_eager_prev_q0; 1972 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 1973 tcp->tcp_eager_next_q0; 1974 tcp->tcp_eager_prev_q0 = NULL; 1975 tcp->tcp_eager_next_q0 = NULL; 1976 1977 /* 1978 * Insert at end of the queue because sockfs 1979 * sends down T_CONN_RES in chronological 1980 * order. Leaving the older conn indications 1981 * at front of the queue helps reducing search 1982 * time. 1983 */ 1984 tail = listener->tcp_eager_last_q; 1985 if (tail != NULL) 1986 tail->tcp_eager_next_q = tcp; 1987 else 1988 listener->tcp_eager_next_q = tcp; 1989 listener->tcp_eager_last_q = tcp; 1990 tcp->tcp_eager_next_q = NULL; 1991 /* 1992 * Delay sending up the T_conn_ind until we are 1993 * done with the eager. Once we have have sent up 1994 * the T_conn_ind, the accept can potentially complete 1995 * any time and release the refhold we have on the eager. 1996 */ 1997 need_send_conn_ind = B_TRUE; 1998 } else { 1999 /* 2000 * Defer connection on q0 and set deferred 2001 * connection bit true 2002 */ 2003 tcp->tcp_conn_def_q0 = B_TRUE; 2004 2005 /* take tcp out of q0 ... */ 2006 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 2007 tcp->tcp_eager_next_q0; 2008 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 2009 tcp->tcp_eager_prev_q0; 2010 2011 /* ... and place it at the end of q0 */ 2012 tcp->tcp_eager_prev_q0 = listener->tcp_eager_prev_q0; 2013 tcp->tcp_eager_next_q0 = listener; 2014 listener->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp; 2015 listener->tcp_eager_prev_q0 = tcp; 2016 tcp->tcp_conn.tcp_eager_conn_ind = mp; 2017 } 2018 2019 /* we have timed out before */ 2020 if (tcp->tcp_syn_rcvd_timeout != 0) { 2021 tcp->tcp_syn_rcvd_timeout = 0; 2022 listener->tcp_syn_rcvd_timeout--; 2023 if (listener->tcp_syn_defense && 2024 listener->tcp_syn_rcvd_timeout <= 2025 (tcps->tcps_conn_req_max_q0 >> 5) && 2026 10*MINUTES < TICK_TO_MSEC(ddi_get_lbolt64() - 2027 listener->tcp_last_rcv_lbolt)) { 2028 /* 2029 * Turn off the defense mode if we 2030 * believe the SYN attack is over. 2031 */ 2032 listener->tcp_syn_defense = B_FALSE; 2033 if (listener->tcp_ip_addr_cache) { 2034 kmem_free((void *)listener->tcp_ip_addr_cache, 2035 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t)); 2036 listener->tcp_ip_addr_cache = NULL; 2037 } 2038 } 2039 } 2040 addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache); 2041 if (addr_cache != NULL) { 2042 /* 2043 * We have finished a 3-way handshake with this 2044 * remote host. This proves the IP addr is good. 2045 * Cache it! 2046 */ 2047 addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] = 2048 tcp->tcp_connp->conn_faddr_v4; 2049 } 2050 mutex_exit(&listener->tcp_eager_lock); 2051 if (need_send_conn_ind) 2052 putnext(lconnp->conn_rq, mp); 2053 } 2054