1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
/* This file contains all TCP TLI/TPI related functions. */
27
28 #include <sys/types.h>
29 #include <sys/stream.h>
30 #include <sys/strsun.h>
31 #include <sys/strsubr.h>
32 #include <sys/stropts.h>
33 #include <sys/strlog.h>
34 #define _SUN_TPI_VERSION 2
35 #include <sys/tihdr.h>
36 #include <sys/suntpi.h>
37 #include <sys/xti_inet.h>
38 #include <sys/squeue_impl.h>
39 #include <sys/squeue.h>
40
41 #include <inet/common.h>
42 #include <inet/ip.h>
43 #include <inet/tcp.h>
44 #include <inet/tcp_impl.h>
45 #include <inet/proto_set.h>
46
47 static void tcp_accept_swap(tcp_t *, tcp_t *, tcp_t *);
48 static int tcp_conprim_opt_process(tcp_t *, mblk_t *, int *, int *, int *);
49
50 void
tcp_use_pure_tpi(tcp_t * tcp)51 tcp_use_pure_tpi(tcp_t *tcp)
52 {
53 conn_t *connp = tcp->tcp_connp;
54
55 #ifdef _ILP32
56 tcp->tcp_acceptor_id = (t_uscalar_t)connp->conn_rq;
57 #else
58 tcp->tcp_acceptor_id = connp->conn_dev;
59 #endif
60 /*
61 * Insert this socket into the acceptor hash.
62 * We might need it for T_CONN_RES message
63 */
64 tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp);
65
66 tcp->tcp_issocket = B_FALSE;
67 TCP_STAT(tcp->tcp_tcps, tcp_sock_fallback);
68 }
69
70 /* Shorthand to generate and send TPI error acks to our client */
71 void
tcp_err_ack(tcp_t * tcp,mblk_t * mp,int t_error,int sys_error)72 tcp_err_ack(tcp_t *tcp, mblk_t *mp, int t_error, int sys_error)
73 {
74 if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
75 putnext(tcp->tcp_connp->conn_rq, mp);
76 }
77
78 /* Shorthand to generate and send TPI error acks to our client */
79 void
tcp_err_ack_prim(tcp_t * tcp,mblk_t * mp,int primitive,int t_error,int sys_error)80 tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive,
81 int t_error, int sys_error)
82 {
83 struct T_error_ack *teackp;
84
85 if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack),
86 M_PCPROTO, T_ERROR_ACK)) != NULL) {
87 teackp = (struct T_error_ack *)mp->b_rptr;
88 teackp->ERROR_prim = primitive;
89 teackp->TLI_error = t_error;
90 teackp->UNIX_error = sys_error;
91 putnext(tcp->tcp_connp->conn_rq, mp);
92 }
93 }
94
95 /*
96 * TCP routine to get the values of options.
97 */
98 int
tcp_tpi_opt_get(queue_t * q,int level,int name,uchar_t * ptr)99 tcp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
100 {
101 return (tcp_opt_get(Q_TO_CONN(q), level, name, ptr));
102 }
103
104 /* ARGSUSED */
105 int
tcp_tpi_opt_set(queue_t * q,uint_t optset_context,int level,int name,uint_t inlen,uchar_t * invalp,uint_t * outlenp,uchar_t * outvalp,void * thisdg_attrs,cred_t * cr)106 tcp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name,
107 uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
108 void *thisdg_attrs, cred_t *cr)
109 {
110 conn_t *connp = Q_TO_CONN(q);
111
112 return (tcp_opt_set(connp, optset_context, level, name, inlen, invalp,
113 outlenp, outvalp, thisdg_attrs, cr));
114 }
115
116 static int
tcp_conprim_opt_process(tcp_t * tcp,mblk_t * mp,int * do_disconnectp,int * t_errorp,int * sys_errorp)117 tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp, int *do_disconnectp,
118 int *t_errorp, int *sys_errorp)
119 {
120 int error;
121 int is_absreq_failure;
122 t_scalar_t *opt_lenp;
123 t_scalar_t opt_offset;
124 int prim_type;
125 struct T_conn_req *tcreqp;
126 struct T_conn_res *tcresp;
127 cred_t *cr;
128
129 /*
130 * All Solaris components should pass a db_credp
131 * for this TPI message, hence we ASSERT.
132 * But in case there is some other M_PROTO that looks
133 * like a TPI message sent by some other kernel
134 * component, we check and return an error.
135 */
136 cr = msg_getcred(mp, NULL);
137 ASSERT(cr != NULL);
138 if (cr == NULL)
139 return (-1);
140
141 prim_type = ((union T_primitives *)mp->b_rptr)->type;
142 ASSERT(prim_type == T_CONN_REQ || prim_type == O_T_CONN_RES ||
143 prim_type == T_CONN_RES);
144
145 switch (prim_type) {
146 case T_CONN_REQ:
147 tcreqp = (struct T_conn_req *)mp->b_rptr;
148 opt_offset = tcreqp->OPT_offset;
149 opt_lenp = (t_scalar_t *)&tcreqp->OPT_length;
150 break;
151 case O_T_CONN_RES:
152 case T_CONN_RES:
153 tcresp = (struct T_conn_res *)mp->b_rptr;
154 opt_offset = tcresp->OPT_offset;
155 opt_lenp = (t_scalar_t *)&tcresp->OPT_length;
156 break;
157 default:
158 opt_lenp = 0;
159 opt_offset = 0;
160 break;
161 }
162
163 *t_errorp = 0;
164 *sys_errorp = 0;
165 *do_disconnectp = 0;
166
167 error = tpi_optcom_buf(tcp->tcp_connp->conn_wq, mp, opt_lenp,
168 opt_offset, cr, &tcp_opt_obj,
169 NULL, &is_absreq_failure);
170
171 switch (error) {
172 case 0: /* no error */
173 ASSERT(is_absreq_failure == 0);
174 return (0);
175 case ENOPROTOOPT:
176 *t_errorp = TBADOPT;
177 break;
178 case EACCES:
179 *t_errorp = TACCES;
180 break;
181 default:
182 *t_errorp = TSYSERR; *sys_errorp = error;
183 break;
184 }
185 if (is_absreq_failure != 0) {
186 /*
187 * The connection request should get the local ack
188 * T_OK_ACK and then a T_DISCON_IND.
189 */
190 *do_disconnectp = 1;
191 }
192 return (-1);
193 }
194
195 void
tcp_tpi_bind(tcp_t * tcp,mblk_t * mp)196 tcp_tpi_bind(tcp_t *tcp, mblk_t *mp)
197 {
198 int error;
199 conn_t *connp = tcp->tcp_connp;
200 struct sockaddr *sa;
201 mblk_t *mp1;
202 struct T_bind_req *tbr;
203 int backlog;
204 socklen_t len;
205 sin_t *sin;
206 sin6_t *sin6;
207 cred_t *cr;
208
209 /*
210 * All Solaris components should pass a db_credp
211 * for this TPI message, hence we ASSERT.
212 * But in case there is some other M_PROTO that looks
213 * like a TPI message sent by some other kernel
214 * component, we check and return an error.
215 */
216 cr = msg_getcred(mp, NULL);
217 ASSERT(cr != NULL);
218 if (cr == NULL) {
219 tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
220 return;
221 }
222
223 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
224 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
225 if (connp->conn_debug) {
226 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
227 "tcp_tpi_bind: bad req, len %u",
228 (uint_t)(mp->b_wptr - mp->b_rptr));
229 }
230 tcp_err_ack(tcp, mp, TPROTO, 0);
231 return;
232 }
233 /* Make sure the largest address fits */
234 mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1);
235 if (mp1 == NULL) {
236 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
237 return;
238 }
239 mp = mp1;
240 tbr = (struct T_bind_req *)mp->b_rptr;
241
242 backlog = tbr->CONIND_number;
243 len = tbr->ADDR_length;
244
245 switch (len) {
246 case 0: /* request for a generic port */
247 tbr->ADDR_offset = sizeof (struct T_bind_req);
248 if (connp->conn_family == AF_INET) {
249 tbr->ADDR_length = sizeof (sin_t);
250 sin = (sin_t *)&tbr[1];
251 *sin = sin_null;
252 sin->sin_family = AF_INET;
253 sa = (struct sockaddr *)sin;
254 len = sizeof (sin_t);
255 mp->b_wptr = (uchar_t *)&sin[1];
256 } else {
257 ASSERT(connp->conn_family == AF_INET6);
258 tbr->ADDR_length = sizeof (sin6_t);
259 sin6 = (sin6_t *)&tbr[1];
260 *sin6 = sin6_null;
261 sin6->sin6_family = AF_INET6;
262 sa = (struct sockaddr *)sin6;
263 len = sizeof (sin6_t);
264 mp->b_wptr = (uchar_t *)&sin6[1];
265 }
266 break;
267
268 case sizeof (sin_t): /* Complete IPv4 address */
269 sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset,
270 sizeof (sin_t));
271 break;
272
273 case sizeof (sin6_t): /* Complete IPv6 address */
274 sa = (struct sockaddr *)mi_offset_param(mp,
275 tbr->ADDR_offset, sizeof (sin6_t));
276 break;
277
278 default:
279 if (connp->conn_debug) {
280 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
281 "tcp_tpi_bind: bad address length, %d",
282 tbr->ADDR_length);
283 }
284 tcp_err_ack(tcp, mp, TBADADDR, 0);
285 return;
286 }
287
288 if (backlog > 0) {
289 error = tcp_do_listen(connp, sa, len, backlog, DB_CRED(mp),
290 tbr->PRIM_type != O_T_BIND_REQ);
291 } else {
292 error = tcp_do_bind(connp, sa, len, DB_CRED(mp),
293 tbr->PRIM_type != O_T_BIND_REQ);
294 }
295 done:
296 if (error > 0) {
297 tcp_err_ack(tcp, mp, TSYSERR, error);
298 } else if (error < 0) {
299 tcp_err_ack(tcp, mp, -error, 0);
300 } else {
301 /*
302 * Update port information as sockfs/tpi needs it for checking
303 */
304 if (connp->conn_family == AF_INET) {
305 sin = (sin_t *)sa;
306 sin->sin_port = connp->conn_lport;
307 } else {
308 sin6 = (sin6_t *)sa;
309 sin6->sin6_port = connp->conn_lport;
310 }
311 mp->b_datap->db_type = M_PCPROTO;
312 tbr->PRIM_type = T_BIND_ACK;
313 putnext(connp->conn_rq, mp);
314 }
315 }
316
317 /* tcp_unbind is called by tcp_wput_proto to handle T_UNBIND_REQ messages. */
318 void
tcp_tpi_unbind(tcp_t * tcp,mblk_t * mp)319 tcp_tpi_unbind(tcp_t *tcp, mblk_t *mp)
320 {
321 conn_t *connp = tcp->tcp_connp;
322 int error;
323
324 error = tcp_do_unbind(connp);
325 if (error > 0) {
326 tcp_err_ack(tcp, mp, TSYSERR, error);
327 } else if (error < 0) {
328 tcp_err_ack(tcp, mp, -error, 0);
329 } else {
330 /* Send M_FLUSH according to TPI */
331 (void) putnextctl1(connp->conn_rq, M_FLUSH, FLUSHRW);
332
333 mp = mi_tpi_ok_ack_alloc(mp);
334 if (mp != NULL)
335 putnext(connp->conn_rq, mp);
336 }
337 }
338
339 /* ARGSUSED */
340 int
tcp_tpi_close(queue_t * q,int flags,cred_t * credp __unused)341 tcp_tpi_close(queue_t *q, int flags, cred_t *credp __unused)
342 {
343 conn_t *connp;
344
345 ASSERT(WR(q)->q_next == NULL);
346
347 if (flags & SO_FALLBACK) {
348 /*
349 * stream is being closed while in fallback
350 * simply free the resources that were allocated
351 */
352 inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr));
353 qprocsoff(q);
354 goto done;
355 }
356
357 connp = Q_TO_CONN(q);
358 /*
359 * We are being closed as /dev/tcp or /dev/tcp6.
360 */
361 tcp_close_common(connp, flags);
362
363 qprocsoff(q);
364 inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
365
366 /*
367 * Drop IP's reference on the conn. This is the last reference
368 * on the connp if the state was less than established. If the
369 * connection has gone into timewait state, then we will have
370 * one ref for the TCP and one more ref (total of two) for the
371 * classifier connected hash list (a timewait connections stays
372 * in connected hash till closed).
373 *
374 * We can't assert the references because there might be other
375 * transient reference places because of some walkers or queued
376 * packets in squeue for the timewait state.
377 */
378 CONN_DEC_REF(connp);
379 done:
380 q->q_ptr = WR(q)->q_ptr = NULL;
381 return (0);
382 }
383
384 /* ARGSUSED */
385 int
tcp_tpi_close_accept(queue_t * q,int flags __unused,cred_t * credp __unused)386 tcp_tpi_close_accept(queue_t *q, int flags __unused, cred_t *credp __unused)
387 {
388 vmem_t *minor_arena;
389 dev_t conn_dev;
390 extern struct qinit tcp_acceptor_winit;
391
392 ASSERT(WR(q)->q_qinfo == &tcp_acceptor_winit);
393
394 /*
395 * We had opened an acceptor STREAM for sockfs which is
396 * now being closed due to some error.
397 */
398 qprocsoff(q);
399
400 minor_arena = (vmem_t *)WR(q)->q_ptr;
401 conn_dev = (dev_t)RD(q)->q_ptr;
402 ASSERT(minor_arena != NULL);
403 ASSERT(conn_dev != 0);
404 inet_minor_free(minor_arena, conn_dev);
405 q->q_ptr = WR(q)->q_ptr = NULL;
406 return (0);
407 }
408
409 /*
410 * Put a connection confirmation message upstream built from the
411 * address/flowid information with the conn and iph. Report our success or
412 * failure.
413 */
414 boolean_t
tcp_conn_con(tcp_t * tcp,uchar_t * iphdr,mblk_t * idmp,mblk_t ** defermp,ip_recv_attr_t * ira)415 tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, mblk_t *idmp,
416 mblk_t **defermp, ip_recv_attr_t *ira)
417 {
418 sin_t sin;
419 sin6_t sin6;
420 mblk_t *mp;
421 char *optp = NULL;
422 int optlen = 0;
423 conn_t *connp = tcp->tcp_connp;
424
425 if (defermp != NULL)
426 *defermp = NULL;
427
428 if (tcp->tcp_conn.tcp_opts_conn_req != NULL) {
429 /*
430 * Return in T_CONN_CON results of option negotiation through
431 * the T_CONN_REQ. Note: If there is an real end-to-end option
432 * negotiation, then what is received from remote end needs
433 * to be taken into account but there is no such thing (yet?)
434 * in our TCP/IP.
435 * Note: We do not use mi_offset_param() here as
436 * tcp_opts_conn_req contents do not directly come from
437 * an application and are either generated in kernel or
438 * from user input that was already verified.
439 */
440 mp = tcp->tcp_conn.tcp_opts_conn_req;
441 optp = (char *)(mp->b_rptr +
442 ((struct T_conn_req *)mp->b_rptr)->OPT_offset);
443 optlen = (int)
444 ((struct T_conn_req *)mp->b_rptr)->OPT_length;
445 }
446
447 if (IPH_HDR_VERSION(iphdr) == IPV4_VERSION) {
448
449 /* packet is IPv4 */
450 if (connp->conn_family == AF_INET) {
451 sin = sin_null;
452 sin.sin_addr.s_addr = connp->conn_faddr_v4;
453 sin.sin_port = connp->conn_fport;
454 sin.sin_family = AF_INET;
455 mp = mi_tpi_conn_con(NULL, (char *)&sin,
456 (int)sizeof (sin_t), optp, optlen);
457 } else {
458 sin6 = sin6_null;
459 sin6.sin6_addr = connp->conn_faddr_v6;
460 sin6.sin6_port = connp->conn_fport;
461 sin6.sin6_family = AF_INET6;
462 mp = mi_tpi_conn_con(NULL, (char *)&sin6,
463 (int)sizeof (sin6_t), optp, optlen);
464
465 }
466 } else {
467 ip6_t *ip6h = (ip6_t *)iphdr;
468
469 ASSERT(IPH_HDR_VERSION(iphdr) == IPV6_VERSION);
470 ASSERT(connp->conn_family == AF_INET6);
471 sin6 = sin6_null;
472 sin6.sin6_addr = connp->conn_faddr_v6;
473 sin6.sin6_port = connp->conn_fport;
474 sin6.sin6_family = AF_INET6;
475 sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
476 mp = mi_tpi_conn_con(NULL, (char *)&sin6,
477 (int)sizeof (sin6_t), optp, optlen);
478 }
479
480 if (!mp)
481 return (B_FALSE);
482
483 mblk_copycred(mp, idmp);
484
485 if (defermp == NULL) {
486 conn_t *connp = tcp->tcp_connp;
487 if (IPCL_IS_NONSTR(connp)) {
488 (*connp->conn_upcalls->su_connected)
489 (connp->conn_upper_handle, tcp->tcp_connid,
490 ira->ira_cred, ira->ira_cpid);
491 freemsg(mp);
492 } else {
493 if (ira->ira_cred != NULL) {
494 /* So that getpeerucred works for TPI sockfs */
495 mblk_setcred(mp, ira->ira_cred, ira->ira_cpid);
496 }
497 putnext(connp->conn_rq, mp);
498 }
499 } else {
500 *defermp = mp;
501 }
502
503 if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
504 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
505 return (B_TRUE);
506 }
507
508 /*
509 * Successful connect request processing begins when our client passes
510 * a T_CONN_REQ message into tcp_wput(), which performs function calls into
511 * IP and the passes a T_OK_ACK (or T_ERROR_ACK upstream).
512 *
513 * After various error checks are completed, tcp_tpi_connect() lays
514 * the target address and port into the composite header template.
515 * Then we ask IP for information, including a source address if we didn't
516 * already have one. Finally we prepare to send the SYN packet, and then
517 * send up the T_OK_ACK reply message.
518 */
519 void
tcp_tpi_connect(tcp_t * tcp,mblk_t * mp)520 tcp_tpi_connect(tcp_t *tcp, mblk_t *mp)
521 {
522 sin_t *sin;
523 struct T_conn_req *tcr;
524 struct sockaddr *sa;
525 socklen_t len;
526 int error;
527 cred_t *cr;
528 pid_t cpid;
529 conn_t *connp = tcp->tcp_connp;
530 queue_t *q = connp->conn_wq;
531
532 /*
533 * All Solaris components should pass a db_credp
534 * for this TPI message, hence we ASSERT.
535 * But in case there is some other M_PROTO that looks
536 * like a TPI message sent by some other kernel
537 * component, we check and return an error.
538 */
539 cr = msg_getcred(mp, &cpid);
540 ASSERT(cr != NULL);
541 if (cr == NULL) {
542 tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
543 return;
544 }
545
546 tcr = (struct T_conn_req *)mp->b_rptr;
547
548 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
549 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) {
550 tcp_err_ack(tcp, mp, TPROTO, 0);
551 return;
552 }
553
554 /*
555 * Pre-allocate the T_ordrel_ind mblk so that at close time, we
556 * will always have that to send up. Otherwise, we need to do
557 * special handling in case the allocation fails at that time.
558 * If the end point is TPI, the tcp_t can be reused and the
559 * tcp_ordrel_mp may be allocated already.
560 */
561 if (tcp->tcp_ordrel_mp == NULL) {
562 if ((tcp->tcp_ordrel_mp = mi_tpi_ordrel_ind()) == NULL) {
563 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
564 return;
565 }
566 }
567
568 /*
569 * Determine packet type based on type of address passed in
570 * the request should contain an IPv4 or IPv6 address.
571 * Make sure that address family matches the type of
572 * family of the address passed down.
573 */
574 switch (tcr->DEST_length) {
575 default:
576 tcp_err_ack(tcp, mp, TBADADDR, 0);
577 return;
578
579 case (sizeof (sin_t) - sizeof (sin->sin_zero)): {
580 /*
581 * XXX: The check for valid DEST_length was not there
582 * in earlier releases and some buggy
583 * TLI apps (e.g Sybase) got away with not feeding
584 * in sin_zero part of address.
585 * We allow that bug to keep those buggy apps humming.
586 * Test suites require the check on DEST_length.
587 * We construct a new mblk with valid DEST_length
588 * free the original so the rest of the code does
589 * not have to keep track of this special shorter
590 * length address case.
591 */
592 mblk_t *nmp;
593 struct T_conn_req *ntcr;
594 sin_t *nsin;
595
596 nmp = allocb(sizeof (struct T_conn_req) + sizeof (sin_t) +
597 tcr->OPT_length, BPRI_HI);
598 if (nmp == NULL) {
599 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
600 return;
601 }
602 ntcr = (struct T_conn_req *)nmp->b_rptr;
603 bzero(ntcr, sizeof (struct T_conn_req)); /* zero fill */
604 ntcr->PRIM_type = T_CONN_REQ;
605 ntcr->DEST_length = sizeof (sin_t);
606 ntcr->DEST_offset = sizeof (struct T_conn_req);
607
608 nsin = (sin_t *)((uchar_t *)ntcr + ntcr->DEST_offset);
609 *nsin = sin_null;
610 /* Get pointer to shorter address to copy from original mp */
611 sin = (sin_t *)mi_offset_param(mp, tcr->DEST_offset,
612 tcr->DEST_length); /* extract DEST_length worth of sin_t */
613 if (sin == NULL || !OK_32PTR((char *)sin)) {
614 freemsg(nmp);
615 tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
616 return;
617 }
618 nsin->sin_family = sin->sin_family;
619 nsin->sin_port = sin->sin_port;
620 nsin->sin_addr = sin->sin_addr;
621 /* Note:nsin->sin_zero zero-fill with sin_null assign above */
622 nmp->b_wptr = (uchar_t *)&nsin[1];
623 if (tcr->OPT_length != 0) {
624 ntcr->OPT_length = tcr->OPT_length;
625 ntcr->OPT_offset = nmp->b_wptr - nmp->b_rptr;
626 bcopy((uchar_t *)tcr + tcr->OPT_offset,
627 (uchar_t *)ntcr + ntcr->OPT_offset,
628 tcr->OPT_length);
629 nmp->b_wptr += tcr->OPT_length;
630 }
631 freemsg(mp); /* original mp freed */
632 mp = nmp; /* re-initialize original variables */
633 tcr = ntcr;
634 }
635 /* FALLTHRU */
636
637 case sizeof (sin_t):
638 sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
639 sizeof (sin_t));
640 len = sizeof (sin_t);
641 break;
642
643 case sizeof (sin6_t):
644 sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
645 sizeof (sin6_t));
646 len = sizeof (sin6_t);
647 break;
648 }
649
650 error = proto_verify_ip_addr(connp->conn_family, sa, len);
651 if (error != 0) {
652 tcp_err_ack(tcp, mp, TSYSERR, error);
653 return;
654 }
655
656 /*
657 * TODO: If someone in TCPS_TIME_WAIT has this dst/port we
658 * should key on their sequence number and cut them loose.
659 */
660
661 /*
662 * If options passed in, feed it for verification and handling
663 */
664 if (tcr->OPT_length != 0) {
665 mblk_t *ok_mp;
666 mblk_t *discon_mp;
667 mblk_t *conn_opts_mp;
668 int t_error, sys_error, do_disconnect;
669
670 conn_opts_mp = NULL;
671
672 if (tcp_conprim_opt_process(tcp, mp,
673 &do_disconnect, &t_error, &sys_error) < 0) {
674 if (do_disconnect) {
675 ASSERT(t_error == 0 && sys_error == 0);
676 discon_mp = mi_tpi_discon_ind(NULL,
677 ECONNREFUSED, 0);
678 if (!discon_mp) {
679 tcp_err_ack_prim(tcp, mp, T_CONN_REQ,
680 TSYSERR, ENOMEM);
681 return;
682 }
683 ok_mp = mi_tpi_ok_ack_alloc(mp);
684 if (!ok_mp) {
685 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
686 TSYSERR, ENOMEM);
687 return;
688 }
689 qreply(q, ok_mp);
690 qreply(q, discon_mp); /* no flush! */
691 } else {
692 ASSERT(t_error != 0);
693 tcp_err_ack_prim(tcp, mp, T_CONN_REQ, t_error,
694 sys_error);
695 }
696 return;
697 }
698 /*
699 * Success in setting options, the mp option buffer represented
700 * by OPT_length/offset has been potentially modified and
701 * contains results of option processing. We copy it in
702 * another mp to save it for potentially influencing returning
703 * it in T_CONN_CONN.
704 */
705 if (tcr->OPT_length != 0) { /* there are resulting options */
706 conn_opts_mp = copyb(mp);
707 if (!conn_opts_mp) {
708 tcp_err_ack_prim(tcp, mp, T_CONN_REQ,
709 TSYSERR, ENOMEM);
710 return;
711 }
712 ASSERT(tcp->tcp_conn.tcp_opts_conn_req == NULL);
713 tcp->tcp_conn.tcp_opts_conn_req = conn_opts_mp;
714 /*
715 * Note:
716 * These resulting option negotiation can include any
717 * end-to-end negotiation options but there no such
718 * thing (yet?) in our TCP/IP.
719 */
720 }
721 }
722
723 /* call the non-TPI version */
724 error = tcp_do_connect(tcp->tcp_connp, sa, len, cr, cpid);
725 if (error < 0) {
726 mp = mi_tpi_err_ack_alloc(mp, -error, 0);
727 } else if (error > 0) {
728 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, error);
729 } else {
730 mp = mi_tpi_ok_ack_alloc(mp);
731 }
732
733 /*
734 * Note: Code below is the "failure" case
735 */
736 /* return error ack and blow away saved option results if any */
737 connect_failed:
738 if (mp != NULL)
739 putnext(connp->conn_rq, mp);
740 else {
741 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
742 TSYSERR, ENOMEM);
743 }
744 }
745
746 /* Return the TPI/TLI equivalent of our current tcp_state */
747 static int
tcp_tpistate(tcp_t * tcp)748 tcp_tpistate(tcp_t *tcp)
749 {
750 switch (tcp->tcp_state) {
751 case TCPS_IDLE:
752 return (TS_UNBND);
753 case TCPS_LISTEN:
754 /*
755 * Return whether there are outstanding T_CONN_IND waiting
756 * for the matching T_CONN_RES. Therefore don't count q0.
757 */
758 if (tcp->tcp_conn_req_cnt_q > 0)
759 return (TS_WRES_CIND);
760 else
761 return (TS_IDLE);
762 case TCPS_BOUND:
763 return (TS_IDLE);
764 case TCPS_SYN_SENT:
765 return (TS_WCON_CREQ);
766 case TCPS_SYN_RCVD:
767 /*
768 * Note: assumption: this has to the active open SYN_RCVD.
769 * The passive instance is detached in SYN_RCVD stage of
770 * incoming connection processing so we cannot get request
771 * for T_info_ack on it.
772 */
773 return (TS_WACK_CRES);
774 case TCPS_ESTABLISHED:
775 return (TS_DATA_XFER);
776 case TCPS_CLOSE_WAIT:
777 return (TS_WREQ_ORDREL);
778 case TCPS_FIN_WAIT_1:
779 return (TS_WIND_ORDREL);
780 case TCPS_FIN_WAIT_2:
781 return (TS_WIND_ORDREL);
782
783 case TCPS_CLOSING:
784 case TCPS_LAST_ACK:
785 case TCPS_TIME_WAIT:
786 case TCPS_CLOSED:
787 /*
788 * Following TS_WACK_DREQ7 is a rendition of "not
789 * yet TS_IDLE" TPI state. There is no best match to any
790 * TPI state for TCPS_{CLOSING, LAST_ACK, TIME_WAIT} but we
791 * choose a value chosen that will map to TLI/XTI level
792 * state of TSTATECHNG (state is process of changing) which
793 * captures what this dummy state represents.
794 */
795 return (TS_WACK_DREQ7);
796 default:
797 cmn_err(CE_WARN, "tcp_tpistate: strange state (%d) %s",
798 tcp->tcp_state, tcp_display(tcp, NULL,
799 DISP_PORT_ONLY));
800 return (TS_UNBND);
801 }
802 }
803
804 static void
tcp_copy_info(struct T_info_ack * tia,tcp_t * tcp)805 tcp_copy_info(struct T_info_ack *tia, tcp_t *tcp)
806 {
807 tcp_stack_t *tcps = tcp->tcp_tcps;
808 conn_t *connp = tcp->tcp_connp;
809 extern struct T_info_ack tcp_g_t_info_ack;
810 extern struct T_info_ack tcp_g_t_info_ack_v6;
811
812 if (connp->conn_family == AF_INET6)
813 *tia = tcp_g_t_info_ack_v6;
814 else
815 *tia = tcp_g_t_info_ack;
816 tia->CURRENT_state = tcp_tpistate(tcp);
817 tia->OPT_size = tcp_max_optsize;
818 if (tcp->tcp_mss == 0) {
819 /* Not yet set - tcp_open does not set mss */
820 if (connp->conn_ipversion == IPV4_VERSION)
821 tia->TIDU_size = tcps->tcps_mss_def_ipv4;
822 else
823 tia->TIDU_size = tcps->tcps_mss_def_ipv6;
824 } else {
825 tia->TIDU_size = tcp->tcp_mss;
826 }
827 /* TODO: Default ETSDU is 1. Is that correct for tcp? */
828 }
829
830 void
tcp_do_capability_ack(tcp_t * tcp,struct T_capability_ack * tcap,t_uscalar_t cap_bits1)831 tcp_do_capability_ack(tcp_t *tcp, struct T_capability_ack *tcap,
832 t_uscalar_t cap_bits1)
833 {
834 tcap->CAP_bits1 = 0;
835
836 if (cap_bits1 & TC1_INFO) {
837 tcp_copy_info(&tcap->INFO_ack, tcp);
838 tcap->CAP_bits1 |= TC1_INFO;
839 }
840
841 if (cap_bits1 & TC1_ACCEPTOR_ID) {
842 tcap->ACCEPTOR_id = tcp->tcp_acceptor_id;
843 tcap->CAP_bits1 |= TC1_ACCEPTOR_ID;
844 }
845
846 }
847
848 /*
849 * This routine responds to T_CAPABILITY_REQ messages. It is called by
850 * tcp_wput. Much of the T_CAPABILITY_ACK information is copied from
851 * tcp_g_t_info_ack. The current state of the stream is copied from
852 * tcp_state.
853 */
854 void
tcp_capability_req(tcp_t * tcp,mblk_t * mp)855 tcp_capability_req(tcp_t *tcp, mblk_t *mp)
856 {
857 t_uscalar_t cap_bits1;
858 struct T_capability_ack *tcap;
859
860 if (MBLKL(mp) < sizeof (struct T_capability_req)) {
861 freemsg(mp);
862 return;
863 }
864
865 cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1;
866
867 mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
868 mp->b_datap->db_type, T_CAPABILITY_ACK);
869 if (mp == NULL)
870 return;
871
872 tcap = (struct T_capability_ack *)mp->b_rptr;
873 tcp_do_capability_ack(tcp, tcap, cap_bits1);
874
875 putnext(tcp->tcp_connp->conn_rq, mp);
876 }
877
878 /*
879 * This routine responds to T_INFO_REQ messages. It is called by tcp_wput.
880 * Most of the T_INFO_ACK information is copied from tcp_g_t_info_ack.
881 * The current state of the stream is copied from tcp_state.
882 */
883 void
tcp_info_req(tcp_t * tcp,mblk_t * mp)884 tcp_info_req(tcp_t *tcp, mblk_t *mp)
885 {
886 mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO,
887 T_INFO_ACK);
888 if (!mp) {
889 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
890 return;
891 }
892 tcp_copy_info((struct T_info_ack *)mp->b_rptr, tcp);
893 putnext(tcp->tcp_connp->conn_rq, mp);
894 }
895
896 /* Respond to the TPI addr request */
897 void
tcp_addr_req(tcp_t * tcp,mblk_t * mp)898 tcp_addr_req(tcp_t *tcp, mblk_t *mp)
899 {
900 struct sockaddr *sa;
901 mblk_t *ackmp;
902 struct T_addr_ack *taa;
903 conn_t *connp = tcp->tcp_connp;
904 uint_t addrlen;
905
906 /* Make it large enough for worst case */
907 ackmp = reallocb(mp, sizeof (struct T_addr_ack) +
908 2 * sizeof (sin6_t), 1);
909 if (ackmp == NULL) {
910 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
911 return;
912 }
913
914 taa = (struct T_addr_ack *)ackmp->b_rptr;
915
916 bzero(taa, sizeof (struct T_addr_ack));
917 ackmp->b_wptr = (uchar_t *)&taa[1];
918
919 taa->PRIM_type = T_ADDR_ACK;
920 ackmp->b_datap->db_type = M_PCPROTO;
921
922 if (connp->conn_family == AF_INET)
923 addrlen = sizeof (sin_t);
924 else
925 addrlen = sizeof (sin6_t);
926
927 /*
928 * Note: Following code assumes 32 bit alignment of basic
929 * data structures like sin_t and struct T_addr_ack.
930 */
931 if (tcp->tcp_state >= TCPS_BOUND) {
932 /*
933 * Fill in local address first
934 */
935 taa->LOCADDR_offset = sizeof (*taa);
936 taa->LOCADDR_length = addrlen;
937 sa = (struct sockaddr *)&taa[1];
938 (void) conn_getsockname(connp, sa, &addrlen);
939 ackmp->b_wptr += addrlen;
940 }
941 if (tcp->tcp_state >= TCPS_SYN_RCVD) {
942 /*
943 * Fill in Remote address
944 */
945 taa->REMADDR_length = addrlen;
946 /* assumed 32-bit alignment */
947 taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length;
948 sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset);
949 (void) conn_getpeername(connp, sa, &addrlen);
950 ackmp->b_wptr += addrlen;
951 }
952 ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim);
953 putnext(tcp->tcp_connp->conn_rq, ackmp);
954 }
955
956 /*
957 * Swap information between the eager and acceptor for a TLI/XTI client.
958 * The sockfs accept is done on the acceptor stream and control goes
959 * through tcp_tli_accept() and tcp_accept()/tcp_accept_swap() is not
960 * called. In either case, both the eager and listener are in their own
961 * perimeter (squeue) and the code has to deal with potential race.
962 *
963 * See the block comment on top of tcp_accept() and tcp_tli_accept().
964 */
965 static void
tcp_accept_swap(tcp_t * listener,tcp_t * acceptor,tcp_t * eager)966 tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager)
967 {
968 conn_t *econnp, *aconnp;
969
970 ASSERT(eager->tcp_connp->conn_rq == listener->tcp_connp->conn_rq);
971 ASSERT(eager->tcp_detached && !acceptor->tcp_detached);
972 ASSERT(!TCP_IS_SOCKET(acceptor));
973 ASSERT(!TCP_IS_SOCKET(eager));
974 ASSERT(!TCP_IS_SOCKET(listener));
975
976 /*
977 * Trusted Extensions may need to use a security label that is
978 * different from the acceptor's label on MLP and MAC-Exempt
979 * sockets. If this is the case, the required security label
980 * already exists in econnp->conn_ixa->ixa_tsl. Since we make the
981 * acceptor stream refer to econnp we atomatically get that label.
982 */
983
984 acceptor->tcp_detached = B_TRUE;
985 /*
986 * To permit stream re-use by TLI/XTI, the eager needs a copy of
987 * the acceptor id.
988 */
989 eager->tcp_acceptor_id = acceptor->tcp_acceptor_id;
990
991 /* remove eager from listen list... */
992 mutex_enter(&listener->tcp_eager_lock);
993 tcp_eager_unlink(eager);
994 ASSERT(eager->tcp_eager_next_q == NULL &&
995 eager->tcp_eager_last_q == NULL);
996 ASSERT(eager->tcp_eager_next_q0 == NULL &&
997 eager->tcp_eager_prev_q0 == NULL);
998 mutex_exit(&listener->tcp_eager_lock);
999
1000 econnp = eager->tcp_connp;
1001 aconnp = acceptor->tcp_connp;
1002 econnp->conn_rq = aconnp->conn_rq;
1003 econnp->conn_wq = aconnp->conn_wq;
1004 econnp->conn_rq->q_ptr = econnp;
1005 econnp->conn_wq->q_ptr = econnp;
1006
1007 /*
1008 * In the TLI/XTI loopback case, we are inside the listener's squeue,
1009 * which might be a different squeue from our peer TCP instance.
1010 * For TCP Fusion, the peer expects that whenever tcp_detached is
1011 * clear, our TCP queues point to the acceptor's queues. Thus, use
1012 * membar_producer() to ensure that the assignments of conn_rq/conn_wq
1013 * above reach global visibility prior to the clearing of tcp_detached.
1014 */
1015 membar_producer();
1016 eager->tcp_detached = B_FALSE;
1017
1018 ASSERT(eager->tcp_ack_tid == 0);
1019
1020 econnp->conn_dev = aconnp->conn_dev;
1021 econnp->conn_minor_arena = aconnp->conn_minor_arena;
1022
1023 ASSERT(econnp->conn_minor_arena != NULL);
1024 if (econnp->conn_cred != NULL)
1025 crfree(econnp->conn_cred);
1026 econnp->conn_cred = aconnp->conn_cred;
1027 ASSERT(!(econnp->conn_ixa->ixa_free_flags & IXA_FREE_CRED));
1028 econnp->conn_ixa->ixa_cred = econnp->conn_cred;
1029 aconnp->conn_cred = NULL;
1030 econnp->conn_cpid = aconnp->conn_cpid;
1031 ASSERT(econnp->conn_netstack == aconnp->conn_netstack);
1032 ASSERT(eager->tcp_tcps == acceptor->tcp_tcps);
1033
1034 econnp->conn_zoneid = aconnp->conn_zoneid;
1035 econnp->conn_allzones = aconnp->conn_allzones;
1036 econnp->conn_ixa->ixa_zoneid = aconnp->conn_ixa->ixa_zoneid;
1037
1038 econnp->conn_mac_mode = aconnp->conn_mac_mode;
1039 econnp->conn_zone_is_global = aconnp->conn_zone_is_global;
1040 aconnp->conn_mac_mode = CONN_MAC_DEFAULT;
1041
1042 /* Do the IPC initialization */
1043 CONN_INC_REF(econnp);
1044
1045 /* Done with old IPC. Drop its ref on its connp */
1046 CONN_DEC_REF(aconnp);
1047 }
1048
1049 /*
1050 * This runs at the tail end of accept processing on the squeue of the
1051 * new connection.
1052 */
1053 /* ARGSUSED */
1054 static void
tcp_accept_finish(void * arg,mblk_t * mp,void * arg2,ip_recv_attr_t * dummy)1055 tcp_accept_finish(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
1056 {
1057 conn_t *connp = (conn_t *)arg;
1058 tcp_t *tcp = connp->conn_tcp;
1059 queue_t *q = connp->conn_rq;
1060 tcp_stack_t *tcps = tcp->tcp_tcps;
1061 struct stroptions *stropt;
1062 struct sock_proto_props sopp;
1063
1064 /* Should never be called for non-STREAMS sockets */
1065 ASSERT(!IPCL_IS_NONSTR(connp));
1066
1067 /* We should just receive a single mblk that fits a T_discon_ind */
1068 ASSERT(mp->b_cont == NULL);
1069
1070 /*
1071 * Drop the eager's ref on the listener, that was placed when
1072 * this eager began life in tcp_input_listener.
1073 */
1074 CONN_DEC_REF(tcp->tcp_saved_listener->tcp_connp);
1075
1076 tcp->tcp_detached = B_FALSE;
1077
1078 if (tcp->tcp_state <= TCPS_BOUND || tcp->tcp_accept_error) {
1079 /*
1080 * Someone blewoff the eager before we could finish
1081 * the accept.
1082 *
1083 * The only reason eager exists it because we put in
1084 * a ref on it when conn ind went up. We need to send
1085 * a disconnect indication up while the last reference
1086 * on the eager will be dropped by the squeue when we
1087 * return.
1088 */
1089 ASSERT(tcp->tcp_listener == NULL);
1090 if (tcp->tcp_issocket || tcp->tcp_send_discon_ind) {
1091 struct T_discon_ind *tdi;
1092
1093 (void) putnextctl1(q, M_FLUSH, FLUSHRW);
1094 /*
1095 * Let us reuse the incoming mblk to avoid
1096 * memory allocation failure problems. We know
1097 * that the size of the incoming mblk i.e.
1098 * stroptions is greater than sizeof
1099 * T_discon_ind.
1100 */
1101 ASSERT(DB_REF(mp) == 1);
1102 ASSERT(MBLKSIZE(mp) >=
1103 sizeof (struct T_discon_ind));
1104
1105 DB_TYPE(mp) = M_PROTO;
1106 ((union T_primitives *)mp->b_rptr)->type =
1107 T_DISCON_IND;
1108 tdi = (struct T_discon_ind *)mp->b_rptr;
1109 if (tcp->tcp_issocket) {
1110 tdi->DISCON_reason = ECONNREFUSED;
1111 tdi->SEQ_number = 0;
1112 } else {
1113 tdi->DISCON_reason = ENOPROTOOPT;
1114 tdi->SEQ_number =
1115 tcp->tcp_conn_req_seqnum;
1116 }
1117 mp->b_wptr = mp->b_rptr +
1118 sizeof (struct T_discon_ind);
1119 putnext(q, mp);
1120 }
1121 tcp->tcp_hard_binding = B_FALSE;
1122 return;
1123 }
1124
1125 /*
1126 * This is the first time we run on the correct
1127 * queue after tcp_accept. So fix all the q parameters
1128 * here.
1129 *
1130 * Let us reuse the incoming mblk to avoid
1131 * memory allocation failure problems. We know
1132 * that the size of the incoming mblk is at least
1133 * stroptions
1134 */
1135 tcp_get_proto_props(tcp, &sopp);
1136
1137 ASSERT(DB_REF(mp) == 1);
1138 ASSERT(MBLKSIZE(mp) >= sizeof (struct stroptions));
1139
1140 DB_TYPE(mp) = M_SETOPTS;
1141 stropt = (struct stroptions *)mp->b_rptr;
1142 mp->b_wptr = mp->b_rptr + sizeof (struct stroptions);
1143 stropt = (struct stroptions *)mp->b_rptr;
1144 ASSERT(sopp.sopp_flags & (SO_HIWAT|SO_WROFF|SO_MAXBLK));
1145 stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK;
1146 stropt->so_hiwat = sopp.sopp_rxhiwat;
1147 stropt->so_wroff = sopp.sopp_wroff;
1148 stropt->so_maxblk = sopp.sopp_maxblk;
1149
1150 /* Send the options up */
1151 putnext(q, mp);
1152
1153 /*
1154 * Pass up any data and/or a fin that has been received.
1155 *
1156 * Adjust receive window in case it had decreased
1157 * (because there is data <=> tcp_rcv_list != NULL)
1158 * while the connection was detached. Note that
1159 * in case the eager was flow-controlled, w/o this
1160 * code, the rwnd may never open up again!
1161 */
1162 if (tcp->tcp_rcv_list != NULL) {
1163 /* We drain directly in case of fused tcp loopback */
1164
1165 if (!tcp->tcp_fused && canputnext(q)) {
1166 tcp->tcp_rwnd = connp->conn_rcvbuf;
1167 if (tcp->tcp_state >= TCPS_ESTABLISHED &&
1168 tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) {
1169 tcp_xmit_ctl(NULL,
1170 tcp, (tcp->tcp_swnd == 0) ?
1171 tcp->tcp_suna : tcp->tcp_snxt,
1172 tcp->tcp_rnxt, TH_ACK);
1173 }
1174 }
1175
1176 (void) tcp_rcv_drain(tcp);
1177
1178 /*
1179 * For fused tcp loopback, back-enable peer endpoint
1180 * if it's currently flow-controlled.
1181 */
1182 if (tcp->tcp_fused) {
1183 tcp_t *peer_tcp = tcp->tcp_loopback_peer;
1184
1185 ASSERT(peer_tcp != NULL);
1186 ASSERT(peer_tcp->tcp_fused);
1187
1188 mutex_enter(&peer_tcp->tcp_non_sq_lock);
1189 if (peer_tcp->tcp_flow_stopped) {
1190 tcp_clrqfull(peer_tcp);
1191 TCP_STAT(tcps, tcp_fusion_backenabled);
1192 }
1193 mutex_exit(&peer_tcp->tcp_non_sq_lock);
1194 }
1195 }
1196 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg);
1197 if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) {
1198 tcp->tcp_ordrel_done = B_TRUE;
1199 mp = tcp->tcp_ordrel_mp;
1200 tcp->tcp_ordrel_mp = NULL;
1201 putnext(q, mp);
1202 }
1203 tcp->tcp_hard_binding = B_FALSE;
1204
1205 if (connp->conn_keepalive) {
1206 tcp->tcp_ka_last_intrvl = 0;
1207 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer,
1208 tcp->tcp_ka_interval);
1209 }
1210
1211 /*
1212 * At this point, eager is fully established and will
1213 * have the following references -
1214 *
1215 * 2 references for connection to exist (1 for TCP and 1 for IP).
1216 * 1 reference for the squeue which will be dropped by the squeue as
1217 * soon as this function returns.
1218 * There will be 1 additonal reference for being in classifier
1219 * hash list provided something bad hasn't happened.
1220 */
1221 ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
1222 (connp->conn_fanout == NULL && connp->conn_ref >= 3));
1223 }
1224
1225 /*
1226 * Pull a deferred connection indication off of the listener. The caller
1227 * must verify that there is a deferred conn ind under eager_lock before
1228 * calling this function.
1229 */
1230 static mblk_t *
tcp_get_def_conn_ind(tcp_t * listener)1231 tcp_get_def_conn_ind(tcp_t *listener)
1232 {
1233 tcp_t *tail;
1234 tcp_t *tcp;
1235 mblk_t *conn_ind;
1236
1237 ASSERT(MUTEX_HELD(&listener->tcp_eager_lock));
1238 ASSERT(listener->tcp_eager_prev_q0->tcp_conn_def_q0);
1239
1240 tcp = listener->tcp_eager_prev_q0;
1241 /*
1242 * listener->tcp_eager_prev_q0 points to the TAIL of the
1243 * deferred T_conn_ind queue. We need to get to the head
1244 * of the queue in order to send up T_conn_ind the same
1245 * order as how the 3WHS is completed.
1246 */
1247 while (tcp != listener) {
1248 if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0)
1249 break;
1250 else
1251 tcp = tcp->tcp_eager_prev_q0;
1252 }
1253
1254 conn_ind = tcp->tcp_conn.tcp_eager_conn_ind;
1255 tcp->tcp_conn.tcp_eager_conn_ind = NULL;
1256 /* Move from q0 to q */
1257 ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
1258 listener->tcp_conn_req_cnt_q0--;
1259 listener->tcp_conn_req_cnt_q++;
1260 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
1261 tcp->tcp_eager_prev_q0;
1262 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
1263 tcp->tcp_eager_next_q0;
1264 tcp->tcp_eager_prev_q0 = NULL;
1265 tcp->tcp_eager_next_q0 = NULL;
1266 tcp->tcp_conn_def_q0 = B_FALSE;
1267
1268 /* Make sure the tcp isn't in the list of droppables */
1269 ASSERT(tcp->tcp_eager_next_drop_q0 == NULL &&
1270 tcp->tcp_eager_prev_drop_q0 == NULL);
1271
1272 /*
1273 * Insert at end of the queue because sockfs sends
1274 * down T_CONN_RES in chronological order. Leaving
1275 * the older conn indications at front of the queue
1276 * helps reducing search time.
1277 */
1278 tail = listener->tcp_eager_last_q;
1279 if (tail != NULL) {
1280 tail->tcp_eager_next_q = tcp;
1281 } else {
1282 listener->tcp_eager_next_q = tcp;
1283 }
1284 listener->tcp_eager_last_q = tcp;
1285 tcp->tcp_eager_next_q = NULL;
1286
1287 return (conn_ind);
1288 }
1289
1290
1291 /*
1292 * Reply to a clients T_CONN_RES TPI message. This function
1293 * is used only for TLI/XTI listener. Sockfs sends T_CONN_RES
1294 * on the acceptor STREAM and processed in tcp_accept_common().
1295 * Read the block comment on top of tcp_input_listener().
1296 */
1297 void
tcp_tli_accept(tcp_t * listener,mblk_t * mp)1298 tcp_tli_accept(tcp_t *listener, mblk_t *mp)
1299 {
1300 tcp_t *acceptor;
1301 tcp_t *eager;
1302 struct T_conn_res *tcr;
1303 t_uscalar_t acceptor_id;
1304 t_scalar_t seqnum;
1305 mblk_t *discon_mp = NULL;
1306 mblk_t *ok_mp;
1307 mblk_t *mp1;
1308 tcp_stack_t *tcps = listener->tcp_tcps;
1309 conn_t *econnp;
1310
1311 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) {
1312 tcp_err_ack(listener, mp, TPROTO, 0);
1313 return;
1314 }
1315 tcr = (struct T_conn_res *)mp->b_rptr;
1316
1317 /*
1318 * Under ILP32 the stream head points tcr->ACCEPTOR_id at the
1319 * read side queue of the streams device underneath us i.e. the
1320 * read side queue of 'ip'. Since we can't deference QUEUE_ptr we
1321 * look it up in the queue_hash. Under LP64 it sends down the
1322 * minor_t of the accepting endpoint.
1323 *
1324 * Once the acceptor/eager are modified (in tcp_accept_swap) the
1325 * fanout hash lock is held.
1326 * This prevents any thread from entering the acceptor queue from
1327 * below (since it has not been hard bound yet i.e. any inbound
1328 * packets will arrive on the listener conn_t and
1329 * go through the classifier).
1330 * The CONN_INC_REF will prevent the acceptor from closing.
1331 *
1332 * XXX It is still possible for a tli application to send down data
1333 * on the accepting stream while another thread calls t_accept.
1334 * This should not be a problem for well-behaved applications since
1335 * the T_OK_ACK is sent after the queue swapping is completed.
1336 *
1337 * If the accepting fd is the same as the listening fd, avoid
1338 * queue hash lookup since that will return an eager listener in a
1339 * already established state.
1340 */
1341 acceptor_id = tcr->ACCEPTOR_id;
1342 mutex_enter(&listener->tcp_eager_lock);
1343 if (listener->tcp_acceptor_id == acceptor_id) {
1344 eager = listener->tcp_eager_next_q;
1345 /* only count how many T_CONN_INDs so don't count q0 */
1346 if ((listener->tcp_conn_req_cnt_q != 1) ||
1347 (eager->tcp_conn_req_seqnum != tcr->SEQ_number)) {
1348 mutex_exit(&listener->tcp_eager_lock);
1349 tcp_err_ack(listener, mp, TBADF, 0);
1350 return;
1351 }
1352 if (listener->tcp_conn_req_cnt_q0 != 0) {
1353 /* Throw away all the eagers on q0. */
1354 tcp_eager_cleanup(listener, 1);
1355 }
1356 if (listener->tcp_syn_defense) {
1357 listener->tcp_syn_defense = B_FALSE;
1358 if (listener->tcp_ip_addr_cache != NULL) {
1359 kmem_free(listener->tcp_ip_addr_cache,
1360 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
1361 listener->tcp_ip_addr_cache = NULL;
1362 }
1363 }
1364 /*
1365 * Transfer tcp_conn_req_max to the eager so that when
1366 * a disconnect occurs we can revert the endpoint to the
1367 * listen state.
1368 */
1369 eager->tcp_conn_req_max = listener->tcp_conn_req_max;
1370 ASSERT(listener->tcp_conn_req_cnt_q0 == 0);
1371 /*
1372 * Get a reference on the acceptor just like the
1373 * tcp_acceptor_hash_lookup below.
1374 */
1375 acceptor = listener;
1376 CONN_INC_REF(acceptor->tcp_connp);
1377 } else {
1378 acceptor = tcp_acceptor_hash_lookup(acceptor_id, tcps);
1379 if (acceptor == NULL) {
1380 if (listener->tcp_connp->conn_debug) {
1381 (void) strlog(TCP_MOD_ID, 0, 1,
1382 SL_ERROR|SL_TRACE,
1383 "tcp_accept: did not find acceptor 0x%x\n",
1384 acceptor_id);
1385 }
1386 mutex_exit(&listener->tcp_eager_lock);
1387 tcp_err_ack(listener, mp, TPROVMISMATCH, 0);
1388 return;
1389 }
1390 /*
1391 * Verify acceptor state. The acceptable states for an acceptor
1392 * include TCPS_IDLE and TCPS_BOUND.
1393 */
1394 switch (acceptor->tcp_state) {
1395 case TCPS_IDLE:
1396 /* FALLTHRU */
1397 case TCPS_BOUND:
1398 break;
1399 default:
1400 CONN_DEC_REF(acceptor->tcp_connp);
1401 mutex_exit(&listener->tcp_eager_lock);
1402 tcp_err_ack(listener, mp, TOUTSTATE, 0);
1403 return;
1404 }
1405 }
1406
1407 /* The listener must be in TCPS_LISTEN */
1408 if (listener->tcp_state != TCPS_LISTEN) {
1409 CONN_DEC_REF(acceptor->tcp_connp);
1410 mutex_exit(&listener->tcp_eager_lock);
1411 tcp_err_ack(listener, mp, TOUTSTATE, 0);
1412 return;
1413 }
1414
1415 /*
1416 * Rendezvous with an eager connection request packet hanging off
1417 * 'tcp' that has the 'seqnum' tag. We tagged the detached open
1418 * tcp structure when the connection packet arrived in
1419 * tcp_input_listener().
1420 */
1421 seqnum = tcr->SEQ_number;
1422 eager = listener;
1423 do {
1424 eager = eager->tcp_eager_next_q;
1425 if (eager == NULL) {
1426 CONN_DEC_REF(acceptor->tcp_connp);
1427 mutex_exit(&listener->tcp_eager_lock);
1428 tcp_err_ack(listener, mp, TBADSEQ, 0);
1429 return;
1430 }
1431 } while (eager->tcp_conn_req_seqnum != seqnum);
1432 mutex_exit(&listener->tcp_eager_lock);
1433
1434 /*
1435 * At this point, both acceptor and listener have 2 ref
1436 * that they begin with. Acceptor has one additional ref
1437 * we placed in lookup while listener has 3 additional
1438 * ref for being behind the squeue (tcp_accept() is
1439 * done on listener's squeue); being in classifier hash;
1440 * and eager's ref on listener.
1441 */
1442 ASSERT(listener->tcp_connp->conn_ref >= 5);
1443 ASSERT(acceptor->tcp_connp->conn_ref >= 3);
1444
1445 /*
1446 * The eager at this point is set in its own squeue and
1447 * could easily have been killed (tcp_accept_finish will
1448 * deal with that) because of a TH_RST so we can only
1449 * ASSERT for a single ref.
1450 */
1451 ASSERT(eager->tcp_connp->conn_ref >= 1);
1452
1453 /*
1454 * Pre allocate the discon_ind mblk also. tcp_accept_finish will
1455 * use it if something failed.
1456 */
1457 discon_mp = allocb(MAX(sizeof (struct T_discon_ind),
1458 sizeof (struct stroptions)), BPRI_HI);
1459 if (discon_mp == NULL) {
1460 CONN_DEC_REF(acceptor->tcp_connp);
1461 CONN_DEC_REF(eager->tcp_connp);
1462 tcp_err_ack(listener, mp, TSYSERR, ENOMEM);
1463 return;
1464 }
1465
1466 econnp = eager->tcp_connp;
1467
1468 /* Hold a copy of mp, in case reallocb fails */
1469 if ((mp1 = copymsg(mp)) == NULL) {
1470 CONN_DEC_REF(acceptor->tcp_connp);
1471 CONN_DEC_REF(eager->tcp_connp);
1472 freemsg(discon_mp);
1473 tcp_err_ack(listener, mp, TSYSERR, ENOMEM);
1474 return;
1475 }
1476
1477 tcr = (struct T_conn_res *)mp1->b_rptr;
1478
1479 /*
1480 * This is an expanded version of mi_tpi_ok_ack_alloc()
1481 * which allocates a larger mblk and appends the new
1482 * local address to the ok_ack. The address is copied by
1483 * soaccept() for getsockname().
1484 */
1485 {
1486 int extra;
1487
1488 extra = (econnp->conn_family == AF_INET) ?
1489 sizeof (sin_t) : sizeof (sin6_t);
1490
1491 /*
1492 * Try to re-use mp, if possible. Otherwise, allocate
1493 * an mblk and return it as ok_mp. In any case, mp
1494 * is no longer usable upon return.
1495 */
1496 if ((ok_mp = mi_tpi_ok_ack_alloc_extra(mp, extra)) == NULL) {
1497 CONN_DEC_REF(acceptor->tcp_connp);
1498 CONN_DEC_REF(eager->tcp_connp);
1499 freemsg(discon_mp);
1500 /* Original mp has been freed by now, so use mp1 */
1501 tcp_err_ack(listener, mp1, TSYSERR, ENOMEM);
1502 return;
1503 }
1504
1505 mp = NULL; /* We should never use mp after this point */
1506
1507 switch (extra) {
1508 case sizeof (sin_t): {
1509 sin_t *sin = (sin_t *)ok_mp->b_wptr;
1510
1511 ok_mp->b_wptr += extra;
1512 sin->sin_family = AF_INET;
1513 sin->sin_port = econnp->conn_lport;
1514 sin->sin_addr.s_addr = econnp->conn_laddr_v4;
1515 break;
1516 }
1517 case sizeof (sin6_t): {
1518 sin6_t *sin6 = (sin6_t *)ok_mp->b_wptr;
1519
1520 ok_mp->b_wptr += extra;
1521 sin6->sin6_family = AF_INET6;
1522 sin6->sin6_port = econnp->conn_lport;
1523 sin6->sin6_addr = econnp->conn_laddr_v6;
1524 sin6->sin6_flowinfo = econnp->conn_flowinfo;
1525 if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) &&
1526 (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) {
1527 sin6->sin6_scope_id =
1528 econnp->conn_ixa->ixa_scopeid;
1529 } else {
1530 sin6->sin6_scope_id = 0;
1531 }
1532 sin6->__sin6_src_id = 0;
1533 break;
1534 }
1535 default:
1536 break;
1537 }
1538 ASSERT(ok_mp->b_wptr <= ok_mp->b_datap->db_lim);
1539 }
1540
1541 /*
1542 * If there are no options we know that the T_CONN_RES will
1543 * succeed. However, we can't send the T_OK_ACK upstream until
1544 * the tcp_accept_swap is done since it would be dangerous to
1545 * let the application start using the new fd prior to the swap.
1546 */
1547 tcp_accept_swap(listener, acceptor, eager);
1548
1549 /*
1550 * tcp_accept_swap unlinks eager from listener but does not drop
1551 * the eager's reference on the listener.
1552 */
1553 ASSERT(eager->tcp_listener == NULL);
1554 ASSERT(listener->tcp_connp->conn_ref >= 5);
1555
1556 /*
1557 * The eager is now associated with its own queue. Insert in
1558 * the hash so that the connection can be reused for a future
1559 * T_CONN_RES.
1560 */
1561 tcp_acceptor_hash_insert(acceptor_id, eager);
1562
1563 /*
1564 * We now do the processing of options with T_CONN_RES.
1565 * We delay till now since we wanted to have queue to pass to
1566 * option processing routines that points back to the right
1567 * instance structure which does not happen until after
1568 * tcp_accept_swap().
1569 *
1570 * Note:
1571 * The sanity of the logic here assumes that whatever options
1572 * are appropriate to inherit from listner=>eager are done
1573 * before this point, and whatever were to be overridden (or not)
1574 * in transfer logic from eager=>acceptor in tcp_accept_swap().
1575 * [ Warning: acceptor endpoint can have T_OPTMGMT_REQ done to it
1576 * before its ACCEPTOR_id comes down in T_CONN_RES ]
1577 * This may not be true at this point in time but can be fixed
1578 * independently. This option processing code starts with
1579 * the instantiated acceptor instance and the final queue at
1580 * this point.
1581 */
1582
1583 if (tcr->OPT_length != 0) {
1584 /* Options to process */
1585 int t_error = 0;
1586 int sys_error = 0;
1587 int do_disconnect = 0;
1588
1589 if (tcp_conprim_opt_process(eager, mp1,
1590 &do_disconnect, &t_error, &sys_error) < 0) {
1591 eager->tcp_accept_error = 1;
1592 if (do_disconnect) {
1593 /*
1594 * An option failed which does not allow
1595 * connection to be accepted.
1596 *
1597 * We allow T_CONN_RES to succeed and
1598 * put a T_DISCON_IND on the eager queue.
1599 */
1600 ASSERT(t_error == 0 && sys_error == 0);
1601 eager->tcp_send_discon_ind = 1;
1602 } else {
1603 ASSERT(t_error != 0);
1604 freemsg(ok_mp);
1605 /*
1606 * Original mp was either freed or set
1607 * to ok_mp above, so use mp1 instead.
1608 */
1609 tcp_err_ack(listener, mp1, t_error, sys_error);
1610 goto finish;
1611 }
1612 }
1613 /*
1614 * Most likely success in setting options (except if
1615 * eager->tcp_send_discon_ind set).
1616 * mp1 option buffer represented by OPT_length/offset
1617 * potentially modified and contains results of setting
1618 * options at this point
1619 */
1620 }
1621
1622 /* We no longer need mp1, since all options processing has passed */
1623 freemsg(mp1);
1624
1625 putnext(listener->tcp_connp->conn_rq, ok_mp);
1626
1627 mutex_enter(&listener->tcp_eager_lock);
1628 if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) {
1629 mblk_t *conn_ind;
1630
1631 /*
1632 * This path should not be executed if listener and
1633 * acceptor streams are the same.
1634 */
1635 ASSERT(listener != acceptor);
1636 conn_ind = tcp_get_def_conn_ind(listener);
1637 mutex_exit(&listener->tcp_eager_lock);
1638 putnext(listener->tcp_connp->conn_rq, conn_ind);
1639 } else {
1640 mutex_exit(&listener->tcp_eager_lock);
1641 }
1642
1643 /*
1644 * Done with the acceptor - free it
1645 *
1646 * Note: from this point on, no access to listener should be made
1647 * as listener can be equal to acceptor.
1648 */
1649 finish:
1650 ASSERT(acceptor->tcp_detached);
1651 acceptor->tcp_connp->conn_rq = NULL;
1652 ASSERT(!IPCL_IS_NONSTR(acceptor->tcp_connp));
1653 acceptor->tcp_connp->conn_wq = NULL;
1654 (void) tcp_clean_death(acceptor, 0);
1655 CONN_DEC_REF(acceptor->tcp_connp);
1656
1657 /*
1658 * We pass discon_mp to tcp_accept_finish to get on the right squeue.
1659 *
1660 * It will update the setting for sockfs/stream head and also take
1661 * care of any data that arrived before accept() wad called.
1662 * In case we already received a FIN then tcp_accept_finish will send up
1663 * the ordrel. It will also send up a window update if the window
1664 * has opened up.
1665 */
1666
1667 /*
1668 * XXX: we currently have a problem if XTI application closes the
1669 * acceptor stream in between. This problem exists in on10-gate also
1670 * and is well know but nothing can be done short of major rewrite
1671 * to fix it. Now it is possible to take care of it by assigning TLI/XTI
1672 * eager same squeue as listener (we can distinguish non socket
1673 * listeners at the time of handling a SYN in tcp_input_listener)
1674 * and do most of the work that tcp_accept_finish does here itself
1675 * and then get behind the acceptor squeue to access the acceptor
1676 * queue.
1677 */
1678 /*
1679 * We already have a ref on tcp so no need to do one before squeue_enter
1680 */
1681 SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, discon_mp,
1682 tcp_accept_finish, eager->tcp_connp, NULL, SQ_FILL,
1683 SQTAG_TCP_ACCEPT_FINISH);
1684 }
1685
1686
1687 /*
1688 * This is the STREAMS entry point for T_CONN_RES coming down on
1689 * Acceptor STREAM when sockfs listener does accept processing.
1690 * Read the block comment on top of tcp_input_listener().
1691 */
1692 int
tcp_tpi_accept(queue_t * q,mblk_t * mp)1693 tcp_tpi_accept(queue_t *q, mblk_t *mp)
1694 {
1695 queue_t *rq = RD(q);
1696 struct T_conn_res *conn_res;
1697 tcp_t *eager;
1698 tcp_t *listener;
1699 struct T_ok_ack *ok;
1700 t_scalar_t PRIM_type;
1701 mblk_t *discon_mp;
1702 conn_t *econnp;
1703 cred_t *cr;
1704
1705 ASSERT(DB_TYPE(mp) == M_PROTO);
1706
1707 /*
1708 * All Solaris components should pass a db_credp
1709 * for this TPI message, hence we ASSERT.
1710 * But in case there is some other M_PROTO that looks
1711 * like a TPI message sent by some other kernel
1712 * component, we check and return an error.
1713 */
1714 cr = msg_getcred(mp, NULL);
1715 ASSERT(cr != NULL);
1716 if (cr == NULL) {
1717 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, EINVAL);
1718 if (mp != NULL)
1719 putnext(rq, mp);
1720 return (0);
1721 }
1722 conn_res = (struct T_conn_res *)mp->b_rptr;
1723 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
1724 if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_res)) {
1725 mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0);
1726 if (mp != NULL)
1727 putnext(rq, mp);
1728 return (0);
1729 }
1730 switch (conn_res->PRIM_type) {
1731 case O_T_CONN_RES:
1732 case T_CONN_RES:
1733 /*
1734 * We pass up an err ack if allocb fails. This will
1735 * cause sockfs to issue a T_DISCON_REQ which will cause
1736 * tcp_eager_blowoff to be called. sockfs will then call
1737 * rq->q_qinfo->qi_qclose to cleanup the acceptor stream.
1738 * we need to do the allocb up here because we have to
1739 * make sure rq->q_qinfo->qi_qclose still points to the
1740 * correct function (tcp_tpi_close_accept) in case allocb
1741 * fails.
1742 */
1743 bcopy(mp->b_rptr + conn_res->OPT_offset,
1744 &eager, conn_res->OPT_length);
1745 PRIM_type = conn_res->PRIM_type;
1746 mp->b_datap->db_type = M_PCPROTO;
1747 mp->b_wptr = mp->b_rptr + sizeof (struct T_ok_ack);
1748 ok = (struct T_ok_ack *)mp->b_rptr;
1749 ok->PRIM_type = T_OK_ACK;
1750 ok->CORRECT_prim = PRIM_type;
1751 econnp = eager->tcp_connp;
1752 econnp->conn_dev = (dev_t)RD(q)->q_ptr;
1753 econnp->conn_minor_arena = (vmem_t *)(WR(q)->q_ptr);
1754 econnp->conn_rq = rq;
1755 econnp->conn_wq = q;
1756 rq->q_ptr = econnp;
1757 rq->q_qinfo = &tcp_rinitv4; /* No open - same as rinitv6 */
1758 q->q_ptr = econnp;
1759 q->q_qinfo = &tcp_winit;
1760 listener = eager->tcp_listener;
1761
1762 /*
1763 * Pre allocate the discon_ind mblk also. tcp_accept_finish will
1764 * use it if something failed.
1765 */
1766 discon_mp = allocb(MAX(sizeof (struct T_discon_ind),
1767 sizeof (struct stroptions)), BPRI_HI);
1768
1769 if (discon_mp == NULL) {
1770 mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0);
1771 if (mp != NULL)
1772 putnext(rq, mp);
1773 return (0);
1774 }
1775
1776 eager->tcp_issocket = B_TRUE;
1777
1778 ASSERT(econnp->conn_netstack ==
1779 listener->tcp_connp->conn_netstack);
1780 ASSERT(eager->tcp_tcps == listener->tcp_tcps);
1781
1782 /* Put the ref for IP */
1783 CONN_INC_REF(econnp);
1784
1785 /*
1786 * We should have minimum of 3 references on the conn
1787 * at this point. One each for TCP and IP and one for
1788 * the T_conn_ind that was sent up when the 3-way handshake
1789 * completed. In the normal case we would also have another
1790 * reference (making a total of 4) for the conn being in the
1791 * classifier hash list. However the eager could have received
1792 * an RST subsequently and tcp_closei_local could have removed
1793 * the eager from the classifier hash list, hence we can't
1794 * assert that reference.
1795 */
1796 ASSERT(econnp->conn_ref >= 3);
1797
1798 mutex_enter(&listener->tcp_eager_lock);
1799 if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) {
1800 mblk_t *conn_ind = tcp_get_def_conn_ind(listener);
1801
1802 /* Need to get inside the listener perimeter */
1803 CONN_INC_REF(listener->tcp_connp);
1804 SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp,
1805 conn_ind, tcp_send_pending, listener->tcp_connp,
1806 NULL, SQ_FILL, SQTAG_TCP_SEND_PENDING);
1807 }
1808 tcp_eager_unlink(eager);
1809 mutex_exit(&listener->tcp_eager_lock);
1810
1811 /*
1812 * At this point, the eager is detached from the listener
1813 * but we still have an extra refs on eager (apart from the
1814 * usual tcp references). The ref was placed in tcp_input_data
1815 * before sending the conn_ind in tcp_send_conn_ind.
1816 * The ref will be dropped in tcp_accept_finish().
1817 */
1818 SQUEUE_ENTER_ONE(econnp->conn_sqp, discon_mp, tcp_accept_finish,
1819 econnp, NULL, SQ_NODRAIN, SQTAG_TCP_ACCEPT_FINISH_Q0);
1820
1821 /*
1822 * Send the new local address also up to sockfs. There
1823 * should already be enough space in the mp that came
1824 * down from soaccept().
1825 */
1826 if (econnp->conn_family == AF_INET) {
1827 sin_t *sin;
1828
1829 ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >=
1830 (sizeof (struct T_ok_ack) + sizeof (sin_t)));
1831 sin = (sin_t *)mp->b_wptr;
1832 mp->b_wptr += sizeof (sin_t);
1833 sin->sin_family = AF_INET;
1834 sin->sin_port = econnp->conn_lport;
1835 sin->sin_addr.s_addr = econnp->conn_laddr_v4;
1836 } else {
1837 sin6_t *sin6;
1838
1839 ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >=
1840 sizeof (struct T_ok_ack) + sizeof (sin6_t));
1841 sin6 = (sin6_t *)mp->b_wptr;
1842 mp->b_wptr += sizeof (sin6_t);
1843 sin6->sin6_family = AF_INET6;
1844 sin6->sin6_port = econnp->conn_lport;
1845 sin6->sin6_addr = econnp->conn_laddr_v6;
1846 if (econnp->conn_ipversion == IPV4_VERSION)
1847 sin6->sin6_flowinfo = 0;
1848 else
1849 sin6->sin6_flowinfo = econnp->conn_flowinfo;
1850 if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) &&
1851 (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) {
1852 sin6->sin6_scope_id =
1853 econnp->conn_ixa->ixa_scopeid;
1854 } else {
1855 sin6->sin6_scope_id = 0;
1856 }
1857 sin6->__sin6_src_id = 0;
1858 }
1859
1860 putnext(rq, mp);
1861 break;
1862 default:
1863 mp = mi_tpi_err_ack_alloc(mp, TNOTSUPPORT, 0);
1864 if (mp != NULL)
1865 putnext(rq, mp);
1866 break;
1867 }
1868 return (0);
1869 }
1870
1871 /*
1872 * The function called through squeue to get behind listener's perimeter to
1873 * send a deferred conn_ind.
1874 */
1875 /* ARGSUSED */
1876 void
tcp_send_pending(void * arg,mblk_t * mp,void * arg2,ip_recv_attr_t * dummy)1877 tcp_send_pending(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
1878 {
1879 conn_t *lconnp = (conn_t *)arg;
1880 tcp_t *listener = lconnp->conn_tcp;
1881 struct T_conn_ind *conn_ind;
1882 tcp_t *tcp;
1883
1884 conn_ind = (struct T_conn_ind *)mp->b_rptr;
1885 bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp,
1886 conn_ind->OPT_length);
1887
1888 if (listener->tcp_state != TCPS_LISTEN) {
1889 /*
1890 * If listener has closed, it would have caused a
1891 * a cleanup/blowoff to happen for the eager, so
1892 * we don't need to do anything more.
1893 */
1894 freemsg(mp);
1895 return;
1896 }
1897
1898 putnext(lconnp->conn_rq, mp);
1899 }
1900
1901 /*
1902 * Sends the T_CONN_IND to the listener. The caller calls this
1903 * functions via squeue to get inside the listener's perimeter
1904 * once the 3 way hand shake is done a T_CONN_IND needs to be
1905 * sent. As an optimization, the caller can call this directly
1906 * if listener's perimeter is same as eager's.
1907 */
1908 /* ARGSUSED */
1909 void
tcp_send_conn_ind(void * arg,mblk_t * mp,void * arg2)1910 tcp_send_conn_ind(void *arg, mblk_t *mp, void *arg2)
1911 {
1912 conn_t *lconnp = (conn_t *)arg;
1913 tcp_t *listener = lconnp->conn_tcp;
1914 tcp_t *tcp;
1915 struct T_conn_ind *conn_ind;
1916 ipaddr_t *addr_cache;
1917 boolean_t need_send_conn_ind = B_FALSE;
1918 tcp_stack_t *tcps = listener->tcp_tcps;
1919
1920 /* retrieve the eager */
1921 conn_ind = (struct T_conn_ind *)mp->b_rptr;
1922 ASSERT(conn_ind->OPT_offset != 0 &&
1923 conn_ind->OPT_length == sizeof (intptr_t));
1924 bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp,
1925 conn_ind->OPT_length);
1926
1927 /*
1928 * TLI/XTI applications will get confused by
1929 * sending eager as an option since it violates
1930 * the option semantics. So remove the eager as
1931 * option since TLI/XTI app doesn't need it anyway.
1932 */
1933 if (!TCP_IS_SOCKET(listener)) {
1934 conn_ind->OPT_length = 0;
1935 conn_ind->OPT_offset = 0;
1936 }
1937 if (listener->tcp_state != TCPS_LISTEN) {
1938 /*
1939 * If listener has closed, it would have caused a
1940 * a cleanup/blowoff to happen for the eager. We
1941 * just need to return.
1942 */
1943 freemsg(mp);
1944 return;
1945 }
1946
1947
1948 /*
1949 * if the conn_req_q is full defer passing up the
1950 * T_CONN_IND until space is availabe after t_accept()
1951 * processing
1952 */
1953 mutex_enter(&listener->tcp_eager_lock);
1954
1955 /*
1956 * Take the eager out, if it is in the list of droppable eagers
1957 * as we are here because the 3W handshake is over.
1958 */
1959 MAKE_UNDROPPABLE(tcp);
1960
1961 if (listener->tcp_conn_req_cnt_q < listener->tcp_conn_req_max) {
1962 tcp_t *tail;
1963
1964 /*
1965 * The eager already has an extra ref put in tcp_input_data
1966 * so that it stays till accept comes back even though it
1967 * might get into TCPS_CLOSED as a result of a TH_RST etc.
1968 */
1969 ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
1970 listener->tcp_conn_req_cnt_q0--;
1971 listener->tcp_conn_req_cnt_q++;
1972
1973 /* Move from SYN_RCVD to ESTABLISHED list */
1974 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
1975 tcp->tcp_eager_prev_q0;
1976 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
1977 tcp->tcp_eager_next_q0;
1978 tcp->tcp_eager_prev_q0 = NULL;
1979 tcp->tcp_eager_next_q0 = NULL;
1980
1981 /*
1982 * Insert at end of the queue because sockfs
1983 * sends down T_CONN_RES in chronological
1984 * order. Leaving the older conn indications
1985 * at front of the queue helps reducing search
1986 * time.
1987 */
1988 tail = listener->tcp_eager_last_q;
1989 if (tail != NULL)
1990 tail->tcp_eager_next_q = tcp;
1991 else
1992 listener->tcp_eager_next_q = tcp;
1993 listener->tcp_eager_last_q = tcp;
1994 tcp->tcp_eager_next_q = NULL;
1995 /*
1996 * Delay sending up the T_conn_ind until we are
1997 * done with the eager. Once we have have sent up
1998 * the T_conn_ind, the accept can potentially complete
1999 * any time and release the refhold we have on the eager.
2000 */
2001 need_send_conn_ind = B_TRUE;
2002 } else {
2003 /*
2004 * Defer connection on q0 and set deferred
2005 * connection bit true
2006 */
2007 tcp->tcp_conn_def_q0 = B_TRUE;
2008
2009 /* take tcp out of q0 ... */
2010 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
2011 tcp->tcp_eager_next_q0;
2012 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
2013 tcp->tcp_eager_prev_q0;
2014
2015 /* ... and place it at the end of q0 */
2016 tcp->tcp_eager_prev_q0 = listener->tcp_eager_prev_q0;
2017 tcp->tcp_eager_next_q0 = listener;
2018 listener->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp;
2019 listener->tcp_eager_prev_q0 = tcp;
2020 tcp->tcp_conn.tcp_eager_conn_ind = mp;
2021 }
2022
2023 /* we have timed out before */
2024 if (tcp->tcp_syn_rcvd_timeout != 0) {
2025 tcp->tcp_syn_rcvd_timeout = 0;
2026 listener->tcp_syn_rcvd_timeout--;
2027 if (listener->tcp_syn_defense &&
2028 listener->tcp_syn_rcvd_timeout <=
2029 (tcps->tcps_conn_req_max_q0 >> 5) &&
2030 10*MINUTES < TICK_TO_MSEC(ddi_get_lbolt64() -
2031 listener->tcp_last_rcv_lbolt)) {
2032 /*
2033 * Turn off the defense mode if we
2034 * believe the SYN attack is over.
2035 */
2036 listener->tcp_syn_defense = B_FALSE;
2037 if (listener->tcp_ip_addr_cache) {
2038 kmem_free((void *)listener->tcp_ip_addr_cache,
2039 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
2040 listener->tcp_ip_addr_cache = NULL;
2041 }
2042 }
2043 }
2044 addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache);
2045 if (addr_cache != NULL) {
2046 /*
2047 * We have finished a 3-way handshake with this
2048 * remote host. This proves the IP addr is good.
2049 * Cache it!
2050 */
2051 addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] =
2052 tcp->tcp_connp->conn_faddr_v4;
2053 }
2054 mutex_exit(&listener->tcp_eager_lock);
2055 if (need_send_conn_ind)
2056 putnext(lconnp->conn_rq, mp);
2057 }
2058