1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 /* This files contains all TCP TLI/TPI related functions */
27
28 #include <sys/types.h>
29 #include <sys/stream.h>
30 #include <sys/strsun.h>
31 #include <sys/strsubr.h>
32 #include <sys/stropts.h>
33 #include <sys/strlog.h>
34 #define _SUN_TPI_VERSION 2
35 #include <sys/tihdr.h>
36 #include <sys/suntpi.h>
37 #include <sys/xti_inet.h>
38 #include <sys/squeue_impl.h>
39 #include <sys/squeue.h>
40
41 #include <inet/common.h>
42 #include <inet/ip.h>
43 #include <inet/tcp.h>
44 #include <inet/tcp_impl.h>
45 #include <inet/proto_set.h>
46
47 static void tcp_accept_swap(tcp_t *, tcp_t *, tcp_t *);
48 static int tcp_conprim_opt_process(tcp_t *, mblk_t *, int *, int *, int *);
49
50 void
tcp_use_pure_tpi(tcp_t * tcp)51 tcp_use_pure_tpi(tcp_t *tcp)
52 {
53 conn_t *connp = tcp->tcp_connp;
54
55 #ifdef _ILP32
56 tcp->tcp_acceptor_id = (t_uscalar_t)connp->conn_rq;
57 #else
58 tcp->tcp_acceptor_id = connp->conn_dev;
59 #endif
60 /*
61 * Insert this socket into the acceptor hash.
62 * We might need it for T_CONN_RES message
63 */
64 tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp);
65
66 tcp->tcp_issocket = B_FALSE;
67 TCP_STAT(tcp->tcp_tcps, tcp_sock_fallback);
68 }
69
70 /* Shorthand to generate and send TPI error acks to our client */
71 void
tcp_err_ack(tcp_t * tcp,mblk_t * mp,int t_error,int sys_error)72 tcp_err_ack(tcp_t *tcp, mblk_t *mp, int t_error, int sys_error)
73 {
74 if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
75 putnext(tcp->tcp_connp->conn_rq, mp);
76 }
77
78 /* Shorthand to generate and send TPI error acks to our client */
79 void
tcp_err_ack_prim(tcp_t * tcp,mblk_t * mp,int primitive,int t_error,int sys_error)80 tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive,
81 int t_error, int sys_error)
82 {
83 struct T_error_ack *teackp;
84
85 if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack),
86 M_PCPROTO, T_ERROR_ACK)) != NULL) {
87 teackp = (struct T_error_ack *)mp->b_rptr;
88 teackp->ERROR_prim = primitive;
89 teackp->TLI_error = t_error;
90 teackp->UNIX_error = sys_error;
91 putnext(tcp->tcp_connp->conn_rq, mp);
92 }
93 }
94
95 /*
96 * TCP routine to get the values of options.
97 */
98 int
tcp_tpi_opt_get(queue_t * q,int level,int name,uchar_t * ptr)99 tcp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
100 {
101 return (tcp_opt_get(Q_TO_CONN(q), level, name, ptr));
102 }
103
104 /* ARGSUSED */
105 int
tcp_tpi_opt_set(queue_t * q,uint_t optset_context,int level,int name,uint_t inlen,uchar_t * invalp,uint_t * outlenp,uchar_t * outvalp,void * thisdg_attrs,cred_t * cr)106 tcp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name,
107 uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
108 void *thisdg_attrs, cred_t *cr)
109 {
110 conn_t *connp = Q_TO_CONN(q);
111
112 return (tcp_opt_set(connp, optset_context, level, name, inlen, invalp,
113 outlenp, outvalp, thisdg_attrs, cr));
114 }
115
116 static int
tcp_conprim_opt_process(tcp_t * tcp,mblk_t * mp,int * do_disconnectp,int * t_errorp,int * sys_errorp)117 tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp, int *do_disconnectp,
118 int *t_errorp, int *sys_errorp)
119 {
120 int error;
121 int is_absreq_failure;
122 t_scalar_t *opt_lenp;
123 t_scalar_t opt_offset;
124 int prim_type;
125 struct T_conn_req *tcreqp;
126 struct T_conn_res *tcresp;
127 cred_t *cr;
128
129 /*
130 * All Solaris components should pass a db_credp
131 * for this TPI message, hence we ASSERT.
132 * But in case there is some other M_PROTO that looks
133 * like a TPI message sent by some other kernel
134 * component, we check and return an error.
135 */
136 cr = msg_getcred(mp, NULL);
137 ASSERT(cr != NULL);
138 if (cr == NULL)
139 return (-1);
140
141 prim_type = ((union T_primitives *)mp->b_rptr)->type;
142 ASSERT(prim_type == T_CONN_REQ || prim_type == O_T_CONN_RES ||
143 prim_type == T_CONN_RES);
144
145 switch (prim_type) {
146 case T_CONN_REQ:
147 tcreqp = (struct T_conn_req *)mp->b_rptr;
148 opt_offset = tcreqp->OPT_offset;
149 opt_lenp = (t_scalar_t *)&tcreqp->OPT_length;
150 break;
151 case O_T_CONN_RES:
152 case T_CONN_RES:
153 tcresp = (struct T_conn_res *)mp->b_rptr;
154 opt_offset = tcresp->OPT_offset;
155 opt_lenp = (t_scalar_t *)&tcresp->OPT_length;
156 break;
157 }
158
159 *t_errorp = 0;
160 *sys_errorp = 0;
161 *do_disconnectp = 0;
162
163 error = tpi_optcom_buf(tcp->tcp_connp->conn_wq, mp, opt_lenp,
164 opt_offset, cr, &tcp_opt_obj,
165 NULL, &is_absreq_failure);
166
167 switch (error) {
168 case 0: /* no error */
169 ASSERT(is_absreq_failure == 0);
170 return (0);
171 case ENOPROTOOPT:
172 *t_errorp = TBADOPT;
173 break;
174 case EACCES:
175 *t_errorp = TACCES;
176 break;
177 default:
178 *t_errorp = TSYSERR; *sys_errorp = error;
179 break;
180 }
181 if (is_absreq_failure != 0) {
182 /*
183 * The connection request should get the local ack
184 * T_OK_ACK and then a T_DISCON_IND.
185 */
186 *do_disconnectp = 1;
187 }
188 return (-1);
189 }
190
191 void
tcp_tpi_bind(tcp_t * tcp,mblk_t * mp)192 tcp_tpi_bind(tcp_t *tcp, mblk_t *mp)
193 {
194 int error;
195 conn_t *connp = tcp->tcp_connp;
196 struct sockaddr *sa;
197 mblk_t *mp1;
198 struct T_bind_req *tbr;
199 int backlog;
200 socklen_t len;
201 sin_t *sin;
202 sin6_t *sin6;
203 cred_t *cr;
204
205 /*
206 * All Solaris components should pass a db_credp
207 * for this TPI message, hence we ASSERT.
208 * But in case there is some other M_PROTO that looks
209 * like a TPI message sent by some other kernel
210 * component, we check and return an error.
211 */
212 cr = msg_getcred(mp, NULL);
213 ASSERT(cr != NULL);
214 if (cr == NULL) {
215 tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
216 return;
217 }
218
219 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
220 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
221 if (connp->conn_debug) {
222 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
223 "tcp_tpi_bind: bad req, len %u",
224 (uint_t)(mp->b_wptr - mp->b_rptr));
225 }
226 tcp_err_ack(tcp, mp, TPROTO, 0);
227 return;
228 }
229 /* Make sure the largest address fits */
230 mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1);
231 if (mp1 == NULL) {
232 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
233 return;
234 }
235 mp = mp1;
236 tbr = (struct T_bind_req *)mp->b_rptr;
237
238 backlog = tbr->CONIND_number;
239 len = tbr->ADDR_length;
240
241 switch (len) {
242 case 0: /* request for a generic port */
243 tbr->ADDR_offset = sizeof (struct T_bind_req);
244 if (connp->conn_family == AF_INET) {
245 tbr->ADDR_length = sizeof (sin_t);
246 sin = (sin_t *)&tbr[1];
247 *sin = sin_null;
248 sin->sin_family = AF_INET;
249 sa = (struct sockaddr *)sin;
250 len = sizeof (sin_t);
251 mp->b_wptr = (uchar_t *)&sin[1];
252 } else {
253 ASSERT(connp->conn_family == AF_INET6);
254 tbr->ADDR_length = sizeof (sin6_t);
255 sin6 = (sin6_t *)&tbr[1];
256 *sin6 = sin6_null;
257 sin6->sin6_family = AF_INET6;
258 sa = (struct sockaddr *)sin6;
259 len = sizeof (sin6_t);
260 mp->b_wptr = (uchar_t *)&sin6[1];
261 }
262 break;
263
264 case sizeof (sin_t): /* Complete IPv4 address */
265 sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset,
266 sizeof (sin_t));
267 break;
268
269 case sizeof (sin6_t): /* Complete IPv6 address */
270 sa = (struct sockaddr *)mi_offset_param(mp,
271 tbr->ADDR_offset, sizeof (sin6_t));
272 break;
273
274 default:
275 if (connp->conn_debug) {
276 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
277 "tcp_tpi_bind: bad address length, %d",
278 tbr->ADDR_length);
279 }
280 tcp_err_ack(tcp, mp, TBADADDR, 0);
281 return;
282 }
283
284 if (backlog > 0) {
285 error = tcp_do_listen(connp, sa, len, backlog, DB_CRED(mp),
286 tbr->PRIM_type != O_T_BIND_REQ);
287 } else {
288 error = tcp_do_bind(connp, sa, len, DB_CRED(mp),
289 tbr->PRIM_type != O_T_BIND_REQ);
290 }
291 done:
292 if (error > 0) {
293 tcp_err_ack(tcp, mp, TSYSERR, error);
294 } else if (error < 0) {
295 tcp_err_ack(tcp, mp, -error, 0);
296 } else {
297 /*
298 * Update port information as sockfs/tpi needs it for checking
299 */
300 if (connp->conn_family == AF_INET) {
301 sin = (sin_t *)sa;
302 sin->sin_port = connp->conn_lport;
303 } else {
304 sin6 = (sin6_t *)sa;
305 sin6->sin6_port = connp->conn_lport;
306 }
307 mp->b_datap->db_type = M_PCPROTO;
308 tbr->PRIM_type = T_BIND_ACK;
309 putnext(connp->conn_rq, mp);
310 }
311 }
312
313 /* tcp_unbind is called by tcp_wput_proto to handle T_UNBIND_REQ messages. */
314 void
tcp_tpi_unbind(tcp_t * tcp,mblk_t * mp)315 tcp_tpi_unbind(tcp_t *tcp, mblk_t *mp)
316 {
317 conn_t *connp = tcp->tcp_connp;
318 int error;
319
320 error = tcp_do_unbind(connp);
321 if (error > 0) {
322 tcp_err_ack(tcp, mp, TSYSERR, error);
323 } else if (error < 0) {
324 tcp_err_ack(tcp, mp, -error, 0);
325 } else {
326 /* Send M_FLUSH according to TPI */
327 (void) putnextctl1(connp->conn_rq, M_FLUSH, FLUSHRW);
328
329 mp = mi_tpi_ok_ack_alloc(mp);
330 if (mp != NULL)
331 putnext(connp->conn_rq, mp);
332 }
333 }
334
335 int
tcp_tpi_close(queue_t * q,int flags)336 tcp_tpi_close(queue_t *q, int flags)
337 {
338 conn_t *connp;
339
340 ASSERT(WR(q)->q_next == NULL);
341
342 if (flags & SO_FALLBACK) {
343 /*
344 * stream is being closed while in fallback
345 * simply free the resources that were allocated
346 */
347 inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr));
348 qprocsoff(q);
349 goto done;
350 }
351
352 connp = Q_TO_CONN(q);
353 /*
354 * We are being closed as /dev/tcp or /dev/tcp6.
355 */
356 tcp_close_common(connp, flags);
357
358 qprocsoff(q);
359 inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
360
361 /*
362 * Drop IP's reference on the conn. This is the last reference
363 * on the connp if the state was less than established. If the
364 * connection has gone into timewait state, then we will have
365 * one ref for the TCP and one more ref (total of two) for the
366 * classifier connected hash list (a timewait connections stays
367 * in connected hash till closed).
368 *
369 * We can't assert the references because there might be other
370 * transient reference places because of some walkers or queued
371 * packets in squeue for the timewait state.
372 */
373 CONN_DEC_REF(connp);
374 done:
375 q->q_ptr = WR(q)->q_ptr = NULL;
376 return (0);
377 }
378
379 int
tcp_tpi_close_accept(queue_t * q)380 tcp_tpi_close_accept(queue_t *q)
381 {
382 vmem_t *minor_arena;
383 dev_t conn_dev;
384 extern struct qinit tcp_acceptor_winit;
385
386 ASSERT(WR(q)->q_qinfo == &tcp_acceptor_winit);
387
388 /*
389 * We had opened an acceptor STREAM for sockfs which is
390 * now being closed due to some error.
391 */
392 qprocsoff(q);
393
394 minor_arena = (vmem_t *)WR(q)->q_ptr;
395 conn_dev = (dev_t)RD(q)->q_ptr;
396 ASSERT(minor_arena != NULL);
397 ASSERT(conn_dev != 0);
398 inet_minor_free(minor_arena, conn_dev);
399 q->q_ptr = WR(q)->q_ptr = NULL;
400 return (0);
401 }
402
403 /*
404 * Put a connection confirmation message upstream built from the
405 * address/flowid information with the conn and iph. Report our success or
406 * failure.
407 */
408 boolean_t
tcp_conn_con(tcp_t * tcp,uchar_t * iphdr,mblk_t * idmp,mblk_t ** defermp,ip_recv_attr_t * ira)409 tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, mblk_t *idmp,
410 mblk_t **defermp, ip_recv_attr_t *ira)
411 {
412 sin_t sin;
413 sin6_t sin6;
414 mblk_t *mp;
415 char *optp = NULL;
416 int optlen = 0;
417 conn_t *connp = tcp->tcp_connp;
418
419 if (defermp != NULL)
420 *defermp = NULL;
421
422 if (tcp->tcp_conn.tcp_opts_conn_req != NULL) {
423 /*
424 * Return in T_CONN_CON results of option negotiation through
425 * the T_CONN_REQ. Note: If there is an real end-to-end option
426 * negotiation, then what is received from remote end needs
427 * to be taken into account but there is no such thing (yet?)
428 * in our TCP/IP.
429 * Note: We do not use mi_offset_param() here as
430 * tcp_opts_conn_req contents do not directly come from
431 * an application and are either generated in kernel or
432 * from user input that was already verified.
433 */
434 mp = tcp->tcp_conn.tcp_opts_conn_req;
435 optp = (char *)(mp->b_rptr +
436 ((struct T_conn_req *)mp->b_rptr)->OPT_offset);
437 optlen = (int)
438 ((struct T_conn_req *)mp->b_rptr)->OPT_length;
439 }
440
441 if (IPH_HDR_VERSION(iphdr) == IPV4_VERSION) {
442
443 /* packet is IPv4 */
444 if (connp->conn_family == AF_INET) {
445 sin = sin_null;
446 sin.sin_addr.s_addr = connp->conn_faddr_v4;
447 sin.sin_port = connp->conn_fport;
448 sin.sin_family = AF_INET;
449 mp = mi_tpi_conn_con(NULL, (char *)&sin,
450 (int)sizeof (sin_t), optp, optlen);
451 } else {
452 sin6 = sin6_null;
453 sin6.sin6_addr = connp->conn_faddr_v6;
454 sin6.sin6_port = connp->conn_fport;
455 sin6.sin6_family = AF_INET6;
456 mp = mi_tpi_conn_con(NULL, (char *)&sin6,
457 (int)sizeof (sin6_t), optp, optlen);
458
459 }
460 } else {
461 ip6_t *ip6h = (ip6_t *)iphdr;
462
463 ASSERT(IPH_HDR_VERSION(iphdr) == IPV6_VERSION);
464 ASSERT(connp->conn_family == AF_INET6);
465 sin6 = sin6_null;
466 sin6.sin6_addr = connp->conn_faddr_v6;
467 sin6.sin6_port = connp->conn_fport;
468 sin6.sin6_family = AF_INET6;
469 sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
470 mp = mi_tpi_conn_con(NULL, (char *)&sin6,
471 (int)sizeof (sin6_t), optp, optlen);
472 }
473
474 if (!mp)
475 return (B_FALSE);
476
477 mblk_copycred(mp, idmp);
478
479 if (defermp == NULL) {
480 conn_t *connp = tcp->tcp_connp;
481 if (IPCL_IS_NONSTR(connp)) {
482 (*connp->conn_upcalls->su_connected)
483 (connp->conn_upper_handle, tcp->tcp_connid,
484 ira->ira_cred, ira->ira_cpid);
485 freemsg(mp);
486 } else {
487 if (ira->ira_cred != NULL) {
488 /* So that getpeerucred works for TPI sockfs */
489 mblk_setcred(mp, ira->ira_cred, ira->ira_cpid);
490 }
491 putnext(connp->conn_rq, mp);
492 }
493 } else {
494 *defermp = mp;
495 }
496
497 if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
498 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
499 return (B_TRUE);
500 }
501
502 /*
503 * Successful connect request processing begins when our client passes
504 * a T_CONN_REQ message into tcp_wput(), which performs function calls into
505 * IP and the passes a T_OK_ACK (or T_ERROR_ACK upstream).
506 *
507 * After various error checks are completed, tcp_tpi_connect() lays
508 * the target address and port into the composite header template.
509 * Then we ask IP for information, including a source address if we didn't
510 * already have one. Finally we prepare to send the SYN packet, and then
511 * send up the T_OK_ACK reply message.
512 */
513 void
tcp_tpi_connect(tcp_t * tcp,mblk_t * mp)514 tcp_tpi_connect(tcp_t *tcp, mblk_t *mp)
515 {
516 sin_t *sin;
517 struct T_conn_req *tcr;
518 struct sockaddr *sa;
519 socklen_t len;
520 int error;
521 cred_t *cr;
522 pid_t cpid;
523 conn_t *connp = tcp->tcp_connp;
524 queue_t *q = connp->conn_wq;
525
526 /*
527 * All Solaris components should pass a db_credp
528 * for this TPI message, hence we ASSERT.
529 * But in case there is some other M_PROTO that looks
530 * like a TPI message sent by some other kernel
531 * component, we check and return an error.
532 */
533 cr = msg_getcred(mp, &cpid);
534 ASSERT(cr != NULL);
535 if (cr == NULL) {
536 tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
537 return;
538 }
539
540 tcr = (struct T_conn_req *)mp->b_rptr;
541
542 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
543 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) {
544 tcp_err_ack(tcp, mp, TPROTO, 0);
545 return;
546 }
547
548 /*
549 * Pre-allocate the T_ordrel_ind mblk so that at close time, we
550 * will always have that to send up. Otherwise, we need to do
551 * special handling in case the allocation fails at that time.
552 * If the end point is TPI, the tcp_t can be reused and the
553 * tcp_ordrel_mp may be allocated already.
554 */
555 if (tcp->tcp_ordrel_mp == NULL) {
556 if ((tcp->tcp_ordrel_mp = mi_tpi_ordrel_ind()) == NULL) {
557 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
558 return;
559 }
560 }
561
562 /*
563 * Determine packet type based on type of address passed in
564 * the request should contain an IPv4 or IPv6 address.
565 * Make sure that address family matches the type of
566 * family of the address passed down.
567 */
568 switch (tcr->DEST_length) {
569 default:
570 tcp_err_ack(tcp, mp, TBADADDR, 0);
571 return;
572
573 case (sizeof (sin_t) - sizeof (sin->sin_zero)): {
574 /*
575 * XXX: The check for valid DEST_length was not there
576 * in earlier releases and some buggy
577 * TLI apps (e.g Sybase) got away with not feeding
578 * in sin_zero part of address.
579 * We allow that bug to keep those buggy apps humming.
580 * Test suites require the check on DEST_length.
581 * We construct a new mblk with valid DEST_length
582 * free the original so the rest of the code does
583 * not have to keep track of this special shorter
584 * length address case.
585 */
586 mblk_t *nmp;
587 struct T_conn_req *ntcr;
588 sin_t *nsin;
589
590 nmp = allocb(sizeof (struct T_conn_req) + sizeof (sin_t) +
591 tcr->OPT_length, BPRI_HI);
592 if (nmp == NULL) {
593 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
594 return;
595 }
596 ntcr = (struct T_conn_req *)nmp->b_rptr;
597 bzero(ntcr, sizeof (struct T_conn_req)); /* zero fill */
598 ntcr->PRIM_type = T_CONN_REQ;
599 ntcr->DEST_length = sizeof (sin_t);
600 ntcr->DEST_offset = sizeof (struct T_conn_req);
601
602 nsin = (sin_t *)((uchar_t *)ntcr + ntcr->DEST_offset);
603 *nsin = sin_null;
604 /* Get pointer to shorter address to copy from original mp */
605 sin = (sin_t *)mi_offset_param(mp, tcr->DEST_offset,
606 tcr->DEST_length); /* extract DEST_length worth of sin_t */
607 if (sin == NULL || !OK_32PTR((char *)sin)) {
608 freemsg(nmp);
609 tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
610 return;
611 }
612 nsin->sin_family = sin->sin_family;
613 nsin->sin_port = sin->sin_port;
614 nsin->sin_addr = sin->sin_addr;
615 /* Note:nsin->sin_zero zero-fill with sin_null assign above */
616 nmp->b_wptr = (uchar_t *)&nsin[1];
617 if (tcr->OPT_length != 0) {
618 ntcr->OPT_length = tcr->OPT_length;
619 ntcr->OPT_offset = nmp->b_wptr - nmp->b_rptr;
620 bcopy((uchar_t *)tcr + tcr->OPT_offset,
621 (uchar_t *)ntcr + ntcr->OPT_offset,
622 tcr->OPT_length);
623 nmp->b_wptr += tcr->OPT_length;
624 }
625 freemsg(mp); /* original mp freed */
626 mp = nmp; /* re-initialize original variables */
627 tcr = ntcr;
628 }
629 /* FALLTHRU */
630
631 case sizeof (sin_t):
632 sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
633 sizeof (sin_t));
634 len = sizeof (sin_t);
635 break;
636
637 case sizeof (sin6_t):
638 sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
639 sizeof (sin6_t));
640 len = sizeof (sin6_t);
641 break;
642 }
643
644 error = proto_verify_ip_addr(connp->conn_family, sa, len);
645 if (error != 0) {
646 tcp_err_ack(tcp, mp, TSYSERR, error);
647 return;
648 }
649
650 /*
651 * TODO: If someone in TCPS_TIME_WAIT has this dst/port we
652 * should key on their sequence number and cut them loose.
653 */
654
655 /*
656 * If options passed in, feed it for verification and handling
657 */
658 if (tcr->OPT_length != 0) {
659 mblk_t *ok_mp;
660 mblk_t *discon_mp;
661 mblk_t *conn_opts_mp;
662 int t_error, sys_error, do_disconnect;
663
664 conn_opts_mp = NULL;
665
666 if (tcp_conprim_opt_process(tcp, mp,
667 &do_disconnect, &t_error, &sys_error) < 0) {
668 if (do_disconnect) {
669 ASSERT(t_error == 0 && sys_error == 0);
670 discon_mp = mi_tpi_discon_ind(NULL,
671 ECONNREFUSED, 0);
672 if (!discon_mp) {
673 tcp_err_ack_prim(tcp, mp, T_CONN_REQ,
674 TSYSERR, ENOMEM);
675 return;
676 }
677 ok_mp = mi_tpi_ok_ack_alloc(mp);
678 if (!ok_mp) {
679 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
680 TSYSERR, ENOMEM);
681 return;
682 }
683 qreply(q, ok_mp);
684 qreply(q, discon_mp); /* no flush! */
685 } else {
686 ASSERT(t_error != 0);
687 tcp_err_ack_prim(tcp, mp, T_CONN_REQ, t_error,
688 sys_error);
689 }
690 return;
691 }
692 /*
693 * Success in setting options, the mp option buffer represented
694 * by OPT_length/offset has been potentially modified and
695 * contains results of option processing. We copy it in
696 * another mp to save it for potentially influencing returning
697 * it in T_CONN_CONN.
698 */
699 if (tcr->OPT_length != 0) { /* there are resulting options */
700 conn_opts_mp = copyb(mp);
701 if (!conn_opts_mp) {
702 tcp_err_ack_prim(tcp, mp, T_CONN_REQ,
703 TSYSERR, ENOMEM);
704 return;
705 }
706 ASSERT(tcp->tcp_conn.tcp_opts_conn_req == NULL);
707 tcp->tcp_conn.tcp_opts_conn_req = conn_opts_mp;
708 /*
709 * Note:
710 * These resulting option negotiation can include any
711 * end-to-end negotiation options but there no such
712 * thing (yet?) in our TCP/IP.
713 */
714 }
715 }
716
717 /* call the non-TPI version */
718 error = tcp_do_connect(tcp->tcp_connp, sa, len, cr, cpid);
719 if (error < 0) {
720 mp = mi_tpi_err_ack_alloc(mp, -error, 0);
721 } else if (error > 0) {
722 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, error);
723 } else {
724 mp = mi_tpi_ok_ack_alloc(mp);
725 }
726
727 /*
728 * Note: Code below is the "failure" case
729 */
730 /* return error ack and blow away saved option results if any */
731 connect_failed:
732 if (mp != NULL)
733 putnext(connp->conn_rq, mp);
734 else {
735 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
736 TSYSERR, ENOMEM);
737 }
738 }
739
740 /* Return the TPI/TLI equivalent of our current tcp_state */
741 static int
tcp_tpistate(tcp_t * tcp)742 tcp_tpistate(tcp_t *tcp)
743 {
744 switch (tcp->tcp_state) {
745 case TCPS_IDLE:
746 return (TS_UNBND);
747 case TCPS_LISTEN:
748 /*
749 * Return whether there are outstanding T_CONN_IND waiting
750 * for the matching T_CONN_RES. Therefore don't count q0.
751 */
752 if (tcp->tcp_conn_req_cnt_q > 0)
753 return (TS_WRES_CIND);
754 else
755 return (TS_IDLE);
756 case TCPS_BOUND:
757 return (TS_IDLE);
758 case TCPS_SYN_SENT:
759 return (TS_WCON_CREQ);
760 case TCPS_SYN_RCVD:
761 /*
762 * Note: assumption: this has to the active open SYN_RCVD.
763 * The passive instance is detached in SYN_RCVD stage of
764 * incoming connection processing so we cannot get request
765 * for T_info_ack on it.
766 */
767 return (TS_WACK_CRES);
768 case TCPS_ESTABLISHED:
769 return (TS_DATA_XFER);
770 case TCPS_CLOSE_WAIT:
771 return (TS_WREQ_ORDREL);
772 case TCPS_FIN_WAIT_1:
773 return (TS_WIND_ORDREL);
774 case TCPS_FIN_WAIT_2:
775 return (TS_WIND_ORDREL);
776
777 case TCPS_CLOSING:
778 case TCPS_LAST_ACK:
779 case TCPS_TIME_WAIT:
780 case TCPS_CLOSED:
781 /*
782 * Following TS_WACK_DREQ7 is a rendition of "not
783 * yet TS_IDLE" TPI state. There is no best match to any
784 * TPI state for TCPS_{CLOSING, LAST_ACK, TIME_WAIT} but we
785 * choose a value chosen that will map to TLI/XTI level
786 * state of TSTATECHNG (state is process of changing) which
787 * captures what this dummy state represents.
788 */
789 return (TS_WACK_DREQ7);
790 default:
791 cmn_err(CE_WARN, "tcp_tpistate: strange state (%d) %s",
792 tcp->tcp_state, tcp_display(tcp, NULL,
793 DISP_PORT_ONLY));
794 return (TS_UNBND);
795 }
796 }
797
798 static void
tcp_copy_info(struct T_info_ack * tia,tcp_t * tcp)799 tcp_copy_info(struct T_info_ack *tia, tcp_t *tcp)
800 {
801 tcp_stack_t *tcps = tcp->tcp_tcps;
802 conn_t *connp = tcp->tcp_connp;
803 extern struct T_info_ack tcp_g_t_info_ack;
804 extern struct T_info_ack tcp_g_t_info_ack_v6;
805
806 if (connp->conn_family == AF_INET6)
807 *tia = tcp_g_t_info_ack_v6;
808 else
809 *tia = tcp_g_t_info_ack;
810 tia->CURRENT_state = tcp_tpistate(tcp);
811 tia->OPT_size = tcp_max_optsize;
812 if (tcp->tcp_mss == 0) {
813 /* Not yet set - tcp_open does not set mss */
814 if (connp->conn_ipversion == IPV4_VERSION)
815 tia->TIDU_size = tcps->tcps_mss_def_ipv4;
816 else
817 tia->TIDU_size = tcps->tcps_mss_def_ipv6;
818 } else {
819 tia->TIDU_size = tcp->tcp_mss;
820 }
821 /* TODO: Default ETSDU is 1. Is that correct for tcp? */
822 }
823
824 void
tcp_do_capability_ack(tcp_t * tcp,struct T_capability_ack * tcap,t_uscalar_t cap_bits1)825 tcp_do_capability_ack(tcp_t *tcp, struct T_capability_ack *tcap,
826 t_uscalar_t cap_bits1)
827 {
828 tcap->CAP_bits1 = 0;
829
830 if (cap_bits1 & TC1_INFO) {
831 tcp_copy_info(&tcap->INFO_ack, tcp);
832 tcap->CAP_bits1 |= TC1_INFO;
833 }
834
835 if (cap_bits1 & TC1_ACCEPTOR_ID) {
836 tcap->ACCEPTOR_id = tcp->tcp_acceptor_id;
837 tcap->CAP_bits1 |= TC1_ACCEPTOR_ID;
838 }
839
840 }
841
842 /*
843 * This routine responds to T_CAPABILITY_REQ messages. It is called by
844 * tcp_wput. Much of the T_CAPABILITY_ACK information is copied from
845 * tcp_g_t_info_ack. The current state of the stream is copied from
846 * tcp_state.
847 */
848 void
tcp_capability_req(tcp_t * tcp,mblk_t * mp)849 tcp_capability_req(tcp_t *tcp, mblk_t *mp)
850 {
851 t_uscalar_t cap_bits1;
852 struct T_capability_ack *tcap;
853
854 if (MBLKL(mp) < sizeof (struct T_capability_req)) {
855 freemsg(mp);
856 return;
857 }
858
859 cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1;
860
861 mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
862 mp->b_datap->db_type, T_CAPABILITY_ACK);
863 if (mp == NULL)
864 return;
865
866 tcap = (struct T_capability_ack *)mp->b_rptr;
867 tcp_do_capability_ack(tcp, tcap, cap_bits1);
868
869 putnext(tcp->tcp_connp->conn_rq, mp);
870 }
871
872 /*
873 * This routine responds to T_INFO_REQ messages. It is called by tcp_wput.
874 * Most of the T_INFO_ACK information is copied from tcp_g_t_info_ack.
875 * The current state of the stream is copied from tcp_state.
876 */
877 void
tcp_info_req(tcp_t * tcp,mblk_t * mp)878 tcp_info_req(tcp_t *tcp, mblk_t *mp)
879 {
880 mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO,
881 T_INFO_ACK);
882 if (!mp) {
883 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
884 return;
885 }
886 tcp_copy_info((struct T_info_ack *)mp->b_rptr, tcp);
887 putnext(tcp->tcp_connp->conn_rq, mp);
888 }
889
890 /* Respond to the TPI addr request */
891 void
tcp_addr_req(tcp_t * tcp,mblk_t * mp)892 tcp_addr_req(tcp_t *tcp, mblk_t *mp)
893 {
894 struct sockaddr *sa;
895 mblk_t *ackmp;
896 struct T_addr_ack *taa;
897 conn_t *connp = tcp->tcp_connp;
898 uint_t addrlen;
899
900 /* Make it large enough for worst case */
901 ackmp = reallocb(mp, sizeof (struct T_addr_ack) +
902 2 * sizeof (sin6_t), 1);
903 if (ackmp == NULL) {
904 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
905 return;
906 }
907
908 taa = (struct T_addr_ack *)ackmp->b_rptr;
909
910 bzero(taa, sizeof (struct T_addr_ack));
911 ackmp->b_wptr = (uchar_t *)&taa[1];
912
913 taa->PRIM_type = T_ADDR_ACK;
914 ackmp->b_datap->db_type = M_PCPROTO;
915
916 if (connp->conn_family == AF_INET)
917 addrlen = sizeof (sin_t);
918 else
919 addrlen = sizeof (sin6_t);
920
921 /*
922 * Note: Following code assumes 32 bit alignment of basic
923 * data structures like sin_t and struct T_addr_ack.
924 */
925 if (tcp->tcp_state >= TCPS_BOUND) {
926 /*
927 * Fill in local address first
928 */
929 taa->LOCADDR_offset = sizeof (*taa);
930 taa->LOCADDR_length = addrlen;
931 sa = (struct sockaddr *)&taa[1];
932 (void) conn_getsockname(connp, sa, &addrlen);
933 ackmp->b_wptr += addrlen;
934 }
935 if (tcp->tcp_state >= TCPS_SYN_RCVD) {
936 /*
937 * Fill in Remote address
938 */
939 taa->REMADDR_length = addrlen;
940 /* assumed 32-bit alignment */
941 taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length;
942 sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset);
943 (void) conn_getpeername(connp, sa, &addrlen);
944 ackmp->b_wptr += addrlen;
945 }
946 ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim);
947 putnext(tcp->tcp_connp->conn_rq, ackmp);
948 }
949
950 /*
951 * Swap information between the eager and acceptor for a TLI/XTI client.
952 * The sockfs accept is done on the acceptor stream and control goes
953 * through tcp_tli_accept() and tcp_accept()/tcp_accept_swap() is not
954 * called. In either case, both the eager and listener are in their own
955 * perimeter (squeue) and the code has to deal with potential race.
956 *
957 * See the block comment on top of tcp_accept() and tcp_tli_accept().
958 */
959 static void
tcp_accept_swap(tcp_t * listener,tcp_t * acceptor,tcp_t * eager)960 tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager)
961 {
962 conn_t *econnp, *aconnp;
963
964 ASSERT(eager->tcp_connp->conn_rq == listener->tcp_connp->conn_rq);
965 ASSERT(eager->tcp_detached && !acceptor->tcp_detached);
966 ASSERT(!TCP_IS_SOCKET(acceptor));
967 ASSERT(!TCP_IS_SOCKET(eager));
968 ASSERT(!TCP_IS_SOCKET(listener));
969
970 /*
971 * Trusted Extensions may need to use a security label that is
972 * different from the acceptor's label on MLP and MAC-Exempt
973 * sockets. If this is the case, the required security label
974 * already exists in econnp->conn_ixa->ixa_tsl. Since we make the
975 * acceptor stream refer to econnp we atomatically get that label.
976 */
977
978 acceptor->tcp_detached = B_TRUE;
979 /*
980 * To permit stream re-use by TLI/XTI, the eager needs a copy of
981 * the acceptor id.
982 */
983 eager->tcp_acceptor_id = acceptor->tcp_acceptor_id;
984
985 /* remove eager from listen list... */
986 mutex_enter(&listener->tcp_eager_lock);
987 tcp_eager_unlink(eager);
988 ASSERT(eager->tcp_eager_next_q == NULL &&
989 eager->tcp_eager_last_q == NULL);
990 ASSERT(eager->tcp_eager_next_q0 == NULL &&
991 eager->tcp_eager_prev_q0 == NULL);
992 mutex_exit(&listener->tcp_eager_lock);
993
994 econnp = eager->tcp_connp;
995 aconnp = acceptor->tcp_connp;
996 econnp->conn_rq = aconnp->conn_rq;
997 econnp->conn_wq = aconnp->conn_wq;
998 econnp->conn_rq->q_ptr = econnp;
999 econnp->conn_wq->q_ptr = econnp;
1000
1001 /*
1002 * In the TLI/XTI loopback case, we are inside the listener's squeue,
1003 * which might be a different squeue from our peer TCP instance.
1004 * For TCP Fusion, the peer expects that whenever tcp_detached is
1005 * clear, our TCP queues point to the acceptor's queues. Thus, use
1006 * membar_producer() to ensure that the assignments of conn_rq/conn_wq
1007 * above reach global visibility prior to the clearing of tcp_detached.
1008 */
1009 membar_producer();
1010 eager->tcp_detached = B_FALSE;
1011
1012 ASSERT(eager->tcp_ack_tid == 0);
1013
1014 econnp->conn_dev = aconnp->conn_dev;
1015 econnp->conn_minor_arena = aconnp->conn_minor_arena;
1016
1017 ASSERT(econnp->conn_minor_arena != NULL);
1018 if (econnp->conn_cred != NULL)
1019 crfree(econnp->conn_cred);
1020 econnp->conn_cred = aconnp->conn_cred;
1021 ASSERT(!(econnp->conn_ixa->ixa_free_flags & IXA_FREE_CRED));
1022 econnp->conn_ixa->ixa_cred = econnp->conn_cred;
1023 aconnp->conn_cred = NULL;
1024 econnp->conn_cpid = aconnp->conn_cpid;
1025 ASSERT(econnp->conn_netstack == aconnp->conn_netstack);
1026 ASSERT(eager->tcp_tcps == acceptor->tcp_tcps);
1027
1028 econnp->conn_zoneid = aconnp->conn_zoneid;
1029 econnp->conn_allzones = aconnp->conn_allzones;
1030 econnp->conn_ixa->ixa_zoneid = aconnp->conn_ixa->ixa_zoneid;
1031
1032 econnp->conn_mac_mode = aconnp->conn_mac_mode;
1033 econnp->conn_zone_is_global = aconnp->conn_zone_is_global;
1034 aconnp->conn_mac_mode = CONN_MAC_DEFAULT;
1035
1036 /* Do the IPC initialization */
1037 CONN_INC_REF(econnp);
1038
1039 /* Done with old IPC. Drop its ref on its connp */
1040 CONN_DEC_REF(aconnp);
1041 }
1042
1043 /*
1044 * This runs at the tail end of accept processing on the squeue of the
1045 * new connection.
1046 */
1047 /* ARGSUSED */
1048 static void
tcp_accept_finish(void * arg,mblk_t * mp,void * arg2,ip_recv_attr_t * dummy)1049 tcp_accept_finish(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
1050 {
1051 conn_t *connp = (conn_t *)arg;
1052 tcp_t *tcp = connp->conn_tcp;
1053 queue_t *q = connp->conn_rq;
1054 tcp_stack_t *tcps = tcp->tcp_tcps;
1055 struct stroptions *stropt;
1056 struct sock_proto_props sopp;
1057
1058 /* Should never be called for non-STREAMS sockets */
1059 ASSERT(!IPCL_IS_NONSTR(connp));
1060
1061 /* We should just receive a single mblk that fits a T_discon_ind */
1062 ASSERT(mp->b_cont == NULL);
1063
1064 /*
1065 * Drop the eager's ref on the listener, that was placed when
1066 * this eager began life in tcp_input_listener.
1067 */
1068 CONN_DEC_REF(tcp->tcp_saved_listener->tcp_connp);
1069
1070 tcp->tcp_detached = B_FALSE;
1071
1072 if (tcp->tcp_state <= TCPS_BOUND || tcp->tcp_accept_error) {
1073 /*
1074 * Someone blewoff the eager before we could finish
1075 * the accept.
1076 *
1077 * The only reason eager exists it because we put in
1078 * a ref on it when conn ind went up. We need to send
1079 * a disconnect indication up while the last reference
1080 * on the eager will be dropped by the squeue when we
1081 * return.
1082 */
1083 ASSERT(tcp->tcp_listener == NULL);
1084 if (tcp->tcp_issocket || tcp->tcp_send_discon_ind) {
1085 struct T_discon_ind *tdi;
1086
1087 (void) putnextctl1(q, M_FLUSH, FLUSHRW);
1088 /*
1089 * Let us reuse the incoming mblk to avoid
1090 * memory allocation failure problems. We know
1091 * that the size of the incoming mblk i.e.
1092 * stroptions is greater than sizeof
1093 * T_discon_ind.
1094 */
1095 ASSERT(DB_REF(mp) == 1);
1096 ASSERT(MBLKSIZE(mp) >=
1097 sizeof (struct T_discon_ind));
1098
1099 DB_TYPE(mp) = M_PROTO;
1100 ((union T_primitives *)mp->b_rptr)->type =
1101 T_DISCON_IND;
1102 tdi = (struct T_discon_ind *)mp->b_rptr;
1103 if (tcp->tcp_issocket) {
1104 tdi->DISCON_reason = ECONNREFUSED;
1105 tdi->SEQ_number = 0;
1106 } else {
1107 tdi->DISCON_reason = ENOPROTOOPT;
1108 tdi->SEQ_number =
1109 tcp->tcp_conn_req_seqnum;
1110 }
1111 mp->b_wptr = mp->b_rptr +
1112 sizeof (struct T_discon_ind);
1113 putnext(q, mp);
1114 }
1115 tcp->tcp_hard_binding = B_FALSE;
1116 return;
1117 }
1118
1119 /*
1120 * This is the first time we run on the correct
1121 * queue after tcp_accept. So fix all the q parameters
1122 * here.
1123 *
1124 * Let us reuse the incoming mblk to avoid
1125 * memory allocation failure problems. We know
1126 * that the size of the incoming mblk is at least
1127 * stroptions
1128 */
1129 tcp_get_proto_props(tcp, &sopp);
1130
1131 ASSERT(DB_REF(mp) == 1);
1132 ASSERT(MBLKSIZE(mp) >= sizeof (struct stroptions));
1133
1134 DB_TYPE(mp) = M_SETOPTS;
1135 stropt = (struct stroptions *)mp->b_rptr;
1136 mp->b_wptr = mp->b_rptr + sizeof (struct stroptions);
1137 stropt = (struct stroptions *)mp->b_rptr;
1138 ASSERT(sopp.sopp_flags & (SO_HIWAT|SO_WROFF|SO_MAXBLK));
1139 stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK;
1140 stropt->so_hiwat = sopp.sopp_rxhiwat;
1141 stropt->so_wroff = sopp.sopp_wroff;
1142 stropt->so_maxblk = sopp.sopp_maxblk;
1143
1144 /* Send the options up */
1145 putnext(q, mp);
1146
1147 /*
1148 * Pass up any data and/or a fin that has been received.
1149 *
1150 * Adjust receive window in case it had decreased
1151 * (because there is data <=> tcp_rcv_list != NULL)
1152 * while the connection was detached. Note that
1153 * in case the eager was flow-controlled, w/o this
1154 * code, the rwnd may never open up again!
1155 */
1156 if (tcp->tcp_rcv_list != NULL) {
1157 /* We drain directly in case of fused tcp loopback */
1158
1159 if (!tcp->tcp_fused && canputnext(q)) {
1160 tcp->tcp_rwnd = connp->conn_rcvbuf;
1161 if (tcp->tcp_state >= TCPS_ESTABLISHED &&
1162 tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) {
1163 tcp_xmit_ctl(NULL,
1164 tcp, (tcp->tcp_swnd == 0) ?
1165 tcp->tcp_suna : tcp->tcp_snxt,
1166 tcp->tcp_rnxt, TH_ACK);
1167 }
1168 }
1169
1170 (void) tcp_rcv_drain(tcp);
1171
1172 /*
1173 * For fused tcp loopback, back-enable peer endpoint
1174 * if it's currently flow-controlled.
1175 */
1176 if (tcp->tcp_fused) {
1177 tcp_t *peer_tcp = tcp->tcp_loopback_peer;
1178
1179 ASSERT(peer_tcp != NULL);
1180 ASSERT(peer_tcp->tcp_fused);
1181
1182 mutex_enter(&peer_tcp->tcp_non_sq_lock);
1183 if (peer_tcp->tcp_flow_stopped) {
1184 tcp_clrqfull(peer_tcp);
1185 TCP_STAT(tcps, tcp_fusion_backenabled);
1186 }
1187 mutex_exit(&peer_tcp->tcp_non_sq_lock);
1188 }
1189 }
1190 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg);
1191 if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) {
1192 tcp->tcp_ordrel_done = B_TRUE;
1193 mp = tcp->tcp_ordrel_mp;
1194 tcp->tcp_ordrel_mp = NULL;
1195 putnext(q, mp);
1196 }
1197 tcp->tcp_hard_binding = B_FALSE;
1198
1199 if (connp->conn_keepalive) {
1200 tcp->tcp_ka_last_intrvl = 0;
1201 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer,
1202 tcp->tcp_ka_interval);
1203 }
1204
1205 /*
1206 * At this point, eager is fully established and will
1207 * have the following references -
1208 *
1209 * 2 references for connection to exist (1 for TCP and 1 for IP).
1210 * 1 reference for the squeue which will be dropped by the squeue as
1211 * soon as this function returns.
1212 * There will be 1 additonal reference for being in classifier
1213 * hash list provided something bad hasn't happened.
1214 */
1215 ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
1216 (connp->conn_fanout == NULL && connp->conn_ref >= 3));
1217 }
1218
1219 /*
1220 * Pull a deferred connection indication off of the listener. The caller
1221 * must verify that there is a deferred conn ind under eager_lock before
1222 * calling this function.
1223 */
1224 static mblk_t *
tcp_get_def_conn_ind(tcp_t * listener)1225 tcp_get_def_conn_ind(tcp_t *listener)
1226 {
1227 tcp_t *tail;
1228 tcp_t *tcp;
1229 mblk_t *conn_ind;
1230
1231 ASSERT(MUTEX_HELD(&listener->tcp_eager_lock));
1232 ASSERT(listener->tcp_eager_prev_q0->tcp_conn_def_q0);
1233
1234 tcp = listener->tcp_eager_prev_q0;
1235 /*
1236 * listener->tcp_eager_prev_q0 points to the TAIL of the
1237 * deferred T_conn_ind queue. We need to get to the head
1238 * of the queue in order to send up T_conn_ind the same
1239 * order as how the 3WHS is completed.
1240 */
1241 while (tcp != listener) {
1242 if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0)
1243 break;
1244 else
1245 tcp = tcp->tcp_eager_prev_q0;
1246 }
1247
1248 conn_ind = tcp->tcp_conn.tcp_eager_conn_ind;
1249 tcp->tcp_conn.tcp_eager_conn_ind = NULL;
1250 /* Move from q0 to q */
1251 ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
1252 listener->tcp_conn_req_cnt_q0--;
1253 listener->tcp_conn_req_cnt_q++;
1254 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
1255 tcp->tcp_eager_prev_q0;
1256 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
1257 tcp->tcp_eager_next_q0;
1258 tcp->tcp_eager_prev_q0 = NULL;
1259 tcp->tcp_eager_next_q0 = NULL;
1260 tcp->tcp_conn_def_q0 = B_FALSE;
1261
1262 /* Make sure the tcp isn't in the list of droppables */
1263 ASSERT(tcp->tcp_eager_next_drop_q0 == NULL &&
1264 tcp->tcp_eager_prev_drop_q0 == NULL);
1265
1266 /*
1267 * Insert at end of the queue because sockfs sends
1268 * down T_CONN_RES in chronological order. Leaving
1269 * the older conn indications at front of the queue
1270 * helps reducing search time.
1271 */
1272 tail = listener->tcp_eager_last_q;
1273 if (tail != NULL) {
1274 tail->tcp_eager_next_q = tcp;
1275 } else {
1276 listener->tcp_eager_next_q = tcp;
1277 }
1278 listener->tcp_eager_last_q = tcp;
1279 tcp->tcp_eager_next_q = NULL;
1280
1281 return (conn_ind);
1282 }
1283
1284
1285 /*
1286 * Reply to a clients T_CONN_RES TPI message. This function
1287 * is used only for TLI/XTI listener. Sockfs sends T_CONN_RES
1288 * on the acceptor STREAM and processed in tcp_accept_common().
1289 * Read the block comment on top of tcp_input_listener().
1290 */
1291 void
tcp_tli_accept(tcp_t * listener,mblk_t * mp)1292 tcp_tli_accept(tcp_t *listener, mblk_t *mp)
1293 {
1294 tcp_t *acceptor;
1295 tcp_t *eager;
1296 struct T_conn_res *tcr;
1297 t_uscalar_t acceptor_id;
1298 t_scalar_t seqnum;
1299 mblk_t *discon_mp = NULL;
1300 mblk_t *ok_mp;
1301 mblk_t *mp1;
1302 tcp_stack_t *tcps = listener->tcp_tcps;
1303 conn_t *econnp;
1304
1305 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) {
1306 tcp_err_ack(listener, mp, TPROTO, 0);
1307 return;
1308 }
1309 tcr = (struct T_conn_res *)mp->b_rptr;
1310
1311 /*
1312 * Under ILP32 the stream head points tcr->ACCEPTOR_id at the
1313 * read side queue of the streams device underneath us i.e. the
1314 * read side queue of 'ip'. Since we can't deference QUEUE_ptr we
1315 * look it up in the queue_hash. Under LP64 it sends down the
1316 * minor_t of the accepting endpoint.
1317 *
1318 * Once the acceptor/eager are modified (in tcp_accept_swap) the
1319 * fanout hash lock is held.
1320 * This prevents any thread from entering the acceptor queue from
1321 * below (since it has not been hard bound yet i.e. any inbound
1322 * packets will arrive on the listener conn_t and
1323 * go through the classifier).
1324 * The CONN_INC_REF will prevent the acceptor from closing.
1325 *
1326 * XXX It is still possible for a tli application to send down data
1327 * on the accepting stream while another thread calls t_accept.
1328 * This should not be a problem for well-behaved applications since
1329 * the T_OK_ACK is sent after the queue swapping is completed.
1330 *
1331 * If the accepting fd is the same as the listening fd, avoid
1332 * queue hash lookup since that will return an eager listener in a
1333 * already established state.
1334 */
1335 acceptor_id = tcr->ACCEPTOR_id;
1336 mutex_enter(&listener->tcp_eager_lock);
1337 if (listener->tcp_acceptor_id == acceptor_id) {
1338 eager = listener->tcp_eager_next_q;
1339 /* only count how many T_CONN_INDs so don't count q0 */
1340 if ((listener->tcp_conn_req_cnt_q != 1) ||
1341 (eager->tcp_conn_req_seqnum != tcr->SEQ_number)) {
1342 mutex_exit(&listener->tcp_eager_lock);
1343 tcp_err_ack(listener, mp, TBADF, 0);
1344 return;
1345 }
1346 if (listener->tcp_conn_req_cnt_q0 != 0) {
1347 /* Throw away all the eagers on q0. */
1348 tcp_eager_cleanup(listener, 1);
1349 }
1350 if (listener->tcp_syn_defense) {
1351 listener->tcp_syn_defense = B_FALSE;
1352 if (listener->tcp_ip_addr_cache != NULL) {
1353 kmem_free(listener->tcp_ip_addr_cache,
1354 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
1355 listener->tcp_ip_addr_cache = NULL;
1356 }
1357 }
1358 /*
1359 * Transfer tcp_conn_req_max to the eager so that when
1360 * a disconnect occurs we can revert the endpoint to the
1361 * listen state.
1362 */
1363 eager->tcp_conn_req_max = listener->tcp_conn_req_max;
1364 ASSERT(listener->tcp_conn_req_cnt_q0 == 0);
1365 /*
1366 * Get a reference on the acceptor just like the
1367 * tcp_acceptor_hash_lookup below.
1368 */
1369 acceptor = listener;
1370 CONN_INC_REF(acceptor->tcp_connp);
1371 } else {
1372 acceptor = tcp_acceptor_hash_lookup(acceptor_id, tcps);
1373 if (acceptor == NULL) {
1374 if (listener->tcp_connp->conn_debug) {
1375 (void) strlog(TCP_MOD_ID, 0, 1,
1376 SL_ERROR|SL_TRACE,
1377 "tcp_accept: did not find acceptor 0x%x\n",
1378 acceptor_id);
1379 }
1380 mutex_exit(&listener->tcp_eager_lock);
1381 tcp_err_ack(listener, mp, TPROVMISMATCH, 0);
1382 return;
1383 }
1384 /*
1385 * Verify acceptor state. The acceptable states for an acceptor
1386 * include TCPS_IDLE and TCPS_BOUND.
1387 */
1388 switch (acceptor->tcp_state) {
1389 case TCPS_IDLE:
1390 /* FALLTHRU */
1391 case TCPS_BOUND:
1392 break;
1393 default:
1394 CONN_DEC_REF(acceptor->tcp_connp);
1395 mutex_exit(&listener->tcp_eager_lock);
1396 tcp_err_ack(listener, mp, TOUTSTATE, 0);
1397 return;
1398 }
1399 }
1400
1401 /* The listener must be in TCPS_LISTEN */
1402 if (listener->tcp_state != TCPS_LISTEN) {
1403 CONN_DEC_REF(acceptor->tcp_connp);
1404 mutex_exit(&listener->tcp_eager_lock);
1405 tcp_err_ack(listener, mp, TOUTSTATE, 0);
1406 return;
1407 }
1408
1409 /*
1410 * Rendezvous with an eager connection request packet hanging off
1411 * 'tcp' that has the 'seqnum' tag. We tagged the detached open
1412 * tcp structure when the connection packet arrived in
1413 * tcp_input_listener().
1414 */
1415 seqnum = tcr->SEQ_number;
1416 eager = listener;
1417 do {
1418 eager = eager->tcp_eager_next_q;
1419 if (eager == NULL) {
1420 CONN_DEC_REF(acceptor->tcp_connp);
1421 mutex_exit(&listener->tcp_eager_lock);
1422 tcp_err_ack(listener, mp, TBADSEQ, 0);
1423 return;
1424 }
1425 } while (eager->tcp_conn_req_seqnum != seqnum);
1426 mutex_exit(&listener->tcp_eager_lock);
1427
1428 /*
1429 * At this point, both acceptor and listener have 2 ref
1430 * that they begin with. Acceptor has one additional ref
1431 * we placed in lookup while listener has 3 additional
1432 * ref for being behind the squeue (tcp_accept() is
1433 * done on listener's squeue); being in classifier hash;
1434 * and eager's ref on listener.
1435 */
1436 ASSERT(listener->tcp_connp->conn_ref >= 5);
1437 ASSERT(acceptor->tcp_connp->conn_ref >= 3);
1438
1439 /*
1440 * The eager at this point is set in its own squeue and
1441 * could easily have been killed (tcp_accept_finish will
1442 * deal with that) because of a TH_RST so we can only
1443 * ASSERT for a single ref.
1444 */
1445 ASSERT(eager->tcp_connp->conn_ref >= 1);
1446
1447 /*
1448 * Pre allocate the discon_ind mblk also. tcp_accept_finish will
1449 * use it if something failed.
1450 */
1451 discon_mp = allocb(MAX(sizeof (struct T_discon_ind),
1452 sizeof (struct stroptions)), BPRI_HI);
1453 if (discon_mp == NULL) {
1454 CONN_DEC_REF(acceptor->tcp_connp);
1455 CONN_DEC_REF(eager->tcp_connp);
1456 tcp_err_ack(listener, mp, TSYSERR, ENOMEM);
1457 return;
1458 }
1459
1460 econnp = eager->tcp_connp;
1461
1462 /* Hold a copy of mp, in case reallocb fails */
1463 if ((mp1 = copymsg(mp)) == NULL) {
1464 CONN_DEC_REF(acceptor->tcp_connp);
1465 CONN_DEC_REF(eager->tcp_connp);
1466 freemsg(discon_mp);
1467 tcp_err_ack(listener, mp, TSYSERR, ENOMEM);
1468 return;
1469 }
1470
1471 tcr = (struct T_conn_res *)mp1->b_rptr;
1472
1473 /*
1474 * This is an expanded version of mi_tpi_ok_ack_alloc()
1475 * which allocates a larger mblk and appends the new
1476 * local address to the ok_ack. The address is copied by
1477 * soaccept() for getsockname().
1478 */
1479 {
1480 int extra;
1481
1482 extra = (econnp->conn_family == AF_INET) ?
1483 sizeof (sin_t) : sizeof (sin6_t);
1484
1485 /*
1486 * Try to re-use mp, if possible. Otherwise, allocate
1487 * an mblk and return it as ok_mp. In any case, mp
1488 * is no longer usable upon return.
1489 */
1490 if ((ok_mp = mi_tpi_ok_ack_alloc_extra(mp, extra)) == NULL) {
1491 CONN_DEC_REF(acceptor->tcp_connp);
1492 CONN_DEC_REF(eager->tcp_connp);
1493 freemsg(discon_mp);
1494 /* Original mp has been freed by now, so use mp1 */
1495 tcp_err_ack(listener, mp1, TSYSERR, ENOMEM);
1496 return;
1497 }
1498
1499 mp = NULL; /* We should never use mp after this point */
1500
1501 switch (extra) {
1502 case sizeof (sin_t): {
1503 sin_t *sin = (sin_t *)ok_mp->b_wptr;
1504
1505 ok_mp->b_wptr += extra;
1506 sin->sin_family = AF_INET;
1507 sin->sin_port = econnp->conn_lport;
1508 sin->sin_addr.s_addr = econnp->conn_laddr_v4;
1509 break;
1510 }
1511 case sizeof (sin6_t): {
1512 sin6_t *sin6 = (sin6_t *)ok_mp->b_wptr;
1513
1514 ok_mp->b_wptr += extra;
1515 sin6->sin6_family = AF_INET6;
1516 sin6->sin6_port = econnp->conn_lport;
1517 sin6->sin6_addr = econnp->conn_laddr_v6;
1518 sin6->sin6_flowinfo = econnp->conn_flowinfo;
1519 if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) &&
1520 (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) {
1521 sin6->sin6_scope_id =
1522 econnp->conn_ixa->ixa_scopeid;
1523 } else {
1524 sin6->sin6_scope_id = 0;
1525 }
1526 sin6->__sin6_src_id = 0;
1527 break;
1528 }
1529 default:
1530 break;
1531 }
1532 ASSERT(ok_mp->b_wptr <= ok_mp->b_datap->db_lim);
1533 }
1534
1535 /*
1536 * If there are no options we know that the T_CONN_RES will
1537 * succeed. However, we can't send the T_OK_ACK upstream until
1538 * the tcp_accept_swap is done since it would be dangerous to
1539 * let the application start using the new fd prior to the swap.
1540 */
1541 tcp_accept_swap(listener, acceptor, eager);
1542
1543 /*
1544 * tcp_accept_swap unlinks eager from listener but does not drop
1545 * the eager's reference on the listener.
1546 */
1547 ASSERT(eager->tcp_listener == NULL);
1548 ASSERT(listener->tcp_connp->conn_ref >= 5);
1549
1550 /*
1551 * The eager is now associated with its own queue. Insert in
1552 * the hash so that the connection can be reused for a future
1553 * T_CONN_RES.
1554 */
1555 tcp_acceptor_hash_insert(acceptor_id, eager);
1556
1557 /*
1558 * We now do the processing of options with T_CONN_RES.
1559 * We delay till now since we wanted to have queue to pass to
1560 * option processing routines that points back to the right
1561 * instance structure which does not happen until after
1562 * tcp_accept_swap().
1563 *
1564 * Note:
1565 * The sanity of the logic here assumes that whatever options
1566 * are appropriate to inherit from listner=>eager are done
1567 * before this point, and whatever were to be overridden (or not)
1568 * in transfer logic from eager=>acceptor in tcp_accept_swap().
1569 * [ Warning: acceptor endpoint can have T_OPTMGMT_REQ done to it
1570 * before its ACCEPTOR_id comes down in T_CONN_RES ]
1571 * This may not be true at this point in time but can be fixed
1572 * independently. This option processing code starts with
1573 * the instantiated acceptor instance and the final queue at
1574 * this point.
1575 */
1576
1577 if (tcr->OPT_length != 0) {
1578 /* Options to process */
1579 int t_error = 0;
1580 int sys_error = 0;
1581 int do_disconnect = 0;
1582
1583 if (tcp_conprim_opt_process(eager, mp1,
1584 &do_disconnect, &t_error, &sys_error) < 0) {
1585 eager->tcp_accept_error = 1;
1586 if (do_disconnect) {
1587 /*
1588 * An option failed which does not allow
1589 * connection to be accepted.
1590 *
1591 * We allow T_CONN_RES to succeed and
1592 * put a T_DISCON_IND on the eager queue.
1593 */
1594 ASSERT(t_error == 0 && sys_error == 0);
1595 eager->tcp_send_discon_ind = 1;
1596 } else {
1597 ASSERT(t_error != 0);
1598 freemsg(ok_mp);
1599 /*
1600 * Original mp was either freed or set
1601 * to ok_mp above, so use mp1 instead.
1602 */
1603 tcp_err_ack(listener, mp1, t_error, sys_error);
1604 goto finish;
1605 }
1606 }
1607 /*
1608 * Most likely success in setting options (except if
1609 * eager->tcp_send_discon_ind set).
1610 * mp1 option buffer represented by OPT_length/offset
1611 * potentially modified and contains results of setting
1612 * options at this point
1613 */
1614 }
1615
1616 /* We no longer need mp1, since all options processing has passed */
1617 freemsg(mp1);
1618
1619 putnext(listener->tcp_connp->conn_rq, ok_mp);
1620
1621 mutex_enter(&listener->tcp_eager_lock);
1622 if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) {
1623 mblk_t *conn_ind;
1624
1625 /*
1626 * This path should not be executed if listener and
1627 * acceptor streams are the same.
1628 */
1629 ASSERT(listener != acceptor);
1630 conn_ind = tcp_get_def_conn_ind(listener);
1631 mutex_exit(&listener->tcp_eager_lock);
1632 putnext(listener->tcp_connp->conn_rq, conn_ind);
1633 } else {
1634 mutex_exit(&listener->tcp_eager_lock);
1635 }
1636
1637 /*
1638 * Done with the acceptor - free it
1639 *
1640 * Note: from this point on, no access to listener should be made
1641 * as listener can be equal to acceptor.
1642 */
1643 finish:
1644 ASSERT(acceptor->tcp_detached);
1645 acceptor->tcp_connp->conn_rq = NULL;
1646 ASSERT(!IPCL_IS_NONSTR(acceptor->tcp_connp));
1647 acceptor->tcp_connp->conn_wq = NULL;
1648 (void) tcp_clean_death(acceptor, 0);
1649 CONN_DEC_REF(acceptor->tcp_connp);
1650
1651 /*
1652 * We pass discon_mp to tcp_accept_finish to get on the right squeue.
1653 *
1654 * It will update the setting for sockfs/stream head and also take
1655 * care of any data that arrived before accept() wad called.
1656 * In case we already received a FIN then tcp_accept_finish will send up
1657 * the ordrel. It will also send up a window update if the window
1658 * has opened up.
1659 */
1660
1661 /*
1662 * XXX: we currently have a problem if XTI application closes the
1663 * acceptor stream in between. This problem exists in on10-gate also
1664 * and is well know but nothing can be done short of major rewrite
1665 * to fix it. Now it is possible to take care of it by assigning TLI/XTI
1666 * eager same squeue as listener (we can distinguish non socket
1667 * listeners at the time of handling a SYN in tcp_input_listener)
1668 * and do most of the work that tcp_accept_finish does here itself
1669 * and then get behind the acceptor squeue to access the acceptor
1670 * queue.
1671 */
1672 /*
1673 * We already have a ref on tcp so no need to do one before squeue_enter
1674 */
1675 SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, discon_mp,
1676 tcp_accept_finish, eager->tcp_connp, NULL, SQ_FILL,
1677 SQTAG_TCP_ACCEPT_FINISH);
1678 }
1679
1680
1681 /*
1682 * This is the STREAMS entry point for T_CONN_RES coming down on
1683 * Acceptor STREAM when sockfs listener does accept processing.
1684 * Read the block comment on top of tcp_input_listener().
1685 */
1686 void
tcp_tpi_accept(queue_t * q,mblk_t * mp)1687 tcp_tpi_accept(queue_t *q, mblk_t *mp)
1688 {
1689 queue_t *rq = RD(q);
1690 struct T_conn_res *conn_res;
1691 tcp_t *eager;
1692 tcp_t *listener;
1693 struct T_ok_ack *ok;
1694 t_scalar_t PRIM_type;
1695 mblk_t *discon_mp;
1696 conn_t *econnp;
1697 cred_t *cr;
1698
1699 ASSERT(DB_TYPE(mp) == M_PROTO);
1700
1701 /*
1702 * All Solaris components should pass a db_credp
1703 * for this TPI message, hence we ASSERT.
1704 * But in case there is some other M_PROTO that looks
1705 * like a TPI message sent by some other kernel
1706 * component, we check and return an error.
1707 */
1708 cr = msg_getcred(mp, NULL);
1709 ASSERT(cr != NULL);
1710 if (cr == NULL) {
1711 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, EINVAL);
1712 if (mp != NULL)
1713 putnext(rq, mp);
1714 return;
1715 }
1716 conn_res = (struct T_conn_res *)mp->b_rptr;
1717 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
1718 if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_res)) {
1719 mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0);
1720 if (mp != NULL)
1721 putnext(rq, mp);
1722 return;
1723 }
1724 switch (conn_res->PRIM_type) {
1725 case O_T_CONN_RES:
1726 case T_CONN_RES:
1727 /*
1728 * We pass up an err ack if allocb fails. This will
1729 * cause sockfs to issue a T_DISCON_REQ which will cause
1730 * tcp_eager_blowoff to be called. sockfs will then call
1731 * rq->q_qinfo->qi_qclose to cleanup the acceptor stream.
1732 * we need to do the allocb up here because we have to
1733 * make sure rq->q_qinfo->qi_qclose still points to the
1734 * correct function (tcp_tpi_close_accept) in case allocb
1735 * fails.
1736 */
1737 bcopy(mp->b_rptr + conn_res->OPT_offset,
1738 &eager, conn_res->OPT_length);
		PRIM_type = conn_res->PRIM_type;
		mp->b_datap->db_type = M_PCPROTO;
		mp->b_wptr = mp->b_rptr + sizeof (struct T_ok_ack);
		ok = (struct T_ok_ack *)mp->b_rptr;
		ok->PRIM_type = T_OK_ACK;
		ok->CORRECT_prim = PRIM_type;
		econnp = eager->tcp_connp;
		econnp->conn_dev = (dev_t)RD(q)->q_ptr;
		econnp->conn_minor_arena = (vmem_t *)(WR(q)->q_ptr);
		econnp->conn_rq = rq;
		econnp->conn_wq = q;
		rq->q_ptr = econnp;
		rq->q_qinfo = &tcp_rinitv4;	/* No open - same as rinitv6 */
		q->q_ptr = econnp;
		q->q_qinfo = &tcp_winit;
		listener = eager->tcp_listener;

		/*
		 * Preallocate the discon_ind mblk as well; tcp_accept_finish
		 * will use it if something fails. It is sized with MAX()
		 * so the same mblk can carry either a T_discon_ind or a
		 * struct stroptions.
		 */
		discon_mp = allocb(MAX(sizeof (struct T_discon_ind),
		    sizeof (struct stroptions)), BPRI_HI);

		if (discon_mp == NULL) {
			mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0);
			if (mp != NULL)
				putnext(rq, mp);
			return;
		}

		eager->tcp_issocket = B_TRUE;

		ASSERT(econnp->conn_netstack ==
		    listener->tcp_connp->conn_netstack);
		ASSERT(eager->tcp_tcps == listener->tcp_tcps);

		/* Put the ref for IP */
		CONN_INC_REF(econnp);

		/*
		 * We should have a minimum of 3 references on the conn
		 * at this point. One each for TCP and IP and one for
		 * the T_conn_ind that was sent up when the 3-way handshake
		 * completed. In the normal case we would also have another
		 * reference (making a total of 4) for the conn being in the
		 * classifier hash list. However the eager could have received
		 * an RST subsequently and tcp_closei_local could have removed
		 * the eager from the classifier hash list, hence we can't
		 * assert that reference.
		 */
		ASSERT(econnp->conn_ref >= 3);

		mutex_enter(&listener->tcp_eager_lock);
		if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) {
			mblk_t *conn_ind = tcp_get_def_conn_ind(listener);

			/* Need to get inside the listener perimeter */
			CONN_INC_REF(listener->tcp_connp);
			SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp,
			    conn_ind, tcp_send_pending, listener->tcp_connp,
			    NULL, SQ_FILL, SQTAG_TCP_SEND_PENDING);
		}
		tcp_eager_unlink(eager);
		mutex_exit(&listener->tcp_eager_lock);

		/*
		 * At this point, the eager is detached from the listener
		 * but we still have an extra ref on the eager (apart from
		 * the usual tcp references). The ref was placed in
		 * tcp_input_data before sending the conn_ind in
		 * tcp_send_conn_ind. The ref will be dropped in
		 * tcp_accept_finish().
		 */
		SQUEUE_ENTER_ONE(econnp->conn_sqp, discon_mp, tcp_accept_finish,
		    econnp, NULL, SQ_NODRAIN, SQTAG_TCP_ACCEPT_FINISH_Q0);

		/*
		 * Send the new local address also up to sockfs. There
		 * should already be enough space in the mp that came
		 * down from soaccept().
		 */
		if (econnp->conn_family == AF_INET) {
			sin_t *sin;

			ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >=
			    (sizeof (struct T_ok_ack) + sizeof (sin_t)));
			sin = (sin_t *)mp->b_wptr;
			mp->b_wptr += sizeof (sin_t);
			sin->sin_family = AF_INET;
			sin->sin_port = econnp->conn_lport;
			sin->sin_addr.s_addr = econnp->conn_laddr_v4;
		} else {
			sin6_t *sin6;

			ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >=
			    sizeof (struct T_ok_ack) + sizeof (sin6_t));
			sin6 = (sin6_t *)mp->b_wptr;
			mp->b_wptr += sizeof (sin6_t);
			sin6->sin6_family = AF_INET6;
			sin6->sin6_port = econnp->conn_lport;
			sin6->sin6_addr = econnp->conn_laddr_v6;
			if (econnp->conn_ipversion == IPV4_VERSION)
				sin6->sin6_flowinfo = 0;
			else
				sin6->sin6_flowinfo = econnp->conn_flowinfo;
			if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) &&
			    (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) {
				sin6->sin6_scope_id =
				    econnp->conn_ixa->ixa_scopeid;
			} else {
				sin6->sin6_scope_id = 0;
			}
			sin6->__sin6_src_id = 0;
		}
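
		/*
		 * The ack going upstream now looks like (AF_INET case):
		 *
		 *	b_rptr -> struct T_ok_ack (PRIM_type = T_OK_ACK)
		 *	b_rptr + sizeof (struct T_ok_ack) -> sin_t with the
		 *	    eager's local address and port
		 *	b_wptr -> end of the appended address
		 *
		 * Presumably sockfs uses the appended address to complete
		 * the accept without a separate address query; that use is
		 * an assumption here, not something this function enforces.
		 */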

		putnext(rq, mp);
		return;
	default:
		mp = mi_tpi_err_ack_alloc(mp, TNOTSUPPORT, 0);
		if (mp != NULL)
			putnext(rq, mp);
		return;
	}
}

/*
 * The function called through squeue to get behind the listener's perimeter
 * to send a deferred conn_ind.
 */
/* ARGSUSED */
void
tcp_send_pending(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
{
	conn_t *lconnp = (conn_t *)arg;
	tcp_t *listener = lconnp->conn_tcp;
	struct T_conn_ind *conn_ind;
	tcp_t *tcp;

	conn_ind = (struct T_conn_ind *)mp->b_rptr;
	bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp,
	    conn_ind->OPT_length);

	if (listener->tcp_state != TCPS_LISTEN) {
		/*
		 * If the listener has closed, it would have caused a
		 * cleanup/blowoff to happen for the eager, so
		 * we don't need to do anything more.
		 */
		freemsg(mp);
		return;
	}

	putnext(lconnp->conn_rq, mp);
}

/*
 * Sends the T_CONN_IND to the listener. The caller calls this
 * function via squeue to get inside the listener's perimeter;
 * once the 3-way handshake is done, a T_CONN_IND needs to be
 * sent. As an optimization, the caller can call this directly
 * if the listener's perimeter is the same as the eager's.
 */
/* ARGSUSED */
void
tcp_send_conn_ind(void *arg, mblk_t *mp, void *arg2)
{
	conn_t *lconnp = (conn_t *)arg;
	tcp_t *listener = lconnp->conn_tcp;
	tcp_t *tcp;
	struct T_conn_ind *conn_ind;
	ipaddr_t *addr_cache;
	boolean_t need_send_conn_ind = B_FALSE;
	tcp_stack_t *tcps = listener->tcp_tcps;

	/* retrieve the eager */
	conn_ind = (struct T_conn_ind *)mp->b_rptr;
	ASSERT(conn_ind->OPT_offset != 0 &&
	    conn_ind->OPT_length == sizeof (intptr_t));
	bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp,
	    conn_ind->OPT_length);

	/*
	 * TLI/XTI applications will get confused by
	 * sending the eager as an option, since that violates
	 * the option semantics. So remove the eager as an
	 * option, since a TLI/XTI app doesn't need it anyway.
	 */
	if (!TCP_IS_SOCKET(listener)) {
		conn_ind->OPT_length = 0;
		conn_ind->OPT_offset = 0;
	}
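
	/*
	 * E.g. a TLI client doing a t_listen(3NSL) would otherwise see
	 * OPT_length/OPT_offset describing a raw kernel pointer, which is
	 * meaningless at user level.  Illustrative user-level sketch
	 * (an assumption, shown only to motivate the stripping above):
	 *
	 *	struct t_call *call =
	 *	    (struct t_call *)t_alloc(fd, T_CALL, T_ALL);
	 *	(void) t_listen(fd, call);
	 *	assert(call->opt.len == 0);
	 */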
	if (listener->tcp_state != TCPS_LISTEN) {
		/*
		 * If the listener has closed, it would have caused a
		 * cleanup/blowoff to happen for the eager. We
		 * just need to return.
		 */
		freemsg(mp);
		return;
	}

	/*
	 * If the conn_req_q is full, defer passing up the
	 * T_CONN_IND until space is available after t_accept()
	 * processing.
	 */
	mutex_enter(&listener->tcp_eager_lock);

	/*
	 * Take the eager out, if it is in the list of droppable eagers,
	 * as we are here because the 3-way handshake is over.
	 */
	MAKE_UNDROPPABLE(tcp);

	if (listener->tcp_conn_req_cnt_q < listener->tcp_conn_req_max) {
		tcp_t *tail;

		/*
		 * The eager already has an extra ref put in tcp_input_data
		 * so that it stays till accept comes back even though it
		 * might get into TCPS_CLOSED as a result of a TH_RST etc.
		 */
		ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
		listener->tcp_conn_req_cnt_q0--;
		listener->tcp_conn_req_cnt_q++;

		/* Move from SYN_RCVD to ESTABLISHED list */
		tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
		    tcp->tcp_eager_prev_q0;
		tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
		    tcp->tcp_eager_next_q0;
		tcp->tcp_eager_prev_q0 = NULL;
		tcp->tcp_eager_next_q0 = NULL;

		/*
		 * Insert at the end of the queue because sockfs
		 * sends down T_CONN_RES in chronological
		 * order. Leaving the older conn indications
		 * at the front of the queue helps reduce
		 * search time.
		 */
		tail = listener->tcp_eager_last_q;
		if (tail != NULL)
			tail->tcp_eager_next_q = tcp;
		else
			listener->tcp_eager_next_q = tcp;
		listener->tcp_eager_last_q = tcp;
		tcp->tcp_eager_next_q = NULL;
		/*
		 * Delay sending up the T_conn_ind until we are
		 * done with the eager. Once we have sent up
		 * the T_conn_ind, the accept can potentially complete
		 * at any time and release the refhold we have on the eager.
		 */
		need_send_conn_ind = B_TRUE;
	} else {
		/*
		 * Defer the connection on q0 and set the deferred
		 * connection bit to true.
		 */
		tcp->tcp_conn_def_q0 = B_TRUE;

		/* take tcp out of q0 ... */
		tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
		    tcp->tcp_eager_next_q0;
		tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
		    tcp->tcp_eager_prev_q0;

		/* ... and place it at the end of q0 */
		tcp->tcp_eager_prev_q0 = listener->tcp_eager_prev_q0;
		tcp->tcp_eager_next_q0 = listener;
		listener->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp;
		listener->tcp_eager_prev_q0 = tcp;
		tcp->tcp_conn.tcp_eager_conn_ind = mp;
	}
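
	/*
	 * Rough sketch of the deferred pickup: once t_accept() processing
	 * frees a slot, tcp_tpi_accept() notices the deferred bit on the
	 * eager at the tail of q0 and retrieves the saved indication via
	 * tcp_get_def_conn_ind().  Conceptually (this is a sketch of the
	 * idea, not that function's actual implementation):
	 *
	 *	tcp = listener->tcp_eager_prev_q0;  (deferred ones at tail)
	 *	ASSERT(tcp->tcp_conn_def_q0);
	 *	tcp->tcp_conn_def_q0 = B_FALSE;
	 *	mp = tcp->tcp_conn.tcp_eager_conn_ind;
	 *	tcp->tcp_conn.tcp_eager_conn_ind = NULL;
	 */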

	/* we have timed out before */
	if (tcp->tcp_syn_rcvd_timeout != 0) {
		tcp->tcp_syn_rcvd_timeout = 0;
		listener->tcp_syn_rcvd_timeout--;
		if (listener->tcp_syn_defense &&
		    listener->tcp_syn_rcvd_timeout <=
		    (tcps->tcps_conn_req_max_q0 >> 5) &&
		    10*MINUTES < TICK_TO_MSEC(ddi_get_lbolt64() -
		    listener->tcp_last_rcv_lbolt)) {
			/*
			 * Turn off the defense mode if we
			 * believe the SYN attack is over.
			 */
			listener->tcp_syn_defense = B_FALSE;
			if (listener->tcp_ip_addr_cache) {
				kmem_free((void *)listener->tcp_ip_addr_cache,
				    IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
				listener->tcp_ip_addr_cache = NULL;
			}
		}
	}
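
	/*
	 * Worked example of the exit condition above: with
	 * tcps_conn_req_max_q0 = 1024, the defense is lifted once at most
	 * 1024 >> 5 = 32 previously-timed-out eagers remain accounted for
	 * and more than 10 minutes worth of ticks have elapsed since
	 * tcp_last_rcv_lbolt was updated.
	 */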
	addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache);
	if (addr_cache != NULL) {
		/*
		 * We have finished a 3-way handshake with this
		 * remote host. This proves the IP addr is good.
		 * Cache it!
		 */
		addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] =
		    tcp->tcp_connp->conn_faddr_v4;
	}
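
	/*
	 * The lookup side of this cache lives in the listener's input
	 * path; a sketch of the idea (an assumption for illustration --
	 * this function only fills the cache):
	 *
	 *	faddr = connp->conn_faddr_v4;
	 *	if (addr_cache[IP_ADDR_CACHE_HASH(faddr)] == faddr)
	 *		... peer previously completed a handshake,
	 *		    so prefer not to drop its new SYNs ...
	 */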
	mutex_exit(&listener->tcp_eager_lock);
	if (need_send_conn_ind)
		putnext(lconnp->conn_rq, mp);
}