xref: /titanic_50/usr/src/uts/common/inet/tcp/tcp_socket.c (revision e6f8def1ace27f327240a0b4b090911007f71137)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /* This file contains all TCP kernel socket related functions. */
28 
29 #include <sys/types.h>
30 #include <sys/strlog.h>
31 #include <sys/policy.h>
32 #include <sys/sockio.h>
33 #include <sys/strsubr.h>
34 #include <sys/strsun.h>
35 #include <sys/squeue_impl.h>
36 #include <sys/squeue.h>
37 #include <sys/tihdr.h>
38 #include <sys/timod.h>
39 #include <sys/tpicommon.h>
40 #include <sys/socketvar.h>
41 
42 #include <inet/common.h>
43 #include <inet/proto_set.h>
44 #include <inet/ip.h>
45 #include <inet/tcp.h>
46 #include <inet/tcp_impl.h>
47 
48 static void	tcp_activate(sock_lower_handle_t, sock_upper_handle_t,
49 		    sock_upcalls_t *, int, cred_t *);
50 static int	tcp_accept(sock_lower_handle_t, sock_lower_handle_t,
51 		    sock_upper_handle_t, cred_t *);
52 static int	tcp_bind(sock_lower_handle_t, struct sockaddr *,
53 		    socklen_t, cred_t *);
54 static int	tcp_listen(sock_lower_handle_t, int, cred_t *);
55 static int	tcp_connect(sock_lower_handle_t, const struct sockaddr *,
56 		    socklen_t, sock_connid_t *, cred_t *);
57 static int	tcp_getsockopt(sock_lower_handle_t, int, int, void *,
58 		    socklen_t *, cred_t *);
59 static int	tcp_setsockopt(sock_lower_handle_t, int, int, const void *,
60 		    socklen_t, cred_t *);
61 static int	tcp_sendmsg(sock_lower_handle_t, mblk_t *, struct nmsghdr *,
62 		    cred_t *cr);
63 static int	tcp_shutdown(sock_lower_handle_t, int, cred_t *);
64 static void	tcp_clr_flowctrl(sock_lower_handle_t);
65 static int	tcp_ioctl(sock_lower_handle_t, int, intptr_t, int, int32_t *,
66 		    cred_t *);
67 static int	tcp_close(sock_lower_handle_t, int, cred_t *);
68 
69 sock_downcalls_t sock_tcp_downcalls = {
70 	tcp_activate,
71 	tcp_accept,
72 	tcp_bind,
73 	tcp_listen,
74 	tcp_connect,
75 	tcp_getpeername,
76 	tcp_getsockname,
77 	tcp_getsockopt,
78 	tcp_setsockopt,
79 	tcp_sendmsg,
80 	NULL,
81 	NULL,
82 	NULL,
83 	tcp_shutdown,
84 	tcp_clr_flowctrl,
85 	tcp_ioctl,
86 	tcp_close,
87 };
88 
89 /* ARGSUSED */
90 static void
91 tcp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle,
92     sock_upcalls_t *sock_upcalls, int flags, cred_t *cr)
93 {
94 	conn_t *connp = (conn_t *)proto_handle;
95 	struct sock_proto_props sopp;
96 	extern struct module_info tcp_rinfo;
97 
98 	ASSERT(connp->conn_upper_handle == NULL);
99 
100 	/* All Solaris components should pass a cred for this operation. */
101 	ASSERT(cr != NULL);
102 
103 	sopp.sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT |
104 	    SOCKOPT_MAXPSZ | SOCKOPT_MAXBLK | SOCKOPT_RCVTIMER |
105 	    SOCKOPT_RCVTHRESH | SOCKOPT_MAXADDRLEN | SOCKOPT_MINPSZ;
106 
107 	sopp.sopp_rxhiwat = SOCKET_RECVHIWATER;
108 	sopp.sopp_rxlowat = SOCKET_RECVLOWATER;
109 	sopp.sopp_maxpsz = INFPSZ;
110 	sopp.sopp_maxblk = INFPSZ;
111 	sopp.sopp_rcvtimer = SOCKET_TIMER_INTERVAL;
112 	sopp.sopp_rcvthresh = SOCKET_RECVHIWATER >> 3;
113 	sopp.sopp_maxaddrlen = sizeof (sin6_t);
114 	sopp.sopp_minpsz = (tcp_rinfo.mi_minpsz == 1) ? 0 :
115 	    tcp_rinfo.mi_minpsz;
116 
117 	connp->conn_upcalls = sock_upcalls;
118 	connp->conn_upper_handle = sock_handle;
119 
120 	ASSERT(connp->conn_rcvbuf != 0 &&
121 	    connp->conn_rcvbuf == connp->conn_tcp->tcp_rwnd);
122 	(*sock_upcalls->su_set_proto_props)(sock_handle, &sopp);
123 }
124 
125 static int
126 tcp_accept(sock_lower_handle_t lproto_handle,
127     sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle,
128     cred_t *cr)
129 {
130 	conn_t *lconnp, *econnp;
131 	tcp_t *listener, *eager;
132 
133 	lconnp = (conn_t *)lproto_handle;
134 	listener = lconnp->conn_tcp;
135 	ASSERT(listener->tcp_state == TCPS_LISTEN);
136 	econnp = (conn_t *)eproto_handle;
137 	eager = econnp->conn_tcp;
138 	ASSERT(eager->tcp_listener != NULL);
139 
140 	/*
141 	 * It is OK to manipulate these fields outside the eager's squeue
142 	 * because they will not start being used until tcp_accept_finish
143 	 * has been called.
144 	 */
145 	ASSERT(lconnp->conn_upper_handle != NULL);
146 	ASSERT(econnp->conn_upper_handle == NULL);
147 	econnp->conn_upper_handle = sock_handle;
148 	econnp->conn_upcalls = lconnp->conn_upcalls;
149 	ASSERT(IPCL_IS_NONSTR(econnp));
150 	return (tcp_accept_common(lconnp, econnp, cr));
151 }
152 
153 static int
154 tcp_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
155     socklen_t len, cred_t *cr)
156 {
157 	int 		error;
158 	conn_t		*connp = (conn_t *)proto_handle;
159 	squeue_t	*sqp = connp->conn_sqp;
160 
161 	/* All Solaris components should pass a cred for this operation. */
162 	ASSERT(cr != NULL);
163 
164 	ASSERT(sqp != NULL);
165 	ASSERT(connp->conn_upper_handle != NULL);
166 
167 	error = squeue_synch_enter(sqp, connp, NULL);
168 	if (error != 0) {
169 		/* failed to enter */
170 		return (ENOSR);
171 	}
172 
173 	/* binding to a NULL address really means unbind */
174 	if (sa == NULL) {
175 		if (connp->conn_tcp->tcp_state < TCPS_LISTEN)
176 			error = tcp_do_unbind(connp);
177 		else
178 			error = EINVAL;
179 	} else {
180 		error = tcp_do_bind(connp, sa, len, cr, B_TRUE);
181 	}
182 
183 	squeue_synch_exit(sqp, connp);
184 
185 	if (error < 0) {
186 		if (error == -TOUTSTATE)
187 			error = EINVAL;
188 		else
189 			error = proto_tlitosyserr(-error);
190 	}
191 
192 	return (error);
193 }
194 
195 /*
196  * SOP_LISTEN() calls into tcp_listen().
197  */
198 /* ARGSUSED */
199 static int
200 tcp_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr)
201 {
202 	conn_t	*connp = (conn_t *)proto_handle;
203 	int 	error;
204 	squeue_t *sqp = connp->conn_sqp;
205 
206 	ASSERT(connp->conn_upper_handle != NULL);
207 
208 	/* All Solaris components should pass a cred for this operation. */
209 	ASSERT(cr != NULL);
210 
211 	error = squeue_synch_enter(sqp, connp, NULL);
212 	if (error != 0) {
213 		/* failed to enter */
214 		return (ENOBUFS);
215 	}
216 
217 	error = tcp_do_listen(connp, NULL, 0, backlog, cr, B_FALSE);
218 	if (error == 0) {
219 		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
220 		    SOCK_OPCTL_ENAB_ACCEPT, (uintptr_t)backlog);
221 	} else if (error < 0) {
222 		if (error == -TOUTSTATE)
223 			error = EINVAL;
224 		else
225 			error = proto_tlitosyserr(-error);
226 	}
227 	squeue_synch_exit(sqp, connp);
228 	return (error);
229 }
230 
231 static int
232 tcp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
233     socklen_t len, sock_connid_t *id, cred_t *cr)
234 {
235 	conn_t		*connp = (conn_t *)proto_handle;
236 	squeue_t	*sqp = connp->conn_sqp;
237 	int		error;
238 
239 	ASSERT(connp->conn_upper_handle != NULL);
240 
241 	/* All Solaris components should pass a cred for this operation. */
242 	ASSERT(cr != NULL);
243 
244 	error = proto_verify_ip_addr(connp->conn_family, sa, len);
245 	if (error != 0) {
246 		return (error);
247 	}
248 
249 	error = squeue_synch_enter(sqp, connp, NULL);
250 	if (error != 0) {
251 		/* failed to enter */
252 		return (ENOSR);
253 	}
254 
255 	/*
256 	 * TCP supports quick connect, so no need to do an implicit bind
257 	 */
258 	error = tcp_do_connect(connp, sa, len, cr, curproc->p_pid);
259 	if (error == 0) {
260 		*id = connp->conn_tcp->tcp_connid;
261 	} else if (error < 0) {
262 		if (error == -TOUTSTATE) {
263 			switch (connp->conn_tcp->tcp_state) {
264 			case TCPS_SYN_SENT:
265 				error = EALREADY;
266 				break;
267 			case TCPS_ESTABLISHED:
268 				error = EISCONN;
269 				break;
270 			case TCPS_LISTEN:
271 				error = EOPNOTSUPP;
272 				break;
273 			default:
274 				error = EINVAL;
275 				break;
276 			}
277 		} else {
278 			error = proto_tlitosyserr(-error);
279 		}
280 	}
281 
282 	if (connp->conn_tcp->tcp_loopback) {
283 		struct sock_proto_props sopp;
284 
285 		sopp.sopp_flags = SOCKOPT_LOOPBACK;
286 		sopp.sopp_loopback = B_TRUE;
287 
288 		(*connp->conn_upcalls->su_set_proto_props)(
289 		    connp->conn_upper_handle, &sopp);
290 	}
291 done:
292 	squeue_synch_exit(sqp, connp);
293 
294 	return ((error == 0) ? EINPROGRESS : error);
295 }
296 
297 /* ARGSUSED3 */
298 int
299 tcp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr,
300     socklen_t *addrlenp, cred_t *cr)
301 {
302 	conn_t	*connp = (conn_t *)proto_handle;
303 	tcp_t	*tcp = connp->conn_tcp;
304 
305 	ASSERT(connp->conn_upper_handle != NULL);
306 	/* All Solaris components should pass a cred for this operation. */
307 	ASSERT(cr != NULL);
308 
309 	ASSERT(tcp != NULL);
310 	if (tcp->tcp_state < TCPS_SYN_RCVD)
311 		return (ENOTCONN);
312 
313 	return (conn_getpeername(connp, addr, addrlenp));
314 }
315 
316 /* ARGSUSED3 */
317 int
318 tcp_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *addr,
319     socklen_t *addrlenp, cred_t *cr)
320 {
321 	conn_t	*connp = (conn_t *)proto_handle;
322 
323 	/* All Solaris components should pass a cred for this operation. */
324 	ASSERT(cr != NULL);
325 
326 	ASSERT(connp->conn_upper_handle != NULL);
327 	return (conn_getsockname(connp, addr, addrlenp));
328 }
329 
330 /* returns UNIX error, the optlen is a value-result arg */
331 static int
332 tcp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
333     void *optvalp, socklen_t *optlen, cred_t *cr)
334 {
335 	conn_t		*connp = (conn_t *)proto_handle;
336 	squeue_t	*sqp = connp->conn_sqp;
337 	int		error;
338 	t_uscalar_t	max_optbuf_len;
339 	void		*optvalp_buf;
340 	int		len;
341 
342 	ASSERT(connp->conn_upper_handle != NULL);
343 
344 	error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len,
345 	    tcp_opt_obj.odb_opt_des_arr,
346 	    tcp_opt_obj.odb_opt_arr_cnt,
347 	    B_FALSE, B_TRUE, cr);
348 	if (error != 0) {
349 		if (error < 0) {
350 			error = proto_tlitosyserr(-error);
351 		}
352 		return (error);
353 	}
354 
355 	optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP);
356 
357 	error = squeue_synch_enter(sqp, connp, NULL);
358 	if (error == ENOMEM) {
359 		kmem_free(optvalp_buf, max_optbuf_len);
360 		return (ENOMEM);
361 	}
362 
363 	len = tcp_opt_get(connp, level, option_name, optvalp_buf);
364 	squeue_synch_exit(sqp, connp);
365 
366 	if (len == -1) {
367 		kmem_free(optvalp_buf, max_optbuf_len);
368 		return (EINVAL);
369 	}
370 
371 	/*
372 	 * update optlen and copy option value
373 	 */
374 	t_uscalar_t size = MIN(len, *optlen);
375 
376 	bcopy(optvalp_buf, optvalp, size);
377 	bcopy(&size, optlen, sizeof (size));
378 
379 	kmem_free(optvalp_buf, max_optbuf_len);
380 	return (0);
381 }
382 
383 static int
384 tcp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
385     const void *optvalp, socklen_t optlen, cred_t *cr)
386 {
387 	conn_t		*connp = (conn_t *)proto_handle;
388 	squeue_t	*sqp = connp->conn_sqp;
389 	int		error;
390 
391 	ASSERT(connp->conn_upper_handle != NULL);
392 	/*
393 	 * Entering the squeue synchronously can result in a context switch,
394 	 * which can cause a rather sever performance degradation. So we try to
395 	 * handle whatever options we can without entering the squeue.
396 	 */
397 	if (level == IPPROTO_TCP) {
398 		switch (option_name) {
399 		case TCP_NODELAY:
400 			if (optlen != sizeof (int32_t))
401 				return (EINVAL);
402 			mutex_enter(&connp->conn_tcp->tcp_non_sq_lock);
403 			connp->conn_tcp->tcp_naglim = *(int *)optvalp ? 1 :
404 			    connp->conn_tcp->tcp_mss;
405 			mutex_exit(&connp->conn_tcp->tcp_non_sq_lock);
406 			return (0);
407 		default:
408 			break;
409 		}
410 	}
411 
412 	error = squeue_synch_enter(sqp, connp, NULL);
413 	if (error == ENOMEM) {
414 		return (ENOMEM);
415 	}
416 
417 	error = proto_opt_check(level, option_name, optlen, NULL,
418 	    tcp_opt_obj.odb_opt_des_arr,
419 	    tcp_opt_obj.odb_opt_arr_cnt,
420 	    B_TRUE, B_FALSE, cr);
421 
422 	if (error != 0) {
423 		if (error < 0) {
424 			error = proto_tlitosyserr(-error);
425 		}
426 		squeue_synch_exit(sqp, connp);
427 		return (error);
428 	}
429 
430 	error = tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name,
431 	    optlen, (uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp,
432 	    NULL, cr);
433 	squeue_synch_exit(sqp, connp);
434 
435 	ASSERT(error >= 0);
436 
437 	return (error);
438 }
439 
440 /* ARGSUSED */
441 static int
442 tcp_sendmsg(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
443     cred_t *cr)
444 {
445 	tcp_t		*tcp;
446 	uint32_t	msize;
447 	conn_t *connp = (conn_t *)proto_handle;
448 	int32_t		tcpstate;
449 
450 	/* All Solaris components should pass a cred for this operation. */
451 	ASSERT(cr != NULL);
452 
453 	ASSERT(connp->conn_ref >= 2);
454 	ASSERT(connp->conn_upper_handle != NULL);
455 
456 	if (msg->msg_controllen != 0) {
457 		freemsg(mp);
458 		return (EOPNOTSUPP);
459 	}
460 
461 	switch (DB_TYPE(mp)) {
462 	case M_DATA:
463 		tcp = connp->conn_tcp;
464 		ASSERT(tcp != NULL);
465 
466 		tcpstate = tcp->tcp_state;
467 		if (tcpstate < TCPS_ESTABLISHED) {
468 			freemsg(mp);
469 			/*
470 			 * We return ENOTCONN if the endpoint is trying to
471 			 * connect or has never been connected, and EPIPE if it
472 			 * has been disconnected. The connection id helps us
473 			 * distinguish between the last two cases.
474 			 */
475 			return ((tcpstate == TCPS_SYN_SENT) ? ENOTCONN :
476 			    ((tcp->tcp_connid > 0) ? EPIPE : ENOTCONN));
477 		} else if (tcpstate > TCPS_CLOSE_WAIT) {
478 			freemsg(mp);
479 			return (EPIPE);
480 		}
481 
482 		msize = msgdsize(mp);
483 
484 		mutex_enter(&tcp->tcp_non_sq_lock);
485 		tcp->tcp_squeue_bytes += msize;
486 		/*
487 		 * Squeue Flow Control
488 		 */
489 		if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) {
490 			tcp_setqfull(tcp);
491 		}
492 		mutex_exit(&tcp->tcp_non_sq_lock);
493 
494 		/*
495 		 * The application may pass in an address in the msghdr, but
496 		 * we ignore the address on connection-oriented sockets.
497 		 * Just like BSD this code does not generate an error for
498 		 * TCP (a CONNREQUIRED socket) when sending to an address
499 		 * passed in with sendto/sendmsg. Instead the data is
500 		 * delivered on the connection as if no address had been
501 		 * supplied.
502 		 */
503 		CONN_INC_REF(connp);
504 
505 		if (msg->msg_flags & MSG_OOB) {
506 			SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output_urgent,
507 			    connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
508 		} else {
509 			SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output,
510 			    connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
511 		}
512 
513 		return (0);
514 
515 	default:
516 		ASSERT(0);
517 	}
518 
519 	freemsg(mp);
520 	return (0);
521 }
522 
523 /* ARGSUSED */
524 static int
525 tcp_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
526 {
527 	conn_t  *connp = (conn_t *)proto_handle;
528 	tcp_t   *tcp = connp->conn_tcp;
529 
530 	ASSERT(connp->conn_upper_handle != NULL);
531 
532 	/* All Solaris components should pass a cred for this operation. */
533 	ASSERT(cr != NULL);
534 
535 	/*
536 	 * X/Open requires that we check the connected state.
537 	 */
538 	if (tcp->tcp_state < TCPS_SYN_SENT)
539 		return (ENOTCONN);
540 
541 	/* shutdown the send side */
542 	if (how != SHUT_RD) {
543 		mblk_t *bp;
544 
545 		bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL);
546 		CONN_INC_REF(connp);
547 		SQUEUE_ENTER_ONE(connp->conn_sqp, bp, tcp_shutdown_output,
548 		    connp, NULL, SQ_NODRAIN, SQTAG_TCP_SHUTDOWN_OUTPUT);
549 
550 		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
551 		    SOCK_OPCTL_SHUT_SEND, 0);
552 	}
553 
554 	/* shutdown the recv side */
555 	if (how != SHUT_WR)
556 		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
557 		    SOCK_OPCTL_SHUT_RECV, 0);
558 
559 	return (0);
560 }
561 
562 static void
563 tcp_clr_flowctrl(sock_lower_handle_t proto_handle)
564 {
565 	conn_t  *connp = (conn_t *)proto_handle;
566 	tcp_t	*tcp = connp->conn_tcp;
567 	mblk_t *mp;
568 	int error;
569 
570 	ASSERT(connp->conn_upper_handle != NULL);
571 
572 	/*
573 	 * If tcp->tcp_rsrv_mp == NULL, it means that tcp_clr_flowctrl()
574 	 * is currently running.
575 	 */
576 	mutex_enter(&tcp->tcp_rsrv_mp_lock);
577 	if ((mp = tcp->tcp_rsrv_mp) == NULL) {
578 		mutex_exit(&tcp->tcp_rsrv_mp_lock);
579 		return;
580 	}
581 	tcp->tcp_rsrv_mp = NULL;
582 	mutex_exit(&tcp->tcp_rsrv_mp_lock);
583 
584 	error = squeue_synch_enter(connp->conn_sqp, connp, mp);
585 	ASSERT(error == 0);
586 
587 	mutex_enter(&tcp->tcp_rsrv_mp_lock);
588 	tcp->tcp_rsrv_mp = mp;
589 	mutex_exit(&tcp->tcp_rsrv_mp_lock);
590 
591 	if (tcp->tcp_fused) {
592 		tcp_fuse_backenable(tcp);
593 	} else {
594 		tcp->tcp_rwnd = connp->conn_rcvbuf;
595 		/*
596 		 * Send back a window update immediately if TCP is above
597 		 * ESTABLISHED state and the increase of the rcv window
598 		 * that the other side knows is at least 1 MSS after flow
599 		 * control is lifted.
600 		 */
601 		if (tcp->tcp_state >= TCPS_ESTABLISHED &&
602 		    tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) {
603 			tcp_xmit_ctl(NULL, tcp,
604 			    (tcp->tcp_swnd == 0) ? tcp->tcp_suna :
605 			    tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK);
606 		}
607 	}
608 
609 	squeue_synch_exit(connp->conn_sqp, connp);
610 }
611 
612 /* ARGSUSED */
613 static int
614 tcp_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
615     int mode, int32_t *rvalp, cred_t *cr)
616 {
617 	conn_t  	*connp = (conn_t *)proto_handle;
618 	int		error;
619 
620 	ASSERT(connp->conn_upper_handle != NULL);
621 
622 	/* All Solaris components should pass a cred for this operation. */
623 	ASSERT(cr != NULL);
624 
625 	/*
626 	 * If we don't have a helper stream then create one.
627 	 * ip_create_helper_stream takes care of locking the conn_t,
628 	 * so this check for NULL is just a performance optimization.
629 	 */
630 	if (connp->conn_helper_info == NULL) {
631 		tcp_stack_t *tcps = connp->conn_tcp->tcp_tcps;
632 
633 		/*
634 		 * Create a helper stream for non-STREAMS socket.
635 		 */
636 		error = ip_create_helper_stream(connp, tcps->tcps_ldi_ident);
637 		if (error != 0) {
638 			ip0dbg(("tcp_ioctl: create of IP helper stream "
639 			    "failed %d\n", error));
640 			return (error);
641 		}
642 	}
643 
644 	switch (cmd) {
645 		case ND_SET:
646 		case ND_GET:
647 		case _SIOCSOCKFALLBACK:
648 		case TCP_IOC_ABORT_CONN:
649 		case TI_GETPEERNAME:
650 		case TI_GETMYNAME:
651 			ip1dbg(("tcp_ioctl: cmd 0x%x on non streams socket",
652 			    cmd));
653 			error = EINVAL;
654 			break;
655 		default:
656 			/*
657 			 * If the conn is not closing, pass on to IP using
658 			 * helper stream. Bump the ioctlref to prevent tcp_close
659 			 * from closing the rq/wq out from underneath the ioctl
660 			 * if it ends up queued or aborted/interrupted.
661 			 */
662 			mutex_enter(&connp->conn_lock);
663 			if (connp->conn_state_flags & (CONN_CLOSING)) {
664 				mutex_exit(&connp->conn_lock);
665 				error = EINVAL;
666 				break;
667 			}
668 			CONN_INC_IOCTLREF_LOCKED(connp);
669 			error = ldi_ioctl(connp->conn_helper_info->iphs_handle,
670 			    cmd, arg, mode, cr, rvalp);
671 			CONN_DEC_IOCTLREF(connp);
672 			break;
673 	}
674 	return (error);
675 }
676 
677 /* ARGSUSED */
678 static int
679 tcp_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
680 {
681 	conn_t *connp = (conn_t *)proto_handle;
682 
683 	ASSERT(connp->conn_upper_handle != NULL);
684 
685 	/* All Solaris components should pass a cred for this operation. */
686 	ASSERT(cr != NULL);
687 
688 	tcp_close_common(connp, flags);
689 
690 	ip_free_helper_stream(connp);
691 
692 	/*
693 	 * Drop IP's reference on the conn. This is the last reference
694 	 * on the connp if the state was less than established. If the
695 	 * connection has gone into timewait state, then we will have
696 	 * one ref for the TCP and one more ref (total of two) for the
697 	 * classifier connected hash list (a timewait connections stays
698 	 * in connected hash till closed).
699 	 *
700 	 * We can't assert the references because there might be other
701 	 * transient reference places because of some walkers or queued
702 	 * packets in squeue for the timewait state.
703 	 */
704 	CONN_DEC_REF(connp);
705 	return (0);
706 }
707 
708 /* ARGSUSED */
709 sock_lower_handle_t
710 tcp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
711     uint_t *smodep, int *errorp, int flags, cred_t *credp)
712 {
713 	conn_t		*connp;
714 	boolean_t	isv6 = family == AF_INET6;
715 	if (type != SOCK_STREAM || (family != AF_INET && family != AF_INET6) ||
716 	    (proto != 0 && proto != IPPROTO_TCP)) {
717 		*errorp = EPROTONOSUPPORT;
718 		return (NULL);
719 	}
720 
721 	connp = tcp_create_common(credp, isv6, B_TRUE, errorp);
722 	if (connp == NULL) {
723 		return (NULL);
724 	}
725 
726 	/*
727 	 * Put the ref for TCP. Ref for IP was already put
728 	 * by ipcl_conn_create. Also Make the conn_t globally
729 	 * visible to walkers
730 	 */
731 	mutex_enter(&connp->conn_lock);
732 	CONN_INC_REF_LOCKED(connp);
733 	ASSERT(connp->conn_ref == 2);
734 	connp->conn_state_flags &= ~CONN_INCIPIENT;
735 
736 	connp->conn_flags |= IPCL_NONSTR;
737 	mutex_exit(&connp->conn_lock);
738 
739 	ASSERT(errorp != NULL);
740 	*errorp = 0;
741 	*sock_downcalls = &sock_tcp_downcalls;
742 	*smodep = SM_CONNREQUIRED | SM_EXDATA | SM_ACCEPTSUPP |
743 	    SM_SENDFILESUPP;
744 
745 	return ((sock_lower_handle_t)connp);
746 }
747 
748 int
749 tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
750     boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb)
751 {
752 	tcp_t			*tcp;
753 	conn_t 			*connp = (conn_t *)proto_handle;
754 	int			error;
755 	mblk_t			*stropt_mp;
756 	mblk_t			*ordrel_mp;
757 
758 	tcp = connp->conn_tcp;
759 
760 	stropt_mp = allocb_wait(sizeof (struct stroptions), BPRI_HI, STR_NOSIG,
761 	    NULL);
762 
763 	/* Pre-allocate the T_ordrel_ind mblk. */
764 	ASSERT(tcp->tcp_ordrel_mp == NULL);
765 	ordrel_mp = allocb_wait(sizeof (struct T_ordrel_ind), BPRI_HI,
766 	    STR_NOSIG, NULL);
767 	ordrel_mp->b_datap->db_type = M_PROTO;
768 	((struct T_ordrel_ind *)ordrel_mp->b_rptr)->PRIM_type = T_ORDREL_IND;
769 	ordrel_mp->b_wptr += sizeof (struct T_ordrel_ind);
770 
771 	/*
772 	 * Enter the squeue so that no new packets can come in
773 	 */
774 	error = squeue_synch_enter(connp->conn_sqp, connp, NULL);
775 	if (error != 0) {
776 		/* failed to enter, free all the pre-allocated messages. */
777 		freeb(stropt_mp);
778 		freeb(ordrel_mp);
779 		/*
780 		 * We cannot process the eager, so at least send out a
781 		 * RST so the peer can reconnect.
782 		 */
783 		if (tcp->tcp_listener != NULL) {
784 			(void) tcp_eager_blowoff(tcp->tcp_listener,
785 			    tcp->tcp_conn_req_seqnum);
786 		}
787 		return (ENOMEM);
788 	}
789 
790 	/*
791 	 * Both endpoints must be of the same type (either STREAMS or
792 	 * non-STREAMS) for fusion to be enabled. So if we are fused,
793 	 * we have to unfuse.
794 	 */
795 	if (tcp->tcp_fused)
796 		tcp_unfuse(tcp);
797 
798 	/*
799 	 * No longer a direct socket
800 	 */
801 	connp->conn_flags &= ~IPCL_NONSTR;
802 	tcp->tcp_ordrel_mp = ordrel_mp;
803 
804 	if (tcp->tcp_listener != NULL) {
805 		/* The eager will deal with opts when accept() is called */
806 		freeb(stropt_mp);
807 		tcp_fallback_eager(tcp, direct_sockfs);
808 	} else {
809 		tcp_fallback_noneager(tcp, stropt_mp, q, direct_sockfs,
810 		    quiesced_cb);
811 	}
812 
813 	/*
814 	 * There should be atleast two ref's (IP + TCP)
815 	 */
816 	ASSERT(connp->conn_ref >= 2);
817 	squeue_synch_exit(connp->conn_sqp, connp);
818 
819 	return (0);
820 }
821