xref: /titanic_50/usr/src/uts/common/inet/tcp/tcp_socket.c (revision d5ab4bd8f9e03d84b5f600a779f771e2efa7eb82)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 /* This file contains all TCP kernel socket related functions. */
27 
28 #include <sys/types.h>
29 #include <sys/strlog.h>
30 #include <sys/policy.h>
31 #include <sys/sockio.h>
32 #include <sys/strsubr.h>
33 #include <sys/strsun.h>
34 #include <sys/squeue_impl.h>
35 #include <sys/squeue.h>
36 #include <sys/tihdr.h>
37 #include <sys/timod.h>
38 #include <sys/tpicommon.h>
39 #include <sys/socketvar.h>
40 
41 #include <inet/common.h>
42 #include <inet/proto_set.h>
43 #include <inet/ip.h>
44 #include <inet/tcp.h>
45 #include <inet/tcp_impl.h>
46 
/*
 * Forward declarations of the file-local socket downcall entry points.
 * Each of these is installed in the sock_tcp_downcalls table defined
 * right after this list; the handle argument in every case is the
 * conn_t of the endpoint cast to sock_lower_handle_t.
 */
static void	tcp_activate(sock_lower_handle_t, sock_upper_handle_t,
		    sock_upcalls_t *, int, cred_t *);
static int	tcp_accept(sock_lower_handle_t, sock_lower_handle_t,
		    sock_upper_handle_t, cred_t *);
static int	tcp_bind(sock_lower_handle_t, struct sockaddr *,
		    socklen_t, cred_t *);
static int	tcp_listen(sock_lower_handle_t, int, cred_t *);
static int	tcp_connect(sock_lower_handle_t, const struct sockaddr *,
		    socklen_t, sock_connid_t *, cred_t *);
static int	tcp_getsockopt(sock_lower_handle_t, int, int, void *,
		    socklen_t *, cred_t *);
static int	tcp_setsockopt(sock_lower_handle_t, int, int, const void *,
		    socklen_t, cred_t *);
static int	tcp_sendmsg(sock_lower_handle_t, mblk_t *, struct nmsghdr *,
		    cred_t *cr);
static int	tcp_shutdown(sock_lower_handle_t, int, cred_t *);
static void	tcp_clr_flowctrl(sock_lower_handle_t);
static int	tcp_ioctl(sock_lower_handle_t, int, intptr_t, int, int32_t *,
		    cred_t *);
static int	tcp_close(sock_lower_handle_t, int, cred_t *);
67 
/*
 * Downcall vector handed back to the socket layer by tcp_create().
 * The initializer is positional, so the order of the entries must match
 * the member order of sock_downcalls_t.
 *
 * NOTE(review): the three NULL entries are presumably optional downcalls
 * that TCP does not provide -- confirm the slot names against the
 * definition of sock_downcalls_t.
 */
sock_downcalls_t sock_tcp_downcalls = {
	tcp_activate,
	tcp_accept,
	tcp_bind,
	tcp_listen,
	tcp_connect,
	tcp_getpeername,
	tcp_getsockname,
	tcp_getsockopt,
	tcp_setsockopt,
	tcp_sendmsg,
	NULL,
	NULL,
	NULL,
	tcp_shutdown,
	tcp_clr_flowctrl,
	tcp_ioctl,
	tcp_close,
};
87 
88 /* ARGSUSED */
89 static void
90 tcp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle,
91     sock_upcalls_t *sock_upcalls, int flags, cred_t *cr)
92 {
93 	conn_t *connp = (conn_t *)proto_handle;
94 	struct sock_proto_props sopp;
95 	extern struct module_info tcp_rinfo;
96 
97 	ASSERT(connp->conn_upper_handle == NULL);
98 
99 	/* All Solaris components should pass a cred for this operation. */
100 	ASSERT(cr != NULL);
101 
102 	sopp.sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT |
103 	    SOCKOPT_MAXPSZ | SOCKOPT_MAXBLK | SOCKOPT_RCVTIMER |
104 	    SOCKOPT_RCVTHRESH | SOCKOPT_MAXADDRLEN | SOCKOPT_MINPSZ;
105 
106 	sopp.sopp_rxhiwat = SOCKET_RECVHIWATER;
107 	sopp.sopp_rxlowat = SOCKET_RECVLOWATER;
108 	sopp.sopp_maxpsz = INFPSZ;
109 	sopp.sopp_maxblk = INFPSZ;
110 	sopp.sopp_rcvtimer = SOCKET_TIMER_INTERVAL;
111 	sopp.sopp_rcvthresh = SOCKET_RECVHIWATER >> 3;
112 	sopp.sopp_maxaddrlen = sizeof (sin6_t);
113 	sopp.sopp_minpsz = (tcp_rinfo.mi_minpsz == 1) ? 0 :
114 	    tcp_rinfo.mi_minpsz;
115 
116 	connp->conn_upcalls = sock_upcalls;
117 	connp->conn_upper_handle = sock_handle;
118 
119 	ASSERT(connp->conn_rcvbuf != 0 &&
120 	    connp->conn_rcvbuf == connp->conn_tcp->tcp_rwnd);
121 	(*sock_upcalls->su_set_proto_props)(sock_handle, &sopp);
122 }
123 
124 static int
125 tcp_accept(sock_lower_handle_t lproto_handle,
126     sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle,
127     cred_t *cr)
128 {
129 	conn_t *lconnp, *econnp;
130 	tcp_t *listener, *eager;
131 
132 	lconnp = (conn_t *)lproto_handle;
133 	listener = lconnp->conn_tcp;
134 	ASSERT(listener->tcp_state == TCPS_LISTEN);
135 	econnp = (conn_t *)eproto_handle;
136 	eager = econnp->conn_tcp;
137 	ASSERT(eager->tcp_listener != NULL);
138 
139 	/*
140 	 * It is OK to manipulate these fields outside the eager's squeue
141 	 * because they will not start being used until tcp_accept_finish
142 	 * has been called.
143 	 */
144 	ASSERT(lconnp->conn_upper_handle != NULL);
145 	ASSERT(econnp->conn_upper_handle == NULL);
146 	econnp->conn_upper_handle = sock_handle;
147 	econnp->conn_upcalls = lconnp->conn_upcalls;
148 	ASSERT(IPCL_IS_NONSTR(econnp));
149 	return (tcp_accept_common(lconnp, econnp, cr));
150 }
151 
152 static int
153 tcp_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
154     socklen_t len, cred_t *cr)
155 {
156 	int 		error;
157 	conn_t		*connp = (conn_t *)proto_handle;
158 
159 	/* All Solaris components should pass a cred for this operation. */
160 	ASSERT(cr != NULL);
161 	ASSERT(connp->conn_upper_handle != NULL);
162 
163 	error = squeue_synch_enter(connp, NULL);
164 	if (error != 0) {
165 		/* failed to enter */
166 		return (ENOSR);
167 	}
168 
169 	/* binding to a NULL address really means unbind */
170 	if (sa == NULL) {
171 		if (connp->conn_tcp->tcp_state < TCPS_LISTEN)
172 			error = tcp_do_unbind(connp);
173 		else
174 			error = EINVAL;
175 	} else {
176 		error = tcp_do_bind(connp, sa, len, cr, B_TRUE);
177 	}
178 
179 	squeue_synch_exit(connp);
180 
181 	if (error < 0) {
182 		if (error == -TOUTSTATE)
183 			error = EINVAL;
184 		else
185 			error = proto_tlitosyserr(-error);
186 	}
187 
188 	return (error);
189 }
190 
191 /*
192  * SOP_LISTEN() calls into tcp_listen().
193  */
194 /* ARGSUSED */
195 static int
196 tcp_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr)
197 {
198 	conn_t	*connp = (conn_t *)proto_handle;
199 	int 	error;
200 
201 	ASSERT(connp->conn_upper_handle != NULL);
202 
203 	/* All Solaris components should pass a cred for this operation. */
204 	ASSERT(cr != NULL);
205 
206 	error = squeue_synch_enter(connp, NULL);
207 	if (error != 0) {
208 		/* failed to enter */
209 		return (ENOBUFS);
210 	}
211 
212 	error = tcp_do_listen(connp, NULL, 0, backlog, cr, B_FALSE);
213 	if (error == 0) {
214 		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
215 		    SOCK_OPCTL_ENAB_ACCEPT, (uintptr_t)backlog);
216 	} else if (error < 0) {
217 		if (error == -TOUTSTATE)
218 			error = EINVAL;
219 		else
220 			error = proto_tlitosyserr(-error);
221 	}
222 	squeue_synch_exit(connp);
223 	return (error);
224 }
225 
226 static int
227 tcp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
228     socklen_t len, sock_connid_t *id, cred_t *cr)
229 {
230 	conn_t		*connp = (conn_t *)proto_handle;
231 	int		error;
232 
233 	ASSERT(connp->conn_upper_handle != NULL);
234 
235 	/* All Solaris components should pass a cred for this operation. */
236 	ASSERT(cr != NULL);
237 
238 	error = proto_verify_ip_addr(connp->conn_family, sa, len);
239 	if (error != 0) {
240 		return (error);
241 	}
242 
243 	error = squeue_synch_enter(connp, NULL);
244 	if (error != 0) {
245 		/* failed to enter */
246 		return (ENOSR);
247 	}
248 
249 	/*
250 	 * TCP supports quick connect, so no need to do an implicit bind
251 	 */
252 	error = tcp_do_connect(connp, sa, len, cr, curproc->p_pid);
253 	if (error == 0) {
254 		*id = connp->conn_tcp->tcp_connid;
255 	} else if (error < 0) {
256 		if (error == -TOUTSTATE) {
257 			switch (connp->conn_tcp->tcp_state) {
258 			case TCPS_SYN_SENT:
259 				error = EALREADY;
260 				break;
261 			case TCPS_ESTABLISHED:
262 				error = EISCONN;
263 				break;
264 			case TCPS_LISTEN:
265 				error = EOPNOTSUPP;
266 				break;
267 			default:
268 				error = EINVAL;
269 				break;
270 			}
271 		} else {
272 			error = proto_tlitosyserr(-error);
273 		}
274 	}
275 
276 	if (connp->conn_tcp->tcp_loopback) {
277 		struct sock_proto_props sopp;
278 
279 		sopp.sopp_flags = SOCKOPT_LOOPBACK;
280 		sopp.sopp_loopback = B_TRUE;
281 
282 		(*connp->conn_upcalls->su_set_proto_props)(
283 		    connp->conn_upper_handle, &sopp);
284 	}
285 done:
286 	squeue_synch_exit(connp);
287 
288 	return ((error == 0) ? EINPROGRESS : error);
289 }
290 
291 /* ARGSUSED3 */
292 int
293 tcp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr,
294     socklen_t *addrlenp, cred_t *cr)
295 {
296 	conn_t	*connp = (conn_t *)proto_handle;
297 	tcp_t	*tcp = connp->conn_tcp;
298 
299 	ASSERT(connp->conn_upper_handle != NULL);
300 	/* All Solaris components should pass a cred for this operation. */
301 	ASSERT(cr != NULL);
302 
303 	ASSERT(tcp != NULL);
304 	if (tcp->tcp_state < TCPS_SYN_RCVD)
305 		return (ENOTCONN);
306 
307 	return (conn_getpeername(connp, addr, addrlenp));
308 }
309 
310 /* ARGSUSED3 */
311 int
312 tcp_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *addr,
313     socklen_t *addrlenp, cred_t *cr)
314 {
315 	conn_t	*connp = (conn_t *)proto_handle;
316 
317 	/* All Solaris components should pass a cred for this operation. */
318 	ASSERT(cr != NULL);
319 
320 	ASSERT(connp->conn_upper_handle != NULL);
321 	return (conn_getsockname(connp, addr, addrlenp));
322 }
323 
324 /* returns UNIX error, the optlen is a value-result arg */
325 static int
326 tcp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
327     void *optvalp, socklen_t *optlen, cred_t *cr)
328 {
329 	conn_t		*connp = (conn_t *)proto_handle;
330 	int		error;
331 	t_uscalar_t	max_optbuf_len;
332 	void		*optvalp_buf;
333 	int		len;
334 
335 	ASSERT(connp->conn_upper_handle != NULL);
336 
337 	error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len,
338 	    tcp_opt_obj.odb_opt_des_arr,
339 	    tcp_opt_obj.odb_opt_arr_cnt,
340 	    B_FALSE, B_TRUE, cr);
341 	if (error != 0) {
342 		if (error < 0) {
343 			error = proto_tlitosyserr(-error);
344 		}
345 		return (error);
346 	}
347 
348 	optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP);
349 
350 	error = squeue_synch_enter(connp, NULL);
351 	if (error == ENOMEM) {
352 		kmem_free(optvalp_buf, max_optbuf_len);
353 		return (ENOMEM);
354 	}
355 
356 	len = tcp_opt_get(connp, level, option_name, optvalp_buf);
357 	squeue_synch_exit(connp);
358 
359 	if (len == -1) {
360 		kmem_free(optvalp_buf, max_optbuf_len);
361 		return (EINVAL);
362 	}
363 
364 	/*
365 	 * update optlen and copy option value
366 	 */
367 	t_uscalar_t size = MIN(len, *optlen);
368 
369 	bcopy(optvalp_buf, optvalp, size);
370 	bcopy(&size, optlen, sizeof (size));
371 
372 	kmem_free(optvalp_buf, max_optbuf_len);
373 	return (0);
374 }
375 
376 static int
377 tcp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
378     const void *optvalp, socklen_t optlen, cred_t *cr)
379 {
380 	conn_t		*connp = (conn_t *)proto_handle;
381 	int		error;
382 
383 	ASSERT(connp->conn_upper_handle != NULL);
384 	/*
385 	 * Entering the squeue synchronously can result in a context switch,
386 	 * which can cause a rather sever performance degradation. So we try to
387 	 * handle whatever options we can without entering the squeue.
388 	 */
389 	if (level == IPPROTO_TCP) {
390 		switch (option_name) {
391 		case TCP_NODELAY:
392 			if (optlen != sizeof (int32_t))
393 				return (EINVAL);
394 			mutex_enter(&connp->conn_tcp->tcp_non_sq_lock);
395 			connp->conn_tcp->tcp_naglim = *(int *)optvalp ? 1 :
396 			    connp->conn_tcp->tcp_mss;
397 			mutex_exit(&connp->conn_tcp->tcp_non_sq_lock);
398 			return (0);
399 		default:
400 			break;
401 		}
402 	}
403 
404 	error = squeue_synch_enter(connp, NULL);
405 	if (error == ENOMEM) {
406 		return (ENOMEM);
407 	}
408 
409 	error = proto_opt_check(level, option_name, optlen, NULL,
410 	    tcp_opt_obj.odb_opt_des_arr,
411 	    tcp_opt_obj.odb_opt_arr_cnt,
412 	    B_TRUE, B_FALSE, cr);
413 
414 	if (error != 0) {
415 		if (error < 0) {
416 			error = proto_tlitosyserr(-error);
417 		}
418 		squeue_synch_exit(connp);
419 		return (error);
420 	}
421 
422 	error = tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name,
423 	    optlen, (uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp,
424 	    NULL, cr);
425 	squeue_synch_exit(connp);
426 
427 	ASSERT(error >= 0);
428 
429 	return (error);
430 }
431 
432 /* ARGSUSED */
433 static int
434 tcp_sendmsg(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
435     cred_t *cr)
436 {
437 	tcp_t		*tcp;
438 	uint32_t	msize;
439 	conn_t *connp = (conn_t *)proto_handle;
440 	int32_t		tcpstate;
441 
442 	/* All Solaris components should pass a cred for this operation. */
443 	ASSERT(cr != NULL);
444 
445 	ASSERT(connp->conn_ref >= 2);
446 	ASSERT(connp->conn_upper_handle != NULL);
447 
448 	if (msg->msg_controllen != 0) {
449 		freemsg(mp);
450 		return (EOPNOTSUPP);
451 	}
452 
453 	switch (DB_TYPE(mp)) {
454 	case M_DATA:
455 		tcp = connp->conn_tcp;
456 		ASSERT(tcp != NULL);
457 
458 		tcpstate = tcp->tcp_state;
459 		if (tcpstate < TCPS_ESTABLISHED) {
460 			freemsg(mp);
461 			/*
462 			 * We return ENOTCONN if the endpoint is trying to
463 			 * connect or has never been connected, and EPIPE if it
464 			 * has been disconnected. The connection id helps us
465 			 * distinguish between the last two cases.
466 			 */
467 			return ((tcpstate == TCPS_SYN_SENT) ? ENOTCONN :
468 			    ((tcp->tcp_connid > 0) ? EPIPE : ENOTCONN));
469 		} else if (tcpstate > TCPS_CLOSE_WAIT) {
470 			freemsg(mp);
471 			return (EPIPE);
472 		}
473 
474 		msize = msgdsize(mp);
475 
476 		mutex_enter(&tcp->tcp_non_sq_lock);
477 		tcp->tcp_squeue_bytes += msize;
478 		/*
479 		 * Squeue Flow Control
480 		 */
481 		if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) {
482 			tcp_setqfull(tcp);
483 		}
484 		mutex_exit(&tcp->tcp_non_sq_lock);
485 
486 		/*
487 		 * The application may pass in an address in the msghdr, but
488 		 * we ignore the address on connection-oriented sockets.
489 		 * Just like BSD this code does not generate an error for
490 		 * TCP (a CONNREQUIRED socket) when sending to an address
491 		 * passed in with sendto/sendmsg. Instead the data is
492 		 * delivered on the connection as if no address had been
493 		 * supplied.
494 		 */
495 		CONN_INC_REF(connp);
496 
497 		if (msg->msg_flags & MSG_OOB) {
498 			SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output_urgent,
499 			    connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
500 		} else {
501 			SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output,
502 			    connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
503 		}
504 
505 		return (0);
506 
507 	default:
508 		ASSERT(0);
509 	}
510 
511 	freemsg(mp);
512 	return (0);
513 }
514 
515 /* ARGSUSED */
516 static int
517 tcp_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
518 {
519 	conn_t  *connp = (conn_t *)proto_handle;
520 	tcp_t   *tcp = connp->conn_tcp;
521 
522 	ASSERT(connp->conn_upper_handle != NULL);
523 
524 	/* All Solaris components should pass a cred for this operation. */
525 	ASSERT(cr != NULL);
526 
527 	/*
528 	 * X/Open requires that we check the connected state.
529 	 */
530 	if (tcp->tcp_state < TCPS_SYN_SENT)
531 		return (ENOTCONN);
532 
533 	/* shutdown the send side */
534 	if (how != SHUT_RD) {
535 		mblk_t *bp;
536 
537 		bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL);
538 		CONN_INC_REF(connp);
539 		SQUEUE_ENTER_ONE(connp->conn_sqp, bp, tcp_shutdown_output,
540 		    connp, NULL, SQ_NODRAIN, SQTAG_TCP_SHUTDOWN_OUTPUT);
541 
542 		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
543 		    SOCK_OPCTL_SHUT_SEND, 0);
544 	}
545 
546 	/* shutdown the recv side */
547 	if (how != SHUT_WR)
548 		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
549 		    SOCK_OPCTL_SHUT_RECV, 0);
550 
551 	return (0);
552 }
553 
554 static void
555 tcp_clr_flowctrl(sock_lower_handle_t proto_handle)
556 {
557 	conn_t  *connp = (conn_t *)proto_handle;
558 	tcp_t	*tcp = connp->conn_tcp;
559 	mblk_t *mp;
560 	int error;
561 
562 	ASSERT(connp->conn_upper_handle != NULL);
563 
564 	/*
565 	 * If tcp->tcp_rsrv_mp == NULL, it means that tcp_clr_flowctrl()
566 	 * is currently running.
567 	 */
568 	mutex_enter(&tcp->tcp_rsrv_mp_lock);
569 	if ((mp = tcp->tcp_rsrv_mp) == NULL) {
570 		mutex_exit(&tcp->tcp_rsrv_mp_lock);
571 		return;
572 	}
573 	tcp->tcp_rsrv_mp = NULL;
574 	mutex_exit(&tcp->tcp_rsrv_mp_lock);
575 
576 	error = squeue_synch_enter(connp, mp);
577 	ASSERT(error == 0);
578 
579 	mutex_enter(&tcp->tcp_rsrv_mp_lock);
580 	tcp->tcp_rsrv_mp = mp;
581 	mutex_exit(&tcp->tcp_rsrv_mp_lock);
582 
583 	if (tcp->tcp_fused) {
584 		tcp_fuse_backenable(tcp);
585 	} else {
586 		tcp->tcp_rwnd = connp->conn_rcvbuf;
587 		/*
588 		 * Send back a window update immediately if TCP is above
589 		 * ESTABLISHED state and the increase of the rcv window
590 		 * that the other side knows is at least 1 MSS after flow
591 		 * control is lifted.
592 		 */
593 		if (tcp->tcp_state >= TCPS_ESTABLISHED &&
594 		    tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) {
595 			tcp_xmit_ctl(NULL, tcp,
596 			    (tcp->tcp_swnd == 0) ? tcp->tcp_suna :
597 			    tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK);
598 		}
599 	}
600 
601 	squeue_synch_exit(connp);
602 }
603 
604 /* ARGSUSED */
605 static int
606 tcp_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
607     int mode, int32_t *rvalp, cred_t *cr)
608 {
609 	conn_t  	*connp = (conn_t *)proto_handle;
610 	int		error;
611 
612 	ASSERT(connp->conn_upper_handle != NULL);
613 
614 	/* All Solaris components should pass a cred for this operation. */
615 	ASSERT(cr != NULL);
616 
617 	/*
618 	 * If we don't have a helper stream then create one.
619 	 * ip_create_helper_stream takes care of locking the conn_t,
620 	 * so this check for NULL is just a performance optimization.
621 	 */
622 	if (connp->conn_helper_info == NULL) {
623 		tcp_stack_t *tcps = connp->conn_tcp->tcp_tcps;
624 
625 		/*
626 		 * Create a helper stream for non-STREAMS socket.
627 		 */
628 		error = ip_create_helper_stream(connp, tcps->tcps_ldi_ident);
629 		if (error != 0) {
630 			ip0dbg(("tcp_ioctl: create of IP helper stream "
631 			    "failed %d\n", error));
632 			return (error);
633 		}
634 	}
635 
636 	switch (cmd) {
637 		case ND_SET:
638 		case ND_GET:
639 		case _SIOCSOCKFALLBACK:
640 		case TCP_IOC_ABORT_CONN:
641 		case TI_GETPEERNAME:
642 		case TI_GETMYNAME:
643 			ip1dbg(("tcp_ioctl: cmd 0x%x on non streams socket",
644 			    cmd));
645 			error = EINVAL;
646 			break;
647 		default:
648 			/*
649 			 * If the conn is not closing, pass on to IP using
650 			 * helper stream. Bump the ioctlref to prevent tcp_close
651 			 * from closing the rq/wq out from underneath the ioctl
652 			 * if it ends up queued or aborted/interrupted.
653 			 */
654 			mutex_enter(&connp->conn_lock);
655 			if (connp->conn_state_flags & (CONN_CLOSING)) {
656 				mutex_exit(&connp->conn_lock);
657 				error = EINVAL;
658 				break;
659 			}
660 			CONN_INC_IOCTLREF_LOCKED(connp);
661 			error = ldi_ioctl(connp->conn_helper_info->iphs_handle,
662 			    cmd, arg, mode, cr, rvalp);
663 			CONN_DEC_IOCTLREF(connp);
664 			break;
665 	}
666 	return (error);
667 }
668 
669 /* ARGSUSED */
670 static int
671 tcp_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
672 {
673 	conn_t *connp = (conn_t *)proto_handle;
674 
675 	ASSERT(connp->conn_upper_handle != NULL);
676 
677 	/* All Solaris components should pass a cred for this operation. */
678 	ASSERT(cr != NULL);
679 
680 	tcp_close_common(connp, flags);
681 
682 	ip_free_helper_stream(connp);
683 
684 	/*
685 	 * Drop IP's reference on the conn. This is the last reference
686 	 * on the connp if the state was less than established. If the
687 	 * connection has gone into timewait state, then we will have
688 	 * one ref for the TCP and one more ref (total of two) for the
689 	 * classifier connected hash list (a timewait connections stays
690 	 * in connected hash till closed).
691 	 *
692 	 * We can't assert the references because there might be other
693 	 * transient reference places because of some walkers or queued
694 	 * packets in squeue for the timewait state.
695 	 */
696 	CONN_DEC_REF(connp);
697 	return (0);
698 }
699 
700 /* ARGSUSED */
701 sock_lower_handle_t
702 tcp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
703     uint_t *smodep, int *errorp, int flags, cred_t *credp)
704 {
705 	conn_t		*connp;
706 	boolean_t	isv6 = family == AF_INET6;
707 	if (type != SOCK_STREAM || (family != AF_INET && family != AF_INET6) ||
708 	    (proto != 0 && proto != IPPROTO_TCP)) {
709 		*errorp = EPROTONOSUPPORT;
710 		return (NULL);
711 	}
712 
713 	connp = tcp_create_common(credp, isv6, B_TRUE, errorp);
714 	if (connp == NULL) {
715 		return (NULL);
716 	}
717 
718 	/*
719 	 * Put the ref for TCP. Ref for IP was already put
720 	 * by ipcl_conn_create. Also Make the conn_t globally
721 	 * visible to walkers
722 	 */
723 	mutex_enter(&connp->conn_lock);
724 	CONN_INC_REF_LOCKED(connp);
725 	ASSERT(connp->conn_ref == 2);
726 	connp->conn_state_flags &= ~CONN_INCIPIENT;
727 
728 	connp->conn_flags |= IPCL_NONSTR;
729 	mutex_exit(&connp->conn_lock);
730 
731 	ASSERT(errorp != NULL);
732 	*errorp = 0;
733 	*sock_downcalls = &sock_tcp_downcalls;
734 	*smodep = SM_CONNREQUIRED | SM_EXDATA | SM_ACCEPTSUPP |
735 	    SM_SENDFILESUPP;
736 
737 	return ((sock_lower_handle_t)connp);
738 }
739 
740 int
741 tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
742     boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb)
743 {
744 	tcp_t			*tcp;
745 	conn_t 			*connp = (conn_t *)proto_handle;
746 	int			error;
747 	mblk_t			*stropt_mp;
748 	mblk_t			*ordrel_mp;
749 
750 	tcp = connp->conn_tcp;
751 
752 	stropt_mp = allocb_wait(sizeof (struct stroptions), BPRI_HI, STR_NOSIG,
753 	    NULL);
754 
755 	/* Pre-allocate the T_ordrel_ind mblk. */
756 	ASSERT(tcp->tcp_ordrel_mp == NULL);
757 	ordrel_mp = allocb_wait(sizeof (struct T_ordrel_ind), BPRI_HI,
758 	    STR_NOSIG, NULL);
759 	ordrel_mp->b_datap->db_type = M_PROTO;
760 	((struct T_ordrel_ind *)ordrel_mp->b_rptr)->PRIM_type = T_ORDREL_IND;
761 	ordrel_mp->b_wptr += sizeof (struct T_ordrel_ind);
762 
763 	/*
764 	 * Enter the squeue so that no new packets can come in
765 	 */
766 	error = squeue_synch_enter(connp, NULL);
767 	if (error != 0) {
768 		/* failed to enter, free all the pre-allocated messages. */
769 		freeb(stropt_mp);
770 		freeb(ordrel_mp);
771 		/*
772 		 * We cannot process the eager, so at least send out a
773 		 * RST so the peer can reconnect.
774 		 */
775 		if (tcp->tcp_listener != NULL) {
776 			(void) tcp_eager_blowoff(tcp->tcp_listener,
777 			    tcp->tcp_conn_req_seqnum);
778 		}
779 		return (ENOMEM);
780 	}
781 
782 	/*
783 	 * Both endpoints must be of the same type (either STREAMS or
784 	 * non-STREAMS) for fusion to be enabled. So if we are fused,
785 	 * we have to unfuse.
786 	 */
787 	if (tcp->tcp_fused)
788 		tcp_unfuse(tcp);
789 
790 	/*
791 	 * No longer a direct socket
792 	 */
793 	connp->conn_flags &= ~IPCL_NONSTR;
794 	tcp->tcp_ordrel_mp = ordrel_mp;
795 
796 	if (tcp->tcp_listener != NULL) {
797 		/* The eager will deal with opts when accept() is called */
798 		freeb(stropt_mp);
799 		tcp_fallback_eager(tcp, direct_sockfs);
800 	} else {
801 		tcp_fallback_noneager(tcp, stropt_mp, q, direct_sockfs,
802 		    quiesced_cb);
803 	}
804 
805 	/*
806 	 * There should be atleast two ref's (IP + TCP)
807 	 */
808 	ASSERT(connp->conn_ref >= 2);
809 	squeue_synch_exit(connp);
810 
811 	return (0);
812 }
813