xref: /illumos-gate/usr/src/uts/common/inet/tcp/tcp_socket.c (revision fec047081731fd77caf46ec0471c501b2cb33894)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright 2017 Joyent, Inc.
25  */
26 
27 /* This file contains all TCP kernel socket related functions. */
28 
29 #include <sys/types.h>
30 #include <sys/strlog.h>
31 #include <sys/policy.h>
32 #include <sys/sockio.h>
33 #include <sys/strsubr.h>
34 #include <sys/strsun.h>
35 #include <sys/squeue_impl.h>
36 #include <sys/squeue.h>
37 #define	_SUN_TPI_VERSION 2
38 #include <sys/tihdr.h>
39 #include <sys/timod.h>
40 #include <sys/tpicommon.h>
41 #include <sys/socketvar.h>
42 
43 #include <inet/common.h>
44 #include <inet/proto_set.h>
45 #include <inet/ip.h>
46 #include <inet/tcp.h>
47 #include <inet/tcp_impl.h>
48 
/*
 * Forward declarations of the file-local socket downcall entry points.
 * They are declared up front so that the sock_tcp_downcalls table below can
 * reference them before their definitions appear.
 */
static void	tcp_activate(sock_lower_handle_t, sock_upper_handle_t,
		    sock_upcalls_t *, int, cred_t *);
static int	tcp_accept(sock_lower_handle_t, sock_lower_handle_t,
		    sock_upper_handle_t, cred_t *);
static int	tcp_bind(sock_lower_handle_t, struct sockaddr *,
		    socklen_t, cred_t *);
static int	tcp_listen(sock_lower_handle_t, int, cred_t *);
static int	tcp_connect(sock_lower_handle_t, const struct sockaddr *,
		    socklen_t, sock_connid_t *, cred_t *);
static int	tcp_getpeername(sock_lower_handle_t, struct sockaddr *,
		    socklen_t *, cred_t *);
static int	tcp_getsockname(sock_lower_handle_t, struct sockaddr *,
		    socklen_t *, cred_t *);
static int	tcp_getsockopt(sock_lower_handle_t, int, int, void *,
		    socklen_t *, cred_t *);
static int	tcp_setsockopt(sock_lower_handle_t, int, int, const void *,
		    socklen_t, cred_t *);
static int	tcp_sendmsg(sock_lower_handle_t, mblk_t *, struct nmsghdr *,
		    cred_t *);
static int	tcp_shutdown(sock_lower_handle_t, int, cred_t *);
static void	tcp_clr_flowctrl(sock_lower_handle_t);
static int	tcp_ioctl(sock_lower_handle_t, int, intptr_t, int, int32_t *,
		    cred_t *);
static int	tcp_close(sock_lower_handle_t, int, cred_t *);
73 
/*
 * Downcall vector for TCP sockets. tcp_create() hands a pointer to this
 * table back to its caller through the sock_downcalls out-parameter.
 *
 * The initializer is positional, so the order of the entries must match the
 * member order of sock_downcalls_t.
 * NOTE(review): the three NULL slots are entry points TCP does not supply;
 * confirm against the sock_downcalls_t definition which members they are.
 */
sock_downcalls_t sock_tcp_downcalls = {
	tcp_activate,
	tcp_accept,
	tcp_bind,
	tcp_listen,
	tcp_connect,
	tcp_getpeername,
	tcp_getsockname,
	tcp_getsockopt,
	tcp_setsockopt,
	tcp_sendmsg,
	NULL,
	NULL,
	NULL,
	tcp_shutdown,
	tcp_clr_flowctrl,
	tcp_ioctl,
	tcp_close,
};
93 
94 /* ARGSUSED */
95 static void
96 tcp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle,
97     sock_upcalls_t *sock_upcalls, int flags, cred_t *cr)
98 {
99 	conn_t *connp = (conn_t *)proto_handle;
100 	struct sock_proto_props sopp;
101 	extern struct module_info tcp_rinfo;
102 
103 	ASSERT(connp->conn_upper_handle == NULL);
104 
105 	/* All Solaris components should pass a cred for this operation. */
106 	ASSERT(cr != NULL);
107 
108 	sopp.sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT |
109 	    SOCKOPT_MAXPSZ | SOCKOPT_MAXBLK | SOCKOPT_RCVTIMER |
110 	    SOCKOPT_RCVTHRESH | SOCKOPT_MAXADDRLEN | SOCKOPT_MINPSZ;
111 
112 	sopp.sopp_rxhiwat = SOCKET_RECVHIWATER;
113 	sopp.sopp_rxlowat = SOCKET_RECVLOWATER;
114 	sopp.sopp_maxpsz = INFPSZ;
115 	sopp.sopp_maxblk = INFPSZ;
116 	sopp.sopp_rcvtimer = SOCKET_TIMER_INTERVAL;
117 	sopp.sopp_rcvthresh = SOCKET_RECVHIWATER >> 3;
118 	sopp.sopp_maxaddrlen = sizeof (sin6_t);
119 	sopp.sopp_minpsz = (tcp_rinfo.mi_minpsz == 1) ? 0 :
120 	    tcp_rinfo.mi_minpsz;
121 
122 	connp->conn_upcalls = sock_upcalls;
123 	connp->conn_upper_handle = sock_handle;
124 
125 	ASSERT(connp->conn_rcvbuf != 0 &&
126 	    connp->conn_rcvbuf == connp->conn_tcp->tcp_rwnd);
127 	(*sock_upcalls->su_set_proto_props)(sock_handle, &sopp);
128 }
129 
130 /*ARGSUSED*/
131 static int
132 tcp_accept(sock_lower_handle_t lproto_handle,
133     sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle,
134     cred_t *cr)
135 {
136 	conn_t *lconnp, *econnp;
137 	tcp_t *listener, *eager;
138 
139 	/*
140 	 * KSSL can move a socket from one listener to another, in which
141 	 * case `lproto_handle' points to the new listener. To ensure that
142 	 * the original listener is used the information is obtained from
143 	 * the eager.
144 	 */
145 	econnp = (conn_t *)eproto_handle;
146 	eager = econnp->conn_tcp;
147 	ASSERT(IPCL_IS_NONSTR(econnp));
148 	ASSERT(eager->tcp_listener != NULL);
149 	listener = eager->tcp_listener;
150 	lconnp = (conn_t *)listener->tcp_connp;
151 	ASSERT(listener->tcp_state == TCPS_LISTEN);
152 	ASSERT(lconnp->conn_upper_handle != NULL);
153 
154 	/*
155 	 * It is possible for the accept thread to race with the thread that
156 	 * made the su_newconn upcall in tcp_newconn_notify. Both
157 	 * tcp_newconn_notify and tcp_accept require that conn_upper_handle
158 	 * and conn_upcalls be set before returning, so they both write to
159 	 * them. However, we're guaranteed that the value written is the same
160 	 * for both threads.
161 	 */
162 	ASSERT(econnp->conn_upper_handle == NULL ||
163 	    econnp->conn_upper_handle == sock_handle);
164 	ASSERT(econnp->conn_upcalls == NULL ||
165 	    econnp->conn_upcalls == lconnp->conn_upcalls);
166 	econnp->conn_upper_handle = sock_handle;
167 	econnp->conn_upcalls = lconnp->conn_upcalls;
168 
169 	ASSERT(econnp->conn_netstack ==
170 	    listener->tcp_connp->conn_netstack);
171 	ASSERT(eager->tcp_tcps == listener->tcp_tcps);
172 
173 	/*
174 	 * We should have a minimum of 2 references on the conn at this
175 	 * point. One for TCP and one for the newconn notification
176 	 * (which is now taken over by IP). In the normal case we would
177 	 * also have another reference (making a total of 3) for the conn
178 	 * being in the classifier hash list. However the eager could have
179 	 * received an RST subsequently and tcp_closei_local could have
180 	 * removed the eager from the classifier hash list, hence we can't
181 	 * assert that reference.
182 	 */
183 	ASSERT(econnp->conn_ref >= 2);
184 
185 	mutex_enter(&listener->tcp_eager_lock);
186 	/*
187 	 * Non-STREAMS listeners never defer the notification of new
188 	 * connections.
189 	 */
190 	ASSERT(!listener->tcp_eager_prev_q0->tcp_conn_def_q0);
191 	tcp_eager_unlink(eager);
192 	mutex_exit(&listener->tcp_eager_lock);
193 	CONN_DEC_REF(listener->tcp_connp);
194 
195 	return ((eager->tcp_state < TCPS_ESTABLISHED) ? ECONNABORTED : 0);
196 }
197 
198 static int
199 tcp_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
200     socklen_t len, cred_t *cr)
201 {
202 	int		error;
203 	conn_t		*connp = (conn_t *)proto_handle;
204 
205 	/* All Solaris components should pass a cred for this operation. */
206 	ASSERT(cr != NULL);
207 	ASSERT(connp->conn_upper_handle != NULL);
208 
209 	error = squeue_synch_enter(connp, NULL);
210 	if (error != 0) {
211 		/* failed to enter */
212 		return (ENOSR);
213 	}
214 
215 	/* binding to a NULL address really means unbind */
216 	if (sa == NULL) {
217 		if (connp->conn_tcp->tcp_state < TCPS_LISTEN)
218 			error = tcp_do_unbind(connp);
219 		else
220 			error = EINVAL;
221 	} else {
222 		error = tcp_do_bind(connp, sa, len, cr, B_TRUE);
223 	}
224 
225 	squeue_synch_exit(connp, SQ_NODRAIN);
226 
227 	if (error < 0) {
228 		if (error == -TOUTSTATE)
229 			error = EINVAL;
230 		else
231 			error = proto_tlitosyserr(-error);
232 	}
233 
234 	return (error);
235 }
236 
237 /* ARGSUSED */
238 static int
239 tcp_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr)
240 {
241 	conn_t	*connp = (conn_t *)proto_handle;
242 	tcp_t	*tcp = connp->conn_tcp;
243 	int	error;
244 
245 	ASSERT(connp->conn_upper_handle != NULL);
246 
247 	/* All Solaris components should pass a cred for this operation. */
248 	ASSERT(cr != NULL);
249 
250 	error = squeue_synch_enter(connp, NULL);
251 	if (error != 0) {
252 		/* failed to enter */
253 		return (ENOBUFS);
254 	}
255 
256 	error = tcp_do_listen(connp, NULL, 0, backlog, cr, B_FALSE);
257 	if (error == 0) {
258 		/*
259 		 * sockfs needs to know what's the maximum number of socket
260 		 * that can be queued on the listener.
261 		 */
262 		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
263 		    SOCK_OPCTL_ENAB_ACCEPT,
264 		    (uintptr_t)(tcp->tcp_conn_req_max +
265 		    tcp->tcp_tcps->tcps_conn_req_max_q0));
266 	} else if (error < 0) {
267 		if (error == -TOUTSTATE)
268 			error = EINVAL;
269 		else
270 			error = proto_tlitosyserr(-error);
271 	}
272 	squeue_synch_exit(connp, SQ_NODRAIN);
273 	return (error);
274 }
275 
276 static int
277 tcp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
278     socklen_t len, sock_connid_t *id, cred_t *cr)
279 {
280 	conn_t		*connp = (conn_t *)proto_handle;
281 	int		error;
282 
283 	ASSERT(connp->conn_upper_handle != NULL);
284 
285 	/* All Solaris components should pass a cred for this operation. */
286 	ASSERT(cr != NULL);
287 
288 	error = proto_verify_ip_addr(connp->conn_family, sa, len);
289 	if (error != 0) {
290 		return (error);
291 	}
292 
293 	error = squeue_synch_enter(connp, NULL);
294 	if (error != 0) {
295 		/* failed to enter */
296 		return (ENOSR);
297 	}
298 
299 	/*
300 	 * TCP supports quick connect, so no need to do an implicit bind
301 	 */
302 	error = tcp_do_connect(connp, sa, len, cr, curproc->p_pid);
303 	if (error == 0) {
304 		*id = connp->conn_tcp->tcp_connid;
305 	} else if (error < 0) {
306 		if (error == -TOUTSTATE) {
307 			switch (connp->conn_tcp->tcp_state) {
308 			case TCPS_SYN_SENT:
309 				error = EALREADY;
310 				break;
311 			case TCPS_ESTABLISHED:
312 				error = EISCONN;
313 				break;
314 			case TCPS_LISTEN:
315 				error = EOPNOTSUPP;
316 				break;
317 			default:
318 				error = EINVAL;
319 				break;
320 			}
321 		} else {
322 			error = proto_tlitosyserr(-error);
323 		}
324 	}
325 
326 	if (connp->conn_tcp->tcp_loopback) {
327 		struct sock_proto_props sopp;
328 
329 		sopp.sopp_flags = SOCKOPT_LOOPBACK;
330 		sopp.sopp_loopback = B_TRUE;
331 
332 		(*connp->conn_upcalls->su_set_proto_props)(
333 		    connp->conn_upper_handle, &sopp);
334 	}
335 done:
336 	/*
337 	 * Indicate (via SQ_PROCESS) that it is acceptable for the squeue to
338 	 * attempt to drain a pending request relevant to this connection when
339 	 * exiting the synchronous context.  This can improve the performance
340 	 * and efficiency of TCP connect(2) operations to localhost.
341 	 */
342 	squeue_synch_exit(connp, SQ_PROCESS);
343 
344 	return ((error == 0) ? EINPROGRESS : error);
345 }
346 
347 /* ARGSUSED3 */
348 static int
349 tcp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr,
350     socklen_t *addrlenp, cred_t *cr)
351 {
352 	conn_t	*connp = (conn_t *)proto_handle;
353 	tcp_t	*tcp = connp->conn_tcp;
354 
355 	/* All Solaris components should pass a cred for this operation. */
356 	ASSERT(cr != NULL);
357 
358 	ASSERT(tcp != NULL);
359 	if (tcp->tcp_state < TCPS_SYN_RCVD)
360 		return (ENOTCONN);
361 
362 	return (conn_getpeername(connp, addr, addrlenp));
363 }
364 
365 /* ARGSUSED3 */
366 static int
367 tcp_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *addr,
368     socklen_t *addrlenp, cred_t *cr)
369 {
370 	conn_t	*connp = (conn_t *)proto_handle;
371 
372 	/* All Solaris components should pass a cred for this operation. */
373 	ASSERT(cr != NULL);
374 
375 	return (conn_getsockname(connp, addr, addrlenp));
376 }
377 
378 /* returns UNIX error, the optlen is a value-result arg */
379 static int
380 tcp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
381     void *optvalp, socklen_t *optlen, cred_t *cr)
382 {
383 	conn_t		*connp = (conn_t *)proto_handle;
384 	int		error;
385 	t_uscalar_t	max_optbuf_len;
386 	void		*optvalp_buf;
387 	int		len;
388 
389 	ASSERT(connp->conn_upper_handle != NULL);
390 
391 	error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len,
392 	    tcp_opt_obj.odb_opt_des_arr,
393 	    tcp_opt_obj.odb_opt_arr_cnt,
394 	    B_FALSE, B_TRUE, cr);
395 	if (error != 0) {
396 		if (error < 0) {
397 			error = proto_tlitosyserr(-error);
398 		}
399 		return (error);
400 	}
401 
402 	optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP);
403 
404 	error = squeue_synch_enter(connp, NULL);
405 	if (error == ENOMEM) {
406 		kmem_free(optvalp_buf, max_optbuf_len);
407 		return (ENOMEM);
408 	}
409 
410 	len = tcp_opt_get(connp, level, option_name, optvalp_buf);
411 	squeue_synch_exit(connp, SQ_NODRAIN);
412 
413 	if (len == -1) {
414 		kmem_free(optvalp_buf, max_optbuf_len);
415 		return (EINVAL);
416 	}
417 
418 	/*
419 	 * update optlen and copy option value
420 	 */
421 	t_uscalar_t size = MIN(len, *optlen);
422 
423 	bcopy(optvalp_buf, optvalp, size);
424 	bcopy(&size, optlen, sizeof (size));
425 
426 	kmem_free(optvalp_buf, max_optbuf_len);
427 	return (0);
428 }
429 
430 static int
431 tcp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
432     const void *optvalp, socklen_t optlen, cred_t *cr)
433 {
434 	conn_t		*connp = (conn_t *)proto_handle;
435 	int		error;
436 
437 	ASSERT(connp->conn_upper_handle != NULL);
438 	/*
439 	 * Entering the squeue synchronously can result in a context switch,
440 	 * which can cause a rather sever performance degradation. So we try to
441 	 * handle whatever options we can without entering the squeue.
442 	 */
443 	if (level == IPPROTO_TCP) {
444 		switch (option_name) {
445 		case TCP_NODELAY:
446 			if (optlen != sizeof (int32_t))
447 				return (EINVAL);
448 			mutex_enter(&connp->conn_tcp->tcp_non_sq_lock);
449 			connp->conn_tcp->tcp_naglim = *(int *)optvalp ? 1 :
450 			    connp->conn_tcp->tcp_mss;
451 			mutex_exit(&connp->conn_tcp->tcp_non_sq_lock);
452 			return (0);
453 		default:
454 			break;
455 		}
456 	}
457 
458 	error = squeue_synch_enter(connp, NULL);
459 	if (error == ENOMEM) {
460 		return (ENOMEM);
461 	}
462 
463 	error = proto_opt_check(level, option_name, optlen, NULL,
464 	    tcp_opt_obj.odb_opt_des_arr,
465 	    tcp_opt_obj.odb_opt_arr_cnt,
466 	    B_TRUE, B_FALSE, cr);
467 
468 	if (error != 0) {
469 		if (error < 0) {
470 			error = proto_tlitosyserr(-error);
471 		}
472 		squeue_synch_exit(connp, SQ_NODRAIN);
473 		return (error);
474 	}
475 
476 	error = tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name,
477 	    optlen, (uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp,
478 	    NULL, cr);
479 	squeue_synch_exit(connp, SQ_NODRAIN);
480 
481 	ASSERT(error >= 0);
482 
483 	return (error);
484 }
485 
486 /* ARGSUSED */
487 static int
488 tcp_sendmsg(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
489     cred_t *cr)
490 {
491 	tcp_t		*tcp;
492 	uint32_t	msize;
493 	conn_t *connp = (conn_t *)proto_handle;
494 	int32_t		tcpstate;
495 
496 	/* All Solaris components should pass a cred for this operation. */
497 	ASSERT(cr != NULL);
498 
499 	ASSERT(connp->conn_ref >= 2);
500 	ASSERT(connp->conn_upper_handle != NULL);
501 
502 	if (msg->msg_controllen != 0) {
503 		freemsg(mp);
504 		return (EOPNOTSUPP);
505 	}
506 
507 	switch (DB_TYPE(mp)) {
508 	case M_DATA:
509 		tcp = connp->conn_tcp;
510 		ASSERT(tcp != NULL);
511 
512 		tcpstate = tcp->tcp_state;
513 		if (tcpstate < TCPS_ESTABLISHED) {
514 			freemsg(mp);
515 			/*
516 			 * We return ENOTCONN if the endpoint is trying to
517 			 * connect or has never been connected, and EPIPE if it
518 			 * has been disconnected. The connection id helps us
519 			 * distinguish between the last two cases.
520 			 */
521 			return ((tcpstate == TCPS_SYN_SENT) ? ENOTCONN :
522 			    ((tcp->tcp_connid > 0) ? EPIPE : ENOTCONN));
523 		} else if (tcpstate > TCPS_CLOSE_WAIT) {
524 			freemsg(mp);
525 			return (EPIPE);
526 		}
527 
528 		msize = msgdsize(mp);
529 
530 		mutex_enter(&tcp->tcp_non_sq_lock);
531 		tcp->tcp_squeue_bytes += msize;
532 		/*
533 		 * Squeue Flow Control
534 		 */
535 		if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) {
536 			tcp_setqfull(tcp);
537 		}
538 		mutex_exit(&tcp->tcp_non_sq_lock);
539 
540 		/*
541 		 * The application may pass in an address in the msghdr, but
542 		 * we ignore the address on connection-oriented sockets.
543 		 * Just like BSD this code does not generate an error for
544 		 * TCP (a CONNREQUIRED socket) when sending to an address
545 		 * passed in with sendto/sendmsg. Instead the data is
546 		 * delivered on the connection as if no address had been
547 		 * supplied.
548 		 */
549 		CONN_INC_REF(connp);
550 
551 		if (msg->msg_flags & MSG_OOB) {
552 			SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output_urgent,
553 			    connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
554 		} else {
555 			SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output,
556 			    connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
557 		}
558 
559 		return (0);
560 
561 	default:
562 		ASSERT(0);
563 	}
564 
565 	freemsg(mp);
566 	return (0);
567 }
568 
569 /* ARGSUSED */
570 static int
571 tcp_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
572 {
573 	conn_t  *connp = (conn_t *)proto_handle;
574 	tcp_t   *tcp = connp->conn_tcp;
575 
576 	ASSERT(connp->conn_upper_handle != NULL);
577 
578 	/* All Solaris components should pass a cred for this operation. */
579 	ASSERT(cr != NULL);
580 
581 	/*
582 	 * X/Open requires that we check the connected state.
583 	 */
584 	if (tcp->tcp_state < TCPS_SYN_SENT)
585 		return (ENOTCONN);
586 
587 	/* shutdown the send side */
588 	if (how != SHUT_RD) {
589 		mblk_t *bp;
590 
591 		bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL);
592 		CONN_INC_REF(connp);
593 		SQUEUE_ENTER_ONE(connp->conn_sqp, bp, tcp_shutdown_output,
594 		    connp, NULL, SQ_NODRAIN, SQTAG_TCP_SHUTDOWN_OUTPUT);
595 
596 		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
597 		    SOCK_OPCTL_SHUT_SEND, 0);
598 	}
599 
600 	/* shutdown the recv side */
601 	if (how != SHUT_WR)
602 		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
603 		    SOCK_OPCTL_SHUT_RECV, 0);
604 
605 	return (0);
606 }
607 
/*
 * Clear-flow-control downcall: reopen the receive window.
 *
 * The pre-allocated tcp_rsrv_mp mblk is taken (under tcp_rsrv_mp_lock) and
 * handed to squeue_synch_enter(); once inside the squeue it is put back.
 * For a fused connection the work is delegated to tcp_fuse_backenable();
 * otherwise the receive window is reset to conn_rcvbuf and, if warranted,
 * a window update is sent.
 */
static void
tcp_clr_flowctrl(sock_lower_handle_t proto_handle)
{
	conn_t  *connp = (conn_t *)proto_handle;
	tcp_t	*tcp = connp->conn_tcp;
	mblk_t *mp;
	int error;

	ASSERT(connp->conn_upper_handle != NULL);

	/*
	 * If tcp->tcp_rsrv_mp == NULL, it means that tcp_clr_flowctrl()
	 * is currently running.
	 */
	mutex_enter(&tcp->tcp_rsrv_mp_lock);
	if ((mp = tcp->tcp_rsrv_mp) == NULL) {
		mutex_exit(&tcp->tcp_rsrv_mp_lock);
		return;
	}
	/* Claim the reserved mblk so a concurrent caller backs off. */
	tcp->tcp_rsrv_mp = NULL;
	mutex_exit(&tcp->tcp_rsrv_mp_lock);

	/* Entry with the reserved mblk is expected to always succeed. */
	error = squeue_synch_enter(connp, mp);
	ASSERT(error == 0);

	/* Now inside the squeue: give the reserved mblk back. */
	mutex_enter(&tcp->tcp_rsrv_mp_lock);
	tcp->tcp_rsrv_mp = mp;
	mutex_exit(&tcp->tcp_rsrv_mp_lock);

	if (tcp->tcp_fused) {
		tcp_fuse_backenable(tcp);
	} else {
		tcp->tcp_rwnd = connp->conn_rcvbuf;
		/*
		 * Send back a window update immediately if TCP is above
		 * ESTABLISHED state and the increase of the rcv window
		 * that the other side knows is at least 1 MSS after flow
		 * control is lifted.
		 */
		if (tcp->tcp_state >= TCPS_ESTABLISHED &&
		    tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) {
			/* Use tcp_suna as the sequence when tcp_swnd is 0. */
			tcp_xmit_ctl(NULL, tcp,
			    (tcp->tcp_swnd == 0) ? tcp->tcp_suna :
			    tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK);
		}
	}

	squeue_synch_exit(connp, SQ_NODRAIN);
}
657 
658 /* ARGSUSED */
659 static int
660 tcp_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
661     int mode, int32_t *rvalp, cred_t *cr)
662 {
663 	conn_t		*connp = (conn_t *)proto_handle;
664 	int		error;
665 
666 	ASSERT(connp->conn_upper_handle != NULL);
667 
668 	/* All Solaris components should pass a cred for this operation. */
669 	ASSERT(cr != NULL);
670 
671 	/*
672 	 * If we don't have a helper stream then create one.
673 	 * ip_create_helper_stream takes care of locking the conn_t,
674 	 * so this check for NULL is just a performance optimization.
675 	 */
676 	if (connp->conn_helper_info == NULL) {
677 		tcp_stack_t *tcps = connp->conn_tcp->tcp_tcps;
678 
679 		/*
680 		 * Create a helper stream for non-STREAMS socket.
681 		 */
682 		error = ip_create_helper_stream(connp, tcps->tcps_ldi_ident);
683 		if (error != 0) {
684 			ip0dbg(("tcp_ioctl: create of IP helper stream "
685 			    "failed %d\n", error));
686 			return (error);
687 		}
688 	}
689 
690 	switch (cmd) {
691 		case ND_SET:
692 		case ND_GET:
693 		case _SIOCSOCKFALLBACK:
694 		case TCP_IOC_ABORT_CONN:
695 		case TI_GETPEERNAME:
696 		case TI_GETMYNAME:
697 			ip1dbg(("tcp_ioctl: cmd 0x%x on non streams socket",
698 			    cmd));
699 			error = EINVAL;
700 			break;
701 		default:
702 			/*
703 			 * If the conn is not closing, pass on to IP using
704 			 * helper stream. Bump the ioctlref to prevent tcp_close
705 			 * from closing the rq/wq out from underneath the ioctl
706 			 * if it ends up queued or aborted/interrupted.
707 			 */
708 			mutex_enter(&connp->conn_lock);
709 			if (connp->conn_state_flags & (CONN_CLOSING)) {
710 				mutex_exit(&connp->conn_lock);
711 				error = EINVAL;
712 				break;
713 			}
714 			CONN_INC_IOCTLREF_LOCKED(connp);
715 			error = ldi_ioctl(connp->conn_helper_info->iphs_handle,
716 			    cmd, arg, mode, cr, rvalp);
717 			CONN_DEC_IOCTLREF(connp);
718 			break;
719 	}
720 	return (error);
721 }
722 
723 /* ARGSUSED */
724 static int
725 tcp_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
726 {
727 	conn_t *connp = (conn_t *)proto_handle;
728 
729 	ASSERT(connp->conn_upper_handle != NULL);
730 
731 	/* All Solaris components should pass a cred for this operation. */
732 	ASSERT(cr != NULL);
733 
734 	tcp_close_common(connp, flags);
735 
736 	ip_free_helper_stream(connp);
737 
738 	/*
739 	 * Drop IP's reference on the conn. This is the last reference
740 	 * on the connp if the state was less than established. If the
741 	 * connection has gone into timewait state, then we will have
742 	 * one ref for the TCP and one more ref (total of two) for the
743 	 * classifier connected hash list (a timewait connections stays
744 	 * in connected hash till closed).
745 	 *
746 	 * We can't assert the references because there might be other
747 	 * transient reference places because of some walkers or queued
748 	 * packets in squeue for the timewait state.
749 	 */
750 	CONN_DEC_REF(connp);
751 
752 	/*
753 	 * EINPROGRESS tells sockfs to wait for a 'closed' upcall before
754 	 * freeing the socket.
755 	 */
756 	return (EINPROGRESS);
757 }
758 
759 /* ARGSUSED */
760 sock_lower_handle_t
761 tcp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
762     uint_t *smodep, int *errorp, int flags, cred_t *credp)
763 {
764 	conn_t		*connp;
765 	boolean_t	isv6 = family == AF_INET6;
766 
767 	if (type != SOCK_STREAM || (family != AF_INET && family != AF_INET6) ||
768 	    (proto != 0 && proto != IPPROTO_TCP)) {
769 		*errorp = EPROTONOSUPPORT;
770 		return (NULL);
771 	}
772 
773 	connp = tcp_create_common(credp, isv6, B_TRUE, errorp);
774 	if (connp == NULL) {
775 		return (NULL);
776 	}
777 
778 	/*
779 	 * Put the ref for TCP. Ref for IP was already put
780 	 * by ipcl_conn_create. Also make the conn_t globally
781 	 * visible to walkers
782 	 */
783 	mutex_enter(&connp->conn_lock);
784 	CONN_INC_REF_LOCKED(connp);
785 	ASSERT(connp->conn_ref == 2);
786 	connp->conn_state_flags &= ~CONN_INCIPIENT;
787 
788 	connp->conn_flags |= IPCL_NONSTR;
789 	mutex_exit(&connp->conn_lock);
790 
791 	ASSERT(errorp != NULL);
792 	*errorp = 0;
793 	*sock_downcalls = &sock_tcp_downcalls;
794 	*smodep = SM_CONNREQUIRED | SM_EXDATA | SM_ACCEPTSUPP |
795 	    SM_SENDFILESUPP;
796 
797 	return ((sock_lower_handle_t)connp);
798 }
799 
800 /*
801  * tcp_fallback
802  *
803  * A direct socket is falling back to using STREAMS. The queue
804  * that is being passed down was created using tcp_open() with
805  * the SO_FALLBACK flag set. As a result, the queue is not
806  * associated with a conn, and the q_ptrs instead contain the
807  * dev and minor area that should be used.
808  *
809  * The 'issocket' flag indicates whether the FireEngine
810  * optimizations should be used. The common case would be that
811  * optimizations are enabled, and they might be subsequently
812  * disabled using the _SIOCSOCKFALLBACK ioctl.
813  */
814 
/*
 * An active connection is falling back to TPI. Gather all the information
 * required by the STREAM head and TPI sonode and send it up.
 *
 * The conn is attached to the queue pair `q', the stream head is told about
 * the stream options using `stropt_mp', and `quiesced_cb' is invoked with
 * the capability information, local/peer addresses and socket options. Any
 * messages the callback returns are passed upstream with putnext().
 */
static void
tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q,
    boolean_t issocket, so_proto_quiesced_cb_t quiesced_cb,
    sock_quiesce_arg_t *arg)
{
	conn_t			*connp = tcp->tcp_connp;
	struct stroptions	*stropt;
	struct T_capability_ack tca;
	struct sockaddr_in6	laddr, faddr;
	socklen_t		laddrlen, faddrlen;
	short			opts;
	int			error;
	mblk_t			*mp, *mpnext;

	/*
	 * The q_ptrs carry the dev and minor arena until they are replaced
	 * with the conn below, so read them out first.
	 */
	connp->conn_dev = (dev_t)RD(q)->q_ptr;
	connp->conn_minor_arena = WR(q)->q_ptr;

	RD(q)->q_ptr = WR(q)->q_ptr = connp;

	connp->conn_rq = RD(q);
	connp->conn_wq = WR(q);

	WR(q)->q_qinfo = &tcp_sock_winit;

	if (!issocket)
		tcp_use_pure_tpi(tcp);

	/*
	 * free the helper stream
	 */
	ip_free_helper_stream(connp);

	/*
	 * Notify the STREAM head about options
	 */
	DB_TYPE(stropt_mp) = M_SETOPTS;
	stropt = (struct stroptions *)stropt_mp->b_rptr;
	stropt_mp->b_wptr += sizeof (struct stroptions);
	stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK;

	/* Loopback connections do not get the extra write offset. */
	stropt->so_wroff = connp->conn_ht_iphc_len + (tcp->tcp_loopback ? 0 :
	    tcp->tcp_tcps->tcps_wroff_xtra);
	if (tcp->tcp_snd_sack_ok)
		stropt->so_wroff += TCPOPT_MAX_SACK_LEN;
	stropt->so_hiwat = connp->conn_rcvbuf;
	stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE);

	putnext(RD(q), stropt_mp);

	/*
	 * Collect the information needed to sync with the sonode
	 */
	tcp_do_capability_ack(tcp, &tca, TC1_INFO|TC1_ACCEPTOR_ID);

	laddrlen = faddrlen = sizeof (sin6_t);
	(void) tcp_getsockname((sock_lower_handle_t)connp,
	    (struct sockaddr *)&laddr, &laddrlen, CRED());
	error = tcp_getpeername((sock_lower_handle_t)connp,
	    (struct sockaddr *)&faddr, &faddrlen, CRED());
	/* No peer address available: report a zero-length peer address. */
	if (error != 0)
		faddrlen = 0;

	opts = 0;
	if (connp->conn_oobinline)
		opts |= SO_OOBINLINE;
	if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
		opts |= SO_DONTROUTE;

	/*
	 * Notify the socket that the protocol is now quiescent,
	 * and it's therefore safe to move data from the socket
	 * to the stream head.
	 */
	mp = (*quiesced_cb)(connp->conn_upper_handle, arg, &tca,
	    (struct sockaddr *)&laddr, laddrlen,
	    (struct sockaddr *)&faddr, faddrlen, opts);

	/*
	 * Pass each returned message upstream, detaching it from the
	 * b_next chain first; tcp_rcv_list tracks the remaining messages.
	 */
	while (mp != NULL) {
		mpnext = mp->b_next;
		tcp->tcp_rcv_list = mp->b_next;
		mp->b_next = NULL;
		putnext(q, mp);
		mp = mpnext;
	}
	ASSERT(tcp->tcp_rcv_last_head == NULL);
	ASSERT(tcp->tcp_rcv_last_tail == NULL);
	ASSERT(tcp->tcp_rcv_cnt == 0);

	/*
	 * All eagers in q0 are marked as being non-STREAM, so they will
	 * make su_newconn upcalls when the handshake completes, which
	 * will fail (resulting in the conn being closed). So we just blow
	 * off everything in q0 instead of waiting for the inevitable.
	 */
	if (tcp->tcp_conn_req_cnt_q0 != 0)
		tcp_eager_cleanup(tcp, B_TRUE);
}
916 
917 /*
918  * An eager is falling back to TPI. All we have to do is send
919  * up a T_CONN_IND.
920  */
921 static void
922 tcp_fallback_eager(tcp_t *eager, boolean_t issocket,
923     so_proto_quiesced_cb_t quiesced_cb, sock_quiesce_arg_t *arg)
924 {
925 	conn_t *connp = eager->tcp_connp;
926 	tcp_t *listener = eager->tcp_listener;
927 	mblk_t *mp;
928 
929 	ASSERT(listener != NULL);
930 
931 	/*
932 	 * Notify the socket that the protocol is now quiescent,
933 	 * and it's therefore safe move data from the socket
934 	 * to tcp's rcv queue.
935 	 */
936 	mp = (*quiesced_cb)(connp->conn_upper_handle, arg, NULL, NULL, 0,
937 	    NULL, 0, 0);
938 
939 	if (mp != NULL) {
940 		ASSERT(eager->tcp_rcv_cnt == 0);
941 
942 		eager->tcp_rcv_list = mp;
943 		eager->tcp_rcv_cnt = msgdsize(mp);
944 		while (mp->b_next != NULL) {
945 			mp = mp->b_next;
946 			eager->tcp_rcv_cnt += msgdsize(mp);
947 		}
948 		eager->tcp_rcv_last_head = mp;
949 		while (mp->b_cont)
950 			mp = mp->b_cont;
951 		eager->tcp_rcv_last_tail = mp;
952 		if (eager->tcp_rcv_cnt > eager->tcp_rwnd)
953 			eager->tcp_rwnd = 0;
954 		else
955 			eager->tcp_rwnd -= eager->tcp_rcv_cnt;
956 	}
957 
958 	if (!issocket)
959 		eager->tcp_issocket = B_FALSE;
960 	/*
961 	 * The stream for this eager does not yet exist, so mark it as
962 	 * being detached.
963 	 */
964 	eager->tcp_detached = B_TRUE;
965 	eager->tcp_hard_binding = B_TRUE;
966 	connp->conn_rq = listener->tcp_connp->conn_rq;
967 	connp->conn_wq = listener->tcp_connp->conn_wq;
968 
969 	/* Send up the connection indication */
970 	mp = eager->tcp_conn.tcp_eager_conn_ind;
971 	ASSERT(mp != NULL);
972 	eager->tcp_conn.tcp_eager_conn_ind = NULL;
973 
974 	/*
975 	 * TLI/XTI applications will get confused by
976 	 * sending eager as an option since it violates
977 	 * the option semantics. So remove the eager as
978 	 * option since TLI/XTI app doesn't need it anyway.
979 	 */
980 	if (!issocket) {
981 		struct T_conn_ind *conn_ind;
982 
983 		conn_ind = (struct T_conn_ind *)mp->b_rptr;
984 		conn_ind->OPT_length = 0;
985 		conn_ind->OPT_offset = 0;
986 	}
987 
988 	/*
989 	 * Sockfs guarantees that the listener will not be closed
990 	 * during fallback. So we can safely use the listener's queue.
991 	 */
992 	putnext(listener->tcp_connp->conn_rq, mp);
993 }
994 
995 
996 int
997 tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
998     boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb,
999     sock_quiesce_arg_t *arg)
1000 {
1001 	tcp_t			*tcp;
1002 	conn_t			*connp = (conn_t *)proto_handle;
1003 	int			error;
1004 	mblk_t			*stropt_mp;
1005 	mblk_t			*ordrel_mp;
1006 
1007 	tcp = connp->conn_tcp;
1008 
1009 	stropt_mp = allocb_wait(sizeof (struct stroptions), BPRI_HI, STR_NOSIG,
1010 	    NULL);
1011 
1012 	/* Pre-allocate the T_ordrel_ind mblk. */
1013 	ASSERT(tcp->tcp_ordrel_mp == NULL);
1014 	ordrel_mp = allocb_wait(sizeof (struct T_ordrel_ind), BPRI_HI,
1015 	    STR_NOSIG, NULL);
1016 	ordrel_mp->b_datap->db_type = M_PROTO;
1017 	((struct T_ordrel_ind *)ordrel_mp->b_rptr)->PRIM_type = T_ORDREL_IND;
1018 	ordrel_mp->b_wptr += sizeof (struct T_ordrel_ind);
1019 
1020 	/*
1021 	 * Enter the squeue so that no new packets can come in
1022 	 */
1023 	error = squeue_synch_enter(connp, NULL);
1024 	if (error != 0) {
1025 		/* failed to enter, free all the pre-allocated messages. */
1026 		freeb(stropt_mp);
1027 		freeb(ordrel_mp);
1028 		return (ENOMEM);
1029 	}
1030 
1031 	/*
1032 	 * Both endpoints must be of the same type (either STREAMS or
1033 	 * non-STREAMS) for fusion to be enabled. So if we are fused,
1034 	 * we have to unfuse.
1035 	 */
1036 	if (tcp->tcp_fused)
1037 		tcp_unfuse(tcp);
1038 
1039 	if (tcp->tcp_listener != NULL) {
1040 		/* The eager will deal with opts when accept() is called */
1041 		freeb(stropt_mp);
1042 		tcp_fallback_eager(tcp, direct_sockfs, quiesced_cb, arg);
1043 	} else {
1044 		tcp_fallback_noneager(tcp, stropt_mp, q, direct_sockfs,
1045 		    quiesced_cb, arg);
1046 	}
1047 
1048 	/*
1049 	 * No longer a direct socket
1050 	 *
1051 	 * Note that we intentionally leave the upper_handle and upcalls
1052 	 * intact, since eagers may still be using them.
1053 	 */
1054 	connp->conn_flags &= ~IPCL_NONSTR;
1055 	tcp->tcp_ordrel_mp = ordrel_mp;
1056 
1057 	/*
1058 	 * There should be atleast two ref's (IP + TCP)
1059 	 */
1060 	ASSERT(connp->conn_ref >= 2);
1061 	squeue_synch_exit(connp, SQ_NODRAIN);
1062 
1063 	return (0);
1064 }
1065 
1066 /*
1067  * Notifies a non-STREAMS based listener about a new connection. This
1068  * function is executed on the *eager*'s squeue once the 3 way handshake
1069  * has completed. Note that the behavior differs from STREAMS, where the
1070  * T_CONN_IND is sent up by tcp_send_conn_ind() while on the *listener*'s
1071  * squeue.
1072  *
1073  * Returns B_TRUE if the notification succeeded and an upper handle was
1074  * obtained. `tcp' should be closed on failure.
1075  */
1076 boolean_t
1077 tcp_newconn_notify(tcp_t *tcp, ip_recv_attr_t *ira)
1078 {
1079 	tcp_t *listener = tcp->tcp_listener;
1080 	conn_t *lconnp = listener->tcp_connp;
1081 	conn_t *econnp = tcp->tcp_connp;
1082 	tcp_t *tail;
1083 	ipaddr_t *addr_cache;
1084 	sock_upper_handle_t upper;
1085 	struct sock_proto_props sopp;
1086 
1087 	mutex_enter(&listener->tcp_eager_lock);
1088 	/*
1089 	 * Take the eager out, if it is in the list of droppable eagers
1090 	 * as we are here because the 3W handshake is over.
1091 	 */
1092 	MAKE_UNDROPPABLE(tcp);
1093 	/*
1094 	 * The eager already has an extra ref put in tcp_input_data
1095 	 * so that it stays till accept comes back even though it
1096 	 * might get into TCPS_CLOSED as a result of a TH_RST etc.
1097 	 */
1098 	ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
1099 	listener->tcp_conn_req_cnt_q0--;
1100 	listener->tcp_conn_req_cnt_q++;
1101 
1102 	/* Move from SYN_RCVD to ESTABLISHED list  */
1103 	tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = tcp->tcp_eager_prev_q0;
1104 	tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp->tcp_eager_next_q0;
1105 	tcp->tcp_eager_prev_q0 = NULL;
1106 	tcp->tcp_eager_next_q0 = NULL;
1107 
1108 	/*
1109 	 * Insert at end of the queue because connections are accepted
1110 	 * in chronological order. Leaving the older connections at front
1111 	 * of the queue helps reducing search time.
1112 	 */
1113 	tail = listener->tcp_eager_last_q;
1114 	if (tail != NULL)
1115 		tail->tcp_eager_next_q = tcp;
1116 	else
1117 		listener->tcp_eager_next_q = tcp;
1118 	listener->tcp_eager_last_q = tcp;
1119 	tcp->tcp_eager_next_q = NULL;
1120 
1121 	/* we have timed out before */
1122 	if (tcp->tcp_syn_rcvd_timeout != 0) {
1123 		tcp->tcp_syn_rcvd_timeout = 0;
1124 		listener->tcp_syn_rcvd_timeout--;
1125 		if (listener->tcp_syn_defense &&
1126 		    listener->tcp_syn_rcvd_timeout <=
1127 		    (listener->tcp_tcps->tcps_conn_req_max_q0 >> 5) &&
1128 		    10*MINUTES < TICK_TO_MSEC(ddi_get_lbolt64() -
1129 		    listener->tcp_last_rcv_lbolt)) {
1130 			/*
1131 			 * Turn off the defense mode if we
1132 			 * believe the SYN attack is over.
1133 			 */
1134 			listener->tcp_syn_defense = B_FALSE;
1135 			if (listener->tcp_ip_addr_cache) {
1136 				kmem_free((void *)listener->tcp_ip_addr_cache,
1137 				    IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
1138 				listener->tcp_ip_addr_cache = NULL;
1139 			}
1140 		}
1141 	}
1142 	addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache);
1143 	if (addr_cache != NULL) {
1144 		/*
1145 		 * We have finished a 3-way handshake with this
1146 		 * remote host. This proves the IP addr is good.
1147 		 * Cache it!
1148 		 */
1149 		addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] =
1150 		    tcp->tcp_connp->conn_faddr_v4;
1151 	}
1152 	mutex_exit(&listener->tcp_eager_lock);
1153 
1154 	/*
1155 	 * Notify the ULP about the newconn. It is guaranteed that no
1156 	 * tcp_accept() call will be made for the eager if the
1157 	 * notification fails.
1158 	 */
1159 	if ((upper = (*lconnp->conn_upcalls->su_newconn)
1160 	    (lconnp->conn_upper_handle, (sock_lower_handle_t)econnp,
1161 	    &sock_tcp_downcalls, ira->ira_cred, ira->ira_cpid,
1162 	    &econnp->conn_upcalls)) == NULL) {
1163 		return (B_FALSE);
1164 	}
1165 	econnp->conn_upper_handle = upper;
1166 
1167 	tcp->tcp_detached = B_FALSE;
1168 	tcp->tcp_hard_binding = B_FALSE;
1169 	tcp->tcp_tconnind_started = B_TRUE;
1170 
1171 	if (econnp->conn_keepalive) {
1172 		tcp->tcp_ka_last_intrvl = 0;
1173 		tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer,
1174 		    tcp->tcp_ka_interval);
1175 	}
1176 
1177 	/* Update the necessary parameters */
1178 	tcp_get_proto_props(tcp, &sopp);
1179 
1180 	(*econnp->conn_upcalls->su_set_proto_props)
1181 	    (econnp->conn_upper_handle, &sopp);
1182 
1183 	return (B_TRUE);
1184 }
1185