xref: /illumos-gate/usr/src/uts/common/inet/tcp/tcp_socket.c (revision 5e989a96186a37eb528fb7bb4d28a150874ec799)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 /* This file contains all TCP kernel socket related functions. */
27 
28 #include <sys/types.h>
29 #include <sys/strlog.h>
30 #include <sys/policy.h>
31 #include <sys/sockio.h>
32 #include <sys/strsubr.h>
33 #include <sys/strsun.h>
34 #include <sys/squeue_impl.h>
35 #include <sys/squeue.h>
36 #define	_SUN_TPI_VERSION 2
37 #include <sys/tihdr.h>
38 #include <sys/timod.h>
39 #include <sys/tpicommon.h>
40 #include <sys/socketvar.h>
41 
42 #include <inet/common.h>
43 #include <inet/proto_set.h>
44 #include <inet/ip.h>
45 #include <inet/tcp.h>
46 #include <inet/tcp_impl.h>
47 
/*
 * Forward declarations of the file-local socket downcall handlers. They are
 * declared ahead of their definitions so that the sock_tcp_downcalls table
 * can reference them.
 */
static void	tcp_activate(sock_lower_handle_t, sock_upper_handle_t,
		    sock_upcalls_t *, int, cred_t *);
static int	tcp_accept(sock_lower_handle_t, sock_lower_handle_t,
		    sock_upper_handle_t, cred_t *);
static int	tcp_bind(sock_lower_handle_t, struct sockaddr *,
		    socklen_t, cred_t *);
static int	tcp_listen(sock_lower_handle_t, int, cred_t *);
static int	tcp_connect(sock_lower_handle_t, const struct sockaddr *,
		    socklen_t, sock_connid_t *, cred_t *);
static int	tcp_getpeername(sock_lower_handle_t, struct sockaddr *,
		    socklen_t *, cred_t *);
static int	tcp_getsockname(sock_lower_handle_t, struct sockaddr *,
		    socklen_t *, cred_t *);
static int	tcp_getsockopt(sock_lower_handle_t, int, int, void *,
		    socklen_t *, cred_t *);
static int	tcp_setsockopt(sock_lower_handle_t, int, int, const void *,
		    socklen_t, cred_t *);
static int	tcp_sendmsg(sock_lower_handle_t, mblk_t *, struct nmsghdr *,
		    cred_t *);
static int	tcp_shutdown(sock_lower_handle_t, int, cred_t *);
static void	tcp_clr_flowctrl(sock_lower_handle_t);
static int	tcp_ioctl(sock_lower_handle_t, int, intptr_t, int, int32_t *,
		    cred_t *);
static int	tcp_close(sock_lower_handle_t, int, cred_t *);
72 
/*
 * Downcall vector for TCP sockets. tcp_create() hands its address back to
 * the caller through *sock_downcalls. The initializer is positional, so the
 * order of the entries must not be changed.
 *
 * NOTE(review): the three NULL entries are slots for which this file
 * supplies no handler; confirm which downcalls they correspond to against
 * the sock_downcalls_t definition.
 */
sock_downcalls_t sock_tcp_downcalls = {
	tcp_activate,
	tcp_accept,
	tcp_bind,
	tcp_listen,
	tcp_connect,
	tcp_getpeername,
	tcp_getsockname,
	tcp_getsockopt,
	tcp_setsockopt,
	tcp_sendmsg,
	NULL,
	NULL,
	NULL,
	tcp_shutdown,
	tcp_clr_flowctrl,
	tcp_ioctl,
	tcp_close,
};
92 
93 /* ARGSUSED */
94 static void
95 tcp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle,
96     sock_upcalls_t *sock_upcalls, int flags, cred_t *cr)
97 {
98 	conn_t *connp = (conn_t *)proto_handle;
99 	struct sock_proto_props sopp;
100 	extern struct module_info tcp_rinfo;
101 
102 	ASSERT(connp->conn_upper_handle == NULL);
103 
104 	/* All Solaris components should pass a cred for this operation. */
105 	ASSERT(cr != NULL);
106 
107 	sopp.sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT |
108 	    SOCKOPT_MAXPSZ | SOCKOPT_MAXBLK | SOCKOPT_RCVTIMER |
109 	    SOCKOPT_RCVTHRESH | SOCKOPT_MAXADDRLEN | SOCKOPT_MINPSZ;
110 
111 	sopp.sopp_rxhiwat = SOCKET_RECVHIWATER;
112 	sopp.sopp_rxlowat = SOCKET_RECVLOWATER;
113 	sopp.sopp_maxpsz = INFPSZ;
114 	sopp.sopp_maxblk = INFPSZ;
115 	sopp.sopp_rcvtimer = SOCKET_TIMER_INTERVAL;
116 	sopp.sopp_rcvthresh = SOCKET_RECVHIWATER >> 3;
117 	sopp.sopp_maxaddrlen = sizeof (sin6_t);
118 	sopp.sopp_minpsz = (tcp_rinfo.mi_minpsz == 1) ? 0 :
119 	    tcp_rinfo.mi_minpsz;
120 
121 	connp->conn_upcalls = sock_upcalls;
122 	connp->conn_upper_handle = sock_handle;
123 
124 	ASSERT(connp->conn_rcvbuf != 0 &&
125 	    connp->conn_rcvbuf == connp->conn_tcp->tcp_rwnd);
126 	(*sock_upcalls->su_set_proto_props)(sock_handle, &sopp);
127 }
128 
129 /*ARGSUSED*/
130 static int
131 tcp_accept(sock_lower_handle_t lproto_handle,
132     sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle,
133     cred_t *cr)
134 {
135 	conn_t *lconnp, *econnp;
136 	tcp_t *listener, *eager;
137 
138 	/*
139 	 * KSSL can move a socket from one listener to another, in which
140 	 * case `lproto_handle' points to the new listener. To ensure that
141 	 * the original listener is used the information is obtained from
142 	 * the eager.
143 	 */
144 	econnp = (conn_t *)eproto_handle;
145 	eager = econnp->conn_tcp;
146 	ASSERT(IPCL_IS_NONSTR(econnp));
147 	ASSERT(eager->tcp_listener != NULL);
148 	listener = eager->tcp_listener;
149 	lconnp = (conn_t *)listener->tcp_connp;
150 	ASSERT(listener->tcp_state == TCPS_LISTEN);
151 	ASSERT(lconnp->conn_upper_handle != NULL);
152 
153 	/*
154 	 * It is possible for the accept thread to race with the thread that
155 	 * made the su_newconn upcall in tcp_newconn_notify. Both
156 	 * tcp_newconn_notify and tcp_accept require that conn_upper_handle
157 	 * and conn_upcalls be set before returning, so they both write to
158 	 * them. However, we're guaranteed that the value written is the same
159 	 * for both threads.
160 	 */
161 	ASSERT(econnp->conn_upper_handle == NULL ||
162 	    econnp->conn_upper_handle == sock_handle);
163 	ASSERT(econnp->conn_upcalls == NULL ||
164 	    econnp->conn_upcalls == lconnp->conn_upcalls);
165 	econnp->conn_upper_handle = sock_handle;
166 	econnp->conn_upcalls = lconnp->conn_upcalls;
167 
168 	ASSERT(econnp->conn_netstack ==
169 	    listener->tcp_connp->conn_netstack);
170 	ASSERT(eager->tcp_tcps == listener->tcp_tcps);
171 
172 	/*
173 	 * We should have a minimum of 2 references on the conn at this
174 	 * point. One for TCP and one for the newconn notification
175 	 * (which is now taken over by IP). In the normal case we would
176 	 * also have another reference (making a total of 3) for the conn
177 	 * being in the classifier hash list. However the eager could have
178 	 * received an RST subsequently and tcp_closei_local could have
179 	 * removed the eager from the classifier hash list, hence we can't
180 	 * assert that reference.
181 	 */
182 	ASSERT(econnp->conn_ref >= 2);
183 
184 	mutex_enter(&listener->tcp_eager_lock);
185 	/*
186 	 * Non-STREAMS listeners never defer the notification of new
187 	 * connections.
188 	 */
189 	ASSERT(!listener->tcp_eager_prev_q0->tcp_conn_def_q0);
190 	tcp_eager_unlink(eager);
191 	mutex_exit(&listener->tcp_eager_lock);
192 	CONN_DEC_REF(listener->tcp_connp);
193 
194 	return ((eager->tcp_state < TCPS_ESTABLISHED) ? ECONNABORTED : 0);
195 }
196 
197 static int
198 tcp_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
199     socklen_t len, cred_t *cr)
200 {
201 	int 		error;
202 	conn_t		*connp = (conn_t *)proto_handle;
203 
204 	/* All Solaris components should pass a cred for this operation. */
205 	ASSERT(cr != NULL);
206 	ASSERT(connp->conn_upper_handle != NULL);
207 
208 	error = squeue_synch_enter(connp, NULL);
209 	if (error != 0) {
210 		/* failed to enter */
211 		return (ENOSR);
212 	}
213 
214 	/* binding to a NULL address really means unbind */
215 	if (sa == NULL) {
216 		if (connp->conn_tcp->tcp_state < TCPS_LISTEN)
217 			error = tcp_do_unbind(connp);
218 		else
219 			error = EINVAL;
220 	} else {
221 		error = tcp_do_bind(connp, sa, len, cr, B_TRUE);
222 	}
223 
224 	squeue_synch_exit(connp);
225 
226 	if (error < 0) {
227 		if (error == -TOUTSTATE)
228 			error = EINVAL;
229 		else
230 			error = proto_tlitosyserr(-error);
231 	}
232 
233 	return (error);
234 }
235 
236 /* ARGSUSED */
237 static int
238 tcp_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr)
239 {
240 	conn_t	*connp = (conn_t *)proto_handle;
241 	tcp_t	*tcp = connp->conn_tcp;
242 	int 	error;
243 
244 	ASSERT(connp->conn_upper_handle != NULL);
245 
246 	/* All Solaris components should pass a cred for this operation. */
247 	ASSERT(cr != NULL);
248 
249 	error = squeue_synch_enter(connp, NULL);
250 	if (error != 0) {
251 		/* failed to enter */
252 		return (ENOBUFS);
253 	}
254 
255 	error = tcp_do_listen(connp, NULL, 0, backlog, cr, B_FALSE);
256 	if (error == 0) {
257 		/*
258 		 * sockfs needs to know what's the maximum number of socket
259 		 * that can be queued on the listener.
260 		 */
261 		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
262 		    SOCK_OPCTL_ENAB_ACCEPT,
263 		    (uintptr_t)(tcp->tcp_conn_req_max +
264 		    tcp->tcp_tcps->tcps_conn_req_max_q0));
265 	} else if (error < 0) {
266 		if (error == -TOUTSTATE)
267 			error = EINVAL;
268 		else
269 			error = proto_tlitosyserr(-error);
270 	}
271 	squeue_synch_exit(connp);
272 	return (error);
273 }
274 
275 static int
276 tcp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
277     socklen_t len, sock_connid_t *id, cred_t *cr)
278 {
279 	conn_t		*connp = (conn_t *)proto_handle;
280 	int		error;
281 
282 	ASSERT(connp->conn_upper_handle != NULL);
283 
284 	/* All Solaris components should pass a cred for this operation. */
285 	ASSERT(cr != NULL);
286 
287 	error = proto_verify_ip_addr(connp->conn_family, sa, len);
288 	if (error != 0) {
289 		return (error);
290 	}
291 
292 	error = squeue_synch_enter(connp, NULL);
293 	if (error != 0) {
294 		/* failed to enter */
295 		return (ENOSR);
296 	}
297 
298 	/*
299 	 * TCP supports quick connect, so no need to do an implicit bind
300 	 */
301 	error = tcp_do_connect(connp, sa, len, cr, curproc->p_pid);
302 	if (error == 0) {
303 		*id = connp->conn_tcp->tcp_connid;
304 	} else if (error < 0) {
305 		if (error == -TOUTSTATE) {
306 			switch (connp->conn_tcp->tcp_state) {
307 			case TCPS_SYN_SENT:
308 				error = EALREADY;
309 				break;
310 			case TCPS_ESTABLISHED:
311 				error = EISCONN;
312 				break;
313 			case TCPS_LISTEN:
314 				error = EOPNOTSUPP;
315 				break;
316 			default:
317 				error = EINVAL;
318 				break;
319 			}
320 		} else {
321 			error = proto_tlitosyserr(-error);
322 		}
323 	}
324 
325 	if (connp->conn_tcp->tcp_loopback) {
326 		struct sock_proto_props sopp;
327 
328 		sopp.sopp_flags = SOCKOPT_LOOPBACK;
329 		sopp.sopp_loopback = B_TRUE;
330 
331 		(*connp->conn_upcalls->su_set_proto_props)(
332 		    connp->conn_upper_handle, &sopp);
333 	}
334 done:
335 	squeue_synch_exit(connp);
336 
337 	return ((error == 0) ? EINPROGRESS : error);
338 }
339 
340 /* ARGSUSED3 */
341 static int
342 tcp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr,
343     socklen_t *addrlenp, cred_t *cr)
344 {
345 	conn_t	*connp = (conn_t *)proto_handle;
346 	tcp_t	*tcp = connp->conn_tcp;
347 
348 	/* All Solaris components should pass a cred for this operation. */
349 	ASSERT(cr != NULL);
350 
351 	ASSERT(tcp != NULL);
352 	if (tcp->tcp_state < TCPS_SYN_RCVD)
353 		return (ENOTCONN);
354 
355 	return (conn_getpeername(connp, addr, addrlenp));
356 }
357 
358 /* ARGSUSED3 */
359 static int
360 tcp_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *addr,
361     socklen_t *addrlenp, cred_t *cr)
362 {
363 	conn_t	*connp = (conn_t *)proto_handle;
364 
365 	/* All Solaris components should pass a cred for this operation. */
366 	ASSERT(cr != NULL);
367 
368 	return (conn_getsockname(connp, addr, addrlenp));
369 }
370 
371 /* returns UNIX error, the optlen is a value-result arg */
372 static int
373 tcp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
374     void *optvalp, socklen_t *optlen, cred_t *cr)
375 {
376 	conn_t		*connp = (conn_t *)proto_handle;
377 	int		error;
378 	t_uscalar_t	max_optbuf_len;
379 	void		*optvalp_buf;
380 	int		len;
381 
382 	ASSERT(connp->conn_upper_handle != NULL);
383 
384 	error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len,
385 	    tcp_opt_obj.odb_opt_des_arr,
386 	    tcp_opt_obj.odb_opt_arr_cnt,
387 	    B_FALSE, B_TRUE, cr);
388 	if (error != 0) {
389 		if (error < 0) {
390 			error = proto_tlitosyserr(-error);
391 		}
392 		return (error);
393 	}
394 
395 	optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP);
396 
397 	error = squeue_synch_enter(connp, NULL);
398 	if (error == ENOMEM) {
399 		kmem_free(optvalp_buf, max_optbuf_len);
400 		return (ENOMEM);
401 	}
402 
403 	len = tcp_opt_get(connp, level, option_name, optvalp_buf);
404 	squeue_synch_exit(connp);
405 
406 	if (len == -1) {
407 		kmem_free(optvalp_buf, max_optbuf_len);
408 		return (EINVAL);
409 	}
410 
411 	/*
412 	 * update optlen and copy option value
413 	 */
414 	t_uscalar_t size = MIN(len, *optlen);
415 
416 	bcopy(optvalp_buf, optvalp, size);
417 	bcopy(&size, optlen, sizeof (size));
418 
419 	kmem_free(optvalp_buf, max_optbuf_len);
420 	return (0);
421 }
422 
423 static int
424 tcp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
425     const void *optvalp, socklen_t optlen, cred_t *cr)
426 {
427 	conn_t		*connp = (conn_t *)proto_handle;
428 	int		error;
429 
430 	ASSERT(connp->conn_upper_handle != NULL);
431 	/*
432 	 * Entering the squeue synchronously can result in a context switch,
433 	 * which can cause a rather sever performance degradation. So we try to
434 	 * handle whatever options we can without entering the squeue.
435 	 */
436 	if (level == IPPROTO_TCP) {
437 		switch (option_name) {
438 		case TCP_NODELAY:
439 			if (optlen != sizeof (int32_t))
440 				return (EINVAL);
441 			mutex_enter(&connp->conn_tcp->tcp_non_sq_lock);
442 			connp->conn_tcp->tcp_naglim = *(int *)optvalp ? 1 :
443 			    connp->conn_tcp->tcp_mss;
444 			mutex_exit(&connp->conn_tcp->tcp_non_sq_lock);
445 			return (0);
446 		default:
447 			break;
448 		}
449 	}
450 
451 	error = squeue_synch_enter(connp, NULL);
452 	if (error == ENOMEM) {
453 		return (ENOMEM);
454 	}
455 
456 	error = proto_opt_check(level, option_name, optlen, NULL,
457 	    tcp_opt_obj.odb_opt_des_arr,
458 	    tcp_opt_obj.odb_opt_arr_cnt,
459 	    B_TRUE, B_FALSE, cr);
460 
461 	if (error != 0) {
462 		if (error < 0) {
463 			error = proto_tlitosyserr(-error);
464 		}
465 		squeue_synch_exit(connp);
466 		return (error);
467 	}
468 
469 	error = tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name,
470 	    optlen, (uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp,
471 	    NULL, cr);
472 	squeue_synch_exit(connp);
473 
474 	ASSERT(error >= 0);
475 
476 	return (error);
477 }
478 
479 /* ARGSUSED */
480 static int
481 tcp_sendmsg(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
482     cred_t *cr)
483 {
484 	tcp_t		*tcp;
485 	uint32_t	msize;
486 	conn_t *connp = (conn_t *)proto_handle;
487 	int32_t		tcpstate;
488 
489 	/* All Solaris components should pass a cred for this operation. */
490 	ASSERT(cr != NULL);
491 
492 	ASSERT(connp->conn_ref >= 2);
493 	ASSERT(connp->conn_upper_handle != NULL);
494 
495 	if (msg->msg_controllen != 0) {
496 		freemsg(mp);
497 		return (EOPNOTSUPP);
498 	}
499 
500 	switch (DB_TYPE(mp)) {
501 	case M_DATA:
502 		tcp = connp->conn_tcp;
503 		ASSERT(tcp != NULL);
504 
505 		tcpstate = tcp->tcp_state;
506 		if (tcpstate < TCPS_ESTABLISHED) {
507 			freemsg(mp);
508 			/*
509 			 * We return ENOTCONN if the endpoint is trying to
510 			 * connect or has never been connected, and EPIPE if it
511 			 * has been disconnected. The connection id helps us
512 			 * distinguish between the last two cases.
513 			 */
514 			return ((tcpstate == TCPS_SYN_SENT) ? ENOTCONN :
515 			    ((tcp->tcp_connid > 0) ? EPIPE : ENOTCONN));
516 		} else if (tcpstate > TCPS_CLOSE_WAIT) {
517 			freemsg(mp);
518 			return (EPIPE);
519 		}
520 
521 		msize = msgdsize(mp);
522 
523 		mutex_enter(&tcp->tcp_non_sq_lock);
524 		tcp->tcp_squeue_bytes += msize;
525 		/*
526 		 * Squeue Flow Control
527 		 */
528 		if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) {
529 			tcp_setqfull(tcp);
530 		}
531 		mutex_exit(&tcp->tcp_non_sq_lock);
532 
533 		/*
534 		 * The application may pass in an address in the msghdr, but
535 		 * we ignore the address on connection-oriented sockets.
536 		 * Just like BSD this code does not generate an error for
537 		 * TCP (a CONNREQUIRED socket) when sending to an address
538 		 * passed in with sendto/sendmsg. Instead the data is
539 		 * delivered on the connection as if no address had been
540 		 * supplied.
541 		 */
542 		CONN_INC_REF(connp);
543 
544 		if (msg->msg_flags & MSG_OOB) {
545 			SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output_urgent,
546 			    connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
547 		} else {
548 			SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output,
549 			    connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
550 		}
551 
552 		return (0);
553 
554 	default:
555 		ASSERT(0);
556 	}
557 
558 	freemsg(mp);
559 	return (0);
560 }
561 
562 /* ARGSUSED */
563 static int
564 tcp_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
565 {
566 	conn_t  *connp = (conn_t *)proto_handle;
567 	tcp_t   *tcp = connp->conn_tcp;
568 
569 	ASSERT(connp->conn_upper_handle != NULL);
570 
571 	/* All Solaris components should pass a cred for this operation. */
572 	ASSERT(cr != NULL);
573 
574 	/*
575 	 * X/Open requires that we check the connected state.
576 	 */
577 	if (tcp->tcp_state < TCPS_SYN_SENT)
578 		return (ENOTCONN);
579 
580 	/* shutdown the send side */
581 	if (how != SHUT_RD) {
582 		mblk_t *bp;
583 
584 		bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL);
585 		CONN_INC_REF(connp);
586 		SQUEUE_ENTER_ONE(connp->conn_sqp, bp, tcp_shutdown_output,
587 		    connp, NULL, SQ_NODRAIN, SQTAG_TCP_SHUTDOWN_OUTPUT);
588 
589 		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
590 		    SOCK_OPCTL_SHUT_SEND, 0);
591 	}
592 
593 	/* shutdown the recv side */
594 	if (how != SHUT_WR)
595 		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
596 		    SOCK_OPCTL_SHUT_RECV, 0);
597 
598 	return (0);
599 }
600 
601 static void
602 tcp_clr_flowctrl(sock_lower_handle_t proto_handle)
603 {
604 	conn_t  *connp = (conn_t *)proto_handle;
605 	tcp_t	*tcp = connp->conn_tcp;
606 	mblk_t *mp;
607 	int error;
608 
609 	ASSERT(connp->conn_upper_handle != NULL);
610 
611 	/*
612 	 * If tcp->tcp_rsrv_mp == NULL, it means that tcp_clr_flowctrl()
613 	 * is currently running.
614 	 */
615 	mutex_enter(&tcp->tcp_rsrv_mp_lock);
616 	if ((mp = tcp->tcp_rsrv_mp) == NULL) {
617 		mutex_exit(&tcp->tcp_rsrv_mp_lock);
618 		return;
619 	}
620 	tcp->tcp_rsrv_mp = NULL;
621 	mutex_exit(&tcp->tcp_rsrv_mp_lock);
622 
623 	error = squeue_synch_enter(connp, mp);
624 	ASSERT(error == 0);
625 
626 	mutex_enter(&tcp->tcp_rsrv_mp_lock);
627 	tcp->tcp_rsrv_mp = mp;
628 	mutex_exit(&tcp->tcp_rsrv_mp_lock);
629 
630 	if (tcp->tcp_fused) {
631 		tcp_fuse_backenable(tcp);
632 	} else {
633 		tcp->tcp_rwnd = connp->conn_rcvbuf;
634 		/*
635 		 * Send back a window update immediately if TCP is above
636 		 * ESTABLISHED state and the increase of the rcv window
637 		 * that the other side knows is at least 1 MSS after flow
638 		 * control is lifted.
639 		 */
640 		if (tcp->tcp_state >= TCPS_ESTABLISHED &&
641 		    tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) {
642 			tcp_xmit_ctl(NULL, tcp,
643 			    (tcp->tcp_swnd == 0) ? tcp->tcp_suna :
644 			    tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK);
645 		}
646 	}
647 
648 	squeue_synch_exit(connp);
649 }
650 
651 /* ARGSUSED */
652 static int
653 tcp_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
654     int mode, int32_t *rvalp, cred_t *cr)
655 {
656 	conn_t  	*connp = (conn_t *)proto_handle;
657 	int		error;
658 
659 	ASSERT(connp->conn_upper_handle != NULL);
660 
661 	/* All Solaris components should pass a cred for this operation. */
662 	ASSERT(cr != NULL);
663 
664 	/*
665 	 * If we don't have a helper stream then create one.
666 	 * ip_create_helper_stream takes care of locking the conn_t,
667 	 * so this check for NULL is just a performance optimization.
668 	 */
669 	if (connp->conn_helper_info == NULL) {
670 		tcp_stack_t *tcps = connp->conn_tcp->tcp_tcps;
671 
672 		/*
673 		 * Create a helper stream for non-STREAMS socket.
674 		 */
675 		error = ip_create_helper_stream(connp, tcps->tcps_ldi_ident);
676 		if (error != 0) {
677 			ip0dbg(("tcp_ioctl: create of IP helper stream "
678 			    "failed %d\n", error));
679 			return (error);
680 		}
681 	}
682 
683 	switch (cmd) {
684 		case ND_SET:
685 		case ND_GET:
686 		case _SIOCSOCKFALLBACK:
687 		case TCP_IOC_ABORT_CONN:
688 		case TI_GETPEERNAME:
689 		case TI_GETMYNAME:
690 			ip1dbg(("tcp_ioctl: cmd 0x%x on non streams socket",
691 			    cmd));
692 			error = EINVAL;
693 			break;
694 		default:
695 			/*
696 			 * If the conn is not closing, pass on to IP using
697 			 * helper stream. Bump the ioctlref to prevent tcp_close
698 			 * from closing the rq/wq out from underneath the ioctl
699 			 * if it ends up queued or aborted/interrupted.
700 			 */
701 			mutex_enter(&connp->conn_lock);
702 			if (connp->conn_state_flags & (CONN_CLOSING)) {
703 				mutex_exit(&connp->conn_lock);
704 				error = EINVAL;
705 				break;
706 			}
707 			CONN_INC_IOCTLREF_LOCKED(connp);
708 			error = ldi_ioctl(connp->conn_helper_info->iphs_handle,
709 			    cmd, arg, mode, cr, rvalp);
710 			CONN_DEC_IOCTLREF(connp);
711 			break;
712 	}
713 	return (error);
714 }
715 
716 /* ARGSUSED */
717 static int
718 tcp_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
719 {
720 	conn_t *connp = (conn_t *)proto_handle;
721 
722 	ASSERT(connp->conn_upper_handle != NULL);
723 
724 	/* All Solaris components should pass a cred for this operation. */
725 	ASSERT(cr != NULL);
726 
727 	tcp_close_common(connp, flags);
728 
729 	ip_free_helper_stream(connp);
730 
731 	/*
732 	 * Drop IP's reference on the conn. This is the last reference
733 	 * on the connp if the state was less than established. If the
734 	 * connection has gone into timewait state, then we will have
735 	 * one ref for the TCP and one more ref (total of two) for the
736 	 * classifier connected hash list (a timewait connections stays
737 	 * in connected hash till closed).
738 	 *
739 	 * We can't assert the references because there might be other
740 	 * transient reference places because of some walkers or queued
741 	 * packets in squeue for the timewait state.
742 	 */
743 	CONN_DEC_REF(connp);
744 
745 	/*
746 	 * EINPROGRESS tells sockfs to wait for a 'closed' upcall before
747 	 * freeing the socket.
748 	 */
749 	return (EINPROGRESS);
750 }
751 
752 /* ARGSUSED */
753 sock_lower_handle_t
754 tcp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
755     uint_t *smodep, int *errorp, int flags, cred_t *credp)
756 {
757 	conn_t		*connp;
758 	boolean_t	isv6 = family == AF_INET6;
759 
760 	if (type != SOCK_STREAM || (family != AF_INET && family != AF_INET6) ||
761 	    (proto != 0 && proto != IPPROTO_TCP)) {
762 		*errorp = EPROTONOSUPPORT;
763 		return (NULL);
764 	}
765 
766 	connp = tcp_create_common(credp, isv6, B_TRUE, errorp);
767 	if (connp == NULL) {
768 		return (NULL);
769 	}
770 
771 	/*
772 	 * Put the ref for TCP. Ref for IP was already put
773 	 * by ipcl_conn_create. Also make the conn_t globally
774 	 * visible to walkers
775 	 */
776 	mutex_enter(&connp->conn_lock);
777 	CONN_INC_REF_LOCKED(connp);
778 	ASSERT(connp->conn_ref == 2);
779 	connp->conn_state_flags &= ~CONN_INCIPIENT;
780 
781 	connp->conn_flags |= IPCL_NONSTR;
782 	mutex_exit(&connp->conn_lock);
783 
784 	ASSERT(errorp != NULL);
785 	*errorp = 0;
786 	*sock_downcalls = &sock_tcp_downcalls;
787 	*smodep = SM_CONNREQUIRED | SM_EXDATA | SM_ACCEPTSUPP |
788 	    SM_SENDFILESUPP;
789 
790 	return ((sock_lower_handle_t)connp);
791 }
792 
/*
 * tcp_fallback
 *
 * A direct socket is falling back to using STREAMS. The queue
 * that is being passed down was created using tcp_open() with
 * the SO_FALLBACK flag set. As a result, the queue is not
 * associated with a conn, and the q_ptrs instead contain the
 * dev and minor arena that should be used.
 *
 * The 'issocket' flag indicates whether the FireEngine
 * optimizations should be used. The common case would be that
 * optimizations are enabled, and they might be subsequently
 * disabled using the _SIOCSOCKFALLBACK ioctl.
 */
807 
808 /*
809  * An active connection is falling back to TPI. Gather all the information
810  * required by the STREAM head and TPI sonode and send it up.
811  */
812 static void
813 tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q,
814     boolean_t issocket, so_proto_quiesced_cb_t quiesced_cb,
815     sock_quiesce_arg_t *arg)
816 {
817 	conn_t			*connp = tcp->tcp_connp;
818 	struct stroptions	*stropt;
819 	struct T_capability_ack tca;
820 	struct sockaddr_in6	laddr, faddr;
821 	socklen_t 		laddrlen, faddrlen;
822 	short			opts;
823 	int			error;
824 	mblk_t			*mp, *mpnext;
825 
826 	connp->conn_dev = (dev_t)RD(q)->q_ptr;
827 	connp->conn_minor_arena = WR(q)->q_ptr;
828 
829 	RD(q)->q_ptr = WR(q)->q_ptr = connp;
830 
831 	connp->conn_rq = RD(q);
832 	connp->conn_wq = WR(q);
833 
834 	WR(q)->q_qinfo = &tcp_sock_winit;
835 
836 	if (!issocket)
837 		tcp_use_pure_tpi(tcp);
838 
839 	/*
840 	 * free the helper stream
841 	 */
842 	ip_free_helper_stream(connp);
843 
844 	/*
845 	 * Notify the STREAM head about options
846 	 */
847 	DB_TYPE(stropt_mp) = M_SETOPTS;
848 	stropt = (struct stroptions *)stropt_mp->b_rptr;
849 	stropt_mp->b_wptr += sizeof (struct stroptions);
850 	stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK;
851 
852 	stropt->so_wroff = connp->conn_ht_iphc_len + (tcp->tcp_loopback ? 0 :
853 	    tcp->tcp_tcps->tcps_wroff_xtra);
854 	if (tcp->tcp_snd_sack_ok)
855 		stropt->so_wroff += TCPOPT_MAX_SACK_LEN;
856 	stropt->so_hiwat = connp->conn_rcvbuf;
857 	stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE);
858 
859 	putnext(RD(q), stropt_mp);
860 
861 	/*
862 	 * Collect the information needed to sync with the sonode
863 	 */
864 	tcp_do_capability_ack(tcp, &tca, TC1_INFO|TC1_ACCEPTOR_ID);
865 
866 	laddrlen = faddrlen = sizeof (sin6_t);
867 	(void) tcp_getsockname((sock_lower_handle_t)connp,
868 	    (struct sockaddr *)&laddr, &laddrlen, CRED());
869 	error = tcp_getpeername((sock_lower_handle_t)connp,
870 	    (struct sockaddr *)&faddr, &faddrlen, CRED());
871 	if (error != 0)
872 		faddrlen = 0;
873 
874 	opts = 0;
875 	if (connp->conn_oobinline)
876 		opts |= SO_OOBINLINE;
877 	if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
878 		opts |= SO_DONTROUTE;
879 
880 	/*
881 	 * Notify the socket that the protocol is now quiescent,
882 	 * and it's therefore safe move data from the socket
883 	 * to the stream head.
884 	 */
885 	mp = (*quiesced_cb)(connp->conn_upper_handle, arg, &tca,
886 	    (struct sockaddr *)&laddr, laddrlen,
887 	    (struct sockaddr *)&faddr, faddrlen, opts);
888 
889 	while (mp != NULL) {
890 		mpnext = mp->b_next;
891 		tcp->tcp_rcv_list = mp->b_next;
892 		mp->b_next = NULL;
893 		putnext(q, mp);
894 		mp = mpnext;
895 	}
896 	ASSERT(tcp->tcp_rcv_last_head == NULL);
897 	ASSERT(tcp->tcp_rcv_last_tail == NULL);
898 	ASSERT(tcp->tcp_rcv_cnt == 0);
899 
900 	/*
901 	 * All eagers in q0 are marked as being non-STREAM, so they will
902 	 * make su_newconn upcalls when the handshake completes, which
903 	 * will fail (resulting in the conn being closed). So we just blow
904 	 * off everything in q0 instead of waiting for the inevitable.
905 	 */
906 	if (tcp->tcp_conn_req_cnt_q0 != 0)
907 		tcp_eager_cleanup(tcp, B_TRUE);
908 }
909 
910 /*
911  * An eager is falling back to TPI. All we have to do is send
912  * up a T_CONN_IND.
913  */
914 static void
915 tcp_fallback_eager(tcp_t *eager, boolean_t issocket,
916     so_proto_quiesced_cb_t quiesced_cb, sock_quiesce_arg_t *arg)
917 {
918 	conn_t *connp = eager->tcp_connp;
919 	tcp_t *listener = eager->tcp_listener;
920 	mblk_t *mp;
921 
922 	ASSERT(listener != NULL);
923 
924 	/*
925 	 * Notify the socket that the protocol is now quiescent,
926 	 * and it's therefore safe move data from the socket
927 	 * to tcp's rcv queue.
928 	 */
929 	mp = (*quiesced_cb)(connp->conn_upper_handle, arg, NULL, NULL, 0,
930 	    NULL, 0, 0);
931 
932 	if (mp != NULL) {
933 		ASSERT(eager->tcp_rcv_cnt == 0);
934 
935 		eager->tcp_rcv_list = mp;
936 		eager->tcp_rcv_cnt = msgdsize(mp);
937 		while (mp->b_next != NULL) {
938 			mp = mp->b_next;
939 			eager->tcp_rcv_cnt += msgdsize(mp);
940 		}
941 		eager->tcp_rcv_last_head = mp;
942 		while (mp->b_cont)
943 			mp = mp->b_cont;
944 		eager->tcp_rcv_last_tail = mp;
945 		if (eager->tcp_rcv_cnt > eager->tcp_rwnd)
946 			eager->tcp_rwnd = 0;
947 		else
948 			eager->tcp_rwnd -= eager->tcp_rcv_cnt;
949 	}
950 
951 	if (!issocket)
952 		eager->tcp_issocket = B_FALSE;
953 	/*
954 	 * The stream for this eager does not yet exist, so mark it as
955 	 * being detached.
956 	 */
957 	eager->tcp_detached = B_TRUE;
958 	eager->tcp_hard_binding = B_TRUE;
959 	connp->conn_rq = listener->tcp_connp->conn_rq;
960 	connp->conn_wq = listener->tcp_connp->conn_wq;
961 
962 	/* Send up the connection indication */
963 	mp = eager->tcp_conn.tcp_eager_conn_ind;
964 	ASSERT(mp != NULL);
965 	eager->tcp_conn.tcp_eager_conn_ind = NULL;
966 
967 	/*
968 	 * TLI/XTI applications will get confused by
969 	 * sending eager as an option since it violates
970 	 * the option semantics. So remove the eager as
971 	 * option since TLI/XTI app doesn't need it anyway.
972 	 */
973 	if (!issocket) {
974 		struct T_conn_ind *conn_ind;
975 
976 		conn_ind = (struct T_conn_ind *)mp->b_rptr;
977 		conn_ind->OPT_length = 0;
978 		conn_ind->OPT_offset = 0;
979 	}
980 
981 	/*
982 	 * Sockfs guarantees that the listener will not be closed
983 	 * during fallback. So we can safely use the listener's queue.
984 	 */
985 	putnext(listener->tcp_connp->conn_rq, mp);
986 }
987 
988 
989 int
990 tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
991     boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb,
992     sock_quiesce_arg_t *arg)
993 {
994 	tcp_t			*tcp;
995 	conn_t 			*connp = (conn_t *)proto_handle;
996 	int			error;
997 	mblk_t			*stropt_mp;
998 	mblk_t			*ordrel_mp;
999 
1000 	tcp = connp->conn_tcp;
1001 
1002 	stropt_mp = allocb_wait(sizeof (struct stroptions), BPRI_HI, STR_NOSIG,
1003 	    NULL);
1004 
1005 	/* Pre-allocate the T_ordrel_ind mblk. */
1006 	ASSERT(tcp->tcp_ordrel_mp == NULL);
1007 	ordrel_mp = allocb_wait(sizeof (struct T_ordrel_ind), BPRI_HI,
1008 	    STR_NOSIG, NULL);
1009 	ordrel_mp->b_datap->db_type = M_PROTO;
1010 	((struct T_ordrel_ind *)ordrel_mp->b_rptr)->PRIM_type = T_ORDREL_IND;
1011 	ordrel_mp->b_wptr += sizeof (struct T_ordrel_ind);
1012 
1013 	/*
1014 	 * Enter the squeue so that no new packets can come in
1015 	 */
1016 	error = squeue_synch_enter(connp, NULL);
1017 	if (error != 0) {
1018 		/* failed to enter, free all the pre-allocated messages. */
1019 		freeb(stropt_mp);
1020 		freeb(ordrel_mp);
1021 		return (ENOMEM);
1022 	}
1023 
1024 	/*
1025 	 * Both endpoints must be of the same type (either STREAMS or
1026 	 * non-STREAMS) for fusion to be enabled. So if we are fused,
1027 	 * we have to unfuse.
1028 	 */
1029 	if (tcp->tcp_fused)
1030 		tcp_unfuse(tcp);
1031 
1032 	if (tcp->tcp_listener != NULL) {
1033 		/* The eager will deal with opts when accept() is called */
1034 		freeb(stropt_mp);
1035 		tcp_fallback_eager(tcp, direct_sockfs, quiesced_cb, arg);
1036 	} else {
1037 		tcp_fallback_noneager(tcp, stropt_mp, q, direct_sockfs,
1038 		    quiesced_cb, arg);
1039 	}
1040 
1041 	/*
1042 	 * No longer a direct socket
1043 	 *
1044 	 * Note that we intentionally leave the upper_handle and upcalls
1045 	 * intact, since eagers may still be using them.
1046 	 */
1047 	connp->conn_flags &= ~IPCL_NONSTR;
1048 	tcp->tcp_ordrel_mp = ordrel_mp;
1049 
1050 	/*
1051 	 * There should be atleast two ref's (IP + TCP)
1052 	 */
1053 	ASSERT(connp->conn_ref >= 2);
1054 	squeue_synch_exit(connp);
1055 
1056 	return (0);
1057 }
1058 
1059 /*
1060  * Notifies a non-STREAMS based listener about a new connection. This
1061  * function is executed on the *eager*'s squeue once the 3 way handshake
1062  * has completed. Note that the behavior differs from STREAMS, where the
1063  * T_CONN_IND is sent up by tcp_send_conn_ind() while on the *listener*'s
1064  * squeue.
1065  *
1066  * Returns B_TRUE if the notification succeeded and an upper handle was
1067  * obtained. `tcp' should be closed on failure.
1068  */
1069 boolean_t
1070 tcp_newconn_notify(tcp_t *tcp, ip_recv_attr_t *ira)
1071 {
1072 	tcp_t *listener = tcp->tcp_listener;
1073 	conn_t *lconnp = listener->tcp_connp;
1074 	conn_t *econnp = tcp->tcp_connp;
1075 	tcp_t *tail;
1076 	ipaddr_t *addr_cache;
1077 	sock_upper_handle_t upper;
1078 	struct sock_proto_props sopp;
1079 
1080 	mutex_enter(&listener->tcp_eager_lock);
1081 	/*
1082 	 * Take the eager out, if it is in the list of droppable eagers
1083 	 * as we are here because the 3W handshake is over.
1084 	 */
1085 	MAKE_UNDROPPABLE(tcp);
1086 	/*
1087 	 * The eager already has an extra ref put in tcp_input_data
1088 	 * so that it stays till accept comes back even though it
1089 	 * might get into TCPS_CLOSED as a result of a TH_RST etc.
1090 	 */
1091 	ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
1092 	listener->tcp_conn_req_cnt_q0--;
1093 	listener->tcp_conn_req_cnt_q++;
1094 
1095 	/* Move from SYN_RCVD to ESTABLISHED list  */
1096 	tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = tcp->tcp_eager_prev_q0;
1097 	tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp->tcp_eager_next_q0;
1098 	tcp->tcp_eager_prev_q0 = NULL;
1099 	tcp->tcp_eager_next_q0 = NULL;
1100 
1101 	/*
1102 	 * Insert at end of the queue because connections are accepted
1103 	 * in chronological order. Leaving the older connections at front
1104 	 * of the queue helps reducing search time.
1105 	 */
1106 	tail = listener->tcp_eager_last_q;
1107 	if (tail != NULL)
1108 		tail->tcp_eager_next_q = tcp;
1109 	else
1110 		listener->tcp_eager_next_q = tcp;
1111 	listener->tcp_eager_last_q = tcp;
1112 	tcp->tcp_eager_next_q = NULL;
1113 
1114 	/* we have timed out before */
1115 	if (tcp->tcp_syn_rcvd_timeout != 0) {
1116 		tcp->tcp_syn_rcvd_timeout = 0;
1117 		listener->tcp_syn_rcvd_timeout--;
1118 		if (listener->tcp_syn_defense &&
1119 		    listener->tcp_syn_rcvd_timeout <=
1120 		    (listener->tcp_tcps->tcps_conn_req_max_q0 >> 5) &&
1121 		    10*MINUTES < TICK_TO_MSEC(ddi_get_lbolt64() -
1122 		    listener->tcp_last_rcv_lbolt)) {
1123 			/*
1124 			 * Turn off the defense mode if we
1125 			 * believe the SYN attack is over.
1126 			 */
1127 			listener->tcp_syn_defense = B_FALSE;
1128 			if (listener->tcp_ip_addr_cache) {
1129 				kmem_free((void *)listener->tcp_ip_addr_cache,
1130 				    IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
1131 				listener->tcp_ip_addr_cache = NULL;
1132 			}
1133 		}
1134 	}
1135 	addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache);
1136 	if (addr_cache != NULL) {
1137 		/*
1138 		 * We have finished a 3-way handshake with this
1139 		 * remote host. This proves the IP addr is good.
1140 		 * Cache it!
1141 		 */
1142 		addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] =
1143 		    tcp->tcp_connp->conn_faddr_v4;
1144 	}
1145 	mutex_exit(&listener->tcp_eager_lock);
1146 
1147 	/*
1148 	 * Notify the ULP about the newconn. It is guaranteed that no
1149 	 * tcp_accept() call will be made for the eager if the
1150 	 * notification fails.
1151 	 */
1152 	if ((upper = (*lconnp->conn_upcalls->su_newconn)
1153 	    (lconnp->conn_upper_handle, (sock_lower_handle_t)econnp,
1154 	    &sock_tcp_downcalls, ira->ira_cred, ira->ira_cpid,
1155 	    &econnp->conn_upcalls)) == NULL) {
1156 		return (B_FALSE);
1157 	}
1158 	econnp->conn_upper_handle = upper;
1159 
1160 	tcp->tcp_detached = B_FALSE;
1161 	tcp->tcp_hard_binding = B_FALSE;
1162 	tcp->tcp_tconnind_started = B_TRUE;
1163 
1164 	if (econnp->conn_keepalive) {
1165 		tcp->tcp_ka_last_intrvl = 0;
1166 		tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer,
1167 		    tcp->tcp_ka_interval);
1168 	}
1169 
1170 	/* Update the necessary parameters */
1171 	tcp_get_proto_props(tcp, &sopp);
1172 
1173 	(*econnp->conn_upcalls->su_set_proto_props)
1174 	    (econnp->conn_upper_handle, &sopp);
1175 
1176 	return (B_TRUE);
1177 }
1178