xref: /titanic_41/usr/src/uts/common/inet/tcp/tcp_socket.c (revision e4f5a11d4a234623168c1558fcdf4341e11769e1)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 /* This file contains all TCP kernel socket related functions. */
27 
28 #include <sys/types.h>
29 #include <sys/strlog.h>
30 #include <sys/policy.h>
31 #include <sys/sockio.h>
32 #include <sys/strsubr.h>
33 #include <sys/strsun.h>
34 #include <sys/squeue_impl.h>
35 #include <sys/squeue.h>
36 #define	_SUN_TPI_VERSION 2
37 #include <sys/tihdr.h>
38 #include <sys/timod.h>
39 #include <sys/tpicommon.h>
40 #include <sys/socketvar.h>
41 
42 #include <inet/common.h>
43 #include <inet/proto_set.h>
44 #include <inet/ip.h>
45 #include <inet/tcp.h>
46 #include <inet/tcp_impl.h>
47 
48 static void	tcp_activate(sock_lower_handle_t, sock_upper_handle_t,
49 		    sock_upcalls_t *, int, cred_t *);
50 static int	tcp_accept(sock_lower_handle_t, sock_lower_handle_t,
51 		    sock_upper_handle_t, cred_t *);
52 static int	tcp_bind(sock_lower_handle_t, struct sockaddr *,
53 		    socklen_t, cred_t *);
54 static int	tcp_listen(sock_lower_handle_t, int, cred_t *);
55 static int	tcp_connect(sock_lower_handle_t, const struct sockaddr *,
56 		    socklen_t, sock_connid_t *, cred_t *);
57 static int	tcp_getsockopt(sock_lower_handle_t, int, int, void *,
58 		    socklen_t *, cred_t *);
59 static int	tcp_setsockopt(sock_lower_handle_t, int, int, const void *,
60 		    socklen_t, cred_t *);
61 static int	tcp_sendmsg(sock_lower_handle_t, mblk_t *, struct nmsghdr *,
62 		    cred_t *cr);
63 static int	tcp_shutdown(sock_lower_handle_t, int, cred_t *);
64 static void	tcp_clr_flowctrl(sock_lower_handle_t);
65 static int	tcp_ioctl(sock_lower_handle_t, int, intptr_t, int, int32_t *,
66 		    cred_t *);
67 static int	tcp_close(sock_lower_handle_t, int, cred_t *);
68 
69 sock_downcalls_t sock_tcp_downcalls = {
70 	tcp_activate,
71 	tcp_accept,
72 	tcp_bind,
73 	tcp_listen,
74 	tcp_connect,
75 	tcp_getpeername,
76 	tcp_getsockname,
77 	tcp_getsockopt,
78 	tcp_setsockopt,
79 	tcp_sendmsg,
80 	NULL,
81 	NULL,
82 	NULL,
83 	tcp_shutdown,
84 	tcp_clr_flowctrl,
85 	tcp_ioctl,
86 	tcp_close,
87 };
88 
89 /* ARGSUSED */
90 static void
91 tcp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle,
92     sock_upcalls_t *sock_upcalls, int flags, cred_t *cr)
93 {
94 	conn_t *connp = (conn_t *)proto_handle;
95 	struct sock_proto_props sopp;
96 	extern struct module_info tcp_rinfo;
97 
98 	ASSERT(connp->conn_upper_handle == NULL);
99 
100 	/* All Solaris components should pass a cred for this operation. */
101 	ASSERT(cr != NULL);
102 
103 	sopp.sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT |
104 	    SOCKOPT_MAXPSZ | SOCKOPT_MAXBLK | SOCKOPT_RCVTIMER |
105 	    SOCKOPT_RCVTHRESH | SOCKOPT_MAXADDRLEN | SOCKOPT_MINPSZ;
106 
107 	sopp.sopp_rxhiwat = SOCKET_RECVHIWATER;
108 	sopp.sopp_rxlowat = SOCKET_RECVLOWATER;
109 	sopp.sopp_maxpsz = INFPSZ;
110 	sopp.sopp_maxblk = INFPSZ;
111 	sopp.sopp_rcvtimer = SOCKET_TIMER_INTERVAL;
112 	sopp.sopp_rcvthresh = SOCKET_RECVHIWATER >> 3;
113 	sopp.sopp_maxaddrlen = sizeof (sin6_t);
114 	sopp.sopp_minpsz = (tcp_rinfo.mi_minpsz == 1) ? 0 :
115 	    tcp_rinfo.mi_minpsz;
116 
117 	connp->conn_upcalls = sock_upcalls;
118 	connp->conn_upper_handle = sock_handle;
119 
120 	ASSERT(connp->conn_rcvbuf != 0 &&
121 	    connp->conn_rcvbuf == connp->conn_tcp->tcp_rwnd);
122 	(*sock_upcalls->su_set_proto_props)(sock_handle, &sopp);
123 }
124 
125 /*ARGSUSED*/
126 static int
127 tcp_accept(sock_lower_handle_t lproto_handle,
128     sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle,
129     cred_t *cr)
130 {
131 	conn_t *lconnp, *econnp;
132 	tcp_t *listener, *eager;
133 
134 	/*
135 	 * KSSL can move a socket from one listener to another, in which
136 	 * case `lproto_handle' points to the new listener. To ensure that
137 	 * the original listener is used the information is obtained from
138 	 * the eager.
139 	 */
140 	econnp = (conn_t *)eproto_handle;
141 	eager = econnp->conn_tcp;
142 	ASSERT(IPCL_IS_NONSTR(econnp));
143 	ASSERT(eager->tcp_listener != NULL);
144 	listener = eager->tcp_listener;
145 	lconnp = (conn_t *)listener->tcp_connp;
146 	ASSERT(listener->tcp_state == TCPS_LISTEN);
147 	ASSERT(lconnp->conn_upper_handle != NULL);
148 
149 	/*
150 	 * It is possible for the accept thread to race with the thread that
151 	 * made the su_newconn upcall in tcp_newconn_notify. Both
152 	 * tcp_newconn_notify and tcp_accept require that conn_upper_handle
153 	 * and conn_upcalls be set before returning, so they both write to
154 	 * them. However, we're guaranteed that the value written is the same
155 	 * for both threads.
156 	 */
157 	ASSERT(econnp->conn_upper_handle == NULL ||
158 	    econnp->conn_upper_handle == sock_handle);
159 	ASSERT(econnp->conn_upcalls == NULL ||
160 	    econnp->conn_upcalls == lconnp->conn_upcalls);
161 	econnp->conn_upper_handle = sock_handle;
162 	econnp->conn_upcalls = lconnp->conn_upcalls;
163 
164 	ASSERT(econnp->conn_netstack ==
165 	    listener->tcp_connp->conn_netstack);
166 	ASSERT(eager->tcp_tcps == listener->tcp_tcps);
167 
168 	/*
169 	 * We should have a minimum of 2 references on the conn at this
170 	 * point. One for TCP and one for the newconn notification
171 	 * (which is now taken over by IP). In the normal case we would
172 	 * also have another reference (making a total of 3) for the conn
173 	 * being in the classifier hash list. However the eager could have
174 	 * received an RST subsequently and tcp_closei_local could have
175 	 * removed the eager from the classifier hash list, hence we can't
176 	 * assert that reference.
177 	 */
178 	ASSERT(econnp->conn_ref >= 2);
179 
180 	/*
181 	 * An error is returned if this conn has been reset, which will
182 	 * cause the socket to be closed immediately. The eager will be
183 	 * unlinked from the listener during close.
184 	 */
185 	if (eager->tcp_state < TCPS_ESTABLISHED)
186 		return (ECONNABORTED);
187 
188 	mutex_enter(&listener->tcp_eager_lock);
189 	/*
190 	 * Non-STREAMS listeners never defer the notification of new
191 	 * connections.
192 	 */
193 	ASSERT(!listener->tcp_eager_prev_q0->tcp_conn_def_q0);
194 	tcp_eager_unlink(eager);
195 	mutex_exit(&listener->tcp_eager_lock);
196 	CONN_DEC_REF(listener->tcp_connp);
197 
198 	return (0);
199 }
200 
201 static int
202 tcp_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
203     socklen_t len, cred_t *cr)
204 {
205 	int 		error;
206 	conn_t		*connp = (conn_t *)proto_handle;
207 
208 	/* All Solaris components should pass a cred for this operation. */
209 	ASSERT(cr != NULL);
210 	ASSERT(connp->conn_upper_handle != NULL);
211 
212 	error = squeue_synch_enter(connp, NULL);
213 	if (error != 0) {
214 		/* failed to enter */
215 		return (ENOSR);
216 	}
217 
218 	/* binding to a NULL address really means unbind */
219 	if (sa == NULL) {
220 		if (connp->conn_tcp->tcp_state < TCPS_LISTEN)
221 			error = tcp_do_unbind(connp);
222 		else
223 			error = EINVAL;
224 	} else {
225 		error = tcp_do_bind(connp, sa, len, cr, B_TRUE);
226 	}
227 
228 	squeue_synch_exit(connp);
229 
230 	if (error < 0) {
231 		if (error == -TOUTSTATE)
232 			error = EINVAL;
233 		else
234 			error = proto_tlitosyserr(-error);
235 	}
236 
237 	return (error);
238 }
239 
240 /* ARGSUSED */
241 static int
242 tcp_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr)
243 {
244 	conn_t	*connp = (conn_t *)proto_handle;
245 	tcp_t	*tcp = connp->conn_tcp;
246 	int 	error;
247 
248 	ASSERT(connp->conn_upper_handle != NULL);
249 
250 	/* All Solaris components should pass a cred for this operation. */
251 	ASSERT(cr != NULL);
252 
253 	error = squeue_synch_enter(connp, NULL);
254 	if (error != 0) {
255 		/* failed to enter */
256 		return (ENOBUFS);
257 	}
258 
259 	error = tcp_do_listen(connp, NULL, 0, backlog, cr, B_FALSE);
260 	if (error == 0) {
261 		/*
262 		 * sockfs needs to know what's the maximum number of socket
263 		 * that can be queued on the listener.
264 		 */
265 		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
266 		    SOCK_OPCTL_ENAB_ACCEPT,
267 		    (uintptr_t)(tcp->tcp_conn_req_max +
268 		    tcp->tcp_tcps->tcps_conn_req_max_q0));
269 	} else if (error < 0) {
270 		if (error == -TOUTSTATE)
271 			error = EINVAL;
272 		else
273 			error = proto_tlitosyserr(-error);
274 	}
275 	squeue_synch_exit(connp);
276 	return (error);
277 }
278 
279 static int
280 tcp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
281     socklen_t len, sock_connid_t *id, cred_t *cr)
282 {
283 	conn_t		*connp = (conn_t *)proto_handle;
284 	int		error;
285 
286 	ASSERT(connp->conn_upper_handle != NULL);
287 
288 	/* All Solaris components should pass a cred for this operation. */
289 	ASSERT(cr != NULL);
290 
291 	error = proto_verify_ip_addr(connp->conn_family, sa, len);
292 	if (error != 0) {
293 		return (error);
294 	}
295 
296 	error = squeue_synch_enter(connp, NULL);
297 	if (error != 0) {
298 		/* failed to enter */
299 		return (ENOSR);
300 	}
301 
302 	/*
303 	 * TCP supports quick connect, so no need to do an implicit bind
304 	 */
305 	error = tcp_do_connect(connp, sa, len, cr, curproc->p_pid);
306 	if (error == 0) {
307 		*id = connp->conn_tcp->tcp_connid;
308 	} else if (error < 0) {
309 		if (error == -TOUTSTATE) {
310 			switch (connp->conn_tcp->tcp_state) {
311 			case TCPS_SYN_SENT:
312 				error = EALREADY;
313 				break;
314 			case TCPS_ESTABLISHED:
315 				error = EISCONN;
316 				break;
317 			case TCPS_LISTEN:
318 				error = EOPNOTSUPP;
319 				break;
320 			default:
321 				error = EINVAL;
322 				break;
323 			}
324 		} else {
325 			error = proto_tlitosyserr(-error);
326 		}
327 	}
328 
329 	if (connp->conn_tcp->tcp_loopback) {
330 		struct sock_proto_props sopp;
331 
332 		sopp.sopp_flags = SOCKOPT_LOOPBACK;
333 		sopp.sopp_loopback = B_TRUE;
334 
335 		(*connp->conn_upcalls->su_set_proto_props)(
336 		    connp->conn_upper_handle, &sopp);
337 	}
338 done:
339 	squeue_synch_exit(connp);
340 
341 	return ((error == 0) ? EINPROGRESS : error);
342 }
343 
344 /* ARGSUSED3 */
345 int
346 tcp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr,
347     socklen_t *addrlenp, cred_t *cr)
348 {
349 	conn_t	*connp = (conn_t *)proto_handle;
350 	tcp_t	*tcp = connp->conn_tcp;
351 
352 	/* All Solaris components should pass a cred for this operation. */
353 	ASSERT(cr != NULL);
354 
355 	ASSERT(tcp != NULL);
356 	if (tcp->tcp_state < TCPS_SYN_RCVD)
357 		return (ENOTCONN);
358 
359 	return (conn_getpeername(connp, addr, addrlenp));
360 }
361 
362 /* ARGSUSED3 */
363 int
364 tcp_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *addr,
365     socklen_t *addrlenp, cred_t *cr)
366 {
367 	conn_t	*connp = (conn_t *)proto_handle;
368 
369 	/* All Solaris components should pass a cred for this operation. */
370 	ASSERT(cr != NULL);
371 
372 	return (conn_getsockname(connp, addr, addrlenp));
373 }
374 
375 /* returns UNIX error, the optlen is a value-result arg */
376 static int
377 tcp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
378     void *optvalp, socklen_t *optlen, cred_t *cr)
379 {
380 	conn_t		*connp = (conn_t *)proto_handle;
381 	int		error;
382 	t_uscalar_t	max_optbuf_len;
383 	void		*optvalp_buf;
384 	int		len;
385 
386 	ASSERT(connp->conn_upper_handle != NULL);
387 
388 	error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len,
389 	    tcp_opt_obj.odb_opt_des_arr,
390 	    tcp_opt_obj.odb_opt_arr_cnt,
391 	    B_FALSE, B_TRUE, cr);
392 	if (error != 0) {
393 		if (error < 0) {
394 			error = proto_tlitosyserr(-error);
395 		}
396 		return (error);
397 	}
398 
399 	optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP);
400 
401 	error = squeue_synch_enter(connp, NULL);
402 	if (error == ENOMEM) {
403 		kmem_free(optvalp_buf, max_optbuf_len);
404 		return (ENOMEM);
405 	}
406 
407 	len = tcp_opt_get(connp, level, option_name, optvalp_buf);
408 	squeue_synch_exit(connp);
409 
410 	if (len == -1) {
411 		kmem_free(optvalp_buf, max_optbuf_len);
412 		return (EINVAL);
413 	}
414 
415 	/*
416 	 * update optlen and copy option value
417 	 */
418 	t_uscalar_t size = MIN(len, *optlen);
419 
420 	bcopy(optvalp_buf, optvalp, size);
421 	bcopy(&size, optlen, sizeof (size));
422 
423 	kmem_free(optvalp_buf, max_optbuf_len);
424 	return (0);
425 }
426 
427 static int
428 tcp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
429     const void *optvalp, socklen_t optlen, cred_t *cr)
430 {
431 	conn_t		*connp = (conn_t *)proto_handle;
432 	int		error;
433 
434 	ASSERT(connp->conn_upper_handle != NULL);
435 	/*
436 	 * Entering the squeue synchronously can result in a context switch,
437 	 * which can cause a rather sever performance degradation. So we try to
438 	 * handle whatever options we can without entering the squeue.
439 	 */
440 	if (level == IPPROTO_TCP) {
441 		switch (option_name) {
442 		case TCP_NODELAY:
443 			if (optlen != sizeof (int32_t))
444 				return (EINVAL);
445 			mutex_enter(&connp->conn_tcp->tcp_non_sq_lock);
446 			connp->conn_tcp->tcp_naglim = *(int *)optvalp ? 1 :
447 			    connp->conn_tcp->tcp_mss;
448 			mutex_exit(&connp->conn_tcp->tcp_non_sq_lock);
449 			return (0);
450 		default:
451 			break;
452 		}
453 	}
454 
455 	error = squeue_synch_enter(connp, NULL);
456 	if (error == ENOMEM) {
457 		return (ENOMEM);
458 	}
459 
460 	error = proto_opt_check(level, option_name, optlen, NULL,
461 	    tcp_opt_obj.odb_opt_des_arr,
462 	    tcp_opt_obj.odb_opt_arr_cnt,
463 	    B_TRUE, B_FALSE, cr);
464 
465 	if (error != 0) {
466 		if (error < 0) {
467 			error = proto_tlitosyserr(-error);
468 		}
469 		squeue_synch_exit(connp);
470 		return (error);
471 	}
472 
473 	error = tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name,
474 	    optlen, (uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp,
475 	    NULL, cr);
476 	squeue_synch_exit(connp);
477 
478 	ASSERT(error >= 0);
479 
480 	return (error);
481 }
482 
483 /* ARGSUSED */
484 static int
485 tcp_sendmsg(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
486     cred_t *cr)
487 {
488 	tcp_t		*tcp;
489 	uint32_t	msize;
490 	conn_t *connp = (conn_t *)proto_handle;
491 	int32_t		tcpstate;
492 
493 	/* All Solaris components should pass a cred for this operation. */
494 	ASSERT(cr != NULL);
495 
496 	ASSERT(connp->conn_ref >= 2);
497 	ASSERT(connp->conn_upper_handle != NULL);
498 
499 	if (msg->msg_controllen != 0) {
500 		freemsg(mp);
501 		return (EOPNOTSUPP);
502 	}
503 
504 	switch (DB_TYPE(mp)) {
505 	case M_DATA:
506 		tcp = connp->conn_tcp;
507 		ASSERT(tcp != NULL);
508 
509 		tcpstate = tcp->tcp_state;
510 		if (tcpstate < TCPS_ESTABLISHED) {
511 			freemsg(mp);
512 			/*
513 			 * We return ENOTCONN if the endpoint is trying to
514 			 * connect or has never been connected, and EPIPE if it
515 			 * has been disconnected. The connection id helps us
516 			 * distinguish between the last two cases.
517 			 */
518 			return ((tcpstate == TCPS_SYN_SENT) ? ENOTCONN :
519 			    ((tcp->tcp_connid > 0) ? EPIPE : ENOTCONN));
520 		} else if (tcpstate > TCPS_CLOSE_WAIT) {
521 			freemsg(mp);
522 			return (EPIPE);
523 		}
524 
525 		msize = msgdsize(mp);
526 
527 		mutex_enter(&tcp->tcp_non_sq_lock);
528 		tcp->tcp_squeue_bytes += msize;
529 		/*
530 		 * Squeue Flow Control
531 		 */
532 		if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) {
533 			tcp_setqfull(tcp);
534 		}
535 		mutex_exit(&tcp->tcp_non_sq_lock);
536 
537 		/*
538 		 * The application may pass in an address in the msghdr, but
539 		 * we ignore the address on connection-oriented sockets.
540 		 * Just like BSD this code does not generate an error for
541 		 * TCP (a CONNREQUIRED socket) when sending to an address
542 		 * passed in with sendto/sendmsg. Instead the data is
543 		 * delivered on the connection as if no address had been
544 		 * supplied.
545 		 */
546 		CONN_INC_REF(connp);
547 
548 		if (msg->msg_flags & MSG_OOB) {
549 			SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output_urgent,
550 			    connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
551 		} else {
552 			SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output,
553 			    connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
554 		}
555 
556 		return (0);
557 
558 	default:
559 		ASSERT(0);
560 	}
561 
562 	freemsg(mp);
563 	return (0);
564 }
565 
566 /* ARGSUSED */
567 static int
568 tcp_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
569 {
570 	conn_t  *connp = (conn_t *)proto_handle;
571 	tcp_t   *tcp = connp->conn_tcp;
572 
573 	ASSERT(connp->conn_upper_handle != NULL);
574 
575 	/* All Solaris components should pass a cred for this operation. */
576 	ASSERT(cr != NULL);
577 
578 	/*
579 	 * X/Open requires that we check the connected state.
580 	 */
581 	if (tcp->tcp_state < TCPS_SYN_SENT)
582 		return (ENOTCONN);
583 
584 	/* shutdown the send side */
585 	if (how != SHUT_RD) {
586 		mblk_t *bp;
587 
588 		bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL);
589 		CONN_INC_REF(connp);
590 		SQUEUE_ENTER_ONE(connp->conn_sqp, bp, tcp_shutdown_output,
591 		    connp, NULL, SQ_NODRAIN, SQTAG_TCP_SHUTDOWN_OUTPUT);
592 
593 		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
594 		    SOCK_OPCTL_SHUT_SEND, 0);
595 	}
596 
597 	/* shutdown the recv side */
598 	if (how != SHUT_WR)
599 		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
600 		    SOCK_OPCTL_SHUT_RECV, 0);
601 
602 	return (0);
603 }
604 
605 static void
606 tcp_clr_flowctrl(sock_lower_handle_t proto_handle)
607 {
608 	conn_t  *connp = (conn_t *)proto_handle;
609 	tcp_t	*tcp = connp->conn_tcp;
610 	mblk_t *mp;
611 	int error;
612 
613 	ASSERT(connp->conn_upper_handle != NULL);
614 
615 	/*
616 	 * If tcp->tcp_rsrv_mp == NULL, it means that tcp_clr_flowctrl()
617 	 * is currently running.
618 	 */
619 	mutex_enter(&tcp->tcp_rsrv_mp_lock);
620 	if ((mp = tcp->tcp_rsrv_mp) == NULL) {
621 		mutex_exit(&tcp->tcp_rsrv_mp_lock);
622 		return;
623 	}
624 	tcp->tcp_rsrv_mp = NULL;
625 	mutex_exit(&tcp->tcp_rsrv_mp_lock);
626 
627 	error = squeue_synch_enter(connp, mp);
628 	ASSERT(error == 0);
629 
630 	mutex_enter(&tcp->tcp_rsrv_mp_lock);
631 	tcp->tcp_rsrv_mp = mp;
632 	mutex_exit(&tcp->tcp_rsrv_mp_lock);
633 
634 	if (tcp->tcp_fused) {
635 		tcp_fuse_backenable(tcp);
636 	} else {
637 		tcp->tcp_rwnd = connp->conn_rcvbuf;
638 		/*
639 		 * Send back a window update immediately if TCP is above
640 		 * ESTABLISHED state and the increase of the rcv window
641 		 * that the other side knows is at least 1 MSS after flow
642 		 * control is lifted.
643 		 */
644 		if (tcp->tcp_state >= TCPS_ESTABLISHED &&
645 		    tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) {
646 			tcp_xmit_ctl(NULL, tcp,
647 			    (tcp->tcp_swnd == 0) ? tcp->tcp_suna :
648 			    tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK);
649 		}
650 	}
651 
652 	squeue_synch_exit(connp);
653 }
654 
655 /* ARGSUSED */
656 static int
657 tcp_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
658     int mode, int32_t *rvalp, cred_t *cr)
659 {
660 	conn_t  	*connp = (conn_t *)proto_handle;
661 	int		error;
662 
663 	ASSERT(connp->conn_upper_handle != NULL);
664 
665 	/* All Solaris components should pass a cred for this operation. */
666 	ASSERT(cr != NULL);
667 
668 	/*
669 	 * If we don't have a helper stream then create one.
670 	 * ip_create_helper_stream takes care of locking the conn_t,
671 	 * so this check for NULL is just a performance optimization.
672 	 */
673 	if (connp->conn_helper_info == NULL) {
674 		tcp_stack_t *tcps = connp->conn_tcp->tcp_tcps;
675 
676 		/*
677 		 * Create a helper stream for non-STREAMS socket.
678 		 */
679 		error = ip_create_helper_stream(connp, tcps->tcps_ldi_ident);
680 		if (error != 0) {
681 			ip0dbg(("tcp_ioctl: create of IP helper stream "
682 			    "failed %d\n", error));
683 			return (error);
684 		}
685 	}
686 
687 	switch (cmd) {
688 		case ND_SET:
689 		case ND_GET:
690 		case _SIOCSOCKFALLBACK:
691 		case TCP_IOC_ABORT_CONN:
692 		case TI_GETPEERNAME:
693 		case TI_GETMYNAME:
694 			ip1dbg(("tcp_ioctl: cmd 0x%x on non streams socket",
695 			    cmd));
696 			error = EINVAL;
697 			break;
698 		default:
699 			/*
700 			 * If the conn is not closing, pass on to IP using
701 			 * helper stream. Bump the ioctlref to prevent tcp_close
702 			 * from closing the rq/wq out from underneath the ioctl
703 			 * if it ends up queued or aborted/interrupted.
704 			 */
705 			mutex_enter(&connp->conn_lock);
706 			if (connp->conn_state_flags & (CONN_CLOSING)) {
707 				mutex_exit(&connp->conn_lock);
708 				error = EINVAL;
709 				break;
710 			}
711 			CONN_INC_IOCTLREF_LOCKED(connp);
712 			error = ldi_ioctl(connp->conn_helper_info->iphs_handle,
713 			    cmd, arg, mode, cr, rvalp);
714 			CONN_DEC_IOCTLREF(connp);
715 			break;
716 	}
717 	return (error);
718 }
719 
720 /* ARGSUSED */
721 static int
722 tcp_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
723 {
724 	conn_t *connp = (conn_t *)proto_handle;
725 
726 	ASSERT(connp->conn_upper_handle != NULL);
727 
728 	/* All Solaris components should pass a cred for this operation. */
729 	ASSERT(cr != NULL);
730 
731 	tcp_close_common(connp, flags);
732 
733 	ip_free_helper_stream(connp);
734 
735 	/*
736 	 * Drop IP's reference on the conn. This is the last reference
737 	 * on the connp if the state was less than established. If the
738 	 * connection has gone into timewait state, then we will have
739 	 * one ref for the TCP and one more ref (total of two) for the
740 	 * classifier connected hash list (a timewait connections stays
741 	 * in connected hash till closed).
742 	 *
743 	 * We can't assert the references because there might be other
744 	 * transient reference places because of some walkers or queued
745 	 * packets in squeue for the timewait state.
746 	 */
747 	CONN_DEC_REF(connp);
748 
749 	/*
750 	 * EINPROGRESS tells sockfs to wait for a 'closed' upcall before
751 	 * freeing the socket.
752 	 */
753 	return (EINPROGRESS);
754 }
755 
756 /* ARGSUSED */
757 sock_lower_handle_t
758 tcp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
759     uint_t *smodep, int *errorp, int flags, cred_t *credp)
760 {
761 	conn_t		*connp;
762 	boolean_t	isv6 = family == AF_INET6;
763 	if (type != SOCK_STREAM || (family != AF_INET && family != AF_INET6) ||
764 	    (proto != 0 && proto != IPPROTO_TCP)) {
765 		*errorp = EPROTONOSUPPORT;
766 		return (NULL);
767 	}
768 
769 	connp = tcp_create_common(credp, isv6, B_TRUE, errorp);
770 	if (connp == NULL) {
771 		return (NULL);
772 	}
773 
774 	/*
775 	 * Put the ref for TCP. Ref for IP was already put
776 	 * by ipcl_conn_create. Also Make the conn_t globally
777 	 * visible to walkers
778 	 */
779 	mutex_enter(&connp->conn_lock);
780 	CONN_INC_REF_LOCKED(connp);
781 	ASSERT(connp->conn_ref == 2);
782 	connp->conn_state_flags &= ~CONN_INCIPIENT;
783 
784 	connp->conn_flags |= IPCL_NONSTR;
785 	mutex_exit(&connp->conn_lock);
786 
787 	ASSERT(errorp != NULL);
788 	*errorp = 0;
789 	*sock_downcalls = &sock_tcp_downcalls;
790 	*smodep = SM_CONNREQUIRED | SM_EXDATA | SM_ACCEPTSUPP |
791 	    SM_SENDFILESUPP;
792 
793 	return ((sock_lower_handle_t)connp);
794 }
795 
796 /*
797  * tcp_fallback
798  *
799  * A direct socket is falling back to using STREAMS. The queue
800  * that is being passed down was created using tcp_open() with
801  * the SO_FALLBACK flag set. As a result, the queue is not
802  * associated with a conn, and the q_ptrs instead contain the
803  * dev and minor area that should be used.
804  *
805  * The 'issocket' flag indicates whether the FireEngine
806  * optimizations should be used. The common case would be that
807  * optimizations are enabled, and they might be subsequently
808  * disabled using the _SIOCSOCKFALLBACK ioctl.
809  */
810 
811 /*
812  * An active connection is falling back to TPI. Gather all the information
813  * required by the STREAM head and TPI sonode and send it up.
814  */
815 static void
816 tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q,
817     boolean_t issocket, so_proto_quiesced_cb_t quiesced_cb,
818     sock_quiesce_arg_t *arg)
819 {
820 	conn_t			*connp = tcp->tcp_connp;
821 	struct stroptions	*stropt;
822 	struct T_capability_ack tca;
823 	struct sockaddr_in6	laddr, faddr;
824 	socklen_t 		laddrlen, faddrlen;
825 	short			opts;
826 	int			error;
827 	mblk_t			*mp, *mpnext;
828 
829 	connp->conn_dev = (dev_t)RD(q)->q_ptr;
830 	connp->conn_minor_arena = WR(q)->q_ptr;
831 
832 	RD(q)->q_ptr = WR(q)->q_ptr = connp;
833 
834 	connp->conn_rq = RD(q);
835 	connp->conn_wq = WR(q);
836 
837 	WR(q)->q_qinfo = &tcp_sock_winit;
838 
839 	if (!issocket)
840 		tcp_use_pure_tpi(tcp);
841 
842 	/*
843 	 * free the helper stream
844 	 */
845 	ip_free_helper_stream(connp);
846 
847 	/*
848 	 * Notify the STREAM head about options
849 	 */
850 	DB_TYPE(stropt_mp) = M_SETOPTS;
851 	stropt = (struct stroptions *)stropt_mp->b_rptr;
852 	stropt_mp->b_wptr += sizeof (struct stroptions);
853 	stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK;
854 
855 	stropt->so_wroff = connp->conn_ht_iphc_len + (tcp->tcp_loopback ? 0 :
856 	    tcp->tcp_tcps->tcps_wroff_xtra);
857 	if (tcp->tcp_snd_sack_ok)
858 		stropt->so_wroff += TCPOPT_MAX_SACK_LEN;
859 	stropt->so_hiwat = connp->conn_rcvbuf;
860 	stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE);
861 
862 	putnext(RD(q), stropt_mp);
863 
864 	/*
865 	 * Collect the information needed to sync with the sonode
866 	 */
867 	tcp_do_capability_ack(tcp, &tca, TC1_INFO|TC1_ACCEPTOR_ID);
868 
869 	laddrlen = faddrlen = sizeof (sin6_t);
870 	(void) tcp_getsockname((sock_lower_handle_t)connp,
871 	    (struct sockaddr *)&laddr, &laddrlen, CRED());
872 	error = tcp_getpeername((sock_lower_handle_t)connp,
873 	    (struct sockaddr *)&faddr, &faddrlen, CRED());
874 	if (error != 0)
875 		faddrlen = 0;
876 
877 	opts = 0;
878 	if (connp->conn_oobinline)
879 		opts |= SO_OOBINLINE;
880 	if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
881 		opts |= SO_DONTROUTE;
882 
883 	/*
884 	 * Notify the socket that the protocol is now quiescent,
885 	 * and it's therefore safe move data from the socket
886 	 * to the stream head.
887 	 */
888 	mp = (*quiesced_cb)(connp->conn_upper_handle, arg, &tca,
889 	    (struct sockaddr *)&laddr, laddrlen,
890 	    (struct sockaddr *)&faddr, faddrlen, opts);
891 
892 	while (mp != NULL) {
893 		mpnext = mp->b_next;
894 		tcp->tcp_rcv_list = mp->b_next;
895 		mp->b_next = NULL;
896 		putnext(q, mp);
897 		mp = mpnext;
898 	}
899 	ASSERT(tcp->tcp_rcv_last_head == NULL);
900 	ASSERT(tcp->tcp_rcv_last_tail == NULL);
901 	ASSERT(tcp->tcp_rcv_cnt == 0);
902 
903 	/*
904 	 * All eagers in q0 are marked as being non-STREAM, so they will
905 	 * make su_newconn upcalls when the handshake completes, which
906 	 * will fail (resulting in the conn being closed). So we just blow
907 	 * off everything in q0 instead of waiting for the inevitable.
908 	 */
909 	if (tcp->tcp_conn_req_cnt_q0 != 0)
910 		tcp_eager_cleanup(tcp, B_TRUE);
911 }
912 
913 /*
914  * An eager is falling back to TPI. All we have to do is send
915  * up a T_CONN_IND.
916  */
917 static void
918 tcp_fallback_eager(tcp_t *eager, boolean_t issocket,
919     so_proto_quiesced_cb_t quiesced_cb, sock_quiesce_arg_t *arg)
920 {
921 	conn_t *connp = eager->tcp_connp;
922 	tcp_t *listener = eager->tcp_listener;
923 	mblk_t *mp;
924 
925 	ASSERT(listener != NULL);
926 
927 	/*
928 	 * Notify the socket that the protocol is now quiescent,
929 	 * and it's therefore safe move data from the socket
930 	 * to tcp's rcv queue.
931 	 */
932 	mp = (*quiesced_cb)(connp->conn_upper_handle, arg, NULL, NULL, 0,
933 	    NULL, 0, 0);
934 
935 	if (mp != NULL) {
936 		ASSERT(eager->tcp_rcv_cnt == 0);
937 
938 		eager->tcp_rcv_list = mp;
939 		eager->tcp_rcv_cnt = msgdsize(mp);
940 		while (mp->b_next != NULL) {
941 			mp = mp->b_next;
942 			eager->tcp_rcv_cnt += msgdsize(mp);
943 		}
944 		eager->tcp_rcv_last_head = mp;
945 		while (mp->b_cont)
946 			mp = mp->b_cont;
947 		eager->tcp_rcv_last_tail = mp;
948 		if (eager->tcp_rcv_cnt > eager->tcp_rwnd)
949 			eager->tcp_rwnd = 0;
950 		else
951 			eager->tcp_rwnd -= eager->tcp_rcv_cnt;
952 	}
953 
954 	if (!issocket)
955 		eager->tcp_issocket = B_FALSE;
956 	/*
957 	 * The stream for this eager does not yet exist, so mark it as
958 	 * being detached.
959 	 */
960 	eager->tcp_detached = B_TRUE;
961 	eager->tcp_hard_binding = B_TRUE;
962 	connp->conn_rq = listener->tcp_connp->conn_rq;
963 	connp->conn_wq = listener->tcp_connp->conn_wq;
964 
965 	/* Send up the connection indication */
966 	mp = eager->tcp_conn.tcp_eager_conn_ind;
967 	ASSERT(mp != NULL);
968 	eager->tcp_conn.tcp_eager_conn_ind = NULL;
969 
970 	/*
971 	 * TLI/XTI applications will get confused by
972 	 * sending eager as an option since it violates
973 	 * the option semantics. So remove the eager as
974 	 * option since TLI/XTI app doesn't need it anyway.
975 	 */
976 	if (!issocket) {
977 		struct T_conn_ind *conn_ind;
978 
979 		conn_ind = (struct T_conn_ind *)mp->b_rptr;
980 		conn_ind->OPT_length = 0;
981 		conn_ind->OPT_offset = 0;
982 	}
983 
984 	/*
985 	 * Sockfs guarantees that the listener will not be closed
986 	 * during fallback. So we can safely use the listener's queue.
987 	 */
988 	putnext(listener->tcp_connp->conn_rq, mp);
989 }
990 
991 
992 int
993 tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
994     boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb,
995     sock_quiesce_arg_t *arg)
996 {
997 	tcp_t			*tcp;
998 	conn_t 			*connp = (conn_t *)proto_handle;
999 	int			error;
1000 	mblk_t			*stropt_mp;
1001 	mblk_t			*ordrel_mp;
1002 
1003 	tcp = connp->conn_tcp;
1004 
1005 	stropt_mp = allocb_wait(sizeof (struct stroptions), BPRI_HI, STR_NOSIG,
1006 	    NULL);
1007 
1008 	/* Pre-allocate the T_ordrel_ind mblk. */
1009 	ASSERT(tcp->tcp_ordrel_mp == NULL);
1010 	ordrel_mp = allocb_wait(sizeof (struct T_ordrel_ind), BPRI_HI,
1011 	    STR_NOSIG, NULL);
1012 	ordrel_mp->b_datap->db_type = M_PROTO;
1013 	((struct T_ordrel_ind *)ordrel_mp->b_rptr)->PRIM_type = T_ORDREL_IND;
1014 	ordrel_mp->b_wptr += sizeof (struct T_ordrel_ind);
1015 
1016 	/*
1017 	 * Enter the squeue so that no new packets can come in
1018 	 */
1019 	error = squeue_synch_enter(connp, NULL);
1020 	if (error != 0) {
1021 		/* failed to enter, free all the pre-allocated messages. */
1022 		freeb(stropt_mp);
1023 		freeb(ordrel_mp);
1024 		return (ENOMEM);
1025 	}
1026 
1027 	/*
1028 	 * Both endpoints must be of the same type (either STREAMS or
1029 	 * non-STREAMS) for fusion to be enabled. So if we are fused,
1030 	 * we have to unfuse.
1031 	 */
1032 	if (tcp->tcp_fused)
1033 		tcp_unfuse(tcp);
1034 
1035 	if (tcp->tcp_listener != NULL) {
1036 		/* The eager will deal with opts when accept() is called */
1037 		freeb(stropt_mp);
1038 		tcp_fallback_eager(tcp, direct_sockfs, quiesced_cb, arg);
1039 	} else {
1040 		tcp_fallback_noneager(tcp, stropt_mp, q, direct_sockfs,
1041 		    quiesced_cb, arg);
1042 	}
1043 
1044 	/*
1045 	 * No longer a direct socket
1046 	 *
1047 	 * Note that we intentionally leave the upper_handle and upcalls
1048 	 * intact, since eagers may still be using them.
1049 	 */
1050 	connp->conn_flags &= ~IPCL_NONSTR;
1051 	tcp->tcp_ordrel_mp = ordrel_mp;
1052 
1053 	/*
1054 	 * There should be atleast two ref's (IP + TCP)
1055 	 */
1056 	ASSERT(connp->conn_ref >= 2);
1057 	squeue_synch_exit(connp);
1058 
1059 	return (0);
1060 }
1061 
1062 /*
1063  * Notifies a non-STREAMS based listener about a new connection. This
1064  * function is executed on the *eager*'s squeue once the 3 way handshake
1065  * has completed. Note that the behavior differs from STREAMS, where the
1066  * T_CONN_IND is sent up by tcp_send_conn_ind while on the *listener*'s
1067  * squeue.
1068  *
1069  * Returns B_TRUE if the notification succeeded, in which case `tcp' will
1070  * be moved over to the ESTABLISHED list (q) of the listener. Othwerise,
1071  * B_FALSE is returned and `tcp' is killed.
1072  */
1073 boolean_t
1074 tcp_newconn_notify(tcp_t *tcp, ip_recv_attr_t *ira)
1075 {
1076 	tcp_t *listener = tcp->tcp_listener;
1077 	conn_t *lconnp = listener->tcp_connp;
1078 	conn_t *econnp = tcp->tcp_connp;
1079 	tcp_t *tail;
1080 	ipaddr_t *addr_cache;
1081 	sock_upper_handle_t upper;
1082 	struct sock_proto_props sopp;
1083 	mblk_t *mp;
1084 
1085 	mutex_enter(&listener->tcp_eager_lock);
1086 	/*
1087 	 * Take the eager out, if it is in the list of droppable eagers
1088 	 * as we are here because the 3W handshake is over.
1089 	 */
1090 	MAKE_UNDROPPABLE(tcp);
1091 	/*
1092 	 * The eager already has an extra ref put in tcp_input_data
1093 	 * so that it stays till accept comes back even though it
1094 	 * might get into TCPS_CLOSED as a result of a TH_RST etc.
1095 	 */
1096 	ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
1097 	listener->tcp_conn_req_cnt_q0--;
1098 	listener->tcp_conn_req_cnt_q++;
1099 
1100 	/* Move from SYN_RCVD to ESTABLISHED list  */
1101 	tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = tcp->tcp_eager_prev_q0;
1102 	tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp->tcp_eager_next_q0;
1103 	tcp->tcp_eager_prev_q0 = NULL;
1104 	tcp->tcp_eager_next_q0 = NULL;
1105 
1106 	/*
1107 	 * Insert at end of the queue because connections are accepted
1108 	 * in chronological order. Leaving the older connections at front
1109 	 * of the queue helps reducing search time.
1110 	 */
1111 	tail = listener->tcp_eager_last_q;
1112 	if (tail != NULL)
1113 		tail->tcp_eager_next_q = tcp;
1114 	else
1115 		listener->tcp_eager_next_q = tcp;
1116 	listener->tcp_eager_last_q = tcp;
1117 	tcp->tcp_eager_next_q = NULL;
1118 
1119 	/* we have timed out before */
1120 	if (tcp->tcp_syn_rcvd_timeout != 0) {
1121 		tcp->tcp_syn_rcvd_timeout = 0;
1122 		listener->tcp_syn_rcvd_timeout--;
1123 		if (listener->tcp_syn_defense &&
1124 		    listener->tcp_syn_rcvd_timeout <=
1125 		    (listener->tcp_tcps->tcps_conn_req_max_q0 >> 5) &&
1126 		    10*MINUTES < TICK_TO_MSEC(ddi_get_lbolt64() -
1127 		    listener->tcp_last_rcv_lbolt)) {
1128 			/*
1129 			 * Turn off the defense mode if we
1130 			 * believe the SYN attack is over.
1131 			 */
1132 			listener->tcp_syn_defense = B_FALSE;
1133 			if (listener->tcp_ip_addr_cache) {
1134 				kmem_free((void *)listener->tcp_ip_addr_cache,
1135 				    IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
1136 				listener->tcp_ip_addr_cache = NULL;
1137 			}
1138 		}
1139 	}
1140 	addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache);
1141 	if (addr_cache != NULL) {
1142 		/*
1143 		 * We have finished a 3-way handshake with this
1144 		 * remote host. This proves the IP addr is good.
1145 		 * Cache it!
1146 		 */
1147 		addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] =
1148 		    tcp->tcp_connp->conn_faddr_v4;
1149 	}
1150 	mutex_exit(&listener->tcp_eager_lock);
1151 
1152 	/*
1153 	 * Notify the ULP about the newconn. It is guaranteed that no
1154 	 * tcp_accept() call will be made for the eager if the
1155 	 * notification fails.
1156 	 */
1157 	if ((upper = (*lconnp->conn_upcalls->su_newconn)
1158 	    (lconnp->conn_upper_handle, (sock_lower_handle_t)econnp,
1159 	    &sock_tcp_downcalls, ira->ira_cred, ira->ira_cpid,
1160 	    &econnp->conn_upcalls)) == NULL) {
1161 		/*
1162 		 * Normally this should not happen, but the listener might
1163 		 * have done a fallback to TPI followed by a close(), in
1164 		 * which case tcp_closemp for this conn might have been
1165 		 * used by tcp_eager_cleanup().
1166 		 */
1167 		mutex_enter(&listener->tcp_eager_lock);
1168 		if (tcp->tcp_closemp_used) {
1169 			mutex_exit(&listener->tcp_eager_lock);
1170 			return (B_FALSE);
1171 		}
1172 		tcp->tcp_closemp_used = B_TRUE;
1173 		TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
1174 		mp = &tcp->tcp_closemp;
1175 		mutex_exit(&listener->tcp_eager_lock);
1176 		tcp_eager_kill(econnp, mp, NULL, NULL);
1177 		return (B_FALSE);
1178 	}
1179 	econnp->conn_upper_handle = upper;
1180 
1181 	tcp->tcp_detached = B_FALSE;
1182 	tcp->tcp_hard_binding = B_FALSE;
1183 	tcp->tcp_tconnind_started = B_TRUE;
1184 
1185 	if (econnp->conn_keepalive) {
1186 		tcp->tcp_ka_last_intrvl = 0;
1187 		tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer,
1188 		    tcp->tcp_ka_interval);
1189 	}
1190 
1191 	/* Update the necessary parameters */
1192 	tcp_get_proto_props(tcp, &sopp);
1193 
1194 	(*econnp->conn_upcalls->su_set_proto_props)
1195 	    (econnp->conn_upper_handle, &sopp);
1196 
1197 	return (B_TRUE);
1198 }
1199