xref: /titanic_52/usr/src/uts/common/inet/tcp/tcp_socket.c (revision 3e95bd4ab92abca814bd28e854607d1975c7dc88)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 /* This file contains all TCP kernel socket related functions. */
27 
28 #include <sys/types.h>
29 #include <sys/strlog.h>
30 #include <sys/policy.h>
31 #include <sys/sockio.h>
32 #include <sys/strsubr.h>
33 #include <sys/strsun.h>
34 #include <sys/squeue_impl.h>
35 #include <sys/squeue.h>
36 #define	_SUN_TPI_VERSION 2
37 #include <sys/tihdr.h>
38 #include <sys/timod.h>
39 #include <sys/tpicommon.h>
40 #include <sys/socketvar.h>
41 
42 #include <inet/common.h>
43 #include <inet/proto_set.h>
44 #include <inet/ip.h>
45 #include <inet/tcp.h>
46 #include <inet/tcp_impl.h>
47 
48 static void	tcp_activate(sock_lower_handle_t, sock_upper_handle_t,
49 		    sock_upcalls_t *, int, cred_t *);
50 static int	tcp_accept(sock_lower_handle_t, sock_lower_handle_t,
51 		    sock_upper_handle_t, cred_t *);
52 static int	tcp_bind(sock_lower_handle_t, struct sockaddr *,
53 		    socklen_t, cred_t *);
54 static int	tcp_listen(sock_lower_handle_t, int, cred_t *);
55 static int	tcp_connect(sock_lower_handle_t, const struct sockaddr *,
56 		    socklen_t, sock_connid_t *, cred_t *);
57 static int	tcp_getsockopt(sock_lower_handle_t, int, int, void *,
58 		    socklen_t *, cred_t *);
59 static int	tcp_setsockopt(sock_lower_handle_t, int, int, const void *,
60 		    socklen_t, cred_t *);
61 static int	tcp_sendmsg(sock_lower_handle_t, mblk_t *, struct nmsghdr *,
62 		    cred_t *cr);
63 static int	tcp_shutdown(sock_lower_handle_t, int, cred_t *);
64 static void	tcp_clr_flowctrl(sock_lower_handle_t);
65 static int	tcp_ioctl(sock_lower_handle_t, int, intptr_t, int, int32_t *,
66 		    cred_t *);
67 static int	tcp_close(sock_lower_handle_t, int, cred_t *);
68 
/*
 * Downcall vector for non-STREAMS TCP sockets.  tcp_create() returns the
 * address of this table to its caller through *sock_downcalls.
 *
 * The initializer is positional, so the order of the entries must match the
 * member order of sock_downcalls_t.
 */
sock_downcalls_t sock_tcp_downcalls = {
	tcp_activate,
	tcp_accept,
	tcp_bind,
	tcp_listen,
	tcp_connect,
	tcp_getpeername,
	tcp_getsockname,
	tcp_getsockopt,
	tcp_setsockopt,
	tcp_sendmsg,
	/*
	 * NOTE(review): the next three slots are left NULL; presumably these
	 * are downcalls TCP does not implement -- confirm which members they
	 * are against the sock_downcalls_t definition.
	 */
	NULL,
	NULL,
	NULL,
	tcp_shutdown,
	tcp_clr_flowctrl,
	tcp_ioctl,
	tcp_close,
};
88 
89 /* ARGSUSED */
90 static void
91 tcp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle,
92     sock_upcalls_t *sock_upcalls, int flags, cred_t *cr)
93 {
94 	conn_t *connp = (conn_t *)proto_handle;
95 	struct sock_proto_props sopp;
96 	extern struct module_info tcp_rinfo;
97 
98 	ASSERT(connp->conn_upper_handle == NULL);
99 
100 	/* All Solaris components should pass a cred for this operation. */
101 	ASSERT(cr != NULL);
102 
103 	sopp.sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT |
104 	    SOCKOPT_MAXPSZ | SOCKOPT_MAXBLK | SOCKOPT_RCVTIMER |
105 	    SOCKOPT_RCVTHRESH | SOCKOPT_MAXADDRLEN | SOCKOPT_MINPSZ;
106 
107 	sopp.sopp_rxhiwat = SOCKET_RECVHIWATER;
108 	sopp.sopp_rxlowat = SOCKET_RECVLOWATER;
109 	sopp.sopp_maxpsz = INFPSZ;
110 	sopp.sopp_maxblk = INFPSZ;
111 	sopp.sopp_rcvtimer = SOCKET_TIMER_INTERVAL;
112 	sopp.sopp_rcvthresh = SOCKET_RECVHIWATER >> 3;
113 	sopp.sopp_maxaddrlen = sizeof (sin6_t);
114 	sopp.sopp_minpsz = (tcp_rinfo.mi_minpsz == 1) ? 0 :
115 	    tcp_rinfo.mi_minpsz;
116 
117 	connp->conn_upcalls = sock_upcalls;
118 	connp->conn_upper_handle = sock_handle;
119 
120 	ASSERT(connp->conn_rcvbuf != 0 &&
121 	    connp->conn_rcvbuf == connp->conn_tcp->tcp_rwnd);
122 	(*sock_upcalls->su_set_proto_props)(sock_handle, &sopp);
123 }
124 
125 /*ARGSUSED*/
126 static int
127 tcp_accept(sock_lower_handle_t lproto_handle,
128     sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle,
129     cred_t *cr)
130 {
131 	conn_t *lconnp, *econnp;
132 	tcp_t *listener, *eager;
133 
134 	lconnp = (conn_t *)lproto_handle;
135 	listener = lconnp->conn_tcp;
136 	ASSERT(listener->tcp_state == TCPS_LISTEN);
137 	econnp = (conn_t *)eproto_handle;
138 	eager = econnp->conn_tcp;
139 	ASSERT(eager->tcp_listener != NULL);
140 	ASSERT(IPCL_IS_NONSTR(econnp));
141 	ASSERT(lconnp->conn_upper_handle != NULL);
142 
143 	/*
144 	 * It is possible for the accept thread to race with the thread that
145 	 * made the su_newconn upcall in tcp_newconn_notify. Both
146 	 * tcp_newconn_notify and tcp_accept require that conn_upper_handle
147 	 * and conn_upcalls be set before returning, so they both write to
148 	 * them. However, we're guaranteed that the value written is the same
149 	 * for both threads.
150 	 */
151 	ASSERT(econnp->conn_upper_handle == NULL ||
152 	    econnp->conn_upper_handle == sock_handle);
153 	ASSERT(econnp->conn_upcalls == NULL ||
154 	    econnp->conn_upcalls == lconnp->conn_upcalls);
155 	econnp->conn_upper_handle = sock_handle;
156 	econnp->conn_upcalls = lconnp->conn_upcalls;
157 
158 	ASSERT(econnp->conn_netstack ==
159 	    listener->tcp_connp->conn_netstack);
160 	ASSERT(eager->tcp_tcps == listener->tcp_tcps);
161 
162 	/*
163 	 * We should have a minimum of 2 references on the conn at this
164 	 * point. One for TCP and one for the newconn notification
165 	 * (which is now taken over by IP). In the normal case we would
166 	 * also have another reference (making a total of 3) for the conn
167 	 * being in the classifier hash list. However the eager could have
168 	 * received an RST subsequently and tcp_closei_local could have
169 	 * removed the eager from the classifier hash list, hence we can't
170 	 * assert that reference.
171 	 */
172 	ASSERT(econnp->conn_ref >= 2);
173 
174 	/*
175 	 * An error is returned if this conn has been reset, which will
176 	 * cause the socket to be closed immediately. The eager will be
177 	 * unlinked from the listener during close.
178 	 */
179 	if (eager->tcp_state < TCPS_ESTABLISHED)
180 		return (ECONNABORTED);
181 
182 	mutex_enter(&listener->tcp_eager_lock);
183 	/*
184 	 * Non-STREAMS listeners never defer the notification of new
185 	 * connections.
186 	 */
187 	ASSERT(!listener->tcp_eager_prev_q0->tcp_conn_def_q0);
188 	tcp_eager_unlink(eager);
189 	mutex_exit(&listener->tcp_eager_lock);
190 	CONN_DEC_REF(listener->tcp_connp);
191 
192 	return (0);
193 }
194 
195 static int
196 tcp_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
197     socklen_t len, cred_t *cr)
198 {
199 	int 		error;
200 	conn_t		*connp = (conn_t *)proto_handle;
201 
202 	/* All Solaris components should pass a cred for this operation. */
203 	ASSERT(cr != NULL);
204 	ASSERT(connp->conn_upper_handle != NULL);
205 
206 	error = squeue_synch_enter(connp, NULL);
207 	if (error != 0) {
208 		/* failed to enter */
209 		return (ENOSR);
210 	}
211 
212 	/* binding to a NULL address really means unbind */
213 	if (sa == NULL) {
214 		if (connp->conn_tcp->tcp_state < TCPS_LISTEN)
215 			error = tcp_do_unbind(connp);
216 		else
217 			error = EINVAL;
218 	} else {
219 		error = tcp_do_bind(connp, sa, len, cr, B_TRUE);
220 	}
221 
222 	squeue_synch_exit(connp);
223 
224 	if (error < 0) {
225 		if (error == -TOUTSTATE)
226 			error = EINVAL;
227 		else
228 			error = proto_tlitosyserr(-error);
229 	}
230 
231 	return (error);
232 }
233 
234 /* ARGSUSED */
235 static int
236 tcp_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr)
237 {
238 	conn_t	*connp = (conn_t *)proto_handle;
239 	tcp_t	*tcp = connp->conn_tcp;
240 	int 	error;
241 
242 	ASSERT(connp->conn_upper_handle != NULL);
243 
244 	/* All Solaris components should pass a cred for this operation. */
245 	ASSERT(cr != NULL);
246 
247 	error = squeue_synch_enter(connp, NULL);
248 	if (error != 0) {
249 		/* failed to enter */
250 		return (ENOBUFS);
251 	}
252 
253 	error = tcp_do_listen(connp, NULL, 0, backlog, cr, B_FALSE);
254 	if (error == 0) {
255 		/*
256 		 * sockfs needs to know what's the maximum number of socket
257 		 * that can be queued on the listener.
258 		 */
259 		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
260 		    SOCK_OPCTL_ENAB_ACCEPT,
261 		    (uintptr_t)(tcp->tcp_conn_req_max +
262 		    tcp->tcp_tcps->tcps_conn_req_max_q0));
263 	} else if (error < 0) {
264 		if (error == -TOUTSTATE)
265 			error = EINVAL;
266 		else
267 			error = proto_tlitosyserr(-error);
268 	}
269 	squeue_synch_exit(connp);
270 	return (error);
271 }
272 
273 static int
274 tcp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
275     socklen_t len, sock_connid_t *id, cred_t *cr)
276 {
277 	conn_t		*connp = (conn_t *)proto_handle;
278 	int		error;
279 
280 	ASSERT(connp->conn_upper_handle != NULL);
281 
282 	/* All Solaris components should pass a cred for this operation. */
283 	ASSERT(cr != NULL);
284 
285 	error = proto_verify_ip_addr(connp->conn_family, sa, len);
286 	if (error != 0) {
287 		return (error);
288 	}
289 
290 	error = squeue_synch_enter(connp, NULL);
291 	if (error != 0) {
292 		/* failed to enter */
293 		return (ENOSR);
294 	}
295 
296 	/*
297 	 * TCP supports quick connect, so no need to do an implicit bind
298 	 */
299 	error = tcp_do_connect(connp, sa, len, cr, curproc->p_pid);
300 	if (error == 0) {
301 		*id = connp->conn_tcp->tcp_connid;
302 	} else if (error < 0) {
303 		if (error == -TOUTSTATE) {
304 			switch (connp->conn_tcp->tcp_state) {
305 			case TCPS_SYN_SENT:
306 				error = EALREADY;
307 				break;
308 			case TCPS_ESTABLISHED:
309 				error = EISCONN;
310 				break;
311 			case TCPS_LISTEN:
312 				error = EOPNOTSUPP;
313 				break;
314 			default:
315 				error = EINVAL;
316 				break;
317 			}
318 		} else {
319 			error = proto_tlitosyserr(-error);
320 		}
321 	}
322 
323 	if (connp->conn_tcp->tcp_loopback) {
324 		struct sock_proto_props sopp;
325 
326 		sopp.sopp_flags = SOCKOPT_LOOPBACK;
327 		sopp.sopp_loopback = B_TRUE;
328 
329 		(*connp->conn_upcalls->su_set_proto_props)(
330 		    connp->conn_upper_handle, &sopp);
331 	}
332 done:
333 	squeue_synch_exit(connp);
334 
335 	return ((error == 0) ? EINPROGRESS : error);
336 }
337 
338 /* ARGSUSED3 */
339 int
340 tcp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr,
341     socklen_t *addrlenp, cred_t *cr)
342 {
343 	conn_t	*connp = (conn_t *)proto_handle;
344 	tcp_t	*tcp = connp->conn_tcp;
345 
346 	/* All Solaris components should pass a cred for this operation. */
347 	ASSERT(cr != NULL);
348 
349 	ASSERT(tcp != NULL);
350 	if (tcp->tcp_state < TCPS_SYN_RCVD)
351 		return (ENOTCONN);
352 
353 	return (conn_getpeername(connp, addr, addrlenp));
354 }
355 
356 /* ARGSUSED3 */
357 int
358 tcp_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *addr,
359     socklen_t *addrlenp, cred_t *cr)
360 {
361 	conn_t	*connp = (conn_t *)proto_handle;
362 
363 	/* All Solaris components should pass a cred for this operation. */
364 	ASSERT(cr != NULL);
365 
366 	return (conn_getsockname(connp, addr, addrlenp));
367 }
368 
369 /* returns UNIX error, the optlen is a value-result arg */
370 static int
371 tcp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
372     void *optvalp, socklen_t *optlen, cred_t *cr)
373 {
374 	conn_t		*connp = (conn_t *)proto_handle;
375 	int		error;
376 	t_uscalar_t	max_optbuf_len;
377 	void		*optvalp_buf;
378 	int		len;
379 
380 	ASSERT(connp->conn_upper_handle != NULL);
381 
382 	error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len,
383 	    tcp_opt_obj.odb_opt_des_arr,
384 	    tcp_opt_obj.odb_opt_arr_cnt,
385 	    B_FALSE, B_TRUE, cr);
386 	if (error != 0) {
387 		if (error < 0) {
388 			error = proto_tlitosyserr(-error);
389 		}
390 		return (error);
391 	}
392 
393 	optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP);
394 
395 	error = squeue_synch_enter(connp, NULL);
396 	if (error == ENOMEM) {
397 		kmem_free(optvalp_buf, max_optbuf_len);
398 		return (ENOMEM);
399 	}
400 
401 	len = tcp_opt_get(connp, level, option_name, optvalp_buf);
402 	squeue_synch_exit(connp);
403 
404 	if (len == -1) {
405 		kmem_free(optvalp_buf, max_optbuf_len);
406 		return (EINVAL);
407 	}
408 
409 	/*
410 	 * update optlen and copy option value
411 	 */
412 	t_uscalar_t size = MIN(len, *optlen);
413 
414 	bcopy(optvalp_buf, optvalp, size);
415 	bcopy(&size, optlen, sizeof (size));
416 
417 	kmem_free(optvalp_buf, max_optbuf_len);
418 	return (0);
419 }
420 
421 static int
422 tcp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
423     const void *optvalp, socklen_t optlen, cred_t *cr)
424 {
425 	conn_t		*connp = (conn_t *)proto_handle;
426 	int		error;
427 
428 	ASSERT(connp->conn_upper_handle != NULL);
429 	/*
430 	 * Entering the squeue synchronously can result in a context switch,
431 	 * which can cause a rather sever performance degradation. So we try to
432 	 * handle whatever options we can without entering the squeue.
433 	 */
434 	if (level == IPPROTO_TCP) {
435 		switch (option_name) {
436 		case TCP_NODELAY:
437 			if (optlen != sizeof (int32_t))
438 				return (EINVAL);
439 			mutex_enter(&connp->conn_tcp->tcp_non_sq_lock);
440 			connp->conn_tcp->tcp_naglim = *(int *)optvalp ? 1 :
441 			    connp->conn_tcp->tcp_mss;
442 			mutex_exit(&connp->conn_tcp->tcp_non_sq_lock);
443 			return (0);
444 		default:
445 			break;
446 		}
447 	}
448 
449 	error = squeue_synch_enter(connp, NULL);
450 	if (error == ENOMEM) {
451 		return (ENOMEM);
452 	}
453 
454 	error = proto_opt_check(level, option_name, optlen, NULL,
455 	    tcp_opt_obj.odb_opt_des_arr,
456 	    tcp_opt_obj.odb_opt_arr_cnt,
457 	    B_TRUE, B_FALSE, cr);
458 
459 	if (error != 0) {
460 		if (error < 0) {
461 			error = proto_tlitosyserr(-error);
462 		}
463 		squeue_synch_exit(connp);
464 		return (error);
465 	}
466 
467 	error = tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name,
468 	    optlen, (uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp,
469 	    NULL, cr);
470 	squeue_synch_exit(connp);
471 
472 	ASSERT(error >= 0);
473 
474 	return (error);
475 }
476 
477 /* ARGSUSED */
478 static int
479 tcp_sendmsg(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
480     cred_t *cr)
481 {
482 	tcp_t		*tcp;
483 	uint32_t	msize;
484 	conn_t *connp = (conn_t *)proto_handle;
485 	int32_t		tcpstate;
486 
487 	/* All Solaris components should pass a cred for this operation. */
488 	ASSERT(cr != NULL);
489 
490 	ASSERT(connp->conn_ref >= 2);
491 	ASSERT(connp->conn_upper_handle != NULL);
492 
493 	if (msg->msg_controllen != 0) {
494 		freemsg(mp);
495 		return (EOPNOTSUPP);
496 	}
497 
498 	switch (DB_TYPE(mp)) {
499 	case M_DATA:
500 		tcp = connp->conn_tcp;
501 		ASSERT(tcp != NULL);
502 
503 		tcpstate = tcp->tcp_state;
504 		if (tcpstate < TCPS_ESTABLISHED) {
505 			freemsg(mp);
506 			/*
507 			 * We return ENOTCONN if the endpoint is trying to
508 			 * connect or has never been connected, and EPIPE if it
509 			 * has been disconnected. The connection id helps us
510 			 * distinguish between the last two cases.
511 			 */
512 			return ((tcpstate == TCPS_SYN_SENT) ? ENOTCONN :
513 			    ((tcp->tcp_connid > 0) ? EPIPE : ENOTCONN));
514 		} else if (tcpstate > TCPS_CLOSE_WAIT) {
515 			freemsg(mp);
516 			return (EPIPE);
517 		}
518 
519 		msize = msgdsize(mp);
520 
521 		mutex_enter(&tcp->tcp_non_sq_lock);
522 		tcp->tcp_squeue_bytes += msize;
523 		/*
524 		 * Squeue Flow Control
525 		 */
526 		if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) {
527 			tcp_setqfull(tcp);
528 		}
529 		mutex_exit(&tcp->tcp_non_sq_lock);
530 
531 		/*
532 		 * The application may pass in an address in the msghdr, but
533 		 * we ignore the address on connection-oriented sockets.
534 		 * Just like BSD this code does not generate an error for
535 		 * TCP (a CONNREQUIRED socket) when sending to an address
536 		 * passed in with sendto/sendmsg. Instead the data is
537 		 * delivered on the connection as if no address had been
538 		 * supplied.
539 		 */
540 		CONN_INC_REF(connp);
541 
542 		if (msg->msg_flags & MSG_OOB) {
543 			SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output_urgent,
544 			    connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
545 		} else {
546 			SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output,
547 			    connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
548 		}
549 
550 		return (0);
551 
552 	default:
553 		ASSERT(0);
554 	}
555 
556 	freemsg(mp);
557 	return (0);
558 }
559 
560 /* ARGSUSED */
561 static int
562 tcp_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
563 {
564 	conn_t  *connp = (conn_t *)proto_handle;
565 	tcp_t   *tcp = connp->conn_tcp;
566 
567 	ASSERT(connp->conn_upper_handle != NULL);
568 
569 	/* All Solaris components should pass a cred for this operation. */
570 	ASSERT(cr != NULL);
571 
572 	/*
573 	 * X/Open requires that we check the connected state.
574 	 */
575 	if (tcp->tcp_state < TCPS_SYN_SENT)
576 		return (ENOTCONN);
577 
578 	/* shutdown the send side */
579 	if (how != SHUT_RD) {
580 		mblk_t *bp;
581 
582 		bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL);
583 		CONN_INC_REF(connp);
584 		SQUEUE_ENTER_ONE(connp->conn_sqp, bp, tcp_shutdown_output,
585 		    connp, NULL, SQ_NODRAIN, SQTAG_TCP_SHUTDOWN_OUTPUT);
586 
587 		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
588 		    SOCK_OPCTL_SHUT_SEND, 0);
589 	}
590 
591 	/* shutdown the recv side */
592 	if (how != SHUT_WR)
593 		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
594 		    SOCK_OPCTL_SHUT_RECV, 0);
595 
596 	return (0);
597 }
598 
599 static void
600 tcp_clr_flowctrl(sock_lower_handle_t proto_handle)
601 {
602 	conn_t  *connp = (conn_t *)proto_handle;
603 	tcp_t	*tcp = connp->conn_tcp;
604 	mblk_t *mp;
605 	int error;
606 
607 	ASSERT(connp->conn_upper_handle != NULL);
608 
609 	/*
610 	 * If tcp->tcp_rsrv_mp == NULL, it means that tcp_clr_flowctrl()
611 	 * is currently running.
612 	 */
613 	mutex_enter(&tcp->tcp_rsrv_mp_lock);
614 	if ((mp = tcp->tcp_rsrv_mp) == NULL) {
615 		mutex_exit(&tcp->tcp_rsrv_mp_lock);
616 		return;
617 	}
618 	tcp->tcp_rsrv_mp = NULL;
619 	mutex_exit(&tcp->tcp_rsrv_mp_lock);
620 
621 	error = squeue_synch_enter(connp, mp);
622 	ASSERT(error == 0);
623 
624 	mutex_enter(&tcp->tcp_rsrv_mp_lock);
625 	tcp->tcp_rsrv_mp = mp;
626 	mutex_exit(&tcp->tcp_rsrv_mp_lock);
627 
628 	if (tcp->tcp_fused) {
629 		tcp_fuse_backenable(tcp);
630 	} else {
631 		tcp->tcp_rwnd = connp->conn_rcvbuf;
632 		/*
633 		 * Send back a window update immediately if TCP is above
634 		 * ESTABLISHED state and the increase of the rcv window
635 		 * that the other side knows is at least 1 MSS after flow
636 		 * control is lifted.
637 		 */
638 		if (tcp->tcp_state >= TCPS_ESTABLISHED &&
639 		    tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) {
640 			tcp_xmit_ctl(NULL, tcp,
641 			    (tcp->tcp_swnd == 0) ? tcp->tcp_suna :
642 			    tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK);
643 		}
644 	}
645 
646 	squeue_synch_exit(connp);
647 }
648 
649 /* ARGSUSED */
650 static int
651 tcp_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
652     int mode, int32_t *rvalp, cred_t *cr)
653 {
654 	conn_t  	*connp = (conn_t *)proto_handle;
655 	int		error;
656 
657 	ASSERT(connp->conn_upper_handle != NULL);
658 
659 	/* All Solaris components should pass a cred for this operation. */
660 	ASSERT(cr != NULL);
661 
662 	/*
663 	 * If we don't have a helper stream then create one.
664 	 * ip_create_helper_stream takes care of locking the conn_t,
665 	 * so this check for NULL is just a performance optimization.
666 	 */
667 	if (connp->conn_helper_info == NULL) {
668 		tcp_stack_t *tcps = connp->conn_tcp->tcp_tcps;
669 
670 		/*
671 		 * Create a helper stream for non-STREAMS socket.
672 		 */
673 		error = ip_create_helper_stream(connp, tcps->tcps_ldi_ident);
674 		if (error != 0) {
675 			ip0dbg(("tcp_ioctl: create of IP helper stream "
676 			    "failed %d\n", error));
677 			return (error);
678 		}
679 	}
680 
681 	switch (cmd) {
682 		case ND_SET:
683 		case ND_GET:
684 		case _SIOCSOCKFALLBACK:
685 		case TCP_IOC_ABORT_CONN:
686 		case TI_GETPEERNAME:
687 		case TI_GETMYNAME:
688 			ip1dbg(("tcp_ioctl: cmd 0x%x on non streams socket",
689 			    cmd));
690 			error = EINVAL;
691 			break;
692 		default:
693 			/*
694 			 * If the conn is not closing, pass on to IP using
695 			 * helper stream. Bump the ioctlref to prevent tcp_close
696 			 * from closing the rq/wq out from underneath the ioctl
697 			 * if it ends up queued or aborted/interrupted.
698 			 */
699 			mutex_enter(&connp->conn_lock);
700 			if (connp->conn_state_flags & (CONN_CLOSING)) {
701 				mutex_exit(&connp->conn_lock);
702 				error = EINVAL;
703 				break;
704 			}
705 			CONN_INC_IOCTLREF_LOCKED(connp);
706 			error = ldi_ioctl(connp->conn_helper_info->iphs_handle,
707 			    cmd, arg, mode, cr, rvalp);
708 			CONN_DEC_IOCTLREF(connp);
709 			break;
710 	}
711 	return (error);
712 }
713 
714 /* ARGSUSED */
715 static int
716 tcp_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
717 {
718 	conn_t *connp = (conn_t *)proto_handle;
719 
720 	ASSERT(connp->conn_upper_handle != NULL);
721 
722 	/* All Solaris components should pass a cred for this operation. */
723 	ASSERT(cr != NULL);
724 
725 	tcp_close_common(connp, flags);
726 
727 	ip_free_helper_stream(connp);
728 
729 	/*
730 	 * Drop IP's reference on the conn. This is the last reference
731 	 * on the connp if the state was less than established. If the
732 	 * connection has gone into timewait state, then we will have
733 	 * one ref for the TCP and one more ref (total of two) for the
734 	 * classifier connected hash list (a timewait connections stays
735 	 * in connected hash till closed).
736 	 *
737 	 * We can't assert the references because there might be other
738 	 * transient reference places because of some walkers or queued
739 	 * packets in squeue for the timewait state.
740 	 */
741 	CONN_DEC_REF(connp);
742 
743 	/*
744 	 * EINPROGRESS tells sockfs to wait for a 'closed' upcall before
745 	 * freeing the socket.
746 	 */
747 	return (EINPROGRESS);
748 }
749 
750 /* ARGSUSED */
751 sock_lower_handle_t
752 tcp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
753     uint_t *smodep, int *errorp, int flags, cred_t *credp)
754 {
755 	conn_t		*connp;
756 	boolean_t	isv6 = family == AF_INET6;
757 	if (type != SOCK_STREAM || (family != AF_INET && family != AF_INET6) ||
758 	    (proto != 0 && proto != IPPROTO_TCP)) {
759 		*errorp = EPROTONOSUPPORT;
760 		return (NULL);
761 	}
762 
763 	connp = tcp_create_common(credp, isv6, B_TRUE, errorp);
764 	if (connp == NULL) {
765 		return (NULL);
766 	}
767 
768 	/*
769 	 * Put the ref for TCP. Ref for IP was already put
770 	 * by ipcl_conn_create. Also Make the conn_t globally
771 	 * visible to walkers
772 	 */
773 	mutex_enter(&connp->conn_lock);
774 	CONN_INC_REF_LOCKED(connp);
775 	ASSERT(connp->conn_ref == 2);
776 	connp->conn_state_flags &= ~CONN_INCIPIENT;
777 
778 	connp->conn_flags |= IPCL_NONSTR;
779 	mutex_exit(&connp->conn_lock);
780 
781 	ASSERT(errorp != NULL);
782 	*errorp = 0;
783 	*sock_downcalls = &sock_tcp_downcalls;
784 	*smodep = SM_CONNREQUIRED | SM_EXDATA | SM_ACCEPTSUPP |
785 	    SM_SENDFILESUPP;
786 
787 	return ((sock_lower_handle_t)connp);
788 }
789 
790 /*
791  * tcp_fallback
792  *
793  * A direct socket is falling back to using STREAMS. The queue
794  * that is being passed down was created using tcp_open() with
795  * the SO_FALLBACK flag set. As a result, the queue is not
796  * associated with a conn, and the q_ptrs instead contain the
 * dev and minor arena that should be used.
798  *
799  * The 'issocket' flag indicates whether the FireEngine
800  * optimizations should be used. The common case would be that
801  * optimizations are enabled, and they might be subsequently
802  * disabled using the _SIOCSOCKFALLBACK ioctl.
803  */
804 
805 /*
806  * An active connection is falling back to TPI. Gather all the information
807  * required by the STREAM head and TPI sonode and send it up.
808  */
809 static void
810 tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q,
811     boolean_t issocket, so_proto_quiesced_cb_t quiesced_cb,
812     sock_quiesce_arg_t *arg)
813 {
814 	conn_t			*connp = tcp->tcp_connp;
815 	struct stroptions	*stropt;
816 	struct T_capability_ack tca;
817 	struct sockaddr_in6	laddr, faddr;
818 	socklen_t 		laddrlen, faddrlen;
819 	short			opts;
820 	int			error;
821 	mblk_t			*mp, *mpnext;
822 
823 	connp->conn_dev = (dev_t)RD(q)->q_ptr;
824 	connp->conn_minor_arena = WR(q)->q_ptr;
825 
826 	RD(q)->q_ptr = WR(q)->q_ptr = connp;
827 
828 	connp->conn_rq = RD(q);
829 	connp->conn_wq = WR(q);
830 
831 	WR(q)->q_qinfo = &tcp_sock_winit;
832 
833 	if (!issocket)
834 		tcp_use_pure_tpi(tcp);
835 
836 	/*
837 	 * free the helper stream
838 	 */
839 	ip_free_helper_stream(connp);
840 
841 	/*
842 	 * Notify the STREAM head about options
843 	 */
844 	DB_TYPE(stropt_mp) = M_SETOPTS;
845 	stropt = (struct stroptions *)stropt_mp->b_rptr;
846 	stropt_mp->b_wptr += sizeof (struct stroptions);
847 	stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK;
848 
849 	stropt->so_wroff = connp->conn_ht_iphc_len + (tcp->tcp_loopback ? 0 :
850 	    tcp->tcp_tcps->tcps_wroff_xtra);
851 	if (tcp->tcp_snd_sack_ok)
852 		stropt->so_wroff += TCPOPT_MAX_SACK_LEN;
853 	stropt->so_hiwat = connp->conn_rcvbuf;
854 	stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE);
855 
856 	putnext(RD(q), stropt_mp);
857 
858 	/*
859 	 * Collect the information needed to sync with the sonode
860 	 */
861 	tcp_do_capability_ack(tcp, &tca, TC1_INFO|TC1_ACCEPTOR_ID);
862 
863 	laddrlen = faddrlen = sizeof (sin6_t);
864 	(void) tcp_getsockname((sock_lower_handle_t)connp,
865 	    (struct sockaddr *)&laddr, &laddrlen, CRED());
866 	error = tcp_getpeername((sock_lower_handle_t)connp,
867 	    (struct sockaddr *)&faddr, &faddrlen, CRED());
868 	if (error != 0)
869 		faddrlen = 0;
870 
871 	opts = 0;
872 	if (connp->conn_oobinline)
873 		opts |= SO_OOBINLINE;
874 	if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
875 		opts |= SO_DONTROUTE;
876 
877 	/*
878 	 * Notify the socket that the protocol is now quiescent,
879 	 * and it's therefore safe move data from the socket
880 	 * to the stream head.
881 	 */
882 	mp = (*quiesced_cb)(connp->conn_upper_handle, arg, &tca,
883 	    (struct sockaddr *)&laddr, laddrlen,
884 	    (struct sockaddr *)&faddr, faddrlen, opts);
885 
886 	while (mp != NULL) {
887 		mpnext = mp->b_next;
888 		tcp->tcp_rcv_list = mp->b_next;
889 		mp->b_next = NULL;
890 		putnext(q, mp);
891 		mp = mpnext;
892 	}
893 	ASSERT(tcp->tcp_rcv_last_head == NULL);
894 	ASSERT(tcp->tcp_rcv_last_tail == NULL);
895 	ASSERT(tcp->tcp_rcv_cnt == 0);
896 
897 	/*
898 	 * All eagers in q0 are marked as being non-STREAM, so they will
899 	 * make su_newconn upcalls when the handshake completes, which
900 	 * will fail (resulting in the conn being closed). So we just blow
901 	 * off everything in q0 instead of waiting for the inevitable.
902 	 */
903 	if (tcp->tcp_conn_req_cnt_q0 != 0)
904 		tcp_eager_cleanup(tcp, B_TRUE);
905 }
906 
907 /*
908  * An eager is falling back to TPI. All we have to do is send
909  * up a T_CONN_IND.
910  */
911 static void
912 tcp_fallback_eager(tcp_t *eager, boolean_t issocket,
913     so_proto_quiesced_cb_t quiesced_cb, sock_quiesce_arg_t *arg)
914 {
915 	conn_t *connp = eager->tcp_connp;
916 	tcp_t *listener = eager->tcp_listener;
917 	mblk_t *mp;
918 
919 	ASSERT(listener != NULL);
920 
921 	/*
922 	 * Notify the socket that the protocol is now quiescent,
923 	 * and it's therefore safe move data from the socket
924 	 * to tcp's rcv queue.
925 	 */
926 	mp = (*quiesced_cb)(connp->conn_upper_handle, arg, NULL, NULL, 0,
927 	    NULL, 0, 0);
928 
929 	if (mp != NULL) {
930 		ASSERT(eager->tcp_rcv_cnt == 0);
931 
932 		eager->tcp_rcv_list = mp;
933 		eager->tcp_rcv_cnt = msgdsize(mp);
934 		while (mp->b_next != NULL) {
935 			mp = mp->b_next;
936 			eager->tcp_rcv_cnt += msgdsize(mp);
937 		}
938 		eager->tcp_rcv_last_head = mp;
939 		while (mp->b_cont)
940 			mp = mp->b_cont;
941 		eager->tcp_rcv_last_tail = mp;
942 		if (eager->tcp_rcv_cnt > eager->tcp_rwnd)
943 			eager->tcp_rwnd = 0;
944 		else
945 			eager->tcp_rwnd -= eager->tcp_rcv_cnt;
946 	}
947 
948 	if (!issocket)
949 		eager->tcp_issocket = B_FALSE;
950 	/*
951 	 * The stream for this eager does not yet exist, so mark it as
952 	 * being detached.
953 	 */
954 	eager->tcp_detached = B_TRUE;
955 	eager->tcp_hard_binding = B_TRUE;
956 	connp->conn_rq = listener->tcp_connp->conn_rq;
957 	connp->conn_wq = listener->tcp_connp->conn_wq;
958 
959 	/* Send up the connection indication */
960 	mp = eager->tcp_conn.tcp_eager_conn_ind;
961 	ASSERT(mp != NULL);
962 	eager->tcp_conn.tcp_eager_conn_ind = NULL;
963 
964 	/*
965 	 * TLI/XTI applications will get confused by
966 	 * sending eager as an option since it violates
967 	 * the option semantics. So remove the eager as
968 	 * option since TLI/XTI app doesn't need it anyway.
969 	 */
970 	if (!issocket) {
971 		struct T_conn_ind *conn_ind;
972 
973 		conn_ind = (struct T_conn_ind *)mp->b_rptr;
974 		conn_ind->OPT_length = 0;
975 		conn_ind->OPT_offset = 0;
976 	}
977 
978 	/*
979 	 * Sockfs guarantees that the listener will not be closed
980 	 * during fallback. So we can safely use the listener's queue.
981 	 */
982 	putnext(listener->tcp_connp->conn_rq, mp);
983 }
984 
985 
986 int
987 tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
988     boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb,
989     sock_quiesce_arg_t *arg)
990 {
991 	tcp_t			*tcp;
992 	conn_t 			*connp = (conn_t *)proto_handle;
993 	int			error;
994 	mblk_t			*stropt_mp;
995 	mblk_t			*ordrel_mp;
996 
997 	tcp = connp->conn_tcp;
998 
999 	stropt_mp = allocb_wait(sizeof (struct stroptions), BPRI_HI, STR_NOSIG,
1000 	    NULL);
1001 
1002 	/* Pre-allocate the T_ordrel_ind mblk. */
1003 	ASSERT(tcp->tcp_ordrel_mp == NULL);
1004 	ordrel_mp = allocb_wait(sizeof (struct T_ordrel_ind), BPRI_HI,
1005 	    STR_NOSIG, NULL);
1006 	ordrel_mp->b_datap->db_type = M_PROTO;
1007 	((struct T_ordrel_ind *)ordrel_mp->b_rptr)->PRIM_type = T_ORDREL_IND;
1008 	ordrel_mp->b_wptr += sizeof (struct T_ordrel_ind);
1009 
1010 	/*
1011 	 * Enter the squeue so that no new packets can come in
1012 	 */
1013 	error = squeue_synch_enter(connp, NULL);
1014 	if (error != 0) {
1015 		/* failed to enter, free all the pre-allocated messages. */
1016 		freeb(stropt_mp);
1017 		freeb(ordrel_mp);
1018 		return (ENOMEM);
1019 	}
1020 
1021 	/*
1022 	 * Both endpoints must be of the same type (either STREAMS or
1023 	 * non-STREAMS) for fusion to be enabled. So if we are fused,
1024 	 * we have to unfuse.
1025 	 */
1026 	if (tcp->tcp_fused)
1027 		tcp_unfuse(tcp);
1028 
1029 	if (tcp->tcp_listener != NULL) {
1030 		/* The eager will deal with opts when accept() is called */
1031 		freeb(stropt_mp);
1032 		tcp_fallback_eager(tcp, direct_sockfs, quiesced_cb, arg);
1033 	} else {
1034 		tcp_fallback_noneager(tcp, stropt_mp, q, direct_sockfs,
1035 		    quiesced_cb, arg);
1036 	}
1037 
1038 	/*
1039 	 * No longer a direct socket
1040 	 *
1041 	 * Note that we intentionally leave the upper_handle and upcalls
1042 	 * intact, since eagers may still be using them.
1043 	 */
1044 	connp->conn_flags &= ~IPCL_NONSTR;
1045 	tcp->tcp_ordrel_mp = ordrel_mp;
1046 
1047 	/*
1048 	 * There should be atleast two ref's (IP + TCP)
1049 	 */
1050 	ASSERT(connp->conn_ref >= 2);
1051 	squeue_synch_exit(connp);
1052 
1053 	return (0);
1054 }
1055 
1056 /*
1057  * Notifies a non-STREAMS based listener about a new connection. This
1058  * function is executed on the *eager*'s squeue once the 3 way handshake
1059  * has completed. Note that the behavior differs from STREAMS, where the
1060  * T_CONN_IND is sent up by tcp_send_conn_ind while on the *listener*'s
1061  * squeue.
1062  *
1063  * Returns B_TRUE if the notification succeeded, in which case `tcp' will
1064  * be moved over to the ESTABLISHED list (q) of the listener. Othwerise,
1065  * B_FALSE is returned and `tcp' is killed.
1066  */
1067 boolean_t
1068 tcp_newconn_notify(tcp_t *tcp, ip_recv_attr_t *ira)
1069 {
1070 	tcp_t *listener = tcp->tcp_listener;
1071 	conn_t *lconnp = listener->tcp_connp;
1072 	conn_t *econnp = tcp->tcp_connp;
1073 	tcp_t *tail;
1074 	ipaddr_t *addr_cache;
1075 	sock_upper_handle_t upper;
1076 	struct sock_proto_props sopp;
1077 	mblk_t *mp;
1078 
1079 	mutex_enter(&listener->tcp_eager_lock);
1080 	/*
1081 	 * Take the eager out, if it is in the list of droppable eagers
1082 	 * as we are here because the 3W handshake is over.
1083 	 */
1084 	MAKE_UNDROPPABLE(tcp);
1085 	/*
1086 	 * The eager already has an extra ref put in tcp_input_data
1087 	 * so that it stays till accept comes back even though it
1088 	 * might get into TCPS_CLOSED as a result of a TH_RST etc.
1089 	 */
1090 	ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
1091 	listener->tcp_conn_req_cnt_q0--;
1092 	listener->tcp_conn_req_cnt_q++;
1093 
1094 	/* Move from SYN_RCVD to ESTABLISHED list  */
1095 	tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = tcp->tcp_eager_prev_q0;
1096 	tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp->tcp_eager_next_q0;
1097 	tcp->tcp_eager_prev_q0 = NULL;
1098 	tcp->tcp_eager_next_q0 = NULL;
1099 
1100 	/*
1101 	 * Insert at end of the queue because connections are accepted
1102 	 * in chronological order. Leaving the older connections at front
1103 	 * of the queue helps reducing search time.
1104 	 */
1105 	tail = listener->tcp_eager_last_q;
1106 	if (tail != NULL)
1107 		tail->tcp_eager_next_q = tcp;
1108 	else
1109 		listener->tcp_eager_next_q = tcp;
1110 	listener->tcp_eager_last_q = tcp;
1111 	tcp->tcp_eager_next_q = NULL;
1112 
1113 	/* we have timed out before */
1114 	if (tcp->tcp_syn_rcvd_timeout != 0) {
1115 		tcp->tcp_syn_rcvd_timeout = 0;
1116 		listener->tcp_syn_rcvd_timeout--;
1117 		if (listener->tcp_syn_defense &&
1118 		    listener->tcp_syn_rcvd_timeout <=
1119 		    (listener->tcp_tcps->tcps_conn_req_max_q0 >> 5) &&
1120 		    10*MINUTES < TICK_TO_MSEC(ddi_get_lbolt64() -
1121 		    listener->tcp_last_rcv_lbolt)) {
1122 			/*
1123 			 * Turn off the defense mode if we
1124 			 * believe the SYN attack is over.
1125 			 */
1126 			listener->tcp_syn_defense = B_FALSE;
1127 			if (listener->tcp_ip_addr_cache) {
1128 				kmem_free((void *)listener->tcp_ip_addr_cache,
1129 				    IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
1130 				listener->tcp_ip_addr_cache = NULL;
1131 			}
1132 		}
1133 	}
1134 	addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache);
1135 	if (addr_cache != NULL) {
1136 		/*
1137 		 * We have finished a 3-way handshake with this
1138 		 * remote host. This proves the IP addr is good.
1139 		 * Cache it!
1140 		 */
1141 		addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] =
1142 		    tcp->tcp_connp->conn_faddr_v4;
1143 	}
1144 	mutex_exit(&listener->tcp_eager_lock);
1145 
1146 	/*
1147 	 * Notify the ULP about the newconn. It is guaranteed that no
1148 	 * tcp_accept() call will be made for the eager if the
1149 	 * notification fails.
1150 	 */
1151 	if ((upper = (*lconnp->conn_upcalls->su_newconn)
1152 	    (lconnp->conn_upper_handle, (sock_lower_handle_t)econnp,
1153 	    &sock_tcp_downcalls, ira->ira_cred, ira->ira_cpid,
1154 	    &econnp->conn_upcalls)) == NULL) {
1155 		/*
1156 		 * Normally this should not happen, but the listener might
1157 		 * have done a fallback to TPI followed by a close(), in
1158 		 * which case tcp_closemp for this conn might have been
1159 		 * used by tcp_eager_cleanup().
1160 		 */
1161 		mutex_enter(&listener->tcp_eager_lock);
1162 		if (tcp->tcp_closemp_used) {
1163 			mutex_exit(&listener->tcp_eager_lock);
1164 			return (B_FALSE);
1165 		}
1166 		tcp->tcp_closemp_used = B_TRUE;
1167 		TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
1168 		mp = &tcp->tcp_closemp;
1169 		mutex_exit(&listener->tcp_eager_lock);
1170 		tcp_eager_kill(econnp, mp, NULL, NULL);
1171 		return (B_FALSE);
1172 	}
1173 	econnp->conn_upper_handle = upper;
1174 
1175 	tcp->tcp_detached = B_FALSE;
1176 	tcp->tcp_hard_binding = B_FALSE;
1177 	tcp->tcp_tconnind_started = B_TRUE;
1178 
1179 	if (econnp->conn_keepalive) {
1180 		tcp->tcp_ka_last_intrvl = 0;
1181 		tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer,
1182 		    tcp->tcp_ka_interval);
1183 	}
1184 
1185 	/* Update the necessary parameters */
1186 	tcp_get_proto_props(tcp, &sopp);
1187 
1188 	(*econnp->conn_upcalls->su_set_proto_props)
1189 	    (econnp->conn_upper_handle, &sopp);
1190 
1191 	return (B_TRUE);
1192 }
1193