1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 /* This file contains all TCP kernel socket related functions. */
27
28 #include <sys/types.h>
29 #include <sys/strlog.h>
30 #include <sys/policy.h>
31 #include <sys/sockio.h>
32 #include <sys/strsubr.h>
33 #include <sys/strsun.h>
34 #include <sys/squeue_impl.h>
35 #include <sys/squeue.h>
36 #define _SUN_TPI_VERSION 2
37 #include <sys/tihdr.h>
38 #include <sys/timod.h>
39 #include <sys/tpicommon.h>
40 #include <sys/socketvar.h>
41
42 #include <inet/common.h>
43 #include <inet/proto_set.h>
44 #include <inet/ip.h>
45 #include <inet/tcp.h>
46 #include <inet/tcp_impl.h>
47
48 static void tcp_activate(sock_lower_handle_t, sock_upper_handle_t,
49 sock_upcalls_t *, int, cred_t *);
50 static int tcp_accept(sock_lower_handle_t, sock_lower_handle_t,
51 sock_upper_handle_t, cred_t *);
52 static int tcp_bind(sock_lower_handle_t, struct sockaddr *,
53 socklen_t, cred_t *);
54 static int tcp_listen(sock_lower_handle_t, int, cred_t *);
55 static int tcp_connect(sock_lower_handle_t, const struct sockaddr *,
56 socklen_t, sock_connid_t *, cred_t *);
57 static int tcp_getpeername(sock_lower_handle_t, struct sockaddr *,
58 socklen_t *, cred_t *);
59 static int tcp_getsockname(sock_lower_handle_t, struct sockaddr *,
60 socklen_t *, cred_t *);
61 static int tcp_getsockopt(sock_lower_handle_t, int, int, void *,
62 socklen_t *, cred_t *);
63 static int tcp_setsockopt(sock_lower_handle_t, int, int, const void *,
64 socklen_t, cred_t *);
65 static int tcp_sendmsg(sock_lower_handle_t, mblk_t *, struct nmsghdr *,
66 cred_t *);
67 static int tcp_shutdown(sock_lower_handle_t, int, cred_t *);
68 static void tcp_clr_flowctrl(sock_lower_handle_t);
69 static int tcp_ioctl(sock_lower_handle_t, int, intptr_t, int, int32_t *,
70 cred_t *);
71 static int tcp_close(sock_lower_handle_t, int, cred_t *);
72
/*
 * Downcall vector for TCP sockets.  tcp_create() returns the address of this
 * table to its caller through the sock_downcalls out-parameter.
 *
 * The initializer is positional, so the order of the entries has to match
 * the member order of sock_downcalls_t.
 */
sock_downcalls_t sock_tcp_downcalls = {
	tcp_activate,
	tcp_accept,
	tcp_bind,
	tcp_listen,
	tcp_connect,
	tcp_getpeername,
	tcp_getsockname,
	tcp_getsockopt,
	tcp_setsockopt,
	tcp_sendmsg,
	/*
	 * NOTE(review): the next three slots have no TCP handler; confirm
	 * which downcalls they are against the sock_downcalls_t definition.
	 */
	NULL,
	NULL,
	NULL,
	tcp_shutdown,
	tcp_clr_flowctrl,
	tcp_ioctl,
	tcp_close,
};
92
93 /* ARGSUSED */
94 static void
tcp_activate(sock_lower_handle_t proto_handle,sock_upper_handle_t sock_handle,sock_upcalls_t * sock_upcalls,int flags,cred_t * cr)95 tcp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle,
96 sock_upcalls_t *sock_upcalls, int flags, cred_t *cr)
97 {
98 conn_t *connp = (conn_t *)proto_handle;
99 struct sock_proto_props sopp;
100 extern struct module_info tcp_rinfo;
101
102 ASSERT(connp->conn_upper_handle == NULL);
103
104 /* All Solaris components should pass a cred for this operation. */
105 ASSERT(cr != NULL);
106
107 sopp.sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT |
108 SOCKOPT_MAXPSZ | SOCKOPT_MAXBLK | SOCKOPT_RCVTIMER |
109 SOCKOPT_RCVTHRESH | SOCKOPT_MAXADDRLEN | SOCKOPT_MINPSZ;
110
111 sopp.sopp_rxhiwat = SOCKET_RECVHIWATER;
112 sopp.sopp_rxlowat = SOCKET_RECVLOWATER;
113 sopp.sopp_maxpsz = INFPSZ;
114 sopp.sopp_maxblk = INFPSZ;
115 sopp.sopp_rcvtimer = SOCKET_TIMER_INTERVAL;
116 sopp.sopp_rcvthresh = SOCKET_RECVHIWATER >> 3;
117 sopp.sopp_maxaddrlen = sizeof (sin6_t);
118 sopp.sopp_minpsz = (tcp_rinfo.mi_minpsz == 1) ? 0 :
119 tcp_rinfo.mi_minpsz;
120
121 connp->conn_upcalls = sock_upcalls;
122 connp->conn_upper_handle = sock_handle;
123
124 ASSERT(connp->conn_rcvbuf != 0 &&
125 connp->conn_rcvbuf == connp->conn_tcp->tcp_rwnd);
126 (*sock_upcalls->su_set_proto_props)(sock_handle, &sopp);
127 }
128
129 /*ARGSUSED*/
130 static int
tcp_accept(sock_lower_handle_t lproto_handle,sock_lower_handle_t eproto_handle,sock_upper_handle_t sock_handle,cred_t * cr)131 tcp_accept(sock_lower_handle_t lproto_handle,
132 sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle,
133 cred_t *cr)
134 {
135 conn_t *lconnp, *econnp;
136 tcp_t *listener, *eager;
137
138 /*
139 * KSSL can move a socket from one listener to another, in which
140 * case `lproto_handle' points to the new listener. To ensure that
141 * the original listener is used the information is obtained from
142 * the eager.
143 */
144 econnp = (conn_t *)eproto_handle;
145 eager = econnp->conn_tcp;
146 ASSERT(IPCL_IS_NONSTR(econnp));
147 ASSERT(eager->tcp_listener != NULL);
148 listener = eager->tcp_listener;
149 lconnp = (conn_t *)listener->tcp_connp;
150 ASSERT(listener->tcp_state == TCPS_LISTEN);
151 ASSERT(lconnp->conn_upper_handle != NULL);
152
153 /*
154 * It is possible for the accept thread to race with the thread that
155 * made the su_newconn upcall in tcp_newconn_notify. Both
156 * tcp_newconn_notify and tcp_accept require that conn_upper_handle
157 * and conn_upcalls be set before returning, so they both write to
158 * them. However, we're guaranteed that the value written is the same
159 * for both threads.
160 */
161 ASSERT(econnp->conn_upper_handle == NULL ||
162 econnp->conn_upper_handle == sock_handle);
163 ASSERT(econnp->conn_upcalls == NULL ||
164 econnp->conn_upcalls == lconnp->conn_upcalls);
165 econnp->conn_upper_handle = sock_handle;
166 econnp->conn_upcalls = lconnp->conn_upcalls;
167
168 ASSERT(econnp->conn_netstack ==
169 listener->tcp_connp->conn_netstack);
170 ASSERT(eager->tcp_tcps == listener->tcp_tcps);
171
172 /*
173 * We should have a minimum of 2 references on the conn at this
174 * point. One for TCP and one for the newconn notification
175 * (which is now taken over by IP). In the normal case we would
176 * also have another reference (making a total of 3) for the conn
177 * being in the classifier hash list. However the eager could have
178 * received an RST subsequently and tcp_closei_local could have
179 * removed the eager from the classifier hash list, hence we can't
180 * assert that reference.
181 */
182 ASSERT(econnp->conn_ref >= 2);
183
184 mutex_enter(&listener->tcp_eager_lock);
185 /*
186 * Non-STREAMS listeners never defer the notification of new
187 * connections.
188 */
189 ASSERT(!listener->tcp_eager_prev_q0->tcp_conn_def_q0);
190 tcp_eager_unlink(eager);
191 mutex_exit(&listener->tcp_eager_lock);
192 CONN_DEC_REF(listener->tcp_connp);
193
194 return ((eager->tcp_state < TCPS_ESTABLISHED) ? ECONNABORTED : 0);
195 }
196
197 static int
tcp_bind(sock_lower_handle_t proto_handle,struct sockaddr * sa,socklen_t len,cred_t * cr)198 tcp_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
199 socklen_t len, cred_t *cr)
200 {
201 int error;
202 conn_t *connp = (conn_t *)proto_handle;
203
204 /* All Solaris components should pass a cred for this operation. */
205 ASSERT(cr != NULL);
206 ASSERT(connp->conn_upper_handle != NULL);
207
208 error = squeue_synch_enter(connp, NULL);
209 if (error != 0) {
210 /* failed to enter */
211 return (ENOSR);
212 }
213
214 /* binding to a NULL address really means unbind */
215 if (sa == NULL) {
216 if (connp->conn_tcp->tcp_state < TCPS_LISTEN)
217 error = tcp_do_unbind(connp);
218 else
219 error = EINVAL;
220 } else {
221 error = tcp_do_bind(connp, sa, len, cr, B_TRUE);
222 }
223
224 squeue_synch_exit(connp);
225
226 if (error < 0) {
227 if (error == -TOUTSTATE)
228 error = EINVAL;
229 else
230 error = proto_tlitosyserr(-error);
231 }
232
233 return (error);
234 }
235
236 /* ARGSUSED */
237 static int
tcp_listen(sock_lower_handle_t proto_handle,int backlog,cred_t * cr)238 tcp_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr)
239 {
240 conn_t *connp = (conn_t *)proto_handle;
241 tcp_t *tcp = connp->conn_tcp;
242 int error;
243
244 ASSERT(connp->conn_upper_handle != NULL);
245
246 /* All Solaris components should pass a cred for this operation. */
247 ASSERT(cr != NULL);
248
249 error = squeue_synch_enter(connp, NULL);
250 if (error != 0) {
251 /* failed to enter */
252 return (ENOBUFS);
253 }
254
255 error = tcp_do_listen(connp, NULL, 0, backlog, cr, B_FALSE);
256 if (error == 0) {
257 /*
258 * sockfs needs to know what's the maximum number of socket
259 * that can be queued on the listener.
260 */
261 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
262 SOCK_OPCTL_ENAB_ACCEPT,
263 (uintptr_t)(tcp->tcp_conn_req_max +
264 tcp->tcp_tcps->tcps_conn_req_max_q0));
265 } else if (error < 0) {
266 if (error == -TOUTSTATE)
267 error = EINVAL;
268 else
269 error = proto_tlitosyserr(-error);
270 }
271 squeue_synch_exit(connp);
272 return (error);
273 }
274
275 static int
tcp_connect(sock_lower_handle_t proto_handle,const struct sockaddr * sa,socklen_t len,sock_connid_t * id,cred_t * cr)276 tcp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
277 socklen_t len, sock_connid_t *id, cred_t *cr)
278 {
279 conn_t *connp = (conn_t *)proto_handle;
280 int error;
281
282 ASSERT(connp->conn_upper_handle != NULL);
283
284 /* All Solaris components should pass a cred for this operation. */
285 ASSERT(cr != NULL);
286
287 error = proto_verify_ip_addr(connp->conn_family, sa, len);
288 if (error != 0) {
289 return (error);
290 }
291
292 error = squeue_synch_enter(connp, NULL);
293 if (error != 0) {
294 /* failed to enter */
295 return (ENOSR);
296 }
297
298 /*
299 * TCP supports quick connect, so no need to do an implicit bind
300 */
301 error = tcp_do_connect(connp, sa, len, cr, curproc->p_pid);
302 if (error == 0) {
303 *id = connp->conn_tcp->tcp_connid;
304 } else if (error < 0) {
305 if (error == -TOUTSTATE) {
306 switch (connp->conn_tcp->tcp_state) {
307 case TCPS_SYN_SENT:
308 error = EALREADY;
309 break;
310 case TCPS_ESTABLISHED:
311 error = EISCONN;
312 break;
313 case TCPS_LISTEN:
314 error = EOPNOTSUPP;
315 break;
316 default:
317 error = EINVAL;
318 break;
319 }
320 } else {
321 error = proto_tlitosyserr(-error);
322 }
323 }
324
325 if (connp->conn_tcp->tcp_loopback) {
326 struct sock_proto_props sopp;
327
328 sopp.sopp_flags = SOCKOPT_LOOPBACK;
329 sopp.sopp_loopback = B_TRUE;
330
331 (*connp->conn_upcalls->su_set_proto_props)(
332 connp->conn_upper_handle, &sopp);
333 }
334 done:
335 squeue_synch_exit(connp);
336
337 return ((error == 0) ? EINPROGRESS : error);
338 }
339
340 /* ARGSUSED3 */
341 static int
tcp_getpeername(sock_lower_handle_t proto_handle,struct sockaddr * addr,socklen_t * addrlenp,cred_t * cr)342 tcp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr,
343 socklen_t *addrlenp, cred_t *cr)
344 {
345 conn_t *connp = (conn_t *)proto_handle;
346 tcp_t *tcp = connp->conn_tcp;
347
348 /* All Solaris components should pass a cred for this operation. */
349 ASSERT(cr != NULL);
350
351 ASSERT(tcp != NULL);
352 if (tcp->tcp_state < TCPS_SYN_RCVD)
353 return (ENOTCONN);
354
355 return (conn_getpeername(connp, addr, addrlenp));
356 }
357
358 /* ARGSUSED3 */
359 static int
tcp_getsockname(sock_lower_handle_t proto_handle,struct sockaddr * addr,socklen_t * addrlenp,cred_t * cr)360 tcp_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *addr,
361 socklen_t *addrlenp, cred_t *cr)
362 {
363 conn_t *connp = (conn_t *)proto_handle;
364
365 /* All Solaris components should pass a cred for this operation. */
366 ASSERT(cr != NULL);
367
368 return (conn_getsockname(connp, addr, addrlenp));
369 }
370
371 /* returns UNIX error, the optlen is a value-result arg */
372 static int
tcp_getsockopt(sock_lower_handle_t proto_handle,int level,int option_name,void * optvalp,socklen_t * optlen,cred_t * cr)373 tcp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
374 void *optvalp, socklen_t *optlen, cred_t *cr)
375 {
376 conn_t *connp = (conn_t *)proto_handle;
377 int error;
378 t_uscalar_t max_optbuf_len;
379 void *optvalp_buf;
380 int len;
381
382 ASSERT(connp->conn_upper_handle != NULL);
383
384 error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len,
385 tcp_opt_obj.odb_opt_des_arr,
386 tcp_opt_obj.odb_opt_arr_cnt,
387 B_FALSE, B_TRUE, cr);
388 if (error != 0) {
389 if (error < 0) {
390 error = proto_tlitosyserr(-error);
391 }
392 return (error);
393 }
394
395 optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP);
396
397 error = squeue_synch_enter(connp, NULL);
398 if (error == ENOMEM) {
399 kmem_free(optvalp_buf, max_optbuf_len);
400 return (ENOMEM);
401 }
402
403 len = tcp_opt_get(connp, level, option_name, optvalp_buf);
404 squeue_synch_exit(connp);
405
406 if (len == -1) {
407 kmem_free(optvalp_buf, max_optbuf_len);
408 return (EINVAL);
409 }
410
411 /*
412 * update optlen and copy option value
413 */
414 t_uscalar_t size = MIN(len, *optlen);
415
416 bcopy(optvalp_buf, optvalp, size);
417 bcopy(&size, optlen, sizeof (size));
418
419 kmem_free(optvalp_buf, max_optbuf_len);
420 return (0);
421 }
422
423 static int
tcp_setsockopt(sock_lower_handle_t proto_handle,int level,int option_name,const void * optvalp,socklen_t optlen,cred_t * cr)424 tcp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
425 const void *optvalp, socklen_t optlen, cred_t *cr)
426 {
427 conn_t *connp = (conn_t *)proto_handle;
428 int error;
429
430 ASSERT(connp->conn_upper_handle != NULL);
431 /*
432 * Entering the squeue synchronously can result in a context switch,
433 * which can cause a rather sever performance degradation. So we try to
434 * handle whatever options we can without entering the squeue.
435 */
436 if (level == IPPROTO_TCP) {
437 switch (option_name) {
438 case TCP_NODELAY:
439 if (optlen != sizeof (int32_t))
440 return (EINVAL);
441 mutex_enter(&connp->conn_tcp->tcp_non_sq_lock);
442 connp->conn_tcp->tcp_naglim = *(int *)optvalp ? 1 :
443 connp->conn_tcp->tcp_mss;
444 mutex_exit(&connp->conn_tcp->tcp_non_sq_lock);
445 return (0);
446 default:
447 break;
448 }
449 }
450
451 error = squeue_synch_enter(connp, NULL);
452 if (error == ENOMEM) {
453 return (ENOMEM);
454 }
455
456 error = proto_opt_check(level, option_name, optlen, NULL,
457 tcp_opt_obj.odb_opt_des_arr,
458 tcp_opt_obj.odb_opt_arr_cnt,
459 B_TRUE, B_FALSE, cr);
460
461 if (error != 0) {
462 if (error < 0) {
463 error = proto_tlitosyserr(-error);
464 }
465 squeue_synch_exit(connp);
466 return (error);
467 }
468
469 error = tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name,
470 optlen, (uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp,
471 NULL, cr);
472 squeue_synch_exit(connp);
473
474 ASSERT(error >= 0);
475
476 return (error);
477 }
478
479 /* ARGSUSED */
480 static int
tcp_sendmsg(sock_lower_handle_t proto_handle,mblk_t * mp,struct nmsghdr * msg,cred_t * cr)481 tcp_sendmsg(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
482 cred_t *cr)
483 {
484 tcp_t *tcp;
485 uint32_t msize;
486 conn_t *connp = (conn_t *)proto_handle;
487 int32_t tcpstate;
488
489 /* All Solaris components should pass a cred for this operation. */
490 ASSERT(cr != NULL);
491
492 ASSERT(connp->conn_ref >= 2);
493 ASSERT(connp->conn_upper_handle != NULL);
494
495 if (msg->msg_controllen != 0) {
496 freemsg(mp);
497 return (EOPNOTSUPP);
498 }
499
500 switch (DB_TYPE(mp)) {
501 case M_DATA:
502 tcp = connp->conn_tcp;
503 ASSERT(tcp != NULL);
504
505 tcpstate = tcp->tcp_state;
506 if (tcpstate < TCPS_ESTABLISHED) {
507 freemsg(mp);
508 /*
509 * We return ENOTCONN if the endpoint is trying to
510 * connect or has never been connected, and EPIPE if it
511 * has been disconnected. The connection id helps us
512 * distinguish between the last two cases.
513 */
514 return ((tcpstate == TCPS_SYN_SENT) ? ENOTCONN :
515 ((tcp->tcp_connid > 0) ? EPIPE : ENOTCONN));
516 } else if (tcpstate > TCPS_CLOSE_WAIT) {
517 freemsg(mp);
518 return (EPIPE);
519 }
520
521 msize = msgdsize(mp);
522
523 mutex_enter(&tcp->tcp_non_sq_lock);
524 tcp->tcp_squeue_bytes += msize;
525 /*
526 * Squeue Flow Control
527 */
528 if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) {
529 tcp_setqfull(tcp);
530 }
531 mutex_exit(&tcp->tcp_non_sq_lock);
532
533 /*
534 * The application may pass in an address in the msghdr, but
535 * we ignore the address on connection-oriented sockets.
536 * Just like BSD this code does not generate an error for
537 * TCP (a CONNREQUIRED socket) when sending to an address
538 * passed in with sendto/sendmsg. Instead the data is
539 * delivered on the connection as if no address had been
540 * supplied.
541 */
542 CONN_INC_REF(connp);
543
544 if (msg->msg_flags & MSG_OOB) {
545 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output_urgent,
546 connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
547 } else {
548 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output,
549 connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
550 }
551
552 return (0);
553
554 default:
555 ASSERT(0);
556 }
557
558 freemsg(mp);
559 return (0);
560 }
561
562 /* ARGSUSED */
563 static int
tcp_shutdown(sock_lower_handle_t proto_handle,int how,cred_t * cr)564 tcp_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
565 {
566 conn_t *connp = (conn_t *)proto_handle;
567 tcp_t *tcp = connp->conn_tcp;
568
569 ASSERT(connp->conn_upper_handle != NULL);
570
571 /* All Solaris components should pass a cred for this operation. */
572 ASSERT(cr != NULL);
573
574 /*
575 * X/Open requires that we check the connected state.
576 */
577 if (tcp->tcp_state < TCPS_SYN_SENT)
578 return (ENOTCONN);
579
580 /* shutdown the send side */
581 if (how != SHUT_RD) {
582 mblk_t *bp;
583
584 bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL);
585 CONN_INC_REF(connp);
586 SQUEUE_ENTER_ONE(connp->conn_sqp, bp, tcp_shutdown_output,
587 connp, NULL, SQ_NODRAIN, SQTAG_TCP_SHUTDOWN_OUTPUT);
588
589 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
590 SOCK_OPCTL_SHUT_SEND, 0);
591 }
592
593 /* shutdown the recv side */
594 if (how != SHUT_WR)
595 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
596 SOCK_OPCTL_SHUT_RECV, 0);
597
598 return (0);
599 }
600
601 static void
tcp_clr_flowctrl(sock_lower_handle_t proto_handle)602 tcp_clr_flowctrl(sock_lower_handle_t proto_handle)
603 {
604 conn_t *connp = (conn_t *)proto_handle;
605 tcp_t *tcp = connp->conn_tcp;
606 mblk_t *mp;
607 int error;
608
609 ASSERT(connp->conn_upper_handle != NULL);
610
611 /*
612 * If tcp->tcp_rsrv_mp == NULL, it means that tcp_clr_flowctrl()
613 * is currently running.
614 */
615 mutex_enter(&tcp->tcp_rsrv_mp_lock);
616 if ((mp = tcp->tcp_rsrv_mp) == NULL) {
617 mutex_exit(&tcp->tcp_rsrv_mp_lock);
618 return;
619 }
620 tcp->tcp_rsrv_mp = NULL;
621 mutex_exit(&tcp->tcp_rsrv_mp_lock);
622
623 error = squeue_synch_enter(connp, mp);
624 ASSERT(error == 0);
625
626 mutex_enter(&tcp->tcp_rsrv_mp_lock);
627 tcp->tcp_rsrv_mp = mp;
628 mutex_exit(&tcp->tcp_rsrv_mp_lock);
629
630 if (tcp->tcp_fused) {
631 tcp_fuse_backenable(tcp);
632 } else {
633 tcp->tcp_rwnd = connp->conn_rcvbuf;
634 /*
635 * Send back a window update immediately if TCP is above
636 * ESTABLISHED state and the increase of the rcv window
637 * that the other side knows is at least 1 MSS after flow
638 * control is lifted.
639 */
640 if (tcp->tcp_state >= TCPS_ESTABLISHED &&
641 tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) {
642 tcp_xmit_ctl(NULL, tcp,
643 (tcp->tcp_swnd == 0) ? tcp->tcp_suna :
644 tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK);
645 }
646 }
647
648 squeue_synch_exit(connp);
649 }
650
651 /* ARGSUSED */
652 static int
tcp_ioctl(sock_lower_handle_t proto_handle,int cmd,intptr_t arg,int mode,int32_t * rvalp,cred_t * cr)653 tcp_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
654 int mode, int32_t *rvalp, cred_t *cr)
655 {
656 conn_t *connp = (conn_t *)proto_handle;
657 int error;
658
659 ASSERT(connp->conn_upper_handle != NULL);
660
661 /* All Solaris components should pass a cred for this operation. */
662 ASSERT(cr != NULL);
663
664 /*
665 * If we don't have a helper stream then create one.
666 * ip_create_helper_stream takes care of locking the conn_t,
667 * so this check for NULL is just a performance optimization.
668 */
669 if (connp->conn_helper_info == NULL) {
670 tcp_stack_t *tcps = connp->conn_tcp->tcp_tcps;
671
672 /*
673 * Create a helper stream for non-STREAMS socket.
674 */
675 error = ip_create_helper_stream(connp, tcps->tcps_ldi_ident);
676 if (error != 0) {
677 ip0dbg(("tcp_ioctl: create of IP helper stream "
678 "failed %d\n", error));
679 return (error);
680 }
681 }
682
683 switch (cmd) {
684 case ND_SET:
685 case ND_GET:
686 case _SIOCSOCKFALLBACK:
687 case TCP_IOC_ABORT_CONN:
688 case TI_GETPEERNAME:
689 case TI_GETMYNAME:
690 ip1dbg(("tcp_ioctl: cmd 0x%x on non streams socket",
691 cmd));
692 error = EINVAL;
693 break;
694 default:
695 /*
696 * If the conn is not closing, pass on to IP using
697 * helper stream. Bump the ioctlref to prevent tcp_close
698 * from closing the rq/wq out from underneath the ioctl
699 * if it ends up queued or aborted/interrupted.
700 */
701 mutex_enter(&connp->conn_lock);
702 if (connp->conn_state_flags & (CONN_CLOSING)) {
703 mutex_exit(&connp->conn_lock);
704 error = EINVAL;
705 break;
706 }
707 CONN_INC_IOCTLREF_LOCKED(connp);
708 error = ldi_ioctl(connp->conn_helper_info->iphs_handle,
709 cmd, arg, mode, cr, rvalp);
710 CONN_DEC_IOCTLREF(connp);
711 break;
712 }
713 return (error);
714 }
715
716 /* ARGSUSED */
717 static int
tcp_close(sock_lower_handle_t proto_handle,int flags,cred_t * cr)718 tcp_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
719 {
720 conn_t *connp = (conn_t *)proto_handle;
721
722 ASSERT(connp->conn_upper_handle != NULL);
723
724 /* All Solaris components should pass a cred for this operation. */
725 ASSERT(cr != NULL);
726
727 tcp_close_common(connp, flags);
728
729 ip_free_helper_stream(connp);
730
731 /*
732 * Drop IP's reference on the conn. This is the last reference
733 * on the connp if the state was less than established. If the
734 * connection has gone into timewait state, then we will have
735 * one ref for the TCP and one more ref (total of two) for the
736 * classifier connected hash list (a timewait connections stays
737 * in connected hash till closed).
738 *
739 * We can't assert the references because there might be other
740 * transient reference places because of some walkers or queued
741 * packets in squeue for the timewait state.
742 */
743 CONN_DEC_REF(connp);
744
745 /*
746 * EINPROGRESS tells sockfs to wait for a 'closed' upcall before
747 * freeing the socket.
748 */
749 return (EINPROGRESS);
750 }
751
752 /* ARGSUSED */
753 sock_lower_handle_t
tcp_create(int family,int type,int proto,sock_downcalls_t ** sock_downcalls,uint_t * smodep,int * errorp,int flags,cred_t * credp)754 tcp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
755 uint_t *smodep, int *errorp, int flags, cred_t *credp)
756 {
757 conn_t *connp;
758 boolean_t isv6 = family == AF_INET6;
759
760 if (type != SOCK_STREAM || (family != AF_INET && family != AF_INET6) ||
761 (proto != 0 && proto != IPPROTO_TCP)) {
762 *errorp = EPROTONOSUPPORT;
763 return (NULL);
764 }
765
766 connp = tcp_create_common(credp, isv6, B_TRUE, errorp);
767 if (connp == NULL) {
768 return (NULL);
769 }
770
771 /*
772 * Put the ref for TCP. Ref for IP was already put
773 * by ipcl_conn_create. Also make the conn_t globally
774 * visible to walkers
775 */
776 mutex_enter(&connp->conn_lock);
777 CONN_INC_REF_LOCKED(connp);
778 ASSERT(connp->conn_ref == 2);
779 connp->conn_state_flags &= ~CONN_INCIPIENT;
780
781 connp->conn_flags |= IPCL_NONSTR;
782 mutex_exit(&connp->conn_lock);
783
784 ASSERT(errorp != NULL);
785 *errorp = 0;
786 *sock_downcalls = &sock_tcp_downcalls;
787 *smodep = SM_CONNREQUIRED | SM_EXDATA | SM_ACCEPTSUPP |
788 SM_SENDFILESUPP;
789
790 return ((sock_lower_handle_t)connp);
791 }
792
793 /*
794 * tcp_fallback
795 *
796 * A direct socket is falling back to using STREAMS. The queue
797 * that is being passed down was created using tcp_open() with
798 * the SO_FALLBACK flag set. As a result, the queue is not
799 * associated with a conn, and the q_ptrs instead contain the
800 * dev and minor area that should be used.
801 *
802 * The 'issocket' flag indicates whether the FireEngine
803 * optimizations should be used. The common case would be that
804 * optimizations are enabled, and they might be subsequently
805 * disabled using the _SIOCSOCKFALLBACK ioctl.
806 */
807
808 /*
809 * An active connection is falling back to TPI. Gather all the information
810 * required by the STREAM head and TPI sonode and send it up.
811 */
812 static void
tcp_fallback_noneager(tcp_t * tcp,mblk_t * stropt_mp,queue_t * q,boolean_t issocket,so_proto_quiesced_cb_t quiesced_cb,sock_quiesce_arg_t * arg)813 tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q,
814 boolean_t issocket, so_proto_quiesced_cb_t quiesced_cb,
815 sock_quiesce_arg_t *arg)
816 {
817 conn_t *connp = tcp->tcp_connp;
818 struct stroptions *stropt;
819 struct T_capability_ack tca;
820 struct sockaddr_in6 laddr, faddr;
821 socklen_t laddrlen, faddrlen;
822 short opts;
823 int error;
824 mblk_t *mp, *mpnext;
825
826 connp->conn_dev = (dev_t)RD(q)->q_ptr;
827 connp->conn_minor_arena = WR(q)->q_ptr;
828
829 RD(q)->q_ptr = WR(q)->q_ptr = connp;
830
831 connp->conn_rq = RD(q);
832 connp->conn_wq = WR(q);
833
834 WR(q)->q_qinfo = &tcp_sock_winit;
835
836 if (!issocket)
837 tcp_use_pure_tpi(tcp);
838
839 /*
840 * free the helper stream
841 */
842 ip_free_helper_stream(connp);
843
844 /*
845 * Notify the STREAM head about options
846 */
847 DB_TYPE(stropt_mp) = M_SETOPTS;
848 stropt = (struct stroptions *)stropt_mp->b_rptr;
849 stropt_mp->b_wptr += sizeof (struct stroptions);
850 stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK;
851
852 stropt->so_wroff = connp->conn_ht_iphc_len + (tcp->tcp_loopback ? 0 :
853 tcp->tcp_tcps->tcps_wroff_xtra);
854 if (tcp->tcp_snd_sack_ok)
855 stropt->so_wroff += TCPOPT_MAX_SACK_LEN;
856 stropt->so_hiwat = connp->conn_rcvbuf;
857 stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE);
858
859 putnext(RD(q), stropt_mp);
860
861 /*
862 * Collect the information needed to sync with the sonode
863 */
864 tcp_do_capability_ack(tcp, &tca, TC1_INFO|TC1_ACCEPTOR_ID);
865
866 laddrlen = faddrlen = sizeof (sin6_t);
867 (void) tcp_getsockname((sock_lower_handle_t)connp,
868 (struct sockaddr *)&laddr, &laddrlen, CRED());
869 error = tcp_getpeername((sock_lower_handle_t)connp,
870 (struct sockaddr *)&faddr, &faddrlen, CRED());
871 if (error != 0)
872 faddrlen = 0;
873
874 opts = 0;
875 if (connp->conn_oobinline)
876 opts |= SO_OOBINLINE;
877 if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
878 opts |= SO_DONTROUTE;
879
880 /*
881 * Notify the socket that the protocol is now quiescent,
882 * and it's therefore safe move data from the socket
883 * to the stream head.
884 */
885 mp = (*quiesced_cb)(connp->conn_upper_handle, arg, &tca,
886 (struct sockaddr *)&laddr, laddrlen,
887 (struct sockaddr *)&faddr, faddrlen, opts);
888
889 while (mp != NULL) {
890 mpnext = mp->b_next;
891 tcp->tcp_rcv_list = mp->b_next;
892 mp->b_next = NULL;
893 putnext(q, mp);
894 mp = mpnext;
895 }
896 ASSERT(tcp->tcp_rcv_last_head == NULL);
897 ASSERT(tcp->tcp_rcv_last_tail == NULL);
898 ASSERT(tcp->tcp_rcv_cnt == 0);
899
900 /*
901 * All eagers in q0 are marked as being non-STREAM, so they will
902 * make su_newconn upcalls when the handshake completes, which
903 * will fail (resulting in the conn being closed). So we just blow
904 * off everything in q0 instead of waiting for the inevitable.
905 */
906 if (tcp->tcp_conn_req_cnt_q0 != 0)
907 tcp_eager_cleanup(tcp, B_TRUE);
908 }
909
910 /*
911 * An eager is falling back to TPI. All we have to do is send
912 * up a T_CONN_IND.
913 */
914 static void
tcp_fallback_eager(tcp_t * eager,boolean_t issocket,so_proto_quiesced_cb_t quiesced_cb,sock_quiesce_arg_t * arg)915 tcp_fallback_eager(tcp_t *eager, boolean_t issocket,
916 so_proto_quiesced_cb_t quiesced_cb, sock_quiesce_arg_t *arg)
917 {
918 conn_t *connp = eager->tcp_connp;
919 tcp_t *listener = eager->tcp_listener;
920 mblk_t *mp;
921
922 ASSERT(listener != NULL);
923
924 /*
925 * Notify the socket that the protocol is now quiescent,
926 * and it's therefore safe move data from the socket
927 * to tcp's rcv queue.
928 */
929 mp = (*quiesced_cb)(connp->conn_upper_handle, arg, NULL, NULL, 0,
930 NULL, 0, 0);
931
932 if (mp != NULL) {
933 ASSERT(eager->tcp_rcv_cnt == 0);
934
935 eager->tcp_rcv_list = mp;
936 eager->tcp_rcv_cnt = msgdsize(mp);
937 while (mp->b_next != NULL) {
938 mp = mp->b_next;
939 eager->tcp_rcv_cnt += msgdsize(mp);
940 }
941 eager->tcp_rcv_last_head = mp;
942 while (mp->b_cont)
943 mp = mp->b_cont;
944 eager->tcp_rcv_last_tail = mp;
945 if (eager->tcp_rcv_cnt > eager->tcp_rwnd)
946 eager->tcp_rwnd = 0;
947 else
948 eager->tcp_rwnd -= eager->tcp_rcv_cnt;
949 }
950
951 if (!issocket)
952 eager->tcp_issocket = B_FALSE;
953 /*
954 * The stream for this eager does not yet exist, so mark it as
955 * being detached.
956 */
957 eager->tcp_detached = B_TRUE;
958 eager->tcp_hard_binding = B_TRUE;
959 connp->conn_rq = listener->tcp_connp->conn_rq;
960 connp->conn_wq = listener->tcp_connp->conn_wq;
961
962 /* Send up the connection indication */
963 mp = eager->tcp_conn.tcp_eager_conn_ind;
964 ASSERT(mp != NULL);
965 eager->tcp_conn.tcp_eager_conn_ind = NULL;
966
967 /*
968 * TLI/XTI applications will get confused by
969 * sending eager as an option since it violates
970 * the option semantics. So remove the eager as
971 * option since TLI/XTI app doesn't need it anyway.
972 */
973 if (!issocket) {
974 struct T_conn_ind *conn_ind;
975
976 conn_ind = (struct T_conn_ind *)mp->b_rptr;
977 conn_ind->OPT_length = 0;
978 conn_ind->OPT_offset = 0;
979 }
980
981 /*
982 * Sockfs guarantees that the listener will not be closed
983 * during fallback. So we can safely use the listener's queue.
984 */
985 putnext(listener->tcp_connp->conn_rq, mp);
986 }
987
988
989 int
tcp_fallback(sock_lower_handle_t proto_handle,queue_t * q,boolean_t direct_sockfs,so_proto_quiesced_cb_t quiesced_cb,sock_quiesce_arg_t * arg)990 tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
991 boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb,
992 sock_quiesce_arg_t *arg)
993 {
994 tcp_t *tcp;
995 conn_t *connp = (conn_t *)proto_handle;
996 int error;
997 mblk_t *stropt_mp;
998 mblk_t *ordrel_mp;
999
1000 tcp = connp->conn_tcp;
1001
1002 stropt_mp = allocb_wait(sizeof (struct stroptions), BPRI_HI, STR_NOSIG,
1003 NULL);
1004
1005 /* Pre-allocate the T_ordrel_ind mblk. */
1006 ASSERT(tcp->tcp_ordrel_mp == NULL);
1007 ordrel_mp = allocb_wait(sizeof (struct T_ordrel_ind), BPRI_HI,
1008 STR_NOSIG, NULL);
1009 ordrel_mp->b_datap->db_type = M_PROTO;
1010 ((struct T_ordrel_ind *)ordrel_mp->b_rptr)->PRIM_type = T_ORDREL_IND;
1011 ordrel_mp->b_wptr += sizeof (struct T_ordrel_ind);
1012
1013 /*
1014 * Enter the squeue so that no new packets can come in
1015 */
1016 error = squeue_synch_enter(connp, NULL);
1017 if (error != 0) {
1018 /* failed to enter, free all the pre-allocated messages. */
1019 freeb(stropt_mp);
1020 freeb(ordrel_mp);
1021 return (ENOMEM);
1022 }
1023
1024 /*
1025 * Both endpoints must be of the same type (either STREAMS or
1026 * non-STREAMS) for fusion to be enabled. So if we are fused,
1027 * we have to unfuse.
1028 */
1029 if (tcp->tcp_fused)
1030 tcp_unfuse(tcp);
1031
1032 if (tcp->tcp_listener != NULL) {
1033 /* The eager will deal with opts when accept() is called */
1034 freeb(stropt_mp);
1035 tcp_fallback_eager(tcp, direct_sockfs, quiesced_cb, arg);
1036 } else {
1037 tcp_fallback_noneager(tcp, stropt_mp, q, direct_sockfs,
1038 quiesced_cb, arg);
1039 }
1040
1041 /*
1042 * No longer a direct socket
1043 *
1044 * Note that we intentionally leave the upper_handle and upcalls
1045 * intact, since eagers may still be using them.
1046 */
1047 connp->conn_flags &= ~IPCL_NONSTR;
1048 tcp->tcp_ordrel_mp = ordrel_mp;
1049
1050 /*
1051 * There should be atleast two ref's (IP + TCP)
1052 */
1053 ASSERT(connp->conn_ref >= 2);
1054 squeue_synch_exit(connp);
1055
1056 return (0);
1057 }
1058
1059 /*
1060 * Notifies a non-STREAMS based listener about a new connection. This
1061 * function is executed on the *eager*'s squeue once the 3 way handshake
1062 * has completed. Note that the behavior differs from STREAMS, where the
1063 * T_CONN_IND is sent up by tcp_send_conn_ind() while on the *listener*'s
1064 * squeue.
1065 *
1066 * Returns B_TRUE if the notification succeeded and an upper handle was
1067 * obtained. `tcp' should be closed on failure.
1068 */
1069 boolean_t
tcp_newconn_notify(tcp_t * tcp,ip_recv_attr_t * ira)1070 tcp_newconn_notify(tcp_t *tcp, ip_recv_attr_t *ira)
1071 {
1072 tcp_t *listener = tcp->tcp_listener;
1073 conn_t *lconnp = listener->tcp_connp;
1074 conn_t *econnp = tcp->tcp_connp;
1075 tcp_t *tail;
1076 ipaddr_t *addr_cache;
1077 sock_upper_handle_t upper;
1078 struct sock_proto_props sopp;
1079
1080 mutex_enter(&listener->tcp_eager_lock);
1081 /*
1082 * Take the eager out, if it is in the list of droppable eagers
1083 * as we are here because the 3W handshake is over.
1084 */
1085 MAKE_UNDROPPABLE(tcp);
1086 /*
1087 * The eager already has an extra ref put in tcp_input_data
1088 * so that it stays till accept comes back even though it
1089 * might get into TCPS_CLOSED as a result of a TH_RST etc.
1090 */
1091 ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
1092 listener->tcp_conn_req_cnt_q0--;
1093 listener->tcp_conn_req_cnt_q++;
1094
1095 /* Move from SYN_RCVD to ESTABLISHED list */
1096 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = tcp->tcp_eager_prev_q0;
1097 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp->tcp_eager_next_q0;
1098 tcp->tcp_eager_prev_q0 = NULL;
1099 tcp->tcp_eager_next_q0 = NULL;
1100
1101 /*
1102 * Insert at end of the queue because connections are accepted
1103 * in chronological order. Leaving the older connections at front
1104 * of the queue helps reducing search time.
1105 */
1106 tail = listener->tcp_eager_last_q;
1107 if (tail != NULL)
1108 tail->tcp_eager_next_q = tcp;
1109 else
1110 listener->tcp_eager_next_q = tcp;
1111 listener->tcp_eager_last_q = tcp;
1112 tcp->tcp_eager_next_q = NULL;
1113
1114 /* we have timed out before */
1115 if (tcp->tcp_syn_rcvd_timeout != 0) {
1116 tcp->tcp_syn_rcvd_timeout = 0;
1117 listener->tcp_syn_rcvd_timeout--;
1118 if (listener->tcp_syn_defense &&
1119 listener->tcp_syn_rcvd_timeout <=
1120 (listener->tcp_tcps->tcps_conn_req_max_q0 >> 5) &&
1121 10*MINUTES < TICK_TO_MSEC(ddi_get_lbolt64() -
1122 listener->tcp_last_rcv_lbolt)) {
1123 /*
1124 * Turn off the defense mode if we
1125 * believe the SYN attack is over.
1126 */
1127 listener->tcp_syn_defense = B_FALSE;
1128 if (listener->tcp_ip_addr_cache) {
1129 kmem_free((void *)listener->tcp_ip_addr_cache,
1130 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
1131 listener->tcp_ip_addr_cache = NULL;
1132 }
1133 }
1134 }
1135 addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache);
1136 if (addr_cache != NULL) {
1137 /*
1138 * We have finished a 3-way handshake with this
1139 * remote host. This proves the IP addr is good.
1140 * Cache it!
1141 */
1142 addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] =
1143 tcp->tcp_connp->conn_faddr_v4;
1144 }
1145 mutex_exit(&listener->tcp_eager_lock);
1146
1147 /*
1148 * Notify the ULP about the newconn. It is guaranteed that no
1149 * tcp_accept() call will be made for the eager if the
1150 * notification fails.
1151 */
1152 if ((upper = (*lconnp->conn_upcalls->su_newconn)
1153 (lconnp->conn_upper_handle, (sock_lower_handle_t)econnp,
1154 &sock_tcp_downcalls, ira->ira_cred, ira->ira_cpid,
1155 &econnp->conn_upcalls)) == NULL) {
1156 return (B_FALSE);
1157 }
1158 econnp->conn_upper_handle = upper;
1159
1160 tcp->tcp_detached = B_FALSE;
1161 tcp->tcp_hard_binding = B_FALSE;
1162 tcp->tcp_tconnind_started = B_TRUE;
1163
1164 if (econnp->conn_keepalive) {
1165 tcp->tcp_ka_last_intrvl = 0;
1166 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer,
1167 tcp->tcp_ka_interval);
1168 }
1169
1170 /* Update the necessary parameters */
1171 tcp_get_proto_props(tcp, &sopp);
1172
1173 (*econnp->conn_upcalls->su_set_proto_props)
1174 (econnp->conn_upper_handle, &sopp);
1175
1176 return (B_TRUE);
1177 }
1178