1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 #include <sys/types.h>
27 #include <sys/param.h>
28 #include <sys/systm.h>
29 #include <sys/sysmacros.h>
30 #include <sys/debug.h>
31 #include <sys/cmn_err.h>
32 #include <sys/vfs.h>
33 #include <sys/policy.h>
34 #include <sys/modctl.h>
35
36 #include <sys/sunddi.h>
37
38 #include <sys/strsun.h>
39 #include <sys/stropts.h>
40 #include <sys/strsubr.h>
41 #include <sys/socket.h>
42 #include <sys/socketvar.h>
43 #include <sys/uio.h>
44
45 #include <inet/ipclassifier.h>
46 #include <fs/sockfs/sockcommon.h>
47 #include <fs/sockfs/sockfilter_impl.h>
48 #include <fs/sockfs/nl7c.h>
49 #include <fs/sockfs/socktpi.h>
50 #include <fs/sockfs/sodirect.h>
51 #include <inet/ip.h>
52
53 extern int xnet_skip_checks, xnet_check_print, xnet_truncate_print;
54
55 /*
56 * Common socket access functions.
57 *
58 * Instead of accessing the sonode switch directly (i.e., SOP_xxx()),
59 * the socket_xxx() function should be used.
60 */
61
62 /*
63 * Try to create a new sonode of the requested <family, type, protocol>.
64 */
65 /* ARGSUSED */
66 struct sonode *
socket_create(int family,int type,int protocol,char * devpath,char * mod,int flags,int version,struct cred * cr,int * errorp)67 socket_create(int family, int type, int protocol, char *devpath, char *mod,
68 int flags, int version, struct cred *cr, int *errorp)
69 {
70 struct sonode *so;
71 struct sockparams *sp = NULL;
72 int saved_error;
73
74 /*
75 * Look for a sockparams entry that match the given criteria.
76 * solookup() returns with the entry held.
77 */
78 *errorp = solookup(family, type, protocol, &sp);
79 saved_error = *errorp;
80 if (sp == NULL) {
81 int kmflags = (flags == SOCKET_SLEEP) ? KM_SLEEP : KM_NOSLEEP;
82 /*
83 * There is no matching sockparams entry. An ephemeral entry is
84 * created if the caller specifies a device or a socket module.
85 */
86 if (devpath != NULL) {
87 saved_error = 0;
88 sp = sockparams_hold_ephemeral_bydev(family, type,
89 protocol, devpath, kmflags, errorp);
90 } else if (mod != NULL) {
91 saved_error = 0;
92 sp = sockparams_hold_ephemeral_bymod(family, type,
93 protocol, mod, kmflags, errorp);
94 } else {
95 *errorp = solookup(family, type, 0, &sp);
96 }
97
98 if (sp == NULL) {
99 if (saved_error && (*errorp == EPROTONOSUPPORT ||
100 *errorp == EPROTOTYPE || *errorp == ENOPROTOOPT))
101 *errorp = saved_error;
102 return (NULL);
103 }
104 }
105
106 ASSERT(sp->sp_smod_info != NULL);
107 ASSERT(flags == SOCKET_SLEEP || flags == SOCKET_NOSLEEP);
108 sp->sp_stats.sps_ncreate.value.ui64++;
109 so = sp->sp_smod_info->smod_sock_create_func(sp, family, type,
110 protocol, version, flags, errorp, cr);
111 if (so == NULL) {
112 SOCKPARAMS_DEC_REF(sp);
113 } else {
114 if ((*errorp = SOP_INIT(so, NULL, cr, flags)) == 0) {
115 /* Cannot fail, only bumps so_count */
116 (void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, cr, NULL);
117 } else {
118 if (saved_error && (*errorp == EPROTONOSUPPORT ||
119 *errorp == EPROTOTYPE || *errorp == ENOPROTOOPT))
120 *errorp = saved_error;
121 socket_destroy(so);
122 so = NULL;
123 }
124 }
125 return (so);
126 }
127
128 struct sonode *
socket_newconn(struct sonode * parent,sock_lower_handle_t lh,sock_downcalls_t * dc,int flags,int * errorp)129 socket_newconn(struct sonode *parent, sock_lower_handle_t lh,
130 sock_downcalls_t *dc, int flags, int *errorp)
131 {
132 struct sonode *so;
133 struct sockparams *sp;
134 struct cred *cr;
135
136 if ((cr = CRED()) == NULL)
137 cr = kcred;
138
139 sp = parent->so_sockparams;
140 ASSERT(sp != NULL);
141
142 sp->sp_stats.sps_ncreate.value.ui64++;
143 so = sp->sp_smod_info->smod_sock_create_func(sp, parent->so_family,
144 parent->so_type, parent->so_protocol, parent->so_version, flags,
145 errorp, cr);
146 if (so != NULL) {
147 SOCKPARAMS_INC_REF(sp);
148
149 so->so_proto_handle = lh;
150 so->so_downcalls = dc;
151 /*
152 * This function may be called in interrupt context, and CRED()
153 * will be NULL. In this case, pass in kcred.
154 */
155 if ((*errorp = SOP_INIT(so, parent, cr, flags)) == 0) {
156 /* Cannot fail, only bumps so_count */
157 (void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, cr, NULL);
158 } else {
159 socket_destroy(so);
160 so = NULL;
161 }
162 }
163
164 return (so);
165 }
166
167 /*
168 * Bind local endpoint.
169 */
170 int
socket_bind(struct sonode * so,struct sockaddr * name,socklen_t namelen,int flags,cred_t * cr)171 socket_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
172 int flags, cred_t *cr)
173 {
174 return (SOP_BIND(so, name, namelen, flags, cr));
175 }
176
177 /*
178 * Turn socket into a listen socket.
179 */
180 int
socket_listen(struct sonode * so,int backlog,cred_t * cr)181 socket_listen(struct sonode *so, int backlog, cred_t *cr)
182 {
183 if (backlog < 0) {
184 backlog = 0;
185 }
186
187 /*
188 * Use the same qlimit as in BSD. BSD checks the qlimit
189 * before queuing the next connection implying that a
190 * listen(sock, 0) allows one connection to be queued.
191 * BSD also uses 1.5 times the requested backlog.
192 *
193 * XNS Issue 4 required a strict interpretation of the backlog.
194 * This has been waived subsequently for Issue 4 and the change
195 * incorporated in XNS Issue 5. So we aren't required to do
196 * anything special for XPG apps.
197 */
198 if (backlog >= (INT_MAX - 1) / 3)
199 backlog = INT_MAX;
200 else
201 backlog = backlog * 3 / 2 + 1;
202
203 return (SOP_LISTEN(so, backlog, cr));
204 }
205
206 /*
207 * Accept incoming connection.
208 */
209 int
socket_accept(struct sonode * lso,int fflag,cred_t * cr,struct sonode ** nsop)210 socket_accept(struct sonode *lso, int fflag, cred_t *cr, struct sonode **nsop)
211 {
212 return (SOP_ACCEPT(lso, fflag, cr, nsop));
213 }
214
215 /*
216 * Active open.
217 */
218 int
socket_connect(struct sonode * so,struct sockaddr * name,socklen_t namelen,int fflag,int flags,cred_t * cr)219 socket_connect(struct sonode *so, struct sockaddr *name,
220 socklen_t namelen, int fflag, int flags, cred_t *cr)
221 {
222 int error;
223
224 /*
225 * Handle a connect to a name parameter of type AF_UNSPEC like a
226 * connect to a null address. This is the portable method to
227 * unconnect a socket.
228 */
229 if ((namelen >= sizeof (sa_family_t)) &&
230 (name->sa_family == AF_UNSPEC)) {
231 name = NULL;
232 namelen = 0;
233 }
234
235 error = SOP_CONNECT(so, name, namelen, fflag, flags, cr);
236
237 if (error == EHOSTUNREACH && flags & _SOCONNECT_XPG4_2) {
238 /*
239 * X/Open specification contains a requirement that
240 * ENETUNREACH be returned but does not require
241 * EHOSTUNREACH. In order to keep the test suite
242 * happy we mess with the errno here.
243 */
244 error = ENETUNREACH;
245 }
246
247 return (error);
248 }
249
250 /*
251 * Get address of remote node.
252 */
253 int
socket_getpeername(struct sonode * so,struct sockaddr * addr,socklen_t * addrlen,boolean_t accept,cred_t * cr)254 socket_getpeername(struct sonode *so, struct sockaddr *addr,
255 socklen_t *addrlen, boolean_t accept, cred_t *cr)
256 {
257 ASSERT(*addrlen > 0);
258 return (SOP_GETPEERNAME(so, addr, addrlen, accept, cr));
259
260 }
261
262 /*
263 * Get local address.
264 */
265 int
socket_getsockname(struct sonode * so,struct sockaddr * addr,socklen_t * addrlen,cred_t * cr)266 socket_getsockname(struct sonode *so, struct sockaddr *addr,
267 socklen_t *addrlen, cred_t *cr)
268 {
269 return (SOP_GETSOCKNAME(so, addr, addrlen, cr));
270
271 }
272
273 /*
274 * Called from shutdown().
275 */
276 int
socket_shutdown(struct sonode * so,int how,cred_t * cr)277 socket_shutdown(struct sonode *so, int how, cred_t *cr)
278 {
279 return (SOP_SHUTDOWN(so, how, cr));
280 }
281
282 /*
283 * Get socket options.
284 */
285 /*ARGSUSED*/
286 int
socket_getsockopt(struct sonode * so,int level,int option_name,void * optval,socklen_t * optlenp,int flags,cred_t * cr)287 socket_getsockopt(struct sonode *so, int level, int option_name,
288 void *optval, socklen_t *optlenp, int flags, cred_t *cr)
289 {
290 return (SOP_GETSOCKOPT(so, level, option_name, optval,
291 optlenp, flags, cr));
292 }
293
294 /*
295 * Set socket options
296 */
297 int
socket_setsockopt(struct sonode * so,int level,int option_name,const void * optval,t_uscalar_t optlen,cred_t * cr)298 socket_setsockopt(struct sonode *so, int level, int option_name,
299 const void *optval, t_uscalar_t optlen, cred_t *cr)
300 {
301 int val = 1;
302 /* Caller allocates aligned optval, or passes null */
303 ASSERT(((uintptr_t)optval & (sizeof (t_scalar_t) - 1)) == 0);
304 /* If optval is null optlen is 0, and vice-versa */
305 ASSERT(optval != NULL || optlen == 0);
306 ASSERT(optlen != 0 || optval == NULL);
307
308 if (optval == NULL && optlen == 0)
309 optval = &val;
310
311 return (SOP_SETSOCKOPT(so, level, option_name, optval, optlen, cr));
312 }
313
314 int
socket_sendmsg(struct sonode * so,struct nmsghdr * msg,struct uio * uiop,cred_t * cr)315 socket_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
316 cred_t *cr)
317 {
318 int error = 0;
319 ssize_t orig_resid = uiop->uio_resid;
320
321 /*
322 * Do not bypass the cache if we are doing a local (AF_UNIX) write.
323 */
324 if (so->so_family == AF_UNIX)
325 uiop->uio_extflg |= UIO_COPY_CACHED;
326 else
327 uiop->uio_extflg &= ~UIO_COPY_CACHED;
328
329 error = SOP_SENDMSG(so, msg, uiop, cr);
330 switch (error) {
331 default:
332 break;
333 case EINTR:
334 case ENOMEM:
335 /* EAGAIN is EWOULDBLOCK */
336 case EWOULDBLOCK:
337 /* We did a partial send */
338 if (uiop->uio_resid != orig_resid)
339 error = 0;
340 break;
341 case EPIPE:
342 if ((so->so_mode & SM_KERNEL) == 0)
343 tsignal(curthread, SIGPIPE);
344 break;
345 }
346
347 return (error);
348 }
349
350 int
socket_sendmblk(struct sonode * so,struct nmsghdr * msg,int fflag,struct cred * cr,mblk_t ** mpp)351 socket_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
352 struct cred *cr, mblk_t **mpp)
353 {
354 int error = 0;
355
356 error = SOP_SENDMBLK(so, msg, fflag, cr, mpp);
357 if (error == EPIPE) {
358 tsignal(curthread, SIGPIPE);
359 }
360 return (error);
361 }
362
363 int
socket_recvmsg(struct sonode * so,struct nmsghdr * msg,struct uio * uiop,cred_t * cr)364 socket_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
365 cred_t *cr)
366 {
367 int error;
368 ssize_t orig_resid = uiop->uio_resid;
369
370 /*
371 * Do not bypass the cache when reading data, as the application
372 * is likely to access the data shortly.
373 */
374 uiop->uio_extflg |= UIO_COPY_CACHED;
375
376 error = SOP_RECVMSG(so, msg, uiop, cr);
377
378 switch (error) {
379 case EINTR:
380 /* EAGAIN is EWOULDBLOCK */
381 case EWOULDBLOCK:
382 /* We did a partial read */
383 if (uiop->uio_resid != orig_resid)
384 error = 0;
385 break;
386 default:
387 break;
388 }
389 return (error);
390 }
391
392 int
socket_ioctl(struct sonode * so,int cmd,intptr_t arg,int mode,struct cred * cr,int32_t * rvalp)393 socket_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
394 struct cred *cr, int32_t *rvalp)
395 {
396 return (SOP_IOCTL(so, cmd, arg, mode, cr, rvalp));
397 }
398
/*
 * Poll `so' for `events'.
 *
 * Forwards unchanged to the sonode's poll operation and returns its
 * result.
 */
int
socket_poll(struct sonode *so, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	int error;

	error = SOP_POLL(so, events, anyyet, reventsp, phpp);
	return (error);
}
405
406 int
socket_close(struct sonode * so,int flag,struct cred * cr)407 socket_close(struct sonode *so, int flag, struct cred *cr)
408 {
409 return (VOP_CLOSE(SOTOV(so), flag, 1, 0, cr, NULL));
410 }
411
412 int
socket_close_internal(struct sonode * so,int flag,cred_t * cr)413 socket_close_internal(struct sonode *so, int flag, cred_t *cr)
414 {
415 ASSERT(so->so_count == 0);
416
417 return (SOP_CLOSE(so, flag, cr));
418 }
419
420 void
socket_destroy(struct sonode * so)421 socket_destroy(struct sonode *so)
422 {
423 vn_invalid(SOTOV(so));
424 VN_RELE(SOTOV(so));
425 }
426
427 /* ARGSUSED */
428 void
socket_destroy_internal(struct sonode * so,cred_t * cr)429 socket_destroy_internal(struct sonode *so, cred_t *cr)
430 {
431 struct sockparams *sp = so->so_sockparams;
432 ASSERT(so->so_count == 0 && sp != NULL);
433
434 sp->sp_smod_info->smod_sock_destroy_func(so);
435
436 SOCKPARAMS_DEC_REF(sp);
437 }
438
439 /*
440 * TODO Once the common vnode ops is available, then the vnops argument
441 * should be removed.
442 */
443 /*ARGSUSED*/
444 int
sonode_constructor(void * buf,void * cdrarg,int kmflags)445 sonode_constructor(void *buf, void *cdrarg, int kmflags)
446 {
447 struct sonode *so = buf;
448 struct vnode *vp;
449
450 vp = so->so_vnode = vn_alloc(kmflags);
451 if (vp == NULL) {
452 return (-1);
453 }
454 vp->v_data = so;
455 vn_setops(vp, socket_vnodeops);
456
457 so->so_priv = NULL;
458 so->so_oobmsg = NULL;
459
460 so->so_proto_handle = NULL;
461
462 so->so_peercred = NULL;
463
464 so->so_rcv_queued = 0;
465 so->so_rcv_q_head = NULL;
466 so->so_rcv_q_last_head = NULL;
467 so->so_rcv_head = NULL;
468 so->so_rcv_last_head = NULL;
469 so->so_rcv_wanted = 0;
470 so->so_rcv_timer_interval = SOCKET_NO_RCVTIMER;
471 so->so_rcv_timer_tid = 0;
472 so->so_rcv_thresh = 0;
473
474 list_create(&so->so_acceptq_list, sizeof (struct sonode),
475 offsetof(struct sonode, so_acceptq_node));
476 list_create(&so->so_acceptq_defer, sizeof (struct sonode),
477 offsetof(struct sonode, so_acceptq_node));
478 list_link_init(&so->so_acceptq_node);
479 so->so_acceptq_len = 0;
480 so->so_backlog = 0;
481 so->so_listener = NULL;
482
483 so->so_snd_qfull = B_FALSE;
484
485 so->so_filter_active = 0;
486 so->so_filter_tx = 0;
487 so->so_filter_defertime = 0;
488 so->so_filter_top = NULL;
489 so->so_filter_bottom = NULL;
490
491 mutex_init(&so->so_lock, NULL, MUTEX_DEFAULT, NULL);
492 mutex_init(&so->so_acceptq_lock, NULL, MUTEX_DEFAULT, NULL);
493 rw_init(&so->so_fallback_rwlock, NULL, RW_DEFAULT, NULL);
494 cv_init(&so->so_state_cv, NULL, CV_DEFAULT, NULL);
495 cv_init(&so->so_single_cv, NULL, CV_DEFAULT, NULL);
496 cv_init(&so->so_read_cv, NULL, CV_DEFAULT, NULL);
497
498 cv_init(&so->so_acceptq_cv, NULL, CV_DEFAULT, NULL);
499 cv_init(&so->so_snd_cv, NULL, CV_DEFAULT, NULL);
500 cv_init(&so->so_rcv_cv, NULL, CV_DEFAULT, NULL);
501 cv_init(&so->so_copy_cv, NULL, CV_DEFAULT, NULL);
502 cv_init(&so->so_closing_cv, NULL, CV_DEFAULT, NULL);
503
504 return (0);
505 }
506
507 /*ARGSUSED*/
508 void
sonode_destructor(void * buf,void * cdrarg)509 sonode_destructor(void *buf, void *cdrarg)
510 {
511 struct sonode *so = buf;
512 struct vnode *vp = SOTOV(so);
513
514 ASSERT(so->so_priv == NULL);
515 ASSERT(so->so_peercred == NULL);
516
517 ASSERT(so->so_oobmsg == NULL);
518
519 ASSERT(so->so_rcv_q_head == NULL);
520
521 list_destroy(&so->so_acceptq_list);
522 list_destroy(&so->so_acceptq_defer);
523 ASSERT(!list_link_active(&so->so_acceptq_node));
524 ASSERT(so->so_listener == NULL);
525
526 ASSERT(so->so_filter_active == 0);
527 ASSERT(so->so_filter_tx == 0);
528 ASSERT(so->so_filter_top == NULL);
529 ASSERT(so->so_filter_bottom == NULL);
530
531 ASSERT(vp->v_data == so);
532 ASSERT(vn_matchops(vp, socket_vnodeops));
533
534 vn_free(vp);
535
536 mutex_destroy(&so->so_lock);
537 mutex_destroy(&so->so_acceptq_lock);
538 rw_destroy(&so->so_fallback_rwlock);
539
540 cv_destroy(&so->so_state_cv);
541 cv_destroy(&so->so_single_cv);
542 cv_destroy(&so->so_read_cv);
543 cv_destroy(&so->so_acceptq_cv);
544 cv_destroy(&so->so_snd_cv);
545 cv_destroy(&so->so_rcv_cv);
546 cv_destroy(&so->so_closing_cv);
547 }
548
549 void
sonode_init(struct sonode * so,struct sockparams * sp,int family,int type,int protocol,sonodeops_t * sops)550 sonode_init(struct sonode *so, struct sockparams *sp, int family,
551 int type, int protocol, sonodeops_t *sops)
552 {
553 vnode_t *vp;
554
555 vp = SOTOV(so);
556
557 so->so_flag = 0;
558
559 so->so_state = 0;
560 so->so_mode = 0;
561
562 so->so_count = 0;
563
564 so->so_family = family;
565 so->so_type = type;
566 so->so_protocol = protocol;
567
568 SOCK_CONNID_INIT(so->so_proto_connid);
569
570 so->so_options = 0;
571 so->so_linger.l_onoff = 0;
572 so->so_linger.l_linger = 0;
573 so->so_sndbuf = 0;
574 so->so_error = 0;
575 so->so_rcvtimeo = 0;
576 so->so_sndtimeo = 0;
577 so->so_xpg_rcvbuf = 0;
578
579 ASSERT(so->so_oobmsg == NULL);
580 so->so_oobmark = 0;
581 so->so_pgrp = 0;
582
583 ASSERT(so->so_peercred == NULL);
584
585 so->so_zoneid = getzoneid();
586
587 so->so_sockparams = sp;
588
589 so->so_ops = sops;
590
591 so->so_not_str = (sops != &sotpi_sonodeops);
592
593 so->so_proto_handle = NULL;
594
595 so->so_downcalls = NULL;
596
597 so->so_copyflag = 0;
598
599 vn_reinit(vp);
600 vp->v_vfsp = rootvfs;
601 vp->v_type = VSOCK;
602 vp->v_rdev = sockdev;
603
604 so->so_snd_qfull = B_FALSE;
605 so->so_minpsz = 0;
606
607 so->so_rcv_wakeup = B_FALSE;
608 so->so_snd_wakeup = B_FALSE;
609 so->so_flowctrld = B_FALSE;
610
611 so->so_pollev = 0;
612 bzero(&so->so_poll_list, sizeof (so->so_poll_list));
613 bzero(&so->so_proto_props, sizeof (struct sock_proto_props));
614
615 bzero(&(so->so_ksock_callbacks), sizeof (ksocket_callbacks_t));
616 so->so_ksock_cb_arg = NULL;
617
618 so->so_max_addr_len = sizeof (struct sockaddr_storage);
619
620 so->so_direct = NULL;
621
622 vn_exists(vp);
623 }
624
625 void
sonode_fini(struct sonode * so)626 sonode_fini(struct sonode *so)
627 {
628 vnode_t *vp;
629
630 ASSERT(so->so_count == 0);
631
632 if (so->so_rcv_timer_tid) {
633 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
634 (void) untimeout(so->so_rcv_timer_tid);
635 so->so_rcv_timer_tid = 0;
636 }
637
638 if (so->so_poll_list.ph_list != NULL) {
639 pollwakeup(&so->so_poll_list, POLLERR);
640 pollhead_clean(&so->so_poll_list);
641 }
642
643 if (so->so_direct != NULL)
644 sod_sock_fini(so);
645
646 vp = SOTOV(so);
647 vn_invalid(vp);
648
649 if (so->so_peercred != NULL) {
650 crfree(so->so_peercred);
651 so->so_peercred = NULL;
652 }
653 /* Detach and destroy filters */
654 if (so->so_filter_top != NULL)
655 sof_sonode_cleanup(so);
656
657 ASSERT(list_is_empty(&so->so_acceptq_list));
658 ASSERT(list_is_empty(&so->so_acceptq_defer));
659 ASSERT(!list_link_active(&so->so_acceptq_node));
660
661 ASSERT(so->so_rcv_queued == 0);
662 ASSERT(so->so_rcv_q_head == NULL);
663 ASSERT(so->so_rcv_q_last_head == NULL);
664 ASSERT(so->so_rcv_head == NULL);
665 ASSERT(so->so_rcv_last_head == NULL);
666 }
667