xref: /titanic_44/usr/src/uts/common/fs/sockfs/sockcommon.c (revision 07925104db56e5c3eacc4865b918bd16af5cec59)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/param.h>
28 #include <sys/systm.h>
29 #include <sys/sysmacros.h>
30 #include <sys/debug.h>
31 #include <sys/cmn_err.h>
32 #include <sys/vfs.h>
33 #include <sys/policy.h>
34 #include <sys/modctl.h>
35 
36 #include <sys/sunddi.h>
37 
38 #include <sys/strsun.h>
39 #include <sys/stropts.h>
40 #include <sys/strsubr.h>
41 #include <sys/socket.h>
42 #include <sys/socketvar.h>
43 #include <sys/uio.h>
44 
45 #include <inet/ipclassifier.h>
46 #include <fs/sockfs/sockcommon.h>
47 #include <fs/sockfs/sockfilter_impl.h>
48 #include <fs/sockfs/nl7c.h>
49 #include <fs/sockfs/socktpi.h>
50 #include <fs/sockfs/sodirect.h>
51 #include <inet/ip.h>
52 
53 extern int xnet_skip_checks, xnet_check_print, xnet_truncate_print;
54 
55 /*
56  * Common socket access functions.
57  *
58  * Instead of accessing the sonode switch directly (i.e., SOP_xxx()),
59  * the socket_xxx() function should be used.
60  */
61 
62 /*
63  * Try to create a new sonode of the requested <family, type, protocol>.
64  */
65 /* ARGSUSED */
66 struct sonode *
67 socket_create(int family, int type, int protocol, char *devpath, char *mod,
68     int flags, int version, struct cred *cr, int *errorp)
69 {
70 	struct sonode *so;
71 	struct sockparams *sp = NULL;
72 	int saved_error;
73 
74 	/*
75 	 * Look for a sockparams entry that match the given criteria.
76 	 * solookup() returns with the entry held.
77 	 */
78 	*errorp = solookup(family, type, protocol, &sp);
79 	saved_error = *errorp;
80 	if (sp == NULL) {
81 		int kmflags = (flags == SOCKET_SLEEP) ? KM_SLEEP : KM_NOSLEEP;
82 		/*
83 		 * There is no matching sockparams entry. An ephemeral entry is
84 		 * created if the caller specifies a device or a socket module.
85 		 */
86 		if (devpath != NULL) {
87 			saved_error = 0;
88 			sp = sockparams_hold_ephemeral_bydev(family, type,
89 			    protocol, devpath, kmflags, errorp);
90 		} else if (mod != NULL) {
91 			saved_error = 0;
92 			sp = sockparams_hold_ephemeral_bymod(family, type,
93 			    protocol, mod, kmflags, errorp);
94 		} else {
95 			*errorp = solookup(family, type, 0, &sp);
96 		}
97 
98 		if (sp == NULL) {
99 			if (saved_error && (*errorp == EPROTONOSUPPORT ||
100 			    *errorp == EPROTOTYPE || *errorp == ENOPROTOOPT))
101 				*errorp = saved_error;
102 			return (NULL);
103 		}
104 	}
105 
106 	ASSERT(sp->sp_smod_info != NULL);
107 	ASSERT(flags == SOCKET_SLEEP || flags == SOCKET_NOSLEEP);
108 	sp->sp_stats.sps_ncreate.value.ui64++;
109 	so = sp->sp_smod_info->smod_sock_create_func(sp, family, type,
110 	    protocol, version, flags, errorp, cr);
111 	if (so == NULL) {
112 		SOCKPARAMS_DEC_REF(sp);
113 	} else {
114 		if ((*errorp = SOP_INIT(so, NULL, cr, flags)) == 0) {
115 			/* Cannot fail, only bumps so_count */
116 			(void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, cr, NULL);
117 		} else {
118 			if (saved_error && (*errorp == EPROTONOSUPPORT ||
119 			    *errorp == EPROTOTYPE || *errorp == ENOPROTOOPT))
120 				*errorp = saved_error;
121 			socket_destroy(so);
122 			so = NULL;
123 		}
124 	}
125 	return (so);
126 }
127 
128 struct sonode *
129 socket_newconn(struct sonode *parent, sock_lower_handle_t lh,
130     sock_downcalls_t *dc, int flags, int *errorp)
131 {
132 	struct sonode *so;
133 	struct sockparams *sp;
134 	struct cred *cr;
135 
136 	if ((cr = CRED()) == NULL)
137 		cr = kcred;
138 
139 	sp = parent->so_sockparams;
140 	ASSERT(sp != NULL);
141 
142 	sp->sp_stats.sps_ncreate.value.ui64++;
143 	so = sp->sp_smod_info->smod_sock_create_func(sp, parent->so_family,
144 	    parent->so_type, parent->so_protocol, parent->so_version, flags,
145 	    errorp, cr);
146 	if (so != NULL) {
147 		SOCKPARAMS_INC_REF(sp);
148 
149 		so->so_proto_handle = lh;
150 		so->so_downcalls = dc;
151 		/*
152 		 * This function may be called in interrupt context, and CRED()
153 		 * will be NULL. In this case, pass in kcred.
154 		 */
155 		if ((*errorp = SOP_INIT(so, parent, cr, flags)) == 0) {
156 			/* Cannot fail, only bumps so_count */
157 			(void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, cr, NULL);
158 		} else  {
159 			socket_destroy(so);
160 			so = NULL;
161 		}
162 	}
163 
164 	return (so);
165 }
166 
167 /*
168  * Bind local endpoint.
169  */
170 int
171 socket_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
172     int flags, cred_t *cr)
173 {
174 	return (SOP_BIND(so, name, namelen, flags, cr));
175 }
176 
177 /*
178  * Turn socket into a listen socket.
179  */
180 int
181 socket_listen(struct sonode *so, int backlog, cred_t *cr)
182 {
183 	if (backlog < 0) {
184 		backlog = 0;
185 	}
186 
187 	/*
188 	 * Use the same qlimit as in BSD. BSD checks the qlimit
189 	 * before queuing the next connection implying that a
190 	 * listen(sock, 0) allows one connection to be queued.
191 	 * BSD also uses 1.5 times the requested backlog.
192 	 *
193 	 * XNS Issue 4 required a strict interpretation of the backlog.
194 	 * This has been waived subsequently for Issue 4 and the change
195 	 * incorporated in XNS Issue 5. So we aren't required to do
196 	 * anything special for XPG apps.
197 	 */
198 	if (backlog >= (INT_MAX - 1) / 3)
199 		backlog = INT_MAX;
200 	else
201 		backlog = backlog * 3 / 2 + 1;
202 
203 	return (SOP_LISTEN(so, backlog, cr));
204 }
205 
206 /*
207  * Accept incoming connection.
208  */
209 int
210 socket_accept(struct sonode *lso, int fflag, cred_t *cr, struct sonode **nsop)
211 {
212 	return (SOP_ACCEPT(lso, fflag, cr, nsop));
213 }
214 
215 /*
216  * Active open.
217  */
218 int
219 socket_connect(struct sonode *so, struct sockaddr *name,
220     socklen_t namelen, int fflag, int flags, cred_t *cr)
221 {
222 	int error;
223 
224 	/*
225 	 * Handle a connect to a name parameter of type AF_UNSPEC like a
226 	 * connect to a null address. This is the portable method to
227 	 * unconnect a socket.
228 	 */
229 	if ((namelen >= sizeof (sa_family_t)) &&
230 	    (name->sa_family == AF_UNSPEC)) {
231 		name = NULL;
232 		namelen = 0;
233 	}
234 
235 	error = SOP_CONNECT(so, name, namelen, fflag, flags, cr);
236 
237 	if (error == EHOSTUNREACH && flags & _SOCONNECT_XPG4_2) {
238 		/*
239 		 * X/Open specification contains a requirement that
240 		 * ENETUNREACH be returned but does not require
241 		 * EHOSTUNREACH. In order to keep the test suite
242 		 * happy we mess with the errno here.
243 		 */
244 		error = ENETUNREACH;
245 	}
246 
247 	return (error);
248 }
249 
250 /*
251  * Get address of remote node.
252  */
253 int
254 socket_getpeername(struct sonode *so, struct sockaddr *addr,
255     socklen_t *addrlen, boolean_t accept, cred_t *cr)
256 {
257 	ASSERT(*addrlen > 0);
258 	return (SOP_GETPEERNAME(so, addr, addrlen, accept, cr));
259 
260 }
261 
262 /*
263  * Get local address.
264  */
265 int
266 socket_getsockname(struct sonode *so, struct sockaddr *addr,
267     socklen_t *addrlen, cred_t *cr)
268 {
269 	return (SOP_GETSOCKNAME(so, addr, addrlen, cr));
270 
271 }
272 
273 /*
274  * Called from shutdown().
275  */
276 int
277 socket_shutdown(struct sonode *so, int how, cred_t *cr)
278 {
279 	return (SOP_SHUTDOWN(so, how, cr));
280 }
281 
282 /*
283  * Get socket options.
284  */
285 /*ARGSUSED*/
286 int
287 socket_getsockopt(struct sonode *so, int level, int option_name,
288     void *optval, socklen_t *optlenp, int flags, cred_t *cr)
289 {
290 	return (SOP_GETSOCKOPT(so, level, option_name, optval,
291 	    optlenp, flags, cr));
292 }
293 
294 /*
295  * Set socket options
296  */
297 int
298 socket_setsockopt(struct sonode *so, int level, int option_name,
299     const void *optval, t_uscalar_t optlen, cred_t *cr)
300 {
301 	int val = 1;
302 	/* Caller allocates aligned optval, or passes null */
303 	ASSERT(((uintptr_t)optval & (sizeof (t_scalar_t) - 1)) == 0);
304 	/* If optval is null optlen is 0, and vice-versa */
305 	ASSERT(optval != NULL || optlen == 0);
306 	ASSERT(optlen != 0 || optval == NULL);
307 
308 	if (optval == NULL && optlen == 0)
309 		optval = &val;
310 
311 	return (SOP_SETSOCKOPT(so, level, option_name, optval, optlen, cr));
312 }
313 
314 int
315 socket_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
316     cred_t *cr)
317 {
318 	int error = 0;
319 	ssize_t orig_resid = uiop->uio_resid;
320 
321 	/*
322 	 * Do not bypass the cache if we are doing a local (AF_UNIX) write.
323 	 */
324 	if (so->so_family == AF_UNIX)
325 		uiop->uio_extflg |= UIO_COPY_CACHED;
326 	else
327 		uiop->uio_extflg &= ~UIO_COPY_CACHED;
328 
329 	error = SOP_SENDMSG(so, msg, uiop, cr);
330 	switch (error) {
331 	default:
332 		break;
333 	case EINTR:
334 	case ENOMEM:
335 	/* EAGAIN is EWOULDBLOCK */
336 	case EWOULDBLOCK:
337 		/* We did a partial send */
338 		if (uiop->uio_resid != orig_resid)
339 			error = 0;
340 		break;
341 	case EPIPE:
342 		if ((so->so_mode & SM_KERNEL) == 0)
343 			tsignal(curthread, SIGPIPE);
344 		break;
345 	}
346 
347 	return (error);
348 }
349 
350 int
351 socket_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
352     struct cred *cr, mblk_t **mpp)
353 {
354 	int error = 0;
355 
356 	error = SOP_SENDMBLK(so, msg, fflag, cr, mpp);
357 	if (error == EPIPE) {
358 		tsignal(curthread, SIGPIPE);
359 	}
360 	return (error);
361 }
362 
363 int
364 socket_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
365     cred_t *cr)
366 {
367 	int error;
368 	ssize_t orig_resid = uiop->uio_resid;
369 
370 	/*
371 	 * Do not bypass the cache when reading data, as the application
372 	 * is likely to access the data shortly.
373 	 */
374 	uiop->uio_extflg |= UIO_COPY_CACHED;
375 
376 	error = SOP_RECVMSG(so, msg, uiop, cr);
377 
378 	switch (error) {
379 	case EINTR:
380 	/* EAGAIN is EWOULDBLOCK */
381 	case EWOULDBLOCK:
382 		/* We did a partial read */
383 		if (uiop->uio_resid != orig_resid)
384 			error = 0;
385 		break;
386 	default:
387 		break;
388 	}
389 	return (error);
390 }
391 
392 int
393 socket_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
394     struct cred *cr, int32_t *rvalp)
395 {
396 	return (SOP_IOCTL(so, cmd, arg, mode, cr, rvalp));
397 }
398 
/*
 * Poll `so' for `events'.
 * Thin wrapper around the sonode's poll operation; results are returned
 * through `reventsp' and `phpp'.
 */
int
socket_poll(struct sonode *so, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	int error;

	error = SOP_POLL(so, events, anyyet, reventsp, phpp);
	return (error);
}
405 
406 int
407 socket_close(struct sonode *so, int flag, struct cred *cr)
408 {
409 	return (VOP_CLOSE(SOTOV(so), flag, 1, 0, cr, NULL));
410 }
411 
412 int
413 socket_close_internal(struct sonode *so, int flag, cred_t *cr)
414 {
415 	ASSERT(so->so_count == 0);
416 
417 	return (SOP_CLOSE(so, flag, cr));
418 }
419 
420 void
421 socket_destroy(struct sonode *so)
422 {
423 	vn_invalid(SOTOV(so));
424 	VN_RELE(SOTOV(so));
425 }
426 
427 /* ARGSUSED */
428 void
429 socket_destroy_internal(struct sonode *so, cred_t *cr)
430 {
431 	struct sockparams *sp = so->so_sockparams;
432 	ASSERT(so->so_count == 0 && sp != NULL);
433 
434 	sp->sp_smod_info->smod_sock_destroy_func(so);
435 
436 	SOCKPARAMS_DEC_REF(sp);
437 }
438 
439 /*
440  * TODO Once the common vnode ops is available, then the vnops argument
441  * should be removed.
442  */
443 /*ARGSUSED*/
444 int
445 sonode_constructor(void *buf, void *cdrarg, int kmflags)
446 {
447 	struct sonode *so = buf;
448 	struct vnode *vp;
449 
450 	vp = so->so_vnode = vn_alloc(kmflags);
451 	if (vp == NULL) {
452 		return (-1);
453 	}
454 	vp->v_data = so;
455 	vn_setops(vp, socket_vnodeops);
456 
457 	so->so_priv 		= NULL;
458 	so->so_oobmsg		= NULL;
459 
460 	so->so_proto_handle	= NULL;
461 
462 	so->so_peercred 	= NULL;
463 
464 	so->so_rcv_queued	= 0;
465 	so->so_rcv_q_head 	= NULL;
466 	so->so_rcv_q_last_head 	= NULL;
467 	so->so_rcv_head		= NULL;
468 	so->so_rcv_last_head	= NULL;
469 	so->so_rcv_wanted	= 0;
470 	so->so_rcv_timer_interval = SOCKET_NO_RCVTIMER;
471 	so->so_rcv_timer_tid	= 0;
472 	so->so_rcv_thresh	= 0;
473 
474 	list_create(&so->so_acceptq_list, sizeof (struct sonode),
475 	    offsetof(struct sonode, so_acceptq_node));
476 	list_create(&so->so_acceptq_defer, sizeof (struct sonode),
477 	    offsetof(struct sonode, so_acceptq_node));
478 	list_link_init(&so->so_acceptq_node);
479 	so->so_acceptq_len	= 0;
480 	so->so_backlog		= 0;
481 	so->so_listener		= NULL;
482 
483 	so->so_snd_qfull	= B_FALSE;
484 
485 	so->so_filter_active	= 0;
486 	so->so_filter_tx	= 0;
487 	so->so_filter_defertime = 0;
488 	so->so_filter_top	= NULL;
489 	so->so_filter_bottom	= NULL;
490 
491 	mutex_init(&so->so_lock, NULL, MUTEX_DEFAULT, NULL);
492 	mutex_init(&so->so_acceptq_lock, NULL, MUTEX_DEFAULT, NULL);
493 	rw_init(&so->so_fallback_rwlock, NULL, RW_DEFAULT, NULL);
494 	cv_init(&so->so_state_cv, NULL, CV_DEFAULT, NULL);
495 	cv_init(&so->so_single_cv, NULL, CV_DEFAULT, NULL);
496 	cv_init(&so->so_read_cv, NULL, CV_DEFAULT, NULL);
497 
498 	cv_init(&so->so_acceptq_cv, NULL, CV_DEFAULT, NULL);
499 	cv_init(&so->so_snd_cv, NULL, CV_DEFAULT, NULL);
500 	cv_init(&so->so_rcv_cv, NULL, CV_DEFAULT, NULL);
501 	cv_init(&so->so_copy_cv, NULL, CV_DEFAULT, NULL);
502 	cv_init(&so->so_closing_cv, NULL, CV_DEFAULT, NULL);
503 
504 	return (0);
505 }
506 
507 /*ARGSUSED*/
508 void
509 sonode_destructor(void *buf, void *cdrarg)
510 {
511 	struct sonode *so = buf;
512 	struct vnode *vp = SOTOV(so);
513 
514 	ASSERT(so->so_priv == NULL);
515 	ASSERT(so->so_peercred == NULL);
516 
517 	ASSERT(so->so_oobmsg == NULL);
518 
519 	ASSERT(so->so_rcv_q_head == NULL);
520 
521 	list_destroy(&so->so_acceptq_list);
522 	list_destroy(&so->so_acceptq_defer);
523 	ASSERT(!list_link_active(&so->so_acceptq_node));
524 	ASSERT(so->so_listener == NULL);
525 
526 	ASSERT(so->so_filter_active == 0);
527 	ASSERT(so->so_filter_tx == 0);
528 	ASSERT(so->so_filter_top == NULL);
529 	ASSERT(so->so_filter_bottom == NULL);
530 
531 	ASSERT(vp->v_data == so);
532 	ASSERT(vn_matchops(vp, socket_vnodeops));
533 
534 	vn_free(vp);
535 
536 	mutex_destroy(&so->so_lock);
537 	mutex_destroy(&so->so_acceptq_lock);
538 	rw_destroy(&so->so_fallback_rwlock);
539 
540 	cv_destroy(&so->so_state_cv);
541 	cv_destroy(&so->so_single_cv);
542 	cv_destroy(&so->so_read_cv);
543 	cv_destroy(&so->so_acceptq_cv);
544 	cv_destroy(&so->so_snd_cv);
545 	cv_destroy(&so->so_rcv_cv);
546 	cv_destroy(&so->so_closing_cv);
547 }
548 
549 void
550 sonode_init(struct sonode *so, struct sockparams *sp, int family,
551     int type, int protocol, sonodeops_t *sops)
552 {
553 	vnode_t *vp;
554 
555 	vp = SOTOV(so);
556 
557 	so->so_flag	= 0;
558 
559 	so->so_state	= 0;
560 	so->so_mode	= 0;
561 
562 	so->so_count	= 0;
563 
564 	so->so_family	= family;
565 	so->so_type	= type;
566 	so->so_protocol	= protocol;
567 
568 	SOCK_CONNID_INIT(so->so_proto_connid);
569 
570 	so->so_options	= 0;
571 	so->so_linger.l_onoff   = 0;
572 	so->so_linger.l_linger = 0;
573 	so->so_sndbuf	= 0;
574 	so->so_error	= 0;
575 	so->so_rcvtimeo	= 0;
576 	so->so_sndtimeo = 0;
577 	so->so_xpg_rcvbuf = 0;
578 
579 	ASSERT(so->so_oobmsg == NULL);
580 	so->so_oobmark	= 0;
581 	so->so_pgrp	= 0;
582 
583 	ASSERT(so->so_peercred == NULL);
584 
585 	so->so_zoneid = getzoneid();
586 
587 	so->so_sockparams = sp;
588 
589 	so->so_ops = sops;
590 
591 	so->so_not_str = (sops != &sotpi_sonodeops);
592 
593 	so->so_proto_handle = NULL;
594 
595 	so->so_downcalls = NULL;
596 
597 	so->so_copyflag = 0;
598 
599 	vn_reinit(vp);
600 	vp->v_vfsp	= rootvfs;
601 	vp->v_type	= VSOCK;
602 	vp->v_rdev	= sockdev;
603 
604 	so->so_snd_qfull = B_FALSE;
605 	so->so_minpsz = 0;
606 
607 	so->so_rcv_wakeup = B_FALSE;
608 	so->so_snd_wakeup = B_FALSE;
609 	so->so_flowctrld = B_FALSE;
610 
611 	so->so_pollev = 0;
612 	bzero(&so->so_poll_list, sizeof (so->so_poll_list));
613 	bzero(&so->so_proto_props, sizeof (struct sock_proto_props));
614 
615 	bzero(&(so->so_ksock_callbacks), sizeof (ksocket_callbacks_t));
616 	so->so_ksock_cb_arg = NULL;
617 
618 	so->so_max_addr_len = sizeof (struct sockaddr_storage);
619 
620 	so->so_direct = NULL;
621 
622 	vn_exists(vp);
623 }
624 
625 void
626 sonode_fini(struct sonode *so)
627 {
628 	vnode_t *vp;
629 
630 	ASSERT(so->so_count == 0);
631 
632 	if (so->so_rcv_timer_tid) {
633 		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
634 		(void) untimeout(so->so_rcv_timer_tid);
635 		so->so_rcv_timer_tid = 0;
636 	}
637 
638 	if (so->so_poll_list.ph_list != NULL) {
639 		pollwakeup(&so->so_poll_list, POLLERR);
640 		pollhead_clean(&so->so_poll_list);
641 	}
642 
643 	if (so->so_direct != NULL)
644 		sod_sock_fini(so);
645 
646 	vp = SOTOV(so);
647 	vn_invalid(vp);
648 
649 	if (so->so_peercred != NULL) {
650 		crfree(so->so_peercred);
651 		so->so_peercred = NULL;
652 	}
653 	/* Detach and destroy filters */
654 	if (so->so_filter_top != NULL)
655 		sof_sonode_cleanup(so);
656 
657 	ASSERT(list_is_empty(&so->so_acceptq_list));
658 	ASSERT(list_is_empty(&so->so_acceptq_defer));
659 	ASSERT(!list_link_active(&so->so_acceptq_node));
660 
661 	ASSERT(so->so_rcv_queued == 0);
662 	ASSERT(so->so_rcv_q_head == NULL);
663 	ASSERT(so->so_rcv_q_last_head == NULL);
664 	ASSERT(so->so_rcv_head == NULL);
665 	ASSERT(so->so_rcv_last_head == NULL);
666 }
667