xref: /illumos-gate/usr/src/uts/common/fs/sockfs/sockcommon.c (revision f362c74cdda7b4819bb5d3360149ac0fae9ea013)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright 2017 Sebastian Wiedenroth
25  */
26 
27 #include <sys/types.h>
28 #include <sys/param.h>
29 #include <sys/systm.h>
30 #include <sys/sysmacros.h>
31 #include <sys/debug.h>
32 #include <sys/cmn_err.h>
33 #include <sys/vfs.h>
34 #include <sys/policy.h>
35 #include <sys/modctl.h>
36 
37 #include <sys/sunddi.h>
38 
39 #include <sys/strsun.h>
40 #include <sys/stropts.h>
41 #include <sys/strsubr.h>
42 #include <sys/socket.h>
43 #include <sys/socketvar.h>
44 #include <sys/uio.h>
45 
46 #include <inet/ipclassifier.h>
47 #include <fs/sockfs/sockcommon.h>
48 #include <fs/sockfs/sockfilter_impl.h>
49 #include <fs/sockfs/nl7c.h>
50 #include <fs/sockfs/socktpi.h>
51 #include <fs/sockfs/sodirect.h>
52 #include <inet/ip.h>
53 
54 extern int xnet_skip_checks, xnet_check_print, xnet_truncate_print;
55 
56 /*
57  * Common socket access functions.
58  *
59  * Instead of accessing the sonode switch directly (i.e., SOP_xxx()),
60  * the socket_xxx() function should be used.
61  */
62 
63 /*
64  * Try to create a new sonode of the requested <family, type, protocol>.
65  */
66 /* ARGSUSED */
67 struct sonode *
68 socket_create(int family, int type, int protocol, char *devpath, char *mod,
69     int flags, int version, struct cred *cr, int *errorp)
70 {
71 	struct sonode *so;
72 	struct sockparams *sp = NULL;
73 	int saved_error;
74 
75 	/*
76 	 * Look for a sockparams entry that match the given criteria.
77 	 * solookup() returns with the entry held.
78 	 */
79 	*errorp = solookup(family, type, protocol, &sp);
80 	saved_error = *errorp;
81 	if (sp == NULL) {
82 		int kmflags = (flags == SOCKET_SLEEP) ? KM_SLEEP : KM_NOSLEEP;
83 		/*
84 		 * There is no matching sockparams entry. An ephemeral entry is
85 		 * created if the caller specifies a device or a socket module.
86 		 */
87 		if (devpath != NULL) {
88 			saved_error = 0;
89 			sp = sockparams_hold_ephemeral_bydev(family, type,
90 			    protocol, devpath, kmflags, errorp);
91 		} else if (mod != NULL) {
92 			saved_error = 0;
93 			sp = sockparams_hold_ephemeral_bymod(family, type,
94 			    protocol, mod, kmflags, errorp);
95 		} else {
96 			*errorp = solookup(family, type, 0, &sp);
97 		}
98 
99 		if (sp == NULL) {
100 			if (saved_error && (*errorp == EPROTONOSUPPORT ||
101 			    *errorp == EPROTOTYPE || *errorp == ENOPROTOOPT))
102 				*errorp = saved_error;
103 			return (NULL);
104 		}
105 	}
106 
107 	ASSERT(sp->sp_smod_info != NULL);
108 	ASSERT(flags == SOCKET_SLEEP || flags == SOCKET_NOSLEEP);
109 	sp->sp_stats.sps_ncreate.value.ui64++;
110 	so = sp->sp_smod_info->smod_sock_create_func(sp, family, type,
111 	    protocol, version, flags, errorp, cr);
112 	if (so == NULL) {
113 		SOCKPARAMS_DEC_REF(sp);
114 	} else {
115 		if ((*errorp = SOP_INIT(so, NULL, cr, flags)) == 0) {
116 			/* Cannot fail, only bumps so_count */
117 			(void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, cr, NULL);
118 		} else {
119 			if (saved_error && (*errorp == EPROTONOSUPPORT ||
120 			    *errorp == EPROTOTYPE || *errorp == ENOPROTOOPT))
121 				*errorp = saved_error;
122 			socket_destroy(so);
123 			so = NULL;
124 		}
125 	}
126 	return (so);
127 }
128 
129 struct sonode *
130 socket_newconn(struct sonode *parent, sock_lower_handle_t lh,
131     sock_downcalls_t *dc, int flags, int *errorp)
132 {
133 	struct sonode *so;
134 	struct sockparams *sp;
135 	struct cred *cr;
136 
137 	if ((cr = CRED()) == NULL)
138 		cr = kcred;
139 
140 	sp = parent->so_sockparams;
141 	ASSERT(sp != NULL);
142 
143 	sp->sp_stats.sps_ncreate.value.ui64++;
144 	so = sp->sp_smod_info->smod_sock_create_func(sp, parent->so_family,
145 	    parent->so_type, parent->so_protocol, parent->so_version, flags,
146 	    errorp, cr);
147 	if (so != NULL) {
148 		SOCKPARAMS_INC_REF(sp);
149 
150 		so->so_proto_handle = lh;
151 		so->so_downcalls = dc;
152 		/*
153 		 * This function may be called in interrupt context, and CRED()
154 		 * will be NULL. In this case, pass in kcred.
155 		 */
156 		if ((*errorp = SOP_INIT(so, parent, cr, flags)) == 0) {
157 			/* Cannot fail, only bumps so_count */
158 			(void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, cr, NULL);
159 		} else  {
160 			socket_destroy(so);
161 			so = NULL;
162 		}
163 	}
164 
165 	return (so);
166 }
167 
168 /*
169  * Bind local endpoint.
170  */
171 int
172 socket_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
173     int flags, cred_t *cr)
174 {
175 	return (SOP_BIND(so, name, namelen, flags, cr));
176 }
177 
178 /*
179  * Turn socket into a listen socket.
180  */
181 int
182 socket_listen(struct sonode *so, int backlog, cred_t *cr)
183 {
184 	if (backlog < 0) {
185 		backlog = 0;
186 	}
187 
188 	/*
189 	 * Use the same qlimit as in BSD. BSD checks the qlimit
190 	 * before queuing the next connection implying that a
191 	 * listen(sock, 0) allows one connection to be queued.
192 	 * BSD also uses 1.5 times the requested backlog.
193 	 *
194 	 * XNS Issue 4 required a strict interpretation of the backlog.
195 	 * This has been waived subsequently for Issue 4 and the change
196 	 * incorporated in XNS Issue 5. So we aren't required to do
197 	 * anything special for XPG apps.
198 	 */
199 	if (backlog >= (INT_MAX - 1) / 3)
200 		backlog = INT_MAX;
201 	else
202 		backlog = backlog * 3 / 2 + 1;
203 
204 	return (SOP_LISTEN(so, backlog, cr));
205 }
206 
207 /*
208  * Accept incoming connection.
209  */
210 int
211 socket_accept(struct sonode *lso, int fflag, cred_t *cr, struct sonode **nsop)
212 {
213 	return (SOP_ACCEPT(lso, fflag, cr, nsop));
214 }
215 
216 /*
217  * Active open.
218  */
219 int
220 socket_connect(struct sonode *so, struct sockaddr *name,
221     socklen_t namelen, int fflag, int flags, cred_t *cr)
222 {
223 	int error;
224 
225 	/*
226 	 * Handle a connect to a name parameter of type AF_UNSPEC like a
227 	 * connect to a null address. This is the portable method to
228 	 * unconnect a socket.
229 	 */
230 	if ((namelen >= sizeof (sa_family_t)) &&
231 	    (name->sa_family == AF_UNSPEC)) {
232 		name = NULL;
233 		namelen = 0;
234 	}
235 
236 	error = SOP_CONNECT(so, name, namelen, fflag, flags, cr);
237 
238 	if (error == EHOSTUNREACH && flags & _SOCONNECT_XPG4_2) {
239 		/*
240 		 * X/Open specification contains a requirement that
241 		 * ENETUNREACH be returned but does not require
242 		 * EHOSTUNREACH. In order to keep the test suite
243 		 * happy we mess with the errno here.
244 		 */
245 		error = ENETUNREACH;
246 	}
247 
248 	return (error);
249 }
250 
251 /*
252  * Get address of remote node.
253  */
254 int
255 socket_getpeername(struct sonode *so, struct sockaddr *addr,
256     socklen_t *addrlen, boolean_t accept, cred_t *cr)
257 {
258 	ASSERT(*addrlen > 0);
259 	return (SOP_GETPEERNAME(so, addr, addrlen, accept, cr));
260 
261 }
262 
263 /*
264  * Get local address.
265  */
266 int
267 socket_getsockname(struct sonode *so, struct sockaddr *addr,
268     socklen_t *addrlen, cred_t *cr)
269 {
270 	return (SOP_GETSOCKNAME(so, addr, addrlen, cr));
271 
272 }
273 
274 /*
275  * Called from shutdown().
276  */
277 int
278 socket_shutdown(struct sonode *so, int how, cred_t *cr)
279 {
280 	return (SOP_SHUTDOWN(so, how, cr));
281 }
282 
283 /*
284  * Get socket options.
285  */
286 /*ARGSUSED*/
287 int
288 socket_getsockopt(struct sonode *so, int level, int option_name,
289     void *optval, socklen_t *optlenp, int flags, cred_t *cr)
290 {
291 	return (SOP_GETSOCKOPT(so, level, option_name, optval,
292 	    optlenp, flags, cr));
293 }
294 
295 /*
296  * Set socket options
297  */
298 int
299 socket_setsockopt(struct sonode *so, int level, int option_name,
300     const void *optval, t_uscalar_t optlen, cred_t *cr)
301 {
302 	int val = 1;
303 	/* Caller allocates aligned optval, or passes null */
304 	ASSERT(((uintptr_t)optval & (sizeof (t_scalar_t) - 1)) == 0);
305 	/* If optval is null optlen is 0, and vice-versa */
306 	ASSERT(optval != NULL || optlen == 0);
307 	ASSERT(optlen != 0 || optval == NULL);
308 
309 	if (optval == NULL && optlen == 0)
310 		optval = &val;
311 
312 	return (SOP_SETSOCKOPT(so, level, option_name, optval, optlen, cr));
313 }
314 
315 int
316 socket_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
317     cred_t *cr)
318 {
319 	int error = 0;
320 	ssize_t orig_resid = uiop->uio_resid;
321 
322 	/*
323 	 * Do not bypass the cache if we are doing a local (AF_UNIX) write.
324 	 */
325 	if (so->so_family == AF_UNIX)
326 		uiop->uio_extflg |= UIO_COPY_CACHED;
327 	else
328 		uiop->uio_extflg &= ~UIO_COPY_CACHED;
329 
330 	error = SOP_SENDMSG(so, msg, uiop, cr);
331 	switch (error) {
332 	default:
333 		break;
334 	case EINTR:
335 	case ENOMEM:
336 	/* EAGAIN is EWOULDBLOCK */
337 	case EWOULDBLOCK:
338 		/* We did a partial send */
339 		if (uiop->uio_resid != orig_resid)
340 			error = 0;
341 		break;
342 	case EPIPE:
343 		if (((so->so_mode & SM_KERNEL) == 0) &&
344 		    ((msg->msg_flags & MSG_NOSIGNAL) == 0)) {
345 			tsignal(curthread, SIGPIPE);
346 		}
347 		break;
348 	}
349 
350 	return (error);
351 }
352 
353 int
354 socket_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
355     struct cred *cr, mblk_t **mpp)
356 {
357 	int error = 0;
358 
359 	error = SOP_SENDMBLK(so, msg, fflag, cr, mpp);
360 	if (error == EPIPE) {
361 		tsignal(curthread, SIGPIPE);
362 	}
363 	return (error);
364 }
365 
366 int
367 socket_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
368     cred_t *cr)
369 {
370 	int error;
371 	ssize_t orig_resid = uiop->uio_resid;
372 
373 	/*
374 	 * Do not bypass the cache when reading data, as the application
375 	 * is likely to access the data shortly.
376 	 */
377 	uiop->uio_extflg |= UIO_COPY_CACHED;
378 
379 	error = SOP_RECVMSG(so, msg, uiop, cr);
380 
381 	switch (error) {
382 	case EINTR:
383 	/* EAGAIN is EWOULDBLOCK */
384 	case EWOULDBLOCK:
385 		/* We did a partial read */
386 		if (uiop->uio_resid != orig_resid)
387 			error = 0;
388 		break;
389 	default:
390 		break;
391 	}
392 	return (error);
393 }
394 
/*
 * Forward an ioctl to the sonode's ioctl operation and return its result
 * unchanged.
 */
int
socket_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
    struct cred *cr, int32_t *rvalp)
{
	int error;

	error = SOP_IOCTL(so, cmd, arg, mode, cr, rvalp);
	return (error);
}
401 
/*
 * Forward a poll request to the sonode's poll operation and return its
 * result unchanged.
 */
int
socket_poll(struct sonode *so, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	int error;

	error = SOP_POLL(so, events, anyyet, reventsp, phpp);
	return (error);
}
408 
/*
 * Close the socket through its vnode: VOP_CLOSE() is invoked on the
 * sonode's vnode and its result is returned unchanged.
 */
int
socket_close(struct sonode *so, int flag, struct cred *cr)
{
	int error;

	error = VOP_CLOSE(SOTOV(so), flag, 1, 0, cr, NULL);
	return (error);
}
414 
415 int
416 socket_close_internal(struct sonode *so, int flag, cred_t *cr)
417 {
418 	ASSERT(so->so_count == 0);
419 
420 	return (SOP_CLOSE(so, flag, cr));
421 }
422 
423 void
424 socket_destroy(struct sonode *so)
425 {
426 	vn_invalid(SOTOV(so));
427 	VN_RELE(SOTOV(so));
428 }
429 
430 /* ARGSUSED */
431 void
432 socket_destroy_internal(struct sonode *so, cred_t *cr)
433 {
434 	struct sockparams *sp = so->so_sockparams;
435 	ASSERT(so->so_count == 0 && sp != NULL);
436 
437 	sp->sp_smod_info->smod_sock_destroy_func(so);
438 
439 	SOCKPARAMS_DEC_REF(sp);
440 }
441 
442 /*
443  * TODO Once the common vnode ops is available, then the vnops argument
444  * should be removed.
445  */
446 /*ARGSUSED*/
447 int
448 sonode_constructor(void *buf, void *cdrarg, int kmflags)
449 {
450 	struct sonode *so = buf;
451 	struct vnode *vp;
452 
453 	vp = so->so_vnode = vn_alloc(kmflags);
454 	if (vp == NULL) {
455 		return (-1);
456 	}
457 	vp->v_data = so;
458 	vn_setops(vp, socket_vnodeops);
459 
460 	so->so_priv 		= NULL;
461 	so->so_oobmsg		= NULL;
462 
463 	so->so_proto_handle	= NULL;
464 
465 	so->so_peercred 	= NULL;
466 
467 	so->so_rcv_queued	= 0;
468 	so->so_rcv_q_head 	= NULL;
469 	so->so_rcv_q_last_head 	= NULL;
470 	so->so_rcv_head		= NULL;
471 	so->so_rcv_last_head	= NULL;
472 	so->so_rcv_wanted	= 0;
473 	so->so_rcv_timer_interval = SOCKET_NO_RCVTIMER;
474 	so->so_rcv_timer_tid	= 0;
475 	so->so_rcv_thresh	= 0;
476 
477 	list_create(&so->so_acceptq_list, sizeof (struct sonode),
478 	    offsetof(struct sonode, so_acceptq_node));
479 	list_create(&so->so_acceptq_defer, sizeof (struct sonode),
480 	    offsetof(struct sonode, so_acceptq_node));
481 	list_link_init(&so->so_acceptq_node);
482 	so->so_acceptq_len	= 0;
483 	so->so_backlog		= 0;
484 	so->so_listener		= NULL;
485 
486 	so->so_snd_qfull	= B_FALSE;
487 
488 	so->so_filter_active	= 0;
489 	so->so_filter_tx	= 0;
490 	so->so_filter_defertime = 0;
491 	so->so_filter_top	= NULL;
492 	so->so_filter_bottom	= NULL;
493 
494 	mutex_init(&so->so_lock, NULL, MUTEX_DEFAULT, NULL);
495 	mutex_init(&so->so_acceptq_lock, NULL, MUTEX_DEFAULT, NULL);
496 	rw_init(&so->so_fallback_rwlock, NULL, RW_DEFAULT, NULL);
497 	cv_init(&so->so_state_cv, NULL, CV_DEFAULT, NULL);
498 	cv_init(&so->so_single_cv, NULL, CV_DEFAULT, NULL);
499 	cv_init(&so->so_read_cv, NULL, CV_DEFAULT, NULL);
500 
501 	cv_init(&so->so_acceptq_cv, NULL, CV_DEFAULT, NULL);
502 	cv_init(&so->so_snd_cv, NULL, CV_DEFAULT, NULL);
503 	cv_init(&so->so_rcv_cv, NULL, CV_DEFAULT, NULL);
504 	cv_init(&so->so_copy_cv, NULL, CV_DEFAULT, NULL);
505 	cv_init(&so->so_closing_cv, NULL, CV_DEFAULT, NULL);
506 
507 	return (0);
508 }
509 
510 /*ARGSUSED*/
511 void
512 sonode_destructor(void *buf, void *cdrarg)
513 {
514 	struct sonode *so = buf;
515 	struct vnode *vp = SOTOV(so);
516 
517 	ASSERT(so->so_priv == NULL);
518 	ASSERT(so->so_peercred == NULL);
519 
520 	ASSERT(so->so_oobmsg == NULL);
521 
522 	ASSERT(so->so_rcv_q_head == NULL);
523 
524 	list_destroy(&so->so_acceptq_list);
525 	list_destroy(&so->so_acceptq_defer);
526 	ASSERT(!list_link_active(&so->so_acceptq_node));
527 	ASSERT(so->so_listener == NULL);
528 
529 	ASSERT(so->so_filter_active == 0);
530 	ASSERT(so->so_filter_tx == 0);
531 	ASSERT(so->so_filter_top == NULL);
532 	ASSERT(so->so_filter_bottom == NULL);
533 
534 	ASSERT(vp->v_data == so);
535 	ASSERT(vn_matchops(vp, socket_vnodeops));
536 
537 	vn_free(vp);
538 
539 	mutex_destroy(&so->so_lock);
540 	mutex_destroy(&so->so_acceptq_lock);
541 	rw_destroy(&so->so_fallback_rwlock);
542 
543 	cv_destroy(&so->so_state_cv);
544 	cv_destroy(&so->so_single_cv);
545 	cv_destroy(&so->so_read_cv);
546 	cv_destroy(&so->so_acceptq_cv);
547 	cv_destroy(&so->so_snd_cv);
548 	cv_destroy(&so->so_rcv_cv);
549 	cv_destroy(&so->so_closing_cv);
550 }
551 
552 void
553 sonode_init(struct sonode *so, struct sockparams *sp, int family,
554     int type, int protocol, sonodeops_t *sops)
555 {
556 	vnode_t *vp;
557 
558 	vp = SOTOV(so);
559 
560 	so->so_flag	= 0;
561 
562 	so->so_state	= 0;
563 	so->so_mode	= 0;
564 
565 	so->so_count	= 0;
566 
567 	so->so_family	= family;
568 	so->so_type	= type;
569 	so->so_protocol	= protocol;
570 
571 	SOCK_CONNID_INIT(so->so_proto_connid);
572 
573 	so->so_options	= 0;
574 	so->so_linger.l_onoff   = 0;
575 	so->so_linger.l_linger = 0;
576 	so->so_sndbuf	= 0;
577 	so->so_error	= 0;
578 	so->so_rcvtimeo	= 0;
579 	so->so_sndtimeo = 0;
580 	so->so_xpg_rcvbuf = 0;
581 
582 	ASSERT(so->so_oobmsg == NULL);
583 	so->so_oobmark	= 0;
584 	so->so_pgrp	= 0;
585 
586 	ASSERT(so->so_peercred == NULL);
587 
588 	so->so_zoneid = getzoneid();
589 
590 	so->so_sockparams = sp;
591 
592 	so->so_ops = sops;
593 
594 	so->so_not_str = (sops != &sotpi_sonodeops);
595 
596 	so->so_proto_handle = NULL;
597 
598 	so->so_downcalls = NULL;
599 
600 	so->so_copyflag = 0;
601 
602 	vn_reinit(vp);
603 	vp->v_vfsp	= rootvfs;
604 	vp->v_type	= VSOCK;
605 	vp->v_rdev	= sockdev;
606 
607 	so->so_snd_qfull = B_FALSE;
608 	so->so_minpsz = 0;
609 
610 	so->so_rcv_wakeup = B_FALSE;
611 	so->so_snd_wakeup = B_FALSE;
612 	so->so_flowctrld = B_FALSE;
613 
614 	so->so_pollev = 0;
615 	bzero(&so->so_poll_list, sizeof (so->so_poll_list));
616 	bzero(&so->so_proto_props, sizeof (struct sock_proto_props));
617 
618 	bzero(&(so->so_ksock_callbacks), sizeof (ksocket_callbacks_t));
619 	so->so_ksock_cb_arg = NULL;
620 
621 	so->so_max_addr_len = sizeof (struct sockaddr_storage);
622 
623 	so->so_direct = NULL;
624 
625 	vn_exists(vp);
626 }
627 
628 void
629 sonode_fini(struct sonode *so)
630 {
631 	vnode_t *vp;
632 
633 	ASSERT(so->so_count == 0);
634 
635 	if (so->so_rcv_timer_tid) {
636 		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
637 		(void) untimeout(so->so_rcv_timer_tid);
638 		so->so_rcv_timer_tid = 0;
639 	}
640 
641 	if (so->so_poll_list.ph_list != NULL) {
642 		pollwakeup(&so->so_poll_list, POLLERR);
643 		pollhead_clean(&so->so_poll_list);
644 	}
645 
646 	if (so->so_direct != NULL)
647 		sod_sock_fini(so);
648 
649 	vp = SOTOV(so);
650 	vn_invalid(vp);
651 
652 	if (so->so_peercred != NULL) {
653 		crfree(so->so_peercred);
654 		so->so_peercred = NULL;
655 	}
656 	/* Detach and destroy filters */
657 	if (so->so_filter_top != NULL)
658 		sof_sonode_cleanup(so);
659 
660 	ASSERT(list_is_empty(&so->so_acceptq_list));
661 	ASSERT(list_is_empty(&so->so_acceptq_defer));
662 	ASSERT(!list_link_active(&so->so_acceptq_node));
663 
664 	ASSERT(so->so_rcv_queued == 0);
665 	ASSERT(so->so_rcv_q_head == NULL);
666 	ASSERT(so->so_rcv_q_last_head == NULL);
667 	ASSERT(so->so_rcv_head == NULL);
668 	ASSERT(so->so_rcv_last_head == NULL);
669 }
670