xref: /illumos-gate/usr/src/uts/common/fs/sockfs/sockcommon.c (revision 2833423dc59f4c35fe4713dbb942950c82df0437)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright 2015, Joyent, Inc.
25  * Copyright 2017 Sebastian Wiedenroth
26  * Copyright 2022 Garrett D'Amore
27  */
28 
29 #include <sys/types.h>
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/sysmacros.h>
33 #include <sys/debug.h>
34 #include <sys/cmn_err.h>
35 #include <sys/vfs.h>
36 #include <sys/policy.h>
37 #include <sys/modctl.h>
38 
39 #include <sys/sunddi.h>
40 
41 #include <sys/strsun.h>
42 #include <sys/stropts.h>
43 #include <sys/strsubr.h>
44 #include <sys/socket.h>
45 #include <sys/socketvar.h>
46 #include <sys/uio.h>
47 
48 #include <inet/ipclassifier.h>
49 #include <fs/sockfs/sockcommon.h>
50 #include <fs/sockfs/sockfilter_impl.h>
51 #include <fs/sockfs/socktpi.h>
52 #include <fs/sockfs/sodirect.h>
53 #include <inet/ip.h>
54 
55 extern int xnet_skip_checks, xnet_check_print, xnet_truncate_print;
56 
57 /*
58  * Common socket access functions.
59  *
60  * Instead of accessing the sonode switch directly (i.e., SOP_xxx()),
61  * the socket_xxx() function should be used.
62  */
63 
64 /*
65  * Try to create a new sonode of the requested <family, type, protocol>.
66  */
67 /* ARGSUSED */
68 struct sonode *
69 socket_create(int family, int type, int protocol, char *devpath, char *mod,
70     int flags, int version, struct cred *cr, int *errorp)
71 {
72 	struct sonode *so;
73 	struct sockparams *sp = NULL;
74 	int saved_error;
75 
76 	/*
77 	 * Look for a sockparams entry that match the given criteria.
78 	 * solookup() returns with the entry held.
79 	 */
80 	*errorp = solookup(family, type, protocol, &sp);
81 	saved_error = *errorp;
82 	if (sp == NULL) {
83 		int kmflags = (flags == SOCKET_SLEEP) ? KM_SLEEP : KM_NOSLEEP;
84 		/*
85 		 * There is no matching sockparams entry. An ephemeral entry is
86 		 * created if the caller specifies a device or a socket module.
87 		 */
88 		if (devpath != NULL) {
89 			saved_error = 0;
90 			sp = sockparams_hold_ephemeral_bydev(family, type,
91 			    protocol, devpath, kmflags, errorp);
92 		} else if (mod != NULL) {
93 			saved_error = 0;
94 			sp = sockparams_hold_ephemeral_bymod(family, type,
95 			    protocol, mod, kmflags, errorp);
96 		} else {
97 			*errorp = solookup(family, type, 0, &sp);
98 		}
99 
100 		if (sp == NULL) {
101 			if (saved_error && (*errorp == EPROTONOSUPPORT ||
102 			    *errorp == EPROTOTYPE || *errorp == ENOPROTOOPT))
103 				*errorp = saved_error;
104 			return (NULL);
105 		}
106 	}
107 
108 	ASSERT(sp->sp_smod_info != NULL);
109 	ASSERT(flags == SOCKET_SLEEP || flags == SOCKET_NOSLEEP);
110 	sp->sp_stats.sps_ncreate.value.ui64++;
111 	so = sp->sp_smod_info->smod_sock_create_func(sp, family, type,
112 	    protocol, version, flags, errorp, cr);
113 	if (so == NULL) {
114 		SOCKPARAMS_DEC_REF(sp);
115 	} else {
116 		if ((*errorp = SOP_INIT(so, NULL, cr, flags)) == 0) {
117 			/* Cannot fail, only bumps so_count */
118 			(void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, cr, NULL);
119 		} else {
120 			if (saved_error && (*errorp == EPROTONOSUPPORT ||
121 			    *errorp == EPROTOTYPE || *errorp == ENOPROTOOPT))
122 				*errorp = saved_error;
123 			socket_destroy(so);
124 			so = NULL;
125 		}
126 	}
127 	return (so);
128 }
129 
130 struct sonode *
131 socket_newconn(struct sonode *parent, sock_lower_handle_t lh,
132     sock_downcalls_t *dc, int flags, int *errorp)
133 {
134 	struct sonode *so;
135 	struct sockparams *sp;
136 	struct cred *cr;
137 
138 	if ((cr = CRED()) == NULL)
139 		cr = kcred;
140 
141 	sp = parent->so_sockparams;
142 	ASSERT(sp != NULL);
143 
144 	sp->sp_stats.sps_ncreate.value.ui64++;
145 	so = sp->sp_smod_info->smod_sock_create_func(sp, parent->so_family,
146 	    parent->so_type, parent->so_protocol, parent->so_version, flags,
147 	    errorp, cr);
148 	if (so != NULL) {
149 		SOCKPARAMS_INC_REF(sp);
150 
151 		so->so_proto_handle = lh;
152 		so->so_downcalls = dc;
153 		/*
154 		 * This function may be called in interrupt context, and CRED()
155 		 * will be NULL. In this case, pass in kcred.
156 		 */
157 		if ((*errorp = SOP_INIT(so, parent, cr, flags)) == 0) {
158 			/* Cannot fail, only bumps so_count */
159 			(void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, cr, NULL);
160 		} else  {
161 			socket_destroy(so);
162 			so = NULL;
163 		}
164 	}
165 
166 	return (so);
167 }
168 
169 /*
170  * Bind local endpoint.
171  */
172 int
173 socket_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
174     int flags, cred_t *cr)
175 {
176 	return (SOP_BIND(so, name, namelen, flags, cr));
177 }
178 
179 /*
180  * Turn socket into a listen socket.
181  */
182 int
183 socket_listen(struct sonode *so, int backlog, cred_t *cr)
184 {
185 	if (backlog < 0) {
186 		backlog = 0;
187 	}
188 
189 	/*
190 	 * Use the same qlimit as in BSD. BSD checks the qlimit
191 	 * before queuing the next connection implying that a
192 	 * listen(sock, 0) allows one connection to be queued.
193 	 * BSD also uses 1.5 times the requested backlog.
194 	 *
195 	 * XNS Issue 4 required a strict interpretation of the backlog.
196 	 * This has been waived subsequently for Issue 4 and the change
197 	 * incorporated in XNS Issue 5. So we aren't required to do
198 	 * anything special for XPG apps.
199 	 */
200 	if (backlog >= (INT_MAX - 1) / 3)
201 		backlog = INT_MAX;
202 	else
203 		backlog = backlog * 3 / 2 + 1;
204 
205 	return (SOP_LISTEN(so, backlog, cr));
206 }
207 
208 /*
209  * Accept incoming connection.
210  */
211 int
212 socket_accept(struct sonode *lso, int fflag, cred_t *cr, struct sonode **nsop)
213 {
214 	return (SOP_ACCEPT(lso, fflag, cr, nsop));
215 }
216 
217 /*
218  * Active open.
219  */
220 int
221 socket_connect(struct sonode *so, struct sockaddr *name,
222     socklen_t namelen, int fflag, int flags, cred_t *cr)
223 {
224 	int error;
225 
226 	/*
227 	 * Handle a connect to a name parameter of type AF_UNSPEC like a
228 	 * connect to a null address. This is the portable method to
229 	 * unconnect a socket.
230 	 */
231 	if ((namelen >= sizeof (sa_family_t)) &&
232 	    (name->sa_family == AF_UNSPEC)) {
233 		name = NULL;
234 		namelen = 0;
235 	}
236 
237 	error = SOP_CONNECT(so, name, namelen, fflag, flags, cr);
238 
239 	if (error == EHOSTUNREACH && flags & _SOCONNECT_XPG4_2) {
240 		/*
241 		 * X/Open specification contains a requirement that
242 		 * ENETUNREACH be returned but does not require
243 		 * EHOSTUNREACH. In order to keep the test suite
244 		 * happy we mess with the errno here.
245 		 */
246 		error = ENETUNREACH;
247 	}
248 
249 	return (error);
250 }
251 
252 /*
253  * Get address of remote node.
254  */
255 int
256 socket_getpeername(struct sonode *so, struct sockaddr *addr,
257     socklen_t *addrlen, boolean_t accept, cred_t *cr)
258 {
259 	ASSERT(*addrlen > 0);
260 	return (SOP_GETPEERNAME(so, addr, addrlen, accept, cr));
261 
262 }
263 
264 /*
265  * Get local address.
266  */
267 int
268 socket_getsockname(struct sonode *so, struct sockaddr *addr,
269     socklen_t *addrlen, cred_t *cr)
270 {
271 	return (SOP_GETSOCKNAME(so, addr, addrlen, cr));
272 
273 }
274 
275 /*
276  * Called from shutdown().
277  */
278 int
279 socket_shutdown(struct sonode *so, int how, cred_t *cr)
280 {
281 	return (SOP_SHUTDOWN(so, how, cr));
282 }
283 
284 /*
285  * Get socket options.
286  */
287 /*ARGSUSED*/
288 int
289 socket_getsockopt(struct sonode *so, int level, int option_name,
290     void *optval, socklen_t *optlenp, int flags, cred_t *cr)
291 {
292 	return (SOP_GETSOCKOPT(so, level, option_name, optval,
293 	    optlenp, flags, cr));
294 }
295 
296 /*
297  * Set socket options
298  */
299 int
300 socket_setsockopt(struct sonode *so, int level, int option_name,
301     const void *optval, t_uscalar_t optlen, cred_t *cr)
302 {
303 	int val = 1;
304 	/* Caller allocates aligned optval, or passes null */
305 	ASSERT(((uintptr_t)optval & (sizeof (t_scalar_t) - 1)) == 0);
306 	/* If optval is null optlen is 0, and vice-versa */
307 	ASSERT(optval != NULL || optlen == 0);
308 	ASSERT(optlen != 0 || optval == NULL);
309 
310 	if (optval == NULL && optlen == 0)
311 		optval = &val;
312 
313 	return (SOP_SETSOCKOPT(so, level, option_name, optval, optlen, cr));
314 }
315 
316 int
317 socket_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
318     cred_t *cr)
319 {
320 	int error = 0;
321 	ssize_t orig_resid = uiop->uio_resid;
322 
323 	/*
324 	 * Do not bypass the cache if we are doing a local (AF_UNIX) write.
325 	 */
326 	if (so->so_family == AF_UNIX)
327 		uiop->uio_extflg |= UIO_COPY_CACHED;
328 	else
329 		uiop->uio_extflg &= ~UIO_COPY_CACHED;
330 
331 	error = SOP_SENDMSG(so, msg, uiop, cr);
332 	switch (error) {
333 	default:
334 		break;
335 	case EINTR:
336 	case ENOMEM:
337 	/* EAGAIN is EWOULDBLOCK */
338 	case EWOULDBLOCK:
339 		/* We did a partial send */
340 		if (uiop->uio_resid != orig_resid)
341 			error = 0;
342 		break;
343 	case EPIPE:
344 		if (((so->so_mode & SM_KERNEL) == 0) &&
345 		    ((msg->msg_flags & MSG_NOSIGNAL) == 0)) {
346 			tsignal(curthread, SIGPIPE);
347 		}
348 		break;
349 	}
350 
351 	return (error);
352 }
353 
354 int
355 socket_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
356     struct cred *cr, mblk_t **mpp)
357 {
358 	int error = 0;
359 
360 	error = SOP_SENDMBLK(so, msg, fflag, cr, mpp);
361 	if (error == EPIPE) {
362 		tsignal(curthread, SIGPIPE);
363 	}
364 	return (error);
365 }
366 
367 int
368 socket_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
369     cred_t *cr)
370 {
371 	int error;
372 	ssize_t orig_resid = uiop->uio_resid;
373 
374 	/*
375 	 * Do not bypass the cache when reading data, as the application
376 	 * is likely to access the data shortly.
377 	 */
378 	uiop->uio_extflg |= UIO_COPY_CACHED;
379 
380 	error = SOP_RECVMSG(so, msg, uiop, cr);
381 
382 	switch (error) {
383 	case EINTR:
384 	/* EAGAIN is EWOULDBLOCK */
385 	case EWOULDBLOCK:
386 		/* We did a partial read */
387 		if (uiop->uio_resid != orig_resid)
388 			error = 0;
389 		break;
390 	default:
391 		break;
392 	}
393 	return (error);
394 }
395 
/*
 * Perform an ioctl on `so' by dispatching to its SOP_IOCTL operation.
 */
int
socket_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
    struct cred *cr, int32_t *rvalp)
{
	int error;

	error = SOP_IOCTL(so, cmd, arg, mode, cr, rvalp);
	return (error);
}
402 
/*
 * Poll `so' by dispatching to its SOP_POLL operation.
 */
int
socket_poll(struct sonode *so, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	int error;

	error = SOP_POLL(so, events, anyyet, reventsp, phpp);
	return (error);
}
409 
/*
 * Close `so' through the vnode layer by calling VOP_CLOSE on its vnode
 * with a count of 1 and an offset of 0.
 */
int
socket_close(struct sonode *so, int flag, struct cred *cr)
{
	struct vnode *vp = SOTOV(so);

	return (VOP_CLOSE(vp, flag, 1, 0, cr, NULL));
}
415 
416 int
417 socket_close_internal(struct sonode *so, int flag, cred_t *cr)
418 {
419 	ASSERT(so->so_count == 0);
420 
421 	return (SOP_CLOSE(so, flag, cr));
422 }
423 
/*
 * Destroy `so': mark its vnode invalid and release the hold on it.
 */
void
socket_destroy(struct sonode *so)
{
	struct vnode *vp = SOTOV(so);

	vn_invalid(vp);
	VN_RELE(vp);
}
430 
431 /* ARGSUSED */
432 void
433 socket_destroy_internal(struct sonode *so, cred_t *cr)
434 {
435 	struct sockparams *sp = so->so_sockparams;
436 	ASSERT(so->so_count == 0 && sp != NULL);
437 
438 	sp->sp_smod_info->smod_sock_destroy_func(so);
439 
440 	SOCKPARAMS_DEC_REF(sp);
441 }
442 
443 /*
444  * TODO Once the common vnode ops is available, then the vnops argument
445  * should be removed.
446  */
447 /*ARGSUSED*/
448 int
449 sonode_constructor(void *buf, void *cdrarg, int kmflags)
450 {
451 	struct sonode *so = buf;
452 	struct vnode *vp;
453 
454 	vp = so->so_vnode = vn_alloc(kmflags);
455 	if (vp == NULL) {
456 		return (-1);
457 	}
458 	vp->v_data = so;
459 	vn_setops(vp, socket_vnodeops);
460 
461 	so->so_priv		= NULL;
462 	so->so_oobmsg		= NULL;
463 
464 	so->so_proto_handle	= NULL;
465 
466 	so->so_peercred		= NULL;
467 
468 	so->so_rcv_queued	= 0;
469 	so->so_rcv_q_head	= NULL;
470 	so->so_rcv_q_last_head	= NULL;
471 	so->so_rcv_head		= NULL;
472 	so->so_rcv_last_head	= NULL;
473 	so->so_rcv_wanted	= 0;
474 	so->so_rcv_timer_interval = SOCKET_NO_RCVTIMER;
475 	so->so_rcv_timer_tid	= 0;
476 	so->so_rcv_thresh	= 0;
477 
478 	list_create(&so->so_acceptq_list, sizeof (struct sonode),
479 	    offsetof(struct sonode, so_acceptq_node));
480 	list_create(&so->so_acceptq_defer, sizeof (struct sonode),
481 	    offsetof(struct sonode, so_acceptq_node));
482 	list_link_init(&so->so_acceptq_node);
483 	so->so_acceptq_len	= 0;
484 	so->so_backlog		= 0;
485 	so->so_listener		= NULL;
486 
487 	so->so_snd_qfull	= B_FALSE;
488 
489 	so->so_filter_active	= 0;
490 	so->so_filter_tx	= 0;
491 	so->so_filter_defertime = 0;
492 	so->so_filter_top	= NULL;
493 	so->so_filter_bottom	= NULL;
494 
495 	mutex_init(&so->so_lock, NULL, MUTEX_DEFAULT, NULL);
496 	mutex_init(&so->so_acceptq_lock, NULL, MUTEX_DEFAULT, NULL);
497 	rw_init(&so->so_fallback_rwlock, NULL, RW_DEFAULT, NULL);
498 	cv_init(&so->so_state_cv, NULL, CV_DEFAULT, NULL);
499 	cv_init(&so->so_single_cv, NULL, CV_DEFAULT, NULL);
500 	cv_init(&so->so_read_cv, NULL, CV_DEFAULT, NULL);
501 
502 	cv_init(&so->so_acceptq_cv, NULL, CV_DEFAULT, NULL);
503 	cv_init(&so->so_snd_cv, NULL, CV_DEFAULT, NULL);
504 	cv_init(&so->so_rcv_cv, NULL, CV_DEFAULT, NULL);
505 	cv_init(&so->so_copy_cv, NULL, CV_DEFAULT, NULL);
506 	cv_init(&so->so_closing_cv, NULL, CV_DEFAULT, NULL);
507 
508 	so->so_krecv_cb = NULL;
509 	so->so_krecv_arg = NULL;
510 
511 	return (0);
512 }
513 
514 /*ARGSUSED*/
515 void
516 sonode_destructor(void *buf, void *cdrarg)
517 {
518 	struct sonode *so = buf;
519 	struct vnode *vp = SOTOV(so);
520 
521 	ASSERT(so->so_priv == NULL);
522 	ASSERT(so->so_peercred == NULL);
523 
524 	ASSERT(so->so_oobmsg == NULL);
525 
526 	ASSERT(so->so_rcv_q_head == NULL);
527 
528 	list_destroy(&so->so_acceptq_list);
529 	list_destroy(&so->so_acceptq_defer);
530 	ASSERT(!list_link_active(&so->so_acceptq_node));
531 	ASSERT(so->so_listener == NULL);
532 
533 	ASSERT(so->so_filter_active == 0);
534 	ASSERT(so->so_filter_tx == 0);
535 	ASSERT(so->so_filter_top == NULL);
536 	ASSERT(so->so_filter_bottom == NULL);
537 
538 	ASSERT(vp->v_data == so);
539 	ASSERT(vn_matchops(vp, socket_vnodeops));
540 
541 	vn_free(vp);
542 
543 	mutex_destroy(&so->so_lock);
544 	mutex_destroy(&so->so_acceptq_lock);
545 	rw_destroy(&so->so_fallback_rwlock);
546 
547 	cv_destroy(&so->so_state_cv);
548 	cv_destroy(&so->so_single_cv);
549 	cv_destroy(&so->so_read_cv);
550 	cv_destroy(&so->so_acceptq_cv);
551 	cv_destroy(&so->so_snd_cv);
552 	cv_destroy(&so->so_rcv_cv);
553 	cv_destroy(&so->so_closing_cv);
554 }
555 
556 void
557 sonode_init(struct sonode *so, struct sockparams *sp, int family,
558     int type, int protocol, sonodeops_t *sops)
559 {
560 	vnode_t *vp;
561 
562 	vp = SOTOV(so);
563 
564 	so->so_flag	= 0;
565 
566 	so->so_state	= 0;
567 	so->so_mode	= 0;
568 
569 	so->so_count	= 0;
570 
571 	so->so_family	= family;
572 	so->so_type	= type;
573 	so->so_protocol	= protocol;
574 
575 	SOCK_CONNID_INIT(so->so_proto_connid);
576 
577 	so->so_options	= 0;
578 	so->so_linger.l_onoff   = 0;
579 	so->so_linger.l_linger = 0;
580 	so->so_sndbuf	= 0;
581 	so->so_error	= 0;
582 	so->so_rcvtimeo	= 0;
583 	so->so_sndtimeo = 0;
584 	so->so_xpg_rcvbuf = 0;
585 
586 	ASSERT(so->so_oobmsg == NULL);
587 	so->so_oobmark	= 0;
588 	so->so_pgrp	= 0;
589 
590 	ASSERT(so->so_peercred == NULL);
591 
592 	so->so_zoneid = getzoneid();
593 
594 	so->so_sockparams = sp;
595 
596 	so->so_ops = sops;
597 
598 	so->so_not_str = (sops != &sotpi_sonodeops);
599 
600 	so->so_proto_handle = NULL;
601 
602 	so->so_downcalls = NULL;
603 
604 	so->so_copyflag = 0;
605 
606 	vn_reinit(vp);
607 	vp->v_vfsp	= sock_vfsp;
608 	vp->v_type	= VSOCK;
609 	vp->v_rdev	= sockdev;
610 
611 	so->so_snd_qfull = B_FALSE;
612 	so->so_minpsz = 0;
613 
614 	so->so_rcv_wakeup = B_FALSE;
615 	so->so_snd_wakeup = B_FALSE;
616 	so->so_flowctrld = B_FALSE;
617 
618 	so->so_pollev = 0;
619 	bzero(&so->so_poll_list, sizeof (so->so_poll_list));
620 	bzero(&so->so_proto_props, sizeof (struct sock_proto_props));
621 
622 	bzero(&(so->so_ksock_callbacks), sizeof (ksocket_callbacks_t));
623 	so->so_ksock_cb_arg = NULL;
624 
625 	so->so_max_addr_len = sizeof (struct sockaddr_storage);
626 
627 	so->so_direct = NULL;
628 
629 	vn_exists(vp);
630 }
631 
632 void
633 sonode_fini(struct sonode *so)
634 {
635 	vnode_t *vp;
636 
637 	ASSERT(so->so_count == 0);
638 
639 	if (so->so_rcv_timer_tid) {
640 		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
641 		(void) untimeout(so->so_rcv_timer_tid);
642 		so->so_rcv_timer_tid = 0;
643 	}
644 
645 	if (so->so_poll_list.ph_list != NULL) {
646 		pollwakeup(&so->so_poll_list, POLLERR);
647 		pollhead_clean(&so->so_poll_list);
648 	}
649 
650 	if (so->so_direct != NULL)
651 		sod_sock_fini(so);
652 
653 	vp = SOTOV(so);
654 	vn_invalid(vp);
655 
656 	if (so->so_peercred != NULL) {
657 		crfree(so->so_peercred);
658 		so->so_peercred = NULL;
659 	}
660 	/* Detach and destroy filters */
661 	if (so->so_filter_top != NULL)
662 		sof_sonode_cleanup(so);
663 
664 	/* Clean up any remnants of krecv callbacks */
665 	so->so_krecv_cb = NULL;
666 	so->so_krecv_arg = NULL;
667 
668 	ASSERT(list_is_empty(&so->so_acceptq_list));
669 	ASSERT(list_is_empty(&so->so_acceptq_defer));
670 	ASSERT(!list_link_active(&so->so_acceptq_node));
671 
672 	ASSERT(so->so_rcv_queued == 0);
673 	ASSERT(so->so_rcv_q_head == NULL);
674 	ASSERT(so->so_rcv_q_last_head == NULL);
675 	ASSERT(so->so_rcv_head == NULL);
676 	ASSERT(so->so_rcv_last_head == NULL);
677 }
678