xref: /titanic_51/usr/src/uts/common/sys/socketvar.h (revision 59596c01ca1b980a016d25670874f53e64c27ec0)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 1996, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 /*
30  * University Copyright- Copyright (c) 1982, 1986, 1988
31  * The Regents of the University of California
32  * All Rights Reserved
33  *
34  * University Acknowledgment- Portions of this document are derived from
35  * software developed by the University of California, Berkeley, and its
36  * contributors.
37  */
38 /*
39  * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
40  */
41 
42 #ifndef _SYS_SOCKETVAR_H
43 #define	_SYS_SOCKETVAR_H
44 
45 #include <sys/types.h>
46 #include <sys/stream.h>
47 #include <sys/t_lock.h>
48 #include <sys/cred.h>
49 #include <sys/vnode.h>
50 #include <sys/file.h>
51 #include <sys/param.h>
52 #include <sys/zone.h>
53 #include <sys/sdt.h>
54 #include <sys/modctl.h>
55 #include <sys/atomic.h>
56 #include <sys/socket.h>
57 #include <sys/ksocket.h>
58 #include <sys/kstat.h>
59 
60 #ifdef _KERNEL
61 #include <sys/vfs_opreg.h>
62 #endif
63 
64 #ifdef	__cplusplus
65 extern "C" {
66 #endif
67 
68 /*
69  * Internal representation of the address used to represent addresses
70  * in the loopback transport for AF_UNIX. While the sockaddr_un is used
71  * as the sockfs layer address for AF_UNIX the pathnames contained in
72  * these addresses are not unique (due to relative pathnames) thus can not
73  * be used in the transport.
74  *
75  * The transport level address consists of a magic number (used to separate the
76  * name space for specific and implicit binds). For a specific bind
77  * this is followed by a "vnode *" which ensures that all specific binds
78  * have a unique transport level address. For implicit binds the latter
79  * part of the address is a byte string (of the same length as a pointer)
80  * that is assigned by the loopback transport.
81  *
82  * The uniqueness assumes that the loopback transport has a separate namespace
83  * for sockets in order to avoid name conflicts with e.g. TLI use of the
84  * same transport.
85  */
/*
 * Transport-level AF_UNIX address body: a pointer-sized value plus the
 * magic number that tells how that value was produced.
 */
struct so_ux_addr {
	void	*soua_vp;	/* vnode pointer or assigned by tl */
	uint_t	soua_magic;	/* SOU_MAGIC_EXPLICIT or SOU_MAGIC_IMPLICIT */
};
90 
/*
 * Values for soua_magic. The magic separates the name space of specific
 * (explicit) binds from that of implicit binds.
 */
#define	SOU_MAGIC_EXPLICIT	0x75787670	/* "uxvp" */
#define	SOU_MAGIC_IMPLICIT	0x616e6f6e	/* "anon" */
93 
/*
 * Complete transport-level AF_UNIX address: the address family followed by
 * the so_ux_addr body.
 */
struct sockaddr_ux {
	sa_family_t		sou_family;	/* AF_UNIX */
	struct so_ux_addr	sou_addr;
};
98 
99 #if defined(_KERNEL) || defined(_KMEMUSER)
100 
101 #include <sys/socket_proto.h>
102 
/* Short names for the operations vector and the socket node defined below. */
typedef struct sonodeops sonodeops_t;
typedef struct sonode sonode_t;

/* Only referenced through a pointer (so_direct) in this header. */
struct sodirect_s;
107 
108 /*
 * The sonode represents a socket. A sonode never exists in the file system
110  * name space and can not be opened using open() - only the socket, socketpair
111  * and accept calls create sonodes.
112  *
113  * The locking of sockfs uses the so_lock mutex plus the SOLOCKED and
114  * SOREADLOCKED flags in so_flag. The mutex protects all the state in the
115  * sonode. It is expected that the underlying transport protocol serializes
 * socket operations, so sockfs will normally not single-thread
117  * operations. However, certain sockets, including TPI based ones, can only
118  * handle one control operation at a time. The SOLOCKED flag is used to
119  * single-thread operations from sockfs users to prevent e.g. multiple bind()
120  * calls to operate on the same sonode concurrently. The SOREADLOCKED flag is
121  * used to ensure that only one thread sleeps in kstrgetmsg for a given
122  * sonode. This is needed to ensure atomic operation for things like
123  * MSG_WAITALL.
124  *
125  * The so_fallback_rwlock is used to ensure that for sockets that can
126  * fall back to TPI, the fallback is not initiated until all pending
127  * operations have completed.
128  *
129  * Note that so_lock is sometimes held across calls that might go to sleep
130  * (kmem_alloc and soallocproto*). This implies that no other lock in
131  * the system should be held when calling into sockfs; from the system call
132  * side or from strrput (in case of TPI based sockets). If locks are held
133  * while calling into sockfs the system might hang when running low on memory.
134  */
/*
 * Per-socket state. The so_* names that are #defined inside the structure
 * are aliases for members of the embedded so_proto_props.
 */
struct sonode {
	struct	vnode	*so_vnode;	/* vnode associated with this sonode */

	sonodeops_t 	*so_ops;	/* operations vector for this sonode */
	void		*so_priv;	/* sonode private data */

	/* Taken as reader by SO_BLOCK_FALLBACK(), see below */
	krwlock_t	so_fallback_rwlock;
	kmutex_t	so_lock;	/* protects sonode fields */

	kcondvar_t	so_state_cv;	/* synchronize state changes */
	kcondvar_t	so_single_cv;	/* wait due to SOLOCKED */
	kcondvar_t	so_read_cv;	/* wait due to SOREADLOCKED */

	/* These fields are protected by so_lock */

	uint_t		so_state;	/* internal state flags SS_*, below */
	uint_t		so_mode;	/* characteristics on socket. SM_* */
	ushort_t 	so_flag;	/* flags, see below */
	int		so_count;	/* count of opened references */

	sock_connid_t	so_proto_connid; /* protocol generation number */

	ushort_t 	so_error;	/* error affecting connection */

	struct sockparams *so_sockparams;	/* vnode or socket module */
	/* Needed to recreate the same socket for accept */
	short	so_family;
	short	so_type;
	short	so_protocol;
	short	so_version;		/* From so_socket call */

	/* Accept queue */
	kmutex_t	so_acceptq_lock;	/* protects accept queue */
	list_t		so_acceptq_list;	/* pending conns */
	list_t		so_acceptq_defer;	/* deferred conns */
	list_node_t	so_acceptq_node;	/* acceptq list node */
	unsigned int	so_acceptq_len;		/* # of conns (both lists) */
	unsigned int	so_backlog;		/* Listen backlog */
	kcondvar_t	so_acceptq_cv;		/* wait for new conn. */
	struct sonode	*so_listener;		/* parent socket */

	/* Options */
	short	so_options;		/* From socket call, see socket.h */
	struct linger	so_linger;	/* SO_LINGER value */
#define	so_sndbuf	so_proto_props.sopp_txhiwat	/* SO_SNDBUF value */
#define	so_sndlowat	so_proto_props.sopp_txlowat	/* tx low water mark */
#define	so_rcvbuf	so_proto_props.sopp_rxhiwat	/* SO_RCVBUF value */
#define	so_rcvlowat	so_proto_props.sopp_rxlowat	/* rx low water mark */
#define	so_max_addr_len	so_proto_props.sopp_maxaddrlen
#define	so_minpsz	so_proto_props.sopp_minpsz
#define	so_maxpsz	so_proto_props.sopp_maxpsz

	int	so_xpg_rcvbuf;		/* SO_RCVBUF value for XPG4 socket */
	clock_t	so_sndtimeo;		/* send timeout */
	clock_t	so_rcvtimeo;		/* recv timeout */

	mblk_t	*so_oobmsg;		/* outofline oob data */
	ssize_t	so_oobmark;		/* offset of the oob data */

	pid_t	so_pgrp;		/* pgrp for signals */

	cred_t		*so_peercred;	/* connected socket peer cred */
	pid_t		so_cpid;	/* connected socket peer cached pid */
	zoneid_t	so_zoneid;	/* opener's zoneid */

	struct pollhead	so_poll_list;	/* common pollhead */
	short		so_pollev;	/* events that should be generated */

	/* Receive */
	unsigned int	so_rcv_queued;	/* # bytes on both rcv lists */
	mblk_t		*so_rcv_q_head;	/* processing/copyout rcv queue */
	/* NOTE(review): presumably the tail of so_rcv_q_head - confirm */
	mblk_t		*so_rcv_q_last_head;
	mblk_t		*so_rcv_head;	/* protocol prequeue */
	mblk_t		*so_rcv_last_head;	/* last mblk in b_next chain */
	kcondvar_t	so_rcv_cv;	/* wait for data */
	uint_t		so_rcv_wanted;	/* # of bytes wanted by app */
	/* Compared against 0 by SO_HAVE_DATA() */
	timeout_id_t	so_rcv_timer_tid;

#define	so_rcv_thresh	so_proto_props.sopp_rcvthresh
#define	so_rcv_timer_interval so_proto_props.sopp_rcvtimer

	kcondvar_t	so_snd_cv;	/* wait for snd buffers */
	uint32_t
		so_snd_qfull: 1,	/* Transmit full */
		so_rcv_wakeup: 1,
		so_snd_wakeup: 1,
		so_not_str: 1,	/* B_TRUE if not streams based socket */
		so_pad_to_bit_31: 28;

	/* Communication channel with protocol */
	sock_lower_handle_t	so_proto_handle;
	sock_downcalls_t 	*so_downcalls;

	struct sock_proto_props	so_proto_props; /* protocol settings */
	boolean_t		so_flowctrld;	/* Flow controlled */
	uint_t			so_copyflag;	/* Copy related flag */
	kcondvar_t		so_copy_cv;	/* Copy cond variable */

	/* kernel sockets */
	ksocket_callbacks_t 	so_ksock_callbacks;
	void			*so_ksock_cb_arg;	/* callback argument */
	kcondvar_t		so_closing_cv;

	/* != NULL for sodirect enabled socket */
	struct sodirect_s	*so_direct;

	/* socket filters */
	uint_t			so_filter_active;	/* # of active fil */
	uint_t			so_filter_tx;		/* pending tx ops */
	struct sof_instance	*so_filter_top;		/* top of stack */
	struct sof_instance	*so_filter_bottom;	/* bottom of stack */
	clock_t			so_filter_defertime;	/* time when deferred */
};
248 
/*
 * SO_HAVE_DATA(so) is non-zero when either
 *   - so_rcv_timer_tid is 0 and at least one of so_rcv_head and
 *     so_rcv_q_head is non-NULL, or
 *   - SS_CANTRCVMORE is set in so_state.
 *
 * For the (tid == 0) case we must check so_rcv_{q_,}head rather than
 * (so_rcv_queued > 0), since the latter does not take into account mblks
 * with only control/name information.
 *
 * The whole expansion is parenthesized so that it behaves as a single
 * operand, e.g. in !SO_HAVE_DATA(so) or (x && SO_HAVE_DATA(so)).
 */
#define	SO_HAVE_DATA(so)						\
	(((so)->so_rcv_timer_tid == 0 &&				\
	((so)->so_rcv_head != NULL || (so)->so_rcv_q_head != NULL)) ||	\
	((so)->so_state & SS_CANTRCVMORE))
258 
/*
 * Events handled by the protocol (in case sd_poll is set).
 * The mask is the union of POLLIN, POLLRDNORM and POLLRDBAND.
 */
#define	SO_PROTO_POLLEV		(POLLIN|POLLRDNORM|POLLRDBAND)
263 
264 
265 #endif /* _KERNEL || _KMEMUSER */
266 
/* flags (bits for so_flag in struct sonode) */
#define	SOMOD		0x0001		/* update socket modification time */
#define	SOACC		0x0002		/* update socket access time */

#define	SOLOCKED	0x0010		/* used to serialize open/closes */
#define	SOREADLOCKED	0x0020		/* serialize kstrgetmsg calls */
#define	SOCLONE		0x0040		/* child of clone driver */
#define	SOASYNC_UNBIND	0x0080		/* wait for ACK of async unbind */

/* Non-zero when the so_not_str bit of the sonode is set. */
#define	SOCK_IS_NONSTR(so)	((so)->so_not_str)
277 
/*
 * Socket state bits (kept in so_state of struct sonode).
 */
#define	SS_ISCONNECTED		0x00000001 /* socket connected to a peer */
#define	SS_ISCONNECTING		0x00000002 /* in process, connecting to peer */
#define	SS_ISDISCONNECTING	0x00000004 /* in process of disconnecting */
#define	SS_CANTSENDMORE		0x00000008 /* can't send more data to peer */

#define	SS_CANTRCVMORE		0x00000010 /* can't receive more data */
#define	SS_ISBOUND		0x00000020 /* socket is bound */
#define	SS_NDELAY		0x00000040 /* FNDELAY non-blocking */
#define	SS_NONBLOCK		0x00000080 /* O_NONBLOCK non-blocking */

#define	SS_ASYNC		0x00000100 /* async i/o notify */
#define	SS_ACCEPTCONN		0x00000200 /* listen done */
/*	unused			0x00000400 */	/* was SS_HASCONNIND */
#define	SS_SAVEDEOR		0x00000800 /* Saved MSG_EOR rcv side state */

#define	SS_RCVATMARK		0x00001000 /* at mark on input */
#define	SS_OOBPEND		0x00002000 /* OOB pending or present - poll */
#define	SS_HAVEOOBDATA		0x00004000 /* OOB data present */
#define	SS_HADOOBDATA		0x00008000 /* OOB data consumed */
#define	SS_CLOSING		0x00010000 /* in process of closing */

#define	SS_FIL_DEFER		0x00020000 /* filter deferred notification */
#define	SS_FILOP_OK		0x00040000 /* socket can attach filters */
#define	SS_FIL_RCV_FLOWCTRL	0x00080000 /* filter asserted rcv flow ctrl */
#define	SS_FIL_SND_FLOWCTRL	0x00100000 /* filter asserted snd flow ctrl */
#define	SS_FIL_STOP		0x00200000 /* no more filter actions */

#define	SS_SODIRECT		0x00400000 /* transport supports sodirect */

/* 0x00800000 is not assigned in this list */
#define	SS_SENTLASTREADSIG	0x01000000 /* last rx signal has been sent */
#define	SS_SENTLASTWRITESIG	0x02000000 /* last tx signal has been sent */

/* 0x04000000 through 0x10000000 are not assigned in this list */
#define	SS_FALLBACK_DRAIN	0x20000000 /* data was/is being drained */
#define	SS_FALLBACK_PENDING	0x40000000 /* fallback is pending */
#define	SS_FALLBACK_COMP	0x80000000 /* fallback has completed */
316 
317 
/*
 * Set of states when the socket can't be rebound: any connect/disconnect
 * state, either direction shut down, or a listening socket.
 */
#define	SS_CANTREBIND	(SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING|\
			    SS_CANTSENDMORE|SS_CANTRCVMORE|SS_ACCEPTCONN)
321 
322 /*
323  * Sockets that can fall back to TPI must ensure that fall back is not
324  * initiated while a thread is using a socket.
325  */
/*
 * SO_BLOCK_FALLBACK(so, fn)
 *
 * Asserts that so_lock is not held, then takes so_fallback_rwlock as a
 * reader.
 *  - If SS_FALLBACK_COMP is set, the rwlock is dropped again and the
 *    ENCLOSING function returns the value of `fn`.
 *  - Otherwise, if SS_FILOP_OK is set, it is cleared under so_lock.
 * When control continues past the macro the reader lock is still held and
 * must be released with SO_UNBLOCK_FALLBACK().
 *
 * NOTE: this expands to several statements and is not wrapped in
 * do { } while (0), so it must not be used as the body of an unbraced
 * if/else/loop.
 */
#define	SO_BLOCK_FALLBACK(so, fn)				\
	ASSERT(MUTEX_NOT_HELD(&(so)->so_lock));			\
	rw_enter(&(so)->so_fallback_rwlock, RW_READER);		\
	if ((so)->so_state & (SS_FALLBACK_COMP|SS_FILOP_OK)) {	\
		if ((so)->so_state & SS_FALLBACK_COMP) {	\
			rw_exit(&(so)->so_fallback_rwlock);	\
			return (fn);				\
		} else {					\
			mutex_enter(&(so)->so_lock);		\
			(so)->so_state &= ~SS_FILOP_OK;		\
			mutex_exit(&(so)->so_lock);		\
		}						\
	}
339 
/*
 * Release so_fallback_rwlock. Counterpart of SO_BLOCK_FALLBACK(), which
 * leaves the lock held as reader when it falls through.
 */
#define	SO_UNBLOCK_FALLBACK(so)	{			\
	rw_exit(&(so)->so_fallback_rwlock);		\
}
343 
/*
 * Non-zero when the send side is flow controlled: either the so_snd_qfull
 * bit is set or SS_FIL_SND_FLOWCTRL is set in so_state.
 */
#define	SO_SND_FLOWCTRLD(so)					\
	((so)->so_snd_qfull ||					\
	(((so)->so_state & SS_FIL_SND_FLOWCTRL) != 0))
346 
/*
 * Poll events.
 * NOTE(review): presumably the bits kept in so_pollev - confirm.
 */
#define	SO_POLLEV_IN		0x1	/* POLLIN wakeup needed */
#define	SO_POLLEV_ALWAYS	0x2	/* wakeups */
350 
/*
 * Characteristics of sockets (kept in so_mode of struct sonode).
 * Not changed after the socket is created.
 */
#define	SM_PRIV			0x001	/* privileged for broadcast, raw... */
#define	SM_ATOMIC		0x002	/* atomic data transmission */
#define	SM_ADDR			0x004	/* addresses given with messages */
#define	SM_CONNREQUIRED		0x008	/* connection required by protocol */

#define	SM_FDPASSING		0x010	/* passes file descriptors */
#define	SM_EXDATA		0x020	/* Can handle T_EXDATA_REQ */
#define	SM_OPTDATA		0x040	/* Can handle T_OPTDATA_REQ */
#define	SM_BYTESTREAM		0x080	/* Byte stream - can use M_DATA */

#define	SM_ACCEPTOR_ID		0x100	/* so_acceptor_id is valid */

#define	SM_KERNEL		0x200	/* kernel socket */

/* The modes below are only for non-streams sockets */
#define	SM_ACCEPTSUPP		0x400	/* can handle accept() */
#define	SM_SENDFILESUPP		0x800	/* Private: proto supp sendfile  */
371 
/*
 * Socket versions. Used by the socket library when calling _so_socket().
 * NOTE(review): presumably the values stored in so_version - confirm.
 */
#define	SOV_STREAM	0	/* Not a socket - just a stream */
#define	SOV_DEFAULT	1	/* Select based on so_default_version */
#define	SOV_SOCKSTREAM	2	/* Socket plus streams operations */
#define	SOV_SOCKBSD	3	/* Socket with no streams operations */
#define	SOV_XPG4_2	4	/* Xnet socket */
380 
381 #if defined(_KERNEL) || defined(_KMEMUSER)
382 
/*
 * sonode create and destroy functions.
 *
 * so_create_func_t returns a new sonode for the given sockparams; the
 * meaning of the five int arguments and of the int * is not visible in
 * this header (TODO: document against the implementations).
 * so_destroy_func_t takes the sonode to destroy and returns nothing.
 */
typedef struct sonode *(*so_create_func_t)(struct sockparams *,
    int, int, int, int, int, int *, cred_t *);
typedef void (*so_destroy_func_t)(struct sonode *);
389 
/* STREAM device information */
typedef struct sdev_info {
	char	*sd_devpath;	/* device path; NULL means "no device" */
	int	sd_devpathlen; /* Is 0 if sd_devpath is a static string */
	vnode_t	*sd_vnode;
} sdev_info_t;
396 
/*
 * Socket module interface versions.
 * NOTE(review): presumably the values expected in smod_version - confirm.
 */
#define	SOCKMOD_VERSION_1	1
#define	SOCKMOD_VERSION		2

/* name of the TPI pseudo socket module */
#define	SOTPI_SMOD_NAME		"socktpi"
402 
/*
 * Private part of a socket module registration. The members have
 * same-purpose counterparts (smod_ prefix) in smod_info_t.
 */
typedef struct __smod_priv_s {
	so_create_func_t	smodp_sock_create_func;
	so_destroy_func_t	smodp_sock_destroy_func;
	so_proto_fallback_func_t smodp_proto_fallback_func;
	const char		*smodp_fallback_devpath_v4;
	const char		*smodp_fallback_devpath_v6;
} __smod_priv_t;
410 
/*
 * Socket module register information (argument to smod_register()).
 */
typedef struct smod_reg_s {
	int		smod_version;
	char		*smod_name;
	size_t		smod_uc_version;	/* upcall version */
	size_t		smod_dc_version;	/* down call version */
	so_proto_create_func_t	smod_proto_create_func;

	/* __smod_priv must be NULL */
	__smod_priv_t	*__smod_priv;
} smod_reg_t;
424 
/*
 * Socket module information.
 * smod_refcnt is manipulated with SMOD_INC_REF()/SMOD_DEC_REF().
 */
typedef struct smod_info {
	int		smod_version;
	char		*smod_name;
	uint_t		smod_refcnt;		/* # of entries */
	size_t		smod_uc_version; 	/* upcall version */
	size_t		smod_dc_version;	/* down call version */
	so_proto_create_func_t	smod_proto_create_func;
	so_proto_fallback_func_t smod_proto_fallback_func;
	const char		*smod_fallback_devpath_v4;
	const char		*smod_fallback_devpath_v6;
	so_create_func_t	smod_sock_create_func;
	so_destroy_func_t	smod_sock_destroy_func;
	list_node_t	smod_node;
} smod_info_t;
442 
/* Named kstat counters kept per sockparams entry (sp_stats). */
typedef struct sockparams_stats {
	kstat_named_t	sps_nfallback;	/* # of fallbacks to TPI */
	kstat_named_t	sps_nactive;	/* # of active sockets */
	kstat_named_t	sps_ncreate;	/* total # of created sockets */
} sockparams_stats_t;
448 
/*
 * sockparams
 *
 * Used for mapping family/type/protocol to a socket module or STREAMS device
 *
 * References are taken and dropped with SOCKPARAMS_INC_REF() and
 * SOCKPARAMS_DEC_REF(), which operate on sp_refcnt under sp_lock.
 */
struct sockparams {
	/*
	 * The family, type, protocol, sdev_info and smod_name are
	 * set when the entry is created, and they will never change
	 * thereafter.
	 */
	int		sp_family;
	int		sp_type;
	int		sp_protocol;

	sdev_info_t	sp_sdev_info;	/* STREAM device */
	char		*sp_smod_name;	/* socket module name */

	kmutex_t	sp_lock;	/* lock for refcnt and smod_info */
	uint64_t	sp_refcnt;	/* entry reference count */
	smod_info_t	*sp_smod_info;	/* socket module */

	sockparams_stats_t sp_stats;
	kstat_t		*sp_kstat;

	/*
	 * The entries below are only modified while holding
	 * sockconf_lock as a writer.
	 */
	int		sp_flags;	/* see below */
	list_node_t	sp_node;

	list_t		sp_auto_filters; /* list of automatic filters */
	list_t		sp_prog_filters; /* list of programmatic filters */
};
484 
/* Only referenced through pointers in this header. */
struct sof_entry;

/*
 * List element tying one sof_entry to a list.
 * NOTE(review): presumably linked on sp_auto_filters/sp_prog_filters of
 * struct sockparams - confirm.
 */
typedef struct sp_filter {
	struct sof_entry *spf_filter;
	list_node_t	spf_node;
} sp_filter_t;
491 
492 
/*
 * sockparams flags (bits for sp_flags)
 */
#define	SOCKPARAMS_EPHEMERAL	0x1	/* temp. entry, not on global list */
497 
/*
 * sockparams entry management. The functions are defined outside this
 * header; argument meanings are not visible here.
 */
extern void sockparams_init(void);
extern struct sockparams *sockparams_hold_ephemeral_bydev(int, int, int,
    const char *, int, int *);
extern struct sockparams *sockparams_hold_ephemeral_bymod(int, int, int,
    const char *, int, int *);
/* Called by SOCKPARAMS_DEC_REF() for an ephemeral entry's last reference */
extern void sockparams_ephemeral_drop_last_ref(struct sockparams *);

extern struct sockparams *sockparams_create(int, int, int, char *, char *, int,
    int, int, int *);
extern void 	sockparams_destroy(struct sockparams *);
extern int 	sockparams_add(struct sockparams *);
extern int	sockparams_delete(int, int, int);
extern int	sockparams_new_filter(struct sof_entry *);
extern void	sockparams_filter_cleanup(struct sof_entry *);
extern int	sockparams_copyout_socktable(uintptr_t);

/* Socket module (smod_info_t) registration and lookup. */
extern void smod_init(void);
extern void smod_add(smod_info_t *);
extern int smod_register(const smod_reg_t *);
extern int smod_unregister(const char *);
extern smod_info_t *smod_lookup_byname(const char *);
519 
/* True when the sockparams entry carries a STREAM device path. */
#define	SOCKPARAMS_HAS_DEVICE(sp)					\
	((sp)->sp_sdev_info.sd_devpath != NULL)
522 
/*
 * Increase the smod_info_t reference count.
 * Fires the smodinfo__inc__ref DTrace probe, then increments smod_refcnt
 * atomically. smodp must not be NULL (ASSERTed).
 */
#define	SMOD_INC_REF(smodp) {						\
	ASSERT((smodp) != NULL);					\
	DTRACE_PROBE1(smodinfo__inc__ref, struct smod_info *, (smodp));	\
	atomic_inc_uint(&(smodp)->smod_refcnt);				\
}
529 
/*
 * Decrease the socket module entry reference count.
 * When no one is mapping to the entry, we try to unload the module from the
 * kernel. If the module can't be unloaded, just leave the module entry with
 * a zero refcnt.
 */
/*
 * SMOD_DEC_REF(smodp, modname)
 *
 * Drops one reference on the smod_info_t `smodp`. The caller that takes
 * the count to zero asks for module `modname` to be removed; the result
 * of mod_remove_by_name() is deliberately ignored. smodp must be non-NULL
 * and must currently hold at least one reference (both ASSERTed).
 */
#define	SMOD_DEC_REF(smodp, modname) {					\
	ASSERT((smodp) != NULL);					\
	ASSERT((smodp)->smod_refcnt != 0);				\
	/*								\
	 * Decide on the value returned by the atomic decrement itself	\
	 * instead of re-reading smod_refcnt afterwards: a separate	\
	 * read could observe the effect of a concurrent		\
	 * SMOD_INC_REF/SMOD_DEC_REF rather than of this decrement.	\
	 * A removal attempt that races with a new reference is still	\
	 * OK, because the socket module framework will verify that no	\
	 * one is using the module before unloading.			\
	 */								\
	if (atomic_dec_uint_nv(&(smodp)->smod_refcnt) == 0)		\
		(void) mod_remove_by_name(modname);			\
}
549 
/*
 * Increase the reference count.
 * Fires the sockparams__inc__ref DTrace probe, then increments sp_refcnt
 * under sp_lock. The ASSERT after the increment catches wrap-around to 0.
 */
#define	SOCKPARAMS_INC_REF(sp) {					\
	ASSERT((sp) != NULL);						\
	DTRACE_PROBE1(sockparams__inc__ref, struct sockparams *, (sp));	\
	mutex_enter(&(sp)->sp_lock);					\
	(sp)->sp_refcnt++;						\
	ASSERT((sp)->sp_refcnt != 0);					\
	mutex_exit(&(sp)->sp_lock);					\
}
559 
/*
 * Decrease the reference count.
 *
 * If the sockparams is ephemeral, then the thread dropping the last ref
 * count will destroy the entry.
 *
 * The three cases, all decided while holding sp_lock:
 *  - last reference on an ephemeral entry: sp_lock is dropped and
 *    sockparams_ephemeral_drop_last_ref() is called; sp_refcnt is not
 *    decremented by this macro.
 *  - last reference on a non-ephemeral entry: sp_refcnt goes to 0, the
 *    reference on sp_smod_info (if any) is released with SMOD_DEC_REF()
 *    and sp_smod_info is set to NULL before sp_lock is dropped.
 *  - any other case: sp_refcnt is decremented and sp_lock is dropped.
 */
#define	SOCKPARAMS_DEC_REF(sp) {					\
	ASSERT((sp) != NULL);						\
	DTRACE_PROBE1(sockparams__dec__ref, struct sockparams *, (sp));	\
	mutex_enter(&(sp)->sp_lock);					\
	ASSERT((sp)->sp_refcnt > 0);					\
	if ((sp)->sp_refcnt == 1) {					\
		if ((sp)->sp_flags & SOCKPARAMS_EPHEMERAL) {		\
			mutex_exit(&(sp)->sp_lock);			\
			sockparams_ephemeral_drop_last_ref((sp));	\
		} else {						\
			(sp)->sp_refcnt--;				\
			if ((sp)->sp_smod_info != NULL) {		\
				SMOD_DEC_REF((sp)->sp_smod_info,	\
				    (sp)->sp_smod_name);		\
			}						\
			(sp)->sp_smod_info = NULL;			\
			mutex_exit(&(sp)->sp_lock);			\
		}							\
	} else {							\
		(sp)->sp_refcnt--;					\
		mutex_exit(&(sp)->sp_lock);				\
	}								\
}
589 
/*
 * Used to traverse the list of AF_UNIX sockets to construct the kstat
 * for netstat(1m).
 */
struct socklist {
	kmutex_t	sl_lock;	/* presumably protects sl_list - confirm */
	struct sonode	*sl_list;	/* head of the sonode list */
};

/* The single global instance; defined outside this header. */
extern struct socklist socklist;
/*
 * Sendfile statistics counters.
 *
 * ss_full_waits is the number of times the reader thread
 * waits when the queue is full and ss_empty_waits is the number
 * of times the consumer thread waits when the queue is empty.
 * No locks for these as they are just indicators of whether
 * disk or network or both is slow or fast.
 */
struct sendfile_stats {
	uint32_t ss_file_cached;
	uint32_t ss_file_not_cached;
	uint32_t ss_full_waits;		/* reader waited: queue full */
	uint32_t ss_empty_waits;	/* consumer waited: queue empty */
	uint32_t ss_file_segmap;
};
614 
/*
 * A single sendfile request is represented by snf_req.
 * Requests are chained through sr_next (see struct sendfile_queue).
 */
typedef struct snf_req {
	struct snf_req	*sr_next;
	mblk_t		*sr_mp_head;
	mblk_t		*sr_mp_tail;
	kmutex_t	sr_lock;
	kcondvar_t	sr_cv;
	uint_t		sr_qlen;
	int		sr_hiwat;
	int		sr_lowat;
	int		sr_operation;
	struct vnode	*sr_vp;
	file_t 		*sr_fp;
	ssize_t		sr_maxpsz;
	u_offset_t	sr_file_off;
	u_offset_t	sr_file_size;
/* NOTE(review): presumably a marker value for sr_read_error - confirm */
#define	SR_READ_DONE	0x80000000
	int		sr_read_error;
	int		sr_write_error;
} snf_req_t;
637 
/* A queue of sendfile requests, plus bookkeeping for its service threads */
struct sendfile_queue {
	snf_req_t	*snfq_req_head;
	snf_req_t	*snfq_req_tail;
	kmutex_t	snfq_lock;
	kcondvar_t	snfq_cv;
	int		snfq_svc_threads;	/* # of service threads */
	int		snfq_idle_cnt;		/* # of idling threads */
	int		snfq_max_threads;
	int		snfq_req_cnt;		/* Number of requests */
};
649 
/* NOTE(review): presumably a value for sr_operation - confirm */
#define	READ_OP			1
/* Scaled by `hz`, which must be in scope wherever this macro is used */
#define	SNFQ_TIMEOUT		(60 * 5 * hz)	/* 5 minutes */
652 
/*
 * Socket network operations switch.
 *
 * One entry per socket operation; every entry takes the sonode being
 * operated on as its first argument and returns an int. The SOP_*()
 * macros call these entries through so->so_ops.
 */
struct sonodeops {
	int 	(*sop_init)(struct sonode *, struct sonode *, cred_t *,
		    int);
	int	(*sop_accept)(struct sonode *, int, cred_t *, struct sonode **);
	int	(*sop_bind)(struct sonode *, struct sockaddr *, socklen_t,
		    int, cred_t *);
	int	(*sop_listen)(struct sonode *, int, cred_t *);
	int	(*sop_connect)(struct sonode *, struct sockaddr *,
		    socklen_t, int, int, cred_t *);
	int	(*sop_recvmsg)(struct sonode *, struct msghdr *,
		    struct uio *, cred_t *);
	int	(*sop_sendmsg)(struct sonode *, struct msghdr *,
		    struct uio *, cred_t *);
	int	(*sop_sendmblk)(struct sonode *, struct msghdr *, int,
		    cred_t *, mblk_t **);
	int	(*sop_getpeername)(struct sonode *, struct sockaddr *,
		    socklen_t *, boolean_t, cred_t *);
	int	(*sop_getsockname)(struct sonode *, struct sockaddr *,
		    socklen_t *, cred_t *);
	int	(*sop_shutdown)(struct sonode *, int, cred_t *);
	int	(*sop_getsockopt)(struct sonode *, int, int, void *,
		    socklen_t *, int, cred_t *);
	int 	(*sop_setsockopt)(struct sonode *, int, int, const void *,
		    socklen_t, cred_t *);
	int 	(*sop_ioctl)(struct sonode *, int, intptr_t, int,
		    cred_t *, int32_t *);
	int 	(*sop_poll)(struct sonode *, short, int, short *,
		    struct pollhead **);
	int 	(*sop_close)(struct sonode *, int, cred_t *);
};
684 
/*
 * Dispatch wrappers for the struct sonodeops vector reached through
 * so->so_ops. Each SOP_xxx(so, ...) invokes the sop_xxx entry with `so`
 * as the first argument, passes the remaining arguments through unchanged
 * and evaluates to the int returned by the entry. Note that `so` is
 * evaluated twice.
 */
#define	SOP_INIT(so, pso, cr, flags)					\
	((*(so)->so_ops->sop_init)((so), (pso), (cr), (flags)))
#define	SOP_ACCEPT(so, fflag, cr, nsop)					\
	((*(so)->so_ops->sop_accept)((so), (fflag), (cr), (nsop)))
#define	SOP_BIND(so, sa, salen, flags, cr)				\
	((*(so)->so_ops->sop_bind)((so), (sa), (salen), (flags), (cr)))
#define	SOP_LISTEN(so, backlog, cr)					\
	((*(so)->so_ops->sop_listen)((so), (backlog), (cr)))
#define	SOP_CONNECT(so, sa, salen, fflag, flags, cr)			\
	((*(so)->so_ops->sop_connect)((so), (sa), (salen), (fflag),	\
	    (flags), (cr)))
#define	SOP_RECVMSG(so, msg, uiop, cr)					\
	((*(so)->so_ops->sop_recvmsg)((so), (msg), (uiop), (cr)))
#define	SOP_SENDMSG(so, msg, uiop, cr)					\
	((*(so)->so_ops->sop_sendmsg)((so), (msg), (uiop), (cr)))
#define	SOP_SENDMBLK(so, msg, size, cr, mpp)				\
	((*(so)->so_ops->sop_sendmblk)((so), (msg), (size), (cr), (mpp)))
#define	SOP_GETPEERNAME(so, sa, salenp, accept, cr)			\
	((*(so)->so_ops->sop_getpeername)((so), (sa), (salenp),	\
	    (accept), (cr)))
#define	SOP_GETSOCKNAME(so, sa, salenp, cr)				\
	((*(so)->so_ops->sop_getsockname)((so), (sa), (salenp), (cr)))
#define	SOP_SHUTDOWN(so, how, cr)					\
	((*(so)->so_ops->sop_shutdown)((so), (how), (cr)))
#define	SOP_GETSOCKOPT(so, level, optname, optval, optlenp, flags, cr)	\
	((*(so)->so_ops->sop_getsockopt)((so), (level), (optname),	\
	    (optval), (optlenp), (flags), (cr)))
#define	SOP_SETSOCKOPT(so, level, optname, optval, optlen, cr)		\
	((*(so)->so_ops->sop_setsockopt)((so), (level), (optname),	\
	    (optval), (optlen), (cr)))
#define	SOP_IOCTL(so, cmd, arg, mode, cr, rvalp)			\
	((*(so)->so_ops->sop_ioctl)((so), (cmd), (arg), (mode), (cr),	\
	    (rvalp)))
#define	SOP_POLL(so, events, anyyet, reventsp, phpp)			\
	((*(so)->so_ops->sop_poll)((so), (events), (anyyet),		\
	    (reventsp), (phpp)))
#define	SOP_CLOSE(so, flag, cr)						\
	((*(so)->so_ops->sop_close)((so), (flag), (cr)))
720 
721 #endif /* defined(_KERNEL) || defined(_KMEMUSER) */
722 
723 #ifdef _KERNEL
724 
/* True when `addr` is a multiple of _CMSG_HDR_ALIGNMENT. */
#define	ISALIGNED_cmsghdr(addr) \
		(((uintptr_t)(addr) & (_CMSG_HDR_ALIGNMENT - 1)) == 0)

/* Round `len` up to the next multiple of _CMSG_HDR_ALIGNMENT. */
#define	ROUNDUP_cmsglen(len) \
	(((len) + _CMSG_HDR_ALIGNMENT - 1) & ~(_CMSG_HDR_ALIGNMENT - 1))

/* True for a VSOCK vnode whose v_stream is NULL. */
#define	IS_NON_STREAM_SOCK(vp) \
	((vp)->v_type == VSOCK && (vp)->v_stream == NULL)
/*
 * Macros that operate on struct cmsghdr.
 * Used in parsing msg_control.
 * The CMSG_VALID macro does not assume that the last option buffer is padded.
 *
 * CMSG_NEXT(cmsg)	   header that follows `cmsg`, i.e. `cmsg` advanced
 *			   by its cmsg_len rounded up to the header
 *			   alignment. The expansion is parenthesized so that
 *			   CMSG_NEXT(c)->cmsg_len parses as intended.
 * CMSG_CONTENT(cmsg)	   address just past the header.
 * CMSG_CONTENTLEN(cmsg)   cmsg_len minus the header size; only meaningful
 *			   when cmsg_len >= sizeof (struct cmsghdr).
 * CMSG_VALID(cmsg, start, end)
 *			   true when `cmsg` is aligned, lies in [start, end),
 *			   has a cmsg_len of at least one header, and
 *			   cmsg_len bytes starting at `cmsg` fit before
 *			   `end`. The fit test compares cmsg_len with the
 *			   space remaining (end - cmsg, known to be positive
 *			   at that point) instead of computing
 *			   cmsg + cmsg_len, which could wrap around for a
 *			   huge cmsg_len and be accepted by mistake.
 */
#define	CMSG_NEXT(cmsg)						\
	((struct cmsghdr *)((uintptr_t)(cmsg) +			\
	    ROUNDUP_cmsglen((cmsg)->cmsg_len)))
#define	CMSG_CONTENT(cmsg)	(&((cmsg)[1]))
#define	CMSG_CONTENTLEN(cmsg)	((cmsg)->cmsg_len - sizeof (struct cmsghdr))
#define	CMSG_VALID(cmsg, start, end)					\
	(ISALIGNED_cmsghdr(cmsg) &&					\
	((uintptr_t)(cmsg) >= (uintptr_t)(start)) &&			\
	((uintptr_t)(cmsg) < (uintptr_t)(end)) &&			\
	((cmsg)->cmsg_len >= sizeof (struct cmsghdr)) &&		\
	((cmsg)->cmsg_len <= (uintptr_t)(end) - (uintptr_t)(cmsg)))
749 
/*
 * Maximum size, in bytes, of any argument that is copied in (addresses,
 * options, access rights). MUST be at least MAXPATHLEN + 3.
 * BSD and SunOS 4.X limited this to MLEN or MCLBYTES.
 */
#define	SO_MAXARGSIZE	8192
756 
/*
 * Convert between vnode and sonode:
 *   VTOSO(vp) yields the vnode's v_data cast to a struct sonode pointer;
 *   SOTOV(sp) yields the sonode's so_vnode.
 */
#define	VTOSO(vp)	((struct sonode *)((vp)->v_data))
#define	SOTOV(sp)	((sp)->so_vnode)
762 
/*
 * Internal flags for sobind() (bit mask)
 */
#define	_SOBIND_REBIND		0x01	/* Bind to existing local address */
#define	_SOBIND_UNSPEC		0x02	/* Bind to unspecified address */
#define	_SOBIND_LOCK_HELD	0x04	/* so_excl_lock held by caller */
#define	_SOBIND_NOXLATE		0x08	/* No addr translation for AF_UNIX */
#define	_SOBIND_XPG4_2		0x10	/* xpg4.2 semantics */
#define	_SOBIND_SOCKBSD		0x20	/* BSD semantics */
#define	_SOBIND_LISTEN		0x40	/* Make into SS_ACCEPTCONN */
#define	_SOBIND_SOCKETPAIR	0x80	/* Internal flag for so_socketpair() */
					/* to enable listen with backlog = 1 */

/*
 * Internal flags for sounbind()
 */
#define	_SOUNBIND_REBIND	0x01	/* Don't clear fields - will rebind */

/*
 * Internal flags for soconnect() (bit mask)
 */
#define	_SOCONNECT_NOXLATE	0x01	/* No addr translation for AF_UNIX */
#define	_SOCONNECT_DID_BIND	0x02	/* Unbind when connect fails */
#define	_SOCONNECT_XPG4_2	0x04	/* xpg4.2 semantics */

/*
 * Internal flags for sodisconnect()
 */
#define	_SODISCONNECT_LOCK_HELD	0x01	/* so_excl_lock held by caller */

/*
 * Internal flags for sotpi_getsockopt().
 */
#define	_SOGETSOCKOPT_XPG4_2	0x01	/* xpg4.2 semantics */

/*
 * Internal flags for soallocproto*()
 * These are distinct values (0, 1, 2), not bits to be OR-ed together.
 */
#define	_ALLOC_NOSLEEP		0	/* Don't sleep for memory */
#define	_ALLOC_INTR		1	/* Sleep until interrupt */
#define	_ALLOC_SLEEP		2	/* Sleep forever */
804 
/*
 * Internal structure for handling AF_UNIX file descriptor passing.
 *
 * fd_fds is declared with one element but holds fd_numfd entries, so the
 * structure is variable-sized; fd_size records the allocated size.
 */
struct fdbuf {
	int		fd_size;	/* In bytes, for kmem_free */
	int		fd_numfd;	/* Number of elements below */
	char		*fd_ebuf;	/* Extra buffer to free  */
	int		fd_ebuflen;	/* presumably size of fd_ebuf - confirm */
	frtn_t		fd_frtn;
	struct file	*fd_fds[1];	/* One or more */
};
/* Size of struct fdbuf without the one fd_fds slot it declares */
#define	FDBUF_HDRSIZE	(sizeof (struct fdbuf) - sizeof (struct file *))
817 
/*
 * Variable that can be patched to set what version of socket socket()
 * will create. See the SOV_* values (SOV_DEFAULT selects based on it).
 */
extern int so_default_version;

#ifdef DEBUG
/* Turn on extra testing capabilities */
#define	SOCK_TEST
#endif /* DEBUG */

#ifdef DEBUG
/* Helpers that are only declared in DEBUG builds */
char	*pr_state(uint_t, uint_t);
char	*pr_addr(int, struct sockaddr *, t_uscalar_t);
int	so_verify_oobstate(struct sonode *);
#endif /* DEBUG */
834 
835 /*
836  * DEBUG macros
837  */
/*
 * In DEBUG builds:
 *   eprint(args)		printf args, unconditionally.
 *   eprintso(so, args)		printf args when sockprinterr is set and
 *				the socket has SO_DEBUG in so_options.
 *   eprintline(error)		report error with file/line unless it is
 *				EINTR; needs sockprinterr or sockdebug > 0.
 *   eprintsoline(so, error)	like eprintso(), reporting socket, error
 *				and file/line.
 *   dprint(level, args)	printf args when sockdebug > level.
 *   dprintso(so, level, args)	as dprint(), and the socket must have
 *				SO_DEBUG set.
 * `args` is a complete parenthesized printf argument list.
 * In non-DEBUG builds every macro expands to an empty block.
 *
 * Except for the DEBUG eprint(), the macros expand to a brace block and
 * not to a do { } while (0) statement.
 */
#if defined(DEBUG)
#define	SOCK_DEBUG

extern int sockdebug;
extern int sockprinterr;

#define	eprint(args)	printf args
#define	eprintso(so, args) \
{ if (sockprinterr && ((so)->so_options & SO_DEBUG)) printf args; }
/*
 * `error` is parenthesized at every use so that an expression argument
 * (e.g. a ?: expression) is compared against EINTR as a whole.
 */
#define	eprintline(error)					\
{								\
	if ((error) != EINTR && (sockprinterr || sockdebug > 0))	\
		printf("socket error %d: line %d file %s\n",	\
			(error), __LINE__, __FILE__);		\
}

#define	eprintsoline(so, error)					\
{ if (sockprinterr && ((so)->so_options & SO_DEBUG))		\
	printf("socket(%p) error %d: line %d file %s\n",	\
		(void *)(so), (error), __LINE__, __FILE__);	\
}
#define	dprint(level, args)	{ if (sockdebug > (level)) printf args; }
#define	dprintso(so, level, args) \
{ if (sockdebug > (level) && ((so)->so_options & SO_DEBUG)) printf args; }

#else /* define(DEBUG) */

#define	eprint(args)		{}
#define	eprintso(so, args)	{}
#define	eprintline(error)	{}
#define	eprintsoline(so, error)	{}
#define	dprint(level, args)	{}
#define	dprintso(so, level, args) {}

#endif /* defined(DEBUG) */
873 
/*
 * Global objects shared through this header.  These are extern
 * declarations only; no storage is defined here.
 */
874 extern struct vfsops			sock_vfsops;
875 extern struct vnodeops			*socket_vnodeops;
/* Array of unspecified length; its size is not visible in this header. */
876 extern const struct fs_operation_def	socket_vnodeops_template[];
877 
878 extern dev_t				sockdev;
879 
/* NOTE(review): presumably guards the socket configuration -- confirm. */
880 extern krwlock_t			sockconf_lock;
881 
882 /*
883  * sockfs functions
 *
 * Prototypes only; none of the definitions are in this header.
 * NOTE(review): the grouping below is inferred from names and
 * parameter types, not from the implementations -- confirm:
 *   so_lock_single() ... so_unlock_read()
 *	locking helpers that take a sonode
 *   fdbuf_free(), fdbuf_allocmsg(), fdbuf_create(), so_closefds(),
 *   so_getfdopt()
 *	struct fdbuf and descriptor-passing helpers
 *   so_optlen(), so_cmsg2opt(), so_cmsglen(), so_opt2cmsg()
 *	conversion between option buffers and control messages
 *   soisconnecting() ... sowaitconnected()
 *	sonode connection-state and error handling
884  */
885 extern int	sock_getmsg(vnode_t *, struct strbuf *, struct strbuf *,
886 			uchar_t *, int *, int, rval_t *);
887 extern int	sock_putmsg(vnode_t *, struct strbuf *, struct strbuf *,
888 			uchar_t, int, int);
889 extern int	sogetvp(char *, vnode_t **, int);
890 extern int	sockinit(int, char *);
891 extern int	solookup(int, int, int, struct sockparams **);
892 extern void	so_lock_single(struct sonode *);
893 extern void	so_unlock_single(struct sonode *, int);
894 extern int	so_lock_read(struct sonode *, int);
895 extern int	so_lock_read_intr(struct sonode *, int);
896 extern void	so_unlock_read(struct sonode *);
897 extern void	*sogetoff(mblk_t *, t_uscalar_t, t_uscalar_t, uint_t);
898 extern void	so_getopt_srcaddr(void *, t_uscalar_t,
899 			void **, t_uscalar_t *);
900 extern int	so_getopt_unix_close(void *, t_uscalar_t);
901 extern void	fdbuf_free(struct fdbuf *);
902 extern mblk_t	*fdbuf_allocmsg(int, struct fdbuf *);
903 extern int	fdbuf_create(void *, int, struct fdbuf **);
904 extern void	so_closefds(void *, t_uscalar_t, int, int);
905 extern int	so_getfdopt(void *, t_uscalar_t, int, void **, int *);
906 t_uscalar_t	so_optlen(void *, t_uscalar_t, int);
907 extern void	so_cmsg2opt(void *, t_uscalar_t, int, mblk_t *);
908 extern t_uscalar_t
909 		so_cmsglen(mblk_t *, void *, t_uscalar_t, int);
910 extern int	so_opt2cmsg(mblk_t *, void *, t_uscalar_t, int,
911 			void *, t_uscalar_t);
912 extern void	soisconnecting(struct sonode *);
913 extern void	soisconnected(struct sonode *);
914 extern void	soisdisconnected(struct sonode *, int);
915 extern void	socantsendmore(struct sonode *);
916 extern void	socantrcvmore(struct sonode *);
917 extern void	soseterror(struct sonode *, int);
918 extern int	sogeterr(struct sonode *, boolean_t);
919 extern int	sowaitconnected(struct sonode *, int, int);
920 
/*
 * sock_kstat_init() and sock_kstat_fini() both take a zoneid_t.
 * NOTE(review): the void * passed to sock_kstat_fini() is presumably
 * the pointer returned by sock_kstat_init() for the same zone, and
 * getsonode() presumably maps a file descriptor to its sonode while
 * returning an error code and the file_t through the pointer
 * arguments -- confirm against the definitions.
 */
921 extern ssize_t	soreadfile(file_t *, uchar_t *, u_offset_t, int *, size_t);
922 extern void	*sock_kstat_init(zoneid_t);
923 extern void	sock_kstat_fini(zoneid_t, void *);
924 extern struct sonode *getsonode(int, int *, file_t **);
925 /*
926  * Function wrappers (mostly around the sonode switch) for
927  * backward compatibility.
 *
 * Every wrapper declared here takes the target sonode as its first
 * argument and returns int.
 * NOTE(review): the int is presumably 0 on success or an errno
 * value -- confirm against the definitions.
928  */
929 extern int	soaccept(struct sonode *, int, struct sonode **);
930 extern int	sobind(struct sonode *, struct sockaddr *, socklen_t,
931 		    int, int);
932 extern int	solisten(struct sonode *, int);
933 extern int	soconnect(struct sonode *, struct sockaddr *, socklen_t,
934 		    int, int);
935 extern int	sorecvmsg(struct sonode *, struct nmsghdr *, struct uio *);
936 extern int	sosendmsg(struct sonode *, struct nmsghdr *, struct uio *);
937 extern int	soshutdown(struct sonode *, int);
938 extern int	sogetsockopt(struct sonode *, int, int, void *, socklen_t *,
939 		    int);
940 extern int	sosetsockopt(struct sonode *, int, int, const void *,
941 		    t_uscalar_t);
942 
/*
 * Returns a pointer to a sonode for the given sockparams.
 * NOTE(review): the trailing int * is presumably an error
 * out-parameter, with NULL returned on failure -- confirm.
 */
943 extern struct sonode	*socreate(struct sockparams *, int, int, int, int,
944 			    int *);
945 
/*
 * Both take (const void *, void *, size_t, int) and return int.
 * NOTE(review): presumably source, destination, byte count and a flag
 * choosing between a kernel and a user address space -- confirm.
 */
946 extern int	so_copyin(const void *, void *, size_t, int);
947 extern int	so_copyout(const void *, void *, size_t, int);
948 
949 #endif
950 
951 /*
952  * Internal structure for obtaining sonode information from the socklist.
953  * These types match those corresponding in the sonode structure.
954  * This is not a published interface, and may change at any time.
 *
 * Both sun_path buffers hold MAXPATHLEN + 1 bytes.
955  */
956 struct sockinfo {
957 	uint_t		si_size;		/* real length of this struct */
958 	short		si_family;
959 	short		si_type;
960 	ushort_t	si_flag;
961 	uint_t		si_state;
962 	uint_t		si_ux_laddr_sou_magic;
963 	uint_t		si_ux_faddr_sou_magic;
964 	t_scalar_t	si_serv_type;
965 	t_uscalar_t	si_laddr_soa_len;
966 	t_uscalar_t	si_faddr_soa_len;
967 	uint16_t	si_laddr_family;
968 	uint16_t	si_faddr_family;
969 	char		si_laddr_sun_path[MAXPATHLEN + 1]; /* NUL-terminated */
970 	char		si_faddr_sun_path[MAXPATHLEN + 1];
971 	boolean_t	si_faddr_noxlate;
972 	zoneid_t	si_szoneid;
973 };
974 
975 /*
976  * Subcodes for sockconfig() system call
 * The subcodes are the consecutive integers 0 through 4.
977  */
978 #define	SOCKCONFIG_ADD_SOCK		0
979 #define	SOCKCONFIG_REMOVE_SOCK		1
980 #define	SOCKCONFIG_ADD_FILTER		2
981 #define	SOCKCONFIG_REMOVE_FILTER	3
982 #define	SOCKCONFIG_GET_SOCKTABLE	4
983 
984 /*
985  * Data structures for configuring socket filters.
986  */
987 
988 /*
989  * Placement hint for automatic filters
 *
 * No explicit values are given, so the enumerators are 0 through 4 in
 * declaration order.
 * NOTE(review): SOF_HINT_BEFORE and SOF_HINT_AFTER presumably position
 * the filter relative to the one named by sfp_hintarg in
 * struct sockconfig_filter_props -- confirm.
990  */
991 typedef enum {
992 	SOF_HINT_NONE,
993 	SOF_HINT_TOP,
994 	SOF_HINT_BOTTOM,
995 	SOF_HINT_BEFORE,
996 	SOF_HINT_AFTER
997 } sof_hint_t;
998 
999 /*
1000  * Socket tuple. Used by sockconfig_filter_props to list socket
1001  * types of interest.
 *
 * Each tuple is a (family, type, protocol) triple of ints.  A
 * fixed-width counterpart, sof_socktuple32_t, is declared under
 * _SYSCALL32 later in this header.
1002  */
1003 typedef struct sof_socktuple {
1004 	int	sofst_family;
1005 	int	sofst_type;
1006 	int	sofst_protocol;
1007 } sof_socktuple_t;
1008 
1009 /*
1010  * Socket filter properties used by sockconfig() system call.
 *
 * A counterpart that uses caddr32_t for the three pointer members,
 * struct sockconfig_filter_props32, is declared under _SYSCALL32 later
 * in this header.
1011  */
1012 struct sockconfig_filter_props {
1013 	char		*sfp_modname;
1014 	boolean_t	sfp_autoattach;
1015 	sof_hint_t	sfp_hint;	/* placement hint, see sof_hint_t */
1016 	char		*sfp_hintarg;
	/* NOTE(review): presumably the element count of sfp_socktuple */
1017 	uint_t		sfp_socktuple_cnt;
1018 	sof_socktuple_t	*sfp_socktuple;
1019 };
1020 
1021 /*
1022  * Data structures for the in-kernel socket configuration table.
 *
 * One table entry.  The two names are fixed-size character arrays of
 * MODMAXNAMELEN and MAXPATHLEN bytes embedded in the entry.
1023  */
1024 typedef struct sockconfig_socktable_entry {
1025 	int		se_family;
1026 	int		se_type;
1027 	int		se_protocol;
1028 	int		se_refcnt;
1029 	int		se_flags;
1030 	char		se_modname[MODMAXNAMELEN];
1031 	char		se_strdev[MAXPATHLEN];
1032 } sockconfig_socktable_entry_t;
1033 
/*
 * Table header: an entry count plus a pointer to
 * sockconfig_socktable_entry_t elements.
 * NOTE(review): num_of_entries is presumably the number of elements
 * that st_entries points to -- confirm.
 */
1034 typedef struct sockconfig_socktable {
1035 	uint_t		num_of_entries;
1036 	sockconfig_socktable_entry_t *st_entries;
1037 } sockconfig_socktable_t;
1038 
1039 #ifdef	_SYSCALL32
1040 
/*
 * Counterpart of sof_socktuple_t with the same three members declared
 * as int32_t.  Only declared when _SYSCALL32 is defined.
 */
1041 typedef struct sof_socktuple32 {
1042 	int32_t	sofst_family;
1043 	int32_t	sofst_type;
1044 	int32_t	sofst_protocol;
1045 } sof_socktuple32_t;
1046 
/*
 * Counterpart of struct sockconfig_filter_props with the same member
 * names: the three pointer members become caddr32_t and the tuple count
 * becomes uint32_t.  Only declared when _SYSCALL32 is defined.
 */
1047 struct sockconfig_filter_props32 {
1048 	caddr32_t	sfp_modname;
1049 	boolean_t	sfp_autoattach;
1050 	sof_hint_t	sfp_hint;
1051 	caddr32_t	sfp_hintarg;
1052 	uint32_t	sfp_socktuple_cnt;
1053 	caddr32_t	sfp_socktuple;
1054 };
1055 
/*
 * Counterpart of sockconfig_socktable_t in which st_entries is a
 * caddr32_t instead of a pointer.  Only declared when _SYSCALL32 is
 * defined.
 */
1056 typedef struct sockconfig_socktable32 {
1057 	uint_t		num_of_entries;
1058 	caddr32_t	st_entries;
1059 } sockconfig_socktable32_t;
1060 
1061 #endif	/* _SYSCALL32 */
1062 
/*
 * The value is a bare directory name with no leading '/'.
 * NOTE(review): presumably resolved relative to the kernel module
 * search path -- confirm.
 */
1063 #define	SOCKMOD_PATH	"socketmod"	/* dir where sockmods are stored */
1064 
1065 #ifdef	__cplusplus
1066 }
1067 #endif
1068 
1069 #endif	/* _SYS_SOCKETVAR_H */
1070