xref: /illumos-gate/usr/src/uts/common/sys/socketvar.h (revision 9a5d73e03cd3312ddb571a748c40a63c58bd66e5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved  	*/
29 
30 /*
31  * University Copyright- Copyright (c) 1982, 1986, 1988
32  * The Regents of the University of California
33  * All Rights Reserved
34  *
35  * University Acknowledgment- Portions of this document are derived from
36  * software developed by the University of California, Berkeley, and its
37  * contributors.
38  */
39 
40 #ifndef _SYS_SOCKETVAR_H
41 #define	_SYS_SOCKETVAR_H
42 
43 #include <sys/types.h>
44 #include <sys/stream.h>
45 #include <sys/t_lock.h>
46 #include <sys/cred.h>
47 #include <sys/vnode.h>
48 #include <sys/file.h>
49 #include <sys/param.h>
50 #include <sys/zone.h>
51 #include <sys/sdt.h>
52 #include <sys/modctl.h>
53 #include <sys/atomic.h>
54 #include <sys/socket.h>
55 #include <sys/ksocket.h>
56 #include <sys/sodirect.h>
57 
58 #ifdef	__cplusplus
59 extern "C" {
60 #endif
61 
62 /*
63  * Internal representation of the address used to represent addresses
64  * in the loopback transport for AF_UNIX. While the sockaddr_un is used
65  * as the sockfs layer address for AF_UNIX the pathnames contained in
66  * these addresses are not unique (due to relative pathnames) thus can not
67  * be used in the transport.
68  *
69  * The transport level address consists of a magic number (used to separate the
70  * name space for specific and implicit binds). For a specific bind
71  * this is followed by a "vnode *" which ensures that all specific binds
72  * have a unique transport level address. For implicit binds the latter
73  * part of the address is a byte string (of the same length as a pointer)
74  * that is assigned by the loopback transport.
75  *
76  * The uniqueness assumes that the loopback transport has a separate namespace
77  * for sockets in order to avoid name conflicts with e.g. TLI use of the
78  * same transport.
79  */
80 struct so_ux_addr {
81 	void	*soua_vp;	/* vnode pointer (specific bind) or id assigned by tl */
82 	uint_t	soua_magic;	/* SOU_MAGIC_EXPLICIT or SOU_MAGIC_IMPLICIT, below */
83 };
84 
85 #define	SOU_MAGIC_EXPLICIT	0x75787670	/* "uxvp" */
86 #define	SOU_MAGIC_IMPLICIT	0x616e6f6e	/* "anon" */
87 
88 struct sockaddr_ux {	/* transport level form of an AF_UNIX address */
89 	sa_family_t		sou_family;	/* AF_UNIX */
90 	struct so_ux_addr	sou_addr;	/* magic number + vnode/tl id */
91 };
92 
93 #if defined(_KERNEL) || defined(_KMEMUSER)
94 
95 #include <sys/socket_proto.h>
96 
97 typedef struct sonodeops sonodeops_t;
98 typedef struct sonode sonode_t;
99 
100 /*
101  * The sonode represents a socket. A sonode never exists in the file system
102  * name space and can not be opened using open() - only the socket, socketpair
103  * and accept calls create sonodes.
104  *
105  * The locking of sockfs uses the so_lock mutex plus the SOLOCKED and
106  * SOREADLOCKED flags in so_flag. The mutex protects all the state in the
107  * sonode. It is expected that the underlying transport protocol serializes
108  * socket operations, so sockfs will normally not single-thread
109  * operations. However, certain sockets, including TPI based ones, can only
110  * handle one control operation at a time. The SOLOCKED flag is used to
111  * single-thread operations from sockfs users to prevent e.g. multiple bind()
112  * calls to operate on the same sonode concurrently. The SOREADLOCKED flag is
113  * used to ensure that only one thread sleeps in kstrgetmsg for a given
114  * sonode. This is needed to ensure atomic operation for things like
115  * MSG_WAITALL.
116  *
117  * The so_fallback_rwlock is used to ensure that for sockets that can
118  * fall back to TPI, the fallback is not initiated until all pending
119  * operations have completed.
120  *
121  * Note that so_lock is sometimes held across calls that might go to sleep
122  * (kmem_alloc and soallocproto*). This implies that no other lock in
123  * the system should be held when calling into sockfs; from the system call
124  * side or from strrput (in case of TPI based sockets). If locks are held
125  * while calling into sockfs the system might hang when running low on memory.
126  */
127 struct sonode {
128 	struct	vnode	*so_vnode;	/* vnode associated with this sonode */
129 
130 	sonodeops_t 	*so_ops;	/* operations vector for this sonode */
131 	void		*so_priv;	/* sonode private data */
132 
133 	krwlock_t	so_fallback_rwlock; /* reader-held by SO_BLOCK_FALLBACK */
134 	kmutex_t	so_lock;	/* protects sonode fields */
135 
136 	kcondvar_t	so_state_cv;	/* synchronize state changes */
137 	kcondvar_t	so_want_cv;	/* wait due to SOLOCKED */
138 
139 	/* These fields are protected by so_lock */
140 
141 	uint_t		so_state;	/* internal state flags SS_*, below */
142 	uint_t		so_mode;	/* characteristics on socket. SM_* */
143 	ushort_t 	so_flag;	/* flags (SOMOD, SOLOCKED, ...), see below */
144 	int		so_count;	/* count of opened references */
145 
146 	sock_connid_t	so_proto_connid; /* protocol generation number */
147 
148 	ushort_t 	so_error;	/* error affecting connection */
149 
150 	struct sockparams *so_sockparams;	/* vnode or socket module */
151 	/* Needed to recreate the same socket for accept */
152 	short	so_family;
153 	short	so_type;
154 	short	so_protocol;
155 	short	so_version;		/* From so_socket call */
156 
157 	/* Accept queue */
158 	kmutex_t	so_acceptq_lock;	/* protects accept queue */
159 	struct sonode	*so_acceptq_next;	/* acceptq list node */
160 	struct sonode 	*so_acceptq_head;	/* acceptq list head */
161 	struct sonode	**so_acceptq_tail;	/* acceptq list tail link */
162 	unsigned int	so_acceptq_len;		/* presumably # of queued sonodes */
163 	unsigned int	so_backlog;		/* Listen backlog */
164 	kcondvar_t	so_acceptq_cv;		/* wait for new conn. */
165 
166 	/* Options */
167 	short	so_options;		/* From socket call, see socket.h */
168 	struct linger	so_linger;	/* SO_LINGER value */
169 #define	so_sndbuf	so_proto_props.sopp_txhiwat	/* SO_SNDBUF value */
170 #define	so_sndlowat	so_proto_props.sopp_txlowat	/* tx low water mark */
171 #define	so_rcvbuf	so_proto_props.sopp_rxhiwat	/* SO_RCVBUF value */
172 #define	so_rcvlowat	so_proto_props.sopp_rxlowat	/* rx low water mark */
173 #define	so_max_addr_len	so_proto_props.sopp_maxaddrlen
174 #define	so_minpsz	so_proto_props.sopp_minpsz
175 #define	so_maxpsz	so_proto_props.sopp_maxpsz
176 
177 	int	so_xpg_rcvbuf;		/* SO_RCVBUF value for XPG4 socket */
178 	clock_t	so_sndtimeo;		/* send timeout */
179 	clock_t	so_rcvtimeo;		/* recv timeout */
180 
181 	mblk_t	*so_oobmsg;		/* outofline oob data */
182 	ssize_t	so_oobmark;		/* offset of the oob data */
183 
184 	pid_t	so_pgrp;		/* pgrp for signals */
185 
186 	cred_t		*so_peercred;	/* connected socket peer cred */
187 	pid_t		so_cpid;	/* connected socket peer cached pid */
188 	zoneid_t	so_zoneid;	/* opener's zoneid */
189 
190 	struct pollhead	so_poll_list;	/* common pollhead */
191 	short		so_pollev;	/* events that should be generated */
192 
193 	/* Receive */
194 	unsigned int	so_rcv_queued;	/* excludes control/name-only mblks; see SO_HAVE_DATA */
195 	mblk_t		*so_rcv_q_head;	/* tested for != NULL by SO_HAVE_DATA */
196 	mblk_t		*so_rcv_q_last_head;
197 	mblk_t		*so_rcv_head;		/* 1st mblk in the list */
198 	mblk_t		*so_rcv_last_head;	/* last mblk in b_next chain */
199 	kcondvar_t	so_rcv_cv;
200 	uint_t		so_rcv_wanted;	/* # of bytes wanted by app */
201 	timeout_id_t	so_rcv_timer_tid; /* SO_HAVE_DATA requires this to be 0 */
202 
203 #define	so_rcv_thresh	so_proto_props.sopp_rcvthresh
204 #define	so_rcv_timer_interval so_proto_props.sopp_rcvtimer
205 
206 	kcondvar_t	so_snd_cv;
207 	uint32_t
208 		so_snd_qfull: 1,	/* Transmit full */
209 		so_rcv_wakeup: 1,
210 		so_snd_wakeup: 1,
211 		so_not_str: 1,	/* B_TRUE if not streams based socket */
212 		so_pad_to_bit_31: 28;	/* pads the 4 flag bits out to 32 bits */
213 
214 	/* Communication channel with protocol */
215 	sock_lower_handle_t	so_proto_handle;
216 	sock_downcalls_t 	*so_downcalls;
217 
218 	struct sock_proto_props	so_proto_props; /* protocol settings */
219 	boolean_t		so_flowctrld;	/* Flow controlled */
220 	uint_t			so_copyflag;	/* Copy related flag */
221 	kcondvar_t		so_copy_cv;	/* Copy cond variable */
222 
223 	/* kernel sockets */
224 	ksocket_callbacks_t 	so_ksock_callbacks;
225 	void			*so_ksock_cb_arg;	/* callback argument */
226 	kcondvar_t		so_closing_cv;
227 
228 	/* != NULL for sodirect_t enabled socket */
229 	sodirect_t		*so_direct;
230 };
231 
/*
 * SO_HAVE_DATA(so) evaluates to non-zero when either
 *   - no receive timer is pending (so_rcv_timer_tid == 0) and at least one
 *     of so_rcv_head / so_rcv_q_head is non-NULL, or
 *   - SS_CANTRCVMORE is set in so_state.
 *
 * For the (tid == 0) case we must check so_rcv_{q_,}head rather than
 * (so_rcv_queued > 0), since the latter does not take into account mblks
 * with only control/name information.
 *
 * The whole expansion is parenthesized so that the macro can be safely
 * negated or combined with other operators by the caller.
 */
#define	SO_HAVE_DATA(so)						\
	((((so)->so_rcv_timer_tid == 0) &&				\
	((so)->so_rcv_head != NULL || (so)->so_rcv_q_head != NULL)) ||	\
	((so)->so_state & SS_CANTRCVMORE))
241 
242 /*
243  * Events handled by the protocol (in case sd_poll is set)
244  */
245 #define	SO_PROTO_POLLEV		(POLLIN|POLLRDNORM|POLLRDBAND)
246 
247 
248 #endif /* _KERNEL || _KMEMUSER */
249 
250 /* flags kept in the sonode so_flag field */
251 #define	SOMOD		0x0001		/* update socket modification time */
252 #define	SOACC		0x0002		/* update socket access time */
253 
254 #define	SOLOCKED	0x0010		/* single-thread sockfs operations */
255 #define	SOREADLOCKED	0x0020		/* serialize kstrgetmsg calls */
256 #define	SOWANT		0x0040		/* some process waiting on lock */
257 #define	SOCLONE		0x0080		/* child of clone driver */
258 #define	SOASYNC_UNBIND	0x0100		/* wait for ACK of async unbind */
259 
260 #define	SOCK_IS_NONSTR(so)	((so)->so_not_str) /* not streams based? */
261 
262 /*
263  * Socket state bits.
264  */
265 #define	SS_ISCONNECTED		0x00000001 /* socket connected to a peer */
266 #define	SS_ISCONNECTING		0x00000002 /* in process, connecting to peer */
267 #define	SS_ISDISCONNECTING	0x00000004 /* in process of disconnecting */
268 #define	SS_CANTSENDMORE		0x00000008 /* can't send more data to peer */
269 
270 #define	SS_CANTRCVMORE		0x00000010 /* can't receive more data */
271 #define	SS_ISBOUND		0x00000020 /* socket is bound */
272 #define	SS_NDELAY		0x00000040 /* FNDELAY non-blocking */
273 #define	SS_NONBLOCK		0x00000080 /* O_NONBLOCK non-blocking */
274 
275 #define	SS_ASYNC		0x00000100 /* async i/o notify */
276 #define	SS_ACCEPTCONN		0x00000200 /* listen done */
277 /*	unused			0x00000400 */	/* was SS_HASCONNIND */
278 #define	SS_SAVEDEOR		0x00000800 /* Saved MSG_EOR rcv side state */
279 
280 #define	SS_RCVATMARK		0x00001000 /* at mark on input */
281 #define	SS_OOBPEND		0x00002000 /* OOB pending or present - poll */
282 #define	SS_HAVEOOBDATA		0x00004000 /* OOB data present */
283 #define	SS_HADOOBDATA		0x00008000 /* OOB data consumed */
284 #define	SS_CLOSING		0x00010000 /* in process of closing */
285 
286 /*	unused			0x00020000 */	/* was SS_FADDR_NOXLATE */
287 /*	unused			0x00040000 */	/* was SS_HASDATA */
288 /*	unused 			0x00080000 */	/* was SS_DONEREAD */
289 /*	unused 			0x00100000 */	/* was SS_MOREDATA */
290 /*	unused 			0x00200000 */	/* was SS_DIRECT */
291 
292 #define	SS_SODIRECT		0x00400000 /* transport supports sodirect */
293 
294 /*	unused			0x01000000 */	/* was SS_LADDR_VALID */
295 /*	unused			0x02000000 */	/* was SS_FADDR_VALID */
296 
297 #define	SS_SENTLASTREADSIG	0x10000000 /* last rx signal has been sent */
298 #define	SS_SENTLASTWRITESIG	0x20000000 /* last tx signal has been sent */
299 
299 #define	SS_FALLBACK_PENDING	0x40000000 /* NOTE(review): presumably TPI fallback in progress - confirm */
300 #define	SS_FALLBACK_COMP	0x80000000 /* tested by SO_BLOCK_FALLBACK, which then returns (fn) */
302 
303 
304 /* Set of states when the socket can't be rebound */
305 #define	SS_CANTREBIND	(SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING|\
306 			    SS_CANTSENDMORE|SS_CANTRCVMORE|SS_ACCEPTCONN)
307 
308 /*
309  * Sockets that can fall back to TPI must ensure that fall back is not
310  * initiated while a thread is using a socket.
 *
 * SO_BLOCK_FALLBACK asserts that so_lock is not held and takes
 * so_fallback_rwlock as a reader. If SS_FALLBACK_COMP is already set the
 * rwlock is dropped again and the macro executes "return (fn)" in the
 * CALLING function, so it may only be used where such a return is valid.
 * Otherwise the reader hold is kept and must be released with
 * SO_UNBLOCK_FALLBACK.
311  */
312 #define	SO_BLOCK_FALLBACK(so, fn) {			\
313 	ASSERT(MUTEX_NOT_HELD(&(so)->so_lock));		\
314 	rw_enter(&(so)->so_fallback_rwlock, RW_READER);	\
315 	if ((so)->so_state & SS_FALLBACK_COMP) {	\
316 		rw_exit(&(so)->so_fallback_rwlock);	\
317 		return (fn);				\
318 	}						\
319 }
320 
/* Release the so_fallback_rwlock hold taken by SO_BLOCK_FALLBACK. */
321 #define	SO_UNBLOCK_FALLBACK(so)	{			\
322 	rw_exit(&(so)->so_fallback_rwlock);		\
323 }
324 
325 /* Poll events */
326 #define	SO_POLLEV_IN		0x1	/* POLLIN wakeup needed */
327 #define	SO_POLLEV_ALWAYS	0x2	/* wakeups */
328 
329 /*
330  * Characteristics of sockets. Not changed after the socket is created.
331  */
332 #define	SM_PRIV			0x001	/* privileged for broadcast, raw... */
333 #define	SM_ATOMIC		0x002	/* atomic data transmission */
334 #define	SM_ADDR			0x004	/* addresses given with messages */
335 #define	SM_CONNREQUIRED		0x008	/* connection required by protocol */
336 
337 #define	SM_FDPASSING		0x010	/* passes file descriptors */
338 #define	SM_EXDATA		0x020	/* Can handle T_EXDATA_REQ */
339 #define	SM_OPTDATA		0x040	/* Can handle T_OPTDATA_REQ */
340 #define	SM_BYTESTREAM		0x080	/* Byte stream - can use M_DATA */
341 
342 #define	SM_ACCEPTOR_ID		0x100	/* so_acceptor_id is valid */
343 
344 #define	SM_KERNEL		0x200	/* kernel socket */
345 
346 /* The modes below are only for non-streams sockets */
347 #define	SM_ACCEPTSUPP		0x400	/* can handle accept() */
348 #define	SM_SENDFILESUPP		0x800	/* Private: proto supp sendfile  */
349 
350 /*
351  * Socket versions. Used by the socket library when calling _so_socket().
352  */
353 #define	SOV_STREAM	0	/* Not a socket - just a stream */
354 #define	SOV_DEFAULT	1	/* Select based on so_default_version */
355 #define	SOV_SOCKSTREAM	2	/* Socket plus streams operations */
356 #define	SOV_SOCKBSD	3	/* Socket with no streams operations */
357 #define	SOV_XPG4_2	4	/* Xnet socket */
358 
359 #if defined(_KERNEL) || defined(_KMEMUSER)
360 
361 /*
362  * sonode create and destroy functions.
 * These are the types of the smod_sock_create_func/smod_sock_destroy_func
 * members of smod_info_t and of the matching members of __smod_priv_t.
363  */
364 typedef struct sonode *(*so_create_func_t)(struct sockparams *,
365     int, int, int, int, int, int *, cred_t *);
366 typedef void (*so_destroy_func_t)(struct sonode *);
367 
368 /* STREAM device information */
369 typedef struct sdev_info {
370 	char	*sd_devpath;	/* NULL if no device; see SOCKPARAMS_HAS_DEVICE */
371 	int	sd_devpathlen; /* Is 0 if sd_devpath is a static string */
372 	vnode_t	*sd_vnode;
373 } sdev_info_t;
374 
375 #define	SOCKMOD_VERSION		1
376 /* name of the TPI pseudo socket module */
377 #define	SOTPI_SMOD_NAME		"socktpi"
378 
/*
 * Private part of a socket module registration (see smod_reg_t). The
 * members have the same function types as the smod_sock_create_func,
 * smod_sock_destroy_func and smod_proto_fallback_func members of smod_info_t.
 */
379 typedef struct __smod_priv_s {
380 	so_create_func_t	smodp_sock_create_func;
381 	so_destroy_func_t	smodp_sock_destroy_func;
382 	so_proto_fallback_func_t smodp_proto_fallback_func;
383 } __smod_priv_t;
384 
385 /*
386  * Socket module register information
 * (the argument type of smod_register()).
387  */
388 typedef struct smod_reg_s {
389 	int		smod_version;
390 	char		*smod_name;
391 	size_t		smod_uc_version;	/* upcall version */
392 	size_t		smod_dc_version;	/* down call version */
393 	so_proto_create_func_t	smod_proto_create_func;
394 
395 	/* __smod_priv must be NULL */
396 	__smod_priv_t	*__smod_priv;
397 } smod_reg_t;
398 
399 /*
400  * Socket module information
401  */
402 typedef struct smod_info {
403 	int		smod_version;
404 	char		*smod_name;
405 	uint_t		smod_refcnt;		/* # of entries; SMOD_INC_REF/SMOD_DEC_REF */
406 	size_t		smod_uc_version; 	/* upcall version */
407 	size_t		smod_dc_version;	/* down call version */
408 	so_proto_create_func_t	smod_proto_create_func;
409 	so_proto_fallback_func_t smod_proto_fallback_func;
410 	so_create_func_t	smod_sock_create_func;
411 	so_destroy_func_t	smod_sock_destroy_func;
412 	list_node_t	smod_node;		/* list linkage */
413 } smod_info_t;
414 
415 /*
416  * sockparams
417  *
418  * Used for mapping family/type/protocol to module
419  */
420 struct sockparams {
421 	/*
422 	 * The family, type, protocol, sdev_info and smod_info are
423 	 * set when the entry is created, and they will never change
424 	 * thereafter.
	 *
	 * NOTE(review): SOCKPARAMS_DEC_REF resets sp_smod_info to NULL when
	 * the last reference to a non-ephemeral entry is dropped, so
	 * "never change" does not hold literally for that field.
425 	 */
426 	int		sp_family;
427 	int		sp_type;
428 	int		sp_protocol;
429 
430 	sdev_info_t	sp_sdev_info;	/* STREAM device */
431 	char		*sp_smod_name;	/* socket module name */
432 	smod_info_t	*sp_smod_info;	/* socket module */
433 
434 	kmutex_t	sp_lock;	/* lock for refcnt */
435 	uint64_t	sp_refcnt;	/* entry reference count */
436 
437 	/*
438 	 * The entries below are only modified while holding
439 	 * splist_lock as a writer.
440 	 */
441 	int		sp_flags;	/* see below (SOCKPARAMS_EPHEMERAL) */
442 	list_node_t	sp_node;
443 };
444 
445 
446 /*
447  * sockparams flags
448  */
449 #define	SOCKPARAMS_EPHEMERAL	0x1	/* temp. entry, not on global list */
450 
451 extern void sockparams_init(void);
452 extern struct sockparams *sockparams_hold_ephemeral_bydev(int, int, int,
453     const char *, int, int *);
454 extern struct sockparams *sockparams_hold_ephemeral_bymod(int, int, int,
455     const char *, int, int *);
456 extern void sockparams_ephemeral_drop_last_ref(struct sockparams *);
457 
458 extern void smod_init(void);
459 extern void smod_add(smod_info_t *);
460 extern int smod_register(const smod_reg_t *);
461 extern int smod_unregister(const char *);
462 extern smod_info_t *smod_lookup_byname(const char *);
463 
/* Non-zero if the sockparams entry has a STREAM device path set. */
464 #define	SOCKPARAMS_HAS_DEVICE(sp)					\
465 	((sp)->sp_sdev_info.sd_devpath != NULL)
466 
467 /* Atomically increase the smod_info_t reference count (smodp != NULL) */
468 #define	SMOD_INC_REF(smodp) {						\
469 	ASSERT((smodp) != NULL);					\
470 	DTRACE_PROBE1(smodinfo__inc__ref, struct smod_info *, (smodp));	\
471 	atomic_inc_uint(&(smodp)->smod_refcnt);				\
472 }
473 
474 /*
475  * Decrease the socket module entry reference count.
476  * When no one maps to the entry any more, we try to unload the module from
477  * the kernel. If the module can't unload, just leave the module entry with
478  * a zero refcnt.
 *
 * The module is removed by name using (sp)->sp_smod_name, so sp must be the
 * sockparams entry that referenced smodp.
479  */
480 #define	SMOD_DEC_REF(sp, smodp) {					\
481 	ASSERT((smodp) != NULL);					\
482 	ASSERT((smodp)->smod_refcnt != 0);				\
483 	atomic_dec_uint(&(smodp)->smod_refcnt);				\
484 	/*								\
485 	 * No need to atomically check the return value because the	\
486 	 * socket module framework will verify that no one is using	\
487 	 * the module before unloading. Worst thing that can happen	\
488 	 * here is multiple calls to mod_remove_by_name(), which is OK.	\
489 	 */								\
490 	if ((smodp)->smod_refcnt == 0)					\
491 		(void) mod_remove_by_name((sp)->sp_smod_name);		\
492 }
493 
494 /* Increase the sockparams reference count (sp_refcnt) under sp_lock */
495 #define	SOCKPARAMS_INC_REF(sp) {					\
496 	ASSERT((sp) != NULL);						\
497 	DTRACE_PROBE1(sockparams__inc__ref, struct sockparams *, (sp));	\
498 	mutex_enter(&(sp)->sp_lock);					\
499 	(sp)->sp_refcnt++;						\
500 	ASSERT((sp)->sp_refcnt != 0);	/* catches counter wrap-around */ \
501 	mutex_exit(&(sp)->sp_lock);					\
502 }
503 
504 /*
505  * Decrease the reference count.
506  *
507  * If the sockparams is ephemeral, then the thread dropping the last ref
508  * count will destroy the entry.
 *
 * In that ephemeral last-reference case sp_refcnt is left at 1 and sp_lock
 * is released before sockparams_ephemeral_drop_last_ref() is called.
 * For a non-ephemeral entry the last release also drops the reference on
 * the socket module (SMOD_DEC_REF) and clears sp_smod_info, all while
 * sp_lock is still held.
509  */
510 #define	SOCKPARAMS_DEC_REF(sp) {					\
511 	ASSERT((sp) != NULL);						\
512 	DTRACE_PROBE1(sockparams__dec__ref, struct sockparams *, (sp));	\
513 	mutex_enter(&(sp)->sp_lock);					\
514 	ASSERT((sp)->sp_refcnt > 0);					\
515 	if ((sp)->sp_refcnt == 1) {					\
516 		if ((sp)->sp_flags & SOCKPARAMS_EPHEMERAL) {		\
517 			mutex_exit(&(sp)->sp_lock);			\
518 			sockparams_ephemeral_drop_last_ref((sp));	\
519 		} else {						\
520 			(sp)->sp_refcnt--;				\
521 			if ((sp)->sp_smod_info != NULL)			\
522 				SMOD_DEC_REF(sp, (sp)->sp_smod_info);	\
523 			(sp)->sp_smod_info = NULL;			\
524 			mutex_exit(&(sp)->sp_lock);			\
525 		}							\
526 	} else {							\
527 		(sp)->sp_refcnt--;					\
528 		mutex_exit(&(sp)->sp_lock);				\
529 	}								\
530 }
531 
532 /*
533  * Used to traverse the list of AF_UNIX sockets to construct the kstat
534  * for netstat(1m).
535  */
536 struct socklist {
537 	kmutex_t	sl_lock;
538 	struct sonode	*sl_list;
539 };
540 
541 extern struct socklist socklist;
542 /*
543  * ss_full_waits is the number of times the reader thread
544  * waits when the queue is full and ss_empty_waits is the number
545  * of times the consumer thread waits when the queue is empty.
546  * No locks for these as they are just indicators of whether
547  * disk or network or both is slow or fast.
548  */
549 struct sendfile_stats {
550 	uint32_t ss_file_cached;
551 	uint32_t ss_file_not_cached;
552 	uint32_t ss_full_waits;
553 	uint32_t ss_empty_waits;
554 	uint32_t ss_file_segmap;
555 };
556 
557 /*
558  * A single sendfile request is represented by snf_req.
559  */
560 typedef struct snf_req {
561 	struct snf_req	*sr_next;
562 	mblk_t		*sr_mp_head;
563 	mblk_t		*sr_mp_tail;
564 	kmutex_t	sr_lock;
565 	kcondvar_t	sr_cv;
566 	uint_t		sr_qlen;
567 	int		sr_hiwat;
568 	int		sr_lowat;
569 	int		sr_operation;
570 	struct vnode	*sr_vp;
571 	file_t 		*sr_fp;
572 	ssize_t		sr_maxpsz;
573 	u_offset_t	sr_file_off;
574 	u_offset_t	sr_file_size;
575 #define	SR_READ_DONE	0x80000000
576 	int		sr_read_error;
577 	int		sr_write_error;
578 } snf_req_t;
579 
580 /* A queue of sendfile requests */
581 struct sendfile_queue {
582 	snf_req_t	*snfq_req_head;
583 	snf_req_t	*snfq_req_tail;
584 	kmutex_t	snfq_lock;
585 	kcondvar_t	snfq_cv;
586 	int		snfq_svc_threads;	/* # of service threads */
587 	int		snfq_idle_cnt;		/* # of idling threads */
588 	int		snfq_max_threads;
589 	int		snfq_req_cnt;		/* Number of requests */
590 };
591 
592 #define	READ_OP			1
593 #define	SNFQ_TIMEOUT		(60 * 5 * hz)	/* 5 minutes */
594 
595 /* Socket network operations switch; invoked through the SOP_* macros below */
596 struct sonodeops {
597 	int 	(*sop_init)(struct sonode *, struct sonode *, cred_t *,
598 		    int);
599 	int	(*sop_accept)(struct sonode *, int, cred_t *, struct sonode **);
600 	int	(*sop_bind)(struct sonode *, struct sockaddr *, socklen_t,
601 		    int, cred_t *);
602 	int	(*sop_listen)(struct sonode *, int, cred_t *);
603 	int	(*sop_connect)(struct sonode *, const struct sockaddr *,
604 		    socklen_t, int, int, cred_t *);
605 	int	(*sop_recvmsg)(struct sonode *, struct msghdr *,
606 		    struct uio *, cred_t *);
607 	int	(*sop_sendmsg)(struct sonode *, struct msghdr *,
608 		    struct uio *, cred_t *);
609 	int	(*sop_sendmblk)(struct sonode *, struct msghdr *, int,
610 		    cred_t *, mblk_t **);
611 	int	(*sop_getpeername)(struct sonode *, struct sockaddr *,
612 		    socklen_t *, boolean_t, cred_t *);
613 	int	(*sop_getsockname)(struct sonode *, struct sockaddr *,
614 		    socklen_t *, cred_t *);
615 	int	(*sop_shutdown)(struct sonode *, int, cred_t *);
616 	int	(*sop_getsockopt)(struct sonode *, int, int, void *,
617 		    socklen_t *, int, cred_t *);
618 	int 	(*sop_setsockopt)(struct sonode *, int, int, const void *,
619 		    socklen_t, cred_t *);
620 	int 	(*sop_ioctl)(struct sonode *, int, intptr_t, int,
621 		    cred_t *, int32_t *);
622 	int 	(*sop_poll)(struct sonode *, short, int, short *,
623 		    struct pollhead **);
624 	int 	(*sop_close)(struct sonode *, int, cred_t *);
625 };
626 
/*
 * Dispatch wrappers: SOP_X(so, ...) calls (so)->so_ops->sop_x with so as
 * the first argument and the remaining arguments passed through unchanged.
 * NOTE(review): the second parameter of SOP_INIT is named "flag", but
 * sop_init takes a struct sonode * in that position.
 */
627 #define	SOP_INIT(so, flag, cr, flags)	\
628 	((so)->so_ops->sop_init((so), (flag), (cr), (flags)))
629 #define	SOP_ACCEPT(so, fflag, cr, nsop)	\
630 	((so)->so_ops->sop_accept((so), (fflag), (cr), (nsop)))
631 #define	SOP_BIND(so, name, namelen, flags, cr)	\
632 	((so)->so_ops->sop_bind((so), (name), (namelen), (flags), (cr)))
633 #define	SOP_LISTEN(so, backlog, cr)	\
634 	((so)->so_ops->sop_listen((so), (backlog), (cr)))
635 #define	SOP_CONNECT(so, name, namelen, fflag, flags, cr)	\
636 	((so)->so_ops->sop_connect((so), (name), (namelen), (fflag), (flags), \
637 	(cr)))
638 #define	SOP_RECVMSG(so, msg, uiop, cr)	\
639 	((so)->so_ops->sop_recvmsg((so), (msg), (uiop), (cr)))
640 #define	SOP_SENDMSG(so, msg, uiop, cr)	\
641 	((so)->so_ops->sop_sendmsg((so), (msg), (uiop), (cr)))
642 #define	SOP_SENDMBLK(so, msg, size, cr, mpp)	\
643 	((so)->so_ops->sop_sendmblk((so), (msg), (size), (cr), (mpp)))
644 #define	SOP_GETPEERNAME(so, addr, addrlen, accept, cr)	\
645 	((so)->so_ops->sop_getpeername((so), (addr), (addrlen), (accept), (cr)))
646 #define	SOP_GETSOCKNAME(so, addr, addrlen, cr)	\
647 	((so)->so_ops->sop_getsockname((so), (addr), (addrlen), (cr)))
648 #define	SOP_SHUTDOWN(so, how, cr)	\
649 	((so)->so_ops->sop_shutdown((so), (how), (cr)))
650 #define	SOP_GETSOCKOPT(so, level, optionname, optval, optlenp, flags, cr) \
651 	((so)->so_ops->sop_getsockopt((so), (level), (optionname),	\
652 	    (optval), (optlenp), (flags), (cr)))
653 #define	SOP_SETSOCKOPT(so, level, optionname, optval, optlen, cr)	\
654 	((so)->so_ops->sop_setsockopt((so), (level), (optionname),	\
655 	    (optval), (optlen), (cr)))
656 #define	SOP_IOCTL(so, cmd, arg, mode, cr, rvalp)	\
657 	((so)->so_ops->sop_ioctl((so), (cmd), (arg), (mode), (cr), (rvalp)))
658 #define	SOP_POLL(so, events, anyyet, reventsp, phpp) \
659 	((so)->so_ops->sop_poll((so), (events), (anyyet), (reventsp), (phpp)))
660 #define	SOP_CLOSE(so, flag, cr)	\
661 	((so)->so_ops->sop_close((so), (flag), (cr)))
662 
663 #endif /* defined(_KERNEL) || defined(_KMEMUSER) */
664 
665 #ifdef _KERNEL
666 
/* Non-zero if addr is a multiple of _CMSG_HDR_ALIGNMENT. */
667 #define	ISALIGNED_cmsghdr(addr) \
668 		(((uintptr_t)(addr) & (_CMSG_HDR_ALIGNMENT - 1)) == 0)
669 
/*
 * Round len up to a multiple of _CMSG_HDR_ALIGNMENT. Like the check above,
 * the mask arithmetic assumes _CMSG_HDR_ALIGNMENT is a power of two.
 */
670 #define	ROUNDUP_cmsglen(len) \
671 	(((len) + _CMSG_HDR_ALIGNMENT - 1) & ~(_CMSG_HDR_ALIGNMENT - 1))
672 
/* Non-zero if vp is a VSOCK vnode that has no stream attached. */
673 #define	IS_NON_STREAM_SOCK(vp) \
674 	((vp)->v_type == VSOCK && (vp)->v_stream == NULL)
/*
 * Macros that operate on struct cmsghdr.
 * Used in parsing msg_control.
 * The CMSG_VALID macro does not assume that the last option buffer is padded.
 *
 * CMSG_NEXT(cmsg)	 header that follows cmsg, i.e. cmsg advanced by
 *			 cmsg_len rounded up with ROUNDUP_cmsglen. The result
 *			 must be checked with CMSG_VALID before it is used.
 * CMSG_CONTENT(cmsg)	 address of the first byte after the header.
 * CMSG_CONTENTLEN(cmsg) cmsg_len minus the header size; only meaningful
 *			 once CMSG_VALID has confirmed cmsg_len >= header size.
 * CMSG_VALID(cmsg, start, end)
 *			 non-zero if cmsg is aligned, lies in [start, end),
 *			 and cmsg_len covers at least a header and does not
 *			 run past end.
 *
 * The upper bound is tested as "cmsg_len <= end - cmsg" rather than
 * "cmsg + cmsg_len <= end": the addition can wrap around when cmsg_len is
 * as wide as a pointer, which would let an oversized cmsg_len pass the
 * check. The subtraction cannot wrap because cmsg < end has already been
 * established by the preceding term.
 */
#define	CMSG_NEXT(cmsg)						\
	((struct cmsghdr *)((uintptr_t)(cmsg) +			\
	    ROUNDUP_cmsglen((cmsg)->cmsg_len)))
#define	CMSG_CONTENT(cmsg)	(&((cmsg)[1]))
#define	CMSG_CONTENTLEN(cmsg)	((cmsg)->cmsg_len - sizeof (struct cmsghdr))
#define	CMSG_VALID(cmsg, start, end)					\
	(ISALIGNED_cmsghdr(cmsg) &&					\
	((uintptr_t)(cmsg) >= (uintptr_t)(start)) &&			\
	((uintptr_t)(cmsg) < (uintptr_t)(end)) &&			\
	((ssize_t)(cmsg)->cmsg_len >= sizeof (struct cmsghdr)) &&	\
	((cmsg)->cmsg_len <= (uintptr_t)(end) - (uintptr_t)(cmsg)))
691 
692 /*
693  * Maximum size of any argument that is copied in (addresses, options,
694  * access rights). MUST be at least MAXPATHLEN + 3.
695  * BSD and SunOS 4.X limited this to MLEN or MCLBYTES.
696  */
697 #define	SO_MAXARGSIZE	8192
698 
699 /*
700  * Convert between vnode and sonode
701  */
702 #define	VTOSO(vp)	((struct sonode *)((vp)->v_data))
703 #define	SOTOV(sp)	((sp)->so_vnode)
704 
705 /*
706  * Internal flags for sobind()
707  */
708 #define	_SOBIND_REBIND		0x01	/* Bind to existing local address */
709 #define	_SOBIND_UNSPEC		0x02	/* Bind to unspecified address */
710 #define	_SOBIND_LOCK_HELD	0x04	/* so_excl_lock held by caller */
711 #define	_SOBIND_NOXLATE		0x08	/* No addr translation for AF_UNIX */
712 #define	_SOBIND_XPG4_2		0x10	/* xpg4.2 semantics */
713 #define	_SOBIND_SOCKBSD		0x20	/* BSD semantics */
714 #define	_SOBIND_LISTEN		0x40	/* Make into SS_ACCEPTCONN */
715 #define	_SOBIND_SOCKETPAIR	0x80	/* Internal flag for so_socketpair() */
716 					/* to enable listen with backlog = 1 */
717 
718 /*
719  * Internal flags for sounbind()
720  */
721 #define	_SOUNBIND_REBIND	0x01	/* Don't clear fields - will rebind */
722 
723 /*
724  * Internal flags for soconnect()
725  */
726 #define	_SOCONNECT_NOXLATE	0x01	/* No addr translation for AF_UNIX */
727 #define	_SOCONNECT_DID_BIND	0x02	/* Unbind when connect fails */
728 #define	_SOCONNECT_XPG4_2	0x04	/* xpg4.2 semantics */
729 
730 /*
731  * Internal flags for sodisconnect()
732  */
733 #define	_SODISCONNECT_LOCK_HELD	0x01	/* so_excl_lock held by caller */
734 
735 /*
736  * Internal flags for sotpi_getsockopt().
737  */
738 #define	_SOGETSOCKOPT_XPG4_2	0x01	/* xpg4.2 semantics */
739 
740 /*
741  * Internal flags for soallocproto*()
742  */
743 #define	_ALLOC_NOSLEEP		0	/* Don't sleep for memory */
744 #define	_ALLOC_INTR		1	/* Sleep until interrupt */
745 #define	_ALLOC_SLEEP		2	/* Sleep forever */
746 
747 /*
748  * Internal structure for handling AF_UNIX file descriptor passing
 *
 * fd_fds is declared with one element but holds fd_numfd entries;
 * FDBUF_HDRSIZE is the size of the structure without that one declared
 * element. NOTE(review): the allocation is presumably
 * FDBUF_HDRSIZE + fd_numfd * sizeof (struct file *) - confirm in callers.
749  */
750 struct fdbuf {
751 	int		fd_size;	/* In bytes, for kmem_free */
752 	int		fd_numfd;	/* Number of elements below */
753 	char		*fd_ebuf;	/* Extra buffer to free  */
754 	int		fd_ebuflen;	/* presumably the size of fd_ebuf */
755 	frtn_t		fd_frtn;
756 	struct file	*fd_fds[1];	/* One or more */
757 };
758 #define	FDBUF_HDRSIZE	(sizeof (struct fdbuf) - sizeof (struct file *))
759 
760 /*
761  * Variable that can be patched to set what version of socket socket()
762  * will create.
763  */
764 extern int so_default_version;
765 
766 #ifdef DEBUG
767 /* Turn on extra testing capabilities */
768 #define	SOCK_TEST
769 #endif /* DEBUG */
770 
771 #ifdef DEBUG
772 char	*pr_state(uint_t, uint_t);
773 char	*pr_addr(int, struct sockaddr *, t_uscalar_t);
774 int	so_verify_oobstate(struct sonode *);
775 #endif /* DEBUG */
776 
777 /*
778  * DEBUG macros
 *
 * The "args" parameter of eprint/eprintso is pasted directly after
 * "printf", so it must be a complete parenthesized argument list, e.g.
 * eprint(("fmt %d\n", x)). eprintso prints only when sockprinterr is set
 * and the socket has SO_DEBUG in so_options.
779  */
780 #if defined(DEBUG)
781 #define	SOCK_DEBUG
782 
783 extern int sockdebug;
784 extern int sockprinterr;
785 
786 #define	eprint(args)	printf args
787 #define	eprintso(so, args) \
788 { if (sockprinterr && ((so)->so_options & SO_DEBUG)) printf args; }
/*
 * Print the error number together with the source line and file, unless the
 * error is EINTR. Output is produced only when sockprinterr is non-zero or
 * sockdebug > 0. The argument is parenthesized in the comparison so that
 * an expression argument (e.g. a conditional expression) is compared as a
 * whole; note that it is still evaluated more than once.
 */
#define	eprintline(error)					\
{								\
	if ((error) != EINTR && (sockprinterr || sockdebug > 0))	\
		printf("socket error %d: line %d file %s\n",	\
			(error), __LINE__, __FILE__);		\
}
795 
/*
 * eprintsoline: like eprintline but also prints the sonode pointer, and only
 * when sockprinterr is set and the socket has SO_DEBUG in so_options.
 * dprint/dprintso: print when sockdebug is greater than "level" (dprintso
 * additionally requires SO_DEBUG on the socket); "args" must be a
 * parenthesized printf argument list.
 */
796 #define	eprintsoline(so, error)					\
797 { if (sockprinterr && ((so)->so_options & SO_DEBUG))		\
798 	printf("socket(%p) error %d: line %d file %s\n",	\
799 		(void *)(so), (error), __LINE__, __FILE__);	\
800 }
801 #define	dprint(level, args)	{ if (sockdebug > (level)) printf args; }
802 #define	dprintso(so, level, args) \
803 { if (sockdebug > (level) && ((so)->so_options & SO_DEBUG)) printf args; }
804 
805 #else /* !defined(DEBUG) */
806 
/* Non-DEBUG builds: every debug macro expands to an empty block. */
807 #define	eprint(args)		{}
808 #define	eprintso(so, args)	{}
809 #define	eprintline(error)	{}
810 #define	eprintsoline(so, error)	{}
811 #define	dprint(level, args)	{}
812 #define	dprintso(so, level, args) {}
813 
814 #endif /* defined(DEBUG) */
815 
816 extern struct vfsops			sock_vfsops;
817 extern struct vnodeops			*socket_vnodeops;
818 extern const struct fs_operation_def	socket_vnodeops_template[];
819 
820 extern dev_t				sockdev;
821 
822 /*
823  * sockfs functions
824  */
825 extern int	sock_getmsg(vnode_t *, struct strbuf *, struct strbuf *,
826 			uchar_t *, int *, int, rval_t *);
827 extern int	sock_putmsg(vnode_t *, struct strbuf *, struct strbuf *,
828 			uchar_t, int, int);
829 extern int	sogetvp(char *, vnode_t **, int);
830 extern int	sockinit(int, char *);
831 extern int	soconfig(int, int, int,	char *, int, char *);
832 extern int	solookup(int, int, int, struct sockparams **);
833 extern void	so_lock_single(struct sonode *);
834 extern void	so_unlock_single(struct sonode *, int);
835 extern int	so_lock_read(struct sonode *, int);
836 extern int	so_lock_read_intr(struct sonode *, int);
837 extern void	so_unlock_read(struct sonode *);
838 extern void	*sogetoff(mblk_t *, t_uscalar_t, t_uscalar_t, uint_t);
839 extern void	so_getopt_srcaddr(void *, t_uscalar_t,
840 			void **, t_uscalar_t *);
841 extern int	so_getopt_unix_close(void *, t_uscalar_t);
842 extern void	fdbuf_free(struct fdbuf *);
843 extern mblk_t	*fdbuf_allocmsg(int, struct fdbuf *);
844 extern int	fdbuf_create(void *, int, struct fdbuf **);
845 extern void	so_closefds(void *, t_uscalar_t, int, int);
846 extern int	so_getfdopt(void *, t_uscalar_t, int, void **, int *);
847 t_uscalar_t	so_optlen(void *, t_uscalar_t, int);
848 extern void	so_cmsg2opt(void *, t_uscalar_t, int, mblk_t *);
849 extern t_uscalar_t
850 		so_cmsglen(mblk_t *, void *, t_uscalar_t, int);
851 extern int	so_opt2cmsg(mblk_t *, void *, t_uscalar_t, int,
852 			void *, t_uscalar_t);
853 extern void	soisconnecting(struct sonode *);
854 extern void	soisconnected(struct sonode *);
855 extern void	soisdisconnected(struct sonode *, int);
856 extern void	socantsendmore(struct sonode *);
857 extern void	socantrcvmore(struct sonode *);
858 extern void	soseterror(struct sonode *, int);
859 extern int	sogeterr(struct sonode *, boolean_t);
860 extern int	sowaitconnected(struct sonode *, int, int);
861 
862 extern ssize_t	soreadfile(file_t *, uchar_t *, u_offset_t, int *, size_t);
863 extern void	*sock_kstat_init(zoneid_t);
864 extern void	sock_kstat_fini(zoneid_t, void *);
865 extern struct sonode *getsonode(int, int *, file_t **);
866 /*
867  * Function wrappers (mostly around the sonode switch) for
868  * backward compatibility.
869  */
870 extern int	soaccept(struct sonode *, int, struct sonode **);
871 extern int	sobind(struct sonode *, struct sockaddr *, socklen_t,
872 		    int, int);
873 extern int	solisten(struct sonode *, int);
874 extern int	soconnect(struct sonode *, const struct sockaddr *, socklen_t,
875 		    int, int);
876 extern int	sorecvmsg(struct sonode *, struct nmsghdr *, struct uio *);
877 extern int	sosendmsg(struct sonode *, struct nmsghdr *, struct uio *);
878 extern int	soshutdown(struct sonode *, int);
879 extern int	sogetsockopt(struct sonode *, int, int, void *, socklen_t *,
880 		    int);
881 extern int	sosetsockopt(struct sonode *, int, int, const void *,
882 		    t_uscalar_t);
883 
884 extern struct sonode	*socreate(struct sockparams *, int, int, int, int,
885 			    int *);
886 
887 extern int	so_copyin(const void *, void *, size_t, int);
888 extern int	so_copyout(const void *, void *, size_t, int);
889 
890 #endif /* _KERNEL */
891 
892 /*
893  * Internal structure for obtaining sonode information from the socklist.
894  * These types match those corresponding in the sonode structure.
895  * This is not a published interface, and may change at any time.
896  */
897 struct sockinfo {
898 	uint_t		si_size;		/* real length of this struct */
899 	short		si_family;
900 	short		si_type;
901 	ushort_t	si_flag;
902 	uint_t		si_state;
903 	uint_t		si_ux_laddr_sou_magic;
904 	uint_t		si_ux_faddr_sou_magic;
905 	t_scalar_t	si_serv_type;
906 	t_uscalar_t	si_laddr_soa_len;
907 	t_uscalar_t	si_faddr_soa_len;
908 	uint16_t	si_laddr_family;
909 	uint16_t	si_faddr_family;
910 	char		si_laddr_sun_path[MAXPATHLEN + 1]; /* NUL terminated */
911 	char		si_faddr_sun_path[MAXPATHLEN + 1]; /* presumably NUL terminated too */
912 	boolean_t	si_faddr_noxlate;
913 	zoneid_t	si_szoneid;
914 };
915 
916 #define	SOCKMOD_PATH	"socketmod"	/* dir where sockmods are stored */
917 
918 #ifdef	__cplusplus
919 }
920 #endif
921 
922 #endif	/* _SYS_SOCKETVAR_H */
923