xref: /illumos-gate/usr/src/uts/common/sys/poll_impl.h (revision 76c08ae9d10f4e0b653a6ea98c06a7868246164b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2003 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Copyright 2017 Joyent, Inc.
29  * Copyright 2022 Oxide Computer Company
30  */
31 
32 #ifndef _SYS_POLL_IMPL_H
33 #define	_SYS_POLL_IMPL_H
34 
35 /*
36  * Caching Poll Subsystem:
37  *
38  * Each kernel thread (1), if engaged in poll system call, has a reference to
39  * a pollstate_t (2), which contains relevant flags and locks.  The pollstate_t
40  * contains a pointer to a pollcache_t (3), which caches the state of previous
41  * calls to poll.  A bitmap (4) is stored inside the poll cache, where each
42  * bit represents a file descriptor.  The bits are set if the corresponding
43  * device has a polled event pending.  Only fds with their bit set will be
44  * examined on the next poll invocation.  The pollstate_t also contains a list
45  * of fd sets (5), which are represented by the pollcacheset_t type.  These
46  * structures keep track of the pollfd_t arrays (6) passed in from userland.
47  * Each polled file descriptor has a corresponding polldat_t which can be
48  * chained onto a device's pollhead, and these are kept in a hash table (7)
49  * inside the pollcache_t.  The hash table allows efficient conversion of a
50  * given fd to its corresponding polldat_t.
51  *
52  * (1)              (2)
53  * +-----------+    +-------------+
54  * | kthread_t |--->| pollstate_t |-->+-------------+  (6)
55  * +-----------+    +-------------+(5)| pcacheset_t |->[_][_][_][_] pollfd_t
56  *                          |         +-------------+
57  *                          |         | pcacheset_t |->[_][_][_][_] pollfd_t
58  * (1a)                     |         +-------------+
59  * +---------------+	    |
60  * | /dev/poll tbl |	    |
61  * +-v-------------+	    |
62  *   |			    |
63  *   +------------------+   |
64  * (7)              (3) V   v
65  * polldat hash     +-------------+    (4) bitmap representing fd space
66  * [_][_][_][_]<----|             |--->000010010010001010101010101010110
67  *  |  |  |  |      | pollcache_t |
68  *  .  v  .  .      |             |
69  *    [polldat_t]   +-------------+
70  *     |
71  *    [polldat_t]
72  *     |
73  *     v
74  *     NULL
75  *
76  *
77  * Both poll system call and /dev/poll use the pollcache_t structure
78  * definition and the routines managing the structure. But poll(2) and
79  * /dev/poll have their own copy of the structures. The /dev/poll driver
80  * table (1a) contains an array of pointers, each pointing at a pollcache_t
81  * struct (3). A device minor number is used as a device table index.
82  *
83  */
84 #include <sys/poll.h>
85 
86 #if defined(_KERNEL) || defined(_KMEMUSER)
87 
88 #include <sys/thread.h>
89 #include <sys/file.h>
90 #include <sys/port_kernel.h>
91 
92 #ifdef	__cplusplus
93 extern "C" {
94 #endif
95 
96 /*
97  * Forward declarations and typedefs for the structures defined below.
98  */
99 struct pollcache;
100 struct pollstate;
101 struct pcachelink;
102 struct polldat;
103 
104 typedef struct pollcache pollcache_t;
105 typedef struct pollstate pollstate_t;
106 typedef struct pcachelink pcachelink_t;
107 typedef struct polldat polldat_t;
108 
109 /*
110  * A pollcacheset keeps track of one pollfd_t array passed in from userland.
111  */
112 typedef struct pollcacheset {
113 	uintptr_t	pcs_usradr;	/* user address of the pollfd array */
114 	pollfd_t	*pcs_pollfd;	/* cached poll lists */
115 	size_t		pcs_nfds;	/* number of pollfds in cached list */
116 	ulong_t		pcs_count;	/* for LU replacement policy */
117 } pollcacheset_t;
118 
119 #define	POLLFDSETS	2
120 
121 /*
122  * Maximum depth for recursive poll operations.
123  */
124 #define	POLLMAXDEPTH	5
125 
126 /*
127  * State information kept by each polling thread
128  */
129 struct pollstate {
130 	pollfd_t	*ps_pollfd;	/* holds the current poll list */
131 	size_t		ps_nfds;	/* size of ps_pollfd */
132 	kmutex_t	ps_lock;	/* mutex for sleep/wakeup */
133 	pollcache_t	*ps_pcache;	/* cached poll fd set */
134 	pollcacheset_t	*ps_pcacheset;	/* cached poll lists */
135 	int		ps_nsets;	/* number of cached poll sets */
136 	pollfd_t	*ps_dpbuf;	/* return pollfd buf used by devpoll */
137 	size_t		ps_dpbufsize;	/* size of ps_dpbuf */
138 	int		ps_depth;	/* epoll recursion depth */
139 	pollcache_t	*ps_pc_stack[POLLMAXDEPTH]; /* epoll recursion state */
140 	pollcache_t	*ps_contend_pc;		/* pollcache waited on */
141 	pollstate_t	*ps_contend_nextp;	/* next in contender list */
142 	pollstate_t	**ps_contend_pnextp;	/* pointer-to-previous-next */
143 	int		ps_flags;	/* state flags (POLLSTATE_*) */
144 };
145 
146 /* pollstate flags (values for ps_flags) */
147 #define	POLLSTATE_STALEMATE	0x1
148 #define	POLLSTATE_ULFAIL	0x2
149 
150 /* pollstate_enter() return values */
151 #define	PSE_SUCCESS		0
152 #define	PSE_FAIL_DEPTH		1
153 #define	PSE_FAIL_LOOP		2
154 #define	PSE_FAIL_DEADLOCK	3
155 #define	PSE_FAIL_POLLSTATE	4
156 
157 /*
158  * poll cache hash table sizing parameters
159  */
160 #define	POLLCHUNKSHIFT		8	/* hash table increment size is 256 */
161 #define	POLLHASHCHUNKSZ		(1 << POLLCHUNKSHIFT)
162 #define	POLLHASHINC		2	/* poll hash table growth factor */
163 #define	POLLHASHTHRESHOLD	2	/* poll hash list length threshold */
164 #define	POLLHASH(x, y)	((y) % (x))	/* poll hash function: y modulo x */
165 
166 /*
167  * poll.c assumes that POLLMAPCHUNK is a power of 2
168  */
169 #define	POLLMAPCHUNK	2048	/* bitmap inc -- each for 2K of polled fd's */
170 
171 /*
172  * Used to reference from a watched fd back to the fd position in the cached
173  * poll list for quick revents update.
174  */
175 typedef struct xref {
176 	ssize_t	xf_position;    /* xref fd position in poll fd list */
177 	short	xf_refcnt;	/* ref cnt of same fd in poll list */
178 } xref_t;
179 
180 #define	POLLPOSINVAL	(-1L)	/* xf_position is invalid */
181 #define	POLLPOSTRANS	(-2L)	/* xf_position is in a transient state */
182 
183 
184 typedef enum pclstate {
185 	PCL_INIT = 0,	/* just allocated/zeroed, prior to linking */
186 	PCL_VALID,	/* linked with both parent and child pollcaches */
187 	PCL_STALE,	/* still linked but marked stale, pending refresh */
188 	PCL_INVALID,	/* dissociated from one pollcache, awaiting cleanup */
189 	PCL_FREE	/* only meant to indicate use-after-free */
190 } pclstate_t;
191 
192 /*
193  * The pcachelink struct creates an association between parent and child
194  * pollcaches in a recursive /dev/poll operation.  Fields are protected by
195  * pcl_lock, although manipulation of pcl_child_next or pcl_parent_next also
196  * requires holding pc_lock in the respective pcl_parent_pc or pcl_child_pc
197  * pollcache.
198  */
199 struct pcachelink {
200 	kmutex_t	pcl_lock;		/* protects contents */
201 	pclstate_t	pcl_state;		/* status of link entry */
202 	int		pcl_refcnt;		/* ref cnt of linked pcaches */
203 	pollcache_t	*pcl_child_pc;		/* child pollcache */
204 	pollcache_t	*pcl_parent_pc;		/* parent pollcache */
205 	pcachelink_t	*pcl_child_next;	/* next in child list */
206 	pcachelink_t	*pcl_parent_next;	/* next in parent list */
207 };
208 
209 
210 /*
211  * polldat is an entry for a cached poll fd. A polldat struct can be in
212  * poll cache table as well as on pollhead ph_list, which is used by
213  * pollwakeup to wake up a sleeping poller. There should be one polldat
214  * per polled fd hanging off pollstate struct.
215  */
216 struct polldat {
217 	int		pd_fd;		/* cached poll fd */
218 	int		pd_events;	/* union of all polled events */
219 	file_t		*pd_fp;		/* used to detect fd reuse */
220 	pollhead_t	*pd_php;	/* used to undo poll registration */
221 	kthread_t	*pd_thread;	/* used to wake up a sleeping thread */
222 	pollcache_t	*pd_pcache;	/* a ptr to the pollcache of this fd */
223 	polldat_t	*pd_next;	/* next on pollhead's ph_list */
224 	polldat_t	*pd_hashnext;	/* next in pollcache hash chain */
225 	int		pd_count;	/* total count from all ref'ed sets */
226 	int		pd_nsets;	/* num of xref sets, used by poll(2) */
227 	xref_t		*pd_ref;	/* ptr to xref info, 1 for each set */
228 	port_kevent_t	*pd_portev;	/* associated port event struct */
229 	uf_entry_gen_t	pd_gen;		/* fd generation at cache time */
230 	uint64_t	pd_epolldata;	/* epoll data, if any */
231 };
232 
233 /*
234  * One cache for each thread that polls. Points to a bitmap (used by pollwakeup)
235  * and a hash table of polldats.
236  *
237  * Because of the handling required in pollrelock(), portfs abuses the notion of
238  * an active pollcache (t_pollcache), providing its own struct port_fdcache_t.
239  * It has matching pc_lock and pc_flag members at the correct offsets, but none
240  * of its other fields can be accessed (through t_pollcache) safely.
241  */
242 struct pollcache {
243 	kmutex_t	pc_lock;	/* lock to protect pollcache */
244 	ulong_t		*pc_bitmap;	/* point to poll fd bitmap */
245 	polldat_t	**pc_hash;	/* points to a hash table of ptrs */
246 	int		pc_mapend;	/* the largest fd encountered so far */
247 	int		pc_mapsize;	/* the size of current map */
248 	int		pc_hashsize;	/* the size of current hash table */
249 	int		pc_fdcount;	/* track how many fd's are hashed */
250 	int		pc_flag;	/* see pc_flag defines below */
251 	int		pc_busy;	/* can only exit when it's 0 */
252 	kmutex_t	pc_no_exit;	/* protects pc_busy*, can't be nested */
253 	kcondvar_t	pc_busy_cv;	/* cv to wait on if pc_busy != 0 */
254 	kcondvar_t	pc_cv;		/* cv to wait on if needed */
255 	pid_t		pc_pid;		/* for check acc rights, devpoll only */
256 	int		pc_mapstart;	/* where search start, devpoll only */
257 	pcachelink_t	*pc_parents;	/* linked list of epoll parents */
258 	pcachelink_t	*pc_children;	/* linked list of epoll children */
259 };
260 
261 /* pc_flag values */
262 #define	PC_POLLWAKE	0x02	/* pollwakeup() occurred */
263 #define	PC_EPOLL	0x04	/* pollcache is epoll-enabled */
264 /*
265  * PC_PORTFS is not a flag for "real" pollcaches, but rather an indicator for
266  * when portfs sets t_pollcache to a port_fdcache_t pointer.  If, while
267  * debugging a system, one sees PC_PORTFS in pc_flag, they will know to
268  * disregard the other fields, as it is not a pollcache.
269  */
270 #define	PC_PORTFS	0x08
271 
272 #if defined(_KERNEL)
273 /*
274  * Internal routines.
275  */
276 extern void pollnotify(pollcache_t *, int);
277 
278 /*
279  * public poll head interfaces (see poll.h):
280  *
281  *  pollhead_clean      cleans up all polldats on a pollhead list
282  */
283 extern void pollhead_clean(pollhead_t *);
284 
285 /*
286  * private poll head interfaces:
287  *
288  *  polldat_associate		adds a polldat to a pollhead list
289  *  polldat_disassociate	removes polldat from its assoc'd pollhead list
290  */
291 extern void polldat_associate(polldat_t *, pollhead_t *);
292 extern void polldat_disassociate(polldat_t *);
293 
294 /*
295  * poll state interfaces:
296  *
297  *  pollstate_create	initializes per-thread pollstate
298  *  pollstate_destroy	cleans up per-thread pollstate
299  *  pollstate_enter	safely locks pollcache for pollstate
300  *  pollstate_exit	unlocks pollcache from pollstate
301  */
302 extern pollstate_t *pollstate_create(void);
303 extern void pollstate_destroy(pollstate_t *);
304 extern int pollstate_enter(pollcache_t *);
305 extern void pollstate_exit(pollcache_t *);
306 
307 /*
308  * public pcache interfaces:
309  *
310  *  pcache_alloc	allocates a poll cache skeleton (takes no arguments)
311  *  pcache_create       creates all poll cache supporting data struct
312  *  pcache_insert	cache a poll fd, calls pcache_insert_fd
313  *  pcache_lookup       given an fd list, returns a cookie
314  *  pcache_poll         polls the cache for fd's having events on them
315  *  pcache_clean        clean up all the pollhead and fpollinfo reference
316  *  pcache_destroy      destroys the pcache
317  */
318 extern pollcache_t *pcache_alloc(void);
319 extern void pcache_create(pollcache_t *, nfds_t);
320 extern int pcache_insert(pollstate_t *, file_t *, pollfd_t *, int *, ssize_t,
321     int);
322 extern int pcache_poll(pollfd_t *, pollstate_t *, nfds_t, int *, int);
323 extern void pcache_clean(pollcache_t *);
324 extern void pcache_destroy(pollcache_t *);
325 
326 /*
327  * private pcache interfaces:
328  *
329  *  pcache_lookup_fd	lookup an fd, returns a polldat
330  *  pcache_alloc_fd	allocates and returns a polldat
331  *  pcache_insert_fd	insert an fd into pcache (called by pcache_insert)
332  *  pcache_delete_fd	delete an fd from pcache (called by pcacheset_delete_fd)
333  *  pcache_grow_hashtbl	grows the pollcache hash table and rehash
334  *  pcache_grow_map	grows the pollcache bitmap
335  *  pcache_update_xref	update cross ref (from polldat back to cacheset) info
336  *  pcache_clean_entry	cleanup an entry in pcache and more...
337  *  pcache_wake_parents	wake linked parent pollcaches
338  */
339 extern polldat_t *pcache_lookup_fd(pollcache_t *, int);
340 extern polldat_t *pcache_alloc_fd(int);
341 extern void pcache_insert_fd(pollcache_t *, polldat_t *, nfds_t);
342 extern int pcache_delete_fd(pollstate_t *, int, size_t, int, uint_t);
343 extern void pcache_grow_hashtbl(pollcache_t *, nfds_t);
344 extern void pcache_grow_map(pollcache_t *, int);
345 extern void pcache_update_xref(pollcache_t *, int, ssize_t, int);
346 extern void pcache_clean_entry(pollstate_t *, int);
347 extern void pcache_wake_parents(pollcache_t *);
348 
349 /*
350  * pcacheset interfaces:
351  *
352  * pcacheset_create     creates new pcachesets (easier for dynamic pcachesets)
353  * pcacheset_destroy    destroys a pcacheset
354  * pcacheset_cache_list caches and polls a new poll list
355  * pcacheset_remove_list removes (usually a partial) cached poll list
356  * pcacheset_resolve    resolves extant pcacheset and fd list
357  * pcacheset_cmp        compares a pcacheset with an fd list
358  * pcacheset_invalidate invalidates entries in pcachesets
359  * pcacheset_reset_count resets the usage counter of pcachesets
360  * pcacheset_replace	selects a poll cacheset for replacement
361  */
362 extern pollcacheset_t *pcacheset_create(int);
363 extern void pcacheset_destroy(pollcacheset_t *, int);
364 extern int pcacheset_cache_list(pollstate_t *, pollfd_t *, int *, int);
365 extern void pcacheset_remove_list(pollstate_t *, pollfd_t *, int, int, int,
366     int);
367 extern int pcacheset_resolve(pollstate_t *, nfds_t, int *, int);
368 extern int pcacheset_cmp(pollfd_t *, pollfd_t *, pollfd_t *, int);
369 extern void pcacheset_invalidate(pollstate_t *, polldat_t *);
370 extern void pcacheset_reset_count(pollstate_t *, int);
371 extern int pcacheset_replace(pollstate_t *);
372 
373 #endif /* defined(_KERNEL) */
374 
375 #ifdef	__cplusplus
376 }
377 #endif
378 
379 #endif /* defined(_KERNEL) || defined(_KMEMUSER) */
380 
381 #endif	/* _SYS_POLL_IMPL_H */
382