/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License"). You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2003 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright 2017 Joyent, Inc.
 * Copyright 2022 Oxide Computer Company
 */

#ifndef _SYS_POLL_IMPL_H
#define	_SYS_POLL_IMPL_H

/*
 * Caching Poll Subsystem:
 *
 * Each kernel thread (1), if engaged in poll system call, has a reference to
 * a pollstate_t (2), which contains relevant flags and locks. The pollstate_t
 * contains a pointer to a pollcache_t (3), which caches the state of previous
 * calls to poll. A bitmap (4) is stored inside the poll cache, where each
 * bit represents a file descriptor. The bits are set if the corresponding
 * device has a polled event pending. Only fds with their bit set will be
 * examined on the next poll invocation. The pollstate_t also contains a list
 * of fd sets (5), which are represented by the pollcacheset_t type. These
 * structures keep track of the pollfd_t arrays (6) passed in from userland.
 * Each polled file descriptor has a corresponding polldat_t which can be
 * chained onto a device's pollhead, and these are kept in a hash table (7)
 * inside the pollcache_t. The hash table allows efficient conversion of a
 * given fd to its corresponding polldat_t.
 *
 *     (1)                (2)
 * +-----------+    +-------------+
 * | kthread_t |--->| pollstate_t |-->+-------------+  (6)
 * +-----------+    +-------------+(5)| pcacheset_t |->[_][_][_][_] pollfd_t
 *                         |          +-------------+
 *                         |          | pcacheset_t |->[_][_][_][_] pollfd_t
 *      (1a)               |          +-------------+
 * +---------------+       |
 * | /dev/poll tbl |       |
 * +-v-------------+       |
 *   |                     |
 *   +------------------+  |
 *  (7)          (3)    V  v
 * polldat hash     +-------------+  (4) bitmap representing fd space
 * [_][_][_][_]<----|             |--->000010010010001010101010101010110
 *  |  |  |  |      | pollcache_t |
 *  .  v  .  .      |             |
 * [polldat_t]      +-------------+
 *     |
 * [polldat_t]
 *     |
 *     v
 *    NULL
 *
 *
 * Both poll system call and /dev/poll use the pollcache_t structure
 * definition and the routines managing the structure. But poll(2) and
 * /dev/poll have their own copy of the structures. The /dev/poll driver
 * table (1a) contains an array of pointers, each pointing at a pollcache_t
 * struct (3). A device minor number is used as a device table index.
 *
 */
#include <sys/poll.h>

#if defined(_KERNEL) || defined(_KMEMUSER)

#include <sys/thread.h>
#include <sys/file.h>
#include <sys/port_kernel.h>

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Typedefs
 */
struct pollcache;
struct pollstate;
struct pcachelink;
struct polldat;

typedef struct pollcache pollcache_t;
typedef struct pollstate pollstate_t;
typedef struct pcachelink pcachelink_t;
typedef struct polldat polldat_t;

/*
 * description of pollcacheset structure
 */
typedef struct pollcacheset {
	uintptr_t	pcs_usradr;	/* usr pollfd array address */
	pollfd_t	*pcs_pollfd;	/* cached poll lists */
	size_t		pcs_nfds;	/* number of poll fd in cached list */
	ulong_t		pcs_count;	/* for LU replacement policy */
} pollcacheset_t;

#define	POLLFDSETS	2

/*
 * Maximum depth for recursive poll operations.
 */
#define	POLLMAXDEPTH	5

/*
 * State information kept by each polling thread
 */
struct pollstate {
	pollfd_t	*ps_pollfd;	/* hold the current poll list */
	size_t		ps_nfds;	/* size of ps_pollfd */
	kmutex_t	ps_lock;	/* mutex for sleep/wakeup */
	pollcache_t	*ps_pcache;	/* cached poll fd set */
	pollcacheset_t	*ps_pcacheset;	/* cached poll lists */
	int		ps_nsets;	/* no. of cached poll sets */
	pollfd_t	*ps_dpbuf;	/* return pollfd buf used by devpoll */
	size_t		ps_dpbufsize;	/* size of ps_dpbuf */
	int		ps_depth;	/* epoll recursion depth */
	pollcache_t	*ps_pc_stack[POLLMAXDEPTH]; /* epoll recursion state */
	pollcache_t	*ps_contend_pc;	/* pollcache waited on */
	pollstate_t	*ps_contend_nextp; /* next in contender list */
	pollstate_t	**ps_contend_pnextp; /* pointer-to-previous-next */
	int		ps_flags;	/* state flags */
};

/* pollstate flags */
#define	POLLSTATE_STALEMATE	0x1
#define	POLLSTATE_ULFAIL	0x2

/* pollstate_enter results */
#define	PSE_SUCCESS		0
#define	PSE_FAIL_DEPTH		1
#define	PSE_FAIL_LOOP		2
#define	PSE_FAIL_DEADLOCK	3
#define	PSE_FAIL_POLLSTATE	4

/*
 * poll cache size defines
 */
#define	POLLCHUNKSHIFT		8	/* hash table increment size is 256 */
#define	POLLHASHCHUNKSZ		(1 << POLLCHUNKSHIFT)
#define	POLLHASHINC		2	/* poll hash table growth factor */
#define	POLLHASHTHRESHOLD	2	/* poll hash list length threshold */
#define	POLLHASH(x, y)	((y) % (x))	/* poll hash function */

/*
 * poll.c assumes the POLLMAPCHUNK is power of 2
 */
#define	POLLMAPCHUNK	2048	/* bitmap inc -- each for 2K of polled fd's */

/*
 * used to reference from watched fd back to the fd position in cached
 * poll list for quick revents update.
 */
typedef struct xref {
	ssize_t	xf_position;	/* xref fd position in poll fd list */
	short	xf_refcnt;	/* ref cnt of same fd in poll list */
} xref_t;

#define	POLLPOSINVAL	(-1L)	/* xf_position is invalid */
#define	POLLPOSTRANS	(-2L)	/* xf_position is transient state */


typedef enum pclstate {
	PCL_INIT = 0,	/* just allocated/zeroed (presumably prior to linking) */
	PCL_VALID,	/* linked with both parent and child pollcaches */
	PCL_STALE,	/* still linked but marked stale, pending refresh */
	PCL_INVALID,	/* dissociated from one pollcache, awaiting cleanup */
	PCL_FREE	/* only meant to indicate use-after-free */
} pclstate_t;

/*
 * The pcachelink struct creates an association between parent and child
 * pollcaches in a recursive /dev/poll operation. Fields are protected by
 * pcl_lock although manipulation of pcl_child_next or pcl_parent_next also
 * requires holding pc_lock in the respective pcl_parent_pc or pcl_child_pc
 * pollcache.
 */
struct pcachelink {
	kmutex_t	pcl_lock;		/* protects contents */
	pclstate_t	pcl_state;		/* status of link entry */
	int		pcl_refcnt;		/* ref cnt of linked pcaches */
	pollcache_t	*pcl_child_pc;		/* child pollcache */
	pollcache_t	*pcl_parent_pc;		/* parent pollcache */
	pcachelink_t	*pcl_child_next;	/* next in child list */
	pcachelink_t	*pcl_parent_next;	/* next in parents list */
};


/*
 * polldat is an entry for a cached poll fd. A polldat struct can be in
 * poll cache table as well as on pollhead ph_list, which is used by
 * pollwakeup to wake up a sleeping poller. There should be one polldat
 * per polled fd hanging off pollstate struct.
 */
struct polldat {
	int		pd_fd;		/* cached poll fd */
	int		pd_events;	/* union of all polled events */
	file_t		*pd_fp;		/* used to detect fd reuse */
	pollhead_t	*pd_php;	/* used to undo poll registration */
	kthread_t	*pd_thread;	/* used for waking up a sleep thrd */
	pollcache_t	*pd_pcache;	/* a ptr to the pollcache of this fd */
	polldat_t	*pd_next;	/* next on pollhead's ph_list */
	polldat_t	*pd_hashnext;	/* next in pollcache hash chain */
	int		pd_count;	/* total count from all ref'ed sets */
	int		pd_nsets;	/* num of xref sets, used by poll(2) */
	xref_t		*pd_ref;	/* ptr to xref info, 1 for each set */
	port_kevent_t	*pd_portev;	/* associated port event struct */
	uf_entry_gen_t	pd_gen;		/* fd generation at cache time */
	uint64_t	pd_epolldata;	/* epoll data, if any */
};

/*
 * One cache for each thread that polls. Points to a bitmap (used by pollwakeup)
 * and a hash table of polldats.
 *
 * Because of the handling required in pollrelock(), portfs abuses the notion of
 * an active pollcache (t_pollcache), providing its own struct port_fdcache_t.
 * It has matching pc_lock and pc_flag members at the correct offsets, but none
 * of its other fields can be accessed (through t_pollcache) safely.
 */
struct pollcache {
	kmutex_t	pc_lock;	/* lock to protect pollcache */
	ulong_t		*pc_bitmap;	/* point to poll fd bitmap */
	polldat_t	**pc_hash;	/* points to a hash table of ptrs */
	int		pc_mapend;	/* the largest fd encountered so far */
	int		pc_mapsize;	/* the size of current map */
	int		pc_hashsize;	/* the size of current hash table */
	int		pc_fdcount;	/* track how many fd's are hashed */
	int		pc_flag;	/* see pc_flag define below */
	int		pc_busy;	/* can only exit when its 0 */
	kmutex_t	pc_no_exit;	/* protects pc_busy*, can't be nested */
	kcondvar_t	pc_busy_cv;	/* cv to wait on if pc_busy != 0 */
	kcondvar_t	pc_cv;		/* cv to wait on if needed */
	pid_t		pc_pid;		/* for check acc rights, devpoll only */
	int		pc_mapstart;	/* where search start, devpoll only */
	pcachelink_t	*pc_parents;	/* linked list of epoll parents */
	pcachelink_t	*pc_children;	/* linked list of epoll children */
};

/* pc_flag */
#define	PC_POLLWAKE	0x02	/* pollwakeup() occurred */
#define	PC_EPOLL	0x04	/* pollcache is epoll-enabled */
/*
 * PC_PORTFS is not a flag for "real" pollcaches, but rather an indicator for
 * when portfs sets t_pollcache to a port_fdcache_t pointer. If, while
 * debugging a system, one sees PC_PORTFS in pc_flag, they will know to
 * disregard the other fields, as it is not a pollcache.
 */
#define	PC_PORTFS	0x08

#if defined(_KERNEL)
/*
 * Internal routines.
 */
extern void pollnotify(pollcache_t *, int);

/*
 * public poll head interfaces (see poll.h):
 *
 *  pollhead_clean      clean up all polldats on a pollhead list
 */
extern void pollhead_clean(pollhead_t *);

/*
 * private poll head interfaces:
 *
 *  polldat_associate		adds a polldat to a pollhead list
 *  polldat_disassociate	remove polldat from its assoc'd pollhead list
 */
extern void polldat_associate(polldat_t *, pollhead_t *);
extern void polldat_disassociate(polldat_t *);

/*
 * poll state interfaces:
 *
 *  pollstate_create	initializes per-thread pollstate
 *  pollstate_destroy	cleans up per-thread pollstate
 *  pollstate_enter	safely lock pollcache for pollstate
 *  pollstate_exit	unlock pollcache from pollstate
 */
extern pollstate_t *pollstate_create(void);
extern void pollstate_destroy(pollstate_t *);
extern int pollstate_enter(pollcache_t *);
extern void pollstate_exit(pollcache_t *);

/*
 * public pcache interfaces:
 *
 *	pcache_alloc	allocate a poll cache skeleton
 *	pcache_create	creates all poll cache supporting data struct
 *	pcache_insert	cache a poll fd, calls pcache_insert_fd
 *	pcache_lookup	given an fd list, returns a cookie
 *	pcache_poll	polls the cache for fd's having events on them
 *	pcache_clean	clean up all the pollhead and fpollinfo reference
 *	pcache_destroy	destroys the pcache
 */
/* Declared with (void) so this is a real prototype and calls are checked. */
extern pollcache_t *pcache_alloc(void);
extern void pcache_create(pollcache_t *, nfds_t);
extern int pcache_insert(pollstate_t *, file_t *, pollfd_t *, int *, ssize_t,
    int);
extern int pcache_poll(pollfd_t *, pollstate_t *, nfds_t, int *, int);
extern void pcache_clean(pollcache_t *);
extern void pcache_destroy(pollcache_t *);

/*
 * private pcache interfaces:
 *
 *	pcache_lookup_fd	lookup an fd, returns a polldat
 *	pcache_alloc_fd		allocates and returns a polldat
 *	pcache_insert_fd	insert an fd into pcache (called by
 *				pcache_insert)
 *	pcache_delete_fd	delete an fd from pcache (called by
 *				pcacheset_delete_fd)
 *	pcache_grow_hashtbl	grows the pollcache hash table and rehash
 *	pcache_grow_map		grows the pollcache bitmap
 *	pcache_update_xref	update cross ref (from polldat back to
 *				cacheset) info
 *	pcache_clean_entry	cleanup an entry in pcache and more...
 *	pcache_wake_parents	wake linked parent pollcaches
 */
extern polldat_t *pcache_lookup_fd(pollcache_t *, int);
extern polldat_t *pcache_alloc_fd(int);
extern void pcache_insert_fd(pollcache_t *, polldat_t *, nfds_t);
extern int pcache_delete_fd(pollstate_t *, int, size_t, int, uint_t);
extern void pcache_grow_hashtbl(pollcache_t *, nfds_t);
extern void pcache_grow_map(pollcache_t *, int);
extern void pcache_update_xref(pollcache_t *, int, ssize_t, int);
extern void pcache_clean_entry(pollstate_t *, int);
extern void pcache_wake_parents(pollcache_t *);

/*
 * pcacheset interfaces:
 *
 *	pcacheset_create	creates new pcachesets (easier for dynamic
 *				pcachesets)
 *	pcacheset_destroy	destroys a pcacheset
 *	pcacheset_cache_list	caches and polls a new poll list
 *	pcacheset_remove_list	removes (usually a partial) cached poll list
 *	pcacheset_resolve	resolves extant pcacheset and fd list
 *	pcacheset_cmp		compares a pcacheset with an fd list
 *	pcacheset_invalidate	invalidate entries in pcachesets
 *	pcacheset_reset_count	resets the usage counter of pcachesets
 *	pcacheset_replace	selects a poll cacheset for replacement
 */
extern pollcacheset_t *pcacheset_create(int);
extern void pcacheset_destroy(pollcacheset_t *, int);
extern int pcacheset_cache_list(pollstate_t *, pollfd_t *, int *, int);
extern void pcacheset_remove_list(pollstate_t *, pollfd_t *, int, int, int,
    int);
extern int pcacheset_resolve(pollstate_t *, nfds_t, int *, int);
extern int pcacheset_cmp(pollfd_t *, pollfd_t *, pollfd_t *, int);
extern void pcacheset_invalidate(pollstate_t *, polldat_t *);
extern void pcacheset_reset_count(pollstate_t *, int);
extern int pcacheset_replace(pollstate_t *);

#endif /* defined(_KERNEL) */

#ifdef __cplusplus
}
#endif

#endif /* defined(_KERNEL) || defined(_KMEMUSER) */

#endif /* _SYS_POLL_IMPL_H */