xref: /illumos-gate/usr/src/uts/common/fs/portfs/port.c (revision 2e837a72011f54762249b6612c2a64f171efcd43)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/systm.h>
29 #include <sys/cred.h>
30 #include <sys/modctl.h>
31 #include <sys/vfs.h>
32 #include <sys/vfs_opreg.h>
33 #include <sys/sysmacros.h>
34 #include <sys/cmn_err.h>
35 #include <sys/stat.h>
36 #include <sys/errno.h>
37 #include <sys/kmem.h>
38 #include <sys/file.h>
39 #include <sys/kstat.h>
40 #include <sys/port_impl.h>
41 #include <sys/task.h>
42 #include <sys/project.h>
43 
44 /*
45  * Event Ports can be shared across threads or across processes.
46  * Every thread/process can use an own event port or a group of them
47  * can use a single port. A major request was also to get the ability
48  * to submit user-defined events to a port. The idea of the
49  * user-defined events is to use the event ports for communication between
 * threads/processes (like message queues). User-defined events are queued
51  * in a port with the same priority as other event types.
52  *
53  * Events are delivered only once. The thread/process which is waiting
54  * for events with the "highest priority" (priority here is related to the
55  * internal strategy to wakeup waiting threads) will retrieve the event,
56  * all other threads/processes will not be notified. There is also
57  * the requirement to have events which should be submitted immediately
58  * to all "waiting" threads. That is the main task of the alert event.
59  * The alert event is submitted by the application to a port. The port
60  * changes from a standard mode to the alert mode. Now all waiting threads
 * will be awakened immediately and they will return with the alert event.
62  * Threads trying to retrieve events from a port in alert mode will
63  * return immediately with the alert event.
64  *
65  *
66  * An event port is like a kernel queue, which accept events submitted from
67  * user level as well as events submitted from kernel sub-systems. Sub-systems
68  * able to submit events to a port are the so-called "event sources".
69  * Current event sources:
70  * PORT_SOURCE_AIO	 : events submitted per transaction completion from
71  *			   POSIX-I/O framework.
72  * PORT_SOURCE_TIMER	 : events submitted when a timer fires
73  *			   (see timer_create(3RT)).
74  * PORT_SOURCE_FD	 : events submitted per file descriptor (see poll(2)).
75  * PORT_SOURCE_ALERT	 : events submitted from user. This is not really a
76  *			   single event, this is actually a port mode
77  *			   (see port_alert(3c)).
78  * PORT_SOURCE_USER	 : events submitted by applications with
79  *			   port_send(3c) or port_sendn(3c).
80  * PORT_SOURCE_FILE	 : events submitted per file being watched for file
 *			   change events (see port_create(3c)).
82  *
83  * There is a user API implemented in the libc library as well as a
84  * kernel API implemented in port_subr.c in genunix.
85  * The available user API functions are:
86  * port_create() : create a port as a file descriptor of portfs file system
87  *		   The standard close(2) function closes a port.
88  * port_associate() : associate a file descriptor with a port to be able to
89  *		      retrieve events from that file descriptor.
90  * port_dissociate(): remove the association of a file descriptor with a port.
91  * port_alert()	 : set/unset a port in alert mode
92  * port_send()	 : send an event of type PORT_SOURCE_USER to a port
93  * port_sendn()	 : send an event of type PORT_SOURCE_USER to a list of ports
94  * port_get()	 : retrieve a single event from a port
95  * port_getn()	 : retrieve a list of events from a port
96  *
97  * The available kernel API functions are:
 * port_alloc_event()   : allocate an event slot/structure of/from a port
99  * port_init_event()    : set event data in the event structure
100  * port_send_event()    : send event to a port
101  * port_free_event()    : deliver allocated slot/structure back to a port
102  * port_associate_ksource(): associate a kernel event source with a port
103  * port_dissociate_ksource(): dissociate a kernel event source from a port
104  *
105  * The libc implementation consists of small functions which pass the
106  * arguments to the kernel using the "portfs" system call. It means, all the
107  * synchronisation work is being done in the kernel. The "portfs" system
108  * call loads the portfs file system into the kernel.
109  *
110  * PORT CREATION
111  * The first function to be used is port_create() which internally creates
112  * a vnode and a portfs node. The portfs node is represented by the port_t
113  * structure, which again includes all the data necessary to control a port.
114  * port_create() returns a file descriptor, which needs to be used in almost
115  * all other event port functions.
116  * The maximum number of ports per system is controlled by the resource
 * control: project.port-max-ids.
118  *
119  * EVENT GENERATION
120  * The second step is the triggering of events, which could be sent to a port.
121  * Every event source implements an own method to generate events for a port:
122  * PORT_SOURCE_AIO:
123  *	The sigevent structure of the standard POSIX-IO functions
124  *	was extended by an additional notification type.
125  *	Standard notification types:
126  *	SIGEV_NONE, SIGEV_SIGNAL and SIGEV_THREAD
127  *	Event ports introduced now SIGEV_PORT.
128  *	The notification type SIGEV_PORT specifies that a structure
129  *	of type port_notify_t has to be attached to the sigev_value.
130  *	The port_notify_t structure contains the event port file
131  *	descriptor and a user-defined pointer.
132  *	Internally the AIO implementation will use the kernel API
133  *	functions to allocate an event port slot per transaction (aiocb)
 *	and send the event to the port as soon as the transaction completes.
135  *	All the events submitted per transaction are of type
136  *	PORT_SOURCE_AIO.
137  * PORT_SOURCE_TIMER:
138  *	The timer_create() function uses the same method as the
139  *	PORT_SOURCE_AIO event source. It also uses the sigevent structure
140  *	to deliver the port information.
141  *	Internally the timer code will allocate a single event slot/struct
142  *	per timer and it will send the timer event as soon as the timer
143  *	fires. If the timer-fired event is not delivered to the application
144  *	before the next period elapsed, then an overrun counter will be
145  *	incremented. The timer event source uses a callback function to
146  *	detect the delivery of the event to the application. At that time
147  *	the timer callback function will update the event overrun counter.
148  * PORT_SOURCE_FD:
149  *	This event source uses the port_associate() function to allocate
150  *	an event slot/struct from a port. The application defines in the
151  *	events argument of port_associate() the type of events which it is
 *	interested in.
153  *	The internal pollwakeup() function is used by all the file
154  *	systems --which are supporting the VOP_POLL() interface- to notify
155  *	the upper layer (poll(2), devpoll(7d) and now event ports) about
156  *	the event triggered (see valid events in poll(2)).
157  *	The pollwakeup() function forwards the event to the layer registered
158  *	to receive the current event.
159  *	The port_dissociate() function can be used to free the allocated
160  *	event slot from the port. Anyway, file descriptors deliver events
161  *	only one time and remain deactivated until the application
162  *	reactivates the association of a file descriptor with port_associate().
163  *	If an associated file descriptor is closed then the file descriptor
164  *	will be dissociated automatically from the port.
165  *
166  * PORT_SOURCE_ALERT:
167  *	This event type is generated when the port was previously set in
168  *	alert mode using the port_alert() function.
169  *	A single alert event is delivered to every thread which tries to
170  *	retrieve events from a port.
171  * PORT_SOURCE_USER:
172  *	This type of event is generated from user level using the port_send()
173  *	function to send a user event to a port or the port_sendn() function
174  *	to send an event to a list of ports.
175  * PORT_SOURCE_FILE:
176  *	This event source uses the port_associate() interface to register
177  *	a file to be monitored for changes. The file name that needs to be
178  *	monitored is specified in the file_obj_t structure, a pointer to which
179  *	is passed as an argument. The event types to be monitored are specified
180  *	in the events argument.
181  *	A file events monitor is represented internal per port per object
182  *	address(the file_obj_t pointer). Which means there can be multiple
183  *	watches registered on the same file using different file_obj_t
184  *	structure pointer. With the help of the	FEM(File Event Monitoring)
185  *	hooks, the file's vnode ops are intercepted and relevant events
186  *	delivered. The port_dissociate() function is used to de-register a
187  *	file events monitor on a file. When the specified file is
188  *	removed/renamed, the file events watch/monitor is automatically
189  *	removed.
190  *
191  * EVENT DELIVERY / RETRIEVING EVENTS
192  * Events remain in the port queue until:
193  * - the application uses port_get() or port_getn() to retrieve events,
 * - the event source cancels the event,
195  * - the event port is closed or
196  * - the process exits.
197  * The maximal number of events in a port queue is the maximal number
198  * of event slots/structures which can be allocated by event sources.
199  * The allocation of event slots/structures is controlled by the resource
200  * control: process.port-max-events.
201  * The port_get() function retrieves a single event and the port_getn()
202  * function retrieves a list of events.
203  * Events are classified as shareable and non-shareable events across processes.
204  * Non-shareable events are invisible for the port_get(n)() functions of
205  * processes other than the owner of the event.
206  *    Shareable event types are:
207  *    PORT_SOURCE_USER events
208  *	This type of event is unconditionally shareable and without
209  *	limitations. If the parent process sends a user event and closes
210  *	the port afterwards, the event remains in the port and the child
211  *	process will still be able to retrieve the user event.
212  *    PORT_SOURCE_ALERT events
213  *	This type of event is shareable between processes.
214  *	Limitation:	The alert mode of the port is removed if the owner
215  *			(process which set the port in alert mode) of the
216  *			alert event closes the port.
217  *    PORT_SOURCE_FD events
218  *	This type of event is conditional shareable between processes.
219  *	After fork(2) all forked file descriptors are shareable between
220  *	the processes. The child process is allowed to retrieve events
221  *	from the associated file descriptors and it can also re-associate
222  *	the fd with the port.
223  *	Limitations:	The child process is not allowed to dissociate
224  *			the file descriptor from the port. Only the
225  *			owner (process) of the association is allowed to
226  *			dissociate the file descriptor from the port.
227  *			If the owner of the association closes the port
228  *			the association will be removed.
229  *    PORT_SOURCE_AIO events
230  *	This type of event is not shareable between processes.
231  *    PORT_SOURCE_TIMER events
232  *	This type of event is not shareable between processes.
233  *    PORT_SOURCE_FILE events
234  *	This type of event is not shareable between processes.
235  *
236  * FORK BEHAVIOUR
237  * On fork(2) the child process inherits all opened file descriptors from
238  * the parent process. This is also valid for port file descriptors.
239  * Associated file descriptors with a port maintain the association across the
240  * fork(2). It means, the child process gets full access to the port and
241  * it can retrieve events from all common associated file descriptors.
242  * Events of file descriptors created and associated with a port after the
243  * fork(2) are non-shareable and can only be retrieved by the same process.
244  *
245  * If the parent or the child process closes an exported port (using fork(2)
246  * or I_SENDFD) all the file descriptors associated with the port by the
247  * process will be dissociated from the port. Events of dissociated file
248  * descriptors as well as all non-shareable events will be discarded.
249  * The other process can continue working with the port as usual.
250  *
251  * CLOSING A PORT
252  * close(2) has to be used to close a port. See FORK BEHAVIOUR for details.
253  *
254  * PORT EVENT STRUCTURES
255  * The global control structure of the event ports framework is port_control_t.
256  * port_control_t keeps track of the number of created ports in the system.
257  * The cache of the port event structures is also located in port_control_t.
258  *
259  * On port_create() the vnode and the portfs node is also created.
260  * The portfs node is represented by the port_t structure.
261  * The port_t structure manages all port specific tasks:
262  * - management of resource control values
263  * - port VOP_POLL interface
264  * - creation time
265  * - uid and gid of the port
266  *
267  * The port_t structure contains the port_queue_t structure.
268  * The port_queue_t structure contains all the data necessary for the
269  * queue management:
270  * - locking
271  * - condition variables
272  * - event counters
273  * - submitted events	(represented by port_kevent_t structures)
274  * - threads waiting for event delivery (check portget_t structure)
275  * - PORT_SOURCE_FD cache	(managed by the port_fdcache_t structure)
276  * - event source management (managed by the port_source_t structure)
277  * - alert mode management	(check port_alert_t structure)
278  *
279  * EVENT MANAGEMENT
280  * The event port file system creates a kmem_cache for internal allocation of
281  * event port structures.
282  *
283  * 1. Event source association with a port:
284  * The first step to do for event sources is to get associated with a port
285  * using the port_associate_ksource() function or adding an entry to the
286  * port_ksource_tab[]. An event source can get dissociated from a port
287  * using the port_dissociate_ksource() function. An entry in the
288  * port_ksource_tab[] implies that the source will be associated
289  * automatically with every new created port.
290  * The event source can deliver a callback function, which is used by the
291  * port to notify the event source about close(2). The idea is that
292  * in such a case the event source should free all allocated resources
293  * and it must return to the port all allocated slots/structures.
294  * The port_close() function will wait until all allocated event
295  * structures/slots are returned to the port.
296  * The callback function is not necessary when the event source does not
297  * maintain local resources, a second condition is that the event source
298  * can guarantee that allocated event slots will be returned without
299  * delay to the port (it will not block and sleep somewhere).
300  *
301  * 2. Reservation of an event slot / event structure
302  * The event port reliability is based on the reservation of an event "slot"
303  * (allocation of an event structure) by the event source as part of the
304  * application call. If the maximal number of event slots is exhausted then
305  * the event source can return a corresponding error code to the application.
306  *
307  * The port_alloc_event() function has to be used by event sources to
308  * allocate an event slot (reserve an event structure). The port_alloc_event()
 * does not block and it will return a 0 value on success or an error code
310  * if it fails.
311  * An argument of port_alloc_event() is a flag which determines the behavior
312  * of the event after it was delivered to the application:
313  * PORT_ALLOC_DEFAULT	: event slot becomes free after delivery to the
314  *			  application.
315  * PORT_ALLOC_PRIVATE	: event slot remains under the control of the event
316  *			  source. This kind of slots can not be used for
317  *			  event delivery and should only be used internally
318  *			  by the event source.
 * PORT_ALLOC_CACHED	: event slot remains under the control of an event
320  *			  port cache. It does not become free after delivery
321  *			  to the application.
322  * PORT_ALLOC_SCACHED	: event slot remains under the control of the event
323  *			  source. The event source takes the control over
324  *			  the slot after the event is delivered to the
325  *			  application.
326  *
327  * 3. Delivery of events to the event port
328  * Earlier allocated event structure/slot has to be used to deliver
329  * event data to the port. Event source has to use the function
330  * port_send_event(). The single argument is a pointer to the previously
331  * reserved event structure/slot.
332  * The portkev_events field of the port_kevent_t structure can be updated/set
333  * in two ways:
334  * 1. using the port_set_event() function, or
335  * 2. updating the portkev_events field out of the callback function:
336  *    The event source can deliver a callback function to the port as an
337  *    argument of port_init_event().
338  *    One of the arguments of the callback function is a pointer to the
339  *    events field, which will be delivered to the application.
340  *    (see Delivery of events to the application).
341  * Event structures/slots can be delivered to the event port only one time,
342  * they remain blocked until the data is delivered to the application and the
343  * slot becomes free or it is delivered back to the event source
344  * (PORT_ALLOC_SCACHED). The activation of the callback function mentioned above
345  * is at the same time the indicator for the event source that the event
346  * structure/slot is free for reuse.
347  *
348  * 4. Delivery of events to the application
349  * The events structures/slots delivered by event sources remain in the
350  * port queue until they are retrieved by the application or the port
 * is closed (exit(2) also closes all opened file descriptors).
352  * The application uses port_get() or port_getn() to retrieve events from
353  * a port. port_get() retrieves a single event structure/slot and port_getn()
354  * retrieves a list of event structures/slots.
355  * Both functions are able to poll for events and return immediately or they
356  * can specify a timeout value.
357  * Before the events are delivered to the application they are moved to a
358  * second temporary internal queue. The idea is to avoid lock collisions or
359  * contentions of the global queue lock.
360  * The global queue lock is used every time when an event source delivers
361  * new events to the port.
362  * The port_get() and port_getn() functions
363  * a) retrieve single events from the temporary queue,
364  * b) prepare the data to be passed to the application memory,
365  * c) activate the callback function of the event sources:
366  *    - to get the latest event data,
367  *    - the event source can free all allocated resources associated with the
368  *      current event,
369  *    - the event source can re-use the current event slot/structure
370  *    - the event source can deny the delivery of the event to the application
371  *      (e.g. because of the wrong process).
372  * d) put the event back to the temporary queue if the event delivery was denied
373  * e) repeat a) until d) as long as there are events in the queue and
374  *    there is enough user space available.
375  *
 * The loop described above could block the global mutex for a very long
 * time; to avoid that, a second mutex was introduced to synchronize
 * concurrent threads accessing the temporary queue.
 */
379  */
380 
/*
 * Forward declaration of the native system call handler. It is referenced
 * by port_sysent below and defined further down in this file.
 */
static int64_t portfs(int, uintptr_t, uintptr_t, uintptr_t, uintptr_t,
    uintptr_t);

/*
 * System call table entry for portfs(). The leading 6 matches the six
 * parameters of portfs() (the opcode plus five opcode-specific values).
 * NOTE(review): SE_64RVAL presumably corresponds to the int64_t return
 * value and SE_NOUNLOAD to the module staying loaded once installed;
 * confirm against the sysent flag definitions.
 */
static struct sysent port_sysent = {
	6,
	SE_ARGC | SE_64RVAL | SE_NOUNLOAD,
	(int (*)())portfs,
};
389 
/*
 * Module linkage element that ties the system call entry port_sysent to
 * mod_syscallops; "event ports" is the module description string.
 */
static struct modlsys modlsys = {
	&mod_syscallops, "event ports", &port_sysent
};
393 
#ifdef _SYSCALL32_IMPL

/*
 * Forward declaration of the handler used for 32-bit callers. It adjusts
 * the 32-bit arguments and forwards them to portfs().
 */
static int64_t
portfs32(uint32_t arg1, int32_t arg2, uint32_t arg3, uint32_t arg4,
    uint32_t arg5, uint32_t arg6);

/*
 * System call table entry for portfs32(); same argument count and flags
 * as the native entry.
 */
static struct sysent port_sysent32 = {
	6,
	SE_ARGC | SE_64RVAL | SE_NOUNLOAD,
	(int (*)())portfs32,
};

/* Module linkage element for the 32-bit system call entry. */
static struct modlsys modlsys32 = {
	&mod_syscallops32,
	"32-bit event ports syscalls",
	&port_sysent32
};
#endif	/* _SYSCALL32_IMPL */
412 
/*
 * Linkage list handed to mod_install() and mod_info(): the native system
 * call element, the 32-bit element when _SYSCALL32_IMPL is defined, and a
 * terminating NULL.
 */
static struct modlinkage modlinkage = {
	MODREV_1,
	&modlsys,
#ifdef _SYSCALL32_IMPL
	&modlsys32,
#endif
	NULL
};
421 
/*
 * Named kstat data for event ports: a single 32-bit value called "ports".
 * port_create() increments it each time a port is created.
 */
port_kstat_t port_kstat = {
	{ "ports",	KSTAT_DATA_UINT32 }
};
425 
/* device number of the port pseudo device; built in _init() */
dev_t	portdev;
/* vnode operations for port vnodes; created in _init() by vn_make_ops() */
struct	vnodeops *port_vnodeops;
/* dummy vfs (flagged VFS_RDONLY in _init()) that every port vnode refers to */
struct	vfs port_vfs;

/* resource control handles consulted by port_create() */
extern	rctl_hndl_t rc_process_portev;
extern	rctl_hndl_t rc_project_portids;
/* function registered for PORT_SOURCE_AIO in port_ksource_tab[] */
extern	void aio_close_port(void *, int, pid_t, int);
433 
/*
 * This table contains a list of event sources which need a static
 * association with a port (every port).
 * Currently the only entry is PORT_SOURCE_AIO, with aio_close_port() as
 * its registered function.
 * The last NULL entry in the table is required to detect "end of table".
 */
struct port_ksource port_ksource_tab[] = {
	{PORT_SOURCE_AIO, aio_close_port, NULL, NULL},
	{0, NULL, NULL, NULL}
};
443 
/*
 * local functions
 * Forward declarations of the file-local helpers used by the system call
 * entry point portfs().
 */
static int port_getn(port_t *, port_event_t *, uint_t, uint_t *,
    port_gettimer_t *);
static int port_sendn(int [], int [], uint_t, int, void *, uint_t *);
static int port_alert(port_t *, int, int, void *);
static int port_dispatch_event(port_t *, int, int, int, uintptr_t, void *);
static int port_send(port_t *, int, int, void *);
static int port_create(int *);
static int port_get_alert(port_alert_t *, port_event_t *);
static int port_copy_event(port_event_t *, port_kevent_t *, list_t *);
static int *port_errorn(int *, int, int, int);
static int port_noshare(void *, int *, pid_t, int, void *);
static int port_get_timeout(timespec_t *, timespec_t *, timespec_t **, int *,
    int);
static void port_init(port_t *);
static void port_remove_alert(port_queue_t *);
static void port_add_ksource_local(port_t *, port_ksource_t *);
static void port_check_return_cond(port_queue_t *);
static void port_dequeue_thread(port_queue_t *, portget_t *);
static portget_t *port_queue_thread(port_queue_t *, uint_t);
static void port_kstat_init(void);

/* variant of port_copy_event() taking a port_event32_t destination */
#ifdef	_SYSCALL32_IMPL
static int port_copy_event32(port_event32_t *, port_kevent_t *, list_t *);
#endif
469 
470 int
471 _init(void)
472 {
473 	static const fs_operation_def_t port_vfsops_template[] = {
474 		NULL, NULL
475 	};
476 	extern const	fs_operation_def_t port_vnodeops_template[];
477 	vfsops_t	*port_vfsops;
478 	int		error;
479 	major_t		major;
480 
481 	if ((major = getudev()) == (major_t)-1)
482 		return (ENXIO);
483 	portdev = makedevice(major, 0);
484 
485 	/* Create a dummy vfs */
486 	error = vfs_makefsops(port_vfsops_template, &port_vfsops);
487 	if (error) {
488 		cmn_err(CE_WARN, "port init: bad vfs ops");
489 		return (error);
490 	}
491 	vfs_setops(&port_vfs, port_vfsops);
492 	port_vfs.vfs_flag = VFS_RDONLY;
493 	port_vfs.vfs_dev = portdev;
494 	vfs_make_fsid(&(port_vfs.vfs_fsid), portdev, 0);
495 
496 	error = vn_make_ops("portfs", port_vnodeops_template, &port_vnodeops);
497 	if (error) {
498 		vfs_freevfsops(port_vfsops);
499 		cmn_err(CE_WARN, "port init: bad vnode ops");
500 		return (error);
501 	}
502 
503 	mutex_init(&port_control.pc_mutex, NULL, MUTEX_DEFAULT, NULL);
504 	port_control.pc_nents = 0;	/* number of active ports */
505 
506 	/* create kmem_cache for port event structures */
507 	port_control.pc_cache = kmem_cache_create("port_cache",
508 	    sizeof (port_kevent_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
509 
510 	port_kstat_init();		/* init port kstats */
511 	return (mod_install(&modlinkage));
512 }
513 
514 int
515 _info(struct modinfo *modinfop)
516 {
517 	return (mod_info(&modlinkage, modinfop));
518 }
519 
/*
 * Entry point for port system calls issued by 32-bit programs.
 *
 * The arguments are handed on to portfs(); only two opcodes need special
 * treatment of individual arguments:
 * PORT_GET	: a2 and a3 are converted to int before being passed on
 *		  (portfs() uses them as the timeout's tv_sec/tv_nsec), and
 *		  the opcode passed on is PORT_GET itself.
 * PORT_SENDN	: a0 is converted to uint32_t before being passed on.
 * Every other opcode is forwarded with its arguments as they are.
 */
#ifdef _SYSCALL32_IMPL
static int64_t
portfs32(uint32_t opcode, int32_t a0, uint32_t a1, uint32_t a2, uint32_t a3,
    uint32_t a4)
{
	switch (opcode & PORT_CODE_MASK) {
	case PORT_GET:
		return (portfs(PORT_GET, a0, a1, (int)a2, (int)a3, a4));
	case PORT_SENDN:
		return (portfs(opcode, (uint32_t)a0, a1, a2, a3, a4));
	default:
		return (portfs(opcode, a0, a1, a2, a3, a4));
	}
}
#endif	/* _SYSCALL32_IMPL */
544 
545 /*
546  * System entry point for port functions.
547  * a0 is a port file descriptor (except for PORT_SENDN and PORT_CREATE).
548  * The libc uses PORT_SYS_NOPORT in functions which do not deliver a
549  * port file descriptor as first argument.
550  */
551 static int64_t
552 portfs(int opcode, uintptr_t a0, uintptr_t a1, uintptr_t a2, uintptr_t a3,
553     uintptr_t a4)
554 {
555 	rval_t		r;
556 	port_t		*pp;
557 	int		error = 0;
558 	uint_t		nget;
559 	file_t		*fp;
560 	port_gettimer_t	port_timer;
561 
562 	r.r_vals = 0;
563 	if (opcode & PORT_SYS_NOPORT) {
564 		opcode &= PORT_CODE_MASK;
565 		if (opcode == PORT_SENDN) {
566 			error = port_sendn((int *)a0, (int *)a1, (uint_t)a2,
567 			    (int)a3, (void *)a4, (uint_t *)&r.r_val1);
568 			if (error && (error != EIO))
569 				return ((int64_t)set_errno(error));
570 			return (r.r_vals);
571 		}
572 
573 		if (opcode == PORT_CREATE) {
574 			error = port_create(&r.r_val1);
575 			if (error)
576 				return ((int64_t)set_errno(error));
577 			return (r.r_vals);
578 		}
579 	}
580 
581 	/* opcodes using port as first argument (a0) */
582 
583 	if ((fp = getf((int)a0)) == NULL)
584 		return ((uintptr_t)set_errno(EBADF));
585 
586 	if (fp->f_vnode->v_type != VPORT) {
587 		releasef((int)a0);
588 		return ((uintptr_t)set_errno(EBADFD));
589 	}
590 
591 	pp = VTOEP(fp->f_vnode);
592 
593 	switch (opcode & PORT_CODE_MASK) {
594 	case	PORT_GET:
595 	{
596 		/* see PORT_GETN description */
597 		struct	timespec timeout;
598 
599 		port_timer.pgt_flags = PORTGET_ONE;
600 		port_timer.pgt_loop = 0;
601 		port_timer.pgt_rqtp = NULL;
602 		if (a4 != 0) {
603 			port_timer.pgt_timeout = &timeout;
604 			timeout.tv_sec = (time_t)a2;
605 			timeout.tv_nsec = (long)a3;
606 		} else {
607 			port_timer.pgt_timeout = NULL;
608 		}
609 		do {
610 			nget = 1;
611 			error = port_getn(pp, (port_event_t *)a1, 1,
612 			    (uint_t *)&nget, &port_timer);
613 		} while (nget == 0 && error == 0 && port_timer.pgt_loop);
614 		break;
615 	}
616 	case	PORT_GETN:
617 	{
618 		/*
619 		 * port_getn() can only retrieve own or shareable events from
620 		 * other processes. The port_getn() function remains in the
621 		 * kernel until own or shareable events are available or the
622 		 * timeout elapses.
623 		 */
624 		port_timer.pgt_flags = 0;
625 		port_timer.pgt_loop = 0;
626 		port_timer.pgt_rqtp = NULL;
627 		port_timer.pgt_timeout = (struct timespec *)a4;
628 		do {
629 			nget = a3;
630 			error = port_getn(pp, (port_event_t *)a1, (uint_t)a2,
631 			    (uint_t *)&nget, &port_timer);
632 		} while (nget == 0 && error == 0 && port_timer.pgt_loop);
633 		r.r_val1 = nget;
634 		r.r_val2 = error;
635 		releasef((int)a0);
636 		if (error && error != ETIME)
637 			return ((int64_t)set_errno(error));
638 		return (r.r_vals);
639 	}
640 	case	PORT_ASSOCIATE:
641 	{
642 		switch ((int)a1) {
643 		case PORT_SOURCE_FD:
644 			error = port_associate_fd(pp, (int)a1, (uintptr_t)a2,
645 			    (int)a3, (void *)a4);
646 			break;
647 		case PORT_SOURCE_FILE:
648 			error = port_associate_fop(pp, (int)a1, (uintptr_t)a2,
649 			    (int)a3, (void *)a4);
650 			break;
651 		default:
652 			error = EINVAL;
653 			break;
654 		}
655 		break;
656 	}
657 	case	PORT_SEND:
658 	{
659 		/* user-defined events */
660 		error = port_send(pp, PORT_SOURCE_USER, (int)a1, (void *)a2);
661 		break;
662 	}
663 	case	PORT_DISPATCH:
664 	{
665 		/*
666 		 * library events, blocking
667 		 * Only events of type PORT_SOURCE_AIO or PORT_SOURCE_MQ
668 		 * are currently allowed.
669 		 */
670 		if ((int)a1 != PORT_SOURCE_AIO && (int)a1 != PORT_SOURCE_MQ) {
671 			error = EINVAL;
672 			break;
673 		}
674 		error = port_dispatch_event(pp, (int)opcode, (int)a1, (int)a2,
675 		    (uintptr_t)a3, (void *)a4);
676 		break;
677 	}
678 	case	PORT_DISSOCIATE:
679 	{
680 		switch ((int)a1) {
681 		case PORT_SOURCE_FD:
682 			error = port_dissociate_fd(pp, (uintptr_t)a2);
683 			break;
684 		case PORT_SOURCE_FILE:
685 			error = port_dissociate_fop(pp, (uintptr_t)a2);
686 			break;
687 		default:
688 			error = EINVAL;
689 			break;
690 		}
691 		break;
692 	}
693 	case	PORT_ALERT:
694 	{
695 		if ((int)a2)	/* a2 = events */
696 			error = port_alert(pp, (int)a1, (int)a2, (void *)a3);
697 		else
698 			port_remove_alert(&pp->port_queue);
699 		break;
700 	}
701 	default:
702 		error = EINVAL;
703 		break;
704 	}
705 
706 	releasef((int)a0);
707 	if (error)
708 		return ((int64_t)set_errno(error));
709 	return (r.r_vals);
710 }
711 
712 /*
713  * System call to create a port.
714  *
715  * The port_create() function creates a vnode of type VPORT per port.
716  * The port control data is associated with the vnode as vnode private data.
717  * The port_create() function returns an event port file descriptor.
718  */
719 static int
720 port_create(int *fdp)
721 {
722 	port_t		*pp;
723 	vnode_t		*vp;
724 	struct file	*fp;
725 	proc_t		*p = curproc;
726 
727 	/* initialize vnode and port private data */
728 	pp = kmem_zalloc(sizeof (port_t), KM_SLEEP);
729 
730 	pp->port_vnode = vn_alloc(KM_SLEEP);
731 	vp = EPTOV(pp);
732 	vn_setops(vp, port_vnodeops);
733 	vp->v_type = VPORT;
734 	vp->v_vfsp = &port_vfs;
735 	vp->v_data = (caddr_t)pp;
736 
737 	mutex_enter(&port_control.pc_mutex);
738 	/*
739 	 * Retrieve the maximal number of event ports allowed per system from
740 	 * the resource control: project.port-max-ids.
741 	 */
742 	mutex_enter(&p->p_lock);
743 	if (rctl_test(rc_project_portids, p->p_task->tk_proj->kpj_rctls, p,
744 	    port_control.pc_nents + 1, RCA_SAFE) & RCT_DENY) {
745 		mutex_exit(&p->p_lock);
746 		vn_free(vp);
747 		kmem_free(pp, sizeof (port_t));
748 		mutex_exit(&port_control.pc_mutex);
749 		return (EAGAIN);
750 	}
751 
752 	/*
753 	 * Retrieve the maximal number of events allowed per port from
754 	 * the resource control: process.port-max-events.
755 	 */
756 	pp->port_max_events = rctl_enforced_value(rc_process_portev,
757 	    p->p_rctls, p);
758 	mutex_exit(&p->p_lock);
759 
760 	/* allocate a new user file descriptor and a file structure */
761 	if (falloc(vp, 0, &fp, fdp)) {
762 		/*
763 		 * If the file table is full, free allocated resources.
764 		 */
765 		vn_free(vp);
766 		kmem_free(pp, sizeof (port_t));
767 		mutex_exit(&port_control.pc_mutex);
768 		return (EMFILE);
769 	}
770 
771 	mutex_exit(&fp->f_tlock);
772 
773 	pp->port_fd = *fdp;
774 	port_control.pc_nents++;
775 	p->p_portcnt++;
776 	port_kstat.pks_ports.value.ui32++;
777 	mutex_exit(&port_control.pc_mutex);
778 
779 	/* initializes port private data */
780 	port_init(pp);
781 	/* set user file pointer */
782 	setf(*fdp, fp);
783 	return (0);
784 }
785 
786 /*
787  * port_init() initializes event port specific data
788  */
789 static void
790 port_init(port_t *pp)
791 {
792 	port_queue_t	*portq;
793 	port_ksource_t	*pks;
794 
795 	mutex_init(&pp->port_mutex, NULL, MUTEX_DEFAULT, NULL);
796 	portq = &pp->port_queue;
797 	mutex_init(&portq->portq_mutex, NULL, MUTEX_DEFAULT, NULL);
798 	pp->port_flags |= PORT_INIT;
799 
800 	/*
801 	 * If it is not enough memory available to satisfy a user
802 	 * request using a single port_getn() call then port_getn()
803 	 * will reduce the size of the list to PORT_MAX_LIST.
804 	 */
805 	pp->port_max_list = port_max_list;
806 
807 	/* Set timestamp entries required for fstat(2) requests */
808 	gethrestime(&pp->port_ctime);
809 	pp->port_uid = crgetuid(curproc->p_cred);
810 	pp->port_gid = crgetgid(curproc->p_cred);
811 
812 	/* initialize port queue structs */
813 	list_create(&portq->portq_list, sizeof (port_kevent_t),
814 	    offsetof(port_kevent_t, portkev_node));
815 	list_create(&portq->portq_get_list, sizeof (port_kevent_t),
816 	    offsetof(port_kevent_t, portkev_node));
817 	portq->portq_flags = 0;
818 	pp->port_pid = curproc->p_pid;
819 
820 	/* Allocate cache skeleton for PORT_SOURCE_FD events */
821 	portq->portq_pcp = kmem_zalloc(sizeof (port_fdcache_t), KM_SLEEP);
822 	mutex_init(&portq->portq_pcp->pc_lock, NULL, MUTEX_DEFAULT, NULL);
823 
824 	/*
825 	 * Allocate cache skeleton for association of event sources.
826 	 */
827 	mutex_init(&portq->portq_source_mutex, NULL, MUTEX_DEFAULT, NULL);
828 	portq->portq_scache = kmem_zalloc(
829 	    PORT_SCACHE_SIZE * sizeof (port_source_t *), KM_SLEEP);
830 
831 	/*
832 	 * pre-associate some kernel sources with this port.
833 	 * The pre-association is required to create port_source_t
834 	 * structures for object association.
835 	 * Some sources can not get associated with a port before the first
836 	 * object association is requested. Another reason to pre_associate
837 	 * a particular source with a port is because of performance.
838 	 */
839 
840 	for (pks = port_ksource_tab; pks->pks_source != 0; pks++)
841 		port_add_ksource_local(pp, pks);
842 }
843 
844 /*
845  * The port_add_ksource_local() function is being used to associate
846  * event sources with every new port.
847  * The event sources need to be added to port_ksource_tab[].
848  */
849 static void
850 port_add_ksource_local(port_t *pp, port_ksource_t *pks)
851 {
852 	port_source_t	*pse;
853 	port_source_t	**ps;
854 
855 	mutex_enter(&pp->port_queue.portq_source_mutex);
856 	ps = &pp->port_queue.portq_scache[PORT_SHASH(pks->pks_source)];
857 	for (pse = *ps; pse != NULL; pse = pse->portsrc_next) {
858 		if (pse->portsrc_source == pks->pks_source)
859 			break;
860 	}
861 
862 	if (pse == NULL) {
863 		/* associate new source with the port */
864 		pse = kmem_zalloc(sizeof (port_source_t), KM_SLEEP);
865 		pse->portsrc_source = pks->pks_source;
866 		pse->portsrc_close = pks->pks_close;
867 		pse->portsrc_closearg = pks->pks_closearg;
868 		pse->portsrc_cnt = 1;
869 
870 		pks->pks_portsrc = pse;
871 		if (*ps != NULL)
872 			pse->portsrc_next = (*ps)->portsrc_next;
873 		*ps = pse;
874 	}
875 	mutex_exit(&pp->port_queue.portq_source_mutex);
876 }
877 
878 /*
879  * The port_send() function sends an event of type "source" to a
880  * port. This function is non-blocking. An event can be sent to
881  * a port as long as the number of events per port does not achieve the
882  * maximal allowed number of events. The max. number of events per port is
883  * defined by the resource control process.max-port-events.
884  * This function is used by the port library function port_send()
885  * and port_dispatch(). The port_send(3c) function is part of the
886  * event ports API and submits events of type PORT_SOURCE_USER. The
887  * port_dispatch() function is project private and it is used by library
888  * functions to submit events of other types than PORT_SOURCE_USER
889  * (e.g. PORT_SOURCE_AIO).
890  */
891 static int
892 port_send(port_t *pp, int source, int events, void *user)
893 {
894 	port_kevent_t	*pev;
895 	int		error;
896 
897 	error = port_alloc_event_local(pp, source, PORT_ALLOC_DEFAULT, &pev);
898 	if (error)
899 		return (error);
900 
901 	pev->portkev_object = 0;
902 	pev->portkev_events = events;
903 	pev->portkev_user = user;
904 	pev->portkev_callback = NULL;
905 	pev->portkev_arg = NULL;
906 	pev->portkev_flags = 0;
907 
908 	port_send_event(pev);
909 	return (0);
910 }
911 
912 /*
913  * The port_noshare() function returns 0 if the current event was generated
914  * by the same process. Otherwise is returns a value other than 0 and the
915  * event should not be delivered to the current processe.
916  * The port_noshare() function is normally used by the port_dispatch()
917  * function. The port_dispatch() function is project private and can only be
918  * used within the event port project.
919  * Currently the libaio uses the port_dispatch() function to deliver events
920  * of types PORT_SOURCE_AIO.
921  */
922 /* ARGSUSED */
923 static int
924 port_noshare(void *arg, int *events, pid_t pid, int flag, void *evp)
925 {
926 	if (flag == PORT_CALLBACK_DEFAULT && curproc->p_pid != pid)
927 		return (1);
928 	return (0);
929 }
930 
931 /*
932  * The port_dispatch_event() function is project private and it is used by
933  * libraries involved in the project to deliver events to the port.
934  * port_dispatch will sleep and wait for enough resources to satisfy the
935  * request, if necessary.
936  * The library can specify if the delivered event is shareable with other
937  * processes (see PORT_SYS_NOSHARE flag).
938  */
939 static int
940 port_dispatch_event(port_t *pp, int opcode, int source, int events,
941     uintptr_t object, void *user)
942 {
943 	port_kevent_t	*pev;
944 	int		error;
945 
946 	error = port_alloc_event_block(pp, source, PORT_ALLOC_DEFAULT, &pev);
947 	if (error)
948 		return (error);
949 
950 	pev->portkev_object = object;
951 	pev->portkev_events = events;
952 	pev->portkev_user = user;
953 	pev->portkev_arg = NULL;
954 	if (opcode & PORT_SYS_NOSHARE) {
955 		pev->portkev_flags = PORT_KEV_NOSHARE;
956 		pev->portkev_callback = port_noshare;
957 	} else {
958 		pev->portkev_flags = 0;
959 		pev->portkev_callback = NULL;
960 	}
961 
962 	port_send_event(pev);
963 	return (0);
964 }
965 
966 
967 /*
968  * The port_sendn() function is the kernel implementation of the event
969  * port API function port_sendn(3c).
970  * This function is able to send an event to a list of event ports.
971  */
972 static int
973 port_sendn(int ports[], int errors[], uint_t nent, int events, void *user,
974     uint_t *nget)
975 {
976 	port_kevent_t	*pev;
977 	int		errorcnt = 0;
978 	int		error = 0;
979 	int		count;
980 	int		port;
981 	int		*plist;
982 	int		*elist = NULL;
983 	file_t		*fp;
984 	port_t		*pp;
985 
986 	if (nent == 0 || nent > port_max_list)
987 		return (EINVAL);
988 
989 	plist = kmem_alloc(nent * sizeof (int), KM_SLEEP);
990 	if (copyin((void *)ports, plist, nent * sizeof (int))) {
991 		kmem_free(plist, nent * sizeof (int));
992 		return (EFAULT);
993 	}
994 
995 	/*
996 	 * Scan the list for event port file descriptors and send the
997 	 * attached user event data embedded in a event of type
998 	 * PORT_SOURCE_USER to every event port in the list.
999 	 * If a list entry is not a valid event port then the corresponding
1000 	 * error code will be stored in the errors[] list with the same
1001 	 * list offset as in the ports[] list.
1002 	 */
1003 
1004 	for (count = 0; count < nent; count++) {
1005 		port = plist[count];
1006 		if ((fp = getf(port)) == NULL) {
1007 			elist = port_errorn(elist, nent, EBADF, count);
1008 			errorcnt++;
1009 			continue;
1010 		}
1011 
1012 		pp = VTOEP(fp->f_vnode);
1013 		if (fp->f_vnode->v_type != VPORT) {
1014 			releasef(port);
1015 			elist = port_errorn(elist, nent, EBADFD, count);
1016 			errorcnt++;
1017 			continue;
1018 		}
1019 
1020 		error = port_alloc_event_local(pp, PORT_SOURCE_USER,
1021 		    PORT_ALLOC_DEFAULT, &pev);
1022 		if (error) {
1023 			releasef(port);
1024 			elist = port_errorn(elist, nent, error, count);
1025 			errorcnt++;
1026 			continue;
1027 		}
1028 
1029 		pev->portkev_object = 0;
1030 		pev->portkev_events = events;
1031 		pev->portkev_user = user;
1032 		pev->portkev_callback = NULL;
1033 		pev->portkev_arg = NULL;
1034 		pev->portkev_flags = 0;
1035 
1036 		port_send_event(pev);
1037 		releasef(port);
1038 	}
1039 	if (errorcnt) {
1040 		error = EIO;
1041 		if (copyout(elist, (void *)errors, nent * sizeof (int)))
1042 			error = EFAULT;
1043 		kmem_free(elist, nent * sizeof (int));
1044 	}
1045 	*nget = nent - errorcnt;
1046 	kmem_free(plist, nent * sizeof (int));
1047 	return (error);
1048 }
1049 
1050 static int *
1051 port_errorn(int *elist, int nent, int error, int index)
1052 {
1053 	if (elist == NULL)
1054 		elist = kmem_zalloc(nent * sizeof (int), KM_SLEEP);
1055 	elist[index] = error;
1056 	return (elist);
1057 }
1058 
1059 /*
1060  * port_alert()
1061  * The port_alert() funcion is a high priority event and it is always set
1062  * on top of the queue. It is also delivered as single event.
1063  * flags:
1064  *	- SET	:overwrite current alert data
1065  *	- UPDATE:set alert data or return EBUSY if alert mode is already set
1066  *
1067  * - set the ALERT flag
1068  * - wakeup all sleeping threads
1069  */
1070 static int
1071 port_alert(port_t *pp, int flags, int events, void *user)
1072 {
1073 	port_queue_t	*portq;
1074 	portget_t	*pgetp;
1075 	port_alert_t	*pa;
1076 
1077 	if ((flags & PORT_ALERT_INVALID) == PORT_ALERT_INVALID)
1078 		return (EINVAL);
1079 
1080 	portq = &pp->port_queue;
1081 	pa = &portq->portq_alert;
1082 	mutex_enter(&portq->portq_mutex);
1083 
1084 	/* check alert conditions */
1085 	if (flags == PORT_ALERT_UPDATE) {
1086 		if (portq->portq_flags & PORTQ_ALERT) {
1087 			mutex_exit(&portq->portq_mutex);
1088 			return (EBUSY);
1089 		}
1090 	}
1091 
1092 	/*
1093 	 * Store alert data in the port to be delivered to threads
1094 	 * which are using port_get(n) to retrieve events.
1095 	 */
1096 
1097 	portq->portq_flags |= PORTQ_ALERT;
1098 	pa->portal_events = events;		/* alert info */
1099 	pa->portal_pid = curproc->p_pid;	/* process owner */
1100 	pa->portal_object = 0;			/* no object */
1101 	pa->portal_user = user;			/* user alert data */
1102 
1103 	/* alert and deliver alert data to waiting threads */
1104 	pgetp = portq->portq_thread;
1105 	if (pgetp == NULL) {
1106 		/* no threads waiting for events */
1107 		mutex_exit(&portq->portq_mutex);
1108 		return (0);
1109 	}
1110 
1111 	/*
1112 	 * Set waiting threads in alert mode (PORTGET_ALERT)..
1113 	 * Every thread waiting for events already allocated a portget_t
1114 	 * structure to sleep on.
1115 	 * The port alert arguments are stored in the portget_t structure.
1116 	 * The PORTGET_ALERT flag is set to indicate the thread to return
1117 	 * immediately with the alert event.
1118 	 */
1119 	do {
1120 		if ((pgetp->portget_state & PORTGET_ALERT) == 0) {
1121 			pa = &pgetp->portget_alert;
1122 			pa->portal_events = events;
1123 			pa->portal_object = 0;
1124 			pa->portal_user = user;
1125 			pgetp->portget_state |= PORTGET_ALERT;
1126 			cv_signal(&pgetp->portget_cv);
1127 		}
1128 	} while ((pgetp = pgetp->portget_next) != portq->portq_thread);
1129 	mutex_exit(&portq->portq_mutex);
1130 	return (0);
1131 }
1132 
1133 /*
1134  * Clear alert state of the port
1135  */
1136 static void
1137 port_remove_alert(port_queue_t *portq)
1138 {
1139 	mutex_enter(&portq->portq_mutex);
1140 	portq->portq_flags &= ~PORTQ_ALERT;
1141 	mutex_exit(&portq->portq_mutex);
1142 }
1143 
1144 /*
1145  * The port_getn() function is used to retrieve events from a port.
1146  *
1147  * The port_getn() function returns immediately if there are enough events
1148  * available in the port to satisfy the request or if the port is in alert
1149  * mode (see port_alert(3c)).
1150  * The timeout argument of port_getn(3c) -which is embedded in the
1151  * port_gettimer_t structure- specifies if the system call should block or if it
1152  * should return immediately depending on the number of events available.
1153  * This function is internally used by port_getn(3c) as well as by
1154  * port_get(3c).
1155  */
1156 static int
1157 port_getn(port_t *pp, port_event_t *uevp, uint_t max, uint_t *nget,
1158     port_gettimer_t *pgt)
1159 {
1160 	port_queue_t	*portq;
1161 	port_kevent_t	*pev;
1162 	port_kevent_t	*lev;
1163 	int		error = 0;
1164 	uint_t		nmax;
1165 	uint_t		nevents;
1166 	uint_t		eventsz;
1167 	port_event_t	*kevp;
1168 	list_t		*glist;
1169 	uint_t		tnent;
1170 	int		rval;
1171 	int		blocking = -1;
1172 	int		timecheck;
1173 	int		flag;
1174 	timespec_t	rqtime;
1175 	timespec_t	*rqtp = NULL;
1176 	portget_t	*pgetp;
1177 	void		*results;
1178 	model_t		model = get_udatamodel();
1179 
1180 	flag = pgt->pgt_flags;
1181 
1182 	if (*nget > max && max > 0)
1183 		return (EINVAL);
1184 
1185 	portq = &pp->port_queue;
1186 	mutex_enter(&portq->portq_mutex);
1187 	if (max == 0) {
1188 		/*
1189 		 * Return number of objects with events.
1190 		 * The port_block() call is required to synchronize this
1191 		 * thread with another possible thread, which could be
1192 		 * retrieving events from the port queue.
1193 		 */
1194 		port_block(portq);
1195 		/*
1196 		 * Check if a second thread is currently retrieving events
1197 		 * and it is using the temporary event queue.
1198 		 */
1199 		if (portq->portq_tnent) {
1200 			/* put remaining events back to the port queue */
1201 			port_push_eventq(portq);
1202 		}
1203 		*nget = portq->portq_nent;
1204 		port_unblock(portq);
1205 		mutex_exit(&portq->portq_mutex);
1206 		return (0);
1207 	}
1208 
1209 	if (uevp == NULL) {
1210 		mutex_exit(&portq->portq_mutex);
1211 		return (EFAULT);
1212 	}
1213 	if (*nget == 0) {		/* no events required */
1214 		mutex_exit(&portq->portq_mutex);
1215 		return (0);
1216 	}
1217 
1218 	/* port is being closed ... */
1219 	if (portq->portq_flags & PORTQ_CLOSE) {
1220 		mutex_exit(&portq->portq_mutex);
1221 		return (EBADFD);
1222 	}
1223 
1224 	/* return immediately if port in alert mode */
1225 	if (portq->portq_flags & PORTQ_ALERT) {
1226 		error = port_get_alert(&portq->portq_alert, uevp);
1227 		if (error == 0)
1228 			*nget = 1;
1229 		mutex_exit(&portq->portq_mutex);
1230 		return (error);
1231 	}
1232 
1233 	portq->portq_thrcnt++;
1234 
1235 	/*
1236 	 * Now check if the completed events satisfy the
1237 	 * "wait" requirements of the current thread:
1238 	 */
1239 
1240 	if (pgt->pgt_loop) {
1241 		/*
1242 		 * loop entry of same thread
1243 		 * pgt_loop is set when the current thread returns
1244 		 * prematurely from this function. That could happen
1245 		 * when a port is being shared between processes and
1246 		 * this thread could not find events to return.
1247 		 * It is not allowed to a thread to retrieve non-shareable
1248 		 * events generated in other processes.
1249 		 * PORTQ_WAIT_EVENTS is set when a thread already
1250 		 * checked the current event queue and no new events
1251 		 * are added to the queue.
1252 		 */
1253 		if (((portq->portq_flags & PORTQ_WAIT_EVENTS) == 0) &&
1254 		    (portq->portq_nent >= *nget)) {
1255 			/* some new events arrived ...check them */
1256 			goto portnowait;
1257 		}
1258 		rqtp = pgt->pgt_rqtp;
1259 		timecheck = pgt->pgt_timecheck;
1260 		pgt->pgt_flags |= PORTGET_WAIT_EVENTS;
1261 	} else {
1262 		/* check if enough events are available ... */
1263 		if (portq->portq_nent >= *nget)
1264 			goto portnowait;
1265 		/*
1266 		 * There are not enough events available to satisfy
1267 		 * the request, check timeout value and wait for
1268 		 * incoming events.
1269 		 */
1270 		error = port_get_timeout(pgt->pgt_timeout, &rqtime, &rqtp,
1271 		    &blocking, flag);
1272 		if (error) {
1273 			port_check_return_cond(portq);
1274 			mutex_exit(&portq->portq_mutex);
1275 			return (error);
1276 		}
1277 
1278 		if (blocking == 0) /* don't block, check fired events */
1279 			goto portnowait;
1280 
1281 		if (rqtp != NULL) {
1282 			timespec_t	now;
1283 			timecheck = timechanged;
1284 			gethrestime(&now);
1285 			timespecadd(rqtp, &now);
1286 		}
1287 	}
1288 
1289 	/* enqueue thread in the list of waiting threads */
1290 	pgetp = port_queue_thread(portq, *nget);
1291 
1292 
1293 	/* Wait here until return conditions met */
1294 	for (;;) {
1295 		if (pgetp->portget_state & PORTGET_ALERT) {
1296 			/* reap alert event and return */
1297 			error = port_get_alert(&pgetp->portget_alert, uevp);
1298 			if (error)
1299 				*nget = 0;
1300 			else
1301 				*nget = 1;
1302 			port_dequeue_thread(&pp->port_queue, pgetp);
1303 			portq->portq_thrcnt--;
1304 			mutex_exit(&portq->portq_mutex);
1305 			return (error);
1306 		}
1307 
1308 		/*
1309 		 * Check if some other thread is already retrieving
1310 		 * events (portq_getn > 0).
1311 		 */
1312 
1313 		if ((portq->portq_getn  == 0) &&
1314 		    ((portq)->portq_nent >= *nget) &&
1315 		    (!((pgt)->pgt_flags & PORTGET_WAIT_EVENTS) ||
1316 		    !((portq)->portq_flags & PORTQ_WAIT_EVENTS)))
1317 			break;
1318 
1319 		if (portq->portq_flags & PORTQ_CLOSE) {
1320 			error = EBADFD;
1321 			break;
1322 		}
1323 
1324 		rval = cv_waituntil_sig(&pgetp->portget_cv, &portq->portq_mutex,
1325 		    rqtp, timecheck);
1326 
1327 		if (rval <= 0) {
1328 			error = (rval == 0) ? EINTR : ETIME;
1329 			break;
1330 		}
1331 	}
1332 
1333 	/* take thread out of the wait queue */
1334 	port_dequeue_thread(portq, pgetp);
1335 
1336 	if (error != 0 && (error == EINTR || error == EBADFD ||
1337 	    (error == ETIME && flag))) {
1338 		/* return without events */
1339 		port_check_return_cond(portq);
1340 		mutex_exit(&portq->portq_mutex);
1341 		return (error);
1342 	}
1343 
1344 portnowait:
1345 	/*
1346 	 * Move port event queue to a temporary event queue .
1347 	 * New incoming events will be continue be posted to the event queue
1348 	 * and they will not be considered by the current thread.
1349 	 * The idea is to avoid lock contentions or an often locking/unlocking
1350 	 * of the port queue mutex. The contention and performance degradation
1351 	 * could happen because:
1352 	 * a) incoming events use the port queue mutex to enqueue new events and
1353 	 * b) before the event can be delivered to the application it is
1354 	 *    necessary to notify the event sources about the event delivery.
1355 	 *    Sometimes the event sources can require a long time to return and
1356 	 *    the queue mutex would block incoming events.
1357 	 * During this time incoming events (port_send_event()) do not need
1358 	 * to awake threads waiting for events. Before the current thread
1359 	 * returns it will check the conditions to awake other waiting threads.
1360 	 */
1361 	portq->portq_getn++;	/* number of threads retrieving events */
1362 	port_block(portq);	/* block other threads here */
1363 	nmax = max < portq->portq_nent ? max : portq->portq_nent;
1364 
1365 	if (portq->portq_tnent) {
1366 		/*
1367 		 * Move remaining events from previous thread back to the
1368 		 * port event queue.
1369 		 */
1370 		port_push_eventq(portq);
1371 	}
1372 	/* move port event queue to a temporary queue */
1373 	list_move_tail(&portq->portq_get_list, &portq->portq_list);
1374 	glist = &portq->portq_get_list;	/* use temporary event queue */
1375 	tnent = portq->portq_nent;	/* get current number of events */
1376 	portq->portq_nent = 0;		/* no events in the port event queue */
1377 	portq->portq_flags |= PORTQ_WAIT_EVENTS; /* detect incoming events */
1378 	mutex_exit(&portq->portq_mutex);    /* event queue can be reused now */
1379 
1380 	if (model == DATAMODEL_NATIVE) {
1381 		eventsz = sizeof (port_event_t);
1382 		kevp = kmem_alloc(eventsz * nmax, KM_NOSLEEP);
1383 		if (kevp == NULL) {
1384 			if (nmax > pp->port_max_list)
1385 				nmax = pp->port_max_list;
1386 			kevp = kmem_alloc(eventsz * nmax, KM_SLEEP);
1387 		}
1388 		results = kevp;
1389 		lev = NULL;	/* start with first event in the queue */
1390 		for (nevents = 0; nevents < nmax; ) {
1391 			pev = port_get_kevent(glist, lev);
1392 			if (pev == NULL)	/* no more events available */
1393 				break;
1394 			if (pev->portkev_flags & PORT_KEV_FREE) {
1395 				/* Just discard event */
1396 				list_remove(glist, pev);
1397 				pev->portkev_flags &= ~(PORT_CLEANUP_DONE);
1398 				if (PORT_FREE_EVENT(pev))
1399 					port_free_event_local(pev, 0);
1400 				tnent--;
1401 				continue;
1402 			}
1403 
1404 			/* move event data to copyout list */
1405 			if (port_copy_event(&kevp[nevents], pev, glist)) {
1406 				/*
1407 				 * Event can not be delivered to the
1408 				 * current process.
1409 				 */
1410 				if (lev != NULL)
1411 					list_insert_after(glist, lev, pev);
1412 				else
1413 					list_insert_head(glist, pev);
1414 				lev = pev;  /* last checked event */
1415 			} else {
1416 				nevents++;	/* # of events ready */
1417 			}
1418 		}
1419 #ifdef	_SYSCALL32_IMPL
1420 	} else {
1421 		port_event32_t	*kevp32;
1422 
1423 		eventsz = sizeof (port_event32_t);
1424 		kevp32 = kmem_alloc(eventsz * nmax, KM_NOSLEEP);
1425 		if (kevp32 == NULL) {
1426 			if (nmax > pp->port_max_list)
1427 				nmax = pp->port_max_list;
1428 			kevp32 = kmem_alloc(eventsz * nmax, KM_SLEEP);
1429 		}
1430 		results = kevp32;
1431 		lev = NULL;	/* start with first event in the queue */
1432 		for (nevents = 0; nevents < nmax; ) {
1433 			pev = port_get_kevent(glist, lev);
1434 			if (pev == NULL)	/* no more events available */
1435 				break;
1436 			if (pev->portkev_flags & PORT_KEV_FREE) {
1437 				/* Just discard event */
1438 				list_remove(glist, pev);
1439 				pev->portkev_flags &= ~(PORT_CLEANUP_DONE);
1440 				if (PORT_FREE_EVENT(pev))
1441 					port_free_event_local(pev, 0);
1442 				tnent--;
1443 				continue;
1444 			}
1445 
1446 			/* move event data to copyout list */
1447 			if (port_copy_event32(&kevp32[nevents], pev, glist)) {
1448 				/*
1449 				 * Event can not be delivered to the
1450 				 * current process.
1451 				 */
1452 				if (lev != NULL)
1453 					list_insert_after(glist, lev, pev);
1454 				else
1455 					list_insert_head(glist, pev);
1456 				lev = pev;  /* last checked event */
1457 			} else {
1458 				nevents++;	/* # of events ready */
1459 			}
1460 		}
1461 #endif	/* _SYSCALL32_IMPL */
1462 	}
1463 
1464 	/*
1465 	 *  Remember number of remaining events in the temporary event queue.
1466 	 */
1467 	portq->portq_tnent = tnent - nevents;
1468 
1469 	/*
1470 	 * Work to do before return :
1471 	 * - push list of remaining events back to the top of the standard
1472 	 *   port queue.
1473 	 * - if this is the last thread calling port_get(n) then wakeup the
1474 	 *   thread waiting on close(2).
1475 	 * - check for a deferred cv_signal from port_send_event() and wakeup
1476 	 *   the sleeping thread.
1477 	 */
1478 
1479 	mutex_enter(&portq->portq_mutex);
1480 	port_unblock(portq);
1481 	if (portq->portq_tnent) {
1482 		/*
1483 		 * move remaining events in the temporary event queue back
1484 		 * to the port event queue
1485 		 */
1486 		port_push_eventq(portq);
1487 	}
1488 	portq->portq_getn--;	/* update # of threads retrieving events */
1489 	if (--portq->portq_thrcnt == 0) { /* # of threads waiting ... */
1490 		/* Last thread => check close(2) conditions ... */
1491 		if (portq->portq_flags & PORTQ_CLOSE) {
1492 			cv_signal(&portq->portq_closecv);
1493 			mutex_exit(&portq->portq_mutex);
1494 			kmem_free(results, eventsz * nmax);
1495 			/* do not copyout events */
1496 			*nget = 0;
1497 			return (EBADFD);
1498 		}
1499 	} else if (portq->portq_getn == 0) {
1500 		/*
1501 		 * no other threads retrieving events ...
1502 		 * check wakeup conditions of sleeping threads
1503 		 */
1504 		if ((portq->portq_thread != NULL) &&
1505 		    (portq->portq_nent >= portq->portq_nget))
1506 			cv_signal(&portq->portq_thread->portget_cv);
1507 	}
1508 
1509 	/*
1510 	 * Check PORTQ_POLLIN here because the current thread set temporarily
1511 	 * the number of events in the queue to zero.
1512 	 */
1513 	if (portq->portq_flags & PORTQ_POLLIN) {
1514 		portq->portq_flags &= ~PORTQ_POLLIN;
1515 		mutex_exit(&portq->portq_mutex);
1516 		pollwakeup(&pp->port_pollhd, POLLIN);
1517 	} else {
1518 		mutex_exit(&portq->portq_mutex);
1519 	}
1520 
1521 	/* now copyout list of user event structures to user space */
1522 	if (nevents) {
1523 		if (copyout(results, uevp, nevents * eventsz))
1524 			error = EFAULT;
1525 	}
1526 	kmem_free(results, eventsz * nmax);
1527 
1528 	if (nevents == 0 && error == 0 && pgt->pgt_loop == 0 && blocking != 0) {
1529 		/* no events retrieved: check loop conditions */
1530 		if (blocking == -1) {
1531 			/* no timeout checked */
1532 			error = port_get_timeout(pgt->pgt_timeout,
1533 			    &pgt->pgt_rqtime, &rqtp, &blocking, flag);
1534 			if (error) {
1535 				*nget = nevents;
1536 				return (error);
1537 			}
1538 			if (rqtp != NULL) {
1539 				timespec_t	now;
1540 				pgt->pgt_timecheck = timechanged;
1541 				gethrestime(&now);
1542 				timespecadd(&pgt->pgt_rqtime, &now);
1543 			}
1544 			pgt->pgt_rqtp = rqtp;
1545 		} else {
1546 			/* timeout already checked -> remember values */
1547 			pgt->pgt_rqtp = rqtp;
1548 			if (rqtp != NULL) {
1549 				pgt->pgt_timecheck = timecheck;
1550 				pgt->pgt_rqtime = *rqtp;
1551 			}
1552 		}
1553 		if (blocking)
1554 			/* timeout remaining */
1555 			pgt->pgt_loop = 1;
1556 	}
1557 
1558 	/* set number of user event structures completed */
1559 	*nget = nevents;
1560 	return (error);
1561 }
1562 
1563 /*
1564  * 1. copy kernel event structure to user event structure.
1565  * 2. PORT_KEV_WIRED event structures will be reused by the "source"
1566  * 3. Remove PORT_KEV_DONEQ flag (event removed from the event queue)
1567  * 4. Other types of event structures can be delivered back to the port cache
1568  *    (port_free_event_local()).
1569  * 5. The event source callback function is the last opportunity for the
1570  *    event source to update events, to free local resources associated with
1571  *    the event or to deny the delivery of the event.
1572  */
1573 static int
1574 port_copy_event(port_event_t *puevp, port_kevent_t *pkevp, list_t *list)
1575 {
1576 	int	free_event = 0;
1577 	int	flags;
1578 	int	error;
1579 
1580 	puevp->portev_source = pkevp->portkev_source;
1581 	puevp->portev_object = pkevp->portkev_object;
1582 	puevp->portev_user = pkevp->portkev_user;
1583 	puevp->portev_events = pkevp->portkev_events;
1584 
1585 	/* remove event from the queue */
1586 	list_remove(list, pkevp);
1587 
1588 	/*
1589 	 * Events of type PORT_KEV_WIRED remain allocated by the
1590 	 * event source.
1591 	 */
1592 	flags = pkevp->portkev_flags;
1593 	if (pkevp->portkev_flags & PORT_KEV_WIRED)
1594 		pkevp->portkev_flags &= ~PORT_KEV_DONEQ;
1595 	else
1596 		free_event = 1;
1597 
1598 	if (pkevp->portkev_callback) {
1599 		error = (*pkevp->portkev_callback)(pkevp->portkev_arg,
1600 		    &puevp->portev_events, pkevp->portkev_pid,
1601 		    PORT_CALLBACK_DEFAULT, pkevp);
1602 
1603 		if (error) {
1604 			/*
1605 			 * Event can not be delivered.
1606 			 * Caller must reinsert the event into the queue.
1607 			 */
1608 			pkevp->portkev_flags = flags;
1609 			return (error);
1610 		}
1611 	}
1612 	if (free_event)
1613 		port_free_event_local(pkevp, 0);
1614 	return (0);
1615 }
1616 
#ifdef	_SYSCALL32_IMPL
/*
 * 32-bit variant of port_copy_event(): prepare one kernel event for
 * delivery to a 32-bit application.
 * 1. copy the kernel event structure to the 32-bit user event structure
 *    (object and user cookie are narrowed to 32 bit).
 * 2. remove the event from the event queue ("list").
 * 3. PORT_KEV_WIRED event structures remain allocated by the sub-system
 *    (source) and will be reused; only their PORT_KEV_DONEQ flag is
 *    removed.
 * 4. The event source callback function is the last opportunity for the
 *    event source to update events, to free local resources associated
 *    with the event or to deny the delivery of the event.
 * 5. Other types of event structures are delivered back to the port
 *    cache (port_free_event_local()).
 *
 * Returns 0 if the event can be delivered. If the callback denies the
 * delivery its return value is passed on; the event flags are restored
 * and the caller must reinsert the event into the queue.
 */
static int
port_copy_event32(port_event32_t *puevp, port_kevent_t *pkevp, list_t *list)
{
	int	saved_flags;
	int	wired;
	int	rc;

	puevp->portev_source = pkevp->portkev_source;
	puevp->portev_object = (daddr32_t)pkevp->portkev_object;
	puevp->portev_user = (caddr32_t)(uintptr_t)pkevp->portkev_user;
	puevp->portev_events = pkevp->portkev_events;

	/* remove event from the queue */
	list_remove(list, pkevp);

	/* remember the flags in case the delivery gets denied */
	saved_flags = pkevp->portkev_flags;
	wired = (saved_flags & PORT_KEV_WIRED) != 0;
	if (wired)
		pkevp->portkev_flags &= ~PORT_KEV_DONEQ;

	if (pkevp->portkev_callback != NULL) {
		rc = (*pkevp->portkev_callback)(pkevp->portkev_arg,
		    &puevp->portev_events, pkevp->portkev_pid,
		    PORT_CALLBACK_DEFAULT, pkevp);
		if (rc != 0) {
			/*
			 * Event can not be delivered.
			 * Caller must reinsert the event into the queue.
			 */
			pkevp->portkev_flags = saved_flags;
			return (rc);
		}
	}

	/* events not owned by the event source go back to the port cache */
	if (!wired)
		port_free_event_local(pkevp, 0);
	return (0);
}
#endif	/* _SYSCALL32_IMPL */
1672 
1673 /*
1674  * copyout alert event.
1675  */
1676 static int
1677 port_get_alert(port_alert_t *pa, port_event_t *uevp)
1678 {
1679 	model_t	model = get_udatamodel();
1680 
1681 	/* copyout alert event structures to user space */
1682 	if (model == DATAMODEL_NATIVE) {
1683 		port_event_t	uev;
1684 		uev.portev_source = PORT_SOURCE_ALERT;
1685 		uev.portev_object = pa->portal_object;
1686 		uev.portev_events = pa->portal_events;
1687 		uev.portev_user = pa->portal_user;
1688 		if (copyout(&uev, uevp, sizeof (port_event_t)))
1689 			return (EFAULT);
1690 #ifdef	_SYSCALL32_IMPL
1691 	} else {
1692 		port_event32_t	uev32;
1693 		uev32.portev_source = PORT_SOURCE_ALERT;
1694 		uev32.portev_object = (daddr32_t)pa->portal_object;
1695 		uev32.portev_events = pa->portal_events;
1696 		uev32.portev_user = (daddr32_t)(uintptr_t)pa->portal_user;
1697 		if (copyout(&uev32, uevp, sizeof (port_event32_t)))
1698 			return (EFAULT);
1699 #endif	/* _SYSCALL32_IMPL */
1700 	}
1701 	return (0);
1702 }
1703 
1704 /*
1705  * Check return conditions :
1706  * - pending port close(2)
1707  * - threads waiting for events
1708  */
1709 static void
1710 port_check_return_cond(port_queue_t *portq)
1711 {
1712 	ASSERT(MUTEX_HELD(&portq->portq_mutex));
1713 	portq->portq_thrcnt--;
1714 	if (portq->portq_flags & PORTQ_CLOSE) {
1715 		if (portq->portq_thrcnt == 0)
1716 			cv_signal(&portq->portq_closecv);
1717 		else
1718 			cv_signal(&portq->portq_thread->portget_cv);
1719 	}
1720 }
1721 
1722 /*
1723  * The port_get_kevent() function returns
1724  * - the event located at the head of the queue if 'last' pointer is NULL
1725  * - the next event after the event pointed by 'last'
1726  * The caller of this function is responsible for the integrity of the queue
1727  * in use:
1728  * - port_getn() is using a temporary queue protected with port_block().
1729  * - port_close_events() is working on the global event queue and protects
1730  *   the queue with portq->portq_mutex.
1731  */
1732 port_kevent_t *
1733 port_get_kevent(list_t *list, port_kevent_t *last)
1734 {
1735 	if (last == NULL)
1736 		return (list_head(list));
1737 	else
1738 		return (list_next(list, last));
1739 }
1740 
1741 /*
1742  * The port_get_timeout() function gets the timeout data from user space
1743  * and converts that info into a corresponding internal representation.
1744  * The kerneldata flag means that the timeout data is already loaded.
1745  */
1746 static int
1747 port_get_timeout(timespec_t *timeout, timespec_t *rqtime, timespec_t **rqtp,
1748     int *blocking, int kerneldata)
1749 {
1750 	model_t	model = get_udatamodel();
1751 
1752 	*rqtp = NULL;
1753 	if (timeout == NULL) {
1754 		*blocking = 1;
1755 		return (0);
1756 	}
1757 
1758 	if (kerneldata) {
1759 		*rqtime = *timeout;
1760 	} else {
1761 		if (model == DATAMODEL_NATIVE) {
1762 			if (copyin(timeout, rqtime, sizeof (*rqtime)))
1763 				return (EFAULT);
1764 #ifdef	_SYSCALL32_IMPL
1765 		} else {
1766 			timespec32_t	wait_time_32;
1767 			if (copyin(timeout, &wait_time_32,
1768 			    sizeof (wait_time_32)))
1769 				return (EFAULT);
1770 			TIMESPEC32_TO_TIMESPEC(rqtime, &wait_time_32);
1771 #endif  /* _SYSCALL32_IMPL */
1772 		}
1773 	}
1774 
1775 	if (rqtime->tv_sec == 0 && rqtime->tv_nsec == 0) {
1776 		*blocking = 0;
1777 		return (0);
1778 	}
1779 
1780 	if (rqtime->tv_sec < 0 ||
1781 	    rqtime->tv_nsec < 0 || rqtime->tv_nsec >= NANOSEC)
1782 		return (EINVAL);
1783 
1784 	*rqtp = rqtime;
1785 	*blocking = 1;
1786 	return (0);
1787 }
1788 
1789 /*
1790  * port_queue_thread()
1791  * Threads requiring more events than available will be put in a wait queue.
1792  * There is a "thread wait queue" per port.
1793  * Threads requiring less events get a higher priority than others and they
1794  * will be awoken first.
1795  */
1796 static portget_t *
1797 port_queue_thread(port_queue_t *portq, uint_t nget)
1798 {
1799 	portget_t	*pgetp;
1800 	portget_t	*ttp;
1801 	portget_t	*htp;
1802 
1803 	pgetp = kmem_zalloc(sizeof (portget_t), KM_SLEEP);
1804 	pgetp->portget_nget = nget;
1805 	pgetp->portget_pid = curproc->p_pid;
1806 	if (portq->portq_thread == NULL) {
1807 		/* first waiting thread */
1808 		portq->portq_thread = pgetp;
1809 		portq->portq_nget = nget;
1810 		pgetp->portget_prev = pgetp;
1811 		pgetp->portget_next = pgetp;
1812 		return (pgetp);
1813 	}
1814 
1815 	/*
1816 	 * thread waiting for less events will be set on top of the queue.
1817 	 */
1818 	ttp = portq->portq_thread;
1819 	htp = ttp;
1820 	for (;;) {
1821 		if (nget <= ttp->portget_nget)
1822 			break;
1823 		if (htp == ttp->portget_next)
1824 			break;	/* last event */
1825 		ttp = ttp->portget_next;
1826 	}
1827 
1828 	/* add thread to the queue */
1829 	pgetp->portget_next = ttp;
1830 	pgetp->portget_prev = ttp->portget_prev;
1831 	ttp->portget_prev->portget_next = pgetp;
1832 	ttp->portget_prev = pgetp;
1833 	if (portq->portq_thread == ttp)
1834 		portq->portq_thread = pgetp;
1835 	portq->portq_nget = portq->portq_thread->portget_nget;
1836 	return (pgetp);
1837 }
1838 
1839 /*
1840  * Take thread out of the queue.
1841  */
1842 static void
1843 port_dequeue_thread(port_queue_t *portq, portget_t *pgetp)
1844 {
1845 	if (pgetp->portget_next == pgetp) {
1846 		/* last (single) waiting thread */
1847 		portq->portq_thread = NULL;
1848 		portq->portq_nget = 0;
1849 	} else {
1850 		pgetp->portget_prev->portget_next = pgetp->portget_next;
1851 		pgetp->portget_next->portget_prev = pgetp->portget_prev;
1852 		if (portq->portq_thread == pgetp)
1853 			portq->portq_thread = pgetp->portget_next;
1854 		portq->portq_nget = portq->portq_thread->portget_nget;
1855 	}
1856 	kmem_free(pgetp, sizeof (portget_t));
1857 }
1858 
1859 /*
1860  * Set up event port kstats.
1861  */
1862 static void
1863 port_kstat_init()
1864 {
1865 	kstat_t	*ksp;
1866 	uint_t	ndata;
1867 
1868 	ndata = sizeof (port_kstat) / sizeof (kstat_named_t);
1869 	ksp = kstat_create("portfs", 0, "Event Ports", "misc",
1870 	    KSTAT_TYPE_NAMED, ndata, KSTAT_FLAG_VIRTUAL);
1871 	if (ksp) {
1872 		ksp->ks_data = &port_kstat;
1873 		kstat_install(ksp);
1874 	}
1875 }
1876