xref: /illumos-gate/usr/src/uts/common/fs/portfs/port.c (revision a07094369b21309434206d9b3601d162693466fc)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/systm.h>
31 #include <sys/cred.h>
32 #include <sys/modctl.h>
33 #include <sys/vfs.h>
34 #include <sys/sysmacros.h>
35 #include <sys/cmn_err.h>
36 #include <sys/stat.h>
37 #include <sys/errno.h>
38 #include <sys/kmem.h>
39 #include <sys/file.h>
40 #include <sys/kstat.h>
41 #include <sys/port_impl.h>
42 #include <sys/task.h>
43 #include <sys/project.h>
44 
45 /*
46  * Event Ports can be shared across threads or across processes.
47  * Every thread/process can use its own event port or a group of them
48  * can use a single port. A major request was also to get the ability
49  * to submit user-defined events to a port. The idea of the
50  * user-defined events is to use the event ports for communication between
51  * threads/processes (like message queues). User defined-events are queued
52  * in a port with the same priority as other event types.
53  *
54  * Events are delivered only once. The thread/process which is waiting
55  * for events with the "highest priority" (priority here is related to the
56  * internal strategy to wakeup waiting threads) will retrieve the event,
57  * all other threads/processes will not be notified. There is also
58  * the requirement to have events which should be submitted immediately
59  * to all "waiting" threads. That is the main task of the alert event.
60  * The alert event is submitted by the application to a port. The port
61  * changes from a standard mode to the alert mode. Now all waiting threads
62  * will be awakened immediately and they will return with the alert event.
63  * Threads trying to retrieve events from a port in alert mode will
64  * return immediately with the alert event.
65  *
66  *
67  * An event port is like a kernel queue, which accepts events submitted from
68  * user level as well as events submitted from kernel sub-systems. Sub-systems
69  * able to submit events to a port are the so-called "event sources".
70  * Current event sources:
71  * PORT_SOURCE_AIO	 : events submitted per transaction completion from
72  *			   POSIX-I/O framework.
73  * PORT_SOURCE_TIMER	 : events submitted when a timer fires
74  *			   (see timer_create(3RT)).
75  * PORT_SOURCE_FD	 : events submitted per file descriptor (see poll(2)).
76  * PORT_SOURCE_ALERT	 : events submitted from user. This is not really a
77  *			   single event, this is actually a port mode
78  *			   (see port_alert(3c)).
79  * PORT_SOURCE_USER	 : events submitted by applications with
80  *			   port_send(3c) or port_sendn(3c).
81  *
82  * There is a user API implemented in the libc library as well as a
83  * kernel API implemented in port_subr.c in genunix.
84  * The available user API functions are:
85  * port_create() : create a port as a file descriptor of portfs file system
86  *		   The standard close(2) function closes a port.
87  * port_associate() : associate a file descriptor with a port to be able to
88  *		      retrieve events from that file descriptor.
89  * port_dissociate(): remove the association of a file descriptor with a port.
90  * port_alert()	 : set/unset a port in alert mode
91  * port_send()	 : send an event of type PORT_SOURCE_USER to a port
92  * port_sendn()	 : send an event of type PORT_SOURCE_USER to a list of ports
93  * port_get()	 : retrieve a single event from a port
94  * port_getn()	 : retrieve a list of events from a port
95  *
96  * The available kernel API functions are:
97  * port_alloc_event()   : allocate an event slot/structure from a port
98  * port_init_event()    : set event data in the event structure
99  * port_send_event()    : send event to a port
100  * port_free_event()    : deliver allocated slot/structure back to a port
101  * port_associate_ksource(): associate a kernel event source with a port
102  * port_dissociate_ksource(): dissociate a kernel event source from a port
103  *
104  * The libc implementation consists of small functions which pass the
105  * arguments to the kernel using the "portfs" system call. It means, all the
106  * synchronisation work is being done in the kernel. The "portfs" system
107  * call loads the portfs file system into the kernel.
108  *
109  * PORT CREATION
110  * The first function to be used is port_create() which internally creates
111  * a vnode and a portfs node. The portfs node is represented by the port_t
112  * structure, which again includes all the data necessary to control a port.
113  * port_create() returns a file descriptor, which needs to be used in almost
114  * all other event port functions.
115  * The maximum number of ports per system is controlled by the resource
116  * control: project.port-max-ids.
117  *
118  * EVENT GENERATION
119  * The second step is the triggering of events, which could be sent to a port.
120  * Every event source implements its own method to generate events for a port:
121  * PORT_SOURCE_AIO:
122  * 	The sigevent structure of the standard POSIX-IO functions
123  * 	was extended by an additional notification type.
124  * 	Standard notification types:
125  * 	SIGEV_NONE, SIGEV_SIGNAL and SIGEV_THREAD
126  * 	Event ports introduced now SIGEV_PORT.
127  * 	The notification type SIGEV_PORT specifies that a structure
128  * 	of type port_notify_t has to be attached to the sigev_value.
129  * 	The port_notify_t structure contains the event port file
130  * 	descriptor and a user-defined pointer.
131  * 	Internally the AIO implementation will use the kernel API
132  * 	functions to allocate an event port slot per transaction (aiocb)
133  * 	and send the event to the port as soon as the transaction completes.
134  * 	All the events submitted per transaction are of type
135  * 	PORT_SOURCE_AIO.
136  * PORT_SOURCE_TIMER:
137  * 	The timer_create() function uses the same method as the
138  * 	PORT_SOURCE_AIO event source. It also uses the sigevent structure
139  * 	to deliver the port information.
140  * 	Internally the timer code will allocate a single event slot/struct
141  * 	per timer and it will send the timer event as soon as the timer
142  * 	fires. If the timer-fired event is not delivered to the application
143  * 	before the next period elapsed, then an overrun counter will be
144  * 	incremented. The timer event source uses a callback function to
145  * 	detect the delivery of the event to the application. At that time
146  * 	the timer callback function will update the event overrun counter.
147  * PORT_SOURCE_FD:
148  * 	This event source uses the port_associate() function to allocate
149  * 	an event slot/struct from a port. The application defines in the
150  * 	events argument of port_associate() the type of events which it is
151  * 	interested in.
152  * 	The internal pollwakeup() function is used by all the file
153  * 	systems --which are supporting the VOP_POLL() interface- to notify
154  * 	the upper layer (poll(2), devpoll(7d) and now event ports) about
155  * 	the event triggered (see valid events in poll(2)).
156  * 	The pollwakeup() function forwards the event to the layer registered
157  * 	to receive the current event.
158  * 	The port_dissociate() function can be used to free the allocated
159  * 	event slot from the port. Anyway, file descriptors deliver events
160  * 	only one time and remain deactivated until the application
161  * 	reactivates the association of a file descriptor with port_associate().
162  * 	If an associated file descriptor is closed then the file descriptor
163  * 	will be dissociated automatically from the port.
164  *
165  * PORT_SOURCE_ALERT:
166  * 	This event type is generated when the port was previously set in
167  * 	alert mode using the port_alert() function.
168  * 	A single alert event is delivered to every thread which tries to
169  * 	retrieve events from a port.
170  * PORT_SOURCE_USER:
171  * 	This type of event is generated from user level using the port_send()
172  * 	function to send a user event to a port or the port_sendn() function
173  * 	to send an event to a list of ports.
174  *
175  * EVENT DELIVERY / RETRIEVING EVENTS
176  * Events remain in the port queue until:
177  * - the application uses port_get() or port_getn() to retrieve events,
178  * - the event source cancels the event,
179  * - the event port is closed or
180  * - the process exits.
181  * The maximal number of events in a port queue is the maximal number
182  * of event slots/structures which can be allocated by event sources.
183  * The allocation of event slots/structures is controlled by the resource
184  * control: process.port-max-events.
185  * The port_get() function retrieves a single event and the port_getn()
186  * function retrieves a list of events.
187  * Events are classified as shareable and non-shareable events across processes.
188  * Non-shareable events are invisible for the port_get(n)() functions of
189  * processes other than the owner of the event.
190  *    Shareable event types are:
191  *    PORT_SOURCE_USER events
192  * 	This type of event is unconditionally shareable and without
193  * 	limitations. If the parent process sends a user event and closes
194  * 	the port afterwards, the event remains in the port and the child
195  * 	process will still be able to retrieve the user event.
196  *    PORT_SOURCE_ALERT events
197  * 	This type of event is shareable between processes.
198  * 	Limitation:	The alert mode of the port is removed if the owner
199  * 			(process which set the port in alert mode) of the
200  * 			alert event closes the port.
201  *    PORT_SOURCE_FD events
202  * 	This type of event is conditionally shareable between processes.
203  * 	After fork(2) all forked file descriptors are shareable between
204  * 	the processes. The child process is allowed to retrieve events
205  * 	from the associated file descriptors and it can also re-associate
206  * 	the fd with the port.
207  * 	Limitations:	The child process is not allowed to dissociate
208  * 			the file descriptor from the port. Only the
209  * 			owner (process) of the association is allowed to
210  * 			dissociate the file descriptor from the port.
211  * 			If the owner of the association closes the port
212  * 			the association will be removed.
213  *    PORT_SOURCE_AIO events
214  * 	This type of event is not shareable between processes.
215  *    PORT_SOURCE_TIMER events
216  * 	This type of event is not shareable between processes.
217  *
218  * FORK BEHAVIOUR
219  * On fork(2) the child process inherits all opened file descriptors from
220  * the parent process. This is also valid for port file descriptors.
221  * Associated file descriptors with a port maintain the association across the
222  * fork(2). It means, the child process gets full access to the port and
223  * it can retrieve events from all common associated file descriptors.
224  * Events of file descriptors created and associated with a port after the
225  * fork(2) are non-shareable and can only be retrieved by the same process.
226  *
227  * If the parent or the child process closes an exported port (using fork(2)
228  * or I_SENDFD) all the file descriptors associated with the port by the
229  * process will be dissociated from the port. Events of dissociated file
230  * descriptors as well as all non-shareable events will be discarded.
231  * The other process can continue working with the port as usual.
232  *
233  * CLOSING A PORT
234  * close(2) has to be used to close a port. See FORK BEHAVIOUR for details.
235  *
236  * PORT EVENT STRUCTURES
237  * The global control structure of the event ports framework is port_control_t.
238  * port_control_t keeps track of the number of created ports in the system.
239  * The cache of the port event structures is also located in port_control_t.
240  *
241  * On port_create() the vnode and the portfs node are also created.
242  * The portfs node is represented by the port_t structure.
243  * The port_t structure manages all port specific tasks:
244  * - management of resource control values
245  * - port VOP_POLL interface
246  * - creation time
247  * - uid and gid of the port
248  *
249  * The port_t structure contains the port_queue_t structure.
250  * The port_queue_t structure contains all the data necessary for the
251  * queue management:
252  * - locking
253  * - condition variables
254  * - event counters
255  * - submitted events	(represented by port_kevent_t structures)
256  * - threads waiting for event delivery (check portget_t structure)
257  * - PORT_SOURCE_FD cache	(managed by the port_fdcache_t structure)
258  * - event source management (managed by the port_source_t structure)
259  * - alert mode management	(check port_alert_t structure)
260  *
261  * EVENT MANAGEMENT
262  * The event port file system creates a kmem_cache for internal allocation of
263  * event port structures.
264  *
265  * 1. Event source association with a port:
266  * The first step to do for event sources is to get associated with a port
267  * using the port_associate_ksource() function or adding an entry to the
268  * port_ksource_tab[]. An event source can get dissociated from a port
269  * using the port_dissociate_ksource() function. An entry in the
270  * port_ksource_tab[] implies that the source will be associated
271  * automatically with every new created port.
272  * The event source can deliver a callback function, which is used by the
273  * port to notify the event source about close(2). The idea is that
274  * in such a case the event source should free all allocated resources
275  * and it must return to the port all allocated slots/structures.
276  * The port_close() function will wait until all allocated event
277  * structures/slots are returned to the port.
278  * The callback function is not necessary when the event source does not
279  * maintain local resources, a second condition is that the event source
280  * can guarantee that allocated event slots will be returned without
281  * delay to the port (it will not block and sleep somewhere).
282  *
283  * 2. Reservation of an event slot / event structure
284  * The event port reliability is based on the reservation of an event "slot"
285  * (allocation of an event structure) by the event source as part of the
286  * application call. If the maximal number of event slots is exhausted then
287  * the event source can return a corresponding error code to the application.
288  *
289  * The port_alloc_event() function has to be used by event sources to
290  * allocate an event slot (reserve an event structure). The port_alloc_event()
291  * does not block and it will return a 0 value on success or an error code
292  * if it fails.
293  * An argument of port_alloc_event() is a flag which determines the behavior
294  * of the event after it was delivered to the application:
295  * PORT_ALLOC_DEFAULT	: event slot becomes free after delivery to the
296  *			  application.
297  * PORT_ALLOC_PRIVATE	: event slot remains under the control of the event
298  *			  source. This kind of slots can not be used for
299  *			  event delivery and should only be used internally
300  *			  by the event source.
301  * PORT_KEV_CACHED	: event slot remains under the control of an event
302  *			  port cache. It does not become free after delivery
303  *			  to the application.
304  * PORT_ALLOC_SCACHED	: event slot remains under the control of the event
305  *			  source. The event source takes the control over
306  *			  the slot after the event is delivered to the
307  *			  application.
308  *
309  * 3. Delivery of events to the event port
310  * Earlier allocated event structure/slot has to be used to deliver
311  * event data to the port. Event source has to use the function
312  * port_send_event(). The single argument is a pointer to the previously
313  * reserved event structure/slot.
314  * The portkev_events field of the port_kevent_t structure can be updated/set
315  * in two ways:
316  * 1. using the port_set_event() function, or
317  * 2. updating the portkev_events field out of the callback function:
318  *    The event source can deliver a callback function to the port as an
319  *    argument of port_init_event().
320  *    One of the arguments of the callback function is a pointer to the
321  *    events field, which will be delivered to the application.
322  *    (see Delivery of events to the application).
323  * Event structures/slots can be delivered to the event port only one time,
324  * they remain blocked until the data is delivered to the application and the
325  * slot becomes free or it is delivered back to the event source
326  * (PORT_ALLOC_SCACHED). The activation of the callback function mentioned above
327  * is at the same time the indicator for the event source that the event
328  * structure/slot is free for reuse.
329  *
330  * 4. Delivery of events to the application
331  * The events structures/slots delivered by event sources remain in the
332  * port queue until they are retrieved by the application or the port
333  * is closed (exit(2) also closes all opened file descriptors).
334  * The application uses port_get() or port_getn() to retrieve events from
335  * a port. port_get() retrieves a single event structure/slot and port_getn()
336  * retrieves a list of event structures/slots.
337  * Both functions are able to poll for events and return immediately or they
338  * can specify a timeout value.
339  * Before the events are delivered to the application they are moved to a
340  * second temporary internal queue. The idea is to avoid lock collisions or
341  * contentions of the global queue lock.
342  * The global queue lock is used every time when an event source delivers
343  * new events to the port.
344  * The port_get() and port_getn() functions
345  * a) retrieve single events from the temporary queue,
346  * b) prepare the data to be passed to the application memory,
347  * c) activate the callback function of the event sources:
348  *    - to get the latest event data,
349  *    - the event source can free all allocated resources associated with the
350  *      current event,
351  *    - the event source can re-use the current event slot/structure
352  *    - the event source can deny the delivery of the event to the application
353  *      (e.g. because of the wrong process).
354  * d) put the event back to the temporary queue if the event delivery was denied
355  * e) repeat a) until d) as long as there are events in the queue and
356  *    there is enough user space available.
357  *
358  * The loop described above could hold the global mutex for a very long time.
359  * To avoid that, a second mutex was introduced to synchronize concurrent
360  * threads accessing the temporary queue.
361  */
362 
/*
 * Forward declaration of the native system call handler. It is needed
 * because port_sysent refers to portfs() before the function is defined.
 */
363 static int64_t portfs(int, uintptr_t, uintptr_t, uintptr_t, uintptr_t,
364     uintptr_t);
365 
/*
 * System call table entry for the native portfs system call; portfs() is
 * the handler, cast to the generic entry point type.
 * NOTE(review): the leading 6 is presumably the argument count, matching the
 * six parameters of portfs() -- confirm against struct sysent.
 */
366 static struct sysent port_sysent = {
367 	6,
368 	SE_ARGC | SE_64RVAL | SE_NOUNLOAD,
369 	(int (*)())portfs,
370 };
371 
/*
 * Module linkage element that ties port_sysent to the mod_syscallops
 * operations under the description "event ports". It is listed in
 * modlinkage.
 */
372 static struct modlsys modlsys = {
373 	&mod_syscallops, "event ports", &port_sysent
374 };
375 
376 #ifdef _SYSCALL32_IMPL
377 
/*
 * Forward declaration of the system call wrapper used for 32-bit programs;
 * port_sysent32 refers to it. Only present when _SYSCALL32_IMPL is defined.
 */
378 static int64_t
379 portfs32(uint32_t arg1, int32_t arg2, uint32_t arg3, uint32_t arg4,
380     uint32_t arg5, uint32_t arg6);
381 
/*
 * System call table entry for 32-bit callers. It carries the same values as
 * port_sysent but uses portfs32() as the handler.
 */
382 static struct sysent port_sysent32 = {
383 	6,
384 	SE_ARGC | SE_64RVAL | SE_NOUNLOAD,
385 	(int (*)())portfs32,
386 };
387 
/*
 * Module linkage element for the 32-bit system call entry: it ties
 * port_sysent32 to the mod_syscallops32 operations. It is listed in
 * modlinkage when _SYSCALL32_IMPL is defined.
 */
388 static struct modlsys modlsys32 = {
389 	&mod_syscallops32,
390 	"32-bit event ports syscalls",
391 	&port_sysent32
392 };
393 #endif	/* _SYSCALL32_IMPL */
394 
/*
 * Module linkage handed to mod_install() in _init() and to mod_info() in
 * _info(). It lists the native system call element and, when
 * _SYSCALL32_IMPL is defined, the 32-bit one; the list ends with NULL.
 */
395 static struct modlinkage modlinkage = {
396 	MODREV_1,
397 	&modlsys,
398 #ifdef _SYSCALL32_IMPL
399 	&modlsys32,
400 #endif
401 	NULL
402 };
403 
/*
 * Named kstat data of the event ports framework. It holds a single 32-bit
 * counter called "ports", which port_create() increments for every port it
 * creates.
 */
404 port_kstat_t port_kstat = {
405 	{ "ports",	KSTAT_DATA_UINT32 }
406 };
407 
/*
 * Global portfs state, all set up in _init():
 * portdev	 - device number built from the major returned by getudev()
 * port_vnodeops - vnode operations created by vn_make_ops(); installed on
 *		   every port vnode in port_create()
 * port_vfs	 - dummy vfs; used as v_vfsp of every port vnode
 */
408 dev_t	portdev;
409 struct	vnodeops *port_vnodeops;
410 struct	vfs port_vfs;
411 
/*
 * Symbols defined outside of this file:
 * rc_process_portev  - resource control handle read in port_create() to get
 *			the enforced process.port-max-events value
 * rc_project_portids - resource control handle tested in port_create()
 *			against the number of existing ports
 * aio_close_port     - function entered into port_ksource_tab[] for
 *			PORT_SOURCE_AIO
 */
412 extern	rctl_hndl_t rc_process_portev;
413 extern	rctl_hndl_t rc_project_portids;
414 extern	void aio_close_port(void *, int, pid_t, int);
415 
416 /*
417  * This table contains a list of event sources which need a static
418  * association with a port (every port).
419  * The last NULL entry in the table is required to detect "end of table".
420  */
/*
 * port_init() walks this table and hands every entry to
 * port_add_ksource_local() until it finds an entry whose pks_source is 0.
 * NOTE(review): aio_close_port is presumably the close(2) notification
 * callback of the AIO source -- confirm against struct port_ksource.
 */
421 struct port_ksource port_ksource_tab[] = {
422 	{PORT_SOURCE_AIO, aio_close_port, NULL, NULL},
423 	{0, NULL, NULL, NULL}
424 };
425 
426 /* local functions */
/*
 * Prototypes of the file-local helpers.
 * port_getn(), port_sendn(), port_alert(), port_dispatch_event(),
 * port_send(), port_create() and port_remove_alert() are called directly
 * from portfs(). port_init() is called from port_create(),
 * port_add_ksource_local() from port_init() and port_kstat_init() from
 * _init().
 */
427 static int port_getn(port_t *, port_event_t *, uint_t, uint_t *,
428     port_gettimer_t *);
429 static int port_sendn(int [], int [], uint_t, int, void *, uint_t *);
430 static int port_alert(port_t *, int, int, void *);
431 static int port_dispatch_event(port_t *, int, int, int, uintptr_t, void *);
432 static int port_send(port_t *, int, int, void *);
433 static int port_create(int *);
434 static int port_get_alert(port_alert_t *, port_event_t *);
435 static int port_copy_event(port_event_t *, port_kevent_t *, list_t *);
436 static int *port_errorn(int *, int, int, int);
437 static int port_noshare(void *, int *, pid_t, int, void *);
438 static int port_get_timeout(timespec_t *, timespec_t *, timespec_t **, int *,
439     int);
440 static void port_init(port_t *);
441 static void port_remove_alert(port_queue_t *);
442 static void port_add_ksource_local(port_t *, port_ksource_t *);
443 static void port_check_return_cond(port_queue_t *);
444 static void port_dequeue_thread(port_queue_t *, portget_t *);
445 static portget_t *port_queue_thread(port_queue_t *, uint_t);
446 static void port_kstat_init(void);
447 
/*
 * Counterpart of port_copy_event() that takes a port_event32_t. It is only
 * declared when 32-bit system call support (_SYSCALL32_IMPL) is compiled in.
 */
448 #ifdef	_SYSCALL32_IMPL
449 static int port_copy_event32(port_event32_t *, port_kevent_t *, list_t *);
450 #endif
451 
452 int
453 _init(void)
454 {
455 	static const fs_operation_def_t port_vfsops_template[] = {
456 		NULL, NULL
457 	};
458 	extern const	fs_operation_def_t port_vnodeops_template[];
459 	vfsops_t	*port_vfsops;
460 	int		error;
461 	major_t 	major;
462 
463 	if ((major = getudev()) == (major_t)-1)
464 		return (ENXIO);
465 	portdev = makedevice(major, 0);
466 
467 	/* Create a dummy vfs */
468 	error = vfs_makefsops(port_vfsops_template, &port_vfsops);
469 	if (error) {
470 		cmn_err(CE_WARN, "port init: bad vfs ops");
471 		return (error);
472 	}
473 	vfs_setops(&port_vfs, port_vfsops);
474 	port_vfs.vfs_flag = VFS_RDONLY;
475 	port_vfs.vfs_dev = portdev;
476 	vfs_make_fsid(&(port_vfs.vfs_fsid), portdev, 0);
477 
478 	error = vn_make_ops("portfs", port_vnodeops_template, &port_vnodeops);
479 	if (error) {
480 		vfs_freevfsops(port_vfsops);
481 		cmn_err(CE_WARN, "port init: bad vnode ops");
482 		return (error);
483 	}
484 
485 	mutex_init(&port_control.pc_mutex, NULL, MUTEX_DEFAULT, NULL);
486 	port_control.pc_nents = 0;	/* number of active ports */
487 
488 	/* create kmem_cache for port event structures */
489 	port_control.pc_cache = kmem_cache_create("port_cache",
490 	    sizeof (port_kevent_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
491 
492 	port_kstat_init();		/* init port kstats */
493 	return (mod_install(&modlinkage));
494 }
495 
496 int
497 _info(struct modinfo *modinfop)
498 {
499 	return (mod_info(&modlinkage, modinfop));
500 }
501 
/*
 * Entry point for port system calls issued by 32-bit programs.
 * The 32-bit arguments are widened and handed to portfs(); the result of
 * portfs() is returned unchanged.
 *
 * PORT_GET:	a2 and a3 (used by portfs() as timeout seconds and
 *		nanoseconds) are converted through int, i.e. sign-extended.
 * PORT_SENDN:	a0 is converted through uint32_t, i.e. zero-extended.
 * otherwise:	all arguments are passed on with their declared types.
 */
#ifdef _SYSCALL32_IMPL
static int64_t
portfs32(uint32_t opcode, int32_t a0, uint32_t a1, uint32_t a2, uint32_t a3,
    uint32_t a4)
{
	uint32_t	code = opcode & PORT_CODE_MASK;

	if (code == PORT_GET) {
		/* only the plain PORT_GET code is forwarded here */
		return (portfs(PORT_GET, a0, a1, (int)a2, (int)a3, a4));
	}

	if (code == PORT_SENDN) {
		return (portfs(opcode, (uint32_t)a0, a1, a2, a3, a4));
	}

	return (portfs(opcode, a0, a1, a2, a3, a4));
}
#endif	/* _SYSCALL32_IMPL */
526 
527 /*
528  * System entry point for port functions.
529  * a0 is a port file descriptor (except for PORT_SENDN and PORT_CREATE).
530  * The libc uses PORT_SYS_NOPORT in functions which do not deliver a
531  * port file descriptor as first argument.
532  */
533 static int64_t
534 portfs(int opcode, uintptr_t a0, uintptr_t a1, uintptr_t a2, uintptr_t a3,
535     uintptr_t a4)
536 {
537 	rval_t		r;
538 	port_t		*pp;
539 	int 		error = 0;
540 	uint_t		nget;
541 	file_t		*fp;
542 	port_gettimer_t	port_timer;
543 
544 	r.r_vals = 0;
545 	if (opcode & PORT_SYS_NOPORT) {
546 		opcode &= PORT_CODE_MASK;
547 		if (opcode == PORT_SENDN) {
548 			error = port_sendn((int *)a0, (int *)a1, (uint_t)a2,
549 			    (int)a3, (void *)a4, (uint_t *)&r.r_val1);
550 			if (error && (error != EIO))
551 				return ((int64_t)set_errno(error));
552 			return (r.r_vals);
553 		}
554 
555 		if (opcode == PORT_CREATE) {
556 			error = port_create(&r.r_val1);
557 			if (error)
558 				return ((int64_t)set_errno(error));
559 			return (r.r_vals);
560 		}
561 	}
562 
563 	/* opcodes using port as first argument (a0) */
564 
565 	if ((fp = getf((int)a0)) == NULL)
566 		return ((uintptr_t)set_errno(EBADF));
567 
568 	if (fp->f_vnode->v_type != VPORT) {
569 		releasef((int)a0);
570 		return ((uintptr_t)set_errno(EBADFD));
571 	}
572 
573 	pp = VTOEP(fp->f_vnode);
574 
575 	switch (opcode & PORT_CODE_MASK) {
576 	case	PORT_GET:
577 	{
578 		/* see PORT_GETN description */
579 		struct	timespec timeout;
580 
581 		port_timer.pgt_flags = PORTGET_ONE;
582 		port_timer.pgt_loop = 0;
583 		port_timer.pgt_rqtp = NULL;
584 		if (a4 != NULL) {
585 			port_timer.pgt_timeout = &timeout;
586 			timeout.tv_sec = (time_t)a2;
587 			timeout.tv_nsec = (long)a3;
588 		} else {
589 			port_timer.pgt_timeout = NULL;
590 		}
591 		do {
592 			nget = 1;
593 			error = port_getn(pp, (port_event_t *)a1, 1,
594 			    (uint_t *)&nget, &port_timer);
595 		} while (nget == 0 && error == 0 && port_timer.pgt_loop);
596 		break;
597 	}
598 	case	PORT_GETN:
599 	{
600 		/*
601 		 * port_getn() can only retrieve own or shareable events from
602 		 * other processes. The port_getn() function remains in the
603 		 * kernel until own or shareable events are available or the
604 		 * timeout elapses.
605 		 */
606 		port_timer.pgt_flags = 0;
607 		port_timer.pgt_loop = 0;
608 		port_timer.pgt_rqtp = NULL;
609 		port_timer.pgt_timeout = (struct timespec *)a4;
610 		do {
611 			nget = a3;
612 			error = port_getn(pp, (port_event_t *)a1, (uint_t)a2,
613 			    (uint_t *)&nget, &port_timer);
614 		} while (nget == 0 && error == 0 && port_timer.pgt_loop);
615 		r.r_val1 = nget;
616 		r.r_val2 = error;
617 		releasef((int)a0);
618 		if (error && error != ETIME)
619 			return ((int64_t)set_errno(error));
620 		return (r.r_vals);
621 	}
622 	case	PORT_ASSOCIATE:
623 	{
624 		/* currently only PORT_SOURCE_FD is implemented */
625 		if ((int)a1 != PORT_SOURCE_FD) {
626 			error = EINVAL;
627 			break;
628 		}
629 		error = port_associate_fd(pp, (int)a1, (uintptr_t)a2, (int)a3,
630 			    (void *)a4);
631 		break;
632 	}
633 	case	PORT_SEND:
634 	{
635 		/* user-defined events */
636 		error = port_send(pp, PORT_SOURCE_USER, (int)a1, (void *)a2);
637 		break;
638 	}
639 	case	PORT_DISPATCH:
640 	{
641 		/*
642 		 * library events, blocking
643 		 * Only events of type PORT_SOURCE_AIO are currently allowed.
644 		 */
645 		if ((int)a1 != PORT_SOURCE_AIO) {
646 			error = EINVAL;
647 			break;
648 		}
649 		error = port_dispatch_event(pp, (int)opcode, (int)a1, (int)a2,
650 		    (uintptr_t)a3, (void *)a4);
651 		break;
652 	}
653 	case	PORT_DISSOCIATE:
654 	{
655 		/* currently only PORT_SOURCE_FD is implemented */
656 		if ((int)a1 != PORT_SOURCE_FD) {
657 			error = EINVAL;
658 			break;
659 		}
660 		error = port_dissociate_fd(pp, (uintptr_t)a2);
661 		break;
662 	}
663 	case	PORT_ALERT:
664 	{
665 		if ((int)a2)	/* a2 = events */
666 			error = port_alert(pp, (int)a1, (int)a2, (void *)a3);
667 		else
668 			port_remove_alert(&pp->port_queue);
669 		break;
670 	}
671 	default:
672 		error = EINVAL;
673 		break;
674 	}
675 
676 	releasef((int)a0);
677 	if (error)
678 		return ((int64_t)set_errno(error));
679 	return (r.r_vals);
680 }
681 
682 /*
683  * System call to create a port.
684  *
685  * The port_create() function creates a vnode of type VPORT per port.
686  * The port control data is associated with the vnode as vnode private data.
687  * The port_create() function returns an event port file descriptor.
688  */
689 static int
690 port_create(int *fdp)
691 {
692 	port_t		*pp;
693 	vnode_t		*vp;
694 	struct file	*fp;
695 	proc_t		*p = curproc;
696 
697 	/* initialize vnode and port private data */
698 	pp = kmem_zalloc(sizeof (port_t), KM_SLEEP);
699 
700 	pp->port_vnode = vn_alloc(KM_SLEEP);
701 	vp = EPTOV(pp);
702 	vn_setops(vp, port_vnodeops);
703 	vp->v_type = VPORT;
704 	vp->v_vfsp = &port_vfs;
705 	vp->v_data = (caddr_t)pp;
706 
707 	mutex_enter(&port_control.pc_mutex);
708 	/*
709 	 * Retrieve the maximal number of event ports allowed per system from
710 	 * the resource control: project.port-max-ids.
711 	 */
712 	mutex_enter(&p->p_lock);
713 	if (rctl_test(rc_project_portids, p->p_task->tk_proj->kpj_rctls, p,
714 	    port_control.pc_nents + 1, RCA_SAFE) & RCT_DENY) {
715 		mutex_exit(&p->p_lock);
716 		vn_free(vp);
717 		kmem_free(pp, sizeof (port_t));
718 		mutex_exit(&port_control.pc_mutex);
719 		return (EAGAIN);
720 	}
721 
722 	/*
723 	 * Retrieve the maximal number of events allowed per port from
724 	 * the resource control: process.port-max-events.
725 	 */
726 	pp->port_max_events = rctl_enforced_value(rc_process_portev,
727 	    p->p_rctls, p);
728 	mutex_exit(&p->p_lock);
729 
730 	/* allocate a new user file descriptor and a file structure */
731 	if (falloc(vp, 0, &fp, fdp)) {
732 		/*
733 		 * If the file table is full, free allocated resources.
734 		 */
735 		vn_free(vp);
736 		kmem_free(pp, sizeof (port_t));
737 		mutex_exit(&port_control.pc_mutex);
738 		return (EMFILE);
739 	}
740 
741 	mutex_exit(&fp->f_tlock);
742 
743 	pp->port_fd = *fdp;
744 	port_control.pc_nents++;
745 	p->p_portcnt++;
746 	port_kstat.pks_ports.value.ui32++;
747 	mutex_exit(&port_control.pc_mutex);
748 
749 	/* initializes port private data */
750 	port_init(pp);
751 	/* set user file pointer */
752 	setf(*fdp, fp);
753 	return (0);
754 }
755 
756 /*
757  * port_init() initializes event port specific data
758  */
759 static void
760 port_init(port_t *pp)
761 {
762 	port_queue_t	*portq;
763 	port_ksource_t	*pks;
764 
765 	mutex_init(&pp->port_mutex, NULL, MUTEX_DEFAULT, NULL);
766 	portq = &pp->port_queue;
767 	mutex_init(&portq->portq_mutex, NULL, MUTEX_DEFAULT, NULL);
768 	pp->port_flags |= PORT_INIT;
769 
770 	/*
771 	 * If it is not enough memory available to satisfy a user
772 	 * request using a single port_getn() call then port_getn()
773 	 * will reduce the size of the list to PORT_MAX_LIST.
774 	 */
775 	pp->port_max_list = port_max_list;
776 
777 	/* Set timestamp entries required for fstat(2) requests */
778 	gethrestime(&pp->port_ctime);
779 	pp->port_uid = crgetuid(curproc->p_cred);
780 	pp->port_gid = crgetgid(curproc->p_cred);
781 
782 	/* initialize port queue structs */
783 	list_create(&portq->portq_list, sizeof (port_kevent_t),
784 	    offsetof(port_kevent_t, portkev_node));
785 	list_create(&portq->portq_get_list, sizeof (port_kevent_t),
786 	    offsetof(port_kevent_t, portkev_node));
787 	portq->portq_flags = 0;
788 	pp->port_pid = curproc->p_pid;
789 
790 	/* Allocate cache skeleton for PORT_SOURCE_FD events */
791 	portq->portq_pcp = kmem_zalloc(sizeof (port_fdcache_t), KM_SLEEP);
792 	mutex_init(&portq->portq_pcp->pc_lock, NULL, MUTEX_DEFAULT, NULL);
793 
794 	/*
795 	 * Allocate cache skeleton for association of event sources.
796 	 */
797 	mutex_init(&portq->portq_source_mutex, NULL, MUTEX_DEFAULT, NULL);
798 	portq->portq_scache = kmem_zalloc(
799 	    PORT_SCACHE_SIZE * sizeof (port_source_t *), KM_SLEEP);
800 
801 	/*
802 	 * pre-associate some kernel sources with this port.
803 	 * The pre-association is required to create port_source_t
804 	 * structures for object association.
805 	 * Some sources can not get associated with a port before the first
806 	 * object association is requested. Another reason to pre_associate
807 	 * a particular source with a port is because of performance.
808 	 */
809 
810 	for (pks = port_ksource_tab; pks->pks_source != 0; pks++)
811 		port_add_ksource_local(pp, pks);
812 }
813 
814 /*
815  * The port_add_ksource_local() function is being used to associate
816  * event sources with every new port.
817  * The event sources need to be added to port_ksource_tab[].
818  */
819 static void
820 port_add_ksource_local(port_t *pp, port_ksource_t *pks)
821 {
822 	port_source_t	*pse;
823 	port_source_t	**ps;
824 
825 	mutex_enter(&pp->port_queue.portq_source_mutex);
826 	ps = &pp->port_queue.portq_scache[PORT_SHASH(pks->pks_source)];
827 	for (pse = *ps; pse != NULL; pse = pse->portsrc_next) {
828 		if (pse->portsrc_source == pks->pks_source)
829 			break;
830 	}
831 
832 	if (pse == NULL) {
833 		/* associate new source with the port */
834 		pse = kmem_zalloc(sizeof (port_source_t), KM_SLEEP);
835 		pse->portsrc_source = pks->pks_source;
836 		pse->portsrc_close = pks->pks_close;
837 		pse->portsrc_closearg = pks->pks_closearg;
838 		pse->portsrc_cnt = 1;
839 
840 		pks->pks_portsrc = pse;
841 		if (*ps != NULL)
842 			pse->portsrc_next = (*ps)->portsrc_next;
843 		*ps = pse;
844 	}
845 	mutex_exit(&pp->port_queue.portq_source_mutex);
846 }
847 
848 /*
849  * The port_send() function sends an event of type "source" to a
850  * port. This function is non-blocking. An event can be sent to
851  * a port as long as the number of events per port does not achieve the
852  * maximal allowed number of events. The max. number of events per port is
853  * defined by the resource control process.max-port-events.
854  * This function is used by the port library function port_send()
855  * and port_dispatch(). The port_send(3c) function is part of the
856  * event ports API and submits events of type PORT_SOURCE_USER. The
857  * port_dispatch() function is project private and it is used by library
858  * functions to submit events of other types than PORT_SOURCE_USER
859  * (e.g. PORT_SOURCE_AIO).
860  */
861 static int
862 port_send(port_t *pp, int source, int events, void *user)
863 {
864 	port_kevent_t	*pev;
865 	int		error;
866 
867 	error = port_alloc_event_local(pp, source, PORT_ALLOC_DEFAULT, &pev);
868 	if (error)
869 		return (error);
870 
871 	pev->portkev_object = 0;
872 	pev->portkev_events = events;
873 	pev->portkev_user = user;
874 	pev->portkev_callback = NULL;
875 	pev->portkev_arg = NULL;
876 	pev->portkev_flags = 0;
877 
878 	error = port_send_event(pev);
879 	if (error) {
880 		port_free_event_local(pev, 0);
881 		return (error);
882 	}
883 	return (0);
884 }
885 
886 /*
887  * The port_noshare() function returns 0 if the current event was generated
888  * by the same process. Otherwise is returns a value other than 0 and the
889  * event should not be delivered to the current processe.
890  * The port_noshare() function is normally used by the port_dispatch()
891  * function. The port_dispatch() function is project private and can only be
892  * used within the event port project.
893  * Currently the libaio uses the port_dispatch() function to deliver events
894  * of types PORT_SOURCE_AIO.
895  */
896 /* ARGSUSED */
897 static int
898 port_noshare(void *arg, int *events, pid_t pid, int flag, void *evp)
899 {
900 	if (flag == PORT_CALLBACK_DEFAULT && curproc->p_pid != pid)
901 		return (1);
902 	return (0);
903 }
904 
905 /*
906  * The port_dispatch_event() function is project private and it is used by
907  * libraries involved in the project to deliver events to the port.
908  * port_dispatch will sleep and wait for enough resources to satisfy the
909  * request, if necessary.
910  * The library can specify if the delivered event is shareable with other
911  * processes (see PORT_SYS_NOSHARE flag).
912  */
913 static int
914 port_dispatch_event(port_t *pp, int opcode, int source, int events,
915     uintptr_t object, void *user)
916 {
917 	port_kevent_t	*pev;
918 	int		error;
919 
920 	error = port_alloc_event_block(pp, source, PORT_ALLOC_DEFAULT, &pev);
921 	if (error)
922 		return (error);
923 
924 	pev->portkev_object = object;
925 	pev->portkev_events = events;
926 	pev->portkev_user = user;
927 	pev->portkev_arg = NULL;
928 	if (opcode & PORT_SYS_NOSHARE) {
929 		pev->portkev_flags = PORT_KEV_NOSHARE;
930 		pev->portkev_callback = port_noshare;
931 	} else {
932 		pev->portkev_flags = 0;
933 		pev->portkev_callback = NULL;
934 	}
935 
936 	error = port_send_event(pev);
937 	if (error) {
938 		port_free_event_local(pev, 0);
939 		return (error);
940 	}
941 	return (0);
942 }
943 
944 
945 /*
946  * The port_sendn() function is the kernel implementation of the event
947  * port API function port_sendn(3c).
948  * This function is able to send an event to a list of event ports.
949  */
950 static int
951 port_sendn(int ports[], int errors[], uint_t nent, int events, void *user,
952     uint_t *nget)
953 {
954 	port_kevent_t	*pev;
955 	int		errorcnt = 0;
956 	int		error = 0;
957 	int		count;
958 	int		port;
959 	int		*plist;
960 	int		*elist = NULL;
961 	file_t		*fp;
962 	port_t		*pp;
963 
964 	if (nent == 0 || nent > port_max_list)
965 		return (EINVAL);
966 
967 	plist = kmem_alloc(nent * sizeof (int), KM_SLEEP);
968 	if (copyin((void *)ports, plist, nent * sizeof (int))) {
969 		kmem_free(plist, nent * sizeof (int));
970 		return (EFAULT);
971 	}
972 
973 	/*
974 	 * Scan the list for event port file descriptors and send the
975 	 * attached user event data embedded in a event of type
976 	 * PORT_SOURCE_USER to every event port in the list.
977 	 * If a list entry is not a valid event port then the corresponding
978 	 * error code will be stored in the errors[] list with the same
979 	 * list offset as in the ports[] list.
980 	 */
981 
982 	for (count = 0; count < nent; count++) {
983 		port = plist[count];
984 		if ((fp = getf(port)) == NULL) {
985 			elist = port_errorn(elist, nent, EBADF, count);
986 			errorcnt++;
987 			continue;
988 		}
989 
990 		pp = VTOEP(fp->f_vnode);
991 		if (fp->f_vnode->v_type != VPORT) {
992 			releasef(port);
993 			elist = port_errorn(elist, nent, EBADFD, count);
994 			errorcnt++;
995 			continue;
996 		}
997 
998 		error = port_alloc_event_local(pp, PORT_SOURCE_USER,
999 		    PORT_ALLOC_DEFAULT, &pev);
1000 		if (error) {
1001 			releasef(port);
1002 			elist = port_errorn(elist, nent, error, count);
1003 			errorcnt++;
1004 			continue;
1005 		}
1006 
1007 		pev->portkev_object = 0;
1008 		pev->portkev_events = events;
1009 		pev->portkev_user = user;
1010 		pev->portkev_callback = NULL;
1011 		pev->portkev_arg = NULL;
1012 		pev->portkev_flags = 0;
1013 
1014 		(void) port_send_event(pev);
1015 		releasef(port);
1016 	}
1017 	if (errorcnt) {
1018 		error = EIO;
1019 		if (copyout(elist, (void *)errors, nent * sizeof (int)))
1020 			error = EFAULT;
1021 		kmem_free(elist, nent * sizeof (int));
1022 	}
1023 	*nget = nent - errorcnt;
1024 	kmem_free(plist, nent * sizeof (int));
1025 	return (error);
1026 }
1027 
1028 static int *
1029 port_errorn(int *elist, int nent, int error, int index)
1030 {
1031 	if (elist == NULL)
1032 		elist = kmem_zalloc(nent * sizeof (int), KM_SLEEP);
1033 	elist[index] = error;
1034 	return (elist);
1035 }
1036 
1037 /*
1038  * port_alert()
1039  * The port_alert() funcion is a high priority event and it is always set
1040  * on top of the queue. It is also delivered as single event.
1041  * flags:
1042  *	- SET	:overwrite current alert data
1043  *	- UPDATE:set alert data or return EBUSY if alert mode is already set
1044  *
1045  * - set the ALERT flag
1046  * - wakeup all sleeping threads
1047  */
1048 static int
1049 port_alert(port_t *pp, int flags, int events, void *user)
1050 {
1051 	port_queue_t	*portq;
1052 	portget_t	*pgetp;
1053 	port_alert_t	*pa;
1054 
1055 	if ((flags & PORT_ALERT_INVALID) == PORT_ALERT_INVALID)
1056 		return (EINVAL);
1057 
1058 	portq = &pp->port_queue;
1059 	pa = &portq->portq_alert;
1060 	mutex_enter(&portq->portq_mutex);
1061 
1062 	/* check alert conditions */
1063 	if (flags == PORT_ALERT_UPDATE) {
1064 		if (portq->portq_flags & PORTQ_ALERT) {
1065 			mutex_exit(&portq->portq_mutex);
1066 			return (EBUSY);
1067 		}
1068 	}
1069 
1070 	/*
1071 	 * Store alert data in the port to be delivered to threads
1072 	 * which are using port_get(n) to retrieve events.
1073 	 */
1074 
1075 	portq->portq_flags |= PORTQ_ALERT;
1076 	pa->portal_events = events;		/* alert info */
1077 	pa->portal_pid = curproc->p_pid;	/* process owner */
1078 	pa->portal_object = 0;			/* no object */
1079 	pa->portal_user = user;			/* user alert data */
1080 
1081 	/* alert and deliver alert data to waiting threads */
1082 	pgetp = portq->portq_thread;
1083 	if (pgetp == NULL) {
1084 		/* no threads waiting for events */
1085 		mutex_exit(&portq->portq_mutex);
1086 		return (0);
1087 	}
1088 
1089 	/*
1090 	 * Set waiting threads in alert mode (PORTGET_ALERT)..
1091 	 * Every thread waiting for events already allocated a portget_t
1092 	 * structure to sleep on.
1093 	 * The port alert arguments are stored in the portget_t structure.
1094 	 * The PORTGET_ALERT flag is set to indicate the thread to return
1095 	 * immediately with the alert event.
1096 	 */
1097 	do {
1098 		if ((pgetp->portget_state & PORTGET_ALERT) == 0) {
1099 			pa = &pgetp->portget_alert;
1100 			pa->portal_events = events;
1101 			pa->portal_object = 0;
1102 			pa->portal_user = user;
1103 			pgetp->portget_state |= PORTGET_ALERT;
1104 			cv_signal(&pgetp->portget_cv);
1105 		}
1106 	} while ((pgetp = pgetp->portget_next) != portq->portq_thread);
1107 	mutex_exit(&portq->portq_mutex);
1108 	return (0);
1109 }
1110 
1111 /*
1112  * Clear alert state of the port
1113  */
1114 static void
1115 port_remove_alert(port_queue_t *portq)
1116 {
1117 	mutex_enter(&portq->portq_mutex);
1118 	portq->portq_flags &= ~PORTQ_ALERT;
1119 	mutex_exit(&portq->portq_mutex);
1120 }
1121 
1122 /*
1123  * The port_getn() function is used to retrieve events from a port.
1124  *
1125  * The port_getn() function returns immediately if there are enough events
1126  * available in the port to satisfy the request or if the port is in alert
1127  * mode (see port_alert(3c)).
1128  * The timeout argument of port_getn(3c) -which is embedded in the
1129  * port_gettimer_t structure- specifies if the system call should block or if it
1130  * should return immediately depending on the number of events available.
1131  * This function is internally used by port_getn(3c) as well as by
1132  * port_get(3c).
1133  */
1134 static int
1135 port_getn(port_t *pp, port_event_t *uevp, uint_t max, uint_t *nget,
1136     port_gettimer_t *pgt)
1137 {
1138 	port_queue_t	*portq;
1139 	port_kevent_t 	*pev;
1140 	port_kevent_t 	*lev;
1141 	int		error = 0;
1142 	uint_t		nmax;
1143 	uint_t		nevents;
1144 	uint_t		eventsz;
1145 	port_event_t	*kevp;
1146 	list_t		*glist;
1147 	uint_t		tnent;
1148 	int		rval;
1149 	int		blocking = -1;
1150 	int		timecheck;
1151 	int		flag;
1152 	timespec_t	rqtime;
1153 	timespec_t	*rqtp = NULL;
1154 	portget_t	*pgetp;
1155 	void		*results;
1156 	model_t		model = get_udatamodel();
1157 
1158 	flag = pgt->pgt_flags;
1159 
1160 	if (*nget > max && max > 0)
1161 		return (EINVAL);
1162 
1163 	portq = &pp->port_queue;
1164 	mutex_enter(&portq->portq_mutex);
1165 	if (max == 0) {
1166 		/*
1167 		 * Return number of objects with events
1168 		 * The portq_block_mutex is required to synchronize this
1169 		 * thread with another possible thread, which could be
1170 		 * retrieving events from the port queue.
1171 		 */
1172 		mutex_enter(&portq->portq_block_mutex);
1173 		/*
1174 		 * Check if a second thread is currently retrieving events
1175 		 * and it is using the temporary event queue.
1176 		 */
1177 		if (portq->portq_tnent) {
1178 			/* put remaining events back to the port queue */
1179 			port_push_eventq(portq);
1180 		}
1181 		*nget = portq->portq_nent;
1182 		mutex_exit(&portq->portq_block_mutex);
1183 		mutex_exit(&portq->portq_mutex);
1184 		return (0);
1185 	}
1186 
1187 	if (uevp == NULL) {
1188 		mutex_exit(&portq->portq_mutex);
1189 		return (EFAULT);
1190 	}
1191 	if (*nget == 0) {		/* no events required */
1192 		mutex_exit(&portq->portq_mutex);
1193 		return (0);
1194 	}
1195 
1196 	/* port is being closed ... */
1197 	if (portq->portq_flags & PORTQ_CLOSE) {
1198 		mutex_exit(&portq->portq_mutex);
1199 		return (EBADFD);
1200 	}
1201 
1202 	/* return immediately if port in alert mode */
1203 	if (portq->portq_flags & PORTQ_ALERT) {
1204 		error = port_get_alert(&portq->portq_alert, uevp);
1205 		if (error == 0)
1206 			*nget = 1;
1207 		mutex_exit(&portq->portq_mutex);
1208 		return (error);
1209 	}
1210 
1211 	portq->portq_thrcnt++;
1212 
1213 	/*
1214 	 * Now check if the completed events satisfy the
1215 	 * "wait" requirements of the current thread:
1216 	 */
1217 
1218 	if (pgt->pgt_loop) {
1219 		/*
1220 		 * loop entry of same thread
1221 		 * pgt_loop is set when the current thread returns
1222 		 * prematurely from this function. That could happen
1223 		 * when a port is being shared between processes and
1224 		 * this thread could not find events to return.
1225 		 * It is not allowed to a thread to retrieve non-shareable
1226 		 * events generated in other processes.
1227 		 * PORTQ_WAIT_EVENTS is set when a thread already
1228 		 * checked the current event queue and no new events
1229 		 * are added to the queue.
1230 		 */
1231 		if (((portq->portq_flags & PORTQ_WAIT_EVENTS) == 0) &&
1232 		    (portq->portq_nent >= *nget)) {
1233 			/* some new events arrived ...check them */
1234 			goto portnowait;
1235 		}
1236 		rqtp = pgt->pgt_rqtp;
1237 		timecheck = pgt->pgt_timecheck;
1238 		pgt->pgt_flags |= PORTGET_WAIT_EVENTS;
1239 	} else {
1240 		/* check if enough events are available ... */
1241 		if (portq->portq_nent >= *nget)
1242 			goto portnowait;
1243 		/*
1244 		 * There are not enough events available to satisfy
1245 		 * the request, check timeout value and wait for
1246 		 * incoming events.
1247 		 */
1248 		error = port_get_timeout(pgt->pgt_timeout, &rqtime, &rqtp,
1249 		    &blocking, flag);
1250 		if (error) {
1251 			port_check_return_cond(portq);
1252 			mutex_exit(&portq->portq_mutex);
1253 			return (error);
1254 		}
1255 
1256 		if (blocking == 0) /* don't block, check fired events */
1257 			goto portnowait;
1258 
1259 		if (rqtp != NULL) {
1260 			timespec_t	now;
1261 			timecheck = timechanged;
1262 			gethrestime(&now);
1263 			timespecadd(rqtp, &now);
1264 		}
1265 	}
1266 
1267 	/* enqueue thread in the list of waiting threads */
1268 	pgetp = port_queue_thread(portq, *nget);
1269 
1270 
1271 	/* Wait here until return conditions met */
1272 	for (;;) {
1273 		if (pgetp->portget_state & PORTGET_ALERT) {
1274 			/* reap alert event and return */
1275 			error = port_get_alert(&pgetp->portget_alert, uevp);
1276 			if (error)
1277 				*nget = 0;
1278 			else
1279 				*nget = 1;
1280 			port_dequeue_thread(&pp->port_queue, pgetp);
1281 			portq->portq_thrcnt--;
1282 			mutex_exit(&portq->portq_mutex);
1283 			return (error);
1284 		}
1285 
1286 		/*
1287 		 * Check if some other thread is already retrieving
1288 		 * events (portq_getn > 0).
1289 		 */
1290 
1291 		if ((portq->portq_getn  == 0) &&
1292 		    ((portq)->portq_nent >= *nget) &&
1293 		    (!((pgt)->pgt_flags & PORTGET_WAIT_EVENTS) ||
1294 		    !((portq)->portq_flags & PORTQ_WAIT_EVENTS)))
1295 			break;
1296 
1297 		if (portq->portq_flags & PORTQ_CLOSE) {
1298 			error = EBADFD;
1299 			break;
1300 		}
1301 
1302 		rval = cv_waituntil_sig(&pgetp->portget_cv, &portq->portq_mutex,
1303 		    rqtp, timecheck);
1304 
1305 		if (rval <= 0) {
1306 			error = (rval == 0) ? EINTR : ETIME;
1307 			break;
1308 		}
1309 	}
1310 
1311 	/* take thread out of the wait queue */
1312 	port_dequeue_thread(portq, pgetp);
1313 
1314 	if (error != 0 && (error == EINTR || error == EBADFD ||
1315 	    (error == ETIME && flag))) {
1316 		/* return without events */
1317 		port_check_return_cond(portq);
1318 		mutex_exit(&portq->portq_mutex);
1319 		return (error);
1320 	}
1321 
1322 portnowait:
1323 	nmax = max < portq->portq_nent ? max : portq->portq_nent;
1324 
1325 	/*
1326 	 * Move port event queue to a temporary event queue .
1327 	 * New incoming events will be continue be posted to the event queue
1328 	 * and they will not be considered by the current thread.
1329 	 * The idea is to avoid lock contentions or an often locking/unlocking
1330 	 * of the port queue mutex. The contention and performance degradation
1331 	 * could happen because:
1332 	 * a) incoming events use the port queue mutex to enqueue new events and
1333 	 * b) before the event can be delivered to the application it is
1334 	 *    necessary to notify the event sources about the event delivery.
1335 	 *    Sometimes the event sources can require a long time to return and
1336 	 *    the queue mutex would block incoming events.
1337 	 * During this time incoming events (port_send_event()) do not need
1338 	 * to awake threads waiting for events. Before the current thread
1339 	 * returns it will check the conditions to awake other waiting threads.
1340 	 */
1341 	portq->portq_getn++;	/* number of threads retrieving events */
1342 	mutex_enter(&portq->portq_block_mutex); /* block other threads here */
1343 	if (portq->portq_tnent) {
1344 		/*
1345 		 * Move remaining events from previous thread back to the
1346 		 * port event queue.
1347 		 */
1348 		port_push_eventq(portq);
1349 	}
1350 	/* move port event queue to a temporary queue */
1351 	list_move_tail(&portq->portq_get_list, &portq->portq_list);
1352 	glist = &portq->portq_get_list;	/* use temporary event queue */
1353 	tnent = portq->portq_nent;	/* get current number of events */
1354 	portq->portq_nent = 0;		/* no events in the port event queue */
1355 	portq->portq_flags |= PORTQ_WAIT_EVENTS; /* detect incoming events */
1356 	mutex_exit(&portq->portq_mutex);    /* event queue can be reused now */
1357 
1358 	if (model == DATAMODEL_NATIVE) {
1359 		eventsz = sizeof (port_event_t);
1360 		kevp = kmem_alloc(eventsz * nmax, KM_NOSLEEP);
1361 		if (kevp == NULL) {
1362 			if (nmax > pp->port_max_list)
1363 				nmax = pp->port_max_list;
1364 			kevp = kmem_alloc(eventsz * nmax, KM_SLEEP);
1365 		}
1366 		results = kevp;
1367 		lev = NULL;	/* start with first event in the queue */
1368 		for (nevents = 0; nevents < nmax; ) {
1369 			pev = port_get_kevent(glist, lev);
1370 			if (pev == NULL)	/* no more events available */
1371 				break;
1372 			if (pev->portkev_flags & PORT_KEV_FREE) {
1373 				/* Just discard event */
1374 				list_remove(glist, pev);
1375 				pev->portkev_flags &= ~(PORT_CLEANUP_DONE);
1376 				if (PORT_FREE_EVENT(pev))
1377 					port_free_event_local(pev, 0);
1378 				tnent--;
1379 				continue;
1380 			}
1381 
1382 			/* move event data to copyout list */
1383 			if (port_copy_event(&kevp[nevents], pev, glist)) {
1384 				/*
1385 				 * Event can not be delivered to the
1386 				 * current process.
1387 				 */
1388 				if (lev != NULL)
1389 					list_insert_after(glist, lev, pev);
1390 				else
1391 					list_insert_head(glist, pev);
1392 				lev = pev;  /* last checked event */
1393 			} else {
1394 				nevents++;	/* # of events ready */
1395 			}
1396 		}
1397 #ifdef	_SYSCALL32_IMPL
1398 	} else {
1399 		port_event32_t	*kevp32;
1400 
1401 		eventsz = sizeof (port_event32_t);
1402 		kevp32 = kmem_alloc(eventsz * nmax, KM_NOSLEEP);
1403 		if (kevp32 == NULL) {
1404 			if (nmax > pp->port_max_list)
1405 				nmax = pp->port_max_list;
1406 			kevp32 = kmem_alloc(eventsz * nmax, KM_SLEEP);
1407 		}
1408 		results = kevp32;
1409 		lev = NULL;	/* start with first event in the queue */
1410 		for (nevents = 0; nevents < nmax; ) {
1411 			pev = port_get_kevent(glist, lev);
1412 			if (pev == NULL)	/* no more events available */
1413 				break;
1414 			if (pev->portkev_flags & PORT_KEV_FREE) {
1415 				/* Just discard event */
1416 				list_remove(glist, pev);
1417 				pev->portkev_flags &= ~(PORT_CLEANUP_DONE);
1418 				if (PORT_FREE_EVENT(pev))
1419 					port_free_event_local(pev, 0);
1420 				tnent--;
1421 				continue;
1422 			}
1423 
1424 			/* move event data to copyout list */
1425 			if (port_copy_event32(&kevp32[nevents], pev, glist)) {
1426 				/*
1427 				 * Event can not be delivered to the
1428 				 * current process.
1429 				 */
1430 				if (lev != NULL)
1431 					list_insert_after(glist, lev, pev);
1432 				else
1433 					list_insert_head(glist, pev);
1434 				lev = pev;  /* last checked event */
1435 			} else {
1436 				nevents++;	/* # of events ready */
1437 			}
1438 		}
1439 #endif	/* _SYSCALL32_IMPL */
1440 	}
1441 
1442 	/*
1443 	 *  Remember number of remaining events in the temporary event queue.
1444 	 */
1445 	portq->portq_tnent = tnent - nevents;
1446 	mutex_exit(&portq->portq_block_mutex);
1447 
1448 	/*
1449 	 * Work to do before return :
1450 	 * - push list of remaining events back to the top of the standard
1451 	 *   port queue.
1452 	 * - if this is the last thread calling port_get(n) then wakeup the
1453 	 *   thread waiting on close(2).
1454 	 * - check for a deferred cv_signal from port_send_event() and wakeup
1455 	 *   the sleeping thread.
1456 	 */
1457 
1458 	mutex_enter(&portq->portq_mutex);
1459 	if (portq->portq_tnent) {
1460 		/*
1461 		 * move remaining events in the temporary event queue back
1462 		 * to the port event queue
1463 		 */
1464 		port_push_eventq(portq);
1465 	}
1466 	portq->portq_getn--;	/* update # of threads retrieving events */
1467 	if (--portq->portq_thrcnt == 0) { /* # of threads waiting ... */
1468 		/* Last thread => check close(2) conditions ... */
1469 		if (portq->portq_flags & PORTQ_CLOSE) {
1470 			cv_signal(&portq->portq_closecv);
1471 			mutex_exit(&portq->portq_mutex);
1472 			kmem_free(results, eventsz * nmax);
1473 			/* do not copyout events */
1474 			*nget = 0;
1475 			return (EBADFD);
1476 		}
1477 	} else if (portq->portq_getn == 0) {
1478 		/*
1479 		 * no other threads retrieving events ...
1480 		 * check wakeup conditions of sleeping threads
1481 		 */
1482 		if ((portq->portq_thread != NULL) &&
1483 		    (portq->portq_nent >= portq->portq_nget))
1484 			cv_signal(&portq->portq_thread->portget_cv);
1485 	}
1486 
1487 	/*
1488 	 * Check PORTQ_POLLIN here because the current thread set temporarily
1489 	 * the number of events in the queue to zero.
1490 	 */
1491 	if (portq->portq_flags & PORTQ_POLLIN) {
1492 		portq->portq_flags &= ~PORTQ_POLLIN;
1493 		mutex_exit(&portq->portq_mutex);
1494 		pollwakeup(&pp->port_pollhd, POLLIN);
1495 	} else {
1496 		mutex_exit(&portq->portq_mutex);
1497 	}
1498 
1499 	/* now copyout list of user event structures to user space */
1500 	if (nevents) {
1501 		if (copyout(results, uevp, nevents * eventsz))
1502 			error = EFAULT;
1503 	}
1504 	kmem_free(results, eventsz * nmax);
1505 
1506 	if (nevents == 0 && error == 0 && pgt->pgt_loop == 0 && blocking != 0) {
1507 		/* no events retrieved: check loop conditions */
1508 		if (blocking == -1) {
1509 			/* no timeout checked */
1510 			error = port_get_timeout(pgt->pgt_timeout,
1511 			    &pgt->pgt_rqtime, &rqtp, &blocking, flag);
1512 			if (error) {
1513 				*nget = nevents;
1514 				return (error);
1515 			}
1516 			if (rqtp != NULL) {
1517 				timespec_t	now;
1518 				pgt->pgt_timecheck = timechanged;
1519 				gethrestime(&now);
1520 				timespecadd(&pgt->pgt_rqtime, &now);
1521 			}
1522 			pgt->pgt_rqtp = rqtp;
1523 		} else {
1524 			/* timeout already checked -> remember values */
1525 			pgt->pgt_rqtp = rqtp;
1526 			if (rqtp != NULL) {
1527 				pgt->pgt_timecheck = timecheck;
1528 				pgt->pgt_rqtime = *rqtp;
1529 			}
1530 		}
1531 		if (blocking)
1532 			/* timeout remaining */
1533 			pgt->pgt_loop = 1;
1534 	}
1535 
1536 	/* set number of user event structures completed */
1537 	*nget = nevents;
1538 	return (error);
1539 }
1540 
1541 /*
1542  * 1. copy kernel event structure to user event structure.
1543  * 2. PORT_KEV_WIRED event structures will be reused by the "source"
1544  * 3. Remove PORT_KEV_DONEQ flag (event removed from the event queue)
1545  * 4. Other types of event structures can be delivered back to the port cache
1546  *    (port_free_event_local()).
1547  * 5. The event source callback function is the last opportunity for the
1548  *    event source to update events, to free local resources associated with
1549  *    the event or to deny the delivery of the event.
1550  */
1551 static int
1552 port_copy_event(port_event_t *puevp, port_kevent_t *pkevp, list_t *list)
1553 {
1554 	int	free_event = 0;
1555 	int	flags;
1556 	int	error;
1557 
1558 	puevp->portev_source = pkevp->portkev_source;
1559 	puevp->portev_object = pkevp->portkev_object;
1560 	puevp->portev_user = pkevp->portkev_user;
1561 	puevp->portev_events = pkevp->portkev_events;
1562 
1563 	/* remove event from the queue */
1564 	list_remove(list, pkevp);
1565 
1566 	/*
1567 	 * Events of type PORT_KEV_WIRED remain allocated by the
1568 	 * event source.
1569 	 */
1570 	flags = pkevp->portkev_flags;
1571 	if (pkevp->portkev_flags & PORT_KEV_WIRED)
1572 		pkevp->portkev_flags &= ~PORT_KEV_DONEQ;
1573 	else
1574 		free_event = 1;
1575 
1576 	if (pkevp->portkev_callback) {
1577 		error = (*pkevp->portkev_callback)(pkevp->portkev_arg,
1578 		    &puevp->portev_events, pkevp->portkev_pid,
1579 		    PORT_CALLBACK_DEFAULT, pkevp);
1580 
1581 		if (error) {
1582 			/*
1583 			 * Event can not be delivered.
1584 			 * Caller must reinsert the event into the queue.
1585 			 */
1586 			pkevp->portkev_flags = flags;
1587 			return (error);
1588 		}
1589 	}
1590 	if (free_event)
1591 		port_free_event_local(pkevp, 0);
1592 	return (0);
1593 }
1594 
1595 #ifdef	_SYSCALL32_IMPL
1596 /*
1597  * 1. copy kernel event structure to user event structure.
1598  * 2. PORT_KEV_WIRED event structures will be reused by the "source"
1599  * 3. Remove PORT_KEV_DONEQ flag (event removed from the event queue)
1600  * 4. Other types of event structures can be delivered back to the port cache
1601  *    (port_free_event_local()).
1602  * 5. The event source callback function is the last opportunity for the
1603  *    event source to update events, to free local resources associated with
1604  *    the event or to deny the delivery of the event.
1605  */
1606 static int
1607 port_copy_event32(port_event32_t *puevp, port_kevent_t *pkevp, list_t *list)
1608 {
1609 	int	free_event = 0;
1610 	int	error;
1611 	int	flags;
1612 
1613 	puevp->portev_source = pkevp->portkev_source;
1614 	puevp->portev_object = (daddr32_t)pkevp->portkev_object;
1615 	puevp->portev_user = (caddr32_t)(uintptr_t)pkevp->portkev_user;
1616 	puevp->portev_events = pkevp->portkev_events;
1617 
1618 	/* remove event from the queue */
1619 	list_remove(list, pkevp);
1620 
1621 	/*
1622 	 * Events if type PORT_KEV_WIRED remain allocated by the
1623 	 * sub-system (source).
1624 	 */
1625 
1626 	flags = pkevp->portkev_flags;
1627 	if (pkevp->portkev_flags & PORT_KEV_WIRED)
1628 		pkevp->portkev_flags &= ~PORT_KEV_DONEQ;
1629 	else
1630 		free_event = 1;
1631 
1632 	if (pkevp->portkev_callback != NULL) {
1633 		error = (*pkevp->portkev_callback)(pkevp->portkev_arg,
1634 		    &puevp->portev_events, pkevp->portkev_pid,
1635 		    PORT_CALLBACK_DEFAULT, pkevp);
1636 		if (error) {
1637 			/*
1638 			 * Event can not be delivered.
1639 			 * Caller must reinsert the event into the queue.
1640 			 */
1641 			pkevp->portkev_flags = flags;
1642 			return (error);
1643 		}
1644 	}
1645 	if (free_event)
1646 		port_free_event_local(pkevp, 0);
1647 	return (0);
1648 }
1649 #endif	/* _SYSCALL32_IMPL */
1650 
1651 /*
1652  * copyout alert event.
1653  */
1654 static int
1655 port_get_alert(port_alert_t *pa, port_event_t *uevp)
1656 {
1657 	model_t	model = get_udatamodel();
1658 
1659 	/* copyout alert event structures to user space */
1660 	if (model == DATAMODEL_NATIVE) {
1661 		port_event_t	uev;
1662 		uev.portev_source = PORT_SOURCE_ALERT;
1663 		uev.portev_object = pa->portal_object;
1664 		uev.portev_events = pa->portal_events;
1665 		uev.portev_user = pa->portal_user;
1666 		if (copyout(&uev, uevp, sizeof (port_event_t)))
1667 			return (EFAULT);
1668 #ifdef	_SYSCALL32_IMPL
1669 	} else {
1670 		port_event32_t	uev32;
1671 		uev32.portev_source = PORT_SOURCE_ALERT;
1672 		uev32.portev_object = (daddr32_t)pa->portal_object;
1673 		uev32.portev_events = pa->portal_events;
1674 		uev32.portev_user = (daddr32_t)(uintptr_t)pa->portal_user;
1675 		if (copyout(&uev32, uevp, sizeof (port_event32_t)))
1676 			return (EFAULT);
1677 #endif	/* _SYSCALL32_IMPL */
1678 	}
1679 	return (0);
1680 }
1681 
1682 /*
1683  * Check return conditions :
1684  * - pending port close(2)
1685  * - threads waiting for events
1686  */
1687 static void
1688 port_check_return_cond(port_queue_t *portq)
1689 {
1690 	ASSERT(MUTEX_HELD(&portq->portq_mutex));
1691 	portq->portq_thrcnt--;
1692 	if (portq->portq_flags & PORTQ_CLOSE) {
1693 		if (portq->portq_thrcnt == 0)
1694 			cv_signal(&portq->portq_closecv);
1695 		else
1696 			cv_signal(&portq->portq_thread->portget_cv);
1697 	}
1698 }
1699 
1700 /*
1701  * The port_get_kevent() function returns
1702  * - the event located at the head of the queue if 'last' pointer is NULL
1703  * - the next event after the event pointed by 'last'
1704  * The caller of this function is responsible for the integrity of the queue
1705  * in use:
1706  * - port_getn() is using a temporary queue protected with
1707  *   portq->portq_block_mutex
1708  * - port_close_events() is working on the global event queue and protects the
1709  *   queue with portq->portq_mutex.
1710  */
1711 
1712 port_kevent_t *
1713 port_get_kevent(list_t *list, port_kevent_t *last)
1714 {
1715 	if (last == NULL)
1716 		return (list_head(list));
1717 	else
1718 		return (list_next(list, last));
1719 }
1720 
1721 /*
1722  * The port_get_timeout() function gets the timeout data from user space
1723  * and converts that info into a corresponding internal representation.
1724  * The kerneldata flag means that the timeout data is already loaded.
1725  */
1726 static int
1727 port_get_timeout(timespec_t *timeout, timespec_t *rqtime, timespec_t **rqtp,
1728     int *blocking, int kerneldata)
1729 {
1730 	model_t	model = get_udatamodel();
1731 
1732 	*rqtp = NULL;
1733 	if (timeout == NULL) {
1734 		*blocking = 1;
1735 		return (0);
1736 	}
1737 
1738 	if (kerneldata) {
1739 		*rqtime = *timeout;
1740 	} else {
1741 		if (model == DATAMODEL_NATIVE) {
1742 			if (copyin(timeout, rqtime, sizeof (*rqtime)))
1743 				return (EFAULT);
1744 #ifdef	_SYSCALL32_IMPL
1745 		} else {
1746 			timespec32_t 	wait_time_32;
1747 			if (copyin(timeout, &wait_time_32,
1748 			    sizeof (wait_time_32)))
1749 				return (EFAULT);
1750 			TIMESPEC32_TO_TIMESPEC(rqtime, &wait_time_32);
1751 #endif  /* _SYSCALL32_IMPL */
1752 		}
1753 	}
1754 
1755 	if (rqtime->tv_sec == 0 && rqtime->tv_nsec == 0) {
1756 		*blocking = 0;
1757 		return (0);
1758 	}
1759 
1760 	if (rqtime->tv_sec < 0 ||
1761 	    rqtime->tv_nsec < 0 || rqtime->tv_nsec >= NANOSEC)
1762 		return (EINVAL);
1763 
1764 	*rqtp = rqtime;
1765 	*blocking = 1;
1766 	return (0);
1767 }
1768 
1769 /*
1770  * port_queue_thread()
1771  * Threads requiring more events than available will be put in a wait queue.
1772  * There is a "thread wait queue" per port.
1773  * Threads requiring less events get a higher priority than others and they
1774  * will be awoken first.
1775  */
1776 static portget_t *
1777 port_queue_thread(port_queue_t *portq, uint_t nget)
1778 {
1779 	portget_t	*pgetp;
1780 	portget_t	*ttp;
1781 	portget_t	*htp;
1782 
1783 	pgetp = kmem_zalloc(sizeof (portget_t), KM_SLEEP);
1784 	pgetp->portget_nget = nget;
1785 	pgetp->portget_pid = curproc->p_pid;
1786 	if (portq->portq_thread == NULL) {
1787 		/* first waiting thread */
1788 		portq->portq_thread = pgetp;
1789 		portq->portq_nget = nget;
1790 		pgetp->portget_prev = pgetp;
1791 		pgetp->portget_next = pgetp;
1792 		return (pgetp);
1793 	}
1794 
1795 	/*
1796 	 * thread waiting for less events will be set on top of the queue.
1797 	 */
1798 	ttp = portq->portq_thread;
1799 	htp = ttp;
1800 	for (;;) {
1801 		if (nget <= ttp->portget_nget)
1802 			break;
1803 		if (htp == ttp->portget_next)
1804 			break;	/* last event */
1805 		ttp = ttp->portget_next;
1806 	}
1807 
1808 	/* add thread to the queue */
1809 	pgetp->portget_next = ttp;
1810 	pgetp->portget_prev = ttp->portget_prev;
1811 	ttp->portget_prev->portget_next = pgetp;
1812 	ttp->portget_prev = pgetp;
1813 	if (portq->portq_thread == ttp)
1814 		portq->portq_thread = pgetp;
1815 	portq->portq_nget = portq->portq_thread->portget_nget;
1816 	return (pgetp);
1817 }
1818 
1819 /*
1820  * Take thread out of the queue.
1821  */
1822 static void
1823 port_dequeue_thread(port_queue_t *portq, portget_t *pgetp)
1824 {
1825 	if (pgetp->portget_next == pgetp) {
1826 		/* last (single) waiting thread */
1827 		portq->portq_thread = NULL;
1828 	} else {
1829 		pgetp->portget_prev->portget_next = pgetp->portget_next;
1830 		pgetp->portget_next->portget_prev = pgetp->portget_prev;
1831 		if (portq->portq_thread == pgetp)
1832 			portq->portq_thread = pgetp->portget_next;
1833 		portq->portq_nget = portq->portq_thread->portget_nget;
1834 	}
1835 	kmem_free(pgetp, sizeof (portget_t));
1836 }
1837 
1838 /*
1839  * Set up event port kstats.
1840  */
1841 static void
1842 port_kstat_init()
1843 {
1844 	kstat_t	*ksp;
1845 	uint_t	ndata;
1846 
1847 	ndata = sizeof (port_kstat) / sizeof (kstat_named_t);
1848 	ksp = kstat_create("portfs", 0, "Event Ports", "misc",
1849 	    KSTAT_TYPE_NAMED, ndata, KSTAT_FLAG_VIRTUAL);
1850 	if (ksp) {
1851 		ksp->ks_data = &port_kstat;
1852 		kstat_install(ksp);
1853 	}
1854 }
1855