xref: /illumos-gate/usr/src/uts/common/os/aio_subr.c (revision 7f3d7c9289dee6488b3cd2848a68c0b8580d750c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/proc.h>
29 #include <sys/file.h>
30 #include <sys/errno.h>
31 #include <sys/param.h>
32 #include <sys/sysmacros.h>
33 #include <sys/cmn_err.h>
34 #include <sys/systm.h>
35 #include <vm/as.h>
36 #include <vm/page.h>
37 #include <sys/uio.h>
38 #include <sys/kmem.h>
39 #include <sys/debug.h>
40 #include <sys/aio_impl.h>
41 #include <sys/epm.h>
42 #include <sys/fs/snode.h>
43 #include <sys/siginfo.h>
44 #include <sys/cpuvar.h>
45 #include <sys/conf.h>
46 #include <sys/sdt.h>
47 
48 int aphysio(int (*)(), int (*)(), dev_t, int, void (*)(), struct aio_req *);
49 int aio_done(struct buf *);
50 void aphysio_unlock(aio_req_t *);
51 void aio_cleanup(int);
52 void aio_cleanup_exit(void);
53 
54 /*
55  * private functions
56  */
57 static void aio_sigev_send(proc_t *, sigqueue_t *);
58 static void aio_hash_delete(aio_t *, aio_req_t *);
59 static void aio_lio_free(aio_t *, aio_lio_t *);
60 static int aio_cleanup_cleanupq(aio_t *, aio_req_t *, int);
61 static int aio_cleanup_notifyq(aio_t *, aio_req_t *, int);
62 static void aio_cleanup_pollq(aio_t *, aio_req_t *, int);
63 static void aio_cleanup_portq(aio_t *, aio_req_t *, int);
64 
/*
 * async version of physio() that doesn't wait synchronously
 * for the driver's strategy routine to complete.
 *
 * The buf embedded in the aio request (aio->aio_private) is set up from
 * the first iovec of aio->aio_uio, the user pages backing that iovec are
 * locked with as_pagelock(), and the buf is handed to the strategy routine.
 *
 *	strategy - routine that is handed the prepared buf
 *	cancel   - must be anocancel; any other value panics
 *	dev      - device the transfer is directed at
 *	rw       - B_READ for a read, anything else is counted as a write
 *	mincnt   - may adjust bp->b_bcount; an adjusted count is rejected
 *	aio      - request whose aio_uio / aio_private describe the transfer
 *
 * Returns EINVAL for an unusable offset, ENOTSUP when mincnt changed the
 * transfer size, the as_pagelock() error when the pages could not be
 * locked, and otherwise whatever the strategy routine returns.
 */

int
aphysio(
	int (*strategy)(struct buf *),
	int (*cancel)(struct buf *),
	dev_t dev,
	int rw,
	void (*mincnt)(struct buf *),
	struct aio_req *aio)
{
	struct uio *uio = aio->aio_uio;
	aio_req_t *reqp = (aio_req_t *)aio->aio_private;
	struct buf *bp = &reqp->aio_req_buf;
	struct iovec *iov;
	struct as *as;
	char *a;
	int	error;
	size_t	c;
	struct page **pplist;
	struct dev_ops *ops = devopsp[getmajor(dev)];

	/* negative offsets are never valid */
	if (uio->uio_loffset < 0)
		return (EINVAL);
#ifdef	_ILP32
	/*
	 * For 32-bit kernels, check against SPEC_MAXOFFSET_T which represents
	 * the maximum size that can be supported by the IO subsystem.
	 * XXX this code assumes a D_64BIT driver.
	 */
	if (uio->uio_loffset > SPEC_MAXOFFSET_T)
		return (EINVAL);
#endif	/* _ILP32 */

	/* account for the request in the per-CPU physio statistics */
	if (rw == B_READ) {
		CPU_STATS_ADD_K(sys, phread, 1);
	} else {
		CPU_STATS_ADD_K(sys, phwrite, 1);
	}

	/* only the first iovec of the uio is used for the transfer */
	iov = uio->uio_iov;
	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);

	/* describe the transfer in the request's embedded buf */
	bp->b_error = 0;
	bp->b_flags = B_BUSY | B_PHYS | B_ASYNC | rw;
	bp->b_edev = dev;
	bp->b_dev = cmpdev(dev);
	bp->b_lblkno = btodt(uio->uio_loffset);
	bp->b_offset = uio->uio_loffset;
	/* the result of the devinfo lookup is deliberately ignored */
	(void) ops->devo_getinfo(NULL, DDI_INFO_DEVT2DEVINFO,
	    (void *)bp->b_edev, (void **)&bp->b_dip);

	/*
	 * Clustering: Clustering can set the b_iodone, b_forw and
	 * b_proc fields to cluster-specific values.
	 */
	if (bp->b_iodone == NULL) {
		bp->b_iodone = aio_done;
		/* b_forw points at an aio_req_t structure */
		bp->b_forw = (struct buf *)reqp;
		bp->b_proc = curproc;
	}

	a = bp->b_un.b_addr = iov->iov_base;
	c = bp->b_bcount = iov->iov_len;

	/*
	 * Let the caller-supplied mincnt routine look at the transfer size.
	 * If it changed b_bcount the request is refused rather than split.
	 */
	(*mincnt)(bp);
	if (bp->b_bcount != iov->iov_len)
		return (ENOTSUP);

	as = bp->b_proc->p_as;

	/*
	 * Lock down the user buffer. A read from the device stores into
	 * the buffer, so the pages are locked for S_WRITE in that case.
	 */
	error = as_pagelock(as, &pplist, a,
	    c, rw == B_READ? S_WRITE : S_READ);
	if (error != 0) {
		/*
		 * Record the failure in the buf. AIO_PAGELOCKDONE stays
		 * clear, so aphysio_unlock() will not call as_pageunlock().
		 */
		bp->b_flags |= B_ERROR;
		bp->b_error = error;
		bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
		return (error);
	}
	reqp->aio_req_flags |= AIO_PAGELOCKDONE;
	/* remember the page list (if any) so that it can be unlocked later */
	bp->b_shadow = pplist;
	if (pplist != NULL) {
		bp->b_flags |= B_SHADOW;
	}

	/* only the anocancel routine is accepted as a cancel routine */
	if (cancel != anocancel)
		cmn_err(CE_PANIC,
		    "aphysio: cancellation not supported, use anocancel");

	reqp->aio_req_cancel = cancel;

	DTRACE_IO1(start, struct buf *, bp);

	/* hand the buf to the driver; completion arrives via b_iodone */
	return ((*strategy)(bp));
}
165 
/*
 * Cancel routine for requests that cannot be cancelled. The buf is not
 * examined; the answer is always ENXIO.
 */
/*ARGSUSED*/
int
anocancel(struct buf *bp)
{
	const int rc = ENXIO;

	return (rc);
}
172 
/*
 * Called from biodone().
 * Notify process that a pending AIO has finished.
 *
 * The request is found through bp->b_forw and the owning process through
 * bp->b_proc. Depending on the request and process state, the request is
 * moved to the port queue, the port cleanup queue, the poll queue, the
 * notify queue, the cleanup queue or the done queue, and the matching
 * wakeups, signals and port events are issued.
 *
 * aio_portq_mutex is taken before aio_mutex. Signals and port events are
 * only sent after both have been dropped. Always returns 0.
 */

/*
 * Clustering: This function is made non-static as it is used
 * by clustering s/w as contract private interface.
 */

int
aio_done(struct buf *bp)
{
	proc_t *p;
	struct as *as;
	aio_req_t *reqp;
	aio_lio_t *head = NULL;
	aio_t *aiop;
	sigqueue_t *sigev = NULL;
	sigqueue_t *lio_sigev = NULL;
	port_kevent_t *pkevp = NULL;
	port_kevent_t *lio_pkevp = NULL;
	int fd;
	int cleanupqflag;
	int pollqflag;
	int portevpend;
	void (*func)();
	int use_port = 0;
	int reqp_flags = 0;
	int send_signal = 0;

	p = bp->b_proc;
	as = p->p_as;
	reqp = (aio_req_t *)bp->b_forw;
	fd = reqp->aio_req_fd;

	/*
	 * mapout earlier so that more kmem is available when aio is
	 * heavily used. bug #1262082
	 */
	if (bp->b_flags & B_REMAPPED)
		bp_mapout(bp);

	/* decrement fd's ref count by one, now that aio request is done. */
	areleasef(fd, P_FINFO(p));

	aiop = p->p_aio;
	ASSERT(aiop != NULL);

	/* lock order: aio_portq_mutex first, then aio_mutex */
	mutex_enter(&aiop->aio_portq_mutex);
	mutex_enter(&aiop->aio_mutex);
	ASSERT(aiop->aio_pending > 0);
	ASSERT(reqp->aio_req_flags & AIO_PENDING);
	aiop->aio_pending--;
	reqp->aio_req_flags &= ~AIO_PENDING;
	/* snapshot of the flags, consulted after the locks are dropped */
	reqp_flags = reqp->aio_req_flags;
	if ((pkevp = reqp->aio_req_portkev) != NULL) {
		/* Event port notification is desired for this transaction */
		if (reqp->aio_req_flags & AIO_CLOSE_PORT) {
			/*
			 * The port is being closed and it is waiting for
			 * pending asynchronous I/O transactions to complete.
			 */
			portevpend = --aiop->aio_portpendcnt;
			aio_deq(&aiop->aio_portpending, reqp);
			aio_enq(&aiop->aio_portq, reqp, 0);
			mutex_exit(&aiop->aio_mutex);
			mutex_exit(&aiop->aio_portq_mutex);
			port_send_event(pkevp);
			/* last pending port request: wake aio_portcv waiters */
			if (portevpend == 0)
				cv_broadcast(&aiop->aio_portcv);
			return (0);
		}

		if (aiop->aio_flags & AIO_CLEANUP) {
			/*
			 * aio_cleanup_thread() is waiting for completion of
			 * transactions.
			 */
			mutex_enter(&as->a_contents);
			aio_deq(&aiop->aio_portpending, reqp);
			aio_enq(&aiop->aio_portcleanupq, reqp, 0);
			cv_signal(&aiop->aio_cleanupcv);
			mutex_exit(&as->a_contents);
			mutex_exit(&aiop->aio_mutex);
			mutex_exit(&aiop->aio_portq_mutex);
			return (0);
		}

		/* normal case: move the request from pending to the portq */
		aio_deq(&aiop->aio_portpending, reqp);
		aio_enq(&aiop->aio_portq, reqp, 0);

		/* the port event itself is sent once the locks are dropped */
		use_port = 1;
	} else {
		/*
		 * when the AIO_CLEANUP flag is enabled for this
		 * process, or when the AIO_POLL bit is set for
		 * this request, special handling is required.
		 * otherwise the request is put onto the doneq.
		 */
		cleanupqflag = (aiop->aio_flags & AIO_CLEANUP);
		pollqflag = (reqp->aio_req_flags & AIO_POLL);
		if (cleanupqflag | pollqflag) {

			if (cleanupqflag)
				mutex_enter(&as->a_contents);

			/*
			 * requests with their AIO_POLL bit set are put
			 * on the pollq, requests with sigevent structures
			 * or with listio heads are put on the notifyq, and
			 * the remaining requests don't require any special
			 * cleanup handling, so they're put onto the default
			 * cleanupq.
			 */
			if (pollqflag)
				aio_enq(&aiop->aio_pollq, reqp, AIO_POLLQ);
			else if (reqp->aio_req_sigqp || reqp->aio_req_lio)
				aio_enq(&aiop->aio_notifyq, reqp, AIO_NOTIFYQ);
			else
				aio_enq(&aiop->aio_cleanupq, reqp,
				    AIO_CLEANUPQ);

			if (cleanupqflag) {
				/* wake whoever is waiting on aio_cleanupcv */
				cv_signal(&aiop->aio_cleanupcv);
				mutex_exit(&as->a_contents);
				mutex_exit(&aiop->aio_mutex);
				mutex_exit(&aiop->aio_portq_mutex);
			} else {
				ASSERT(pollqflag);
				/* block aio_cleanup_exit until we're done */
				aiop->aio_flags |= AIO_DONE_ACTIVE;
				mutex_exit(&aiop->aio_mutex);
				mutex_exit(&aiop->aio_portq_mutex);
				/*
				 * let the cleanup processing happen from an AST
				 * set an AST on all threads in this process
				 */
				mutex_enter(&p->p_lock);
				set_proc_ast(p);
				mutex_exit(&p->p_lock);
				mutex_enter(&aiop->aio_mutex);
				/* wakeup anybody waiting in aiowait() */
				cv_broadcast(&aiop->aio_waitcv);

				/* wakeup aio_cleanup_exit if needed */
				if (aiop->aio_flags & AIO_CLEANUP)
					cv_signal(&aiop->aio_cleanupcv);
				aiop->aio_flags &= ~AIO_DONE_ACTIVE;
				mutex_exit(&aiop->aio_mutex);
			}
			return (0);
		}

		/*
		 * save req's sigevent pointer, and check its
		 * value after releasing aio_mutex lock.
		 */
		sigev = reqp->aio_req_sigqp;
		reqp->aio_req_sigqp = NULL;

		/* put request on done queue. */
		aio_enq(&aiop->aio_doneq, reqp, AIO_DONEQ);
	} /* portkevent */

	/*
	 * when list IO notification is enabled, a notification or
	 * signal is sent only when all entries in the list are done.
	 */
	if ((head = reqp->aio_req_lio) != NULL) {
		ASSERT(head->lio_refcnt > 0);
		if (--head->lio_refcnt == 0) {
			/*
			 * save lio's sigevent pointer, and check
			 * its value after releasing aio_mutex lock.
			 */
			lio_sigev = head->lio_sigqp;
			head->lio_sigqp = NULL;
			cv_signal(&head->lio_notify);
			/*
			 * a list-level port event is taken at most once:
			 * lio_port is reset to -1 when it is picked up.
			 */
			if (head->lio_port >= 0 &&
			    (lio_pkevp = head->lio_portkev) != NULL)
				head->lio_port = -1;
		}
	}

	/*
	 * if AIO_WAITN set then
	 * send signal only when we reached the
	 * required amount of IO's finished
	 * or when all IO's are done
	 */
	if (aiop->aio_flags & AIO_WAITN) {
		if (aiop->aio_waitncnt > 0)
			aiop->aio_waitncnt--;
		if (aiop->aio_pending == 0 ||
		    aiop->aio_waitncnt == 0)
			cv_broadcast(&aiop->aio_waitcv);
	} else {
		cv_broadcast(&aiop->aio_waitcv);
	}

	/*
	 * No need to set this flag for pollq, portq, lio requests.
	 * If this is an old Solaris aio request, and the process has
	 * a SIGIO signal handler enabled, then send a SIGIO signal.
	 */
	if (!sigev && !use_port && head == NULL &&
	    (reqp->aio_req_flags & AIO_SOLARIS) &&
	    (func = PTOU(p)->u_signal[SIGIO - 1]) != SIG_DFL &&
	    (func != SIG_IGN)) {
		send_signal = 1;
		reqp->aio_req_flags |= AIO_SIGNALLED;
	}

	mutex_exit(&aiop->aio_mutex);
	mutex_exit(&aiop->aio_portq_mutex);

	/*
	 * Could the cleanup thread be waiting for AIO with locked
	 * resources to finish?
	 * Ideally in that case cleanup thread should block on cleanupcv,
	 * but there is a window, where it could miss to see a new aio
	 * request that sneaked in.
	 */
	mutex_enter(&as->a_contents);
	if ((reqp_flags & AIO_PAGELOCKDONE) && AS_ISUNMAPWAIT(as))
		cv_broadcast(&as->a_cv);
	mutex_exit(&as->a_contents);

	/*
	 * Deliver the notifications collected above. A per-request
	 * sigevent takes precedence over the plain SIGIO.
	 */
	if (sigev)
		aio_sigev_send(p, sigev);
	else if (send_signal)
		psignal(p, SIGIO);

	if (pkevp)
		port_send_event(pkevp);
	if (lio_sigev)
		aio_sigev_send(p, lio_sigev);
	if (lio_pkevp)
		port_send_event(lio_pkevp);

	return (0);
}
416 
417 /*
418  * send a queued signal to the specified process when
419  * the event signal is non-NULL. A return value of 1
420  * will indicate that a signal is queued, and 0 means that
421  * no signal was specified, nor sent.
422  */
423 static void
424 aio_sigev_send(proc_t *p, sigqueue_t *sigev)
425 {
426 	ASSERT(sigev != NULL);
427 
428 	mutex_enter(&p->p_lock);
429 	sigaddqa(p, NULL, sigev);
430 	mutex_exit(&p->p_lock);
431 }
432 
433 /*
434  * special case handling for zero length requests. the aio request
435  * short circuits the normal completion path since all that's required
436  * to complete this request is to copyout a zero to the aio request's
437  * return value.
438  */
439 void
440 aio_zerolen(aio_req_t *reqp)
441 {
442 
443 	struct buf *bp = &reqp->aio_req_buf;
444 
445 	reqp->aio_req_flags |= AIO_ZEROLEN;
446 
447 	bp->b_forw = (struct buf *)reqp;
448 	bp->b_proc = curproc;
449 
450 	bp->b_resid = 0;
451 	bp->b_flags = 0;
452 
453 	aio_done(bp);
454 }
455 
456 /*
457  * unlock pages previously locked by as_pagelock
458  */
459 void
460 aphysio_unlock(aio_req_t *reqp)
461 {
462 	struct buf *bp;
463 	struct iovec *iov;
464 	int flags;
465 
466 	if (reqp->aio_req_flags & AIO_PHYSIODONE)
467 		return;
468 
469 	reqp->aio_req_flags |= AIO_PHYSIODONE;
470 
471 	if (reqp->aio_req_flags & AIO_ZEROLEN)
472 		return;
473 
474 	bp = &reqp->aio_req_buf;
475 	iov = reqp->aio_req_uio.uio_iov;
476 	flags = (((bp->b_flags & B_READ) == B_READ) ? S_WRITE : S_READ);
477 	if (reqp->aio_req_flags & AIO_PAGELOCKDONE) {
478 		as_pageunlock(bp->b_proc->p_as,
479 		    bp->b_flags & B_SHADOW ? bp->b_shadow : NULL,
480 		    iov->iov_base, iov->iov_len, flags);
481 		reqp->aio_req_flags &= ~AIO_PAGELOCKDONE;
482 	}
483 	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
484 	bp->b_flags |= B_DONE;
485 }
486 
487 /*
488  * deletes a requests id from the hash table of outstanding io.
489  */
490 static void
491 aio_hash_delete(aio_t *aiop, struct aio_req_t *reqp)
492 {
493 	long index;
494 	aio_result_t *resultp = reqp->aio_req_resultp;
495 	aio_req_t *current;
496 	aio_req_t **nextp;
497 
498 	index = AIO_HASH(resultp);
499 	nextp = (aiop->aio_hash + index);
500 	while ((current = *nextp) != NULL) {
501 		if (current->aio_req_resultp == resultp) {
502 			*nextp = current->aio_hash_next;
503 			return;
504 		}
505 		nextp = &current->aio_hash_next;
506 	}
507 }
508 
509 /*
510  * Put a list head struct onto its free list.
511  */
512 static void
513 aio_lio_free(aio_t *aiop, aio_lio_t *head)
514 {
515 	ASSERT(MUTEX_HELD(&aiop->aio_mutex));
516 
517 	if (head->lio_sigqp != NULL)
518 		kmem_free(head->lio_sigqp, sizeof (sigqueue_t));
519 	head->lio_next = aiop->aio_lio_free;
520 	aiop->aio_lio_free = head;
521 }
522 
523 /*
524  * Put a reqp onto the freelist.
525  */
526 void
527 aio_req_free(aio_t *aiop, aio_req_t *reqp)
528 {
529 	aio_lio_t *liop;
530 
531 	ASSERT(MUTEX_HELD(&aiop->aio_mutex));
532 
533 	if (reqp->aio_req_portkev) {
534 		port_free_event(reqp->aio_req_portkev);
535 		reqp->aio_req_portkev = NULL;
536 	}
537 
538 	if ((liop = reqp->aio_req_lio) != NULL) {
539 		if (--liop->lio_nent == 0)
540 			aio_lio_free(aiop, liop);
541 		reqp->aio_req_lio = NULL;
542 	}
543 	if (reqp->aio_req_sigqp != NULL) {
544 		kmem_free(reqp->aio_req_sigqp, sizeof (sigqueue_t));
545 		reqp->aio_req_sigqp = NULL;
546 	}
547 	reqp->aio_req_next = aiop->aio_free;
548 	reqp->aio_req_prev = NULL;
549 	aiop->aio_free = reqp;
550 	aiop->aio_outstanding--;
551 	if (aiop->aio_outstanding == 0)
552 		cv_broadcast(&aiop->aio_waitcv);
553 	aio_hash_delete(aiop, reqp);
554 }
555 
556 /*
557  * Put a reqp onto the freelist.
558  */
559 void
560 aio_req_free_port(aio_t *aiop, aio_req_t *reqp)
561 {
562 	ASSERT(MUTEX_HELD(&aiop->aio_mutex));
563 
564 	reqp->aio_req_next = aiop->aio_free;
565 	reqp->aio_req_prev = NULL;
566 	aiop->aio_free = reqp;
567 	aiop->aio_outstanding--;
568 	aio_hash_delete(aiop, reqp);
569 }
570 
571 
/*
 * DEBUG-only consistency check of a circular request queue. Every
 * element must be correctly linked to both neighbours. When
 * entry_present is non-NULL it must occur exactly once; when
 * entry_missing is non-NULL it must not occur at all.
 * Compiles to nothing in non-DEBUG kernels.
 */
#if defined(DEBUG)
static void
aio_verify_queue(aio_req_t *head,
	aio_req_t *entry_present, aio_req_t *entry_missing)
{
	aio_req_t *cur = head;
	int nwanted = 0;
	int nunwanted = 0;

	if (cur != NULL) {
		do {
			ASSERT(cur->aio_req_prev->aio_req_next == cur);
			ASSERT(cur->aio_req_next->aio_req_prev == cur);
			if (cur == entry_present)
				nwanted++;
			if (cur == entry_missing)
				nunwanted++;
			cur = cur->aio_req_next;
		} while (cur != head);
	}
	ASSERT(entry_present == NULL || nwanted == 1);
	ASSERT(entry_missing == NULL || nunwanted == 0);
}
#else
#define	aio_verify_queue(x, y, z)
#endif
600 
601 /*
602  * Put a request onto the tail of a queue.
603  */
604 void
605 aio_enq(aio_req_t **qhead, aio_req_t *reqp, int qflg_new)
606 {
607 	aio_req_t *head;
608 	aio_req_t *prev;
609 
610 	aio_verify_queue(*qhead, NULL, reqp);
611 
612 	if ((head = *qhead) == NULL) {
613 		reqp->aio_req_next = reqp;
614 		reqp->aio_req_prev = reqp;
615 		*qhead = reqp;
616 	} else {
617 		reqp->aio_req_next = head;
618 		reqp->aio_req_prev = prev = head->aio_req_prev;
619 		prev->aio_req_next = reqp;
620 		head->aio_req_prev = reqp;
621 	}
622 	reqp->aio_req_flags |= qflg_new;
623 }
624 
625 /*
626  * Remove a request from its queue.
627  */
628 void
629 aio_deq(aio_req_t **qhead, aio_req_t *reqp)
630 {
631 	aio_verify_queue(*qhead, reqp, NULL);
632 
633 	if (reqp->aio_req_next == reqp) {
634 		*qhead = NULL;
635 	} else {
636 		reqp->aio_req_prev->aio_req_next = reqp->aio_req_next;
637 		reqp->aio_req_next->aio_req_prev = reqp->aio_req_prev;
638 		if (*qhead == reqp)
639 			*qhead = reqp->aio_req_next;
640 	}
641 	reqp->aio_req_next = NULL;
642 	reqp->aio_req_prev = NULL;
643 }
644 
645 /*
646  * concatenate a specified queue with the cleanupq. the specified
647  * queue is put onto the tail of the cleanupq. all elements on the
648  * specified queue should have their aio_req_flags field cleared.
649  */
650 /*ARGSUSED*/
651 void
652 aio_cleanupq_concat(aio_t *aiop, aio_req_t *q2, int qflg)
653 {
654 	aio_req_t *cleanupqhead, *q2tail;
655 	aio_req_t *reqp = q2;
656 
657 	do {
658 		ASSERT(reqp->aio_req_flags & qflg);
659 		reqp->aio_req_flags &= ~qflg;
660 		reqp->aio_req_flags |= AIO_CLEANUPQ;
661 	} while ((reqp = reqp->aio_req_next) != q2);
662 
663 	cleanupqhead = aiop->aio_cleanupq;
664 	if (cleanupqhead == NULL)
665 		aiop->aio_cleanupq = q2;
666 	else {
667 		cleanupqhead->aio_req_prev->aio_req_next = q2;
668 		q2tail = q2->aio_req_prev;
669 		q2tail->aio_req_next = cleanupqhead;
670 		q2->aio_req_prev = cleanupqhead->aio_req_prev;
671 		cleanupqhead->aio_req_prev = q2tail;
672 	}
673 }
674 
675 /*
676  * cleanup aio requests that are on the per-process poll queue.
677  */
678 void
679 aio_cleanup(int flag)
680 {
681 	aio_t *aiop = curproc->p_aio;
682 	aio_req_t *pollqhead, *cleanupqhead, *notifyqhead;
683 	aio_req_t *cleanupport;
684 	aio_req_t *portq = NULL;
685 	void (*func)();
686 	int signalled = 0;
687 	int qflag = 0;
688 	int exitflg;
689 
690 	ASSERT(aiop != NULL);
691 
692 	if (flag == AIO_CLEANUP_EXIT)
693 		exitflg = AIO_CLEANUP_EXIT;
694 	else
695 		exitflg = 0;
696 
697 	/*
698 	 * We need to get the aio_cleanupq_mutex because we are calling
699 	 * aio_cleanup_cleanupq()
700 	 */
701 	mutex_enter(&aiop->aio_cleanupq_mutex);
702 	/*
703 	 * take all the requests off the cleanupq, the notifyq,
704 	 * and the pollq.
705 	 */
706 	mutex_enter(&aiop->aio_mutex);
707 	if ((cleanupqhead = aiop->aio_cleanupq) != NULL) {
708 		aiop->aio_cleanupq = NULL;
709 		qflag++;
710 	}
711 	if ((notifyqhead = aiop->aio_notifyq) != NULL) {
712 		aiop->aio_notifyq = NULL;
713 		qflag++;
714 	}
715 	if ((pollqhead = aiop->aio_pollq) != NULL) {
716 		aiop->aio_pollq = NULL;
717 		qflag++;
718 	}
719 	if (flag) {
720 		if ((portq = aiop->aio_portq) != NULL)
721 			qflag++;
722 
723 		if ((cleanupport = aiop->aio_portcleanupq) != NULL) {
724 			aiop->aio_portcleanupq = NULL;
725 			qflag++;
726 		}
727 	}
728 	mutex_exit(&aiop->aio_mutex);
729 
730 	/*
731 	 * return immediately if cleanupq, pollq, and
732 	 * notifyq are all empty. someone else must have
733 	 * emptied them.
734 	 */
735 	if (!qflag) {
736 		mutex_exit(&aiop->aio_cleanupq_mutex);
737 		return;
738 	}
739 
740 	/*
741 	 * do cleanup for the various queues.
742 	 */
743 	if (cleanupqhead)
744 		signalled = aio_cleanup_cleanupq(aiop, cleanupqhead, exitflg);
745 	mutex_exit(&aiop->aio_cleanupq_mutex);
746 	if (notifyqhead)
747 		signalled = aio_cleanup_notifyq(aiop, notifyqhead, exitflg);
748 	if (pollqhead)
749 		aio_cleanup_pollq(aiop, pollqhead, exitflg);
750 	if (flag && (cleanupport || portq))
751 		aio_cleanup_portq(aiop, cleanupport, exitflg);
752 
753 	if (exitflg)
754 		return;
755 
756 	/*
757 	 * If we have an active aio_cleanup_thread it's possible for
758 	 * this routine to push something on to the done queue after
759 	 * an aiowait/aiosuspend thread has already decided to block.
760 	 * This being the case, we need a cv_broadcast here to wake
761 	 * these threads up. It is simpler and cleaner to do this
762 	 * broadcast here than in the individual cleanup routines.
763 	 */
764 
765 	mutex_enter(&aiop->aio_mutex);
766 	/*
767 	 * If there has never been an old solaris aio request
768 	 * issued by this process, then do not send a SIGIO signal.
769 	 */
770 	if (!(aiop->aio_flags & AIO_SOLARIS_REQ))
771 		signalled = 1;
772 	cv_broadcast(&aiop->aio_waitcv);
773 	mutex_exit(&aiop->aio_mutex);
774 
775 	/*
776 	 * Only if the process wasn't already signalled,
777 	 * determine if a SIGIO signal should be delievered.
778 	 */
779 	if (!signalled &&
780 	    (func = PTOU(curproc)->u_signal[SIGIO - 1]) != SIG_DFL &&
781 	    func != SIG_IGN)
782 		psignal(curproc, SIGIO);
783 }
784 
785 
/*
 * Do cleanup for every element of the port cleanup queue.
 *
 * Two queues are handled. First the process's port queue (aio_portq):
 * it is detached, its requests are unlocked and, unless exiting, it is
 * put back. Then the already detached port cleanup queue passed in as
 * "cleanupq": its requests are unlocked and either freed (exit) or moved
 * to the port queue with their port events sent.
 *
 *	aiop     - per-process aio state
 *	cleanupq - head of the detached port cleanup queue, may be NULL
 *	exitflag - non-zero when the requests are to be freed
 */
static void
aio_cleanup_portq(aio_t *aiop, aio_req_t *cleanupq, int exitflag)
{
	aio_req_t	*reqp;
	aio_req_t	*next;
	aio_req_t	*headp;
	aio_lio_t	*liop;

	/* first check the portq */
	if (exitflag || ((aiop->aio_flags & AIO_CLEANUP_PORT) == 0)) {
		/* note that the portq is being handled while AIO_CLEANUP is set */
		mutex_enter(&aiop->aio_mutex);
		if (aiop->aio_flags & AIO_CLEANUP)
			aiop->aio_flags |= AIO_CLEANUP_PORT;
		mutex_exit(&aiop->aio_mutex);

		/*
		 * It is not allowed to hold locks during aphysio_unlock().
		 * The aio_done() interrupt function will try to acquire
		 * aio_mutex and aio_portq_mutex.  Therefore we disconnect
		 * the portq list from the aiop for the duration of the
		 * aphysio_unlock() loop below.
		 */
		mutex_enter(&aiop->aio_portq_mutex);
		headp = aiop->aio_portq;
		aiop->aio_portq = NULL;
		mutex_exit(&aiop->aio_portq_mutex);
		if ((reqp = headp) != NULL) {
			do {
				/* fetch the successor before reqp may be freed */
				next = reqp->aio_req_next;
				aphysio_unlock(reqp);
				if (exitflag) {
					mutex_enter(&aiop->aio_mutex);
					aio_req_free(aiop, reqp);
					mutex_exit(&aiop->aio_mutex);
				}
			} while ((reqp = next) != headp);
		}

		if (headp != NULL && exitflag == 0) {
			/* move unlocked requests back to the port queue */
			aio_req_t *newq;

			mutex_enter(&aiop->aio_portq_mutex);
			if ((newq = aiop->aio_portq) != NULL) {
				/*
				 * requests arrived on the portq in the
				 * meantime; link the two circular lists
				 * together with headp's list in front.
				 */
				aio_req_t *headprev = headp->aio_req_prev;
				aio_req_t *newqprev = newq->aio_req_prev;

				headp->aio_req_prev = newqprev;
				newq->aio_req_prev = headprev;
				headprev->aio_req_next = newq;
				newqprev->aio_req_next = headp;
			}
			aiop->aio_portq = headp;
			/* wake threads waiting on aio_portcv for the portq */
			cv_broadcast(&aiop->aio_portcv);
			mutex_exit(&aiop->aio_portq_mutex);
		}
	}

	/* now check the port cleanup queue */
	if ((reqp = cleanupq) == NULL)
		return;
	do {
		/* fetch the successor before reqp is freed or requeued */
		next = reqp->aio_req_next;
		aphysio_unlock(reqp);
		if (exitflag) {
			mutex_enter(&aiop->aio_mutex);
			aio_req_free(aiop, reqp);
			mutex_exit(&aiop->aio_mutex);
		} else {
			/* make the request visible on the portq, then notify */
			mutex_enter(&aiop->aio_portq_mutex);
			aio_enq(&aiop->aio_portq, reqp, 0);
			mutex_exit(&aiop->aio_portq_mutex);
			port_send_event(reqp->aio_req_portkev);
			if ((liop = reqp->aio_req_lio) != NULL) {
				int send_event = 0;

				/*
				 * the list-level event is sent once, when
				 * the last entry of the list has completed.
				 */
				mutex_enter(&aiop->aio_mutex);
				ASSERT(liop->lio_refcnt > 0);
				if (--liop->lio_refcnt == 0) {
					if (liop->lio_port >= 0 &&
					    liop->lio_portkev) {
						liop->lio_port = -1;
						send_event = 1;
					}
				}
				mutex_exit(&aiop->aio_mutex);
				if (send_event)
					port_send_event(liop->lio_portkev);
			}
		}
	} while ((reqp = next) != cleanupq);
}
881 
882 /*
883  * Do cleanup for every element of the cleanupq.
884  */
885 static int
886 aio_cleanup_cleanupq(aio_t *aiop, aio_req_t *qhead, int exitflg)
887 {
888 	aio_req_t *reqp, *next;
889 	int signalled = 0;
890 
891 	ASSERT(MUTEX_HELD(&aiop->aio_cleanupq_mutex));
892 
893 	/*
894 	 * Since aio_req_done() or aio_req_find() use the HASH list to find
895 	 * the required requests, they could potentially take away elements
896 	 * if they are already done (AIO_DONEQ is set).
897 	 * The aio_cleanupq_mutex protects the queue for the duration of the
898 	 * loop from aio_req_done() and aio_req_find().
899 	 */
900 	if ((reqp = qhead) == NULL)
901 		return (0);
902 	do {
903 		ASSERT(reqp->aio_req_flags & AIO_CLEANUPQ);
904 		ASSERT(reqp->aio_req_portkev == NULL);
905 		next = reqp->aio_req_next;
906 		aphysio_unlock(reqp);
907 		mutex_enter(&aiop->aio_mutex);
908 		if (exitflg)
909 			aio_req_free(aiop, reqp);
910 		else
911 			aio_enq(&aiop->aio_doneq, reqp, AIO_DONEQ);
912 		if (!exitflg) {
913 			if (reqp->aio_req_flags & AIO_SIGNALLED)
914 				signalled++;
915 			else
916 				reqp->aio_req_flags |= AIO_SIGNALLED;
917 		}
918 		mutex_exit(&aiop->aio_mutex);
919 	} while ((reqp = next) != qhead);
920 	return (signalled);
921 }
922 
923 /*
924  * do cleanup for every element of the notify queue.
925  */
926 static int
927 aio_cleanup_notifyq(aio_t *aiop, aio_req_t *qhead, int exitflg)
928 {
929 	aio_req_t *reqp, *next;
930 	aio_lio_t *liohead;
931 	sigqueue_t *sigev, *lio_sigev = NULL;
932 	int signalled = 0;
933 
934 	if ((reqp = qhead) == NULL)
935 		return (0);
936 	do {
937 		ASSERT(reqp->aio_req_flags & AIO_NOTIFYQ);
938 		next = reqp->aio_req_next;
939 		aphysio_unlock(reqp);
940 		if (exitflg) {
941 			mutex_enter(&aiop->aio_mutex);
942 			aio_req_free(aiop, reqp);
943 			mutex_exit(&aiop->aio_mutex);
944 		} else {
945 			mutex_enter(&aiop->aio_mutex);
946 			aio_enq(&aiop->aio_doneq, reqp, AIO_DONEQ);
947 			sigev = reqp->aio_req_sigqp;
948 			reqp->aio_req_sigqp = NULL;
949 			if ((liohead = reqp->aio_req_lio) != NULL) {
950 				ASSERT(liohead->lio_refcnt > 0);
951 				if (--liohead->lio_refcnt == 0) {
952 					cv_signal(&liohead->lio_notify);
953 					lio_sigev = liohead->lio_sigqp;
954 					liohead->lio_sigqp = NULL;
955 				}
956 			}
957 			mutex_exit(&aiop->aio_mutex);
958 			if (sigev) {
959 				signalled++;
960 				aio_sigev_send(reqp->aio_req_buf.b_proc,
961 				    sigev);
962 			}
963 			if (lio_sigev) {
964 				signalled++;
965 				aio_sigev_send(reqp->aio_req_buf.b_proc,
966 				    lio_sigev);
967 			}
968 		}
969 	} while ((reqp = next) != qhead);
970 
971 	return (signalled);
972 }
973 
974 /*
975  * Do cleanup for every element of the poll queue.
976  */
977 static void
978 aio_cleanup_pollq(aio_t *aiop, aio_req_t *qhead, int exitflg)
979 {
980 	aio_req_t *reqp, *next;
981 
982 	/*
983 	 * As no other threads should be accessing the queue at this point,
984 	 * it isn't necessary to hold aio_mutex while we traverse its elements.
985 	 */
986 	if ((reqp = qhead) == NULL)
987 		return;
988 	do {
989 		ASSERT(reqp->aio_req_flags & AIO_POLLQ);
990 		next = reqp->aio_req_next;
991 		aphysio_unlock(reqp);
992 		if (exitflg) {
993 			mutex_enter(&aiop->aio_mutex);
994 			aio_req_free(aiop, reqp);
995 			mutex_exit(&aiop->aio_mutex);
996 		} else {
997 			aio_copyout_result(reqp);
998 			mutex_enter(&aiop->aio_mutex);
999 			aio_enq(&aiop->aio_doneq, reqp, AIO_DONEQ);
1000 			mutex_exit(&aiop->aio_mutex);
1001 		}
1002 	} while ((reqp = next) != qhead);
1003 }
1004 
/*
 * called by exit(). waits for all outstanding kaio to finish
 * before the kaio resources are freed.
 *
 * After the wait, the cleanup queues are drained with
 * aio_cleanup(AIO_CLEANUP_EXIT), the done queue, the request free list,
 * the list head free list and the iocb buffer are released, the three
 * aio mutexes are destroyed, and finally the aio_t itself is freed and
 * p_aio is cleared.
 */
void
aio_cleanup_exit(void)
{
	proc_t *p = curproc;
	aio_t *aiop = p->p_aio;
	aio_req_t *reqp, *next, *head;
	aio_lio_t *nxtlio, *liop;

	/*
	 * wait for all outstanding kaio to complete. process
	 * is now single-threaded; no other kaio requests can
	 * happen once aio_pending is zero.
	 */
	mutex_enter(&aiop->aio_mutex);
	/* with AIO_CLEANUP set, aio_done() signals aio_cleanupcv */
	aiop->aio_flags |= AIO_CLEANUP;
	while ((aiop->aio_pending != 0) || (aiop->aio_flags & AIO_DONE_ACTIVE))
		cv_wait(&aiop->aio_cleanupcv, &aiop->aio_mutex);
	mutex_exit(&aiop->aio_mutex);

	/* cleanup the cleanup-thread queues. */
	aio_cleanup(AIO_CLEANUP_EXIT);

	/*
	 * Although this process is now single-threaded, we
	 * still need to protect ourselves against a race with
	 * aio_cleanup_dr_delete_memory().
	 */
	mutex_enter(&p->p_lock);

	/*
	 * free up the done queue's resources.
	 *
	 * NOTE(review): requests are released with kmem_free() directly
	 * rather than through aio_req_free(); whether anything can still
	 * be attached to a doneq request at this point is not visible
	 * here - confirm.
	 */
	if ((head = aiop->aio_doneq) != NULL) {
		aiop->aio_doneq = NULL;
		reqp = head;
		do {
			/* fetch the successor before reqp is freed */
			next = reqp->aio_req_next;
			aphysio_unlock(reqp);
			kmem_free(reqp, sizeof (struct aio_req_t));
		} while ((reqp = next) != head);
	}
	/*
	 * release aio request freelist.
	 */
	for (reqp = aiop->aio_free; reqp != NULL; reqp = next) {
		next = reqp->aio_req_next;
		kmem_free(reqp, sizeof (struct aio_req_t));
	}

	/*
	 * release io list head freelist.
	 */
	for (liop = aiop->aio_lio_free; liop != NULL; liop = nxtlio) {
		nxtlio = liop->lio_next;
		kmem_free(liop, sizeof (aio_lio_t));
	}

	if (aiop->aio_iocb)
		kmem_free(aiop->aio_iocb, aiop->aio_iocbsz);

	/* tear down the locks and detach the aio_t from the process */
	mutex_destroy(&aiop->aio_mutex);
	mutex_destroy(&aiop->aio_portq_mutex);
	mutex_destroy(&aiop->aio_cleanupq_mutex);
	p->p_aio = NULL;
	mutex_exit(&p->p_lock);
	kmem_free(aiop, sizeof (struct aio));
}
1076 
1077 /*
1078  * copy out aio request's result to a user-level result_t buffer.
1079  */
1080 void
1081 aio_copyout_result(aio_req_t *reqp)
1082 {
1083 	struct buf	*bp;
1084 	struct iovec	*iov;
1085 	void		*resultp;
1086 	int		error;
1087 	size_t		retval;
1088 
1089 	if (reqp->aio_req_flags & AIO_COPYOUTDONE)
1090 		return;
1091 
1092 	reqp->aio_req_flags |= AIO_COPYOUTDONE;
1093 
1094 	iov = reqp->aio_req_uio.uio_iov;
1095 	bp = &reqp->aio_req_buf;
1096 	/* "resultp" points to user-level result_t buffer */
1097 	resultp = (void *)reqp->aio_req_resultp;
1098 	if (bp->b_flags & B_ERROR) {
1099 		if (bp->b_error)
1100 			error = bp->b_error;
1101 		else
1102 			error = EIO;
1103 		retval = (size_t)-1;
1104 	} else {
1105 		error = 0;
1106 		retval = iov->iov_len - bp->b_resid;
1107 	}
1108 #ifdef	_SYSCALL32_IMPL
1109 	if (get_udatamodel() == DATAMODEL_NATIVE) {
1110 		(void) sulword(&((aio_result_t *)resultp)->aio_return, retval);
1111 		(void) suword32(&((aio_result_t *)resultp)->aio_errno, error);
1112 	} else {
1113 		(void) suword32(&((aio_result32_t *)resultp)->aio_return,
1114 		    (int)retval);
1115 		(void) suword32(&((aio_result32_t *)resultp)->aio_errno, error);
1116 	}
1117 #else
1118 	(void) suword32(&((aio_result_t *)resultp)->aio_return, retval);
1119 	(void) suword32(&((aio_result_t *)resultp)->aio_errno, error);
1120 #endif
1121 }
1122 
1123 
1124 void
1125 aio_copyout_result_port(struct iovec *iov, struct buf *bp, void *resultp)
1126 {
1127 	int errno;
1128 	size_t retval;
1129 
1130 	if (bp->b_flags & B_ERROR) {
1131 		if (bp->b_error)
1132 			errno = bp->b_error;
1133 		else
1134 			errno = EIO;
1135 		retval = (size_t)-1;
1136 	} else {
1137 		errno = 0;
1138 		retval = iov->iov_len - bp->b_resid;
1139 	}
1140 #ifdef	_SYSCALL32_IMPL
1141 	if (get_udatamodel() == DATAMODEL_NATIVE) {
1142 		(void) sulword(&((aio_result_t *)resultp)->aio_return, retval);
1143 		(void) suword32(&((aio_result_t *)resultp)->aio_errno, errno);
1144 	} else {
1145 		(void) suword32(&((aio_result32_t *)resultp)->aio_return,
1146 		    (int)retval);
1147 		(void) suword32(&((aio_result32_t *)resultp)->aio_errno, errno);
1148 	}
1149 #else
1150 	(void) suword32(&((aio_result_t *)resultp)->aio_return, retval);
1151 	(void) suword32(&((aio_result_t *)resultp)->aio_errno, errno);
1152 #endif
1153 }
1154 
1155 /*
1156  * This function is used to remove a request from the done queue.
1157  */
1158 
1159 void
1160 aio_req_remove_portq(aio_t *aiop, aio_req_t *reqp)
1161 {
1162 	ASSERT(MUTEX_HELD(&aiop->aio_portq_mutex));
1163 	while (aiop->aio_portq == NULL) {
1164 		/*
1165 		 * aio_portq is set to NULL when aio_cleanup_portq()
1166 		 * is working with the event queue.
1167 		 * The aio_cleanup_thread() uses aio_cleanup_portq()
1168 		 * to unlock all AIO buffers with completed transactions.
1169 		 * Wait here until aio_cleanup_portq() restores the
1170 		 * list of completed transactions in aio_portq.
1171 		 */
1172 		cv_wait(&aiop->aio_portcv, &aiop->aio_portq_mutex);
1173 	}
1174 	aio_deq(&aiop->aio_portq, reqp);
1175 }
1176 
1177 /* ARGSUSED */
1178 void
1179 aio_close_port(void *arg, int port, pid_t pid, int lastclose)
1180 {
1181 	aio_t		*aiop;
1182 	aio_req_t 	*reqp;
1183 	aio_req_t 	*next;
1184 	aio_req_t	*headp;
1185 	int		counter;
1186 
1187 	if (arg == NULL)
1188 		aiop = curproc->p_aio;
1189 	else
1190 		aiop = (aio_t *)arg;
1191 
1192 	/*
1193 	 * The PORT_SOURCE_AIO source is always associated with every new
1194 	 * created port by default.
1195 	 * If no asynchronous I/O transactions were associated with the port
1196 	 * then the aiop pointer will still be set to NULL.
1197 	 */
1198 	if (aiop == NULL)
1199 		return;
1200 
1201 	/*
1202 	 * Within a process event ports can be used to collect events other
1203 	 * than PORT_SOURCE_AIO events. At the same time the process can submit
1204 	 * asynchronous I/Os transactions which are not associated with the
1205 	 * current port.
1206 	 * The current process oriented model of AIO uses a sigle queue for
1207 	 * pending events. On close the pending queue (queue of asynchronous
1208 	 * I/O transactions using event port notification) must be scanned
1209 	 * to detect and handle pending I/Os using the current port.
1210 	 */
1211 	mutex_enter(&aiop->aio_portq_mutex);
1212 	mutex_enter(&aiop->aio_mutex);
1213 	counter = 0;
1214 	if ((headp = aiop->aio_portpending) != NULL) {
1215 		reqp = headp;
1216 		do {
1217 			if (reqp->aio_req_portkev &&
1218 			    reqp->aio_req_port == port) {
1219 				reqp->aio_req_flags |= AIO_CLOSE_PORT;
1220 				counter++;
1221 			}
1222 		} while ((reqp = reqp->aio_req_next) != headp);
1223 	}
1224 	if (counter == 0) {
1225 		/* no AIOs pending */
1226 		mutex_exit(&aiop->aio_mutex);
1227 		mutex_exit(&aiop->aio_portq_mutex);
1228 		return;
1229 	}
1230 	aiop->aio_portpendcnt += counter;
1231 	mutex_exit(&aiop->aio_mutex);
1232 	while (aiop->aio_portpendcnt)
1233 		cv_wait(&aiop->aio_portcv, &aiop->aio_portq_mutex);
1234 
1235 	/*
1236 	 * all pending AIOs are completed.
1237 	 * check port doneq
1238 	 */
1239 	headp = NULL;
1240 	if ((reqp = aiop->aio_portq) != NULL) {
1241 		do {
1242 			next = reqp->aio_req_next;
1243 			if (reqp->aio_req_port == port) {
1244 				/* dequeue request and discard event */
1245 				aio_req_remove_portq(aiop, reqp);
1246 				port_free_event(reqp->aio_req_portkev);
1247 				/* put request in temporary queue */
1248 				reqp->aio_req_next = headp;
1249 				headp = reqp;
1250 			}
1251 		} while ((reqp = next) != aiop->aio_portq);
1252 	}
1253 	mutex_exit(&aiop->aio_portq_mutex);
1254 
1255 	/* headp points to the list of requests to be discarded */
1256 	for (reqp = headp; reqp != NULL; reqp = next) {
1257 		next = reqp->aio_req_next;
1258 		aphysio_unlock(reqp);
1259 		mutex_enter(&aiop->aio_mutex);
1260 		aio_req_free_port(aiop, reqp);
1261 		mutex_exit(&aiop->aio_mutex);
1262 	}
1263 
1264 	if (aiop->aio_flags & AIO_CLEANUP)
1265 		cv_broadcast(&aiop->aio_waitcv);
1266 }
1267 
1268 /*
1269  * aio_cleanup_dr_delete_memory is used by dr's delete_memory_thread
1270  * to kick start the aio_cleanup_thread for the give process to do the
1271  * necessary cleanup.
1272  * This is needed so that delete_memory_thread can obtain writer locks
1273  * on pages that need to be relocated during a dr memory delete operation,
1274  * otherwise a deadly embrace may occur.
1275  */
1276 int
1277 aio_cleanup_dr_delete_memory(proc_t *procp)
1278 {
1279 	struct aio *aiop = procp->p_aio;
1280 	struct as *as = procp->p_as;
1281 	int ret = 0;
1282 
1283 	ASSERT(MUTEX_HELD(&procp->p_lock));
1284 
1285 	mutex_enter(&as->a_contents);
1286 
1287 	if (aiop != NULL) {
1288 		aiop->aio_rqclnup = 1;
1289 		cv_broadcast(&as->a_cv);
1290 		ret = 1;
1291 	}
1292 	mutex_exit(&as->a_contents);
1293 	return (ret);
1294 }
1295