xref: /titanic_50/usr/src/uts/common/os/aio_subr.c (revision 1100f00d5652de2808b73c61bcfdb3fc87ef1fc8)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/proc.h>
31 #include <sys/file.h>
32 #include <sys/errno.h>
33 #include <sys/param.h>
34 #include <sys/sysmacros.h>
35 #include <sys/cmn_err.h>
36 #include <sys/systm.h>
37 #include <vm/as.h>
38 #include <vm/page.h>
39 #include <sys/uio.h>
40 #include <sys/kmem.h>
41 #include <sys/debug.h>
42 #include <sys/aio_impl.h>
43 #include <sys/epm.h>
44 #include <sys/fs/snode.h>
45 #include <sys/siginfo.h>
46 #include <sys/cpuvar.h>
47 #include <sys/tnf_probe.h>
48 #include <sys/conf.h>
49 #include <sys/sdt.h>
50 
/*
 * Forward declarations for the non-static routines defined in this file.
 */
int aphysio(int (*)(), int (*)(), dev_t, int, void (*)(), struct aio_req *);
void aio_done(struct buf *);
void aphysio_unlock(aio_req_t *);
void aio_cleanup(int);
void aio_cleanup_exit(void);

/*
 * private functions
 *
 * aio_sigev_send() queues a signal, aio_hash_delete() and aio_lio_free()
 * release bookkeeping for a request or list head, and the aio_cleanup_*()
 * routines each drain one of the per-process queues on behalf of
 * aio_cleanup().
 */
static void aio_sigev_send(proc_t *, sigqueue_t *);
static void aio_hash_delete(aio_t *, aio_req_t *);
static void aio_lio_free(aio_t *, aio_lio_t *);
static void aio_cleanup_cleanupq(aio_t *, aio_req_t *, int);
static int aio_cleanup_notifyq(aio_t *, aio_req_t *, int);
static void aio_cleanup_pollq(aio_t *, aio_req_t *, int);
static void aio_cleanup_portq(aio_t *, aio_req_t *, int);
67 
/*
 * async version of physio() that doesn't wait synchronously
 * for the driver's strategy routine to complete.
 *
 * strategy - driver strategy routine; it is called with the prepared buf
 *            and its return value is returned to the caller.
 * cancel   - must be anocancel, anything else panics; it is saved in the
 *            request's aio_req_cancel field.
 * dev      - device the transfer is directed at.
 * rw       - B_READ for a read; any other value is accounted as a write.
 * mincnt   - called on the buf once b_bcount has been set; if b_bcount no
 *            longer equals the iovec length afterwards, the request is
 *            rejected with ENOTSUP.
 * aio      - supplies the uio (aio_uio) and the aio_req_t (aio_private).
 *
 * Only the first iovec of the uio (uio->uio_iov) is used for the transfer.
 *
 * Returns EINVAL for an out-of-range offset, ENOTSUP as described above,
 * the as_pagelock() error when the user pages cannot be locked, and
 * otherwise whatever the strategy routine returns.
 */

int
aphysio(
	int (*strategy)(struct buf *),
	int (*cancel)(struct buf *),
	dev_t dev,
	int rw,
	void (*mincnt)(struct buf *),
	struct aio_req *aio)
{
	struct uio *uio = aio->aio_uio;
	aio_req_t *reqp = (aio_req_t *)aio->aio_private;
	struct buf *bp = &reqp->aio_req_buf;
	struct iovec *iov;
	struct as *as;
	char *a;
	int	error;
	size_t	c;
	struct page **pplist;
	struct dev_ops *ops = devopsp[getmajor(dev)];

	if (uio->uio_loffset < 0)
		return (EINVAL);
#ifdef	_ILP32
	/*
	 * For 32-bit kernels, check against SPEC_MAXOFFSET_T which represents
	 * the maximum size that can be supported by the IO subsystem.
	 * XXX this code assumes a D_64BIT driver.
	 */
	if (uio->uio_loffset > SPEC_MAXOFFSET_T)
		return (EINVAL);
#endif	/* _ILP32 */

	TNF_PROBE_5(aphysio_start, "kaio", /* CSTYLED */,
		tnf_opaque, bp, bp,
		tnf_device, device, dev,
		tnf_offset, blkno, btodt(uio->uio_loffset),
		tnf_size, size, uio->uio_iov->iov_len,
		tnf_bioflags, rw, rw);

	/* account the request in the per-CPU physical read/write counters */
	if (rw == B_READ) {
		CPU_STATS_ADD_K(sys, phread, 1);
	} else {
		CPU_STATS_ADD_K(sys, phwrite, 1);
	}

	iov = uio->uio_iov;
	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);

	/* fill in the buf embedded in the aio request */
	bp->b_error = 0;
	bp->b_flags = B_BUSY | B_PHYS | B_ASYNC | rw;
	bp->b_edev = dev;
	bp->b_dev = cmpdev(dev);
	bp->b_lblkno = btodt(uio->uio_loffset);
	bp->b_offset = uio->uio_loffset;
	(void) ops->devo_getinfo(NULL, DDI_INFO_DEVT2DEVINFO,
	    (void *)bp->b_edev, (void **)&bp->b_dip);

	/*
	 * Clustering: Clustering can set the b_iodone, b_forw and
	 * b_proc fields to cluster-specific values.
	 */
	if (bp->b_iodone == NULL) {
		bp->b_iodone = (int (*)()) aio_done;
		/* b_forw points at an aio_req_t structure */
		bp->b_forw = (struct buf *)reqp;
		bp->b_proc = curproc;
	}

	a = bp->b_un.b_addr = iov->iov_base;
	c = bp->b_bcount = iov->iov_len;

	/* a transfer that mincnt would trim cannot be issued as one request */
	(*mincnt)(bp);
	if (bp->b_bcount != iov->iov_len)
		return (ENOTSUP);

	as = bp->b_proc->p_as;

	/*
	 * Lock down the user buffer; the pages are locked for S_WRITE
	 * access on a read and for S_READ access on a write.
	 */
	error = as_pagelock(as, &pplist, a,
	    c, rw == B_READ? S_WRITE : S_READ);
	if (error != 0) {
		bp->b_flags |= B_ERROR;
		bp->b_error = error;
		bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
		return (error);
	}
	/* remember that aphysio_unlock() has pages to unlock */
	reqp->aio_req_flags |= AIO_PAGELOCKDONE;
	bp->b_shadow = pplist;
	if (pplist != NULL) {
		bp->b_flags |= B_SHADOW;
	}

	if (cancel != anocancel)
		cmn_err(CE_PANIC,
		    "aphysio: cancellation not supported, use anocancel");

	reqp->aio_req_cancel = cancel;

	DTRACE_IO1(start, struct buf *, bp);

	/* hand the buf to the driver without waiting for completion */
	return ((*strategy)(bp));
}
175 
/*
 * The only cancel routine aphysio() accepts. It ignores its argument
 * and always returns ENXIO.
 */
/*ARGSUSED*/
int
anocancel(struct buf *bp)
{
	return (ENXIO);
}
182 
/*
 * Called from biodone().
 * Notify process that a pending AIO has finished.
 *
 * The owning process is taken from bp->b_proc and the aio_req_t from
 * bp->b_forw. The request is moved to the queue that matches its
 * notification mode (event port, poll, notify, cleanup or done queue)
 * while aio_portq_mutex and aio_mutex are held; signals and port events
 * are sent only after both mutexes have been dropped.
 */

/*
 * Clustering: This function is made non-static as it is used
 * by clustering s/w as contract private interface.
 */

void
aio_done(struct buf *bp)
{
	proc_t *p;
	struct as *as;
	aio_req_t *reqp;
	aio_lio_t *head = NULL;
	aio_t *aiop;
	sigqueue_t *sigev = NULL;
	sigqueue_t *lio_sigev = NULL;
	port_kevent_t *pkevp = NULL;
	port_kevent_t *lio_pkevp = NULL;
	int fd;
	int cleanupqflag;
	int pollqflag;
	int portevpend;
	void (*func)();
	int use_port = 0;
	int reqp_flags = 0;

	p = bp->b_proc;
	as = p->p_as;
	reqp = (aio_req_t *)bp->b_forw;
	fd = reqp->aio_req_fd;

	TNF_PROBE_5(aphysio_end, "kaio", /* CSTYLED */,
		tnf_opaque, bp, bp,
		tnf_device, device, bp->b_edev,
		tnf_offset, blkno, btodt(reqp->aio_req_uio.uio_loffset),
		tnf_size, size, reqp->aio_req_uio.uio_iov->iov_len,
		tnf_bioflags, rw, (bp->b_flags & (B_READ|B_WRITE)));

	/*
	 * mapout earlier so that more kmem is available when aio is
	 * heavily used. bug #1262082
	 */
	if (bp->b_flags & B_REMAPPED)
		bp_mapout(bp);

	/* decrement fd's ref count by one, now that aio request is done. */
	areleasef(fd, P_FINFO(p));

	aiop = p->p_aio;
	ASSERT(aiop != NULL);

	/* aio_portq_mutex is always taken before aio_mutex here */
	mutex_enter(&aiop->aio_portq_mutex);
	mutex_enter(&aiop->aio_mutex);
	ASSERT(aiop->aio_pending > 0);
	ASSERT(reqp->aio_req_flags & AIO_PENDING);
	aiop->aio_pending--;
	reqp->aio_req_flags &= ~AIO_PENDING;
	/* snapshot the flags; they are consulted after the locks are dropped */
	reqp_flags = reqp->aio_req_flags;
	if ((pkevp = reqp->aio_req_portkev) != NULL) {
		/* Event port notification is desired for this transaction */
		if (reqp->aio_req_flags & AIO_CLOSE_PORT) {
			/*
			 * The port is being closed and it is waiting for
			 * pending asynchronous I/O transactions to complete.
			 */
			portevpend = --aiop->aio_portpendcnt;
			aio_deq(&aiop->aio_portpending, reqp);
			aio_enq(&aiop->aio_portq, reqp, 0);
			mutex_exit(&aiop->aio_mutex);
			mutex_exit(&aiop->aio_portq_mutex);
			port_send_event(pkevp);
			/* last pending port request: wake waiters on portcv */
			if (portevpend == 0)
				cv_broadcast(&aiop->aio_portcv);
			return;
		}

		if (aiop->aio_flags & AIO_CLEANUP) {
			/*
			 * aio_cleanup_thread() is waiting for completion of
			 * transactions.
			 */
			mutex_enter(&as->a_contents);
			aio_deq(&aiop->aio_portpending, reqp);
			aio_enq(&aiop->aio_portcleanupq, reqp, 0);
			cv_signal(&aiop->aio_cleanupcv);
			mutex_exit(&as->a_contents);
			mutex_exit(&aiop->aio_mutex);
			mutex_exit(&aiop->aio_portq_mutex);
			return;
		}

		/* normal case: move from the pending list to the port queue */
		aio_deq(&aiop->aio_portpending, reqp);
		aio_enq(&aiop->aio_portq, reqp, 0);

		use_port = 1;
	} else {
		/*
		 * when the AIO_CLEANUP flag is enabled for this
		 * process, or when the AIO_POLL bit is set for
		 * this request, special handling is required.
		 * otherwise the request is put onto the doneq.
		 */
		cleanupqflag = (aiop->aio_flags & AIO_CLEANUP);
		pollqflag = (reqp->aio_req_flags & AIO_POLL);
		if (cleanupqflag | pollqflag) {

			if (cleanupqflag)
				mutex_enter(&as->a_contents);

			/*
			 * requests with their AIO_POLL bit set are put
			 * on the pollq, requests with sigevent structures
			 * or with listio heads are put on the notifyq, and
			 * the remaining requests don't require any special
			 * cleanup handling, so they're put onto the default
			 * cleanupq.
			 */
			if (pollqflag)
				aio_enq(&aiop->aio_pollq, reqp, AIO_POLLQ);
			else if (reqp->aio_req_sigqp || reqp->aio_req_lio)
				aio_enq(&aiop->aio_notifyq, reqp, AIO_NOTIFYQ);
			else
				aio_enq(&aiop->aio_cleanupq, reqp,
				    AIO_CLEANUPQ);

			if (cleanupqflag) {
				cv_signal(&aiop->aio_cleanupcv);
				mutex_exit(&as->a_contents);
				mutex_exit(&aiop->aio_mutex);
				mutex_exit(&aiop->aio_portq_mutex);
			} else {
				ASSERT(pollqflag);
				/* block aio_cleanup_exit until we're done */
				aiop->aio_flags |= AIO_DONE_ACTIVE;
				mutex_exit(&aiop->aio_mutex);
				mutex_exit(&aiop->aio_portq_mutex);
				/*
				 * let the cleanup processing happen from an AST
				 * set an AST on all threads in this process
				 */
				mutex_enter(&p->p_lock);
				set_proc_ast(p);
				mutex_exit(&p->p_lock);
				mutex_enter(&aiop->aio_mutex);
				/* wakeup anybody waiting in aiowait() */
				cv_broadcast(&aiop->aio_waitcv);

				/* wakeup aio_cleanup_exit if needed */
				if (aiop->aio_flags & AIO_CLEANUP)
					cv_signal(&aiop->aio_cleanupcv);
				aiop->aio_flags &= ~AIO_DONE_ACTIVE;
				mutex_exit(&aiop->aio_mutex);
			}
			return;
		}

		/*
		 * save req's sigevent pointer, and check its
		 * value after releasing aio_mutex lock.
		 */
		sigev = reqp->aio_req_sigqp;
		reqp->aio_req_sigqp = NULL;

		/* put request on done queue. */
		aio_enq(&aiop->aio_doneq, reqp, AIO_DONEQ);
	} /* portkevent */

	/*
	 * when list IO notification is enabled, a notification or
	 * signal is sent only when all entries in the list are done.
	 */
	if ((head = reqp->aio_req_lio) != NULL) {
		ASSERT(head->lio_refcnt > 0);
		if (--head->lio_refcnt == 0) {
			/*
			 * save lio's sigevent pointer, and check
			 * its value after releasing aio_mutex lock.
			 */
			lio_sigev = head->lio_sigqp;
			head->lio_sigqp = NULL;
			cv_signal(&head->lio_notify);
			/*
			 * lio_port is reset to -1 once the list's port
			 * event has been claimed for sending below.
			 */
			if (head->lio_port >= 0 &&
			    (lio_pkevp = head->lio_portkev) != NULL)
				head->lio_port = -1;
		}
	}

	/*
	 * if AIO_WAITN set then
	 * send signal only when we reached the
	 * required amount of IO's finished
	 * or when all IO's are done
	 */
	if (aiop->aio_flags & AIO_WAITN) {
		if (aiop->aio_waitncnt > 0)
			aiop->aio_waitncnt--;
		if (aiop->aio_pending == 0 ||
		    aiop->aio_waitncnt == 0)
			cv_broadcast(&aiop->aio_waitcv);
	} else {
		cv_broadcast(&aiop->aio_waitcv);
	}

	mutex_exit(&aiop->aio_mutex);
	mutex_exit(&aiop->aio_portq_mutex);

	/*
	 * Could the cleanup thread be waiting for AIO with locked
	 * resources to finish?
	 * Ideally in that case cleanup thread should block on cleanupcv,
	 * but there is a window, where it could miss to see a new aio
	 * request that sneaked in.
	 */
	mutex_enter(&as->a_contents);
	if ((reqp_flags & AIO_PAGELOCKDONE) && AS_ISUNMAPWAIT(as))
		cv_broadcast(&as->a_cv);
	mutex_exit(&as->a_contents);

	/*
	 * Deliver the notifications that were collected while the
	 * mutexes were held.
	 */
	if (sigev)
		aio_sigev_send(p, sigev);
	else if (!use_port && head == NULL) {
		/*
		 * Send a SIGIO signal when the process has a handler enabled.
		 */
		if ((func = PTOU(p)->u_signal[SIGIO - 1]) != SIG_DFL &&
		    func != SIG_IGN)
			psignal(p, SIGIO);
	}
	if (pkevp)
		port_send_event(pkevp);
	if (lio_sigev)
		aio_sigev_send(p, lio_sigev);
	if (lio_pkevp)
		port_send_event(lio_pkevp);
}
422 
/*
 * queue the given sigqueue entry to the specified process.
 * the caller must pass a non-NULL sigev (asserted below); the
 * entry is handed to sigaddqa() while p->p_lock is held. nothing
 * is returned.
 */
static void
aio_sigev_send(proc_t *p, sigqueue_t *sigev)
{
	ASSERT(sigev != NULL);

	mutex_enter(&p->p_lock);
	sigaddqa(p, NULL, sigev);
	mutex_exit(&p->p_lock);
}
438 
439 /*
440  * special case handling for zero length requests. the aio request
441  * short circuits the normal completion path since all that's required
442  * to complete this request is to copyout a zero to the aio request's
443  * return value.
444  */
445 void
446 aio_zerolen(aio_req_t *reqp)
447 {
448 
449 	struct buf *bp = &reqp->aio_req_buf;
450 
451 	reqp->aio_req_flags |= AIO_ZEROLEN;
452 
453 	bp->b_forw = (struct buf *)reqp;
454 	bp->b_proc = curproc;
455 
456 	bp->b_resid = 0;
457 	bp->b_flags = 0;
458 
459 	aio_done(bp);
460 }
461 
462 /*
463  * unlock pages previously locked by as_pagelock
464  */
465 void
466 aphysio_unlock(aio_req_t *reqp)
467 {
468 	struct buf *bp;
469 	struct iovec *iov;
470 	int flags;
471 
472 	if (reqp->aio_req_flags & AIO_PHYSIODONE)
473 		return;
474 
475 	reqp->aio_req_flags |= AIO_PHYSIODONE;
476 
477 	if (reqp->aio_req_flags & AIO_ZEROLEN)
478 		return;
479 
480 	bp = &reqp->aio_req_buf;
481 	iov = reqp->aio_req_uio.uio_iov;
482 	flags = (((bp->b_flags & B_READ) == B_READ) ? S_WRITE : S_READ);
483 	if (reqp->aio_req_flags & AIO_PAGELOCKDONE) {
484 		as_pageunlock(bp->b_proc->p_as,
485 			bp->b_flags & B_SHADOW ? bp->b_shadow : NULL,
486 			iov->iov_base, iov->iov_len, flags);
487 		reqp->aio_req_flags &= ~AIO_PAGELOCKDONE;
488 	}
489 	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
490 	bp->b_flags |= B_DONE;
491 }
492 
493 /*
494  * deletes a requests id from the hash table of outstanding io.
495  */
496 static void
497 aio_hash_delete(aio_t *aiop, struct aio_req_t *reqp)
498 {
499 	long index;
500 	aio_result_t *resultp = reqp->aio_req_resultp;
501 	aio_req_t *current;
502 	aio_req_t **nextp;
503 
504 	index = AIO_HASH(resultp);
505 	nextp = (aiop->aio_hash + index);
506 	while ((current = *nextp) != NULL) {
507 		if (current->aio_req_resultp == resultp) {
508 			*nextp = current->aio_hash_next;
509 			return;
510 		}
511 		nextp = &current->aio_hash_next;
512 	}
513 }
514 
515 /*
516  * Put a list head struct onto its free list.
517  */
518 static void
519 aio_lio_free(aio_t *aiop, aio_lio_t *head)
520 {
521 	ASSERT(MUTEX_HELD(&aiop->aio_mutex));
522 
523 	if (head->lio_sigqp != NULL)
524 		kmem_free(head->lio_sigqp, sizeof (sigqueue_t));
525 	head->lio_next = aiop->aio_lio_free;
526 	aiop->aio_lio_free = head;
527 }
528 
529 /*
530  * Put a reqp onto the freelist.
531  */
532 void
533 aio_req_free(aio_t *aiop, aio_req_t *reqp)
534 {
535 	aio_lio_t *liop;
536 
537 	ASSERT(MUTEX_HELD(&aiop->aio_mutex));
538 
539 	if (reqp->aio_req_portkev) {
540 		port_free_event(reqp->aio_req_portkev);
541 		reqp->aio_req_portkev = NULL;
542 	}
543 
544 	if ((liop = reqp->aio_req_lio) != NULL) {
545 		if (--liop->lio_nent == 0)
546 			aio_lio_free(aiop, liop);
547 		reqp->aio_req_lio = NULL;
548 	}
549 	if (reqp->aio_req_sigqp != NULL) {
550 		kmem_free(reqp->aio_req_sigqp, sizeof (sigqueue_t));
551 		reqp->aio_req_sigqp = NULL;
552 	}
553 	reqp->aio_req_next = aiop->aio_free;
554 	reqp->aio_req_prev = NULL;
555 	aiop->aio_free = reqp;
556 	aiop->aio_outstanding--;
557 	if (aiop->aio_outstanding == 0)
558 		cv_broadcast(&aiop->aio_waitcv);
559 	aio_hash_delete(aiop, reqp);
560 }
561 
562 /*
563  * Put a reqp onto the freelist.
564  */
565 void
566 aio_req_free_port(aio_t *aiop, aio_req_t *reqp)
567 {
568 	ASSERT(MUTEX_HELD(&aiop->aio_mutex));
569 
570 	reqp->aio_req_next = aiop->aio_free;
571 	reqp->aio_req_prev = NULL;
572 	aiop->aio_free = reqp;
573 	aiop->aio_outstanding--;
574 	aio_hash_delete(aiop, reqp);
575 }
576 
577 
/*
 * Sanity check a circular request queue (DEBUG kernels only).
 * Every element must be correctly linked to both of its neighbours.
 * When entry_present is given it must be on the queue exactly once;
 * when entry_missing is given it must not be on the queue at all.
 */
#if defined(DEBUG)
static void
aio_verify_queue(aio_req_t *head,
	aio_req_t *entry_present, aio_req_t *entry_missing)
{
	aio_req_t *cur = head;
	int n_wanted = 0;
	int n_unwanted = 0;

	if (cur != NULL) {
		do {
			ASSERT(cur->aio_req_prev->aio_req_next == cur);
			ASSERT(cur->aio_req_next->aio_req_prev == cur);
			if (cur == entry_present)
				n_wanted++;
			if (cur == entry_missing)
				n_unwanted++;
			cur = cur->aio_req_next;
		} while (cur != head);
	}
	ASSERT(entry_present == NULL || n_wanted == 1);
	ASSERT(entry_missing == NULL || n_unwanted == 0);
}
#else
#define	aio_verify_queue(x, y, z)
#endif
606 
607 /*
608  * Put a request onto the tail of a queue.
609  */
610 void
611 aio_enq(aio_req_t **qhead, aio_req_t *reqp, int qflg_new)
612 {
613 	aio_req_t *head;
614 	aio_req_t *prev;
615 
616 	aio_verify_queue(*qhead, NULL, reqp);
617 
618 	if ((head = *qhead) == NULL) {
619 		reqp->aio_req_next = reqp;
620 		reqp->aio_req_prev = reqp;
621 		*qhead = reqp;
622 	} else {
623 		reqp->aio_req_next = head;
624 		reqp->aio_req_prev = prev = head->aio_req_prev;
625 		prev->aio_req_next = reqp;
626 		head->aio_req_prev = reqp;
627 	}
628 	reqp->aio_req_flags |= qflg_new;
629 }
630 
631 /*
632  * Remove a request from its queue.
633  */
634 void
635 aio_deq(aio_req_t **qhead, aio_req_t *reqp)
636 {
637 	aio_verify_queue(*qhead, reqp, NULL);
638 
639 	if (reqp->aio_req_next == reqp) {
640 		*qhead = NULL;
641 	} else {
642 		reqp->aio_req_prev->aio_req_next = reqp->aio_req_next;
643 		reqp->aio_req_next->aio_req_prev = reqp->aio_req_prev;
644 		if (*qhead == reqp)
645 			*qhead = reqp->aio_req_next;
646 	}
647 	reqp->aio_req_next = NULL;
648 	reqp->aio_req_prev = NULL;
649 }
650 
651 /*
652  * concatenate a specified queue with the cleanupq. the specified
653  * queue is put onto the tail of the cleanupq. all elements on the
654  * specified queue should have their aio_req_flags field cleared.
655  */
656 /*ARGSUSED*/
657 void
658 aio_cleanupq_concat(aio_t *aiop, aio_req_t *q2, int qflg)
659 {
660 	aio_req_t *cleanupqhead, *q2tail;
661 	aio_req_t *reqp = q2;
662 
663 	do {
664 		ASSERT(reqp->aio_req_flags & qflg);
665 		reqp->aio_req_flags &= ~qflg;
666 		reqp->aio_req_flags |= AIO_CLEANUPQ;
667 	} while ((reqp = reqp->aio_req_next) != q2);
668 
669 	cleanupqhead = aiop->aio_cleanupq;
670 	if (cleanupqhead == NULL)
671 		aiop->aio_cleanupq = q2;
672 	else {
673 		cleanupqhead->aio_req_prev->aio_req_next = q2;
674 		q2tail = q2->aio_req_prev;
675 		q2tail->aio_req_next = cleanupqhead;
676 		q2->aio_req_prev = cleanupqhead->aio_req_prev;
677 		cleanupqhead->aio_req_prev = q2tail;
678 	}
679 }
680 
681 /*
682  * cleanup aio requests that are on the per-process poll queue.
683  */
684 void
685 aio_cleanup(int flag)
686 {
687 	aio_t *aiop = curproc->p_aio;
688 	aio_req_t *pollqhead, *cleanupqhead, *notifyqhead;
689 	aio_req_t *cleanupport;
690 	aio_req_t *portq = NULL;
691 	void (*func)();
692 	int signalled = 0;
693 	int qflag = 0;
694 	int exitflg;
695 
696 	ASSERT(aiop != NULL);
697 
698 	if (flag == AIO_CLEANUP_EXIT)
699 		exitflg = AIO_CLEANUP_EXIT;
700 	else
701 		exitflg = 0;
702 
703 	/*
704 	 * We need to get the aio_cleanupq_mutex because we are calling
705 	 * aio_cleanup_cleanupq()
706 	 */
707 	mutex_enter(&aiop->aio_cleanupq_mutex);
708 	/*
709 	 * take all the requests off the cleanupq, the notifyq,
710 	 * and the pollq.
711 	 */
712 	mutex_enter(&aiop->aio_mutex);
713 	if ((cleanupqhead = aiop->aio_cleanupq) != NULL) {
714 		aiop->aio_cleanupq = NULL;
715 		qflag++;
716 	}
717 	if ((notifyqhead = aiop->aio_notifyq) != NULL) {
718 		aiop->aio_notifyq = NULL;
719 		qflag++;
720 	}
721 	if ((pollqhead = aiop->aio_pollq) != NULL) {
722 		aiop->aio_pollq = NULL;
723 		qflag++;
724 	}
725 	if (flag) {
726 		if ((portq = aiop->aio_portq) != NULL)
727 			qflag++;
728 
729 		if ((cleanupport = aiop->aio_portcleanupq) != NULL) {
730 			aiop->aio_portcleanupq = NULL;
731 			qflag++;
732 		}
733 	}
734 	mutex_exit(&aiop->aio_mutex);
735 
736 	/*
737 	 * return immediately if cleanupq, pollq, and
738 	 * notifyq are all empty. someone else must have
739 	 * emptied them.
740 	 */
741 	if (!qflag) {
742 		mutex_exit(&aiop->aio_cleanupq_mutex);
743 		return;
744 	}
745 
746 	/*
747 	 * do cleanup for the various queues.
748 	 */
749 	if (cleanupqhead)
750 		aio_cleanup_cleanupq(aiop, cleanupqhead, exitflg);
751 	mutex_exit(&aiop->aio_cleanupq_mutex);
752 	if (notifyqhead)
753 		signalled = aio_cleanup_notifyq(aiop, notifyqhead, exitflg);
754 	if (pollqhead)
755 		aio_cleanup_pollq(aiop, pollqhead, exitflg);
756 	if (flag && (cleanupport || portq))
757 		aio_cleanup_portq(aiop, cleanupport, exitflg);
758 
759 	if (exitflg)
760 		return;
761 
762 	/*
763 	 * If we have an active aio_cleanup_thread it's possible for
764 	 * this routine to push something on to the done queue after
765 	 * an aiowait/aiosuspend thread has already decided to block.
766 	 * This being the case, we need a cv_broadcast here to wake
767 	 * these threads up. It is simpler and cleaner to do this
768 	 * broadcast here than in the individual cleanup routines.
769 	 */
770 
771 	mutex_enter(&aiop->aio_mutex);
772 	cv_broadcast(&aiop->aio_waitcv);
773 	mutex_exit(&aiop->aio_mutex);
774 
775 	/*
776 	 * Only if the process wasn't already signalled,
777 	 * determine if a SIGIO signal should be delievered.
778 	 */
779 	if (!signalled &&
780 	    (func = PTOU(curproc)->u_signal[SIGIO - 1]) != SIG_DFL &&
781 	    func != SIG_IGN)
782 		psignal(curproc, SIGIO);
783 }
784 
785 
/*
 * Do cleanup for every element of the port cleanup queue.
 *
 * Two lists are processed. First, when exitflag is set or
 * AIO_CLEANUP_PORT is not yet set, the requests on aiop->aio_portq are
 * unlocked (and freed when exitflag is set). Then the requests on
 * cleanupq, which the caller has already detached, are unlocked and
 * either freed (exitflag) or moved to aio_portq with their port event
 * sent.
 */
static void
aio_cleanup_portq(aio_t *aiop, aio_req_t *cleanupq, int exitflag)
{
	aio_req_t	*reqp;
	aio_req_t	*next;
	aio_req_t	*headp;
	aio_lio_t	*liop;

	/* first check the portq */
	if (exitflag || ((aiop->aio_flags & AIO_CLEANUP_PORT) == 0)) {
		mutex_enter(&aiop->aio_mutex);
		if (aiop->aio_flags & AIO_CLEANUP)
			aiop->aio_flags |= AIO_CLEANUP_PORT;
		mutex_exit(&aiop->aio_mutex);

		/*
		 * It is not allowed to hold locks during aphysio_unlock().
		 * The aio_done() interrupt function will try to acquire
		 * aio_mutex and aio_portq_mutex.  Therefore we disconnect
		 * the portq list from the aiop for the duration of the
		 * aphysio_unlock() loop below.
		 */
		mutex_enter(&aiop->aio_portq_mutex);
		headp = aiop->aio_portq;
		aiop->aio_portq = NULL;
		mutex_exit(&aiop->aio_portq_mutex);
		if ((reqp = headp) != NULL) {
			do {
				/* fetch next first; freeing relinks reqp */
				next = reqp->aio_req_next;
				aphysio_unlock(reqp);
				if (exitflag) {
					mutex_enter(&aiop->aio_mutex);
					aio_req_free(aiop, reqp);
					mutex_exit(&aiop->aio_mutex);
				}
			} while ((reqp = next) != headp);
		}

		if (headp != NULL && exitflag == 0) {
			/* move unlocked requests back to the port queue */
			aio_req_t *newq;

			mutex_enter(&aiop->aio_portq_mutex);
			if ((newq = aiop->aio_portq) != NULL) {
				/*
				 * Requests arrived on aio_portq meanwhile;
				 * splice them in behind the detached list.
				 */
				aio_req_t *headprev = headp->aio_req_prev;
				aio_req_t *newqprev = newq->aio_req_prev;

				headp->aio_req_prev = newqprev;
				newq->aio_req_prev = headprev;
				headprev->aio_req_next = newq;
				newqprev->aio_req_next = headp;
			}
			aiop->aio_portq = headp;
			/* wake threads waiting for aio_portq to be restored */
			cv_broadcast(&aiop->aio_portcv);
			mutex_exit(&aiop->aio_portq_mutex);
		}
	}

	/* now check the port cleanup queue */
	if ((reqp = cleanupq) == NULL)
		return;
	do {
		next = reqp->aio_req_next;
		aphysio_unlock(reqp);
		if (exitflag) {
			mutex_enter(&aiop->aio_mutex);
			aio_req_free(aiop, reqp);
			mutex_exit(&aiop->aio_mutex);
		} else {
			mutex_enter(&aiop->aio_portq_mutex);
			aio_enq(&aiop->aio_portq, reqp, 0);
			mutex_exit(&aiop->aio_portq_mutex);
			port_send_event(reqp->aio_req_portkev);
			if ((liop = reqp->aio_req_lio) != NULL) {
				int send_event = 0;

				/*
				 * Drop this request's reference on its list
				 * head; the list's own port event is sent
				 * once the last reference is gone.
				 */
				mutex_enter(&aiop->aio_mutex);
				ASSERT(liop->lio_refcnt > 0);
				if (--liop->lio_refcnt == 0) {
					if (liop->lio_port >= 0 &&
					    liop->lio_portkev) {
						liop->lio_port = -1;
						send_event = 1;
					}
				}
				mutex_exit(&aiop->aio_mutex);
				if (send_event)
					port_send_event(liop->lio_portkev);
			}
		}
	} while ((reqp = next) != cleanupq);
}
881 
882 /*
883  * Do cleanup for every element of the cleanupq.
884  */
885 static void
886 aio_cleanup_cleanupq(aio_t *aiop, aio_req_t *qhead, int exitflg)
887 {
888 	aio_req_t *reqp, *next;
889 
890 	ASSERT(MUTEX_HELD(&aiop->aio_cleanupq_mutex));
891 
892 	/*
893 	 * Since aio_req_done() or aio_req_find() use the HASH list to find
894 	 * the required requests, they could potentially take away elements
895 	 * if they are already done (AIO_DONEQ is set).
896 	 * The aio_cleanupq_mutex protects the queue for the duration of the
897 	 * loop from aio_req_done() and aio_req_find().
898 	 */
899 	if ((reqp = qhead) == NULL)
900 		return;
901 	do {
902 		ASSERT(reqp->aio_req_flags & AIO_CLEANUPQ);
903 		ASSERT(reqp->aio_req_portkev == NULL);
904 		next = reqp->aio_req_next;
905 		aphysio_unlock(reqp);
906 		mutex_enter(&aiop->aio_mutex);
907 		if (exitflg)
908 			aio_req_free(aiop, reqp);
909 		else
910 			aio_enq(&aiop->aio_doneq, reqp, AIO_DONEQ);
911 		mutex_exit(&aiop->aio_mutex);
912 	} while ((reqp = next) != qhead);
913 }
914 
915 /*
916  * do cleanup for every element of the notify queue.
917  */
918 static int
919 aio_cleanup_notifyq(aio_t *aiop, aio_req_t *qhead, int exitflg)
920 {
921 	aio_req_t *reqp, *next;
922 	aio_lio_t *liohead;
923 	sigqueue_t *sigev, *lio_sigev = NULL;
924 	int signalled = 0;
925 
926 	if ((reqp = qhead) == NULL)
927 		return (0);
928 	do {
929 		ASSERT(reqp->aio_req_flags & AIO_NOTIFYQ);
930 		next = reqp->aio_req_next;
931 		aphysio_unlock(reqp);
932 		if (exitflg) {
933 			mutex_enter(&aiop->aio_mutex);
934 			aio_req_free(aiop, reqp);
935 			mutex_exit(&aiop->aio_mutex);
936 		} else {
937 			mutex_enter(&aiop->aio_mutex);
938 			aio_enq(&aiop->aio_doneq, reqp, AIO_DONEQ);
939 			sigev = reqp->aio_req_sigqp;
940 			reqp->aio_req_sigqp = NULL;
941 			if ((liohead = reqp->aio_req_lio) != NULL) {
942 				ASSERT(liohead->lio_refcnt > 0);
943 				if (--liohead->lio_refcnt == 0) {
944 					cv_signal(&liohead->lio_notify);
945 					lio_sigev = liohead->lio_sigqp;
946 					liohead->lio_sigqp = NULL;
947 				}
948 			}
949 			mutex_exit(&aiop->aio_mutex);
950 			if (sigev) {
951 				signalled++;
952 				aio_sigev_send(reqp->aio_req_buf.b_proc,
953 				    sigev);
954 			}
955 			if (lio_sigev) {
956 				signalled++;
957 				aio_sigev_send(reqp->aio_req_buf.b_proc,
958 				    lio_sigev);
959 			}
960 		}
961 	} while ((reqp = next) != qhead);
962 
963 	return (signalled);
964 }
965 
966 /*
967  * Do cleanup for every element of the poll queue.
968  */
969 static void
970 aio_cleanup_pollq(aio_t *aiop, aio_req_t *qhead, int exitflg)
971 {
972 	aio_req_t *reqp, *next;
973 
974 	/*
975 	 * As no other threads should be accessing the queue at this point,
976 	 * it isn't necessary to hold aio_mutex while we traverse its elements.
977 	 */
978 	if ((reqp = qhead) == NULL)
979 		return;
980 	do {
981 		ASSERT(reqp->aio_req_flags & AIO_POLLQ);
982 		next = reqp->aio_req_next;
983 		aphysio_unlock(reqp);
984 		if (exitflg) {
985 			mutex_enter(&aiop->aio_mutex);
986 			aio_req_free(aiop, reqp);
987 			mutex_exit(&aiop->aio_mutex);
988 		} else {
989 			aio_copyout_result(reqp);
990 			mutex_enter(&aiop->aio_mutex);
991 			aio_enq(&aiop->aio_doneq, reqp, AIO_DONEQ);
992 			mutex_exit(&aiop->aio_mutex);
993 		}
994 	} while ((reqp = next) != qhead);
995 }
996 
997 /*
998  * called by exit(). waits for all outstanding kaio to finish
999  * before the kaio resources are freed.
1000  */
1001 void
1002 aio_cleanup_exit(void)
1003 {
1004 	proc_t *p = curproc;
1005 	aio_t *aiop = p->p_aio;
1006 	aio_req_t *reqp, *next, *head;
1007 	aio_lio_t *nxtlio, *liop;
1008 
1009 	/*
1010 	 * wait for all outstanding kaio to complete. process
1011 	 * is now single-threaded; no other kaio requests can
1012 	 * happen once aio_pending is zero.
1013 	 */
1014 	mutex_enter(&aiop->aio_mutex);
1015 	aiop->aio_flags |= AIO_CLEANUP;
1016 	while ((aiop->aio_pending != 0) || (aiop->aio_flags & AIO_DONE_ACTIVE))
1017 		cv_wait(&aiop->aio_cleanupcv, &aiop->aio_mutex);
1018 	mutex_exit(&aiop->aio_mutex);
1019 
1020 	/* cleanup the cleanup-thread queues. */
1021 	aio_cleanup(AIO_CLEANUP_EXIT);
1022 
1023 	/*
1024 	 * Although this process is now single-threaded, we
1025 	 * still need to protect ourselves against a race with
1026 	 * aio_cleanup_dr_delete_memory().
1027 	 */
1028 	mutex_enter(&p->p_lock);
1029 
1030 	/*
1031 	 * free up the done queue's resources.
1032 	 */
1033 	if ((head = aiop->aio_doneq) != NULL) {
1034 		aiop->aio_doneq = NULL;
1035 		reqp = head;
1036 		do {
1037 			next = reqp->aio_req_next;
1038 			aphysio_unlock(reqp);
1039 			kmem_free(reqp, sizeof (struct aio_req_t));
1040 		} while ((reqp = next) != head);
1041 	}
1042 	/*
1043 	 * release aio request freelist.
1044 	 */
1045 	for (reqp = aiop->aio_free; reqp != NULL; reqp = next) {
1046 		next = reqp->aio_req_next;
1047 		kmem_free(reqp, sizeof (struct aio_req_t));
1048 	}
1049 
1050 	/*
1051 	 * release io list head freelist.
1052 	 */
1053 	for (liop = aiop->aio_lio_free; liop != NULL; liop = nxtlio) {
1054 		nxtlio = liop->lio_next;
1055 		kmem_free(liop, sizeof (aio_lio_t));
1056 	}
1057 
1058 	if (aiop->aio_iocb)
1059 		kmem_free(aiop->aio_iocb, aiop->aio_iocbsz);
1060 
1061 	mutex_destroy(&aiop->aio_mutex);
1062 	mutex_destroy(&aiop->aio_portq_mutex);
1063 	mutex_destroy(&aiop->aio_cleanupq_mutex);
1064 	p->p_aio = NULL;
1065 	mutex_exit(&p->p_lock);
1066 	kmem_free(aiop, sizeof (struct aio));
1067 }
1068 
1069 /*
1070  * copy out aio request's result to a user-level result_t buffer.
1071  */
1072 void
1073 aio_copyout_result(aio_req_t *reqp)
1074 {
1075 	struct buf	*bp;
1076 	struct iovec	*iov;
1077 	void		*resultp;
1078 	int		error;
1079 	size_t		retval;
1080 
1081 	if (reqp->aio_req_flags & AIO_COPYOUTDONE)
1082 		return;
1083 
1084 	reqp->aio_req_flags |= AIO_COPYOUTDONE;
1085 
1086 	iov = reqp->aio_req_uio.uio_iov;
1087 	bp = &reqp->aio_req_buf;
1088 	/* "resultp" points to user-level result_t buffer */
1089 	resultp = (void *)reqp->aio_req_resultp;
1090 	if (bp->b_flags & B_ERROR) {
1091 		if (bp->b_error)
1092 			error = bp->b_error;
1093 		else
1094 			error = EIO;
1095 		retval = (size_t)-1;
1096 	} else {
1097 		error = 0;
1098 		retval = iov->iov_len - bp->b_resid;
1099 	}
1100 #ifdef	_SYSCALL32_IMPL
1101 	if (get_udatamodel() == DATAMODEL_NATIVE) {
1102 		(void) sulword(&((aio_result_t *)resultp)->aio_return, retval);
1103 		(void) suword32(&((aio_result_t *)resultp)->aio_errno, error);
1104 	} else {
1105 		(void) suword32(&((aio_result32_t *)resultp)->aio_return,
1106 		    (int)retval);
1107 		(void) suword32(&((aio_result32_t *)resultp)->aio_errno, error);
1108 	}
1109 #else
1110 	(void) suword32(&((aio_result_t *)resultp)->aio_return, retval);
1111 	(void) suword32(&((aio_result_t *)resultp)->aio_errno, error);
1112 #endif
1113 }
1114 
1115 
1116 void
1117 aio_copyout_result_port(struct iovec *iov, struct buf *bp, void *resultp)
1118 {
1119 	int errno;
1120 	size_t retval;
1121 
1122 	if (bp->b_flags & B_ERROR) {
1123 		if (bp->b_error)
1124 			errno = bp->b_error;
1125 		else
1126 			errno = EIO;
1127 		retval = (size_t)-1;
1128 	} else {
1129 		errno = 0;
1130 		retval = iov->iov_len - bp->b_resid;
1131 	}
1132 #ifdef	_SYSCALL32_IMPL
1133 	if (get_udatamodel() == DATAMODEL_NATIVE) {
1134 		(void) sulword(&((aio_result_t *)resultp)->aio_return, retval);
1135 		(void) suword32(&((aio_result_t *)resultp)->aio_errno, errno);
1136 	} else {
1137 		(void) suword32(&((aio_result32_t *)resultp)->aio_return,
1138 		    (int)retval);
1139 		(void) suword32(&((aio_result32_t *)resultp)->aio_errno, errno);
1140 	}
1141 #else
1142 	(void) suword32(&((aio_result_t *)resultp)->aio_return, retval);
1143 	(void) suword32(&((aio_result_t *)resultp)->aio_errno, errno);
1144 #endif
1145 }
1146 
1147 /*
1148  * This function is used to remove a request from the done queue.
1149  */
1150 
1151 void
1152 aio_req_remove_portq(aio_t *aiop, aio_req_t *reqp)
1153 {
1154 	ASSERT(MUTEX_HELD(&aiop->aio_portq_mutex));
1155 	while (aiop->aio_portq == NULL) {
1156 		/*
1157 		 * aio_portq is set to NULL when aio_cleanup_portq()
1158 		 * is working with the event queue.
1159 		 * The aio_cleanup_thread() uses aio_cleanup_portq()
1160 		 * to unlock all AIO buffers with completed transactions.
1161 		 * Wait here until aio_cleanup_portq() restores the
1162 		 * list of completed transactions in aio_portq.
1163 		 */
1164 		cv_wait(&aiop->aio_portcv, &aiop->aio_portq_mutex);
1165 	}
1166 	aio_deq(&aiop->aio_portq, reqp);
1167 }
1168 
1169 /* ARGSUSED */
1170 void
1171 aio_close_port(void *arg, int port, pid_t pid, int lastclose)
1172 {
1173 	aio_t		*aiop;
1174 	aio_req_t 	*reqp;
1175 	aio_req_t 	*next;
1176 	aio_req_t	*headp;
1177 	int		counter;
1178 
1179 	if (arg == NULL)
1180 		aiop = curproc->p_aio;
1181 	else
1182 		aiop = (aio_t *)arg;
1183 
1184 	/*
1185 	 * The PORT_SOURCE_AIO source is always associated with every new
1186 	 * created port by default.
1187 	 * If no asynchronous I/O transactions were associated with the port
1188 	 * then the aiop pointer will still be set to NULL.
1189 	 */
1190 	if (aiop == NULL)
1191 		return;
1192 
1193 	/*
1194 	 * Within a process event ports can be used to collect events other
1195 	 * than PORT_SOURCE_AIO events. At the same time the process can submit
1196 	 * asynchronous I/Os transactions which are not associated with the
1197 	 * current port.
1198 	 * The current process oriented model of AIO uses a sigle queue for
1199 	 * pending events. On close the pending queue (queue of asynchronous
1200 	 * I/O transactions using event port notification) must be scanned
1201 	 * to detect and handle pending I/Os using the current port.
1202 	 */
1203 	mutex_enter(&aiop->aio_portq_mutex);
1204 	mutex_enter(&aiop->aio_mutex);
1205 	counter = 0;
1206 	if ((headp = aiop->aio_portpending) != NULL) {
1207 		reqp = headp;
1208 		do {
1209 			if (reqp->aio_req_portkev &&
1210 			    reqp->aio_req_port == port) {
1211 				reqp->aio_req_flags |= AIO_CLOSE_PORT;
1212 				counter++;
1213 			}
1214 		} while ((reqp = reqp->aio_req_next) != headp);
1215 	}
1216 	if (counter == 0) {
1217 		/* no AIOs pending */
1218 		mutex_exit(&aiop->aio_mutex);
1219 		mutex_exit(&aiop->aio_portq_mutex);
1220 		return;
1221 	}
1222 	aiop->aio_portpendcnt += counter;
1223 	mutex_exit(&aiop->aio_mutex);
1224 	while (aiop->aio_portpendcnt)
1225 		cv_wait(&aiop->aio_portcv, &aiop->aio_portq_mutex);
1226 
1227 	/*
1228 	 * all pending AIOs are completed.
1229 	 * check port doneq
1230 	 */
1231 	headp = NULL;
1232 	if ((reqp = aiop->aio_portq) != NULL) {
1233 		do {
1234 			next = reqp->aio_req_next;
1235 			if (reqp->aio_req_port == port) {
1236 				/* dequeue request and discard event */
1237 				aio_req_remove_portq(aiop, reqp);
1238 				port_free_event(reqp->aio_req_portkev);
1239 				/* put request in temporary queue */
1240 				reqp->aio_req_next = headp;
1241 				headp = reqp;
1242 			}
1243 		} while ((reqp = next) != aiop->aio_portq);
1244 	}
1245 	mutex_exit(&aiop->aio_portq_mutex);
1246 
1247 	/* headp points to the list of requests to be discarded */
1248 	for (reqp = headp; reqp != NULL; reqp = next) {
1249 		next = reqp->aio_req_next;
1250 		aphysio_unlock(reqp);
1251 		mutex_enter(&aiop->aio_mutex);
1252 		aio_req_free_port(aiop, reqp);
1253 		mutex_exit(&aiop->aio_mutex);
1254 	}
1255 
1256 	if (aiop->aio_flags & AIO_CLEANUP)
1257 		cv_broadcast(&aiop->aio_waitcv);
1258 }
1259 
1260 /*
1261  * aio_cleanup_dr_delete_memory is used by dr's delete_memory_thread
1262  * to kick start the aio_cleanup_thread for the give process to do the
1263  * necessary cleanup.
1264  * This is needed so that delete_memory_thread can obtain writer locks
1265  * on pages that need to be relocated during a dr memory delete operation,
1266  * otherwise a deadly embrace may occur.
1267  */
1268 int
1269 aio_cleanup_dr_delete_memory(proc_t *procp)
1270 {
1271 	struct aio *aiop = procp->p_aio;
1272 	struct as *as = procp->p_as;
1273 	int ret = 0;
1274 
1275 	ASSERT(MUTEX_HELD(&procp->p_lock));
1276 
1277 	mutex_enter(&as->a_contents);
1278 
1279 	if (aiop != NULL) {
1280 		aiop->aio_rqclnup = 1;
1281 		cv_broadcast(&as->a_cv);
1282 		ret = 1;
1283 	}
1284 	mutex_exit(&as->a_contents);
1285 	return (ret);
1286 }
1287