1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
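/*
 * Overview of the routines below (summary added for readability):
 * aphysio() issues an asynchronous physio-style transfer, aio_done()
 * is called from biodone() when a request completes, and the
 * aio_cleanup_*() routines later unlock the request's pages and
 * deliver the notifications (signals, event port events, list-I/O
 * wakeups) in the context of the issuing process.
 */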
27 #include <sys/types.h>
28 #include <sys/proc.h>
29 #include <sys/file.h>
30 #include <sys/errno.h>
31 #include <sys/param.h>
32 #include <sys/sysmacros.h>
33 #include <sys/cmn_err.h>
34 #include <sys/systm.h>
35 #include <vm/as.h>
36 #include <vm/page.h>
37 #include <sys/uio.h>
38 #include <sys/kmem.h>
39 #include <sys/debug.h>
40 #include <sys/aio_impl.h>
41 #include <sys/epm.h>
42 #include <sys/fs/snode.h>
43 #include <sys/siginfo.h>
44 #include <sys/cpuvar.h>
45 #include <sys/tnf_probe.h>
46 #include <sys/conf.h>
47 #include <sys/sdt.h>
48
49 int aphysio(int (*)(), int (*)(), dev_t, int, void (*)(), struct aio_req *);
50 void aio_done(struct buf *);
51 void aphysio_unlock(aio_req_t *);
52 void aio_cleanup(int);
53 void aio_cleanup_exit(void);
54
55 /*
56 * private functions
57 */
58 static void aio_sigev_send(proc_t *, sigqueue_t *);
59 static void aio_hash_delete(aio_t *, aio_req_t *);
60 static void aio_lio_free(aio_t *, aio_lio_t *);
61 static int aio_cleanup_cleanupq(aio_t *, aio_req_t *, int);
62 static int aio_cleanup_notifyq(aio_t *, aio_req_t *, int);
63 static void aio_cleanup_pollq(aio_t *, aio_req_t *, int);
64 static void aio_cleanup_portq(aio_t *, aio_req_t *, int);
65
66 /*
67 * async version of physio() that doesn't wait synchronously
68 * for the driver's strategy routine to complete.
69 */
70
71 int
72 aphysio(
73 int (*strategy)(struct buf *),
74 int (*cancel)(struct buf *),
75 dev_t dev,
76 int rw,
77 void (*mincnt)(struct buf *),
78 struct aio_req *aio)
79 {
80 struct uio *uio = aio->aio_uio;
81 aio_req_t *reqp = (aio_req_t *)aio->aio_private;
82 struct buf *bp = &reqp->aio_req_buf;
83 struct iovec *iov;
84 struct as *as;
85 char *a;
86 int error;
87 size_t c;
88 struct page **pplist;
89 struct dev_ops *ops = devopsp[getmajor(dev)];
90
91 if (uio->uio_loffset < 0)
92 return (EINVAL);
93 #ifdef _ILP32
94 /*
95 * For 32-bit kernels, check against SPEC_MAXOFFSET_T which represents
96 * the maximum size that can be supported by the IO subsystem.
97 * XXX this code assumes a D_64BIT driver.
98 */
99 if (uio->uio_loffset > SPEC_MAXOFFSET_T)
100 return (EINVAL);
101 #endif /* _ILP32 */
102
103 TNF_PROBE_5(aphysio_start, "kaio", /* CSTYLED */,
104 tnf_opaque, bp, bp,
105 tnf_device, device, dev,
106 tnf_offset, blkno, btodt(uio->uio_loffset),
107 tnf_size, size, uio->uio_iov->iov_len,
108 tnf_bioflags, rw, rw);
109
110 if (rw == B_READ) {
111 CPU_STATS_ADD_K(sys, phread, 1);
112 } else {
113 CPU_STATS_ADD_K(sys, phwrite, 1);
114 }
115
116 iov = uio->uio_iov;
117 sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
118 sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
119
120 bp->b_error = 0;
121 bp->b_flags = B_BUSY | B_PHYS | B_ASYNC | rw;
122 bp->b_edev = dev;
123 bp->b_dev = cmpdev(dev);
124 bp->b_lblkno = btodt(uio->uio_loffset);
125 bp->b_offset = uio->uio_loffset;
126 (void) ops->devo_getinfo(NULL, DDI_INFO_DEVT2DEVINFO,
127 (void *)bp->b_edev, (void **)&bp->b_dip);
128
129 /*
130 * Clustering: Clustering can set the b_iodone, b_forw and
131 * b_proc fields to cluster-specific values.
132 */
133 if (bp->b_iodone == NULL) {
134 bp->b_iodone = (int (*)()) aio_done;
135 /* b_forw points at an aio_req_t structure */
136 bp->b_forw = (struct buf *)reqp;
137 bp->b_proc = curproc;
138 }
139
140 a = bp->b_un.b_addr = iov->iov_base;
141 c = bp->b_bcount = iov->iov_len;
142
143 (*mincnt)(bp);
144 if (bp->b_bcount != iov->iov_len)
145 return (ENOTSUP);
146
147 as = bp->b_proc->p_as;
148
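/*
 * Lock the user pages backing the transfer; they stay locked until
 * aphysio_unlock() runs after the request has completed (tracked via
 * AIO_PAGELOCKDONE below).
 */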
149 error = as_pagelock(as, &pplist, a,
150 c, rw == B_READ? S_WRITE : S_READ);
151 if (error != 0) {
152 bp->b_flags |= B_ERROR;
153 bp->b_error = error;
154 bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
155 return (error);
156 }
157 reqp->aio_req_flags |= AIO_PAGELOCKDONE;
158 bp->b_shadow = pplist;
159 if (pplist != NULL) {
160 bp->b_flags |= B_SHADOW;
161 }
162
163 if (cancel != anocancel)
164 cmn_err(CE_PANIC,
165 "aphysio: cancellation not supported, use anocancel");
166
167 reqp->aio_req_cancel = cancel;
168
169 DTRACE_IO1(start, struct buf *, bp);
170
171 return ((*strategy)(bp));
172 }
173
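/*
 * Dummy cancel routine. aphysio() insists that callers pass
 * anocancel() (see the cmn_err() check above), so any attempt to
 * cancel an in-flight request simply fails with ENXIO.
 */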
174 /*ARGSUSED*/
175 int
176 anocancel(struct buf *bp)
177 {
178 return (ENXIO);
179 }
180
181 /*
182 * Called from biodone().
183 * Notify process that a pending AIO has finished.
184 */
185
186 /*
187 * Clustering: This function is made non-static as it is used
188 * by clustering s/w as a contract private interface.
189 */
190
191 void
192 aio_done(struct buf *bp)
193 {
194 proc_t *p;
195 struct as *as;
196 aio_req_t *reqp;
197 aio_lio_t *head = NULL;
198 aio_t *aiop;
199 sigqueue_t *sigev = NULL;
200 sigqueue_t *lio_sigev = NULL;
201 port_kevent_t *pkevp = NULL;
202 port_kevent_t *lio_pkevp = NULL;
203 int fd;
204 int cleanupqflag;
205 int pollqflag;
206 int portevpend;
207 void (*func)();
208 int use_port = 0;
209 int reqp_flags = 0;
210 int send_signal = 0;
211
212 p = bp->b_proc;
213 as = p->p_as;
214 reqp = (aio_req_t *)bp->b_forw;
215 fd = reqp->aio_req_fd;
216
217 TNF_PROBE_5(aphysio_end, "kaio", /* CSTYLED */,
218 tnf_opaque, bp, bp,
219 tnf_device, device, bp->b_edev,
220 tnf_offset, blkno, btodt(reqp->aio_req_uio.uio_loffset),
221 tnf_size, size, reqp->aio_req_uio.uio_iov->iov_len,
222 tnf_bioflags, rw, (bp->b_flags & (B_READ|B_WRITE)));
223
224 /*
225 * mapout earlier so that more kmem is available when aio is
226 * heavily used. bug #1262082
227 */
228 if (bp->b_flags & B_REMAPPED)
229 bp_mapout(bp);
230
231 /* decrement fd's ref count by one, now that aio request is done. */
232 areleasef(fd, P_FINFO(p));
233
234 aiop = p->p_aio;
235 ASSERT(aiop != NULL);
236
237 mutex_enter(&aiop->aio_portq_mutex);
238 mutex_enter(&aiop->aio_mutex);
239 ASSERT(aiop->aio_pending > 0);
240 ASSERT(reqp->aio_req_flags & AIO_PENDING);
241 aiop->aio_pending--;
242 reqp->aio_req_flags &= ~AIO_PENDING;
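/*
 * Snapshot the request flags here; reqp_flags is consulted for the
 * AIO_PAGELOCKDONE/unmap-wait check after the aio locks have been
 * dropped, at a point where reqp itself may no longer be safe to
 * dereference.
 */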
243 reqp_flags = reqp->aio_req_flags;
244 if ((pkevp = reqp->aio_req_portkev) != NULL) {
245 /* Event port notification is desired for this transaction */
246 if (reqp->aio_req_flags & AIO_CLOSE_PORT) {
247 /*
248 * The port is being closed and it is waiting for
249 * pending asynchronous I/O transactions to complete.
250 */
251 portevpend = --aiop->aio_portpendcnt;
252 aio_deq(&aiop->aio_portpending, reqp);
253 aio_enq(&aiop->aio_portq, reqp, 0);
254 mutex_exit(&aiop->aio_mutex);
255 mutex_exit(&aiop->aio_portq_mutex);
256 port_send_event(pkevp);
257 if (portevpend == 0)
258 cv_broadcast(&aiop->aio_portcv);
259 return;
260 }
261
262 if (aiop->aio_flags & AIO_CLEANUP) {
263 /*
264 * aio_cleanup_thread() is waiting for completion of
265 * transactions.
266 */
267 mutex_enter(&as->a_contents);
268 aio_deq(&aiop->aio_portpending, reqp);
269 aio_enq(&aiop->aio_portcleanupq, reqp, 0);
270 cv_signal(&aiop->aio_cleanupcv);
271 mutex_exit(&as->a_contents);
272 mutex_exit(&aiop->aio_mutex);
273 mutex_exit(&aiop->aio_portq_mutex);
274 return;
275 }
276
277 aio_deq(&aiop->aio_portpending, reqp);
278 aio_enq(&aiop->aio_portq, reqp, 0);
279
280 use_port = 1;
281 } else {
282 /*
283 * when the AIO_CLEANUP flag is enabled for this
284 * process, or when the AIO_POLL bit is set for
285 * this request, special handling is required.
286 * otherwise the request is put onto the doneq.
287 */
288 cleanupqflag = (aiop->aio_flags & AIO_CLEANUP);
289 pollqflag = (reqp->aio_req_flags & AIO_POLL);
290 if (cleanupqflag | pollqflag) {
291
292 if (cleanupqflag)
293 mutex_enter(&as->a_contents);
294
295 /*
296 * requests with their AIO_POLL bit set are put
297 * on the pollq, requests with sigevent structures
298 * or with listio heads are put on the notifyq, and
299 * the remaining requests don't require any special
300 * cleanup handling, so they're put onto the default
301 * cleanupq.
302 */
303 if (pollqflag)
304 aio_enq(&aiop->aio_pollq, reqp, AIO_POLLQ);
305 else if (reqp->aio_req_sigqp || reqp->aio_req_lio)
306 aio_enq(&aiop->aio_notifyq, reqp, AIO_NOTIFYQ);
307 else
308 aio_enq(&aiop->aio_cleanupq, reqp,
309 AIO_CLEANUPQ);
310
311 if (cleanupqflag) {
312 cv_signal(&aiop->aio_cleanupcv);
313 mutex_exit(&as->a_contents);
314 mutex_exit(&aiop->aio_mutex);
315 mutex_exit(&aiop->aio_portq_mutex);
316 } else {
317 ASSERT(pollqflag);
318 /* block aio_cleanup_exit until we're done */
319 aiop->aio_flags |= AIO_DONE_ACTIVE;
320 mutex_exit(&aiop->aio_mutex);
321 mutex_exit(&aiop->aio_portq_mutex);
322 /*
323 * let the cleanup processing happen from an AST
324 * set an AST on all threads in this process
325 */
326 mutex_enter(&p->p_lock);
327 set_proc_ast(p);
328 mutex_exit(&p->p_lock);
329 mutex_enter(&aiop->aio_mutex);
330 /* wakeup anybody waiting in aiowait() */
331 cv_broadcast(&aiop->aio_waitcv);
332
333 /* wakeup aio_cleanup_exit if needed */
334 if (aiop->aio_flags & AIO_CLEANUP)
335 cv_signal(&aiop->aio_cleanupcv);
336 aiop->aio_flags &= ~AIO_DONE_ACTIVE;
337 mutex_exit(&aiop->aio_mutex);
338 }
339 return;
340 }
341
342 /*
343 * save req's sigevent pointer, and check its
344 * value after releasing aio_mutex lock.
345 */
346 sigev = reqp->aio_req_sigqp;
347 reqp->aio_req_sigqp = NULL;
348
349 /* put request on done queue. */
350 aio_enq(&aiop->aio_doneq, reqp, AIO_DONEQ);
351 } /* portkevent */
352
353 /*
354 * when list IO notification is enabled, a notification or
355 * signal is sent only when all entries in the list are done.
356 */
357 if ((head = reqp->aio_req_lio) != NULL) {
358 ASSERT(head->lio_refcnt > 0);
359 if (--head->lio_refcnt == 0) {
360 /*
361 * save lio's sigevent pointer, and check
362 * its value after releasing aio_mutex lock.
363 */
364 lio_sigev = head->lio_sigqp;
365 head->lio_sigqp = NULL;
366 cv_signal(&head->lio_notify);
367 if (head->lio_port >= 0 &&
368 (lio_pkevp = head->lio_portkev) != NULL)
369 head->lio_port = -1;
370 }
371 }
372
373 /*
374 * If AIO_WAITN is set, wake up the waiter only when
375 * the required number of I/Os has finished, or when
376 * all I/Os are done; otherwise wake up the waiter
377 * unconditionally.
378 */
379 if (aiop->aio_flags & AIO_WAITN) {
380 if (aiop->aio_waitncnt > 0)
381 aiop->aio_waitncnt--;
382 if (aiop->aio_pending == 0 ||
383 aiop->aio_waitncnt == 0)
384 cv_broadcast(&aiop->aio_waitcv);
385 } else {
386 cv_broadcast(&aiop->aio_waitcv);
387 }
388
389 /*
390 * No need to set this flag for pollq, portq, lio requests.
391 * If this is an old Solaris aio request, and the process has
392 * a SIGIO signal handler enabled, then send a SIGIO signal.
393 */
394 if (!sigev && !use_port && head == NULL &&
395 (reqp->aio_req_flags & AIO_SOLARIS) &&
396 (func = PTOU(p)->u_signal[SIGIO - 1]) != SIG_DFL &&
397 (func != SIG_IGN)) {
398 send_signal = 1;
399 reqp->aio_req_flags |= AIO_SIGNALLED;
400 }
401
402 mutex_exit(&aiop->aio_mutex);
403 mutex_exit(&aiop->aio_portq_mutex);
404
405 /*
406 * Could the cleanup thread be waiting for AIO with locked
407 * resources to finish?
408 * Ideally in that case the cleanup thread should block on cleanupcv,
409 * but there is a window where it could fail to notice a new aio
410 * request that sneaked in.
411 */
412 mutex_enter(&as->a_contents);
413 if ((reqp_flags & AIO_PAGELOCKDONE) && AS_ISUNMAPWAIT(as))
414 cv_broadcast(&as->a_cv);
415 mutex_exit(&as->a_contents);
416
417 if (sigev)
418 aio_sigev_send(p, sigev);
419 else if (send_signal)
420 psignal(p, SIGIO);
421
422 if (pkevp)
423 port_send_event(pkevp);
424 if (lio_sigev)
425 aio_sigev_send(p, lio_sigev);
426 if (lio_pkevp)
427 port_send_event(lio_pkevp);
428 }
429
430 /*
431 * Send a queued signal to the specified process. The signal
432 * event must be non-NULL; callers only invoke this function
433 * when a sigevent (sigqueue_t) has been set up for the
434 * request, see the ASSERT below.
435 */
436 static void
437 aio_sigev_send(proc_t *p, sigqueue_t *sigev)
438 {
439 ASSERT(sigev != NULL);
440
441 mutex_enter(&p->p_lock);
442 sigaddqa(p, NULL, sigev);
443 mutex_exit(&p->p_lock);
444 }
445
446 /*
447 * special case handling for zero length requests. the aio request
448 * short circuits the normal completion path since all that's required
449 * to complete this request is to copyout a zero to the aio request's
450 * return value.
451 */
452 void
453 aio_zerolen(aio_req_t *reqp)
454 {
455
456 struct buf *bp = &reqp->aio_req_buf;
457
458 reqp->aio_req_flags |= AIO_ZEROLEN;
459
460 bp->b_forw = (struct buf *)reqp;
461 bp->b_proc = curproc;
462
463 bp->b_resid = 0;
464 bp->b_flags = 0;
465
466 aio_done(bp);
467 }
468
469 /*
470 * unlock pages previously locked by as_pagelock
471 */
472 void
473 aphysio_unlock(aio_req_t *reqp)
474 {
475 struct buf *bp;
476 struct iovec *iov;
477 int flags;
478
479 if (reqp->aio_req_flags & AIO_PHYSIODONE)
480 return;
481
482 reqp->aio_req_flags |= AIO_PHYSIODONE;
483
484 if (reqp->aio_req_flags & AIO_ZEROLEN)
485 return;
486
487 bp = &reqp->aio_req_buf;
488 iov = reqp->aio_req_uio.uio_iov;
489 flags = (((bp->b_flags & B_READ) == B_READ) ? S_WRITE : S_READ);
490 if (reqp->aio_req_flags & AIO_PAGELOCKDONE) {
491 as_pageunlock(bp->b_proc->p_as,
492 bp->b_flags & B_SHADOW ? bp->b_shadow : NULL,
493 iov->iov_base, iov->iov_len, flags);
494 reqp->aio_req_flags &= ~AIO_PAGELOCKDONE;
495 }
496 bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
497 bp->b_flags |= B_DONE;
498 }
499
500 /*
501 * deletes a request's id from the hash table of outstanding io.
502 */
503 static void
504 aio_hash_delete(aio_t *aiop, struct aio_req_t *reqp)
505 {
506 long index;
507 aio_result_t *resultp = reqp->aio_req_resultp;
508 aio_req_t *current;
509 aio_req_t **nextp;
510
511 index = AIO_HASH(resultp);
512 nextp = (aiop->aio_hash + index);
513 while ((current = *nextp) != NULL) {
514 if (current->aio_req_resultp == resultp) {
515 *nextp = current->aio_hash_next;
516 return;
517 }
518 nextp = &current->aio_hash_next;
519 }
520 }
521
522 /*
523 * Put a list head struct onto its free list.
524 */
525 static void
526 aio_lio_free(aio_t *aiop, aio_lio_t *head)
527 {
528 ASSERT(MUTEX_HELD(&aiop->aio_mutex));
529
530 if (head->lio_sigqp != NULL)
531 kmem_free(head->lio_sigqp, sizeof (sigqueue_t));
532 head->lio_next = aiop->aio_lio_free;
533 aiop->aio_lio_free = head;
534 }
535
536 /*
537 * Put a reqp onto the freelist.
538 */
539 void
540 aio_req_free(aio_t *aiop, aio_req_t *reqp)
541 {
542 aio_lio_t *liop;
543
544 ASSERT(MUTEX_HELD(&aiop->aio_mutex));
545
546 if (reqp->aio_req_portkev) {
547 port_free_event(reqp->aio_req_portkev);
548 reqp->aio_req_portkev = NULL;
549 }
550
551 if ((liop = reqp->aio_req_lio) != NULL) {
552 if (--liop->lio_nent == 0)
553 aio_lio_free(aiop, liop);
554 reqp->aio_req_lio = NULL;
555 }
556 if (reqp->aio_req_sigqp != NULL) {
557 kmem_free(reqp->aio_req_sigqp, sizeof (sigqueue_t));
558 reqp->aio_req_sigqp = NULL;
559 }
560 reqp->aio_req_next = aiop->aio_free;
561 reqp->aio_req_prev = NULL;
562 aiop->aio_free = reqp;
563 aiop->aio_outstanding--;
564 if (aiop->aio_outstanding == 0)
565 cv_broadcast(&aiop->aio_waitcv);
566 aio_hash_delete(aiop, reqp);
567 }
568
569 /*
570 * Put a reqp onto the freelist (event port variant of aio_req_free()).
571 */
572 void
573 aio_req_free_port(aio_t *aiop, aio_req_t *reqp)
574 {
575 ASSERT(MUTEX_HELD(&aiop->aio_mutex));
576
577 reqp->aio_req_next = aiop->aio_free;
578 reqp->aio_req_prev = NULL;
579 aiop->aio_free = reqp;
580 aiop->aio_outstanding--;
581 aio_hash_delete(aiop, reqp);
582 }
583
584
585 /*
586 * Verify the integrity of a queue.
587 */
588 #if defined(DEBUG)
589 static void
590 aio_verify_queue(aio_req_t *head,
591 aio_req_t *entry_present, aio_req_t *entry_missing)
592 {
593 aio_req_t *reqp;
594 int found = 0;
595 int present = 0;
596
597 if ((reqp = head) != NULL) {
598 do {
599 ASSERT(reqp->aio_req_prev->aio_req_next == reqp);
600 ASSERT(reqp->aio_req_next->aio_req_prev == reqp);
601 if (entry_present == reqp)
602 found++;
603 if (entry_missing == reqp)
604 present++;
605 } while ((reqp = reqp->aio_req_next) != head);
606 }
607 ASSERT(entry_present == NULL || found == 1);
608 ASSERT(entry_missing == NULL || present == 0);
609 }
610 #else
611 #define aio_verify_queue(x, y, z)
612 #endif
613
614 /*
615 * Put a request onto the tail of a queue.
616 */
617 void
618 aio_enq(aio_req_t **qhead, aio_req_t *reqp, int qflg_new)
619 {
620 aio_req_t *head;
621 aio_req_t *prev;
622
623 aio_verify_queue(*qhead, NULL, reqp);
624
625 if ((head = *qhead) == NULL) {
626 reqp->aio_req_next = reqp;
627 reqp->aio_req_prev = reqp;
628 *qhead = reqp;
629 } else {
630 reqp->aio_req_next = head;
631 reqp->aio_req_prev = prev = head->aio_req_prev;
632 prev->aio_req_next = reqp;
633 head->aio_req_prev = reqp;
634 }
635 reqp->aio_req_flags |= qflg_new;
636 }
637
638 /*
639 * Remove a request from its queue.
640 */
641 void
642 aio_deq(aio_req_t **qhead, aio_req_t *reqp)
643 {
644 aio_verify_queue(*qhead, reqp, NULL);
645
646 if (reqp->aio_req_next == reqp) {
647 *qhead = NULL;
648 } else {
649 reqp->aio_req_prev->aio_req_next = reqp->aio_req_next;
650 reqp->aio_req_next->aio_req_prev = reqp->aio_req_prev;
651 if (*qhead == reqp)
652 *qhead = reqp->aio_req_next;
653 }
654 reqp->aio_req_next = NULL;
655 reqp->aio_req_prev = NULL;
656 }
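/*
 * Note: every queue handled by aio_enq()/aio_deq() is a circular,
 * doubly-linked list. *qhead points at the first element and
 * (*qhead)->aio_req_prev is the tail, so tail insertion and arbitrary
 * removal are both O(1).
 */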
657
658 /*
659 * concatenate a specified queue with the cleanupq. the specified
660 * queue is put onto the tail of the cleanupq. all elements on the
661 * specified queue should have their aio_req_flags field cleared.
662 */
663 /*ARGSUSED*/
664 void
665 aio_cleanupq_concat(aio_t *aiop, aio_req_t *q2, int qflg)
666 {
667 aio_req_t *cleanupqhead, *q2tail;
668 aio_req_t *reqp = q2;
669
670 do {
671 ASSERT(reqp->aio_req_flags & qflg);
672 reqp->aio_req_flags &= ~qflg;
673 reqp->aio_req_flags |= AIO_CLEANUPQ;
674 } while ((reqp = reqp->aio_req_next) != q2);
675
676 cleanupqhead = aiop->aio_cleanupq;
677 if (cleanupqhead == NULL)
678 aiop->aio_cleanupq = q2;
679 else {
680 cleanupqhead->aio_req_prev->aio_req_next = q2;
681 q2tail = q2->aio_req_prev;
682 q2tail->aio_req_next = cleanupqhead;
683 q2->aio_req_prev = cleanupqhead->aio_req_prev;
684 cleanupqhead->aio_req_prev = q2tail;
685 }
686 }
687
688 /*
689 * cleanup aio requests that are on the per-process cleanup, notify, poll and port queues.
690 */
691 void
692 aio_cleanup(int flag)
693 {
694 aio_t *aiop = curproc->p_aio;
695 aio_req_t *pollqhead, *cleanupqhead, *notifyqhead;
696 aio_req_t *cleanupport;
697 aio_req_t *portq = NULL;
698 void (*func)();
699 int signalled = 0;
700 int qflag = 0;
701 int exitflg;
702
703 ASSERT(aiop != NULL);
704
705 if (flag == AIO_CLEANUP_EXIT)
706 exitflg = AIO_CLEANUP_EXIT;
707 else
708 exitflg = 0;
709
710 /*
711 * We need to get the aio_cleanupq_mutex because we are calling
712 * aio_cleanup_cleanupq()
713 */
714 mutex_enter(&aiop->aio_cleanupq_mutex);
715 /*
716 * take all the requests off the cleanupq, the notifyq,
717 * and the pollq.
718 */
719 mutex_enter(&aiop->aio_mutex);
720 if ((cleanupqhead = aiop->aio_cleanupq) != NULL) {
721 aiop->aio_cleanupq = NULL;
722 qflag++;
723 }
724 if ((notifyqhead = aiop->aio_notifyq) != NULL) {
725 aiop->aio_notifyq = NULL;
726 qflag++;
727 }
728 if ((pollqhead = aiop->aio_pollq) != NULL) {
729 aiop->aio_pollq = NULL;
730 qflag++;
731 }
732 if (flag) {
733 if ((portq = aiop->aio_portq) != NULL)
734 qflag++;
735
736 if ((cleanupport = aiop->aio_portcleanupq) != NULL) {
737 aiop->aio_portcleanupq = NULL;
738 qflag++;
739 }
740 }
741 mutex_exit(&aiop->aio_mutex);
742
743 /*
744 * return immediately if cleanupq, pollq, and
745 * notifyq are all empty. someone else must have
746 * emptied them.
747 */
748 if (!qflag) {
749 mutex_exit(&aiop->aio_cleanupq_mutex);
750 return;
751 }
752
753 /*
754 * do cleanup for the various queues.
755 */
756 if (cleanupqhead)
757 signalled = aio_cleanup_cleanupq(aiop, cleanupqhead, exitflg);
758 mutex_exit(&aiop->aio_cleanupq_mutex);
759 if (notifyqhead)
760 signalled = aio_cleanup_notifyq(aiop, notifyqhead, exitflg);
761 if (pollqhead)
762 aio_cleanup_pollq(aiop, pollqhead, exitflg);
763 if (flag && (cleanupport || portq))
764 aio_cleanup_portq(aiop, cleanupport, exitflg);
765
766 if (exitflg)
767 return;
768
769 /*
770 * If we have an active aio_cleanup_thread it's possible for
771 * this routine to push something on to the done queue after
772 * an aiowait/aiosuspend thread has already decided to block.
773 * This being the case, we need a cv_broadcast here to wake
774 * these threads up. It is simpler and cleaner to do this
775 * broadcast here than in the individual cleanup routines.
776 */
777
778 mutex_enter(&aiop->aio_mutex);
779 /*
780 * If there has never been an old solaris aio request
781 * issued by this process, then do not send a SIGIO signal.
782 */
783 if (!(aiop->aio_flags & AIO_SOLARIS_REQ))
784 signalled = 1;
785 cv_broadcast(&aiop->aio_waitcv);
786 mutex_exit(&aiop->aio_mutex);
787
788 /*
789 * Only if the process wasn't already signalled,
790 * determine if a SIGIO signal should be delivered.
791 */
792 if (!signalled &&
793 (func = PTOU(curproc)->u_signal[SIGIO - 1]) != SIG_DFL &&
794 func != SIG_IGN)
795 psignal(curproc, SIGIO);
796 }
797
798
799 /*
800 * Do cleanup for every element of the port cleanup queue.
801 */
802 static void
803 aio_cleanup_portq(aio_t *aiop, aio_req_t *cleanupq, int exitflag)
804 {
805 aio_req_t *reqp;
806 aio_req_t *next;
807 aio_req_t *headp;
808 aio_lio_t *liop;
809
810 /* first check the portq */
811 if (exitflag || ((aiop->aio_flags & AIO_CLEANUP_PORT) == 0)) {
812 mutex_enter(&aiop->aio_mutex);
813 if (aiop->aio_flags & AIO_CLEANUP)
814 aiop->aio_flags |= AIO_CLEANUP_PORT;
815 mutex_exit(&aiop->aio_mutex);
816
817 /*
818 * It is not allowed to hold locks during aphysio_unlock().
819 * The aio_done() interrupt function will try to acquire
820 * aio_mutex and aio_portq_mutex. Therefore we disconnect
821 * the portq list from the aiop for the duration of the
822 * aphysio_unlock() loop below.
823 */
824 mutex_enter(&aiop->aio_portq_mutex);
825 headp = aiop->aio_portq;
826 aiop->aio_portq = NULL;
827 mutex_exit(&aiop->aio_portq_mutex);
828 if ((reqp = headp) != NULL) {
829 do {
830 next = reqp->aio_req_next;
831 aphysio_unlock(reqp);
832 if (exitflag) {
833 mutex_enter(&aiop->aio_mutex);
834 aio_req_free(aiop, reqp);
835 mutex_exit(&aiop->aio_mutex);
836 }
837 } while ((reqp = next) != headp);
838 }
839
840 if (headp != NULL && exitflag == 0) {
841 /* move unlocked requests back to the port queue */
842 aio_req_t *newq;
843
844 mutex_enter(&aiop->aio_portq_mutex);
845 if ((newq = aiop->aio_portq) != NULL) {
846 aio_req_t *headprev = headp->aio_req_prev;
847 aio_req_t *newqprev = newq->aio_req_prev;
848
849 headp->aio_req_prev = newqprev;
850 newq->aio_req_prev = headprev;
851 headprev->aio_req_next = newq;
852 newqprev->aio_req_next = headp;
853 }
854 aiop->aio_portq = headp;
855 cv_broadcast(&aiop->aio_portcv);
856 mutex_exit(&aiop->aio_portq_mutex);
857 }
858 }
859
860 /* now check the port cleanup queue */
861 if ((reqp = cleanupq) == NULL)
862 return;
863 do {
864 next = reqp->aio_req_next;
865 aphysio_unlock(reqp);
866 if (exitflag) {
867 mutex_enter(&aiop->aio_mutex);
868 aio_req_free(aiop, reqp);
869 mutex_exit(&aiop->aio_mutex);
870 } else {
871 mutex_enter(&aiop->aio_portq_mutex);
872 aio_enq(&aiop->aio_portq, reqp, 0);
873 mutex_exit(&aiop->aio_portq_mutex);
874 port_send_event(reqp->aio_req_portkev);
875 if ((liop = reqp->aio_req_lio) != NULL) {
876 int send_event = 0;
877
878 mutex_enter(&aiop->aio_mutex);
879 ASSERT(liop->lio_refcnt > 0);
880 if (--liop->lio_refcnt == 0) {
881 if (liop->lio_port >= 0 &&
882 liop->lio_portkev) {
883 liop->lio_port = -1;
884 send_event = 1;
885 }
886 }
887 mutex_exit(&aiop->aio_mutex);
888 if (send_event)
889 port_send_event(liop->lio_portkev);
890 }
891 }
892 } while ((reqp = next) != cleanupq);
893 }
894
895 /*
896 * Do cleanup for every element of the cleanupq.
897 */
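/*
 * The return value counts the requests that had already been signalled
 * (AIO_SIGNALLED); aio_cleanup() uses it to decide whether a SIGIO
 * still needs to be posted.
 */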
898 static int
899 aio_cleanup_cleanupq(aio_t *aiop, aio_req_t *qhead, int exitflg)
900 {
901 aio_req_t *reqp, *next;
902 int signalled = 0;
903
904 ASSERT(MUTEX_HELD(&aiop->aio_cleanupq_mutex));
905
906 /*
907 * Since aio_req_done() or aio_req_find() use the HASH list to find
908 * the required requests, they could potentially take away elements
909 * if they are already done (AIO_DONEQ is set).
910 * The aio_cleanupq_mutex protects the queue for the duration of the
911 * loop from aio_req_done() and aio_req_find().
912 */
913 if ((reqp = qhead) == NULL)
914 return (0);
915 do {
916 ASSERT(reqp->aio_req_flags & AIO_CLEANUPQ);
917 ASSERT(reqp->aio_req_portkev == NULL);
918 next = reqp->aio_req_next;
919 aphysio_unlock(reqp);
920 mutex_enter(&aiop->aio_mutex);
921 if (exitflg)
922 aio_req_free(aiop, reqp);
923 else
924 aio_enq(&aiop->aio_doneq, reqp, AIO_DONEQ);
925 if (!exitflg) {
926 if (reqp->aio_req_flags & AIO_SIGNALLED)
927 signalled++;
928 else
929 reqp->aio_req_flags |= AIO_SIGNALLED;
930 }
931 mutex_exit(&aiop->aio_mutex);
932 } while ((reqp = next) != qhead);
933 return (signalled);
934 }
935
936 /*
937 * do cleanup for every element of the notify queue.
938 */
939 static int
940 aio_cleanup_notifyq(aio_t *aiop, aio_req_t *qhead, int exitflg)
941 {
942 aio_req_t *reqp, *next;
943 aio_lio_t *liohead;
944 sigqueue_t *sigev, *lio_sigev = NULL;
945 int signalled = 0;
946
947 if ((reqp = qhead) == NULL)
948 return (0);
949 do {
950 ASSERT(reqp->aio_req_flags & AIO_NOTIFYQ);
951 next = reqp->aio_req_next;
952 aphysio_unlock(reqp);
953 if (exitflg) {
954 mutex_enter(&aiop->aio_mutex);
955 aio_req_free(aiop, reqp);
956 mutex_exit(&aiop->aio_mutex);
957 } else {
958 mutex_enter(&aiop->aio_mutex);
959 aio_enq(&aiop->aio_doneq, reqp, AIO_DONEQ);
960 sigev = reqp->aio_req_sigqp;
961 reqp->aio_req_sigqp = NULL;
962 if ((liohead = reqp->aio_req_lio) != NULL) {
963 ASSERT(liohead->lio_refcnt > 0);
964 if (--liohead->lio_refcnt == 0) {
965 cv_signal(&liohead->lio_notify);
966 lio_sigev = liohead->lio_sigqp;
967 liohead->lio_sigqp = NULL;
968 }
969 }
970 mutex_exit(&aiop->aio_mutex);
971 if (sigev) {
972 signalled++;
973 aio_sigev_send(reqp->aio_req_buf.b_proc,
974 sigev);
975 }
976 if (lio_sigev) {
977 signalled++;
978 aio_sigev_send(reqp->aio_req_buf.b_proc,
979 lio_sigev);
980 }
981 }
982 } while ((reqp = next) != qhead);
983
984 return (signalled);
985 }
986
987 /*
988 * Do cleanup for every element of the poll queue.
989 */
990 static void
991 aio_cleanup_pollq(aio_t *aiop, aio_req_t *qhead, int exitflg)
992 {
993 aio_req_t *reqp, *next;
994
995 /*
996 * As no other threads should be accessing the queue at this point,
997 * it isn't necessary to hold aio_mutex while we traverse its elements.
998 */
999 if ((reqp = qhead) == NULL)
1000 return;
1001 do {
1002 ASSERT(reqp->aio_req_flags & AIO_POLLQ);
1003 next = reqp->aio_req_next;
1004 aphysio_unlock(reqp);
1005 if (exitflg) {
1006 mutex_enter(&aiop->aio_mutex);
1007 aio_req_free(aiop, reqp);
1008 mutex_exit(&aiop->aio_mutex);
1009 } else {
1010 aio_copyout_result(reqp);
1011 mutex_enter(&aiop->aio_mutex);
1012 aio_enq(&aiop->aio_doneq, reqp, AIO_DONEQ);
1013 mutex_exit(&aiop->aio_mutex);
1014 }
1015 } while ((reqp = next) != qhead);
1016 }
1017
1018 /*
1019 * called by exit(). waits for all outstanding kaio to finish
1020 * before the kaio resources are freed.
1021 */
1022 void
1023 aio_cleanup_exit(void)
1024 {
1025 proc_t *p = curproc;
1026 aio_t *aiop = p->p_aio;
1027 aio_req_t *reqp, *next, *head;
1028 aio_lio_t *nxtlio, *liop;
1029
1030 /*
1031 * wait for all outstanding kaio to complete. process
1032 * is now single-threaded; no other kaio requests can
1033 * happen once aio_pending is zero.
1034 */
1035 mutex_enter(&aiop->aio_mutex);
1036 aiop->aio_flags |= AIO_CLEANUP;
1037 while ((aiop->aio_pending != 0) || (aiop->aio_flags & AIO_DONE_ACTIVE))
1038 cv_wait(&aiop->aio_cleanupcv, &aiop->aio_mutex);
1039 mutex_exit(&aiop->aio_mutex);
1040
1041 /* cleanup the cleanup-thread queues. */
1042 aio_cleanup(AIO_CLEANUP_EXIT);
1043
1044 /*
1045 * Although this process is now single-threaded, we
1046 * still need to protect ourselves against a race with
1047 * aio_cleanup_dr_delete_memory().
1048 */
1049 mutex_enter(&p->p_lock);
1050
1051 /*
1052 * free up the done queue's resources.
1053 */
1054 if ((head = aiop->aio_doneq) != NULL) {
1055 aiop->aio_doneq = NULL;
1056 reqp = head;
1057 do {
1058 next = reqp->aio_req_next;
1059 aphysio_unlock(reqp);
1060 kmem_free(reqp, sizeof (struct aio_req_t));
1061 } while ((reqp = next) != head);
1062 }
1063 /*
1064 * release aio request freelist.
1065 */
1066 for (reqp = aiop->aio_free; reqp != NULL; reqp = next) {
1067 next = reqp->aio_req_next;
1068 kmem_free(reqp, sizeof (struct aio_req_t));
1069 }
1070
1071 /*
1072 * release io list head freelist.
1073 */
1074 for (liop = aiop->aio_lio_free; liop != NULL; liop = nxtlio) {
1075 nxtlio = liop->lio_next;
1076 kmem_free(liop, sizeof (aio_lio_t));
1077 }
1078
1079 if (aiop->aio_iocb)
1080 kmem_free(aiop->aio_iocb, aiop->aio_iocbsz);
1081
1082 mutex_destroy(&aiop->aio_mutex);
1083 mutex_destroy(&aiop->aio_portq_mutex);
1084 mutex_destroy(&aiop->aio_cleanupq_mutex);
1085 p->p_aio = NULL;
1086 mutex_exit(&p->p_lock);
1087 kmem_free(aiop, sizeof (struct aio));
1088 }
1089
1090 /*
1091 * copy out aio request's result to a user-level result_t buffer.
1092 */
1093 void
1094 aio_copyout_result(aio_req_t *reqp)
1095 {
1096 struct buf *bp;
1097 struct iovec *iov;
1098 void *resultp;
1099 int error;
1100 size_t retval;
1101
1102 if (reqp->aio_req_flags & AIO_COPYOUTDONE)
1103 return;
1104
1105 reqp->aio_req_flags |= AIO_COPYOUTDONE;
1106
1107 iov = reqp->aio_req_uio.uio_iov;
1108 bp = &reqp->aio_req_buf;
1109 /* "resultp" points to user-level result_t buffer */
1110 resultp = (void *)reqp->aio_req_resultp;
1111 if (bp->b_flags & B_ERROR) {
1112 if (bp->b_error)
1113 error = bp->b_error;
1114 else
1115 error = EIO;
1116 retval = (size_t)-1;
1117 } else {
1118 error = 0;
1119 retval = iov->iov_len - bp->b_resid;
1120 }
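/*
 * Copy the status out to the user-level aio_result_t. For a native
 * 64-bit caller the aio_return field is long-sized, hence the
 * sulword(); 32-bit callers (and kernels built without
 * _SYSCALL32_IMPL) use 32-bit stores.
 */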
1121 #ifdef _SYSCALL32_IMPL
1122 if (get_udatamodel() == DATAMODEL_NATIVE) {
1123 (void) sulword(&((aio_result_t *)resultp)->aio_return, retval);
1124 (void) suword32(&((aio_result_t *)resultp)->aio_errno, error);
1125 } else {
1126 (void) suword32(&((aio_result32_t *)resultp)->aio_return,
1127 (int)retval);
1128 (void) suword32(&((aio_result32_t *)resultp)->aio_errno, error);
1129 }
1130 #else
1131 (void) suword32(&((aio_result_t *)resultp)->aio_return, retval);
1132 (void) suword32(&((aio_result_t *)resultp)->aio_errno, error);
1133 #endif
1134 }
1135
1136
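/*
 * Variant of aio_copyout_result() that takes the iovec, buf and
 * user-level result pointer explicitly rather than an aio_req_t;
 * as the name suggests, it is used on the event port path.
 */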
1137 void
1138 aio_copyout_result_port(struct iovec *iov, struct buf *bp, void *resultp)
1139 {
1140 int errno;
1141 size_t retval;
1142
1143 if (bp->b_flags & B_ERROR) {
1144 if (bp->b_error)
1145 errno = bp->b_error;
1146 else
1147 errno = EIO;
1148 retval = (size_t)-1;
1149 } else {
1150 errno = 0;
1151 retval = iov->iov_len - bp->b_resid;
1152 }
1153 #ifdef _SYSCALL32_IMPL
1154 if (get_udatamodel() == DATAMODEL_NATIVE) {
1155 (void) sulword(&((aio_result_t *)resultp)->aio_return, retval);
1156 (void) suword32(&((aio_result_t *)resultp)->aio_errno, errno);
1157 } else {
1158 (void) suword32(&((aio_result32_t *)resultp)->aio_return,
1159 (int)retval);
1160 (void) suword32(&((aio_result32_t *)resultp)->aio_errno, errno);
1161 }
1162 #else
1163 (void) suword32(&((aio_result_t *)resultp)->aio_return, retval);
1164 (void) suword32(&((aio_result_t *)resultp)->aio_errno, errno);
1165 #endif
1166 }
1167
1168 /*
1169 * This function is used to remove a request from the port's done queue (aio_portq).
1170 */
1171
1172 void
1173 aio_req_remove_portq(aio_t *aiop, aio_req_t *reqp)
1174 {
1175 ASSERT(MUTEX_HELD(&aiop->aio_portq_mutex));
1176 while (aiop->aio_portq == NULL) {
1177 /*
1178 * aio_portq is set to NULL when aio_cleanup_portq()
1179 * is working with the event queue.
1180 * The aio_cleanup_thread() uses aio_cleanup_portq()
1181 * to unlock all AIO buffers with completed transactions.
1182 * Wait here until aio_cleanup_portq() restores the
1183 * list of completed transactions in aio_portq.
1184 */
1185 cv_wait(&aiop->aio_portcv, &aiop->aio_portq_mutex);
1186 }
1187 aio_deq(&aiop->aio_portq, reqp);
1188 }
1189
1190 /* ARGSUSED */
1191 void
1192 aio_close_port(void *arg, int port, pid_t pid, int lastclose)
1193 {
1194 aio_t *aiop;
1195 aio_req_t *reqp;
1196 aio_req_t *next;
1197 aio_req_t *headp;
1198 int counter;
1199
1200 if (arg == NULL)
1201 aiop = curproc->p_aio;
1202 else
1203 aiop = (aio_t *)arg;
1204
1205 /*
1206 * The PORT_SOURCE_AIO source is always associated with every newly
1207 * created port by default.
1208 * If no asynchronous I/O transactions were associated with the port
1209 * then the aiop pointer will still be set to NULL.
1210 */
1211 if (aiop == NULL)
1212 return;
1213
1214 /*
1215 * Within a process event ports can be used to collect events other
1216 * than PORT_SOURCE_AIO events. At the same time the process can submit
1217 * asynchronous I/O transactions which are not associated with the
1218 * current port.
1219 * The current process-oriented model of AIO uses a single queue for
1220 * pending events. On close the pending queue (queue of asynchronous
1221 * I/O transactions using event port notification) must be scanned
1222 * to detect and handle pending I/Os using the current port.
1223 */
1224 mutex_enter(&aiop->aio_portq_mutex);
1225 mutex_enter(&aiop->aio_mutex);
1226 counter = 0;
1227 if ((headp = aiop->aio_portpending) != NULL) {
1228 reqp = headp;
1229 do {
1230 if (reqp->aio_req_portkev &&
1231 reqp->aio_req_port == port) {
1232 reqp->aio_req_flags |= AIO_CLOSE_PORT;
1233 counter++;
1234 }
1235 } while ((reqp = reqp->aio_req_next) != headp);
1236 }
1237 if (counter == 0) {
1238 /* no AIOs pending */
1239 mutex_exit(&aiop->aio_mutex);
1240 mutex_exit(&aiop->aio_portq_mutex);
1241 return;
1242 }
1243 aiop->aio_portpendcnt += counter;
1244 mutex_exit(&aiop->aio_mutex);
1245 while (aiop->aio_portpendcnt)
1246 cv_wait(&aiop->aio_portcv, &aiop->aio_portq_mutex);
1247
1248 /*
1249 * all pending AIOs are completed.
1250 * check port doneq
1251 */
1252 headp = NULL;
1253 if ((reqp = aiop->aio_portq) != NULL) {
1254 do {
1255 next = reqp->aio_req_next;
1256 if (reqp->aio_req_port == port) {
1257 /* dequeue request and discard event */
1258 aio_req_remove_portq(aiop, reqp);
1259 port_free_event(reqp->aio_req_portkev);
1260 /* put request in temporary queue */
1261 reqp->aio_req_next = headp;
1262 headp = reqp;
1263 }
1264 } while ((reqp = next) != aiop->aio_portq);
1265 }
1266 mutex_exit(&aiop->aio_portq_mutex);
1267
1268 /* headp points to the list of requests to be discarded */
1269 for (reqp = headp; reqp != NULL; reqp = next) {
1270 next = reqp->aio_req_next;
1271 aphysio_unlock(reqp);
1272 mutex_enter(&aiop->aio_mutex);
1273 aio_req_free_port(aiop, reqp);
1274 mutex_exit(&aiop->aio_mutex);
1275 }
1276
1277 if (aiop->aio_flags & AIO_CLEANUP)
1278 cv_broadcast(&aiop->aio_waitcv);
1279 }
1280
1281 /*
1282 * aio_cleanup_dr_delete_memory is used by dr's delete_memory_thread
1283 * to kick-start the aio_cleanup_thread for the given process to do the
1284 * necessary cleanup.
1285 * This is needed so that delete_memory_thread can obtain writer locks
1286 * on pages that need to be relocated during a dr memory delete operation,
1287 * otherwise a deadly embrace may occur.
1288 */
1289 int
1290 aio_cleanup_dr_delete_memory(proc_t *procp)
1291 {
1292 struct aio *aiop = procp->p_aio;
1293 struct as *as = procp->p_as;
1294 int ret = 0;
1295
1296 ASSERT(MUTEX_HELD(&procp->p_lock));
1297
1298 mutex_enter(&as->a_contents);
1299
1300 if (aiop != NULL) {
1301 aiop->aio_rqclnup = 1;
1302 cv_broadcast(&as->a_cv);
1303 ret = 1;
1304 }
1305 mutex_exit(&as->a_contents);
1306 return (ret);
1307 }
1308