/*
 * Copyright (c) 1997 John S. Dyson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. John S. Dyson's name may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * DISCLAIMER: This code isn't warranted to do anything useful.  Anything
 * bad that happens because of using this software isn't the responsibility
 * of the author.  This software is distributed AS-IS.
 *
 * $FreeBSD$
 */

/*
 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/unistd.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/protosw.h>
#include <sys/socketvar.h>
#include <sys/syscall.h>
#include <sys/sysent.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <sys/event.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_zone.h>
#include <sys/aio.h>

#include <machine/limits.h>

#include "opt_vfs_aio.h"

static long jobrefid;

#define JOBST_NULL              0x0
#define JOBST_JOBQGLOBAL        0x2
#define JOBST_JOBRUNNING        0x3
#define JOBST_JOBFINISHED       0x4
#define JOBST_JOBQBUF           0x5
#define JOBST_JOBBFINISHED      0x6

#ifndef MAX_AIO_PER_PROC
#define MAX_AIO_PER_PROC        32
#endif

#ifndef MAX_AIO_QUEUE_PER_PROC
#define MAX_AIO_QUEUE_PER_PROC  256 /* Bigger than AIO_LISTIO_MAX */
#endif

#ifndef MAX_AIO_PROCS
#define MAX_AIO_PROCS           32
#endif

#ifndef MAX_AIO_QUEUE
#define MAX_AIO_QUEUE           1024 /* Bigger than AIO_LISTIO_MAX */
#endif

#ifndef TARGET_AIO_PROCS
#define TARGET_AIO_PROCS        4
#endif

#ifndef MAX_BUF_AIO
#define MAX_BUF_AIO             16
#endif

#ifndef AIOD_TIMEOUT_DEFAULT
#define AIOD_TIMEOUT_DEFAULT    (10 * hz)
#endif

#ifndef AIOD_LIFETIME_DEFAULT
#define AIOD_LIFETIME_DEFAULT   (30 * hz)
#endif

static int max_aio_procs = MAX_AIO_PROCS;
static int num_aio_procs = 0;
static int target_aio_procs = TARGET_AIO_PROCS;
static int max_queue_count = MAX_AIO_QUEUE;
static int num_queue_count = 0;
static int num_buf_aio = 0;
static int num_aio_resv_start = 0;
static int aiod_timeout;
static int aiod_lifetime;
static int unloadable = 0;

static int max_aio_per_proc = MAX_AIO_PER_PROC;
static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
static int max_buf_aio = MAX_BUF_AIO;

SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "AIO mgmt");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc,
    CTLFLAG_RW, &max_aio_per_proc, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc,
    CTLFLAG_RW, &max_aio_queue_per_proc, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs,
    CTLFLAG_RW, &max_aio_procs, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs,
    CTLFLAG_RD, &num_aio_procs, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count,
    CTLFLAG_RD, &num_queue_count, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue,
    CTLFLAG_RW, &max_queue_count, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs,
    CTLFLAG_RW, &target_aio_procs, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio,
    CTLFLAG_RW, &max_buf_aio, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio,
    CTLFLAG_RD, &num_buf_aio, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime,
    CTLFLAG_RW, &aiod_lifetime, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout,
    CTLFLAG_RW, &aiod_timeout, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, unloadable, CTLFLAG_RW, &unloadable, 0,
    "Allow unload of aio (not recommended)");
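/*
 * The knobs above surface as the vfs.aio sysctl tree.  An illustrative
 * (not prescriptive) tuning session from userland might look like:
 *
 *      # sysctl vfs.aio.num_queue_count            (read-only counter)
 *      # sysctl vfs.aio.max_aio_per_proc=64        (raise a per-process limit)
 *      # sysctl vfs.aio.max_buf_aio=32             (raise the physio limit)
 *
 * The CTLFLAG_RD entries are counters maintained by this file; the
 * CTLFLAG_RW entries feed the per-process limits set up in
 * aio_init_aioinfo() below.
 */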
struct aiocblist {
        TAILQ_ENTRY(aiocblist) list;    /* List of jobs */
        TAILQ_ENTRY(aiocblist) plist;   /* List of jobs for proc */
        int     jobflags;
        int     jobstate;
        int     inputcharge;
        int     outputcharge;
        struct  callout_handle timeouthandle;
        struct  buf *bp;                /* Buffer pointer */
        struct  proc *userproc;         /* User process */ /* Not td! */
        struct  file *fd_file;          /* Pointer to file structure */
        struct  aiothreadlist *jobaiothread; /* AIO process descriptor */
        struct  aio_liojob *lio;        /* Optional lio job */
        struct  aiocb *uuaiocb;         /* Pointer in userspace of aiocb */
        struct  klist klist;            /* list of knotes */
        struct  aiocb uaiocb;           /* Kernel I/O control block */
};

/* jobflags */
#define AIOCBLIST_RUNDOWN       0x4
#define AIOCBLIST_ASYNCFREE     0x8
#define AIOCBLIST_DONE          0x10

/*
 * AIO process info
 */
#define AIOP_FREE       0x1             /* proc on free queue */
#define AIOP_SCHED      0x2             /* proc explicitly scheduled */

struct aiothreadlist {
        int aiothreadflags;                     /* AIO proc flags */
        TAILQ_ENTRY(aiothreadlist) list;        /* List of processes */
        struct thread *aiothread;               /* The AIO thread */
};

/*
 * data-structure for lio signal management
 */
struct aio_liojob {
        int     lioj_flags;
        int     lioj_buffer_count;
        int     lioj_buffer_finished_count;
        int     lioj_queue_count;
        int     lioj_queue_finished_count;
        struct  sigevent lioj_signal;   /* signal on all I/O done */
        TAILQ_ENTRY(aio_liojob) lioj_list;
        struct  kaioinfo *lioj_ki;
};
#define LIOJ_SIGNAL             0x1     /* signal on all done (lio) */
#define LIOJ_SIGNAL_POSTED      0x2     /* signal has been posted */

/*
 * per process aio data structure
 */
struct kaioinfo {
        int     kaio_flags;             /* per process kaio flags */
        int     kaio_maxactive_count;   /* maximum number of AIOs */
        int     kaio_active_count;      /* number of currently used AIOs */
        int     kaio_qallowed_count;    /* maximum size of AIO queue */
        int     kaio_queue_count;       /* size of AIO queue */
        int     kaio_ballowed_count;    /* maximum number of buffers */
        int     kaio_queue_finished_count; /* number of daemon jobs finished */
        int     kaio_buffer_count;      /* number of physio buffers */
        int     kaio_buffer_finished_count; /* count of I/O done */
        struct  proc *kaio_p;           /* process that uses this kaio block */
        TAILQ_HEAD(,aio_liojob) kaio_liojoblist; /* list of lio jobs */
        TAILQ_HEAD(,aiocblist) kaio_jobqueue;   /* job queue for process */
        TAILQ_HEAD(,aiocblist) kaio_jobdone;    /* done queue for process */
        TAILQ_HEAD(,aiocblist) kaio_bufqueue;   /* buffer job queue for process */
        TAILQ_HEAD(,aiocblist) kaio_bufdone;    /* buffer done queue for process */
        TAILQ_HEAD(,aiocblist) kaio_sockqueue;  /* queue for aios waiting on sockets */
};

#define KAIO_RUNDOWN    0x1     /* process is being run down */
#define KAIO_WAKEUP     0x2     /* wakeup process when there is a significant event */
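/*
 * Illustrative sketch of the job lifecycle implemented below (states are
 * the JOBST_* values kept in aiocblist.jobstate):
 *
 *      JOBST_NULL
 *        -> JOBST_JOBQGLOBAL    queued on aio_jobs or a socket's queue
 *        -> JOBST_JOBRUNNING    picked up by an aiod in aio_daemon()
 *        -> JOBST_JOBFINISHED   on kaio_jobdone; reaped by aio_return()
 *                               or aio_waitcomplete()
 *
 *      JOBST_NULL
 *        -> JOBST_JOBQBUF       direct physio path via aio_qphysio()
 *        -> JOBST_JOBBFINISHED  completed in aio_physwakeup()
 */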
static TAILQ_HEAD(,aiothreadlist) aio_freeproc, aio_activeproc;
static TAILQ_HEAD(,aiocblist) aio_jobs;                 /* Async job list */
static TAILQ_HEAD(,aiocblist) aio_bufjobs;              /* Phys I/O job list */

static void     aio_init_aioinfo(struct proc *p);
static void     aio_onceonly(void);
static int      aio_free_entry(struct aiocblist *aiocbe);
static void     aio_process(struct aiocblist *aiocbe);
static int      aio_newproc(void);
static int      aio_aqueue(struct thread *td, struct aiocb *job, int type);
static void     aio_physwakeup(struct buf *bp);
static void     aio_proc_rundown(struct proc *p);
static int      aio_fphysio(struct aiocblist *aiocbe);
static int      aio_qphysio(struct proc *p, struct aiocblist *iocb);
static void     aio_daemon(void *uproc);
static void     aio_swake_cb(struct socket *, struct sockbuf *);
static int      aio_unload(void);
static void     process_signal(void *aioj);
static int      filt_aioattach(struct knote *kn);
static void     filt_aiodetach(struct knote *kn);
static int      filt_aio(struct knote *kn, long hint);

static vm_zone_t kaio_zone, aiop_zone, aiocb_zone, aiol_zone;
static vm_zone_t aiolio_zone;

static struct filterops aio_filtops =
        { 0, filt_aioattach, filt_aiodetach, filt_aio };

static int
aio_modload(struct module *module, int cmd, void *arg)
{
        int error = 0;

        switch (cmd) {
        case MOD_LOAD:
                aio_onceonly();
                break;
        case MOD_UNLOAD:
                error = aio_unload();
                break;
        case MOD_SHUTDOWN:
                break;
        default:
                error = EINVAL;
                break;
        }
        return (error);
}

static moduledata_t aio_mod = {
        "aio",
        &aio_modload,
        NULL
};

SYSCALL_MODULE_HELPER(aio_return);
SYSCALL_MODULE_HELPER(aio_suspend);
SYSCALL_MODULE_HELPER(aio_cancel);
SYSCALL_MODULE_HELPER(aio_error);
SYSCALL_MODULE_HELPER(aio_read);
SYSCALL_MODULE_HELPER(aio_write);
SYSCALL_MODULE_HELPER(aio_waitcomplete);
SYSCALL_MODULE_HELPER(lio_listio);

DECLARE_MODULE(aio, aio_mod,
        SI_SUB_VFS, SI_ORDER_ANY);
MODULE_VERSION(aio, 1);

/*
 * Startup initialization
 */
static void
aio_onceonly(void)
{

        /* XXX: should probably just use so->callback */
        aio_swake = &aio_swake_cb;
        at_exit(aio_proc_rundown);
        at_exec(aio_proc_rundown);
        kqueue_add_filteropts(EVFILT_AIO, &aio_filtops);
        TAILQ_INIT(&aio_freeproc);
        TAILQ_INIT(&aio_activeproc);
        TAILQ_INIT(&aio_jobs);
        TAILQ_INIT(&aio_bufjobs);
        kaio_zone = zinit("AIO", sizeof(struct kaioinfo), 0, 0, 1);
        aiop_zone = zinit("AIOP", sizeof(struct aiothreadlist), 0, 0, 1);
        aiocb_zone = zinit("AIOCB", sizeof(struct aiocblist), 0, 0, 1);
        aiol_zone = zinit("AIOL", AIO_LISTIO_MAX*sizeof(intptr_t), 0, 0, 1);
        aiolio_zone = zinit("AIOLIO", sizeof(struct aio_liojob), 0, 0, 1);
        aiod_timeout = AIOD_TIMEOUT_DEFAULT;
        aiod_lifetime = AIOD_LIFETIME_DEFAULT;
        jobrefid = 1;
}

static int
aio_unload(void)
{

        /*
         * XXX: no unloads by default, it's too dangerous.  Perhaps we could
         * do it if we locked out callers and then did an aio_proc_rundown()
         * on each process.
         */
        if (!unloadable)
                return (EOPNOTSUPP);

        aio_swake = NULL;
        rm_at_exit(aio_proc_rundown);
        rm_at_exec(aio_proc_rundown);
        kqueue_del_filteropts(EVFILT_AIO);
        return (0);
}
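/*
 * Usage note (illustrative): besides being compiled in, this facility can
 * be loaded as the "aio" module, e.g. "kldload aio".  Unloading is refused
 * with EOPNOTSUPP unless vfs.aio.unloadable is set, and even then it is
 * not recommended, per the warning above.
 */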
/*
 * Init the per-process aioinfo structure.  The aioinfo limits are set
 * per-process for user limit (resource) management.
 */
static void
aio_init_aioinfo(struct proc *p)
{
        struct kaioinfo *ki;

        if (p->p_aioinfo == NULL) {
                ki = zalloc(kaio_zone);
                p->p_aioinfo = ki;
                ki->kaio_flags = 0;
                ki->kaio_maxactive_count = max_aio_per_proc;
                ki->kaio_active_count = 0;
                ki->kaio_qallowed_count = max_aio_queue_per_proc;
                ki->kaio_queue_count = 0;
                ki->kaio_ballowed_count = max_buf_aio;
                ki->kaio_buffer_count = 0;
                ki->kaio_buffer_finished_count = 0;
                ki->kaio_p = p;
                TAILQ_INIT(&ki->kaio_jobdone);
                TAILQ_INIT(&ki->kaio_jobqueue);
                TAILQ_INIT(&ki->kaio_bufdone);
                TAILQ_INIT(&ki->kaio_bufqueue);
                TAILQ_INIT(&ki->kaio_liojoblist);
                TAILQ_INIT(&ki->kaio_sockqueue);
        }

        while (num_aio_procs < target_aio_procs)
                aio_newproc();
}

/*
 * Free a job entry.  Wait for completion if it is currently active, but don't
 * delay forever.  If we delay, we return a flag that says that we have to
 * restart the queue scan.
 */
static int
aio_free_entry(struct aiocblist *aiocbe)
{
        struct kaioinfo *ki;
        struct aio_liojob *lj;
        struct proc *p;
        int error;
        int s;

        if (aiocbe->jobstate == JOBST_NULL)
                panic("aio_free_entry: freeing already free job");

        p = aiocbe->userproc;
        ki = p->p_aioinfo;
        lj = aiocbe->lio;
        if (ki == NULL)
                panic("aio_free_entry: missing p->p_aioinfo");

        while (aiocbe->jobstate == JOBST_JOBRUNNING) {
                if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE)
                        return 0;
                aiocbe->jobflags |= AIOCBLIST_RUNDOWN;
                tsleep(aiocbe, PRIBIO, "jobwai", 0);
        }
        aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;

        if (aiocbe->bp == NULL) {
                if (ki->kaio_queue_count <= 0)
                        panic("aio_free_entry: process queue size <= 0");
                if (num_queue_count <= 0)
                        panic("aio_free_entry: system wide queue size <= 0");

                if (lj) {
                        lj->lioj_queue_count--;
                        if (aiocbe->jobflags & AIOCBLIST_DONE)
                                lj->lioj_queue_finished_count--;
                }
                ki->kaio_queue_count--;
                if (aiocbe->jobflags & AIOCBLIST_DONE)
                        ki->kaio_queue_finished_count--;
                num_queue_count--;
        } else {
                if (lj) {
                        lj->lioj_buffer_count--;
                        if (aiocbe->jobflags & AIOCBLIST_DONE)
                                lj->lioj_buffer_finished_count--;
                }
                if (aiocbe->jobflags & AIOCBLIST_DONE)
                        ki->kaio_buffer_finished_count--;
                ki->kaio_buffer_count--;
                num_buf_aio--;
        }

        /* aiocbe is going away, we need to destroy any knotes */
        /*
         * XXXKSE Note the thread here is used to eventually find the
         * owning process again, but it is also used to do a fo_close
         * and that requires the thread.  (But does it require the
         * OWNING thread, or maybe the running thread?)
         * There is a semantic problem here...
         */
        knote_remove(FIRST_THREAD_IN_PROC(p), &aiocbe->klist); /* XXXKSE */

        if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags & KAIO_RUNDOWN)
            && ((ki->kaio_buffer_count == 0) && (ki->kaio_queue_count == 0)))) {
                ki->kaio_flags &= ~KAIO_WAKEUP;
                wakeup(p);
        }

        if (aiocbe->jobstate == JOBST_JOBQBUF) {
                if ((error = aio_fphysio(aiocbe)) != 0)
                        return error;
                if (aiocbe->jobstate != JOBST_JOBBFINISHED)
                        panic("aio_free_entry: invalid physio finish-up state");
                s = splbio();
                TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
                splx(s);
        } else if (aiocbe->jobstate == JOBST_JOBQGLOBAL) {
                s = splnet();
                TAILQ_REMOVE(&aio_jobs, aiocbe, list);
                TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
                splx(s);
        } else if (aiocbe->jobstate == JOBST_JOBFINISHED)
                TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist);
        else if (aiocbe->jobstate == JOBST_JOBBFINISHED) {
                s = splbio();
                TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
                splx(s);
                if (aiocbe->bp) {
                        vunmapbuf(aiocbe->bp);
                        relpbuf(aiocbe->bp, NULL);
                        aiocbe->bp = NULL;
                }
        }
        if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) {
                TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
                zfree(aiolio_zone, lj);
        }
        aiocbe->jobstate = JOBST_NULL;
        untimeout(process_signal, aiocbe, aiocbe->timeouthandle);
        zfree(aiocb_zone, aiocbe);
        return 0;
}
/*
 * Rundown the jobs for a given process.
 */
static void
aio_proc_rundown(struct proc *p)
{
        int s;
        struct kaioinfo *ki;
        struct aio_liojob *lj, *ljn;
        struct aiocblist *aiocbe, *aiocbn;
        struct file *fp;
        struct filedesc *fdp;
        struct socket *so;

        ki = p->p_aioinfo;
        if (ki == NULL)
                return;

        ki->kaio_flags |= LIOJ_SIGNAL_POSTED;
        while ((ki->kaio_active_count > 0) || (ki->kaio_buffer_count >
            ki->kaio_buffer_finished_count)) {
                ki->kaio_flags |= KAIO_RUNDOWN;
                if (tsleep(p, PRIBIO, "kaiowt", aiod_timeout))
                        break;
        }

        /*
         * Move any aio ops that are waiting on socket I/O to the normal job
         * queues so they are cleaned up with any others.
         */
        fdp = p->p_fd;

        s = splnet();
        for (aiocbe = TAILQ_FIRST(&ki->kaio_sockqueue); aiocbe; aiocbe =
            aiocbn) {
                aiocbn = TAILQ_NEXT(aiocbe, plist);
                fp = fdp->fd_ofiles[aiocbe->uaiocb.aio_fildes];

                /*
                 * Under some circumstances, the aio_fildes and the file
                 * structure don't match.  This would leave aiocbe's in the
                 * TAILQ associated with the socket and cause a panic later.
                 *
                 * Detect and fix.
                 */
                if ((fp == NULL) || (fp != aiocbe->fd_file))
                        fp = aiocbe->fd_file;
                if (fp) {
                        so = (struct socket *)fp->f_data;
                        TAILQ_REMOVE(&so->so_aiojobq, aiocbe, list);
                        if (TAILQ_EMPTY(&so->so_aiojobq)) {
                                so->so_snd.sb_flags &= ~SB_AIO;
                                so->so_rcv.sb_flags &= ~SB_AIO;
                        }
                }
                TAILQ_REMOVE(&ki->kaio_sockqueue, aiocbe, plist);
                TAILQ_INSERT_HEAD(&aio_jobs, aiocbe, list);
                TAILQ_INSERT_HEAD(&ki->kaio_jobqueue, aiocbe, plist);
        }
        splx(s);

restart1:
        for (aiocbe = TAILQ_FIRST(&ki->kaio_jobdone); aiocbe; aiocbe = aiocbn) {
                aiocbn = TAILQ_NEXT(aiocbe, plist);
                if (aio_free_entry(aiocbe))
                        goto restart1;
        }

restart2:
        for (aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue); aiocbe; aiocbe =
            aiocbn) {
                aiocbn = TAILQ_NEXT(aiocbe, plist);
                if (aio_free_entry(aiocbe))
                        goto restart2;
        }

        /*
         * Note the use of lots of splbio here, trying to avoid splbio for
         * long chains of I/O.  Probably unnecessary.
         */
restart3:
        s = splbio();
        while (TAILQ_FIRST(&ki->kaio_bufqueue)) {
                ki->kaio_flags |= KAIO_WAKEUP;
                tsleep(p, PRIBIO, "aioprn", 0);
                splx(s);
                goto restart3;
        }
        splx(s);

restart4:
        s = splbio();
        for (aiocbe = TAILQ_FIRST(&ki->kaio_bufdone); aiocbe; aiocbe = aiocbn) {
                aiocbn = TAILQ_NEXT(aiocbe, plist);
                if (aio_free_entry(aiocbe)) {
                        splx(s);
                        goto restart4;
                }
        }
        splx(s);

        /*
         * If we've slept, jobs might have moved from one queue to another.
         * Retry rundown if we didn't manage to empty the queues.
         */
        if (TAILQ_FIRST(&ki->kaio_jobdone) != NULL ||
            TAILQ_FIRST(&ki->kaio_jobqueue) != NULL ||
            TAILQ_FIRST(&ki->kaio_bufqueue) != NULL ||
            TAILQ_FIRST(&ki->kaio_bufdone) != NULL)
                goto restart1;

        for (lj = TAILQ_FIRST(&ki->kaio_liojoblist); lj; lj = ljn) {
                ljn = TAILQ_NEXT(lj, lioj_list);
                if ((lj->lioj_buffer_count == 0) && (lj->lioj_queue_count ==
                    0)) {
                        TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
                        zfree(aiolio_zone, lj);
                } else {
#ifdef DIAGNOSTIC
                        printf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, "
                            "QF:%d\n", lj->lioj_buffer_count,
                            lj->lioj_buffer_finished_count,
                            lj->lioj_queue_count,
                            lj->lioj_queue_finished_count);
#endif
                }
        }

        zfree(kaio_zone, ki);
        p->p_aioinfo = NULL;
}
/*
 * Select a job to run (called by an AIO daemon).
 */
static struct aiocblist *
aio_selectjob(struct aiothreadlist *aiop)
{
        int s;
        struct aiocblist *aiocbe;
        struct kaioinfo *ki;
        struct proc *userp;

        s = splnet();
        for (aiocbe = TAILQ_FIRST(&aio_jobs); aiocbe; aiocbe =
            TAILQ_NEXT(aiocbe, list)) {
                userp = aiocbe->userproc;
                ki = userp->p_aioinfo;

                if (ki->kaio_active_count < ki->kaio_maxactive_count) {
                        TAILQ_REMOVE(&aio_jobs, aiocbe, list);
                        splx(s);
                        return aiocbe;
                }
        }
        splx(s);

        return NULL;
}
/*
 * The AIO processing activity.  This is the code that does the I/O request for
 * the non-physio version of the operations.  The normal vn operations are
 * used, and this code should work in all instances for every type of file,
 * including pipes, sockets, fifos, and regular files.
 */
static void
aio_process(struct aiocblist *aiocbe)
{
        struct filedesc *fdp;
        struct thread *td;
        struct proc *userp;
        struct proc *mycp;
        struct aiocb *cb;
        struct file *fp;
        struct uio auio;
        struct iovec aiov;
        unsigned int fd;
        int cnt;
        int error;
        off_t offset;
        int oublock_st, oublock_end;
        int inblock_st, inblock_end;

        userp = aiocbe->userproc;
        td = curthread;
        mycp = td->td_proc;
        cb = &aiocbe->uaiocb;

        fdp = mycp->p_fd;
        fd = cb->aio_fildes;
        fp = fdp->fd_ofiles[fd];

        if ((fp == NULL) || (fp != aiocbe->fd_file)) {
                cb->_aiocb_private.error = EBADF;
                cb->_aiocb_private.status = -1;
                return;
        }

        aiov.iov_base = (void *)(uintptr_t)cb->aio_buf;
        aiov.iov_len = cb->aio_nbytes;

        auio.uio_iov = &aiov;
        auio.uio_iovcnt = 1;
        auio.uio_offset = offset = cb->aio_offset;
        auio.uio_resid = cb->aio_nbytes;
        cnt = cb->aio_nbytes;
        auio.uio_segflg = UIO_USERSPACE;
        auio.uio_td = td;

        inblock_st = mycp->p_stats->p_ru.ru_inblock;
        oublock_st = mycp->p_stats->p_ru.ru_oublock;
        /*
         * Temporarily bump the ref count while reading to avoid the
         * descriptor being ripped out from under us.
         */
        fhold(fp);
        if (cb->aio_lio_opcode == LIO_READ) {
                auio.uio_rw = UIO_READ;
                error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td);
        } else {
                auio.uio_rw = UIO_WRITE;
                error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td);
        }
        fdrop(fp, td);
        inblock_end = mycp->p_stats->p_ru.ru_inblock;
        oublock_end = mycp->p_stats->p_ru.ru_oublock;

        aiocbe->inputcharge = inblock_end - inblock_st;
        aiocbe->outputcharge = oublock_end - oublock_st;

        if ((error) && (auio.uio_resid != cnt)) {
                if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
                        error = 0;
                if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) {
                        PROC_LOCK(userp);
                        psignal(userp, SIGPIPE);
                        PROC_UNLOCK(userp);
                }
        }

        cnt -= auio.uio_resid;
        cb->_aiocb_private.error = error;
        cb->_aiocb_private.status = cnt;
}
/*
 * The AIO daemon.  Most of the actual work is done in aio_process(); the
 * setup (and address space management) is done in this routine.
 */
static void
aio_daemon(void *uproc)
{
        int s;
        struct aio_liojob *lj;
        struct aiocb *cb;
        struct aiocblist *aiocbe;
        struct aiothreadlist *aiop;
        struct kaioinfo *ki;
        struct proc *curcp, *mycp, *userp;
        struct vmspace *myvm, *tmpvm;
        struct thread *td = curthread;

        mtx_lock(&Giant);
        /*
         * Local copies of curproc (cp) and vmspace (myvm)
         */
        mycp = td->td_proc;
        myvm = mycp->p_vmspace;

        if (mycp->p_textvp) {
                vrele(mycp->p_textvp);
                mycp->p_textvp = NULL;
        }

        /*
         * Allocate and ready the aio control info.  There is one aiop
         * structure per daemon.
         */
        aiop = zalloc(aiop_zone);
        aiop->aiothread = td;
        aiop->aiothreadflags |= AIOP_FREE;

        s = splnet();

        /*
         * Place thread (lightweight process) onto the AIO free thread list.
         */
        if (TAILQ_EMPTY(&aio_freeproc))
                wakeup(&aio_freeproc);
        TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);

        splx(s);

        /*
         * Get rid of our current file descriptors.  AIODs don't need any
         * file descriptors, except as temporarily inherited from the client.
         */
        fdfree(td);
        mycp->p_fd = NULL;

        /* The daemon resides in its own pgrp. */
        enterpgrp(mycp, mycp->p_pid, 1);

        /* Mark special process type. */
        mycp->p_flag |= P_SYSTEM;

        /*
         * Wakeup parent process.  (Parent sleeps to keep from blasting away
         * and creating too many daemons.)
         */
        wakeup(mycp);

        for (;;) {
                /*
                 * curcp is the current daemon process context.
                 * userp is the current user process context.
                 */
                curcp = mycp;

                /*
                 * Take daemon off of free queue
                 */
                if (aiop->aiothreadflags & AIOP_FREE) {
                        s = splnet();
                        TAILQ_REMOVE(&aio_freeproc, aiop, list);
                        TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
                        aiop->aiothreadflags &= ~AIOP_FREE;
                        splx(s);
                }
                aiop->aiothreadflags &= ~AIOP_SCHED;

                /*
                 * Check for jobs.
                 */
                while ((aiocbe = aio_selectjob(aiop)) != NULL) {
                        cb = &aiocbe->uaiocb;
                        userp = aiocbe->userproc;

                        aiocbe->jobstate = JOBST_JOBRUNNING;

                        /*
                         * Connect to process address space for user program.
                         */
                        if (userp != curcp) {
                                /*
                                 * Save the current address space that we are
                                 * connected to.
                                 */
                                tmpvm = mycp->p_vmspace;

                                /*
                                 * Point to the new user address space, and
                                 * refer to it.
                                 */
                                mycp->p_vmspace = userp->p_vmspace;
                                mycp->p_vmspace->vm_refcnt++;

                                /* Activate the new mapping. */
                                pmap_activate(FIRST_THREAD_IN_PROC(mycp));

                                /*
                                 * If the old address space wasn't the daemon's
                                 * own address space, then we need to remove
                                 * the daemon's reference from the other
                                 * process that it was acting on behalf of.
                                 */
                                if (tmpvm != myvm) {
                                        vmspace_free(tmpvm);
                                }

                                /*
                                 * Disassociate from the previous client's
                                 * file descriptors, and associate to the new
                                 * client's descriptors.  Note that the daemon
                                 * doesn't need to worry about its original
                                 * descriptors, because they were originally
                                 * freed.
                                 */
                                if (mycp->p_fd)
                                        fdfree(td);
                                mycp->p_fd = fdshare(userp);
                                curcp = userp;
                        }

                        ki = userp->p_aioinfo;
                        lj = aiocbe->lio;

                        /* Account for currently active jobs. */
                        ki->kaio_active_count++;

                        /* Do the I/O function. */
                        aiocbe->jobaiothread = aiop;
                        aio_process(aiocbe);

                        /* Decrement the active job count. */
                        ki->kaio_active_count--;

                        /*
                         * Increment the completion count for wakeup/signal
                         * comparisons.
                         */
                        aiocbe->jobflags |= AIOCBLIST_DONE;
                        ki->kaio_queue_finished_count++;
                        if (lj)
                                lj->lioj_queue_finished_count++;
                        if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags
                            & KAIO_RUNDOWN) && (ki->kaio_active_count == 0))) {
                                ki->kaio_flags &= ~KAIO_WAKEUP;
                                wakeup(userp);
                        }

                        s = splbio();
                        if (lj && (lj->lioj_flags &
                            (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL) {
                                if ((lj->lioj_queue_finished_count ==
                                    lj->lioj_queue_count) &&
                                    (lj->lioj_buffer_finished_count ==
                                    lj->lioj_buffer_count)) {
                                        PROC_LOCK(userp);
                                        psignal(userp,
                                            lj->lioj_signal.sigev_signo);
                                        PROC_UNLOCK(userp);
                                        lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
                                }
                        }
                        splx(s);

                        aiocbe->jobstate = JOBST_JOBFINISHED;

                        /*
                         * If the I/O request should be automatically rundown,
                         * do the needed cleanup.  Otherwise, place the queue
                         * entry for the just finished I/O request into the
                         * done queue for the associated client.
                         */
                        s = splnet();
                        if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) {
                                aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;
                                zfree(aiocb_zone, aiocbe);
                        } else {
                                TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
                                TAILQ_INSERT_TAIL(&ki->kaio_jobdone, aiocbe,
                                    plist);
                        }
                        splx(s);
                        KNOTE(&aiocbe->klist, 0);

                        if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) {
                                wakeup(aiocbe);
                                aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN;
                        }

                        if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
                                PROC_LOCK(userp);
                                psignal(userp, cb->aio_sigevent.sigev_signo);
                                PROC_UNLOCK(userp);
                        }
                }
                /*
                 * Disconnect from user address space.
                 */
                if (curcp != mycp) {
                        /* Get the user address space to disconnect from. */
                        tmpvm = mycp->p_vmspace;

                        /* Get original address space for daemon. */
                        mycp->p_vmspace = myvm;

                        /* Activate the daemon's address space. */
                        pmap_activate(FIRST_THREAD_IN_PROC(mycp));
#ifdef DIAGNOSTIC
                        if (tmpvm == myvm) {
                                printf("AIOD: vmspace problem -- %d\n",
                                    mycp->p_pid);
                        }
#endif
                        /* Remove our vmspace reference. */
                        vmspace_free(tmpvm);

                        /*
                         * Disassociate from the user process's file
                         * descriptors.
                         */
                        if (mycp->p_fd)
                                fdfree(td);
                        mycp->p_fd = NULL;
                        curcp = mycp;
                }

                /*
                 * If we are the first to be put onto the free queue, wakeup
                 * anyone waiting for a daemon.
                 */
                s = splnet();
                TAILQ_REMOVE(&aio_activeproc, aiop, list);
                if (TAILQ_EMPTY(&aio_freeproc))
                        wakeup(&aio_freeproc);
                TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
                aiop->aiothreadflags |= AIOP_FREE;
                splx(s);

                /*
                 * If daemon is inactive for a long time, allow it to exit,
                 * thereby freeing resources.
                 */
                if ((aiop->aiothreadflags & AIOP_SCHED) == 0 &&
                    tsleep(aiop->aiothread, PRIBIO, "aiordy", aiod_lifetime)) {
                        s = splnet();
                        if (TAILQ_EMPTY(&aio_jobs)) {
                                if ((aiop->aiothreadflags & AIOP_FREE) &&
                                    (num_aio_procs > target_aio_procs)) {
                                        TAILQ_REMOVE(&aio_freeproc, aiop, list);
                                        splx(s);
                                        zfree(aiop_zone, aiop);
                                        num_aio_procs--;
#ifdef DIAGNOSTIC
                                        if (mycp->p_vmspace->vm_refcnt <= 1) {
                                                printf("AIOD: bad vm refcnt for"
                                                    " exiting daemon: %d\n",
                                                    mycp->p_vmspace->vm_refcnt);
                                        }
#endif
                                        kthread_exit(0);
                                }
                        }
                        splx(s);
                }
        }
}

/*
 * Create a new AIO daemon.  This is mostly a kernel-thread fork routine.  The
 * AIO daemon modifies its environment itself.
 */
static int
aio_newproc()
{
        int error;
        struct proc *p;

        error = kthread_create(aio_daemon, curproc, &p, RFNOWAIT, "aiod%d",
            num_aio_procs);
        if (error)
                return error;

        /*
         * Wait until daemon is started, but continue on just in case to
         * handle error conditions.
         */
        error = tsleep(p, PZERO, "aiosta", aiod_timeout);

        num_aio_procs++;

        return error;
}
/*
 * Try the high-performance, low-overhead physio method for eligible
 * VCHR devices.  This method doesn't use an aio helper thread, and
 * thus has very low overhead.
 *
 * Assumes that the caller, _aio_aqueue(), has incremented the file
 * structure's reference count, preventing its deallocation for the
 * duration of this call.
 */
static int
aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
{
        int error;
        struct aiocb *cb;
        struct file *fp;
        struct buf *bp;
        struct vnode *vp;
        struct kaioinfo *ki;
        struct aio_liojob *lj;
        int s;
        int notify;

        cb = &aiocbe->uaiocb;
        fp = aiocbe->fd_file;

        if (fp->f_type != DTYPE_VNODE)
                return (-1);

        vp = (struct vnode *)fp->f_data;

        /*
         * If it's not a disk, we don't want to return a positive error.
         * It causes the aio code to not fall through to try the thread
         * way when you're talking to a regular file.
         */
        if (!vn_isdisk(vp, &error)) {
                if (error == ENOTBLK)
                        return (-1);
                else
                        return (error);
        }

        if (cb->aio_nbytes % vp->v_rdev->si_bsize_phys)
                return (-1);

        if (cb->aio_nbytes >
            MAXPHYS - (((vm_offset_t) cb->aio_buf) & PAGE_MASK))
                return (-1);

        ki = p->p_aioinfo;
        if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
                return (-1);

        ki->kaio_buffer_count++;

        lj = aiocbe->lio;
        if (lj)
                lj->lioj_buffer_count++;

        /* Create and build a buffer header for a transfer. */
        bp = (struct buf *)getpbuf(NULL);
        BUF_KERNPROC(bp);

        /*
         * Get a copy of the kva from the physical buffer.
         */
        bp->b_caller1 = p;
        bp->b_dev = vp->v_rdev;
        error = bp->b_error = 0;

        bp->b_bcount = cb->aio_nbytes;
        bp->b_bufsize = cb->aio_nbytes;
        bp->b_flags = B_PHYS;
        bp->b_iodone = aio_physwakeup;
        bp->b_saveaddr = bp->b_data;
        bp->b_data = (void *)(uintptr_t)cb->aio_buf;
        bp->b_blkno = btodb(cb->aio_offset);

        if (cb->aio_lio_opcode == LIO_WRITE) {
                bp->b_iocmd = BIO_WRITE;
                if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_READ)) {
                        error = EFAULT;
                        goto doerror;
                }
        } else {
                bp->b_iocmd = BIO_READ;
                if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_WRITE)) {
                        error = EFAULT;
                        goto doerror;
                }
        }

        /* Bring buffer into kernel space. */
        vmapbuf(bp);

        s = splbio();
        aiocbe->bp = bp;
        bp->b_spc = (void *)aiocbe;
        TAILQ_INSERT_TAIL(&aio_bufjobs, aiocbe, list);
        TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
        aiocbe->jobstate = JOBST_JOBQBUF;
        cb->_aiocb_private.status = cb->aio_nbytes;
        num_buf_aio++;
        bp->b_error = 0;

        splx(s);

        /* Perform transfer. */
        DEV_STRATEGY(bp, 0);

        notify = 0;
        s = splbio();

        /*
         * If we had an error invoking the request, or an error in processing
         * the request before we have returned, we process it as an error in
         * transfer.  Note that such an I/O error is not indicated immediately,
         * but is returned using the aio_error mechanism.  In this case,
         * aio_suspend will return immediately.
         */
        if (bp->b_error || (bp->b_ioflags & BIO_ERROR)) {
                struct aiocb *job = aiocbe->uuaiocb;

                aiocbe->uaiocb._aiocb_private.status = 0;
                suword(&job->_aiocb_private.status, 0);
                aiocbe->uaiocb._aiocb_private.error = bp->b_error;
                suword(&job->_aiocb_private.error, bp->b_error);

                ki->kaio_buffer_finished_count++;

                if (aiocbe->jobstate != JOBST_JOBBFINISHED) {
                        aiocbe->jobstate = JOBST_JOBBFINISHED;
                        aiocbe->jobflags |= AIOCBLIST_DONE;
                        TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
                        TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
                        TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
                        notify = 1;
                }
        }
        splx(s);
        if (notify)
                KNOTE(&aiocbe->klist, 0);
        return 0;

doerror:
        ki->kaio_buffer_count--;
        if (lj)
                lj->lioj_buffer_count--;
        aiocbe->bp = NULL;
        relpbuf(bp, NULL);
        return error;
}
/*
 * This waits/tests physio completion.
 */
static int
aio_fphysio(struct aiocblist *iocb)
{
        int s;
        struct buf *bp;
        int error;

        bp = iocb->bp;

        s = splbio();
        while ((bp->b_flags & B_DONE) == 0) {
                if (tsleep(bp, PRIBIO, "physstr", aiod_timeout)) {
                        if ((bp->b_flags & B_DONE) == 0) {
                                splx(s);
                                return EINPROGRESS;
                        } else
                                break;
                }
        }
        splx(s);

        /* Release mapping into kernel space. */
        vunmapbuf(bp);
        iocb->bp = 0;

        error = 0;

        /* Check for an error. */
        if (bp->b_ioflags & BIO_ERROR)
                error = bp->b_error;

        relpbuf(bp, NULL);
        return (error);
}

/*
 * Wake up aio requests that may be serviceable now.
 */
static void
aio_swake_cb(struct socket *so, struct sockbuf *sb)
{
        struct aiocblist *cb, *cbn;
        struct proc *p;
        struct kaioinfo *ki = NULL;
        int opcode, wakecount = 0;
        struct aiothreadlist *aiop;

        if (sb == &so->so_snd) {
                opcode = LIO_WRITE;
                so->so_snd.sb_flags &= ~SB_AIO;
        } else {
                opcode = LIO_READ;
                so->so_rcv.sb_flags &= ~SB_AIO;
        }

        for (cb = TAILQ_FIRST(&so->so_aiojobq); cb; cb = cbn) {
                cbn = TAILQ_NEXT(cb, list);
                if (opcode == cb->uaiocb.aio_lio_opcode) {
                        p = cb->userproc;
                        ki = p->p_aioinfo;
                        TAILQ_REMOVE(&so->so_aiojobq, cb, list);
                        TAILQ_REMOVE(&ki->kaio_sockqueue, cb, plist);
                        TAILQ_INSERT_TAIL(&aio_jobs, cb, list);
                        TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, cb, plist);
                        wakecount++;
                        if (cb->jobstate != JOBST_JOBQGLOBAL)
                                panic("invalid queue value");
                }
        }

        while (wakecount--) {
                if ((aiop = TAILQ_FIRST(&aio_freeproc)) != 0) {
                        TAILQ_REMOVE(&aio_freeproc, aiop, list);
                        TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
                        aiop->aiothreadflags &= ~AIOP_FREE;
                        wakeup(aiop->aiothread);
                }
        }
}
/*
 * Queue a new AIO request.  The choice between the threaded and the direct
 * physio (VCHR) technique is made here.
 */
static int
_aio_aqueue(struct thread *td, struct aiocb *job, struct aio_liojob *lj, int type)
{
        struct proc *p = td->td_proc;
        struct filedesc *fdp;
        struct file *fp;
        unsigned int fd;
        struct socket *so;
        int s;
        int error;
        int opcode;
        struct aiocblist *aiocbe;
        struct aiothreadlist *aiop;
        struct kaioinfo *ki;
        struct kevent kev;
        struct kqueue *kq;
        struct file *kq_fp;

        aiocbe = zalloc(aiocb_zone);
        aiocbe->inputcharge = 0;
        aiocbe->outputcharge = 0;
        callout_handle_init(&aiocbe->timeouthandle);
        SLIST_INIT(&aiocbe->klist);

        suword(&job->_aiocb_private.status, -1);
        suword(&job->_aiocb_private.error, 0);
        suword(&job->_aiocb_private.kernelinfo, -1);

        error = copyin(job, &aiocbe->uaiocb, sizeof(aiocbe->uaiocb));
        if (error) {
                suword(&job->_aiocb_private.error, error);
                zfree(aiocb_zone, aiocbe);
                return error;
        }
        if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
            !_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) {
                zfree(aiocb_zone, aiocbe);
                return EINVAL;
        }

        /* Save userspace address of the job info. */
        aiocbe->uuaiocb = job;

        /* Get the opcode. */
        if (type != LIO_NOP)
                aiocbe->uaiocb.aio_lio_opcode = type;
        opcode = aiocbe->uaiocb.aio_lio_opcode;

        /* Get the fd info for process. */
        fdp = p->p_fd;

        /*
         * Range check file descriptor.
         */
        fd = aiocbe->uaiocb.aio_fildes;
        if (fd >= fdp->fd_nfiles) {
                zfree(aiocb_zone, aiocbe);
                if (type == 0)
                        suword(&job->_aiocb_private.error, EBADF);
                return EBADF;
        }

        fp = aiocbe->fd_file = fdp->fd_ofiles[fd];
        if ((fp == NULL) || ((opcode == LIO_WRITE) && ((fp->f_flag & FWRITE) ==
            0))) {
                zfree(aiocb_zone, aiocbe);
                if (type == 0)
                        suword(&job->_aiocb_private.error, EBADF);
                return EBADF;
        }

        if (aiocbe->uaiocb.aio_offset == -1LL) {
                zfree(aiocb_zone, aiocbe);
                if (type == 0)
                        suword(&job->_aiocb_private.error, EINVAL);
                return EINVAL;
        }

        error = suword(&job->_aiocb_private.kernelinfo, jobrefid);
        if (error) {
                zfree(aiocb_zone, aiocbe);
                if (type == 0)
                        suword(&job->_aiocb_private.error, EINVAL);
                return error;
        }

        aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jobrefid;
        if (jobrefid == LONG_MAX)
                jobrefid = 1;
        else
                jobrefid++;

        if (opcode == LIO_NOP) {
                zfree(aiocb_zone, aiocbe);
                if (type == 0) {
                        suword(&job->_aiocb_private.error, 0);
                        suword(&job->_aiocb_private.status, 0);
                        suword(&job->_aiocb_private.kernelinfo, 0);
                }
                return 0;
        }

        if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) {
                zfree(aiocb_zone, aiocbe);
                if (type == 0) {
                        suword(&job->_aiocb_private.status, 0);
                        suword(&job->_aiocb_private.error, EINVAL);
                }
                return EINVAL;
        }

        fhold(fp);
        if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_KEVENT) {
                kev.ident = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue;
                kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sigval_ptr;
        } else {
                /*
                 * This method for requesting kevent-based notification won't
                 * work on the alpha, since we're passing in a pointer
                 * via aio_lio_opcode, which is an int.  Use the SIGEV_KEVENT-
                 * based method instead.
                 */
                struct kevent *kevp;

                kevp = (struct kevent *)(uintptr_t)job->aio_lio_opcode;
                if (kevp == NULL)
                        goto no_kqueue;

                error = copyin(kevp, &kev, sizeof(kev));
                if (error)
                        goto aqueue_fail;
        }
        if ((u_int)kev.ident >= fdp->fd_nfiles ||
            (kq_fp = fdp->fd_ofiles[kev.ident]) == NULL ||
            (kq_fp->f_type != DTYPE_KQUEUE)) {
                error = EBADF;
                goto aqueue_fail;
        }
        kq = (struct kqueue *)kq_fp->f_data;
        kev.ident = (uintptr_t)aiocbe;
        kev.filter = EVFILT_AIO;
        kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
        error = kqueue_register(kq, &kev, td);
aqueue_fail:
        if (error) {
                zfree(aiocb_zone, aiocbe);
                if (type == 0)
                        suword(&job->_aiocb_private.error, error);
                goto done;
        }
no_kqueue:

        suword(&job->_aiocb_private.error, EINPROGRESS);
        aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
        aiocbe->userproc = p;
        aiocbe->jobflags = 0;
        aiocbe->lio = lj;
        ki = p->p_aioinfo;

        if (fp->f_type == DTYPE_SOCKET) {
                /*
                 * Alternate queueing for socket ops: Reach down into the
                 * descriptor to get the socket data.  Then check to see if
                 * the socket is ready to be read or written (based on the
                 * requested operation).
                 *
                 * If it is not ready for io, then queue the aiocbe on the
                 * socket, and set the flags so we get a call when sbnotify()
                 * happens.
                 */
                so = (struct socket *)fp->f_data;
                s = splnet();
                if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode ==
                    LIO_WRITE) && (!sowriteable(so)))) {
                        TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list);
                        TAILQ_INSERT_TAIL(&ki->kaio_sockqueue, aiocbe, plist);
                        if (opcode == LIO_READ)
                                so->so_rcv.sb_flags |= SB_AIO;
                        else
                                so->so_snd.sb_flags |= SB_AIO;
                        aiocbe->jobstate = JOBST_JOBQGLOBAL; /* XXX */
                        ki->kaio_queue_count++;
                        num_queue_count++;
                        splx(s);
                        error = 0;
                        goto done;
                }
                splx(s);
        }

        if ((error = aio_qphysio(p, aiocbe)) == 0)
                goto done;
        if (error > 0) {
                suword(&job->_aiocb_private.status, 0);
                aiocbe->uaiocb._aiocb_private.error = error;
                suword(&job->_aiocb_private.error, error);
                goto done;
        }

        /* No buffer for daemon I/O. */
        aiocbe->bp = NULL;

        ki->kaio_queue_count++;
        if (lj)
                lj->lioj_queue_count++;
        s = splnet();
        TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
        TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
        splx(s);
        aiocbe->jobstate = JOBST_JOBQGLOBAL;

        num_queue_count++;
        error = 0;

        /*
         * If we don't have a free AIO process, and we are below our quota,
         * then start one.  Otherwise, depend on the subsequent I/O
         * completions to pick up this job.  If we don't successfully create
         * the new process (thread) due to resource issues, we return an
         * error for now (EAGAIN), which is likely not the correct thing to
         * do.
         */
        s = splnet();
retryproc:
        if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
                TAILQ_REMOVE(&aio_freeproc, aiop, list);
                TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
                aiop->aiothreadflags &= ~AIOP_FREE;
                wakeup(aiop->aiothread);
        } else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
            ((ki->kaio_active_count + num_aio_resv_start) <
            ki->kaio_maxactive_count)) {
                num_aio_resv_start++;
                if ((error = aio_newproc()) == 0) {
                        num_aio_resv_start--;
                        goto retryproc;
                }
                num_aio_resv_start--;
        }
        splx(s);
done:
        fdrop(fp, td);
        return error;
}
/*
 * This routine queues an AIO request, checking for quotas.
 */
static int
aio_aqueue(struct thread *td, struct aiocb *job, int type)
{
        struct proc *p = td->td_proc;
        struct kaioinfo *ki;

        if (p->p_aioinfo == NULL)
                aio_init_aioinfo(p);

        if (num_queue_count >= max_queue_count)
                return EAGAIN;

        ki = p->p_aioinfo;
        if (ki->kaio_queue_count >= ki->kaio_qallowed_count)
                return EAGAIN;

        return _aio_aqueue(td, job, NULL, type);
}

/*
 * Support the aio_return system call; as a side effect, kernel resources are
 * released.
 */
int
aio_return(struct thread *td, struct aio_return_args *uap)
{
        struct proc *p = td->td_proc;
        int s;
        int jobref;
        struct aiocblist *cb, *ncb;
        struct aiocb *ujob;
        struct kaioinfo *ki;

        ki = p->p_aioinfo;
        if (ki == NULL)
                return EINVAL;

        ujob = uap->aiocbp;

        jobref = fuword(&ujob->_aiocb_private.kernelinfo);
        if (jobref == -1 || jobref == 0)
                return EINVAL;

        TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
                if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
                    jobref) {
                        if (ujob == cb->uuaiocb) {
                                td->td_retval[0] =
                                    cb->uaiocb._aiocb_private.status;
                        } else
                                td->td_retval[0] = EFAULT;
                        if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
                                p->p_stats->p_ru.ru_oublock +=
                                    cb->outputcharge;
                                cb->outputcharge = 0;
                        } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
                                p->p_stats->p_ru.ru_inblock += cb->inputcharge;
                                cb->inputcharge = 0;
                        }
                        aio_free_entry(cb);
                        return 0;
                }
        }
        s = splbio();
        for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = ncb) {
                ncb = TAILQ_NEXT(cb, plist);
                if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo)
                    == jobref) {
                        splx(s);
                        if (ujob == cb->uuaiocb) {
                                td->td_retval[0] =
                                    cb->uaiocb._aiocb_private.status;
                        } else
                                td->td_retval[0] = EFAULT;
                        aio_free_entry(cb);
                        return 0;
                }
        }
        splx(s);

        return (EINVAL);
}
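/*
 * Illustrative userland round trip through the syscalls above (a sketch,
 * not kernel code; error handling elided):
 *
 *      struct aiocb acb;
 *      char buf[512];
 *
 *      memset(&acb, 0, sizeof(acb));
 *      acb.aio_fildes = fd;
 *      acb.aio_buf = buf;
 *      acb.aio_nbytes = sizeof(buf);
 *      acb.aio_offset = 0;
 *      aio_read(&acb);                         queued via aio_aqueue(LIO_READ)
 *      while (aio_error(&acb) == EINPROGRESS)
 *              ;                               or block in aio_suspend()
 *      nread = aio_return(&acb);               reaps the kernel resources
 */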
/*
 * Allow a process to wake up when any of the I/O requests are completed.
 */
int
aio_suspend(struct thread *td, struct aio_suspend_args *uap)
{
        struct proc *p = td->td_proc;
        struct timeval atv;
        struct timespec ts;
        struct aiocb *const *cbptr, *cbp;
        struct kaioinfo *ki;
        struct aiocblist *cb;
        int i;
        int njoblist;
        int error, s, timo;
        int *ijoblist;
        struct aiocb **ujoblist;

        if (uap->nent > AIO_LISTIO_MAX)
                return EINVAL;

        timo = 0;
        if (uap->timeout) {
                /* Get timespec struct. */
                if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
                        return error;

                if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000)
                        return (EINVAL);

                TIMESPEC_TO_TIMEVAL(&atv, &ts);
                if (itimerfix(&atv))
                        return (EINVAL);
                timo = tvtohz(&atv);
        }

        ki = p->p_aioinfo;
        if (ki == NULL)
                return EAGAIN;

        njoblist = 0;
        ijoblist = zalloc(aiol_zone);
        ujoblist = zalloc(aiol_zone);
        cbptr = uap->aiocbp;

        for (i = 0; i < uap->nent; i++) {
                cbp = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]);
                if (cbp == 0)
                        continue;
                ujoblist[njoblist] = cbp;
                ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo);
                njoblist++;
        }

        if (njoblist == 0) {
                zfree(aiol_zone, ijoblist);
                zfree(aiol_zone, ujoblist);
                return 0;
        }

        error = 0;
        for (;;) {
                TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
                        for (i = 0; i < njoblist; i++) {
                                if (((intptr_t)
                                    cb->uaiocb._aiocb_private.kernelinfo) ==
                                    ijoblist[i]) {
                                        if (ujoblist[i] != cb->uuaiocb)
                                                error = EINVAL;
                                        zfree(aiol_zone, ijoblist);
                                        zfree(aiol_zone, ujoblist);
                                        return error;
                                }
                        }
                }

                s = splbio();
                for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb =
                    TAILQ_NEXT(cb, plist)) {
                        for (i = 0; i < njoblist; i++) {
                                if (((intptr_t)
                                    cb->uaiocb._aiocb_private.kernelinfo) ==
                                    ijoblist[i]) {
                                        splx(s);
                                        if (ujoblist[i] != cb->uuaiocb)
                                                error = EINVAL;
                                        zfree(aiol_zone, ijoblist);
                                        zfree(aiol_zone, ujoblist);
                                        return error;
                                }
                        }
                }

                ki->kaio_flags |= KAIO_WAKEUP;
                error = tsleep(p, PRIBIO | PCATCH, "aiospn", timo);
                splx(s);

                if (error == ERESTART || error == EINTR) {
                        zfree(aiol_zone, ijoblist);
                        zfree(aiol_zone, ujoblist);
                        return EINTR;
                } else if (error == EWOULDBLOCK) {
                        zfree(aiol_zone, ijoblist);
                        zfree(aiol_zone, ujoblist);
                        return EAGAIN;
                }
        }

        /* NOTREACHED */
        return EINVAL;
}
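/*
 * Illustrative use of the timed wait above (userland sketch; assumes acb0
 * and acb1 were queued by earlier aio_read()/aio_write() calls):
 *
 *      const struct aiocb *list[2] = { &acb0, &acb1 };
 *      struct timespec ts = { 1, 0 };          wait at most one second
 *
 *      if (aio_suspend(list, 2, &ts) == 0)
 *              ... at least one request has completed ...
 *      else if (errno == EAGAIN)
 *              ... the timeout expired first ...
 */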
/*
 * aio_cancel cancels any non-physio aio operations not currently in
 * progress.
 */
int
aio_cancel(struct thread *td, struct aio_cancel_args *uap)
{
        struct proc *p = td->td_proc;
        struct kaioinfo *ki;
        struct aiocblist *cbe, *cbn;
        struct file *fp;
        struct filedesc *fdp;
        struct socket *so;
        struct proc *po;
        int s, error;
        int cancelled = 0;
        int notcancelled = 0;
        struct vnode *vp;

        fdp = p->p_fd;
        if ((u_int)uap->fd >= fdp->fd_nfiles ||
            (fp = fdp->fd_ofiles[uap->fd]) == NULL)
                return (EBADF);

        if (fp->f_type == DTYPE_VNODE) {
                vp = (struct vnode *)fp->f_data;

                if (vn_isdisk(vp, &error)) {
                        td->td_retval[0] = AIO_NOTCANCELED;
                        return 0;
                }
        } else if (fp->f_type == DTYPE_SOCKET) {
                so = (struct socket *)fp->f_data;

                s = splnet();

                for (cbe = TAILQ_FIRST(&so->so_aiojobq); cbe; cbe = cbn) {
                        cbn = TAILQ_NEXT(cbe, list);
                        if ((uap->aiocbp == NULL) ||
                            (uap->aiocbp == cbe->uuaiocb)) {
                                po = cbe->userproc;
                                ki = po->p_aioinfo;
                                TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
                                TAILQ_REMOVE(&ki->kaio_sockqueue, cbe, plist);
                                TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe,
                                    plist);
                                if (ki->kaio_flags & KAIO_WAKEUP) {
                                        wakeup(po);
                                }
                                cbe->jobstate = JOBST_JOBFINISHED;
                                cbe->uaiocb._aiocb_private.status = -1;
                                cbe->uaiocb._aiocb_private.error = ECANCELED;
                                cancelled++;
                                /* XXX cancelled, knote? */
                                if (cbe->uaiocb.aio_sigevent.sigev_notify ==
                                    SIGEV_SIGNAL) {
                                        PROC_LOCK(cbe->userproc);
                                        psignal(cbe->userproc,
                                            cbe->uaiocb.aio_sigevent.sigev_signo);
                                        PROC_UNLOCK(cbe->userproc);
                                }
                                if (uap->aiocbp)
                                        break;
                        }
                }
                splx(s);

                if ((cancelled) && (uap->aiocbp)) {
                        td->td_retval[0] = AIO_CANCELED;
                        return 0;
                }
        }
        ki = p->p_aioinfo;
        s = splnet();

        for (cbe = TAILQ_FIRST(&ki->kaio_jobqueue); cbe; cbe = cbn) {
                cbn = TAILQ_NEXT(cbe, plist);

                if ((uap->fd == cbe->uaiocb.aio_fildes) &&
                    ((uap->aiocbp == NULL) ||
                    (uap->aiocbp == cbe->uuaiocb))) {

                        if (cbe->jobstate == JOBST_JOBQGLOBAL) {
                                TAILQ_REMOVE(&aio_jobs, cbe, list);
                                TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
                                TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe,
                                    plist);
                                cancelled++;
                                ki->kaio_queue_finished_count++;
                                cbe->jobstate = JOBST_JOBFINISHED;
                                cbe->uaiocb._aiocb_private.status = -1;
                                cbe->uaiocb._aiocb_private.error = ECANCELED;
                                /* XXX cancelled, knote? */
                                if (cbe->uaiocb.aio_sigevent.sigev_notify ==
                                    SIGEV_SIGNAL) {
                                        PROC_LOCK(cbe->userproc);
                                        psignal(cbe->userproc,
                                            cbe->uaiocb.aio_sigevent.sigev_signo);
                                        PROC_UNLOCK(cbe->userproc);
                                }
                        } else {
                                notcancelled++;
                        }
                }
        }
        splx(s);

        if (notcancelled) {
                td->td_retval[0] = AIO_NOTCANCELED;
                return 0;
        }
        if (cancelled) {
                td->td_retval[0] = AIO_CANCELED;
                return 0;
        }
        td->td_retval[0] = AIO_ALLDONE;

        return 0;
}
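/*
 * Illustrative interpretation of the result (userland sketch):
 *
 *      switch (aio_cancel(fd, &acb)) {
 *      case AIO_CANCELED:      request was removed before it ran
 *      case AIO_NOTCANCELED:   still in progress; poll aio_error()
 *      case AIO_ALLDONE:       already complete; reap with aio_return()
 *      }
 *
 * Note that, per the vn_isdisk() test above, raw-disk (physio) requests
 * are never cancelled; AIO_NOTCANCELED is returned for them immediately.
 */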
/*
 * aio_error is implemented in the kernel for compatibility purposes only.
 * For a user mode async implementation, it would be best to do it in a
 * userland subroutine.
 */
int
aio_error(struct thread *td, struct aio_error_args *uap)
{
        struct proc *p = td->td_proc;
        int s;
        struct aiocblist *cb;
        struct kaioinfo *ki;
        int jobref;

        ki = p->p_aioinfo;
        if (ki == NULL)
                return EINVAL;

        jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo);
        if ((jobref == -1) || (jobref == 0))
                return EINVAL;

        TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
                if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
                    jobref) {
                        td->td_retval[0] = cb->uaiocb._aiocb_private.error;
                        return 0;
                }
        }

        s = splnet();

        for (cb = TAILQ_FIRST(&ki->kaio_jobqueue); cb; cb = TAILQ_NEXT(cb,
            plist)) {
                if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
                    jobref) {
                        td->td_retval[0] = EINPROGRESS;
                        splx(s);
                        return 0;
                }
        }

        for (cb = TAILQ_FIRST(&ki->kaio_sockqueue); cb; cb = TAILQ_NEXT(cb,
            plist)) {
                if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
                    jobref) {
                        td->td_retval[0] = EINPROGRESS;
                        splx(s);
                        return 0;
                }
        }
        splx(s);

        s = splbio();
        for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = TAILQ_NEXT(cb,
            plist)) {
                if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
                    jobref) {
                        td->td_retval[0] = cb->uaiocb._aiocb_private.error;
                        splx(s);
                        return 0;
                }
        }

        for (cb = TAILQ_FIRST(&ki->kaio_bufqueue); cb; cb = TAILQ_NEXT(cb,
            plist)) {
                if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
                    jobref) {
                        td->td_retval[0] = EINPROGRESS;
                        splx(s);
                        return 0;
                }
        }
        splx(s);

#if (0)
        /*
         * Hack for lio.
         */
        status = fuword(&uap->aiocbp->_aiocb_private.status);
        if (status == -1)
                return fuword(&uap->aiocbp->_aiocb_private.error);
#endif
        return EINVAL;
}

int
aio_read(struct thread *td, struct aio_read_args *uap)
{

        return aio_aqueue(td, uap->aiocbp, LIO_READ);
}

int
aio_write(struct thread *td, struct aio_write_args *uap)
{

        return aio_aqueue(td, uap->aiocbp, LIO_WRITE);
}
int
lio_listio(struct thread *td, struct lio_listio_args *uap)
{
        struct proc *p = td->td_proc;
        int nent, nentqueued;
        struct aiocb *iocb, * const *cbptr;
        struct aiocblist *cb;
        struct kaioinfo *ki;
        struct aio_liojob *lj;
        int error, runningcode;
        int nerror;
        int i;
        int s;

        if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
                return EINVAL;

        nent = uap->nent;
        if (nent > AIO_LISTIO_MAX)
                return EINVAL;

        if (p->p_aioinfo == NULL)
                aio_init_aioinfo(p);

        if ((nent + num_queue_count) > max_queue_count)
                return EAGAIN;

        ki = p->p_aioinfo;
        if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count)
                return EAGAIN;

        lj = zalloc(aiolio_zone);
        if (!lj)
                return EAGAIN;

        lj->lioj_flags = 0;
        lj->lioj_buffer_count = 0;
        lj->lioj_buffer_finished_count = 0;
        lj->lioj_queue_count = 0;
        lj->lioj_queue_finished_count = 0;
        lj->lioj_ki = ki;

        /*
         * Setup signal.
         */
        if (uap->sig && (uap->mode == LIO_NOWAIT)) {
                error = copyin(uap->sig, &lj->lioj_signal,
                    sizeof(lj->lioj_signal));
                if (error) {
                        zfree(aiolio_zone, lj);
                        return error;
                }
                if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
                        zfree(aiolio_zone, lj);
                        return EINVAL;
                }
                lj->lioj_flags |= LIOJ_SIGNAL;
                lj->lioj_flags &= ~LIOJ_SIGNAL_POSTED;
        } else
                lj->lioj_flags &= ~LIOJ_SIGNAL;

        TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
        /*
         * Get pointers to the list of I/O requests.
         */
        nerror = 0;
        nentqueued = 0;
        cbptr = uap->acb_list;
        for (i = 0; i < uap->nent; i++) {
                iocb = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]);
                if (((intptr_t)iocb != -1) && ((intptr_t)iocb != NULL)) {
                        error = _aio_aqueue(td, iocb, lj, 0);
                        if (error == 0)
                                nentqueued++;
                        else
                                nerror++;
                }
        }

        /*
         * If we haven't queued any, then just return error.
         */
        if (nentqueued == 0)
                return 0;

        /*
         * Calculate the appropriate error return.
         */
        runningcode = 0;
        if (nerror)
                runningcode = EIO;

        if (uap->mode == LIO_WAIT) {
                int command, found, jobref;

                for (;;) {
                        found = 0;
                        for (i = 0; i < uap->nent; i++) {
                                /*
                                 * Fetch address of the control buf pointer in
                                 * user space.
                                 */
                                iocb = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]);
                                if (((intptr_t)iocb == -1) || ((intptr_t)iocb
                                    == 0))
                                        continue;

                                /*
                                 * Fetch the associated command from user
                                 * space.
                                 */
                                command = fuword(&iocb->aio_lio_opcode);
                                if (command == LIO_NOP) {
                                        found++;
                                        continue;
                                }

                                jobref = fuword(&iocb->_aiocb_private.kernelinfo);

                                TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
                                        if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
                                            == jobref) {
                                                if (cb->uaiocb.aio_lio_opcode
                                                    == LIO_WRITE) {
                                                        p->p_stats->p_ru.ru_oublock
                                                            +=
                                                            cb->outputcharge;
                                                        cb->outputcharge = 0;
                                                } else if (cb->uaiocb.aio_lio_opcode
                                                    == LIO_READ) {
                                                        p->p_stats->p_ru.ru_inblock
                                                            += cb->inputcharge;
                                                        cb->inputcharge = 0;
                                                }
                                                found++;
                                                break;
                                        }
                                }

                                s = splbio();
                                TAILQ_FOREACH(cb, &ki->kaio_bufdone, plist) {
                                        if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
                                            == jobref) {
                                                found++;
                                                break;
                                        }
                                }
                                splx(s);
                        }

                        /*
                         * If all I/Os have been disposed of, then we can
                         * return.
                         */
                        if (found == nentqueued)
                                return runningcode;

                        ki->kaio_flags |= KAIO_WAKEUP;
                        error = tsleep(p, PRIBIO | PCATCH, "aiospn", 0);

                        if (error == EINTR)
                                return EINTR;
                        else if (error == EWOULDBLOCK)
                                return EAGAIN;
                }
        }

        return runningcode;
}
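/*
 * Illustrative lio_listio() submission (userland sketch; per-aiocb setup
 * and error handling elided):
 *
 *      struct aiocb acb[2], *list[2] = { &acb[0], &acb[1] };
 *
 *      acb[0].aio_lio_opcode = LIO_READ;
 *      acb[1].aio_lio_opcode = LIO_WRITE;
 *      lio_listio(LIO_WAIT, list, 2, NULL);    block until both are done
 *
 * With LIO_NOWAIT and a struct sigevent as the last argument, the signal
 * set up above is posted once every request in the list has completed.
 */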
/*
 * This is a weird hack so that we can post a signal.  It is safe to do so
 * from a timeout routine, but *not* from an interrupt routine.
 */
static void
process_signal(void *aioj)
{
        struct aiocblist *aiocbe = aioj;
        struct aio_liojob *lj = aiocbe->lio;
        struct aiocb *cb = &aiocbe->uaiocb;

        if ((lj) && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) &&
            (lj->lioj_queue_count == lj->lioj_queue_finished_count)) {
                PROC_LOCK(lj->lioj_ki->kaio_p);
                psignal(lj->lioj_ki->kaio_p, lj->lioj_signal.sigev_signo);
                PROC_UNLOCK(lj->lioj_ki->kaio_p);
                lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
        }

        if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
                PROC_LOCK(aiocbe->userproc);
                psignal(aiocbe->userproc, cb->aio_sigevent.sigev_signo);
                PROC_UNLOCK(aiocbe->userproc);
        }
}

/*
 * Interrupt handler for physio; performs the necessary process wakeups and
 * signals.
 */
static void
aio_physwakeup(struct buf *bp)
{
        struct aiocblist *aiocbe;
        struct proc *p;
        struct kaioinfo *ki;
        struct aio_liojob *lj;

        wakeup(bp);

        aiocbe = (struct aiocblist *)bp->b_spc;
        if (aiocbe) {
                p = bp->b_caller1;

                aiocbe->jobstate = JOBST_JOBBFINISHED;
                aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
                aiocbe->uaiocb._aiocb_private.error = 0;
                aiocbe->jobflags |= AIOCBLIST_DONE;

                if (bp->b_ioflags & BIO_ERROR)
                        aiocbe->uaiocb._aiocb_private.error = bp->b_error;

                lj = aiocbe->lio;
                if (lj) {
                        lj->lioj_buffer_finished_count++;

                        /*
                         * wakeup/signal if all of the interrupt jobs are done.
                         */
                        if (lj->lioj_buffer_finished_count ==
                            lj->lioj_buffer_count) {
                                /*
                                 * Post a signal if it is called for.
                                 */
                                if ((lj->lioj_flags &
                                    (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) ==
                                    LIOJ_SIGNAL) {
                                        lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
                                        aiocbe->timeouthandle =
                                            timeout(process_signal,
                                            aiocbe, 0);
                                }
                        }
                }

                ki = p->p_aioinfo;
                if (ki) {
                        ki->kaio_buffer_finished_count++;
                        TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
                        TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
                        TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);

                        KNOTE(&aiocbe->klist, 0);
                        /* Do the wakeup. */
                        if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) {
                                ki->kaio_flags &= ~KAIO_WAKEUP;
                                wakeup(p);
                        }
                }

                if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL)
                        aiocbe->timeouthandle =
                            timeout(process_signal, aiocbe, 0);
        }
}
int
aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap)
{
        struct proc *p = td->td_proc;
        struct timeval atv;
        struct timespec ts;
        struct aiocb **cbptr;
        struct kaioinfo *ki;
        struct aiocblist *cb = NULL;
        int error, s, timo;

        suword(uap->aiocbp, (int)NULL);

        timo = 0;
        if (uap->timeout) {
                /* Get timespec struct. */
                error = copyin(uap->timeout, &ts, sizeof(ts));
                if (error)
                        return error;

                if ((ts.tv_nsec < 0) || (ts.tv_nsec >= 1000000000))
                        return (EINVAL);

                TIMESPEC_TO_TIMEVAL(&atv, &ts);
                if (itimerfix(&atv))
                        return (EINVAL);
                timo = tvtohz(&atv);
        }

        ki = p->p_aioinfo;
        if (ki == NULL)
                return EAGAIN;

        cbptr = uap->aiocbp;

        for (;;) {
                if ((cb = TAILQ_FIRST(&ki->kaio_jobdone)) != 0) {
                        suword(uap->aiocbp, (uintptr_t)cb->uuaiocb);
                        td->td_retval[0] = cb->uaiocb._aiocb_private.status;
                        if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
                                p->p_stats->p_ru.ru_oublock +=
                                    cb->outputcharge;
                                cb->outputcharge = 0;
                        } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
                                p->p_stats->p_ru.ru_inblock += cb->inputcharge;
                                cb->inputcharge = 0;
                        }
                        aio_free_entry(cb);
                        return cb->uaiocb._aiocb_private.error;
                }

                s = splbio();
                if ((cb = TAILQ_FIRST(&ki->kaio_bufdone)) != 0) {
                        splx(s);
                        suword(uap->aiocbp, (uintptr_t)cb->uuaiocb);
                        td->td_retval[0] = cb->uaiocb._aiocb_private.status;
                        aio_free_entry(cb);
                        return cb->uaiocb._aiocb_private.error;
                }

                ki->kaio_flags |= KAIO_WAKEUP;
                error = tsleep(p, PRIBIO | PCATCH, "aiowc", timo);
                splx(s);

                if (error == ERESTART)
                        return EINTR;
                else if (error < 0)
                        return error;
                else if (error == EINTR)
                        return EINTR;
                else if (error == EWOULDBLOCK)
                        return EAGAIN;
        }
}

static int
filt_aioattach(struct knote *kn)
{
        struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id;

        /*
         * The aiocbe pointer must be validated before using it, so
         * registration is restricted to the kernel; the user cannot
         * set EV_FLAG1.
         */
        if ((kn->kn_flags & EV_FLAG1) == 0)
                return (EPERM);
        kn->kn_flags &= ~EV_FLAG1;

        SLIST_INSERT_HEAD(&aiocbe->klist, kn, kn_selnext);

        return (0);
}

static void
filt_aiodetach(struct knote *kn)
{
        struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id;

        SLIST_REMOVE(&aiocbe->klist, kn, knote, kn_selnext);
}

/*ARGSUSED*/
static int
filt_aio(struct knote *kn, long hint)
{
        struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id;

        kn->kn_data = aiocbe->uaiocb._aiocb_private.error;
        if (aiocbe->jobstate != JOBST_JOBFINISHED &&
            aiocbe->jobstate != JOBST_JOBBFINISHED)
                return (0);
        kn->kn_flags |= EV_EOF;
        return (1);
}
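/*
 * Illustrative kevent-based completion delivery (userland sketch of the
 * SIGEV_KEVENT path registered in _aio_aqueue() and filtered above):
 *
 *      struct aiocb acb;
 *      struct kevent ev;
 *      int kq = kqueue();
 *
 *      memset(&acb, 0, sizeof(acb));
 *      acb.aio_sigevent.sigev_notify = SIGEV_KEVENT;
 *      acb.aio_sigevent.sigev_notify_kqueue = kq;
 *      acb.aio_sigevent.sigev_value.sigval_ptr = &acb;
 *      ... fill in aio_fildes/aio_buf/aio_nbytes, then aio_read(&acb) ...
 *      kevent(kq, NULL, 0, &ev, 1, NULL);      EVFILT_AIO fires on completion
 *      ... ev.udata == &acb; reap with aio_return(&acb) ...
 */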