/*
 * Copyright (c) 1997 John S. Dyson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. John S. Dyson's name may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * DISCLAIMER:  This code isn't warranted to do anything useful.  Anything
 * bad that happens because of using this software isn't the responsibility
 * of the author.  This software is distributed AS-IS.
 *
 * $FreeBSD$
 */

/*
 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/lock.h>
#include <sys/unistd.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/conf.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_zone.h>
#include <sys/aio.h>

#include <machine/limits.h>

static long jobrefid;

#define JOBST_NULL		0x0
#define JOBST_JOBQPROC		0x1
#define JOBST_JOBQGLOBAL	0x2
#define JOBST_JOBRUNNING	0x3
#define JOBST_JOBFINISHED	0x4
#define JOBST_JOBQBUF		0x5
#define JOBST_JOBBFINISHED	0x6

#ifndef MAX_AIO_PER_PROC
#define MAX_AIO_PER_PROC	32
#endif

#ifndef MAX_AIO_QUEUE_PER_PROC
#define MAX_AIO_QUEUE_PER_PROC	256	/* Bigger than AIO_LISTIO_MAX */
#endif

#ifndef MAX_AIO_PROCS
#define MAX_AIO_PROCS		32
#endif

#ifndef MAX_AIO_QUEUE
#define MAX_AIO_QUEUE		1024	/* Bigger than AIO_LISTIO_MAX */
#endif

#ifndef TARGET_AIO_PROCS
#define TARGET_AIO_PROCS	0
#endif

#ifndef MAX_BUF_AIO
#define MAX_BUF_AIO		16
#endif

#ifndef AIOD_TIMEOUT_DEFAULT
#define AIOD_TIMEOUT_DEFAULT	(10 * hz)
#endif

#ifndef AIOD_LIFETIME_DEFAULT
#define AIOD_LIFETIME_DEFAULT	(30 * hz)
#endif

static int max_aio_procs = MAX_AIO_PROCS;
static int num_aio_procs = 0;
static int target_aio_procs = TARGET_AIO_PROCS;
static int max_queue_count = MAX_AIO_QUEUE;
static int num_queue_count = 0;
static int num_buf_aio = 0;
static int num_aio_resv_start = 0;
static int aiod_timeout;
static int aiod_lifetime;

static int max_aio_per_proc = MAX_AIO_PER_PROC,
	max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;

static int max_buf_aio = MAX_BUF_AIO;

SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "AIO mgmt");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc,
	CTLFLAG_RW, &max_aio_per_proc, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc,
	CTLFLAG_RW, &max_aio_queue_per_proc, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs,
	CTLFLAG_RW, &max_aio_procs, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs,
	CTLFLAG_RD, &num_aio_procs, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count,
	CTLFLAG_RD, &num_queue_count, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue,
	CTLFLAG_RW, &max_queue_count, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs,
	CTLFLAG_RW, &target_aio_procs, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio,
	CTLFLAG_RW, &max_buf_aio, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio,
	CTLFLAG_RD, &num_buf_aio, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime,
	CTLFLAG_RW, &aiod_lifetime, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout,
	CTLFLAG_RW, &aiod_timeout, 0, "");
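
/*
 * The knobs above are exported as integers under the vfs.aio sysctl node
 * (read-only for the counters), so the limits can be inspected and tuned at
 * run time, e.g. (illustrative invocation):
 *
 *	sysctl -w vfs.aio.max_aio_per_proc=64
 *
 * The compiled-in defaults come from the MAX_*, TARGET_* and AIOD_* macros
 * above and may be overridden in the kernel configuration.
 */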

/*
 * Job queue item
 */

#define AIOCBLIST_CANCELLED	0x1
#define AIOCBLIST_RUNDOWN	0x4
#define AIOCBLIST_ASYNCFREE	0x8
#define AIOCBLIST_DONE		0x10

struct aiocblist {
	TAILQ_ENTRY (aiocblist) list;		/* List of jobs */
	TAILQ_ENTRY (aiocblist) plist;		/* List of jobs for proc */
	int	jobflags;
	int	jobstate;
	int	inputcharge, outputcharge;
	struct	buf *bp;			/* buffer pointer */
	struct	proc *userproc;			/* User process */
	struct	aioproclist *jobaioproc;	/* AIO process descriptor */
	struct	aio_liojob *lio;		/* optional lio job */
	struct	aiocb *uuaiocb;			/* pointer in userspace of aiocb */
	struct	aiocb uaiocb;			/* Kernel I/O control block */
};


/*
 * AIO process info
 */
#define AIOP_FREE	0x1		/* proc on free queue */
#define AIOP_SCHED	0x2		/* proc explicitly scheduled */

struct aioproclist {
	int aioprocflags;			/* AIO proc flags */
	TAILQ_ENTRY(aioproclist) list;		/* List of processes */
	struct proc *aioproc;			/* The AIO thread */
	TAILQ_HEAD (,aiocblist) jobtorun;	/* suggested job to run */
};

/*
 * data-structure for lio signal management
 */
struct aio_liojob {
	int	lioj_flags;
	int	lioj_buffer_count;
	int	lioj_buffer_finished_count;
	int	lioj_queue_count;
	int	lioj_queue_finished_count;
	struct	sigevent lioj_signal;		/* signal on all I/O done */
	TAILQ_ENTRY (aio_liojob) lioj_list;
	struct	kaioinfo *lioj_ki;
};
#define LIOJ_SIGNAL		0x1	/* signal on all done (lio) */
#define LIOJ_SIGNAL_POSTED	0x2	/* signal has been posted */

/*
 * per process aio data structure
 */
struct kaioinfo {
	int	kaio_flags;		/* per process kaio flags */
	int	kaio_maxactive_count;	/* maximum number of AIOs */
	int	kaio_active_count;	/* number of currently used AIOs */
	int	kaio_qallowed_count;	/* maximum size of AIO queue */
	int	kaio_queue_count;	/* size of AIO queue */
	int	kaio_ballowed_count;	/* maximum number of buffers */
	int	kaio_queue_finished_count; /* number of daemon jobs finished */
	int	kaio_buffer_count;	/* number of physio buffers */
	int	kaio_buffer_finished_count; /* count of I/O done */
	struct	proc *kaio_p;		/* process that uses this kaio block */
	TAILQ_HEAD (,aio_liojob) kaio_liojoblist; /* list of lio jobs */
	TAILQ_HEAD (,aiocblist) kaio_jobqueue;	/* job queue for process */
	TAILQ_HEAD (,aiocblist) kaio_jobdone;	/* done queue for process */
	TAILQ_HEAD (,aiocblist) kaio_bufqueue;	/* buffer job queue for process */
	TAILQ_HEAD (,aiocblist) kaio_bufdone;	/* buffer done queue for process */
};

#define KAIO_RUNDOWN	0x1	/* process is being run down */
#define KAIO_WAKEUP	0x2	/* wakeup process when there is a significant
				   event */


static TAILQ_HEAD (,aioproclist) aio_freeproc, aio_activeproc;
static TAILQ_HEAD(,aiocblist) aio_jobs;		/* Async job list */
static TAILQ_HEAD(,aiocblist) aio_bufjobs;	/* Phys I/O job list */
static TAILQ_HEAD(,aiocblist) aio_freejobs;	/* Pool of free jobs */
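
/*
 * Rough job lifecycle, as implemented below: an aiocblist starts on the
 * aio_freejobs pool (JOBST_NULL).  A daemon-serviced request is placed on
 * the system-wide aio_jobs list and the owner's kaio_jobqueue
 * (JOBST_JOBQGLOBAL; JOBST_JOBQPROC marks a job parked on a specific
 * daemon's jobtorun list), becomes JOBST_JOBRUNNING while an aiod works on
 * it, and ends up on kaio_jobdone as JOBST_JOBFINISHED until aio_return()
 * reaps it.  Raw physio requests instead go to aio_bufjobs/kaio_bufqueue
 * (JOBST_JOBQBUF) and are moved to kaio_bufdone (JOBST_JOBBFINISHED) by
 * aio_physwakeup() when the buf completes.
 */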

static void aio_init_aioinfo(struct proc *p);
static void aio_onceonly(void *);
static int aio_free_entry(struct aiocblist *aiocbe);
static void aio_process(struct aiocblist *aiocbe);
static int aio_newproc(void);
static int aio_aqueue(struct proc *p, struct aiocb *job, int type);
static void aio_physwakeup(struct buf *bp);
static int aio_fphysio(struct proc *p, struct aiocblist *aiocbe, int type);
static int aio_qphysio(struct proc *p, struct aiocblist *iocb);
static void aio_daemon(void *uproc);

SYSINIT(aio, SI_SUB_VFS, SI_ORDER_ANY, aio_onceonly, NULL);

static vm_zone_t kaio_zone = 0, aiop_zone = 0,
	aiocb_zone = 0, aiol_zone = 0, aiolio_zone = 0;

/*
 * Startup initialization
 */
void
aio_onceonly(void *na)
{
	TAILQ_INIT(&aio_freeproc);
	TAILQ_INIT(&aio_activeproc);
	TAILQ_INIT(&aio_jobs);
	TAILQ_INIT(&aio_bufjobs);
	TAILQ_INIT(&aio_freejobs);
	kaio_zone = zinit("AIO", sizeof (struct kaioinfo), 0, 0, 1);
	aiop_zone = zinit("AIOP", sizeof (struct aioproclist), 0, 0, 1);
	aiocb_zone = zinit("AIOCB", sizeof (struct aiocblist), 0, 0, 1);
	aiol_zone = zinit("AIOL", AIO_LISTIO_MAX * sizeof (int), 0, 0, 1);
	aiolio_zone = zinit("AIOLIO",
		AIO_LISTIO_MAX * sizeof (struct aio_liojob), 0, 0, 1);
	aiod_timeout = AIOD_TIMEOUT_DEFAULT;
	aiod_lifetime = AIOD_LIFETIME_DEFAULT;
	jobrefid = 1;
}

/*
 * Init the per-process aioinfo structure.
 * The aioinfo limits are set per-process for user limit (resource) management.
 */
void
aio_init_aioinfo(struct proc *p)
{
	struct kaioinfo *ki;
	if (p->p_aioinfo == NULL) {
		ki = zalloc(kaio_zone);
		p->p_aioinfo = ki;
		ki->kaio_flags = 0;
		ki->kaio_maxactive_count = max_aio_per_proc;
		ki->kaio_active_count = 0;
		ki->kaio_qallowed_count = max_aio_queue_per_proc;
		ki->kaio_queue_count = 0;
		ki->kaio_ballowed_count = max_buf_aio;
		ki->kaio_buffer_count = 0;
		ki->kaio_buffer_finished_count = 0;
		ki->kaio_p = p;
		TAILQ_INIT(&ki->kaio_jobdone);
		TAILQ_INIT(&ki->kaio_jobqueue);
		TAILQ_INIT(&ki->kaio_bufdone);
		TAILQ_INIT(&ki->kaio_bufqueue);
		TAILQ_INIT(&ki->kaio_liojoblist);
	}
}
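
/*
 * There is no explicit "open" of the AIO facility: the kaioinfo block is
 * created lazily (aio_aqueue() and lio_listio() below call
 * aio_init_aioinfo() the first time a process queues a request) and is
 * torn down again by aio_proc_rundown() when the process is run down.
 */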

/*
 * Free a job entry.  Wait for completion if it is currently
 * active, but don't delay forever.  If we delay, we return
 * a flag that says that we have to restart the queue scan.
 */
int
aio_free_entry(struct aiocblist *aiocbe)
{
	struct kaioinfo *ki;
	struct aioproclist *aiop;
	struct aio_liojob *lj;
	struct proc *p;
	int error;
	int s;

	if (aiocbe->jobstate == JOBST_NULL)
		panic("aio_free_entry: freeing already free job");

	p = aiocbe->userproc;
	ki = p->p_aioinfo;
	lj = aiocbe->lio;
	if (ki == NULL)
		panic("aio_free_entry: missing p->p_aioinfo");

	if (aiocbe->jobstate == JOBST_JOBRUNNING) {
		if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE)
			return 0;
		aiocbe->jobflags |= AIOCBLIST_RUNDOWN;
		tsleep(aiocbe, PRIBIO|PCATCH, "jobwai", 0);
	}
	aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;

	if (aiocbe->bp == NULL) {
		if (ki->kaio_queue_count <= 0)
			panic("aio_free_entry: process queue size <= 0");
		if (num_queue_count <= 0)
			panic("aio_free_entry: system wide queue size <= 0");

		if (lj) {
			lj->lioj_queue_count--;
			if (aiocbe->jobflags & AIOCBLIST_DONE)
				lj->lioj_queue_finished_count--;
		}
		ki->kaio_queue_count--;
		if (aiocbe->jobflags & AIOCBLIST_DONE)
			ki->kaio_queue_finished_count--;
		num_queue_count--;

	} else {
		if (lj) {
			lj->lioj_buffer_count--;
			if (aiocbe->jobflags & AIOCBLIST_DONE)
				lj->lioj_buffer_finished_count--;
		}
		if (aiocbe->jobflags & AIOCBLIST_DONE)
			ki->kaio_buffer_finished_count--;
		ki->kaio_buffer_count--;
		num_buf_aio--;

	}

	if ((ki->kaio_flags & KAIO_WAKEUP) ||
	    ((ki->kaio_flags & KAIO_RUNDOWN) &&
	    ((ki->kaio_buffer_count == 0) &&
	    (ki->kaio_queue_count == 0)))) {
		ki->kaio_flags &= ~KAIO_WAKEUP;
		wakeup(p);
	}

	if (aiocbe->jobstate == JOBST_JOBQBUF) {
		if ((error = aio_fphysio(p, aiocbe, 1)) != 0)
			return error;
		if (aiocbe->jobstate != JOBST_JOBBFINISHED)
			panic("aio_free_entry: invalid physio finish-up state");
		s = splbio();
		TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
		splx(s);
	} else if (aiocbe->jobstate == JOBST_JOBQPROC) {
		aiop = aiocbe->jobaioproc;
		TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list);
	} else if (aiocbe->jobstate == JOBST_JOBQGLOBAL) {
		TAILQ_REMOVE(&aio_jobs, aiocbe, list);
	} else if (aiocbe->jobstate == JOBST_JOBFINISHED) {
		TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist);
	} else if (aiocbe->jobstate == JOBST_JOBBFINISHED) {
		s = splbio();
		TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
		splx(s);
		if (aiocbe->bp) {
			vunmapbuf(aiocbe->bp);
			relpbuf(aiocbe->bp, NULL);
			aiocbe->bp = NULL;
		}
	}
	if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) {
		TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
		zfree(aiolio_zone, lj);
	}
	TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
	aiocbe->jobstate = JOBST_NULL;
	return 0;
}

/*
 * Rundown the jobs for a given process.
 */
void
aio_proc_rundown(struct proc *p)
{
	int s;
	struct kaioinfo *ki;
	struct aio_liojob *lj, *ljn;
	struct aiocblist *aiocbe, *aiocbn;

	ki = p->p_aioinfo;
	if (ki == NULL)
		return;

	ki->kaio_flags |= LIOJ_SIGNAL_POSTED;
	while ((ki->kaio_active_count > 0) ||
	    (ki->kaio_buffer_count > ki->kaio_buffer_finished_count)) {
		ki->kaio_flags |= KAIO_RUNDOWN;
		if (tsleep(p, PRIBIO, "kaiowt", aiod_timeout))
			break;
	}

restart1:
	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobdone);
	     aiocbe;
	     aiocbe = aiocbn) {
		aiocbn = TAILQ_NEXT(aiocbe, plist);
		if (aio_free_entry(aiocbe))
			goto restart1;
	}

restart2:
	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue);
	     aiocbe;
	     aiocbe = aiocbn) {
		aiocbn = TAILQ_NEXT(aiocbe, plist);
		if (aio_free_entry(aiocbe))
			goto restart2;
	}

/*
 * Note the use of lots of splbio here, trying to avoid
 * splbio for long chains of I/O.  Probably unnecessary.
 */

restart3:
	s = splbio();
	while (TAILQ_FIRST(&ki->kaio_bufqueue)) {
		ki->kaio_flags |= KAIO_WAKEUP;
		tsleep(p, PRIBIO, "aioprn", 0);
		splx(s);
		goto restart3;
	}
	splx(s);

restart4:
	s = splbio();
	for (aiocbe = TAILQ_FIRST(&ki->kaio_bufdone);
	     aiocbe;
	     aiocbe = aiocbn) {
		aiocbn = TAILQ_NEXT(aiocbe, plist);
		if (aio_free_entry(aiocbe)) {
			splx(s);
			goto restart4;
		}
	}
	splx(s);

	for (lj = TAILQ_FIRST(&ki->kaio_liojoblist);
	     lj;
	     lj = ljn) {
		ljn = TAILQ_NEXT(lj, lioj_list);
		if ((lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) {
			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
			zfree(aiolio_zone, lj);
		} else {
#if defined(DIAGNOSTIC)
			printf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, QF:%d\n",
			    lj->lioj_buffer_count, lj->lioj_buffer_finished_count,
			    lj->lioj_queue_count, lj->lioj_queue_finished_count);
#endif
		}
	}

	zfree(kaio_zone, ki);
	p->p_aioinfo = NULL;
}

/*
 * Select a job to run (called by an AIO daemon)
 */
static struct aiocblist *
aio_selectjob(struct aioproclist *aiop)
{

	struct aiocblist *aiocbe;

	aiocbe = TAILQ_FIRST(&aiop->jobtorun);
	if (aiocbe) {
		TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list);
		return aiocbe;
	}

	for (aiocbe = TAILQ_FIRST(&aio_jobs);
	     aiocbe;
	     aiocbe = TAILQ_NEXT(aiocbe, list)) {
		struct kaioinfo *ki;
		struct proc *userp;

		userp = aiocbe->userproc;
		ki = userp->p_aioinfo;

		if (ki->kaio_active_count < ki->kaio_maxactive_count) {
			TAILQ_REMOVE(&aio_jobs, aiocbe, list);
			return aiocbe;
		}
	}

	return NULL;
}

/*
 * The AIO processing activity.  This is the code that does the
 * I/O request for the non-physio version of the operations.  The
 * normal vn operations are used, and this code should work in
 * all instances for every type of file, including pipes, sockets,
 * fifos, and regular files.
 */
void
aio_process(struct aiocblist *aiocbe)
{
	struct filedesc *fdp;
	struct proc *userp, *mycp;
	struct aiocb *cb;
	struct file *fp;
	struct uio auio;
	struct iovec aiov;
	unsigned int fd;
	int cnt;
	int error;
	off_t offset;
	int oublock_st, oublock_end;
	int inblock_st, inblock_end;

	userp = aiocbe->userproc;
	cb = &aiocbe->uaiocb;

	mycp = curproc;

	fdp = mycp->p_fd;
	fd = cb->aio_fildes;
	fp = fdp->fd_ofiles[fd];

	aiov.iov_base = (void *)cb->aio_buf;
	aiov.iov_len = cb->aio_nbytes;

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset = cb->aio_offset;
	auio.uio_resid = cb->aio_nbytes;
	cnt = cb->aio_nbytes;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = mycp;

	inblock_st = mycp->p_stats->p_ru.ru_inblock;
	oublock_st = mycp->p_stats->p_ru.ru_oublock;
	if (cb->aio_lio_opcode == LIO_READ) {
		auio.uio_rw = UIO_READ;
		error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, mycp);
	} else {
		auio.uio_rw = UIO_WRITE;
		error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, mycp);
	}
	inblock_end = mycp->p_stats->p_ru.ru_inblock;
	oublock_end = mycp->p_stats->p_ru.ru_oublock;

	aiocbe->inputcharge = inblock_end - inblock_st;
	aiocbe->outputcharge = oublock_end - oublock_st;

	if (error) {
		if (auio.uio_resid != cnt) {
			if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
				error = 0;
			if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE))
				psignal(userp, SIGPIPE);
		}
	}

	cnt -= auio.uio_resid;
	cb->_aiocb_private.error = error;
	cb->_aiocb_private.status = cnt;

	return;

}

/*
 * The AIO daemon, most of the actual work is done in aio_process,
 * but the setup (and address space mgmt) is done in this routine.
 */
static void
aio_daemon(void *uproc)
{
	int s;
	struct aioproclist *aiop;
	struct vmspace *myvm;
	struct proc *mycp;

	/*
	 * Local copies of curproc (cp) and vmspace (myvm)
	 */
	mycp = curproc;
	myvm = mycp->p_vmspace;

	if (mycp->p_textvp) {
		vrele(mycp->p_textvp);
		mycp->p_textvp = NULL;
	}

	/*
	 * Allocate and ready the aio control info.  There is one
	 * aiop structure per daemon.
	 */
	aiop = zalloc(aiop_zone);
	aiop->aioproc = mycp;
	aiop->aioprocflags |= AIOP_FREE;
	TAILQ_INIT(&aiop->jobtorun);

	/*
	 * Place thread (lightweight process) onto the AIO free thread list
	 */
	if (TAILQ_EMPTY(&aio_freeproc))
		wakeup(&aio_freeproc);
	TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);

	/*
	 * Make up a name for the daemon
	 */
	strcpy(mycp->p_comm, "aiod");

	/*
	 * Get rid of our current filedescriptors.  AIOD's don't need any
	 * filedescriptors, except as temporarily inherited from the client.
	 * Credentials are also cloned, and made equivalent to "root."
	 */
	fdfree(mycp);
	mycp->p_fd = NULL;
	mycp->p_ucred = crcopy(mycp->p_ucred);
	mycp->p_ucred->cr_uid = 0;
	mycp->p_ucred->cr_ngroups = 1;
	mycp->p_ucred->cr_groups[0] = 1;

	/*
	 * The daemon resides in its own pgrp.
	 */
	enterpgrp(mycp, mycp->p_pid, 1);

	/*
	 * Mark special process type
	 */
	mycp->p_flag |= P_SYSTEM|P_KTHREADP;

	/*
	 * Wakeup parent process.  (Parent sleeps to keep from blasting away
	 * and creating too many daemons.)
	 */
	wakeup(mycp);

	while (1) {
		struct proc *curcp;
		struct aiocblist *aiocbe;

		/*
		 * curcp is the current daemon process context.
		 * userp is the current user process context.
		 */
		curcp = mycp;

		/*
		 * Take daemon off of free queue
		 */
		if (aiop->aioprocflags & AIOP_FREE) {
			TAILQ_REMOVE(&aio_freeproc, aiop, list);
			TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
			aiop->aioprocflags &= ~AIOP_FREE;
		}
		aiop->aioprocflags &= ~AIOP_SCHED;

		/*
		 * Check for jobs
		 */
		while ((aiocbe = aio_selectjob(aiop)) != NULL) {
			struct proc *userp;
			struct aiocb *cb;
			struct kaioinfo *ki;
			struct aio_liojob *lj;

			cb = &aiocbe->uaiocb;
			userp = aiocbe->userproc;

			aiocbe->jobstate = JOBST_JOBRUNNING;

			/*
			 * Connect to process address space for user program
			 */
			if (userp != curcp) {
				struct vmspace *tmpvm;
				/*
				 * Save the current address space that we are connected to.
				 */
				tmpvm = mycp->p_vmspace;
				/*
				 * Point to the new user address space, and refer to it.
				 */
				mycp->p_vmspace = userp->p_vmspace;
				mycp->p_vmspace->vm_refcnt++;
				/*
				 * Activate the new mapping.
				 */
				pmap_activate(mycp);
				/*
				 * If the old address space wasn't the daemon's own address
				 * space, then we need to remove the daemon's reference from
				 * the other process that it was acting on behalf of.
				 */
				if (tmpvm != myvm) {
					vmspace_free(tmpvm);
				}
				/*
				 * Disassociate from the previous client's file descriptors,
				 * and associate to the new client's descriptors.  Note that
				 * the daemon doesn't need to worry about its original
				 * descriptors, because they were originally freed.
				 */
				if (mycp->p_fd)
					fdfree(mycp);
				mycp->p_fd = fdshare(userp);
				curcp = userp;
			}

			ki = userp->p_aioinfo;
			lj = aiocbe->lio;

			/*
			 * Account for currently active jobs
			 */
			ki->kaio_active_count++;

			/*
			 * Do the I/O function
			 */
			aiocbe->jobaioproc = aiop;
			aio_process(aiocbe);

			/*
			 * decrement the active job count
			 */
			ki->kaio_active_count--;

			/*
			 * increment the completion count for wakeup/signal comparisons
			 */
			aiocbe->jobflags |= AIOCBLIST_DONE;
			ki->kaio_queue_finished_count++;
			if (lj) {
				lj->lioj_queue_finished_count++;
			}
			if ((ki->kaio_flags & KAIO_WAKEUP) ||
			    ((ki->kaio_flags & KAIO_RUNDOWN) &&
			    (ki->kaio_active_count == 0))) {
				ki->kaio_flags &= ~KAIO_WAKEUP;
				wakeup(userp);
			}

			s = splbio();
			if (lj && (lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) ==
			    LIOJ_SIGNAL) {
				if ((lj->lioj_queue_finished_count == lj->lioj_queue_count) &&
				    (lj->lioj_buffer_finished_count == lj->lioj_buffer_count)) {
					psignal(userp, lj->lioj_signal.sigev_signo);
					lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
				}
			}
			splx(s);

			aiocbe->jobstate = JOBST_JOBFINISHED;

			/*
			 * If the I/O request should be automatically rundown, do the
			 * needed cleanup.  Otherwise, place the queue entry for
			 * the just finished I/O request into the done queue for the
			 * associated client.
			 */
			if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) {
				aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;
				TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
			} else {
				TAILQ_REMOVE(&ki->kaio_jobqueue,
				    aiocbe, plist);
				TAILQ_INSERT_TAIL(&ki->kaio_jobdone,
				    aiocbe, plist);
			}

			if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) {
				wakeup(aiocbe);
				aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN;
			}

			if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
				psignal(userp, cb->aio_sigevent.sigev_signo);
			}
		}

		/*
		 * Disconnect from user address space
		 */
		if (curcp != mycp) {
			struct vmspace *tmpvm;
			/*
			 * Get the user address space to disconnect from.
			 */
			tmpvm = mycp->p_vmspace;
			/*
			 * Get original address space for daemon.
			 */
			mycp->p_vmspace = myvm;
			/*
			 * Activate the daemon's address space.
			 */
			pmap_activate(mycp);
#if defined(DIAGNOSTIC)
			if (tmpvm == myvm)
				printf("AIOD: vmspace problem -- %d\n", mycp->p_pid);
#endif
			/*
			 * remove our vmspace reference.
			 */
			vmspace_free(tmpvm);
			/*
			 * disassociate from the user process's file descriptors.
			 */
			if (mycp->p_fd)
				fdfree(mycp);
			mycp->p_fd = NULL;
			curcp = mycp;
		}

		/*
		 * If we are the first to be put onto the free queue, wakeup
		 * anyone waiting for a daemon.
		 */
		TAILQ_REMOVE(&aio_activeproc, aiop, list);
		if (TAILQ_EMPTY(&aio_freeproc))
			wakeup(&aio_freeproc);
		TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
		aiop->aioprocflags |= AIOP_FREE;

		/*
		 * If daemon is inactive for a long time, allow it to exit, thereby
		 * freeing resources.
		 */
		if (((aiop->aioprocflags & AIOP_SCHED) == 0) &&
		    tsleep(mycp, PRIBIO, "aiordy", aiod_lifetime)) {
			if ((TAILQ_FIRST(&aio_jobs) == NULL) &&
			    (TAILQ_FIRST(&aiop->jobtorun) == NULL)) {
				if ((aiop->aioprocflags & AIOP_FREE) &&
				    (num_aio_procs > target_aio_procs)) {
					TAILQ_REMOVE(&aio_freeproc, aiop, list);
					zfree(aiop_zone, aiop);
					num_aio_procs--;
#if defined(DIAGNOSTIC)
					if (mycp->p_vmspace->vm_refcnt <= 1)
						printf("AIOD: bad vm refcnt for exiting daemon: %d\n",
						    mycp->p_vmspace->vm_refcnt);
#endif
					exit1(mycp, 0);
				}
			}
		}
	}
}

/*
 * Create a new AIO daemon.  This is mostly a kernel-thread fork routine.
 * The AIO daemon modifies its environment itself.
 */
static int
aio_newproc()
{
	int error;
	struct proc *p, *np;

	p = &proc0;
	error = fork1(p, RFPROC|RFMEM|RFNOWAIT, &np);
	if (error)
		return error;
	cpu_set_fork_handler(np, aio_daemon, curproc);

	/*
	 * Wait until daemon is started, but continue on just in case (to
	 * handle error conditions).
	 */
	error = tsleep(np, PZERO, "aiosta", aiod_timeout);
	num_aio_procs++;

	return error;

}
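
/*
 * The physio fast path below is only attempted when the request can be
 * handed straight to the driver: the descriptor must reference a VCHR
 * vnode with a real (non-tty) device that has a block-style cdevsw entry,
 * the transfer must be a multiple of DEV_BSIZE and at most MAXPHYS, and
 * the process must still be under its physio buffer quota
 * (kaio_ballowed_count).  Anything else makes aio_qphysio() return -1, and
 * the request is queued to the aio daemons instead.
 */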

/*
 * Try the high-performance physio method for eligible VCHR devices.  This
 * routine doesn't require the use of any additional threads, and has very
 * low overhead.
 */
int
aio_qphysio(p, aiocbe)
	struct proc *p;
	struct aiocblist *aiocbe;
{
	int error;
	struct aiocb *cb;
	struct file *fp;
	struct buf *bp;
	struct vnode *vp;
	struct kaioinfo *ki;
	struct filedesc *fdp;
	struct aio_liojob *lj;
	int fd;
	int s;
	int cnt;
	struct cdevsw *cdev;

	cb = &aiocbe->uaiocb;
	fdp = p->p_fd;
	fd = cb->aio_fildes;
	fp = fdp->fd_ofiles[fd];

	if (fp->f_type != DTYPE_VNODE) {
		return -1;
	}

	vp = (struct vnode *)fp->f_data;
	if (vp->v_type != VCHR || ((cb->aio_nbytes & (DEV_BSIZE - 1)) != 0)) {
		return -1;
	}

	if ((cb->aio_nbytes > MAXPHYS) && (num_buf_aio >= max_buf_aio)) {
		return -1;
	}

	if ((vp->v_rdev == NULL) || (vp->v_flag & VISTTY)) {
		return -1;
	}

	if (vp->v_rdev == NODEV) {
		return -1;
	}

	cdev = devsw(vp->v_rdev);
	if (cdev == NULL) {
		return -1;
	}

	if (cdev->d_bmaj == -1) {
		return -1;
	}

	ki = p->p_aioinfo;
	if (ki->kaio_buffer_count >= ki->kaio_ballowed_count) {
		return -1;
	}

	cnt = cb->aio_nbytes;
	if (cnt > MAXPHYS) {
		return -1;
	}

	/*
	 * Physical I/O is charged directly to the process, so we don't have
	 * to fake it.
	 */
	aiocbe->inputcharge = 0;
	aiocbe->outputcharge = 0;

	ki->kaio_buffer_count++;

	lj = aiocbe->lio;
	if (lj) {
		lj->lioj_buffer_count++;
	}

	/* create and build a buffer header for a transfer */
	bp = (struct buf *)getpbuf(NULL);

	/*
	 * get a copy of the kva from the physical buffer
	 */
	bp->b_caller1 = p;
	bp->b_dev = vp->v_rdev;
	error = bp->b_error = 0;

	bp->b_bcount = cb->aio_nbytes;
	bp->b_bufsize = cb->aio_nbytes;
	bp->b_flags = B_PHYS | B_CALL;
	bp->b_iodone = aio_physwakeup;
	bp->b_saveaddr = bp->b_data;
	bp->b_data = (void *)cb->aio_buf;
	bp->b_blkno = btodb(cb->aio_offset);

	if (cb->aio_lio_opcode == LIO_WRITE) {
		bp->b_flags |= B_WRITE;
		if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_READ)) {
			error = EFAULT;
			goto doerror;
		}
	} else {
		bp->b_flags |= B_READ;
		if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_WRITE)) {
			error = EFAULT;
			goto doerror;
		}
	}

	/* bring buffer into kernel space */
	vmapbuf(bp);

	s = splbio();
	aiocbe->bp = bp;
	bp->b_spc = (void *)aiocbe;
	TAILQ_INSERT_TAIL(&aio_bufjobs, aiocbe, list);
	TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
	aiocbe->jobstate = JOBST_JOBQBUF;
	cb->_aiocb_private.status = cb->aio_nbytes;
	num_buf_aio++;
	bp->b_error = 0;

	splx(s);
	/* perform transfer */
	BUF_STRATEGY(bp, 0);

	s = splbio();
	/*
	 * If we had an error invoking the request, or an error in processing
	 * the request before we have returned, we process it as an error
	 * in transfer.  Note that such an I/O error is not indicated immediately,
	 * but is returned using the aio_error mechanism.  In this case, aio_suspend
	 * will return immediately.
	 */
	if (bp->b_error || (bp->b_flags & B_ERROR)) {
		struct aiocb *job = aiocbe->uuaiocb;

		aiocbe->uaiocb._aiocb_private.status = 0;
		suword(&job->_aiocb_private.status, 0);
		aiocbe->uaiocb._aiocb_private.error = bp->b_error;
		suword(&job->_aiocb_private.error, bp->b_error);

		ki->kaio_buffer_finished_count++;

		if (aiocbe->jobstate != JOBST_JOBBFINISHED) {
			aiocbe->jobstate = JOBST_JOBBFINISHED;
			aiocbe->jobflags |= AIOCBLIST_DONE;
			TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
			TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
			TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
		}
	}
	splx(s);
	return 0;

doerror:
	ki->kaio_buffer_count--;
	if (lj) {
		lj->lioj_buffer_count--;
	}
	aiocbe->bp = NULL;
	relpbuf(bp, NULL);
	return error;
}

/*
 * This waits/tests physio completion.
 */
int
aio_fphysio(p, iocb, flgwait)
	struct proc *p;
	struct aiocblist *iocb;
	int flgwait;
{
	int s;
	struct buf *bp;
	int error;

	bp = iocb->bp;

	s = splbio();
	if (flgwait == 0) {
		if ((bp->b_flags & B_DONE) == 0) {
			splx(s);
			return EINPROGRESS;
		}
	}

	while ((bp->b_flags & B_DONE) == 0) {
		if (tsleep((caddr_t)bp, PRIBIO, "physstr", aiod_timeout)) {
			if ((bp->b_flags & B_DONE) == 0) {
				splx(s);
				return EINPROGRESS;
			} else {
				break;
			}
		}
	}
	splx(s);

	/* release mapping into kernel space */
	vunmapbuf(bp);
	iocb->bp = 0;

	error = 0;
	/*
	 * check for an error
	 */
	if (bp->b_flags & B_ERROR) {
		error = bp->b_error;
	}

	relpbuf(bp, NULL);
	return (error);
}
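
/*
 * Note on aio_fphysio() above: with flgwait == 0 it only polls, returning
 * EINPROGRESS if the buf has not completed; with a non-zero flgwait it
 * sleeps on the buf (bounded by aiod_timeout per wakeup) and still returns
 * EINPROGRESS if the transfer has not finished.  Otherwise it unmaps and
 * releases the buf and returns the transfer's error status.
 */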

/*
 * Queue a new AIO request.  Choosing either the threaded or direct physio
 * VCHR technique is done in this code.
 */
static int
_aio_aqueue(struct proc *p, struct aiocb *job, struct aio_liojob *lj, int type)
{
	struct filedesc *fdp;
	struct file *fp;
	unsigned int fd;

	int error;
	int opcode;
	struct aiocblist *aiocbe;
	struct aioproclist *aiop;
	struct kaioinfo *ki;

	if ((aiocbe = TAILQ_FIRST(&aio_freejobs)) != NULL) {
		TAILQ_REMOVE(&aio_freejobs, aiocbe, list);
	} else {
		aiocbe = zalloc(aiocb_zone);
	}

	aiocbe->inputcharge = 0;
	aiocbe->outputcharge = 0;

	suword(&job->_aiocb_private.status, -1);
	suword(&job->_aiocb_private.error, 0);
	suword(&job->_aiocb_private.kernelinfo, -1);

	error = copyin((caddr_t)job,
	    (caddr_t)&aiocbe->uaiocb, sizeof aiocbe->uaiocb);
	if (error) {
		suword(&job->_aiocb_private.error, error);

		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		return error;
	}

	/*
	 * Save userspace address of the job info
	 */
	aiocbe->uuaiocb = job;

	/*
	 * Get the opcode
	 */
	if (type != LIO_NOP) {
		aiocbe->uaiocb.aio_lio_opcode = type;
	}
	opcode = aiocbe->uaiocb.aio_lio_opcode;

	/*
	 * Get the fd info for process
	 */
	fdp = p->p_fd;

	/*
	 * Range check file descriptor
	 */
	fd = aiocbe->uaiocb.aio_fildes;
	if (fd >= fdp->fd_nfiles) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0) {
			suword(&job->_aiocb_private.error, EBADF);
		}
		return EBADF;
	}

	fp = fdp->fd_ofiles[fd];
	if ((fp == NULL) ||
	    ((opcode == LIO_WRITE) && ((fp->f_flag & FWRITE) == 0))) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0) {
			suword(&job->_aiocb_private.error, EBADF);
		}
		return EBADF;
	}

	if (aiocbe->uaiocb.aio_offset == -1LL) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0) {
			suword(&job->_aiocb_private.error, EINVAL);
		}
		return EINVAL;
	}

	error = suword(&job->_aiocb_private.kernelinfo, jobrefid);
	if (error) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0) {
			suword(&job->_aiocb_private.error, EINVAL);
		}
		return error;
	}

	aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jobrefid;
	if (jobrefid == LONG_MAX)
		jobrefid = 1;
	else
		jobrefid++;

	if (opcode == LIO_NOP) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0) {
			suword(&job->_aiocb_private.error, 0);
			suword(&job->_aiocb_private.status, 0);
			suword(&job->_aiocb_private.kernelinfo, 0);
		}
		return 0;
	}

	if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0) {
			suword(&job->_aiocb_private.status, 0);
			suword(&job->_aiocb_private.error, EINVAL);
		}
		return EINVAL;
	}

	suword(&job->_aiocb_private.error, EINPROGRESS);
	aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
	aiocbe->userproc = p;
	aiocbe->jobflags = 0;
	aiocbe->lio = lj;
	ki = p->p_aioinfo;

	if ((error = aio_qphysio(p, aiocbe)) == 0) {
		return 0;
	} else if (error > 0) {
		suword(&job->_aiocb_private.status, 0);
		aiocbe->uaiocb._aiocb_private.error = error;
		suword(&job->_aiocb_private.error, error);
		return error;
	}

	/*
	 * No buffer for daemon I/O
	 */
	aiocbe->bp = NULL;

	ki->kaio_queue_count++;
	if (lj) {
		lj->lioj_queue_count++;
	}
	TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
	TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
	aiocbe->jobstate = JOBST_JOBQGLOBAL;

	num_queue_count++;
	error = 0;

	/*
	 * If we don't have a free AIO process, and we are below our
	 * quota, then start one.  Otherwise, depend on the subsequent
	 * I/O completions to pick-up this job.  If we don't successfully
	 * create the new process (thread) due to resource issues, we
	 * return an error for now (EAGAIN), which is likely not the
	 * correct thing to do.
	 */
retryproc:
	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
		TAILQ_REMOVE(&aio_freeproc, aiop, list);
		TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
		aiop->aioprocflags &= ~AIOP_FREE;
		wakeup(aiop->aioproc);
	} else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
	    ((ki->kaio_active_count + num_aio_resv_start) <
	    ki->kaio_maxactive_count)) {
		num_aio_resv_start++;
		if ((error = aio_newproc()) == 0) {
			num_aio_resv_start--;
			p->p_retval[0] = 0;
			goto retryproc;
		}
		num_aio_resv_start--;
	}
	return error;
}

/*
 * This routine queues an AIO request, checking for quotas.
 */
static int
aio_aqueue(struct proc *p, struct aiocb *job, int type)
{
	struct kaioinfo *ki;

	if (p->p_aioinfo == NULL) {
		aio_init_aioinfo(p);
	}

	if (num_queue_count >= max_queue_count)
		return EAGAIN;

	ki = p->p_aioinfo;
	if (ki->kaio_queue_count >= ki->kaio_qallowed_count)
		return EAGAIN;

	return _aio_aqueue(p, job, NULL, type);
}

/*
 * Support the aio_return system call; as a side effect, kernel
 * resources are released.
 */
int
aio_return(struct proc *p, struct aio_return_args *uap)
{
	int s;
	int jobref;
	struct aiocblist *cb, *ncb;
	struct aiocb *ujob;
	struct kaioinfo *ki;

	ki = p->p_aioinfo;
	if (ki == NULL) {
		return EINVAL;
	}

	ujob = uap->aiocbp;

	jobref = fuword(&ujob->_aiocb_private.kernelinfo);
	if (jobref == -1 || jobref == 0)
		return EINVAL;

	for (cb = TAILQ_FIRST(&ki->kaio_jobdone);
	     cb;
	     cb = TAILQ_NEXT(cb, plist)) {
		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
			if (ujob == cb->uuaiocb) {
				p->p_retval[0] = cb->uaiocb._aiocb_private.status;
			} else {
				p->p_retval[0] = EFAULT;
			}
			if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
				curproc->p_stats->p_ru.ru_oublock += cb->outputcharge;
				cb->outputcharge = 0;
			} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
				curproc->p_stats->p_ru.ru_inblock += cb->inputcharge;
				cb->inputcharge = 0;
			}
			aio_free_entry(cb);
			return 0;
		}
	}

	s = splbio();
	for (cb = TAILQ_FIRST(&ki->kaio_bufdone);
	     cb;
	     cb = ncb) {
		ncb = TAILQ_NEXT(cb, plist);
		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
			splx(s);
			if (ujob == cb->uuaiocb) {
				p->p_retval[0] = cb->uaiocb._aiocb_private.status;
			} else {
				p->p_retval[0] = EFAULT;
			}
			aio_free_entry(cb);
			return 0;
		}
	}
	splx(s);

	return (EINVAL);
}
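
/*
 * For reference, the userland view of the calls implemented in this file
 * (a minimal POSIX 1003.1B sketch, not kernel code; error handling
 * omitted):
 *
 *	struct aiocb acb;
 *	char buf[512];
 *
 *	bzero(&acb, sizeof(acb));
 *	acb.aio_fildes = fd;
 *	acb.aio_buf = buf;
 *	acb.aio_nbytes = sizeof(buf);
 *	acb.aio_offset = 0;
 *	aio_read(&acb);			(queued via aio_aqueue() above)
 *	while (aio_error(&acb) == EINPROGRESS)
 *		;			(or block in aio_suspend(), below)
 *	cnt = aio_return(&acb);		(reaps the kernel resources)
 */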

/*
 * Allow a process to wakeup when any of the I/O requests are
 * completed.
 */
int
aio_suspend(struct proc *p, struct aio_suspend_args *uap)
{
	struct timeval atv;
	struct timespec ts;
	struct aiocb *const *cbptr, *cbp;
	struct kaioinfo *ki;
	struct aiocblist *cb;
	int i;
	int njoblist;
	int error, s, timo;
	int *ijoblist;
	struct aiocb **ujoblist;

	if (uap->nent >= AIO_LISTIO_MAX)
		return EINVAL;

	timo = 0;
	if (uap->timeout) {
		/*
		 * Get timespec struct
		 */
		if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0) {
			return error;
		}

		if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000)
			return (EINVAL);

		TIMESPEC_TO_TIMEVAL(&atv, &ts);
		if (itimerfix(&atv))
			return (EINVAL);
		timo = tvtohz(&atv);
	}

	ki = p->p_aioinfo;
	if (ki == NULL)
		return EAGAIN;

	njoblist = 0;
	ijoblist = zalloc(aiol_zone);
	ujoblist = zalloc(aiol_zone);
	cbptr = uap->aiocbp;

	for (i = 0; i < uap->nent; i++) {
		cbp = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]);
		if (cbp == 0)
			continue;
		ujoblist[njoblist] = cbp;
		ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo);
		njoblist++;
	}
	if (njoblist == 0) {
		zfree(aiol_zone, ijoblist);
		zfree(aiol_zone, ujoblist);
		return 0;
	}

	error = 0;
	while (1) {
		for (cb = TAILQ_FIRST(&ki->kaio_jobdone);
		     cb; cb = TAILQ_NEXT(cb, plist)) {
			for (i = 0; i < njoblist; i++) {
				if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
				    ijoblist[i]) {
					if (ujoblist[i] != cb->uuaiocb)
						error = EINVAL;
					zfree(aiol_zone, ijoblist);
					zfree(aiol_zone, ujoblist);
					return error;
				}
			}
		}

		s = splbio();
		for (cb = TAILQ_FIRST(&ki->kaio_bufdone);
		     cb; cb = TAILQ_NEXT(cb, plist)) {
			for (i = 0; i < njoblist; i++) {
				if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
				    ijoblist[i]) {
					splx(s);
					if (ujoblist[i] != cb->uuaiocb)
						error = EINVAL;
					zfree(aiol_zone, ijoblist);
					zfree(aiol_zone, ujoblist);
					return error;
				}
			}
		}

		ki->kaio_flags |= KAIO_WAKEUP;
		error = tsleep(p, PRIBIO|PCATCH, "aiospn", timo);
		splx(s);

		if (error == EINTR) {
			zfree(aiol_zone, ijoblist);
			zfree(aiol_zone, ujoblist);
			return EINTR;
		} else if (error == EWOULDBLOCK) {
			zfree(aiol_zone, ijoblist);
			zfree(aiol_zone, ujoblist);
			return EAGAIN;
		}
	}

	/* NOTREACHED */
	return EINVAL;
}

/*
 * aio_cancel at the kernel level is a NOOP right now.  It
 * might be possible to support it partially in user mode, or
 * in kernel mode later on.
 */
int
aio_cancel(struct proc *p, struct aio_cancel_args *uap)
{
	return ENOSYS;
}

/*
 * aio_error is implemented at the kernel level for compatibility
 * purposes only.  For a user mode async implementation, it would be
 * best to do it in a userland subroutine.
 */
int
aio_error(struct proc *p, struct aio_error_args *uap)
{
	int s;
	struct aiocblist *cb;
	struct kaioinfo *ki;
	int jobref;

	ki = p->p_aioinfo;
	if (ki == NULL)
		return EINVAL;

	jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo);
	if ((jobref == -1) || (jobref == 0))
		return EINVAL;

	for (cb = TAILQ_FIRST(&ki->kaio_jobdone);
	     cb;
	     cb = TAILQ_NEXT(cb, plist)) {

		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
			p->p_retval[0] = cb->uaiocb._aiocb_private.error;
			return 0;
		}
	}

	for (cb = TAILQ_FIRST(&ki->kaio_jobqueue);
	     cb;
	     cb = TAILQ_NEXT(cb, plist)) {

		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
			p->p_retval[0] = EINPROGRESS;
			return 0;
		}
	}

	s = splbio();
	for (cb = TAILQ_FIRST(&ki->kaio_bufdone);
	     cb;
	     cb = TAILQ_NEXT(cb, plist)) {
		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
			p->p_retval[0] = cb->uaiocb._aiocb_private.error;
			splx(s);
			return 0;
		}
	}

	for (cb = TAILQ_FIRST(&ki->kaio_bufqueue);
	     cb;
	     cb = TAILQ_NEXT(cb, plist)) {
		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
			p->p_retval[0] = EINPROGRESS;
			splx(s);
			return 0;
		}
	}
	splx(s);


	/*
	 * Hack for lio
	 */
/*
	status = fuword(&uap->aiocbp->_aiocb_private.status);
	if (status == -1) {
		return fuword(&uap->aiocbp->_aiocb_private.error);
	}
*/
	return EINVAL;
}

int
aio_read(struct proc *p, struct aio_read_args *uap)
{
	struct filedesc *fdp;
	struct file *fp;
	struct uio auio;
	struct iovec aiov;
	unsigned int fd;
	int cnt;
	struct aiocb iocb;
	int error, pmodes;

	pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes);
	if ((pmodes & AIO_PMODE_SYNC) == 0) {
		return aio_aqueue(p, (struct aiocb *)uap->aiocbp, LIO_READ);
	}

	/*
	 * Get control block
	 */
	if ((error = copyin((caddr_t)uap->aiocbp, (caddr_t)&iocb, sizeof iocb)) != 0)
		return error;

	/*
	 * Get the fd info for process
	 */
	fdp = p->p_fd;

	/*
	 * Range check file descriptor
	 */
	fd = iocb.aio_fildes;
	if (fd >= fdp->fd_nfiles)
		return EBADF;
	fp = fdp->fd_ofiles[fd];
	if ((fp == NULL) || ((fp->f_flag & FREAD) == 0))
		return EBADF;
	if (iocb.aio_offset == -1LL)
		return EINVAL;

	auio.uio_resid = iocb.aio_nbytes;
	if (auio.uio_resid < 0)
		return (EINVAL);

	/*
	 * Process sync simply -- queue async request.
	 */
	if ((iocb._aiocb_private.privatemodes & AIO_PMODE_SYNC) == 0) {
		return aio_aqueue(p, (struct aiocb *)uap->aiocbp, LIO_READ);
	}

	aiov.iov_base = (void *)iocb.aio_buf;
	aiov.iov_len = iocb.aio_nbytes;

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = iocb.aio_offset;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;

	cnt = iocb.aio_nbytes;
	error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, p);
	if (error &&
	    (auio.uio_resid != cnt) &&
	    (error == ERESTART || error == EINTR || error == EWOULDBLOCK))
		error = 0;
	cnt -= auio.uio_resid;
	p->p_retval[0] = cnt;
	return error;
}

int
aio_write(struct proc *p, struct aio_write_args *uap)
{
	struct filedesc *fdp;
	struct file *fp;
	struct uio auio;
	struct iovec aiov;
	unsigned int fd;
	int cnt;
	struct aiocb iocb;
	int error;
	int pmodes;

	/*
	 * Process sync simply -- queue async request.
	 */
	pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes);
	if ((pmodes & AIO_PMODE_SYNC) == 0) {
		return aio_aqueue(p, (struct aiocb *)uap->aiocbp, LIO_WRITE);
	}

	if ((error = copyin((caddr_t)uap->aiocbp, (caddr_t)&iocb, sizeof iocb)) != 0)
		return error;

	/*
	 * Get the fd info for process
	 */
	fdp = p->p_fd;

	/*
	 * Range check file descriptor
	 */
	fd = iocb.aio_fildes;
	if (fd >= fdp->fd_nfiles)
		return EBADF;
	fp = fdp->fd_ofiles[fd];
	if ((fp == NULL) || ((fp->f_flag & FWRITE) == 0))
		return EBADF;
	if (iocb.aio_offset == -1LL)
		return EINVAL;

	aiov.iov_base = (void *)iocb.aio_buf;
	aiov.iov_len = iocb.aio_nbytes;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = iocb.aio_offset;

	auio.uio_resid = iocb.aio_nbytes;
	if (auio.uio_resid < 0)
		return (EINVAL);

	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;

	cnt = iocb.aio_nbytes;
	error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, p);
	if (error) {
		if (auio.uio_resid != cnt) {
			if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
				error = 0;
			if (error == EPIPE)
				psignal(p, SIGPIPE);
		}
	}
	cnt -= auio.uio_resid;
	p->p_retval[0] = cnt;
	return error;
}

int
lio_listio(struct proc *p, struct lio_listio_args *uap)
{
	int nent, nentqueued;
	struct aiocb *iocb, * const *cbptr;
	struct aiocblist *cb;
	struct kaioinfo *ki;
	struct aio_liojob *lj;
	int error, runningcode;
	int nerror;
	int i;
	int s;

	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) {
		return EINVAL;
	}

	nent = uap->nent;
	if (nent > AIO_LISTIO_MAX) {
		return EINVAL;
	}

	if (p->p_aioinfo == NULL) {
		aio_init_aioinfo(p);
	}

	if ((nent + num_queue_count) > max_queue_count) {
		return EAGAIN;
	}

	ki = p->p_aioinfo;
	if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count) {
		return EAGAIN;
	}

	lj = zalloc(aiolio_zone);
	if (!lj) {
		return EAGAIN;
	}

	lj->lioj_flags = 0;
	lj->lioj_buffer_count = 0;
	lj->lioj_buffer_finished_count = 0;
	lj->lioj_queue_count = 0;
	lj->lioj_queue_finished_count = 0;
	lj->lioj_ki = ki;
	TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);

	/*
	 * Setup signal
	 */
	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
		error = copyin(uap->sig, &lj->lioj_signal, sizeof lj->lioj_signal);
		if (error)
			return error;
		lj->lioj_flags |= LIOJ_SIGNAL;
		lj->lioj_flags &= ~LIOJ_SIGNAL_POSTED;
	} else {
		lj->lioj_flags &= ~LIOJ_SIGNAL;
	}

	/*
	 * get pointers to the list of I/O requests
	 */

	nerror = 0;
	nentqueued = 0;
	cbptr = uap->acb_list;
	for (i = 0; i < uap->nent; i++) {
		iocb = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]);
		if (((intptr_t)iocb != -1) && ((intptr_t)iocb != NULL)) {
			error = _aio_aqueue(p, iocb, lj, 0);
			if (error == 0) {
				nentqueued++;
			} else {
				nerror++;
			}
		}
	}

	/*
	 * If we haven't queued any, then just return error
	 */
	if (nentqueued == 0) {
		return 0;
	}

	/*
	 * Calculate the appropriate error return
	 */
	runningcode = 0;
	if (nerror)
		runningcode = EIO;

	if (uap->mode == LIO_WAIT) {
		while (1) {
			int found;
			found = 0;
			for (i = 0; i < uap->nent; i++) {
				int jobref, command;

				/*
				 * Fetch address of the control buf pointer in user space
				 */
				iocb = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]);
				if (((intptr_t)iocb == -1) || ((intptr_t)iocb == 0))
					continue;

				/*
				 * Fetch the associated command from user space
				 */
				command = fuword(&iocb->aio_lio_opcode);
				if (command == LIO_NOP) {
					found++;
					continue;
				}

				jobref = fuword(&iocb->_aiocb_private.kernelinfo);

				for (cb = TAILQ_FIRST(&ki->kaio_jobdone);
				     cb;
				     cb = TAILQ_NEXT(cb, plist)) {
					if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
					    jobref) {
						if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
							curproc->p_stats->p_ru.ru_oublock +=
							    cb->outputcharge;
							cb->outputcharge = 0;
						} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
							curproc->p_stats->p_ru.ru_inblock +=
							    cb->inputcharge;
							cb->inputcharge = 0;
						}
						found++;
						break;
					}
				}

				s = splbio();
				for (cb = TAILQ_FIRST(&ki->kaio_bufdone);
				     cb;
				     cb = TAILQ_NEXT(cb, plist)) {
					if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
					    jobref) {
						found++;
						break;
					}
				}
				splx(s);

			}

			/*
			 * If all I/Os have been disposed of, then we can return
			 */
			if (found == nentqueued) {
				return runningcode;
			}

			ki->kaio_flags |= KAIO_WAKEUP;
			error = tsleep(p, PRIBIO|PCATCH, "aiospn", 0);

			if (error == EINTR) {
				return EINTR;
			} else if (error == EWOULDBLOCK) {
				return EAGAIN;
			}

		}
	}

	return runningcode;
}

/*
 * This is a weird hack so that we can post a signal.  It is safe
 * to do so from a timeout routine, but *not* from an interrupt routine.
 */
static void
process_signal(void *ljarg)
{
	struct aio_liojob *lj = ljarg;
	if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) {
		if (lj->lioj_queue_count == lj->lioj_queue_finished_count) {
			psignal(lj->lioj_ki->kaio_p, lj->lioj_signal.sigev_signo);
			lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
		}
	}
}

/*
 * Interrupt handler for physio, performs the necessary process wakeups,
 * and signals.
 */
static void
aio_physwakeup(bp)
	struct buf *bp;
{
	struct aiocblist *aiocbe;
	struct proc *p;
	struct kaioinfo *ki;
	struct aio_liojob *lj;
	int s;
	s = splbio();

	wakeup((caddr_t)bp);
	bp->b_flags &= ~B_CALL;
	bp->b_flags |= B_DONE;

	aiocbe = (struct aiocblist *)bp->b_spc;
	if (aiocbe) {
		p = bp->b_caller1;

		aiocbe->jobstate = JOBST_JOBBFINISHED;
		aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
		aiocbe->uaiocb._aiocb_private.error = 0;
		aiocbe->jobflags |= AIOCBLIST_DONE;

		if (bp->b_flags & B_ERROR) {
			aiocbe->uaiocb._aiocb_private.error = bp->b_error;
		}

		lj = aiocbe->lio;
		if (lj) {
			lj->lioj_buffer_finished_count++;
			/*
			 * wakeup/signal if all of the interrupt jobs are done
			 */
			if (lj->lioj_buffer_finished_count == lj->lioj_buffer_count) {
				/*
				 * post a signal if it is called for
				 */
				if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) ==
				    LIOJ_SIGNAL) {
					lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
					timeout(process_signal, lj, 0);
				}
			}
		}

		ki = p->p_aioinfo;
		if (ki) {
			ki->kaio_buffer_finished_count++;
			TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
			TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
			TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
			/*
			 * and do the wakeup
			 */
			if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) {
				ki->kaio_flags &= ~KAIO_WAKEUP;
				wakeup(p);
			}
		}
	}
	splx(s);
}