/*
 * Copyright (c) 1997 John S. Dyson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. John S. Dyson's name may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * DISCLAIMER:  This code isn't warranted to do anything useful.  Anything
 * bad that happens because of using this software isn't the responsibility
 * of the author.  This software is distributed AS-IS.
 *
 * $FreeBSD$
 */

/*
 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/lock.h>
#include <sys/unistd.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/conf.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_zone.h>
#include <sys/aio.h>

#include <machine/limits.h>

static long jobrefid;

#define JOBST_NULL		0x0
#define JOBST_JOBQPROC		0x1
#define JOBST_JOBQGLOBAL	0x2
#define JOBST_JOBRUNNING	0x3
#define JOBST_JOBFINISHED	0x4
#define JOBST_JOBQBUF		0x5
#define JOBST_JOBBFINISHED	0x6

#ifndef MAX_AIO_PER_PROC
#define MAX_AIO_PER_PROC	32
#endif

#ifndef MAX_AIO_QUEUE_PER_PROC
#define MAX_AIO_QUEUE_PER_PROC	256	/* Bigger than AIO_LISTIO_MAX */
#endif

#ifndef MAX_AIO_PROCS
#define MAX_AIO_PROCS		32
#endif

#ifndef MAX_AIO_QUEUE
#define MAX_AIO_QUEUE		1024	/* Bigger than AIO_LISTIO_MAX */
#endif

#ifndef TARGET_AIO_PROCS
#define TARGET_AIO_PROCS	0
#endif

#ifndef MAX_BUF_AIO
#define MAX_BUF_AIO		16
#endif

#ifndef AIOD_TIMEOUT_DEFAULT
#define AIOD_TIMEOUT_DEFAULT	(10 * hz)
#endif

#ifndef AIOD_LIFETIME_DEFAULT
#define AIOD_LIFETIME_DEFAULT	(30 * hz)
#endif

static int max_aio_procs = MAX_AIO_PROCS;
static int num_aio_procs = 0;
static int target_aio_procs = TARGET_AIO_PROCS;
static int max_queue_count = MAX_AIO_QUEUE;
static int num_queue_count = 0;
static int num_buf_aio = 0;
static int num_aio_resv_start = 0;
static int aiod_timeout;
static int aiod_lifetime;

static int max_aio_per_proc = MAX_AIO_PER_PROC,
	max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;

static int max_buf_aio = MAX_BUF_AIO;

SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "AIO mgmt");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc,
	CTLFLAG_RW, &max_aio_per_proc, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc,
	CTLFLAG_RW, &max_aio_queue_per_proc, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs,
	CTLFLAG_RW, &max_aio_procs, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs,
	CTLFLAG_RD, &num_aio_procs, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count,
	CTLFLAG_RD, &num_queue_count, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue,
	CTLFLAG_RW, &max_queue_count, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs,
	CTLFLAG_RW, &target_aio_procs, 0, "");
SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio,
	CTLFLAG_RW, &max_buf_aio, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio,
	CTLFLAG_RD, &num_buf_aio, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime,
	CTLFLAG_RW, &aiod_lifetime, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout,
	CTLFLAG_RW, &aiod_timeout, 0, "");


/*
 * Job queue item
 */

#define AIOCBLIST_CANCELLED	0x1
#define AIOCBLIST_RUNDOWN	0x4
#define AIOCBLIST_ASYNCFREE	0x8
#define AIOCBLIST_DONE		0x10

struct aiocblist {
	TAILQ_ENTRY (aiocblist) list;		/* List of jobs */
	TAILQ_ENTRY (aiocblist) plist;		/* List of jobs for proc */
	int	jobflags;
	int	jobstate;
	int	inputcharge, outputcharge;
	struct	buf *bp;			/* buffer pointer */
	struct	proc *userproc;			/* User process */
	struct	aioproclist *jobaioproc;	/* AIO process descriptor */
	struct	aio_liojob *lio;		/* optional lio job */
	struct	aiocb *uuaiocb;			/* pointer in userspace of aiocb */
	struct	aiocb uaiocb;			/* Kernel I/O control block */
};


/*
 * AIO process info
 */
#define AIOP_FREE	0x1		/* proc on free queue */
#define AIOP_SCHED	0x2		/* proc explicitly scheduled */

struct aioproclist {
	int	aioprocflags;			/* AIO proc flags */
	TAILQ_ENTRY(aioproclist) list;		/* List of processes */
	struct	proc *aioproc;			/* The AIO thread */
	TAILQ_HEAD (,aiocblist) jobtorun;	/* suggested job to run */
};

/*
 * data-structure for lio signal management
 */
struct aio_liojob {
	int	lioj_flags;
	int	lioj_buffer_count;
	int	lioj_buffer_finished_count;
	int	lioj_queue_count;
	int	lioj_queue_finished_count;
	struct	sigevent lioj_signal;	/* signal on all I/O done */
	TAILQ_ENTRY (aio_liojob) lioj_list;
	struct	kaioinfo *lioj_ki;
};
#define LIOJ_SIGNAL		0x1	/* signal on all done (lio) */
#define LIOJ_SIGNAL_POSTED	0x2	/* signal has been posted */

/*
 * per process aio data structure
 */
struct kaioinfo {
	int	kaio_flags;			/* per process kaio flags */
	int	kaio_maxactive_count;		/* maximum number of AIOs */
	int	kaio_active_count;		/* number of currently used AIOs */
	int	kaio_qallowed_count;		/* maximum size of AIO queue */
	int	kaio_queue_count;		/* size of AIO queue */
	int	kaio_ballowed_count;		/* maximum number of buffers */
	int	kaio_queue_finished_count;	/* number of daemon jobs finished */
	int	kaio_buffer_count;		/* number of physio buffers */
	int	kaio_buffer_finished_count;	/* count of I/O done */
	struct	proc *kaio_p;			/* process that uses this kaio block */
	TAILQ_HEAD (,aio_liojob) kaio_liojoblist;	/* list of lio jobs */
	TAILQ_HEAD (,aiocblist) kaio_jobqueue;	/* job queue for process */
	TAILQ_HEAD (,aiocblist) kaio_jobdone;	/* done queue for process */
	TAILQ_HEAD (,aiocblist) kaio_bufqueue;	/* buffer job queue for process */
	TAILQ_HEAD (,aiocblist) kaio_bufdone;	/* buffer done queue for process */
};

#define KAIO_RUNDOWN	0x1	/* process is being run down */
#define KAIO_WAKEUP	0x2	/* wakeup process when there is a significant event */


static TAILQ_HEAD (,aioproclist) aio_freeproc, aio_activeproc;
static TAILQ_HEAD(,aiocblist) aio_jobs;		/* Async job list */
static TAILQ_HEAD(,aiocblist) aio_bufjobs;	/* Phys I/O job list */
static TAILQ_HEAD(,aiocblist) aio_freejobs;	/* Pool of free jobs */

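/*
 * Illustrative sketch (added commentary, not part of the kernel): the
 * userland view of this facility.  A request is queued with aio_read(),
 * polled with aio_error(), and reaped with aio_return(), which is what
 * releases the kernel job entry managed below.  The path, buffer size, and
 * error handling are arbitrary examples.
 *
 *	#include <aio.h>
 *	#include <errno.h>
 *	#include <fcntl.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	ssize_t
 *	example_aio_read(const char *path)
 *	{
 *		static char buf[512];
 *		struct aiocb acb;
 *		int fd = open(path, O_RDONLY);
 *
 *		if (fd < 0)
 *			return -1;
 *		memset(&acb, 0, sizeof(acb));
 *		acb.aio_fildes = fd;
 *		acb.aio_buf = buf;
 *		acb.aio_nbytes = sizeof(buf);
 *		acb.aio_offset = 0;			// -1 is rejected below
 *		if (aio_read(&acb) != 0)		// queue the request
 *			return -1;
 *		while (aio_error(&acb) == EINPROGRESS)	// or use aio_suspend()
 *			usleep(1000);
 *		return aio_return(&acb);		// status; frees the kernel job
 *	}
 */
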
static void aio_init_aioinfo(struct proc *p);
static void aio_onceonly(void *);
static int aio_free_entry(struct aiocblist *aiocbe);
static void aio_process(struct aiocblist *aiocbe);
static int aio_newproc(void);
static int aio_aqueue(struct proc *p, struct aiocb *job, int type);
static void aio_physwakeup(struct buf *bp);
static int aio_fphysio(struct proc *p, struct aiocblist *aiocbe, int type);
static int aio_qphysio(struct proc *p, struct aiocblist *iocb);
static void aio_daemon(void *uproc);

SYSINIT(aio, SI_SUB_VFS, SI_ORDER_ANY, aio_onceonly, NULL);

static vm_zone_t kaio_zone = 0, aiop_zone = 0,
	aiocb_zone = 0, aiol_zone = 0, aiolio_zone = 0;

/*
 * Startup initialization
 */
void
aio_onceonly(void *na)
{
	TAILQ_INIT(&aio_freeproc);
	TAILQ_INIT(&aio_activeproc);
	TAILQ_INIT(&aio_jobs);
	TAILQ_INIT(&aio_bufjobs);
	TAILQ_INIT(&aio_freejobs);
	kaio_zone = zinit("AIO", sizeof (struct kaioinfo), 0, 0, 1);
	aiop_zone = zinit("AIOP", sizeof (struct aioproclist), 0, 0, 1);
	aiocb_zone = zinit("AIOCB", sizeof (struct aiocblist), 0, 0, 1);
	aiol_zone = zinit("AIOL", AIO_LISTIO_MAX * sizeof (int), 0, 0, 1);
	aiolio_zone = zinit("AIOLIO",
		AIO_LISTIO_MAX * sizeof (struct aio_liojob), 0, 0, 1);
	aiod_timeout = AIOD_TIMEOUT_DEFAULT;
	aiod_lifetime = AIOD_LIFETIME_DEFAULT;
	jobrefid = 1;
}

/*
 * Init the per-process aioinfo structure.
 * The aioinfo limits are set per-process for user limit (resource) management.
 */
void
aio_init_aioinfo(struct proc *p)
{
	struct kaioinfo *ki;

	if (p->p_aioinfo == NULL) {
		ki = zalloc(kaio_zone);
		p->p_aioinfo = ki;
		ki->kaio_flags = 0;
		ki->kaio_maxactive_count = max_aio_per_proc;
		ki->kaio_active_count = 0;
		ki->kaio_qallowed_count = max_aio_queue_per_proc;
		ki->kaio_queue_count = 0;
		ki->kaio_ballowed_count = max_buf_aio;
		ki->kaio_buffer_count = 0;
		ki->kaio_buffer_finished_count = 0;
		ki->kaio_p = p;
		TAILQ_INIT(&ki->kaio_jobdone);
		TAILQ_INIT(&ki->kaio_jobqueue);
		TAILQ_INIT(&ki->kaio_bufdone);
		TAILQ_INIT(&ki->kaio_bufqueue);
		TAILQ_INIT(&ki->kaio_liojoblist);
	}
}

/*
 * Free a job entry.  Wait for completion if it is currently
 * active, but don't delay forever.  If we delay, we return
 * a flag that says that we have to restart the queue scan.
 */
int
aio_free_entry(struct aiocblist *aiocbe)
{
	struct kaioinfo *ki;
	struct aioproclist *aiop;
	struct aio_liojob *lj;
	struct proc *p;
	int error;
	int s;

	if (aiocbe->jobstate == JOBST_NULL)
		panic("aio_free_entry: freeing already free job");

	p = aiocbe->userproc;
	ki = p->p_aioinfo;
	lj = aiocbe->lio;
	if (ki == NULL)
		panic("aio_free_entry: missing p->p_aioinfo");

	if (aiocbe->jobstate == JOBST_JOBRUNNING) {
		if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE)
			return 0;
		aiocbe->jobflags |= AIOCBLIST_RUNDOWN;
		tsleep(aiocbe, PRIBIO|PCATCH, "jobwai", 0);
	}
	aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;

	if (aiocbe->bp == NULL) {
		if (ki->kaio_queue_count <= 0)
			panic("aio_free_entry: process queue size <= 0");
		if (num_queue_count <= 0)
			panic("aio_free_entry: system wide queue size <= 0");

		if (lj) {
			lj->lioj_queue_count--;
			if (aiocbe->jobflags & AIOCBLIST_DONE)
				lj->lioj_queue_finished_count--;
		}
		ki->kaio_queue_count--;
		if (aiocbe->jobflags & AIOCBLIST_DONE)
			ki->kaio_queue_finished_count--;
		num_queue_count--;

	} else {
		if (lj) {
			lj->lioj_buffer_count--;
			if (aiocbe->jobflags & AIOCBLIST_DONE)
				lj->lioj_buffer_finished_count--;
		}
		if (aiocbe->jobflags & AIOCBLIST_DONE)
			ki->kaio_buffer_finished_count--;
		ki->kaio_buffer_count--;
		num_buf_aio--;

	}

	if ((ki->kaio_flags & KAIO_WAKEUP) ||
	    ((ki->kaio_flags & KAIO_RUNDOWN) &&
	    ((ki->kaio_buffer_count == 0) &&
	    (ki->kaio_queue_count == 0)))) {
		ki->kaio_flags &= ~KAIO_WAKEUP;
		wakeup(p);
	}

	if (aiocbe->jobstate == JOBST_JOBQBUF) {
		if ((error = aio_fphysio(p, aiocbe, 1)) != 0)
			return error;
		if (aiocbe->jobstate != JOBST_JOBBFINISHED)
			panic("aio_free_entry: invalid physio finish-up state");
		s = splbio();
		TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
		splx(s);
	} else if (aiocbe->jobstate == JOBST_JOBQPROC) {
		aiop = aiocbe->jobaioproc;
		TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list);
	} else if (aiocbe->jobstate == JOBST_JOBQGLOBAL) {
		TAILQ_REMOVE(&aio_jobs, aiocbe, list);
	} else if (aiocbe->jobstate == JOBST_JOBFINISHED) {
		TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist);
	} else if (aiocbe->jobstate == JOBST_JOBBFINISHED) {
		s = splbio();
		TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
		splx(s);
		if (aiocbe->bp) {
			vunmapbuf(aiocbe->bp);
			relpbuf(aiocbe->bp, NULL);
			aiocbe->bp = NULL;
		}
	}
	if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) {
		TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
		zfree(aiolio_zone, lj);
	}
	TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
	aiocbe->jobstate = JOBST_NULL;
	return 0;
}

/*
 * Rundown the jobs for a given process.
 */
void
aio_proc_rundown(struct proc *p)
{
	int s;
	struct kaioinfo *ki;
	struct aio_liojob *lj, *ljn;
	struct aiocblist *aiocbe, *aiocbn;

	ki = p->p_aioinfo;
	if (ki == NULL)
		return;

	ki->kaio_flags |= LIOJ_SIGNAL_POSTED;
	while ((ki->kaio_active_count > 0) ||
	    (ki->kaio_buffer_count > ki->kaio_buffer_finished_count)) {
		ki->kaio_flags |= KAIO_RUNDOWN;
		if (tsleep(p, PRIBIO, "kaiowt", aiod_timeout))
			break;
	}

restart1:
	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobdone);
	    aiocbe;
	    aiocbe = aiocbn) {
		aiocbn = TAILQ_NEXT(aiocbe, plist);
		if (aio_free_entry(aiocbe))
			goto restart1;
	}

restart2:
	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue);
	    aiocbe;
	    aiocbe = aiocbn) {
		aiocbn = TAILQ_NEXT(aiocbe, plist);
		if (aio_free_entry(aiocbe))
			goto restart2;
	}

	/*
	 * Note the use of lots of splbio here, trying to avoid
	 * splbio for long chains of I/O.  Probably unnecessary.
	 */

restart3:
	s = splbio();
	while (TAILQ_FIRST(&ki->kaio_bufqueue)) {
		ki->kaio_flags |= KAIO_WAKEUP;
		tsleep(p, PRIBIO, "aioprn", 0);
		splx(s);
		goto restart3;
	}
	splx(s);

restart4:
	s = splbio();
	for (aiocbe = TAILQ_FIRST(&ki->kaio_bufdone);
	    aiocbe;
	    aiocbe = aiocbn) {
		aiocbn = TAILQ_NEXT(aiocbe, plist);
		if (aio_free_entry(aiocbe)) {
			splx(s);
			goto restart4;
		}
	}
	splx(s);

	for (lj = TAILQ_FIRST(&ki->kaio_liojoblist);
	    lj;
	    lj = ljn) {
		ljn = TAILQ_NEXT(lj, lioj_list);
		if ((lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) {
			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
			zfree(aiolio_zone, lj);
		} else {
#if defined(DIAGNOSTIC)
			printf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, QF:%d\n",
			    lj->lioj_buffer_count, lj->lioj_buffer_finished_count,
			    lj->lioj_queue_count, lj->lioj_queue_finished_count);
#endif
		}
	}

	zfree(kaio_zone, ki);
	p->p_aioinfo = NULL;
}

/*
 * Select a job to run (called by an AIO daemon)
 */
static struct aiocblist *
aio_selectjob(struct aioproclist *aiop)
{

	struct aiocblist *aiocbe;

	aiocbe = TAILQ_FIRST(&aiop->jobtorun);
	if (aiocbe) {
		TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list);
		return aiocbe;
	}

	for (aiocbe = TAILQ_FIRST(&aio_jobs);
	    aiocbe;
	    aiocbe = TAILQ_NEXT(aiocbe, list)) {
		struct kaioinfo *ki;
		struct proc *userp;

		userp = aiocbe->userproc;
		ki = userp->p_aioinfo;

		if (ki->kaio_active_count < ki->kaio_maxactive_count) {
			TAILQ_REMOVE(&aio_jobs, aiocbe, list);
			return aiocbe;
		}
	}

	return NULL;
}

/*
 * The AIO processing activity.  This is the code that does the
 * I/O request for the non-physio version of the operations.  The
 * normal vn operations are used, and this code should work in
 * all instances for every type of file, including pipes, sockets,
 * fifos, and regular files.
 */
void
aio_process(struct aiocblist *aiocbe)
{
	struct filedesc *fdp;
	struct proc *userp, *mycp;
	struct aiocb *cb;
	struct file *fp;
	struct uio auio;
	struct iovec aiov;
	unsigned int fd;
	int cnt;
	int error;
	off_t offset;
	int oublock_st, oublock_end;
	int inblock_st, inblock_end;

	userp = aiocbe->userproc;
	cb = &aiocbe->uaiocb;

	mycp = curproc;

	fdp = mycp->p_fd;
	fd = cb->aio_fildes;
	fp = fdp->fd_ofiles[fd];

	aiov.iov_base = (void *) cb->aio_buf;
	aiov.iov_len = cb->aio_nbytes;

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset = cb->aio_offset;
	auio.uio_resid = cb->aio_nbytes;
	cnt = cb->aio_nbytes;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = mycp;

	inblock_st = mycp->p_stats->p_ru.ru_inblock;
	oublock_st = mycp->p_stats->p_ru.ru_oublock;
	if (cb->aio_lio_opcode == LIO_READ) {
		auio.uio_rw = UIO_READ;
		error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, mycp);
	} else {
		auio.uio_rw = UIO_WRITE;
		error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, mycp);
	}
	inblock_end = mycp->p_stats->p_ru.ru_inblock;
	oublock_end = mycp->p_stats->p_ru.ru_oublock;

	aiocbe->inputcharge = inblock_end - inblock_st;
	aiocbe->outputcharge = oublock_end - oublock_st;

	if (error) {
		if (auio.uio_resid != cnt) {
			if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
				error = 0;
			if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE))
				psignal(userp, SIGPIPE);
		}
	}

	cnt -= auio.uio_resid;
	cb->_aiocb_private.error = error;
	cb->_aiocb_private.status = cnt;

	return;

}

/*
 * The AIO daemon.  Most of the actual work is done in aio_process,
 * but the setup (and address space mgmt) is done in this routine.
 */
static void
aio_daemon(void *uproc)
{
	int s;
	struct aioproclist *aiop;
	struct vmspace *myvm;
	struct proc *mycp;

	/*
	 * Local copies of curproc (cp) and vmspace (myvm)
	 */
	mycp = curproc;
	myvm = mycp->p_vmspace;

	if (mycp->p_textvp) {
		vrele(mycp->p_textvp);
		mycp->p_textvp = NULL;
	}

	/*
	 * Allocate and ready the aio control info.  There is one
	 * aiop structure per daemon.
	 */
	aiop = zalloc(aiop_zone);
	aiop->aioproc = mycp;
	aiop->aioprocflags |= AIOP_FREE;
	TAILQ_INIT(&aiop->jobtorun);

	/*
	 * Place thread (lightweight process) onto the AIO free thread list
	 */
	if (TAILQ_EMPTY(&aio_freeproc))
		wakeup(&aio_freeproc);
	TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);

	/*
	 * Make up a name for the daemon
	 */
	strcpy(mycp->p_comm, "aiod");

	/*
	 * Get rid of our current filedescriptors.  AIOD's don't need any
	 * filedescriptors, except as temporarily inherited from the client.
	 * Credentials are also cloned, and made equivalent to "root."
	 */
	fdfree(mycp);
	mycp->p_fd = NULL;
	mycp->p_ucred = crcopy(mycp->p_ucred);
	mycp->p_ucred->cr_uid = 0;
	mycp->p_ucred->cr_ngroups = 1;
	mycp->p_ucred->cr_groups[0] = 1;

	/*
	 * The daemon resides in its own pgrp.
	 */
	enterpgrp(mycp, mycp->p_pid, 1);

	/*
	 * Mark special process type
	 */
	mycp->p_flag |= P_SYSTEM|P_KTHREADP;

	/*
	 * Wakeup parent process.  (Parent sleeps to keep from blasting away
	 * creating too many daemons.)
	 */
	wakeup(mycp);

	while (1) {
		struct proc *curcp;
		struct aiocblist *aiocbe;

		/*
		 * curcp is the current daemon process context.
		 * userp is the current user process context.
		 */
		curcp = mycp;

		/*
		 * Take daemon off of free queue
		 */
		if (aiop->aioprocflags & AIOP_FREE) {
			TAILQ_REMOVE(&aio_freeproc, aiop, list);
			TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
			aiop->aioprocflags &= ~AIOP_FREE;
		}
		aiop->aioprocflags &= ~AIOP_SCHED;

		/*
		 * Check for jobs
		 */
		while ((aiocbe = aio_selectjob(aiop)) != NULL) {
			struct proc *userp;
			struct aiocb *cb;
			struct kaioinfo *ki;
			struct aio_liojob *lj;

			cb = &aiocbe->uaiocb;
			userp = aiocbe->userproc;

			aiocbe->jobstate = JOBST_JOBRUNNING;

			/*
			 * Connect to process address space for user program
			 */
			if (userp != curcp) {
				struct vmspace *tmpvm;
				/*
				 * Save the current address space that we are connected to.
				 */
				tmpvm = mycp->p_vmspace;
				/*
				 * Point to the new user address space, and refer to it.
				 */
				mycp->p_vmspace = userp->p_vmspace;
				mycp->p_vmspace->vm_refcnt++;
				/*
				 * Activate the new mapping.
				 */
				pmap_activate(mycp);
				/*
				 * If the old address space wasn't the daemon's own address
				 * space, then we need to remove the daemon's reference from
				 * the other process that it was acting on behalf of.
				 */
				if (tmpvm != myvm) {
					vmspace_free(tmpvm);
				}
				/*
				 * Disassociate from the previous client's file descriptors,
				 * and associate to the new client's descriptors.  Note that
				 * the daemon doesn't need to worry about its original
				 * descriptors, because they were originally freed.
				 */
				if (mycp->p_fd)
					fdfree(mycp);
				mycp->p_fd = fdshare(userp);
				curcp = userp;
			}

			ki = userp->p_aioinfo;
			lj = aiocbe->lio;

			/*
			 * Account for currently active jobs
			 */
			ki->kaio_active_count++;

			/*
			 * Do the I/O function
			 */
			aiocbe->jobaioproc = aiop;
			aio_process(aiocbe);

			/*
			 * decrement the active job count
			 */
			ki->kaio_active_count--;

			/*
			 * increment the completion count for wakeup/signal comparisons
			 */
			aiocbe->jobflags |= AIOCBLIST_DONE;
			ki->kaio_queue_finished_count++;
			if (lj) {
				lj->lioj_queue_finished_count++;
			}
			if ((ki->kaio_flags & KAIO_WAKEUP) ||
			    ((ki->kaio_flags & KAIO_RUNDOWN) &&
			    (ki->kaio_active_count == 0))) {
				ki->kaio_flags &= ~KAIO_WAKEUP;
				wakeup(userp);
			}

			s = splbio();
			if (lj && (lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) ==
			    LIOJ_SIGNAL) {
				if ((lj->lioj_queue_finished_count == lj->lioj_queue_count) &&
				    (lj->lioj_buffer_finished_count == lj->lioj_buffer_count)) {
					psignal(userp, lj->lioj_signal.sigev_signo);
					lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
				}
			}
			splx(s);

			aiocbe->jobstate = JOBST_JOBFINISHED;

			/*
			 * If the I/O request should be automatically rundown, do the
			 * needed cleanup.  Otherwise, place the queue entry for
			 * the just finished I/O request into the done queue for the
			 * associated client.
			 */
			if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) {
				aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;
				TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
			} else {
				TAILQ_REMOVE(&ki->kaio_jobqueue,
				    aiocbe, plist);
				TAILQ_INSERT_TAIL(&ki->kaio_jobdone,
				    aiocbe, plist);
			}

			if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) {
				wakeup(aiocbe);
				aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN;
			}

			if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
				psignal(userp, cb->aio_sigevent.sigev_signo);
			}
		}

		/*
		 * Disconnect from user address space
		 */
		if (curcp != mycp) {
			struct vmspace *tmpvm;
			/*
			 * Get the user address space to disconnect from.
			 */
			tmpvm = mycp->p_vmspace;
			/*
			 * Get original address space for daemon.
			 */
			mycp->p_vmspace = myvm;
			/*
			 * Activate the daemon's address space.
			 */
			pmap_activate(mycp);
#if defined(DIAGNOSTIC)
			if (tmpvm == myvm)
				printf("AIOD: vmspace problem -- %d\n", mycp->p_pid);
#endif
			/*
			 * remove our vmspace reference.
			 */
			vmspace_free(tmpvm);
			/*
			 * disassociate from the user process's file descriptors.
			 */
			if (mycp->p_fd)
				fdfree(mycp);
			mycp->p_fd = NULL;
			curcp = mycp;
		}

		/*
		 * If we are the first to be put onto the free queue, wakeup
		 * anyone waiting for a daemon.
		 */
		TAILQ_REMOVE(&aio_activeproc, aiop, list);
		if (TAILQ_EMPTY(&aio_freeproc))
			wakeup(&aio_freeproc);
		TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
		aiop->aioprocflags |= AIOP_FREE;

		/*
		 * If daemon is inactive for a long time, allow it to exit, thereby
		 * freeing resources.
		 */
		if (((aiop->aioprocflags & AIOP_SCHED) == 0) &&
		    tsleep(mycp, PRIBIO, "aiordy", aiod_lifetime)) {
			if ((TAILQ_FIRST(&aio_jobs) == NULL) &&
			    (TAILQ_FIRST(&aiop->jobtorun) == NULL)) {
				if ((aiop->aioprocflags & AIOP_FREE) &&
				    (num_aio_procs > target_aio_procs)) {
					TAILQ_REMOVE(&aio_freeproc, aiop, list);
					zfree(aiop_zone, aiop);
					num_aio_procs--;
#if defined(DIAGNOSTIC)
					if (mycp->p_vmspace->vm_refcnt <= 1)
						printf("AIOD: bad vm refcnt for exiting daemon: %d\n",
						    mycp->p_vmspace->vm_refcnt);
#endif
					exit1(mycp, 0);
				}
			}
		}
	}
}

/*
 * Create a new AIO daemon.  This is mostly a kernel-thread fork routine.
 * The AIO daemon modifies its environment itself.
 */
static int
aio_newproc()
{
	int error;
	struct proc *p, *np;

	p = &proc0;
	error = fork1(p, RFPROC|RFMEM|RFNOWAIT, &np);
	if (error)
		return error;
	cpu_set_fork_handler(np, aio_daemon, curproc);

	/*
	 * Wait until daemon is started, but continue on just in case (to
	 * handle error conditions).
	 */
	error = tsleep(np, PZERO, "aiosta", aiod_timeout);
	num_aio_procs++;

	return error;

}

/*
 * Try the high-performance physio method for eligible VCHR devices.  This
 * routine doesn't require the use of any additional threads, and has very
 * little overhead.
 */
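/*
 * Added summary (descriptive commentary only, not authoritative): a request
 * is taken down the physio fast path roughly when all of the following hold;
 * otherwise aio_qphysio() returns -1 and the caller falls back to the
 * daemon-based aio_process() path:
 *
 *	- the descriptor refers to a vnode of type VCHR (a raw device) that
 *	  is not a tty and has a valid device switch entry with d_bmaj != -1;
 *	- aio_nbytes is a multiple of DEV_BSIZE and no larger than MAXPHYS;
 *	- the per-process buffer limit (kaio_ballowed_count) has not been
 *	  reached, and the global num_buf_aio/max_buf_aio limit still allows
 *	  another outstanding buffer.
 */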
int
aio_qphysio(p, aiocbe)
	struct proc *p;
	struct aiocblist *aiocbe;
{
	int error;
	struct aiocb *cb;
	struct file *fp;
	struct buf *bp;
	int bflags;
	struct vnode *vp;
	struct kaioinfo *ki;
	struct filedesc *fdp;
	struct aio_liojob *lj;
	int fd;
	int s;
	int cnt;
	int rw;
	struct cdevsw *cdev;

	cb = &aiocbe->uaiocb;
	fdp = p->p_fd;
	fd = cb->aio_fildes;
	fp = fdp->fd_ofiles[fd];

	if (fp->f_type != DTYPE_VNODE) {
		return -1;
	}

	vp = (struct vnode *)fp->f_data;
	if (vp->v_type != VCHR || ((cb->aio_nbytes & (DEV_BSIZE - 1)) != 0)) {
		return -1;
	}

	if ((cb->aio_nbytes > MAXPHYS) && (num_buf_aio >= max_buf_aio)) {
		return -1;
	}

	if ((vp->v_rdev == NULL) || (vp->v_flag & VISTTY)) {
		return -1;
	}

	if (vp->v_rdev == NODEV) {
		return -1;
	}

	cdev = devsw(vp->v_rdev);
	if (cdev == NULL) {
		return -1;
	}

	if (cdev->d_bmaj == -1) {
		return -1;
	}

	ki = p->p_aioinfo;
	if (ki->kaio_buffer_count >= ki->kaio_ballowed_count) {
		return -1;
	}

	cnt = cb->aio_nbytes;
	if (cnt > MAXPHYS) {
		return -1;
	}

	/*
	 * Physical I/O is charged directly to the process, so we don't have
	 * to fake it.
	 */
	aiocbe->inputcharge = 0;
	aiocbe->outputcharge = 0;

	ki->kaio_buffer_count++;

	lj = aiocbe->lio;
	if (lj) {
		lj->lioj_buffer_count++;
	}

	/* create and build a buffer header for a transfer */
	bp = (struct buf *)getpbuf(NULL);

	/*
	 * get a copy of the kva from the physical buffer
	 */
	bp->b_caller1 = p;
	bp->b_dev = vp->v_rdev;
	error = bp->b_error = 0;

	if (cb->aio_lio_opcode == LIO_WRITE) {
		rw = 0;
		bflags = B_WRITE;
	} else {
		rw = 1;
		bflags = B_READ;
	}

	bp->b_bcount = cb->aio_nbytes;
	bp->b_bufsize = cb->aio_nbytes;
	bp->b_flags = B_PHYS | B_CALL | bflags;
	bp->b_iodone = aio_physwakeup;
	bp->b_saveaddr = bp->b_data;
	bp->b_data = (void *) cb->aio_buf;
	bp->b_blkno = btodb(cb->aio_offset);

	if (rw && !useracc(bp->b_data, bp->b_bufsize, B_WRITE)) {
		error = EFAULT;
		goto doerror;
	}
	if (!rw && !useracc(bp->b_data, bp->b_bufsize, B_READ)) {
		error = EFAULT;
		goto doerror;
	}

	/* bring buffer into kernel space */
	vmapbuf(bp);

	s = splbio();
	aiocbe->bp = bp;
	bp->b_spc = (void *)aiocbe;
	TAILQ_INSERT_TAIL(&aio_bufjobs, aiocbe, list);
	TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
	aiocbe->jobstate = JOBST_JOBQBUF;
	cb->_aiocb_private.status = cb->aio_nbytes;
	num_buf_aio++;
	bp->b_error = 0;

	splx(s);
	/* perform transfer */
	BUF_STRATEGY(bp, 0);

	s = splbio();
	/*
	 * If we had an error invoking the request, or an error in processing
	 * the request before we have returned, we process it as an error
	 * in transfer.  Note that such an I/O error is not indicated immediately,
	 * but is returned using the aio_error mechanism.  In this case, aio_suspend
	 * will return immediately.
	 */
	if (bp->b_error || (bp->b_flags & B_ERROR)) {
		struct aiocb *job = aiocbe->uuaiocb;

		aiocbe->uaiocb._aiocb_private.status = 0;
		suword(&job->_aiocb_private.status, 0);
		aiocbe->uaiocb._aiocb_private.error = bp->b_error;
		suword(&job->_aiocb_private.error, bp->b_error);

		ki->kaio_buffer_finished_count++;

		if (aiocbe->jobstate != JOBST_JOBBFINISHED) {
			aiocbe->jobstate = JOBST_JOBBFINISHED;
			aiocbe->jobflags |= AIOCBLIST_DONE;
			TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
			TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
			TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
		}
	}
	splx(s);
	return 0;

doerror:
	ki->kaio_buffer_count--;
	if (lj) {
		lj->lioj_buffer_count--;
	}
	aiocbe->bp = NULL;
	relpbuf(bp, NULL);
	return error;
}

/*
 * This waits/tests physio completion.
 */
int
aio_fphysio(p, iocb, flgwait)
	struct proc *p;
	struct aiocblist *iocb;
	int flgwait;
{
	int s;
	struct buf *bp;
	int error;

	bp = iocb->bp;

	s = splbio();
	if (flgwait == 0) {
		if ((bp->b_flags & B_DONE) == 0) {
			splx(s);
			return EINPROGRESS;
		}
	}

	while ((bp->b_flags & B_DONE) == 0) {
		if (tsleep((caddr_t)bp, PRIBIO, "physstr", aiod_timeout)) {
			if ((bp->b_flags & B_DONE) == 0) {
				splx(s);
				return EINPROGRESS;
			} else {
				break;
			}
		}
	}

	/* release mapping into kernel space */
	vunmapbuf(bp);
	iocb->bp = 0;

	error = 0;
	/*
	 * check for an error
	 */
	if (bp->b_flags & B_ERROR) {
		error = bp->b_error;
	}

	relpbuf(bp, NULL);
	return (error);
}

/*
 * Queue a new AIO request.  Choosing either the threaded or direct physio
 * VCHR technique is done in this code.
 */
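/*
 * Added summary (descriptive commentary only): _aio_aqueue() copies the
 * user's aiocb into a kernel aiocblist, validates the descriptor and offset,
 * stamps the job with a kernel reference id (jobrefid), and then tries
 * aio_qphysio() first.  Only when the physio fast path declines (returns -1)
 * is the job placed on the per-process and global (aio_jobs) queues for an
 * aio daemon, waking a free daemon or forking a new one if the limits allow.
 */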
static int
_aio_aqueue(struct proc *p, struct aiocb *job, struct aio_liojob *lj, int type)
{
	struct filedesc *fdp;
	struct file *fp;
	unsigned int fd;

	int error;
	int opcode;
	struct aiocblist *aiocbe;
	struct aioproclist *aiop;
	struct kaioinfo *ki;

	if ((aiocbe = TAILQ_FIRST(&aio_freejobs)) != NULL) {
		TAILQ_REMOVE(&aio_freejobs, aiocbe, list);
	} else {
		aiocbe = zalloc(aiocb_zone);
	}

	aiocbe->inputcharge = 0;
	aiocbe->outputcharge = 0;

	suword(&job->_aiocb_private.status, -1);
	suword(&job->_aiocb_private.error, 0);
	suword(&job->_aiocb_private.kernelinfo, -1);

	error = copyin((caddr_t)job,
	    (caddr_t) &aiocbe->uaiocb, sizeof aiocbe->uaiocb);
	if (error) {
		suword(&job->_aiocb_private.error, error);

		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		return error;
	}

	/*
	 * Save userspace address of the job info
	 */
	aiocbe->uuaiocb = job;

	/*
	 * Get the opcode
	 */
	if (type != LIO_NOP) {
		aiocbe->uaiocb.aio_lio_opcode = type;
	}
	opcode = aiocbe->uaiocb.aio_lio_opcode;

	/*
	 * Get the fd info for process
	 */
	fdp = p->p_fd;

	/*
	 * Range check file descriptor
	 */
	fd = aiocbe->uaiocb.aio_fildes;
	if (fd >= fdp->fd_nfiles) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0) {
			suword(&job->_aiocb_private.error, EBADF);
		}
		return EBADF;
	}

	fp = fdp->fd_ofiles[fd];
	if ((fp == NULL) ||
	    ((opcode == LIO_WRITE) && ((fp->f_flag & FWRITE) == 0))) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0) {
			suword(&job->_aiocb_private.error, EBADF);
		}
		return EBADF;
	}

	if (aiocbe->uaiocb.aio_offset == -1LL) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0) {
			suword(&job->_aiocb_private.error, EINVAL);
		}
		return EINVAL;
	}

	error = suword(&job->_aiocb_private.kernelinfo, jobrefid);
	if (error) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0) {
			suword(&job->_aiocb_private.error, EINVAL);
		}
		return error;
	}

	aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jobrefid;
	if (jobrefid == LONG_MAX)
		jobrefid = 1;
	else
		jobrefid++;

	if (opcode == LIO_NOP) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0) {
			suword(&job->_aiocb_private.error, 0);
			suword(&job->_aiocb_private.status, 0);
			suword(&job->_aiocb_private.kernelinfo, 0);
		}
		return 0;
	}

	if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0) {
			suword(&job->_aiocb_private.status, 0);
			suword(&job->_aiocb_private.error, EINVAL);
		}
		return EINVAL;
	}

	suword(&job->_aiocb_private.error, EINPROGRESS);
	aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
	aiocbe->userproc = p;
	aiocbe->jobflags = 0;
	aiocbe->lio = lj;
	ki = p->p_aioinfo;

	if ((error = aio_qphysio(p, aiocbe)) == 0) {
		return 0;
	} else if (error > 0) {
		suword(&job->_aiocb_private.status, 0);
		aiocbe->uaiocb._aiocb_private.error = error;
		suword(&job->_aiocb_private.error, error);
		return error;
	}

	/*
	 * No buffer for daemon I/O
	 */
	aiocbe->bp = NULL;

	ki->kaio_queue_count++;
	if (lj) {
		lj->lioj_queue_count++;
	}
	TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
	TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
	aiocbe->jobstate = JOBST_JOBQGLOBAL;

	num_queue_count++;
	error = 0;

	/*
	 * If we don't have a free AIO process, and we are below our
	 * quota, then start one.  Otherwise, depend on the subsequent
	 * I/O completions to pick up this job.  If we don't successfully
	 * create the new process (thread) due to resource issues, we
	 * return an error for now (EAGAIN), which is likely not the
	 * correct thing to do.
	 */
retryproc:
	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
		TAILQ_REMOVE(&aio_freeproc, aiop, list);
		TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
		aiop->aioprocflags &= ~AIOP_FREE;
		wakeup(aiop->aioproc);
	} else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
	    ((ki->kaio_active_count + num_aio_resv_start) <
	    ki->kaio_maxactive_count)) {
		num_aio_resv_start++;
		if ((error = aio_newproc()) == 0) {
			num_aio_resv_start--;
			p->p_retval[0] = 0;
			goto retryproc;
		}
		num_aio_resv_start--;
	}
	return error;
}

/*
 * This routine queues an AIO request, checking for quotas.
 */
static int
aio_aqueue(struct proc *p, struct aiocb *job, int type)
{
	struct kaioinfo *ki;

	if (p->p_aioinfo == NULL) {
		aio_init_aioinfo(p);
	}

	if (num_queue_count >= max_queue_count)
		return EAGAIN;

	ki = p->p_aioinfo;
	if (ki->kaio_queue_count >= ki->kaio_qallowed_count)
		return EAGAIN;

	return _aio_aqueue(p, job, NULL, type);
}

/*
 * Support the aio_return system call; as a side effect, kernel
 * resources are released.
 */
int
aio_return(struct proc *p, struct aio_return_args *uap)
{
	int s;
	int jobref;
	struct aiocblist *cb, *ncb;
	struct aiocb *ujob;
	struct kaioinfo *ki;

	ki = p->p_aioinfo;
	if (ki == NULL) {
		return EINVAL;
	}

	ujob = uap->aiocbp;

	jobref = fuword(&ujob->_aiocb_private.kernelinfo);
	if (jobref == -1 || jobref == 0)
		return EINVAL;

	for (cb = TAILQ_FIRST(&ki->kaio_jobdone);
	    cb;
	    cb = TAILQ_NEXT(cb, plist)) {
		if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
			if (ujob == cb->uuaiocb) {
				p->p_retval[0] = cb->uaiocb._aiocb_private.status;
			} else {
				p->p_retval[0] = EFAULT;
			}
			if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
				curproc->p_stats->p_ru.ru_oublock += cb->outputcharge;
				cb->outputcharge = 0;
			} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
				curproc->p_stats->p_ru.ru_inblock += cb->inputcharge;
				cb->inputcharge = 0;
			}
			aio_free_entry(cb);
			return 0;
		}
	}

	s = splbio();
	for (cb = TAILQ_FIRST(&ki->kaio_bufdone);
	    cb;
	    cb = ncb) {
		ncb = TAILQ_NEXT(cb, plist);
		if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
			splx(s);
			if (ujob == cb->uuaiocb) {
				p->p_retval[0] = cb->uaiocb._aiocb_private.status;
			} else {
				p->p_retval[0] = EFAULT;
			}
			aio_free_entry(cb);
			return 0;
		}
	}
	splx(s);

	return (EINVAL);
}

/*
 * Allow a process to wake up when any of the I/O requests are
 * completed.
 */
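/*
 * Illustrative sketch (added, not part of the kernel): typical userland use
 * of aio_suspend() to block until one of several previously queued requests
 * completes, with a one-second timeout.  The control blocks and sizes are
 * arbitrary examples.
 *
 *	#include <aio.h>
 *	#include <errno.h>
 *
 *	const struct aiocb *list[2] = { &acb0, &acb1 };	// queued earlier
 *	struct timespec ts = { 1, 0 };			// 1 second
 *
 *	if (aio_suspend(list, 2, &ts) == 0) {
 *		// at least one request is done; find it with aio_error()
 *		// and reap its status with aio_return()
 *	} else if (errno == EAGAIN) {
 *		// the timeout expired before any request completed
 *	}
 */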
int
aio_suspend(struct proc *p, struct aio_suspend_args *uap)
{
	struct timeval atv;
	struct timespec ts;
	struct aiocb *const *cbptr, *cbp;
	struct kaioinfo *ki;
	struct aiocblist *cb;
	int i;
	int njoblist;
	int error, s, timo;
	int *ijoblist;
	struct aiocb **ujoblist;

	if (uap->nent >= AIO_LISTIO_MAX)
		return EINVAL;

	timo = 0;
	if (uap->timeout) {
		/*
		 * Get timespec struct
		 */
		if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0) {
			return error;
		}

		if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000)
			return (EINVAL);

		TIMESPEC_TO_TIMEVAL(&atv, &ts);
		if (itimerfix(&atv))
			return (EINVAL);
		timo = tvtohz(&atv);
	}

	ki = p->p_aioinfo;
	if (ki == NULL)
		return EAGAIN;

	njoblist = 0;
	ijoblist = zalloc(aiol_zone);
	ujoblist = zalloc(aiol_zone);
	cbptr = uap->aiocbp;

	for (i = 0; i < uap->nent; i++) {
		cbp = (struct aiocb *) (intptr_t) fuword((caddr_t) &cbptr[i]);
		if (cbp == 0)
			continue;
		ujoblist[njoblist] = cbp;
		ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo);
		njoblist++;
	}
	if (njoblist == 0) {
		zfree(aiol_zone, ijoblist);
		zfree(aiol_zone, ujoblist);
		return 0;
	}

	error = 0;
	while (1) {
		for (cb = TAILQ_FIRST(&ki->kaio_jobdone);
		    cb; cb = TAILQ_NEXT(cb, plist)) {
			for (i = 0; i < njoblist; i++) {
				if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
				    ijoblist[i]) {
					if (ujoblist[i] != cb->uuaiocb)
						error = EINVAL;
					zfree(aiol_zone, ijoblist);
					zfree(aiol_zone, ujoblist);
					return error;
				}
			}
		}

		s = splbio();
		for (cb = TAILQ_FIRST(&ki->kaio_bufdone);
		    cb; cb = TAILQ_NEXT(cb, plist)) {
			for (i = 0; i < njoblist; i++) {
				if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
				    ijoblist[i]) {
					splx(s);
					if (ujoblist[i] != cb->uuaiocb)
						error = EINVAL;
					zfree(aiol_zone, ijoblist);
					zfree(aiol_zone, ujoblist);
					return error;
				}
			}
		}

		ki->kaio_flags |= KAIO_WAKEUP;
		error = tsleep(p, PRIBIO|PCATCH, "aiospn", timo);
		splx(s);

		if (error == EINTR) {
			zfree(aiol_zone, ijoblist);
			zfree(aiol_zone, ujoblist);
			return EINTR;
		} else if (error == EWOULDBLOCK) {
			zfree(aiol_zone, ijoblist);
			zfree(aiol_zone, ujoblist);
			return EAGAIN;
		}
	}

	/* NOTREACHED */
	return EINVAL;
}

/*
 * aio_cancel at the kernel level is a NOOP right now.  It
 * might be possible to support it partially in user mode, or
 * in kernel mode later on.
 */
int
aio_cancel(struct proc *p, struct aio_cancel_args *uap)
{
	return ENOSYS;
}

/*
 * aio_error is implemented in the kernel level for compatibility
 * purposes only.  For a user mode async implementation, it would be
 * best to do it in a userland subroutine.
 */
int
aio_error(struct proc *p, struct aio_error_args *uap)
{
	int s;
	struct aiocblist *cb;
	struct kaioinfo *ki;
	int jobref;

	ki = p->p_aioinfo;
	if (ki == NULL)
		return EINVAL;

	jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo);
	if ((jobref == -1) || (jobref == 0))
		return EINVAL;

	for (cb = TAILQ_FIRST(&ki->kaio_jobdone);
	    cb;
	    cb = TAILQ_NEXT(cb, plist)) {

		if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
			p->p_retval[0] = cb->uaiocb._aiocb_private.error;
			return 0;
		}
	}

	for (cb = TAILQ_FIRST(&ki->kaio_jobqueue);
	    cb;
	    cb = TAILQ_NEXT(cb, plist)) {

		if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
			p->p_retval[0] = EINPROGRESS;
			return 0;
		}
	}

	s = splbio();
	for (cb = TAILQ_FIRST(&ki->kaio_bufdone);
	    cb;
	    cb = TAILQ_NEXT(cb, plist)) {
		if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
			p->p_retval[0] = cb->uaiocb._aiocb_private.error;
			splx(s);
			return 0;
		}
	}

	for (cb = TAILQ_FIRST(&ki->kaio_bufqueue);
	    cb;
	    cb = TAILQ_NEXT(cb, plist)) {
		if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
			p->p_retval[0] = EINPROGRESS;
			splx(s);
			return 0;
		}
	}
	splx(s);


	/*
	 * Hack for lio
	 */
	/*
	status = fuword(&uap->aiocbp->_aiocb_private.status);
	if (status == -1) {
		return fuword(&uap->aiocbp->_aiocb_private.error);
	}
	*/
	return EINVAL;
}

int
aio_read(struct proc *p, struct aio_read_args *uap)
{
	struct filedesc *fdp;
	struct file *fp;
	struct uio auio;
	struct iovec aiov;
	unsigned int fd;
	int cnt;
	struct aiocb iocb;
	int error, pmodes;

	pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes);
	if ((pmodes & AIO_PMODE_SYNC) == 0) {
		return aio_aqueue(p, (struct aiocb *) uap->aiocbp, LIO_READ);
	}

	/*
	 * Get control block
	 */
	if ((error = copyin((caddr_t) uap->aiocbp, (caddr_t) &iocb, sizeof iocb)) != 0)
		return error;

	/*
	 * Get the fd info for process
	 */
	fdp = p->p_fd;

	/*
	 * Range check file descriptor
	 */
	fd = iocb.aio_fildes;
	if (fd >= fdp->fd_nfiles)
		return EBADF;
	fp = fdp->fd_ofiles[fd];
	if ((fp == NULL) || ((fp->f_flag & FREAD) == 0))
		return EBADF;
	if (iocb.aio_offset == -1LL)
		return EINVAL;

	auio.uio_resid = iocb.aio_nbytes;
	if (auio.uio_resid < 0)
		return (EINVAL);

	/*
	 * Process sync simply -- queue async request.
	 */
	if ((iocb._aiocb_private.privatemodes & AIO_PMODE_SYNC) == 0) {
		return aio_aqueue(p, (struct aiocb *) uap->aiocbp, LIO_READ);
	}

	aiov.iov_base = (void *) iocb.aio_buf;
	aiov.iov_len = iocb.aio_nbytes;

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = iocb.aio_offset;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;

	cnt = iocb.aio_nbytes;
	error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, p);
	if (error &&
	    (auio.uio_resid != cnt) &&
	    (error == ERESTART || error == EINTR || error == EWOULDBLOCK))
		error = 0;
	cnt -= auio.uio_resid;
	p->p_retval[0] = cnt;
	return error;
}

int
aio_write(struct proc *p, struct aio_write_args *uap)
{
	struct filedesc *fdp;
	struct file *fp;
	struct uio auio;
	struct iovec aiov;
	unsigned int fd;
	int cnt;
	struct aiocb iocb;
	int error;
	int pmodes;

	/*
	 * Process sync simply -- queue async request.
	 */
	pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes);
	if ((pmodes & AIO_PMODE_SYNC) == 0) {
		return aio_aqueue(p, (struct aiocb *) uap->aiocbp, LIO_WRITE);
	}

	if ((error = copyin((caddr_t) uap->aiocbp, (caddr_t) &iocb, sizeof iocb)) != 0)
		return error;

	/*
	 * Get the fd info for process
	 */
	fdp = p->p_fd;

	/*
	 * Range check file descriptor
	 */
	fd = iocb.aio_fildes;
	if (fd >= fdp->fd_nfiles)
		return EBADF;
	fp = fdp->fd_ofiles[fd];
	if ((fp == NULL) || ((fp->f_flag & FWRITE) == 0))
		return EBADF;
	if (iocb.aio_offset == -1LL)
		return EINVAL;

	aiov.iov_base = (void *) iocb.aio_buf;
	aiov.iov_len = iocb.aio_nbytes;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = iocb.aio_offset;

	auio.uio_resid = iocb.aio_nbytes;
	if (auio.uio_resid < 0)
		return (EINVAL);

	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;

	cnt = iocb.aio_nbytes;
	error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, p);
	if (error) {
		if (auio.uio_resid != cnt) {
			if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
				error = 0;
			if (error == EPIPE)
				psignal(p, SIGPIPE);
		}
	}
	cnt -= auio.uio_resid;
	p->p_retval[0] = cnt;
	return error;
}

int
lio_listio(struct proc *p, struct lio_listio_args *uap)
{
	int nent, nentqueued;
	struct aiocb *iocb, * const *cbptr;
	struct aiocblist *cb;
	struct kaioinfo *ki;
	struct aio_liojob *lj;
	int error, runningcode;
	int nerror;
	int i;
	int s;

	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) {
		return EINVAL;
	}

	nent = uap->nent;
	if (nent > AIO_LISTIO_MAX) {
		return EINVAL;
	}

	if (p->p_aioinfo == NULL) {
		aio_init_aioinfo(p);
	}

	if ((nent + num_queue_count) > max_queue_count) {
		return EAGAIN;
	}

	ki = p->p_aioinfo;
	if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count) {
		return EAGAIN;
	}

	lj = zalloc(aiolio_zone);
	if (!lj) {
		return EAGAIN;
	}

	lj->lioj_flags = 0;
	lj->lioj_buffer_count = 0;
	lj->lioj_buffer_finished_count = 0;
	lj->lioj_queue_count = 0;
	lj->lioj_queue_finished_count = 0;
	lj->lioj_ki = ki;
	TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);

	/*
	 * Setup signal
	 */
	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
		error = copyin(uap->sig, &lj->lioj_signal, sizeof lj->lioj_signal);
		if (error)
			return error;
		lj->lioj_flags |= LIOJ_SIGNAL;
		lj->lioj_flags &= ~LIOJ_SIGNAL_POSTED;
	} else {
		lj->lioj_flags &= ~LIOJ_SIGNAL;
	}

	/*
	 * get pointers to the list of I/O requests
	 */

	nerror = 0;
	nentqueued = 0;
	cbptr = uap->acb_list;
	for (i = 0; i < uap->nent; i++) {
		iocb = (struct aiocb *) (intptr_t) fuword((caddr_t) &cbptr[i]);
		if (((intptr_t) iocb != -1) && ((intptr_t) iocb != NULL)) {
			error = _aio_aqueue(p, iocb, lj, 0);
			if (error == 0) {
				nentqueued++;
			} else {
				nerror++;
			}
		}
	}

	/*
	 * If we haven't queued any, then just return error
	 */
	if (nentqueued == 0) {
		return 0;
	}

	/*
	 * Calculate the appropriate error return
	 */
	runningcode = 0;
	if (nerror)
		runningcode = EIO;

	if (uap->mode == LIO_WAIT) {
		while (1) {
			int found;
			found = 0;
			for (i = 0; i < uap->nent; i++) {
				int jobref, command;

				/*
				 * Fetch address of the control buf pointer in user space
				 */
				iocb = (struct aiocb *) (intptr_t) fuword((caddr_t) &cbptr[i]);
				if (((intptr_t) iocb == -1) || ((intptr_t) iocb == 0))
					continue;

				/*
				 * Fetch the associated command from user space
				 */
				command = fuword(&iocb->aio_lio_opcode);
				if (command == LIO_NOP) {
					found++;
					continue;
				}

				jobref = fuword(&iocb->_aiocb_private.kernelinfo);

				for (cb = TAILQ_FIRST(&ki->kaio_jobdone);
				    cb;
				    cb = TAILQ_NEXT(cb, plist)) {
					if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
					    jobref) {
						if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
							curproc->p_stats->p_ru.ru_oublock +=
							    cb->outputcharge;
							cb->outputcharge = 0;
						} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
							curproc->p_stats->p_ru.ru_inblock +=
							    cb->inputcharge;
							cb->inputcharge = 0;
						}
						found++;
						break;
					}
				}

				s = splbio();
				for (cb = TAILQ_FIRST(&ki->kaio_bufdone);
				    cb;
				    cb = TAILQ_NEXT(cb, plist)) {
					if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
					    jobref) {
						found++;
						break;
					}
				}
				splx(s);

			}

			/*
			 * If all I/Os have been disposed of, then we can return
			 */
			if (found == nentqueued) {
				return runningcode;
			}

			ki->kaio_flags |= KAIO_WAKEUP;
			error = tsleep(p, PRIBIO|PCATCH, "aiospn", 0);

			if (error == EINTR) {
				return EINTR;
			} else if (error == EWOULDBLOCK) {
				return EAGAIN;
			}

		}
	}

	return runningcode;
}

/*
 * This is a weird hack so that we can post a signal.  It is safe
 * to do so from a timeout routine, but *not* from an interrupt routine.
 */
static void
process_signal(void *ljarg)
{
	struct aio_liojob *lj = ljarg;

	if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) {
		if (lj->lioj_queue_count == lj->lioj_queue_finished_count) {
			psignal(lj->lioj_ki->kaio_p, lj->lioj_signal.sigev_signo);
			lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
		}
	}
}
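
/*
 * Illustrative sketch (added, not part of the kernel): userland use of
 * lio_listio() in LIO_NOWAIT mode with completion notification by signal,
 * the case handled by the LIOJ_SIGNAL logic above.  The control blocks,
 * buffers, and signal number are arbitrary examples; rd/wr are assumed to
 * have aio_fildes, aio_buf, aio_nbytes, aio_offset, and aio_lio_opcode
 * (LIO_READ/LIO_WRITE) filled in elsewhere.
 *
 *	#include <aio.h>
 *	#include <signal.h>
 *
 *	struct aiocb rd, wr;
 *	struct aiocb *list[2] = { &rd, &wr };
 *	struct sigevent sev;
 *
 *	sev.sigev_notify = SIGEV_SIGNAL;
 *	sev.sigev_signo = SIGUSR1;		// delivered when all jobs finish
 *	if (lio_listio(LIO_NOWAIT, list, 2, &sev) != 0)
 *		;				// some requests failed to queue
 */
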
/*
 * Interrupt handler for physio, performs the necessary process wakeups,
 * and signals.
 */
static void
aio_physwakeup(bp)
	struct buf *bp;
{
	struct aiocblist *aiocbe;
	struct proc *p;
	struct kaioinfo *ki;
	struct aio_liojob *lj;
	int s;

	s = splbio();

	wakeup((caddr_t) bp);
	bp->b_flags &= ~B_CALL;
	bp->b_flags |= B_DONE;

	aiocbe = (struct aiocblist *)bp->b_spc;
	if (aiocbe) {
		p = bp->b_caller1;

		aiocbe->jobstate = JOBST_JOBBFINISHED;
		aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
		aiocbe->uaiocb._aiocb_private.error = 0;
		aiocbe->jobflags |= AIOCBLIST_DONE;

		if (bp->b_flags & B_ERROR) {
			aiocbe->uaiocb._aiocb_private.error = bp->b_error;
		}

		lj = aiocbe->lio;
		if (lj) {
			lj->lioj_buffer_finished_count++;
			/*
			 * wakeup/signal if all of the interrupt jobs are done
			 */
			if (lj->lioj_buffer_finished_count == lj->lioj_buffer_count) {
				/*
				 * post a signal if it is called for
				 */
				if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) ==
				    LIOJ_SIGNAL) {
					lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
					timeout(process_signal, lj, 0);
				}
			}
		}

		ki = p->p_aioinfo;
		if (ki) {
			ki->kaio_buffer_finished_count++;
			TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
			TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
			TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
			/*
			 * and do the wakeup
			 */
			if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) {
				ki->kaio_flags &= ~KAIO_WAKEUP;
				wakeup(p);
			}
		}
	}
	splx(s);
}