/*
 * Copyright (c) 1997 John S. Dyson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. John S. Dyson's name may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * DISCLAIMER:  This code isn't warranted to do anything useful.  Anything
 * bad that happens because of using this software isn't the responsibility
 * of the author.  This software is distributed AS-IS.
 *
 * $FreeBSD$
 */

/*
 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/lock.h>
#include <sys/unistd.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/conf.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_zone.h>
#include <sys/aio.h>

#include <machine/limits.h>

static long jobrefid;

#define JOBST_NULL		0x0
#define JOBST_JOBQPROC		0x1
#define JOBST_JOBQGLOBAL	0x2
#define JOBST_JOBRUNNING	0x3
#define JOBST_JOBFINISHED	0x4
#define JOBST_JOBQBUF		0x5
#define JOBST_JOBBFINISHED	0x6
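
/*
 * Rough lifecycle of a job, as the states are used below: a queued daemon
 * job goes JOBST_NULL -> JOBST_JOBQGLOBAL (JOBST_JOBQPROC covers jobs
 * parked on a particular daemon's jobtorun list) -> JOBST_JOBRUNNING while
 * a daemon services it -> JOBST_JOBFINISHED once it sits on the per-process
 * done queue.  Physio jobs instead go JOBST_JOBQBUF -> JOBST_JOBBFINISHED
 * when the buffer I/O completes.  aio_free_entry() puts a job back to
 * JOBST_NULL when it is returned to the free pool.
 */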
#ifndef MAX_AIO_PER_PROC
#define MAX_AIO_PER_PROC	32
#endif

#ifndef MAX_AIO_QUEUE_PER_PROC
#define MAX_AIO_QUEUE_PER_PROC	256 /* Bigger than AIO_LISTIO_MAX */
#endif

#ifndef MAX_AIO_PROCS
#define MAX_AIO_PROCS		32
#endif

#ifndef MAX_AIO_QUEUE
#define MAX_AIO_QUEUE		1024 /* Bigger than AIO_LISTIO_MAX */
#endif

#ifndef TARGET_AIO_PROCS
#define TARGET_AIO_PROCS	0
#endif

#ifndef MAX_BUF_AIO
#define MAX_BUF_AIO		16
#endif

#ifndef AIOD_TIMEOUT_DEFAULT
#define AIOD_TIMEOUT_DEFAULT	(10 * hz)
#endif

#ifndef AIOD_LIFETIME_DEFAULT
#define AIOD_LIFETIME_DEFAULT	(30 * hz)
#endif

static int max_aio_procs = MAX_AIO_PROCS;
static int num_aio_procs = 0;
static int target_aio_procs = TARGET_AIO_PROCS;
static int max_queue_count = MAX_AIO_QUEUE;
static int num_queue_count = 0;
static int num_buf_aio = 0;
static int num_aio_resv_start = 0;
static int aiod_timeout;
static int aiod_lifetime;

static int max_aio_per_proc = MAX_AIO_PER_PROC,
	max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;

static int max_buf_aio = MAX_BUF_AIO;

SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "AIO mgmt");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc,
	CTLFLAG_RW, &max_aio_per_proc, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc,
	CTLFLAG_RW, &max_aio_queue_per_proc, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs,
	CTLFLAG_RW, &max_aio_procs, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs,
	CTLFLAG_RD, &num_aio_procs, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count,
	CTLFLAG_RD, &num_queue_count, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue,
	CTLFLAG_RW, &max_queue_count, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs,
	CTLFLAG_RW, &target_aio_procs, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio,
	CTLFLAG_RW, &max_buf_aio, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio,
	CTLFLAG_RD, &num_buf_aio, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime,
	CTLFLAG_RW, &aiod_lifetime, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout,
	CTLFLAG_RW, &aiod_timeout, 0, "");
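
/*
 * The knobs above appear under the vfs.aio sysctl tree: the limits and the
 * daemon timeouts are read-write, while the num_* counters are read-only.
 * For example, an administrator could raise the per-process job limit or
 * inspect the current state with sysctl(8):
 *
 *	sysctl vfs.aio.max_aio_per_proc=64
 *	sysctl vfs.aio
 */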
/*
 * Job queue item
 */

#define AIOCBLIST_CANCELLED	0x1
#define AIOCBLIST_RUNDOWN	0x4
#define AIOCBLIST_ASYNCFREE	0x8
#define AIOCBLIST_DONE		0x10

struct aiocblist {
	TAILQ_ENTRY (aiocblist) list;	/* List of jobs */
	TAILQ_ENTRY (aiocblist) plist;	/* List of jobs for proc */
	int	jobflags;
	int	jobstate;
	int	inputcharge, outputcharge;
	struct	buf *bp;		/* buffer pointer */
	struct	proc *userproc;		/* User process */
	struct	aioproclist *jobaioproc; /* AIO process descriptor */
	struct	aio_liojob *lio;	/* optional lio job */
	struct	aiocb *uuaiocb;		/* pointer in userspace of aiocb */
	struct	aiocb uaiocb;		/* Kernel I/O control block */
};

/*
 * AIO process info
 */
#define AIOP_FREE	0x1		/* proc on free queue */
#define AIOP_SCHED	0x2		/* proc explicitly scheduled */

struct aioproclist {
	int	aioprocflags;		/* AIO proc flags */
	TAILQ_ENTRY(aioproclist) list;	/* List of processes */
	struct	proc *aioproc;		/* The AIO thread */
	TAILQ_HEAD (,aiocblist) jobtorun; /* suggested job to run */
};

/*
 * data-structure for lio signal management
 */
struct aio_liojob {
	int	lioj_flags;
	int	lioj_buffer_count;
	int	lioj_buffer_finished_count;
	int	lioj_queue_count;
	int	lioj_queue_finished_count;
	struct	sigevent lioj_signal;	/* signal on all I/O done */
	TAILQ_ENTRY (aio_liojob) lioj_list;
	struct	kaioinfo *lioj_ki;
};
#define	LIOJ_SIGNAL		0x1	/* signal on all done (lio) */
#define	LIOJ_SIGNAL_POSTED	0x2	/* signal has been posted */

/*
 * per process aio data structure
 */
struct kaioinfo {
	int	kaio_flags;		/* per process kaio flags */
	int	kaio_maxactive_count;	/* maximum number of AIOs */
	int	kaio_active_count;	/* number of currently used AIOs */
	int	kaio_qallowed_count;	/* maximum size of AIO queue */
	int	kaio_queue_count;	/* size of AIO queue */
	int	kaio_ballowed_count;	/* maximum number of buffers */
	int	kaio_queue_finished_count; /* number of daemon jobs finished */
	int	kaio_buffer_count;	/* number of physio buffers */
	int	kaio_buffer_finished_count; /* count of I/O done */
	struct	proc *kaio_p;		/* process that uses this kaio block */
	TAILQ_HEAD (,aio_liojob) kaio_liojoblist; /* list of lio jobs */
	TAILQ_HEAD (,aiocblist) kaio_jobqueue;	/* job queue for process */
	TAILQ_HEAD (,aiocblist) kaio_jobdone;	/* done queue for process */
	TAILQ_HEAD (,aiocblist) kaio_bufqueue;	/* buffer job queue for process */
	TAILQ_HEAD (,aiocblist) kaio_bufdone;	/* buffer done queue for process */
};

#define KAIO_RUNDOWN	0x1	/* process is being run down */
#define KAIO_WAKEUP	0x2	/* wakeup process when there is a
				   significant event */

static TAILQ_HEAD (,aioproclist) aio_freeproc, aio_activeproc;
static TAILQ_HEAD(,aiocblist) aio_jobs;		/* Async job list */
static TAILQ_HEAD(,aiocblist) aio_bufjobs;	/* Phys I/O job list */
static TAILQ_HEAD(,aiocblist) aio_freejobs;	/* Pool of free jobs */

static void	aio_init_aioinfo(struct proc *p);
static void	aio_onceonly(void *);
static int	aio_free_entry(struct aiocblist *aiocbe);
static void	aio_process(struct aiocblist *aiocbe);
static int	aio_newproc(void);
static int	aio_aqueue(struct proc *p, struct aiocb *job, int type);
static void	aio_physwakeup(struct buf *bp);
static int	aio_fphysio(struct proc *p, struct aiocblist *aiocbe, int type);
static int	aio_qphysio(struct proc *p, struct aiocblist *iocb);
static void	aio_daemon(void *uproc);

SYSINIT(aio, SI_SUB_VFS, SI_ORDER_ANY, aio_onceonly, NULL);

static vm_zone_t kaio_zone = 0, aiop_zone = 0,
	aiocb_zone = 0, aiol_zone = 0, aiolio_zone = 0;

/*
 * Startup initialization
 */
void
aio_onceonly(void *na)
{
	TAILQ_INIT(&aio_freeproc);
	TAILQ_INIT(&aio_activeproc);
	TAILQ_INIT(&aio_jobs);
	TAILQ_INIT(&aio_bufjobs);
	TAILQ_INIT(&aio_freejobs);
	kaio_zone = zinit("AIO", sizeof (struct kaioinfo), 0, 0, 1);
	aiop_zone = zinit("AIOP", sizeof (struct aioproclist), 0, 0, 1);
	aiocb_zone = zinit("AIOCB", sizeof (struct aiocblist), 0, 0, 1);
	aiol_zone = zinit("AIOL", AIO_LISTIO_MAX * sizeof (int), 0, 0, 1);
	aiolio_zone = zinit("AIOLIO",
	    AIO_LISTIO_MAX * sizeof (struct aio_liojob), 0, 0, 1);
	aiod_timeout = AIOD_TIMEOUT_DEFAULT;
	aiod_lifetime = AIOD_LIFETIME_DEFAULT;
	jobrefid = 1;
}

/*
 * Init the per-process aioinfo structure.
 * The aioinfo limits are set per-process for user limit (resource) management.
 */
void
aio_init_aioinfo(struct proc *p)
{
	struct kaioinfo *ki;

	if (p->p_aioinfo == NULL) {
		ki = zalloc(kaio_zone);
		p->p_aioinfo = ki;
		ki->kaio_flags = 0;
		ki->kaio_maxactive_count = max_aio_per_proc;
		ki->kaio_active_count = 0;
		ki->kaio_qallowed_count = max_aio_queue_per_proc;
		ki->kaio_queue_count = 0;
		ki->kaio_ballowed_count = max_buf_aio;
		ki->kaio_buffer_count = 0;
		ki->kaio_buffer_finished_count = 0;
		ki->kaio_p = p;
		TAILQ_INIT(&ki->kaio_jobdone);
		TAILQ_INIT(&ki->kaio_jobqueue);
		TAILQ_INIT(&ki->kaio_bufdone);
		TAILQ_INIT(&ki->kaio_bufqueue);
		TAILQ_INIT(&ki->kaio_liojoblist);
	}
}
/*
 * Free a job entry.  Wait for completion if it is currently
 * active, but don't delay forever.  If we delay, we return
 * a flag that says that we have to restart the queue scan.
 */
int
aio_free_entry(struct aiocblist *aiocbe)
{
	struct kaioinfo *ki;
	struct aioproclist *aiop;
	struct aio_liojob *lj;
	struct proc *p;
	int error;
	int s;

	if (aiocbe->jobstate == JOBST_NULL)
		panic("aio_free_entry: freeing already free job");

	p = aiocbe->userproc;
	ki = p->p_aioinfo;
	lj = aiocbe->lio;
	if (ki == NULL)
		panic("aio_free_entry: missing p->p_aioinfo");

	if (aiocbe->jobstate == JOBST_JOBRUNNING) {
		if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE)
			return 0;
		aiocbe->jobflags |= AIOCBLIST_RUNDOWN;
		tsleep(aiocbe, PRIBIO|PCATCH, "jobwai", 0);
	}
	aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;

	if (aiocbe->bp == NULL) {
		if (ki->kaio_queue_count <= 0)
			panic("aio_free_entry: process queue size <= 0");
		if (num_queue_count <= 0)
			panic("aio_free_entry: system wide queue size <= 0");

		if (lj) {
			lj->lioj_queue_count--;
			if (aiocbe->jobflags & AIOCBLIST_DONE)
				lj->lioj_queue_finished_count--;
		}
		ki->kaio_queue_count--;
		if (aiocbe->jobflags & AIOCBLIST_DONE)
			ki->kaio_queue_finished_count--;
		num_queue_count--;

	} else {
		if (lj) {
			lj->lioj_buffer_count--;
			if (aiocbe->jobflags & AIOCBLIST_DONE)
				lj->lioj_buffer_finished_count--;
		}
		if (aiocbe->jobflags & AIOCBLIST_DONE)
			ki->kaio_buffer_finished_count--;
		ki->kaio_buffer_count--;
		num_buf_aio--;

	}

	if ((ki->kaio_flags & KAIO_WAKEUP) ||
	    ((ki->kaio_flags & KAIO_RUNDOWN) &&
	    ((ki->kaio_buffer_count == 0) &&
	    (ki->kaio_queue_count == 0)))) {
		ki->kaio_flags &= ~KAIO_WAKEUP;
		wakeup(p);
	}

	if (aiocbe->jobstate == JOBST_JOBQBUF) {
		if ((error = aio_fphysio(p, aiocbe, 1)) != 0)
			return error;
		if (aiocbe->jobstate != JOBST_JOBBFINISHED)
			panic("aio_free_entry: invalid physio finish-up state");
		s = splbio();
		TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
		splx(s);
	} else if (aiocbe->jobstate == JOBST_JOBQPROC) {
		aiop = aiocbe->jobaioproc;
		TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list);
	} else if (aiocbe->jobstate == JOBST_JOBQGLOBAL) {
		TAILQ_REMOVE(&aio_jobs, aiocbe, list);
	} else if (aiocbe->jobstate == JOBST_JOBFINISHED) {
		TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist);
	} else if (aiocbe->jobstate == JOBST_JOBBFINISHED) {
		s = splbio();
		TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
		splx(s);
		if (aiocbe->bp) {
			vunmapbuf(aiocbe->bp);
			relpbuf(aiocbe->bp, NULL);
			aiocbe->bp = NULL;
		}
	}
	if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) {
		TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
		zfree(aiolio_zone, lj);
	}
	TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
	aiocbe->jobstate = JOBST_NULL;
	return 0;
}
/*
 * Rundown the jobs for a given process.
 */
void
aio_proc_rundown(struct proc *p)
{
	int s;
	struct kaioinfo *ki;
	struct aio_liojob *lj, *ljn;
	struct aiocblist *aiocbe, *aiocbn;

	ki = p->p_aioinfo;
	if (ki == NULL)
		return;

	ki->kaio_flags |= LIOJ_SIGNAL_POSTED;
	while ((ki->kaio_active_count > 0) ||
	    (ki->kaio_buffer_count > ki->kaio_buffer_finished_count)) {
		ki->kaio_flags |= KAIO_RUNDOWN;
		if (tsleep(p, PRIBIO, "kaiowt", aiod_timeout))
			break;
	}

restart1:
	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobdone);
	     aiocbe;
	     aiocbe = aiocbn) {
		aiocbn = TAILQ_NEXT(aiocbe, plist);
		if (aio_free_entry(aiocbe))
			goto restart1;
	}

restart2:
	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue);
	     aiocbe;
	     aiocbe = aiocbn) {
		aiocbn = TAILQ_NEXT(aiocbe, plist);
		if (aio_free_entry(aiocbe))
			goto restart2;
	}

	/*
	 * Note the use of lots of splbio here, trying to avoid
	 * splbio for long chains of I/O.  Probably unnecessary.
	 */

restart3:
	s = splbio();
	while (TAILQ_FIRST(&ki->kaio_bufqueue)) {
		ki->kaio_flags |= KAIO_WAKEUP;
		tsleep(p, PRIBIO, "aioprn", 0);
		splx(s);
		goto restart3;
	}
	splx(s);

restart4:
	s = splbio();
	for (aiocbe = TAILQ_FIRST(&ki->kaio_bufdone);
	     aiocbe;
	     aiocbe = aiocbn) {
		aiocbn = TAILQ_NEXT(aiocbe, plist);
		if (aio_free_entry(aiocbe)) {
			splx(s);
			goto restart4;
		}
	}
	splx(s);

	for (lj = TAILQ_FIRST(&ki->kaio_liojoblist);
	     lj;
	     lj = ljn) {
		ljn = TAILQ_NEXT(lj, lioj_list);
		if ((lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) {
			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
			zfree(aiolio_zone, lj);
		} else {
#if defined(DIAGNOSTIC)
			printf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, QF:%d\n",
			    lj->lioj_buffer_count, lj->lioj_buffer_finished_count,
			    lj->lioj_queue_count, lj->lioj_queue_finished_count);
#endif
		}
	}

	zfree(kaio_zone, ki);
	p->p_aioinfo = NULL;
}

/*
 * Select a job to run (called by an AIO daemon)
 */
static struct aiocblist *
aio_selectjob(struct aioproclist *aiop)
{
	struct aiocblist *aiocbe;

	aiocbe = TAILQ_FIRST(&aiop->jobtorun);
	if (aiocbe) {
		TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list);
		return aiocbe;
	}

	for (aiocbe = TAILQ_FIRST(&aio_jobs);
	     aiocbe;
	     aiocbe = TAILQ_NEXT(aiocbe, list)) {
		struct kaioinfo *ki;
		struct proc *userp;

		userp = aiocbe->userproc;
		ki = userp->p_aioinfo;

		if (ki->kaio_active_count < ki->kaio_maxactive_count) {
			TAILQ_REMOVE(&aio_jobs, aiocbe, list);
			return aiocbe;
		}
	}

	return NULL;
}
/*
 * The AIO processing activity.  This is the code that does the
 * I/O request for the non-physio version of the operations.  The
 * normal vn operations are used, and this code should work in
 * all instances for every type of file, including pipes, sockets,
 * fifos, and regular files.
 */
void
aio_process(struct aiocblist *aiocbe)
{
	struct filedesc *fdp;
	struct proc *userp, *mycp;
	struct aiocb *cb;
	struct file *fp;
	struct uio auio;
	struct iovec aiov;
	unsigned int fd;
	int cnt;
	int error;
	off_t offset;
	int oublock_st, oublock_end;
	int inblock_st, inblock_end;

	userp = aiocbe->userproc;
	cb = &aiocbe->uaiocb;

	mycp = curproc;

	fdp = mycp->p_fd;
	fd = cb->aio_fildes;
	fp = fdp->fd_ofiles[fd];

	aiov.iov_base = (void *) cb->aio_buf;
	aiov.iov_len = cb->aio_nbytes;

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset = cb->aio_offset;
	auio.uio_resid = cb->aio_nbytes;
	cnt = cb->aio_nbytes;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = mycp;

	inblock_st = mycp->p_stats->p_ru.ru_inblock;
	oublock_st = mycp->p_stats->p_ru.ru_oublock;
	if (cb->aio_lio_opcode == LIO_READ) {
		auio.uio_rw = UIO_READ;
		error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, mycp);
	} else {
		auio.uio_rw = UIO_WRITE;
		error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, mycp);
	}
	inblock_end = mycp->p_stats->p_ru.ru_inblock;
	oublock_end = mycp->p_stats->p_ru.ru_oublock;

	aiocbe->inputcharge = inblock_end - inblock_st;
	aiocbe->outputcharge = oublock_end - oublock_st;

	if (error) {
		if (auio.uio_resid != cnt) {
			if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
				error = 0;
			if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE))
				psignal(userp, SIGPIPE);
		}
	}

	cnt -= auio.uio_resid;
	cb->_aiocb_private.error = error;
	cb->_aiocb_private.status = cnt;

	return;
}
/*
 * The AIO daemon, most of the actual work is done in aio_process,
 * but the setup (and address space mgmt) is done in this routine.
 */
static void
aio_daemon(void *uproc)
{
	int s;
	struct aioproclist *aiop;
	struct vmspace *myvm;
	struct proc *mycp;

	/*
	 * Local copies of curproc (cp) and vmspace (myvm)
	 */
	mycp = curproc;
	myvm = mycp->p_vmspace;

	if (mycp->p_textvp) {
		vrele(mycp->p_textvp);
		mycp->p_textvp = NULL;
	}

	/*
	 * Allocate and ready the aio control info.  There is one
	 * aiop structure per daemon.
	 */
	aiop = zalloc(aiop_zone);
	aiop->aioproc = mycp;
	aiop->aioprocflags |= AIOP_FREE;
	TAILQ_INIT(&aiop->jobtorun);

	/*
	 * Place thread (lightweight process) onto the AIO free thread list
	 */
	if (TAILQ_EMPTY(&aio_freeproc))
		wakeup(&aio_freeproc);
	TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);

	/*
	 * Make up a name for the daemon
	 */
	strcpy(mycp->p_comm, "aiod");

	/*
	 * Get rid of our current filedescriptors.  AIOD's don't need any
	 * filedescriptors, except as temporarily inherited from the client.
	 * Credentials are also cloned, and made equivalent to "root."
	 */
	fdfree(mycp);
	mycp->p_fd = NULL;
	mycp->p_ucred = crcopy(mycp->p_ucred);
	mycp->p_ucred->cr_uid = 0;
	mycp->p_ucred->cr_ngroups = 1;
	mycp->p_ucred->cr_groups[0] = 1;

	/*
	 * The daemon resides in its own pgrp.
	 */
	enterpgrp(mycp, mycp->p_pid, 1);

	/*
	 * Mark special process type
	 */
	mycp->p_flag |= P_SYSTEM|P_KTHREADP;

	/*
	 * Wakeup parent process.  (Parent sleeps to keep from blasting away
	 * and creating too many daemons.)
	 */
	wakeup(mycp);

	while (1) {
		struct proc *curcp;
		struct aiocblist *aiocbe;

		/*
		 * curcp is the current daemon process context.
		 * userp is the current user process context.
		 */
		curcp = mycp;

		/*
		 * Take daemon off of free queue
		 */
		if (aiop->aioprocflags & AIOP_FREE) {
			TAILQ_REMOVE(&aio_freeproc, aiop, list);
			TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
			aiop->aioprocflags &= ~AIOP_FREE;
		}
		aiop->aioprocflags &= ~AIOP_SCHED;

		/*
		 * Check for jobs
		 */
		while ((aiocbe = aio_selectjob(aiop)) != NULL) {
			struct proc *userp;
			struct aiocb *cb;
			struct kaioinfo *ki;
			struct aio_liojob *lj;

			cb = &aiocbe->uaiocb;
			userp = aiocbe->userproc;

			aiocbe->jobstate = JOBST_JOBRUNNING;

			/*
			 * Connect to process address space for user program
			 */
			if (userp != curcp) {
				struct vmspace *tmpvm;
				/*
				 * Save the current address space that we are
				 * connected to.
				 */
				tmpvm = mycp->p_vmspace;
				/*
				 * Point to the new user address space, and
				 * refer to it.
				 */
				mycp->p_vmspace = userp->p_vmspace;
				mycp->p_vmspace->vm_refcnt++;
				/*
				 * Activate the new mapping.
				 */
				pmap_activate(mycp);
				/*
				 * If the old address space wasn't the daemon's
				 * own address space, then we need to remove the
				 * daemon's reference from the other process
				 * that it was acting on behalf of.
				 */
				if (tmpvm != myvm) {
					vmspace_free(tmpvm);
				}
				/*
				 * Disassociate from the previous client's file
				 * descriptors, and associate to the new client's
				 * descriptors.  Note that the daemon doesn't
				 * need to worry about its original descriptors,
				 * because they were originally freed.
				 */
				if (mycp->p_fd)
					fdfree(mycp);
				mycp->p_fd = fdshare(userp);
				curcp = userp;
			}

			ki = userp->p_aioinfo;
			lj = aiocbe->lio;

			/*
			 * Account for currently active jobs
			 */
			ki->kaio_active_count++;

			/*
			 * Do the I/O function
			 */
			aiocbe->jobaioproc = aiop;
			aio_process(aiocbe);

			/*
			 * Decrement the active job count
			 */
			ki->kaio_active_count--;

			/*
			 * Increment the completion count for wakeup/signal
			 * comparisons
			 */
			aiocbe->jobflags |= AIOCBLIST_DONE;
			ki->kaio_queue_finished_count++;
			if (lj) {
				lj->lioj_queue_finished_count++;
			}
			if ((ki->kaio_flags & KAIO_WAKEUP) ||
			    ((ki->kaio_flags & KAIO_RUNDOWN) &&
			    (ki->kaio_active_count == 0))) {
				ki->kaio_flags &= ~KAIO_WAKEUP;
				wakeup(userp);
			}

			s = splbio();
			if (lj && (lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) ==
			    LIOJ_SIGNAL) {
				if ((lj->lioj_queue_finished_count == lj->lioj_queue_count) &&
				    (lj->lioj_buffer_finished_count == lj->lioj_buffer_count)) {
					psignal(userp, lj->lioj_signal.sigev_signo);
					lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
				}
			}
			splx(s);

			aiocbe->jobstate = JOBST_JOBFINISHED;
			/*
			 * If the I/O request should be automatically rundown,
			 * do the needed cleanup.  Otherwise, place the queue
			 * entry for the just finished I/O request into the done
			 * queue for the associated client.
			 */
			if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) {
				aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;
				TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
			} else {
				TAILQ_REMOVE(&ki->kaio_jobqueue,
				    aiocbe, plist);
				TAILQ_INSERT_TAIL(&ki->kaio_jobdone,
				    aiocbe, plist);
			}

			if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) {
				wakeup(aiocbe);
				aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN;
			}

			if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
				psignal(userp, cb->aio_sigevent.sigev_signo);
			}
		}

		/*
		 * Disconnect from user address space
		 */
		if (curcp != mycp) {
			struct vmspace *tmpvm;
			/*
			 * Get the user address space to disconnect from.
			 */
			tmpvm = mycp->p_vmspace;
			/*
			 * Get original address space for daemon.
			 */
			mycp->p_vmspace = myvm;
			/*
			 * Activate the daemon's address space.
			 */
			pmap_activate(mycp);
#if defined(DIAGNOSTIC)
			if (tmpvm == myvm)
				printf("AIOD: vmspace problem -- %d\n", mycp->p_pid);
#endif
			/*
			 * Remove our vmspace reference.
			 */
			vmspace_free(tmpvm);
			/*
			 * Disassociate from the user process's file descriptors.
			 */
			if (mycp->p_fd)
				fdfree(mycp);
			mycp->p_fd = NULL;
			curcp = mycp;
		}

		/*
		 * If we are the first to be put onto the free queue, wakeup
		 * anyone waiting for a daemon.
		 */
		TAILQ_REMOVE(&aio_activeproc, aiop, list);
		if (TAILQ_EMPTY(&aio_freeproc))
			wakeup(&aio_freeproc);
		TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
		aiop->aioprocflags |= AIOP_FREE;

		/*
		 * If daemon is inactive for a long time, allow it to exit,
		 * thereby freeing resources.
		 */
		if (((aiop->aioprocflags & AIOP_SCHED) == 0) &&
		    tsleep(mycp, PRIBIO, "aiordy", aiod_lifetime)) {
			if ((TAILQ_FIRST(&aio_jobs) == NULL) &&
			    (TAILQ_FIRST(&aiop->jobtorun) == NULL)) {
				if ((aiop->aioprocflags & AIOP_FREE) &&
				    (num_aio_procs > target_aio_procs)) {
					TAILQ_REMOVE(&aio_freeproc, aiop, list);
					zfree(aiop_zone, aiop);
					num_aio_procs--;
#if defined(DIAGNOSTIC)
					if (mycp->p_vmspace->vm_refcnt <= 1)
						printf("AIOD: bad vm refcnt for exiting daemon: %d\n",
						    mycp->p_vmspace->vm_refcnt);
#endif
					exit1(mycp, 0);
				}
			}
		}
	}
}

/*
 * Create a new AIO daemon.  This is mostly a kernel-thread fork routine.
 * The AIO daemon modifies its environment itself.
 */
static int
aio_newproc()
{
	int error;
	struct proc *p, *np;

	p = &proc0;
	error = fork1(p, RFPROC|RFMEM|RFNOWAIT, &np);
	if (error)
		return error;
	cpu_set_fork_handler(np, aio_daemon, curproc);

	/*
	 * Wait until daemon is started, but continue on just in case (to
	 * handle error conditions).
	 */
	error = tsleep(np, PZERO, "aiosta", aiod_timeout);
	num_aio_procs++;

	return error;
}
/*
 * Try the high-performance physio method for eligible VCHR devices.  This
 * routine doesn't require the use of any additional threads, and thus has
 * low overhead.
 */
int
aio_qphysio(p, aiocbe)
	struct proc *p;
	struct aiocblist *aiocbe;
{
	int error;
	struct aiocb *cb;
	struct file *fp;
	struct buf *bp;
	struct vnode *vp;
	struct kaioinfo *ki;
	struct filedesc *fdp;
	struct aio_liojob *lj;
	int fd;
	int s;
	int cnt;

	cb = &aiocbe->uaiocb;
	fdp = p->p_fd;
	fd = cb->aio_fildes;
	fp = fdp->fd_ofiles[fd];

	if (fp->f_type != DTYPE_VNODE)
		return (-1);

	vp = (struct vnode *)fp->f_data;

	if (!vn_isdisk(vp))
		return (-1);

	if (cb->aio_nbytes % vp->v_rdev->si_bsize_phys)
		return (-1);

	if ((cb->aio_nbytes > MAXPHYS) && (num_buf_aio >= max_buf_aio))
		return (-1);

	ki = p->p_aioinfo;
	if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
		return (-1);

	cnt = cb->aio_nbytes;
	if (cnt > MAXPHYS)
		return (-1);

	/*
	 * Physical I/O is charged directly to the process, so we don't have
	 * to fake it.
	 */
	aiocbe->inputcharge = 0;
	aiocbe->outputcharge = 0;

	ki->kaio_buffer_count++;

	lj = aiocbe->lio;
	if (lj) {
		lj->lioj_buffer_count++;
	}

	/* create and build a buffer header for a transfer */
	bp = (struct buf *)getpbuf(NULL);

	/*
	 * get a copy of the kva from the physical buffer
	 */
	bp->b_caller1 = p;
	bp->b_dev = vp->v_rdev;
	error = bp->b_error = 0;

	bp->b_bcount = cb->aio_nbytes;
	bp->b_bufsize = cb->aio_nbytes;
	bp->b_flags = B_PHYS | B_CALL;
	bp->b_iodone = aio_physwakeup;
	bp->b_saveaddr = bp->b_data;
	bp->b_data = (void *) cb->aio_buf;
	bp->b_blkno = btodb(cb->aio_offset);

	if (cb->aio_lio_opcode == LIO_WRITE) {
		bp->b_flags |= B_WRITE;
		if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_READ)) {
			error = EFAULT;
			goto doerror;
		}
	} else {
		bp->b_flags |= B_READ;
		if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_WRITE)) {
			error = EFAULT;
			goto doerror;
		}
	}

	/* bring buffer into kernel space */
	vmapbuf(bp);

	s = splbio();
	aiocbe->bp = bp;
	bp->b_spc = (void *)aiocbe;
	TAILQ_INSERT_TAIL(&aio_bufjobs, aiocbe, list);
	TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
	aiocbe->jobstate = JOBST_JOBQBUF;
	cb->_aiocb_private.status = cb->aio_nbytes;
	num_buf_aio++;
	bp->b_error = 0;

	splx(s);
	/* perform transfer */
	BUF_STRATEGY(bp, 0);
	s = splbio();
	/*
	 * If we had an error invoking the request, or an error in processing
	 * the request before we have returned, we process it as an error in
	 * transfer.  Note that such an I/O error is not indicated immediately,
	 * but is returned using the aio_error mechanism.  In this case,
	 * aio_suspend will return immediately.
	 */
	if (bp->b_error || (bp->b_flags & B_ERROR)) {
		struct aiocb *job = aiocbe->uuaiocb;

		aiocbe->uaiocb._aiocb_private.status = 0;
		suword(&job->_aiocb_private.status, 0);
		aiocbe->uaiocb._aiocb_private.error = bp->b_error;
		suword(&job->_aiocb_private.error, bp->b_error);

		ki->kaio_buffer_finished_count++;

		if (aiocbe->jobstate != JOBST_JOBBFINISHED) {
			aiocbe->jobstate = JOBST_JOBBFINISHED;
			aiocbe->jobflags |= AIOCBLIST_DONE;
			TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
			TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
			TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
		}
	}
	splx(s);
	return 0;

doerror:
	ki->kaio_buffer_count--;
	if (lj) {
		lj->lioj_buffer_count--;
	}
	aiocbe->bp = NULL;
	relpbuf(bp, NULL);
	return error;
}

/*
 * This waits/tests physio completion.
 */
int
aio_fphysio(p, iocb, flgwait)
	struct proc *p;
	struct aiocblist *iocb;
	int flgwait;
{
	int s;
	struct buf *bp;
	int error;

	bp = iocb->bp;

	s = splbio();
	if (flgwait == 0) {
		if ((bp->b_flags & B_DONE) == 0) {
			splx(s);
			return EINPROGRESS;
		}
	}

	while ((bp->b_flags & B_DONE) == 0) {
		if (tsleep((caddr_t)bp, PRIBIO, "physstr", aiod_timeout)) {
			if ((bp->b_flags & B_DONE) == 0) {
				splx(s);
				return EINPROGRESS;
			} else {
				break;
			}
		}
	}
	/* drop splbio before tearing down the buffer, as the other exits do */
	splx(s);

	/* release mapping into kernel space */
	vunmapbuf(bp);
	iocb->bp = 0;

	error = 0;
	/*
	 * check for an error
	 */
	if (bp->b_flags & B_ERROR) {
		error = bp->b_error;
	}

	relpbuf(bp, NULL);
	return (error);
}
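
/*
 * A note on the userland handshake used below: _aio_aqueue() stamps the
 * caller's aiocb with suword() (kernelinfo carries the job reference id,
 * and error is set to EINPROGRESS while the request is pending), the
 * completion paths record the final error and transfer count in the
 * kernel-side copy of the aiocb, and aio_error()/aio_return() later match
 * a request to its job by fetching kernelinfo back with fuword().
 */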
/*
 * Queue a new AIO request.  Choosing either the threaded or direct physio
 * VCHR technique is done in this code.
 */
static int
_aio_aqueue(struct proc *p, struct aiocb *job, struct aio_liojob *lj, int type)
{
	struct filedesc *fdp;
	struct file *fp;
	unsigned int fd;

	int error;
	int opcode;
	struct aiocblist *aiocbe;
	struct aioproclist *aiop;
	struct kaioinfo *ki;

	if ((aiocbe = TAILQ_FIRST(&aio_freejobs)) != NULL) {
		TAILQ_REMOVE(&aio_freejobs, aiocbe, list);
	} else {
		aiocbe = zalloc(aiocb_zone);
	}

	aiocbe->inputcharge = 0;
	aiocbe->outputcharge = 0;

	suword(&job->_aiocb_private.status, -1);
	suword(&job->_aiocb_private.error, 0);
	suword(&job->_aiocb_private.kernelinfo, -1);

	error = copyin((caddr_t)job,
	    (caddr_t) &aiocbe->uaiocb, sizeof aiocbe->uaiocb);
	if (error) {
		suword(&job->_aiocb_private.error, error);

		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		return error;
	}

	/*
	 * Save userspace address of the job info
	 */
	aiocbe->uuaiocb = job;

	/*
	 * Get the opcode
	 */
	if (type != LIO_NOP) {
		aiocbe->uaiocb.aio_lio_opcode = type;
	}
	opcode = aiocbe->uaiocb.aio_lio_opcode;

	/*
	 * Get the fd info for process
	 */
	fdp = p->p_fd;

	/*
	 * Range check file descriptor
	 */
	fd = aiocbe->uaiocb.aio_fildes;
	if (fd >= fdp->fd_nfiles) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0) {
			suword(&job->_aiocb_private.error, EBADF);
		}
		return EBADF;
	}

	fp = fdp->fd_ofiles[fd];
	if ((fp == NULL) ||
	    ((opcode == LIO_WRITE) && ((fp->f_flag & FWRITE) == 0))) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0) {
			suword(&job->_aiocb_private.error, EBADF);
		}
		return EBADF;
	}

	if (aiocbe->uaiocb.aio_offset == -1LL) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0) {
			suword(&job->_aiocb_private.error, EINVAL);
		}
		return EINVAL;
	}

	error = suword(&job->_aiocb_private.kernelinfo, jobrefid);
	if (error) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0) {
			suword(&job->_aiocb_private.error, EINVAL);
		}
		return error;
	}

	aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jobrefid;
	if (jobrefid == LONG_MAX)
		jobrefid = 1;
	else
		jobrefid++;

	if (opcode == LIO_NOP) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0) {
			suword(&job->_aiocb_private.error, 0);
			suword(&job->_aiocb_private.status, 0);
			suword(&job->_aiocb_private.kernelinfo, 0);
		}
		return 0;
	}

	if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0) {
			suword(&job->_aiocb_private.status, 0);
			suword(&job->_aiocb_private.error, EINVAL);
		}
		return EINVAL;
	}

	suword(&job->_aiocb_private.error, EINPROGRESS);
	aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
	aiocbe->userproc = p;
	aiocbe->jobflags = 0;
	aiocbe->lio = lj;
	ki = p->p_aioinfo;

	if ((error = aio_qphysio(p, aiocbe)) == 0) {
		return 0;
	} else if (error > 0) {
		suword(&job->_aiocb_private.status, 0);
		aiocbe->uaiocb._aiocb_private.error = error;
		suword(&job->_aiocb_private.error, error);
		return error;
	}

	/*
	 * No buffer for daemon I/O
	 */
	aiocbe->bp = NULL;
	ki->kaio_queue_count++;
	if (lj) {
		lj->lioj_queue_count++;
	}
	TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
	TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
	aiocbe->jobstate = JOBST_JOBQGLOBAL;

	num_queue_count++;
	error = 0;

	/*
	 * If we don't have a free AIO process, and we are below our
	 * quota, then start one.  Otherwise, depend on the subsequent
	 * I/O completions to pick up this job.  If we don't successfully
	 * create the new process (thread) due to resource issues, we
	 * return an error for now (EAGAIN), which is likely not the
	 * correct thing to do.
	 */
retryproc:
	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
		TAILQ_REMOVE(&aio_freeproc, aiop, list);
		TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
		aiop->aioprocflags &= ~AIOP_FREE;
		wakeup(aiop->aioproc);
	} else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
	    ((ki->kaio_active_count + num_aio_resv_start) <
	    ki->kaio_maxactive_count)) {
		num_aio_resv_start++;
		if ((error = aio_newproc()) == 0) {
			num_aio_resv_start--;
			p->p_retval[0] = 0;
			goto retryproc;
		}
		num_aio_resv_start--;
	}
	return error;
}

/*
 * This routine queues an AIO request, checking for quotas.
 */
static int
aio_aqueue(struct proc *p, struct aiocb *job, int type)
{
	struct kaioinfo *ki;

	if (p->p_aioinfo == NULL) {
		aio_init_aioinfo(p);
	}

	if (num_queue_count >= max_queue_count)
		return EAGAIN;

	ki = p->p_aioinfo;
	if (ki->kaio_queue_count >= ki->kaio_qallowed_count)
		return EAGAIN;

	return _aio_aqueue(p, job, NULL, type);
}

/*
 * Support the aio_return system call; as a side effect, kernel
 * resources are released.
 */
int
aio_return(struct proc *p, struct aio_return_args *uap)
{
	int s;
	int jobref;
	struct aiocblist *cb, *ncb;
	struct aiocb *ujob;
	struct kaioinfo *ki;

	ki = p->p_aioinfo;
	if (ki == NULL) {
		return EINVAL;
	}

	ujob = uap->aiocbp;

	jobref = fuword(&ujob->_aiocb_private.kernelinfo);
	if (jobref == -1 || jobref == 0)
		return EINVAL;

	for (cb = TAILQ_FIRST(&ki->kaio_jobdone);
	     cb;
	     cb = TAILQ_NEXT(cb, plist)) {
		if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
			if (ujob == cb->uuaiocb) {
				p->p_retval[0] = cb->uaiocb._aiocb_private.status;
			} else {
				p->p_retval[0] = EFAULT;
			}
			if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
				curproc->p_stats->p_ru.ru_oublock += cb->outputcharge;
				cb->outputcharge = 0;
			} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
				curproc->p_stats->p_ru.ru_inblock += cb->inputcharge;
				cb->inputcharge = 0;
			}
			aio_free_entry(cb);
			return 0;
		}
	}

	s = splbio();
	for (cb = TAILQ_FIRST(&ki->kaio_bufdone);
	     cb;
	     cb = ncb) {
		ncb = TAILQ_NEXT(cb, plist);
		if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
			splx(s);
			if (ujob == cb->uuaiocb) {
				p->p_retval[0] = cb->uaiocb._aiocb_private.status;
			} else {
				p->p_retval[0] = EFAULT;
			}
			aio_free_entry(cb);
			return 0;
		}
	}
	splx(s);

	return (EINVAL);
}
/*
 * Allow a process to wakeup when any of the I/O requests are
 * completed.
 */
int
aio_suspend(struct proc *p, struct aio_suspend_args *uap)
{
	struct timeval atv;
	struct timespec ts;
	struct aiocb *const *cbptr, *cbp;
	struct kaioinfo *ki;
	struct aiocblist *cb;
	int i;
	int njoblist;
	int error, s, timo;
	int *ijoblist;
	struct aiocb **ujoblist;

	if (uap->nent >= AIO_LISTIO_MAX)
		return EINVAL;

	timo = 0;
	if (uap->timeout) {
		/*
		 * Get timespec struct
		 */
		if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0) {
			return error;
		}

		if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000)
			return (EINVAL);

		TIMESPEC_TO_TIMEVAL(&atv, &ts);
		if (itimerfix(&atv))
			return (EINVAL);
		timo = tvtohz(&atv);
	}

	ki = p->p_aioinfo;
	if (ki == NULL)
		return EAGAIN;

	njoblist = 0;
	ijoblist = zalloc(aiol_zone);
	ujoblist = zalloc(aiol_zone);
	cbptr = uap->aiocbp;

	for (i = 0; i < uap->nent; i++) {
		cbp = (struct aiocb *) (intptr_t) fuword((caddr_t) &cbptr[i]);
		if (cbp == 0)
			continue;
		ujoblist[njoblist] = cbp;
		ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo);
		njoblist++;
	}
	if (njoblist == 0) {
		zfree(aiol_zone, ijoblist);
		zfree(aiol_zone, ujoblist);
		return 0;
	}

	error = 0;
	while (1) {
		for (cb = TAILQ_FIRST(&ki->kaio_jobdone);
		     cb; cb = TAILQ_NEXT(cb, plist)) {
			for (i = 0; i < njoblist; i++) {
				if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
				    ijoblist[i]) {
					if (ujoblist[i] != cb->uuaiocb)
						error = EINVAL;
					zfree(aiol_zone, ijoblist);
					zfree(aiol_zone, ujoblist);
					return error;
				}
			}
		}

		s = splbio();
		for (cb = TAILQ_FIRST(&ki->kaio_bufdone);
		     cb; cb = TAILQ_NEXT(cb, plist)) {
			for (i = 0; i < njoblist; i++) {
				if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
				    ijoblist[i]) {
					splx(s);
					if (ujoblist[i] != cb->uuaiocb)
						error = EINVAL;
					zfree(aiol_zone, ijoblist);
					zfree(aiol_zone, ujoblist);
					return error;
				}
			}
		}

		ki->kaio_flags |= KAIO_WAKEUP;
		error = tsleep(p, PRIBIO|PCATCH, "aiospn", timo);
		splx(s);

		if (error == EINTR) {
			zfree(aiol_zone, ijoblist);
			zfree(aiol_zone, ujoblist);
			return EINTR;
		} else if (error == EWOULDBLOCK) {
			zfree(aiol_zone, ijoblist);
			zfree(aiol_zone, ujoblist);
			return EAGAIN;
		}
	}

	/* NOTREACHED */
	return EINVAL;
}

/*
 * aio_cancel at the kernel level is a NOOP right now.  It
 * might be possible to support it partially in user mode, or
 * in kernel mode later on.
 */
int
aio_cancel(struct proc *p, struct aio_cancel_args *uap)
{
	return ENOSYS;
}
/*
 * aio_error is implemented in the kernel level for compatibility
 * purposes only.  For a user mode async implementation, it would be
 * best to do it in a userland subroutine.
 */
int
aio_error(struct proc *p, struct aio_error_args *uap)
{
	int s;
	struct aiocblist *cb;
	struct kaioinfo *ki;
	int jobref;

	ki = p->p_aioinfo;
	if (ki == NULL)
		return EINVAL;

	jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo);
	if ((jobref == -1) || (jobref == 0))
		return EINVAL;

	for (cb = TAILQ_FIRST(&ki->kaio_jobdone);
	     cb;
	     cb = TAILQ_NEXT(cb, plist)) {
		if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
			p->p_retval[0] = cb->uaiocb._aiocb_private.error;
			return 0;
		}
	}

	for (cb = TAILQ_FIRST(&ki->kaio_jobqueue);
	     cb;
	     cb = TAILQ_NEXT(cb, plist)) {
		if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
			p->p_retval[0] = EINPROGRESS;
			return 0;
		}
	}

	s = splbio();
	for (cb = TAILQ_FIRST(&ki->kaio_bufdone);
	     cb;
	     cb = TAILQ_NEXT(cb, plist)) {
		if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
			p->p_retval[0] = cb->uaiocb._aiocb_private.error;
			splx(s);
			return 0;
		}
	}

	for (cb = TAILQ_FIRST(&ki->kaio_bufqueue);
	     cb;
	     cb = TAILQ_NEXT(cb, plist)) {
		if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
			p->p_retval[0] = EINPROGRESS;
			splx(s);
			return 0;
		}
	}
	splx(s);

	/*
	 * Hack for lio
	 */
	/*
	status = fuword(&uap->aiocbp->_aiocb_private.status);
	if (status == -1) {
		return fuword(&uap->aiocbp->_aiocb_private.error);
	}
	*/
	return EINVAL;
}

int
aio_read(struct proc *p, struct aio_read_args *uap)
{
	struct filedesc *fdp;
	struct file *fp;
	struct uio auio;
	struct iovec aiov;
	unsigned int fd;
	int cnt;
	struct aiocb iocb;
	int error, pmodes;

	pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes);
	if ((pmodes & AIO_PMODE_SYNC) == 0) {
		return aio_aqueue(p, (struct aiocb *) uap->aiocbp, LIO_READ);
	}

	/*
	 * Get control block
	 */
	if ((error = copyin((caddr_t) uap->aiocbp, (caddr_t) &iocb, sizeof iocb)) != 0)
		return error;

	/*
	 * Get the fd info for process
	 */
	fdp = p->p_fd;

	/*
	 * Range check file descriptor
	 */
	fd = iocb.aio_fildes;
	if (fd >= fdp->fd_nfiles)
		return EBADF;
	fp = fdp->fd_ofiles[fd];
	if ((fp == NULL) || ((fp->f_flag & FREAD) == 0))
		return EBADF;
	if (iocb.aio_offset == -1LL)
		return EINVAL;

	auio.uio_resid = iocb.aio_nbytes;
	if (auio.uio_resid < 0)
		return (EINVAL);
	/*
	 * Process sync simply -- queue async request.
	 */
	if ((iocb._aiocb_private.privatemodes & AIO_PMODE_SYNC) == 0) {
		return aio_aqueue(p, (struct aiocb *) uap->aiocbp, LIO_READ);
	}

	aiov.iov_base = (void *) iocb.aio_buf;
	aiov.iov_len = iocb.aio_nbytes;

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = iocb.aio_offset;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;

	cnt = iocb.aio_nbytes;
	error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, p);
	if (error &&
	    (auio.uio_resid != cnt) &&
	    (error == ERESTART || error == EINTR || error == EWOULDBLOCK))
		error = 0;
	cnt -= auio.uio_resid;
	p->p_retval[0] = cnt;
	return error;
}

int
aio_write(struct proc *p, struct aio_write_args *uap)
{
	struct filedesc *fdp;
	struct file *fp;
	struct uio auio;
	struct iovec aiov;
	unsigned int fd;
	int cnt;
	struct aiocb iocb;
	int error;
	int pmodes;

	/*
	 * Process sync simply -- queue async request.
	 */
	pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes);
	if ((pmodes & AIO_PMODE_SYNC) == 0) {
		return aio_aqueue(p, (struct aiocb *) uap->aiocbp, LIO_WRITE);
	}

	if ((error = copyin((caddr_t) uap->aiocbp, (caddr_t) &iocb, sizeof iocb)) != 0)
		return error;

	/*
	 * Get the fd info for process
	 */
	fdp = p->p_fd;

	/*
	 * Range check file descriptor
	 */
	fd = iocb.aio_fildes;
	if (fd >= fdp->fd_nfiles)
		return EBADF;
	fp = fdp->fd_ofiles[fd];
	if ((fp == NULL) || ((fp->f_flag & FWRITE) == 0))
		return EBADF;
	if (iocb.aio_offset == -1LL)
		return EINVAL;

	aiov.iov_base = (void *) iocb.aio_buf;
	aiov.iov_len = iocb.aio_nbytes;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = iocb.aio_offset;

	auio.uio_resid = iocb.aio_nbytes;
	if (auio.uio_resid < 0)
		return (EINVAL);

	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;

	cnt = iocb.aio_nbytes;
	error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, p);
	if (error) {
		if (auio.uio_resid != cnt) {
			if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
				error = 0;
			if (error == EPIPE)
				psignal(p, SIGPIPE);
		}
	}
	cnt -= auio.uio_resid;
	p->p_retval[0] = cnt;
	return error;
}

int
lio_listio(struct proc *p, struct lio_listio_args *uap)
{
	int nent, nentqueued;
	struct aiocb *iocb, * const *cbptr;
	struct aiocblist *cb;
	struct kaioinfo *ki;
	struct aio_liojob *lj;
	int error, runningcode;
	int nerror;
	int i;
	int s;

	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) {
		return EINVAL;
	}

	nent = uap->nent;
	if (nent > AIO_LISTIO_MAX) {
		return EINVAL;
	}

	if (p->p_aioinfo == NULL) {
		aio_init_aioinfo(p);
	}

	if ((nent + num_queue_count) > max_queue_count) {
		return EAGAIN;
	}

	ki = p->p_aioinfo;
	if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count) {
		return EAGAIN;
	}

	lj = zalloc(aiolio_zone);
	if (!lj) {
		return EAGAIN;
	}

	lj->lioj_flags = 0;
	lj->lioj_buffer_count = 0;
	lj->lioj_buffer_finished_count = 0;
	lj->lioj_queue_count = 0;
	lj->lioj_queue_finished_count = 0;
	lj->lioj_ki = ki;
	TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);

	/*
	 * Setup signal
	 */
	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
		error = copyin(uap->sig, &lj->lioj_signal, sizeof lj->lioj_signal);
		if (error)
			return error;
		lj->lioj_flags |= LIOJ_SIGNAL;
		lj->lioj_flags &= ~LIOJ_SIGNAL_POSTED;
	} else {
		lj->lioj_flags &= ~LIOJ_SIGNAL;
	}

	/*
	 * get pointers to the list of I/O requests
	 */
	nerror = 0;
	nentqueued = 0;
	cbptr = uap->acb_list;
	for (i = 0; i < uap->nent; i++) {
		iocb = (struct aiocb *) (intptr_t) fuword((caddr_t) &cbptr[i]);
		if (((intptr_t) iocb != -1) && ((intptr_t) iocb != NULL)) {
			error = _aio_aqueue(p, iocb, lj, 0);
			if (error == 0) {
				nentqueued++;
			} else {
				nerror++;
			}
		}
	}

	/*
	 * If we haven't queued any, then just return error
	 */
	if (nentqueued == 0) {
		return 0;
	}

	/*
	 * Calculate the appropriate error return
	 */
	runningcode = 0;
	if (nerror)
		runningcode = EIO;

	if (uap->mode == LIO_WAIT) {
		while (1) {
			int found;
			found = 0;
			for (i = 0; i < uap->nent; i++) {
				int jobref, command;

				/*
				 * Fetch address of the control buf pointer in
				 * user space
				 */
				iocb = (struct aiocb *) (intptr_t) fuword((caddr_t) &cbptr[i]);
				if (((intptr_t) iocb == -1) || ((intptr_t) iocb == 0))
					continue;

				/*
				 * Fetch the associated command from user space
				 */
				command = fuword(&iocb->aio_lio_opcode);
				if (command == LIO_NOP) {
					found++;
					continue;
				}

				jobref = fuword(&iocb->_aiocb_private.kernelinfo);

				for (cb = TAILQ_FIRST(&ki->kaio_jobdone);
				     cb;
				     cb = TAILQ_NEXT(cb, plist)) {
					if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
					    jobref) {
						if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
							curproc->p_stats->p_ru.ru_oublock +=
							    cb->outputcharge;
							cb->outputcharge = 0;
						} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
							curproc->p_stats->p_ru.ru_inblock +=
							    cb->inputcharge;
							cb->inputcharge = 0;
						}
						found++;
						break;
					}
				}

				s = splbio();
				for (cb = TAILQ_FIRST(&ki->kaio_bufdone);
				     cb;
				     cb = TAILQ_NEXT(cb, plist)) {
					if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
					    jobref) {
						found++;
						break;
					}
				}
				splx(s);
			}

			/*
			 * If all I/Os have been disposed of, then we can return
			 */
			if (found == nentqueued) {
				return runningcode;
			}

			ki->kaio_flags |= KAIO_WAKEUP;
			error = tsleep(p, PRIBIO|PCATCH, "aiospn", 0);

			if (error == EINTR) {
				return EINTR;
			} else if (error == EWOULDBLOCK) {
				return EAGAIN;
			}
		}
	}

	return runningcode;
}
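
/*
 * For reference, the facility above is driven from userland through the
 * POSIX <aio.h> interface.  An illustrative sketch of a consumer (not part
 * of the kernel, error handling omitted):
 *
 *	struct aiocb cb;
 *	ssize_t nbytes;
 *
 *	bzero(&cb, sizeof(cb));
 *	cb.aio_fildes = fd;		// file or raw device descriptor
 *	cb.aio_buf = buf;
 *	cb.aio_nbytes = sizeof(buf);
 *	cb.aio_offset = 0;
 *
 *	aio_read(&cb);			// queued through aio_aqueue() above
 *	while (aio_error(&cb) == EINPROGRESS)
 *		;			// or aio_suspend()/a sigevent instead of polling
 *	nbytes = aio_return(&cb);	// also releases the kernel job entry
 */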
/*
 * This is a weird hack so that we can post a signal.  It is safe
 * to do so from a timeout routine, but *not* from an interrupt routine.
 */
static void
process_signal(void *ljarg)
{
	struct aio_liojob *lj = ljarg;
	if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) {
		if (lj->lioj_queue_count == lj->lioj_queue_finished_count) {
			psignal(lj->lioj_ki->kaio_p, lj->lioj_signal.sigev_signo);
			lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
		}
	}
}

/*
 * Interrupt handler for physio, performs the necessary process wakeups,
 * and signals.
 */
static void
aio_physwakeup(bp)
	struct buf *bp;
{
	struct aiocblist *aiocbe;
	struct proc *p;
	struct kaioinfo *ki;
	struct aio_liojob *lj;
	int s;

	s = splbio();

	wakeup((caddr_t) bp);
	bp->b_flags &= ~B_CALL;
	bp->b_flags |= B_DONE;

	aiocbe = (struct aiocblist *)bp->b_spc;
	if (aiocbe) {
		p = bp->b_caller1;

		aiocbe->jobstate = JOBST_JOBBFINISHED;
		aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
		aiocbe->uaiocb._aiocb_private.error = 0;
		aiocbe->jobflags |= AIOCBLIST_DONE;

		if (bp->b_flags & B_ERROR) {
			aiocbe->uaiocb._aiocb_private.error = bp->b_error;
		}

		lj = aiocbe->lio;
		if (lj) {
			lj->lioj_buffer_finished_count++;
			/*
			 * wakeup/signal if all of the interrupt jobs are done
			 */
			if (lj->lioj_buffer_finished_count == lj->lioj_buffer_count) {
				/*
				 * post a signal if it is called for
				 */
				if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) ==
				    LIOJ_SIGNAL) {
					lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
					timeout(process_signal, lj, 0);
				}
			}
		}

		ki = p->p_aioinfo;
		if (ki) {
			ki->kaio_buffer_finished_count++;
			TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
			TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
			TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
			/*
			 * and do the wakeup
			 */
			if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) {
				ki->kaio_flags &= ~KAIO_WAKEUP;
				wakeup(p);
			}
		}
	}
	splx(s);
}