/*
 * Copyright (c) 1997 John S. Dyson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. John S. Dyson's name may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * DISCLAIMER:  This code isn't warranted to do anything useful.  Anything
 * bad that happens because of using this software isn't the responsibility
 * of the author.  This software is distributed AS-IS.
 *
 * $Id: vfs_aio.c,v 1.35 1998/11/27 01:14:21 tegge Exp $
 */

/*
 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/lock.h>
#include <sys/unistd.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <miscfs/specfs/specdev.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_zone.h>
#include <sys/aio.h>
#include <sys/shm.h>

#include <machine/cpu.h>
#include <machine/limits.h>

static long jobrefid;

#define JOBST_NULL		0x0
#define JOBST_JOBQPROC		0x1
#define JOBST_JOBQGLOBAL	0x2
#define JOBST_JOBRUNNING	0x3
#define JOBST_JOBFINISHED	0x4
#define JOBST_JOBQBUF		0x5
#define JOBST_JOBBFINISHED	0x6

#ifndef MAX_AIO_PER_PROC
#define MAX_AIO_PER_PROC	32
#endif

#ifndef MAX_AIO_QUEUE_PER_PROC
#define MAX_AIO_QUEUE_PER_PROC	256	/* Bigger than AIO_LISTIO_MAX */
#endif

#ifndef MAX_AIO_PROCS
#define MAX_AIO_PROCS		32
#endif

#ifndef MAX_AIO_QUEUE
#define MAX_AIO_QUEUE		1024	/* Bigger than AIO_LISTIO_MAX */
#endif

#ifndef TARGET_AIO_PROCS
#define TARGET_AIO_PROCS	0
#endif

#ifndef MAX_BUF_AIO
#define MAX_BUF_AIO		16
#endif

#ifndef AIOD_TIMEOUT_DEFAULT
#define AIOD_TIMEOUT_DEFAULT	(10 * hz)
#endif

#ifndef AIOD_LIFETIME_DEFAULT
#define AIOD_LIFETIME_DEFAULT	(30 * hz)
#endif

static int max_aio_procs = MAX_AIO_PROCS;
static int num_aio_procs = 0;
static int target_aio_procs = TARGET_AIO_PROCS;
static int max_queue_count = MAX_AIO_QUEUE;
static int num_queue_count = 0;
static int num_buf_aio = 0;
static int num_aio_resv_start = 0;
static int aiod_timeout;
static int aiod_lifetime;

static int max_aio_per_proc = MAX_AIO_PER_PROC,
	max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;

static int max_buf_aio = MAX_BUF_AIO;

SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "AIO mgmt");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc,
	CTLFLAG_RW, &max_aio_per_proc, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc,
	CTLFLAG_RW, &max_aio_queue_per_proc, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs,
	CTLFLAG_RW, &max_aio_procs, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs,
	CTLFLAG_RD, &num_aio_procs, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count,
	CTLFLAG_RD, &num_queue_count, 0, "");
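
/*
 * All of the vfs.aio.* variables declared here can be inspected at runtime
 * with sysctl(8), and the CTLFLAG_RW entries can be tuned the same way,
 * e.g. (illustrative only):
 *
 *	sysctl -w vfs.aio.max_aio_per_proc=64
 *	sysctl vfs.aio.num_aio_procs
 *
 * The CTLFLAG_RD entries are read-only counters exported for observation.
 */
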
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue,
	CTLFLAG_RW, &max_queue_count, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs,
	CTLFLAG_RW, &target_aio_procs, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio,
	CTLFLAG_RW, &max_buf_aio, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio,
	CTLFLAG_RD, &num_buf_aio, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime,
	CTLFLAG_RW, &aiod_lifetime, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout,
	CTLFLAG_RW, &aiod_timeout, 0, "");

/*
 * Job queue item
 */
#define AIOCBLIST_CANCELLED	0x1
#define AIOCBLIST_RUNDOWN	0x4
#define AIOCBLIST_ASYNCFREE	0x8
#define AIOCBLIST_DONE		0x10

struct aiocblist {
	TAILQ_ENTRY (aiocblist) list;		/* List of jobs */
	TAILQ_ENTRY (aiocblist) plist;		/* List of jobs for proc */
	int	jobflags;
	int	jobstate;
	int	inputcharge, outputcharge;
	struct	buf *bp;			/* buffer pointer */
	struct	proc *userproc;			/* User process */
	struct	aioproclist *jobaioproc;	/* AIO process descriptor */
	struct	aio_liojob *lio;		/* optional lio job */
	struct	aiocb *uuaiocb;			/* pointer in userspace of aiocb */
	struct	aiocb uaiocb;			/* Kernel I/O control block */
};

/*
 * AIO process info
 */
#define AIOP_FREE	0x1		/* proc on free queue */
#define AIOP_SCHED	0x2		/* proc explicitly scheduled */

struct aioproclist {
	int	aioprocflags;			/* AIO proc flags */
	TAILQ_ENTRY(aioproclist) list;		/* List of processes */
	struct	proc *aioproc;			/* The AIO thread */
	TAILQ_HEAD (,aiocblist) jobtorun;	/* suggested job to run */
};

/*
 * data-structure for lio signal management
 */
struct aio_liojob {
	int	lioj_flags;
	int	lioj_buffer_count;
	int	lioj_buffer_finished_count;
	int	lioj_queue_count;
	int	lioj_queue_finished_count;
	struct	sigevent lioj_signal;		/* signal on all I/O done */
	TAILQ_ENTRY (aio_liojob) lioj_list;
	struct	kaioinfo *lioj_ki;
};
#define LIOJ_SIGNAL		0x1	/* signal on all done (lio) */
#define LIOJ_SIGNAL_POSTED	0x2	/* signal has been posted */

/*
 * per process aio data structure
 */
struct kaioinfo {
	int	kaio_flags;			/* per process kaio flags */
	int	kaio_maxactive_count;		/* maximum number of AIOs */
	int	kaio_active_count;		/* number of currently used AIOs */
	int	kaio_qallowed_count;		/* maximum size of AIO queue */
	int	kaio_queue_count;		/* size of AIO queue */
	int	kaio_ballowed_count;		/* maximum number of buffers */
	int	kaio_queue_finished_count;	/* number of daemon jobs finished */
	int	kaio_buffer_count;		/* number of physio buffers */
	int	kaio_buffer_finished_count;	/* count of I/O done */
	struct	proc *kaio_p;			/* process that uses this kaio block */
	TAILQ_HEAD (,aio_liojob) kaio_liojoblist; /* list of lio jobs */
	TAILQ_HEAD (,aiocblist) kaio_jobqueue;	/* job queue for process */
	TAILQ_HEAD (,aiocblist) kaio_jobdone;	/* done queue for process */
	TAILQ_HEAD (,aiocblist) kaio_bufqueue;	/* buffer job queue for process */
	TAILQ_HEAD (,aiocblist) kaio_bufdone;	/* buffer done queue for process */
};

#define KAIO_RUNDOWN	0x1	/* process is being run down */
#define KAIO_WAKEUP	0x2	/* wakeup process when there is a significant event */

static TAILQ_HEAD (,aioproclist) aio_freeproc, aio_activeproc;
static TAILQ_HEAD(,aiocblist) aio_jobs;			/* Async job list */
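
/*
 * Queue overview (derived from the code below): a daemon-serviced request
 * normally moves from aio_jobs (or a daemon's private jobtorun list) to the
 * owning process's kaio_jobdone list, while a physio-backed request moves
 * from kaio_bufqueue to kaio_bufdone when the transfer completes.  The
 * jobstate field (JOBST_*) records which queue an aiocblist currently
 * sits on.
 */
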
static TAILQ_HEAD(,aiocblist) aio_bufjobs;		/* Phys I/O job list */
static TAILQ_HEAD(,aiocblist) aio_freejobs;		/* Pool of free jobs */

static void aio_init_aioinfo(struct proc *p);
static void aio_onceonly(void *);
static int aio_free_entry(struct aiocblist *aiocbe);
static void aio_process(struct aiocblist *aiocbe);
static int aio_newproc(void);
static int aio_aqueue(struct proc *p, struct aiocb *job, int type);
static void aio_physwakeup(struct buf *bp);
static int aio_fphysio(struct proc *p, struct aiocblist *aiocbe, int type);
static int aio_qphysio(struct proc *p, struct aiocblist *iocb);
static void aio_daemon(void *uproc);

SYSINIT(aio, SI_SUB_VFS, SI_ORDER_ANY, aio_onceonly, NULL);

static vm_zone_t kaio_zone = 0, aiop_zone = 0,
	aiocb_zone = 0, aiol_zone = 0, aiolio_zone = 0;

/*
 * Single AIOD vmspace shared amongst all of them
 */
struct vmspace *aiovmspace = NULL;

/*
 * Startup initialization
 */
void
aio_onceonly(void *na)
{
	TAILQ_INIT(&aio_freeproc);
	TAILQ_INIT(&aio_activeproc);
	TAILQ_INIT(&aio_jobs);
	TAILQ_INIT(&aio_bufjobs);
	TAILQ_INIT(&aio_freejobs);
	kaio_zone = zinit("AIO", sizeof (struct kaioinfo), 0, 0, 1);
	aiop_zone = zinit("AIOP", sizeof (struct aioproclist), 0, 0, 1);
	aiocb_zone = zinit("AIOCB", sizeof (struct aiocblist), 0, 0, 1);
	aiol_zone = zinit("AIOL", AIO_LISTIO_MAX * sizeof (int), 0, 0, 1);
	aiolio_zone = zinit("AIOLIO",
	    AIO_LISTIO_MAX * sizeof (struct aio_liojob), 0, 0, 1);
	aiod_timeout = AIOD_TIMEOUT_DEFAULT;
	aiod_lifetime = AIOD_LIFETIME_DEFAULT;
	jobrefid = 1;
}

/*
 * Init the per-process aioinfo structure.
 * The aioinfo limits are set per-process for user limit (resource) management.
 */
void
aio_init_aioinfo(struct proc *p)
{
	struct kaioinfo *ki;

	if (p->p_aioinfo == NULL) {
		ki = zalloc(kaio_zone);
		p->p_aioinfo = ki;
		ki->kaio_flags = 0;
		ki->kaio_maxactive_count = max_aio_per_proc;
		ki->kaio_active_count = 0;
		ki->kaio_qallowed_count = max_aio_queue_per_proc;
		ki->kaio_queue_count = 0;
		ki->kaio_ballowed_count = max_buf_aio;
		ki->kaio_buffer_count = 0;
		ki->kaio_buffer_finished_count = 0;
		ki->kaio_p = p;
		TAILQ_INIT(&ki->kaio_jobdone);
		TAILQ_INIT(&ki->kaio_jobqueue);
		TAILQ_INIT(&ki->kaio_bufdone);
		TAILQ_INIT(&ki->kaio_bufqueue);
		TAILQ_INIT(&ki->kaio_liojoblist);
	}
}

/*
 * Free a job entry.  Wait for completion if it is currently
 * active, but don't delay forever.  If we delay, we return
 * a flag that says that we have to restart the queue scan.
 */
int
aio_free_entry(struct aiocblist *aiocbe)
{
	struct kaioinfo *ki;
	struct aioproclist *aiop;
	struct aio_liojob *lj;
	struct proc *p;
	int error;
	int s;

	if (aiocbe->jobstate == JOBST_NULL)
		panic("aio_free_entry: freeing already free job");

	p = aiocbe->userproc;
	ki = p->p_aioinfo;
	lj = aiocbe->lio;
	if (ki == NULL)
		panic("aio_free_entry: missing p->p_aioinfo");

	if (aiocbe->jobstate == JOBST_JOBRUNNING) {
		if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE)
			return 0;
		aiocbe->jobflags |= AIOCBLIST_RUNDOWN;
		tsleep(aiocbe, PRIBIO|PCATCH, "jobwai", 0);
	}
	aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;

	if (aiocbe->bp == NULL) {
		if (ki->kaio_queue_count <= 0)
			panic("aio_free_entry: process queue size <= 0");
		if (num_queue_count <= 0)
			panic("aio_free_entry: system wide queue size <= 0");

		if (lj) {
			lj->lioj_queue_count--;
			if (aiocbe->jobflags & AIOCBLIST_DONE)
				lj->lioj_queue_finished_count--;
		}
		ki->kaio_queue_count--;
		if (aiocbe->jobflags & AIOCBLIST_DONE)
			ki->kaio_queue_finished_count--;
		num_queue_count--;
	} else {
		if (lj) {
			lj->lioj_buffer_count--;
			if (aiocbe->jobflags & AIOCBLIST_DONE)
				lj->lioj_buffer_finished_count--;
		}
		if (aiocbe->jobflags & AIOCBLIST_DONE)
			ki->kaio_buffer_finished_count--;
		ki->kaio_buffer_count--;
		num_buf_aio--;
	}

	if ((ki->kaio_flags & KAIO_WAKEUP) ||
	    ((ki->kaio_flags & KAIO_RUNDOWN) &&
	    ((ki->kaio_buffer_count == 0) && (ki->kaio_queue_count == 0)))) {
		ki->kaio_flags &= ~KAIO_WAKEUP;
		wakeup(p);
	}

	if (aiocbe->jobstate == JOBST_JOBQBUF) {
		if ((error = aio_fphysio(p, aiocbe, 1)) != 0)
			return error;
		if (aiocbe->jobstate != JOBST_JOBBFINISHED)
			panic("aio_free_entry: invalid physio finish-up state");
		s = splbio();
		TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
		splx(s);
	} else if (aiocbe->jobstate == JOBST_JOBQPROC) {
		aiop = aiocbe->jobaioproc;
		TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list);
	} else if (aiocbe->jobstate == JOBST_JOBQGLOBAL) {
		TAILQ_REMOVE(&aio_jobs, aiocbe, list);
	} else if (aiocbe->jobstate == JOBST_JOBFINISHED) {
		TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist);
	} else if (aiocbe->jobstate == JOBST_JOBBFINISHED) {
		s = splbio();
		TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
		splx(s);
		if (aiocbe->bp) {
			vunmapbuf(aiocbe->bp);
			relpbuf(aiocbe->bp);
			aiocbe->bp = NULL;
		}
	}

	if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) {
		TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
		zfree(aiolio_zone, lj);
	}
	TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
	aiocbe->jobstate = JOBST_NULL;
	return 0;
}
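
/*
 * Note: as the header comment above says, a nonzero return from
 * aio_free_entry() (for instance EINPROGRESS from aio_fphysio() when a
 * physio transfer has not finished in time) means the entry was not freed;
 * aio_proc_rundown() below uses that return value to restart its queue
 * scans.
 */
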
/*
 * Rundown the jobs for a given process.
 */
void
aio_proc_rundown(struct proc *p)
{
	int s;
	struct kaioinfo *ki;
	struct aio_liojob *lj, *ljn;
	struct aiocblist *aiocbe, *aiocbn;

	ki = p->p_aioinfo;
	if (ki == NULL)
		return;

	ki->kaio_flags |= LIOJ_SIGNAL_POSTED;
	while ((ki->kaio_active_count > 0) ||
	    (ki->kaio_buffer_count > ki->kaio_buffer_finished_count)) {
		ki->kaio_flags |= KAIO_RUNDOWN;
		if (tsleep(p, PRIBIO, "kaiowt", aiod_timeout))
			break;
	}

restart1:
	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobdone); aiocbe; aiocbe = aiocbn) {
		aiocbn = TAILQ_NEXT(aiocbe, plist);
		if (aio_free_entry(aiocbe))
			goto restart1;
	}

restart2:
	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue); aiocbe; aiocbe = aiocbn) {
		aiocbn = TAILQ_NEXT(aiocbe, plist);
		if (aio_free_entry(aiocbe))
			goto restart2;
	}

	/*
	 * Note the use of lots of splbio here, trying to avoid
	 * splbio for long chains of I/O.  Probably unnecessary.
	 */
restart3:
	s = splbio();
	while (TAILQ_FIRST(&ki->kaio_bufqueue)) {
		ki->kaio_flags |= KAIO_WAKEUP;
		tsleep(p, PRIBIO, "aioprn", 0);
		splx(s);
		goto restart3;
	}
	splx(s);

restart4:
	s = splbio();
	for (aiocbe = TAILQ_FIRST(&ki->kaio_bufdone); aiocbe; aiocbe = aiocbn) {
		aiocbn = TAILQ_NEXT(aiocbe, plist);
		if (aio_free_entry(aiocbe)) {
			splx(s);
			goto restart4;
		}
	}
	splx(s);

	for (lj = TAILQ_FIRST(&ki->kaio_liojoblist); lj; lj = ljn) {
		ljn = TAILQ_NEXT(lj, lioj_list);
		if ((lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) {
			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
			zfree(aiolio_zone, lj);
		} else {
#if defined(DIAGNOSTIC)
			printf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, QF:%d\n",
			    lj->lioj_buffer_count, lj->lioj_buffer_finished_count,
			    lj->lioj_queue_count, lj->lioj_queue_finished_count);
#endif
		}
	}

	zfree(kaio_zone, ki);
	p->p_aioinfo = NULL;
}

/*
 * Select a job to run (called by an AIO daemon)
 */
static struct aiocblist *
aio_selectjob(struct aioproclist *aiop)
{
	struct aiocblist *aiocbe;

	aiocbe = TAILQ_FIRST(&aiop->jobtorun);
	if (aiocbe) {
		TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list);
		return aiocbe;
	}

	for (aiocbe = TAILQ_FIRST(&aio_jobs); aiocbe;
	    aiocbe = TAILQ_NEXT(aiocbe, list)) {
		struct kaioinfo *ki;
		struct proc *userp;

		userp = aiocbe->userproc;
		ki = userp->p_aioinfo;

		if (ki->kaio_active_count < ki->kaio_maxactive_count) {
			TAILQ_REMOVE(&aio_jobs, aiocbe, list);
			return aiocbe;
		}
	}

	return NULL;
}

/*
 * The AIO processing activity.  This is the code that does the
 * I/O request for the non-physio version of the operations.  The
 * normal vn operations are used, and this code should work in
 * all instances for every type of file, including pipes, sockets,
 * fifos, and regular files.
 */
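/*
 * Note: the calling AIO daemon (aio_daemon below) has already attached the
 * client's vmspace and file descriptor table to itself before invoking this
 * routine, so the UIO_USERSPACE transfer set up here resolves against the
 * client's memory.
 */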
void
aio_process(struct aiocblist *aiocbe)
{
	struct filedesc *fdp;
	struct proc *userp, *mycp;
	struct aiocb *cb;
	struct file *fp;
	struct uio auio;
	struct iovec aiov;
	unsigned int fd;
	int cnt;
	int error;
	off_t offset;
	int oublock_st, oublock_end;
	int inblock_st, inblock_end;

	userp = aiocbe->userproc;
	cb = &aiocbe->uaiocb;

	mycp = curproc;

	fdp = mycp->p_fd;
	fd = cb->aio_fildes;
	fp = fdp->fd_ofiles[fd];

	aiov.iov_base = (void *) cb->aio_buf;
	aiov.iov_len = cb->aio_nbytes;

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset = cb->aio_offset;
	auio.uio_resid = cb->aio_nbytes;
	cnt = cb->aio_nbytes;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = mycp;

	inblock_st = mycp->p_stats->p_ru.ru_inblock;
	oublock_st = mycp->p_stats->p_ru.ru_oublock;
	if (cb->aio_lio_opcode == LIO_READ) {
		auio.uio_rw = UIO_READ;
		error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred);
	} else {
		auio.uio_rw = UIO_WRITE;
		error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred);
	}
	inblock_end = mycp->p_stats->p_ru.ru_inblock;
	oublock_end = mycp->p_stats->p_ru.ru_oublock;

	aiocbe->inputcharge = inblock_end - inblock_st;
	aiocbe->outputcharge = oublock_end - oublock_st;

	if (error) {
		if (auio.uio_resid != cnt) {
			if (error == ERESTART || error == EINTR ||
			    error == EWOULDBLOCK)
				error = 0;
			if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE))
				psignal(userp, SIGPIPE);
		}
	}

	cnt -= auio.uio_resid;
	cb->_aiocb_private.error = error;
	cb->_aiocb_private.status = cnt;

	return;
}

/*
 * The AIO daemon, most of the actual work is done in aio_process,
 * but the setup (and address space mgmt) is done in this routine.
 */
static void
aio_daemon(void *uproc)
{
	int s;
	struct aioproclist *aiop;
	struct vmspace *myvm, *aiovm;
	struct proc *mycp;

	/*
	 * Local copies of curproc (cp) and vmspace (myvm)
	 */
	mycp = curproc;
	myvm = mycp->p_vmspace;

	/*
	 * We manage to create only one VM space for all AIOD processes.
	 * The VM space for the first AIOD created becomes the shared VM
	 * space for all of them.  We add an additional reference count,
	 * even for the first AIOD, so the address space does not go away,
	 * and we continue to use that original VM space even if the first
	 * AIOD exits.
	 */
	if ((aiovm = aiovmspace) == NULL) {
		aiovmspace = myvm;
		myvm->vm_refcnt++;
		/*
		 * Remove userland cruft from address space.
		 */
		if (myvm->vm_shm)
			shmexit(mycp);
		pmap_remove_pages(&myvm->vm_pmap, 0, USRSTACK);
		vm_map_remove(&myvm->vm_map, 0, USRSTACK);
		myvm->vm_tsize = 0;
		myvm->vm_dsize = 0;
		myvm->vm_ssize = 0;
	} else {
		aiovm->vm_refcnt++;
		mycp->p_vmspace = aiovm;
		pmap_activate(mycp);
		vmspace_free(myvm);
		myvm = aiovm;
	}

	if (mycp->p_textvp) {
		vrele(mycp->p_textvp);
		mycp->p_textvp = NULL;
	}

	/*
	 * Allocate and ready the aio control info.  There is one
	 * aiop structure per daemon.
	 */
	aiop = zalloc(aiop_zone);
	aiop->aioproc = mycp;
	aiop->aioprocflags |= AIOP_FREE;
	TAILQ_INIT(&aiop->jobtorun);

	/*
	 * Place thread (lightweight process) onto the AIO free thread list
	 */
	if (TAILQ_EMPTY(&aio_freeproc))
		wakeup(&aio_freeproc);
	TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);

	/*
	 * Make up a name for the daemon
	 */
	strcpy(mycp->p_comm, "aiod");

	/*
	 * Get rid of our current filedescriptors.  AIOD's don't need any
	 * filedescriptors, except as temporarily inherited from the client.
	 * Credentials are also cloned, and made equivalent to "root."
	 */
	fdfree(mycp);
	mycp->p_fd = NULL;
	mycp->p_ucred = crcopy(mycp->p_ucred);
	mycp->p_ucred->cr_uid = 0;
	mycp->p_ucred->cr_ngroups = 1;
	mycp->p_ucred->cr_groups[0] = 1;

	/*
	 * The daemon resides in its own pgrp.
	 */
	enterpgrp(mycp, mycp->p_pid, 1);

	/*
	 * Mark special process type
	 */
	mycp->p_flag |= P_SYSTEM|P_KTHREADP;

	/*
	 * Wakeup parent process.  (Parent sleeps to keep from blasting away
	 * creating too many daemons.)
	 */
	wakeup(mycp);

	while (1) {
		struct proc *curcp;
		struct aiocblist *aiocbe;

		/*
		 * curcp is the current daemon process context.
		 * userp is the current user process context.
		 */
		curcp = mycp;

		/*
		 * Take daemon off of free queue
		 */
		if (aiop->aioprocflags & AIOP_FREE) {
			TAILQ_REMOVE(&aio_freeproc, aiop, list);
			TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
			aiop->aioprocflags &= ~AIOP_FREE;
		}
		aiop->aioprocflags &= ~AIOP_SCHED;

		/*
		 * Check for jobs
		 */
		while ((aiocbe = aio_selectjob(aiop)) != NULL) {
			struct proc *userp;
			struct aiocb *cb;
			struct kaioinfo *ki;
			struct aio_liojob *lj;

			cb = &aiocbe->uaiocb;
			userp = aiocbe->userproc;

			aiocbe->jobstate = JOBST_JOBRUNNING;

			/*
			 * Connect to process address space for user program
			 */
			if (userp != curcp) {
				struct vmspace *tmpvm;
				/*
				 * Save the current address space that we are
				 * connected to.
				 */
				tmpvm = mycp->p_vmspace;
				/*
				 * Point to the new user address space, and
				 * refer to it.
				 */
				mycp->p_vmspace = userp->p_vmspace;
				mycp->p_vmspace->vm_refcnt++;
				/*
				 * Activate the new mapping.
				 */
				pmap_activate(mycp);
				/*
				 * If the old address space wasn't the daemon's
				 * own address space, then we need to remove the
				 * daemon's reference from the other process
				 * that it was acting on behalf of.
				 */
				if (tmpvm != myvm) {
					vmspace_free(tmpvm);
				}
				/*
				 * Disassociate from the previous client's file
				 * descriptors, and associate to the new client's
				 * descriptors.  Note that the daemon doesn't
				 * need to worry about its original descriptors,
				 * because they were originally freed.
				 */
				if (mycp->p_fd)
					fdfree(mycp);
				mycp->p_fd = fdshare(userp);
				curcp = userp;
			}

			ki = userp->p_aioinfo;
			lj = aiocbe->lio;

			/*
			 * Account for currently active jobs
			 */
			ki->kaio_active_count++;

			/*
			 * Do the I/O function
			 */
			aiocbe->jobaioproc = aiop;
			aio_process(aiocbe);

			/*
			 * decrement the active job count
			 */
			ki->kaio_active_count--;

			/*
			 * increment the completion count for wakeup/signal
			 * comparisons
			 */
			aiocbe->jobflags |= AIOCBLIST_DONE;
			ki->kaio_queue_finished_count++;
			if (lj) {
				lj->lioj_queue_finished_count++;
			}
			if ((ki->kaio_flags & KAIO_WAKEUP) ||
			    ((ki->kaio_flags & KAIO_RUNDOWN) &&
			    (ki->kaio_active_count == 0))) {
				ki->kaio_flags &= ~KAIO_WAKEUP;
				wakeup(userp);
			}

			s = splbio();
			if (lj && (lj->lioj_flags &
			    (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL) {
				if ((lj->lioj_queue_finished_count ==
				    lj->lioj_queue_count) &&
				    (lj->lioj_buffer_finished_count ==
				    lj->lioj_buffer_count)) {
					psignal(userp,
					    lj->lioj_signal.sigev_signo);
					lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
				}
			}
			splx(s);

			aiocbe->jobstate = JOBST_JOBFINISHED;

			/*
			 * If the I/O request should be automatically rundown,
			 * do the needed cleanup.  Otherwise, place the queue
			 * entry for the just finished I/O request into the done
			 * queue for the associated client.
			 */
			if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) {
				aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;
				TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
			} else {
				TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
				TAILQ_INSERT_TAIL(&ki->kaio_jobdone, aiocbe, plist);
			}

			if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) {
				wakeup(aiocbe);
				aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN;
			}

			if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
				psignal(userp, cb->aio_sigevent.sigev_signo);
			}
		}

		/*
		 * Disconnect from user address space
		 */
		if (curcp != mycp) {
			struct vmspace *tmpvm;
			/*
			 * Get the user address space to disconnect from.
			 */
			tmpvm = mycp->p_vmspace;
			/*
			 * Get original address space for daemon.
			 */
			mycp->p_vmspace = myvm;
			/*
			 * Activate the daemon's address space.
			 */
			pmap_activate(mycp);
#if defined(DIAGNOSTIC)
			if (tmpvm == myvm)
				printf("AIOD: vmspace problem -- %d\n",
				    mycp->p_pid);
#endif
			/*
			 * remove our vmspace reference.
			 */
			vmspace_free(tmpvm);
			/*
			 * disassociate from the user process's file
			 * descriptors.
			 */
			if (mycp->p_fd)
				fdfree(mycp);
			mycp->p_fd = NULL;
			curcp = mycp;
		}

		/*
		 * If we are the first to be put onto the free queue, wakeup
		 * anyone waiting for a daemon.
		 */
		TAILQ_REMOVE(&aio_activeproc, aiop, list);
		if (TAILQ_EMPTY(&aio_freeproc))
			wakeup(&aio_freeproc);
		TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
		aiop->aioprocflags |= AIOP_FREE;

		/*
		 * If daemon is inactive for a long time, allow it to exit,
		 * thereby freeing resources.
		 */
		if (((aiop->aioprocflags & AIOP_SCHED) == 0) &&
		    tsleep(mycp, PRIBIO, "aiordy", aiod_lifetime)) {
			if ((TAILQ_FIRST(&aio_jobs) == NULL) &&
			    (TAILQ_FIRST(&aiop->jobtorun) == NULL)) {
				if ((aiop->aioprocflags & AIOP_FREE) &&
				    (num_aio_procs > target_aio_procs)) {
					TAILQ_REMOVE(&aio_freeproc, aiop, list);
					zfree(aiop_zone, aiop);
					num_aio_procs--;
#if defined(DIAGNOSTIC)
					if (mycp->p_vmspace->vm_refcnt <= 1)
						printf("AIOD: bad vm refcnt for exiting daemon: %d\n",
						    mycp->p_vmspace->vm_refcnt);
#endif
					exit1(mycp, 0);
				}
			}
		}
	}
}

/*
 * Create a new AIO daemon.  This is mostly a kernel-thread fork routine.
 * The AIO daemon modifies its environment itself.
 */
static int
aio_newproc()
{
	int error;
	struct rfork_args rfa;
	struct proc *p, *np;

	rfa.flags = RFPROC | RFCFDG;

	p = curproc;
	if ((error = rfork(p, &rfa)) != 0)
		return error;

	np = pfind(p->p_retval[0]);
	cpu_set_fork_handler(np, aio_daemon, p);

	/*
	 * Wait until daemon is started, but continue on just in case (to
	 * handle error conditions).
	 */
	error = tsleep(np, PZERO, "aiosta", aiod_timeout);
	num_aio_procs++;

	return error;
}

/*
 * Try the high-performance physio method for eligible VCHR devices.  This
 * routine doesn't require the use of any additional threads, and has very
 * little overhead.
 */
int
aio_qphysio(p, aiocbe)
	struct proc *p;
	struct aiocblist *aiocbe;
{
	int error;
	struct aiocb *cb;
	struct file *fp;
	struct buf *bp;
	int bflags;
	struct vnode *vp;
	struct kaioinfo *ki;
	struct filedesc *fdp;
	struct aio_liojob *lj;
	int fd;
	int majordev;
	int s;
	int cnt;
	dev_t dev;
	int rw;
	d_strategy_t *fstrategy;
	struct cdevsw *cdev;
	struct cdevsw *bdev;

	cb = &aiocbe->uaiocb;
	fdp = p->p_fd;
	fd = cb->aio_fildes;
	fp = fdp->fd_ofiles[fd];

	if (fp->f_type != DTYPE_VNODE) {
		return -1;
	}

	vp = (struct vnode *)fp->f_data;
	if (vp->v_type != VCHR || ((cb->aio_nbytes & (DEV_BSIZE - 1)) != 0)) {
		return -1;
	}

	if ((cb->aio_nbytes > MAXPHYS) && (num_buf_aio >= max_buf_aio)) {
		return -1;
	}

	if ((vp->v_specinfo == NULL) || (vp->v_flag & VISTTY)) {
		return -1;
	}

	majordev = major(vp->v_rdev);
	if (majordev == NODEV) {
		return -1;
	}

	cdev = cdevsw[major(vp->v_rdev)];
	if (cdev == NULL) {
		return -1;
	}

	if (cdev->d_bmaj == -1) {
		return -1;
	}
	bdev = cdev;

	ki = p->p_aioinfo;
	if (ki->kaio_buffer_count >= ki->kaio_ballowed_count) {
		return -1;
	}

	cnt = cb->aio_nbytes;
	if (cnt > MAXPHYS) {
		return -1;
	}

	dev = makedev(bdev->d_bmaj, minor(vp->v_rdev));

	/*
	 * Physical I/O is charged directly to the process, so we don't have
	 * to fake it.
	 */
	aiocbe->inputcharge = 0;
	aiocbe->outputcharge = 0;

	ki->kaio_buffer_count++;

	lj = aiocbe->lio;
	if (lj) {
		lj->lioj_buffer_count++;
	}

	/* create and build a buffer header for a transfer */
	bp = (struct buf *)getpbuf();

	/*
	 * get a copy of the kva from the physical buffer
	 */
	bp->b_proc = p;
	bp->b_dev = dev;
	error = bp->b_error = 0;

	if (cb->aio_lio_opcode == LIO_WRITE) {
		rw = 0;
		bflags = B_WRITE;
	} else {
		rw = 1;
		bflags = B_READ;
	}

	bp->b_bcount = cb->aio_nbytes;
	bp->b_bufsize = cb->aio_nbytes;
	bp->b_flags = B_BUSY | B_PHYS | B_CALL | bflags;
	bp->b_iodone = aio_physwakeup;
	bp->b_saveaddr = bp->b_data;
	bp->b_data = (void *) cb->aio_buf;
	bp->b_blkno = btodb(cb->aio_offset);

	if (rw && !useracc(bp->b_data, bp->b_bufsize, B_WRITE)) {
		error = EFAULT;
		goto doerror;
	}
	if (!rw && !useracc(bp->b_data, bp->b_bufsize, B_READ)) {
		error = EFAULT;
		goto doerror;
	}

	/* bring buffer into kernel space */
	vmapbuf(bp);

	s = splbio();
	aiocbe->bp = bp;
	bp->b_spc = (void *)aiocbe;
	TAILQ_INSERT_TAIL(&aio_bufjobs, aiocbe, list);
	TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
	aiocbe->jobstate = JOBST_JOBQBUF;
	cb->_aiocb_private.status = cb->aio_nbytes;
	num_buf_aio++;
	fstrategy = bdev->d_strategy;
	bp->b_error = 0;

	splx(s);
	/* perform transfer */
	(*fstrategy)(bp);

	s = splbio();
	/*
	 * If we had an error invoking the request, or an error in processing
	 * the request before we have returned, we process it as an error in
	 * transfer.  Note that such an I/O error is not indicated immediately,
	 * but is returned using the aio_error mechanism.  In this case,
	 * aio_suspend will return immediately.
	 */
	if (bp->b_error || (bp->b_flags & B_ERROR)) {
		struct aiocb *job = aiocbe->uuaiocb;

		aiocbe->uaiocb._aiocb_private.status = 0;
		suword(&job->_aiocb_private.status, 0);
		aiocbe->uaiocb._aiocb_private.error = bp->b_error;
		suword(&job->_aiocb_private.error, bp->b_error);

		ki->kaio_buffer_finished_count++;

		if (aiocbe->jobstate != JOBST_JOBBFINISHED) {
			aiocbe->jobstate = JOBST_JOBBFINISHED;
			aiocbe->jobflags |= AIOCBLIST_DONE;
			TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
			TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
			TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
		}
	}
	splx(s);
	return 0;

doerror:
	ki->kaio_buffer_count--;
	if (lj) {
		lj->lioj_buffer_count--;
	}
	aiocbe->bp = NULL;
	relpbuf(bp);
	return error;
}

/*
 * This waits/tests physio completion.
 */
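/*
 * flgwait == 0 merely polls: if the buffer isn't done yet, EINPROGRESS is
 * returned without sleeping.  Otherwise the routine waits up to aiod_timeout
 * ticks for the transfer to finish, and still reports EINPROGRESS if it has
 * not completed by then.
 */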
int
aio_fphysio(p, iocb, flgwait)
	struct proc *p;
	struct aiocblist *iocb;
	int flgwait;
{
	int s;
	struct buf *bp;
	int error;

	bp = iocb->bp;

	s = splbio();
	if (flgwait == 0) {
		if ((bp->b_flags & B_DONE) == 0) {
			splx(s);
			return EINPROGRESS;
		}
	}

	while ((bp->b_flags & B_DONE) == 0) {
		if (tsleep((caddr_t)bp, PRIBIO, "physstr", aiod_timeout)) {
			if ((bp->b_flags & B_DONE) == 0) {
				splx(s);
				return EINPROGRESS;
			} else {
				break;
			}
		}
	}

	/* release mapping into kernel space */
	vunmapbuf(bp);
	iocb->bp = 0;

	error = 0;
	/*
	 * check for an error
	 */
	if (bp->b_flags & B_ERROR) {
		error = bp->b_error;
	}

	relpbuf(bp);
	return (error);
}

/*
 * Queue a new AIO request.  Choosing either the threaded or direct physio
 * VCHR technique is done in this code.
 */
static int
_aio_aqueue(struct proc *p, struct aiocb *job, struct aio_liojob *lj, int type)
{
	struct filedesc *fdp;
	struct file *fp;
	unsigned int fd;

	int error;
	int opcode;
	struct aiocblist *aiocbe;
	struct aioproclist *aiop;
	struct kaioinfo *ki;

	if ((aiocbe = TAILQ_FIRST(&aio_freejobs)) != NULL) {
		TAILQ_REMOVE(&aio_freejobs, aiocbe, list);
	} else {
		aiocbe = zalloc(aiocb_zone);
	}

	aiocbe->inputcharge = 0;
	aiocbe->outputcharge = 0;

	suword(&job->_aiocb_private.status, -1);
	suword(&job->_aiocb_private.error, 0);
	suword(&job->_aiocb_private.kernelinfo, -1);

	error = copyin((caddr_t)job, (caddr_t)&aiocbe->uaiocb,
	    sizeof aiocbe->uaiocb);
	if (error) {
		suword(&job->_aiocb_private.error, error);

		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		return error;
	}

	/*
	 * Save userspace address of the job info
	 */
	aiocbe->uuaiocb = job;

	/*
	 * Get the opcode
	 */
	if (type != LIO_NOP) {
		aiocbe->uaiocb.aio_lio_opcode = type;
	}
	opcode = aiocbe->uaiocb.aio_lio_opcode;

	/*
	 * Get the fd info for process
	 */
	fdp = p->p_fd;

	/*
	 * Range check file descriptor
	 */
	fd = aiocbe->uaiocb.aio_fildes;
	if (fd >= fdp->fd_nfiles) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0) {
			suword(&job->_aiocb_private.error, EBADF);
		}
		return EBADF;
	}

	fp = fdp->fd_ofiles[fd];
	if ((fp == NULL) ||
	    ((opcode == LIO_WRITE) && ((fp->f_flag & FWRITE) == 0))) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0) {
			suword(&job->_aiocb_private.error, EBADF);
		}
		return EBADF;
	}

	if (aiocbe->uaiocb.aio_offset == -1LL) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0) {
			suword(&job->_aiocb_private.error, EINVAL);
		}
		return EINVAL;
	}

	error = suword(&job->_aiocb_private.kernelinfo, jobrefid);
	if (error) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0) {
			suword(&job->_aiocb_private.error, EINVAL);
		}
		return error;
	}

	aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jobrefid;
	if (jobrefid == LONG_MAX)
		jobrefid = 1;
	else
		jobrefid++;

	if (opcode == LIO_NOP) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0) {
			suword(&job->_aiocb_private.error, 0);
			suword(&job->_aiocb_private.status, 0);
			suword(&job->_aiocb_private.kernelinfo, 0);
		}
		return 0;
	}

	if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0) {
			suword(&job->_aiocb_private.status, 0);
			suword(&job->_aiocb_private.error, EINVAL);
		}
		return EINVAL;
	}

	suword(&job->_aiocb_private.error, EINPROGRESS);
	aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
	aiocbe->userproc = p;
	aiocbe->jobflags = 0;
	aiocbe->lio = lj;
	ki = p->p_aioinfo;

	if ((error = aio_qphysio(p, aiocbe)) == 0) {
		return 0;
	} else if (error > 0) {
		suword(&job->_aiocb_private.status, 0);
		aiocbe->uaiocb._aiocb_private.error = error;
		suword(&job->_aiocb_private.error, error);
		return error;
	}

	/*
	 * No buffer for daemon I/O
	 */
	aiocbe->bp = NULL;

	ki->kaio_queue_count++;
	if (lj) {
		lj->lioj_queue_count++;
	}
	TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
	TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
	aiocbe->jobstate = JOBST_JOBQGLOBAL;

	num_queue_count++;
	error = 0;

	/*
	 * If we don't have a free AIO process, and we are below our quota,
	 * then start one.  Otherwise, depend on the subsequent I/O completions
	 * to pick up this job.  If we don't successfully create the new
	 * process (thread) due to resource issues, we return an error for now
	 * (EAGAIN), which is likely not the correct thing to do.
	 */
retryproc:
	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
		TAILQ_REMOVE(&aio_freeproc, aiop, list);
		TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
		aiop->aioprocflags &= ~AIOP_FREE;
		wakeup(aiop->aioproc);
	} else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
	    ((ki->kaio_active_count + num_aio_resv_start) <
	    ki->kaio_maxactive_count)) {
		num_aio_resv_start++;
		if ((error = aio_newproc()) == 0) {
			num_aio_resv_start--;
			p->p_retval[0] = 0;
			goto retryproc;
		}
		num_aio_resv_start--;
	}
	return error;
}

/*
 * This routine queues an AIO request, checking for quotas.
 */
static int
aio_aqueue(struct proc *p, struct aiocb *job, int type)
{
	struct kaioinfo *ki;

	if (p->p_aioinfo == NULL) {
		aio_init_aioinfo(p);
	}

	if (num_queue_count >= max_queue_count)
		return EAGAIN;

	ki = p->p_aioinfo;
	if (ki->kaio_queue_count >= ki->kaio_qallowed_count)
		return EAGAIN;

	return _aio_aqueue(p, job, NULL, type);
}

/*
 * Support the aio_return system call; as a side effect, kernel
 * resources are released.
 */
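/*
 * For requests completed by an AIO daemon, the block I/O charges the daemon
 * accumulated on the caller's behalf (inputcharge/outputcharge) are credited
 * back to the calling process here before the entry is freed.
 */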
int
aio_return(struct proc *p, struct aio_return_args *uap)
{
	int s;
	int jobref;
	struct aiocblist *cb, *ncb;
	struct aiocb *ujob;
	struct kaioinfo *ki;

	ki = p->p_aioinfo;
	if (ki == NULL) {
		return EINVAL;
	}

	ujob = uap->aiocbp;

	jobref = fuword(&ujob->_aiocb_private.kernelinfo);
	if (jobref == -1 || jobref == 0)
		return EINVAL;

	for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb;
	    cb = TAILQ_NEXT(cb, plist)) {
		if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
		    jobref) {
			if (ujob == cb->uuaiocb) {
				p->p_retval[0] =
				    cb->uaiocb._aiocb_private.status;
			} else {
				p->p_retval[0] = EFAULT;
			}
			if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
				curproc->p_stats->p_ru.ru_oublock +=
				    cb->outputcharge;
				cb->outputcharge = 0;
			} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
				curproc->p_stats->p_ru.ru_inblock +=
				    cb->inputcharge;
				cb->inputcharge = 0;
			}
			aio_free_entry(cb);
			return 0;
		}
	}

	s = splbio();
	for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = ncb) {
		ncb = TAILQ_NEXT(cb, plist);
		if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
		    jobref) {
			splx(s);
			if (ujob == cb->uuaiocb) {
				p->p_retval[0] =
				    cb->uaiocb._aiocb_private.status;
			} else {
				p->p_retval[0] = EFAULT;
			}
			aio_free_entry(cb);
			return 0;
		}
	}
	splx(s);

	return (EINVAL);
}

/*
 * Allow a process to wake up when any of the I/O requests are completed.
 */
int
aio_suspend(struct proc *p, struct aio_suspend_args *uap)
{
	struct timeval atv;
	struct timespec ts;
	struct aiocb *const *cbptr, *cbp;
	struct kaioinfo *ki;
	struct aiocblist *cb;
	int i;
	int njoblist;
	int error, s, timo;
	int *ijoblist;
	struct aiocb **ujoblist;

	if (uap->nent >= AIO_LISTIO_MAX)
		return EINVAL;

	timo = 0;
	if (uap->timeout) {
		/*
		 * Get timespec struct
		 */
		if ((error = copyin((caddr_t) uap->timeout, (caddr_t) &ts,
		    sizeof ts)) != 0) {
			return error;
		}

		if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000)
			return (EINVAL);

		TIMESPEC_TO_TIMEVAL(&atv, &ts);
		if (itimerfix(&atv))
			return (EINVAL);
		timo = tvtohz(&atv);
	}

	ki = p->p_aioinfo;
	if (ki == NULL)
		return EAGAIN;

	njoblist = 0;
	ijoblist = zalloc(aiol_zone);
	ujoblist = zalloc(aiol_zone);
	cbptr = uap->aiocbp;

	for (i = 0; i < uap->nent; i++) {
		cbp = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]);
		if (cbp == 0)
			continue;
		ujoblist[njoblist] = cbp;
		ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo);
		njoblist++;
	}
	if (njoblist == 0) {
		zfree(aiol_zone, ijoblist);
		zfree(aiol_zone, ujoblist);
		return 0;
	}

	error = 0;
	while (1) {
		for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb;
		    cb = TAILQ_NEXT(cb, plist)) {
			for (i = 0; i < njoblist; i++) {
				if (((intptr_t)
				    cb->uaiocb._aiocb_private.kernelinfo) ==
				    ijoblist[i]) {
					if (ujoblist[i] != cb->uuaiocb)
						error = EINVAL;
					zfree(aiol_zone, ijoblist);
					zfree(aiol_zone, ujoblist);
					return error;
				}
			}
		}

		s = splbio();
		for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb;
		    cb = TAILQ_NEXT(cb, plist)) {
			for (i = 0; i < njoblist; i++) {
				if (((intptr_t)
				    cb->uaiocb._aiocb_private.kernelinfo) ==
				    ijoblist[i]) {
					splx(s);
					if (ujoblist[i] != cb->uuaiocb)
						error = EINVAL;
					zfree(aiol_zone, ijoblist);
					zfree(aiol_zone, ujoblist);
					return error;
				}
			}
		}

		ki->kaio_flags |= KAIO_WAKEUP;
		error = tsleep(p, PRIBIO|PCATCH, "aiospn", timo);
		splx(s);

		if (error == EINTR) {
			zfree(aiol_zone, ijoblist);
			zfree(aiol_zone, ujoblist);
			return EINTR;
		} else if (error == EWOULDBLOCK) {
			zfree(aiol_zone, ijoblist);
			zfree(aiol_zone, ujoblist);
			return EAGAIN;
		}
	}

	/* NOTREACHED */
	return EINVAL;
}

/*
 * aio_cancel at the kernel level is a NOOP right now.  It
 * might be possible to support it partially in user mode, or
 * in kernel mode later on.
 */
int
aio_cancel(struct proc *p, struct aio_cancel_args *uap)
{
	return ENOSYS;
}

/*
 * aio_error is implemented in the kernel level for compatibility
 * purposes only.  For a user mode async implementation, it would be
 * best to do it in a userland subroutine.
 */
int
aio_error(struct proc *p, struct aio_error_args *uap)
{
	int s;
	struct aiocblist *cb;
	struct kaioinfo *ki;
	int jobref;

	ki = p->p_aioinfo;
	if (ki == NULL)
		return EINVAL;

	jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo);
	if ((jobref == -1) || (jobref == 0))
		return EINVAL;

	for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb;
	    cb = TAILQ_NEXT(cb, plist)) {
		if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
		    jobref) {
			p->p_retval[0] = cb->uaiocb._aiocb_private.error;
			return 0;
		}
	}

	for (cb = TAILQ_FIRST(&ki->kaio_jobqueue); cb;
	    cb = TAILQ_NEXT(cb, plist)) {
		if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
		    jobref) {
			p->p_retval[0] = EINPROGRESS;
			return 0;
		}
	}

	s = splbio();
	for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb;
	    cb = TAILQ_NEXT(cb, plist)) {
		if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
		    jobref) {
			p->p_retval[0] = cb->uaiocb._aiocb_private.error;
			splx(s);
			return 0;
		}
	}

	for (cb = TAILQ_FIRST(&ki->kaio_bufqueue); cb;
	    cb = TAILQ_NEXT(cb, plist)) {
		if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
		    jobref) {
			p->p_retval[0] = EINPROGRESS;
			splx(s);
			return 0;
		}
	}
	splx(s);

	/*
	 * Hack for lio
	 */
	/*
	status = fuword(&uap->aiocbp->_aiocb_private.status);
	if (status == -1) {
		return fuword(&uap->aiocbp->_aiocb_private.error);
	}
	*/
	return EINVAL;
}

int
aio_read(struct proc *p, struct aio_read_args *uap)
{
	struct filedesc *fdp;
	struct file *fp;
	struct uio auio;
	struct iovec aiov;
	unsigned int fd;
	int cnt;
	struct aiocb iocb;
	int error, pmodes;

	pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes);
	if ((pmodes & AIO_PMODE_SYNC) == 0) {
		return aio_aqueue(p, (struct aiocb *) uap->aiocbp, LIO_READ);
	}

	/*
	 * Get control block
	 */
	if ((error = copyin((caddr_t) uap->aiocbp, (caddr_t) &iocb,
	    sizeof iocb)) != 0)
		return error;

	/*
	 * Get the fd info for process
	 */
	fdp = p->p_fd;

	/*
	 * Range check file descriptor
	 */
	fd = iocb.aio_fildes;
	if (fd >= fdp->fd_nfiles)
		return EBADF;
	fp = fdp->fd_ofiles[fd];
	if ((fp == NULL) || ((fp->f_flag & FREAD) == 0))
		return EBADF;
	if (iocb.aio_offset == -1LL)
		return EINVAL;

	auio.uio_resid = iocb.aio_nbytes;
	if (auio.uio_resid < 0)
		return (EINVAL);

	/*
	 * Process sync simply -- queue async request.
	 */
	if ((iocb._aiocb_private.privatemodes & AIO_PMODE_SYNC) == 0) {
		return aio_aqueue(p, (struct aiocb *) uap->aiocbp, LIO_READ);
	}

	aiov.iov_base = (void *) iocb.aio_buf;
	aiov.iov_len = iocb.aio_nbytes;

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = iocb.aio_offset;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;

	cnt = iocb.aio_nbytes;
	error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred);
	if (error &&
	    (auio.uio_resid != cnt) &&
	    (error == ERESTART || error == EINTR || error == EWOULDBLOCK))
		error = 0;
	cnt -= auio.uio_resid;
	p->p_retval[0] = cnt;
	return error;
}

int
aio_write(struct proc *p, struct aio_write_args *uap)
{
	struct filedesc *fdp;
	struct file *fp;
	struct uio auio;
	struct iovec aiov;
	unsigned int fd;
	int cnt;
	struct aiocb iocb;
	int error;
	int pmodes;

	/*
	 * Process sync simply -- queue async request.
	 */
	pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes);
	if ((pmodes & AIO_PMODE_SYNC) == 0) {
		return aio_aqueue(p, (struct aiocb *) uap->aiocbp, LIO_WRITE);
	}

	if ((error = copyin((caddr_t) uap->aiocbp, (caddr_t) &iocb,
	    sizeof iocb)) != 0)
		return error;

	/*
	 * Get the fd info for process
	 */
	fdp = p->p_fd;

	/*
	 * Range check file descriptor
	 */
	fd = iocb.aio_fildes;
	if (fd >= fdp->fd_nfiles)
		return EBADF;
	fp = fdp->fd_ofiles[fd];
	if ((fp == NULL) || ((fp->f_flag & FWRITE) == 0))
		return EBADF;
	if (iocb.aio_offset == -1LL)
		return EINVAL;

	aiov.iov_base = (void *) iocb.aio_buf;
	aiov.iov_len = iocb.aio_nbytes;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = iocb.aio_offset;

	auio.uio_resid = iocb.aio_nbytes;
	if (auio.uio_resid < 0)
		return (EINVAL);

	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;

	cnt = iocb.aio_nbytes;
	error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred);
	if (error) {
		if (auio.uio_resid != cnt) {
			if (error == ERESTART || error == EINTR ||
			    error == EWOULDBLOCK)
				error = 0;
			if (error == EPIPE)
				psignal(p, SIGPIPE);
		}
	}
	cnt -= auio.uio_resid;
	p->p_retval[0] = cnt;
	return error;
}

int
lio_listio(struct proc *p, struct lio_listio_args *uap)
{
	int nent, nentqueued;
	struct aiocb *iocb, * const *cbptr;
	struct aiocblist *cb;
	struct kaioinfo *ki;
	struct aio_liojob *lj;
	int error, runningcode;
	int nerror;
	int i;
	int s;

	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) {
		return EINVAL;
	}

	nent = uap->nent;
	if (nent > AIO_LISTIO_MAX) {
		return EINVAL;
	}

	if (p->p_aioinfo == NULL) {
		aio_init_aioinfo(p);
	}

	if ((nent + num_queue_count) > max_queue_count) {
		return EAGAIN;
	}

	ki = p->p_aioinfo;
	if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count) {
		return EAGAIN;
	}

	lj = zalloc(aiolio_zone);
	if (!lj) {
		return EAGAIN;
	}

	lj->lioj_flags = 0;
	lj->lioj_buffer_count = 0;
	lj->lioj_buffer_finished_count = 0;
	lj->lioj_queue_count = 0;
	lj->lioj_queue_finished_count = 0;
	lj->lioj_ki = ki;
	TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);

	/*
	 * Setup signal
	 */
	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
		error = copyin(uap->sig, &lj->lioj_signal,
		    sizeof lj->lioj_signal);
		if (error)
			return error;
		lj->lioj_flags |= LIOJ_SIGNAL;
		lj->lioj_flags &= ~LIOJ_SIGNAL_POSTED;
	} else {
		lj->lioj_flags &= ~LIOJ_SIGNAL;
	}

	/*
	 * get pointers to the list of I/O requests
	 */
	nerror = 0;
	nentqueued = 0;
	cbptr = uap->acb_list;
	for (i = 0; i < uap->nent; i++) {
		iocb = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]);
		if (((intptr_t) iocb != -1) && ((intptr_t) iocb != NULL)) {
			error = _aio_aqueue(p, iocb, lj, 0);
			if (error == 0) {
				nentqueued++;
			} else {
				nerror++;
			}
		}
	}

	/*
	 * If we haven't queued any, then just return error
	 */
	if (nentqueued == 0) {
		return 0;
	}

	/*
	 * Calculate the appropriate error return
	 */
	runningcode = 0;
	if (nerror)
		runningcode = EIO;

	if (uap->mode == LIO_WAIT) {
		while (1) {
			int found;

			found = 0;
			for (i = 0; i < uap->nent; i++) {
				int jobref, command;

				/*
				 * Fetch address of the control buf pointer in
				 * user space
				 */
				iocb = (struct aiocb *)
				    (intptr_t)fuword((caddr_t)&cbptr[i]);
				if (((intptr_t) iocb == -1) ||
				    ((intptr_t) iocb == 0))
					continue;

				/*
				 * Fetch the associated command from user space
				 */
				command = fuword(&iocb->aio_lio_opcode);
				if (command == LIO_NOP) {
					found++;
					continue;
				}

				jobref =
				    fuword(&iocb->_aiocb_private.kernelinfo);

				for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb;
				    cb = TAILQ_NEXT(cb, plist)) {
					if (((intptr_t)
					    cb->uaiocb._aiocb_private.kernelinfo)
					    == jobref) {
						if (cb->uaiocb.aio_lio_opcode
						    == LIO_WRITE) {
							curproc->p_stats->p_ru.ru_oublock +=
							    cb->outputcharge;
							cb->outputcharge = 0;
						} else if (cb->uaiocb.aio_lio_opcode
						    == LIO_READ) {
							curproc->p_stats->p_ru.ru_inblock +=
							    cb->inputcharge;
							cb->inputcharge = 0;
						}
						found++;
						break;
					}
				}

				s = splbio();
				for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb;
				    cb = TAILQ_NEXT(cb, plist)) {
					if (((intptr_t)
					    cb->uaiocb._aiocb_private.kernelinfo)
					    == jobref) {
						found++;
						break;
					}
				}
				splx(s);
			}

			/*
			 * If all I/Os have been disposed of, then we can
			 * return
			 */
			if (found == nentqueued) {
				return runningcode;
			}

			ki->kaio_flags |= KAIO_WAKEUP;
			error = tsleep(p, PRIBIO|PCATCH, "aiospn", 0);

			if (error == EINTR) {
				return EINTR;
			} else if (error == EWOULDBLOCK) {
				return EAGAIN;
			}
		}
	}

	return runningcode;
}

/*
 * This is a weird hack so that we can post a signal.  It is safe
 * to do so from a timeout routine, but *not* from an interrupt routine.
 */
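/*
 * aio_physwakeup() below runs from biodone at interrupt time, so when an lio
 * completion signal is due it defers the psignal() by scheduling
 * process_signal() with timeout(..., 0) rather than posting it directly.
 */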
static void
process_signal(void *ljarg)
{
	struct aio_liojob *lj = ljarg;

	if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) {
		if (lj->lioj_queue_count == lj->lioj_queue_finished_count) {
			psignal(lj->lioj_ki->kaio_p,
			    lj->lioj_signal.sigev_signo);
			lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
		}
	}
}

/*
 * Interrupt handler for physio, performs the necessary process wakeups,
 * and signals.
 */
static void
aio_physwakeup(bp)
	struct buf *bp;
{
	struct aiocblist *aiocbe;
	struct proc *p;
	struct kaioinfo *ki;
	struct aio_liojob *lj;
	int s;

	s = splbio();

	wakeup((caddr_t) bp);
	bp->b_flags &= ~B_CALL;
	bp->b_flags |= B_DONE;

	aiocbe = (struct aiocblist *)bp->b_spc;
	if (aiocbe) {
		p = bp->b_proc;

		aiocbe->jobstate = JOBST_JOBBFINISHED;
		aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
		aiocbe->uaiocb._aiocb_private.error = 0;
		aiocbe->jobflags |= AIOCBLIST_DONE;

		if (bp->b_flags & B_ERROR) {
			aiocbe->uaiocb._aiocb_private.error = bp->b_error;
		}

		lj = aiocbe->lio;
		if (lj) {
			lj->lioj_buffer_finished_count++;
			/*
			 * wakeup/signal if all of the interrupt jobs are done
			 */
			if (lj->lioj_buffer_finished_count ==
			    lj->lioj_buffer_count) {
				/*
				 * post a signal if it is called for
				 */
				if ((lj->lioj_flags &
				    (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) ==
				    LIOJ_SIGNAL) {
					lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
					timeout(process_signal, lj, 0);
				}
			}
		}

		ki = p->p_aioinfo;
		if (ki) {
			ki->kaio_buffer_finished_count++;
			TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
			TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
			TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
			/*
			 * and do the wakeup
			 */
			if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) {
				ki->kaio_flags &= ~KAIO_WAKEUP;
				wakeup(p);
			}
		}
	}
	splx(s);
}