1 /* 2 * Copyright (c) 1997 John S. Dyson. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. John S. Dyson's name may not be used to endorse or promote products 10 * derived from this software without specific prior written permission. 11 * 12 * DISCLAIMER: This code isn't warranted to do anything useful. Anything 13 * bad that happens because of using this software isn't the responsibility 14 * of the author. This software is distributed AS-IS. 15 * 16 * $FreeBSD$ 17 */ 18 19 /* 20 * This file contains support for the POSIX 1003.1B AIO/LIO facility. 21 */ 22 23 #include <sys/param.h> 24 #include <sys/systm.h> 25 #include <sys/bio.h> 26 #include <sys/buf.h> 27 #include <sys/sysproto.h> 28 #include <sys/filedesc.h> 29 #include <sys/kernel.h> 30 #include <sys/fcntl.h> 31 #include <sys/file.h> 32 #include <sys/lock.h> 33 #include <sys/mutex.h> 34 #include <sys/unistd.h> 35 #include <sys/proc.h> 36 #include <sys/resourcevar.h> 37 #include <sys/signalvar.h> 38 #include <sys/protosw.h> 39 #include <sys/socketvar.h> 40 #include <sys/sysctl.h> 41 #include <sys/vnode.h> 42 #include <sys/conf.h> 43 #include <sys/event.h> 44 45 #include <vm/vm.h> 46 #include <vm/vm_extern.h> 47 #include <vm/pmap.h> 48 #include <vm/vm_map.h> 49 #include <vm/vm_zone.h> 50 #include <sys/aio.h> 51 52 #include <machine/limits.h> 53 54 #include "opt_vfs_aio.h" 55 56 #ifdef VFS_AIO 57 58 static long jobrefid; 59 60 #define JOBST_NULL 0x0 61 #define JOBST_JOBQPROC 0x1 62 #define JOBST_JOBQGLOBAL 0x2 63 #define JOBST_JOBRUNNING 0x3 64 #define JOBST_JOBFINISHED 0x4 65 #define JOBST_JOBQBUF 0x5 66 #define JOBST_JOBBFINISHED 0x6 67 68 #ifndef MAX_AIO_PER_PROC 69 #define MAX_AIO_PER_PROC 32 70 #endif 71 72 #ifndef MAX_AIO_QUEUE_PER_PROC 73 #define MAX_AIO_QUEUE_PER_PROC 256 /* Bigger than AIO_LISTIO_MAX */ 74 #endif 75 76 #ifndef MAX_AIO_PROCS 77 #define MAX_AIO_PROCS 32 78 #endif 79 80 #ifndef MAX_AIO_QUEUE 81 #define MAX_AIO_QUEUE 1024 /* Bigger than AIO_LISTIO_MAX */ 82 #endif 83 84 #ifndef TARGET_AIO_PROCS 85 #define TARGET_AIO_PROCS 4 86 #endif 87 88 #ifndef MAX_BUF_AIO 89 #define MAX_BUF_AIO 16 90 #endif 91 92 #ifndef AIOD_TIMEOUT_DEFAULT 93 #define AIOD_TIMEOUT_DEFAULT (10 * hz) 94 #endif 95 96 #ifndef AIOD_LIFETIME_DEFAULT 97 #define AIOD_LIFETIME_DEFAULT (30 * hz) 98 #endif 99 100 static int max_aio_procs = MAX_AIO_PROCS; 101 static int num_aio_procs = 0; 102 static int target_aio_procs = TARGET_AIO_PROCS; 103 static int max_queue_count = MAX_AIO_QUEUE; 104 static int num_queue_count = 0; 105 static int num_buf_aio = 0; 106 static int num_aio_resv_start = 0; 107 static int aiod_timeout; 108 static int aiod_lifetime; 109 110 static int max_aio_per_proc = MAX_AIO_PER_PROC; 111 static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC; 112 static int max_buf_aio = MAX_BUF_AIO; 113 114 SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "AIO mgmt"); 115 116 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, 117 CTLFLAG_RW, &max_aio_per_proc, 0, ""); 118 119 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, 120 CTLFLAG_RW, &max_aio_queue_per_proc, 0, ""); 121 122 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs, 123 CTLFLAG_RW, &max_aio_procs, 0, ""); 124 125 SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs, 126 CTLFLAG_RD, &num_aio_procs, 0, ""); 127 128 
SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count,
	CTLFLAG_RD, &num_queue_count, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue,
	CTLFLAG_RW, &max_queue_count, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs,
	CTLFLAG_RW, &target_aio_procs, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio,
	CTLFLAG_RW, &max_buf_aio, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio,
	CTLFLAG_RD, &num_buf_aio, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime,
	CTLFLAG_RW, &aiod_lifetime, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout,
	CTLFLAG_RW, &aiod_timeout, 0, "");

/*
 * AIO process info
 */
#define AIOP_FREE	0x1		/* proc on free queue */
#define AIOP_SCHED	0x2		/* proc explicitly scheduled */

struct aioproclist {
	int aioprocflags;			/* AIO proc flags */
	TAILQ_ENTRY(aioproclist) list;		/* List of processes */
	struct proc *aioproc;			/* The AIO thread */
	TAILQ_HEAD(,aiocblist) jobtorun;	/* suggested job to run */
};

/*
 * Data structure for lio signal management
 */
struct aio_liojob {
	int	lioj_flags;
	int	lioj_buffer_count;
	int	lioj_buffer_finished_count;
	int	lioj_queue_count;
	int	lioj_queue_finished_count;
	struct	sigevent lioj_signal;		/* signal on all I/O done */
	TAILQ_ENTRY(aio_liojob) lioj_list;
	struct	kaioinfo *lioj_ki;
};
#define	LIOJ_SIGNAL		0x1	/* signal on all done (lio) */
#define	LIOJ_SIGNAL_POSTED	0x2	/* signal has been posted */

/*
 * Per-process AIO data structure
 */
struct kaioinfo {
	int	kaio_flags;			/* per process kaio flags */
	int	kaio_maxactive_count;		/* maximum number of AIOs */
	int	kaio_active_count;		/* number of currently used AIOs */
	int	kaio_qallowed_count;		/* maximum size of AIO queue */
	int	kaio_queue_count;		/* size of AIO queue */
	int	kaio_ballowed_count;		/* maximum number of buffers */
	int	kaio_queue_finished_count;	/* number of daemon jobs finished */
	int	kaio_buffer_count;		/* number of physio buffers */
	int	kaio_buffer_finished_count;	/* count of I/O done */
	struct	proc *kaio_p;			/* process that uses this kaio block */
	TAILQ_HEAD(,aio_liojob) kaio_liojoblist; /* list of lio jobs */
	TAILQ_HEAD(,aiocblist) kaio_jobqueue;	/* job queue for process */
	TAILQ_HEAD(,aiocblist) kaio_jobdone;	/* done queue for process */
	TAILQ_HEAD(,aiocblist) kaio_bufqueue;	/* buffer job queue for process */
	TAILQ_HEAD(,aiocblist) kaio_bufdone;	/* buffer done queue for process */
	TAILQ_HEAD(,aiocblist) kaio_sockqueue;	/* queue for aios waiting on sockets */
};

#define KAIO_RUNDOWN	0x1	/* process is being run down */
#define KAIO_WAKEUP	0x2	/* wakeup process when there is a significant event */

static TAILQ_HEAD(,aioproclist) aio_freeproc, aio_activeproc;
static TAILQ_HEAD(,aiocblist) aio_jobs;		/* Async job list */
static TAILQ_HEAD(,aiocblist) aio_bufjobs;	/* Phys I/O job list */
static TAILQ_HEAD(,aiocblist) aio_freejobs;	/* Pool of free jobs */

static void	aio_init_aioinfo(struct proc *p);
static void	aio_onceonly(void *);
static int	aio_free_entry(struct aiocblist *aiocbe);
static void	aio_process(struct aiocblist *aiocbe);
static int	aio_newproc(void);
static int	aio_aqueue(struct proc *p, struct aiocb *job, int type);
static void	aio_physwakeup(struct buf *bp);
static int	aio_fphysio(struct proc *p, struct aiocblist *aiocbe, int type);
static int aio_qphysio(struct proc *p, struct aiocblist *iocb); 217 static void aio_daemon(void *uproc); 218 219 SYSINIT(aio, SI_SUB_VFS, SI_ORDER_ANY, aio_onceonly, NULL); 220 221 static vm_zone_t kaio_zone = 0, aiop_zone = 0, aiocb_zone = 0, aiol_zone = 0; 222 static vm_zone_t aiolio_zone = 0; 223 224 /* 225 * Startup initialization 226 */ 227 void 228 aio_onceonly(void *na) 229 { 230 TAILQ_INIT(&aio_freeproc); 231 TAILQ_INIT(&aio_activeproc); 232 TAILQ_INIT(&aio_jobs); 233 TAILQ_INIT(&aio_bufjobs); 234 TAILQ_INIT(&aio_freejobs); 235 kaio_zone = zinit("AIO", sizeof (struct kaioinfo), 0, 0, 1); 236 aiop_zone = zinit("AIOP", sizeof (struct aioproclist), 0, 0, 1); 237 aiocb_zone = zinit("AIOCB", sizeof (struct aiocblist), 0, 0, 1); 238 aiol_zone = zinit("AIOL", AIO_LISTIO_MAX * sizeof (int), 0, 0, 1); 239 aiolio_zone = zinit("AIOLIO", AIO_LISTIO_MAX * sizeof (struct 240 aio_liojob), 0, 0, 1); 241 aiod_timeout = AIOD_TIMEOUT_DEFAULT; 242 aiod_lifetime = AIOD_LIFETIME_DEFAULT; 243 jobrefid = 1; 244 } 245 246 /* 247 * Init the per-process aioinfo structure. The aioinfo limits are set 248 * per-process for user limit (resource) management. 249 */ 250 void 251 aio_init_aioinfo(struct proc *p) 252 { 253 struct kaioinfo *ki; 254 if (p->p_aioinfo == NULL) { 255 ki = zalloc(kaio_zone); 256 p->p_aioinfo = ki; 257 ki->kaio_flags = 0; 258 ki->kaio_maxactive_count = max_aio_per_proc; 259 ki->kaio_active_count = 0; 260 ki->kaio_qallowed_count = max_aio_queue_per_proc; 261 ki->kaio_queue_count = 0; 262 ki->kaio_ballowed_count = max_buf_aio; 263 ki->kaio_buffer_count = 0; 264 ki->kaio_buffer_finished_count = 0; 265 ki->kaio_p = p; 266 TAILQ_INIT(&ki->kaio_jobdone); 267 TAILQ_INIT(&ki->kaio_jobqueue); 268 TAILQ_INIT(&ki->kaio_bufdone); 269 TAILQ_INIT(&ki->kaio_bufqueue); 270 TAILQ_INIT(&ki->kaio_liojoblist); 271 TAILQ_INIT(&ki->kaio_sockqueue); 272 } 273 274 while (num_aio_procs < target_aio_procs) 275 aio_newproc(); 276 } 277 278 /* 279 * Free a job entry. Wait for completion if it is currently active, but don't 280 * delay forever. If we delay, we return a flag that says that we have to 281 * restart the queue scan. 
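 *
 * A minimal sketch of the rescan pattern a caller is expected to use
 * (this mirrors what aio_proc_rundown() below does; "queue" and the
 * cursor variables are illustrative only):
 *
 *	restart:
 *	for (cb = TAILQ_FIRST(&queue); cb; cb = cbn) {
 *		cbn = TAILQ_NEXT(cb, plist);
 *		if (aio_free_entry(cb))
 *			goto restart;
 *	}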
282 */ 283 int 284 aio_free_entry(struct aiocblist *aiocbe) 285 { 286 struct kaioinfo *ki; 287 struct aioproclist *aiop; 288 struct aio_liojob *lj; 289 struct proc *p; 290 int error; 291 int s; 292 293 if (aiocbe->jobstate == JOBST_NULL) 294 panic("aio_free_entry: freeing already free job"); 295 296 p = aiocbe->userproc; 297 ki = p->p_aioinfo; 298 lj = aiocbe->lio; 299 if (ki == NULL) 300 panic("aio_free_entry: missing p->p_aioinfo"); 301 302 if (aiocbe->jobstate == JOBST_JOBRUNNING) { 303 if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) 304 return 0; 305 aiocbe->jobflags |= AIOCBLIST_RUNDOWN; 306 tsleep(aiocbe, PRIBIO|PCATCH, "jobwai", 0); 307 } 308 aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE; 309 310 if (aiocbe->bp == NULL) { 311 if (ki->kaio_queue_count <= 0) 312 panic("aio_free_entry: process queue size <= 0"); 313 if (num_queue_count <= 0) 314 panic("aio_free_entry: system wide queue size <= 0"); 315 316 if (lj) { 317 lj->lioj_queue_count--; 318 if (aiocbe->jobflags & AIOCBLIST_DONE) 319 lj->lioj_queue_finished_count--; 320 } 321 ki->kaio_queue_count--; 322 if (aiocbe->jobflags & AIOCBLIST_DONE) 323 ki->kaio_queue_finished_count--; 324 num_queue_count--; 325 } else { 326 if (lj) { 327 lj->lioj_buffer_count--; 328 if (aiocbe->jobflags & AIOCBLIST_DONE) 329 lj->lioj_buffer_finished_count--; 330 } 331 if (aiocbe->jobflags & AIOCBLIST_DONE) 332 ki->kaio_buffer_finished_count--; 333 ki->kaio_buffer_count--; 334 num_buf_aio--; 335 } 336 337 /* aiocbe is going away, we need to destroy any knotes */ 338 knote_remove(p, &aiocbe->klist); 339 340 if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags & KAIO_RUNDOWN) 341 && ((ki->kaio_buffer_count == 0) && (ki->kaio_queue_count == 0)))) { 342 ki->kaio_flags &= ~KAIO_WAKEUP; 343 wakeup(p); 344 } 345 346 if (aiocbe->jobstate == JOBST_JOBQBUF) { 347 if ((error = aio_fphysio(p, aiocbe, 1)) != 0) 348 return error; 349 if (aiocbe->jobstate != JOBST_JOBBFINISHED) 350 panic("aio_free_entry: invalid physio finish-up state"); 351 s = splbio(); 352 TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist); 353 splx(s); 354 } else if (aiocbe->jobstate == JOBST_JOBQPROC) { 355 aiop = aiocbe->jobaioproc; 356 TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list); 357 } else if (aiocbe->jobstate == JOBST_JOBQGLOBAL) 358 TAILQ_REMOVE(&aio_jobs, aiocbe, list); 359 else if (aiocbe->jobstate == JOBST_JOBFINISHED) 360 TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist); 361 else if (aiocbe->jobstate == JOBST_JOBBFINISHED) { 362 s = splbio(); 363 TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist); 364 splx(s); 365 if (aiocbe->bp) { 366 vunmapbuf(aiocbe->bp); 367 relpbuf(aiocbe->bp, NULL); 368 aiocbe->bp = NULL; 369 } 370 } 371 if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) { 372 TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); 373 zfree(aiolio_zone, lj); 374 } 375 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); 376 aiocbe->jobstate = JOBST_NULL; 377 return 0; 378 } 379 #endif /* VFS_AIO */ 380 381 /* 382 * Rundown the jobs for a given process. 
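 *
 * This runs when the owning process is torn down (e.g. at exit): it
 * waits briefly for jobs that are still active, pulls socket-queued
 * jobs back onto the normal queues, and then frees every remaining
 * job and lio record belonging to the process.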
383 */ 384 void 385 aio_proc_rundown(struct proc *p) 386 { 387 #ifndef VFS_AIO 388 return; 389 #else 390 int s; 391 struct kaioinfo *ki; 392 struct aio_liojob *lj, *ljn; 393 struct aiocblist *aiocbe, *aiocbn; 394 struct file *fp; 395 struct filedesc *fdp; 396 struct socket *so; 397 398 ki = p->p_aioinfo; 399 if (ki == NULL) 400 return; 401 402 ki->kaio_flags |= LIOJ_SIGNAL_POSTED; 403 while ((ki->kaio_active_count > 0) || (ki->kaio_buffer_count > 404 ki->kaio_buffer_finished_count)) { 405 ki->kaio_flags |= KAIO_RUNDOWN; 406 if (tsleep(p, PRIBIO, "kaiowt", aiod_timeout)) 407 break; 408 } 409 410 /* 411 * Move any aio ops that are waiting on socket I/O to the normal job 412 * queues so they are cleaned up with any others. 413 */ 414 fdp = p->p_fd; 415 416 s = splnet(); 417 for (aiocbe = TAILQ_FIRST(&ki->kaio_sockqueue); aiocbe; aiocbe = 418 aiocbn) { 419 aiocbn = TAILQ_NEXT(aiocbe, plist); 420 fp = fdp->fd_ofiles[aiocbe->uaiocb.aio_fildes]; 421 422 /* 423 * Under some circumstances, the aio_fildes and the file 424 * structure don't match. This would leave aiocbe's in the 425 * TAILQ associated with the socket and cause a panic later. 426 * 427 * Detect and fix. 428 */ 429 if ((fp == NULL) || (fp != aiocbe->fd_file)) 430 fp = aiocbe->fd_file; 431 if (fp) { 432 so = (struct socket *)fp->f_data; 433 TAILQ_REMOVE(&so->so_aiojobq, aiocbe, list); 434 if (TAILQ_EMPTY(&so->so_aiojobq)) { 435 so->so_snd.sb_flags &= ~SB_AIO; 436 so->so_rcv.sb_flags &= ~SB_AIO; 437 } 438 } 439 TAILQ_REMOVE(&ki->kaio_sockqueue, aiocbe, plist); 440 TAILQ_INSERT_HEAD(&aio_jobs, aiocbe, list); 441 TAILQ_INSERT_HEAD(&ki->kaio_jobqueue, aiocbe, plist); 442 } 443 splx(s); 444 445 restart1: 446 for (aiocbe = TAILQ_FIRST(&ki->kaio_jobdone); aiocbe; aiocbe = aiocbn) { 447 aiocbn = TAILQ_NEXT(aiocbe, plist); 448 if (aio_free_entry(aiocbe)) 449 goto restart1; 450 } 451 452 restart2: 453 for (aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue); aiocbe; aiocbe = 454 aiocbn) { 455 aiocbn = TAILQ_NEXT(aiocbe, plist); 456 if (aio_free_entry(aiocbe)) 457 goto restart2; 458 } 459 460 /* 461 * Note the use of lots of splbio here, trying to avoid splbio for long chains 462 * of I/O. Probably unnecessary. 463 */ 464 restart3: 465 s = splbio(); 466 while (TAILQ_FIRST(&ki->kaio_bufqueue)) { 467 ki->kaio_flags |= KAIO_WAKEUP; 468 tsleep(p, PRIBIO, "aioprn", 0); 469 splx(s); 470 goto restart3; 471 } 472 splx(s); 473 474 restart4: 475 s = splbio(); 476 for (aiocbe = TAILQ_FIRST(&ki->kaio_bufdone); aiocbe; aiocbe = aiocbn) { 477 aiocbn = TAILQ_NEXT(aiocbe, plist); 478 if (aio_free_entry(aiocbe)) { 479 splx(s); 480 goto restart4; 481 } 482 } 483 splx(s); 484 485 for (lj = TAILQ_FIRST(&ki->kaio_liojoblist); lj; lj = ljn) { 486 ljn = TAILQ_NEXT(lj, lioj_list); 487 if ((lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 488 0)) { 489 TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); 490 zfree(aiolio_zone, lj); 491 } else { 492 #ifdef DIAGNOSTIC 493 printf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, " 494 "QF:%d\n", lj->lioj_buffer_count, 495 lj->lioj_buffer_finished_count, 496 lj->lioj_queue_count, 497 lj->lioj_queue_finished_count); 498 #endif 499 } 500 } 501 502 zfree(kaio_zone, ki); 503 p->p_aioinfo = NULL; 504 #endif /* VFS_AIO */ 505 } 506 507 #ifdef VFS_AIO 508 /* 509 * Select a job to run (called by an AIO daemon). 
510 */ 511 static struct aiocblist * 512 aio_selectjob(struct aioproclist *aiop) 513 { 514 int s; 515 struct aiocblist *aiocbe; 516 struct kaioinfo *ki; 517 struct proc *userp; 518 519 aiocbe = TAILQ_FIRST(&aiop->jobtorun); 520 if (aiocbe) { 521 TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list); 522 return aiocbe; 523 } 524 525 s = splnet(); 526 for (aiocbe = TAILQ_FIRST(&aio_jobs); aiocbe; aiocbe = 527 TAILQ_NEXT(aiocbe, list)) { 528 userp = aiocbe->userproc; 529 ki = userp->p_aioinfo; 530 531 if (ki->kaio_active_count < ki->kaio_maxactive_count) { 532 TAILQ_REMOVE(&aio_jobs, aiocbe, list); 533 splx(s); 534 return aiocbe; 535 } 536 } 537 splx(s); 538 539 return NULL; 540 } 541 542 /* 543 * The AIO processing activity. This is the code that does the I/O request for 544 * the non-physio version of the operations. The normal vn operations are used, 545 * and this code should work in all instances for every type of file, including 546 * pipes, sockets, fifos, and regular files. 547 */ 548 void 549 aio_process(struct aiocblist *aiocbe) 550 { 551 struct filedesc *fdp; 552 struct proc *userp, *mycp; 553 struct aiocb *cb; 554 struct file *fp; 555 struct uio auio; 556 struct iovec aiov; 557 unsigned int fd; 558 int cnt; 559 int error; 560 off_t offset; 561 int oublock_st, oublock_end; 562 int inblock_st, inblock_end; 563 564 userp = aiocbe->userproc; 565 cb = &aiocbe->uaiocb; 566 567 mycp = curproc; 568 569 fdp = mycp->p_fd; 570 fd = cb->aio_fildes; 571 fp = fdp->fd_ofiles[fd]; 572 573 if ((fp == NULL) || (fp != aiocbe->fd_file)) { 574 cb->_aiocb_private.error = EBADF; 575 cb->_aiocb_private.status = -1; 576 return; 577 } 578 579 aiov.iov_base = (void *)cb->aio_buf; 580 aiov.iov_len = cb->aio_nbytes; 581 582 auio.uio_iov = &aiov; 583 auio.uio_iovcnt = 1; 584 auio.uio_offset = offset = cb->aio_offset; 585 auio.uio_resid = cb->aio_nbytes; 586 cnt = cb->aio_nbytes; 587 auio.uio_segflg = UIO_USERSPACE; 588 auio.uio_procp = mycp; 589 590 inblock_st = mycp->p_stats->p_ru.ru_inblock; 591 oublock_st = mycp->p_stats->p_ru.ru_oublock; 592 /* 593 * Temporarily bump the ref count while reading to avoid the 594 * descriptor being ripped out from under us. 595 */ 596 fhold(fp); 597 if (cb->aio_lio_opcode == LIO_READ) { 598 auio.uio_rw = UIO_READ; 599 error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, mycp); 600 } else { 601 auio.uio_rw = UIO_WRITE; 602 error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, mycp); 603 } 604 fdrop(fp, mycp); 605 inblock_end = mycp->p_stats->p_ru.ru_inblock; 606 oublock_end = mycp->p_stats->p_ru.ru_oublock; 607 608 aiocbe->inputcharge = inblock_end - inblock_st; 609 aiocbe->outputcharge = oublock_end - oublock_st; 610 611 if ((error) && (auio.uio_resid != cnt)) { 612 if (error == ERESTART || error == EINTR || error == EWOULDBLOCK) 613 error = 0; 614 if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) 615 psignal(userp, SIGPIPE); 616 } 617 618 cnt -= auio.uio_resid; 619 cb->_aiocb_private.error = error; 620 cb->_aiocb_private.status = cnt; 621 622 return; 623 } 624 625 /* 626 * The AIO daemon, most of the actual work is done in aio_process, 627 * but the setup (and address space mgmt) is done in this routine. 
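 *
 * In outline, each daemon loops forever: it pulls a job with
 * aio_selectjob(), temporarily adopts the submitting process's vmspace
 * and file descriptor table, runs aio_process(), posts the completion
 * (done queue, knote, optional signal), detaches from the user process
 * again, and puts itself back on the free list.  An idle daemon beyond
 * target_aio_procs exits after aiod_lifetime ticks.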
628 */ 629 static void 630 aio_daemon(void *uproc) 631 { 632 int s; 633 struct aio_liojob *lj; 634 struct aiocb *cb; 635 struct aiocblist *aiocbe; 636 struct aioproclist *aiop; 637 struct kaioinfo *ki; 638 struct proc *curcp, *mycp, *userp; 639 struct vmspace *myvm, *tmpvm; 640 641 mtx_enter(&Giant, MTX_DEF); 642 /* 643 * Local copies of curproc (cp) and vmspace (myvm) 644 */ 645 mycp = curproc; 646 myvm = mycp->p_vmspace; 647 648 if (mycp->p_textvp) { 649 vrele(mycp->p_textvp); 650 mycp->p_textvp = NULL; 651 } 652 653 /* 654 * Allocate and ready the aio control info. There is one aiop structure 655 * per daemon. 656 */ 657 aiop = zalloc(aiop_zone); 658 aiop->aioproc = mycp; 659 aiop->aioprocflags |= AIOP_FREE; 660 TAILQ_INIT(&aiop->jobtorun); 661 662 s = splnet(); 663 664 /* 665 * Place thread (lightweight process) onto the AIO free thread list. 666 */ 667 if (TAILQ_EMPTY(&aio_freeproc)) 668 wakeup(&aio_freeproc); 669 TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list); 670 671 splx(s); 672 673 /* Make up a name for the daemon. */ 674 strcpy(mycp->p_comm, "aiod"); 675 676 /* 677 * Get rid of our current filedescriptors. AIOD's don't need any 678 * filedescriptors, except as temporarily inherited from the client. 679 * Credentials are also cloned, and made equivalent to "root". 680 */ 681 fdfree(mycp); 682 mycp->p_fd = NULL; 683 mycp->p_ucred = crcopy(mycp->p_ucred); 684 mycp->p_ucred->cr_uid = 0; 685 uifree(mycp->p_ucred->cr_uidinfo); 686 mycp->p_ucred->cr_uidinfo = uifind(0); 687 mycp->p_ucred->cr_ngroups = 1; 688 mycp->p_ucred->cr_groups[0] = 1; 689 690 /* The daemon resides in its own pgrp. */ 691 enterpgrp(mycp, mycp->p_pid, 1); 692 693 /* Mark special process type. */ 694 mycp->p_flag |= P_SYSTEM | P_KTHREADP; 695 696 /* 697 * Wakeup parent process. (Parent sleeps to keep from blasting away 698 * creating to many daemons.) 699 */ 700 wakeup(mycp); 701 702 for (;;) { 703 /* 704 * curcp is the current daemon process context. 705 * userp is the current user process context. 706 */ 707 curcp = mycp; 708 709 /* 710 * Take daemon off of free queue 711 */ 712 if (aiop->aioprocflags & AIOP_FREE) { 713 s = splnet(); 714 TAILQ_REMOVE(&aio_freeproc, aiop, list); 715 TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list); 716 aiop->aioprocflags &= ~AIOP_FREE; 717 splx(s); 718 } 719 aiop->aioprocflags &= ~AIOP_SCHED; 720 721 /* 722 * Check for jobs. 723 */ 724 while ((aiocbe = aio_selectjob(aiop)) != NULL) { 725 cb = &aiocbe->uaiocb; 726 userp = aiocbe->userproc; 727 728 aiocbe->jobstate = JOBST_JOBRUNNING; 729 730 /* 731 * Connect to process address space for user program. 732 */ 733 if (userp != curcp) { 734 /* 735 * Save the current address space that we are 736 * connected to. 737 */ 738 tmpvm = mycp->p_vmspace; 739 740 /* 741 * Point to the new user address space, and 742 * refer to it. 743 */ 744 mycp->p_vmspace = userp->p_vmspace; 745 mycp->p_vmspace->vm_refcnt++; 746 747 /* Activate the new mapping. */ 748 pmap_activate(mycp); 749 750 /* 751 * If the old address space wasn't the daemons 752 * own address space, then we need to remove the 753 * daemon's reference from the other process 754 * that it was acting on behalf of. 755 */ 756 if (tmpvm != myvm) { 757 vmspace_free(tmpvm); 758 } 759 760 /* 761 * Disassociate from previous clients file 762 * descriptors, and associate to the new clients 763 * descriptors. Note that the daemon doesn't 764 * need to worry about its orginal descriptors, 765 * because they were originally freed. 
766 */ 767 if (mycp->p_fd) 768 fdfree(mycp); 769 mycp->p_fd = fdshare(userp); 770 curcp = userp; 771 } 772 773 ki = userp->p_aioinfo; 774 lj = aiocbe->lio; 775 776 /* Account for currently active jobs. */ 777 ki->kaio_active_count++; 778 779 /* Do the I/O function. */ 780 aiocbe->jobaioproc = aiop; 781 aio_process(aiocbe); 782 783 /* Decrement the active job count. */ 784 ki->kaio_active_count--; 785 786 /* 787 * Increment the completion count for wakeup/signal 788 * comparisons. 789 */ 790 aiocbe->jobflags |= AIOCBLIST_DONE; 791 ki->kaio_queue_finished_count++; 792 if (lj) 793 lj->lioj_queue_finished_count++; 794 if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags 795 & KAIO_RUNDOWN) && (ki->kaio_active_count == 0))) { 796 ki->kaio_flags &= ~KAIO_WAKEUP; 797 wakeup(userp); 798 } 799 800 s = splbio(); 801 if (lj && (lj->lioj_flags & 802 (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL) { 803 if ((lj->lioj_queue_finished_count == 804 lj->lioj_queue_count) && 805 (lj->lioj_buffer_finished_count == 806 lj->lioj_buffer_count)) { 807 psignal(userp, 808 lj->lioj_signal.sigev_signo); 809 lj->lioj_flags |= 810 LIOJ_SIGNAL_POSTED; 811 } 812 } 813 splx(s); 814 815 aiocbe->jobstate = JOBST_JOBFINISHED; 816 817 /* 818 * If the I/O request should be automatically rundown, 819 * do the needed cleanup. Otherwise, place the queue 820 * entry for the just finished I/O request into the done 821 * queue for the associated client. 822 */ 823 s = splnet(); 824 if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) { 825 aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE; 826 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); 827 } else { 828 TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist); 829 TAILQ_INSERT_TAIL(&ki->kaio_jobdone, aiocbe, 830 plist); 831 } 832 splx(s); 833 KNOTE(&aiocbe->klist, 0); 834 835 if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) { 836 wakeup(aiocbe); 837 aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN; 838 } 839 840 if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) { 841 psignal(userp, cb->aio_sigevent.sigev_signo); 842 } 843 } 844 845 /* 846 * Disconnect from user address space. 847 */ 848 if (curcp != mycp) { 849 /* Get the user address space to disconnect from. */ 850 tmpvm = mycp->p_vmspace; 851 852 /* Get original address space for daemon. */ 853 mycp->p_vmspace = myvm; 854 855 /* Activate the daemon's address space. */ 856 pmap_activate(mycp); 857 #ifdef DIAGNOSTIC 858 if (tmpvm == myvm) { 859 printf("AIOD: vmspace problem -- %d\n", 860 mycp->p_pid); 861 } 862 #endif 863 /* Remove our vmspace reference. */ 864 vmspace_free(tmpvm); 865 866 /* 867 * Disassociate from the user process's file 868 * descriptors. 869 */ 870 if (mycp->p_fd) 871 fdfree(mycp); 872 mycp->p_fd = NULL; 873 curcp = mycp; 874 } 875 876 /* 877 * If we are the first to be put onto the free queue, wakeup 878 * anyone waiting for a daemon. 879 */ 880 s = splnet(); 881 TAILQ_REMOVE(&aio_activeproc, aiop, list); 882 if (TAILQ_EMPTY(&aio_freeproc)) 883 wakeup(&aio_freeproc); 884 TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list); 885 aiop->aioprocflags |= AIOP_FREE; 886 splx(s); 887 888 /* 889 * If daemon is inactive for a long time, allow it to exit, 890 * thereby freeing resources. 
		 */
		if (((aiop->aioprocflags & AIOP_SCHED) == 0) && tsleep(mycp,
		    PRIBIO, "aiordy", aiod_lifetime)) {
			s = splnet();
			if ((TAILQ_FIRST(&aio_jobs) == NULL) &&
			    (TAILQ_FIRST(&aiop->jobtorun) == NULL)) {
				if ((aiop->aioprocflags & AIOP_FREE) &&
				    (num_aio_procs > target_aio_procs)) {
					TAILQ_REMOVE(&aio_freeproc, aiop, list);
					splx(s);
					zfree(aiop_zone, aiop);
					num_aio_procs--;
#ifdef DIAGNOSTIC
					if (mycp->p_vmspace->vm_refcnt <= 1) {
						printf("AIOD: bad vm refcnt for"
						    " exiting daemon: %d\n",
						    mycp->p_vmspace->vm_refcnt);
					}
#endif
					exit1(mycp, 0);
				}
			}
			splx(s);
		}
	}
}

/*
 * Create a new AIO daemon.  This is mostly a kernel-thread fork routine.  The
 * AIO daemon modifies its environment itself.
 */
static int
aio_newproc()
{
	int error;
	struct proc *p, *np;

	p = &proc0;
	error = fork1(p, RFPROC|RFMEM|RFNOWAIT, &np);
	if (error)
		return error;
	cpu_set_fork_handler(np, aio_daemon, curproc);

	/*
	 * Wait until daemon is started, but continue on just in case to
	 * handle error conditions.
	 */
	error = tsleep(np, PZERO, "aiosta", aiod_timeout);
	num_aio_procs++;

	return error;
}

/*
 * Try the high-performance physio method for eligible VCHR devices.  This
 * routine doesn't require the use of any additional threads, and has low
 * overhead.
 */
int
aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
{
	int error;
	struct aiocb *cb;
	struct file *fp;
	struct buf *bp;
	struct vnode *vp;
	struct kaioinfo *ki;
	struct filedesc *fdp;
	struct aio_liojob *lj;
	int fd;
	int s;
	int notify;

	cb = &aiocbe->uaiocb;
	fdp = p->p_fd;
	fd = cb->aio_fildes;
	fp = fdp->fd_ofiles[fd];

	if (fp->f_type != DTYPE_VNODE)
		return (-1);

	vp = (struct vnode *)fp->f_data;

	/*
	 * If it's not a disk, we don't want to return a positive error.
	 * It causes the aio code to not fall through to try the thread
	 * way when you're talking to a regular file.
	 */
	if (!vn_isdisk(vp, &error)) {
		if (error == ENOTBLK)
			return (-1);
		else
			return (error);
	}

	if (cb->aio_nbytes % vp->v_rdev->si_bsize_phys)
		return (-1);

	if (cb->aio_nbytes > MAXPHYS)
		return (-1);

	ki = p->p_aioinfo;
	if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
		return (-1);

	fhold(fp);

	ki->kaio_buffer_count++;

	lj = aiocbe->lio;
	if (lj)
		lj->lioj_buffer_count++;

	/* Create and build a buffer header for a transfer. */
	bp = (struct buf *)getpbuf(NULL);

	/*
	 * Get a copy of the kva from the physical buffer.
	 */
	bp->b_caller1 = p;
	bp->b_dev = vp->v_rdev;
	error = bp->b_error = 0;

	bp->b_bcount = cb->aio_nbytes;
	bp->b_bufsize = cb->aio_nbytes;
	bp->b_flags = B_PHYS;
	bp->b_iodone = aio_physwakeup;
	bp->b_saveaddr = bp->b_data;
	bp->b_data = (void *)cb->aio_buf;
	bp->b_blkno = btodb(cb->aio_offset);

	if (cb->aio_lio_opcode == LIO_WRITE) {
		bp->b_iocmd = BIO_WRITE;
		if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_READ)) {
			error = EFAULT;
			goto doerror;
		}
	} else {
		bp->b_iocmd = BIO_READ;
		if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_WRITE)) {
			error = EFAULT;
			goto doerror;
		}
	}

	/* Bring buffer into kernel space.
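	 * vmapbuf() below wires the user pages backing the request and maps
	 * them at the pbuf's kernel address, so the driver can transfer
	 * directly to or from user memory.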
*/ 1036 vmapbuf(bp); 1037 1038 s = splbio(); 1039 aiocbe->bp = bp; 1040 bp->b_spc = (void *)aiocbe; 1041 TAILQ_INSERT_TAIL(&aio_bufjobs, aiocbe, list); 1042 TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist); 1043 aiocbe->jobstate = JOBST_JOBQBUF; 1044 cb->_aiocb_private.status = cb->aio_nbytes; 1045 num_buf_aio++; 1046 bp->b_error = 0; 1047 1048 splx(s); 1049 1050 /* Perform transfer. */ 1051 DEV_STRATEGY(bp, 0); 1052 1053 notify = 0; 1054 s = splbio(); 1055 1056 /* 1057 * If we had an error invoking the request, or an error in processing 1058 * the request before we have returned, we process it as an error in 1059 * transfer. Note that such an I/O error is not indicated immediately, 1060 * but is returned using the aio_error mechanism. In this case, 1061 * aio_suspend will return immediately. 1062 */ 1063 if (bp->b_error || (bp->b_ioflags & BIO_ERROR)) { 1064 struct aiocb *job = aiocbe->uuaiocb; 1065 1066 aiocbe->uaiocb._aiocb_private.status = 0; 1067 suword(&job->_aiocb_private.status, 0); 1068 aiocbe->uaiocb._aiocb_private.error = bp->b_error; 1069 suword(&job->_aiocb_private.error, bp->b_error); 1070 1071 ki->kaio_buffer_finished_count++; 1072 1073 if (aiocbe->jobstate != JOBST_JOBBFINISHED) { 1074 aiocbe->jobstate = JOBST_JOBBFINISHED; 1075 aiocbe->jobflags |= AIOCBLIST_DONE; 1076 TAILQ_REMOVE(&aio_bufjobs, aiocbe, list); 1077 TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist); 1078 TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist); 1079 notify = 1; 1080 } 1081 } 1082 splx(s); 1083 if (notify) 1084 KNOTE(&aiocbe->klist, 0); 1085 fdrop(fp, p); 1086 return 0; 1087 1088 doerror: 1089 ki->kaio_buffer_count--; 1090 if (lj) 1091 lj->lioj_buffer_count--; 1092 aiocbe->bp = NULL; 1093 relpbuf(bp, NULL); 1094 fdrop(fp, p); 1095 return error; 1096 } 1097 1098 /* 1099 * This waits/tests physio completion. 1100 */ 1101 int 1102 aio_fphysio(struct proc *p, struct aiocblist *iocb, int flgwait) 1103 { 1104 int s; 1105 struct buf *bp; 1106 int error; 1107 1108 bp = iocb->bp; 1109 1110 s = splbio(); 1111 if (flgwait == 0) { 1112 if ((bp->b_flags & B_DONE) == 0) { 1113 splx(s); 1114 return EINPROGRESS; 1115 } 1116 } 1117 1118 while ((bp->b_flags & B_DONE) == 0) { 1119 if (tsleep((caddr_t)bp, PRIBIO, "physstr", aiod_timeout)) { 1120 if ((bp->b_flags & B_DONE) == 0) { 1121 splx(s); 1122 return EINPROGRESS; 1123 } else 1124 break; 1125 } 1126 } 1127 1128 /* Release mapping into kernel space. */ 1129 vunmapbuf(bp); 1130 iocb->bp = 0; 1131 1132 error = 0; 1133 1134 /* Check for an error. */ 1135 if (bp->b_ioflags & BIO_ERROR) 1136 error = bp->b_error; 1137 1138 relpbuf(bp, NULL); 1139 return (error); 1140 } 1141 #endif /* VFS_AIO */ 1142 1143 /* 1144 * Wake up aio requests that may be serviceable now. 
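 *
 * This is called from the socket-buffer wakeup path when a buffer that
 * was marked SB_AIO becomes readable or writable; matching jobs are
 * moved from the socket's queue back onto the global job list and idle
 * daemons are woken to service them.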
1145 */ 1146 void 1147 aio_swake(struct socket *so, struct sockbuf *sb) 1148 { 1149 #ifndef VFS_AIO 1150 return; 1151 #else 1152 struct aiocblist *cb,*cbn; 1153 struct proc *p; 1154 struct kaioinfo *ki = NULL; 1155 int opcode, wakecount = 0; 1156 struct aioproclist *aiop; 1157 1158 if (sb == &so->so_snd) { 1159 opcode = LIO_WRITE; 1160 so->so_snd.sb_flags &= ~SB_AIO; 1161 } else { 1162 opcode = LIO_READ; 1163 so->so_rcv.sb_flags &= ~SB_AIO; 1164 } 1165 1166 for (cb = TAILQ_FIRST(&so->so_aiojobq); cb; cb = cbn) { 1167 cbn = TAILQ_NEXT(cb, list); 1168 if (opcode == cb->uaiocb.aio_lio_opcode) { 1169 p = cb->userproc; 1170 ki = p->p_aioinfo; 1171 TAILQ_REMOVE(&so->so_aiojobq, cb, list); 1172 TAILQ_REMOVE(&ki->kaio_sockqueue, cb, plist); 1173 TAILQ_INSERT_TAIL(&aio_jobs, cb, list); 1174 TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, cb, plist); 1175 wakecount++; 1176 if (cb->jobstate != JOBST_JOBQGLOBAL) 1177 panic("invalid queue value"); 1178 } 1179 } 1180 1181 while (wakecount--) { 1182 if ((aiop = TAILQ_FIRST(&aio_freeproc)) != 0) { 1183 TAILQ_REMOVE(&aio_freeproc, aiop, list); 1184 TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list); 1185 aiop->aioprocflags &= ~AIOP_FREE; 1186 wakeup(aiop->aioproc); 1187 } 1188 } 1189 #endif /* VFS_AIO */ 1190 } 1191 1192 #ifdef VFS_AIO 1193 /* 1194 * Queue a new AIO request. Choosing either the threaded or direct physio VCHR 1195 * technique is done in this code. 1196 */ 1197 static int 1198 _aio_aqueue(struct proc *p, struct aiocb *job, struct aio_liojob *lj, int type) 1199 { 1200 struct filedesc *fdp; 1201 struct file *fp; 1202 unsigned int fd; 1203 struct socket *so; 1204 int s; 1205 int error; 1206 int opcode; 1207 struct aiocblist *aiocbe; 1208 struct aioproclist *aiop; 1209 struct kaioinfo *ki; 1210 struct kevent kev; 1211 struct kqueue *kq; 1212 struct file *kq_fp; 1213 1214 if ((aiocbe = TAILQ_FIRST(&aio_freejobs)) != NULL) 1215 TAILQ_REMOVE(&aio_freejobs, aiocbe, list); 1216 else 1217 aiocbe = zalloc (aiocb_zone); 1218 1219 aiocbe->inputcharge = 0; 1220 aiocbe->outputcharge = 0; 1221 SLIST_INIT(&aiocbe->klist); 1222 1223 suword(&job->_aiocb_private.status, -1); 1224 suword(&job->_aiocb_private.error, 0); 1225 suword(&job->_aiocb_private.kernelinfo, -1); 1226 1227 error = copyin((caddr_t)job, (caddr_t) &aiocbe->uaiocb, sizeof 1228 aiocbe->uaiocb); 1229 if (error) { 1230 suword(&job->_aiocb_private.error, error); 1231 1232 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); 1233 return error; 1234 } 1235 1236 /* Save userspace address of the job info. */ 1237 aiocbe->uuaiocb = job; 1238 1239 /* Get the opcode. */ 1240 if (type != LIO_NOP) 1241 aiocbe->uaiocb.aio_lio_opcode = type; 1242 opcode = aiocbe->uaiocb.aio_lio_opcode; 1243 1244 /* Get the fd info for process. */ 1245 fdp = p->p_fd; 1246 1247 /* 1248 * Range check file descriptor. 
1249 */ 1250 fd = aiocbe->uaiocb.aio_fildes; 1251 if (fd >= fdp->fd_nfiles) { 1252 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); 1253 if (type == 0) 1254 suword(&job->_aiocb_private.error, EBADF); 1255 return EBADF; 1256 } 1257 1258 fp = aiocbe->fd_file = fdp->fd_ofiles[fd]; 1259 if ((fp == NULL) || ((opcode == LIO_WRITE) && ((fp->f_flag & FWRITE) == 1260 0))) { 1261 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); 1262 if (type == 0) 1263 suword(&job->_aiocb_private.error, EBADF); 1264 return EBADF; 1265 } 1266 1267 if (aiocbe->uaiocb.aio_offset == -1LL) { 1268 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); 1269 if (type == 0) 1270 suword(&job->_aiocb_private.error, EINVAL); 1271 return EINVAL; 1272 } 1273 1274 error = suword(&job->_aiocb_private.kernelinfo, jobrefid); 1275 if (error) { 1276 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); 1277 if (type == 0) 1278 suword(&job->_aiocb_private.error, EINVAL); 1279 return error; 1280 } 1281 1282 aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jobrefid; 1283 if (jobrefid == LONG_MAX) 1284 jobrefid = 1; 1285 else 1286 jobrefid++; 1287 1288 if (opcode == LIO_NOP) { 1289 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); 1290 if (type == 0) { 1291 suword(&job->_aiocb_private.error, 0); 1292 suword(&job->_aiocb_private.status, 0); 1293 suword(&job->_aiocb_private.kernelinfo, 0); 1294 } 1295 return 0; 1296 } 1297 1298 if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) { 1299 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); 1300 if (type == 0) { 1301 suword(&job->_aiocb_private.status, 0); 1302 suword(&job->_aiocb_private.error, EINVAL); 1303 } 1304 return EINVAL; 1305 } 1306 1307 fhold(fp); 1308 1309 if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_KEVENT) { 1310 kev.ident = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue; 1311 kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sigval_ptr; 1312 } 1313 else { 1314 /* 1315 * This method for requesting kevent-based notification won't 1316 * work on the alpha, since we're passing in a pointer 1317 * via aio_lio_opcode, which is an int. Use the SIGEV_KEVENT- 1318 * based method instead. 1319 */ 1320 struct kevent *kevp; 1321 1322 kevp = (struct kevent *)job->aio_lio_opcode; 1323 if (kevp == NULL) 1324 goto no_kqueue; 1325 1326 error = copyin((caddr_t)kevp, (caddr_t)&kev, sizeof(kev)); 1327 if (error) 1328 goto aqueue_fail; 1329 } 1330 if ((u_int)kev.ident >= fdp->fd_nfiles || 1331 (kq_fp = fdp->fd_ofiles[kev.ident]) == NULL || 1332 (kq_fp->f_type != DTYPE_KQUEUE)) { 1333 error = EBADF; 1334 goto aqueue_fail; 1335 } 1336 kq = (struct kqueue *)kq_fp->f_data; 1337 kev.ident = (uintptr_t)aiocbe; 1338 kev.filter = EVFILT_AIO; 1339 kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1; 1340 error = kqueue_register(kq, &kev, p); 1341 aqueue_fail: 1342 if (error) { 1343 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); 1344 if (type == 0) 1345 suword(&job->_aiocb_private.error, error); 1346 goto done; 1347 } 1348 no_kqueue: 1349 1350 suword(&job->_aiocb_private.error, EINPROGRESS); 1351 aiocbe->uaiocb._aiocb_private.error = EINPROGRESS; 1352 aiocbe->userproc = p; 1353 aiocbe->jobflags = 0; 1354 aiocbe->lio = lj; 1355 ki = p->p_aioinfo; 1356 1357 if (fp->f_type == DTYPE_SOCKET) { 1358 /* 1359 * Alternate queueing for socket ops: Reach down into the 1360 * descriptor to get the socket data. Then check to see if the 1361 * socket is ready to be read or written (based on the requested 1362 * operation). 
1363 * 1364 * If it is not ready for io, then queue the aiocbe on the 1365 * socket, and set the flags so we get a call when sbnotify() 1366 * happens. 1367 */ 1368 so = (struct socket *)fp->f_data; 1369 s = splnet(); 1370 if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode == 1371 LIO_WRITE) && (!sowriteable(so)))) { 1372 TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list); 1373 TAILQ_INSERT_TAIL(&ki->kaio_sockqueue, aiocbe, plist); 1374 if (opcode == LIO_READ) 1375 so->so_rcv.sb_flags |= SB_AIO; 1376 else 1377 so->so_snd.sb_flags |= SB_AIO; 1378 aiocbe->jobstate = JOBST_JOBQGLOBAL; /* XXX */ 1379 ki->kaio_queue_count++; 1380 num_queue_count++; 1381 splx(s); 1382 error = 0; 1383 goto done; 1384 } 1385 splx(s); 1386 } 1387 1388 if ((error = aio_qphysio(p, aiocbe)) == 0) 1389 goto done; 1390 if (error > 0) { 1391 suword(&job->_aiocb_private.status, 0); 1392 aiocbe->uaiocb._aiocb_private.error = error; 1393 suword(&job->_aiocb_private.error, error); 1394 goto done; 1395 } 1396 1397 /* No buffer for daemon I/O. */ 1398 aiocbe->bp = NULL; 1399 1400 ki->kaio_queue_count++; 1401 if (lj) 1402 lj->lioj_queue_count++; 1403 s = splnet(); 1404 TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist); 1405 TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list); 1406 splx(s); 1407 aiocbe->jobstate = JOBST_JOBQGLOBAL; 1408 1409 num_queue_count++; 1410 error = 0; 1411 1412 /* 1413 * If we don't have a free AIO process, and we are below our quota, then 1414 * start one. Otherwise, depend on the subsequent I/O completions to 1415 * pick-up this job. If we don't sucessfully create the new process 1416 * (thread) due to resource issues, we return an error for now (EAGAIN), 1417 * which is likely not the correct thing to do. 1418 */ 1419 retryproc: 1420 s = splnet(); 1421 if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) { 1422 TAILQ_REMOVE(&aio_freeproc, aiop, list); 1423 TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list); 1424 aiop->aioprocflags &= ~AIOP_FREE; 1425 wakeup(aiop->aioproc); 1426 } else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) && 1427 ((ki->kaio_active_count + num_aio_resv_start) < 1428 ki->kaio_maxactive_count)) { 1429 num_aio_resv_start++; 1430 if ((error = aio_newproc()) == 0) { 1431 num_aio_resv_start--; 1432 p->p_retval[0] = 0; 1433 goto retryproc; 1434 } 1435 num_aio_resv_start--; 1436 } 1437 splx(s); 1438 done: 1439 fdrop(fp, p); 1440 return error; 1441 } 1442 1443 /* 1444 * This routine queues an AIO request, checking for quotas. 1445 */ 1446 static int 1447 aio_aqueue(struct proc *p, struct aiocb *job, int type) 1448 { 1449 struct kaioinfo *ki; 1450 1451 if (p->p_aioinfo == NULL) 1452 aio_init_aioinfo(p); 1453 1454 if (num_queue_count >= max_queue_count) 1455 return EAGAIN; 1456 1457 ki = p->p_aioinfo; 1458 if (ki->kaio_queue_count >= ki->kaio_qallowed_count) 1459 return EAGAIN; 1460 1461 return _aio_aqueue(p, job, NULL, type); 1462 } 1463 #endif /* VFS_AIO */ 1464 1465 /* 1466 * Support the aio_return system call, as a side-effect, kernel resources are 1467 * released. 
1468 */ 1469 int 1470 aio_return(struct proc *p, struct aio_return_args *uap) 1471 { 1472 #ifndef VFS_AIO 1473 return ENOSYS; 1474 #else 1475 int s; 1476 int jobref; 1477 struct aiocblist *cb, *ncb; 1478 struct aiocb *ujob; 1479 struct kaioinfo *ki; 1480 1481 ki = p->p_aioinfo; 1482 if (ki == NULL) 1483 return EINVAL; 1484 1485 ujob = uap->aiocbp; 1486 1487 jobref = fuword(&ujob->_aiocb_private.kernelinfo); 1488 if (jobref == -1 || jobref == 0) 1489 return EINVAL; 1490 1491 s = splnet(); 1492 for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb; cb = TAILQ_NEXT(cb, 1493 plist)) { 1494 if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == 1495 jobref) { 1496 splx(s); 1497 if (ujob == cb->uuaiocb) { 1498 p->p_retval[0] = 1499 cb->uaiocb._aiocb_private.status; 1500 } else 1501 p->p_retval[0] = EFAULT; 1502 if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) { 1503 curproc->p_stats->p_ru.ru_oublock += 1504 cb->outputcharge; 1505 cb->outputcharge = 0; 1506 } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) { 1507 curproc->p_stats->p_ru.ru_inblock += 1508 cb->inputcharge; 1509 cb->inputcharge = 0; 1510 } 1511 aio_free_entry(cb); 1512 return 0; 1513 } 1514 } 1515 splx(s); 1516 1517 s = splbio(); 1518 for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = ncb) { 1519 ncb = TAILQ_NEXT(cb, plist); 1520 if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) 1521 == jobref) { 1522 splx(s); 1523 if (ujob == cb->uuaiocb) { 1524 p->p_retval[0] = 1525 cb->uaiocb._aiocb_private.status; 1526 } else 1527 p->p_retval[0] = EFAULT; 1528 aio_free_entry(cb); 1529 return 0; 1530 } 1531 } 1532 splx(s); 1533 1534 return (EINVAL); 1535 #endif /* VFS_AIO */ 1536 } 1537 1538 /* 1539 * Allow a process to wakeup when any of the I/O requests are completed. 1540 */ 1541 int 1542 aio_suspend(struct proc *p, struct aio_suspend_args *uap) 1543 { 1544 #ifndef VFS_AIO 1545 return ENOSYS; 1546 #else 1547 struct timeval atv; 1548 struct timespec ts; 1549 struct aiocb *const *cbptr, *cbp; 1550 struct kaioinfo *ki; 1551 struct aiocblist *cb; 1552 int i; 1553 int njoblist; 1554 int error, s, timo; 1555 int *ijoblist; 1556 struct aiocb **ujoblist; 1557 1558 if (uap->nent >= AIO_LISTIO_MAX) 1559 return EINVAL; 1560 1561 timo = 0; 1562 if (uap->timeout) { 1563 /* Get timespec struct. 
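		 * The timeout is converted to ticks just below via
		 * TIMESPEC_TO_TIMEVAL() and tvtohz(); e.g. with hz=100 a
		 * request of { 1, 500000000 } (1.5 s) yields roughly 150
		 * ticks.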
*/ 1564 if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0) 1565 return error; 1566 1567 if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000) 1568 return (EINVAL); 1569 1570 TIMESPEC_TO_TIMEVAL(&atv, &ts); 1571 if (itimerfix(&atv)) 1572 return (EINVAL); 1573 timo = tvtohz(&atv); 1574 } 1575 1576 ki = p->p_aioinfo; 1577 if (ki == NULL) 1578 return EAGAIN; 1579 1580 njoblist = 0; 1581 ijoblist = zalloc(aiol_zone); 1582 ujoblist = zalloc(aiol_zone); 1583 cbptr = uap->aiocbp; 1584 1585 for (i = 0; i < uap->nent; i++) { 1586 cbp = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]); 1587 if (cbp == 0) 1588 continue; 1589 ujoblist[njoblist] = cbp; 1590 ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo); 1591 njoblist++; 1592 } 1593 1594 if (njoblist == 0) { 1595 zfree(aiol_zone, ijoblist); 1596 zfree(aiol_zone, ujoblist); 1597 return 0; 1598 } 1599 1600 error = 0; 1601 for (;;) { 1602 for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb; cb = 1603 TAILQ_NEXT(cb, plist)) { 1604 for (i = 0; i < njoblist; i++) { 1605 if (((intptr_t) 1606 cb->uaiocb._aiocb_private.kernelinfo) == 1607 ijoblist[i]) { 1608 if (ujoblist[i] != cb->uuaiocb) 1609 error = EINVAL; 1610 zfree(aiol_zone, ijoblist); 1611 zfree(aiol_zone, ujoblist); 1612 return error; 1613 } 1614 } 1615 } 1616 1617 s = splbio(); 1618 for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = 1619 TAILQ_NEXT(cb, plist)) { 1620 for (i = 0; i < njoblist; i++) { 1621 if (((intptr_t) 1622 cb->uaiocb._aiocb_private.kernelinfo) == 1623 ijoblist[i]) { 1624 splx(s); 1625 if (ujoblist[i] != cb->uuaiocb) 1626 error = EINVAL; 1627 zfree(aiol_zone, ijoblist); 1628 zfree(aiol_zone, ujoblist); 1629 return error; 1630 } 1631 } 1632 } 1633 1634 ki->kaio_flags |= KAIO_WAKEUP; 1635 error = tsleep(p, PRIBIO | PCATCH, "aiospn", timo); 1636 splx(s); 1637 1638 if (error == ERESTART || error == EINTR) { 1639 zfree(aiol_zone, ijoblist); 1640 zfree(aiol_zone, ujoblist); 1641 return EINTR; 1642 } else if (error == EWOULDBLOCK) { 1643 zfree(aiol_zone, ijoblist); 1644 zfree(aiol_zone, ujoblist); 1645 return EAGAIN; 1646 } 1647 } 1648 1649 /* NOTREACHED */ 1650 return EINVAL; 1651 #endif /* VFS_AIO */ 1652 } 1653 1654 /* 1655 * aio_cancel cancels any non-physio aio operations not currently in 1656 * progress. 
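 *
 * A minimal userland sketch of the interface (illustrative only; "fd"
 * and "cb" refer to a previously queued request on that descriptor):
 *
 *	switch (aio_cancel(fd, &cb)) {
 *	case AIO_CANCELED:	... the request was dequeued before it ran
 *	case AIO_NOTCANCELED:	... still in progress; poll aio_error()
 *	case AIO_ALLDONE:	... it had already completed
 *	}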
1657 */ 1658 int 1659 aio_cancel(struct proc *p, struct aio_cancel_args *uap) 1660 { 1661 #ifndef VFS_AIO 1662 return ENOSYS; 1663 #else 1664 struct kaioinfo *ki; 1665 struct aiocblist *cbe, *cbn; 1666 struct file *fp; 1667 struct filedesc *fdp; 1668 struct socket *so; 1669 struct proc *po; 1670 int s,error; 1671 int cancelled=0; 1672 int notcancelled=0; 1673 struct vnode *vp; 1674 1675 fdp = p->p_fd; 1676 1677 fp = fdp->fd_ofiles[uap->fd]; 1678 1679 if (fp == NULL) { 1680 return EBADF; 1681 } 1682 1683 if (fp->f_type == DTYPE_VNODE) { 1684 vp = (struct vnode *)fp->f_data; 1685 1686 if (vn_isdisk(vp,&error)) { 1687 p->p_retval[0] = AIO_NOTCANCELED; 1688 return 0; 1689 } 1690 } else if (fp->f_type == DTYPE_SOCKET) { 1691 so = (struct socket *)fp->f_data; 1692 1693 s = splnet(); 1694 1695 for (cbe = TAILQ_FIRST(&so->so_aiojobq); cbe; cbe = cbn) { 1696 cbn = TAILQ_NEXT(cbe, list); 1697 if ((uap->aiocbp == NULL) || 1698 (uap->aiocbp == cbe->uuaiocb) ) { 1699 po = cbe->userproc; 1700 ki = po->p_aioinfo; 1701 TAILQ_REMOVE(&so->so_aiojobq, cbe, list); 1702 TAILQ_REMOVE(&ki->kaio_sockqueue, cbe, plist); 1703 TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe, plist); 1704 if (ki->kaio_flags & KAIO_WAKEUP) { 1705 wakeup(po); 1706 } 1707 cbe->jobstate = JOBST_JOBFINISHED; 1708 cbe->uaiocb._aiocb_private.status=-1; 1709 cbe->uaiocb._aiocb_private.error=ECANCELED; 1710 cancelled++; 1711 /* XXX cancelled, knote? */ 1712 if (cbe->uaiocb.aio_sigevent.sigev_notify == 1713 SIGEV_SIGNAL) 1714 psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo); 1715 if (uap->aiocbp) 1716 break; 1717 } 1718 } 1719 1720 splx(s); 1721 1722 if ((cancelled) && (uap->aiocbp)) { 1723 p->p_retval[0] = AIO_CANCELED; 1724 return 0; 1725 } 1726 1727 } 1728 1729 ki=p->p_aioinfo; 1730 1731 s = splnet(); 1732 1733 for (cbe = TAILQ_FIRST(&ki->kaio_jobqueue); cbe; cbe = cbn) { 1734 cbn = TAILQ_NEXT(cbe, plist); 1735 1736 if ((uap->fd == cbe->uaiocb.aio_fildes) && 1737 ((uap->aiocbp == NULL ) || 1738 (uap->aiocbp == cbe->uuaiocb))) { 1739 1740 if (cbe->jobstate == JOBST_JOBQGLOBAL) { 1741 TAILQ_REMOVE(&aio_jobs, cbe, list); 1742 TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist); 1743 TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe, 1744 plist); 1745 cancelled++; 1746 ki->kaio_queue_finished_count++; 1747 cbe->jobstate = JOBST_JOBFINISHED; 1748 cbe->uaiocb._aiocb_private.status = -1; 1749 cbe->uaiocb._aiocb_private.error = ECANCELED; 1750 /* XXX cancelled, knote? */ 1751 if (cbe->uaiocb.aio_sigevent.sigev_notify == 1752 SIGEV_SIGNAL) 1753 psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo); 1754 } else { 1755 notcancelled++; 1756 } 1757 } 1758 } 1759 1760 splx(s); 1761 1762 1763 if (notcancelled) { 1764 p->p_retval[0] = AIO_NOTCANCELED; 1765 return 0; 1766 } 1767 1768 if (cancelled) { 1769 p->p_retval[0] = AIO_CANCELED; 1770 return 0; 1771 } 1772 1773 p->p_retval[0] = AIO_ALLDONE; 1774 1775 return 0; 1776 #endif /* VFS_AIO */ 1777 } 1778 1779 /* 1780 * aio_error is implemented in the kernel level for compatibility purposes only. 1781 * For a user mode async implementation, it would be best to do it in a userland 1782 * subroutine. 
1783 */ 1784 int 1785 aio_error(struct proc *p, struct aio_error_args *uap) 1786 { 1787 #ifndef VFS_AIO 1788 return ENOSYS; 1789 #else 1790 int s; 1791 struct aiocblist *cb; 1792 struct kaioinfo *ki; 1793 int jobref; 1794 1795 ki = p->p_aioinfo; 1796 if (ki == NULL) 1797 return EINVAL; 1798 1799 jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo); 1800 if ((jobref == -1) || (jobref == 0)) 1801 return EINVAL; 1802 1803 for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb; cb = TAILQ_NEXT(cb, 1804 plist)) { 1805 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == 1806 jobref) { 1807 p->p_retval[0] = cb->uaiocb._aiocb_private.error; 1808 return 0; 1809 } 1810 } 1811 1812 s = splnet(); 1813 1814 for (cb = TAILQ_FIRST(&ki->kaio_jobqueue); cb; cb = TAILQ_NEXT(cb, 1815 plist)) { 1816 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == 1817 jobref) { 1818 p->p_retval[0] = EINPROGRESS; 1819 splx(s); 1820 return 0; 1821 } 1822 } 1823 1824 for (cb = TAILQ_FIRST(&ki->kaio_sockqueue); cb; cb = TAILQ_NEXT(cb, 1825 plist)) { 1826 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == 1827 jobref) { 1828 p->p_retval[0] = EINPROGRESS; 1829 splx(s); 1830 return 0; 1831 } 1832 } 1833 splx(s); 1834 1835 s = splbio(); 1836 for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = TAILQ_NEXT(cb, 1837 plist)) { 1838 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == 1839 jobref) { 1840 p->p_retval[0] = cb->uaiocb._aiocb_private.error; 1841 splx(s); 1842 return 0; 1843 } 1844 } 1845 1846 for (cb = TAILQ_FIRST(&ki->kaio_bufqueue); cb; cb = TAILQ_NEXT(cb, 1847 plist)) { 1848 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == 1849 jobref) { 1850 p->p_retval[0] = EINPROGRESS; 1851 splx(s); 1852 return 0; 1853 } 1854 } 1855 splx(s); 1856 1857 #if (0) 1858 /* 1859 * Hack for lio. 1860 */ 1861 status = fuword(&uap->aiocbp->_aiocb_private.status); 1862 if (status == -1) 1863 return fuword(&uap->aiocbp->_aiocb_private.error); 1864 #endif 1865 return EINVAL; 1866 #endif /* VFS_AIO */ 1867 } 1868 1869 int 1870 aio_read(struct proc *p, struct aio_read_args *uap) 1871 { 1872 #ifndef VFS_AIO 1873 return ENOSYS; 1874 #else 1875 struct filedesc *fdp; 1876 struct file *fp; 1877 struct uio auio; 1878 struct iovec aiov; 1879 unsigned int fd; 1880 int cnt; 1881 struct aiocb iocb; 1882 int error, pmodes; 1883 1884 pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes); 1885 if ((pmodes & AIO_PMODE_SYNC) == 0) 1886 return aio_aqueue(p, (struct aiocb *)uap->aiocbp, LIO_READ); 1887 1888 /* Get control block. */ 1889 if ((error = copyin((caddr_t)uap->aiocbp, (caddr_t)&iocb, sizeof iocb)) 1890 != 0) 1891 return error; 1892 1893 /* Get the fd info for process. */ 1894 fdp = p->p_fd; 1895 1896 /* 1897 * Range check file descriptor. 1898 */ 1899 fd = iocb.aio_fildes; 1900 if (fd >= fdp->fd_nfiles) 1901 return EBADF; 1902 fp = fdp->fd_ofiles[fd]; 1903 if ((fp == NULL) || ((fp->f_flag & FREAD) == 0)) 1904 return EBADF; 1905 if (iocb.aio_offset == -1LL) 1906 return EINVAL; 1907 1908 auio.uio_resid = iocb.aio_nbytes; 1909 if (auio.uio_resid < 0) 1910 return (EINVAL); 1911 1912 /* 1913 * Process sync simply -- queue async request. 
1914 */ 1915 if ((iocb._aiocb_private.privatemodes & AIO_PMODE_SYNC) == 0) 1916 return aio_aqueue(p, (struct aiocb *)uap->aiocbp, LIO_READ); 1917 1918 aiov.iov_base = (void *)iocb.aio_buf; 1919 aiov.iov_len = iocb.aio_nbytes; 1920 1921 auio.uio_iov = &aiov; 1922 auio.uio_iovcnt = 1; 1923 auio.uio_offset = iocb.aio_offset; 1924 auio.uio_rw = UIO_READ; 1925 auio.uio_segflg = UIO_USERSPACE; 1926 auio.uio_procp = p; 1927 1928 cnt = iocb.aio_nbytes; 1929 /* 1930 * Temporarily bump the ref count while reading to avoid the 1931 * descriptor being ripped out from under us. 1932 */ 1933 fhold(fp); 1934 error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, p); 1935 fdrop(fp, p); 1936 if (error && (auio.uio_resid != cnt) && (error == ERESTART || error == 1937 EINTR || error == EWOULDBLOCK)) 1938 error = 0; 1939 cnt -= auio.uio_resid; 1940 p->p_retval[0] = cnt; 1941 return error; 1942 #endif /* VFS_AIO */ 1943 } 1944 1945 int 1946 aio_write(struct proc *p, struct aio_write_args *uap) 1947 { 1948 #ifndef VFS_AIO 1949 return ENOSYS; 1950 #else 1951 struct filedesc *fdp; 1952 struct file *fp; 1953 struct uio auio; 1954 struct iovec aiov; 1955 unsigned int fd; 1956 int cnt; 1957 struct aiocb iocb; 1958 int error; 1959 int pmodes; 1960 1961 /* 1962 * Process sync simply -- queue async request. 1963 */ 1964 pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes); 1965 if ((pmodes & AIO_PMODE_SYNC) == 0) 1966 return aio_aqueue(p, (struct aiocb *)uap->aiocbp, LIO_WRITE); 1967 1968 if ((error = copyin((caddr_t)uap->aiocbp, (caddr_t)&iocb, sizeof iocb)) 1969 != 0) 1970 return error; 1971 1972 /* Get the fd info for process. */ 1973 fdp = p->p_fd; 1974 1975 /* 1976 * Range check file descriptor. 1977 */ 1978 fd = iocb.aio_fildes; 1979 if (fd >= fdp->fd_nfiles) 1980 return EBADF; 1981 fp = fdp->fd_ofiles[fd]; 1982 if ((fp == NULL) || ((fp->f_flag & FWRITE) == 0)) 1983 return EBADF; 1984 if (iocb.aio_offset == -1LL) 1985 return EINVAL; 1986 1987 aiov.iov_base = (void *)iocb.aio_buf; 1988 aiov.iov_len = iocb.aio_nbytes; 1989 auio.uio_iov = &aiov; 1990 auio.uio_iovcnt = 1; 1991 auio.uio_offset = iocb.aio_offset; 1992 1993 auio.uio_resid = iocb.aio_nbytes; 1994 if (auio.uio_resid < 0) 1995 return (EINVAL); 1996 1997 auio.uio_rw = UIO_WRITE; 1998 auio.uio_segflg = UIO_USERSPACE; 1999 auio.uio_procp = p; 2000 2001 cnt = iocb.aio_nbytes; 2002 /* 2003 * Temporarily bump the ref count while writing to avoid the 2004 * descriptor being ripped out from under us. 
2005 */ 2006 fhold(fp); 2007 error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, p); 2008 fdrop(fp, p); 2009 if (error) { 2010 if (auio.uio_resid != cnt) { 2011 if (error == ERESTART || error == EINTR || error == 2012 EWOULDBLOCK) 2013 error = 0; 2014 if (error == EPIPE) 2015 psignal(p, SIGPIPE); 2016 } 2017 } 2018 cnt -= auio.uio_resid; 2019 p->p_retval[0] = cnt; 2020 return error; 2021 #endif /* VFS_AIO */ 2022 } 2023 2024 int 2025 lio_listio(struct proc *p, struct lio_listio_args *uap) 2026 { 2027 #ifndef VFS_AIO 2028 return ENOSYS; 2029 #else 2030 int nent, nentqueued; 2031 struct aiocb *iocb, * const *cbptr; 2032 struct aiocblist *cb; 2033 struct kaioinfo *ki; 2034 struct aio_liojob *lj; 2035 int error, runningcode; 2036 int nerror; 2037 int i; 2038 int s; 2039 2040 if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) 2041 return EINVAL; 2042 2043 nent = uap->nent; 2044 if (nent > AIO_LISTIO_MAX) 2045 return EINVAL; 2046 2047 if (p->p_aioinfo == NULL) 2048 aio_init_aioinfo(p); 2049 2050 if ((nent + num_queue_count) > max_queue_count) 2051 return EAGAIN; 2052 2053 ki = p->p_aioinfo; 2054 if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count) 2055 return EAGAIN; 2056 2057 lj = zalloc(aiolio_zone); 2058 if (!lj) 2059 return EAGAIN; 2060 2061 lj->lioj_flags = 0; 2062 lj->lioj_buffer_count = 0; 2063 lj->lioj_buffer_finished_count = 0; 2064 lj->lioj_queue_count = 0; 2065 lj->lioj_queue_finished_count = 0; 2066 lj->lioj_ki = ki; 2067 TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list); 2068 2069 /* 2070 * Setup signal. 2071 */ 2072 if (uap->sig && (uap->mode == LIO_NOWAIT)) { 2073 error = copyin(uap->sig, &lj->lioj_signal, 2074 sizeof(lj->lioj_signal)); 2075 if (error) 2076 return error; 2077 lj->lioj_flags |= LIOJ_SIGNAL; 2078 lj->lioj_flags &= ~LIOJ_SIGNAL_POSTED; 2079 } else 2080 lj->lioj_flags &= ~LIOJ_SIGNAL; 2081 2082 /* 2083 * Get pointers to the list of I/O requests. 2084 */ 2085 nerror = 0; 2086 nentqueued = 0; 2087 cbptr = uap->acb_list; 2088 for (i = 0; i < uap->nent; i++) { 2089 iocb = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]); 2090 if (((intptr_t)iocb != -1) && ((intptr_t)iocb != NULL)) { 2091 error = _aio_aqueue(p, iocb, lj, 0); 2092 if (error == 0) 2093 nentqueued++; 2094 else 2095 nerror++; 2096 } 2097 } 2098 2099 /* 2100 * If we haven't queued any, then just return error. 2101 */ 2102 if (nentqueued == 0) 2103 return 0; 2104 2105 /* 2106 * Calculate the appropriate error return. 2107 */ 2108 runningcode = 0; 2109 if (nerror) 2110 runningcode = EIO; 2111 2112 if (uap->mode == LIO_WAIT) { 2113 int command, found, jobref; 2114 2115 for (;;) { 2116 found = 0; 2117 for (i = 0; i < uap->nent; i++) { 2118 /* 2119 * Fetch address of the control buf pointer in 2120 * user space. 2121 */ 2122 iocb = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]); 2123 if (((intptr_t)iocb == -1) || ((intptr_t)iocb 2124 == 0)) 2125 continue; 2126 2127 /* 2128 * Fetch the associated command from user space. 
				 */
				command = fuword(&iocb->aio_lio_opcode);
				if (command == LIO_NOP) {
					found++;
					continue;
				}

				jobref = fuword(&iocb->_aiocb_private.kernelinfo);

				for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb;
				    cb = TAILQ_NEXT(cb, plist)) {
					if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
					    == jobref) {
						if (cb->uaiocb.aio_lio_opcode
						    == LIO_WRITE) {
							curproc->p_stats->p_ru.ru_oublock +=
							    cb->outputcharge;
							cb->outputcharge = 0;
						} else if (cb->uaiocb.aio_lio_opcode
						    == LIO_READ) {
							curproc->p_stats->p_ru.ru_inblock +=
							    cb->inputcharge;
							cb->inputcharge = 0;
						}
						found++;
						break;
					}
				}

				s = splbio();
				for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb;
				    cb = TAILQ_NEXT(cb, plist)) {
					if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
					    == jobref) {
						found++;
						break;
					}
				}
				splx(s);
			}

			/*
			 * If all I/Os have been disposed of, then we can
			 * return.
			 */
			if (found == nentqueued)
				return runningcode;

			ki->kaio_flags |= KAIO_WAKEUP;
			error = tsleep(p, PRIBIO | PCATCH, "aiospn", 0);

			if (error == EINTR)
				return EINTR;
			else if (error == EWOULDBLOCK)
				return EAGAIN;
		}
	}

	return runningcode;
#endif /* VFS_AIO */
}

#ifdef VFS_AIO
/*
 * This is a weird hack so that we can post a signal.  It is safe to do so
 * from a timeout routine, but *not* from an interrupt routine.
 */
static void
process_signal(void *aioj)
{
	struct aiocblist *aiocbe = aioj;
	struct aio_liojob *lj = aiocbe->lio;
	struct aiocb *cb = &aiocbe->uaiocb;

	if ((lj) && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) &&
	    (lj->lioj_queue_count == lj->lioj_queue_finished_count)) {
		psignal(lj->lioj_ki->kaio_p, lj->lioj_signal.sigev_signo);
		lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
	}

	if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL)
		psignal(aiocbe->userproc, cb->aio_sigevent.sigev_signo);
}

/*
 * Interrupt handler for physio; performs the necessary process wakeups and
 * posts signals.
 */
static void
aio_physwakeup(struct buf *bp)
{
	struct aiocblist *aiocbe;
	struct proc *p;
	struct kaioinfo *ki;
	struct aio_liojob *lj;

	wakeup((caddr_t)bp);

	aiocbe = (struct aiocblist *)bp->b_spc;
	if (aiocbe) {
		p = bp->b_caller1;

		aiocbe->jobstate = JOBST_JOBBFINISHED;
		aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
		aiocbe->uaiocb._aiocb_private.error = 0;
		aiocbe->jobflags |= AIOCBLIST_DONE;

		if (bp->b_ioflags & BIO_ERROR)
			aiocbe->uaiocb._aiocb_private.error = bp->b_error;

		lj = aiocbe->lio;
		if (lj) {
			lj->lioj_buffer_finished_count++;

			/*
			 * Wakeup/signal if all of the interrupt jobs are done.
			 */
			if (lj->lioj_buffer_finished_count ==
			    lj->lioj_buffer_count) {
				/*
				 * Post a signal if it is called for.
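				 * The signal is posted from a timeout handler
				 * (process_signal()) rather than here, because
				 * psignal() is not safe to call from an
				 * interrupt routine.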
				 */
				if ((lj->lioj_flags &
				    (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) ==
				    LIOJ_SIGNAL) {
					lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
					timeout(process_signal, aiocbe, 0);
				}
			}
		}

		ki = p->p_aioinfo;
		if (ki) {
			ki->kaio_buffer_finished_count++;
			TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
			TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
			TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);

			KNOTE(&aiocbe->klist, 0);
			/* Do the wakeup. */
			if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) {
				ki->kaio_flags &= ~KAIO_WAKEUP;
				wakeup(p);
			}
		}

		if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL)
			timeout(process_signal, aiocbe, 0);
	}
}
#endif /* VFS_AIO */

int
aio_waitcomplete(struct proc *p, struct aio_waitcomplete_args *uap)
{
#ifndef VFS_AIO
	return ENOSYS;
#else
	struct timeval atv;
	struct timespec ts;
	struct aiocb **cbptr;
	struct kaioinfo *ki;
	struct aiocblist *cb = NULL;
	int error, s, timo;

	suword(uap->aiocbp, (int)NULL);

	timo = 0;
	if (uap->timeout) {
		/* Get timespec struct. */
		error = copyin((caddr_t)uap->timeout, (caddr_t)&ts,
		    sizeof(ts));
		if (error)
			return error;

		if ((ts.tv_nsec < 0) || (ts.tv_nsec >= 1000000000))
			return (EINVAL);

		TIMESPEC_TO_TIMEVAL(&atv, &ts);
		if (itimerfix(&atv))
			return (EINVAL);
		timo = tvtohz(&atv);
	}

	ki = p->p_aioinfo;
	if (ki == NULL)
		return EAGAIN;

	cbptr = uap->aiocbp;

	for (;;) {
		if ((cb = TAILQ_FIRST(&ki->kaio_jobdone)) != NULL) {
			suword(uap->aiocbp, (int)cb->uuaiocb);
			p->p_retval[0] = cb->uaiocb._aiocb_private.status;
			if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
				curproc->p_stats->p_ru.ru_oublock +=
				    cb->outputcharge;
				cb->outputcharge = 0;
			} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
				curproc->p_stats->p_ru.ru_inblock +=
				    cb->inputcharge;
				cb->inputcharge = 0;
			}
			/* Fetch the error before the entry is freed. */
			error = cb->uaiocb._aiocb_private.error;
			aio_free_entry(cb);
			return error;
		}

		s = splbio();
		if ((cb = TAILQ_FIRST(&ki->kaio_bufdone)) != NULL) {
			splx(s);
			suword(uap->aiocbp, (int)cb->uuaiocb);
			p->p_retval[0] = cb->uaiocb._aiocb_private.status;
			/* Fetch the error before the entry is freed. */
			error = cb->uaiocb._aiocb_private.error;
			aio_free_entry(cb);
			return error;
		}

		ki->kaio_flags |= KAIO_WAKEUP;
		error = tsleep(p, PRIBIO | PCATCH, "aiowc", timo);
		splx(s);

		if (error == ERESTART)
			return EINTR;
		else if (error < 0)
			return error;
		else if (error == EINTR)
			return EINTR;
		else if (error == EWOULDBLOCK)
			return EAGAIN;
	}
#endif /* VFS_AIO */
}

#ifndef VFS_AIO
static int
filt_aioattach(struct knote *kn)
{

	return (ENXIO);
}

struct filterops aio_filtops =
	{ 0, filt_aioattach, NULL, NULL };

#else
static int
filt_aioattach(struct knote *kn)
{
	struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id;

	/*
	 * The aiocbe pointer must be validated before using it, so
	 * registration is restricted to the kernel; the user cannot
	 * set EV_FLAG1.
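	 * The in-kernel caller sets EV_FLAG1 when it registers the knote on
	 * behalf of an aiocb; the flag is cleared again below before the
	 * knote is linked onto the aiocb's list.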
	 */
	if ((kn->kn_flags & EV_FLAG1) == 0)
		return (EPERM);
	kn->kn_flags &= ~EV_FLAG1;

	SLIST_INSERT_HEAD(&aiocbe->klist, kn, kn_selnext);

	return (0);
}

static void
filt_aiodetach(struct knote *kn)
{
	struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id;
	int s = splhigh();	/* XXX no clue, so overkill */

	SLIST_REMOVE(&aiocbe->klist, kn, knote, kn_selnext);
	splx(s);
}

/*ARGSUSED*/
static int
filt_aio(struct knote *kn, long hint)
{
	struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id;

	kn->kn_data = 0;	/* XXX data returned? */
	if (aiocbe->jobstate != JOBST_JOBFINISHED &&
	    aiocbe->jobstate != JOBST_JOBBFINISHED)
		return (0);
	kn->kn_flags |= EV_EOF;
	return (1);
}

struct filterops aio_filtops =
	{ 0, filt_aioattach, filt_aiodetach, filt_aio };
#endif /* VFS_AIO */
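
/*
 * Usage note (illustrative sketch only, not part of the kernel code): a
 * userland consumer of the aio_write()/aio_waitcomplete() interfaces above
 * would typically look roughly like the fragment below.  The identifiers
 * "fd" and "buf" are hypothetical.
 *
 *	struct aiocb cb, *donep;
 *
 *	bzero(&cb, sizeof(cb));
 *	cb.aio_fildes = fd;
 *	cb.aio_buf = buf;
 *	cb.aio_nbytes = sizeof(buf);
 *	cb.aio_offset = 0;
 *	if (aio_write(&cb) != 0)
 *		err(1, "aio_write");
 *	if (aio_waitcomplete(&donep, NULL) < 0)
 *		err(1, "aio_waitcomplete");
 */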