1 /* 2 * Copyright (c) 1997 John S. Dyson. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. John S. Dyson's name may not be used to endorse or promote products 10 * derived from this software without specific prior written permission. 11 * 12 * DISCLAIMER: This code isn't warranted to do anything useful. Anything 13 * bad that happens because of using this software isn't the responsibility 14 * of the author. This software is distributed AS-IS. 15 * 16 * $FreeBSD$ 17 */ 18 19 /* 20 * This file contains support for the POSIX 1003.1B AIO/LIO facility. 21 */ 22 23 #include <sys/param.h> 24 #include <sys/systm.h> 25 #include <sys/bio.h> 26 #include <sys/buf.h> 27 #include <sys/sysproto.h> 28 #include <sys/filedesc.h> 29 #include <sys/kernel.h> 30 #include <sys/fcntl.h> 31 #include <sys/file.h> 32 #include <sys/lock.h> 33 #include <sys/unistd.h> 34 #include <sys/proc.h> 35 #include <sys/resourcevar.h> 36 #include <sys/signalvar.h> 37 #include <sys/protosw.h> 38 #include <sys/socketvar.h> 39 #include <sys/sysctl.h> 40 #include <sys/vnode.h> 41 #include <sys/conf.h> 42 #include <sys/event.h> 43 44 #include <vm/vm.h> 45 #include <vm/vm_extern.h> 46 #include <vm/pmap.h> 47 #include <vm/vm_map.h> 48 #include <vm/vm_zone.h> 49 #include <sys/aio.h> 50 51 #include <machine/limits.h> 52 #include "opt_vfs_aio.h" 53 54 static long jobrefid; 55 56 #define JOBST_NULL 0x0 57 #define JOBST_JOBQPROC 0x1 58 #define JOBST_JOBQGLOBAL 0x2 59 #define JOBST_JOBRUNNING 0x3 60 #define JOBST_JOBFINISHED 0x4 61 #define JOBST_JOBQBUF 0x5 62 #define JOBST_JOBBFINISHED 0x6 63 64 #ifndef MAX_AIO_PER_PROC 65 #define MAX_AIO_PER_PROC 32 66 #endif 67 68 #ifndef MAX_AIO_QUEUE_PER_PROC 69 #define MAX_AIO_QUEUE_PER_PROC 256 /* Bigger than AIO_LISTIO_MAX */ 70 #endif 71 72 #ifndef MAX_AIO_PROCS 73 #define MAX_AIO_PROCS 32 74 #endif 75 76 #ifndef MAX_AIO_QUEUE 77 #define MAX_AIO_QUEUE 1024 /* Bigger than AIO_LISTIO_MAX */ 78 #endif 79 80 #ifndef TARGET_AIO_PROCS 81 #define TARGET_AIO_PROCS 4 82 #endif 83 84 #ifndef MAX_BUF_AIO 85 #define MAX_BUF_AIO 16 86 #endif 87 88 #ifndef AIOD_TIMEOUT_DEFAULT 89 #define AIOD_TIMEOUT_DEFAULT (10 * hz) 90 #endif 91 92 #ifndef AIOD_LIFETIME_DEFAULT 93 #define AIOD_LIFETIME_DEFAULT (30 * hz) 94 #endif 95 96 static int max_aio_procs = MAX_AIO_PROCS; 97 static int num_aio_procs = 0; 98 static int target_aio_procs = TARGET_AIO_PROCS; 99 static int max_queue_count = MAX_AIO_QUEUE; 100 static int num_queue_count = 0; 101 static int num_buf_aio = 0; 102 static int num_aio_resv_start = 0; 103 static int aiod_timeout; 104 static int aiod_lifetime; 105 106 static int max_aio_per_proc = MAX_AIO_PER_PROC; 107 static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC; 108 static int max_buf_aio = MAX_BUF_AIO; 109 110 SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "AIO mgmt"); 111 112 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, 113 CTLFLAG_RW, &max_aio_per_proc, 0, ""); 114 115 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, 116 CTLFLAG_RW, &max_aio_queue_per_proc, 0, ""); 117 118 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs, 119 CTLFLAG_RW, &max_aio_procs, 0, ""); 120 121 SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs, 122 CTLFLAG_RD, &num_aio_procs, 0, ""); 123 124 SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, 125 CTLFLAG_RD, 
&num_queue_count, 0, ""); 126 127 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, 128 CTLFLAG_RW, &max_queue_count, 0, ""); 129 130 SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, 131 CTLFLAG_RW, &target_aio_procs, 0, ""); 132 133 SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, 134 CTLFLAG_RW, &max_buf_aio, 0, ""); 135 136 SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, 137 CTLFLAG_RD, &num_buf_aio, 0, ""); 138 139 SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, 140 CTLFLAG_RW, &aiod_lifetime, 0, ""); 141 142 SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout, 143 CTLFLAG_RW, &aiod_timeout, 0, ""); 144 145 /* 146 * AIO process info 147 */ 148 #define AIOP_FREE 0x1 /* proc on free queue */ 149 #define AIOP_SCHED 0x2 /* proc explicitly scheduled */ 150 151 struct aioproclist { 152 int aioprocflags; /* AIO proc flags */ 153 TAILQ_ENTRY(aioproclist) list; /* List of processes */ 154 struct proc *aioproc; /* The AIO thread */ 155 TAILQ_HEAD (,aiocblist) jobtorun; /* suggested job to run */ 156 }; 157 158 /* 159 * data-structure for lio signal management 160 */ 161 struct aio_liojob { 162 int lioj_flags; 163 int lioj_buffer_count; 164 int lioj_buffer_finished_count; 165 int lioj_queue_count; 166 int lioj_queue_finished_count; 167 struct sigevent lioj_signal; /* signal on all I/O done */ 168 TAILQ_ENTRY (aio_liojob) lioj_list; 169 struct kaioinfo *lioj_ki; 170 }; 171 #define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */ 172 #define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */ 173 174 /* 175 * per process aio data structure 176 */ 177 struct kaioinfo { 178 int kaio_flags; /* per process kaio flags */ 179 int kaio_maxactive_count; /* maximum number of AIOs */ 180 int kaio_active_count; /* number of currently used AIOs */ 181 int kaio_qallowed_count; /* maxiumu size of AIO queue */ 182 int kaio_queue_count; /* size of AIO queue */ 183 int kaio_ballowed_count; /* maximum number of buffers */ 184 int kaio_queue_finished_count; /* number of daemon jobs finished */ 185 int kaio_buffer_count; /* number of physio buffers */ 186 int kaio_buffer_finished_count; /* count of I/O done */ 187 struct proc *kaio_p; /* process that uses this kaio block */ 188 TAILQ_HEAD (,aio_liojob) kaio_liojoblist; /* list of lio jobs */ 189 TAILQ_HEAD (,aiocblist) kaio_jobqueue; /* job queue for process */ 190 TAILQ_HEAD (,aiocblist) kaio_jobdone; /* done queue for process */ 191 TAILQ_HEAD (,aiocblist) kaio_bufqueue; /* buffer job queue for process */ 192 TAILQ_HEAD (,aiocblist) kaio_bufdone; /* buffer done queue for process */ 193 TAILQ_HEAD (,aiocblist) kaio_sockqueue; /* queue for aios waiting on sockets */ 194 }; 195 196 #define KAIO_RUNDOWN 0x1 /* process is being run down */ 197 #define KAIO_WAKEUP 0x2 /* wakeup process when there is a significant event */ 198 199 static TAILQ_HEAD(,aioproclist) aio_freeproc, aio_activeproc; 200 static TAILQ_HEAD(,aiocblist) aio_jobs; /* Async job list */ 201 static TAILQ_HEAD(,aiocblist) aio_bufjobs; /* Phys I/O job list */ 202 static TAILQ_HEAD(,aiocblist) aio_freejobs; /* Pool of free jobs */ 203 204 static void aio_init_aioinfo(struct proc *p); 205 static void aio_onceonly(void *); 206 static int aio_free_entry(struct aiocblist *aiocbe); 207 static void aio_process(struct aiocblist *aiocbe); 208 static int aio_newproc(void); 209 static int aio_aqueue(struct proc *p, struct aiocb *job, int type); 210 static void aio_physwakeup(struct buf *bp); 211 static int aio_fphysio(struct proc *p, struct aiocblist *aiocbe, int type); 212 static int aio_qphysio(struct proc *p, struct aiocblist *iocb); 213 
static void aio_daemon(void *uproc); 214 215 static int filt_aioattach(struct knote *kn); 216 static void filt_aiodetach(struct knote *kn); 217 static int filt_aio(struct knote *kn, long hint); 218 219 struct filterops aio_filtops = 220 { 0, filt_aioattach, filt_aiodetach, filt_aio }; 221 222 SYSINIT(aio, SI_SUB_VFS, SI_ORDER_ANY, aio_onceonly, NULL); 223 224 static vm_zone_t kaio_zone = 0, aiop_zone = 0, aiocb_zone = 0, aiol_zone = 0; 225 static vm_zone_t aiolio_zone = 0; 226 227 /* 228 * Startup initialization 229 */ 230 void 231 aio_onceonly(void *na) 232 { 233 TAILQ_INIT(&aio_freeproc); 234 TAILQ_INIT(&aio_activeproc); 235 TAILQ_INIT(&aio_jobs); 236 TAILQ_INIT(&aio_bufjobs); 237 TAILQ_INIT(&aio_freejobs); 238 kaio_zone = zinit("AIO", sizeof (struct kaioinfo), 0, 0, 1); 239 aiop_zone = zinit("AIOP", sizeof (struct aioproclist), 0, 0, 1); 240 aiocb_zone = zinit("AIOCB", sizeof (struct aiocblist), 0, 0, 1); 241 aiol_zone = zinit("AIOL", AIO_LISTIO_MAX * sizeof (int), 0, 0, 1); 242 aiolio_zone = zinit("AIOLIO", AIO_LISTIO_MAX * sizeof (struct 243 aio_liojob), 0, 0, 1); 244 aiod_timeout = AIOD_TIMEOUT_DEFAULT; 245 aiod_lifetime = AIOD_LIFETIME_DEFAULT; 246 jobrefid = 1; 247 } 248 249 /* 250 * Init the per-process aioinfo structure. The aioinfo limits are set 251 * per-process for user limit (resource) management. 252 */ 253 void 254 aio_init_aioinfo(struct proc *p) 255 { 256 struct kaioinfo *ki; 257 if (p->p_aioinfo == NULL) { 258 ki = zalloc(kaio_zone); 259 p->p_aioinfo = ki; 260 ki->kaio_flags = 0; 261 ki->kaio_maxactive_count = max_aio_per_proc; 262 ki->kaio_active_count = 0; 263 ki->kaio_qallowed_count = max_aio_queue_per_proc; 264 ki->kaio_queue_count = 0; 265 ki->kaio_ballowed_count = max_buf_aio; 266 ki->kaio_buffer_count = 0; 267 ki->kaio_buffer_finished_count = 0; 268 ki->kaio_p = p; 269 TAILQ_INIT(&ki->kaio_jobdone); 270 TAILQ_INIT(&ki->kaio_jobqueue); 271 TAILQ_INIT(&ki->kaio_bufdone); 272 TAILQ_INIT(&ki->kaio_bufqueue); 273 TAILQ_INIT(&ki->kaio_liojoblist); 274 TAILQ_INIT(&ki->kaio_sockqueue); 275 } 276 277 while (num_aio_procs < target_aio_procs) 278 aio_newproc(); 279 } 280 281 /* 282 * Free a job entry. Wait for completion if it is currently active, but don't 283 * delay forever. If we delay, we return a flag that says that we have to 284 * restart the queue scan. 
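 *
 * A minimal sketch of the caller pattern this implies; it mirrors the
 * restart loops in aio_proc_rundown() below and is shown here only for
 * illustration:
 *
 *	restart:
 *	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobdone); aiocbe; aiocbe = aiocbn) {
 *		aiocbn = TAILQ_NEXT(aiocbe, plist);
 *		if (aio_free_entry(aiocbe))
 *			goto restart;
 *	}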
285 */ 286 int 287 aio_free_entry(struct aiocblist *aiocbe) 288 { 289 struct kaioinfo *ki; 290 struct aioproclist *aiop; 291 struct aio_liojob *lj; 292 struct proc *p; 293 int error; 294 int s; 295 296 if (aiocbe->jobstate == JOBST_NULL) 297 panic("aio_free_entry: freeing already free job"); 298 299 p = aiocbe->userproc; 300 ki = p->p_aioinfo; 301 lj = aiocbe->lio; 302 if (ki == NULL) 303 panic("aio_free_entry: missing p->p_aioinfo"); 304 305 if (aiocbe->jobstate == JOBST_JOBRUNNING) { 306 if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) 307 return 0; 308 aiocbe->jobflags |= AIOCBLIST_RUNDOWN; 309 tsleep(aiocbe, PRIBIO|PCATCH, "jobwai", 0); 310 } 311 aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE; 312 313 if (aiocbe->bp == NULL) { 314 if (ki->kaio_queue_count <= 0) 315 panic("aio_free_entry: process queue size <= 0"); 316 if (num_queue_count <= 0) 317 panic("aio_free_entry: system wide queue size <= 0"); 318 319 if (lj) { 320 lj->lioj_queue_count--; 321 if (aiocbe->jobflags & AIOCBLIST_DONE) 322 lj->lioj_queue_finished_count--; 323 } 324 ki->kaio_queue_count--; 325 if (aiocbe->jobflags & AIOCBLIST_DONE) 326 ki->kaio_queue_finished_count--; 327 num_queue_count--; 328 } else { 329 if (lj) { 330 lj->lioj_buffer_count--; 331 if (aiocbe->jobflags & AIOCBLIST_DONE) 332 lj->lioj_buffer_finished_count--; 333 } 334 if (aiocbe->jobflags & AIOCBLIST_DONE) 335 ki->kaio_buffer_finished_count--; 336 ki->kaio_buffer_count--; 337 num_buf_aio--; 338 } 339 340 /* aiocbe is going away, we need to destroy any knotes */ 341 knote_remove(p, &aiocbe->klist); 342 343 if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags & KAIO_RUNDOWN) 344 && ((ki->kaio_buffer_count == 0) && (ki->kaio_queue_count == 0)))) { 345 ki->kaio_flags &= ~KAIO_WAKEUP; 346 wakeup(p); 347 } 348 349 if (aiocbe->jobstate == JOBST_JOBQBUF) { 350 if ((error = aio_fphysio(p, aiocbe, 1)) != 0) 351 return error; 352 if (aiocbe->jobstate != JOBST_JOBBFINISHED) 353 panic("aio_free_entry: invalid physio finish-up state"); 354 s = splbio(); 355 TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist); 356 splx(s); 357 } else if (aiocbe->jobstate == JOBST_JOBQPROC) { 358 aiop = aiocbe->jobaioproc; 359 TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list); 360 } else if (aiocbe->jobstate == JOBST_JOBQGLOBAL) 361 TAILQ_REMOVE(&aio_jobs, aiocbe, list); 362 else if (aiocbe->jobstate == JOBST_JOBFINISHED) 363 TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist); 364 else if (aiocbe->jobstate == JOBST_JOBBFINISHED) { 365 s = splbio(); 366 TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist); 367 splx(s); 368 if (aiocbe->bp) { 369 vunmapbuf(aiocbe->bp); 370 relpbuf(aiocbe->bp, NULL); 371 aiocbe->bp = NULL; 372 } 373 } 374 if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) { 375 TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); 376 zfree(aiolio_zone, lj); 377 } 378 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); 379 aiocbe->jobstate = JOBST_NULL; 380 return 0; 381 } 382 383 /* 384 * Rundown the jobs for a given process. 
385 */ 386 void 387 aio_proc_rundown(struct proc *p) 388 { 389 int s; 390 struct kaioinfo *ki; 391 struct aio_liojob *lj, *ljn; 392 struct aiocblist *aiocbe, *aiocbn; 393 struct file *fp; 394 struct filedesc *fdp; 395 struct socket *so; 396 397 ki = p->p_aioinfo; 398 if (ki == NULL) 399 return; 400 401 ki->kaio_flags |= LIOJ_SIGNAL_POSTED; 402 while ((ki->kaio_active_count > 0) || (ki->kaio_buffer_count > 403 ki->kaio_buffer_finished_count)) { 404 ki->kaio_flags |= KAIO_RUNDOWN; 405 if (tsleep(p, PRIBIO, "kaiowt", aiod_timeout)) 406 break; 407 } 408 409 /* 410 * Move any aio ops that are waiting on socket I/O to the normal job 411 * queues so they are cleaned up with any others. 412 */ 413 fdp = p->p_fd; 414 415 s = splnet(); 416 for (aiocbe = TAILQ_FIRST(&ki->kaio_sockqueue); aiocbe; aiocbe = 417 aiocbn) { 418 aiocbn = TAILQ_NEXT(aiocbe, plist); 419 fp = fdp->fd_ofiles[aiocbe->uaiocb.aio_fildes]; 420 421 /* 422 * Under some circumstances, the aio_fildes and the file 423 * structure don't match. This would leave aiocbe's in the 424 * TAILQ associated with the socket and cause a panic later. 425 * 426 * Detect and fix. 427 */ 428 if ((fp == NULL) || (fp != aiocbe->fd_file)) 429 fp = aiocbe->fd_file; 430 if (fp) { 431 so = (struct socket *)fp->f_data; 432 TAILQ_REMOVE(&so->so_aiojobq, aiocbe, list); 433 if (TAILQ_EMPTY(&so->so_aiojobq)) { 434 so->so_snd.sb_flags &= ~SB_AIO; 435 so->so_rcv.sb_flags &= ~SB_AIO; 436 } 437 } 438 TAILQ_REMOVE(&ki->kaio_sockqueue, aiocbe, plist); 439 TAILQ_INSERT_HEAD(&aio_jobs, aiocbe, list); 440 TAILQ_INSERT_HEAD(&ki->kaio_jobqueue, aiocbe, plist); 441 } 442 splx(s); 443 444 restart1: 445 for (aiocbe = TAILQ_FIRST(&ki->kaio_jobdone); aiocbe; aiocbe = aiocbn) { 446 aiocbn = TAILQ_NEXT(aiocbe, plist); 447 if (aio_free_entry(aiocbe)) 448 goto restart1; 449 } 450 451 restart2: 452 for (aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue); aiocbe; aiocbe = 453 aiocbn) { 454 aiocbn = TAILQ_NEXT(aiocbe, plist); 455 if (aio_free_entry(aiocbe)) 456 goto restart2; 457 } 458 459 /* 460 * Note the use of lots of splbio here, trying to avoid splbio for long chains 461 * of I/O. Probably unnecessary. 462 */ 463 restart3: 464 s = splbio(); 465 while (TAILQ_FIRST(&ki->kaio_bufqueue)) { 466 ki->kaio_flags |= KAIO_WAKEUP; 467 tsleep(p, PRIBIO, "aioprn", 0); 468 splx(s); 469 goto restart3; 470 } 471 splx(s); 472 473 restart4: 474 s = splbio(); 475 for (aiocbe = TAILQ_FIRST(&ki->kaio_bufdone); aiocbe; aiocbe = aiocbn) { 476 aiocbn = TAILQ_NEXT(aiocbe, plist); 477 if (aio_free_entry(aiocbe)) { 478 splx(s); 479 goto restart4; 480 } 481 } 482 splx(s); 483 484 for (lj = TAILQ_FIRST(&ki->kaio_liojoblist); lj; lj = ljn) { 485 ljn = TAILQ_NEXT(lj, lioj_list); 486 if ((lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 487 0)) { 488 TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); 489 zfree(aiolio_zone, lj); 490 } else { 491 #ifdef DIAGNOSTIC 492 printf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, " 493 "QF:%d\n", lj->lioj_buffer_count, 494 lj->lioj_buffer_finished_count, 495 lj->lioj_queue_count, 496 lj->lioj_queue_finished_count); 497 #endif 498 } 499 } 500 501 zfree(kaio_zone, ki); 502 p->p_aioinfo = NULL; 503 } 504 505 /* 506 * Select a job to run (called by an AIO daemon). 
507 */ 508 static struct aiocblist * 509 aio_selectjob(struct aioproclist *aiop) 510 { 511 int s; 512 struct aiocblist *aiocbe; 513 struct kaioinfo *ki; 514 struct proc *userp; 515 516 aiocbe = TAILQ_FIRST(&aiop->jobtorun); 517 if (aiocbe) { 518 TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list); 519 return aiocbe; 520 } 521 522 s = splnet(); 523 for (aiocbe = TAILQ_FIRST(&aio_jobs); aiocbe; aiocbe = 524 TAILQ_NEXT(aiocbe, list)) { 525 userp = aiocbe->userproc; 526 ki = userp->p_aioinfo; 527 528 if (ki->kaio_active_count < ki->kaio_maxactive_count) { 529 TAILQ_REMOVE(&aio_jobs, aiocbe, list); 530 splx(s); 531 return aiocbe; 532 } 533 } 534 splx(s); 535 536 return NULL; 537 } 538 539 /* 540 * The AIO processing activity. This is the code that does the I/O request for 541 * the non-physio version of the operations. The normal vn operations are used, 542 * and this code should work in all instances for every type of file, including 543 * pipes, sockets, fifos, and regular files. 544 */ 545 void 546 aio_process(struct aiocblist *aiocbe) 547 { 548 struct filedesc *fdp; 549 struct proc *userp, *mycp; 550 struct aiocb *cb; 551 struct file *fp; 552 struct uio auio; 553 struct iovec aiov; 554 unsigned int fd; 555 int cnt; 556 int error; 557 off_t offset; 558 int oublock_st, oublock_end; 559 int inblock_st, inblock_end; 560 561 userp = aiocbe->userproc; 562 cb = &aiocbe->uaiocb; 563 564 mycp = curproc; 565 566 fdp = mycp->p_fd; 567 fd = cb->aio_fildes; 568 fp = fdp->fd_ofiles[fd]; 569 570 if ((fp == NULL) || (fp != aiocbe->fd_file)) { 571 cb->_aiocb_private.error = EBADF; 572 cb->_aiocb_private.status = -1; 573 return; 574 } 575 576 aiov.iov_base = (void *)cb->aio_buf; 577 aiov.iov_len = cb->aio_nbytes; 578 579 auio.uio_iov = &aiov; 580 auio.uio_iovcnt = 1; 581 auio.uio_offset = offset = cb->aio_offset; 582 auio.uio_resid = cb->aio_nbytes; 583 cnt = cb->aio_nbytes; 584 auio.uio_segflg = UIO_USERSPACE; 585 auio.uio_procp = mycp; 586 587 inblock_st = mycp->p_stats->p_ru.ru_inblock; 588 oublock_st = mycp->p_stats->p_ru.ru_oublock; 589 if (cb->aio_lio_opcode == LIO_READ) { 590 auio.uio_rw = UIO_READ; 591 error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, mycp); 592 } else { 593 auio.uio_rw = UIO_WRITE; 594 error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, mycp); 595 } 596 inblock_end = mycp->p_stats->p_ru.ru_inblock; 597 oublock_end = mycp->p_stats->p_ru.ru_oublock; 598 599 aiocbe->inputcharge = inblock_end - inblock_st; 600 aiocbe->outputcharge = oublock_end - oublock_st; 601 602 if ((error) && (auio.uio_resid != cnt)) { 603 if (error == ERESTART || error == EINTR || error == EWOULDBLOCK) 604 error = 0; 605 if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) 606 psignal(userp, SIGPIPE); 607 } 608 609 cnt -= auio.uio_resid; 610 cb->_aiocb_private.error = error; 611 cb->_aiocb_private.status = cnt; 612 613 return; 614 } 615 616 /* 617 * The AIO daemon, most of the actual work is done in aio_process, 618 * but the setup (and address space mgmt) is done in this routine. 619 */ 620 static void 621 aio_daemon(void *uproc) 622 { 623 int s; 624 struct aio_liojob *lj; 625 struct aiocb *cb; 626 struct aiocblist *aiocbe; 627 struct aioproclist *aiop; 628 struct kaioinfo *ki; 629 struct proc *curcp, *mycp, *userp; 630 struct vmspace *myvm, *tmpvm; 631 632 /* 633 * Local copies of curproc (cp) and vmspace (myvm) 634 */ 635 mycp = curproc; 636 myvm = mycp->p_vmspace; 637 638 if (mycp->p_textvp) { 639 vrele(mycp->p_textvp); 640 mycp->p_textvp = NULL; 641 } 642 643 /* 644 * Allocate and ready the aio control info. 
There is one aiop structure
 * per daemon.
 */
	aiop = zalloc(aiop_zone);
	aiop->aioproc = mycp;
	aiop->aioprocflags |= AIOP_FREE;
	TAILQ_INIT(&aiop->jobtorun);

	s = splnet();

	/*
	 * Place thread (lightweight process) onto the AIO free thread list.
	 */
	if (TAILQ_EMPTY(&aio_freeproc))
		wakeup(&aio_freeproc);
	TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);

	splx(s);

	/* Make up a name for the daemon. */
	strcpy(mycp->p_comm, "aiod");

	/*
	 * Get rid of our current file descriptors.  AIODs don't need any
	 * file descriptors, except as temporarily inherited from the client.
	 * Credentials are also cloned, and made equivalent to "root".
	 */
	fdfree(mycp);
	mycp->p_fd = NULL;
	mycp->p_ucred = crcopy(mycp->p_ucred);
	mycp->p_ucred->cr_uid = 0;
	mycp->p_ucred->cr_ngroups = 1;
	mycp->p_ucred->cr_groups[0] = 1;

	/* The daemon resides in its own pgrp. */
	enterpgrp(mycp, mycp->p_pid, 1);

	/* Mark special process type. */
	mycp->p_flag |= P_SYSTEM | P_KTHREADP;

	/*
	 * Wake up the parent process.  (The parent sleeps to keep from
	 * blasting away, creating too many daemons.)
	 */
	wakeup(mycp);

	for (;;) {
		/*
		 * curcp is the current daemon process context.
		 * userp is the current user process context.
		 */
		curcp = mycp;

		/*
		 * Take daemon off of free queue
		 */
		if (aiop->aioprocflags & AIOP_FREE) {
			s = splnet();
			TAILQ_REMOVE(&aio_freeproc, aiop, list);
			TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
			aiop->aioprocflags &= ~AIOP_FREE;
			splx(s);
		}
		aiop->aioprocflags &= ~AIOP_SCHED;

		/*
		 * Check for jobs.
		 */
		while ((aiocbe = aio_selectjob(aiop)) != NULL) {
			cb = &aiocbe->uaiocb;
			userp = aiocbe->userproc;

			aiocbe->jobstate = JOBST_JOBRUNNING;

			/*
			 * Connect to process address space for user program.
			 */
			if (userp != curcp) {
				/*
				 * Save the current address space that we are
				 * connected to.
				 */
				tmpvm = mycp->p_vmspace;

				/*
				 * Point to the new user address space, and
				 * refer to it.
				 */
				mycp->p_vmspace = userp->p_vmspace;
				mycp->p_vmspace->vm_refcnt++;

				/* Activate the new mapping. */
				pmap_activate(mycp);

				/*
				 * If the old address space wasn't the daemon's
				 * own address space, then we need to remove the
				 * daemon's reference from the other process
				 * that it was acting on behalf of.
				 */
				if (tmpvm != myvm) {
					vmspace_free(tmpvm);
				}

				/*
				 * Disassociate from the previous client's file
				 * descriptors, and associate to the new
				 * client's descriptors.  Note that the daemon
				 * doesn't need to worry about its original
				 * descriptors, because they were originally
				 * freed.
				 */
				if (mycp->p_fd)
					fdfree(mycp);
				mycp->p_fd = fdshare(userp);
				curcp = userp;
			}

			ki = userp->p_aioinfo;
			lj = aiocbe->lio;

			/* Account for currently active jobs. */
			ki->kaio_active_count++;

			/* Do the I/O function. */
			aiocbe->jobaioproc = aiop;
			aio_process(aiocbe);

			/* Decrement the active job count. */
			ki->kaio_active_count--;

			/*
			 * Increment the completion count for wakeup/signal
			 * comparisons.
777 */ 778 aiocbe->jobflags |= AIOCBLIST_DONE; 779 ki->kaio_queue_finished_count++; 780 if (lj) 781 lj->lioj_queue_finished_count++; 782 if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags 783 & KAIO_RUNDOWN) && (ki->kaio_active_count == 0))) { 784 ki->kaio_flags &= ~KAIO_WAKEUP; 785 wakeup(userp); 786 } 787 788 s = splbio(); 789 if (lj && (lj->lioj_flags & 790 (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL) { 791 if ((lj->lioj_queue_finished_count == 792 lj->lioj_queue_count) && 793 (lj->lioj_buffer_finished_count == 794 lj->lioj_buffer_count)) { 795 psignal(userp, 796 lj->lioj_signal.sigev_signo); 797 lj->lioj_flags |= 798 LIOJ_SIGNAL_POSTED; 799 } 800 } 801 splx(s); 802 803 aiocbe->jobstate = JOBST_JOBFINISHED; 804 805 /* 806 * If the I/O request should be automatically rundown, 807 * do the needed cleanup. Otherwise, place the queue 808 * entry for the just finished I/O request into the done 809 * queue for the associated client. 810 */ 811 s = splnet(); 812 if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) { 813 aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE; 814 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); 815 } else { 816 TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist); 817 TAILQ_INSERT_TAIL(&ki->kaio_jobdone, aiocbe, 818 plist); 819 } 820 splx(s); 821 KNOTE(&aiocbe->klist, 0); 822 823 if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) { 824 wakeup(aiocbe); 825 aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN; 826 } 827 828 if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) { 829 psignal(userp, cb->aio_sigevent.sigev_signo); 830 } 831 } 832 833 /* 834 * Disconnect from user address space. 835 */ 836 if (curcp != mycp) { 837 /* Get the user address space to disconnect from. */ 838 tmpvm = mycp->p_vmspace; 839 840 /* Get original address space for daemon. */ 841 mycp->p_vmspace = myvm; 842 843 /* Activate the daemon's address space. */ 844 pmap_activate(mycp); 845 #ifdef DIAGNOSTIC 846 if (tmpvm == myvm) { 847 printf("AIOD: vmspace problem -- %d\n", 848 mycp->p_pid); 849 } 850 #endif 851 /* Remove our vmspace reference. */ 852 vmspace_free(tmpvm); 853 854 /* 855 * Disassociate from the user process's file 856 * descriptors. 857 */ 858 if (mycp->p_fd) 859 fdfree(mycp); 860 mycp->p_fd = NULL; 861 curcp = mycp; 862 } 863 864 /* 865 * If we are the first to be put onto the free queue, wakeup 866 * anyone waiting for a daemon. 867 */ 868 s = splnet(); 869 TAILQ_REMOVE(&aio_activeproc, aiop, list); 870 if (TAILQ_EMPTY(&aio_freeproc)) 871 wakeup(&aio_freeproc); 872 TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list); 873 aiop->aioprocflags |= AIOP_FREE; 874 splx(s); 875 876 /* 877 * If daemon is inactive for a long time, allow it to exit, 878 * thereby freeing resources. 879 */ 880 if (((aiop->aioprocflags & AIOP_SCHED) == 0) && tsleep(mycp, 881 PRIBIO, "aiordy", aiod_lifetime)) { 882 s = splnet(); 883 if ((TAILQ_FIRST(&aio_jobs) == NULL) && 884 (TAILQ_FIRST(&aiop->jobtorun) == NULL)) { 885 if ((aiop->aioprocflags & AIOP_FREE) && 886 (num_aio_procs > target_aio_procs)) { 887 TAILQ_REMOVE(&aio_freeproc, aiop, list); 888 splx(s); 889 zfree(aiop_zone, aiop); 890 num_aio_procs--; 891 #ifdef DIAGNOSTIC 892 if (mycp->p_vmspace->vm_refcnt <= 1) { 893 printf("AIOD: bad vm refcnt for" 894 " exiting daemon: %d\n", 895 mycp->p_vmspace->vm_refcnt); 896 } 897 #endif 898 exit1(mycp, 0); 899 } 900 } 901 splx(s); 902 } 903 } 904 } 905 906 /* 907 * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The 908 * AIO daemon modifies its environment itself. 
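 *
 * The size of the daemon pool is bounded by the vfs.aio sysctls declared
 * near the top of this file.  For example, an administrator could raise the
 * limits at run time with (illustrative values only):
 *
 *	sysctl -w vfs.aio.max_aio_procs=64
 *	sysctl -w vfs.aio.target_aio_procs=8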
 */
static int
aio_newproc()
{
	int error;
	struct proc *p, *np;

	p = &proc0;
	error = fork1(p, RFPROC|RFMEM|RFNOWAIT, &np);
	if (error)
		return error;
	cpu_set_fork_handler(np, aio_daemon, curproc);

	/*
	 * Wait until daemon is started, but continue on just in case to
	 * handle error conditions.
	 */
	error = tsleep(np, PZERO, "aiosta", aiod_timeout);
	num_aio_procs++;

	return error;
}

/*
 * Try the high-performance physio method for eligible VCHR devices.  This
 * routine doesn't require the use of any additional threads, and has low
 * overhead.
 */
int
aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
{
	int error;
	struct aiocb *cb;
	struct file *fp;
	struct buf *bp;
	struct vnode *vp;
	struct kaioinfo *ki;
	struct filedesc *fdp;
	struct aio_liojob *lj;
	int fd;
	int s;
	int cnt, notify;

	cb = &aiocbe->uaiocb;
	fdp = p->p_fd;
	fd = cb->aio_fildes;
	fp = fdp->fd_ofiles[fd];

	if (fp->f_type != DTYPE_VNODE)
		return (-1);

	vp = (struct vnode *)fp->f_data;

	/*
	 * If it's not a disk, we don't want to return a positive error.
	 * It causes the aio code to not fall through to try the thread
	 * way when you're talking to a regular file.
	 */
	if (!vn_isdisk(vp, &error)) {
		if (error == ENOTBLK)
			return (-1);
		else
			return (error);
	}

	if (cb->aio_nbytes % vp->v_rdev->si_bsize_phys)
		return (-1);

	if ((cb->aio_nbytes > MAXPHYS) && (num_buf_aio >= max_buf_aio))
		return (-1);

	ki = p->p_aioinfo;
	if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
		return (-1);

	cnt = cb->aio_nbytes;
	if (cnt > MAXPHYS)
		return (-1);

	/*
	 * Physical I/O is charged directly to the process, so we don't have to
	 * fake it.
	 */
	aiocbe->inputcharge = 0;
	aiocbe->outputcharge = 0;

	ki->kaio_buffer_count++;

	lj = aiocbe->lio;
	if (lj)
		lj->lioj_buffer_count++;

	/* Create and build a buffer header for a transfer. */
	bp = (struct buf *)getpbuf(NULL);

	/*
	 * Get a copy of the kva from the physical buffer.
	 */
	bp->b_caller1 = p;
	bp->b_dev = vp->v_rdev;
	error = bp->b_error = 0;

	bp->b_bcount = cb->aio_nbytes;
	bp->b_bufsize = cb->aio_nbytes;
	bp->b_flags = B_PHYS;
	bp->b_iodone = aio_physwakeup;
	bp->b_saveaddr = bp->b_data;
	bp->b_data = (void *)cb->aio_buf;
	bp->b_blkno = btodb(cb->aio_offset);

	if (cb->aio_lio_opcode == LIO_WRITE) {
		bp->b_iocmd = BIO_WRITE;
		if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_READ)) {
			error = EFAULT;
			goto doerror;
		}
	} else {
		bp->b_iocmd = BIO_READ;
		if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_WRITE)) {
			error = EFAULT;
			goto doerror;
		}
	}

	/* Bring buffer into kernel space. */
	vmapbuf(bp);

	s = splbio();
	aiocbe->bp = bp;
	bp->b_spc = (void *)aiocbe;
	TAILQ_INSERT_TAIL(&aio_bufjobs, aiocbe, list);
	TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
	aiocbe->jobstate = JOBST_JOBQBUF;
	cb->_aiocb_private.status = cb->aio_nbytes;
	num_buf_aio++;
	bp->b_error = 0;

	splx(s);

	/* Perform transfer.
*/ 1048 DEV_STRATEGY(bp, 0); 1049 1050 notify = 0; 1051 s = splbio(); 1052 1053 /* 1054 * If we had an error invoking the request, or an error in processing 1055 * the request before we have returned, we process it as an error in 1056 * transfer. Note that such an I/O error is not indicated immediately, 1057 * but is returned using the aio_error mechanism. In this case, 1058 * aio_suspend will return immediately. 1059 */ 1060 if (bp->b_error || (bp->b_ioflags & BIO_ERROR)) { 1061 struct aiocb *job = aiocbe->uuaiocb; 1062 1063 aiocbe->uaiocb._aiocb_private.status = 0; 1064 suword(&job->_aiocb_private.status, 0); 1065 aiocbe->uaiocb._aiocb_private.error = bp->b_error; 1066 suword(&job->_aiocb_private.error, bp->b_error); 1067 1068 ki->kaio_buffer_finished_count++; 1069 1070 if (aiocbe->jobstate != JOBST_JOBBFINISHED) { 1071 aiocbe->jobstate = JOBST_JOBBFINISHED; 1072 aiocbe->jobflags |= AIOCBLIST_DONE; 1073 TAILQ_REMOVE(&aio_bufjobs, aiocbe, list); 1074 TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist); 1075 TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist); 1076 notify = 1; 1077 } 1078 } 1079 splx(s); 1080 if (notify) 1081 KNOTE(&aiocbe->klist, 0); 1082 return 0; 1083 1084 doerror: 1085 ki->kaio_buffer_count--; 1086 if (lj) 1087 lj->lioj_buffer_count--; 1088 aiocbe->bp = NULL; 1089 relpbuf(bp, NULL); 1090 return error; 1091 } 1092 1093 /* 1094 * This waits/tests physio completion. 1095 */ 1096 int 1097 aio_fphysio(struct proc *p, struct aiocblist *iocb, int flgwait) 1098 { 1099 int s; 1100 struct buf *bp; 1101 int error; 1102 1103 bp = iocb->bp; 1104 1105 s = splbio(); 1106 if (flgwait == 0) { 1107 if ((bp->b_flags & B_DONE) == 0) { 1108 splx(s); 1109 return EINPROGRESS; 1110 } 1111 } 1112 1113 while ((bp->b_flags & B_DONE) == 0) { 1114 if (tsleep((caddr_t)bp, PRIBIO, "physstr", aiod_timeout)) { 1115 if ((bp->b_flags & B_DONE) == 0) { 1116 splx(s); 1117 return EINPROGRESS; 1118 } else 1119 break; 1120 } 1121 } 1122 1123 /* Release mapping into kernel space. */ 1124 vunmapbuf(bp); 1125 iocb->bp = 0; 1126 1127 error = 0; 1128 1129 /* Check for an error. */ 1130 if (bp->b_ioflags & BIO_ERROR) 1131 error = bp->b_error; 1132 1133 relpbuf(bp, NULL); 1134 return (error); 1135 } 1136 1137 /* 1138 * Wake up aio requests that may be serviceable now. 1139 */ 1140 void 1141 aio_swake(struct socket *so, struct sockbuf *sb) 1142 { 1143 struct aiocblist *cb,*cbn; 1144 struct proc *p; 1145 struct kaioinfo *ki = NULL; 1146 int opcode, wakecount = 0; 1147 struct aioproclist *aiop; 1148 1149 if (sb == &so->so_snd) { 1150 opcode = LIO_WRITE; 1151 so->so_snd.sb_flags &= ~SB_AIO; 1152 } else { 1153 opcode = LIO_READ; 1154 so->so_rcv.sb_flags &= ~SB_AIO; 1155 } 1156 1157 for (cb = TAILQ_FIRST(&so->so_aiojobq); cb; cb = cbn) { 1158 cbn = TAILQ_NEXT(cb, list); 1159 if (opcode == cb->uaiocb.aio_lio_opcode) { 1160 p = cb->userproc; 1161 ki = p->p_aioinfo; 1162 TAILQ_REMOVE(&so->so_aiojobq, cb, list); 1163 TAILQ_REMOVE(&ki->kaio_sockqueue, cb, plist); 1164 TAILQ_INSERT_TAIL(&aio_jobs, cb, list); 1165 TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, cb, plist); 1166 wakecount++; 1167 if (cb->jobstate != JOBST_JOBQGLOBAL) 1168 panic("invalid queue value"); 1169 } 1170 } 1171 1172 while (wakecount--) { 1173 if ((aiop = TAILQ_FIRST(&aio_freeproc)) != 0) { 1174 TAILQ_REMOVE(&aio_freeproc, aiop, list); 1175 TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list); 1176 aiop->aioprocflags &= ~AIOP_FREE; 1177 wakeup(aiop->aioproc); 1178 } 1179 } 1180 } 1181 1182 /* 1183 * Queue a new AIO request. 
Choosing either the threaded or direct physio VCHR 1184 * technique is done in this code. 1185 */ 1186 static int 1187 _aio_aqueue(struct proc *p, struct aiocb *job, struct aio_liojob *lj, int type) 1188 { 1189 struct filedesc *fdp; 1190 struct file *fp; 1191 unsigned int fd; 1192 struct socket *so; 1193 int s; 1194 int error = 0; 1195 int opcode; 1196 struct aiocblist *aiocbe; 1197 struct aioproclist *aiop; 1198 struct kaioinfo *ki; 1199 1200 if ((aiocbe = TAILQ_FIRST(&aio_freejobs)) != NULL) 1201 TAILQ_REMOVE(&aio_freejobs, aiocbe, list); 1202 else 1203 aiocbe = zalloc (aiocb_zone); 1204 1205 aiocbe->inputcharge = 0; 1206 aiocbe->outputcharge = 0; 1207 SLIST_INIT(&aiocbe->klist); 1208 1209 suword(&job->_aiocb_private.status, -1); 1210 suword(&job->_aiocb_private.error, 0); 1211 suword(&job->_aiocb_private.kernelinfo, -1); 1212 1213 error = copyin((caddr_t)job, (caddr_t) &aiocbe->uaiocb, sizeof 1214 aiocbe->uaiocb); 1215 if (error) { 1216 suword(&job->_aiocb_private.error, error); 1217 1218 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); 1219 return error; 1220 } 1221 1222 /* Save userspace address of the job info. */ 1223 aiocbe->uuaiocb = job; 1224 1225 /* Get the opcode. */ 1226 if (type != LIO_NOP) 1227 aiocbe->uaiocb.aio_lio_opcode = type; 1228 opcode = aiocbe->uaiocb.aio_lio_opcode; 1229 1230 /* Get the fd info for process. */ 1231 fdp = p->p_fd; 1232 1233 /* 1234 * Range check file descriptor. 1235 */ 1236 fd = aiocbe->uaiocb.aio_fildes; 1237 if (fd >= fdp->fd_nfiles) { 1238 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); 1239 if (type == 0) 1240 suword(&job->_aiocb_private.error, EBADF); 1241 return EBADF; 1242 } 1243 1244 fp = aiocbe->fd_file = fdp->fd_ofiles[fd]; 1245 if ((fp == NULL) || ((opcode == LIO_WRITE) && ((fp->f_flag & FWRITE) == 1246 0))) { 1247 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); 1248 if (type == 0) 1249 suword(&job->_aiocb_private.error, EBADF); 1250 return EBADF; 1251 } 1252 1253 if (aiocbe->uaiocb.aio_offset == -1LL) { 1254 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); 1255 if (type == 0) 1256 suword(&job->_aiocb_private.error, EINVAL); 1257 return EINVAL; 1258 } 1259 1260 error = suword(&job->_aiocb_private.kernelinfo, jobrefid); 1261 if (error) { 1262 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); 1263 if (type == 0) 1264 suword(&job->_aiocb_private.error, EINVAL); 1265 return error; 1266 } 1267 1268 aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jobrefid; 1269 if (jobrefid == LONG_MAX) 1270 jobrefid = 1; 1271 else 1272 jobrefid++; 1273 1274 if (opcode == LIO_NOP) { 1275 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); 1276 if (type == 0) { 1277 suword(&job->_aiocb_private.error, 0); 1278 suword(&job->_aiocb_private.status, 0); 1279 suword(&job->_aiocb_private.kernelinfo, 0); 1280 } 1281 return 0; 1282 } 1283 1284 if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) { 1285 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); 1286 if (type == 0) { 1287 suword(&job->_aiocb_private.status, 0); 1288 suword(&job->_aiocb_private.error, EINVAL); 1289 } 1290 return EINVAL; 1291 } 1292 1293 /* 1294 * XXX 1295 * Figure out how to do this properly. This currently won't 1296 * work on the alpha, since we're passing in a pointer via 1297 * aio_lio_opcode, which is an int. 
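	 *
	 * For illustration only: under this interim interface, a user program
	 * that wants kqueue notification smuggles a pointer to a struct kevent
	 * into the control block before queueing it.  kev.ident names the
	 * kqueue descriptor to notify; kq and acb below are assumed names, not
	 * part of this file:
	 *
	 *	struct kevent kev;
	 *
	 *	kev.ident = kq;
	 *	acb.aio_lio_opcode = (int)&kev;
	 *	aio_read(&acb);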
1298 */ 1299 { 1300 struct kevent kev, *kevp; 1301 struct kqueue *kq; 1302 1303 kevp = (struct kevent *)job->aio_lio_opcode; 1304 if (kevp == NULL) 1305 goto no_kqueue; 1306 1307 error = copyin((caddr_t)kevp, (caddr_t)&kev, sizeof(kev)); 1308 if (error) 1309 goto aqueue_fail; 1310 1311 if ((u_int)kev.ident >= fdp->fd_nfiles || 1312 (fp = fdp->fd_ofiles[kev.ident]) == NULL || 1313 (fp->f_type != DTYPE_KQUEUE)) { 1314 error = EBADF; 1315 goto aqueue_fail; 1316 } 1317 kq = (struct kqueue *)fp->f_data; 1318 kev.ident = (u_long)aiocbe; 1319 kev.filter = EVFILT_AIO; 1320 kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1; 1321 error = kqueue_register(kq, &kev, p); 1322 aqueue_fail: 1323 if (error) { 1324 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); 1325 if (type == 0) 1326 suword(&job->_aiocb_private.error, error); 1327 return (error); 1328 } 1329 no_kqueue: 1330 } 1331 1332 suword(&job->_aiocb_private.error, EINPROGRESS); 1333 aiocbe->uaiocb._aiocb_private.error = EINPROGRESS; 1334 aiocbe->userproc = p; 1335 aiocbe->jobflags = 0; 1336 aiocbe->lio = lj; 1337 ki = p->p_aioinfo; 1338 1339 if (fp->f_type == DTYPE_SOCKET) { 1340 /* 1341 * Alternate queueing for socket ops: Reach down into the 1342 * descriptor to get the socket data. Then check to see if the 1343 * socket is ready to be read or written (based on the requested 1344 * operation). 1345 * 1346 * If it is not ready for io, then queue the aiocbe on the 1347 * socket, and set the flags so we get a call when sbnotify() 1348 * happens. 1349 */ 1350 so = (struct socket *)fp->f_data; 1351 s = splnet(); 1352 if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode == 1353 LIO_WRITE) && (!sowriteable(so)))) { 1354 TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list); 1355 TAILQ_INSERT_TAIL(&ki->kaio_sockqueue, aiocbe, plist); 1356 if (opcode == LIO_READ) 1357 so->so_rcv.sb_flags |= SB_AIO; 1358 else 1359 so->so_snd.sb_flags |= SB_AIO; 1360 aiocbe->jobstate = JOBST_JOBQGLOBAL; /* XXX */ 1361 ki->kaio_queue_count++; 1362 num_queue_count++; 1363 splx(s); 1364 return 0; 1365 } 1366 splx(s); 1367 } 1368 1369 if ((error = aio_qphysio(p, aiocbe)) == 0) 1370 return 0; 1371 else if (error > 0) { 1372 suword(&job->_aiocb_private.status, 0); 1373 aiocbe->uaiocb._aiocb_private.error = error; 1374 suword(&job->_aiocb_private.error, error); 1375 return error; 1376 } 1377 1378 /* No buffer for daemon I/O. */ 1379 aiocbe->bp = NULL; 1380 1381 ki->kaio_queue_count++; 1382 if (lj) 1383 lj->lioj_queue_count++; 1384 s = splnet(); 1385 TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist); 1386 TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list); 1387 splx(s); 1388 aiocbe->jobstate = JOBST_JOBQGLOBAL; 1389 1390 num_queue_count++; 1391 error = 0; 1392 1393 /* 1394 * If we don't have a free AIO process, and we are below our quota, then 1395 * start one. Otherwise, depend on the subsequent I/O completions to 1396 * pick-up this job. If we don't sucessfully create the new process 1397 * (thread) due to resource issues, we return an error for now (EAGAIN), 1398 * which is likely not the correct thing to do. 
1399 */ 1400 retryproc: 1401 s = splnet(); 1402 if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) { 1403 TAILQ_REMOVE(&aio_freeproc, aiop, list); 1404 TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list); 1405 aiop->aioprocflags &= ~AIOP_FREE; 1406 wakeup(aiop->aioproc); 1407 } else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) && 1408 ((ki->kaio_active_count + num_aio_resv_start) < 1409 ki->kaio_maxactive_count)) { 1410 num_aio_resv_start++; 1411 if ((error = aio_newproc()) == 0) { 1412 num_aio_resv_start--; 1413 p->p_retval[0] = 0; 1414 goto retryproc; 1415 } 1416 num_aio_resv_start--; 1417 } 1418 splx(s); 1419 return error; 1420 } 1421 1422 /* 1423 * This routine queues an AIO request, checking for quotas. 1424 */ 1425 static int 1426 aio_aqueue(struct proc *p, struct aiocb *job, int type) 1427 { 1428 struct kaioinfo *ki; 1429 1430 if (p->p_aioinfo == NULL) 1431 aio_init_aioinfo(p); 1432 1433 if (num_queue_count >= max_queue_count) 1434 return EAGAIN; 1435 1436 ki = p->p_aioinfo; 1437 if (ki->kaio_queue_count >= ki->kaio_qallowed_count) 1438 return EAGAIN; 1439 1440 return _aio_aqueue(p, job, NULL, type); 1441 } 1442 1443 /* 1444 * Support the aio_return system call, as a side-effect, kernel resources are 1445 * released. 1446 */ 1447 int 1448 aio_return(struct proc *p, struct aio_return_args *uap) 1449 { 1450 #ifndef VFS_AIO 1451 return ENOSYS; 1452 #else 1453 int s; 1454 int jobref; 1455 struct aiocblist *cb, *ncb; 1456 struct aiocb *ujob; 1457 struct kaioinfo *ki; 1458 1459 ki = p->p_aioinfo; 1460 if (ki == NULL) 1461 return EINVAL; 1462 1463 ujob = uap->aiocbp; 1464 1465 jobref = fuword(&ujob->_aiocb_private.kernelinfo); 1466 if (jobref == -1 || jobref == 0) 1467 return EINVAL; 1468 1469 s = splnet(); 1470 for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb; cb = TAILQ_NEXT(cb, 1471 plist)) { 1472 if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == 1473 jobref) { 1474 splx(s); 1475 if (ujob == cb->uuaiocb) { 1476 p->p_retval[0] = 1477 cb->uaiocb._aiocb_private.status; 1478 } else 1479 p->p_retval[0] = EFAULT; 1480 if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) { 1481 curproc->p_stats->p_ru.ru_oublock += 1482 cb->outputcharge; 1483 cb->outputcharge = 0; 1484 } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) { 1485 curproc->p_stats->p_ru.ru_inblock += 1486 cb->inputcharge; 1487 cb->inputcharge = 0; 1488 } 1489 aio_free_entry(cb); 1490 return 0; 1491 } 1492 } 1493 splx(s); 1494 1495 s = splbio(); 1496 for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = ncb) { 1497 ncb = TAILQ_NEXT(cb, plist); 1498 if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) 1499 == jobref) { 1500 splx(s); 1501 if (ujob == cb->uuaiocb) { 1502 p->p_retval[0] = 1503 cb->uaiocb._aiocb_private.status; 1504 } else 1505 p->p_retval[0] = EFAULT; 1506 aio_free_entry(cb); 1507 return 0; 1508 } 1509 } 1510 splx(s); 1511 1512 return (EINVAL); 1513 #endif /* VFS_AIO */ 1514 } 1515 1516 /* 1517 * Allow a process to wakeup when any of the I/O requests are completed. 1518 */ 1519 int 1520 aio_suspend(struct proc *p, struct aio_suspend_args *uap) 1521 { 1522 #ifndef VFS_AIO 1523 return ENOSYS; 1524 #else 1525 struct timeval atv; 1526 struct timespec ts; 1527 struct aiocb *const *cbptr, *cbp; 1528 struct kaioinfo *ki; 1529 struct aiocblist *cb; 1530 int i; 1531 int njoblist; 1532 int error, s, timo; 1533 int *ijoblist; 1534 struct aiocb **ujoblist; 1535 1536 if (uap->nent >= AIO_LISTIO_MAX) 1537 return EINVAL; 1538 1539 timo = 0; 1540 if (uap->timeout) { 1541 /* Get timespec struct. 
*/ 1542 if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0) 1543 return error; 1544 1545 if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000) 1546 return (EINVAL); 1547 1548 TIMESPEC_TO_TIMEVAL(&atv, &ts); 1549 if (itimerfix(&atv)) 1550 return (EINVAL); 1551 timo = tvtohz(&atv); 1552 } 1553 1554 ki = p->p_aioinfo; 1555 if (ki == NULL) 1556 return EAGAIN; 1557 1558 njoblist = 0; 1559 ijoblist = zalloc(aiol_zone); 1560 ujoblist = zalloc(aiol_zone); 1561 cbptr = uap->aiocbp; 1562 1563 for (i = 0; i < uap->nent; i++) { 1564 cbp = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]); 1565 if (cbp == 0) 1566 continue; 1567 ujoblist[njoblist] = cbp; 1568 ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo); 1569 njoblist++; 1570 } 1571 1572 if (njoblist == 0) { 1573 zfree(aiol_zone, ijoblist); 1574 zfree(aiol_zone, ujoblist); 1575 return 0; 1576 } 1577 1578 error = 0; 1579 for (;;) { 1580 for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb; cb = 1581 TAILQ_NEXT(cb, plist)) { 1582 for (i = 0; i < njoblist; i++) { 1583 if (((intptr_t) 1584 cb->uaiocb._aiocb_private.kernelinfo) == 1585 ijoblist[i]) { 1586 if (ujoblist[i] != cb->uuaiocb) 1587 error = EINVAL; 1588 zfree(aiol_zone, ijoblist); 1589 zfree(aiol_zone, ujoblist); 1590 return error; 1591 } 1592 } 1593 } 1594 1595 s = splbio(); 1596 for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = 1597 TAILQ_NEXT(cb, plist)) { 1598 for (i = 0; i < njoblist; i++) { 1599 if (((intptr_t) 1600 cb->uaiocb._aiocb_private.kernelinfo) == 1601 ijoblist[i]) { 1602 splx(s); 1603 if (ujoblist[i] != cb->uuaiocb) 1604 error = EINVAL; 1605 zfree(aiol_zone, ijoblist); 1606 zfree(aiol_zone, ujoblist); 1607 return error; 1608 } 1609 } 1610 } 1611 1612 ki->kaio_flags |= KAIO_WAKEUP; 1613 error = tsleep(p, PRIBIO | PCATCH, "aiospn", timo); 1614 splx(s); 1615 1616 if (error == ERESTART || error == EINTR) { 1617 zfree(aiol_zone, ijoblist); 1618 zfree(aiol_zone, ujoblist); 1619 return EINTR; 1620 } else if (error == EWOULDBLOCK) { 1621 zfree(aiol_zone, ijoblist); 1622 zfree(aiol_zone, ujoblist); 1623 return EAGAIN; 1624 } 1625 } 1626 1627 /* NOTREACHED */ 1628 return EINVAL; 1629 #endif /* VFS_AIO */ 1630 } 1631 1632 /* 1633 * aio_cancel cancels any non-physio aio operations not currently in 1634 * progress. 
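 *
 * A sketch of the caller's side (fd and acb are assumed names, not part of
 * this file):
 *
 *	result = aio_cancel(fd, &acb);
 *
 * AIO_CANCELED means the request was dequeued before it ran, AIO_NOTCANCELED
 * means it is still in flight (poll it with aio_error()), and AIO_ALLDONE
 * means it had already completed and only needs to be reaped with
 * aio_return().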
1635 */ 1636 int 1637 aio_cancel(struct proc *p, struct aio_cancel_args *uap) 1638 { 1639 #ifndef VFS_AIO 1640 return ENOSYS; 1641 #else 1642 struct kaioinfo *ki; 1643 struct aiocblist *cbe, *cbn; 1644 struct file *fp; 1645 struct filedesc *fdp; 1646 struct socket *so; 1647 struct proc *po; 1648 int s,error; 1649 int cancelled=0; 1650 int notcancelled=0; 1651 struct vnode *vp; 1652 1653 fdp = p->p_fd; 1654 1655 fp = fdp->fd_ofiles[uap->fd]; 1656 1657 if (fp == NULL) { 1658 return EBADF; 1659 } 1660 1661 if (fp->f_type == DTYPE_VNODE) { 1662 vp = (struct vnode *)fp->f_data; 1663 1664 if (vn_isdisk(vp,&error)) { 1665 p->p_retval[0] = AIO_NOTCANCELED; 1666 return 0; 1667 } 1668 } else if (fp->f_type == DTYPE_SOCKET) { 1669 so = (struct socket *)fp->f_data; 1670 1671 s = splnet(); 1672 1673 for (cbe = TAILQ_FIRST(&so->so_aiojobq); cbe; cbe = cbn) { 1674 cbn = TAILQ_NEXT(cbe, list); 1675 if ((uap->aiocbp == NULL) || 1676 (uap->aiocbp == cbe->uuaiocb) ) { 1677 po = cbe->userproc; 1678 ki = po->p_aioinfo; 1679 TAILQ_REMOVE(&so->so_aiojobq, cbe, list); 1680 TAILQ_REMOVE(&ki->kaio_sockqueue, cbe, plist); 1681 TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe, plist); 1682 if (ki->kaio_flags & KAIO_WAKEUP) { 1683 wakeup(po); 1684 } 1685 cbe->jobstate = JOBST_JOBFINISHED; 1686 cbe->uaiocb._aiocb_private.status=-1; 1687 cbe->uaiocb._aiocb_private.error=ECANCELED; 1688 cancelled++; 1689 /* XXX cancelled, knote? */ 1690 if (cbe->uaiocb.aio_sigevent.sigev_notify == 1691 SIGEV_SIGNAL) 1692 psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo); 1693 if (uap->aiocbp) 1694 break; 1695 } 1696 } 1697 1698 splx(s); 1699 1700 if ((cancelled) && (uap->aiocbp)) { 1701 p->p_retval[0] = AIO_CANCELED; 1702 return 0; 1703 } 1704 1705 } 1706 1707 ki=p->p_aioinfo; 1708 1709 s = splnet(); 1710 1711 for (cbe = TAILQ_FIRST(&ki->kaio_jobqueue); cbe; cbe = cbn) { 1712 cbn = TAILQ_NEXT(cbe, plist); 1713 1714 if ((uap->fd == cbe->uaiocb.aio_fildes) && 1715 ((uap->aiocbp == NULL ) || 1716 (uap->aiocbp == cbe->uuaiocb))) { 1717 1718 if (cbe->jobstate == JOBST_JOBQGLOBAL) { 1719 TAILQ_REMOVE(&aio_jobs, cbe, list); 1720 TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist); 1721 TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe, 1722 plist); 1723 cancelled++; 1724 ki->kaio_queue_finished_count++; 1725 cbe->jobstate = JOBST_JOBFINISHED; 1726 cbe->uaiocb._aiocb_private.status = -1; 1727 cbe->uaiocb._aiocb_private.error = ECANCELED; 1728 /* XXX cancelled, knote? */ 1729 if (cbe->uaiocb.aio_sigevent.sigev_notify == 1730 SIGEV_SIGNAL) 1731 psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo); 1732 } else { 1733 notcancelled++; 1734 } 1735 } 1736 } 1737 1738 splx(s); 1739 1740 1741 if (notcancelled) { 1742 p->p_retval[0] = AIO_NOTCANCELED; 1743 return 0; 1744 } 1745 1746 if (cancelled) { 1747 p->p_retval[0] = AIO_CANCELED; 1748 return 0; 1749 } 1750 1751 p->p_retval[0] = AIO_ALLDONE; 1752 1753 return 0; 1754 #endif /* VFS_AIO */ 1755 } 1756 1757 /* 1758 * aio_error is implemented in the kernel level for compatibility purposes only. 1759 * For a user mode async implementation, it would be best to do it in a userland 1760 * subroutine. 
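 *
 * A minimal sketch of the usual userland polling sequence (fd, buf and the
 * local variable names are assumptions, not part of this file):
 *
 *	struct aiocb acb;
 *	int error;
 *	ssize_t nbytes;
 *
 *	bzero(&acb, sizeof(acb));
 *	acb.aio_fildes = fd;
 *	acb.aio_buf = buf;
 *	acb.aio_nbytes = sizeof(buf);
 *	acb.aio_offset = 0;
 *	aio_read(&acb);
 *	while ((error = aio_error(&acb)) == EINPROGRESS)
 *		;
 *	if (error == 0)
 *		nbytes = aio_return(&acb);
 *
 * aio_return() both reports the transfer count and lets the kernel release
 * the resources held for the request, as described for aio_return() above.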
1761 */ 1762 int 1763 aio_error(struct proc *p, struct aio_error_args *uap) 1764 { 1765 #ifndef VFS_AIO 1766 return ENOSYS; 1767 #else 1768 int s; 1769 struct aiocblist *cb; 1770 struct kaioinfo *ki; 1771 int jobref; 1772 1773 ki = p->p_aioinfo; 1774 if (ki == NULL) 1775 return EINVAL; 1776 1777 jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo); 1778 if ((jobref == -1) || (jobref == 0)) 1779 return EINVAL; 1780 1781 for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb; cb = TAILQ_NEXT(cb, 1782 plist)) { 1783 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == 1784 jobref) { 1785 p->p_retval[0] = cb->uaiocb._aiocb_private.error; 1786 return 0; 1787 } 1788 } 1789 1790 s = splnet(); 1791 1792 for (cb = TAILQ_FIRST(&ki->kaio_jobqueue); cb; cb = TAILQ_NEXT(cb, 1793 plist)) { 1794 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == 1795 jobref) { 1796 p->p_retval[0] = EINPROGRESS; 1797 splx(s); 1798 return 0; 1799 } 1800 } 1801 1802 for (cb = TAILQ_FIRST(&ki->kaio_sockqueue); cb; cb = TAILQ_NEXT(cb, 1803 plist)) { 1804 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == 1805 jobref) { 1806 p->p_retval[0] = EINPROGRESS; 1807 splx(s); 1808 return 0; 1809 } 1810 } 1811 splx(s); 1812 1813 s = splbio(); 1814 for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = TAILQ_NEXT(cb, 1815 plist)) { 1816 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == 1817 jobref) { 1818 p->p_retval[0] = cb->uaiocb._aiocb_private.error; 1819 splx(s); 1820 return 0; 1821 } 1822 } 1823 1824 for (cb = TAILQ_FIRST(&ki->kaio_bufqueue); cb; cb = TAILQ_NEXT(cb, 1825 plist)) { 1826 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == 1827 jobref) { 1828 p->p_retval[0] = EINPROGRESS; 1829 splx(s); 1830 return 0; 1831 } 1832 } 1833 splx(s); 1834 1835 #if (0) 1836 /* 1837 * Hack for lio. 1838 */ 1839 status = fuword(&uap->aiocbp->_aiocb_private.status); 1840 if (status == -1) 1841 return fuword(&uap->aiocbp->_aiocb_private.error); 1842 #endif 1843 return EINVAL; 1844 #endif /* VFS_AIO */ 1845 } 1846 1847 int 1848 aio_read(struct proc *p, struct aio_read_args *uap) 1849 { 1850 #ifndef VFS_AIO 1851 return ENOSYS; 1852 #else 1853 struct filedesc *fdp; 1854 struct file *fp; 1855 struct uio auio; 1856 struct iovec aiov; 1857 unsigned int fd; 1858 int cnt; 1859 struct aiocb iocb; 1860 int error, pmodes; 1861 1862 pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes); 1863 if ((pmodes & AIO_PMODE_SYNC) == 0) 1864 return aio_aqueue(p, (struct aiocb *)uap->aiocbp, LIO_READ); 1865 1866 /* Get control block. */ 1867 if ((error = copyin((caddr_t)uap->aiocbp, (caddr_t)&iocb, sizeof iocb)) 1868 != 0) 1869 return error; 1870 1871 /* Get the fd info for process. */ 1872 fdp = p->p_fd; 1873 1874 /* 1875 * Range check file descriptor. 1876 */ 1877 fd = iocb.aio_fildes; 1878 if (fd >= fdp->fd_nfiles) 1879 return EBADF; 1880 fp = fdp->fd_ofiles[fd]; 1881 if ((fp == NULL) || ((fp->f_flag & FREAD) == 0)) 1882 return EBADF; 1883 if (iocb.aio_offset == -1LL) 1884 return EINVAL; 1885 1886 auio.uio_resid = iocb.aio_nbytes; 1887 if (auio.uio_resid < 0) 1888 return (EINVAL); 1889 1890 /* 1891 * Process sync simply -- queue async request. 
1892 */ 1893 if ((iocb._aiocb_private.privatemodes & AIO_PMODE_SYNC) == 0) 1894 return aio_aqueue(p, (struct aiocb *)uap->aiocbp, LIO_READ); 1895 1896 aiov.iov_base = (void *)iocb.aio_buf; 1897 aiov.iov_len = iocb.aio_nbytes; 1898 1899 auio.uio_iov = &aiov; 1900 auio.uio_iovcnt = 1; 1901 auio.uio_offset = iocb.aio_offset; 1902 auio.uio_rw = UIO_READ; 1903 auio.uio_segflg = UIO_USERSPACE; 1904 auio.uio_procp = p; 1905 1906 cnt = iocb.aio_nbytes; 1907 error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, p); 1908 if (error && (auio.uio_resid != cnt) && (error == ERESTART || error == 1909 EINTR || error == EWOULDBLOCK)) 1910 error = 0; 1911 cnt -= auio.uio_resid; 1912 p->p_retval[0] = cnt; 1913 return error; 1914 #endif /* VFS_AIO */ 1915 } 1916 1917 int 1918 aio_write(struct proc *p, struct aio_write_args *uap) 1919 { 1920 #ifndef VFS_AIO 1921 return ENOSYS; 1922 #else 1923 struct filedesc *fdp; 1924 struct file *fp; 1925 struct uio auio; 1926 struct iovec aiov; 1927 unsigned int fd; 1928 int cnt; 1929 struct aiocb iocb; 1930 int error; 1931 int pmodes; 1932 1933 /* 1934 * Process sync simply -- queue async request. 1935 */ 1936 pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes); 1937 if ((pmodes & AIO_PMODE_SYNC) == 0) 1938 return aio_aqueue(p, (struct aiocb *)uap->aiocbp, LIO_WRITE); 1939 1940 if ((error = copyin((caddr_t)uap->aiocbp, (caddr_t)&iocb, sizeof iocb)) 1941 != 0) 1942 return error; 1943 1944 /* Get the fd info for process. */ 1945 fdp = p->p_fd; 1946 1947 /* 1948 * Range check file descriptor. 1949 */ 1950 fd = iocb.aio_fildes; 1951 if (fd >= fdp->fd_nfiles) 1952 return EBADF; 1953 fp = fdp->fd_ofiles[fd]; 1954 if ((fp == NULL) || ((fp->f_flag & FWRITE) == 0)) 1955 return EBADF; 1956 if (iocb.aio_offset == -1LL) 1957 return EINVAL; 1958 1959 aiov.iov_base = (void *)iocb.aio_buf; 1960 aiov.iov_len = iocb.aio_nbytes; 1961 auio.uio_iov = &aiov; 1962 auio.uio_iovcnt = 1; 1963 auio.uio_offset = iocb.aio_offset; 1964 1965 auio.uio_resid = iocb.aio_nbytes; 1966 if (auio.uio_resid < 0) 1967 return (EINVAL); 1968 1969 auio.uio_rw = UIO_WRITE; 1970 auio.uio_segflg = UIO_USERSPACE; 1971 auio.uio_procp = p; 1972 1973 cnt = iocb.aio_nbytes; 1974 error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, p); 1975 if (error) { 1976 if (auio.uio_resid != cnt) { 1977 if (error == ERESTART || error == EINTR || error == 1978 EWOULDBLOCK) 1979 error = 0; 1980 if (error == EPIPE) 1981 psignal(p, SIGPIPE); 1982 } 1983 } 1984 cnt -= auio.uio_resid; 1985 p->p_retval[0] = cnt; 1986 return error; 1987 #endif /* VFS_AIO */ 1988 } 1989 1990 int 1991 lio_listio(struct proc *p, struct lio_listio_args *uap) 1992 { 1993 #ifndef VFS_AIO 1994 return ENOSYS; 1995 #else 1996 int nent, nentqueued; 1997 struct aiocb *iocb, * const *cbptr; 1998 struct aiocblist *cb; 1999 struct kaioinfo *ki; 2000 struct aio_liojob *lj; 2001 int error, runningcode; 2002 int nerror; 2003 int i; 2004 int s; 2005 2006 if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) 2007 return EINVAL; 2008 2009 nent = uap->nent; 2010 if (nent > AIO_LISTIO_MAX) 2011 return EINVAL; 2012 2013 if (p->p_aioinfo == NULL) 2014 aio_init_aioinfo(p); 2015 2016 if ((nent + num_queue_count) > max_queue_count) 2017 return EAGAIN; 2018 2019 ki = p->p_aioinfo; 2020 if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count) 2021 return EAGAIN; 2022 2023 lj = zalloc(aiolio_zone); 2024 if (!lj) 2025 return EAGAIN; 2026 2027 lj->lioj_flags = 0; 2028 lj->lioj_buffer_count = 0; 2029 lj->lioj_buffer_finished_count = 0; 2030 lj->lioj_queue_count = 0; 2031 
lj->lioj_queue_finished_count = 0; 2032 lj->lioj_ki = ki; 2033 TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list); 2034 2035 /* 2036 * Setup signal. 2037 */ 2038 if (uap->sig && (uap->mode == LIO_NOWAIT)) { 2039 error = copyin(uap->sig, &lj->lioj_signal, 2040 sizeof(lj->lioj_signal)); 2041 if (error) 2042 return error; 2043 lj->lioj_flags |= LIOJ_SIGNAL; 2044 lj->lioj_flags &= ~LIOJ_SIGNAL_POSTED; 2045 } else 2046 lj->lioj_flags &= ~LIOJ_SIGNAL; 2047 2048 /* 2049 * Get pointers to the list of I/O requests. 2050 */ 2051 nerror = 0; 2052 nentqueued = 0; 2053 cbptr = uap->acb_list; 2054 for (i = 0; i < uap->nent; i++) { 2055 iocb = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]); 2056 if (((intptr_t)iocb != -1) && ((intptr_t)iocb != NULL)) { 2057 error = _aio_aqueue(p, iocb, lj, 0); 2058 if (error == 0) 2059 nentqueued++; 2060 else 2061 nerror++; 2062 } 2063 } 2064 2065 /* 2066 * If we haven't queued any, then just return error. 2067 */ 2068 if (nentqueued == 0) 2069 return 0; 2070 2071 /* 2072 * Calculate the appropriate error return. 2073 */ 2074 runningcode = 0; 2075 if (nerror) 2076 runningcode = EIO; 2077 2078 if (uap->mode == LIO_WAIT) { 2079 int command, found, jobref; 2080 2081 for (;;) { 2082 found = 0; 2083 for (i = 0; i < uap->nent; i++) { 2084 /* 2085 * Fetch address of the control buf pointer in 2086 * user space. 2087 */ 2088 iocb = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]); 2089 if (((intptr_t)iocb == -1) || ((intptr_t)iocb 2090 == 0)) 2091 continue; 2092 2093 /* 2094 * Fetch the associated command from user space. 2095 */ 2096 command = fuword(&iocb->aio_lio_opcode); 2097 if (command == LIO_NOP) { 2098 found++; 2099 continue; 2100 } 2101 2102 jobref = fuword(&iocb->_aiocb_private.kernelinfo); 2103 2104 for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb; 2105 cb = TAILQ_NEXT(cb, plist)) { 2106 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) 2107 == jobref) { 2108 if (cb->uaiocb.aio_lio_opcode 2109 == LIO_WRITE) { 2110 curproc->p_stats->p_ru.ru_oublock 2111 += 2112 cb->outputcharge; 2113 cb->outputcharge = 0; 2114 } else if (cb->uaiocb.aio_lio_opcode 2115 == LIO_READ) { 2116 curproc->p_stats->p_ru.ru_inblock 2117 += cb->inputcharge; 2118 cb->inputcharge = 0; 2119 } 2120 found++; 2121 break; 2122 } 2123 } 2124 2125 s = splbio(); 2126 for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; 2127 cb = TAILQ_NEXT(cb, plist)) { 2128 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) 2129 == jobref) { 2130 found++; 2131 break; 2132 } 2133 } 2134 splx(s); 2135 } 2136 2137 /* 2138 * If all I/Os have been disposed of, then we can 2139 * return. 2140 */ 2141 if (found == nentqueued) 2142 return runningcode; 2143 2144 ki->kaio_flags |= KAIO_WAKEUP; 2145 error = tsleep(p, PRIBIO | PCATCH, "aiospn", 0); 2146 2147 if (error == EINTR) 2148 return EINTR; 2149 else if (error == EWOULDBLOCK) 2150 return EAGAIN; 2151 } 2152 } 2153 2154 return runningcode; 2155 #endif /* VFS_AIO */ 2156 } 2157 2158 /* 2159 * This is a wierd hack so that we can post a signal. It is safe to do so from 2160 * a timeout routine, but *not* from an interrupt routine. 
2161 */ 2162 static void 2163 process_signal(void *aioj) 2164 { 2165 struct aiocblist *aiocbe = aioj; 2166 struct aio_liojob *lj = aiocbe->lio; 2167 struct aiocb *cb = &aiocbe->uaiocb; 2168 2169 if ((lj) && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) && 2170 (lj->lioj_queue_count == lj->lioj_queue_finished_count)) { 2171 psignal(lj->lioj_ki->kaio_p, lj->lioj_signal.sigev_signo); 2172 lj->lioj_flags |= LIOJ_SIGNAL_POSTED; 2173 } 2174 2175 if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) 2176 psignal(aiocbe->userproc, cb->aio_sigevent.sigev_signo); 2177 } 2178 2179 /* 2180 * Interrupt handler for physio, performs the necessary process wakeups, and 2181 * signals. 2182 */ 2183 static void 2184 aio_physwakeup(struct buf *bp) 2185 { 2186 struct aiocblist *aiocbe; 2187 struct proc *p; 2188 struct kaioinfo *ki; 2189 struct aio_liojob *lj; 2190 int s; 2191 s = splbio(); 2192 2193 wakeup((caddr_t)bp); 2194 bp->b_flags |= B_DONE; 2195 2196 aiocbe = (struct aiocblist *)bp->b_spc; 2197 if (aiocbe) { 2198 p = bp->b_caller1; 2199 2200 aiocbe->jobstate = JOBST_JOBBFINISHED; 2201 aiocbe->uaiocb._aiocb_private.status -= bp->b_resid; 2202 aiocbe->uaiocb._aiocb_private.error = 0; 2203 aiocbe->jobflags |= AIOCBLIST_DONE; 2204 2205 if (bp->b_ioflags & BIO_ERROR) 2206 aiocbe->uaiocb._aiocb_private.error = bp->b_error; 2207 2208 lj = aiocbe->lio; 2209 if (lj) { 2210 lj->lioj_buffer_finished_count++; 2211 2212 /* 2213 * wakeup/signal if all of the interrupt jobs are done. 2214 */ 2215 if (lj->lioj_buffer_finished_count == 2216 lj->lioj_buffer_count) { 2217 /* 2218 * Post a signal if it is called for. 2219 */ 2220 if ((lj->lioj_flags & 2221 (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == 2222 LIOJ_SIGNAL) { 2223 lj->lioj_flags |= LIOJ_SIGNAL_POSTED; 2224 timeout(process_signal, aiocbe, 0); 2225 } 2226 } 2227 } 2228 2229 ki = p->p_aioinfo; 2230 if (ki) { 2231 ki->kaio_buffer_finished_count++; 2232 TAILQ_REMOVE(&aio_bufjobs, aiocbe, list); 2233 TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist); 2234 TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist); 2235 2236 KNOTE(&aiocbe->klist, 0); 2237 /* Do the wakeup. */ 2238 if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) { 2239 ki->kaio_flags &= ~KAIO_WAKEUP; 2240 wakeup(p); 2241 } 2242 } 2243 2244 if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL) 2245 timeout(process_signal, aiocbe, 0); 2246 } 2247 splx(s); 2248 } 2249 2250 int 2251 aio_waitcomplete(struct proc *p, struct aio_waitcomplete_args *uap) 2252 { 2253 #ifndef VFS_AIO 2254 return ENOSYS; 2255 #else 2256 struct timeval atv; 2257 struct timespec ts; 2258 struct aiocb **cbptr; 2259 struct kaioinfo *ki; 2260 struct aiocblist *cb = NULL; 2261 int error, s, timo; 2262 2263 suword(uap->aiocbp, (int)NULL); 2264 2265 timo = 0; 2266 if (uap->timeout) { 2267 /* Get timespec struct. 
*/ 2268 error = copyin((caddr_t)uap->timeout, (caddr_t)&ts, 2269 sizeof(ts)); 2270 if (error) 2271 return error; 2272 2273 if ((ts.tv_nsec < 0) || (ts.tv_nsec >= 1000000000)) 2274 return (EINVAL); 2275 2276 TIMESPEC_TO_TIMEVAL(&atv, &ts); 2277 if (itimerfix(&atv)) 2278 return (EINVAL); 2279 timo = tvtohz(&atv); 2280 } 2281 2282 ki = p->p_aioinfo; 2283 if (ki == NULL) 2284 return EAGAIN; 2285 2286 cbptr = uap->aiocbp; 2287 2288 for (;;) { 2289 if ((cb = TAILQ_FIRST(&ki->kaio_jobdone)) != 0) { 2290 suword(uap->aiocbp, (int)cb->uuaiocb); 2291 p->p_retval[0] = cb->uaiocb._aiocb_private.status; 2292 if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) { 2293 curproc->p_stats->p_ru.ru_oublock += 2294 cb->outputcharge; 2295 cb->outputcharge = 0; 2296 } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) { 2297 curproc->p_stats->p_ru.ru_inblock += 2298 cb->inputcharge; 2299 cb->inputcharge = 0; 2300 } 2301 aio_free_entry(cb); 2302 return cb->uaiocb._aiocb_private.error; 2303 } 2304 2305 s = splbio(); 2306 if ((cb = TAILQ_FIRST(&ki->kaio_bufdone)) != 0 ) { 2307 splx(s); 2308 suword(uap->aiocbp, (int)cb->uuaiocb); 2309 p->p_retval[0] = cb->uaiocb._aiocb_private.status; 2310 aio_free_entry(cb); 2311 return cb->uaiocb._aiocb_private.error; 2312 } 2313 2314 ki->kaio_flags |= KAIO_WAKEUP; 2315 error = tsleep(p, PRIBIO | PCATCH, "aiowc", timo); 2316 splx(s); 2317 2318 if (error == ERESTART) 2319 return EINTR; 2320 else if (error < 0) 2321 return error; 2322 else if (error == EINTR) 2323 return EINTR; 2324 else if (error == EWOULDBLOCK) 2325 return EAGAIN; 2326 } 2327 #endif /* VFS_AIO */ 2328 } 2329 2330 static int 2331 filt_aioattach(struct knote *kn) 2332 { 2333 struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id; 2334 2335 /* 2336 * The aiocbe pointer must be validated before using it, so 2337 * registration is restricted to the kernel; the user cannot 2338 * set EV_FLAG1. 2339 */ 2340 if ((kn->kn_flags & EV_FLAG1) == 0) 2341 return (EPERM); 2342 kn->kn_flags &= ~EV_FLAG1; 2343 2344 SLIST_INSERT_HEAD(&aiocbe->klist, kn, kn_selnext); 2345 2346 return (0); 2347 } 2348 2349 static void 2350 filt_aiodetach(struct knote *kn) 2351 { 2352 struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id; 2353 int s = splhigh(); /* XXX no clue, so overkill */ 2354 2355 SLIST_REMOVE(&aiocbe->klist, kn, knote, kn_selnext); 2356 splx(s); 2357 } 2358 2359 /*ARGSUSED*/ 2360 static int 2361 filt_aio(struct knote *kn, long hint) 2362 { 2363 struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id; 2364 2365 kn->kn_data = 0; /* XXX data returned? */ 2366 if (aiocbe->jobstate != JOBST_JOBFINISHED) 2367 return (0); 2368 kn->kn_flags |= EV_EOF; 2369 return (1); 2370 } 2371
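
/*
 * Illustrative userland example (not part of the original file): queue a
 * small batch with lio_listio(2) and ask for a single signal once every
 * request in the batch has completed, which exercises the LIOJ_SIGNAL path
 * implemented above.  fd, bufs and BUFSIZE are assumed names.
 *
 *	struct aiocb acbs[2], *list[2];
 *	struct sigevent sev;
 *	int i;
 *
 *	bzero(acbs, sizeof(acbs));
 *	for (i = 0; i < 2; i++) {
 *		acbs[i].aio_fildes = fd;
 *		acbs[i].aio_buf = bufs[i];
 *		acbs[i].aio_nbytes = BUFSIZE;
 *		acbs[i].aio_offset = i * BUFSIZE;
 *		acbs[i].aio_lio_opcode = LIO_READ;
 *		list[i] = &acbs[i];
 *	}
 *	sev.sigev_notify = SIGEV_SIGNAL;
 *	sev.sigev_signo = SIGUSR1;
 *	lio_listio(LIO_NOWAIT, list, 2, &sev);
 *
 * Once SIGUSR1 arrives, each request is reaped individually with
 * aio_return().
 */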