/*
 * Copyright (c) 1997 John S. Dyson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. John S. Dyson's name may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * DISCLAIMER:  This code isn't warranted to do anything useful.  Anything
 * bad that happens because of using this software isn't the responsibility
 * of the author.  This software is distributed AS-IS.
 *
 * $FreeBSD$
 */

/*
 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/lock.h>
#include <sys/unistd.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/protosw.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <sys/event.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_zone.h>
#include <sys/aio.h>

#include <machine/limits.h>
#include "opt_vfs_aio.h"

/*
 * Monotonic counter used to hand each queued request a unique kernel
 * identifier (stored in _aiocb_private.kernelinfo); wraps back to 1
 * at LONG_MAX.  Set to 1 in aio_onceonly().
 */
static long jobrefid;

/*
 * Job states (aiocblist jobstate field): which queue, if any, the AIO
 * control block currently lives on, and whether it has completed.
 */
#define JOBST_NULL		0x0	/* on the free pool / not queued */
#define JOBST_JOBQPROC		0x1	/* on a specific daemon's jobtorun queue */
#define JOBST_JOBQGLOBAL	0x2	/* on the global aio_jobs queue */
#define JOBST_JOBRUNNING	0x3	/* currently being serviced by a daemon */
#define JOBST_JOBFINISHED	0x4	/* done; on per-process jobdone queue */
#define JOBST_JOBQBUF		0x5	/* physio in flight (buffer path) */
#define JOBST_JOBBFINISHED	0x6	/* physio done; on per-process bufdone queue */

/*
 * Compile-time tunables.  Each can be overridden from the kernel config
 * via opt_vfs_aio.h, and most are also exported as vfs.aio sysctls below.
 */
#ifndef MAX_AIO_PER_PROC
#define MAX_AIO_PER_PROC	32	/* max concurrently active AIOs per process */
#endif

#ifndef MAX_AIO_QUEUE_PER_PROC
#define MAX_AIO_QUEUE_PER_PROC	256 /* Bigger than AIO_LISTIO_MAX */
#endif

#ifndef MAX_AIO_PROCS
#define MAX_AIO_PROCS		32	/* hard cap on AIO daemon threads */
#endif

#ifndef MAX_AIO_QUEUE
#define MAX_AIO_QUEUE		1024 /* Bigger than AIO_LISTIO_MAX */
#endif

#ifndef TARGET_AIO_PROCS
#define TARGET_AIO_PROCS	4	/* daemons kept alive even when idle */
#endif

#ifndef MAX_BUF_AIO
#define MAX_BUF_AIO		16	/* max concurrent physio-path buffers */
#endif

#ifndef AIOD_TIMEOUT_DEFAULT
#define AIOD_TIMEOUT_DEFAULT	(10 * hz)	/* generic wait timeout, in ticks */
#endif

#ifndef AIOD_LIFETIME_DEFAULT
#define AIOD_LIFETIME_DEFAULT	(30 * hz)	/* idle daemon exits after this, in ticks */
#endif

/*
 * Run-time tunables and global accounting counters; these back the
 * vfs.aio sysctl tree declared below.
 */
static int max_aio_procs = MAX_AIO_PROCS;
static int num_aio_procs = 0;		/* AIO daemons currently forked */
static int target_aio_procs = TARGET_AIO_PROCS;
static int max_queue_count = MAX_AIO_QUEUE;
static int num_queue_count = 0;		/* system-wide queued (daemon-path) jobs */
static int num_buf_aio = 0;		/* system-wide in-flight physio jobs */
static int num_aio_resv_start = 0;	/* NOTE(review): not referenced in the code
					 * visible here; presumably used by the
					 * queueing path further down -- verify */
static int aiod_timeout;		/* set from AIOD_TIMEOUT_DEFAULT at init */
static int aiod_lifetime;		/* set from AIOD_LIFETIME_DEFAULT at init */

/* Per-process limits, copied into each kaioinfo at first use. */
static int max_aio_per_proc = MAX_AIO_PER_PROC;
static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
static int max_buf_aio = MAX_BUF_AIO;

SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "AIO mgmt");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc,
	CTLFLAG_RW, &max_aio_per_proc, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc,
	CTLFLAG_RW, &max_aio_queue_per_proc, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs,
	CTLFLAG_RW, &max_aio_procs, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs,
	CTLFLAG_RD, &num_aio_procs, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count,
	CTLFLAG_RD, &num_queue_count, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue,
	CTLFLAG_RW, &max_queue_count, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs,
	CTLFLAG_RW, &target_aio_procs, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio,
	CTLFLAG_RW, &max_buf_aio, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio,
	CTLFLAG_RD, &num_buf_aio, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime,
	CTLFLAG_RW, &aiod_lifetime, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout,
	CTLFLAG_RW, &aiod_timeout, 0, "");

/*
 * AIO process info
 */
#define AIOP_FREE	0x1		/* proc on free queue */
#define AIOP_SCHED	0x2		/* proc explicitly scheduled */

/* Per-daemon bookkeeping; one of these exists per AIO kernel thread. */
struct aioproclist {
	int aioprocflags;			/* AIO proc flags (AIOP_*) */
	TAILQ_ENTRY(aioproclist) list;		/* List of processes */
	struct proc *aioproc;			/* The AIO thread */
	TAILQ_HEAD (,aiocblist) jobtorun;	/* suggested jobs to run, tried
						 * before the global queue */
};

/*
 * data-structure for lio signal management
 */
struct aio_liojob {
	int lioj_flags;				/* LIOJ_* flags below */
	int lioj_buffer_count;			/* physio-path jobs in this lio op */
	int lioj_buffer_finished_count;		/* ... of which have completed */
	int lioj_queue_count;			/* daemon-path jobs in this lio op */
	int lioj_queue_finished_count;		/* ... of which have completed */
	struct sigevent lioj_signal;		/* signal on all I/O done */
	TAILQ_ENTRY (aio_liojob) lioj_list;	/* linkage on kaio_liojoblist */
	struct kaioinfo *lioj_ki;		/* owning per-process AIO state */
};
#define LIOJ_SIGNAL		0x1	/* signal on all done (lio) */
#define LIOJ_SIGNAL_POSTED	0x2	/* signal has been posted */

/*
 * per process aio data structure
 */
struct kaioinfo {
	int kaio_flags;			/* per process kaio flags (KAIO_*) */
	int kaio_maxactive_count;	/* maximum number of AIOs */
	int kaio_active_count;		/* number of currently used AIOs */
	int kaio_qallowed_count;	/* maximum size of AIO queue */
	int kaio_queue_count;		/* size of AIO queue */
	int kaio_ballowed_count;	/* maximum number of buffers */
	int kaio_queue_finished_count;	/* number of daemon jobs finished */
	int kaio_buffer_count;		/* number of physio buffers */
	int kaio_buffer_finished_count;	/* count of I/O done */
	struct proc *kaio_p;		/* process that uses this kaio block */
	TAILQ_HEAD (,aio_liojob) kaio_liojoblist; /* list of lio jobs */
	TAILQ_HEAD (,aiocblist) kaio_jobqueue;	/* job queue for process */
	TAILQ_HEAD (,aiocblist) kaio_jobdone;	/* done queue for process */
	TAILQ_HEAD (,aiocblist) kaio_bufqueue;	/* buffer job queue for process */
	TAILQ_HEAD (,aiocblist) kaio_bufdone;	/* buffer done queue for process */
	TAILQ_HEAD (,aiocblist) kaio_sockqueue;	/* queue for aios waiting on sockets */
};

#define KAIO_RUNDOWN	0x1	/* process is being run down */
#define KAIO_WAKEUP	0x2	/* wakeup process when there is a significant event */

/* Idle and busy AIO daemon lists. */
static TAILQ_HEAD(,aioproclist) aio_freeproc, aio_activeproc;
static TAILQ_HEAD(,aiocblist) aio_jobs;			/* Async job list */
static TAILQ_HEAD(,aiocblist) aio_bufjobs;		/* Phys I/O job list */
static TAILQ_HEAD(,aiocblist) aio_freejobs;		/* Pool of free jobs */

static void	aio_init_aioinfo(struct proc *p);
static void	aio_onceonly(void *);
static int	aio_free_entry(struct aiocblist *aiocbe);
static void	aio_process(struct aiocblist *aiocbe);
static int	aio_newproc(void);
static int	aio_aqueue(struct proc *p, struct aiocb *job, int type);
static void	aio_physwakeup(struct buf *bp);
static int	aio_fphysio(struct proc *p, struct aiocblist *aiocbe, int type);
static int	aio_qphysio(struct proc *p, struct aiocblist *iocb);
static void	aio_daemon(void *uproc);

/* kqueue EVFILT_AIO filter hooks (definitions not in this chunk). */
static int	filt_aioattach(struct knote *kn);
static void	filt_aiodetach(struct knote *kn);
static int	filt_aio(struct knote *kn, long hint);

struct filterops aio_filtops =
	{ 0, filt_aioattach, filt_aiodetach, filt_aio };

SYSINIT(aio, SI_SUB_VFS, SI_ORDER_ANY, aio_onceonly, NULL);

/* Allocation zones for the AIO data structures, created in aio_onceonly(). */
static vm_zone_t kaio_zone = 0, aiop_zone = 0, aiocb_zone = 0, aiol_zone = 0;
static vm_zone_t aiolio_zone = 0;

/*
 * Startup initialization.  Runs once at SI_SUB_VFS time (see SYSINIT
 * above): sets up the global queues, the allocation zones, and the
 * default daemon timeouts.
 *
 * NOTE(review): declared static in the prototype above but defined
 * without the keyword here (legal C; the static linkage sticks).
 */
void
aio_onceonly(void *na)
{
	TAILQ_INIT(&aio_freeproc);
	TAILQ_INIT(&aio_activeproc);
	TAILQ_INIT(&aio_jobs);
	TAILQ_INIT(&aio_bufjobs);
	TAILQ_INIT(&aio_freejobs);
	kaio_zone = zinit("AIO", sizeof (struct kaioinfo), 0, 0, 1);
	aiop_zone = zinit("AIOP", sizeof (struct aioproclist), 0, 0, 1);
	aiocb_zone = zinit("AIOCB", sizeof (struct aiocblist), 0, 0, 1);
	aiol_zone = zinit("AIOL", AIO_LISTIO_MAX * sizeof (int), 0, 0, 1);
	aiolio_zone = zinit("AIOLIO", AIO_LISTIO_MAX * sizeof (struct
	    aio_liojob), 0, 0, 1);
	aiod_timeout = AIOD_TIMEOUT_DEFAULT;
	aiod_lifetime = AIOD_LIFETIME_DEFAULT;
	jobrefid = 1;
}

/*
 * Init the per-process aioinfo structure.  The aioinfo limits are set
 * per-process for user limit (resource) management.
 *
 * Also forks AIO daemons up to target_aio_procs if not enough exist yet.
 *
 * NOTE(review): kaio_queue_finished_count is never initialized here, and
 * vm_zone allocations are not guaranteed zeroed -- confirm, or set it to
 * 0 explicitly like the neighboring counters.
 */
void
aio_init_aioinfo(struct proc *p)
{
	struct kaioinfo *ki;
	if (p->p_aioinfo == NULL) {
		ki = zalloc(kaio_zone);
		p->p_aioinfo = ki;
		ki->kaio_flags = 0;
		ki->kaio_maxactive_count = max_aio_per_proc;
		ki->kaio_active_count = 0;
		ki->kaio_qallowed_count = max_aio_queue_per_proc;
		ki->kaio_queue_count = 0;
		ki->kaio_ballowed_count = max_buf_aio;
		ki->kaio_buffer_count = 0;
		ki->kaio_buffer_finished_count = 0;
		ki->kaio_p = p;
		TAILQ_INIT(&ki->kaio_jobdone);
		TAILQ_INIT(&ki->kaio_jobqueue);
		TAILQ_INIT(&ki->kaio_bufdone);
		TAILQ_INIT(&ki->kaio_bufqueue);
		TAILQ_INIT(&ki->kaio_liojoblist);
		TAILQ_INIT(&ki->kaio_sockqueue);
	}

	/* Keep a baseline pool of daemons running. */
	while (num_aio_procs < target_aio_procs)
		aio_newproc();
}

/*
 * Free a job entry.  Wait for completion if it is currently active, but don't
 * delay forever.  If we delay, we return a flag that says that we have to
 * restart the queue scan.
 */
int
aio_free_entry(struct aiocblist *aiocbe)
{
	struct kaioinfo *ki;
	struct aioproclist *aiop;
	struct aio_liojob *lj;
	struct proc *p;
	int error;
	int s;

	if (aiocbe->jobstate == JOBST_NULL)
		panic("aio_free_entry: freeing already free job");

	p = aiocbe->userproc;
	ki = p->p_aioinfo;
	lj = aiocbe->lio;
	if (ki == NULL)
		panic("aio_free_entry: missing p->p_aioinfo");

	/*
	 * If a daemon is still working on the job, ask it to free the
	 * entry when done (ASYNCFREE) or wait here for completion.  The
	 * tsleep may let the queues change underneath the caller, which
	 * is why callers restart their scans on a non-zero return.
	 */
	if (aiocbe->jobstate == JOBST_JOBRUNNING) {
		if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE)
			return 0;
		aiocbe->jobflags |= AIOCBLIST_RUNDOWN;
		tsleep(aiocbe, PRIBIO|PCATCH, "jobwai", 0);
	}
	aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;

	/* Undo the queue accounting: daemon path (bp == NULL) vs physio. */
	if (aiocbe->bp == NULL) {
		if (ki->kaio_queue_count <= 0)
			panic("aio_free_entry: process queue size <= 0");
		if (num_queue_count <= 0)
			panic("aio_free_entry: system wide queue size <= 0");

		if (lj) {
			lj->lioj_queue_count--;
			if (aiocbe->jobflags & AIOCBLIST_DONE)
				lj->lioj_queue_finished_count--;
		}
		ki->kaio_queue_count--;
		if (aiocbe->jobflags & AIOCBLIST_DONE)
			ki->kaio_queue_finished_count--;
		num_queue_count--;
	} else {
		if (lj) {
			lj->lioj_buffer_count--;
			if (aiocbe->jobflags & AIOCBLIST_DONE)
				lj->lioj_buffer_finished_count--;
		}
		if (aiocbe->jobflags & AIOCBLIST_DONE)
			ki->kaio_buffer_finished_count--;
		ki->kaio_buffer_count--;
		num_buf_aio--;
	}

	/* aiocbe is going away, we need to destroy any knotes */
	knote_remove(p, &aiocbe->klist);

	/* Notify a process blocked in rundown/suspend if this was the last job. */
	if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags & KAIO_RUNDOWN)
	    && ((ki->kaio_buffer_count == 0) && (ki->kaio_queue_count == 0)))) {
		ki->kaio_flags &= ~KAIO_WAKEUP;
		wakeup(p);
	}

	/* Unlink the entry from whichever queue its state says it is on. */
	if (aiocbe->jobstate == JOBST_JOBQBUF) {
		/* Physio still in flight: wait for it, then pull off bufdone. */
		if ((error = aio_fphysio(p, aiocbe, 1)) != 0)
			return error;
		if (aiocbe->jobstate != JOBST_JOBBFINISHED)
			panic("aio_free_entry: invalid physio finish-up state");
		s = splbio();
		TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
		splx(s);
	} else if (aiocbe->jobstate == JOBST_JOBQPROC) {
		aiop = aiocbe->jobaioproc;
		TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list);
	} else if (aiocbe->jobstate == JOBST_JOBQGLOBAL)
		TAILQ_REMOVE(&aio_jobs, aiocbe, list);
	else if (aiocbe->jobstate == JOBST_JOBFINISHED)
		TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist);
	else if (aiocbe->jobstate == JOBST_JOBBFINISHED) {
		s = splbio();
		TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
		splx(s);
		if (aiocbe->bp) {
			vunmapbuf(aiocbe->bp);
			relpbuf(aiocbe->bp, NULL);
			aiocbe->bp = NULL;
		}
	}

	/* Tear down the lio job once its last member is gone. */
	if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) {
		TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
		zfree(aiolio_zone, lj);
	}
	TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
	aiocbe->jobstate = JOBST_NULL;
	return 0;
}

/*
 * Rundown the jobs for a given process.  Called at process exit/exec;
 * drains active jobs (bounded by aiod_timeout per wait), reclaims all
 * queued entries, and releases the per-process kaioinfo.
 */
void
aio_proc_rundown(struct proc *p)
{
	int s;
	struct kaioinfo *ki;
	struct aio_liojob *lj, *ljn;
	struct aiocblist *aiocbe, *aiocbn;
	struct file *fp;
	struct filedesc *fdp;
	struct socket *so;

	ki = p->p_aioinfo;
	if (ki == NULL)
		return;

	/*
	 * NOTE(review): LIOJ_SIGNAL_POSTED (0x2) is a lioj_flags constant;
	 * on kaio_flags its value aliases KAIO_WAKEUP.  Looks like
	 * KAIO_RUNDOWN was intended here -- verify against later revisions.
	 */
	ki->kaio_flags |= LIOJ_SIGNAL_POSTED;
	while ((ki->kaio_active_count > 0) || (ki->kaio_buffer_count >
	    ki->kaio_buffer_finished_count)) {
		ki->kaio_flags |= KAIO_RUNDOWN;
		/* Give up waiting after aiod_timeout ticks per round. */
		if (tsleep(p, PRIBIO, "kaiowt", aiod_timeout))
			break;
	}

	/*
	 * Move any aio ops that are waiting on socket I/O to the normal job
	 * queues so they are cleaned up with any others.
	 */
	fdp = p->p_fd;

	s = splnet();
	for (aiocbe = TAILQ_FIRST(&ki->kaio_sockqueue); aiocbe; aiocbe =
	    aiocbn) {
		aiocbn = TAILQ_NEXT(aiocbe, plist);
		fp = fdp->fd_ofiles[aiocbe->uaiocb.aio_fildes];

		/*
		 * Under some circumstances, the aio_fildes and the file
		 * structure don't match.
		 * This would leave aiocbe's in the
		 * TAILQ associated with the socket and cause a panic later.
		 *
		 * Detect and fix.
		 */
		if ((fp == NULL) || (fp != aiocbe->fd_file))
			fp = aiocbe->fd_file;
		if (fp) {
			/* Detach the job from the socket's wait queue. */
			so = (struct socket *)fp->f_data;
			TAILQ_REMOVE(&so->so_aiojobq, aiocbe, list);
			if (TAILQ_EMPTY(&so->so_aiojobq)) {
				so->so_snd.sb_flags &= ~SB_AIO;
				so->so_rcv.sb_flags &= ~SB_AIO;
			}
		}
		TAILQ_REMOVE(&ki->kaio_sockqueue, aiocbe, plist);
		TAILQ_INSERT_HEAD(&aio_jobs, aiocbe, list);
		TAILQ_INSERT_HEAD(&ki->kaio_jobqueue, aiocbe, plist);
	}
	splx(s);

	/*
	 * Reclaim completed and still-queued daemon-path jobs.  A non-zero
	 * return from aio_free_entry() means it slept and the list may have
	 * changed, so restart the scan from the top.
	 */
restart1:
	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobdone); aiocbe; aiocbe = aiocbn) {
		aiocbn = TAILQ_NEXT(aiocbe, plist);
		if (aio_free_entry(aiocbe))
			goto restart1;
	}

restart2:
	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue); aiocbe; aiocbe =
	    aiocbn) {
		aiocbn = TAILQ_NEXT(aiocbe, plist);
		if (aio_free_entry(aiocbe))
			goto restart2;
	}

	/*
	 * Note the use of lots of splbio here, trying to avoid splbio for
	 * long chains of I/O.  Probably unnecessary.
	 */
restart3:
	s = splbio();
	/* Wait for in-flight physio jobs to drain off kaio_bufqueue. */
	while (TAILQ_FIRST(&ki->kaio_bufqueue)) {
		ki->kaio_flags |= KAIO_WAKEUP;
		tsleep(p, PRIBIO, "aioprn", 0);
		splx(s);
		goto restart3;
	}
	splx(s);

restart4:
	s = splbio();
	for (aiocbe = TAILQ_FIRST(&ki->kaio_bufdone); aiocbe; aiocbe = aiocbn) {
		aiocbn = TAILQ_NEXT(aiocbe, plist);
		if (aio_free_entry(aiocbe)) {
			splx(s);
			goto restart4;
		}
	}
	splx(s);

	/* Release any lio jobs that still hang around empty. */
	for (lj = TAILQ_FIRST(&ki->kaio_liojoblist); lj; lj = ljn) {
		ljn = TAILQ_NEXT(lj, lioj_list);
		if ((lj->lioj_buffer_count == 0) && (lj->lioj_queue_count ==
		    0)) {
			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
			zfree(aiolio_zone, lj);
		} else {
#ifdef DIAGNOSTIC
			printf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, "
			    "QF:%d\n", lj->lioj_buffer_count,
			    lj->lioj_buffer_finished_count,
			    lj->lioj_queue_count,
			    lj->lioj_queue_finished_count);
#endif
		}
	}

	zfree(kaio_zone, ki);
	p->p_aioinfo = NULL;
}

/*
 * Select a job to run (called by an AIO daemon).  The daemon's private
 * jobtorun queue takes priority; otherwise take the first global job
 * whose owner is still under its per-process active-job limit.
 * Returns NULL when nothing is runnable.
 */
static struct aiocblist *
aio_selectjob(struct aioproclist *aiop)
{
	int s;
	struct aiocblist *aiocbe;
	struct kaioinfo *ki;
	struct proc *userp;

	aiocbe = TAILQ_FIRST(&aiop->jobtorun);
	if (aiocbe) {
		TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list);
		return aiocbe;
	}

	s = splnet();
	for (aiocbe = TAILQ_FIRST(&aio_jobs); aiocbe; aiocbe =
	    TAILQ_NEXT(aiocbe, list)) {
		userp = aiocbe->userproc;
		ki = userp->p_aioinfo;

		if (ki->kaio_active_count < ki->kaio_maxactive_count) {
			TAILQ_REMOVE(&aio_jobs, aiocbe, list);
			splx(s);
			return aiocbe;
		}
	}
	splx(s);

	return NULL;
}

/*
 * The AIO processing activity.  This is the code that does the I/O request
 * for the non-physio version of the operations.
 * The normal vn operations are used, and this code should work in all
 * instances for every type of file, including pipes, sockets, fifos, and
 * regular files.
 *
 * Runs in AIO-daemon context with the requesting process's vmspace and
 * file descriptors borrowed (see aio_daemon below); reports the result
 * through the in-kernel copy of the aiocb's _aiocb_private fields.
 */
void
aio_process(struct aiocblist *aiocbe)
{
	struct filedesc *fdp;
	struct proc *userp, *mycp;
	struct aiocb *cb;
	struct file *fp;
	struct uio auio;
	struct iovec aiov;
	unsigned int fd;
	int cnt;
	int error;
	off_t offset;
	int oublock_st, oublock_end;
	int inblock_st, inblock_end;

	userp = aiocbe->userproc;
	cb = &aiocbe->uaiocb;

	mycp = curproc;

	fdp = mycp->p_fd;
	fd = cb->aio_fildes;
	fp = fdp->fd_ofiles[fd];

	/* Descriptor was closed/replaced since the request was queued. */
	if ((fp == NULL) || (fp != aiocbe->fd_file)) {
		cb->_aiocb_private.error = EBADF;
		cb->_aiocb_private.status = -1;
		return;
	}

	/* Build a single-segment uio targeting the user's buffer. */
	aiov.iov_base = (void *)cb->aio_buf;
	aiov.iov_len = cb->aio_nbytes;

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset = cb->aio_offset;
	auio.uio_resid = cb->aio_nbytes;
	cnt = cb->aio_nbytes;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = mycp;

	/*
	 * Sample the daemon's block-I/O counters around the transfer so the
	 * charge can later be transferred to the requesting process.
	 */
	inblock_st = mycp->p_stats->p_ru.ru_inblock;
	oublock_st = mycp->p_stats->p_ru.ru_oublock;
	if (cb->aio_lio_opcode == LIO_READ) {
		auio.uio_rw = UIO_READ;
		error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, mycp);
	} else {
		auio.uio_rw = UIO_WRITE;
		error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, mycp);
	}
	inblock_end = mycp->p_stats->p_ru.ru_inblock;
	oublock_end = mycp->p_stats->p_ru.ru_oublock;

	aiocbe->inputcharge = inblock_end - inblock_st;
	aiocbe->outputcharge = oublock_end - oublock_st;

	/*
	 * On a partial transfer, a restart/interrupt/wouldblock error is
	 * suppressed and the short count is reported instead; a broken pipe
	 * on a write additionally raises SIGPIPE in the requester.
	 */
	if ((error) && (auio.uio_resid != cnt)) {
		if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
			error = 0;
		if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE))
			psignal(userp, SIGPIPE);
	}

	cnt -= auio.uio_resid;
	cb->_aiocb_private.error = error;
	cb->_aiocb_private.status = cnt;

	return;
}

/*
 * The AIO daemon, most of the actual work is done in aio_process,
 * but the setup (and address space mgmt) is done in this routine.
 */
static void
aio_daemon(void *uproc)
{
	int s;
	struct aio_liojob *lj;
	struct aiocb *cb;
	struct aiocblist *aiocbe;
	struct aioproclist *aiop;
	struct kaioinfo *ki;
	struct proc *curcp, *mycp, *userp;
	struct vmspace *myvm, *tmpvm;

	/*
	 * Local copies of curproc (cp) and vmspace (myvm)
	 */
	mycp = curproc;
	myvm = mycp->p_vmspace;

	/* Drop the text vnode inherited from the parent image. */
	if (mycp->p_textvp) {
		vrele(mycp->p_textvp);
		mycp->p_textvp = NULL;
	}

	/*
	 * Allocate and ready the aio control info.  There is one aiop
	 * structure per daemon.
	 *
	 * NOTE(review): aioprocflags is |='d on freshly zalloc'ed memory;
	 * vm_zone allocations are not guaranteed zeroed -- confirm, or
	 * assign AIOP_FREE directly.
	 */
	aiop = zalloc(aiop_zone);
	aiop->aioproc = mycp;
	aiop->aioprocflags |= AIOP_FREE;
	TAILQ_INIT(&aiop->jobtorun);

	s = splnet();

	/*
	 * Place thread (lightweight process) onto the AIO free thread list.
	 */
	if (TAILQ_EMPTY(&aio_freeproc))
		wakeup(&aio_freeproc);
	TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);

	splx(s);

	/* Make up a name for the daemon. */
	strcpy(mycp->p_comm, "aiod");

	/*
	 * Get rid of our current filedescriptors.  AIOD's don't need any
	 * filedescriptors, except as temporarily inherited from the client.
	 * Credentials are also cloned, and made equivalent to "root".
	 */
	fdfree(mycp);
	mycp->p_fd = NULL;
	mycp->p_ucred = crcopy(mycp->p_ucred);
	mycp->p_ucred->cr_uid = 0;
	mycp->p_ucred->cr_ngroups = 1;
	mycp->p_ucred->cr_groups[0] = 1;

	/* The daemon resides in its own pgrp. */
	enterpgrp(mycp, mycp->p_pid, 1);

	/* Mark special process type. */
	mycp->p_flag |= P_SYSTEM | P_KTHREADP;

	/*
	 * Wakeup parent process.  (Parent sleeps to keep from blasting away
	 * creating too many daemons.
	 */
	wakeup(mycp);

	for (;;) {
		/*
		 * curcp is the current daemon process context.
		 * userp is the current user process context.
		 */
		curcp = mycp;

		/*
		 * Take daemon off of free queue
		 */
		if (aiop->aioprocflags & AIOP_FREE) {
			s = splnet();
			TAILQ_REMOVE(&aio_freeproc, aiop, list);
			TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
			aiop->aioprocflags &= ~AIOP_FREE;
			splx(s);
		}
		aiop->aioprocflags &= ~AIOP_SCHED;

		/*
		 * Check for jobs.
		 */
		while ((aiocbe = aio_selectjob(aiop)) != NULL) {
			cb = &aiocbe->uaiocb;
			userp = aiocbe->userproc;

			aiocbe->jobstate = JOBST_JOBRUNNING;

			/*
			 * Connect to process address space for user program.
			 */
			if (userp != curcp) {
				/*
				 * Save the current address space that we are
				 * connected to.
				 */
				tmpvm = mycp->p_vmspace;

				/*
				 * Point to the new user address space, and
				 * refer to it.
				 */
				mycp->p_vmspace = userp->p_vmspace;
				mycp->p_vmspace->vm_refcnt++;

				/* Activate the new mapping. */
				pmap_activate(mycp);

				/*
				 * If the old address space wasn't the daemons
				 * own address space, then we need to remove the
				 * daemon's reference from the other process
				 * that it was acting on behalf of.
				 */
				if (tmpvm != myvm) {
					vmspace_free(tmpvm);
				}

				/*
				 * Disassociate from previous clients file
				 * descriptors, and associate to the new clients
				 * descriptors.  Note that the daemon doesn't
				 * need to worry about its original descriptors,
				 * because they were originally freed.
				 */
				if (mycp->p_fd)
					fdfree(mycp);
				mycp->p_fd = fdshare(userp);
				curcp = userp;
			}

			ki = userp->p_aioinfo;
			lj = aiocbe->lio;

			/* Account for currently active jobs. */
			ki->kaio_active_count++;

			/* Do the I/O function. */
			aiocbe->jobaioproc = aiop;
			aio_process(aiocbe);

			/* Decrement the active job count. */
			ki->kaio_active_count--;

			/*
			 * Increment the completion count for wakeup/signal
			 * comparisons.
			 */
			aiocbe->jobflags |= AIOCBLIST_DONE;
			ki->kaio_queue_finished_count++;
			if (lj)
				lj->lioj_queue_finished_count++;
			/* Wake anyone sleeping in aio_suspend()/rundown. */
			if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags
			    & KAIO_RUNDOWN) && (ki->kaio_active_count == 0))) {
				ki->kaio_flags &= ~KAIO_WAKEUP;
				wakeup(userp);
			}

			/* Post the lio completion signal exactly once. */
			s = splbio();
			if (lj && (lj->lioj_flags &
			    (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL) {
				if ((lj->lioj_queue_finished_count ==
				    lj->lioj_queue_count) &&
				    (lj->lioj_buffer_finished_count ==
				    lj->lioj_buffer_count)) {
					psignal(userp,
					    lj->lioj_signal.sigev_signo);
					lj->lioj_flags |=
					    LIOJ_SIGNAL_POSTED;
				}
			}
			splx(s);

			aiocbe->jobstate = JOBST_JOBFINISHED;

			/*
			 * If the I/O request should be automatically rundown,
			 * do the needed cleanup.  Otherwise, place the queue
			 * entry for the just finished I/O request into the done
			 * queue for the associated client.
			 */
			s = splnet();
			if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) {
				aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;
				TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
			} else {
				TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
				TAILQ_INSERT_TAIL(&ki->kaio_jobdone, aiocbe,
				    plist);
			}
			splx(s);
			KNOTE(&aiocbe->klist, 0);

			/* Someone in aio_free_entry() is waiting on this job. */
			if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) {
				wakeup(aiocbe);
				aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN;
			}

			if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
				psignal(userp, cb->aio_sigevent.sigev_signo);
			}
		}

		/*
		 * Disconnect from user address space.
		 */
		if (curcp != mycp) {
			/* Get the user address space to disconnect from. */
			tmpvm = mycp->p_vmspace;

			/* Get original address space for daemon. */
			mycp->p_vmspace = myvm;

			/* Activate the daemon's address space. */
			pmap_activate(mycp);
#ifdef DIAGNOSTIC
			if (tmpvm == myvm) {
				printf("AIOD: vmspace problem -- %d\n",
				    mycp->p_pid);
			}
#endif
			/* Remove our vmspace reference. */
			vmspace_free(tmpvm);

			/*
			 * Disassociate from the user process's file
			 * descriptors.
			 */
			if (mycp->p_fd)
				fdfree(mycp);
			mycp->p_fd = NULL;
			curcp = mycp;
		}

		/*
		 * If we are the first to be put onto the free queue, wakeup
		 * anyone waiting for a daemon.
		 */
		s = splnet();
		TAILQ_REMOVE(&aio_activeproc, aiop, list);
		if (TAILQ_EMPTY(&aio_freeproc))
			wakeup(&aio_freeproc);
		TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
		aiop->aioprocflags |= AIOP_FREE;
		splx(s);

		/*
		 * If daemon is inactive for a long time, allow it to exit,
		 * thereby freeing resources.
		 */
		if (((aiop->aioprocflags & AIOP_SCHED) == 0) && tsleep(mycp,
		    PRIBIO, "aiordy", aiod_lifetime)) {
			s = splnet();
			if ((TAILQ_FIRST(&aio_jobs) == NULL) &&
			    (TAILQ_FIRST(&aiop->jobtorun) == NULL)) {
				if ((aiop->aioprocflags & AIOP_FREE) &&
				    (num_aio_procs > target_aio_procs)) {
					TAILQ_REMOVE(&aio_freeproc, aiop, list);
					splx(s);
					zfree(aiop_zone, aiop);
					num_aio_procs--;
#ifdef DIAGNOSTIC
					if (mycp->p_vmspace->vm_refcnt <= 1) {
						printf("AIOD: bad vm refcnt for"
						    " exiting daemon: %d\n",
						    mycp->p_vmspace->vm_refcnt);
					}
#endif
					/* exit1() does not return. */
					exit1(mycp, 0);
				}
			}
			splx(s);
		}
	}
}

/*
 * Create a new AIO daemon.  This is mostly a kernel-thread fork routine.  The
 * AIO daemon modifies its environment itself.  Forked from proc0 sharing its
 * address space (RFMEM); the child enters aio_daemon() via the fork handler.
 */
static int
aio_newproc()
{
	int error;
	struct proc *p, *np;

	p = &proc0;
	error = fork1(p, RFPROC|RFMEM|RFNOWAIT, &np);
	if (error)
		return error;
	cpu_set_fork_handler(np, aio_daemon, curproc);

	/*
	 * Wait until daemon is started, but continue on just in case to
	 * handle error conditions.
	 */
	error = tsleep(np, PZERO, "aiosta", aiod_timeout);
	/* Counted even on timeout: the child will still run. */
	num_aio_procs++;

	return error;
}

/*
 * Try the high-performance physio method for eligible VCHR devices.  This
 * routine doesn't require the use of any additional threads, and have overhead.
 *
 * Returns 0 on successful submission, -1 to make the caller fall back to
 * the daemon (thread) path, or a positive errno on a hard failure.
 */
int
aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
{
	int error;
	struct aiocb *cb;
	struct file *fp;
	struct buf *bp;
	struct vnode *vp;
	struct kaioinfo *ki;
	struct filedesc *fdp;
	struct aio_liojob *lj;
	int fd;
	int s;
	int cnt, notify;

	cb = &aiocbe->uaiocb;
	fdp = p->p_fd;
	fd = cb->aio_fildes;
	fp = fdp->fd_ofiles[fd];

	if (fp->f_type != DTYPE_VNODE)
		return (-1);

	vp = (struct vnode *)fp->f_data;

	/*
	 * If its not a disk, we don't want to return a positive error.
	 * It causes the aio code to not fall through to try the thread
	 * way when you're talking to a regular file.
	 */
	if (!vn_isdisk(vp, &error)) {
		if (error == ENOTBLK)
			return (-1);
		else
			return (error);
	}

	/* Transfers must be a multiple of the device's physical block size. */
	if (cb->aio_nbytes % vp->v_rdev->si_bsize_phys)
		return (-1);

	/*
	 * NOTE(review): with '&&', the system-wide max_buf_aio cap only
	 * applies when the request also exceeds MAXPHYS (which is rejected
	 * below anyway) -- '||' looks intended here; verify.
	 */
	if ((cb->aio_nbytes > MAXPHYS) && (num_buf_aio >= max_buf_aio))
		return (-1);

	ki = p->p_aioinfo;
	if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
		return (-1);

	cnt = cb->aio_nbytes;
	if (cnt > MAXPHYS)
		return (-1);

	/*
	 * Physical I/O is charged directly to the process, so we don't have to
	 * fake it.
	 */
	aiocbe->inputcharge = 0;
	aiocbe->outputcharge = 0;

	ki->kaio_buffer_count++;

	lj = aiocbe->lio;
	if (lj)
		lj->lioj_buffer_count++;

	/* Create and build a buffer header for a transfer. */
	bp = (struct buf *)getpbuf(NULL);

	/*
	 * Get a copy of the kva from the physical buffer.
	 */
	bp->b_caller1 = p;
	bp->b_dev = vp->v_rdev;
	error = bp->b_error = 0;

	bp->b_bcount = cb->aio_nbytes;
	bp->b_bufsize = cb->aio_nbytes;
	bp->b_flags = B_PHYS;
	bp->b_iodone = aio_physwakeup;
	bp->b_saveaddr = bp->b_data;
	bp->b_data = (void *)cb->aio_buf;
	bp->b_blkno = btodb(cb->aio_offset);

	/* Verify the user buffer is accessible for the transfer direction. */
	if (cb->aio_lio_opcode == LIO_WRITE) {
		bp->b_iocmd = BIO_WRITE;
		if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_READ)) {
			error = EFAULT;
			goto doerror;
		}
	} else {
		bp->b_iocmd = BIO_READ;
		if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_WRITE)) {
			error = EFAULT;
			goto doerror;
		}
	}

	/* Bring buffer into kernel space. */
	vmapbuf(bp);

	s = splbio();
	aiocbe->bp = bp;
	bp->b_spc = (void *)aiocbe;
	TAILQ_INSERT_TAIL(&aio_bufjobs, aiocbe, list);
	TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
	aiocbe->jobstate = JOBST_JOBQBUF;
	cb->_aiocb_private.status = cb->aio_nbytes;
	num_buf_aio++;
	bp->b_error = 0;

	splx(s);

	/* Perform transfer. */
	DEV_STRATEGY(bp, 0);

	notify = 0;
	s = splbio();

	/*
	 * If we had an error invoking the request, or an error in processing
	 * the request before we have returned, we process it as an error in
	 * transfer.  Note that such an I/O error is not indicated immediately,
	 * but is returned using the aio_error mechanism.  In this case,
	 * aio_suspend will return immediately.
	 */
	if (bp->b_error || (bp->b_ioflags & BIO_ERROR)) {
		struct aiocb *job = aiocbe->uuaiocb;

		/* Mirror status/error to both the kernel and user aiocb. */
		aiocbe->uaiocb._aiocb_private.status = 0;
		suword(&job->_aiocb_private.status, 0);
		aiocbe->uaiocb._aiocb_private.error = bp->b_error;
		suword(&job->_aiocb_private.error, bp->b_error);

		ki->kaio_buffer_finished_count++;

		/* aio_physwakeup may already have completed the job. */
		if (aiocbe->jobstate != JOBST_JOBBFINISHED) {
			aiocbe->jobstate = JOBST_JOBBFINISHED;
			aiocbe->jobflags |= AIOCBLIST_DONE;
			TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
			TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
			TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
			notify = 1;
		}
	}
	splx(s);
	if (notify)
		KNOTE(&aiocbe->klist, 0);
	return 0;

doerror:
	/* Roll back accounting and release the unused pbuf. */
	ki->kaio_buffer_count--;
	if (lj)
		lj->lioj_buffer_count--;
	aiocbe->bp = NULL;
	relpbuf(bp, NULL);
	return error;
}

/*
 * This waits/tests physio completion.
 *
 * flgwait == 0: poll; returns EINPROGRESS immediately if not done.
 * Otherwise sleep (up to aiod_timeout per round) for B_DONE; a timeout
 * with the buffer still busy also returns EINPROGRESS.  On completion
 * the buffer is unmapped and released, and the I/O error (if any) is
 * returned.
 *
 * NOTE(review): the success path falls through to vunmapbuf()/relpbuf()
 * and returns without a matching splx(s) -- looks like a missing
 * splx(s) after the while loop; verify.
 */
int
aio_fphysio(struct proc *p, struct aiocblist *iocb, int flgwait)
{
	int s;
	struct buf *bp;
	int error;

	bp = iocb->bp;

	s = splbio();
	if (flgwait == 0) {
		if ((bp->b_flags & B_DONE) == 0) {
			splx(s);
			return EINPROGRESS;
		}
	}

	while ((bp->b_flags & B_DONE) == 0) {
		if (tsleep((caddr_t)bp, PRIBIO, "physstr", aiod_timeout)) {
			if ((bp->b_flags & B_DONE) == 0) {
				splx(s);
				return EINPROGRESS;
			} else
				break;
		}
	}

	/* Release mapping into kernel space. */
	vunmapbuf(bp);
	iocb->bp = 0;

	error = 0;

	/* Check for an error. */
	if (bp->b_ioflags & BIO_ERROR)
		error = bp->b_error;

	relpbuf(bp, NULL);
	return (error);
}

/*
 * Wake up aio requests that may be serviceable now.
 */
void
aio_swake(struct socket *so, struct sockbuf *sb)
{
	struct aiocblist *cb,*cbn;
	struct proc *p;
	struct kaioinfo *ki = NULL;
	int opcode, wakecount = 0;
	struct aioproclist *aiop;

	/*
	 * Called from the socket layer when a socket buffer becomes
	 * ready; which direction determines which queued opcodes are
	 * now serviceable.
	 */
	if (sb == &so->so_snd) {
		opcode = LIO_WRITE;
		so->so_snd.sb_flags &= ~SB_AIO;
	} else {
		opcode = LIO_READ;
		so->so_rcv.sb_flags &= ~SB_AIO;
	}

	/*
	 * Move every matching job off the socket's wait queue and the
	 * owner's sockqueue onto the global run queue (and the owner's
	 * regular job queue), counting how many daemons to wake.
	 */
	for (cb = TAILQ_FIRST(&so->so_aiojobq); cb; cb = cbn) {
		cbn = TAILQ_NEXT(cb, list);
		if (opcode == cb->uaiocb.aio_lio_opcode) {
			p = cb->userproc;
			ki = p->p_aioinfo;
			TAILQ_REMOVE(&so->so_aiojobq, cb, list);
			TAILQ_REMOVE(&ki->kaio_sockqueue, cb, plist);
			TAILQ_INSERT_TAIL(&aio_jobs, cb, list);
			TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, cb, plist);
			wakecount++;
			/* Socket-parked jobs must carry the global-queue state. */
			if (cb->jobstate != JOBST_JOBQGLOBAL)
				panic("invalid queue value");
		}
	}

	/* Wake one idle daemon per moved job, as far as supply allows. */
	while (wakecount--) {
		if ((aiop = TAILQ_FIRST(&aio_freeproc)) != 0) {
			TAILQ_REMOVE(&aio_freeproc, aiop, list);
			TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
			aiop->aioprocflags &= ~AIOP_FREE;
			wakeup(aiop->aioproc);
		}
	}
}

/*
 * Queue a new AIO request.  Choosing either the threaded or direct physio VCHR
 * technique is done in this code.
1184 */ 1185 static int 1186 _aio_aqueue(struct proc *p, struct aiocb *job, struct aio_liojob *lj, int type) 1187 { 1188 struct filedesc *fdp; 1189 struct file *fp; 1190 unsigned int fd; 1191 struct socket *so; 1192 int s; 1193 int error = 0; 1194 int opcode; 1195 struct aiocblist *aiocbe; 1196 struct aioproclist *aiop; 1197 struct kaioinfo *ki; 1198 1199 if ((aiocbe = TAILQ_FIRST(&aio_freejobs)) != NULL) 1200 TAILQ_REMOVE(&aio_freejobs, aiocbe, list); 1201 else 1202 aiocbe = zalloc (aiocb_zone); 1203 1204 aiocbe->inputcharge = 0; 1205 aiocbe->outputcharge = 0; 1206 SLIST_INIT(&aiocbe->klist); 1207 1208 suword(&job->_aiocb_private.status, -1); 1209 suword(&job->_aiocb_private.error, 0); 1210 suword(&job->_aiocb_private.kernelinfo, -1); 1211 1212 error = copyin((caddr_t)job, (caddr_t) &aiocbe->uaiocb, sizeof 1213 aiocbe->uaiocb); 1214 if (error) { 1215 suword(&job->_aiocb_private.error, error); 1216 1217 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); 1218 return error; 1219 } 1220 1221 /* Save userspace address of the job info. */ 1222 aiocbe->uuaiocb = job; 1223 1224 /* Get the opcode. */ 1225 if (type != LIO_NOP) 1226 aiocbe->uaiocb.aio_lio_opcode = type; 1227 opcode = aiocbe->uaiocb.aio_lio_opcode; 1228 1229 /* Get the fd info for process. */ 1230 fdp = p->p_fd; 1231 1232 /* 1233 * Range check file descriptor. 
1234 */ 1235 fd = aiocbe->uaiocb.aio_fildes; 1236 if (fd >= fdp->fd_nfiles) { 1237 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); 1238 if (type == 0) 1239 suword(&job->_aiocb_private.error, EBADF); 1240 return EBADF; 1241 } 1242 1243 fp = aiocbe->fd_file = fdp->fd_ofiles[fd]; 1244 if ((fp == NULL) || ((opcode == LIO_WRITE) && ((fp->f_flag & FWRITE) == 1245 0))) { 1246 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); 1247 if (type == 0) 1248 suword(&job->_aiocb_private.error, EBADF); 1249 return EBADF; 1250 } 1251 1252 if (aiocbe->uaiocb.aio_offset == -1LL) { 1253 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); 1254 if (type == 0) 1255 suword(&job->_aiocb_private.error, EINVAL); 1256 return EINVAL; 1257 } 1258 1259 error = suword(&job->_aiocb_private.kernelinfo, jobrefid); 1260 if (error) { 1261 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); 1262 if (type == 0) 1263 suword(&job->_aiocb_private.error, EINVAL); 1264 return error; 1265 } 1266 1267 aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jobrefid; 1268 if (jobrefid == LONG_MAX) 1269 jobrefid = 1; 1270 else 1271 jobrefid++; 1272 1273 if (opcode == LIO_NOP) { 1274 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); 1275 if (type == 0) { 1276 suword(&job->_aiocb_private.error, 0); 1277 suword(&job->_aiocb_private.status, 0); 1278 suword(&job->_aiocb_private.kernelinfo, 0); 1279 } 1280 return 0; 1281 } 1282 1283 if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) { 1284 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); 1285 if (type == 0) { 1286 suword(&job->_aiocb_private.status, 0); 1287 suword(&job->_aiocb_private.error, EINVAL); 1288 } 1289 return EINVAL; 1290 } 1291 1292 /* 1293 * XXX 1294 * Figure out how to do this properly. This currently won't 1295 * work on the alpha, since we're passing in a pointer via 1296 * aio_lio_opcode, which is an int. 
1297 */ 1298 { 1299 struct kevent kev, *kevp; 1300 struct kqueue *kq; 1301 1302 kevp = (struct kevent *)job->aio_lio_opcode; 1303 if (kevp == NULL) 1304 goto no_kqueue; 1305 1306 error = copyin((caddr_t)kevp, (caddr_t)&kev, sizeof(kev)); 1307 if (error) 1308 goto aqueue_fail; 1309 1310 if ((u_int)kev.ident >= fdp->fd_nfiles || 1311 (fp = fdp->fd_ofiles[kev.ident]) == NULL || 1312 (fp->f_type != DTYPE_KQUEUE)) { 1313 error = EBADF; 1314 goto aqueue_fail; 1315 } 1316 kq = (struct kqueue *)fp->f_data; 1317 kev.ident = (u_long)aiocbe; 1318 kev.filter = EVFILT_AIO; 1319 kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1; 1320 error = kqueue_register(kq, &kev, p); 1321 aqueue_fail: 1322 if (error) { 1323 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); 1324 if (type == 0) 1325 suword(&job->_aiocb_private.error, error); 1326 return (error); 1327 } 1328 no_kqueue: 1329 } 1330 1331 suword(&job->_aiocb_private.error, EINPROGRESS); 1332 aiocbe->uaiocb._aiocb_private.error = EINPROGRESS; 1333 aiocbe->userproc = p; 1334 aiocbe->jobflags = 0; 1335 aiocbe->lio = lj; 1336 ki = p->p_aioinfo; 1337 1338 if (fp->f_type == DTYPE_SOCKET) { 1339 /* 1340 * Alternate queueing for socket ops: Reach down into the 1341 * descriptor to get the socket data. Then check to see if the 1342 * socket is ready to be read or written (based on the requested 1343 * operation). 1344 * 1345 * If it is not ready for io, then queue the aiocbe on the 1346 * socket, and set the flags so we get a call when sbnotify() 1347 * happens. 
1348 */ 1349 so = (struct socket *)fp->f_data; 1350 s = splnet(); 1351 if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode == 1352 LIO_WRITE) && (!sowriteable(so)))) { 1353 TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list); 1354 TAILQ_INSERT_TAIL(&ki->kaio_sockqueue, aiocbe, plist); 1355 if (opcode == LIO_READ) 1356 so->so_rcv.sb_flags |= SB_AIO; 1357 else 1358 so->so_snd.sb_flags |= SB_AIO; 1359 aiocbe->jobstate = JOBST_JOBQGLOBAL; /* XXX */ 1360 ki->kaio_queue_count++; 1361 num_queue_count++; 1362 splx(s); 1363 return 0; 1364 } 1365 splx(s); 1366 } 1367 1368 if ((error = aio_qphysio(p, aiocbe)) == 0) 1369 return 0; 1370 else if (error > 0) { 1371 suword(&job->_aiocb_private.status, 0); 1372 aiocbe->uaiocb._aiocb_private.error = error; 1373 suword(&job->_aiocb_private.error, error); 1374 return error; 1375 } 1376 1377 /* No buffer for daemon I/O. */ 1378 aiocbe->bp = NULL; 1379 1380 ki->kaio_queue_count++; 1381 if (lj) 1382 lj->lioj_queue_count++; 1383 s = splnet(); 1384 TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist); 1385 TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list); 1386 splx(s); 1387 aiocbe->jobstate = JOBST_JOBQGLOBAL; 1388 1389 num_queue_count++; 1390 error = 0; 1391 1392 /* 1393 * If we don't have a free AIO process, and we are below our quota, then 1394 * start one. Otherwise, depend on the subsequent I/O completions to 1395 * pick-up this job. If we don't sucessfully create the new process 1396 * (thread) due to resource issues, we return an error for now (EAGAIN), 1397 * which is likely not the correct thing to do. 
1398 */ 1399 retryproc: 1400 s = splnet(); 1401 if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) { 1402 TAILQ_REMOVE(&aio_freeproc, aiop, list); 1403 TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list); 1404 aiop->aioprocflags &= ~AIOP_FREE; 1405 wakeup(aiop->aioproc); 1406 } else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) && 1407 ((ki->kaio_active_count + num_aio_resv_start) < 1408 ki->kaio_maxactive_count)) { 1409 num_aio_resv_start++; 1410 if ((error = aio_newproc()) == 0) { 1411 num_aio_resv_start--; 1412 p->p_retval[0] = 0; 1413 goto retryproc; 1414 } 1415 num_aio_resv_start--; 1416 } 1417 splx(s); 1418 return error; 1419 } 1420 1421 /* 1422 * This routine queues an AIO request, checking for quotas. 1423 */ 1424 static int 1425 aio_aqueue(struct proc *p, struct aiocb *job, int type) 1426 { 1427 struct kaioinfo *ki; 1428 1429 if (p->p_aioinfo == NULL) 1430 aio_init_aioinfo(p); 1431 1432 if (num_queue_count >= max_queue_count) 1433 return EAGAIN; 1434 1435 ki = p->p_aioinfo; 1436 if (ki->kaio_queue_count >= ki->kaio_qallowed_count) 1437 return EAGAIN; 1438 1439 return _aio_aqueue(p, job, NULL, type); 1440 } 1441 1442 /* 1443 * Support the aio_return system call, as a side-effect, kernel resources are 1444 * released. 
1445 */ 1446 int 1447 aio_return(struct proc *p, struct aio_return_args *uap) 1448 { 1449 #ifndef VFS_AIO 1450 return ENOSYS; 1451 #else 1452 int s; 1453 int jobref; 1454 struct aiocblist *cb, *ncb; 1455 struct aiocb *ujob; 1456 struct kaioinfo *ki; 1457 1458 ki = p->p_aioinfo; 1459 if (ki == NULL) 1460 return EINVAL; 1461 1462 ujob = uap->aiocbp; 1463 1464 jobref = fuword(&ujob->_aiocb_private.kernelinfo); 1465 if (jobref == -1 || jobref == 0) 1466 return EINVAL; 1467 1468 s = splnet(); 1469 for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb; cb = TAILQ_NEXT(cb, 1470 plist)) { 1471 if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == 1472 jobref) { 1473 splx(s); 1474 if (ujob == cb->uuaiocb) { 1475 p->p_retval[0] = 1476 cb->uaiocb._aiocb_private.status; 1477 } else 1478 p->p_retval[0] = EFAULT; 1479 if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) { 1480 curproc->p_stats->p_ru.ru_oublock += 1481 cb->outputcharge; 1482 cb->outputcharge = 0; 1483 } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) { 1484 curproc->p_stats->p_ru.ru_inblock += 1485 cb->inputcharge; 1486 cb->inputcharge = 0; 1487 } 1488 aio_free_entry(cb); 1489 return 0; 1490 } 1491 } 1492 splx(s); 1493 1494 s = splbio(); 1495 for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = ncb) { 1496 ncb = TAILQ_NEXT(cb, plist); 1497 if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) 1498 == jobref) { 1499 splx(s); 1500 if (ujob == cb->uuaiocb) { 1501 p->p_retval[0] = 1502 cb->uaiocb._aiocb_private.status; 1503 } else 1504 p->p_retval[0] = EFAULT; 1505 aio_free_entry(cb); 1506 return 0; 1507 } 1508 } 1509 splx(s); 1510 1511 return (EINVAL); 1512 #endif /* VFS_AIO */ 1513 } 1514 1515 /* 1516 * Allow a process to wakeup when any of the I/O requests are completed. 
1517 */ 1518 int 1519 aio_suspend(struct proc *p, struct aio_suspend_args *uap) 1520 { 1521 #ifndef VFS_AIO 1522 return ENOSYS; 1523 #else 1524 struct timeval atv; 1525 struct timespec ts; 1526 struct aiocb *const *cbptr, *cbp; 1527 struct kaioinfo *ki; 1528 struct aiocblist *cb; 1529 int i; 1530 int njoblist; 1531 int error, s, timo; 1532 int *ijoblist; 1533 struct aiocb **ujoblist; 1534 1535 if (uap->nent >= AIO_LISTIO_MAX) 1536 return EINVAL; 1537 1538 timo = 0; 1539 if (uap->timeout) { 1540 /* Get timespec struct. */ 1541 if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0) 1542 return error; 1543 1544 if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000) 1545 return (EINVAL); 1546 1547 TIMESPEC_TO_TIMEVAL(&atv, &ts); 1548 if (itimerfix(&atv)) 1549 return (EINVAL); 1550 timo = tvtohz(&atv); 1551 } 1552 1553 ki = p->p_aioinfo; 1554 if (ki == NULL) 1555 return EAGAIN; 1556 1557 njoblist = 0; 1558 ijoblist = zalloc(aiol_zone); 1559 ujoblist = zalloc(aiol_zone); 1560 cbptr = uap->aiocbp; 1561 1562 for (i = 0; i < uap->nent; i++) { 1563 cbp = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]); 1564 if (cbp == 0) 1565 continue; 1566 ujoblist[njoblist] = cbp; 1567 ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo); 1568 njoblist++; 1569 } 1570 1571 if (njoblist == 0) { 1572 zfree(aiol_zone, ijoblist); 1573 zfree(aiol_zone, ujoblist); 1574 return 0; 1575 } 1576 1577 error = 0; 1578 for (;;) { 1579 for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb; cb = 1580 TAILQ_NEXT(cb, plist)) { 1581 for (i = 0; i < njoblist; i++) { 1582 if (((intptr_t) 1583 cb->uaiocb._aiocb_private.kernelinfo) == 1584 ijoblist[i]) { 1585 if (ujoblist[i] != cb->uuaiocb) 1586 error = EINVAL; 1587 zfree(aiol_zone, ijoblist); 1588 zfree(aiol_zone, ujoblist); 1589 return error; 1590 } 1591 } 1592 } 1593 1594 s = splbio(); 1595 for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = 1596 TAILQ_NEXT(cb, plist)) { 1597 for (i = 0; i < njoblist; i++) { 1598 if (((intptr_t) 1599 
cb->uaiocb._aiocb_private.kernelinfo) == 1600 ijoblist[i]) { 1601 splx(s); 1602 if (ujoblist[i] != cb->uuaiocb) 1603 error = EINVAL; 1604 zfree(aiol_zone, ijoblist); 1605 zfree(aiol_zone, ujoblist); 1606 return error; 1607 } 1608 } 1609 } 1610 1611 ki->kaio_flags |= KAIO_WAKEUP; 1612 error = tsleep(p, PRIBIO | PCATCH, "aiospn", timo); 1613 splx(s); 1614 1615 if (error == ERESTART || error == EINTR) { 1616 zfree(aiol_zone, ijoblist); 1617 zfree(aiol_zone, ujoblist); 1618 return EINTR; 1619 } else if (error == EWOULDBLOCK) { 1620 zfree(aiol_zone, ijoblist); 1621 zfree(aiol_zone, ujoblist); 1622 return EAGAIN; 1623 } 1624 } 1625 1626 /* NOTREACHED */ 1627 return EINVAL; 1628 #endif /* VFS_AIO */ 1629 } 1630 1631 /* 1632 * aio_cancel cancels any non-physio aio operations not currently in 1633 * progress. 1634 */ 1635 int 1636 aio_cancel(struct proc *p, struct aio_cancel_args *uap) 1637 { 1638 #ifndef VFS_AIO 1639 return ENOSYS; 1640 #else 1641 struct kaioinfo *ki; 1642 struct aiocblist *cbe, *cbn; 1643 struct file *fp; 1644 struct filedesc *fdp; 1645 struct socket *so; 1646 struct proc *po; 1647 int s,error; 1648 int cancelled=0; 1649 int notcancelled=0; 1650 struct vnode *vp; 1651 1652 fdp = p->p_fd; 1653 1654 fp = fdp->fd_ofiles[uap->fd]; 1655 1656 if (fp == NULL) { 1657 return EBADF; 1658 } 1659 1660 if (fp->f_type == DTYPE_VNODE) { 1661 vp = (struct vnode *)fp->f_data; 1662 1663 if (vn_isdisk(vp,&error)) { 1664 p->p_retval[0] = AIO_NOTCANCELED; 1665 return 0; 1666 } 1667 } else if (fp->f_type == DTYPE_SOCKET) { 1668 so = (struct socket *)fp->f_data; 1669 1670 s = splnet(); 1671 1672 for (cbe = TAILQ_FIRST(&so->so_aiojobq); cbe; cbe = cbn) { 1673 cbn = TAILQ_NEXT(cbe, list); 1674 if ((uap->aiocbp == NULL) || 1675 (uap->aiocbp == cbe->uuaiocb) ) { 1676 po = cbe->userproc; 1677 ki = po->p_aioinfo; 1678 TAILQ_REMOVE(&so->so_aiojobq, cbe, list); 1679 TAILQ_REMOVE(&ki->kaio_sockqueue, cbe, plist); 1680 TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe, plist); 1681 if 
(ki->kaio_flags & KAIO_WAKEUP) { 1682 wakeup(po); 1683 } 1684 cbe->jobstate = JOBST_JOBFINISHED; 1685 cbe->uaiocb._aiocb_private.status=-1; 1686 cbe->uaiocb._aiocb_private.error=ECANCELED; 1687 cancelled++; 1688 /* XXX cancelled, knote? */ 1689 if (cbe->uaiocb.aio_sigevent.sigev_notify == 1690 SIGEV_SIGNAL) 1691 psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo); 1692 if (uap->aiocbp) 1693 break; 1694 } 1695 } 1696 1697 splx(s); 1698 1699 if ((cancelled) && (uap->aiocbp)) { 1700 p->p_retval[0] = AIO_CANCELED; 1701 return 0; 1702 } 1703 1704 } 1705 1706 ki=p->p_aioinfo; 1707 1708 s = splnet(); 1709 1710 for (cbe = TAILQ_FIRST(&ki->kaio_jobqueue); cbe; cbe = cbn) { 1711 cbn = TAILQ_NEXT(cbe, plist); 1712 1713 if ((uap->fd == cbe->uaiocb.aio_fildes) && 1714 ((uap->aiocbp == NULL ) || 1715 (uap->aiocbp == cbe->uuaiocb))) { 1716 1717 if (cbe->jobstate == JOBST_JOBQGLOBAL) { 1718 TAILQ_REMOVE(&aio_jobs, cbe, list); 1719 TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist); 1720 TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe, 1721 plist); 1722 cancelled++; 1723 ki->kaio_queue_finished_count++; 1724 cbe->jobstate = JOBST_JOBFINISHED; 1725 cbe->uaiocb._aiocb_private.status = -1; 1726 cbe->uaiocb._aiocb_private.error = ECANCELED; 1727 /* XXX cancelled, knote? */ 1728 if (cbe->uaiocb.aio_sigevent.sigev_notify == 1729 SIGEV_SIGNAL) 1730 psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo); 1731 } else { 1732 notcancelled++; 1733 } 1734 } 1735 } 1736 1737 splx(s); 1738 1739 1740 if (notcancelled) { 1741 p->p_retval[0] = AIO_NOTCANCELED; 1742 return 0; 1743 } 1744 1745 if (cancelled) { 1746 p->p_retval[0] = AIO_CANCELED; 1747 return 0; 1748 } 1749 1750 p->p_retval[0] = AIO_ALLDONE; 1751 1752 return 0; 1753 #endif /* VFS_AIO */ 1754 } 1755 1756 /* 1757 * aio_error is implemented in the kernel level for compatibility purposes only. 1758 * For a user mode async implementation, it would be best to do it in a userland 1759 * subroutine. 
1760 */ 1761 int 1762 aio_error(struct proc *p, struct aio_error_args *uap) 1763 { 1764 #ifndef VFS_AIO 1765 return ENOSYS; 1766 #else 1767 int s; 1768 struct aiocblist *cb; 1769 struct kaioinfo *ki; 1770 int jobref; 1771 1772 ki = p->p_aioinfo; 1773 if (ki == NULL) 1774 return EINVAL; 1775 1776 jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo); 1777 if ((jobref == -1) || (jobref == 0)) 1778 return EINVAL; 1779 1780 for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb; cb = TAILQ_NEXT(cb, 1781 plist)) { 1782 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == 1783 jobref) { 1784 p->p_retval[0] = cb->uaiocb._aiocb_private.error; 1785 return 0; 1786 } 1787 } 1788 1789 s = splnet(); 1790 1791 for (cb = TAILQ_FIRST(&ki->kaio_jobqueue); cb; cb = TAILQ_NEXT(cb, 1792 plist)) { 1793 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == 1794 jobref) { 1795 p->p_retval[0] = EINPROGRESS; 1796 splx(s); 1797 return 0; 1798 } 1799 } 1800 1801 for (cb = TAILQ_FIRST(&ki->kaio_sockqueue); cb; cb = TAILQ_NEXT(cb, 1802 plist)) { 1803 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == 1804 jobref) { 1805 p->p_retval[0] = EINPROGRESS; 1806 splx(s); 1807 return 0; 1808 } 1809 } 1810 splx(s); 1811 1812 s = splbio(); 1813 for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = TAILQ_NEXT(cb, 1814 plist)) { 1815 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == 1816 jobref) { 1817 p->p_retval[0] = cb->uaiocb._aiocb_private.error; 1818 splx(s); 1819 return 0; 1820 } 1821 } 1822 1823 for (cb = TAILQ_FIRST(&ki->kaio_bufqueue); cb; cb = TAILQ_NEXT(cb, 1824 plist)) { 1825 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == 1826 jobref) { 1827 p->p_retval[0] = EINPROGRESS; 1828 splx(s); 1829 return 0; 1830 } 1831 } 1832 splx(s); 1833 1834 #if (0) 1835 /* 1836 * Hack for lio. 
1837 */ 1838 status = fuword(&uap->aiocbp->_aiocb_private.status); 1839 if (status == -1) 1840 return fuword(&uap->aiocbp->_aiocb_private.error); 1841 #endif 1842 return EINVAL; 1843 #endif /* VFS_AIO */ 1844 } 1845 1846 int 1847 aio_read(struct proc *p, struct aio_read_args *uap) 1848 { 1849 #ifndef VFS_AIO 1850 return ENOSYS; 1851 #else 1852 struct filedesc *fdp; 1853 struct file *fp; 1854 struct uio auio; 1855 struct iovec aiov; 1856 unsigned int fd; 1857 int cnt; 1858 struct aiocb iocb; 1859 int error, pmodes; 1860 1861 pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes); 1862 if ((pmodes & AIO_PMODE_SYNC) == 0) 1863 return aio_aqueue(p, (struct aiocb *)uap->aiocbp, LIO_READ); 1864 1865 /* Get control block. */ 1866 if ((error = copyin((caddr_t)uap->aiocbp, (caddr_t)&iocb, sizeof iocb)) 1867 != 0) 1868 return error; 1869 1870 /* Get the fd info for process. */ 1871 fdp = p->p_fd; 1872 1873 /* 1874 * Range check file descriptor. 1875 */ 1876 fd = iocb.aio_fildes; 1877 if (fd >= fdp->fd_nfiles) 1878 return EBADF; 1879 fp = fdp->fd_ofiles[fd]; 1880 if ((fp == NULL) || ((fp->f_flag & FREAD) == 0)) 1881 return EBADF; 1882 if (iocb.aio_offset == -1LL) 1883 return EINVAL; 1884 1885 auio.uio_resid = iocb.aio_nbytes; 1886 if (auio.uio_resid < 0) 1887 return (EINVAL); 1888 1889 /* 1890 * Process sync simply -- queue async request. 
1891 */ 1892 if ((iocb._aiocb_private.privatemodes & AIO_PMODE_SYNC) == 0) 1893 return aio_aqueue(p, (struct aiocb *)uap->aiocbp, LIO_READ); 1894 1895 aiov.iov_base = (void *)iocb.aio_buf; 1896 aiov.iov_len = iocb.aio_nbytes; 1897 1898 auio.uio_iov = &aiov; 1899 auio.uio_iovcnt = 1; 1900 auio.uio_offset = iocb.aio_offset; 1901 auio.uio_rw = UIO_READ; 1902 auio.uio_segflg = UIO_USERSPACE; 1903 auio.uio_procp = p; 1904 1905 cnt = iocb.aio_nbytes; 1906 error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, p); 1907 if (error && (auio.uio_resid != cnt) && (error == ERESTART || error == 1908 EINTR || error == EWOULDBLOCK)) 1909 error = 0; 1910 cnt -= auio.uio_resid; 1911 p->p_retval[0] = cnt; 1912 return error; 1913 #endif /* VFS_AIO */ 1914 } 1915 1916 int 1917 aio_write(struct proc *p, struct aio_write_args *uap) 1918 { 1919 #ifndef VFS_AIO 1920 return ENOSYS; 1921 #else 1922 struct filedesc *fdp; 1923 struct file *fp; 1924 struct uio auio; 1925 struct iovec aiov; 1926 unsigned int fd; 1927 int cnt; 1928 struct aiocb iocb; 1929 int error; 1930 int pmodes; 1931 1932 /* 1933 * Process sync simply -- queue async request. 1934 */ 1935 pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes); 1936 if ((pmodes & AIO_PMODE_SYNC) == 0) 1937 return aio_aqueue(p, (struct aiocb *)uap->aiocbp, LIO_WRITE); 1938 1939 if ((error = copyin((caddr_t)uap->aiocbp, (caddr_t)&iocb, sizeof iocb)) 1940 != 0) 1941 return error; 1942 1943 /* Get the fd info for process. */ 1944 fdp = p->p_fd; 1945 1946 /* 1947 * Range check file descriptor. 
1948 */ 1949 fd = iocb.aio_fildes; 1950 if (fd >= fdp->fd_nfiles) 1951 return EBADF; 1952 fp = fdp->fd_ofiles[fd]; 1953 if ((fp == NULL) || ((fp->f_flag & FWRITE) == 0)) 1954 return EBADF; 1955 if (iocb.aio_offset == -1LL) 1956 return EINVAL; 1957 1958 aiov.iov_base = (void *)iocb.aio_buf; 1959 aiov.iov_len = iocb.aio_nbytes; 1960 auio.uio_iov = &aiov; 1961 auio.uio_iovcnt = 1; 1962 auio.uio_offset = iocb.aio_offset; 1963 1964 auio.uio_resid = iocb.aio_nbytes; 1965 if (auio.uio_resid < 0) 1966 return (EINVAL); 1967 1968 auio.uio_rw = UIO_WRITE; 1969 auio.uio_segflg = UIO_USERSPACE; 1970 auio.uio_procp = p; 1971 1972 cnt = iocb.aio_nbytes; 1973 error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, p); 1974 if (error) { 1975 if (auio.uio_resid != cnt) { 1976 if (error == ERESTART || error == EINTR || error == 1977 EWOULDBLOCK) 1978 error = 0; 1979 if (error == EPIPE) 1980 psignal(p, SIGPIPE); 1981 } 1982 } 1983 cnt -= auio.uio_resid; 1984 p->p_retval[0] = cnt; 1985 return error; 1986 #endif /* VFS_AIO */ 1987 } 1988 1989 int 1990 lio_listio(struct proc *p, struct lio_listio_args *uap) 1991 { 1992 #ifndef VFS_AIO 1993 return ENOSYS; 1994 #else 1995 int nent, nentqueued; 1996 struct aiocb *iocb, * const *cbptr; 1997 struct aiocblist *cb; 1998 struct kaioinfo *ki; 1999 struct aio_liojob *lj; 2000 int error, runningcode; 2001 int nerror; 2002 int i; 2003 int s; 2004 2005 if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) 2006 return EINVAL; 2007 2008 nent = uap->nent; 2009 if (nent > AIO_LISTIO_MAX) 2010 return EINVAL; 2011 2012 if (p->p_aioinfo == NULL) 2013 aio_init_aioinfo(p); 2014 2015 if ((nent + num_queue_count) > max_queue_count) 2016 return EAGAIN; 2017 2018 ki = p->p_aioinfo; 2019 if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count) 2020 return EAGAIN; 2021 2022 lj = zalloc(aiolio_zone); 2023 if (!lj) 2024 return EAGAIN; 2025 2026 lj->lioj_flags = 0; 2027 lj->lioj_buffer_count = 0; 2028 lj->lioj_buffer_finished_count = 0; 2029 lj->lioj_queue_count = 
0; 2030 lj->lioj_queue_finished_count = 0; 2031 lj->lioj_ki = ki; 2032 TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list); 2033 2034 /* 2035 * Setup signal. 2036 */ 2037 if (uap->sig && (uap->mode == LIO_NOWAIT)) { 2038 error = copyin(uap->sig, &lj->lioj_signal, 2039 sizeof(lj->lioj_signal)); 2040 if (error) 2041 return error; 2042 lj->lioj_flags |= LIOJ_SIGNAL; 2043 lj->lioj_flags &= ~LIOJ_SIGNAL_POSTED; 2044 } else 2045 lj->lioj_flags &= ~LIOJ_SIGNAL; 2046 2047 /* 2048 * Get pointers to the list of I/O requests. 2049 */ 2050 nerror = 0; 2051 nentqueued = 0; 2052 cbptr = uap->acb_list; 2053 for (i = 0; i < uap->nent; i++) { 2054 iocb = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]); 2055 if (((intptr_t)iocb != -1) && ((intptr_t)iocb != NULL)) { 2056 error = _aio_aqueue(p, iocb, lj, 0); 2057 if (error == 0) 2058 nentqueued++; 2059 else 2060 nerror++; 2061 } 2062 } 2063 2064 /* 2065 * If we haven't queued any, then just return error. 2066 */ 2067 if (nentqueued == 0) 2068 return 0; 2069 2070 /* 2071 * Calculate the appropriate error return. 2072 */ 2073 runningcode = 0; 2074 if (nerror) 2075 runningcode = EIO; 2076 2077 if (uap->mode == LIO_WAIT) { 2078 int command, found, jobref; 2079 2080 for (;;) { 2081 found = 0; 2082 for (i = 0; i < uap->nent; i++) { 2083 /* 2084 * Fetch address of the control buf pointer in 2085 * user space. 2086 */ 2087 iocb = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]); 2088 if (((intptr_t)iocb == -1) || ((intptr_t)iocb 2089 == 0)) 2090 continue; 2091 2092 /* 2093 * Fetch the associated command from user space. 
2094 */ 2095 command = fuword(&iocb->aio_lio_opcode); 2096 if (command == LIO_NOP) { 2097 found++; 2098 continue; 2099 } 2100 2101 jobref = fuword(&iocb->_aiocb_private.kernelinfo); 2102 2103 for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb; 2104 cb = TAILQ_NEXT(cb, plist)) { 2105 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) 2106 == jobref) { 2107 if (cb->uaiocb.aio_lio_opcode 2108 == LIO_WRITE) { 2109 curproc->p_stats->p_ru.ru_oublock 2110 += 2111 cb->outputcharge; 2112 cb->outputcharge = 0; 2113 } else if (cb->uaiocb.aio_lio_opcode 2114 == LIO_READ) { 2115 curproc->p_stats->p_ru.ru_inblock 2116 += cb->inputcharge; 2117 cb->inputcharge = 0; 2118 } 2119 found++; 2120 break; 2121 } 2122 } 2123 2124 s = splbio(); 2125 for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; 2126 cb = TAILQ_NEXT(cb, plist)) { 2127 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) 2128 == jobref) { 2129 found++; 2130 break; 2131 } 2132 } 2133 splx(s); 2134 } 2135 2136 /* 2137 * If all I/Os have been disposed of, then we can 2138 * return. 2139 */ 2140 if (found == nentqueued) 2141 return runningcode; 2142 2143 ki->kaio_flags |= KAIO_WAKEUP; 2144 error = tsleep(p, PRIBIO | PCATCH, "aiospn", 0); 2145 2146 if (error == EINTR) 2147 return EINTR; 2148 else if (error == EWOULDBLOCK) 2149 return EAGAIN; 2150 } 2151 } 2152 2153 return runningcode; 2154 #endif /* VFS_AIO */ 2155 } 2156 2157 /* 2158 * This is a wierd hack so that we can post a signal. It is safe to do so from 2159 * a timeout routine, but *not* from an interrupt routine. 
2160 */ 2161 static void 2162 process_signal(void *aioj) 2163 { 2164 struct aiocblist *aiocbe = aioj; 2165 struct aio_liojob *lj = aiocbe->lio; 2166 struct aiocb *cb = &aiocbe->uaiocb; 2167 2168 if ((lj) && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) && 2169 (lj->lioj_queue_count == lj->lioj_queue_finished_count)) { 2170 psignal(lj->lioj_ki->kaio_p, lj->lioj_signal.sigev_signo); 2171 lj->lioj_flags |= LIOJ_SIGNAL_POSTED; 2172 } 2173 2174 if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) 2175 psignal(aiocbe->userproc, cb->aio_sigevent.sigev_signo); 2176 } 2177 2178 /* 2179 * Interrupt handler for physio, performs the necessary process wakeups, and 2180 * signals. 2181 */ 2182 static void 2183 aio_physwakeup(struct buf *bp) 2184 { 2185 struct aiocblist *aiocbe; 2186 struct proc *p; 2187 struct kaioinfo *ki; 2188 struct aio_liojob *lj; 2189 int s; 2190 s = splbio(); 2191 2192 wakeup((caddr_t)bp); 2193 bp->b_flags |= B_DONE; 2194 2195 aiocbe = (struct aiocblist *)bp->b_spc; 2196 if (aiocbe) { 2197 p = bp->b_caller1; 2198 2199 aiocbe->jobstate = JOBST_JOBBFINISHED; 2200 aiocbe->uaiocb._aiocb_private.status -= bp->b_resid; 2201 aiocbe->uaiocb._aiocb_private.error = 0; 2202 aiocbe->jobflags |= AIOCBLIST_DONE; 2203 2204 if (bp->b_ioflags & BIO_ERROR) 2205 aiocbe->uaiocb._aiocb_private.error = bp->b_error; 2206 2207 lj = aiocbe->lio; 2208 if (lj) { 2209 lj->lioj_buffer_finished_count++; 2210 2211 /* 2212 * wakeup/signal if all of the interrupt jobs are done. 2213 */ 2214 if (lj->lioj_buffer_finished_count == 2215 lj->lioj_buffer_count) { 2216 /* 2217 * Post a signal if it is called for. 
2218 */ 2219 if ((lj->lioj_flags & 2220 (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == 2221 LIOJ_SIGNAL) { 2222 lj->lioj_flags |= LIOJ_SIGNAL_POSTED; 2223 timeout(process_signal, aiocbe, 0); 2224 } 2225 } 2226 } 2227 2228 ki = p->p_aioinfo; 2229 if (ki) { 2230 ki->kaio_buffer_finished_count++; 2231 TAILQ_REMOVE(&aio_bufjobs, aiocbe, list); 2232 TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist); 2233 TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist); 2234 2235 KNOTE(&aiocbe->klist, 0); 2236 /* Do the wakeup. */ 2237 if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) { 2238 ki->kaio_flags &= ~KAIO_WAKEUP; 2239 wakeup(p); 2240 } 2241 } 2242 2243 if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL) 2244 timeout(process_signal, aiocbe, 0); 2245 } 2246 splx(s); 2247 } 2248 2249 int 2250 aio_waitcomplete(struct proc *p, struct aio_waitcomplete_args *uap) 2251 { 2252 #ifndef VFS_AIO 2253 return ENOSYS; 2254 #else 2255 struct timeval atv; 2256 struct timespec ts; 2257 struct aiocb **cbptr; 2258 struct kaioinfo *ki; 2259 struct aiocblist *cb = NULL; 2260 int error, s, timo; 2261 2262 suword(uap->aiocbp, (int)NULL); 2263 2264 timo = 0; 2265 if (uap->timeout) { 2266 /* Get timespec struct. 
*/ 2267 error = copyin((caddr_t)uap->timeout, (caddr_t)&ts, 2268 sizeof(ts)); 2269 if (error) 2270 return error; 2271 2272 if ((ts.tv_nsec < 0) || (ts.tv_nsec >= 1000000000)) 2273 return (EINVAL); 2274 2275 TIMESPEC_TO_TIMEVAL(&atv, &ts); 2276 if (itimerfix(&atv)) 2277 return (EINVAL); 2278 timo = tvtohz(&atv); 2279 } 2280 2281 ki = p->p_aioinfo; 2282 if (ki == NULL) 2283 return EAGAIN; 2284 2285 cbptr = uap->aiocbp; 2286 2287 for (;;) { 2288 if ((cb = TAILQ_FIRST(&ki->kaio_jobdone)) != 0) { 2289 suword(uap->aiocbp, (int)cb->uuaiocb); 2290 p->p_retval[0] = cb->uaiocb._aiocb_private.status; 2291 if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) { 2292 curproc->p_stats->p_ru.ru_oublock += 2293 cb->outputcharge; 2294 cb->outputcharge = 0; 2295 } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) { 2296 curproc->p_stats->p_ru.ru_inblock += 2297 cb->inputcharge; 2298 cb->inputcharge = 0; 2299 } 2300 aio_free_entry(cb); 2301 return cb->uaiocb._aiocb_private.error; 2302 } 2303 2304 s = splbio(); 2305 if ((cb = TAILQ_FIRST(&ki->kaio_bufdone)) != 0 ) { 2306 splx(s); 2307 suword(uap->aiocbp, (int)cb->uuaiocb); 2308 p->p_retval[0] = cb->uaiocb._aiocb_private.status; 2309 aio_free_entry(cb); 2310 return cb->uaiocb._aiocb_private.error; 2311 } 2312 2313 ki->kaio_flags |= KAIO_WAKEUP; 2314 error = tsleep(p, PRIBIO | PCATCH, "aiowc", timo); 2315 splx(s); 2316 2317 if (error == ERESTART) 2318 return EINTR; 2319 else if (error < 0) 2320 return error; 2321 else if (error == EINTR) 2322 return EINTR; 2323 else if (error == EWOULDBLOCK) 2324 return EAGAIN; 2325 } 2326 #endif /* VFS_AIO */ 2327 } 2328 2329 static int 2330 filt_aioattach(struct knote *kn) 2331 { 2332 struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id; 2333 2334 /* 2335 * The aiocbe pointer must be validated before using it, so 2336 * registration is restricted to the kernel; the user cannot 2337 * set EV_FLAG1. 
2338 */ 2339 if ((kn->kn_flags & EV_FLAG1) == 0) 2340 return (EPERM); 2341 kn->kn_flags &= ~EV_FLAG1; 2342 2343 SLIST_INSERT_HEAD(&aiocbe->klist, kn, kn_selnext); 2344 2345 return (0); 2346 } 2347 2348 static void 2349 filt_aiodetach(struct knote *kn) 2350 { 2351 struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id; 2352 int s = splhigh(); /* XXX no clue, so overkill */ 2353 2354 SLIST_REMOVE(&aiocbe->klist, kn, knote, kn_selnext); 2355 splx(s); 2356 } 2357 2358 /*ARGSUSED*/ 2359 static int 2360 filt_aio(struct knote *kn, long hint) 2361 { 2362 struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id; 2363 2364 kn->kn_data = 0; /* XXX data returned? */ 2365 if (aiocbe->jobstate != JOBST_JOBFINISHED) 2366 return (0); 2367 kn->kn_flags |= EV_EOF; 2368 return (1); 2369 } 2370