1 /*- 2 * Copyright (c) 1997 John S. Dyson. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. John S. Dyson's name may not be used to endorse or promote products 10 * derived from this software without specific prior written permission. 11 * 12 * DISCLAIMER: This code isn't warranted to do anything useful. Anything 13 * bad that happens because of using this software isn't the responsibility 14 * of the author. This software is distributed AS-IS. 15 */ 16 17 /* 18 * This file contains support for the POSIX 1003.1B AIO/LIO facility. 19 */ 20 21 #include <sys/cdefs.h> 22 __FBSDID("$FreeBSD$"); 23 24 #include "opt_compat.h" 25 26 #include <sys/param.h> 27 #include <sys/systm.h> 28 #include <sys/malloc.h> 29 #include <sys/bio.h> 30 #include <sys/buf.h> 31 #include <sys/capsicum.h> 32 #include <sys/eventhandler.h> 33 #include <sys/sysproto.h> 34 #include <sys/filedesc.h> 35 #include <sys/kernel.h> 36 #include <sys/module.h> 37 #include <sys/kthread.h> 38 #include <sys/fcntl.h> 39 #include <sys/file.h> 40 #include <sys/limits.h> 41 #include <sys/lock.h> 42 #include <sys/mutex.h> 43 #include <sys/unistd.h> 44 #include <sys/posix4.h> 45 #include <sys/proc.h> 46 #include <sys/resourcevar.h> 47 #include <sys/signalvar.h> 48 #include <sys/protosw.h> 49 #include <sys/rwlock.h> 50 #include <sys/sema.h> 51 #include <sys/socket.h> 52 #include <sys/socketvar.h> 53 #include <sys/syscall.h> 54 #include <sys/sysent.h> 55 #include <sys/sysctl.h> 56 #include <sys/sx.h> 57 #include <sys/taskqueue.h> 58 #include <sys/vnode.h> 59 #include <sys/conf.h> 60 #include <sys/event.h> 61 #include <sys/mount.h> 62 #include <geom/geom.h> 63 64 #include <machine/atomic.h> 65 66 #include <vm/vm.h> 67 #include <vm/vm_page.h> 68 #include <vm/vm_extern.h> 69 #include <vm/pmap.h> 70 #include <vm/vm_map.h> 71 #include <vm/vm_object.h> 72 #include <vm/uma.h> 73 #include <sys/aio.h> 74 75 #include "opt_vfs_aio.h" 76 77 /* 78 * Counter for allocating reference ids to new jobs. Wrapped to 1 on 79 * overflow. (XXX will be removed soon.) 80 */ 81 static u_long jobrefid; 82 83 /* 84 * Counter for aio_fsync. 
85 */ 86 static uint64_t jobseqno; 87 88 #define JOBST_NULL 0 89 #define JOBST_JOBQSOCK 1 90 #define JOBST_JOBQGLOBAL 2 91 #define JOBST_JOBRUNNING 3 92 #define JOBST_JOBFINISHED 4 93 #define JOBST_JOBQBUF 5 94 #define JOBST_JOBQSYNC 6 95 96 #ifndef MAX_AIO_PER_PROC 97 #define MAX_AIO_PER_PROC 32 98 #endif 99 100 #ifndef MAX_AIO_QUEUE_PER_PROC 101 #define MAX_AIO_QUEUE_PER_PROC 256 /* Bigger than AIO_LISTIO_MAX */ 102 #endif 103 104 #ifndef MAX_AIO_PROCS 105 #define MAX_AIO_PROCS 32 106 #endif 107 108 #ifndef MAX_AIO_QUEUE 109 #define MAX_AIO_QUEUE 1024 /* Bigger than AIO_LISTIO_MAX */ 110 #endif 111 112 #ifndef TARGET_AIO_PROCS 113 #define TARGET_AIO_PROCS 4 114 #endif 115 116 #ifndef MAX_BUF_AIO 117 #define MAX_BUF_AIO 16 118 #endif 119 120 #ifndef AIOD_LIFETIME_DEFAULT 121 #define AIOD_LIFETIME_DEFAULT (30 * hz) 122 #endif 123 124 FEATURE(aio, "Asynchronous I/O"); 125 126 static MALLOC_DEFINE(M_LIO, "lio", "listio aio control block list"); 127 128 static SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "Async IO management"); 129 130 static int max_aio_procs = MAX_AIO_PROCS; 131 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs, 132 CTLFLAG_RW, &max_aio_procs, 0, 133 "Maximum number of kernel processes to use for handling async IO "); 134 135 static int num_aio_procs = 0; 136 SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs, 137 CTLFLAG_RD, &num_aio_procs, 0, 138 "Number of presently active kernel processes for async IO"); 139 140 /* 141 * The code will adjust the actual number of AIO processes towards this 142 * number when it gets a chance. 143 */ 144 static int target_aio_procs = TARGET_AIO_PROCS; 145 SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs, 146 0, "Preferred number of ready kernel processes for async IO"); 147 148 static int max_queue_count = MAX_AIO_QUEUE; 149 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0, 150 "Maximum number of aio requests to queue, globally"); 151 152 static int num_queue_count = 0; 153 SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0, 154 "Number of queued aio requests"); 155 156 static int num_buf_aio = 0; 157 SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0, 158 "Number of aio requests presently handled by the buf subsystem"); 159 160 /* Number of async I/O processes in the process of being started */ 161 /* XXX This should be local to aio_aqueue() */ 162 static int num_aio_resv_start = 0; 163 164 static int aiod_lifetime; 165 SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0, 166 "Maximum lifetime for idle aiod"); 167 168 static int unloadable = 0; 169 SYSCTL_INT(_vfs_aio, OID_AUTO, unloadable, CTLFLAG_RW, &unloadable, 0, 170 "Allow unload of aio (not recommended)"); 171 172 173 static int max_aio_per_proc = MAX_AIO_PER_PROC; 174 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc, 175 0, "Maximum active aio requests per process (stored in the process)"); 176 177 static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC; 178 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW, 179 &max_aio_queue_per_proc, 0, 180 "Maximum queued aio requests per process (stored in the process)"); 181 182 static int max_buf_aio = MAX_BUF_AIO; 183 SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0, 184 "Maximum buf aio requests per process (stored in the process)"); 185 186 typedef struct oaiocb { 187 int aio_fildes; /* File descriptor */ 188 off_t aio_offset; /* File offset 
for I/O */ 189 volatile void *aio_buf; /* I/O buffer in process space */ 190 size_t aio_nbytes; /* Number of bytes for I/O */ 191 struct osigevent aio_sigevent; /* Signal to deliver */ 192 int aio_lio_opcode; /* LIO opcode */ 193 int aio_reqprio; /* Request priority -- ignored */ 194 struct __aiocb_private _aiocb_private; 195 } oaiocb_t; 196 197 /* 198 * Below is a key of locks used to protect each member of struct aiocblist 199 * aioliojob and kaioinfo and any backends. 200 * 201 * * - need not protected 202 * a - locked by kaioinfo lock 203 * b - locked by backend lock, the backend lock can be null in some cases, 204 * for example, BIO belongs to this type, in this case, proc lock is 205 * reused. 206 * c - locked by aio_job_mtx, the lock for the generic file I/O backend. 207 */ 208 209 /* 210 * Current, there is only two backends: BIO and generic file I/O. 211 * socket I/O is served by generic file I/O, this is not a good idea, since 212 * disk file I/O and any other types without O_NONBLOCK flag can block daemon 213 * processes, if there is no thread to serve socket I/O, the socket I/O will be 214 * delayed too long or starved, we should create some processes dedicated to 215 * sockets to do non-blocking I/O, same for pipe and fifo, for these I/O 216 * systems we really need non-blocking interface, fiddling O_NONBLOCK in file 217 * structure is not safe because there is race between userland and aio 218 * daemons. 219 */ 220 221 struct aiocblist { 222 TAILQ_ENTRY(aiocblist) list; /* (b) internal list of for backend */ 223 TAILQ_ENTRY(aiocblist) plist; /* (a) list of jobs for each backend */ 224 TAILQ_ENTRY(aiocblist) allist; /* (a) list of all jobs in proc */ 225 int jobflags; /* (a) job flags */ 226 int jobstate; /* (b) job state */ 227 int inputcharge; /* (*) input blockes */ 228 int outputcharge; /* (*) output blockes */ 229 struct bio *bp; /* (*) BIO backend BIO pointer */ 230 struct buf *pbuf; /* (*) BIO backend buffer pointer */ 231 struct vm_page *pages[btoc(MAXPHYS)+1]; /* BIO backend pages */ 232 int npages; /* BIO backend number of pages */ 233 struct proc *userproc; /* (*) user process */ 234 struct ucred *cred; /* (*) active credential when created */ 235 struct file *fd_file; /* (*) pointer to file structure */ 236 struct aioliojob *lio; /* (*) optional lio job */ 237 struct aiocb *uuaiocb; /* (*) pointer in userspace of aiocb */ 238 struct knlist klist; /* (a) list of knotes */ 239 struct aiocb uaiocb; /* (*) kernel I/O control block */ 240 ksiginfo_t ksi; /* (a) realtime signal info */ 241 uint64_t seqno; /* (*) job number */ 242 int pending; /* (a) number of pending I/O, aio_fsync only */ 243 }; 244 245 /* jobflags */ 246 #define AIOCBLIST_DONE 0x01 247 #define AIOCBLIST_BUFDONE 0x02 248 #define AIOCBLIST_RUNDOWN 0x04 249 #define AIOCBLIST_CHECKSYNC 0x08 250 251 /* 252 * AIO process info 253 */ 254 #define AIOP_FREE 0x1 /* proc on free queue */ 255 256 struct aioproc { 257 int aioprocflags; /* (c) AIO proc flags */ 258 TAILQ_ENTRY(aioproc) list; /* (c) list of processes */ 259 struct proc *aioproc; /* (*) the AIO proc */ 260 }; 261 262 /* 263 * data-structure for lio signal management 264 */ 265 struct aioliojob { 266 int lioj_flags; /* (a) listio flags */ 267 int lioj_count; /* (a) listio flags */ 268 int lioj_finished_count; /* (a) listio flags */ 269 struct sigevent lioj_signal; /* (a) signal on all I/O done */ 270 TAILQ_ENTRY(aioliojob) lioj_list; /* (a) lio list */ 271 struct knlist klist; /* (a) list of knotes */ 272 ksiginfo_t lioj_ksi; /* (a) Realtime signal 
info */ 273 }; 274 275 #define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */ 276 #define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */ 277 #define LIOJ_KEVENT_POSTED 0x4 /* kevent triggered */ 278 279 /* 280 * per process aio data structure 281 */ 282 struct kaioinfo { 283 struct mtx kaio_mtx; /* the lock to protect this struct */ 284 int kaio_flags; /* (a) per process kaio flags */ 285 int kaio_maxactive_count; /* (*) maximum number of AIOs */ 286 int kaio_active_count; /* (c) number of currently used AIOs */ 287 int kaio_qallowed_count; /* (*) maxiumu size of AIO queue */ 288 int kaio_count; /* (a) size of AIO queue */ 289 int kaio_ballowed_count; /* (*) maximum number of buffers */ 290 int kaio_buffer_count; /* (a) number of physio buffers */ 291 TAILQ_HEAD(,aiocblist) kaio_all; /* (a) all AIOs in the process */ 292 TAILQ_HEAD(,aiocblist) kaio_done; /* (a) done queue for process */ 293 TAILQ_HEAD(,aioliojob) kaio_liojoblist; /* (a) list of lio jobs */ 294 TAILQ_HEAD(,aiocblist) kaio_jobqueue; /* (a) job queue for process */ 295 TAILQ_HEAD(,aiocblist) kaio_bufqueue; /* (a) buffer job queue for process */ 296 TAILQ_HEAD(,aiocblist) kaio_syncqueue; /* (a) queue for aio_fsync */ 297 struct task kaio_task; /* (*) task to kick aio processes */ 298 }; 299 300 #define AIO_LOCK(ki) mtx_lock(&(ki)->kaio_mtx) 301 #define AIO_UNLOCK(ki) mtx_unlock(&(ki)->kaio_mtx) 302 #define AIO_LOCK_ASSERT(ki, f) mtx_assert(&(ki)->kaio_mtx, (f)) 303 #define AIO_MTX(ki) (&(ki)->kaio_mtx) 304 305 #define KAIO_RUNDOWN 0x1 /* process is being run down */ 306 #define KAIO_WAKEUP 0x2 /* wakeup process when there is a significant event */ 307 308 /* 309 * Operations used to interact with userland aio control blocks. 310 * Different ABIs provide their own operations. 311 */ 312 struct aiocb_ops { 313 int (*copyin)(struct aiocb *ujob, struct aiocb *kjob); 314 long (*fetch_status)(struct aiocb *ujob); 315 long (*fetch_error)(struct aiocb *ujob); 316 int (*store_status)(struct aiocb *ujob, long status); 317 int (*store_error)(struct aiocb *ujob, long error); 318 int (*store_kernelinfo)(struct aiocb *ujob, long jobref); 319 int (*store_aiocb)(struct aiocb **ujobp, struct aiocb *ujob); 320 }; 321 322 static TAILQ_HEAD(,aioproc) aio_freeproc; /* (c) Idle daemons */ 323 static struct sema aio_newproc_sem; 324 static struct mtx aio_job_mtx; 325 static TAILQ_HEAD(,aiocblist) aio_jobs; /* (c) Async job list */ 326 static struct unrhdr *aiod_unr; 327 328 void aio_init_aioinfo(struct proc *p); 329 static int aio_onceonly(void); 330 static int aio_free_entry(struct aiocblist *aiocbe); 331 static void aio_process_rw(struct aiocblist *aiocbe); 332 static void aio_process_sync(struct aiocblist *aiocbe); 333 static void aio_process_mlock(struct aiocblist *aiocbe); 334 static int aio_newproc(int *); 335 int aio_aqueue(struct thread *td, struct aiocb *job, 336 struct aioliojob *lio, int type, struct aiocb_ops *ops); 337 static void aio_physwakeup(struct bio *bp); 338 static void aio_proc_rundown(void *arg, struct proc *p); 339 static void aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp); 340 static int aio_qphysio(struct proc *p, struct aiocblist *iocb); 341 static void aio_daemon(void *param); 342 static void aio_swake_cb(struct socket *, struct sockbuf *); 343 static int aio_unload(void); 344 static void aio_bio_done_notify(struct proc *userp, struct aiocblist *aiocbe, int type); 345 #define DONE_BUF 1 346 #define DONE_QUEUE 2 347 static int aio_kick(struct proc *userp); 348 static void 
aio_kick_nowait(struct proc *userp); 349 static void aio_kick_helper(void *context, int pending); 350 static int filt_aioattach(struct knote *kn); 351 static void filt_aiodetach(struct knote *kn); 352 static int filt_aio(struct knote *kn, long hint); 353 static int filt_lioattach(struct knote *kn); 354 static void filt_liodetach(struct knote *kn); 355 static int filt_lio(struct knote *kn, long hint); 356 357 /* 358 * Zones for: 359 * kaio Per process async io info 360 * aiop async io process data 361 * aiocb async io jobs 362 * aiol list io job pointer - internal to aio_suspend XXX 363 * aiolio list io jobs 364 */ 365 static uma_zone_t kaio_zone, aiop_zone, aiocb_zone, aiol_zone, aiolio_zone; 366 367 /* kqueue filters for aio */ 368 static struct filterops aio_filtops = { 369 .f_isfd = 0, 370 .f_attach = filt_aioattach, 371 .f_detach = filt_aiodetach, 372 .f_event = filt_aio, 373 }; 374 static struct filterops lio_filtops = { 375 .f_isfd = 0, 376 .f_attach = filt_lioattach, 377 .f_detach = filt_liodetach, 378 .f_event = filt_lio 379 }; 380 381 static eventhandler_tag exit_tag, exec_tag; 382 383 TASKQUEUE_DEFINE_THREAD(aiod_kick); 384 385 /* 386 * Main operations function for use as a kernel module. 387 */ 388 static int 389 aio_modload(struct module *module, int cmd, void *arg) 390 { 391 int error = 0; 392 393 switch (cmd) { 394 case MOD_LOAD: 395 aio_onceonly(); 396 break; 397 case MOD_UNLOAD: 398 error = aio_unload(); 399 break; 400 case MOD_SHUTDOWN: 401 break; 402 default: 403 error = EINVAL; 404 break; 405 } 406 return (error); 407 } 408 409 static moduledata_t aio_mod = { 410 "aio", 411 &aio_modload, 412 NULL 413 }; 414 415 static struct syscall_helper_data aio_syscalls[] = { 416 SYSCALL_INIT_HELPER(aio_cancel), 417 SYSCALL_INIT_HELPER(aio_error), 418 SYSCALL_INIT_HELPER(aio_fsync), 419 SYSCALL_INIT_HELPER(aio_mlock), 420 SYSCALL_INIT_HELPER(aio_read), 421 SYSCALL_INIT_HELPER(aio_return), 422 SYSCALL_INIT_HELPER(aio_suspend), 423 SYSCALL_INIT_HELPER(aio_waitcomplete), 424 SYSCALL_INIT_HELPER(aio_write), 425 SYSCALL_INIT_HELPER(lio_listio), 426 SYSCALL_INIT_HELPER(oaio_read), 427 SYSCALL_INIT_HELPER(oaio_write), 428 SYSCALL_INIT_HELPER(olio_listio), 429 SYSCALL_INIT_LAST 430 }; 431 432 #ifdef COMPAT_FREEBSD32 433 #include <sys/mount.h> 434 #include <sys/socket.h> 435 #include <compat/freebsd32/freebsd32.h> 436 #include <compat/freebsd32/freebsd32_proto.h> 437 #include <compat/freebsd32/freebsd32_signal.h> 438 #include <compat/freebsd32/freebsd32_syscall.h> 439 #include <compat/freebsd32/freebsd32_util.h> 440 441 static struct syscall_helper_data aio32_syscalls[] = { 442 SYSCALL32_INIT_HELPER(freebsd32_aio_return), 443 SYSCALL32_INIT_HELPER(freebsd32_aio_suspend), 444 SYSCALL32_INIT_HELPER(freebsd32_aio_cancel), 445 SYSCALL32_INIT_HELPER(freebsd32_aio_error), 446 SYSCALL32_INIT_HELPER(freebsd32_aio_fsync), 447 SYSCALL32_INIT_HELPER(freebsd32_aio_mlock), 448 SYSCALL32_INIT_HELPER(freebsd32_aio_read), 449 SYSCALL32_INIT_HELPER(freebsd32_aio_write), 450 SYSCALL32_INIT_HELPER(freebsd32_aio_waitcomplete), 451 SYSCALL32_INIT_HELPER(freebsd32_lio_listio), 452 SYSCALL32_INIT_HELPER(freebsd32_oaio_read), 453 SYSCALL32_INIT_HELPER(freebsd32_oaio_write), 454 SYSCALL32_INIT_HELPER(freebsd32_olio_listio), 455 SYSCALL_INIT_LAST 456 }; 457 #endif 458 459 DECLARE_MODULE(aio, aio_mod, 460 SI_SUB_VFS, SI_ORDER_ANY); 461 MODULE_VERSION(aio, 1); 462 463 /* 464 * Startup initialization 465 */ 466 static int 467 aio_onceonly(void) 468 { 469 int error; 470 471 /* XXX: should probably just use so->callback */ 
472 aio_swake = &aio_swake_cb; 473 exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL, 474 EVENTHANDLER_PRI_ANY); 475 exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown_exec, NULL, 476 EVENTHANDLER_PRI_ANY); 477 kqueue_add_filteropts(EVFILT_AIO, &aio_filtops); 478 kqueue_add_filteropts(EVFILT_LIO, &lio_filtops); 479 TAILQ_INIT(&aio_freeproc); 480 sema_init(&aio_newproc_sem, 0, "aio_new_proc"); 481 mtx_init(&aio_job_mtx, "aio_job", NULL, MTX_DEF); 482 TAILQ_INIT(&aio_jobs); 483 aiod_unr = new_unrhdr(1, INT_MAX, NULL); 484 kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL, 485 NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 486 aiop_zone = uma_zcreate("AIOP", sizeof(struct aioproc), NULL, 487 NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 488 aiocb_zone = uma_zcreate("AIOCB", sizeof(struct aiocblist), NULL, NULL, 489 NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 490 aiol_zone = uma_zcreate("AIOL", AIO_LISTIO_MAX*sizeof(intptr_t) , NULL, 491 NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 492 aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aioliojob), NULL, 493 NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 494 aiod_lifetime = AIOD_LIFETIME_DEFAULT; 495 jobrefid = 1; 496 async_io_version = _POSIX_VERSION; 497 p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, AIO_LISTIO_MAX); 498 p31b_setcfg(CTL_P1003_1B_AIO_MAX, MAX_AIO_QUEUE); 499 p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, 0); 500 501 error = syscall_helper_register(aio_syscalls, SY_THR_STATIC_KLD); 502 if (error) 503 return (error); 504 #ifdef COMPAT_FREEBSD32 505 error = syscall32_helper_register(aio32_syscalls, SY_THR_STATIC_KLD); 506 if (error) 507 return (error); 508 #endif 509 return (0); 510 } 511 512 /* 513 * Callback for unload of AIO when used as a module. 514 */ 515 static int 516 aio_unload(void) 517 { 518 int error; 519 520 /* 521 * XXX: no unloads by default, it's too dangerous. 522 * perhaps we could do it if locked out callers and then 523 * did an aio_proc_rundown() on each process. 524 * 525 * jhb: aio_proc_rundown() needs to run on curproc though, 526 * so I don't think that would fly. 527 */ 528 if (!unloadable) 529 return (EOPNOTSUPP); 530 531 #ifdef COMPAT_FREEBSD32 532 syscall32_helper_unregister(aio32_syscalls); 533 #endif 534 syscall_helper_unregister(aio_syscalls); 535 536 error = kqueue_del_filteropts(EVFILT_AIO); 537 if (error) 538 return error; 539 error = kqueue_del_filteropts(EVFILT_LIO); 540 if (error) 541 return error; 542 async_io_version = 0; 543 aio_swake = NULL; 544 taskqueue_free(taskqueue_aiod_kick); 545 delete_unrhdr(aiod_unr); 546 uma_zdestroy(kaio_zone); 547 uma_zdestroy(aiop_zone); 548 uma_zdestroy(aiocb_zone); 549 uma_zdestroy(aiol_zone); 550 uma_zdestroy(aiolio_zone); 551 EVENTHANDLER_DEREGISTER(process_exit, exit_tag); 552 EVENTHANDLER_DEREGISTER(process_exec, exec_tag); 553 mtx_destroy(&aio_job_mtx); 554 sema_destroy(&aio_newproc_sem); 555 p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, -1); 556 p31b_setcfg(CTL_P1003_1B_AIO_MAX, -1); 557 p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, -1); 558 return (0); 559 } 560 561 /* 562 * Init the per-process aioinfo structure. The aioinfo limits are set 563 * per-process for user limit (resource) management. 
564 */ 565 void 566 aio_init_aioinfo(struct proc *p) 567 { 568 struct kaioinfo *ki; 569 570 ki = uma_zalloc(kaio_zone, M_WAITOK); 571 mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF | MTX_NEW); 572 ki->kaio_flags = 0; 573 ki->kaio_maxactive_count = max_aio_per_proc; 574 ki->kaio_active_count = 0; 575 ki->kaio_qallowed_count = max_aio_queue_per_proc; 576 ki->kaio_count = 0; 577 ki->kaio_ballowed_count = max_buf_aio; 578 ki->kaio_buffer_count = 0; 579 TAILQ_INIT(&ki->kaio_all); 580 TAILQ_INIT(&ki->kaio_done); 581 TAILQ_INIT(&ki->kaio_jobqueue); 582 TAILQ_INIT(&ki->kaio_bufqueue); 583 TAILQ_INIT(&ki->kaio_liojoblist); 584 TAILQ_INIT(&ki->kaio_syncqueue); 585 TASK_INIT(&ki->kaio_task, 0, aio_kick_helper, p); 586 PROC_LOCK(p); 587 if (p->p_aioinfo == NULL) { 588 p->p_aioinfo = ki; 589 PROC_UNLOCK(p); 590 } else { 591 PROC_UNLOCK(p); 592 mtx_destroy(&ki->kaio_mtx); 593 uma_zfree(kaio_zone, ki); 594 } 595 596 while (num_aio_procs < MIN(target_aio_procs, max_aio_procs)) 597 aio_newproc(NULL); 598 } 599 600 static int 601 aio_sendsig(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi) 602 { 603 struct thread *td; 604 int error; 605 606 error = sigev_findtd(p, sigev, &td); 607 if (error) 608 return (error); 609 if (!KSI_ONQ(ksi)) { 610 ksiginfo_set_sigev(ksi, sigev); 611 ksi->ksi_code = SI_ASYNCIO; 612 ksi->ksi_flags |= KSI_EXT | KSI_INS; 613 tdsendsignal(p, td, ksi->ksi_signo, ksi); 614 } 615 PROC_UNLOCK(p); 616 return (error); 617 } 618 619 /* 620 * Free a job entry. Wait for completion if it is currently active, but don't 621 * delay forever. If we delay, we return a flag that says that we have to 622 * restart the queue scan. 623 */ 624 static int 625 aio_free_entry(struct aiocblist *aiocbe) 626 { 627 struct kaioinfo *ki; 628 struct aioliojob *lj; 629 struct proc *p; 630 631 p = aiocbe->userproc; 632 MPASS(curproc == p); 633 ki = p->p_aioinfo; 634 MPASS(ki != NULL); 635 636 AIO_LOCK_ASSERT(ki, MA_OWNED); 637 MPASS(aiocbe->jobstate == JOBST_JOBFINISHED); 638 639 atomic_subtract_int(&num_queue_count, 1); 640 641 ki->kaio_count--; 642 MPASS(ki->kaio_count >= 0); 643 644 TAILQ_REMOVE(&ki->kaio_done, aiocbe, plist); 645 TAILQ_REMOVE(&ki->kaio_all, aiocbe, allist); 646 647 lj = aiocbe->lio; 648 if (lj) { 649 lj->lioj_count--; 650 lj->lioj_finished_count--; 651 652 if (lj->lioj_count == 0) { 653 TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); 654 /* lio is going away, we need to destroy any knotes */ 655 knlist_delete(&lj->klist, curthread, 1); 656 PROC_LOCK(p); 657 sigqueue_take(&lj->lioj_ksi); 658 PROC_UNLOCK(p); 659 uma_zfree(aiolio_zone, lj); 660 } 661 } 662 663 /* aiocbe is going away, we need to destroy any knotes */ 664 knlist_delete(&aiocbe->klist, curthread, 1); 665 PROC_LOCK(p); 666 sigqueue_take(&aiocbe->ksi); 667 PROC_UNLOCK(p); 668 669 MPASS(aiocbe->bp == NULL); 670 aiocbe->jobstate = JOBST_NULL; 671 AIO_UNLOCK(ki); 672 673 /* 674 * The thread argument here is used to find the owning process 675 * and is also passed to fo_close() which may pass it to various 676 * places such as devsw close() routines. Because of that, we 677 * need a thread pointer from the process owning the job that is 678 * persistent and won't disappear out from under us or move to 679 * another process. 680 * 681 * Currently, all the callers of this function call it to remove 682 * an aiocblist from the current process' job list either via a 683 * syscall or due to the current process calling exit() or 684 * execve(). Thus, we know that p == curproc. 
We also know that 685 * curthread can't exit since we are curthread. 686 * 687 * Therefore, we use curthread as the thread to pass to 688 * knlist_delete(). This does mean that it is possible for the 689 * thread pointer at close time to differ from the thread pointer 690 * at open time, but this is already true of file descriptors in 691 * a multithreaded process. 692 */ 693 if (aiocbe->fd_file) 694 fdrop(aiocbe->fd_file, curthread); 695 crfree(aiocbe->cred); 696 uma_zfree(aiocb_zone, aiocbe); 697 AIO_LOCK(ki); 698 699 return (0); 700 } 701 702 static void 703 aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp __unused) 704 { 705 aio_proc_rundown(arg, p); 706 } 707 708 /* 709 * Rundown the jobs for a given process. 710 */ 711 static void 712 aio_proc_rundown(void *arg, struct proc *p) 713 { 714 struct kaioinfo *ki; 715 struct aioliojob *lj; 716 struct aiocblist *cbe, *cbn; 717 struct file *fp; 718 struct socket *so; 719 int remove; 720 721 KASSERT(curthread->td_proc == p, 722 ("%s: called on non-curproc", __func__)); 723 ki = p->p_aioinfo; 724 if (ki == NULL) 725 return; 726 727 AIO_LOCK(ki); 728 ki->kaio_flags |= KAIO_RUNDOWN; 729 730 restart: 731 732 /* 733 * Try to cancel all pending requests. This code simulates 734 * aio_cancel on all pending I/O requests. 735 */ 736 TAILQ_FOREACH_SAFE(cbe, &ki->kaio_jobqueue, plist, cbn) { 737 remove = 0; 738 mtx_lock(&aio_job_mtx); 739 if (cbe->jobstate == JOBST_JOBQGLOBAL) { 740 TAILQ_REMOVE(&aio_jobs, cbe, list); 741 remove = 1; 742 } else if (cbe->jobstate == JOBST_JOBQSOCK) { 743 fp = cbe->fd_file; 744 MPASS(fp->f_type == DTYPE_SOCKET); 745 so = fp->f_data; 746 TAILQ_REMOVE(&so->so_aiojobq, cbe, list); 747 remove = 1; 748 } else if (cbe->jobstate == JOBST_JOBQSYNC) { 749 TAILQ_REMOVE(&ki->kaio_syncqueue, cbe, list); 750 remove = 1; 751 } 752 mtx_unlock(&aio_job_mtx); 753 754 if (remove) { 755 cbe->jobstate = JOBST_JOBFINISHED; 756 cbe->uaiocb._aiocb_private.status = -1; 757 cbe->uaiocb._aiocb_private.error = ECANCELED; 758 TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist); 759 aio_bio_done_notify(p, cbe, DONE_QUEUE); 760 } 761 } 762 763 /* Wait for all running I/O to be finished */ 764 if (TAILQ_FIRST(&ki->kaio_bufqueue) || 765 TAILQ_FIRST(&ki->kaio_jobqueue)) { 766 ki->kaio_flags |= KAIO_WAKEUP; 767 msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO, "aioprn", hz); 768 goto restart; 769 } 770 771 /* Free all completed I/O requests. */ 772 while ((cbe = TAILQ_FIRST(&ki->kaio_done)) != NULL) 773 aio_free_entry(cbe); 774 775 while ((lj = TAILQ_FIRST(&ki->kaio_liojoblist)) != NULL) { 776 if (lj->lioj_count == 0) { 777 TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); 778 knlist_delete(&lj->klist, curthread, 1); 779 PROC_LOCK(p); 780 sigqueue_take(&lj->lioj_ksi); 781 PROC_UNLOCK(p); 782 uma_zfree(aiolio_zone, lj); 783 } else { 784 panic("LIO job not cleaned up: C:%d, FC:%d\n", 785 lj->lioj_count, lj->lioj_finished_count); 786 } 787 } 788 AIO_UNLOCK(ki); 789 taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_task); 790 mtx_destroy(&ki->kaio_mtx); 791 uma_zfree(kaio_zone, ki); 792 p->p_aioinfo = NULL; 793 } 794 795 /* 796 * Select a job to run (called by an AIO daemon). 
797 */ 798 static struct aiocblist * 799 aio_selectjob(struct aioproc *aiop) 800 { 801 struct aiocblist *aiocbe; 802 struct kaioinfo *ki; 803 struct proc *userp; 804 805 mtx_assert(&aio_job_mtx, MA_OWNED); 806 TAILQ_FOREACH(aiocbe, &aio_jobs, list) { 807 userp = aiocbe->userproc; 808 ki = userp->p_aioinfo; 809 810 if (ki->kaio_active_count < ki->kaio_maxactive_count) { 811 TAILQ_REMOVE(&aio_jobs, aiocbe, list); 812 /* Account for currently active jobs. */ 813 ki->kaio_active_count++; 814 aiocbe->jobstate = JOBST_JOBRUNNING; 815 break; 816 } 817 } 818 return (aiocbe); 819 } 820 821 /* 822 * Move all data to a permanent storage device, this code 823 * simulates fsync syscall. 824 */ 825 static int 826 aio_fsync_vnode(struct thread *td, struct vnode *vp) 827 { 828 struct mount *mp; 829 int error; 830 831 if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) 832 goto drop; 833 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 834 if (vp->v_object != NULL) { 835 VM_OBJECT_WLOCK(vp->v_object); 836 vm_object_page_clean(vp->v_object, 0, 0, 0); 837 VM_OBJECT_WUNLOCK(vp->v_object); 838 } 839 error = VOP_FSYNC(vp, MNT_WAIT, td); 840 841 VOP_UNLOCK(vp, 0); 842 vn_finished_write(mp); 843 drop: 844 return (error); 845 } 846 847 /* 848 * The AIO processing activity for LIO_READ/LIO_WRITE. This is the code that 849 * does the I/O request for the non-physio version of the operations. The 850 * normal vn operations are used, and this code should work in all instances 851 * for every type of file, including pipes, sockets, fifos, and regular files. 852 * 853 * XXX I don't think it works well for socket, pipe, and fifo. 854 */ 855 static void 856 aio_process_rw(struct aiocblist *aiocbe) 857 { 858 struct ucred *td_savedcred; 859 struct thread *td; 860 struct aiocb *cb; 861 struct file *fp; 862 struct socket *so; 863 struct uio auio; 864 struct iovec aiov; 865 int cnt; 866 int error; 867 int oublock_st, oublock_end; 868 int inblock_st, inblock_end; 869 870 KASSERT(aiocbe->uaiocb.aio_lio_opcode == LIO_READ || 871 aiocbe->uaiocb.aio_lio_opcode == LIO_WRITE, 872 ("%s: opcode %d", __func__, aiocbe->uaiocb.aio_lio_opcode)); 873 874 td = curthread; 875 td_savedcred = td->td_ucred; 876 td->td_ucred = aiocbe->cred; 877 cb = &aiocbe->uaiocb; 878 fp = aiocbe->fd_file; 879 880 aiov.iov_base = (void *)(uintptr_t)cb->aio_buf; 881 aiov.iov_len = cb->aio_nbytes; 882 883 auio.uio_iov = &aiov; 884 auio.uio_iovcnt = 1; 885 auio.uio_offset = cb->aio_offset; 886 auio.uio_resid = cb->aio_nbytes; 887 cnt = cb->aio_nbytes; 888 auio.uio_segflg = UIO_USERSPACE; 889 auio.uio_td = td; 890 891 inblock_st = td->td_ru.ru_inblock; 892 oublock_st = td->td_ru.ru_oublock; 893 /* 894 * aio_aqueue() acquires a reference to the file that is 895 * released in aio_free_entry(). 
896 */ 897 if (cb->aio_lio_opcode == LIO_READ) { 898 auio.uio_rw = UIO_READ; 899 if (auio.uio_resid == 0) 900 error = 0; 901 else 902 error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td); 903 } else { 904 if (fp->f_type == DTYPE_VNODE) 905 bwillwrite(); 906 auio.uio_rw = UIO_WRITE; 907 error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td); 908 } 909 inblock_end = td->td_ru.ru_inblock; 910 oublock_end = td->td_ru.ru_oublock; 911 912 aiocbe->inputcharge = inblock_end - inblock_st; 913 aiocbe->outputcharge = oublock_end - oublock_st; 914 915 if ((error) && (auio.uio_resid != cnt)) { 916 if (error == ERESTART || error == EINTR || error == EWOULDBLOCK) 917 error = 0; 918 if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) { 919 int sigpipe = 1; 920 if (fp->f_type == DTYPE_SOCKET) { 921 so = fp->f_data; 922 if (so->so_options & SO_NOSIGPIPE) 923 sigpipe = 0; 924 } 925 if (sigpipe) { 926 PROC_LOCK(aiocbe->userproc); 927 kern_psignal(aiocbe->userproc, SIGPIPE); 928 PROC_UNLOCK(aiocbe->userproc); 929 } 930 } 931 } 932 933 cnt -= auio.uio_resid; 934 cb->_aiocb_private.error = error; 935 cb->_aiocb_private.status = cnt; 936 td->td_ucred = td_savedcred; 937 } 938 939 static void 940 aio_process_sync(struct aiocblist *aiocbe) 941 { 942 struct thread *td = curthread; 943 struct ucred *td_savedcred = td->td_ucred; 944 struct aiocb *cb = &aiocbe->uaiocb; 945 struct file *fp = aiocbe->fd_file; 946 int error = 0; 947 948 KASSERT(aiocbe->uaiocb.aio_lio_opcode == LIO_SYNC, 949 ("%s: opcode %d", __func__, aiocbe->uaiocb.aio_lio_opcode)); 950 951 td->td_ucred = aiocbe->cred; 952 if (fp->f_vnode != NULL) 953 error = aio_fsync_vnode(td, fp->f_vnode); 954 cb->_aiocb_private.error = error; 955 cb->_aiocb_private.status = 0; 956 td->td_ucred = td_savedcred; 957 } 958 959 static void 960 aio_process_mlock(struct aiocblist *aiocbe) 961 { 962 struct aiocb *cb = &aiocbe->uaiocb; 963 int error; 964 965 KASSERT(aiocbe->uaiocb.aio_lio_opcode == LIO_MLOCK, 966 ("%s: opcode %d", __func__, aiocbe->uaiocb.aio_lio_opcode)); 967 968 error = vm_mlock(aiocbe->userproc, aiocbe->cred, 969 __DEVOLATILE(void *, cb->aio_buf), cb->aio_nbytes); 970 cb->_aiocb_private.error = error; 971 cb->_aiocb_private.status = 0; 972 } 973 974 static void 975 aio_bio_done_notify(struct proc *userp, struct aiocblist *aiocbe, int type) 976 { 977 struct aioliojob *lj; 978 struct kaioinfo *ki; 979 struct aiocblist *scb, *scbn; 980 int lj_done; 981 982 ki = userp->p_aioinfo; 983 AIO_LOCK_ASSERT(ki, MA_OWNED); 984 lj = aiocbe->lio; 985 lj_done = 0; 986 if (lj) { 987 lj->lioj_finished_count++; 988 if (lj->lioj_count == lj->lioj_finished_count) 989 lj_done = 1; 990 } 991 if (type == DONE_QUEUE) { 992 aiocbe->jobflags |= AIOCBLIST_DONE; 993 } else { 994 aiocbe->jobflags |= AIOCBLIST_BUFDONE; 995 } 996 TAILQ_INSERT_TAIL(&ki->kaio_done, aiocbe, plist); 997 aiocbe->jobstate = JOBST_JOBFINISHED; 998 999 if (ki->kaio_flags & KAIO_RUNDOWN) 1000 goto notification_done; 1001 1002 if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL || 1003 aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) 1004 aio_sendsig(userp, &aiocbe->uaiocb.aio_sigevent, &aiocbe->ksi); 1005 1006 KNOTE_LOCKED(&aiocbe->klist, 1); 1007 1008 if (lj_done) { 1009 if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { 1010 lj->lioj_flags |= LIOJ_KEVENT_POSTED; 1011 KNOTE_LOCKED(&lj->klist, 1); 1012 } 1013 if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) 1014 == LIOJ_SIGNAL 1015 && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || 1016 lj->lioj_signal.sigev_notify == 
SIGEV_THREAD_ID)) { 1017 aio_sendsig(userp, &lj->lioj_signal, &lj->lioj_ksi); 1018 lj->lioj_flags |= LIOJ_SIGNAL_POSTED; 1019 } 1020 } 1021 1022 notification_done: 1023 if (aiocbe->jobflags & AIOCBLIST_CHECKSYNC) { 1024 TAILQ_FOREACH_SAFE(scb, &ki->kaio_syncqueue, list, scbn) { 1025 if (aiocbe->fd_file == scb->fd_file && 1026 aiocbe->seqno < scb->seqno) { 1027 if (--scb->pending == 0) { 1028 mtx_lock(&aio_job_mtx); 1029 scb->jobstate = JOBST_JOBQGLOBAL; 1030 TAILQ_REMOVE(&ki->kaio_syncqueue, scb, list); 1031 TAILQ_INSERT_TAIL(&aio_jobs, scb, list); 1032 aio_kick_nowait(userp); 1033 mtx_unlock(&aio_job_mtx); 1034 } 1035 } 1036 } 1037 } 1038 if (ki->kaio_flags & KAIO_WAKEUP) { 1039 ki->kaio_flags &= ~KAIO_WAKEUP; 1040 wakeup(&userp->p_aioinfo); 1041 } 1042 } 1043 1044 static void 1045 aio_switch_vmspace(struct aiocblist *aiocbe) 1046 { 1047 1048 vmspace_switch_aio(aiocbe->userproc->p_vmspace); 1049 } 1050 1051 /* 1052 * The AIO daemon, most of the actual work is done in aio_process_*, 1053 * but the setup (and address space mgmt) is done in this routine. 1054 */ 1055 static void 1056 aio_daemon(void *_id) 1057 { 1058 struct aiocblist *aiocbe; 1059 struct aioproc *aiop; 1060 struct kaioinfo *ki; 1061 struct proc *p, *userp; 1062 struct vmspace *myvm; 1063 struct thread *td = curthread; 1064 int id = (intptr_t)_id; 1065 1066 /* 1067 * Grab an extra reference on the daemon's vmspace so that it 1068 * doesn't get freed by jobs that switch to a different 1069 * vmspace. 1070 */ 1071 p = td->td_proc; 1072 myvm = vmspace_acquire_ref(p); 1073 1074 KASSERT(p->p_textvp == NULL, ("kthread has a textvp")); 1075 1076 /* 1077 * Allocate and ready the aio control info. There is one aiop structure 1078 * per daemon. 1079 */ 1080 aiop = uma_zalloc(aiop_zone, M_WAITOK); 1081 aiop->aioproc = p; 1082 aiop->aioprocflags = 0; 1083 1084 /* 1085 * Wakeup parent process. (Parent sleeps to keep from blasting away 1086 * and creating too many daemons.) 1087 */ 1088 sema_post(&aio_newproc_sem); 1089 1090 mtx_lock(&aio_job_mtx); 1091 for (;;) { 1092 /* 1093 * Take daemon off of free queue 1094 */ 1095 if (aiop->aioprocflags & AIOP_FREE) { 1096 TAILQ_REMOVE(&aio_freeproc, aiop, list); 1097 aiop->aioprocflags &= ~AIOP_FREE; 1098 } 1099 1100 /* 1101 * Check for jobs. 1102 */ 1103 while ((aiocbe = aio_selectjob(aiop)) != NULL) { 1104 mtx_unlock(&aio_job_mtx); 1105 userp = aiocbe->userproc; 1106 1107 /* 1108 * Connect to process address space for user program. 1109 */ 1110 aio_switch_vmspace(aiocbe); 1111 1112 ki = userp->p_aioinfo; 1113 1114 /* Do the I/O function. */ 1115 switch(aiocbe->uaiocb.aio_lio_opcode) { 1116 case LIO_READ: 1117 case LIO_WRITE: 1118 aio_process_rw(aiocbe); 1119 break; 1120 case LIO_SYNC: 1121 aio_process_sync(aiocbe); 1122 break; 1123 case LIO_MLOCK: 1124 aio_process_mlock(aiocbe); 1125 break; 1126 } 1127 1128 mtx_lock(&aio_job_mtx); 1129 /* Decrement the active job count. */ 1130 ki->kaio_active_count--; 1131 mtx_unlock(&aio_job_mtx); 1132 1133 AIO_LOCK(ki); 1134 TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist); 1135 aio_bio_done_notify(userp, aiocbe, DONE_QUEUE); 1136 AIO_UNLOCK(ki); 1137 1138 mtx_lock(&aio_job_mtx); 1139 } 1140 1141 /* 1142 * Disconnect from user address space. 1143 */ 1144 if (p->p_vmspace != myvm) { 1145 mtx_unlock(&aio_job_mtx); 1146 vmspace_switch_aio(myvm); 1147 mtx_lock(&aio_job_mtx); 1148 /* 1149 * We have to restart to avoid race, we only sleep if 1150 * no job can be selected. 
1151 */ 1152 continue; 1153 } 1154 1155 mtx_assert(&aio_job_mtx, MA_OWNED); 1156 1157 TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list); 1158 aiop->aioprocflags |= AIOP_FREE; 1159 1160 /* 1161 * If daemon is inactive for a long time, allow it to exit, 1162 * thereby freeing resources. 1163 */ 1164 if (msleep(p, &aio_job_mtx, PRIBIO, "aiordy", 1165 aiod_lifetime) == EWOULDBLOCK && TAILQ_EMPTY(&aio_jobs) && 1166 (aiop->aioprocflags & AIOP_FREE) && 1167 num_aio_procs > target_aio_procs) 1168 break; 1169 } 1170 TAILQ_REMOVE(&aio_freeproc, aiop, list); 1171 num_aio_procs--; 1172 mtx_unlock(&aio_job_mtx); 1173 uma_zfree(aiop_zone, aiop); 1174 free_unr(aiod_unr, id); 1175 vmspace_free(myvm); 1176 1177 KASSERT(p->p_vmspace == myvm, 1178 ("AIOD: bad vmspace for exiting daemon")); 1179 KASSERT(myvm->vm_refcnt > 1, 1180 ("AIOD: bad vm refcnt for exiting daemon: %d", myvm->vm_refcnt)); 1181 kproc_exit(0); 1182 } 1183 1184 /* 1185 * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The 1186 * AIO daemon modifies its environment itself. 1187 */ 1188 static int 1189 aio_newproc(int *start) 1190 { 1191 int error; 1192 struct proc *p; 1193 int id; 1194 1195 id = alloc_unr(aiod_unr); 1196 error = kproc_create(aio_daemon, (void *)(intptr_t)id, &p, 1197 RFNOWAIT, 0, "aiod%d", id); 1198 if (error == 0) { 1199 /* 1200 * Wait until daemon is started. 1201 */ 1202 sema_wait(&aio_newproc_sem); 1203 mtx_lock(&aio_job_mtx); 1204 num_aio_procs++; 1205 if (start != NULL) 1206 (*start)--; 1207 mtx_unlock(&aio_job_mtx); 1208 } else { 1209 free_unr(aiod_unr, id); 1210 } 1211 return (error); 1212 } 1213 1214 /* 1215 * Try the high-performance, low-overhead physio method for eligible 1216 * VCHR devices. This method doesn't use an aio helper thread, and 1217 * thus has very low overhead. 1218 * 1219 * Assumes that the caller, aio_aqueue(), has incremented the file 1220 * structure's reference count, preventing its deallocation for the 1221 * duration of this call. 
1222 */ 1223 static int 1224 aio_qphysio(struct proc *p, struct aiocblist *aiocbe) 1225 { 1226 struct aiocb *cb; 1227 struct file *fp; 1228 struct bio *bp; 1229 struct buf *pbuf; 1230 struct vnode *vp; 1231 struct cdevsw *csw; 1232 struct cdev *dev; 1233 struct kaioinfo *ki; 1234 struct aioliojob *lj; 1235 int error, ref, unmap, poff; 1236 vm_prot_t prot; 1237 1238 cb = &aiocbe->uaiocb; 1239 fp = aiocbe->fd_file; 1240 1241 if (fp == NULL || fp->f_type != DTYPE_VNODE) 1242 return (-1); 1243 1244 vp = fp->f_vnode; 1245 if (vp->v_type != VCHR) 1246 return (-1); 1247 if (vp->v_bufobj.bo_bsize == 0) 1248 return (-1); 1249 if (cb->aio_nbytes % vp->v_bufobj.bo_bsize) 1250 return (-1); 1251 1252 ref = 0; 1253 csw = devvn_refthread(vp, &dev, &ref); 1254 if (csw == NULL) 1255 return (ENXIO); 1256 1257 if ((csw->d_flags & D_DISK) == 0) { 1258 error = -1; 1259 goto unref; 1260 } 1261 if (cb->aio_nbytes > dev->si_iosize_max) { 1262 error = -1; 1263 goto unref; 1264 } 1265 1266 ki = p->p_aioinfo; 1267 poff = (vm_offset_t)cb->aio_buf & PAGE_MASK; 1268 unmap = ((dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed); 1269 if (unmap) { 1270 if (cb->aio_nbytes > MAXPHYS) { 1271 error = -1; 1272 goto unref; 1273 } 1274 } else { 1275 if (cb->aio_nbytes > MAXPHYS - poff) { 1276 error = -1; 1277 goto unref; 1278 } 1279 if (ki->kaio_buffer_count >= ki->kaio_ballowed_count) { 1280 error = -1; 1281 goto unref; 1282 } 1283 } 1284 aiocbe->bp = bp = g_alloc_bio(); 1285 if (!unmap) { 1286 aiocbe->pbuf = pbuf = (struct buf *)getpbuf(NULL); 1287 BUF_KERNPROC(pbuf); 1288 } 1289 1290 AIO_LOCK(ki); 1291 ki->kaio_count++; 1292 if (!unmap) 1293 ki->kaio_buffer_count++; 1294 lj = aiocbe->lio; 1295 if (lj) 1296 lj->lioj_count++; 1297 TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist); 1298 TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist); 1299 aiocbe->jobstate = JOBST_JOBQBUF; 1300 cb->_aiocb_private.status = cb->aio_nbytes; 1301 AIO_UNLOCK(ki); 1302 1303 bp->bio_length = cb->aio_nbytes; 1304 bp->bio_bcount = cb->aio_nbytes; 1305 bp->bio_done = aio_physwakeup; 1306 bp->bio_data = (void *)(uintptr_t)cb->aio_buf; 1307 bp->bio_offset = cb->aio_offset; 1308 bp->bio_cmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ; 1309 bp->bio_dev = dev; 1310 bp->bio_caller1 = (void *)aiocbe; 1311 1312 prot = VM_PROT_READ; 1313 if (cb->aio_lio_opcode == LIO_READ) 1314 prot |= VM_PROT_WRITE; /* Less backwards than it looks */ 1315 if ((aiocbe->npages = vm_fault_quick_hold_pages( 1316 &curproc->p_vmspace->vm_map, 1317 (vm_offset_t)bp->bio_data, bp->bio_length, prot, aiocbe->pages, 1318 sizeof(aiocbe->pages)/sizeof(aiocbe->pages[0]))) < 0) { 1319 error = EFAULT; 1320 goto doerror; 1321 } 1322 if (!unmap) { 1323 pmap_qenter((vm_offset_t)pbuf->b_data, 1324 aiocbe->pages, aiocbe->npages); 1325 bp->bio_data = pbuf->b_data + poff; 1326 } else { 1327 bp->bio_ma = aiocbe->pages; 1328 bp->bio_ma_n = aiocbe->npages; 1329 bp->bio_ma_offset = poff; 1330 bp->bio_data = unmapped_buf; 1331 bp->bio_flags |= BIO_UNMAPPED; 1332 } 1333 1334 atomic_add_int(&num_queue_count, 1); 1335 if (!unmap) 1336 atomic_add_int(&num_buf_aio, 1); 1337 1338 /* Perform transfer. 
*/ 1339 csw->d_strategy(bp); 1340 dev_relthread(dev, ref); 1341 return (0); 1342 1343 doerror: 1344 AIO_LOCK(ki); 1345 aiocbe->jobstate = JOBST_NULL; 1346 TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist); 1347 TAILQ_REMOVE(&ki->kaio_all, aiocbe, allist); 1348 ki->kaio_count--; 1349 if (!unmap) 1350 ki->kaio_buffer_count--; 1351 if (lj) 1352 lj->lioj_count--; 1353 AIO_UNLOCK(ki); 1354 if (pbuf) { 1355 relpbuf(pbuf, NULL); 1356 aiocbe->pbuf = NULL; 1357 } 1358 g_destroy_bio(bp); 1359 aiocbe->bp = NULL; 1360 unref: 1361 dev_relthread(dev, ref); 1362 return (error); 1363 } 1364 1365 /* 1366 * Wake up aio requests that may be serviceable now. 1367 */ 1368 static void 1369 aio_swake_cb(struct socket *so, struct sockbuf *sb) 1370 { 1371 struct aiocblist *cb, *cbn; 1372 int opcode; 1373 1374 SOCKBUF_LOCK_ASSERT(sb); 1375 if (sb == &so->so_snd) 1376 opcode = LIO_WRITE; 1377 else 1378 opcode = LIO_READ; 1379 1380 sb->sb_flags &= ~SB_AIO; 1381 mtx_lock(&aio_job_mtx); 1382 TAILQ_FOREACH_SAFE(cb, &so->so_aiojobq, list, cbn) { 1383 if (opcode == cb->uaiocb.aio_lio_opcode) { 1384 if (cb->jobstate != JOBST_JOBQSOCK) 1385 panic("invalid queue value"); 1386 /* XXX 1387 * We don't have actual sockets backend yet, 1388 * so we simply move the requests to the generic 1389 * file I/O backend. 1390 */ 1391 TAILQ_REMOVE(&so->so_aiojobq, cb, list); 1392 TAILQ_INSERT_TAIL(&aio_jobs, cb, list); 1393 aio_kick_nowait(cb->userproc); 1394 } 1395 } 1396 mtx_unlock(&aio_job_mtx); 1397 } 1398 1399 static int 1400 convert_old_sigevent(struct osigevent *osig, struct sigevent *nsig) 1401 { 1402 1403 /* 1404 * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are 1405 * supported by AIO with the old sigevent structure. 1406 */ 1407 nsig->sigev_notify = osig->sigev_notify; 1408 switch (nsig->sigev_notify) { 1409 case SIGEV_NONE: 1410 break; 1411 case SIGEV_SIGNAL: 1412 nsig->sigev_signo = osig->__sigev_u.__sigev_signo; 1413 break; 1414 case SIGEV_KEVENT: 1415 nsig->sigev_notify_kqueue = 1416 osig->__sigev_u.__sigev_notify_kqueue; 1417 nsig->sigev_value.sival_ptr = osig->sigev_value.sival_ptr; 1418 break; 1419 default: 1420 return (EINVAL); 1421 } 1422 return (0); 1423 } 1424 1425 static int 1426 aiocb_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob) 1427 { 1428 struct oaiocb *ojob; 1429 int error; 1430 1431 bzero(kjob, sizeof(struct aiocb)); 1432 error = copyin(ujob, kjob, sizeof(struct oaiocb)); 1433 if (error) 1434 return (error); 1435 ojob = (struct oaiocb *)kjob; 1436 return (convert_old_sigevent(&ojob->aio_sigevent, &kjob->aio_sigevent)); 1437 } 1438 1439 static int 1440 aiocb_copyin(struct aiocb *ujob, struct aiocb *kjob) 1441 { 1442 1443 return (copyin(ujob, kjob, sizeof(struct aiocb))); 1444 } 1445 1446 static long 1447 aiocb_fetch_status(struct aiocb *ujob) 1448 { 1449 1450 return (fuword(&ujob->_aiocb_private.status)); 1451 } 1452 1453 static long 1454 aiocb_fetch_error(struct aiocb *ujob) 1455 { 1456 1457 return (fuword(&ujob->_aiocb_private.error)); 1458 } 1459 1460 static int 1461 aiocb_store_status(struct aiocb *ujob, long status) 1462 { 1463 1464 return (suword(&ujob->_aiocb_private.status, status)); 1465 } 1466 1467 static int 1468 aiocb_store_error(struct aiocb *ujob, long error) 1469 { 1470 1471 return (suword(&ujob->_aiocb_private.error, error)); 1472 } 1473 1474 static int 1475 aiocb_store_kernelinfo(struct aiocb *ujob, long jobref) 1476 { 1477 1478 return (suword(&ujob->_aiocb_private.kernelinfo, jobref)); 1479 } 1480 1481 static int 1482 aiocb_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob) 
1483 { 1484 1485 return (suword(ujobp, (long)ujob)); 1486 } 1487 1488 static struct aiocb_ops aiocb_ops = { 1489 .copyin = aiocb_copyin, 1490 .fetch_status = aiocb_fetch_status, 1491 .fetch_error = aiocb_fetch_error, 1492 .store_status = aiocb_store_status, 1493 .store_error = aiocb_store_error, 1494 .store_kernelinfo = aiocb_store_kernelinfo, 1495 .store_aiocb = aiocb_store_aiocb, 1496 }; 1497 1498 static struct aiocb_ops aiocb_ops_osigevent = { 1499 .copyin = aiocb_copyin_old_sigevent, 1500 .fetch_status = aiocb_fetch_status, 1501 .fetch_error = aiocb_fetch_error, 1502 .store_status = aiocb_store_status, 1503 .store_error = aiocb_store_error, 1504 .store_kernelinfo = aiocb_store_kernelinfo, 1505 .store_aiocb = aiocb_store_aiocb, 1506 }; 1507 1508 /* 1509 * Queue a new AIO request. Choosing either the threaded or direct physio VCHR 1510 * technique is done in this code. 1511 */ 1512 int 1513 aio_aqueue(struct thread *td, struct aiocb *job, struct aioliojob *lj, 1514 int type, struct aiocb_ops *ops) 1515 { 1516 struct proc *p = td->td_proc; 1517 cap_rights_t rights; 1518 struct file *fp; 1519 struct socket *so; 1520 struct aiocblist *aiocbe, *cb; 1521 struct kaioinfo *ki; 1522 struct kevent kev; 1523 struct sockbuf *sb; 1524 int opcode; 1525 int error; 1526 int fd, kqfd; 1527 int jid; 1528 u_short evflags; 1529 1530 if (p->p_aioinfo == NULL) 1531 aio_init_aioinfo(p); 1532 1533 ki = p->p_aioinfo; 1534 1535 ops->store_status(job, -1); 1536 ops->store_error(job, 0); 1537 ops->store_kernelinfo(job, -1); 1538 1539 if (num_queue_count >= max_queue_count || 1540 ki->kaio_count >= ki->kaio_qallowed_count) { 1541 ops->store_error(job, EAGAIN); 1542 return (EAGAIN); 1543 } 1544 1545 aiocbe = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO); 1546 knlist_init_mtx(&aiocbe->klist, AIO_MTX(ki)); 1547 1548 error = ops->copyin(job, &aiocbe->uaiocb); 1549 if (error) { 1550 ops->store_error(job, error); 1551 uma_zfree(aiocb_zone, aiocbe); 1552 return (error); 1553 } 1554 1555 /* XXX: aio_nbytes is later casted to signed types. */ 1556 if (aiocbe->uaiocb.aio_nbytes > INT_MAX) { 1557 uma_zfree(aiocb_zone, aiocbe); 1558 return (EINVAL); 1559 } 1560 1561 if (aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT && 1562 aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL && 1563 aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID && 1564 aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_NONE) { 1565 ops->store_error(job, EINVAL); 1566 uma_zfree(aiocb_zone, aiocbe); 1567 return (EINVAL); 1568 } 1569 1570 if ((aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL || 1571 aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) && 1572 !_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) { 1573 uma_zfree(aiocb_zone, aiocbe); 1574 return (EINVAL); 1575 } 1576 1577 ksiginfo_init(&aiocbe->ksi); 1578 1579 /* Save userspace address of the job info. */ 1580 aiocbe->uuaiocb = job; 1581 1582 /* Get the opcode. */ 1583 if (type != LIO_NOP) 1584 aiocbe->uaiocb.aio_lio_opcode = type; 1585 opcode = aiocbe->uaiocb.aio_lio_opcode; 1586 1587 /* 1588 * Validate the opcode and fetch the file object for the specified 1589 * file descriptor. 1590 * 1591 * XXXRW: Moved the opcode validation up here so that we don't 1592 * retrieve a file descriptor without knowing what the capabiltity 1593 * should be. 
1594 */ 1595 fd = aiocbe->uaiocb.aio_fildes; 1596 switch (opcode) { 1597 case LIO_WRITE: 1598 error = fget_write(td, fd, 1599 cap_rights_init(&rights, CAP_PWRITE), &fp); 1600 break; 1601 case LIO_READ: 1602 error = fget_read(td, fd, 1603 cap_rights_init(&rights, CAP_PREAD), &fp); 1604 break; 1605 case LIO_SYNC: 1606 error = fget(td, fd, cap_rights_init(&rights, CAP_FSYNC), &fp); 1607 break; 1608 case LIO_MLOCK: 1609 fp = NULL; 1610 break; 1611 case LIO_NOP: 1612 error = fget(td, fd, cap_rights_init(&rights), &fp); 1613 break; 1614 default: 1615 error = EINVAL; 1616 } 1617 if (error) { 1618 uma_zfree(aiocb_zone, aiocbe); 1619 ops->store_error(job, error); 1620 return (error); 1621 } 1622 1623 if (opcode == LIO_SYNC && fp->f_vnode == NULL) { 1624 error = EINVAL; 1625 goto aqueue_fail; 1626 } 1627 1628 if (opcode != LIO_SYNC && aiocbe->uaiocb.aio_offset == -1LL) { 1629 error = EINVAL; 1630 goto aqueue_fail; 1631 } 1632 1633 aiocbe->fd_file = fp; 1634 1635 mtx_lock(&aio_job_mtx); 1636 jid = jobrefid++; 1637 aiocbe->seqno = jobseqno++; 1638 mtx_unlock(&aio_job_mtx); 1639 error = ops->store_kernelinfo(job, jid); 1640 if (error) { 1641 error = EINVAL; 1642 goto aqueue_fail; 1643 } 1644 aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jid; 1645 1646 if (opcode == LIO_NOP) { 1647 fdrop(fp, td); 1648 uma_zfree(aiocb_zone, aiocbe); 1649 return (0); 1650 } 1651 1652 if (aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT) 1653 goto no_kqueue; 1654 evflags = aiocbe->uaiocb.aio_sigevent.sigev_notify_kevent_flags; 1655 if ((evflags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0) { 1656 error = EINVAL; 1657 goto aqueue_fail; 1658 } 1659 kqfd = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue; 1660 kev.ident = (uintptr_t)aiocbe->uuaiocb; 1661 kev.filter = EVFILT_AIO; 1662 kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1 | evflags; 1663 kev.data = (intptr_t)aiocbe; 1664 kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sival_ptr; 1665 error = kqfd_register(kqfd, &kev, td, 1); 1666 aqueue_fail: 1667 if (error) { 1668 if (fp) 1669 fdrop(fp, td); 1670 uma_zfree(aiocb_zone, aiocbe); 1671 ops->store_error(job, error); 1672 goto done; 1673 } 1674 no_kqueue: 1675 1676 ops->store_error(job, EINPROGRESS); 1677 aiocbe->uaiocb._aiocb_private.error = EINPROGRESS; 1678 aiocbe->userproc = p; 1679 aiocbe->cred = crhold(td->td_ucred); 1680 aiocbe->jobflags = 0; 1681 aiocbe->lio = lj; 1682 1683 if (opcode == LIO_SYNC) 1684 goto queueit; 1685 1686 if (fp && fp->f_type == DTYPE_SOCKET) { 1687 /* 1688 * Alternate queueing for socket ops: Reach down into the 1689 * descriptor to get the socket data. Then check to see if the 1690 * socket is ready to be read or written (based on the requested 1691 * operation). 1692 * 1693 * If it is not ready for io, then queue the aiocbe on the 1694 * socket, and set the flags so we get a call when sbnotify() 1695 * happens. 1696 * 1697 * Note if opcode is neither LIO_WRITE nor LIO_READ we lock 1698 * and unlock the snd sockbuf for no reason. 1699 */ 1700 so = fp->f_data; 1701 sb = (opcode == LIO_READ) ? 
&so->so_rcv : &so->so_snd; 1702 SOCKBUF_LOCK(sb); 1703 if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode == 1704 LIO_WRITE) && (!sowriteable(so)))) { 1705 sb->sb_flags |= SB_AIO; 1706 1707 mtx_lock(&aio_job_mtx); 1708 TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list); 1709 mtx_unlock(&aio_job_mtx); 1710 1711 AIO_LOCK(ki); 1712 TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist); 1713 TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist); 1714 aiocbe->jobstate = JOBST_JOBQSOCK; 1715 ki->kaio_count++; 1716 if (lj) 1717 lj->lioj_count++; 1718 AIO_UNLOCK(ki); 1719 SOCKBUF_UNLOCK(sb); 1720 atomic_add_int(&num_queue_count, 1); 1721 error = 0; 1722 goto done; 1723 } 1724 SOCKBUF_UNLOCK(sb); 1725 } 1726 1727 if ((error = aio_qphysio(p, aiocbe)) == 0) 1728 goto done; 1729 #if 0 1730 if (error > 0) { 1731 aiocbe->uaiocb._aiocb_private.error = error; 1732 ops->store_error(job, error); 1733 goto done; 1734 } 1735 #endif 1736 queueit: 1737 atomic_add_int(&num_queue_count, 1); 1738 1739 AIO_LOCK(ki); 1740 ki->kaio_count++; 1741 if (lj) 1742 lj->lioj_count++; 1743 TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist); 1744 TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist); 1745 if (opcode == LIO_SYNC) { 1746 TAILQ_FOREACH(cb, &ki->kaio_jobqueue, plist) { 1747 if (cb->fd_file == aiocbe->fd_file && 1748 cb->uaiocb.aio_lio_opcode != LIO_SYNC && 1749 cb->seqno < aiocbe->seqno) { 1750 cb->jobflags |= AIOCBLIST_CHECKSYNC; 1751 aiocbe->pending++; 1752 } 1753 } 1754 TAILQ_FOREACH(cb, &ki->kaio_bufqueue, plist) { 1755 if (cb->fd_file == aiocbe->fd_file && 1756 cb->uaiocb.aio_lio_opcode != LIO_SYNC && 1757 cb->seqno < aiocbe->seqno) { 1758 cb->jobflags |= AIOCBLIST_CHECKSYNC; 1759 aiocbe->pending++; 1760 } 1761 } 1762 if (aiocbe->pending != 0) { 1763 TAILQ_INSERT_TAIL(&ki->kaio_syncqueue, aiocbe, list); 1764 aiocbe->jobstate = JOBST_JOBQSYNC; 1765 AIO_UNLOCK(ki); 1766 goto done; 1767 } 1768 } 1769 mtx_lock(&aio_job_mtx); 1770 TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list); 1771 aiocbe->jobstate = JOBST_JOBQGLOBAL; 1772 aio_kick_nowait(p); 1773 mtx_unlock(&aio_job_mtx); 1774 AIO_UNLOCK(ki); 1775 error = 0; 1776 done: 1777 return (error); 1778 } 1779 1780 static void 1781 aio_kick_nowait(struct proc *userp) 1782 { 1783 struct kaioinfo *ki = userp->p_aioinfo; 1784 struct aioproc *aiop; 1785 1786 mtx_assert(&aio_job_mtx, MA_OWNED); 1787 if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) { 1788 TAILQ_REMOVE(&aio_freeproc, aiop, list); 1789 aiop->aioprocflags &= ~AIOP_FREE; 1790 wakeup(aiop->aioproc); 1791 } else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) && 1792 ((ki->kaio_active_count + num_aio_resv_start) < 1793 ki->kaio_maxactive_count)) { 1794 taskqueue_enqueue(taskqueue_aiod_kick, &ki->kaio_task); 1795 } 1796 } 1797 1798 static int 1799 aio_kick(struct proc *userp) 1800 { 1801 struct kaioinfo *ki = userp->p_aioinfo; 1802 struct aioproc *aiop; 1803 int error, ret = 0; 1804 1805 mtx_assert(&aio_job_mtx, MA_OWNED); 1806 retryproc: 1807 if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) { 1808 TAILQ_REMOVE(&aio_freeproc, aiop, list); 1809 aiop->aioprocflags &= ~AIOP_FREE; 1810 wakeup(aiop->aioproc); 1811 } else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) && 1812 ((ki->kaio_active_count + num_aio_resv_start) < 1813 ki->kaio_maxactive_count)) { 1814 num_aio_resv_start++; 1815 mtx_unlock(&aio_job_mtx); 1816 error = aio_newproc(&num_aio_resv_start); 1817 mtx_lock(&aio_job_mtx); 1818 if (error) { 1819 num_aio_resv_start--; 1820 goto retryproc; 1821 } 1822 } else { 1823 ret = -1; 1824 } 1825 return 
(ret); 1826 } 1827 1828 static void 1829 aio_kick_helper(void *context, int pending) 1830 { 1831 struct proc *userp = context; 1832 1833 mtx_lock(&aio_job_mtx); 1834 while (--pending >= 0) { 1835 if (aio_kick(userp)) 1836 break; 1837 } 1838 mtx_unlock(&aio_job_mtx); 1839 } 1840 1841 /* 1842 * Support the aio_return system call, as a side-effect, kernel resources are 1843 * released. 1844 */ 1845 static int 1846 kern_aio_return(struct thread *td, struct aiocb *uaiocb, struct aiocb_ops *ops) 1847 { 1848 struct proc *p = td->td_proc; 1849 struct aiocblist *cb; 1850 struct kaioinfo *ki; 1851 int status, error; 1852 1853 ki = p->p_aioinfo; 1854 if (ki == NULL) 1855 return (EINVAL); 1856 AIO_LOCK(ki); 1857 TAILQ_FOREACH(cb, &ki->kaio_done, plist) { 1858 if (cb->uuaiocb == uaiocb) 1859 break; 1860 } 1861 if (cb != NULL) { 1862 MPASS(cb->jobstate == JOBST_JOBFINISHED); 1863 status = cb->uaiocb._aiocb_private.status; 1864 error = cb->uaiocb._aiocb_private.error; 1865 td->td_retval[0] = status; 1866 if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) { 1867 td->td_ru.ru_oublock += cb->outputcharge; 1868 cb->outputcharge = 0; 1869 } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) { 1870 td->td_ru.ru_inblock += cb->inputcharge; 1871 cb->inputcharge = 0; 1872 } 1873 aio_free_entry(cb); 1874 AIO_UNLOCK(ki); 1875 ops->store_error(uaiocb, error); 1876 ops->store_status(uaiocb, status); 1877 } else { 1878 error = EINVAL; 1879 AIO_UNLOCK(ki); 1880 } 1881 return (error); 1882 } 1883 1884 int 1885 sys_aio_return(struct thread *td, struct aio_return_args *uap) 1886 { 1887 1888 return (kern_aio_return(td, uap->aiocbp, &aiocb_ops)); 1889 } 1890 1891 /* 1892 * Allow a process to wakeup when any of the I/O requests are completed. 1893 */ 1894 static int 1895 kern_aio_suspend(struct thread *td, int njoblist, struct aiocb **ujoblist, 1896 struct timespec *ts) 1897 { 1898 struct proc *p = td->td_proc; 1899 struct timeval atv; 1900 struct kaioinfo *ki; 1901 struct aiocblist *cb, *cbfirst; 1902 int error, i, timo; 1903 1904 timo = 0; 1905 if (ts) { 1906 if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000) 1907 return (EINVAL); 1908 1909 TIMESPEC_TO_TIMEVAL(&atv, ts); 1910 if (itimerfix(&atv)) 1911 return (EINVAL); 1912 timo = tvtohz(&atv); 1913 } 1914 1915 ki = p->p_aioinfo; 1916 if (ki == NULL) 1917 return (EAGAIN); 1918 1919 if (njoblist == 0) 1920 return (0); 1921 1922 AIO_LOCK(ki); 1923 for (;;) { 1924 cbfirst = NULL; 1925 error = 0; 1926 TAILQ_FOREACH(cb, &ki->kaio_all, allist) { 1927 for (i = 0; i < njoblist; i++) { 1928 if (cb->uuaiocb == ujoblist[i]) { 1929 if (cbfirst == NULL) 1930 cbfirst = cb; 1931 if (cb->jobstate == JOBST_JOBFINISHED) 1932 goto RETURN; 1933 } 1934 } 1935 } 1936 /* All tasks were finished. */ 1937 if (cbfirst == NULL) 1938 break; 1939 1940 ki->kaio_flags |= KAIO_WAKEUP; 1941 error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH, 1942 "aiospn", timo); 1943 if (error == ERESTART) 1944 error = EINTR; 1945 if (error) 1946 break; 1947 } 1948 RETURN: 1949 AIO_UNLOCK(ki); 1950 return (error); 1951 } 1952 1953 int 1954 sys_aio_suspend(struct thread *td, struct aio_suspend_args *uap) 1955 { 1956 struct timespec ts, *tsp; 1957 struct aiocb **ujoblist; 1958 int error; 1959 1960 if (uap->nent < 0 || uap->nent > AIO_LISTIO_MAX) 1961 return (EINVAL); 1962 1963 if (uap->timeout) { 1964 /* Get timespec struct. 
*/ 1965 if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0) 1966 return (error); 1967 tsp = &ts; 1968 } else 1969 tsp = NULL; 1970 1971 ujoblist = uma_zalloc(aiol_zone, M_WAITOK); 1972 error = copyin(uap->aiocbp, ujoblist, uap->nent * sizeof(ujoblist[0])); 1973 if (error == 0) 1974 error = kern_aio_suspend(td, uap->nent, ujoblist, tsp); 1975 uma_zfree(aiol_zone, ujoblist); 1976 return (error); 1977 } 1978 1979 /* 1980 * aio_cancel cancels any non-physio aio operations not currently in 1981 * progress. 1982 */ 1983 int 1984 sys_aio_cancel(struct thread *td, struct aio_cancel_args *uap) 1985 { 1986 struct proc *p = td->td_proc; 1987 struct kaioinfo *ki; 1988 struct aiocblist *cbe, *cbn; 1989 struct file *fp; 1990 struct socket *so; 1991 cap_rights_t rights; 1992 int error; 1993 int remove; 1994 int cancelled = 0; 1995 int notcancelled = 0; 1996 struct vnode *vp; 1997 1998 /* Lookup file object. */ 1999 error = fget(td, uap->fd, cap_rights_init(&rights), &fp); 2000 if (error) 2001 return (error); 2002 2003 ki = p->p_aioinfo; 2004 if (ki == NULL) 2005 goto done; 2006 2007 if (fp->f_type == DTYPE_VNODE) { 2008 vp = fp->f_vnode; 2009 if (vn_isdisk(vp, &error)) { 2010 fdrop(fp, td); 2011 td->td_retval[0] = AIO_NOTCANCELED; 2012 return (0); 2013 } 2014 } 2015 2016 AIO_LOCK(ki); 2017 TAILQ_FOREACH_SAFE(cbe, &ki->kaio_jobqueue, plist, cbn) { 2018 if ((uap->fd == cbe->uaiocb.aio_fildes) && 2019 ((uap->aiocbp == NULL) || 2020 (uap->aiocbp == cbe->uuaiocb))) { 2021 remove = 0; 2022 2023 mtx_lock(&aio_job_mtx); 2024 if (cbe->jobstate == JOBST_JOBQGLOBAL) { 2025 TAILQ_REMOVE(&aio_jobs, cbe, list); 2026 remove = 1; 2027 } else if (cbe->jobstate == JOBST_JOBQSOCK) { 2028 MPASS(fp->f_type == DTYPE_SOCKET); 2029 so = fp->f_data; 2030 TAILQ_REMOVE(&so->so_aiojobq, cbe, list); 2031 remove = 1; 2032 } else if (cbe->jobstate == JOBST_JOBQSYNC) { 2033 TAILQ_REMOVE(&ki->kaio_syncqueue, cbe, list); 2034 remove = 1; 2035 } 2036 mtx_unlock(&aio_job_mtx); 2037 2038 if (remove) { 2039 TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist); 2040 cbe->uaiocb._aiocb_private.status = -1; 2041 cbe->uaiocb._aiocb_private.error = ECANCELED; 2042 aio_bio_done_notify(p, cbe, DONE_QUEUE); 2043 cancelled++; 2044 } else { 2045 notcancelled++; 2046 } 2047 if (uap->aiocbp != NULL) 2048 break; 2049 } 2050 } 2051 AIO_UNLOCK(ki); 2052 2053 done: 2054 fdrop(fp, td); 2055 2056 if (uap->aiocbp != NULL) { 2057 if (cancelled) { 2058 td->td_retval[0] = AIO_CANCELED; 2059 return (0); 2060 } 2061 } 2062 2063 if (notcancelled) { 2064 td->td_retval[0] = AIO_NOTCANCELED; 2065 return (0); 2066 } 2067 2068 if (cancelled) { 2069 td->td_retval[0] = AIO_CANCELED; 2070 return (0); 2071 } 2072 2073 td->td_retval[0] = AIO_ALLDONE; 2074 2075 return (0); 2076 } 2077 2078 /* 2079 * aio_error is implemented in the kernel level for compatibility purposes 2080 * only. For a user mode async implementation, it would be best to do it in 2081 * a userland subroutine. 
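 * While a request is still queued or running the call reports EINPROGRESS;
 * once the job reaches JOBST_JOBFINISHED it reports the request's completion
 * error.  Requests that never made it past aio_aqueue() are handled by the
 * fallback below, which reads the status and error words stored in the
 * user-visible aiocb.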
2082 */ 2083 static int 2084 kern_aio_error(struct thread *td, struct aiocb *aiocbp, struct aiocb_ops *ops) 2085 { 2086 struct proc *p = td->td_proc; 2087 struct aiocblist *cb; 2088 struct kaioinfo *ki; 2089 int status; 2090 2091 ki = p->p_aioinfo; 2092 if (ki == NULL) { 2093 td->td_retval[0] = EINVAL; 2094 return (0); 2095 } 2096 2097 AIO_LOCK(ki); 2098 TAILQ_FOREACH(cb, &ki->kaio_all, allist) { 2099 if (cb->uuaiocb == aiocbp) { 2100 if (cb->jobstate == JOBST_JOBFINISHED) 2101 td->td_retval[0] = 2102 cb->uaiocb._aiocb_private.error; 2103 else 2104 td->td_retval[0] = EINPROGRESS; 2105 AIO_UNLOCK(ki); 2106 return (0); 2107 } 2108 } 2109 AIO_UNLOCK(ki); 2110 2111 /* 2112 * Hack for failure of aio_aqueue. 2113 */ 2114 status = ops->fetch_status(aiocbp); 2115 if (status == -1) { 2116 td->td_retval[0] = ops->fetch_error(aiocbp); 2117 return (0); 2118 } 2119 2120 td->td_retval[0] = EINVAL; 2121 return (0); 2122 } 2123 2124 int 2125 sys_aio_error(struct thread *td, struct aio_error_args *uap) 2126 { 2127 2128 return (kern_aio_error(td, uap->aiocbp, &aiocb_ops)); 2129 } 2130 2131 /* syscall - asynchronous read from a file (REALTIME) */ 2132 int 2133 sys_oaio_read(struct thread *td, struct oaio_read_args *uap) 2134 { 2135 2136 return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, 2137 &aiocb_ops_osigevent)); 2138 } 2139 2140 int 2141 sys_aio_read(struct thread *td, struct aio_read_args *uap) 2142 { 2143 2144 return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READ, &aiocb_ops)); 2145 } 2146 2147 /* syscall - asynchronous write to a file (REALTIME) */ 2148 int 2149 sys_oaio_write(struct thread *td, struct oaio_write_args *uap) 2150 { 2151 2152 return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, 2153 &aiocb_ops_osigevent)); 2154 } 2155 2156 int 2157 sys_aio_write(struct thread *td, struct aio_write_args *uap) 2158 { 2159 2160 return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops)); 2161 } 2162 2163 int 2164 sys_aio_mlock(struct thread *td, struct aio_mlock_args *uap) 2165 { 2166 2167 return (aio_aqueue(td, uap->aiocbp, NULL, LIO_MLOCK, &aiocb_ops)); 2168 } 2169 2170 static int 2171 kern_lio_listio(struct thread *td, int mode, struct aiocb * const *uacb_list, 2172 struct aiocb **acb_list, int nent, struct sigevent *sig, 2173 struct aiocb_ops *ops) 2174 { 2175 struct proc *p = td->td_proc; 2176 struct aiocb *iocb; 2177 struct kaioinfo *ki; 2178 struct aioliojob *lj; 2179 struct kevent kev; 2180 int error; 2181 int nerror; 2182 int i; 2183 2184 if ((mode != LIO_NOWAIT) && (mode != LIO_WAIT)) 2185 return (EINVAL); 2186 2187 if (nent < 0 || nent > AIO_LISTIO_MAX) 2188 return (EINVAL); 2189 2190 if (p->p_aioinfo == NULL) 2191 aio_init_aioinfo(p); 2192 2193 ki = p->p_aioinfo; 2194 2195 lj = uma_zalloc(aiolio_zone, M_WAITOK); 2196 lj->lioj_flags = 0; 2197 lj->lioj_count = 0; 2198 lj->lioj_finished_count = 0; 2199 knlist_init_mtx(&lj->klist, AIO_MTX(ki)); 2200 ksiginfo_init(&lj->lioj_ksi); 2201 2202 /* 2203 * Setup signal. 
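 * For LIO_NOWAIT three notification styles are honoured below: SIGEV_KEVENT
 * registers an EVFILT_LIO knote on the caller's kqueue, SIGEV_SIGNAL and
 * SIGEV_THREAD_ID arrange for a signal once every request in the list has
 * finished, and SIGEV_NONE asks for no notification; any other value fails
 * with EINVAL.  Illustrative userland sketch of the kqueue case (kq, cookie,
 * list and nent are placeholders, not names from this file):
 *
 *	struct sigevent sev;
 *
 *	memset(&sev, 0, sizeof(sev));
 *	sev.sigev_notify = SIGEV_KEVENT;
 *	sev.sigev_notify_kqueue = kq;
 *	sev.sigev_value.sival_ptr = cookie;
 *	(void)lio_listio(LIO_NOWAIT, list, nent, &sev);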
2204 */ 2205 if (sig && (mode == LIO_NOWAIT)) { 2206 bcopy(sig, &lj->lioj_signal, sizeof(lj->lioj_signal)); 2207 if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { 2208 /* Assume only new style KEVENT */ 2209 kev.filter = EVFILT_LIO; 2210 kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1; 2211 kev.ident = (uintptr_t)uacb_list; /* something unique */ 2212 kev.data = (intptr_t)lj; 2213 /* pass user defined sigval data */ 2214 kev.udata = lj->lioj_signal.sigev_value.sival_ptr; 2215 error = kqfd_register( 2216 lj->lioj_signal.sigev_notify_kqueue, &kev, td, 1); 2217 if (error) { 2218 uma_zfree(aiolio_zone, lj); 2219 return (error); 2220 } 2221 } else if (lj->lioj_signal.sigev_notify == SIGEV_NONE) { 2222 ; 2223 } else if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || 2224 lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID) { 2225 if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) { 2226 uma_zfree(aiolio_zone, lj); 2227 return EINVAL; 2228 } 2229 lj->lioj_flags |= LIOJ_SIGNAL; 2230 } else { 2231 uma_zfree(aiolio_zone, lj); 2232 return EINVAL; 2233 } 2234 } 2235 2236 AIO_LOCK(ki); 2237 TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list); 2238 /* 2239 * Add extra aiocb count to avoid the lio to be freed 2240 * by other threads doing aio_waitcomplete or aio_return, 2241 * and prevent event from being sent until we have queued 2242 * all tasks. 2243 */ 2244 lj->lioj_count = 1; 2245 AIO_UNLOCK(ki); 2246 2247 /* 2248 * Get pointers to the list of I/O requests. 2249 */ 2250 nerror = 0; 2251 for (i = 0; i < nent; i++) { 2252 iocb = acb_list[i]; 2253 if (iocb != NULL) { 2254 error = aio_aqueue(td, iocb, lj, LIO_NOP, ops); 2255 if (error != 0) 2256 nerror++; 2257 } 2258 } 2259 2260 error = 0; 2261 AIO_LOCK(ki); 2262 if (mode == LIO_WAIT) { 2263 while (lj->lioj_count - 1 != lj->lioj_finished_count) { 2264 ki->kaio_flags |= KAIO_WAKEUP; 2265 error = msleep(&p->p_aioinfo, AIO_MTX(ki), 2266 PRIBIO | PCATCH, "aiospn", 0); 2267 if (error == ERESTART) 2268 error = EINTR; 2269 if (error) 2270 break; 2271 } 2272 } else { 2273 if (lj->lioj_count - 1 == lj->lioj_finished_count) { 2274 if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { 2275 lj->lioj_flags |= LIOJ_KEVENT_POSTED; 2276 KNOTE_LOCKED(&lj->klist, 1); 2277 } 2278 if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) 2279 == LIOJ_SIGNAL 2280 && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || 2281 lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) { 2282 aio_sendsig(p, &lj->lioj_signal, 2283 &lj->lioj_ksi); 2284 lj->lioj_flags |= LIOJ_SIGNAL_POSTED; 2285 } 2286 } 2287 } 2288 lj->lioj_count--; 2289 if (lj->lioj_count == 0) { 2290 TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); 2291 knlist_delete(&lj->klist, curthread, 1); 2292 PROC_LOCK(p); 2293 sigqueue_take(&lj->lioj_ksi); 2294 PROC_UNLOCK(p); 2295 AIO_UNLOCK(ki); 2296 uma_zfree(aiolio_zone, lj); 2297 } else 2298 AIO_UNLOCK(ki); 2299 2300 if (nerror) 2301 return (EIO); 2302 return (error); 2303 } 2304 2305 /* syscall - list directed I/O (REALTIME) */ 2306 int 2307 sys_olio_listio(struct thread *td, struct olio_listio_args *uap) 2308 { 2309 struct aiocb **acb_list; 2310 struct sigevent *sigp, sig; 2311 struct osigevent osig; 2312 int error, nent; 2313 2314 if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) 2315 return (EINVAL); 2316 2317 nent = uap->nent; 2318 if (nent < 0 || nent > AIO_LISTIO_MAX) 2319 return (EINVAL); 2320 2321 if (uap->sig && (uap->mode == LIO_NOWAIT)) { 2322 error = copyin(uap->sig, &osig, sizeof(osig)); 2323 if (error) 2324 return (error); 2325 error = convert_old_sigevent(&osig, 
&sig); 2326 if (error) 2327 return (error); 2328 sigp = &sig; 2329 } else 2330 sigp = NULL; 2331 2332 acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); 2333 error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0])); 2334 if (error == 0) 2335 error = kern_lio_listio(td, uap->mode, 2336 (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, 2337 &aiocb_ops_osigevent); 2338 free(acb_list, M_LIO); 2339 return (error); 2340 } 2341 2342 /* syscall - list directed I/O (REALTIME) */ 2343 int 2344 sys_lio_listio(struct thread *td, struct lio_listio_args *uap) 2345 { 2346 struct aiocb **acb_list; 2347 struct sigevent *sigp, sig; 2348 int error, nent; 2349 2350 if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) 2351 return (EINVAL); 2352 2353 nent = uap->nent; 2354 if (nent < 0 || nent > AIO_LISTIO_MAX) 2355 return (EINVAL); 2356 2357 if (uap->sig && (uap->mode == LIO_NOWAIT)) { 2358 error = copyin(uap->sig, &sig, sizeof(sig)); 2359 if (error) 2360 return (error); 2361 sigp = &sig; 2362 } else 2363 sigp = NULL; 2364 2365 acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); 2366 error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0])); 2367 if (error == 0) 2368 error = kern_lio_listio(td, uap->mode, uap->acb_list, acb_list, 2369 nent, sigp, &aiocb_ops); 2370 free(acb_list, M_LIO); 2371 return (error); 2372 } 2373 2374 static void 2375 aio_physwakeup(struct bio *bp) 2376 { 2377 struct aiocblist *aiocbe = (struct aiocblist *)bp->bio_caller1; 2378 struct proc *userp; 2379 struct kaioinfo *ki; 2380 int nblks; 2381 2382 /* Release mapping into kernel space. */ 2383 if (aiocbe->pbuf) { 2384 pmap_qremove((vm_offset_t)aiocbe->pbuf->b_data, aiocbe->npages); 2385 relpbuf(aiocbe->pbuf, NULL); 2386 aiocbe->pbuf = NULL; 2387 atomic_subtract_int(&num_buf_aio, 1); 2388 } 2389 vm_page_unhold_pages(aiocbe->pages, aiocbe->npages); 2390 2391 bp = aiocbe->bp; 2392 aiocbe->bp = NULL; 2393 userp = aiocbe->userproc; 2394 ki = userp->p_aioinfo; 2395 AIO_LOCK(ki); 2396 aiocbe->uaiocb._aiocb_private.status -= bp->bio_resid; 2397 aiocbe->uaiocb._aiocb_private.error = 0; 2398 if (bp->bio_flags & BIO_ERROR) 2399 aiocbe->uaiocb._aiocb_private.error = bp->bio_error; 2400 nblks = btodb(aiocbe->uaiocb.aio_nbytes); 2401 if (aiocbe->uaiocb.aio_lio_opcode == LIO_WRITE) 2402 aiocbe->outputcharge += nblks; 2403 else 2404 aiocbe->inputcharge += nblks; 2405 TAILQ_REMOVE(&userp->p_aioinfo->kaio_bufqueue, aiocbe, plist); 2406 ki->kaio_buffer_count--; 2407 aio_bio_done_notify(userp, aiocbe, DONE_BUF); 2408 AIO_UNLOCK(ki); 2409 2410 g_destroy_bio(bp); 2411 } 2412 2413 /* syscall - wait for the next completion of an aio request */ 2414 static int 2415 kern_aio_waitcomplete(struct thread *td, struct aiocb **aiocbp, 2416 struct timespec *ts, struct aiocb_ops *ops) 2417 { 2418 struct proc *p = td->td_proc; 2419 struct timeval atv; 2420 struct kaioinfo *ki; 2421 struct aiocblist *cb; 2422 struct aiocb *uuaiocb; 2423 int error, status, timo; 2424 2425 ops->store_aiocb(aiocbp, NULL); 2426 2427 if (ts == NULL) { 2428 timo = 0; 2429 } else if (ts->tv_sec == 0 && ts->tv_nsec == 0) { 2430 timo = -1; 2431 } else { 2432 if ((ts->tv_nsec < 0) || (ts->tv_nsec >= 1000000000)) 2433 return (EINVAL); 2434 2435 TIMESPEC_TO_TIMEVAL(&atv, ts); 2436 if (itimerfix(&atv)) 2437 return (EINVAL); 2438 timo = tvtohz(&atv); 2439 } 2440 2441 if (p->p_aioinfo == NULL) 2442 aio_init_aioinfo(p); 2443 ki = p->p_aioinfo; 2444 2445 error = 0; 2446 cb = NULL; 2447 AIO_LOCK(ki); 2448 while ((cb = 
TAILQ_FIRST(&ki->kaio_done)) == NULL) { 2449 if (timo == -1) { 2450 error = EWOULDBLOCK; 2451 break; 2452 } 2453 ki->kaio_flags |= KAIO_WAKEUP; 2454 error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH, 2455 "aiowc", timo); 2456 if (timo && error == ERESTART) 2457 error = EINTR; 2458 if (error) 2459 break; 2460 } 2461 2462 if (cb != NULL) { 2463 MPASS(cb->jobstate == JOBST_JOBFINISHED); 2464 uuaiocb = cb->uuaiocb; 2465 status = cb->uaiocb._aiocb_private.status; 2466 error = cb->uaiocb._aiocb_private.error; 2467 td->td_retval[0] = status; 2468 if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) { 2469 td->td_ru.ru_oublock += cb->outputcharge; 2470 cb->outputcharge = 0; 2471 } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) { 2472 td->td_ru.ru_inblock += cb->inputcharge; 2473 cb->inputcharge = 0; 2474 } 2475 aio_free_entry(cb); 2476 AIO_UNLOCK(ki); 2477 ops->store_aiocb(aiocbp, uuaiocb); 2478 ops->store_error(uuaiocb, error); 2479 ops->store_status(uuaiocb, status); 2480 } else 2481 AIO_UNLOCK(ki); 2482 2483 return (error); 2484 } 2485 2486 int 2487 sys_aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap) 2488 { 2489 struct timespec ts, *tsp; 2490 int error; 2491 2492 if (uap->timeout) { 2493 /* Get timespec struct. */ 2494 error = copyin(uap->timeout, &ts, sizeof(ts)); 2495 if (error) 2496 return (error); 2497 tsp = &ts; 2498 } else 2499 tsp = NULL; 2500 2501 return (kern_aio_waitcomplete(td, uap->aiocbp, tsp, &aiocb_ops)); 2502 } 2503 2504 static int 2505 kern_aio_fsync(struct thread *td, int op, struct aiocb *aiocbp, 2506 struct aiocb_ops *ops) 2507 { 2508 struct proc *p = td->td_proc; 2509 struct kaioinfo *ki; 2510 2511 if (op != O_SYNC) /* XXX lack of O_DSYNC */ 2512 return (EINVAL); 2513 ki = p->p_aioinfo; 2514 if (ki == NULL) 2515 aio_init_aioinfo(p); 2516 return (aio_aqueue(td, aiocbp, NULL, LIO_SYNC, ops)); 2517 } 2518 2519 int 2520 sys_aio_fsync(struct thread *td, struct aio_fsync_args *uap) 2521 { 2522 2523 return (kern_aio_fsync(td, uap->op, uap->aiocbp, &aiocb_ops)); 2524 } 2525 2526 /* kqueue attach function */ 2527 static int 2528 filt_aioattach(struct knote *kn) 2529 { 2530 struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata; 2531 2532 /* 2533 * The aiocbe pointer must be validated before using it, so 2534 * registration is restricted to the kernel; the user cannot 2535 * set EV_FLAG1. 
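 * The knote is then hooked onto the job's klist, and filt_aio() will not
 * report it ready until the job has reached JOBST_JOBFINISHED.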
2536 */ 2537 if ((kn->kn_flags & EV_FLAG1) == 0) 2538 return (EPERM); 2539 kn->kn_ptr.p_aio = aiocbe; 2540 kn->kn_flags &= ~EV_FLAG1; 2541 2542 knlist_add(&aiocbe->klist, kn, 0); 2543 2544 return (0); 2545 } 2546 2547 /* kqueue detach function */ 2548 static void 2549 filt_aiodetach(struct knote *kn) 2550 { 2551 struct knlist *knl; 2552 2553 knl = &kn->kn_ptr.p_aio->klist; 2554 knl->kl_lock(knl->kl_lockarg); 2555 if (!knlist_empty(knl)) 2556 knlist_remove(knl, kn, 1); 2557 knl->kl_unlock(knl->kl_lockarg); 2558 } 2559 2560 /* kqueue filter function */ 2561 /*ARGSUSED*/ 2562 static int 2563 filt_aio(struct knote *kn, long hint) 2564 { 2565 struct aiocblist *aiocbe = kn->kn_ptr.p_aio; 2566 2567 kn->kn_data = aiocbe->uaiocb._aiocb_private.error; 2568 if (aiocbe->jobstate != JOBST_JOBFINISHED) 2569 return (0); 2570 kn->kn_flags |= EV_EOF; 2571 return (1); 2572 } 2573 2574 /* kqueue attach function */ 2575 static int 2576 filt_lioattach(struct knote *kn) 2577 { 2578 struct aioliojob * lj = (struct aioliojob *)kn->kn_sdata; 2579 2580 /* 2581 * The aioliojob pointer must be validated before using it, so 2582 * registration is restricted to the kernel; the user cannot 2583 * set EV_FLAG1. 2584 */ 2585 if ((kn->kn_flags & EV_FLAG1) == 0) 2586 return (EPERM); 2587 kn->kn_ptr.p_lio = lj; 2588 kn->kn_flags &= ~EV_FLAG1; 2589 2590 knlist_add(&lj->klist, kn, 0); 2591 2592 return (0); 2593 } 2594 2595 /* kqueue detach function */ 2596 static void 2597 filt_liodetach(struct knote *kn) 2598 { 2599 struct knlist *knl; 2600 2601 knl = &kn->kn_ptr.p_lio->klist; 2602 knl->kl_lock(knl->kl_lockarg); 2603 if (!knlist_empty(knl)) 2604 knlist_remove(knl, kn, 1); 2605 knl->kl_unlock(knl->kl_lockarg); 2606 } 2607 2608 /* kqueue filter function */ 2609 /*ARGSUSED*/ 2610 static int 2611 filt_lio(struct knote *kn, long hint) 2612 { 2613 struct aioliojob * lj = kn->kn_ptr.p_lio; 2614 2615 return (lj->lioj_flags & LIOJ_KEVENT_POSTED); 2616 } 2617 2618 #ifdef COMPAT_FREEBSD32 2619 2620 struct __aiocb_private32 { 2621 int32_t status; 2622 int32_t error; 2623 uint32_t kernelinfo; 2624 }; 2625 2626 typedef struct oaiocb32 { 2627 int aio_fildes; /* File descriptor */ 2628 uint64_t aio_offset __packed; /* File offset for I/O */ 2629 uint32_t aio_buf; /* I/O buffer in process space */ 2630 uint32_t aio_nbytes; /* Number of bytes for I/O */ 2631 struct osigevent32 aio_sigevent; /* Signal to deliver */ 2632 int aio_lio_opcode; /* LIO opcode */ 2633 int aio_reqprio; /* Request priority -- ignored */ 2634 struct __aiocb_private32 _aiocb_private; 2635 } oaiocb32_t; 2636 2637 typedef struct aiocb32 { 2638 int32_t aio_fildes; /* File descriptor */ 2639 uint64_t aio_offset __packed; /* File offset for I/O */ 2640 uint32_t aio_buf; /* I/O buffer in process space */ 2641 uint32_t aio_nbytes; /* Number of bytes for I/O */ 2642 int __spare__[2]; 2643 uint32_t __spare2__; 2644 int aio_lio_opcode; /* LIO opcode */ 2645 int aio_reqprio; /* Request priority -- ignored */ 2646 struct __aiocb_private32 _aiocb_private; 2647 struct sigevent32 aio_sigevent; /* Signal to deliver */ 2648 } aiocb32_t; 2649 2650 static int 2651 convert_old_sigevent32(struct osigevent32 *osig, struct sigevent *nsig) 2652 { 2653 2654 /* 2655 * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are 2656 * supported by AIO with the old sigevent structure. 
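 * Anything else, including SIGEV_THREAD_ID, is rejected with EINVAL below.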
2657 */ 2658 CP(*osig, *nsig, sigev_notify); 2659 switch (nsig->sigev_notify) { 2660 case SIGEV_NONE: 2661 break; 2662 case SIGEV_SIGNAL: 2663 nsig->sigev_signo = osig->__sigev_u.__sigev_signo; 2664 break; 2665 case SIGEV_KEVENT: 2666 nsig->sigev_notify_kqueue = 2667 osig->__sigev_u.__sigev_notify_kqueue; 2668 PTRIN_CP(*osig, *nsig, sigev_value.sival_ptr); 2669 break; 2670 default: 2671 return (EINVAL); 2672 } 2673 return (0); 2674 } 2675 2676 static int 2677 aiocb32_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob) 2678 { 2679 struct oaiocb32 job32; 2680 int error; 2681 2682 bzero(kjob, sizeof(struct aiocb)); 2683 error = copyin(ujob, &job32, sizeof(job32)); 2684 if (error) 2685 return (error); 2686 2687 CP(job32, *kjob, aio_fildes); 2688 CP(job32, *kjob, aio_offset); 2689 PTRIN_CP(job32, *kjob, aio_buf); 2690 CP(job32, *kjob, aio_nbytes); 2691 CP(job32, *kjob, aio_lio_opcode); 2692 CP(job32, *kjob, aio_reqprio); 2693 CP(job32, *kjob, _aiocb_private.status); 2694 CP(job32, *kjob, _aiocb_private.error); 2695 PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo); 2696 return (convert_old_sigevent32(&job32.aio_sigevent, 2697 &kjob->aio_sigevent)); 2698 } 2699 2700 static int 2701 aiocb32_copyin(struct aiocb *ujob, struct aiocb *kjob) 2702 { 2703 struct aiocb32 job32; 2704 int error; 2705 2706 error = copyin(ujob, &job32, sizeof(job32)); 2707 if (error) 2708 return (error); 2709 CP(job32, *kjob, aio_fildes); 2710 CP(job32, *kjob, aio_offset); 2711 PTRIN_CP(job32, *kjob, aio_buf); 2712 CP(job32, *kjob, aio_nbytes); 2713 CP(job32, *kjob, aio_lio_opcode); 2714 CP(job32, *kjob, aio_reqprio); 2715 CP(job32, *kjob, _aiocb_private.status); 2716 CP(job32, *kjob, _aiocb_private.error); 2717 PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo); 2718 return (convert_sigevent32(&job32.aio_sigevent, &kjob->aio_sigevent)); 2719 } 2720 2721 static long 2722 aiocb32_fetch_status(struct aiocb *ujob) 2723 { 2724 struct aiocb32 *ujob32; 2725 2726 ujob32 = (struct aiocb32 *)ujob; 2727 return (fuword32(&ujob32->_aiocb_private.status)); 2728 } 2729 2730 static long 2731 aiocb32_fetch_error(struct aiocb *ujob) 2732 { 2733 struct aiocb32 *ujob32; 2734 2735 ujob32 = (struct aiocb32 *)ujob; 2736 return (fuword32(&ujob32->_aiocb_private.error)); 2737 } 2738 2739 static int 2740 aiocb32_store_status(struct aiocb *ujob, long status) 2741 { 2742 struct aiocb32 *ujob32; 2743 2744 ujob32 = (struct aiocb32 *)ujob; 2745 return (suword32(&ujob32->_aiocb_private.status, status)); 2746 } 2747 2748 static int 2749 aiocb32_store_error(struct aiocb *ujob, long error) 2750 { 2751 struct aiocb32 *ujob32; 2752 2753 ujob32 = (struct aiocb32 *)ujob; 2754 return (suword32(&ujob32->_aiocb_private.error, error)); 2755 } 2756 2757 static int 2758 aiocb32_store_kernelinfo(struct aiocb *ujob, long jobref) 2759 { 2760 struct aiocb32 *ujob32; 2761 2762 ujob32 = (struct aiocb32 *)ujob; 2763 return (suword32(&ujob32->_aiocb_private.kernelinfo, jobref)); 2764 } 2765 2766 static int 2767 aiocb32_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob) 2768 { 2769 2770 return (suword32(ujobp, (long)ujob)); 2771 } 2772 2773 static struct aiocb_ops aiocb32_ops = { 2774 .copyin = aiocb32_copyin, 2775 .fetch_status = aiocb32_fetch_status, 2776 .fetch_error = aiocb32_fetch_error, 2777 .store_status = aiocb32_store_status, 2778 .store_error = aiocb32_store_error, 2779 .store_kernelinfo = aiocb32_store_kernelinfo, 2780 .store_aiocb = aiocb32_store_aiocb, 2781 }; 2782 2783 static struct aiocb_ops aiocb32_ops_osigevent = { 2784 .copyin = 
aiocb32_copyin_old_sigevent, 2785 .fetch_status = aiocb32_fetch_status, 2786 .fetch_error = aiocb32_fetch_error, 2787 .store_status = aiocb32_store_status, 2788 .store_error = aiocb32_store_error, 2789 .store_kernelinfo = aiocb32_store_kernelinfo, 2790 .store_aiocb = aiocb32_store_aiocb, 2791 }; 2792 2793 int 2794 freebsd32_aio_return(struct thread *td, struct freebsd32_aio_return_args *uap) 2795 { 2796 2797 return (kern_aio_return(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops)); 2798 } 2799 2800 int 2801 freebsd32_aio_suspend(struct thread *td, struct freebsd32_aio_suspend_args *uap) 2802 { 2803 struct timespec32 ts32; 2804 struct timespec ts, *tsp; 2805 struct aiocb **ujoblist; 2806 uint32_t *ujoblist32; 2807 int error, i; 2808 2809 if (uap->nent < 0 || uap->nent > AIO_LISTIO_MAX) 2810 return (EINVAL); 2811 2812 if (uap->timeout) { 2813 /* Get timespec struct. */ 2814 if ((error = copyin(uap->timeout, &ts32, sizeof(ts32))) != 0) 2815 return (error); 2816 CP(ts32, ts, tv_sec); 2817 CP(ts32, ts, tv_nsec); 2818 tsp = &ts; 2819 } else 2820 tsp = NULL; 2821 2822 ujoblist = uma_zalloc(aiol_zone, M_WAITOK); 2823 ujoblist32 = (uint32_t *)ujoblist; 2824 error = copyin(uap->aiocbp, ujoblist32, uap->nent * 2825 sizeof(ujoblist32[0])); 2826 if (error == 0) { 2827 for (i = uap->nent - 1; i >= 0; i--) 2828 ujoblist[i] = PTRIN(ujoblist32[i]); 2829 2830 error = kern_aio_suspend(td, uap->nent, ujoblist, tsp); 2831 } 2832 uma_zfree(aiol_zone, ujoblist); 2833 return (error); 2834 } 2835 2836 int 2837 freebsd32_aio_cancel(struct thread *td, struct freebsd32_aio_cancel_args *uap) 2838 { 2839 2840 return (sys_aio_cancel(td, (struct aio_cancel_args *)uap)); 2841 } 2842 2843 int 2844 freebsd32_aio_error(struct thread *td, struct freebsd32_aio_error_args *uap) 2845 { 2846 2847 return (kern_aio_error(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops)); 2848 } 2849 2850 int 2851 freebsd32_oaio_read(struct thread *td, struct freebsd32_oaio_read_args *uap) 2852 { 2853 2854 return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, 2855 &aiocb32_ops_osigevent)); 2856 } 2857 2858 int 2859 freebsd32_aio_read(struct thread *td, struct freebsd32_aio_read_args *uap) 2860 { 2861 2862 return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, 2863 &aiocb32_ops)); 2864 } 2865 2866 int 2867 freebsd32_oaio_write(struct thread *td, struct freebsd32_oaio_write_args *uap) 2868 { 2869 2870 return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, 2871 &aiocb32_ops_osigevent)); 2872 } 2873 2874 int 2875 freebsd32_aio_write(struct thread *td, struct freebsd32_aio_write_args *uap) 2876 { 2877 2878 return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, 2879 &aiocb32_ops)); 2880 } 2881 2882 int 2883 freebsd32_aio_mlock(struct thread *td, struct freebsd32_aio_mlock_args *uap) 2884 { 2885 2886 return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_MLOCK, 2887 &aiocb32_ops)); 2888 } 2889 2890 int 2891 freebsd32_aio_waitcomplete(struct thread *td, 2892 struct freebsd32_aio_waitcomplete_args *uap) 2893 { 2894 struct timespec32 ts32; 2895 struct timespec ts, *tsp; 2896 int error; 2897 2898 if (uap->timeout) { 2899 /* Get timespec struct.
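 * The 32-bit timespec is widened field by field before being handed to
 * kern_aio_waitcomplete(); a zero-valued timeout polls (EWOULDBLOCK when
 * nothing has completed yet) and a NULL timeout waits indefinitely.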
*/ 2900 error = copyin(uap->timeout, &ts32, sizeof(ts32)); 2901 if (error) 2902 return (error); 2903 CP(ts32, ts, tv_sec); 2904 CP(ts32, ts, tv_nsec); 2905 tsp = &ts; 2906 } else 2907 tsp = NULL; 2908 2909 return (kern_aio_waitcomplete(td, (struct aiocb **)uap->aiocbp, tsp, 2910 &aiocb32_ops)); 2911 } 2912 2913 int 2914 freebsd32_aio_fsync(struct thread *td, struct freebsd32_aio_fsync_args *uap) 2915 { 2916 2917 return (kern_aio_fsync(td, uap->op, (struct aiocb *)uap->aiocbp, 2918 &aiocb32_ops)); 2919 } 2920 2921 int 2922 freebsd32_olio_listio(struct thread *td, struct freebsd32_olio_listio_args *uap) 2923 { 2924 struct aiocb **acb_list; 2925 struct sigevent *sigp, sig; 2926 struct osigevent32 osig; 2927 uint32_t *acb_list32; 2928 int error, i, nent; 2929 2930 if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) 2931 return (EINVAL); 2932 2933 nent = uap->nent; 2934 if (nent < 0 || nent > AIO_LISTIO_MAX) 2935 return (EINVAL); 2936 2937 if (uap->sig && (uap->mode == LIO_NOWAIT)) { 2938 error = copyin(uap->sig, &osig, sizeof(osig)); 2939 if (error) 2940 return (error); 2941 error = convert_old_sigevent32(&osig, &sig); 2942 if (error) 2943 return (error); 2944 sigp = &sig; 2945 } else 2946 sigp = NULL; 2947 2948 acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK); 2949 error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t)); 2950 if (error) { 2951 free(acb_list32, M_LIO); 2952 return (error); 2953 } 2954 acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); 2955 for (i = 0; i < nent; i++) 2956 acb_list[i] = PTRIN(acb_list32[i]); 2957 free(acb_list32, M_LIO); 2958 2959 error = kern_lio_listio(td, uap->mode, 2960 (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, 2961 &aiocb32_ops_osigevent); 2962 free(acb_list, M_LIO); 2963 return (error); 2964 } 2965 2966 int 2967 freebsd32_lio_listio(struct thread *td, struct freebsd32_lio_listio_args *uap) 2968 { 2969 struct aiocb **acb_list; 2970 struct sigevent *sigp, sig; 2971 struct sigevent32 sig32; 2972 uint32_t *acb_list32; 2973 int error, i, nent; 2974 2975 if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) 2976 return (EINVAL); 2977 2978 nent = uap->nent; 2979 if (nent < 0 || nent > AIO_LISTIO_MAX) 2980 return (EINVAL); 2981 2982 if (uap->sig && (uap->mode == LIO_NOWAIT)) { 2983 error = copyin(uap->sig, &sig32, sizeof(sig32)); 2984 if (error) 2985 return (error); 2986 error = convert_sigevent32(&sig32, &sig); 2987 if (error) 2988 return (error); 2989 sigp = &sig; 2990 } else 2991 sigp = NULL; 2992 2993 acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK); 2994 error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t)); 2995 if (error) { 2996 free(acb_list32, M_LIO); 2997 return (error); 2998 } 2999 acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); 3000 for (i = 0; i < nent; i++) 3001 acb_list[i] = PTRIN(acb_list32[i]); 3002 free(acb_list32, M_LIO); 3003 3004 error = kern_lio_listio(td, uap->mode, 3005 (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, 3006 &aiocb32_ops); 3007 free(acb_list, M_LIO); 3008 return (error); 3009 } 3010 3011 #endif 3012
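/*
 * For reference, a minimal userland sketch of the POSIX AIO interface this
 * file implements: queue one read, wait for it with aio_suspend(), poll its
 * state with aio_error() and reap it with aio_return().  Illustrative only,
 * not part of the kernel build; the file name and buffer size are arbitrary.
 *
 *	#include <aio.h>
 *	#include <errno.h>
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		static char buf[512];
 *		struct aiocb cb;
 *		const struct aiocb *list[1];
 *		ssize_t n;
 *		int fd;
 *
 *		if ((fd = open("/etc/motd", O_RDONLY)) < 0)
 *			return (1);
 *		memset(&cb, 0, sizeof(cb));
 *		cb.aio_fildes = fd;
 *		cb.aio_buf = buf;
 *		cb.aio_nbytes = sizeof(buf);
 *		cb.aio_offset = 0;
 *		if (aio_read(&cb) != 0)		// queue the request
 *			return (1);
 *		list[0] = &cb;
 *		while (aio_error(&cb) == EINPROGRESS)	// still in flight?
 *			(void)aio_suspend(list, 1, NULL);
 *		n = aio_return(&cb);		// also frees kernel resources
 *		printf("read %zd bytes\n", n);
 *		close(fd);
 *		return (0);
 *	}
 */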