/*-
 * Copyright (c) 1997 John S. Dyson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. John S. Dyson's name may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * DISCLAIMER: This code isn't warranted to do anything useful.  Anything
 * bad that happens because of using this software isn't the responsibility
 * of the author.  This software is distributed AS-IS.
 */

/*
 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_compat.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/eventhandler.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/kthread.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/unistd.h>
#include <sys/posix4.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/protosw.h>
#include <sys/sema.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syscall.h>
#include <sys/sysent.h>
#include <sys/sysctl.h>
#include <sys/sx.h>
#include <sys/taskqueue.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <sys/event.h>
#include <sys/mount.h>

#include <machine/atomic.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/uma.h>
#include <sys/aio.h>

#include "opt_vfs_aio.h"

/*
 * Counter for allocating reference ids to new jobs.  Wrapped to 1 on
 * overflow. (XXX will be removed soon.)
 */
static u_long jobrefid;

/*
 * Counter for aio_fsync.
 */
static uint64_t jobseqno;
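/*
 * Summary of the job states below, as used by the queue manipulation in
 * this file: a queued job waits on the global aio_jobs list
 * (JOBST_JOBQGLOBAL), on a socket's so_aiojobq (JOBST_JOBQSOCK), on the
 * per-process kaio_syncqueue (JOBST_JOBQSYNC), or as an in-flight physio
 * buffer (JOBST_JOBQBUF); it is marked JOBST_JOBRUNNING while an aio
 * daemon services it and ends up on the per-process kaio_done queue as
 * JOBST_JOBFINISHED.
 */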
#define JOBST_NULL		0
#define JOBST_JOBQSOCK		1
#define JOBST_JOBQGLOBAL	2
#define JOBST_JOBRUNNING	3
#define JOBST_JOBFINISHED	4
#define JOBST_JOBQBUF		5
#define JOBST_JOBQSYNC		6

#ifndef MAX_AIO_PER_PROC
#define MAX_AIO_PER_PROC	32
#endif

#ifndef MAX_AIO_QUEUE_PER_PROC
#define MAX_AIO_QUEUE_PER_PROC	256 /* Bigger than AIO_LISTIO_MAX */
#endif

#ifndef MAX_AIO_PROCS
#define MAX_AIO_PROCS		32
#endif

#ifndef MAX_AIO_QUEUE
#define MAX_AIO_QUEUE		1024 /* Bigger than AIO_LISTIO_MAX */
#endif

#ifndef TARGET_AIO_PROCS
#define TARGET_AIO_PROCS	4
#endif

#ifndef MAX_BUF_AIO
#define MAX_BUF_AIO		16
#endif

#ifndef AIOD_TIMEOUT_DEFAULT
#define AIOD_TIMEOUT_DEFAULT	(10 * hz)
#endif

#ifndef AIOD_LIFETIME_DEFAULT
#define AIOD_LIFETIME_DEFAULT	(30 * hz)
#endif

FEATURE(aio, "Asynchronous I/O");

static MALLOC_DEFINE(M_LIO, "lio", "listio aio control block list");

static SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "Async IO management");

static int max_aio_procs = MAX_AIO_PROCS;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs,
	CTLFLAG_RW, &max_aio_procs, 0,
	"Maximum number of kernel threads to use for handling async IO ");

static int num_aio_procs = 0;
SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs,
	CTLFLAG_RD, &num_aio_procs, 0,
	"Number of presently active kernel threads for async IO");

/*
 * The code will adjust the actual number of AIO processes towards this
 * number when it gets a chance.
 */
static int target_aio_procs = TARGET_AIO_PROCS;
SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs,
	0, "Preferred number of ready kernel threads for async IO");

static int max_queue_count = MAX_AIO_QUEUE;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0,
	"Maximum number of aio requests to queue, globally");

static int num_queue_count = 0;
SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0,
	"Number of queued aio requests");

static int num_buf_aio = 0;
SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0,
	"Number of aio requests presently handled by the buf subsystem");

/* Number of async I/O threads in the process of being started */
/* XXX This should be local to aio_aqueue() */
static int num_aio_resv_start = 0;

static int aiod_timeout;
SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout, CTLFLAG_RW, &aiod_timeout, 0,
	"Timeout value for synchronous aio operations");

static int aiod_lifetime;
SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0,
	"Maximum lifetime for idle aiod");

static int unloadable = 0;
SYSCTL_INT(_vfs_aio, OID_AUTO, unloadable, CTLFLAG_RW, &unloadable, 0,
	"Allow unload of aio (not recommended)");

static int max_aio_per_proc = MAX_AIO_PER_PROC;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc,
	0, "Maximum active aio requests per process (stored in the process)");

static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW,
	&max_aio_queue_per_proc, 0,
	"Maximum queued aio requests per process (stored in the process)");

static int max_buf_aio = MAX_BUF_AIO;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0,
	"Maximum buf aio requests per process (stored in the process)");
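/*
 * The oaiocb structure below is the user-visible aiocb layout that predates
 * the current struct sigevent.  Requests submitted through the compatibility
 * entry points (oaio_read(), oaio_write(), olio_listio()) are copied in with
 * aiocb_copyin_old_sigevent() and their osigevent is translated by
 * convert_old_sigevent() further down in this file.
 */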
typedef struct oaiocb {
	int	aio_fildes;		/* File descriptor */
	off_t	aio_offset;		/* File offset for I/O */
	volatile void *aio_buf;		/* I/O buffer in process space */
	size_t	aio_nbytes;		/* Number of bytes for I/O */
	struct	osigevent aio_sigevent;	/* Signal to deliver */
	int	aio_lio_opcode;		/* LIO opcode */
	int	aio_reqprio;		/* Request priority -- ignored */
	struct	__aiocb_private	_aiocb_private;
} oaiocb_t;

/*
 * Below is a key of locks used to protect each member of struct aiocblist,
 * aioliojob and kaioinfo, and any backends.
 *
 * * - need not be protected
 * a - locked by kaioinfo lock
 * b - locked by backend lock; the backend lock can be null in some cases,
 *     for example, BIO belongs to this type, in which case the proc lock is
 *     reused.
 * c - locked by aio_job_mtx, the lock for the generic file I/O backend.
 */

/*
 * Currently, there are only two backends: BIO and generic file I/O.
 * Socket I/O is served by generic file I/O, which is not a good idea, since
 * disk file I/O and any other type without the O_NONBLOCK flag can block the
 * daemon threads.  If there is no thread left to serve socket I/O, the
 * socket I/O will be delayed too long or starved.  We should create some
 * threads dedicated to sockets to do non-blocking I/O; the same holds for
 * pipes and fifos.  For these I/O systems we really need a non-blocking
 * interface; fiddling with O_NONBLOCK in the file structure is not safe
 * because there is a race between userland and the aio daemons.
 */

struct aiocblist {
	TAILQ_ENTRY(aiocblist) list;	/* (b) internal list for backend */
	TAILQ_ENTRY(aiocblist) plist;	/* (a) list of jobs for each backend */
	TAILQ_ENTRY(aiocblist) allist;	/* (a) list of all jobs in proc */
	int	jobflags;		/* (a) job flags */
	int	jobstate;		/* (b) job state */
	int	inputcharge;		/* (*) input blocks */
	int	outputcharge;		/* (*) output blocks */
	struct	buf *bp;		/* (*) private to BIO backend,
					 * buffer pointer
					 */
	struct	proc *userproc;		/* (*) user process */
	struct	ucred *cred;		/* (*) active credential when created */
	struct	file *fd_file;		/* (*) pointer to file structure */
	struct	aioliojob *lio;		/* (*) optional lio job */
	struct	aiocb *uuaiocb;		/* (*) pointer in userspace of aiocb */
	struct	knlist klist;		/* (a) list of knotes */
	struct	aiocb uaiocb;		/* (*) kernel I/O control block */
	ksiginfo_t ksi;			/* (a) realtime signal info */
	struct	task biotask;		/* (*) private to BIO backend */
	uint64_t seqno;			/* (*) job number */
	int	pending;		/* (a) number of pending I/O, aio_fsync only */
};

/* jobflags */
#define AIOCBLIST_DONE		0x01
#define AIOCBLIST_BUFDONE	0x02
#define AIOCBLIST_RUNDOWN	0x04
#define AIOCBLIST_CHECKSYNC	0x08

/*
 * AIO process info
 */
#define AIOP_FREE	0x1			/* proc on free queue */

struct aiothreadlist {
	int	aiothreadflags;			/* (c) AIO proc flags */
	TAILQ_ENTRY(aiothreadlist) list;	/* (c) list of processes */
	struct	thread *aiothread;		/* (*) the AIO thread */
};

/*
 * Data structure for lio signal management
 */
struct aioliojob {
	int	lioj_flags;			/* (a) listio flags */
	int	lioj_count;			/* (a) total jobs in this lio */
	int	lioj_finished_count;		/* (a) jobs finished so far */
	struct	sigevent lioj_signal;		/* (a) signal on all I/O done */
	TAILQ_ENTRY(aioliojob) lioj_list;	/* (a) lio list */
	struct	knlist klist;			/* (a) list of knotes */
	ksiginfo_t lioj_ksi;			/* (a) Realtime signal info */
};

#define LIOJ_SIGNAL		0x1	/* signal on all done (lio) */
#define LIOJ_SIGNAL_POSTED	0x2	/* signal has been posted */
#define LIOJ_KEVENT_POSTED	0x4	/* kevent triggered */

/*
 * per process aio data structure
 */
struct kaioinfo {
	struct	mtx kaio_mtx;		/* the lock to protect this struct */
	int	kaio_flags;		/* (a) per process kaio flags */
	int	kaio_maxactive_count;	/* (*) maximum number of AIOs */
	int	kaio_active_count;	/* (c) number of currently used AIOs */
	int	kaio_qallowed_count;	/* (*) maximum size of AIO queue */
	int	kaio_count;		/* (a) size of AIO queue */
	int	kaio_ballowed_count;	/* (*) maximum number of buffers */
	int	kaio_buffer_count;	/* (a) number of physio buffers */
	TAILQ_HEAD(,aiocblist) kaio_all;	/* (a) all AIOs in the process */
	TAILQ_HEAD(,aiocblist) kaio_done;	/* (a) done queue for process */
	TAILQ_HEAD(,aioliojob) kaio_liojoblist;	/* (a) list of lio jobs */
	TAILQ_HEAD(,aiocblist) kaio_jobqueue;	/* (a) job queue for process */
	TAILQ_HEAD(,aiocblist) kaio_bufqueue;	/* (a) buffer job queue for process */
	TAILQ_HEAD(,aiocblist) kaio_sockqueue;	/* (a) queue for aios waiting on sockets,
						 * NOT USED YET.
						 */
	TAILQ_HEAD(,aiocblist) kaio_syncqueue;	/* (a) queue for aio_fsync */
	struct	task kaio_task;		/* (*) task to kick aio threads */
};

#define AIO_LOCK(ki)		mtx_lock(&(ki)->kaio_mtx)
#define AIO_UNLOCK(ki)		mtx_unlock(&(ki)->kaio_mtx)
#define AIO_LOCK_ASSERT(ki, f)	mtx_assert(&(ki)->kaio_mtx, (f))
#define AIO_MTX(ki)		(&(ki)->kaio_mtx)

#define KAIO_RUNDOWN	0x1	/* process is being run down */
#define KAIO_WAKEUP	0x2	/* wakeup process when there is a significant event */

/*
 * Operations used to interact with userland aio control blocks.
 * Different ABIs provide their own operations.
 */
struct aiocb_ops {
	int	(*copyin)(struct aiocb *ujob, struct aiocb *kjob);
	long	(*fetch_status)(struct aiocb *ujob);
	long	(*fetch_error)(struct aiocb *ujob);
	int	(*store_status)(struct aiocb *ujob, long status);
	int	(*store_error)(struct aiocb *ujob, long error);
	int	(*store_kernelinfo)(struct aiocb *ujob, long jobref);
	int	(*store_aiocb)(struct aiocb **ujobp, struct aiocb *ujob);
};
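/*
 * Two instances of this ops vector appear below: aiocb_ops for the native
 * struct aiocb and aiocb_ops_osigevent for control blocks that still use the
 * old osigevent layout.  The COMPAT_FREEBSD32 system calls registered below
 * are expected to supply their own copyin/copyout handling in the same
 * fashion (their ops vector is not part of this excerpt).
 */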
static TAILQ_HEAD(,aiothreadlist) aio_freeproc;		/* (c) Idle daemons */
static struct sema aio_newproc_sem;
static struct mtx aio_job_mtx;
static struct mtx aio_sock_mtx;
static TAILQ_HEAD(,aiocblist) aio_jobs;			/* (c) Async job list */
static struct unrhdr *aiod_unr;

void		aio_init_aioinfo(struct proc *p);
static int	aio_onceonly(void);
static int	aio_free_entry(struct aiocblist *aiocbe);
static void	aio_process(struct aiocblist *aiocbe);
static int	aio_newproc(int *);
int		aio_aqueue(struct thread *td, struct aiocb *job,
		    struct aioliojob *lio, int type, struct aiocb_ops *ops);
static void	aio_physwakeup(struct buf *bp);
static void	aio_proc_rundown(void *arg, struct proc *p);
static void	aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp);
static int	aio_qphysio(struct proc *p, struct aiocblist *iocb);
static void	biohelper(void *, int);
static void	aio_daemon(void *param);
static void	aio_swake_cb(struct socket *, struct sockbuf *);
static int	aio_unload(void);
static void	aio_bio_done_notify(struct proc *userp, struct aiocblist *aiocbe, int type);
#define DONE_BUF	1
#define DONE_QUEUE	2
static int	aio_kick(struct proc *userp);
static void	aio_kick_nowait(struct proc *userp);
static void	aio_kick_helper(void *context, int pending);
static int	filt_aioattach(struct knote *kn);
static void	filt_aiodetach(struct knote *kn);
static int	filt_aio(struct knote *kn, long hint);
static int	filt_lioattach(struct knote *kn);
static void	filt_liodetach(struct knote *kn);
static int	filt_lio(struct knote *kn, long hint);

/*
 * Zones for:
 * 	kaio	Per process async io info
 *	aiop	async io thread data
 *	aiocb	async io jobs
 *	aiol	list io job pointer - internal to aio_suspend XXX
 *	aiolio	list io jobs
 */
static uma_zone_t kaio_zone, aiop_zone, aiocb_zone, aiol_zone, aiolio_zone;

/* kqueue filters for aio */
static struct filterops aio_filtops = {
	.f_isfd = 0,
	.f_attach = filt_aioattach,
	.f_detach = filt_aiodetach,
	.f_event = filt_aio,
};
static struct filterops lio_filtops = {
	.f_isfd = 0,
	.f_attach = filt_lioattach,
	.f_detach = filt_liodetach,
	.f_event = filt_lio
};

static eventhandler_tag exit_tag, exec_tag;

TASKQUEUE_DEFINE_THREAD(aiod_bio);

/*
 * Main operations function for use as a kernel module.
 */
static int
aio_modload(struct module *module, int cmd, void *arg)
{
	int error = 0;

	switch (cmd) {
	case MOD_LOAD:
		aio_onceonly();
		break;
	case MOD_UNLOAD:
		error = aio_unload();
		break;
	case MOD_SHUTDOWN:
		break;
	default:
		error = EINVAL;
		break;
	}
	return (error);
}

static moduledata_t aio_mod = {
	"aio",
	&aio_modload,
	NULL
};

static struct syscall_helper_data aio_syscalls[] = {
	SYSCALL_INIT_HELPER(aio_cancel),
	SYSCALL_INIT_HELPER(aio_error),
	SYSCALL_INIT_HELPER(aio_fsync),
	SYSCALL_INIT_HELPER(aio_read),
	SYSCALL_INIT_HELPER(aio_return),
	SYSCALL_INIT_HELPER(aio_suspend),
	SYSCALL_INIT_HELPER(aio_waitcomplete),
	SYSCALL_INIT_HELPER(aio_write),
	SYSCALL_INIT_HELPER(lio_listio),
	SYSCALL_INIT_HELPER(oaio_read),
	SYSCALL_INIT_HELPER(oaio_write),
	SYSCALL_INIT_HELPER(olio_listio),
	SYSCALL_INIT_LAST
};

#ifdef COMPAT_FREEBSD32
#include <sys/mount.h>
#include <sys/socket.h>
#include <compat/freebsd32/freebsd32.h>
#include <compat/freebsd32/freebsd32_proto.h>
#include <compat/freebsd32/freebsd32_signal.h>
#include <compat/freebsd32/freebsd32_syscall.h>
#include <compat/freebsd32/freebsd32_util.h>

static struct syscall_helper_data aio32_syscalls[] = {
	SYSCALL32_INIT_HELPER(freebsd32_aio_return),
	SYSCALL32_INIT_HELPER(freebsd32_aio_suspend),
	SYSCALL32_INIT_HELPER(freebsd32_aio_cancel),
	SYSCALL32_INIT_HELPER(freebsd32_aio_error),
	SYSCALL32_INIT_HELPER(freebsd32_aio_fsync),
	SYSCALL32_INIT_HELPER(freebsd32_aio_read),
	SYSCALL32_INIT_HELPER(freebsd32_aio_write),
	SYSCALL32_INIT_HELPER(freebsd32_aio_waitcomplete),
	SYSCALL32_INIT_HELPER(freebsd32_lio_listio),
	SYSCALL32_INIT_HELPER(freebsd32_oaio_read),
	SYSCALL32_INIT_HELPER(freebsd32_oaio_write),
	SYSCALL32_INIT_HELPER(freebsd32_olio_listio),
	SYSCALL_INIT_LAST
};
#endif

DECLARE_MODULE(aio, aio_mod,
	SI_SUB_VFS, SI_ORDER_ANY);
MODULE_VERSION(aio, 1);

/*
 * Startup initialization
 */
static int
aio_onceonly(void)
{
	int error;

	/* XXX: should probably just use so->callback */
	aio_swake = &aio_swake_cb;
	exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL,
	    EVENTHANDLER_PRI_ANY);
	exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown_exec, NULL,
	    EVENTHANDLER_PRI_ANY);
	kqueue_add_filteropts(EVFILT_AIO, &aio_filtops);
	kqueue_add_filteropts(EVFILT_LIO, &lio_filtops);
	TAILQ_INIT(&aio_freeproc);
	sema_init(&aio_newproc_sem, 0, "aio_new_proc");
	mtx_init(&aio_job_mtx, "aio_job", NULL, MTX_DEF);
	mtx_init(&aio_sock_mtx, "aio_sock", NULL, MTX_DEF);
	TAILQ_INIT(&aio_jobs);
	aiod_unr = new_unrhdr(1, INT_MAX, NULL);
	kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	aiop_zone = uma_zcreate("AIOP", sizeof(struct aiothreadlist), NULL,
	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	aiocb_zone = uma_zcreate("AIOCB", sizeof(struct aiocblist), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	aiol_zone = uma_zcreate("AIOL", AIO_LISTIO_MAX*sizeof(intptr_t), NULL,
	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aioliojob), NULL,
	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	aiod_timeout = AIOD_TIMEOUT_DEFAULT;
	aiod_lifetime = AIOD_LIFETIME_DEFAULT;
	jobrefid = 1;
	async_io_version = _POSIX_VERSION;
	p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, AIO_LISTIO_MAX);
	p31b_setcfg(CTL_P1003_1B_AIO_MAX, MAX_AIO_QUEUE);
	p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, 0);

	error = syscall_helper_register(aio_syscalls);
	if (error)
		return (error);
#ifdef COMPAT_FREEBSD32
	error = syscall32_helper_register(aio32_syscalls);
	if (error)
		return (error);
#endif
	return (0);
}

/*
 * Callback for unload of AIO when used as a module.
 */
static int
aio_unload(void)
{
	int error;

	/*
	 * XXX: no unloads by default, it's too dangerous.
	 * Perhaps we could do it if we locked out callers and then
	 * did an aio_proc_rundown() on each process.
	 *
	 * jhb: aio_proc_rundown() needs to run on curproc though,
	 * so I don't think that would fly.
	 */
	if (!unloadable)
		return (EOPNOTSUPP);

#ifdef COMPAT_FREEBSD32
	syscall32_helper_unregister(aio32_syscalls);
#endif
	syscall_helper_unregister(aio_syscalls);

	error = kqueue_del_filteropts(EVFILT_AIO);
	if (error)
		return error;
	error = kqueue_del_filteropts(EVFILT_LIO);
	if (error)
		return error;
	async_io_version = 0;
	aio_swake = NULL;
	taskqueue_free(taskqueue_aiod_bio);
	delete_unrhdr(aiod_unr);
	uma_zdestroy(kaio_zone);
	uma_zdestroy(aiop_zone);
	uma_zdestroy(aiocb_zone);
	uma_zdestroy(aiol_zone);
	uma_zdestroy(aiolio_zone);
	EVENTHANDLER_DEREGISTER(process_exit, exit_tag);
	EVENTHANDLER_DEREGISTER(process_exec, exec_tag);
	mtx_destroy(&aio_job_mtx);
	mtx_destroy(&aio_sock_mtx);
	sema_destroy(&aio_newproc_sem);
	p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, -1);
	p31b_setcfg(CTL_P1003_1B_AIO_MAX, -1);
	p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, -1);
	return (0);
}

/*
 * Init the per-process aioinfo structure.  The aioinfo limits are set
 * per-process for user limit (resource) management.
 */
void
aio_init_aioinfo(struct proc *p)
{
	struct kaioinfo *ki;

	ki = uma_zalloc(kaio_zone, M_WAITOK);
	mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF);
	ki->kaio_flags = 0;
	ki->kaio_maxactive_count = max_aio_per_proc;
	ki->kaio_active_count = 0;
	ki->kaio_qallowed_count = max_aio_queue_per_proc;
	ki->kaio_count = 0;
	ki->kaio_ballowed_count = max_buf_aio;
	ki->kaio_buffer_count = 0;
	TAILQ_INIT(&ki->kaio_all);
	TAILQ_INIT(&ki->kaio_done);
	TAILQ_INIT(&ki->kaio_jobqueue);
	TAILQ_INIT(&ki->kaio_bufqueue);
	TAILQ_INIT(&ki->kaio_liojoblist);
	TAILQ_INIT(&ki->kaio_sockqueue);
	TAILQ_INIT(&ki->kaio_syncqueue);
	TASK_INIT(&ki->kaio_task, 0, aio_kick_helper, p);
	PROC_LOCK(p);
	if (p->p_aioinfo == NULL) {
		p->p_aioinfo = ki;
		PROC_UNLOCK(p);
	} else {
		PROC_UNLOCK(p);
		mtx_destroy(&ki->kaio_mtx);
		uma_zfree(kaio_zone, ki);
	}

	while (num_aio_procs < MIN(target_aio_procs, max_aio_procs))
		aio_newproc(NULL);
}

static int
aio_sendsig(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi)
{
	int ret = 0;

	PROC_LOCK(p);
	if (!KSI_ONQ(ksi)) {
		ksi->ksi_code = SI_ASYNCIO;
		ksi->ksi_flags |= KSI_EXT | KSI_INS;
		ret = psignal_event(p, sigev, ksi);
	}
	PROC_UNLOCK(p);
	return (ret);
}

/*
 * Free a job entry.  Wait for completion if it is currently active, but don't
 * delay forever.  If we delay, we return a flag that says that we have to
 * restart the queue scan.
 */
static int
aio_free_entry(struct aiocblist *aiocbe)
{
	struct kaioinfo *ki;
	struct aioliojob *lj;
	struct proc *p;

	p = aiocbe->userproc;
	MPASS(curproc == p);
	ki = p->p_aioinfo;
	MPASS(ki != NULL);

	AIO_LOCK_ASSERT(ki, MA_OWNED);
	MPASS(aiocbe->jobstate == JOBST_JOBFINISHED);

	atomic_subtract_int(&num_queue_count, 1);

	ki->kaio_count--;
	MPASS(ki->kaio_count >= 0);

	TAILQ_REMOVE(&ki->kaio_done, aiocbe, plist);
	TAILQ_REMOVE(&ki->kaio_all, aiocbe, allist);

	lj = aiocbe->lio;
	if (lj) {
		lj->lioj_count--;
		lj->lioj_finished_count--;

		if (lj->lioj_count == 0) {
			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
			/* lio is going away, we need to destroy any knotes */
			knlist_delete(&lj->klist, curthread, 1);
			PROC_LOCK(p);
			sigqueue_take(&lj->lioj_ksi);
			PROC_UNLOCK(p);
			uma_zfree(aiolio_zone, lj);
		}
	}

	/* aiocbe is going away, we need to destroy any knotes */
	knlist_delete(&aiocbe->klist, curthread, 1);
	PROC_LOCK(p);
	sigqueue_take(&aiocbe->ksi);
	PROC_UNLOCK(p);

	MPASS(aiocbe->bp == NULL);
	aiocbe->jobstate = JOBST_NULL;
	AIO_UNLOCK(ki);

	/*
	 * The thread argument here is used to find the owning process
	 * and is also passed to fo_close() which may pass it to various
	 * places such as devsw close() routines.  Because of that, we
	 * need a thread pointer from the process owning the job that is
	 * persistent and won't disappear out from under us or move to
	 * another process.
	 *
	 * Currently, all the callers of this function call it to remove
	 * an aiocblist from the current process' job list either via a
	 * syscall or due to the current process calling exit() or
	 * execve().  Thus, we know that p == curproc.  We also know that
	 * curthread can't exit since we are curthread.
	 *
	 * Therefore, we use curthread as the thread to pass to
	 * knlist_delete().  This does mean that it is possible for the
	 * thread pointer at close time to differ from the thread pointer
	 * at open time, but this is already true of file descriptors in
	 * a multithreaded process.
	 */
	fdrop(aiocbe->fd_file, curthread);
	crfree(aiocbe->cred);
	uma_zfree(aiocb_zone, aiocbe);
	AIO_LOCK(ki);

	return (0);
}

static void
aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp __unused)
{
	aio_proc_rundown(arg, p);
}

/*
 * Rundown the jobs for a given process.
 */
static void
aio_proc_rundown(void *arg, struct proc *p)
{
	struct kaioinfo *ki;
	struct aioliojob *lj;
	struct aiocblist *cbe, *cbn;
	struct file *fp;
	struct socket *so;
	int remove;

	KASSERT(curthread->td_proc == p,
	    ("%s: called on non-curproc", __func__));
	ki = p->p_aioinfo;
	if (ki == NULL)
		return;

	AIO_LOCK(ki);
	ki->kaio_flags |= KAIO_RUNDOWN;

restart:

	/*
	 * Try to cancel all pending requests.  This code simulates
	 * aio_cancel on all pending I/O requests.
	 */
	TAILQ_FOREACH_SAFE(cbe, &ki->kaio_jobqueue, plist, cbn) {
		remove = 0;
		mtx_lock(&aio_job_mtx);
		if (cbe->jobstate == JOBST_JOBQGLOBAL) {
			TAILQ_REMOVE(&aio_jobs, cbe, list);
			remove = 1;
		} else if (cbe->jobstate == JOBST_JOBQSOCK) {
			fp = cbe->fd_file;
			MPASS(fp->f_type == DTYPE_SOCKET);
			so = fp->f_data;
			TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
			remove = 1;
		} else if (cbe->jobstate == JOBST_JOBQSYNC) {
			TAILQ_REMOVE(&ki->kaio_syncqueue, cbe, list);
			remove = 1;
		}
		mtx_unlock(&aio_job_mtx);

		if (remove) {
			cbe->jobstate = JOBST_JOBFINISHED;
			cbe->uaiocb._aiocb_private.status = -1;
			cbe->uaiocb._aiocb_private.error = ECANCELED;
			TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
			aio_bio_done_notify(p, cbe, DONE_QUEUE);
		}
	}

	/* Wait for all running I/O to be finished */
	if (TAILQ_FIRST(&ki->kaio_bufqueue) ||
	    TAILQ_FIRST(&ki->kaio_jobqueue)) {
		ki->kaio_flags |= KAIO_WAKEUP;
		msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO, "aioprn", hz);
		goto restart;
	}

	/* Free all completed I/O requests. */
	while ((cbe = TAILQ_FIRST(&ki->kaio_done)) != NULL)
		aio_free_entry(cbe);

	while ((lj = TAILQ_FIRST(&ki->kaio_liojoblist)) != NULL) {
		if (lj->lioj_count == 0) {
			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
			knlist_delete(&lj->klist, curthread, 1);
			PROC_LOCK(p);
			sigqueue_take(&lj->lioj_ksi);
			PROC_UNLOCK(p);
			uma_zfree(aiolio_zone, lj);
		} else {
			panic("LIO job not cleaned up: C:%d, FC:%d\n",
			    lj->lioj_count, lj->lioj_finished_count);
		}
	}
	AIO_UNLOCK(ki);
	taskqueue_drain(taskqueue_aiod_bio, &ki->kaio_task);
	mtx_destroy(&ki->kaio_mtx);
	uma_zfree(kaio_zone, ki);
	p->p_aioinfo = NULL;
}

/*
 * Select a job to run (called by an AIO daemon).
 */
static struct aiocblist *
aio_selectjob(struct aiothreadlist *aiop)
{
	struct aiocblist *aiocbe;
	struct kaioinfo *ki;
	struct proc *userp;

	mtx_assert(&aio_job_mtx, MA_OWNED);
	TAILQ_FOREACH(aiocbe, &aio_jobs, list) {
		userp = aiocbe->userproc;
		ki = userp->p_aioinfo;

		if (ki->kaio_active_count < ki->kaio_maxactive_count) {
			TAILQ_REMOVE(&aio_jobs, aiocbe, list);
			/* Account for currently active jobs. */
			ki->kaio_active_count++;
			aiocbe->jobstate = JOBST_JOBRUNNING;
			break;
		}
	}
	return (aiocbe);
}

/*
 * Move all data to a permanent storage device; this code simulates the
 * fsync syscall.
 */
static int
aio_fsync_vnode(struct thread *td, struct vnode *vp)
{
	struct mount *mp;
	int vfslocked;
	int error;

	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
		goto drop;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (vp->v_object != NULL) {
		VM_OBJECT_LOCK(vp->v_object);
		vm_object_page_clean(vp->v_object, 0, 0, 0);
		VM_OBJECT_UNLOCK(vp->v_object);
	}
	error = VOP_FSYNC(vp, MNT_WAIT, td);

	VOP_UNLOCK(vp, 0);
	vn_finished_write(mp);
drop:
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}

/*
 * The AIO processing activity.  This is the code that does the I/O request for
 * the non-physio version of the operations.  The normal vn operations are used,
 * and this code should work in all instances for every type of file, including
 * pipes, sockets, fifos, and regular files.
 *
 * XXX I don't think it works well for sockets, pipes, and fifos.
 */
static void
aio_process(struct aiocblist *aiocbe)
{
	struct ucred *td_savedcred;
	struct thread *td;
	struct aiocb *cb;
	struct file *fp;
	struct socket *so;
	struct uio auio;
	struct iovec aiov;
	int cnt;
	int error;
	int oublock_st, oublock_end;
	int inblock_st, inblock_end;

	td = curthread;
	td_savedcred = td->td_ucred;
	td->td_ucred = aiocbe->cred;
	cb = &aiocbe->uaiocb;
	fp = aiocbe->fd_file;

	if (cb->aio_lio_opcode == LIO_SYNC) {
		error = 0;
		cnt = 0;
		if (fp->f_vnode != NULL)
			error = aio_fsync_vnode(td, fp->f_vnode);
		cb->_aiocb_private.error = error;
		cb->_aiocb_private.status = 0;
		td->td_ucred = td_savedcred;
		return;
	}

	aiov.iov_base = (void *)(uintptr_t)cb->aio_buf;
	aiov.iov_len = cb->aio_nbytes;

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = cb->aio_offset;
	auio.uio_resid = cb->aio_nbytes;
	cnt = cb->aio_nbytes;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	inblock_st = td->td_ru.ru_inblock;
	oublock_st = td->td_ru.ru_oublock;
	/*
	 * aio_aqueue() acquires a reference to the file that is
	 * released in aio_free_entry().
	 */
	if (cb->aio_lio_opcode == LIO_READ) {
		auio.uio_rw = UIO_READ;
		if (auio.uio_resid == 0)
			error = 0;
		else
			error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td);
	} else {
		if (fp->f_type == DTYPE_VNODE)
			bwillwrite();
		auio.uio_rw = UIO_WRITE;
		error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td);
	}
	inblock_end = td->td_ru.ru_inblock;
	oublock_end = td->td_ru.ru_oublock;

	aiocbe->inputcharge = inblock_end - inblock_st;
	aiocbe->outputcharge = oublock_end - oublock_st;

	if ((error) && (auio.uio_resid != cnt)) {
		if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
			error = 0;
		if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) {
			int sigpipe = 1;
			if (fp->f_type == DTYPE_SOCKET) {
				so = fp->f_data;
				if (so->so_options & SO_NOSIGPIPE)
					sigpipe = 0;
			}
			if (sigpipe) {
				PROC_LOCK(aiocbe->userproc);
				psignal(aiocbe->userproc, SIGPIPE);
				PROC_UNLOCK(aiocbe->userproc);
			}
		}
	}

	cnt -= auio.uio_resid;
	cb->_aiocb_private.error = error;
	cb->_aiocb_private.status = cnt;
	td->td_ucred = td_savedcred;
}

static void
aio_bio_done_notify(struct proc *userp, struct aiocblist *aiocbe, int type)
{
	struct aioliojob *lj;
	struct kaioinfo *ki;
	struct aiocblist *scb, *scbn;
	int lj_done;

	ki = userp->p_aioinfo;
	AIO_LOCK_ASSERT(ki, MA_OWNED);
	lj = aiocbe->lio;
	lj_done = 0;
	if (lj) {
		lj->lioj_finished_count++;
		if (lj->lioj_count == lj->lioj_finished_count)
			lj_done = 1;
	}
	if (type == DONE_QUEUE) {
		aiocbe->jobflags |= AIOCBLIST_DONE;
	} else {
		aiocbe->jobflags |= AIOCBLIST_BUFDONE;
	}
	TAILQ_INSERT_TAIL(&ki->kaio_done, aiocbe, plist);
	aiocbe->jobstate = JOBST_JOBFINISHED;

	if (ki->kaio_flags & KAIO_RUNDOWN)
		goto notification_done;

	if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
	    aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID)
		aio_sendsig(userp, &aiocbe->uaiocb.aio_sigevent, &aiocbe->ksi);

	KNOTE_LOCKED(&aiocbe->klist, 1);

	if (lj_done) {
		if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
			lj->lioj_flags |= LIOJ_KEVENT_POSTED;
			KNOTE_LOCKED(&lj->klist, 1);
		}
		if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
		    == LIOJ_SIGNAL
		    && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
		    lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
			aio_sendsig(userp, &lj->lioj_signal, &lj->lioj_ksi);
			lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
		}
	}

notification_done:
	if (aiocbe->jobflags & AIOCBLIST_CHECKSYNC) {
		TAILQ_FOREACH_SAFE(scb, &ki->kaio_syncqueue, list, scbn) {
			if (aiocbe->fd_file == scb->fd_file &&
			    aiocbe->seqno < scb->seqno) {
				if (--scb->pending == 0) {
					mtx_lock(&aio_job_mtx);
					scb->jobstate = JOBST_JOBQGLOBAL;
					TAILQ_REMOVE(&ki->kaio_syncqueue, scb, list);
					TAILQ_INSERT_TAIL(&aio_jobs, scb, list);
					aio_kick_nowait(userp);
					mtx_unlock(&aio_job_mtx);
				}
			}
		}
	}
	if (ki->kaio_flags & KAIO_WAKEUP) {
		ki->kaio_flags &= ~KAIO_WAKEUP;
		wakeup(&userp->p_aioinfo);
	}
}

/*
 * The AIO daemon.  Most of the actual work is done in aio_process(),
 * but the setup (and address space management) is done in this routine.
 */
static void
aio_daemon(void *_id)
{
	struct aiocblist *aiocbe;
	struct aiothreadlist *aiop;
	struct kaioinfo *ki;
	struct proc *curcp, *mycp, *userp;
	struct vmspace *myvm, *tmpvm;
	struct thread *td = curthread;
	int id = (intptr_t)_id;

	/*
	 * Local copies of curproc (cp) and vmspace (myvm)
	 */
	mycp = td->td_proc;
	myvm = mycp->p_vmspace;

	KASSERT(mycp->p_textvp == NULL, ("kthread has a textvp"));

	/*
	 * Allocate and ready the aio control info.  There is one aiop structure
	 * per daemon.
	 */
	aiop = uma_zalloc(aiop_zone, M_WAITOK);
	aiop->aiothread = td;
	aiop->aiothreadflags = 0;

	/* The daemon resides in its own pgrp. */
	setsid(td, NULL);

	/*
	 * Wakeup parent process.  (Parent sleeps to keep from blasting away
	 * and creating too many daemons.)
	 */
	sema_post(&aio_newproc_sem);

	mtx_lock(&aio_job_mtx);
	for (;;) {
		/*
		 * curcp is the current daemon process context.
		 * userp is the current user process context.
		 */
		curcp = mycp;

		/*
		 * Take daemon off of free queue
		 */
		if (aiop->aiothreadflags & AIOP_FREE) {
			TAILQ_REMOVE(&aio_freeproc, aiop, list);
			aiop->aiothreadflags &= ~AIOP_FREE;
		}

		/*
		 * Check for jobs.
		 */
		while ((aiocbe = aio_selectjob(aiop)) != NULL) {
			mtx_unlock(&aio_job_mtx);
			userp = aiocbe->userproc;

			/*
			 * Connect to process address space for user program.
			 */
			if (userp != curcp) {
				/*
				 * Save the current address space that we are
				 * connected to.
				 */
				tmpvm = mycp->p_vmspace;

				/*
				 * Point to the new user address space, and
				 * refer to it.
				 */
				mycp->p_vmspace = userp->p_vmspace;
				atomic_add_int(&mycp->p_vmspace->vm_refcnt, 1);

				/* Activate the new mapping. */
				pmap_activate(FIRST_THREAD_IN_PROC(mycp));

				/*
				 * If the old address space wasn't the daemon's
				 * own address space, then we need to remove the
				 * daemon's reference from the other process
				 * that it was acting on behalf of.
				 */
				if (tmpvm != myvm) {
					vmspace_free(tmpvm);
				}
				curcp = userp;
			}

			ki = userp->p_aioinfo;

			/* Do the I/O function. */
			aio_process(aiocbe);

			mtx_lock(&aio_job_mtx);
			/* Decrement the active job count. */
			ki->kaio_active_count--;
			mtx_unlock(&aio_job_mtx);

			AIO_LOCK(ki);
			TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
			aio_bio_done_notify(userp, aiocbe, DONE_QUEUE);
			AIO_UNLOCK(ki);

			mtx_lock(&aio_job_mtx);
		}

		/*
		 * Disconnect from user address space.
		 */
		if (curcp != mycp) {

			mtx_unlock(&aio_job_mtx);

			/* Get the user address space to disconnect from. */
			tmpvm = mycp->p_vmspace;

			/* Get original address space for daemon. */
			mycp->p_vmspace = myvm;

			/* Activate the daemon's address space. */
			pmap_activate(FIRST_THREAD_IN_PROC(mycp));
#ifdef DIAGNOSTIC
			if (tmpvm == myvm) {
				printf("AIOD: vmspace problem -- %d\n",
				    mycp->p_pid);
			}
#endif
			/* Remove our vmspace reference. */
			vmspace_free(tmpvm);

			curcp = mycp;

			mtx_lock(&aio_job_mtx);
			/*
			 * We have to restart to avoid a race; we only sleep
			 * if no job can be selected, which should mean
			 * curcp == mycp.
			 */
			continue;
		}

		mtx_assert(&aio_job_mtx, MA_OWNED);

		TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
		aiop->aiothreadflags |= AIOP_FREE;

		/*
		 * If the daemon is inactive for a long time, allow it to exit,
		 * thereby freeing resources.
		 */
		if (msleep(aiop->aiothread, &aio_job_mtx, PRIBIO, "aiordy",
		    aiod_lifetime)) {
			if (TAILQ_EMPTY(&aio_jobs)) {
				if ((aiop->aiothreadflags & AIOP_FREE) &&
				    (num_aio_procs > target_aio_procs)) {
					TAILQ_REMOVE(&aio_freeproc, aiop, list);
					num_aio_procs--;
					mtx_unlock(&aio_job_mtx);
					uma_zfree(aiop_zone, aiop);
					free_unr(aiod_unr, id);
#ifdef DIAGNOSTIC
					if (mycp->p_vmspace->vm_refcnt <= 1) {
						printf("AIOD: bad vm refcnt for"
						    " exiting daemon: %d\n",
						    mycp->p_vmspace->vm_refcnt);
					}
#endif
					kproc_exit(0);
				}
			}
		}
	}
	mtx_unlock(&aio_job_mtx);
	panic("shouldn't be here\n");
}

/*
 * Create a new AIO daemon.  This is mostly a kernel-thread fork routine.  The
 * AIO daemon modifies its environment itself.
 */
static int
aio_newproc(int *start)
{
	int error;
	struct proc *p;
	int id;

	id = alloc_unr(aiod_unr);
	error = kproc_create(aio_daemon, (void *)(intptr_t)id, &p,
	    RFNOWAIT, 0, "aiod%d", id);
	if (error == 0) {
		/*
		 * Wait until daemon is started.
		 */
		sema_wait(&aio_newproc_sem);
		mtx_lock(&aio_job_mtx);
		num_aio_procs++;
		if (start != NULL)
			(*start)--;
		mtx_unlock(&aio_job_mtx);
	} else {
		free_unr(aiod_unr, id);
	}
	return (error);
}

/*
 * Try the high-performance, low-overhead physio method for eligible
 * VCHR devices.  This method doesn't use an aio helper thread, and
 * thus has very low overhead.
 *
 * Assumes that the caller, aio_aqueue(), has incremented the file
 * structure's reference count, preventing its deallocation for the
 * duration of this call.
 */
static int
aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
{
	struct aiocb *cb;
	struct file *fp;
	struct buf *bp;
	struct vnode *vp;
	struct kaioinfo *ki;
	struct aioliojob *lj;
	int error;

	cb = &aiocbe->uaiocb;
	fp = aiocbe->fd_file;

	if (fp->f_type != DTYPE_VNODE)
		return (-1);

	vp = fp->f_vnode;

	/*
	 * If it's not a disk, we don't want to return a positive error.
	 * It causes the aio code to not fall through to try the thread
	 * way when you're talking to a regular file.
	 */
	if (!vn_isdisk(vp, &error)) {
		if (error == ENOTBLK)
			return (-1);
		else
			return (error);
	}

	if (vp->v_bufobj.bo_bsize == 0)
		return (-1);

	if (cb->aio_nbytes % vp->v_bufobj.bo_bsize)
		return (-1);

	if (cb->aio_nbytes > vp->v_rdev->si_iosize_max)
		return (-1);

	if (cb->aio_nbytes >
	    MAXPHYS - (((vm_offset_t) cb->aio_buf) & PAGE_MASK))
		return (-1);

	ki = p->p_aioinfo;
	if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
		return (-1);

	/* Create and build a buffer header for a transfer. */
	bp = (struct buf *)getpbuf(NULL);
	BUF_KERNPROC(bp);

	AIO_LOCK(ki);
	ki->kaio_count++;
	ki->kaio_buffer_count++;
	lj = aiocbe->lio;
	if (lj)
		lj->lioj_count++;
	AIO_UNLOCK(ki);

	/*
	 * Get a copy of the kva from the physical buffer.
	 */
	error = 0;

	bp->b_bcount = cb->aio_nbytes;
	bp->b_bufsize = cb->aio_nbytes;
	bp->b_iodone = aio_physwakeup;
	bp->b_saveaddr = bp->b_data;
	bp->b_data = (void *)(uintptr_t)cb->aio_buf;
	bp->b_offset = cb->aio_offset;
	bp->b_iooffset = cb->aio_offset;
	bp->b_blkno = btodb(cb->aio_offset);
	bp->b_iocmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ;

	/*
	 * Bring buffer into kernel space.
	 */
	if (vmapbuf(bp) < 0) {
		error = EFAULT;
		goto doerror;
	}

	AIO_LOCK(ki);
	aiocbe->bp = bp;
	bp->b_caller1 = (void *)aiocbe;
	TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
	TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
	aiocbe->jobstate = JOBST_JOBQBUF;
	cb->_aiocb_private.status = cb->aio_nbytes;
	AIO_UNLOCK(ki);

	atomic_add_int(&num_queue_count, 1);
	atomic_add_int(&num_buf_aio, 1);

	bp->b_error = 0;

	TASK_INIT(&aiocbe->biotask, 0, biohelper, aiocbe);

	/* Perform transfer. */
	dev_strategy(vp->v_rdev, bp);
	return (0);

doerror:
	AIO_LOCK(ki);
	ki->kaio_count--;
	ki->kaio_buffer_count--;
	if (lj)
		lj->lioj_count--;
	aiocbe->bp = NULL;
	AIO_UNLOCK(ki);
	relpbuf(bp, NULL);
	return (error);
}

/*
 * Wake up aio requests that may be serviceable now.
 */
static void
aio_swake_cb(struct socket *so, struct sockbuf *sb)
{
	struct aiocblist *cb, *cbn;
	int opcode;

	SOCKBUF_LOCK_ASSERT(sb);
	if (sb == &so->so_snd)
		opcode = LIO_WRITE;
	else
		opcode = LIO_READ;

	sb->sb_flags &= ~SB_AIO;
	mtx_lock(&aio_job_mtx);
	TAILQ_FOREACH_SAFE(cb, &so->so_aiojobq, list, cbn) {
		if (opcode == cb->uaiocb.aio_lio_opcode) {
			if (cb->jobstate != JOBST_JOBQSOCK)
				panic("invalid queue value");
			/* XXX
			 * We don't have an actual socket backend yet,
			 * so we simply move the requests to the generic
			 * file I/O backend.
			 */
			TAILQ_REMOVE(&so->so_aiojobq, cb, list);
			TAILQ_INSERT_TAIL(&aio_jobs, cb, list);
			aio_kick_nowait(cb->userproc);
		}
	}
	mtx_unlock(&aio_job_mtx);
}

static int
convert_old_sigevent(struct osigevent *osig, struct sigevent *nsig)
{

	/*
	 * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are
	 * supported by AIO with the old sigevent structure.
	 */
	nsig->sigev_notify = osig->sigev_notify;
	switch (nsig->sigev_notify) {
	case SIGEV_NONE:
		break;
	case SIGEV_SIGNAL:
		nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
		break;
	case SIGEV_KEVENT:
		nsig->sigev_notify_kqueue =
		    osig->__sigev_u.__sigev_notify_kqueue;
		nsig->sigev_value.sival_ptr = osig->sigev_value.sival_ptr;
		break;
	default:
		return (EINVAL);
	}
	return (0);
}

static int
aiocb_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob)
{
	struct oaiocb *ojob;
	int error;

	bzero(kjob, sizeof(struct aiocb));
	error = copyin(ujob, kjob, sizeof(struct oaiocb));
	if (error)
		return (error);
	ojob = (struct oaiocb *)kjob;
	return (convert_old_sigevent(&ojob->aio_sigevent, &kjob->aio_sigevent));
}

static int
aiocb_copyin(struct aiocb *ujob, struct aiocb *kjob)
{

	return (copyin(ujob, kjob, sizeof(struct aiocb)));
}

static long
aiocb_fetch_status(struct aiocb *ujob)
{

	return (fuword(&ujob->_aiocb_private.status));
}

static long
aiocb_fetch_error(struct aiocb *ujob)
{

	return (fuword(&ujob->_aiocb_private.error));
}

static int
aiocb_store_status(struct aiocb *ujob, long status)
{

	return (suword(&ujob->_aiocb_private.status, status));
}

static int
aiocb_store_error(struct aiocb *ujob, long error)
{

	return (suword(&ujob->_aiocb_private.error, error));
}

static int
aiocb_store_kernelinfo(struct aiocb *ujob, long jobref)
{

	return (suword(&ujob->_aiocb_private.kernelinfo, jobref));
}

static int
aiocb_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
{

	return (suword(ujobp, (long)ujob));
}

static struct aiocb_ops aiocb_ops = {
	.copyin = aiocb_copyin,
	.fetch_status = aiocb_fetch_status,
	.fetch_error = aiocb_fetch_error,
	.store_status = aiocb_store_status,
	.store_error = aiocb_store_error,
	.store_kernelinfo = aiocb_store_kernelinfo,
	.store_aiocb = aiocb_store_aiocb,
};

static struct aiocb_ops aiocb_ops_osigevent = {
	.copyin = aiocb_copyin_old_sigevent,
	.fetch_status = aiocb_fetch_status,
	.fetch_error = aiocb_fetch_error,
	.store_status = aiocb_store_status,
	.store_error = aiocb_store_error,
	.store_kernelinfo = aiocb_store_kernelinfo,
	.store_aiocb = aiocb_store_aiocb,
};

/*
 * Queue a new AIO request.  Choosing either the threaded or direct physio VCHR
 * technique is done in this code.
 */
int
aio_aqueue(struct thread *td, struct aiocb *job, struct aioliojob *lj,
    int type, struct aiocb_ops *ops)
{
	struct proc *p = td->td_proc;
	struct file *fp;
	struct socket *so;
	struct aiocblist *aiocbe, *cb;
	struct kaioinfo *ki;
	struct kevent kev;
	struct sockbuf *sb;
	int opcode;
	int error;
	int fd, kqfd;
	int jid;

	if (p->p_aioinfo == NULL)
		aio_init_aioinfo(p);

	ki = p->p_aioinfo;

	ops->store_status(job, -1);
	ops->store_error(job, 0);
	ops->store_kernelinfo(job, -1);

	if (num_queue_count >= max_queue_count ||
	    ki->kaio_count >= ki->kaio_qallowed_count) {
		ops->store_error(job, EAGAIN);
		return (EAGAIN);
	}

	aiocbe = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO);
	aiocbe->inputcharge = 0;
	aiocbe->outputcharge = 0;
	knlist_init_mtx(&aiocbe->klist, AIO_MTX(ki));

	error = ops->copyin(job, &aiocbe->uaiocb);
	if (error) {
		ops->store_error(job, error);
		uma_zfree(aiocb_zone, aiocbe);
		return (error);
	}

	if (aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT &&
	    aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL &&
	    aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID &&
	    aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_NONE) {
		ops->store_error(job, EINVAL);
		uma_zfree(aiocb_zone, aiocbe);
		return (EINVAL);
	}

	if ((aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
	     aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) &&
	    !_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) {
		uma_zfree(aiocb_zone, aiocbe);
		return (EINVAL);
	}

	ksiginfo_init(&aiocbe->ksi);

	/* Save userspace address of the job info. */
	aiocbe->uuaiocb = job;

	/* Get the opcode. */
	if (type != LIO_NOP)
		aiocbe->uaiocb.aio_lio_opcode = type;
	opcode = aiocbe->uaiocb.aio_lio_opcode;

	/* Fetch the file object for the specified file descriptor. */
	fd = aiocbe->uaiocb.aio_fildes;
	switch (opcode) {
	case LIO_WRITE:
		error = fget_write(td, fd, &fp);
		break;
	case LIO_READ:
		error = fget_read(td, fd, &fp);
		break;
	default:
		error = fget(td, fd, &fp);
	}
	if (error) {
		uma_zfree(aiocb_zone, aiocbe);
		ops->store_error(job, error);
		return (error);
	}

	if (opcode == LIO_SYNC && fp->f_vnode == NULL) {
		error = EINVAL;
		goto aqueue_fail;
	}

	if (opcode != LIO_SYNC && aiocbe->uaiocb.aio_offset == -1LL) {
		error = EINVAL;
		goto aqueue_fail;
	}

	aiocbe->fd_file = fp;

	mtx_lock(&aio_job_mtx);
	jid = jobrefid++;
	aiocbe->seqno = jobseqno++;
	mtx_unlock(&aio_job_mtx);
	error = ops->store_kernelinfo(job, jid);
	if (error) {
		error = EINVAL;
		goto aqueue_fail;
	}
	aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jid;

	if (opcode == LIO_NOP) {
		fdrop(fp, td);
		uma_zfree(aiocb_zone, aiocbe);
		return (0);
	}
	if ((opcode != LIO_READ) && (opcode != LIO_WRITE) &&
	    (opcode != LIO_SYNC)) {
		error = EINVAL;
		goto aqueue_fail;
	}

	if (aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT)
		goto no_kqueue;
	kqfd = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue;
	kev.ident = (uintptr_t)aiocbe->uuaiocb;
	kev.filter = EVFILT_AIO;
	kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
	kev.data = (intptr_t)aiocbe;
	kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sival_ptr;
	error = kqfd_register(kqfd, &kev, td, 1);
aqueue_fail:
	if (error) {
		fdrop(fp, td);
		uma_zfree(aiocb_zone, aiocbe);
		ops->store_error(job, error);
		goto done;
	}
no_kqueue:

	ops->store_error(job, EINPROGRESS);
	aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
	aiocbe->userproc = p;
	aiocbe->cred = crhold(td->td_ucred);
	aiocbe->jobflags = 0;
	aiocbe->lio = lj;

	if (opcode == LIO_SYNC)
		goto queueit;

	if (fp->f_type == DTYPE_SOCKET) {
		/*
		 * Alternate queueing for socket ops: Reach down into the
		 * descriptor to get the socket data.  Then check to see if the
		 * socket is ready to be read or written (based on the requested
		 * operation).
		 *
		 * If it is not ready for io, then queue the aiocbe on the
		 * socket, and set the flags so we get a call when sbnotify()
		 * happens.
		 *
		 * Note if opcode is neither LIO_WRITE nor LIO_READ we lock
		 * and unlock the snd sockbuf for no reason.
		 */
		so = fp->f_data;
		sb = (opcode == LIO_READ) ? &so->so_rcv : &so->so_snd;
		SOCKBUF_LOCK(sb);
		if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode ==
		    LIO_WRITE) && (!sowriteable(so)))) {
			sb->sb_flags |= SB_AIO;

			mtx_lock(&aio_job_mtx);
			TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list);
			mtx_unlock(&aio_job_mtx);

			AIO_LOCK(ki);
			TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
			TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
			aiocbe->jobstate = JOBST_JOBQSOCK;
			ki->kaio_count++;
			if (lj)
				lj->lioj_count++;
			AIO_UNLOCK(ki);
			SOCKBUF_UNLOCK(sb);
			atomic_add_int(&num_queue_count, 1);
			error = 0;
			goto done;
		}
		SOCKBUF_UNLOCK(sb);
	}

	if ((error = aio_qphysio(p, aiocbe)) == 0)
		goto done;
#if 0
	if (error > 0) {
		aiocbe->uaiocb._aiocb_private.error = error;
		ops->store_error(job, error);
		goto done;
	}
#endif
queueit:
	/* No buffer for daemon I/O. */
	aiocbe->bp = NULL;
	atomic_add_int(&num_queue_count, 1);

	AIO_LOCK(ki);
	ki->kaio_count++;
	if (lj)
		lj->lioj_count++;
	TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
	TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
	if (opcode == LIO_SYNC) {
		TAILQ_FOREACH(cb, &ki->kaio_jobqueue, plist) {
			if (cb->fd_file == aiocbe->fd_file &&
			    cb->uaiocb.aio_lio_opcode != LIO_SYNC &&
			    cb->seqno < aiocbe->seqno) {
				cb->jobflags |= AIOCBLIST_CHECKSYNC;
				aiocbe->pending++;
			}
		}
		TAILQ_FOREACH(cb, &ki->kaio_bufqueue, plist) {
			if (cb->fd_file == aiocbe->fd_file &&
			    cb->uaiocb.aio_lio_opcode != LIO_SYNC &&
			    cb->seqno < aiocbe->seqno) {
				cb->jobflags |= AIOCBLIST_CHECKSYNC;
				aiocbe->pending++;
			}
		}
		if (aiocbe->pending != 0) {
			TAILQ_INSERT_TAIL(&ki->kaio_syncqueue, aiocbe, list);
			aiocbe->jobstate = JOBST_JOBQSYNC;
			AIO_UNLOCK(ki);
			goto done;
		}
	}
	mtx_lock(&aio_job_mtx);
	TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
	aiocbe->jobstate = JOBST_JOBQGLOBAL;
	aio_kick_nowait(p);
	mtx_unlock(&aio_job_mtx);
	AIO_UNLOCK(ki);
	error = 0;
done:
	return (error);
}

static void
aio_kick_nowait(struct proc *userp)
{
	struct kaioinfo *ki = userp->p_aioinfo;
	struct aiothreadlist *aiop;

	mtx_assert(&aio_job_mtx, MA_OWNED);
	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
		TAILQ_REMOVE(&aio_freeproc, aiop, list);
		aiop->aiothreadflags &= ~AIOP_FREE;
		wakeup(aiop->aiothread);
	} else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
	    ((ki->kaio_active_count + num_aio_resv_start) <
	    ki->kaio_maxactive_count)) {
		taskqueue_enqueue(taskqueue_aiod_bio, &ki->kaio_task);
	}
}

static int
aio_kick(struct proc *userp)
{
	struct kaioinfo *ki = userp->p_aioinfo;
	struct aiothreadlist *aiop;
	int error, ret = 0;

	mtx_assert(&aio_job_mtx, MA_OWNED);
retryproc:
	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
		TAILQ_REMOVE(&aio_freeproc, aiop, list);
		aiop->aiothreadflags &= ~AIOP_FREE;
		wakeup(aiop->aiothread);
	} else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
	    ((ki->kaio_active_count + num_aio_resv_start) <
	    ki->kaio_maxactive_count)) {
		num_aio_resv_start++;
		mtx_unlock(&aio_job_mtx);
		error = aio_newproc(&num_aio_resv_start);
		mtx_lock(&aio_job_mtx);
		if (error) {
			num_aio_resv_start--;
			goto retryproc;
		}
	} else {
		ret = -1;
	}
	return (ret);
}

static void
aio_kick_helper(void *context, int pending)
{
	struct proc *userp = context;

	mtx_lock(&aio_job_mtx);
	while (--pending >= 0) {
		if (aio_kick(userp))
			break;
	}
	mtx_unlock(&aio_job_mtx);
}
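/*
 * Illustrative userland usage of the facility implemented by the system
 * calls below -- a sketch only, not kernel code: an application fills in a
 * struct aiocb, submits it with aio_read(2) or aio_write(2), polls or sleeps
 * with aio_error(2)/aio_suspend(2), and finally reaps the result (and the
 * kernel resources) with aio_return(2):
 *
 *	struct aiocb cb = { 0 };
 *	cb.aio_fildes = fd;
 *	cb.aio_buf = buf;
 *	cb.aio_nbytes = sizeof(buf);
 *	cb.aio_offset = 0;
 *	if (aio_read(&cb) == 0) {
 *		const struct aiocb *list[1] = { &cb };
 *		while (aio_error(&cb) == EINPROGRESS)
 *			aio_suspend(list, 1, NULL);
 *		ssize_t n = aio_return(&cb);
 *	}
 */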
/*
 * Support the aio_return system call; as a side effect, kernel resources are
 * released.
 */
static int
kern_aio_return(struct thread *td, struct aiocb *uaiocb, struct aiocb_ops *ops)
{
	struct proc *p = td->td_proc;
	struct aiocblist *cb;
	struct kaioinfo *ki;
	int status, error;

	ki = p->p_aioinfo;
	if (ki == NULL)
		return (EINVAL);
	AIO_LOCK(ki);
	TAILQ_FOREACH(cb, &ki->kaio_done, plist) {
		if (cb->uuaiocb == uaiocb)
			break;
	}
	if (cb != NULL) {
		MPASS(cb->jobstate == JOBST_JOBFINISHED);
		status = cb->uaiocb._aiocb_private.status;
		error = cb->uaiocb._aiocb_private.error;
		td->td_retval[0] = status;
		if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
			td->td_ru.ru_oublock += cb->outputcharge;
			cb->outputcharge = 0;
		} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
			td->td_ru.ru_inblock += cb->inputcharge;
			cb->inputcharge = 0;
		}
		aio_free_entry(cb);
		AIO_UNLOCK(ki);
		ops->store_error(uaiocb, error);
		ops->store_status(uaiocb, status);
	} else {
		error = EINVAL;
		AIO_UNLOCK(ki);
	}
	return (error);
}

int
aio_return(struct thread *td, struct aio_return_args *uap)
{

	return (kern_aio_return(td, uap->aiocbp, &aiocb_ops));
}

/*
 * Allow a process to wake up when any of the I/O requests are completed.
 */
static int
kern_aio_suspend(struct thread *td, int njoblist, struct aiocb **ujoblist,
    struct timespec *ts)
{
	struct proc *p = td->td_proc;
	struct timeval atv;
	struct kaioinfo *ki;
	struct aiocblist *cb, *cbfirst;
	int error, i, timo;

	timo = 0;
	if (ts) {
		if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000)
			return (EINVAL);

		TIMESPEC_TO_TIMEVAL(&atv, ts);
		if (itimerfix(&atv))
			return (EINVAL);
		timo = tvtohz(&atv);
	}

	ki = p->p_aioinfo;
	if (ki == NULL)
		return (EAGAIN);

	if (njoblist == 0)
		return (0);

	AIO_LOCK(ki);
	for (;;) {
		cbfirst = NULL;
		error = 0;
		TAILQ_FOREACH(cb, &ki->kaio_all, allist) {
			for (i = 0; i < njoblist; i++) {
				if (cb->uuaiocb == ujoblist[i]) {
					if (cbfirst == NULL)
						cbfirst = cb;
					if (cb->jobstate == JOBST_JOBFINISHED)
						goto RETURN;
				}
			}
		}
		/* All tasks were finished. */
		if (cbfirst == NULL)
			break;

		ki->kaio_flags |= KAIO_WAKEUP;
		error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
		    "aiospn", timo);
		if (error == ERESTART)
			error = EINTR;
		if (error)
			break;
	}
RETURN:
	AIO_UNLOCK(ki);
	return (error);
}

int
aio_suspend(struct thread *td, struct aio_suspend_args *uap)
{
	struct timespec ts, *tsp;
	struct aiocb **ujoblist;
	int error;

	if (uap->nent < 0 || uap->nent > AIO_LISTIO_MAX)
		return (EINVAL);

	if (uap->timeout) {
		/* Get timespec struct. */
		if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
			return (error);
		tsp = &ts;
	} else
		tsp = NULL;

	ujoblist = uma_zalloc(aiol_zone, M_WAITOK);
	error = copyin(uap->aiocbp, ujoblist, uap->nent * sizeof(ujoblist[0]));
	if (error == 0)
		error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
	uma_zfree(aiol_zone, ujoblist);
	return (error);
}

/*
 * aio_cancel cancels any non-physio aio operations not currently in
 * progress.
 */
int
aio_cancel(struct thread *td, struct aio_cancel_args *uap)
{
	struct proc *p = td->td_proc;
	struct kaioinfo *ki;
	struct aiocblist *cbe, *cbn;
	struct file *fp;
	struct socket *so;
	int error;
	int remove;
	int cancelled = 0;
	int notcancelled = 0;
	struct vnode *vp;

	/* Lookup file object. */
	error = fget(td, uap->fd, &fp);
	if (error)
		return (error);

	ki = p->p_aioinfo;
	if (ki == NULL)
		goto done;

	if (fp->f_type == DTYPE_VNODE) {
		vp = fp->f_vnode;
		if (vn_isdisk(vp, &error)) {
			fdrop(fp, td);
			td->td_retval[0] = AIO_NOTCANCELED;
			return (0);
		}
	}

	AIO_LOCK(ki);
	TAILQ_FOREACH_SAFE(cbe, &ki->kaio_jobqueue, plist, cbn) {
		if ((uap->fd == cbe->uaiocb.aio_fildes) &&
		    ((uap->aiocbp == NULL) ||
		     (uap->aiocbp == cbe->uuaiocb))) {
			remove = 0;

			mtx_lock(&aio_job_mtx);
			if (cbe->jobstate == JOBST_JOBQGLOBAL) {
				TAILQ_REMOVE(&aio_jobs, cbe, list);
				remove = 1;
			} else if (cbe->jobstate == JOBST_JOBQSOCK) {
				MPASS(fp->f_type == DTYPE_SOCKET);
				so = fp->f_data;
				TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
				remove = 1;
			} else if (cbe->jobstate == JOBST_JOBQSYNC) {
				TAILQ_REMOVE(&ki->kaio_syncqueue, cbe, list);
				remove = 1;
			}
			mtx_unlock(&aio_job_mtx);

			if (remove) {
				TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
				cbe->uaiocb._aiocb_private.status = -1;
				cbe->uaiocb._aiocb_private.error = ECANCELED;
				aio_bio_done_notify(p, cbe, DONE_QUEUE);
				cancelled++;
			} else {
				notcancelled++;
			}
			if (uap->aiocbp != NULL)
				break;
		}
	}
	AIO_UNLOCK(ki);

done:
	fdrop(fp, td);

	if (uap->aiocbp != NULL) {
		if (cancelled) {
			td->td_retval[0] = AIO_CANCELED;
			return (0);
		}
	}

	if (notcancelled) {
		td->td_retval[0] = AIO_NOTCANCELED;
		return (0);
	}

	if (cancelled) {
		td->td_retval[0] = AIO_CANCELED;
		return (0);
	}

	td->td_retval[0] = AIO_ALLDONE;

	return (0);
}

/*
 * aio_error is implemented at the kernel level for compatibility purposes
 * only.  For a user mode async implementation, it would be best to do it in
 * a userland subroutine.
2053 */ 2054 static int 2055 kern_aio_error(struct thread *td, struct aiocb *aiocbp, struct aiocb_ops *ops) 2056 { 2057 struct proc *p = td->td_proc; 2058 struct aiocblist *cb; 2059 struct kaioinfo *ki; 2060 int status; 2061 2062 ki = p->p_aioinfo; 2063 if (ki == NULL) { 2064 td->td_retval[0] = EINVAL; 2065 return (0); 2066 } 2067 2068 AIO_LOCK(ki); 2069 TAILQ_FOREACH(cb, &ki->kaio_all, allist) { 2070 if (cb->uuaiocb == aiocbp) { 2071 if (cb->jobstate == JOBST_JOBFINISHED) 2072 td->td_retval[0] = 2073 cb->uaiocb._aiocb_private.error; 2074 else 2075 td->td_retval[0] = EINPROGRESS; 2076 AIO_UNLOCK(ki); 2077 return (0); 2078 } 2079 } 2080 AIO_UNLOCK(ki); 2081 2082 /* 2083 * Hack for failure of aio_aqueue. 2084 */ 2085 status = ops->fetch_status(aiocbp); 2086 if (status == -1) { 2087 td->td_retval[0] = ops->fetch_error(aiocbp); 2088 return (0); 2089 } 2090 2091 td->td_retval[0] = EINVAL; 2092 return (0); 2093 } 2094 2095 int 2096 aio_error(struct thread *td, struct aio_error_args *uap) 2097 { 2098 2099 return (kern_aio_error(td, uap->aiocbp, &aiocb_ops)); 2100 } 2101 2102 /* syscall - asynchronous read from a file (REALTIME) */ 2103 int 2104 oaio_read(struct thread *td, struct oaio_read_args *uap) 2105 { 2106 2107 return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, 2108 &aiocb_ops_osigevent)); 2109 } 2110 2111 int 2112 aio_read(struct thread *td, struct aio_read_args *uap) 2113 { 2114 2115 return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READ, &aiocb_ops)); 2116 } 2117 2118 /* syscall - asynchronous write to a file (REALTIME) */ 2119 int 2120 oaio_write(struct thread *td, struct oaio_write_args *uap) 2121 { 2122 2123 return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, 2124 &aiocb_ops_osigevent)); 2125 } 2126 2127 int 2128 aio_write(struct thread *td, struct aio_write_args *uap) 2129 { 2130 2131 return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops)); 2132 } 2133 2134 static int 2135 kern_lio_listio(struct thread *td, int mode, struct aiocb * const *uacb_list, 2136 struct aiocb **acb_list, int nent, struct sigevent *sig, 2137 struct aiocb_ops *ops) 2138 { 2139 struct proc *p = td->td_proc; 2140 struct aiocb *iocb; 2141 struct kaioinfo *ki; 2142 struct aioliojob *lj; 2143 struct kevent kev; 2144 int error; 2145 int nerror; 2146 int i; 2147 2148 if ((mode != LIO_NOWAIT) && (mode != LIO_WAIT)) 2149 return (EINVAL); 2150 2151 if (nent < 0 || nent > AIO_LISTIO_MAX) 2152 return (EINVAL); 2153 2154 if (p->p_aioinfo == NULL) 2155 aio_init_aioinfo(p); 2156 2157 ki = p->p_aioinfo; 2158 2159 lj = uma_zalloc(aiolio_zone, M_WAITOK); 2160 lj->lioj_flags = 0; 2161 lj->lioj_count = 0; 2162 lj->lioj_finished_count = 0; 2163 knlist_init_mtx(&lj->klist, AIO_MTX(ki)); 2164 ksiginfo_init(&lj->lioj_ksi); 2165 2166 /* 2167 * Setup signal. 
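 * For LIO_NOWAIT with a sigevent: SIGEV_KEVENT registers an EVFILT_LIO
 * knote on the caller's kqueue (kev.ident is the user list pointer,
 * kev.udata carries sigev_value), SIGEV_NONE needs no setup, and
 * SIGEV_SIGNAL / SIGEV_THREAD_ID validate sigev_signo and mark the job
 * LIOJ_SIGNAL; any other notify type is rejected with EINVAL.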
2168 */ 2169 if (sig && (mode == LIO_NOWAIT)) { 2170 bcopy(sig, &lj->lioj_signal, sizeof(lj->lioj_signal)); 2171 if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { 2172 /* Assume only new style KEVENT */ 2173 kev.filter = EVFILT_LIO; 2174 kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1; 2175 kev.ident = (uintptr_t)uacb_list; /* something unique */ 2176 kev.data = (intptr_t)lj; 2177 /* pass user defined sigval data */ 2178 kev.udata = lj->lioj_signal.sigev_value.sival_ptr; 2179 error = kqfd_register( 2180 lj->lioj_signal.sigev_notify_kqueue, &kev, td, 1); 2181 if (error) { 2182 uma_zfree(aiolio_zone, lj); 2183 return (error); 2184 } 2185 } else if (lj->lioj_signal.sigev_notify == SIGEV_NONE) { 2186 ; 2187 } else if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || 2188 lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID) { 2189 if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) { 2190 uma_zfree(aiolio_zone, lj); 2191 return EINVAL; 2192 } 2193 lj->lioj_flags |= LIOJ_SIGNAL; 2194 } else { 2195 uma_zfree(aiolio_zone, lj); 2196 return EINVAL; 2197 } 2198 } 2199 2200 AIO_LOCK(ki); 2201 TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list); 2202 /* 2203 * Add extra aiocb count to avoid the lio to be freed 2204 * by other threads doing aio_waitcomplete or aio_return, 2205 * and prevent event from being sent until we have queued 2206 * all tasks. 2207 */ 2208 lj->lioj_count = 1; 2209 AIO_UNLOCK(ki); 2210 2211 /* 2212 * Get pointers to the list of I/O requests. 2213 */ 2214 nerror = 0; 2215 for (i = 0; i < nent; i++) { 2216 iocb = acb_list[i]; 2217 if (iocb != NULL) { 2218 error = aio_aqueue(td, iocb, lj, LIO_NOP, ops); 2219 if (error != 0) 2220 nerror++; 2221 } 2222 } 2223 2224 error = 0; 2225 AIO_LOCK(ki); 2226 if (mode == LIO_WAIT) { 2227 while (lj->lioj_count - 1 != lj->lioj_finished_count) { 2228 ki->kaio_flags |= KAIO_WAKEUP; 2229 error = msleep(&p->p_aioinfo, AIO_MTX(ki), 2230 PRIBIO | PCATCH, "aiospn", 0); 2231 if (error == ERESTART) 2232 error = EINTR; 2233 if (error) 2234 break; 2235 } 2236 } else { 2237 if (lj->lioj_count - 1 == lj->lioj_finished_count) { 2238 if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { 2239 lj->lioj_flags |= LIOJ_KEVENT_POSTED; 2240 KNOTE_LOCKED(&lj->klist, 1); 2241 } 2242 if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) 2243 == LIOJ_SIGNAL 2244 && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || 2245 lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) { 2246 aio_sendsig(p, &lj->lioj_signal, 2247 &lj->lioj_ksi); 2248 lj->lioj_flags |= LIOJ_SIGNAL_POSTED; 2249 } 2250 } 2251 } 2252 lj->lioj_count--; 2253 if (lj->lioj_count == 0) { 2254 TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); 2255 knlist_delete(&lj->klist, curthread, 1); 2256 PROC_LOCK(p); 2257 sigqueue_take(&lj->lioj_ksi); 2258 PROC_UNLOCK(p); 2259 AIO_UNLOCK(ki); 2260 uma_zfree(aiolio_zone, lj); 2261 } else 2262 AIO_UNLOCK(ki); 2263 2264 if (nerror) 2265 return (EIO); 2266 return (error); 2267 } 2268 2269 /* syscall - list directed I/O (REALTIME) */ 2270 int 2271 olio_listio(struct thread *td, struct olio_listio_args *uap) 2272 { 2273 struct aiocb **acb_list; 2274 struct sigevent *sigp, sig; 2275 struct osigevent osig; 2276 int error, nent; 2277 2278 if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) 2279 return (EINVAL); 2280 2281 nent = uap->nent; 2282 if (nent < 0 || nent > AIO_LISTIO_MAX) 2283 return (EINVAL); 2284 2285 if (uap->sig && (uap->mode == LIO_NOWAIT)) { 2286 error = copyin(uap->sig, &osig, sizeof(osig)); 2287 if (error) 2288 return (error); 2289 error = convert_old_sigevent(&osig, &sig); 
2290 if (error) 2291 return (error); 2292 sigp = &sig; 2293 } else 2294 sigp = NULL; 2295 2296 acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); 2297 error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0])); 2298 if (error == 0) 2299 error = kern_lio_listio(td, uap->mode, 2300 (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, 2301 &aiocb_ops_osigevent); 2302 free(acb_list, M_LIO); 2303 return (error); 2304 } 2305 2306 /* syscall - list directed I/O (REALTIME) */ 2307 int 2308 lio_listio(struct thread *td, struct lio_listio_args *uap) 2309 { 2310 struct aiocb **acb_list; 2311 struct sigevent *sigp, sig; 2312 int error, nent; 2313 2314 if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) 2315 return (EINVAL); 2316 2317 nent = uap->nent; 2318 if (nent < 0 || nent > AIO_LISTIO_MAX) 2319 return (EINVAL); 2320 2321 if (uap->sig && (uap->mode == LIO_NOWAIT)) { 2322 error = copyin(uap->sig, &sig, sizeof(sig)); 2323 if (error) 2324 return (error); 2325 sigp = &sig; 2326 } else 2327 sigp = NULL; 2328 2329 acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); 2330 error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0])); 2331 if (error == 0) 2332 error = kern_lio_listio(td, uap->mode, uap->acb_list, acb_list, 2333 nent, sigp, &aiocb_ops); 2334 free(acb_list, M_LIO); 2335 return (error); 2336 } 2337 2338 /* 2339 * Called from interrupt thread for physio, we should return as fast 2340 * as possible, so we schedule a biohelper task. 2341 */ 2342 static void 2343 aio_physwakeup(struct buf *bp) 2344 { 2345 struct aiocblist *aiocbe; 2346 2347 aiocbe = (struct aiocblist *)bp->b_caller1; 2348 taskqueue_enqueue(taskqueue_aiod_bio, &aiocbe->biotask); 2349 } 2350 2351 /* 2352 * Task routine to perform heavy tasks, process wakeup, and signals. 2353 */ 2354 static void 2355 biohelper(void *context, int pending) 2356 { 2357 struct aiocblist *aiocbe = context; 2358 struct buf *bp; 2359 struct proc *userp; 2360 struct kaioinfo *ki; 2361 int nblks; 2362 2363 bp = aiocbe->bp; 2364 userp = aiocbe->userproc; 2365 ki = userp->p_aioinfo; 2366 AIO_LOCK(ki); 2367 aiocbe->uaiocb._aiocb_private.status -= bp->b_resid; 2368 aiocbe->uaiocb._aiocb_private.error = 0; 2369 if (bp->b_ioflags & BIO_ERROR) 2370 aiocbe->uaiocb._aiocb_private.error = bp->b_error; 2371 nblks = btodb(aiocbe->uaiocb.aio_nbytes); 2372 if (aiocbe->uaiocb.aio_lio_opcode == LIO_WRITE) 2373 aiocbe->outputcharge += nblks; 2374 else 2375 aiocbe->inputcharge += nblks; 2376 aiocbe->bp = NULL; 2377 TAILQ_REMOVE(&userp->p_aioinfo->kaio_bufqueue, aiocbe, plist); 2378 ki->kaio_buffer_count--; 2379 aio_bio_done_notify(userp, aiocbe, DONE_BUF); 2380 AIO_UNLOCK(ki); 2381 2382 /* Release mapping into kernel space. 
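 * The user buffer was mapped into kernel space for the physio transfer;
 * vunmapbuf() removes that mapping, relpbuf() returns the pbuf to its
 * pool, and num_buf_aio (requests currently in the buf subsystem) is
 * decremented.  None of this needs the per-process AIO lock, which is why
 * it is done after AIO_UNLOCK().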
*/ 2383 vunmapbuf(bp); 2384 relpbuf(bp, NULL); 2385 atomic_subtract_int(&num_buf_aio, 1); 2386 } 2387 2388 /* syscall - wait for the next completion of an aio request */ 2389 static int 2390 kern_aio_waitcomplete(struct thread *td, struct aiocb **aiocbp, 2391 struct timespec *ts, struct aiocb_ops *ops) 2392 { 2393 struct proc *p = td->td_proc; 2394 struct timeval atv; 2395 struct kaioinfo *ki; 2396 struct aiocblist *cb; 2397 struct aiocb *uuaiocb; 2398 int error, status, timo; 2399 2400 ops->store_aiocb(aiocbp, NULL); 2401 2402 timo = 0; 2403 if (ts) { 2404 if ((ts->tv_nsec < 0) || (ts->tv_nsec >= 1000000000)) 2405 return (EINVAL); 2406 2407 TIMESPEC_TO_TIMEVAL(&atv, ts); 2408 if (itimerfix(&atv)) 2409 return (EINVAL); 2410 timo = tvtohz(&atv); 2411 } 2412 2413 if (p->p_aioinfo == NULL) 2414 aio_init_aioinfo(p); 2415 ki = p->p_aioinfo; 2416 2417 error = 0; 2418 cb = NULL; 2419 AIO_LOCK(ki); 2420 while ((cb = TAILQ_FIRST(&ki->kaio_done)) == NULL) { 2421 ki->kaio_flags |= KAIO_WAKEUP; 2422 error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH, 2423 "aiowc", timo); 2424 if (timo && error == ERESTART) 2425 error = EINTR; 2426 if (error) 2427 break; 2428 } 2429 2430 if (cb != NULL) { 2431 MPASS(cb->jobstate == JOBST_JOBFINISHED); 2432 uuaiocb = cb->uuaiocb; 2433 status = cb->uaiocb._aiocb_private.status; 2434 error = cb->uaiocb._aiocb_private.error; 2435 td->td_retval[0] = status; 2436 if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) { 2437 td->td_ru.ru_oublock += cb->outputcharge; 2438 cb->outputcharge = 0; 2439 } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) { 2440 td->td_ru.ru_inblock += cb->inputcharge; 2441 cb->inputcharge = 0; 2442 } 2443 aio_free_entry(cb); 2444 AIO_UNLOCK(ki); 2445 ops->store_aiocb(aiocbp, uuaiocb); 2446 ops->store_error(uuaiocb, error); 2447 ops->store_status(uuaiocb, status); 2448 } else 2449 AIO_UNLOCK(ki); 2450 2451 return (error); 2452 } 2453 2454 int 2455 aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap) 2456 { 2457 struct timespec ts, *tsp; 2458 int error; 2459 2460 if (uap->timeout) { 2461 /* Get timespec struct. */ 2462 error = copyin(uap->timeout, &ts, sizeof(ts)); 2463 if (error) 2464 return (error); 2465 tsp = &ts; 2466 } else 2467 tsp = NULL; 2468 2469 return (kern_aio_waitcomplete(td, uap->aiocbp, tsp, &aiocb_ops)); 2470 } 2471 2472 static int 2473 kern_aio_fsync(struct thread *td, int op, struct aiocb *aiocbp, 2474 struct aiocb_ops *ops) 2475 { 2476 struct proc *p = td->td_proc; 2477 struct kaioinfo *ki; 2478 2479 if (op != O_SYNC) /* XXX lack of O_DSYNC */ 2480 return (EINVAL); 2481 ki = p->p_aioinfo; 2482 if (ki == NULL) 2483 aio_init_aioinfo(p); 2484 return (aio_aqueue(td, aiocbp, NULL, LIO_SYNC, ops)); 2485 } 2486 2487 int 2488 aio_fsync(struct thread *td, struct aio_fsync_args *uap) 2489 { 2490 2491 return (kern_aio_fsync(td, uap->op, uap->aiocbp, &aiocb_ops)); 2492 } 2493 2494 /* kqueue attach function */ 2495 static int 2496 filt_aioattach(struct knote *kn) 2497 { 2498 struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata; 2499 2500 /* 2501 * The aiocbe pointer must be validated before using it, so 2502 * registration is restricted to the kernel; the user cannot 2503 * set EV_FLAG1. 
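 * The knote is registered by the kernel itself while queueing an aiocb
 * that asked for SIGEV_KEVENT notification.  A hedged userland sketch of
 * that path (kq, fd, buf and len are hypothetical, not part of this
 * file):
 *
 *        int kq = kqueue();
 *        struct aiocb acb = { .aio_fildes = fd, .aio_buf = buf,
 *            .aio_nbytes = len };
 *        acb.aio_sigevent.sigev_notify = SIGEV_KEVENT;
 *        acb.aio_sigevent.sigev_notify_kqueue = kq;
 *        acb.aio_sigevent.sigev_value.sival_ptr = &acb;
 *        aio_read(&acb);
 *
 *        struct kevent ev;
 *        kevent(kq, NULL, 0, &ev, 1, NULL);
 *
 * On completion the returned event has ev.filter == EVFILT_AIO and
 * ev.udata carrying the sigev_value supplied above.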
2504 */ 2505 if ((kn->kn_flags & EV_FLAG1) == 0) 2506 return (EPERM); 2507 kn->kn_ptr.p_aio = aiocbe; 2508 kn->kn_flags &= ~EV_FLAG1; 2509 2510 knlist_add(&aiocbe->klist, kn, 0); 2511 2512 return (0); 2513 } 2514 2515 /* kqueue detach function */ 2516 static void 2517 filt_aiodetach(struct knote *kn) 2518 { 2519 struct aiocblist *aiocbe = kn->kn_ptr.p_aio; 2520 2521 if (!knlist_empty(&aiocbe->klist)) 2522 knlist_remove(&aiocbe->klist, kn, 0); 2523 } 2524 2525 /* kqueue filter function */ 2526 /*ARGSUSED*/ 2527 static int 2528 filt_aio(struct knote *kn, long hint) 2529 { 2530 struct aiocblist *aiocbe = kn->kn_ptr.p_aio; 2531 2532 kn->kn_data = aiocbe->uaiocb._aiocb_private.error; 2533 if (aiocbe->jobstate != JOBST_JOBFINISHED) 2534 return (0); 2535 kn->kn_flags |= EV_EOF; 2536 return (1); 2537 } 2538 2539 /* kqueue attach function */ 2540 static int 2541 filt_lioattach(struct knote *kn) 2542 { 2543 struct aioliojob * lj = (struct aioliojob *)kn->kn_sdata; 2544 2545 /* 2546 * The aioliojob pointer must be validated before using it, so 2547 * registration is restricted to the kernel; the user cannot 2548 * set EV_FLAG1. 2549 */ 2550 if ((kn->kn_flags & EV_FLAG1) == 0) 2551 return (EPERM); 2552 kn->kn_ptr.p_lio = lj; 2553 kn->kn_flags &= ~EV_FLAG1; 2554 2555 knlist_add(&lj->klist, kn, 0); 2556 2557 return (0); 2558 } 2559 2560 /* kqueue detach function */ 2561 static void 2562 filt_liodetach(struct knote *kn) 2563 { 2564 struct aioliojob * lj = kn->kn_ptr.p_lio; 2565 2566 if (!knlist_empty(&lj->klist)) 2567 knlist_remove(&lj->klist, kn, 0); 2568 } 2569 2570 /* kqueue filter function */ 2571 /*ARGSUSED*/ 2572 static int 2573 filt_lio(struct knote *kn, long hint) 2574 { 2575 struct aioliojob * lj = kn->kn_ptr.p_lio; 2576 2577 return (lj->lioj_flags & LIOJ_KEVENT_POSTED); 2578 } 2579 2580 #ifdef COMPAT_FREEBSD32 2581 2582 struct __aiocb_private32 { 2583 int32_t status; 2584 int32_t error; 2585 uint32_t kernelinfo; 2586 }; 2587 2588 typedef struct oaiocb32 { 2589 int aio_fildes; /* File descriptor */ 2590 uint64_t aio_offset __packed; /* File offset for I/O */ 2591 uint32_t aio_buf; /* I/O buffer in process space */ 2592 uint32_t aio_nbytes; /* Number of bytes for I/O */ 2593 struct osigevent32 aio_sigevent; /* Signal to deliver */ 2594 int aio_lio_opcode; /* LIO opcode */ 2595 int aio_reqprio; /* Request priority -- ignored */ 2596 struct __aiocb_private32 _aiocb_private; 2597 } oaiocb32_t; 2598 2599 typedef struct aiocb32 { 2600 int32_t aio_fildes; /* File descriptor */ 2601 uint64_t aio_offset __packed; /* File offset for I/O */ 2602 uint32_t aio_buf; /* I/O buffer in process space */ 2603 uint32_t aio_nbytes; /* Number of bytes for I/O */ 2604 int __spare__[2]; 2605 uint32_t __spare2__; 2606 int aio_lio_opcode; /* LIO opcode */ 2607 int aio_reqprio; /* Request priority -- ignored */ 2608 struct __aiocb_private32 _aiocb_private; 2609 struct sigevent32 aio_sigevent; /* Signal to deliver */ 2610 } aiocb32_t; 2611 2612 static int 2613 convert_old_sigevent32(struct osigevent32 *osig, struct sigevent *nsig) 2614 { 2615 2616 /* 2617 * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are 2618 * supported by AIO with the old sigevent structure. 
2619 */ 2620 CP(*osig, *nsig, sigev_notify); 2621 switch (nsig->sigev_notify) { 2622 case SIGEV_NONE: 2623 break; 2624 case SIGEV_SIGNAL: 2625 nsig->sigev_signo = osig->__sigev_u.__sigev_signo; 2626 break; 2627 case SIGEV_KEVENT: 2628 nsig->sigev_notify_kqueue = 2629 osig->__sigev_u.__sigev_notify_kqueue; 2630 PTRIN_CP(*osig, *nsig, sigev_value.sival_ptr); 2631 break; 2632 default: 2633 return (EINVAL); 2634 } 2635 return (0); 2636 } 2637 2638 static int 2639 aiocb32_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob) 2640 { 2641 struct oaiocb32 job32; 2642 int error; 2643 2644 bzero(kjob, sizeof(struct aiocb)); 2645 error = copyin(ujob, &job32, sizeof(job32)); 2646 if (error) 2647 return (error); 2648 2649 CP(job32, *kjob, aio_fildes); 2650 CP(job32, *kjob, aio_offset); 2651 PTRIN_CP(job32, *kjob, aio_buf); 2652 CP(job32, *kjob, aio_nbytes); 2653 CP(job32, *kjob, aio_lio_opcode); 2654 CP(job32, *kjob, aio_reqprio); 2655 CP(job32, *kjob, _aiocb_private.status); 2656 CP(job32, *kjob, _aiocb_private.error); 2657 PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo); 2658 return (convert_old_sigevent32(&job32.aio_sigevent, 2659 &kjob->aio_sigevent)); 2660 } 2661 2662 static int 2663 convert_sigevent32(struct sigevent32 *sig32, struct sigevent *sig) 2664 { 2665 2666 CP(*sig32, *sig, sigev_notify); 2667 switch (sig->sigev_notify) { 2668 case SIGEV_NONE: 2669 break; 2670 case SIGEV_THREAD_ID: 2671 CP(*sig32, *sig, sigev_notify_thread_id); 2672 /* FALLTHROUGH */ 2673 case SIGEV_SIGNAL: 2674 CP(*sig32, *sig, sigev_signo); 2675 break; 2676 case SIGEV_KEVENT: 2677 CP(*sig32, *sig, sigev_notify_kqueue); 2678 PTRIN_CP(*sig32, *sig, sigev_value.sival_ptr); 2679 break; 2680 default: 2681 return (EINVAL); 2682 } 2683 return (0); 2684 } 2685 2686 static int 2687 aiocb32_copyin(struct aiocb *ujob, struct aiocb *kjob) 2688 { 2689 struct aiocb32 job32; 2690 int error; 2691 2692 error = copyin(ujob, &job32, sizeof(job32)); 2693 if (error) 2694 return (error); 2695 CP(job32, *kjob, aio_fildes); 2696 CP(job32, *kjob, aio_offset); 2697 PTRIN_CP(job32, *kjob, aio_buf); 2698 CP(job32, *kjob, aio_nbytes); 2699 CP(job32, *kjob, aio_lio_opcode); 2700 CP(job32, *kjob, aio_reqprio); 2701 CP(job32, *kjob, _aiocb_private.status); 2702 CP(job32, *kjob, _aiocb_private.error); 2703 PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo); 2704 return (convert_sigevent32(&job32.aio_sigevent, &kjob->aio_sigevent)); 2705 } 2706 2707 static long 2708 aiocb32_fetch_status(struct aiocb *ujob) 2709 { 2710 struct aiocb32 *ujob32; 2711 2712 ujob32 = (struct aiocb32 *)ujob; 2713 return (fuword32(&ujob32->_aiocb_private.status)); 2714 } 2715 2716 static long 2717 aiocb32_fetch_error(struct aiocb *ujob) 2718 { 2719 struct aiocb32 *ujob32; 2720 2721 ujob32 = (struct aiocb32 *)ujob; 2722 return (fuword32(&ujob32->_aiocb_private.error)); 2723 } 2724 2725 static int 2726 aiocb32_store_status(struct aiocb *ujob, long status) 2727 { 2728 struct aiocb32 *ujob32; 2729 2730 ujob32 = (struct aiocb32 *)ujob; 2731 return (suword32(&ujob32->_aiocb_private.status, status)); 2732 } 2733 2734 static int 2735 aiocb32_store_error(struct aiocb *ujob, long error) 2736 { 2737 struct aiocb32 *ujob32; 2738 2739 ujob32 = (struct aiocb32 *)ujob; 2740 return (suword32(&ujob32->_aiocb_private.error, error)); 2741 } 2742 2743 static int 2744 aiocb32_store_kernelinfo(struct aiocb *ujob, long jobref) 2745 { 2746 struct aiocb32 *ujob32; 2747 2748 ujob32 = (struct aiocb32 *)ujob; 2749 return (suword32(&ujob32->_aiocb_private.kernelinfo, jobref)); 2750 } 2751 2752 static 
int
aiocb32_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
{

        return (suword32(ujobp, (long)ujob));
}

static struct aiocb_ops aiocb32_ops = {
        .copyin = aiocb32_copyin,
        .fetch_status = aiocb32_fetch_status,
        .fetch_error = aiocb32_fetch_error,
        .store_status = aiocb32_store_status,
        .store_error = aiocb32_store_error,
        .store_kernelinfo = aiocb32_store_kernelinfo,
        .store_aiocb = aiocb32_store_aiocb,
};

static struct aiocb_ops aiocb32_ops_osigevent = {
        .copyin = aiocb32_copyin_old_sigevent,
        .fetch_status = aiocb32_fetch_status,
        .fetch_error = aiocb32_fetch_error,
        .store_status = aiocb32_store_status,
        .store_error = aiocb32_store_error,
        .store_kernelinfo = aiocb32_store_kernelinfo,
        .store_aiocb = aiocb32_store_aiocb,
};

int
freebsd32_aio_return(struct thread *td, struct freebsd32_aio_return_args *uap)
{

        return (kern_aio_return(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops));
}

int
freebsd32_aio_suspend(struct thread *td, struct freebsd32_aio_suspend_args *uap)
{
        struct timespec32 ts32;
        struct timespec ts, *tsp;
        struct aiocb **ujoblist;
        uint32_t *ujoblist32;
        int error, i;

        if (uap->nent < 0 || uap->nent > AIO_LISTIO_MAX)
                return (EINVAL);

        if (uap->timeout) {
                /* Get timespec struct. */
                if ((error = copyin(uap->timeout, &ts32, sizeof(ts32))) != 0)
                        return (error);
                CP(ts32, ts, tv_sec);
                CP(ts32, ts, tv_nsec);
                tsp = &ts;
        } else
                tsp = NULL;

        ujoblist = uma_zalloc(aiol_zone, M_WAITOK);
        ujoblist32 = (uint32_t *)ujoblist;
        error = copyin(uap->aiocbp, ujoblist32, uap->nent *
            sizeof(ujoblist32[0]));
        if (error == 0) {
                /*
                 * Expand the packed 32-bit pointers in place, from the
                 * last entry down to index 0 so that no 32-bit entry is
                 * overwritten before it has been read.
                 */
                for (i = uap->nent - 1; i >= 0; i--)
                        ujoblist[i] = PTRIN(ujoblist32[i]);

                error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
        }
        uma_zfree(aiol_zone, ujoblist);
        return (error);
}

int
freebsd32_aio_cancel(struct thread *td, struct freebsd32_aio_cancel_args *uap)
{

        return (aio_cancel(td, (struct aio_cancel_args *)uap));
}

int
freebsd32_aio_error(struct thread *td, struct freebsd32_aio_error_args *uap)
{

        return (kern_aio_error(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops));
}

int
freebsd32_oaio_read(struct thread *td, struct freebsd32_oaio_read_args *uap)
{

        return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
            &aiocb32_ops_osigevent));
}

int
freebsd32_aio_read(struct thread *td, struct freebsd32_aio_read_args *uap)
{

        return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
            &aiocb32_ops));
}

int
freebsd32_oaio_write(struct thread *td, struct freebsd32_oaio_write_args *uap)
{

        return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
            &aiocb32_ops_osigevent));
}

int
freebsd32_aio_write(struct thread *td, struct freebsd32_aio_write_args *uap)
{

        return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
            &aiocb32_ops));
}

int
freebsd32_aio_waitcomplete(struct thread *td,
    struct freebsd32_aio_waitcomplete_args *uap)
{
        struct timespec32 ts32;
        struct timespec ts, *tsp;
        int error;

        if (uap->timeout) {
                /* Get timespec struct.
*/ 2878 error = copyin(uap->timeout, &ts32, sizeof(ts32)); 2879 if (error) 2880 return (error); 2881 CP(ts32, ts, tv_sec); 2882 CP(ts32, ts, tv_nsec); 2883 tsp = &ts; 2884 } else 2885 tsp = NULL; 2886 2887 return (kern_aio_waitcomplete(td, (struct aiocb **)uap->aiocbp, tsp, 2888 &aiocb32_ops)); 2889 } 2890 2891 int 2892 freebsd32_aio_fsync(struct thread *td, struct freebsd32_aio_fsync_args *uap) 2893 { 2894 2895 return (kern_aio_fsync(td, uap->op, (struct aiocb *)uap->aiocbp, 2896 &aiocb32_ops)); 2897 } 2898 2899 int 2900 freebsd32_olio_listio(struct thread *td, struct freebsd32_olio_listio_args *uap) 2901 { 2902 struct aiocb **acb_list; 2903 struct sigevent *sigp, sig; 2904 struct osigevent32 osig; 2905 uint32_t *acb_list32; 2906 int error, i, nent; 2907 2908 if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) 2909 return (EINVAL); 2910 2911 nent = uap->nent; 2912 if (nent < 0 || nent > AIO_LISTIO_MAX) 2913 return (EINVAL); 2914 2915 if (uap->sig && (uap->mode == LIO_NOWAIT)) { 2916 error = copyin(uap->sig, &osig, sizeof(osig)); 2917 if (error) 2918 return (error); 2919 error = convert_old_sigevent32(&osig, &sig); 2920 if (error) 2921 return (error); 2922 sigp = &sig; 2923 } else 2924 sigp = NULL; 2925 2926 acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK); 2927 error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t)); 2928 if (error) { 2929 free(acb_list32, M_LIO); 2930 return (error); 2931 } 2932 acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); 2933 for (i = 0; i < nent; i++) 2934 acb_list[i] = PTRIN(acb_list32[i]); 2935 free(acb_list32, M_LIO); 2936 2937 error = kern_lio_listio(td, uap->mode, 2938 (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, 2939 &aiocb32_ops_osigevent); 2940 free(acb_list, M_LIO); 2941 return (error); 2942 } 2943 2944 int 2945 freebsd32_lio_listio(struct thread *td, struct freebsd32_lio_listio_args *uap) 2946 { 2947 struct aiocb **acb_list; 2948 struct sigevent *sigp, sig; 2949 struct sigevent32 sig32; 2950 uint32_t *acb_list32; 2951 int error, i, nent; 2952 2953 if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) 2954 return (EINVAL); 2955 2956 nent = uap->nent; 2957 if (nent < 0 || nent > AIO_LISTIO_MAX) 2958 return (EINVAL); 2959 2960 if (uap->sig && (uap->mode == LIO_NOWAIT)) { 2961 error = copyin(uap->sig, &sig32, sizeof(sig32)); 2962 if (error) 2963 return (error); 2964 error = convert_sigevent32(&sig32, &sig); 2965 if (error) 2966 return (error); 2967 sigp = &sig; 2968 } else 2969 sigp = NULL; 2970 2971 acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK); 2972 error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t)); 2973 if (error) { 2974 free(acb_list32, M_LIO); 2975 return (error); 2976 } 2977 acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); 2978 for (i = 0; i < nent; i++) 2979 acb_list[i] = PTRIN(acb_list32[i]); 2980 free(acb_list32, M_LIO); 2981 2982 error = kern_lio_listio(td, uap->mode, 2983 (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, 2984 &aiocb32_ops); 2985 free(acb_list, M_LIO); 2986 return (error); 2987 } 2988 2989 #endif 2990
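/*
 * Illustrative userland sketch of the list-I/O path implemented above
 * (fd, b0 and b1 are hypothetical, not part of this file): two reads are
 * submitted in a single lio_listio(2) call, the thread blocks until both
 * complete (LIO_WAIT ignores the sigevent argument, hence NULL), and each
 * aiocb is then reaped individually.
 *
 *        struct aiocb a0 = { .aio_fildes = fd, .aio_buf = b0,
 *            .aio_nbytes = sizeof(b0), .aio_offset = 0,
 *            .aio_lio_opcode = LIO_READ };
 *        struct aiocb a1 = { .aio_fildes = fd, .aio_buf = b1,
 *            .aio_nbytes = sizeof(b1), .aio_offset = sizeof(b0),
 *            .aio_lio_opcode = LIO_READ };
 *        struct aiocb *list[2] = { &a0, &a1 };
 *
 *        if (lio_listio(LIO_WAIT, list, 2, NULL) == 0) {
 *                ssize_t r0 = aio_return(&a0);
 *                ssize_t r1 = aio_return(&a1);
 *        }
 */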