1 /*- 2 * Copyright (c) 1997 John S. Dyson. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. John S. Dyson's name may not be used to endorse or promote products 10 * derived from this software without specific prior written permission. 11 * 12 * DISCLAIMER: This code isn't warranted to do anything useful. Anything 13 * bad that happens because of using this software isn't the responsibility 14 * of the author. This software is distributed AS-IS. 15 */ 16 17 /* 18 * This file contains support for the POSIX 1003.1B AIO/LIO facility. 19 */ 20 21 #include <sys/cdefs.h> 22 __FBSDID("$FreeBSD$"); 23 24 #include "opt_compat.h" 25 26 #include <sys/param.h> 27 #include <sys/systm.h> 28 #include <sys/malloc.h> 29 #include <sys/bio.h> 30 #include <sys/buf.h> 31 #include <sys/eventhandler.h> 32 #include <sys/sysproto.h> 33 #include <sys/filedesc.h> 34 #include <sys/kernel.h> 35 #include <sys/module.h> 36 #include <sys/kthread.h> 37 #include <sys/fcntl.h> 38 #include <sys/file.h> 39 #include <sys/limits.h> 40 #include <sys/lock.h> 41 #include <sys/mutex.h> 42 #include <sys/unistd.h> 43 #include <sys/posix4.h> 44 #include <sys/proc.h> 45 #include <sys/resourcevar.h> 46 #include <sys/signalvar.h> 47 #include <sys/protosw.h> 48 #include <sys/sema.h> 49 #include <sys/socket.h> 50 #include <sys/socketvar.h> 51 #include <sys/syscall.h> 52 #include <sys/sysent.h> 53 #include <sys/sysctl.h> 54 #include <sys/sx.h> 55 #include <sys/taskqueue.h> 56 #include <sys/vnode.h> 57 #include <sys/conf.h> 58 #include <sys/event.h> 59 #include <sys/mount.h> 60 61 #include <machine/atomic.h> 62 63 #include <vm/vm.h> 64 #include <vm/vm_extern.h> 65 #include <vm/pmap.h> 66 #include <vm/vm_map.h> 67 #include <vm/vm_object.h> 68 #include <vm/uma.h> 69 #include <sys/aio.h> 70 71 #include "opt_vfs_aio.h" 72 73 /* 74 * Counter for allocating reference ids to new jobs. Wrapped to 1 on 75 * overflow. (XXX will be removed soon.) 76 */ 77 static u_long jobrefid; 78 79 /* 80 * Counter for aio_fsync. 
81 */ 82 static uint64_t jobseqno; 83 84 #define JOBST_NULL 0 85 #define JOBST_JOBQSOCK 1 86 #define JOBST_JOBQGLOBAL 2 87 #define JOBST_JOBRUNNING 3 88 #define JOBST_JOBFINISHED 4 89 #define JOBST_JOBQBUF 5 90 #define JOBST_JOBQSYNC 6 91 92 #ifndef MAX_AIO_PER_PROC 93 #define MAX_AIO_PER_PROC 32 94 #endif 95 96 #ifndef MAX_AIO_QUEUE_PER_PROC 97 #define MAX_AIO_QUEUE_PER_PROC 256 /* Bigger than AIO_LISTIO_MAX */ 98 #endif 99 100 #ifndef MAX_AIO_PROCS 101 #define MAX_AIO_PROCS 32 102 #endif 103 104 #ifndef MAX_AIO_QUEUE 105 #define MAX_AIO_QUEUE 1024 /* Bigger than AIO_LISTIO_MAX */ 106 #endif 107 108 #ifndef TARGET_AIO_PROCS 109 #define TARGET_AIO_PROCS 4 110 #endif 111 112 #ifndef MAX_BUF_AIO 113 #define MAX_BUF_AIO 16 114 #endif 115 116 #ifndef AIOD_TIMEOUT_DEFAULT 117 #define AIOD_TIMEOUT_DEFAULT (10 * hz) 118 #endif 119 120 #ifndef AIOD_LIFETIME_DEFAULT 121 #define AIOD_LIFETIME_DEFAULT (30 * hz) 122 #endif 123 124 FEATURE(aio, "Asynchronous I/O"); 125 126 static MALLOC_DEFINE(M_LIO, "lio", "listio aio control block list"); 127 128 static SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "Async IO management"); 129 130 static int max_aio_procs = MAX_AIO_PROCS; 131 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs, 132 CTLFLAG_RW, &max_aio_procs, 0, 133 "Maximum number of kernel threads to use for handling async IO "); 134 135 static int num_aio_procs = 0; 136 SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs, 137 CTLFLAG_RD, &num_aio_procs, 0, 138 "Number of presently active kernel threads for async IO"); 139 140 /* 141 * The code will adjust the actual number of AIO processes towards this 142 * number when it gets a chance. 143 */ 144 static int target_aio_procs = TARGET_AIO_PROCS; 145 SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs, 146 0, "Preferred number of ready kernel threads for async IO"); 147 148 static int max_queue_count = MAX_AIO_QUEUE; 149 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0, 150 "Maximum number of aio requests to queue, globally"); 151 152 static int num_queue_count = 0; 153 SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0, 154 "Number of queued aio requests"); 155 156 static int num_buf_aio = 0; 157 SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0, 158 "Number of aio requests presently handled by the buf subsystem"); 159 160 /* Number of async I/O thread in the process of being started */ 161 /* XXX This should be local to aio_aqueue() */ 162 static int num_aio_resv_start = 0; 163 164 static int aiod_timeout; 165 SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout, CTLFLAG_RW, &aiod_timeout, 0, 166 "Timeout value for synchronous aio operations"); 167 168 static int aiod_lifetime; 169 SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0, 170 "Maximum lifetime for idle aiod"); 171 172 static int unloadable = 0; 173 SYSCTL_INT(_vfs_aio, OID_AUTO, unloadable, CTLFLAG_RW, &unloadable, 0, 174 "Allow unload of aio (not recommended)"); 175 176 177 static int max_aio_per_proc = MAX_AIO_PER_PROC; 178 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc, 179 0, "Maximum active aio requests per process (stored in the process)"); 180 181 static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC; 182 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW, 183 &max_aio_queue_per_proc, 0, 184 "Maximum queued aio requests per process (stored in the process)"); 185 186 static int max_buf_aio = MAX_BUF_AIO; 187 
SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0,
    "Maximum buf aio requests per process (stored in the process)");

typedef struct oaiocb {
	int	aio_fildes;		/* File descriptor */
	off_t	aio_offset;		/* File offset for I/O */
	volatile void *aio_buf;		/* I/O buffer in process space */
	size_t	aio_nbytes;		/* Number of bytes for I/O */
	struct	osigevent aio_sigevent;	/* Signal to deliver */
	int	aio_lio_opcode;		/* LIO opcode */
	int	aio_reqprio;		/* Request priority -- ignored */
	struct	__aiocb_private	_aiocb_private;
} oaiocb_t;

/*
 * Below is a key of locks used to protect each member of struct aiocblist,
 * aioliojob and kaioinfo, and any backends.
 *
 * * - need not be protected
 * a - locked by kaioinfo lock
 * b - locked by backend lock; the backend lock can be null in some cases,
 *     for example, BIO belongs to this type, in which case the proc lock is
 *     reused.
 * c - locked by aio_job_mtx, the lock for the generic file I/O backend.
 */

/*
 * Currently, there are only two backends: BIO and generic file I/O.
 * Socket I/O is served by the generic file I/O backend; this is not a good
 * idea, since disk file I/O and any other types without the O_NONBLOCK flag
 * can block the daemon threads.  If no thread is left to serve socket I/O,
 * the socket I/O will be delayed too long or starved.  We should create some
 * threads dedicated to sockets to do non-blocking I/O, and the same goes for
 * pipes and fifos; for these I/O systems we really need a non-blocking
 * interface.  Fiddling with O_NONBLOCK in the file structure is not safe,
 * because there is a race between userland and the aio daemons.
 */

struct aiocblist {
	TAILQ_ENTRY(aiocblist) list;	/* (b) internal list for the backend */
	TAILQ_ENTRY(aiocblist) plist;	/* (a) list of jobs for each backend */
	TAILQ_ENTRY(aiocblist) allist;	/* (a) list of all jobs in proc */
	int	jobflags;		/* (a) job flags */
	int	jobstate;		/* (b) job state */
	int	inputcharge;		/* (*) input blocks */
	int	outputcharge;		/* (*) output blocks */
	struct	buf *bp;		/* (*) private to BIO backend,
					 * buffer pointer
					 */
	struct	proc *userproc;		/* (*) user process */
	struct	ucred *cred;		/* (*) active credential when created */
	struct	file *fd_file;		/* (*) pointer to file structure */
	struct	aioliojob *lio;		/* (*) optional lio job */
	struct	aiocb *uuaiocb;		/* (*) pointer in userspace of aiocb */
	struct	knlist klist;		/* (a) list of knotes */
	struct	aiocb uaiocb;		/* (*) kernel I/O control block */
	ksiginfo_t ksi;			/* (a) realtime signal info */
	struct	task biotask;		/* (*) private to BIO backend */
	uint64_t seqno;			/* (*) job number */
	int	pending;		/* (a) number of pending I/O, aio_fsync only */
};

/* jobflags */
#define	AIOCBLIST_DONE		0x01
#define	AIOCBLIST_BUFDONE	0x02
#define	AIOCBLIST_RUNDOWN	0x04
#define	AIOCBLIST_CHECKSYNC	0x08

/*
 * AIO process info
 */
#define	AIOP_FREE	0x1			/* proc on free queue */

struct aiothreadlist {
	int	aiothreadflags;			/* (c) AIO proc flags */
	TAILQ_ENTRY(aiothreadlist) list;	/* (c) list of processes */
	struct	thread *aiothread;		/* (*) the AIO thread */
};

/*
 * Data structure for lio signal management.
 */
struct aioliojob {
	int	lioj_flags;			/* (a) listio flags */
	int	lioj_count;			/* (a) listio reference count */
	int	lioj_finished_count;		/* (a) number of finished jobs */
	struct	sigevent lioj_signal;		/* (a) signal on
						 * all I/O done */
	TAILQ_ENTRY(aioliojob) lioj_list;	/* (a) lio list */
	struct	knlist klist;			/* (a) list of knotes */
	ksiginfo_t lioj_ksi;			/* (a) Realtime signal info */
};

#define	LIOJ_SIGNAL		0x1	/* signal on all done (lio) */
#define	LIOJ_SIGNAL_POSTED	0x2	/* signal has been posted */
#define	LIOJ_KEVENT_POSTED	0x4	/* kevent triggered */

/*
 * Per-process AIO data structure.
 */
struct kaioinfo {
	struct	mtx kaio_mtx;		/* the lock to protect this struct */
	int	kaio_flags;		/* (a) per process kaio flags */
	int	kaio_maxactive_count;	/* (*) maximum number of AIOs */
	int	kaio_active_count;	/* (c) number of currently used AIOs */
	int	kaio_qallowed_count;	/* (*) maximum size of AIO queue */
	int	kaio_count;		/* (a) size of AIO queue */
	int	kaio_ballowed_count;	/* (*) maximum number of buffers */
	int	kaio_buffer_count;	/* (a) number of physio buffers */
	TAILQ_HEAD(,aiocblist) kaio_all;	/* (a) all AIOs in the process */
	TAILQ_HEAD(,aiocblist) kaio_done;	/* (a) done queue for process */
	TAILQ_HEAD(,aioliojob) kaio_liojoblist; /* (a) list of lio jobs */
	TAILQ_HEAD(,aiocblist) kaio_jobqueue;	/* (a) job queue for process */
	TAILQ_HEAD(,aiocblist) kaio_bufqueue;	/* (a) buffer job queue for process */
	TAILQ_HEAD(,aiocblist) kaio_sockqueue;	/* (a) queue for aios waiting on sockets,
						 * NOT USED YET.
						 */
	TAILQ_HEAD(,aiocblist) kaio_syncqueue;	/* (a) queue for aio_fsync */
	struct	task kaio_task;		/* (*) task to kick aio threads */
};

#define	AIO_LOCK(ki)		mtx_lock(&(ki)->kaio_mtx)
#define	AIO_UNLOCK(ki)		mtx_unlock(&(ki)->kaio_mtx)
#define	AIO_LOCK_ASSERT(ki, f)	mtx_assert(&(ki)->kaio_mtx, (f))
#define	AIO_MTX(ki)		(&(ki)->kaio_mtx)

#define	KAIO_RUNDOWN	0x1	/* process is being run down */
#define	KAIO_WAKEUP	0x2	/* wakeup process when there is a significant event */

/*
 * Operations used to interact with userland aio control blocks.
 * Different ABIs provide their own operations.
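 *
 * As an illustration only (a sketch, not code in this file), a 32-bit
 * compat layer could plug into aio_aqueue() by supplying its own table,
 * with each hook translating between its ABI and the native struct aiocb;
 * all of the names below are hypothetical:
 *
 *	static struct aiocb_ops aiocb32_ops = {
 *		.copyin = aiocb32_copyin,
 *		.fetch_status = aiocb32_fetch_status,
 *		.fetch_error = aiocb32_fetch_error,
 *		.store_status = aiocb32_store_status,
 *		.store_error = aiocb32_store_error,
 *		.store_kernelinfo = aiocb32_store_kernelinfo,
 *		.store_aiocb = aiocb32_store_aiocb,
 *	};
 *
 * The native and old-sigevent tables defined further down are built the
 * same way.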
318 */ 319 struct aiocb_ops { 320 int (*copyin)(struct aiocb *ujob, struct aiocb *kjob); 321 long (*fetch_status)(struct aiocb *ujob); 322 long (*fetch_error)(struct aiocb *ujob); 323 int (*store_status)(struct aiocb *ujob, long status); 324 int (*store_error)(struct aiocb *ujob, long error); 325 int (*store_kernelinfo)(struct aiocb *ujob, long jobref); 326 int (*store_aiocb)(struct aiocb **ujobp, struct aiocb *ujob); 327 }; 328 329 static TAILQ_HEAD(,aiothreadlist) aio_freeproc; /* (c) Idle daemons */ 330 static struct sema aio_newproc_sem; 331 static struct mtx aio_job_mtx; 332 static struct mtx aio_sock_mtx; 333 static TAILQ_HEAD(,aiocblist) aio_jobs; /* (c) Async job list */ 334 static struct unrhdr *aiod_unr; 335 336 void aio_init_aioinfo(struct proc *p); 337 static void aio_onceonly(void); 338 static int aio_free_entry(struct aiocblist *aiocbe); 339 static void aio_process(struct aiocblist *aiocbe); 340 static int aio_newproc(int *); 341 int aio_aqueue(struct thread *td, struct aiocb *job, 342 struct aioliojob *lio, int type, struct aiocb_ops *ops); 343 static void aio_physwakeup(struct buf *bp); 344 static void aio_proc_rundown(void *arg, struct proc *p); 345 static void aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp); 346 static int aio_qphysio(struct proc *p, struct aiocblist *iocb); 347 static void biohelper(void *, int); 348 static void aio_daemon(void *param); 349 static void aio_swake_cb(struct socket *, struct sockbuf *); 350 static int aio_unload(void); 351 static void aio_bio_done_notify(struct proc *userp, struct aiocblist *aiocbe, int type); 352 #define DONE_BUF 1 353 #define DONE_QUEUE 2 354 static int aio_kick(struct proc *userp); 355 static void aio_kick_nowait(struct proc *userp); 356 static void aio_kick_helper(void *context, int pending); 357 static int filt_aioattach(struct knote *kn); 358 static void filt_aiodetach(struct knote *kn); 359 static int filt_aio(struct knote *kn, long hint); 360 static int filt_lioattach(struct knote *kn); 361 static void filt_liodetach(struct knote *kn); 362 static int filt_lio(struct knote *kn, long hint); 363 364 /* 365 * Zones for: 366 * kaio Per process async io info 367 * aiop async io thread data 368 * aiocb async io jobs 369 * aiol list io job pointer - internal to aio_suspend XXX 370 * aiolio list io jobs 371 */ 372 static uma_zone_t kaio_zone, aiop_zone, aiocb_zone, aiol_zone, aiolio_zone; 373 374 /* kqueue filters for aio */ 375 static struct filterops aio_filtops = 376 { 0, filt_aioattach, filt_aiodetach, filt_aio }; 377 static struct filterops lio_filtops = 378 { 0, filt_lioattach, filt_liodetach, filt_lio }; 379 380 static eventhandler_tag exit_tag, exec_tag; 381 382 TASKQUEUE_DEFINE_THREAD(aiod_bio); 383 384 /* 385 * Main operations function for use as a kernel module. 
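 *
 * (Usage note, stated loosely: when aio is not compiled into the kernel it
 * is normally brought in as a module, e.g. with "kldload aio"; MOD_UNLOAD
 * is refused below unless the vfs.aio.unloadable sysctl has been set.)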
386 */ 387 static int 388 aio_modload(struct module *module, int cmd, void *arg) 389 { 390 int error = 0; 391 392 switch (cmd) { 393 case MOD_LOAD: 394 aio_onceonly(); 395 break; 396 case MOD_UNLOAD: 397 error = aio_unload(); 398 break; 399 case MOD_SHUTDOWN: 400 break; 401 default: 402 error = EINVAL; 403 break; 404 } 405 return (error); 406 } 407 408 static moduledata_t aio_mod = { 409 "aio", 410 &aio_modload, 411 NULL 412 }; 413 414 SYSCALL_MODULE_HELPER(aio_cancel); 415 SYSCALL_MODULE_HELPER(aio_error); 416 SYSCALL_MODULE_HELPER(aio_fsync); 417 SYSCALL_MODULE_HELPER(aio_read); 418 SYSCALL_MODULE_HELPER(aio_return); 419 SYSCALL_MODULE_HELPER(aio_suspend); 420 SYSCALL_MODULE_HELPER(aio_waitcomplete); 421 SYSCALL_MODULE_HELPER(aio_write); 422 SYSCALL_MODULE_HELPER(lio_listio); 423 SYSCALL_MODULE_HELPER(oaio_read); 424 SYSCALL_MODULE_HELPER(oaio_write); 425 SYSCALL_MODULE_HELPER(olio_listio); 426 427 DECLARE_MODULE(aio, aio_mod, 428 SI_SUB_VFS, SI_ORDER_ANY); 429 MODULE_VERSION(aio, 1); 430 431 /* 432 * Startup initialization 433 */ 434 static void 435 aio_onceonly(void) 436 { 437 438 /* XXX: should probably just use so->callback */ 439 aio_swake = &aio_swake_cb; 440 exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL, 441 EVENTHANDLER_PRI_ANY); 442 exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown_exec, NULL, 443 EVENTHANDLER_PRI_ANY); 444 kqueue_add_filteropts(EVFILT_AIO, &aio_filtops); 445 kqueue_add_filteropts(EVFILT_LIO, &lio_filtops); 446 TAILQ_INIT(&aio_freeproc); 447 sema_init(&aio_newproc_sem, 0, "aio_new_proc"); 448 mtx_init(&aio_job_mtx, "aio_job", NULL, MTX_DEF); 449 mtx_init(&aio_sock_mtx, "aio_sock", NULL, MTX_DEF); 450 TAILQ_INIT(&aio_jobs); 451 aiod_unr = new_unrhdr(1, INT_MAX, NULL); 452 kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL, 453 NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 454 aiop_zone = uma_zcreate("AIOP", sizeof(struct aiothreadlist), NULL, 455 NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 456 aiocb_zone = uma_zcreate("AIOCB", sizeof(struct aiocblist), NULL, NULL, 457 NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 458 aiol_zone = uma_zcreate("AIOL", AIO_LISTIO_MAX*sizeof(intptr_t) , NULL, 459 NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 460 aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aioliojob), NULL, 461 NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 462 aiod_timeout = AIOD_TIMEOUT_DEFAULT; 463 aiod_lifetime = AIOD_LIFETIME_DEFAULT; 464 jobrefid = 1; 465 async_io_version = _POSIX_VERSION; 466 p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, AIO_LISTIO_MAX); 467 p31b_setcfg(CTL_P1003_1B_AIO_MAX, MAX_AIO_QUEUE); 468 p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, 0); 469 } 470 471 /* 472 * Callback for unload of AIO when used as a module. 473 */ 474 static int 475 aio_unload(void) 476 { 477 int error; 478 479 /* 480 * XXX: no unloads by default, it's too dangerous. 481 * perhaps we could do it if locked out callers and then 482 * did an aio_proc_rundown() on each process. 483 * 484 * jhb: aio_proc_rundown() needs to run on curproc though, 485 * so I don't think that would fly. 
486 */ 487 if (!unloadable) 488 return (EOPNOTSUPP); 489 490 error = kqueue_del_filteropts(EVFILT_AIO); 491 if (error) 492 return error; 493 error = kqueue_del_filteropts(EVFILT_LIO); 494 if (error) 495 return error; 496 async_io_version = 0; 497 aio_swake = NULL; 498 taskqueue_free(taskqueue_aiod_bio); 499 delete_unrhdr(aiod_unr); 500 uma_zdestroy(kaio_zone); 501 uma_zdestroy(aiop_zone); 502 uma_zdestroy(aiocb_zone); 503 uma_zdestroy(aiol_zone); 504 uma_zdestroy(aiolio_zone); 505 EVENTHANDLER_DEREGISTER(process_exit, exit_tag); 506 EVENTHANDLER_DEREGISTER(process_exec, exec_tag); 507 mtx_destroy(&aio_job_mtx); 508 mtx_destroy(&aio_sock_mtx); 509 sema_destroy(&aio_newproc_sem); 510 p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, -1); 511 p31b_setcfg(CTL_P1003_1B_AIO_MAX, -1); 512 p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, -1); 513 return (0); 514 } 515 516 /* 517 * Init the per-process aioinfo structure. The aioinfo limits are set 518 * per-process for user limit (resource) management. 519 */ 520 void 521 aio_init_aioinfo(struct proc *p) 522 { 523 struct kaioinfo *ki; 524 525 ki = uma_zalloc(kaio_zone, M_WAITOK); 526 mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF); 527 ki->kaio_flags = 0; 528 ki->kaio_maxactive_count = max_aio_per_proc; 529 ki->kaio_active_count = 0; 530 ki->kaio_qallowed_count = max_aio_queue_per_proc; 531 ki->kaio_count = 0; 532 ki->kaio_ballowed_count = max_buf_aio; 533 ki->kaio_buffer_count = 0; 534 TAILQ_INIT(&ki->kaio_all); 535 TAILQ_INIT(&ki->kaio_done); 536 TAILQ_INIT(&ki->kaio_jobqueue); 537 TAILQ_INIT(&ki->kaio_bufqueue); 538 TAILQ_INIT(&ki->kaio_liojoblist); 539 TAILQ_INIT(&ki->kaio_sockqueue); 540 TAILQ_INIT(&ki->kaio_syncqueue); 541 TASK_INIT(&ki->kaio_task, 0, aio_kick_helper, p); 542 PROC_LOCK(p); 543 if (p->p_aioinfo == NULL) { 544 p->p_aioinfo = ki; 545 PROC_UNLOCK(p); 546 } else { 547 PROC_UNLOCK(p); 548 mtx_destroy(&ki->kaio_mtx); 549 uma_zfree(kaio_zone, ki); 550 } 551 552 while (num_aio_procs < MIN(target_aio_procs, max_aio_procs)) 553 aio_newproc(NULL); 554 } 555 556 static int 557 aio_sendsig(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi) 558 { 559 int ret = 0; 560 561 PROC_LOCK(p); 562 if (!KSI_ONQ(ksi)) { 563 ksi->ksi_code = SI_ASYNCIO; 564 ksi->ksi_flags |= KSI_EXT | KSI_INS; 565 ret = psignal_event(p, sigev, ksi); 566 } 567 PROC_UNLOCK(p); 568 return (ret); 569 } 570 571 /* 572 * Free a job entry. Wait for completion if it is currently active, but don't 573 * delay forever. If we delay, we return a flag that says that we have to 574 * restart the queue scan. 
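 *
 * (The caller must hold the kaio lock and the job must already be in
 * JOBST_JOBFINISHED; the lock is dropped and re-taken around the final
 * fdrop(), crfree() and uma_zfree() below.)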
575 */ 576 static int 577 aio_free_entry(struct aiocblist *aiocbe) 578 { 579 struct kaioinfo *ki; 580 struct aioliojob *lj; 581 struct proc *p; 582 583 p = aiocbe->userproc; 584 MPASS(curproc == p); 585 ki = p->p_aioinfo; 586 MPASS(ki != NULL); 587 588 AIO_LOCK_ASSERT(ki, MA_OWNED); 589 MPASS(aiocbe->jobstate == JOBST_JOBFINISHED); 590 591 atomic_subtract_int(&num_queue_count, 1); 592 593 ki->kaio_count--; 594 MPASS(ki->kaio_count >= 0); 595 596 TAILQ_REMOVE(&ki->kaio_done, aiocbe, plist); 597 TAILQ_REMOVE(&ki->kaio_all, aiocbe, allist); 598 599 lj = aiocbe->lio; 600 if (lj) { 601 lj->lioj_count--; 602 lj->lioj_finished_count--; 603 604 if (lj->lioj_count == 0) { 605 TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); 606 /* lio is going away, we need to destroy any knotes */ 607 knlist_delete(&lj->klist, curthread, 1); 608 PROC_LOCK(p); 609 sigqueue_take(&lj->lioj_ksi); 610 PROC_UNLOCK(p); 611 uma_zfree(aiolio_zone, lj); 612 } 613 } 614 615 /* aiocbe is going away, we need to destroy any knotes */ 616 knlist_delete(&aiocbe->klist, curthread, 1); 617 PROC_LOCK(p); 618 sigqueue_take(&aiocbe->ksi); 619 PROC_UNLOCK(p); 620 621 MPASS(aiocbe->bp == NULL); 622 aiocbe->jobstate = JOBST_NULL; 623 AIO_UNLOCK(ki); 624 625 /* 626 * The thread argument here is used to find the owning process 627 * and is also passed to fo_close() which may pass it to various 628 * places such as devsw close() routines. Because of that, we 629 * need a thread pointer from the process owning the job that is 630 * persistent and won't disappear out from under us or move to 631 * another process. 632 * 633 * Currently, all the callers of this function call it to remove 634 * an aiocblist from the current process' job list either via a 635 * syscall or due to the current process calling exit() or 636 * execve(). Thus, we know that p == curproc. We also know that 637 * curthread can't exit since we are curthread. 638 * 639 * Therefore, we use curthread as the thread to pass to 640 * knlist_delete(). This does mean that it is possible for the 641 * thread pointer at close time to differ from the thread pointer 642 * at open time, but this is already true of file descriptors in 643 * a multithreaded process. 644 */ 645 fdrop(aiocbe->fd_file, curthread); 646 crfree(aiocbe->cred); 647 uma_zfree(aiocb_zone, aiocbe); 648 AIO_LOCK(ki); 649 650 return (0); 651 } 652 653 static void 654 aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp __unused) 655 { 656 aio_proc_rundown(arg, p); 657 } 658 659 /* 660 * Rundown the jobs for a given process. 661 */ 662 static void 663 aio_proc_rundown(void *arg, struct proc *p) 664 { 665 struct kaioinfo *ki; 666 struct aioliojob *lj; 667 struct aiocblist *cbe, *cbn; 668 struct file *fp; 669 struct socket *so; 670 int remove; 671 672 KASSERT(curthread->td_proc == p, 673 ("%s: called on non-curproc", __func__)); 674 ki = p->p_aioinfo; 675 if (ki == NULL) 676 return; 677 678 AIO_LOCK(ki); 679 ki->kaio_flags |= KAIO_RUNDOWN; 680 681 restart: 682 683 /* 684 * Try to cancel all pending requests. This code simulates 685 * aio_cancel on all pending I/O requests. 
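	 * Jobs still sitting on a queue are completed here with ECANCELED;
	 * jobs already running in a daemon cannot be pulled back, so the
	 * code below simply sleeps until they drain before freeing
	 * everything.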
686 */ 687 TAILQ_FOREACH_SAFE(cbe, &ki->kaio_jobqueue, plist, cbn) { 688 remove = 0; 689 mtx_lock(&aio_job_mtx); 690 if (cbe->jobstate == JOBST_JOBQGLOBAL) { 691 TAILQ_REMOVE(&aio_jobs, cbe, list); 692 remove = 1; 693 } else if (cbe->jobstate == JOBST_JOBQSOCK) { 694 fp = cbe->fd_file; 695 MPASS(fp->f_type == DTYPE_SOCKET); 696 so = fp->f_data; 697 TAILQ_REMOVE(&so->so_aiojobq, cbe, list); 698 remove = 1; 699 } else if (cbe->jobstate == JOBST_JOBQSYNC) { 700 TAILQ_REMOVE(&ki->kaio_syncqueue, cbe, list); 701 remove = 1; 702 } 703 mtx_unlock(&aio_job_mtx); 704 705 if (remove) { 706 cbe->jobstate = JOBST_JOBFINISHED; 707 cbe->uaiocb._aiocb_private.status = -1; 708 cbe->uaiocb._aiocb_private.error = ECANCELED; 709 TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist); 710 aio_bio_done_notify(p, cbe, DONE_QUEUE); 711 } 712 } 713 714 /* Wait for all running I/O to be finished */ 715 if (TAILQ_FIRST(&ki->kaio_bufqueue) || 716 TAILQ_FIRST(&ki->kaio_jobqueue)) { 717 ki->kaio_flags |= KAIO_WAKEUP; 718 msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO, "aioprn", hz); 719 goto restart; 720 } 721 722 /* Free all completed I/O requests. */ 723 while ((cbe = TAILQ_FIRST(&ki->kaio_done)) != NULL) 724 aio_free_entry(cbe); 725 726 while ((lj = TAILQ_FIRST(&ki->kaio_liojoblist)) != NULL) { 727 if (lj->lioj_count == 0) { 728 TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); 729 knlist_delete(&lj->klist, curthread, 1); 730 PROC_LOCK(p); 731 sigqueue_take(&lj->lioj_ksi); 732 PROC_UNLOCK(p); 733 uma_zfree(aiolio_zone, lj); 734 } else { 735 panic("LIO job not cleaned up: C:%d, FC:%d\n", 736 lj->lioj_count, lj->lioj_finished_count); 737 } 738 } 739 AIO_UNLOCK(ki); 740 taskqueue_drain(taskqueue_aiod_bio, &ki->kaio_task); 741 mtx_destroy(&ki->kaio_mtx); 742 uma_zfree(kaio_zone, ki); 743 p->p_aioinfo = NULL; 744 } 745 746 /* 747 * Select a job to run (called by an AIO daemon). 748 */ 749 static struct aiocblist * 750 aio_selectjob(struct aiothreadlist *aiop) 751 { 752 struct aiocblist *aiocbe; 753 struct kaioinfo *ki; 754 struct proc *userp; 755 756 mtx_assert(&aio_job_mtx, MA_OWNED); 757 TAILQ_FOREACH(aiocbe, &aio_jobs, list) { 758 userp = aiocbe->userproc; 759 ki = userp->p_aioinfo; 760 761 if (ki->kaio_active_count < ki->kaio_maxactive_count) { 762 TAILQ_REMOVE(&aio_jobs, aiocbe, list); 763 /* Account for currently active jobs. */ 764 ki->kaio_active_count++; 765 aiocbe->jobstate = JOBST_JOBRUNNING; 766 break; 767 } 768 } 769 return (aiocbe); 770 } 771 772 /* 773 * Move all data to a permanent storage device, this code 774 * simulates fsync syscall. 775 */ 776 static int 777 aio_fsync_vnode(struct thread *td, struct vnode *vp) 778 { 779 struct mount *mp; 780 int vfslocked; 781 int error; 782 783 vfslocked = VFS_LOCK_GIANT(vp->v_mount); 784 if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) 785 goto drop; 786 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 787 if (vp->v_object != NULL) { 788 VM_OBJECT_LOCK(vp->v_object); 789 vm_object_page_clean(vp->v_object, 0, 0, 0); 790 VM_OBJECT_UNLOCK(vp->v_object); 791 } 792 error = VOP_FSYNC(vp, MNT_WAIT, td); 793 794 VOP_UNLOCK(vp, 0); 795 vn_finished_write(mp); 796 drop: 797 VFS_UNLOCK_GIANT(vfslocked); 798 return (error); 799 } 800 801 /* 802 * The AIO processing activity. This is the code that does the I/O request for 803 * the non-physio version of the operations. The normal vn operations are used, 804 * and this code should work in all instances for every type of file, including 805 * pipes, sockets, fifos, and regular files. 
806 * 807 * XXX I don't think it works well for socket, pipe, and fifo. 808 */ 809 static void 810 aio_process(struct aiocblist *aiocbe) 811 { 812 struct ucred *td_savedcred; 813 struct thread *td; 814 struct aiocb *cb; 815 struct file *fp; 816 struct socket *so; 817 struct uio auio; 818 struct iovec aiov; 819 int cnt; 820 int error; 821 int oublock_st, oublock_end; 822 int inblock_st, inblock_end; 823 824 td = curthread; 825 td_savedcred = td->td_ucred; 826 td->td_ucred = aiocbe->cred; 827 cb = &aiocbe->uaiocb; 828 fp = aiocbe->fd_file; 829 830 if (cb->aio_lio_opcode == LIO_SYNC) { 831 error = 0; 832 cnt = 0; 833 if (fp->f_vnode != NULL) 834 error = aio_fsync_vnode(td, fp->f_vnode); 835 cb->_aiocb_private.error = error; 836 cb->_aiocb_private.status = 0; 837 td->td_ucred = td_savedcred; 838 return; 839 } 840 841 aiov.iov_base = (void *)(uintptr_t)cb->aio_buf; 842 aiov.iov_len = cb->aio_nbytes; 843 844 auio.uio_iov = &aiov; 845 auio.uio_iovcnt = 1; 846 auio.uio_offset = cb->aio_offset; 847 auio.uio_resid = cb->aio_nbytes; 848 cnt = cb->aio_nbytes; 849 auio.uio_segflg = UIO_USERSPACE; 850 auio.uio_td = td; 851 852 inblock_st = td->td_ru.ru_inblock; 853 oublock_st = td->td_ru.ru_oublock; 854 /* 855 * aio_aqueue() acquires a reference to the file that is 856 * released in aio_free_entry(). 857 */ 858 if (cb->aio_lio_opcode == LIO_READ) { 859 auio.uio_rw = UIO_READ; 860 if (auio.uio_resid == 0) 861 error = 0; 862 else 863 error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td); 864 } else { 865 if (fp->f_type == DTYPE_VNODE) 866 bwillwrite(); 867 auio.uio_rw = UIO_WRITE; 868 error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td); 869 } 870 inblock_end = td->td_ru.ru_inblock; 871 oublock_end = td->td_ru.ru_oublock; 872 873 aiocbe->inputcharge = inblock_end - inblock_st; 874 aiocbe->outputcharge = oublock_end - oublock_st; 875 876 if ((error) && (auio.uio_resid != cnt)) { 877 if (error == ERESTART || error == EINTR || error == EWOULDBLOCK) 878 error = 0; 879 if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) { 880 int sigpipe = 1; 881 if (fp->f_type == DTYPE_SOCKET) { 882 so = fp->f_data; 883 if (so->so_options & SO_NOSIGPIPE) 884 sigpipe = 0; 885 } 886 if (sigpipe) { 887 PROC_LOCK(aiocbe->userproc); 888 psignal(aiocbe->userproc, SIGPIPE); 889 PROC_UNLOCK(aiocbe->userproc); 890 } 891 } 892 } 893 894 cnt -= auio.uio_resid; 895 cb->_aiocb_private.error = error; 896 cb->_aiocb_private.status = cnt; 897 td->td_ucred = td_savedcred; 898 } 899 900 static void 901 aio_bio_done_notify(struct proc *userp, struct aiocblist *aiocbe, int type) 902 { 903 struct aioliojob *lj; 904 struct kaioinfo *ki; 905 struct aiocblist *scb, *scbn; 906 int lj_done; 907 908 ki = userp->p_aioinfo; 909 AIO_LOCK_ASSERT(ki, MA_OWNED); 910 lj = aiocbe->lio; 911 lj_done = 0; 912 if (lj) { 913 lj->lioj_finished_count++; 914 if (lj->lioj_count == lj->lioj_finished_count) 915 lj_done = 1; 916 } 917 if (type == DONE_QUEUE) { 918 aiocbe->jobflags |= AIOCBLIST_DONE; 919 } else { 920 aiocbe->jobflags |= AIOCBLIST_BUFDONE; 921 } 922 TAILQ_INSERT_TAIL(&ki->kaio_done, aiocbe, plist); 923 aiocbe->jobstate = JOBST_JOBFINISHED; 924 925 if (ki->kaio_flags & KAIO_RUNDOWN) 926 goto notification_done; 927 928 if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL || 929 aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) 930 aio_sendsig(userp, &aiocbe->uaiocb.aio_sigevent, &aiocbe->ksi); 931 932 KNOTE_LOCKED(&aiocbe->klist, 1); 933 934 if (lj_done) { 935 if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { 936 lj->lioj_flags 
|= LIOJ_KEVENT_POSTED; 937 KNOTE_LOCKED(&lj->klist, 1); 938 } 939 if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) 940 == LIOJ_SIGNAL 941 && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || 942 lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) { 943 aio_sendsig(userp, &lj->lioj_signal, &lj->lioj_ksi); 944 lj->lioj_flags |= LIOJ_SIGNAL_POSTED; 945 } 946 } 947 948 notification_done: 949 if (aiocbe->jobflags & AIOCBLIST_CHECKSYNC) { 950 TAILQ_FOREACH_SAFE(scb, &ki->kaio_syncqueue, list, scbn) { 951 if (aiocbe->fd_file == scb->fd_file && 952 aiocbe->seqno < scb->seqno) { 953 if (--scb->pending == 0) { 954 mtx_lock(&aio_job_mtx); 955 scb->jobstate = JOBST_JOBQGLOBAL; 956 TAILQ_REMOVE(&ki->kaio_syncqueue, scb, list); 957 TAILQ_INSERT_TAIL(&aio_jobs, scb, list); 958 aio_kick_nowait(userp); 959 mtx_unlock(&aio_job_mtx); 960 } 961 } 962 } 963 } 964 if (ki->kaio_flags & KAIO_WAKEUP) { 965 ki->kaio_flags &= ~KAIO_WAKEUP; 966 wakeup(&userp->p_aioinfo); 967 } 968 } 969 970 /* 971 * The AIO daemon, most of the actual work is done in aio_process, 972 * but the setup (and address space mgmt) is done in this routine. 973 */ 974 static void 975 aio_daemon(void *_id) 976 { 977 struct aiocblist *aiocbe; 978 struct aiothreadlist *aiop; 979 struct kaioinfo *ki; 980 struct proc *curcp, *mycp, *userp; 981 struct vmspace *myvm, *tmpvm; 982 struct thread *td = curthread; 983 int id = (intptr_t)_id; 984 985 /* 986 * Local copies of curproc (cp) and vmspace (myvm) 987 */ 988 mycp = td->td_proc; 989 myvm = mycp->p_vmspace; 990 991 KASSERT(mycp->p_textvp == NULL, ("kthread has a textvp")); 992 993 /* 994 * Allocate and ready the aio control info. There is one aiop structure 995 * per daemon. 996 */ 997 aiop = uma_zalloc(aiop_zone, M_WAITOK); 998 aiop->aiothread = td; 999 aiop->aiothreadflags = 0; 1000 1001 /* The daemon resides in its own pgrp. */ 1002 setsid(td, NULL); 1003 1004 /* 1005 * Wakeup parent process. (Parent sleeps to keep from blasting away 1006 * and creating too many daemons.) 1007 */ 1008 sema_post(&aio_newproc_sem); 1009 1010 mtx_lock(&aio_job_mtx); 1011 for (;;) { 1012 /* 1013 * curcp is the current daemon process context. 1014 * userp is the current user process context. 1015 */ 1016 curcp = mycp; 1017 1018 /* 1019 * Take daemon off of free queue 1020 */ 1021 if (aiop->aiothreadflags & AIOP_FREE) { 1022 TAILQ_REMOVE(&aio_freeproc, aiop, list); 1023 aiop->aiothreadflags &= ~AIOP_FREE; 1024 } 1025 1026 /* 1027 * Check for jobs. 1028 */ 1029 while ((aiocbe = aio_selectjob(aiop)) != NULL) { 1030 mtx_unlock(&aio_job_mtx); 1031 userp = aiocbe->userproc; 1032 1033 /* 1034 * Connect to process address space for user program. 1035 */ 1036 if (userp != curcp) { 1037 /* 1038 * Save the current address space that we are 1039 * connected to. 1040 */ 1041 tmpvm = mycp->p_vmspace; 1042 1043 /* 1044 * Point to the new user address space, and 1045 * refer to it. 1046 */ 1047 mycp->p_vmspace = userp->p_vmspace; 1048 atomic_add_int(&mycp->p_vmspace->vm_refcnt, 1); 1049 1050 /* Activate the new mapping. */ 1051 pmap_activate(FIRST_THREAD_IN_PROC(mycp)); 1052 1053 /* 1054 * If the old address space wasn't the daemons 1055 * own address space, then we need to remove the 1056 * daemon's reference from the other process 1057 * that it was acting on behalf of. 1058 */ 1059 if (tmpvm != myvm) { 1060 vmspace_free(tmpvm); 1061 } 1062 curcp = userp; 1063 } 1064 1065 ki = userp->p_aioinfo; 1066 1067 /* Do the I/O function. 
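			 * By now the daemon has switched to the user
			 * process's vmspace, and aio_process() temporarily
			 * assumes the submitter's credentials, so the request
			 * can read from or write to the user's buffer
			 * directly.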
*/ 1068 aio_process(aiocbe); 1069 1070 mtx_lock(&aio_job_mtx); 1071 /* Decrement the active job count. */ 1072 ki->kaio_active_count--; 1073 mtx_unlock(&aio_job_mtx); 1074 1075 AIO_LOCK(ki); 1076 TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist); 1077 aio_bio_done_notify(userp, aiocbe, DONE_QUEUE); 1078 AIO_UNLOCK(ki); 1079 1080 mtx_lock(&aio_job_mtx); 1081 } 1082 1083 /* 1084 * Disconnect from user address space. 1085 */ 1086 if (curcp != mycp) { 1087 1088 mtx_unlock(&aio_job_mtx); 1089 1090 /* Get the user address space to disconnect from. */ 1091 tmpvm = mycp->p_vmspace; 1092 1093 /* Get original address space for daemon. */ 1094 mycp->p_vmspace = myvm; 1095 1096 /* Activate the daemon's address space. */ 1097 pmap_activate(FIRST_THREAD_IN_PROC(mycp)); 1098 #ifdef DIAGNOSTIC 1099 if (tmpvm == myvm) { 1100 printf("AIOD: vmspace problem -- %d\n", 1101 mycp->p_pid); 1102 } 1103 #endif 1104 /* Remove our vmspace reference. */ 1105 vmspace_free(tmpvm); 1106 1107 curcp = mycp; 1108 1109 mtx_lock(&aio_job_mtx); 1110 /* 1111 * We have to restart to avoid race, we only sleep if 1112 * no job can be selected, that should be 1113 * curcp == mycp. 1114 */ 1115 continue; 1116 } 1117 1118 mtx_assert(&aio_job_mtx, MA_OWNED); 1119 1120 TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list); 1121 aiop->aiothreadflags |= AIOP_FREE; 1122 1123 /* 1124 * If daemon is inactive for a long time, allow it to exit, 1125 * thereby freeing resources. 1126 */ 1127 if (msleep(aiop->aiothread, &aio_job_mtx, PRIBIO, "aiordy", 1128 aiod_lifetime)) { 1129 if (TAILQ_EMPTY(&aio_jobs)) { 1130 if ((aiop->aiothreadflags & AIOP_FREE) && 1131 (num_aio_procs > target_aio_procs)) { 1132 TAILQ_REMOVE(&aio_freeproc, aiop, list); 1133 num_aio_procs--; 1134 mtx_unlock(&aio_job_mtx); 1135 uma_zfree(aiop_zone, aiop); 1136 free_unr(aiod_unr, id); 1137 #ifdef DIAGNOSTIC 1138 if (mycp->p_vmspace->vm_refcnt <= 1) { 1139 printf("AIOD: bad vm refcnt for" 1140 " exiting daemon: %d\n", 1141 mycp->p_vmspace->vm_refcnt); 1142 } 1143 #endif 1144 kproc_exit(0); 1145 } 1146 } 1147 } 1148 } 1149 mtx_unlock(&aio_job_mtx); 1150 panic("shouldn't be here\n"); 1151 } 1152 1153 /* 1154 * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The 1155 * AIO daemon modifies its environment itself. 1156 */ 1157 static int 1158 aio_newproc(int *start) 1159 { 1160 int error; 1161 struct proc *p; 1162 int id; 1163 1164 id = alloc_unr(aiod_unr); 1165 error = kproc_create(aio_daemon, (void *)(intptr_t)id, &p, 1166 RFNOWAIT, 0, "aiod%d", id); 1167 if (error == 0) { 1168 /* 1169 * Wait until daemon is started. 1170 */ 1171 sema_wait(&aio_newproc_sem); 1172 mtx_lock(&aio_job_mtx); 1173 num_aio_procs++; 1174 if (start != NULL) 1175 (*start)--; 1176 mtx_unlock(&aio_job_mtx); 1177 } else { 1178 free_unr(aiod_unr, id); 1179 } 1180 return (error); 1181 } 1182 1183 /* 1184 * Try the high-performance, low-overhead physio method for eligible 1185 * VCHR devices. This method doesn't use an aio helper thread, and 1186 * thus has very low overhead. 1187 * 1188 * Assumes that the caller, aio_aqueue(), has incremented the file 1189 * structure's reference count, preventing its deallocation for the 1190 * duration of this call. 
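 *
 * (Summary of the checks below: the descriptor must refer to a disk-backed
 * VCHR vnode, the transfer must be a multiple of the device block size and
 * no larger than si_iosize_max or MAXPHYS, and the process must still be
 * under its physio-buffer quota; otherwise -1 is returned and the caller
 * falls back to the daemon-based path.)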
1191 */ 1192 static int 1193 aio_qphysio(struct proc *p, struct aiocblist *aiocbe) 1194 { 1195 struct aiocb *cb; 1196 struct file *fp; 1197 struct buf *bp; 1198 struct vnode *vp; 1199 struct kaioinfo *ki; 1200 struct aioliojob *lj; 1201 int error; 1202 1203 cb = &aiocbe->uaiocb; 1204 fp = aiocbe->fd_file; 1205 1206 if (fp->f_type != DTYPE_VNODE) 1207 return (-1); 1208 1209 vp = fp->f_vnode; 1210 1211 /* 1212 * If its not a disk, we don't want to return a positive error. 1213 * It causes the aio code to not fall through to try the thread 1214 * way when you're talking to a regular file. 1215 */ 1216 if (!vn_isdisk(vp, &error)) { 1217 if (error == ENOTBLK) 1218 return (-1); 1219 else 1220 return (error); 1221 } 1222 1223 if (vp->v_bufobj.bo_bsize == 0) 1224 return (-1); 1225 1226 if (cb->aio_nbytes % vp->v_bufobj.bo_bsize) 1227 return (-1); 1228 1229 if (cb->aio_nbytes > vp->v_rdev->si_iosize_max) 1230 return (-1); 1231 1232 if (cb->aio_nbytes > 1233 MAXPHYS - (((vm_offset_t) cb->aio_buf) & PAGE_MASK)) 1234 return (-1); 1235 1236 ki = p->p_aioinfo; 1237 if (ki->kaio_buffer_count >= ki->kaio_ballowed_count) 1238 return (-1); 1239 1240 /* Create and build a buffer header for a transfer. */ 1241 bp = (struct buf *)getpbuf(NULL); 1242 BUF_KERNPROC(bp); 1243 1244 AIO_LOCK(ki); 1245 ki->kaio_count++; 1246 ki->kaio_buffer_count++; 1247 lj = aiocbe->lio; 1248 if (lj) 1249 lj->lioj_count++; 1250 AIO_UNLOCK(ki); 1251 1252 /* 1253 * Get a copy of the kva from the physical buffer. 1254 */ 1255 error = 0; 1256 1257 bp->b_bcount = cb->aio_nbytes; 1258 bp->b_bufsize = cb->aio_nbytes; 1259 bp->b_iodone = aio_physwakeup; 1260 bp->b_saveaddr = bp->b_data; 1261 bp->b_data = (void *)(uintptr_t)cb->aio_buf; 1262 bp->b_offset = cb->aio_offset; 1263 bp->b_iooffset = cb->aio_offset; 1264 bp->b_blkno = btodb(cb->aio_offset); 1265 bp->b_iocmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ; 1266 1267 /* 1268 * Bring buffer into kernel space. 1269 */ 1270 if (vmapbuf(bp) < 0) { 1271 error = EFAULT; 1272 goto doerror; 1273 } 1274 1275 AIO_LOCK(ki); 1276 aiocbe->bp = bp; 1277 bp->b_caller1 = (void *)aiocbe; 1278 TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist); 1279 TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist); 1280 aiocbe->jobstate = JOBST_JOBQBUF; 1281 cb->_aiocb_private.status = cb->aio_nbytes; 1282 AIO_UNLOCK(ki); 1283 1284 atomic_add_int(&num_queue_count, 1); 1285 atomic_add_int(&num_buf_aio, 1); 1286 1287 bp->b_error = 0; 1288 1289 TASK_INIT(&aiocbe->biotask, 0, biohelper, aiocbe); 1290 1291 /* Perform transfer. */ 1292 dev_strategy(vp->v_rdev, bp); 1293 return (0); 1294 1295 doerror: 1296 AIO_LOCK(ki); 1297 ki->kaio_count--; 1298 ki->kaio_buffer_count--; 1299 if (lj) 1300 lj->lioj_count--; 1301 aiocbe->bp = NULL; 1302 AIO_UNLOCK(ki); 1303 relpbuf(bp, NULL); 1304 return (error); 1305 } 1306 1307 /* 1308 * Wake up aio requests that may be serviceable now. 1309 */ 1310 static void 1311 aio_swake_cb(struct socket *so, struct sockbuf *sb) 1312 { 1313 struct aiocblist *cb, *cbn; 1314 int opcode; 1315 1316 SOCKBUF_LOCK_ASSERT(sb); 1317 if (sb == &so->so_snd) 1318 opcode = LIO_WRITE; 1319 else 1320 opcode = LIO_READ; 1321 1322 sb->sb_flags &= ~SB_AIO; 1323 mtx_lock(&aio_job_mtx); 1324 TAILQ_FOREACH_SAFE(cb, &so->so_aiojobq, list, cbn) { 1325 if (opcode == cb->uaiocb.aio_lio_opcode) { 1326 if (cb->jobstate != JOBST_JOBQSOCK) 1327 panic("invalid queue value"); 1328 /* XXX 1329 * We don't have actual sockets backend yet, 1330 * so we simply move the requests to the generic 1331 * file I/O backend. 
1332 */ 1333 TAILQ_REMOVE(&so->so_aiojobq, cb, list); 1334 TAILQ_INSERT_TAIL(&aio_jobs, cb, list); 1335 aio_kick_nowait(cb->userproc); 1336 } 1337 } 1338 mtx_unlock(&aio_job_mtx); 1339 } 1340 1341 static int 1342 convert_old_sigevent(struct osigevent *osig, struct sigevent *nsig) 1343 { 1344 1345 /* 1346 * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are 1347 * supported by AIO with the old sigevent structure. 1348 */ 1349 nsig->sigev_notify = osig->sigev_notify; 1350 switch (nsig->sigev_notify) { 1351 case SIGEV_NONE: 1352 break; 1353 case SIGEV_SIGNAL: 1354 nsig->sigev_signo = osig->__sigev_u.__sigev_signo; 1355 break; 1356 case SIGEV_KEVENT: 1357 nsig->sigev_notify_kqueue = 1358 osig->__sigev_u.__sigev_notify_kqueue; 1359 nsig->sigev_value.sival_ptr = osig->sigev_value.sival_ptr; 1360 break; 1361 default: 1362 return (EINVAL); 1363 } 1364 return (0); 1365 } 1366 1367 static int 1368 aiocb_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob) 1369 { 1370 struct oaiocb *ojob; 1371 int error; 1372 1373 bzero(kjob, sizeof(struct aiocb)); 1374 error = copyin(ujob, kjob, sizeof(struct oaiocb)); 1375 if (error) 1376 return (error); 1377 ojob = (struct oaiocb *)kjob; 1378 return (convert_old_sigevent(&ojob->aio_sigevent, &kjob->aio_sigevent)); 1379 } 1380 1381 static int 1382 aiocb_copyin(struct aiocb *ujob, struct aiocb *kjob) 1383 { 1384 1385 return (copyin(ujob, kjob, sizeof(struct aiocb))); 1386 } 1387 1388 static long 1389 aiocb_fetch_status(struct aiocb *ujob) 1390 { 1391 1392 return (fuword(&ujob->_aiocb_private.status)); 1393 } 1394 1395 static long 1396 aiocb_fetch_error(struct aiocb *ujob) 1397 { 1398 1399 return (fuword(&ujob->_aiocb_private.error)); 1400 } 1401 1402 static int 1403 aiocb_store_status(struct aiocb *ujob, long status) 1404 { 1405 1406 return (suword(&ujob->_aiocb_private.status, status)); 1407 } 1408 1409 static int 1410 aiocb_store_error(struct aiocb *ujob, long error) 1411 { 1412 1413 return (suword(&ujob->_aiocb_private.error, error)); 1414 } 1415 1416 static int 1417 aiocb_store_kernelinfo(struct aiocb *ujob, long jobref) 1418 { 1419 1420 return (suword(&ujob->_aiocb_private.kernelinfo, jobref)); 1421 } 1422 1423 static int 1424 aiocb_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob) 1425 { 1426 1427 return (suword(ujobp, (long)ujob)); 1428 } 1429 1430 static struct aiocb_ops aiocb_ops = { 1431 .copyin = aiocb_copyin, 1432 .fetch_status = aiocb_fetch_status, 1433 .fetch_error = aiocb_fetch_error, 1434 .store_status = aiocb_store_status, 1435 .store_error = aiocb_store_error, 1436 .store_kernelinfo = aiocb_store_kernelinfo, 1437 .store_aiocb = aiocb_store_aiocb, 1438 }; 1439 1440 static struct aiocb_ops aiocb_ops_osigevent = { 1441 .copyin = aiocb_copyin_old_sigevent, 1442 .fetch_status = aiocb_fetch_status, 1443 .fetch_error = aiocb_fetch_error, 1444 .store_status = aiocb_store_status, 1445 .store_error = aiocb_store_error, 1446 .store_kernelinfo = aiocb_store_kernelinfo, 1447 .store_aiocb = aiocb_store_aiocb, 1448 }; 1449 1450 /* 1451 * Queue a new AIO request. Choosing either the threaded or direct physio VCHR 1452 * technique is done in this code. 
1453 */ 1454 int 1455 aio_aqueue(struct thread *td, struct aiocb *job, struct aioliojob *lj, 1456 int type, struct aiocb_ops *ops) 1457 { 1458 struct proc *p = td->td_proc; 1459 struct file *fp; 1460 struct socket *so; 1461 struct aiocblist *aiocbe, *cb; 1462 struct kaioinfo *ki; 1463 struct kevent kev; 1464 struct sockbuf *sb; 1465 int opcode; 1466 int error; 1467 int fd, kqfd; 1468 int jid; 1469 1470 if (p->p_aioinfo == NULL) 1471 aio_init_aioinfo(p); 1472 1473 ki = p->p_aioinfo; 1474 1475 ops->store_status(job, -1); 1476 ops->store_error(job, 0); 1477 ops->store_kernelinfo(job, -1); 1478 1479 if (num_queue_count >= max_queue_count || 1480 ki->kaio_count >= ki->kaio_qallowed_count) { 1481 ops->store_error(job, EAGAIN); 1482 return (EAGAIN); 1483 } 1484 1485 aiocbe = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO); 1486 aiocbe->inputcharge = 0; 1487 aiocbe->outputcharge = 0; 1488 knlist_init_mtx(&aiocbe->klist, AIO_MTX(ki)); 1489 1490 error = ops->copyin(job, &aiocbe->uaiocb); 1491 if (error) { 1492 ops->store_error(job, error); 1493 uma_zfree(aiocb_zone, aiocbe); 1494 return (error); 1495 } 1496 1497 if (aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT && 1498 aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL && 1499 aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID && 1500 aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_NONE) { 1501 ops->store_error(job, EINVAL); 1502 uma_zfree(aiocb_zone, aiocbe); 1503 return (EINVAL); 1504 } 1505 1506 if ((aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL || 1507 aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) && 1508 !_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) { 1509 uma_zfree(aiocb_zone, aiocbe); 1510 return (EINVAL); 1511 } 1512 1513 ksiginfo_init(&aiocbe->ksi); 1514 1515 /* Save userspace address of the job info. */ 1516 aiocbe->uuaiocb = job; 1517 1518 /* Get the opcode. */ 1519 if (type != LIO_NOP) 1520 aiocbe->uaiocb.aio_lio_opcode = type; 1521 opcode = aiocbe->uaiocb.aio_lio_opcode; 1522 1523 /* Fetch the file object for the specified file descriptor. 
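	 * A descriptor queued for LIO_WRITE must have been opened for
	 * writing and one queued for LIO_READ for reading; other opcodes
	 * accept any open descriptor.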
*/ 1524 fd = aiocbe->uaiocb.aio_fildes; 1525 switch (opcode) { 1526 case LIO_WRITE: 1527 error = fget_write(td, fd, &fp); 1528 break; 1529 case LIO_READ: 1530 error = fget_read(td, fd, &fp); 1531 break; 1532 default: 1533 error = fget(td, fd, &fp); 1534 } 1535 if (error) { 1536 uma_zfree(aiocb_zone, aiocbe); 1537 ops->store_error(job, error); 1538 return (error); 1539 } 1540 1541 if (opcode == LIO_SYNC && fp->f_vnode == NULL) { 1542 error = EINVAL; 1543 goto aqueue_fail; 1544 } 1545 1546 if (opcode != LIO_SYNC && aiocbe->uaiocb.aio_offset == -1LL) { 1547 error = EINVAL; 1548 goto aqueue_fail; 1549 } 1550 1551 aiocbe->fd_file = fp; 1552 1553 mtx_lock(&aio_job_mtx); 1554 jid = jobrefid++; 1555 aiocbe->seqno = jobseqno++; 1556 mtx_unlock(&aio_job_mtx); 1557 error = ops->store_kernelinfo(job, jid); 1558 if (error) { 1559 error = EINVAL; 1560 goto aqueue_fail; 1561 } 1562 aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jid; 1563 1564 if (opcode == LIO_NOP) { 1565 fdrop(fp, td); 1566 uma_zfree(aiocb_zone, aiocbe); 1567 return (0); 1568 } 1569 if ((opcode != LIO_READ) && (opcode != LIO_WRITE) && 1570 (opcode != LIO_SYNC)) { 1571 error = EINVAL; 1572 goto aqueue_fail; 1573 } 1574 1575 if (aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT) 1576 goto no_kqueue; 1577 kqfd = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue; 1578 kev.ident = (uintptr_t)aiocbe->uuaiocb; 1579 kev.filter = EVFILT_AIO; 1580 kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1; 1581 kev.data = (intptr_t)aiocbe; 1582 kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sival_ptr; 1583 error = kqfd_register(kqfd, &kev, td, 1); 1584 aqueue_fail: 1585 if (error) { 1586 fdrop(fp, td); 1587 uma_zfree(aiocb_zone, aiocbe); 1588 ops->store_error(job, error); 1589 goto done; 1590 } 1591 no_kqueue: 1592 1593 ops->store_error(job, EINPROGRESS); 1594 aiocbe->uaiocb._aiocb_private.error = EINPROGRESS; 1595 aiocbe->userproc = p; 1596 aiocbe->cred = crhold(td->td_ucred); 1597 aiocbe->jobflags = 0; 1598 aiocbe->lio = lj; 1599 1600 if (opcode == LIO_SYNC) 1601 goto queueit; 1602 1603 if (fp->f_type == DTYPE_SOCKET) { 1604 /* 1605 * Alternate queueing for socket ops: Reach down into the 1606 * descriptor to get the socket data. Then check to see if the 1607 * socket is ready to be read or written (based on the requested 1608 * operation). 1609 * 1610 * If it is not ready for io, then queue the aiocbe on the 1611 * socket, and set the flags so we get a call when sbnotify() 1612 * happens. 1613 * 1614 * Note if opcode is neither LIO_WRITE nor LIO_READ we lock 1615 * and unlock the snd sockbuf for no reason. 1616 */ 1617 so = fp->f_data; 1618 sb = (opcode == LIO_READ) ? 
&so->so_rcv : &so->so_snd; 1619 SOCKBUF_LOCK(sb); 1620 if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode == 1621 LIO_WRITE) && (!sowriteable(so)))) { 1622 sb->sb_flags |= SB_AIO; 1623 1624 mtx_lock(&aio_job_mtx); 1625 TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list); 1626 mtx_unlock(&aio_job_mtx); 1627 1628 AIO_LOCK(ki); 1629 TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist); 1630 TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist); 1631 aiocbe->jobstate = JOBST_JOBQSOCK; 1632 ki->kaio_count++; 1633 if (lj) 1634 lj->lioj_count++; 1635 AIO_UNLOCK(ki); 1636 SOCKBUF_UNLOCK(sb); 1637 atomic_add_int(&num_queue_count, 1); 1638 error = 0; 1639 goto done; 1640 } 1641 SOCKBUF_UNLOCK(sb); 1642 } 1643 1644 if ((error = aio_qphysio(p, aiocbe)) == 0) 1645 goto done; 1646 #if 0 1647 if (error > 0) { 1648 aiocbe->uaiocb._aiocb_private.error = error; 1649 ops->store_error(job, error); 1650 goto done; 1651 } 1652 #endif 1653 queueit: 1654 /* No buffer for daemon I/O. */ 1655 aiocbe->bp = NULL; 1656 atomic_add_int(&num_queue_count, 1); 1657 1658 AIO_LOCK(ki); 1659 ki->kaio_count++; 1660 if (lj) 1661 lj->lioj_count++; 1662 TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist); 1663 TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist); 1664 if (opcode == LIO_SYNC) { 1665 TAILQ_FOREACH(cb, &ki->kaio_jobqueue, plist) { 1666 if (cb->fd_file == aiocbe->fd_file && 1667 cb->uaiocb.aio_lio_opcode != LIO_SYNC && 1668 cb->seqno < aiocbe->seqno) { 1669 cb->jobflags |= AIOCBLIST_CHECKSYNC; 1670 aiocbe->pending++; 1671 } 1672 } 1673 TAILQ_FOREACH(cb, &ki->kaio_bufqueue, plist) { 1674 if (cb->fd_file == aiocbe->fd_file && 1675 cb->uaiocb.aio_lio_opcode != LIO_SYNC && 1676 cb->seqno < aiocbe->seqno) { 1677 cb->jobflags |= AIOCBLIST_CHECKSYNC; 1678 aiocbe->pending++; 1679 } 1680 } 1681 if (aiocbe->pending != 0) { 1682 TAILQ_INSERT_TAIL(&ki->kaio_syncqueue, aiocbe, list); 1683 aiocbe->jobstate = JOBST_JOBQSYNC; 1684 AIO_UNLOCK(ki); 1685 goto done; 1686 } 1687 } 1688 mtx_lock(&aio_job_mtx); 1689 TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list); 1690 aiocbe->jobstate = JOBST_JOBQGLOBAL; 1691 aio_kick_nowait(p); 1692 mtx_unlock(&aio_job_mtx); 1693 AIO_UNLOCK(ki); 1694 error = 0; 1695 done: 1696 return (error); 1697 } 1698 1699 static void 1700 aio_kick_nowait(struct proc *userp) 1701 { 1702 struct kaioinfo *ki = userp->p_aioinfo; 1703 struct aiothreadlist *aiop; 1704 1705 mtx_assert(&aio_job_mtx, MA_OWNED); 1706 if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) { 1707 TAILQ_REMOVE(&aio_freeproc, aiop, list); 1708 aiop->aiothreadflags &= ~AIOP_FREE; 1709 wakeup(aiop->aiothread); 1710 } else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) && 1711 ((ki->kaio_active_count + num_aio_resv_start) < 1712 ki->kaio_maxactive_count)) { 1713 taskqueue_enqueue(taskqueue_aiod_bio, &ki->kaio_task); 1714 } 1715 } 1716 1717 static int 1718 aio_kick(struct proc *userp) 1719 { 1720 struct kaioinfo *ki = userp->p_aioinfo; 1721 struct aiothreadlist *aiop; 1722 int error, ret = 0; 1723 1724 mtx_assert(&aio_job_mtx, MA_OWNED); 1725 retryproc: 1726 if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) { 1727 TAILQ_REMOVE(&aio_freeproc, aiop, list); 1728 aiop->aiothreadflags &= ~AIOP_FREE; 1729 wakeup(aiop->aiothread); 1730 } else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) && 1731 ((ki->kaio_active_count + num_aio_resv_start) < 1732 ki->kaio_maxactive_count)) { 1733 num_aio_resv_start++; 1734 mtx_unlock(&aio_job_mtx); 1735 error = aio_newproc(&num_aio_resv_start); 1736 mtx_lock(&aio_job_mtx); 1737 if (error) { 1738 
num_aio_resv_start--; 1739 goto retryproc; 1740 } 1741 } else { 1742 ret = -1; 1743 } 1744 return (ret); 1745 } 1746 1747 static void 1748 aio_kick_helper(void *context, int pending) 1749 { 1750 struct proc *userp = context; 1751 1752 mtx_lock(&aio_job_mtx); 1753 while (--pending >= 0) { 1754 if (aio_kick(userp)) 1755 break; 1756 } 1757 mtx_unlock(&aio_job_mtx); 1758 } 1759 1760 /* 1761 * Support the aio_return system call, as a side-effect, kernel resources are 1762 * released. 1763 */ 1764 static int 1765 kern_aio_return(struct thread *td, struct aiocb *uaiocb, struct aiocb_ops *ops) 1766 { 1767 struct proc *p = td->td_proc; 1768 struct aiocblist *cb; 1769 struct kaioinfo *ki; 1770 int status, error; 1771 1772 ki = p->p_aioinfo; 1773 if (ki == NULL) 1774 return (EINVAL); 1775 AIO_LOCK(ki); 1776 TAILQ_FOREACH(cb, &ki->kaio_done, plist) { 1777 if (cb->uuaiocb == uaiocb) 1778 break; 1779 } 1780 if (cb != NULL) { 1781 MPASS(cb->jobstate == JOBST_JOBFINISHED); 1782 status = cb->uaiocb._aiocb_private.status; 1783 error = cb->uaiocb._aiocb_private.error; 1784 td->td_retval[0] = status; 1785 if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) { 1786 td->td_ru.ru_oublock += cb->outputcharge; 1787 cb->outputcharge = 0; 1788 } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) { 1789 td->td_ru.ru_inblock += cb->inputcharge; 1790 cb->inputcharge = 0; 1791 } 1792 aio_free_entry(cb); 1793 AIO_UNLOCK(ki); 1794 ops->store_error(uaiocb, error); 1795 ops->store_status(uaiocb, status); 1796 } else { 1797 error = EINVAL; 1798 AIO_UNLOCK(ki); 1799 } 1800 return (error); 1801 } 1802 1803 int 1804 aio_return(struct thread *td, struct aio_return_args *uap) 1805 { 1806 1807 return (kern_aio_return(td, uap->aiocbp, &aiocb_ops)); 1808 } 1809 1810 /* 1811 * Allow a process to wakeup when any of the I/O requests are completed. 1812 */ 1813 static int 1814 kern_aio_suspend(struct thread *td, int njoblist, struct aiocb **ujoblist, 1815 struct timespec *ts) 1816 { 1817 struct proc *p = td->td_proc; 1818 struct timeval atv; 1819 struct kaioinfo *ki; 1820 struct aiocblist *cb, *cbfirst; 1821 int error, i, timo; 1822 1823 timo = 0; 1824 if (ts) { 1825 if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000) 1826 return (EINVAL); 1827 1828 TIMESPEC_TO_TIMEVAL(&atv, ts); 1829 if (itimerfix(&atv)) 1830 return (EINVAL); 1831 timo = tvtohz(&atv); 1832 } 1833 1834 ki = p->p_aioinfo; 1835 if (ki == NULL) 1836 return (EAGAIN); 1837 1838 if (njoblist == 0) 1839 return (0); 1840 1841 AIO_LOCK(ki); 1842 for (;;) { 1843 cbfirst = NULL; 1844 error = 0; 1845 TAILQ_FOREACH(cb, &ki->kaio_all, allist) { 1846 for (i = 0; i < njoblist; i++) { 1847 if (cb->uuaiocb == ujoblist[i]) { 1848 if (cbfirst == NULL) 1849 cbfirst = cb; 1850 if (cb->jobstate == JOBST_JOBFINISHED) 1851 goto RETURN; 1852 } 1853 } 1854 } 1855 /* All tasks were finished. */ 1856 if (cbfirst == NULL) 1857 break; 1858 1859 ki->kaio_flags |= KAIO_WAKEUP; 1860 error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH, 1861 "aiospn", timo); 1862 if (error == ERESTART) 1863 error = EINTR; 1864 if (error) 1865 break; 1866 } 1867 RETURN: 1868 AIO_UNLOCK(ki); 1869 return (error); 1870 } 1871 1872 int 1873 aio_suspend(struct thread *td, struct aio_suspend_args *uap) 1874 { 1875 struct timespec ts, *tsp; 1876 struct aiocb **ujoblist; 1877 int error; 1878 1879 if (uap->nent < 0 || uap->nent > AIO_LISTIO_MAX) 1880 return (EINVAL); 1881 1882 if (uap->timeout) { 1883 /* Get timespec struct. 
*/ 1884 if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0) 1885 return (error); 1886 tsp = &ts; 1887 } else 1888 tsp = NULL; 1889 1890 ujoblist = uma_zalloc(aiol_zone, M_WAITOK); 1891 error = copyin(uap->aiocbp, ujoblist, uap->nent * sizeof(ujoblist[0])); 1892 if (error == 0) 1893 error = kern_aio_suspend(td, uap->nent, ujoblist, tsp); 1894 uma_zfree(aiol_zone, ujoblist); 1895 return (error); 1896 } 1897 1898 /* 1899 * aio_cancel cancels any non-physio aio operations not currently in 1900 * progress. 1901 */ 1902 int 1903 aio_cancel(struct thread *td, struct aio_cancel_args *uap) 1904 { 1905 struct proc *p = td->td_proc; 1906 struct kaioinfo *ki; 1907 struct aiocblist *cbe, *cbn; 1908 struct file *fp; 1909 struct socket *so; 1910 int error; 1911 int remove; 1912 int cancelled = 0; 1913 int notcancelled = 0; 1914 struct vnode *vp; 1915 1916 /* Lookup file object. */ 1917 error = fget(td, uap->fd, &fp); 1918 if (error) 1919 return (error); 1920 1921 ki = p->p_aioinfo; 1922 if (ki == NULL) 1923 goto done; 1924 1925 if (fp->f_type == DTYPE_VNODE) { 1926 vp = fp->f_vnode; 1927 if (vn_isdisk(vp, &error)) { 1928 fdrop(fp, td); 1929 td->td_retval[0] = AIO_NOTCANCELED; 1930 return (0); 1931 } 1932 } 1933 1934 AIO_LOCK(ki); 1935 TAILQ_FOREACH_SAFE(cbe, &ki->kaio_jobqueue, plist, cbn) { 1936 if ((uap->fd == cbe->uaiocb.aio_fildes) && 1937 ((uap->aiocbp == NULL) || 1938 (uap->aiocbp == cbe->uuaiocb))) { 1939 remove = 0; 1940 1941 mtx_lock(&aio_job_mtx); 1942 if (cbe->jobstate == JOBST_JOBQGLOBAL) { 1943 TAILQ_REMOVE(&aio_jobs, cbe, list); 1944 remove = 1; 1945 } else if (cbe->jobstate == JOBST_JOBQSOCK) { 1946 MPASS(fp->f_type == DTYPE_SOCKET); 1947 so = fp->f_data; 1948 TAILQ_REMOVE(&so->so_aiojobq, cbe, list); 1949 remove = 1; 1950 } else if (cbe->jobstate == JOBST_JOBQSYNC) { 1951 TAILQ_REMOVE(&ki->kaio_syncqueue, cbe, list); 1952 remove = 1; 1953 } 1954 mtx_unlock(&aio_job_mtx); 1955 1956 if (remove) { 1957 TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist); 1958 cbe->uaiocb._aiocb_private.status = -1; 1959 cbe->uaiocb._aiocb_private.error = ECANCELED; 1960 aio_bio_done_notify(p, cbe, DONE_QUEUE); 1961 cancelled++; 1962 } else { 1963 notcancelled++; 1964 } 1965 if (uap->aiocbp != NULL) 1966 break; 1967 } 1968 } 1969 AIO_UNLOCK(ki); 1970 1971 done: 1972 fdrop(fp, td); 1973 1974 if (uap->aiocbp != NULL) { 1975 if (cancelled) { 1976 td->td_retval[0] = AIO_CANCELED; 1977 return (0); 1978 } 1979 } 1980 1981 if (notcancelled) { 1982 td->td_retval[0] = AIO_NOTCANCELED; 1983 return (0); 1984 } 1985 1986 if (cancelled) { 1987 td->td_retval[0] = AIO_CANCELED; 1988 return (0); 1989 } 1990 1991 td->td_retval[0] = AIO_ALLDONE; 1992 1993 return (0); 1994 } 1995 1996 /* 1997 * aio_error is implemented in the kernel level for compatibility purposes 1998 * only. For a user mode async implementation, it would be best to do it in 1999 * a userland subroutine. 
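 *
 * The usual userland pattern this call supports, stated loosely:
 *
 *	while (aio_error(&cb) == EINPROGRESS)
 *		;	(or block in aio_suspend()/aio_waitcomplete() instead)
 *	nbytes = aio_return(&cb);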
2000 */ 2001 static int 2002 kern_aio_error(struct thread *td, struct aiocb *aiocbp, struct aiocb_ops *ops) 2003 { 2004 struct proc *p = td->td_proc; 2005 struct aiocblist *cb; 2006 struct kaioinfo *ki; 2007 int status; 2008 2009 ki = p->p_aioinfo; 2010 if (ki == NULL) { 2011 td->td_retval[0] = EINVAL; 2012 return (0); 2013 } 2014 2015 AIO_LOCK(ki); 2016 TAILQ_FOREACH(cb, &ki->kaio_all, allist) { 2017 if (cb->uuaiocb == aiocbp) { 2018 if (cb->jobstate == JOBST_JOBFINISHED) 2019 td->td_retval[0] = 2020 cb->uaiocb._aiocb_private.error; 2021 else 2022 td->td_retval[0] = EINPROGRESS; 2023 AIO_UNLOCK(ki); 2024 return (0); 2025 } 2026 } 2027 AIO_UNLOCK(ki); 2028 2029 /* 2030 * Hack for failure of aio_aqueue. 2031 */ 2032 status = ops->fetch_status(aiocbp); 2033 if (status == -1) { 2034 td->td_retval[0] = ops->fetch_error(aiocbp); 2035 return (0); 2036 } 2037 2038 td->td_retval[0] = EINVAL; 2039 return (0); 2040 } 2041 2042 int 2043 aio_error(struct thread *td, struct aio_error_args *uap) 2044 { 2045 2046 return (kern_aio_error(td, uap->aiocbp, &aiocb_ops)); 2047 } 2048 2049 /* syscall - asynchronous read from a file (REALTIME) */ 2050 int 2051 oaio_read(struct thread *td, struct oaio_read_args *uap) 2052 { 2053 2054 return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, 2055 &aiocb_ops_osigevent)); 2056 } 2057 2058 int 2059 aio_read(struct thread *td, struct aio_read_args *uap) 2060 { 2061 2062 return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READ, &aiocb_ops)); 2063 } 2064 2065 /* syscall - asynchronous write to a file (REALTIME) */ 2066 int 2067 oaio_write(struct thread *td, struct oaio_write_args *uap) 2068 { 2069 2070 return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, 2071 &aiocb_ops_osigevent)); 2072 } 2073 2074 int 2075 aio_write(struct thread *td, struct aio_write_args *uap) 2076 { 2077 2078 return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops)); 2079 } 2080 2081 static int 2082 kern_lio_listio(struct thread *td, int mode, struct aiocb * const *uacb_list, 2083 struct aiocb **acb_list, int nent, struct sigevent *sig, 2084 struct aiocb_ops *ops) 2085 { 2086 struct proc *p = td->td_proc; 2087 struct aiocb *iocb; 2088 struct kaioinfo *ki; 2089 struct aioliojob *lj; 2090 struct kevent kev; 2091 int error; 2092 int nerror; 2093 int i; 2094 2095 if ((mode != LIO_NOWAIT) && (mode != LIO_WAIT)) 2096 return (EINVAL); 2097 2098 if (nent < 0 || nent > AIO_LISTIO_MAX) 2099 return (EINVAL); 2100 2101 if (p->p_aioinfo == NULL) 2102 aio_init_aioinfo(p); 2103 2104 ki = p->p_aioinfo; 2105 2106 lj = uma_zalloc(aiolio_zone, M_WAITOK); 2107 lj->lioj_flags = 0; 2108 lj->lioj_count = 0; 2109 lj->lioj_finished_count = 0; 2110 knlist_init_mtx(&lj->klist, AIO_MTX(ki)); 2111 ksiginfo_init(&lj->lioj_ksi); 2112 2113 /* 2114 * Setup signal. 
2115 */ 2116 if (sig && (mode == LIO_NOWAIT)) { 2117 bcopy(sig, &lj->lioj_signal, sizeof(lj->lioj_signal)); 2118 if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { 2119 /* Assume only new style KEVENT */ 2120 kev.filter = EVFILT_LIO; 2121 kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1; 2122 kev.ident = (uintptr_t)uacb_list; /* something unique */ 2123 kev.data = (intptr_t)lj; 2124 /* pass user defined sigval data */ 2125 kev.udata = lj->lioj_signal.sigev_value.sival_ptr; 2126 error = kqfd_register( 2127 lj->lioj_signal.sigev_notify_kqueue, &kev, td, 1); 2128 if (error) { 2129 uma_zfree(aiolio_zone, lj); 2130 return (error); 2131 } 2132 } else if (lj->lioj_signal.sigev_notify == SIGEV_NONE) { 2133 ; 2134 } else if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || 2135 lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID) { 2136 if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) { 2137 uma_zfree(aiolio_zone, lj); 2138 return EINVAL; 2139 } 2140 lj->lioj_flags |= LIOJ_SIGNAL; 2141 } else { 2142 uma_zfree(aiolio_zone, lj); 2143 return EINVAL; 2144 } 2145 } 2146 2147 AIO_LOCK(ki); 2148 TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list); 2149 /* 2150 * Add extra aiocb count to avoid the lio to be freed 2151 * by other threads doing aio_waitcomplete or aio_return, 2152 * and prevent event from being sent until we have queued 2153 * all tasks. 2154 */ 2155 lj->lioj_count = 1; 2156 AIO_UNLOCK(ki); 2157 2158 /* 2159 * Get pointers to the list of I/O requests. 2160 */ 2161 nerror = 0; 2162 for (i = 0; i < nent; i++) { 2163 iocb = acb_list[i]; 2164 if (iocb != NULL) { 2165 error = aio_aqueue(td, iocb, lj, LIO_NOP, ops); 2166 if (error != 0) 2167 nerror++; 2168 } 2169 } 2170 2171 error = 0; 2172 AIO_LOCK(ki); 2173 if (mode == LIO_WAIT) { 2174 while (lj->lioj_count - 1 != lj->lioj_finished_count) { 2175 ki->kaio_flags |= KAIO_WAKEUP; 2176 error = msleep(&p->p_aioinfo, AIO_MTX(ki), 2177 PRIBIO | PCATCH, "aiospn", 0); 2178 if (error == ERESTART) 2179 error = EINTR; 2180 if (error) 2181 break; 2182 } 2183 } else { 2184 if (lj->lioj_count - 1 == lj->lioj_finished_count) { 2185 if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { 2186 lj->lioj_flags |= LIOJ_KEVENT_POSTED; 2187 KNOTE_LOCKED(&lj->klist, 1); 2188 } 2189 if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) 2190 == LIOJ_SIGNAL 2191 && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || 2192 lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) { 2193 aio_sendsig(p, &lj->lioj_signal, 2194 &lj->lioj_ksi); 2195 lj->lioj_flags |= LIOJ_SIGNAL_POSTED; 2196 } 2197 } 2198 } 2199 lj->lioj_count--; 2200 if (lj->lioj_count == 0) { 2201 TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); 2202 knlist_delete(&lj->klist, curthread, 1); 2203 PROC_LOCK(p); 2204 sigqueue_take(&lj->lioj_ksi); 2205 PROC_UNLOCK(p); 2206 AIO_UNLOCK(ki); 2207 uma_zfree(aiolio_zone, lj); 2208 } else 2209 AIO_UNLOCK(ki); 2210 2211 if (nerror) 2212 return (EIO); 2213 return (error); 2214 } 2215 2216 /* syscall - list directed I/O (REALTIME) */ 2217 int 2218 olio_listio(struct thread *td, struct olio_listio_args *uap) 2219 { 2220 struct aiocb **acb_list; 2221 struct sigevent *sigp, sig; 2222 struct osigevent osig; 2223 int error, nent; 2224 2225 if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) 2226 return (EINVAL); 2227 2228 nent = uap->nent; 2229 if (nent < 0 || nent > AIO_LISTIO_MAX) 2230 return (EINVAL); 2231 2232 if (uap->sig && (uap->mode == LIO_NOWAIT)) { 2233 error = copyin(uap->sig, &osig, sizeof(osig)); 2234 if (error) 2235 return (error); 2236 error = convert_old_sigevent(&osig, &sig); 
2237 if (error) 2238 return (error); 2239 sigp = &sig; 2240 } else 2241 sigp = NULL; 2242 2243 acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); 2244 error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0])); 2245 if (error == 0) 2246 error = kern_lio_listio(td, uap->mode, 2247 (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, 2248 &aiocb_ops_osigevent); 2249 free(acb_list, M_LIO); 2250 return (error); 2251 } 2252 2253 /* syscall - list directed I/O (REALTIME) */ 2254 int 2255 lio_listio(struct thread *td, struct lio_listio_args *uap) 2256 { 2257 struct aiocb **acb_list; 2258 struct sigevent *sigp, sig; 2259 int error, nent; 2260 2261 if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) 2262 return (EINVAL); 2263 2264 nent = uap->nent; 2265 if (nent < 0 || nent > AIO_LISTIO_MAX) 2266 return (EINVAL); 2267 2268 if (uap->sig && (uap->mode == LIO_NOWAIT)) { 2269 error = copyin(uap->sig, &sig, sizeof(sig)); 2270 if (error) 2271 return (error); 2272 sigp = &sig; 2273 } else 2274 sigp = NULL; 2275 2276 acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); 2277 error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0])); 2278 if (error == 0) 2279 error = kern_lio_listio(td, uap->mode, uap->acb_list, acb_list, 2280 nent, sigp, &aiocb_ops); 2281 free(acb_list, M_LIO); 2282 return (error); 2283 } 2284 2285 /* 2286 * Called from interrupt thread for physio, we should return as fast 2287 * as possible, so we schedule a biohelper task. 2288 */ 2289 static void 2290 aio_physwakeup(struct buf *bp) 2291 { 2292 struct aiocblist *aiocbe; 2293 2294 aiocbe = (struct aiocblist *)bp->b_caller1; 2295 taskqueue_enqueue(taskqueue_aiod_bio, &aiocbe->biotask); 2296 } 2297 2298 /* 2299 * Task routine to perform heavy tasks, process wakeup, and signals. 2300 */ 2301 static void 2302 biohelper(void *context, int pending) 2303 { 2304 struct aiocblist *aiocbe = context; 2305 struct buf *bp; 2306 struct proc *userp; 2307 struct kaioinfo *ki; 2308 int nblks; 2309 2310 bp = aiocbe->bp; 2311 userp = aiocbe->userproc; 2312 ki = userp->p_aioinfo; 2313 AIO_LOCK(ki); 2314 aiocbe->uaiocb._aiocb_private.status -= bp->b_resid; 2315 aiocbe->uaiocb._aiocb_private.error = 0; 2316 if (bp->b_ioflags & BIO_ERROR) 2317 aiocbe->uaiocb._aiocb_private.error = bp->b_error; 2318 nblks = btodb(aiocbe->uaiocb.aio_nbytes); 2319 if (aiocbe->uaiocb.aio_lio_opcode == LIO_WRITE) 2320 aiocbe->outputcharge += nblks; 2321 else 2322 aiocbe->inputcharge += nblks; 2323 aiocbe->bp = NULL; 2324 TAILQ_REMOVE(&userp->p_aioinfo->kaio_bufqueue, aiocbe, plist); 2325 ki->kaio_buffer_count--; 2326 aio_bio_done_notify(userp, aiocbe, DONE_BUF); 2327 AIO_UNLOCK(ki); 2328 2329 /* Release mapping into kernel space. 
*/ 2330 vunmapbuf(bp); 2331 relpbuf(bp, NULL); 2332 atomic_subtract_int(&num_buf_aio, 1); 2333 } 2334 2335 /* syscall - wait for the next completion of an aio request */ 2336 static int 2337 kern_aio_waitcomplete(struct thread *td, struct aiocb **aiocbp, 2338 struct timespec *ts, struct aiocb_ops *ops) 2339 { 2340 struct proc *p = td->td_proc; 2341 struct timeval atv; 2342 struct kaioinfo *ki; 2343 struct aiocblist *cb; 2344 struct aiocb *uuaiocb; 2345 int error, status, timo; 2346 2347 ops->store_aiocb(aiocbp, NULL); 2348 2349 timo = 0; 2350 if (ts) { 2351 if ((ts->tv_nsec < 0) || (ts->tv_nsec >= 1000000000)) 2352 return (EINVAL); 2353 2354 TIMESPEC_TO_TIMEVAL(&atv, ts); 2355 if (itimerfix(&atv)) 2356 return (EINVAL); 2357 timo = tvtohz(&atv); 2358 } 2359 2360 if (p->p_aioinfo == NULL) 2361 aio_init_aioinfo(p); 2362 ki = p->p_aioinfo; 2363 2364 error = 0; 2365 cb = NULL; 2366 AIO_LOCK(ki); 2367 while ((cb = TAILQ_FIRST(&ki->kaio_done)) == NULL) { 2368 ki->kaio_flags |= KAIO_WAKEUP; 2369 error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH, 2370 "aiowc", timo); 2371 if (timo && error == ERESTART) 2372 error = EINTR; 2373 if (error) 2374 break; 2375 } 2376 2377 if (cb != NULL) { 2378 MPASS(cb->jobstate == JOBST_JOBFINISHED); 2379 uuaiocb = cb->uuaiocb; 2380 status = cb->uaiocb._aiocb_private.status; 2381 error = cb->uaiocb._aiocb_private.error; 2382 td->td_retval[0] = status; 2383 if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) { 2384 td->td_ru.ru_oublock += cb->outputcharge; 2385 cb->outputcharge = 0; 2386 } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) { 2387 td->td_ru.ru_inblock += cb->inputcharge; 2388 cb->inputcharge = 0; 2389 } 2390 aio_free_entry(cb); 2391 AIO_UNLOCK(ki); 2392 ops->store_aiocb(aiocbp, uuaiocb); 2393 ops->store_error(uuaiocb, error); 2394 ops->store_status(uuaiocb, status); 2395 } else 2396 AIO_UNLOCK(ki); 2397 2398 return (error); 2399 } 2400 2401 int 2402 aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap) 2403 { 2404 struct timespec ts, *tsp; 2405 int error; 2406 2407 if (uap->timeout) { 2408 /* Get timespec struct. */ 2409 error = copyin(uap->timeout, &ts, sizeof(ts)); 2410 if (error) 2411 return (error); 2412 tsp = &ts; 2413 } else 2414 tsp = NULL; 2415 2416 return (kern_aio_waitcomplete(td, uap->aiocbp, tsp, &aiocb_ops)); 2417 } 2418 2419 static int 2420 kern_aio_fsync(struct thread *td, int op, struct aiocb *aiocbp, 2421 struct aiocb_ops *ops) 2422 { 2423 struct proc *p = td->td_proc; 2424 struct kaioinfo *ki; 2425 2426 if (op != O_SYNC) /* XXX lack of O_DSYNC */ 2427 return (EINVAL); 2428 ki = p->p_aioinfo; 2429 if (ki == NULL) 2430 aio_init_aioinfo(p); 2431 return (aio_aqueue(td, aiocbp, NULL, LIO_SYNC, ops)); 2432 } 2433 2434 int 2435 aio_fsync(struct thread *td, struct aio_fsync_args *uap) 2436 { 2437 2438 return (kern_aio_fsync(td, uap->op, uap->aiocbp, &aiocb_ops)); 2439 } 2440 2441 /* kqueue attach function */ 2442 static int 2443 filt_aioattach(struct knote *kn) 2444 { 2445 struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata; 2446 2447 /* 2448 * The aiocbe pointer must be validated before using it, so 2449 * registration is restricted to the kernel; the user cannot 2450 * set EV_FLAG1. 
2451 */ 2452 if ((kn->kn_flags & EV_FLAG1) == 0) 2453 return (EPERM); 2454 kn->kn_ptr.p_aio = aiocbe; 2455 kn->kn_flags &= ~EV_FLAG1; 2456 2457 knlist_add(&aiocbe->klist, kn, 0); 2458 2459 return (0); 2460 } 2461 2462 /* kqueue detach function */ 2463 static void 2464 filt_aiodetach(struct knote *kn) 2465 { 2466 struct aiocblist *aiocbe = kn->kn_ptr.p_aio; 2467 2468 if (!knlist_empty(&aiocbe->klist)) 2469 knlist_remove(&aiocbe->klist, kn, 0); 2470 } 2471 2472 /* kqueue filter function */ 2473 /*ARGSUSED*/ 2474 static int 2475 filt_aio(struct knote *kn, long hint) 2476 { 2477 struct aiocblist *aiocbe = kn->kn_ptr.p_aio; 2478 2479 kn->kn_data = aiocbe->uaiocb._aiocb_private.error; 2480 if (aiocbe->jobstate != JOBST_JOBFINISHED) 2481 return (0); 2482 kn->kn_flags |= EV_EOF; 2483 return (1); 2484 } 2485 2486 /* kqueue attach function */ 2487 static int 2488 filt_lioattach(struct knote *kn) 2489 { 2490 struct aioliojob * lj = (struct aioliojob *)kn->kn_sdata; 2491 2492 /* 2493 * The aioliojob pointer must be validated before using it, so 2494 * registration is restricted to the kernel; the user cannot 2495 * set EV_FLAG1. 2496 */ 2497 if ((kn->kn_flags & EV_FLAG1) == 0) 2498 return (EPERM); 2499 kn->kn_ptr.p_lio = lj; 2500 kn->kn_flags &= ~EV_FLAG1; 2501 2502 knlist_add(&lj->klist, kn, 0); 2503 2504 return (0); 2505 } 2506 2507 /* kqueue detach function */ 2508 static void 2509 filt_liodetach(struct knote *kn) 2510 { 2511 struct aioliojob * lj = kn->kn_ptr.p_lio; 2512 2513 if (!knlist_empty(&lj->klist)) 2514 knlist_remove(&lj->klist, kn, 0); 2515 } 2516 2517 /* kqueue filter function */ 2518 /*ARGSUSED*/ 2519 static int 2520 filt_lio(struct knote *kn, long hint) 2521 { 2522 struct aioliojob * lj = kn->kn_ptr.p_lio; 2523 2524 return (lj->lioj_flags & LIOJ_KEVENT_POSTED); 2525 } 2526 2527 #ifdef COMPAT_IA32 2528 #include <sys/mount.h> 2529 #include <sys/socket.h> 2530 #include <compat/freebsd32/freebsd32.h> 2531 #include <compat/freebsd32/freebsd32_proto.h> 2532 #include <compat/freebsd32/freebsd32_signal.h> 2533 #include <compat/freebsd32/freebsd32_syscall.h> 2534 #include <compat/freebsd32/freebsd32_util.h> 2535 2536 struct __aiocb_private32 { 2537 int32_t status; 2538 int32_t error; 2539 uint32_t kernelinfo; 2540 }; 2541 2542 typedef struct oaiocb32 { 2543 int aio_fildes; /* File descriptor */ 2544 uint64_t aio_offset __packed; /* File offset for I/O */ 2545 uint32_t aio_buf; /* I/O buffer in process space */ 2546 uint32_t aio_nbytes; /* Number of bytes for I/O */ 2547 struct osigevent32 aio_sigevent; /* Signal to deliver */ 2548 int aio_lio_opcode; /* LIO opcode */ 2549 int aio_reqprio; /* Request priority -- ignored */ 2550 struct __aiocb_private32 _aiocb_private; 2551 } oaiocb32_t; 2552 2553 typedef struct aiocb32 { 2554 int32_t aio_fildes; /* File descriptor */ 2555 uint64_t aio_offset __packed; /* File offset for I/O */ 2556 uint32_t aio_buf; /* I/O buffer in process space */ 2557 uint32_t aio_nbytes; /* Number of bytes for I/O */ 2558 int __spare__[2]; 2559 uint32_t __spare2__; 2560 int aio_lio_opcode; /* LIO opcode */ 2561 int aio_reqprio; /* Request priority -- ignored */ 2562 struct __aiocb_private32 _aiocb_private; 2563 struct sigevent32 aio_sigevent; /* Signal to deliver */ 2564 } aiocb32_t; 2565 2566 static int 2567 convert_old_sigevent32(struct osigevent32 *osig, struct sigevent *nsig) 2568 { 2569 2570 /* 2571 * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are 2572 * supported by AIO with the old sigevent structure. 
2573 */ 2574 CP(*osig, *nsig, sigev_notify); 2575 switch (nsig->sigev_notify) { 2576 case SIGEV_NONE: 2577 break; 2578 case SIGEV_SIGNAL: 2579 nsig->sigev_signo = osig->__sigev_u.__sigev_signo; 2580 break; 2581 case SIGEV_KEVENT: 2582 nsig->sigev_notify_kqueue = 2583 osig->__sigev_u.__sigev_notify_kqueue; 2584 PTRIN_CP(*osig, *nsig, sigev_value.sival_ptr); 2585 break; 2586 default: 2587 return (EINVAL); 2588 } 2589 return (0); 2590 } 2591 2592 static int 2593 aiocb32_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob) 2594 { 2595 struct oaiocb32 job32; 2596 int error; 2597 2598 bzero(kjob, sizeof(struct aiocb)); 2599 error = copyin(ujob, &job32, sizeof(job32)); 2600 if (error) 2601 return (error); 2602 2603 CP(job32, *kjob, aio_fildes); 2604 CP(job32, *kjob, aio_offset); 2605 PTRIN_CP(job32, *kjob, aio_buf); 2606 CP(job32, *kjob, aio_nbytes); 2607 CP(job32, *kjob, aio_lio_opcode); 2608 CP(job32, *kjob, aio_reqprio); 2609 CP(job32, *kjob, _aiocb_private.status); 2610 CP(job32, *kjob, _aiocb_private.error); 2611 PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo); 2612 return (convert_old_sigevent32(&job32.aio_sigevent, 2613 &kjob->aio_sigevent)); 2614 } 2615 2616 static int 2617 convert_sigevent32(struct sigevent32 *sig32, struct sigevent *sig) 2618 { 2619 2620 CP(*sig32, *sig, sigev_notify); 2621 switch (sig->sigev_notify) { 2622 case SIGEV_NONE: 2623 break; 2624 case SIGEV_THREAD_ID: 2625 CP(*sig32, *sig, sigev_notify_thread_id); 2626 /* FALLTHROUGH */ 2627 case SIGEV_SIGNAL: 2628 CP(*sig32, *sig, sigev_signo); 2629 break; 2630 case SIGEV_KEVENT: 2631 CP(*sig32, *sig, sigev_notify_kqueue); 2632 PTRIN_CP(*sig32, *sig, sigev_value.sival_ptr); 2633 break; 2634 default: 2635 return (EINVAL); 2636 } 2637 return (0); 2638 } 2639 2640 static int 2641 aiocb32_copyin(struct aiocb *ujob, struct aiocb *kjob) 2642 { 2643 struct aiocb32 job32; 2644 int error; 2645 2646 error = copyin(ujob, &job32, sizeof(job32)); 2647 if (error) 2648 return (error); 2649 CP(job32, *kjob, aio_fildes); 2650 CP(job32, *kjob, aio_offset); 2651 PTRIN_CP(job32, *kjob, aio_buf); 2652 CP(job32, *kjob, aio_nbytes); 2653 CP(job32, *kjob, aio_lio_opcode); 2654 CP(job32, *kjob, aio_reqprio); 2655 CP(job32, *kjob, _aiocb_private.status); 2656 CP(job32, *kjob, _aiocb_private.error); 2657 PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo); 2658 return (convert_sigevent32(&job32.aio_sigevent, &kjob->aio_sigevent)); 2659 } 2660 2661 static long 2662 aiocb32_fetch_status(struct aiocb *ujob) 2663 { 2664 struct aiocb32 *ujob32; 2665 2666 ujob32 = (struct aiocb32 *)ujob; 2667 return (fuword32(&ujob32->_aiocb_private.status)); 2668 } 2669 2670 static long 2671 aiocb32_fetch_error(struct aiocb *ujob) 2672 { 2673 struct aiocb32 *ujob32; 2674 2675 ujob32 = (struct aiocb32 *)ujob; 2676 return (fuword32(&ujob32->_aiocb_private.error)); 2677 } 2678 2679 static int 2680 aiocb32_store_status(struct aiocb *ujob, long status) 2681 { 2682 struct aiocb32 *ujob32; 2683 2684 ujob32 = (struct aiocb32 *)ujob; 2685 return (suword32(&ujob32->_aiocb_private.status, status)); 2686 } 2687 2688 static int 2689 aiocb32_store_error(struct aiocb *ujob, long error) 2690 { 2691 struct aiocb32 *ujob32; 2692 2693 ujob32 = (struct aiocb32 *)ujob; 2694 return (suword32(&ujob32->_aiocb_private.error, error)); 2695 } 2696 2697 static int 2698 aiocb32_store_kernelinfo(struct aiocb *ujob, long jobref) 2699 { 2700 struct aiocb32 *ujob32; 2701 2702 ujob32 = (struct aiocb32 *)ujob; 2703 return (suword32(&ujob32->_aiocb_private.kernelinfo, jobref)); 2704 } 2705 2706 static 
int 2707 aiocb32_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob) 2708 { 2709 2710 return (suword32(ujobp, (long)ujob)); 2711 } 2712 2713 static struct aiocb_ops aiocb32_ops = { 2714 .copyin = aiocb32_copyin, 2715 .fetch_status = aiocb32_fetch_status, 2716 .fetch_error = aiocb32_fetch_error, 2717 .store_status = aiocb32_store_status, 2718 .store_error = aiocb32_store_error, 2719 .store_kernelinfo = aiocb32_store_kernelinfo, 2720 .store_aiocb = aiocb32_store_aiocb, 2721 }; 2722 2723 static struct aiocb_ops aiocb32_ops_osigevent = { 2724 .copyin = aiocb32_copyin_old_sigevent, 2725 .fetch_status = aiocb32_fetch_status, 2726 .fetch_error = aiocb32_fetch_error, 2727 .store_status = aiocb32_store_status, 2728 .store_error = aiocb32_store_error, 2729 .store_kernelinfo = aiocb32_store_kernelinfo, 2730 .store_aiocb = aiocb32_store_aiocb, 2731 }; 2732 2733 int 2734 freebsd32_aio_return(struct thread *td, struct freebsd32_aio_return_args *uap) 2735 { 2736 2737 return (kern_aio_return(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops)); 2738 } 2739 2740 int 2741 freebsd32_aio_suspend(struct thread *td, struct freebsd32_aio_suspend_args *uap) 2742 { 2743 struct timespec32 ts32; 2744 struct timespec ts, *tsp; 2745 struct aiocb **ujoblist; 2746 uint32_t *ujoblist32; 2747 int error, i; 2748 2749 if (uap->nent < 0 || uap->nent > AIO_LISTIO_MAX) 2750 return (EINVAL); 2751 2752 if (uap->timeout) { 2753 /* Get timespec struct. */ 2754 if ((error = copyin(uap->timeout, &ts32, sizeof(ts32))) != 0) 2755 return (error); 2756 CP(ts32, ts, tv_sec); 2757 CP(ts32, ts, tv_nsec); 2758 tsp = &ts; 2759 } else 2760 tsp = NULL; 2761 2762 ujoblist = uma_zalloc(aiol_zone, M_WAITOK); 2763 ujoblist32 = (uint32_t *)ujoblist; 2764 error = copyin(uap->aiocbp, ujoblist32, uap->nent * 2765 sizeof(ujoblist32[0])); 2766 if (error == 0) { 2767 for (i = uap->nent - 1; i >= 0; i--) 2768 ujoblist[i] = PTRIN(ujoblist32[i]); 2769 2770 error = kern_aio_suspend(td, uap->nent, ujoblist, tsp); 2771 } 2772 uma_zfree(aiol_zone, ujoblist); 2773 return (error); 2774 } 2775 2776 int 2777 freebsd32_aio_cancel(struct thread *td, struct freebsd32_aio_cancel_args *uap) 2778 { 2779 2780 return (aio_cancel(td, (struct aio_cancel_args *)uap)); 2781 } 2782 2783 int 2784 freebsd32_aio_error(struct thread *td, struct freebsd32_aio_error_args *uap) 2785 { 2786 2787 return (kern_aio_error(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops)); 2788 } 2789 2790 int 2791 freebsd32_oaio_read(struct thread *td, struct freebsd32_oaio_read_args *uap) 2792 { 2793 2794 return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, 2795 &aiocb32_ops_osigevent)); 2796 } 2797 2798 int 2799 freebsd32_aio_read(struct thread *td, struct freebsd32_aio_read_args *uap) 2800 { 2801 2802 return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, 2803 &aiocb32_ops)); 2804 } 2805 2806 int 2807 freebsd32_oaio_write(struct thread *td, struct freebsd32_oaio_write_args *uap) 2808 { 2809 2810 return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, 2811 &aiocb32_ops_osigevent)); 2812 } 2813 2814 int 2815 freebsd32_aio_write(struct thread *td, struct freebsd32_aio_write_args *uap) 2816 { 2817 2818 return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, 2819 &aiocb32_ops)); 2820 } 2821 2822 int 2823 freebsd32_aio_waitcomplete(struct thread *td, 2824 struct freebsd32_aio_waitcomplete_args *uap) 2825 { 2826 struct timespec32 ts32; 2827 struct timespec ts, *tsp; 2828 int error; 2829 2830 if (uap->timeout) { 2831 /* Get timespec struct.
*/ 2832 error = copyin(uap->timeout, &ts32, sizeof(ts32)); 2833 if (error) 2834 return (error); 2835 CP(ts32, ts, tv_sec); 2836 CP(ts32, ts, tv_nsec); 2837 tsp = &ts; 2838 } else 2839 tsp = NULL; 2840 2841 return (kern_aio_waitcomplete(td, (struct aiocb **)uap->aiocbp, tsp, 2842 &aiocb32_ops)); 2843 } 2844 2845 int 2846 freebsd32_aio_fsync(struct thread *td, struct freebsd32_aio_fsync_args *uap) 2847 { 2848 2849 return (kern_aio_fsync(td, uap->op, (struct aiocb *)uap->aiocbp, 2850 &aiocb32_ops)); 2851 } 2852 2853 int 2854 freebsd32_olio_listio(struct thread *td, struct freebsd32_olio_listio_args *uap) 2855 { 2856 struct aiocb **acb_list; 2857 struct sigevent *sigp, sig; 2858 struct osigevent32 osig; 2859 uint32_t *acb_list32; 2860 int error, i, nent; 2861 2862 if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) 2863 return (EINVAL); 2864 2865 nent = uap->nent; 2866 if (nent < 0 || nent > AIO_LISTIO_MAX) 2867 return (EINVAL); 2868 2869 if (uap->sig && (uap->mode == LIO_NOWAIT)) { 2870 error = copyin(uap->sig, &osig, sizeof(osig)); 2871 if (error) 2872 return (error); 2873 error = convert_old_sigevent32(&osig, &sig); 2874 if (error) 2875 return (error); 2876 sigp = &sig; 2877 } else 2878 sigp = NULL; 2879 2880 acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK); 2881 error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t)); 2882 if (error) { 2883 free(acb_list32, M_LIO); 2884 return (error); 2885 } 2886 acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); 2887 for (i = 0; i < nent; i++) 2888 acb_list[i] = PTRIN(acb_list32[i]); 2889 free(acb_list32, M_LIO); 2890 2891 error = kern_lio_listio(td, uap->mode, 2892 (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, 2893 &aiocb32_ops_osigevent); 2894 free(acb_list, M_LIO); 2895 return (error); 2896 } 2897 2898 int 2899 freebsd32_lio_listio(struct thread *td, struct freebsd32_lio_listio_args *uap) 2900 { 2901 struct aiocb **acb_list; 2902 struct sigevent *sigp, sig; 2903 struct sigevent32 sig32; 2904 uint32_t *acb_list32; 2905 int error, i, nent; 2906 2907 if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) 2908 return (EINVAL); 2909 2910 nent = uap->nent; 2911 if (nent < 0 || nent > AIO_LISTIO_MAX) 2912 return (EINVAL); 2913 2914 if (uap->sig && (uap->mode == LIO_NOWAIT)) { 2915 error = copyin(uap->sig, &sig32, sizeof(sig32)); 2916 if (error) 2917 return (error); 2918 error = convert_sigevent32(&sig32, &sig); 2919 if (error) 2920 return (error); 2921 sigp = &sig; 2922 } else 2923 sigp = NULL; 2924 2925 acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK); 2926 error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t)); 2927 if (error) { 2928 free(acb_list32, M_LIO); 2929 return (error); 2930 } 2931 acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); 2932 for (i = 0; i < nent; i++) 2933 acb_list[i] = PTRIN(acb_list32[i]); 2934 free(acb_list32, M_LIO); 2935 2936 error = kern_lio_listio(td, uap->mode, 2937 (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, 2938 &aiocb32_ops); 2939 free(acb_list, M_LIO); 2940 return (error); 2941 } 2942 2943 SYSCALL32_MODULE_HELPER(freebsd32_aio_return); 2944 SYSCALL32_MODULE_HELPER(freebsd32_aio_suspend); 2945 SYSCALL32_MODULE_HELPER(freebsd32_aio_cancel); 2946 SYSCALL32_MODULE_HELPER(freebsd32_aio_error); 2947 SYSCALL32_MODULE_HELPER(freebsd32_aio_fsync); 2948 SYSCALL32_MODULE_HELPER(freebsd32_aio_read); 2949 SYSCALL32_MODULE_HELPER(freebsd32_aio_write); 2950 SYSCALL32_MODULE_HELPER(freebsd32_aio_waitcomplete); 2951 
SYSCALL32_MODULE_HELPER(freebsd32_lio_listio); 2952 SYSCALL32_MODULE_HELPER(freebsd32_oaio_read); 2953 SYSCALL32_MODULE_HELPER(freebsd32_oaio_write); 2954 SYSCALL32_MODULE_HELPER(freebsd32_olio_listio); 2955 #endif 2956
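
/*
 * Illustrative userland sketch (kept under #if 0 so it is never compiled
 * into the kernel): one plausible way a process drives the aio_read(),
 * aio_error(), aio_suspend() and aio_return() syscalls implemented above.
 * The path "example.dat" and the name example_aio_read are placeholders;
 * error handling is abbreviated.
 */
#if 0
#include <aio.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int
example_aio_read(void)
{
	struct aiocb cb;
	const struct aiocb *list[1];
	char buf[512];
	ssize_t n;
	int error, fd;

	fd = open("example.dat", O_RDONLY);
	if (fd == -1)
		return (-1);

	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;
	cb.aio_buf = buf;
	cb.aio_nbytes = sizeof(buf);
	cb.aio_offset = 0;

	/* Queue the request; aio_aqueue() picks it up in the kernel. */
	if (aio_read(&cb) == -1) {
		close(fd);
		return (-1);
	}

	/* Sleep until it finishes; this exercises kern_aio_suspend(). */
	list[0] = &cb;
	while ((error = aio_error(&cb)) == EINPROGRESS)
		(void)aio_suspend(list, 1, NULL);

	/* Reap the result so kern_aio_return() can free its bookkeeping. */
	n = aio_return(&cb);
	printf("read %zd bytes, completion status %d\n", n, error);
	close(fd);
	return (n == -1 ? -1 : 0);
}
#endif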
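
/*
 * Illustrative userland sketch (also under #if 0): completion notification
 * for lio_listio() through kqueue, i.e. the SIGEV_KEVENT/EVFILT_LIO path
 * registered via kqfd_register() and reported by filt_lio() above.  The
 * paths "a.dat"/"b.dat" and the name example_lio_kevent are placeholders;
 * error handling is abbreviated.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>

#include <aio.h>
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int
example_lio_kevent(void)
{
	struct aiocb cb[2], *list[2];
	struct sigevent sev;
	struct kevent ev;
	char buf[2][256];
	int fd[2], i, kq;

	fd[0] = open("a.dat", O_RDONLY);
	fd[1] = open("b.dat", O_RDONLY);
	kq = kqueue();

	memset(cb, 0, sizeof(cb));
	for (i = 0; i < 2; i++) {
		cb[i].aio_fildes = fd[i];
		cb[i].aio_buf = buf[i];
		cb[i].aio_nbytes = sizeof(buf[i]);
		cb[i].aio_lio_opcode = LIO_READ;
		list[i] = &cb[i];
	}

	/* Request one EVFILT_LIO event on kq when the whole list is done. */
	memset(&sev, 0, sizeof(sev));
	sev.sigev_notify = SIGEV_KEVENT;
	sev.sigev_notify_kqueue = kq;
	sev.sigev_value.sival_ptr = list;

	if (lio_listio(LIO_NOWAIT, list, 2, &sev) == -1)
		return (-1);

	/* Delivered by filt_lio() after the last job in the lio finishes. */
	if (kevent(kq, NULL, 0, &ev, 1, NULL) == 1)
		printf("lio done, udata %p\n", ev.udata);

	for (i = 0; i < 2; i++)
		printf("job %d returned %zd\n", i, aio_return(&cb[i]));

	close(kq);
	close(fd[0]);
	close(fd[1]);
	return (0);
}
#endif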