/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 1997 John S. Dyson. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. John S. Dyson's name may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * DISCLAIMER: This code isn't warranted to do anything useful. Anything
 * bad that happens because of using this software isn't the responsibility
 * of the author. This software is distributed AS-IS.
 */

/*
 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/capsicum.h>
#include <sys/eventhandler.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/kthread.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/unistd.h>
#include <sys/posix4.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/syscallsubr.h>
#include <sys/protosw.h>
#include <sys/rwlock.h>
#include <sys/sema.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syscall.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/sx.h>
#include <sys/taskqueue.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <sys/event.h>
#include <sys/mount.h>
#include <geom/geom.h>

#include <machine/atomic.h>

#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vnode_pager.h>
#include <vm/uma.h>
#include <sys/aio.h>

/*
 * Counter for allocating reference ids to new jobs. Wrapped to 1 on
 * overflow. (XXX will be removed soon.)
 */
static u_long jobrefid;

/*
 * Counter for aio_fsync.
 */
static uint64_t jobseqno;

#ifndef MAX_AIO_PER_PROC
#define	MAX_AIO_PER_PROC	32
#endif

#ifndef MAX_AIO_QUEUE_PER_PROC
#define	MAX_AIO_QUEUE_PER_PROC	256
#endif

#ifndef MAX_AIO_QUEUE
#define	MAX_AIO_QUEUE		1024 /* Bigger than MAX_AIO_QUEUE_PER_PROC */
#endif

#ifndef MAX_BUF_AIO
#define	MAX_BUF_AIO		16
#endif

FEATURE(aio, "Asynchronous I/O");
SYSCTL_DECL(_p1003_1b);

static MALLOC_DEFINE(M_LIO, "lio", "listio aio control block list");
static MALLOC_DEFINE(M_AIO, "aio", "structures for asynchronous I/O");

static SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Async IO management");

static int enable_aio_unsafe = 0;
SYSCTL_INT(_vfs_aio, OID_AUTO, enable_unsafe, CTLFLAG_RW, &enable_aio_unsafe, 0,
    "Permit asynchronous IO on all file types, not just known-safe types");

static unsigned int unsafe_warningcnt = 1;
SYSCTL_UINT(_vfs_aio, OID_AUTO, unsafe_warningcnt, CTLFLAG_RW,
    &unsafe_warningcnt, 0,
    "Warnings that will be triggered upon failed IO requests on unsafe files");

static int max_aio_procs = MAX_AIO_PROCS;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs, CTLFLAG_RW, &max_aio_procs, 0,
    "Maximum number of kernel processes to use for handling async IO ");

static int num_aio_procs = 0;
SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs, CTLFLAG_RD, &num_aio_procs, 0,
    "Number of presently active kernel processes for async IO");

/*
 * The code will adjust the actual number of AIO processes towards this
 * number when it gets a chance.
 */
static int target_aio_procs = TARGET_AIO_PROCS;
SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs,
    0,
    "Preferred number of ready kernel processes for async IO");

static int max_queue_count = MAX_AIO_QUEUE;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0,
    "Maximum number of aio requests to queue, globally");

static int num_queue_count = 0;
SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0,
    "Number of queued aio requests");

static int num_buf_aio = 0;
SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0,
    "Number of aio requests presently handled by the buf subsystem");

static int num_unmapped_aio = 0;
SYSCTL_INT(_vfs_aio, OID_AUTO, num_unmapped_aio, CTLFLAG_RD, &num_unmapped_aio,
    0,
    "Number of aio requests presently handled by unmapped I/O buffers");

/* Number of async I/O processes in the process of being started */
/* XXX This should be local to aio_aqueue() */
static int num_aio_resv_start = 0;

static int aiod_lifetime;
SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0,
    "Maximum lifetime for idle aiod");

static int max_aio_per_proc = MAX_AIO_PER_PROC;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc,
    0,
    "Maximum active aio requests per process");

static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW,
    &max_aio_queue_per_proc, 0,
    "Maximum queued aio requests per process");

static int max_buf_aio = MAX_BUF_AIO;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0,
    "Maximum buf aio requests per process");

/*
 * Though redundant with vfs.aio.max_aio_queue_per_proc, POSIX requires
 * sysconf(3) to support AIO_LISTIO_MAX, and we implement that with
 * vfs.aio.aio_listio_max.
 */
SYSCTL_INT(_p1003_1b, CTL_P1003_1B_AIO_LISTIO_MAX, aio_listio_max,
    CTLFLAG_RD | CTLFLAG_CAPRD, &max_aio_queue_per_proc,
    0, "Maximum aio requests for a single lio_listio call");

#ifdef COMPAT_FREEBSD6
typedef struct oaiocb {
	int	aio_fildes;		/* File descriptor */
	off_t	aio_offset;		/* File offset for I/O */
	volatile void *aio_buf;		/* I/O buffer in process space */
	size_t	aio_nbytes;		/* Number of bytes for I/O */
	struct	osigevent aio_sigevent;	/* Signal to deliver */
	int	aio_lio_opcode;		/* LIO opcode */
	int	aio_reqprio;		/* Request priority -- ignored */
	struct	__aiocb_private	_aiocb_private;
} oaiocb_t;
#endif

/*
 * Below is a key of locks used to protect each member of struct kaiocb,
 * aioliojob and kaioinfo and any backends.
 *
 * * - need not be protected
 * a - locked by kaioinfo lock
 * b - locked by backend lock, the backend lock can be null in some cases,
 *     for example, BIO belongs to this type, in this case, proc lock is
 *     reused.
 * c - locked by aio_job_mtx, the lock for the generic file I/O backend.
 */

/*
 * If the routine that services an AIO request blocks while running in an
 * AIO kernel process it can starve other I/O requests. BIO requests
 * queued via aio_qbio() complete asynchronously and do not use AIO kernel
 * processes at all. Socket I/O requests use a separate pool of
 * kprocs and also force non-blocking I/O. Other file I/O requests
 * use the generic fo_read/fo_write operations which can block. The
 * fsync and mlock operations can also block while executing. Ideally
 * none of these requests would block while executing.
 *
 * Note that the service routines cannot toggle O_NONBLOCK in the file
 * structure directly while handling a request due to races with
 * userland threads.
 */

/* jobflags */
#define	KAIOCB_QUEUEING		0x01
#define	KAIOCB_CANCELLED	0x02
#define	KAIOCB_CANCELLING	0x04
#define	KAIOCB_CHECKSYNC	0x08
#define	KAIOCB_CLEARED		0x10
#define	KAIOCB_FINISHED		0x20

/*
 * AIO process info
 */
#define	AIOP_FREE	0x1			/* proc on free queue */

struct aioproc {
	int	aioprocflags;			/* (c) AIO proc flags */
	TAILQ_ENTRY(aioproc) list;		/* (c) list of processes */
	struct	proc *aioproc;			/* (*) the AIO proc */
};

/*
 * data-structure for lio signal management
 */
struct aioliojob {
	int	lioj_flags;			/* (a) listio flags */
	int	lioj_count;			/* (a) count of jobs */
	int	lioj_finished_count;		/* (a) count of finished jobs */
	struct	sigevent lioj_signal;		/* (a) signal on all I/O done */
	TAILQ_ENTRY(aioliojob) lioj_list;	/* (a) lio list */
	struct	knlist klist;			/* (a) list of knotes */
	ksiginfo_t	lioj_ksi;		/* (a) Realtime signal info */
};

#define	LIOJ_SIGNAL		0x1	/* signal on all done (lio) */
#define	LIOJ_SIGNAL_POSTED	0x2	/* signal has been posted */
#define	LIOJ_KEVENT_POSTED	0x4	/* kevent triggered */

/*
 * per process aio data structure
 */
struct kaioinfo {
	struct	mtx kaio_mtx;		/* the lock to protect this struct */
	int	kaio_flags;		/* (a) per process kaio flags */
	int	kaio_active_count;	/* (c) number of currently used AIOs */
	int	kaio_count;		/* (a) size of AIO queue */
	int	kaio_buffer_count;	/* (a) number of bio buffers */
	TAILQ_HEAD(,kaiocb) kaio_all;	/* (a) all AIOs in a process */
	TAILQ_HEAD(,kaiocb) kaio_done;	/* (a) done queue for process */
	TAILQ_HEAD(,aioliojob) kaio_liojoblist;	/* (a) list of lio jobs */
	TAILQ_HEAD(,kaiocb) kaio_jobqueue;	/* (a) job queue for process */
	TAILQ_HEAD(,kaiocb) kaio_syncqueue;	/* (a) queue for aio_fsync */
	TAILQ_HEAD(,kaiocb) kaio_syncready;	/* (a) second q for aio_fsync */
	struct	task kaio_task;		/* (*) task to kick aio processes */
	struct	task kaio_sync_task;	/* (*) task to schedule fsync jobs */
};

#define	AIO_LOCK(ki)		mtx_lock(&(ki)->kaio_mtx)
#define	AIO_UNLOCK(ki)		mtx_unlock(&(ki)->kaio_mtx)
#define	AIO_LOCK_ASSERT(ki, f)	mtx_assert(&(ki)->kaio_mtx, (f))
#define	AIO_MTX(ki)		(&(ki)->kaio_mtx)

#define	KAIO_RUNDOWN	0x1	/* process is being run down */
#define	KAIO_WAKEUP	0x2	/* wakeup process when AIO completes */

/*
 * Operations used to interact with userland aio control blocks.
 * Different ABIs provide their own operations.
 */
struct aiocb_ops {
	int	(*aio_copyin)(struct aiocb *ujob, struct kaiocb *kjob, int ty);
	long	(*fetch_status)(struct aiocb *ujob);
	long	(*fetch_error)(struct aiocb *ujob);
	int	(*store_status)(struct aiocb *ujob, long status);
	int	(*store_error)(struct aiocb *ujob, long error);
	int	(*store_kernelinfo)(struct aiocb *ujob, long jobref);
	int	(*store_aiocb)(struct aiocb **ujobp, struct aiocb *ujob);
};

static TAILQ_HEAD(,aioproc) aio_freeproc;	/* (c) Idle daemons */
static struct sema aio_newproc_sem;
static struct mtx aio_job_mtx;
static TAILQ_HEAD(,kaiocb) aio_jobs;		/* (c) Async job list */
static struct unrhdr *aiod_unr;

static void	aio_biocleanup(struct bio *bp);
void		aio_init_aioinfo(struct proc *p);
static int	aio_onceonly(void);
static int	aio_free_entry(struct kaiocb *job);
static void	aio_process_rw(struct kaiocb *job);
static void	aio_process_sync(struct kaiocb *job);
static void	aio_process_mlock(struct kaiocb *job);
static void	aio_schedule_fsync(void *context, int pending);
static int	aio_newproc(int *);
int		aio_aqueue(struct thread *td, struct aiocb *ujob,
		    struct aioliojob *lio, int type, struct aiocb_ops *ops);
static int	aio_queue_file(struct file *fp, struct kaiocb *job);
static void	aio_biowakeup(struct bio *bp);
static void	aio_proc_rundown(void *arg, struct proc *p);
static void	aio_proc_rundown_exec(void *arg, struct proc *p,
		    struct image_params *imgp);
static int	aio_qbio(struct proc *p, struct kaiocb *job);
static void	aio_daemon(void *param);
static void	aio_bio_done_notify(struct proc *userp, struct kaiocb *job);
static bool	aio_clear_cancel_function_locked(struct kaiocb *job);
static int	aio_kick(struct proc *userp);
static void	aio_kick_nowait(struct proc *userp);
static void	aio_kick_helper(void *context, int pending);
static int	filt_aioattach(struct knote *kn);
static void	filt_aiodetach(struct knote *kn);
static int	filt_aio(struct knote *kn, long hint);
static int	filt_lioattach(struct knote *kn);
static void	filt_liodetach(struct knote *kn);
static int	filt_lio(struct knote *kn, long hint);

/*
 * Zones for:
 *	kaio	Per process async io info
 *	aiocb	async io jobs
 *	aiolio	list io jobs
 */
static uma_zone_t kaio_zone, aiocb_zone, aiolio_zone;

/* kqueue filters for aio */
static struct filterops aio_filtops = {
	.f_isfd = 0,
	.f_attach = filt_aioattach,
	.f_detach = filt_aiodetach,
	.f_event = filt_aio,
};
static struct filterops lio_filtops = {
	.f_isfd = 0,
	.f_attach = filt_lioattach,
	.f_detach = filt_liodetach,
	.f_event = filt_lio
};

static eventhandler_tag exit_tag, exec_tag;

TASKQUEUE_DEFINE_THREAD(aiod_kick);

/*
 * Main operations function for use as a kernel module.
 */
static int
aio_modload(struct module *module, int cmd, void *arg)
{
	int error = 0;

	switch (cmd) {
	case MOD_LOAD:
		aio_onceonly();
		break;
	case MOD_SHUTDOWN:
		break;
	default:
		error = EOPNOTSUPP;
		break;
	}
	return (error);
}

static moduledata_t aio_mod = {
	"aio",
	&aio_modload,
	NULL
};

DECLARE_MODULE(aio, aio_mod, SI_SUB_VFS, SI_ORDER_ANY);
MODULE_VERSION(aio, 1);

/*
 * Startup initialization
 */
static int
aio_onceonly(void)
{

	exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL,
	    EVENTHANDLER_PRI_ANY);
	exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown_exec,
	    NULL, EVENTHANDLER_PRI_ANY);
	kqueue_add_filteropts(EVFILT_AIO, &aio_filtops);
	kqueue_add_filteropts(EVFILT_LIO, &lio_filtops);
	TAILQ_INIT(&aio_freeproc);
	sema_init(&aio_newproc_sem, 0, "aio_new_proc");
	mtx_init(&aio_job_mtx, "aio_job", NULL, MTX_DEF);
	TAILQ_INIT(&aio_jobs);
	aiod_unr = new_unrhdr(1, INT_MAX, NULL);
	kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, 0);
	aiocb_zone = uma_zcreate("AIOCB", sizeof(struct kaiocb), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, 0);
	aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aioliojob), NULL,
	    NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	aiod_lifetime = AIOD_LIFETIME_DEFAULT;
	jobrefid = 1;
	p31b_setcfg(CTL_P1003_1B_ASYNCHRONOUS_IO, _POSIX_ASYNCHRONOUS_IO);
	p31b_setcfg(CTL_P1003_1B_AIO_MAX, MAX_AIO_QUEUE);
	p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, 0);

	return (0);
}

/*
 * Init the per-process aioinfo structure. The aioinfo limits are set
 * per-process for user limit (resource) management.
 */
void
aio_init_aioinfo(struct proc *p)
{
	struct kaioinfo *ki;

	ki = uma_zalloc(kaio_zone, M_WAITOK);
	mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF | MTX_NEW);
	ki->kaio_flags = 0;
	ki->kaio_active_count = 0;
	ki->kaio_count = 0;
	ki->kaio_buffer_count = 0;
	TAILQ_INIT(&ki->kaio_all);
	TAILQ_INIT(&ki->kaio_done);
	TAILQ_INIT(&ki->kaio_jobqueue);
	TAILQ_INIT(&ki->kaio_liojoblist);
	TAILQ_INIT(&ki->kaio_syncqueue);
	TAILQ_INIT(&ki->kaio_syncready);
	TASK_INIT(&ki->kaio_task, 0, aio_kick_helper, p);
	TASK_INIT(&ki->kaio_sync_task, 0, aio_schedule_fsync, ki);
	PROC_LOCK(p);
	if (p->p_aioinfo == NULL) {
		p->p_aioinfo = ki;
		PROC_UNLOCK(p);
	} else {
		PROC_UNLOCK(p);
		mtx_destroy(&ki->kaio_mtx);
		uma_zfree(kaio_zone, ki);
	}

	while (num_aio_procs < MIN(target_aio_procs, max_aio_procs))
		aio_newproc(NULL);
}

static int
aio_sendsig(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi, bool ext)
{
	struct thread *td;
	int error;

	error = sigev_findtd(p, sigev, &td);
	if (error)
		return (error);
	if (!KSI_ONQ(ksi)) {
		ksiginfo_set_sigev(ksi, sigev);
		ksi->ksi_code = SI_ASYNCIO;
		ksi->ksi_flags |= ext ? (KSI_EXT | KSI_INS) : 0;
		tdsendsignal(p, td, ksi->ksi_signo, ksi);
	}
	PROC_UNLOCK(p);
	return (error);
}

/*
 * Free a job entry. Wait for completion if it is currently active, but don't
 * delay forever. If we delay, we return a flag that says that we have to
 * restart the queue scan.
 */
static int
aio_free_entry(struct kaiocb *job)
{
	struct kaioinfo *ki;
	struct aioliojob *lj;
	struct proc *p;

	p = job->userproc;
	MPASS(curproc == p);
	ki = p->p_aioinfo;
	MPASS(ki != NULL);

	AIO_LOCK_ASSERT(ki, MA_OWNED);
	MPASS(job->jobflags & KAIOCB_FINISHED);

	atomic_subtract_int(&num_queue_count, 1);

	ki->kaio_count--;
	MPASS(ki->kaio_count >= 0);

	TAILQ_REMOVE(&ki->kaio_done, job, plist);
	TAILQ_REMOVE(&ki->kaio_all, job, allist);

	lj = job->lio;
	if (lj) {
		lj->lioj_count--;
		lj->lioj_finished_count--;

		if (lj->lioj_count == 0) {
			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
			/* lio is going away, we need to destroy any knotes */
			knlist_delete(&lj->klist, curthread, 1);
			PROC_LOCK(p);
			sigqueue_take(&lj->lioj_ksi);
			PROC_UNLOCK(p);
			uma_zfree(aiolio_zone, lj);
		}
	}

	/* job is going away, we need to destroy any knotes */
	knlist_delete(&job->klist, curthread, 1);
	PROC_LOCK(p);
	sigqueue_take(&job->ksi);
	PROC_UNLOCK(p);

	AIO_UNLOCK(ki);

	/*
	 * The thread argument here is used to find the owning process
	 * and is also passed to fo_close() which may pass it to various
	 * places such as devsw close() routines. Because of that, we
	 * need a thread pointer from the process owning the job that is
	 * persistent and won't disappear out from under us or move to
	 * another process.
	 *
	 * Currently, all the callers of this function call it to remove
	 * a kaiocb from the current process' job list either via a
	 * syscall or due to the current process calling exit() or
	 * execve(). Thus, we know that p == curproc. We also know that
	 * curthread can't exit since we are curthread.
	 *
	 * Therefore, we use curthread as the thread to pass to
	 * knlist_delete(). This does mean that it is possible for the
	 * thread pointer at close time to differ from the thread pointer
	 * at open time, but this is already true of file descriptors in
	 * a multithreaded process.
	 */
	if (job->fd_file)
		fdrop(job->fd_file, curthread);
	crfree(job->cred);
	if (job->uiop != &job->uio)
		free(job->uiop, M_IOV);
	uma_zfree(aiocb_zone, job);
	AIO_LOCK(ki);

	return (0);
}

static void
aio_proc_rundown_exec(void *arg, struct proc *p,
    struct image_params *imgp __unused)
{
	aio_proc_rundown(arg, p);
}

static int
aio_cancel_job(struct proc *p, struct kaioinfo *ki, struct kaiocb *job)
{
	aio_cancel_fn_t *func;
	int cancelled;

	AIO_LOCK_ASSERT(ki, MA_OWNED);
	if (job->jobflags & (KAIOCB_CANCELLED | KAIOCB_FINISHED))
		return (0);
	MPASS((job->jobflags & KAIOCB_CANCELLING) == 0);
	job->jobflags |= KAIOCB_CANCELLED;

	func = job->cancel_fn;

	/*
	 * If there is no cancel routine, just leave the job marked as
	 * cancelled. The job should be in active use by a caller who
	 * should complete it normally or when it fails to install a
	 * cancel routine.
	 */
	if (func == NULL)
		return (0);

	/*
	 * Set the CANCELLING flag so that aio_complete() will defer
	 * completions of this job. This prevents the job from being
	 * freed out from under the cancel callback. After the
	 * callback any deferred completion (whether from the callback
	 * or any other source) will be completed.
	 */
	job->jobflags |= KAIOCB_CANCELLING;
	AIO_UNLOCK(ki);
	func(job);
	AIO_LOCK(ki);
	job->jobflags &= ~KAIOCB_CANCELLING;
	if (job->jobflags & KAIOCB_FINISHED) {
		cancelled = job->uaiocb._aiocb_private.error == ECANCELED;
		TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist);
		aio_bio_done_notify(p, job);
	} else {
		/*
		 * The cancel callback might have scheduled an
		 * operation to cancel this request, but it is
		 * only counted as cancelled if the request is
		 * cancelled when the callback returns.
		 */
		cancelled = 0;
	}
	return (cancelled);
}

/*
 * Rundown the jobs for a given process.
 */
static void
aio_proc_rundown(void *arg, struct proc *p)
{
	struct kaioinfo *ki;
	struct aioliojob *lj;
	struct kaiocb *job, *jobn;

	KASSERT(curthread->td_proc == p,
	    ("%s: called on non-curproc", __func__));
	ki = p->p_aioinfo;
	if (ki == NULL)
		return;

	AIO_LOCK(ki);
	ki->kaio_flags |= KAIO_RUNDOWN;

restart:

	/*
	 * Try to cancel all pending requests. This code simulates
	 * aio_cancel on all pending I/O requests.
	 */
	TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, jobn) {
		aio_cancel_job(p, ki, job);
	}

	/* Wait for all running I/O to be finished */
	if (TAILQ_FIRST(&ki->kaio_jobqueue) || ki->kaio_active_count != 0) {
		ki->kaio_flags |= KAIO_WAKEUP;
		msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO, "aioprn", hz);
		goto restart;
	}

	/* Free all completed I/O requests. */
	while ((job = TAILQ_FIRST(&ki->kaio_done)) != NULL)
		aio_free_entry(job);

	while ((lj = TAILQ_FIRST(&ki->kaio_liojoblist)) != NULL) {
		if (lj->lioj_count == 0) {
			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
			knlist_delete(&lj->klist, curthread, 1);
			PROC_LOCK(p);
			sigqueue_take(&lj->lioj_ksi);
			PROC_UNLOCK(p);
			uma_zfree(aiolio_zone, lj);
		} else {
			panic("LIO job not cleaned up: C:%d, FC:%d\n",
			    lj->lioj_count, lj->lioj_finished_count);
		}
	}
	AIO_UNLOCK(ki);
	taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_task);
	taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_sync_task);
	mtx_destroy(&ki->kaio_mtx);
	uma_zfree(kaio_zone, ki);
	p->p_aioinfo = NULL;
}

/*
 * Select a job to run (called by an AIO daemon).
 */
static struct kaiocb *
aio_selectjob(struct aioproc *aiop)
{
	struct kaiocb *job;
	struct kaioinfo *ki;
	struct proc *userp;

	mtx_assert(&aio_job_mtx, MA_OWNED);
restart:
	TAILQ_FOREACH(job, &aio_jobs, list) {
		userp = job->userproc;
		ki = userp->p_aioinfo;

		if (ki->kaio_active_count < max_aio_per_proc) {
			TAILQ_REMOVE(&aio_jobs, job, list);
			if (!aio_clear_cancel_function(job))
				goto restart;

			/* Account for currently active jobs. */
			ki->kaio_active_count++;
			break;
		}
	}
	return (job);
}

/*
 * Move all data to a permanent storage device. This code
 * simulates the fsync and fdatasync syscalls.
 */
static int
aio_fsync_vnode(struct thread *td, struct vnode *vp, int op)
{
	struct mount *mp;
	int error;

	for (;;) {
		error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH);
		if (error != 0)
			break;
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		vnode_pager_clean_async(vp);
		if (op == LIO_DSYNC)
			error = VOP_FDATASYNC(vp, td);
		else
			error = VOP_FSYNC(vp, MNT_WAIT, td);

		VOP_UNLOCK(vp);
		vn_finished_write(mp);
		if (error != ERELOOKUP)
			break;
	}
	return (error);
}

/*
 * The AIO processing activity for LIO_READ/LIO_WRITE. This is the code that
 * does the I/O request for the non-bio version of the operations. The normal
 * vn operations are used, and this code should work in all instances for every
 * type of file, including pipes, sockets, fifos, and regular files.
 *
 * XXX I don't think it works well for socket, pipe, and fifo.
 */
static void
aio_process_rw(struct kaiocb *job)
{
	struct ucred *td_savedcred;
	struct thread *td;
	struct file *fp;
	ssize_t cnt;
	long msgsnd_st, msgsnd_end;
	long msgrcv_st, msgrcv_end;
	long oublock_st, oublock_end;
	long inblock_st, inblock_end;
	int error, opcode;

	KASSERT(job->uaiocb.aio_lio_opcode == LIO_READ ||
	    job->uaiocb.aio_lio_opcode == LIO_READV ||
	    job->uaiocb.aio_lio_opcode == LIO_WRITE ||
	    job->uaiocb.aio_lio_opcode == LIO_WRITEV,
	    ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode));

	aio_switch_vmspace(job);
	td = curthread;
	td_savedcred = td->td_ucred;
	td->td_ucred = job->cred;
	job->uiop->uio_td = td;
	fp = job->fd_file;

	opcode = job->uaiocb.aio_lio_opcode;
	cnt = job->uiop->uio_resid;

	msgrcv_st = td->td_ru.ru_msgrcv;
	msgsnd_st = td->td_ru.ru_msgsnd;
	inblock_st = td->td_ru.ru_inblock;
	oublock_st = td->td_ru.ru_oublock;

	/*
	 * aio_aqueue() acquires a reference to the file that is
	 * released in aio_free_entry().
	 */
	if (opcode == LIO_READ || opcode == LIO_READV) {
		if (job->uiop->uio_resid == 0)
			error = 0;
		else
			error = fo_read(fp, job->uiop, fp->f_cred, FOF_OFFSET,
			    td);
	} else {
		if (fp->f_type == DTYPE_VNODE)
			bwillwrite();
		error = fo_write(fp, job->uiop, fp->f_cred, FOF_OFFSET, td);
	}
	msgrcv_end = td->td_ru.ru_msgrcv;
	msgsnd_end = td->td_ru.ru_msgsnd;
	inblock_end = td->td_ru.ru_inblock;
	oublock_end = td->td_ru.ru_oublock;

	job->msgrcv = msgrcv_end - msgrcv_st;
	job->msgsnd = msgsnd_end - msgsnd_st;
	job->inblock = inblock_end - inblock_st;
	job->outblock = oublock_end - oublock_st;

	if (error != 0 && job->uiop->uio_resid != cnt) {
		if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
			error = 0;
		if (error == EPIPE && (opcode & LIO_WRITE)) {
			PROC_LOCK(job->userproc);
			kern_psignal(job->userproc, SIGPIPE);
			PROC_UNLOCK(job->userproc);
		}
	}

	cnt -= job->uiop->uio_resid;
	td->td_ucred = td_savedcred;
	if (error)
		aio_complete(job, -1, error);
	else
		aio_complete(job, cnt, 0);
}

static void
aio_process_sync(struct kaiocb *job)
{
	struct thread *td = curthread;
	struct ucred *td_savedcred = td->td_ucred;
	struct file *fp = job->fd_file;
	int error = 0;

	KASSERT(job->uaiocb.aio_lio_opcode & LIO_SYNC,
	    ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode));

	td->td_ucred = job->cred;
	if (fp->f_vnode != NULL) {
		error = aio_fsync_vnode(td, fp->f_vnode,
		    job->uaiocb.aio_lio_opcode);
	}
	td->td_ucred = td_savedcred;
	if (error)
		aio_complete(job, -1, error);
	else
		aio_complete(job, 0, 0);
}

static void
aio_process_mlock(struct kaiocb *job)
{
	struct aiocb *cb = &job->uaiocb;
	int error;

	KASSERT(job->uaiocb.aio_lio_opcode == LIO_MLOCK,
	    ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode));

	aio_switch_vmspace(job);
	error = kern_mlock(job->userproc, job->cred,
	    __DEVOLATILE(uintptr_t, cb->aio_buf), cb->aio_nbytes);
	aio_complete(job, error != 0 ? -1 : 0, error);
}
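
/*
 * Mark a finished job on the per-process done queue and deliver any
 * requested notification: per-job signals or kevents, lio_listio
 * completion signals, dependent aio_fsync jobs, and wakeups for threads
 * sleeping in aio_suspend() or in process rundown.
 */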
static void
aio_bio_done_notify(struct proc *userp, struct kaiocb *job)
{
	struct aioliojob *lj;
	struct kaioinfo *ki;
	struct kaiocb *sjob, *sjobn;
	int lj_done;
	bool schedule_fsync;

	ki = userp->p_aioinfo;
	AIO_LOCK_ASSERT(ki, MA_OWNED);
	lj = job->lio;
	lj_done = 0;
	if (lj) {
		lj->lioj_finished_count++;
		if (lj->lioj_count == lj->lioj_finished_count)
			lj_done = 1;
	}
	TAILQ_INSERT_TAIL(&ki->kaio_done, job, plist);
	MPASS(job->jobflags & KAIOCB_FINISHED);

	if (ki->kaio_flags & KAIO_RUNDOWN)
		goto notification_done;

	if (job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
	    job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID)
		aio_sendsig(userp, &job->uaiocb.aio_sigevent, &job->ksi, true);

	KNOTE_LOCKED(&job->klist, 1);

	if (lj_done) {
		if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
			lj->lioj_flags |= LIOJ_KEVENT_POSTED;
			KNOTE_LOCKED(&lj->klist, 1);
		}
		if ((lj->lioj_flags & (LIOJ_SIGNAL | LIOJ_SIGNAL_POSTED))
		    == LIOJ_SIGNAL &&
		    (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
		    lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
			aio_sendsig(userp, &lj->lioj_signal, &lj->lioj_ksi,
			    true);
			lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
		}
	}

notification_done:
	if (job->jobflags & KAIOCB_CHECKSYNC) {
		schedule_fsync = false;
		TAILQ_FOREACH_SAFE(sjob, &ki->kaio_syncqueue, list, sjobn) {
			if (job->fd_file != sjob->fd_file ||
			    job->seqno >= sjob->seqno)
				continue;
			if (--sjob->pending > 0)
				continue;
			TAILQ_REMOVE(&ki->kaio_syncqueue, sjob, list);
			if (!aio_clear_cancel_function_locked(sjob))
				continue;
			TAILQ_INSERT_TAIL(&ki->kaio_syncready, sjob, list);
			schedule_fsync = true;
		}
		if (schedule_fsync)
			taskqueue_enqueue(taskqueue_aiod_kick,
			    &ki->kaio_sync_task);
	}
	if (ki->kaio_flags & KAIO_WAKEUP) {
		ki->kaio_flags &= ~KAIO_WAKEUP;
		wakeup(&userp->p_aioinfo);
	}
}

static void
aio_schedule_fsync(void *context, int pending)
{
	struct kaioinfo *ki;
	struct kaiocb *job;

	ki = context;
	AIO_LOCK(ki);
	while (!TAILQ_EMPTY(&ki->kaio_syncready)) {
		job = TAILQ_FIRST(&ki->kaio_syncready);
		TAILQ_REMOVE(&ki->kaio_syncready, job, list);
		AIO_UNLOCK(ki);
		aio_schedule(job, aio_process_sync);
		AIO_LOCK(ki);
	}
	AIO_UNLOCK(ki);
}

bool
aio_cancel_cleared(struct kaiocb *job)
{

	/*
	 * The caller should hold the same queue lock held when
	 * aio_clear_cancel_function() was called and set this flag
	 * ensuring this check sees an up-to-date value. However,
	 * there is no way to assert that.
	 */
	return ((job->jobflags & KAIOCB_CLEARED) != 0);
}

static bool
aio_clear_cancel_function_locked(struct kaiocb *job)
{

	AIO_LOCK_ASSERT(job->userproc->p_aioinfo, MA_OWNED);
	MPASS(job->cancel_fn != NULL);
	if (job->jobflags & KAIOCB_CANCELLING) {
		job->jobflags |= KAIOCB_CLEARED;
		return (false);
	}
	job->cancel_fn = NULL;
	return (true);
}

bool
aio_clear_cancel_function(struct kaiocb *job)
{
	struct kaioinfo *ki;
	bool ret;

	ki = job->userproc->p_aioinfo;
	AIO_LOCK(ki);
	ret = aio_clear_cancel_function_locked(job);
	AIO_UNLOCK(ki);
	return (ret);
}

static bool
aio_set_cancel_function_locked(struct kaiocb *job, aio_cancel_fn_t *func)
{

	AIO_LOCK_ASSERT(job->userproc->p_aioinfo, MA_OWNED);
	if (job->jobflags & KAIOCB_CANCELLED)
		return (false);
	job->cancel_fn = func;
	return (true);
}

bool
aio_set_cancel_function(struct kaiocb *job, aio_cancel_fn_t *func)
{
	struct kaioinfo *ki;
	bool ret;

	ki = job->userproc->p_aioinfo;
	AIO_LOCK(ki);
	ret = aio_set_cancel_function_locked(job, func);
	AIO_UNLOCK(ki);
	return (ret);
}

void
aio_complete(struct kaiocb *job, long status, int error)
{
	struct kaioinfo *ki;
	struct proc *userp;

	job->uaiocb._aiocb_private.error = error;
	job->uaiocb._aiocb_private.status = status;

	userp = job->userproc;
	ki = userp->p_aioinfo;

	AIO_LOCK(ki);
	KASSERT(!(job->jobflags & KAIOCB_FINISHED),
	    ("duplicate aio_complete"));
	job->jobflags |= KAIOCB_FINISHED;
	if ((job->jobflags & (KAIOCB_QUEUEING | KAIOCB_CANCELLING)) == 0) {
		TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist);
		aio_bio_done_notify(userp, job);
	}
	AIO_UNLOCK(ki);
}

void
aio_cancel(struct kaiocb *job)
{

	aio_complete(job, -1, ECANCELED);
}

void
aio_switch_vmspace(struct kaiocb *job)
{

	vmspace_switch_aio(job->userproc->p_vmspace);
}

/*
 * The AIO daemon, most of the actual work is done in aio_process_*,
 * but the setup (and address space mgmt) is done in this routine.
 */
static void
aio_daemon(void *_id)
{
	struct kaiocb *job;
	struct aioproc *aiop;
	struct kaioinfo *ki;
	struct proc *p;
	struct vmspace *myvm;
	struct thread *td = curthread;
	int id = (intptr_t)_id;

	/*
	 * Grab an extra reference on the daemon's vmspace so that it
	 * doesn't get freed by jobs that switch to a different
	 * vmspace.
	 */
	p = td->td_proc;
	myvm = vmspace_acquire_ref(p);

	KASSERT(p->p_textvp == NULL, ("kthread has a textvp"));

	/*
	 * Allocate and ready the aio control info. There is one aiop structure
	 * per daemon.
	 */
	aiop = malloc(sizeof(*aiop), M_AIO, M_WAITOK);
	aiop->aioproc = p;
	aiop->aioprocflags = 0;

	/*
	 * Wakeup parent process. (Parent sleeps to keep from blasting away
	 * and creating too many daemons.)
	 */
	sema_post(&aio_newproc_sem);

	mtx_lock(&aio_job_mtx);
	for (;;) {
		/*
		 * Take daemon off of free queue
		 */
		if (aiop->aioprocflags & AIOP_FREE) {
			TAILQ_REMOVE(&aio_freeproc, aiop, list);
			aiop->aioprocflags &= ~AIOP_FREE;
		}

		/*
		 * Check for jobs.
		 */
		while ((job = aio_selectjob(aiop)) != NULL) {
			mtx_unlock(&aio_job_mtx);

			ki = job->userproc->p_aioinfo;
			job->handle_fn(job);

			mtx_lock(&aio_job_mtx);
			/* Decrement the active job count. */
			ki->kaio_active_count--;
		}

		/*
		 * Disconnect from user address space.
		 */
		if (p->p_vmspace != myvm) {
			mtx_unlock(&aio_job_mtx);
			vmspace_switch_aio(myvm);
			mtx_lock(&aio_job_mtx);
			/*
			 * We have to restart to avoid race, we only sleep if
			 * no job can be selected.
			 */
			continue;
		}

		mtx_assert(&aio_job_mtx, MA_OWNED);

		TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
		aiop->aioprocflags |= AIOP_FREE;

		/*
		 * If daemon is inactive for a long time, allow it to exit,
		 * thereby freeing resources.
		 */
		if (msleep(p, &aio_job_mtx, PRIBIO, "aiordy",
		    aiod_lifetime) == EWOULDBLOCK && TAILQ_EMPTY(&aio_jobs) &&
		    (aiop->aioprocflags & AIOP_FREE) &&
		    num_aio_procs > target_aio_procs)
			break;
	}
	TAILQ_REMOVE(&aio_freeproc, aiop, list);
	num_aio_procs--;
	mtx_unlock(&aio_job_mtx);
	free(aiop, M_AIO);
	free_unr(aiod_unr, id);
	vmspace_free(myvm);

	KASSERT(p->p_vmspace == myvm,
	    ("AIOD: bad vmspace for exiting daemon"));
	KASSERT(refcount_load(&myvm->vm_refcnt) > 1,
	    ("AIOD: bad vm refcnt for exiting daemon: %d",
	    refcount_load(&myvm->vm_refcnt)));
	kproc_exit(0);
}

/*
 * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The
 * AIO daemon modifies its environment itself.
 */
static int
aio_newproc(int *start)
{
	int error;
	struct proc *p;
	int id;

	id = alloc_unr(aiod_unr);
	error = kproc_create(aio_daemon, (void *)(intptr_t)id, &p,
	    RFNOWAIT, 0, "aiod%d", id);
	if (error == 0) {
		/*
		 * Wait until daemon is started.
		 */
		sema_wait(&aio_newproc_sem);
		mtx_lock(&aio_job_mtx);
		num_aio_procs++;
		if (start != NULL)
			(*start)--;
		mtx_unlock(&aio_job_mtx);
	} else {
		free_unr(aiod_unr, id);
	}
	return (error);
}

/*
 * Try the high-performance, low-overhead bio method for eligible
 * VCHR devices. This method doesn't use an aio helper thread, and
 * thus has very low overhead.
 *
 * Assumes that the caller, aio_aqueue(), has incremented the file
 * structure's reference count, preventing its deallocation for the
 * duration of this call.
 */
static int
aio_qbio(struct proc *p, struct kaiocb *job)
{
	struct aiocb *cb;
	struct file *fp;
	struct buf *pbuf;
	struct vnode *vp;
	struct cdevsw *csw;
	struct cdev *dev;
	struct kaioinfo *ki;
	struct bio **bios = NULL;
	off_t offset;
	int bio_cmd, error, i, iovcnt, opcode, poff, ref;
	vm_prot_t prot;
	bool use_unmapped;

	cb = &job->uaiocb;
	fp = job->fd_file;
	opcode = cb->aio_lio_opcode;

	if (!(opcode == LIO_WRITE || opcode == LIO_WRITEV ||
	    opcode == LIO_READ || opcode == LIO_READV))
		return (-1);
	if (fp == NULL || fp->f_type != DTYPE_VNODE)
		return (-1);

	vp = fp->f_vnode;
	if (vp->v_type != VCHR)
		return (-1);
	if (vp->v_bufobj.bo_bsize == 0)
		return (-1);

	bio_cmd = (opcode & LIO_WRITE) ? BIO_WRITE : BIO_READ;
	iovcnt = job->uiop->uio_iovcnt;
	if (iovcnt > max_buf_aio)
		return (-1);
	for (i = 0; i < iovcnt; i++) {
		if (job->uiop->uio_iov[i].iov_len % vp->v_bufobj.bo_bsize != 0)
			return (-1);
		if (job->uiop->uio_iov[i].iov_len > maxphys) {
			error = -1;
			return (-1);
		}
	}
	offset = cb->aio_offset;

	ref = 0;
	csw = devvn_refthread(vp, &dev, &ref);
	if (csw == NULL)
		return (ENXIO);

	if ((csw->d_flags & D_DISK) == 0) {
		error = -1;
		goto unref;
	}
	if (job->uiop->uio_resid > dev->si_iosize_max) {
		error = -1;
		goto unref;
	}

	ki = p->p_aioinfo;
	job->error = 0;

	use_unmapped = (dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed;
	if (!use_unmapped) {
		AIO_LOCK(ki);
		if (ki->kaio_buffer_count + iovcnt > max_buf_aio) {
			AIO_UNLOCK(ki);
			error = EAGAIN;
			goto unref;
		}
		ki->kaio_buffer_count += iovcnt;
		AIO_UNLOCK(ki);
	}

	bios = malloc(sizeof(struct bio *) * iovcnt, M_TEMP, M_WAITOK);
	refcount_init(&job->nbio, iovcnt);
	for (i = 0; i < iovcnt; i++) {
		struct vm_page** pages;
		struct bio *bp;
		void *buf;
		size_t nbytes;
		int npages;

		buf = job->uiop->uio_iov[i].iov_base;
		nbytes = job->uiop->uio_iov[i].iov_len;

		bios[i] = g_alloc_bio();
		bp = bios[i];

		poff = (vm_offset_t)buf & PAGE_MASK;
		if (use_unmapped) {
			pbuf = NULL;
			pages = malloc(sizeof(vm_page_t) * (atop(round_page(
			    nbytes)) + 1), M_TEMP, M_WAITOK | M_ZERO);
		} else {
			pbuf = uma_zalloc(pbuf_zone, M_WAITOK);
			BUF_KERNPROC(pbuf);
			pages = pbuf->b_pages;
		}

		bp->bio_length = nbytes;
		bp->bio_bcount = nbytes;
		bp->bio_done = aio_biowakeup;
		bp->bio_offset = offset;
		bp->bio_cmd = bio_cmd;
		bp->bio_dev = dev;
		bp->bio_caller1 = job;
		bp->bio_caller2 = pbuf;

		prot = VM_PROT_READ;
		if (opcode == LIO_READ || opcode == LIO_READV)
			prot |= VM_PROT_WRITE;	/* Less backwards than it looks */
		npages = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
		    (vm_offset_t)buf, bp->bio_length, prot, pages,
		    atop(maxphys) + 1);
		if (npages < 0) {
			if (pbuf != NULL)
				uma_zfree(pbuf_zone, pbuf);
			else
				free(pages, M_TEMP);
			error = EFAULT;
			g_destroy_bio(bp);
			i--;
			goto destroy_bios;
		}
		if (pbuf != NULL) {
			pmap_qenter((vm_offset_t)pbuf->b_data, pages, npages);
			bp->bio_data = pbuf->b_data + poff;
			pbuf->b_npages = npages;
			atomic_add_int(&num_buf_aio, 1);
		} else {
			bp->bio_ma = pages;
			bp->bio_ma_n = npages;
			bp->bio_ma_offset = poff;
			bp->bio_data = unmapped_buf;
			bp->bio_flags |= BIO_UNMAPPED;
			atomic_add_int(&num_unmapped_aio, 1);
		}

		offset += nbytes;
	}

	/* Perform transfer. */
	for (i = 0; i < iovcnt; i++)
		csw->d_strategy(bios[i]);
	free(bios, M_TEMP);

	dev_relthread(dev, ref);
	return (0);

destroy_bios:
	for (; i >= 0; i--)
		aio_biocleanup(bios[i]);
	free(bios, M_TEMP);
unref:
	dev_relthread(dev, ref);
	return (error);
}

#ifdef COMPAT_FREEBSD6
static int
convert_old_sigevent(struct osigevent *osig, struct sigevent *nsig)
{

	/*
	 * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are
	 * supported by AIO with the old sigevent structure.
	 */
	nsig->sigev_notify = osig->sigev_notify;
	switch (nsig->sigev_notify) {
	case SIGEV_NONE:
		break;
	case SIGEV_SIGNAL:
		nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
		break;
	case SIGEV_KEVENT:
		nsig->sigev_notify_kqueue =
		    osig->__sigev_u.__sigev_notify_kqueue;
		nsig->sigev_value.sival_ptr = osig->sigev_value.sival_ptr;
		break;
	default:
		return (EINVAL);
	}
	return (0);
}

static int
aiocb_copyin_old_sigevent(struct aiocb *ujob, struct kaiocb *kjob,
    int type __unused)
{
	struct oaiocb *ojob;
	struct aiocb *kcb = &kjob->uaiocb;
	int error;

	bzero(kcb, sizeof(struct aiocb));
	error = copyin(ujob, kcb, sizeof(struct oaiocb));
	if (error)
		return (error);
	/* No need to copyin aio_iov, because it did not exist in FreeBSD 6 */
	ojob = (struct oaiocb *)kcb;
	return (convert_old_sigevent(&ojob->aio_sigevent, &kcb->aio_sigevent));
}
#endif

static int
aiocb_copyin(struct aiocb *ujob, struct kaiocb *kjob, int type)
{
	struct aiocb *kcb = &kjob->uaiocb;
	int error;

	error = copyin(ujob, kcb, sizeof(struct aiocb));
	if (error)
		return (error);
	if (type == LIO_NOP)
		type = kcb->aio_lio_opcode;
	if (type & LIO_VECTORED) {
		/* malloc a uio and copy in the iovec */
		error = copyinuio(__DEVOLATILE(struct iovec*, kcb->aio_iov),
		    kcb->aio_iovcnt, &kjob->uiop);
	}

	return (error);
}

static long
aiocb_fetch_status(struct aiocb *ujob)
{

	return (fuword(&ujob->_aiocb_private.status));
}

static long
aiocb_fetch_error(struct aiocb *ujob)
{

	return (fuword(&ujob->_aiocb_private.error));
}

static int
aiocb_store_status(struct aiocb *ujob, long status)
{

	return (suword(&ujob->_aiocb_private.status, status));
}

static int
aiocb_store_error(struct aiocb *ujob, long error)
{

	return (suword(&ujob->_aiocb_private.error, error));
}

static int
aiocb_store_kernelinfo(struct aiocb *ujob, long jobref)
{

	return (suword(&ujob->_aiocb_private.kernelinfo, jobref));
}

static int
aiocb_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
{

	return (suword(ujobp, (long)ujob));
}

static struct aiocb_ops aiocb_ops = {
	.aio_copyin = aiocb_copyin,
	.fetch_status = aiocb_fetch_status,
	.fetch_error = aiocb_fetch_error,
	.store_status = aiocb_store_status,
	.store_error = aiocb_store_error,
	.store_kernelinfo = aiocb_store_kernelinfo,
	.store_aiocb = aiocb_store_aiocb,
};

#ifdef COMPAT_FREEBSD6
static struct aiocb_ops aiocb_ops_osigevent = {
	.aio_copyin = aiocb_copyin_old_sigevent,
	.fetch_status = aiocb_fetch_status,
	.fetch_error = aiocb_fetch_error,
	.store_status = aiocb_store_status,
	.store_error = aiocb_store_error,
	.store_kernelinfo = aiocb_store_kernelinfo,
	.store_aiocb = aiocb_store_aiocb,
};
#endif

/*
 * Queue a new AIO request. Choosing either the threaded or direct bio VCHR
 * technique is done in this code.
 */
int
aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lj,
    int type, struct aiocb_ops *ops)
{
	struct proc *p = td->td_proc;
	struct file *fp = NULL;
	struct kaiocb *job;
	struct kaioinfo *ki;
	struct kevent kev;
	int opcode;
	int error;
	int fd, kqfd;
	int jid;
	u_short evflags;

	if (p->p_aioinfo == NULL)
		aio_init_aioinfo(p);

	ki = p->p_aioinfo;

	ops->store_status(ujob, -1);
	ops->store_error(ujob, 0);
	ops->store_kernelinfo(ujob, -1);

	if (num_queue_count >= max_queue_count ||
	    ki->kaio_count >= max_aio_queue_per_proc) {
		error = EAGAIN;
		goto err1;
	}

	job = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO);
	knlist_init_mtx(&job->klist, AIO_MTX(ki));

	error = ops->aio_copyin(ujob, job, type);
	if (error)
		goto err2;

	if (job->uaiocb.aio_nbytes > IOSIZE_MAX) {
		error = EINVAL;
		goto err2;
	}

	if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT &&
	    job->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL &&
	    job->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID &&
	    job->uaiocb.aio_sigevent.sigev_notify != SIGEV_NONE) {
		error = EINVAL;
		goto err2;
	}

	if ((job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
	    job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) &&
	    !_SIG_VALID(job->uaiocb.aio_sigevent.sigev_signo)) {
		error = EINVAL;
		goto err2;
	}

	/* Get the opcode. */
	if (type == LIO_NOP) {
		switch (job->uaiocb.aio_lio_opcode) {
		case LIO_WRITE:
		case LIO_WRITEV:
		case LIO_NOP:
		case LIO_READ:
		case LIO_READV:
			opcode = job->uaiocb.aio_lio_opcode;
			break;
		default:
			error = EINVAL;
			goto err2;
		}
	} else
		opcode = job->uaiocb.aio_lio_opcode = type;

	ksiginfo_init(&job->ksi);

	/* Save userspace address of the job info. */
	job->ujob = ujob;

	/*
	 * Validate the opcode and fetch the file object for the specified
	 * file descriptor.
	 *
	 * XXXRW: Moved the opcode validation up here so that we don't
	 * retrieve a file descriptor without knowing what the capability
	 * should be.
	 */
	fd = job->uaiocb.aio_fildes;
	switch (opcode) {
	case LIO_WRITE:
	case LIO_WRITEV:
		error = fget_write(td, fd, &cap_pwrite_rights, &fp);
		break;
	case LIO_READ:
	case LIO_READV:
		error = fget_read(td, fd, &cap_pread_rights, &fp);
		break;
	case LIO_SYNC:
	case LIO_DSYNC:
		error = fget(td, fd, &cap_fsync_rights, &fp);
		break;
	case LIO_MLOCK:
		break;
	case LIO_NOP:
		error = fget(td, fd, &cap_no_rights, &fp);
		break;
	default:
		error = EINVAL;
	}
	if (error)
		goto err3;

	if ((opcode & LIO_SYNC) && fp->f_vnode == NULL) {
		error = EINVAL;
		goto err3;
	}

	if ((opcode == LIO_READ || opcode == LIO_READV ||
	    opcode == LIO_WRITE || opcode == LIO_WRITEV) &&
	    job->uaiocb.aio_offset < 0 &&
	    (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR)) {
		error = EINVAL;
		goto err3;
	}

	if (fp != NULL && fp->f_ops == &path_fileops) {
		error = EBADF;
		goto err3;
	}

	job->fd_file = fp;

	mtx_lock(&aio_job_mtx);
	jid = jobrefid++;
	job->seqno = jobseqno++;
	mtx_unlock(&aio_job_mtx);
	error = ops->store_kernelinfo(ujob, jid);
	if (error) {
		error = EINVAL;
		goto err3;
	}
	job->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jid;

	if (opcode == LIO_NOP) {
		fdrop(fp, td);
		MPASS(job->uiop == &job->uio || job->uiop == NULL);
		uma_zfree(aiocb_zone, job);
		return (0);
	}

	if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT)
		goto no_kqueue;
	evflags = job->uaiocb.aio_sigevent.sigev_notify_kevent_flags;
	if ((evflags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0) {
		error = EINVAL;
		goto err3;
	}
	kqfd = job->uaiocb.aio_sigevent.sigev_notify_kqueue;
	memset(&kev, 0, sizeof(kev));
	kev.ident = (uintptr_t)job->ujob;
	kev.filter = EVFILT_AIO;
	kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1 | evflags;
	kev.data = (intptr_t)job;
	kev.udata = job->uaiocb.aio_sigevent.sigev_value.sival_ptr;
	error = kqfd_register(kqfd, &kev, td, M_WAITOK);
	if (error)
		goto err3;

no_kqueue:

	ops->store_error(ujob, EINPROGRESS);
	job->uaiocb._aiocb_private.error = EINPROGRESS;
	job->userproc = p;
	job->cred = crhold(td->td_ucred);
	job->jobflags = KAIOCB_QUEUEING;
	job->lio = lj;

	if (opcode & LIO_VECTORED) {
		/* Use the uio copied in by aio_copyin */
		MPASS(job->uiop != &job->uio && job->uiop != NULL);
	} else {
		/* Setup the inline uio */
		job->iov[0].iov_base = (void *)(uintptr_t)job->uaiocb.aio_buf;
		job->iov[0].iov_len = job->uaiocb.aio_nbytes;
		job->uio.uio_iov = job->iov;
		job->uio.uio_iovcnt = 1;
		job->uio.uio_resid = job->uaiocb.aio_nbytes;
		job->uio.uio_segflg = UIO_USERSPACE;
		job->uiop = &job->uio;
	}
	switch (opcode & (LIO_READ | LIO_WRITE)) {
	case LIO_READ:
		job->uiop->uio_rw = UIO_READ;
		break;
	case LIO_WRITE:
		job->uiop->uio_rw = UIO_WRITE;
		break;
	}
	job->uiop->uio_offset = job->uaiocb.aio_offset;
	job->uiop->uio_td = td;

	if (opcode == LIO_MLOCK) {
		aio_schedule(job, aio_process_mlock);
		error = 0;
	} else if (fp->f_ops->fo_aio_queue == NULL)
		error = aio_queue_file(fp, job);
	else
		error = fo_aio_queue(fp, job);
	if (error)
		goto err4;

	AIO_LOCK(ki);
	job->jobflags &= ~KAIOCB_QUEUEING;
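	/* Account for the new job on the per-process and global queues. */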
	TAILQ_INSERT_TAIL(&ki->kaio_all, job, allist);
	ki->kaio_count++;
	if (lj)
		lj->lioj_count++;
	atomic_add_int(&num_queue_count, 1);
	if (job->jobflags & KAIOCB_FINISHED) {
		/*
		 * The queue callback completed the request synchronously.
		 * The bulk of the completion is deferred in that case
		 * until this point.
		 */
		aio_bio_done_notify(p, job);
	} else
		TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, job, plist);
	AIO_UNLOCK(ki);
	return (0);

err4:
	crfree(job->cred);
err3:
	if (fp)
		fdrop(fp, td);
	knlist_delete(&job->klist, curthread, 0);
err2:
	if (job->uiop != &job->uio)
		free(job->uiop, M_IOV);
	uma_zfree(aiocb_zone, job);
err1:
	ops->store_error(ujob, error);
	return (error);
}

static void
aio_cancel_daemon_job(struct kaiocb *job)
{

	mtx_lock(&aio_job_mtx);
	if (!aio_cancel_cleared(job))
		TAILQ_REMOVE(&aio_jobs, job, list);
	mtx_unlock(&aio_job_mtx);
	aio_cancel(job);
}

void
aio_schedule(struct kaiocb *job, aio_handle_fn_t *func)
{

	mtx_lock(&aio_job_mtx);
	if (!aio_set_cancel_function(job, aio_cancel_daemon_job)) {
		mtx_unlock(&aio_job_mtx);
		aio_cancel(job);
		return;
	}
	job->handle_fn = func;
	TAILQ_INSERT_TAIL(&aio_jobs, job, list);
	aio_kick_nowait(job->userproc);
	mtx_unlock(&aio_job_mtx);
}

static void
aio_cancel_sync(struct kaiocb *job)
{
	struct kaioinfo *ki;

	ki = job->userproc->p_aioinfo;
	AIO_LOCK(ki);
	if (!aio_cancel_cleared(job))
		TAILQ_REMOVE(&ki->kaio_syncqueue, job, list);
	AIO_UNLOCK(ki);
	aio_cancel(job);
}

int
aio_queue_file(struct file *fp, struct kaiocb *job)
{
	struct kaioinfo *ki;
	struct kaiocb *job2;
	struct vnode *vp;
	struct mount *mp;
	int error;
	bool safe;

	ki = job->userproc->p_aioinfo;
	error = aio_qbio(job->userproc, job);
	if (error >= 0)
		return (error);
	safe = false;
	if (fp->f_type == DTYPE_VNODE) {
		vp = fp->f_vnode;
		if (vp->v_type == VREG || vp->v_type == VDIR) {
			mp = fp->f_vnode->v_mount;
			if (mp == NULL || (mp->mnt_flag & MNT_LOCAL) != 0)
				safe = true;
		}
	}
	if (!(safe || enable_aio_unsafe)) {
		counted_warning(&unsafe_warningcnt,
		    "is attempting to use unsafe AIO requests");
		return (EOPNOTSUPP);
	}

	if (job->uaiocb.aio_lio_opcode & (LIO_WRITE | LIO_READ)) {
		aio_schedule(job, aio_process_rw);
		error = 0;
	} else if (job->uaiocb.aio_lio_opcode & LIO_SYNC) {
		AIO_LOCK(ki);
		TAILQ_FOREACH(job2, &ki->kaio_jobqueue, plist) {
			if (job2->fd_file == job->fd_file &&
			    ((job2->uaiocb.aio_lio_opcode & LIO_SYNC) == 0) &&
			    job2->seqno < job->seqno) {
				job2->jobflags |= KAIOCB_CHECKSYNC;
				job->pending++;
			}
		}
		if (job->pending != 0) {
			if (!aio_set_cancel_function_locked(job,
			    aio_cancel_sync)) {
				AIO_UNLOCK(ki);
				aio_cancel(job);
				return (0);
			}
			TAILQ_INSERT_TAIL(&ki->kaio_syncqueue, job, list);
			AIO_UNLOCK(ki);
			return (0);
		}
		AIO_UNLOCK(ki);
		aio_schedule(job, aio_process_sync);
		error = 0;
	} else {
		error = EINVAL;
	}
	return (error);
}

static void
aio_kick_nowait(struct proc *userp)
{
	struct kaioinfo *ki = userp->p_aioinfo;
	struct aioproc *aiop;

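	/*
	 * Wake an idle AIO daemon if one is available; otherwise, if the
	 * per-process and global limits allow it, schedule a task to
	 * create a new daemon.
	 */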
	mtx_assert(&aio_job_mtx, MA_OWNED);
	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
		TAILQ_REMOVE(&aio_freeproc, aiop, list);
		aiop->aioprocflags &= ~AIOP_FREE;
		wakeup(aiop->aioproc);
	} else if (num_aio_resv_start + num_aio_procs < max_aio_procs &&
	    ki->kaio_active_count + num_aio_resv_start < max_aio_per_proc) {
		taskqueue_enqueue(taskqueue_aiod_kick, &ki->kaio_task);
	}
}

static int
aio_kick(struct proc *userp)
{
	struct kaioinfo *ki = userp->p_aioinfo;
	struct aioproc *aiop;
	int error, ret = 0;

	mtx_assert(&aio_job_mtx, MA_OWNED);
retryproc:
	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
		TAILQ_REMOVE(&aio_freeproc, aiop, list);
		aiop->aioprocflags &= ~AIOP_FREE;
		wakeup(aiop->aioproc);
	} else if (num_aio_resv_start + num_aio_procs < max_aio_procs &&
	    ki->kaio_active_count + num_aio_resv_start < max_aio_per_proc) {
		num_aio_resv_start++;
		mtx_unlock(&aio_job_mtx);
		error = aio_newproc(&num_aio_resv_start);
		mtx_lock(&aio_job_mtx);
		if (error) {
			num_aio_resv_start--;
			goto retryproc;
		}
	} else {
		ret = -1;
	}
	return (ret);
}

static void
aio_kick_helper(void *context, int pending)
{
	struct proc *userp = context;

	mtx_lock(&aio_job_mtx);
	while (--pending >= 0) {
		if (aio_kick(userp))
			break;
	}
	mtx_unlock(&aio_job_mtx);
}

/*
 * Support the aio_return system call. As a side effect, kernel resources are
 * released.
 */
static int
kern_aio_return(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops)
{
	struct proc *p = td->td_proc;
	struct kaiocb *job;
	struct kaioinfo *ki;
	long status, error;

	ki = p->p_aioinfo;
	if (ki == NULL)
		return (EINVAL);
	AIO_LOCK(ki);
	TAILQ_FOREACH(job, &ki->kaio_done, plist) {
		if (job->ujob == ujob)
			break;
	}
	if (job != NULL) {
		MPASS(job->jobflags & KAIOCB_FINISHED);
		status = job->uaiocb._aiocb_private.status;
		error = job->uaiocb._aiocb_private.error;
		td->td_retval[0] = status;
		td->td_ru.ru_oublock += job->outblock;
		td->td_ru.ru_inblock += job->inblock;
		td->td_ru.ru_msgsnd += job->msgsnd;
		td->td_ru.ru_msgrcv += job->msgrcv;
		aio_free_entry(job);
		AIO_UNLOCK(ki);
		ops->store_error(ujob, error);
		ops->store_status(ujob, status);
	} else {
		error = EINVAL;
		AIO_UNLOCK(ki);
	}
	return (error);
}

int
sys_aio_return(struct thread *td, struct aio_return_args *uap)
{

	return (kern_aio_return(td, uap->aiocbp, &aiocb_ops));
}

/*
 * Allow a process to wake up when any of the I/O requests are completed.
 */
static int
kern_aio_suspend(struct thread *td, int njoblist, struct aiocb **ujoblist,
    struct timespec *ts)
{
	struct proc *p = td->td_proc;
	struct timeval atv;
	struct kaioinfo *ki;
	struct kaiocb *firstjob, *job;
	int error, i, timo;

	timo = 0;
	if (ts) {
		if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000)
			return (EINVAL);

		TIMESPEC_TO_TIMEVAL(&atv, ts);
		if (itimerfix(&atv))
			return (EINVAL);
		timo = tvtohz(&atv);
	}

	ki = p->p_aioinfo;
	if (ki == NULL)
		return (EAGAIN);

	if (njoblist == 0)
		return (0);

	AIO_LOCK(ki);
	for (;;) {
		firstjob = NULL;
		error = 0;
		TAILQ_FOREACH(job, &ki->kaio_all, allist) {
			for (i = 0; i < njoblist; i++) {
				if (job->ujob == ujoblist[i]) {
					if (firstjob == NULL)
						firstjob = job;
					if (job->jobflags & KAIOCB_FINISHED)
						goto RETURN;
				}
			}
		}
		/* All tasks were finished. */
		if (firstjob == NULL)
			break;

		ki->kaio_flags |= KAIO_WAKEUP;
		error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
		    "aiospn", timo);
		if (error == ERESTART)
			error = EINTR;
		if (error)
			break;
	}
RETURN:
	AIO_UNLOCK(ki);
	return (error);
}

int
sys_aio_suspend(struct thread *td, struct aio_suspend_args *uap)
{
	struct timespec ts, *tsp;
	struct aiocb **ujoblist;
	int error;

	if (uap->nent < 0 || uap->nent > max_aio_queue_per_proc)
		return (EINVAL);

	if (uap->timeout) {
		/* Get timespec struct. */
		if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
			return (error);
		tsp = &ts;
	} else
		tsp = NULL;

	ujoblist = malloc(uap->nent * sizeof(ujoblist[0]), M_AIO, M_WAITOK);
	error = copyin(uap->aiocbp, ujoblist, uap->nent * sizeof(ujoblist[0]));
	if (error == 0)
		error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
	free(ujoblist, M_AIO);
	return (error);
}

/*
 * aio_cancel cancels any non-bio aio operations not currently in progress.
 */
int
sys_aio_cancel(struct thread *td, struct aio_cancel_args *uap)
{
	struct proc *p = td->td_proc;
	struct kaioinfo *ki;
	struct kaiocb *job, *jobn;
	struct file *fp;
	int error;
	int cancelled = 0;
	int notcancelled = 0;
	struct vnode *vp;

	/* Lookup file object. */
*/ 2050 error = fget(td, uap->fd, &cap_no_rights, &fp); 2051 if (error) 2052 return (error); 2053 2054 ki = p->p_aioinfo; 2055 if (ki == NULL) 2056 goto done; 2057 2058 if (fp->f_type == DTYPE_VNODE) { 2059 vp = fp->f_vnode; 2060 if (vn_isdisk(vp)) { 2061 fdrop(fp, td); 2062 td->td_retval[0] = AIO_NOTCANCELED; 2063 return (0); 2064 } 2065 } 2066 2067 AIO_LOCK(ki); 2068 TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, jobn) { 2069 if ((uap->fd == job->uaiocb.aio_fildes) && 2070 ((uap->aiocbp == NULL) || 2071 (uap->aiocbp == job->ujob))) { 2072 if (aio_cancel_job(p, ki, job)) { 2073 cancelled++; 2074 } else { 2075 notcancelled++; 2076 } 2077 if (uap->aiocbp != NULL) 2078 break; 2079 } 2080 } 2081 AIO_UNLOCK(ki); 2082 2083 done: 2084 fdrop(fp, td); 2085 2086 if (uap->aiocbp != NULL) { 2087 if (cancelled) { 2088 td->td_retval[0] = AIO_CANCELED; 2089 return (0); 2090 } 2091 } 2092 2093 if (notcancelled) { 2094 td->td_retval[0] = AIO_NOTCANCELED; 2095 return (0); 2096 } 2097 2098 if (cancelled) { 2099 td->td_retval[0] = AIO_CANCELED; 2100 return (0); 2101 } 2102 2103 td->td_retval[0] = AIO_ALLDONE; 2104 2105 return (0); 2106 } 2107 2108 /* 2109 * aio_error is implemented in the kernel level for compatibility purposes 2110 * only. For a user mode async implementation, it would be best to do it in 2111 * a userland subroutine. 2112 */ 2113 static int 2114 kern_aio_error(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops) 2115 { 2116 struct proc *p = td->td_proc; 2117 struct kaiocb *job; 2118 struct kaioinfo *ki; 2119 int status; 2120 2121 ki = p->p_aioinfo; 2122 if (ki == NULL) { 2123 td->td_retval[0] = EINVAL; 2124 return (0); 2125 } 2126 2127 AIO_LOCK(ki); 2128 TAILQ_FOREACH(job, &ki->kaio_all, allist) { 2129 if (job->ujob == ujob) { 2130 if (job->jobflags & KAIOCB_FINISHED) 2131 td->td_retval[0] = 2132 job->uaiocb._aiocb_private.error; 2133 else 2134 td->td_retval[0] = EINPROGRESS; 2135 AIO_UNLOCK(ki); 2136 return (0); 2137 } 2138 } 2139 AIO_UNLOCK(ki); 2140 2141 /* 2142 * Hack for failure of aio_aqueue. 
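 * If the submission failed in aio_aqueue(), no kaiocb was ever queued, so
 * the lookup above finds nothing; aio_aqueue() stores -1 in the
 * user-visible status word up front and records the errno in the error
 * word on failure, which is what is recovered below.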
2143 */ 2144 status = ops->fetch_status(ujob); 2145 if (status == -1) { 2146 td->td_retval[0] = ops->fetch_error(ujob); 2147 return (0); 2148 } 2149 2150 td->td_retval[0] = EINVAL; 2151 return (0); 2152 } 2153 2154 int 2155 sys_aio_error(struct thread *td, struct aio_error_args *uap) 2156 { 2157 2158 return (kern_aio_error(td, uap->aiocbp, &aiocb_ops)); 2159 } 2160 2161 /* syscall - asynchronous read from a file (REALTIME) */ 2162 #ifdef COMPAT_FREEBSD6 2163 int 2164 freebsd6_aio_read(struct thread *td, struct freebsd6_aio_read_args *uap) 2165 { 2166 2167 return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, 2168 &aiocb_ops_osigevent)); 2169 } 2170 #endif 2171 2172 int 2173 sys_aio_read(struct thread *td, struct aio_read_args *uap) 2174 { 2175 2176 return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READ, &aiocb_ops)); 2177 } 2178 2179 int 2180 sys_aio_readv(struct thread *td, struct aio_readv_args *uap) 2181 { 2182 2183 return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READV, &aiocb_ops)); 2184 } 2185 2186 /* syscall - asynchronous write to a file (REALTIME) */ 2187 #ifdef COMPAT_FREEBSD6 2188 int 2189 freebsd6_aio_write(struct thread *td, struct freebsd6_aio_write_args *uap) 2190 { 2191 2192 return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, 2193 &aiocb_ops_osigevent)); 2194 } 2195 #endif 2196 2197 int 2198 sys_aio_write(struct thread *td, struct aio_write_args *uap) 2199 { 2200 2201 return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops)); 2202 } 2203 2204 int 2205 sys_aio_writev(struct thread *td, struct aio_writev_args *uap) 2206 { 2207 2208 return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITEV, &aiocb_ops)); 2209 } 2210 2211 int 2212 sys_aio_mlock(struct thread *td, struct aio_mlock_args *uap) 2213 { 2214 2215 return (aio_aqueue(td, uap->aiocbp, NULL, LIO_MLOCK, &aiocb_ops)); 2216 } 2217 2218 static int 2219 kern_lio_listio(struct thread *td, int mode, struct aiocb * const *uacb_list, 2220 struct aiocb **acb_list, int nent, struct sigevent *sig, 2221 struct aiocb_ops *ops) 2222 { 2223 struct proc *p = td->td_proc; 2224 struct aiocb *job; 2225 struct kaioinfo *ki; 2226 struct aioliojob *lj; 2227 struct kevent kev; 2228 int error; 2229 int nagain, nerror; 2230 int i; 2231 2232 if ((mode != LIO_NOWAIT) && (mode != LIO_WAIT)) 2233 return (EINVAL); 2234 2235 if (nent < 0 || nent > max_aio_queue_per_proc) 2236 return (EINVAL); 2237 2238 if (p->p_aioinfo == NULL) 2239 aio_init_aioinfo(p); 2240 2241 ki = p->p_aioinfo; 2242 2243 lj = uma_zalloc(aiolio_zone, M_WAITOK); 2244 lj->lioj_flags = 0; 2245 lj->lioj_count = 0; 2246 lj->lioj_finished_count = 0; 2247 lj->lioj_signal.sigev_notify = SIGEV_NONE; 2248 knlist_init_mtx(&lj->klist, AIO_MTX(ki)); 2249 ksiginfo_init(&lj->lioj_ksi); 2250 2251 /* 2252 * Setup signal. 
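 *
 * Illustrative userland view of the sigevent parsed here (an assumed
 * typical LIO_NOWAIT submission, not part of this file):
 *
 *	struct sigevent se;
 *	memset(&se, 0, sizeof(se));
 *	se.sigev_notify = SIGEV_SIGNAL;
 *	se.sigev_signo = SIGUSR1;
 *	lio_listio(LIO_NOWAIT, list, nent, &se);
 *
 * SIGEV_KEVENT instead delivers completion through the kqueue named in
 * sigev_notify_kqueue, and SIGEV_NONE asks for no notification at all.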
2253 */ 2254 if (sig && (mode == LIO_NOWAIT)) { 2255 bcopy(sig, &lj->lioj_signal, sizeof(lj->lioj_signal)); 2256 if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { 2257 /* Assume only new style KEVENT */ 2258 memset(&kev, 0, sizeof(kev)); 2259 kev.filter = EVFILT_LIO; 2260 kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1; 2261 kev.ident = (uintptr_t)uacb_list; /* something unique */ 2262 kev.data = (intptr_t)lj; 2263 /* pass user defined sigval data */ 2264 kev.udata = lj->lioj_signal.sigev_value.sival_ptr; 2265 error = kqfd_register( 2266 lj->lioj_signal.sigev_notify_kqueue, &kev, td, 2267 M_WAITOK); 2268 if (error) { 2269 uma_zfree(aiolio_zone, lj); 2270 return (error); 2271 } 2272 } else if (lj->lioj_signal.sigev_notify == SIGEV_NONE) { 2273 ; 2274 } else if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || 2275 lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID) { 2276 if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) { 2277 uma_zfree(aiolio_zone, lj); 2278 return EINVAL; 2279 } 2280 lj->lioj_flags |= LIOJ_SIGNAL; 2281 } else { 2282 uma_zfree(aiolio_zone, lj); 2283 return EINVAL; 2284 } 2285 } 2286 2287 AIO_LOCK(ki); 2288 TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list); 2289 /* 2290 * Add extra aiocb count to avoid the lio to be freed 2291 * by other threads doing aio_waitcomplete or aio_return, 2292 * and prevent event from being sent until we have queued 2293 * all tasks. 2294 */ 2295 lj->lioj_count = 1; 2296 AIO_UNLOCK(ki); 2297 2298 /* 2299 * Get pointers to the list of I/O requests. 2300 */ 2301 nagain = 0; 2302 nerror = 0; 2303 for (i = 0; i < nent; i++) { 2304 job = acb_list[i]; 2305 if (job != NULL) { 2306 error = aio_aqueue(td, job, lj, LIO_NOP, ops); 2307 if (error == EAGAIN) 2308 nagain++; 2309 else if (error != 0) 2310 nerror++; 2311 } 2312 } 2313 2314 error = 0; 2315 AIO_LOCK(ki); 2316 if (mode == LIO_WAIT) { 2317 while (lj->lioj_count - 1 != lj->lioj_finished_count) { 2318 ki->kaio_flags |= KAIO_WAKEUP; 2319 error = msleep(&p->p_aioinfo, AIO_MTX(ki), 2320 PRIBIO | PCATCH, "aiospn", 0); 2321 if (error == ERESTART) 2322 error = EINTR; 2323 if (error) 2324 break; 2325 } 2326 } else { 2327 if (lj->lioj_count - 1 == lj->lioj_finished_count) { 2328 if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { 2329 lj->lioj_flags |= LIOJ_KEVENT_POSTED; 2330 KNOTE_LOCKED(&lj->klist, 1); 2331 } 2332 if ((lj->lioj_flags & (LIOJ_SIGNAL | 2333 LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL && 2334 (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || 2335 lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) { 2336 aio_sendsig(p, &lj->lioj_signal, &lj->lioj_ksi, 2337 lj->lioj_count != 1); 2338 lj->lioj_flags |= LIOJ_SIGNAL_POSTED; 2339 } 2340 } 2341 } 2342 lj->lioj_count--; 2343 if (lj->lioj_count == 0) { 2344 TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); 2345 knlist_delete(&lj->klist, curthread, 1); 2346 PROC_LOCK(p); 2347 sigqueue_take(&lj->lioj_ksi); 2348 PROC_UNLOCK(p); 2349 AIO_UNLOCK(ki); 2350 uma_zfree(aiolio_zone, lj); 2351 } else 2352 AIO_UNLOCK(ki); 2353 2354 if (nerror) 2355 return (EIO); 2356 else if (nagain) 2357 return (EAGAIN); 2358 else 2359 return (error); 2360 } 2361 2362 /* syscall - list directed I/O (REALTIME) */ 2363 #ifdef COMPAT_FREEBSD6 2364 int 2365 freebsd6_lio_listio(struct thread *td, struct freebsd6_lio_listio_args *uap) 2366 { 2367 struct aiocb **acb_list; 2368 struct sigevent *sigp, sig; 2369 struct osigevent osig; 2370 int error, nent; 2371 2372 if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) 2373 return (EINVAL); 2374 2375 nent = uap->nent; 2376 if (nent < 0 || nent > 
max_aio_queue_per_proc) 2377 return (EINVAL); 2378 2379 if (uap->sig && (uap->mode == LIO_NOWAIT)) { 2380 error = copyin(uap->sig, &osig, sizeof(osig)); 2381 if (error) 2382 return (error); 2383 error = convert_old_sigevent(&osig, &sig); 2384 if (error) 2385 return (error); 2386 sigp = &sig; 2387 } else 2388 sigp = NULL; 2389 2390 acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); 2391 error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0])); 2392 if (error == 0) 2393 error = kern_lio_listio(td, uap->mode, 2394 (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, 2395 &aiocb_ops_osigevent); 2396 free(acb_list, M_LIO); 2397 return (error); 2398 } 2399 #endif 2400 2401 /* syscall - list directed I/O (REALTIME) */ 2402 int 2403 sys_lio_listio(struct thread *td, struct lio_listio_args *uap) 2404 { 2405 struct aiocb **acb_list; 2406 struct sigevent *sigp, sig; 2407 int error, nent; 2408 2409 if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) 2410 return (EINVAL); 2411 2412 nent = uap->nent; 2413 if (nent < 0 || nent > max_aio_queue_per_proc) 2414 return (EINVAL); 2415 2416 if (uap->sig && (uap->mode == LIO_NOWAIT)) { 2417 error = copyin(uap->sig, &sig, sizeof(sig)); 2418 if (error) 2419 return (error); 2420 sigp = &sig; 2421 } else 2422 sigp = NULL; 2423 2424 acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); 2425 error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0])); 2426 if (error == 0) 2427 error = kern_lio_listio(td, uap->mode, uap->acb_list, acb_list, 2428 nent, sigp, &aiocb_ops); 2429 free(acb_list, M_LIO); 2430 return (error); 2431 } 2432 2433 static void 2434 aio_biocleanup(struct bio *bp) 2435 { 2436 struct kaiocb *job = (struct kaiocb *)bp->bio_caller1; 2437 struct kaioinfo *ki; 2438 struct buf *pbuf = (struct buf *)bp->bio_caller2; 2439 2440 /* Release mapping into kernel space. */ 2441 if (pbuf != NULL) { 2442 MPASS(pbuf->b_npages <= atop(maxphys) + 1); 2443 pmap_qremove((vm_offset_t)pbuf->b_data, pbuf->b_npages); 2444 vm_page_unhold_pages(pbuf->b_pages, pbuf->b_npages); 2445 uma_zfree(pbuf_zone, pbuf); 2446 atomic_subtract_int(&num_buf_aio, 1); 2447 ki = job->userproc->p_aioinfo; 2448 AIO_LOCK(ki); 2449 ki->kaio_buffer_count--; 2450 AIO_UNLOCK(ki); 2451 } else { 2452 MPASS(bp->bio_ma_n <= atop(maxphys) + 1); 2453 vm_page_unhold_pages(bp->bio_ma, bp->bio_ma_n); 2454 free(bp->bio_ma, M_TEMP); 2455 atomic_subtract_int(&num_unmapped_aio, 1); 2456 } 2457 g_destroy_bio(bp); 2458 } 2459 2460 static void 2461 aio_biowakeup(struct bio *bp) 2462 { 2463 struct kaiocb *job = (struct kaiocb *)bp->bio_caller1; 2464 size_t nbytes; 2465 long bcount = bp->bio_bcount; 2466 long resid = bp->bio_resid; 2467 int opcode, nblks; 2468 int bio_error = bp->bio_error; 2469 uint16_t flags = bp->bio_flags; 2470 2471 opcode = job->uaiocb.aio_lio_opcode; 2472 2473 aio_biocleanup(bp); 2474 2475 nbytes = bcount - resid; 2476 atomic_add_acq_long(&job->nbytes, nbytes); 2477 nblks = btodb(nbytes); 2478 2479 /* 2480 * If multiple bios experienced an error, the job will reflect the 2481 * error of whichever failed bio completed last. 
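 * (Each failing bio overwrites the job's error word with a plain
 * atomic_store_int(), so the last failure to complete wins; the byte
 * count, in contrast, is accumulated across all of the job's bios.)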
2482 */ 2483 if (flags & BIO_ERROR) 2484 atomic_store_int(&job->error, bio_error); 2485 if (opcode & LIO_WRITE) 2486 atomic_add_int(&job->outblock, nblks); 2487 else 2488 atomic_add_int(&job->inblock, nblks); 2489 2490 if (refcount_release(&job->nbio)) { 2491 bio_error = atomic_load_int(&job->error); 2492 if (bio_error != 0) 2493 aio_complete(job, -1, bio_error); 2494 else 2495 aio_complete(job, atomic_load_long(&job->nbytes), 0); 2496 } 2497 } 2498 2499 /* syscall - wait for the next completion of an aio request */ 2500 static int 2501 kern_aio_waitcomplete(struct thread *td, struct aiocb **ujobp, 2502 struct timespec *ts, struct aiocb_ops *ops) 2503 { 2504 struct proc *p = td->td_proc; 2505 struct timeval atv; 2506 struct kaioinfo *ki; 2507 struct kaiocb *job; 2508 struct aiocb *ujob; 2509 long error, status; 2510 int timo; 2511 2512 ops->store_aiocb(ujobp, NULL); 2513 2514 if (ts == NULL) { 2515 timo = 0; 2516 } else if (ts->tv_sec == 0 && ts->tv_nsec == 0) { 2517 timo = -1; 2518 } else { 2519 if ((ts->tv_nsec < 0) || (ts->tv_nsec >= 1000000000)) 2520 return (EINVAL); 2521 2522 TIMESPEC_TO_TIMEVAL(&atv, ts); 2523 if (itimerfix(&atv)) 2524 return (EINVAL); 2525 timo = tvtohz(&atv); 2526 } 2527 2528 if (p->p_aioinfo == NULL) 2529 aio_init_aioinfo(p); 2530 ki = p->p_aioinfo; 2531 2532 error = 0; 2533 job = NULL; 2534 AIO_LOCK(ki); 2535 while ((job = TAILQ_FIRST(&ki->kaio_done)) == NULL) { 2536 if (timo == -1) { 2537 error = EWOULDBLOCK; 2538 break; 2539 } 2540 ki->kaio_flags |= KAIO_WAKEUP; 2541 error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH, 2542 "aiowc", timo); 2543 if (timo && error == ERESTART) 2544 error = EINTR; 2545 if (error) 2546 break; 2547 } 2548 2549 if (job != NULL) { 2550 MPASS(job->jobflags & KAIOCB_FINISHED); 2551 ujob = job->ujob; 2552 status = job->uaiocb._aiocb_private.status; 2553 error = job->uaiocb._aiocb_private.error; 2554 td->td_retval[0] = status; 2555 td->td_ru.ru_oublock += job->outblock; 2556 td->td_ru.ru_inblock += job->inblock; 2557 td->td_ru.ru_msgsnd += job->msgsnd; 2558 td->td_ru.ru_msgrcv += job->msgrcv; 2559 aio_free_entry(job); 2560 AIO_UNLOCK(ki); 2561 ops->store_aiocb(ujobp, ujob); 2562 ops->store_error(ujob, error); 2563 ops->store_status(ujob, status); 2564 } else 2565 AIO_UNLOCK(ki); 2566 2567 return (error); 2568 } 2569 2570 int 2571 sys_aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap) 2572 { 2573 struct timespec ts, *tsp; 2574 int error; 2575 2576 if (uap->timeout) { 2577 /* Get timespec struct. 
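 * A NULL timeout makes kern_aio_waitcomplete() block until some request
 * completes; an all-zero timespec turns it into a poll that returns
 * EWOULDBLOCK when nothing has finished yet.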
*/ 2578 error = copyin(uap->timeout, &ts, sizeof(ts)); 2579 if (error) 2580 return (error); 2581 tsp = &ts; 2582 } else 2583 tsp = NULL; 2584 2585 return (kern_aio_waitcomplete(td, uap->aiocbp, tsp, &aiocb_ops)); 2586 } 2587 2588 static int 2589 kern_aio_fsync(struct thread *td, int op, struct aiocb *ujob, 2590 struct aiocb_ops *ops) 2591 { 2592 int listop; 2593 2594 switch (op) { 2595 case O_SYNC: 2596 listop = LIO_SYNC; 2597 break; 2598 case O_DSYNC: 2599 listop = LIO_DSYNC; 2600 break; 2601 default: 2602 return (EINVAL); 2603 } 2604 2605 return (aio_aqueue(td, ujob, NULL, listop, ops)); 2606 } 2607 2608 int 2609 sys_aio_fsync(struct thread *td, struct aio_fsync_args *uap) 2610 { 2611 2612 return (kern_aio_fsync(td, uap->op, uap->aiocbp, &aiocb_ops)); 2613 } 2614 2615 /* kqueue attach function */ 2616 static int 2617 filt_aioattach(struct knote *kn) 2618 { 2619 struct kaiocb *job; 2620 2621 job = (struct kaiocb *)(uintptr_t)kn->kn_sdata; 2622 2623 /* 2624 * The job pointer must be validated before using it, so 2625 * registration is restricted to the kernel; the user cannot 2626 * set EV_FLAG1. 2627 */ 2628 if ((kn->kn_flags & EV_FLAG1) == 0) 2629 return (EPERM); 2630 kn->kn_ptr.p_aio = job; 2631 kn->kn_flags &= ~EV_FLAG1; 2632 2633 knlist_add(&job->klist, kn, 0); 2634 2635 return (0); 2636 } 2637 2638 /* kqueue detach function */ 2639 static void 2640 filt_aiodetach(struct knote *kn) 2641 { 2642 struct knlist *knl; 2643 2644 knl = &kn->kn_ptr.p_aio->klist; 2645 knl->kl_lock(knl->kl_lockarg); 2646 if (!knlist_empty(knl)) 2647 knlist_remove(knl, kn, 1); 2648 knl->kl_unlock(knl->kl_lockarg); 2649 } 2650 2651 /* kqueue filter function */ 2652 /*ARGSUSED*/ 2653 static int 2654 filt_aio(struct knote *kn, long hint) 2655 { 2656 struct kaiocb *job = kn->kn_ptr.p_aio; 2657 2658 kn->kn_data = job->uaiocb._aiocb_private.error; 2659 if (!(job->jobflags & KAIOCB_FINISHED)) 2660 return (0); 2661 kn->kn_flags |= EV_EOF; 2662 return (1); 2663 } 2664 2665 /* kqueue attach function */ 2666 static int 2667 filt_lioattach(struct knote *kn) 2668 { 2669 struct aioliojob *lj; 2670 2671 lj = (struct aioliojob *)(uintptr_t)kn->kn_sdata; 2672 2673 /* 2674 * The aioliojob pointer must be validated before using it, so 2675 * registration is restricted to the kernel; the user cannot 2676 * set EV_FLAG1. 
2677 */ 2678 if ((kn->kn_flags & EV_FLAG1) == 0) 2679 return (EPERM); 2680 kn->kn_ptr.p_lio = lj; 2681 kn->kn_flags &= ~EV_FLAG1; 2682 2683 knlist_add(&lj->klist, kn, 0); 2684 2685 return (0); 2686 } 2687 2688 /* kqueue detach function */ 2689 static void 2690 filt_liodetach(struct knote *kn) 2691 { 2692 struct knlist *knl; 2693 2694 knl = &kn->kn_ptr.p_lio->klist; 2695 knl->kl_lock(knl->kl_lockarg); 2696 if (!knlist_empty(knl)) 2697 knlist_remove(knl, kn, 1); 2698 knl->kl_unlock(knl->kl_lockarg); 2699 } 2700 2701 /* kqueue filter function */ 2702 /*ARGSUSED*/ 2703 static int 2704 filt_lio(struct knote *kn, long hint) 2705 { 2706 struct aioliojob * lj = kn->kn_ptr.p_lio; 2707 2708 return (lj->lioj_flags & LIOJ_KEVENT_POSTED); 2709 } 2710 2711 #ifdef COMPAT_FREEBSD32 2712 #include <sys/mount.h> 2713 #include <sys/socket.h> 2714 #include <sys/sysent.h> 2715 #include <compat/freebsd32/freebsd32.h> 2716 #include <compat/freebsd32/freebsd32_proto.h> 2717 #include <compat/freebsd32/freebsd32_signal.h> 2718 #include <compat/freebsd32/freebsd32_syscall.h> 2719 #include <compat/freebsd32/freebsd32_util.h> 2720 2721 struct __aiocb_private32 { 2722 int32_t status; 2723 int32_t error; 2724 uint32_t kernelinfo; 2725 }; 2726 2727 #ifdef COMPAT_FREEBSD6 2728 typedef struct oaiocb32 { 2729 int aio_fildes; /* File descriptor */ 2730 uint64_t aio_offset __packed; /* File offset for I/O */ 2731 uint32_t aio_buf; /* I/O buffer in process space */ 2732 uint32_t aio_nbytes; /* Number of bytes for I/O */ 2733 struct osigevent32 aio_sigevent; /* Signal to deliver */ 2734 int aio_lio_opcode; /* LIO opcode */ 2735 int aio_reqprio; /* Request priority -- ignored */ 2736 struct __aiocb_private32 _aiocb_private; 2737 } oaiocb32_t; 2738 #endif 2739 2740 typedef struct aiocb32 { 2741 int32_t aio_fildes; /* File descriptor */ 2742 uint64_t aio_offset __packed; /* File offset for I/O */ 2743 uint32_t aio_buf; /* I/O buffer in process space */ 2744 uint32_t aio_nbytes; /* Number of bytes for I/O */ 2745 int __spare__[2]; 2746 uint32_t __spare2__; 2747 int aio_lio_opcode; /* LIO opcode */ 2748 int aio_reqprio; /* Request priority -- ignored */ 2749 struct __aiocb_private32 _aiocb_private; 2750 struct sigevent32 aio_sigevent; /* Signal to deliver */ 2751 } aiocb32_t; 2752 2753 #ifdef COMPAT_FREEBSD6 2754 static int 2755 convert_old_sigevent32(struct osigevent32 *osig, struct sigevent *nsig) 2756 { 2757 2758 /* 2759 * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are 2760 * supported by AIO with the old sigevent structure. 
2761 */ 2762 CP(*osig, *nsig, sigev_notify); 2763 switch (nsig->sigev_notify) { 2764 case SIGEV_NONE: 2765 break; 2766 case SIGEV_SIGNAL: 2767 nsig->sigev_signo = osig->__sigev_u.__sigev_signo; 2768 break; 2769 case SIGEV_KEVENT: 2770 nsig->sigev_notify_kqueue = 2771 osig->__sigev_u.__sigev_notify_kqueue; 2772 PTRIN_CP(*osig, *nsig, sigev_value.sival_ptr); 2773 break; 2774 default: 2775 return (EINVAL); 2776 } 2777 return (0); 2778 } 2779 2780 static int 2781 aiocb32_copyin_old_sigevent(struct aiocb *ujob, struct kaiocb *kjob, 2782 int type __unused) 2783 { 2784 struct oaiocb32 job32; 2785 struct aiocb *kcb = &kjob->uaiocb; 2786 int error; 2787 2788 bzero(kcb, sizeof(struct aiocb)); 2789 error = copyin(ujob, &job32, sizeof(job32)); 2790 if (error) 2791 return (error); 2792 2793 /* No need to copyin aio_iov, because it did not exist in FreeBSD 6 */ 2794 2795 CP(job32, *kcb, aio_fildes); 2796 CP(job32, *kcb, aio_offset); 2797 PTRIN_CP(job32, *kcb, aio_buf); 2798 CP(job32, *kcb, aio_nbytes); 2799 CP(job32, *kcb, aio_lio_opcode); 2800 CP(job32, *kcb, aio_reqprio); 2801 CP(job32, *kcb, _aiocb_private.status); 2802 CP(job32, *kcb, _aiocb_private.error); 2803 PTRIN_CP(job32, *kcb, _aiocb_private.kernelinfo); 2804 return (convert_old_sigevent32(&job32.aio_sigevent, 2805 &kcb->aio_sigevent)); 2806 } 2807 #endif 2808 2809 static int 2810 aiocb32_copyin(struct aiocb *ujob, struct kaiocb *kjob, int type) 2811 { 2812 struct aiocb32 job32; 2813 struct aiocb *kcb = &kjob->uaiocb; 2814 struct iovec32 *iov32; 2815 int error; 2816 2817 error = copyin(ujob, &job32, sizeof(job32)); 2818 if (error) 2819 return (error); 2820 CP(job32, *kcb, aio_fildes); 2821 CP(job32, *kcb, aio_offset); 2822 CP(job32, *kcb, aio_lio_opcode); 2823 if (type == LIO_NOP) 2824 type = kcb->aio_lio_opcode; 2825 if (type & LIO_VECTORED) { 2826 iov32 = PTRIN(job32.aio_iov); 2827 CP(job32, *kcb, aio_iovcnt); 2828 /* malloc a uio and copy in the iovec */ 2829 error = freebsd32_copyinuio(iov32, 2830 kcb->aio_iovcnt, &kjob->uiop); 2831 if (error) 2832 return (error); 2833 } else { 2834 PTRIN_CP(job32, *kcb, aio_buf); 2835 CP(job32, *kcb, aio_nbytes); 2836 } 2837 CP(job32, *kcb, aio_reqprio); 2838 CP(job32, *kcb, _aiocb_private.status); 2839 CP(job32, *kcb, _aiocb_private.error); 2840 PTRIN_CP(job32, *kcb, _aiocb_private.kernelinfo); 2841 error = convert_sigevent32(&job32.aio_sigevent, &kcb->aio_sigevent); 2842 2843 return (error); 2844 } 2845 2846 static long 2847 aiocb32_fetch_status(struct aiocb *ujob) 2848 { 2849 struct aiocb32 *ujob32; 2850 2851 ujob32 = (struct aiocb32 *)ujob; 2852 return (fuword32(&ujob32->_aiocb_private.status)); 2853 } 2854 2855 static long 2856 aiocb32_fetch_error(struct aiocb *ujob) 2857 { 2858 struct aiocb32 *ujob32; 2859 2860 ujob32 = (struct aiocb32 *)ujob; 2861 return (fuword32(&ujob32->_aiocb_private.error)); 2862 } 2863 2864 static int 2865 aiocb32_store_status(struct aiocb *ujob, long status) 2866 { 2867 struct aiocb32 *ujob32; 2868 2869 ujob32 = (struct aiocb32 *)ujob; 2870 return (suword32(&ujob32->_aiocb_private.status, status)); 2871 } 2872 2873 static int 2874 aiocb32_store_error(struct aiocb *ujob, long error) 2875 { 2876 struct aiocb32 *ujob32; 2877 2878 ujob32 = (struct aiocb32 *)ujob; 2879 return (suword32(&ujob32->_aiocb_private.error, error)); 2880 } 2881 2882 static int 2883 aiocb32_store_kernelinfo(struct aiocb *ujob, long jobref) 2884 { 2885 struct aiocb32 *ujob32; 2886 2887 ujob32 = (struct aiocb32 *)ujob; 2888 return (suword32(&ujob32->_aiocb_private.kernelinfo, jobref)); 2889 } 2890 2891 
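/*
 * Note: the accessors above and aiocb32_store_aiocb() below go through
 * fuword32()/suword32() so that only 32-bit-wide words are read from or
 * written to the compat process's aiocb32 layout.
 */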
static int 2892 aiocb32_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob) 2893 { 2894 2895 return (suword32(ujobp, (long)ujob)); 2896 } 2897 2898 static struct aiocb_ops aiocb32_ops = { 2899 .aio_copyin = aiocb32_copyin, 2900 .fetch_status = aiocb32_fetch_status, 2901 .fetch_error = aiocb32_fetch_error, 2902 .store_status = aiocb32_store_status, 2903 .store_error = aiocb32_store_error, 2904 .store_kernelinfo = aiocb32_store_kernelinfo, 2905 .store_aiocb = aiocb32_store_aiocb, 2906 }; 2907 2908 #ifdef COMPAT_FREEBSD6 2909 static struct aiocb_ops aiocb32_ops_osigevent = { 2910 .aio_copyin = aiocb32_copyin_old_sigevent, 2911 .fetch_status = aiocb32_fetch_status, 2912 .fetch_error = aiocb32_fetch_error, 2913 .store_status = aiocb32_store_status, 2914 .store_error = aiocb32_store_error, 2915 .store_kernelinfo = aiocb32_store_kernelinfo, 2916 .store_aiocb = aiocb32_store_aiocb, 2917 }; 2918 #endif 2919 2920 int 2921 freebsd32_aio_return(struct thread *td, struct freebsd32_aio_return_args *uap) 2922 { 2923 2924 return (kern_aio_return(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops)); 2925 } 2926 2927 int 2928 freebsd32_aio_suspend(struct thread *td, struct freebsd32_aio_suspend_args *uap) 2929 { 2930 struct timespec32 ts32; 2931 struct timespec ts, *tsp; 2932 struct aiocb **ujoblist; 2933 uint32_t *ujoblist32; 2934 int error, i; 2935 2936 if (uap->nent < 0 || uap->nent > max_aio_queue_per_proc) 2937 return (EINVAL); 2938 2939 if (uap->timeout) { 2940 /* Get timespec struct. */ 2941 if ((error = copyin(uap->timeout, &ts32, sizeof(ts32))) != 0) 2942 return (error); 2943 CP(ts32, ts, tv_sec); 2944 CP(ts32, ts, tv_nsec); 2945 tsp = &ts; 2946 } else 2947 tsp = NULL; 2948 2949 ujoblist = malloc(uap->nent * sizeof(ujoblist[0]), M_AIO, M_WAITOK); 2950 ujoblist32 = (uint32_t *)ujoblist; 2951 error = copyin(uap->aiocbp, ujoblist32, uap->nent * 2952 sizeof(ujoblist32[0])); 2953 if (error == 0) { 2954 for (i = uap->nent - 1; i >= 0; i--) 2955 ujoblist[i] = PTRIN(ujoblist32[i]); 2956 2957 error = kern_aio_suspend(td, uap->nent, ujoblist, tsp); 2958 } 2959 free(ujoblist, M_AIO); 2960 return (error); 2961 } 2962 2963 int 2964 freebsd32_aio_error(struct thread *td, struct freebsd32_aio_error_args *uap) 2965 { 2966 2967 return (kern_aio_error(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops)); 2968 } 2969 2970 #ifdef COMPAT_FREEBSD6 2971 int 2972 freebsd6_freebsd32_aio_read(struct thread *td, 2973 struct freebsd6_freebsd32_aio_read_args *uap) 2974 { 2975 2976 return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, 2977 &aiocb32_ops_osigevent)); 2978 } 2979 #endif 2980 2981 int 2982 freebsd32_aio_read(struct thread *td, struct freebsd32_aio_read_args *uap) 2983 { 2984 2985 return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, 2986 &aiocb32_ops)); 2987 } 2988 2989 int 2990 freebsd32_aio_readv(struct thread *td, struct freebsd32_aio_readv_args *uap) 2991 { 2992 2993 return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READV, 2994 &aiocb32_ops)); 2995 } 2996 2997 #ifdef COMPAT_FREEBSD6 2998 int 2999 freebsd6_freebsd32_aio_write(struct thread *td, 3000 struct freebsd6_freebsd32_aio_write_args *uap) 3001 { 3002 3003 return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, 3004 &aiocb32_ops_osigevent)); 3005 } 3006 #endif 3007 3008 int 3009 freebsd32_aio_write(struct thread *td, struct freebsd32_aio_write_args *uap) 3010 { 3011 3012 return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, 3013 &aiocb32_ops)); 3014 } 3015 3016 int 3017 freebsd32_aio_writev(struct 
thread *td, struct freebsd32_aio_writev_args *uap) 3018 { 3019 3020 return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITEV, 3021 &aiocb32_ops)); 3022 } 3023 3024 int 3025 freebsd32_aio_mlock(struct thread *td, struct freebsd32_aio_mlock_args *uap) 3026 { 3027 3028 return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_MLOCK, 3029 &aiocb32_ops)); 3030 } 3031 3032 int 3033 freebsd32_aio_waitcomplete(struct thread *td, 3034 struct freebsd32_aio_waitcomplete_args *uap) 3035 { 3036 struct timespec32 ts32; 3037 struct timespec ts, *tsp; 3038 int error; 3039 3040 if (uap->timeout) { 3041 /* Get timespec struct. */ 3042 error = copyin(uap->timeout, &ts32, sizeof(ts32)); 3043 if (error) 3044 return (error); 3045 CP(ts32, ts, tv_sec); 3046 CP(ts32, ts, tv_nsec); 3047 tsp = &ts; 3048 } else 3049 tsp = NULL; 3050 3051 return (kern_aio_waitcomplete(td, (struct aiocb **)uap->aiocbp, tsp, 3052 &aiocb32_ops)); 3053 } 3054 3055 int 3056 freebsd32_aio_fsync(struct thread *td, struct freebsd32_aio_fsync_args *uap) 3057 { 3058 3059 return (kern_aio_fsync(td, uap->op, (struct aiocb *)uap->aiocbp, 3060 &aiocb32_ops)); 3061 } 3062 3063 #ifdef COMPAT_FREEBSD6 3064 int 3065 freebsd6_freebsd32_lio_listio(struct thread *td, 3066 struct freebsd6_freebsd32_lio_listio_args *uap) 3067 { 3068 struct aiocb **acb_list; 3069 struct sigevent *sigp, sig; 3070 struct osigevent32 osig; 3071 uint32_t *acb_list32; 3072 int error, i, nent; 3073 3074 if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) 3075 return (EINVAL); 3076 3077 nent = uap->nent; 3078 if (nent < 0 || nent > max_aio_queue_per_proc) 3079 return (EINVAL); 3080 3081 if (uap->sig && (uap->mode == LIO_NOWAIT)) { 3082 error = copyin(uap->sig, &osig, sizeof(osig)); 3083 if (error) 3084 return (error); 3085 error = convert_old_sigevent32(&osig, &sig); 3086 if (error) 3087 return (error); 3088 sigp = &sig; 3089 } else 3090 sigp = NULL; 3091 3092 acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK); 3093 error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t)); 3094 if (error) { 3095 free(acb_list32, M_LIO); 3096 return (error); 3097 } 3098 acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); 3099 for (i = 0; i < nent; i++) 3100 acb_list[i] = PTRIN(acb_list32[i]); 3101 free(acb_list32, M_LIO); 3102 3103 error = kern_lio_listio(td, uap->mode, 3104 (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, 3105 &aiocb32_ops_osigevent); 3106 free(acb_list, M_LIO); 3107 return (error); 3108 } 3109 #endif 3110 3111 int 3112 freebsd32_lio_listio(struct thread *td, struct freebsd32_lio_listio_args *uap) 3113 { 3114 struct aiocb **acb_list; 3115 struct sigevent *sigp, sig; 3116 struct sigevent32 sig32; 3117 uint32_t *acb_list32; 3118 int error, i, nent; 3119 3120 if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) 3121 return (EINVAL); 3122 3123 nent = uap->nent; 3124 if (nent < 0 || nent > max_aio_queue_per_proc) 3125 return (EINVAL); 3126 3127 if (uap->sig && (uap->mode == LIO_NOWAIT)) { 3128 error = copyin(uap->sig, &sig32, sizeof(sig32)); 3129 if (error) 3130 return (error); 3131 error = convert_sigevent32(&sig32, &sig); 3132 if (error) 3133 return (error); 3134 sigp = &sig; 3135 } else 3136 sigp = NULL; 3137 3138 acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK); 3139 error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t)); 3140 if (error) { 3141 free(acb_list32, M_LIO); 3142 return (error); 3143 } 3144 acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); 3145 for (i = 0; 
i < nent; i++) 3146 acb_list[i] = PTRIN(acb_list32[i]); 3147 free(acb_list32, M_LIO); 3148 3149 error = kern_lio_listio(td, uap->mode, 3150 (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, 3151 &aiocb32_ops); 3152 free(acb_list, M_LIO); 3153 return (error); 3154 } 3155 3156 #endif 3157
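/*
 * Illustrative end-to-end userland use of the facility implemented above
 * (a sketch under typical-use assumptions; fd and buf are caller-supplied
 * and error handling is abbreviated):
 *
 *	struct aiocb cb;
 *	memset(&cb, 0, sizeof(cb));
 *	cb.aio_fildes = fd;
 *	cb.aio_buf = buf;
 *	cb.aio_nbytes = sizeof(buf);
 *	cb.aio_offset = 0;
 *	if (aio_read(&cb) == -1)
 *		err(1, "aio_read");
 *	while (aio_error(&cb) == EINPROGRESS)
 *		usleep(1000);
 *	ssize_t done = aio_return(&cb);
 */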