/*-
 * Copyright (c) 2006, 2011 Robert N. M. Watson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Support for shared swap-backed anonymous memory objects via
 * shm_open(2) and shm_unlink(2).  While most of the implementation is
 * here, vm_mmap.c contains mapping logic changes.
 *
 * TODO:
 *
 * (1) Need to export data to a userland tool via a sysctl.  Should ipcs(1)
 *     and ipcrm(1) be expanded or should new tools to manage both POSIX
 *     kernel semaphores and POSIX shared memory be written?
 *
 * (2) Add support for this file type to fstat(1).
 *
 * (3) Resource limits?  Does this need its own resource limits or are the
 *     existing limits in mmap(2) sufficient?
 */
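/*
 * Illustrative sketch (not part of this file): typical userland use of
 * this facility, with error handling omitted for brevity.  The "/example"
 * name is hypothetical; named objects must start with a '/' character.
 *
 *	int fd = shm_open("/example", O_RDWR | O_CREAT, 0600);
 *	ftruncate(fd, getpagesize());
 *	void *p = mmap(NULL, getpagesize(), PROT_READ | PROT_WRITE,
 *	    MAP_SHARED, fd, 0);
 *	...
 *	munmap(p, getpagesize());
 *	close(fd);
 *	shm_unlink("/example");
 *
 * Passing SHM_ANON instead of a path creates an anonymous object that
 * cannot be looked up by name and is destroyed when its last reference
 * is dropped.
 */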
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_capsicum.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/capsicum.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/kernel.h>
#include <sys/uio.h>
#include <sys/signal.h>
#include <sys/ktrace.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/refcount.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/stat.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <sys/sx.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/unistd.h>
#include <sys/user.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_extern.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>

struct shm_mapping {
	char		*sm_path;
	Fnv32_t		sm_fnv;
	struct shmfd	*sm_shmfd;
	LIST_ENTRY(shm_mapping) sm_link;
};

static MALLOC_DEFINE(M_SHMFD, "shmfd", "shared memory file descriptor");
static LIST_HEAD(, shm_mapping) *shm_dictionary;
static struct sx shm_dict_lock;
static struct mtx shm_timestamp_lock;
static u_long shm_hash;
static struct unrhdr *shm_ino_unr;
static dev_t shm_dev_ino;

#define	SHM_HASH(fnv)	(&shm_dictionary[(fnv) & shm_hash])

static int	shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags);
static struct shmfd *shm_alloc(struct ucred *ucred, mode_t mode);
static void	shm_init(void *arg);
static void	shm_drop(struct shmfd *shmfd);
static struct shmfd *shm_hold(struct shmfd *shmfd);
static void	shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd);
static struct shmfd *shm_lookup(char *path, Fnv32_t fnv);
static int	shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred);
static int	shm_dotruncate(struct shmfd *shmfd, off_t length);

static fo_rdwr_t	shm_read;
static fo_rdwr_t	shm_write;
static fo_truncate_t	shm_truncate;
static fo_stat_t	shm_stat;
static fo_close_t	shm_close;
static fo_chmod_t	shm_chmod;
static fo_chown_t	shm_chown;
static fo_seek_t	shm_seek;
static fo_fill_kinfo_t	shm_fill_kinfo;
static fo_mmap_t	shm_mmap;

/* File descriptor operations. */
static struct fileops shm_ops = {
	.fo_read = shm_read,
	.fo_write = shm_write,
	.fo_truncate = shm_truncate,
	.fo_ioctl = invfo_ioctl,
	.fo_poll = invfo_poll,
	.fo_kqfilter = invfo_kqfilter,
	.fo_stat = shm_stat,
	.fo_close = shm_close,
	.fo_chmod = shm_chmod,
	.fo_chown = shm_chown,
	.fo_sendfile = vn_sendfile,
	.fo_seek = shm_seek,
	.fo_fill_kinfo = shm_fill_kinfo,
	.fo_mmap = shm_mmap,
	.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
};

FEATURE(posix_shm, "POSIX shared memory");

static int
uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio)
{
	vm_page_t m;
	vm_pindex_t idx;
	size_t tlen;
	int error, offset, rv;

	idx = OFF_TO_IDX(uio->uio_offset);
	offset = uio->uio_offset & PAGE_MASK;
	tlen = MIN(PAGE_SIZE - offset, len);

	VM_OBJECT_WLOCK(obj);

	/*
	 * Read I/O without either a corresponding resident page or swap
	 * page: use zero_region.  This is intended to avoid instantiating
	 * pages on read from a sparse region.
	 */
	if (uio->uio_rw == UIO_READ && vm_page_lookup(obj, idx) == NULL &&
	    !vm_pager_has_page(obj, idx, NULL, NULL)) {
		VM_OBJECT_WUNLOCK(obj);
		return (uiomove(__DECONST(void *, zero_region), tlen, uio));
	}

	/*
	 * Parallel reads of the page content from disk are prevented
	 * by exclusive busy.
	 *
	 * Although the tmpfs vnode lock is held here, it is
	 * nonetheless safe to sleep waiting for a free page.  The
	 * pageout daemon does not need to acquire the tmpfs vnode
	 * lock to page out tobj's pages because tobj is a OBJT_SWAP
	 * type object.
	 */
	m = vm_page_grab(obj, idx, VM_ALLOC_NORMAL);
	if (m->valid != VM_PAGE_BITS_ALL) {
		if (vm_pager_has_page(obj, idx, NULL, NULL)) {
			rv = vm_pager_get_pages(obj, &m, 1, NULL, NULL);
			if (rv != VM_PAGER_OK) {
				printf(
	    "uiomove_object: vm_obj %p idx %jd valid %x pager error %d\n",
				    obj, idx, m->valid, rv);
				vm_page_lock(m);
				vm_page_free(m);
				vm_page_unlock(m);
				VM_OBJECT_WUNLOCK(obj);
				return (EIO);
			}
		} else
			vm_page_zero_invalid(m, TRUE);
	}
	vm_page_xunbusy(m);
	vm_page_lock(m);
	vm_page_hold(m);
	if (m->queue == PQ_NONE) {
		vm_page_deactivate(m);
	} else {
		/* Requeue to maintain LRU ordering. */
		vm_page_requeue(m);
	}
	vm_page_unlock(m);
	VM_OBJECT_WUNLOCK(obj);
	error = uiomove_fromphys(&m, offset, tlen, uio);
	if (uio->uio_rw == UIO_WRITE && error == 0) {
		VM_OBJECT_WLOCK(obj);
		vm_page_dirty(m);
		vm_pager_page_unswapped(m);
		VM_OBJECT_WUNLOCK(obj);
	}
	vm_page_lock(m);
	vm_page_unhold(m);
	vm_page_unlock(m);

	return (error);
}
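/*
 * Transfer data between the I/O request described by "uio" and the
 * swap-backed object "obj", one page at a time via uiomove_object_page().
 * The transfer is bounded by obj_size and stops early on error or when
 * no forward progress is made.
 */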
int
uiomove_object(vm_object_t obj, off_t obj_size, struct uio *uio)
{
	ssize_t resid;
	size_t len;
	int error;

	error = 0;
	while ((resid = uio->uio_resid) > 0) {
		if (obj_size <= uio->uio_offset)
			break;
		len = MIN(obj_size - uio->uio_offset, resid);
		if (len == 0)
			break;
		error = uiomove_object_page(obj, len, uio);
		if (error != 0 || resid == uio->uio_resid)
			break;
	}
	return (error);
}

static int
shm_seek(struct file *fp, off_t offset, int whence, struct thread *td)
{
	struct shmfd *shmfd;
	off_t foffset;
	int error;

	shmfd = fp->f_data;
	foffset = foffset_lock(fp, 0);
	error = 0;
	switch (whence) {
	case L_INCR:
		if (foffset < 0 ||
		    (offset > 0 && foffset > OFF_MAX - offset)) {
			error = EOVERFLOW;
			break;
		}
		offset += foffset;
		break;
	case L_XTND:
		if (offset > 0 && shmfd->shm_size > OFF_MAX - offset) {
			error = EOVERFLOW;
			break;
		}
		offset += shmfd->shm_size;
		break;
	case L_SET:
		break;
	default:
		error = EINVAL;
	}
	if (error == 0) {
		if (offset < 0 || offset > shmfd->shm_size)
			error = EINVAL;
		else
			td->td_uretoff.tdu_off = offset;
	}
	foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0);
	return (error);
}

static int
shm_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	struct shmfd *shmfd;
	void *rl_cookie;
	int error;

	shmfd = fp->f_data;
	/*
	 * Perform the MAC check before acquiring the file offset lock and
	 * range lock so that an access-control failure does not return
	 * with them still held; this matches shm_write() below.
	 */
#ifdef MAC
	error = mac_posixshm_check_read(active_cred, fp->f_cred, shmfd);
	if (error)
		return (error);
#endif
	foffset_lock_uio(fp, uio, flags);
	rl_cookie = rangelock_rlock(&shmfd->shm_rl, uio->uio_offset,
	    uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx);
	error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio);
	rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
	foffset_unlock_uio(fp, uio, flags);
	return (error);
}

static int
shm_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	struct shmfd *shmfd;
	void *rl_cookie;
	int error;

	shmfd = fp->f_data;
#ifdef MAC
	error = mac_posixshm_check_write(active_cred, fp->f_cred, shmfd);
	if (error)
		return (error);
#endif
	foffset_lock_uio(fp, uio, flags);
	if ((flags & FOF_OFFSET) == 0) {
		rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX,
		    &shmfd->shm_mtx);
	} else {
		rl_cookie = rangelock_wlock(&shmfd->shm_rl, uio->uio_offset,
		    uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx);
	}

	error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio);
	rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
	foffset_unlock_uio(fp, uio, flags);
	return (error);
}
static int
shm_truncate(struct file *fp, off_t length, struct ucred *active_cred,
    struct thread *td)
{
	struct shmfd *shmfd;
#ifdef MAC
	int error;
#endif

	shmfd = fp->f_data;
#ifdef MAC
	error = mac_posixshm_check_truncate(active_cred, fp->f_cred, shmfd);
	if (error)
		return (error);
#endif
	return (shm_dotruncate(shmfd, length));
}

static int
shm_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
    struct thread *td)
{
	struct shmfd *shmfd;
#ifdef MAC
	int error;
#endif

	shmfd = fp->f_data;

#ifdef MAC
	error = mac_posixshm_check_stat(active_cred, fp->f_cred, shmfd);
	if (error)
		return (error);
#endif

	/*
	 * Attempt to return sanish values for fstat() on a memory file
	 * descriptor.
	 */
	bzero(sb, sizeof(*sb));
	sb->st_blksize = PAGE_SIZE;
	sb->st_size = shmfd->shm_size;
	sb->st_blocks = (sb->st_size + sb->st_blksize - 1) / sb->st_blksize;
	mtx_lock(&shm_timestamp_lock);
	sb->st_atim = shmfd->shm_atime;
	sb->st_ctim = shmfd->shm_ctime;
	sb->st_mtim = shmfd->shm_mtime;
	sb->st_birthtim = shmfd->shm_birthtime;
	sb->st_mode = S_IFREG | shmfd->shm_mode;	/* XXX */
	sb->st_uid = shmfd->shm_uid;
	sb->st_gid = shmfd->shm_gid;
	mtx_unlock(&shm_timestamp_lock);
	sb->st_dev = shm_dev_ino;
	sb->st_ino = shmfd->shm_ino;

	return (0);
}

static int
shm_close(struct file *fp, struct thread *td)
{
	struct shmfd *shmfd;

	shmfd = fp->f_data;
	fp->f_data = NULL;
	shm_drop(shmfd);

	return (0);
}

static int
shm_dotruncate(struct shmfd *shmfd, off_t length)
{
	vm_object_t object;
	vm_page_t m;
	vm_pindex_t idx, nobjsize;
	vm_ooffset_t delta;
	int base, rv;

	object = shmfd->shm_object;
	VM_OBJECT_WLOCK(object);
	if (length == shmfd->shm_size) {
		VM_OBJECT_WUNLOCK(object);
		return (0);
	}
	nobjsize = OFF_TO_IDX(length + PAGE_MASK);

	/* Are we shrinking?  If so, trim the end. */
	if (length < shmfd->shm_size) {
		/*
		 * Disallow any requests to shrink the size if this
		 * object is mapped into the kernel.
		 */
		if (shmfd->shm_kmappings > 0) {
			VM_OBJECT_WUNLOCK(object);
			return (EBUSY);
		}

		/*
		 * Zero the truncated part of the last page.
		 */
		base = length & PAGE_MASK;
		if (base != 0) {
			idx = OFF_TO_IDX(length);
retry:
			m = vm_page_lookup(object, idx);
			if (m != NULL) {
				if (vm_page_sleep_if_busy(m, "shmtrc"))
					goto retry;
			} else if (vm_pager_has_page(object, idx, NULL, NULL)) {
				m = vm_page_alloc(object, idx, VM_ALLOC_NORMAL);
				if (m == NULL) {
					VM_OBJECT_WUNLOCK(object);
					VM_WAIT;
					VM_OBJECT_WLOCK(object);
					goto retry;
				} else if (m->valid != VM_PAGE_BITS_ALL)
					rv = vm_pager_get_pages(object, &m, 1,
					    NULL, NULL);
				else
					/* A cached page was reactivated. */
					rv = VM_PAGER_OK;
				vm_page_lock(m);
				if (rv == VM_PAGER_OK) {
					vm_page_deactivate(m);
					vm_page_unlock(m);
					vm_page_xunbusy(m);
				} else {
					vm_page_free(m);
					vm_page_unlock(m);
					VM_OBJECT_WUNLOCK(object);
					return (EIO);
				}
			}
			if (m != NULL) {
				pmap_zero_page_area(m, base, PAGE_SIZE - base);
				KASSERT(m->valid == VM_PAGE_BITS_ALL,
				    ("shm_dotruncate: page %p is invalid", m));
				vm_page_dirty(m);
				vm_pager_page_unswapped(m);
			}
		}
		delta = ptoa(object->size - nobjsize);

		/* Toss in memory pages. */
		if (nobjsize < object->size)
			vm_object_page_remove(object, nobjsize, object->size,
			    0);

		/* Toss pages from swap. */
		if (object->type == OBJT_SWAP)
			swap_pager_freespace(object, nobjsize, delta);

		/* Free the swap accounted for shm */
		swap_release_by_cred(delta, object->cred);
		object->charge -= delta;
	} else {
		/* Attempt to reserve the swap */
		delta = ptoa(nobjsize - object->size);
		if (!swap_reserve_by_cred(delta, object->cred)) {
			VM_OBJECT_WUNLOCK(object);
			return (ENOMEM);
		}
		object->charge += delta;
	}
	shmfd->shm_size = length;
	mtx_lock(&shm_timestamp_lock);
	vfs_timestamp(&shmfd->shm_ctime);
	shmfd->shm_mtime = shmfd->shm_ctime;
	mtx_unlock(&shm_timestamp_lock);
	object->size = nobjsize;
	VM_OBJECT_WUNLOCK(object);
	return (0);
}

/*
 * shmfd object management including creation and reference counting
 * routines.
 */
static struct shmfd *
shm_alloc(struct ucred *ucred, mode_t mode)
{
	struct shmfd *shmfd;
	int ino;

	shmfd = malloc(sizeof(*shmfd), M_SHMFD, M_WAITOK | M_ZERO);
	shmfd->shm_size = 0;
	shmfd->shm_uid = ucred->cr_uid;
	shmfd->shm_gid = ucred->cr_gid;
	shmfd->shm_mode = mode;
	shmfd->shm_object = vm_pager_allocate(OBJT_DEFAULT, NULL,
	    shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred);
	KASSERT(shmfd->shm_object != NULL, ("shm_create: vm_pager_allocate"));
	shmfd->shm_object->pg_color = 0;
	VM_OBJECT_WLOCK(shmfd->shm_object);
	vm_object_clear_flag(shmfd->shm_object, OBJ_ONEMAPPING);
	vm_object_set_flag(shmfd->shm_object, OBJ_COLORED | OBJ_NOSPLIT);
	VM_OBJECT_WUNLOCK(shmfd->shm_object);
	vfs_timestamp(&shmfd->shm_birthtime);
	shmfd->shm_atime = shmfd->shm_mtime = shmfd->shm_ctime =
	    shmfd->shm_birthtime;
	ino = alloc_unr(shm_ino_unr);
	if (ino == -1)
		shmfd->shm_ino = 0;
	else
		shmfd->shm_ino = ino;
	refcount_init(&shmfd->shm_refs, 1);
	mtx_init(&shmfd->shm_mtx, "shmrl", NULL, MTX_DEF);
	rangelock_init(&shmfd->shm_rl);
#ifdef MAC
	mac_posixshm_init(shmfd);
	mac_posixshm_create(ucred, shmfd);
#endif

	return (shmfd);
}

static struct shmfd *
shm_hold(struct shmfd *shmfd)
{

	refcount_acquire(&shmfd->shm_refs);
	return (shmfd);
}

static void
shm_drop(struct shmfd *shmfd)
{

	if (refcount_release(&shmfd->shm_refs)) {
#ifdef MAC
		mac_posixshm_destroy(shmfd);
#endif
		rangelock_destroy(&shmfd->shm_rl);
		mtx_destroy(&shmfd->shm_mtx);
		vm_object_deallocate(shmfd->shm_object);
		if (shmfd->shm_ino != 0)
			free_unr(shm_ino_unr, shmfd->shm_ino);
		free(shmfd, M_SHMFD);
	}
}

/*
 * Determine if the credentials have sufficient permissions for a
 * specified combination of FREAD and FWRITE.
 */
static int
shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags)
{
	accmode_t accmode;
	int error;

	accmode = 0;
	if (flags & FREAD)
		accmode |= VREAD;
	if (flags & FWRITE)
		accmode |= VWRITE;
	mtx_lock(&shm_timestamp_lock);
	error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid,
	    accmode, ucred, NULL);
	mtx_unlock(&shm_timestamp_lock);
	return (error);
}

/*
 * Dictionary management.  We maintain an in-kernel dictionary to map
 * paths to shmfd objects.  We use the FNV hash on the path to store
 * the mappings in a hash table.
 */
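/*
 * Locking note: shm_lookup(), shm_insert() and shm_remove() below are
 * called with shm_dict_lock held exclusively by kern_shm_open() and
 * sys_shm_unlink(); shm_fill_kinfo() acquires it shared to read the path.
 */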
static void
shm_init(void *arg)
{

	mtx_init(&shm_timestamp_lock, "shm timestamps", NULL, MTX_DEF);
	sx_init(&shm_dict_lock, "shm dictionary");
	shm_dictionary = hashinit(1024, M_SHMFD, &shm_hash);
	shm_ino_unr = new_unrhdr(1, INT32_MAX, NULL);
	KASSERT(shm_ino_unr != NULL, ("shm fake inodes not initialized"));
	shm_dev_ino = devfs_alloc_cdp_inode();
	KASSERT(shm_dev_ino > 0, ("shm dev inode not initialized"));
}
SYSINIT(shm_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_init, NULL);

static struct shmfd *
shm_lookup(char *path, Fnv32_t fnv)
{
	struct shm_mapping *map;

	LIST_FOREACH(map, SHM_HASH(fnv), sm_link) {
		if (map->sm_fnv != fnv)
			continue;
		if (strcmp(map->sm_path, path) == 0)
			return (map->sm_shmfd);
	}

	return (NULL);
}

static void
shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd)
{
	struct shm_mapping *map;

	map = malloc(sizeof(struct shm_mapping), M_SHMFD, M_WAITOK);
	map->sm_path = path;
	map->sm_fnv = fnv;
	map->sm_shmfd = shm_hold(shmfd);
	shmfd->shm_path = path;
	LIST_INSERT_HEAD(SHM_HASH(fnv), map, sm_link);
}

static int
shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred)
{
	struct shm_mapping *map;
	int error;

	LIST_FOREACH(map, SHM_HASH(fnv), sm_link) {
		if (map->sm_fnv != fnv)
			continue;
		if (strcmp(map->sm_path, path) == 0) {
#ifdef MAC
			error = mac_posixshm_check_unlink(ucred, map->sm_shmfd);
			if (error)
				return (error);
#endif
			error = shm_access(map->sm_shmfd, ucred,
			    FREAD | FWRITE);
			if (error)
				return (error);
			map->sm_shmfd->shm_path = NULL;
			LIST_REMOVE(map, sm_link);
			shm_drop(map->sm_shmfd);
			free(map->sm_path, M_SHMFD);
			free(map, M_SHMFD);
			return (0);
		}
	}

	return (ENOENT);
}

int
kern_shm_open(struct thread *td, const char *userpath, int flags, mode_t mode,
    struct filecaps *fcaps)
{
	struct filedesc *fdp;
	struct shmfd *shmfd;
	struct file *fp;
	char *path;
	Fnv32_t fnv;
	mode_t cmode;
	int fd, error;

#ifdef CAPABILITY_MODE
	/*
	 * shm_open(2) is only allowed for anonymous objects.
	 */
	if (IN_CAPABILITY_MODE(td) && (userpath != SHM_ANON))
		return (ECAPMODE);
#endif

	if ((flags & O_ACCMODE) != O_RDONLY && (flags & O_ACCMODE) != O_RDWR)
		return (EINVAL);

	if ((flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC | O_CLOEXEC)) != 0)
		return (EINVAL);

	fdp = td->td_proc->p_fd;
	cmode = (mode & ~fdp->fd_cmask) & ACCESSPERMS;

	error = falloc_caps(td, &fp, &fd, O_CLOEXEC, fcaps);
	if (error)
		return (error);

	/* A SHM_ANON path pointer creates an anonymous object. */
	if (userpath == SHM_ANON) {
		/* A read-only anonymous object is pointless. */
		if ((flags & O_ACCMODE) == O_RDONLY) {
			fdclose(td, fp, fd);
			fdrop(fp, td);
			return (EINVAL);
		}
		shmfd = shm_alloc(td->td_ucred, cmode);
	} else {
		path = malloc(MAXPATHLEN, M_SHMFD, M_WAITOK);
		error = copyinstr(userpath, path, MAXPATHLEN, NULL);
#ifdef KTRACE
		if (error == 0 && KTRPOINT(curthread, KTR_NAMEI))
			ktrnamei(path);
#endif
		/* Require paths to start with a '/' character. */
		if (error == 0 && path[0] != '/')
			error = EINVAL;
		if (error) {
			fdclose(td, fp, fd);
			fdrop(fp, td);
			free(path, M_SHMFD);
			return (error);
		}

		fnv = fnv_32_str(path, FNV1_32_INIT);
		sx_xlock(&shm_dict_lock);
		shmfd = shm_lookup(path, fnv);
		if (shmfd == NULL) {
			/* Object does not yet exist, create it if requested. */
			if (flags & O_CREAT) {
#ifdef MAC
				error = mac_posixshm_check_create(td->td_ucred,
				    path);
				if (error == 0) {
#endif
					shmfd = shm_alloc(td->td_ucred, cmode);
					shm_insert(path, fnv, shmfd);
#ifdef MAC
				}
#endif
			} else {
				free(path, M_SHMFD);
				error = ENOENT;
			}
		} else {
			/*
			 * Object already exists, obtain a new
			 * reference if requested and permitted.
			 */
			free(path, M_SHMFD);
			if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
				error = EEXIST;
			else {
#ifdef MAC
				error = mac_posixshm_check_open(td->td_ucred,
				    shmfd, FFLAGS(flags & O_ACCMODE));
				if (error == 0)
#endif
				error = shm_access(shmfd, td->td_ucred,
				    FFLAGS(flags & O_ACCMODE));
			}

			/*
			 * Truncate the file back to zero length if
			 * O_TRUNC was specified and the object was
			 * opened with read/write.
			 */
			if (error == 0 &&
			    (flags & (O_ACCMODE | O_TRUNC)) ==
			    (O_RDWR | O_TRUNC)) {
#ifdef MAC
				error = mac_posixshm_check_truncate(
				    td->td_ucred, fp->f_cred, shmfd);
				if (error == 0)
#endif
					shm_dotruncate(shmfd, 0);
			}
			if (error == 0)
				shm_hold(shmfd);
		}
		sx_xunlock(&shm_dict_lock);

		if (error) {
			fdclose(td, fp, fd);
			fdrop(fp, td);
			return (error);
		}
	}

	finit(fp, FFLAGS(flags & O_ACCMODE), DTYPE_SHM, shmfd, &shm_ops);

	td->td_retval[0] = fd;
	fdrop(fp, td);

	return (0);
}

/* System calls. */
int
sys_shm_open(struct thread *td, struct shm_open_args *uap)
{

	return (kern_shm_open(td, uap->path, uap->flags, uap->mode, NULL));
}

int
sys_shm_unlink(struct thread *td, struct shm_unlink_args *uap)
{
	char *path;
	Fnv32_t fnv;
	int error;

	path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
	error = copyinstr(uap->path, path, MAXPATHLEN, NULL);
	if (error) {
		free(path, M_TEMP);
		return (error);
	}
#ifdef KTRACE
	if (KTRPOINT(curthread, KTR_NAMEI))
		ktrnamei(path);
#endif
	fnv = fnv_32_str(path, FNV1_32_INIT);
	sx_xlock(&shm_dict_lock);
	error = shm_remove(path, fnv, td->td_ucred);
	sx_xunlock(&shm_dict_lock);
	free(path, M_TEMP);

	return (error);
}

int
shm_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t cap_maxprot, int flags,
    vm_ooffset_t foff, struct thread *td)
{
	struct shmfd *shmfd;
	vm_prot_t maxprot;
	int error;

	shmfd = fp->f_data;
	maxprot = VM_PROT_NONE;

	/* FREAD should always be set. */
	if ((fp->f_flag & FREAD) != 0)
		maxprot |= VM_PROT_EXECUTE | VM_PROT_READ;
	if ((fp->f_flag & FWRITE) != 0)
		maxprot |= VM_PROT_WRITE;

	/* Don't permit shared writable mappings on read-only descriptors. */
	if ((flags & MAP_SHARED) != 0 &&
	    (maxprot & VM_PROT_WRITE) == 0 &&
	    (prot & VM_PROT_WRITE) != 0)
		return (EACCES);
	maxprot &= cap_maxprot;

#ifdef MAC
	error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, flags);
	if (error != 0)
		return (error);
#endif

	/*
	 * XXXRW: This validation is probably insufficient, and subject to
	 * sign errors.  It should be fixed.
	 */
	if (foff >= shmfd->shm_size ||
	    foff + objsize > round_page(shmfd->shm_size))
		return (EINVAL);

	mtx_lock(&shm_timestamp_lock);
	vfs_timestamp(&shmfd->shm_atime);
	mtx_unlock(&shm_timestamp_lock);
	vm_object_reference(shmfd->shm_object);

	error = vm_mmap_object(map, addr, objsize, prot, maxprot, flags,
	    shmfd->shm_object, foff, FALSE, td);
	if (error != 0)
		vm_object_deallocate(shmfd->shm_object);
	return (error);
}

static int
shm_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
    struct thread *td)
{
	struct shmfd *shmfd;
	int error;

	error = 0;
	shmfd = fp->f_data;
	mtx_lock(&shm_timestamp_lock);
	/*
	 * SUSv4 says that x bits of permission need not be affected.
	 * Be consistent with our shm_open there.
	 */
#ifdef MAC
	error = mac_posixshm_check_setmode(active_cred, shmfd, mode);
	if (error != 0)
		goto out;
#endif
	error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid,
	    shmfd->shm_gid, VADMIN, active_cred, NULL);
	if (error != 0)
		goto out;
	shmfd->shm_mode = mode & ACCESSPERMS;
out:
	mtx_unlock(&shm_timestamp_lock);
	return (error);
}

static int
shm_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
    struct thread *td)
{
	struct shmfd *shmfd;
	int error;

	error = 0;
	shmfd = fp->f_data;
	mtx_lock(&shm_timestamp_lock);
#ifdef MAC
	error = mac_posixshm_check_setowner(active_cred, shmfd, uid, gid);
	if (error != 0)
		goto out;
#endif
	if (uid == (uid_t)-1)
		uid = shmfd->shm_uid;
	if (gid == (gid_t)-1)
		gid = shmfd->shm_gid;
	if (((uid != shmfd->shm_uid && uid != active_cred->cr_uid) ||
	    (gid != shmfd->shm_gid && !groupmember(gid, active_cred))) &&
	    (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN, 0)))
		goto out;
	shmfd->shm_uid = uid;
	shmfd->shm_gid = gid;
out:
	mtx_unlock(&shm_timestamp_lock);
	return (error);
}

/*
 * Helper routines to allow the backing object of a shared memory file
 * descriptor to be mapped in the kernel.
 */
int
shm_map(struct file *fp, size_t size, off_t offset, void **memp)
{
	struct shmfd *shmfd;
	vm_offset_t kva, ofs;
	vm_object_t obj;
	int rv;

	if (fp->f_type != DTYPE_SHM)
		return (EINVAL);
	shmfd = fp->f_data;
	obj = shmfd->shm_object;
	VM_OBJECT_WLOCK(obj);
	/*
	 * XXXRW: This validation is probably insufficient, and subject to
	 * sign errors.  It should be fixed.
	 */
	if (offset >= shmfd->shm_size ||
	    offset + size > round_page(shmfd->shm_size)) {
		VM_OBJECT_WUNLOCK(obj);
		return (EINVAL);
	}

	shmfd->shm_kmappings++;
	vm_object_reference_locked(obj);
	VM_OBJECT_WUNLOCK(obj);

	/* Map the object into the kernel_map and wire it. */
	kva = vm_map_min(kernel_map);
	ofs = offset & PAGE_MASK;
	offset = trunc_page(offset);
	size = round_page(size + ofs);
	rv = vm_map_find(kernel_map, obj, offset, &kva, size, 0,
	    VMFS_OPTIMAL_SPACE, VM_PROT_READ | VM_PROT_WRITE,
	    VM_PROT_READ | VM_PROT_WRITE, 0);
	if (rv == KERN_SUCCESS) {
		rv = vm_map_wire(kernel_map, kva, kva + size,
		    VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
		if (rv == KERN_SUCCESS) {
			*memp = (void *)(kva + ofs);
			return (0);
		}
		vm_map_remove(kernel_map, kva, kva + size);
	} else
		vm_object_deallocate(obj);

	/* On failure, drop our mapping reference. */
	VM_OBJECT_WLOCK(obj);
	shmfd->shm_kmappings--;
	VM_OBJECT_WUNLOCK(obj);

	return (vm_mmap_to_errno(rv));
}

/*
 * We require the caller to unmap the entire entry.  This allows us to
 * safely decrement shm_kmappings when a mapping is removed.
 */
int
shm_unmap(struct file *fp, void *mem, size_t size)
{
	struct shmfd *shmfd;
	vm_map_entry_t entry;
	vm_offset_t kva, ofs;
	vm_object_t obj;
	vm_pindex_t pindex;
	vm_prot_t prot;
	boolean_t wired;
	vm_map_t map;
	int rv;

	if (fp->f_type != DTYPE_SHM)
		return (EINVAL);
	shmfd = fp->f_data;
	kva = (vm_offset_t)mem;
	ofs = kva & PAGE_MASK;
	kva = trunc_page(kva);
	size = round_page(size + ofs);
	map = kernel_map;
	rv = vm_map_lookup(&map, kva, VM_PROT_READ | VM_PROT_WRITE, &entry,
	    &obj, &pindex, &prot, &wired);
	if (rv != KERN_SUCCESS)
		return (EINVAL);
	if (entry->start != kva || entry->end != kva + size) {
		vm_map_lookup_done(map, entry);
		return (EINVAL);
	}
	vm_map_lookup_done(map, entry);
	if (obj != shmfd->shm_object)
		return (EINVAL);
	vm_map_remove(map, kva, kva + size);
	VM_OBJECT_WLOCK(obj);
	KASSERT(shmfd->shm_kmappings > 0, ("shm_unmap: object not mapped"));
	shmfd->shm_kmappings--;
	VM_OBJECT_WUNLOCK(obj);
	return (0);
}
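/*
 * Illustrative sketch (assumed in-kernel consumer, not part of this file):
 * a subsystem holding a DTYPE_SHM struct file can wire part of the backing
 * object into the kernel with shm_map() and release it later:
 *
 *	void *mem;
 *
 *	error = shm_map(fp, len, off, &mem);
 *	if (error == 0) {
 *		... access mem ...
 *		error = shm_unmap(fp, mem, len);
 *	}
 *
 * shm_unmap() must be given exactly the address and size returned by the
 * matching shm_map() call, since the entire kernel map entry is removed.
 */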
static int
shm_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
{
	struct shmfd *shmfd;

	kif->kf_type = KF_TYPE_SHM;
	shmfd = fp->f_data;

	mtx_lock(&shm_timestamp_lock);
	kif->kf_un.kf_file.kf_file_mode = S_IFREG | shmfd->shm_mode;	/* XXX */
	mtx_unlock(&shm_timestamp_lock);
	kif->kf_un.kf_file.kf_file_size = shmfd->shm_size;
	if (shmfd->shm_path != NULL) {
		sx_slock(&shm_dict_lock);
		if (shmfd->shm_path != NULL)
			strlcpy(kif->kf_path, shmfd->shm_path,
			    sizeof(kif->kf_path));
		sx_sunlock(&shm_dict_lock);
	}
	return (0);
}