/*-
 * Copyright (c) 2006, 2011, 2016-2017 Robert N. M. Watson
 * All rights reserved.
 *
 * Portions of this software were developed by BAE Systems, the University of
 * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL
 * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent
 * Computing (TC) research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Support for shared swap-backed anonymous memory objects via
 * shm_open(2) and shm_unlink(2).  While most of the implementation is
 * here, vm_mmap.c contains mapping logic changes.
 *
 * TODO:
 *
 * (1) Need to export data to a userland tool via a sysctl.  Should ipcs(1)
 *     and ipcrm(1) be expanded or should new tools to manage both POSIX
 *     kernel semaphores and POSIX shared memory be written?
 *
 * (2) Add support for this file type to fstat(1).
 *
 * (3) Resource limits?  Does this need its own resource limits or are the
 *     existing limits in mmap(2) sufficient?
 */
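/*
 * For reference, a typical userland use of this facility (a sketch; the
 * name "/myshm" and the 4096-byte size are illustrative, and error
 * handling is omitted):
 *
 *	int fd = shm_open("/myshm", O_RDWR | O_CREAT, 0600);
 *	ftruncate(fd, 4096);
 *	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED,
 *	    fd, 0);
 *	p[0] = 1;
 *	munmap(p, 4096);
 *	close(fd);
 *	shm_unlink("/myshm");
 */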
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_capsicum.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/capsicum.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/kernel.h>
#include <sys/uio.h>
#include <sys/signal.h>
#include <sys/jail.h>
#include <sys/ktrace.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/refcount.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/stat.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <sys/sx.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/unistd.h>
#include <sys/user.h>

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_extern.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>

struct shm_mapping {
	char		*sm_path;
	Fnv32_t		sm_fnv;
	struct shmfd	*sm_shmfd;
	LIST_ENTRY(shm_mapping) sm_link;
};

static MALLOC_DEFINE(M_SHMFD, "shmfd", "shared memory file descriptor");
static LIST_HEAD(, shm_mapping) *shm_dictionary;
static struct sx shm_dict_lock;
static struct mtx shm_timestamp_lock;
static u_long shm_hash;
static struct unrhdr *shm_ino_unr;
static dev_t shm_dev_ino;

#define	SHM_HASH(fnv)	(&shm_dictionary[(fnv) & shm_hash])

static void	shm_init(void *arg);
static void	shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd);
static struct shmfd *shm_lookup(char *path, Fnv32_t fnv);
static int	shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred);

static fo_rdwr_t	shm_read;
static fo_rdwr_t	shm_write;
static fo_truncate_t	shm_truncate;
static fo_stat_t	shm_stat;
static fo_close_t	shm_close;
static fo_chmod_t	shm_chmod;
static fo_chown_t	shm_chown;
static fo_seek_t	shm_seek;
static fo_fill_kinfo_t	shm_fill_kinfo;
static fo_mmap_t	shm_mmap;

/* File descriptor operations. */
struct fileops shm_ops = {
	.fo_read = shm_read,
	.fo_write = shm_write,
	.fo_truncate = shm_truncate,
	.fo_ioctl = invfo_ioctl,
	.fo_poll = invfo_poll,
	.fo_kqfilter = invfo_kqfilter,
	.fo_stat = shm_stat,
	.fo_close = shm_close,
	.fo_chmod = shm_chmod,
	.fo_chown = shm_chown,
	.fo_sendfile = vn_sendfile,
	.fo_seek = shm_seek,
	.fo_fill_kinfo = shm_fill_kinfo,
	.fo_mmap = shm_mmap,
	.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
};

FEATURE(posix_shm, "POSIX shared memory");
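/*
 * With shm_ops installed on the descriptor, the ordinary descriptor
 * system calls dispatch to the handlers below; a sketch of the
 * correspondence:
 *
 *	read(2)/write(2)	-> shm_read()/shm_write()
 *	ftruncate(2)		-> shm_truncate()
 *	fstat(2)		-> shm_stat()
 *	lseek(2)		-> shm_seek()
 *	mmap(2)			-> shm_mmap()
 *	fchmod(2)/fchown(2)	-> shm_chmod()/shm_chown()
 */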
static int
uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio)
{
	vm_page_t m;
	vm_pindex_t idx;
	size_t tlen;
	int error, offset, rv;

	idx = OFF_TO_IDX(uio->uio_offset);
	offset = uio->uio_offset & PAGE_MASK;
	tlen = MIN(PAGE_SIZE - offset, len);

	VM_OBJECT_WLOCK(obj);

	/*
	 * Read I/O without either a corresponding resident page or swap
	 * page: use zero_region.  This is intended to avoid instantiating
	 * pages on read from a sparse region.
	 */
	if (uio->uio_rw == UIO_READ && vm_page_lookup(obj, idx) == NULL &&
	    !vm_pager_has_page(obj, idx, NULL, NULL)) {
		VM_OBJECT_WUNLOCK(obj);
		return (uiomove(__DECONST(void *, zero_region), tlen, uio));
	}

	/*
	 * Parallel reads of the page content from disk are prevented
	 * by exclusive busy.
	 *
	 * Although the tmpfs vnode lock is held here, it is
	 * nonetheless safe to sleep waiting for a free page.  The
	 * pageout daemon does not need to acquire the tmpfs vnode
	 * lock to page out the object's pages because the object is
	 * an OBJT_SWAP type object.
	 */
	m = vm_page_grab(obj, idx, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY);
	if (m->valid != VM_PAGE_BITS_ALL) {
		vm_page_xbusy(m);
		if (vm_pager_has_page(obj, idx, NULL, NULL)) {
			rv = vm_pager_get_pages(obj, &m, 1, NULL, NULL);
			if (rv != VM_PAGER_OK) {
				printf(
	    "uiomove_object: vm_obj %p idx %jd valid %x pager error %d\n",
				    obj, idx, m->valid, rv);
				vm_page_lock(m);
				vm_page_free(m);
				vm_page_unlock(m);
				VM_OBJECT_WUNLOCK(obj);
				return (EIO);
			}
		} else
			vm_page_zero_invalid(m, TRUE);
		vm_page_xunbusy(m);
	}
	vm_page_lock(m);
	vm_page_hold(m);
	if (vm_page_active(m))
		vm_page_reference(m);
	else
		vm_page_activate(m);
	vm_page_unlock(m);
	VM_OBJECT_WUNLOCK(obj);
	error = uiomove_fromphys(&m, offset, tlen, uio);
	if (uio->uio_rw == UIO_WRITE && error == 0) {
		VM_OBJECT_WLOCK(obj);
		vm_page_dirty(m);
		vm_pager_page_unswapped(m);
		VM_OBJECT_WUNLOCK(obj);
	}
	vm_page_lock(m);
	vm_page_unhold(m);
	vm_page_unlock(m);

	return (error);
}

int
uiomove_object(vm_object_t obj, off_t obj_size, struct uio *uio)
{
	ssize_t resid;
	size_t len;
	int error;

	error = 0;
	while ((resid = uio->uio_resid) > 0) {
		if (obj_size <= uio->uio_offset)
			break;
		len = MIN(obj_size - uio->uio_offset, resid);
		if (len == 0)
			break;
		error = uiomove_object_page(obj, len, uio);
		if (error != 0 || resid == uio->uio_resid)
			break;
	}
	return (error);
}
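/*
 * uiomove_object() transfers at most one page per loop iteration, so a
 * caller simply hands it the backing object, the current object size,
 * and the uio describing the request; shm_read() and shm_write() below
 * do exactly this (a sketch):
 *
 *	error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio);
 *
 * The same helper is shared with the tmpfs read/write paths, which is
 * why the comment above mentions the tmpfs vnode lock.
 */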
static int
shm_seek(struct file *fp, off_t offset, int whence, struct thread *td)
{
	struct shmfd *shmfd;
	off_t foffset;
	int error;

	shmfd = fp->f_data;
	foffset = foffset_lock(fp, 0);
	error = 0;
	switch (whence) {
	case L_INCR:
		if (foffset < 0 ||
		    (offset > 0 && foffset > OFF_MAX - offset)) {
			error = EOVERFLOW;
			break;
		}
		offset += foffset;
		break;
	case L_XTND:
		if (offset > 0 && shmfd->shm_size > OFF_MAX - offset) {
			error = EOVERFLOW;
			break;
		}
		offset += shmfd->shm_size;
		break;
	case L_SET:
		break;
	default:
		error = EINVAL;
	}
	if (error == 0) {
		if (offset < 0 || offset > shmfd->shm_size)
			error = EINVAL;
		else
			td->td_uretoff.tdu_off = offset;
	}
	foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0);
	return (error);
}

static int
shm_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	struct shmfd *shmfd;
	void *rl_cookie;
	int error;

	shmfd = fp->f_data;
#ifdef MAC
	error = mac_posixshm_check_read(active_cred, fp->f_cred, shmfd);
	if (error)
		return (error);
#endif
	foffset_lock_uio(fp, uio, flags);
	rl_cookie = rangelock_rlock(&shmfd->shm_rl, uio->uio_offset,
	    uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx);
	error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio);
	rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
	foffset_unlock_uio(fp, uio, flags);
	return (error);
}

static int
shm_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	struct shmfd *shmfd;
	void *rl_cookie;
	int error;

	shmfd = fp->f_data;
#ifdef MAC
	error = mac_posixshm_check_write(active_cred, fp->f_cred, shmfd);
	if (error)
		return (error);
#endif
	foffset_lock_uio(fp, uio, flags);
	if ((flags & FOF_OFFSET) == 0) {
		rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX,
		    &shmfd->shm_mtx);
	} else {
		rl_cookie = rangelock_wlock(&shmfd->shm_rl, uio->uio_offset,
		    uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx);
	}

	error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio);
	rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
	foffset_unlock_uio(fp, uio, flags);
	return (error);
}

static int
shm_truncate(struct file *fp, off_t length, struct ucred *active_cred,
    struct thread *td)
{
	struct shmfd *shmfd;
#ifdef MAC
	int error;
#endif

	shmfd = fp->f_data;
#ifdef MAC
	error = mac_posixshm_check_truncate(active_cred, fp->f_cred, shmfd);
	if (error)
		return (error);
#endif
	return (shm_dotruncate(shmfd, length));
}

static int
shm_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
    struct thread *td)
{
	struct shmfd *shmfd;
#ifdef MAC
	int error;
#endif

	shmfd = fp->f_data;

#ifdef MAC
	error = mac_posixshm_check_stat(active_cred, fp->f_cred, shmfd);
	if (error)
		return (error);
#endif

	/*
	 * Attempt to return sane-ish values for fstat() on a memory file
	 * descriptor.
	 */
	bzero(sb, sizeof(*sb));
	sb->st_blksize = PAGE_SIZE;
	sb->st_size = shmfd->shm_size;
	sb->st_blocks = howmany(sb->st_size, sb->st_blksize);
	mtx_lock(&shm_timestamp_lock);
	sb->st_atim = shmfd->shm_atime;
	sb->st_ctim = shmfd->shm_ctime;
	sb->st_mtim = shmfd->shm_mtime;
	sb->st_birthtim = shmfd->shm_birthtime;
	sb->st_mode = S_IFREG | shmfd->shm_mode;	/* XXX */
	sb->st_uid = shmfd->shm_uid;
	sb->st_gid = shmfd->shm_gid;
	mtx_unlock(&shm_timestamp_lock);
	sb->st_dev = shm_dev_ino;
	sb->st_ino = shmfd->shm_ino;

	return (0);
}

static int
shm_close(struct file *fp, struct thread *td)
{
	struct shmfd *shmfd;

	shmfd = fp->f_data;
	fp->f_data = NULL;
	shm_drop(shmfd);

	return (0);
}
int
shm_dotruncate(struct shmfd *shmfd, off_t length)
{
	vm_object_t object;
	vm_page_t m;
	vm_pindex_t idx, nobjsize;
	vm_ooffset_t delta;
	int base, rv;

	KASSERT(length >= 0, ("shm_dotruncate: length < 0"));
	object = shmfd->shm_object;
	VM_OBJECT_WLOCK(object);
	if (length == shmfd->shm_size) {
		VM_OBJECT_WUNLOCK(object);
		return (0);
	}
	nobjsize = OFF_TO_IDX(length + PAGE_MASK);

	/* Are we shrinking?  If so, trim the end. */
	if (length < shmfd->shm_size) {
		/*
		 * Disallow any requests to shrink the size if this
		 * object is mapped into the kernel.
		 */
		if (shmfd->shm_kmappings > 0) {
			VM_OBJECT_WUNLOCK(object);
			return (EBUSY);
		}

		/*
		 * Zero the truncated part of the last page.
		 */
		base = length & PAGE_MASK;
		if (base != 0) {
			idx = OFF_TO_IDX(length);
retry:
			m = vm_page_lookup(object, idx);
			if (m != NULL) {
				if (vm_page_sleep_if_busy(m, "shmtrc"))
					goto retry;
			} else if (vm_pager_has_page(object, idx, NULL, NULL)) {
				m = vm_page_alloc(object, idx,
				    VM_ALLOC_NORMAL | VM_ALLOC_WAITFAIL);
				if (m == NULL)
					goto retry;
				rv = vm_pager_get_pages(object, &m, 1, NULL,
				    NULL);
				vm_page_lock(m);
				if (rv == VM_PAGER_OK) {
					/*
					 * Since the page was not resident,
					 * and therefore not recently
					 * accessed, immediately enqueue it
					 * for asynchronous laundering.  The
					 * current operation is not regarded
					 * as an access.
					 */
					vm_page_launder(m);
					vm_page_unlock(m);
					vm_page_xunbusy(m);
				} else {
					vm_page_free(m);
					vm_page_unlock(m);
					VM_OBJECT_WUNLOCK(object);
					return (EIO);
				}
			}
			if (m != NULL) {
				pmap_zero_page_area(m, base, PAGE_SIZE - base);
				KASSERT(m->valid == VM_PAGE_BITS_ALL,
				    ("shm_dotruncate: page %p is invalid", m));
				vm_page_dirty(m);
				vm_pager_page_unswapped(m);
			}
		}
		delta = IDX_TO_OFF(object->size - nobjsize);

		/* Toss pages from memory. */
		if (nobjsize < object->size)
			vm_object_page_remove(object, nobjsize, object->size,
			    0);

		/* Toss pages from swap. */
		if (object->type == OBJT_SWAP)
			swap_pager_freespace(object, nobjsize, delta);

		/* Release the swap accounted for the shm object. */
		swap_release_by_cred(delta, object->cred);
		object->charge -= delta;
	} else {
		/* Try to reserve additional swap space. */
		delta = IDX_TO_OFF(nobjsize - object->size);
		if (!swap_reserve_by_cred(delta, object->cred)) {
			VM_OBJECT_WUNLOCK(object);
			return (ENOMEM);
		}
		object->charge += delta;
	}
	shmfd->shm_size = length;
	mtx_lock(&shm_timestamp_lock);
	vfs_timestamp(&shmfd->shm_ctime);
	shmfd->shm_mtime = shmfd->shm_ctime;
	mtx_unlock(&shm_timestamp_lock);
	object->size = nobjsize;
	VM_OBJECT_WUNLOCK(object);
	return (0);
}
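/*
 * Worked example of the shrink path above (assuming 4 KB pages):
 * truncating a 3-page (12288-byte) object to length 5000 gives
 * nobjsize = OFF_TO_IDX(5000 + PAGE_MASK) = 2, so bytes 5000-8191 of
 * the last remaining page are zeroed, the third page is removed from
 * memory and swap, and delta = IDX_TO_OFF(3 - 2) = 4096 bytes of swap
 * reservation are released back to the object's credential.
 */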
/*
 * shmfd object management including creation and reference counting
 * routines.
 */
struct shmfd *
shm_alloc(struct ucred *ucred, mode_t mode)
{
	struct shmfd *shmfd;
	int ino;

	shmfd = malloc(sizeof(*shmfd), M_SHMFD, M_WAITOK | M_ZERO);
	shmfd->shm_size = 0;
	shmfd->shm_uid = ucred->cr_uid;
	shmfd->shm_gid = ucred->cr_gid;
	shmfd->shm_mode = mode;
	shmfd->shm_object = vm_pager_allocate(OBJT_DEFAULT, NULL,
	    shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred);
	KASSERT(shmfd->shm_object != NULL, ("shm_alloc: vm_pager_allocate"));
	shmfd->shm_object->pg_color = 0;
	VM_OBJECT_WLOCK(shmfd->shm_object);
	vm_object_clear_flag(shmfd->shm_object, OBJ_ONEMAPPING);
	vm_object_set_flag(shmfd->shm_object, OBJ_COLORED | OBJ_NOSPLIT);
	VM_OBJECT_WUNLOCK(shmfd->shm_object);
	vfs_timestamp(&shmfd->shm_birthtime);
	shmfd->shm_atime = shmfd->shm_mtime = shmfd->shm_ctime =
	    shmfd->shm_birthtime;
	ino = alloc_unr(shm_ino_unr);
	if (ino == -1)
		shmfd->shm_ino = 0;
	else
		shmfd->shm_ino = ino;
	refcount_init(&shmfd->shm_refs, 1);
	mtx_init(&shmfd->shm_mtx, "shmrl", NULL, MTX_DEF);
	rangelock_init(&shmfd->shm_rl);
#ifdef MAC
	mac_posixshm_init(shmfd);
	mac_posixshm_create(ucred, shmfd);
#endif

	return (shmfd);
}

struct shmfd *
shm_hold(struct shmfd *shmfd)
{

	refcount_acquire(&shmfd->shm_refs);
	return (shmfd);
}

void
shm_drop(struct shmfd *shmfd)
{

	if (refcount_release(&shmfd->shm_refs)) {
#ifdef MAC
		mac_posixshm_destroy(shmfd);
#endif
		rangelock_destroy(&shmfd->shm_rl);
		mtx_destroy(&shmfd->shm_mtx);
		vm_object_deallocate(shmfd->shm_object);
		if (shmfd->shm_ino != 0)
			free_unr(shm_ino_unr, shmfd->shm_ino);
		free(shmfd, M_SHMFD);
	}
}

/*
 * Determine if the credentials have sufficient permissions for a
 * specified combination of FREAD and FWRITE.
 */
int
shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags)
{
	accmode_t accmode;
	int error;

	accmode = 0;
	if (flags & FREAD)
		accmode |= VREAD;
	if (flags & FWRITE)
		accmode |= VWRITE;
	mtx_lock(&shm_timestamp_lock);
	error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid,
	    accmode, ucred, NULL);
	mtx_unlock(&shm_timestamp_lock);
	return (error);
}

/*
 * Dictionary management.  We maintain an in-kernel dictionary to map
 * paths to shmfd objects.  We use the FNV hash on the path to store
 * the mappings in a hash table.
 */
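/*
 * Lookup and insertion both hash the path once and then scan only the
 * matching bucket; a sketch of the open path below, for an illustrative
 * name "/myshm":
 *
 *	fnv = fnv_32_str("/myshm", FNV1_32_INIT);
 *	sx_xlock(&shm_dict_lock);
 *	shmfd = shm_lookup("/myshm", fnv);	(scans SHM_HASH(fnv) only)
 *	...
 *	sx_xunlock(&shm_dict_lock);
 */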
static void
shm_init(void *arg)
{

	mtx_init(&shm_timestamp_lock, "shm timestamps", NULL, MTX_DEF);
	sx_init(&shm_dict_lock, "shm dictionary");
	shm_dictionary = hashinit(1024, M_SHMFD, &shm_hash);
	shm_ino_unr = new_unrhdr(1, INT32_MAX, NULL);
	KASSERT(shm_ino_unr != NULL, ("shm fake inodes not initialized"));
	shm_dev_ino = devfs_alloc_cdp_inode();
	KASSERT(shm_dev_ino > 0, ("shm dev inode not initialized"));
}
SYSINIT(shm_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_init, NULL);

static struct shmfd *
shm_lookup(char *path, Fnv32_t fnv)
{
	struct shm_mapping *map;

	LIST_FOREACH(map, SHM_HASH(fnv), sm_link) {
		if (map->sm_fnv != fnv)
			continue;
		if (strcmp(map->sm_path, path) == 0)
			return (map->sm_shmfd);
	}

	return (NULL);
}

static void
shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd)
{
	struct shm_mapping *map;

	map = malloc(sizeof(struct shm_mapping), M_SHMFD, M_WAITOK);
	map->sm_path = path;
	map->sm_fnv = fnv;
	map->sm_shmfd = shm_hold(shmfd);
	shmfd->shm_path = path;
	LIST_INSERT_HEAD(SHM_HASH(fnv), map, sm_link);
}

static int
shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred)
{
	struct shm_mapping *map;
	int error;

	LIST_FOREACH(map, SHM_HASH(fnv), sm_link) {
		if (map->sm_fnv != fnv)
			continue;
		if (strcmp(map->sm_path, path) == 0) {
#ifdef MAC
			error = mac_posixshm_check_unlink(ucred, map->sm_shmfd);
			if (error)
				return (error);
#endif
			error = shm_access(map->sm_shmfd, ucred,
			    FREAD | FWRITE);
			if (error)
				return (error);
			map->sm_shmfd->shm_path = NULL;
			LIST_REMOVE(map, sm_link);
			shm_drop(map->sm_shmfd);
			free(map->sm_path, M_SHMFD);
			free(map, M_SHMFD);
			return (0);
		}
	}

	return (ENOENT);
}
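/*
 * Reference ownership, for the routines above: shm_insert() takes a
 * reference on behalf of the dictionary entry (via shm_hold()), and each
 * open file description holds its own; shm_remove() drops only the
 * dictionary's reference, so an unlinked-but-open segment persists until
 * its last descriptor is closed, matching shm_unlink(2) semantics.
 */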
int
kern_shm_open(struct thread *td, const char *userpath, int flags, mode_t mode,
    struct filecaps *fcaps)
{
	struct filedesc *fdp;
	struct shmfd *shmfd;
	struct file *fp;
	char *path;
	const char *pr_path;
	size_t pr_pathlen;
	Fnv32_t fnv;
	mode_t cmode;
	int fd, error;

#ifdef CAPABILITY_MODE
	/*
	 * shm_open(2) is only allowed for anonymous objects.
	 */
	if (IN_CAPABILITY_MODE(td) && (userpath != SHM_ANON))
		return (ECAPMODE);
#endif

	AUDIT_ARG_FFLAGS(flags);
	AUDIT_ARG_MODE(mode);

	if ((flags & O_ACCMODE) != O_RDONLY && (flags & O_ACCMODE) != O_RDWR)
		return (EINVAL);

	if ((flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC |
	    O_CLOEXEC)) != 0)
		return (EINVAL);

	fdp = td->td_proc->p_fd;
	cmode = (mode & ~fdp->fd_cmask) & ACCESSPERMS;

	error = falloc_caps(td, &fp, &fd, O_CLOEXEC, fcaps);
	if (error)
		return (error);

	/* A SHM_ANON path pointer creates an anonymous object. */
	if (userpath == SHM_ANON) {
		/* A read-only anonymous object is pointless. */
		if ((flags & O_ACCMODE) == O_RDONLY) {
			fdclose(td, fp, fd);
			fdrop(fp, td);
			return (EINVAL);
		}
		shmfd = shm_alloc(td->td_ucred, cmode);
	} else {
		path = malloc(MAXPATHLEN, M_SHMFD, M_WAITOK);
		pr_path = td->td_ucred->cr_prison->pr_path;

		/*
		 * Construct a full pathname for jailed callers; e.g. in a
		 * jail rooted at "/jails/a", a user path "/x" is stored
		 * as "/jails/a/x".
		 */
		pr_pathlen = strcmp(pr_path, "/") == 0 ? 0
		    : strlcpy(path, pr_path, MAXPATHLEN);
		error = copyinstr(userpath, path + pr_pathlen,
		    MAXPATHLEN - pr_pathlen, NULL);
#ifdef KTRACE
		if (error == 0 && KTRPOINT(curthread, KTR_NAMEI))
			ktrnamei(path);
#endif
		/* Require paths to start with a '/' character. */
		if (error == 0 && path[pr_pathlen] != '/')
			error = EINVAL;
		if (error) {
			fdclose(td, fp, fd);
			fdrop(fp, td);
			free(path, M_SHMFD);
			return (error);
		}

		AUDIT_ARG_UPATH1_CANON(path);
		fnv = fnv_32_str(path, FNV1_32_INIT);
		sx_xlock(&shm_dict_lock);
		shmfd = shm_lookup(path, fnv);
		if (shmfd == NULL) {
			/* Object does not yet exist, create it if requested. */
			if (flags & O_CREAT) {
#ifdef MAC
				error = mac_posixshm_check_create(td->td_ucred,
				    path);
				if (error == 0) {
#endif
					shmfd = shm_alloc(td->td_ucred, cmode);
					shm_insert(path, fnv, shmfd);
#ifdef MAC
				}
#endif
			} else {
				free(path, M_SHMFD);
				error = ENOENT;
			}
		} else {
			/*
			 * Object already exists, obtain a new
			 * reference if requested and permitted.
			 */
			free(path, M_SHMFD);
			if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
				error = EEXIST;
			else {
#ifdef MAC
				error = mac_posixshm_check_open(td->td_ucred,
				    shmfd, FFLAGS(flags & O_ACCMODE));
				if (error == 0)
#endif
				error = shm_access(shmfd, td->td_ucred,
				    FFLAGS(flags & O_ACCMODE));
			}

			/*
			 * Truncate the file back to zero length if
			 * O_TRUNC was specified and the object was
			 * opened with read/write.
			 */
			if (error == 0 &&
			    (flags & (O_ACCMODE | O_TRUNC)) ==
			    (O_RDWR | O_TRUNC)) {
#ifdef MAC
				error = mac_posixshm_check_truncate(
				    td->td_ucred, fp->f_cred, shmfd);
				if (error == 0)
#endif
					shm_dotruncate(shmfd, 0);
			}
			if (error == 0)
				shm_hold(shmfd);
		}
		sx_xunlock(&shm_dict_lock);

		if (error) {
			fdclose(td, fp, fd);
			fdrop(fp, td);
			return (error);
		}
	}

	finit(fp, FFLAGS(flags & O_ACCMODE), DTYPE_SHM, shmfd, &shm_ops);

	td->td_retval[0] = fd;
	fdrop(fp, td);

	return (0);
}

/* System calls. */
int
sys_shm_open(struct thread *td, struct shm_open_args *uap)
{

	return (kern_shm_open(td, uap->path, uap->flags, uap->mode, NULL));
}

int
sys_shm_unlink(struct thread *td, struct shm_unlink_args *uap)
{
	char *path;
	const char *pr_path;
	size_t pr_pathlen;
	Fnv32_t fnv;
	int error;

	path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
	pr_path = td->td_ucred->cr_prison->pr_path;
	pr_pathlen = strcmp(pr_path, "/") == 0 ? 0
	    : strlcpy(path, pr_path, MAXPATHLEN);
	error = copyinstr(uap->path, path + pr_pathlen,
	    MAXPATHLEN - pr_pathlen, NULL);
	if (error) {
		free(path, M_TEMP);
		return (error);
	}
#ifdef KTRACE
	if (KTRPOINT(curthread, KTR_NAMEI))
		ktrnamei(path);
#endif
	AUDIT_ARG_UPATH1_CANON(path);
	fnv = fnv_32_str(path, FNV1_32_INIT);
	sx_xlock(&shm_dict_lock);
	error = shm_remove(path, fnv, td->td_ucred);
	sx_xunlock(&shm_dict_lock);
	free(path, M_TEMP);

	return (error);
}
int
shm_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t cap_maxprot, int flags,
    vm_ooffset_t foff, struct thread *td)
{
	struct shmfd *shmfd;
	vm_prot_t maxprot;
	int error;

	shmfd = fp->f_data;
	maxprot = VM_PROT_NONE;

	/* FREAD should always be set. */
	if ((fp->f_flag & FREAD) != 0)
		maxprot |= VM_PROT_EXECUTE | VM_PROT_READ;
	if ((fp->f_flag & FWRITE) != 0)
		maxprot |= VM_PROT_WRITE;

	/* Don't permit shared writable mappings on read-only descriptors. */
	if ((flags & MAP_SHARED) != 0 &&
	    (maxprot & VM_PROT_WRITE) == 0 &&
	    (prot & VM_PROT_WRITE) != 0)
		return (EACCES);
	maxprot &= cap_maxprot;

	/* See comment in vn_mmap(). */
	if (
#ifdef _LP64
	    objsize > OFF_MAX ||
#endif
	    foff < 0 || foff > OFF_MAX - objsize)
		return (EINVAL);

#ifdef MAC
	error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, flags);
	if (error != 0)
		return (error);
#endif

	mtx_lock(&shm_timestamp_lock);
	vfs_timestamp(&shmfd->shm_atime);
	mtx_unlock(&shm_timestamp_lock);
	vm_object_reference(shmfd->shm_object);

	error = vm_mmap_object(map, addr, objsize, prot, maxprot, flags,
	    shmfd->shm_object, foff, FALSE, td);
	if (error != 0)
		vm_object_deallocate(shmfd->shm_object);
	return (error);
}
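/*
 * For example, with a descriptor opened O_RDONLY (so FWRITE is clear),
 * a request for a shared writable mapping is refused up front (a
 * userland sketch; the second call fails with EACCES):
 *
 *	fd = shm_open("/myshm", O_RDONLY, 0);
 *	p = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);	(ok)
 *	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */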
static int
shm_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
    struct thread *td)
{
	struct shmfd *shmfd;
	int error;

	error = 0;
	shmfd = fp->f_data;
	mtx_lock(&shm_timestamp_lock);
	/*
	 * SUSv4 says that x bits of permission need not be affected.
	 * Be consistent with our shm_open there.
	 */
#ifdef MAC
	error = mac_posixshm_check_setmode(active_cred, shmfd, mode);
	if (error != 0)
		goto out;
#endif
	error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid,
	    shmfd->shm_gid, VADMIN, active_cred, NULL);
	if (error != 0)
		goto out;
	shmfd->shm_mode = mode & ACCESSPERMS;
out:
	mtx_unlock(&shm_timestamp_lock);
	return (error);
}

static int
shm_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
    struct thread *td)
{
	struct shmfd *shmfd;
	int error;

	error = 0;
	shmfd = fp->f_data;
	mtx_lock(&shm_timestamp_lock);
#ifdef MAC
	error = mac_posixshm_check_setowner(active_cred, shmfd, uid, gid);
	if (error != 0)
		goto out;
#endif
	if (uid == (uid_t)-1)
		uid = shmfd->shm_uid;
	if (gid == (gid_t)-1)
		gid = shmfd->shm_gid;
	if (((uid != shmfd->shm_uid && uid != active_cred->cr_uid) ||
	    (gid != shmfd->shm_gid && !groupmember(gid, active_cred))) &&
	    (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN, 0)))
		goto out;
	shmfd->shm_uid = uid;
	shmfd->shm_gid = gid;
out:
	mtx_unlock(&shm_timestamp_lock);
	return (error);
}

/*
 * Helper routines to allow the backing object of a shared memory file
 * descriptor to be mapped in the kernel.
 */
int
shm_map(struct file *fp, size_t size, off_t offset, void **memp)
{
	struct shmfd *shmfd;
	vm_offset_t kva, ofs;
	vm_object_t obj;
	int rv;

	if (fp->f_type != DTYPE_SHM)
		return (EINVAL);
	shmfd = fp->f_data;
	obj = shmfd->shm_object;
	VM_OBJECT_WLOCK(obj);
	/*
	 * XXXRW: This validation is probably insufficient, and subject to
	 * sign errors.  It should be fixed.
	 */
	if (offset >= shmfd->shm_size ||
	    offset + size > round_page(shmfd->shm_size)) {
		VM_OBJECT_WUNLOCK(obj);
		return (EINVAL);
	}

	shmfd->shm_kmappings++;
	vm_object_reference_locked(obj);
	VM_OBJECT_WUNLOCK(obj);

	/* Map the object into the kernel_map and wire it. */
	kva = vm_map_min(kernel_map);
	ofs = offset & PAGE_MASK;
	offset = trunc_page(offset);
	size = round_page(size + ofs);
	rv = vm_map_find(kernel_map, obj, offset, &kva, size, 0,
	    VMFS_OPTIMAL_SPACE, VM_PROT_READ | VM_PROT_WRITE,
	    VM_PROT_READ | VM_PROT_WRITE, 0);
	if (rv == KERN_SUCCESS) {
		rv = vm_map_wire(kernel_map, kva, kva + size,
		    VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
		if (rv == KERN_SUCCESS) {
			*memp = (void *)(kva + ofs);
			return (0);
		}
		vm_map_remove(kernel_map, kva, kva + size);
	} else
		vm_object_deallocate(obj);

	/* On failure, drop our mapping reference. */
	VM_OBJECT_WLOCK(obj);
	shmfd->shm_kmappings--;
	VM_OBJECT_WUNLOCK(obj);

	return (vm_mmap_to_errno(rv));
}
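/*
 * A sketch of how an in-kernel consumer might use this pair of helpers
 * (hypothetical caller; "fp" is a held shm descriptor and error
 * handling is abbreviated):
 *
 *	void *mem;
 *
 *	error = shm_map(fp, len, 0, &mem);
 *	if (error == 0) {
 *		... access len bytes at mem ...
 *		error = shm_unmap(fp, mem, len);
 *	}
 */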
/*
 * We require the caller to unmap the entire entry.  This allows us to
 * safely decrement shm_kmappings when a mapping is removed.
 */
int
shm_unmap(struct file *fp, void *mem, size_t size)
{
	struct shmfd *shmfd;
	vm_map_entry_t entry;
	vm_offset_t kva, ofs;
	vm_object_t obj;
	vm_pindex_t pindex;
	vm_prot_t prot;
	boolean_t wired;
	vm_map_t map;
	int rv;

	if (fp->f_type != DTYPE_SHM)
		return (EINVAL);
	shmfd = fp->f_data;
	kva = (vm_offset_t)mem;
	ofs = kva & PAGE_MASK;
	kva = trunc_page(kva);
	size = round_page(size + ofs);
	map = kernel_map;
	rv = vm_map_lookup(&map, kva, VM_PROT_READ | VM_PROT_WRITE, &entry,
	    &obj, &pindex, &prot, &wired);
	if (rv != KERN_SUCCESS)
		return (EINVAL);
	if (entry->start != kva || entry->end != kva + size) {
		vm_map_lookup_done(map, entry);
		return (EINVAL);
	}
	vm_map_lookup_done(map, entry);
	if (obj != shmfd->shm_object)
		return (EINVAL);
	vm_map_remove(map, kva, kva + size);
	VM_OBJECT_WLOCK(obj);
	KASSERT(shmfd->shm_kmappings > 0, ("shm_unmap: object not mapped"));
	shmfd->shm_kmappings--;
	VM_OBJECT_WUNLOCK(obj);
	return (0);
}

static int
shm_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
{
	const char *path, *pr_path;
	struct shmfd *shmfd;
	size_t pr_pathlen;

	kif->kf_type = KF_TYPE_SHM;
	shmfd = fp->f_data;

	mtx_lock(&shm_timestamp_lock);
	kif->kf_un.kf_file.kf_file_mode = S_IFREG | shmfd->shm_mode;	/* XXX */
	mtx_unlock(&shm_timestamp_lock);
	kif->kf_un.kf_file.kf_file_size = shmfd->shm_size;
	if (shmfd->shm_path != NULL) {
		sx_slock(&shm_dict_lock);
		if (shmfd->shm_path != NULL) {
			path = shmfd->shm_path;
			pr_path = curthread->td_ucred->cr_prison->pr_path;
			if (strcmp(pr_path, "/") != 0) {
				/* Return the jail-rooted pathname. */
				pr_pathlen = strlen(pr_path);
				if (strncmp(path, pr_path, pr_pathlen) == 0 &&
				    path[pr_pathlen] == '/')
					path += pr_pathlen;
			}
			strlcpy(kif->kf_path, path, sizeof(kif->kf_path));
		}
		sx_sunlock(&shm_dict_lock);
	}
	return (0);
}