1 /*- 2 * Copyright (c) 2006 Robert N. M. Watson 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27 /* 28 * Support for shared swap-backed anonymous memory objects via 29 * shm_open(2) and shm_unlink(2). While most of the implementation is 30 * here, vm_mmap.c contains mapping logic changes. 31 * 32 * TODO: 33 * 34 * (2) Need to export data to a userland tool via a sysctl. Should ipcs(1) 35 * and ipcrm(1) be expanded or should new tools to manage both POSIX 36 * kernel semaphores and POSIX shared memory be written? 37 * 38 * (3) Add support for this file type to fstat(1). 39 * 40 * (4) Resource limits? Does this need its own resource limits or are the 41 * existing limits in mmap(2) sufficient? 42 * 43 * (5) Partial page truncation. vnode_pager_setsize() will zero any parts 44 * of a partially mapped page as a result of ftruncate(2)/truncate(2). 45 * We can do the same (with the same pmap evil), but do we need to 46 * worry about the bits on disk if the page is swapped out or will the 47 * swapper zero the parts of a page that are invalid if the page is 48 * swapped back in for us? 49 * 50 * (6) Add MAC support in mac_biba(4) and mac_mls(4). 51 * 52 * (7) Add a MAC check_create() hook for creating new named objects. 53 */ 54 55 #include <sys/cdefs.h> 56 __FBSDID("$FreeBSD$"); 57 58 #include <sys/param.h> 59 #include <sys/fcntl.h> 60 #include <sys/file.h> 61 #include <sys/filedesc.h> 62 #include <sys/fnv_hash.h> 63 #include <sys/kernel.h> 64 #include <sys/lock.h> 65 #include <sys/malloc.h> 66 #include <sys/mman.h> 67 #include <sys/mutex.h> 68 #include <sys/proc.h> 69 #include <sys/refcount.h> 70 #include <sys/resourcevar.h> 71 #include <sys/stat.h> 72 #include <sys/sysctl.h> 73 #include <sys/sysproto.h> 74 #include <sys/systm.h> 75 #include <sys/sx.h> 76 #include <sys/time.h> 77 #include <sys/vnode.h> 78 79 #include <security/mac/mac_framework.h> 80 81 #include <vm/vm.h> 82 #include <vm/vm_param.h> 83 #include <vm/pmap.h> 84 #include <vm/vm_map.h> 85 #include <vm/vm_object.h> 86 #include <vm/vm_page.h> 87 #include <vm/vm_pager.h> 88 #include <vm/swap_pager.h> 89 90 struct shm_mapping { 91 char *sm_path; 92 Fnv32_t sm_fnv; 93 struct shmfd *sm_shmfd; 94 LIST_ENTRY(shm_mapping) sm_link; 95 }; 96 97 static MALLOC_DEFINE(M_SHMFD, "shmfd", "shared memory file descriptor"); 98 static LIST_HEAD(, shm_mapping) *shm_dictionary; 99 static struct sx shm_dict_lock; 100 static struct mtx shm_timestamp_lock; 101 static u_long shm_hash; 102 103 #define SHM_HASH(fnv) (&shm_dictionary[(fnv) & shm_hash]) 104 105 static int shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags); 106 static struct shmfd *shm_alloc(struct ucred *ucred, mode_t mode); 107 static void shm_dict_init(void *arg); 108 static void shm_drop(struct shmfd *shmfd); 109 static struct shmfd *shm_hold(struct shmfd *shmfd); 110 static void shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd); 111 static struct shmfd *shm_lookup(char *path, Fnv32_t fnv); 112 static int shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred); 113 static void shm_dotruncate(struct shmfd *shmfd, off_t length); 114 115 static fo_rdwr_t shm_read; 116 static fo_rdwr_t shm_write; 117 static fo_truncate_t shm_truncate; 118 static fo_ioctl_t shm_ioctl; 119 static fo_poll_t shm_poll; 120 static fo_kqfilter_t shm_kqfilter; 121 static fo_stat_t shm_stat; 122 static fo_close_t shm_close; 123 124 /* File descriptor operations. */ 125 static struct fileops shm_ops = { 126 .fo_read = shm_read, 127 .fo_write = shm_write, 128 .fo_truncate = shm_truncate, 129 .fo_ioctl = shm_ioctl, 130 .fo_poll = shm_poll, 131 .fo_kqfilter = shm_kqfilter, 132 .fo_stat = shm_stat, 133 .fo_close = shm_close, 134 .fo_flags = DFLAG_PASSABLE 135 }; 136 137 FEATURE(posix_shm, "POSIX shared memory"); 138 139 static int 140 shm_read(struct file *fp, struct uio *uio, struct ucred *active_cred, 141 int flags, struct thread *td) 142 { 143 144 return (EOPNOTSUPP); 145 } 146 147 static int 148 shm_write(struct file *fp, struct uio *uio, struct ucred *active_cred, 149 int flags, struct thread *td) 150 { 151 152 return (EOPNOTSUPP); 153 } 154 155 static int 156 shm_truncate(struct file *fp, off_t length, struct ucred *active_cred, 157 struct thread *td) 158 { 159 struct shmfd *shmfd; 160 #ifdef MAC 161 int error; 162 #endif 163 164 shmfd = fp->f_data; 165 #ifdef MAC 166 error = mac_posixshm_check_truncate(active_cred, fp->f_cred, shmfd); 167 if (error) 168 return (error); 169 #endif 170 shm_dotruncate(shmfd, length); 171 return (0); 172 } 173 174 static int 175 shm_ioctl(struct file *fp, u_long com, void *data, 176 struct ucred *active_cred, struct thread *td) 177 { 178 179 return (EOPNOTSUPP); 180 } 181 182 static int 183 shm_poll(struct file *fp, int events, struct ucred *active_cred, 184 struct thread *td) 185 { 186 187 return (EOPNOTSUPP); 188 } 189 190 static int 191 shm_kqfilter(struct file *fp, struct knote *kn) 192 { 193 194 return (EOPNOTSUPP); 195 } 196 197 static int 198 shm_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, 199 struct thread *td) 200 { 201 struct shmfd *shmfd; 202 #ifdef MAC 203 int error; 204 #endif 205 206 shmfd = fp->f_data; 207 208 #ifdef MAC 209 error = mac_posixshm_check_stat(active_cred, fp->f_cred, shmfd); 210 if (error) 211 return (error); 212 #endif 213 214 /* 215 * Attempt to return sanish values for fstat() on a memory file 216 * descriptor. 217 */ 218 bzero(sb, sizeof(*sb)); 219 sb->st_mode = S_IFREG | shmfd->shm_mode; /* XXX */ 220 sb->st_blksize = PAGE_SIZE; 221 sb->st_size = shmfd->shm_size; 222 sb->st_blocks = (sb->st_size + sb->st_blksize - 1) / sb->st_blksize; 223 sb->st_atimespec = shmfd->shm_atime; 224 sb->st_ctimespec = shmfd->shm_ctime; 225 sb->st_mtimespec = shmfd->shm_mtime; 226 sb->st_birthtimespec = shmfd->shm_birthtime; 227 sb->st_uid = shmfd->shm_uid; 228 sb->st_gid = shmfd->shm_gid; 229 230 return (0); 231 } 232 233 static int 234 shm_close(struct file *fp, struct thread *td) 235 { 236 struct shmfd *shmfd; 237 238 shmfd = fp->f_data; 239 fp->f_data = NULL; 240 shm_drop(shmfd); 241 242 return (0); 243 } 244 245 static void 246 shm_dotruncate(struct shmfd *shmfd, off_t length) 247 { 248 vm_object_t object; 249 vm_page_t m; 250 vm_pindex_t nobjsize; 251 252 object = shmfd->shm_object; 253 VM_OBJECT_LOCK(object); 254 if (length == shmfd->shm_size) { 255 VM_OBJECT_UNLOCK(object); 256 return; 257 } 258 nobjsize = OFF_TO_IDX(length + PAGE_MASK); 259 260 /* Are we shrinking? If so, trim the end. */ 261 if (length < shmfd->shm_size) { 262 /* Toss in memory pages. */ 263 if (nobjsize < object->size) 264 vm_object_page_remove(object, nobjsize, object->size, 265 FALSE); 266 267 /* Toss pages from swap. */ 268 if (object->type == OBJT_SWAP) 269 swap_pager_freespace(object, nobjsize, 270 object->size - nobjsize); 271 272 /* 273 * If the last page is partially mapped, then zero out 274 * the garbage at the end of the page. See comments 275 * in vnode_pager_setsize() for more details. 276 * 277 * XXXJHB: This handles in memory pages, but what about 278 * a page swapped out to disk? 279 */ 280 if ((length & PAGE_MASK) && 281 (m = vm_page_lookup(object, OFF_TO_IDX(length))) != NULL && 282 m->valid != 0) { 283 int base = (int)length & PAGE_MASK; 284 int size = PAGE_SIZE - base; 285 286 pmap_zero_page_area(m, base, size); 287 288 /* 289 * Update the valid bits to reflect the blocks that 290 * have been zeroed. Some of these valid bits may 291 * have already been set. 292 */ 293 vm_page_set_valid(m, base, size); 294 295 /* 296 * Round "base" to the next block boundary so that the 297 * dirty bit for a partially zeroed block is not 298 * cleared. 299 */ 300 base = roundup2(base, DEV_BSIZE); 301 302 vm_page_lock_queues(); 303 vm_page_clear_dirty(m, base, PAGE_SIZE - base); 304 vm_page_unlock_queues(); 305 } else if ((length & PAGE_MASK) && 306 __predict_false(object->cache != NULL)) { 307 vm_page_cache_free(object, OFF_TO_IDX(length), 308 nobjsize); 309 } 310 } 311 shmfd->shm_size = length; 312 mtx_lock(&shm_timestamp_lock); 313 vfs_timestamp(&shmfd->shm_ctime); 314 shmfd->shm_mtime = shmfd->shm_ctime; 315 mtx_unlock(&shm_timestamp_lock); 316 object->size = nobjsize; 317 VM_OBJECT_UNLOCK(object); 318 } 319 320 /* 321 * shmfd object management including creation and reference counting 322 * routines. 323 */ 324 static struct shmfd * 325 shm_alloc(struct ucred *ucred, mode_t mode) 326 { 327 struct shmfd *shmfd; 328 329 shmfd = malloc(sizeof(*shmfd), M_SHMFD, M_WAITOK | M_ZERO); 330 shmfd->shm_size = 0; 331 shmfd->shm_uid = ucred->cr_uid; 332 shmfd->shm_gid = ucred->cr_gid; 333 shmfd->shm_mode = mode; 334 shmfd->shm_object = vm_pager_allocate(OBJT_DEFAULT, NULL, 335 shmfd->shm_size, VM_PROT_DEFAULT, 0); 336 KASSERT(shmfd->shm_object != NULL, ("shm_create: vm_pager_allocate")); 337 VM_OBJECT_LOCK(shmfd->shm_object); 338 vm_object_clear_flag(shmfd->shm_object, OBJ_ONEMAPPING); 339 vm_object_set_flag(shmfd->shm_object, OBJ_NOSPLIT); 340 VM_OBJECT_UNLOCK(shmfd->shm_object); 341 vfs_timestamp(&shmfd->shm_birthtime); 342 shmfd->shm_atime = shmfd->shm_mtime = shmfd->shm_ctime = 343 shmfd->shm_birthtime; 344 refcount_init(&shmfd->shm_refs, 1); 345 #ifdef MAC 346 mac_posixshm_init(shmfd); 347 mac_posixshm_create(ucred, shmfd); 348 #endif 349 350 return (shmfd); 351 } 352 353 static struct shmfd * 354 shm_hold(struct shmfd *shmfd) 355 { 356 357 refcount_acquire(&shmfd->shm_refs); 358 return (shmfd); 359 } 360 361 static void 362 shm_drop(struct shmfd *shmfd) 363 { 364 365 if (refcount_release(&shmfd->shm_refs)) { 366 #ifdef MAC 367 mac_posixshm_destroy(shmfd); 368 #endif 369 vm_object_deallocate(shmfd->shm_object); 370 free(shmfd, M_SHMFD); 371 } 372 } 373 374 /* 375 * Determine if the credentials have sufficient permissions for a 376 * specified combination of FREAD and FWRITE. 377 */ 378 static int 379 shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags) 380 { 381 accmode_t accmode; 382 383 accmode = 0; 384 if (flags & FREAD) 385 accmode |= VREAD; 386 if (flags & FWRITE) 387 accmode |= VWRITE; 388 return (vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid, 389 accmode, ucred, NULL)); 390 } 391 392 /* 393 * Dictionary management. We maintain an in-kernel dictionary to map 394 * paths to shmfd objects. We use the FNV hash on the path to store 395 * the mappings in a hash table. 396 */ 397 static void 398 shm_dict_init(void *arg) 399 { 400 401 mtx_init(&shm_timestamp_lock, "shm timestamps", NULL, MTX_DEF); 402 sx_init(&shm_dict_lock, "shm dictionary"); 403 shm_dictionary = hashinit(1024, M_SHMFD, &shm_hash); 404 } 405 SYSINIT(shm_dict_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_dict_init, NULL); 406 407 static struct shmfd * 408 shm_lookup(char *path, Fnv32_t fnv) 409 { 410 struct shm_mapping *map; 411 412 LIST_FOREACH(map, SHM_HASH(fnv), sm_link) { 413 if (map->sm_fnv != fnv) 414 continue; 415 if (strcmp(map->sm_path, path) == 0) 416 return (map->sm_shmfd); 417 } 418 419 return (NULL); 420 } 421 422 static void 423 shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd) 424 { 425 struct shm_mapping *map; 426 427 map = malloc(sizeof(struct shm_mapping), M_SHMFD, M_WAITOK); 428 map->sm_path = path; 429 map->sm_fnv = fnv; 430 map->sm_shmfd = shm_hold(shmfd); 431 LIST_INSERT_HEAD(SHM_HASH(fnv), map, sm_link); 432 } 433 434 static int 435 shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred) 436 { 437 struct shm_mapping *map; 438 int error; 439 440 LIST_FOREACH(map, SHM_HASH(fnv), sm_link) { 441 if (map->sm_fnv != fnv) 442 continue; 443 if (strcmp(map->sm_path, path) == 0) { 444 #ifdef MAC 445 error = mac_posixshm_check_unlink(ucred, map->sm_shmfd); 446 if (error) 447 return (error); 448 #endif 449 error = shm_access(map->sm_shmfd, ucred, 450 FREAD | FWRITE); 451 if (error) 452 return (error); 453 LIST_REMOVE(map, sm_link); 454 shm_drop(map->sm_shmfd); 455 free(map->sm_path, M_SHMFD); 456 free(map, M_SHMFD); 457 return (0); 458 } 459 } 460 461 return (ENOENT); 462 } 463 464 /* System calls. */ 465 int 466 shm_open(struct thread *td, struct shm_open_args *uap) 467 { 468 struct filedesc *fdp; 469 struct shmfd *shmfd; 470 struct file *fp; 471 char *path; 472 Fnv32_t fnv; 473 mode_t cmode; 474 int fd, error; 475 476 if ((uap->flags & O_ACCMODE) != O_RDONLY && 477 (uap->flags & O_ACCMODE) != O_RDWR) 478 return (EINVAL); 479 480 if ((uap->flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC)) != 0) 481 return (EINVAL); 482 483 fdp = td->td_proc->p_fd; 484 cmode = (uap->mode & ~fdp->fd_cmask) & ACCESSPERMS; 485 486 error = falloc(td, &fp, &fd); 487 if (error) 488 return (error); 489 490 /* A SHM_ANON path pointer creates an anonymous object. */ 491 if (uap->path == SHM_ANON) { 492 /* A read-only anonymous object is pointless. */ 493 if ((uap->flags & O_ACCMODE) == O_RDONLY) { 494 fdclose(fdp, fp, fd, td); 495 fdrop(fp, td); 496 return (EINVAL); 497 } 498 shmfd = shm_alloc(td->td_ucred, cmode); 499 } else { 500 path = malloc(MAXPATHLEN, M_SHMFD, M_WAITOK); 501 error = copyinstr(uap->path, path, MAXPATHLEN, NULL); 502 503 /* Require paths to start with a '/' character. */ 504 if (error == 0 && path[0] != '/') 505 error = EINVAL; 506 if (error) { 507 fdclose(fdp, fp, fd, td); 508 fdrop(fp, td); 509 free(path, M_SHMFD); 510 return (error); 511 } 512 513 fnv = fnv_32_str(path, FNV1_32_INIT); 514 sx_xlock(&shm_dict_lock); 515 shmfd = shm_lookup(path, fnv); 516 if (shmfd == NULL) { 517 /* Object does not yet exist, create it if requested. */ 518 if (uap->flags & O_CREAT) { 519 shmfd = shm_alloc(td->td_ucred, cmode); 520 shm_insert(path, fnv, shmfd); 521 } else { 522 free(path, M_SHMFD); 523 error = ENOENT; 524 } 525 } else { 526 /* 527 * Object already exists, obtain a new 528 * reference if requested and permitted. 529 */ 530 free(path, M_SHMFD); 531 if ((uap->flags & (O_CREAT | O_EXCL)) == 532 (O_CREAT | O_EXCL)) 533 error = EEXIST; 534 else { 535 #ifdef MAC 536 error = mac_posixshm_check_open(td->td_ucred, 537 shmfd); 538 if (error == 0) 539 #endif 540 error = shm_access(shmfd, td->td_ucred, 541 FFLAGS(uap->flags & O_ACCMODE)); 542 } 543 544 /* 545 * Truncate the file back to zero length if 546 * O_TRUNC was specified and the object was 547 * opened with read/write. 548 */ 549 if (error == 0 && 550 (uap->flags & (O_ACCMODE | O_TRUNC)) == 551 (O_RDWR | O_TRUNC)) { 552 #ifdef MAC 553 error = mac_posixshm_check_truncate( 554 td->td_ucred, fp->f_cred, shmfd); 555 if (error == 0) 556 #endif 557 shm_dotruncate(shmfd, 0); 558 } 559 if (error == 0) 560 shm_hold(shmfd); 561 } 562 sx_xunlock(&shm_dict_lock); 563 564 if (error) { 565 fdclose(fdp, fp, fd, td); 566 fdrop(fp, td); 567 return (error); 568 } 569 } 570 571 finit(fp, FFLAGS(uap->flags & O_ACCMODE), DTYPE_SHM, shmfd, &shm_ops); 572 573 FILEDESC_XLOCK(fdp); 574 if (fdp->fd_ofiles[fd] == fp) 575 fdp->fd_ofileflags[fd] |= UF_EXCLOSE; 576 FILEDESC_XUNLOCK(fdp); 577 td->td_retval[0] = fd; 578 fdrop(fp, td); 579 580 return (0); 581 } 582 583 int 584 shm_unlink(struct thread *td, struct shm_unlink_args *uap) 585 { 586 char *path; 587 Fnv32_t fnv; 588 int error; 589 590 path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); 591 error = copyinstr(uap->path, path, MAXPATHLEN, NULL); 592 if (error) { 593 free(path, M_TEMP); 594 return (error); 595 } 596 597 fnv = fnv_32_str(path, FNV1_32_INIT); 598 sx_xlock(&shm_dict_lock); 599 error = shm_remove(path, fnv, td->td_ucred); 600 sx_xunlock(&shm_dict_lock); 601 free(path, M_TEMP); 602 603 return (error); 604 } 605 606 /* 607 * mmap() helper to validate mmap() requests against shm object state 608 * and give mmap() the vm_object to use for the mapping. 609 */ 610 int 611 shm_mmap(struct shmfd *shmfd, vm_size_t objsize, vm_ooffset_t foff, 612 vm_object_t *obj) 613 { 614 615 /* 616 * XXXRW: This validation is probably insufficient, and subject to 617 * sign errors. It should be fixed. 618 */ 619 if (foff >= shmfd->shm_size || 620 foff + objsize > round_page(shmfd->shm_size)) 621 return (EINVAL); 622 623 mtx_lock(&shm_timestamp_lock); 624 vfs_timestamp(&shmfd->shm_atime); 625 mtx_unlock(&shm_timestamp_lock); 626 vm_object_reference(shmfd->shm_object); 627 *obj = shmfd->shm_object; 628 return (0); 629 } 630