18e38aeffSJohn Baldwin /*- 28a36da99SPedro F. Giffuni * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 38a36da99SPedro F. Giffuni * 415bcf785SRobert Watson * Copyright (c) 2006, 2011, 2016-2017 Robert N. M. Watson 5d301b358SKonstantin Belousov * Copyright 2020 The FreeBSD Foundation 68e38aeffSJohn Baldwin * All rights reserved. 78e38aeffSJohn Baldwin * 815bcf785SRobert Watson * Portions of this software were developed by BAE Systems, the University of 915bcf785SRobert Watson * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL 1015bcf785SRobert Watson * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent 1115bcf785SRobert Watson * Computing (TC) research program. 1215bcf785SRobert Watson * 13d301b358SKonstantin Belousov * Portions of this software were developed by Konstantin Belousov 14d301b358SKonstantin Belousov * under sponsorship from the FreeBSD Foundation. 15d301b358SKonstantin Belousov * 168e38aeffSJohn Baldwin * Redistribution and use in source and binary forms, with or without 178e38aeffSJohn Baldwin * modification, are permitted provided that the following conditions 188e38aeffSJohn Baldwin * are met: 198e38aeffSJohn Baldwin * 1. Redistributions of source code must retain the above copyright 208e38aeffSJohn Baldwin * notice, this list of conditions and the following disclaimer. 218e38aeffSJohn Baldwin * 2. Redistributions in binary form must reproduce the above copyright 228e38aeffSJohn Baldwin * notice, this list of conditions and the following disclaimer in the 238e38aeffSJohn Baldwin * documentation and/or other materials provided with the distribution. 248e38aeffSJohn Baldwin * 258e38aeffSJohn Baldwin * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 268e38aeffSJohn Baldwin * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 278e38aeffSJohn Baldwin * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 288e38aeffSJohn Baldwin * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 298e38aeffSJohn Baldwin * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 308e38aeffSJohn Baldwin * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 318e38aeffSJohn Baldwin * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 328e38aeffSJohn Baldwin * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 338e38aeffSJohn Baldwin * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 348e38aeffSJohn Baldwin * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 358e38aeffSJohn Baldwin * SUCH DAMAGE. 368e38aeffSJohn Baldwin */ 378e38aeffSJohn Baldwin 388e38aeffSJohn Baldwin /* 398e38aeffSJohn Baldwin * Support for shared swap-backed anonymous memory objects via 409afb12baSDavid Bright * shm_open(2), shm_rename(2), and shm_unlink(2). 419afb12baSDavid Bright * While most of the implementation is here, vm_mmap.c contains 429afb12baSDavid Bright * mapping logic changes. 438e38aeffSJohn Baldwin * 445c066cd2SKonstantin Belousov * posixshmcontrol(1) allows users to inspect the state of the memory 455c066cd2SKonstantin Belousov * objects. Per-uid swap resource limit controls total amount of 465c066cd2SKonstantin Belousov * memory that user can consume for anonymous objects, including 475c066cd2SKonstantin Belousov * shared. 
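 *
 * As a rough usage sketch (illustrative only, not part of this file),
 * a userspace consumer typically does something like:
 *
 *	fd = shm_open(SHM_ANON, O_RDWR, 0600);
 *	ftruncate(fd, len);
 *	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 * A named object is created the same way by passing a path such as
 * "/myshm" (an example name) instead of SHM_ANON.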
488e38aeffSJohn Baldwin */ 498e38aeffSJohn Baldwin 508e38aeffSJohn Baldwin #include <sys/cdefs.h> 518e38aeffSJohn Baldwin __FBSDID("$FreeBSD$"); 528e38aeffSJohn Baldwin 5312bc222eSJonathan Anderson #include "opt_capsicum.h" 54551a7895SRui Paulo #include "opt_ktrace.h" 5512bc222eSJonathan Anderson 568e38aeffSJohn Baldwin #include <sys/param.h> 574a144410SRobert Watson #include <sys/capsicum.h> 58610a2b3cSJohn Baldwin #include <sys/conf.h> 598e38aeffSJohn Baldwin #include <sys/fcntl.h> 608e38aeffSJohn Baldwin #include <sys/file.h> 618e38aeffSJohn Baldwin #include <sys/filedesc.h> 622b64ab22SMark Johnston #include <sys/filio.h> 638e38aeffSJohn Baldwin #include <sys/fnv_hash.h> 648e38aeffSJohn Baldwin #include <sys/kernel.h> 6591898857SMark Johnston #include <sys/limits.h> 66551a7895SRui Paulo #include <sys/uio.h> 67551a7895SRui Paulo #include <sys/signal.h> 68cc7b259aSJamie Gritton #include <sys/jail.h> 69551a7895SRui Paulo #include <sys/ktrace.h> 708e38aeffSJohn Baldwin #include <sys/lock.h> 718e38aeffSJohn Baldwin #include <sys/malloc.h> 728e38aeffSJohn Baldwin #include <sys/mman.h> 738e38aeffSJohn Baldwin #include <sys/mutex.h> 749c00bb91SKonstantin Belousov #include <sys/priv.h> 758e38aeffSJohn Baldwin #include <sys/proc.h> 768e38aeffSJohn Baldwin #include <sys/refcount.h> 778e38aeffSJohn Baldwin #include <sys/resourcevar.h> 7889f6b863SAttilio Rao #include <sys/rwlock.h> 7956d0e33eSKonstantin Belousov #include <sys/sbuf.h> 808e38aeffSJohn Baldwin #include <sys/stat.h> 817ee1b208SEd Schouten #include <sys/syscallsubr.h> 828e38aeffSJohn Baldwin #include <sys/sysctl.h> 838e38aeffSJohn Baldwin #include <sys/sysproto.h> 848e38aeffSJohn Baldwin #include <sys/systm.h> 858e38aeffSJohn Baldwin #include <sys/sx.h> 868e38aeffSJohn Baldwin #include <sys/time.h> 87d301b358SKonstantin Belousov #include <sys/vmmeter.h> 888e38aeffSJohn Baldwin #include <sys/vnode.h> 89940cb0e2SKonstantin Belousov #include <sys/unistd.h> 909696feebSJohn Baldwin #include <sys/user.h> 918e38aeffSJohn Baldwin 9215bcf785SRobert Watson #include <security/audit/audit.h> 938e38aeffSJohn Baldwin #include <security/mac/mac_framework.h> 948e38aeffSJohn Baldwin 958e38aeffSJohn Baldwin #include <vm/vm.h> 968e38aeffSJohn Baldwin #include <vm/vm_param.h> 978e38aeffSJohn Baldwin #include <vm/pmap.h> 98338e7cf2SJohn Baldwin #include <vm/vm_extern.h> 998e38aeffSJohn Baldwin #include <vm/vm_map.h> 100fb680e16SJohn Baldwin #include <vm/vm_kern.h> 1018e38aeffSJohn Baldwin #include <vm/vm_object.h> 1028e38aeffSJohn Baldwin #include <vm/vm_page.h> 1032971897dSAlan Cox #include <vm/vm_pageout.h> 1048e38aeffSJohn Baldwin #include <vm/vm_pager.h> 1058e38aeffSJohn Baldwin #include <vm/swap_pager.h> 1068e38aeffSJohn Baldwin 1078e38aeffSJohn Baldwin struct shm_mapping { 1088e38aeffSJohn Baldwin char *sm_path; 1098e38aeffSJohn Baldwin Fnv32_t sm_fnv; 1108e38aeffSJohn Baldwin struct shmfd *sm_shmfd; 1118e38aeffSJohn Baldwin LIST_ENTRY(shm_mapping) sm_link; 1128e38aeffSJohn Baldwin }; 1138e38aeffSJohn Baldwin 1148e38aeffSJohn Baldwin static MALLOC_DEFINE(M_SHMFD, "shmfd", "shared memory file descriptor"); 1158e38aeffSJohn Baldwin static LIST_HEAD(, shm_mapping) *shm_dictionary; 1168e38aeffSJohn Baldwin static struct sx shm_dict_lock; 1178e38aeffSJohn Baldwin static struct mtx shm_timestamp_lock; 1188e38aeffSJohn Baldwin static u_long shm_hash; 1197883ce1fSMateusz Guzik static struct unrhdr64 shm_ino_unr; 120610a2b3cSJohn Baldwin static dev_t shm_dev_ino; 1218e38aeffSJohn Baldwin 1228e38aeffSJohn Baldwin #define SHM_HASH(fnv) (&shm_dictionary[(fnv) & 
shm_hash]) 1238e38aeffSJohn Baldwin 1245be725d7SAndreas Tobler static void shm_init(void *arg); 1258e38aeffSJohn Baldwin static void shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd); 1268e38aeffSJohn Baldwin static struct shmfd *shm_lookup(char *path, Fnv32_t fnv); 1278e38aeffSJohn Baldwin static int shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred); 128d301b358SKonstantin Belousov static int shm_dotruncate_cookie(struct shmfd *shmfd, off_t length, 129d301b358SKonstantin Belousov void *rl_cookie); 130af755d3eSKyle Evans static int shm_dotruncate_locked(struct shmfd *shmfd, off_t length, 131af755d3eSKyle Evans void *rl_cookie); 1322d5603feSDavid Bright static int shm_copyin_path(struct thread *td, const char *userpath_in, 1332d5603feSDavid Bright char **path_out); 134454bc887SKa Ho Ng static int shm_deallocate(struct shmfd *shmfd, off_t *offset, 135454bc887SKa Ho Ng off_t *length, int flags); 1368e38aeffSJohn Baldwin 1378e38aeffSJohn Baldwin static fo_rdwr_t shm_read; 1388e38aeffSJohn Baldwin static fo_rdwr_t shm_write; 1398e38aeffSJohn Baldwin static fo_truncate_t shm_truncate; 1402b64ab22SMark Johnston static fo_ioctl_t shm_ioctl; 1418e38aeffSJohn Baldwin static fo_stat_t shm_stat; 1428e38aeffSJohn Baldwin static fo_close_t shm_close; 1439c00bb91SKonstantin Belousov static fo_chmod_t shm_chmod; 1449c00bb91SKonstantin Belousov static fo_chown_t shm_chown; 145940cb0e2SKonstantin Belousov static fo_seek_t shm_seek; 1469696feebSJohn Baldwin static fo_fill_kinfo_t shm_fill_kinfo; 1477077c426SJohn Baldwin static fo_mmap_t shm_mmap; 148af755d3eSKyle Evans static fo_get_seals_t shm_get_seals; 149af755d3eSKyle Evans static fo_add_seals_t shm_add_seals; 150f1040532SKyle Evans static fo_fallocate_t shm_fallocate; 151454bc887SKa Ho Ng static fo_fspacectl_t shm_fspacectl; 1528e38aeffSJohn Baldwin 1538e38aeffSJohn Baldwin /* File descriptor operations. 
*/ 1541bdbd705SKonstantin Belousov struct fileops shm_ops = { 1558e38aeffSJohn Baldwin .fo_read = shm_read, 1568e38aeffSJohn Baldwin .fo_write = shm_write, 1578e38aeffSJohn Baldwin .fo_truncate = shm_truncate, 1582b64ab22SMark Johnston .fo_ioctl = shm_ioctl, 1592d69d0dcSJohn Baldwin .fo_poll = invfo_poll, 1602d69d0dcSJohn Baldwin .fo_kqfilter = invfo_kqfilter, 1618e38aeffSJohn Baldwin .fo_stat = shm_stat, 1628e38aeffSJohn Baldwin .fo_close = shm_close, 1639c00bb91SKonstantin Belousov .fo_chmod = shm_chmod, 1649c00bb91SKonstantin Belousov .fo_chown = shm_chown, 165227aaa86SKonstantin Belousov .fo_sendfile = vn_sendfile, 166940cb0e2SKonstantin Belousov .fo_seek = shm_seek, 1679696feebSJohn Baldwin .fo_fill_kinfo = shm_fill_kinfo, 1687077c426SJohn Baldwin .fo_mmap = shm_mmap, 169af755d3eSKyle Evans .fo_get_seals = shm_get_seals, 170af755d3eSKyle Evans .fo_add_seals = shm_add_seals, 171f1040532SKyle Evans .fo_fallocate = shm_fallocate, 172454bc887SKa Ho Ng .fo_fspacectl = shm_fspacectl, 173d301b358SKonstantin Belousov .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE, 1748e38aeffSJohn Baldwin }; 1758e38aeffSJohn Baldwin 1768e38aeffSJohn Baldwin FEATURE(posix_shm, "POSIX shared memory"); 1778e38aeffSJohn Baldwin 178d301b358SKonstantin Belousov static SYSCTL_NODE(_vm, OID_AUTO, largepages, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 179d301b358SKonstantin Belousov ""); 180d301b358SKonstantin Belousov 181d301b358SKonstantin Belousov static int largepage_reclaim_tries = 1; 182d301b358SKonstantin Belousov SYSCTL_INT(_vm_largepages, OID_AUTO, reclaim_tries, 183d301b358SKonstantin Belousov CTLFLAG_RWTUN, &largepage_reclaim_tries, 0, 184d301b358SKonstantin Belousov "Number of contig reclaims before giving up for default alloc policy"); 185d301b358SKonstantin Belousov 1868e38aeffSJohn Baldwin static int 18741cf41fdSKonstantin Belousov uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio) 18841cf41fdSKonstantin Belousov { 18941cf41fdSKonstantin Belousov vm_page_t m; 19041cf41fdSKonstantin Belousov vm_pindex_t idx; 19141cf41fdSKonstantin Belousov size_t tlen; 19241cf41fdSKonstantin Belousov int error, offset, rv; 19341cf41fdSKonstantin Belousov 19441cf41fdSKonstantin Belousov idx = OFF_TO_IDX(uio->uio_offset); 19541cf41fdSKonstantin Belousov offset = uio->uio_offset & PAGE_MASK; 19641cf41fdSKonstantin Belousov tlen = MIN(PAGE_SIZE - offset, len); 19741cf41fdSKonstantin Belousov 198f72eaaebSJeff Roberson rv = vm_page_grab_valid_unlocked(&m, obj, idx, 199f72eaaebSJeff Roberson VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY | VM_ALLOC_NOCREAT); 200f72eaaebSJeff Roberson if (rv == VM_PAGER_OK) 201f72eaaebSJeff Roberson goto found; 20241cf41fdSKonstantin Belousov 20341cf41fdSKonstantin Belousov /* 2046311d7aaSWill Andrews * Read I/O without either a corresponding resident page or swap 2056311d7aaSWill Andrews * page: use zero_region. This is intended to avoid instantiating 2066311d7aaSWill Andrews * pages on read from a sparse region. 
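 * For example, if nothing has ever been written at the requested
 * offset, the read is satisfied below from zero_region and no page is
 * instantiated for the object.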
2076311d7aaSWill Andrews */ 208f72eaaebSJeff Roberson VM_OBJECT_WLOCK(obj); 209f72eaaebSJeff Roberson m = vm_page_lookup(obj, idx); 210f72eaaebSJeff Roberson if (uio->uio_rw == UIO_READ && m == NULL && 2116311d7aaSWill Andrews !vm_pager_has_page(obj, idx, NULL, NULL)) { 2126311d7aaSWill Andrews VM_OBJECT_WUNLOCK(obj); 213b9062c93SKonstantin Belousov return (uiomove(__DECONST(void *, zero_region), tlen, uio)); 2146311d7aaSWill Andrews } 2156311d7aaSWill Andrews 2166311d7aaSWill Andrews /* 21741cf41fdSKonstantin Belousov * Although the tmpfs vnode lock is held here, it is 21841cf41fdSKonstantin Belousov * nonetheless safe to sleep waiting for a free page. The 21941cf41fdSKonstantin Belousov * pageout daemon does not need to acquire the tmpfs vnode 22041cf41fdSKonstantin Belousov * lock to page out tobj's pages because tobj is a OBJT_SWAP 22141cf41fdSKonstantin Belousov * type object. 22241cf41fdSKonstantin Belousov */ 223c7575748SJeff Roberson rv = vm_page_grab_valid(&m, obj, idx, 224a8081778SJeff Roberson VM_ALLOC_NORMAL | VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY); 22541cf41fdSKonstantin Belousov if (rv != VM_PAGER_OK) { 22641cf41fdSKonstantin Belousov VM_OBJECT_WUNLOCK(obj); 227c7575748SJeff Roberson printf("uiomove_object: vm_obj %p idx %jd pager error %d\n", 228c7575748SJeff Roberson obj, idx, rv); 22941cf41fdSKonstantin Belousov return (EIO); 23041cf41fdSKonstantin Belousov } 23141cf41fdSKonstantin Belousov VM_OBJECT_WUNLOCK(obj); 232f72eaaebSJeff Roberson 233f72eaaebSJeff Roberson found: 23441cf41fdSKonstantin Belousov error = uiomove_fromphys(&m, offset, tlen, uio); 235a8081778SJeff Roberson if (uio->uio_rw == UIO_WRITE && error == 0) 236a8081778SJeff Roberson vm_page_set_dirty(m); 237d29f674fSJeff Roberson vm_page_activate(m); 238a8081778SJeff Roberson vm_page_sunbusy(m); 23941cf41fdSKonstantin Belousov 24041cf41fdSKonstantin Belousov return (error); 24141cf41fdSKonstantin Belousov } 24241cf41fdSKonstantin Belousov 24341cf41fdSKonstantin Belousov int 24441cf41fdSKonstantin Belousov uiomove_object(vm_object_t obj, off_t obj_size, struct uio *uio) 24541cf41fdSKonstantin Belousov { 24641cf41fdSKonstantin Belousov ssize_t resid; 24741cf41fdSKonstantin Belousov size_t len; 24841cf41fdSKonstantin Belousov int error; 24941cf41fdSKonstantin Belousov 25041cf41fdSKonstantin Belousov error = 0; 25141cf41fdSKonstantin Belousov while ((resid = uio->uio_resid) > 0) { 25241cf41fdSKonstantin Belousov if (obj_size <= uio->uio_offset) 25341cf41fdSKonstantin Belousov break; 25441cf41fdSKonstantin Belousov len = MIN(obj_size - uio->uio_offset, resid); 25541cf41fdSKonstantin Belousov if (len == 0) 25641cf41fdSKonstantin Belousov break; 25741cf41fdSKonstantin Belousov error = uiomove_object_page(obj, len, uio); 25841cf41fdSKonstantin Belousov if (error != 0 || resid == uio->uio_resid) 25941cf41fdSKonstantin Belousov break; 26041cf41fdSKonstantin Belousov } 26141cf41fdSKonstantin Belousov return (error); 26241cf41fdSKonstantin Belousov } 26341cf41fdSKonstantin Belousov 264d301b358SKonstantin Belousov static u_long count_largepages[MAXPAGESIZES]; 265d301b358SKonstantin Belousov 266d301b358SKonstantin Belousov static int 267d301b358SKonstantin Belousov shm_largepage_phys_populate(vm_object_t object, vm_pindex_t pidx, 268d301b358SKonstantin Belousov int fault_type, vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last) 269d301b358SKonstantin Belousov { 270d301b358SKonstantin Belousov vm_page_t m; 271d301b358SKonstantin Belousov int psind; 272d301b358SKonstantin Belousov 273d301b358SKonstantin Belousov psind = 
object->un_pager.phys.data_val;
274d301b358SKonstantin Belousov 	if (psind == 0 || pidx >= object->size)
275d301b358SKonstantin Belousov 		return (VM_PAGER_FAIL);
276d301b358SKonstantin Belousov 	*first = rounddown2(pidx, pagesizes[psind] / PAGE_SIZE);
277d301b358SKonstantin Belousov 
278d301b358SKonstantin Belousov 	/*
279d301b358SKonstantin Belousov 	 * We only busy the first page in the superpage run. It is
280d301b358SKonstantin Belousov 	 * useless to busy the whole run since we only remove full
281d301b358SKonstantin Belousov 	 * superpages, and it takes too long to busy e.g. 512 * 512 ==
282d301b358SKonstantin Belousov 	 * 262144 pages constituting a 1G amd64 superpage.
283d301b358SKonstantin Belousov 	 */
284d301b358SKonstantin Belousov 	m = vm_page_grab(object, *first, VM_ALLOC_NORMAL | VM_ALLOC_NOCREAT);
285d301b358SKonstantin Belousov 	MPASS(m != NULL);
286d301b358SKonstantin Belousov 
287d301b358SKonstantin Belousov 	*last = *first + atop(pagesizes[psind]) - 1;
288d301b358SKonstantin Belousov 	return (VM_PAGER_OK);
289d301b358SKonstantin Belousov }
290d301b358SKonstantin Belousov 
291d301b358SKonstantin Belousov static boolean_t
292d301b358SKonstantin Belousov shm_largepage_phys_haspage(vm_object_t object, vm_pindex_t pindex,
293d301b358SKonstantin Belousov     int *before, int *after)
294d301b358SKonstantin Belousov {
295d301b358SKonstantin Belousov 	int psind;
296d301b358SKonstantin Belousov 
297d301b358SKonstantin Belousov 	psind = object->un_pager.phys.data_val;
298d301b358SKonstantin Belousov 	if (psind == 0 || pindex >= object->size)
299d301b358SKonstantin Belousov 		return (FALSE);
300d301b358SKonstantin Belousov 	if (before != NULL) {
301d301b358SKonstantin Belousov 		*before = pindex - rounddown2(pindex, pagesizes[psind] /
302d301b358SKonstantin Belousov 		    PAGE_SIZE);
303d301b358SKonstantin Belousov 	}
304d301b358SKonstantin Belousov 	if (after != NULL) {
305d301b358SKonstantin Belousov 		*after = roundup2(pindex, pagesizes[psind] / PAGE_SIZE) -
306d301b358SKonstantin Belousov 		    pindex;
307d301b358SKonstantin Belousov 	}
308d301b358SKonstantin Belousov 	return (TRUE);
309d301b358SKonstantin Belousov }
310d301b358SKonstantin Belousov 
311d301b358SKonstantin Belousov static void
312d301b358SKonstantin Belousov shm_largepage_phys_ctor(vm_object_t object, vm_prot_t prot,
313d301b358SKonstantin Belousov     vm_ooffset_t foff, struct ucred *cred)
314d301b358SKonstantin Belousov {
315d301b358SKonstantin Belousov }
316d301b358SKonstantin Belousov 
317d301b358SKonstantin Belousov static void
318d301b358SKonstantin Belousov shm_largepage_phys_dtor(vm_object_t object)
319d301b358SKonstantin Belousov {
320d301b358SKonstantin Belousov 	int psind;
321d301b358SKonstantin Belousov 
322d301b358SKonstantin Belousov 	psind = object->un_pager.phys.data_val;
323d301b358SKonstantin Belousov 	if (psind != 0) {
324d301b358SKonstantin Belousov 		atomic_subtract_long(&count_largepages[psind],
325d301b358SKonstantin Belousov 		    object->size / (pagesizes[psind] / PAGE_SIZE));
326d301b358SKonstantin Belousov 		vm_wire_sub(object->size);
327d301b358SKonstantin Belousov 	} else {
328d301b358SKonstantin Belousov 		KASSERT(object->size == 0,
329d301b358SKonstantin Belousov 		    ("largepage phys obj %p not initialized but size %#jx > 0",
330d301b358SKonstantin Belousov 		    object, (uintmax_t)object->size));
331d301b358SKonstantin Belousov 	}
332d301b358SKonstantin Belousov }
333d301b358SKonstantin Belousov 
334d474440aSKonstantin Belousov static const struct phys_pager_ops shm_largepage_phys_ops = {
335d301b358SKonstantin Belousov 	.phys_pg_populate = shm_largepage_phys_populate,
336d301b358SKonstantin Belousov .phys_pg_haspage = shm_largepage_phys_haspage, 337d301b358SKonstantin Belousov .phys_pg_ctor = shm_largepage_phys_ctor, 338d301b358SKonstantin Belousov .phys_pg_dtor = shm_largepage_phys_dtor, 339d301b358SKonstantin Belousov }; 340d301b358SKonstantin Belousov 341d301b358SKonstantin Belousov bool 342d301b358SKonstantin Belousov shm_largepage(struct shmfd *shmfd) 343d301b358SKonstantin Belousov { 344d301b358SKonstantin Belousov return (shmfd->shm_object->type == OBJT_PHYS); 345d301b358SKonstantin Belousov } 346d301b358SKonstantin Belousov 34741cf41fdSKonstantin Belousov static int 348940cb0e2SKonstantin Belousov shm_seek(struct file *fp, off_t offset, int whence, struct thread *td) 349940cb0e2SKonstantin Belousov { 350940cb0e2SKonstantin Belousov struct shmfd *shmfd; 351940cb0e2SKonstantin Belousov off_t foffset; 352940cb0e2SKonstantin Belousov int error; 353940cb0e2SKonstantin Belousov 354940cb0e2SKonstantin Belousov shmfd = fp->f_data; 355940cb0e2SKonstantin Belousov foffset = foffset_lock(fp, 0); 356940cb0e2SKonstantin Belousov error = 0; 357940cb0e2SKonstantin Belousov switch (whence) { 358940cb0e2SKonstantin Belousov case L_INCR: 359940cb0e2SKonstantin Belousov if (foffset < 0 || 360940cb0e2SKonstantin Belousov (offset > 0 && foffset > OFF_MAX - offset)) { 361940cb0e2SKonstantin Belousov error = EOVERFLOW; 362940cb0e2SKonstantin Belousov break; 363940cb0e2SKonstantin Belousov } 364940cb0e2SKonstantin Belousov offset += foffset; 365940cb0e2SKonstantin Belousov break; 366940cb0e2SKonstantin Belousov case L_XTND: 367940cb0e2SKonstantin Belousov if (offset > 0 && shmfd->shm_size > OFF_MAX - offset) { 368940cb0e2SKonstantin Belousov error = EOVERFLOW; 369940cb0e2SKonstantin Belousov break; 370940cb0e2SKonstantin Belousov } 371940cb0e2SKonstantin Belousov offset += shmfd->shm_size; 372940cb0e2SKonstantin Belousov break; 373940cb0e2SKonstantin Belousov case L_SET: 374940cb0e2SKonstantin Belousov break; 375940cb0e2SKonstantin Belousov default: 376940cb0e2SKonstantin Belousov error = EINVAL; 377940cb0e2SKonstantin Belousov } 378940cb0e2SKonstantin Belousov if (error == 0) { 379940cb0e2SKonstantin Belousov if (offset < 0 || offset > shmfd->shm_size) 380940cb0e2SKonstantin Belousov error = EINVAL; 381940cb0e2SKonstantin Belousov else 3826f2b769cSJohn-Mark Gurney td->td_uretoff.tdu_off = offset; 383940cb0e2SKonstantin Belousov } 384940cb0e2SKonstantin Belousov foffset_unlock(fp, offset, error != 0 ? 
FOF_NOUPDATE : 0); 385940cb0e2SKonstantin Belousov return (error); 386940cb0e2SKonstantin Belousov } 387940cb0e2SKonstantin Belousov 388940cb0e2SKonstantin Belousov static int 3898e38aeffSJohn Baldwin shm_read(struct file *fp, struct uio *uio, struct ucred *active_cred, 3908e38aeffSJohn Baldwin int flags, struct thread *td) 3918e38aeffSJohn Baldwin { 392940cb0e2SKonstantin Belousov struct shmfd *shmfd; 393940cb0e2SKonstantin Belousov void *rl_cookie; 394940cb0e2SKonstantin Belousov int error; 3958e38aeffSJohn Baldwin 396940cb0e2SKonstantin Belousov shmfd = fp->f_data; 397940cb0e2SKonstantin Belousov #ifdef MAC 398940cb0e2SKonstantin Belousov error = mac_posixshm_check_read(active_cred, fp->f_cred, shmfd); 399940cb0e2SKonstantin Belousov if (error) 400940cb0e2SKonstantin Belousov return (error); 401940cb0e2SKonstantin Belousov #endif 4026ea906eeSJilles Tjoelker foffset_lock_uio(fp, uio, flags); 4036ea906eeSJilles Tjoelker rl_cookie = rangelock_rlock(&shmfd->shm_rl, uio->uio_offset, 4046ea906eeSJilles Tjoelker uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx); 405940cb0e2SKonstantin Belousov error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio); 406940cb0e2SKonstantin Belousov rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); 407940cb0e2SKonstantin Belousov foffset_unlock_uio(fp, uio, flags); 408940cb0e2SKonstantin Belousov return (error); 4098e38aeffSJohn Baldwin } 4108e38aeffSJohn Baldwin 4118e38aeffSJohn Baldwin static int 4128e38aeffSJohn Baldwin shm_write(struct file *fp, struct uio *uio, struct ucred *active_cred, 4138e38aeffSJohn Baldwin int flags, struct thread *td) 4148e38aeffSJohn Baldwin { 415940cb0e2SKonstantin Belousov struct shmfd *shmfd; 416940cb0e2SKonstantin Belousov void *rl_cookie; 417940cb0e2SKonstantin Belousov int error; 4183f07b9d9SKyle Evans off_t size; 4198e38aeffSJohn Baldwin 420940cb0e2SKonstantin Belousov shmfd = fp->f_data; 421940cb0e2SKonstantin Belousov #ifdef MAC 422940cb0e2SKonstantin Belousov error = mac_posixshm_check_write(active_cred, fp->f_cred, shmfd); 423940cb0e2SKonstantin Belousov if (error) 424940cb0e2SKonstantin Belousov return (error); 425940cb0e2SKonstantin Belousov #endif 426d301b358SKonstantin Belousov if (shm_largepage(shmfd) && shmfd->shm_lp_psind == 0) 427d301b358SKonstantin Belousov return (EINVAL); 428940cb0e2SKonstantin Belousov foffset_lock_uio(fp, uio, flags); 4293f07b9d9SKyle Evans if (uio->uio_resid > OFF_MAX - uio->uio_offset) { 4303f07b9d9SKyle Evans /* 4313f07b9d9SKyle Evans * Overflow is only an error if we're supposed to expand on 4323f07b9d9SKyle Evans * write. Otherwise, we'll just truncate the write to the 4333f07b9d9SKyle Evans * size of the file, which can only grow up to OFF_MAX. 
4343f07b9d9SKyle Evans */ 4353f07b9d9SKyle Evans if ((shmfd->shm_flags & SHM_GROW_ON_WRITE) != 0) { 4363f07b9d9SKyle Evans foffset_unlock_uio(fp, uio, flags); 4373f07b9d9SKyle Evans return (EFBIG); 4383f07b9d9SKyle Evans } 4393f07b9d9SKyle Evans 4403f07b9d9SKyle Evans size = shmfd->shm_size; 4413f07b9d9SKyle Evans } else { 4423f07b9d9SKyle Evans size = uio->uio_offset + uio->uio_resid; 4433f07b9d9SKyle Evans } 444940cb0e2SKonstantin Belousov if ((flags & FOF_OFFSET) == 0) { 445940cb0e2SKonstantin Belousov rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX, 446940cb0e2SKonstantin Belousov &shmfd->shm_mtx); 447940cb0e2SKonstantin Belousov } else { 448940cb0e2SKonstantin Belousov rl_cookie = rangelock_wlock(&shmfd->shm_rl, uio->uio_offset, 4493f07b9d9SKyle Evans size, &shmfd->shm_mtx); 450940cb0e2SKonstantin Belousov } 4513f07b9d9SKyle Evans if ((shmfd->shm_seals & F_SEAL_WRITE) != 0) { 452af755d3eSKyle Evans error = EPERM; 4533f07b9d9SKyle Evans } else { 4543f07b9d9SKyle Evans error = 0; 4553f07b9d9SKyle Evans if ((shmfd->shm_flags & SHM_GROW_ON_WRITE) != 0 && 4563f07b9d9SKyle Evans size > shmfd->shm_size) { 45779783634SKonstantin Belousov error = shm_dotruncate_cookie(shmfd, size, rl_cookie); 4583f07b9d9SKyle Evans } 4593f07b9d9SKyle Evans if (error == 0) 4603f07b9d9SKyle Evans error = uiomove_object(shmfd->shm_object, 4613f07b9d9SKyle Evans shmfd->shm_size, uio); 4623f07b9d9SKyle Evans } 463940cb0e2SKonstantin Belousov rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); 464940cb0e2SKonstantin Belousov foffset_unlock_uio(fp, uio, flags); 465940cb0e2SKonstantin Belousov return (error); 4668e38aeffSJohn Baldwin } 4678e38aeffSJohn Baldwin 4688e38aeffSJohn Baldwin static int 4698e38aeffSJohn Baldwin shm_truncate(struct file *fp, off_t length, struct ucred *active_cred, 4708e38aeffSJohn Baldwin struct thread *td) 4718e38aeffSJohn Baldwin { 4728e38aeffSJohn Baldwin struct shmfd *shmfd; 4738e38aeffSJohn Baldwin #ifdef MAC 4748e38aeffSJohn Baldwin int error; 4758e38aeffSJohn Baldwin #endif 4768e38aeffSJohn Baldwin 4778e38aeffSJohn Baldwin shmfd = fp->f_data; 4788e38aeffSJohn Baldwin #ifdef MAC 4798e38aeffSJohn Baldwin error = mac_posixshm_check_truncate(active_cred, fp->f_cred, shmfd); 4808e38aeffSJohn Baldwin if (error) 4818e38aeffSJohn Baldwin return (error); 4828e38aeffSJohn Baldwin #endif 4833364c323SKonstantin Belousov return (shm_dotruncate(shmfd, length)); 4848e38aeffSJohn Baldwin } 4858e38aeffSJohn Baldwin 4862b64ab22SMark Johnston int 4872b64ab22SMark Johnston shm_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, 4882b64ab22SMark Johnston struct thread *td) 4892b64ab22SMark Johnston { 490d301b358SKonstantin Belousov struct shmfd *shmfd; 491d301b358SKonstantin Belousov struct shm_largepage_conf *conf; 492d301b358SKonstantin Belousov void *rl_cookie; 4932b64ab22SMark Johnston 494d301b358SKonstantin Belousov shmfd = fp->f_data; 4952b64ab22SMark Johnston switch (com) { 4962b64ab22SMark Johnston case FIONBIO: 4972b64ab22SMark Johnston case FIOASYNC: 4982b64ab22SMark Johnston /* 4992b64ab22SMark Johnston * Allow fcntl(fd, F_SETFL, O_NONBLOCK) to work, 5002b64ab22SMark Johnston * just like it would on an unlinked regular file 5012b64ab22SMark Johnston */ 5022b64ab22SMark Johnston return (0); 503d301b358SKonstantin Belousov case FIOSSHMLPGCNF: 504d301b358SKonstantin Belousov if (!shm_largepage(shmfd)) 505d301b358SKonstantin Belousov return (ENOTTY); 506d301b358SKonstantin Belousov conf = data; 507d301b358SKonstantin Belousov if (shmfd->shm_lp_psind != 0 && 
508d301b358SKonstantin Belousov conf->psind != shmfd->shm_lp_psind) 509d301b358SKonstantin Belousov return (EINVAL); 510d301b358SKonstantin Belousov if (conf->psind <= 0 || conf->psind >= MAXPAGESIZES || 511d301b358SKonstantin Belousov pagesizes[conf->psind] == 0) 512d301b358SKonstantin Belousov return (EINVAL); 513d301b358SKonstantin Belousov if (conf->alloc_policy != SHM_LARGEPAGE_ALLOC_DEFAULT && 514d301b358SKonstantin Belousov conf->alloc_policy != SHM_LARGEPAGE_ALLOC_NOWAIT && 515d301b358SKonstantin Belousov conf->alloc_policy != SHM_LARGEPAGE_ALLOC_HARD) 516d301b358SKonstantin Belousov return (EINVAL); 517d301b358SKonstantin Belousov 518d301b358SKonstantin Belousov rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX, 519d301b358SKonstantin Belousov &shmfd->shm_mtx); 520d301b358SKonstantin Belousov shmfd->shm_lp_psind = conf->psind; 521d301b358SKonstantin Belousov shmfd->shm_lp_alloc_policy = conf->alloc_policy; 522d301b358SKonstantin Belousov shmfd->shm_object->un_pager.phys.data_val = conf->psind; 523d301b358SKonstantin Belousov rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); 524d301b358SKonstantin Belousov return (0); 525d301b358SKonstantin Belousov case FIOGSHMLPGCNF: 526d301b358SKonstantin Belousov if (!shm_largepage(shmfd)) 527d301b358SKonstantin Belousov return (ENOTTY); 528d301b358SKonstantin Belousov conf = data; 529d301b358SKonstantin Belousov rl_cookie = rangelock_rlock(&shmfd->shm_rl, 0, OFF_MAX, 530d301b358SKonstantin Belousov &shmfd->shm_mtx); 531d301b358SKonstantin Belousov conf->psind = shmfd->shm_lp_psind; 532d301b358SKonstantin Belousov conf->alloc_policy = shmfd->shm_lp_alloc_policy; 533d301b358SKonstantin Belousov rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); 534d301b358SKonstantin Belousov return (0); 5352b64ab22SMark Johnston default: 5362b64ab22SMark Johnston return (ENOTTY); 5372b64ab22SMark Johnston } 5382b64ab22SMark Johnston } 5392b64ab22SMark Johnston 5408e38aeffSJohn Baldwin static int 541*2b68eb8eSMateusz Guzik shm_stat(struct file *fp, struct stat *sb, struct ucred *active_cred) 5428e38aeffSJohn Baldwin { 5438e38aeffSJohn Baldwin struct shmfd *shmfd; 5448e38aeffSJohn Baldwin #ifdef MAC 5458e38aeffSJohn Baldwin int error; 5468e38aeffSJohn Baldwin #endif 5478e38aeffSJohn Baldwin 5488e38aeffSJohn Baldwin shmfd = fp->f_data; 5498e38aeffSJohn Baldwin 5508e38aeffSJohn Baldwin #ifdef MAC 5518e38aeffSJohn Baldwin error = mac_posixshm_check_stat(active_cred, fp->f_cred, shmfd); 5528e38aeffSJohn Baldwin if (error) 5538e38aeffSJohn Baldwin return (error); 5548e38aeffSJohn Baldwin #endif 5558e38aeffSJohn Baldwin 5568e38aeffSJohn Baldwin /* 5578e38aeffSJohn Baldwin * Attempt to return sanish values for fstat() on a memory file 5588e38aeffSJohn Baldwin * descriptor. 5598e38aeffSJohn Baldwin */ 5608e38aeffSJohn Baldwin bzero(sb, sizeof(*sb)); 5618e38aeffSJohn Baldwin sb->st_blksize = PAGE_SIZE; 5628e38aeffSJohn Baldwin sb->st_size = shmfd->shm_size; 56355e0987aSPedro F. 
Giffuni sb->st_blocks = howmany(sb->st_size, sb->st_blksize); 5649c00bb91SKonstantin Belousov mtx_lock(&shm_timestamp_lock); 565510ea843SEd Schouten sb->st_atim = shmfd->shm_atime; 566510ea843SEd Schouten sb->st_ctim = shmfd->shm_ctime; 567510ea843SEd Schouten sb->st_mtim = shmfd->shm_mtime; 568510ea843SEd Schouten sb->st_birthtim = shmfd->shm_birthtime; 5699c00bb91SKonstantin Belousov sb->st_mode = S_IFREG | shmfd->shm_mode; /* XXX */ 5708e38aeffSJohn Baldwin sb->st_uid = shmfd->shm_uid; 5718e38aeffSJohn Baldwin sb->st_gid = shmfd->shm_gid; 5729c00bb91SKonstantin Belousov mtx_unlock(&shm_timestamp_lock); 573610a2b3cSJohn Baldwin sb->st_dev = shm_dev_ino; 574610a2b3cSJohn Baldwin sb->st_ino = shmfd->shm_ino; 575e4b77548SKonstantin Belousov sb->st_nlink = shmfd->shm_object->ref_count; 576d301b358SKonstantin Belousov sb->st_blocks = shmfd->shm_object->size / 577d301b358SKonstantin Belousov (pagesizes[shmfd->shm_lp_psind] >> PAGE_SHIFT); 5788e38aeffSJohn Baldwin 5798e38aeffSJohn Baldwin return (0); 5808e38aeffSJohn Baldwin } 5818e38aeffSJohn Baldwin 5828e38aeffSJohn Baldwin static int 5838e38aeffSJohn Baldwin shm_close(struct file *fp, struct thread *td) 5848e38aeffSJohn Baldwin { 5858e38aeffSJohn Baldwin struct shmfd *shmfd; 5868e38aeffSJohn Baldwin 5878e38aeffSJohn Baldwin shmfd = fp->f_data; 5888e38aeffSJohn Baldwin fp->f_data = NULL; 5898e38aeffSJohn Baldwin shm_drop(shmfd); 5908e38aeffSJohn Baldwin 5918e38aeffSJohn Baldwin return (0); 5928e38aeffSJohn Baldwin } 5938e38aeffSJohn Baldwin 594af755d3eSKyle Evans static int 5952d5603feSDavid Bright shm_copyin_path(struct thread *td, const char *userpath_in, char **path_out) { 5962d5603feSDavid Bright int error; 5972d5603feSDavid Bright char *path; 5982d5603feSDavid Bright const char *pr_path; 5992d5603feSDavid Bright size_t pr_pathlen; 6002d5603feSDavid Bright 6012d5603feSDavid Bright path = malloc(MAXPATHLEN, M_SHMFD, M_WAITOK); 6022d5603feSDavid Bright pr_path = td->td_ucred->cr_prison->pr_path; 6032d5603feSDavid Bright 6042d5603feSDavid Bright /* Construct a full pathname for jailed callers. */ 6052d5603feSDavid Bright pr_pathlen = strcmp(pr_path, "/") == 6062d5603feSDavid Bright 0 ? 0 : strlcpy(path, pr_path, MAXPATHLEN); 6072d5603feSDavid Bright error = copyinstr(userpath_in, path + pr_pathlen, 6082d5603feSDavid Bright MAXPATHLEN - pr_pathlen, NULL); 6092d5603feSDavid Bright if (error != 0) 6102d5603feSDavid Bright goto out; 6112d5603feSDavid Bright 6122d5603feSDavid Bright #ifdef KTRACE 6132d5603feSDavid Bright if (KTRPOINT(curthread, KTR_NAMEI)) 6142d5603feSDavid Bright ktrnamei(path); 6152d5603feSDavid Bright #endif 6162d5603feSDavid Bright 6172d5603feSDavid Bright /* Require paths to start with a '/' character. 
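 * As an illustration (hypothetical values): for a jailed caller whose
 * prison pr_path is "/jails/a", a userpath of "/foo" yields the
 * in-kernel path "/jails/a/foo", and path[pr_pathlen] is the leading
 * '/' of the user-supplied component that is checked below.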
*/ 6182d5603feSDavid Bright if (path[pr_pathlen] != '/') { 6192d5603feSDavid Bright error = EINVAL; 6202d5603feSDavid Bright goto out; 6212d5603feSDavid Bright } 6222d5603feSDavid Bright 6232d5603feSDavid Bright *path_out = path; 6242d5603feSDavid Bright 6252d5603feSDavid Bright out: 6262d5603feSDavid Bright if (error != 0) 6272d5603feSDavid Bright free(path, M_SHMFD); 6282d5603feSDavid Bright 6292d5603feSDavid Bright return (error); 6302d5603feSDavid Bright } 6312d5603feSDavid Bright 6322d5603feSDavid Bright static int 633454bc887SKa Ho Ng shm_partial_page_invalidate(vm_object_t object, vm_pindex_t idx, int base, 634454bc887SKa Ho Ng int end) 635454bc887SKa Ho Ng { 636454bc887SKa Ho Ng vm_page_t m; 637454bc887SKa Ho Ng int rv; 638454bc887SKa Ho Ng 639454bc887SKa Ho Ng VM_OBJECT_ASSERT_WLOCKED(object); 640454bc887SKa Ho Ng KASSERT(base >= 0, ("%s: base %d", __func__, base)); 641454bc887SKa Ho Ng KASSERT(end - base <= PAGE_SIZE, ("%s: base %d end %d", __func__, base, 642454bc887SKa Ho Ng end)); 643454bc887SKa Ho Ng 644454bc887SKa Ho Ng retry: 645454bc887SKa Ho Ng m = vm_page_grab(object, idx, VM_ALLOC_NOCREAT); 646454bc887SKa Ho Ng if (m != NULL) { 647454bc887SKa Ho Ng MPASS(vm_page_all_valid(m)); 648454bc887SKa Ho Ng } else if (vm_pager_has_page(object, idx, NULL, NULL)) { 649454bc887SKa Ho Ng m = vm_page_alloc(object, idx, 650454bc887SKa Ho Ng VM_ALLOC_NORMAL | VM_ALLOC_WAITFAIL); 651454bc887SKa Ho Ng if (m == NULL) 652454bc887SKa Ho Ng goto retry; 653454bc887SKa Ho Ng vm_object_pip_add(object, 1); 654454bc887SKa Ho Ng VM_OBJECT_WUNLOCK(object); 655454bc887SKa Ho Ng rv = vm_pager_get_pages(object, &m, 1, NULL, NULL); 656454bc887SKa Ho Ng VM_OBJECT_WLOCK(object); 657454bc887SKa Ho Ng vm_object_pip_wakeup(object); 658454bc887SKa Ho Ng if (rv == VM_PAGER_OK) { 659454bc887SKa Ho Ng /* 660454bc887SKa Ho Ng * Since the page was not resident, and therefore not 661454bc887SKa Ho Ng * recently accessed, immediately enqueue it for 662454bc887SKa Ho Ng * asynchronous laundering. The current operation is 663454bc887SKa Ho Ng * not regarded as an access. 
664454bc887SKa Ho Ng */ 665454bc887SKa Ho Ng vm_page_launder(m); 666454bc887SKa Ho Ng } else { 667454bc887SKa Ho Ng vm_page_free(m); 668454bc887SKa Ho Ng VM_OBJECT_WUNLOCK(object); 669454bc887SKa Ho Ng return (EIO); 670454bc887SKa Ho Ng } 671454bc887SKa Ho Ng } 672454bc887SKa Ho Ng if (m != NULL) { 673454bc887SKa Ho Ng pmap_zero_page_area(m, base, end - base); 674454bc887SKa Ho Ng KASSERT(vm_page_all_valid(m), ("%s: page %p is invalid", 675454bc887SKa Ho Ng __func__, m)); 676454bc887SKa Ho Ng vm_page_set_dirty(m); 677454bc887SKa Ho Ng vm_page_xunbusy(m); 678454bc887SKa Ho Ng } 679454bc887SKa Ho Ng 680454bc887SKa Ho Ng return (0); 681454bc887SKa Ho Ng } 682454bc887SKa Ho Ng 683454bc887SKa Ho Ng static int 684af755d3eSKyle Evans shm_dotruncate_locked(struct shmfd *shmfd, off_t length, void *rl_cookie) 6858e38aeffSJohn Baldwin { 6868e38aeffSJohn Baldwin vm_object_t object; 687454bc887SKa Ho Ng vm_pindex_t nobjsize; 6883364c323SKonstantin Belousov vm_ooffset_t delta; 689454bc887SKa Ho Ng int base, error; 6908e38aeffSJohn Baldwin 6912a016de1SAlan Cox KASSERT(length >= 0, ("shm_dotruncate: length < 0")); 6928e38aeffSJohn Baldwin object = shmfd->shm_object; 693af755d3eSKyle Evans VM_OBJECT_ASSERT_WLOCKED(object); 694af755d3eSKyle Evans rangelock_cookie_assert(rl_cookie, RA_WLOCKED); 695af755d3eSKyle Evans if (length == shmfd->shm_size) 6963364c323SKonstantin Belousov return (0); 6978e38aeffSJohn Baldwin nobjsize = OFF_TO_IDX(length + PAGE_MASK); 6988e38aeffSJohn Baldwin 6998e38aeffSJohn Baldwin /* Are we shrinking? If so, trim the end. */ 7008e38aeffSJohn Baldwin if (length < shmfd->shm_size) { 701af755d3eSKyle Evans if ((shmfd->shm_seals & F_SEAL_SHRINK) != 0) 702af755d3eSKyle Evans return (EPERM); 703af755d3eSKyle Evans 704fb680e16SJohn Baldwin /* 705fb680e16SJohn Baldwin * Disallow any requests to shrink the size if this 706fb680e16SJohn Baldwin * object is mapped into the kernel. 707fb680e16SJohn Baldwin */ 708af755d3eSKyle Evans if (shmfd->shm_kmappings > 0) 709fb680e16SJohn Baldwin return (EBUSY); 7102971897dSAlan Cox 7112971897dSAlan Cox /* 7122971897dSAlan Cox * Zero the truncated part of the last page. 7132971897dSAlan Cox */ 7142971897dSAlan Cox base = length & PAGE_MASK; 7152971897dSAlan Cox if (base != 0) { 716454bc887SKa Ho Ng error = shm_partial_page_invalidate(object, 717454bc887SKa Ho Ng OFF_TO_IDX(length), base, PAGE_SIZE); 718454bc887SKa Ho Ng if (error) 719454bc887SKa Ho Ng return (error); 7202971897dSAlan Cox } 7212a016de1SAlan Cox delta = IDX_TO_OFF(object->size - nobjsize); 7223364c323SKonstantin Belousov 7238e38aeffSJohn Baldwin if (nobjsize < object->size) 7248e38aeffSJohn Baldwin vm_object_page_remove(object, nobjsize, object->size, 7256bbee8e2SAlan Cox 0); 7268e38aeffSJohn Baldwin 7273364c323SKonstantin Belousov /* Free the swap accounted for shm */ 728ef694c1aSEdward Tomasz Napierala swap_release_by_cred(delta, object->cred); 7293364c323SKonstantin Belousov object->charge -= delta; 7303364c323SKonstantin Belousov } else { 731af755d3eSKyle Evans if ((shmfd->shm_seals & F_SEAL_GROW) != 0) 732af755d3eSKyle Evans return (EPERM); 733af755d3eSKyle Evans 7342a016de1SAlan Cox /* Try to reserve additional swap space. 
*/ 7352a016de1SAlan Cox delta = IDX_TO_OFF(nobjsize - object->size); 736af755d3eSKyle Evans if (!swap_reserve_by_cred(delta, object->cred)) 7373364c323SKonstantin Belousov return (ENOMEM); 7383364c323SKonstantin Belousov object->charge += delta; 7398e38aeffSJohn Baldwin } 7408e38aeffSJohn Baldwin shmfd->shm_size = length; 7418e38aeffSJohn Baldwin mtx_lock(&shm_timestamp_lock); 7428e38aeffSJohn Baldwin vfs_timestamp(&shmfd->shm_ctime); 7438e38aeffSJohn Baldwin shmfd->shm_mtime = shmfd->shm_ctime; 7448e38aeffSJohn Baldwin mtx_unlock(&shm_timestamp_lock); 7458e38aeffSJohn Baldwin object->size = nobjsize; 7463364c323SKonstantin Belousov return (0); 7478e38aeffSJohn Baldwin } 7488e38aeffSJohn Baldwin 749d301b358SKonstantin Belousov static int 750d301b358SKonstantin Belousov shm_dotruncate_largepage(struct shmfd *shmfd, off_t length, void *rl_cookie) 751d301b358SKonstantin Belousov { 752d301b358SKonstantin Belousov vm_object_t object; 753d301b358SKonstantin Belousov vm_page_t m; 754d301b358SKonstantin Belousov vm_pindex_t newobjsz, oldobjsz; 755d301b358SKonstantin Belousov int aflags, error, i, psind, try; 756d301b358SKonstantin Belousov 757d301b358SKonstantin Belousov KASSERT(length >= 0, ("shm_dotruncate: length < 0")); 758d301b358SKonstantin Belousov object = shmfd->shm_object; 759d301b358SKonstantin Belousov VM_OBJECT_ASSERT_WLOCKED(object); 760d301b358SKonstantin Belousov rangelock_cookie_assert(rl_cookie, RA_WLOCKED); 761d301b358SKonstantin Belousov 762d301b358SKonstantin Belousov oldobjsz = object->size; 763d301b358SKonstantin Belousov newobjsz = OFF_TO_IDX(length); 764d301b358SKonstantin Belousov if (length == shmfd->shm_size) 765d301b358SKonstantin Belousov return (0); 766d301b358SKonstantin Belousov psind = shmfd->shm_lp_psind; 767d301b358SKonstantin Belousov if (psind == 0 && length != 0) 768d301b358SKonstantin Belousov return (EINVAL); 769d301b358SKonstantin Belousov if ((length & (pagesizes[psind] - 1)) != 0) 770d301b358SKonstantin Belousov return (EINVAL); 771d301b358SKonstantin Belousov 772d301b358SKonstantin Belousov if (length < shmfd->shm_size) { 773d301b358SKonstantin Belousov if ((shmfd->shm_seals & F_SEAL_SHRINK) != 0) 774d301b358SKonstantin Belousov return (EPERM); 775d301b358SKonstantin Belousov if (shmfd->shm_kmappings > 0) 776d301b358SKonstantin Belousov return (EBUSY); 777d301b358SKonstantin Belousov return (ENOTSUP); /* Pages are unmanaged. */ 778d301b358SKonstantin Belousov #if 0 779d301b358SKonstantin Belousov vm_object_page_remove(object, newobjsz, oldobjsz, 0); 780d301b358SKonstantin Belousov object->size = newobjsz; 781d301b358SKonstantin Belousov shmfd->shm_size = length; 782d301b358SKonstantin Belousov return (0); 783d301b358SKonstantin Belousov #endif 784d301b358SKonstantin Belousov } 785d301b358SKonstantin Belousov 78679783634SKonstantin Belousov if ((shmfd->shm_seals & F_SEAL_GROW) != 0) 78779783634SKonstantin Belousov return (EPERM); 78879783634SKonstantin Belousov 789d301b358SKonstantin Belousov aflags = VM_ALLOC_NORMAL | VM_ALLOC_ZERO; 790d301b358SKonstantin Belousov if (shmfd->shm_lp_alloc_policy == SHM_LARGEPAGE_ALLOC_NOWAIT) 791d301b358SKonstantin Belousov aflags |= VM_ALLOC_WAITFAIL; 792d301b358SKonstantin Belousov try = 0; 793d301b358SKonstantin Belousov 794d301b358SKonstantin Belousov /* 795d301b358SKonstantin Belousov * Extend shmfd and object, keeping all already fully 796d301b358SKonstantin Belousov * allocated large pages intact even on error, because dropped 797d301b358SKonstantin Belousov * object lock might allowed mapping of them. 
798d301b358SKonstantin Belousov */ 799d301b358SKonstantin Belousov while (object->size < newobjsz) { 800d301b358SKonstantin Belousov m = vm_page_alloc_contig(object, object->size, aflags, 801d301b358SKonstantin Belousov pagesizes[psind] / PAGE_SIZE, 0, ~0, 802d301b358SKonstantin Belousov pagesizes[psind], 0, 803d301b358SKonstantin Belousov VM_MEMATTR_DEFAULT); 804d301b358SKonstantin Belousov if (m == NULL) { 805d301b358SKonstantin Belousov VM_OBJECT_WUNLOCK(object); 806d301b358SKonstantin Belousov if (shmfd->shm_lp_alloc_policy == 807d301b358SKonstantin Belousov SHM_LARGEPAGE_ALLOC_NOWAIT || 808d301b358SKonstantin Belousov (shmfd->shm_lp_alloc_policy == 809d301b358SKonstantin Belousov SHM_LARGEPAGE_ALLOC_DEFAULT && 810d301b358SKonstantin Belousov try >= largepage_reclaim_tries)) { 811d301b358SKonstantin Belousov VM_OBJECT_WLOCK(object); 812d301b358SKonstantin Belousov return (ENOMEM); 813d301b358SKonstantin Belousov } 814d301b358SKonstantin Belousov error = vm_page_reclaim_contig(aflags, 815d301b358SKonstantin Belousov pagesizes[psind] / PAGE_SIZE, 0, ~0, 816d301b358SKonstantin Belousov pagesizes[psind], 0) ? 0 : 817d301b358SKonstantin Belousov vm_wait_intr(object); 818d301b358SKonstantin Belousov if (error != 0) { 819d301b358SKonstantin Belousov VM_OBJECT_WLOCK(object); 820d301b358SKonstantin Belousov return (error); 821d301b358SKonstantin Belousov } 822d301b358SKonstantin Belousov try++; 823d301b358SKonstantin Belousov VM_OBJECT_WLOCK(object); 824d301b358SKonstantin Belousov continue; 825d301b358SKonstantin Belousov } 826d301b358SKonstantin Belousov try = 0; 827d301b358SKonstantin Belousov for (i = 0; i < pagesizes[psind] / PAGE_SIZE; i++) { 828d301b358SKonstantin Belousov if ((m[i].flags & PG_ZERO) == 0) 829d301b358SKonstantin Belousov pmap_zero_page(&m[i]); 830d301b358SKonstantin Belousov vm_page_valid(&m[i]); 831d301b358SKonstantin Belousov vm_page_xunbusy(&m[i]); 832d301b358SKonstantin Belousov } 833d301b358SKonstantin Belousov object->size += OFF_TO_IDX(pagesizes[psind]); 834d301b358SKonstantin Belousov shmfd->shm_size += pagesizes[psind]; 835d301b358SKonstantin Belousov atomic_add_long(&count_largepages[psind], 1); 836d301b358SKonstantin Belousov vm_wire_add(atop(pagesizes[psind])); 837d301b358SKonstantin Belousov } 838d301b358SKonstantin Belousov return (0); 839d301b358SKonstantin Belousov } 840d301b358SKonstantin Belousov 841d301b358SKonstantin Belousov static int 842d301b358SKonstantin Belousov shm_dotruncate_cookie(struct shmfd *shmfd, off_t length, void *rl_cookie) 843d301b358SKonstantin Belousov { 844d301b358SKonstantin Belousov int error; 845d301b358SKonstantin Belousov 846d301b358SKonstantin Belousov VM_OBJECT_WLOCK(shmfd->shm_object); 847d301b358SKonstantin Belousov error = shm_largepage(shmfd) ? 
shm_dotruncate_largepage(shmfd, 848d301b358SKonstantin Belousov length, rl_cookie) : shm_dotruncate_locked(shmfd, length, 849d301b358SKonstantin Belousov rl_cookie); 850d301b358SKonstantin Belousov VM_OBJECT_WUNLOCK(shmfd->shm_object); 851d301b358SKonstantin Belousov return (error); 852d301b358SKonstantin Belousov } 853d301b358SKonstantin Belousov 854af755d3eSKyle Evans int 855af755d3eSKyle Evans shm_dotruncate(struct shmfd *shmfd, off_t length) 856af755d3eSKyle Evans { 857af755d3eSKyle Evans void *rl_cookie; 858af755d3eSKyle Evans int error; 859af755d3eSKyle Evans 860af755d3eSKyle Evans rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX, 861af755d3eSKyle Evans &shmfd->shm_mtx); 862d301b358SKonstantin Belousov error = shm_dotruncate_cookie(shmfd, length, rl_cookie); 863af755d3eSKyle Evans rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); 864af755d3eSKyle Evans return (error); 865af755d3eSKyle Evans } 866af755d3eSKyle Evans 8678e38aeffSJohn Baldwin /* 8688e38aeffSJohn Baldwin * shmfd object management including creation and reference counting 8698e38aeffSJohn Baldwin * routines. 8708e38aeffSJohn Baldwin */ 8711bdbd705SKonstantin Belousov struct shmfd * 872d301b358SKonstantin Belousov shm_alloc(struct ucred *ucred, mode_t mode, bool largepage) 8738e38aeffSJohn Baldwin { 8748e38aeffSJohn Baldwin struct shmfd *shmfd; 8758e38aeffSJohn Baldwin 8768e38aeffSJohn Baldwin shmfd = malloc(sizeof(*shmfd), M_SHMFD, M_WAITOK | M_ZERO); 8778e38aeffSJohn Baldwin shmfd->shm_size = 0; 8788e38aeffSJohn Baldwin shmfd->shm_uid = ucred->cr_uid; 8798e38aeffSJohn Baldwin shmfd->shm_gid = ucred->cr_gid; 8808e38aeffSJohn Baldwin shmfd->shm_mode = mode; 881d301b358SKonstantin Belousov if (largepage) { 882d301b358SKonstantin Belousov shmfd->shm_object = phys_pager_allocate(NULL, 883d301b358SKonstantin Belousov &shm_largepage_phys_ops, NULL, shmfd->shm_size, 884d301b358SKonstantin Belousov VM_PROT_DEFAULT, 0, ucred); 885d301b358SKonstantin Belousov shmfd->shm_lp_alloc_policy = SHM_LARGEPAGE_ALLOC_DEFAULT; 886d301b358SKonstantin Belousov } else { 88732287ea7SKyle Evans shmfd->shm_object = vm_pager_allocate(OBJT_SWAP, NULL, 8883364c323SKonstantin Belousov shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred); 889d301b358SKonstantin Belousov } 8908e38aeffSJohn Baldwin KASSERT(shmfd->shm_object != NULL, ("shm_create: vm_pager_allocate")); 8918e38aeffSJohn Baldwin vfs_timestamp(&shmfd->shm_birthtime); 8928e38aeffSJohn Baldwin shmfd->shm_atime = shmfd->shm_mtime = shmfd->shm_ctime = 8938e38aeffSJohn Baldwin shmfd->shm_birthtime; 8947883ce1fSMateusz Guzik shmfd->shm_ino = alloc_unr64(&shm_ino_unr); 8958e38aeffSJohn Baldwin refcount_init(&shmfd->shm_refs, 1); 896940cb0e2SKonstantin Belousov mtx_init(&shmfd->shm_mtx, "shmrl", NULL, MTX_DEF); 897940cb0e2SKonstantin Belousov rangelock_init(&shmfd->shm_rl); 8988e38aeffSJohn Baldwin #ifdef MAC 8998e38aeffSJohn Baldwin mac_posixshm_init(shmfd); 9008e38aeffSJohn Baldwin mac_posixshm_create(ucred, shmfd); 9018e38aeffSJohn Baldwin #endif 9028e38aeffSJohn Baldwin 9038e38aeffSJohn Baldwin return (shmfd); 9048e38aeffSJohn Baldwin } 9058e38aeffSJohn Baldwin 9061bdbd705SKonstantin Belousov struct shmfd * 9078e38aeffSJohn Baldwin shm_hold(struct shmfd *shmfd) 9088e38aeffSJohn Baldwin { 9098e38aeffSJohn Baldwin 9108e38aeffSJohn Baldwin refcount_acquire(&shmfd->shm_refs); 9118e38aeffSJohn Baldwin return (shmfd); 9128e38aeffSJohn Baldwin } 9138e38aeffSJohn Baldwin 9141bdbd705SKonstantin Belousov void 9158e38aeffSJohn Baldwin shm_drop(struct shmfd *shmfd) 9168e38aeffSJohn Baldwin { 
9178e38aeffSJohn Baldwin 9188e38aeffSJohn Baldwin if (refcount_release(&shmfd->shm_refs)) { 9198e38aeffSJohn Baldwin #ifdef MAC 9208e38aeffSJohn Baldwin mac_posixshm_destroy(shmfd); 9218e38aeffSJohn Baldwin #endif 922940cb0e2SKonstantin Belousov rangelock_destroy(&shmfd->shm_rl); 923940cb0e2SKonstantin Belousov mtx_destroy(&shmfd->shm_mtx); 9248e38aeffSJohn Baldwin vm_object_deallocate(shmfd->shm_object); 9258e38aeffSJohn Baldwin free(shmfd, M_SHMFD); 9268e38aeffSJohn Baldwin } 9278e38aeffSJohn Baldwin } 9288e38aeffSJohn Baldwin 9298e38aeffSJohn Baldwin /* 9308e38aeffSJohn Baldwin * Determine if the credentials have sufficient permissions for a 9318e38aeffSJohn Baldwin * specified combination of FREAD and FWRITE. 9328e38aeffSJohn Baldwin */ 9331bdbd705SKonstantin Belousov int 9348e38aeffSJohn Baldwin shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags) 9358e38aeffSJohn Baldwin { 93615bc6b2bSEdward Tomasz Napierala accmode_t accmode; 9379c00bb91SKonstantin Belousov int error; 9388e38aeffSJohn Baldwin 93915bc6b2bSEdward Tomasz Napierala accmode = 0; 9408e38aeffSJohn Baldwin if (flags & FREAD) 94115bc6b2bSEdward Tomasz Napierala accmode |= VREAD; 9428e38aeffSJohn Baldwin if (flags & FWRITE) 94315bc6b2bSEdward Tomasz Napierala accmode |= VWRITE; 9449c00bb91SKonstantin Belousov mtx_lock(&shm_timestamp_lock); 9459c00bb91SKonstantin Belousov error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid, 946d292b194SMateusz Guzik accmode, ucred); 9479c00bb91SKonstantin Belousov mtx_unlock(&shm_timestamp_lock); 9489c00bb91SKonstantin Belousov return (error); 9498e38aeffSJohn Baldwin } 9508e38aeffSJohn Baldwin 9518e38aeffSJohn Baldwin static void 952610a2b3cSJohn Baldwin shm_init(void *arg) 9538e38aeffSJohn Baldwin { 954d301b358SKonstantin Belousov char name[32]; 955d301b358SKonstantin Belousov int i; 9568e38aeffSJohn Baldwin 9578e38aeffSJohn Baldwin mtx_init(&shm_timestamp_lock, "shm timestamps", NULL, MTX_DEF); 9588e38aeffSJohn Baldwin sx_init(&shm_dict_lock, "shm dictionary"); 9598e38aeffSJohn Baldwin shm_dictionary = hashinit(1024, M_SHMFD, &shm_hash); 9607883ce1fSMateusz Guzik new_unrhdr64(&shm_ino_unr, 1); 961610a2b3cSJohn Baldwin shm_dev_ino = devfs_alloc_cdp_inode(); 962610a2b3cSJohn Baldwin KASSERT(shm_dev_ino > 0, ("shm dev inode not initialized")); 963d301b358SKonstantin Belousov 964d301b358SKonstantin Belousov for (i = 1; i < MAXPAGESIZES; i++) { 965d301b358SKonstantin Belousov if (pagesizes[i] == 0) 966d301b358SKonstantin Belousov break; 967d301b358SKonstantin Belousov #define M (1024 * 1024) 968d301b358SKonstantin Belousov #define G (1024 * M) 969d301b358SKonstantin Belousov if (pagesizes[i] >= G) 970d301b358SKonstantin Belousov snprintf(name, sizeof(name), "%luG", pagesizes[i] / G); 971d301b358SKonstantin Belousov else if (pagesizes[i] >= M) 972d301b358SKonstantin Belousov snprintf(name, sizeof(name), "%luM", pagesizes[i] / M); 973d301b358SKonstantin Belousov else 974d301b358SKonstantin Belousov snprintf(name, sizeof(name), "%lu", pagesizes[i]); 975d301b358SKonstantin Belousov #undef G 976d301b358SKonstantin Belousov #undef M 977d301b358SKonstantin Belousov SYSCTL_ADD_ULONG(NULL, SYSCTL_STATIC_CHILDREN(_vm_largepages), 978d301b358SKonstantin Belousov OID_AUTO, name, CTLFLAG_RD, &count_largepages[i], 979d301b358SKonstantin Belousov "number of non-transient largepages allocated"); 980d301b358SKonstantin Belousov } 9818e38aeffSJohn Baldwin } 982610a2b3cSJohn Baldwin SYSINIT(shm_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_init, NULL); 9838e38aeffSJohn Baldwin 
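/*
 * Illustration (platform-dependent, not exhaustive): on a machine whose
 * pagesizes[] includes 2M and 1G superpages, the loop in shm_init()
 * above publishes read-only counters named vm.largepages.2M and
 * vm.largepages.1G, counting non-transient largepage allocations.
 */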
98425f44824SKonstantin Belousov /* 98525f44824SKonstantin Belousov * Dictionary management. We maintain an in-kernel dictionary to map 98625f44824SKonstantin Belousov * paths to shmfd objects. We use the FNV hash on the path to store 98725f44824SKonstantin Belousov * the mappings in a hash table. 98825f44824SKonstantin Belousov */ 9898e38aeffSJohn Baldwin static struct shmfd * 9908e38aeffSJohn Baldwin shm_lookup(char *path, Fnv32_t fnv) 9918e38aeffSJohn Baldwin { 9928e38aeffSJohn Baldwin struct shm_mapping *map; 9938e38aeffSJohn Baldwin 9948e38aeffSJohn Baldwin LIST_FOREACH(map, SHM_HASH(fnv), sm_link) { 9958e38aeffSJohn Baldwin if (map->sm_fnv != fnv) 9968e38aeffSJohn Baldwin continue; 9978e38aeffSJohn Baldwin if (strcmp(map->sm_path, path) == 0) 9988e38aeffSJohn Baldwin return (map->sm_shmfd); 9998e38aeffSJohn Baldwin } 10008e38aeffSJohn Baldwin 10018e38aeffSJohn Baldwin return (NULL); 10028e38aeffSJohn Baldwin } 10038e38aeffSJohn Baldwin 10048e38aeffSJohn Baldwin static void 10058e38aeffSJohn Baldwin shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd) 10068e38aeffSJohn Baldwin { 10078e38aeffSJohn Baldwin struct shm_mapping *map; 10088e38aeffSJohn Baldwin 10098e38aeffSJohn Baldwin map = malloc(sizeof(struct shm_mapping), M_SHMFD, M_WAITOK); 10108e38aeffSJohn Baldwin map->sm_path = path; 10118e38aeffSJohn Baldwin map->sm_fnv = fnv; 10128e38aeffSJohn Baldwin map->sm_shmfd = shm_hold(shmfd); 1013e506e182SJohn Baldwin shmfd->shm_path = path; 10148e38aeffSJohn Baldwin LIST_INSERT_HEAD(SHM_HASH(fnv), map, sm_link); 10158e38aeffSJohn Baldwin } 10168e38aeffSJohn Baldwin 10178e38aeffSJohn Baldwin static int 10188e38aeffSJohn Baldwin shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred) 10198e38aeffSJohn Baldwin { 10208e38aeffSJohn Baldwin struct shm_mapping *map; 10218e38aeffSJohn Baldwin int error; 10228e38aeffSJohn Baldwin 10238e38aeffSJohn Baldwin LIST_FOREACH(map, SHM_HASH(fnv), sm_link) { 10248e38aeffSJohn Baldwin if (map->sm_fnv != fnv) 10258e38aeffSJohn Baldwin continue; 10268e38aeffSJohn Baldwin if (strcmp(map->sm_path, path) == 0) { 10278e38aeffSJohn Baldwin #ifdef MAC 10288e38aeffSJohn Baldwin error = mac_posixshm_check_unlink(ucred, map->sm_shmfd); 10298e38aeffSJohn Baldwin if (error) 10308e38aeffSJohn Baldwin return (error); 10318e38aeffSJohn Baldwin #endif 10328e38aeffSJohn Baldwin error = shm_access(map->sm_shmfd, ucred, 10338e38aeffSJohn Baldwin FREAD | FWRITE); 10348e38aeffSJohn Baldwin if (error) 10358e38aeffSJohn Baldwin return (error); 1036e506e182SJohn Baldwin map->sm_shmfd->shm_path = NULL; 10378e38aeffSJohn Baldwin LIST_REMOVE(map, sm_link); 10388e38aeffSJohn Baldwin shm_drop(map->sm_shmfd); 10398e38aeffSJohn Baldwin free(map->sm_path, M_SHMFD); 10408e38aeffSJohn Baldwin free(map, M_SHMFD); 10418e38aeffSJohn Baldwin return (0); 10428e38aeffSJohn Baldwin } 10438e38aeffSJohn Baldwin } 10448e38aeffSJohn Baldwin 10458e38aeffSJohn Baldwin return (ENOENT); 10468e38aeffSJohn Baldwin } 10478e38aeffSJohn Baldwin 10488e38aeffSJohn Baldwin int 1049535b1df9SKyle Evans kern_shm_open2(struct thread *td, const char *userpath, int flags, mode_t mode, 1050535b1df9SKyle Evans int shmflags, struct filecaps *fcaps, const char *name __unused) 10518e38aeffSJohn Baldwin { 105285078b85SConrad Meyer struct pwddesc *pdp; 10538e38aeffSJohn Baldwin struct shmfd *shmfd; 10548e38aeffSJohn Baldwin struct file *fp; 10558e38aeffSJohn Baldwin char *path; 10560cd95859SKyle Evans void *rl_cookie; 10578e38aeffSJohn Baldwin Fnv32_t fnv; 10588e38aeffSJohn Baldwin mode_t cmode; 1059535b1df9SKyle Evans int 
error, fd, initial_seals; 1060d301b358SKonstantin Belousov bool largepage; 1061535b1df9SKyle Evans 1062d301b358SKonstantin Belousov if ((shmflags & ~(SHM_ALLOW_SEALING | SHM_GROW_ON_WRITE | 1063d301b358SKonstantin Belousov SHM_LARGEPAGE)) != 0) 1064535b1df9SKyle Evans return (EINVAL); 1065535b1df9SKyle Evans 1066535b1df9SKyle Evans initial_seals = F_SEAL_SEAL; 1067535b1df9SKyle Evans if ((shmflags & SHM_ALLOW_SEALING) != 0) 1068535b1df9SKyle Evans initial_seals &= ~F_SEAL_SEAL; 10698e38aeffSJohn Baldwin 107012bc222eSJonathan Anderson #ifdef CAPABILITY_MODE 107112bc222eSJonathan Anderson /* 107212bc222eSJonathan Anderson * shm_open(2) is only allowed for anonymous objects. 107312bc222eSJonathan Anderson */ 10747ee1b208SEd Schouten if (IN_CAPABILITY_MODE(td) && (userpath != SHM_ANON)) 107512bc222eSJonathan Anderson return (ECAPMODE); 107612bc222eSJonathan Anderson #endif 107712bc222eSJonathan Anderson 107815bcf785SRobert Watson AUDIT_ARG_FFLAGS(flags); 107915bcf785SRobert Watson AUDIT_ARG_MODE(mode); 108015bcf785SRobert Watson 10817ee1b208SEd Schouten if ((flags & O_ACCMODE) != O_RDONLY && (flags & O_ACCMODE) != O_RDWR) 10828e38aeffSJohn Baldwin return (EINVAL); 10838e38aeffSJohn Baldwin 10847ee1b208SEd Schouten if ((flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC | O_CLOEXEC)) != 0) 10858e38aeffSJohn Baldwin return (EINVAL); 10868e38aeffSJohn Baldwin 1087d301b358SKonstantin Belousov largepage = (shmflags & SHM_LARGEPAGE) != 0; 108878257765SMark Johnston if (largepage && !PMAP_HAS_LARGEPAGES) 1089d301b358SKonstantin Belousov return (ENOTTY); 1090d301b358SKonstantin Belousov 10910cd95859SKyle Evans /* 10920cd95859SKyle Evans * Currently only F_SEAL_SEAL may be set when creating or opening shmfd. 10930cd95859SKyle Evans * If the decision is made later to allow additional seals, care must be 10940cd95859SKyle Evans * taken below to ensure that the seals are properly set if the shmfd 10950cd95859SKyle Evans * already existed -- this currently assumes that only F_SEAL_SEAL can 10960cd95859SKyle Evans * be set and doesn't take further precautions to ensure the validity of 10970cd95859SKyle Evans * the seals being added with respect to current mappings. 10980cd95859SKyle Evans */ 10990cd95859SKyle Evans if ((initial_seals & ~F_SEAL_SEAL) != 0) 11000cd95859SKyle Evans return (EINVAL); 11010cd95859SKyle Evans 110285078b85SConrad Meyer pdp = td->td_proc->p_pd; 110385078b85SConrad Meyer cmode = (mode & ~pdp->pd_cmask) & ACCESSPERMS; 11048e38aeffSJohn Baldwin 1105b5a7ac99SKyle Evans /* 1106b5a7ac99SKyle Evans * shm_open(2) created shm should always have O_CLOEXEC set, as mandated 1107b5a7ac99SKyle Evans * by POSIX. We allow it to be unset here so that an in-kernel 1108b5a7ac99SKyle Evans * interface may be written as a thin layer around shm, optionally not 1109b5a7ac99SKyle Evans * setting CLOEXEC. For shm_open(2), O_CLOEXEC is set unconditionally 1110b5a7ac99SKyle Evans * in sys_shm_open() to keep this implementation compliant. 1111b5a7ac99SKyle Evans */ 1112b5a7ac99SKyle Evans error = falloc_caps(td, &fp, &fd, flags & O_CLOEXEC, fcaps); 11138e38aeffSJohn Baldwin if (error) 11148e38aeffSJohn Baldwin return (error); 11158e38aeffSJohn Baldwin 11168e38aeffSJohn Baldwin /* A SHM_ANON path pointer creates an anonymous object. */ 11177ee1b208SEd Schouten if (userpath == SHM_ANON) { 11188e38aeffSJohn Baldwin /* A read-only anonymous object is pointless. 
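 * Nothing could ever store data in it: the descriptor created here is
 * the object's only reference, so no writable descriptor or shared
 * writable mapping of it can ever exist.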
*/ 11197ee1b208SEd Schouten if ((flags & O_ACCMODE) == O_RDONLY) { 112090f54cbfSMateusz Guzik fdclose(td, fp, fd); 11218e38aeffSJohn Baldwin fdrop(fp, td); 11228e38aeffSJohn Baldwin return (EINVAL); 11238e38aeffSJohn Baldwin } 1124d301b358SKonstantin Belousov shmfd = shm_alloc(td->td_ucred, cmode, largepage); 11250cd95859SKyle Evans shmfd->shm_seals = initial_seals; 11265dd47b52SKyle Evans shmfd->shm_flags = shmflags; 11278e38aeffSJohn Baldwin } else { 11282d5603feSDavid Bright error = shm_copyin_path(td, userpath, &path); 11292d5603feSDavid Bright if (error != 0) { 113090f54cbfSMateusz Guzik fdclose(td, fp, fd); 11318e38aeffSJohn Baldwin fdrop(fp, td); 11328e38aeffSJohn Baldwin return (error); 11338e38aeffSJohn Baldwin } 11348e38aeffSJohn Baldwin 113515bcf785SRobert Watson AUDIT_ARG_UPATH1_CANON(path); 11368e38aeffSJohn Baldwin fnv = fnv_32_str(path, FNV1_32_INIT); 11378e38aeffSJohn Baldwin sx_xlock(&shm_dict_lock); 11388e38aeffSJohn Baldwin shmfd = shm_lookup(path, fnv); 11398e38aeffSJohn Baldwin if (shmfd == NULL) { 11408e38aeffSJohn Baldwin /* Object does not yet exist, create it if requested. */ 11417ee1b208SEd Schouten if (flags & O_CREAT) { 11429b6dd12eSRobert Watson #ifdef MAC 11439b6dd12eSRobert Watson error = mac_posixshm_check_create(td->td_ucred, 11449b6dd12eSRobert Watson path); 11459b6dd12eSRobert Watson if (error == 0) { 11469b6dd12eSRobert Watson #endif 1147d301b358SKonstantin Belousov shmfd = shm_alloc(td->td_ucred, cmode, 1148d301b358SKonstantin Belousov largepage); 11490cd95859SKyle Evans shmfd->shm_seals = initial_seals; 11505dd47b52SKyle Evans shmfd->shm_flags = shmflags; 11518e38aeffSJohn Baldwin shm_insert(path, fnv, shmfd); 11529b6dd12eSRobert Watson #ifdef MAC 11539b6dd12eSRobert Watson } 11549b6dd12eSRobert Watson #endif 11558e38aeffSJohn Baldwin } else { 11568e38aeffSJohn Baldwin free(path, M_SHMFD); 11578e38aeffSJohn Baldwin error = ENOENT; 11588e38aeffSJohn Baldwin } 11598e38aeffSJohn Baldwin } else { 11600cd95859SKyle Evans rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX, 11610cd95859SKyle Evans &shmfd->shm_mtx); 11620cd95859SKyle Evans 11630cd95859SKyle Evans /* 11640cd95859SKyle Evans * kern_shm_open() likely shouldn't ever error out on 11650cd95859SKyle Evans * trying to set a seal that already exists, unlike 11660cd95859SKyle Evans * F_ADD_SEALS. This would break terribly as 11670cd95859SKyle Evans * shm_open(2) actually sets F_SEAL_SEAL to maintain 11680cd95859SKyle Evans * historical behavior where the underlying file could 11690cd95859SKyle Evans * not be sealed. 11700cd95859SKyle Evans */ 11710cd95859SKyle Evans initial_seals &= ~shmfd->shm_seals; 11720cd95859SKyle Evans 11738e38aeffSJohn Baldwin /* 11748e38aeffSJohn Baldwin * Object already exists, obtain a new 11758e38aeffSJohn Baldwin * reference if requested and permitted. 11768e38aeffSJohn Baldwin */ 11778e38aeffSJohn Baldwin free(path, M_SHMFD); 11780cd95859SKyle Evans 11790cd95859SKyle Evans /* 11800cd95859SKyle Evans * initial_seals can't set additional seals if we've 11810cd95859SKyle Evans * already been set F_SEAL_SEAL. If F_SEAL_SEAL is set, 11820cd95859SKyle Evans * then we've already removed that one from 11830cd95859SKyle Evans * initial_seals. This is currently redundant as we 11840cd95859SKyle Evans * only allow setting F_SEAL_SEAL at creation time, but 11850cd95859SKyle Evans * it's cheap to check and decreases the effort required 11860cd95859SKyle Evans * to allow additional seals. 
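 *
 * As an illustrative userland sketch (not part of the kernel sources,
 * assuming a FreeBSD 13+ libc): sealing is only reachable through
 * memfd_create(3), whose MFD_ALLOW_SEALING maps to SHM_ALLOW_SEALING,
 * while shm_open(3) objects carry F_SEAL_SEAL from creation, so any
 * later F_ADD_SEALS on them fails with EPERM.
 *
 *     #include <sys/mman.h>
 *     #include <fcntl.h>
 *     #include <unistd.h>
 *     #include <err.h>
 *
 *     int
 *     main(void)
 *     {
 *             int fd;
 *
 *             fd = memfd_create("seal-demo", MFD_ALLOW_SEALING);
 *             if (fd < 0)
 *                     err(1, "memfd_create");
 *             if (ftruncate(fd, 4096) != 0)
 *                     err(1, "ftruncate");
 *             if (fcntl(fd, F_ADD_SEALS, F_SEAL_GROW | F_SEAL_SHRINK) == -1)
 *                     err(1, "F_ADD_SEALS");
 *             if (fcntl(fd, F_ADD_SEALS, F_SEAL_SEAL) == -1)
 *                     err(1, "F_SEAL_SEAL");
 *             if (fcntl(fd, F_ADD_SEALS, F_SEAL_WRITE) == 0)
 *                     errx(1, "expected EPERM once F_SEAL_SEAL is set");
 *             close(fd);
 *             return (0);
 *     }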
11870cd95859SKyle Evans */ 11880cd95859SKyle Evans if ((shmfd->shm_seals & F_SEAL_SEAL) != 0 && 11890cd95859SKyle Evans initial_seals != 0) 11900cd95859SKyle Evans error = EPERM; 11910cd95859SKyle Evans else if ((flags & (O_CREAT | O_EXCL)) == 11920cd95859SKyle Evans (O_CREAT | O_EXCL)) 11938e38aeffSJohn Baldwin error = EEXIST; 11945dd47b52SKyle Evans else if (shmflags != 0 && shmflags != shmfd->shm_flags) 11955dd47b52SKyle Evans error = EINVAL; 11968e38aeffSJohn Baldwin else { 11978e38aeffSJohn Baldwin #ifdef MAC 11988e38aeffSJohn Baldwin error = mac_posixshm_check_open(td->td_ucred, 11997ee1b208SEd Schouten shmfd, FFLAGS(flags & O_ACCMODE)); 12008e38aeffSJohn Baldwin if (error == 0) 12018e38aeffSJohn Baldwin #endif 12028e38aeffSJohn Baldwin error = shm_access(shmfd, td->td_ucred, 12037ee1b208SEd Schouten FFLAGS(flags & O_ACCMODE)); 12048e38aeffSJohn Baldwin } 12058e38aeffSJohn Baldwin 12068e38aeffSJohn Baldwin /* 12078e38aeffSJohn Baldwin * Truncate the file back to zero length if 12088e38aeffSJohn Baldwin * O_TRUNC was specified and the object was 12098e38aeffSJohn Baldwin * opened with read/write. 12108e38aeffSJohn Baldwin */ 12118e38aeffSJohn Baldwin if (error == 0 && 12127ee1b208SEd Schouten (flags & (O_ACCMODE | O_TRUNC)) == 12138e38aeffSJohn Baldwin (O_RDWR | O_TRUNC)) { 12140cd95859SKyle Evans VM_OBJECT_WLOCK(shmfd->shm_object); 12158e38aeffSJohn Baldwin #ifdef MAC 12168e38aeffSJohn Baldwin error = mac_posixshm_check_truncate( 12178e38aeffSJohn Baldwin td->td_ucred, fp->f_cred, shmfd); 12188e38aeffSJohn Baldwin if (error == 0) 12198e38aeffSJohn Baldwin #endif 12200cd95859SKyle Evans error = shm_dotruncate_locked(shmfd, 0, 12210cd95859SKyle Evans rl_cookie); 12220cd95859SKyle Evans VM_OBJECT_WUNLOCK(shmfd->shm_object); 12238e38aeffSJohn Baldwin } 12240cd95859SKyle Evans if (error == 0) { 12250cd95859SKyle Evans /* 12260cd95859SKyle Evans * Currently we only allow F_SEAL_SEAL to be 12270cd95859SKyle Evans * set initially. As noted above, this would 12280cd95859SKyle Evans * need to be reworked should that change. 12290cd95859SKyle Evans */ 12300cd95859SKyle Evans shmfd->shm_seals |= initial_seals; 12318e38aeffSJohn Baldwin shm_hold(shmfd); 12328e38aeffSJohn Baldwin } 12330cd95859SKyle Evans rangelock_unlock(&shmfd->shm_rl, rl_cookie, 12340cd95859SKyle Evans &shmfd->shm_mtx); 12350cd95859SKyle Evans } 12368e38aeffSJohn Baldwin sx_xunlock(&shm_dict_lock); 12378e38aeffSJohn Baldwin 12388e38aeffSJohn Baldwin if (error) { 123990f54cbfSMateusz Guzik fdclose(td, fp, fd); 12408e38aeffSJohn Baldwin fdrop(fp, td); 12418e38aeffSJohn Baldwin return (error); 12428e38aeffSJohn Baldwin } 12438e38aeffSJohn Baldwin } 12448e38aeffSJohn Baldwin 12457ee1b208SEd Schouten finit(fp, FFLAGS(flags & O_ACCMODE), DTYPE_SHM, shmfd, &shm_ops); 12468e38aeffSJohn Baldwin 12478e38aeffSJohn Baldwin td->td_retval[0] = fd; 12488e38aeffSJohn Baldwin fdrop(fp, td); 12498e38aeffSJohn Baldwin 12508e38aeffSJohn Baldwin return (0); 12518e38aeffSJohn Baldwin } 12528e38aeffSJohn Baldwin 12537ee1b208SEd Schouten /* System calls. 
*/ 1254a9ac5e14SKyle Evans #ifdef COMPAT_FREEBSD12 12557ee1b208SEd Schouten int 1256a9ac5e14SKyle Evans freebsd12_shm_open(struct thread *td, struct freebsd12_shm_open_args *uap) 12577ee1b208SEd Schouten { 12587ee1b208SEd Schouten 1259535b1df9SKyle Evans return (kern_shm_open(td, uap->path, uap->flags | O_CLOEXEC, 1260535b1df9SKyle Evans uap->mode, NULL)); 12617ee1b208SEd Schouten } 1262a9ac5e14SKyle Evans #endif 12637ee1b208SEd Schouten 12648e38aeffSJohn Baldwin int 12658451d0ddSKip Macy sys_shm_unlink(struct thread *td, struct shm_unlink_args *uap) 12668e38aeffSJohn Baldwin { 12678e38aeffSJohn Baldwin char *path; 12688e38aeffSJohn Baldwin Fnv32_t fnv; 12698e38aeffSJohn Baldwin int error; 12708e38aeffSJohn Baldwin 12712d5603feSDavid Bright error = shm_copyin_path(td, uap->path, &path); 12722d5603feSDavid Bright if (error != 0) 12738e38aeffSJohn Baldwin return (error); 12742d5603feSDavid Bright 127515bcf785SRobert Watson AUDIT_ARG_UPATH1_CANON(path); 12768e38aeffSJohn Baldwin fnv = fnv_32_str(path, FNV1_32_INIT); 12778e38aeffSJohn Baldwin sx_xlock(&shm_dict_lock); 12788e38aeffSJohn Baldwin error = shm_remove(path, fnv, td->td_ucred); 12798e38aeffSJohn Baldwin sx_xunlock(&shm_dict_lock); 12804cf919edSMark Johnston free(path, M_SHMFD); 12818e38aeffSJohn Baldwin 12828e38aeffSJohn Baldwin return (error); 12838e38aeffSJohn Baldwin } 12848e38aeffSJohn Baldwin 12858e38aeffSJohn Baldwin int 12869afb12baSDavid Bright sys_shm_rename(struct thread *td, struct shm_rename_args *uap) 12879afb12baSDavid Bright { 12889afb12baSDavid Bright char *path_from = NULL, *path_to = NULL; 12899afb12baSDavid Bright Fnv32_t fnv_from, fnv_to; 12909afb12baSDavid Bright struct shmfd *fd_from; 12919afb12baSDavid Bright struct shmfd *fd_to; 12929afb12baSDavid Bright int error; 12939afb12baSDavid Bright int flags; 12949afb12baSDavid Bright 12959afb12baSDavid Bright flags = uap->flags; 12962d5603feSDavid Bright AUDIT_ARG_FFLAGS(flags); 12979afb12baSDavid Bright 12989afb12baSDavid Bright /* 12999afb12baSDavid Bright * Make sure the user passed only valid flags. 13009afb12baSDavid Bright * If you add a new flag, please add a new term here. 13019afb12baSDavid Bright */ 13029afb12baSDavid Bright if ((flags & ~( 13039afb12baSDavid Bright SHM_RENAME_NOREPLACE | 13049afb12baSDavid Bright SHM_RENAME_EXCHANGE 13059afb12baSDavid Bright )) != 0) { 13069afb12baSDavid Bright error = EINVAL; 13079afb12baSDavid Bright goto out; 13089afb12baSDavid Bright } 13099afb12baSDavid Bright 13109afb12baSDavid Bright /* 13119afb12baSDavid Bright * EXCHANGE and NOREPLACE don't quite make sense together. Let's 13129afb12baSDavid Bright * force the user to choose one or the other. 
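 *
 * A hedged userland sketch of the intended usage (illustrative only;
 * the object names are made up, see shm_rename(2)): SHM_RENAME_NOREPLACE
 * fails with EEXIST if the destination exists, SHM_RENAME_EXCHANGE swaps
 * the two objects, and passing neither flag silently replaces the
 * destination.
 *
 *     #include <sys/mman.h>
 *     #include <fcntl.h>
 *     #include <err.h>
 *
 *     int
 *     main(void)
 *     {
 *             int fd;
 *
 *             fd = shm_open("/rename_demo_a", O_RDWR | O_CREAT, 0600);
 *             if (fd < 0)
 *                     err(1, "shm_open");
 *             if (shm_rename("/rename_demo_a", "/rename_demo_b",
 *                 SHM_RENAME_NOREPLACE) != 0)
 *                     err(1, "shm_rename");
 *             (void)shm_unlink("/rename_demo_b");
 *             return (0);
 *     }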
13139afb12baSDavid Bright */ 13149afb12baSDavid Bright if ((flags & SHM_RENAME_NOREPLACE) != 0 && 13159afb12baSDavid Bright (flags & SHM_RENAME_EXCHANGE) != 0) { 13169afb12baSDavid Bright error = EINVAL; 13179afb12baSDavid Bright goto out; 13189afb12baSDavid Bright } 13199afb12baSDavid Bright 13202d5603feSDavid Bright /* Renaming to or from anonymous makes no sense */ 13212d5603feSDavid Bright if (uap->path_from == SHM_ANON || uap->path_to == SHM_ANON) { 13222d5603feSDavid Bright error = EINVAL; 13232d5603feSDavid Bright goto out; 13242d5603feSDavid Bright } 13252d5603feSDavid Bright 13262d5603feSDavid Bright error = shm_copyin_path(td, uap->path_from, &path_from); 13272d5603feSDavid Bright if (error != 0) 13289afb12baSDavid Bright goto out; 13299afb12baSDavid Bright 13302d5603feSDavid Bright error = shm_copyin_path(td, uap->path_to, &path_to); 13312d5603feSDavid Bright if (error != 0) 13329afb12baSDavid Bright goto out; 13339afb12baSDavid Bright 13342d5603feSDavid Bright AUDIT_ARG_UPATH1_CANON(path_from); 13352d5603feSDavid Bright AUDIT_ARG_UPATH2_CANON(path_to); 13362d5603feSDavid Bright 13379afb12baSDavid Bright /* Rename with from/to equal is a no-op */ 13382d5603feSDavid Bright if (strcmp(path_from, path_to) == 0) 13399afb12baSDavid Bright goto out; 13409afb12baSDavid Bright 13419afb12baSDavid Bright fnv_from = fnv_32_str(path_from, FNV1_32_INIT); 13429afb12baSDavid Bright fnv_to = fnv_32_str(path_to, FNV1_32_INIT); 13439afb12baSDavid Bright 13449afb12baSDavid Bright sx_xlock(&shm_dict_lock); 13459afb12baSDavid Bright 13469afb12baSDavid Bright fd_from = shm_lookup(path_from, fnv_from); 13479afb12baSDavid Bright if (fd_from == NULL) { 13489afb12baSDavid Bright error = ENOENT; 13492d5603feSDavid Bright goto out_locked; 13509afb12baSDavid Bright } 13519afb12baSDavid Bright 13529afb12baSDavid Bright fd_to = shm_lookup(path_to, fnv_to); 13539afb12baSDavid Bright if ((flags & SHM_RENAME_NOREPLACE) != 0 && fd_to != NULL) { 13549afb12baSDavid Bright error = EEXIST; 13552d5603feSDavid Bright goto out_locked; 13569afb12baSDavid Bright } 13579afb12baSDavid Bright 13589afb12baSDavid Bright /* 13599afb12baSDavid Bright * Unconditionally prevents shm_remove from invalidating the 'from' 13609afb12baSDavid Bright * shm's state. 13619afb12baSDavid Bright */ 13629afb12baSDavid Bright shm_hold(fd_from); 13639afb12baSDavid Bright error = shm_remove(path_from, fnv_from, td->td_ucred); 13649afb12baSDavid Bright 13659afb12baSDavid Bright /* 13669afb12baSDavid Bright * One of my assumptions failed if ENOENT (e.g. locking didn't 13679afb12baSDavid Bright * protect us) 13689afb12baSDavid Bright */ 13699afb12baSDavid Bright KASSERT(error != ENOENT, ("Our shm disappeared during shm_rename: %s", 13709afb12baSDavid Bright path_from)); 13712d5603feSDavid Bright if (error != 0) { 13729afb12baSDavid Bright shm_drop(fd_from); 13732d5603feSDavid Bright goto out_locked; 13749afb12baSDavid Bright } 13759afb12baSDavid Bright 13769afb12baSDavid Bright /* 13779afb12baSDavid Bright * If we are exchanging, we need to ensure the shm_remove below 13789afb12baSDavid Bright * doesn't invalidate the dest shm's state. 13799afb12baSDavid Bright */ 13809afb12baSDavid Bright if ((flags & SHM_RENAME_EXCHANGE) != 0 && fd_to != NULL) 13819afb12baSDavid Bright shm_hold(fd_to); 13829afb12baSDavid Bright 13839afb12baSDavid Bright /* 13849afb12baSDavid Bright * NOTE: if path_to is not already in the hash, c'est la vie; 13859afb12baSDavid Bright * it simply means we have nothing already at path_to to unlink. 
13869afb12baSDavid Bright * That is the ENOENT case. 13879afb12baSDavid Bright * 13889afb12baSDavid Bright * If we somehow don't have access to unlink this guy, but 13899afb12baSDavid Bright * did for the shm at path_from, then relink the shm to path_from 13909afb12baSDavid Bright * and abort with EACCES. 13919afb12baSDavid Bright * 13929afb12baSDavid Bright * All other errors: that is weird; let's relink and abort the 13939afb12baSDavid Bright * operation. 13949afb12baSDavid Bright */ 13959afb12baSDavid Bright error = shm_remove(path_to, fnv_to, td->td_ucred); 13962d5603feSDavid Bright if (error != 0 && error != ENOENT) { 13979afb12baSDavid Bright shm_insert(path_from, fnv_from, fd_from); 13989afb12baSDavid Bright shm_drop(fd_from); 13999afb12baSDavid Bright /* Don't free path_from now, since the hash references it */ 14009afb12baSDavid Bright path_from = NULL; 14012d5603feSDavid Bright goto out_locked; 14029afb12baSDavid Bright } 14039afb12baSDavid Bright 14042d5603feSDavid Bright error = 0; 14052d5603feSDavid Bright 14069afb12baSDavid Bright shm_insert(path_to, fnv_to, fd_from); 14079afb12baSDavid Bright 14089afb12baSDavid Bright /* Don't free path_to now, since the hash references it */ 14099afb12baSDavid Bright path_to = NULL; 14109afb12baSDavid Bright 14119afb12baSDavid Bright /* We kept a ref when we removed, and incremented again in insert */ 14129afb12baSDavid Bright shm_drop(fd_from); 14139afb12baSDavid Bright KASSERT(fd_from->shm_refs > 0, ("Expected >0 refs; got: %d\n", 14149afb12baSDavid Bright fd_from->shm_refs)); 14159afb12baSDavid Bright 14169afb12baSDavid Bright if ((flags & SHM_RENAME_EXCHANGE) != 0 && fd_to != NULL) { 14179afb12baSDavid Bright shm_insert(path_from, fnv_from, fd_to); 14189afb12baSDavid Bright path_from = NULL; 14199afb12baSDavid Bright shm_drop(fd_to); 14209afb12baSDavid Bright KASSERT(fd_to->shm_refs > 0, ("Expected >0 refs; got: %d\n", 14219afb12baSDavid Bright fd_to->shm_refs)); 14229afb12baSDavid Bright } 14239afb12baSDavid Bright 14242d5603feSDavid Bright out_locked: 14259afb12baSDavid Bright sx_xunlock(&shm_dict_lock); 14269afb12baSDavid Bright 14279afb12baSDavid Bright out: 14289afb12baSDavid Bright free(path_from, M_SHMFD); 14299afb12baSDavid Bright free(path_to, M_SHMFD); 14309afb12baSDavid Bright return (error); 14319afb12baSDavid Bright } 14329afb12baSDavid Bright 1433d301b358SKonstantin Belousov static int 1434d301b358SKonstantin Belousov shm_mmap_large(struct shmfd *shmfd, vm_map_t map, vm_offset_t *addr, 1435d301b358SKonstantin Belousov vm_size_t size, vm_prot_t prot, vm_prot_t max_prot, int flags, 143679783634SKonstantin Belousov vm_ooffset_t foff, struct thread *td) 1437d301b358SKonstantin Belousov { 1438d301b358SKonstantin Belousov struct vmspace *vms; 1439d301b358SKonstantin Belousov vm_map_entry_t next_entry, prev_entry; 1440d301b358SKonstantin Belousov vm_offset_t align, mask, maxaddr; 1441d301b358SKonstantin Belousov int docow, error, rv, try; 1442d301b358SKonstantin Belousov bool curmap; 1443d301b358SKonstantin Belousov 1444d301b358SKonstantin Belousov if (shmfd->shm_lp_psind == 0) 1445d301b358SKonstantin Belousov return (EINVAL); 1446d301b358SKonstantin Belousov 1447d301b358SKonstantin Belousov /* MAP_PRIVATE is disabled */ 1448d301b358SKonstantin Belousov if ((flags & ~(MAP_SHARED | MAP_FIXED | MAP_EXCL | 1449d301b358SKonstantin Belousov MAP_NOCORE | 1450d301b358SKonstantin Belousov #ifdef MAP_32BIT 1451d301b358SKonstantin Belousov MAP_32BIT | 1452d301b358SKonstantin Belousov #endif 1453d301b358SKonstantin Belousov 
MAP_ALIGNMENT_MASK)) != 0) 1454d301b358SKonstantin Belousov return (EINVAL); 1455d301b358SKonstantin Belousov 1456d301b358SKonstantin Belousov vms = td->td_proc->p_vmspace; 1457d301b358SKonstantin Belousov curmap = map == &vms->vm_map; 1458d301b358SKonstantin Belousov if (curmap) { 1459d301b358SKonstantin Belousov error = kern_mmap_racct_check(td, map, size); 1460d301b358SKonstantin Belousov if (error != 0) 1461d301b358SKonstantin Belousov return (error); 1462d301b358SKonstantin Belousov } 1463d301b358SKonstantin Belousov 1464d301b358SKonstantin Belousov docow = shmfd->shm_lp_psind << MAP_SPLIT_BOUNDARY_SHIFT; 1465d301b358SKonstantin Belousov docow |= MAP_INHERIT_SHARE; 1466d301b358SKonstantin Belousov if ((flags & MAP_NOCORE) != 0) 1467d301b358SKonstantin Belousov docow |= MAP_DISABLE_COREDUMP; 1468d301b358SKonstantin Belousov 1469d301b358SKonstantin Belousov mask = pagesizes[shmfd->shm_lp_psind] - 1; 1470d301b358SKonstantin Belousov if ((foff & mask) != 0) 1471d301b358SKonstantin Belousov return (EINVAL); 1472d301b358SKonstantin Belousov maxaddr = vm_map_max(map); 1473d301b358SKonstantin Belousov #ifdef MAP_32BIT 1474d301b358SKonstantin Belousov if ((flags & MAP_32BIT) != 0 && maxaddr > MAP_32BIT_MAX_ADDR) 1475d301b358SKonstantin Belousov maxaddr = MAP_32BIT_MAX_ADDR; 1476d301b358SKonstantin Belousov #endif 1477d301b358SKonstantin Belousov if (size == 0 || (size & mask) != 0 || 1478d301b358SKonstantin Belousov (*addr != 0 && ((*addr & mask) != 0 || 1479d301b358SKonstantin Belousov *addr + size < *addr || *addr + size > maxaddr))) 1480d301b358SKonstantin Belousov return (EINVAL); 1481d301b358SKonstantin Belousov 1482d301b358SKonstantin Belousov align = flags & MAP_ALIGNMENT_MASK; 1483d301b358SKonstantin Belousov if (align == 0) { 1484d301b358SKonstantin Belousov align = pagesizes[shmfd->shm_lp_psind]; 1485d301b358SKonstantin Belousov } else if (align == MAP_ALIGNED_SUPER) { 1486d301b358SKonstantin Belousov if (shmfd->shm_lp_psind != 1) 1487d301b358SKonstantin Belousov return (EINVAL); 1488d301b358SKonstantin Belousov align = pagesizes[1]; 1489d301b358SKonstantin Belousov } else { 1490d301b358SKonstantin Belousov align >>= MAP_ALIGNMENT_SHIFT; 1491d301b358SKonstantin Belousov align = 1ULL << align; 1492d301b358SKonstantin Belousov /* Also handles overflow. 
*/ 1493d301b358SKonstantin Belousov if (align < pagesizes[shmfd->shm_lp_psind]) 1494d301b358SKonstantin Belousov return (EINVAL); 1495d301b358SKonstantin Belousov } 1496d301b358SKonstantin Belousov 1497d301b358SKonstantin Belousov vm_map_lock(map); 1498d301b358SKonstantin Belousov if ((flags & MAP_FIXED) == 0) { 1499d301b358SKonstantin Belousov try = 1; 1500d301b358SKonstantin Belousov if (curmap && (*addr == 0 || 1501d301b358SKonstantin Belousov (*addr >= round_page((vm_offset_t)vms->vm_taddr) && 1502d301b358SKonstantin Belousov *addr < round_page((vm_offset_t)vms->vm_daddr + 1503d301b358SKonstantin Belousov lim_max(td, RLIMIT_DATA))))) { 1504d301b358SKonstantin Belousov *addr = roundup2((vm_offset_t)vms->vm_daddr + 1505d301b358SKonstantin Belousov lim_max(td, RLIMIT_DATA), 1506d301b358SKonstantin Belousov pagesizes[shmfd->shm_lp_psind]); 1507d301b358SKonstantin Belousov } 1508d301b358SKonstantin Belousov again: 1509d301b358SKonstantin Belousov rv = vm_map_find_aligned(map, addr, size, maxaddr, align); 1510d301b358SKonstantin Belousov if (rv != KERN_SUCCESS) { 1511d301b358SKonstantin Belousov if (try == 1) { 1512d301b358SKonstantin Belousov try = 2; 1513d301b358SKonstantin Belousov *addr = vm_map_min(map); 1514d301b358SKonstantin Belousov if ((*addr & mask) != 0) 1515d301b358SKonstantin Belousov *addr = (*addr + mask) & ~mask; 1516d301b358SKonstantin Belousov goto again; 1517d301b358SKonstantin Belousov } 1518d301b358SKonstantin Belousov goto fail1; 1519d301b358SKonstantin Belousov } 1520d301b358SKonstantin Belousov } else if ((flags & MAP_EXCL) == 0) { 1521d301b358SKonstantin Belousov rv = vm_map_delete(map, *addr, *addr + size); 1522d301b358SKonstantin Belousov if (rv != KERN_SUCCESS) 1523d301b358SKonstantin Belousov goto fail1; 1524d301b358SKonstantin Belousov } else { 1525d301b358SKonstantin Belousov error = ENOSPC; 1526d301b358SKonstantin Belousov if (vm_map_lookup_entry(map, *addr, &prev_entry)) 1527d301b358SKonstantin Belousov goto fail; 1528d301b358SKonstantin Belousov next_entry = vm_map_entry_succ(prev_entry); 1529d301b358SKonstantin Belousov if (next_entry->start < *addr + size) 1530d301b358SKonstantin Belousov goto fail; 1531d301b358SKonstantin Belousov } 1532d301b358SKonstantin Belousov 1533d301b358SKonstantin Belousov rv = vm_map_insert(map, shmfd->shm_object, foff, *addr, *addr + size, 1534d301b358SKonstantin Belousov prot, max_prot, docow); 1535d301b358SKonstantin Belousov fail1: 1536d301b358SKonstantin Belousov error = vm_mmap_to_errno(rv); 1537d301b358SKonstantin Belousov fail: 1538d301b358SKonstantin Belousov vm_map_unlock(map); 1539d301b358SKonstantin Belousov return (error); 1540d301b358SKonstantin Belousov } 1541d301b358SKonstantin Belousov 1542d301b358SKonstantin Belousov static int 15437077c426SJohn Baldwin shm_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t objsize, 15447077c426SJohn Baldwin vm_prot_t prot, vm_prot_t cap_maxprot, int flags, 15457077c426SJohn Baldwin vm_ooffset_t foff, struct thread *td) 15468e38aeffSJohn Baldwin { 15477077c426SJohn Baldwin struct shmfd *shmfd; 15487077c426SJohn Baldwin vm_prot_t maxprot; 15497077c426SJohn Baldwin int error; 1550dca52ab4SKyle Evans bool writecnt; 1551af755d3eSKyle Evans void *rl_cookie; 15527077c426SJohn Baldwin 15537077c426SJohn Baldwin shmfd = fp->f_data; 15547077c426SJohn Baldwin maxprot = VM_PROT_NONE; 15557077c426SJohn Baldwin 1556af755d3eSKyle Evans rl_cookie = rangelock_rlock(&shmfd->shm_rl, 0, objsize, 1557af755d3eSKyle Evans &shmfd->shm_mtx); 15587077c426SJohn Baldwin /* FREAD should always
be set. */ 15597077c426SJohn Baldwin if ((fp->f_flag & FREAD) != 0) 15607077c426SJohn Baldwin maxprot |= VM_PROT_EXECUTE | VM_PROT_READ; 156158366f05SKyle Evans 156258366f05SKyle Evans /* 156358366f05SKyle Evans * If FWRITE's set, we can allow VM_PROT_WRITE unless it's a shared 1564c7841c6bSMark Johnston * mapping with a write seal applied. Private mappings are always 1565c7841c6bSMark Johnston * writeable. 156658366f05SKyle Evans */ 1567c7841c6bSMark Johnston if ((flags & MAP_SHARED) == 0) { 1568c7841c6bSMark Johnston cap_maxprot |= VM_PROT_WRITE; 15697077c426SJohn Baldwin maxprot |= VM_PROT_WRITE; 1570c7841c6bSMark Johnston writecnt = false; 1571c7841c6bSMark Johnston } else { 1572c7841c6bSMark Johnston if ((fp->f_flag & FWRITE) != 0 && 1573c7841c6bSMark Johnston (shmfd->shm_seals & F_SEAL_WRITE) == 0) 1574c7841c6bSMark Johnston maxprot |= VM_PROT_WRITE; 1575af755d3eSKyle Evans 157651a16c84SKyle Evans /* 157751a16c84SKyle Evans * Any mappings from a writable descriptor may be upgraded to 157851a16c84SKyle Evans * VM_PROT_WRITE with mprotect(2), unless a write-seal was 157951a16c84SKyle Evans * applied between the open and subsequent mmap(2). We want to 158051a16c84SKyle Evans * reject application of a write seal as long as any such 158151a16c84SKyle Evans * mapping exists so that the seal cannot be trivially bypassed. 158251a16c84SKyle Evans */ 158351a16c84SKyle Evans writecnt = (maxprot & VM_PROT_WRITE) != 0; 158451a16c84SKyle Evans if (!writecnt && (prot & VM_PROT_WRITE) != 0) { 1585af755d3eSKyle Evans error = EACCES; 1586af755d3eSKyle Evans goto out; 1587af755d3eSKyle Evans } 1588c7841c6bSMark Johnston } 15897077c426SJohn Baldwin maxprot &= cap_maxprot; 15907077c426SJohn Baldwin 1591987ff181SKonstantin Belousov /* See comment in vn_mmap(). */ 1592987ff181SKonstantin Belousov if ( 1593987ff181SKonstantin Belousov #ifdef _LP64 1594987ff181SKonstantin Belousov objsize > OFF_MAX || 1595987ff181SKonstantin Belousov #endif 1596f9cc8410SEric van Gyzen foff > OFF_MAX - objsize) { 1597af755d3eSKyle Evans error = EINVAL; 1598af755d3eSKyle Evans goto out; 1599af755d3eSKyle Evans } 1600987ff181SKonstantin Belousov 16017077c426SJohn Baldwin #ifdef MAC 16027077c426SJohn Baldwin error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, flags); 16037077c426SJohn Baldwin if (error != 0) 1604af755d3eSKyle Evans goto out; 16057077c426SJohn Baldwin #endif 16068e38aeffSJohn Baldwin 16078e38aeffSJohn Baldwin mtx_lock(&shm_timestamp_lock); 16088e38aeffSJohn Baldwin vfs_timestamp(&shmfd->shm_atime); 16098e38aeffSJohn Baldwin mtx_unlock(&shm_timestamp_lock); 16108e38aeffSJohn Baldwin vm_object_reference(shmfd->shm_object); 16117077c426SJohn Baldwin 1612d301b358SKonstantin Belousov if (shm_largepage(shmfd)) { 161379783634SKonstantin Belousov writecnt = false; 1614d301b358SKonstantin Belousov error = shm_mmap_large(shmfd, map, addr, objsize, prot, 161579783634SKonstantin Belousov maxprot, flags, foff, td); 1616d301b358SKonstantin Belousov } else { 161779783634SKonstantin Belousov if (writecnt) { 161879783634SKonstantin Belousov vm_pager_update_writecount(shmfd->shm_object, 0, 161979783634SKonstantin Belousov objsize); 162079783634SKonstantin Belousov } 16217077c426SJohn Baldwin error = vm_mmap_object(map, addr, objsize, prot, maxprot, flags, 1622dca52ab4SKyle Evans shmfd->shm_object, foff, writecnt, td); 1623d301b358SKonstantin Belousov } 1624dca52ab4SKyle Evans if (error != 0) { 1625dca52ab4SKyle Evans if (writecnt) 1626dca52ab4SKyle Evans vm_pager_release_writecount(shmfd->shm_object, 0, 1627dca52ab4SKyle 
Evans objsize); 16287077c426SJohn Baldwin vm_object_deallocate(shmfd->shm_object); 1629dca52ab4SKyle Evans } 1630af755d3eSKyle Evans out: 1631af755d3eSKyle Evans rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); 163234d3e89fSKonstantin Belousov return (error); 16338e38aeffSJohn Baldwin } 16349c00bb91SKonstantin Belousov 16359c00bb91SKonstantin Belousov static int 16369c00bb91SKonstantin Belousov shm_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, 16379c00bb91SKonstantin Belousov struct thread *td) 16389c00bb91SKonstantin Belousov { 16399c00bb91SKonstantin Belousov struct shmfd *shmfd; 16409c00bb91SKonstantin Belousov int error; 16419c00bb91SKonstantin Belousov 16429c00bb91SKonstantin Belousov error = 0; 16439c00bb91SKonstantin Belousov shmfd = fp->f_data; 16449c00bb91SKonstantin Belousov mtx_lock(&shm_timestamp_lock); 16459c00bb91SKonstantin Belousov /* 16469c00bb91SKonstantin Belousov * SUSv4 says that x bits of permission need not be affected. 16479c00bb91SKonstantin Belousov * Be consistent with our shm_open there. 16489c00bb91SKonstantin Belousov */ 16499c00bb91SKonstantin Belousov #ifdef MAC 16509c00bb91SKonstantin Belousov error = mac_posixshm_check_setmode(active_cred, shmfd, mode); 16519c00bb91SKonstantin Belousov if (error != 0) 16529c00bb91SKonstantin Belousov goto out; 16539c00bb91SKonstantin Belousov #endif 1654d292b194SMateusz Guzik error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid, 1655d292b194SMateusz Guzik VADMIN, active_cred); 16569c00bb91SKonstantin Belousov if (error != 0) 16579c00bb91SKonstantin Belousov goto out; 16589c00bb91SKonstantin Belousov shmfd->shm_mode = mode & ACCESSPERMS; 16599c00bb91SKonstantin Belousov out: 16609c00bb91SKonstantin Belousov mtx_unlock(&shm_timestamp_lock); 16619c00bb91SKonstantin Belousov return (error); 16629c00bb91SKonstantin Belousov } 16639c00bb91SKonstantin Belousov 16649c00bb91SKonstantin Belousov static int 16659c00bb91SKonstantin Belousov shm_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, 16669c00bb91SKonstantin Belousov struct thread *td) 16679c00bb91SKonstantin Belousov { 16689c00bb91SKonstantin Belousov struct shmfd *shmfd; 16699c00bb91SKonstantin Belousov int error; 16709c00bb91SKonstantin Belousov 167168889ed6SKonstantin Belousov error = 0; 16729c00bb91SKonstantin Belousov shmfd = fp->f_data; 16739c00bb91SKonstantin Belousov mtx_lock(&shm_timestamp_lock); 16749c00bb91SKonstantin Belousov #ifdef MAC 16759c00bb91SKonstantin Belousov error = mac_posixshm_check_setowner(active_cred, shmfd, uid, gid); 16769c00bb91SKonstantin Belousov if (error != 0) 16779c00bb91SKonstantin Belousov goto out; 16789c00bb91SKonstantin Belousov #endif 16799c00bb91SKonstantin Belousov if (uid == (uid_t)-1) 16809c00bb91SKonstantin Belousov uid = shmfd->shm_uid; 16819c00bb91SKonstantin Belousov if (gid == (gid_t)-1) 16829c00bb91SKonstantin Belousov gid = shmfd->shm_gid; 16839c00bb91SKonstantin Belousov if (((uid != shmfd->shm_uid && uid != active_cred->cr_uid) || 16849c00bb91SKonstantin Belousov (gid != shmfd->shm_gid && !groupmember(gid, active_cred))) && 1685cc426dd3SMateusz Guzik (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN))) 16869c00bb91SKonstantin Belousov goto out; 16879c00bb91SKonstantin Belousov shmfd->shm_uid = uid; 16889c00bb91SKonstantin Belousov shmfd->shm_gid = gid; 16899c00bb91SKonstantin Belousov out: 16909c00bb91SKonstantin Belousov mtx_unlock(&shm_timestamp_lock); 16919c00bb91SKonstantin Belousov return (error); 16929c00bb91SKonstantin Belousov } 
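/*
 * Illustrative userland sketch (not part of the kernel build; the shm
 * path name is made up): fchmod(2) and fchown(2) on a POSIX shm
 * descriptor are serviced by shm_chmod() and shm_chown() above via the
 * file-ops methods, so permission and ownership changes behave much as
 * they do for a regular file descriptor.
 *
 *     #include <sys/mman.h>
 *     #include <sys/stat.h>
 *     #include <fcntl.h>
 *     #include <unistd.h>
 *     #include <err.h>
 *
 *     int
 *     main(void)
 *     {
 *             struct stat sb;
 *             int fd;
 *
 *             fd = shm_open("/chmod_demo", O_RDWR | O_CREAT, 0600);
 *             if (fd < 0)
 *                     err(1, "shm_open");
 *             if (fchmod(fd, 0640) != 0)
 *                     err(1, "fchmod");
 *             if (fstat(fd, &sb) != 0)
 *                     err(1, "fstat");
 *             (void)shm_unlink("/chmod_demo");
 *             close(fd);
 *             return (0);
 *     }
 */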
1693fb680e16SJohn Baldwin 1694fb680e16SJohn Baldwin /* 1695fb680e16SJohn Baldwin * Helper routines to allow the backing object of a shared memory file 1696fb680e16SJohn Baldwin * descriptor to be mapped in the kernel. 1697fb680e16SJohn Baldwin */ 1698fb680e16SJohn Baldwin int 1699fb680e16SJohn Baldwin shm_map(struct file *fp, size_t size, off_t offset, void **memp) 1700fb680e16SJohn Baldwin { 1701fb680e16SJohn Baldwin struct shmfd *shmfd; 1702fb680e16SJohn Baldwin vm_offset_t kva, ofs; 1703fb680e16SJohn Baldwin vm_object_t obj; 1704fb680e16SJohn Baldwin int rv; 1705fb680e16SJohn Baldwin 1706fb680e16SJohn Baldwin if (fp->f_type != DTYPE_SHM) 1707fb680e16SJohn Baldwin return (EINVAL); 1708fb680e16SJohn Baldwin shmfd = fp->f_data; 1709fb680e16SJohn Baldwin obj = shmfd->shm_object; 171089f6b863SAttilio Rao VM_OBJECT_WLOCK(obj); 1711fb680e16SJohn Baldwin /* 1712fb680e16SJohn Baldwin * XXXRW: This validation is probably insufficient, and subject to 1713fb680e16SJohn Baldwin * sign errors. It should be fixed. 1714fb680e16SJohn Baldwin */ 1715fb680e16SJohn Baldwin if (offset >= shmfd->shm_size || 1716fb680e16SJohn Baldwin offset + size > round_page(shmfd->shm_size)) { 171789f6b863SAttilio Rao VM_OBJECT_WUNLOCK(obj); 1718fb680e16SJohn Baldwin return (EINVAL); 1719fb680e16SJohn Baldwin } 1720fb680e16SJohn Baldwin 1721fb680e16SJohn Baldwin shmfd->shm_kmappings++; 1722fb680e16SJohn Baldwin vm_object_reference_locked(obj); 172389f6b863SAttilio Rao VM_OBJECT_WUNLOCK(obj); 1724fb680e16SJohn Baldwin 1725fb680e16SJohn Baldwin /* Map the object into the kernel_map and wire it. */ 1726fb680e16SJohn Baldwin kva = vm_map_min(kernel_map); 1727fb680e16SJohn Baldwin ofs = offset & PAGE_MASK; 1728fb680e16SJohn Baldwin offset = trunc_page(offset); 1729fb680e16SJohn Baldwin size = round_page(size + ofs); 1730edb572a3SJohn Baldwin rv = vm_map_find(kernel_map, obj, offset, &kva, size, 0, 17315e3a17c0SJohn Baldwin VMFS_OPTIMAL_SPACE, VM_PROT_READ | VM_PROT_WRITE, 1732fb680e16SJohn Baldwin VM_PROT_READ | VM_PROT_WRITE, 0); 1733fb680e16SJohn Baldwin if (rv == KERN_SUCCESS) { 1734fb680e16SJohn Baldwin rv = vm_map_wire(kernel_map, kva, kva + size, 1735fb680e16SJohn Baldwin VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); 1736fb680e16SJohn Baldwin if (rv == KERN_SUCCESS) { 1737fb680e16SJohn Baldwin *memp = (void *)(kva + ofs); 1738fb680e16SJohn Baldwin return (0); 1739fb680e16SJohn Baldwin } 1740fb680e16SJohn Baldwin vm_map_remove(kernel_map, kva, kva + size); 1741fb680e16SJohn Baldwin } else 1742fb680e16SJohn Baldwin vm_object_deallocate(obj); 1743fb680e16SJohn Baldwin 1744fb680e16SJohn Baldwin /* On failure, drop our mapping reference. */ 174589f6b863SAttilio Rao VM_OBJECT_WLOCK(obj); 1746fb680e16SJohn Baldwin shmfd->shm_kmappings--; 174789f6b863SAttilio Rao VM_OBJECT_WUNLOCK(obj); 1748fb680e16SJohn Baldwin 1749338e7cf2SJohn Baldwin return (vm_mmap_to_errno(rv)); 1750fb680e16SJohn Baldwin } 1751fb680e16SJohn Baldwin 1752fb680e16SJohn Baldwin /* 1753fb680e16SJohn Baldwin * We require the caller to unmap the entire entry. This allows us to 1754fb680e16SJohn Baldwin * safely decrement shm_kmappings when a mapping is removed. 
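 *
 * A minimal in-kernel usage sketch (hypothetical caller; fp is a held
 * DTYPE_SHM file reference and off/len have already been validated by
 * that caller):
 *
 *     void *mem;
 *     int error;
 *
 *     error = shm_map(fp, len, off, &mem);
 *     if (error == 0) {
 *             memset(mem, 0, len);
 *             error = shm_unmap(fp, mem, len);
 *     }
 *
 * The exact mem/len pair returned by shm_map() must be passed back to
 * shm_unmap() so that the whole kernel_map entry is removed and
 * shm_kmappings stays balanced.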
1755fb680e16SJohn Baldwin */ 1756fb680e16SJohn Baldwin int 1757fb680e16SJohn Baldwin shm_unmap(struct file *fp, void *mem, size_t size) 1758fb680e16SJohn Baldwin { 1759fb680e16SJohn Baldwin struct shmfd *shmfd; 1760fb680e16SJohn Baldwin vm_map_entry_t entry; 1761fb680e16SJohn Baldwin vm_offset_t kva, ofs; 1762fb680e16SJohn Baldwin vm_object_t obj; 1763fb680e16SJohn Baldwin vm_pindex_t pindex; 1764fb680e16SJohn Baldwin vm_prot_t prot; 1765fb680e16SJohn Baldwin boolean_t wired; 1766fb680e16SJohn Baldwin vm_map_t map; 1767fb680e16SJohn Baldwin int rv; 1768fb680e16SJohn Baldwin 1769fb680e16SJohn Baldwin if (fp->f_type != DTYPE_SHM) 1770fb680e16SJohn Baldwin return (EINVAL); 1771fb680e16SJohn Baldwin shmfd = fp->f_data; 1772fb680e16SJohn Baldwin kva = (vm_offset_t)mem; 1773fb680e16SJohn Baldwin ofs = kva & PAGE_MASK; 1774fb680e16SJohn Baldwin kva = trunc_page(kva); 1775fb680e16SJohn Baldwin size = round_page(size + ofs); 1776fb680e16SJohn Baldwin map = kernel_map; 1777fb680e16SJohn Baldwin rv = vm_map_lookup(&map, kva, VM_PROT_READ | VM_PROT_WRITE, &entry, 1778fb680e16SJohn Baldwin &obj, &pindex, &prot, &wired); 1779fb680e16SJohn Baldwin if (rv != KERN_SUCCESS) 1780fb680e16SJohn Baldwin return (EINVAL); 1781fb680e16SJohn Baldwin if (entry->start != kva || entry->end != kva + size) { 1782fb680e16SJohn Baldwin vm_map_lookup_done(map, entry); 1783fb680e16SJohn Baldwin return (EINVAL); 1784fb680e16SJohn Baldwin } 1785fb680e16SJohn Baldwin vm_map_lookup_done(map, entry); 1786fb680e16SJohn Baldwin if (obj != shmfd->shm_object) 1787fb680e16SJohn Baldwin return (EINVAL); 1788fb680e16SJohn Baldwin vm_map_remove(map, kva, kva + size); 178989f6b863SAttilio Rao VM_OBJECT_WLOCK(obj); 1790fb680e16SJohn Baldwin KASSERT(shmfd->shm_kmappings > 0, ("shm_unmap: object not mapped")); 1791fb680e16SJohn Baldwin shmfd->shm_kmappings--; 179289f6b863SAttilio Rao VM_OBJECT_WUNLOCK(obj); 1793fb680e16SJohn Baldwin return (0); 1794fb680e16SJohn Baldwin } 1795e506e182SJohn Baldwin 17969696feebSJohn Baldwin static int 179756d0e33eSKonstantin Belousov shm_fill_kinfo_locked(struct shmfd *shmfd, struct kinfo_file *kif, bool list) 1798e506e182SJohn Baldwin { 1799cc7b259aSJamie Gritton const char *path, *pr_path; 1800cc7b259aSJamie Gritton size_t pr_pathlen; 180156d0e33eSKonstantin Belousov bool visible; 1802e506e182SJohn Baldwin 180356d0e33eSKonstantin Belousov sx_assert(&shm_dict_lock, SA_LOCKED); 18049696feebSJohn Baldwin kif->kf_type = KF_TYPE_SHM; 180556d0e33eSKonstantin Belousov kif->kf_un.kf_file.kf_file_mode = S_IFREG | shmfd->shm_mode; 18069696feebSJohn Baldwin kif->kf_un.kf_file.kf_file_size = shmfd->shm_size; 18079696feebSJohn Baldwin if (shmfd->shm_path != NULL) { 180844c16975SJamie Gritton if (shmfd->shm_path != NULL) { 1809cc7b259aSJamie Gritton path = shmfd->shm_path; 1810cc7b259aSJamie Gritton pr_path = curthread->td_ucred->cr_prison->pr_path; 181144c16975SJamie Gritton if (strcmp(pr_path, "/") != 0) { 181244c16975SJamie Gritton /* Return the jail-rooted pathname. 
*/ 1813cc7b259aSJamie Gritton pr_pathlen = strlen(pr_path); 181456d0e33eSKonstantin Belousov visible = strncmp(path, pr_path, pr_pathlen) 181556d0e33eSKonstantin Belousov == 0 && path[pr_pathlen] == '/'; 181656d0e33eSKonstantin Belousov if (list && !visible) 181756d0e33eSKonstantin Belousov return (EPERM); 181856d0e33eSKonstantin Belousov if (visible) 1819cc7b259aSJamie Gritton path += pr_pathlen; 1820cc7b259aSJamie Gritton } 1821cc7b259aSJamie Gritton strlcpy(kif->kf_path, path, sizeof(kif->kf_path)); 1822cc7b259aSJamie Gritton } 1823e506e182SJohn Baldwin } 18249696feebSJohn Baldwin return (0); 18259696feebSJohn Baldwin } 182656d0e33eSKonstantin Belousov 182756d0e33eSKonstantin Belousov static int 182856d0e33eSKonstantin Belousov shm_fill_kinfo(struct file *fp, struct kinfo_file *kif, 182956d0e33eSKonstantin Belousov struct filedesc *fdp __unused) 183056d0e33eSKonstantin Belousov { 183156d0e33eSKonstantin Belousov int res; 183256d0e33eSKonstantin Belousov 183356d0e33eSKonstantin Belousov sx_slock(&shm_dict_lock); 183456d0e33eSKonstantin Belousov res = shm_fill_kinfo_locked(fp->f_data, kif, false); 183556d0e33eSKonstantin Belousov sx_sunlock(&shm_dict_lock); 183656d0e33eSKonstantin Belousov return (res); 183756d0e33eSKonstantin Belousov } 183856d0e33eSKonstantin Belousov 183956d0e33eSKonstantin Belousov static int 1840af755d3eSKyle Evans shm_add_seals(struct file *fp, int seals) 1841af755d3eSKyle Evans { 1842af755d3eSKyle Evans struct shmfd *shmfd; 1843af755d3eSKyle Evans void *rl_cookie; 1844af755d3eSKyle Evans vm_ooffset_t writemappings; 1845af755d3eSKyle Evans int error, nseals; 1846af755d3eSKyle Evans 1847af755d3eSKyle Evans error = 0; 1848af755d3eSKyle Evans shmfd = fp->f_data; 1849af755d3eSKyle Evans rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX, 1850af755d3eSKyle Evans &shmfd->shm_mtx); 1851af755d3eSKyle Evans 1852af755d3eSKyle Evans /* Even already-set seals should result in EPERM. */ 1853af755d3eSKyle Evans if ((shmfd->shm_seals & F_SEAL_SEAL) != 0) { 1854af755d3eSKyle Evans error = EPERM; 1855af755d3eSKyle Evans goto out; 1856af755d3eSKyle Evans } 1857af755d3eSKyle Evans nseals = seals & ~shmfd->shm_seals; 1858af755d3eSKyle Evans if ((nseals & F_SEAL_WRITE) != 0) { 185979783634SKonstantin Belousov if (shm_largepage(shmfd)) { 186079783634SKonstantin Belousov error = ENOTSUP; 186179783634SKonstantin Belousov goto out; 186279783634SKonstantin Belousov } 186379783634SKonstantin Belousov 1864af755d3eSKyle Evans /* 1865af755d3eSKyle Evans * The rangelock above prevents writable mappings from being 1866af755d3eSKyle Evans * added after we've started applying seals. The RLOCK here 1867af755d3eSKyle Evans * is to avoid torn reads on ILP32 arches as unmapping/reducing 1868af755d3eSKyle Evans * writemappings will be done without a rangelock. 
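 *
 * The userland-visible consequence (an illustrative fragment, assuming a
 * descriptor created with memfd_create(3) and MFD_ALLOW_SEALING): while a
 * shared writable mapping of the object exists,
 *
 *     p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *     error = fcntl(fd, F_ADD_SEALS, F_SEAL_WRITE);
 *
 * the F_ADD_SEALS call returns -1 with errno set to EBUSY; after
 * munmap(p, len) the same call succeeds.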
1869af755d3eSKyle Evans */ 1870af755d3eSKyle Evans VM_OBJECT_RLOCK(shmfd->shm_object); 1871af755d3eSKyle Evans writemappings = shmfd->shm_object->un_pager.swp.writemappings; 1872af755d3eSKyle Evans VM_OBJECT_RUNLOCK(shmfd->shm_object); 1873af755d3eSKyle Evans /* kmappings are also writable */ 1874af755d3eSKyle Evans if (writemappings > 0) { 1875af755d3eSKyle Evans error = EBUSY; 1876af755d3eSKyle Evans goto out; 1877af755d3eSKyle Evans } 1878af755d3eSKyle Evans } 1879af755d3eSKyle Evans shmfd->shm_seals |= nseals; 1880af755d3eSKyle Evans out: 1881af755d3eSKyle Evans rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); 1882af755d3eSKyle Evans return (error); 1883af755d3eSKyle Evans } 1884af755d3eSKyle Evans 1885af755d3eSKyle Evans static int 1886af755d3eSKyle Evans shm_get_seals(struct file *fp, int *seals) 1887af755d3eSKyle Evans { 1888af755d3eSKyle Evans struct shmfd *shmfd; 1889af755d3eSKyle Evans 1890af755d3eSKyle Evans shmfd = fp->f_data; 1891af755d3eSKyle Evans *seals = shmfd->shm_seals; 1892af755d3eSKyle Evans return (0); 1893af755d3eSKyle Evans } 1894af755d3eSKyle Evans 1895af755d3eSKyle Evans static int 1896454bc887SKa Ho Ng shm_deallocate(struct shmfd *shmfd, off_t *offset, off_t *length, int flags) 1897454bc887SKa Ho Ng { 1898454bc887SKa Ho Ng vm_object_t object; 1899454bc887SKa Ho Ng vm_pindex_t pistart, pi, piend; 1900454bc887SKa Ho Ng vm_ooffset_t off, len; 1901454bc887SKa Ho Ng int startofs, endofs, end; 1902454bc887SKa Ho Ng int error; 1903454bc887SKa Ho Ng 1904454bc887SKa Ho Ng off = *offset; 1905454bc887SKa Ho Ng len = *length; 1906454bc887SKa Ho Ng KASSERT(off + len <= (vm_ooffset_t)OFF_MAX, ("off + len overflows")); 19071eaa3652SKa Ho Ng if (off + len > shmfd->shm_size) 19081eaa3652SKa Ho Ng len = shmfd->shm_size - off; 1909454bc887SKa Ho Ng object = shmfd->shm_object; 1910454bc887SKa Ho Ng startofs = off & PAGE_MASK; 1911454bc887SKa Ho Ng endofs = (off + len) & PAGE_MASK; 1912454bc887SKa Ho Ng pistart = OFF_TO_IDX(off); 1913454bc887SKa Ho Ng piend = OFF_TO_IDX(off + len); 1914454bc887SKa Ho Ng pi = OFF_TO_IDX(off + PAGE_MASK); 1915454bc887SKa Ho Ng error = 0; 1916454bc887SKa Ho Ng 19175c1428d2SKa Ho Ng /* Handle the case when offset is on or beyond shm size. */ 19185c1428d2SKa Ho Ng if ((off_t)len <= 0) { 19191eaa3652SKa Ho Ng *length = 0; 19201eaa3652SKa Ho Ng return (0); 19211eaa3652SKa Ho Ng } 19221eaa3652SKa Ho Ng 1923454bc887SKa Ho Ng VM_OBJECT_WLOCK(object); 1924454bc887SKa Ho Ng 1925454bc887SKa Ho Ng if (startofs != 0) { 1926454bc887SKa Ho Ng end = pistart != piend ? 
PAGE_SIZE : endofs; 1927454bc887SKa Ho Ng error = shm_partial_page_invalidate(object, pistart, startofs, 1928454bc887SKa Ho Ng end); 1929454bc887SKa Ho Ng if (error) 1930454bc887SKa Ho Ng goto out; 1931454bc887SKa Ho Ng off += end - startofs; 1932454bc887SKa Ho Ng len -= end - startofs; 1933454bc887SKa Ho Ng } 1934454bc887SKa Ho Ng 1935454bc887SKa Ho Ng if (pi < piend) { 1936454bc887SKa Ho Ng vm_object_page_remove(object, pi, piend, 0); 1937454bc887SKa Ho Ng off += IDX_TO_OFF(piend - pi); 1938454bc887SKa Ho Ng len -= IDX_TO_OFF(piend - pi); 1939454bc887SKa Ho Ng } 1940454bc887SKa Ho Ng 1941454bc887SKa Ho Ng if (endofs != 0 && pistart != piend) { 1942454bc887SKa Ho Ng error = shm_partial_page_invalidate(object, piend, 0, endofs); 1943454bc887SKa Ho Ng if (error) 1944454bc887SKa Ho Ng goto out; 1945454bc887SKa Ho Ng off += endofs; 1946454bc887SKa Ho Ng len -= endofs; 1947454bc887SKa Ho Ng } 1948454bc887SKa Ho Ng 1949454bc887SKa Ho Ng out: 1950454bc887SKa Ho Ng VM_OBJECT_WUNLOCK(shmfd->shm_object); 1951454bc887SKa Ho Ng *offset = off; 1952454bc887SKa Ho Ng *length = len; 1953454bc887SKa Ho Ng return (error); 1954454bc887SKa Ho Ng } 1955454bc887SKa Ho Ng 1956454bc887SKa Ho Ng static int 1957454bc887SKa Ho Ng shm_fspacectl(struct file *fp, int cmd, off_t *offset, off_t *length, int flags, 1958454bc887SKa Ho Ng struct ucred *active_cred, struct thread *td) 1959454bc887SKa Ho Ng { 1960454bc887SKa Ho Ng void *rl_cookie; 1961454bc887SKa Ho Ng struct shmfd *shmfd; 1962454bc887SKa Ho Ng off_t off, len; 1963454bc887SKa Ho Ng int error; 1964454bc887SKa Ho Ng 1965454bc887SKa Ho Ng /* This assumes that the caller already checked for overflow. */ 1966454bc887SKa Ho Ng error = EINVAL; 1967454bc887SKa Ho Ng shmfd = fp->f_data; 1968454bc887SKa Ho Ng off = *offset; 1969454bc887SKa Ho Ng len = *length; 1970454bc887SKa Ho Ng 1971454bc887SKa Ho Ng if (cmd != SPACECTL_DEALLOC || off < 0 || len <= 0 || 1972454bc887SKa Ho Ng len > OFF_MAX - off || flags != 0) 1973454bc887SKa Ho Ng return (EINVAL); 1974454bc887SKa Ho Ng 1975454bc887SKa Ho Ng rl_cookie = rangelock_wlock(&shmfd->shm_rl, off, off + len, 1976454bc887SKa Ho Ng &shmfd->shm_mtx); 1977454bc887SKa Ho Ng switch (cmd) { 1978454bc887SKa Ho Ng case SPACECTL_DEALLOC: 1979454bc887SKa Ho Ng if ((shmfd->shm_seals & F_SEAL_WRITE) != 0) { 1980454bc887SKa Ho Ng error = EPERM; 1981454bc887SKa Ho Ng break; 1982454bc887SKa Ho Ng } 1983454bc887SKa Ho Ng error = shm_deallocate(shmfd, &off, &len, flags); 1984454bc887SKa Ho Ng *offset = off; 1985454bc887SKa Ho Ng *length = len; 1986454bc887SKa Ho Ng break; 1987454bc887SKa Ho Ng default: 1988454bc887SKa Ho Ng __assert_unreachable(); 1989454bc887SKa Ho Ng } 1990454bc887SKa Ho Ng rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); 1991454bc887SKa Ho Ng return (error); 1992454bc887SKa Ho Ng } 1993454bc887SKa Ho Ng 1994454bc887SKa Ho Ng 1995454bc887SKa Ho Ng static int 1996f1040532SKyle Evans shm_fallocate(struct file *fp, off_t offset, off_t len, struct thread *td) 1997f1040532SKyle Evans { 1998f1040532SKyle Evans void *rl_cookie; 1999f1040532SKyle Evans struct shmfd *shmfd; 2000f1040532SKyle Evans size_t size; 2001f1040532SKyle Evans int error; 2002f1040532SKyle Evans 2003f1040532SKyle Evans /* This assumes that the caller already checked for overflow. 
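 *
 * For illustration (userland view, not part of the kernel sources): this
 * path is what services posix_fallocate(2) on a shm descriptor, and the
 * translation below is why an allocation failure surfaces to the caller
 * as ENOSPC rather than ENOMEM.
 *
 *     #include <sys/mman.h>
 *     #include <fcntl.h>
 *     #include <err.h>
 *
 *     int
 *     main(void)
 *     {
 *             int error, fd;
 *
 *             fd = shm_open(SHM_ANON, O_RDWR, 0);
 *             if (fd < 0)
 *                     err(1, "shm_open");
 *             error = posix_fallocate(fd, 0, 1024 * 1024);
 *             if (error != 0)
 *                     errx(1, "posix_fallocate: %d", error);
 *             return (0);
 *     }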
*/ 2004f1040532SKyle Evans error = 0; 2005f1040532SKyle Evans shmfd = fp->f_data; 2006f1040532SKyle Evans size = offset + len; 200739eae263SKyle Evans 200839eae263SKyle Evans /* 200939eae263SKyle Evans * Just grab the rangelock for the range that we may be attempting to 201039eae263SKyle Evans * grow, rather than blocking read/write for regions we won't be 201139eae263SKyle Evans * touching while this (potential) resize is in progress. Other 201239eae263SKyle Evans * attempts to resize the shmfd will have to take a write lock from 0 to 201339eae263SKyle Evans * OFF_MAX, so this being potentially beyond the current usable range of 201439eae263SKyle Evans * the shmfd is not necessarily a concern. If other mechanisms are 201539eae263SKyle Evans * added to grow a shmfd, this may need to be re-evaluated. 201639eae263SKyle Evans */ 201739eae263SKyle Evans rl_cookie = rangelock_wlock(&shmfd->shm_rl, offset, size, 2018f1040532SKyle Evans &shmfd->shm_mtx); 2019d301b358SKonstantin Belousov if (size > shmfd->shm_size) 2020d301b358SKonstantin Belousov error = shm_dotruncate_cookie(shmfd, size, rl_cookie); 2021f1040532SKyle Evans rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); 2022f1040532SKyle Evans /* Translate to posix_fallocate(2) return value as needed. */ 2023f1040532SKyle Evans if (error == ENOMEM) 2024f1040532SKyle Evans error = ENOSPC; 2025f1040532SKyle Evans return (error); 2026f1040532SKyle Evans } 2027f1040532SKyle Evans 2028f1040532SKyle Evans static int 202956d0e33eSKonstantin Belousov sysctl_posix_shm_list(SYSCTL_HANDLER_ARGS) 203056d0e33eSKonstantin Belousov { 203156d0e33eSKonstantin Belousov struct shm_mapping *shmm; 203256d0e33eSKonstantin Belousov struct sbuf sb; 203356d0e33eSKonstantin Belousov struct kinfo_file kif; 203456d0e33eSKonstantin Belousov u_long i; 203556d0e33eSKonstantin Belousov ssize_t curlen; 203656d0e33eSKonstantin Belousov int error, error2; 203756d0e33eSKonstantin Belousov 203856d0e33eSKonstantin Belousov sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_file) * 5, req); 203956d0e33eSKonstantin Belousov sbuf_clear_flags(&sb, SBUF_INCLUDENUL); 204056d0e33eSKonstantin Belousov curlen = 0; 204156d0e33eSKonstantin Belousov error = 0; 204256d0e33eSKonstantin Belousov sx_slock(&shm_dict_lock); 204356d0e33eSKonstantin Belousov for (i = 0; i < shm_hash + 1; i++) { 204456d0e33eSKonstantin Belousov LIST_FOREACH(shmm, &shm_dictionary[i], sm_link) { 204556d0e33eSKonstantin Belousov error = shm_fill_kinfo_locked(shmm->sm_shmfd, 204656d0e33eSKonstantin Belousov &kif, true); 2047747a4726SJamie Gritton if (error == EPERM) { 2048747a4726SJamie Gritton error = 0; 204956d0e33eSKonstantin Belousov continue; 2050747a4726SJamie Gritton } 205156d0e33eSKonstantin Belousov if (error != 0) 205256d0e33eSKonstantin Belousov break; 205356d0e33eSKonstantin Belousov pack_kinfo(&kif); 205456d0e33eSKonstantin Belousov if (req->oldptr != NULL && 205556d0e33eSKonstantin Belousov kif.kf_structsize + curlen > req->oldlen) 205656d0e33eSKonstantin Belousov break; 205756d0e33eSKonstantin Belousov error = sbuf_bcat(&sb, &kif, kif.kf_structsize) == 0 ? 
205856d0e33eSKonstantin Belousov 0 : ENOMEM; 205956d0e33eSKonstantin Belousov if (error != 0) 206056d0e33eSKonstantin Belousov break; 206156d0e33eSKonstantin Belousov curlen += kif.kf_structsize; 206256d0e33eSKonstantin Belousov } 206356d0e33eSKonstantin Belousov } 206456d0e33eSKonstantin Belousov sx_sunlock(&shm_dict_lock); 206556d0e33eSKonstantin Belousov error2 = sbuf_finish(&sb); 206656d0e33eSKonstantin Belousov sbuf_delete(&sb); 206756d0e33eSKonstantin Belousov return (error != 0 ? error : error2); 206856d0e33eSKonstantin Belousov } 206956d0e33eSKonstantin Belousov 207056d0e33eSKonstantin Belousov SYSCTL_PROC(_kern_ipc, OID_AUTO, posix_shm_list, 207156d0e33eSKonstantin Belousov CTLFLAG_RD | CTLFLAG_MPSAFE | CTLTYPE_OPAQUE, 207256d0e33eSKonstantin Belousov NULL, 0, sysctl_posix_shm_list, "", 207356d0e33eSKonstantin Belousov "POSIX SHM list"); 207420f70576SKyle Evans 207520f70576SKyle Evans int 2076535b1df9SKyle Evans kern_shm_open(struct thread *td, const char *path, int flags, mode_t mode, 2077535b1df9SKyle Evans struct filecaps *caps) 207820f70576SKyle Evans { 207920f70576SKyle Evans 2080535b1df9SKyle Evans return (kern_shm_open2(td, path, flags, mode, 0, caps, NULL)); 208120f70576SKyle Evans } 208220f70576SKyle Evans 208320f70576SKyle Evans /* 208420f70576SKyle Evans * This version of the shm_open() interface leaves CLOEXEC behavior up to the 208520f70576SKyle Evans * caller, and libc will enforce it for the traditional shm_open() call. This 208620f70576SKyle Evans * allows other consumers, like memfd_create(), to opt-in for CLOEXEC. This 208720f70576SKyle Evans * interface also includes a 'name' argument that is currently unused, but could 208820f70576SKyle Evans * potentially be exported later via some interface for debugging purposes. 208920f70576SKyle Evans * From the kernel's perspective, it is optional. Individual consumers like 209020f70576SKyle Evans * memfd_create() may require it in order to be compatible with other systems 209120f70576SKyle Evans * implementing the same function. 209220f70576SKyle Evans */ 209320f70576SKyle Evans int 209420f70576SKyle Evans sys_shm_open2(struct thread *td, struct shm_open2_args *uap) 209520f70576SKyle Evans { 209620f70576SKyle Evans 209720f70576SKyle Evans return (kern_shm_open2(td, uap->path, uap->flags, uap->mode, 2098535b1df9SKyle Evans uap->shmflags, NULL, uap->name)); 209920f70576SKyle Evans } 2100
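/*
 * Illustrative userland sketch (not part of the kernel build): the
 * simplest consumer of the machinery above is an anonymous object created
 * with shm_open(SHM_ANON, ...), sized with ftruncate(2) and mapped with
 * mmap(2). An anonymous object must be opened read-write, as enforced in
 * kern_shm_open2() above.
 *
 *     #include <sys/mman.h>
 *     #include <fcntl.h>
 *     #include <string.h>
 *     #include <unistd.h>
 *     #include <err.h>
 *
 *     int
 *     main(void)
 *     {
 *             char *p;
 *             int fd;
 *
 *             fd = shm_open(SHM_ANON, O_RDWR, 0600);
 *             if (fd < 0)
 *                     err(1, "shm_open");
 *             if (ftruncate(fd, 4096) != 0)
 *                     err(1, "ftruncate");
 *             p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED,
 *                 fd, 0);
 *             if (p == MAP_FAILED)
 *                     err(1, "mmap");
 *             strlcpy(p, "hello, shm", 4096);
 *             munmap(p, 4096);
 *             close(fd);
 *             return (0);
 *     }
 */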