18e38aeffSJohn Baldwin /*- 28a36da99SPedro F. Giffuni * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 38a36da99SPedro F. Giffuni * 415bcf785SRobert Watson * Copyright (c) 2006, 2011, 2016-2017 Robert N. M. Watson 5d301b358SKonstantin Belousov * Copyright 2020 The FreeBSD Foundation 68e38aeffSJohn Baldwin * All rights reserved. 78e38aeffSJohn Baldwin * 815bcf785SRobert Watson * Portions of this software were developed by BAE Systems, the University of 915bcf785SRobert Watson * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL 1015bcf785SRobert Watson * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent 1115bcf785SRobert Watson * Computing (TC) research program. 1215bcf785SRobert Watson * 13d301b358SKonstantin Belousov * Portions of this software were developed by Konstantin Belousov 14d301b358SKonstantin Belousov * under sponsorship from the FreeBSD Foundation. 15d301b358SKonstantin Belousov * 168e38aeffSJohn Baldwin * Redistribution and use in source and binary forms, with or without 178e38aeffSJohn Baldwin * modification, are permitted provided that the following conditions 188e38aeffSJohn Baldwin * are met: 198e38aeffSJohn Baldwin * 1. Redistributions of source code must retain the above copyright 208e38aeffSJohn Baldwin * notice, this list of conditions and the following disclaimer. 218e38aeffSJohn Baldwin * 2. Redistributions in binary form must reproduce the above copyright 228e38aeffSJohn Baldwin * notice, this list of conditions and the following disclaimer in the 238e38aeffSJohn Baldwin * documentation and/or other materials provided with the distribution. 248e38aeffSJohn Baldwin * 258e38aeffSJohn Baldwin * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 268e38aeffSJohn Baldwin * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 278e38aeffSJohn Baldwin * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 288e38aeffSJohn Baldwin * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 298e38aeffSJohn Baldwin * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 308e38aeffSJohn Baldwin * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 318e38aeffSJohn Baldwin * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 328e38aeffSJohn Baldwin * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 338e38aeffSJohn Baldwin * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 348e38aeffSJohn Baldwin * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 358e38aeffSJohn Baldwin * SUCH DAMAGE. 368e38aeffSJohn Baldwin */ 378e38aeffSJohn Baldwin 388e38aeffSJohn Baldwin /* 398e38aeffSJohn Baldwin * Support for shared swap-backed anonymous memory objects via 409afb12baSDavid Bright * shm_open(2), shm_rename(2), and shm_unlink(2). 419afb12baSDavid Bright * While most of the implementation is here, vm_mmap.c contains 429afb12baSDavid Bright * mapping logic changes. 438e38aeffSJohn Baldwin * 445c066cd2SKonstantin Belousov * posixshmcontrol(1) allows users to inspect the state of the memory 455c066cd2SKonstantin Belousov * objects. Per-uid swap resource limit controls total amount of 465c066cd2SKonstantin Belousov * memory that user can consume for anonymous objects, including 475c066cd2SKonstantin Belousov * shared. 
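 *
 * As an illustrative sketch only (not part of this file), a typical
 * anonymous object is created and mapped from userspace roughly as:
 *
 *	int fd = shm_open(SHM_ANON, O_RDWR, 0600);
 *	ftruncate(fd, len);
 *	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
 *	    fd, 0);
 *
 * where "len" and the error handling are left to the caller; named
 * objects pass an absolute path such as "/myshm" instead of SHM_ANON.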
488e38aeffSJohn Baldwin */ 498e38aeffSJohn Baldwin 508e38aeffSJohn Baldwin #include <sys/cdefs.h> 518e38aeffSJohn Baldwin __FBSDID("$FreeBSD$"); 528e38aeffSJohn Baldwin 5312bc222eSJonathan Anderson #include "opt_capsicum.h" 54551a7895SRui Paulo #include "opt_ktrace.h" 5512bc222eSJonathan Anderson 568e38aeffSJohn Baldwin #include <sys/param.h> 574a144410SRobert Watson #include <sys/capsicum.h> 58610a2b3cSJohn Baldwin #include <sys/conf.h> 598e38aeffSJohn Baldwin #include <sys/fcntl.h> 608e38aeffSJohn Baldwin #include <sys/file.h> 618e38aeffSJohn Baldwin #include <sys/filedesc.h> 622b64ab22SMark Johnston #include <sys/filio.h> 638e38aeffSJohn Baldwin #include <sys/fnv_hash.h> 648e38aeffSJohn Baldwin #include <sys/kernel.h> 6591898857SMark Johnston #include <sys/limits.h> 66551a7895SRui Paulo #include <sys/uio.h> 67551a7895SRui Paulo #include <sys/signal.h> 68cc7b259aSJamie Gritton #include <sys/jail.h> 69551a7895SRui Paulo #include <sys/ktrace.h> 708e38aeffSJohn Baldwin #include <sys/lock.h> 718e38aeffSJohn Baldwin #include <sys/malloc.h> 728e38aeffSJohn Baldwin #include <sys/mman.h> 738e38aeffSJohn Baldwin #include <sys/mutex.h> 749c00bb91SKonstantin Belousov #include <sys/priv.h> 758e38aeffSJohn Baldwin #include <sys/proc.h> 768e38aeffSJohn Baldwin #include <sys/refcount.h> 778e38aeffSJohn Baldwin #include <sys/resourcevar.h> 7889f6b863SAttilio Rao #include <sys/rwlock.h> 7956d0e33eSKonstantin Belousov #include <sys/sbuf.h> 808e38aeffSJohn Baldwin #include <sys/stat.h> 817ee1b208SEd Schouten #include <sys/syscallsubr.h> 828e38aeffSJohn Baldwin #include <sys/sysctl.h> 838e38aeffSJohn Baldwin #include <sys/sysproto.h> 848e38aeffSJohn Baldwin #include <sys/systm.h> 858e38aeffSJohn Baldwin #include <sys/sx.h> 868e38aeffSJohn Baldwin #include <sys/time.h> 87d301b358SKonstantin Belousov #include <sys/vmmeter.h> 888e38aeffSJohn Baldwin #include <sys/vnode.h> 89940cb0e2SKonstantin Belousov #include <sys/unistd.h> 909696feebSJohn Baldwin #include <sys/user.h> 918e38aeffSJohn Baldwin 9215bcf785SRobert Watson #include <security/audit/audit.h> 938e38aeffSJohn Baldwin #include <security/mac/mac_framework.h> 948e38aeffSJohn Baldwin 958e38aeffSJohn Baldwin #include <vm/vm.h> 968e38aeffSJohn Baldwin #include <vm/vm_param.h> 978e38aeffSJohn Baldwin #include <vm/pmap.h> 98338e7cf2SJohn Baldwin #include <vm/vm_extern.h> 998e38aeffSJohn Baldwin #include <vm/vm_map.h> 100fb680e16SJohn Baldwin #include <vm/vm_kern.h> 1018e38aeffSJohn Baldwin #include <vm/vm_object.h> 1028e38aeffSJohn Baldwin #include <vm/vm_page.h> 1032971897dSAlan Cox #include <vm/vm_pageout.h> 1048e38aeffSJohn Baldwin #include <vm/vm_pager.h> 1058e38aeffSJohn Baldwin #include <vm/swap_pager.h> 1068e38aeffSJohn Baldwin 1078e38aeffSJohn Baldwin struct shm_mapping { 1088e38aeffSJohn Baldwin char *sm_path; 1098e38aeffSJohn Baldwin Fnv32_t sm_fnv; 1108e38aeffSJohn Baldwin struct shmfd *sm_shmfd; 1118e38aeffSJohn Baldwin LIST_ENTRY(shm_mapping) sm_link; 1128e38aeffSJohn Baldwin }; 1138e38aeffSJohn Baldwin 1148e38aeffSJohn Baldwin static MALLOC_DEFINE(M_SHMFD, "shmfd", "shared memory file descriptor"); 1158e38aeffSJohn Baldwin static LIST_HEAD(, shm_mapping) *shm_dictionary; 1168e38aeffSJohn Baldwin static struct sx shm_dict_lock; 1178e38aeffSJohn Baldwin static struct mtx shm_timestamp_lock; 1188e38aeffSJohn Baldwin static u_long shm_hash; 1197883ce1fSMateusz Guzik static struct unrhdr64 shm_ino_unr; 120610a2b3cSJohn Baldwin static dev_t shm_dev_ino; 1218e38aeffSJohn Baldwin 1228e38aeffSJohn Baldwin #define SHM_HASH(fnv) (&shm_dictionary[(fnv) & 
shm_hash]) 1238e38aeffSJohn Baldwin 1245be725d7SAndreas Tobler static void shm_init(void *arg); 1258e38aeffSJohn Baldwin static void shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd); 1268e38aeffSJohn Baldwin static struct shmfd *shm_lookup(char *path, Fnv32_t fnv); 1278e38aeffSJohn Baldwin static int shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred); 128*7060da62SJamie Gritton static void shm_doremove(struct shm_mapping *map); 129d301b358SKonstantin Belousov static int shm_dotruncate_cookie(struct shmfd *shmfd, off_t length, 130d301b358SKonstantin Belousov void *rl_cookie); 131af755d3eSKyle Evans static int shm_dotruncate_locked(struct shmfd *shmfd, off_t length, 132af755d3eSKyle Evans void *rl_cookie); 1332d5603feSDavid Bright static int shm_copyin_path(struct thread *td, const char *userpath_in, 1342d5603feSDavid Bright char **path_out); 135454bc887SKa Ho Ng static int shm_deallocate(struct shmfd *shmfd, off_t *offset, 136454bc887SKa Ho Ng off_t *length, int flags); 1378e38aeffSJohn Baldwin 1388e38aeffSJohn Baldwin static fo_rdwr_t shm_read; 1398e38aeffSJohn Baldwin static fo_rdwr_t shm_write; 1408e38aeffSJohn Baldwin static fo_truncate_t shm_truncate; 1412b64ab22SMark Johnston static fo_ioctl_t shm_ioctl; 1428e38aeffSJohn Baldwin static fo_stat_t shm_stat; 1438e38aeffSJohn Baldwin static fo_close_t shm_close; 1449c00bb91SKonstantin Belousov static fo_chmod_t shm_chmod; 1459c00bb91SKonstantin Belousov static fo_chown_t shm_chown; 146940cb0e2SKonstantin Belousov static fo_seek_t shm_seek; 1479696feebSJohn Baldwin static fo_fill_kinfo_t shm_fill_kinfo; 1487077c426SJohn Baldwin static fo_mmap_t shm_mmap; 149af755d3eSKyle Evans static fo_get_seals_t shm_get_seals; 150af755d3eSKyle Evans static fo_add_seals_t shm_add_seals; 151f1040532SKyle Evans static fo_fallocate_t shm_fallocate; 152454bc887SKa Ho Ng static fo_fspacectl_t shm_fspacectl; 1538e38aeffSJohn Baldwin 1548e38aeffSJohn Baldwin /* File descriptor operations. 
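 * These handlers are installed in the shm_ops fileops vector below;
 * operations with no meaningful shm semantics (poll, kqfilter) point at
 * the invfo_* stubs, and sendfile is serviced by vn_sendfile.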
*/ 1551bdbd705SKonstantin Belousov struct fileops shm_ops = { 1568e38aeffSJohn Baldwin .fo_read = shm_read, 1578e38aeffSJohn Baldwin .fo_write = shm_write, 1588e38aeffSJohn Baldwin .fo_truncate = shm_truncate, 1592b64ab22SMark Johnston .fo_ioctl = shm_ioctl, 1602d69d0dcSJohn Baldwin .fo_poll = invfo_poll, 1612d69d0dcSJohn Baldwin .fo_kqfilter = invfo_kqfilter, 1628e38aeffSJohn Baldwin .fo_stat = shm_stat, 1638e38aeffSJohn Baldwin .fo_close = shm_close, 1649c00bb91SKonstantin Belousov .fo_chmod = shm_chmod, 1659c00bb91SKonstantin Belousov .fo_chown = shm_chown, 166227aaa86SKonstantin Belousov .fo_sendfile = vn_sendfile, 167940cb0e2SKonstantin Belousov .fo_seek = shm_seek, 1689696feebSJohn Baldwin .fo_fill_kinfo = shm_fill_kinfo, 1697077c426SJohn Baldwin .fo_mmap = shm_mmap, 170af755d3eSKyle Evans .fo_get_seals = shm_get_seals, 171af755d3eSKyle Evans .fo_add_seals = shm_add_seals, 172f1040532SKyle Evans .fo_fallocate = shm_fallocate, 173454bc887SKa Ho Ng .fo_fspacectl = shm_fspacectl, 174d301b358SKonstantin Belousov .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE, 1758e38aeffSJohn Baldwin }; 1768e38aeffSJohn Baldwin 1778e38aeffSJohn Baldwin FEATURE(posix_shm, "POSIX shared memory"); 1788e38aeffSJohn Baldwin 179d301b358SKonstantin Belousov static SYSCTL_NODE(_vm, OID_AUTO, largepages, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 180d301b358SKonstantin Belousov ""); 181d301b358SKonstantin Belousov 182d301b358SKonstantin Belousov static int largepage_reclaim_tries = 1; 183d301b358SKonstantin Belousov SYSCTL_INT(_vm_largepages, OID_AUTO, reclaim_tries, 184d301b358SKonstantin Belousov CTLFLAG_RWTUN, &largepage_reclaim_tries, 0, 185d301b358SKonstantin Belousov "Number of contig reclaims before giving up for default alloc policy"); 186d301b358SKonstantin Belousov 1878e38aeffSJohn Baldwin static int 18841cf41fdSKonstantin Belousov uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio) 18941cf41fdSKonstantin Belousov { 19041cf41fdSKonstantin Belousov vm_page_t m; 19141cf41fdSKonstantin Belousov vm_pindex_t idx; 19241cf41fdSKonstantin Belousov size_t tlen; 19341cf41fdSKonstantin Belousov int error, offset, rv; 19441cf41fdSKonstantin Belousov 19541cf41fdSKonstantin Belousov idx = OFF_TO_IDX(uio->uio_offset); 19641cf41fdSKonstantin Belousov offset = uio->uio_offset & PAGE_MASK; 19741cf41fdSKonstantin Belousov tlen = MIN(PAGE_SIZE - offset, len); 19841cf41fdSKonstantin Belousov 199f72eaaebSJeff Roberson rv = vm_page_grab_valid_unlocked(&m, obj, idx, 200f72eaaebSJeff Roberson VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY | VM_ALLOC_NOCREAT); 201f72eaaebSJeff Roberson if (rv == VM_PAGER_OK) 202f72eaaebSJeff Roberson goto found; 20341cf41fdSKonstantin Belousov 20441cf41fdSKonstantin Belousov /* 2056311d7aaSWill Andrews * Read I/O without either a corresponding resident page or swap 2066311d7aaSWill Andrews * page: use zero_region. This is intended to avoid instantiating 2076311d7aaSWill Andrews * pages on read from a sparse region. 
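 * A read that falls entirely within such a hole therefore completes
 * without allocating any page; backing store is instantiated only once
 * the range is written.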
2086311d7aaSWill Andrews */ 209f72eaaebSJeff Roberson VM_OBJECT_WLOCK(obj); 210f72eaaebSJeff Roberson m = vm_page_lookup(obj, idx); 211f72eaaebSJeff Roberson if (uio->uio_rw == UIO_READ && m == NULL && 2126311d7aaSWill Andrews !vm_pager_has_page(obj, idx, NULL, NULL)) { 2136311d7aaSWill Andrews VM_OBJECT_WUNLOCK(obj); 214b9062c93SKonstantin Belousov return (uiomove(__DECONST(void *, zero_region), tlen, uio)); 2156311d7aaSWill Andrews } 2166311d7aaSWill Andrews 2176311d7aaSWill Andrews /* 21841cf41fdSKonstantin Belousov * Although the tmpfs vnode lock is held here, it is 21941cf41fdSKonstantin Belousov * nonetheless safe to sleep waiting for a free page. The 22041cf41fdSKonstantin Belousov * pageout daemon does not need to acquire the tmpfs vnode 22141cf41fdSKonstantin Belousov * lock to page out tobj's pages because tobj is a OBJT_SWAP 22241cf41fdSKonstantin Belousov * type object. 22341cf41fdSKonstantin Belousov */ 224c7575748SJeff Roberson rv = vm_page_grab_valid(&m, obj, idx, 225a8081778SJeff Roberson VM_ALLOC_NORMAL | VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY); 22641cf41fdSKonstantin Belousov if (rv != VM_PAGER_OK) { 22741cf41fdSKonstantin Belousov VM_OBJECT_WUNLOCK(obj); 228c7575748SJeff Roberson printf("uiomove_object: vm_obj %p idx %jd pager error %d\n", 229c7575748SJeff Roberson obj, idx, rv); 23041cf41fdSKonstantin Belousov return (EIO); 23141cf41fdSKonstantin Belousov } 23241cf41fdSKonstantin Belousov VM_OBJECT_WUNLOCK(obj); 233f72eaaebSJeff Roberson 234f72eaaebSJeff Roberson found: 23541cf41fdSKonstantin Belousov error = uiomove_fromphys(&m, offset, tlen, uio); 236a8081778SJeff Roberson if (uio->uio_rw == UIO_WRITE && error == 0) 237a8081778SJeff Roberson vm_page_set_dirty(m); 238d29f674fSJeff Roberson vm_page_activate(m); 239a8081778SJeff Roberson vm_page_sunbusy(m); 24041cf41fdSKonstantin Belousov 24141cf41fdSKonstantin Belousov return (error); 24241cf41fdSKonstantin Belousov } 24341cf41fdSKonstantin Belousov 24441cf41fdSKonstantin Belousov int 24541cf41fdSKonstantin Belousov uiomove_object(vm_object_t obj, off_t obj_size, struct uio *uio) 24641cf41fdSKonstantin Belousov { 24741cf41fdSKonstantin Belousov ssize_t resid; 24841cf41fdSKonstantin Belousov size_t len; 24941cf41fdSKonstantin Belousov int error; 25041cf41fdSKonstantin Belousov 25141cf41fdSKonstantin Belousov error = 0; 25241cf41fdSKonstantin Belousov while ((resid = uio->uio_resid) > 0) { 25341cf41fdSKonstantin Belousov if (obj_size <= uio->uio_offset) 25441cf41fdSKonstantin Belousov break; 25541cf41fdSKonstantin Belousov len = MIN(obj_size - uio->uio_offset, resid); 25641cf41fdSKonstantin Belousov if (len == 0) 25741cf41fdSKonstantin Belousov break; 25841cf41fdSKonstantin Belousov error = uiomove_object_page(obj, len, uio); 25941cf41fdSKonstantin Belousov if (error != 0 || resid == uio->uio_resid) 26041cf41fdSKonstantin Belousov break; 26141cf41fdSKonstantin Belousov } 26241cf41fdSKonstantin Belousov return (error); 26341cf41fdSKonstantin Belousov } 26441cf41fdSKonstantin Belousov 265d301b358SKonstantin Belousov static u_long count_largepages[MAXPAGESIZES]; 266d301b358SKonstantin Belousov 267d301b358SKonstantin Belousov static int 268d301b358SKonstantin Belousov shm_largepage_phys_populate(vm_object_t object, vm_pindex_t pidx, 269d301b358SKonstantin Belousov int fault_type, vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last) 270d301b358SKonstantin Belousov { 2713b5331ddSKonstantin Belousov vm_page_t m __diagused; 272d301b358SKonstantin Belousov int psind; 273d301b358SKonstantin Belousov 274d301b358SKonstantin 
Belousov psind = object->un_pager.phys.data_val; 275d301b358SKonstantin Belousov if (psind == 0 || pidx >= object->size) 276d301b358SKonstantin Belousov return (VM_PAGER_FAIL); 277d301b358SKonstantin Belousov *first = rounddown2(pidx, pagesizes[psind] / PAGE_SIZE); 278d301b358SKonstantin Belousov 279d301b358SKonstantin Belousov /* 280d301b358SKonstantin Belousov * We only busy the first page in the superpage run. It is 281d301b358SKonstantin Belousov * useless to busy the whole run since we only remove a full 282d301b358SKonstantin Belousov * superpage, and it takes too long to busy e.g. 512 * 512 == 283d301b358SKonstantin Belousov * 262144 pages constituting a 1G amd64 superpage. 284d301b358SKonstantin Belousov */ 285d301b358SKonstantin Belousov m = vm_page_grab(object, *first, VM_ALLOC_NORMAL | VM_ALLOC_NOCREAT); 286d301b358SKonstantin Belousov MPASS(m != NULL); 287d301b358SKonstantin Belousov 288d301b358SKonstantin Belousov *last = *first + atop(pagesizes[psind]) - 1; 289d301b358SKonstantin Belousov return (VM_PAGER_OK); 290d301b358SKonstantin Belousov } 291d301b358SKonstantin Belousov 292d301b358SKonstantin Belousov static boolean_t 293d301b358SKonstantin Belousov shm_largepage_phys_haspage(vm_object_t object, vm_pindex_t pindex, 294d301b358SKonstantin Belousov int *before, int *after) 295d301b358SKonstantin Belousov { 296d301b358SKonstantin Belousov int psind; 297d301b358SKonstantin Belousov 298d301b358SKonstantin Belousov psind = object->un_pager.phys.data_val; 299d301b358SKonstantin Belousov if (psind == 0 || pindex >= object->size) 300d301b358SKonstantin Belousov return (FALSE); 301d301b358SKonstantin Belousov if (before != NULL) { 302d301b358SKonstantin Belousov *before = pindex - rounddown2(pindex, pagesizes[psind] / 303d301b358SKonstantin Belousov PAGE_SIZE); 304d301b358SKonstantin Belousov } 305d301b358SKonstantin Belousov if (after != NULL) { 306d301b358SKonstantin Belousov *after = roundup2(pindex, pagesizes[psind] / PAGE_SIZE) - 307d301b358SKonstantin Belousov pindex; 308d301b358SKonstantin Belousov } 309d301b358SKonstantin Belousov return (TRUE); 310d301b358SKonstantin Belousov } 311d301b358SKonstantin Belousov 312d301b358SKonstantin Belousov static void 313d301b358SKonstantin Belousov shm_largepage_phys_ctor(vm_object_t object, vm_prot_t prot, 314d301b358SKonstantin Belousov vm_ooffset_t foff, struct ucred *cred) 315d301b358SKonstantin Belousov { 316d301b358SKonstantin Belousov } 317d301b358SKonstantin Belousov 318d301b358SKonstantin Belousov static void 319d301b358SKonstantin Belousov shm_largepage_phys_dtor(vm_object_t object) 320d301b358SKonstantin Belousov { 321d301b358SKonstantin Belousov int psind; 322d301b358SKonstantin Belousov 323d301b358SKonstantin Belousov psind = object->un_pager.phys.data_val; 324d301b358SKonstantin Belousov if (psind != 0) { 325d301b358SKonstantin Belousov atomic_subtract_long(&count_largepages[psind], 326d301b358SKonstantin Belousov object->size / (pagesizes[psind] / PAGE_SIZE)); 327d301b358SKonstantin Belousov vm_wire_sub(object->size); 328d301b358SKonstantin Belousov } else { 329d301b358SKonstantin Belousov KASSERT(object->size == 0, 330d301b358SKonstantin Belousov ("largepage phys obj %p not initialized but size %#jx > 0", 331d301b358SKonstantin Belousov object, (uintmax_t)object->size)); 332d301b358SKonstantin Belousov } 333d301b358SKonstantin Belousov } 334d301b358SKonstantin Belousov 335d474440aSKonstantin Belousov static const struct phys_pager_ops shm_largepage_phys_ops = { 336d301b358SKonstantin Belousov .phys_pg_populate =
shm_largepage_phys_populate, 337d301b358SKonstantin Belousov .phys_pg_haspage = shm_largepage_phys_haspage, 338d301b358SKonstantin Belousov .phys_pg_ctor = shm_largepage_phys_ctor, 339d301b358SKonstantin Belousov .phys_pg_dtor = shm_largepage_phys_dtor, 340d301b358SKonstantin Belousov }; 341d301b358SKonstantin Belousov 342d301b358SKonstantin Belousov bool 343d301b358SKonstantin Belousov shm_largepage(struct shmfd *shmfd) 344d301b358SKonstantin Belousov { 345d301b358SKonstantin Belousov return (shmfd->shm_object->type == OBJT_PHYS); 346d301b358SKonstantin Belousov } 347d301b358SKonstantin Belousov 34841cf41fdSKonstantin Belousov static int 349940cb0e2SKonstantin Belousov shm_seek(struct file *fp, off_t offset, int whence, struct thread *td) 350940cb0e2SKonstantin Belousov { 351940cb0e2SKonstantin Belousov struct shmfd *shmfd; 352940cb0e2SKonstantin Belousov off_t foffset; 353940cb0e2SKonstantin Belousov int error; 354940cb0e2SKonstantin Belousov 355940cb0e2SKonstantin Belousov shmfd = fp->f_data; 356940cb0e2SKonstantin Belousov foffset = foffset_lock(fp, 0); 357940cb0e2SKonstantin Belousov error = 0; 358940cb0e2SKonstantin Belousov switch (whence) { 359940cb0e2SKonstantin Belousov case L_INCR: 360940cb0e2SKonstantin Belousov if (foffset < 0 || 361940cb0e2SKonstantin Belousov (offset > 0 && foffset > OFF_MAX - offset)) { 362940cb0e2SKonstantin Belousov error = EOVERFLOW; 363940cb0e2SKonstantin Belousov break; 364940cb0e2SKonstantin Belousov } 365940cb0e2SKonstantin Belousov offset += foffset; 366940cb0e2SKonstantin Belousov break; 367940cb0e2SKonstantin Belousov case L_XTND: 368940cb0e2SKonstantin Belousov if (offset > 0 && shmfd->shm_size > OFF_MAX - offset) { 369940cb0e2SKonstantin Belousov error = EOVERFLOW; 370940cb0e2SKonstantin Belousov break; 371940cb0e2SKonstantin Belousov } 372940cb0e2SKonstantin Belousov offset += shmfd->shm_size; 373940cb0e2SKonstantin Belousov break; 374940cb0e2SKonstantin Belousov case L_SET: 375940cb0e2SKonstantin Belousov break; 376940cb0e2SKonstantin Belousov default: 377940cb0e2SKonstantin Belousov error = EINVAL; 378940cb0e2SKonstantin Belousov } 379940cb0e2SKonstantin Belousov if (error == 0) { 380940cb0e2SKonstantin Belousov if (offset < 0 || offset > shmfd->shm_size) 381940cb0e2SKonstantin Belousov error = EINVAL; 382940cb0e2SKonstantin Belousov else 3836f2b769cSJohn-Mark Gurney td->td_uretoff.tdu_off = offset; 384940cb0e2SKonstantin Belousov } 385940cb0e2SKonstantin Belousov foffset_unlock(fp, offset, error != 0 ? 
FOF_NOUPDATE : 0); 386940cb0e2SKonstantin Belousov return (error); 387940cb0e2SKonstantin Belousov } 388940cb0e2SKonstantin Belousov 389940cb0e2SKonstantin Belousov static int 3908e38aeffSJohn Baldwin shm_read(struct file *fp, struct uio *uio, struct ucred *active_cred, 3918e38aeffSJohn Baldwin int flags, struct thread *td) 3928e38aeffSJohn Baldwin { 393940cb0e2SKonstantin Belousov struct shmfd *shmfd; 394940cb0e2SKonstantin Belousov void *rl_cookie; 395940cb0e2SKonstantin Belousov int error; 3968e38aeffSJohn Baldwin 397940cb0e2SKonstantin Belousov shmfd = fp->f_data; 398940cb0e2SKonstantin Belousov #ifdef MAC 399940cb0e2SKonstantin Belousov error = mac_posixshm_check_read(active_cred, fp->f_cred, shmfd); 400940cb0e2SKonstantin Belousov if (error) 401940cb0e2SKonstantin Belousov return (error); 402940cb0e2SKonstantin Belousov #endif 4036ea906eeSJilles Tjoelker foffset_lock_uio(fp, uio, flags); 4046ea906eeSJilles Tjoelker rl_cookie = rangelock_rlock(&shmfd->shm_rl, uio->uio_offset, 4056ea906eeSJilles Tjoelker uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx); 406940cb0e2SKonstantin Belousov error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio); 407940cb0e2SKonstantin Belousov rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); 408940cb0e2SKonstantin Belousov foffset_unlock_uio(fp, uio, flags); 409940cb0e2SKonstantin Belousov return (error); 4108e38aeffSJohn Baldwin } 4118e38aeffSJohn Baldwin 4128e38aeffSJohn Baldwin static int 4138e38aeffSJohn Baldwin shm_write(struct file *fp, struct uio *uio, struct ucred *active_cred, 4148e38aeffSJohn Baldwin int flags, struct thread *td) 4158e38aeffSJohn Baldwin { 416940cb0e2SKonstantin Belousov struct shmfd *shmfd; 417940cb0e2SKonstantin Belousov void *rl_cookie; 418940cb0e2SKonstantin Belousov int error; 4193f07b9d9SKyle Evans off_t size; 4208e38aeffSJohn Baldwin 421940cb0e2SKonstantin Belousov shmfd = fp->f_data; 422940cb0e2SKonstantin Belousov #ifdef MAC 423940cb0e2SKonstantin Belousov error = mac_posixshm_check_write(active_cred, fp->f_cred, shmfd); 424940cb0e2SKonstantin Belousov if (error) 425940cb0e2SKonstantin Belousov return (error); 426940cb0e2SKonstantin Belousov #endif 427d301b358SKonstantin Belousov if (shm_largepage(shmfd) && shmfd->shm_lp_psind == 0) 428d301b358SKonstantin Belousov return (EINVAL); 429940cb0e2SKonstantin Belousov foffset_lock_uio(fp, uio, flags); 4303f07b9d9SKyle Evans if (uio->uio_resid > OFF_MAX - uio->uio_offset) { 4313f07b9d9SKyle Evans /* 4323f07b9d9SKyle Evans * Overflow is only an error if we're supposed to expand on 4333f07b9d9SKyle Evans * write. Otherwise, we'll just truncate the write to the 4343f07b9d9SKyle Evans * size of the file, which can only grow up to OFF_MAX. 
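 * For example, a write whose uio_offset + uio_resid would exceed
 * OFF_MAX fails with EFBIG when SHM_GROW_ON_WRITE is set, while
 * without the flag it proceeds and is simply limited to the current
 * shm_size by uiomove_object().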
4353f07b9d9SKyle Evans */ 4363f07b9d9SKyle Evans if ((shmfd->shm_flags & SHM_GROW_ON_WRITE) != 0) { 4373f07b9d9SKyle Evans foffset_unlock_uio(fp, uio, flags); 4383f07b9d9SKyle Evans return (EFBIG); 4393f07b9d9SKyle Evans } 4403f07b9d9SKyle Evans 4413f07b9d9SKyle Evans size = shmfd->shm_size; 4423f07b9d9SKyle Evans } else { 4433f07b9d9SKyle Evans size = uio->uio_offset + uio->uio_resid; 4443f07b9d9SKyle Evans } 445940cb0e2SKonstantin Belousov if ((flags & FOF_OFFSET) == 0) { 446940cb0e2SKonstantin Belousov rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX, 447940cb0e2SKonstantin Belousov &shmfd->shm_mtx); 448940cb0e2SKonstantin Belousov } else { 449940cb0e2SKonstantin Belousov rl_cookie = rangelock_wlock(&shmfd->shm_rl, uio->uio_offset, 4503f07b9d9SKyle Evans size, &shmfd->shm_mtx); 451940cb0e2SKonstantin Belousov } 4523f07b9d9SKyle Evans if ((shmfd->shm_seals & F_SEAL_WRITE) != 0) { 453af755d3eSKyle Evans error = EPERM; 4543f07b9d9SKyle Evans } else { 4553f07b9d9SKyle Evans error = 0; 4563f07b9d9SKyle Evans if ((shmfd->shm_flags & SHM_GROW_ON_WRITE) != 0 && 4573f07b9d9SKyle Evans size > shmfd->shm_size) { 45879783634SKonstantin Belousov error = shm_dotruncate_cookie(shmfd, size, rl_cookie); 4593f07b9d9SKyle Evans } 4603f07b9d9SKyle Evans if (error == 0) 4613f07b9d9SKyle Evans error = uiomove_object(shmfd->shm_object, 4623f07b9d9SKyle Evans shmfd->shm_size, uio); 4633f07b9d9SKyle Evans } 464940cb0e2SKonstantin Belousov rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); 465940cb0e2SKonstantin Belousov foffset_unlock_uio(fp, uio, flags); 466940cb0e2SKonstantin Belousov return (error); 4678e38aeffSJohn Baldwin } 4688e38aeffSJohn Baldwin 4698e38aeffSJohn Baldwin static int 4708e38aeffSJohn Baldwin shm_truncate(struct file *fp, off_t length, struct ucred *active_cred, 4718e38aeffSJohn Baldwin struct thread *td) 4728e38aeffSJohn Baldwin { 4738e38aeffSJohn Baldwin struct shmfd *shmfd; 4748e38aeffSJohn Baldwin #ifdef MAC 4758e38aeffSJohn Baldwin int error; 4768e38aeffSJohn Baldwin #endif 4778e38aeffSJohn Baldwin 4788e38aeffSJohn Baldwin shmfd = fp->f_data; 4798e38aeffSJohn Baldwin #ifdef MAC 4808e38aeffSJohn Baldwin error = mac_posixshm_check_truncate(active_cred, fp->f_cred, shmfd); 4818e38aeffSJohn Baldwin if (error) 4828e38aeffSJohn Baldwin return (error); 4838e38aeffSJohn Baldwin #endif 4843364c323SKonstantin Belousov return (shm_dotruncate(shmfd, length)); 4858e38aeffSJohn Baldwin } 4868e38aeffSJohn Baldwin 4872b64ab22SMark Johnston int 4882b64ab22SMark Johnston shm_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, 4892b64ab22SMark Johnston struct thread *td) 4902b64ab22SMark Johnston { 491d301b358SKonstantin Belousov struct shmfd *shmfd; 492d301b358SKonstantin Belousov struct shm_largepage_conf *conf; 493d301b358SKonstantin Belousov void *rl_cookie; 4942b64ab22SMark Johnston 495d301b358SKonstantin Belousov shmfd = fp->f_data; 4962b64ab22SMark Johnston switch (com) { 4972b64ab22SMark Johnston case FIONBIO: 4982b64ab22SMark Johnston case FIOASYNC: 4992b64ab22SMark Johnston /* 5002b64ab22SMark Johnston * Allow fcntl(fd, F_SETFL, O_NONBLOCK) to work, 5012b64ab22SMark Johnston * just like it would on an unlinked regular file 5022b64ab22SMark Johnston */ 5032b64ab22SMark Johnston return (0); 504d301b358SKonstantin Belousov case FIOSSHMLPGCNF: 505d301b358SKonstantin Belousov if (!shm_largepage(shmfd)) 506d301b358SKonstantin Belousov return (ENOTTY); 507d301b358SKonstantin Belousov conf = data; 508d301b358SKonstantin Belousov if (shmfd->shm_lp_psind != 0 && 
509d301b358SKonstantin Belousov conf->psind != shmfd->shm_lp_psind) 510d301b358SKonstantin Belousov return (EINVAL); 511d301b358SKonstantin Belousov if (conf->psind <= 0 || conf->psind >= MAXPAGESIZES || 512d301b358SKonstantin Belousov pagesizes[conf->psind] == 0) 513d301b358SKonstantin Belousov return (EINVAL); 514d301b358SKonstantin Belousov if (conf->alloc_policy != SHM_LARGEPAGE_ALLOC_DEFAULT && 515d301b358SKonstantin Belousov conf->alloc_policy != SHM_LARGEPAGE_ALLOC_NOWAIT && 516d301b358SKonstantin Belousov conf->alloc_policy != SHM_LARGEPAGE_ALLOC_HARD) 517d301b358SKonstantin Belousov return (EINVAL); 518d301b358SKonstantin Belousov 519d301b358SKonstantin Belousov rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX, 520d301b358SKonstantin Belousov &shmfd->shm_mtx); 521d301b358SKonstantin Belousov shmfd->shm_lp_psind = conf->psind; 522d301b358SKonstantin Belousov shmfd->shm_lp_alloc_policy = conf->alloc_policy; 523d301b358SKonstantin Belousov shmfd->shm_object->un_pager.phys.data_val = conf->psind; 524d301b358SKonstantin Belousov rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); 525d301b358SKonstantin Belousov return (0); 526d301b358SKonstantin Belousov case FIOGSHMLPGCNF: 527d301b358SKonstantin Belousov if (!shm_largepage(shmfd)) 528d301b358SKonstantin Belousov return (ENOTTY); 529d301b358SKonstantin Belousov conf = data; 530d301b358SKonstantin Belousov rl_cookie = rangelock_rlock(&shmfd->shm_rl, 0, OFF_MAX, 531d301b358SKonstantin Belousov &shmfd->shm_mtx); 532d301b358SKonstantin Belousov conf->psind = shmfd->shm_lp_psind; 533d301b358SKonstantin Belousov conf->alloc_policy = shmfd->shm_lp_alloc_policy; 534d301b358SKonstantin Belousov rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); 535d301b358SKonstantin Belousov return (0); 5362b64ab22SMark Johnston default: 5372b64ab22SMark Johnston return (ENOTTY); 5382b64ab22SMark Johnston } 5392b64ab22SMark Johnston } 5402b64ab22SMark Johnston 5418e38aeffSJohn Baldwin static int 5422b68eb8eSMateusz Guzik shm_stat(struct file *fp, struct stat *sb, struct ucred *active_cred) 5438e38aeffSJohn Baldwin { 5448e38aeffSJohn Baldwin struct shmfd *shmfd; 5458e38aeffSJohn Baldwin #ifdef MAC 5468e38aeffSJohn Baldwin int error; 5478e38aeffSJohn Baldwin #endif 5488e38aeffSJohn Baldwin 5498e38aeffSJohn Baldwin shmfd = fp->f_data; 5508e38aeffSJohn Baldwin 5518e38aeffSJohn Baldwin #ifdef MAC 5528e38aeffSJohn Baldwin error = mac_posixshm_check_stat(active_cred, fp->f_cred, shmfd); 5538e38aeffSJohn Baldwin if (error) 5548e38aeffSJohn Baldwin return (error); 5558e38aeffSJohn Baldwin #endif 5568e38aeffSJohn Baldwin 5578e38aeffSJohn Baldwin /* 5588e38aeffSJohn Baldwin * Attempt to return sanish values for fstat() on a memory file 5598e38aeffSJohn Baldwin * descriptor. 5608e38aeffSJohn Baldwin */ 5618e38aeffSJohn Baldwin bzero(sb, sizeof(*sb)); 5628e38aeffSJohn Baldwin sb->st_blksize = PAGE_SIZE; 5638e38aeffSJohn Baldwin sb->st_size = shmfd->shm_size; 56455e0987aSPedro F. 
Giffuni sb->st_blocks = howmany(sb->st_size, sb->st_blksize); 5659c00bb91SKonstantin Belousov mtx_lock(&shm_timestamp_lock); 566510ea843SEd Schouten sb->st_atim = shmfd->shm_atime; 567510ea843SEd Schouten sb->st_ctim = shmfd->shm_ctime; 568510ea843SEd Schouten sb->st_mtim = shmfd->shm_mtime; 569510ea843SEd Schouten sb->st_birthtim = shmfd->shm_birthtime; 5709c00bb91SKonstantin Belousov sb->st_mode = S_IFREG | shmfd->shm_mode; /* XXX */ 5718e38aeffSJohn Baldwin sb->st_uid = shmfd->shm_uid; 5728e38aeffSJohn Baldwin sb->st_gid = shmfd->shm_gid; 5739c00bb91SKonstantin Belousov mtx_unlock(&shm_timestamp_lock); 574610a2b3cSJohn Baldwin sb->st_dev = shm_dev_ino; 575610a2b3cSJohn Baldwin sb->st_ino = shmfd->shm_ino; 576e4b77548SKonstantin Belousov sb->st_nlink = shmfd->shm_object->ref_count; 577d301b358SKonstantin Belousov sb->st_blocks = shmfd->shm_object->size / 578d301b358SKonstantin Belousov (pagesizes[shmfd->shm_lp_psind] >> PAGE_SHIFT); 5798e38aeffSJohn Baldwin 5808e38aeffSJohn Baldwin return (0); 5818e38aeffSJohn Baldwin } 5828e38aeffSJohn Baldwin 5838e38aeffSJohn Baldwin static int 5848e38aeffSJohn Baldwin shm_close(struct file *fp, struct thread *td) 5858e38aeffSJohn Baldwin { 5868e38aeffSJohn Baldwin struct shmfd *shmfd; 5878e38aeffSJohn Baldwin 5888e38aeffSJohn Baldwin shmfd = fp->f_data; 5898e38aeffSJohn Baldwin fp->f_data = NULL; 5908e38aeffSJohn Baldwin shm_drop(shmfd); 5918e38aeffSJohn Baldwin 5928e38aeffSJohn Baldwin return (0); 5938e38aeffSJohn Baldwin } 5948e38aeffSJohn Baldwin 595af755d3eSKyle Evans static int 5962d5603feSDavid Bright shm_copyin_path(struct thread *td, const char *userpath_in, char **path_out) { 5972d5603feSDavid Bright int error; 5982d5603feSDavid Bright char *path; 5992d5603feSDavid Bright const char *pr_path; 6002d5603feSDavid Bright size_t pr_pathlen; 6012d5603feSDavid Bright 6022d5603feSDavid Bright path = malloc(MAXPATHLEN, M_SHMFD, M_WAITOK); 6032d5603feSDavid Bright pr_path = td->td_ucred->cr_prison->pr_path; 6042d5603feSDavid Bright 6052d5603feSDavid Bright /* Construct a full pathname for jailed callers. */ 6062d5603feSDavid Bright pr_pathlen = strcmp(pr_path, "/") == 6072d5603feSDavid Bright 0 ? 0 : strlcpy(path, pr_path, MAXPATHLEN); 6082d5603feSDavid Bright error = copyinstr(userpath_in, path + pr_pathlen, 6092d5603feSDavid Bright MAXPATHLEN - pr_pathlen, NULL); 6102d5603feSDavid Bright if (error != 0) 6112d5603feSDavid Bright goto out; 6122d5603feSDavid Bright 6132d5603feSDavid Bright #ifdef KTRACE 6142d5603feSDavid Bright if (KTRPOINT(curthread, KTR_NAMEI)) 6152d5603feSDavid Bright ktrnamei(path); 6162d5603feSDavid Bright #endif 6172d5603feSDavid Bright 6182d5603feSDavid Bright /* Require paths to start with a '/' character. 
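 * For instance (hypothetical paths), a caller jailed with pr_path
 * "/jails/www" passing "/shm0" has the name stored internally as
 * "/jails/www/shm0", while an unjailed caller (pr_path "/") keeps
 * "/shm0" unchanged.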
*/ 6192d5603feSDavid Bright if (path[pr_pathlen] != '/') { 6202d5603feSDavid Bright error = EINVAL; 6212d5603feSDavid Bright goto out; 6222d5603feSDavid Bright } 6232d5603feSDavid Bright 6242d5603feSDavid Bright *path_out = path; 6252d5603feSDavid Bright 6262d5603feSDavid Bright out: 6272d5603feSDavid Bright if (error != 0) 6282d5603feSDavid Bright free(path, M_SHMFD); 6292d5603feSDavid Bright 6302d5603feSDavid Bright return (error); 6312d5603feSDavid Bright } 6322d5603feSDavid Bright 6332d5603feSDavid Bright static int 634454bc887SKa Ho Ng shm_partial_page_invalidate(vm_object_t object, vm_pindex_t idx, int base, 635454bc887SKa Ho Ng int end) 636454bc887SKa Ho Ng { 637454bc887SKa Ho Ng vm_page_t m; 638454bc887SKa Ho Ng int rv; 639454bc887SKa Ho Ng 640454bc887SKa Ho Ng VM_OBJECT_ASSERT_WLOCKED(object); 641454bc887SKa Ho Ng KASSERT(base >= 0, ("%s: base %d", __func__, base)); 642454bc887SKa Ho Ng KASSERT(end - base <= PAGE_SIZE, ("%s: base %d end %d", __func__, base, 643454bc887SKa Ho Ng end)); 644454bc887SKa Ho Ng 645454bc887SKa Ho Ng retry: 646454bc887SKa Ho Ng m = vm_page_grab(object, idx, VM_ALLOC_NOCREAT); 647454bc887SKa Ho Ng if (m != NULL) { 648454bc887SKa Ho Ng MPASS(vm_page_all_valid(m)); 649454bc887SKa Ho Ng } else if (vm_pager_has_page(object, idx, NULL, NULL)) { 650454bc887SKa Ho Ng m = vm_page_alloc(object, idx, 651454bc887SKa Ho Ng VM_ALLOC_NORMAL | VM_ALLOC_WAITFAIL); 652454bc887SKa Ho Ng if (m == NULL) 653454bc887SKa Ho Ng goto retry; 654454bc887SKa Ho Ng vm_object_pip_add(object, 1); 655454bc887SKa Ho Ng VM_OBJECT_WUNLOCK(object); 656454bc887SKa Ho Ng rv = vm_pager_get_pages(object, &m, 1, NULL, NULL); 657454bc887SKa Ho Ng VM_OBJECT_WLOCK(object); 658454bc887SKa Ho Ng vm_object_pip_wakeup(object); 659454bc887SKa Ho Ng if (rv == VM_PAGER_OK) { 660454bc887SKa Ho Ng /* 661454bc887SKa Ho Ng * Since the page was not resident, and therefore not 662454bc887SKa Ho Ng * recently accessed, immediately enqueue it for 663454bc887SKa Ho Ng * asynchronous laundering. The current operation is 664454bc887SKa Ho Ng * not regarded as an access. 
665454bc887SKa Ho Ng */ 666454bc887SKa Ho Ng vm_page_launder(m); 667454bc887SKa Ho Ng } else { 668454bc887SKa Ho Ng vm_page_free(m); 669454bc887SKa Ho Ng VM_OBJECT_WUNLOCK(object); 670454bc887SKa Ho Ng return (EIO); 671454bc887SKa Ho Ng } 672454bc887SKa Ho Ng } 673454bc887SKa Ho Ng if (m != NULL) { 674454bc887SKa Ho Ng pmap_zero_page_area(m, base, end - base); 675454bc887SKa Ho Ng KASSERT(vm_page_all_valid(m), ("%s: page %p is invalid", 676454bc887SKa Ho Ng __func__, m)); 677454bc887SKa Ho Ng vm_page_set_dirty(m); 678454bc887SKa Ho Ng vm_page_xunbusy(m); 679454bc887SKa Ho Ng } 680454bc887SKa Ho Ng 681454bc887SKa Ho Ng return (0); 682454bc887SKa Ho Ng } 683454bc887SKa Ho Ng 684454bc887SKa Ho Ng static int 685af755d3eSKyle Evans shm_dotruncate_locked(struct shmfd *shmfd, off_t length, void *rl_cookie) 6868e38aeffSJohn Baldwin { 6878e38aeffSJohn Baldwin vm_object_t object; 688454bc887SKa Ho Ng vm_pindex_t nobjsize; 6893364c323SKonstantin Belousov vm_ooffset_t delta; 690454bc887SKa Ho Ng int base, error; 6918e38aeffSJohn Baldwin 6922a016de1SAlan Cox KASSERT(length >= 0, ("shm_dotruncate: length < 0")); 6938e38aeffSJohn Baldwin object = shmfd->shm_object; 694af755d3eSKyle Evans VM_OBJECT_ASSERT_WLOCKED(object); 695af755d3eSKyle Evans rangelock_cookie_assert(rl_cookie, RA_WLOCKED); 696af755d3eSKyle Evans if (length == shmfd->shm_size) 6973364c323SKonstantin Belousov return (0); 6988e38aeffSJohn Baldwin nobjsize = OFF_TO_IDX(length + PAGE_MASK); 6998e38aeffSJohn Baldwin 7008e38aeffSJohn Baldwin /* Are we shrinking? If so, trim the end. */ 7018e38aeffSJohn Baldwin if (length < shmfd->shm_size) { 702af755d3eSKyle Evans if ((shmfd->shm_seals & F_SEAL_SHRINK) != 0) 703af755d3eSKyle Evans return (EPERM); 704af755d3eSKyle Evans 705fb680e16SJohn Baldwin /* 706fb680e16SJohn Baldwin * Disallow any requests to shrink the size if this 707fb680e16SJohn Baldwin * object is mapped into the kernel. 708fb680e16SJohn Baldwin */ 709af755d3eSKyle Evans if (shmfd->shm_kmappings > 0) 710fb680e16SJohn Baldwin return (EBUSY); 7112971897dSAlan Cox 7122971897dSAlan Cox /* 7132971897dSAlan Cox * Zero the truncated part of the last page. 7142971897dSAlan Cox */ 7152971897dSAlan Cox base = length & PAGE_MASK; 7162971897dSAlan Cox if (base != 0) { 717454bc887SKa Ho Ng error = shm_partial_page_invalidate(object, 718454bc887SKa Ho Ng OFF_TO_IDX(length), base, PAGE_SIZE); 719454bc887SKa Ho Ng if (error) 720454bc887SKa Ho Ng return (error); 7212971897dSAlan Cox } 7222a016de1SAlan Cox delta = IDX_TO_OFF(object->size - nobjsize); 7233364c323SKonstantin Belousov 7248e38aeffSJohn Baldwin if (nobjsize < object->size) 7258e38aeffSJohn Baldwin vm_object_page_remove(object, nobjsize, object->size, 7266bbee8e2SAlan Cox 0); 7278e38aeffSJohn Baldwin 7283364c323SKonstantin Belousov /* Free the swap accounted for shm */ 729ef694c1aSEdward Tomasz Napierala swap_release_by_cred(delta, object->cred); 7303364c323SKonstantin Belousov object->charge -= delta; 7313364c323SKonstantin Belousov } else { 732af755d3eSKyle Evans if ((shmfd->shm_seals & F_SEAL_GROW) != 0) 733af755d3eSKyle Evans return (EPERM); 734af755d3eSKyle Evans 7352a016de1SAlan Cox /* Try to reserve additional swap space. 
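 * The delta computed below is charged to object->cred and recorded in
 * object->charge; the shrink path above releases the same amount via
 * swap_release_by_cred(), keeping the per-uid swap accounting balanced.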
*/ 7362a016de1SAlan Cox delta = IDX_TO_OFF(nobjsize - object->size); 737af755d3eSKyle Evans if (!swap_reserve_by_cred(delta, object->cred)) 7383364c323SKonstantin Belousov return (ENOMEM); 7393364c323SKonstantin Belousov object->charge += delta; 7408e38aeffSJohn Baldwin } 7418e38aeffSJohn Baldwin shmfd->shm_size = length; 7428e38aeffSJohn Baldwin mtx_lock(&shm_timestamp_lock); 7438e38aeffSJohn Baldwin vfs_timestamp(&shmfd->shm_ctime); 7448e38aeffSJohn Baldwin shmfd->shm_mtime = shmfd->shm_ctime; 7458e38aeffSJohn Baldwin mtx_unlock(&shm_timestamp_lock); 7468e38aeffSJohn Baldwin object->size = nobjsize; 7473364c323SKonstantin Belousov return (0); 7488e38aeffSJohn Baldwin } 7498e38aeffSJohn Baldwin 750d301b358SKonstantin Belousov static int 751d301b358SKonstantin Belousov shm_dotruncate_largepage(struct shmfd *shmfd, off_t length, void *rl_cookie) 752d301b358SKonstantin Belousov { 753d301b358SKonstantin Belousov vm_object_t object; 754d301b358SKonstantin Belousov vm_page_t m; 7553b5331ddSKonstantin Belousov vm_pindex_t newobjsz; 7563b5331ddSKonstantin Belousov vm_pindex_t oldobjsz __unused; 757d301b358SKonstantin Belousov int aflags, error, i, psind, try; 758d301b358SKonstantin Belousov 759d301b358SKonstantin Belousov KASSERT(length >= 0, ("shm_dotruncate: length < 0")); 760d301b358SKonstantin Belousov object = shmfd->shm_object; 761d301b358SKonstantin Belousov VM_OBJECT_ASSERT_WLOCKED(object); 762d301b358SKonstantin Belousov rangelock_cookie_assert(rl_cookie, RA_WLOCKED); 763d301b358SKonstantin Belousov 764d301b358SKonstantin Belousov oldobjsz = object->size; 765d301b358SKonstantin Belousov newobjsz = OFF_TO_IDX(length); 766d301b358SKonstantin Belousov if (length == shmfd->shm_size) 767d301b358SKonstantin Belousov return (0); 768d301b358SKonstantin Belousov psind = shmfd->shm_lp_psind; 769d301b358SKonstantin Belousov if (psind == 0 && length != 0) 770d301b358SKonstantin Belousov return (EINVAL); 771d301b358SKonstantin Belousov if ((length & (pagesizes[psind] - 1)) != 0) 772d301b358SKonstantin Belousov return (EINVAL); 773d301b358SKonstantin Belousov 774d301b358SKonstantin Belousov if (length < shmfd->shm_size) { 775d301b358SKonstantin Belousov if ((shmfd->shm_seals & F_SEAL_SHRINK) != 0) 776d301b358SKonstantin Belousov return (EPERM); 777d301b358SKonstantin Belousov if (shmfd->shm_kmappings > 0) 778d301b358SKonstantin Belousov return (EBUSY); 779d301b358SKonstantin Belousov return (ENOTSUP); /* Pages are unmanaged. 
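 * Largepage objects are backed by wired, unmanaged pages (note the
 * vm_wire_add()/vm_wire_sub() bookkeeping in the grow loop below and in
 * the phys pager destructor), so the ordinary vm_object_page_remove()
 * shrink path kept under #if 0 below cannot be reused as-is.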
*/ 780d301b358SKonstantin Belousov #if 0 781d301b358SKonstantin Belousov vm_object_page_remove(object, newobjsz, oldobjsz, 0); 782d301b358SKonstantin Belousov object->size = newobjsz; 783d301b358SKonstantin Belousov shmfd->shm_size = length; 784d301b358SKonstantin Belousov return (0); 785d301b358SKonstantin Belousov #endif 786d301b358SKonstantin Belousov } 787d301b358SKonstantin Belousov 78879783634SKonstantin Belousov if ((shmfd->shm_seals & F_SEAL_GROW) != 0) 78979783634SKonstantin Belousov return (EPERM); 79079783634SKonstantin Belousov 791d301b358SKonstantin Belousov aflags = VM_ALLOC_NORMAL | VM_ALLOC_ZERO; 792d301b358SKonstantin Belousov if (shmfd->shm_lp_alloc_policy == SHM_LARGEPAGE_ALLOC_NOWAIT) 793d301b358SKonstantin Belousov aflags |= VM_ALLOC_WAITFAIL; 794d301b358SKonstantin Belousov try = 0; 795d301b358SKonstantin Belousov 796d301b358SKonstantin Belousov /* 797d301b358SKonstantin Belousov * Extend shmfd and object, keeping all already fully 798d301b358SKonstantin Belousov * allocated large pages intact even on error, because the dropped 799d301b358SKonstantin Belousov * object lock might have allowed mapping of them. 800d301b358SKonstantin Belousov */ 801d301b358SKonstantin Belousov while (object->size < newobjsz) { 802d301b358SKonstantin Belousov m = vm_page_alloc_contig(object, object->size, aflags, 803d301b358SKonstantin Belousov pagesizes[psind] / PAGE_SIZE, 0, ~0, 804d301b358SKonstantin Belousov pagesizes[psind], 0, 805d301b358SKonstantin Belousov VM_MEMATTR_DEFAULT); 806d301b358SKonstantin Belousov if (m == NULL) { 807d301b358SKonstantin Belousov VM_OBJECT_WUNLOCK(object); 808d301b358SKonstantin Belousov if (shmfd->shm_lp_alloc_policy == 809d301b358SKonstantin Belousov SHM_LARGEPAGE_ALLOC_NOWAIT || 810d301b358SKonstantin Belousov (shmfd->shm_lp_alloc_policy == 811d301b358SKonstantin Belousov SHM_LARGEPAGE_ALLOC_DEFAULT && 812d301b358SKonstantin Belousov try >= largepage_reclaim_tries)) { 813d301b358SKonstantin Belousov VM_OBJECT_WLOCK(object); 814d301b358SKonstantin Belousov return (ENOMEM); 815d301b358SKonstantin Belousov } 816d301b358SKonstantin Belousov error = vm_page_reclaim_contig(aflags, 817d301b358SKonstantin Belousov pagesizes[psind] / PAGE_SIZE, 0, ~0, 818d301b358SKonstantin Belousov pagesizes[psind], 0) ?
0 : 819d301b358SKonstantin Belousov vm_wait_intr(object); 820d301b358SKonstantin Belousov if (error != 0) { 821d301b358SKonstantin Belousov VM_OBJECT_WLOCK(object); 822d301b358SKonstantin Belousov return (error); 823d301b358SKonstantin Belousov } 824d301b358SKonstantin Belousov try++; 825d301b358SKonstantin Belousov VM_OBJECT_WLOCK(object); 826d301b358SKonstantin Belousov continue; 827d301b358SKonstantin Belousov } 828d301b358SKonstantin Belousov try = 0; 829d301b358SKonstantin Belousov for (i = 0; i < pagesizes[psind] / PAGE_SIZE; i++) { 830d301b358SKonstantin Belousov if ((m[i].flags & PG_ZERO) == 0) 831d301b358SKonstantin Belousov pmap_zero_page(&m[i]); 832d301b358SKonstantin Belousov vm_page_valid(&m[i]); 833d301b358SKonstantin Belousov vm_page_xunbusy(&m[i]); 834d301b358SKonstantin Belousov } 835d301b358SKonstantin Belousov object->size += OFF_TO_IDX(pagesizes[psind]); 836d301b358SKonstantin Belousov shmfd->shm_size += pagesizes[psind]; 837d301b358SKonstantin Belousov atomic_add_long(&count_largepages[psind], 1); 838d301b358SKonstantin Belousov vm_wire_add(atop(pagesizes[psind])); 839d301b358SKonstantin Belousov } 840d301b358SKonstantin Belousov return (0); 841d301b358SKonstantin Belousov } 842d301b358SKonstantin Belousov 843d301b358SKonstantin Belousov static int 844d301b358SKonstantin Belousov shm_dotruncate_cookie(struct shmfd *shmfd, off_t length, void *rl_cookie) 845d301b358SKonstantin Belousov { 846d301b358SKonstantin Belousov int error; 847d301b358SKonstantin Belousov 848d301b358SKonstantin Belousov VM_OBJECT_WLOCK(shmfd->shm_object); 849d301b358SKonstantin Belousov error = shm_largepage(shmfd) ? shm_dotruncate_largepage(shmfd, 850d301b358SKonstantin Belousov length, rl_cookie) : shm_dotruncate_locked(shmfd, length, 851d301b358SKonstantin Belousov rl_cookie); 852d301b358SKonstantin Belousov VM_OBJECT_WUNLOCK(shmfd->shm_object); 853d301b358SKonstantin Belousov return (error); 854d301b358SKonstantin Belousov } 855d301b358SKonstantin Belousov 856af755d3eSKyle Evans int 857af755d3eSKyle Evans shm_dotruncate(struct shmfd *shmfd, off_t length) 858af755d3eSKyle Evans { 859af755d3eSKyle Evans void *rl_cookie; 860af755d3eSKyle Evans int error; 861af755d3eSKyle Evans 862af755d3eSKyle Evans rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX, 863af755d3eSKyle Evans &shmfd->shm_mtx); 864d301b358SKonstantin Belousov error = shm_dotruncate_cookie(shmfd, length, rl_cookie); 865af755d3eSKyle Evans rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); 866af755d3eSKyle Evans return (error); 867af755d3eSKyle Evans } 868af755d3eSKyle Evans 8698e38aeffSJohn Baldwin /* 8708e38aeffSJohn Baldwin * shmfd object management including creation and reference counting 8718e38aeffSJohn Baldwin * routines. 
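 * shm_alloc() returns an object holding a single reference;
 * shm_hold()/shm_drop() adjust that count, the path dictionary takes
 * its own reference via shm_insert(), and the final shm_drop() tears
 * down the range lock, the mutex and the backing VM object.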
8728e38aeffSJohn Baldwin */ 8731bdbd705SKonstantin Belousov struct shmfd * 874d301b358SKonstantin Belousov shm_alloc(struct ucred *ucred, mode_t mode, bool largepage) 8758e38aeffSJohn Baldwin { 8768e38aeffSJohn Baldwin struct shmfd *shmfd; 8778e38aeffSJohn Baldwin 8788e38aeffSJohn Baldwin shmfd = malloc(sizeof(*shmfd), M_SHMFD, M_WAITOK | M_ZERO); 8798e38aeffSJohn Baldwin shmfd->shm_size = 0; 8808e38aeffSJohn Baldwin shmfd->shm_uid = ucred->cr_uid; 8818e38aeffSJohn Baldwin shmfd->shm_gid = ucred->cr_gid; 8828e38aeffSJohn Baldwin shmfd->shm_mode = mode; 883d301b358SKonstantin Belousov if (largepage) { 884d301b358SKonstantin Belousov shmfd->shm_object = phys_pager_allocate(NULL, 885d301b358SKonstantin Belousov &shm_largepage_phys_ops, NULL, shmfd->shm_size, 886d301b358SKonstantin Belousov VM_PROT_DEFAULT, 0, ucred); 887d301b358SKonstantin Belousov shmfd->shm_lp_alloc_policy = SHM_LARGEPAGE_ALLOC_DEFAULT; 888d301b358SKonstantin Belousov } else { 88932287ea7SKyle Evans shmfd->shm_object = vm_pager_allocate(OBJT_SWAP, NULL, 8903364c323SKonstantin Belousov shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred); 891d301b358SKonstantin Belousov } 8928e38aeffSJohn Baldwin KASSERT(shmfd->shm_object != NULL, ("shm_create: vm_pager_allocate")); 8938e38aeffSJohn Baldwin vfs_timestamp(&shmfd->shm_birthtime); 8948e38aeffSJohn Baldwin shmfd->shm_atime = shmfd->shm_mtime = shmfd->shm_ctime = 8958e38aeffSJohn Baldwin shmfd->shm_birthtime; 8967883ce1fSMateusz Guzik shmfd->shm_ino = alloc_unr64(&shm_ino_unr); 8978e38aeffSJohn Baldwin refcount_init(&shmfd->shm_refs, 1); 898940cb0e2SKonstantin Belousov mtx_init(&shmfd->shm_mtx, "shmrl", NULL, MTX_DEF); 899940cb0e2SKonstantin Belousov rangelock_init(&shmfd->shm_rl); 9008e38aeffSJohn Baldwin #ifdef MAC 9018e38aeffSJohn Baldwin mac_posixshm_init(shmfd); 9028e38aeffSJohn Baldwin mac_posixshm_create(ucred, shmfd); 9038e38aeffSJohn Baldwin #endif 9048e38aeffSJohn Baldwin 9058e38aeffSJohn Baldwin return (shmfd); 9068e38aeffSJohn Baldwin } 9078e38aeffSJohn Baldwin 9081bdbd705SKonstantin Belousov struct shmfd * 9098e38aeffSJohn Baldwin shm_hold(struct shmfd *shmfd) 9108e38aeffSJohn Baldwin { 9118e38aeffSJohn Baldwin 9128e38aeffSJohn Baldwin refcount_acquire(&shmfd->shm_refs); 9138e38aeffSJohn Baldwin return (shmfd); 9148e38aeffSJohn Baldwin } 9158e38aeffSJohn Baldwin 9161bdbd705SKonstantin Belousov void 9178e38aeffSJohn Baldwin shm_drop(struct shmfd *shmfd) 9188e38aeffSJohn Baldwin { 9198e38aeffSJohn Baldwin 9208e38aeffSJohn Baldwin if (refcount_release(&shmfd->shm_refs)) { 9218e38aeffSJohn Baldwin #ifdef MAC 9228e38aeffSJohn Baldwin mac_posixshm_destroy(shmfd); 9238e38aeffSJohn Baldwin #endif 924940cb0e2SKonstantin Belousov rangelock_destroy(&shmfd->shm_rl); 925940cb0e2SKonstantin Belousov mtx_destroy(&shmfd->shm_mtx); 9268e38aeffSJohn Baldwin vm_object_deallocate(shmfd->shm_object); 9278e38aeffSJohn Baldwin free(shmfd, M_SHMFD); 9288e38aeffSJohn Baldwin } 9298e38aeffSJohn Baldwin } 9308e38aeffSJohn Baldwin 9318e38aeffSJohn Baldwin /* 9328e38aeffSJohn Baldwin * Determine if the credentials have sufficient permissions for a 9338e38aeffSJohn Baldwin * specified combination of FREAD and FWRITE. 
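 * FREAD maps to VREAD and FWRITE to VWRITE; the decision itself is
 * delegated to vaccess() under shm_timestamp_lock, the same lock under
 * which shm_stat() reads the mode/uid/gid fields.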
9348e38aeffSJohn Baldwin */ 9351bdbd705SKonstantin Belousov int 9368e38aeffSJohn Baldwin shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags) 9378e38aeffSJohn Baldwin { 93815bc6b2bSEdward Tomasz Napierala accmode_t accmode; 9399c00bb91SKonstantin Belousov int error; 9408e38aeffSJohn Baldwin 94115bc6b2bSEdward Tomasz Napierala accmode = 0; 9428e38aeffSJohn Baldwin if (flags & FREAD) 94315bc6b2bSEdward Tomasz Napierala accmode |= VREAD; 9448e38aeffSJohn Baldwin if (flags & FWRITE) 94515bc6b2bSEdward Tomasz Napierala accmode |= VWRITE; 9469c00bb91SKonstantin Belousov mtx_lock(&shm_timestamp_lock); 9479c00bb91SKonstantin Belousov error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid, 948d292b194SMateusz Guzik accmode, ucred); 9499c00bb91SKonstantin Belousov mtx_unlock(&shm_timestamp_lock); 9509c00bb91SKonstantin Belousov return (error); 9518e38aeffSJohn Baldwin } 9528e38aeffSJohn Baldwin 9538e38aeffSJohn Baldwin static void 954610a2b3cSJohn Baldwin shm_init(void *arg) 9558e38aeffSJohn Baldwin { 956d301b358SKonstantin Belousov char name[32]; 957d301b358SKonstantin Belousov int i; 9588e38aeffSJohn Baldwin 9598e38aeffSJohn Baldwin mtx_init(&shm_timestamp_lock, "shm timestamps", NULL, MTX_DEF); 9608e38aeffSJohn Baldwin sx_init(&shm_dict_lock, "shm dictionary"); 9618e38aeffSJohn Baldwin shm_dictionary = hashinit(1024, M_SHMFD, &shm_hash); 9627883ce1fSMateusz Guzik new_unrhdr64(&shm_ino_unr, 1); 963610a2b3cSJohn Baldwin shm_dev_ino = devfs_alloc_cdp_inode(); 964610a2b3cSJohn Baldwin KASSERT(shm_dev_ino > 0, ("shm dev inode not initialized")); 965d301b358SKonstantin Belousov 966d301b358SKonstantin Belousov for (i = 1; i < MAXPAGESIZES; i++) { 967d301b358SKonstantin Belousov if (pagesizes[i] == 0) 968d301b358SKonstantin Belousov break; 969d301b358SKonstantin Belousov #define M (1024 * 1024) 970d301b358SKonstantin Belousov #define G (1024 * M) 971d301b358SKonstantin Belousov if (pagesizes[i] >= G) 972d301b358SKonstantin Belousov snprintf(name, sizeof(name), "%luG", pagesizes[i] / G); 973d301b358SKonstantin Belousov else if (pagesizes[i] >= M) 974d301b358SKonstantin Belousov snprintf(name, sizeof(name), "%luM", pagesizes[i] / M); 975d301b358SKonstantin Belousov else 976d301b358SKonstantin Belousov snprintf(name, sizeof(name), "%lu", pagesizes[i]); 977d301b358SKonstantin Belousov #undef G 978d301b358SKonstantin Belousov #undef M 979d301b358SKonstantin Belousov SYSCTL_ADD_ULONG(NULL, SYSCTL_STATIC_CHILDREN(_vm_largepages), 980d301b358SKonstantin Belousov OID_AUTO, name, CTLFLAG_RD, &count_largepages[i], 981d301b358SKonstantin Belousov "number of non-transient largepages allocated"); 982d301b358SKonstantin Belousov } 9838e38aeffSJohn Baldwin } 984610a2b3cSJohn Baldwin SYSINIT(shm_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_init, NULL); 9858e38aeffSJohn Baldwin 98625f44824SKonstantin Belousov /* 987*7060da62SJamie Gritton * Remove all shared memory objects that belong to a prison. 
988*7060da62SJamie Gritton */ 989*7060da62SJamie Gritton void 990*7060da62SJamie Gritton shm_remove_prison(struct prison *pr) 991*7060da62SJamie Gritton { 992*7060da62SJamie Gritton struct shm_mapping *shmm, *tshmm; 993*7060da62SJamie Gritton u_long i; 994*7060da62SJamie Gritton 995*7060da62SJamie Gritton sx_xlock(&shm_dict_lock); 996*7060da62SJamie Gritton for (i = 0; i < shm_hash + 1; i++) { 997*7060da62SJamie Gritton LIST_FOREACH_SAFE(shmm, &shm_dictionary[i], sm_link, tshmm) { 998*7060da62SJamie Gritton if (shmm->sm_shmfd->shm_object->cred && 999*7060da62SJamie Gritton shmm->sm_shmfd->shm_object->cred->cr_prison == pr) 1000*7060da62SJamie Gritton shm_doremove(shmm); 1001*7060da62SJamie Gritton } 1002*7060da62SJamie Gritton } 1003*7060da62SJamie Gritton sx_xunlock(&shm_dict_lock); 1004*7060da62SJamie Gritton } 1005*7060da62SJamie Gritton 1006*7060da62SJamie Gritton /* 100725f44824SKonstantin Belousov * Dictionary management. We maintain an in-kernel dictionary to map 100825f44824SKonstantin Belousov * paths to shmfd objects. We use the FNV hash on the path to store 100925f44824SKonstantin Belousov * the mappings in a hash table. 101025f44824SKonstantin Belousov */ 10118e38aeffSJohn Baldwin static struct shmfd * 10128e38aeffSJohn Baldwin shm_lookup(char *path, Fnv32_t fnv) 10138e38aeffSJohn Baldwin { 10148e38aeffSJohn Baldwin struct shm_mapping *map; 10158e38aeffSJohn Baldwin 10168e38aeffSJohn Baldwin LIST_FOREACH(map, SHM_HASH(fnv), sm_link) { 10178e38aeffSJohn Baldwin if (map->sm_fnv != fnv) 10188e38aeffSJohn Baldwin continue; 10198e38aeffSJohn Baldwin if (strcmp(map->sm_path, path) == 0) 10208e38aeffSJohn Baldwin return (map->sm_shmfd); 10218e38aeffSJohn Baldwin } 10228e38aeffSJohn Baldwin 10238e38aeffSJohn Baldwin return (NULL); 10248e38aeffSJohn Baldwin } 10258e38aeffSJohn Baldwin 10268e38aeffSJohn Baldwin static void 10278e38aeffSJohn Baldwin shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd) 10288e38aeffSJohn Baldwin { 10298e38aeffSJohn Baldwin struct shm_mapping *map; 10308e38aeffSJohn Baldwin 10318e38aeffSJohn Baldwin map = malloc(sizeof(struct shm_mapping), M_SHMFD, M_WAITOK); 10328e38aeffSJohn Baldwin map->sm_path = path; 10338e38aeffSJohn Baldwin map->sm_fnv = fnv; 10348e38aeffSJohn Baldwin map->sm_shmfd = shm_hold(shmfd); 1035e506e182SJohn Baldwin shmfd->shm_path = path; 10368e38aeffSJohn Baldwin LIST_INSERT_HEAD(SHM_HASH(fnv), map, sm_link); 10378e38aeffSJohn Baldwin } 10388e38aeffSJohn Baldwin 10398e38aeffSJohn Baldwin static int 10408e38aeffSJohn Baldwin shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred) 10418e38aeffSJohn Baldwin { 10428e38aeffSJohn Baldwin struct shm_mapping *map; 10438e38aeffSJohn Baldwin int error; 10448e38aeffSJohn Baldwin 10458e38aeffSJohn Baldwin LIST_FOREACH(map, SHM_HASH(fnv), sm_link) { 10468e38aeffSJohn Baldwin if (map->sm_fnv != fnv) 10478e38aeffSJohn Baldwin continue; 10488e38aeffSJohn Baldwin if (strcmp(map->sm_path, path) == 0) { 10498e38aeffSJohn Baldwin #ifdef MAC 10508e38aeffSJohn Baldwin error = mac_posixshm_check_unlink(ucred, map->sm_shmfd); 10518e38aeffSJohn Baldwin if (error) 10528e38aeffSJohn Baldwin return (error); 10538e38aeffSJohn Baldwin #endif 10548e38aeffSJohn Baldwin error = shm_access(map->sm_shmfd, ucred, 10558e38aeffSJohn Baldwin FREAD | FWRITE); 10568e38aeffSJohn Baldwin if (error) 10578e38aeffSJohn Baldwin return (error); 1058*7060da62SJamie Gritton shm_doremove(map); 10598e38aeffSJohn Baldwin return (0); 10608e38aeffSJohn Baldwin } 10618e38aeffSJohn Baldwin } 10628e38aeffSJohn Baldwin 10638e38aeffSJohn 
Baldwin return (ENOENT); 10648e38aeffSJohn Baldwin } 10658e38aeffSJohn Baldwin 1066*7060da62SJamie Gritton static void 1067*7060da62SJamie Gritton shm_doremove(struct shm_mapping *map) 1068*7060da62SJamie Gritton { 1069*7060da62SJamie Gritton map->sm_shmfd->shm_path = NULL; 1070*7060da62SJamie Gritton LIST_REMOVE(map, sm_link); 1071*7060da62SJamie Gritton shm_drop(map->sm_shmfd); 1072*7060da62SJamie Gritton free(map->sm_path, M_SHMFD); 1073*7060da62SJamie Gritton free(map, M_SHMFD); 1074*7060da62SJamie Gritton } 1075*7060da62SJamie Gritton 10768e38aeffSJohn Baldwin int 1077535b1df9SKyle Evans kern_shm_open2(struct thread *td, const char *userpath, int flags, mode_t mode, 1078535b1df9SKyle Evans int shmflags, struct filecaps *fcaps, const char *name __unused) 10798e38aeffSJohn Baldwin { 108085078b85SConrad Meyer struct pwddesc *pdp; 10818e38aeffSJohn Baldwin struct shmfd *shmfd; 10828e38aeffSJohn Baldwin struct file *fp; 10838e38aeffSJohn Baldwin char *path; 10840cd95859SKyle Evans void *rl_cookie; 10858e38aeffSJohn Baldwin Fnv32_t fnv; 10868e38aeffSJohn Baldwin mode_t cmode; 1087535b1df9SKyle Evans int error, fd, initial_seals; 1088d301b358SKonstantin Belousov bool largepage; 1089535b1df9SKyle Evans 1090d301b358SKonstantin Belousov if ((shmflags & ~(SHM_ALLOW_SEALING | SHM_GROW_ON_WRITE | 1091d301b358SKonstantin Belousov SHM_LARGEPAGE)) != 0) 1092535b1df9SKyle Evans return (EINVAL); 1093535b1df9SKyle Evans 1094535b1df9SKyle Evans initial_seals = F_SEAL_SEAL; 1095535b1df9SKyle Evans if ((shmflags & SHM_ALLOW_SEALING) != 0) 1096535b1df9SKyle Evans initial_seals &= ~F_SEAL_SEAL; 10978e38aeffSJohn Baldwin 109812bc222eSJonathan Anderson #ifdef CAPABILITY_MODE 109912bc222eSJonathan Anderson /* 110012bc222eSJonathan Anderson * shm_open(2) is only allowed for anonymous objects. 110112bc222eSJonathan Anderson */ 11027ee1b208SEd Schouten if (IN_CAPABILITY_MODE(td) && (userpath != SHM_ANON)) 110312bc222eSJonathan Anderson return (ECAPMODE); 110412bc222eSJonathan Anderson #endif 110512bc222eSJonathan Anderson 110615bcf785SRobert Watson AUDIT_ARG_FFLAGS(flags); 110715bcf785SRobert Watson AUDIT_ARG_MODE(mode); 110815bcf785SRobert Watson 11097ee1b208SEd Schouten if ((flags & O_ACCMODE) != O_RDONLY && (flags & O_ACCMODE) != O_RDWR) 11108e38aeffSJohn Baldwin return (EINVAL); 11118e38aeffSJohn Baldwin 11127ee1b208SEd Schouten if ((flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC | O_CLOEXEC)) != 0) 11138e38aeffSJohn Baldwin return (EINVAL); 11148e38aeffSJohn Baldwin 1115d301b358SKonstantin Belousov largepage = (shmflags & SHM_LARGEPAGE) != 0; 111678257765SMark Johnston if (largepage && !PMAP_HAS_LARGEPAGES) 1117d301b358SKonstantin Belousov return (ENOTTY); 1118d301b358SKonstantin Belousov 11190cd95859SKyle Evans /* 11200cd95859SKyle Evans * Currently only F_SEAL_SEAL may be set when creating or opening shmfd. 11210cd95859SKyle Evans * If the decision is made later to allow additional seals, care must be 11220cd95859SKyle Evans * taken below to ensure that the seals are properly set if the shmfd 11230cd95859SKyle Evans * already existed -- this currently assumes that only F_SEAL_SEAL can 11240cd95859SKyle Evans * be set and doesn't take further precautions to ensure the validity of 11250cd95859SKyle Evans * the seals being added with respect to current mappings. 
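 * Illustrative userspace sketch (assuming the memfd_create(3) wrapper,
 * which requests SHM_ALLOW_SEALING, is available); later seals are
 * added with fcntl(2):
 *
 *	int fd = memfd_create("example", MFD_ALLOW_SEALING);
 *	ftruncate(fd, len);
 *	fcntl(fd, F_ADD_SEALS, F_SEAL_GROW | F_SEAL_SHRINK);
 *
 * whereas a plain shm_open(2) descriptor carries F_SEAL_SEAL and
 * rejects further F_ADD_SEALS requests.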
11260cd95859SKyle Evans */ 11270cd95859SKyle Evans if ((initial_seals & ~F_SEAL_SEAL) != 0) 11280cd95859SKyle Evans return (EINVAL); 11290cd95859SKyle Evans 113085078b85SConrad Meyer pdp = td->td_proc->p_pd; 113185078b85SConrad Meyer cmode = (mode & ~pdp->pd_cmask) & ACCESSPERMS; 11328e38aeffSJohn Baldwin 1133b5a7ac99SKyle Evans /* 1134b5a7ac99SKyle Evans * shm_open(2) created shm should always have O_CLOEXEC set, as mandated 1135b5a7ac99SKyle Evans * by POSIX. We allow it to be unset here so that an in-kernel 1136b5a7ac99SKyle Evans * interface may be written as a thin layer around shm, optionally not 1137b5a7ac99SKyle Evans * setting CLOEXEC. For shm_open(2), O_CLOEXEC is set unconditionally 1138b5a7ac99SKyle Evans * in sys_shm_open() to keep this implementation compliant. 1139b5a7ac99SKyle Evans */ 1140b5a7ac99SKyle Evans error = falloc_caps(td, &fp, &fd, flags & O_CLOEXEC, fcaps); 11418e38aeffSJohn Baldwin if (error) 11428e38aeffSJohn Baldwin return (error); 11438e38aeffSJohn Baldwin 11448e38aeffSJohn Baldwin /* A SHM_ANON path pointer creates an anonymous object. */ 11457ee1b208SEd Schouten if (userpath == SHM_ANON) { 11468e38aeffSJohn Baldwin /* A read-only anonymous object is pointless. */ 11477ee1b208SEd Schouten if ((flags & O_ACCMODE) == O_RDONLY) { 114890f54cbfSMateusz Guzik fdclose(td, fp, fd); 11498e38aeffSJohn Baldwin fdrop(fp, td); 11508e38aeffSJohn Baldwin return (EINVAL); 11518e38aeffSJohn Baldwin } 1152d301b358SKonstantin Belousov shmfd = shm_alloc(td->td_ucred, cmode, largepage); 11530cd95859SKyle Evans shmfd->shm_seals = initial_seals; 11545dd47b52SKyle Evans shmfd->shm_flags = shmflags; 11558e38aeffSJohn Baldwin } else { 11562d5603feSDavid Bright error = shm_copyin_path(td, userpath, &path); 11572d5603feSDavid Bright if (error != 0) { 115890f54cbfSMateusz Guzik fdclose(td, fp, fd); 11598e38aeffSJohn Baldwin fdrop(fp, td); 11608e38aeffSJohn Baldwin return (error); 11618e38aeffSJohn Baldwin } 11628e38aeffSJohn Baldwin 116315bcf785SRobert Watson AUDIT_ARG_UPATH1_CANON(path); 11648e38aeffSJohn Baldwin fnv = fnv_32_str(path, FNV1_32_INIT); 11658e38aeffSJohn Baldwin sx_xlock(&shm_dict_lock); 11668e38aeffSJohn Baldwin shmfd = shm_lookup(path, fnv); 11678e38aeffSJohn Baldwin if (shmfd == NULL) { 11688e38aeffSJohn Baldwin /* Object does not yet exist, create it if requested. */ 11697ee1b208SEd Schouten if (flags & O_CREAT) { 11709b6dd12eSRobert Watson #ifdef MAC 11719b6dd12eSRobert Watson error = mac_posixshm_check_create(td->td_ucred, 11729b6dd12eSRobert Watson path); 11739b6dd12eSRobert Watson if (error == 0) { 11749b6dd12eSRobert Watson #endif 1175d301b358SKonstantin Belousov shmfd = shm_alloc(td->td_ucred, cmode, 1176d301b358SKonstantin Belousov largepage); 11770cd95859SKyle Evans shmfd->shm_seals = initial_seals; 11785dd47b52SKyle Evans shmfd->shm_flags = shmflags; 11798e38aeffSJohn Baldwin shm_insert(path, fnv, shmfd); 11809b6dd12eSRobert Watson #ifdef MAC 11819b6dd12eSRobert Watson } 11829b6dd12eSRobert Watson #endif 11838e38aeffSJohn Baldwin } else { 11848e38aeffSJohn Baldwin free(path, M_SHMFD); 11858e38aeffSJohn Baldwin error = ENOENT; 11868e38aeffSJohn Baldwin } 11878e38aeffSJohn Baldwin } else { 11880cd95859SKyle Evans rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX, 11890cd95859SKyle Evans &shmfd->shm_mtx); 11900cd95859SKyle Evans 11910cd95859SKyle Evans /* 11920cd95859SKyle Evans * kern_shm_open() likely shouldn't ever error out on 11930cd95859SKyle Evans * trying to set a seal that already exists, unlike 11940cd95859SKyle Evans * F_ADD_SEALS. 
This would break terribly as 11950cd95859SKyle Evans * shm_open(2) actually sets F_SEAL_SEAL to maintain 11960cd95859SKyle Evans * historical behavior where the underlying file could 11970cd95859SKyle Evans * not be sealed. 11980cd95859SKyle Evans */ 11990cd95859SKyle Evans initial_seals &= ~shmfd->shm_seals; 12000cd95859SKyle Evans 12018e38aeffSJohn Baldwin /* 12028e38aeffSJohn Baldwin * Object already exists, obtain a new 12038e38aeffSJohn Baldwin * reference if requested and permitted. 12048e38aeffSJohn Baldwin */ 12058e38aeffSJohn Baldwin free(path, M_SHMFD); 12060cd95859SKyle Evans 12070cd95859SKyle Evans /* 12080cd95859SKyle Evans * initial_seals can't set additional seals if we've 12090cd95859SKyle Evans * already been set F_SEAL_SEAL. If F_SEAL_SEAL is set, 12100cd95859SKyle Evans * then we've already removed that one from 12110cd95859SKyle Evans * initial_seals. This is currently redundant as we 12120cd95859SKyle Evans * only allow setting F_SEAL_SEAL at creation time, but 12130cd95859SKyle Evans * it's cheap to check and decreases the effort required 12140cd95859SKyle Evans * to allow additional seals. 12150cd95859SKyle Evans */ 12160cd95859SKyle Evans if ((shmfd->shm_seals & F_SEAL_SEAL) != 0 && 12170cd95859SKyle Evans initial_seals != 0) 12180cd95859SKyle Evans error = EPERM; 12190cd95859SKyle Evans else if ((flags & (O_CREAT | O_EXCL)) == 12200cd95859SKyle Evans (O_CREAT | O_EXCL)) 12218e38aeffSJohn Baldwin error = EEXIST; 12225dd47b52SKyle Evans else if (shmflags != 0 && shmflags != shmfd->shm_flags) 12235dd47b52SKyle Evans error = EINVAL; 12248e38aeffSJohn Baldwin else { 12258e38aeffSJohn Baldwin #ifdef MAC 12268e38aeffSJohn Baldwin error = mac_posixshm_check_open(td->td_ucred, 12277ee1b208SEd Schouten shmfd, FFLAGS(flags & O_ACCMODE)); 12288e38aeffSJohn Baldwin if (error == 0) 12298e38aeffSJohn Baldwin #endif 12308e38aeffSJohn Baldwin error = shm_access(shmfd, td->td_ucred, 12317ee1b208SEd Schouten FFLAGS(flags & O_ACCMODE)); 12328e38aeffSJohn Baldwin } 12338e38aeffSJohn Baldwin 12348e38aeffSJohn Baldwin /* 12358e38aeffSJohn Baldwin * Truncate the file back to zero length if 12368e38aeffSJohn Baldwin * O_TRUNC was specified and the object was 12378e38aeffSJohn Baldwin * opened with read/write. 12388e38aeffSJohn Baldwin */ 12398e38aeffSJohn Baldwin if (error == 0 && 12407ee1b208SEd Schouten (flags & (O_ACCMODE | O_TRUNC)) == 12418e38aeffSJohn Baldwin (O_RDWR | O_TRUNC)) { 12420cd95859SKyle Evans VM_OBJECT_WLOCK(shmfd->shm_object); 12438e38aeffSJohn Baldwin #ifdef MAC 12448e38aeffSJohn Baldwin error = mac_posixshm_check_truncate( 12458e38aeffSJohn Baldwin td->td_ucred, fp->f_cred, shmfd); 12468e38aeffSJohn Baldwin if (error == 0) 12478e38aeffSJohn Baldwin #endif 12480cd95859SKyle Evans error = shm_dotruncate_locked(shmfd, 0, 12490cd95859SKyle Evans rl_cookie); 12500cd95859SKyle Evans VM_OBJECT_WUNLOCK(shmfd->shm_object); 12518e38aeffSJohn Baldwin } 12520cd95859SKyle Evans if (error == 0) { 12530cd95859SKyle Evans /* 12540cd95859SKyle Evans * Currently we only allow F_SEAL_SEAL to be 12550cd95859SKyle Evans * set initially. As noted above, this would 12560cd95859SKyle Evans * need to be reworked should that change. 
12570cd95859SKyle Evans */ 12580cd95859SKyle Evans shmfd->shm_seals |= initial_seals; 12598e38aeffSJohn Baldwin shm_hold(shmfd); 12608e38aeffSJohn Baldwin } 12610cd95859SKyle Evans rangelock_unlock(&shmfd->shm_rl, rl_cookie, 12620cd95859SKyle Evans &shmfd->shm_mtx); 12630cd95859SKyle Evans } 12648e38aeffSJohn Baldwin sx_xunlock(&shm_dict_lock); 12658e38aeffSJohn Baldwin 12668e38aeffSJohn Baldwin if (error) { 126790f54cbfSMateusz Guzik fdclose(td, fp, fd); 12688e38aeffSJohn Baldwin fdrop(fp, td); 12698e38aeffSJohn Baldwin return (error); 12708e38aeffSJohn Baldwin } 12718e38aeffSJohn Baldwin } 12728e38aeffSJohn Baldwin 12737ee1b208SEd Schouten finit(fp, FFLAGS(flags & O_ACCMODE), DTYPE_SHM, shmfd, &shm_ops); 12748e38aeffSJohn Baldwin 12758e38aeffSJohn Baldwin td->td_retval[0] = fd; 12768e38aeffSJohn Baldwin fdrop(fp, td); 12778e38aeffSJohn Baldwin 12788e38aeffSJohn Baldwin return (0); 12798e38aeffSJohn Baldwin } 12808e38aeffSJohn Baldwin 12817ee1b208SEd Schouten /* System calls. */ 1282a9ac5e14SKyle Evans #ifdef COMPAT_FREEBSD12 12837ee1b208SEd Schouten int 1284a9ac5e14SKyle Evans freebsd12_shm_open(struct thread *td, struct freebsd12_shm_open_args *uap) 12857ee1b208SEd Schouten { 12867ee1b208SEd Schouten 1287535b1df9SKyle Evans return (kern_shm_open(td, uap->path, uap->flags | O_CLOEXEC, 1288535b1df9SKyle Evans uap->mode, NULL)); 12897ee1b208SEd Schouten } 1290a9ac5e14SKyle Evans #endif 12917ee1b208SEd Schouten 12928e38aeffSJohn Baldwin int 12938451d0ddSKip Macy sys_shm_unlink(struct thread *td, struct shm_unlink_args *uap) 12948e38aeffSJohn Baldwin { 12958e38aeffSJohn Baldwin char *path; 12968e38aeffSJohn Baldwin Fnv32_t fnv; 12978e38aeffSJohn Baldwin int error; 12988e38aeffSJohn Baldwin 12992d5603feSDavid Bright error = shm_copyin_path(td, uap->path, &path); 13002d5603feSDavid Bright if (error != 0) 13018e38aeffSJohn Baldwin return (error); 13022d5603feSDavid Bright 130315bcf785SRobert Watson AUDIT_ARG_UPATH1_CANON(path); 13048e38aeffSJohn Baldwin fnv = fnv_32_str(path, FNV1_32_INIT); 13058e38aeffSJohn Baldwin sx_xlock(&shm_dict_lock); 13068e38aeffSJohn Baldwin error = shm_remove(path, fnv, td->td_ucred); 13078e38aeffSJohn Baldwin sx_xunlock(&shm_dict_lock); 13084cf919edSMark Johnston free(path, M_SHMFD); 13098e38aeffSJohn Baldwin 13108e38aeffSJohn Baldwin return (error); 13118e38aeffSJohn Baldwin } 13128e38aeffSJohn Baldwin 13138e38aeffSJohn Baldwin int 13149afb12baSDavid Bright sys_shm_rename(struct thread *td, struct shm_rename_args *uap) 13159afb12baSDavid Bright { 13169afb12baSDavid Bright char *path_from = NULL, *path_to = NULL; 13179afb12baSDavid Bright Fnv32_t fnv_from, fnv_to; 13189afb12baSDavid Bright struct shmfd *fd_from; 13199afb12baSDavid Bright struct shmfd *fd_to; 13209afb12baSDavid Bright int error; 13219afb12baSDavid Bright int flags; 13229afb12baSDavid Bright 13239afb12baSDavid Bright flags = uap->flags; 13242d5603feSDavid Bright AUDIT_ARG_FFLAGS(flags); 13259afb12baSDavid Bright 13269afb12baSDavid Bright /* 13279afb12baSDavid Bright * Make sure the user passed only valid flags. 13289afb12baSDavid Bright * If you add a new flag, please add a new term here. 
13299afb12baSDavid Bright */ 13309afb12baSDavid Bright if ((flags & ~( 13319afb12baSDavid Bright SHM_RENAME_NOREPLACE | 13329afb12baSDavid Bright SHM_RENAME_EXCHANGE 13339afb12baSDavid Bright )) != 0) { 13349afb12baSDavid Bright error = EINVAL; 13359afb12baSDavid Bright goto out; 13369afb12baSDavid Bright } 13379afb12baSDavid Bright 13389afb12baSDavid Bright /* 13399afb12baSDavid Bright * EXCHANGE and NOREPLACE don't quite make sense together. Let's 13409afb12baSDavid Bright * force the user to choose one or the other. 13419afb12baSDavid Bright */ 13429afb12baSDavid Bright if ((flags & SHM_RENAME_NOREPLACE) != 0 && 13439afb12baSDavid Bright (flags & SHM_RENAME_EXCHANGE) != 0) { 13449afb12baSDavid Bright error = EINVAL; 13459afb12baSDavid Bright goto out; 13469afb12baSDavid Bright } 13479afb12baSDavid Bright 13482d5603feSDavid Bright /* Renaming to or from anonymous makes no sense */ 13492d5603feSDavid Bright if (uap->path_from == SHM_ANON || uap->path_to == SHM_ANON) { 13502d5603feSDavid Bright error = EINVAL; 13512d5603feSDavid Bright goto out; 13522d5603feSDavid Bright } 13532d5603feSDavid Bright 13542d5603feSDavid Bright error = shm_copyin_path(td, uap->path_from, &path_from); 13552d5603feSDavid Bright if (error != 0) 13569afb12baSDavid Bright goto out; 13579afb12baSDavid Bright 13582d5603feSDavid Bright error = shm_copyin_path(td, uap->path_to, &path_to); 13592d5603feSDavid Bright if (error != 0) 13609afb12baSDavid Bright goto out; 13619afb12baSDavid Bright 13622d5603feSDavid Bright AUDIT_ARG_UPATH1_CANON(path_from); 13632d5603feSDavid Bright AUDIT_ARG_UPATH2_CANON(path_to); 13642d5603feSDavid Bright 13659afb12baSDavid Bright /* Rename with from/to equal is a no-op */ 13662d5603feSDavid Bright if (strcmp(path_from, path_to) == 0) 13679afb12baSDavid Bright goto out; 13689afb12baSDavid Bright 13699afb12baSDavid Bright fnv_from = fnv_32_str(path_from, FNV1_32_INIT); 13709afb12baSDavid Bright fnv_to = fnv_32_str(path_to, FNV1_32_INIT); 13719afb12baSDavid Bright 13729afb12baSDavid Bright sx_xlock(&shm_dict_lock); 13739afb12baSDavid Bright 13749afb12baSDavid Bright fd_from = shm_lookup(path_from, fnv_from); 13759afb12baSDavid Bright if (fd_from == NULL) { 13769afb12baSDavid Bright error = ENOENT; 13772d5603feSDavid Bright goto out_locked; 13789afb12baSDavid Bright } 13799afb12baSDavid Bright 13809afb12baSDavid Bright fd_to = shm_lookup(path_to, fnv_to); 13819afb12baSDavid Bright if ((flags & SHM_RENAME_NOREPLACE) != 0 && fd_to != NULL) { 13829afb12baSDavid Bright error = EEXIST; 13832d5603feSDavid Bright goto out_locked; 13849afb12baSDavid Bright } 13859afb12baSDavid Bright 13869afb12baSDavid Bright /* 13879afb12baSDavid Bright * Unconditionally prevents shm_remove from invalidating the 'from' 13889afb12baSDavid Bright * shm's state. 13899afb12baSDavid Bright */ 13909afb12baSDavid Bright shm_hold(fd_from); 13919afb12baSDavid Bright error = shm_remove(path_from, fnv_from, td->td_ucred); 13929afb12baSDavid Bright 13939afb12baSDavid Bright /* 13949afb12baSDavid Bright * One of my assumptions failed if ENOENT (e.g. 
locking didn't 13959afb12baSDavid Bright * protect us) 13969afb12baSDavid Bright */ 13979afb12baSDavid Bright KASSERT(error != ENOENT, ("Our shm disappeared during shm_rename: %s", 13989afb12baSDavid Bright path_from)); 13992d5603feSDavid Bright if (error != 0) { 14009afb12baSDavid Bright shm_drop(fd_from); 14012d5603feSDavid Bright goto out_locked; 14029afb12baSDavid Bright } 14039afb12baSDavid Bright 14049afb12baSDavid Bright /* 14059afb12baSDavid Bright * If we are exchanging, we need to ensure the shm_remove below 14069afb12baSDavid Bright * doesn't invalidate the dest shm's state. 14079afb12baSDavid Bright */ 14089afb12baSDavid Bright if ((flags & SHM_RENAME_EXCHANGE) != 0 && fd_to != NULL) 14099afb12baSDavid Bright shm_hold(fd_to); 14109afb12baSDavid Bright 14119afb12baSDavid Bright /* 14129afb12baSDavid Bright * NOTE: if path_to is not already in the hash, c'est la vie; 14139afb12baSDavid Bright * it simply means we have nothing already at path_to to unlink. 14149afb12baSDavid Bright * That is the ENOENT case. 14159afb12baSDavid Bright * 14169afb12baSDavid Bright * If we somehow don't have access to unlink this guy, but 14179afb12baSDavid Bright * did for the shm at path_from, then relink the shm to path_from 14189afb12baSDavid Bright * and abort with EACCES. 14199afb12baSDavid Bright * 14209afb12baSDavid Bright * All other errors: that is weird; let's relink and abort the 14219afb12baSDavid Bright * operation. 14229afb12baSDavid Bright */ 14239afb12baSDavid Bright error = shm_remove(path_to, fnv_to, td->td_ucred); 14242d5603feSDavid Bright if (error != 0 && error != ENOENT) { 14259afb12baSDavid Bright shm_insert(path_from, fnv_from, fd_from); 14269afb12baSDavid Bright shm_drop(fd_from); 14279afb12baSDavid Bright /* Don't free path_from now, since the hash references it */ 14289afb12baSDavid Bright path_from = NULL; 14292d5603feSDavid Bright goto out_locked; 14309afb12baSDavid Bright } 14319afb12baSDavid Bright 14322d5603feSDavid Bright error = 0; 14332d5603feSDavid Bright 14349afb12baSDavid Bright shm_insert(path_to, fnv_to, fd_from); 14359afb12baSDavid Bright 14369afb12baSDavid Bright /* Don't free path_to now, since the hash references it */ 14379afb12baSDavid Bright path_to = NULL; 14389afb12baSDavid Bright 14399afb12baSDavid Bright /* We kept a ref when we removed, and incremented again in insert */ 14409afb12baSDavid Bright shm_drop(fd_from); 14419afb12baSDavid Bright KASSERT(fd_from->shm_refs > 0, ("Expected >0 refs; got: %d\n", 14429afb12baSDavid Bright fd_from->shm_refs)); 14439afb12baSDavid Bright 14449afb12baSDavid Bright if ((flags & SHM_RENAME_EXCHANGE) != 0 && fd_to != NULL) { 14459afb12baSDavid Bright shm_insert(path_from, fnv_from, fd_to); 14469afb12baSDavid Bright path_from = NULL; 14479afb12baSDavid Bright shm_drop(fd_to); 14489afb12baSDavid Bright KASSERT(fd_to->shm_refs > 0, ("Expected >0 refs; got: %d\n", 14499afb12baSDavid Bright fd_to->shm_refs)); 14509afb12baSDavid Bright } 14519afb12baSDavid Bright 14522d5603feSDavid Bright out_locked: 14539afb12baSDavid Bright sx_xunlock(&shm_dict_lock); 14549afb12baSDavid Bright 14559afb12baSDavid Bright out: 14569afb12baSDavid Bright free(path_from, M_SHMFD); 14579afb12baSDavid Bright free(path_to, M_SHMFD); 14589afb12baSDavid Bright return (error); 14599afb12baSDavid Bright } 14609afb12baSDavid Bright 1461d301b358SKonstantin Belousov static int 1462d301b358SKonstantin Belousov shm_mmap_large(struct shmfd *shmfd, vm_map_t map, vm_offset_t *addr, 1463d301b358SKonstantin Belousov vm_size_t size, vm_prot_t prot, vm_prot_t 
max_prot, int flags, 146479783634SKonstantin Belousov vm_ooffset_t foff, struct thread *td) 1465d301b358SKonstantin Belousov { 1466d301b358SKonstantin Belousov struct vmspace *vms; 1467d301b358SKonstantin Belousov vm_map_entry_t next_entry, prev_entry; 1468d301b358SKonstantin Belousov vm_offset_t align, mask, maxaddr; 1469d301b358SKonstantin Belousov int docow, error, rv, try; 1470d301b358SKonstantin Belousov bool curmap; 1471d301b358SKonstantin Belousov 1472d301b358SKonstantin Belousov if (shmfd->shm_lp_psind == 0) 1473d301b358SKonstantin Belousov return (EINVAL); 1474d301b358SKonstantin Belousov 1475d301b358SKonstantin Belousov /* MAP_PRIVATE is disabled */ 1476d301b358SKonstantin Belousov if ((flags & ~(MAP_SHARED | MAP_FIXED | MAP_EXCL | 1477d301b358SKonstantin Belousov MAP_NOCORE | 1478d301b358SKonstantin Belousov #ifdef MAP_32BIT 1479d301b358SKonstantin Belousov MAP_32BIT | 1480d301b358SKonstantin Belousov #endif 1481d301b358SKonstantin Belousov MAP_ALIGNMENT_MASK)) != 0) 1482d301b358SKonstantin Belousov return (EINVAL); 1483d301b358SKonstantin Belousov 1484d301b358SKonstantin Belousov vms = td->td_proc->p_vmspace; 1485d301b358SKonstantin Belousov curmap = map == &vms->vm_map; 1486d301b358SKonstantin Belousov if (curmap) { 1487d301b358SKonstantin Belousov error = kern_mmap_racct_check(td, map, size); 1488d301b358SKonstantin Belousov if (error != 0) 1489d301b358SKonstantin Belousov return (error); 1490d301b358SKonstantin Belousov } 1491d301b358SKonstantin Belousov 1492d301b358SKonstantin Belousov docow = shmfd->shm_lp_psind << MAP_SPLIT_BOUNDARY_SHIFT; 1493d301b358SKonstantin Belousov docow |= MAP_INHERIT_SHARE; 1494d301b358SKonstantin Belousov if ((flags & MAP_NOCORE) != 0) 1495d301b358SKonstantin Belousov docow |= MAP_DISABLE_COREDUMP; 1496d301b358SKonstantin Belousov 1497d301b358SKonstantin Belousov mask = pagesizes[shmfd->shm_lp_psind] - 1; 1498d301b358SKonstantin Belousov if ((foff & mask) != 0) 1499d301b358SKonstantin Belousov return (EINVAL); 1500d301b358SKonstantin Belousov maxaddr = vm_map_max(map); 1501d301b358SKonstantin Belousov #ifdef MAP_32BIT 1502d301b358SKonstantin Belousov if ((flags & MAP_32BIT) != 0 && maxaddr > MAP_32BIT_MAX_ADDR) 1503d301b358SKonstantin Belousov maxaddr = MAP_32BIT_MAX_ADDR; 1504d301b358SKonstantin Belousov #endif 1505d301b358SKonstantin Belousov if (size == 0 || (size & mask) != 0 || 1506d301b358SKonstantin Belousov (*addr != 0 && ((*addr & mask) != 0 || 1507d301b358SKonstantin Belousov *addr + size < *addr || *addr + size > maxaddr))) 1508d301b358SKonstantin Belousov return (EINVAL); 1509d301b358SKonstantin Belousov 1510d301b358SKonstantin Belousov align = flags & MAP_ALIGNMENT_MASK; 1511d301b358SKonstantin Belousov if (align == 0) { 1512d301b358SKonstantin Belousov align = pagesizes[shmfd->shm_lp_psind]; 1513d301b358SKonstantin Belousov } else if (align == MAP_ALIGNED_SUPER) { 1514d301b358SKonstantin Belousov if (shmfd->shm_lp_psind != 1) 1515d301b358SKonstantin Belousov return (EINVAL); 1516d301b358SKonstantin Belousov align = pagesizes[1]; 1517d301b358SKonstantin Belousov } else { 1518d301b358SKonstantin Belousov align >>= MAP_ALIGNMENT_SHIFT; 1519d301b358SKonstantin Belousov align = 1ULL << align; 1520d301b358SKonstantin Belousov /* Also handles overflow. 
*/ 1521d301b358SKonstantin Belousov if (align < pagesizes[shmfd->shm_lp_psind]) 1522d301b358SKonstantin Belousov return (EINVAL); 1523d301b358SKonstantin Belousov } 1524d301b358SKonstantin Belousov 1525d301b358SKonstantin Belousov vm_map_lock(map); 1526d301b358SKonstantin Belousov if ((flags & MAP_FIXED) == 0) { 1527d301b358SKonstantin Belousov try = 1; 1528d301b358SKonstantin Belousov if (curmap && (*addr == 0 || 1529d301b358SKonstantin Belousov (*addr >= round_page((vm_offset_t)vms->vm_taddr) && 1530d301b358SKonstantin Belousov *addr < round_page((vm_offset_t)vms->vm_daddr + 1531d301b358SKonstantin Belousov lim_max(td, RLIMIT_DATA))))) { 1532d301b358SKonstantin Belousov *addr = roundup2((vm_offset_t)vms->vm_daddr + 1533d301b358SKonstantin Belousov lim_max(td, RLIMIT_DATA), 1534d301b358SKonstantin Belousov pagesizes[shmfd->shm_lp_psind]); 1535d301b358SKonstantin Belousov } 1536d301b358SKonstantin Belousov again: 1537d301b358SKonstantin Belousov rv = vm_map_find_aligned(map, addr, size, maxaddr, align); 1538d301b358SKonstantin Belousov if (rv != KERN_SUCCESS) { 1539d301b358SKonstantin Belousov if (try == 1) { 1540d301b358SKonstantin Belousov try = 2; 1541d301b358SKonstantin Belousov *addr = vm_map_min(map); 1542d301b358SKonstantin Belousov if ((*addr & mask) != 0) 1543d301b358SKonstantin Belousov *addr = (*addr + mask) & mask; 1544d301b358SKonstantin Belousov goto again; 1545d301b358SKonstantin Belousov } 1546d301b358SKonstantin Belousov goto fail1; 1547d301b358SKonstantin Belousov } 1548d301b358SKonstantin Belousov } else if ((flags & MAP_EXCL) == 0) { 1549d301b358SKonstantin Belousov rv = vm_map_delete(map, *addr, *addr + size); 1550d301b358SKonstantin Belousov if (rv != KERN_SUCCESS) 1551d301b358SKonstantin Belousov goto fail1; 1552d301b358SKonstantin Belousov } else { 1553d301b358SKonstantin Belousov error = ENOSPC; 1554d301b358SKonstantin Belousov if (vm_map_lookup_entry(map, *addr, &prev_entry)) 1555d301b358SKonstantin Belousov goto fail; 1556d301b358SKonstantin Belousov next_entry = vm_map_entry_succ(prev_entry); 1557d301b358SKonstantin Belousov if (next_entry->start < *addr + size) 1558d301b358SKonstantin Belousov goto fail; 1559d301b358SKonstantin Belousov } 1560d301b358SKonstantin Belousov 1561d301b358SKonstantin Belousov rv = vm_map_insert(map, shmfd->shm_object, foff, *addr, *addr + size, 1562d301b358SKonstantin Belousov prot, max_prot, docow); 1563d301b358SKonstantin Belousov fail1: 1564d301b358SKonstantin Belousov error = vm_mmap_to_errno(rv); 1565d301b358SKonstantin Belousov fail: 1566d301b358SKonstantin Belousov vm_map_unlock(map); 1567d301b358SKonstantin Belousov return (error); 1568d301b358SKonstantin Belousov } 1569d301b358SKonstantin Belousov 1570d301b358SKonstantin Belousov static int 15717077c426SJohn Baldwin shm_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t objsize, 15727077c426SJohn Baldwin vm_prot_t prot, vm_prot_t cap_maxprot, int flags, 15737077c426SJohn Baldwin vm_ooffset_t foff, struct thread *td) 15748e38aeffSJohn Baldwin { 15757077c426SJohn Baldwin struct shmfd *shmfd; 15767077c426SJohn Baldwin vm_prot_t maxprot; 15777077c426SJohn Baldwin int error; 1578dca52ab4SKyle Evans bool writecnt; 1579af755d3eSKyle Evans void *rl_cookie; 15807077c426SJohn Baldwin 15817077c426SJohn Baldwin shmfd = fp->f_data; 15827077c426SJohn Baldwin maxprot = VM_PROT_NONE; 15837077c426SJohn Baldwin 1584af755d3eSKyle Evans rl_cookie = rangelock_rlock(&shmfd->shm_rl, 0, objsize, 1585af755d3eSKyle Evans &shmfd->shm_mtx); 15867077c426SJohn Baldwin /* FREAD should always 
be set. */ 15877077c426SJohn Baldwin if ((fp->f_flag & FREAD) != 0) 15887077c426SJohn Baldwin maxprot |= VM_PROT_EXECUTE | VM_PROT_READ; 158958366f05SKyle Evans 159058366f05SKyle Evans /* 159158366f05SKyle Evans * If FWRITE's set, we can allow VM_PROT_WRITE unless it's a shared 1592c7841c6bSMark Johnston * mapping with a write seal applied. Private mappings are always 1593c7841c6bSMark Johnston * writeable. 159458366f05SKyle Evans */ 1595c7841c6bSMark Johnston if ((flags & MAP_SHARED) == 0) { 1596c7841c6bSMark Johnston cap_maxprot |= VM_PROT_WRITE; 15977077c426SJohn Baldwin maxprot |= VM_PROT_WRITE; 1598c7841c6bSMark Johnston writecnt = false; 1599c7841c6bSMark Johnston } else { 1600c7841c6bSMark Johnston if ((fp->f_flag & FWRITE) != 0 && 1601c7841c6bSMark Johnston (shmfd->shm_seals & F_SEAL_WRITE) == 0) 1602c7841c6bSMark Johnston maxprot |= VM_PROT_WRITE; 1603af755d3eSKyle Evans 160451a16c84SKyle Evans /* 160551a16c84SKyle Evans * Any mappings from a writable descriptor may be upgraded to 160651a16c84SKyle Evans * VM_PROT_WRITE with mprotect(2), unless a write-seal was 160751a16c84SKyle Evans * applied between the open and subsequent mmap(2). We want to 160851a16c84SKyle Evans * reject application of a write seal as long as any such 160951a16c84SKyle Evans * mapping exists so that the seal cannot be trivially bypassed. 161051a16c84SKyle Evans */ 161151a16c84SKyle Evans writecnt = (maxprot & VM_PROT_WRITE) != 0; 161251a16c84SKyle Evans if (!writecnt && (prot & VM_PROT_WRITE) != 0) { 1613af755d3eSKyle Evans error = EACCES; 1614af755d3eSKyle Evans goto out; 1615af755d3eSKyle Evans } 1616c7841c6bSMark Johnston } 16177077c426SJohn Baldwin maxprot &= cap_maxprot; 16187077c426SJohn Baldwin 1619987ff181SKonstantin Belousov /* See comment in vn_mmap(). */ 1620987ff181SKonstantin Belousov if ( 1621987ff181SKonstantin Belousov #ifdef _LP64 1622987ff181SKonstantin Belousov objsize > OFF_MAX || 1623987ff181SKonstantin Belousov #endif 1624f9cc8410SEric van Gyzen foff > OFF_MAX - objsize) { 1625af755d3eSKyle Evans error = EINVAL; 1626af755d3eSKyle Evans goto out; 1627af755d3eSKyle Evans } 1628987ff181SKonstantin Belousov 16297077c426SJohn Baldwin #ifdef MAC 16307077c426SJohn Baldwin error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, flags); 16317077c426SJohn Baldwin if (error != 0) 1632af755d3eSKyle Evans goto out; 16337077c426SJohn Baldwin #endif 16348e38aeffSJohn Baldwin 16358e38aeffSJohn Baldwin mtx_lock(&shm_timestamp_lock); 16368e38aeffSJohn Baldwin vfs_timestamp(&shmfd->shm_atime); 16378e38aeffSJohn Baldwin mtx_unlock(&shm_timestamp_lock); 16388e38aeffSJohn Baldwin vm_object_reference(shmfd->shm_object); 16397077c426SJohn Baldwin 1640d301b358SKonstantin Belousov if (shm_largepage(shmfd)) { 164179783634SKonstantin Belousov writecnt = false; 1642d301b358SKonstantin Belousov error = shm_mmap_large(shmfd, map, addr, objsize, prot, 164379783634SKonstantin Belousov maxprot, flags, foff, td); 1644d301b358SKonstantin Belousov } else { 164579783634SKonstantin Belousov if (writecnt) { 164679783634SKonstantin Belousov vm_pager_update_writecount(shmfd->shm_object, 0, 164779783634SKonstantin Belousov objsize); 164879783634SKonstantin Belousov } 16497077c426SJohn Baldwin error = vm_mmap_object(map, addr, objsize, prot, maxprot, flags, 1650dca52ab4SKyle Evans shmfd->shm_object, foff, writecnt, td); 1651d301b358SKonstantin Belousov } 1652dca52ab4SKyle Evans if (error != 0) { 1653dca52ab4SKyle Evans if (writecnt) 1654dca52ab4SKyle Evans vm_pager_release_writecount(shmfd->shm_object, 0, 1655dca52ab4SKyle 
Evans objsize); 16567077c426SJohn Baldwin vm_object_deallocate(shmfd->shm_object); 1657dca52ab4SKyle Evans } 1658af755d3eSKyle Evans out: 1659af755d3eSKyle Evans rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); 166034d3e89fSKonstantin Belousov return (error); 16618e38aeffSJohn Baldwin } 16629c00bb91SKonstantin Belousov 16639c00bb91SKonstantin Belousov static int 16649c00bb91SKonstantin Belousov shm_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, 16659c00bb91SKonstantin Belousov struct thread *td) 16669c00bb91SKonstantin Belousov { 16679c00bb91SKonstantin Belousov struct shmfd *shmfd; 16689c00bb91SKonstantin Belousov int error; 16699c00bb91SKonstantin Belousov 16709c00bb91SKonstantin Belousov error = 0; 16719c00bb91SKonstantin Belousov shmfd = fp->f_data; 16729c00bb91SKonstantin Belousov mtx_lock(&shm_timestamp_lock); 16739c00bb91SKonstantin Belousov /* 16749c00bb91SKonstantin Belousov * SUSv4 says that x bits of permission need not be affected. 16759c00bb91SKonstantin Belousov * Be consistent with our shm_open there. 16769c00bb91SKonstantin Belousov */ 16779c00bb91SKonstantin Belousov #ifdef MAC 16789c00bb91SKonstantin Belousov error = mac_posixshm_check_setmode(active_cred, shmfd, mode); 16799c00bb91SKonstantin Belousov if (error != 0) 16809c00bb91SKonstantin Belousov goto out; 16819c00bb91SKonstantin Belousov #endif 1682d292b194SMateusz Guzik error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid, 1683d292b194SMateusz Guzik VADMIN, active_cred); 16849c00bb91SKonstantin Belousov if (error != 0) 16859c00bb91SKonstantin Belousov goto out; 16869c00bb91SKonstantin Belousov shmfd->shm_mode = mode & ACCESSPERMS; 16879c00bb91SKonstantin Belousov out: 16889c00bb91SKonstantin Belousov mtx_unlock(&shm_timestamp_lock); 16899c00bb91SKonstantin Belousov return (error); 16909c00bb91SKonstantin Belousov } 16919c00bb91SKonstantin Belousov 16929c00bb91SKonstantin Belousov static int 16939c00bb91SKonstantin Belousov shm_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, 16949c00bb91SKonstantin Belousov struct thread *td) 16959c00bb91SKonstantin Belousov { 16969c00bb91SKonstantin Belousov struct shmfd *shmfd; 16979c00bb91SKonstantin Belousov int error; 16989c00bb91SKonstantin Belousov 169968889ed6SKonstantin Belousov error = 0; 17009c00bb91SKonstantin Belousov shmfd = fp->f_data; 17019c00bb91SKonstantin Belousov mtx_lock(&shm_timestamp_lock); 17029c00bb91SKonstantin Belousov #ifdef MAC 17039c00bb91SKonstantin Belousov error = mac_posixshm_check_setowner(active_cred, shmfd, uid, gid); 17049c00bb91SKonstantin Belousov if (error != 0) 17059c00bb91SKonstantin Belousov goto out; 17069c00bb91SKonstantin Belousov #endif 17079c00bb91SKonstantin Belousov if (uid == (uid_t)-1) 17089c00bb91SKonstantin Belousov uid = shmfd->shm_uid; 17099c00bb91SKonstantin Belousov if (gid == (gid_t)-1) 17109c00bb91SKonstantin Belousov gid = shmfd->shm_gid; 17119c00bb91SKonstantin Belousov if (((uid != shmfd->shm_uid && uid != active_cred->cr_uid) || 17129c00bb91SKonstantin Belousov (gid != shmfd->shm_gid && !groupmember(gid, active_cred))) && 1713cc426dd3SMateusz Guzik (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN))) 17149c00bb91SKonstantin Belousov goto out; 17159c00bb91SKonstantin Belousov shmfd->shm_uid = uid; 17169c00bb91SKonstantin Belousov shmfd->shm_gid = gid; 17179c00bb91SKonstantin Belousov out: 17189c00bb91SKonstantin Belousov mtx_unlock(&shm_timestamp_lock); 17199c00bb91SKonstantin Belousov return (error); 17209c00bb91SKonstantin Belousov } 
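/*
 * Illustrative userland sketch (not compiled as part of this file): the
 * interaction between a shared writable mapping set up by shm_mmap() above
 * and a later F_SEAL_WRITE request handled by shm_add_seals() below.  While
 * a writable MAP_SHARED mapping exists, the object's writemappings count is
 * non-zero and sealing for write fails with EBUSY; once the mapping is
 * removed, the seal can be applied.  Error handling is trimmed and the
 * function name is arbitrary.
 *
 *	#include <sys/mman.h>
 *	#include <assert.h>
 *	#include <errno.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	static void
 *	write_seal_example(void)
 *	{
 *		char *p;
 *		int fd;
 *
 *		fd = memfd_create("example", MFD_CLOEXEC | MFD_ALLOW_SEALING);
 *		assert(fd >= 0);
 *		assert(ftruncate(fd, getpagesize()) == 0);
 *
 *		p = mmap(NULL, getpagesize(), PROT_READ | PROT_WRITE,
 *		    MAP_SHARED, fd, 0);
 *		assert(p != MAP_FAILED);
 *
 *		// Shared writable mapping exists: the write seal is refused.
 *		assert(fcntl(fd, F_ADD_SEALS, F_SEAL_WRITE) == -1);
 *		assert(errno == EBUSY);
 *
 *		// Drop the mapping; the seal can then be applied.
 *		assert(munmap(p, getpagesize()) == 0);
 *		assert(fcntl(fd, F_ADD_SEALS, F_SEAL_WRITE) == 0);
 *
 *		close(fd);
 *	}
 */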
1721fb680e16SJohn Baldwin 1722fb680e16SJohn Baldwin /* 1723fb680e16SJohn Baldwin * Helper routines to allow the backing object of a shared memory file 1724fb680e16SJohn Baldwin * descriptor to be mapped in the kernel. 1725fb680e16SJohn Baldwin */ 1726fb680e16SJohn Baldwin int 1727fb680e16SJohn Baldwin shm_map(struct file *fp, size_t size, off_t offset, void **memp) 1728fb680e16SJohn Baldwin { 1729fb680e16SJohn Baldwin struct shmfd *shmfd; 1730fb680e16SJohn Baldwin vm_offset_t kva, ofs; 1731fb680e16SJohn Baldwin vm_object_t obj; 1732fb680e16SJohn Baldwin int rv; 1733fb680e16SJohn Baldwin 1734fb680e16SJohn Baldwin if (fp->f_type != DTYPE_SHM) 1735fb680e16SJohn Baldwin return (EINVAL); 1736fb680e16SJohn Baldwin shmfd = fp->f_data; 1737fb680e16SJohn Baldwin obj = shmfd->shm_object; 173889f6b863SAttilio Rao VM_OBJECT_WLOCK(obj); 1739fb680e16SJohn Baldwin /* 1740fb680e16SJohn Baldwin * XXXRW: This validation is probably insufficient, and subject to 1741fb680e16SJohn Baldwin * sign errors. It should be fixed. 1742fb680e16SJohn Baldwin */ 1743fb680e16SJohn Baldwin if (offset >= shmfd->shm_size || 1744fb680e16SJohn Baldwin offset + size > round_page(shmfd->shm_size)) { 174589f6b863SAttilio Rao VM_OBJECT_WUNLOCK(obj); 1746fb680e16SJohn Baldwin return (EINVAL); 1747fb680e16SJohn Baldwin } 1748fb680e16SJohn Baldwin 1749fb680e16SJohn Baldwin shmfd->shm_kmappings++; 1750fb680e16SJohn Baldwin vm_object_reference_locked(obj); 175189f6b863SAttilio Rao VM_OBJECT_WUNLOCK(obj); 1752fb680e16SJohn Baldwin 1753fb680e16SJohn Baldwin /* Map the object into the kernel_map and wire it. */ 1754fb680e16SJohn Baldwin kva = vm_map_min(kernel_map); 1755fb680e16SJohn Baldwin ofs = offset & PAGE_MASK; 1756fb680e16SJohn Baldwin offset = trunc_page(offset); 1757fb680e16SJohn Baldwin size = round_page(size + ofs); 1758edb572a3SJohn Baldwin rv = vm_map_find(kernel_map, obj, offset, &kva, size, 0, 17595e3a17c0SJohn Baldwin VMFS_OPTIMAL_SPACE, VM_PROT_READ | VM_PROT_WRITE, 1760fb680e16SJohn Baldwin VM_PROT_READ | VM_PROT_WRITE, 0); 1761fb680e16SJohn Baldwin if (rv == KERN_SUCCESS) { 1762fb680e16SJohn Baldwin rv = vm_map_wire(kernel_map, kva, kva + size, 1763fb680e16SJohn Baldwin VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); 1764fb680e16SJohn Baldwin if (rv == KERN_SUCCESS) { 1765fb680e16SJohn Baldwin *memp = (void *)(kva + ofs); 1766fb680e16SJohn Baldwin return (0); 1767fb680e16SJohn Baldwin } 1768fb680e16SJohn Baldwin vm_map_remove(kernel_map, kva, kva + size); 1769fb680e16SJohn Baldwin } else 1770fb680e16SJohn Baldwin vm_object_deallocate(obj); 1771fb680e16SJohn Baldwin 1772fb680e16SJohn Baldwin /* On failure, drop our mapping reference. */ 177389f6b863SAttilio Rao VM_OBJECT_WLOCK(obj); 1774fb680e16SJohn Baldwin shmfd->shm_kmappings--; 177589f6b863SAttilio Rao VM_OBJECT_WUNLOCK(obj); 1776fb680e16SJohn Baldwin 1777338e7cf2SJohn Baldwin return (vm_mmap_to_errno(rv)); 1778fb680e16SJohn Baldwin } 1779fb680e16SJohn Baldwin 1780fb680e16SJohn Baldwin /* 1781fb680e16SJohn Baldwin * We require the caller to unmap the entire entry. This allows us to 1782fb680e16SJohn Baldwin * safely decrement shm_kmappings when a mapping is removed. 
1783fb680e16SJohn Baldwin */ 1784fb680e16SJohn Baldwin int 1785fb680e16SJohn Baldwin shm_unmap(struct file *fp, void *mem, size_t size) 1786fb680e16SJohn Baldwin { 1787fb680e16SJohn Baldwin struct shmfd *shmfd; 1788fb680e16SJohn Baldwin vm_map_entry_t entry; 1789fb680e16SJohn Baldwin vm_offset_t kva, ofs; 1790fb680e16SJohn Baldwin vm_object_t obj; 1791fb680e16SJohn Baldwin vm_pindex_t pindex; 1792fb680e16SJohn Baldwin vm_prot_t prot; 1793fb680e16SJohn Baldwin boolean_t wired; 1794fb680e16SJohn Baldwin vm_map_t map; 1795fb680e16SJohn Baldwin int rv; 1796fb680e16SJohn Baldwin 1797fb680e16SJohn Baldwin if (fp->f_type != DTYPE_SHM) 1798fb680e16SJohn Baldwin return (EINVAL); 1799fb680e16SJohn Baldwin shmfd = fp->f_data; 1800fb680e16SJohn Baldwin kva = (vm_offset_t)mem; 1801fb680e16SJohn Baldwin ofs = kva & PAGE_MASK; 1802fb680e16SJohn Baldwin kva = trunc_page(kva); 1803fb680e16SJohn Baldwin size = round_page(size + ofs); 1804fb680e16SJohn Baldwin map = kernel_map; 1805fb680e16SJohn Baldwin rv = vm_map_lookup(&map, kva, VM_PROT_READ | VM_PROT_WRITE, &entry, 1806fb680e16SJohn Baldwin &obj, &pindex, &prot, &wired); 1807fb680e16SJohn Baldwin if (rv != KERN_SUCCESS) 1808fb680e16SJohn Baldwin return (EINVAL); 1809fb680e16SJohn Baldwin if (entry->start != kva || entry->end != kva + size) { 1810fb680e16SJohn Baldwin vm_map_lookup_done(map, entry); 1811fb680e16SJohn Baldwin return (EINVAL); 1812fb680e16SJohn Baldwin } 1813fb680e16SJohn Baldwin vm_map_lookup_done(map, entry); 1814fb680e16SJohn Baldwin if (obj != shmfd->shm_object) 1815fb680e16SJohn Baldwin return (EINVAL); 1816fb680e16SJohn Baldwin vm_map_remove(map, kva, kva + size); 181789f6b863SAttilio Rao VM_OBJECT_WLOCK(obj); 1818fb680e16SJohn Baldwin KASSERT(shmfd->shm_kmappings > 0, ("shm_unmap: object not mapped")); 1819fb680e16SJohn Baldwin shmfd->shm_kmappings--; 182089f6b863SAttilio Rao VM_OBJECT_WUNLOCK(obj); 1821fb680e16SJohn Baldwin return (0); 1822fb680e16SJohn Baldwin } 1823e506e182SJohn Baldwin 18249696feebSJohn Baldwin static int 182556d0e33eSKonstantin Belousov shm_fill_kinfo_locked(struct shmfd *shmfd, struct kinfo_file *kif, bool list) 1826e506e182SJohn Baldwin { 1827cc7b259aSJamie Gritton const char *path, *pr_path; 1828cc7b259aSJamie Gritton size_t pr_pathlen; 182956d0e33eSKonstantin Belousov bool visible; 1830e506e182SJohn Baldwin 183156d0e33eSKonstantin Belousov sx_assert(&shm_dict_lock, SA_LOCKED); 18329696feebSJohn Baldwin kif->kf_type = KF_TYPE_SHM; 183356d0e33eSKonstantin Belousov kif->kf_un.kf_file.kf_file_mode = S_IFREG | shmfd->shm_mode; 18349696feebSJohn Baldwin kif->kf_un.kf_file.kf_file_size = shmfd->shm_size; 18359696feebSJohn Baldwin if (shmfd->shm_path != NULL) { 183644c16975SJamie Gritton if (shmfd->shm_path != NULL) { 1837cc7b259aSJamie Gritton path = shmfd->shm_path; 1838cc7b259aSJamie Gritton pr_path = curthread->td_ucred->cr_prison->pr_path; 183944c16975SJamie Gritton if (strcmp(pr_path, "/") != 0) { 184044c16975SJamie Gritton /* Return the jail-rooted pathname. 
*/ 1841cc7b259aSJamie Gritton pr_pathlen = strlen(pr_path); 184256d0e33eSKonstantin Belousov visible = strncmp(path, pr_path, pr_pathlen) 184356d0e33eSKonstantin Belousov == 0 && path[pr_pathlen] == '/'; 184456d0e33eSKonstantin Belousov if (list && !visible) 184556d0e33eSKonstantin Belousov return (EPERM); 184656d0e33eSKonstantin Belousov if (visible) 1847cc7b259aSJamie Gritton path += pr_pathlen; 1848cc7b259aSJamie Gritton } 1849cc7b259aSJamie Gritton strlcpy(kif->kf_path, path, sizeof(kif->kf_path)); 1850cc7b259aSJamie Gritton } 1851e506e182SJohn Baldwin } 18529696feebSJohn Baldwin return (0); 18539696feebSJohn Baldwin } 185456d0e33eSKonstantin Belousov 185556d0e33eSKonstantin Belousov static int 185656d0e33eSKonstantin Belousov shm_fill_kinfo(struct file *fp, struct kinfo_file *kif, 185756d0e33eSKonstantin Belousov struct filedesc *fdp __unused) 185856d0e33eSKonstantin Belousov { 185956d0e33eSKonstantin Belousov int res; 186056d0e33eSKonstantin Belousov 186156d0e33eSKonstantin Belousov sx_slock(&shm_dict_lock); 186256d0e33eSKonstantin Belousov res = shm_fill_kinfo_locked(fp->f_data, kif, false); 186356d0e33eSKonstantin Belousov sx_sunlock(&shm_dict_lock); 186456d0e33eSKonstantin Belousov return (res); 186556d0e33eSKonstantin Belousov } 186656d0e33eSKonstantin Belousov 186756d0e33eSKonstantin Belousov static int 1868af755d3eSKyle Evans shm_add_seals(struct file *fp, int seals) 1869af755d3eSKyle Evans { 1870af755d3eSKyle Evans struct shmfd *shmfd; 1871af755d3eSKyle Evans void *rl_cookie; 1872af755d3eSKyle Evans vm_ooffset_t writemappings; 1873af755d3eSKyle Evans int error, nseals; 1874af755d3eSKyle Evans 1875af755d3eSKyle Evans error = 0; 1876af755d3eSKyle Evans shmfd = fp->f_data; 1877af755d3eSKyle Evans rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX, 1878af755d3eSKyle Evans &shmfd->shm_mtx); 1879af755d3eSKyle Evans 1880af755d3eSKyle Evans /* Even already-set seals should result in EPERM. */ 1881af755d3eSKyle Evans if ((shmfd->shm_seals & F_SEAL_SEAL) != 0) { 1882af755d3eSKyle Evans error = EPERM; 1883af755d3eSKyle Evans goto out; 1884af755d3eSKyle Evans } 1885af755d3eSKyle Evans nseals = seals & ~shmfd->shm_seals; 1886af755d3eSKyle Evans if ((nseals & F_SEAL_WRITE) != 0) { 188779783634SKonstantin Belousov if (shm_largepage(shmfd)) { 188879783634SKonstantin Belousov error = ENOTSUP; 188979783634SKonstantin Belousov goto out; 189079783634SKonstantin Belousov } 189179783634SKonstantin Belousov 1892af755d3eSKyle Evans /* 1893af755d3eSKyle Evans * The rangelock above prevents writable mappings from being 1894af755d3eSKyle Evans * added after we've started applying seals. The RLOCK here 1895af755d3eSKyle Evans * is to avoid torn reads on ILP32 arches as unmapping/reducing 1896af755d3eSKyle Evans * writemappings will be done without a rangelock. 
1897af755d3eSKyle Evans */ 1898af755d3eSKyle Evans VM_OBJECT_RLOCK(shmfd->shm_object); 1899af755d3eSKyle Evans writemappings = shmfd->shm_object->un_pager.swp.writemappings; 1900af755d3eSKyle Evans VM_OBJECT_RUNLOCK(shmfd->shm_object); 1901af755d3eSKyle Evans /* kmappings are also writable */ 1902af755d3eSKyle Evans if (writemappings > 0) { 1903af755d3eSKyle Evans error = EBUSY; 1904af755d3eSKyle Evans goto out; 1905af755d3eSKyle Evans } 1906af755d3eSKyle Evans } 1907af755d3eSKyle Evans shmfd->shm_seals |= nseals; 1908af755d3eSKyle Evans out: 1909af755d3eSKyle Evans rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); 1910af755d3eSKyle Evans return (error); 1911af755d3eSKyle Evans } 1912af755d3eSKyle Evans 1913af755d3eSKyle Evans static int 1914af755d3eSKyle Evans shm_get_seals(struct file *fp, int *seals) 1915af755d3eSKyle Evans { 1916af755d3eSKyle Evans struct shmfd *shmfd; 1917af755d3eSKyle Evans 1918af755d3eSKyle Evans shmfd = fp->f_data; 1919af755d3eSKyle Evans *seals = shmfd->shm_seals; 1920af755d3eSKyle Evans return (0); 1921af755d3eSKyle Evans } 1922af755d3eSKyle Evans 1923af755d3eSKyle Evans static int 1924454bc887SKa Ho Ng shm_deallocate(struct shmfd *shmfd, off_t *offset, off_t *length, int flags) 1925454bc887SKa Ho Ng { 1926454bc887SKa Ho Ng vm_object_t object; 1927454bc887SKa Ho Ng vm_pindex_t pistart, pi, piend; 1928454bc887SKa Ho Ng vm_ooffset_t off, len; 1929454bc887SKa Ho Ng int startofs, endofs, end; 1930454bc887SKa Ho Ng int error; 1931454bc887SKa Ho Ng 1932454bc887SKa Ho Ng off = *offset; 1933454bc887SKa Ho Ng len = *length; 1934454bc887SKa Ho Ng KASSERT(off + len <= (vm_ooffset_t)OFF_MAX, ("off + len overflows")); 19351eaa3652SKa Ho Ng if (off + len > shmfd->shm_size) 19361eaa3652SKa Ho Ng len = shmfd->shm_size - off; 1937454bc887SKa Ho Ng object = shmfd->shm_object; 1938454bc887SKa Ho Ng startofs = off & PAGE_MASK; 1939454bc887SKa Ho Ng endofs = (off + len) & PAGE_MASK; 1940454bc887SKa Ho Ng pistart = OFF_TO_IDX(off); 1941454bc887SKa Ho Ng piend = OFF_TO_IDX(off + len); 1942454bc887SKa Ho Ng pi = OFF_TO_IDX(off + PAGE_MASK); 1943454bc887SKa Ho Ng error = 0; 1944454bc887SKa Ho Ng 19455c1428d2SKa Ho Ng /* Handle the case when offset is on or beyond shm size. */ 19465c1428d2SKa Ho Ng if ((off_t)len <= 0) { 19471eaa3652SKa Ho Ng *length = 0; 19481eaa3652SKa Ho Ng return (0); 19491eaa3652SKa Ho Ng } 19501eaa3652SKa Ho Ng 1951454bc887SKa Ho Ng VM_OBJECT_WLOCK(object); 1952454bc887SKa Ho Ng 1953454bc887SKa Ho Ng if (startofs != 0) { 1954454bc887SKa Ho Ng end = pistart != piend ? 
PAGE_SIZE : endofs; 1955454bc887SKa Ho Ng error = shm_partial_page_invalidate(object, pistart, startofs, 1956454bc887SKa Ho Ng end); 1957454bc887SKa Ho Ng if (error) 1958454bc887SKa Ho Ng goto out; 1959454bc887SKa Ho Ng off += end - startofs; 1960454bc887SKa Ho Ng len -= end - startofs; 1961454bc887SKa Ho Ng } 1962454bc887SKa Ho Ng 1963454bc887SKa Ho Ng if (pi < piend) { 1964454bc887SKa Ho Ng vm_object_page_remove(object, pi, piend, 0); 1965454bc887SKa Ho Ng off += IDX_TO_OFF(piend - pi); 1966454bc887SKa Ho Ng len -= IDX_TO_OFF(piend - pi); 1967454bc887SKa Ho Ng } 1968454bc887SKa Ho Ng 1969454bc887SKa Ho Ng if (endofs != 0 && pistart != piend) { 1970454bc887SKa Ho Ng error = shm_partial_page_invalidate(object, piend, 0, endofs); 1971454bc887SKa Ho Ng if (error) 1972454bc887SKa Ho Ng goto out; 1973454bc887SKa Ho Ng off += endofs; 1974454bc887SKa Ho Ng len -= endofs; 1975454bc887SKa Ho Ng } 1976454bc887SKa Ho Ng 1977454bc887SKa Ho Ng out: 1978454bc887SKa Ho Ng VM_OBJECT_WUNLOCK(shmfd->shm_object); 1979454bc887SKa Ho Ng *offset = off; 1980454bc887SKa Ho Ng *length = len; 1981454bc887SKa Ho Ng return (error); 1982454bc887SKa Ho Ng } 1983454bc887SKa Ho Ng 1984454bc887SKa Ho Ng static int 1985454bc887SKa Ho Ng shm_fspacectl(struct file *fp, int cmd, off_t *offset, off_t *length, int flags, 1986454bc887SKa Ho Ng struct ucred *active_cred, struct thread *td) 1987454bc887SKa Ho Ng { 1988454bc887SKa Ho Ng void *rl_cookie; 1989454bc887SKa Ho Ng struct shmfd *shmfd; 1990454bc887SKa Ho Ng off_t off, len; 1991454bc887SKa Ho Ng int error; 1992454bc887SKa Ho Ng 1993454bc887SKa Ho Ng /* This assumes that the caller already checked for overflow. */ 1994454bc887SKa Ho Ng error = EINVAL; 1995454bc887SKa Ho Ng shmfd = fp->f_data; 1996454bc887SKa Ho Ng off = *offset; 1997454bc887SKa Ho Ng len = *length; 1998454bc887SKa Ho Ng 1999454bc887SKa Ho Ng if (cmd != SPACECTL_DEALLOC || off < 0 || len <= 0 || 2000454bc887SKa Ho Ng len > OFF_MAX - off || flags != 0) 2001454bc887SKa Ho Ng return (EINVAL); 2002454bc887SKa Ho Ng 2003454bc887SKa Ho Ng rl_cookie = rangelock_wlock(&shmfd->shm_rl, off, off + len, 2004454bc887SKa Ho Ng &shmfd->shm_mtx); 2005454bc887SKa Ho Ng switch (cmd) { 2006454bc887SKa Ho Ng case SPACECTL_DEALLOC: 2007454bc887SKa Ho Ng if ((shmfd->shm_seals & F_SEAL_WRITE) != 0) { 2008454bc887SKa Ho Ng error = EPERM; 2009454bc887SKa Ho Ng break; 2010454bc887SKa Ho Ng } 2011454bc887SKa Ho Ng error = shm_deallocate(shmfd, &off, &len, flags); 2012454bc887SKa Ho Ng *offset = off; 2013454bc887SKa Ho Ng *length = len; 2014454bc887SKa Ho Ng break; 2015454bc887SKa Ho Ng default: 2016454bc887SKa Ho Ng __assert_unreachable(); 2017454bc887SKa Ho Ng } 2018454bc887SKa Ho Ng rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); 2019454bc887SKa Ho Ng return (error); 2020454bc887SKa Ho Ng } 2021454bc887SKa Ho Ng 2022454bc887SKa Ho Ng 2023454bc887SKa Ho Ng static int 2024f1040532SKyle Evans shm_fallocate(struct file *fp, off_t offset, off_t len, struct thread *td) 2025f1040532SKyle Evans { 2026f1040532SKyle Evans void *rl_cookie; 2027f1040532SKyle Evans struct shmfd *shmfd; 2028f1040532SKyle Evans size_t size; 2029f1040532SKyle Evans int error; 2030f1040532SKyle Evans 2031f1040532SKyle Evans /* This assumes that the caller already checked for overflow. 
*/ 2032f1040532SKyle Evans error = 0; 2033f1040532SKyle Evans shmfd = fp->f_data; 2034f1040532SKyle Evans size = offset + len; 203539eae263SKyle Evans 203639eae263SKyle Evans /* 203739eae263SKyle Evans * Just grab the rangelock for the range that we may be attempting to 203839eae263SKyle Evans * grow, rather than blocking read/write for regions we won't be 203939eae263SKyle Evans * touching while this (potential) resize is in progress. Other 204039eae263SKyle Evans * attempts to resize the shmfd will have to take a write lock from 0 to 204139eae263SKyle Evans * OFF_MAX, so this being potentially beyond the current usable range of 204239eae263SKyle Evans * the shmfd is not necessarily a concern. If other mechanisms are 204339eae263SKyle Evans * added to grow a shmfd, this may need to be re-evaluated. 204439eae263SKyle Evans */ 204539eae263SKyle Evans rl_cookie = rangelock_wlock(&shmfd->shm_rl, offset, size, 2046f1040532SKyle Evans &shmfd->shm_mtx); 2047d301b358SKonstantin Belousov if (size > shmfd->shm_size) 2048d301b358SKonstantin Belousov error = shm_dotruncate_cookie(shmfd, size, rl_cookie); 2049f1040532SKyle Evans rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); 2050f1040532SKyle Evans /* Translate to posix_fallocate(2) return value as needed. */ 2051f1040532SKyle Evans if (error == ENOMEM) 2052f1040532SKyle Evans error = ENOSPC; 2053f1040532SKyle Evans return (error); 2054f1040532SKyle Evans } 2055f1040532SKyle Evans 2056f1040532SKyle Evans static int 205756d0e33eSKonstantin Belousov sysctl_posix_shm_list(SYSCTL_HANDLER_ARGS) 205856d0e33eSKonstantin Belousov { 205956d0e33eSKonstantin Belousov struct shm_mapping *shmm; 206056d0e33eSKonstantin Belousov struct sbuf sb; 206156d0e33eSKonstantin Belousov struct kinfo_file kif; 206256d0e33eSKonstantin Belousov u_long i; 206356d0e33eSKonstantin Belousov int error, error2; 206456d0e33eSKonstantin Belousov 206556d0e33eSKonstantin Belousov sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_file) * 5, req); 206656d0e33eSKonstantin Belousov sbuf_clear_flags(&sb, SBUF_INCLUDENUL); 206756d0e33eSKonstantin Belousov error = 0; 206856d0e33eSKonstantin Belousov sx_slock(&shm_dict_lock); 206956d0e33eSKonstantin Belousov for (i = 0; i < shm_hash + 1; i++) { 207056d0e33eSKonstantin Belousov LIST_FOREACH(shmm, &shm_dictionary[i], sm_link) { 207156d0e33eSKonstantin Belousov error = shm_fill_kinfo_locked(shmm->sm_shmfd, 207256d0e33eSKonstantin Belousov &kif, true); 2073747a4726SJamie Gritton if (error == EPERM) { 2074747a4726SJamie Gritton error = 0; 207556d0e33eSKonstantin Belousov continue; 2076747a4726SJamie Gritton } 207756d0e33eSKonstantin Belousov if (error != 0) 207856d0e33eSKonstantin Belousov break; 207956d0e33eSKonstantin Belousov pack_kinfo(&kif); 208056d0e33eSKonstantin Belousov error = sbuf_bcat(&sb, &kif, kif.kf_structsize) == 0 ? 208156d0e33eSKonstantin Belousov 0 : ENOMEM; 208256d0e33eSKonstantin Belousov if (error != 0) 208356d0e33eSKonstantin Belousov break; 208456d0e33eSKonstantin Belousov } 208556d0e33eSKonstantin Belousov } 208656d0e33eSKonstantin Belousov sx_sunlock(&shm_dict_lock); 208756d0e33eSKonstantin Belousov error2 = sbuf_finish(&sb); 208856d0e33eSKonstantin Belousov sbuf_delete(&sb); 208956d0e33eSKonstantin Belousov return (error != 0 ? 
error : error2);
209056d0e33eSKonstantin Belousov }
209156d0e33eSKonstantin Belousov
209256d0e33eSKonstantin Belousov SYSCTL_PROC(_kern_ipc, OID_AUTO, posix_shm_list,
2093d7c4ea7dSJamie Gritton     CTLFLAG_RD | CTLFLAG_PRISON | CTLFLAG_MPSAFE | CTLTYPE_OPAQUE,
209456d0e33eSKonstantin Belousov     NULL, 0, sysctl_posix_shm_list, "",
209556d0e33eSKonstantin Belousov     "POSIX SHM list");
209620f70576SKyle Evans
209720f70576SKyle Evans int
2098535b1df9SKyle Evans kern_shm_open(struct thread *td, const char *path, int flags, mode_t mode,
2099535b1df9SKyle Evans     struct filecaps *caps)
210020f70576SKyle Evans {
210120f70576SKyle Evans
2102535b1df9SKyle Evans 	return (kern_shm_open2(td, path, flags, mode, 0, caps, NULL));
210320f70576SKyle Evans }
210420f70576SKyle Evans
210520f70576SKyle Evans /*
210620f70576SKyle Evans  * This version of the shm_open() interface leaves CLOEXEC behavior up to the
210720f70576SKyle Evans  * caller, and libc will enforce it for the traditional shm_open() call. This
210820f70576SKyle Evans  * allows other consumers, like memfd_create(), to opt in to CLOEXEC. This
210920f70576SKyle Evans  * interface also includes a 'name' argument that is currently unused, but could
211020f70576SKyle Evans  * potentially be exported later via some interface for debugging purposes.
211120f70576SKyle Evans  * From the kernel's perspective, it is optional. Individual consumers like
211220f70576SKyle Evans  * memfd_create() may require it in order to be compatible with other systems
211320f70576SKyle Evans  * implementing the same function.
211420f70576SKyle Evans  */
211520f70576SKyle Evans int
211620f70576SKyle Evans sys_shm_open2(struct thread *td, struct shm_open2_args *uap)
211720f70576SKyle Evans {
211820f70576SKyle Evans
211920f70576SKyle Evans 	return (kern_shm_open2(td, uap->path, uap->flags, uap->mode,
2120535b1df9SKyle Evans 	    uap->shmflags, NULL, uap->name));
212120f70576SKyle Evans }
2122
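/*
 * Illustrative userland sketch (not compiled as part of this file): both
 * entry points above funnel into kern_shm_open2().  shm_open(2) always
 * yields a close-on-exec descriptor and may give the object a path name
 * that other processes can look up until it is unlinked, while
 * memfd_create() always creates an anonymous object and chooses CLOEXEC
 * itself via MFD_CLOEXEC.  Error handling is trimmed and the path below is
 * arbitrary.
 *
 *	#include <sys/mman.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	static void
 *	shm_open_example(void)
 *	{
 *		int anon, named;
 *
 *		// Named object: stays in the shm dictionary until unlinked.
 *		named = shm_open("/example-obj", O_RDWR | O_CREAT, 0600);
 *		ftruncate(named, 8192);
 *		shm_unlink("/example-obj");
 *		close(named);
 *
 *		// Anonymous object: never enters the dictionary.
 *		anon = memfd_create("example", MFD_CLOEXEC);
 *		ftruncate(anon, 8192);
 *		close(anon);
 *	}
 */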