/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2015, Joyent, Inc. All rights reserved. * Copyright (c) 2016 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define SEGSPTADDR (caddr_t)0x0 /* * # pages used for spt */ size_t spt_used; /* * segspt_minfree is the memory left for system after ISM * locked its pages; it is set up to 5% of availrmem in * sptcreate when ISM is created. ISM should not use more * than ~90% of availrmem; if it does, then the performance * of the system may decrease. Machines with large memories may * be able to use up more memory for ISM so we set the default * segspt_minfree to 5% (which gives ISM max 95% of availrmem. * If somebody wants even more memory for ISM (risking hanging * the system) they can patch the segspt_minfree to smaller number. 
 */
pgcnt_t segspt_minfree = 0;

/*
 * Forward declarations for the operations of the underlying (dummy)
 * spt segment.  Only unmap, free and getpolicy are wired into segspt_ops
 * below; segspt_create is handed to as_map() by sptcreate().
 */
static int segspt_create(struct seg *seg, caddr_t argsp);
static int segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize);
static void segspt_free(struct seg *seg);
static void segspt_free_pages(struct seg *seg, caddr_t addr, size_t len);
static lgrp_mem_policy_info_t *segspt_getpolicy(struct seg *seg, caddr_t addr);

/*
 * Placeholder for every seg_ops slot that must never be invoked on the
 * spt segment; it panics unconditionally.
 */
static void
segspt_badop()
{
	panic("segspt_badop called");
	/*NOTREACHED*/
}

/*
 * Cast segspt_badop to a function pointer returning type `t' so that it
 * can be stored in any slot of the seg_ops vector.
 */
#define	SEGSPT_BADOP(t)	(t(*)())segspt_badop

/*
 * Operations vector of the spt segment itself.  The initializer is
 * positional: the order of the entries is the interface and must not be
 * changed.
 */
struct seg_ops segspt_ops = {
	SEGSPT_BADOP(int),		/* dup */
	segspt_unmap,			/* unmap */
	segspt_free,			/* free */
	SEGSPT_BADOP(int),		/* fault */
	SEGSPT_BADOP(faultcode_t),	/* faulta */
	SEGSPT_BADOP(int),		/* setprot */
	SEGSPT_BADOP(int),		/* checkprot */
	SEGSPT_BADOP(int),		/* kluster */
	SEGSPT_BADOP(size_t),		/* swapout */
	SEGSPT_BADOP(int),		/* sync */
	SEGSPT_BADOP(size_t),		/* incore */
	SEGSPT_BADOP(int),		/* lockop */
	SEGSPT_BADOP(int),		/* getprot */
	SEGSPT_BADOP(u_offset_t),	/* getoffset */
	SEGSPT_BADOP(int),		/* gettype */
	SEGSPT_BADOP(int),		/* getvp */
	SEGSPT_BADOP(int),		/* advise */
	SEGSPT_BADOP(void),		/* dump */
	SEGSPT_BADOP(int),		/* pagelock */
	SEGSPT_BADOP(int),		/* setpgsz */
	SEGSPT_BADOP(int),		/* getmemid */
	segspt_getpolicy,		/* getpolicy */
	SEGSPT_BADOP(int),		/* capable */
	seg_inherit_notsup		/* inherit */
};

/*
 * Forward declarations for the operations of the per-process shm segment
 * (the entries of segspt_shmops).
 */
static int segspt_shmdup(struct seg *seg, struct seg *newseg);
static int segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize);
static void segspt_shmfree(struct seg *seg);
static faultcode_t segspt_shmfault(struct hat *hat, struct seg *seg,
		caddr_t addr, size_t len, enum fault_type type, enum seg_rw rw);
static faultcode_t segspt_shmfaulta(struct seg *seg, caddr_t addr);
static int segspt_shmsetprot(register struct seg *seg, register caddr_t addr,
			register size_t len, register uint_t prot);
static int segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size,
			uint_t prot);
static int	segspt_shmkluster(struct seg *seg, caddr_t addr, ssize_t delta);
static size_t	segspt_shmswapout(struct seg *seg);
static size_t segspt_shmincore(struct seg *seg, caddr_t addr, size_t len, register char *vec); static int segspt_shmsync(struct seg *seg, register caddr_t addr, size_t len, int attr, uint_t flags); static int segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len, int attr, int op, ulong_t *lockmap, size_t pos); static int segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv); static u_offset_t segspt_shmgetoffset(struct seg *seg, caddr_t addr); static int segspt_shmgettype(struct seg *seg, caddr_t addr); static int segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp); static int segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len, uint_t behav); static void segspt_shmdump(struct seg *seg); static int segspt_shmpagelock(struct seg *, caddr_t, size_t, struct page ***, enum lock_type, enum seg_rw); static int segspt_shmsetpgsz(struct seg *, caddr_t, size_t, uint_t); static int segspt_shmgetmemid(struct seg *, caddr_t, memid_t *); static lgrp_mem_policy_info_t *segspt_shmgetpolicy(struct seg *, caddr_t); static int segspt_shmcapable(struct seg *, segcapability_t); struct seg_ops segspt_shmops = { segspt_shmdup, segspt_shmunmap, segspt_shmfree, segspt_shmfault, segspt_shmfaulta, segspt_shmsetprot, segspt_shmcheckprot, segspt_shmkluster, segspt_shmswapout, segspt_shmsync, segspt_shmincore, segspt_shmlockop, segspt_shmgetprot, segspt_shmgetoffset, segspt_shmgettype, segspt_shmgetvp, segspt_shmadvise, /* advise */ segspt_shmdump, segspt_shmpagelock, segspt_shmsetpgsz, segspt_shmgetmemid, segspt_shmgetpolicy, segspt_shmcapable, seg_inherit_notsup }; static void segspt_purge(struct seg *seg); static int segspt_reclaim(void *, caddr_t, size_t, struct page **, enum seg_rw, int); static int spt_anon_getpages(struct seg *seg, caddr_t addr, size_t len, page_t **ppa); /*ARGSUSED*/ int sptcreate(size_t size, struct seg **sptseg, struct anon_map *amp, uint_t prot, uint_t flags, uint_t share_szc) { int err; struct as *newas; struct 
segspt_crargs sptcargs; #ifdef DEBUG TNF_PROBE_1(sptcreate, "spt", /* CSTYLED */, tnf_ulong, size, size ); #endif if (segspt_minfree == 0) /* leave min 5% of availrmem for */ segspt_minfree = availrmem/20; /* for the system */ if (!hat_supported(HAT_SHARED_PT, (void *)0)) return (EINVAL); /* * get a new as for this shared memory segment */ newas = as_alloc(); newas->a_proc = NULL; sptcargs.amp = amp; sptcargs.prot = prot; sptcargs.flags = flags; sptcargs.szc = share_szc; /* * create a shared page table (spt) segment */ if (err = as_map(newas, SEGSPTADDR, size, segspt_create, &sptcargs)) { as_free(newas); return (err); } *sptseg = sptcargs.seg_spt; return (0); } void sptdestroy(struct as *as, struct anon_map *amp) { #ifdef DEBUG TNF_PROBE_0(sptdestroy, "spt", /* CSTYLED */); #endif (void) as_unmap(as, SEGSPTADDR, amp->size); as_free(as); } /* * called from seg_free(). * free (i.e., unlock, unmap, return to free list) * all the pages in the given seg. */ void segspt_free(struct seg *seg) { struct spt_data *sptd = (struct spt_data *)seg->s_data; ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as)); if (sptd != NULL) { if (sptd->spt_realsize) segspt_free_pages(seg, seg->s_base, sptd->spt_realsize); if (sptd->spt_ppa_lckcnt) { kmem_free(sptd->spt_ppa_lckcnt, sizeof (*sptd->spt_ppa_lckcnt) * btopr(sptd->spt_amp->size)); } kmem_free(sptd->spt_vp, sizeof (*sptd->spt_vp)); cv_destroy(&sptd->spt_cv); mutex_destroy(&sptd->spt_lock); kmem_free(sptd, sizeof (*sptd)); } } /*ARGSUSED*/ static int segspt_shmsync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) { ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); return (0); } /*ARGSUSED*/ static size_t segspt_shmincore(struct seg *seg, caddr_t addr, size_t len, char *vec) { caddr_t eo_seg; pgcnt_t npages; struct shm_data *shmd = (struct shm_data *)seg->s_data; struct seg *sptseg; struct spt_data *sptd; ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); #ifdef lint seg = seg; #endif sptseg = shmd->shm_sptseg; sptd = 
sptseg->s_data; if ((sptd->spt_flags & SHM_PAGEABLE) == 0) { eo_seg = addr + len; while (addr < eo_seg) { /* page exists, and it's locked. */ *vec++ = SEG_PAGE_INCORE | SEG_PAGE_LOCKED | SEG_PAGE_ANON; addr += PAGESIZE; } return (len); } else { struct anon_map *amp = shmd->shm_amp; struct anon *ap; page_t *pp; pgcnt_t anon_index; struct vnode *vp; u_offset_t off; ulong_t i; int ret; anon_sync_obj_t cookie; addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); anon_index = seg_page(seg, addr); npages = btopr(len); if (anon_index + npages > btopr(shmd->shm_amp->size)) { return (EINVAL); } ANON_LOCK_ENTER(&->a_rwlock, RW_READER); for (i = 0; i < npages; i++, anon_index++) { ret = 0; anon_array_enter(amp, anon_index, &cookie); ap = anon_get_ptr(amp->ahp, anon_index); if (ap != NULL) { swap_xlate(ap, &vp, &off); anon_array_exit(&cookie); pp = page_lookup_nowait(vp, off, SE_SHARED); if (pp != NULL) { ret |= SEG_PAGE_INCORE | SEG_PAGE_ANON; page_unlock(pp); } } else { anon_array_exit(&cookie); } if (shmd->shm_vpage[anon_index] & DISM_PG_LOCKED) { ret |= SEG_PAGE_LOCKED; } *vec++ = (char)ret; } ANON_LOCK_EXIT(&->a_rwlock); return (len); } } static int segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize) { size_t share_size; ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as)); /* * seg.s_size may have been rounded up to the largest page size * in shmat(). * XXX This should be cleanedup. sptdestroy should take a length * argument which should be the same as sptcreate. Then * this rounding would not be needed (or is done in shm.c) * Only the check for full segment will be needed. * * XXX -- shouldn't raddr == 0 always? These tests don't seem * to be useful at all. 
*/ share_size = page_get_pagesize(seg->s_szc); ssize = P2ROUNDUP(ssize, share_size); if (raddr == seg->s_base && ssize == seg->s_size) { seg_free(seg); return (0); } else return (EINVAL); } int segspt_create(struct seg *seg, caddr_t argsp) { int err; caddr_t addr = seg->s_base; struct spt_data *sptd; struct segspt_crargs *sptcargs = (struct segspt_crargs *)argsp; struct anon_map *amp = sptcargs->amp; struct kshmid *sp = amp->a_sp; struct cred *cred = CRED(); ulong_t i, j, anon_index = 0; pgcnt_t npages = btopr(amp->size); struct vnode *vp; page_t **ppa; uint_t hat_flags; size_t pgsz; pgcnt_t pgcnt; caddr_t a; pgcnt_t pidx; size_t sz; proc_t *procp = curproc; rctl_qty_t lockedbytes = 0; kproject_t *proj; /* * We are holding the a_lock on the underlying dummy as, * so we can make calls to the HAT layer. */ ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as)); ASSERT(sp != NULL); #ifdef DEBUG TNF_PROBE_2(segspt_create, "spt", /* CSTYLED */, tnf_opaque, addr, addr, tnf_ulong, len, seg->s_size); #endif if ((sptcargs->flags & SHM_PAGEABLE) == 0) { if (err = anon_swap_adjust(npages)) return (err); } err = ENOMEM; if ((sptd = kmem_zalloc(sizeof (*sptd), KM_NOSLEEP)) == NULL) goto out1; if ((sptcargs->flags & SHM_PAGEABLE) == 0) { if ((ppa = kmem_zalloc(((sizeof (page_t *)) * npages), KM_NOSLEEP)) == NULL) goto out2; } mutex_init(&sptd->spt_lock, NULL, MUTEX_DEFAULT, NULL); if ((vp = kmem_zalloc(sizeof (*vp), KM_NOSLEEP)) == NULL) goto out3; seg->s_ops = &segspt_ops; sptd->spt_vp = vp; sptd->spt_amp = amp; sptd->spt_prot = sptcargs->prot; sptd->spt_flags = sptcargs->flags; seg->s_data = (caddr_t)sptd; sptd->spt_ppa = NULL; sptd->spt_ppa_lckcnt = NULL; seg->s_szc = sptcargs->szc; cv_init(&sptd->spt_cv, NULL, CV_DEFAULT, NULL); sptd->spt_gen = 0; ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); if (seg->s_szc > amp->a_szc) { amp->a_szc = seg->s_szc; } ANON_LOCK_EXIT(&->a_rwlock); /* * Set policy to affect initial allocation of pages in * anon_map_createpages() */ (void) 
lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, amp, anon_index, NULL, 0, ptob(npages)); if (sptcargs->flags & SHM_PAGEABLE) { size_t share_sz; pgcnt_t new_npgs, more_pgs; struct anon_hdr *nahp; zone_t *zone; share_sz = page_get_pagesize(seg->s_szc); if (!IS_P2ALIGNED(amp->size, share_sz)) { /* * We are rounding up the size of the anon array * on 4 M boundary because we always create 4 M * of page(s) when locking, faulting pages and we * don't have to check for all corner cases e.g. * if there is enough space to allocate 4 M * page. */ new_npgs = btop(P2ROUNDUP(amp->size, share_sz)); more_pgs = new_npgs - npages; /* * The zone will never be NULL, as a fully created * shm always has an owning zone. */ zone = sp->shm_perm.ipc_zone_ref.zref_zone; ASSERT(zone != NULL); if (anon_resv_zone(ptob(more_pgs), zone) == 0) { err = ENOMEM; goto out4; } nahp = anon_create(new_npgs, ANON_SLEEP); ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); (void) anon_copy_ptr(amp->ahp, 0, nahp, 0, npages, ANON_SLEEP); anon_release(amp->ahp, npages); amp->ahp = nahp; ASSERT(amp->swresv == ptob(npages)); amp->swresv = amp->size = ptob(new_npgs); ANON_LOCK_EXIT(&->a_rwlock); npages = new_npgs; } sptd->spt_ppa_lckcnt = kmem_zalloc(npages * sizeof (*sptd->spt_ppa_lckcnt), KM_SLEEP); sptd->spt_pcachecnt = 0; sptd->spt_realsize = ptob(npages); sptcargs->seg_spt = seg; return (0); } /* * get array of pages for each anon slot in amp */ if ((err = anon_map_createpages(amp, anon_index, ptob(npages), ppa, seg, addr, S_CREATE, cred)) != 0) goto out4; mutex_enter(&sp->shm_mlock); /* May be partially locked, so, count bytes to charge for locking */ for (i = 0; i < npages; i++) if (ppa[i]->p_lckcnt == 0) lockedbytes += PAGESIZE; proj = sp->shm_perm.ipc_proj; if (lockedbytes > 0) { mutex_enter(&procp->p_lock); if (rctl_incr_locked_mem(procp, proj, lockedbytes, 0)) { mutex_exit(&procp->p_lock); mutex_exit(&sp->shm_mlock); for (i = 0; i < npages; i++) page_unlock(ppa[i]); err = ENOMEM; goto out4; } 
mutex_exit(&procp->p_lock); } /* * addr is initial address corresponding to the first page on ppa list */ for (i = 0; i < npages; i++) { /* attempt to lock all pages */ if (page_pp_lock(ppa[i], 0, 1) == 0) { /* * if unable to lock any page, unlock all * of them and return error */ for (j = 0; j < i; j++) page_pp_unlock(ppa[j], 0, 1); for (i = 0; i < npages; i++) page_unlock(ppa[i]); rctl_decr_locked_mem(NULL, proj, lockedbytes, 0); mutex_exit(&sp->shm_mlock); err = ENOMEM; goto out4; } } mutex_exit(&sp->shm_mlock); /* * Some platforms assume that ISM mappings are HAT_LOAD_LOCK * for the entire life of the segment. For example platforms * that do not support Dynamic Reconfiguration. */ hat_flags = HAT_LOAD_SHARE; if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, NULL)) hat_flags |= HAT_LOAD_LOCK; /* * Load translations one lare page at a time * to make sure we don't create mappings bigger than * segment's size code in case underlying pages * are shared with segvn's segment that uses bigger * size code than we do. */ pgsz = page_get_pagesize(seg->s_szc); pgcnt = page_get_pagecnt(seg->s_szc); for (a = addr, pidx = 0; pidx < npages; a += pgsz, pidx += pgcnt) { sz = MIN(pgsz, ptob(npages - pidx)); hat_memload_array(seg->s_as->a_hat, a, sz, &ppa[pidx], sptd->spt_prot, hat_flags); } /* * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP, * we will leave the pages locked SE_SHARED for the life * of the ISM segment. This will prevent any calls to * hat_pageunload() on this ISM segment for those platforms. */ if (!(hat_flags & HAT_LOAD_LOCK)) { /* * On platforms that support HAT_DYNAMIC_ISM_UNMAP, * we no longer need to hold the SE_SHARED lock on the pages, * since L_PAGELOCK and F_SOFTLOCK calls will grab the * SE_SHARED lock on the pages as necessary. 
*/ for (i = 0; i < npages; i++) page_unlock(ppa[i]); } sptd->spt_pcachecnt = 0; kmem_free(ppa, ((sizeof (page_t *)) * npages)); sptd->spt_realsize = ptob(npages); atomic_add_long(&spt_used, npages); sptcargs->seg_spt = seg; return (0); out4: seg->s_data = NULL; kmem_free(vp, sizeof (*vp)); cv_destroy(&sptd->spt_cv); out3: mutex_destroy(&sptd->spt_lock); if ((sptcargs->flags & SHM_PAGEABLE) == 0) kmem_free(ppa, (sizeof (*ppa) * npages)); out2: kmem_free(sptd, sizeof (*sptd)); out1: if ((sptcargs->flags & SHM_PAGEABLE) == 0) anon_swap_restore(npages); return (err); } /*ARGSUSED*/ void segspt_free_pages(struct seg *seg, caddr_t addr, size_t len) { struct page *pp; struct spt_data *sptd = (struct spt_data *)seg->s_data; pgcnt_t npages; ulong_t anon_idx; struct anon_map *amp; struct anon *ap; struct vnode *vp; u_offset_t off; uint_t hat_flags; int root = 0; pgcnt_t pgs, curnpgs = 0; page_t *rootpp; rctl_qty_t unlocked_bytes = 0; kproject_t *proj; kshmid_t *sp; ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as)); len = P2ROUNDUP(len, PAGESIZE); npages = btop(len); hat_flags = HAT_UNLOAD_UNLOCK | HAT_UNLOAD_UNMAP; if ((hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) || (sptd->spt_flags & SHM_PAGEABLE)) { hat_flags = HAT_UNLOAD_UNMAP; } hat_unload(seg->s_as->a_hat, addr, len, hat_flags); amp = sptd->spt_amp; if (sptd->spt_flags & SHM_PAGEABLE) npages = btop(amp->size); ASSERT(amp != NULL); if ((sptd->spt_flags & SHM_PAGEABLE) == 0) { sp = amp->a_sp; proj = sp->shm_perm.ipc_proj; mutex_enter(&sp->shm_mlock); } for (anon_idx = 0; anon_idx < npages; anon_idx++) { if ((sptd->spt_flags & SHM_PAGEABLE) == 0) { if ((ap = anon_get_ptr(amp->ahp, anon_idx)) == NULL) { panic("segspt_free_pages: null app"); /*NOTREACHED*/ } } else { if ((ap = anon_get_next_ptr(amp->ahp, &anon_idx)) == NULL) continue; } ASSERT(ANON_ISBUSY(anon_get_slot(amp->ahp, anon_idx)) == 0); swap_xlate(ap, &vp, &off); /* * If this platform supports HAT_DYNAMIC_ISM_UNMAP, * the pages won't be having SE_SHARED lock at 
this * point. * * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP, * the pages are still held SE_SHARED locked from the * original segspt_create() * * Our goal is to get SE_EXCL lock on each page, remove * permanent lock on it and invalidate the page. */ if ((sptd->spt_flags & SHM_PAGEABLE) == 0) { if (hat_flags == HAT_UNLOAD_UNMAP) pp = page_lookup(vp, off, SE_EXCL); else { if ((pp = page_find(vp, off)) == NULL) { panic("segspt_free_pages: " "page not locked"); /*NOTREACHED*/ } if (!page_tryupgrade(pp)) { page_unlock(pp); pp = page_lookup(vp, off, SE_EXCL); } } if (pp == NULL) { panic("segspt_free_pages: " "page not in the system"); /*NOTREACHED*/ } ASSERT(pp->p_lckcnt > 0); page_pp_unlock(pp, 0, 1); if (pp->p_lckcnt == 0) unlocked_bytes += PAGESIZE; } else { if ((pp = page_lookup(vp, off, SE_EXCL)) == NULL) continue; } /* * It's logical to invalidate the pages here as in most cases * these were created by segspt. */ if (pp->p_szc != 0) { if (root == 0) { ASSERT(curnpgs == 0); root = 1; rootpp = pp; pgs = curnpgs = page_get_pagecnt(pp->p_szc); ASSERT(pgs > 1); ASSERT(IS_P2ALIGNED(pgs, pgs)); ASSERT(!(page_pptonum(pp) & (pgs - 1))); curnpgs--; } else if ((page_pptonum(pp) & (pgs - 1)) == pgs - 1) { ASSERT(curnpgs == 1); ASSERT(page_pptonum(pp) == page_pptonum(rootpp) + (pgs - 1)); page_destroy_pages(rootpp); root = 0; curnpgs = 0; } else { ASSERT(curnpgs > 1); ASSERT(page_pptonum(pp) == page_pptonum(rootpp) + (pgs - curnpgs)); curnpgs--; } } else { if (root != 0 || curnpgs != 0) { panic("segspt_free_pages: bad large page"); /*NOTREACHED*/ } /* * Before destroying the pages, we need to take care * of the rctl locked memory accounting. For that * we need to calculte the unlocked_bytes. 
*/ if (pp->p_lckcnt > 0) unlocked_bytes += PAGESIZE; /*LINTED: constant in conditional context */ VN_DISPOSE(pp, B_INVAL, 0, kcred); } } if ((sptd->spt_flags & SHM_PAGEABLE) == 0) { if (unlocked_bytes > 0) rctl_decr_locked_mem(NULL, proj, unlocked_bytes, 0); mutex_exit(&sp->shm_mlock); } if (root != 0 || curnpgs != 0) { panic("segspt_free_pages: bad large page"); /*NOTREACHED*/ } /* * mark that pages have been released */ sptd->spt_realsize = 0; if ((sptd->spt_flags & SHM_PAGEABLE) == 0) { atomic_add_long(&spt_used, -npages); anon_swap_restore(npages); } } /* * Get memory allocation policy info for specified address in given segment */ static lgrp_mem_policy_info_t * segspt_getpolicy(struct seg *seg, caddr_t addr) { struct anon_map *amp; ulong_t anon_index; lgrp_mem_policy_info_t *policy_info; struct spt_data *spt_data; ASSERT(seg != NULL); /* * Get anon_map from segspt * * Assume that no lock needs to be held on anon_map, since * it should be protected by its reference count which must be * nonzero for an existing segment * Need to grab readers lock on policy tree though */ spt_data = (struct spt_data *)seg->s_data; if (spt_data == NULL) return (NULL); amp = spt_data->spt_amp; ASSERT(amp->refcnt != 0); /* * Get policy info * * Assume starting anon index of 0 */ anon_index = seg_page(seg, addr); policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0); return (policy_info); } /* * DISM only. * Return locked pages over a given range. * * We will cache all DISM locked pages and save the pplist for the * entire segment in the ppa field of the underlying DISM segment structure. * Later, during a call to segspt_reclaim() we will use this ppa array * to page_unlock() all of the pages and then we will free this ppa list. 
*/ /*ARGSUSED*/ static int segspt_dismpagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***ppp, enum lock_type type, enum seg_rw rw) { struct shm_data *shmd = (struct shm_data *)seg->s_data; struct seg *sptseg = shmd->shm_sptseg; struct spt_data *sptd = sptseg->s_data; pgcnt_t pg_idx, npages, tot_npages, npgs; struct page **pplist, **pl, **ppa, *pp; struct anon_map *amp; spgcnt_t an_idx; int ret = ENOTSUP; uint_t pl_built = 0; struct anon *ap; struct vnode *vp; u_offset_t off; pgcnt_t claim_availrmem = 0; uint_t szc; ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK); /* * We want to lock/unlock the entire ISM segment. Therefore, * we will be using the underlying sptseg and it's base address * and length for the caching arguments. */ ASSERT(sptseg); ASSERT(sptd); pg_idx = seg_page(seg, addr); npages = btopr(len); /* * check if the request is larger than number of pages covered * by amp */ if (pg_idx + npages > btopr(sptd->spt_amp->size)) { *ppp = NULL; return (ENOTSUP); } if (type == L_PAGEUNLOCK) { ASSERT(sptd->spt_ppa != NULL); seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size, sptd->spt_ppa, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim); /* * If someone is blocked while unmapping, we purge * segment page cache and thus reclaim pplist synchronously * without waiting for seg_pasync_thread. This speeds up * unmapping in cases where munmap(2) is called, while * raw async i/o is still in progress or where a thread * exits on data fault in a multithreaded application. */ if ((sptd->spt_flags & DISM_PPA_CHANGED) || (AS_ISUNMAPWAIT(seg->s_as) && shmd->shm_softlockcnt > 0)) { segspt_purge(seg); } return (0); } /* The L_PAGELOCK case ... */ if (sptd->spt_flags & DISM_PPA_CHANGED) { segspt_purge(seg); /* * for DISM ppa needs to be rebuild since * number of locked pages could be changed */ *ppp = NULL; return (ENOTSUP); } /* * First try to find pages in segment page cache, without * holding the segment lock. 
*/ pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size, S_WRITE, SEGP_FORCE_WIRED); if (pplist != NULL) { ASSERT(sptd->spt_ppa != NULL); ASSERT(sptd->spt_ppa == pplist); ppa = sptd->spt_ppa; for (an_idx = pg_idx; an_idx < pg_idx + npages; ) { if (ppa[an_idx] == NULL) { seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size, ppa, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim); *ppp = NULL; return (ENOTSUP); } if ((szc = ppa[an_idx]->p_szc) != 0) { npgs = page_get_pagecnt(szc); an_idx = P2ROUNDUP(an_idx + 1, npgs); } else { an_idx++; } } /* * Since we cache the entire DISM segment, we want to * set ppp to point to the first slot that corresponds * to the requested addr, i.e. pg_idx. */ *ppp = &(sptd->spt_ppa[pg_idx]); return (0); } mutex_enter(&sptd->spt_lock); /* * try to find pages in segment page cache with mutex */ pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size, S_WRITE, SEGP_FORCE_WIRED); if (pplist != NULL) { ASSERT(sptd->spt_ppa != NULL); ASSERT(sptd->spt_ppa == pplist); ppa = sptd->spt_ppa; for (an_idx = pg_idx; an_idx < pg_idx + npages; ) { if (ppa[an_idx] == NULL) { mutex_exit(&sptd->spt_lock); seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size, ppa, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim); *ppp = NULL; return (ENOTSUP); } if ((szc = ppa[an_idx]->p_szc) != 0) { npgs = page_get_pagecnt(szc); an_idx = P2ROUNDUP(an_idx + 1, npgs); } else { an_idx++; } } /* * Since we cache the entire DISM segment, we want to * set ppp to point to the first slot that corresponds * to the requested addr, i.e. pg_idx. */ mutex_exit(&sptd->spt_lock); *ppp = &(sptd->spt_ppa[pg_idx]); return (0); } if (seg_pinsert_check(seg, NULL, seg->s_base, sptd->spt_amp->size, SEGP_FORCE_WIRED) == SEGP_FAIL) { mutex_exit(&sptd->spt_lock); *ppp = NULL; return (ENOTSUP); } /* * No need to worry about protections because DISM pages are always rw. */ pl = pplist = NULL; amp = sptd->spt_amp; /* * Do we need to build the ppa array? 
*/ if (sptd->spt_ppa == NULL) { pgcnt_t lpg_cnt = 0; pl_built = 1; tot_npages = btopr(sptd->spt_amp->size); ASSERT(sptd->spt_pcachecnt == 0); pplist = kmem_zalloc(sizeof (page_t *) * tot_npages, KM_SLEEP); pl = pplist; ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); for (an_idx = 0; an_idx < tot_npages; ) { ap = anon_get_ptr(amp->ahp, an_idx); /* * Cache only mlocked pages. For large pages * if one (constituent) page is mlocked * all pages for that large page * are cached also. This is for quick * lookups of ppa array; */ if ((ap != NULL) && (lpg_cnt != 0 || (sptd->spt_ppa_lckcnt[an_idx] != 0))) { swap_xlate(ap, &vp, &off); pp = page_lookup(vp, off, SE_SHARED); ASSERT(pp != NULL); if (lpg_cnt == 0) { lpg_cnt++; /* * For a small page, we are done -- * lpg_count is reset to 0 below. * * For a large page, we are guaranteed * to find the anon structures of all * constituent pages and a non-zero * lpg_cnt ensures that we don't test * for mlock for these. We are done * when lpg_count reaches (npgs + 1). * If we are not the first constituent * page, restart at the first one. */ npgs = page_get_pagecnt(pp->p_szc); if (!IS_P2ALIGNED(an_idx, npgs)) { an_idx = P2ALIGN(an_idx, npgs); page_unlock(pp); continue; } } if (++lpg_cnt > npgs) lpg_cnt = 0; /* * availrmem is decremented only * for unlocked pages */ if (sptd->spt_ppa_lckcnt[an_idx] == 0) claim_availrmem++; pplist[an_idx] = pp; } an_idx++; } ANON_LOCK_EXIT(&->a_rwlock); if (claim_availrmem) { mutex_enter(&freemem_lock); if (availrmem < tune.t_minarmem + claim_availrmem) { mutex_exit(&freemem_lock); ret = ENOTSUP; claim_availrmem = 0; goto insert_fail; } else { availrmem -= claim_availrmem; } mutex_exit(&freemem_lock); } sptd->spt_ppa = pl; } else { /* * We already have a valid ppa[]. */ pl = sptd->spt_ppa; } ASSERT(pl != NULL); ret = seg_pinsert(seg, NULL, seg->s_base, sptd->spt_amp->size, sptd->spt_amp->size, pl, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim); if (ret == SEGP_FAIL) { /* * seg_pinsert failed. 
We return * ENOTSUP, so that the as_pagelock() code will * then try the slower F_SOFTLOCK path. */ if (pl_built) { /* * No one else has referenced the ppa[]. * We created it and we need to destroy it. */ sptd->spt_ppa = NULL; } ret = ENOTSUP; goto insert_fail; } /* * In either case, we increment softlockcnt on the 'real' segment. */ sptd->spt_pcachecnt++; atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt))); ppa = sptd->spt_ppa; for (an_idx = pg_idx; an_idx < pg_idx + npages; ) { if (ppa[an_idx] == NULL) { mutex_exit(&sptd->spt_lock); seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size, pl, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim); *ppp = NULL; return (ENOTSUP); } if ((szc = ppa[an_idx]->p_szc) != 0) { npgs = page_get_pagecnt(szc); an_idx = P2ROUNDUP(an_idx + 1, npgs); } else { an_idx++; } } /* * We can now drop the sptd->spt_lock since the ppa[] * exists and we have incremented pacachecnt. */ mutex_exit(&sptd->spt_lock); /* * Since we cache the entire segment, we want to * set ppp to point to the first slot that corresponds * to the requested addr, i.e. pg_idx. */ *ppp = &(sptd->spt_ppa[pg_idx]); return (0); insert_fail: /* * We will only reach this code if we tried and failed. * * And we can drop the lock on the dummy seg, once we've failed * to set up a new ppa[]. */ mutex_exit(&sptd->spt_lock); if (pl_built) { if (claim_availrmem) { mutex_enter(&freemem_lock); availrmem += claim_availrmem; mutex_exit(&freemem_lock); } /* * We created pl and we need to destroy it. */ pplist = pl; for (an_idx = 0; an_idx < tot_npages; an_idx++) { if (pplist[an_idx] != NULL) page_unlock(pplist[an_idx]); } kmem_free(pl, sizeof (page_t *) * tot_npages); } if (shmd->shm_softlockcnt <= 0) { if (AS_ISUNMAPWAIT(seg->s_as)) { mutex_enter(&seg->s_as->a_contents); if (AS_ISUNMAPWAIT(seg->s_as)) { AS_CLRUNMAPWAIT(seg->s_as); cv_broadcast(&seg->s_as->a_cv); } mutex_exit(&seg->s_as->a_contents); } } *ppp = NULL; return (ret); } /* * return locked pages over a given range. 
* * We will cache the entire ISM segment and save the pplist for the * entire segment in the ppa field of the underlying ISM segment structure. * Later, during a call to segspt_reclaim() we will use this ppa array * to page_unlock() all of the pages and then we will free this ppa list. */ /*ARGSUSED*/ static int segspt_shmpagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***ppp, enum lock_type type, enum seg_rw rw) { struct shm_data *shmd = (struct shm_data *)seg->s_data; struct seg *sptseg = shmd->shm_sptseg; struct spt_data *sptd = sptseg->s_data; pgcnt_t np, page_index, npages; caddr_t a, spt_base; struct page **pplist, **pl, *pp; struct anon_map *amp; ulong_t anon_index; int ret = ENOTSUP; uint_t pl_built = 0; struct anon *ap; struct vnode *vp; u_offset_t off; ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK); /* * We want to lock/unlock the entire ISM segment. Therefore, * we will be using the underlying sptseg and it's base address * and length for the caching arguments. */ ASSERT(sptseg); ASSERT(sptd); if (sptd->spt_flags & SHM_PAGEABLE) { return (segspt_dismpagelock(seg, addr, len, ppp, type, rw)); } page_index = seg_page(seg, addr); npages = btopr(len); /* * check if the request is larger than number of pages covered * by amp */ if (page_index + npages > btopr(sptd->spt_amp->size)) { *ppp = NULL; return (ENOTSUP); } if (type == L_PAGEUNLOCK) { ASSERT(sptd->spt_ppa != NULL); seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size, sptd->spt_ppa, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim); /* * If someone is blocked while unmapping, we purge * segment page cache and thus reclaim pplist synchronously * without waiting for seg_pasync_thread. This speeds up * unmapping in cases where munmap(2) is called, while * raw async i/o is still in progress or where a thread * exits on data fault in a multithreaded application. 
*/ if (AS_ISUNMAPWAIT(seg->s_as) && (shmd->shm_softlockcnt > 0)) { segspt_purge(seg); } return (0); } /* The L_PAGELOCK case... */ /* * First try to find pages in segment page cache, without * holding the segment lock. */ pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size, S_WRITE, SEGP_FORCE_WIRED); if (pplist != NULL) { ASSERT(sptd->spt_ppa == pplist); ASSERT(sptd->spt_ppa[page_index]); /* * Since we cache the entire ISM segment, we want to * set ppp to point to the first slot that corresponds * to the requested addr, i.e. page_index. */ *ppp = &(sptd->spt_ppa[page_index]); return (0); } mutex_enter(&sptd->spt_lock); /* * try to find pages in segment page cache */ pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size, S_WRITE, SEGP_FORCE_WIRED); if (pplist != NULL) { ASSERT(sptd->spt_ppa == pplist); /* * Since we cache the entire segment, we want to * set ppp to point to the first slot that corresponds * to the requested addr, i.e. page_index. */ mutex_exit(&sptd->spt_lock); *ppp = &(sptd->spt_ppa[page_index]); return (0); } if (seg_pinsert_check(seg, NULL, seg->s_base, sptd->spt_amp->size, SEGP_FORCE_WIRED) == SEGP_FAIL) { mutex_exit(&sptd->spt_lock); *ppp = NULL; return (ENOTSUP); } /* * No need to worry about protections because ISM pages * are always rw. */ pl = pplist = NULL; /* * Do we need to build the ppa array? */ if (sptd->spt_ppa == NULL) { ASSERT(sptd->spt_ppa == pplist); spt_base = sptseg->s_base; pl_built = 1; /* * availrmem is decremented once during anon_swap_adjust() * and is incremented during the anon_unresv(), which is * called from shm_rm_amp() when the segment is destroyed. 
*/ amp = sptd->spt_amp; ASSERT(amp != NULL); /* pcachecnt is protected by sptd->spt_lock */ ASSERT(sptd->spt_pcachecnt == 0); pplist = kmem_zalloc(sizeof (page_t *) * btopr(sptd->spt_amp->size), KM_SLEEP); pl = pplist; anon_index = seg_page(sptseg, spt_base); ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); for (a = spt_base; a < (spt_base + sptd->spt_amp->size); a += PAGESIZE, anon_index++, pplist++) { ap = anon_get_ptr(amp->ahp, anon_index); ASSERT(ap != NULL); swap_xlate(ap, &vp, &off); pp = page_lookup(vp, off, SE_SHARED); ASSERT(pp != NULL); *pplist = pp; } ANON_LOCK_EXIT(&->a_rwlock); if (a < (spt_base + sptd->spt_amp->size)) { ret = ENOTSUP; goto insert_fail; } sptd->spt_ppa = pl; } else { /* * We already have a valid ppa[]. */ pl = sptd->spt_ppa; } ASSERT(pl != NULL); ret = seg_pinsert(seg, NULL, seg->s_base, sptd->spt_amp->size, sptd->spt_amp->size, pl, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim); if (ret == SEGP_FAIL) { /* * seg_pinsert failed. We return * ENOTSUP, so that the as_pagelock() code will * then try the slower F_SOFTLOCK path. */ if (pl_built) { /* * No one else has referenced the ppa[]. * We created it and we need to destroy it. */ sptd->spt_ppa = NULL; } ret = ENOTSUP; goto insert_fail; } /* * In either case, we increment softlockcnt on the 'real' segment. */ sptd->spt_pcachecnt++; atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt))); /* * We can now drop the sptd->spt_lock since the ppa[] * exists and we have incremented pacachecnt. */ mutex_exit(&sptd->spt_lock); /* * Since we cache the entire segment, we want to * set ppp to point to the first slot that corresponds * to the requested addr, i.e. page_index. */ *ppp = &(sptd->spt_ppa[page_index]); return (0); insert_fail: /* * We will only reach this code if we tried and failed. * * And we can drop the lock on the dummy seg, once we've failed * to set up a new ppa[]. */ mutex_exit(&sptd->spt_lock); if (pl_built) { /* * We created pl and we need to destroy it. 
*/ pplist = pl; np = (((uintptr_t)(a - spt_base)) >> PAGESHIFT); while (np) { page_unlock(*pplist); np--; pplist++; } kmem_free(pl, sizeof (page_t *) * btopr(sptd->spt_amp->size)); } if (shmd->shm_softlockcnt <= 0) { if (AS_ISUNMAPWAIT(seg->s_as)) { mutex_enter(&seg->s_as->a_contents); if (AS_ISUNMAPWAIT(seg->s_as)) { AS_CLRUNMAPWAIT(seg->s_as); cv_broadcast(&seg->s_as->a_cv); } mutex_exit(&seg->s_as->a_contents); } } *ppp = NULL; return (ret); } /* * purge any cached pages in the I/O page cache */ static void segspt_purge(struct seg *seg) { seg_ppurge(seg, NULL, SEGP_FORCE_WIRED); } static int segspt_reclaim(void *ptag, caddr_t addr, size_t len, struct page **pplist, enum seg_rw rw, int async) { struct seg *seg = (struct seg *)ptag; struct shm_data *shmd = (struct shm_data *)seg->s_data; struct seg *sptseg; struct spt_data *sptd; pgcnt_t npages, i, free_availrmem = 0; int done = 0; #ifdef lint addr = addr; #endif sptseg = shmd->shm_sptseg; sptd = sptseg->s_data; npages = (len >> PAGESHIFT); ASSERT(npages); ASSERT(sptd->spt_pcachecnt != 0); ASSERT(sptd->spt_ppa == pplist); ASSERT(npages == btopr(sptd->spt_amp->size)); ASSERT(async || AS_LOCK_HELD(seg->s_as)); /* * Acquire the lock on the dummy seg and destroy the * ppa array IF this is the last pcachecnt. */ mutex_enter(&sptd->spt_lock); if (--sptd->spt_pcachecnt == 0) { for (i = 0; i < npages; i++) { if (pplist[i] == NULL) { continue; } if (rw == S_WRITE) { hat_setrefmod(pplist[i]); } else { hat_setref(pplist[i]); } if ((sptd->spt_flags & SHM_PAGEABLE) && (sptd->spt_ppa_lckcnt[i] == 0)) free_availrmem++; page_unlock(pplist[i]); } if ((sptd->spt_flags & SHM_PAGEABLE) && free_availrmem) { mutex_enter(&freemem_lock); availrmem += free_availrmem; mutex_exit(&freemem_lock); } /* * Since we want to cach/uncache the entire ISM segment, * we will track the pplist in a segspt specific field * ppa, that is initialized at the time we add an entry to * the cache. 
*/ ASSERT(sptd->spt_pcachecnt == 0); kmem_free(pplist, sizeof (page_t *) * npages); sptd->spt_ppa = NULL; sptd->spt_flags &= ~DISM_PPA_CHANGED; sptd->spt_gen++; cv_broadcast(&sptd->spt_cv); done = 1; } mutex_exit(&sptd->spt_lock); /* * If we are pcache async thread or called via seg_ppurge_wiredpp() we * may not hold AS lock (in this case async argument is not 0). This * means if softlockcnt drops to 0 after the decrement below address * space may get freed. We can't allow it since after softlock * derement to 0 we still need to access as structure for possible * wakeup of unmap waiters. To prevent the disappearance of as we take * this segment's shm_segfree_syncmtx. segspt_shmfree() also takes * this mutex as a barrier to make sure this routine completes before * segment is freed. * * The second complication we have to deal with in async case is a * possibility of missed wake up of unmap wait thread. When we don't * hold as lock here we may take a_contents lock before unmap wait * thread that was first to see softlockcnt was still not 0. As a * result we'll fail to wake up an unmap wait thread. To avoid this * race we set nounmapwait flag in as structure if we drop softlockcnt * to 0 if async is not 0. unmapwait thread * will not block if this flag is set. */ if (async) mutex_enter(&shmd->shm_segfree_syncmtx); /* * Now decrement softlockcnt. */ ASSERT(shmd->shm_softlockcnt > 0); atomic_dec_ulong((ulong_t *)(&(shmd->shm_softlockcnt))); if (shmd->shm_softlockcnt <= 0) { if (async || AS_ISUNMAPWAIT(seg->s_as)) { mutex_enter(&seg->s_as->a_contents); if (async) AS_SETNOUNMAPWAIT(seg->s_as); if (AS_ISUNMAPWAIT(seg->s_as)) { AS_CLRUNMAPWAIT(seg->s_as); cv_broadcast(&seg->s_as->a_cv); } mutex_exit(&seg->s_as->a_contents); } } if (async) mutex_exit(&shmd->shm_segfree_syncmtx); return (done); } /* * Do a F_SOFTUNLOCK call over the range requested. * The range must have already been F_SOFTLOCK'ed. 
* * The calls to acquire and release the anon map lock mutex were * removed in order to avoid a deadly embrace during a DR * memory delete operation. (Eg. DR blocks while waiting for a * exclusive lock on a page that is being used for kaio; the * thread that will complete the kaio and call segspt_softunlock * blocks on the anon map lock; another thread holding the anon * map lock blocks on another page lock via the segspt_shmfault * -> page_lookup -> page_lookup_create -> page_lock_es code flow.) * * The appropriateness of the removal is based upon the following: * 1. If we are holding a segment's reader lock and the page is held * shared, then the corresponding element in anonmap which points to * anon struct cannot change and there is no need to acquire the * anonymous map lock. * 2. Threads in segspt_softunlock have a reader lock on the segment * and already have the shared page lock, so we are guaranteed that * the anon map slot cannot change and therefore can call anon_get_ptr() * without grabbing the anonymous map lock. * 3. Threads that softlock a shared page break copy-on-write, even if * its a read. Thus cow faults can be ignored with respect to soft * unlocking, since the breaking of cow means that the anon slot(s) will * not be shared. */ static void segspt_softunlock(struct seg *seg, caddr_t sptseg_addr, size_t len, enum seg_rw rw) { struct shm_data *shmd = (struct shm_data *)seg->s_data; struct seg *sptseg; struct spt_data *sptd; page_t *pp; caddr_t adr; struct vnode *vp; u_offset_t offset; ulong_t anon_index; struct anon_map *amp; /* XXX - for locknest */ struct anon *ap = NULL; pgcnt_t npages; ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); sptseg = shmd->shm_sptseg; sptd = sptseg->s_data; /* * Some platforms assume that ISM mappings are HAT_LOAD_LOCK * and therefore their pages are SE_SHARED locked * for the entire life of the segment. 
*/ if ((!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) && ((sptd->spt_flags & SHM_PAGEABLE) == 0)) { goto softlock_decrement; } /* * Any thread is free to do a page_find and * page_unlock() on the pages within this seg. * * We are already holding the as->a_lock on the user's * real segment, but we need to hold the a_lock on the * underlying dummy as. This is mostly to satisfy the * underlying HAT layer. */ AS_LOCK_ENTER(sptseg->s_as, RW_READER); hat_unlock(sptseg->s_as->a_hat, sptseg_addr, len); AS_LOCK_EXIT(sptseg->s_as); amp = sptd->spt_amp; ASSERT(amp != NULL); anon_index = seg_page(sptseg, sptseg_addr); for (adr = sptseg_addr; adr < sptseg_addr + len; adr += PAGESIZE) { ap = anon_get_ptr(amp->ahp, anon_index++); ASSERT(ap != NULL); swap_xlate(ap, &vp, &offset); /* * Use page_find() instead of page_lookup() to * find the page since we know that it has a * "shared" lock. */ pp = page_find(vp, offset); ASSERT(ap == anon_get_ptr(amp->ahp, anon_index - 1)); if (pp == NULL) { panic("segspt_softunlock: " "addr %p, ap %p, vp %p, off %llx", (void *)adr, (void *)ap, (void *)vp, offset); /*NOTREACHED*/ } if (rw == S_WRITE) { hat_setrefmod(pp); } else if (rw != S_OTHER) { hat_setref(pp); } page_unlock(pp); } softlock_decrement: npages = btopr(len); ASSERT(shmd->shm_softlockcnt >= npages); atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), -npages); if (shmd->shm_softlockcnt == 0) { /* * All SOFTLOCKS are gone. Wakeup any waiting * unmappers so they can try again to unmap. * Check for waiters first without the mutex * held so we don't always grab the mutex on * softunlocks. 
*/ if (AS_ISUNMAPWAIT(seg->s_as)) { mutex_enter(&seg->s_as->a_contents); if (AS_ISUNMAPWAIT(seg->s_as)) { AS_CLRUNMAPWAIT(seg->s_as); cv_broadcast(&seg->s_as->a_cv); } mutex_exit(&seg->s_as->a_contents); } } } int segspt_shmattach(struct seg *seg, caddr_t *argsp) { struct shm_data *shmd_arg = (struct shm_data *)argsp; struct shm_data *shmd; struct anon_map *shm_amp = shmd_arg->shm_amp; struct spt_data *sptd; int error = 0; ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as)); shmd = kmem_zalloc((sizeof (*shmd)), KM_NOSLEEP); if (shmd == NULL) return (ENOMEM); shmd->shm_sptas = shmd_arg->shm_sptas; shmd->shm_amp = shm_amp; shmd->shm_sptseg = shmd_arg->shm_sptseg; (void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, shm_amp, 0, NULL, 0, seg->s_size); mutex_init(&shmd->shm_segfree_syncmtx, NULL, MUTEX_DEFAULT, NULL); seg->s_data = (void *)shmd; seg->s_ops = &segspt_shmops; seg->s_szc = shmd->shm_sptseg->s_szc; sptd = shmd->shm_sptseg->s_data; if (sptd->spt_flags & SHM_PAGEABLE) { if ((shmd->shm_vpage = kmem_zalloc(btopr(shm_amp->size), KM_NOSLEEP)) == NULL) { seg->s_data = (void *)NULL; kmem_free(shmd, (sizeof (*shmd))); return (ENOMEM); } shmd->shm_lckpgs = 0; if (hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) { if ((error = hat_share(seg->s_as->a_hat, seg->s_base, shmd_arg->shm_sptas->a_hat, SEGSPTADDR, seg->s_size, seg->s_szc)) != 0) { kmem_free(shmd->shm_vpage, btopr(shm_amp->size)); } } } else { error = hat_share(seg->s_as->a_hat, seg->s_base, shmd_arg->shm_sptas->a_hat, SEGSPTADDR, seg->s_size, seg->s_szc); } if (error) { seg->s_szc = 0; seg->s_data = (void *)NULL; kmem_free(shmd, (sizeof (*shmd))); } else { ANON_LOCK_ENTER(&shm_amp->a_rwlock, RW_WRITER); shm_amp->refcnt++; ANON_LOCK_EXIT(&shm_amp->a_rwlock); } return (error); } int segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize) { struct shm_data *shmd = (struct shm_data *)seg->s_data; int reclaim = 1; ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as)); retry: if (shmd->shm_softlockcnt > 0) { if (reclaim == 
1) { segspt_purge(seg); reclaim = 0; goto retry; } return (EAGAIN); } if (ssize != seg->s_size) { #ifdef DEBUG cmn_err(CE_WARN, "Incompatible ssize %lx s_size %lx\n", ssize, seg->s_size); #endif return (EINVAL); } (void) segspt_shmlockop(seg, raddr, shmd->shm_amp->size, 0, MC_UNLOCK, NULL, 0); hat_unshare(seg->s_as->a_hat, raddr, ssize, seg->s_szc); seg_free(seg); return (0); } void segspt_shmfree(struct seg *seg) { struct shm_data *shmd = (struct shm_data *)seg->s_data; struct anon_map *shm_amp = shmd->shm_amp; ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as)); (void) segspt_shmlockop(seg, seg->s_base, shm_amp->size, 0, MC_UNLOCK, NULL, 0); /* * Need to increment refcnt when attaching * and decrement when detaching because of dup(). */ ANON_LOCK_ENTER(&shm_amp->a_rwlock, RW_WRITER); shm_amp->refcnt--; ANON_LOCK_EXIT(&shm_amp->a_rwlock); if (shmd->shm_vpage) { /* only for DISM */ kmem_free(shmd->shm_vpage, btopr(shm_amp->size)); shmd->shm_vpage = NULL; } /* * Take shm_segfree_syncmtx lock to let segspt_reclaim() finish if it's * still working with this segment without holding as lock. */ ASSERT(shmd->shm_softlockcnt == 0); mutex_enter(&shmd->shm_segfree_syncmtx); mutex_destroy(&shmd->shm_segfree_syncmtx); kmem_free(shmd, sizeof (*shmd)); } /*ARGSUSED*/ int segspt_shmsetprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) { ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); /* * Shared page table is more than shared mapping. * Individual process sharing page tables can't change prot * because there is only one set of page tables. * This will be allowed after private page table is * supported. */ /* need to return correct status error? 
*/ return (0); } faultcode_t segspt_dismfault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len, enum fault_type type, enum seg_rw rw) { struct shm_data *shmd = (struct shm_data *)seg->s_data; struct seg *sptseg = shmd->shm_sptseg; struct as *curspt = shmd->shm_sptas; struct spt_data *sptd = sptseg->s_data; pgcnt_t npages; size_t size; caddr_t segspt_addr, shm_addr; page_t **ppa; int i; ulong_t an_idx = 0; int err = 0; int dyn_ism_unmap = hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0); size_t pgsz; pgcnt_t pgcnt; caddr_t a; pgcnt_t pidx; #ifdef lint hat = hat; #endif ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); /* * Because of the way spt is implemented * the realsize of the segment does not have to be * equal to the segment size itself. The segment size is * often in multiples of a page size larger than PAGESIZE. * The realsize is rounded up to the nearest PAGESIZE * based on what the user requested. This is a bit of * ungliness that is historical but not easily fixed * without re-designing the higher levels of ISM. */ ASSERT(addr >= seg->s_base); if (((addr + len) - seg->s_base) > sptd->spt_realsize) return (FC_NOMAP); /* * For all of the following cases except F_PROT, we need to * make any necessary adjustments to addr and len * and get all of the necessary page_t's into an array called ppa[]. * * The code in shmat() forces base addr and len of ISM segment * to be aligned to largest page size supported. Therefore, * we are able to handle F_SOFTLOCK and F_INVAL calls in "large * pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK * in large pagesize chunks, or else we will screw up the HAT * layer by calling hat_memload_array() with differing page sizes * over a given virtual range. 
*/ pgsz = page_get_pagesize(sptseg->s_szc); pgcnt = page_get_pagecnt(sptseg->s_szc); shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz); size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), pgsz); npages = btopr(size); /* * Now we need to convert from addr in segshm to addr in segspt. */ an_idx = seg_page(seg, shm_addr); segspt_addr = sptseg->s_base + ptob(an_idx); ASSERT((segspt_addr + ptob(npages)) <= (sptseg->s_base + sptd->spt_realsize)); ASSERT(segspt_addr < (sptseg->s_base + sptseg->s_size)); switch (type) { case F_SOFTLOCK: atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages); /* * Fall through to the F_INVAL case to load up the hat layer * entries with the HAT_LOAD_LOCK flag. */ /* FALLTHRU */ case F_INVAL: if ((rw == S_EXEC) && !(sptd->spt_prot & PROT_EXEC)) return (FC_NOMAP); ppa = kmem_zalloc(npages * sizeof (page_t *), KM_SLEEP); err = spt_anon_getpages(sptseg, segspt_addr, size, ppa); if (err != 0) { if (type == F_SOFTLOCK) { atomic_add_long((ulong_t *)( &(shmd->shm_softlockcnt)), -npages); } goto dism_err; } AS_LOCK_ENTER(sptseg->s_as, RW_READER); a = segspt_addr; pidx = 0; if (type == F_SOFTLOCK) { /* * Load up the translation keeping it * locked and don't unlock the page. */ for (; pidx < npages; a += pgsz, pidx += pgcnt) { hat_memload_array(sptseg->s_as->a_hat, a, pgsz, &ppa[pidx], sptd->spt_prot, HAT_LOAD_LOCK | HAT_LOAD_SHARE); } } else { /* * Migrate pages marked for migration */ if (lgrp_optimizations()) page_migrate(seg, shm_addr, ppa, npages); for (; pidx < npages; a += pgsz, pidx += pgcnt) { hat_memload_array(sptseg->s_as->a_hat, a, pgsz, &ppa[pidx], sptd->spt_prot, HAT_LOAD_SHARE); } /* * And now drop the SE_SHARED lock(s). 
*/ if (dyn_ism_unmap) { for (i = 0; i < npages; i++) { page_unlock(ppa[i]); } } } if (!dyn_ism_unmap) { if (hat_share(seg->s_as->a_hat, shm_addr, curspt->a_hat, segspt_addr, ptob(npages), seg->s_szc) != 0) { panic("hat_share err in DISM fault"); /* NOTREACHED */ } if (type == F_INVAL) { for (i = 0; i < npages; i++) { page_unlock(ppa[i]); } } } AS_LOCK_EXIT(sptseg->s_as); dism_err: kmem_free(ppa, npages * sizeof (page_t *)); return (err); case F_SOFTUNLOCK: /* * This is a bit ugly, we pass in the real seg pointer, * but the segspt_addr is the virtual address within the * dummy seg. */ segspt_softunlock(seg, segspt_addr, size, rw); return (0); case F_PROT: /* * This takes care of the unusual case where a user * allocates a stack in shared memory and a register * window overflow is written to that stack page before * it is otherwise modified. * * We can get away with this because ISM segments are * always rw. Other than this unusual case, there * should be no instances of protection violations. 
*/ return (0); default: #ifdef DEBUG panic("segspt_dismfault default type?"); #else return (FC_NOMAP); #endif } } faultcode_t segspt_shmfault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len, enum fault_type type, enum seg_rw rw) { struct shm_data *shmd = (struct shm_data *)seg->s_data; struct seg *sptseg = shmd->shm_sptseg; struct as *curspt = shmd->shm_sptas; struct spt_data *sptd = sptseg->s_data; pgcnt_t npages; size_t size; caddr_t sptseg_addr, shm_addr; page_t *pp, **ppa; int i; u_offset_t offset; ulong_t anon_index = 0; struct vnode *vp; struct anon_map *amp; /* XXX - for locknest */ struct anon *ap = NULL; size_t pgsz; pgcnt_t pgcnt; caddr_t a; pgcnt_t pidx; size_t sz; #ifdef lint hat = hat; #endif ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); if (sptd->spt_flags & SHM_PAGEABLE) { return (segspt_dismfault(hat, seg, addr, len, type, rw)); } /* * Because of the way spt is implemented * the realsize of the segment does not have to be * equal to the segment size itself. The segment size is * often in multiples of a page size larger than PAGESIZE. * The realsize is rounded up to the nearest PAGESIZE * based on what the user requested. This is a bit of * ungliness that is historical but not easily fixed * without re-designing the higher levels of ISM. */ ASSERT(addr >= seg->s_base); if (((addr + len) - seg->s_base) > sptd->spt_realsize) return (FC_NOMAP); /* * For all of the following cases except F_PROT, we need to * make any necessary adjustments to addr and len * and get all of the necessary page_t's into an array called ppa[]. * * The code in shmat() forces base addr and len of ISM segment * to be aligned to largest page size supported. Therefore, * we are able to handle F_SOFTLOCK and F_INVAL calls in "large * pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK * in large pagesize chunks, or else we will screw up the HAT * layer by calling hat_memload_array() with differing page sizes * over a given virtual range. 
*/ pgsz = page_get_pagesize(sptseg->s_szc); pgcnt = page_get_pagecnt(sptseg->s_szc); shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz); size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), pgsz); npages = btopr(size); /* * Now we need to convert from addr in segshm to addr in segspt. */ anon_index = seg_page(seg, shm_addr); sptseg_addr = sptseg->s_base + ptob(anon_index); /* * And now we may have to adjust npages downward if we have * exceeded the realsize of the segment or initial anon * allocations. */ if ((sptseg_addr + ptob(npages)) > (sptseg->s_base + sptd->spt_realsize)) size = (sptseg->s_base + sptd->spt_realsize) - sptseg_addr; npages = btopr(size); ASSERT(sptseg_addr < (sptseg->s_base + sptseg->s_size)); ASSERT((sptd->spt_flags & SHM_PAGEABLE) == 0); switch (type) { case F_SOFTLOCK: /* * availrmem is decremented once during anon_swap_adjust() * and is incremented during the anon_unresv(), which is * called from shm_rm_amp() when the segment is destroyed. */ atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages); /* * Some platforms assume that ISM pages are SE_SHARED * locked for the entire life of the segment. */ if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) return (0); /* * Fall through to the F_INVAL case to load up the hat layer * entries with the HAT_LOAD_LOCK flag. */ /* FALLTHRU */ case F_INVAL: if ((rw == S_EXEC) && !(sptd->spt_prot & PROT_EXEC)) return (FC_NOMAP); /* * Some platforms that do NOT support DYNAMIC_ISM_UNMAP * may still rely on this call to hat_share(). That * would imply that those hat's can fault on a * HAT_LOAD_LOCK translation, which would seem * contradictory. 
*/ if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) { if (hat_share(seg->s_as->a_hat, seg->s_base, curspt->a_hat, sptseg->s_base, sptseg->s_size, sptseg->s_szc) != 0) { panic("hat_share error in ISM fault"); /*NOTREACHED*/ } return (0); } ppa = kmem_zalloc(sizeof (page_t *) * npages, KM_SLEEP); /* * I see no need to lock the real seg, * here, because all of our work will be on the underlying * dummy seg. * * sptseg_addr and npages now account for large pages. */ amp = sptd->spt_amp; ASSERT(amp != NULL); anon_index = seg_page(sptseg, sptseg_addr); ANON_LOCK_ENTER(&->a_rwlock, RW_READER); for (i = 0; i < npages; i++) { ap = anon_get_ptr(amp->ahp, anon_index++); ASSERT(ap != NULL); swap_xlate(ap, &vp, &offset); pp = page_lookup(vp, offset, SE_SHARED); ASSERT(pp != NULL); ppa[i] = pp; } ANON_LOCK_EXIT(&->a_rwlock); ASSERT(i == npages); /* * We are already holding the as->a_lock on the user's * real segment, but we need to hold the a_lock on the * underlying dummy as. This is mostly to satisfy the * underlying HAT layer. */ AS_LOCK_ENTER(sptseg->s_as, RW_READER); a = sptseg_addr; pidx = 0; if (type == F_SOFTLOCK) { /* * Load up the translation keeping it * locked and don't unlock the page. */ for (; pidx < npages; a += pgsz, pidx += pgcnt) { sz = MIN(pgsz, ptob(npages - pidx)); hat_memload_array(sptseg->s_as->a_hat, a, sz, &ppa[pidx], sptd->spt_prot, HAT_LOAD_LOCK | HAT_LOAD_SHARE); } } else { /* * Migrate pages marked for migration. */ if (lgrp_optimizations()) page_migrate(seg, shm_addr, ppa, npages); for (; pidx < npages; a += pgsz, pidx += pgcnt) { sz = MIN(pgsz, ptob(npages - pidx)); hat_memload_array(sptseg->s_as->a_hat, a, sz, &ppa[pidx], sptd->spt_prot, HAT_LOAD_SHARE); } /* * And now drop the SE_SHARED lock(s). 
*/ for (i = 0; i < npages; i++) page_unlock(ppa[i]); } AS_LOCK_EXIT(sptseg->s_as); kmem_free(ppa, sizeof (page_t *) * npages); return (0); case F_SOFTUNLOCK: /* * This is a bit ugly, we pass in the real seg pointer, * but the sptseg_addr is the virtual address within the * dummy seg. */ segspt_softunlock(seg, sptseg_addr, ptob(npages), rw); return (0); case F_PROT: /* * This takes care of the unusual case where a user * allocates a stack in shared memory and a register * window overflow is written to that stack page before * it is otherwise modified. * * We can get away with this because ISM segments are * always rw. Other than this unusual case, there * should be no instances of protection violations. */ return (0); default: #ifdef DEBUG cmn_err(CE_WARN, "segspt_shmfault default type?"); #endif return (FC_NOMAP); } } /*ARGSUSED*/ static faultcode_t segspt_shmfaulta(struct seg *seg, caddr_t addr) { return (0); } /*ARGSUSED*/ static int segspt_shmkluster(struct seg *seg, caddr_t addr, ssize_t delta) { return (0); } /*ARGSUSED*/ static size_t segspt_shmswapout(struct seg *seg) { return (0); } /* * duplicate the shared page tables */ int segspt_shmdup(struct seg *seg, struct seg *newseg) { struct shm_data *shmd = (struct shm_data *)seg->s_data; struct anon_map *amp = shmd->shm_amp; struct shm_data *shmd_new; struct seg *spt_seg = shmd->shm_sptseg; struct spt_data *sptd = spt_seg->s_data; int error = 0; ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as)); shmd_new = kmem_zalloc((sizeof (*shmd_new)), KM_SLEEP); newseg->s_data = (void *)shmd_new; shmd_new->shm_sptas = shmd->shm_sptas; shmd_new->shm_amp = amp; shmd_new->shm_sptseg = shmd->shm_sptseg; newseg->s_ops = &segspt_shmops; newseg->s_szc = seg->s_szc; ASSERT(seg->s_szc == shmd->shm_sptseg->s_szc); ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); amp->refcnt++; ANON_LOCK_EXIT(&->a_rwlock); if (sptd->spt_flags & SHM_PAGEABLE) { shmd_new->shm_vpage = kmem_zalloc(btopr(amp->size), KM_SLEEP); shmd_new->shm_lckpgs = 0; if 
(hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) { if ((error = hat_share(newseg->s_as->a_hat, newseg->s_base, shmd->shm_sptas->a_hat, SEGSPTADDR, seg->s_size, seg->s_szc)) != 0) { kmem_free(shmd_new->shm_vpage, btopr(amp->size)); } } return (error); } else { return (hat_share(newseg->s_as->a_hat, newseg->s_base, shmd->shm_sptas->a_hat, SEGSPTADDR, seg->s_size, seg->s_szc)); } } /*ARGSUSED*/ int segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size, uint_t prot) { struct shm_data *shmd = (struct shm_data *)seg->s_data; struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data; ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); /* * ISM segment is always rw. */ return (((sptd->spt_prot & prot) != prot) ? EACCES : 0); } /* * Return an array of locked large pages, for empty slots allocate * private zero-filled anon pages. */ static int spt_anon_getpages( struct seg *sptseg, caddr_t sptaddr, size_t len, page_t *ppa[]) { struct spt_data *sptd = sptseg->s_data; struct anon_map *amp = sptd->spt_amp; enum seg_rw rw = sptd->spt_prot; uint_t szc = sptseg->s_szc; size_t pg_sz, share_sz = page_get_pagesize(szc); pgcnt_t lp_npgs; caddr_t lp_addr, e_sptaddr; uint_t vpprot, ppa_szc = 0; struct vpage *vpage = NULL; ulong_t j, ppa_idx; int err, ierr = 0; pgcnt_t an_idx; anon_sync_obj_t cookie; int anon_locked = 0; pgcnt_t amp_pgs; ASSERT(IS_P2ALIGNED(sptaddr, share_sz) && IS_P2ALIGNED(len, share_sz)); ASSERT(len != 0); pg_sz = share_sz; lp_npgs = btop(pg_sz); lp_addr = sptaddr; e_sptaddr = sptaddr + len; an_idx = seg_page(sptseg, sptaddr); ppa_idx = 0; ANON_LOCK_ENTER(&->a_rwlock, RW_READER); amp_pgs = page_get_pagecnt(amp->a_szc); /*CONSTCOND*/ while (1) { for (; lp_addr < e_sptaddr; an_idx += lp_npgs, lp_addr += pg_sz, ppa_idx += lp_npgs) { /* * If we're currently locked, and we get to a new * page, unlock our current anon chunk. 
*/ if (anon_locked && P2PHASE(an_idx, amp_pgs) == 0) { anon_array_exit(&cookie); anon_locked = 0; } if (!anon_locked) { anon_array_enter(amp, an_idx, &cookie); anon_locked = 1; } ppa_szc = (uint_t)-1; ierr = anon_map_getpages(amp, an_idx, szc, sptseg, lp_addr, sptd->spt_prot, &vpprot, &ppa[ppa_idx], &ppa_szc, vpage, rw, 0, segvn_anypgsz, 0, kcred); if (ierr != 0) { if (ierr > 0) { err = FC_MAKE_ERR(ierr); goto lpgs_err; } break; } } if (lp_addr == e_sptaddr) { break; } ASSERT(lp_addr < e_sptaddr); /* * ierr == -1 means we failed to allocate a large page. * so do a size down operation. * * ierr == -2 means some other process that privately shares * pages with this process has allocated a larger page and we * need to retry with larger pages. So do a size up * operation. This relies on the fact that large pages are * never partially shared i.e. if we share any constituent * page of a large page with another process we must share the * entire large page. Note this cannot happen for SOFTLOCK * case, unless current address (lpaddr) is at the beginning * of the next page size boundary because the other process * couldn't have relocated locked pages. */ ASSERT(ierr == -1 || ierr == -2); if (segvn_anypgsz) { ASSERT(ierr == -2 || szc != 0); ASSERT(ierr == -1 || szc < sptseg->s_szc); szc = (ierr == -1) ? szc - 1 : szc + 1; } else { /* * For faults and segvn_anypgsz == 0 * we need to be careful not to loop forever * if existing page is found with szc other * than 0 or seg->s_szc. This could be due * to page relocations on behalf of DR or * more likely large page creation. For this * case simply re-size to existing page's szc * if returned by anon_map_getpages(). */ if (ppa_szc == (uint_t)-1) { szc = (ierr == -1) ? 
0 : sptseg->s_szc; } else { ASSERT(ppa_szc <= sptseg->s_szc); ASSERT(ierr == -2 || ppa_szc < szc); ASSERT(ierr == -1 || ppa_szc > szc); szc = ppa_szc; } } pg_sz = page_get_pagesize(szc); lp_npgs = btop(pg_sz); ASSERT(IS_P2ALIGNED(lp_addr, pg_sz)); } if (anon_locked) { anon_array_exit(&cookie); } ANON_LOCK_EXIT(&->a_rwlock); return (0); lpgs_err: if (anon_locked) { anon_array_exit(&cookie); } ANON_LOCK_EXIT(&->a_rwlock); for (j = 0; j < ppa_idx; j++) page_unlock(ppa[j]); return (err); } /* * count the number of bytes in a set of spt pages that are currently not * locked */ static rctl_qty_t spt_unlockedbytes(pgcnt_t npages, page_t **ppa) { ulong_t i; rctl_qty_t unlocked = 0; for (i = 0; i < npages; i++) { if (ppa[i]->p_lckcnt == 0) unlocked += PAGESIZE; } return (unlocked); } extern u_longlong_t randtick(void); /* number of locks to reserve/skip by spt_lockpages() and spt_unlockpages() */ #define NLCK (NCPU_P2) /* Random number with a range [0, n-1], n must be power of two */ #define RAND_P2(n) \ ((((long)curthread >> PTR24_LSB) ^ (long)randtick()) & ((n) - 1)) int spt_lockpages(struct seg *seg, pgcnt_t anon_index, pgcnt_t npages, page_t **ppa, ulong_t *lockmap, size_t pos, rctl_qty_t *locked) { struct shm_data *shmd = seg->s_data; struct spt_data *sptd = shmd->shm_sptseg->s_data; ulong_t i; int kernel; pgcnt_t nlck = 0; int rv = 0; int use_reserved = 1; /* return the number of bytes actually locked */ *locked = 0; /* * To avoid contention on freemem_lock, availrmem and pages_locked * global counters are updated only every nlck locked pages instead of * every time. Reserve nlck locks up front and deduct from this * reservation for each page that requires a lock. When the reservation * is consumed, reserve again. nlck is randomized, so the competing * threads do not fall into a cyclic lock contention pattern. When * memory is low, the lock ahead is disabled, and instead page_pp_lock() * is used to lock pages. 
*/ for (i = 0; i < npages; anon_index++, pos++, i++) { if (nlck == 0 && use_reserved == 1) { nlck = NLCK + RAND_P2(NLCK); /* if fewer loops left, decrease nlck */ nlck = MIN(nlck, npages - i); /* * Reserve nlck locks up front and deduct from this * reservation for each page that requires a lock. When * the reservation is consumed, reserve again. */ mutex_enter(&freemem_lock); if ((availrmem - nlck) < pages_pp_maximum) { /* Do not do advance memory reserves */ use_reserved = 0; } else { availrmem -= nlck; pages_locked += nlck; } mutex_exit(&freemem_lock); } if (!(shmd->shm_vpage[anon_index] & DISM_PG_LOCKED)) { if (sptd->spt_ppa_lckcnt[anon_index] < (ushort_t)DISM_LOCK_MAX) { if (++sptd->spt_ppa_lckcnt[anon_index] == (ushort_t)DISM_LOCK_MAX) { cmn_err(CE_WARN, "DISM page lock limit " "reached on DISM offset 0x%lx\n", anon_index << PAGESHIFT); } kernel = (sptd->spt_ppa && sptd->spt_ppa[anon_index]); if (!page_pp_lock(ppa[i], 0, kernel || use_reserved)) { sptd->spt_ppa_lckcnt[anon_index]--; rv = EAGAIN; break; } /* if this is a newly locked page, count it */ if (ppa[i]->p_lckcnt == 1) { if (kernel == 0 && use_reserved == 1) nlck--; *locked += PAGESIZE; } shmd->shm_lckpgs++; shmd->shm_vpage[anon_index] |= DISM_PG_LOCKED; if (lockmap != NULL) BT_SET(lockmap, pos); } } } /* Return unused lock reservation */ if (nlck != 0 && use_reserved == 1) { mutex_enter(&freemem_lock); availrmem += nlck; pages_locked -= nlck; mutex_exit(&freemem_lock); } return (rv); } int spt_unlockpages(struct seg *seg, pgcnt_t anon_index, pgcnt_t npages, rctl_qty_t *unlocked) { struct shm_data *shmd = seg->s_data; struct spt_data *sptd = shmd->shm_sptseg->s_data; struct anon_map *amp = sptd->spt_amp; struct anon *ap; struct vnode *vp; u_offset_t off; struct page *pp; int kernel; anon_sync_obj_t cookie; ulong_t i; pgcnt_t nlck = 0; pgcnt_t nlck_limit = NLCK; ANON_LOCK_ENTER(&->a_rwlock, RW_READER); for (i = 0; i < npages; i++, anon_index++) { if (shmd->shm_vpage[anon_index] & DISM_PG_LOCKED) { 
anon_array_enter(amp, anon_index, &cookie); ap = anon_get_ptr(amp->ahp, anon_index); ASSERT(ap); swap_xlate(ap, &vp, &off); anon_array_exit(&cookie); pp = page_lookup(vp, off, SE_SHARED); ASSERT(pp); /* * availrmem is decremented only for pages which are not * in seg pcache, for pages in seg pcache availrmem was * decremented in _dismpagelock() */ kernel = (sptd->spt_ppa && sptd->spt_ppa[anon_index]); ASSERT(pp->p_lckcnt > 0); /* * lock page but do not change availrmem, we do it * ourselves every nlck loops. */ page_pp_unlock(pp, 0, 1); if (pp->p_lckcnt == 0) { if (kernel == 0) nlck++; *unlocked += PAGESIZE; } page_unlock(pp); shmd->shm_vpage[anon_index] &= ~DISM_PG_LOCKED; sptd->spt_ppa_lckcnt[anon_index]--; shmd->shm_lckpgs--; } /* * To reduce freemem_lock contention, do not update availrmem * until at least NLCK pages have been unlocked. * 1. No need to update if nlck is zero * 2. Always update if the last iteration */ if (nlck > 0 && (nlck == nlck_limit || i == npages - 1)) { mutex_enter(&freemem_lock); availrmem += nlck; pages_locked -= nlck; mutex_exit(&freemem_lock); nlck = 0; nlck_limit = NLCK + RAND_P2(NLCK); } } ANON_LOCK_EXIT(&->a_rwlock); return (0); } /*ARGSUSED*/ static int segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len, int attr, int op, ulong_t *lockmap, size_t pos) { struct shm_data *shmd = seg->s_data; struct seg *sptseg = shmd->shm_sptseg; struct spt_data *sptd = sptseg->s_data; struct kshmid *sp = sptd->spt_amp->a_sp; pgcnt_t npages, a_npages; page_t **ppa; pgcnt_t an_idx, a_an_idx, ppa_idx; caddr_t spt_addr, a_addr; /* spt and aligned address */ size_t a_len; /* aligned len */ size_t share_sz; ulong_t i; int sts = 0; rctl_qty_t unlocked = 0; rctl_qty_t locked = 0; struct proc *p = curproc; kproject_t *proj; ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); ASSERT(sp != NULL); if ((sptd->spt_flags & SHM_PAGEABLE) == 0) { return (0); } addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); an_idx = seg_page(seg, addr); npages = 
btopr(len); if (an_idx + npages > btopr(shmd->shm_amp->size)) { return (ENOMEM); } /* * A shm's project never changes, so no lock needed. * The shm has a hold on the project, so it will not go away. * Since we have a mapping to shm within this zone, we know * that the zone will not go away. */ proj = sp->shm_perm.ipc_proj; if (op == MC_LOCK) { /* * Need to align addr and size request if they are not * aligned so we can always allocate large page(s) however * we only lock what was requested in initial request. */ share_sz = page_get_pagesize(sptseg->s_szc); a_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_sz); a_len = P2ROUNDUP((uintptr_t)(((addr + len) - a_addr)), share_sz); a_npages = btop(a_len); a_an_idx = seg_page(seg, a_addr); spt_addr = sptseg->s_base + ptob(a_an_idx); ppa_idx = an_idx - a_an_idx; if ((ppa = kmem_zalloc(((sizeof (page_t *)) * a_npages), KM_NOSLEEP)) == NULL) { return (ENOMEM); } /* * Don't cache any new pages for IO and * flush any cached pages. */ mutex_enter(&sptd->spt_lock); if (sptd->spt_ppa != NULL) sptd->spt_flags |= DISM_PPA_CHANGED; sts = spt_anon_getpages(sptseg, spt_addr, a_len, ppa); if (sts != 0) { mutex_exit(&sptd->spt_lock); kmem_free(ppa, ((sizeof (page_t *)) * a_npages)); return (sts); } mutex_enter(&sp->shm_mlock); /* enforce locked memory rctl */ unlocked = spt_unlockedbytes(npages, &ppa[ppa_idx]); mutex_enter(&p->p_lock); if (rctl_incr_locked_mem(p, proj, unlocked, 0)) { mutex_exit(&p->p_lock); sts = EAGAIN; } else { mutex_exit(&p->p_lock); sts = spt_lockpages(seg, an_idx, npages, &ppa[ppa_idx], lockmap, pos, &locked); /* * correct locked count if not all pages could be * locked */ if ((unlocked - locked) > 0) { rctl_decr_locked_mem(NULL, proj, (unlocked - locked), 0); } } /* * unlock pages */ for (i = 0; i < a_npages; i++) page_unlock(ppa[i]); if (sptd->spt_ppa != NULL) sptd->spt_flags |= DISM_PPA_CHANGED; mutex_exit(&sp->shm_mlock); mutex_exit(&sptd->spt_lock); kmem_free(ppa, ((sizeof (page_t *)) * a_npages)); } else if 
(op == MC_UNLOCK) { /* unlock */ page_t **ppa; mutex_enter(&sptd->spt_lock); if (shmd->shm_lckpgs == 0) { mutex_exit(&sptd->spt_lock); return (0); } /* * Don't cache new IO pages. */ if (sptd->spt_ppa != NULL) sptd->spt_flags |= DISM_PPA_CHANGED; mutex_enter(&sp->shm_mlock); sts = spt_unlockpages(seg, an_idx, npages, &unlocked); if ((ppa = sptd->spt_ppa) != NULL) sptd->spt_flags |= DISM_PPA_CHANGED; mutex_exit(&sptd->spt_lock); rctl_decr_locked_mem(NULL, proj, unlocked, 0); mutex_exit(&sp->shm_mlock); if (ppa != NULL) seg_ppurge_wiredpp(ppa); } return (sts); } /*ARGSUSED*/ int segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv) { struct shm_data *shmd = (struct shm_data *)seg->s_data; struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data; spgcnt_t pgno = seg_page(seg, addr+len) - seg_page(seg, addr) + 1; ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); /* * ISM segment is always rw. */ while (--pgno >= 0) *protv++ = sptd->spt_prot; return (0); } /*ARGSUSED*/ u_offset_t segspt_shmgetoffset(struct seg *seg, caddr_t addr) { ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); /* Offset does not matter in ISM memory */ return ((u_offset_t)0); } /* ARGSUSED */ int segspt_shmgettype(struct seg *seg, caddr_t addr) { struct shm_data *shmd = (struct shm_data *)seg->s_data; struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data; ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); /* * The shared memory mapping is always MAP_SHARED, SWAP is only * reserved for DISM */ return (MAP_SHARED | ((sptd->spt_flags & SHM_PAGEABLE) ? 
0 : MAP_NORESERVE)); } /*ARGSUSED*/ int segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp) { struct shm_data *shmd = (struct shm_data *)seg->s_data; struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data; ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); *vpp = sptd->spt_vp; return (0); } /* * We need to wait for pending IO to complete to a DISM segment in order for * pages to get kicked out of the seg_pcache. 120 seconds should be more * than enough time to wait. */ static clock_t spt_pcache_wait = 120; /*ARGSUSED*/ static int segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len, uint_t behav) { struct shm_data *shmd = (struct shm_data *)seg->s_data; struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data; struct anon_map *amp; pgcnt_t pg_idx; ushort_t gen; clock_t end_lbolt; int writer; page_t **ppa; ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); if (behav == MADV_FREE || behav == MADV_PURGE) { if ((sptd->spt_flags & SHM_PAGEABLE) == 0) return (0); amp = sptd->spt_amp; pg_idx = seg_page(seg, addr); mutex_enter(&sptd->spt_lock); if ((ppa = sptd->spt_ppa) == NULL) { mutex_exit(&sptd->spt_lock); ANON_LOCK_ENTER(&->a_rwlock, RW_READER); (void) anon_disclaim(amp, pg_idx, len, behav, NULL); ANON_LOCK_EXIT(&->a_rwlock); return (0); } sptd->spt_flags |= DISM_PPA_CHANGED; gen = sptd->spt_gen; mutex_exit(&sptd->spt_lock); /* * Purge all DISM cached pages */ seg_ppurge_wiredpp(ppa); /* * Drop the AS_LOCK so that other threads can grab it * in the as_pageunlock path and hopefully get the segment * kicked out of the seg_pcache. We bump the shm_softlockcnt * to keep this segment resident. */ writer = AS_WRITE_HELD(seg->s_as); atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt))); AS_LOCK_EXIT(seg->s_as); mutex_enter(&sptd->spt_lock); end_lbolt = ddi_get_lbolt() + (hz * spt_pcache_wait); /* * Try to wait for pages to get kicked out of the seg_pcache. 
*/ while (sptd->spt_gen == gen && (sptd->spt_flags & DISM_PPA_CHANGED) && ddi_get_lbolt() < end_lbolt) { if (!cv_timedwait_sig(&sptd->spt_cv, &sptd->spt_lock, end_lbolt)) { break; } } mutex_exit(&sptd->spt_lock); /* Regrab the AS_LOCK and release our hold on the segment */ AS_LOCK_ENTER(seg->s_as, writer ? RW_WRITER : RW_READER); atomic_dec_ulong((ulong_t *)(&(shmd->shm_softlockcnt))); if (shmd->shm_softlockcnt <= 0) { if (AS_ISUNMAPWAIT(seg->s_as)) { mutex_enter(&seg->s_as->a_contents); if (AS_ISUNMAPWAIT(seg->s_as)) { AS_CLRUNMAPWAIT(seg->s_as); cv_broadcast(&seg->s_as->a_cv); } mutex_exit(&seg->s_as->a_contents); } } ANON_LOCK_ENTER(&->a_rwlock, RW_READER); (void) anon_disclaim(amp, pg_idx, len, behav, NULL); ANON_LOCK_EXIT(&->a_rwlock); } else if (lgrp_optimizations() && (behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY || behav == MADV_ACCESS_DEFAULT)) { int already_set; ulong_t anon_index; lgrp_mem_policy_t policy; caddr_t shm_addr; size_t share_size; size_t size; struct seg *sptseg = shmd->shm_sptseg; caddr_t sptseg_addr; /* * Align address and length to page size of underlying segment */ share_size = page_get_pagesize(shmd->shm_sptseg->s_szc); shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_size); size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), share_size); amp = shmd->shm_amp; anon_index = seg_page(seg, shm_addr); /* * And now we may have to adjust size downward if we have * exceeded the realsize of the segment or initial anon * allocations. */ sptseg_addr = sptseg->s_base + ptob(anon_index); if ((sptseg_addr + size) > (sptseg->s_base + sptd->spt_realsize)) size = (sptseg->s_base + sptd->spt_realsize) - sptseg_addr; /* * Set memory allocation policy for this segment */ policy = lgrp_madv_to_policy(behav, len, MAP_SHARED); already_set = lgrp_shm_policy_set(policy, amp, anon_index, NULL, 0, len); /* * If random memory allocation policy set already, * don't bother reapplying it. 
*/ if (already_set && !LGRP_MEM_POLICY_REAPPLICABLE(policy)) return (0); /* * Mark any existing pages in the given range for * migration, flushing the I/O page cache, and using * underlying segment to calculate anon index and get * anonmap and vnode pointer from */ if (shmd->shm_softlockcnt > 0) segspt_purge(seg); page_mark_migrate(seg, shm_addr, size, amp, 0, NULL, 0, 0); } return (0); } /*ARGSUSED*/ void segspt_shmdump(struct seg *seg) { /* no-op for ISM segment */ } /*ARGSUSED*/ static faultcode_t segspt_shmsetpgsz(struct seg *seg, caddr_t addr, size_t len, uint_t szc) { return (ENOTSUP); } /* * get a memory ID for an addr in a given segment */ static int segspt_shmgetmemid(struct seg *seg, caddr_t addr, memid_t *memidp) { struct shm_data *shmd = (struct shm_data *)seg->s_data; struct anon *ap; size_t anon_index; struct anon_map *amp = shmd->shm_amp; struct spt_data *sptd = shmd->shm_sptseg->s_data; struct seg *sptseg = shmd->shm_sptseg; anon_sync_obj_t cookie; anon_index = seg_page(seg, addr); if (addr > (seg->s_base + sptd->spt_realsize)) { return (EFAULT); } ANON_LOCK_ENTER(&->a_rwlock, RW_READER); anon_array_enter(amp, anon_index, &cookie); ap = anon_get_ptr(amp->ahp, anon_index); if (ap == NULL) { struct page *pp; caddr_t spt_addr = sptseg->s_base + ptob(anon_index); pp = anon_zero(sptseg, spt_addr, &ap, kcred); if (pp == NULL) { anon_array_exit(&cookie); ANON_LOCK_EXIT(&->a_rwlock); return (ENOMEM); } (void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP); page_unlock(pp); } anon_array_exit(&cookie); ANON_LOCK_EXIT(&->a_rwlock); memidp->val[0] = (uintptr_t)ap; memidp->val[1] = (uintptr_t)addr & PAGEOFFSET; return (0); } /* * Get memory allocation policy info for specified address in given segment */ static lgrp_mem_policy_info_t * segspt_shmgetpolicy(struct seg *seg, caddr_t addr) { struct anon_map *amp; ulong_t anon_index; lgrp_mem_policy_info_t *policy_info; struct shm_data *shm_data; ASSERT(seg != NULL); /* * Get anon_map from segshm * * Assume that 
no lock needs to be held on anon_map, since * it should be protected by its reference count which must be * nonzero for an existing segment * Need to grab readers lock on policy tree though */ shm_data = (struct shm_data *)seg->s_data; if (shm_data == NULL) return (NULL); amp = shm_data->shm_amp; ASSERT(amp->refcnt != 0); /* * Get policy info * * Assume starting anon index of 0 */ anon_index = seg_page(seg, addr); policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0); return (policy_info); } /*ARGSUSED*/ static int segspt_shmcapable(struct seg *seg, segcapability_t capability) { return (0); }