1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25 #include <sys/param.h> 26 #include <sys/user.h> 27 #include <sys/mman.h> 28 #include <sys/kmem.h> 29 #include <sys/sysmacros.h> 30 #include <sys/cmn_err.h> 31 #include <sys/systm.h> 32 #include <sys/tuneable.h> 33 #include <vm/hat.h> 34 #include <vm/seg.h> 35 #include <vm/as.h> 36 #include <vm/anon.h> 37 #include <vm/page.h> 38 #include <sys/buf.h> 39 #include <sys/swap.h> 40 #include <sys/atomic.h> 41 #include <vm/seg_spt.h> 42 #include <sys/debug.h> 43 #include <sys/vtrace.h> 44 #include <sys/shm.h> 45 #include <sys/shm_impl.h> 46 #include <sys/lgrp.h> 47 #include <sys/vmsystm.h> 48 #include <sys/policy.h> 49 #include <sys/project.h> 50 #include <sys/tnf_probe.h> 51 #include <sys/zone.h> 52 53 #define SEGSPTADDR (caddr_t)0x0 54 55 /* 56 * # pages used for spt 57 */ 58 size_t spt_used; 59 60 /* 61 * segspt_minfree is the memory left for system after ISM 62 * locked its pages; it is set up to 5% of availrmem in 63 * sptcreate when ISM is created. ISM should not use more 64 * than ~90% of availrmem; if it does, then the performance 65 * of the system may decrease. Machines with large memories may 66 * be able to use up more memory for ISM so we set the default 67 * segspt_minfree to 5% (which gives ISM max 95% of availrmem. 68 * If somebody wants even more memory for ISM (risking hanging 69 * the system) they can patch the segspt_minfree to smaller number. 
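 *
 * A worked example with hypothetical numbers: if availrmem is
 * 1,000,000 pages when sptcreate() is first called, segspt_minfree
 * defaults to availrmem / 20 = 50,000 pages, so ISM can lock at most
 * roughly 950,000 pages before anon_swap_adjust() refuses further
 * reservations.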
70 */ 71 pgcnt_t segspt_minfree = 0; 72 73 static int segspt_create(struct seg *seg, caddr_t argsp); 74 static int segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize); 75 static void segspt_free(struct seg *seg); 76 static void segspt_free_pages(struct seg *seg, caddr_t addr, size_t len); 77 static lgrp_mem_policy_info_t *segspt_getpolicy(struct seg *seg, caddr_t addr); 78 79 static void 80 segspt_badop() 81 { 82 panic("segspt_badop called"); 83 /*NOTREACHED*/ 84 } 85 86 #define SEGSPT_BADOP(t) (t(*)())segspt_badop 87 88 struct seg_ops segspt_ops = { 89 SEGSPT_BADOP(int), /* dup */ 90 segspt_unmap, 91 segspt_free, 92 SEGSPT_BADOP(int), /* fault */ 93 SEGSPT_BADOP(faultcode_t), /* faulta */ 94 SEGSPT_BADOP(int), /* setprot */ 95 SEGSPT_BADOP(int), /* checkprot */ 96 SEGSPT_BADOP(int), /* kluster */ 97 SEGSPT_BADOP(size_t), /* swapout */ 98 SEGSPT_BADOP(int), /* sync */ 99 SEGSPT_BADOP(size_t), /* incore */ 100 SEGSPT_BADOP(int), /* lockop */ 101 SEGSPT_BADOP(int), /* getprot */ 102 SEGSPT_BADOP(u_offset_t), /* getoffset */ 103 SEGSPT_BADOP(int), /* gettype */ 104 SEGSPT_BADOP(int), /* getvp */ 105 SEGSPT_BADOP(int), /* advise */ 106 SEGSPT_BADOP(void), /* dump */ 107 SEGSPT_BADOP(int), /* pagelock */ 108 SEGSPT_BADOP(int), /* setpgsz */ 109 SEGSPT_BADOP(int), /* getmemid */ 110 segspt_getpolicy, /* getpolicy */ 111 SEGSPT_BADOP(int), /* capable */ 112 }; 113 114 static int segspt_shmdup(struct seg *seg, struct seg *newseg); 115 static int segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize); 116 static void segspt_shmfree(struct seg *seg); 117 static faultcode_t segspt_shmfault(struct hat *hat, struct seg *seg, 118 caddr_t addr, size_t len, enum fault_type type, enum seg_rw rw); 119 static faultcode_t segspt_shmfaulta(struct seg *seg, caddr_t addr); 120 static int segspt_shmsetprot(register struct seg *seg, register caddr_t addr, 121 register size_t len, register uint_t prot); 122 static int segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size, 123 uint_t prot); 124 static int segspt_shmkluster(struct seg *seg, caddr_t addr, ssize_t delta); 125 static size_t segspt_shmswapout(struct seg *seg); 126 static size_t segspt_shmincore(struct seg *seg, caddr_t addr, size_t len, 127 register char *vec); 128 static int segspt_shmsync(struct seg *seg, register caddr_t addr, size_t len, 129 int attr, uint_t flags); 130 static int segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len, 131 int attr, int op, ulong_t *lockmap, size_t pos); 132 static int segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len, 133 uint_t *protv); 134 static u_offset_t segspt_shmgetoffset(struct seg *seg, caddr_t addr); 135 static int segspt_shmgettype(struct seg *seg, caddr_t addr); 136 static int segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp); 137 static int segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len, 138 uint_t behav); 139 static void segspt_shmdump(struct seg *seg); 140 static int segspt_shmpagelock(struct seg *, caddr_t, size_t, 141 struct page ***, enum lock_type, enum seg_rw); 142 static int segspt_shmsetpgsz(struct seg *, caddr_t, size_t, uint_t); 143 static int segspt_shmgetmemid(struct seg *, caddr_t, memid_t *); 144 static lgrp_mem_policy_info_t *segspt_shmgetpolicy(struct seg *, caddr_t); 145 static int segspt_shmcapable(struct seg *, segcapability_t); 146 147 struct seg_ops segspt_shmops = { 148 segspt_shmdup, 149 segspt_shmunmap, 150 segspt_shmfree, 151 segspt_shmfault, 152 segspt_shmfaulta, 153 segspt_shmsetprot, 154 segspt_shmcheckprot, 
155 segspt_shmkluster, 156 segspt_shmswapout, 157 segspt_shmsync, 158 segspt_shmincore, 159 segspt_shmlockop, 160 segspt_shmgetprot, 161 segspt_shmgetoffset, 162 segspt_shmgettype, 163 segspt_shmgetvp, 164 segspt_shmadvise, /* advise */ 165 segspt_shmdump, 166 segspt_shmpagelock, 167 segspt_shmsetpgsz, 168 segspt_shmgetmemid, 169 segspt_shmgetpolicy, 170 segspt_shmcapable, 171 }; 172 173 static void segspt_purge(struct seg *seg); 174 static int segspt_reclaim(void *, caddr_t, size_t, struct page **, 175 enum seg_rw, int); 176 static int spt_anon_getpages(struct seg *seg, caddr_t addr, size_t len, 177 page_t **ppa); 178 179 180 181 /*ARGSUSED*/ 182 int 183 sptcreate(size_t size, struct seg **sptseg, struct anon_map *amp, 184 uint_t prot, uint_t flags, uint_t share_szc) 185 { 186 int err; 187 struct as *newas; 188 struct segspt_crargs sptcargs; 189 190 #ifdef DEBUG 191 TNF_PROBE_1(sptcreate, "spt", /* CSTYLED */, 192 tnf_ulong, size, size ); 193 #endif 194 if (segspt_minfree == 0) /* leave min 5% of availrmem for */ 195 segspt_minfree = availrmem/20; /* for the system */ 196 197 if (!hat_supported(HAT_SHARED_PT, (void *)0)) 198 return (EINVAL); 199 200 /* 201 * get a new as for this shared memory segment 202 */ 203 newas = as_alloc(); 204 newas->a_proc = NULL; 205 sptcargs.amp = amp; 206 sptcargs.prot = prot; 207 sptcargs.flags = flags; 208 sptcargs.szc = share_szc; 209 /* 210 * create a shared page table (spt) segment 211 */ 212 213 if (err = as_map(newas, SEGSPTADDR, size, segspt_create, &sptcargs)) { 214 as_free(newas); 215 return (err); 216 } 217 *sptseg = sptcargs.seg_spt; 218 return (0); 219 } 220 221 void 222 sptdestroy(struct as *as, struct anon_map *amp) 223 { 224 225 #ifdef DEBUG 226 TNF_PROBE_0(sptdestroy, "spt", /* CSTYLED */); 227 #endif 228 (void) as_unmap(as, SEGSPTADDR, amp->size); 229 as_free(as); 230 } 231 232 /* 233 * called from seg_free(). 234 * free (i.e., unlock, unmap, return to free list) 235 * all the pages in the given seg. 236 */ 237 void 238 segspt_free(struct seg *seg) 239 { 240 struct spt_data *sptd = (struct spt_data *)seg->s_data; 241 242 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 243 244 if (sptd != NULL) { 245 if (sptd->spt_realsize) 246 segspt_free_pages(seg, seg->s_base, sptd->spt_realsize); 247 248 if (sptd->spt_ppa_lckcnt) 249 kmem_free(sptd->spt_ppa_lckcnt, 250 sizeof (*sptd->spt_ppa_lckcnt) 251 * btopr(sptd->spt_amp->size)); 252 kmem_free(sptd->spt_vp, sizeof (*sptd->spt_vp)); 253 cv_destroy(&sptd->spt_cv); 254 mutex_destroy(&sptd->spt_lock); 255 kmem_free(sptd, sizeof (*sptd)); 256 } 257 } 258 259 /*ARGSUSED*/ 260 static int 261 segspt_shmsync(struct seg *seg, caddr_t addr, size_t len, int attr, 262 uint_t flags) 263 { 264 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 265 266 return (0); 267 } 268 269 /*ARGSUSED*/ 270 static size_t 271 segspt_shmincore(struct seg *seg, caddr_t addr, size_t len, char *vec) 272 { 273 caddr_t eo_seg; 274 pgcnt_t npages; 275 struct shm_data *shmd = (struct shm_data *)seg->s_data; 276 struct seg *sptseg; 277 struct spt_data *sptd; 278 279 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 280 #ifdef lint 281 seg = seg; 282 #endif 283 sptseg = shmd->shm_sptseg; 284 sptd = sptseg->s_data; 285 286 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) { 287 eo_seg = addr + len; 288 while (addr < eo_seg) { 289 /* page exists, and it's locked. 
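			 * For ISM (non-pageable) segments every
			 * page was allocated and locked down in
			 * segspt_create(), so the whole range is
			 * reported resident and locked by
			 * construction.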
			 */
			*vec++ = SEG_PAGE_INCORE | SEG_PAGE_LOCKED |
			    SEG_PAGE_ANON;
			addr += PAGESIZE;
		}
		return (len);
	} else {
		struct anon_map *amp = shmd->shm_amp;
		struct anon *ap;
		page_t *pp;
		pgcnt_t anon_index;
		struct vnode *vp;
		u_offset_t off;
		ulong_t i;
		int ret;
		anon_sync_obj_t cookie;

		addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
		anon_index = seg_page(seg, addr);
		npages = btopr(len);
		if (anon_index + npages > btopr(shmd->shm_amp->size)) {
			return (EINVAL);
		}
		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
		for (i = 0; i < npages; i++, anon_index++) {
			ret = 0;
			anon_array_enter(amp, anon_index, &cookie);
			ap = anon_get_ptr(amp->ahp, anon_index);
			if (ap != NULL) {
				swap_xlate(ap, &vp, &off);
				anon_array_exit(&cookie);
				pp = page_lookup_nowait(vp, off, SE_SHARED);
				if (pp != NULL) {
					ret |= SEG_PAGE_INCORE | SEG_PAGE_ANON;
					page_unlock(pp);
				}
			} else {
				anon_array_exit(&cookie);
			}
			if (shmd->shm_vpage[anon_index] & DISM_PG_LOCKED) {
				ret |= SEG_PAGE_LOCKED;
			}
			*vec++ = (char)ret;
		}
		ANON_LOCK_EXIT(&amp->a_rwlock);
		return (len);
	}
}

static int
segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize)
{
	size_t share_size;

	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));

	/*
	 * seg.s_size may have been rounded up to the largest page size
	 * in shmat().
	 * XXX This should be cleaned up. sptdestroy should take a length
	 * argument which should be the same as sptcreate. Then
	 * this rounding would not be needed (or is done in shm.c)
	 * Only the check for full segment will be needed.
	 *
	 * XXX -- shouldn't raddr == 0 always? These tests don't seem
	 * to be useful at all.
	 */
	share_size = page_get_pagesize(seg->s_szc);
	ssize = P2ROUNDUP(ssize, share_size);

	if (raddr == seg->s_base && ssize == seg->s_size) {
		seg_free(seg);
		return (0);
	} else
		return (EINVAL);
}

int
segspt_create(struct seg *seg, caddr_t argsp)
{
	int err;
	caddr_t addr = seg->s_base;
	struct spt_data *sptd;
	struct segspt_crargs *sptcargs = (struct segspt_crargs *)argsp;
	struct anon_map *amp = sptcargs->amp;
	struct kshmid *sp = amp->a_sp;
	struct cred *cred = CRED();
	ulong_t i, j, anon_index = 0;
	pgcnt_t npages = btopr(amp->size);
	struct vnode *vp;
	page_t **ppa;
	uint_t hat_flags;
	size_t pgsz;
	pgcnt_t pgcnt;
	caddr_t a;
	pgcnt_t pidx;
	size_t sz;
	proc_t *procp = curproc;
	rctl_qty_t lockedbytes = 0;
	kproject_t *proj;

	/*
	 * We are holding the a_lock on the underlying dummy as,
	 * so we can make calls to the HAT layer.
	 */
	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
	ASSERT(sp != NULL);

#ifdef DEBUG
	TNF_PROBE_2(segspt_create, "spt", /* CSTYLED */,
	    tnf_opaque, addr, addr, tnf_ulong, len, seg->s_size);
#endif
	if ((sptcargs->flags & SHM_PAGEABLE) == 0) {
		if (err = anon_swap_adjust(npages, segspt_minfree, 60))
			return (err);
	}
	err = ENOMEM;

	if ((sptd = kmem_zalloc(sizeof (*sptd), KM_NOSLEEP)) == NULL)
		goto out1;

	if ((sptcargs->flags & SHM_PAGEABLE) == 0) {
		if ((ppa = kmem_zalloc(((sizeof (page_t *)) * npages),
		    KM_NOSLEEP)) == NULL)
			goto out2;
	}

	mutex_init(&sptd->spt_lock, NULL, MUTEX_DEFAULT, NULL);

	if ((vp = kmem_zalloc(sizeof (*vp), KM_NOSLEEP)) == NULL)
		goto out3;

	seg->s_ops = &segspt_ops;
	sptd->spt_vp = vp;
	sptd->spt_amp = amp;
	sptd->spt_prot = sptcargs->prot;
	sptd->spt_flags = sptcargs->flags;
	seg->s_data = (caddr_t)sptd;
	sptd->spt_ppa = NULL;
	sptd->spt_ppa_lckcnt = NULL;
	seg->s_szc = sptcargs->szc;
	cv_init(&sptd->spt_cv, NULL, CV_DEFAULT, NULL);
	sptd->spt_gen = 0;

	ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
	if (seg->s_szc > amp->a_szc) {
		amp->a_szc = seg->s_szc;
	}
	ANON_LOCK_EXIT(&amp->a_rwlock);

	/*
	 * Set policy to affect initial allocation of pages in
	 * anon_map_createpages()
	 */
	(void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, amp, anon_index,
	    NULL, 0, ptob(npages));

	if (sptcargs->flags & SHM_PAGEABLE) {
		size_t share_sz;
		pgcnt_t new_npgs, more_pgs;
		struct anon_hdr *nahp;
		zone_t *zone;

		share_sz = page_get_pagesize(seg->s_szc);
		if (!IS_P2ALIGNED(amp->size, share_sz)) {
			/*
			 * We round the size of the anon array up to a
			 * large-page (e.g. 4M) boundary because we always
			 * create a full large page's worth of pages when
			 * locking and faulting, and this way we don't have
			 * to check all the corner cases, e.g. whether there
			 * is enough space to allocate a full large page.
			 */
			new_npgs = btop(P2ROUNDUP(amp->size, share_sz));
			more_pgs = new_npgs - npages;

			/*
			 * The zone will never be NULL, as a fully created
			 * shm always has an owning zone.
			 */
			zone = sp->shm_perm.ipc_zone;
			ASSERT(zone != NULL);
			if (anon_resv_zone(ptob(more_pgs), zone) == 0) {
				err = ENOMEM;
				goto out4;
			}

			nahp = anon_create(new_npgs, ANON_SLEEP);
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
			(void) anon_copy_ptr(amp->ahp, 0, nahp, 0, npages,
			    ANON_SLEEP);
			anon_release(amp->ahp, npages);
			amp->ahp = nahp;
			ASSERT(amp->swresv == ptob(npages));
			amp->swresv = amp->size = ptob(new_npgs);
			ANON_LOCK_EXIT(&amp->a_rwlock);
			npages = new_npgs;
		}

		sptd->spt_ppa_lckcnt = kmem_zalloc(npages *
		    sizeof (*sptd->spt_ppa_lckcnt), KM_SLEEP);
		sptd->spt_pcachecnt = 0;
		sptd->spt_realsize = ptob(npages);
		sptcargs->seg_spt = seg;
		return (0);
	}

	/*
	 * get array of pages for each anon slot in amp
	 */
	if ((err = anon_map_createpages(amp, anon_index, ptob(npages), ppa,
	    seg, addr, S_CREATE, cred)) != 0)
		goto out4;

	mutex_enter(&sp->shm_mlock);

	/* May be partially locked, so count the bytes to charge for locking */
	for (i = 0; i < npages; i++)
		if (ppa[i]->p_lckcnt == 0)
			lockedbytes += PAGESIZE;

	proj = sp->shm_perm.ipc_proj;

	if (lockedbytes > 0) {
		mutex_enter(&procp->p_lock);
		if (rctl_incr_locked_mem(procp, proj, lockedbytes, 0)) {
			mutex_exit(&procp->p_lock);
			mutex_exit(&sp->shm_mlock);
			for (i = 0; i < npages; i++)
				page_unlock(ppa[i]);
			err = ENOMEM;
			goto out4;
		}
		mutex_exit(&procp->p_lock);
	}

	/*
	 * addr is the initial address corresponding to the first page
	 * on the ppa list
	 */
	for (i = 0; i < npages; i++) {
		/* attempt to lock all pages */
		if (page_pp_lock(ppa[i], 0, 1) == 0) {
			/*
			 * if unable to lock any page, unlock all
			 * of them and return error
			 */
			for (j = 0; j < i; j++)
				page_pp_unlock(ppa[j], 0, 1);
			for (i = 0; i < npages; i++)
				page_unlock(ppa[i]);
			rctl_decr_locked_mem(NULL, proj, lockedbytes, 0);
			mutex_exit(&sp->shm_mlock);
			err = ENOMEM;
			goto out4;
		}
	}
	mutex_exit(&sp->shm_mlock);

	/*
	 * Some platforms assume that ISM mappings are HAT_LOAD_LOCK
	 * for the entire life of the segment. For example platforms
	 * that do not support Dynamic Reconfiguration.
	 */
	hat_flags = HAT_LOAD_SHARE;
	if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, NULL))
		hat_flags |= HAT_LOAD_LOCK;

	/*
	 * Load translations one large page at a time
	 * to make sure we don't create mappings bigger than
	 * the segment's size code, in case the underlying pages
	 * are shared with a segvn segment that uses a bigger
	 * size code than we do.
	 */
	pgsz = page_get_pagesize(seg->s_szc);
	pgcnt = page_get_pagecnt(seg->s_szc);
	for (a = addr, pidx = 0; pidx < npages; a += pgsz, pidx += pgcnt) {
		sz = MIN(pgsz, ptob(npages - pidx));
		hat_memload_array(seg->s_as->a_hat, a, sz,
		    &ppa[pidx], sptd->spt_prot, hat_flags);
	}

	/*
	 * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP,
	 * we will leave the pages locked SE_SHARED for the life
	 * of the ISM segment. This will prevent any calls to
	 * hat_pageunload() on this ISM segment for those platforms.
	 */
	if (!(hat_flags & HAT_LOAD_LOCK)) {
		/*
		 * On platforms that support HAT_DYNAMIC_ISM_UNMAP,
		 * we no longer need to hold the SE_SHARED lock on the pages,
		 * since L_PAGELOCK and F_SOFTLOCK calls will grab the
		 * SE_SHARED lock on the pages as necessary.
583 */ 584 for (i = 0; i < npages; i++) 585 page_unlock(ppa[i]); 586 } 587 sptd->spt_pcachecnt = 0; 588 kmem_free(ppa, ((sizeof (page_t *)) * npages)); 589 sptd->spt_realsize = ptob(npages); 590 atomic_add_long(&spt_used, npages); 591 sptcargs->seg_spt = seg; 592 return (0); 593 594 out4: 595 seg->s_data = NULL; 596 kmem_free(vp, sizeof (*vp)); 597 cv_destroy(&sptd->spt_cv); 598 out3: 599 mutex_destroy(&sptd->spt_lock); 600 if ((sptcargs->flags & SHM_PAGEABLE) == 0) 601 kmem_free(ppa, (sizeof (*ppa) * npages)); 602 out2: 603 kmem_free(sptd, sizeof (*sptd)); 604 out1: 605 if ((sptcargs->flags & SHM_PAGEABLE) == 0) 606 anon_swap_restore(npages); 607 return (err); 608 } 609 610 /*ARGSUSED*/ 611 void 612 segspt_free_pages(struct seg *seg, caddr_t addr, size_t len) 613 { 614 struct page *pp; 615 struct spt_data *sptd = (struct spt_data *)seg->s_data; 616 pgcnt_t npages; 617 ulong_t anon_idx; 618 struct anon_map *amp; 619 struct anon *ap; 620 struct vnode *vp; 621 u_offset_t off; 622 uint_t hat_flags; 623 int root = 0; 624 pgcnt_t pgs, curnpgs = 0; 625 page_t *rootpp; 626 rctl_qty_t unlocked_bytes = 0; 627 kproject_t *proj; 628 kshmid_t *sp; 629 630 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 631 632 len = P2ROUNDUP(len, PAGESIZE); 633 634 npages = btop(len); 635 636 hat_flags = HAT_UNLOAD_UNLOCK | HAT_UNLOAD_UNMAP; 637 if ((hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) || 638 (sptd->spt_flags & SHM_PAGEABLE)) { 639 hat_flags = HAT_UNLOAD_UNMAP; 640 } 641 642 hat_unload(seg->s_as->a_hat, addr, len, hat_flags); 643 644 amp = sptd->spt_amp; 645 if (sptd->spt_flags & SHM_PAGEABLE) 646 npages = btop(amp->size); 647 648 ASSERT(amp != NULL); 649 650 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) { 651 sp = amp->a_sp; 652 proj = sp->shm_perm.ipc_proj; 653 mutex_enter(&sp->shm_mlock); 654 } 655 for (anon_idx = 0; anon_idx < npages; anon_idx++) { 656 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) { 657 if ((ap = anon_get_ptr(amp->ahp, anon_idx)) == NULL) { 658 panic("segspt_free_pages: null app"); 659 /*NOTREACHED*/ 660 } 661 } else { 662 if ((ap = anon_get_next_ptr(amp->ahp, &anon_idx)) 663 == NULL) 664 continue; 665 } 666 ASSERT(ANON_ISBUSY(anon_get_slot(amp->ahp, anon_idx)) == 0); 667 swap_xlate(ap, &vp, &off); 668 669 /* 670 * If this platform supports HAT_DYNAMIC_ISM_UNMAP, 671 * the pages won't be having SE_SHARED lock at this 672 * point. 673 * 674 * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP, 675 * the pages are still held SE_SHARED locked from the 676 * original segspt_create() 677 * 678 * Our goal is to get SE_EXCL lock on each page, remove 679 * permanent lock on it and invalidate the page. 680 */ 681 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) { 682 if (hat_flags == HAT_UNLOAD_UNMAP) 683 pp = page_lookup(vp, off, SE_EXCL); 684 else { 685 if ((pp = page_find(vp, off)) == NULL) { 686 panic("segspt_free_pages: " 687 "page not locked"); 688 /*NOTREACHED*/ 689 } 690 if (!page_tryupgrade(pp)) { 691 page_unlock(pp); 692 pp = page_lookup(vp, off, SE_EXCL); 693 } 694 } 695 if (pp == NULL) { 696 panic("segspt_free_pages: " 697 "page not in the system"); 698 /*NOTREACHED*/ 699 } 700 ASSERT(pp->p_lckcnt > 0); 701 page_pp_unlock(pp, 0, 1); 702 if (pp->p_lckcnt == 0) 703 unlocked_bytes += PAGESIZE; 704 } else { 705 if ((pp = page_lookup(vp, off, SE_EXCL)) == NULL) 706 continue; 707 } 708 /* 709 * It's logical to invalidate the pages here as in most cases 710 * these were created by segspt. 
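		 *
		 * The code below also reassembles large pages from their
		 * constituent pages before destroying them. As a worked
		 * example, for a large page with 4 constituents the first
		 * one records rootpp and leaves curnpgs at 3, the two
		 * middle ones only decrement curnpgs, and the last one
		 * (curnpgs == 1) calls page_destroy_pages(rootpp) and
		 * resets the bookkeeping.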
711 */ 712 if (pp->p_szc != 0) { 713 if (root == 0) { 714 ASSERT(curnpgs == 0); 715 root = 1; 716 rootpp = pp; 717 pgs = curnpgs = page_get_pagecnt(pp->p_szc); 718 ASSERT(pgs > 1); 719 ASSERT(IS_P2ALIGNED(pgs, pgs)); 720 ASSERT(!(page_pptonum(pp) & (pgs - 1))); 721 curnpgs--; 722 } else if ((page_pptonum(pp) & (pgs - 1)) == pgs - 1) { 723 ASSERT(curnpgs == 1); 724 ASSERT(page_pptonum(pp) == 725 page_pptonum(rootpp) + (pgs - 1)); 726 page_destroy_pages(rootpp); 727 root = 0; 728 curnpgs = 0; 729 } else { 730 ASSERT(curnpgs > 1); 731 ASSERT(page_pptonum(pp) == 732 page_pptonum(rootpp) + (pgs - curnpgs)); 733 curnpgs--; 734 } 735 } else { 736 if (root != 0 || curnpgs != 0) { 737 panic("segspt_free_pages: bad large page"); 738 /*NOTREACHED*/ 739 } 740 /* 741 * Before destroying the pages, we need to take care 742 * of the rctl locked memory accounting. For that 743 * we need to calculte the unlocked_bytes. 744 */ 745 if (pp->p_lckcnt > 0) 746 unlocked_bytes += PAGESIZE; 747 /*LINTED: constant in conditional context */ 748 VN_DISPOSE(pp, B_INVAL, 0, kcred); 749 } 750 } 751 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) { 752 if (unlocked_bytes > 0) 753 rctl_decr_locked_mem(NULL, proj, unlocked_bytes, 0); 754 mutex_exit(&sp->shm_mlock); 755 } 756 if (root != 0 || curnpgs != 0) { 757 panic("segspt_free_pages: bad large page"); 758 /*NOTREACHED*/ 759 } 760 761 /* 762 * mark that pages have been released 763 */ 764 sptd->spt_realsize = 0; 765 766 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) { 767 atomic_add_long(&spt_used, -npages); 768 anon_swap_restore(npages); 769 } 770 } 771 772 /* 773 * Get memory allocation policy info for specified address in given segment 774 */ 775 static lgrp_mem_policy_info_t * 776 segspt_getpolicy(struct seg *seg, caddr_t addr) 777 { 778 struct anon_map *amp; 779 ulong_t anon_index; 780 lgrp_mem_policy_info_t *policy_info; 781 struct spt_data *spt_data; 782 783 ASSERT(seg != NULL); 784 785 /* 786 * Get anon_map from segspt 787 * 788 * Assume that no lock needs to be held on anon_map, since 789 * it should be protected by its reference count which must be 790 * nonzero for an existing segment 791 * Need to grab readers lock on policy tree though 792 */ 793 spt_data = (struct spt_data *)seg->s_data; 794 if (spt_data == NULL) 795 return (NULL); 796 amp = spt_data->spt_amp; 797 ASSERT(amp->refcnt != 0); 798 799 /* 800 * Get policy info 801 * 802 * Assume starting anon index of 0 803 */ 804 anon_index = seg_page(seg, addr); 805 policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0); 806 807 return (policy_info); 808 } 809 810 /* 811 * DISM only. 812 * Return locked pages over a given range. 813 * 814 * We will cache all DISM locked pages and save the pplist for the 815 * entire segment in the ppa field of the underlying DISM segment structure. 816 * Later, during a call to segspt_reclaim() we will use this ppa array 817 * to page_unlock() all of the pages and then we will free this ppa list. 
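 *
 * A sketch of the caller-side protocol (uaddr and len are placeholder
 * names; the exact callers vary): a driver doing I/O against DISM
 * memory typically goes through as_pagelock()/as_pageunlock(), which
 * reach this routine as the L_PAGELOCK and L_PAGEUNLOCK cases:
 *
 *	page_t **pplist;
 *
 *	if (as_pagelock(as, &pplist, uaddr, len, S_WRITE) == 0) {
 *		... do the I/O against pplist ...
 *		as_pageunlock(as, pplist, uaddr, len, S_WRITE);
 *	}
 *
 * The first L_PAGELOCK call builds and caches the per-segment ppa
 * array; later calls are satisfied from that cache until
 * segspt_reclaim() tears it down.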
818 */ 819 /*ARGSUSED*/ 820 static int 821 segspt_dismpagelock(struct seg *seg, caddr_t addr, size_t len, 822 struct page ***ppp, enum lock_type type, enum seg_rw rw) 823 { 824 struct shm_data *shmd = (struct shm_data *)seg->s_data; 825 struct seg *sptseg = shmd->shm_sptseg; 826 struct spt_data *sptd = sptseg->s_data; 827 pgcnt_t pg_idx, npages, tot_npages, npgs; 828 struct page **pplist, **pl, **ppa, *pp; 829 struct anon_map *amp; 830 spgcnt_t an_idx; 831 int ret = ENOTSUP; 832 uint_t pl_built = 0; 833 struct anon *ap; 834 struct vnode *vp; 835 u_offset_t off; 836 pgcnt_t claim_availrmem = 0; 837 uint_t szc; 838 839 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 840 ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK); 841 842 /* 843 * We want to lock/unlock the entire ISM segment. Therefore, 844 * we will be using the underlying sptseg and it's base address 845 * and length for the caching arguments. 846 */ 847 ASSERT(sptseg); 848 ASSERT(sptd); 849 850 pg_idx = seg_page(seg, addr); 851 npages = btopr(len); 852 853 /* 854 * check if the request is larger than number of pages covered 855 * by amp 856 */ 857 if (pg_idx + npages > btopr(sptd->spt_amp->size)) { 858 *ppp = NULL; 859 return (ENOTSUP); 860 } 861 862 if (type == L_PAGEUNLOCK) { 863 ASSERT(sptd->spt_ppa != NULL); 864 865 seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size, 866 sptd->spt_ppa, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim); 867 868 /* 869 * If someone is blocked while unmapping, we purge 870 * segment page cache and thus reclaim pplist synchronously 871 * without waiting for seg_pasync_thread. This speeds up 872 * unmapping in cases where munmap(2) is called, while 873 * raw async i/o is still in progress or where a thread 874 * exits on data fault in a multithreaded application. 875 */ 876 if ((sptd->spt_flags & DISM_PPA_CHANGED) || 877 (AS_ISUNMAPWAIT(seg->s_as) && 878 shmd->shm_softlockcnt > 0)) { 879 segspt_purge(seg); 880 } 881 return (0); 882 } 883 884 /* The L_PAGELOCK case ... */ 885 886 if (sptd->spt_flags & DISM_PPA_CHANGED) { 887 segspt_purge(seg); 888 /* 889 * for DISM ppa needs to be rebuild since 890 * number of locked pages could be changed 891 */ 892 *ppp = NULL; 893 return (ENOTSUP); 894 } 895 896 /* 897 * First try to find pages in segment page cache, without 898 * holding the segment lock. 899 */ 900 pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size, 901 S_WRITE, SEGP_FORCE_WIRED); 902 if (pplist != NULL) { 903 ASSERT(sptd->spt_ppa != NULL); 904 ASSERT(sptd->spt_ppa == pplist); 905 ppa = sptd->spt_ppa; 906 for (an_idx = pg_idx; an_idx < pg_idx + npages; ) { 907 if (ppa[an_idx] == NULL) { 908 seg_pinactive(seg, NULL, seg->s_base, 909 sptd->spt_amp->size, ppa, 910 S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim); 911 *ppp = NULL; 912 return (ENOTSUP); 913 } 914 if ((szc = ppa[an_idx]->p_szc) != 0) { 915 npgs = page_get_pagecnt(szc); 916 an_idx = P2ROUNDUP(an_idx + 1, npgs); 917 } else { 918 an_idx++; 919 } 920 } 921 /* 922 * Since we cache the entire DISM segment, we want to 923 * set ppp to point to the first slot that corresponds 924 * to the requested addr, i.e. pg_idx. 
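		 * For example, a request that starts three pages into the
		 * segment has pg_idx == 3, so *ppp points at spt_ppa[3],
		 * even though the cached array itself spans
		 * btopr(sptd->spt_amp->size) slots.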
925 */ 926 *ppp = &(sptd->spt_ppa[pg_idx]); 927 return (0); 928 } 929 930 mutex_enter(&sptd->spt_lock); 931 /* 932 * try to find pages in segment page cache with mutex 933 */ 934 pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size, 935 S_WRITE, SEGP_FORCE_WIRED); 936 if (pplist != NULL) { 937 ASSERT(sptd->spt_ppa != NULL); 938 ASSERT(sptd->spt_ppa == pplist); 939 ppa = sptd->spt_ppa; 940 for (an_idx = pg_idx; an_idx < pg_idx + npages; ) { 941 if (ppa[an_idx] == NULL) { 942 mutex_exit(&sptd->spt_lock); 943 seg_pinactive(seg, NULL, seg->s_base, 944 sptd->spt_amp->size, ppa, 945 S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim); 946 *ppp = NULL; 947 return (ENOTSUP); 948 } 949 if ((szc = ppa[an_idx]->p_szc) != 0) { 950 npgs = page_get_pagecnt(szc); 951 an_idx = P2ROUNDUP(an_idx + 1, npgs); 952 } else { 953 an_idx++; 954 } 955 } 956 /* 957 * Since we cache the entire DISM segment, we want to 958 * set ppp to point to the first slot that corresponds 959 * to the requested addr, i.e. pg_idx. 960 */ 961 mutex_exit(&sptd->spt_lock); 962 *ppp = &(sptd->spt_ppa[pg_idx]); 963 return (0); 964 } 965 if (seg_pinsert_check(seg, NULL, seg->s_base, sptd->spt_amp->size, 966 SEGP_FORCE_WIRED) == SEGP_FAIL) { 967 mutex_exit(&sptd->spt_lock); 968 *ppp = NULL; 969 return (ENOTSUP); 970 } 971 972 /* 973 * No need to worry about protections because DISM pages are always rw. 974 */ 975 pl = pplist = NULL; 976 amp = sptd->spt_amp; 977 978 /* 979 * Do we need to build the ppa array? 980 */ 981 if (sptd->spt_ppa == NULL) { 982 pgcnt_t lpg_cnt = 0; 983 984 pl_built = 1; 985 tot_npages = btopr(sptd->spt_amp->size); 986 987 ASSERT(sptd->spt_pcachecnt == 0); 988 pplist = kmem_zalloc(sizeof (page_t *) * tot_npages, KM_SLEEP); 989 pl = pplist; 990 991 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 992 for (an_idx = 0; an_idx < tot_npages; ) { 993 ap = anon_get_ptr(amp->ahp, an_idx); 994 /* 995 * Cache only mlocked pages. For large pages 996 * if one (constituent) page is mlocked 997 * all pages for that large page 998 * are cached also. This is for quick 999 * lookups of ppa array; 1000 */ 1001 if ((ap != NULL) && (lpg_cnt != 0 || 1002 (sptd->spt_ppa_lckcnt[an_idx] != 0))) { 1003 1004 swap_xlate(ap, &vp, &off); 1005 pp = page_lookup(vp, off, SE_SHARED); 1006 ASSERT(pp != NULL); 1007 if (lpg_cnt == 0) { 1008 lpg_cnt++; 1009 /* 1010 * For a small page, we are done -- 1011 * lpg_count is reset to 0 below. 1012 * 1013 * For a large page, we are guaranteed 1014 * to find the anon structures of all 1015 * constituent pages and a non-zero 1016 * lpg_cnt ensures that we don't test 1017 * for mlock for these. We are done 1018 * when lpg_count reaches (npgs + 1). 1019 * If we are not the first constituent 1020 * page, restart at the first one. 
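					 * As a concrete example, with
					 * npgs == 8, first hitting
					 * an_idx 11 restarts the scan at
					 * P2ALIGN(11, 8) == 8, the first
					 * constituent page of that
					 * large page.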
1021 */ 1022 npgs = page_get_pagecnt(pp->p_szc); 1023 if (!IS_P2ALIGNED(an_idx, npgs)) { 1024 an_idx = P2ALIGN(an_idx, npgs); 1025 page_unlock(pp); 1026 continue; 1027 } 1028 } 1029 if (++lpg_cnt > npgs) 1030 lpg_cnt = 0; 1031 1032 /* 1033 * availrmem is decremented only 1034 * for unlocked pages 1035 */ 1036 if (sptd->spt_ppa_lckcnt[an_idx] == 0) 1037 claim_availrmem++; 1038 pplist[an_idx] = pp; 1039 } 1040 an_idx++; 1041 } 1042 ANON_LOCK_EXIT(&->a_rwlock); 1043 1044 if (claim_availrmem) { 1045 mutex_enter(&freemem_lock); 1046 if (availrmem < tune.t_minarmem + claim_availrmem) { 1047 mutex_exit(&freemem_lock); 1048 ret = ENOTSUP; 1049 claim_availrmem = 0; 1050 goto insert_fail; 1051 } else { 1052 availrmem -= claim_availrmem; 1053 } 1054 mutex_exit(&freemem_lock); 1055 } 1056 1057 sptd->spt_ppa = pl; 1058 } else { 1059 /* 1060 * We already have a valid ppa[]. 1061 */ 1062 pl = sptd->spt_ppa; 1063 } 1064 1065 ASSERT(pl != NULL); 1066 1067 ret = seg_pinsert(seg, NULL, seg->s_base, sptd->spt_amp->size, 1068 sptd->spt_amp->size, pl, S_WRITE, SEGP_FORCE_WIRED, 1069 segspt_reclaim); 1070 if (ret == SEGP_FAIL) { 1071 /* 1072 * seg_pinsert failed. We return 1073 * ENOTSUP, so that the as_pagelock() code will 1074 * then try the slower F_SOFTLOCK path. 1075 */ 1076 if (pl_built) { 1077 /* 1078 * No one else has referenced the ppa[]. 1079 * We created it and we need to destroy it. 1080 */ 1081 sptd->spt_ppa = NULL; 1082 } 1083 ret = ENOTSUP; 1084 goto insert_fail; 1085 } 1086 1087 /* 1088 * In either case, we increment softlockcnt on the 'real' segment. 1089 */ 1090 sptd->spt_pcachecnt++; 1091 atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), 1); 1092 1093 ppa = sptd->spt_ppa; 1094 for (an_idx = pg_idx; an_idx < pg_idx + npages; ) { 1095 if (ppa[an_idx] == NULL) { 1096 mutex_exit(&sptd->spt_lock); 1097 seg_pinactive(seg, NULL, seg->s_base, 1098 sptd->spt_amp->size, 1099 pl, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim); 1100 *ppp = NULL; 1101 return (ENOTSUP); 1102 } 1103 if ((szc = ppa[an_idx]->p_szc) != 0) { 1104 npgs = page_get_pagecnt(szc); 1105 an_idx = P2ROUNDUP(an_idx + 1, npgs); 1106 } else { 1107 an_idx++; 1108 } 1109 } 1110 /* 1111 * We can now drop the sptd->spt_lock since the ppa[] 1112 * exists and he have incremented pacachecnt. 1113 */ 1114 mutex_exit(&sptd->spt_lock); 1115 1116 /* 1117 * Since we cache the entire segment, we want to 1118 * set ppp to point to the first slot that corresponds 1119 * to the requested addr, i.e. pg_idx. 1120 */ 1121 *ppp = &(sptd->spt_ppa[pg_idx]); 1122 return (0); 1123 1124 insert_fail: 1125 /* 1126 * We will only reach this code if we tried and failed. 1127 * 1128 * And we can drop the lock on the dummy seg, once we've failed 1129 * to set up a new ppa[]. 1130 */ 1131 mutex_exit(&sptd->spt_lock); 1132 1133 if (pl_built) { 1134 if (claim_availrmem) { 1135 mutex_enter(&freemem_lock); 1136 availrmem += claim_availrmem; 1137 mutex_exit(&freemem_lock); 1138 } 1139 1140 /* 1141 * We created pl and we need to destroy it. 
1142 */ 1143 pplist = pl; 1144 for (an_idx = 0; an_idx < tot_npages; an_idx++) { 1145 if (pplist[an_idx] != NULL) 1146 page_unlock(pplist[an_idx]); 1147 } 1148 kmem_free(pl, sizeof (page_t *) * tot_npages); 1149 } 1150 1151 if (shmd->shm_softlockcnt <= 0) { 1152 if (AS_ISUNMAPWAIT(seg->s_as)) { 1153 mutex_enter(&seg->s_as->a_contents); 1154 if (AS_ISUNMAPWAIT(seg->s_as)) { 1155 AS_CLRUNMAPWAIT(seg->s_as); 1156 cv_broadcast(&seg->s_as->a_cv); 1157 } 1158 mutex_exit(&seg->s_as->a_contents); 1159 } 1160 } 1161 *ppp = NULL; 1162 return (ret); 1163 } 1164 1165 1166 1167 /* 1168 * return locked pages over a given range. 1169 * 1170 * We will cache the entire ISM segment and save the pplist for the 1171 * entire segment in the ppa field of the underlying ISM segment structure. 1172 * Later, during a call to segspt_reclaim() we will use this ppa array 1173 * to page_unlock() all of the pages and then we will free this ppa list. 1174 */ 1175 /*ARGSUSED*/ 1176 static int 1177 segspt_shmpagelock(struct seg *seg, caddr_t addr, size_t len, 1178 struct page ***ppp, enum lock_type type, enum seg_rw rw) 1179 { 1180 struct shm_data *shmd = (struct shm_data *)seg->s_data; 1181 struct seg *sptseg = shmd->shm_sptseg; 1182 struct spt_data *sptd = sptseg->s_data; 1183 pgcnt_t np, page_index, npages; 1184 caddr_t a, spt_base; 1185 struct page **pplist, **pl, *pp; 1186 struct anon_map *amp; 1187 ulong_t anon_index; 1188 int ret = ENOTSUP; 1189 uint_t pl_built = 0; 1190 struct anon *ap; 1191 struct vnode *vp; 1192 u_offset_t off; 1193 1194 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 1195 ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK); 1196 1197 1198 /* 1199 * We want to lock/unlock the entire ISM segment. Therefore, 1200 * we will be using the underlying sptseg and it's base address 1201 * and length for the caching arguments. 1202 */ 1203 ASSERT(sptseg); 1204 ASSERT(sptd); 1205 1206 if (sptd->spt_flags & SHM_PAGEABLE) { 1207 return (segspt_dismpagelock(seg, addr, len, ppp, type, rw)); 1208 } 1209 1210 page_index = seg_page(seg, addr); 1211 npages = btopr(len); 1212 1213 /* 1214 * check if the request is larger than number of pages covered 1215 * by amp 1216 */ 1217 if (page_index + npages > btopr(sptd->spt_amp->size)) { 1218 *ppp = NULL; 1219 return (ENOTSUP); 1220 } 1221 1222 if (type == L_PAGEUNLOCK) { 1223 1224 ASSERT(sptd->spt_ppa != NULL); 1225 1226 seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size, 1227 sptd->spt_ppa, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim); 1228 1229 /* 1230 * If someone is blocked while unmapping, we purge 1231 * segment page cache and thus reclaim pplist synchronously 1232 * without waiting for seg_pasync_thread. This speeds up 1233 * unmapping in cases where munmap(2) is called, while 1234 * raw async i/o is still in progress or where a thread 1235 * exits on data fault in a multithreaded application. 1236 */ 1237 if (AS_ISUNMAPWAIT(seg->s_as) && (shmd->shm_softlockcnt > 0)) { 1238 segspt_purge(seg); 1239 } 1240 return (0); 1241 } 1242 1243 /* The L_PAGELOCK case... */ 1244 1245 /* 1246 * First try to find pages in segment page cache, without 1247 * holding the segment lock. 1248 */ 1249 pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size, 1250 S_WRITE, SEGP_FORCE_WIRED); 1251 if (pplist != NULL) { 1252 ASSERT(sptd->spt_ppa == pplist); 1253 ASSERT(sptd->spt_ppa[page_index]); 1254 /* 1255 * Since we cache the entire ISM segment, we want to 1256 * set ppp to point to the first slot that corresponds 1257 * to the requested addr, i.e. 
page_index. 1258 */ 1259 *ppp = &(sptd->spt_ppa[page_index]); 1260 return (0); 1261 } 1262 1263 mutex_enter(&sptd->spt_lock); 1264 1265 /* 1266 * try to find pages in segment page cache 1267 */ 1268 pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size, 1269 S_WRITE, SEGP_FORCE_WIRED); 1270 if (pplist != NULL) { 1271 ASSERT(sptd->spt_ppa == pplist); 1272 /* 1273 * Since we cache the entire segment, we want to 1274 * set ppp to point to the first slot that corresponds 1275 * to the requested addr, i.e. page_index. 1276 */ 1277 mutex_exit(&sptd->spt_lock); 1278 *ppp = &(sptd->spt_ppa[page_index]); 1279 return (0); 1280 } 1281 1282 if (seg_pinsert_check(seg, NULL, seg->s_base, sptd->spt_amp->size, 1283 SEGP_FORCE_WIRED) == SEGP_FAIL) { 1284 mutex_exit(&sptd->spt_lock); 1285 *ppp = NULL; 1286 return (ENOTSUP); 1287 } 1288 1289 /* 1290 * No need to worry about protections because ISM pages 1291 * are always rw. 1292 */ 1293 pl = pplist = NULL; 1294 1295 /* 1296 * Do we need to build the ppa array? 1297 */ 1298 if (sptd->spt_ppa == NULL) { 1299 ASSERT(sptd->spt_ppa == pplist); 1300 1301 spt_base = sptseg->s_base; 1302 pl_built = 1; 1303 1304 /* 1305 * availrmem is decremented once during anon_swap_adjust() 1306 * and is incremented during the anon_unresv(), which is 1307 * called from shm_rm_amp() when the segment is destroyed. 1308 */ 1309 amp = sptd->spt_amp; 1310 ASSERT(amp != NULL); 1311 1312 /* pcachecnt is protected by sptd->spt_lock */ 1313 ASSERT(sptd->spt_pcachecnt == 0); 1314 pplist = kmem_zalloc(sizeof (page_t *) 1315 * btopr(sptd->spt_amp->size), KM_SLEEP); 1316 pl = pplist; 1317 1318 anon_index = seg_page(sptseg, spt_base); 1319 1320 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1321 for (a = spt_base; a < (spt_base + sptd->spt_amp->size); 1322 a += PAGESIZE, anon_index++, pplist++) { 1323 ap = anon_get_ptr(amp->ahp, anon_index); 1324 ASSERT(ap != NULL); 1325 swap_xlate(ap, &vp, &off); 1326 pp = page_lookup(vp, off, SE_SHARED); 1327 ASSERT(pp != NULL); 1328 *pplist = pp; 1329 } 1330 ANON_LOCK_EXIT(&->a_rwlock); 1331 1332 if (a < (spt_base + sptd->spt_amp->size)) { 1333 ret = ENOTSUP; 1334 goto insert_fail; 1335 } 1336 sptd->spt_ppa = pl; 1337 } else { 1338 /* 1339 * We already have a valid ppa[]. 1340 */ 1341 pl = sptd->spt_ppa; 1342 } 1343 1344 ASSERT(pl != NULL); 1345 1346 ret = seg_pinsert(seg, NULL, seg->s_base, sptd->spt_amp->size, 1347 sptd->spt_amp->size, pl, S_WRITE, SEGP_FORCE_WIRED, 1348 segspt_reclaim); 1349 if (ret == SEGP_FAIL) { 1350 /* 1351 * seg_pinsert failed. We return 1352 * ENOTSUP, so that the as_pagelock() code will 1353 * then try the slower F_SOFTLOCK path. 1354 */ 1355 if (pl_built) { 1356 /* 1357 * No one else has referenced the ppa[]. 1358 * We created it and we need to destroy it. 1359 */ 1360 sptd->spt_ppa = NULL; 1361 } 1362 ret = ENOTSUP; 1363 goto insert_fail; 1364 } 1365 1366 /* 1367 * In either case, we increment softlockcnt on the 'real' segment. 1368 */ 1369 sptd->spt_pcachecnt++; 1370 atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), 1); 1371 1372 /* 1373 * We can now drop the sptd->spt_lock since the ppa[] 1374 * exists and he have incremented pacachecnt. 1375 */ 1376 mutex_exit(&sptd->spt_lock); 1377 1378 /* 1379 * Since we cache the entire segment, we want to 1380 * set ppp to point to the first slot that corresponds 1381 * to the requested addr, i.e. page_index. 1382 */ 1383 *ppp = &(sptd->spt_ppa[page_index]); 1384 return (0); 1385 1386 insert_fail: 1387 /* 1388 * We will only reach this code if we tried and failed. 
1389 * 1390 * And we can drop the lock on the dummy seg, once we've failed 1391 * to set up a new ppa[]. 1392 */ 1393 mutex_exit(&sptd->spt_lock); 1394 1395 if (pl_built) { 1396 /* 1397 * We created pl and we need to destroy it. 1398 */ 1399 pplist = pl; 1400 np = (((uintptr_t)(a - spt_base)) >> PAGESHIFT); 1401 while (np) { 1402 page_unlock(*pplist); 1403 np--; 1404 pplist++; 1405 } 1406 kmem_free(pl, sizeof (page_t *) * btopr(sptd->spt_amp->size)); 1407 } 1408 if (shmd->shm_softlockcnt <= 0) { 1409 if (AS_ISUNMAPWAIT(seg->s_as)) { 1410 mutex_enter(&seg->s_as->a_contents); 1411 if (AS_ISUNMAPWAIT(seg->s_as)) { 1412 AS_CLRUNMAPWAIT(seg->s_as); 1413 cv_broadcast(&seg->s_as->a_cv); 1414 } 1415 mutex_exit(&seg->s_as->a_contents); 1416 } 1417 } 1418 *ppp = NULL; 1419 return (ret); 1420 } 1421 1422 /* 1423 * purge any cached pages in the I/O page cache 1424 */ 1425 static void 1426 segspt_purge(struct seg *seg) 1427 { 1428 seg_ppurge(seg, NULL, SEGP_FORCE_WIRED); 1429 } 1430 1431 static int 1432 segspt_reclaim(void *ptag, caddr_t addr, size_t len, struct page **pplist, 1433 enum seg_rw rw, int async) 1434 { 1435 struct seg *seg = (struct seg *)ptag; 1436 struct shm_data *shmd = (struct shm_data *)seg->s_data; 1437 struct seg *sptseg; 1438 struct spt_data *sptd; 1439 pgcnt_t npages, i, free_availrmem = 0; 1440 int done = 0; 1441 1442 #ifdef lint 1443 addr = addr; 1444 #endif 1445 sptseg = shmd->shm_sptseg; 1446 sptd = sptseg->s_data; 1447 npages = (len >> PAGESHIFT); 1448 ASSERT(npages); 1449 ASSERT(sptd->spt_pcachecnt != 0); 1450 ASSERT(sptd->spt_ppa == pplist); 1451 ASSERT(npages == btopr(sptd->spt_amp->size)); 1452 ASSERT(async || AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 1453 1454 /* 1455 * Acquire the lock on the dummy seg and destroy the 1456 * ppa array IF this is the last pcachecnt. 1457 */ 1458 mutex_enter(&sptd->spt_lock); 1459 if (--sptd->spt_pcachecnt == 0) { 1460 for (i = 0; i < npages; i++) { 1461 if (pplist[i] == NULL) { 1462 continue; 1463 } 1464 if (rw == S_WRITE) { 1465 hat_setrefmod(pplist[i]); 1466 } else { 1467 hat_setref(pplist[i]); 1468 } 1469 if ((sptd->spt_flags & SHM_PAGEABLE) && 1470 (sptd->spt_ppa_lckcnt[i] == 0)) 1471 free_availrmem++; 1472 page_unlock(pplist[i]); 1473 } 1474 if ((sptd->spt_flags & SHM_PAGEABLE) && free_availrmem) { 1475 mutex_enter(&freemem_lock); 1476 availrmem += free_availrmem; 1477 mutex_exit(&freemem_lock); 1478 } 1479 /* 1480 * Since we want to cach/uncache the entire ISM segment, 1481 * we will track the pplist in a segspt specific field 1482 * ppa, that is initialized at the time we add an entry to 1483 * the cache. 1484 */ 1485 ASSERT(sptd->spt_pcachecnt == 0); 1486 kmem_free(pplist, sizeof (page_t *) * npages); 1487 sptd->spt_ppa = NULL; 1488 sptd->spt_flags &= ~DISM_PPA_CHANGED; 1489 sptd->spt_gen++; 1490 cv_broadcast(&sptd->spt_cv); 1491 done = 1; 1492 } 1493 mutex_exit(&sptd->spt_lock); 1494 1495 /* 1496 * If we are pcache async thread or called via seg_ppurge_wiredpp() we 1497 * may not hold AS lock (in this case async argument is not 0). This 1498 * means if softlockcnt drops to 0 after the decrement below address 1499 * space may get freed. We can't allow it since after softlock 1500 * derement to 0 we still need to access as structure for possible 1501 * wakeup of unmap waiters. To prevent the disappearance of as we take 1502 * this segment's shm_segfree_syncmtx. segspt_shmfree() also takes 1503 * this mutex as a barrier to make sure this routine completes before 1504 * segment is freed. 
1505 * 1506 * The second complication we have to deal with in async case is a 1507 * possibility of missed wake up of unmap wait thread. When we don't 1508 * hold as lock here we may take a_contents lock before unmap wait 1509 * thread that was first to see softlockcnt was still not 0. As a 1510 * result we'll fail to wake up an unmap wait thread. To avoid this 1511 * race we set nounmapwait flag in as structure if we drop softlockcnt 1512 * to 0 if async is not 0. unmapwait thread 1513 * will not block if this flag is set. 1514 */ 1515 if (async) 1516 mutex_enter(&shmd->shm_segfree_syncmtx); 1517 1518 /* 1519 * Now decrement softlockcnt. 1520 */ 1521 ASSERT(shmd->shm_softlockcnt > 0); 1522 atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), -1); 1523 1524 if (shmd->shm_softlockcnt <= 0) { 1525 if (async || AS_ISUNMAPWAIT(seg->s_as)) { 1526 mutex_enter(&seg->s_as->a_contents); 1527 if (async) 1528 AS_SETNOUNMAPWAIT(seg->s_as); 1529 if (AS_ISUNMAPWAIT(seg->s_as)) { 1530 AS_CLRUNMAPWAIT(seg->s_as); 1531 cv_broadcast(&seg->s_as->a_cv); 1532 } 1533 mutex_exit(&seg->s_as->a_contents); 1534 } 1535 } 1536 1537 if (async) 1538 mutex_exit(&shmd->shm_segfree_syncmtx); 1539 1540 return (done); 1541 } 1542 1543 /* 1544 * Do a F_SOFTUNLOCK call over the range requested. 1545 * The range must have already been F_SOFTLOCK'ed. 1546 * 1547 * The calls to acquire and release the anon map lock mutex were 1548 * removed in order to avoid a deadly embrace during a DR 1549 * memory delete operation. (Eg. DR blocks while waiting for a 1550 * exclusive lock on a page that is being used for kaio; the 1551 * thread that will complete the kaio and call segspt_softunlock 1552 * blocks on the anon map lock; another thread holding the anon 1553 * map lock blocks on another page lock via the segspt_shmfault 1554 * -> page_lookup -> page_lookup_create -> page_lock_es code flow.) 1555 * 1556 * The appropriateness of the removal is based upon the following: 1557 * 1. If we are holding a segment's reader lock and the page is held 1558 * shared, then the corresponding element in anonmap which points to 1559 * anon struct cannot change and there is no need to acquire the 1560 * anonymous map lock. 1561 * 2. Threads in segspt_softunlock have a reader lock on the segment 1562 * and already have the shared page lock, so we are guaranteed that 1563 * the anon map slot cannot change and therefore can call anon_get_ptr() 1564 * without grabbing the anonymous map lock. 1565 * 3. Threads that softlock a shared page break copy-on-write, even if 1566 * its a read. Thus cow faults can be ignored with respect to soft 1567 * unlocking, since the breaking of cow means that the anon slot(s) will 1568 * not be shared. 1569 */ 1570 static void 1571 segspt_softunlock(struct seg *seg, caddr_t sptseg_addr, 1572 size_t len, enum seg_rw rw) 1573 { 1574 struct shm_data *shmd = (struct shm_data *)seg->s_data; 1575 struct seg *sptseg; 1576 struct spt_data *sptd; 1577 page_t *pp; 1578 caddr_t adr; 1579 struct vnode *vp; 1580 u_offset_t offset; 1581 ulong_t anon_index; 1582 struct anon_map *amp; /* XXX - for locknest */ 1583 struct anon *ap = NULL; 1584 pgcnt_t npages; 1585 1586 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 1587 1588 sptseg = shmd->shm_sptseg; 1589 sptd = sptseg->s_data; 1590 1591 /* 1592 * Some platforms assume that ISM mappings are HAT_LOAD_LOCK 1593 * and therefore their pages are SE_SHARED locked 1594 * for the entire life of the segment. 
1595 */ 1596 if ((!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) && 1597 ((sptd->spt_flags & SHM_PAGEABLE) == 0)) { 1598 goto softlock_decrement; 1599 } 1600 1601 /* 1602 * Any thread is free to do a page_find and 1603 * page_unlock() on the pages within this seg. 1604 * 1605 * We are already holding the as->a_lock on the user's 1606 * real segment, but we need to hold the a_lock on the 1607 * underlying dummy as. This is mostly to satisfy the 1608 * underlying HAT layer. 1609 */ 1610 AS_LOCK_ENTER(sptseg->s_as, &sptseg->s_as->a_lock, RW_READER); 1611 hat_unlock(sptseg->s_as->a_hat, sptseg_addr, len); 1612 AS_LOCK_EXIT(sptseg->s_as, &sptseg->s_as->a_lock); 1613 1614 amp = sptd->spt_amp; 1615 ASSERT(amp != NULL); 1616 anon_index = seg_page(sptseg, sptseg_addr); 1617 1618 for (adr = sptseg_addr; adr < sptseg_addr + len; adr += PAGESIZE) { 1619 ap = anon_get_ptr(amp->ahp, anon_index++); 1620 ASSERT(ap != NULL); 1621 swap_xlate(ap, &vp, &offset); 1622 1623 /* 1624 * Use page_find() instead of page_lookup() to 1625 * find the page since we know that it has a 1626 * "shared" lock. 1627 */ 1628 pp = page_find(vp, offset); 1629 ASSERT(ap == anon_get_ptr(amp->ahp, anon_index - 1)); 1630 if (pp == NULL) { 1631 panic("segspt_softunlock: " 1632 "addr %p, ap %p, vp %p, off %llx", 1633 (void *)adr, (void *)ap, (void *)vp, offset); 1634 /*NOTREACHED*/ 1635 } 1636 1637 if (rw == S_WRITE) { 1638 hat_setrefmod(pp); 1639 } else if (rw != S_OTHER) { 1640 hat_setref(pp); 1641 } 1642 page_unlock(pp); 1643 } 1644 1645 softlock_decrement: 1646 npages = btopr(len); 1647 ASSERT(shmd->shm_softlockcnt >= npages); 1648 atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), -npages); 1649 if (shmd->shm_softlockcnt == 0) { 1650 /* 1651 * All SOFTLOCKS are gone. Wakeup any waiting 1652 * unmappers so they can try again to unmap. 1653 * Check for waiters first without the mutex 1654 * held so we don't always grab the mutex on 1655 * softunlocks. 
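		 *
		 * (The flag is re-tested under a_contents before it is
		 * cleared and broadcast, so the unlocked check only saves
		 * us the mutex acquisition when no unmapper is actually
		 * waiting.)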
1656 */ 1657 if (AS_ISUNMAPWAIT(seg->s_as)) { 1658 mutex_enter(&seg->s_as->a_contents); 1659 if (AS_ISUNMAPWAIT(seg->s_as)) { 1660 AS_CLRUNMAPWAIT(seg->s_as); 1661 cv_broadcast(&seg->s_as->a_cv); 1662 } 1663 mutex_exit(&seg->s_as->a_contents); 1664 } 1665 } 1666 } 1667 1668 int 1669 segspt_shmattach(struct seg *seg, caddr_t *argsp) 1670 { 1671 struct shm_data *shmd_arg = (struct shm_data *)argsp; 1672 struct shm_data *shmd; 1673 struct anon_map *shm_amp = shmd_arg->shm_amp; 1674 struct spt_data *sptd; 1675 int error = 0; 1676 1677 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 1678 1679 shmd = kmem_zalloc((sizeof (*shmd)), KM_NOSLEEP); 1680 if (shmd == NULL) 1681 return (ENOMEM); 1682 1683 shmd->shm_sptas = shmd_arg->shm_sptas; 1684 shmd->shm_amp = shm_amp; 1685 shmd->shm_sptseg = shmd_arg->shm_sptseg; 1686 1687 (void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, shm_amp, 0, 1688 NULL, 0, seg->s_size); 1689 1690 mutex_init(&shmd->shm_segfree_syncmtx, NULL, MUTEX_DEFAULT, NULL); 1691 1692 seg->s_data = (void *)shmd; 1693 seg->s_ops = &segspt_shmops; 1694 seg->s_szc = shmd->shm_sptseg->s_szc; 1695 sptd = shmd->shm_sptseg->s_data; 1696 1697 if (sptd->spt_flags & SHM_PAGEABLE) { 1698 if ((shmd->shm_vpage = kmem_zalloc(btopr(shm_amp->size), 1699 KM_NOSLEEP)) == NULL) { 1700 seg->s_data = (void *)NULL; 1701 kmem_free(shmd, (sizeof (*shmd))); 1702 return (ENOMEM); 1703 } 1704 shmd->shm_lckpgs = 0; 1705 if (hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) { 1706 if ((error = hat_share(seg->s_as->a_hat, seg->s_base, 1707 shmd_arg->shm_sptas->a_hat, SEGSPTADDR, 1708 seg->s_size, seg->s_szc)) != 0) { 1709 kmem_free(shmd->shm_vpage, 1710 btopr(shm_amp->size)); 1711 } 1712 } 1713 } else { 1714 error = hat_share(seg->s_as->a_hat, seg->s_base, 1715 shmd_arg->shm_sptas->a_hat, SEGSPTADDR, 1716 seg->s_size, seg->s_szc); 1717 } 1718 if (error) { 1719 seg->s_szc = 0; 1720 seg->s_data = (void *)NULL; 1721 kmem_free(shmd, (sizeof (*shmd))); 1722 } else { 1723 ANON_LOCK_ENTER(&shm_amp->a_rwlock, RW_WRITER); 1724 shm_amp->refcnt++; 1725 ANON_LOCK_EXIT(&shm_amp->a_rwlock); 1726 } 1727 return (error); 1728 } 1729 1730 int 1731 segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize) 1732 { 1733 struct shm_data *shmd = (struct shm_data *)seg->s_data; 1734 int reclaim = 1; 1735 1736 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 1737 retry: 1738 if (shmd->shm_softlockcnt > 0) { 1739 if (reclaim == 1) { 1740 segspt_purge(seg); 1741 reclaim = 0; 1742 goto retry; 1743 } 1744 return (EAGAIN); 1745 } 1746 1747 if (ssize != seg->s_size) { 1748 #ifdef DEBUG 1749 cmn_err(CE_WARN, "Incompatible ssize %lx s_size %lx\n", 1750 ssize, seg->s_size); 1751 #endif 1752 return (EINVAL); 1753 } 1754 1755 (void) segspt_shmlockop(seg, raddr, shmd->shm_amp->size, 0, MC_UNLOCK, 1756 NULL, 0); 1757 hat_unshare(seg->s_as->a_hat, raddr, ssize, seg->s_szc); 1758 1759 seg_free(seg); 1760 1761 return (0); 1762 } 1763 1764 void 1765 segspt_shmfree(struct seg *seg) 1766 { 1767 struct shm_data *shmd = (struct shm_data *)seg->s_data; 1768 struct anon_map *shm_amp = shmd->shm_amp; 1769 1770 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 1771 1772 (void) segspt_shmlockop(seg, seg->s_base, shm_amp->size, 0, 1773 MC_UNLOCK, NULL, 0); 1774 1775 /* 1776 * Need to increment refcnt when attaching 1777 * and decrement when detaching because of dup(). 
1778 */ 1779 ANON_LOCK_ENTER(&shm_amp->a_rwlock, RW_WRITER); 1780 shm_amp->refcnt--; 1781 ANON_LOCK_EXIT(&shm_amp->a_rwlock); 1782 1783 if (shmd->shm_vpage) { /* only for DISM */ 1784 kmem_free(shmd->shm_vpage, btopr(shm_amp->size)); 1785 shmd->shm_vpage = NULL; 1786 } 1787 1788 /* 1789 * Take shm_segfree_syncmtx lock to let segspt_reclaim() finish if it's 1790 * still working with this segment without holding as lock. 1791 */ 1792 ASSERT(shmd->shm_softlockcnt == 0); 1793 mutex_enter(&shmd->shm_segfree_syncmtx); 1794 mutex_destroy(&shmd->shm_segfree_syncmtx); 1795 1796 kmem_free(shmd, sizeof (*shmd)); 1797 } 1798 1799 /*ARGSUSED*/ 1800 int 1801 segspt_shmsetprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) 1802 { 1803 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 1804 1805 /* 1806 * Shared page table is more than shared mapping. 1807 * Individual process sharing page tables can't change prot 1808 * because there is only one set of page tables. 1809 * This will be allowed after private page table is 1810 * supported. 1811 */ 1812 /* need to return correct status error? */ 1813 return (0); 1814 } 1815 1816 1817 faultcode_t 1818 segspt_dismfault(struct hat *hat, struct seg *seg, caddr_t addr, 1819 size_t len, enum fault_type type, enum seg_rw rw) 1820 { 1821 struct shm_data *shmd = (struct shm_data *)seg->s_data; 1822 struct seg *sptseg = shmd->shm_sptseg; 1823 struct as *curspt = shmd->shm_sptas; 1824 struct spt_data *sptd = sptseg->s_data; 1825 pgcnt_t npages; 1826 size_t size; 1827 caddr_t segspt_addr, shm_addr; 1828 page_t **ppa; 1829 int i; 1830 ulong_t an_idx = 0; 1831 int err = 0; 1832 int dyn_ism_unmap = hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0); 1833 size_t pgsz; 1834 pgcnt_t pgcnt; 1835 caddr_t a; 1836 pgcnt_t pidx; 1837 1838 #ifdef lint 1839 hat = hat; 1840 #endif 1841 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 1842 1843 /* 1844 * Because of the way spt is implemented 1845 * the realsize of the segment does not have to be 1846 * equal to the segment size itself. The segment size is 1847 * often in multiples of a page size larger than PAGESIZE. 1848 * The realsize is rounded up to the nearest PAGESIZE 1849 * based on what the user requested. This is a bit of 1850 * ungliness that is historical but not easily fixed 1851 * without re-designing the higher levels of ISM. 1852 */ 1853 ASSERT(addr >= seg->s_base); 1854 if (((addr + len) - seg->s_base) > sptd->spt_realsize) 1855 return (FC_NOMAP); 1856 /* 1857 * For all of the following cases except F_PROT, we need to 1858 * make any necessary adjustments to addr and len 1859 * and get all of the necessary page_t's into an array called ppa[]. 1860 * 1861 * The code in shmat() forces base addr and len of ISM segment 1862 * to be aligned to largest page size supported. Therefore, 1863 * we are able to handle F_SOFTLOCK and F_INVAL calls in "large 1864 * pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK 1865 * in large pagesize chunks, or else we will screw up the HAT 1866 * layer by calling hat_memload_array() with differing page sizes 1867 * over a given virtual range. 1868 */ 1869 pgsz = page_get_pagesize(sptseg->s_szc); 1870 pgcnt = page_get_pagecnt(sptseg->s_szc); 1871 shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz); 1872 size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), pgsz); 1873 npages = btopr(size); 1874 1875 /* 1876 * Now we need to convert from addr in segshm to addr in segspt. 
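	 * For example (hypothetical numbers), a fault at
	 * shm_addr == seg->s_base + 0x400000 gives
	 * an_idx == btop(0x400000) and
	 * segspt_addr == sptseg->s_base + 0x400000; the byte offset is
	 * preserved, it is simply rebased onto the dummy spt segment.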
1877 */ 1878 an_idx = seg_page(seg, shm_addr); 1879 segspt_addr = sptseg->s_base + ptob(an_idx); 1880 1881 ASSERT((segspt_addr + ptob(npages)) <= 1882 (sptseg->s_base + sptd->spt_realsize)); 1883 ASSERT(segspt_addr < (sptseg->s_base + sptseg->s_size)); 1884 1885 switch (type) { 1886 1887 case F_SOFTLOCK: 1888 1889 atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages); 1890 /* 1891 * Fall through to the F_INVAL case to load up the hat layer 1892 * entries with the HAT_LOAD_LOCK flag. 1893 */ 1894 /* FALLTHRU */ 1895 case F_INVAL: 1896 1897 if ((rw == S_EXEC) && !(sptd->spt_prot & PROT_EXEC)) 1898 return (FC_NOMAP); 1899 1900 ppa = kmem_zalloc(npages * sizeof (page_t *), KM_SLEEP); 1901 1902 err = spt_anon_getpages(sptseg, segspt_addr, size, ppa); 1903 if (err != 0) { 1904 if (type == F_SOFTLOCK) { 1905 atomic_add_long((ulong_t *)( 1906 &(shmd->shm_softlockcnt)), -npages); 1907 } 1908 goto dism_err; 1909 } 1910 AS_LOCK_ENTER(sptseg->s_as, &sptseg->s_as->a_lock, RW_READER); 1911 a = segspt_addr; 1912 pidx = 0; 1913 if (type == F_SOFTLOCK) { 1914 1915 /* 1916 * Load up the translation keeping it 1917 * locked and don't unlock the page. 1918 */ 1919 for (; pidx < npages; a += pgsz, pidx += pgcnt) { 1920 hat_memload_array(sptseg->s_as->a_hat, 1921 a, pgsz, &ppa[pidx], sptd->spt_prot, 1922 HAT_LOAD_LOCK | HAT_LOAD_SHARE); 1923 } 1924 } else { 1925 if (hat == seg->s_as->a_hat) { 1926 1927 /* 1928 * Migrate pages marked for migration 1929 */ 1930 if (lgrp_optimizations()) 1931 page_migrate(seg, shm_addr, ppa, 1932 npages); 1933 1934 /* CPU HAT */ 1935 for (; pidx < npages; 1936 a += pgsz, pidx += pgcnt) { 1937 hat_memload_array(sptseg->s_as->a_hat, 1938 a, pgsz, &ppa[pidx], 1939 sptd->spt_prot, 1940 HAT_LOAD_SHARE); 1941 } 1942 } else { 1943 /* XHAT. Pass real address */ 1944 hat_memload_array(hat, shm_addr, 1945 size, ppa, sptd->spt_prot, HAT_LOAD_SHARE); 1946 } 1947 1948 /* 1949 * And now drop the SE_SHARED lock(s). 1950 */ 1951 if (dyn_ism_unmap) { 1952 for (i = 0; i < npages; i++) { 1953 page_unlock(ppa[i]); 1954 } 1955 } 1956 } 1957 1958 if (!dyn_ism_unmap) { 1959 if (hat_share(seg->s_as->a_hat, shm_addr, 1960 curspt->a_hat, segspt_addr, ptob(npages), 1961 seg->s_szc) != 0) { 1962 panic("hat_share err in DISM fault"); 1963 /* NOTREACHED */ 1964 } 1965 if (type == F_INVAL) { 1966 for (i = 0; i < npages; i++) { 1967 page_unlock(ppa[i]); 1968 } 1969 } 1970 } 1971 AS_LOCK_EXIT(sptseg->s_as, &sptseg->s_as->a_lock); 1972 dism_err: 1973 kmem_free(ppa, npages * sizeof (page_t *)); 1974 return (err); 1975 1976 case F_SOFTUNLOCK: 1977 1978 /* 1979 * This is a bit ugly, we pass in the real seg pointer, 1980 * but the segspt_addr is the virtual address within the 1981 * dummy seg. 1982 */ 1983 segspt_softunlock(seg, segspt_addr, size, rw); 1984 return (0); 1985 1986 case F_PROT: 1987 1988 /* 1989 * This takes care of the unusual case where a user 1990 * allocates a stack in shared memory and a register 1991 * window overflow is written to that stack page before 1992 * it is otherwise modified. 1993 * 1994 * We can get away with this because ISM segments are 1995 * always rw. Other than this unusual case, there 1996 * should be no instances of protection violations. 
         */
        return (0);

    default:
#ifdef DEBUG
        panic("segspt_dismfault default type?");
#else
        return (FC_NOMAP);
#endif
    }
}
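
/*
 * Worked example of the alignment arithmetic above (illustrative only,
 * hypothetical numbers): with a 4 MB underlying page size and 4 KB base
 * pages, a fault at addr 0x12345678 for len 0x2000 becomes
 *
 *	shm_addr = P2ALIGN(0x12345678, 0x400000)                = 0x12000000
 *	size     = P2ROUNDUP(0x12347678 - 0x12000000, 0x400000) = 0x400000
 *	npages   = btopr(0x400000)                              = 1024
 *
 * so hat_memload_array() is always driven over ranges that are aligned to,
 * and a multiple of, the large page size.
 */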

faultcode_t
segspt_shmfault(struct hat *hat, struct seg *seg, caddr_t addr,
    size_t len, enum fault_type type, enum seg_rw rw)
{
    struct shm_data *shmd = (struct shm_data *)seg->s_data;
    struct seg *sptseg = shmd->shm_sptseg;
    struct as *curspt = shmd->shm_sptas;
    struct spt_data *sptd = sptseg->s_data;
    pgcnt_t npages;
    size_t size;
    caddr_t sptseg_addr, shm_addr;
    page_t *pp, **ppa;
    int i;
    u_offset_t offset;
    ulong_t anon_index = 0;
    struct vnode *vp;
    struct anon_map *amp;		/* XXX - for locknest */
    struct anon *ap = NULL;
    size_t pgsz;
    pgcnt_t pgcnt;
    caddr_t a;
    pgcnt_t pidx;
    size_t sz;

#ifdef lint
    hat = hat;
#endif

    ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));

    if (sptd->spt_flags & SHM_PAGEABLE) {
        return (segspt_dismfault(hat, seg, addr, len, type, rw));
    }

    /*
     * Because of the way spt is implemented, the realsize of the segment
     * does not have to be equal to the segment size itself.  The segment
     * size is often in multiples of a page size larger than PAGESIZE.
     * The realsize is rounded up to the nearest PAGESIZE based on what
     * the user requested.  This is a bit of ugliness that is historical
     * but not easily fixed without re-designing the higher levels of ISM.
     */
    ASSERT(addr >= seg->s_base);
    if (((addr + len) - seg->s_base) > sptd->spt_realsize)
        return (FC_NOMAP);

    /*
     * For all of the following cases except F_PROT, we need to
     * make any necessary adjustments to addr and len
     * and get all of the necessary page_t's into an array called ppa[].
     *
     * The code in shmat() forces base addr and len of ISM segment
     * to be aligned to largest page size supported. Therefore,
     * we are able to handle F_SOFTLOCK and F_INVAL calls in "large
     * pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK
     * in large pagesize chunks, or else we will screw up the HAT
     * layer by calling hat_memload_array() with differing page sizes
     * over a given virtual range.
     */
    pgsz = page_get_pagesize(sptseg->s_szc);
    pgcnt = page_get_pagecnt(sptseg->s_szc);
    shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz);
    size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), pgsz);
    npages = btopr(size);

    /*
     * Now we need to convert from addr in segshm to addr in segspt.
     */
    anon_index = seg_page(seg, shm_addr);
    sptseg_addr = sptseg->s_base + ptob(anon_index);

    /*
     * And now we may have to adjust npages downward if we have
     * exceeded the realsize of the segment or initial anon
     * allocations.
     */
    if ((sptseg_addr + ptob(npages)) >
        (sptseg->s_base + sptd->spt_realsize))
        size = (sptseg->s_base + sptd->spt_realsize) - sptseg_addr;

    npages = btopr(size);

    ASSERT(sptseg_addr < (sptseg->s_base + sptseg->s_size));
    ASSERT((sptd->spt_flags & SHM_PAGEABLE) == 0);

    switch (type) {

    case F_SOFTLOCK:

        /*
         * availrmem is decremented once during anon_swap_adjust()
         * and is incremented during the anon_unresv(), which is
         * called from shm_rm_amp() when the segment is destroyed.
         */
        atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages);
        /*
         * Some platforms assume that ISM pages are SE_SHARED
         * locked for the entire life of the segment.
         */
        if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0))
            return (0);
        /*
         * Fall through to the F_INVAL case to load up the hat layer
         * entries with the HAT_LOAD_LOCK flag.
         */

        /* FALLTHRU */
    case F_INVAL:

        if ((rw == S_EXEC) && !(sptd->spt_prot & PROT_EXEC))
            return (FC_NOMAP);

        /*
         * Some platforms that do NOT support DYNAMIC_ISM_UNMAP
         * may still rely on this call to hat_share(). That
         * would imply that those hats can fault on a
         * HAT_LOAD_LOCK translation, which would seem
         * contradictory.
         */
        if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
            if (hat_share(seg->s_as->a_hat, seg->s_base,
                curspt->a_hat, sptseg->s_base,
                sptseg->s_size, sptseg->s_szc) != 0) {
                panic("hat_share error in ISM fault");
                /*NOTREACHED*/
            }
            return (0);
        }
        ppa = kmem_zalloc(sizeof (page_t *) * npages, KM_SLEEP);

        /*
         * I see no need to lock the real seg here, because all of
         * our work will be on the underlying dummy seg.
         *
         * sptseg_addr and npages now account for large pages.
         */
        amp = sptd->spt_amp;
        ASSERT(amp != NULL);
        anon_index = seg_page(sptseg, sptseg_addr);

        ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
        for (i = 0; i < npages; i++) {
            ap = anon_get_ptr(amp->ahp, anon_index++);
            ASSERT(ap != NULL);
            swap_xlate(ap, &vp, &offset);
            pp = page_lookup(vp, offset, SE_SHARED);
            ASSERT(pp != NULL);
            ppa[i] = pp;
        }
        ANON_LOCK_EXIT(&amp->a_rwlock);
        ASSERT(i == npages);

        /*
         * We are already holding the as->a_lock on the user's
         * real segment, but we need to hold the a_lock on the
         * underlying dummy as. This is mostly to satisfy the
         * underlying HAT layer.
         */
        AS_LOCK_ENTER(sptseg->s_as, &sptseg->s_as->a_lock, RW_READER);
        a = sptseg_addr;
        pidx = 0;
        if (type == F_SOFTLOCK) {
            /*
             * Load up the translation keeping it
             * locked and don't unlock the page.
             */
            for (; pidx < npages; a += pgsz, pidx += pgcnt) {
                sz = MIN(pgsz, ptob(npages - pidx));
                hat_memload_array(sptseg->s_as->a_hat, a,
                    sz, &ppa[pidx], sptd->spt_prot,
                    HAT_LOAD_LOCK | HAT_LOAD_SHARE);
            }
        } else {
            if (hat == seg->s_as->a_hat) {

                /*
                 * Migrate pages marked for migration.
                 */
                if (lgrp_optimizations())
                    page_migrate(seg, shm_addr, ppa,
                        npages);

                /* CPU HAT */
                for (; pidx < npages;
                    a += pgsz, pidx += pgcnt) {
                    sz = MIN(pgsz, ptob(npages - pidx));
                    hat_memload_array(sptseg->s_as->a_hat,
                        a, sz, &ppa[pidx],
                        sptd->spt_prot, HAT_LOAD_SHARE);
                }
            } else {
                /* XHAT. Pass real address */
                hat_memload_array(hat, shm_addr,
                    ptob(npages), ppa, sptd->spt_prot,
                    HAT_LOAD_SHARE);
            }

            /*
             * And now drop the SE_SHARED lock(s).
             */
            for (i = 0; i < npages; i++)
                page_unlock(ppa[i]);
        }
        AS_LOCK_EXIT(sptseg->s_as, &sptseg->s_as->a_lock);

        kmem_free(ppa, sizeof (page_t *) * npages);
        return (0);

    case F_SOFTUNLOCK:

        /*
         * This is a bit ugly: we pass in the real seg pointer,
         * but the sptseg_addr is the virtual address within the
         * dummy seg.
         */
        segspt_softunlock(seg, sptseg_addr, ptob(npages), rw);
        return (0);

    case F_PROT:

        /*
         * This takes care of the unusual case where a user
         * allocates a stack in shared memory and a register
         * window overflow is written to that stack page before
         * it is otherwise modified.
         *
         * We can get away with this because ISM segments are
         * always rw. Other than this unusual case, there
         * should be no instances of protection violations.
         */
        return (0);

    default:
#ifdef DEBUG
        cmn_err(CE_WARN, "segspt_shmfault default type?");
#endif
        return (FC_NOMAP);
    }
}
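
/*
 * Sketch of the softlock accounting used by the two fault routines above
 * (descriptive only; segspt_softunlock() is defined earlier in this file):
 *
 *	F_SOFTLOCK:	atomic_add_long(&shmd->shm_softlockcnt, npages);
 *			hat_memload_array(..., HAT_LOAD_LOCK | HAT_LOAD_SHARE);
 *	F_SOFTUNLOCK:	segspt_softunlock() unlocks the translations and
 *			drops shm_softlockcnt by the same amount, waking
 *			AS_UNMAPWAIT waiters when it reaches zero.
 *
 * segspt_shmfree() asserts that the count is zero and synchronizes on
 * shm_segfree_syncmtx, so an outstanding softlock keeps the segment's
 * shm_data alive until the matching unlock arrives.
 */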

/*ARGSUSED*/
static faultcode_t
segspt_shmfaulta(struct seg *seg, caddr_t addr)
{
    return (0);
}

/*ARGSUSED*/
static int
segspt_shmkluster(struct seg *seg, caddr_t addr, ssize_t delta)
{
    return (0);
}

/*ARGSUSED*/
static size_t
segspt_shmswapout(struct seg *seg)
{
    return (0);
}

/*
 * Duplicate the shared page tables.
 */
int
segspt_shmdup(struct seg *seg, struct seg *newseg)
{
    struct shm_data *shmd = (struct shm_data *)seg->s_data;
    struct anon_map *amp = shmd->shm_amp;
    struct shm_data *shmd_new;
    struct seg *spt_seg = shmd->shm_sptseg;
    struct spt_data *sptd = spt_seg->s_data;
    int error = 0;

    ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));

    shmd_new = kmem_zalloc((sizeof (*shmd_new)), KM_SLEEP);
    newseg->s_data = (void *)shmd_new;
    shmd_new->shm_sptas = shmd->shm_sptas;
    shmd_new->shm_amp = amp;
    shmd_new->shm_sptseg = shmd->shm_sptseg;
    newseg->s_ops = &segspt_shmops;
    newseg->s_szc = seg->s_szc;
    ASSERT(seg->s_szc == shmd->shm_sptseg->s_szc);

    ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
    amp->refcnt++;
    ANON_LOCK_EXIT(&amp->a_rwlock);

    if (sptd->spt_flags & SHM_PAGEABLE) {
        shmd_new->shm_vpage = kmem_zalloc(btopr(amp->size), KM_SLEEP);
        shmd_new->shm_lckpgs = 0;
        if (hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
            if ((error = hat_share(newseg->s_as->a_hat,
                newseg->s_base, shmd->shm_sptas->a_hat, SEGSPTADDR,
                seg->s_size, seg->s_szc)) != 0) {
                kmem_free(shmd_new->shm_vpage,
                    btopr(amp->size));
            }
        }
        return (error);
    } else {
        return (hat_share(newseg->s_as->a_hat, newseg->s_base,
            shmd->shm_sptas->a_hat, SEGSPTADDR, seg->s_size,
            seg->s_szc));
    }
}

/*ARGSUSED*/
int
segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size, uint_t prot)
{
    struct shm_data *shmd = (struct shm_data *)seg->s_data;
    struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;

    ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));

    /*
     * ISM segment is always rw.
     */
    return (((sptd->spt_prot & prot) != prot) ? EACCES : 0);
}

/*
 * Return an array of locked large pages; for empty slots, allocate
 * private zero-filled anon pages.
 */
static int
spt_anon_getpages(
    struct seg *sptseg,
    caddr_t sptaddr,
    size_t len,
    page_t *ppa[])
{
    struct spt_data *sptd = sptseg->s_data;
    struct anon_map *amp = sptd->spt_amp;
    enum seg_rw rw = sptd->spt_prot;
    uint_t szc = sptseg->s_szc;
    size_t pg_sz, share_sz = page_get_pagesize(szc);
    pgcnt_t lp_npgs;
    caddr_t lp_addr, e_sptaddr;
    uint_t vpprot, ppa_szc = 0;
    struct vpage *vpage = NULL;
    ulong_t j, ppa_idx;
    int err, ierr = 0;
    pgcnt_t an_idx;
    anon_sync_obj_t cookie;
    int anon_locked = 0;
    pgcnt_t amp_pgs;

    ASSERT(IS_P2ALIGNED(sptaddr, share_sz) && IS_P2ALIGNED(len, share_sz));
    ASSERT(len != 0);

    pg_sz = share_sz;
    lp_npgs = btop(pg_sz);
    lp_addr = sptaddr;
    e_sptaddr = sptaddr + len;
    an_idx = seg_page(sptseg, sptaddr);
    ppa_idx = 0;

    ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);

    amp_pgs = page_get_pagecnt(amp->a_szc);

    /*CONSTCOND*/
    while (1) {
        for (; lp_addr < e_sptaddr;
            an_idx += lp_npgs, lp_addr += pg_sz, ppa_idx += lp_npgs) {

            /*
             * If we're currently locked, and we get to a new
             * page, unlock our current anon chunk.
             */
            if (anon_locked && P2PHASE(an_idx, amp_pgs) == 0) {
                anon_array_exit(&cookie);
                anon_locked = 0;
            }
            if (!anon_locked) {
                anon_array_enter(amp, an_idx, &cookie);
                anon_locked = 1;
            }
            ppa_szc = (uint_t)-1;
            ierr = anon_map_getpages(amp, an_idx, szc, sptseg,
                lp_addr, sptd->spt_prot, &vpprot, &ppa[ppa_idx],
                &ppa_szc, vpage, rw, 0, segvn_anypgsz, 0, kcred);

            if (ierr != 0) {
                if (ierr > 0) {
                    err = FC_MAKE_ERR(ierr);
                    goto lpgs_err;
                }
                break;
            }
        }
        if (lp_addr == e_sptaddr) {
            break;
        }
        ASSERT(lp_addr < e_sptaddr);

        /*
         * ierr == -1 means we failed to allocate a large page,
         * so do a size-down operation.
         *
         * ierr == -2 means some other process that privately shares
         * pages with this process has allocated a larger page and we
         * need to retry with larger pages. So do a size-up
         * operation. This relies on the fact that large pages are
         * never partially shared, i.e. if we share any constituent
         * page of a large page with another process we must share the
         * entire large page. Note this cannot happen for SOFTLOCK
         * case, unless current address (lpaddr) is at the beginning
         * of the next page size boundary because the other process
         * couldn't have relocated locked pages.
         */
        ASSERT(ierr == -1 || ierr == -2);
        if (segvn_anypgsz) {
            ASSERT(ierr == -2 || szc != 0);
            ASSERT(ierr == -1 || szc < sptseg->s_szc);
            szc = (ierr == -1) ? szc - 1 : szc + 1;
        } else {
            /*
             * For faults and segvn_anypgsz == 0
             * we need to be careful not to loop forever
             * if existing page is found with szc other
             * than 0 or seg->s_szc. This could be due
             * to page relocations on behalf of DR or
             * more likely large page creation. For this
             * case simply re-size to existing page's szc
             * if returned by anon_map_getpages().
             */
            if (ppa_szc == (uint_t)-1) {
                szc = (ierr == -1) ? 0 : sptseg->s_szc;
            } else {
                ASSERT(ppa_szc <= sptseg->s_szc);
                ASSERT(ierr == -2 || ppa_szc < szc);
                ASSERT(ierr == -1 || ppa_szc > szc);
                szc = ppa_szc;
            }
        }
        pg_sz = page_get_pagesize(szc);
        lp_npgs = btop(pg_sz);
        ASSERT(IS_P2ALIGNED(lp_addr, pg_sz));
    }
    if (anon_locked) {
        anon_array_exit(&cookie);
    }
    ANON_LOCK_EXIT(&amp->a_rwlock);
    return (0);

lpgs_err:
    if (anon_locked) {
        anon_array_exit(&cookie);
    }
    ANON_LOCK_EXIT(&amp->a_rwlock);
    for (j = 0; j < ppa_idx; j++)
        page_unlock(ppa[j]);
    return (err);
}
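
/*
 * The size-up/size-down retry policy above can be summarized by the
 * following sketch (a hypothetical helper, not used by the code; ierr is
 * the -1/-2 value returned by anon_map_getpages()):
 *
 *	static uint_t
 *	spt_next_szc(int ierr, uint_t szc, uint_t seg_szc, uint_t ppa_szc)
 *	{
 *		if (segvn_anypgsz)
 *			return ((ierr == -1) ? szc - 1 : szc + 1);
 *		if (ppa_szc == (uint_t)-1)	// no existing page was found
 *			return ((ierr == -1) ? 0 : seg_szc);
 *		return (ppa_szc);	// re-size to the existing page's szc
 *	}
 */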

/*
 * count the number of bytes in a set of spt pages that are currently not
 * locked
 */
static rctl_qty_t
spt_unlockedbytes(pgcnt_t npages, page_t **ppa)
{
    ulong_t i;
    rctl_qty_t unlocked = 0;

    for (i = 0; i < npages; i++) {
        if (ppa[i]->p_lckcnt == 0)
            unlocked += PAGESIZE;
    }
    return (unlocked);
}

extern u_longlong_t randtick(void);
/* number of locks to reserve/skip by spt_lockpages() and spt_unlockpages() */
#define	NLCK	(NCPU_P2)
/* Random number with a range [0, n-1], n must be power of two */
#define	RAND_P2(n)	\
	((((long)curthread >> PTR24_LSB) ^ (long)randtick()) & ((n) - 1))
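
/*
 * Note on the macros above (illustrative, hypothetical values): for a
 * power-of-two n, RAND_P2(n) mixes the current thread pointer with
 * randtick() and masks with (n - 1), yielding a value in [0, n-1].
 * spt_lockpages() and spt_unlockpages() below use it to randomize their
 * batch size, e.g. with NCPU_P2 == 64:
 *
 *	nlck = NLCK + RAND_P2(NLCK);	// 64..127 pages per batch
 *
 * so concurrent threads refresh their availrmem/pages_locked reservations
 * at different points and do not contend on freemem_lock in lock step.
 */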

int
spt_lockpages(struct seg *seg, pgcnt_t anon_index, pgcnt_t npages,
    page_t **ppa, ulong_t *lockmap, size_t pos,
    rctl_qty_t *locked)
{
    struct shm_data *shmd = seg->s_data;
    struct spt_data *sptd = shmd->shm_sptseg->s_data;
    ulong_t i;
    int kernel;
    pgcnt_t nlck = 0;
    int rv = 0;
    int use_reserved = 1;

    /* return the number of bytes actually locked */
    *locked = 0;

    /*
     * To avoid contention on freemem_lock, availrmem and pages_locked
     * global counters are updated only every nlck locked pages instead of
     * every time. Reserve nlck locks up front and deduct from this
     * reservation for each page that requires a lock. When the reservation
     * is consumed, reserve again. nlck is randomized, so the competing
     * threads do not fall into a cyclic lock contention pattern. When
     * memory is low, the lock ahead is disabled, and instead page_pp_lock()
     * is used to lock pages.
     */
    for (i = 0; i < npages; anon_index++, pos++, i++) {
        if (nlck == 0 && use_reserved == 1) {
            nlck = NLCK + RAND_P2(NLCK);
            /* if fewer loops left, decrease nlck */
            nlck = MIN(nlck, npages - i);
            /*
             * Reserve nlck locks up front and deduct from this
             * reservation for each page that requires a lock. When
             * the reservation is consumed, reserve again.
             */
            mutex_enter(&freemem_lock);
            if ((availrmem - nlck) < pages_pp_maximum) {
                /* Do not do advance memory reserves */
                use_reserved = 0;
            } else {
                availrmem -= nlck;
                pages_locked += nlck;
            }
            mutex_exit(&freemem_lock);
        }
        if (!(shmd->shm_vpage[anon_index] & DISM_PG_LOCKED)) {
            if (sptd->spt_ppa_lckcnt[anon_index] <
                (ushort_t)DISM_LOCK_MAX) {
                if (++sptd->spt_ppa_lckcnt[anon_index] ==
                    (ushort_t)DISM_LOCK_MAX) {
                    cmn_err(CE_WARN,
                        "DISM page lock limit "
                        "reached on DISM offset 0x%lx\n",
                        anon_index << PAGESHIFT);
                }
                kernel = (sptd->spt_ppa &&
                    sptd->spt_ppa[anon_index]);
                if (!page_pp_lock(ppa[i], 0, kernel ||
                    use_reserved)) {
                    sptd->spt_ppa_lckcnt[anon_index]--;
                    rv = EAGAIN;
                    break;
                }
                /* if this is a newly locked page, count it */
                if (ppa[i]->p_lckcnt == 1) {
                    if (kernel == 0 && use_reserved == 1)
                        nlck--;
                    *locked += PAGESIZE;
                }
                shmd->shm_lckpgs++;
                shmd->shm_vpage[anon_index] |= DISM_PG_LOCKED;
                if (lockmap != NULL)
                    BT_SET(lockmap, pos);
            }
        }
    }
    /* Return unused lock reservation */
    if (nlck != 0 && use_reserved == 1) {
        mutex_enter(&freemem_lock);
        availrmem += nlck;
        pages_locked -= nlck;
        mutex_exit(&freemem_lock);
    }

    return (rv);
}

int
spt_unlockpages(struct seg *seg, pgcnt_t anon_index, pgcnt_t npages,
    rctl_qty_t *unlocked)
{
    struct shm_data *shmd = seg->s_data;
    struct spt_data *sptd = shmd->shm_sptseg->s_data;
    struct anon_map *amp = sptd->spt_amp;
    struct anon *ap;
    struct vnode *vp;
    u_offset_t off;
    struct page *pp;
    int kernel;
    anon_sync_obj_t cookie;
    ulong_t i;
    pgcnt_t nlck = 0;
    pgcnt_t nlck_limit = NLCK;

    ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
    for (i = 0; i < npages; i++, anon_index++) {
        if (shmd->shm_vpage[anon_index] & DISM_PG_LOCKED) {
            anon_array_enter(amp, anon_index, &cookie);
            ap = anon_get_ptr(amp->ahp, anon_index);
            ASSERT(ap);

            swap_xlate(ap, &vp, &off);
            anon_array_exit(&cookie);
            pp = page_lookup(vp, off, SE_SHARED);
            ASSERT(pp);
            /*
             * availrmem is decremented only for pages which are not
             * in the seg pcache; for pages in the seg pcache,
             * availrmem was decremented in _dismpagelock().
             */
            kernel = (sptd->spt_ppa && sptd->spt_ppa[anon_index]);
            ASSERT(pp->p_lckcnt > 0);

            /*
             * Unlock the page but do not change availrmem; we do
             * that ourselves every nlck loops.
             */
            page_pp_unlock(pp, 0, 1);
            if (pp->p_lckcnt == 0) {
                if (kernel == 0)
                    nlck++;
                *unlocked += PAGESIZE;
            }
            page_unlock(pp);
            shmd->shm_vpage[anon_index] &= ~DISM_PG_LOCKED;
            sptd->spt_ppa_lckcnt[anon_index]--;
            shmd->shm_lckpgs--;
        }

        /*
         * To reduce freemem_lock contention, do not update availrmem
         * until at least NLCK pages have been unlocked.
         * 1. No need to update if nlck is zero.
         * 2. Always update on the last iteration.
         */
        if (nlck > 0 && (nlck == nlck_limit || i == npages - 1)) {
            mutex_enter(&freemem_lock);
            availrmem += nlck;
            pages_locked -= nlck;
            mutex_exit(&freemem_lock);
            nlck = 0;
            nlck_limit = NLCK + RAND_P2(NLCK);
        }
    }
    ANON_LOCK_EXIT(&amp->a_rwlock);

    return (0);
}

/*ARGSUSED*/
static int
segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len,
    int attr, int op, ulong_t *lockmap, size_t pos)
{
    struct shm_data *shmd = seg->s_data;
    struct seg *sptseg = shmd->shm_sptseg;
    struct spt_data *sptd = sptseg->s_data;
    struct kshmid *sp = sptd->spt_amp->a_sp;
    pgcnt_t npages, a_npages;
    page_t **ppa;
    pgcnt_t an_idx, a_an_idx, ppa_idx;
    caddr_t spt_addr, a_addr;	/* spt and aligned address */
    size_t a_len;			/* aligned len */
    size_t share_sz;
    ulong_t i;
    int sts = 0;
    rctl_qty_t unlocked = 0;
    rctl_qty_t locked = 0;
    struct proc *p = curproc;
    kproject_t *proj;

    ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
    ASSERT(sp != NULL);

    if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
        return (0);
    }

    addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
    an_idx = seg_page(seg, addr);
    npages = btopr(len);

    if (an_idx + npages > btopr(shmd->shm_amp->size)) {
        return (ENOMEM);
    }

    /*
     * A shm's project never changes, so no lock needed.
     * The shm has a hold on the project, so it will not go away.
     * Since we have a mapping to shm within this zone, we know
     * that the zone will not go away.
     */
    proj = sp->shm_perm.ipc_proj;

    if (op == MC_LOCK) {

        /*
         * Need to align the addr and size request if they are not
         * aligned, so we can always allocate large page(s); however,
         * we only lock what was requested in the initial request.
         */
        share_sz = page_get_pagesize(sptseg->s_szc);
        a_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_sz);
        a_len = P2ROUNDUP((uintptr_t)(((addr + len) - a_addr)),
            share_sz);
        a_npages = btop(a_len);
        a_an_idx = seg_page(seg, a_addr);
        spt_addr = sptseg->s_base + ptob(a_an_idx);
        ppa_idx = an_idx - a_an_idx;

        if ((ppa = kmem_zalloc(((sizeof (page_t *)) * a_npages),
            KM_NOSLEEP)) == NULL) {
            return (ENOMEM);
        }

        /*
         * Don't cache any new pages for IO and
         * flush any cached pages.
         */
        mutex_enter(&sptd->spt_lock);
        if (sptd->spt_ppa != NULL)
            sptd->spt_flags |= DISM_PPA_CHANGED;

        sts = spt_anon_getpages(sptseg, spt_addr, a_len, ppa);
        if (sts != 0) {
            mutex_exit(&sptd->spt_lock);
            kmem_free(ppa, ((sizeof (page_t *)) * a_npages));
            return (sts);
        }

        mutex_enter(&sp->shm_mlock);
        /* enforce locked memory rctl */
        unlocked = spt_unlockedbytes(npages, &ppa[ppa_idx]);

        mutex_enter(&p->p_lock);
        if (rctl_incr_locked_mem(p, proj, unlocked, 0)) {
            mutex_exit(&p->p_lock);
            sts = EAGAIN;
        } else {
            mutex_exit(&p->p_lock);
            sts = spt_lockpages(seg, an_idx, npages,
                &ppa[ppa_idx], lockmap, pos, &locked);

            /*
             * correct locked count if not all pages could be
             * locked
             */
            if ((unlocked - locked) > 0) {
                rctl_decr_locked_mem(NULL, proj,
                    (unlocked - locked), 0);
            }
        }
        /*
         * unlock pages
         */
        for (i = 0; i < a_npages; i++)
            page_unlock(ppa[i]);
        if (sptd->spt_ppa != NULL)
            sptd->spt_flags |= DISM_PPA_CHANGED;
        mutex_exit(&sp->shm_mlock);
        mutex_exit(&sptd->spt_lock);

        kmem_free(ppa, ((sizeof (page_t *)) * a_npages));

    } else if (op == MC_UNLOCK) {	/* unlock */
        page_t **ppa;

        mutex_enter(&sptd->spt_lock);
        if (shmd->shm_lckpgs == 0) {
            mutex_exit(&sptd->spt_lock);
            return (0);
        }
        /*
         * Don't cache new IO pages.
         */
        if (sptd->spt_ppa != NULL)
            sptd->spt_flags |= DISM_PPA_CHANGED;

        mutex_enter(&sp->shm_mlock);
        sts = spt_unlockpages(seg, an_idx, npages, &unlocked);
        if ((ppa = sptd->spt_ppa) != NULL)
            sptd->spt_flags |= DISM_PPA_CHANGED;
        mutex_exit(&sptd->spt_lock);

        rctl_decr_locked_mem(NULL, proj, unlocked, 0);
        mutex_exit(&sp->shm_mlock);

        if (ppa != NULL)
            seg_ppurge_wiredpp(ppa);
    }
    return (sts);
}
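
/*
 * Sketch of the MC_LOCK accounting in segspt_shmlockop() above (descriptive
 * only): the locked-memory resource control is charged up front for the
 * bytes that are not yet locked and refunded for whatever spt_lockpages()
 * could not lock:
 *
 *	unlocked = spt_unlockedbytes(npages, &ppa[ppa_idx]);
 *	if (rctl_incr_locked_mem(p, proj, unlocked, 0) != 0)
 *		sts = EAGAIN;			// over the rctl limit
 *	else {
 *		(void) spt_lockpages(..., &locked);
 *		if (unlocked > locked)		// partial success
 *			rctl_decr_locked_mem(NULL, proj,
 *			    unlocked - locked, 0);
 *	}
 *
 * MC_UNLOCK goes the other way: spt_unlockpages() reports how many bytes
 * actually became unlocked and only that amount is credited back.
 */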

/*ARGSUSED*/
int
segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
{
    struct shm_data *shmd = (struct shm_data *)seg->s_data;
    struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
    spgcnt_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1;

    ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));

    /*
     * ISM segment is always rw.
     */
    while (--pgno >= 0)
        *protv++ = sptd->spt_prot;
    return (0);
}

/*ARGSUSED*/
u_offset_t
segspt_shmgetoffset(struct seg *seg, caddr_t addr)
{
    ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));

    /* Offset does not matter in ISM memory */

    return ((u_offset_t)0);
}

/* ARGSUSED */
int
segspt_shmgettype(struct seg *seg, caddr_t addr)
{
    struct shm_data *shmd = (struct shm_data *)seg->s_data;
    struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;

    ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));

    /*
     * The shared memory mapping is always MAP_SHARED; swap is only
     * reserved for DISM.
     */
    return (MAP_SHARED |
        ((sptd->spt_flags & SHM_PAGEABLE) ? 0 : MAP_NORESERVE));
}

/*ARGSUSED*/
int
segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
{
    struct shm_data *shmd = (struct shm_data *)seg->s_data;
    struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;

    ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));

    *vpp = sptd->spt_vp;
    return (0);
}

/*
 * We need to wait for pending IO to complete to a DISM segment in order for
 * pages to get kicked out of the seg_pcache.  120 seconds should be more
 * than enough time to wait.
 */
static clock_t spt_pcache_wait = 120;

/*ARGSUSED*/
static int
segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len, uint_t behav)
{
    struct shm_data *shmd = (struct shm_data *)seg->s_data;
    struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
    struct anon_map *amp;
    pgcnt_t pg_idx;
    ushort_t gen;
    clock_t end_lbolt;
    int writer;
    page_t **ppa;

    ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));

    if (behav == MADV_FREE) {
        if ((sptd->spt_flags & SHM_PAGEABLE) == 0)
            return (0);

        amp = sptd->spt_amp;
        pg_idx = seg_page(seg, addr);

        mutex_enter(&sptd->spt_lock);
        if ((ppa = sptd->spt_ppa) == NULL) {
            mutex_exit(&sptd->spt_lock);
            ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
            anon_disclaim(amp, pg_idx, len);
            ANON_LOCK_EXIT(&amp->a_rwlock);
            return (0);
        }

        sptd->spt_flags |= DISM_PPA_CHANGED;
        gen = sptd->spt_gen;

        mutex_exit(&sptd->spt_lock);

        /*
         * Purge all DISM cached pages
         */
        seg_ppurge_wiredpp(ppa);

        /*
         * Drop the AS_LOCK so that other threads can grab it
         * in the as_pageunlock path and hopefully get the segment
         * kicked out of the seg_pcache. We bump the shm_softlockcnt
         * to keep this segment resident.
         */
        writer = AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock);
        atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), 1);
        AS_LOCK_EXIT(seg->s_as, &seg->s_as->a_lock);

        mutex_enter(&sptd->spt_lock);

        end_lbolt = ddi_get_lbolt() + (hz * spt_pcache_wait);

        /*
         * Try to wait for pages to get kicked out of the seg_pcache.
         */
        while (sptd->spt_gen == gen &&
            (sptd->spt_flags & DISM_PPA_CHANGED) &&
            ddi_get_lbolt() < end_lbolt) {
            if (!cv_timedwait_sig(&sptd->spt_cv,
                &sptd->spt_lock, end_lbolt)) {
                break;
            }
        }

        mutex_exit(&sptd->spt_lock);

        /* Regrab the AS_LOCK and release our hold on the segment */
        AS_LOCK_ENTER(seg->s_as, &seg->s_as->a_lock,
            writer ? RW_WRITER : RW_READER);
        atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), -1);
        if (shmd->shm_softlockcnt <= 0) {
            if (AS_ISUNMAPWAIT(seg->s_as)) {
                mutex_enter(&seg->s_as->a_contents);
                if (AS_ISUNMAPWAIT(seg->s_as)) {
                    AS_CLRUNMAPWAIT(seg->s_as);
                    cv_broadcast(&seg->s_as->a_cv);
                }
                mutex_exit(&seg->s_as->a_contents);
            }
        }

        ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
        anon_disclaim(amp, pg_idx, len);
        ANON_LOCK_EXIT(&amp->a_rwlock);
    } else if (lgrp_optimizations() && (behav == MADV_ACCESS_LWP ||
        behav == MADV_ACCESS_MANY || behav == MADV_ACCESS_DEFAULT)) {
        int already_set;
        ulong_t anon_index;
        lgrp_mem_policy_t policy;
        caddr_t shm_addr;
        size_t share_size;
        size_t size;
        struct seg *sptseg = shmd->shm_sptseg;
        caddr_t sptseg_addr;

        /*
         * Align address and length to page size of underlying segment
         */
        share_size = page_get_pagesize(shmd->shm_sptseg->s_szc);
        shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_size);
        size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)),
            share_size);

        amp = shmd->shm_amp;
        anon_index = seg_page(seg, shm_addr);

        /*
         * And now we may have to adjust size downward if we have
         * exceeded the realsize of the segment or initial anon
         * allocations.
         */
        sptseg_addr = sptseg->s_base + ptob(anon_index);
        if ((sptseg_addr + size) >
            (sptseg->s_base + sptd->spt_realsize))
            size = (sptseg->s_base + sptd->spt_realsize) -
                sptseg_addr;

        /*
         * Set memory allocation policy for this segment
         */
        policy = lgrp_madv_to_policy(behav, len, MAP_SHARED);
        already_set = lgrp_shm_policy_set(policy, amp, anon_index,
            NULL, 0, len);

        /*
         * If random memory allocation policy set already,
         * don't bother reapplying it.
         */
        if (already_set && !LGRP_MEM_POLICY_REAPPLICABLE(policy))
            return (0);

        /*
         * Mark any existing pages in the given range for migration,
         * flushing the I/O page cache, and use the underlying
         * segment to calculate the anon index and to get the anon
         * map and vnode pointer.
         */
        if (shmd->shm_softlockcnt > 0)
            segspt_purge(seg);

        page_mark_migrate(seg, shm_addr, size, amp, 0, NULL, 0, 0);
    }

    return (0);
}
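
/*
 * Summary of the MADV_FREE path above (descriptive only): cached DISM page
 * arrays must be flushed before anon pages can be disclaimed, so the code
 *
 *	1. marks the cached array stale (DISM_PPA_CHANGED) and purges it
 *	   with seg_ppurge_wiredpp();
 *	2. drops the AS lock, holding the segment with shm_softlockcnt,
 *	   and waits up to spt_pcache_wait seconds on spt_cv for the
 *	   seg_pcache to let go of the pages (spt_gen changes or the
 *	   DISM_PPA_CHANGED flag is cleared);
 *	3. retakes the AS lock in its original mode and only then calls
 *	   anon_disclaim().
 */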

/*ARGSUSED*/
void
segspt_shmdump(struct seg *seg)
{
    /* no-op for ISM segment */
}

/*ARGSUSED*/
static faultcode_t
segspt_shmsetpgsz(struct seg *seg, caddr_t addr, size_t len, uint_t szc)
{
    return (ENOTSUP);
}

/*
 * get a memory ID for an addr in a given segment
 */
static int
segspt_shmgetmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
{
    struct shm_data *shmd = (struct shm_data *)seg->s_data;
    struct anon *ap;
    size_t anon_index;
    struct anon_map *amp = shmd->shm_amp;
    struct spt_data *sptd = shmd->shm_sptseg->s_data;
    struct seg *sptseg = shmd->shm_sptseg;
    anon_sync_obj_t cookie;

    anon_index = seg_page(seg, addr);

    if (addr > (seg->s_base + sptd->spt_realsize)) {
        return (EFAULT);
    }

    ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
    anon_array_enter(amp, anon_index, &cookie);
    ap = anon_get_ptr(amp->ahp, anon_index);
    if (ap == NULL) {
        struct page *pp;
        caddr_t spt_addr = sptseg->s_base + ptob(anon_index);

        pp = anon_zero(sptseg, spt_addr, &ap, kcred);
        if (pp == NULL) {
            anon_array_exit(&cookie);
            ANON_LOCK_EXIT(&amp->a_rwlock);
            return (ENOMEM);
        }
        (void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP);
        page_unlock(pp);
    }
    anon_array_exit(&cookie);
    ANON_LOCK_EXIT(&amp->a_rwlock);
    memidp->val[0] = (uintptr_t)ap;
    memidp->val[1] = (uintptr_t)addr & PAGEOFFSET;
    return (0);
}

/*
 * Get memory allocation policy info for specified address in given segment
 */
static lgrp_mem_policy_info_t *
segspt_shmgetpolicy(struct seg *seg, caddr_t addr)
{
    struct anon_map *amp;
    ulong_t anon_index;
    lgrp_mem_policy_info_t *policy_info;
    struct shm_data *shm_data;

    ASSERT(seg != NULL);

    /*
     * Get anon_map from segshm
     *
     * Assume that no lock needs to be held on anon_map, since
     * it should be protected by its reference count which must be
     * nonzero for an existing segment
     * Need to grab readers lock on policy tree though
     */
    shm_data = (struct shm_data *)seg->s_data;
    if (shm_data == NULL)
        return (NULL);
    amp = shm_data->shm_amp;
    ASSERT(amp->refcnt != 0);

    /*
     * Get policy info
     *
     * Assume starting anon index of 0
     */
    anon_index = seg_page(seg, addr);
    policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0);

    return (policy_info);
}

/*ARGSUSED*/
static int
segspt_shmcapable(struct seg *seg, segcapability_t capability)
{
    return (0);
}
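
/*
 * Usage sketch (hypothetical userland fragment, not part of this file): the
 * MC_LOCK/MC_UNLOCK paths in segspt_shmlockop() are normally reached from a
 * process that created a pageable (DISM) segment and then locks part of it,
 * for example:
 *
 *	int id = shmget(key, sz, IPC_CREAT | 0600);
 *	void *va = shmat(id, NULL, SHM_PAGEABLE);	// DISM attach
 *	(void) mlock(va, sz);		// MC_LOCK  -> segspt_shmlockop()
 *	...
 *	(void) munlock(va, sz);		// MC_UNLOCK -> segspt_shmlockop()
 *
 * A non-pageable ISM attach (SHM_SHARE_MMU) is already locked for its whole
 * lifetime, which is why segspt_shmlockop() returns immediately when
 * SHM_PAGEABLE is not set.
 */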