/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2015, Joyent, Inc. All rights reserved.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */

#include <sys/param.h>
#include <sys/user.h>
#include <sys/mman.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/cmn_err.h>
#include <sys/systm.h>
#include <sys/tuneable.h>
#include <vm/hat.h>
#include <vm/seg.h>
#include <vm/as.h>
#include <vm/anon.h>
#include <vm/page.h>
#include <sys/buf.h>
#include <sys/swap.h>
#include <sys/atomic.h>
#include <vm/seg_spt.h>
#include <sys/debug.h>
#include <sys/vtrace.h>
#include <sys/shm.h>
#include <sys/shm_impl.h>
#include <sys/lgrp.h>
#include <sys/vmsystm.h>
#include <sys/policy.h>
#include <sys/project.h>
#include <sys/tnf_probe.h>
#include <sys/zone.h>

#define	SEGSPTADDR	(caddr_t)0x0

/*
 * # pages used for spt
 */
size_t	spt_used;

/*
 * segspt_minfree is the memory left for the system after ISM
 * has locked its pages; it is set to 5% of availrmem in
 * sptcreate() when ISM is created.  ISM should not use more
 * than ~90% of availrmem; if it does, then the performance
 * of the system may decrease.  Machines with large memories may
 * be able to use up more memory for ISM, so we set the default
 * segspt_minfree to 5% (which gives ISM a maximum of 95% of availrmem).
 * If somebody wants even more memory for ISM (risking hanging
 * the system) they can patch segspt_minfree to a smaller number.
 */
pgcnt_t segspt_minfree = 0;
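
/*
 * Worked example of the reservation above (illustrative only; the value
 * is actually computed at the first sptcreate() call as availrmem / 20):
 * on a system whose availrmem is 1,000,000 pages, segspt_minfree becomes
 * 50,000 pages, so ISM may lock at most 950,000 pages (95% of availrmem).
 */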

static int segspt_create(struct seg *seg, caddr_t argsp);
static int segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize);
static void segspt_free(struct seg *seg);
static void segspt_free_pages(struct seg *seg, caddr_t addr, size_t len);
static lgrp_mem_policy_info_t *segspt_getpolicy(struct seg *seg, caddr_t addr);

static void
segspt_badop()
{
	panic("segspt_badop called");
	/*NOTREACHED*/
}

#define	SEGSPT_BADOP(t)	(t(*)())segspt_badop

struct seg_ops segspt_ops = {
	SEGSPT_BADOP(int),		/* dup */
	segspt_unmap,
	segspt_free,
	SEGSPT_BADOP(int),		/* fault */
	SEGSPT_BADOP(faultcode_t),	/* faulta */
	SEGSPT_BADOP(int),		/* setprot */
	SEGSPT_BADOP(int),		/* checkprot */
	SEGSPT_BADOP(int),		/* kluster */
	SEGSPT_BADOP(size_t),		/* swapout */
	SEGSPT_BADOP(int),		/* sync */
	SEGSPT_BADOP(size_t),		/* incore */
	SEGSPT_BADOP(int),		/* lockop */
	SEGSPT_BADOP(int),		/* getprot */
	SEGSPT_BADOP(u_offset_t),	/* getoffset */
	SEGSPT_BADOP(int),		/* gettype */
	SEGSPT_BADOP(int),		/* getvp */
	SEGSPT_BADOP(int),		/* advise */
	SEGSPT_BADOP(void),		/* dump */
	SEGSPT_BADOP(int),		/* pagelock */
	SEGSPT_BADOP(int),		/* setpgsz */
	SEGSPT_BADOP(int),		/* getmemid */
	segspt_getpolicy,		/* getpolicy */
	SEGSPT_BADOP(int),		/* capable */
	seg_inherit_notsup		/* inherit */
};

static int segspt_shmdup(struct seg *seg, struct seg *newseg);
static int segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize);
static void segspt_shmfree(struct seg *seg);
static faultcode_t segspt_shmfault(struct hat *hat, struct seg *seg,
		caddr_t addr, size_t len, enum fault_type type, enum seg_rw rw);
static faultcode_t segspt_shmfaulta(struct seg *seg, caddr_t addr);
static int segspt_shmsetprot(register struct seg *seg, register caddr_t addr,
		register size_t len, register uint_t prot);
static int segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size,
		uint_t prot);
static int segspt_shmkluster(struct seg *seg, caddr_t addr, ssize_t delta);
static size_t segspt_shmswapout(struct seg *seg);
static size_t segspt_shmincore(struct seg *seg, caddr_t addr, size_t len,
		register char *vec);
static int segspt_shmsync(struct seg *seg, register caddr_t addr, size_t len,
		int attr, uint_t flags);
static int segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len,
		int attr, int op, ulong_t *lockmap, size_t pos);
static int segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len,
		uint_t *protv);
static u_offset_t segspt_shmgetoffset(struct seg *seg, caddr_t addr);
static int segspt_shmgettype(struct seg *seg, caddr_t addr);
static int segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp);
static int segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len,
		uint_t behav);
static void segspt_shmdump(struct seg *seg);
static int segspt_shmpagelock(struct seg *, caddr_t, size_t,
		struct page ***, enum lock_type, enum seg_rw);
static int segspt_shmsetpgsz(struct seg *, caddr_t, size_t, uint_t);
static int segspt_shmgetmemid(struct seg *, caddr_t, memid_t *);
static lgrp_mem_policy_info_t *segspt_shmgetpolicy(struct seg *, caddr_t);
static int segspt_shmcapable(struct seg *, segcapability_t);
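
/*
 * Note on the two ops vectors: segspt_ops above backs the single "dummy"
 * spt segment that owns the ISM/DISM pages (see segspt_create(), which
 * sets seg->s_ops = &segspt_ops), while segspt_shmops below backs each
 * process's attaching shm segment (see segspt_shmattach(), which sets
 * seg->s_ops = &segspt_shmops).
 */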

struct seg_ops segspt_shmops = {
	segspt_shmdup,
	segspt_shmunmap,
	segspt_shmfree,
	segspt_shmfault,
	segspt_shmfaulta,
	segspt_shmsetprot,
	segspt_shmcheckprot,
	segspt_shmkluster,
	segspt_shmswapout,
	segspt_shmsync,
	segspt_shmincore,
	segspt_shmlockop,
	segspt_shmgetprot,
	segspt_shmgetoffset,
	segspt_shmgettype,
	segspt_shmgetvp,
	segspt_shmadvise,	/* advise */
	segspt_shmdump,
	segspt_shmpagelock,
	segspt_shmsetpgsz,
	segspt_shmgetmemid,
	segspt_shmgetpolicy,
	segspt_shmcapable,
	seg_inherit_notsup
};

static void segspt_purge(struct seg *seg);
static int segspt_reclaim(void *, caddr_t, size_t, struct page **,
		enum seg_rw, int);
static int spt_anon_getpages(struct seg *seg, caddr_t addr, size_t len,
		page_t **ppa);


/*ARGSUSED*/
int
sptcreate(size_t size, struct seg **sptseg, struct anon_map *amp,
	uint_t prot, uint_t flags, uint_t share_szc)
{
	int	err;
	struct	as	*newas;
	struct	segspt_crargs sptcargs;

#ifdef DEBUG
	TNF_PROBE_1(sptcreate, "spt", /* CSTYLED */,
	    tnf_ulong, size, size );
#endif
	if (segspt_minfree == 0)	/* leave min 5% of availrmem for */
		segspt_minfree = availrmem/20;	/* for the system */

	if (!hat_supported(HAT_SHARED_PT, (void *)0))
		return (EINVAL);

	/*
	 * get a new as for this shared memory segment
	 */
	newas = as_alloc();
	newas->a_proc = NULL;
	sptcargs.amp = amp;
	sptcargs.prot = prot;
	sptcargs.flags = flags;
	sptcargs.szc = share_szc;
	/*
	 * create a shared page table (spt) segment
	 */

	if (err = as_map(newas, SEGSPTADDR, size, segspt_create, &sptcargs)) {
		as_free(newas);
		return (err);
	}
	*sptseg = sptcargs.seg_spt;
	return (0);
}

void
sptdestroy(struct as *as, struct anon_map *amp)
{

#ifdef DEBUG
	TNF_PROBE_0(sptdestroy, "spt", /* CSTYLED */);
#endif
	(void) as_unmap(as, SEGSPTADDR, amp->size);
	as_free(as);
}

/*
 * called from seg_free().
 * free (i.e., unlock, unmap, return to free list)
 * all the pages in the given seg.
 */
void
segspt_free(struct seg	*seg)
{
	struct spt_data *sptd = (struct spt_data *)seg->s_data;

	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));

	if (sptd != NULL) {
		if (sptd->spt_realsize)
			segspt_free_pages(seg, seg->s_base, sptd->spt_realsize);

		if (sptd->spt_ppa_lckcnt) {
			kmem_free(sptd->spt_ppa_lckcnt,
			    sizeof (*sptd->spt_ppa_lckcnt)
			    * btopr(sptd->spt_amp->size));
		}
		kmem_free(sptd->spt_vp, sizeof (*sptd->spt_vp));
		cv_destroy(&sptd->spt_cv);
		mutex_destroy(&sptd->spt_lock);
		kmem_free(sptd, sizeof (*sptd));
	}
}

/*ARGSUSED*/
static int
segspt_shmsync(struct seg *seg, caddr_t addr, size_t len, int attr,
	uint_t flags)
{
	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));

	return (0);
}

/*ARGSUSED*/
static size_t
segspt_shmincore(struct seg *seg, caddr_t addr, size_t len, char *vec)
{
	caddr_t	eo_seg;
	pgcnt_t	npages;
	struct shm_data *shmd = (struct shm_data *)seg->s_data;
	struct seg	*sptseg;
	struct spt_data *sptd;

	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
#ifdef lint
	seg = seg;
#endif
	sptseg = shmd->shm_sptseg;
	sptd = sptseg->s_data;

	if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
		eo_seg = addr + len;
		while (addr < eo_seg) {
			/* page exists, and it's locked. */
			*vec++ = SEG_PAGE_INCORE | SEG_PAGE_LOCKED |
			    SEG_PAGE_ANON;
			addr += PAGESIZE;
		}
		return (len);
	} else {
		struct  anon_map *amp = shmd->shm_amp;
		struct  anon	*ap;
		page_t		*pp;
		pgcnt_t		anon_index;
		struct vnode	*vp;
		u_offset_t	off;
		ulong_t		i;
		int		ret;
		anon_sync_obj_t	cookie;

		addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
		anon_index = seg_page(seg, addr);
		npages = btopr(len);
		if (anon_index + npages > btopr(shmd->shm_amp->size)) {
			return (EINVAL);
		}
		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
		for (i = 0; i < npages; i++, anon_index++) {
			ret = 0;
			anon_array_enter(amp, anon_index, &cookie);
			ap = anon_get_ptr(amp->ahp, anon_index);
			if (ap != NULL) {
				swap_xlate(ap, &vp, &off);
				anon_array_exit(&cookie);
				pp = page_lookup_nowait(vp, off, SE_SHARED);
				if (pp != NULL) {
					ret |= SEG_PAGE_INCORE | SEG_PAGE_ANON;
					page_unlock(pp);
				}
			} else {
				anon_array_exit(&cookie);
			}
			if (shmd->shm_vpage[anon_index] & DISM_PG_LOCKED) {
				ret |= SEG_PAGE_LOCKED;
			}
			*vec++ = (char)ret;
		}
		ANON_LOCK_EXIT(&amp->a_rwlock);
		return (len);
	}
}

static int
segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize)
{
	size_t share_size;

	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));

	/*
	 * seg.s_size may have been rounded up to the largest page size
	 * in shmat().
	 * XXX This should be cleaned up. sptdestroy should take a length
	 * argument which should be the same as sptcreate. Then
	 * this rounding would not be needed (or is done in shm.c)
	 * Only the check for full segment will be needed.
	 *
	 * XXX -- shouldn't raddr == 0 always? These tests don't seem
	 * to be useful at all.
	 */
	share_size = page_get_pagesize(seg->s_szc);
	ssize = P2ROUNDUP(ssize, share_size);

	if (raddr == seg->s_base && ssize == seg->s_size) {
		seg_free(seg);
		return (0);
	} else
		return (EINVAL);
}

int
segspt_create(struct seg *seg, caddr_t argsp)
{
	int		err;
	caddr_t		addr = seg->s_base;
	struct spt_data *sptd;
	struct segspt_crargs *sptcargs = (struct segspt_crargs *)argsp;
	struct anon_map *amp = sptcargs->amp;
	struct kshmid	*sp = amp->a_sp;
	struct	cred	*cred = CRED();
	ulong_t		i, j, anon_index = 0;
	pgcnt_t		npages = btopr(amp->size);
	struct vnode	*vp;
	page_t		**ppa;
	uint_t		hat_flags;
	size_t		pgsz;
	pgcnt_t		pgcnt;
	caddr_t		a;
	pgcnt_t		pidx;
	size_t		sz;
	proc_t		*procp = curproc;
	rctl_qty_t	lockedbytes = 0;
	kproject_t	*proj;

	/*
	 * We are holding the a_lock on the underlying dummy as,
	 * so we can make calls to the HAT layer.
	 */
	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
	ASSERT(sp != NULL);

#ifdef DEBUG
	TNF_PROBE_2(segspt_create, "spt", /* CSTYLED */,
	    tnf_opaque, addr, addr, tnf_ulong, len, seg->s_size);
#endif
	if ((sptcargs->flags & SHM_PAGEABLE) == 0) {
		if (err = anon_swap_adjust(npages))
			return (err);
	}
	err = ENOMEM;

	if ((sptd = kmem_zalloc(sizeof (*sptd), KM_NOSLEEP)) == NULL)
		goto out1;

	if ((sptcargs->flags & SHM_PAGEABLE) == 0) {
		if ((ppa = kmem_zalloc(((sizeof (page_t *)) * npages),
		    KM_NOSLEEP)) == NULL)
			goto out2;
	}

	mutex_init(&sptd->spt_lock, NULL, MUTEX_DEFAULT, NULL);

	if ((vp = kmem_zalloc(sizeof (*vp), KM_NOSLEEP)) == NULL)
		goto out3;

	seg->s_ops = &segspt_ops;
	sptd->spt_vp = vp;
	sptd->spt_amp = amp;
	sptd->spt_prot = sptcargs->prot;
	sptd->spt_flags = sptcargs->flags;
	seg->s_data = (caddr_t)sptd;
	sptd->spt_ppa = NULL;
	sptd->spt_ppa_lckcnt = NULL;
	seg->s_szc = sptcargs->szc;
	cv_init(&sptd->spt_cv, NULL, CV_DEFAULT, NULL);
	sptd->spt_gen = 0;

	ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
	if (seg->s_szc > amp->a_szc) {
		amp->a_szc = seg->s_szc;
	}
	ANON_LOCK_EXIT(&amp->a_rwlock);

	/*
	 * Set policy to affect initial allocation of pages in
	 * anon_map_createpages()
	 */
	(void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, amp, anon_index,
	    NULL, 0, ptob(npages));

	if (sptcargs->flags & SHM_PAGEABLE) {
		size_t  share_sz;
		pgcnt_t new_npgs, more_pgs;
		struct anon_hdr *nahp;
		zone_t *zone;

		share_sz = page_get_pagesize(seg->s_szc);
		if (!IS_P2ALIGNED(amp->size, share_sz)) {
			/*
			 * We are rounding up the size of the anon array
			 * on 4 M boundary because we always create 4 M
			 * of page(s) when locking, faulting pages and we
			 * don't have to check for all corner cases e.g.
			 * if there is enough space to allocate 4 M
			 * page.
			 */
			new_npgs = btop(P2ROUNDUP(amp->size, share_sz));
			more_pgs = new_npgs - npages;

			/*
			 * The zone will never be NULL, as a fully created
			 * shm always has an owning zone.
			 */
			zone = sp->shm_perm.ipc_zone_ref.zref_zone;
			ASSERT(zone != NULL);
			if (anon_resv_zone(ptob(more_pgs), zone) == 0) {
				err = ENOMEM;
				goto out4;
			}

			nahp = anon_create(new_npgs, ANON_SLEEP);
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
			(void) anon_copy_ptr(amp->ahp, 0, nahp, 0, npages,
			    ANON_SLEEP);
			anon_release(amp->ahp, npages);
			amp->ahp = nahp;
			ASSERT(amp->swresv == ptob(npages));
			amp->swresv = amp->size = ptob(new_npgs);
			ANON_LOCK_EXIT(&amp->a_rwlock);
			npages = new_npgs;
		}

		sptd->spt_ppa_lckcnt = kmem_zalloc(npages *
		    sizeof (*sptd->spt_ppa_lckcnt), KM_SLEEP);
		sptd->spt_pcachecnt = 0;
		sptd->spt_realsize = ptob(npages);
		sptcargs->seg_spt = seg;
		return (0);
	}

	/*
	 * get array of pages for each anon slot in amp
	 */
	if ((err = anon_map_createpages(amp, anon_index, ptob(npages), ppa,
	    seg, addr, S_CREATE, cred)) != 0)
		goto out4;

	mutex_enter(&sp->shm_mlock);

	/* May be partially locked, so, count bytes to charge for locking */
	for (i = 0; i < npages; i++)
		if (ppa[i]->p_lckcnt == 0)
			lockedbytes += PAGESIZE;

	proj = sp->shm_perm.ipc_proj;

	if (lockedbytes > 0) {
		mutex_enter(&procp->p_lock);
		if (rctl_incr_locked_mem(procp, proj, lockedbytes, 0)) {
			mutex_exit(&procp->p_lock);
			mutex_exit(&sp->shm_mlock);
			for (i = 0; i < npages; i++)
				page_unlock(ppa[i]);
			err = ENOMEM;
			goto out4;
		}
		mutex_exit(&procp->p_lock);
	}

	/*
	 * addr is initial address corresponding to the first page on ppa list
	 */
	for (i = 0; i < npages; i++) {
		/* attempt to lock all pages */
		if (page_pp_lock(ppa[i], 0, 1) == 0) {
			/*
			 * if unable to lock any page, unlock all
			 * of them and return error
			 */
			for (j = 0; j < i; j++)
				page_pp_unlock(ppa[j], 0, 1);
			for (i = 0; i < npages; i++)
				page_unlock(ppa[i]);
			rctl_decr_locked_mem(NULL, proj, lockedbytes, 0);
			mutex_exit(&sp->shm_mlock);
			err = ENOMEM;
			goto out4;
		}
	}
	mutex_exit(&sp->shm_mlock);

	/*
	 * Some platforms assume that ISM mappings are HAT_LOAD_LOCK
	 * for the entire life of the segment. For example, platforms
	 * that do not support Dynamic Reconfiguration.
	 */
	hat_flags = HAT_LOAD_SHARE;
	if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, NULL))
		hat_flags |= HAT_LOAD_LOCK;

	/*
	 * Load translations one large page at a time
	 * to make sure we don't create mappings bigger than
	 * segment's size code in case underlying pages
	 * are shared with segvn's segment that uses bigger
	 * size code than we do.
	 */
	pgsz = page_get_pagesize(seg->s_szc);
	pgcnt = page_get_pagecnt(seg->s_szc);
	for (a = addr, pidx = 0; pidx < npages; a += pgsz, pidx += pgcnt) {
		sz = MIN(pgsz, ptob(npages - pidx));
		hat_memload_array(seg->s_as->a_hat, a, sz,
		    &ppa[pidx], sptd->spt_prot, hat_flags);
	}

	/*
	 * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP,
	 * we will leave the pages locked SE_SHARED for the life
	 * of the ISM segment. This will prevent any calls to
	 * hat_pageunload() on this ISM segment for those platforms.
	 */
	if (!(hat_flags & HAT_LOAD_LOCK)) {
		/*
		 * On platforms that support HAT_DYNAMIC_ISM_UNMAP,
		 * we no longer need to hold the SE_SHARED lock on the pages,
		 * since L_PAGELOCK and F_SOFTLOCK calls will grab the
		 * SE_SHARED lock on the pages as necessary.
		 */
		for (i = 0; i < npages; i++)
			page_unlock(ppa[i]);
	}
	sptd->spt_pcachecnt = 0;
	kmem_free(ppa, ((sizeof (page_t *)) * npages));
	sptd->spt_realsize = ptob(npages);
	atomic_add_long(&spt_used, npages);
	sptcargs->seg_spt = seg;
	return (0);

out4:
	seg->s_data = NULL;
	kmem_free(vp, sizeof (*vp));
	cv_destroy(&sptd->spt_cv);
out3:
	mutex_destroy(&sptd->spt_lock);
	if ((sptcargs->flags & SHM_PAGEABLE) == 0)
		kmem_free(ppa, (sizeof (*ppa) * npages));
out2:
	kmem_free(sptd, sizeof (*sptd));
out1:
	if ((sptcargs->flags & SHM_PAGEABLE) == 0)
		anon_swap_restore(npages);
	return (err);
}

/*ARGSUSED*/
void
segspt_free_pages(struct seg *seg, caddr_t addr, size_t len)
{
	struct page	*pp;
	struct spt_data *sptd = (struct spt_data *)seg->s_data;
	pgcnt_t		npages;
	ulong_t		anon_idx;
	struct anon_map	*amp;
	struct anon	*ap;
	struct vnode	*vp;
	u_offset_t	off;
	uint_t		hat_flags;
	int		root = 0;
	pgcnt_t		pgs, curnpgs = 0;
	page_t		*rootpp;
	rctl_qty_t	unlocked_bytes = 0;
	kproject_t	*proj;
	kshmid_t	*sp;

	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));

	len = P2ROUNDUP(len, PAGESIZE);

	npages = btop(len);

	hat_flags = HAT_UNLOAD_UNLOCK | HAT_UNLOAD_UNMAP;
	if ((hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) ||
	    (sptd->spt_flags & SHM_PAGEABLE)) {
		hat_flags = HAT_UNLOAD_UNMAP;
	}

	hat_unload(seg->s_as->a_hat, addr, len, hat_flags);

	amp = sptd->spt_amp;
	if (sptd->spt_flags & SHM_PAGEABLE)
		npages = btop(amp->size);

	ASSERT(amp != NULL);

	if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
		sp = amp->a_sp;
		proj = sp->shm_perm.ipc_proj;
		mutex_enter(&sp->shm_mlock);
	}
	for (anon_idx = 0; anon_idx < npages; anon_idx++) {
		if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
			if ((ap = anon_get_ptr(amp->ahp, anon_idx)) == NULL) {
				panic("segspt_free_pages: null app");
				/*NOTREACHED*/
			}
		} else {
			if ((ap = anon_get_next_ptr(amp->ahp, &anon_idx))
			    == NULL)
				continue;
		}
		ASSERT(ANON_ISBUSY(anon_get_slot(amp->ahp, anon_idx)) == 0);
		swap_xlate(ap, &vp, &off);

		/*
		 * If this platform supports HAT_DYNAMIC_ISM_UNMAP,
		 * the pages won't be having SE_SHARED lock at this
		 * point.
		 *
		 * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP,
		 * the pages are still held SE_SHARED locked from the
		 * original segspt_create()
		 *
		 * Our goal is to get SE_EXCL lock on each page, remove
		 * permanent lock on it and invalidate the page.
		 */
		if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
			if (hat_flags == HAT_UNLOAD_UNMAP)
				pp = page_lookup(vp, off, SE_EXCL);
			else {
				if ((pp = page_find(vp, off)) == NULL) {
					panic("segspt_free_pages: "
					    "page not locked");
					/*NOTREACHED*/
				}
				if (!page_tryupgrade(pp)) {
					page_unlock(pp);
					pp = page_lookup(vp, off, SE_EXCL);
				}
			}
			if (pp == NULL) {
				panic("segspt_free_pages: "
				    "page not in the system");
				/*NOTREACHED*/
			}
			ASSERT(pp->p_lckcnt > 0);
			page_pp_unlock(pp, 0, 1);
			if (pp->p_lckcnt == 0)
				unlocked_bytes += PAGESIZE;
		} else {
			if ((pp = page_lookup(vp, off, SE_EXCL)) == NULL)
				continue;
		}
		/*
		 * It's logical to invalidate the pages here as in most cases
		 * these were created by segspt.
		 */
		if (pp->p_szc != 0) {
			if (root == 0) {
				ASSERT(curnpgs == 0);
				root = 1;
				rootpp = pp;
				pgs = curnpgs = page_get_pagecnt(pp->p_szc);
				ASSERT(pgs > 1);
				ASSERT(IS_P2ALIGNED(pgs, pgs));
				ASSERT(!(page_pptonum(pp) & (pgs - 1)));
				curnpgs--;
			} else if ((page_pptonum(pp) & (pgs - 1)) == pgs - 1) {
				ASSERT(curnpgs == 1);
				ASSERT(page_pptonum(pp) ==
				    page_pptonum(rootpp) + (pgs - 1));
				page_destroy_pages(rootpp);
				root = 0;
				curnpgs = 0;
			} else {
				ASSERT(curnpgs > 1);
				ASSERT(page_pptonum(pp) ==
				    page_pptonum(rootpp) + (pgs - curnpgs));
				curnpgs--;
			}
		} else {
			if (root != 0 || curnpgs != 0) {
				panic("segspt_free_pages: bad large page");
				/*NOTREACHED*/
			}
			/*
			 * Before destroying the pages, we need to take care
			 * of the rctl locked memory accounting. For that
			 * we need to calculate the unlocked_bytes.
			 */
			if (pp->p_lckcnt > 0)
				unlocked_bytes += PAGESIZE;
			/*LINTED: constant in conditional context */
			VN_DISPOSE(pp, B_INVAL, 0, kcred);
		}
	}
	if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
		if (unlocked_bytes > 0)
			rctl_decr_locked_mem(NULL, proj, unlocked_bytes, 0);
		mutex_exit(&sp->shm_mlock);
	}
	if (root != 0 || curnpgs != 0) {
		panic("segspt_free_pages: bad large page");
		/*NOTREACHED*/
	}

	/*
	 * mark that pages have been released
	 */
	sptd->spt_realsize = 0;

	if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
		atomic_add_long(&spt_used, -npages);
		anon_swap_restore(npages);
	}
}

/*
 * Get memory allocation policy info for specified address in given segment
 */
static lgrp_mem_policy_info_t *
segspt_getpolicy(struct seg *seg, caddr_t addr)
{
	struct anon_map		*amp;
	ulong_t			anon_index;
	lgrp_mem_policy_info_t	*policy_info;
	struct spt_data		*spt_data;

	ASSERT(seg != NULL);

	/*
	 * Get anon_map from segspt
	 *
	 * Assume that no lock needs to be held on anon_map, since
	 * it should be protected by its reference count which must be
	 * nonzero for an existing segment
	 * Need to grab readers lock on policy tree though
	 */
	spt_data = (struct spt_data *)seg->s_data;
	if (spt_data == NULL)
		return (NULL);
	amp = spt_data->spt_amp;
	ASSERT(amp->refcnt != 0);

	/*
	 * Get policy info
	 *
	 * Assume starting anon index of 0
	 */
	anon_index = seg_page(seg, addr);
	policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0);

	return (policy_info);
}

/*
 * DISM only.
 * Return locked pages over a given range.
 *
 * We will cache all DISM locked pages and save the pplist for the
 * entire segment in the ppa field of the underlying DISM segment structure.
 * Later, during a call to segspt_reclaim() we will use this ppa array
 * to page_unlock() all of the pages and then we will free this ppa list.
 */
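
/*
 * Hedged sketch of the expected call flow for the pagelock interface
 * implemented below (illustrative only; in this file the work is done
 * through the generic pagelock cache via seg_plookup(), seg_pinsert()
 * and seg_pinactive()):
 *
 *	as_pagelock(as, &ppp, addr, len, rw)
 *	    -> segspt_shmpagelock(seg, addr, len, &ppp, L_PAGELOCK, rw)
 *	       (which forwards here for DISM segments) builds, or reuses,
 *	       sptd->spt_ppa and seg_pinsert()s it;
 *	as_pageunlock(as, ppp, addr, len, rw)
 *	    -> segspt_shmpagelock(seg, addr, len, &ppp, L_PAGEUNLOCK, rw)
 *	       seg_pinactive() drops the cache hold and, once the last
 *	       hold is gone, segspt_reclaim() page_unlock()s the pages
 *	       and frees the ppa list.
 */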

/*ARGSUSED*/
static int
segspt_dismpagelock(struct seg *seg, caddr_t addr, size_t len,
    struct page ***ppp, enum lock_type type, enum seg_rw rw)
{
	struct  shm_data *shmd = (struct shm_data *)seg->s_data;
	struct  seg	*sptseg = shmd->shm_sptseg;
	struct  spt_data *sptd = sptseg->s_data;
	pgcnt_t pg_idx, npages, tot_npages, npgs;
	struct  page **pplist, **pl, **ppa, *pp;
	struct  anon_map *amp;
	spgcnt_t	an_idx;
	int	ret = ENOTSUP;
	uint_t	pl_built = 0;
	struct  anon *ap;
	struct  vnode *vp;
	u_offset_t off;
	pgcnt_t claim_availrmem = 0;
	uint_t	szc;

	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
	ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK);

	/*
	 * We want to lock/unlock the entire ISM segment. Therefore,
	 * we will be using the underlying sptseg and its base address
	 * and length for the caching arguments.
	 */
	ASSERT(sptseg);
	ASSERT(sptd);

	pg_idx = seg_page(seg, addr);
	npages = btopr(len);

	/*
	 * check if the request is larger than number of pages covered
	 * by amp
	 */
	if (pg_idx + npages > btopr(sptd->spt_amp->size)) {
		*ppp = NULL;
		return (ENOTSUP);
	}

	if (type == L_PAGEUNLOCK) {
		ASSERT(sptd->spt_ppa != NULL);

		seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size,
		    sptd->spt_ppa, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);

		/*
		 * If someone is blocked while unmapping, we purge
		 * segment page cache and thus reclaim pplist synchronously
		 * without waiting for seg_pasync_thread. This speeds up
		 * unmapping in cases where munmap(2) is called, while
		 * raw async i/o is still in progress or where a thread
		 * exits on data fault in a multithreaded application.
		 */
		if ((sptd->spt_flags & DISM_PPA_CHANGED) ||
		    (AS_ISUNMAPWAIT(seg->s_as) &&
		    shmd->shm_softlockcnt > 0)) {
			segspt_purge(seg);
		}
		return (0);
	}

	/* The L_PAGELOCK case ... */

	if (sptd->spt_flags & DISM_PPA_CHANGED) {
		segspt_purge(seg);
		/*
		 * for DISM the ppa needs to be rebuilt since the
		 * number of locked pages could have changed
		 */
		*ppp = NULL;
		return (ENOTSUP);
	}

	/*
	 * First try to find pages in segment page cache, without
	 * holding the segment lock.
	 */
	pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
	    S_WRITE, SEGP_FORCE_WIRED);
	if (pplist != NULL) {
		ASSERT(sptd->spt_ppa != NULL);
		ASSERT(sptd->spt_ppa == pplist);
		ppa = sptd->spt_ppa;
		for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
			if (ppa[an_idx] == NULL) {
				seg_pinactive(seg, NULL, seg->s_base,
				    sptd->spt_amp->size, ppa,
				    S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
				*ppp = NULL;
				return (ENOTSUP);
			}
			if ((szc = ppa[an_idx]->p_szc) != 0) {
				npgs = page_get_pagecnt(szc);
				an_idx = P2ROUNDUP(an_idx + 1, npgs);
			} else {
				an_idx++;
			}
		}
		/*
		 * Since we cache the entire DISM segment, we want to
		 * set ppp to point to the first slot that corresponds
		 * to the requested addr, i.e. pg_idx.
		 */
		*ppp = &(sptd->spt_ppa[pg_idx]);
		return (0);
	}

	mutex_enter(&sptd->spt_lock);
	/*
	 * try to find pages in segment page cache with mutex
	 */
	pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
	    S_WRITE, SEGP_FORCE_WIRED);
	if (pplist != NULL) {
		ASSERT(sptd->spt_ppa != NULL);
		ASSERT(sptd->spt_ppa == pplist);
		ppa = sptd->spt_ppa;
		for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
			if (ppa[an_idx] == NULL) {
				mutex_exit(&sptd->spt_lock);
				seg_pinactive(seg, NULL, seg->s_base,
				    sptd->spt_amp->size, ppa,
				    S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
				*ppp = NULL;
				return (ENOTSUP);
			}
			if ((szc = ppa[an_idx]->p_szc) != 0) {
				npgs = page_get_pagecnt(szc);
				an_idx = P2ROUNDUP(an_idx + 1, npgs);
			} else {
				an_idx++;
			}
		}
		/*
		 * Since we cache the entire DISM segment, we want to
		 * set ppp to point to the first slot that corresponds
		 * to the requested addr, i.e. pg_idx.
		 */
		mutex_exit(&sptd->spt_lock);
		*ppp = &(sptd->spt_ppa[pg_idx]);
		return (0);
	}
	if (seg_pinsert_check(seg, NULL, seg->s_base, sptd->spt_amp->size,
	    SEGP_FORCE_WIRED) == SEGP_FAIL) {
		mutex_exit(&sptd->spt_lock);
		*ppp = NULL;
		return (ENOTSUP);
	}

	/*
	 * No need to worry about protections because DISM pages are always rw.
	 */
	pl = pplist = NULL;
	amp = sptd->spt_amp;

	/*
	 * Do we need to build the ppa array?
	 */
	if (sptd->spt_ppa == NULL) {
		pgcnt_t lpg_cnt = 0;

		pl_built = 1;
		tot_npages = btopr(sptd->spt_amp->size);

		ASSERT(sptd->spt_pcachecnt == 0);
		pplist = kmem_zalloc(sizeof (page_t *) * tot_npages, KM_SLEEP);
		pl = pplist;

		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
		for (an_idx = 0; an_idx < tot_npages; ) {
			ap = anon_get_ptr(amp->ahp, an_idx);
			/*
			 * Cache only mlocked pages. For large pages
			 * if one (constituent) page is mlocked
			 * all pages for that large page
			 * are cached also. This is for quick
			 * lookups of ppa array;
			 */
			if ((ap != NULL) && (lpg_cnt != 0 ||
			    (sptd->spt_ppa_lckcnt[an_idx] != 0))) {

				swap_xlate(ap, &vp, &off);
				pp = page_lookup(vp, off, SE_SHARED);
				ASSERT(pp != NULL);
				if (lpg_cnt == 0) {
					lpg_cnt++;
					/*
					 * For a small page, we are done --
					 * lpg_count is reset to 0 below.
					 *
					 * For a large page, we are guaranteed
					 * to find the anon structures of all
					 * constituent pages and a non-zero
					 * lpg_cnt ensures that we don't test
					 * for mlock for these. We are done
					 * when lpg_count reaches (npgs + 1).
					 * If we are not the first constituent
					 * page, restart at the first one.
					 */
					npgs = page_get_pagecnt(pp->p_szc);
					if (!IS_P2ALIGNED(an_idx, npgs)) {
						an_idx = P2ALIGN(an_idx, npgs);
						page_unlock(pp);
						continue;
					}
				}
				if (++lpg_cnt > npgs)
					lpg_cnt = 0;

				/*
				 * availrmem is decremented only
				 * for unlocked pages
				 */
				if (sptd->spt_ppa_lckcnt[an_idx] == 0)
					claim_availrmem++;
				pplist[an_idx] = pp;
			}
			an_idx++;
		}
		ANON_LOCK_EXIT(&amp->a_rwlock);

		if (claim_availrmem) {
			mutex_enter(&freemem_lock);
			if (availrmem < tune.t_minarmem + claim_availrmem) {
				mutex_exit(&freemem_lock);
				ret = ENOTSUP;
				claim_availrmem = 0;
				goto insert_fail;
			} else {
				availrmem -= claim_availrmem;
			}
			mutex_exit(&freemem_lock);
		}

		sptd->spt_ppa = pl;
	} else {
		/*
		 * We already have a valid ppa[].
		 */
		pl = sptd->spt_ppa;
	}

	ASSERT(pl != NULL);

	ret = seg_pinsert(seg, NULL, seg->s_base, sptd->spt_amp->size,
	    sptd->spt_amp->size, pl, S_WRITE, SEGP_FORCE_WIRED,
	    segspt_reclaim);
	if (ret == SEGP_FAIL) {
		/*
		 * seg_pinsert failed. We return
		 * ENOTSUP, so that the as_pagelock() code will
		 * then try the slower F_SOFTLOCK path.
		 */
		if (pl_built) {
			/*
			 * No one else has referenced the ppa[].
			 * We created it and we need to destroy it.
			 */
			sptd->spt_ppa = NULL;
		}
		ret = ENOTSUP;
		goto insert_fail;
	}

	/*
	 * In either case, we increment softlockcnt on the 'real' segment.
	 */
	sptd->spt_pcachecnt++;
	atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));

	ppa = sptd->spt_ppa;
	for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
		if (ppa[an_idx] == NULL) {
			mutex_exit(&sptd->spt_lock);
			seg_pinactive(seg, NULL, seg->s_base,
			    sptd->spt_amp->size,
			    pl, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
			*ppp = NULL;
			return (ENOTSUP);
		}
		if ((szc = ppa[an_idx]->p_szc) != 0) {
			npgs = page_get_pagecnt(szc);
			an_idx = P2ROUNDUP(an_idx + 1, npgs);
		} else {
			an_idx++;
		}
	}
	/*
	 * We can now drop the sptd->spt_lock since the ppa[]
	 * exists and we have incremented pcachecnt.
	 */
	mutex_exit(&sptd->spt_lock);

	/*
	 * Since we cache the entire segment, we want to
	 * set ppp to point to the first slot that corresponds
	 * to the requested addr, i.e. pg_idx.
	 */
	*ppp = &(sptd->spt_ppa[pg_idx]);
	return (0);

insert_fail:
	/*
	 * We will only reach this code if we tried and failed.
	 *
	 * And we can drop the lock on the dummy seg, once we've failed
	 * to set up a new ppa[].
	 */
	mutex_exit(&sptd->spt_lock);

	if (pl_built) {
		if (claim_availrmem) {
			mutex_enter(&freemem_lock);
			availrmem += claim_availrmem;
			mutex_exit(&freemem_lock);
		}

		/*
		 * We created pl and we need to destroy it.
		 */
		pplist = pl;
		for (an_idx = 0; an_idx < tot_npages; an_idx++) {
			if (pplist[an_idx] != NULL)
				page_unlock(pplist[an_idx]);
		}
		kmem_free(pl, sizeof (page_t *) * tot_npages);
	}

	if (shmd->shm_softlockcnt <= 0) {
		if (AS_ISUNMAPWAIT(seg->s_as)) {
			mutex_enter(&seg->s_as->a_contents);
			if (AS_ISUNMAPWAIT(seg->s_as)) {
				AS_CLRUNMAPWAIT(seg->s_as);
				cv_broadcast(&seg->s_as->a_cv);
			}
			mutex_exit(&seg->s_as->a_contents);
		}
	}
	*ppp = NULL;
	return (ret);
}


/*
 * return locked pages over a given range.
 *
 * We will cache the entire ISM segment and save the pplist for the
 * entire segment in the ppa field of the underlying ISM segment structure.
 * Later, during a call to segspt_reclaim() we will use this ppa array
 * to page_unlock() all of the pages and then we will free this ppa list.
 */
/*ARGSUSED*/
static int
segspt_shmpagelock(struct seg *seg, caddr_t addr, size_t len,
    struct page ***ppp, enum lock_type type, enum seg_rw rw)
{
	struct shm_data *shmd = (struct shm_data *)seg->s_data;
	struct seg	*sptseg = shmd->shm_sptseg;
	struct spt_data *sptd = sptseg->s_data;
	pgcnt_t np, page_index, npages;
	caddr_t a, spt_base;
	struct page **pplist, **pl, *pp;
	struct anon_map *amp;
	ulong_t anon_index;
	int ret = ENOTSUP;
	uint_t	pl_built = 0;
	struct anon *ap;
	struct vnode *vp;
	u_offset_t off;

	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
	ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK);


	/*
	 * We want to lock/unlock the entire ISM segment. Therefore,
	 * we will be using the underlying sptseg and its base address
	 * and length for the caching arguments.
	 */
	ASSERT(sptseg);
	ASSERT(sptd);

	if (sptd->spt_flags & SHM_PAGEABLE) {
		return (segspt_dismpagelock(seg, addr, len, ppp, type, rw));
	}

	page_index = seg_page(seg, addr);
	npages = btopr(len);

	/*
	 * check if the request is larger than number of pages covered
	 * by amp
	 */
	if (page_index + npages > btopr(sptd->spt_amp->size)) {
		*ppp = NULL;
		return (ENOTSUP);
	}

	if (type == L_PAGEUNLOCK) {

		ASSERT(sptd->spt_ppa != NULL);

		seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size,
		    sptd->spt_ppa, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);

		/*
		 * If someone is blocked while unmapping, we purge
		 * segment page cache and thus reclaim pplist synchronously
		 * without waiting for seg_pasync_thread. This speeds up
		 * unmapping in cases where munmap(2) is called, while
		 * raw async i/o is still in progress or where a thread
		 * exits on data fault in a multithreaded application.
		 */
		if (AS_ISUNMAPWAIT(seg->s_as) && (shmd->shm_softlockcnt > 0)) {
			segspt_purge(seg);
		}
		return (0);
	}

	/* The L_PAGELOCK case... */

	/*
	 * First try to find pages in segment page cache, without
	 * holding the segment lock.
	 */
	pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
	    S_WRITE, SEGP_FORCE_WIRED);
	if (pplist != NULL) {
		ASSERT(sptd->spt_ppa == pplist);
		ASSERT(sptd->spt_ppa[page_index]);
		/*
		 * Since we cache the entire ISM segment, we want to
		 * set ppp to point to the first slot that corresponds
		 * to the requested addr, i.e. page_index.
		 */
		*ppp = &(sptd->spt_ppa[page_index]);
		return (0);
	}

	mutex_enter(&sptd->spt_lock);

	/*
	 * try to find pages in segment page cache
	 */
	pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
	    S_WRITE, SEGP_FORCE_WIRED);
	if (pplist != NULL) {
		ASSERT(sptd->spt_ppa == pplist);
		/*
		 * Since we cache the entire segment, we want to
		 * set ppp to point to the first slot that corresponds
		 * to the requested addr, i.e. page_index.
		 */
		mutex_exit(&sptd->spt_lock);
		*ppp = &(sptd->spt_ppa[page_index]);
		return (0);
	}

	if (seg_pinsert_check(seg, NULL, seg->s_base, sptd->spt_amp->size,
	    SEGP_FORCE_WIRED) == SEGP_FAIL) {
		mutex_exit(&sptd->spt_lock);
		*ppp = NULL;
		return (ENOTSUP);
	}

	/*
	 * No need to worry about protections because ISM pages
	 * are always rw.
	 */
	pl = pplist = NULL;

	/*
	 * Do we need to build the ppa array?
	 */
	if (sptd->spt_ppa == NULL) {
		ASSERT(sptd->spt_ppa == pplist);

		spt_base = sptseg->s_base;
		pl_built = 1;

		/*
		 * availrmem is decremented once during anon_swap_adjust()
		 * and is incremented during the anon_unresv(), which is
		 * called from shm_rm_amp() when the segment is destroyed.
		 */
		amp = sptd->spt_amp;
		ASSERT(amp != NULL);

		/* pcachecnt is protected by sptd->spt_lock */
		ASSERT(sptd->spt_pcachecnt == 0);
		pplist = kmem_zalloc(sizeof (page_t *)
		    * btopr(sptd->spt_amp->size), KM_SLEEP);
		pl = pplist;

		anon_index = seg_page(sptseg, spt_base);

		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
		for (a = spt_base; a < (spt_base + sptd->spt_amp->size);
		    a += PAGESIZE, anon_index++, pplist++) {
			ap = anon_get_ptr(amp->ahp, anon_index);
			ASSERT(ap != NULL);
			swap_xlate(ap, &vp, &off);
			pp = page_lookup(vp, off, SE_SHARED);
			ASSERT(pp != NULL);
			*pplist = pp;
		}
		ANON_LOCK_EXIT(&amp->a_rwlock);

		if (a < (spt_base + sptd->spt_amp->size)) {
			ret = ENOTSUP;
			goto insert_fail;
		}
		sptd->spt_ppa = pl;
	} else {
		/*
		 * We already have a valid ppa[].
		 */
		pl = sptd->spt_ppa;
	}

	ASSERT(pl != NULL);

	ret = seg_pinsert(seg, NULL, seg->s_base, sptd->spt_amp->size,
	    sptd->spt_amp->size, pl, S_WRITE, SEGP_FORCE_WIRED,
	    segspt_reclaim);
	if (ret == SEGP_FAIL) {
		/*
		 * seg_pinsert failed. We return
		 * ENOTSUP, so that the as_pagelock() code will
		 * then try the slower F_SOFTLOCK path.
		 */
		if (pl_built) {
			/*
			 * No one else has referenced the ppa[].
			 * We created it and we need to destroy it.
			 */
			sptd->spt_ppa = NULL;
		}
		ret = ENOTSUP;
		goto insert_fail;
	}

	/*
	 * In either case, we increment softlockcnt on the 'real' segment.
	 */
	sptd->spt_pcachecnt++;
	atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));

	/*
	 * We can now drop the sptd->spt_lock since the ppa[]
	 * exists and we have incremented pcachecnt.
	 */
	mutex_exit(&sptd->spt_lock);

	/*
	 * Since we cache the entire segment, we want to
	 * set ppp to point to the first slot that corresponds
	 * to the requested addr, i.e. page_index.
	 */
	*ppp = &(sptd->spt_ppa[page_index]);
	return (0);

insert_fail:
	/*
	 * We will only reach this code if we tried and failed.
	 *
	 * And we can drop the lock on the dummy seg, once we've failed
	 * to set up a new ppa[].
	 */
	mutex_exit(&sptd->spt_lock);

	if (pl_built) {
		/*
		 * We created pl and we need to destroy it.
		 */
		pplist = pl;
		np = (((uintptr_t)(a - spt_base)) >> PAGESHIFT);
		while (np) {
			page_unlock(*pplist);
			np--;
			pplist++;
		}
		kmem_free(pl, sizeof (page_t *) * btopr(sptd->spt_amp->size));
	}
	if (shmd->shm_softlockcnt <= 0) {
		if (AS_ISUNMAPWAIT(seg->s_as)) {
			mutex_enter(&seg->s_as->a_contents);
			if (AS_ISUNMAPWAIT(seg->s_as)) {
				AS_CLRUNMAPWAIT(seg->s_as);
				cv_broadcast(&seg->s_as->a_cv);
			}
			mutex_exit(&seg->s_as->a_contents);
		}
	}
	*ppp = NULL;
	return (ret);
}

/*
 * purge any cached pages in the I/O page cache
 */
static void
segspt_purge(struct seg *seg)
{
	seg_ppurge(seg, NULL, SEGP_FORCE_WIRED);
}

static int
segspt_reclaim(void *ptag, caddr_t addr, size_t len, struct page **pplist,
    enum seg_rw rw, int async)
{
	struct seg *seg = (struct seg *)ptag;
	struct	shm_data *shmd = (struct shm_data *)seg->s_data;
	struct	seg	*sptseg;
	struct	spt_data *sptd;
	pgcnt_t npages, i, free_availrmem = 0;
	int	done = 0;

#ifdef lint
	addr = addr;
#endif
	sptseg = shmd->shm_sptseg;
	sptd = sptseg->s_data;
	npages = (len >> PAGESHIFT);
	ASSERT(npages);
	ASSERT(sptd->spt_pcachecnt != 0);
	ASSERT(sptd->spt_ppa == pplist);
	ASSERT(npages == btopr(sptd->spt_amp->size));
	ASSERT(async || AS_LOCK_HELD(seg->s_as));

	/*
	 * Acquire the lock on the dummy seg and destroy the
	 * ppa array IF this is the last pcachecnt.
	 */
	mutex_enter(&sptd->spt_lock);
	if (--sptd->spt_pcachecnt == 0) {
		for (i = 0; i < npages; i++) {
			if (pplist[i] == NULL) {
				continue;
			}
			if (rw == S_WRITE) {
				hat_setrefmod(pplist[i]);
			} else {
				hat_setref(pplist[i]);
			}
			if ((sptd->spt_flags & SHM_PAGEABLE) &&
			    (sptd->spt_ppa_lckcnt[i] == 0))
				free_availrmem++;
			page_unlock(pplist[i]);
		}
		if ((sptd->spt_flags & SHM_PAGEABLE) && free_availrmem) {
			mutex_enter(&freemem_lock);
			availrmem += free_availrmem;
			mutex_exit(&freemem_lock);
		}
		/*
		 * Since we want to cache/uncache the entire ISM segment,
		 * we will track the pplist in a segspt specific field
		 * ppa, that is initialized at the time we add an entry to
		 * the cache.
		 */
		ASSERT(sptd->spt_pcachecnt == 0);
		kmem_free(pplist, sizeof (page_t *) * npages);
		sptd->spt_ppa = NULL;
		sptd->spt_flags &= ~DISM_PPA_CHANGED;
		sptd->spt_gen++;
		cv_broadcast(&sptd->spt_cv);
		done = 1;
	}
	mutex_exit(&sptd->spt_lock);

	/*
	 * If we are pcache async thread or called via seg_ppurge_wiredpp() we
	 * may not hold AS lock (in this case async argument is not 0). This
	 * means if softlockcnt drops to 0 after the decrement below address
	 * space may get freed. We can't allow it since after softlock
	 * decrement to 0 we still need to access as structure for possible
	 * wakeup of unmap waiters. To prevent the disappearance of as we take
	 * this segment's shm_segfree_syncmtx. segspt_shmfree() also takes
	 * this mutex as a barrier to make sure this routine completes before
	 * segment is freed.
	 *
	 * The second complication we have to deal with in async case is a
	 * possibility of missed wake up of unmap wait thread. When we don't
	 * hold as lock here we may take a_contents lock before unmap wait
	 * thread that was first to see softlockcnt was still not 0. As a
	 * result we'll fail to wake up an unmap wait thread. To avoid this
	 * race we set nounmapwait flag in as structure if we drop softlockcnt
	 * to 0 if async is not 0. unmapwait thread
	 * will not block if this flag is set.
	 */
	if (async)
		mutex_enter(&shmd->shm_segfree_syncmtx);

	/*
	 * Now decrement softlockcnt.
	 */
	ASSERT(shmd->shm_softlockcnt > 0);
	atomic_dec_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));

	if (shmd->shm_softlockcnt <= 0) {
		if (async || AS_ISUNMAPWAIT(seg->s_as)) {
			mutex_enter(&seg->s_as->a_contents);
			if (async)
				AS_SETNOUNMAPWAIT(seg->s_as);
			if (AS_ISUNMAPWAIT(seg->s_as)) {
				AS_CLRUNMAPWAIT(seg->s_as);
				cv_broadcast(&seg->s_as->a_cv);
			}
			mutex_exit(&seg->s_as->a_contents);
		}
	}

	if (async)
		mutex_exit(&shmd->shm_segfree_syncmtx);

	return (done);
}

/*
 * Do a F_SOFTUNLOCK call over the range requested.
 * The range must have already been F_SOFTLOCK'ed.
 *
 * The calls to acquire and release the anon map lock mutex were
 * removed in order to avoid a deadly embrace during a DR
 * memory delete operation. (E.g. DR blocks while waiting for an
 * exclusive lock on a page that is being used for kaio; the
 * thread that will complete the kaio and call segspt_softunlock
 * blocks on the anon map lock; another thread holding the anon
 * map lock blocks on another page lock via the segspt_shmfault
 * -> page_lookup -> page_lookup_create -> page_lock_es code flow.)
 *
 * The appropriateness of the removal is based upon the following:
 * 1. If we are holding a segment's reader lock and the page is held
 * shared, then the corresponding element in anonmap which points to
 * anon struct cannot change and there is no need to acquire the
 * anonymous map lock.
 * 2. Threads in segspt_softunlock have a reader lock on the segment
 * and already have the shared page lock, so we are guaranteed that
 * the anon map slot cannot change and therefore can call anon_get_ptr()
 * without grabbing the anonymous map lock.
 * 3. Threads that softlock a shared page break copy-on-write, even if
 * its a read. Thus cow faults can be ignored with respect to soft
 * unlocking, since the breaking of cow means that the anon slot(s) will
 * not be shared.
 */
static void
segspt_softunlock(struct seg *seg, caddr_t sptseg_addr,
    size_t len, enum seg_rw rw)
{
	struct shm_data *shmd = (struct shm_data *)seg->s_data;
	struct seg	*sptseg;
	struct spt_data *sptd;
	page_t *pp;
	caddr_t adr;
	struct vnode *vp;
	u_offset_t offset;
	ulong_t anon_index;
	struct anon_map *amp;		/* XXX - for locknest */
	struct anon *ap = NULL;
	pgcnt_t npages;

	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));

	sptseg = shmd->shm_sptseg;
	sptd = sptseg->s_data;

	/*
	 * Some platforms assume that ISM mappings are HAT_LOAD_LOCK
	 * and therefore their pages are SE_SHARED locked
	 * for the entire life of the segment.
	 */
	if ((!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) &&
	    ((sptd->spt_flags & SHM_PAGEABLE) == 0)) {
		goto softlock_decrement;
	}

	/*
	 * Any thread is free to do a page_find and
	 * page_unlock() on the pages within this seg.
	 *
	 * We are already holding the as->a_lock on the user's
	 * real segment, but we need to hold the a_lock on the
	 * underlying dummy as. This is mostly to satisfy the
	 * underlying HAT layer.
	 */
	AS_LOCK_ENTER(sptseg->s_as, RW_READER);
	hat_unlock(sptseg->s_as->a_hat, sptseg_addr, len);
	AS_LOCK_EXIT(sptseg->s_as);

	amp = sptd->spt_amp;
	ASSERT(amp != NULL);
	anon_index = seg_page(sptseg, sptseg_addr);

	for (adr = sptseg_addr; adr < sptseg_addr + len; adr += PAGESIZE) {
		ap = anon_get_ptr(amp->ahp, anon_index++);
		ASSERT(ap != NULL);
		swap_xlate(ap, &vp, &offset);

		/*
		 * Use page_find() instead of page_lookup() to
		 * find the page since we know that it has a
		 * "shared" lock.
		 */
		pp = page_find(vp, offset);
		ASSERT(ap == anon_get_ptr(amp->ahp, anon_index - 1));
		if (pp == NULL) {
			panic("segspt_softunlock: "
			    "addr %p, ap %p, vp %p, off %llx",
			    (void *)adr, (void *)ap, (void *)vp, offset);
			/*NOTREACHED*/
		}

		if (rw == S_WRITE) {
			hat_setrefmod(pp);
		} else if (rw != S_OTHER) {
			hat_setref(pp);
		}
		page_unlock(pp);
	}

softlock_decrement:
	npages = btopr(len);
	ASSERT(shmd->shm_softlockcnt >= npages);
	atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), -npages);
	if (shmd->shm_softlockcnt == 0) {
		/*
		 * All SOFTLOCKS are gone. Wakeup any waiting
		 * unmappers so they can try again to unmap.
		 * Check for waiters first without the mutex
		 * held so we don't always grab the mutex on
		 * softunlocks.
		 */
		if (AS_ISUNMAPWAIT(seg->s_as)) {
			mutex_enter(&seg->s_as->a_contents);
			if (AS_ISUNMAPWAIT(seg->s_as)) {
				AS_CLRUNMAPWAIT(seg->s_as);
				cv_broadcast(&seg->s_as->a_cv);
			}
			mutex_exit(&seg->s_as->a_contents);
		}
	}
}

int
segspt_shmattach(struct seg *seg, caddr_t *argsp)
{
	struct shm_data *shmd_arg = (struct shm_data *)argsp;
	struct shm_data *shmd;
	struct anon_map *shm_amp = shmd_arg->shm_amp;
	struct spt_data *sptd;
	int error = 0;

	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));

	shmd = kmem_zalloc((sizeof (*shmd)), KM_NOSLEEP);
	if (shmd == NULL)
		return (ENOMEM);

	shmd->shm_sptas = shmd_arg->shm_sptas;
	shmd->shm_amp = shm_amp;
	shmd->shm_sptseg = shmd_arg->shm_sptseg;

	(void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, shm_amp, 0,
	    NULL, 0, seg->s_size);

	mutex_init(&shmd->shm_segfree_syncmtx, NULL, MUTEX_DEFAULT, NULL);

	seg->s_data = (void *)shmd;
	seg->s_ops = &segspt_shmops;
	seg->s_szc = shmd->shm_sptseg->s_szc;
	sptd = shmd->shm_sptseg->s_data;

	if (sptd->spt_flags & SHM_PAGEABLE) {
		if ((shmd->shm_vpage = kmem_zalloc(btopr(shm_amp->size),
		    KM_NOSLEEP)) == NULL) {
			seg->s_data = (void *)NULL;
			kmem_free(shmd, (sizeof (*shmd)));
			return (ENOMEM);
		}
		shmd->shm_lckpgs = 0;
		if (hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
			if ((error = hat_share(seg->s_as->a_hat, seg->s_base,
			    shmd_arg->shm_sptas->a_hat, SEGSPTADDR,
			    seg->s_size, seg->s_szc)) != 0) {
				kmem_free(shmd->shm_vpage,
				    btopr(shm_amp->size));
			}
		}
	} else {
		error = hat_share(seg->s_as->a_hat, seg->s_base,
		    shmd_arg->shm_sptas->a_hat, SEGSPTADDR,
		    seg->s_size, seg->s_szc);
	}
	if (error) {
		seg->s_szc = 0;
		seg->s_data = (void *)NULL;
		kmem_free(shmd, (sizeof (*shmd)));
	} else {
		ANON_LOCK_ENTER(&shm_amp->a_rwlock, RW_WRITER);
		shm_amp->refcnt++;
		ANON_LOCK_EXIT(&shm_amp->a_rwlock);
	}
	return (error);
}

int
segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize)
{
	struct shm_data	*shmd = (struct shm_data *)seg->s_data;
	int reclaim = 1;

	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
retry:
	if (shmd->shm_softlockcnt > 0) {
		if (reclaim == 1) {
			segspt_purge(seg);
			reclaim = 0;
			goto retry;
		}
		return (EAGAIN);
	}

	if (ssize != seg->s_size) {
#ifdef DEBUG
		cmn_err(CE_WARN, "Incompatible ssize %lx s_size %lx\n",
		    ssize, seg->s_size);
#endif
		return (EINVAL);
	}

	(void) segspt_shmlockop(seg, raddr, shmd->shm_amp->size, 0, MC_UNLOCK,
	    NULL, 0);
	hat_unshare(seg->s_as->a_hat, raddr, ssize, seg->s_szc);

	seg_free(seg);

	return (0);
}

void
segspt_shmfree(struct seg *seg)
{
	struct shm_data	*shmd = (struct shm_data *)seg->s_data;
	struct anon_map *shm_amp = shmd->shm_amp;

	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));

	(void) segspt_shmlockop(seg, seg->s_base, shm_amp->size, 0,
	    MC_UNLOCK, NULL, 0);

	/*
	 * Need to increment refcnt when attaching
	 * and decrement when detaching because of dup().
	 */
	ANON_LOCK_ENTER(&shm_amp->a_rwlock, RW_WRITER);
	shm_amp->refcnt--;
	ANON_LOCK_EXIT(&shm_amp->a_rwlock);

	if (shmd->shm_vpage) {	/* only for DISM */
		kmem_free(shmd->shm_vpage, btopr(shm_amp->size));
		shmd->shm_vpage = NULL;
	}

	/*
	 * Take shm_segfree_syncmtx lock to let segspt_reclaim() finish if it's
	 * still working with this segment without holding as lock.
	 */
	ASSERT(shmd->shm_softlockcnt == 0);
	mutex_enter(&shmd->shm_segfree_syncmtx);
	mutex_destroy(&shmd->shm_segfree_syncmtx);

	kmem_free(shmd, sizeof (*shmd));
}

/*ARGSUSED*/
int
segspt_shmsetprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
{
	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));

	/*
	 * Shared page table is more than shared mapping.
	 * Individual process sharing page tables can't change prot
	 * because there is only one set of page tables.
	 * This will be allowed after private page table is
	 * supported.
	 */
	/* need to return correct status error? */
	return (0);
}


faultcode_t
segspt_dismfault(struct hat *hat, struct seg *seg, caddr_t addr,
    size_t len, enum fault_type type, enum seg_rw rw)
{
	struct shm_data *shmd = (struct shm_data *)seg->s_data;
	struct  seg	*sptseg = shmd->shm_sptseg;
	struct  as	*curspt = shmd->shm_sptas;
	struct  spt_data *sptd = sptseg->s_data;
	pgcnt_t npages;
	size_t  size;
	caddr_t segspt_addr, shm_addr;
	page_t  **ppa;
	int	i;
	ulong_t an_idx = 0;
	int	err = 0;
	int	dyn_ism_unmap = hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0);
	size_t	pgsz;
	pgcnt_t	pgcnt;
	caddr_t	a;
	pgcnt_t	pidx;

#ifdef lint
	hat = hat;
#endif
	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));

	/*
	 * Because of the way spt is implemented
	 * the realsize of the segment does not have to be
	 * equal to the segment size itself. The segment size is
	 * often in multiples of a page size larger than PAGESIZE.
	 * The realsize is rounded up to the nearest PAGESIZE
	 * based on what the user requested. This is a bit of
	 * ugliness that is historical but not easily fixed
	 * without re-designing the higher levels of ISM.
	 */
	ASSERT(addr >= seg->s_base);
	if (((addr + len) - seg->s_base) > sptd->spt_realsize)
		return (FC_NOMAP);
	/*
	 * For all of the following cases except F_PROT, we need to
	 * make any necessary adjustments to addr and len
	 * and get all of the necessary page_t's into an array called ppa[].
	 *
	 * The code in shmat() forces base addr and len of ISM segment
	 * to be aligned to largest page size supported. Therefore,
	 * we are able to handle F_SOFTLOCK and F_INVAL calls in "large
	 * pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK
	 * in large pagesize chunks, or else we will screw up the HAT
	 * layer by calling hat_memload_array() with differing page sizes
	 * over a given virtual range.
	 */
	pgsz = page_get_pagesize(sptseg->s_szc);
	pgcnt = page_get_pagecnt(sptseg->s_szc);
	shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz);
	size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), pgsz);
	npages = btopr(size);
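
	/*
	 * Worked example of the rounding above (illustrative, assuming a
	 * 4M underlying page size and a 4M-aligned seg->s_base as set up
	 * by shmat()): a fault at addr = seg->s_base + 0x403000 with
	 * len = 0x2000 yields shm_addr = seg->s_base + 0x400000 and
	 * size = 0x400000 (npages covering one full 4M chunk), so the
	 * entire large page containing the faulting range is handled at
	 * once.
	 */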

	/*
	 * Now we need to convert from addr in segshm to addr in segspt.
	 */
	an_idx = seg_page(seg, shm_addr);
	segspt_addr = sptseg->s_base + ptob(an_idx);

	ASSERT((segspt_addr + ptob(npages)) <=
	    (sptseg->s_base + sptd->spt_realsize));
	ASSERT(segspt_addr < (sptseg->s_base + sptseg->s_size));

	switch (type) {

	case F_SOFTLOCK:

		atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages);
		/*
		 * Fall through to the F_INVAL case to load up the hat layer
		 * entries with the HAT_LOAD_LOCK flag.
		 */
		/* FALLTHRU */
	case F_INVAL:

		if ((rw == S_EXEC) && !(sptd->spt_prot & PROT_EXEC))
			return (FC_NOMAP);

		ppa = kmem_zalloc(npages * sizeof (page_t *), KM_SLEEP);

		err = spt_anon_getpages(sptseg, segspt_addr, size, ppa);
		if (err != 0) {
			if (type == F_SOFTLOCK) {
				atomic_add_long((ulong_t *)(
				    &(shmd->shm_softlockcnt)), -npages);
			}
			goto dism_err;
		}
		AS_LOCK_ENTER(sptseg->s_as, RW_READER);
		a = segspt_addr;
		pidx = 0;
		if (type == F_SOFTLOCK) {

			/*
			 * Load up the translation keeping it
			 * locked and don't unlock the page.
			 */
			for (; pidx < npages; a += pgsz, pidx += pgcnt) {
				hat_memload_array(sptseg->s_as->a_hat,
				    a, pgsz, &ppa[pidx], sptd->spt_prot,
				    HAT_LOAD_LOCK | HAT_LOAD_SHARE);
			}
		} else {
			/*
			 * Migrate pages marked for migration
			 */
			if (lgrp_optimizations())
				page_migrate(seg, shm_addr, ppa, npages);

			for (; pidx < npages; a += pgsz, pidx += pgcnt) {
				hat_memload_array(sptseg->s_as->a_hat,
				    a, pgsz, &ppa[pidx],
				    sptd->spt_prot,
				    HAT_LOAD_SHARE);
			}

			/*
			 * And now drop the SE_SHARED lock(s).
			 */
			if (dyn_ism_unmap) {
				for (i = 0; i < npages; i++) {
					page_unlock(ppa[i]);
				}
			}
		}

		if (!dyn_ism_unmap) {
			if (hat_share(seg->s_as->a_hat, shm_addr,
			    curspt->a_hat, segspt_addr, ptob(npages),
			    seg->s_szc) != 0) {
				panic("hat_share err in DISM fault");
				/* NOTREACHED */
			}
			if (type == F_INVAL) {
				for (i = 0; i < npages; i++) {
					page_unlock(ppa[i]);
				}
			}
		}
		AS_LOCK_EXIT(sptseg->s_as);
dism_err:
		kmem_free(ppa, npages * sizeof (page_t *));
		return (err);

	case F_SOFTUNLOCK:

		/*
		 * This is a bit ugly, we pass in the real seg pointer,
		 * but the segspt_addr is the virtual address within the
		 * dummy seg.
		 */
		segspt_softunlock(seg, segspt_addr, size, rw);
		return (0);

	case F_PROT:

		/*
		 * This takes care of the unusual case where a user
		 * allocates a stack in shared memory and a register
		 * window overflow is written to that stack page before
		 * it is otherwise modified.
		 *
		 * We can get away with this because ISM segments are
		 * always rw. Other than this unusual case, there
		 * should be no instances of protection violations.
1992 */ 1993 return (0); 1994 1995 default: 1996 #ifdef DEBUG 1997 panic("segspt_dismfault default type?"); 1998 #else 1999 return (FC_NOMAP); 2000 #endif 2001 } 2002 } 2003 2004 2005 faultcode_t 2006 segspt_shmfault(struct hat *hat, struct seg *seg, caddr_t addr, 2007 size_t len, enum fault_type type, enum seg_rw rw) 2008 { 2009 struct shm_data *shmd = (struct shm_data *)seg->s_data; 2010 struct seg *sptseg = shmd->shm_sptseg; 2011 struct as *curspt = shmd->shm_sptas; 2012 struct spt_data *sptd = sptseg->s_data; 2013 pgcnt_t npages; 2014 size_t size; 2015 caddr_t sptseg_addr, shm_addr; 2016 page_t *pp, **ppa; 2017 int i; 2018 u_offset_t offset; 2019 ulong_t anon_index = 0; 2020 struct vnode *vp; 2021 struct anon_map *amp; /* XXX - for locknest */ 2022 struct anon *ap = NULL; 2023 size_t pgsz; 2024 pgcnt_t pgcnt; 2025 caddr_t a; 2026 pgcnt_t pidx; 2027 size_t sz; 2028 2029 #ifdef lint 2030 hat = hat; 2031 #endif 2032 2033 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); 2034 2035 if (sptd->spt_flags & SHM_PAGEABLE) { 2036 return (segspt_dismfault(hat, seg, addr, len, type, rw)); 2037 } 2038 2039 /* 2040 * Because of the way spt is implemented, 2041 * the realsize of the segment does not have to be 2042 * equal to the segment size itself. The segment size is 2043 * often in multiples of a page size larger than PAGESIZE. 2044 * The realsize is rounded up to the nearest PAGESIZE 2045 * based on what the user requested. This is a bit of 2046 * ugliness that is historical but not easily fixed 2047 * without re-designing the higher levels of ISM. 2048 */ 2049 ASSERT(addr >= seg->s_base); 2050 if (((addr + len) - seg->s_base) > sptd->spt_realsize) 2051 return (FC_NOMAP); 2052 /* 2053 * For all of the following cases except F_PROT, we need to 2054 * make any necessary adjustments to addr and len 2055 * and get all of the necessary page_t's into an array called ppa[]. 2056 * 2057 * The code in shmat() forces base addr and len of ISM segment 2058 * to be aligned to largest page size supported. Therefore, 2059 * we are able to handle F_SOFTLOCK and F_INVAL calls in "large 2060 * pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK 2061 * in large pagesize chunks, or else we will screw up the HAT 2062 * layer by calling hat_memload_array() with differing page sizes 2063 * over a given virtual range. 2064 */ 2065 pgsz = page_get_pagesize(sptseg->s_szc); 2066 pgcnt = page_get_pagecnt(sptseg->s_szc); 2067 shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz); 2068 size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), pgsz); 2069 npages = btopr(size); 2070 2071 /* 2072 * Now we need to convert from addr in segshm to addr in segspt. 2073 */ 2074 anon_index = seg_page(seg, shm_addr); 2075 sptseg_addr = sptseg->s_base + ptob(anon_index); 2076 2077 /* 2078 * And now we may have to adjust npages downward if we have 2079 * exceeded the realsize of the segment or initial anon 2080 * allocations. 2081 */ 2082 if ((sptseg_addr + ptob(npages)) > 2083 (sptseg->s_base + sptd->spt_realsize)) 2084 size = (sptseg->s_base + sptd->spt_realsize) - sptseg_addr; 2085 2086 npages = btopr(size); 2087 2088 ASSERT(sptseg_addr < (sptseg->s_base + sptseg->s_size)); 2089 ASSERT((sptd->spt_flags & SHM_PAGEABLE) == 0); 2090 2091 switch (type) { 2092 2093 case F_SOFTLOCK: 2094 2095 /* 2096 * availrmem is decremented once during anon_swap_adjust() 2097 * and is incremented during the anon_unresv(), which is 2098 * called from shm_rm_amp() when the segment is destroyed.
2099 */ 2100 atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages); 2101 /* 2102 * Some platforms assume that ISM pages are SE_SHARED 2103 * locked for the entire life of the segment. 2104 */ 2105 if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) 2106 return (0); 2107 /* 2108 * Fall through to the F_INVAL case to load up the hat layer 2109 * entries with the HAT_LOAD_LOCK flag. 2110 */ 2111 2112 /* FALLTHRU */ 2113 case F_INVAL: 2114 2115 if ((rw == S_EXEC) && !(sptd->spt_prot & PROT_EXEC)) 2116 return (FC_NOMAP); 2117 2118 /* 2119 * Some platforms that do NOT support DYNAMIC_ISM_UNMAP 2120 * may still rely on this call to hat_share(). That 2121 * would imply that those hats can fault on a 2122 * HAT_LOAD_LOCK translation, which would seem 2123 * contradictory. 2124 */ 2125 if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) { 2126 if (hat_share(seg->s_as->a_hat, seg->s_base, 2127 curspt->a_hat, sptseg->s_base, 2128 sptseg->s_size, sptseg->s_szc) != 0) { 2129 panic("hat_share error in ISM fault"); 2130 /*NOTREACHED*/ 2131 } 2132 return (0); 2133 } 2134 ppa = kmem_zalloc(sizeof (page_t *) * npages, KM_SLEEP); 2135 2136 /* 2137 * I see no need to lock the real seg 2138 * here, because all of our work will be on the underlying 2139 * dummy seg. 2140 * 2141 * sptseg_addr and npages now account for large pages. 2142 */ 2143 amp = sptd->spt_amp; 2144 ASSERT(amp != NULL); 2145 anon_index = seg_page(sptseg, sptseg_addr); 2146 2147 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER); 2148 for (i = 0; i < npages; i++) { 2149 ap = anon_get_ptr(amp->ahp, anon_index++); 2150 ASSERT(ap != NULL); 2151 swap_xlate(ap, &vp, &offset); 2152 pp = page_lookup(vp, offset, SE_SHARED); 2153 ASSERT(pp != NULL); 2154 ppa[i] = pp; 2155 } 2156 ANON_LOCK_EXIT(&amp->a_rwlock); 2157 ASSERT(i == npages); 2158 2159 /* 2160 * We are already holding the as->a_lock on the user's 2161 * real segment, but we need to hold the a_lock on the 2162 * underlying dummy as. This is mostly to satisfy the 2163 * underlying HAT layer. 2164 */ 2165 AS_LOCK_ENTER(sptseg->s_as, RW_READER); 2166 a = sptseg_addr; 2167 pidx = 0; 2168 if (type == F_SOFTLOCK) { 2169 /* 2170 * Load up the translation keeping it 2171 * locked and don't unlock the page. 2172 */ 2173 for (; pidx < npages; a += pgsz, pidx += pgcnt) { 2174 sz = MIN(pgsz, ptob(npages - pidx)); 2175 hat_memload_array(sptseg->s_as->a_hat, a, 2176 sz, &ppa[pidx], sptd->spt_prot, 2177 HAT_LOAD_LOCK | HAT_LOAD_SHARE); 2178 } 2179 } else { 2180 /* 2181 * Migrate pages marked for migration. 2182 */ 2183 if (lgrp_optimizations()) 2184 page_migrate(seg, shm_addr, ppa, npages); 2185 2186 for (; pidx < npages; a += pgsz, pidx += pgcnt) { 2187 sz = MIN(pgsz, ptob(npages - pidx)); 2188 hat_memload_array(sptseg->s_as->a_hat, 2189 a, sz, &ppa[pidx], 2190 sptd->spt_prot, HAT_LOAD_SHARE); 2191 } 2192 2193 /* 2194 * And now drop the SE_SHARED lock(s). 2195 */ 2196 for (i = 0; i < npages; i++) 2197 page_unlock(ppa[i]); 2198 } 2199 AS_LOCK_EXIT(sptseg->s_as); 2200 2201 kmem_free(ppa, sizeof (page_t *) * npages); 2202 return (0); 2203 case F_SOFTUNLOCK: 2204 2205 /* 2206 * This is a bit ugly, we pass in the real seg pointer, 2207 * but the sptseg_addr is the virtual address within the 2208 * dummy seg.
2209 */ 2210 segspt_softunlock(seg, sptseg_addr, ptob(npages), rw); 2211 return (0); 2212 2213 case F_PROT: 2214 2215 /* 2216 * This takes care of the unusual case where a user 2217 * allocates a stack in shared memory and a register 2218 * window overflow is written to that stack page before 2219 * it is otherwise modified. 2220 * 2221 * We can get away with this because ISM segments are 2222 * always rw. Other than this unusual case, there 2223 * should be no instances of protection violations. 2224 */ 2225 return (0); 2226 2227 default: 2228 #ifdef DEBUG 2229 cmn_err(CE_WARN, "segspt_shmfault default type?"); 2230 #endif 2231 return (FC_NOMAP); 2232 } 2233 } 2234 2235 /*ARGSUSED*/ 2236 static faultcode_t 2237 segspt_shmfaulta(struct seg *seg, caddr_t addr) 2238 { 2239 return (0); 2240 } 2241 2242 /*ARGSUSED*/ 2243 static int 2244 segspt_shmkluster(struct seg *seg, caddr_t addr, ssize_t delta) 2245 { 2246 return (0); 2247 } 2248 2249 /*ARGSUSED*/ 2250 static size_t 2251 segspt_shmswapout(struct seg *seg) 2252 { 2253 return (0); 2254 } 2255 2256 /* 2257 * duplicate the shared page tables 2258 */ 2259 int 2260 segspt_shmdup(struct seg *seg, struct seg *newseg) 2261 { 2262 struct shm_data *shmd = (struct shm_data *)seg->s_data; 2263 struct anon_map *amp = shmd->shm_amp; 2264 struct shm_data *shmd_new; 2265 struct seg *spt_seg = shmd->shm_sptseg; 2266 struct spt_data *sptd = spt_seg->s_data; 2267 int error = 0; 2268 2269 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as)); 2270 2271 shmd_new = kmem_zalloc((sizeof (*shmd_new)), KM_SLEEP); 2272 newseg->s_data = (void *)shmd_new; 2273 shmd_new->shm_sptas = shmd->shm_sptas; 2274 shmd_new->shm_amp = amp; 2275 shmd_new->shm_sptseg = shmd->shm_sptseg; 2276 newseg->s_ops = &segspt_shmops; 2277 newseg->s_szc = seg->s_szc; 2278 ASSERT(seg->s_szc == shmd->shm_sptseg->s_szc); 2279 2280 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER); 2281 amp->refcnt++; 2282 ANON_LOCK_EXIT(&amp->a_rwlock); 2283 2284 if (sptd->spt_flags & SHM_PAGEABLE) { 2285 shmd_new->shm_vpage = kmem_zalloc(btopr(amp->size), KM_SLEEP); 2286 shmd_new->shm_lckpgs = 0; 2287 if (hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) { 2288 if ((error = hat_share(newseg->s_as->a_hat, 2289 newseg->s_base, shmd->shm_sptas->a_hat, SEGSPTADDR, 2290 seg->s_size, seg->s_szc)) != 0) { 2291 kmem_free(shmd_new->shm_vpage, 2292 btopr(amp->size)); 2293 } 2294 } 2295 return (error); 2296 } else { 2297 return (hat_share(newseg->s_as->a_hat, newseg->s_base, 2298 shmd->shm_sptas->a_hat, SEGSPTADDR, seg->s_size, 2299 seg->s_szc)); 2300 2301 } 2302 } 2303 2304 /*ARGSUSED*/ 2305 int 2306 segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size, uint_t prot) 2307 { 2308 struct shm_data *shmd = (struct shm_data *)seg->s_data; 2309 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data; 2310 2311 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); 2312 2313 /* 2314 * ISM segment is always rw. 2315 */ 2316 return (((sptd->spt_prot & prot) != prot) ? EACCES : 0); 2317 } 2318
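/*
 * Illustrative sketch only: the SEGSPT_EXAMPLE_SKETCHES guard and the
 * function name below are hypothetical and never defined, so this block is
 * not compiled.  It isolates the ppa[] handling pattern that
 * segspt_dismfault() and segspt_shmlockop() wrap around spt_anon_getpages():
 * allocate one page_t pointer per PAGESIZE page covered by a large-page
 * aligned range, fetch the locked pages, then drop the SE_SHARED page locks
 * and free the array.  Error handling is reduced to the minimum.
 */
#ifdef SEGSPT_EXAMPLE_SKETCHES
static int
segspt_example_getpages(struct seg *sptseg, caddr_t sptaddr, size_t len)
{
        pgcnt_t npages = btopr(len);    /* one page_t pointer per base page */
        page_t **ppa;
        pgcnt_t i;
        int err;

        ppa = kmem_zalloc(npages * sizeof (page_t *), KM_SLEEP);
        err = spt_anon_getpages(sptseg, sptaddr, len, ppa);
        if (err == 0) {
                /* done with the pages; drop the SE_SHARED locks */
                for (i = 0; i < npages; i++)
                        page_unlock(ppa[i]);
        }
        kmem_free(ppa, npages * sizeof (page_t *));
        return (err);
}
#endif  /* SEGSPT_EXAMPLE_SKETCHES */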
2319 /* 2320 * Return an array of locked large pages, for empty slots allocate 2321 * private zero-filled anon pages. 2322 */ 2323 static int 2324 spt_anon_getpages( 2325 struct seg *sptseg, 2326 caddr_t sptaddr, 2327 size_t len, 2328 page_t *ppa[]) 2329 { 2330 struct spt_data *sptd = sptseg->s_data; 2331 struct anon_map *amp = sptd->spt_amp; 2332 enum seg_rw rw = sptd->spt_prot; 2333 uint_t szc = sptseg->s_szc; 2334 size_t pg_sz, share_sz = page_get_pagesize(szc); 2335 pgcnt_t lp_npgs; 2336 caddr_t lp_addr, e_sptaddr; 2337 uint_t vpprot, ppa_szc = 0; 2338 struct vpage *vpage = NULL; 2339 ulong_t j, ppa_idx; 2340 int err, ierr = 0; 2341 pgcnt_t an_idx; 2342 anon_sync_obj_t cookie; 2343 int anon_locked = 0; 2344 pgcnt_t amp_pgs; 2345 2346 2347 ASSERT(IS_P2ALIGNED(sptaddr, share_sz) && IS_P2ALIGNED(len, share_sz)); 2348 ASSERT(len != 0); 2349 2350 pg_sz = share_sz; 2351 lp_npgs = btop(pg_sz); 2352 lp_addr = sptaddr; 2353 e_sptaddr = sptaddr + len; 2354 an_idx = seg_page(sptseg, sptaddr); 2355 ppa_idx = 0; 2356 2357 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER); 2358 2359 amp_pgs = page_get_pagecnt(amp->a_szc); 2360 2361 /*CONSTCOND*/ 2362 while (1) { 2363 for (; lp_addr < e_sptaddr; 2364 an_idx += lp_npgs, lp_addr += pg_sz, ppa_idx += lp_npgs) { 2365 2366 /* 2367 * If we're currently locked, and we get to a new 2368 * page, unlock our current anon chunk. 2369 */ 2370 if (anon_locked && P2PHASE(an_idx, amp_pgs) == 0) { 2371 anon_array_exit(&cookie); 2372 anon_locked = 0; 2373 } 2374 if (!anon_locked) { 2375 anon_array_enter(amp, an_idx, &cookie); 2376 anon_locked = 1; 2377 } 2378 ppa_szc = (uint_t)-1; 2379 ierr = anon_map_getpages(amp, an_idx, szc, sptseg, 2380 lp_addr, sptd->spt_prot, &vpprot, &ppa[ppa_idx], 2381 &ppa_szc, vpage, rw, 0, segvn_anypgsz, 0, kcred); 2382 2383 if (ierr != 0) { 2384 if (ierr > 0) { 2385 err = FC_MAKE_ERR(ierr); 2386 goto lpgs_err; 2387 } 2388 break; 2389 } 2390 } 2391 if (lp_addr == e_sptaddr) { 2392 break; 2393 } 2394 ASSERT(lp_addr < e_sptaddr); 2395 2396 /* 2397 * ierr == -1 means we failed to allocate a large page. 2398 * So do a size down operation. 2399 * 2400 * ierr == -2 means some other process that privately shares 2401 * pages with this process has allocated a larger page and we 2402 * need to retry with larger pages. So do a size up 2403 * operation. This relies on the fact that large pages are 2404 * never partially shared i.e. if we share any constituent 2405 * page of a large page with another process we must share the 2406 * entire large page. Note this cannot happen for SOFTLOCK 2407 * case, unless current address (lpaddr) is at the beginning 2408 * of the next page size boundary because the other process 2409 * couldn't have relocated locked pages. 2410 */ 2411 ASSERT(ierr == -1 || ierr == -2); 2412 if (segvn_anypgsz) { 2413 ASSERT(ierr == -2 || szc != 0); 2414 ASSERT(ierr == -1 || szc < sptseg->s_szc); 2415 szc = (ierr == -1) ? szc - 1 : szc + 1; 2416 } else { 2417 /* 2418 * For faults and segvn_anypgsz == 0 2419 * we need to be careful not to loop forever 2420 * if existing page is found with szc other 2421 * than 0 or seg->s_szc. This could be due 2422 * to page relocations on behalf of DR or 2423 * more likely large page creation. For this 2424 * case simply re-size to existing page's szc 2425 * if returned by anon_map_getpages(). 2426 */ 2427 if (ppa_szc == (uint_t)-1) { 2428 szc = (ierr == -1) ?
0 : sptseg->s_szc; 2429 } else { 2430 ASSERT(ppa_szc <= sptseg->s_szc); 2431 ASSERT(ierr == -2 || ppa_szc < szc); 2432 ASSERT(ierr == -1 || ppa_szc > szc); 2433 szc = ppa_szc; 2434 } 2435 } 2436 pg_sz = page_get_pagesize(szc); 2437 lp_npgs = btop(pg_sz); 2438 ASSERT(IS_P2ALIGNED(lp_addr, pg_sz)); 2439 } 2440 if (anon_locked) { 2441 anon_array_exit(&cookie); 2442 } 2443 ANON_LOCK_EXIT(&amp->a_rwlock); 2444 return (0); 2445 2446 lpgs_err: 2447 if (anon_locked) { 2448 anon_array_exit(&cookie); 2449 } 2450 ANON_LOCK_EXIT(&amp->a_rwlock); 2451 for (j = 0; j < ppa_idx; j++) 2452 page_unlock(ppa[j]); 2453 return (err); 2454 } 2455 2456 /* 2457 * count the number of bytes in a set of spt pages that are currently not 2458 * locked 2459 */ 2460 static rctl_qty_t 2461 spt_unlockedbytes(pgcnt_t npages, page_t **ppa) 2462 { 2463 ulong_t i; 2464 rctl_qty_t unlocked = 0; 2465 2466 for (i = 0; i < npages; i++) { 2467 if (ppa[i]->p_lckcnt == 0) 2468 unlocked += PAGESIZE; 2469 } 2470 return (unlocked); 2471 } 2472 2473 extern u_longlong_t randtick(void); 2474 /* number of locks to reserve/skip by spt_lockpages() and spt_unlockpages() */ 2475 #define NLCK (NCPU_P2) 2476 /* Random number with a range [0, n-1], n must be power of two */ 2477 #define RAND_P2(n) \ 2478 ((((long)curthread >> PTR24_LSB) ^ (long)randtick()) & ((n) - 1)) 2479
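/*
 * Illustrative sketch only: the SEGSPT_EXAMPLE_SKETCHES guard and the
 * function name below are hypothetical and never defined, so this block is
 * not compiled.  It isolates the randomized lock-ahead pattern that
 * spt_lockpages() uses to keep freemem_lock contention down: reserve a
 * randomized batch of page-lock credits against availrmem up front, fall
 * back to unreserved locking when memory is low, and let the caller hand
 * any unused credits back in a single final update.
 */
#ifdef SEGSPT_EXAMPLE_SKETCHES
static pgcnt_t
segspt_example_lock_ahead(pgcnt_t remaining)
{
        pgcnt_t nlck = NLCK + RAND_P2(NLCK);    /* randomized batch size */

        nlck = MIN(nlck, remaining);            /* never reserve more than needed */
        mutex_enter(&freemem_lock);
        if ((availrmem - nlck) < pages_pp_maximum) {
                nlck = 0;                       /* memory is low: no lock-ahead */
        } else {
                availrmem -= nlck;
                pages_locked += nlck;
        }
        mutex_exit(&freemem_lock);
        return (nlck);                          /* credits the caller may consume */
}
#endif  /* SEGSPT_EXAMPLE_SKETCHES */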
2480 int 2481 spt_lockpages(struct seg *seg, pgcnt_t anon_index, pgcnt_t npages, 2482 page_t **ppa, ulong_t *lockmap, size_t pos, 2483 rctl_qty_t *locked) 2484 { 2485 struct shm_data *shmd = seg->s_data; 2486 struct spt_data *sptd = shmd->shm_sptseg->s_data; 2487 ulong_t i; 2488 int kernel; 2489 pgcnt_t nlck = 0; 2490 int rv = 0; 2491 int use_reserved = 1; 2492 2493 /* return the number of bytes actually locked */ 2494 *locked = 0; 2495 2496 /* 2497 * To avoid contention on freemem_lock, availrmem and pages_locked 2498 * global counters are updated only every nlck locked pages instead of 2499 * every time. Reserve nlck locks up front and deduct from this 2500 * reservation for each page that requires a lock. When the reservation 2501 * is consumed, reserve again. nlck is randomized, so the competing 2502 * threads do not fall into a cyclic lock contention pattern. When 2503 * memory is low, the lock ahead is disabled, and instead page_pp_lock() 2504 * is used to lock pages. 2505 */ 2506 for (i = 0; i < npages; anon_index++, pos++, i++) { 2507 if (nlck == 0 && use_reserved == 1) { 2508 nlck = NLCK + RAND_P2(NLCK); 2509 /* if fewer loops left, decrease nlck */ 2510 nlck = MIN(nlck, npages - i); 2511 /* 2512 * Reserve nlck locks up front and deduct from this 2513 * reservation for each page that requires a lock. When 2514 * the reservation is consumed, reserve again. 2515 */ 2516 mutex_enter(&freemem_lock); 2517 if ((availrmem - nlck) < pages_pp_maximum) { 2518 /* Do not do advance memory reserves */ 2519 use_reserved = 0; 2520 } else { 2521 availrmem -= nlck; 2522 pages_locked += nlck; 2523 } 2524 mutex_exit(&freemem_lock); 2525 } 2526 if (!(shmd->shm_vpage[anon_index] & DISM_PG_LOCKED)) { 2527 if (sptd->spt_ppa_lckcnt[anon_index] < 2528 (ushort_t)DISM_LOCK_MAX) { 2529 if (++sptd->spt_ppa_lckcnt[anon_index] == 2530 (ushort_t)DISM_LOCK_MAX) { 2531 cmn_err(CE_WARN, 2532 "DISM page lock limit " 2533 "reached on DISM offset 0x%lx\n", 2534 anon_index << PAGESHIFT); 2535 } 2536 kernel = (sptd->spt_ppa && 2537 sptd->spt_ppa[anon_index]); 2538 if (!page_pp_lock(ppa[i], 0, kernel || 2539 use_reserved)) { 2540 sptd->spt_ppa_lckcnt[anon_index]--; 2541 rv = EAGAIN; 2542 break; 2543 } 2544 /* if this is a newly locked page, count it */ 2545 if (ppa[i]->p_lckcnt == 1) { 2546 if (kernel == 0 && use_reserved == 1) 2547 nlck--; 2548 *locked += PAGESIZE; 2549 } 2550 shmd->shm_lckpgs++; 2551 shmd->shm_vpage[anon_index] |= DISM_PG_LOCKED; 2552 if (lockmap != NULL) 2553 BT_SET(lockmap, pos); 2554 } 2555 } 2556 } 2557 /* Return unused lock reservation */ 2558 if (nlck != 0 && use_reserved == 1) { 2559 mutex_enter(&freemem_lock); 2560 availrmem += nlck; 2561 pages_locked -= nlck; 2562 mutex_exit(&freemem_lock); 2563 } 2564 2565 return (rv); 2566 } 2567 2568 int 2569 spt_unlockpages(struct seg *seg, pgcnt_t anon_index, pgcnt_t npages, 2570 rctl_qty_t *unlocked) 2571 { 2572 struct shm_data *shmd = seg->s_data; 2573 struct spt_data *sptd = shmd->shm_sptseg->s_data; 2574 struct anon_map *amp = sptd->spt_amp; 2575 struct anon *ap; 2576 struct vnode *vp; 2577 u_offset_t off; 2578 struct page *pp; 2579 int kernel; 2580 anon_sync_obj_t cookie; 2581 ulong_t i; 2582 pgcnt_t nlck = 0; 2583 pgcnt_t nlck_limit = NLCK; 2584 2585 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER); 2586 for (i = 0; i < npages; i++, anon_index++) { 2587 if (shmd->shm_vpage[anon_index] & DISM_PG_LOCKED) { 2588 anon_array_enter(amp, anon_index, &cookie); 2589 ap = anon_get_ptr(amp->ahp, anon_index); 2590 ASSERT(ap); 2591 2592 swap_xlate(ap, &vp, &off); 2593 anon_array_exit(&cookie); 2594 pp = page_lookup(vp, off, SE_SHARED); 2595 ASSERT(pp); 2596 /* 2597 * availrmem is decremented only for pages which are not 2598 * in seg pcache; for pages in seg pcache availrmem was 2599 * decremented in _dismpagelock() 2600 */ 2601 kernel = (sptd->spt_ppa && sptd->spt_ppa[anon_index]); 2602 ASSERT(pp->p_lckcnt > 0); 2603 2604 /* 2605 * unlock page but do not change availrmem, we do it 2606 * ourselves every nlck loops. 2607 */ 2608 page_pp_unlock(pp, 0, 1); 2609 if (pp->p_lckcnt == 0) { 2610 if (kernel == 0) 2611 nlck++; 2612 *unlocked += PAGESIZE; 2613 } 2614 page_unlock(pp); 2615 shmd->shm_vpage[anon_index] &= ~DISM_PG_LOCKED; 2616 sptd->spt_ppa_lckcnt[anon_index]--; 2617 shmd->shm_lckpgs--; 2618 } 2619 2620 /* 2621 * To reduce freemem_lock contention, do not update availrmem 2622 * until at least NLCK pages have been unlocked. 2623 * 1. No need to update if nlck is zero 2624 * 2.
Always update on the last iteration 2625 */ 2626 if (nlck > 0 && (nlck == nlck_limit || i == npages - 1)) { 2627 mutex_enter(&freemem_lock); 2628 availrmem += nlck; 2629 pages_locked -= nlck; 2630 mutex_exit(&freemem_lock); 2631 nlck = 0; 2632 nlck_limit = NLCK + RAND_P2(NLCK); 2633 } 2634 } 2635 ANON_LOCK_EXIT(&amp->a_rwlock); 2636 2637 return (0); 2638 } 2639 2640 /*ARGSUSED*/ 2641 static int 2642 segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len, 2643 int attr, int op, ulong_t *lockmap, size_t pos) 2644 { 2645 struct shm_data *shmd = seg->s_data; 2646 struct seg *sptseg = shmd->shm_sptseg; 2647 struct spt_data *sptd = sptseg->s_data; 2648 struct kshmid *sp = sptd->spt_amp->a_sp; 2649 pgcnt_t npages, a_npages; 2650 page_t **ppa; 2651 pgcnt_t an_idx, a_an_idx, ppa_idx; 2652 caddr_t spt_addr, a_addr; /* spt and aligned address */ 2653 size_t a_len; /* aligned len */ 2654 size_t share_sz; 2655 ulong_t i; 2656 int sts = 0; 2657 rctl_qty_t unlocked = 0; 2658 rctl_qty_t locked = 0; 2659 struct proc *p = curproc; 2660 kproject_t *proj; 2661 2662 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); 2663 ASSERT(sp != NULL); 2664 2665 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) { 2666 return (0); 2667 } 2668 2669 addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 2670 an_idx = seg_page(seg, addr); 2671 npages = btopr(len); 2672 2673 if (an_idx + npages > btopr(shmd->shm_amp->size)) { 2674 return (ENOMEM); 2675 } 2676 2677 /* 2678 * A shm's project never changes, so no lock needed. 2679 * The shm has a hold on the project, so it will not go away. 2680 * Since we have a mapping to shm within this zone, we know 2681 * that the zone will not go away. 2682 */ 2683 proj = sp->shm_perm.ipc_proj; 2684 2685 if (op == MC_LOCK) { 2686 2687 /* 2688 * Need to align addr and size request if they are not 2689 * aligned so we can always allocate large page(s); however, 2690 * we only lock what was requested in the initial request. 2691 */ 2692 share_sz = page_get_pagesize(sptseg->s_szc); 2693 a_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_sz); 2694 a_len = P2ROUNDUP((uintptr_t)(((addr + len) - a_addr)), 2695 share_sz); 2696 a_npages = btop(a_len); 2697 a_an_idx = seg_page(seg, a_addr); 2698 spt_addr = sptseg->s_base + ptob(a_an_idx); 2699 ppa_idx = an_idx - a_an_idx; 2700 2701 if ((ppa = kmem_zalloc(((sizeof (page_t *)) * a_npages), 2702 KM_NOSLEEP)) == NULL) { 2703 return (ENOMEM); 2704 } 2705 2706 /* 2707 * Don't cache any new pages for IO and 2708 * flush any cached pages.
2709 */ 2710 mutex_enter(&sptd->spt_lock); 2711 if (sptd->spt_ppa != NULL) 2712 sptd->spt_flags |= DISM_PPA_CHANGED; 2713 2714 sts = spt_anon_getpages(sptseg, spt_addr, a_len, ppa); 2715 if (sts != 0) { 2716 mutex_exit(&sptd->spt_lock); 2717 kmem_free(ppa, ((sizeof (page_t *)) * a_npages)); 2718 return (sts); 2719 } 2720 2721 mutex_enter(&sp->shm_mlock); 2722 /* enforce locked memory rctl */ 2723 unlocked = spt_unlockedbytes(npages, &ppa[ppa_idx]); 2724 2725 mutex_enter(&p->p_lock); 2726 if (rctl_incr_locked_mem(p, proj, unlocked, 0)) { 2727 mutex_exit(&p->p_lock); 2728 sts = EAGAIN; 2729 } else { 2730 mutex_exit(&p->p_lock); 2731 sts = spt_lockpages(seg, an_idx, npages, 2732 &ppa[ppa_idx], lockmap, pos, &locked); 2733 2734 /* 2735 * correct locked count if not all pages could be 2736 * locked 2737 */ 2738 if ((unlocked - locked) > 0) { 2739 rctl_decr_locked_mem(NULL, proj, 2740 (unlocked - locked), 0); 2741 } 2742 } 2743 /* 2744 * unlock pages 2745 */ 2746 for (i = 0; i < a_npages; i++) 2747 page_unlock(ppa[i]); 2748 if (sptd->spt_ppa != NULL) 2749 sptd->spt_flags |= DISM_PPA_CHANGED; 2750 mutex_exit(&sp->shm_mlock); 2751 mutex_exit(&sptd->spt_lock); 2752 2753 kmem_free(ppa, ((sizeof (page_t *)) * a_npages)); 2754 2755 } else if (op == MC_UNLOCK) { /* unlock */ 2756 page_t **ppa; 2757 2758 mutex_enter(&sptd->spt_lock); 2759 if (shmd->shm_lckpgs == 0) { 2760 mutex_exit(&sptd->spt_lock); 2761 return (0); 2762 } 2763 /* 2764 * Don't cache new IO pages. 2765 */ 2766 if (sptd->spt_ppa != NULL) 2767 sptd->spt_flags |= DISM_PPA_CHANGED; 2768 2769 mutex_enter(&sp->shm_mlock); 2770 sts = spt_unlockpages(seg, an_idx, npages, &unlocked); 2771 if ((ppa = sptd->spt_ppa) != NULL) 2772 sptd->spt_flags |= DISM_PPA_CHANGED; 2773 mutex_exit(&sptd->spt_lock); 2774 2775 rctl_decr_locked_mem(NULL, proj, unlocked, 0); 2776 mutex_exit(&sp->shm_mlock); 2777 2778 if (ppa != NULL) 2779 seg_ppurge_wiredpp(ppa); 2780 } 2781 return (sts); 2782 } 2783 2784 /*ARGSUSED*/ 2785 int 2786 segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv) 2787 { 2788 struct shm_data *shmd = (struct shm_data *)seg->s_data; 2789 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data; 2790 spgcnt_t pgno = seg_page(seg, addr+len) - seg_page(seg, addr) + 1; 2791 2792 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); 2793 2794 /* 2795 * ISM segment is always rw. 2796 */ 2797 while (--pgno >= 0) 2798 *protv++ = sptd->spt_prot; 2799 return (0); 2800 } 2801 2802 /*ARGSUSED*/ 2803 u_offset_t 2804 segspt_shmgetoffset(struct seg *seg, caddr_t addr) 2805 { 2806 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); 2807 2808 /* Offset does not matter in ISM memory */ 2809 2810 return ((u_offset_t)0); 2811 } 2812 2813 /* ARGSUSED */ 2814 int 2815 segspt_shmgettype(struct seg *seg, caddr_t addr) 2816 { 2817 struct shm_data *shmd = (struct shm_data *)seg->s_data; 2818 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data; 2819 2820 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); 2821 2822 /* 2823 * The shared memory mapping is always MAP_SHARED, SWAP is only 2824 * reserved for DISM 2825 */ 2826 return (MAP_SHARED | 2827 ((sptd->spt_flags & SHM_PAGEABLE) ? 
0 : MAP_NORESERVE)); 2828 } 2829 2830 /*ARGSUSED*/ 2831 int 2832 segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp) 2833 { 2834 struct shm_data *shmd = (struct shm_data *)seg->s_data; 2835 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data; 2836 2837 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); 2838 2839 *vpp = sptd->spt_vp; 2840 return (0); 2841 } 2842 2843 /* 2844 * We need to wait for pending IO to complete to a DISM segment in order for 2845 * pages to get kicked out of the seg_pcache. 120 seconds should be more 2846 * than enough time to wait. 2847 */ 2848 static clock_t spt_pcache_wait = 120; 2849 2850 /*ARGSUSED*/ 2851 static int 2852 segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len, uint_t behav) 2853 { 2854 struct shm_data *shmd = (struct shm_data *)seg->s_data; 2855 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data; 2856 struct anon_map *amp; 2857 pgcnt_t pg_idx; 2858 ushort_t gen; 2859 clock_t end_lbolt; 2860 int writer; 2861 page_t **ppa; 2862 2863 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); 2864 2865 if (behav == MADV_FREE || behav == MADV_PURGE) { 2866 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) 2867 return (0); 2868 2869 amp = sptd->spt_amp; 2870 pg_idx = seg_page(seg, addr); 2871 2872 mutex_enter(&sptd->spt_lock); 2873 if ((ppa = sptd->spt_ppa) == NULL) { 2874 mutex_exit(&sptd->spt_lock); 2875 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER); 2876 (void) anon_disclaim(amp, pg_idx, len, behav, NULL); 2877 ANON_LOCK_EXIT(&amp->a_rwlock); 2878 return (0); 2879 } 2880 2881 sptd->spt_flags |= DISM_PPA_CHANGED; 2882 gen = sptd->spt_gen; 2883 2884 mutex_exit(&sptd->spt_lock); 2885 2886 /* 2887 * Purge all DISM cached pages 2888 */ 2889 seg_ppurge_wiredpp(ppa); 2890 2891 /* 2892 * Drop the AS_LOCK so that other threads can grab it 2893 * in the as_pageunlock path and hopefully get the segment 2894 * kicked out of the seg_pcache. We bump the shm_softlockcnt 2895 * to keep this segment resident. 2896 */ 2897 writer = AS_WRITE_HELD(seg->s_as); 2898 atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt))); 2899 AS_LOCK_EXIT(seg->s_as); 2900 2901 mutex_enter(&sptd->spt_lock); 2902 2903 end_lbolt = ddi_get_lbolt() + (hz * spt_pcache_wait); 2904 2905 /* 2906 * Try to wait for pages to get kicked out of the seg_pcache. 2907 */ 2908 while (sptd->spt_gen == gen && 2909 (sptd->spt_flags & DISM_PPA_CHANGED) && 2910 ddi_get_lbolt() < end_lbolt) { 2911 if (!cv_timedwait_sig(&sptd->spt_cv, 2912 &sptd->spt_lock, end_lbolt)) { 2913 break; 2914 } 2915 } 2916 2917 mutex_exit(&sptd->spt_lock); 2918 2919 /* Regrab the AS_LOCK and release our hold on the segment */ 2920 AS_LOCK_ENTER(seg->s_as, writer ?
RW_WRITER : RW_READER); 2921 atomic_dec_ulong((ulong_t *)(&(shmd->shm_softlockcnt))); 2922 if (shmd->shm_softlockcnt <= 0) { 2923 if (AS_ISUNMAPWAIT(seg->s_as)) { 2924 mutex_enter(&seg->s_as->a_contents); 2925 if (AS_ISUNMAPWAIT(seg->s_as)) { 2926 AS_CLRUNMAPWAIT(seg->s_as); 2927 cv_broadcast(&seg->s_as->a_cv); 2928 } 2929 mutex_exit(&seg->s_as->a_contents); 2930 } 2931 } 2932 2933 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER); 2934 (void) anon_disclaim(amp, pg_idx, len, behav, NULL); 2935 ANON_LOCK_EXIT(&amp->a_rwlock); 2936 } else if (lgrp_optimizations() && (behav == MADV_ACCESS_LWP || 2937 behav == MADV_ACCESS_MANY || behav == MADV_ACCESS_DEFAULT)) { 2938 int already_set; 2939 ulong_t anon_index; 2940 lgrp_mem_policy_t policy; 2941 caddr_t shm_addr; 2942 size_t share_size; 2943 size_t size; 2944 struct seg *sptseg = shmd->shm_sptseg; 2945 caddr_t sptseg_addr; 2946 2947 /* 2948 * Align address and length to page size of underlying segment 2949 */ 2950 share_size = page_get_pagesize(shmd->shm_sptseg->s_szc); 2951 shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_size); 2952 size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), 2953 share_size); 2954 2955 amp = shmd->shm_amp; 2956 anon_index = seg_page(seg, shm_addr); 2957 2958 /* 2959 * And now we may have to adjust size downward if we have 2960 * exceeded the realsize of the segment or initial anon 2961 * allocations. 2962 */ 2963 sptseg_addr = sptseg->s_base + ptob(anon_index); 2964 if ((sptseg_addr + size) > 2965 (sptseg->s_base + sptd->spt_realsize)) 2966 size = (sptseg->s_base + sptd->spt_realsize) - 2967 sptseg_addr; 2968 2969 /* 2970 * Set memory allocation policy for this segment 2971 */ 2972 policy = lgrp_madv_to_policy(behav, len, MAP_SHARED); 2973 already_set = lgrp_shm_policy_set(policy, amp, anon_index, 2974 NULL, 0, len); 2975 2976 /* 2977 * If this memory allocation policy is set already and need 2978 * not be reapplied, don't bother reapplying it.
2979 */ 2980 if (already_set && !LGRP_MEM_POLICY_REAPPLICABLE(policy)) 2981 return (0); 2982 2983 /* 2984 * Mark any existing pages in the given range for migration, 2985 * flush the I/O page cache, and use the underlying segment to 2986 * calculate the anon index and to get the anon map and vnode 2987 * pointer. 2988 */ 2989 if (shmd->shm_softlockcnt > 0) 2990 segspt_purge(seg); 2991 2992 page_mark_migrate(seg, shm_addr, size, amp, 0, NULL, 0, 0); 2993 } 2994 2995 return (0); 2996 } 2997 2998 /*ARGSUSED*/ 2999 void 3000 segspt_shmdump(struct seg *seg) 3001 { 3002 /* no-op for ISM segment */ 3003 } 3004 3005 /*ARGSUSED*/ 3006 static faultcode_t 3007 segspt_shmsetpgsz(struct seg *seg, caddr_t addr, size_t len, uint_t szc) 3008 { 3009 return (ENOTSUP); 3010 } 3011 3012 /* 3013 * get a memory ID for an addr in a given segment 3014 */ 3015 static int 3016 segspt_shmgetmemid(struct seg *seg, caddr_t addr, memid_t *memidp) 3017 { 3018 struct shm_data *shmd = (struct shm_data *)seg->s_data; 3019 struct anon *ap; 3020 size_t anon_index; 3021 struct anon_map *amp = shmd->shm_amp; 3022 struct spt_data *sptd = shmd->shm_sptseg->s_data; 3023 struct seg *sptseg = shmd->shm_sptseg; 3024 anon_sync_obj_t cookie; 3025 3026 anon_index = seg_page(seg, addr); 3027 3028 if (addr > (seg->s_base + sptd->spt_realsize)) { 3029 return (EFAULT); 3030 } 3031 3032 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER); 3033 anon_array_enter(amp, anon_index, &cookie); 3034 ap = anon_get_ptr(amp->ahp, anon_index); 3035 if (ap == NULL) { 3036 struct page *pp; 3037 caddr_t spt_addr = sptseg->s_base + ptob(anon_index); 3038 3039 pp = anon_zero(sptseg, spt_addr, &ap, kcred); 3040 if (pp == NULL) { 3041 anon_array_exit(&cookie); 3042 ANON_LOCK_EXIT(&amp->a_rwlock); 3043 return (ENOMEM); 3044 } 3045 (void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP); 3046 page_unlock(pp); 3047 } 3048 anon_array_exit(&cookie); 3049 ANON_LOCK_EXIT(&amp->a_rwlock); 3050 memidp->val[0] = (uintptr_t)ap; 3051 memidp->val[1] = (uintptr_t)addr & PAGEOFFSET; 3052 return (0); 3053 } 3054 3055 /* 3056 * Get memory allocation policy info for specified address in given segment 3057 */ 3058 static lgrp_mem_policy_info_t * 3059 segspt_shmgetpolicy(struct seg *seg, caddr_t addr) 3060 { 3061 struct anon_map *amp; 3062 ulong_t anon_index; 3063 lgrp_mem_policy_info_t *policy_info; 3064 struct shm_data *shm_data; 3065 3066 ASSERT(seg != NULL); 3067 3068 /* 3069 * Get anon_map from segshm 3070 * 3071 * Assume that no lock needs to be held on anon_map, since 3072 * it should be protected by its reference count which must be 3073 * nonzero for an existing segment 3074 * Need to grab readers lock on policy tree though 3075 */ 3076 shm_data = (struct shm_data *)seg->s_data; 3077 if (shm_data == NULL) 3078 return (NULL); 3079 amp = shm_data->shm_amp; 3080 ASSERT(amp->refcnt != 0); 3081 3082 /* 3083 * Get policy info 3084 * 3085 * Assume starting anon index of 0 3086 */ 3087 anon_index = seg_page(seg, addr); 3088 policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0); 3089 3090 return (policy_info); 3091 } 3092 3093 /*ARGSUSED*/ 3094 static int 3095 segspt_shmcapable(struct seg *seg, segcapability_t capability) 3096 { 3097 return (0); 3098 } 3099
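/*
 * Illustrative userland sketch only: the SEGSPT_USERLAND_EXAMPLE guard is a
 * hypothetical macro that is never defined, so this block is not compiled
 * here.  It shows, from the application side and under the assumption that
 * the SHM_PAGEABLE/SHM_SHARE_MMU shmat(2) flags of Solaris-derived systems
 * are available, the calls that end up in the seg ops above: shmat() with
 * SHM_PAGEABLE attaches a DISM segment served by segspt_shmops, and mlock()
 * on part of that mapping reaches segspt_shmlockop()/spt_lockpages() through
 * MC_LOCK (locking memory also requires the appropriate privilege).
 */
#ifdef SEGSPT_USERLAND_EXAMPLE
#include <sys/ipc.h>
#include <sys/shm.h>
#include <sys/mman.h>
#include <stdio.h>

int
main(void)
{
        size_t len = 64 * 1024 * 1024;          /* 64MB shared segment */
        int id = shmget(IPC_PRIVATE, len, IPC_CREAT | 0600);
        char *base;

        if (id == -1) {
                perror("shmget");
                return (1);
        }
        /* SHM_PAGEABLE asks for DISM; SHM_SHARE_MMU would ask for ISM */
        base = shmat(id, NULL, SHM_PAGEABLE);
        if (base == (char *)-1) {
                perror("shmat");
                return (1);
        }
        /* Lock the first 16MB; this drives MC_LOCK into segspt_shmlockop() */
        if (mlock(base, 16 * 1024 * 1024) != 0)
                perror("mlock");
        (void) shmdt(base);
        return (0);
}
#endif  /* SEGSPT_USERLAND_EXAMPLE */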