1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 28 /* All Rights Reserved */ 29 30 /* 31 * University Copyright- Copyright (c) 1982, 1986, 1988 32 * The Regents of the University of California 33 * All Rights Reserved 34 * 35 * University Acknowledgment- Portions of this document are derived from 36 * software developed by the University of California, Berkeley, and its 37 * contributors. 38 */ 39 40 #pragma ident "%Z%%M% %I% %E% SMI" 41 42 /* 43 * VM - shared or copy-on-write from a vnode/anonymous memory. 44 */ 45 46 #include <sys/types.h> 47 #include <sys/param.h> 48 #include <sys/t_lock.h> 49 #include <sys/errno.h> 50 #include <sys/systm.h> 51 #include <sys/mman.h> 52 #include <sys/debug.h> 53 #include <sys/cred.h> 54 #include <sys/vmsystm.h> 55 #include <sys/tuneable.h> 56 #include <sys/bitmap.h> 57 #include <sys/swap.h> 58 #include <sys/kmem.h> 59 #include <sys/sysmacros.h> 60 #include <sys/vtrace.h> 61 #include <sys/cmn_err.h> 62 #include <sys/vm.h> 63 #include <sys/dumphdr.h> 64 #include <sys/lgrp.h> 65 66 #include <vm/hat.h> 67 #include <vm/as.h> 68 #include <vm/seg.h> 69 #include <vm/seg_vn.h> 70 #include <vm/pvn.h> 71 #include <vm/anon.h> 72 #include <vm/page.h> 73 #include <vm/vpage.h> 74 75 /* 76 * Private seg op routines. 
 */
static int	segvn_dup(struct seg *seg, struct seg *newseg);
static int	segvn_unmap(struct seg *seg, caddr_t addr, size_t len);
static void	segvn_free(struct seg *seg);
static faultcode_t segvn_fault(struct hat *hat, struct seg *seg,
		caddr_t addr, size_t len, enum fault_type type,
		enum seg_rw rw);
static faultcode_t segvn_faulta(struct seg *seg, caddr_t addr);
static int	segvn_setprot(struct seg *seg, caddr_t addr,
		size_t len, uint_t prot);
static int	segvn_checkprot(struct seg *seg, caddr_t addr,
		size_t len, uint_t prot);
static int	segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta);
static size_t	segvn_swapout(struct seg *seg);
static int	segvn_sync(struct seg *seg, caddr_t addr, size_t len,
		int attr, uint_t flags);
static size_t	segvn_incore(struct seg *seg, caddr_t addr, size_t len,
		char *vec);
static int	segvn_lockop(struct seg *seg, caddr_t addr, size_t len,
		int attr, int op, ulong_t *lockmap, size_t pos);
static int	segvn_getprot(struct seg *seg, caddr_t addr, size_t len,
		uint_t *protv);
static u_offset_t	segvn_getoffset(struct seg *seg, caddr_t addr);
static int	segvn_gettype(struct seg *seg, caddr_t addr);
static int	segvn_getvp(struct seg *seg, caddr_t addr,
		struct vnode **vpp);
static int	segvn_advise(struct seg *seg, caddr_t addr, size_t len,
		uint_t behav);
static void	segvn_dump(struct seg *seg);
static int	segvn_pagelock(struct seg *seg, caddr_t addr, size_t len,
		struct page ***ppp, enum lock_type type, enum seg_rw rw);
static int	segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len,
		uint_t szc);
static int	segvn_getmemid(struct seg *seg, caddr_t addr,
		memid_t *memidp);
static lgrp_mem_policy_info_t	*segvn_getpolicy(struct seg *, caddr_t);

struct seg_ops segvn_ops = {
	segvn_dup,
	segvn_unmap,
	segvn_free,
	segvn_fault,
	segvn_faulta,
	segvn_setprot,
	segvn_checkprot,
	segvn_kluster,
	segvn_swapout,
	segvn_sync,
	segvn_incore,
	segvn_lockop,
	segvn_getprot,
	segvn_getoffset,
	segvn_gettype,
	segvn_getvp,
	segvn_advise,
	segvn_dump,
	segvn_pagelock,
	segvn_setpagesize,
	segvn_getmemid,
	segvn_getpolicy,
};

/*
 * Common zfod structures, provided as a shorthand for others to use.
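 *
 * For illustration only (a sketch, not code used by this file): a caller
 * that wants an ordinary zero-fill-on-demand user mapping typically hands
 * one of these argsp pointers to as_map() together with segvn_create,
 * for example
 *
 *	error = as_map(as, addr, len, segvn_create, zfod_argsp);
 *
 * kzfod_argsp and the stack variants below differ only in the PROT_* bits
 * plugged into SEGVN_ZFOD_ARGS().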
 */
static segvn_crargs_t zfod_segvn_crargs =
	SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
static segvn_crargs_t kzfod_segvn_crargs =
	SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_USER,
	PROT_ALL & ~PROT_USER);
static segvn_crargs_t stack_noexec_crargs =
	SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_EXEC, PROT_ALL);

caddr_t zfod_argsp = (caddr_t)&zfod_segvn_crargs;	/* user zfod argsp */
caddr_t kzfod_argsp = (caddr_t)&kzfod_segvn_crargs;	/* kernel zfod argsp */
caddr_t stack_exec_argsp = (caddr_t)&zfod_segvn_crargs;	/* executable stack */
caddr_t stack_noexec_argsp = (caddr_t)&stack_noexec_crargs; /* noexec stack */

#define	vpgtob(n)	((n) * sizeof (struct vpage))	/* For brevity */

size_t	segvn_comb_thrshld = UINT_MAX;	/* patchable -- see 1196681 */

static int	segvn_concat(struct seg *, struct seg *, int);
static int	segvn_extend_prev(struct seg *, struct seg *,
		    struct segvn_crargs *, size_t);
static int	segvn_extend_next(struct seg *, struct seg *,
		    struct segvn_crargs *, size_t);
static void	segvn_softunlock(struct seg *, caddr_t, size_t, enum seg_rw);
static void	segvn_pagelist_rele(page_t **);
static void	segvn_setvnode_mpss(vnode_t *);
static void	segvn_relocate_pages(page_t **, page_t *);
static int	segvn_full_szcpages(page_t **, uint_t, int *, uint_t *);
static int	segvn_fill_vp_pages(struct segvn_data *, vnode_t *, u_offset_t,
    uint_t, page_t **, page_t **, uint_t *, int *);
static faultcode_t segvn_fault_vnodepages(struct hat *, struct seg *, caddr_t,
    caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int);
static faultcode_t segvn_fault_anonpages(struct hat *, struct seg *, caddr_t,
    caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int);
static faultcode_t segvn_faultpage(struct hat *, struct seg *, caddr_t,
    u_offset_t, struct vpage *, page_t **, uint_t,
    enum fault_type, enum seg_rw, int);
static void	segvn_vpage(struct seg *);

static void segvn_purge(struct seg *seg);
static int segvn_reclaim(struct seg *, caddr_t, size_t, struct page **,
	enum seg_rw);

static int sameprot(struct seg *, caddr_t, size_t);

static int segvn_demote_range(struct seg *, caddr_t, size_t, int);
static int segvn_clrszc(struct seg *);
static struct seg *segvn_split_seg(struct seg *, caddr_t);
static int segvn_claim_pages(struct seg *, struct vpage *, u_offset_t,
    ulong_t, uint_t);

static struct kmem_cache *segvn_cache;

#ifdef VM_STATS
static struct segvnvmstats_str {
	ulong_t	fill_vp_pages[31];
	ulong_t fltvnpages[49];
	ulong_t	fullszcpages[10];
	ulong_t	relocatepages[3];
	ulong_t	fltanpages[17];
	ulong_t	pagelock[3];
	ulong_t	demoterange[3];
} segvnvmstats;
#endif /* VM_STATS */

#define	SDR_RANGE	1		/* demote entire range */
#define	SDR_END		2		/* demote non aligned ends only */

#define	CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr) {	\
	if ((len) != 0) {						\
		lpgaddr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz);	\
		ASSERT(lpgaddr >= (seg)->s_base);			\
		lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)((addr) +	\
		    (len)), pgsz);					\
		ASSERT(lpgeaddr > lpgaddr);				\
		ASSERT(lpgeaddr <= (seg)->s_base + (seg)->s_size);	\
	} else {							\
		lpgeaddr = lpgaddr = (addr);				\
	}								\
}

/*ARGSUSED*/
static int
segvn_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct segvn_data *svd = buf;

	rw_init(&svd->lock, NULL,
RW_DEFAULT, NULL); 229 mutex_init(&svd->segp_slock, NULL, MUTEX_DEFAULT, NULL); 230 return (0); 231 } 232 233 /*ARGSUSED1*/ 234 static void 235 segvn_cache_destructor(void *buf, void *cdrarg) 236 { 237 struct segvn_data *svd = buf; 238 239 rw_destroy(&svd->lock); 240 mutex_destroy(&svd->segp_slock); 241 } 242 243 /* 244 * Patching this variable to non-zero allows the system to run with 245 * stacks marked as "not executable". It's a bit of a kludge, but is 246 * provided as a tweakable for platforms that export those ABIs 247 * (e.g. sparc V8) that have executable stacks enabled by default. 248 * There are also some restrictions for platforms that don't actually 249 * implement 'noexec' protections. 250 * 251 * Once enabled, the system is (therefore) unable to provide a fully 252 * ABI-compliant execution environment, though practically speaking, 253 * most everything works. The exceptions are generally some interpreters 254 * and debuggers that create executable code on the stack and jump 255 * into it (without explicitly mprotecting the address range to include 256 * PROT_EXEC). 257 * 258 * One important class of applications that are disabled are those 259 * that have been transformed into malicious agents using one of the 260 * numerous "buffer overflow" attacks. See 4007890. 261 */ 262 int noexec_user_stack = 0; 263 int noexec_user_stack_log = 1; 264 265 int segvn_lpg_disable = 0; 266 uint_t segvn_maxpgszc = 0; 267 268 ulong_t segvn_fltvnpages_clrszc_err; 269 ulong_t segvn_setpgsz_align_err; 270 ulong_t segvn_setpgsz_getattr_err; 271 ulong_t segvn_setpgsz_eof_err; 272 ulong_t segvn_faultvnmpss_align_err1; 273 ulong_t segvn_faultvnmpss_align_err2; 274 ulong_t segvn_faultvnmpss_align_err3; 275 ulong_t segvn_faultvnmpss_align_err4; 276 ulong_t segvn_faultvnmpss_align_err5; 277 ulong_t segvn_vmpss_pageio_deadlk_err; 278 279 /* 280 * Initialize segvn data structures 281 */ 282 void 283 segvn_init(void) 284 { 285 uint_t maxszc; 286 uint_t szc; 287 size_t pgsz; 288 289 segvn_cache = kmem_cache_create("segvn_cache", 290 sizeof (struct segvn_data), 0, 291 segvn_cache_constructor, segvn_cache_destructor, NULL, 292 NULL, NULL, 0); 293 294 if (segvn_lpg_disable != 0) 295 return; 296 szc = maxszc = page_num_pagesizes() - 1; 297 if (szc == 0) { 298 segvn_lpg_disable = 1; 299 return; 300 } 301 if (page_get_pagesize(0) != PAGESIZE) { 302 panic("segvn_init: bad szc 0"); 303 /*NOTREACHED*/ 304 } 305 while (szc != 0) { 306 pgsz = page_get_pagesize(szc); 307 if (pgsz <= PAGESIZE || !IS_P2ALIGNED(pgsz, pgsz)) { 308 panic("segvn_init: bad szc %d", szc); 309 /*NOTREACHED*/ 310 } 311 szc--; 312 } 313 if (segvn_maxpgszc == 0 || segvn_maxpgszc > maxszc) 314 segvn_maxpgszc = maxszc; 315 } 316 317 #define SEGVN_PAGEIO ((void *)0x1) 318 #define SEGVN_NOPAGEIO ((void *)0x2) 319 320 static void 321 segvn_setvnode_mpss(vnode_t *vp) 322 { 323 int err; 324 325 ASSERT(vp->v_mpssdata == NULL || 326 vp->v_mpssdata == SEGVN_PAGEIO || 327 vp->v_mpssdata == SEGVN_NOPAGEIO); 328 329 if (vp->v_mpssdata == NULL) { 330 if (vn_vmpss_usepageio(vp)) { 331 err = VOP_PAGEIO(vp, (page_t *)NULL, 332 (u_offset_t)0, 0, 0, CRED()); 333 } else { 334 err = ENOSYS; 335 } 336 /* 337 * set v_mpssdata just once per vnode life 338 * so that it never changes. 
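		 * (The zero-length VOP_PAGEIO() call above is only a probe:
		 * an EINVAL return is taken to mean the vnode's filesystem
		 * supports the pageio interface for large-page work, while
		 * any other result, including the ENOSYS default, marks the
		 * vnode SEGVN_NOPAGEIO.)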
339 */ 340 mutex_enter(&vp->v_lock); 341 if (vp->v_mpssdata == NULL) { 342 if (err == EINVAL) { 343 vp->v_mpssdata = SEGVN_PAGEIO; 344 } else { 345 vp->v_mpssdata = SEGVN_NOPAGEIO; 346 } 347 } 348 mutex_exit(&vp->v_lock); 349 } 350 } 351 352 int 353 segvn_create(struct seg *seg, void *argsp) 354 { 355 struct segvn_crargs *a = (struct segvn_crargs *)argsp; 356 struct segvn_data *svd; 357 size_t swresv = 0; 358 struct cred *cred; 359 struct anon_map *amp; 360 int error = 0; 361 size_t pgsz; 362 lgrp_mem_policy_t mpolicy = LGRP_MEM_POLICY_DEFAULT; 363 364 365 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 366 367 if (a->type != MAP_PRIVATE && a->type != MAP_SHARED) { 368 panic("segvn_create type"); 369 /*NOTREACHED*/ 370 } 371 372 /* 373 * Check arguments. If a shared anon structure is given then 374 * it is illegal to also specify a vp. 375 */ 376 if (a->amp != NULL && a->vp != NULL) { 377 panic("segvn_create anon_map"); 378 /*NOTREACHED*/ 379 } 380 381 /* MAP_NORESERVE on a MAP_SHARED segment is meaningless. */ 382 if (a->type == MAP_SHARED) 383 a->flags &= ~MAP_NORESERVE; 384 385 if (a->szc != 0) { 386 if (segvn_lpg_disable != 0 || a->amp != NULL || 387 (a->type == MAP_SHARED && a->vp == NULL) || 388 (a->flags & MAP_NORESERVE) || seg->s_as == &kas) { 389 a->szc = 0; 390 } else { 391 if (a->szc > segvn_maxpgszc) 392 a->szc = segvn_maxpgszc; 393 pgsz = page_get_pagesize(a->szc); 394 if (!IS_P2ALIGNED(seg->s_base, pgsz) || 395 !IS_P2ALIGNED(seg->s_size, pgsz)) { 396 a->szc = 0; 397 } else if (a->vp != NULL) { 398 extern struct vnode kvp; 399 if (IS_SWAPFSVP(a->vp) || a->vp == &kvp) { 400 /* 401 * paranoid check. 402 * hat_page_demote() is not supported 403 * on swapfs pages. 404 */ 405 a->szc = 0; 406 } else if (map_addr_vacalign_check(seg->s_base, 407 a->offset & PAGEMASK)) { 408 a->szc = 0; 409 } 410 } 411 } 412 } 413 414 /* 415 * If segment may need private pages, reserve them now. 416 */ 417 if (!(a->flags & MAP_NORESERVE) && ((a->vp == NULL && a->amp == NULL) || 418 (a->type == MAP_PRIVATE && (a->prot & PROT_WRITE)))) { 419 if (anon_resv(seg->s_size) == 0) 420 return (EAGAIN); 421 swresv = seg->s_size; 422 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 423 seg, swresv, 1); 424 } 425 426 /* 427 * Reserve any mapping structures that may be required. 428 */ 429 hat_map(seg->s_as->a_hat, seg->s_base, seg->s_size, HAT_MAP); 430 431 if (a->cred) { 432 cred = a->cred; 433 crhold(cred); 434 } else { 435 crhold(cred = CRED()); 436 } 437 438 /* Inform the vnode of the new mapping */ 439 if (a->vp) { 440 error = VOP_ADDMAP(a->vp, a->offset & PAGEMASK, 441 seg->s_as, seg->s_base, seg->s_size, a->prot, 442 a->maxprot, a->type, cred); 443 if (error) { 444 if (swresv != 0) { 445 anon_unresv(swresv); 446 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 447 "anon proc:%p %lu %u", 448 seg, swresv, 0); 449 } 450 crfree(cred); 451 hat_unload(seg->s_as->a_hat, seg->s_base, 452 seg->s_size, HAT_UNLOAD_UNMAP); 453 return (error); 454 } 455 } 456 457 /* 458 * If more than one segment in the address space, and 459 * they're adjacent virtually, try to concatenate them. 460 * Don't concatenate if an explicit anon_map structure 461 * was supplied (e.g., SystemV shared memory). 
462 */ 463 if (a->amp == NULL) { 464 struct seg *pseg, *nseg; 465 struct segvn_data *psvd, *nsvd; 466 lgrp_mem_policy_t ppolicy, npolicy; 467 uint_t lgrp_mem_policy_flags = 0; 468 extern lgrp_mem_policy_t lgrp_mem_default_policy; 469 470 /* 471 * Memory policy flags (lgrp_mem_policy_flags) is valid when 472 * extending stack/heap segments. 473 */ 474 if ((a->vp == NULL) && (a->type == MAP_PRIVATE) && 475 !(a->flags & MAP_NORESERVE) && (seg->s_as != &kas)) { 476 lgrp_mem_policy_flags = a->lgrp_mem_policy_flags; 477 } else { 478 /* 479 * Get policy when not extending it from another segment 480 */ 481 mpolicy = lgrp_mem_policy_default(seg->s_size, a->type); 482 } 483 484 /* 485 * First, try to concatenate the previous and new segments 486 */ 487 pseg = AS_SEGPREV(seg->s_as, seg); 488 if (pseg != NULL && 489 pseg->s_base + pseg->s_size == seg->s_base && 490 pseg->s_ops == &segvn_ops) { 491 /* 492 * Get memory allocation policy from previous segment. 493 * When extension is specified (e.g. for heap) apply 494 * this policy to the new segment regardless of the 495 * outcome of segment concatenation. Extension occurs 496 * for non-default policy otherwise default policy is 497 * used and is based on extended segment size. 498 */ 499 psvd = (struct segvn_data *)pseg->s_data; 500 ppolicy = psvd->policy_info.mem_policy; 501 if (lgrp_mem_policy_flags == 502 LGRP_MP_FLAG_EXTEND_UP) { 503 if (ppolicy != lgrp_mem_default_policy) { 504 mpolicy = ppolicy; 505 } else { 506 mpolicy = lgrp_mem_policy_default( 507 pseg->s_size + seg->s_size, 508 a->type); 509 } 510 } 511 512 if (mpolicy == ppolicy && 513 (pseg->s_size + seg->s_size <= 514 segvn_comb_thrshld || psvd->amp == NULL) && 515 segvn_extend_prev(pseg, seg, a, swresv) == 0) { 516 /* 517 * success! now try to concatenate 518 * with following seg 519 */ 520 crfree(cred); 521 nseg = AS_SEGNEXT(pseg->s_as, pseg); 522 if (nseg != NULL && 523 nseg != pseg && 524 nseg->s_ops == &segvn_ops && 525 pseg->s_base + pseg->s_size == 526 nseg->s_base) 527 (void) segvn_concat(pseg, nseg, 0); 528 ASSERT(pseg->s_szc == 0 || 529 (a->szc == pseg->s_szc && 530 IS_P2ALIGNED(pseg->s_base, pgsz) && 531 IS_P2ALIGNED(pseg->s_size, pgsz))); 532 return (0); 533 } 534 } 535 536 /* 537 * Failed, so try to concatenate with following seg 538 */ 539 nseg = AS_SEGNEXT(seg->s_as, seg); 540 if (nseg != NULL && 541 seg->s_base + seg->s_size == nseg->s_base && 542 nseg->s_ops == &segvn_ops) { 543 /* 544 * Get memory allocation policy from next segment. 545 * When extension is specified (e.g. for stack) apply 546 * this policy to the new segment regardless of the 547 * outcome of segment concatenation. Extension occurs 548 * for non-default policy otherwise default policy is 549 * used and is based on extended segment size. 
550 */ 551 nsvd = (struct segvn_data *)nseg->s_data; 552 npolicy = nsvd->policy_info.mem_policy; 553 if (lgrp_mem_policy_flags == 554 LGRP_MP_FLAG_EXTEND_DOWN) { 555 if (npolicy != lgrp_mem_default_policy) { 556 mpolicy = npolicy; 557 } else { 558 mpolicy = lgrp_mem_policy_default( 559 nseg->s_size + seg->s_size, 560 a->type); 561 } 562 } 563 564 if (mpolicy == npolicy && 565 segvn_extend_next(seg, nseg, a, swresv) == 0) { 566 crfree(cred); 567 ASSERT(nseg->s_szc == 0 || 568 (a->szc == nseg->s_szc && 569 IS_P2ALIGNED(nseg->s_base, pgsz) && 570 IS_P2ALIGNED(nseg->s_size, pgsz))); 571 return (0); 572 } 573 } 574 } 575 576 if (a->vp != NULL) { 577 VN_HOLD(a->vp); 578 if (a->type == MAP_SHARED) 579 lgrp_shm_policy_init(NULL, a->vp); 580 } 581 svd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 582 583 seg->s_ops = &segvn_ops; 584 seg->s_data = (void *)svd; 585 seg->s_szc = a->szc; 586 587 svd->vp = a->vp; 588 /* 589 * Anonymous mappings have no backing file so the offset is meaningless. 590 */ 591 svd->offset = a->vp ? (a->offset & PAGEMASK) : 0; 592 svd->prot = a->prot; 593 svd->maxprot = a->maxprot; 594 svd->pageprot = 0; 595 svd->type = a->type; 596 svd->vpage = NULL; 597 svd->cred = cred; 598 svd->advice = MADV_NORMAL; 599 svd->pageadvice = 0; 600 svd->flags = (ushort_t)a->flags; 601 svd->softlockcnt = 0; 602 if (a->szc != 0 && a->vp != NULL) { 603 segvn_setvnode_mpss(a->vp); 604 } 605 606 amp = a->amp; 607 if ((svd->amp = amp) == NULL) { 608 svd->anon_index = 0; 609 if (svd->type == MAP_SHARED) { 610 svd->swresv = 0; 611 /* 612 * Shared mappings to a vp need no other setup. 613 * If we have a shared mapping to an anon_map object 614 * which hasn't been allocated yet, allocate the 615 * struct now so that it will be properly shared 616 * by remembering the swap reservation there. 617 */ 618 if (a->vp == NULL) { 619 svd->amp = anonmap_alloc(seg->s_size, swresv); 620 svd->amp->a_szc = seg->s_szc; 621 } 622 } else { 623 /* 624 * Private mapping (with or without a vp). 625 * Allocate anon_map when needed. 626 */ 627 svd->swresv = swresv; 628 } 629 } else { 630 pgcnt_t anon_num; 631 632 /* 633 * Mapping to an existing anon_map structure without a vp. 634 * For now we will insure that the segment size isn't larger 635 * than the size - offset gives us. Later on we may wish to 636 * have the anon array dynamically allocated itself so that 637 * we don't always have to allocate all the anon pointer slots. 638 * This of course involves adding extra code to check that we 639 * aren't trying to use an anon pointer slot beyond the end 640 * of the currently allocated anon array. 641 */ 642 if ((amp->size - a->offset) < seg->s_size) { 643 panic("segvn_create anon_map size"); 644 /*NOTREACHED*/ 645 } 646 647 anon_num = btopr(a->offset); 648 649 if (a->type == MAP_SHARED) { 650 /* 651 * SHARED mapping to a given anon_map. 652 */ 653 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 654 amp->refcnt++; 655 ANON_LOCK_EXIT(&->a_rwlock); 656 svd->anon_index = anon_num; 657 svd->swresv = 0; 658 } else { 659 /* 660 * PRIVATE mapping to a given anon_map. 661 * Make sure that all the needed anon 662 * structures are created (so that we will 663 * share the underlying pages if nothing 664 * is written by this mapping) and then 665 * duplicate the anon array as is done 666 * when a privately mapped segment is dup'ed. 
667 */ 668 struct anon *ap; 669 caddr_t addr; 670 caddr_t eaddr; 671 ulong_t anon_idx; 672 int hat_flag = HAT_LOAD; 673 674 if (svd->flags & MAP_TEXT) { 675 hat_flag |= HAT_LOAD_TEXT; 676 } 677 678 svd->amp = anonmap_alloc(seg->s_size, 0); 679 svd->amp->a_szc = seg->s_szc; 680 svd->anon_index = 0; 681 svd->swresv = swresv; 682 683 /* 684 * Prevent 2 threads from allocating anon 685 * slots simultaneously. 686 */ 687 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 688 eaddr = seg->s_base + seg->s_size; 689 690 for (anon_idx = anon_num, addr = seg->s_base; 691 addr < eaddr; addr += PAGESIZE, anon_idx++) { 692 page_t *pp; 693 694 if ((ap = anon_get_ptr(amp->ahp, 695 anon_idx)) != NULL) 696 continue; 697 698 /* 699 * Allocate the anon struct now. 700 * Might as well load up translation 701 * to the page while we're at it... 702 */ 703 pp = anon_zero(seg, addr, &ap, cred); 704 if (ap == NULL || pp == NULL) { 705 panic("segvn_create anon_zero"); 706 /*NOTREACHED*/ 707 } 708 709 /* 710 * Re-acquire the anon_map lock and 711 * initialize the anon array entry. 712 */ 713 ASSERT(anon_get_ptr(amp->ahp, 714 anon_idx) == NULL); 715 (void) anon_set_ptr(amp->ahp, anon_idx, ap, 716 ANON_SLEEP); 717 718 ASSERT(seg->s_szc == 0); 719 ASSERT(!IS_VMODSORT(pp->p_vnode)); 720 721 hat_memload(seg->s_as->a_hat, addr, pp, 722 svd->prot & ~PROT_WRITE, hat_flag); 723 724 page_unlock(pp); 725 } 726 ASSERT(seg->s_szc == 0); 727 anon_dup(amp->ahp, anon_num, svd->amp->ahp, 728 0, seg->s_size); 729 ANON_LOCK_EXIT(&->a_rwlock); 730 } 731 } 732 733 /* 734 * Set default memory allocation policy for segment 735 * 736 * Always set policy for private memory at least for initialization 737 * even if this is a shared memory segment 738 */ 739 (void) lgrp_privm_policy_set(mpolicy, &svd->policy_info, seg->s_size); 740 741 if (svd->type == MAP_SHARED) 742 (void) lgrp_shm_policy_set(mpolicy, svd->amp, svd->anon_index, 743 svd->vp, svd->offset, seg->s_size); 744 745 return (0); 746 } 747 748 /* 749 * Concatenate two existing segments, if possible. 750 * Return 0 on success, -1 if two segments are not compatible 751 * or -2 on memory allocation failure. 752 * If private == 1 then try and concat segments with private pages. 753 */ 754 static int 755 segvn_concat(struct seg *seg1, struct seg *seg2, int private) 756 { 757 struct segvn_data *svd1 = seg1->s_data; 758 struct segvn_data *svd2 = seg2->s_data; 759 struct anon_map *amp1 = svd1->amp; 760 struct anon_map *amp2 = svd2->amp; 761 struct vpage *vpage1 = svd1->vpage; 762 struct vpage *vpage2 = svd2->vpage, *nvpage = NULL; 763 size_t size, nvpsize; 764 pgcnt_t npages1, npages2; 765 766 ASSERT(seg1->s_as && seg2->s_as && seg1->s_as == seg2->s_as); 767 ASSERT(AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock)); 768 ASSERT(seg1->s_ops == seg2->s_ops); 769 770 /* both segments exist, try to merge them */ 771 #define incompat(x) (svd1->x != svd2->x) 772 if (incompat(vp) || incompat(maxprot) || 773 (!svd1->pageadvice && !svd2->pageadvice && incompat(advice)) || 774 (!svd1->pageprot && !svd2->pageprot && incompat(prot)) || 775 incompat(type) || incompat(cred) || incompat(flags) || 776 seg1->s_szc != seg2->s_szc || incompat(policy_info.mem_policy) || 777 (svd2->softlockcnt > 0)) 778 return (-1); 779 #undef incompat 780 781 /* 782 * vp == NULL implies zfod, offset doesn't matter 783 */ 784 if (svd1->vp != NULL && 785 svd1->offset + seg1->s_size != svd2->offset) { 786 return (-1); 787 } 788 789 /* 790 * Fail early if we're not supposed to concatenate 791 * private pages. 
 */
	if ((private == 0 || svd1->type != MAP_PRIVATE) &&
	    (amp1 != NULL || amp2 != NULL)) {
		return (-1);
	}

	/*
	 * If either seg has vpages, create a new merged vpage array.
	 */
	if (vpage1 != NULL || vpage2 != NULL) {
		struct vpage *vp;

		npages1 = seg_pages(seg1);
		npages2 = seg_pages(seg2);
		nvpsize = vpgtob(npages1 + npages2);

		if ((nvpage = kmem_zalloc(nvpsize, KM_NOSLEEP)) == NULL) {
			return (-2);
		}
		if (vpage1 != NULL) {
			bcopy(vpage1, nvpage, vpgtob(npages1));
		}
		if (vpage2 != NULL) {
			bcopy(vpage2, nvpage + npages1, vpgtob(npages2));
		}
		for (vp = nvpage; vp < nvpage + npages1; vp++) {
			if (svd2->pageprot && !svd1->pageprot) {
				VPP_SETPROT(vp, svd1->prot);
			}
			if (svd2->pageadvice && !svd1->pageadvice) {
				VPP_SETADVICE(vp, svd1->advice);
			}
		}
		for (vp = nvpage + npages1;
		    vp < nvpage + npages1 + npages2; vp++) {
			if (svd1->pageprot && !svd2->pageprot) {
				VPP_SETPROT(vp, svd2->prot);
			}
			if (svd1->pageadvice && !svd2->pageadvice) {
				VPP_SETADVICE(vp, svd2->advice);
			}
		}
	}

	/*
	 * If either segment has private pages, create a new merged anon
	 * array.
	 */
	if (amp1 != NULL || amp2 != NULL) {
		struct anon_hdr *nahp;
		struct anon_map *namp = NULL;
		size_t asize = seg1->s_size + seg2->s_size;

		if ((nahp = anon_create(btop(asize), ANON_NOSLEEP)) == NULL) {
			if (nvpage != NULL) {
				kmem_free(nvpage, nvpsize);
			}
			return (-2);
		}
		if (amp1 != NULL) {
			/*
			 * XXX anon rwlock is not really needed because
			 * this is a private segment and we are writers.
			 */
			ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
			ASSERT(amp1->refcnt == 1);
			if (anon_copy_ptr(amp1->ahp, svd1->anon_index,
			    nahp, 0, btop(seg1->s_size), ANON_NOSLEEP)) {
				anon_release(nahp, btop(asize));
				ANON_LOCK_EXIT(&amp1->a_rwlock);
				if (nvpage != NULL) {
					kmem_free(nvpage, nvpsize);
				}
				return (-2);
			}
		}
		if (amp2 != NULL) {
			ANON_LOCK_ENTER(&amp2->a_rwlock, RW_WRITER);
			ASSERT(amp2->refcnt == 1);
			if (anon_copy_ptr(amp2->ahp, svd2->anon_index,
			    nahp, btop(seg1->s_size), btop(seg2->s_size),
			    ANON_NOSLEEP)) {
				anon_release(nahp, btop(asize));
				ANON_LOCK_EXIT(&amp2->a_rwlock);
				if (amp1 != NULL) {
					ANON_LOCK_EXIT(&amp1->a_rwlock);
				}
				if (nvpage != NULL) {
					kmem_free(nvpage, nvpsize);
				}
				return (-2);
			}
		}
		if (amp1 != NULL) {
			namp = amp1;
			anon_release(amp1->ahp, btop(amp1->size));
		}
		if (amp2 != NULL) {
			if (namp == NULL) {
				ASSERT(amp1 == NULL);
				namp = amp2;
				anon_release(amp2->ahp, btop(amp2->size));
			} else {
				amp2->refcnt--;
				ANON_LOCK_EXIT(&amp2->a_rwlock);
				anonmap_free(amp2);
			}
			svd2->amp = NULL;	/* needed for seg_free */
		}
		namp->ahp = nahp;
		namp->size = asize;
		svd1->amp = namp;
		svd1->anon_index = 0;
		ANON_LOCK_EXIT(&namp->a_rwlock);
	}
	/*
	 * Now free the old vpage structures.
	 */
	if (nvpage != NULL) {
		if (vpage1 != NULL) {
			kmem_free(vpage1, vpgtob(npages1));
		}
		if (vpage2 != NULL) {
			svd2->vpage = NULL;
			kmem_free(vpage2, vpgtob(npages2));
		}
		if (svd2->pageprot) {
			svd1->pageprot = 1;
		}
		if (svd2->pageadvice) {
			svd1->pageadvice = 1;
		}
		svd1->vpage = nvpage;
	}

	/* all looks ok, merge segments */
	svd1->swresv += svd2->swresv;
	svd2->swresv = 0;	/* so seg_free doesn't release swap space */
	size = seg2->s_size;
	seg_free(seg2);
	seg1->s_size += size;
	return (0);
}

/*
 * Extend the previous segment (seg1) to include the
 * new segment (seg2 + a), if possible.
 * Return 0 on success.
 */
static int
segvn_extend_prev(struct seg *seg1, struct seg *seg2, struct segvn_crargs *a,
    size_t swresv)
{
	struct segvn_data *svd1 = (struct segvn_data *)seg1->s_data;
	size_t size;
	struct anon_map *amp1;
	struct vpage *new_vpage;

	/*
	 * We don't need any segment level locks for "segvn" data
	 * since the address space is "write" locked.
	 */
	ASSERT(seg1->s_as && AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock));

	/* second segment is new, try to extend first */
	/* XXX - should also check cred */
	if (svd1->vp != a->vp || svd1->maxprot != a->maxprot ||
	    (!svd1->pageprot && (svd1->prot != a->prot)) ||
	    svd1->type != a->type || svd1->flags != a->flags ||
	    seg1->s_szc != a->szc)
		return (-1);

	/* vp == NULL implies zfod, offset doesn't matter */
	if (svd1->vp != NULL &&
	    svd1->offset + seg1->s_size != (a->offset & PAGEMASK))
		return (-1);

	amp1 = svd1->amp;
	if (amp1) {
		pgcnt_t newpgs;

		/*
		 * Segment has private pages, can data structures
		 * be expanded?
		 *
		 * Acquire the anon_map lock to prevent it from changing,
		 * if it is shared. This ensures that the anon_map
		 * will not change while a thread which has a read/write
		 * lock on an address space references it.
		 * XXX - Don't need the anon_map lock at all if "refcnt"
		 * is 1.
		 *
		 * Can't grow a MAP_SHARED segment with an anonmap because
		 * there may be existing anon slots where we want to extend
		 * the segment and we wouldn't know what to do with them
		 * (e.g., for tmpfs the right thing is to just leave them
		 * there, for /dev/zero they should be cleared out).
		 */
		if (svd1->type == MAP_SHARED)
			return (-1);

		ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
		if (amp1->refcnt > 1) {
			ANON_LOCK_EXIT(&amp1->a_rwlock);
			return (-1);
		}
		newpgs = anon_grow(amp1->ahp, &svd1->anon_index,
		    btop(seg1->s_size), btop(seg2->s_size), ANON_NOSLEEP);

		if (newpgs == 0) {
			ANON_LOCK_EXIT(&amp1->a_rwlock);
			return (-1);
		}
		amp1->size = ptob(newpgs);
		ANON_LOCK_EXIT(&amp1->a_rwlock);
	}
	if (svd1->vpage != NULL) {
		new_vpage =
		    kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)),
		    KM_NOSLEEP);
		if (new_vpage == NULL)
			return (-1);
		bcopy(svd1->vpage, new_vpage, vpgtob(seg_pages(seg1)));
		kmem_free(svd1->vpage, vpgtob(seg_pages(seg1)));
		svd1->vpage = new_vpage;
		if (svd1->pageprot) {
			struct vpage *vp, *evp;

			vp = new_vpage + seg_pages(seg1);
			evp = vp + seg_pages(seg2);
			for (; vp < evp; vp++)
				VPP_SETPROT(vp, a->prot);
		}
	}
	size = seg2->s_size;
	seg_free(seg2);
	seg1->s_size += size;
	svd1->swresv += swresv;
	return (0);
}

/*
 * Extend the next segment (seg2) to include the
 * new segment (seg1 + a), if possible.
 * Return 0 on success.
 */
static int
segvn_extend_next(
	struct seg *seg1,
	struct seg *seg2,
	struct segvn_crargs *a,
	size_t swresv)
{
	struct segvn_data *svd2 = (struct segvn_data *)seg2->s_data;
	size_t size;
	struct anon_map *amp2;
	struct vpage *new_vpage;

	/*
	 * We don't need any segment level locks for "segvn" data
	 * since the address space is "write" locked.
	 */
	ASSERT(seg2->s_as && AS_WRITE_HELD(seg2->s_as, &seg2->s_as->a_lock));

	/* first segment is new, try to extend second */
	/* XXX - should also check cred */
	if (svd2->vp != a->vp || svd2->maxprot != a->maxprot ||
	    (!svd2->pageprot && (svd2->prot != a->prot)) ||
	    svd2->type != a->type || svd2->flags != a->flags ||
	    seg2->s_szc != a->szc)
		return (-1);
	/* vp == NULL implies zfod, offset doesn't matter */
	if (svd2->vp != NULL &&
	    (a->offset & PAGEMASK) + seg1->s_size != svd2->offset)
		return (-1);

	amp2 = svd2->amp;
	if (amp2) {
		pgcnt_t newpgs;

		/*
		 * Segment has private pages, can data structures
		 * be expanded?
		 *
		 * Acquire the anon_map lock to prevent it from changing,
		 * if it is shared. This ensures that the anon_map
		 * will not change while a thread which has a read/write
		 * lock on an address space references it.
		 *
		 * XXX - Don't need the anon_map lock at all if "refcnt"
		 * is 1.
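		 *
		 * (As in segvn_extend_prev() above, a MAP_SHARED segment
		 * with an anonmap cannot be grown, because pre-existing
		 * anon slots in the extension range could not be handled
		 * sensibly; hence the MAP_SHARED check just below.)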
		 */
		if (svd2->type == MAP_SHARED)
			return (-1);

		ANON_LOCK_ENTER(&amp2->a_rwlock, RW_WRITER);
		if (amp2->refcnt > 1) {
			ANON_LOCK_EXIT(&amp2->a_rwlock);
			return (-1);
		}
		newpgs = anon_grow(amp2->ahp, &svd2->anon_index,
		    btop(seg2->s_size), btop(seg1->s_size),
		    ANON_NOSLEEP | ANON_GROWDOWN);

		if (newpgs == 0) {
			ANON_LOCK_EXIT(&amp2->a_rwlock);
			return (-1);
		}
		amp2->size = ptob(newpgs);
		ANON_LOCK_EXIT(&amp2->a_rwlock);
	}
	if (svd2->vpage != NULL) {
		new_vpage =
		    kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)),
		    KM_NOSLEEP);
		if (new_vpage == NULL) {
			/* Not merging segments so adjust anon_index back */
			if (amp2)
				svd2->anon_index += seg_pages(seg1);
			return (-1);
		}
		bcopy(svd2->vpage, new_vpage + seg_pages(seg1),
		    vpgtob(seg_pages(seg2)));
		kmem_free(svd2->vpage, vpgtob(seg_pages(seg2)));
		svd2->vpage = new_vpage;
		if (svd2->pageprot) {
			struct vpage *vp, *evp;

			vp = new_vpage;
			evp = vp + seg_pages(seg1);
			for (; vp < evp; vp++)
				VPP_SETPROT(vp, a->prot);
		}
	}
	size = seg1->s_size;
	seg_free(seg1);
	seg2->s_size += size;
	seg2->s_base -= size;
	svd2->offset -= size;
	svd2->swresv += swresv;
	return (0);
}

static int
segvn_dup(struct seg *seg, struct seg *newseg)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct segvn_data *newsvd;
	pgcnt_t npages = seg_pages(seg);
	int error = 0;
	uint_t prot;
	size_t len;

	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));

	/*
	 * If segment has anon reserved, reserve more for the new seg.
	 * For a MAP_NORESERVE segment swresv will be a count of all the
	 * allocated anon slots; thus we reserve for the child as many slots
	 * as the parent has allocated. This semantic prevents the child or
	 * parent from dying during a copy-on-write fault caused by trying
	 * to write a shared pre-existing anon page.
	 */
	if ((len = svd->swresv) != 0) {
		if (anon_resv(svd->swresv) == 0)
			return (ENOMEM);

		TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
		    seg, len, 0);
	}

	newsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP);

	newseg->s_ops = &segvn_ops;
	newseg->s_data = (void *)newsvd;
	newseg->s_szc = seg->s_szc;

	if ((newsvd->vp = svd->vp) != NULL) {
		VN_HOLD(svd->vp);
		if (svd->type == MAP_SHARED)
			lgrp_shm_policy_init(NULL, svd->vp);
	}
	newsvd->offset = svd->offset;
	newsvd->prot = svd->prot;
	newsvd->maxprot = svd->maxprot;
	newsvd->pageprot = svd->pageprot;
	newsvd->type = svd->type;
	newsvd->cred = svd->cred;
	crhold(newsvd->cred);
	newsvd->advice = svd->advice;
	newsvd->pageadvice = svd->pageadvice;
	newsvd->swresv = svd->swresv;
	newsvd->flags = svd->flags;
	newsvd->softlockcnt = 0;
	newsvd->policy_info = svd->policy_info;
	if ((newsvd->amp = svd->amp) == NULL) {
		/*
		 * Not attaching to a shared anon object.
		 */
		newsvd->anon_index = 0;
	} else {
		struct anon_map *amp;

		amp = svd->amp;
		if (svd->type == MAP_SHARED) {
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
			amp->refcnt++;
			ANON_LOCK_EXIT(&amp->a_rwlock);
			newsvd->anon_index = svd->anon_index;
		} else {
			int reclaim = 1;

			/*
			 * Allocate and initialize new anon_map structure.
1208 */ 1209 newsvd->amp = anonmap_alloc(newseg->s_size, 0); 1210 newsvd->amp->a_szc = newseg->s_szc; 1211 newsvd->anon_index = 0; 1212 1213 /* 1214 * We don't have to acquire the anon_map lock 1215 * for the new segment (since it belongs to an 1216 * address space that is still not associated 1217 * with any process), or the segment in the old 1218 * address space (since all threads in it 1219 * are stopped while duplicating the address space). 1220 */ 1221 1222 /* 1223 * The goal of the following code is to make sure that 1224 * softlocked pages do not end up as copy on write 1225 * pages. This would cause problems where one 1226 * thread writes to a page that is COW and a different 1227 * thread in the same process has softlocked it. The 1228 * softlock lock would move away from this process 1229 * because the write would cause this process to get 1230 * a copy (without the softlock). 1231 * 1232 * The strategy here is to just break the 1233 * sharing on pages that could possibly be 1234 * softlocked. 1235 */ 1236 retry: 1237 if (svd->softlockcnt) { 1238 struct anon *ap, *newap; 1239 size_t i; 1240 uint_t vpprot; 1241 page_t *anon_pl[1+1], *pp; 1242 caddr_t addr; 1243 ulong_t anon_idx = 0; 1244 1245 /* 1246 * The softlock count might be non zero 1247 * because some pages are still stuck in the 1248 * cache for lazy reclaim. Flush the cache 1249 * now. This should drop the count to zero. 1250 * [or there is really I/O going on to these 1251 * pages]. Note, we have the writers lock so 1252 * nothing gets inserted during the flush. 1253 */ 1254 if (reclaim == 1) { 1255 segvn_purge(seg); 1256 reclaim = 0; 1257 goto retry; 1258 } 1259 i = btopr(seg->s_size); 1260 addr = seg->s_base; 1261 /* 1262 * XXX break cow sharing using PAGESIZE 1263 * pages. They will be relocated into larger 1264 * pages at fault time. 1265 */ 1266 while (i-- > 0) { 1267 if (ap = anon_get_ptr(amp->ahp, 1268 anon_idx)) { 1269 error = anon_getpage(&ap, 1270 &vpprot, anon_pl, PAGESIZE, 1271 seg, addr, S_READ, 1272 svd->cred); 1273 if (error) { 1274 newsvd->vpage = NULL; 1275 goto out; 1276 } 1277 /* 1278 * prot need not be computed 1279 * below 'cause anon_private is 1280 * going to ignore it anyway 1281 * as child doesn't inherit 1282 * pagelock from parent. 1283 */ 1284 prot = svd->pageprot ? 1285 VPP_PROT( 1286 &svd->vpage[ 1287 seg_page(seg, addr)]) 1288 : svd->prot; 1289 pp = anon_private(&newap, 1290 newseg, addr, prot, 1291 anon_pl[0], 0, 1292 newsvd->cred); 1293 if (pp == NULL) { 1294 /* no mem abort */ 1295 newsvd->vpage = NULL; 1296 error = ENOMEM; 1297 goto out; 1298 } 1299 (void) anon_set_ptr( 1300 newsvd->amp->ahp, anon_idx, 1301 newap, ANON_SLEEP); 1302 page_unlock(pp); 1303 } 1304 addr += PAGESIZE; 1305 anon_idx++; 1306 } 1307 } else { /* common case */ 1308 if (seg->s_szc != 0) { 1309 /* 1310 * If at least one of anon slots of a 1311 * large page exists then make sure 1312 * all anon slots of a large page 1313 * exist to avoid partial cow sharing 1314 * of a large page in the future. 1315 */ 1316 anon_dup_fill_holes(amp->ahp, 1317 svd->anon_index, newsvd->amp->ahp, 1318 0, seg->s_size, seg->s_szc, 1319 svd->vp != NULL); 1320 } else { 1321 anon_dup(amp->ahp, svd->anon_index, 1322 newsvd->amp->ahp, 0, seg->s_size); 1323 } 1324 1325 hat_clrattr(seg->s_as->a_hat, seg->s_base, 1326 seg->s_size, PROT_WRITE); 1327 } 1328 } 1329 } 1330 /* 1331 * If necessary, create a vpage structure for the new segment. 1332 * Do not copy any page lock indications. 
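	 * (Page locks established via memcntl(2)/mlock(3C) are a property
	 * of the parent's address space and are not inherited across fork,
	 * so the child's vpage entries start out with PPLOCK clear.)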
1333 */ 1334 if (svd->vpage != NULL) { 1335 uint_t i; 1336 struct vpage *ovp = svd->vpage; 1337 struct vpage *nvp; 1338 1339 nvp = newsvd->vpage = 1340 kmem_alloc(vpgtob(npages), KM_SLEEP); 1341 for (i = 0; i < npages; i++) { 1342 *nvp = *ovp++; 1343 VPP_CLRPPLOCK(nvp++); 1344 } 1345 } else 1346 newsvd->vpage = NULL; 1347 1348 /* Inform the vnode of the new mapping */ 1349 if (newsvd->vp != NULL) { 1350 error = VOP_ADDMAP(newsvd->vp, (offset_t)newsvd->offset, 1351 newseg->s_as, newseg->s_base, newseg->s_size, newsvd->prot, 1352 newsvd->maxprot, newsvd->type, newsvd->cred); 1353 } 1354 out: 1355 return (error); 1356 } 1357 1358 1359 /* 1360 * callback function used by segvn_unmap to invoke free_vp_pages() for only 1361 * those pages actually processed by the HAT 1362 */ 1363 extern int free_pages; 1364 1365 static void 1366 segvn_hat_unload_callback(hat_callback_t *cb) 1367 { 1368 struct seg *seg = cb->hcb_data; 1369 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1370 size_t len; 1371 u_offset_t off; 1372 1373 ASSERT(svd->vp != NULL); 1374 ASSERT(cb->hcb_end_addr > cb->hcb_start_addr); 1375 ASSERT(cb->hcb_start_addr >= seg->s_base); 1376 1377 len = cb->hcb_end_addr - cb->hcb_start_addr; 1378 off = cb->hcb_start_addr - seg->s_base; 1379 free_vp_pages(svd->vp, svd->offset + off, len); 1380 } 1381 1382 1383 static int 1384 segvn_unmap(struct seg *seg, caddr_t addr, size_t len) 1385 { 1386 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1387 struct segvn_data *nsvd; 1388 struct seg *nseg; 1389 struct anon_map *amp; 1390 pgcnt_t opages; /* old segment size in pages */ 1391 pgcnt_t npages; /* new segment size in pages */ 1392 pgcnt_t dpages; /* pages being deleted (unmapped) */ 1393 hat_callback_t callback; /* used for free_vp_pages() */ 1394 hat_callback_t *cbp = NULL; 1395 caddr_t nbase; 1396 size_t nsize; 1397 size_t oswresv; 1398 int reclaim = 1; 1399 1400 /* 1401 * We don't need any segment level locks for "segvn" data 1402 * since the address space is "write" locked. 1403 */ 1404 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 1405 1406 /* 1407 * Fail the unmap if pages are SOFTLOCKed through this mapping. 1408 * softlockcnt is protected from change by the as write lock. 1409 */ 1410 retry: 1411 if (svd->softlockcnt > 0) { 1412 /* 1413 * since we do have the writers lock nobody can fill 1414 * the cache during the purge. The flush either succeeds 1415 * or we still have pending I/Os. 1416 */ 1417 if (reclaim == 1) { 1418 segvn_purge(seg); 1419 reclaim = 0; 1420 goto retry; 1421 } 1422 return (EAGAIN); 1423 } 1424 1425 /* 1426 * Check for bad sizes 1427 */ 1428 if (addr < seg->s_base || addr + len > seg->s_base + seg->s_size || 1429 (len & PAGEOFFSET) || ((uintptr_t)addr & PAGEOFFSET)) { 1430 panic("segvn_unmap"); 1431 /*NOTREACHED*/ 1432 } 1433 1434 if (seg->s_szc != 0) { 1435 size_t pgsz = page_get_pagesize(seg->s_szc); 1436 int err; 1437 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { 1438 ASSERT(seg->s_base != addr || seg->s_size != len); 1439 VM_STAT_ADD(segvnvmstats.demoterange[0]); 1440 err = segvn_demote_range(seg, addr, len, SDR_END); 1441 if (err == 0) { 1442 return (IE_RETRY); 1443 } 1444 return (err); 1445 } 1446 } 1447 1448 /* Inform the vnode of the unmapping. 
*/ 1449 if (svd->vp) { 1450 int error; 1451 1452 error = VOP_DELMAP(svd->vp, 1453 (offset_t)svd->offset + (uintptr_t)(addr - seg->s_base), 1454 seg->s_as, addr, len, svd->prot, svd->maxprot, 1455 svd->type, svd->cred); 1456 1457 if (error == EAGAIN) 1458 return (error); 1459 } 1460 /* 1461 * Remove any page locks set through this mapping. 1462 */ 1463 (void) segvn_lockop(seg, addr, len, 0, MC_UNLOCK, NULL, 0); 1464 1465 /* 1466 * Unload any hardware translations in the range to be taken out. 1467 * Use a callback to invoke free_vp_pages() effectively. 1468 */ 1469 if (svd->vp != NULL && free_pages != 0) { 1470 callback.hcb_data = seg; 1471 callback.hcb_function = segvn_hat_unload_callback; 1472 cbp = &callback; 1473 } 1474 hat_unload_callback(seg->s_as->a_hat, addr, len, HAT_UNLOAD_UNMAP, cbp); 1475 1476 /* 1477 * Check for entire segment 1478 */ 1479 if (addr == seg->s_base && len == seg->s_size) { 1480 seg_free(seg); 1481 return (0); 1482 } 1483 1484 opages = seg_pages(seg); 1485 dpages = btop(len); 1486 npages = opages - dpages; 1487 amp = svd->amp; 1488 1489 /* 1490 * Check for beginning of segment 1491 */ 1492 if (addr == seg->s_base) { 1493 if (svd->vpage != NULL) { 1494 size_t nbytes; 1495 struct vpage *ovpage; 1496 1497 ovpage = svd->vpage; /* keep pointer to vpage */ 1498 1499 nbytes = vpgtob(npages); 1500 svd->vpage = kmem_alloc(nbytes, KM_SLEEP); 1501 bcopy(&ovpage[dpages], svd->vpage, nbytes); 1502 1503 /* free up old vpage */ 1504 kmem_free(ovpage, vpgtob(opages)); 1505 } 1506 if (amp != NULL) { 1507 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1508 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { 1509 /* 1510 * Free up now unused parts of anon_map array. 1511 */ 1512 if (seg->s_szc != 0) { 1513 anon_free_pages(amp->ahp, 1514 svd->anon_index, len, seg->s_szc); 1515 } else { 1516 anon_free(amp->ahp, svd->anon_index, 1517 len); 1518 } 1519 1520 /* 1521 * Unreserve swap space for the unmapped chunk 1522 * of this segment in case it's MAP_SHARED 1523 */ 1524 if (svd->type == MAP_SHARED) { 1525 anon_unresv(len); 1526 amp->swresv -= len; 1527 } 1528 } 1529 ANON_LOCK_EXIT(&->a_rwlock); 1530 svd->anon_index += dpages; 1531 } 1532 if (svd->vp != NULL) 1533 svd->offset += len; 1534 1535 if (svd->swresv) { 1536 if (svd->flags & MAP_NORESERVE) { 1537 ASSERT(amp); 1538 oswresv = svd->swresv; 1539 1540 svd->swresv = ptob(anon_pages(amp->ahp, 1541 svd->anon_index, npages)); 1542 anon_unresv(oswresv - svd->swresv); 1543 } else { 1544 anon_unresv(len); 1545 svd->swresv -= len; 1546 } 1547 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 1548 seg, len, 0); 1549 } 1550 1551 seg->s_base += len; 1552 seg->s_size -= len; 1553 return (0); 1554 } 1555 1556 /* 1557 * Check for end of segment 1558 */ 1559 if (addr + len == seg->s_base + seg->s_size) { 1560 if (svd->vpage != NULL) { 1561 size_t nbytes; 1562 struct vpage *ovpage; 1563 1564 ovpage = svd->vpage; /* keep pointer to vpage */ 1565 1566 nbytes = vpgtob(npages); 1567 svd->vpage = kmem_alloc(nbytes, KM_SLEEP); 1568 bcopy(ovpage, svd->vpage, nbytes); 1569 1570 /* free up old vpage */ 1571 kmem_free(ovpage, vpgtob(opages)); 1572 1573 } 1574 if (amp != NULL) { 1575 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1576 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { 1577 /* 1578 * Free up now unused parts of anon_map array 1579 */ 1580 if (seg->s_szc != 0) { 1581 ulong_t an_idx = svd->anon_index + 1582 npages; 1583 anon_free_pages(amp->ahp, an_idx, 1584 len, seg->s_szc); 1585 } else { 1586 anon_free(amp->ahp, 1587 svd->anon_index + npages, len); 
1588 } 1589 /* 1590 * Unreserve swap space for the unmapped chunk 1591 * of this segment in case it's MAP_SHARED 1592 */ 1593 if (svd->type == MAP_SHARED) { 1594 anon_unresv(len); 1595 amp->swresv -= len; 1596 } 1597 } 1598 ANON_LOCK_EXIT(&->a_rwlock); 1599 } 1600 1601 if (svd->swresv) { 1602 if (svd->flags & MAP_NORESERVE) { 1603 ASSERT(amp); 1604 oswresv = svd->swresv; 1605 svd->swresv = ptob(anon_pages(amp->ahp, 1606 svd->anon_index, npages)); 1607 anon_unresv(oswresv - svd->swresv); 1608 } else { 1609 anon_unresv(len); 1610 svd->swresv -= len; 1611 } 1612 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 1613 "anon proc:%p %lu %u", seg, len, 0); 1614 } 1615 1616 seg->s_size -= len; 1617 return (0); 1618 } 1619 1620 /* 1621 * The section to go is in the middle of the segment, 1622 * have to make it into two segments. nseg is made for 1623 * the high end while seg is cut down at the low end. 1624 */ 1625 nbase = addr + len; /* new seg base */ 1626 nsize = (seg->s_base + seg->s_size) - nbase; /* new seg size */ 1627 seg->s_size = addr - seg->s_base; /* shrink old seg */ 1628 nseg = seg_alloc(seg->s_as, nbase, nsize); 1629 if (nseg == NULL) { 1630 panic("segvn_unmap seg_alloc"); 1631 /*NOTREACHED*/ 1632 } 1633 nseg->s_ops = seg->s_ops; 1634 nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 1635 nseg->s_data = (void *)nsvd; 1636 nseg->s_szc = seg->s_szc; 1637 *nsvd = *svd; 1638 nsvd->offset = svd->offset + (uintptr_t)(nseg->s_base - seg->s_base); 1639 nsvd->swresv = 0; 1640 nsvd->softlockcnt = 0; 1641 1642 if (svd->vp != NULL) { 1643 VN_HOLD(nsvd->vp); 1644 if (nsvd->type == MAP_SHARED) 1645 lgrp_shm_policy_init(NULL, nsvd->vp); 1646 } 1647 crhold(svd->cred); 1648 1649 if (svd->vpage == NULL) { 1650 nsvd->vpage = NULL; 1651 } else { 1652 /* need to split vpage into two arrays */ 1653 size_t nbytes; 1654 struct vpage *ovpage; 1655 1656 ovpage = svd->vpage; /* keep pointer to vpage */ 1657 1658 npages = seg_pages(seg); /* seg has shrunk */ 1659 nbytes = vpgtob(npages); 1660 svd->vpage = kmem_alloc(nbytes, KM_SLEEP); 1661 1662 bcopy(ovpage, svd->vpage, nbytes); 1663 1664 npages = seg_pages(nseg); 1665 nbytes = vpgtob(npages); 1666 nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP); 1667 1668 bcopy(&ovpage[opages - npages], nsvd->vpage, nbytes); 1669 1670 /* free up old vpage */ 1671 kmem_free(ovpage, vpgtob(opages)); 1672 } 1673 1674 if (amp == NULL) { 1675 nsvd->amp = NULL; 1676 nsvd->anon_index = 0; 1677 } else { 1678 /* 1679 * Need to create a new anon map for the new segment. 1680 * We'll also allocate a new smaller array for the old 1681 * smaller segment to save space. 
1682 */ 1683 opages = btop((uintptr_t)(addr - seg->s_base)); 1684 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1685 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { 1686 /* 1687 * Free up now unused parts of anon_map array 1688 */ 1689 if (seg->s_szc != 0) { 1690 ulong_t an_idx = svd->anon_index + opages; 1691 anon_free_pages(amp->ahp, an_idx, len, 1692 seg->s_szc); 1693 } else { 1694 anon_free(amp->ahp, svd->anon_index + opages, 1695 len); 1696 } 1697 1698 /* 1699 * Unreserve swap space for the unmapped chunk 1700 * of this segment in case it's MAP_SHARED 1701 */ 1702 if (svd->type == MAP_SHARED) { 1703 anon_unresv(len); 1704 amp->swresv -= len; 1705 } 1706 } 1707 1708 nsvd->anon_index = svd->anon_index + 1709 btop((uintptr_t)(nseg->s_base - seg->s_base)); 1710 if (svd->type == MAP_SHARED) { 1711 ASSERT(seg->s_szc == 0); 1712 amp->refcnt++; 1713 nsvd->amp = amp; 1714 } else { 1715 struct anon_map *namp; 1716 struct anon_hdr *nahp; 1717 1718 ASSERT(svd->type == MAP_PRIVATE); 1719 nahp = anon_create(btop(seg->s_size), ANON_SLEEP); 1720 namp = anonmap_alloc(nseg->s_size, 0); 1721 namp->a_szc = seg->s_szc; 1722 (void) anon_copy_ptr(amp->ahp, svd->anon_index, nahp, 1723 0, btop(seg->s_size), ANON_SLEEP); 1724 (void) anon_copy_ptr(amp->ahp, nsvd->anon_index, 1725 namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP); 1726 anon_release(amp->ahp, btop(amp->size)); 1727 svd->anon_index = 0; 1728 nsvd->anon_index = 0; 1729 amp->ahp = nahp; 1730 amp->size = seg->s_size; 1731 nsvd->amp = namp; 1732 } 1733 ANON_LOCK_EXIT(&->a_rwlock); 1734 } 1735 if (svd->swresv) { 1736 if (svd->flags & MAP_NORESERVE) { 1737 ASSERT(amp); 1738 oswresv = svd->swresv; 1739 svd->swresv = ptob(anon_pages(amp->ahp, 1740 svd->anon_index, btop(seg->s_size))); 1741 nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp, 1742 nsvd->anon_index, btop(nseg->s_size))); 1743 ASSERT(oswresv >= (svd->swresv + nsvd->swresv)); 1744 anon_unresv(oswresv - (svd->swresv + nsvd->swresv)); 1745 } else { 1746 if (seg->s_size + nseg->s_size + len != svd->swresv) { 1747 panic("segvn_unmap: " 1748 "cannot split swap reservation"); 1749 /*NOTREACHED*/ 1750 } 1751 anon_unresv(len); 1752 svd->swresv = seg->s_size; 1753 nsvd->swresv = nseg->s_size; 1754 } 1755 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 1756 seg, len, 0); 1757 } 1758 1759 return (0); /* I'm glad that's all over with! */ 1760 } 1761 1762 static void 1763 segvn_free(struct seg *seg) 1764 { 1765 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1766 pgcnt_t npages = seg_pages(seg); 1767 struct anon_map *amp; 1768 size_t len; 1769 1770 /* 1771 * We don't need any segment level locks for "segvn" data 1772 * since the address space is "write" locked. 1773 */ 1774 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 1775 1776 /* 1777 * Be sure to unlock pages. XXX Why do things get free'ed instead 1778 * of unmapped? XXX 1779 */ 1780 (void) segvn_lockop(seg, seg->s_base, seg->s_size, 1781 0, MC_UNLOCK, NULL, 0); 1782 1783 /* 1784 * Deallocate the vpage and anon pointers if necessary and possible. 1785 */ 1786 if (svd->vpage != NULL) { 1787 kmem_free(svd->vpage, vpgtob(npages)); 1788 svd->vpage = NULL; 1789 } 1790 if ((amp = svd->amp) != NULL) { 1791 /* 1792 * If there are no more references to this anon_map 1793 * structure, then deallocate the structure after freeing 1794 * up all the anon slot pointers that we can. 
1795 */ 1796 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1797 if (--amp->refcnt == 0) { 1798 if (svd->type == MAP_PRIVATE) { 1799 /* 1800 * Private - we only need to anon_free 1801 * the part that this segment refers to. 1802 */ 1803 if (seg->s_szc != 0) { 1804 anon_free_pages(amp->ahp, 1805 svd->anon_index, seg->s_size, 1806 seg->s_szc); 1807 } else { 1808 anon_free(amp->ahp, svd->anon_index, 1809 seg->s_size); 1810 } 1811 } else { 1812 /* 1813 * Shared - anon_free the entire 1814 * anon_map's worth of stuff and 1815 * release any swap reservation. 1816 */ 1817 ASSERT(seg->s_szc == 0); 1818 anon_free(amp->ahp, 0, amp->size); 1819 if ((len = amp->swresv) != 0) { 1820 anon_unresv(len); 1821 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 1822 "anon proc:%p %lu %u", 1823 seg, len, 0); 1824 } 1825 } 1826 svd->amp = NULL; 1827 ANON_LOCK_EXIT(&->a_rwlock); 1828 anonmap_free(amp); 1829 } else if (svd->type == MAP_PRIVATE) { 1830 /* 1831 * We had a private mapping which still has 1832 * a held anon_map so just free up all the 1833 * anon slot pointers that we were using. 1834 */ 1835 if (seg->s_szc != 0) { 1836 anon_free_pages(amp->ahp, svd->anon_index, 1837 seg->s_size, seg->s_szc); 1838 } else { 1839 anon_free(amp->ahp, svd->anon_index, 1840 seg->s_size); 1841 } 1842 ANON_LOCK_EXIT(&->a_rwlock); 1843 } else { 1844 ANON_LOCK_EXIT(&->a_rwlock); 1845 } 1846 } 1847 1848 /* 1849 * Release swap reservation. 1850 */ 1851 if ((len = svd->swresv) != 0) { 1852 anon_unresv(svd->swresv); 1853 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 1854 seg, len, 0); 1855 svd->swresv = 0; 1856 } 1857 /* 1858 * Release claim on vnode, credentials, and finally free the 1859 * private data. 1860 */ 1861 if (svd->vp != NULL) { 1862 if (svd->type == MAP_SHARED) 1863 lgrp_shm_policy_fini(NULL, svd->vp); 1864 VN_RELE(svd->vp); 1865 svd->vp = NULL; 1866 } 1867 crfree(svd->cred); 1868 svd->cred = NULL; 1869 1870 seg->s_data = NULL; 1871 kmem_cache_free(segvn_cache, svd); 1872 } 1873 1874 /* 1875 * Do a F_SOFTUNLOCK call over the range requested. The range must have 1876 * already been F_SOFTLOCK'ed. 1877 * Caller must always match addr and len of a softunlock with a previous 1878 * softlock with exactly the same addr and len. 1879 */ 1880 static void 1881 segvn_softunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw) 1882 { 1883 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1884 page_t *pp; 1885 caddr_t adr; 1886 struct vnode *vp; 1887 u_offset_t offset; 1888 ulong_t anon_index; 1889 struct anon_map *amp; 1890 struct anon *ap = NULL; 1891 1892 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 1893 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 1894 1895 if ((amp = svd->amp) != NULL) 1896 anon_index = svd->anon_index + seg_page(seg, addr); 1897 1898 hat_unlock(seg->s_as->a_hat, addr, len); 1899 for (adr = addr; adr < addr + len; adr += PAGESIZE) { 1900 if (amp != NULL) { 1901 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 1902 if ((ap = anon_get_ptr(amp->ahp, anon_index++)) 1903 != NULL) { 1904 swap_xlate(ap, &vp, &offset); 1905 } else { 1906 vp = svd->vp; 1907 offset = svd->offset + 1908 (uintptr_t)(adr - seg->s_base); 1909 } 1910 ANON_LOCK_EXIT(&->a_rwlock); 1911 } else { 1912 vp = svd->vp; 1913 offset = svd->offset + 1914 (uintptr_t)(adr - seg->s_base); 1915 } 1916 1917 /* 1918 * Use page_find() instead of page_lookup() to 1919 * find the page since we know that it is locked. 
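		 * (page_find() only looks the page up in the page hash;
		 * unlike page_lookup() it does not try to acquire the page
		 * lock, which the earlier F_SOFTLOCK fault is already
		 * holding for us.)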
1920 */ 1921 pp = page_find(vp, offset); 1922 if (pp == NULL) { 1923 panic( 1924 "segvn_softunlock: addr %p, ap %p, vp %p, off %llx", 1925 (void *)adr, (void *)ap, (void *)vp, offset); 1926 /*NOTREACHED*/ 1927 } 1928 1929 if (rw == S_WRITE) { 1930 hat_setrefmod(pp); 1931 if (seg->s_as->a_vbits) 1932 hat_setstat(seg->s_as, adr, PAGESIZE, 1933 P_REF | P_MOD); 1934 } else if (rw != S_OTHER) { 1935 hat_setref(pp); 1936 if (seg->s_as->a_vbits) 1937 hat_setstat(seg->s_as, adr, PAGESIZE, P_REF); 1938 } 1939 TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT, 1940 "segvn_fault:pp %p vp %p offset %llx", pp, vp, offset); 1941 page_unlock(pp); 1942 } 1943 mutex_enter(&freemem_lock); /* for availrmem */ 1944 availrmem += btop(len); 1945 segvn_pages_locked -= btop(len); 1946 svd->softlockcnt -= btop(len); 1947 mutex_exit(&freemem_lock); 1948 if (svd->softlockcnt == 0) { 1949 /* 1950 * All SOFTLOCKS are gone. Wakeup any waiting 1951 * unmappers so they can try again to unmap. 1952 * Check for waiters first without the mutex 1953 * held so we don't always grab the mutex on 1954 * softunlocks. 1955 */ 1956 if (AS_ISUNMAPWAIT(seg->s_as)) { 1957 mutex_enter(&seg->s_as->a_contents); 1958 if (AS_ISUNMAPWAIT(seg->s_as)) { 1959 AS_CLRUNMAPWAIT(seg->s_as); 1960 cv_broadcast(&seg->s_as->a_cv); 1961 } 1962 mutex_exit(&seg->s_as->a_contents); 1963 } 1964 } 1965 } 1966 1967 #define PAGE_HANDLED ((page_t *)-1) 1968 1969 /* 1970 * Release all the pages in the NULL terminated ppp list 1971 * which haven't already been converted to PAGE_HANDLED. 1972 */ 1973 static void 1974 segvn_pagelist_rele(page_t **ppp) 1975 { 1976 for (; *ppp != NULL; ppp++) { 1977 if (*ppp != PAGE_HANDLED) 1978 page_unlock(*ppp); 1979 } 1980 } 1981 1982 static int stealcow = 1; 1983 1984 /* 1985 * Workaround for viking chip bug. See bug id 1220902. 1986 * To fix this down in pagefault() would require importing so 1987 * much as and segvn code as to be unmaintainable. 1988 */ 1989 int enable_mbit_wa = 0; 1990 1991 /* 1992 * Handles all the dirty work of getting the right 1993 * anonymous pages and loading up the translations. 1994 * This routine is called only from segvn_fault() 1995 * when looping over the range of addresses requested. 
1996 * 1997 * The basic algorithm here is: 1998 * If this is an anon_zero case 1999 * Call anon_zero to allocate page 2000 * Load up translation 2001 * Return 2002 * endif 2003 * If this is an anon page 2004 * Use anon_getpage to get the page 2005 * else 2006 * Find page in pl[] list passed in 2007 * endif 2008 * If not a cow 2009 * Load up the translation to the page 2010 * return 2011 * endif 2012 * Call anon_private to handle cow 2013 * Load up (writable) translation to new page 2014 */ 2015 static faultcode_t 2016 segvn_faultpage( 2017 struct hat *hat, /* the hat to use for mapping */ 2018 struct seg *seg, /* seg_vn of interest */ 2019 caddr_t addr, /* address in as */ 2020 u_offset_t off, /* offset in vp */ 2021 struct vpage *vpage, /* pointer to vpage for vp, off */ 2022 page_t *pl[], /* object source page pointer */ 2023 uint_t vpprot, /* access allowed to object pages */ 2024 enum fault_type type, /* type of fault */ 2025 enum seg_rw rw, /* type of access at fault */ 2026 int brkcow) /* we may need to break cow */ 2027 { 2028 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 2029 page_t *pp, **ppp; 2030 uint_t pageflags = 0; 2031 page_t *anon_pl[1 + 1]; 2032 page_t *opp = NULL; /* original page */ 2033 uint_t prot; 2034 int err; 2035 int cow; 2036 int claim; 2037 int steal = 0; 2038 ulong_t anon_index; 2039 struct anon *ap, *oldap; 2040 struct anon_map *amp; 2041 int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD; 2042 int anon_lock = 0; 2043 anon_sync_obj_t cookie; 2044 2045 if (svd->flags & MAP_TEXT) { 2046 hat_flag |= HAT_LOAD_TEXT; 2047 } 2048 2049 ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock)); 2050 ASSERT(seg->s_szc == 0); 2051 2052 /* 2053 * Initialize protection value for this page. 2054 * If we have per page protection values check it now. 2055 */ 2056 if (svd->pageprot) { 2057 uint_t protchk; 2058 2059 switch (rw) { 2060 case S_READ: 2061 protchk = PROT_READ; 2062 break; 2063 case S_WRITE: 2064 protchk = PROT_WRITE; 2065 break; 2066 case S_EXEC: 2067 protchk = PROT_EXEC; 2068 break; 2069 case S_OTHER: 2070 default: 2071 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 2072 break; 2073 } 2074 2075 prot = VPP_PROT(vpage); 2076 if ((prot & protchk) == 0) 2077 return (FC_PROT); /* illegal access type */ 2078 } else { 2079 prot = svd->prot; 2080 } 2081 2082 if (type == F_SOFTLOCK) { 2083 mutex_enter(&freemem_lock); 2084 if (availrmem <= tune.t_minarmem) { 2085 mutex_exit(&freemem_lock); 2086 return (FC_MAKE_ERR(ENOMEM)); /* out of real memory */ 2087 } else { 2088 svd->softlockcnt++; 2089 availrmem--; 2090 segvn_pages_locked++; 2091 } 2092 mutex_exit(&freemem_lock); 2093 } 2094 2095 /* 2096 * Always acquire the anon array lock to prevent 2 threads from 2097 * allocating separate anon slots for the same "addr". 2098 */ 2099 2100 if ((amp = svd->amp) != NULL) { 2101 ASSERT(RW_READ_HELD(&->a_rwlock)); 2102 anon_index = svd->anon_index + seg_page(seg, addr); 2103 anon_array_enter(amp, anon_index, &cookie); 2104 anon_lock = 1; 2105 } 2106 2107 if (svd->vp == NULL && amp != NULL) { 2108 if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) { 2109 /* 2110 * Allocate a (normally) writable anonymous page of 2111 * zeroes. If no advance reservations, reserve now. 
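* (For MAP_NORESERVE segments swap is reserved lazily, one page at
* a time as pages are first touched, rather than up front at map
* time; svd->swresv records what has been reserved so it can be
* released when the segment is unmapped or freed.)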
2112 */ 2113 if (svd->flags & MAP_NORESERVE) { 2114 if (anon_resv(ptob(1))) { 2115 svd->swresv += ptob(1); 2116 } else { 2117 err = ENOMEM; 2118 goto out; 2119 } 2120 } 2121 if ((pp = anon_zero(seg, addr, &ap, 2122 svd->cred)) == NULL) { 2123 err = ENOMEM; 2124 goto out; /* out of swap space */ 2125 } 2126 /* 2127 * Re-acquire the anon_map lock and 2128 * initialize the anon array entry. 2129 */ 2130 (void) anon_set_ptr(amp->ahp, anon_index, ap, 2131 ANON_SLEEP); 2132 if (enable_mbit_wa) { 2133 if (rw == S_WRITE) 2134 hat_setmod(pp); 2135 else if (!hat_ismod(pp)) 2136 prot &= ~PROT_WRITE; 2137 } 2138 /* 2139 * If AS_PAGLCK is set in a_flags (via memcntl(2) 2140 * with MC_LOCKAS, MCL_FUTURE) and this is a 2141 * MAP_NORESERVE segment, we may need to 2142 * permanently lock the page as it is being faulted 2143 * for the first time. The following text applies 2144 * only to MAP_NORESERVE segments: 2145 * 2146 * As per memcntl(2), if this segment was created 2147 * after MCL_FUTURE was applied (a "future" 2148 * segment), its pages must be locked. If this 2149 * segment existed at MCL_FUTURE application (a 2150 * "past" segment), the interface is unclear. 2151 * 2152 * We decide to lock only if vpage is present: 2153 * 2154 * - "future" segments will have a vpage array (see 2155 * as_map), and so will be locked as required 2156 * 2157 * - "past" segments may not have a vpage array, 2158 * depending on whether events (such as 2159 * mprotect) have occurred. Locking if vpage 2160 * exists will preserve legacy behavior. Not 2161 * locking if vpage is absent, will not break 2162 * the interface or legacy behavior. Note that 2163 * allocating vpage here if it's absent requires 2164 * upgrading the segvn reader lock, the cost of 2165 * which does not seem worthwhile. 2166 */ 2167 if (AS_ISPGLCK(seg->s_as) && vpage != NULL && 2168 (svd->flags & MAP_NORESERVE)) { 2169 claim = VPP_PROT(vpage) & PROT_WRITE; 2170 ASSERT(svd->type == MAP_PRIVATE); 2171 if (page_pp_lock(pp, claim, 0)) 2172 VPP_SETPPLOCK(vpage); 2173 } 2174 2175 2176 /* 2177 * Handle pages that have been marked for migration 2178 */ 2179 if (lgrp_optimizations()) 2180 page_migrate(seg, addr, &pp, 1); 2181 hat_memload(hat, addr, pp, prot, hat_flag); 2182 2183 if (!(hat_flag & HAT_LOAD_LOCK)) 2184 page_unlock(pp); 2185 2186 anon_array_exit(&cookie); 2187 return (0); 2188 } 2189 } 2190 2191 /* 2192 * Obtain the page structure via anon_getpage() if it is 2193 * a private copy of an object (the result of a previous 2194 * copy-on-write). 2195 */ 2196 if (amp != NULL) { 2197 if ((ap = anon_get_ptr(amp->ahp, anon_index)) != NULL) { 2198 err = anon_getpage(&ap, &vpprot, anon_pl, PAGESIZE, 2199 seg, addr, rw, svd->cred); 2200 if (err) 2201 goto out; 2202 2203 if (svd->type == MAP_SHARED) { 2204 /* 2205 * If this is a shared mapping to an 2206 * anon_map, then ignore the write 2207 * permissions returned by anon_getpage(). 2208 * They apply to the private mappings 2209 * of this anon_map. 2210 */ 2211 vpprot |= PROT_WRITE; 2212 } 2213 opp = anon_pl[0]; 2214 } 2215 } 2216 2217 /* 2218 * Search the pl[] list passed in if it is from the 2219 * original object (i.e., not a private copy). 2220 */ 2221 if (opp == NULL) { 2222 /* 2223 * Find original page. We must be bringing it in 2224 * from the list in pl[]. 
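* pl[] was filled in by the VOP_GETPAGE() call made in
* segvn_fault(); entries consumed by earlier iterations have been
* overwritten with PAGE_HANDLED.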
2225 */ 2226 for (ppp = pl; (opp = *ppp) != NULL; ppp++) { 2227 if (opp == PAGE_HANDLED) 2228 continue; 2229 ASSERT(opp->p_vnode == svd->vp); /* XXX */ 2230 if (opp->p_offset == off) 2231 break; 2232 } 2233 if (opp == NULL) { 2234 panic("segvn_faultpage not found"); 2235 /*NOTREACHED*/ 2236 } 2237 *ppp = PAGE_HANDLED; 2238 2239 } 2240 2241 ASSERT(PAGE_LOCKED(opp)); 2242 2243 TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT, 2244 "segvn_fault:pp %p vp %p offset %llx", 2245 opp, NULL, 0); 2246 2247 /* 2248 * The fault is treated as a copy-on-write fault if a 2249 * write occurs on a private segment and the object 2250 * page (i.e., mapping) is write protected. We assume 2251 * that fatal protection checks have already been made. 2252 */ 2253 2254 cow = brkcow && ((vpprot & PROT_WRITE) == 0); 2255 2256 /* 2257 * If not a copy-on-write case load the translation 2258 * and return. 2259 */ 2260 if (cow == 0) { 2261 if (IS_VMODSORT(opp->p_vnode) || enable_mbit_wa) { 2262 if (rw == S_WRITE) 2263 hat_setmod(opp); 2264 else if (rw != S_OTHER && !hat_ismod(opp)) 2265 prot &= ~PROT_WRITE; 2266 } 2267 2268 /* 2269 * Handle pages that have been marked for migration 2270 */ 2271 if (lgrp_optimizations()) 2272 page_migrate(seg, addr, &opp, 1); 2273 2274 hat_memload(hat, addr, opp, prot & vpprot, hat_flag); 2275 2276 if (!(hat_flag & HAT_LOAD_LOCK)) 2277 page_unlock(opp); 2278 2279 if (anon_lock) { 2280 anon_array_exit(&cookie); 2281 } 2282 return (0); 2283 } 2284 2285 hat_setref(opp); 2286 2287 ASSERT(amp != NULL && anon_lock); 2288 2289 /* 2290 * Steal the page only if it isn't a private page 2291 * since stealing a private page is not worth the effort. 2292 */ 2293 if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) 2294 steal = 1; 2295 2296 /* 2297 * Steal the original page if the following conditions are true: 2298 * 2299 * We are low on memory, the page is not private, page is not 2300 * shared, not modified, not `locked' or if we have it `locked' 2301 * (i.e., p_cowcnt == 1 and p_lckcnt == 0, which also implies 2302 * that the page is not shared) and if it doesn't have any 2303 * translations. page_struct_lock isn't needed to look at p_cowcnt 2304 * and p_lckcnt because we first get exclusive lock on page. 2305 */ 2306 (void) hat_pagesync(opp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD); 2307 2308 if (stealcow && freemem < minfree && steal && 2309 page_tryupgrade(opp) && !hat_ismod(opp) && 2310 ((opp->p_lckcnt == 0 && opp->p_cowcnt == 0) || 2311 (opp->p_lckcnt == 0 && opp->p_cowcnt == 1 && 2312 vpage != NULL && VPP_ISPPLOCK(vpage)))) { 2313 /* 2314 * Check if this page has other translations 2315 * after unloading our translation. 2316 */ 2317 if (hat_page_is_mapped(opp)) { 2318 hat_unload(seg->s_as->a_hat, addr, PAGESIZE, 2319 HAT_UNLOAD); 2320 } 2321 2322 /* 2323 * hat_unload() might sync back someone else's recent 2324 * modification, so check again. 2325 */ 2326 if (!hat_ismod(opp) && !hat_page_is_mapped(opp)) 2327 pageflags |= STEAL_PAGE; 2328 } 2329 2330 /* 2331 * If we have a vpage pointer, see if it indicates that we have 2332 * ``locked'' the page we map -- if so, tell anon_private to 2333 * transfer the locking resource to the new page. 2334 * 2335 * See Statement at the beginning of segvn_lockop regarding 2336 * the way lockcnts/cowcnts are handled during COW. 2337 * 2338 */ 2339 if (vpage != NULL && VPP_ISPPLOCK(vpage)) 2340 pageflags |= LOCK_PAGE; 2341 2342 /* 2343 * Allocate a private page and perform the copy. 
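* anon_private() either copies opp into a newly allocated anonymous
* page or, when STEAL_PAGE is set, takes over the original page
* instead of copying; LOCK_PAGE asks it to transfer the page lock
* resource to the new page (see the comments above on page stealing
* and lock transfer).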
2344 * For MAP_NORESERVE reserve swap space now, unless this 2345 * is a cow fault on an existing anon page in which case 2346 * MAP_NORESERVE will have made advance reservations. 2347 */ 2348 if ((svd->flags & MAP_NORESERVE) && (ap == NULL)) { 2349 if (anon_resv(ptob(1))) { 2350 svd->swresv += ptob(1); 2351 } else { 2352 page_unlock(opp); 2353 err = ENOMEM; 2354 goto out; 2355 } 2356 } 2357 oldap = ap; 2358 pp = anon_private(&ap, seg, addr, prot, opp, pageflags, svd->cred); 2359 if (pp == NULL) { 2360 err = ENOMEM; /* out of swap space */ 2361 goto out; 2362 } 2363 2364 /* 2365 * If we copied away from an anonymous page, then 2366 * we are one step closer to freeing up an anon slot. 2367 * 2368 * NOTE: The original anon slot must be released while 2369 * holding the "anon_map" lock. This is necessary to prevent 2370 * other threads from obtaining a pointer to the anon slot 2371 * which may be freed if its "refcnt" is 1. 2372 */ 2373 if (oldap != NULL) 2374 anon_decref(oldap); 2375 2376 (void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP); 2377 2378 ASSERT(!IS_VMODSORT(pp->p_vnode)); 2379 if (enable_mbit_wa) { 2380 if (rw == S_WRITE) 2381 hat_setmod(pp); 2382 else if (!hat_ismod(pp)) 2383 prot &= ~PROT_WRITE; 2384 } 2385 2386 2387 /* 2388 * Handle pages that have been marked for migration 2389 */ 2390 if (lgrp_optimizations()) 2391 page_migrate(seg, addr, &pp, 1); 2392 hat_memload(hat, addr, pp, prot, hat_flag); 2393 2394 if (!(hat_flag & HAT_LOAD_LOCK)) 2395 page_unlock(pp); 2396 2397 ASSERT(anon_lock); 2398 anon_array_exit(&cookie); 2399 return (0); 2400 out: 2401 if (anon_lock) 2402 anon_array_exit(&cookie); 2403 2404 if (type == F_SOFTLOCK) { 2405 mutex_enter(&freemem_lock); 2406 availrmem++; 2407 segvn_pages_locked--; 2408 svd->softlockcnt--; 2409 mutex_exit(&freemem_lock); 2410 } 2411 return (FC_MAKE_ERR(err)); 2412 } 2413 2414 /* 2415 * relocate a bunch of smaller targ pages into one large repl page. all targ 2416 * pages must be complete pages smaller than replacement pages. 2417 * it's assumed that no page's szc can change since they are all PAGESIZE or 2418 * complete large pages locked SHARED. 
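* The first loop below relocates each targ root onto the
* corresponding constituent pages of the replacement page; the
* second loop hands the replacement pages back through targ[]
* after downgrading their locks to shared.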
2419 */ 2420 static void 2421 segvn_relocate_pages(page_t **targ, page_t *replacement) 2422 { 2423 page_t *pp; 2424 pgcnt_t repl_npgs, curnpgs; 2425 pgcnt_t i; 2426 uint_t repl_szc = replacement->p_szc; 2427 page_t *first_repl = replacement; 2428 page_t *repl; 2429 spgcnt_t npgs; 2430 2431 VM_STAT_ADD(segvnvmstats.relocatepages[0]); 2432 2433 ASSERT(repl_szc != 0); 2434 npgs = repl_npgs = page_get_pagecnt(repl_szc); 2435 2436 i = 0; 2437 while (repl_npgs) { 2438 spgcnt_t nreloc; 2439 int err; 2440 ASSERT(replacement != NULL); 2441 pp = targ[i]; 2442 ASSERT(pp->p_szc < repl_szc); 2443 ASSERT(PAGE_EXCL(pp)); 2444 ASSERT(!PP_ISFREE(pp)); 2445 curnpgs = page_get_pagecnt(pp->p_szc); 2446 if (curnpgs == 1) { 2447 VM_STAT_ADD(segvnvmstats.relocatepages[1]); 2448 repl = replacement; 2449 page_sub(&replacement, repl); 2450 ASSERT(PAGE_EXCL(repl)); 2451 ASSERT(!PP_ISFREE(repl)); 2452 ASSERT(repl->p_szc == repl_szc); 2453 } else { 2454 page_t *repl_savepp; 2455 int j; 2456 VM_STAT_ADD(segvnvmstats.relocatepages[2]); 2457 repl_savepp = replacement; 2458 for (j = 0; j < curnpgs; j++) { 2459 repl = replacement; 2460 page_sub(&replacement, repl); 2461 ASSERT(PAGE_EXCL(repl)); 2462 ASSERT(!PP_ISFREE(repl)); 2463 ASSERT(repl->p_szc == repl_szc); 2464 ASSERT(page_pptonum(targ[i + j]) == 2465 page_pptonum(targ[i]) + j); 2466 } 2467 repl = repl_savepp; 2468 ASSERT(IS_P2ALIGNED(page_pptonum(repl), curnpgs)); 2469 } 2470 err = page_relocate(&pp, &repl, 0, 1, &nreloc, NULL); 2471 if (err || nreloc != curnpgs) { 2472 panic("segvn_relocate_pages: " 2473 "page_relocate failed err=%d curnpgs=%ld " 2474 "nreloc=%ld", err, curnpgs, nreloc); 2475 } 2476 ASSERT(curnpgs <= repl_npgs); 2477 repl_npgs -= curnpgs; 2478 i += curnpgs; 2479 } 2480 ASSERT(replacement == NULL); 2481 2482 repl = first_repl; 2483 repl_npgs = npgs; 2484 for (i = 0; i < repl_npgs; i++) { 2485 ASSERT(PAGE_EXCL(repl)); 2486 ASSERT(!PP_ISFREE(repl)); 2487 targ[i] = repl; 2488 page_downgrade(targ[i]); 2489 repl = page_next(repl); 2490 } 2491 } 2492 2493 /* 2494 * Check if all pages in ppa array are complete smaller than szc pages and 2495 * their roots will still be aligned relative to their current size if the 2496 * entire ppa array is relocated into one szc page. If these conditions are 2497 * not met return 0. 2498 * 2499 * If all pages are properly aligned attempt to upgrade their locks 2500 * to exclusive mode. If it fails set *upgrdfail to 1 and return 0. 2501 * upgrdfail was set to 0 by caller. 2502 * 2503 * Return 1 if all pages are aligned and locked exclusively. 2504 * 2505 * If all pages in ppa array happen to be physically contiguous to make one 2506 * szc page and all exclusive locks are successfully obtained promote the page 2507 * size to szc and set *pszc to szc. Return 1 with pages locked shared. 
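* For example, if szc corresponds to an 8 page large page, ppa[]
* holds 8 PAGESIZE pages which may themselves be grouped into
* smaller large pages (say two 4 page groups); each group is only
* acceptable if its root pfn and its starting index in ppa[] are
* both aligned to the group size.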
2508 */ 2509 static int 2510 segvn_full_szcpages(page_t **ppa, uint_t szc, int *upgrdfail, uint_t *pszc) 2511 { 2512 page_t *pp; 2513 pfn_t pfn; 2514 pgcnt_t totnpgs = page_get_pagecnt(szc); 2515 pfn_t first_pfn; 2516 int contig = 1; 2517 pgcnt_t i; 2518 pgcnt_t j; 2519 uint_t curszc; 2520 pgcnt_t curnpgs; 2521 int root = 0; 2522 2523 ASSERT(szc > 0); 2524 2525 VM_STAT_ADD(segvnvmstats.fullszcpages[0]); 2526 2527 for (i = 0; i < totnpgs; i++) { 2528 pp = ppa[i]; 2529 ASSERT(PAGE_SHARED(pp)); 2530 ASSERT(!PP_ISFREE(pp)); 2531 pfn = page_pptonum(pp); 2532 if (i == 0) { 2533 if (!IS_P2ALIGNED(pfn, totnpgs)) { 2534 contig = 0; 2535 } else { 2536 first_pfn = pfn; 2537 } 2538 } else if (contig && pfn != first_pfn + i) { 2539 contig = 0; 2540 } 2541 if (pp->p_szc == 0) { 2542 if (root) { 2543 VM_STAT_ADD(segvnvmstats.fullszcpages[1]); 2544 return (0); 2545 } 2546 } else if (!root) { 2547 if ((curszc = pp->p_szc) >= szc) { 2548 VM_STAT_ADD(segvnvmstats.fullszcpages[2]); 2549 return (0); 2550 } 2551 if (curszc == 0) { 2552 /* 2553 * p_szc changed means we don't have all pages 2554 * locked. return failure. 2555 */ 2556 VM_STAT_ADD(segvnvmstats.fullszcpages[3]); 2557 return (0); 2558 } 2559 curnpgs = page_get_pagecnt(curszc); 2560 if (!IS_P2ALIGNED(pfn, curnpgs) || 2561 !IS_P2ALIGNED(i, curnpgs)) { 2562 VM_STAT_ADD(segvnvmstats.fullszcpages[4]); 2563 return (0); 2564 } 2565 root = 1; 2566 } else { 2567 ASSERT(i > 0); 2568 VM_STAT_ADD(segvnvmstats.fullszcpages[5]); 2569 if (pp->p_szc != curszc) { 2570 VM_STAT_ADD(segvnvmstats.fullszcpages[6]); 2571 return (0); 2572 } 2573 if (pfn - 1 != page_pptonum(ppa[i - 1])) { 2574 panic("segvn_full_szcpages: " 2575 "large page not physically contiguous"); 2576 } 2577 if (P2PHASE(pfn, curnpgs) == curnpgs - 1) { 2578 root = 0; 2579 } 2580 } 2581 } 2582 2583 for (i = 0; i < totnpgs; i++) { 2584 ASSERT(ppa[i]->p_szc < szc); 2585 if (!page_tryupgrade(ppa[i])) { 2586 for (j = 0; j < i; j++) { 2587 page_downgrade(ppa[j]); 2588 } 2589 *pszc = ppa[i]->p_szc; 2590 *upgrdfail = 1; 2591 VM_STAT_ADD(segvnvmstats.fullszcpages[7]); 2592 return (0); 2593 } 2594 } 2595 2596 /* 2597 * When a page is put a free cachelist its szc is set to 0. if file 2598 * system reclaimed pages from cachelist targ pages will be physically 2599 * contiguous with 0 p_szc. in this case just upgrade szc of targ 2600 * pages without any relocations. 2601 * To avoid any hat issues with previous small mappings 2602 * hat_pageunload() the target pages first. 2603 */ 2604 if (contig) { 2605 VM_STAT_ADD(segvnvmstats.fullszcpages[8]); 2606 for (i = 0; i < totnpgs; i++) { 2607 (void) hat_pageunload(ppa[i], HAT_FORCE_PGUNLOAD); 2608 } 2609 for (i = 0; i < totnpgs; i++) { 2610 ppa[i]->p_szc = szc; 2611 } 2612 for (i = 0; i < totnpgs; i++) { 2613 ASSERT(PAGE_EXCL(ppa[i])); 2614 page_downgrade(ppa[i]); 2615 } 2616 if (pszc != NULL) { 2617 *pszc = szc; 2618 } 2619 } 2620 VM_STAT_ADD(segvnvmstats.fullszcpages[9]); 2621 return (1); 2622 } 2623 2624 /* 2625 * Create physically contiguous pages for [vp, off] - [vp, off + 2626 * page_size(szc)) range and for private segment return them in ppa array. 2627 * Pages are created either via IO or relocations. 2628 * 2629 * Return 1 on sucess and 0 on failure. 2630 * 2631 * If physically contiguos pages already exist for this range return 1 without 2632 * filling ppa array. Caller initializes ppa[0] as NULL to detect that ppa 2633 * array wasn't filled. In this case caller fills ppa array via VOP_GETPAGE(). 
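* The general approach below: walk [off, off + pgsz) a PAGESIZE at
* a time, probing each offset with page_lookup_create() and a
* constituent page of the preallocated large page. Offsets with no
* existing page adopt the constituent page and are queued on
* io_pplist for a single VOP_PAGEIO() read; offsets that already
* have a page become relocation targets and are relocated into the
* large page once everything is locked and properly aligned.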
2634 */ 2635 2636 static int 2637 segvn_fill_vp_pages(struct segvn_data *svd, vnode_t *vp, u_offset_t off, 2638 uint_t szc, page_t **ppa, page_t **ppplist, uint_t *ret_pszc, 2639 int *downsize) 2640 2641 { 2642 page_t *pplist = *ppplist; 2643 size_t pgsz = page_get_pagesize(szc); 2644 pgcnt_t pages = btop(pgsz); 2645 ulong_t start_off = off; 2646 u_offset_t eoff = off + pgsz; 2647 spgcnt_t nreloc; 2648 u_offset_t io_off = off; 2649 size_t io_len; 2650 page_t *io_pplist = NULL; 2651 page_t *done_pplist = NULL; 2652 pgcnt_t pgidx = 0; 2653 page_t *pp; 2654 page_t *newpp; 2655 page_t *targpp; 2656 int io_err = 0; 2657 int i; 2658 pfn_t pfn; 2659 ulong_t ppages; 2660 page_t *targ_pplist = NULL; 2661 page_t *repl_pplist = NULL; 2662 page_t *tmp_pplist; 2663 int nios = 0; 2664 uint_t pszc; 2665 struct vattr va; 2666 2667 VM_STAT_ADD(segvnvmstats.fill_vp_pages[0]); 2668 2669 ASSERT(szc != 0); 2670 ASSERT(pplist->p_szc == szc); 2671 2672 /* 2673 * downsize will be set to 1 only if we fail to lock pages. this will 2674 * allow subsequent faults to try to relocate the page again. If we 2675 * fail due to misalignment don't downsize and let the caller map the 2676 * whole region with small mappings to avoid more faults into the area 2677 * where we can't get large pages anyway. 2678 */ 2679 *downsize = 0; 2680 2681 while (off < eoff) { 2682 newpp = pplist; 2683 ASSERT(newpp != NULL); 2684 ASSERT(PAGE_EXCL(newpp)); 2685 ASSERT(!PP_ISFREE(newpp)); 2686 /* 2687 * we pass NULL for nrelocp to page_lookup_create() 2688 * so that it doesn't relocate. We relocate here 2689 * later only after we make sure we can lock all 2690 * pages in the range we handle and they are all 2691 * aligned. 2692 */ 2693 pp = page_lookup_create(vp, off, SE_SHARED, newpp, NULL, 0); 2694 ASSERT(pp != NULL); 2695 ASSERT(!PP_ISFREE(pp)); 2696 ASSERT(pp->p_vnode == vp); 2697 ASSERT(pp->p_offset == off); 2698 if (pp == newpp) { 2699 VM_STAT_ADD(segvnvmstats.fill_vp_pages[1]); 2700 page_sub(&pplist, pp); 2701 ASSERT(PAGE_EXCL(pp)); 2702 ASSERT(page_iolock_assert(pp)); 2703 page_list_concat(&io_pplist, &pp); 2704 off += PAGESIZE; 2705 continue; 2706 } 2707 VM_STAT_ADD(segvnvmstats.fill_vp_pages[2]); 2708 pfn = page_pptonum(pp); 2709 pszc = pp->p_szc; 2710 if (pszc >= szc && targ_pplist == NULL && io_pplist == NULL && 2711 IS_P2ALIGNED(pfn, pages)) { 2712 ASSERT(repl_pplist == NULL); 2713 ASSERT(done_pplist == NULL); 2714 ASSERT(pplist == *ppplist); 2715 page_unlock(pp); 2716 page_free_replacement_page(pplist); 2717 page_create_putback(pages); 2718 *ppplist = NULL; 2719 VM_STAT_ADD(segvnvmstats.fill_vp_pages[3]); 2720 return (1); 2721 } 2722 if (pszc >= szc) { 2723 page_unlock(pp); 2724 segvn_faultvnmpss_align_err1++; 2725 goto out; 2726 } 2727 ppages = page_get_pagecnt(pszc); 2728 if (!IS_P2ALIGNED(pfn, ppages)) { 2729 ASSERT(pszc > 0); 2730 /* 2731 * sizing down to pszc won't help. 2732 */ 2733 page_unlock(pp); 2734 segvn_faultvnmpss_align_err2++; 2735 goto out; 2736 } 2737 pfn = page_pptonum(newpp); 2738 if (!IS_P2ALIGNED(pfn, ppages)) { 2739 ASSERT(pszc > 0); 2740 /* 2741 * sizing down to pszc won't help. 
2742 */ 2743 page_unlock(pp); 2744 segvn_faultvnmpss_align_err3++; 2745 goto out; 2746 } 2747 if (!PAGE_EXCL(pp)) { 2748 VM_STAT_ADD(segvnvmstats.fill_vp_pages[4]); 2749 page_unlock(pp); 2750 *downsize = 1; 2751 *ret_pszc = pp->p_szc; 2752 goto out; 2753 } 2754 targpp = pp; 2755 if (io_pplist != NULL) { 2756 VM_STAT_ADD(segvnvmstats.fill_vp_pages[5]); 2757 io_len = off - io_off; 2758 /* 2759 * Some file systems like NFS don't check EOF 2760 * conditions in VOP_PAGEIO(). Check it here 2761 * now that pages are locked SE_EXCL. Any file 2762 * truncation will wait until the pages are 2763 * unlocked so no need to worry that file will 2764 * be truncated after we check its size here. 2765 * XXX fix NFS to remove this check. 2766 */ 2767 va.va_mask = AT_SIZE; 2768 if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred) != 0) { 2769 VM_STAT_ADD(segvnvmstats.fill_vp_pages[6]); 2770 page_unlock(targpp); 2771 goto out; 2772 } 2773 if (btopr(va.va_size) < btopr(io_off + io_len)) { 2774 VM_STAT_ADD(segvnvmstats.fill_vp_pages[7]); 2775 *downsize = 1; 2776 *ret_pszc = 0; 2777 page_unlock(targpp); 2778 goto out; 2779 } 2780 io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len, 2781 B_READ, svd->cred); 2782 if (io_err) { 2783 VM_STAT_ADD(segvnvmstats.fill_vp_pages[8]); 2784 page_unlock(targpp); 2785 if (io_err == EDEADLK) { 2786 segvn_vmpss_pageio_deadlk_err++; 2787 } 2788 goto out; 2789 } 2790 nios++; 2791 VM_STAT_ADD(segvnvmstats.fill_vp_pages[9]); 2792 while (io_pplist != NULL) { 2793 pp = io_pplist; 2794 page_sub(&io_pplist, pp); 2795 ASSERT(page_iolock_assert(pp)); 2796 page_io_unlock(pp); 2797 pgidx = (pp->p_offset - start_off) >> 2798 PAGESHIFT; 2799 ASSERT(pgidx < pages); 2800 ppa[pgidx] = pp; 2801 page_list_concat(&done_pplist, &pp); 2802 } 2803 } 2804 pp = targpp; 2805 ASSERT(PAGE_EXCL(pp)); 2806 ASSERT(pp->p_szc <= pszc); 2807 if (pszc != 0 && !group_page_trylock(pp, SE_EXCL)) { 2808 VM_STAT_ADD(segvnvmstats.fill_vp_pages[10]); 2809 page_unlock(pp); 2810 *downsize = 1; 2811 *ret_pszc = pp->p_szc; 2812 goto out; 2813 } 2814 VM_STAT_ADD(segvnvmstats.fill_vp_pages[11]); 2815 /* 2816 * page szc chould have changed before the entire group was 2817 * locked. reread page szc. 
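* The group page count derived from the reread pszc determines how
* many constituent replacement pages are skipped below, so it must
* reflect the size we actually have locked.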
2818 */ 2819 pszc = pp->p_szc; 2820 ppages = page_get_pagecnt(pszc); 2821 2822 /* link just the roots */ 2823 page_list_concat(&targ_pplist, &pp); 2824 page_sub(&pplist, newpp); 2825 page_list_concat(&repl_pplist, &newpp); 2826 off += PAGESIZE; 2827 while (--ppages != 0) { 2828 newpp = pplist; 2829 page_sub(&pplist, newpp); 2830 off += PAGESIZE; 2831 } 2832 io_off = off; 2833 } 2834 if (io_pplist != NULL) { 2835 VM_STAT_ADD(segvnvmstats.fill_vp_pages[12]); 2836 io_len = eoff - io_off; 2837 va.va_mask = AT_SIZE; 2838 if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred) != 0) { 2839 VM_STAT_ADD(segvnvmstats.fill_vp_pages[13]); 2840 goto out; 2841 } 2842 if (btopr(va.va_size) < btopr(io_off + io_len)) { 2843 VM_STAT_ADD(segvnvmstats.fill_vp_pages[14]); 2844 *downsize = 1; 2845 *ret_pszc = 0; 2846 goto out; 2847 } 2848 io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len, 2849 B_READ, svd->cred); 2850 if (io_err) { 2851 VM_STAT_ADD(segvnvmstats.fill_vp_pages[15]); 2852 if (io_err == EDEADLK) { 2853 segvn_vmpss_pageio_deadlk_err++; 2854 } 2855 goto out; 2856 } 2857 nios++; 2858 while (io_pplist != NULL) { 2859 pp = io_pplist; 2860 page_sub(&io_pplist, pp); 2861 ASSERT(page_iolock_assert(pp)); 2862 page_io_unlock(pp); 2863 pgidx = (pp->p_offset - start_off) >> PAGESHIFT; 2864 ASSERT(pgidx < pages); 2865 ppa[pgidx] = pp; 2866 } 2867 } 2868 /* 2869 * we're now bound to succeed or panic. 2870 * remove pages from done_pplist. it's not needed anymore. 2871 */ 2872 while (done_pplist != NULL) { 2873 pp = done_pplist; 2874 page_sub(&done_pplist, pp); 2875 } 2876 VM_STAT_ADD(segvnvmstats.fill_vp_pages[16]); 2877 ASSERT(pplist == NULL); 2878 *ppplist = NULL; 2879 while (targ_pplist != NULL) { 2880 int ret; 2881 VM_STAT_ADD(segvnvmstats.fill_vp_pages[17]); 2882 ASSERT(repl_pplist); 2883 pp = targ_pplist; 2884 page_sub(&targ_pplist, pp); 2885 pgidx = (pp->p_offset - start_off) >> PAGESHIFT; 2886 newpp = repl_pplist; 2887 page_sub(&repl_pplist, newpp); 2888 #ifdef DEBUG 2889 pfn = page_pptonum(pp); 2890 pszc = pp->p_szc; 2891 ppages = page_get_pagecnt(pszc); 2892 ASSERT(IS_P2ALIGNED(pfn, ppages)); 2893 pfn = page_pptonum(newpp); 2894 ASSERT(IS_P2ALIGNED(pfn, ppages)); 2895 ASSERT(P2PHASE(pfn, pages) == pgidx); 2896 #endif 2897 nreloc = 0; 2898 ret = page_relocate(&pp, &newpp, 0, 1, &nreloc, NULL); 2899 if (ret != 0 || nreloc == 0) { 2900 panic("segvn_fill_vp_pages: " 2901 "page_relocate failed"); 2902 } 2903 pp = newpp; 2904 while (nreloc-- != 0) { 2905 ASSERT(PAGE_EXCL(pp)); 2906 ASSERT(pp->p_vnode == vp); 2907 ASSERT(pgidx == 2908 ((pp->p_offset - start_off) >> PAGESHIFT)); 2909 ppa[pgidx++] = pp; 2910 pp = page_next(pp); 2911 } 2912 } 2913 2914 if (svd->type == MAP_PRIVATE) { 2915 VM_STAT_ADD(segvnvmstats.fill_vp_pages[18]); 2916 for (i = 0; i < pages; i++) { 2917 ASSERT(ppa[i] != NULL); 2918 ASSERT(PAGE_EXCL(ppa[i])); 2919 ASSERT(ppa[i]->p_vnode == vp); 2920 ASSERT(ppa[i]->p_offset == 2921 start_off + (i << PAGESHIFT)); 2922 page_downgrade(ppa[i]); 2923 } 2924 ppa[pages] = NULL; 2925 } else { 2926 VM_STAT_ADD(segvnvmstats.fill_vp_pages[19]); 2927 /* 2928 * the caller will still call VOP_GETPAGE() for shared segments 2929 * to check FS write permissions. For private segments we map 2930 * file read only anyway. so no VOP_GETPAGE is needed. 
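* ppa[0] is reset to NULL below so the caller can tell the array
* was not filled in the shared case, as described in the comment
* above this function.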
2931 */ 2932 for (i = 0; i < pages; i++) { 2933 ASSERT(ppa[i] != NULL); 2934 ASSERT(PAGE_EXCL(ppa[i])); 2935 ASSERT(ppa[i]->p_vnode == vp); 2936 ASSERT(ppa[i]->p_offset == 2937 start_off + (i << PAGESHIFT)); 2938 page_unlock(ppa[i]); 2939 } 2940 ppa[0] = NULL; 2941 } 2942 2943 return (1); 2944 out: 2945 /* 2946 * Do the cleanup. Unlock target pages we didn't relocate. They are 2947 * linked on targ_pplist by root pages. reassemble unused replacement 2948 * and io pages back to pplist. 2949 */ 2950 if (io_pplist != NULL) { 2951 VM_STAT_ADD(segvnvmstats.fill_vp_pages[20]); 2952 pp = io_pplist; 2953 do { 2954 ASSERT(pp->p_vnode == vp); 2955 ASSERT(pp->p_offset == io_off); 2956 ASSERT(page_iolock_assert(pp)); 2957 page_io_unlock(pp); 2958 page_hashout(pp, NULL); 2959 io_off += PAGESIZE; 2960 } while ((pp = pp->p_next) != io_pplist); 2961 page_list_concat(&io_pplist, &pplist); 2962 pplist = io_pplist; 2963 } 2964 tmp_pplist = NULL; 2965 while (targ_pplist != NULL) { 2966 VM_STAT_ADD(segvnvmstats.fill_vp_pages[21]); 2967 pp = targ_pplist; 2968 ASSERT(PAGE_EXCL(pp)); 2969 page_sub(&targ_pplist, pp); 2970 2971 pszc = pp->p_szc; 2972 ppages = page_get_pagecnt(pszc); 2973 ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages)); 2974 2975 if (pszc != 0) { 2976 group_page_unlock(pp); 2977 } 2978 page_unlock(pp); 2979 2980 pp = repl_pplist; 2981 ASSERT(pp != NULL); 2982 ASSERT(PAGE_EXCL(pp)); 2983 ASSERT(pp->p_szc == szc); 2984 page_sub(&repl_pplist, pp); 2985 2986 ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages)); 2987 2988 /* relink replacement page */ 2989 page_list_concat(&tmp_pplist, &pp); 2990 while (--ppages != 0) { 2991 VM_STAT_ADD(segvnvmstats.fill_vp_pages[22]); 2992 pp = page_next(pp); 2993 ASSERT(PAGE_EXCL(pp)); 2994 ASSERT(pp->p_szc == szc); 2995 page_list_concat(&tmp_pplist, &pp); 2996 } 2997 } 2998 if (tmp_pplist != NULL) { 2999 VM_STAT_ADD(segvnvmstats.fill_vp_pages[23]); 3000 page_list_concat(&tmp_pplist, &pplist); 3001 pplist = tmp_pplist; 3002 } 3003 /* 3004 * at this point all pages are either on done_pplist or 3005 * pplist. They can't be all on done_pplist otherwise 3006 * we'd've been done. 3007 */ 3008 ASSERT(pplist != NULL); 3009 if (nios != 0) { 3010 VM_STAT_ADD(segvnvmstats.fill_vp_pages[24]); 3011 pp = pplist; 3012 do { 3013 VM_STAT_ADD(segvnvmstats.fill_vp_pages[25]); 3014 ASSERT(pp->p_szc == szc); 3015 ASSERT(PAGE_EXCL(pp)); 3016 ASSERT(pp->p_vnode != vp); 3017 pp->p_szc = 0; 3018 } while ((pp = pp->p_next) != pplist); 3019 3020 pp = done_pplist; 3021 do { 3022 VM_STAT_ADD(segvnvmstats.fill_vp_pages[26]); 3023 ASSERT(pp->p_szc == szc); 3024 ASSERT(PAGE_EXCL(pp)); 3025 ASSERT(pp->p_vnode == vp); 3026 pp->p_szc = 0; 3027 } while ((pp = pp->p_next) != done_pplist); 3028 3029 while (pplist != NULL) { 3030 VM_STAT_ADD(segvnvmstats.fill_vp_pages[27]); 3031 pp = pplist; 3032 page_sub(&pplist, pp); 3033 page_free(pp, 0); 3034 } 3035 3036 while (done_pplist != NULL) { 3037 VM_STAT_ADD(segvnvmstats.fill_vp_pages[28]); 3038 pp = done_pplist; 3039 page_sub(&done_pplist, pp); 3040 page_unlock(pp); 3041 } 3042 *ppplist = NULL; 3043 return (0); 3044 } 3045 ASSERT(pplist == *ppplist); 3046 if (io_err) { 3047 VM_STAT_ADD(segvnvmstats.fill_vp_pages[29]); 3048 /* 3049 * don't downsize on io error. 3050 * see if vop_getpage succeeds. 3051 * pplist may still be used in this case 3052 * for relocations. 
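* Note that *ppplist is deliberately left intact here, unlike the
* path just below which frees the replacement page and clears it.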
3053 */ 3054 return (0); 3055 } 3056 VM_STAT_ADD(segvnvmstats.fill_vp_pages[30]); 3057 page_free_replacement_page(pplist); 3058 page_create_putback(pages); 3059 *ppplist = NULL; 3060 return (0); 3061 } 3062 3063 int segvn_anypgsz = 0; 3064 3065 #define SEGVN_RESTORE_SOFTLOCK(type, pages) \ 3066 if ((type) == F_SOFTLOCK) { \ 3067 mutex_enter(&freemem_lock); \ 3068 availrmem += (pages); \ 3069 segvn_pages_locked -= (pages); \ 3070 svd->softlockcnt -= (pages); \ 3071 mutex_exit(&freemem_lock); \ 3072 } 3073 3074 #define SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot) \ 3075 if (IS_VMODSORT((ppa)[0]->p_vnode)) { \ 3076 if ((rw) == S_WRITE) { \ 3077 for (i = 0; i < (pages); i++) { \ 3078 ASSERT((ppa)[i]->p_vnode == \ 3079 (ppa)[0]->p_vnode); \ 3080 hat_setmod((ppa)[i]); \ 3081 } \ 3082 } else if ((rw) != S_OTHER && \ 3083 ((prot) & (vpprot) & PROT_WRITE)) { \ 3084 for (i = 0; i < (pages); i++) { \ 3085 ASSERT((ppa)[i]->p_vnode == \ 3086 (ppa)[0]->p_vnode); \ 3087 if (!hat_ismod((ppa)[i])) { \ 3088 prot &= ~PROT_WRITE; \ 3089 break; \ 3090 } \ 3091 } \ 3092 } \ 3093 } 3094 3095 #ifdef VM_STATS 3096 3097 #define SEGVN_VMSTAT_FLTVNPAGES(idx) \ 3098 VM_STAT_ADD(segvnvmstats.fltvnpages[(idx)]); 3099 3100 #else /* VM_STATS */ 3101 3102 #define SEGVN_VMSTAT_FLTVNPAGES(idx) 3103 3104 #endif 3105 3106 static faultcode_t 3107 segvn_fault_vnodepages(struct hat *hat, struct seg *seg, caddr_t lpgaddr, 3108 caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr, 3109 caddr_t eaddr, int brkcow) 3110 { 3111 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 3112 struct anon_map *amp = svd->amp; 3113 uchar_t segtype = svd->type; 3114 uint_t szc = seg->s_szc; 3115 size_t pgsz = page_get_pagesize(szc); 3116 size_t maxpgsz = pgsz; 3117 pgcnt_t pages = btop(pgsz); 3118 pgcnt_t maxpages = pages; 3119 size_t ppasize = (pages + 1) * sizeof (page_t *); 3120 caddr_t a = lpgaddr; 3121 caddr_t maxlpgeaddr = lpgeaddr; 3122 u_offset_t off = svd->offset + (uintptr_t)(a - seg->s_base); 3123 ulong_t aindx = svd->anon_index + seg_page(seg, a); 3124 struct vpage *vpage = (svd->vpage != NULL) ? 3125 &svd->vpage[seg_page(seg, a)] : NULL; 3126 vnode_t *vp = svd->vp; 3127 page_t **ppa; 3128 uint_t pszc; 3129 size_t ppgsz; 3130 pgcnt_t ppages; 3131 faultcode_t err = 0; 3132 int ierr; 3133 int vop_size_err = 0; 3134 uint_t protchk, prot, vpprot; 3135 ulong_t i; 3136 int hat_flag = (type == F_SOFTLOCK) ? 
HAT_LOAD_LOCK : HAT_LOAD; 3137 anon_sync_obj_t an_cookie; 3138 enum seg_rw arw; 3139 int alloc_failed = 0; 3140 int adjszc_chk; 3141 struct vattr va; 3142 int xhat = 0; 3143 page_t *pplist; 3144 pfn_t pfn; 3145 int physcontig; 3146 int upgrdfail; 3147 int segvn_anypgsz_vnode = 0; /* for now map vnode with 2 page sizes */ 3148 3149 ASSERT(szc != 0); 3150 ASSERT(vp != NULL); 3151 ASSERT(brkcow == 0 || amp != NULL); 3152 ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */ 3153 ASSERT(!(svd->flags & MAP_NORESERVE)); 3154 ASSERT(type != F_SOFTUNLOCK); 3155 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 3156 ASSERT(amp == NULL || IS_P2ALIGNED(aindx, maxpages)); 3157 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 3158 ASSERT(seg->s_szc < NBBY * sizeof (int)); 3159 3160 VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltvnpages[0]); 3161 VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltvnpages[1]); 3162 3163 if (svd->flags & MAP_TEXT) { 3164 hat_flag |= HAT_LOAD_TEXT; 3165 } 3166 3167 if (svd->pageprot) { 3168 switch (rw) { 3169 case S_READ: 3170 protchk = PROT_READ; 3171 break; 3172 case S_WRITE: 3173 protchk = PROT_WRITE; 3174 break; 3175 case S_EXEC: 3176 protchk = PROT_EXEC; 3177 break; 3178 case S_OTHER: 3179 default: 3180 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 3181 break; 3182 } 3183 } else { 3184 prot = svd->prot; 3185 /* caller has already done segment level protection check. */ 3186 } 3187 3188 if (seg->s_as->a_hat != hat) { 3189 xhat = 1; 3190 } 3191 3192 if (rw == S_WRITE && segtype == MAP_PRIVATE) { 3193 SEGVN_VMSTAT_FLTVNPAGES(2); 3194 arw = S_READ; 3195 } else { 3196 arw = rw; 3197 } 3198 3199 ppa = kmem_alloc(ppasize, KM_SLEEP); 3200 3201 VM_STAT_COND_ADD(amp != NULL, segvnvmstats.fltvnpages[3]); 3202 3203 for (;;) { 3204 adjszc_chk = 0; 3205 for (; a < lpgeaddr; a += pgsz, off += pgsz, aindx += pages) { 3206 if (adjszc_chk) { 3207 while (szc < seg->s_szc) { 3208 uintptr_t e; 3209 uint_t tszc; 3210 tszc = segvn_anypgsz_vnode ? 
szc + 1 : 3211 seg->s_szc; 3212 ppgsz = page_get_pagesize(tszc); 3213 if (!IS_P2ALIGNED(a, ppgsz) || 3214 ((alloc_failed >> tszc) & 3215 0x1)) { 3216 break; 3217 } 3218 SEGVN_VMSTAT_FLTVNPAGES(4); 3219 szc = tszc; 3220 pgsz = ppgsz; 3221 pages = btop(pgsz); 3222 e = P2ROUNDUP((uintptr_t)eaddr, pgsz); 3223 lpgeaddr = (caddr_t)e; 3224 } 3225 } 3226 3227 again: 3228 if (IS_P2ALIGNED(a, maxpgsz) && amp != NULL) { 3229 ASSERT(IS_P2ALIGNED(aindx, maxpages)); 3230 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 3231 anon_array_enter(amp, aindx, &an_cookie); 3232 if (anon_get_ptr(amp->ahp, aindx) != NULL) { 3233 SEGVN_VMSTAT_FLTVNPAGES(5); 3234 if (anon_pages(amp->ahp, aindx, 3235 maxpages) != maxpages) { 3236 panic("segvn_fault_vnodepages:" 3237 " empty anon slots\n"); 3238 } 3239 anon_array_exit(&an_cookie); 3240 ANON_LOCK_EXIT(&->a_rwlock); 3241 err = segvn_fault_anonpages(hat, seg, 3242 a, a + maxpgsz, type, rw, 3243 MAX(a, addr), 3244 MIN(a + maxpgsz, eaddr), brkcow); 3245 if (err != 0) { 3246 SEGVN_VMSTAT_FLTVNPAGES(6); 3247 goto out; 3248 } 3249 if (szc < seg->s_szc) { 3250 szc = seg->s_szc; 3251 pgsz = maxpgsz; 3252 pages = maxpages; 3253 lpgeaddr = maxlpgeaddr; 3254 } 3255 goto next; 3256 } else if (anon_pages(amp->ahp, aindx, 3257 maxpages)) { 3258 panic("segvn_fault_vnodepages:" 3259 " non empty anon slots\n"); 3260 } else { 3261 SEGVN_VMSTAT_FLTVNPAGES(7); 3262 anon_array_exit(&an_cookie); 3263 ANON_LOCK_EXIT(&->a_rwlock); 3264 } 3265 } 3266 ASSERT(!brkcow || IS_P2ALIGNED(a, maxpgsz)); 3267 3268 if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) { 3269 ASSERT(vpage != NULL); 3270 prot = VPP_PROT(vpage); 3271 ASSERT(sameprot(seg, a, maxpgsz)); 3272 if ((prot & protchk) == 0) { 3273 SEGVN_VMSTAT_FLTVNPAGES(8); 3274 err = FC_PROT; 3275 goto out; 3276 } 3277 } 3278 if (type == F_SOFTLOCK) { 3279 mutex_enter(&freemem_lock); 3280 if (availrmem < tune.t_minarmem + pages) { 3281 mutex_exit(&freemem_lock); 3282 err = FC_MAKE_ERR(ENOMEM); 3283 goto out; 3284 } else { 3285 availrmem -= pages; 3286 segvn_pages_locked += pages; 3287 svd->softlockcnt += pages; 3288 } 3289 mutex_exit(&freemem_lock); 3290 } 3291 3292 pplist = NULL; 3293 physcontig = 0; 3294 ppa[0] = NULL; 3295 if (!brkcow && szc && 3296 !page_exists_physcontig(vp, off, szc, 3297 segtype == MAP_PRIVATE ? 
ppa : NULL)) { 3298 SEGVN_VMSTAT_FLTVNPAGES(9); 3299 if (page_alloc_pages(seg, a, &pplist, NULL, 3300 szc, 0)) { 3301 SEGVN_RESTORE_SOFTLOCK(type, pages); 3302 SEGVN_VMSTAT_FLTVNPAGES(10); 3303 pszc = 0; 3304 ierr = -1; 3305 alloc_failed |= (1 << szc); 3306 break; 3307 } 3308 if (vp->v_mpssdata == SEGVN_PAGEIO) { 3309 int downsize; 3310 SEGVN_VMSTAT_FLTVNPAGES(11); 3311 physcontig = segvn_fill_vp_pages(svd, 3312 vp, off, szc, ppa, &pplist, 3313 &pszc, &downsize); 3314 ASSERT(!physcontig || pplist == NULL); 3315 if (!physcontig && downsize) { 3316 SEGVN_RESTORE_SOFTLOCK(type, 3317 pages); 3318 ASSERT(pplist == NULL); 3319 SEGVN_VMSTAT_FLTVNPAGES(12); 3320 ierr = -1; 3321 break; 3322 } 3323 ASSERT(!physcontig || 3324 segtype == MAP_PRIVATE || 3325 ppa[0] == NULL); 3326 if (physcontig && ppa[0] == NULL) { 3327 physcontig = 0; 3328 } 3329 } 3330 } else if (!brkcow && szc && ppa[0] != NULL) { 3331 SEGVN_VMSTAT_FLTVNPAGES(13); 3332 ASSERT(segtype == MAP_PRIVATE); 3333 physcontig = 1; 3334 } 3335 3336 if (!physcontig) { 3337 SEGVN_VMSTAT_FLTVNPAGES(14); 3338 ppa[0] = NULL; 3339 ierr = VOP_GETPAGE(vp, (offset_t)off, pgsz, 3340 &vpprot, ppa, pgsz, seg, a, arw, 3341 svd->cred); 3342 if (segtype == MAP_PRIVATE) { 3343 SEGVN_VMSTAT_FLTVNPAGES(15); 3344 vpprot &= ~PROT_WRITE; 3345 } 3346 } else { 3347 ASSERT(segtype == MAP_PRIVATE); 3348 SEGVN_VMSTAT_FLTVNPAGES(16); 3349 vpprot = PROT_ALL & ~PROT_WRITE; 3350 ierr = 0; 3351 } 3352 3353 if (ierr != 0) { 3354 SEGVN_VMSTAT_FLTVNPAGES(17); 3355 if (pplist != NULL) { 3356 SEGVN_VMSTAT_FLTVNPAGES(18); 3357 page_free_replacement_page(pplist); 3358 page_create_putback(pages); 3359 } 3360 SEGVN_RESTORE_SOFTLOCK(type, pages); 3361 if (a + pgsz <= eaddr) { 3362 SEGVN_VMSTAT_FLTVNPAGES(19); 3363 err = FC_MAKE_ERR(ierr); 3364 goto out; 3365 } 3366 va.va_mask = AT_SIZE; 3367 if (VOP_GETATTR(vp, &va, 0, svd->cred) != 0) { 3368 SEGVN_VMSTAT_FLTVNPAGES(20); 3369 err = FC_MAKE_ERR(EIO); 3370 goto out; 3371 } 3372 if (btopr(va.va_size) >= btopr(off + pgsz)) { 3373 SEGVN_VMSTAT_FLTVNPAGES(21); 3374 err = FC_MAKE_ERR(EIO); 3375 goto out; 3376 } 3377 if (btopr(va.va_size) < 3378 btopr(off + (eaddr - a))) { 3379 SEGVN_VMSTAT_FLTVNPAGES(22); 3380 err = FC_MAKE_ERR(EIO); 3381 goto out; 3382 } 3383 if (brkcow || type == F_SOFTLOCK) { 3384 /* can't reduce map area */ 3385 SEGVN_VMSTAT_FLTVNPAGES(23); 3386 vop_size_err = 1; 3387 goto out; 3388 } 3389 SEGVN_VMSTAT_FLTVNPAGES(24); 3390 ASSERT(szc != 0); 3391 pszc = 0; 3392 ierr = -1; 3393 break; 3394 } 3395 3396 if (amp != NULL) { 3397 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 3398 anon_array_enter(amp, aindx, &an_cookie); 3399 } 3400 if (amp != NULL && 3401 anon_get_ptr(amp->ahp, aindx) != NULL) { 3402 ulong_t taindx = P2ALIGN(aindx, maxpages); 3403 3404 SEGVN_VMSTAT_FLTVNPAGES(25); 3405 if (anon_pages(amp->ahp, taindx, maxpages) != 3406 maxpages) { 3407 panic("segvn_fault_vnodepages:" 3408 " empty anon slots\n"); 3409 } 3410 for (i = 0; i < pages; i++) { 3411 page_unlock(ppa[i]); 3412 } 3413 anon_array_exit(&an_cookie); 3414 ANON_LOCK_EXIT(&->a_rwlock); 3415 if (pplist != NULL) { 3416 page_free_replacement_page(pplist); 3417 page_create_putback(pages); 3418 } 3419 SEGVN_RESTORE_SOFTLOCK(type, pages); 3420 if (szc < seg->s_szc) { 3421 SEGVN_VMSTAT_FLTVNPAGES(26); 3422 /* 3423 * For private segments SOFTLOCK 3424 * either always breaks cow (any rw 3425 * type except S_READ_NOCOW) or 3426 * address space is locked as writer 3427 * (S_READ_NOCOW case) and anon slots 3428 * can't show up on second check. 
3429 * Therefore if we are here for
3430 * SOFTLOCK case it must be a cow
3431 * break but cow break never reduces
3432 * szc. Thus the assert below.
3433 */
3434 ASSERT(!brkcow && type != F_SOFTLOCK);
3435 pszc = seg->s_szc;
3436 ierr = -2;
3437 break;
3438 }
3439 ASSERT(IS_P2ALIGNED(a, maxpgsz));
3440 goto again;
3441 }
3442 #ifdef DEBUG
3443 if (amp != NULL) {
3444 ulong_t taindx = P2ALIGN(aindx, maxpages);
3445 ASSERT(!anon_pages(amp->ahp, taindx, maxpages));
3446 }
3447 #endif /* DEBUG */
3448 
3449 if (brkcow) {
3450 ASSERT(amp != NULL);
3451 ASSERT(pplist == NULL);
3452 ASSERT(szc == seg->s_szc);
3453 ASSERT(IS_P2ALIGNED(a, maxpgsz));
3454 ASSERT(IS_P2ALIGNED(aindx, maxpages));
3455 SEGVN_VMSTAT_FLTVNPAGES(27);
3456 ierr = anon_map_privatepages(amp, aindx, szc,
3457 seg, a, prot, ppa, vpage, segvn_anypgsz,
3458 svd->cred);
3459 if (ierr != 0) {
3460 SEGVN_VMSTAT_FLTVNPAGES(28);
3461 anon_array_exit(&an_cookie);
3462 ANON_LOCK_EXIT(&amp->a_rwlock);
3463 SEGVN_RESTORE_SOFTLOCK(type, pages);
3464 err = FC_MAKE_ERR(ierr);
3465 goto out;
3466 }
3467 
3468 ASSERT(!IS_VMODSORT(ppa[0]->p_vnode));
3469 /*
3470 * p_szc can't be changed for locked
3471 * swapfs pages.
3472 */
3473 hat_memload_array(hat, a, pgsz, ppa, prot,
3474 hat_flag);
3475 
3476 if (!(hat_flag & HAT_LOAD_LOCK)) {
3477 SEGVN_VMSTAT_FLTVNPAGES(29);
3478 for (i = 0; i < pages; i++) {
3479 page_unlock(ppa[i]);
3480 }
3481 }
3482 anon_array_exit(&an_cookie);
3483 ANON_LOCK_EXIT(&amp->a_rwlock);
3484 goto next;
3485 }
3486 
3487 pfn = page_pptonum(ppa[0]);
3488 /*
3489 * hat_page_demote() needs an EXCL lock on one of
3490 * constituent page_t's and it decreases root's p_szc
3491 * last. This means if root's p_szc is equal szc and
3492 * all its constituent pages are locked
3493 * hat_page_demote() that could have changed p_szc to
3494 * szc is already done and no new hat_page_demote()
3495 * can start for this large page.
3496 */
3497 
3498 /*
3499 * we need to make sure same mapping size is used for
3500 * the same address range if there's a possibility the
3501 * address is already mapped because hat layer panics
3502 * when translation is loaded for the range already
3503 * mapped with a different page size. We achieve it
3504 * by always using largest page size possible subject
3505 * to the constraints of page size, segment page size
3506 * and page alignment. Since mappings are invalidated
3507 * when those constraints change and make it
3508 * impossible to use previously used mapping size no
3509 * mapping size conflicts should happen.
3510 */
3511 
3512 chkszc:
3513 if ((pszc = ppa[0]->p_szc) == szc &&
3514 IS_P2ALIGNED(pfn, pages)) {
3515 
3516 SEGVN_VMSTAT_FLTVNPAGES(30);
3517 #ifdef DEBUG
3518 for (i = 0; i < pages; i++) {
3519 ASSERT(PAGE_LOCKED(ppa[i]));
3520 ASSERT(!PP_ISFREE(ppa[i]));
3521 ASSERT(page_pptonum(ppa[i]) ==
3522 pfn + i);
3523 ASSERT(ppa[i]->p_szc == szc);
3524 ASSERT(ppa[i]->p_vnode == vp);
3525 ASSERT(ppa[i]->p_offset ==
3526 off + (i << PAGESHIFT));
3527 }
3528 #endif
3529 /*
3530 * All pages are of szc we need and they are
3531 * all locked so they can't change szc. load
3532 * translations.
3533 *
3534 * if page got promoted since last check
3535 * we don't need pplist.
3536 */ 3537 if (pplist != NULL) { 3538 page_free_replacement_page(pplist); 3539 page_create_putback(pages); 3540 } 3541 if (PP_ISMIGRATE(ppa[0])) { 3542 page_migrate(seg, a, ppa, pages); 3543 } 3544 SEGVN_UPDATE_MODBITS(ppa, pages, rw, 3545 prot, vpprot); 3546 if (!xhat) { 3547 hat_memload_array(hat, a, pgsz, ppa, 3548 prot & vpprot, hat_flag); 3549 } else { 3550 /* 3551 * avoid large xhat mappings to FS 3552 * pages so that hat_page_demote() 3553 * doesn't need to check for xhat 3554 * large mappings. 3555 */ 3556 for (i = 0; i < pages; i++) { 3557 hat_memload(hat, 3558 a + (i << PAGESHIFT), 3559 ppa[i], prot & vpprot, 3560 hat_flag); 3561 } 3562 } 3563 3564 if (!(hat_flag & HAT_LOAD_LOCK)) { 3565 for (i = 0; i < pages; i++) { 3566 page_unlock(ppa[i]); 3567 } 3568 } 3569 if (amp != NULL) { 3570 anon_array_exit(&an_cookie); 3571 ANON_LOCK_EXIT(&->a_rwlock); 3572 } 3573 goto next; 3574 } 3575 3576 /* 3577 * See if upsize is possible. 3578 */ 3579 if (pszc > szc && szc < seg->s_szc && 3580 (segvn_anypgsz_vnode || pszc >= seg->s_szc)) { 3581 pgcnt_t aphase; 3582 uint_t pszc1 = MIN(pszc, seg->s_szc); 3583 ppgsz = page_get_pagesize(pszc1); 3584 ppages = btop(ppgsz); 3585 aphase = btop(P2PHASE((uintptr_t)a, ppgsz)); 3586 3587 SEGVN_VMSTAT_FLTVNPAGES(31); 3588 if (aphase != P2PHASE(pfn, ppages)) { 3589 segvn_faultvnmpss_align_err4++; 3590 } else if (type == F_SOFTLOCK && 3591 a != lpgaddr && 3592 !IS_P2ALIGNED(pfn, 3593 page_get_pagecnt(ppa[0]->p_szc))) { 3594 /* 3595 * if we locked previous offsets for 3596 * smaller szc page larger page can't 3597 * be here since one needs excl locks 3598 * to promote page size. 3599 */ 3600 panic("segvn_fault_vnodepages: " 3601 "unexpected larger than szc page" 3602 " found after SOFTLOCK"); 3603 } else { 3604 SEGVN_VMSTAT_FLTVNPAGES(32); 3605 if (pplist != NULL) { 3606 page_t *pl = pplist; 3607 page_free_replacement_page(pl); 3608 page_create_putback(pages); 3609 } 3610 for (i = 0; i < pages; i++) { 3611 page_unlock(ppa[i]); 3612 } 3613 if (amp != NULL) { 3614 anon_array_exit(&an_cookie); 3615 ANON_LOCK_EXIT(&->a_rwlock); 3616 } 3617 SEGVN_RESTORE_SOFTLOCK(type, pages); 3618 pszc = pszc1; 3619 ierr = -2; 3620 break; 3621 } 3622 } 3623 3624 /* 3625 * check if we should use smallest mapping size. 3626 */ 3627 upgrdfail = 0; 3628 if (szc == 0 || xhat || 3629 (pszc >= szc && 3630 !IS_P2ALIGNED(pfn, pages)) || 3631 (pszc < szc && 3632 !segvn_full_szcpages(ppa, szc, &upgrdfail, 3633 &pszc))) { 3634 3635 if (upgrdfail) { 3636 /* 3637 * segvn_full_szcpages failed to lock 3638 * all pages EXCL. Size down. 
3639 */ 3640 ASSERT(pszc < szc); 3641 3642 SEGVN_VMSTAT_FLTVNPAGES(33); 3643 3644 if (pplist != NULL) { 3645 page_t *pl = pplist; 3646 page_free_replacement_page(pl); 3647 page_create_putback(pages); 3648 } 3649 3650 for (i = 0; i < pages; i++) { 3651 page_unlock(ppa[i]); 3652 } 3653 if (amp != NULL) { 3654 anon_array_exit(&an_cookie); 3655 ANON_LOCK_EXIT(&->a_rwlock); 3656 } 3657 SEGVN_RESTORE_SOFTLOCK(type, pages); 3658 ierr = -1; 3659 break; 3660 } 3661 if (szc != 0 && !xhat) { 3662 segvn_faultvnmpss_align_err5++; 3663 } 3664 SEGVN_VMSTAT_FLTVNPAGES(34); 3665 if (pplist != NULL) { 3666 page_free_replacement_page(pplist); 3667 page_create_putback(pages); 3668 } 3669 SEGVN_UPDATE_MODBITS(ppa, pages, rw, 3670 prot, vpprot); 3671 for (i = 0; i < pages; i++) { 3672 hat_memload(hat, a + (i << PAGESHIFT), 3673 ppa[i], prot & vpprot, hat_flag); 3674 } 3675 if (!(hat_flag & HAT_LOAD_LOCK)) { 3676 for (i = 0; i < pages; i++) { 3677 page_unlock(ppa[i]); 3678 } 3679 } 3680 if (amp != NULL) { 3681 anon_array_exit(&an_cookie); 3682 ANON_LOCK_EXIT(&->a_rwlock); 3683 } 3684 goto next; 3685 } 3686 3687 if (pszc == szc) { 3688 /* 3689 * segvn_full_szcpages() upgraded pages szc. 3690 */ 3691 ASSERT(pszc == ppa[0]->p_szc); 3692 ASSERT(IS_P2ALIGNED(pfn, pages)); 3693 goto chkszc; 3694 } 3695 3696 if (pszc > szc) { 3697 kmutex_t *szcmtx; 3698 SEGVN_VMSTAT_FLTVNPAGES(35); 3699 /* 3700 * p_szc of ppa[0] can change since we haven't 3701 * locked all constituent pages. Call 3702 * page_lock_szc() to prevent szc changes. 3703 * This should be a rare case that happens when 3704 * multiple segments use a different page size 3705 * to map the same file offsets. 3706 */ 3707 szcmtx = page_szc_lock(ppa[0]); 3708 pszc = ppa[0]->p_szc; 3709 ASSERT(szcmtx != NULL || pszc == 0); 3710 ASSERT(ppa[0]->p_szc <= pszc); 3711 if (pszc <= szc) { 3712 SEGVN_VMSTAT_FLTVNPAGES(36); 3713 if (szcmtx != NULL) { 3714 mutex_exit(szcmtx); 3715 } 3716 goto chkszc; 3717 } 3718 if (pplist != NULL) { 3719 /* 3720 * page got promoted since last check. 3721 * we don't need preaalocated large 3722 * page. 3723 */ 3724 SEGVN_VMSTAT_FLTVNPAGES(37); 3725 page_free_replacement_page(pplist); 3726 page_create_putback(pages); 3727 } 3728 SEGVN_UPDATE_MODBITS(ppa, pages, rw, 3729 prot, vpprot); 3730 hat_memload_array(hat, a, pgsz, ppa, 3731 prot & vpprot, hat_flag); 3732 mutex_exit(szcmtx); 3733 if (!(hat_flag & HAT_LOAD_LOCK)) { 3734 for (i = 0; i < pages; i++) { 3735 page_unlock(ppa[i]); 3736 } 3737 } 3738 if (amp != NULL) { 3739 anon_array_exit(&an_cookie); 3740 ANON_LOCK_EXIT(&->a_rwlock); 3741 } 3742 goto next; 3743 } 3744 3745 /* 3746 * if page got demoted since last check 3747 * we could have not allocated larger page. 3748 * allocate now. 
3749 */
3750 if (pplist == NULL &&
3751 page_alloc_pages(seg, a, &pplist, NULL, szc, 0)) {
3752 SEGVN_VMSTAT_FLTVNPAGES(38);
3753 for (i = 0; i < pages; i++) {
3754 page_unlock(ppa[i]);
3755 }
3756 if (amp != NULL) {
3757 anon_array_exit(&an_cookie);
3758 ANON_LOCK_EXIT(&amp->a_rwlock);
3759 }
3760 SEGVN_RESTORE_SOFTLOCK(type, pages);
3761 ierr = -1;
3762 alloc_failed |= (1 << szc);
3763 break;
3764 }
3765 
3766 SEGVN_VMSTAT_FLTVNPAGES(39);
3767 
3768 segvn_relocate_pages(ppa, pplist);
3769 
3770 SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot);
3771 hat_memload_array(hat, a, pgsz, ppa, prot & vpprot,
3772 hat_flag);
3773 if (!(hat_flag & HAT_LOAD_LOCK)) {
3774 for (i = 0; i < pages; i++) {
3775 ASSERT(PAGE_SHARED(ppa[i]));
3776 page_unlock(ppa[i]);
3777 }
3778 }
3779 if (amp != NULL) {
3780 anon_array_exit(&an_cookie);
3781 ANON_LOCK_EXIT(&amp->a_rwlock);
3782 }
3783 
3784 next:
3785 if (vpage != NULL) {
3786 vpage += pages;
3787 }
3788 adjszc_chk = 1;
3789 }
3790 if (a == lpgeaddr)
3791 break;
3792 ASSERT(a < lpgeaddr);
3793 /*
3794 * ierr == -1 means we failed to map with a large page.
3795 * (either due to allocation/relocation failures or
3796 * misalignment with other mappings to this file).
3797 *
3798 * ierr == -2 means some other thread allocated a large page
3799 * after we gave up to map with a large page. retry with
3800 * larger mapping.
3801 */
3802 ASSERT(ierr == -1 || ierr == -2);
3803 ASSERT(ierr == -2 || szc != 0);
3804 ASSERT(ierr == -1 || szc < seg->s_szc);
3805 if (ierr == -2) {
3806 SEGVN_VMSTAT_FLTVNPAGES(40);
3807 ASSERT(pszc > szc && pszc <= seg->s_szc);
3808 szc = pszc;
3809 } else if (segvn_anypgsz_vnode) {
3810 SEGVN_VMSTAT_FLTVNPAGES(41);
3811 szc--;
3812 } else {
3813 SEGVN_VMSTAT_FLTVNPAGES(42);
3814 ASSERT(pszc < szc);
3815 /*
3816 * other process created pszc large page.
3817 * but we still have to drop to 0 szc.
3818 */
3819 szc = 0;
3820 }
3821 
3822 pgsz = page_get_pagesize(szc);
3823 pages = btop(pgsz);
3824 ASSERT(type != F_SOFTLOCK || ierr == -1 ||
3825 (IS_P2ALIGNED(a, pgsz) && IS_P2ALIGNED(lpgeaddr, pgsz)));
3826 if (type == F_SOFTLOCK) {
3827 /*
3828 * For softlocks we cannot reduce the fault area
3829 * (calculated based on the largest page size for this
3830 * segment) for size down and a is already next
3831 * page size aligned as asserted above for size
3832 * ups. Therefore just continue in case of softlock.
3833 */
3834 SEGVN_VMSTAT_FLTVNPAGES(43);
3835 continue; /* keep lint happy */
3836 } else if (ierr == -2) {
3837 
3838 /*
3839 * Size up case. Note lpgaddr may only be needed for
3840 * softlock case so we don't adjust it here.
3841 */
3842 a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz);
3843 ASSERT(a >= lpgaddr);
3844 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz);
3845 off = svd->offset + (uintptr_t)(a - seg->s_base);
3846 aindx = svd->anon_index + seg_page(seg, a);
3847 vpage = (svd->vpage != NULL) ?
3848 &svd->vpage[seg_page(seg, a)] : NULL;
3849 } else {
3850 /*
3851 * Size down case. Note lpgaddr may only be needed for
3852 * softlock case so we don't adjust it here.
3853 */
3854 ASSERT(IS_P2ALIGNED(a, pgsz));
3855 ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz));
3856 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz);
3857 ASSERT(a < lpgeaddr);
3858 if (a < addr) {
3859 SEGVN_VMSTAT_FLTVNPAGES(44);
3860 /*
3861 * The beginning of the large page region can
3862 * be pulled to the right to make a smaller
3863 * region. We haven't yet faulted a single
3864 * page.
3865 */ 3866 a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 3867 ASSERT(a >= lpgaddr); 3868 off = svd->offset + 3869 (uintptr_t)(a - seg->s_base); 3870 aindx = svd->anon_index + seg_page(seg, a); 3871 vpage = (svd->vpage != NULL) ? 3872 &svd->vpage[seg_page(seg, a)] : NULL; 3873 } 3874 } 3875 } 3876 out: 3877 kmem_free(ppa, ppasize); 3878 if (!err && !vop_size_err) { 3879 SEGVN_VMSTAT_FLTVNPAGES(45); 3880 return (0); 3881 } 3882 if (type == F_SOFTLOCK && a > lpgaddr) { 3883 SEGVN_VMSTAT_FLTVNPAGES(46); 3884 segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER); 3885 } 3886 if (!vop_size_err) { 3887 SEGVN_VMSTAT_FLTVNPAGES(47); 3888 return (err); 3889 } 3890 ASSERT(brkcow || type == F_SOFTLOCK); 3891 /* 3892 * Large page end is mapped beyond the end of file and it's a cow 3893 * fault or softlock so we can't reduce the map area. For now just 3894 * demote the segment. This should really only happen if the end of 3895 * the file changed after the mapping was established since when large 3896 * page segments are created we make sure they don't extend beyond the 3897 * end of the file. 3898 */ 3899 SEGVN_VMSTAT_FLTVNPAGES(48); 3900 3901 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 3902 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 3903 err = 0; 3904 if (seg->s_szc != 0) { 3905 err = segvn_clrszc(seg); 3906 if (err != 0) { 3907 segvn_fltvnpages_clrszc_err++; 3908 } 3909 } 3910 ASSERT(err || seg->s_szc == 0); 3911 SEGVN_LOCK_DOWNGRADE(seg->s_as, &svd->lock); 3912 /* segvn_fault will do its job as if szc had been zero to begin with */ 3913 return (err == 0 ? IE_RETRY : FC_MAKE_ERR(err)); 3914 } 3915 3916 /* 3917 * This routine will attempt to fault in one large page. 3918 * it will use smaller pages if that fails. 3919 * It should only be called for pure anonymous segments. 3920 */ 3921 static faultcode_t 3922 segvn_fault_anonpages(struct hat *hat, struct seg *seg, caddr_t lpgaddr, 3923 caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr, 3924 caddr_t eaddr, int brkcow) 3925 { 3926 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 3927 struct anon_map *amp = svd->amp; 3928 uchar_t segtype = svd->type; 3929 uint_t szc = seg->s_szc; 3930 size_t pgsz = page_get_pagesize(szc); 3931 size_t maxpgsz = pgsz; 3932 pgcnt_t pages = btop(pgsz); 3933 size_t ppasize = pages * sizeof (page_t *); 3934 caddr_t a = lpgaddr; 3935 ulong_t aindx = svd->anon_index + seg_page(seg, a); 3936 struct vpage *vpage = (svd->vpage != NULL) ? 3937 &svd->vpage[seg_page(seg, a)] : NULL; 3938 page_t **ppa; 3939 uint_t ppa_szc; 3940 faultcode_t err; 3941 int ierr; 3942 uint_t protchk, prot, vpprot; 3943 int i; 3944 int hat_flag = (type == F_SOFTLOCK) ? 
HAT_LOAD_LOCK : HAT_LOAD; 3945 anon_sync_obj_t cookie; 3946 3947 ASSERT(szc != 0); 3948 ASSERT(amp != NULL); 3949 ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */ 3950 ASSERT(!(svd->flags & MAP_NORESERVE)); 3951 ASSERT(type != F_SOFTUNLOCK); 3952 ASSERT(segtype == MAP_PRIVATE); 3953 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 3954 3955 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 3956 3957 VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltanpages[0]); 3958 VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltanpages[1]); 3959 3960 if (svd->flags & MAP_TEXT) { 3961 hat_flag |= HAT_LOAD_TEXT; 3962 } 3963 3964 if (svd->pageprot) { 3965 switch (rw) { 3966 case S_READ: 3967 protchk = PROT_READ; 3968 break; 3969 case S_WRITE: 3970 protchk = PROT_WRITE; 3971 break; 3972 case S_EXEC: 3973 protchk = PROT_EXEC; 3974 break; 3975 case S_OTHER: 3976 default: 3977 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 3978 break; 3979 } 3980 VM_STAT_ADD(segvnvmstats.fltanpages[2]); 3981 } else { 3982 prot = svd->prot; 3983 /* caller has already done segment level protection check. */ 3984 } 3985 3986 ppa = kmem_alloc(ppasize, KM_SLEEP); 3987 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 3988 for (;;) { 3989 for (; a < lpgeaddr; a += pgsz, aindx += pages) { 3990 if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) { 3991 VM_STAT_ADD(segvnvmstats.fltanpages[3]); 3992 ASSERT(vpage != NULL); 3993 prot = VPP_PROT(vpage); 3994 ASSERT(sameprot(seg, a, maxpgsz)); 3995 if ((prot & protchk) == 0) { 3996 err = FC_PROT; 3997 goto error; 3998 } 3999 } 4000 if (type == F_SOFTLOCK) { 4001 mutex_enter(&freemem_lock); 4002 if (availrmem < tune.t_minarmem + pages) { 4003 mutex_exit(&freemem_lock); 4004 err = FC_MAKE_ERR(ENOMEM); 4005 goto error; 4006 } else { 4007 availrmem -= pages; 4008 segvn_pages_locked += pages; 4009 svd->softlockcnt += pages; 4010 } 4011 mutex_exit(&freemem_lock); 4012 } 4013 anon_array_enter(amp, aindx, &cookie); 4014 ppa_szc = (uint_t)-1; 4015 ierr = anon_map_getpages(amp, aindx, szc, seg, a, 4016 prot, &vpprot, ppa, &ppa_szc, vpage, rw, brkcow, 4017 segvn_anypgsz, svd->cred); 4018 if (ierr != 0) { 4019 anon_array_exit(&cookie); 4020 VM_STAT_ADD(segvnvmstats.fltanpages[4]); 4021 if (type == F_SOFTLOCK) { 4022 VM_STAT_ADD(segvnvmstats.fltanpages[5]); 4023 mutex_enter(&freemem_lock); 4024 availrmem += pages; 4025 segvn_pages_locked -= pages; 4026 svd->softlockcnt -= pages; 4027 mutex_exit(&freemem_lock); 4028 } 4029 if (ierr > 0) { 4030 VM_STAT_ADD(segvnvmstats.fltanpages[6]); 4031 err = FC_MAKE_ERR(ierr); 4032 goto error; 4033 } 4034 break; 4035 } 4036 4037 ASSERT(!IS_VMODSORT(ppa[0]->p_vnode)); 4038 4039 /* 4040 * Handle pages that have been marked for migration 4041 */ 4042 if (lgrp_optimizations()) 4043 page_migrate(seg, a, ppa, pages); 4044 4045 hat_memload_array(hat, a, pgsz, ppa, 4046 prot & vpprot, hat_flag); 4047 4048 if (hat_flag & HAT_LOAD_LOCK) { 4049 VM_STAT_ADD(segvnvmstats.fltanpages[7]); 4050 } else { 4051 VM_STAT_ADD(segvnvmstats.fltanpages[8]); 4052 for (i = 0; i < pages; i++) 4053 page_unlock(ppa[i]); 4054 } 4055 if (vpage != NULL) 4056 vpage += pages; 4057 4058 anon_array_exit(&cookie); 4059 } 4060 if (a == lpgeaddr) 4061 break; 4062 ASSERT(a < lpgeaddr); 4063 /* 4064 * ierr == -1 means we failed to allocate a large page. 4065 * so do a size down operation. 4066 * 4067 * ierr == -2 means some other process that privately shares 4068 * pages with this process has allocated a larger page and we 4069 * need to retry with larger pages. So do a size up 4070 * operation. 
This relies on the fact that large pages are 4071 * never partially shared i.e. if we share any constituent 4072 * page of a large page with another process we must share the 4073 * entire large page. Note this cannot happen for SOFTLOCK 4074 * case, unless current address (a) is at the beginning of the 4075 * next page size boundary because the other process couldn't 4076 * have relocated locked pages. 4077 */ 4078 ASSERT(ierr == -1 || ierr == -2); 4079 if (segvn_anypgsz) { 4080 ASSERT(ierr == -2 || szc != 0); 4081 ASSERT(ierr == -1 || szc < seg->s_szc); 4082 szc = (ierr == -1) ? szc - 1 : szc + 1; 4083 } else { 4084 /* 4085 * For non COW faults and segvn_anypgsz == 0 4086 * we need to be careful not to loop forever 4087 * if existing page is found with szc other 4088 * than 0 or seg->s_szc. This could be due 4089 * to page relocations on behalf of DR or 4090 * more likely large page creation. For this 4091 * case simply re-size to existing page's szc 4092 * if returned by anon_map_getpages(). 4093 */ 4094 if (ppa_szc == (uint_t)-1) { 4095 szc = (ierr == -1) ? 0 : seg->s_szc; 4096 } else { 4097 ASSERT(ppa_szc <= seg->s_szc); 4098 ASSERT(ierr == -2 || ppa_szc < szc); 4099 ASSERT(ierr == -1 || ppa_szc > szc); 4100 szc = ppa_szc; 4101 } 4102 } 4103 4104 pgsz = page_get_pagesize(szc); 4105 pages = btop(pgsz); 4106 ASSERT(type != F_SOFTLOCK || ierr == -1 || 4107 (IS_P2ALIGNED(a, pgsz) && IS_P2ALIGNED(lpgeaddr, pgsz))); 4108 if (type == F_SOFTLOCK) { 4109 /* 4110 * For softlocks we cannot reduce the fault area 4111 * (calculated based on the largest page size for this 4112 * segment) for size down and a is already next 4113 * page size aligned as assertted above for size 4114 * ups. Therefore just continue in case of softlock. 4115 */ 4116 VM_STAT_ADD(segvnvmstats.fltanpages[9]); 4117 continue; /* keep lint happy */ 4118 } else if (ierr == -2) { 4119 4120 /* 4121 * Size up case. Note lpgaddr may only be needed for 4122 * softlock case so we don't adjust it here. 4123 */ 4124 VM_STAT_ADD(segvnvmstats.fltanpages[10]); 4125 a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz); 4126 ASSERT(a >= lpgaddr); 4127 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4128 aindx = svd->anon_index + seg_page(seg, a); 4129 vpage = (svd->vpage != NULL) ? 4130 &svd->vpage[seg_page(seg, a)] : NULL; 4131 } else { 4132 /* 4133 * Size down case. Note lpgaddr may only be needed for 4134 * softlock case so we don't adjust it here. 4135 */ 4136 VM_STAT_ADD(segvnvmstats.fltanpages[11]); 4137 ASSERT(IS_P2ALIGNED(a, pgsz)); 4138 ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz)); 4139 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4140 ASSERT(a < lpgeaddr); 4141 if (a < addr) { 4142 /* 4143 * The beginning of the large page region can 4144 * be pulled to the right to make a smaller 4145 * region. We haven't yet faulted a single 4146 * page. 4147 */ 4148 VM_STAT_ADD(segvnvmstats.fltanpages[12]); 4149 a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 4150 ASSERT(a >= lpgaddr); 4151 aindx = svd->anon_index + seg_page(seg, a); 4152 vpage = (svd->vpage != NULL) ? 
4153 &svd->vpage[seg_page(seg, a)] : NULL; 4154 } 4155 } 4156 } 4157 VM_STAT_ADD(segvnvmstats.fltanpages[13]); 4158 ANON_LOCK_EXIT(&amp->a_rwlock); 4159 kmem_free(ppa, ppasize); 4160 return (0); 4161 error: 4162 VM_STAT_ADD(segvnvmstats.fltanpages[14]); 4163 ANON_LOCK_EXIT(&amp->a_rwlock); 4164 kmem_free(ppa, ppasize); 4165 if (type == F_SOFTLOCK && a > lpgaddr) { 4166 VM_STAT_ADD(segvnvmstats.fltanpages[15]); 4167 segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER); 4168 } 4169 return (err); 4170 } 4171 4172 int fltadvice = 1; /* set to free behind pages for sequential access */ 4173 4174 /* 4175 * This routine is called via a machine specific fault handling routine. 4176 * It is also called by software routines wishing to lock or unlock 4177 * a range of addresses. 4178 * 4179 * Here is the basic algorithm: 4180 * If unlocking 4181 * Call segvn_softunlock 4182 * Return 4183 * endif 4184 * Checking and set up work 4185 * If we will need some non-anonymous pages 4186 * Call VOP_GETPAGE over the range of non-anonymous pages 4187 * endif 4188 * Loop over all addresses requested 4189 * Call segvn_faultpage passing in page list 4190 * to load up translations and handle anonymous pages 4191 * endloop 4192 * Load up translation to any additional pages in page list not 4193 * already handled that fit into this segment 4194 */ 4195 static faultcode_t 4196 segvn_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len, 4197 enum fault_type type, enum seg_rw rw) 4198 { 4199 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 4200 page_t **plp, **ppp, *pp; 4201 u_offset_t off; 4202 caddr_t a; 4203 struct vpage *vpage; 4204 uint_t vpprot, prot; 4205 int err; 4206 page_t *pl[PVN_GETPAGE_NUM + 1]; 4207 size_t plsz, pl_alloc_sz; 4208 size_t page; 4209 ulong_t anon_index; 4210 struct anon_map *amp; 4211 int dogetpage = 0; 4212 caddr_t lpgaddr, lpgeaddr; 4213 size_t pgsz; 4214 anon_sync_obj_t cookie; 4215 int brkcow = BREAK_COW_SHARE(rw, type, svd->type); 4216 4217 /* 4218 * S_READ_NOCOW is like read 4219 * except caller advises no need 4220 * to copy-on-write for softlock 4221 * because it holds address space 4222 * locked as writer and thus prevents 4223 * any copy on writes of a softlocked 4224 * page by another thread. 4225 * S_READ_NOCOW vs S_READ distinction was 4226 * only needed for BREAK_COW_SHARE(). After 4227 * that we treat S_READ_NOCOW as just S_READ. 4228 */ 4229 if (rw == S_READ_NOCOW) { 4230 rw = S_READ; 4231 ASSERT(type == F_SOFTLOCK && 4232 AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 4233 } 4234 4235 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 4236 4237 /* 4238 * First handle the easy stuff 4239 */ 4240 if (type == F_SOFTUNLOCK) { 4241 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 4242 pgsz = (seg->s_szc == 0) ? PAGESIZE : 4243 page_get_pagesize(seg->s_szc); 4244 VM_STAT_COND_ADD(pgsz > PAGESIZE, segvnvmstats.fltanpages[16]); 4245 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 4246 segvn_softunlock(seg, lpgaddr, lpgeaddr - lpgaddr, rw); 4247 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4248 return (0); 4249 } 4250 4251 top: 4252 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 4253 4254 /* 4255 * If we have the same protections for the entire segment, 4256 * ensure that the access being attempted is legitimate.
4257 */ 4258 4259 if (svd->pageprot == 0) { 4260 uint_t protchk; 4261 4262 switch (rw) { 4263 case S_READ: 4264 protchk = PROT_READ; 4265 break; 4266 case S_WRITE: 4267 protchk = PROT_WRITE; 4268 break; 4269 case S_EXEC: 4270 protchk = PROT_EXEC; 4271 break; 4272 case S_OTHER: 4273 default: 4274 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 4275 break; 4276 } 4277 4278 if ((svd->prot & protchk) == 0) { 4279 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4280 return (FC_PROT); /* illegal access type */ 4281 } 4282 } 4283 4284 /* 4285 * Check to see if we need to allocate an anon_map structure. 4286 */ 4287 if (svd->amp == NULL && (svd->vp == NULL || brkcow)) { 4288 /* 4289 * Drop the "read" lock on the segment and acquire 4290 * the "write" version since we have to allocate the 4291 * anon_map. 4292 */ 4293 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4294 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4295 4296 if (svd->amp == NULL) { 4297 svd->amp = anonmap_alloc(seg->s_size, 0); 4298 svd->amp->a_szc = seg->s_szc; 4299 } 4300 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4301 4302 /* 4303 * Start all over again since segment protections 4304 * may have changed after we dropped the "read" lock. 4305 */ 4306 goto top; 4307 } 4308 4309 amp = svd->amp; 4310 4311 /* 4312 * MADV_SEQUENTIAL work is ignored for large page segments. 4313 */ 4314 if (seg->s_szc != 0) { 4315 pgsz = page_get_pagesize(seg->s_szc); 4316 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 4317 /* 4318 * We may need to do relocations so purge seg_pcache to allow 4319 * pages to be locked exclusively. 4320 */ 4321 if (svd->softlockcnt != 0) 4322 segvn_purge(seg); 4323 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 4324 if (svd->vp == NULL) { 4325 ASSERT(svd->type == MAP_PRIVATE); 4326 err = segvn_fault_anonpages(hat, seg, lpgaddr, 4327 lpgeaddr, type, rw, addr, addr + len, brkcow); 4328 } else { 4329 err = segvn_fault_vnodepages(hat, seg, lpgaddr, 4330 lpgeaddr, type, rw, addr, addr + len, brkcow); 4331 if (err == IE_RETRY) { 4332 ASSERT(seg->s_szc == 0); 4333 ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock)); 4334 goto cont; 4335 } 4336 } 4337 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4338 return (err); 4339 } 4340 4341 cont: 4342 page = seg_page(seg, addr); 4343 if (amp != NULL) { 4344 anon_index = svd->anon_index + page; 4345 4346 if ((type == F_PROT) && (rw == S_READ) && 4347 svd->type == MAP_PRIVATE && svd->pageprot == 0) { 4348 size_t index = anon_index; 4349 struct anon *ap; 4350 4351 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4352 /* 4353 * The fast path could apply to S_WRITE also, except 4354 * that the protection fault could be caused by lazy 4355 * tlb flush when ro->rw. In this case, the pte is 4356 * RW already. But RO in the other cpu's tlb causes 4357 * the fault. Since hat_chgprot won't do anything if 4358 * pte doesn't change, we may end up faulting 4359 * indefinitely until the RO tlb entry gets replaced. 
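 *
 * Rough sketch of the fast path taken below (conditions restated for
 * clarity; the code that follows is authoritative):
 *
 *	if (type == F_PROT && rw == S_READ && svd->type == MAP_PRIVATE &&
 *	    svd->pageprot == 0 &&
 *	    every anon slot in [addr, addr + len) exists with an_refcnt == 1)
 *		hat_chgprot(seg->s_as->a_hat, addr, len, svd->prot);
 *
 * i.e. a read protection fault on fully private, unshared anonymous
 * pages is resolved by reloading the segment protections instead of
 * walking the per-page fault path.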
4360 */ 4361 for (a = addr; a < addr + len; a += PAGESIZE, index++) { 4362 anon_array_enter(amp, index, &cookie); 4363 ap = anon_get_ptr(amp->ahp, index); 4364 anon_array_exit(&cookie); 4365 if ((ap == NULL) || (ap->an_refcnt != 1)) { 4366 ANON_LOCK_EXIT(&->a_rwlock); 4367 goto slow; 4368 } 4369 } 4370 hat_chgprot(seg->s_as->a_hat, addr, len, svd->prot); 4371 ANON_LOCK_EXIT(&->a_rwlock); 4372 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4373 return (0); 4374 } 4375 } 4376 slow: 4377 4378 if (svd->vpage == NULL) 4379 vpage = NULL; 4380 else 4381 vpage = &svd->vpage[page]; 4382 4383 off = svd->offset + (uintptr_t)(addr - seg->s_base); 4384 4385 /* 4386 * If MADV_SEQUENTIAL has been set for the particular page we 4387 * are faulting on, free behind all pages in the segment and put 4388 * them on the free list. 4389 */ 4390 if ((page != 0) && fltadvice) { /* not if first page in segment */ 4391 struct vpage *vpp; 4392 ulong_t fanon_index; 4393 size_t fpage; 4394 u_offset_t pgoff, fpgoff; 4395 struct vnode *fvp; 4396 struct anon *fap = NULL; 4397 4398 if (svd->advice == MADV_SEQUENTIAL || 4399 (svd->pageadvice && 4400 VPP_ADVICE(vpage) == MADV_SEQUENTIAL)) { 4401 pgoff = off - PAGESIZE; 4402 fpage = page - 1; 4403 if (vpage != NULL) 4404 vpp = &svd->vpage[fpage]; 4405 if (amp != NULL) 4406 fanon_index = svd->anon_index + fpage; 4407 4408 while (pgoff > svd->offset) { 4409 if (svd->advice != MADV_SEQUENTIAL && 4410 (!svd->pageadvice || (vpage && 4411 VPP_ADVICE(vpp) != MADV_SEQUENTIAL))) 4412 break; 4413 4414 /* 4415 * If this is an anon page, we must find the 4416 * correct <vp, offset> for it 4417 */ 4418 fap = NULL; 4419 if (amp != NULL) { 4420 ANON_LOCK_ENTER(&->a_rwlock, 4421 RW_READER); 4422 anon_array_enter(amp, fanon_index, 4423 &cookie); 4424 fap = anon_get_ptr(amp->ahp, 4425 fanon_index); 4426 if (fap != NULL) { 4427 swap_xlate(fap, &fvp, &fpgoff); 4428 } else { 4429 fpgoff = pgoff; 4430 fvp = svd->vp; 4431 } 4432 anon_array_exit(&cookie); 4433 ANON_LOCK_EXIT(&->a_rwlock); 4434 } else { 4435 fpgoff = pgoff; 4436 fvp = svd->vp; 4437 } 4438 if (fvp == NULL) 4439 break; /* XXX */ 4440 /* 4441 * Skip pages that are free or have an 4442 * "exclusive" lock. 4443 */ 4444 pp = page_lookup_nowait(fvp, fpgoff, SE_SHARED); 4445 if (pp == NULL) 4446 break; 4447 /* 4448 * We don't need the page_struct_lock to test 4449 * as this is only advisory; even if we 4450 * acquire it someone might race in and lock 4451 * the page after we unlock and before the 4452 * PUTPAGE, then VOP_PUTPAGE will do nothing. 4453 */ 4454 if (pp->p_lckcnt == 0 && pp->p_cowcnt == 0) { 4455 /* 4456 * Hold the vnode before releasing 4457 * the page lock to prevent it from 4458 * being freed and re-used by some 4459 * other thread. 4460 */ 4461 VN_HOLD(fvp); 4462 page_unlock(pp); 4463 /* 4464 * We should build a page list 4465 * to kluster putpages XXX 4466 */ 4467 (void) VOP_PUTPAGE(fvp, 4468 (offset_t)fpgoff, PAGESIZE, 4469 (B_DONTNEED|B_FREE|B_ASYNC), 4470 svd->cred); 4471 VN_RELE(fvp); 4472 } else { 4473 /* 4474 * XXX - Should the loop terminate if 4475 * the page is `locked'? 4476 */ 4477 page_unlock(pp); 4478 } 4479 --vpp; 4480 --fanon_index; 4481 pgoff -= PAGESIZE; 4482 } 4483 } 4484 } 4485 4486 plp = pl; 4487 *plp = NULL; 4488 pl_alloc_sz = 0; 4489 4490 /* 4491 * See if we need to call VOP_GETPAGE for 4492 * *any* of the range being faulted on. 4493 * We can skip all of this work if there 4494 * was no original vnode. 
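 *
 * Roughly (illustrative, hypothetical layout): with a vnode present,
 * if only some pages in the faulting range already have anon slots,
 * non_anon() below picks the smallest sub-range that still needs the
 * vnode and VOP_GETPAGE() is issued for just that sub-range; only when
 * every page is already backed by an anon slot (or there is no vnode
 * at all) is the VOP_GETPAGE() call skipped.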
4495 */ 4496 if (svd->vp != NULL) { 4497 u_offset_t vp_off; 4498 size_t vp_len; 4499 struct anon *ap; 4500 vnode_t *vp; 4501 4502 vp_off = off; 4503 vp_len = len; 4504 4505 if (amp == NULL) 4506 dogetpage = 1; 4507 else { 4508 /* 4509 * Only acquire reader lock to prevent amp->ahp 4510 * from being changed. It's ok to miss pages, 4511 * hence we don't do anon_array_enter 4512 */ 4513 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4514 ap = anon_get_ptr(amp->ahp, anon_index); 4515 4516 if (len <= PAGESIZE) 4517 /* inline non_anon() */ 4518 dogetpage = (ap == NULL); 4519 else 4520 dogetpage = non_anon(amp->ahp, anon_index, 4521 &vp_off, &vp_len); 4522 ANON_LOCK_EXIT(&->a_rwlock); 4523 } 4524 4525 if (dogetpage) { 4526 enum seg_rw arw; 4527 struct as *as = seg->s_as; 4528 4529 if (len > ptob((sizeof (pl) / sizeof (pl[0])) - 1)) { 4530 /* 4531 * Page list won't fit in local array, 4532 * allocate one of the needed size. 4533 */ 4534 pl_alloc_sz = 4535 (btop(len) + 1) * sizeof (page_t *); 4536 plp = kmem_alloc(pl_alloc_sz, KM_SLEEP); 4537 plp[0] = NULL; 4538 plsz = len; 4539 } else if (rw == S_WRITE && svd->type == MAP_PRIVATE || 4540 rw == S_OTHER || 4541 (((size_t)(addr + PAGESIZE) < 4542 (size_t)(seg->s_base + seg->s_size)) && 4543 hat_probe(as->a_hat, addr + PAGESIZE))) { 4544 /* 4545 * Ask VOP_GETPAGE to return the exact number 4546 * of pages if 4547 * (a) this is a COW fault, or 4548 * (b) this is a software fault, or 4549 * (c) next page is already mapped. 4550 */ 4551 plsz = len; 4552 } else { 4553 /* 4554 * Ask VOP_GETPAGE to return adjacent pages 4555 * within the segment. 4556 */ 4557 plsz = MIN((size_t)PVN_GETPAGE_SZ, (size_t) 4558 ((seg->s_base + seg->s_size) - addr)); 4559 ASSERT((addr + plsz) <= 4560 (seg->s_base + seg->s_size)); 4561 } 4562 4563 /* 4564 * Need to get some non-anonymous pages. 4565 * We need to make only one call to GETPAGE to do 4566 * this to prevent certain deadlocking conditions 4567 * when we are doing locking. In this case 4568 * non_anon() should have picked up the smallest 4569 * range which includes all the non-anonymous 4570 * pages in the requested range. We have to 4571 * be careful regarding which rw flag to pass in 4572 * because on a private mapping, the underlying 4573 * object is never allowed to be written. 4574 */ 4575 if (rw == S_WRITE && svd->type == MAP_PRIVATE) { 4576 arw = S_READ; 4577 } else { 4578 arw = rw; 4579 } 4580 vp = svd->vp; 4581 TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE, 4582 "segvn_getpage:seg %p addr %p vp %p", 4583 seg, addr, vp); 4584 err = VOP_GETPAGE(vp, (offset_t)vp_off, vp_len, 4585 &vpprot, plp, plsz, seg, addr + (vp_off - off), arw, 4586 svd->cred); 4587 if (err) { 4588 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4589 segvn_pagelist_rele(plp); 4590 if (pl_alloc_sz) 4591 kmem_free(plp, pl_alloc_sz); 4592 return (FC_MAKE_ERR(err)); 4593 } 4594 if (svd->type == MAP_PRIVATE) 4595 vpprot &= ~PROT_WRITE; 4596 } 4597 } 4598 4599 /* 4600 * N.B. at this time the plp array has all the needed non-anon 4601 * pages in addition to (possibly) having some adjacent pages. 4602 */ 4603 4604 /* 4605 * Always acquire the anon_array_lock to prevent 4606 * 2 threads from allocating separate anon slots for 4607 * the same "addr". 4608 * 4609 * If this is a copy-on-write fault and we don't already 4610 * have the anon_array_lock, acquire it to prevent the 4611 * fault routine from handling multiple copy-on-write faults 4612 * on the same "addr" in the same address space. 
4613 * 4614 * Only one thread should deal with the fault since after 4615 * it is handled, the other threads can acquire a translation 4616 * to the newly created private page. This prevents two or 4617 * more threads from creating different private pages for the 4618 * same fault. 4619 * 4620 * We grab "serialization" lock here if this is a MAP_PRIVATE segment 4621 * to prevent deadlock between this thread and another thread 4622 * which has soft-locked this page and wants to acquire serial_lock. 4623 * ( bug 4026339 ) 4624 * 4625 * The fix for bug 4026339 becomes unnecessary when using the 4626 * locking scheme with per amp rwlock and a global set of hash 4627 * lock, anon_array_lock. If we steal a vnode page when low 4628 * on memory and upgrad the page lock through page_rename, 4629 * then the page is PAGE_HANDLED, nothing needs to be done 4630 * for this page after returning from segvn_faultpage. 4631 * 4632 * But really, the page lock should be downgraded after 4633 * the stolen page is page_rename'd. 4634 */ 4635 4636 if (amp != NULL) 4637 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4638 4639 /* 4640 * Ok, now loop over the address range and handle faults 4641 */ 4642 for (a = addr; a < addr + len; a += PAGESIZE, off += PAGESIZE) { 4643 err = segvn_faultpage(hat, seg, a, off, vpage, plp, vpprot, 4644 type, rw, brkcow); 4645 if (err) { 4646 if (amp != NULL) 4647 ANON_LOCK_EXIT(&->a_rwlock); 4648 if (type == F_SOFTLOCK && a > addr) 4649 segvn_softunlock(seg, addr, (a - addr), 4650 S_OTHER); 4651 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4652 segvn_pagelist_rele(plp); 4653 if (pl_alloc_sz) 4654 kmem_free(plp, pl_alloc_sz); 4655 return (err); 4656 } 4657 if (vpage) { 4658 vpage++; 4659 } else if (svd->vpage) { 4660 page = seg_page(seg, addr); 4661 vpage = &svd->vpage[++page]; 4662 } 4663 } 4664 4665 /* Didn't get pages from the underlying fs so we're done */ 4666 if (!dogetpage) 4667 goto done; 4668 4669 /* 4670 * Now handle any other pages in the list returned. 4671 * If the page can be used, load up the translations now. 4672 * Note that the for loop will only be entered if "plp" 4673 * is pointing to a non-NULL page pointer which means that 4674 * VOP_GETPAGE() was called and vpprot has been initialized. 4675 */ 4676 if (svd->pageprot == 0) 4677 prot = svd->prot & vpprot; 4678 4679 4680 /* 4681 * Large Files: diff should be unsigned value because we started 4682 * supporting > 2GB segment sizes from 2.5.1 and when a 4683 * large file of size > 2GB gets mapped to address space 4684 * the diff value can be > 2GB. 4685 */ 4686 4687 for (ppp = plp; (pp = *ppp) != NULL; ppp++) { 4688 size_t diff; 4689 struct anon *ap; 4690 int anon_index; 4691 anon_sync_obj_t cookie; 4692 int hat_flag = HAT_LOAD_ADV; 4693 4694 if (svd->flags & MAP_TEXT) { 4695 hat_flag |= HAT_LOAD_TEXT; 4696 } 4697 4698 if (pp == PAGE_HANDLED) 4699 continue; 4700 4701 if (pp->p_offset >= svd->offset && 4702 (pp->p_offset < svd->offset + seg->s_size)) { 4703 4704 diff = pp->p_offset - svd->offset; 4705 4706 /* 4707 * Large Files: Following is the assertion 4708 * validating the above cast. 4709 */ 4710 ASSERT(svd->vp == pp->p_vnode); 4711 4712 page = btop(diff); 4713 if (svd->pageprot) 4714 prot = VPP_PROT(&svd->vpage[page]) & vpprot; 4715 4716 /* 4717 * Prevent other threads in the address space from 4718 * creating private pages (i.e., allocating anon slots) 4719 * while we are in the process of loading translations 4720 * to additional pages returned by the underlying 4721 * object. 
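 *
 * For example (hypothetical case): a clean read-ahead page returned by
 * VOP_GETPAGE() on a VMODSORT vnode is loaded below without PROT_WRITE,
 * so the first store to it faults and marks it modified; pages marked
 * for migration are skipped here so that they are migrated properly on
 * their own fault.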
4722 */ 4723 if (amp != NULL) { 4724 anon_index = svd->anon_index + page; 4725 anon_array_enter(amp, anon_index, &cookie); 4726 ap = anon_get_ptr(amp->ahp, anon_index); 4727 } 4728 if ((amp == NULL) || (ap == NULL)) { 4729 if (IS_VMODSORT(pp->p_vnode) || 4730 enable_mbit_wa) { 4731 if (rw == S_WRITE) 4732 hat_setmod(pp); 4733 else if (rw != S_OTHER && 4734 !hat_ismod(pp)) 4735 prot &= ~PROT_WRITE; 4736 } 4737 /* 4738 * Skip mapping read ahead pages marked 4739 * for migration, so they will get migrated 4740 * properly on fault 4741 */ 4742 if ((prot & PROT_READ) && !PP_ISMIGRATE(pp)) { 4743 hat_memload(hat, seg->s_base + diff, 4744 pp, prot, hat_flag); 4745 } 4746 } 4747 if (amp != NULL) 4748 anon_array_exit(&cookie); 4749 } 4750 page_unlock(pp); 4751 } 4752 done: 4753 if (amp != NULL) 4754 ANON_LOCK_EXIT(&->a_rwlock); 4755 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4756 if (pl_alloc_sz) 4757 kmem_free(plp, pl_alloc_sz); 4758 return (0); 4759 } 4760 4761 /* 4762 * This routine is used to start I/O on pages asynchronously. XXX it will 4763 * only create PAGESIZE pages. At fault time they will be relocated into 4764 * larger pages. 4765 */ 4766 static faultcode_t 4767 segvn_faulta(struct seg *seg, caddr_t addr) 4768 { 4769 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 4770 int err; 4771 struct anon_map *amp; 4772 vnode_t *vp; 4773 4774 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 4775 4776 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 4777 if ((amp = svd->amp) != NULL) { 4778 struct anon *ap; 4779 4780 /* 4781 * Reader lock to prevent amp->ahp from being changed. 4782 * This is advisory, it's ok to miss a page, so 4783 * we don't do anon_array_enter lock. 4784 */ 4785 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4786 if ((ap = anon_get_ptr(amp->ahp, 4787 svd->anon_index + seg_page(seg, addr))) != NULL) { 4788 4789 err = anon_getpage(&ap, NULL, NULL, 4790 0, seg, addr, S_READ, svd->cred); 4791 4792 ANON_LOCK_EXIT(&->a_rwlock); 4793 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4794 if (err) 4795 return (FC_MAKE_ERR(err)); 4796 return (0); 4797 } 4798 ANON_LOCK_EXIT(&->a_rwlock); 4799 } 4800 4801 if (svd->vp == NULL) { 4802 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4803 return (0); /* zfod page - do nothing now */ 4804 } 4805 4806 vp = svd->vp; 4807 TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE, 4808 "segvn_getpage:seg %p addr %p vp %p", seg, addr, vp); 4809 err = VOP_GETPAGE(vp, 4810 (offset_t)(svd->offset + (uintptr_t)(addr - seg->s_base)), 4811 PAGESIZE, NULL, NULL, 0, seg, addr, 4812 S_OTHER, svd->cred); 4813 4814 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4815 if (err) 4816 return (FC_MAKE_ERR(err)); 4817 return (0); 4818 } 4819 4820 static int 4821 segvn_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) 4822 { 4823 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 4824 struct vpage *svp, *evp; 4825 struct vnode *vp; 4826 size_t pgsz; 4827 pgcnt_t pgcnt; 4828 anon_sync_obj_t cookie; 4829 4830 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 4831 4832 if ((svd->maxprot & prot) != prot) 4833 return (EACCES); /* violated maxprot */ 4834 4835 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4836 4837 /* return if prot is the same */ 4838 if (!svd->pageprot && svd->prot == prot) { 4839 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4840 return (0); 4841 } 4842 4843 /* 4844 * Since we change protections we first have to flush the cache. 4845 * This makes sure all the pagelock calls have to recheck 4846 * protections. 
4847 */ 4848 if (svd->softlockcnt > 0) { 4849 /* 4850 * Since we do have the segvn writers lock nobody can fill 4851 * the cache with entries belonging to this seg during 4852 * the purge. The flush either succeeds or we still have 4853 * pending I/Os. 4854 */ 4855 segvn_purge(seg); 4856 if (svd->softlockcnt > 0) { 4857 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4858 return (EAGAIN); 4859 } 4860 } 4861 4862 if (seg->s_szc != 0) { 4863 int err; 4864 pgsz = page_get_pagesize(seg->s_szc); 4865 pgcnt = pgsz >> PAGESHIFT; 4866 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 4867 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { 4868 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4869 ASSERT(seg->s_base != addr || seg->s_size != len); 4870 /* 4871 * If we are holding the as lock as a reader then 4872 * we need to return IE_RETRY and let the as 4873 * layer drop and re-aquire the lock as a writer. 4874 */ 4875 if (AS_READ_HELD(seg->s_as, &seg->s_as->a_lock)) 4876 return (IE_RETRY); 4877 VM_STAT_ADD(segvnvmstats.demoterange[1]); 4878 err = segvn_demote_range(seg, addr, len, SDR_END); 4879 if (err == 0) 4880 return (IE_RETRY); 4881 if (err == ENOMEM) 4882 return (IE_NOMEM); 4883 return (err); 4884 } 4885 } 4886 4887 4888 /* 4889 * If it's a private mapping and we're making it writable 4890 * and no swap space has been reserved, have to reserve 4891 * it all now. If it's a private mapping to a file (i.e., vp != NULL) 4892 * and we're removing write permission on the entire segment and 4893 * we haven't modified any pages, we can release the swap space. 4894 */ 4895 if (svd->type == MAP_PRIVATE) { 4896 if (prot & PROT_WRITE) { 4897 size_t sz; 4898 if (svd->swresv == 0 && !(svd->flags & MAP_NORESERVE)) { 4899 if (anon_resv(seg->s_size) == 0) { 4900 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4901 return (IE_NOMEM); 4902 } 4903 sz = svd->swresv = seg->s_size; 4904 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 4905 "anon proc:%p %lu %u", 4906 seg, sz, 1); 4907 } 4908 } else { 4909 /* 4910 * Swap space is released only if this segment 4911 * does not map anonymous memory, since read faults 4912 * on such segments still need an anon slot to read 4913 * in the data. 4914 */ 4915 if (svd->swresv != 0 && svd->vp != NULL && 4916 svd->amp == NULL && addr == seg->s_base && 4917 len == seg->s_size && svd->pageprot == 0) { 4918 anon_unresv(svd->swresv); 4919 svd->swresv = 0; 4920 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 4921 "anon proc:%p %lu %u", 4922 seg, 0, 0); 4923 } 4924 } 4925 } 4926 4927 if (addr == seg->s_base && len == seg->s_size && svd->pageprot == 0) { 4928 if (svd->prot == prot) { 4929 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4930 return (0); /* all done */ 4931 } 4932 svd->prot = (uchar_t)prot; 4933 } else { 4934 struct anon *ap = NULL; 4935 page_t *pp; 4936 u_offset_t offset, off; 4937 struct anon_map *amp; 4938 ulong_t anon_idx = 0; 4939 4940 /* 4941 * A vpage structure exists or else the change does not 4942 * involve the entire segment. Establish a vpage structure 4943 * if none is there. Then, for each page in the range, 4944 * adjust its individual permissions. Note that write- 4945 * enabling a MAP_PRIVATE page can affect the claims for 4946 * locked down memory. Overcommitting memory terminates 4947 * the operation. 
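 *
 * e.g. (hypothetical page): toggling PROT_WRITE on a locked-down
 * (VPP_ISPPLOCK) private page re-checks its lock claim through
 * page_addclaim()/page_subclaim() (or segvn_claim_pages() for large
 * pages) below; if the claim cannot be granted the loop stops early,
 * the translations set up so far are unloaded and IE_NOMEM is
 * returned.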
4948 */ 4949 segvn_vpage(seg); 4950 if ((amp = svd->amp) != NULL) { 4951 anon_idx = svd->anon_index + seg_page(seg, addr); 4952 ASSERT(seg->s_szc == 0 || 4953 IS_P2ALIGNED(anon_idx, pgcnt)); 4954 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4955 } 4956 4957 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 4958 evp = &svd->vpage[seg_page(seg, addr + len)]; 4959 4960 /* 4961 * See Statement at the beginning of segvn_lockop regarding 4962 * the way cowcnts and lckcnts are handled. 4963 */ 4964 for (svp = &svd->vpage[seg_page(seg, addr)]; svp < evp; svp++) { 4965 4966 ASSERT(seg->s_szc == 0 || 4967 (svd->vp != NULL || svd->type == MAP_PRIVATE)); 4968 4969 if (seg->s_szc != 0 && svd->type == MAP_PRIVATE) { 4970 if (amp != NULL) { 4971 anon_array_enter(amp, anon_idx, 4972 &cookie); 4973 } 4974 if (IS_P2ALIGNED(anon_idx, pgcnt) && 4975 !segvn_claim_pages(seg, svp, offset, 4976 anon_idx, prot)) { 4977 if (amp != NULL) { 4978 anon_array_exit(&cookie); 4979 } 4980 break; 4981 } 4982 if (amp != NULL) { 4983 anon_array_exit(&cookie); 4984 } 4985 anon_idx++; 4986 } else { 4987 if (amp != NULL) { 4988 anon_array_enter(amp, anon_idx, 4989 &cookie); 4990 ap = anon_get_ptr(amp->ahp, anon_idx++); 4991 } 4992 4993 if (VPP_ISPPLOCK(svp) && 4994 (VPP_PROT(svp) != prot) && 4995 (svd->type == MAP_PRIVATE)) { 4996 4997 if (amp == NULL || ap == NULL) { 4998 vp = svd->vp; 4999 off = offset; 5000 } else 5001 swap_xlate(ap, &vp, &off); 5002 if (amp != NULL) 5003 anon_array_exit(&cookie); 5004 5005 if ((pp = page_lookup(vp, off, 5006 SE_SHARED)) == NULL) { 5007 panic("segvn_setprot: no page"); 5008 /*NOTREACHED*/ 5009 } 5010 ASSERT(seg->s_szc == 0); 5011 if ((VPP_PROT(svp) ^ prot) & 5012 PROT_WRITE) { 5013 if (prot & PROT_WRITE) { 5014 if (!page_addclaim(pp)) { 5015 page_unlock(pp); 5016 break; 5017 } 5018 } else { 5019 if (!page_subclaim(pp)) { 5020 page_unlock(pp); 5021 break; 5022 } 5023 } 5024 } 5025 page_unlock(pp); 5026 } else if (amp != NULL) 5027 anon_array_exit(&cookie); 5028 } 5029 VPP_SETPROT(svp, prot); 5030 offset += PAGESIZE; 5031 } 5032 if (amp != NULL) 5033 ANON_LOCK_EXIT(&->a_rwlock); 5034 5035 /* 5036 * Did we terminate prematurely? If so, simply unload 5037 * the translations to the things we've updated so far. 5038 */ 5039 if (svp != evp) { 5040 len = (svp - &svd->vpage[seg_page(seg, addr)]) * 5041 PAGESIZE; 5042 ASSERT(seg->s_szc == 0 || IS_P2ALIGNED(len, pgsz)); 5043 if (len != 0) 5044 hat_unload(seg->s_as->a_hat, addr, 5045 len, HAT_UNLOAD); 5046 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5047 return (IE_NOMEM); 5048 } 5049 } 5050 5051 if ((prot & PROT_WRITE) != 0 || (prot & ~PROT_USER) == PROT_NONE) { 5052 /* 5053 * Either private or shared data with write access (in 5054 * which case we need to throw out all former translations 5055 * so that we get the right translations set up on fault 5056 * and we don't allow write access to any copy-on-write pages 5057 * that might be around or to prevent write access to pages 5058 * representing holes in a file), or we don't have permission 5059 * to access the memory at all (in which case we have to 5060 * unload any current translations that might exist). 5061 */ 5062 hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD); 5063 } else { 5064 /* 5065 * A shared mapping or a private mapping in which write 5066 * protection is going to be denied - just change all the 5067 * protections over the range of addresses in question. 5068 * segvn does not support any other attributes other 5069 * than prot so we can use hat_chgattr. 
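 *
 * In short, the choice made below is:
 *
 *	new protections include PROT_WRITE, or grant no access at all
 *		-> hat_unload(): throw the existing translations away
 *	otherwise (read and/or exec only)
 *		-> hat_chgattr(): rewrite the protections in place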
5070 */ 5071 hat_chgattr(seg->s_as->a_hat, addr, len, prot); 5072 } 5073 5074 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5075 5076 return (0); 5077 } 5078 5079 /* 5080 * segvn_setpagesize is called via SEGOP_SETPAGESIZE from as_setpagesize, 5081 * to determine if the seg is capable of mapping the requested szc. 5082 */ 5083 static int 5084 segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc) 5085 { 5086 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5087 struct segvn_data *nsvd; 5088 struct anon_map *amp = svd->amp; 5089 struct seg *nseg; 5090 caddr_t eaddr = addr + len, a; 5091 size_t pgsz = page_get_pagesize(szc); 5092 int err; 5093 u_offset_t off = svd->offset + (uintptr_t)(addr - seg->s_base); 5094 extern struct vnode kvp; 5095 5096 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 5097 ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size); 5098 5099 if (seg->s_szc == szc || segvn_lpg_disable != 0) { 5100 return (0); 5101 } 5102 5103 /* 5104 * addr should always be pgsz aligned but eaddr may be misaligned if 5105 * it's at the end of the segment. 5106 * 5107 * XXX we should assert this condition since as_setpagesize() logic 5108 * guarantees it. 5109 */ 5110 if (!IS_P2ALIGNED(addr, pgsz) || 5111 (!IS_P2ALIGNED(eaddr, pgsz) && 5112 eaddr != seg->s_base + seg->s_size)) { 5113 5114 segvn_setpgsz_align_err++; 5115 return (EINVAL); 5116 } 5117 5118 if ((svd->vp == NULL && svd->type == MAP_SHARED) || 5119 (svd->flags & MAP_NORESERVE) || seg->s_as == &kas || 5120 szc > segvn_maxpgszc) { 5121 return (EINVAL); 5122 } 5123 5124 /* paranoid check */ 5125 if (svd->vp != NULL && 5126 (IS_SWAPFSVP(svd->vp) || svd->vp == &kvp)) { 5127 return (EINVAL); 5128 } 5129 5130 if (seg->s_szc == 0 && svd->vp != NULL && 5131 map_addr_vacalign_check(addr, off)) { 5132 return (EINVAL); 5133 } 5134 5135 /* 5136 * Check that protections are the same within new page 5137 * size boundaries. 5138 */ 5139 if (svd->pageprot) { 5140 for (a = addr; a < eaddr; a += pgsz) { 5141 if ((a + pgsz) > eaddr) { 5142 if (!sameprot(seg, a, eaddr - a)) { 5143 return (EINVAL); 5144 } 5145 } else { 5146 if (!sameprot(seg, a, pgsz)) { 5147 return (EINVAL); 5148 } 5149 } 5150 } 5151 } 5152 5153 /* 5154 * Since we are changing page size we first have to flush 5155 * the cache. This makes sure all the pagelock calls have 5156 * to recheck protections. 5157 */ 5158 if (svd->softlockcnt > 0) { 5159 /* 5160 * Since we do have the segvn writers lock nobody can fill 5161 * the cache with entries belonging to this seg during 5162 * the purge. The flush either succeeds or we still have 5163 * pending I/Os. 5164 */ 5165 segvn_purge(seg); 5166 if (svd->softlockcnt > 0) { 5167 return (EAGAIN); 5168 } 5169 } 5170 5171 /* 5172 * Operation for sub range of existing segment. 
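 *
 * Illustrative flow: when the request covers only part of the segment,
 * asking for a page size smaller than the current one demotes just
 * [addr, eaddr); asking for a larger one carves that range out into its
 * own segment (splitting at addr and, if needed, at eaddr) and returns
 * IE_RETRY so the operation is repeated on the new segment.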
5173 */ 5174 if (addr != seg->s_base || eaddr != (seg->s_base + seg->s_size)) { 5175 if (szc < seg->s_szc) { 5176 VM_STAT_ADD(segvnvmstats.demoterange[2]); 5177 err = segvn_demote_range(seg, addr, len, SDR_RANGE); 5178 if (err == 0) { 5179 return (IE_RETRY); 5180 } 5181 if (err == ENOMEM) { 5182 return (IE_NOMEM); 5183 } 5184 return (err); 5185 } 5186 if (addr != seg->s_base) { 5187 nseg = segvn_split_seg(seg, addr); 5188 if (eaddr != (nseg->s_base + nseg->s_size)) { 5189 /* eaddr is szc aligned */ 5190 (void) segvn_split_seg(nseg, eaddr); 5191 } 5192 return (IE_RETRY); 5193 } 5194 if (eaddr != (seg->s_base + seg->s_size)) { 5195 /* eaddr is szc aligned */ 5196 (void) segvn_split_seg(seg, eaddr); 5197 } 5198 return (IE_RETRY); 5199 } 5200 5201 /* 5202 * Break any low level sharing and reset seg->s_szc to 0. 5203 */ 5204 if ((err = segvn_clrszc(seg)) != 0) { 5205 if (err == ENOMEM) { 5206 err = IE_NOMEM; 5207 } 5208 return (err); 5209 } 5210 ASSERT(seg->s_szc == 0); 5211 5212 /* 5213 * If the end of the current segment is not pgsz aligned 5214 * then attempt to concatenate with the next segment. 5215 */ 5216 if (!IS_P2ALIGNED(eaddr, pgsz)) { 5217 nseg = AS_SEGNEXT(seg->s_as, seg); 5218 if (nseg == NULL || nseg == seg || eaddr != nseg->s_base) { 5219 return (ENOMEM); 5220 } 5221 if (nseg->s_ops != &segvn_ops) { 5222 return (EINVAL); 5223 } 5224 nsvd = (struct segvn_data *)nseg->s_data; 5225 if (nsvd->softlockcnt > 0) { 5226 segvn_purge(nseg); 5227 if (nsvd->softlockcnt > 0) { 5228 return (EAGAIN); 5229 } 5230 } 5231 err = segvn_clrszc(nseg); 5232 if (err == ENOMEM) { 5233 err = IE_NOMEM; 5234 } 5235 if (err != 0) { 5236 return (err); 5237 } 5238 err = segvn_concat(seg, nseg, 1); 5239 if (err == -1) { 5240 return (EINVAL); 5241 } 5242 if (err == -2) { 5243 return (IE_NOMEM); 5244 } 5245 return (IE_RETRY); 5246 } 5247 5248 /* 5249 * May need to re-align anon array to 5250 * new szc. 5251 */ 5252 if (amp != NULL) { 5253 pgcnt_t pgcnt = pgsz >> PAGESHIFT; 5254 if (!IS_P2ALIGNED(svd->anon_index, pgcnt)) { 5255 struct anon_hdr *nahp; 5256 5257 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 5258 ASSERT(amp->refcnt == 1); 5259 nahp = anon_create(btop(amp->size), ANON_NOSLEEP); 5260 if (nahp == NULL) { 5261 ANON_LOCK_EXIT(&->a_rwlock); 5262 return (IE_NOMEM); 5263 } 5264 if (anon_copy_ptr(amp->ahp, svd->anon_index, 5265 nahp, 0, btop(seg->s_size), ANON_NOSLEEP)) { 5266 anon_release(nahp, btop(amp->size)); 5267 ANON_LOCK_EXIT(&->a_rwlock); 5268 return (IE_NOMEM); 5269 } 5270 anon_release(amp->ahp, btop(amp->size)); 5271 amp->ahp = nahp; 5272 svd->anon_index = 0; 5273 ANON_LOCK_EXIT(&->a_rwlock); 5274 } 5275 } 5276 if (svd->vp != NULL && szc != 0) { 5277 struct vattr va; 5278 u_offset_t eoffpage = svd->offset; 5279 va.va_mask = AT_SIZE; 5280 eoffpage += seg->s_size; 5281 eoffpage = btopr(eoffpage); 5282 if (VOP_GETATTR(svd->vp, &va, 0, svd->cred) != 0) { 5283 segvn_setpgsz_getattr_err++; 5284 return (EINVAL); 5285 } 5286 if (btopr(va.va_size) < eoffpage) { 5287 segvn_setpgsz_eof_err++; 5288 return (EINVAL); 5289 } 5290 if (amp != NULL) { 5291 /* 5292 * anon_fill_cow_holes() may call VOP_GETPAGE(). 5293 * don't take anon map lock here to avoid holding it 5294 * across VOP_GETPAGE() calls that may call back into 5295 * segvn for klsutering checks. We don't really need 5296 * anon map lock here since it's a private segment and 5297 * we hold as level lock as writers. 
5298 */ 5299 if ((err = anon_fill_cow_holes(seg, seg->s_base, 5300 amp->ahp, svd->anon_index, svd->vp, svd->offset, 5301 seg->s_size, szc, svd->prot, svd->vpage, 5302 svd->cred)) != 0) { 5303 return (EINVAL); 5304 } 5305 } 5306 segvn_setvnode_mpss(svd->vp); 5307 } 5308 5309 if (amp != NULL) { 5310 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 5311 amp->a_szc = szc; 5312 ANON_LOCK_EXIT(&->a_rwlock); 5313 } 5314 5315 seg->s_szc = szc; 5316 5317 return (0); 5318 } 5319 5320 static int 5321 segvn_clrszc(struct seg *seg) 5322 { 5323 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5324 struct anon_map *amp = svd->amp; 5325 size_t pgsz; 5326 pgcnt_t pages; 5327 int err = 0; 5328 caddr_t a = seg->s_base; 5329 caddr_t ea = a + seg->s_size; 5330 ulong_t an_idx = svd->anon_index; 5331 vnode_t *vp = svd->vp; 5332 struct vpage *vpage = svd->vpage; 5333 page_t *anon_pl[1 + 1], *pp; 5334 struct anon *ap, *oldap; 5335 uint_t prot = svd->prot, vpprot; 5336 5337 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) || 5338 SEGVN_WRITE_HELD(seg->s_as, &svd->lock)); 5339 ASSERT(svd->type == MAP_PRIVATE || 5340 (vp != NULL && svd->amp == NULL)); 5341 5342 if (vp == NULL && amp == NULL) { 5343 seg->s_szc = 0; 5344 return (0); 5345 } 5346 5347 /* 5348 * do HAT_UNLOAD_UNMAP since we are changing the pagesize. 5349 * unload argument is 0 when we are freeing the segment 5350 * and unload was already done. 5351 */ 5352 hat_unload(seg->s_as->a_hat, seg->s_base, seg->s_size, 5353 HAT_UNLOAD_UNMAP); 5354 5355 if (amp == NULL) { 5356 seg->s_szc = 0; 5357 return (0); 5358 } 5359 5360 pgsz = page_get_pagesize(seg->s_szc); 5361 pages = btop(pgsz); 5362 5363 /* 5364 * XXX anon rwlock is not really needed because this is a 5365 * private segment and we are writers. 5366 */ 5367 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 5368 5369 for (; a < ea; a += pgsz, an_idx += pages) { 5370 if ((oldap = anon_get_ptr(amp->ahp, an_idx)) != NULL) { 5371 if (svd->pageprot != 0) { 5372 ASSERT(vpage != NULL); 5373 prot = VPP_PROT(vpage); 5374 ASSERT(sameprot(seg, a, pgsz)); 5375 } 5376 if (seg->s_szc != 0) { 5377 ASSERT(vp == NULL || anon_pages(amp->ahp, 5378 an_idx, pages) == pages); 5379 if ((err = anon_map_demotepages(amp, an_idx, 5380 seg, a, prot, vpage, svd->cred)) != 0) { 5381 goto out; 5382 } 5383 } else { 5384 if (oldap->an_refcnt == 1) { 5385 continue; 5386 } 5387 if ((err = anon_getpage(&oldap, &vpprot, 5388 anon_pl, PAGESIZE, seg, a, S_READ, 5389 svd->cred))) { 5390 goto out; 5391 } 5392 if ((pp = anon_private(&ap, seg, a, prot, 5393 anon_pl[0], 0, svd->cred)) == NULL) { 5394 err = ENOMEM; 5395 goto out; 5396 } 5397 anon_decref(oldap); 5398 (void) anon_set_ptr(amp->ahp, an_idx, ap, 5399 ANON_SLEEP); 5400 page_unlock(pp); 5401 } 5402 } 5403 vpage = (vpage == NULL) ? 
NULL : vpage + pages; 5404 } 5405 5406 amp->a_szc = 0; 5407 seg->s_szc = 0; 5408 out: 5409 ANON_LOCK_EXIT(&amp->a_rwlock); 5410 return (err); 5411 } 5412 5413 static int 5414 segvn_claim_pages( 5415 struct seg *seg, 5416 struct vpage *svp, 5417 u_offset_t off, 5418 ulong_t anon_idx, 5419 uint_t prot) 5420 { 5421 pgcnt_t pgcnt = page_get_pagecnt(seg->s_szc); 5422 size_t ppasize = (pgcnt + 1) * sizeof (page_t *); 5423 page_t **ppa; 5424 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5425 struct anon_map *amp = svd->amp; 5426 struct vpage *evp = svp + pgcnt; 5427 caddr_t addr = ((uintptr_t)(svp - svd->vpage) << PAGESHIFT) 5428 + seg->s_base; 5429 struct anon *ap; 5430 struct vnode *vp = svd->vp; 5431 page_t *pp; 5432 pgcnt_t pg_idx, i; 5433 int err = 0; 5434 anoff_t aoff; 5435 int anon = (amp != NULL) ? 1 : 0; 5436 5437 ASSERT(svd->type == MAP_PRIVATE); 5438 ASSERT(svd->vpage != NULL); 5439 ASSERT(seg->s_szc != 0); 5440 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 5441 ASSERT(amp == NULL || IS_P2ALIGNED(anon_idx, pgcnt)); 5442 ASSERT(sameprot(seg, addr, pgcnt << PAGESHIFT)); 5443 5444 if (VPP_PROT(svp) == prot) 5445 return (1); 5446 if (!((VPP_PROT(svp) ^ prot) & PROT_WRITE)) 5447 return (1); 5448 5449 ppa = kmem_alloc(ppasize, KM_SLEEP); 5450 if (anon && vp != NULL) { 5451 if (anon_get_ptr(amp->ahp, anon_idx) == NULL) { 5452 anon = 0; 5453 ASSERT(!anon_pages(amp->ahp, anon_idx, pgcnt)); 5454 } 5455 ASSERT(!anon || 5456 anon_pages(amp->ahp, anon_idx, pgcnt) == pgcnt); 5457 } 5458 5459 for (*ppa = NULL, pg_idx = 0; svp < evp; svp++, anon_idx++) { 5460 if (!VPP_ISPPLOCK(svp)) 5461 continue; 5462 if (anon) { 5463 ap = anon_get_ptr(amp->ahp, anon_idx); 5464 if (ap == NULL) { 5465 panic("segvn_claim_pages: no anon slot"); 5466 } 5467 swap_xlate(ap, &vp, &aoff); 5468 off = (u_offset_t)aoff; 5469 } 5470 ASSERT(vp != NULL); 5471 if ((pp = page_lookup(vp, 5472 (u_offset_t)off, SE_SHARED)) == NULL) { 5473 panic("segvn_claim_pages: no page"); 5474 } 5475 ppa[pg_idx++] = pp; 5476 off += PAGESIZE; 5477 } 5478 5479 if (ppa[0] == NULL) { 5480 kmem_free(ppa, ppasize); 5481 return (1); 5482 } 5483 5484 ASSERT(pg_idx <= pgcnt); 5485 ppa[pg_idx] = NULL; 5486 5487 if (prot & PROT_WRITE) 5488 err = page_addclaim_pages(ppa); 5489 else 5490 err = page_subclaim_pages(ppa); 5491 5492 for (i = 0; i < pg_idx; i++) { 5493 ASSERT(ppa[i] != NULL); 5494 page_unlock(ppa[i]); 5495 } 5496 5497 kmem_free(ppa, ppasize); 5498 return (err); 5499 } 5500 5501 /* 5502 * Returns right (upper address) segment if split occurred. 5503 * If the address is equal to the beginning or end of its segment it returns 5504 * the current segment.
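 *
 * Example (sizes hypothetical): splitting a 16-page segment at its
 * 10th page leaves the original segment covering the first 10 pages
 * and returns a new segment covering the remaining 6; the vpage array,
 * anon map and swap reservation are divided at the same point below.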
5505 */ 5506 static struct seg * 5507 segvn_split_seg(struct seg *seg, caddr_t addr) 5508 { 5509 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5510 struct seg *nseg; 5511 size_t nsize; 5512 struct segvn_data *nsvd; 5513 5514 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 5515 ASSERT(svd->type == MAP_PRIVATE || svd->amp == NULL); 5516 ASSERT(addr >= seg->s_base); 5517 ASSERT(addr <= seg->s_base + seg->s_size); 5518 5519 if (addr == seg->s_base || addr == seg->s_base + seg->s_size) 5520 return (seg); 5521 5522 nsize = seg->s_base + seg->s_size - addr; 5523 seg->s_size = addr - seg->s_base; 5524 nseg = seg_alloc(seg->s_as, addr, nsize); 5525 ASSERT(nseg != NULL); 5526 nseg->s_ops = seg->s_ops; 5527 nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 5528 nseg->s_data = (void *)nsvd; 5529 nseg->s_szc = seg->s_szc; 5530 *nsvd = *svd; 5531 rw_init(&nsvd->lock, NULL, RW_DEFAULT, NULL); 5532 5533 if (nsvd->vp != NULL) { 5534 VN_HOLD(nsvd->vp); 5535 nsvd->offset = svd->offset + 5536 (uintptr_t)(nseg->s_base - seg->s_base); 5537 if (nsvd->type == MAP_SHARED) 5538 lgrp_shm_policy_init(NULL, nsvd->vp); 5539 } else { 5540 /* 5541 * The offset for an anonymous segment has no signifigance in 5542 * terms of an offset into a file. If we were to use the above 5543 * calculation instead, the structures read out of 5544 * /proc/<pid>/xmap would be more difficult to decipher since 5545 * it would be unclear whether two seemingly contiguous 5546 * prxmap_t structures represented different segments or a 5547 * single segment that had been split up into multiple prxmap_t 5548 * structures (e.g. if some part of the segment had not yet 5549 * been faulted in). 5550 */ 5551 nsvd->offset = 0; 5552 } 5553 5554 ASSERT(svd->softlockcnt == 0); 5555 crhold(svd->cred); 5556 5557 if (svd->vpage != NULL) { 5558 size_t bytes = vpgtob(seg_pages(seg)); 5559 size_t nbytes = vpgtob(seg_pages(nseg)); 5560 struct vpage *ovpage = svd->vpage; 5561 5562 svd->vpage = kmem_alloc(bytes, KM_SLEEP); 5563 bcopy(ovpage, svd->vpage, bytes); 5564 nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP); 5565 bcopy(ovpage + seg_pages(seg), nsvd->vpage, nbytes); 5566 kmem_free(ovpage, bytes + nbytes); 5567 } 5568 if (svd->amp != NULL) { 5569 struct anon_map *oamp = svd->amp, *namp; 5570 struct anon_hdr *nahp; 5571 5572 ANON_LOCK_ENTER(&oamp->a_rwlock, RW_WRITER); 5573 ASSERT(oamp->refcnt == 1); 5574 nahp = anon_create(btop(seg->s_size), ANON_SLEEP); 5575 (void) anon_copy_ptr(oamp->ahp, svd->anon_index, 5576 nahp, 0, btop(seg->s_size), ANON_SLEEP); 5577 5578 namp = anonmap_alloc(nseg->s_size, 0); 5579 namp->a_szc = nseg->s_szc; 5580 (void) anon_copy_ptr(oamp->ahp, 5581 svd->anon_index + btop(seg->s_size), 5582 namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP); 5583 anon_release(oamp->ahp, btop(oamp->size)); 5584 oamp->ahp = nahp; 5585 oamp->size = seg->s_size; 5586 svd->anon_index = 0; 5587 nsvd->amp = namp; 5588 nsvd->anon_index = 0; 5589 ANON_LOCK_EXIT(&oamp->a_rwlock); 5590 } 5591 5592 /* 5593 * Split amount of swap reserve 5594 */ 5595 if (svd->swresv) { 5596 /* 5597 * For MAP_NORESERVE, only allocate swap reserve for pages 5598 * being used. Other segments get enough to cover whole 5599 * segment. 
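 *
 * Worked example (counts hypothetical): if the original MAP_NORESERVE
 * segment had swap reserved for 10 touched anon pages and the split
 * leaves 7 of them in the front segment and 3 in the new one, the
 * reservations below become ptob(7) and ptob(3); the combined new
 * reservation never exceeds the original, as the ASSERT checks.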
5600 */ 5601 if (svd->flags & MAP_NORESERVE) { 5602 size_t oswresv; 5603 5604 ASSERT(svd->amp); 5605 oswresv = svd->swresv; 5606 svd->swresv = ptob(anon_pages(svd->amp->ahp, 5607 svd->anon_index, btop(seg->s_size))); 5608 nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp, 5609 nsvd->anon_index, btop(nseg->s_size))); 5610 ASSERT(oswresv >= (svd->swresv + nsvd->swresv)); 5611 } else { 5612 ASSERT(svd->swresv == seg->s_size + nseg->s_size); 5613 svd->swresv = seg->s_size; 5614 nsvd->swresv = nseg->s_size; 5615 } 5616 } 5617 5618 return (nseg); 5619 } 5620 5621 5622 /* 5623 * called on memory operations (unmap, setprot, setpagesize) for a subset 5624 * of a large page segment to either demote the memory range (SDR_RANGE) 5625 * or the ends (SDR_END) by addr/len. 5626 * 5627 * returns 0 on success. returns errno, including ENOMEM, on failure. 5628 */ 5629 static int 5630 segvn_demote_range(struct seg *seg, caddr_t addr, size_t len, int flag) 5631 { 5632 caddr_t eaddr = addr + len; 5633 caddr_t lpgaddr, lpgeaddr; 5634 struct seg *nseg; 5635 struct seg *badseg1 = NULL; 5636 struct seg *badseg2 = NULL; 5637 size_t pgsz; 5638 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5639 int err; 5640 5641 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 5642 ASSERT(seg->s_szc != 0); 5643 pgsz = page_get_pagesize(seg->s_szc); 5644 ASSERT(seg->s_base != addr || seg->s_size != len); 5645 ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size); 5646 ASSERT(svd->softlockcnt == 0); 5647 ASSERT(svd->type == MAP_PRIVATE || 5648 (svd->vp != NULL && svd->amp == NULL)); 5649 5650 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 5651 ASSERT(flag == SDR_RANGE || eaddr < lpgeaddr || addr > lpgaddr); 5652 if (flag == SDR_RANGE) { 5653 /* demote entire range */ 5654 badseg1 = nseg = segvn_split_seg(seg, lpgaddr); 5655 (void) segvn_split_seg(nseg, lpgeaddr); 5656 ASSERT(badseg1->s_base == lpgaddr); 5657 ASSERT(badseg1->s_size == lpgeaddr - lpgaddr); 5658 } else if (addr != lpgaddr) { 5659 ASSERT(flag == SDR_END); 5660 badseg1 = nseg = segvn_split_seg(seg, lpgaddr); 5661 if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz && 5662 eaddr < lpgaddr + 2 * pgsz) { 5663 (void) segvn_split_seg(nseg, lpgeaddr); 5664 ASSERT(badseg1->s_base == lpgaddr); 5665 ASSERT(badseg1->s_size == 2 * pgsz); 5666 } else { 5667 nseg = segvn_split_seg(nseg, lpgaddr + pgsz); 5668 ASSERT(badseg1->s_base == lpgaddr); 5669 ASSERT(badseg1->s_size == pgsz); 5670 if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz) { 5671 ASSERT(lpgeaddr - lpgaddr > 2 * pgsz); 5672 nseg = segvn_split_seg(nseg, lpgeaddr - pgsz); 5673 badseg2 = nseg; 5674 (void) segvn_split_seg(nseg, lpgeaddr); 5675 ASSERT(badseg2->s_base == lpgeaddr - pgsz); 5676 ASSERT(badseg2->s_size == pgsz); 5677 } 5678 } 5679 } else { 5680 ASSERT(flag == SDR_END); 5681 ASSERT(eaddr < lpgeaddr); 5682 badseg1 = nseg = segvn_split_seg(seg, lpgeaddr - pgsz); 5683 (void) segvn_split_seg(nseg, lpgeaddr); 5684 ASSERT(badseg1->s_base == lpgeaddr - pgsz); 5685 ASSERT(badseg1->s_size == pgsz); 5686 } 5687 5688 ASSERT(badseg1 != NULL); 5689 ASSERT(badseg1->s_szc != 0); 5690 ASSERT(page_get_pagesize(badseg1->s_szc) == pgsz); 5691 ASSERT(flag == SDR_RANGE || badseg1->s_size == pgsz || 5692 badseg1->s_size == 2 * pgsz); 5693 if (err = segvn_clrszc(badseg1)) { 5694 return (err); 5695 } 5696 ASSERT(badseg1->s_szc == 0); 5697 5698 if (badseg2 == NULL) 5699 return (0); 5700 ASSERT(badseg2->s_szc != 0); 5701 ASSERT(page_get_pagesize(badseg2->s_szc) == pgsz); 5702 ASSERT(badseg2->s_size == pgsz); 
5703 ASSERT(sameprot(badseg2, badseg2->s_base, badseg2->s_size)); 5704 if (err = segvn_clrszc(badseg2)) { 5705 return (err); 5706 } 5707 ASSERT(badseg2->s_szc == 0); 5708 return (0); 5709 } 5710 5711 static int 5712 segvn_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) 5713 { 5714 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5715 struct vpage *vp, *evp; 5716 5717 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5718 5719 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 5720 /* 5721 * If segment protection can be used, simply check against them. 5722 */ 5723 if (svd->pageprot == 0) { 5724 int err; 5725 5726 err = ((svd->prot & prot) != prot) ? EACCES : 0; 5727 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5728 return (err); 5729 } 5730 5731 /* 5732 * Have to check down to the vpage level. 5733 */ 5734 evp = &svd->vpage[seg_page(seg, addr + len)]; 5735 for (vp = &svd->vpage[seg_page(seg, addr)]; vp < evp; vp++) { 5736 if ((VPP_PROT(vp) & prot) != prot) { 5737 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5738 return (EACCES); 5739 } 5740 } 5741 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5742 return (0); 5743 } 5744 5745 static int 5746 segvn_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv) 5747 { 5748 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5749 size_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1; 5750 5751 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5752 5753 if (pgno != 0) { 5754 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 5755 if (svd->pageprot == 0) { 5756 do 5757 protv[--pgno] = svd->prot; 5758 while (pgno != 0); 5759 } else { 5760 size_t pgoff = seg_page(seg, addr); 5761 5762 do { 5763 pgno--; 5764 protv[pgno] = VPP_PROT(&svd->vpage[pgno+pgoff]); 5765 } while (pgno != 0); 5766 } 5767 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5768 } 5769 return (0); 5770 } 5771 5772 static u_offset_t 5773 segvn_getoffset(struct seg *seg, caddr_t addr) 5774 { 5775 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5776 5777 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5778 5779 return (svd->offset + (uintptr_t)(addr - seg->s_base)); 5780 } 5781 5782 /*ARGSUSED*/ 5783 static int 5784 segvn_gettype(struct seg *seg, caddr_t addr) 5785 { 5786 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5787 5788 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5789 5790 return (svd->type | (svd->flags & MAP_NORESERVE)); 5791 } 5792 5793 /*ARGSUSED*/ 5794 static int 5795 segvn_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp) 5796 { 5797 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5798 5799 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5800 5801 *vpp = svd->vp; 5802 return (0); 5803 } 5804 5805 /* 5806 * Check to see if it makes sense to do kluster/read ahead to 5807 * addr + delta relative to the mapping at addr. We assume here 5808 * that delta is a signed PAGESIZE'd multiple (which can be negative). 5809 * 5810 * For segvn, we currently "approve" of the action if we are 5811 * still in the segment and it maps from the same vp/off, 5812 * or if the advice stored in segvn_data or vpages allows it. 5813 * Currently, klustering is not allowed only if MADV_RANDOM is set. 
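 *
 * Example of the policy (advice values hypothetical): with
 * MADV_SEQUENTIAL set on the segment, read ahead to addr + PAGESIZE is
 * approved (return 0) while klustering back to addr - PAGESIZE
 * (delta < 0) is rejected (return -1); MADV_RANDOM rejects klustering
 * in either direction.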
5814 */ 5815 static int 5816 segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta) 5817 { 5818 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5819 struct anon *oap, *ap; 5820 ssize_t pd; 5821 size_t page; 5822 struct vnode *vp1, *vp2; 5823 u_offset_t off1, off2; 5824 struct anon_map *amp; 5825 5826 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5827 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) || 5828 SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 5829 5830 if (addr + delta < seg->s_base || 5831 addr + delta >= (seg->s_base + seg->s_size)) 5832 return (-1); /* exceeded segment bounds */ 5833 5834 pd = delta / (ssize_t)PAGESIZE; /* divide to preserve sign bit */ 5835 page = seg_page(seg, addr); 5836 5837 /* 5838 * Check to see if either of the pages addr or addr + delta 5839 * have advice set that prevents klustering (if MADV_RANDOM advice 5840 * is set for entire segment, or MADV_SEQUENTIAL is set and delta 5841 * is negative). 5842 */ 5843 if (svd->advice == MADV_RANDOM || 5844 svd->advice == MADV_SEQUENTIAL && delta < 0) 5845 return (-1); 5846 else if (svd->pageadvice && svd->vpage) { 5847 struct vpage *bvpp, *evpp; 5848 5849 bvpp = &svd->vpage[page]; 5850 evpp = &svd->vpage[page + pd]; 5851 if (VPP_ADVICE(bvpp) == MADV_RANDOM || 5852 VPP_ADVICE(evpp) == MADV_SEQUENTIAL && delta < 0) 5853 return (-1); 5854 if (VPP_ADVICE(bvpp) != VPP_ADVICE(evpp) && 5855 VPP_ADVICE(evpp) == MADV_RANDOM) 5856 return (-1); 5857 } 5858 5859 if (svd->type == MAP_SHARED) 5860 return (0); /* shared mapping - all ok */ 5861 5862 if ((amp = svd->amp) == NULL) 5863 return (0); /* off original vnode */ 5864 5865 page += svd->anon_index; 5866 5867 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5868 5869 oap = anon_get_ptr(amp->ahp, page); 5870 ap = anon_get_ptr(amp->ahp, page + pd); 5871 5872 ANON_LOCK_EXIT(&->a_rwlock); 5873 5874 if ((oap == NULL && ap != NULL) || (oap != NULL && ap == NULL)) { 5875 return (-1); /* one with and one without an anon */ 5876 } 5877 5878 if (oap == NULL) { /* implies that ap == NULL */ 5879 return (0); /* off original vnode */ 5880 } 5881 5882 /* 5883 * Now we know we have two anon pointers - check to 5884 * see if they happen to be properly allocated. 5885 */ 5886 5887 /* 5888 * XXX We cheat here and don't lock the anon slots. We can't because 5889 * we may have been called from the anon layer which might already 5890 * have locked them. We are holding a refcnt on the slots so they 5891 * can't disappear. The worst that will happen is we'll get the wrong 5892 * names (vp, off) for the slots and make a poor klustering decision. 5893 */ 5894 swap_xlate(ap, &vp1, &off1); 5895 swap_xlate(oap, &vp2, &off2); 5896 5897 5898 if (!VOP_CMP(vp1, vp2) || off1 - off2 != delta) 5899 return (-1); 5900 return (0); 5901 } 5902 5903 /* 5904 * Swap the pages of seg out to secondary storage, returning the 5905 * number of bytes of storage freed. 5906 * 5907 * The basic idea is first to unload all translations and then to call 5908 * VOP_PUTPAGE() for all newly-unmapped pages, to push them out to the 5909 * swap device. Pages to which other segments have mappings will remain 5910 * mapped and won't be swapped. Our caller (as_swapout) has already 5911 * performed the unloading step. 5912 * 5913 * The value returned is intended to correlate well with the process's 5914 * memory requirements. However, there are some caveats: 5915 * 1) When given a shared segment as argument, this routine will 5916 * only succeed in swapping out pages for the last sharer of the 5917 * segment. 
(Previous callers will only have decremented mapping 5918 * reference counts.) 5919 * 2) We assume that the hat layer maintains a large enough translation 5920 * cache to capture process reference patterns. 5921 */ 5922 static size_t 5923 segvn_swapout(struct seg *seg) 5924 { 5925 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5926 struct anon_map *amp; 5927 pgcnt_t pgcnt = 0; 5928 pgcnt_t npages; 5929 pgcnt_t page; 5930 ulong_t anon_index; 5931 5932 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5933 5934 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 5935 /* 5936 * Find pages unmapped by our caller and force them 5937 * out to the virtual swap device. 5938 */ 5939 if ((amp = svd->amp) != NULL) 5940 anon_index = svd->anon_index; 5941 npages = seg->s_size >> PAGESHIFT; 5942 for (page = 0; page < npages; page++) { 5943 page_t *pp; 5944 struct anon *ap; 5945 struct vnode *vp; 5946 u_offset_t off; 5947 anon_sync_obj_t cookie; 5948 5949 /* 5950 * Obtain <vp, off> pair for the page, then look it up. 5951 * 5952 * Note that this code is willing to consider regular 5953 * pages as well as anon pages. Is this appropriate here? 5954 */ 5955 ap = NULL; 5956 if (amp != NULL) { 5957 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5958 anon_array_enter(amp, anon_index + page, &cookie); 5959 ap = anon_get_ptr(amp->ahp, anon_index + page); 5960 if (ap != NULL) { 5961 swap_xlate(ap, &vp, &off); 5962 } else { 5963 vp = svd->vp; 5964 off = svd->offset + ptob(page); 5965 } 5966 anon_array_exit(&cookie); 5967 ANON_LOCK_EXIT(&->a_rwlock); 5968 } else { 5969 vp = svd->vp; 5970 off = svd->offset + ptob(page); 5971 } 5972 if (vp == NULL) { /* untouched zfod page */ 5973 ASSERT(ap == NULL); 5974 continue; 5975 } 5976 5977 pp = page_lookup_nowait(vp, off, SE_SHARED); 5978 if (pp == NULL) 5979 continue; 5980 5981 5982 /* 5983 * Examine the page to see whether it can be tossed out, 5984 * keeping track of how many we've found. 5985 */ 5986 if (!page_tryupgrade(pp)) { 5987 /* 5988 * If the page has an i/o lock and no mappings, 5989 * it's very likely that the page is being 5990 * written out as a result of klustering. 5991 * Assume this is so and take credit for it here. 5992 */ 5993 if (!page_io_trylock(pp)) { 5994 if (!hat_page_is_mapped(pp)) 5995 pgcnt++; 5996 } else { 5997 page_io_unlock(pp); 5998 } 5999 page_unlock(pp); 6000 continue; 6001 } 6002 ASSERT(!page_iolock_assert(pp)); 6003 6004 6005 /* 6006 * Skip if page is locked or has mappings. 6007 * We don't need the page_struct_lock to look at lckcnt 6008 * and cowcnt because the page is exclusive locked. 6009 */ 6010 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0 || 6011 hat_page_is_mapped(pp)) { 6012 page_unlock(pp); 6013 continue; 6014 } 6015 6016 /* 6017 * dispose skips large pages so try to demote first. 6018 */ 6019 if (pp->p_szc != 0 && !page_try_demote_pages(pp)) { 6020 page_unlock(pp); 6021 /* 6022 * XXX should skip the remaining page_t's of this 6023 * large page. 6024 */ 6025 continue; 6026 } 6027 6028 ASSERT(pp->p_szc == 0); 6029 6030 /* 6031 * No longer mapped -- we can toss it out. How 6032 * we do so depends on whether or not it's dirty. 6033 */ 6034 if (hat_ismod(pp) && pp->p_vnode) { 6035 /* 6036 * We must clean the page before it can be 6037 * freed. Setting B_FREE will cause pvn_done 6038 * to free the page when the i/o completes. 6039 * XXX: This also causes it to be accounted 6040 * as a pageout instead of a swap: need 6041 * B_SWAPOUT bit to use instead of B_FREE. 
6042 * 6043 * Hold the vnode before releasing the page lock 6044 * to prevent it from being freed and re-used by 6045 * some other thread. 6046 */ 6047 VN_HOLD(vp); 6048 page_unlock(pp); 6049 6050 /* 6051 * Queue all i/o requests for the pageout thread 6052 * to avoid saturating the pageout devices. 6053 */ 6054 if (!queue_io_request(vp, off)) 6055 VN_RELE(vp); 6056 } else { 6057 /* 6058 * The page was clean, free it. 6059 * 6060 * XXX: Can we ever encounter modified pages 6061 * with no associated vnode here? 6062 */ 6063 ASSERT(pp->p_vnode != NULL); 6064 /*LINTED: constant in conditional context*/ 6065 VN_DISPOSE(pp, B_FREE, 0, kcred); 6066 } 6067 6068 /* 6069 * Credit now even if i/o is in progress. 6070 */ 6071 pgcnt++; 6072 } 6073 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6074 6075 /* 6076 * Wakeup pageout to initiate i/o on all queued requests. 6077 */ 6078 cv_signal_pageout(); 6079 return (ptob(pgcnt)); 6080 } 6081 6082 /* 6083 * Synchronize primary storage cache with real object in virtual memory. 6084 * 6085 * XXX - Anonymous pages should not be sync'ed out at all. 6086 */ 6087 static int 6088 segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) 6089 { 6090 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6091 struct vpage *vpp; 6092 page_t *pp; 6093 u_offset_t offset; 6094 struct vnode *vp; 6095 u_offset_t off; 6096 caddr_t eaddr; 6097 int bflags; 6098 int err = 0; 6099 int segtype; 6100 int pageprot; 6101 int prot; 6102 ulong_t anon_index; 6103 struct anon_map *amp; 6104 struct anon *ap; 6105 anon_sync_obj_t cookie; 6106 6107 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6108 6109 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6110 6111 if (svd->softlockcnt > 0) { 6112 /* 6113 * flush all pages from seg cache 6114 * otherwise we may deadlock in swap_putpage 6115 * for B_INVAL page (4175402). 6116 * 6117 * Even if we grab segvn WRITER's lock or segp_slock 6118 * here, there might be another thread which could've 6119 * successfully performed lookup/insert just before 6120 * we acquired the lock here. So, grabbing either 6121 * lock here is of not much use. Until we devise 6122 * a strategy at upper layers to solve the 6123 * synchronization issues completely, we expect 6124 * applications to handle this appropriately. 6125 */ 6126 segvn_purge(seg); 6127 if (svd->softlockcnt > 0) { 6128 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6129 return (EAGAIN); 6130 } 6131 } 6132 6133 vpp = svd->vpage; 6134 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 6135 bflags = ((flags & MS_ASYNC) ? B_ASYNC : 0) | 6136 ((flags & MS_INVALIDATE) ? B_INVAL : 0); 6137 6138 if (attr) { 6139 pageprot = attr & ~(SHARED|PRIVATE); 6140 segtype = (attr & SHARED) ? MAP_SHARED : MAP_PRIVATE; 6141 6142 /* 6143 * We are done if the segment types don't match 6144 * or if we have segment level protections and 6145 * they don't match. 6146 */ 6147 if (svd->type != segtype) { 6148 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6149 return (0); 6150 } 6151 if (vpp == NULL) { 6152 if (svd->prot != pageprot) { 6153 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6154 return (0); 6155 } 6156 prot = svd->prot; 6157 } else 6158 vpp = &svd->vpage[seg_page(seg, addr)]; 6159 6160 } else if (svd->vp && svd->amp == NULL && 6161 (flags & MS_INVALIDATE) == 0) { 6162 6163 /* 6164 * No attributes, no anonymous pages and MS_INVALIDATE flag 6165 * is not on, just use one big request. 
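 *
 * For example, msync(addr, len, MS_ASYNC) over a file-backed mapping
 * with no anon layer (svd->amp == NULL) takes this path and is pushed
 * out with a single request of roughly the form
 *
 *	err = VOP_PUTPAGE(svd->vp, offset, len, B_ASYNC, svd->cred);
 *
 * instead of the page-at-a-time loop further below.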
6166 */ 6167 err = VOP_PUTPAGE(svd->vp, (offset_t)offset, len, 6168 bflags, svd->cred); 6169 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6170 return (err); 6171 } 6172 6173 if ((amp = svd->amp) != NULL) 6174 anon_index = svd->anon_index + seg_page(seg, addr); 6175 6176 for (eaddr = addr + len; addr < eaddr; addr += PAGESIZE) { 6177 ap = NULL; 6178 if (amp != NULL) { 6179 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 6180 anon_array_enter(amp, anon_index, &cookie); 6181 ap = anon_get_ptr(amp->ahp, anon_index++); 6182 if (ap != NULL) { 6183 swap_xlate(ap, &vp, &off); 6184 } else { 6185 vp = svd->vp; 6186 off = offset; 6187 } 6188 anon_array_exit(&cookie); 6189 ANON_LOCK_EXIT(&->a_rwlock); 6190 } else { 6191 vp = svd->vp; 6192 off = offset; 6193 } 6194 offset += PAGESIZE; 6195 6196 if (vp == NULL) /* untouched zfod page */ 6197 continue; 6198 6199 if (attr) { 6200 if (vpp) { 6201 prot = VPP_PROT(vpp); 6202 vpp++; 6203 } 6204 if (prot != pageprot) { 6205 continue; 6206 } 6207 } 6208 6209 /* 6210 * See if any of these pages are locked -- if so, then we 6211 * will have to truncate an invalidate request at the first 6212 * locked one. We don't need the page_struct_lock to test 6213 * as this is only advisory; even if we acquire it someone 6214 * might race in and lock the page after we unlock and before 6215 * we do the PUTPAGE, then PUTPAGE simply does nothing. 6216 */ 6217 if (flags & MS_INVALIDATE) { 6218 if ((pp = page_lookup(vp, off, SE_SHARED)) != NULL) { 6219 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 6220 page_unlock(pp); 6221 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6222 return (EBUSY); 6223 } 6224 if (ap != NULL && pp->p_szc != 0 && 6225 page_tryupgrade(pp)) { 6226 if (pp->p_lckcnt == 0 && 6227 pp->p_cowcnt == 0) { 6228 /* 6229 * swapfs VN_DISPOSE() won't 6230 * invalidate large pages. 6231 * Attempt to demote. 6232 * XXX can't help it if it 6233 * fails. But for swapfs 6234 * pages it is no big deal. 6235 */ 6236 (void) page_try_demote_pages( 6237 pp); 6238 } 6239 } 6240 page_unlock(pp); 6241 } 6242 } else if (svd->type == MAP_SHARED && amp != NULL) { 6243 /* 6244 * Avoid writting out to disk ISM's large pages 6245 * because segspt_free_pages() relies on NULL an_pvp 6246 * of anon slots of such pages. 6247 */ 6248 6249 ASSERT(svd->vp == NULL); 6250 /* 6251 * swapfs uses page_lookup_nowait if not freeing or 6252 * invalidating and skips a page if 6253 * page_lookup_nowait returns NULL. 6254 */ 6255 pp = page_lookup_nowait(vp, off, SE_SHARED); 6256 if (pp == NULL) { 6257 continue; 6258 } 6259 if (pp->p_szc != 0) { 6260 page_unlock(pp); 6261 continue; 6262 } 6263 6264 /* 6265 * Note ISM pages are created large so (vp, off)'s 6266 * page cannot suddenly become large after we unlock 6267 * pp. 6268 */ 6269 page_unlock(pp); 6270 } 6271 /* 6272 * XXX - Should ultimately try to kluster 6273 * calls to VOP_PUTPAGE() for performance. 6274 */ 6275 VN_HOLD(vp); 6276 err = VOP_PUTPAGE(vp, (offset_t)off, PAGESIZE, 6277 bflags, svd->cred); 6278 VN_RELE(vp); 6279 if (err) 6280 break; 6281 } 6282 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6283 return (err); 6284 } 6285 6286 /* 6287 * Determine if we have data corresponding to pages in the 6288 * primary storage virtual memory cache (i.e., "in core"). 
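 *
 * This is the segvn backend for mincore(2): one status byte per page
 * is written to "vec", assembled from the SEG_PAGE_* flags gathered
 * below.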
6289 */ 6290 static size_t 6291 segvn_incore(struct seg *seg, caddr_t addr, size_t len, char *vec) 6292 { 6293 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6294 struct vnode *vp, *avp; 6295 u_offset_t offset, aoffset; 6296 size_t p, ep; 6297 int ret; 6298 struct vpage *vpp; 6299 page_t *pp; 6300 uint_t start; 6301 struct anon_map *amp; /* XXX - for locknest */ 6302 struct anon *ap; 6303 uint_t attr; 6304 anon_sync_obj_t cookie; 6305 6306 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6307 6308 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6309 if (svd->amp == NULL && svd->vp == NULL) { 6310 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6311 bzero(vec, btopr(len)); 6312 return (len); /* no anonymous pages created yet */ 6313 } 6314 6315 p = seg_page(seg, addr); 6316 ep = seg_page(seg, addr + len); 6317 start = svd->vp ? SEG_PAGE_VNODEBACKED : 0; 6318 6319 amp = svd->amp; 6320 for (; p < ep; p++, addr += PAGESIZE) { 6321 vpp = (svd->vpage) ? &svd->vpage[p]: NULL; 6322 ret = start; 6323 ap = NULL; 6324 avp = NULL; 6325 /* Grab the vnode/offset for the anon slot */ 6326 if (amp != NULL) { 6327 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 6328 anon_array_enter(amp, svd->anon_index + p, &cookie); 6329 ap = anon_get_ptr(amp->ahp, svd->anon_index + p); 6330 if (ap != NULL) { 6331 swap_xlate(ap, &avp, &aoffset); 6332 } 6333 anon_array_exit(&cookie); 6334 ANON_LOCK_EXIT(&->a_rwlock); 6335 } 6336 if ((avp != NULL) && page_exists(avp, aoffset)) { 6337 /* A page exists for the anon slot */ 6338 ret |= SEG_PAGE_INCORE; 6339 6340 /* 6341 * If page is mapped and writable 6342 */ 6343 attr = (uint_t)0; 6344 if ((hat_getattr(seg->s_as->a_hat, addr, 6345 &attr) != -1) && (attr & PROT_WRITE)) { 6346 ret |= SEG_PAGE_ANON; 6347 } 6348 /* 6349 * Don't get page_struct lock for lckcnt and cowcnt, 6350 * since this is purely advisory. 6351 */ 6352 if ((pp = page_lookup_nowait(avp, aoffset, 6353 SE_SHARED)) != NULL) { 6354 if (pp->p_lckcnt) 6355 ret |= SEG_PAGE_SOFTLOCK; 6356 if (pp->p_cowcnt) 6357 ret |= SEG_PAGE_HASCOW; 6358 page_unlock(pp); 6359 } 6360 } 6361 6362 /* Gather vnode statistics */ 6363 vp = svd->vp; 6364 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 6365 6366 if (vp != NULL) { 6367 /* 6368 * Try to obtain a "shared" lock on the page 6369 * without blocking. If this fails, determine 6370 * if the page is in memory. 6371 */ 6372 pp = page_lookup_nowait(vp, offset, SE_SHARED); 6373 if ((pp == NULL) && (page_exists(vp, offset))) { 6374 /* Page is incore, and is named */ 6375 ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE); 6376 } 6377 /* 6378 * Don't get page_struct lock for lckcnt and cowcnt, 6379 * since this is purely advisory. 6380 */ 6381 if (pp != NULL) { 6382 ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE); 6383 if (pp->p_lckcnt) 6384 ret |= SEG_PAGE_SOFTLOCK; 6385 if (pp->p_cowcnt) 6386 ret |= SEG_PAGE_HASCOW; 6387 page_unlock(pp); 6388 } 6389 } 6390 6391 /* Gather virtual page information */ 6392 if (vpp) { 6393 if (VPP_ISPPLOCK(vpp)) 6394 ret |= SEG_PAGE_LOCKED; 6395 vpp++; 6396 } 6397 6398 *vec++ = (char)ret; 6399 } 6400 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6401 return (len); 6402 } 6403 6404 /* 6405 * Statement for p_cowcnts/p_lckcnts. 
6406 * 6407 * p_cowcnt is updated while mlock/munlocking MAP_PRIVATE and PROT_WRITE region 6408 * irrespective of the following factors or anything else: 6409 * 6410 * (1) anon slots are populated or not 6411 * (2) cow is broken or not 6412 * (3) refcnt on ap is 1 or greater than 1 6413 * 6414 * If it's not MAP_PRIVATE and PROT_WRITE, p_lckcnt is updated during mlock 6415 * and munlock. 6416 * 6417 * 6418 * Handling p_cowcnts/p_lckcnts during copy-on-write fault: 6419 * 6420 * if vpage has PROT_WRITE 6421 * transfer cowcnt on the oldpage -> cowcnt on the newpage 6422 * else 6423 * transfer lckcnt on the oldpage -> lckcnt on the newpage 6424 * 6425 * During copy-on-write, decrement p_cowcnt on the oldpage and increment 6426 * p_cowcnt on the newpage *if* the corresponding vpage has PROT_WRITE. 6427 * 6428 * We may also break COW if softlocking on read access in the physio case. 6429 * In this case, vpage may not have PROT_WRITE. So, we need to decrement 6430 * p_lckcnt on the oldpage and increment p_lckcnt on the newpage *if* the 6431 * vpage doesn't have PROT_WRITE. 6432 * 6433 * 6434 * Handling p_cowcnts/p_lckcnts during mprotect on mlocked region: 6435 * 6436 * If a MAP_PRIVATE region loses PROT_WRITE, we decrement p_cowcnt and 6437 * increment p_lckcnt by calling page_subclaim() which takes care of 6438 * availrmem accounting and p_lckcnt overflow. 6439 * 6440 * If a MAP_PRIVATE region gains PROT_WRITE, we decrement p_lckcnt and 6441 * increment p_cowcnt by calling page_addclaim() which takes care of 6442 * availrmem availability and p_cowcnt overflow. 6443 */ 6444 6445 /* 6446 * Lock down (or unlock) pages mapped by this segment. 6447 * 6448 * XXX only creates PAGESIZE pages if anon slots are not initialized. 6449 * At fault time they will be relocated into larger pages. 6450 */ 6451 static int 6452 segvn_lockop(struct seg *seg, caddr_t addr, size_t len, 6453 int attr, int op, ulong_t *lockmap, size_t pos) 6454 { 6455 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6456 struct vpage *vpp; 6457 struct vpage *evp; 6458 page_t *pp; 6459 u_offset_t offset; 6460 u_offset_t off; 6461 int segtype; 6462 int pageprot; 6463 int claim; 6464 struct vnode *vp; 6465 ulong_t anon_index; 6466 struct anon_map *amp; 6467 struct anon *ap; 6468 struct vattr va; 6469 anon_sync_obj_t cookie; 6470 6471 /* 6472 * Hold write lock on address space because may split or concatenate 6473 * segments 6474 */ 6475 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6476 6477 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 6478 if (attr) { 6479 pageprot = attr & ~(SHARED|PRIVATE); 6480 segtype = attr & SHARED ? MAP_SHARED : MAP_PRIVATE; 6481 6482 /* 6483 * We are done if the segment types don't match 6484 * or if we have segment level protections and 6485 * they don't match. 6486 */ 6487 if (svd->type != segtype) { 6488 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6489 return (0); 6490 } 6491 if (svd->pageprot == 0 && svd->prot != pageprot) { 6492 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6493 return (0); 6494 } 6495 } 6496 6497 /* 6498 * If we're locking, then we must create a vpage structure if 6499 * none exists. If we're unlocking, then check to see if there 6500 * is a vpage -- if not, then we could not have locked anything. 
6501 */ 6502 6503 if ((vpp = svd->vpage) == NULL) { 6504 if (op == MC_LOCK) 6505 segvn_vpage(seg); 6506 else { 6507 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6508 return (0); 6509 } 6510 } 6511 6512 /* 6513 * The anonymous data vector (i.e., previously 6514 * unreferenced mapping to swap space) can be allocated 6515 * by lazily testing for its existence. 6516 */ 6517 if (op == MC_LOCK && svd->amp == NULL && svd->vp == NULL) { 6518 svd->amp = anonmap_alloc(seg->s_size, 0); 6519 svd->amp->a_szc = seg->s_szc; 6520 } 6521 6522 if ((amp = svd->amp) != NULL) { 6523 anon_index = svd->anon_index + seg_page(seg, addr); 6524 } 6525 6526 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 6527 evp = &svd->vpage[seg_page(seg, addr + len)]; 6528 6529 /* 6530 * Loop over all pages in the range. Process if we're locking and 6531 * page has not already been locked in this mapping; or if we're 6532 * unlocking and the page has been locked. 6533 */ 6534 for (vpp = &svd->vpage[seg_page(seg, addr)]; vpp < evp; 6535 vpp++, pos++, addr += PAGESIZE, offset += PAGESIZE, anon_index++) { 6536 if ((attr == 0 || VPP_PROT(vpp) == pageprot) && 6537 ((op == MC_LOCK && !VPP_ISPPLOCK(vpp)) || 6538 (op == MC_UNLOCK && VPP_ISPPLOCK(vpp)))) { 6539 6540 if (amp != NULL) 6541 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 6542 /* 6543 * If this isn't a MAP_NORESERVE segment and 6544 * we're locking, allocate anon slots if they 6545 * don't exist. The page is brought in later on. 6546 */ 6547 if (op == MC_LOCK && svd->vp == NULL && 6548 ((svd->flags & MAP_NORESERVE) == 0) && 6549 amp != NULL && 6550 ((ap = anon_get_ptr(amp->ahp, anon_index)) 6551 == NULL)) { 6552 anon_array_enter(amp, anon_index, &cookie); 6553 6554 if ((ap = anon_get_ptr(amp->ahp, 6555 anon_index)) == NULL) { 6556 pp = anon_zero(seg, addr, &ap, 6557 svd->cred); 6558 if (pp == NULL) { 6559 anon_array_exit(&cookie); 6560 ANON_LOCK_EXIT(&->a_rwlock); 6561 SEGVN_LOCK_EXIT(seg->s_as, 6562 &svd->lock); 6563 return (ENOMEM); 6564 } 6565 ASSERT(anon_get_ptr(amp->ahp, 6566 anon_index) == NULL); 6567 (void) anon_set_ptr(amp->ahp, 6568 anon_index, ap, ANON_SLEEP); 6569 page_unlock(pp); 6570 } 6571 anon_array_exit(&cookie); 6572 } 6573 6574 /* 6575 * Get name for page, accounting for 6576 * existence of private copy. 6577 */ 6578 ap = NULL; 6579 if (amp != NULL) { 6580 anon_array_enter(amp, anon_index, &cookie); 6581 ap = anon_get_ptr(amp->ahp, anon_index); 6582 if (ap != NULL) { 6583 swap_xlate(ap, &vp, &off); 6584 } else { 6585 if (svd->vp == NULL && 6586 (svd->flags & MAP_NORESERVE)) { 6587 anon_array_exit(&cookie); 6588 ANON_LOCK_EXIT(&->a_rwlock); 6589 continue; 6590 } 6591 vp = svd->vp; 6592 off = offset; 6593 } 6594 anon_array_exit(&cookie); 6595 ANON_LOCK_EXIT(&->a_rwlock); 6596 } else { 6597 vp = svd->vp; 6598 off = offset; 6599 } 6600 6601 /* 6602 * Get page frame. It's ok if the page is 6603 * not available when we're unlocking, as this 6604 * may simply mean that a page we locked got 6605 * truncated out of existence after we locked it. 6606 * 6607 * Invoke VOP_GETPAGE() to obtain the page struct 6608 * since we may need to read it from disk if its 6609 * been paged out. 
6610 */ 6611 if (op != MC_LOCK) 6612 pp = page_lookup(vp, off, SE_SHARED); 6613 else { 6614 page_t *pl[1 + 1]; 6615 int error; 6616 6617 ASSERT(vp != NULL); 6618 6619 error = VOP_GETPAGE(vp, (offset_t)off, PAGESIZE, 6620 (uint_t *)NULL, pl, PAGESIZE, seg, addr, 6621 S_OTHER, svd->cred); 6622 6623 /* 6624 * If the error is EDEADLK then we must bounce 6625 * up and drop all vm subsystem locks and then 6626 * retry the operation later 6627 * This behavior is a temporary measure because 6628 * ufs/sds logging is badly designed and will 6629 * deadlock if we don't allow this bounce to 6630 * happen. The real solution is to re-design 6631 * the logging code to work properly. See bug 6632 * 4125102 for details of the problem. 6633 */ 6634 if (error == EDEADLK) { 6635 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6636 return (error); 6637 } 6638 /* 6639 * Quit if we fail to fault in the page. Treat 6640 * the failure as an error, unless the addr 6641 * is mapped beyond the end of a file. 6642 */ 6643 if (error && svd->vp) { 6644 va.va_mask = AT_SIZE; 6645 if (VOP_GETATTR(svd->vp, &va, 0, 6646 svd->cred) != 0) { 6647 SEGVN_LOCK_EXIT(seg->s_as, 6648 &svd->lock); 6649 return (EIO); 6650 } 6651 if (btopr(va.va_size) >= 6652 btopr(off + 1)) { 6653 SEGVN_LOCK_EXIT(seg->s_as, 6654 &svd->lock); 6655 return (EIO); 6656 } 6657 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6658 return (0); 6659 } else if (error) { 6660 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6661 return (EIO); 6662 } 6663 pp = pl[0]; 6664 ASSERT(pp != NULL); 6665 } 6666 6667 /* 6668 * See Statement at the beginning of this routine. 6669 * 6670 * claim is always set if MAP_PRIVATE and PROT_WRITE 6671 * irrespective of following factors: 6672 * 6673 * (1) anon slots are populated or not 6674 * (2) cow is broken or not 6675 * (3) refcnt on ap is 1 or greater than 1 6676 * 6677 * See 4140683 for details 6678 */ 6679 claim = ((VPP_PROT(vpp) & PROT_WRITE) && 6680 (svd->type == MAP_PRIVATE)); 6681 6682 /* 6683 * Perform page-level operation appropriate to 6684 * operation. If locking, undo the SOFTLOCK 6685 * performed to bring the page into memory 6686 * after setting the lock. If unlocking, 6687 * and no page was found, account for the claim 6688 * separately. 6689 */ 6690 if (op == MC_LOCK) { 6691 int ret = 1; /* Assume success */ 6692 6693 /* 6694 * Make sure another thread didn't lock 6695 * the page after we released the segment 6696 * lock. 
6697 */ 6698 if ((attr == 0 || VPP_PROT(vpp) == pageprot) && 6699 !VPP_ISPPLOCK(vpp)) { 6700 ret = page_pp_lock(pp, claim, 0); 6701 if (ret != 0) { 6702 VPP_SETPPLOCK(vpp); 6703 if (lockmap != (ulong_t *)NULL) 6704 BT_SET(lockmap, pos); 6705 } 6706 } 6707 page_unlock(pp); 6708 if (ret == 0) { 6709 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6710 return (EAGAIN); 6711 } 6712 } else { 6713 if (pp != NULL) { 6714 if ((attr == 0 || 6715 VPP_PROT(vpp) == pageprot) && 6716 VPP_ISPPLOCK(vpp)) 6717 page_pp_unlock(pp, claim, 0); 6718 page_unlock(pp); 6719 } 6720 VPP_CLRPPLOCK(vpp); 6721 } 6722 } 6723 } 6724 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6725 return (0); 6726 } 6727 6728 /* 6729 * Set advice from user for specified pages 6730 * There are 5 types of advice: 6731 * MADV_NORMAL - Normal (default) behavior (whatever that is) 6732 * MADV_RANDOM - Random page references 6733 * do not allow readahead or 'klustering' 6734 * MADV_SEQUENTIAL - Sequential page references 6735 * Pages previous to the one currently being 6736 * accessed (determined by fault) are 'not needed' 6737 * and are freed immediately 6738 * MADV_WILLNEED - Pages are likely to be used (fault ahead in mctl) 6739 * MADV_DONTNEED - Pages are not needed (synced out in mctl) 6740 * MADV_FREE - Contents can be discarded 6741 * MADV_ACCESS_DEFAULT- Default access 6742 * MADV_ACCESS_LWP - Next LWP will access heavily 6743 * MADV_ACCESS_MANY- Many LWPs or processes will access heavily 6744 */ 6745 static int 6746 segvn_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav) 6747 { 6748 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6749 size_t page; 6750 int err = 0; 6751 int already_set; 6752 struct anon_map *amp; 6753 ulong_t anon_index; 6754 struct seg *next; 6755 lgrp_mem_policy_t policy; 6756 struct seg *prev; 6757 struct vnode *vp; 6758 6759 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6760 6761 /* 6762 * In case of MADV_FREE, we won't be modifying any segment private 6763 * data structures; so, we only need to grab READER's lock 6764 */ 6765 if (behav != MADV_FREE) 6766 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 6767 else 6768 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6769 6770 /* 6771 * Large pages are assumed to be only turned on when accesses to the 6772 * segment's address range have spatial and temporal locality. That 6773 * justifies ignoring MADV_SEQUENTIAL for large page segments. 6774 * Also, ignore advice affecting lgroup memory allocation 6775 * if don't need to do lgroup optimizations on this system 6776 */ 6777 6778 if ((behav == MADV_SEQUENTIAL && seg->s_szc != 0) || 6779 (!lgrp_optimizations() && (behav == MADV_ACCESS_DEFAULT || 6780 behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY))) { 6781 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6782 return (0); 6783 } 6784 6785 if (behav == MADV_SEQUENTIAL || behav == MADV_ACCESS_DEFAULT || 6786 behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY) { 6787 /* 6788 * Since we are going to unload hat mappings 6789 * we first have to flush the cache. Otherwise 6790 * this might lead to system panic if another 6791 * thread is doing physio on the range whose 6792 * mappings are unloaded by madvise(3C). 6793 */ 6794 if (svd->softlockcnt > 0) { 6795 /* 6796 * Since we do have the segvn writers lock 6797 * nobody can fill the cache with entries 6798 * belonging to this seg during the purge. 6799 * The flush either succeeds or we still 6800 * have pending I/Os. In the later case, 6801 * madvise(3C) fails. 
6802 */ 6803 segvn_purge(seg); 6804 if (svd->softlockcnt > 0) { 6805 /* 6806 * Since madvise(3C) is advisory and 6807 * it's not part of UNIX98, madvise(3C) 6808 * failure here doesn't cause any hardship. 6809 * Note that we don't block in "as" layer. 6810 */ 6811 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6812 return (EAGAIN); 6813 } 6814 } 6815 } 6816 6817 amp = svd->amp; 6818 vp = svd->vp; 6819 if (behav == MADV_FREE) { 6820 /* 6821 * MADV_FREE is not supported for segments with 6822 * underlying object; if anonmap is NULL, anon slots 6823 * are not yet populated and there is nothing for 6824 * us to do. As MADV_FREE is advisory, we don't 6825 * return error in either case. 6826 */ 6827 if (vp || amp == NULL) { 6828 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6829 return (0); 6830 } 6831 6832 page = seg_page(seg, addr); 6833 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 6834 anon_disclaim(amp, svd->anon_index + page, len, 0); 6835 ANON_LOCK_EXIT(&->a_rwlock); 6836 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6837 return (0); 6838 } 6839 6840 /* 6841 * If advice is to be applied to entire segment, 6842 * use advice field in seg_data structure 6843 * otherwise use appropriate vpage entry. 6844 */ 6845 if ((addr == seg->s_base) && (len == seg->s_size)) { 6846 switch (behav) { 6847 case MADV_ACCESS_LWP: 6848 case MADV_ACCESS_MANY: 6849 case MADV_ACCESS_DEFAULT: 6850 /* 6851 * Set memory allocation policy for this segment 6852 */ 6853 policy = lgrp_madv_to_policy(behav, len, svd->type); 6854 if (svd->type == MAP_SHARED) 6855 already_set = lgrp_shm_policy_set(policy, amp, 6856 svd->anon_index, vp, svd->offset, len); 6857 else { 6858 /* 6859 * For private memory, need writers lock on 6860 * address space because the segment may be 6861 * split or concatenated when changing policy 6862 */ 6863 if (AS_READ_HELD(seg->s_as, 6864 &seg->s_as->a_lock)) { 6865 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6866 return (IE_RETRY); 6867 } 6868 6869 already_set = lgrp_privm_policy_set(policy, 6870 &svd->policy_info, len); 6871 } 6872 6873 /* 6874 * If policy set already and it shouldn't be reapplied, 6875 * don't do anything. 6876 */ 6877 if (already_set && 6878 !LGRP_MEM_POLICY_REAPPLICABLE(policy)) 6879 break; 6880 6881 /* 6882 * Mark any existing pages in given range for 6883 * migration 6884 */ 6885 page_mark_migrate(seg, addr, len, amp, svd->anon_index, 6886 vp, svd->offset, 1); 6887 6888 /* 6889 * If same policy set already or this is a shared 6890 * memory segment, don't need to try to concatenate 6891 * segment with adjacent ones. 6892 */ 6893 if (already_set || svd->type == MAP_SHARED) 6894 break; 6895 6896 /* 6897 * Try to concatenate this segment with previous 6898 * one and next one, since we changed policy for 6899 * this one and it may be compatible with adjacent 6900 * ones now. 
6901 */ 6902 prev = AS_SEGPREV(seg->s_as, seg); 6903 next = AS_SEGNEXT(seg->s_as, seg); 6904 6905 if (next && next->s_ops == &segvn_ops && 6906 addr + len == next->s_base) 6907 (void) segvn_concat(seg, next, 1); 6908 6909 if (prev && prev->s_ops == &segvn_ops && 6910 addr == prev->s_base + prev->s_size) { 6911 /* 6912 * Drop lock for private data of current 6913 * segment before concatenating (deleting) it 6914 * and return IE_REATTACH to tell as_ctl() that 6915 * current segment has changed 6916 */ 6917 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6918 if (!segvn_concat(prev, seg, 1)) 6919 err = IE_REATTACH; 6920 6921 return (err); 6922 } 6923 break; 6924 6925 case MADV_SEQUENTIAL: 6926 /* 6927 * unloading mapping guarantees 6928 * detection in segvn_fault 6929 */ 6930 ASSERT(seg->s_szc == 0); 6931 hat_unload(seg->s_as->a_hat, addr, len, 6932 HAT_UNLOAD); 6933 /* FALLTHROUGH */ 6934 case MADV_NORMAL: 6935 case MADV_RANDOM: 6936 svd->advice = (uchar_t)behav; 6937 svd->pageadvice = 0; 6938 break; 6939 case MADV_WILLNEED: /* handled in memcntl */ 6940 case MADV_DONTNEED: /* handled in memcntl */ 6941 case MADV_FREE: /* handled above */ 6942 break; 6943 default: 6944 err = EINVAL; 6945 } 6946 } else { 6947 caddr_t eaddr; 6948 struct seg *new_seg; 6949 struct segvn_data *new_svd; 6950 u_offset_t off; 6951 caddr_t oldeaddr; 6952 6953 page = seg_page(seg, addr); 6954 6955 segvn_vpage(seg); 6956 6957 switch (behav) { 6958 struct vpage *bvpp, *evpp; 6959 6960 case MADV_ACCESS_LWP: 6961 case MADV_ACCESS_MANY: 6962 case MADV_ACCESS_DEFAULT: 6963 /* 6964 * Set memory allocation policy for portion of this 6965 * segment 6966 */ 6967 6968 /* 6969 * Align address and length of advice to page 6970 * boundaries for large pages 6971 */ 6972 if (seg->s_szc != 0) { 6973 size_t pgsz; 6974 6975 pgsz = page_get_pagesize(seg->s_szc); 6976 addr = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 6977 len = P2ROUNDUP(len, pgsz); 6978 } 6979 6980 /* 6981 * Check to see whether policy is set already 6982 */ 6983 policy = lgrp_madv_to_policy(behav, len, svd->type); 6984 6985 anon_index = svd->anon_index + page; 6986 off = svd->offset + (uintptr_t)(addr - seg->s_base); 6987 6988 if (svd->type == MAP_SHARED) 6989 already_set = lgrp_shm_policy_set(policy, amp, 6990 anon_index, vp, off, len); 6991 else 6992 already_set = 6993 (policy == svd->policy_info.mem_policy); 6994 6995 /* 6996 * If policy set already and it shouldn't be reapplied, 6997 * don't do anything. 
6998 */ 6999 if (already_set && 7000 !LGRP_MEM_POLICY_REAPPLICABLE(policy)) 7001 break; 7002 7003 /* 7004 * For private memory, need writers lock on 7005 * address space because the segment may be 7006 * split or concatenated when changing policy 7007 */ 7008 if (svd->type == MAP_PRIVATE && 7009 AS_READ_HELD(seg->s_as, &seg->s_as->a_lock)) { 7010 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7011 return (IE_RETRY); 7012 } 7013 7014 /* 7015 * Mark any existing pages in given range for 7016 * migration 7017 */ 7018 page_mark_migrate(seg, addr, len, amp, svd->anon_index, 7019 vp, svd->offset, 1); 7020 7021 /* 7022 * Don't need to try to split or concatenate 7023 * segments, since policy is same or this is a shared 7024 * memory segment 7025 */ 7026 if (already_set || svd->type == MAP_SHARED) 7027 break; 7028 7029 /* 7030 * Split off new segment if advice only applies to a 7031 * portion of existing segment starting in middle 7032 */ 7033 new_seg = NULL; 7034 eaddr = addr + len; 7035 oldeaddr = seg->s_base + seg->s_size; 7036 if (addr > seg->s_base) { 7037 /* 7038 * Must flush I/O page cache 7039 * before splitting segment 7040 */ 7041 if (svd->softlockcnt > 0) 7042 segvn_purge(seg); 7043 7044 /* 7045 * Split segment and return IE_REATTACH to tell 7046 * as_ctl() that current segment changed 7047 */ 7048 new_seg = segvn_split_seg(seg, addr); 7049 new_svd = (struct segvn_data *)new_seg->s_data; 7050 err = IE_REATTACH; 7051 7052 /* 7053 * If new segment ends where old one 7054 * did, try to concatenate the new 7055 * segment with next one. 7056 */ 7057 if (eaddr == oldeaddr) { 7058 /* 7059 * Set policy for new segment 7060 */ 7061 (void) lgrp_privm_policy_set(policy, 7062 &new_svd->policy_info, 7063 new_seg->s_size); 7064 7065 next = AS_SEGNEXT(new_seg->s_as, 7066 new_seg); 7067 7068 if (next && 7069 next->s_ops == &segvn_ops && 7070 eaddr == next->s_base) 7071 (void) segvn_concat(new_seg, 7072 next, 1); 7073 } 7074 } 7075 7076 /* 7077 * Split off end of existing segment if advice only 7078 * applies to a portion of segment ending before 7079 * end of the existing segment 7080 */ 7081 if (eaddr < oldeaddr) { 7082 /* 7083 * Must flush I/O page cache 7084 * before splitting segment 7085 */ 7086 if (svd->softlockcnt > 0) 7087 segvn_purge(seg); 7088 7089 /* 7090 * If beginning of old segment was already 7091 * split off, use new segment to split end off 7092 * from. 7093 */ 7094 if (new_seg != NULL && new_seg != seg) { 7095 /* 7096 * Split segment 7097 */ 7098 (void) segvn_split_seg(new_seg, eaddr); 7099 7100 /* 7101 * Set policy for new segment 7102 */ 7103 (void) lgrp_privm_policy_set(policy, 7104 &new_svd->policy_info, 7105 new_seg->s_size); 7106 } else { 7107 /* 7108 * Split segment and return IE_REATTACH 7109 * to tell as_ctl() that current 7110 * segment changed 7111 */ 7112 (void) segvn_split_seg(seg, eaddr); 7113 err = IE_REATTACH; 7114 7115 (void) lgrp_privm_policy_set(policy, 7116 &svd->policy_info, seg->s_size); 7117 7118 /* 7119 * If new segment starts where old one 7120 * did, try to concatenate it with 7121 * previous segment. 
7122 */ 7123 if (addr == seg->s_base) { 7124 prev = AS_SEGPREV(seg->s_as, 7125 seg); 7126 7127 /* 7128 * Drop lock for private data 7129 * of current segment before 7130 * concatenating (deleting) it 7131 */ 7132 if (prev && 7133 prev->s_ops == 7134 &segvn_ops && 7135 addr == prev->s_base + 7136 prev->s_size) { 7137 SEGVN_LOCK_EXIT( 7138 seg->s_as, 7139 &svd->lock); 7140 (void) segvn_concat( 7141 prev, seg, 1); 7142 return (err); 7143 } 7144 } 7145 } 7146 } 7147 break; 7148 case MADV_SEQUENTIAL: 7149 ASSERT(seg->s_szc == 0); 7150 hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD); 7151 /* FALLTHROUGH */ 7152 case MADV_NORMAL: 7153 case MADV_RANDOM: 7154 bvpp = &svd->vpage[page]; 7155 evpp = &svd->vpage[page + (len >> PAGESHIFT)]; 7156 for (; bvpp < evpp; bvpp++) 7157 VPP_SETADVICE(bvpp, behav); 7158 svd->advice = MADV_NORMAL; 7159 break; 7160 case MADV_WILLNEED: /* handled in memcntl */ 7161 case MADV_DONTNEED: /* handled in memcntl */ 7162 case MADV_FREE: /* handled above */ 7163 break; 7164 default: 7165 err = EINVAL; 7166 } 7167 } 7168 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7169 return (err); 7170 } 7171 7172 /* 7173 * Create a vpage structure for this seg. 7174 */ 7175 static void 7176 segvn_vpage(struct seg *seg) 7177 { 7178 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7179 struct vpage *vp, *evp; 7180 7181 ASSERT(SEGVN_WRITE_HELD(seg->s_as, &svd->lock)); 7182 7183 /* 7184 * If no vpage structure exists, allocate one. Copy the protections 7185 * and the advice from the segment itself to the individual pages. 7186 */ 7187 if (svd->vpage == NULL) { 7188 svd->pageprot = 1; 7189 svd->pageadvice = 1; 7190 svd->vpage = kmem_zalloc(seg_pages(seg) * sizeof (struct vpage), 7191 KM_SLEEP); 7192 evp = &svd->vpage[seg_page(seg, seg->s_base + seg->s_size)]; 7193 for (vp = svd->vpage; vp < evp; vp++) { 7194 VPP_SETPROT(vp, svd->prot); 7195 VPP_SETADVICE(vp, svd->advice); 7196 } 7197 } 7198 } 7199 7200 /* 7201 * Dump the pages belonging to this segvn segment. 7202 */ 7203 static void 7204 segvn_dump(struct seg *seg) 7205 { 7206 struct segvn_data *svd; 7207 page_t *pp; 7208 struct anon_map *amp; 7209 ulong_t anon_index; 7210 struct vnode *vp; 7211 u_offset_t off, offset; 7212 pfn_t pfn; 7213 pgcnt_t page, npages; 7214 caddr_t addr; 7215 7216 npages = seg_pages(seg); 7217 svd = (struct segvn_data *)seg->s_data; 7218 vp = svd->vp; 7219 off = offset = svd->offset; 7220 addr = seg->s_base; 7221 7222 if ((amp = svd->amp) != NULL) { 7223 anon_index = svd->anon_index; 7224 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7225 } 7226 7227 for (page = 0; page < npages; page++, offset += PAGESIZE) { 7228 struct anon *ap; 7229 int we_own_it = 0; 7230 7231 if (amp && (ap = anon_get_ptr(svd->amp->ahp, anon_index++))) { 7232 swap_xlate_nopanic(ap, &vp, &off); 7233 } else { 7234 vp = svd->vp; 7235 off = offset; 7236 } 7237 7238 /* 7239 * If pp == NULL, the page either does not exist 7240 * or is exclusively locked. So determine if it 7241 * exists before searching for it. 7242 */ 7243 7244 if ((pp = page_lookup_nowait(vp, off, SE_SHARED))) 7245 we_own_it = 1; 7246 else 7247 pp = page_exists(vp, off); 7248 7249 if (pp) { 7250 pfn = page_pptonum(pp); 7251 dump_addpage(seg->s_as, addr, pfn); 7252 if (we_own_it) 7253 page_unlock(pp); 7254 } 7255 addr += PAGESIZE; 7256 dump_timeleft = dump_timeout; 7257 } 7258 7259 if (amp != NULL) 7260 ANON_LOCK_EXIT(&->a_rwlock); 7261 } 7262 7263 /* 7264 * lock/unlock anon pages over a given range. 
Return shadow list 7265 */ 7266 static int 7267 segvn_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***ppp, 7268 enum lock_type type, enum seg_rw rw) 7269 { 7270 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7271 size_t np, adjustpages = 0, npages = (len >> PAGESHIFT); 7272 ulong_t anon_index; 7273 uint_t protchk; 7274 uint_t error; 7275 struct anon_map *amp; 7276 struct page **pplist, **pl, *pp; 7277 caddr_t a; 7278 size_t page; 7279 caddr_t lpgaddr, lpgeaddr; 7280 7281 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_START, 7282 "segvn_pagelock: start seg %p addr %p", seg, addr); 7283 7284 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 7285 if (seg->s_szc != 0 && (type == L_PAGELOCK || type == L_PAGEUNLOCK)) { 7286 /* 7287 * We are adjusting the pagelock region to the large page size 7288 * boundary because the unlocked part of a large page cannot 7289 * be freed anyway unless all constituent pages of a large 7290 * page are locked. Therefore this adjustment allows us to 7291 * decrement availrmem by the right value (note we don't want 7292 * to just decrement availrem by the large page size without 7293 * adjusting addr and len because then we may end up 7294 * decrementing availrmem by large page size for every 7295 * constituent page locked by a new as_pagelock call). 7296 * as_pageunlock caller must always match as_pagelock call's 7297 * addr and len. 7298 * 7299 * Note segment's page size cannot change while we are holding 7300 * as lock. And then it cannot change while softlockcnt is 7301 * not 0. This will allow us to correctly recalculate large 7302 * page size region for the matching pageunlock/reclaim call. 7303 * 7304 * for pageunlock *ppp points to the pointer of page_t that 7305 * corresponds to the real unadjusted start address. Similar 7306 * for pagelock *ppp must point to the pointer of page_t that 7307 * corresponds to the real unadjusted start address. 7308 */ 7309 size_t pgsz = page_get_pagesize(seg->s_szc); 7310 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 7311 adjustpages = ((uintptr_t)(addr - lpgaddr)) >> PAGESHIFT; 7312 } 7313 7314 if (type == L_PAGEUNLOCK) { 7315 7316 /* 7317 * update hat ref bits for /proc. We need to make sure 7318 * that threads tracing the ref and mod bits of the 7319 * address space get the right data. 7320 * Note: page ref and mod bits are updated at reclaim time 7321 */ 7322 if (seg->s_as->a_vbits) { 7323 for (a = addr; a < addr + len; a += PAGESIZE) { 7324 if (rw == S_WRITE) { 7325 hat_setstat(seg->s_as, a, 7326 PAGESIZE, P_REF | P_MOD); 7327 } else { 7328 hat_setstat(seg->s_as, a, 7329 PAGESIZE, P_REF); 7330 } 7331 } 7332 } 7333 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7334 if (seg->s_szc != 0) { 7335 VM_STAT_ADD(segvnvmstats.pagelock[0]); 7336 seg_pinactive(seg, lpgaddr, lpgeaddr - lpgaddr, 7337 *ppp - adjustpages, rw, segvn_reclaim); 7338 } else { 7339 seg_pinactive(seg, addr, len, *ppp, rw, segvn_reclaim); 7340 } 7341 7342 /* 7343 * If someone is blocked while unmapping, we purge 7344 * segment page cache and thus reclaim pplist synchronously 7345 * without waiting for seg_pasync_thread. This speeds up 7346 * unmapping in cases where munmap(2) is called, while 7347 * raw async i/o is still in progress or where a thread 7348 * exits on data fault in a multithreaded application. 
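 *
 * (Reclaiming the cached shadow lists here is what lets softlockcnt
 * drain to zero, at which point the blocked unmapper is woken via the
 * AS_CLRUNMAPWAIT()/cv_broadcast() handshake in segvn_reclaim().)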
7349 */ 7350 if (AS_ISUNMAPWAIT(seg->s_as) && (svd->softlockcnt > 0)) { 7351 /* 7352 * Even if we grab segvn WRITER's lock or segp_slock 7353 * here, there might be another thread which could've 7354 * successfully performed lookup/insert just before 7355 * we acquired the lock here. So, grabbing either 7356 * lock here is of not much use. Until we devise 7357 * a strategy at upper layers to solve the 7358 * synchronization issues completely, we expect 7359 * applications to handle this appropriately. 7360 */ 7361 segvn_purge(seg); 7362 } 7363 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7364 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END, 7365 "segvn_pagelock: unlock seg %p addr %p", seg, addr); 7366 return (0); 7367 } else if (type == L_PAGERECLAIM) { 7368 VM_STAT_COND_ADD(seg->s_szc != 0, segvnvmstats.pagelock[1]); 7369 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7370 (void) segvn_reclaim(seg, addr, len, *ppp, rw); 7371 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7372 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END, 7373 "segvn_pagelock: reclaim seg %p addr %p", seg, addr); 7374 return (0); 7375 } 7376 7377 if (seg->s_szc != 0) { 7378 VM_STAT_ADD(segvnvmstats.pagelock[2]); 7379 addr = lpgaddr; 7380 len = lpgeaddr - lpgaddr; 7381 npages = (len >> PAGESHIFT); 7382 } 7383 7384 /* 7385 * for now we only support pagelock to anon memory. We've to check 7386 * protections for vnode objects and call into the vnode driver. 7387 * That's too much for a fast path. Let the fault entry point handle it. 7388 */ 7389 if (svd->vp != NULL) { 7390 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, 7391 "segvn_pagelock: mapped vnode seg %p addr %p", seg, addr); 7392 *ppp = NULL; 7393 return (ENOTSUP); 7394 } 7395 7396 /* 7397 * if anonmap is not yet created, let the fault entry point populate it 7398 * with anon ptrs. 
7399 */ 7400 if ((amp = svd->amp) == NULL) { 7401 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, 7402 "segvn_pagelock: anonmap null seg %p addr %p", seg, addr); 7403 *ppp = NULL; 7404 return (EFAULT); 7405 } 7406 7407 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7408 7409 /* 7410 * we acquire segp_slock to prevent duplicate entries 7411 * in seg_pcache 7412 */ 7413 mutex_enter(&svd->segp_slock); 7414 7415 /* 7416 * try to find pages in segment page cache 7417 */ 7418 pplist = seg_plookup(seg, addr, len, rw); 7419 if (pplist != NULL) { 7420 mutex_exit(&svd->segp_slock); 7421 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7422 *ppp = pplist + adjustpages; 7423 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_HIT_END, 7424 "segvn_pagelock: cache hit seg %p addr %p", seg, addr); 7425 return (0); 7426 } 7427 7428 if (rw == S_READ) { 7429 protchk = PROT_READ; 7430 } else { 7431 protchk = PROT_WRITE; 7432 } 7433 7434 if (svd->pageprot == 0) { 7435 if ((svd->prot & protchk) == 0) { 7436 mutex_exit(&svd->segp_slock); 7437 error = EFAULT; 7438 goto out; 7439 } 7440 } else { 7441 /* 7442 * check page protections 7443 */ 7444 for (a = addr; a < addr + len; a += PAGESIZE) { 7445 struct vpage *vp; 7446 7447 vp = &svd->vpage[seg_page(seg, a)]; 7448 if ((VPP_PROT(vp) & protchk) == 0) { 7449 mutex_exit(&svd->segp_slock); 7450 error = EFAULT; 7451 goto out; 7452 } 7453 } 7454 } 7455 7456 mutex_enter(&freemem_lock); 7457 if (availrmem < tune.t_minarmem + npages) { 7458 mutex_exit(&freemem_lock); 7459 mutex_exit(&svd->segp_slock); 7460 error = ENOMEM; 7461 goto out; 7462 } else { 7463 svd->softlockcnt += npages; 7464 availrmem -= npages; 7465 segvn_pages_locked += npages; 7466 } 7467 mutex_exit(&freemem_lock); 7468 7469 pplist = kmem_alloc(sizeof (page_t *) * npages, KM_SLEEP); 7470 pl = pplist; 7471 *ppp = pplist + adjustpages; 7472 7473 page = seg_page(seg, addr); 7474 anon_index = svd->anon_index + page; 7475 7476 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7477 for (a = addr; a < addr + len; a += PAGESIZE, anon_index++) { 7478 struct anon *ap; 7479 struct vnode *vp; 7480 u_offset_t off; 7481 anon_sync_obj_t cookie; 7482 7483 anon_array_enter(amp, anon_index, &cookie); 7484 ap = anon_get_ptr(amp->ahp, anon_index); 7485 if (ap == NULL) { 7486 anon_array_exit(&cookie); 7487 break; 7488 } else { 7489 /* 7490 * We must never use seg_pcache for COW pages 7491 * because we might end up with original page still 7492 * lying in seg_pcache even after private page is 7493 * created. This leads to data corruption as 7494 * aio_write refers to the page still in cache 7495 * while all other accesses refer to the private 7496 * page. 
7497 */ 7498 if (ap->an_refcnt != 1) { 7499 anon_array_exit(&cookie); 7500 break; 7501 } 7502 } 7503 swap_xlate(ap, &vp, &off); 7504 anon_array_exit(&cookie); 7505 7506 pp = page_lookup_nowait(vp, off, SE_SHARED); 7507 if (pp == NULL) { 7508 break; 7509 } 7510 *pplist++ = pp; 7511 } 7512 ANON_LOCK_EXIT(&->a_rwlock); 7513 7514 if (a >= addr + len) { 7515 (void) seg_pinsert(seg, addr, len, pl, rw, SEGP_ASYNC_FLUSH, 7516 segvn_reclaim); 7517 mutex_exit(&svd->segp_slock); 7518 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7519 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_FILL_END, 7520 "segvn_pagelock: cache fill seg %p addr %p", seg, addr); 7521 return (0); 7522 } 7523 7524 mutex_exit(&svd->segp_slock); 7525 error = EFAULT; 7526 pplist = pl; 7527 np = ((uintptr_t)(a - addr)) >> PAGESHIFT; 7528 while (np > (uint_t)0) { 7529 page_unlock(*pplist); 7530 np--; 7531 pplist++; 7532 } 7533 kmem_free(pl, sizeof (page_t *) * npages); 7534 mutex_enter(&freemem_lock); 7535 svd->softlockcnt -= npages; 7536 availrmem += npages; 7537 segvn_pages_locked -= npages; 7538 mutex_exit(&freemem_lock); 7539 if (svd->softlockcnt <= 0) { 7540 if (AS_ISUNMAPWAIT(seg->s_as)) { 7541 mutex_enter(&seg->s_as->a_contents); 7542 if (AS_ISUNMAPWAIT(seg->s_as)) { 7543 AS_CLRUNMAPWAIT(seg->s_as); 7544 cv_broadcast(&seg->s_as->a_cv); 7545 } 7546 mutex_exit(&seg->s_as->a_contents); 7547 } 7548 } 7549 7550 out: 7551 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7552 *ppp = NULL; 7553 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, 7554 "segvn_pagelock: cache miss seg %p addr %p", seg, addr); 7555 return (error); 7556 } 7557 7558 /* 7559 * purge any cached pages in the I/O page cache 7560 */ 7561 static void 7562 segvn_purge(struct seg *seg) 7563 { 7564 seg_ppurge(seg); 7565 } 7566 7567 static int 7568 segvn_reclaim(struct seg *seg, caddr_t addr, size_t len, struct page **pplist, 7569 enum seg_rw rw) 7570 { 7571 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7572 pgcnt_t np, npages; 7573 struct page **pl; 7574 7575 #ifdef lint 7576 addr = addr; 7577 #endif 7578 7579 npages = np = (len >> PAGESHIFT); 7580 ASSERT(npages); 7581 pl = pplist; 7582 if (seg->s_szc != 0) { 7583 size_t pgsz = page_get_pagesize(seg->s_szc); 7584 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { 7585 panic("segvn_reclaim: unaligned addr or len"); 7586 /*NOTREACHED*/ 7587 } 7588 } 7589 7590 while (np > (uint_t)0) { 7591 if (rw == S_WRITE) { 7592 hat_setrefmod(*pplist); 7593 } else { 7594 hat_setref(*pplist); 7595 } 7596 page_unlock(*pplist); 7597 np--; 7598 pplist++; 7599 } 7600 kmem_free(pl, sizeof (page_t *) * npages); 7601 7602 mutex_enter(&freemem_lock); 7603 availrmem += npages; 7604 segvn_pages_locked -= npages; 7605 svd->softlockcnt -= npages; 7606 mutex_exit(&freemem_lock); 7607 if (svd->softlockcnt <= 0) { 7608 if (AS_ISUNMAPWAIT(seg->s_as)) { 7609 mutex_enter(&seg->s_as->a_contents); 7610 if (AS_ISUNMAPWAIT(seg->s_as)) { 7611 AS_CLRUNMAPWAIT(seg->s_as); 7612 cv_broadcast(&seg->s_as->a_cv); 7613 } 7614 mutex_exit(&seg->s_as->a_contents); 7615 } 7616 } 7617 return (0); 7618 } 7619 /* 7620 * get a memory ID for an addr in a given segment 7621 * 7622 * XXX only creates PAGESIZE pages if anon slots are not initialized. 7623 * At fault time they will be relocated into larger pages. 
7624 */ 7625 static int 7626 segvn_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp) 7627 { 7628 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7629 struct anon *ap = NULL; 7630 ulong_t anon_index; 7631 struct anon_map *amp; 7632 anon_sync_obj_t cookie; 7633 7634 if (svd->type == MAP_PRIVATE) { 7635 memidp->val[0] = (uintptr_t)seg->s_as; 7636 memidp->val[1] = (uintptr_t)addr; 7637 return (0); 7638 } 7639 7640 if (svd->type == MAP_SHARED) { 7641 if (svd->vp) { 7642 memidp->val[0] = (uintptr_t)svd->vp; 7643 memidp->val[1] = (u_longlong_t)svd->offset + 7644 (uintptr_t)(addr - seg->s_base); 7645 return (0); 7646 } else { 7647 7648 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7649 if ((amp = svd->amp) != NULL) { 7650 anon_index = svd->anon_index + 7651 seg_page(seg, addr); 7652 } 7653 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7654 7655 ASSERT(amp != NULL); 7656 7657 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7658 anon_array_enter(amp, anon_index, &cookie); 7659 ap = anon_get_ptr(amp->ahp, anon_index); 7660 if (ap == NULL) { 7661 page_t *pp; 7662 7663 pp = anon_zero(seg, addr, &ap, svd->cred); 7664 if (pp == NULL) { 7665 anon_array_exit(&cookie); 7666 ANON_LOCK_EXIT(&->a_rwlock); 7667 return (ENOMEM); 7668 } 7669 ASSERT(anon_get_ptr(amp->ahp, anon_index) 7670 == NULL); 7671 (void) anon_set_ptr(amp->ahp, anon_index, 7672 ap, ANON_SLEEP); 7673 page_unlock(pp); 7674 } 7675 7676 anon_array_exit(&cookie); 7677 ANON_LOCK_EXIT(&->a_rwlock); 7678 7679 memidp->val[0] = (uintptr_t)ap; 7680 memidp->val[1] = (uintptr_t)addr & PAGEOFFSET; 7681 return (0); 7682 } 7683 } 7684 return (EINVAL); 7685 } 7686 7687 static int 7688 sameprot(struct seg *seg, caddr_t a, size_t len) 7689 { 7690 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7691 struct vpage *vpage; 7692 spgcnt_t pages = btop(len); 7693 uint_t prot; 7694 7695 if (svd->pageprot == 0) 7696 return (1); 7697 7698 ASSERT(svd->vpage != NULL); 7699 7700 vpage = &svd->vpage[seg_page(seg, a)]; 7701 prot = VPP_PROT(vpage); 7702 vpage++; 7703 pages--; 7704 while (pages-- > 0) { 7705 if (prot != VPP_PROT(vpage)) 7706 return (0); 7707 vpage++; 7708 } 7709 return (1); 7710 } 7711 7712 /* 7713 * Get memory allocation policy info for specified address in given segment 7714 */ 7715 static lgrp_mem_policy_info_t * 7716 segvn_getpolicy(struct seg *seg, caddr_t addr) 7717 { 7718 struct anon_map *amp; 7719 ulong_t anon_index; 7720 lgrp_mem_policy_info_t *policy_info; 7721 struct segvn_data *svn_data; 7722 u_offset_t vn_off; 7723 vnode_t *vp; 7724 7725 ASSERT(seg != NULL); 7726 7727 svn_data = (struct segvn_data *)seg->s_data; 7728 if (svn_data == NULL) 7729 return (NULL); 7730 7731 /* 7732 * Get policy info for private or shared memory 7733 */ 7734 if (svn_data->type != MAP_SHARED) 7735 policy_info = &svn_data->policy_info; 7736 else { 7737 amp = svn_data->amp; 7738 anon_index = svn_data->anon_index + seg_page(seg, addr); 7739 vp = svn_data->vp; 7740 vn_off = svn_data->offset + (uintptr_t)(addr - seg->s_base); 7741 policy_info = lgrp_shm_policy_get(amp, anon_index, vp, vn_off); 7742 } 7743 7744 return (policy_info); 7745 } 7746