1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 37 */ 38 39 #pragma ident "%Z%%M% %I% %E% SMI" 40 41 /* 42 * VM - shared or copy-on-write from a vnode/anonymous memory. 43 */ 44 45 #include <sys/types.h> 46 #include <sys/param.h> 47 #include <sys/t_lock.h> 48 #include <sys/errno.h> 49 #include <sys/systm.h> 50 #include <sys/mman.h> 51 #include <sys/debug.h> 52 #include <sys/cred.h> 53 #include <sys/vmsystm.h> 54 #include <sys/tuneable.h> 55 #include <sys/bitmap.h> 56 #include <sys/swap.h> 57 #include <sys/kmem.h> 58 #include <sys/sysmacros.h> 59 #include <sys/vtrace.h> 60 #include <sys/cmn_err.h> 61 #include <sys/vm.h> 62 #include <sys/dumphdr.h> 63 #include <sys/lgrp.h> 64 65 #include <vm/hat.h> 66 #include <vm/as.h> 67 #include <vm/seg.h> 68 #include <vm/seg_vn.h> 69 #include <vm/pvn.h> 70 #include <vm/anon.h> 71 #include <vm/page.h> 72 #include <vm/vpage.h> 73 #include <sys/proc.h> 74 #include <sys/task.h> 75 #include <sys/project.h> 76 #include <sys/zone.h> 77 #include <sys/shm_impl.h> 78 /* 79 * Private seg op routines. 80 */ 81 static int segvn_dup(struct seg *seg, struct seg *newseg); 82 static int segvn_unmap(struct seg *seg, caddr_t addr, size_t len); 83 static void segvn_free(struct seg *seg); 84 static faultcode_t segvn_fault(struct hat *hat, struct seg *seg, 85 caddr_t addr, size_t len, enum fault_type type, 86 enum seg_rw rw); 87 static faultcode_t segvn_faulta(struct seg *seg, caddr_t addr); 88 static int segvn_setprot(struct seg *seg, caddr_t addr, 89 size_t len, uint_t prot); 90 static int segvn_checkprot(struct seg *seg, caddr_t addr, 91 size_t len, uint_t prot); 92 static int segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta); 93 static size_t segvn_swapout(struct seg *seg); 94 static int segvn_sync(struct seg *seg, caddr_t addr, size_t len, 95 int attr, uint_t flags); 96 static size_t segvn_incore(struct seg *seg, caddr_t addr, size_t len, 97 char *vec); 98 static int segvn_lockop(struct seg *seg, caddr_t addr, size_t len, 99 int attr, int op, ulong_t *lockmap, size_t pos); 100 static int segvn_getprot(struct seg *seg, caddr_t addr, size_t len, 101 uint_t *protv); 102 static u_offset_t segvn_getoffset(struct seg *seg, caddr_t addr); 103 static int segvn_gettype(struct seg *seg, caddr_t addr); 104 static int segvn_getvp(struct seg *seg, caddr_t addr, 105 struct vnode **vpp); 106 static int segvn_advise(struct seg *seg, caddr_t addr, size_t len, 107 uint_t behav); 108 static void segvn_dump(struct seg *seg); 109 static int segvn_pagelock(struct seg *seg, caddr_t addr, size_t len, 110 struct page ***ppp, enum lock_type type, enum seg_rw rw); 111 static int segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len, 112 uint_t szc); 113 static int segvn_getmemid(struct seg *seg, caddr_t addr, 114 memid_t *memidp); 115 static lgrp_mem_policy_info_t *segvn_getpolicy(struct seg *, caddr_t); 116 static int segvn_capable(struct seg *seg, segcapability_t capable); 117 118 struct seg_ops segvn_ops = { 119 segvn_dup, 120 segvn_unmap, 121 segvn_free, 122 segvn_fault, 123 segvn_faulta, 124 segvn_setprot, 125 segvn_checkprot, 126 segvn_kluster, 127 segvn_swapout, 128 segvn_sync, 129 segvn_incore, 130 segvn_lockop, 131 segvn_getprot, 132 segvn_getoffset, 133 segvn_gettype, 134 segvn_getvp, 135 segvn_advise, 136 segvn_dump, 137 segvn_pagelock, 138 segvn_setpagesize, 139 segvn_getmemid, 140 segvn_getpolicy, 141 segvn_capable, 142 }; 143 144 /* 145 * Common zfod structures, provided as a shorthand for others to use. 146 */ 147 static segvn_crargs_t zfod_segvn_crargs = 148 SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL); 149 static segvn_crargs_t kzfod_segvn_crargs = 150 SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_USER, 151 PROT_ALL & ~PROT_USER); 152 static segvn_crargs_t stack_noexec_crargs = 153 SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_EXEC, PROT_ALL); 154 155 caddr_t zfod_argsp = (caddr_t)&zfod_segvn_crargs; /* user zfod argsp */ 156 caddr_t kzfod_argsp = (caddr_t)&kzfod_segvn_crargs; /* kernel zfod argsp */ 157 caddr_t stack_exec_argsp = (caddr_t)&zfod_segvn_crargs; /* executable stack */ 158 caddr_t stack_noexec_argsp = (caddr_t)&stack_noexec_crargs; /* noexec stack */ 159 160 #define vpgtob(n) ((n) * sizeof (struct vpage)) /* For brevity */ 161 162 size_t segvn_comb_thrshld = UINT_MAX; /* patchable -- see 1196681 */ 163 164 static int segvn_concat(struct seg *, struct seg *, int); 165 static int segvn_extend_prev(struct seg *, struct seg *, 166 struct segvn_crargs *, size_t); 167 static int segvn_extend_next(struct seg *, struct seg *, 168 struct segvn_crargs *, size_t); 169 static void segvn_softunlock(struct seg *, caddr_t, size_t, enum seg_rw); 170 static void segvn_pagelist_rele(page_t **); 171 static void segvn_setvnode_mpss(vnode_t *); 172 static void segvn_relocate_pages(page_t **, page_t *); 173 static int segvn_full_szcpages(page_t **, uint_t, int *, uint_t *); 174 static int segvn_fill_vp_pages(struct segvn_data *, vnode_t *, u_offset_t, 175 uint_t, page_t **, page_t **, uint_t *, int *); 176 static faultcode_t segvn_fault_vnodepages(struct hat *, struct seg *, caddr_t, 177 caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int); 178 static faultcode_t segvn_fault_anonpages(struct hat *, struct seg *, caddr_t, 179 caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int); 180 static faultcode_t segvn_faultpage(struct hat *, struct seg *, caddr_t, 181 u_offset_t, struct vpage *, page_t **, uint_t, 182 enum fault_type, enum seg_rw, int, int); 183 static void segvn_vpage(struct seg *); 184 185 static void segvn_purge(struct seg *seg); 186 static int segvn_reclaim(struct seg *, caddr_t, size_t, struct page **, 187 enum seg_rw); 188 189 static int sameprot(struct seg *, caddr_t, size_t); 190 191 static int segvn_demote_range(struct seg *, caddr_t, size_t, int, uint_t); 192 static int segvn_clrszc(struct seg *); 193 static struct seg *segvn_split_seg(struct seg *, caddr_t); 194 static int segvn_claim_pages(struct seg *, struct vpage *, u_offset_t, 195 ulong_t, uint_t); 196 197 static int segvn_pp_lock_anonpages(page_t *, int); 198 static void segvn_pp_unlock_anonpages(page_t *, int); 199 200 static struct kmem_cache *segvn_cache; 201 202 #ifdef VM_STATS 203 static struct segvnvmstats_str { 204 ulong_t fill_vp_pages[31]; 205 ulong_t fltvnpages[49]; 206 ulong_t fullszcpages[10]; 207 ulong_t relocatepages[3]; 208 ulong_t fltanpages[17]; 209 ulong_t pagelock[3]; 210 ulong_t demoterange[3]; 211 } segvnvmstats; 212 #endif /* VM_STATS */ 213 214 #define SDR_RANGE 1 /* demote entire range */ 215 #define SDR_END 2 /* demote non aligned ends only */ 216 217 #define CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr) { \ 218 if ((len) != 0) { \ 219 lpgaddr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz); \ 220 ASSERT(lpgaddr >= (seg)->s_base); \ 221 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)((addr) + \ 222 (len)), pgsz); \ 223 ASSERT(lpgeaddr > lpgaddr); \ 224 ASSERT(lpgeaddr <= (seg)->s_base + (seg)->s_size); \ 225 } else { \ 226 lpgeaddr = lpgaddr = (addr); \ 227 } \ 228 } 229 230 /*ARGSUSED*/ 231 static int 232 segvn_cache_constructor(void *buf, void *cdrarg, int kmflags) 233 { 234 struct segvn_data *svd = buf; 235 236 rw_init(&svd->lock, NULL, RW_DEFAULT, NULL); 237 mutex_init(&svd->segp_slock, NULL, MUTEX_DEFAULT, NULL); 238 return (0); 239 } 240 241 /*ARGSUSED1*/ 242 static void 243 segvn_cache_destructor(void *buf, void *cdrarg) 244 { 245 struct segvn_data *svd = buf; 246 247 rw_destroy(&svd->lock); 248 mutex_destroy(&svd->segp_slock); 249 } 250 251 /* 252 * Patching this variable to non-zero allows the system to run with 253 * stacks marked as "not executable". It's a bit of a kludge, but is 254 * provided as a tweakable for platforms that export those ABIs 255 * (e.g. sparc V8) that have executable stacks enabled by default. 256 * There are also some restrictions for platforms that don't actually 257 * implement 'noexec' protections. 258 * 259 * Once enabled, the system is (therefore) unable to provide a fully 260 * ABI-compliant execution environment, though practically speaking, 261 * most everything works. The exceptions are generally some interpreters 262 * and debuggers that create executable code on the stack and jump 263 * into it (without explicitly mprotecting the address range to include 264 * PROT_EXEC). 265 * 266 * One important class of applications that are disabled are those 267 * that have been transformed into malicious agents using one of the 268 * numerous "buffer overflow" attacks. See 4007890. 269 */ 270 int noexec_user_stack = 0; 271 int noexec_user_stack_log = 1; 272 273 int segvn_lpg_disable = 0; 274 uint_t segvn_maxpgszc = 0; 275 276 ulong_t segvn_vmpss_clrszc_cnt; 277 ulong_t segvn_vmpss_clrszc_err; 278 ulong_t segvn_fltvnpages_clrszc_cnt; 279 ulong_t segvn_fltvnpages_clrszc_err; 280 ulong_t segvn_setpgsz_align_err; 281 ulong_t segvn_setpgsz_anon_align_err; 282 ulong_t segvn_setpgsz_getattr_err; 283 ulong_t segvn_setpgsz_eof_err; 284 ulong_t segvn_faultvnmpss_align_err1; 285 ulong_t segvn_faultvnmpss_align_err2; 286 ulong_t segvn_faultvnmpss_align_err3; 287 ulong_t segvn_faultvnmpss_align_err4; 288 ulong_t segvn_faultvnmpss_align_err5; 289 ulong_t segvn_vmpss_pageio_deadlk_err; 290 291 /* 292 * Initialize segvn data structures 293 */ 294 void 295 segvn_init(void) 296 { 297 uint_t maxszc; 298 uint_t szc; 299 size_t pgsz; 300 301 segvn_cache = kmem_cache_create("segvn_cache", 302 sizeof (struct segvn_data), 0, 303 segvn_cache_constructor, segvn_cache_destructor, NULL, 304 NULL, NULL, 0); 305 306 if (segvn_lpg_disable != 0) 307 return; 308 szc = maxszc = page_num_pagesizes() - 1; 309 if (szc == 0) { 310 segvn_lpg_disable = 1; 311 return; 312 } 313 if (page_get_pagesize(0) != PAGESIZE) { 314 panic("segvn_init: bad szc 0"); 315 /*NOTREACHED*/ 316 } 317 while (szc != 0) { 318 pgsz = page_get_pagesize(szc); 319 if (pgsz <= PAGESIZE || !IS_P2ALIGNED(pgsz, pgsz)) { 320 panic("segvn_init: bad szc %d", szc); 321 /*NOTREACHED*/ 322 } 323 szc--; 324 } 325 if (segvn_maxpgszc == 0 || segvn_maxpgszc > maxszc) 326 segvn_maxpgszc = maxszc; 327 } 328 329 #define SEGVN_PAGEIO ((void *)0x1) 330 #define SEGVN_NOPAGEIO ((void *)0x2) 331 332 static void 333 segvn_setvnode_mpss(vnode_t *vp) 334 { 335 int err; 336 337 ASSERT(vp->v_mpssdata == NULL || 338 vp->v_mpssdata == SEGVN_PAGEIO || 339 vp->v_mpssdata == SEGVN_NOPAGEIO); 340 341 if (vp->v_mpssdata == NULL) { 342 if (vn_vmpss_usepageio(vp)) { 343 err = VOP_PAGEIO(vp, (page_t *)NULL, 344 (u_offset_t)0, 0, 0, CRED()); 345 } else { 346 err = ENOSYS; 347 } 348 /* 349 * set v_mpssdata just once per vnode life 350 * so that it never changes. 351 */ 352 mutex_enter(&vp->v_lock); 353 if (vp->v_mpssdata == NULL) { 354 if (err == EINVAL) { 355 vp->v_mpssdata = SEGVN_PAGEIO; 356 } else { 357 vp->v_mpssdata = SEGVN_NOPAGEIO; 358 } 359 } 360 mutex_exit(&vp->v_lock); 361 } 362 } 363 364 int 365 segvn_create(struct seg *seg, void *argsp) 366 { 367 struct segvn_crargs *a = (struct segvn_crargs *)argsp; 368 struct segvn_data *svd; 369 size_t swresv = 0; 370 struct cred *cred; 371 struct anon_map *amp; 372 int error = 0; 373 size_t pgsz; 374 lgrp_mem_policy_t mpolicy = LGRP_MEM_POLICY_DEFAULT; 375 376 377 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 378 379 if (a->type != MAP_PRIVATE && a->type != MAP_SHARED) { 380 panic("segvn_create type"); 381 /*NOTREACHED*/ 382 } 383 384 /* 385 * Check arguments. If a shared anon structure is given then 386 * it is illegal to also specify a vp. 387 */ 388 if (a->amp != NULL && a->vp != NULL) { 389 panic("segvn_create anon_map"); 390 /*NOTREACHED*/ 391 } 392 393 /* MAP_NORESERVE on a MAP_SHARED segment is meaningless. */ 394 if (a->type == MAP_SHARED) 395 a->flags &= ~MAP_NORESERVE; 396 397 if (a->szc != 0) { 398 if (segvn_lpg_disable != 0 || (a->szc == AS_MAP_NO_LPOOB) || 399 (a->amp != NULL && a->type == MAP_PRIVATE) || 400 (a->flags & MAP_NORESERVE) || seg->s_as == &kas) { 401 a->szc = 0; 402 } else { 403 if (a->szc > segvn_maxpgszc) 404 a->szc = segvn_maxpgszc; 405 pgsz = page_get_pagesize(a->szc); 406 if (!IS_P2ALIGNED(seg->s_base, pgsz) || 407 !IS_P2ALIGNED(seg->s_size, pgsz)) { 408 a->szc = 0; 409 } else if (a->vp != NULL) { 410 extern struct vnode kvp; 411 if (IS_SWAPFSVP(a->vp) || a->vp == &kvp) { 412 /* 413 * paranoid check. 414 * hat_page_demote() is not supported 415 * on swapfs pages. 416 */ 417 a->szc = 0; 418 } else if (map_addr_vacalign_check(seg->s_base, 419 a->offset & PAGEMASK)) { 420 a->szc = 0; 421 } 422 } else if (a->amp != NULL) { 423 pgcnt_t anum = btopr(a->offset); 424 pgcnt_t pgcnt = page_get_pagecnt(a->szc); 425 if (!IS_P2ALIGNED(anum, pgcnt)) { 426 a->szc = 0; 427 } 428 } 429 } 430 } 431 432 /* 433 * If segment may need private pages, reserve them now. 434 */ 435 if (!(a->flags & MAP_NORESERVE) && ((a->vp == NULL && a->amp == NULL) || 436 (a->type == MAP_PRIVATE && (a->prot & PROT_WRITE)))) { 437 if (anon_resv(seg->s_size) == 0) 438 return (EAGAIN); 439 swresv = seg->s_size; 440 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 441 seg, swresv, 1); 442 } 443 444 /* 445 * Reserve any mapping structures that may be required. 446 */ 447 hat_map(seg->s_as->a_hat, seg->s_base, seg->s_size, HAT_MAP); 448 449 if (a->cred) { 450 cred = a->cred; 451 crhold(cred); 452 } else { 453 crhold(cred = CRED()); 454 } 455 456 /* Inform the vnode of the new mapping */ 457 if (a->vp) { 458 error = VOP_ADDMAP(a->vp, a->offset & PAGEMASK, 459 seg->s_as, seg->s_base, seg->s_size, a->prot, 460 a->maxprot, a->type, cred); 461 if (error) { 462 if (swresv != 0) { 463 anon_unresv(swresv); 464 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 465 "anon proc:%p %lu %u", 466 seg, swresv, 0); 467 } 468 crfree(cred); 469 hat_unload(seg->s_as->a_hat, seg->s_base, 470 seg->s_size, HAT_UNLOAD_UNMAP); 471 return (error); 472 } 473 } 474 475 /* 476 * If more than one segment in the address space, and 477 * they're adjacent virtually, try to concatenate them. 478 * Don't concatenate if an explicit anon_map structure 479 * was supplied (e.g., SystemV shared memory). 480 */ 481 if (a->amp == NULL) { 482 struct seg *pseg, *nseg; 483 struct segvn_data *psvd, *nsvd; 484 lgrp_mem_policy_t ppolicy, npolicy; 485 uint_t lgrp_mem_policy_flags = 0; 486 extern lgrp_mem_policy_t lgrp_mem_default_policy; 487 488 /* 489 * Memory policy flags (lgrp_mem_policy_flags) is valid when 490 * extending stack/heap segments. 491 */ 492 if ((a->vp == NULL) && (a->type == MAP_PRIVATE) && 493 !(a->flags & MAP_NORESERVE) && (seg->s_as != &kas)) { 494 lgrp_mem_policy_flags = a->lgrp_mem_policy_flags; 495 } else { 496 /* 497 * Get policy when not extending it from another segment 498 */ 499 mpolicy = lgrp_mem_policy_default(seg->s_size, a->type); 500 } 501 502 /* 503 * First, try to concatenate the previous and new segments 504 */ 505 pseg = AS_SEGPREV(seg->s_as, seg); 506 if (pseg != NULL && 507 pseg->s_base + pseg->s_size == seg->s_base && 508 pseg->s_ops == &segvn_ops) { 509 /* 510 * Get memory allocation policy from previous segment. 511 * When extension is specified (e.g. for heap) apply 512 * this policy to the new segment regardless of the 513 * outcome of segment concatenation. Extension occurs 514 * for non-default policy otherwise default policy is 515 * used and is based on extended segment size. 516 */ 517 psvd = (struct segvn_data *)pseg->s_data; 518 ppolicy = psvd->policy_info.mem_policy; 519 if (lgrp_mem_policy_flags == 520 LGRP_MP_FLAG_EXTEND_UP) { 521 if (ppolicy != lgrp_mem_default_policy) { 522 mpolicy = ppolicy; 523 } else { 524 mpolicy = lgrp_mem_policy_default( 525 pseg->s_size + seg->s_size, 526 a->type); 527 } 528 } 529 530 if (mpolicy == ppolicy && 531 (pseg->s_size + seg->s_size <= 532 segvn_comb_thrshld || psvd->amp == NULL) && 533 segvn_extend_prev(pseg, seg, a, swresv) == 0) { 534 /* 535 * success! now try to concatenate 536 * with following seg 537 */ 538 crfree(cred); 539 nseg = AS_SEGNEXT(pseg->s_as, pseg); 540 if (nseg != NULL && 541 nseg != pseg && 542 nseg->s_ops == &segvn_ops && 543 pseg->s_base + pseg->s_size == 544 nseg->s_base) 545 (void) segvn_concat(pseg, nseg, 0); 546 ASSERT(pseg->s_szc == 0 || 547 (a->szc == pseg->s_szc && 548 IS_P2ALIGNED(pseg->s_base, pgsz) && 549 IS_P2ALIGNED(pseg->s_size, pgsz))); 550 return (0); 551 } 552 } 553 554 /* 555 * Failed, so try to concatenate with following seg 556 */ 557 nseg = AS_SEGNEXT(seg->s_as, seg); 558 if (nseg != NULL && 559 seg->s_base + seg->s_size == nseg->s_base && 560 nseg->s_ops == &segvn_ops) { 561 /* 562 * Get memory allocation policy from next segment. 563 * When extension is specified (e.g. for stack) apply 564 * this policy to the new segment regardless of the 565 * outcome of segment concatenation. Extension occurs 566 * for non-default policy otherwise default policy is 567 * used and is based on extended segment size. 568 */ 569 nsvd = (struct segvn_data *)nseg->s_data; 570 npolicy = nsvd->policy_info.mem_policy; 571 if (lgrp_mem_policy_flags == 572 LGRP_MP_FLAG_EXTEND_DOWN) { 573 if (npolicy != lgrp_mem_default_policy) { 574 mpolicy = npolicy; 575 } else { 576 mpolicy = lgrp_mem_policy_default( 577 nseg->s_size + seg->s_size, 578 a->type); 579 } 580 } 581 582 if (mpolicy == npolicy && 583 segvn_extend_next(seg, nseg, a, swresv) == 0) { 584 crfree(cred); 585 ASSERT(nseg->s_szc == 0 || 586 (a->szc == nseg->s_szc && 587 IS_P2ALIGNED(nseg->s_base, pgsz) && 588 IS_P2ALIGNED(nseg->s_size, pgsz))); 589 return (0); 590 } 591 } 592 } 593 594 if (a->vp != NULL) { 595 VN_HOLD(a->vp); 596 if (a->type == MAP_SHARED) 597 lgrp_shm_policy_init(NULL, a->vp); 598 } 599 svd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 600 601 seg->s_ops = &segvn_ops; 602 seg->s_data = (void *)svd; 603 seg->s_szc = a->szc; 604 605 svd->vp = a->vp; 606 /* 607 * Anonymous mappings have no backing file so the offset is meaningless. 608 */ 609 svd->offset = a->vp ? (a->offset & PAGEMASK) : 0; 610 svd->prot = a->prot; 611 svd->maxprot = a->maxprot; 612 svd->pageprot = 0; 613 svd->type = a->type; 614 svd->vpage = NULL; 615 svd->cred = cred; 616 svd->advice = MADV_NORMAL; 617 svd->pageadvice = 0; 618 svd->flags = (ushort_t)a->flags; 619 svd->softlockcnt = 0; 620 if (a->szc != 0 && a->vp != NULL) { 621 segvn_setvnode_mpss(a->vp); 622 } 623 624 amp = a->amp; 625 if ((svd->amp = amp) == NULL) { 626 svd->anon_index = 0; 627 if (svd->type == MAP_SHARED) { 628 svd->swresv = 0; 629 /* 630 * Shared mappings to a vp need no other setup. 631 * If we have a shared mapping to an anon_map object 632 * which hasn't been allocated yet, allocate the 633 * struct now so that it will be properly shared 634 * by remembering the swap reservation there. 635 */ 636 if (a->vp == NULL) { 637 svd->amp = anonmap_alloc(seg->s_size, swresv); 638 svd->amp->a_szc = seg->s_szc; 639 } 640 } else { 641 /* 642 * Private mapping (with or without a vp). 643 * Allocate anon_map when needed. 644 */ 645 svd->swresv = swresv; 646 } 647 } else { 648 pgcnt_t anon_num; 649 650 /* 651 * Mapping to an existing anon_map structure without a vp. 652 * For now we will insure that the segment size isn't larger 653 * than the size - offset gives us. Later on we may wish to 654 * have the anon array dynamically allocated itself so that 655 * we don't always have to allocate all the anon pointer slots. 656 * This of course involves adding extra code to check that we 657 * aren't trying to use an anon pointer slot beyond the end 658 * of the currently allocated anon array. 659 */ 660 if ((amp->size - a->offset) < seg->s_size) { 661 panic("segvn_create anon_map size"); 662 /*NOTREACHED*/ 663 } 664 665 anon_num = btopr(a->offset); 666 667 if (a->type == MAP_SHARED) { 668 /* 669 * SHARED mapping to a given anon_map. 670 */ 671 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 672 amp->refcnt++; 673 if (a->szc > amp->a_szc) { 674 amp->a_szc = a->szc; 675 } 676 ANON_LOCK_EXIT(&->a_rwlock); 677 svd->anon_index = anon_num; 678 svd->swresv = 0; 679 } else { 680 /* 681 * PRIVATE mapping to a given anon_map. 682 * Make sure that all the needed anon 683 * structures are created (so that we will 684 * share the underlying pages if nothing 685 * is written by this mapping) and then 686 * duplicate the anon array as is done 687 * when a privately mapped segment is dup'ed. 688 */ 689 struct anon *ap; 690 caddr_t addr; 691 caddr_t eaddr; 692 ulong_t anon_idx; 693 int hat_flag = HAT_LOAD; 694 695 if (svd->flags & MAP_TEXT) { 696 hat_flag |= HAT_LOAD_TEXT; 697 } 698 699 svd->amp = anonmap_alloc(seg->s_size, 0); 700 svd->amp->a_szc = seg->s_szc; 701 svd->anon_index = 0; 702 svd->swresv = swresv; 703 704 /* 705 * Prevent 2 threads from allocating anon 706 * slots simultaneously. 707 */ 708 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 709 eaddr = seg->s_base + seg->s_size; 710 711 for (anon_idx = anon_num, addr = seg->s_base; 712 addr < eaddr; addr += PAGESIZE, anon_idx++) { 713 page_t *pp; 714 715 if ((ap = anon_get_ptr(amp->ahp, 716 anon_idx)) != NULL) 717 continue; 718 719 /* 720 * Allocate the anon struct now. 721 * Might as well load up translation 722 * to the page while we're at it... 723 */ 724 pp = anon_zero(seg, addr, &ap, cred); 725 if (ap == NULL || pp == NULL) { 726 panic("segvn_create anon_zero"); 727 /*NOTREACHED*/ 728 } 729 730 /* 731 * Re-acquire the anon_map lock and 732 * initialize the anon array entry. 733 */ 734 ASSERT(anon_get_ptr(amp->ahp, 735 anon_idx) == NULL); 736 (void) anon_set_ptr(amp->ahp, anon_idx, ap, 737 ANON_SLEEP); 738 739 ASSERT(seg->s_szc == 0); 740 ASSERT(!IS_VMODSORT(pp->p_vnode)); 741 742 hat_memload(seg->s_as->a_hat, addr, pp, 743 svd->prot & ~PROT_WRITE, hat_flag); 744 745 page_unlock(pp); 746 } 747 ASSERT(seg->s_szc == 0); 748 anon_dup(amp->ahp, anon_num, svd->amp->ahp, 749 0, seg->s_size); 750 ANON_LOCK_EXIT(&->a_rwlock); 751 } 752 } 753 754 /* 755 * Set default memory allocation policy for segment 756 * 757 * Always set policy for private memory at least for initialization 758 * even if this is a shared memory segment 759 */ 760 (void) lgrp_privm_policy_set(mpolicy, &svd->policy_info, seg->s_size); 761 762 if (svd->type == MAP_SHARED) 763 (void) lgrp_shm_policy_set(mpolicy, svd->amp, svd->anon_index, 764 svd->vp, svd->offset, seg->s_size); 765 766 return (0); 767 } 768 769 /* 770 * Concatenate two existing segments, if possible. 771 * Return 0 on success, -1 if two segments are not compatible 772 * or -2 on memory allocation failure. 773 * If amp_cat == 1 then try and concat segments with anon maps 774 */ 775 static int 776 segvn_concat(struct seg *seg1, struct seg *seg2, int amp_cat) 777 { 778 struct segvn_data *svd1 = seg1->s_data; 779 struct segvn_data *svd2 = seg2->s_data; 780 struct anon_map *amp1 = svd1->amp; 781 struct anon_map *amp2 = svd2->amp; 782 struct vpage *vpage1 = svd1->vpage; 783 struct vpage *vpage2 = svd2->vpage, *nvpage = NULL; 784 size_t size, nvpsize; 785 pgcnt_t npages1, npages2; 786 787 ASSERT(seg1->s_as && seg2->s_as && seg1->s_as == seg2->s_as); 788 ASSERT(AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock)); 789 ASSERT(seg1->s_ops == seg2->s_ops); 790 791 /* both segments exist, try to merge them */ 792 #define incompat(x) (svd1->x != svd2->x) 793 if (incompat(vp) || incompat(maxprot) || 794 (!svd1->pageadvice && !svd2->pageadvice && incompat(advice)) || 795 (!svd1->pageprot && !svd2->pageprot && incompat(prot)) || 796 incompat(type) || incompat(cred) || incompat(flags) || 797 seg1->s_szc != seg2->s_szc || incompat(policy_info.mem_policy) || 798 (svd2->softlockcnt > 0)) 799 return (-1); 800 #undef incompat 801 802 /* 803 * vp == NULL implies zfod, offset doesn't matter 804 */ 805 if (svd1->vp != NULL && 806 svd1->offset + seg1->s_size != svd2->offset) { 807 return (-1); 808 } 809 810 /* 811 * Fail early if we're not supposed to concatenate 812 * segments with non NULL amp. 813 */ 814 if (amp_cat == 0 && (amp1 != NULL || amp2 != NULL)) { 815 return (-1); 816 } 817 818 if (svd1->vp == NULL && svd1->type == MAP_SHARED) { 819 if (amp1 != amp2) { 820 return (-1); 821 } 822 if (amp1 != NULL && svd1->anon_index + btop(seg1->s_size) != 823 svd2->anon_index) { 824 return (-1); 825 } 826 ASSERT(amp1 == NULL || amp1->refcnt >= 2); 827 } 828 829 /* 830 * If either seg has vpages, create a new merged vpage array. 831 */ 832 if (vpage1 != NULL || vpage2 != NULL) { 833 struct vpage *vp; 834 835 npages1 = seg_pages(seg1); 836 npages2 = seg_pages(seg2); 837 nvpsize = vpgtob(npages1 + npages2); 838 839 if ((nvpage = kmem_zalloc(nvpsize, KM_NOSLEEP)) == NULL) { 840 return (-2); 841 } 842 if (vpage1 != NULL) { 843 bcopy(vpage1, nvpage, vpgtob(npages1)); 844 } 845 if (vpage2 != NULL) { 846 bcopy(vpage2, nvpage + npages1, vpgtob(npages2)); 847 } 848 for (vp = nvpage; vp < nvpage + npages1; vp++) { 849 if (svd2->pageprot && !svd1->pageprot) { 850 VPP_SETPROT(vp, svd1->prot); 851 } 852 if (svd2->pageadvice && !svd1->pageadvice) { 853 VPP_SETADVICE(vp, svd1->advice); 854 } 855 } 856 for (vp = nvpage + npages1; 857 vp < nvpage + npages1 + npages2; vp++) { 858 if (svd1->pageprot && !svd2->pageprot) { 859 VPP_SETPROT(vp, svd2->prot); 860 } 861 if (svd1->pageadvice && !svd2->pageadvice) { 862 VPP_SETADVICE(vp, svd2->advice); 863 } 864 } 865 } 866 867 /* 868 * If either segment has private pages, create a new merged anon 869 * array. If mergeing shared anon segments just decrement anon map's 870 * refcnt. 871 */ 872 if (amp1 != NULL && svd1->type == MAP_SHARED) { 873 ASSERT(amp1 == amp2 && svd1->vp == NULL); 874 ANON_LOCK_ENTER(&1->a_rwlock, RW_WRITER); 875 ASSERT(amp1->refcnt >= 2); 876 amp1->refcnt--; 877 ANON_LOCK_EXIT(&1->a_rwlock); 878 svd2->amp = NULL; 879 } else if (amp1 != NULL || amp2 != NULL) { 880 struct anon_hdr *nahp; 881 struct anon_map *namp = NULL; 882 size_t asize; 883 884 ASSERT(svd1->type == MAP_PRIVATE); 885 886 asize = seg1->s_size + seg2->s_size; 887 if ((nahp = anon_create(btop(asize), ANON_NOSLEEP)) == NULL) { 888 if (nvpage != NULL) { 889 kmem_free(nvpage, nvpsize); 890 } 891 return (-2); 892 } 893 if (amp1 != NULL) { 894 /* 895 * XXX anon rwlock is not really needed because 896 * this is a private segment and we are writers. 897 */ 898 ANON_LOCK_ENTER(&1->a_rwlock, RW_WRITER); 899 ASSERT(amp1->refcnt == 1); 900 if (anon_copy_ptr(amp1->ahp, svd1->anon_index, 901 nahp, 0, btop(seg1->s_size), ANON_NOSLEEP)) { 902 anon_release(nahp, btop(asize)); 903 ANON_LOCK_EXIT(&1->a_rwlock); 904 if (nvpage != NULL) { 905 kmem_free(nvpage, nvpsize); 906 } 907 return (-2); 908 } 909 } 910 if (amp2 != NULL) { 911 ANON_LOCK_ENTER(&2->a_rwlock, RW_WRITER); 912 ASSERT(amp2->refcnt == 1); 913 if (anon_copy_ptr(amp2->ahp, svd2->anon_index, 914 nahp, btop(seg1->s_size), btop(seg2->s_size), 915 ANON_NOSLEEP)) { 916 anon_release(nahp, btop(asize)); 917 ANON_LOCK_EXIT(&2->a_rwlock); 918 if (amp1 != NULL) { 919 ANON_LOCK_EXIT(&1->a_rwlock); 920 } 921 if (nvpage != NULL) { 922 kmem_free(nvpage, nvpsize); 923 } 924 return (-2); 925 } 926 } 927 if (amp1 != NULL) { 928 namp = amp1; 929 anon_release(amp1->ahp, btop(amp1->size)); 930 } 931 if (amp2 != NULL) { 932 if (namp == NULL) { 933 ASSERT(amp1 == NULL); 934 namp = amp2; 935 anon_release(amp2->ahp, btop(amp2->size)); 936 } else { 937 amp2->refcnt--; 938 ANON_LOCK_EXIT(&2->a_rwlock); 939 anonmap_free(amp2); 940 } 941 svd2->amp = NULL; /* needed for seg_free */ 942 } 943 namp->ahp = nahp; 944 namp->size = asize; 945 svd1->amp = namp; 946 svd1->anon_index = 0; 947 ANON_LOCK_EXIT(&namp->a_rwlock); 948 } 949 /* 950 * Now free the old vpage structures. 951 */ 952 if (nvpage != NULL) { 953 if (vpage1 != NULL) { 954 kmem_free(vpage1, vpgtob(npages1)); 955 } 956 if (vpage2 != NULL) { 957 svd2->vpage = NULL; 958 kmem_free(vpage2, vpgtob(npages2)); 959 } 960 if (svd2->pageprot) { 961 svd1->pageprot = 1; 962 } 963 if (svd2->pageadvice) { 964 svd1->pageadvice = 1; 965 } 966 svd1->vpage = nvpage; 967 } 968 969 /* all looks ok, merge segments */ 970 svd1->swresv += svd2->swresv; 971 svd2->swresv = 0; /* so seg_free doesn't release swap space */ 972 size = seg2->s_size; 973 seg_free(seg2); 974 seg1->s_size += size; 975 return (0); 976 } 977 978 /* 979 * Extend the previous segment (seg1) to include the 980 * new segment (seg2 + a), if possible. 981 * Return 0 on success. 982 */ 983 static int 984 segvn_extend_prev(seg1, seg2, a, swresv) 985 struct seg *seg1, *seg2; 986 struct segvn_crargs *a; 987 size_t swresv; 988 { 989 struct segvn_data *svd1 = (struct segvn_data *)seg1->s_data; 990 size_t size; 991 struct anon_map *amp1; 992 struct vpage *new_vpage; 993 994 /* 995 * We don't need any segment level locks for "segvn" data 996 * since the address space is "write" locked. 997 */ 998 ASSERT(seg1->s_as && AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock)); 999 1000 /* second segment is new, try to extend first */ 1001 /* XXX - should also check cred */ 1002 if (svd1->vp != a->vp || svd1->maxprot != a->maxprot || 1003 (!svd1->pageprot && (svd1->prot != a->prot)) || 1004 svd1->type != a->type || svd1->flags != a->flags || 1005 seg1->s_szc != a->szc) 1006 return (-1); 1007 1008 /* vp == NULL implies zfod, offset doesn't matter */ 1009 if (svd1->vp != NULL && 1010 svd1->offset + seg1->s_size != (a->offset & PAGEMASK)) 1011 return (-1); 1012 1013 amp1 = svd1->amp; 1014 if (amp1) { 1015 pgcnt_t newpgs; 1016 1017 /* 1018 * Segment has private pages, can data structures 1019 * be expanded? 1020 * 1021 * Acquire the anon_map lock to prevent it from changing, 1022 * if it is shared. This ensures that the anon_map 1023 * will not change while a thread which has a read/write 1024 * lock on an address space references it. 1025 * XXX - Don't need the anon_map lock at all if "refcnt" 1026 * is 1. 1027 * 1028 * Can't grow a MAP_SHARED segment with an anonmap because 1029 * there may be existing anon slots where we want to extend 1030 * the segment and we wouldn't know what to do with them 1031 * (e.g., for tmpfs right thing is to just leave them there, 1032 * for /dev/zero they should be cleared out). 1033 */ 1034 if (svd1->type == MAP_SHARED) 1035 return (-1); 1036 1037 ANON_LOCK_ENTER(&1->a_rwlock, RW_WRITER); 1038 if (amp1->refcnt > 1) { 1039 ANON_LOCK_EXIT(&1->a_rwlock); 1040 return (-1); 1041 } 1042 newpgs = anon_grow(amp1->ahp, &svd1->anon_index, 1043 btop(seg1->s_size), btop(seg2->s_size), ANON_NOSLEEP); 1044 1045 if (newpgs == 0) { 1046 ANON_LOCK_EXIT(&1->a_rwlock); 1047 return (-1); 1048 } 1049 amp1->size = ptob(newpgs); 1050 ANON_LOCK_EXIT(&1->a_rwlock); 1051 } 1052 if (svd1->vpage != NULL) { 1053 new_vpage = 1054 kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)), 1055 KM_NOSLEEP); 1056 if (new_vpage == NULL) 1057 return (-1); 1058 bcopy(svd1->vpage, new_vpage, vpgtob(seg_pages(seg1))); 1059 kmem_free(svd1->vpage, vpgtob(seg_pages(seg1))); 1060 svd1->vpage = new_vpage; 1061 if (svd1->pageprot) { 1062 struct vpage *vp, *evp; 1063 1064 vp = new_vpage + seg_pages(seg1); 1065 evp = vp + seg_pages(seg2); 1066 for (; vp < evp; vp++) 1067 VPP_SETPROT(vp, a->prot); 1068 } 1069 } 1070 size = seg2->s_size; 1071 seg_free(seg2); 1072 seg1->s_size += size; 1073 svd1->swresv += swresv; 1074 return (0); 1075 } 1076 1077 /* 1078 * Extend the next segment (seg2) to include the 1079 * new segment (seg1 + a), if possible. 1080 * Return 0 on success. 1081 */ 1082 static int 1083 segvn_extend_next( 1084 struct seg *seg1, 1085 struct seg *seg2, 1086 struct segvn_crargs *a, 1087 size_t swresv) 1088 { 1089 struct segvn_data *svd2 = (struct segvn_data *)seg2->s_data; 1090 size_t size; 1091 struct anon_map *amp2; 1092 struct vpage *new_vpage; 1093 1094 /* 1095 * We don't need any segment level locks for "segvn" data 1096 * since the address space is "write" locked. 1097 */ 1098 ASSERT(seg2->s_as && AS_WRITE_HELD(seg2->s_as, &seg2->s_as->a_lock)); 1099 1100 /* first segment is new, try to extend second */ 1101 /* XXX - should also check cred */ 1102 if (svd2->vp != a->vp || svd2->maxprot != a->maxprot || 1103 (!svd2->pageprot && (svd2->prot != a->prot)) || 1104 svd2->type != a->type || svd2->flags != a->flags || 1105 seg2->s_szc != a->szc) 1106 return (-1); 1107 /* vp == NULL implies zfod, offset doesn't matter */ 1108 if (svd2->vp != NULL && 1109 (a->offset & PAGEMASK) + seg1->s_size != svd2->offset) 1110 return (-1); 1111 1112 amp2 = svd2->amp; 1113 if (amp2) { 1114 pgcnt_t newpgs; 1115 1116 /* 1117 * Segment has private pages, can data structures 1118 * be expanded? 1119 * 1120 * Acquire the anon_map lock to prevent it from changing, 1121 * if it is shared. This ensures that the anon_map 1122 * will not change while a thread which has a read/write 1123 * lock on an address space references it. 1124 * 1125 * XXX - Don't need the anon_map lock at all if "refcnt" 1126 * is 1. 1127 */ 1128 if (svd2->type == MAP_SHARED) 1129 return (-1); 1130 1131 ANON_LOCK_ENTER(&2->a_rwlock, RW_WRITER); 1132 if (amp2->refcnt > 1) { 1133 ANON_LOCK_EXIT(&2->a_rwlock); 1134 return (-1); 1135 } 1136 newpgs = anon_grow(amp2->ahp, &svd2->anon_index, 1137 btop(seg2->s_size), btop(seg1->s_size), 1138 ANON_NOSLEEP | ANON_GROWDOWN); 1139 1140 if (newpgs == 0) { 1141 ANON_LOCK_EXIT(&2->a_rwlock); 1142 return (-1); 1143 } 1144 amp2->size = ptob(newpgs); 1145 ANON_LOCK_EXIT(&2->a_rwlock); 1146 } 1147 if (svd2->vpage != NULL) { 1148 new_vpage = 1149 kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)), 1150 KM_NOSLEEP); 1151 if (new_vpage == NULL) { 1152 /* Not merging segments so adjust anon_index back */ 1153 if (amp2) 1154 svd2->anon_index += seg_pages(seg1); 1155 return (-1); 1156 } 1157 bcopy(svd2->vpage, new_vpage + seg_pages(seg1), 1158 vpgtob(seg_pages(seg2))); 1159 kmem_free(svd2->vpage, vpgtob(seg_pages(seg2))); 1160 svd2->vpage = new_vpage; 1161 if (svd2->pageprot) { 1162 struct vpage *vp, *evp; 1163 1164 vp = new_vpage; 1165 evp = vp + seg_pages(seg1); 1166 for (; vp < evp; vp++) 1167 VPP_SETPROT(vp, a->prot); 1168 } 1169 } 1170 size = seg1->s_size; 1171 seg_free(seg1); 1172 seg2->s_size += size; 1173 seg2->s_base -= size; 1174 svd2->offset -= size; 1175 svd2->swresv += swresv; 1176 return (0); 1177 } 1178 1179 static int 1180 segvn_dup(struct seg *seg, struct seg *newseg) 1181 { 1182 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1183 struct segvn_data *newsvd; 1184 pgcnt_t npages = seg_pages(seg); 1185 int error = 0; 1186 uint_t prot; 1187 size_t len; 1188 1189 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 1190 1191 /* 1192 * If segment has anon reserved, reserve more for the new seg. 1193 * For a MAP_NORESERVE segment swresv will be a count of all the 1194 * allocated anon slots; thus we reserve for the child as many slots 1195 * as the parent has allocated. This semantic prevents the child or 1196 * parent from dieing during a copy-on-write fault caused by trying 1197 * to write a shared pre-existing anon page. 1198 */ 1199 if ((len = svd->swresv) != 0) { 1200 if (anon_resv(svd->swresv) == 0) 1201 return (ENOMEM); 1202 1203 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 1204 seg, len, 0); 1205 } 1206 1207 newsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 1208 1209 newseg->s_ops = &segvn_ops; 1210 newseg->s_data = (void *)newsvd; 1211 newseg->s_szc = seg->s_szc; 1212 1213 if ((newsvd->vp = svd->vp) != NULL) { 1214 VN_HOLD(svd->vp); 1215 if (svd->type == MAP_SHARED) 1216 lgrp_shm_policy_init(NULL, svd->vp); 1217 } 1218 newsvd->offset = svd->offset; 1219 newsvd->prot = svd->prot; 1220 newsvd->maxprot = svd->maxprot; 1221 newsvd->pageprot = svd->pageprot; 1222 newsvd->type = svd->type; 1223 newsvd->cred = svd->cred; 1224 crhold(newsvd->cred); 1225 newsvd->advice = svd->advice; 1226 newsvd->pageadvice = svd->pageadvice; 1227 newsvd->swresv = svd->swresv; 1228 newsvd->flags = svd->flags; 1229 newsvd->softlockcnt = 0; 1230 newsvd->policy_info = svd->policy_info; 1231 if ((newsvd->amp = svd->amp) == NULL) { 1232 /* 1233 * Not attaching to a shared anon object. 1234 */ 1235 newsvd->anon_index = 0; 1236 } else { 1237 struct anon_map *amp; 1238 1239 amp = svd->amp; 1240 if (svd->type == MAP_SHARED) { 1241 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1242 amp->refcnt++; 1243 ANON_LOCK_EXIT(&->a_rwlock); 1244 newsvd->anon_index = svd->anon_index; 1245 } else { 1246 int reclaim = 1; 1247 1248 /* 1249 * Allocate and initialize new anon_map structure. 1250 */ 1251 newsvd->amp = anonmap_alloc(newseg->s_size, 0); 1252 newsvd->amp->a_szc = newseg->s_szc; 1253 newsvd->anon_index = 0; 1254 1255 /* 1256 * We don't have to acquire the anon_map lock 1257 * for the new segment (since it belongs to an 1258 * address space that is still not associated 1259 * with any process), or the segment in the old 1260 * address space (since all threads in it 1261 * are stopped while duplicating the address space). 1262 */ 1263 1264 /* 1265 * The goal of the following code is to make sure that 1266 * softlocked pages do not end up as copy on write 1267 * pages. This would cause problems where one 1268 * thread writes to a page that is COW and a different 1269 * thread in the same process has softlocked it. The 1270 * softlock lock would move away from this process 1271 * because the write would cause this process to get 1272 * a copy (without the softlock). 1273 * 1274 * The strategy here is to just break the 1275 * sharing on pages that could possibly be 1276 * softlocked. 1277 */ 1278 retry: 1279 if (svd->softlockcnt) { 1280 struct anon *ap, *newap; 1281 size_t i; 1282 uint_t vpprot; 1283 page_t *anon_pl[1+1], *pp; 1284 caddr_t addr; 1285 ulong_t anon_idx = 0; 1286 1287 /* 1288 * The softlock count might be non zero 1289 * because some pages are still stuck in the 1290 * cache for lazy reclaim. Flush the cache 1291 * now. This should drop the count to zero. 1292 * [or there is really I/O going on to these 1293 * pages]. Note, we have the writers lock so 1294 * nothing gets inserted during the flush. 1295 */ 1296 if (reclaim == 1) { 1297 segvn_purge(seg); 1298 reclaim = 0; 1299 goto retry; 1300 } 1301 i = btopr(seg->s_size); 1302 addr = seg->s_base; 1303 /* 1304 * XXX break cow sharing using PAGESIZE 1305 * pages. They will be relocated into larger 1306 * pages at fault time. 1307 */ 1308 while (i-- > 0) { 1309 if (ap = anon_get_ptr(amp->ahp, 1310 anon_idx)) { 1311 error = anon_getpage(&ap, 1312 &vpprot, anon_pl, PAGESIZE, 1313 seg, addr, S_READ, 1314 svd->cred); 1315 if (error) { 1316 newsvd->vpage = NULL; 1317 goto out; 1318 } 1319 /* 1320 * prot need not be computed 1321 * below 'cause anon_private is 1322 * going to ignore it anyway 1323 * as child doesn't inherit 1324 * pagelock from parent. 1325 */ 1326 prot = svd->pageprot ? 1327 VPP_PROT( 1328 &svd->vpage[ 1329 seg_page(seg, addr)]) 1330 : svd->prot; 1331 pp = anon_private(&newap, 1332 newseg, addr, prot, 1333 anon_pl[0], 0, 1334 newsvd->cred); 1335 if (pp == NULL) { 1336 /* no mem abort */ 1337 newsvd->vpage = NULL; 1338 error = ENOMEM; 1339 goto out; 1340 } 1341 (void) anon_set_ptr( 1342 newsvd->amp->ahp, anon_idx, 1343 newap, ANON_SLEEP); 1344 page_unlock(pp); 1345 } 1346 addr += PAGESIZE; 1347 anon_idx++; 1348 } 1349 } else { /* common case */ 1350 if (seg->s_szc != 0) { 1351 /* 1352 * If at least one of anon slots of a 1353 * large page exists then make sure 1354 * all anon slots of a large page 1355 * exist to avoid partial cow sharing 1356 * of a large page in the future. 1357 */ 1358 anon_dup_fill_holes(amp->ahp, 1359 svd->anon_index, newsvd->amp->ahp, 1360 0, seg->s_size, seg->s_szc, 1361 svd->vp != NULL); 1362 } else { 1363 anon_dup(amp->ahp, svd->anon_index, 1364 newsvd->amp->ahp, 0, seg->s_size); 1365 } 1366 1367 hat_clrattr(seg->s_as->a_hat, seg->s_base, 1368 seg->s_size, PROT_WRITE); 1369 } 1370 } 1371 } 1372 /* 1373 * If necessary, create a vpage structure for the new segment. 1374 * Do not copy any page lock indications. 1375 */ 1376 if (svd->vpage != NULL) { 1377 uint_t i; 1378 struct vpage *ovp = svd->vpage; 1379 struct vpage *nvp; 1380 1381 nvp = newsvd->vpage = 1382 kmem_alloc(vpgtob(npages), KM_SLEEP); 1383 for (i = 0; i < npages; i++) { 1384 *nvp = *ovp++; 1385 VPP_CLRPPLOCK(nvp++); 1386 } 1387 } else 1388 newsvd->vpage = NULL; 1389 1390 /* Inform the vnode of the new mapping */ 1391 if (newsvd->vp != NULL) { 1392 error = VOP_ADDMAP(newsvd->vp, (offset_t)newsvd->offset, 1393 newseg->s_as, newseg->s_base, newseg->s_size, newsvd->prot, 1394 newsvd->maxprot, newsvd->type, newsvd->cred); 1395 } 1396 out: 1397 return (error); 1398 } 1399 1400 1401 /* 1402 * callback function used by segvn_unmap to invoke free_vp_pages() for only 1403 * those pages actually processed by the HAT 1404 */ 1405 extern int free_pages; 1406 1407 static void 1408 segvn_hat_unload_callback(hat_callback_t *cb) 1409 { 1410 struct seg *seg = cb->hcb_data; 1411 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1412 size_t len; 1413 u_offset_t off; 1414 1415 ASSERT(svd->vp != NULL); 1416 ASSERT(cb->hcb_end_addr > cb->hcb_start_addr); 1417 ASSERT(cb->hcb_start_addr >= seg->s_base); 1418 1419 len = cb->hcb_end_addr - cb->hcb_start_addr; 1420 off = cb->hcb_start_addr - seg->s_base; 1421 free_vp_pages(svd->vp, svd->offset + off, len); 1422 } 1423 1424 1425 static int 1426 segvn_unmap(struct seg *seg, caddr_t addr, size_t len) 1427 { 1428 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1429 struct segvn_data *nsvd; 1430 struct seg *nseg; 1431 struct anon_map *amp; 1432 pgcnt_t opages; /* old segment size in pages */ 1433 pgcnt_t npages; /* new segment size in pages */ 1434 pgcnt_t dpages; /* pages being deleted (unmapped) */ 1435 hat_callback_t callback; /* used for free_vp_pages() */ 1436 hat_callback_t *cbp = NULL; 1437 caddr_t nbase; 1438 size_t nsize; 1439 size_t oswresv; 1440 int reclaim = 1; 1441 1442 /* 1443 * We don't need any segment level locks for "segvn" data 1444 * since the address space is "write" locked. 1445 */ 1446 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 1447 1448 /* 1449 * Fail the unmap if pages are SOFTLOCKed through this mapping. 1450 * softlockcnt is protected from change by the as write lock. 1451 */ 1452 retry: 1453 if (svd->softlockcnt > 0) { 1454 /* 1455 * since we do have the writers lock nobody can fill 1456 * the cache during the purge. The flush either succeeds 1457 * or we still have pending I/Os. 1458 */ 1459 if (reclaim == 1) { 1460 segvn_purge(seg); 1461 reclaim = 0; 1462 goto retry; 1463 } 1464 return (EAGAIN); 1465 } 1466 1467 /* 1468 * Check for bad sizes 1469 */ 1470 if (addr < seg->s_base || addr + len > seg->s_base + seg->s_size || 1471 (len & PAGEOFFSET) || ((uintptr_t)addr & PAGEOFFSET)) { 1472 panic("segvn_unmap"); 1473 /*NOTREACHED*/ 1474 } 1475 1476 if (seg->s_szc != 0) { 1477 size_t pgsz = page_get_pagesize(seg->s_szc); 1478 int err; 1479 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { 1480 ASSERT(seg->s_base != addr || seg->s_size != len); 1481 VM_STAT_ADD(segvnvmstats.demoterange[0]); 1482 err = segvn_demote_range(seg, addr, len, SDR_END, 0); 1483 if (err == 0) { 1484 return (IE_RETRY); 1485 } 1486 return (err); 1487 } 1488 } 1489 1490 /* Inform the vnode of the unmapping. */ 1491 if (svd->vp) { 1492 int error; 1493 1494 error = VOP_DELMAP(svd->vp, 1495 (offset_t)svd->offset + (uintptr_t)(addr - seg->s_base), 1496 seg->s_as, addr, len, svd->prot, svd->maxprot, 1497 svd->type, svd->cred); 1498 1499 if (error == EAGAIN) 1500 return (error); 1501 } 1502 /* 1503 * Remove any page locks set through this mapping. 1504 */ 1505 (void) segvn_lockop(seg, addr, len, 0, MC_UNLOCK, NULL, 0); 1506 1507 /* 1508 * Unload any hardware translations in the range to be taken out. 1509 * Use a callback to invoke free_vp_pages() effectively. 1510 */ 1511 if (svd->vp != NULL && free_pages != 0) { 1512 callback.hcb_data = seg; 1513 callback.hcb_function = segvn_hat_unload_callback; 1514 cbp = &callback; 1515 } 1516 hat_unload_callback(seg->s_as->a_hat, addr, len, HAT_UNLOAD_UNMAP, cbp); 1517 1518 /* 1519 * Check for entire segment 1520 */ 1521 if (addr == seg->s_base && len == seg->s_size) { 1522 seg_free(seg); 1523 return (0); 1524 } 1525 1526 opages = seg_pages(seg); 1527 dpages = btop(len); 1528 npages = opages - dpages; 1529 amp = svd->amp; 1530 ASSERT(amp == NULL || amp->a_szc >= seg->s_szc); 1531 1532 /* 1533 * Check for beginning of segment 1534 */ 1535 if (addr == seg->s_base) { 1536 if (svd->vpage != NULL) { 1537 size_t nbytes; 1538 struct vpage *ovpage; 1539 1540 ovpage = svd->vpage; /* keep pointer to vpage */ 1541 1542 nbytes = vpgtob(npages); 1543 svd->vpage = kmem_alloc(nbytes, KM_SLEEP); 1544 bcopy(&ovpage[dpages], svd->vpage, nbytes); 1545 1546 /* free up old vpage */ 1547 kmem_free(ovpage, vpgtob(opages)); 1548 } 1549 if (amp != NULL) { 1550 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1551 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { 1552 /* 1553 * Free up now unused parts of anon_map array. 1554 */ 1555 if (amp->a_szc == seg->s_szc) { 1556 if (seg->s_szc != 0) { 1557 anon_free_pages(amp->ahp, 1558 svd->anon_index, len, 1559 seg->s_szc); 1560 } else { 1561 anon_free(amp->ahp, 1562 svd->anon_index, 1563 len); 1564 } 1565 } else { 1566 ASSERT(svd->type == MAP_SHARED); 1567 ASSERT(amp->a_szc > seg->s_szc); 1568 anon_shmap_free_pages(amp, 1569 svd->anon_index, len); 1570 } 1571 1572 /* 1573 * Unreserve swap space for the 1574 * unmapped chunk of this segment in 1575 * case it's MAP_SHARED 1576 */ 1577 if (svd->type == MAP_SHARED) { 1578 anon_unresv(len); 1579 amp->swresv -= len; 1580 } 1581 } 1582 ANON_LOCK_EXIT(&->a_rwlock); 1583 svd->anon_index += dpages; 1584 } 1585 if (svd->vp != NULL) 1586 svd->offset += len; 1587 1588 if (svd->swresv) { 1589 if (svd->flags & MAP_NORESERVE) { 1590 ASSERT(amp); 1591 oswresv = svd->swresv; 1592 1593 svd->swresv = ptob(anon_pages(amp->ahp, 1594 svd->anon_index, npages)); 1595 anon_unresv(oswresv - svd->swresv); 1596 } else { 1597 anon_unresv(len); 1598 svd->swresv -= len; 1599 } 1600 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 1601 seg, len, 0); 1602 } 1603 1604 seg->s_base += len; 1605 seg->s_size -= len; 1606 return (0); 1607 } 1608 1609 /* 1610 * Check for end of segment 1611 */ 1612 if (addr + len == seg->s_base + seg->s_size) { 1613 if (svd->vpage != NULL) { 1614 size_t nbytes; 1615 struct vpage *ovpage; 1616 1617 ovpage = svd->vpage; /* keep pointer to vpage */ 1618 1619 nbytes = vpgtob(npages); 1620 svd->vpage = kmem_alloc(nbytes, KM_SLEEP); 1621 bcopy(ovpage, svd->vpage, nbytes); 1622 1623 /* free up old vpage */ 1624 kmem_free(ovpage, vpgtob(opages)); 1625 1626 } 1627 if (amp != NULL) { 1628 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1629 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { 1630 /* 1631 * Free up now unused parts of anon_map array. 1632 */ 1633 ulong_t an_idx = svd->anon_index + npages; 1634 if (amp->a_szc == seg->s_szc) { 1635 if (seg->s_szc != 0) { 1636 anon_free_pages(amp->ahp, 1637 an_idx, len, 1638 seg->s_szc); 1639 } else { 1640 anon_free(amp->ahp, an_idx, 1641 len); 1642 } 1643 } else { 1644 ASSERT(svd->type == MAP_SHARED); 1645 ASSERT(amp->a_szc > seg->s_szc); 1646 anon_shmap_free_pages(amp, 1647 an_idx, len); 1648 } 1649 1650 /* 1651 * Unreserve swap space for the 1652 * unmapped chunk of this segment in 1653 * case it's MAP_SHARED 1654 */ 1655 if (svd->type == MAP_SHARED) { 1656 anon_unresv(len); 1657 amp->swresv -= len; 1658 } 1659 } 1660 ANON_LOCK_EXIT(&->a_rwlock); 1661 } 1662 1663 if (svd->swresv) { 1664 if (svd->flags & MAP_NORESERVE) { 1665 ASSERT(amp); 1666 oswresv = svd->swresv; 1667 svd->swresv = ptob(anon_pages(amp->ahp, 1668 svd->anon_index, npages)); 1669 anon_unresv(oswresv - svd->swresv); 1670 } else { 1671 anon_unresv(len); 1672 svd->swresv -= len; 1673 } 1674 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 1675 "anon proc:%p %lu %u", seg, len, 0); 1676 } 1677 1678 seg->s_size -= len; 1679 return (0); 1680 } 1681 1682 /* 1683 * The section to go is in the middle of the segment, 1684 * have to make it into two segments. nseg is made for 1685 * the high end while seg is cut down at the low end. 1686 */ 1687 nbase = addr + len; /* new seg base */ 1688 nsize = (seg->s_base + seg->s_size) - nbase; /* new seg size */ 1689 seg->s_size = addr - seg->s_base; /* shrink old seg */ 1690 nseg = seg_alloc(seg->s_as, nbase, nsize); 1691 if (nseg == NULL) { 1692 panic("segvn_unmap seg_alloc"); 1693 /*NOTREACHED*/ 1694 } 1695 nseg->s_ops = seg->s_ops; 1696 nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 1697 nseg->s_data = (void *)nsvd; 1698 nseg->s_szc = seg->s_szc; 1699 *nsvd = *svd; 1700 nsvd->offset = svd->offset + (uintptr_t)(nseg->s_base - seg->s_base); 1701 nsvd->swresv = 0; 1702 nsvd->softlockcnt = 0; 1703 1704 if (svd->vp != NULL) { 1705 VN_HOLD(nsvd->vp); 1706 if (nsvd->type == MAP_SHARED) 1707 lgrp_shm_policy_init(NULL, nsvd->vp); 1708 } 1709 crhold(svd->cred); 1710 1711 if (svd->vpage == NULL) { 1712 nsvd->vpage = NULL; 1713 } else { 1714 /* need to split vpage into two arrays */ 1715 size_t nbytes; 1716 struct vpage *ovpage; 1717 1718 ovpage = svd->vpage; /* keep pointer to vpage */ 1719 1720 npages = seg_pages(seg); /* seg has shrunk */ 1721 nbytes = vpgtob(npages); 1722 svd->vpage = kmem_alloc(nbytes, KM_SLEEP); 1723 1724 bcopy(ovpage, svd->vpage, nbytes); 1725 1726 npages = seg_pages(nseg); 1727 nbytes = vpgtob(npages); 1728 nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP); 1729 1730 bcopy(&ovpage[opages - npages], nsvd->vpage, nbytes); 1731 1732 /* free up old vpage */ 1733 kmem_free(ovpage, vpgtob(opages)); 1734 } 1735 1736 if (amp == NULL) { 1737 nsvd->amp = NULL; 1738 nsvd->anon_index = 0; 1739 } else { 1740 /* 1741 * Need to create a new anon map for the new segment. 1742 * We'll also allocate a new smaller array for the old 1743 * smaller segment to save space. 1744 */ 1745 opages = btop((uintptr_t)(addr - seg->s_base)); 1746 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1747 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { 1748 /* 1749 * Free up now unused parts of anon_map array. 1750 */ 1751 ulong_t an_idx = svd->anon_index + opages; 1752 if (amp->a_szc == seg->s_szc) { 1753 if (seg->s_szc != 0) { 1754 anon_free_pages(amp->ahp, an_idx, len, 1755 seg->s_szc); 1756 } else { 1757 anon_free(amp->ahp, an_idx, 1758 len); 1759 } 1760 } else { 1761 ASSERT(svd->type == MAP_SHARED); 1762 ASSERT(amp->a_szc > seg->s_szc); 1763 anon_shmap_free_pages(amp, an_idx, len); 1764 } 1765 1766 /* 1767 * Unreserve swap space for the 1768 * unmapped chunk of this segment in 1769 * case it's MAP_SHARED 1770 */ 1771 if (svd->type == MAP_SHARED) { 1772 anon_unresv(len); 1773 amp->swresv -= len; 1774 } 1775 } 1776 nsvd->anon_index = svd->anon_index + 1777 btop((uintptr_t)(nseg->s_base - seg->s_base)); 1778 if (svd->type == MAP_SHARED) { 1779 amp->refcnt++; 1780 nsvd->amp = amp; 1781 } else { 1782 struct anon_map *namp; 1783 struct anon_hdr *nahp; 1784 1785 ASSERT(svd->type == MAP_PRIVATE); 1786 nahp = anon_create(btop(seg->s_size), ANON_SLEEP); 1787 namp = anonmap_alloc(nseg->s_size, 0); 1788 namp->a_szc = seg->s_szc; 1789 (void) anon_copy_ptr(amp->ahp, svd->anon_index, nahp, 1790 0, btop(seg->s_size), ANON_SLEEP); 1791 (void) anon_copy_ptr(amp->ahp, nsvd->anon_index, 1792 namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP); 1793 anon_release(amp->ahp, btop(amp->size)); 1794 svd->anon_index = 0; 1795 nsvd->anon_index = 0; 1796 amp->ahp = nahp; 1797 amp->size = seg->s_size; 1798 nsvd->amp = namp; 1799 } 1800 ANON_LOCK_EXIT(&->a_rwlock); 1801 } 1802 if (svd->swresv) { 1803 if (svd->flags & MAP_NORESERVE) { 1804 ASSERT(amp); 1805 oswresv = svd->swresv; 1806 svd->swresv = ptob(anon_pages(amp->ahp, 1807 svd->anon_index, btop(seg->s_size))); 1808 nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp, 1809 nsvd->anon_index, btop(nseg->s_size))); 1810 ASSERT(oswresv >= (svd->swresv + nsvd->swresv)); 1811 anon_unresv(oswresv - (svd->swresv + nsvd->swresv)); 1812 } else { 1813 if (seg->s_size + nseg->s_size + len != svd->swresv) { 1814 panic("segvn_unmap: " 1815 "cannot split swap reservation"); 1816 /*NOTREACHED*/ 1817 } 1818 anon_unresv(len); 1819 svd->swresv = seg->s_size; 1820 nsvd->swresv = nseg->s_size; 1821 } 1822 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 1823 seg, len, 0); 1824 } 1825 1826 return (0); /* I'm glad that's all over with! */ 1827 } 1828 1829 static void 1830 segvn_free(struct seg *seg) 1831 { 1832 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1833 pgcnt_t npages = seg_pages(seg); 1834 struct anon_map *amp; 1835 size_t len; 1836 1837 /* 1838 * We don't need any segment level locks for "segvn" data 1839 * since the address space is "write" locked. 1840 */ 1841 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 1842 1843 /* 1844 * Be sure to unlock pages. XXX Why do things get free'ed instead 1845 * of unmapped? XXX 1846 */ 1847 (void) segvn_lockop(seg, seg->s_base, seg->s_size, 1848 0, MC_UNLOCK, NULL, 0); 1849 1850 /* 1851 * Deallocate the vpage and anon pointers if necessary and possible. 1852 */ 1853 if (svd->vpage != NULL) { 1854 kmem_free(svd->vpage, vpgtob(npages)); 1855 svd->vpage = NULL; 1856 } 1857 if ((amp = svd->amp) != NULL) { 1858 /* 1859 * If there are no more references to this anon_map 1860 * structure, then deallocate the structure after freeing 1861 * up all the anon slot pointers that we can. 1862 */ 1863 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1864 ASSERT(amp->a_szc >= seg->s_szc); 1865 if (--amp->refcnt == 0) { 1866 if (svd->type == MAP_PRIVATE) { 1867 /* 1868 * Private - we only need to anon_free 1869 * the part that this segment refers to. 1870 */ 1871 if (seg->s_szc != 0) { 1872 anon_free_pages(amp->ahp, 1873 svd->anon_index, seg->s_size, 1874 seg->s_szc); 1875 } else { 1876 anon_free(amp->ahp, svd->anon_index, 1877 seg->s_size); 1878 } 1879 } else { 1880 /* 1881 * Shared - anon_free the entire 1882 * anon_map's worth of stuff and 1883 * release any swap reservation. 1884 */ 1885 if (amp->a_szc != 0) { 1886 anon_shmap_free_pages(amp, 0, 1887 amp->size); 1888 } else { 1889 anon_free(amp->ahp, 0, amp->size); 1890 } 1891 if ((len = amp->swresv) != 0) { 1892 anon_unresv(len); 1893 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 1894 "anon proc:%p %lu %u", 1895 seg, len, 0); 1896 } 1897 } 1898 svd->amp = NULL; 1899 ANON_LOCK_EXIT(&->a_rwlock); 1900 anonmap_free(amp); 1901 } else if (svd->type == MAP_PRIVATE) { 1902 /* 1903 * We had a private mapping which still has 1904 * a held anon_map so just free up all the 1905 * anon slot pointers that we were using. 1906 */ 1907 if (seg->s_szc != 0) { 1908 anon_free_pages(amp->ahp, svd->anon_index, 1909 seg->s_size, seg->s_szc); 1910 } else { 1911 anon_free(amp->ahp, svd->anon_index, 1912 seg->s_size); 1913 } 1914 ANON_LOCK_EXIT(&->a_rwlock); 1915 } else { 1916 ANON_LOCK_EXIT(&->a_rwlock); 1917 } 1918 } 1919 1920 /* 1921 * Release swap reservation. 1922 */ 1923 if ((len = svd->swresv) != 0) { 1924 anon_unresv(svd->swresv); 1925 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 1926 seg, len, 0); 1927 svd->swresv = 0; 1928 } 1929 /* 1930 * Release claim on vnode, credentials, and finally free the 1931 * private data. 1932 */ 1933 if (svd->vp != NULL) { 1934 if (svd->type == MAP_SHARED) 1935 lgrp_shm_policy_fini(NULL, svd->vp); 1936 VN_RELE(svd->vp); 1937 svd->vp = NULL; 1938 } 1939 crfree(svd->cred); 1940 svd->cred = NULL; 1941 1942 seg->s_data = NULL; 1943 kmem_cache_free(segvn_cache, svd); 1944 } 1945 1946 ulong_t segvn_lpglck_limit = 0; 1947 /* 1948 * Support routines used by segvn_pagelock() and softlock faults for anonymous 1949 * pages to implement availrmem accounting in a way that makes sure the 1950 * same memory is accounted just once for all softlock/pagelock purposes. 1951 * This prevents a bug when availrmem is quickly incorrectly exausted from 1952 * several pagelocks to different parts of the same large page since each 1953 * pagelock has to decrement availrmem by the size of the entire large 1954 * page. Note those pages are not COW shared until softunlock/pageunlock so 1955 * we don't need to use cow style accounting here. We also need to make sure 1956 * the entire large page is accounted even if softlock range is less than the 1957 * entire large page because large anon pages can't be demoted when any of 1958 * constituent pages is locked. The caller calls this routine for every page_t 1959 * it locks. The very first page in the range may not be the root page of a 1960 * large page. For all other pages it's guranteed we are going to visit the 1961 * root of a particular large page before any other constituent page as we are 1962 * locking sequential pages belonging to the same anon map. So we do all the 1963 * locking when the root is encountered except for the very first page. Since 1964 * softlocking is not supported (except S_READ_NOCOW special case) for vmpss 1965 * segments and since vnode pages can be demoted without locking all 1966 * constituent pages vnode pages don't come here. Unlocking relies on the 1967 * fact that pagesize can't change whenever any of constituent large pages is 1968 * locked at least SE_SHARED. This allows unlocking code to find the right 1969 * root and decrement availrmem by the same amount it was incremented when the 1970 * page was locked. 1971 */ 1972 static int 1973 segvn_pp_lock_anonpages(page_t *pp, int first) 1974 { 1975 pgcnt_t pages; 1976 pfn_t pfn; 1977 uchar_t szc = pp->p_szc; 1978 1979 ASSERT(PAGE_LOCKED(pp)); 1980 ASSERT(pp->p_vnode != NULL); 1981 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 1982 1983 /* 1984 * pagesize won't change as long as any constituent page is locked. 1985 */ 1986 pages = page_get_pagecnt(pp->p_szc); 1987 pfn = page_pptonum(pp); 1988 1989 if (!first) { 1990 if (!IS_P2ALIGNED(pfn, pages)) { 1991 #ifdef DEBUG 1992 pp = &pp[-(spgcnt_t)(pfn & (pages - 1))]; 1993 pfn = page_pptonum(pp); 1994 ASSERT(IS_P2ALIGNED(pfn, pages)); 1995 ASSERT(pp->p_szc == szc); 1996 ASSERT(pp->p_vnode != NULL); 1997 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 1998 ASSERT(pp->p_slckcnt != 0); 1999 #endif /* DEBUG */ 2000 return (1); 2001 } 2002 } else if (!IS_P2ALIGNED(pfn, pages)) { 2003 pp = &pp[-(spgcnt_t)(pfn & (pages - 1))]; 2004 #ifdef DEBUG 2005 pfn = page_pptonum(pp); 2006 ASSERT(IS_P2ALIGNED(pfn, pages)); 2007 ASSERT(pp->p_szc == szc); 2008 ASSERT(pp->p_vnode != NULL); 2009 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 2010 #endif /* DEBUG */ 2011 } 2012 2013 /* 2014 * pp is a root page. 2015 * We haven't locked this large page yet. 2016 */ 2017 page_struct_lock(pp); 2018 if (pp->p_slckcnt != 0) { 2019 if (pp->p_slckcnt < PAGE_SLOCK_MAXIMUM) { 2020 pp->p_slckcnt++; 2021 page_struct_unlock(pp); 2022 return (1); 2023 } 2024 page_struct_unlock(pp); 2025 segvn_lpglck_limit++; 2026 return (0); 2027 } 2028 mutex_enter(&freemem_lock); 2029 if (availrmem < tune.t_minarmem + pages) { 2030 mutex_exit(&freemem_lock); 2031 page_struct_unlock(pp); 2032 return (0); 2033 } 2034 pp->p_slckcnt++; 2035 availrmem -= pages; 2036 mutex_exit(&freemem_lock); 2037 page_struct_unlock(pp); 2038 return (1); 2039 } 2040 2041 static void 2042 segvn_pp_unlock_anonpages(page_t *pp, int first) 2043 { 2044 pgcnt_t pages; 2045 pfn_t pfn; 2046 2047 ASSERT(PAGE_LOCKED(pp)); 2048 ASSERT(pp->p_vnode != NULL); 2049 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 2050 2051 /* 2052 * pagesize won't change as long as any constituent page is locked. 2053 */ 2054 pages = page_get_pagecnt(pp->p_szc); 2055 pfn = page_pptonum(pp); 2056 2057 if (!first) { 2058 if (!IS_P2ALIGNED(pfn, pages)) { 2059 return; 2060 } 2061 } else if (!IS_P2ALIGNED(pfn, pages)) { 2062 pp = &pp[-(spgcnt_t)(pfn & (pages - 1))]; 2063 #ifdef DEBUG 2064 pfn = page_pptonum(pp); 2065 ASSERT(IS_P2ALIGNED(pfn, pages)); 2066 #endif /* DEBUG */ 2067 } 2068 ASSERT(pp->p_vnode != NULL); 2069 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 2070 ASSERT(pp->p_slckcnt != 0); 2071 page_struct_lock(pp); 2072 if (--pp->p_slckcnt == 0) { 2073 mutex_enter(&freemem_lock); 2074 availrmem += pages; 2075 mutex_exit(&freemem_lock); 2076 } 2077 page_struct_unlock(pp); 2078 } 2079 2080 /* 2081 * Do a F_SOFTUNLOCK call over the range requested. The range must have 2082 * already been F_SOFTLOCK'ed. 2083 * Caller must always match addr and len of a softunlock with a previous 2084 * softlock with exactly the same addr and len. 2085 */ 2086 static void 2087 segvn_softunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw) 2088 { 2089 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 2090 page_t *pp; 2091 caddr_t adr; 2092 struct vnode *vp; 2093 u_offset_t offset; 2094 ulong_t anon_index; 2095 struct anon_map *amp; 2096 struct anon *ap = NULL; 2097 2098 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 2099 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 2100 2101 if ((amp = svd->amp) != NULL) 2102 anon_index = svd->anon_index + seg_page(seg, addr); 2103 2104 hat_unlock(seg->s_as->a_hat, addr, len); 2105 for (adr = addr; adr < addr + len; adr += PAGESIZE) { 2106 if (amp != NULL) { 2107 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 2108 if ((ap = anon_get_ptr(amp->ahp, anon_index++)) 2109 != NULL) { 2110 swap_xlate(ap, &vp, &offset); 2111 } else { 2112 vp = svd->vp; 2113 offset = svd->offset + 2114 (uintptr_t)(adr - seg->s_base); 2115 } 2116 ANON_LOCK_EXIT(&->a_rwlock); 2117 } else { 2118 vp = svd->vp; 2119 offset = svd->offset + 2120 (uintptr_t)(adr - seg->s_base); 2121 } 2122 2123 /* 2124 * Use page_find() instead of page_lookup() to 2125 * find the page since we know that it is locked. 2126 */ 2127 pp = page_find(vp, offset); 2128 if (pp == NULL) { 2129 panic( 2130 "segvn_softunlock: addr %p, ap %p, vp %p, off %llx", 2131 (void *)adr, (void *)ap, (void *)vp, offset); 2132 /*NOTREACHED*/ 2133 } 2134 2135 if (rw == S_WRITE) { 2136 hat_setrefmod(pp); 2137 if (seg->s_as->a_vbits) 2138 hat_setstat(seg->s_as, adr, PAGESIZE, 2139 P_REF | P_MOD); 2140 } else if (rw != S_OTHER) { 2141 hat_setref(pp); 2142 if (seg->s_as->a_vbits) 2143 hat_setstat(seg->s_as, adr, PAGESIZE, P_REF); 2144 } 2145 TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT, 2146 "segvn_fault:pp %p vp %p offset %llx", pp, vp, offset); 2147 if (svd->vp == NULL) { 2148 segvn_pp_unlock_anonpages(pp, adr == addr); 2149 } 2150 page_unlock(pp); 2151 } 2152 mutex_enter(&freemem_lock); /* for availrmem */ 2153 if (svd->vp != NULL) { 2154 availrmem += btop(len); 2155 } 2156 segvn_pages_locked -= btop(len); 2157 svd->softlockcnt -= btop(len); 2158 mutex_exit(&freemem_lock); 2159 if (svd->softlockcnt == 0) { 2160 /* 2161 * All SOFTLOCKS are gone. Wakeup any waiting 2162 * unmappers so they can try again to unmap. 2163 * Check for waiters first without the mutex 2164 * held so we don't always grab the mutex on 2165 * softunlocks. 2166 */ 2167 if (AS_ISUNMAPWAIT(seg->s_as)) { 2168 mutex_enter(&seg->s_as->a_contents); 2169 if (AS_ISUNMAPWAIT(seg->s_as)) { 2170 AS_CLRUNMAPWAIT(seg->s_as); 2171 cv_broadcast(&seg->s_as->a_cv); 2172 } 2173 mutex_exit(&seg->s_as->a_contents); 2174 } 2175 } 2176 } 2177 2178 #define PAGE_HANDLED ((page_t *)-1) 2179 2180 /* 2181 * Release all the pages in the NULL terminated ppp list 2182 * which haven't already been converted to PAGE_HANDLED. 2183 */ 2184 static void 2185 segvn_pagelist_rele(page_t **ppp) 2186 { 2187 for (; *ppp != NULL; ppp++) { 2188 if (*ppp != PAGE_HANDLED) 2189 page_unlock(*ppp); 2190 } 2191 } 2192 2193 static int stealcow = 1; 2194 2195 /* 2196 * Workaround for viking chip bug. See bug id 1220902. 2197 * To fix this down in pagefault() would require importing so 2198 * much as and segvn code as to be unmaintainable. 2199 */ 2200 int enable_mbit_wa = 0; 2201 2202 /* 2203 * Handles all the dirty work of getting the right 2204 * anonymous pages and loading up the translations. 2205 * This routine is called only from segvn_fault() 2206 * when looping over the range of addresses requested. 2207 * 2208 * The basic algorithm here is: 2209 * If this is an anon_zero case 2210 * Call anon_zero to allocate page 2211 * Load up translation 2212 * Return 2213 * endif 2214 * If this is an anon page 2215 * Use anon_getpage to get the page 2216 * else 2217 * Find page in pl[] list passed in 2218 * endif 2219 * If not a cow 2220 * Load up the translation to the page 2221 * return 2222 * endif 2223 * Call anon_private to handle cow 2224 * Load up (writable) translation to new page 2225 */ 2226 static faultcode_t 2227 segvn_faultpage( 2228 struct hat *hat, /* the hat to use for mapping */ 2229 struct seg *seg, /* seg_vn of interest */ 2230 caddr_t addr, /* address in as */ 2231 u_offset_t off, /* offset in vp */ 2232 struct vpage *vpage, /* pointer to vpage for vp, off */ 2233 page_t *pl[], /* object source page pointer */ 2234 uint_t vpprot, /* access allowed to object pages */ 2235 enum fault_type type, /* type of fault */ 2236 enum seg_rw rw, /* type of access at fault */ 2237 int brkcow, /* we may need to break cow */ 2238 int first) /* first page for this fault if 1 */ 2239 { 2240 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 2241 page_t *pp, **ppp; 2242 uint_t pageflags = 0; 2243 page_t *anon_pl[1 + 1]; 2244 page_t *opp = NULL; /* original page */ 2245 uint_t prot; 2246 int err; 2247 int cow; 2248 int claim; 2249 int steal = 0; 2250 ulong_t anon_index; 2251 struct anon *ap, *oldap; 2252 struct anon_map *amp; 2253 int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD; 2254 int anon_lock = 0; 2255 anon_sync_obj_t cookie; 2256 2257 if (svd->flags & MAP_TEXT) { 2258 hat_flag |= HAT_LOAD_TEXT; 2259 } 2260 2261 ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock)); 2262 ASSERT(seg->s_szc == 0); 2263 2264 /* 2265 * Initialize protection value for this page. 2266 * If we have per page protection values check it now. 2267 */ 2268 if (svd->pageprot) { 2269 uint_t protchk; 2270 2271 switch (rw) { 2272 case S_READ: 2273 protchk = PROT_READ; 2274 break; 2275 case S_WRITE: 2276 protchk = PROT_WRITE; 2277 break; 2278 case S_EXEC: 2279 protchk = PROT_EXEC; 2280 break; 2281 case S_OTHER: 2282 default: 2283 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 2284 break; 2285 } 2286 2287 prot = VPP_PROT(vpage); 2288 if ((prot & protchk) == 0) 2289 return (FC_PROT); /* illegal access type */ 2290 } else { 2291 prot = svd->prot; 2292 } 2293 2294 if (type == F_SOFTLOCK && svd->vp != NULL) { 2295 mutex_enter(&freemem_lock); 2296 if (availrmem <= tune.t_minarmem) { 2297 mutex_exit(&freemem_lock); 2298 return (FC_MAKE_ERR(ENOMEM)); /* out of real memory */ 2299 } else { 2300 availrmem--; 2301 svd->softlockcnt++; 2302 segvn_pages_locked++; 2303 } 2304 mutex_exit(&freemem_lock); 2305 } 2306 2307 /* 2308 * Always acquire the anon array lock to prevent 2 threads from 2309 * allocating separate anon slots for the same "addr". 2310 */ 2311 2312 if ((amp = svd->amp) != NULL) { 2313 ASSERT(RW_READ_HELD(&->a_rwlock)); 2314 anon_index = svd->anon_index + seg_page(seg, addr); 2315 anon_array_enter(amp, anon_index, &cookie); 2316 anon_lock = 1; 2317 } 2318 2319 if (svd->vp == NULL && amp != NULL) { 2320 if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) { 2321 /* 2322 * Allocate a (normally) writable anonymous page of 2323 * zeroes. If no advance reservations, reserve now. 2324 */ 2325 if (svd->flags & MAP_NORESERVE) { 2326 if (anon_resv(ptob(1))) { 2327 svd->swresv += ptob(1); 2328 } else { 2329 err = ENOMEM; 2330 goto out; 2331 } 2332 } 2333 if ((pp = anon_zero(seg, addr, &ap, 2334 svd->cred)) == NULL) { 2335 err = ENOMEM; 2336 goto out; /* out of swap space */ 2337 } 2338 /* 2339 * Re-acquire the anon_map lock and 2340 * initialize the anon array entry. 2341 */ 2342 (void) anon_set_ptr(amp->ahp, anon_index, ap, 2343 ANON_SLEEP); 2344 2345 ASSERT(pp->p_szc == 0); 2346 2347 /* 2348 * Handle pages that have been marked for migration 2349 */ 2350 if (lgrp_optimizations()) 2351 page_migrate(seg, addr, &pp, 1); 2352 2353 if (type == F_SOFTLOCK) { 2354 if (!segvn_pp_lock_anonpages(pp, first)) { 2355 page_unlock(pp); 2356 err = ENOMEM; 2357 goto out; 2358 } else { 2359 mutex_enter(&freemem_lock); 2360 svd->softlockcnt++; 2361 segvn_pages_locked++; 2362 mutex_exit(&freemem_lock); 2363 } 2364 } 2365 2366 if (enable_mbit_wa) { 2367 if (rw == S_WRITE) 2368 hat_setmod(pp); 2369 else if (!hat_ismod(pp)) 2370 prot &= ~PROT_WRITE; 2371 } 2372 /* 2373 * If AS_PAGLCK is set in a_flags (via memcntl(2) 2374 * with MC_LOCKAS, MCL_FUTURE) and this is a 2375 * MAP_NORESERVE segment, we may need to 2376 * permanently lock the page as it is being faulted 2377 * for the first time. The following text applies 2378 * only to MAP_NORESERVE segments: 2379 * 2380 * As per memcntl(2), if this segment was created 2381 * after MCL_FUTURE was applied (a "future" 2382 * segment), its pages must be locked. If this 2383 * segment existed at MCL_FUTURE application (a 2384 * "past" segment), the interface is unclear. 2385 * 2386 * We decide to lock only if vpage is present: 2387 * 2388 * - "future" segments will have a vpage array (see 2389 * as_map), and so will be locked as required 2390 * 2391 * - "past" segments may not have a vpage array, 2392 * depending on whether events (such as 2393 * mprotect) have occurred. Locking if vpage 2394 * exists will preserve legacy behavior. Not 2395 * locking if vpage is absent, will not break 2396 * the interface or legacy behavior. Note that 2397 * allocating vpage here if it's absent requires 2398 * upgrading the segvn reader lock, the cost of 2399 * which does not seem worthwhile. 2400 * 2401 * Usually testing and setting VPP_ISPPLOCK and 2402 * VPP_SETPPLOCK requires holding the segvn lock as 2403 * writer, but in this case all readers are 2404 * serializing on the anon array lock. 2405 */ 2406 if (AS_ISPGLCK(seg->s_as) && vpage != NULL && 2407 (svd->flags & MAP_NORESERVE) && 2408 !VPP_ISPPLOCK(vpage)) { 2409 proc_t *p = seg->s_as->a_proc; 2410 ASSERT(svd->type == MAP_PRIVATE); 2411 mutex_enter(&p->p_lock); 2412 if (rctl_incr_locked_mem(p, NULL, PAGESIZE, 2413 1) == 0) { 2414 claim = VPP_PROT(vpage) & PROT_WRITE; 2415 if (page_pp_lock(pp, claim, 0)) { 2416 VPP_SETPPLOCK(vpage); 2417 } else { 2418 rctl_decr_locked_mem(p, NULL, 2419 PAGESIZE, 1); 2420 } 2421 } 2422 mutex_exit(&p->p_lock); 2423 } 2424 2425 hat_memload(hat, addr, pp, prot, hat_flag); 2426 2427 if (!(hat_flag & HAT_LOAD_LOCK)) 2428 page_unlock(pp); 2429 2430 anon_array_exit(&cookie); 2431 return (0); 2432 } 2433 } 2434 2435 /* 2436 * Obtain the page structure via anon_getpage() if it is 2437 * a private copy of an object (the result of a previous 2438 * copy-on-write). 2439 */ 2440 if (amp != NULL) { 2441 if ((ap = anon_get_ptr(amp->ahp, anon_index)) != NULL) { 2442 err = anon_getpage(&ap, &vpprot, anon_pl, PAGESIZE, 2443 seg, addr, rw, svd->cred); 2444 if (err) 2445 goto out; 2446 2447 if (svd->type == MAP_SHARED) { 2448 /* 2449 * If this is a shared mapping to an 2450 * anon_map, then ignore the write 2451 * permissions returned by anon_getpage(). 2452 * They apply to the private mappings 2453 * of this anon_map. 2454 */ 2455 vpprot |= PROT_WRITE; 2456 } 2457 opp = anon_pl[0]; 2458 } 2459 } 2460 2461 /* 2462 * Search the pl[] list passed in if it is from the 2463 * original object (i.e., not a private copy). 2464 */ 2465 if (opp == NULL) { 2466 /* 2467 * Find original page. We must be bringing it in 2468 * from the list in pl[]. 2469 */ 2470 for (ppp = pl; (opp = *ppp) != NULL; ppp++) { 2471 if (opp == PAGE_HANDLED) 2472 continue; 2473 ASSERT(opp->p_vnode == svd->vp); /* XXX */ 2474 if (opp->p_offset == off) 2475 break; 2476 } 2477 if (opp == NULL) { 2478 panic("segvn_faultpage not found"); 2479 /*NOTREACHED*/ 2480 } 2481 *ppp = PAGE_HANDLED; 2482 2483 } 2484 2485 ASSERT(PAGE_LOCKED(opp)); 2486 2487 TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT, 2488 "segvn_fault:pp %p vp %p offset %llx", 2489 opp, NULL, 0); 2490 2491 /* 2492 * The fault is treated as a copy-on-write fault if a 2493 * write occurs on a private segment and the object 2494 * page (i.e., mapping) is write protected. We assume 2495 * that fatal protection checks have already been made. 2496 */ 2497 2498 cow = brkcow && ((vpprot & PROT_WRITE) == 0); 2499 2500 /* 2501 * If not a copy-on-write case load the translation 2502 * and return. 2503 */ 2504 if (cow == 0) { 2505 2506 /* 2507 * Handle pages that have been marked for migration 2508 */ 2509 if (lgrp_optimizations()) 2510 page_migrate(seg, addr, &opp, 1); 2511 2512 if (type == F_SOFTLOCK && svd->vp == NULL) { 2513 2514 ASSERT(opp->p_szc == 0 || 2515 (svd->type == MAP_SHARED && 2516 amp != NULL && amp->a_szc != 0)); 2517 2518 if (!segvn_pp_lock_anonpages(opp, first)) { 2519 page_unlock(opp); 2520 err = ENOMEM; 2521 goto out; 2522 } else { 2523 mutex_enter(&freemem_lock); 2524 svd->softlockcnt++; 2525 segvn_pages_locked++; 2526 mutex_exit(&freemem_lock); 2527 } 2528 } 2529 if (IS_VMODSORT(opp->p_vnode) || enable_mbit_wa) { 2530 if (rw == S_WRITE) 2531 hat_setmod(opp); 2532 else if (rw != S_OTHER && !hat_ismod(opp)) 2533 prot &= ~PROT_WRITE; 2534 } 2535 2536 hat_memload(hat, addr, opp, prot & vpprot, hat_flag); 2537 2538 if (!(hat_flag & HAT_LOAD_LOCK)) 2539 page_unlock(opp); 2540 2541 if (anon_lock) { 2542 anon_array_exit(&cookie); 2543 } 2544 return (0); 2545 } 2546 2547 hat_setref(opp); 2548 2549 ASSERT(amp != NULL && anon_lock); 2550 2551 /* 2552 * Steal the page only if it isn't a private page 2553 * since stealing a private page is not worth the effort. 2554 */ 2555 if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) 2556 steal = 1; 2557 2558 /* 2559 * Steal the original page if the following conditions are true: 2560 * 2561 * We are low on memory, the page is not private, page is not large, 2562 * not shared, not modified, not `locked' or if we have it `locked' 2563 * (i.e., p_cowcnt == 1 and p_lckcnt == 0, which also implies 2564 * that the page is not shared) and if it doesn't have any 2565 * translations. page_struct_lock isn't needed to look at p_cowcnt 2566 * and p_lckcnt because we first get exclusive lock on page. 2567 */ 2568 (void) hat_pagesync(opp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD); 2569 2570 if (stealcow && freemem < minfree && steal && opp->p_szc == 0 && 2571 page_tryupgrade(opp) && !hat_ismod(opp) && 2572 ((opp->p_lckcnt == 0 && opp->p_cowcnt == 0) || 2573 (opp->p_lckcnt == 0 && opp->p_cowcnt == 1 && 2574 vpage != NULL && VPP_ISPPLOCK(vpage)))) { 2575 /* 2576 * Check if this page has other translations 2577 * after unloading our translation. 2578 */ 2579 if (hat_page_is_mapped(opp)) { 2580 hat_unload(seg->s_as->a_hat, addr, PAGESIZE, 2581 HAT_UNLOAD); 2582 } 2583 2584 /* 2585 * hat_unload() might sync back someone else's recent 2586 * modification, so check again. 2587 */ 2588 if (!hat_ismod(opp) && !hat_page_is_mapped(opp)) 2589 pageflags |= STEAL_PAGE; 2590 } 2591 2592 /* 2593 * If we have a vpage pointer, see if it indicates that we have 2594 * ``locked'' the page we map -- if so, tell anon_private to 2595 * transfer the locking resource to the new page. 2596 * 2597 * See Statement at the beginning of segvn_lockop regarding 2598 * the way lockcnts/cowcnts are handled during COW. 2599 * 2600 */ 2601 if (vpage != NULL && VPP_ISPPLOCK(vpage)) 2602 pageflags |= LOCK_PAGE; 2603 2604 /* 2605 * Allocate a private page and perform the copy. 2606 * For MAP_NORESERVE reserve swap space now, unless this 2607 * is a cow fault on an existing anon page in which case 2608 * MAP_NORESERVE will have made advance reservations. 2609 */ 2610 if ((svd->flags & MAP_NORESERVE) && (ap == NULL)) { 2611 if (anon_resv(ptob(1))) { 2612 svd->swresv += ptob(1); 2613 } else { 2614 page_unlock(opp); 2615 err = ENOMEM; 2616 goto out; 2617 } 2618 } 2619 oldap = ap; 2620 pp = anon_private(&ap, seg, addr, prot, opp, pageflags, svd->cred); 2621 if (pp == NULL) { 2622 err = ENOMEM; /* out of swap space */ 2623 goto out; 2624 } 2625 2626 /* 2627 * If we copied away from an anonymous page, then 2628 * we are one step closer to freeing up an anon slot. 2629 * 2630 * NOTE: The original anon slot must be released while 2631 * holding the "anon_map" lock. This is necessary to prevent 2632 * other threads from obtaining a pointer to the anon slot 2633 * which may be freed if its "refcnt" is 1. 2634 */ 2635 if (oldap != NULL) 2636 anon_decref(oldap); 2637 2638 (void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP); 2639 2640 /* 2641 * Handle pages that have been marked for migration 2642 */ 2643 if (lgrp_optimizations()) 2644 page_migrate(seg, addr, &pp, 1); 2645 2646 ASSERT(pp->p_szc == 0); 2647 if (type == F_SOFTLOCK && svd->vp == NULL) { 2648 if (!segvn_pp_lock_anonpages(pp, first)) { 2649 page_unlock(pp); 2650 err = ENOMEM; 2651 goto out; 2652 } else { 2653 mutex_enter(&freemem_lock); 2654 svd->softlockcnt++; 2655 segvn_pages_locked++; 2656 mutex_exit(&freemem_lock); 2657 } 2658 } 2659 2660 ASSERT(!IS_VMODSORT(pp->p_vnode)); 2661 if (enable_mbit_wa) { 2662 if (rw == S_WRITE) 2663 hat_setmod(pp); 2664 else if (!hat_ismod(pp)) 2665 prot &= ~PROT_WRITE; 2666 } 2667 2668 hat_memload(hat, addr, pp, prot, hat_flag); 2669 2670 if (!(hat_flag & HAT_LOAD_LOCK)) 2671 page_unlock(pp); 2672 2673 ASSERT(anon_lock); 2674 anon_array_exit(&cookie); 2675 return (0); 2676 out: 2677 if (anon_lock) 2678 anon_array_exit(&cookie); 2679 2680 if (type == F_SOFTLOCK && svd->vp != NULL) { 2681 mutex_enter(&freemem_lock); 2682 availrmem++; 2683 segvn_pages_locked--; 2684 svd->softlockcnt--; 2685 mutex_exit(&freemem_lock); 2686 } 2687 return (FC_MAKE_ERR(err)); 2688 } 2689 2690 /* 2691 * relocate a bunch of smaller targ pages into one large repl page. all targ 2692 * pages must be complete pages smaller than replacement pages. 2693 * it's assumed that no page's szc can change since they are all PAGESIZE or 2694 * complete large pages locked SHARED. 2695 */ 2696 static void 2697 segvn_relocate_pages(page_t **targ, page_t *replacement) 2698 { 2699 page_t *pp; 2700 pgcnt_t repl_npgs, curnpgs; 2701 pgcnt_t i; 2702 uint_t repl_szc = replacement->p_szc; 2703 page_t *first_repl = replacement; 2704 page_t *repl; 2705 spgcnt_t npgs; 2706 2707 VM_STAT_ADD(segvnvmstats.relocatepages[0]); 2708 2709 ASSERT(repl_szc != 0); 2710 npgs = repl_npgs = page_get_pagecnt(repl_szc); 2711 2712 i = 0; 2713 while (repl_npgs) { 2714 spgcnt_t nreloc; 2715 int err; 2716 ASSERT(replacement != NULL); 2717 pp = targ[i]; 2718 ASSERT(pp->p_szc < repl_szc); 2719 ASSERT(PAGE_EXCL(pp)); 2720 ASSERT(!PP_ISFREE(pp)); 2721 curnpgs = page_get_pagecnt(pp->p_szc); 2722 if (curnpgs == 1) { 2723 VM_STAT_ADD(segvnvmstats.relocatepages[1]); 2724 repl = replacement; 2725 page_sub(&replacement, repl); 2726 ASSERT(PAGE_EXCL(repl)); 2727 ASSERT(!PP_ISFREE(repl)); 2728 ASSERT(repl->p_szc == repl_szc); 2729 } else { 2730 page_t *repl_savepp; 2731 int j; 2732 VM_STAT_ADD(segvnvmstats.relocatepages[2]); 2733 repl_savepp = replacement; 2734 for (j = 0; j < curnpgs; j++) { 2735 repl = replacement; 2736 page_sub(&replacement, repl); 2737 ASSERT(PAGE_EXCL(repl)); 2738 ASSERT(!PP_ISFREE(repl)); 2739 ASSERT(repl->p_szc == repl_szc); 2740 ASSERT(page_pptonum(targ[i + j]) == 2741 page_pptonum(targ[i]) + j); 2742 } 2743 repl = repl_savepp; 2744 ASSERT(IS_P2ALIGNED(page_pptonum(repl), curnpgs)); 2745 } 2746 err = page_relocate(&pp, &repl, 0, 1, &nreloc, NULL); 2747 if (err || nreloc != curnpgs) { 2748 panic("segvn_relocate_pages: " 2749 "page_relocate failed err=%d curnpgs=%ld " 2750 "nreloc=%ld", err, curnpgs, nreloc); 2751 } 2752 ASSERT(curnpgs <= repl_npgs); 2753 repl_npgs -= curnpgs; 2754 i += curnpgs; 2755 } 2756 ASSERT(replacement == NULL); 2757 2758 repl = first_repl; 2759 repl_npgs = npgs; 2760 for (i = 0; i < repl_npgs; i++) { 2761 ASSERT(PAGE_EXCL(repl)); 2762 ASSERT(!PP_ISFREE(repl)); 2763 targ[i] = repl; 2764 page_downgrade(targ[i]); 2765 repl++; 2766 } 2767 } 2768 2769 /* 2770 * Check if all pages in ppa array are complete smaller than szc pages and 2771 * their roots will still be aligned relative to their current size if the 2772 * entire ppa array is relocated into one szc page. If these conditions are 2773 * not met return 0. 2774 * 2775 * If all pages are properly aligned attempt to upgrade their locks 2776 * to exclusive mode. If it fails set *upgrdfail to 1 and return 0. 2777 * upgrdfail was set to 0 by caller. 2778 * 2779 * Return 1 if all pages are aligned and locked exclusively. 2780 * 2781 * If all pages in ppa array happen to be physically contiguous to make one 2782 * szc page and all exclusive locks are successfully obtained promote the page 2783 * size to szc and set *pszc to szc. Return 1 with pages locked shared. 2784 */ 2785 static int 2786 segvn_full_szcpages(page_t **ppa, uint_t szc, int *upgrdfail, uint_t *pszc) 2787 { 2788 page_t *pp; 2789 pfn_t pfn; 2790 pgcnt_t totnpgs = page_get_pagecnt(szc); 2791 pfn_t first_pfn; 2792 int contig = 1; 2793 pgcnt_t i; 2794 pgcnt_t j; 2795 uint_t curszc; 2796 pgcnt_t curnpgs; 2797 int root = 0; 2798 2799 ASSERT(szc > 0); 2800 2801 VM_STAT_ADD(segvnvmstats.fullszcpages[0]); 2802 2803 for (i = 0; i < totnpgs; i++) { 2804 pp = ppa[i]; 2805 ASSERT(PAGE_SHARED(pp)); 2806 ASSERT(!PP_ISFREE(pp)); 2807 pfn = page_pptonum(pp); 2808 if (i == 0) { 2809 if (!IS_P2ALIGNED(pfn, totnpgs)) { 2810 contig = 0; 2811 } else { 2812 first_pfn = pfn; 2813 } 2814 } else if (contig && pfn != first_pfn + i) { 2815 contig = 0; 2816 } 2817 if (pp->p_szc == 0) { 2818 if (root) { 2819 VM_STAT_ADD(segvnvmstats.fullszcpages[1]); 2820 return (0); 2821 } 2822 } else if (!root) { 2823 if ((curszc = pp->p_szc) >= szc) { 2824 VM_STAT_ADD(segvnvmstats.fullszcpages[2]); 2825 return (0); 2826 } 2827 if (curszc == 0) { 2828 /* 2829 * p_szc changed means we don't have all pages 2830 * locked. return failure. 2831 */ 2832 VM_STAT_ADD(segvnvmstats.fullszcpages[3]); 2833 return (0); 2834 } 2835 curnpgs = page_get_pagecnt(curszc); 2836 if (!IS_P2ALIGNED(pfn, curnpgs) || 2837 !IS_P2ALIGNED(i, curnpgs)) { 2838 VM_STAT_ADD(segvnvmstats.fullszcpages[4]); 2839 return (0); 2840 } 2841 root = 1; 2842 } else { 2843 ASSERT(i > 0); 2844 VM_STAT_ADD(segvnvmstats.fullszcpages[5]); 2845 if (pp->p_szc != curszc) { 2846 VM_STAT_ADD(segvnvmstats.fullszcpages[6]); 2847 return (0); 2848 } 2849 if (pfn - 1 != page_pptonum(ppa[i - 1])) { 2850 panic("segvn_full_szcpages: " 2851 "large page not physically contiguous"); 2852 } 2853 if (P2PHASE(pfn, curnpgs) == curnpgs - 1) { 2854 root = 0; 2855 } 2856 } 2857 } 2858 2859 for (i = 0; i < totnpgs; i++) { 2860 ASSERT(ppa[i]->p_szc < szc); 2861 if (!page_tryupgrade(ppa[i])) { 2862 for (j = 0; j < i; j++) { 2863 page_downgrade(ppa[j]); 2864 } 2865 *pszc = ppa[i]->p_szc; 2866 *upgrdfail = 1; 2867 VM_STAT_ADD(segvnvmstats.fullszcpages[7]); 2868 return (0); 2869 } 2870 } 2871 2872 /* 2873 * When a page is put a free cachelist its szc is set to 0. if file 2874 * system reclaimed pages from cachelist targ pages will be physically 2875 * contiguous with 0 p_szc. in this case just upgrade szc of targ 2876 * pages without any relocations. 2877 * To avoid any hat issues with previous small mappings 2878 * hat_pageunload() the target pages first. 2879 */ 2880 if (contig) { 2881 VM_STAT_ADD(segvnvmstats.fullszcpages[8]); 2882 for (i = 0; i < totnpgs; i++) { 2883 (void) hat_pageunload(ppa[i], HAT_FORCE_PGUNLOAD); 2884 } 2885 for (i = 0; i < totnpgs; i++) { 2886 ppa[i]->p_szc = szc; 2887 } 2888 for (i = 0; i < totnpgs; i++) { 2889 ASSERT(PAGE_EXCL(ppa[i])); 2890 page_downgrade(ppa[i]); 2891 } 2892 if (pszc != NULL) { 2893 *pszc = szc; 2894 } 2895 } 2896 VM_STAT_ADD(segvnvmstats.fullszcpages[9]); 2897 return (1); 2898 } 2899 2900 /* 2901 * Create physically contiguous pages for [vp, off] - [vp, off + 2902 * page_size(szc)) range and for private segment return them in ppa array. 2903 * Pages are created either via IO or relocations. 2904 * 2905 * Return 1 on sucess and 0 on failure. 2906 * 2907 * If physically contiguos pages already exist for this range return 1 without 2908 * filling ppa array. Caller initializes ppa[0] as NULL to detect that ppa 2909 * array wasn't filled. In this case caller fills ppa array via VOP_GETPAGE(). 2910 */ 2911 2912 static int 2913 segvn_fill_vp_pages(struct segvn_data *svd, vnode_t *vp, u_offset_t off, 2914 uint_t szc, page_t **ppa, page_t **ppplist, uint_t *ret_pszc, 2915 int *downsize) 2916 2917 { 2918 page_t *pplist = *ppplist; 2919 size_t pgsz = page_get_pagesize(szc); 2920 pgcnt_t pages = btop(pgsz); 2921 ulong_t start_off = off; 2922 u_offset_t eoff = off + pgsz; 2923 spgcnt_t nreloc; 2924 u_offset_t io_off = off; 2925 size_t io_len; 2926 page_t *io_pplist = NULL; 2927 page_t *done_pplist = NULL; 2928 pgcnt_t pgidx = 0; 2929 page_t *pp; 2930 page_t *newpp; 2931 page_t *targpp; 2932 int io_err = 0; 2933 int i; 2934 pfn_t pfn; 2935 ulong_t ppages; 2936 page_t *targ_pplist = NULL; 2937 page_t *repl_pplist = NULL; 2938 page_t *tmp_pplist; 2939 int nios = 0; 2940 uint_t pszc; 2941 struct vattr va; 2942 2943 VM_STAT_ADD(segvnvmstats.fill_vp_pages[0]); 2944 2945 ASSERT(szc != 0); 2946 ASSERT(pplist->p_szc == szc); 2947 2948 /* 2949 * downsize will be set to 1 only if we fail to lock pages. this will 2950 * allow subsequent faults to try to relocate the page again. If we 2951 * fail due to misalignment don't downsize and let the caller map the 2952 * whole region with small mappings to avoid more faults into the area 2953 * where we can't get large pages anyway. 2954 */ 2955 *downsize = 0; 2956 2957 while (off < eoff) { 2958 newpp = pplist; 2959 ASSERT(newpp != NULL); 2960 ASSERT(PAGE_EXCL(newpp)); 2961 ASSERT(!PP_ISFREE(newpp)); 2962 /* 2963 * we pass NULL for nrelocp to page_lookup_create() 2964 * so that it doesn't relocate. We relocate here 2965 * later only after we make sure we can lock all 2966 * pages in the range we handle and they are all 2967 * aligned. 2968 */ 2969 pp = page_lookup_create(vp, off, SE_SHARED, newpp, NULL, 0); 2970 ASSERT(pp != NULL); 2971 ASSERT(!PP_ISFREE(pp)); 2972 ASSERT(pp->p_vnode == vp); 2973 ASSERT(pp->p_offset == off); 2974 if (pp == newpp) { 2975 VM_STAT_ADD(segvnvmstats.fill_vp_pages[1]); 2976 page_sub(&pplist, pp); 2977 ASSERT(PAGE_EXCL(pp)); 2978 ASSERT(page_iolock_assert(pp)); 2979 page_list_concat(&io_pplist, &pp); 2980 off += PAGESIZE; 2981 continue; 2982 } 2983 VM_STAT_ADD(segvnvmstats.fill_vp_pages[2]); 2984 pfn = page_pptonum(pp); 2985 pszc = pp->p_szc; 2986 if (pszc >= szc && targ_pplist == NULL && io_pplist == NULL && 2987 IS_P2ALIGNED(pfn, pages)) { 2988 ASSERT(repl_pplist == NULL); 2989 ASSERT(done_pplist == NULL); 2990 ASSERT(pplist == *ppplist); 2991 page_unlock(pp); 2992 page_free_replacement_page(pplist); 2993 page_create_putback(pages); 2994 *ppplist = NULL; 2995 VM_STAT_ADD(segvnvmstats.fill_vp_pages[3]); 2996 return (1); 2997 } 2998 if (pszc >= szc) { 2999 page_unlock(pp); 3000 segvn_faultvnmpss_align_err1++; 3001 goto out; 3002 } 3003 ppages = page_get_pagecnt(pszc); 3004 if (!IS_P2ALIGNED(pfn, ppages)) { 3005 ASSERT(pszc > 0); 3006 /* 3007 * sizing down to pszc won't help. 3008 */ 3009 page_unlock(pp); 3010 segvn_faultvnmpss_align_err2++; 3011 goto out; 3012 } 3013 pfn = page_pptonum(newpp); 3014 if (!IS_P2ALIGNED(pfn, ppages)) { 3015 ASSERT(pszc > 0); 3016 /* 3017 * sizing down to pszc won't help. 3018 */ 3019 page_unlock(pp); 3020 segvn_faultvnmpss_align_err3++; 3021 goto out; 3022 } 3023 if (!PAGE_EXCL(pp)) { 3024 VM_STAT_ADD(segvnvmstats.fill_vp_pages[4]); 3025 page_unlock(pp); 3026 *downsize = 1; 3027 *ret_pszc = pp->p_szc; 3028 goto out; 3029 } 3030 targpp = pp; 3031 if (io_pplist != NULL) { 3032 VM_STAT_ADD(segvnvmstats.fill_vp_pages[5]); 3033 io_len = off - io_off; 3034 /* 3035 * Some file systems like NFS don't check EOF 3036 * conditions in VOP_PAGEIO(). Check it here 3037 * now that pages are locked SE_EXCL. Any file 3038 * truncation will wait until the pages are 3039 * unlocked so no need to worry that file will 3040 * be truncated after we check its size here. 3041 * XXX fix NFS to remove this check. 3042 */ 3043 va.va_mask = AT_SIZE; 3044 if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred) != 0) { 3045 VM_STAT_ADD(segvnvmstats.fill_vp_pages[6]); 3046 page_unlock(targpp); 3047 goto out; 3048 } 3049 if (btopr(va.va_size) < btopr(io_off + io_len)) { 3050 VM_STAT_ADD(segvnvmstats.fill_vp_pages[7]); 3051 *downsize = 1; 3052 *ret_pszc = 0; 3053 page_unlock(targpp); 3054 goto out; 3055 } 3056 io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len, 3057 B_READ, svd->cred); 3058 if (io_err) { 3059 VM_STAT_ADD(segvnvmstats.fill_vp_pages[8]); 3060 page_unlock(targpp); 3061 if (io_err == EDEADLK) { 3062 segvn_vmpss_pageio_deadlk_err++; 3063 } 3064 goto out; 3065 } 3066 nios++; 3067 VM_STAT_ADD(segvnvmstats.fill_vp_pages[9]); 3068 while (io_pplist != NULL) { 3069 pp = io_pplist; 3070 page_sub(&io_pplist, pp); 3071 ASSERT(page_iolock_assert(pp)); 3072 page_io_unlock(pp); 3073 pgidx = (pp->p_offset - start_off) >> 3074 PAGESHIFT; 3075 ASSERT(pgidx < pages); 3076 ppa[pgidx] = pp; 3077 page_list_concat(&done_pplist, &pp); 3078 } 3079 } 3080 pp = targpp; 3081 ASSERT(PAGE_EXCL(pp)); 3082 ASSERT(pp->p_szc <= pszc); 3083 if (pszc != 0 && !group_page_trylock(pp, SE_EXCL)) { 3084 VM_STAT_ADD(segvnvmstats.fill_vp_pages[10]); 3085 page_unlock(pp); 3086 *downsize = 1; 3087 *ret_pszc = pp->p_szc; 3088 goto out; 3089 } 3090 VM_STAT_ADD(segvnvmstats.fill_vp_pages[11]); 3091 /* 3092 * page szc chould have changed before the entire group was 3093 * locked. reread page szc. 3094 */ 3095 pszc = pp->p_szc; 3096 ppages = page_get_pagecnt(pszc); 3097 3098 /* link just the roots */ 3099 page_list_concat(&targ_pplist, &pp); 3100 page_sub(&pplist, newpp); 3101 page_list_concat(&repl_pplist, &newpp); 3102 off += PAGESIZE; 3103 while (--ppages != 0) { 3104 newpp = pplist; 3105 page_sub(&pplist, newpp); 3106 off += PAGESIZE; 3107 } 3108 io_off = off; 3109 } 3110 if (io_pplist != NULL) { 3111 VM_STAT_ADD(segvnvmstats.fill_vp_pages[12]); 3112 io_len = eoff - io_off; 3113 va.va_mask = AT_SIZE; 3114 if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred) != 0) { 3115 VM_STAT_ADD(segvnvmstats.fill_vp_pages[13]); 3116 goto out; 3117 } 3118 if (btopr(va.va_size) < btopr(io_off + io_len)) { 3119 VM_STAT_ADD(segvnvmstats.fill_vp_pages[14]); 3120 *downsize = 1; 3121 *ret_pszc = 0; 3122 goto out; 3123 } 3124 io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len, 3125 B_READ, svd->cred); 3126 if (io_err) { 3127 VM_STAT_ADD(segvnvmstats.fill_vp_pages[15]); 3128 if (io_err == EDEADLK) { 3129 segvn_vmpss_pageio_deadlk_err++; 3130 } 3131 goto out; 3132 } 3133 nios++; 3134 while (io_pplist != NULL) { 3135 pp = io_pplist; 3136 page_sub(&io_pplist, pp); 3137 ASSERT(page_iolock_assert(pp)); 3138 page_io_unlock(pp); 3139 pgidx = (pp->p_offset - start_off) >> PAGESHIFT; 3140 ASSERT(pgidx < pages); 3141 ppa[pgidx] = pp; 3142 } 3143 } 3144 /* 3145 * we're now bound to succeed or panic. 3146 * remove pages from done_pplist. it's not needed anymore. 3147 */ 3148 while (done_pplist != NULL) { 3149 pp = done_pplist; 3150 page_sub(&done_pplist, pp); 3151 } 3152 VM_STAT_ADD(segvnvmstats.fill_vp_pages[16]); 3153 ASSERT(pplist == NULL); 3154 *ppplist = NULL; 3155 while (targ_pplist != NULL) { 3156 int ret; 3157 VM_STAT_ADD(segvnvmstats.fill_vp_pages[17]); 3158 ASSERT(repl_pplist); 3159 pp = targ_pplist; 3160 page_sub(&targ_pplist, pp); 3161 pgidx = (pp->p_offset - start_off) >> PAGESHIFT; 3162 newpp = repl_pplist; 3163 page_sub(&repl_pplist, newpp); 3164 #ifdef DEBUG 3165 pfn = page_pptonum(pp); 3166 pszc = pp->p_szc; 3167 ppages = page_get_pagecnt(pszc); 3168 ASSERT(IS_P2ALIGNED(pfn, ppages)); 3169 pfn = page_pptonum(newpp); 3170 ASSERT(IS_P2ALIGNED(pfn, ppages)); 3171 ASSERT(P2PHASE(pfn, pages) == pgidx); 3172 #endif 3173 nreloc = 0; 3174 ret = page_relocate(&pp, &newpp, 0, 1, &nreloc, NULL); 3175 if (ret != 0 || nreloc == 0) { 3176 panic("segvn_fill_vp_pages: " 3177 "page_relocate failed"); 3178 } 3179 pp = newpp; 3180 while (nreloc-- != 0) { 3181 ASSERT(PAGE_EXCL(pp)); 3182 ASSERT(pp->p_vnode == vp); 3183 ASSERT(pgidx == 3184 ((pp->p_offset - start_off) >> PAGESHIFT)); 3185 ppa[pgidx++] = pp; 3186 pp++; 3187 } 3188 } 3189 3190 if (svd->type == MAP_PRIVATE) { 3191 VM_STAT_ADD(segvnvmstats.fill_vp_pages[18]); 3192 for (i = 0; i < pages; i++) { 3193 ASSERT(ppa[i] != NULL); 3194 ASSERT(PAGE_EXCL(ppa[i])); 3195 ASSERT(ppa[i]->p_vnode == vp); 3196 ASSERT(ppa[i]->p_offset == 3197 start_off + (i << PAGESHIFT)); 3198 page_downgrade(ppa[i]); 3199 } 3200 ppa[pages] = NULL; 3201 } else { 3202 VM_STAT_ADD(segvnvmstats.fill_vp_pages[19]); 3203 /* 3204 * the caller will still call VOP_GETPAGE() for shared segments 3205 * to check FS write permissions. For private segments we map 3206 * file read only anyway. so no VOP_GETPAGE is needed. 3207 */ 3208 for (i = 0; i < pages; i++) { 3209 ASSERT(ppa[i] != NULL); 3210 ASSERT(PAGE_EXCL(ppa[i])); 3211 ASSERT(ppa[i]->p_vnode == vp); 3212 ASSERT(ppa[i]->p_offset == 3213 start_off + (i << PAGESHIFT)); 3214 page_unlock(ppa[i]); 3215 } 3216 ppa[0] = NULL; 3217 } 3218 3219 return (1); 3220 out: 3221 /* 3222 * Do the cleanup. Unlock target pages we didn't relocate. They are 3223 * linked on targ_pplist by root pages. reassemble unused replacement 3224 * and io pages back to pplist. 3225 */ 3226 if (io_pplist != NULL) { 3227 VM_STAT_ADD(segvnvmstats.fill_vp_pages[20]); 3228 pp = io_pplist; 3229 do { 3230 ASSERT(pp->p_vnode == vp); 3231 ASSERT(pp->p_offset == io_off); 3232 ASSERT(page_iolock_assert(pp)); 3233 page_io_unlock(pp); 3234 page_hashout(pp, NULL); 3235 io_off += PAGESIZE; 3236 } while ((pp = pp->p_next) != io_pplist); 3237 page_list_concat(&io_pplist, &pplist); 3238 pplist = io_pplist; 3239 } 3240 tmp_pplist = NULL; 3241 while (targ_pplist != NULL) { 3242 VM_STAT_ADD(segvnvmstats.fill_vp_pages[21]); 3243 pp = targ_pplist; 3244 ASSERT(PAGE_EXCL(pp)); 3245 page_sub(&targ_pplist, pp); 3246 3247 pszc = pp->p_szc; 3248 ppages = page_get_pagecnt(pszc); 3249 ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages)); 3250 3251 if (pszc != 0) { 3252 group_page_unlock(pp); 3253 } 3254 page_unlock(pp); 3255 3256 pp = repl_pplist; 3257 ASSERT(pp != NULL); 3258 ASSERT(PAGE_EXCL(pp)); 3259 ASSERT(pp->p_szc == szc); 3260 page_sub(&repl_pplist, pp); 3261 3262 ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages)); 3263 3264 /* relink replacement page */ 3265 page_list_concat(&tmp_pplist, &pp); 3266 while (--ppages != 0) { 3267 VM_STAT_ADD(segvnvmstats.fill_vp_pages[22]); 3268 pp++; 3269 ASSERT(PAGE_EXCL(pp)); 3270 ASSERT(pp->p_szc == szc); 3271 page_list_concat(&tmp_pplist, &pp); 3272 } 3273 } 3274 if (tmp_pplist != NULL) { 3275 VM_STAT_ADD(segvnvmstats.fill_vp_pages[23]); 3276 page_list_concat(&tmp_pplist, &pplist); 3277 pplist = tmp_pplist; 3278 } 3279 /* 3280 * at this point all pages are either on done_pplist or 3281 * pplist. They can't be all on done_pplist otherwise 3282 * we'd've been done. 3283 */ 3284 ASSERT(pplist != NULL); 3285 if (nios != 0) { 3286 VM_STAT_ADD(segvnvmstats.fill_vp_pages[24]); 3287 pp = pplist; 3288 do { 3289 VM_STAT_ADD(segvnvmstats.fill_vp_pages[25]); 3290 ASSERT(pp->p_szc == szc); 3291 ASSERT(PAGE_EXCL(pp)); 3292 ASSERT(pp->p_vnode != vp); 3293 pp->p_szc = 0; 3294 } while ((pp = pp->p_next) != pplist); 3295 3296 pp = done_pplist; 3297 do { 3298 VM_STAT_ADD(segvnvmstats.fill_vp_pages[26]); 3299 ASSERT(pp->p_szc == szc); 3300 ASSERT(PAGE_EXCL(pp)); 3301 ASSERT(pp->p_vnode == vp); 3302 pp->p_szc = 0; 3303 } while ((pp = pp->p_next) != done_pplist); 3304 3305 while (pplist != NULL) { 3306 VM_STAT_ADD(segvnvmstats.fill_vp_pages[27]); 3307 pp = pplist; 3308 page_sub(&pplist, pp); 3309 page_free(pp, 0); 3310 } 3311 3312 while (done_pplist != NULL) { 3313 VM_STAT_ADD(segvnvmstats.fill_vp_pages[28]); 3314 pp = done_pplist; 3315 page_sub(&done_pplist, pp); 3316 page_unlock(pp); 3317 } 3318 *ppplist = NULL; 3319 return (0); 3320 } 3321 ASSERT(pplist == *ppplist); 3322 if (io_err) { 3323 VM_STAT_ADD(segvnvmstats.fill_vp_pages[29]); 3324 /* 3325 * don't downsize on io error. 3326 * see if vop_getpage succeeds. 3327 * pplist may still be used in this case 3328 * for relocations. 3329 */ 3330 return (0); 3331 } 3332 VM_STAT_ADD(segvnvmstats.fill_vp_pages[30]); 3333 page_free_replacement_page(pplist); 3334 page_create_putback(pages); 3335 *ppplist = NULL; 3336 return (0); 3337 } 3338 3339 int segvn_anypgsz = 0; 3340 3341 #define SEGVN_RESTORE_SOFTLOCK(type, pages) \ 3342 if ((type) == F_SOFTLOCK) { \ 3343 mutex_enter(&freemem_lock); \ 3344 availrmem += (pages); \ 3345 segvn_pages_locked -= (pages); \ 3346 svd->softlockcnt -= (pages); \ 3347 mutex_exit(&freemem_lock); \ 3348 } 3349 3350 #define SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot) \ 3351 if (IS_VMODSORT((ppa)[0]->p_vnode)) { \ 3352 if ((rw) == S_WRITE) { \ 3353 for (i = 0; i < (pages); i++) { \ 3354 ASSERT((ppa)[i]->p_vnode == \ 3355 (ppa)[0]->p_vnode); \ 3356 hat_setmod((ppa)[i]); \ 3357 } \ 3358 } else if ((rw) != S_OTHER && \ 3359 ((prot) & (vpprot) & PROT_WRITE)) { \ 3360 for (i = 0; i < (pages); i++) { \ 3361 ASSERT((ppa)[i]->p_vnode == \ 3362 (ppa)[0]->p_vnode); \ 3363 if (!hat_ismod((ppa)[i])) { \ 3364 prot &= ~PROT_WRITE; \ 3365 break; \ 3366 } \ 3367 } \ 3368 } \ 3369 } 3370 3371 #ifdef VM_STATS 3372 3373 #define SEGVN_VMSTAT_FLTVNPAGES(idx) \ 3374 VM_STAT_ADD(segvnvmstats.fltvnpages[(idx)]); 3375 3376 #else /* VM_STATS */ 3377 3378 #define SEGVN_VMSTAT_FLTVNPAGES(idx) 3379 3380 #endif 3381 3382 static faultcode_t 3383 segvn_fault_vnodepages(struct hat *hat, struct seg *seg, caddr_t lpgaddr, 3384 caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr, 3385 caddr_t eaddr, int brkcow) 3386 { 3387 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 3388 struct anon_map *amp = svd->amp; 3389 uchar_t segtype = svd->type; 3390 uint_t szc = seg->s_szc; 3391 size_t pgsz = page_get_pagesize(szc); 3392 size_t maxpgsz = pgsz; 3393 pgcnt_t pages = btop(pgsz); 3394 pgcnt_t maxpages = pages; 3395 size_t ppasize = (pages + 1) * sizeof (page_t *); 3396 caddr_t a = lpgaddr; 3397 caddr_t maxlpgeaddr = lpgeaddr; 3398 u_offset_t off = svd->offset + (uintptr_t)(a - seg->s_base); 3399 ulong_t aindx = svd->anon_index + seg_page(seg, a); 3400 struct vpage *vpage = (svd->vpage != NULL) ? 3401 &svd->vpage[seg_page(seg, a)] : NULL; 3402 vnode_t *vp = svd->vp; 3403 page_t **ppa; 3404 uint_t pszc; 3405 size_t ppgsz; 3406 pgcnt_t ppages; 3407 faultcode_t err = 0; 3408 int ierr; 3409 int vop_size_err = 0; 3410 uint_t protchk, prot, vpprot; 3411 ulong_t i; 3412 int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD; 3413 anon_sync_obj_t an_cookie; 3414 enum seg_rw arw; 3415 int alloc_failed = 0; 3416 int adjszc_chk; 3417 struct vattr va; 3418 int xhat = 0; 3419 page_t *pplist; 3420 pfn_t pfn; 3421 int physcontig; 3422 int upgrdfail; 3423 int segvn_anypgsz_vnode = 0; /* for now map vnode with 2 page sizes */ 3424 3425 ASSERT(szc != 0); 3426 ASSERT(vp != NULL); 3427 ASSERT(brkcow == 0 || amp != NULL); 3428 ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */ 3429 ASSERT(!(svd->flags & MAP_NORESERVE)); 3430 ASSERT(type != F_SOFTUNLOCK); 3431 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 3432 ASSERT(amp == NULL || IS_P2ALIGNED(aindx, maxpages)); 3433 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 3434 ASSERT(seg->s_szc < NBBY * sizeof (int)); 3435 ASSERT(type != F_SOFTLOCK || lpgeaddr - a == maxpgsz); 3436 3437 VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltvnpages[0]); 3438 VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltvnpages[1]); 3439 3440 if (svd->flags & MAP_TEXT) { 3441 hat_flag |= HAT_LOAD_TEXT; 3442 } 3443 3444 if (svd->pageprot) { 3445 switch (rw) { 3446 case S_READ: 3447 protchk = PROT_READ; 3448 break; 3449 case S_WRITE: 3450 protchk = PROT_WRITE; 3451 break; 3452 case S_EXEC: 3453 protchk = PROT_EXEC; 3454 break; 3455 case S_OTHER: 3456 default: 3457 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 3458 break; 3459 } 3460 } else { 3461 prot = svd->prot; 3462 /* caller has already done segment level protection check. */ 3463 } 3464 3465 if (seg->s_as->a_hat != hat) { 3466 xhat = 1; 3467 } 3468 3469 if (rw == S_WRITE && segtype == MAP_PRIVATE) { 3470 SEGVN_VMSTAT_FLTVNPAGES(2); 3471 arw = S_READ; 3472 } else { 3473 arw = rw; 3474 } 3475 3476 ppa = kmem_alloc(ppasize, KM_SLEEP); 3477 3478 VM_STAT_COND_ADD(amp != NULL, segvnvmstats.fltvnpages[3]); 3479 3480 for (;;) { 3481 adjszc_chk = 0; 3482 for (; a < lpgeaddr; a += pgsz, off += pgsz, aindx += pages) { 3483 if (adjszc_chk) { 3484 while (szc < seg->s_szc) { 3485 uintptr_t e; 3486 uint_t tszc; 3487 tszc = segvn_anypgsz_vnode ? szc + 1 : 3488 seg->s_szc; 3489 ppgsz = page_get_pagesize(tszc); 3490 if (!IS_P2ALIGNED(a, ppgsz) || 3491 ((alloc_failed >> tszc) & 3492 0x1)) { 3493 break; 3494 } 3495 SEGVN_VMSTAT_FLTVNPAGES(4); 3496 szc = tszc; 3497 pgsz = ppgsz; 3498 pages = btop(pgsz); 3499 e = P2ROUNDUP((uintptr_t)eaddr, pgsz); 3500 lpgeaddr = (caddr_t)e; 3501 } 3502 } 3503 3504 again: 3505 if (IS_P2ALIGNED(a, maxpgsz) && amp != NULL) { 3506 ASSERT(IS_P2ALIGNED(aindx, maxpages)); 3507 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 3508 anon_array_enter(amp, aindx, &an_cookie); 3509 if (anon_get_ptr(amp->ahp, aindx) != NULL) { 3510 SEGVN_VMSTAT_FLTVNPAGES(5); 3511 if (anon_pages(amp->ahp, aindx, 3512 maxpages) != maxpages) { 3513 panic("segvn_fault_vnodepages:" 3514 " empty anon slots\n"); 3515 } 3516 anon_array_exit(&an_cookie); 3517 ANON_LOCK_EXIT(&->a_rwlock); 3518 err = segvn_fault_anonpages(hat, seg, 3519 a, a + maxpgsz, type, rw, 3520 MAX(a, addr), 3521 MIN(a + maxpgsz, eaddr), brkcow); 3522 if (err != 0) { 3523 SEGVN_VMSTAT_FLTVNPAGES(6); 3524 goto out; 3525 } 3526 if (szc < seg->s_szc) { 3527 szc = seg->s_szc; 3528 pgsz = maxpgsz; 3529 pages = maxpages; 3530 lpgeaddr = maxlpgeaddr; 3531 } 3532 goto next; 3533 } else if (anon_pages(amp->ahp, aindx, 3534 maxpages)) { 3535 panic("segvn_fault_vnodepages:" 3536 " non empty anon slots\n"); 3537 } else { 3538 SEGVN_VMSTAT_FLTVNPAGES(7); 3539 anon_array_exit(&an_cookie); 3540 ANON_LOCK_EXIT(&->a_rwlock); 3541 } 3542 } 3543 ASSERT(!brkcow || IS_P2ALIGNED(a, maxpgsz)); 3544 3545 if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) { 3546 ASSERT(vpage != NULL); 3547 prot = VPP_PROT(vpage); 3548 ASSERT(sameprot(seg, a, maxpgsz)); 3549 if ((prot & protchk) == 0) { 3550 SEGVN_VMSTAT_FLTVNPAGES(8); 3551 err = FC_PROT; 3552 goto out; 3553 } 3554 } 3555 if (type == F_SOFTLOCK) { 3556 mutex_enter(&freemem_lock); 3557 if (availrmem < tune.t_minarmem + pages) { 3558 mutex_exit(&freemem_lock); 3559 err = FC_MAKE_ERR(ENOMEM); 3560 goto out; 3561 } else { 3562 availrmem -= pages; 3563 segvn_pages_locked += pages; 3564 svd->softlockcnt += pages; 3565 } 3566 mutex_exit(&freemem_lock); 3567 } 3568 3569 pplist = NULL; 3570 physcontig = 0; 3571 ppa[0] = NULL; 3572 if (!brkcow && szc && 3573 !page_exists_physcontig(vp, off, szc, 3574 segtype == MAP_PRIVATE ? ppa : NULL)) { 3575 SEGVN_VMSTAT_FLTVNPAGES(9); 3576 if (page_alloc_pages(vp, seg, a, &pplist, NULL, 3577 szc, 0) && type != F_SOFTLOCK) { 3578 SEGVN_VMSTAT_FLTVNPAGES(10); 3579 pszc = 0; 3580 ierr = -1; 3581 alloc_failed |= (1 << szc); 3582 break; 3583 } 3584 if (pplist != NULL && 3585 vp->v_mpssdata == SEGVN_PAGEIO) { 3586 int downsize; 3587 SEGVN_VMSTAT_FLTVNPAGES(11); 3588 physcontig = segvn_fill_vp_pages(svd, 3589 vp, off, szc, ppa, &pplist, 3590 &pszc, &downsize); 3591 ASSERT(!physcontig || pplist == NULL); 3592 if (!physcontig && downsize && 3593 type != F_SOFTLOCK) { 3594 ASSERT(pplist == NULL); 3595 SEGVN_VMSTAT_FLTVNPAGES(12); 3596 ierr = -1; 3597 break; 3598 } 3599 ASSERT(!physcontig || 3600 segtype == MAP_PRIVATE || 3601 ppa[0] == NULL); 3602 if (physcontig && ppa[0] == NULL) { 3603 physcontig = 0; 3604 } 3605 } 3606 } else if (!brkcow && szc && ppa[0] != NULL) { 3607 SEGVN_VMSTAT_FLTVNPAGES(13); 3608 ASSERT(segtype == MAP_PRIVATE); 3609 physcontig = 1; 3610 } 3611 3612 if (!physcontig) { 3613 SEGVN_VMSTAT_FLTVNPAGES(14); 3614 ppa[0] = NULL; 3615 ierr = VOP_GETPAGE(vp, (offset_t)off, pgsz, 3616 &vpprot, ppa, pgsz, seg, a, arw, 3617 svd->cred); 3618 if (segtype == MAP_PRIVATE) { 3619 SEGVN_VMSTAT_FLTVNPAGES(15); 3620 vpprot &= ~PROT_WRITE; 3621 } 3622 } else { 3623 ASSERT(segtype == MAP_PRIVATE); 3624 SEGVN_VMSTAT_FLTVNPAGES(16); 3625 vpprot = PROT_ALL & ~PROT_WRITE; 3626 ierr = 0; 3627 } 3628 3629 if (ierr != 0) { 3630 SEGVN_VMSTAT_FLTVNPAGES(17); 3631 if (pplist != NULL) { 3632 SEGVN_VMSTAT_FLTVNPAGES(18); 3633 page_free_replacement_page(pplist); 3634 page_create_putback(pages); 3635 } 3636 SEGVN_RESTORE_SOFTLOCK(type, pages); 3637 if (a + pgsz <= eaddr) { 3638 SEGVN_VMSTAT_FLTVNPAGES(19); 3639 err = FC_MAKE_ERR(ierr); 3640 goto out; 3641 } 3642 va.va_mask = AT_SIZE; 3643 if (VOP_GETATTR(vp, &va, 0, svd->cred) != 0) { 3644 SEGVN_VMSTAT_FLTVNPAGES(20); 3645 err = FC_MAKE_ERR(EIO); 3646 goto out; 3647 } 3648 if (btopr(va.va_size) >= btopr(off + pgsz)) { 3649 SEGVN_VMSTAT_FLTVNPAGES(21); 3650 err = FC_MAKE_ERR(ierr); 3651 goto out; 3652 } 3653 if (btopr(va.va_size) < 3654 btopr(off + (eaddr - a))) { 3655 SEGVN_VMSTAT_FLTVNPAGES(22); 3656 err = FC_MAKE_ERR(ierr); 3657 goto out; 3658 } 3659 if (brkcow || type == F_SOFTLOCK) { 3660 /* can't reduce map area */ 3661 SEGVN_VMSTAT_FLTVNPAGES(23); 3662 vop_size_err = 1; 3663 goto out; 3664 } 3665 SEGVN_VMSTAT_FLTVNPAGES(24); 3666 ASSERT(szc != 0); 3667 pszc = 0; 3668 ierr = -1; 3669 break; 3670 } 3671 3672 if (amp != NULL) { 3673 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 3674 anon_array_enter(amp, aindx, &an_cookie); 3675 } 3676 if (amp != NULL && 3677 anon_get_ptr(amp->ahp, aindx) != NULL) { 3678 ulong_t taindx = P2ALIGN(aindx, maxpages); 3679 3680 SEGVN_VMSTAT_FLTVNPAGES(25); 3681 if (anon_pages(amp->ahp, taindx, maxpages) != 3682 maxpages) { 3683 panic("segvn_fault_vnodepages:" 3684 " empty anon slots\n"); 3685 } 3686 for (i = 0; i < pages; i++) { 3687 page_unlock(ppa[i]); 3688 } 3689 anon_array_exit(&an_cookie); 3690 ANON_LOCK_EXIT(&->a_rwlock); 3691 if (pplist != NULL) { 3692 page_free_replacement_page(pplist); 3693 page_create_putback(pages); 3694 } 3695 SEGVN_RESTORE_SOFTLOCK(type, pages); 3696 if (szc < seg->s_szc) { 3697 SEGVN_VMSTAT_FLTVNPAGES(26); 3698 /* 3699 * For private segments SOFTLOCK 3700 * either always breaks cow (any rw 3701 * type except S_READ_NOCOW) or 3702 * address space is locked as writer 3703 * (S_READ_NOCOW case) and anon slots 3704 * can't show up on second check. 3705 * Therefore if we are here for 3706 * SOFTLOCK case it must be a cow 3707 * break but cow break never reduces 3708 * szc. Thus the assert below. 3709 */ 3710 ASSERT(!brkcow && type != F_SOFTLOCK); 3711 pszc = seg->s_szc; 3712 ierr = -2; 3713 break; 3714 } 3715 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 3716 goto again; 3717 } 3718 #ifdef DEBUG 3719 if (amp != NULL) { 3720 ulong_t taindx = P2ALIGN(aindx, maxpages); 3721 ASSERT(!anon_pages(amp->ahp, taindx, maxpages)); 3722 } 3723 #endif /* DEBUG */ 3724 3725 if (brkcow) { 3726 ASSERT(amp != NULL); 3727 ASSERT(pplist == NULL); 3728 ASSERT(szc == seg->s_szc); 3729 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 3730 ASSERT(IS_P2ALIGNED(aindx, maxpages)); 3731 SEGVN_VMSTAT_FLTVNPAGES(27); 3732 ierr = anon_map_privatepages(amp, aindx, szc, 3733 seg, a, prot, ppa, vpage, segvn_anypgsz, 3734 svd->cred); 3735 if (ierr != 0) { 3736 SEGVN_VMSTAT_FLTVNPAGES(28); 3737 anon_array_exit(&an_cookie); 3738 ANON_LOCK_EXIT(&->a_rwlock); 3739 SEGVN_RESTORE_SOFTLOCK(type, pages); 3740 err = FC_MAKE_ERR(ierr); 3741 goto out; 3742 } 3743 3744 ASSERT(!IS_VMODSORT(ppa[0]->p_vnode)); 3745 /* 3746 * p_szc can't be changed for locked 3747 * swapfs pages. 3748 */ 3749 hat_memload_array(hat, a, pgsz, ppa, prot, 3750 hat_flag); 3751 3752 if (!(hat_flag & HAT_LOAD_LOCK)) { 3753 SEGVN_VMSTAT_FLTVNPAGES(29); 3754 for (i = 0; i < pages; i++) { 3755 page_unlock(ppa[i]); 3756 } 3757 } 3758 anon_array_exit(&an_cookie); 3759 ANON_LOCK_EXIT(&->a_rwlock); 3760 goto next; 3761 } 3762 3763 pfn = page_pptonum(ppa[0]); 3764 /* 3765 * hat_page_demote() needs an EXCl lock on one of 3766 * constituent page_t's and it decreases root's p_szc 3767 * last. This means if root's p_szc is equal szc and 3768 * all its constituent pages are locked 3769 * hat_page_demote() that could have changed p_szc to 3770 * szc is already done and no new have page_demote() 3771 * can start for this large page. 3772 */ 3773 3774 /* 3775 * we need to make sure same mapping size is used for 3776 * the same address range if there's a possibility the 3777 * adddress is already mapped because hat layer panics 3778 * when translation is loaded for the range already 3779 * mapped with a different page size. We achieve it 3780 * by always using largest page size possible subject 3781 * to the constraints of page size, segment page size 3782 * and page alignment. Since mappings are invalidated 3783 * when those constraints change and make it 3784 * impossible to use previously used mapping size no 3785 * mapping size conflicts should happen. 3786 */ 3787 3788 chkszc: 3789 if ((pszc = ppa[0]->p_szc) == szc && 3790 IS_P2ALIGNED(pfn, pages)) { 3791 3792 SEGVN_VMSTAT_FLTVNPAGES(30); 3793 #ifdef DEBUG 3794 for (i = 0; i < pages; i++) { 3795 ASSERT(PAGE_LOCKED(ppa[i])); 3796 ASSERT(!PP_ISFREE(ppa[i])); 3797 ASSERT(page_pptonum(ppa[i]) == 3798 pfn + i); 3799 ASSERT(ppa[i]->p_szc == szc); 3800 ASSERT(ppa[i]->p_vnode == vp); 3801 ASSERT(ppa[i]->p_offset == 3802 off + (i << PAGESHIFT)); 3803 } 3804 #endif /* DEBUG */ 3805 /* 3806 * All pages are of szc we need and they are 3807 * all locked so they can't change szc. load 3808 * translations. 3809 * 3810 * if page got promoted since last check 3811 * we don't need pplist. 3812 */ 3813 if (pplist != NULL) { 3814 page_free_replacement_page(pplist); 3815 page_create_putback(pages); 3816 } 3817 if (PP_ISMIGRATE(ppa[0])) { 3818 page_migrate(seg, a, ppa, pages); 3819 } 3820 SEGVN_UPDATE_MODBITS(ppa, pages, rw, 3821 prot, vpprot); 3822 if (!xhat) { 3823 hat_memload_array(hat, a, pgsz, ppa, 3824 prot & vpprot, hat_flag); 3825 } else { 3826 /* 3827 * avoid large xhat mappings to FS 3828 * pages so that hat_page_demote() 3829 * doesn't need to check for xhat 3830 * large mappings. 3831 */ 3832 for (i = 0; i < pages; i++) { 3833 hat_memload(hat, 3834 a + (i << PAGESHIFT), 3835 ppa[i], prot & vpprot, 3836 hat_flag); 3837 } 3838 } 3839 3840 if (!(hat_flag & HAT_LOAD_LOCK)) { 3841 for (i = 0; i < pages; i++) { 3842 page_unlock(ppa[i]); 3843 } 3844 } 3845 if (amp != NULL) { 3846 anon_array_exit(&an_cookie); 3847 ANON_LOCK_EXIT(&->a_rwlock); 3848 } 3849 goto next; 3850 } 3851 3852 /* 3853 * See if upsize is possible. 3854 */ 3855 if (pszc > szc && szc < seg->s_szc && 3856 (segvn_anypgsz_vnode || pszc >= seg->s_szc)) { 3857 pgcnt_t aphase; 3858 uint_t pszc1 = MIN(pszc, seg->s_szc); 3859 ppgsz = page_get_pagesize(pszc1); 3860 ppages = btop(ppgsz); 3861 aphase = btop(P2PHASE((uintptr_t)a, ppgsz)); 3862 3863 ASSERT(type != F_SOFTLOCK); 3864 3865 SEGVN_VMSTAT_FLTVNPAGES(31); 3866 if (aphase != P2PHASE(pfn, ppages)) { 3867 segvn_faultvnmpss_align_err4++; 3868 } else { 3869 SEGVN_VMSTAT_FLTVNPAGES(32); 3870 if (pplist != NULL) { 3871 page_t *pl = pplist; 3872 page_free_replacement_page(pl); 3873 page_create_putback(pages); 3874 } 3875 for (i = 0; i < pages; i++) { 3876 page_unlock(ppa[i]); 3877 } 3878 if (amp != NULL) { 3879 anon_array_exit(&an_cookie); 3880 ANON_LOCK_EXIT(&->a_rwlock); 3881 } 3882 pszc = pszc1; 3883 ierr = -2; 3884 break; 3885 } 3886 } 3887 3888 /* 3889 * check if we should use smallest mapping size. 3890 */ 3891 upgrdfail = 0; 3892 if (szc == 0 || xhat || 3893 (pszc >= szc && 3894 !IS_P2ALIGNED(pfn, pages)) || 3895 (pszc < szc && 3896 !segvn_full_szcpages(ppa, szc, &upgrdfail, 3897 &pszc))) { 3898 3899 if (upgrdfail && type != F_SOFTLOCK) { 3900 /* 3901 * segvn_full_szcpages failed to lock 3902 * all pages EXCL. Size down. 3903 */ 3904 ASSERT(pszc < szc); 3905 3906 SEGVN_VMSTAT_FLTVNPAGES(33); 3907 3908 if (pplist != NULL) { 3909 page_t *pl = pplist; 3910 page_free_replacement_page(pl); 3911 page_create_putback(pages); 3912 } 3913 3914 for (i = 0; i < pages; i++) { 3915 page_unlock(ppa[i]); 3916 } 3917 if (amp != NULL) { 3918 anon_array_exit(&an_cookie); 3919 ANON_LOCK_EXIT(&->a_rwlock); 3920 } 3921 ierr = -1; 3922 break; 3923 } 3924 if (szc != 0 && !xhat) { 3925 segvn_faultvnmpss_align_err5++; 3926 } 3927 SEGVN_VMSTAT_FLTVNPAGES(34); 3928 if (pplist != NULL) { 3929 page_free_replacement_page(pplist); 3930 page_create_putback(pages); 3931 } 3932 SEGVN_UPDATE_MODBITS(ppa, pages, rw, 3933 prot, vpprot); 3934 if (upgrdfail && segvn_anypgsz_vnode) { 3935 /* SOFTLOCK case */ 3936 hat_memload_array(hat, a, pgsz, 3937 ppa, prot & vpprot, hat_flag); 3938 } else { 3939 for (i = 0; i < pages; i++) { 3940 hat_memload(hat, 3941 a + (i << PAGESHIFT), 3942 ppa[i], prot & vpprot, 3943 hat_flag); 3944 } 3945 } 3946 if (!(hat_flag & HAT_LOAD_LOCK)) { 3947 for (i = 0; i < pages; i++) { 3948 page_unlock(ppa[i]); 3949 } 3950 } 3951 if (amp != NULL) { 3952 anon_array_exit(&an_cookie); 3953 ANON_LOCK_EXIT(&->a_rwlock); 3954 } 3955 goto next; 3956 } 3957 3958 if (pszc == szc) { 3959 /* 3960 * segvn_full_szcpages() upgraded pages szc. 3961 */ 3962 ASSERT(pszc == ppa[0]->p_szc); 3963 ASSERT(IS_P2ALIGNED(pfn, pages)); 3964 goto chkszc; 3965 } 3966 3967 if (pszc > szc) { 3968 kmutex_t *szcmtx; 3969 SEGVN_VMSTAT_FLTVNPAGES(35); 3970 /* 3971 * p_szc of ppa[0] can change since we haven't 3972 * locked all constituent pages. Call 3973 * page_lock_szc() to prevent szc changes. 3974 * This should be a rare case that happens when 3975 * multiple segments use a different page size 3976 * to map the same file offsets. 3977 */ 3978 szcmtx = page_szc_lock(ppa[0]); 3979 pszc = ppa[0]->p_szc; 3980 ASSERT(szcmtx != NULL || pszc == 0); 3981 ASSERT(ppa[0]->p_szc <= pszc); 3982 if (pszc <= szc) { 3983 SEGVN_VMSTAT_FLTVNPAGES(36); 3984 if (szcmtx != NULL) { 3985 mutex_exit(szcmtx); 3986 } 3987 goto chkszc; 3988 } 3989 if (pplist != NULL) { 3990 /* 3991 * page got promoted since last check. 3992 * we don't need preaalocated large 3993 * page. 3994 */ 3995 SEGVN_VMSTAT_FLTVNPAGES(37); 3996 page_free_replacement_page(pplist); 3997 page_create_putback(pages); 3998 } 3999 SEGVN_UPDATE_MODBITS(ppa, pages, rw, 4000 prot, vpprot); 4001 hat_memload_array(hat, a, pgsz, ppa, 4002 prot & vpprot, hat_flag); 4003 mutex_exit(szcmtx); 4004 if (!(hat_flag & HAT_LOAD_LOCK)) { 4005 for (i = 0; i < pages; i++) { 4006 page_unlock(ppa[i]); 4007 } 4008 } 4009 if (amp != NULL) { 4010 anon_array_exit(&an_cookie); 4011 ANON_LOCK_EXIT(&->a_rwlock); 4012 } 4013 goto next; 4014 } 4015 4016 /* 4017 * if page got demoted since last check 4018 * we could have not allocated larger page. 4019 * allocate now. 4020 */ 4021 if (pplist == NULL && 4022 page_alloc_pages(vp, seg, a, &pplist, NULL, 4023 szc, 0) && type != F_SOFTLOCK) { 4024 SEGVN_VMSTAT_FLTVNPAGES(38); 4025 for (i = 0; i < pages; i++) { 4026 page_unlock(ppa[i]); 4027 } 4028 if (amp != NULL) { 4029 anon_array_exit(&an_cookie); 4030 ANON_LOCK_EXIT(&->a_rwlock); 4031 } 4032 ierr = -1; 4033 alloc_failed |= (1 << szc); 4034 break; 4035 } 4036 4037 SEGVN_VMSTAT_FLTVNPAGES(39); 4038 4039 if (pplist != NULL) { 4040 segvn_relocate_pages(ppa, pplist); 4041 #ifdef DEBUG 4042 } else { 4043 ASSERT(type == F_SOFTLOCK); 4044 SEGVN_VMSTAT_FLTVNPAGES(40); 4045 #endif /* DEBUG */ 4046 } 4047 4048 SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot); 4049 4050 if (pplist == NULL && segvn_anypgsz_vnode == 0) { 4051 ASSERT(type == F_SOFTLOCK); 4052 for (i = 0; i < pages; i++) { 4053 ASSERT(ppa[i]->p_szc < szc); 4054 hat_memload(hat, a + (i << PAGESHIFT), 4055 ppa[i], prot & vpprot, hat_flag); 4056 } 4057 } else { 4058 ASSERT(pplist != NULL || type == F_SOFTLOCK); 4059 hat_memload_array(hat, a, pgsz, ppa, 4060 prot & vpprot, hat_flag); 4061 } 4062 if (!(hat_flag & HAT_LOAD_LOCK)) { 4063 for (i = 0; i < pages; i++) { 4064 ASSERT(PAGE_SHARED(ppa[i])); 4065 page_unlock(ppa[i]); 4066 } 4067 } 4068 if (amp != NULL) { 4069 anon_array_exit(&an_cookie); 4070 ANON_LOCK_EXIT(&->a_rwlock); 4071 } 4072 4073 next: 4074 if (vpage != NULL) { 4075 vpage += pages; 4076 } 4077 adjszc_chk = 1; 4078 } 4079 if (a == lpgeaddr) 4080 break; 4081 ASSERT(a < lpgeaddr); 4082 4083 ASSERT(!brkcow && type != F_SOFTLOCK); 4084 4085 /* 4086 * ierr == -1 means we failed to map with a large page. 4087 * (either due to allocation/relocation failures or 4088 * misalignment with other mappings to this file. 4089 * 4090 * ierr == -2 means some other thread allocated a large page 4091 * after we gave up tp map with a large page. retry with 4092 * larger mapping. 4093 */ 4094 ASSERT(ierr == -1 || ierr == -2); 4095 ASSERT(ierr == -2 || szc != 0); 4096 ASSERT(ierr == -1 || szc < seg->s_szc); 4097 if (ierr == -2) { 4098 SEGVN_VMSTAT_FLTVNPAGES(41); 4099 ASSERT(pszc > szc && pszc <= seg->s_szc); 4100 szc = pszc; 4101 } else if (segvn_anypgsz_vnode) { 4102 SEGVN_VMSTAT_FLTVNPAGES(42); 4103 szc--; 4104 } else { 4105 SEGVN_VMSTAT_FLTVNPAGES(43); 4106 ASSERT(pszc < szc); 4107 /* 4108 * other process created pszc large page. 4109 * but we still have to drop to 0 szc. 4110 */ 4111 szc = 0; 4112 } 4113 4114 pgsz = page_get_pagesize(szc); 4115 pages = btop(pgsz); 4116 if (ierr == -2) { 4117 /* 4118 * Size up case. Note lpgaddr may only be needed for 4119 * softlock case so we don't adjust it here. 4120 */ 4121 a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz); 4122 ASSERT(a >= lpgaddr); 4123 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4124 off = svd->offset + (uintptr_t)(a - seg->s_base); 4125 aindx = svd->anon_index + seg_page(seg, a); 4126 vpage = (svd->vpage != NULL) ? 4127 &svd->vpage[seg_page(seg, a)] : NULL; 4128 } else { 4129 /* 4130 * Size down case. Note lpgaddr may only be needed for 4131 * softlock case so we don't adjust it here. 4132 */ 4133 ASSERT(IS_P2ALIGNED(a, pgsz)); 4134 ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz)); 4135 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4136 ASSERT(a < lpgeaddr); 4137 if (a < addr) { 4138 SEGVN_VMSTAT_FLTVNPAGES(44); 4139 /* 4140 * The beginning of the large page region can 4141 * be pulled to the right to make a smaller 4142 * region. We haven't yet faulted a single 4143 * page. 4144 */ 4145 a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 4146 ASSERT(a >= lpgaddr); 4147 off = svd->offset + 4148 (uintptr_t)(a - seg->s_base); 4149 aindx = svd->anon_index + seg_page(seg, a); 4150 vpage = (svd->vpage != NULL) ? 4151 &svd->vpage[seg_page(seg, a)] : NULL; 4152 } 4153 } 4154 } 4155 out: 4156 kmem_free(ppa, ppasize); 4157 if (!err && !vop_size_err) { 4158 SEGVN_VMSTAT_FLTVNPAGES(45); 4159 return (0); 4160 } 4161 if (type == F_SOFTLOCK && a > lpgaddr) { 4162 SEGVN_VMSTAT_FLTVNPAGES(46); 4163 segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER); 4164 } 4165 if (!vop_size_err) { 4166 SEGVN_VMSTAT_FLTVNPAGES(47); 4167 return (err); 4168 } 4169 ASSERT(brkcow || type == F_SOFTLOCK); 4170 /* 4171 * Large page end is mapped beyond the end of file and it's a cow 4172 * fault or softlock so we can't reduce the map area. For now just 4173 * demote the segment. This should really only happen if the end of 4174 * the file changed after the mapping was established since when large 4175 * page segments are created we make sure they don't extend beyond the 4176 * end of the file. 4177 */ 4178 SEGVN_VMSTAT_FLTVNPAGES(48); 4179 4180 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4181 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4182 err = 0; 4183 if (seg->s_szc != 0) { 4184 segvn_fltvnpages_clrszc_cnt++; 4185 ASSERT(svd->softlockcnt == 0); 4186 err = segvn_clrszc(seg); 4187 if (err != 0) { 4188 segvn_fltvnpages_clrszc_err++; 4189 } 4190 } 4191 ASSERT(err || seg->s_szc == 0); 4192 SEGVN_LOCK_DOWNGRADE(seg->s_as, &svd->lock); 4193 /* segvn_fault will do its job as if szc had been zero to begin with */ 4194 return (err == 0 ? IE_RETRY : FC_MAKE_ERR(err)); 4195 } 4196 4197 /* 4198 * This routine will attempt to fault in one large page. 4199 * it will use smaller pages if that fails. 4200 * It should only be called for pure anonymous segments. 4201 */ 4202 static faultcode_t 4203 segvn_fault_anonpages(struct hat *hat, struct seg *seg, caddr_t lpgaddr, 4204 caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr, 4205 caddr_t eaddr, int brkcow) 4206 { 4207 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 4208 struct anon_map *amp = svd->amp; 4209 uchar_t segtype = svd->type; 4210 uint_t szc = seg->s_szc; 4211 size_t pgsz = page_get_pagesize(szc); 4212 size_t maxpgsz = pgsz; 4213 pgcnt_t pages = btop(pgsz); 4214 size_t ppasize = pages * sizeof (page_t *); 4215 caddr_t a = lpgaddr; 4216 ulong_t aindx = svd->anon_index + seg_page(seg, a); 4217 struct vpage *vpage = (svd->vpage != NULL) ? 4218 &svd->vpage[seg_page(seg, a)] : NULL; 4219 page_t **ppa; 4220 uint_t ppa_szc; 4221 faultcode_t err; 4222 int ierr; 4223 uint_t protchk, prot, vpprot; 4224 ulong_t i; 4225 int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD; 4226 anon_sync_obj_t cookie; 4227 int first = 1; 4228 int adjszc_chk; 4229 int purged = 0; 4230 4231 ASSERT(szc != 0); 4232 ASSERT(amp != NULL); 4233 ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */ 4234 ASSERT(!(svd->flags & MAP_NORESERVE)); 4235 ASSERT(type != F_SOFTUNLOCK); 4236 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 4237 4238 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 4239 4240 VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltanpages[0]); 4241 VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltanpages[1]); 4242 4243 if (svd->flags & MAP_TEXT) { 4244 hat_flag |= HAT_LOAD_TEXT; 4245 } 4246 4247 if (svd->pageprot) { 4248 switch (rw) { 4249 case S_READ: 4250 protchk = PROT_READ; 4251 break; 4252 case S_WRITE: 4253 protchk = PROT_WRITE; 4254 break; 4255 case S_EXEC: 4256 protchk = PROT_EXEC; 4257 break; 4258 case S_OTHER: 4259 default: 4260 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 4261 break; 4262 } 4263 VM_STAT_ADD(segvnvmstats.fltanpages[2]); 4264 } else { 4265 prot = svd->prot; 4266 /* caller has already done segment level protection check. */ 4267 } 4268 4269 ppa = kmem_alloc(ppasize, KM_SLEEP); 4270 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4271 for (;;) { 4272 adjszc_chk = 0; 4273 for (; a < lpgeaddr; a += pgsz, aindx += pages) { 4274 if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) { 4275 VM_STAT_ADD(segvnvmstats.fltanpages[3]); 4276 ASSERT(vpage != NULL); 4277 prot = VPP_PROT(vpage); 4278 ASSERT(sameprot(seg, a, maxpgsz)); 4279 if ((prot & protchk) == 0) { 4280 err = FC_PROT; 4281 goto error; 4282 } 4283 } 4284 if (adjszc_chk && IS_P2ALIGNED(a, maxpgsz) && 4285 pgsz < maxpgsz) { 4286 ASSERT(a > lpgaddr); 4287 szc = seg->s_szc; 4288 pgsz = maxpgsz; 4289 pages = btop(pgsz); 4290 ASSERT(IS_P2ALIGNED(aindx, pages)); 4291 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, 4292 pgsz); 4293 } 4294 if (type == F_SOFTLOCK && svd->vp != NULL) { 4295 mutex_enter(&freemem_lock); 4296 if (availrmem < tune.t_minarmem + pages) { 4297 mutex_exit(&freemem_lock); 4298 err = FC_MAKE_ERR(ENOMEM); 4299 goto error; 4300 } else { 4301 availrmem -= pages; 4302 segvn_pages_locked += pages; 4303 svd->softlockcnt += pages; 4304 } 4305 mutex_exit(&freemem_lock); 4306 } 4307 anon_array_enter(amp, aindx, &cookie); 4308 ppa_szc = (uint_t)-1; 4309 ierr = anon_map_getpages(amp, aindx, szc, seg, a, 4310 prot, &vpprot, ppa, &ppa_szc, vpage, rw, brkcow, 4311 segvn_anypgsz, svd->cred); 4312 if (ierr != 0) { 4313 anon_array_exit(&cookie); 4314 VM_STAT_ADD(segvnvmstats.fltanpages[4]); 4315 if (type == F_SOFTLOCK && svd->vp != NULL) { 4316 VM_STAT_ADD(segvnvmstats.fltanpages[5]); 4317 mutex_enter(&freemem_lock); 4318 availrmem += pages; 4319 segvn_pages_locked -= pages; 4320 svd->softlockcnt -= pages; 4321 mutex_exit(&freemem_lock); 4322 } 4323 if (ierr > 0) { 4324 VM_STAT_ADD(segvnvmstats.fltanpages[6]); 4325 err = FC_MAKE_ERR(ierr); 4326 goto error; 4327 } 4328 break; 4329 } 4330 4331 ASSERT(!IS_VMODSORT(ppa[0]->p_vnode)); 4332 4333 ASSERT(segtype == MAP_SHARED || 4334 ppa[0]->p_szc <= szc); 4335 ASSERT(segtype == MAP_PRIVATE || 4336 ppa[0]->p_szc >= szc); 4337 4338 /* 4339 * Handle pages that have been marked for migration 4340 */ 4341 if (lgrp_optimizations()) 4342 page_migrate(seg, a, ppa, pages); 4343 4344 if (type == F_SOFTLOCK && svd->vp == NULL) { 4345 /* 4346 * All pages in ppa array belong to the same 4347 * large page. This means it's ok to call 4348 * segvn_pp_lock_anonpages just for ppa[0]. 4349 */ 4350 if (!segvn_pp_lock_anonpages(ppa[0], first)) { 4351 for (i = 0; i < pages; i++) { 4352 page_unlock(ppa[i]); 4353 } 4354 err = FC_MAKE_ERR(ENOMEM); 4355 goto error; 4356 } 4357 first = 0; 4358 mutex_enter(&freemem_lock); 4359 svd->softlockcnt += pages; 4360 segvn_pages_locked += pages; 4361 mutex_exit(&freemem_lock); 4362 } 4363 4364 if (segtype == MAP_SHARED) { 4365 vpprot |= PROT_WRITE; 4366 } 4367 4368 hat_memload_array(hat, a, pgsz, ppa, 4369 prot & vpprot, hat_flag); 4370 4371 if (hat_flag & HAT_LOAD_LOCK) { 4372 VM_STAT_ADD(segvnvmstats.fltanpages[7]); 4373 } else { 4374 VM_STAT_ADD(segvnvmstats.fltanpages[8]); 4375 for (i = 0; i < pages; i++) 4376 page_unlock(ppa[i]); 4377 } 4378 if (vpage != NULL) 4379 vpage += pages; 4380 4381 anon_array_exit(&cookie); 4382 adjszc_chk = 1; 4383 } 4384 if (a == lpgeaddr) 4385 break; 4386 ASSERT(a < lpgeaddr); 4387 /* 4388 * ierr == -1 means we failed to allocate a large page. 4389 * so do a size down operation. 4390 * 4391 * ierr == -2 means some other process that privately shares 4392 * pages with this process has allocated a larger page and we 4393 * need to retry with larger pages. So do a size up 4394 * operation. This relies on the fact that large pages are 4395 * never partially shared i.e. if we share any constituent 4396 * page of a large page with another process we must share the 4397 * entire large page. Note this cannot happen for SOFTLOCK 4398 * case, unless current address (a) is at the beginning of the 4399 * next page size boundary because the other process couldn't 4400 * have relocated locked pages. 4401 */ 4402 ASSERT(ierr == -1 || ierr == -2); 4403 /* 4404 * For the very first relocation failure try to purge this 4405 * segment's cache so that the relocator can obtain an 4406 * exclusive lock on pages we want to relocate. 4407 */ 4408 if (!purged && ierr == -1 && ppa_szc != (uint_t)-1 && 4409 svd->softlockcnt != 0) { 4410 purged = 1; 4411 segvn_purge(seg); 4412 continue; 4413 } 4414 4415 if (segvn_anypgsz) { 4416 ASSERT(ierr == -2 || szc != 0); 4417 ASSERT(ierr == -1 || szc < seg->s_szc); 4418 szc = (ierr == -1) ? szc - 1 : szc + 1; 4419 } else { 4420 /* 4421 * For non COW faults and segvn_anypgsz == 0 4422 * we need to be careful not to loop forever 4423 * if existing page is found with szc other 4424 * than 0 or seg->s_szc. This could be due 4425 * to page relocations on behalf of DR or 4426 * more likely large page creation. For this 4427 * case simply re-size to existing page's szc 4428 * if returned by anon_map_getpages(). 4429 */ 4430 if (ppa_szc == (uint_t)-1) { 4431 szc = (ierr == -1) ? 0 : seg->s_szc; 4432 } else { 4433 ASSERT(ppa_szc <= seg->s_szc); 4434 ASSERT(ierr == -2 || ppa_szc < szc); 4435 ASSERT(ierr == -1 || ppa_szc > szc); 4436 szc = ppa_szc; 4437 } 4438 } 4439 4440 pgsz = page_get_pagesize(szc); 4441 pages = btop(pgsz); 4442 ASSERT(type != F_SOFTLOCK || ierr == -1 || 4443 (IS_P2ALIGNED(a, pgsz) && IS_P2ALIGNED(lpgeaddr, pgsz))); 4444 if (type == F_SOFTLOCK) { 4445 /* 4446 * For softlocks we cannot reduce the fault area 4447 * (calculated based on the largest page size for this 4448 * segment) for size down and a is already next 4449 * page size aligned as assertted above for size 4450 * ups. Therefore just continue in case of softlock. 4451 */ 4452 VM_STAT_ADD(segvnvmstats.fltanpages[9]); 4453 continue; /* keep lint happy */ 4454 } else if (ierr == -2) { 4455 4456 /* 4457 * Size up case. Note lpgaddr may only be needed for 4458 * softlock case so we don't adjust it here. 4459 */ 4460 VM_STAT_ADD(segvnvmstats.fltanpages[10]); 4461 a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz); 4462 ASSERT(a >= lpgaddr); 4463 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4464 aindx = svd->anon_index + seg_page(seg, a); 4465 vpage = (svd->vpage != NULL) ? 4466 &svd->vpage[seg_page(seg, a)] : NULL; 4467 } else { 4468 /* 4469 * Size down case. Note lpgaddr may only be needed for 4470 * softlock case so we don't adjust it here. 4471 */ 4472 VM_STAT_ADD(segvnvmstats.fltanpages[11]); 4473 ASSERT(IS_P2ALIGNED(a, pgsz)); 4474 ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz)); 4475 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4476 ASSERT(a < lpgeaddr); 4477 if (a < addr) { 4478 /* 4479 * The beginning of the large page region can 4480 * be pulled to the right to make a smaller 4481 * region. We haven't yet faulted a single 4482 * page. 4483 */ 4484 VM_STAT_ADD(segvnvmstats.fltanpages[12]); 4485 a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 4486 ASSERT(a >= lpgaddr); 4487 aindx = svd->anon_index + seg_page(seg, a); 4488 vpage = (svd->vpage != NULL) ? 4489 &svd->vpage[seg_page(seg, a)] : NULL; 4490 } 4491 } 4492 } 4493 VM_STAT_ADD(segvnvmstats.fltanpages[13]); 4494 ANON_LOCK_EXIT(&->a_rwlock); 4495 kmem_free(ppa, ppasize); 4496 return (0); 4497 error: 4498 VM_STAT_ADD(segvnvmstats.fltanpages[14]); 4499 ANON_LOCK_EXIT(&->a_rwlock); 4500 kmem_free(ppa, ppasize); 4501 if (type == F_SOFTLOCK && a > lpgaddr) { 4502 VM_STAT_ADD(segvnvmstats.fltanpages[15]); 4503 segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER); 4504 } 4505 return (err); 4506 } 4507 4508 int fltadvice = 1; /* set to free behind pages for sequential access */ 4509 4510 /* 4511 * This routine is called via a machine specific fault handling routine. 4512 * It is also called by software routines wishing to lock or unlock 4513 * a range of addresses. 4514 * 4515 * Here is the basic algorithm: 4516 * If unlocking 4517 * Call segvn_softunlock 4518 * Return 4519 * endif 4520 * Checking and set up work 4521 * If we will need some non-anonymous pages 4522 * Call VOP_GETPAGE over the range of non-anonymous pages 4523 * endif 4524 * Loop over all addresses requested 4525 * Call segvn_faultpage passing in page list 4526 * to load up translations and handle anonymous pages 4527 * endloop 4528 * Load up translation to any additional pages in page list not 4529 * already handled that fit into this segment 4530 */ 4531 static faultcode_t 4532 segvn_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len, 4533 enum fault_type type, enum seg_rw rw) 4534 { 4535 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 4536 page_t **plp, **ppp, *pp; 4537 u_offset_t off; 4538 caddr_t a; 4539 struct vpage *vpage; 4540 uint_t vpprot, prot; 4541 int err; 4542 page_t *pl[PVN_GETPAGE_NUM + 1]; 4543 size_t plsz, pl_alloc_sz; 4544 size_t page; 4545 ulong_t anon_index; 4546 struct anon_map *amp; 4547 int dogetpage = 0; 4548 caddr_t lpgaddr, lpgeaddr; 4549 size_t pgsz; 4550 anon_sync_obj_t cookie; 4551 int brkcow = BREAK_COW_SHARE(rw, type, svd->type); 4552 4553 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 4554 4555 /* 4556 * First handle the easy stuff 4557 */ 4558 if (type == F_SOFTUNLOCK) { 4559 if (rw == S_READ_NOCOW) { 4560 rw = S_READ; 4561 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 4562 } 4563 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 4564 pgsz = (seg->s_szc == 0) ? PAGESIZE : 4565 page_get_pagesize(seg->s_szc); 4566 VM_STAT_COND_ADD(pgsz > PAGESIZE, segvnvmstats.fltanpages[16]); 4567 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 4568 segvn_softunlock(seg, lpgaddr, lpgeaddr - lpgaddr, rw); 4569 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4570 return (0); 4571 } 4572 4573 top: 4574 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 4575 4576 /* 4577 * If we have the same protections for the entire segment, 4578 * insure that the access being attempted is legitimate. 4579 */ 4580 4581 if (svd->pageprot == 0) { 4582 uint_t protchk; 4583 4584 switch (rw) { 4585 case S_READ: 4586 case S_READ_NOCOW: 4587 protchk = PROT_READ; 4588 break; 4589 case S_WRITE: 4590 protchk = PROT_WRITE; 4591 break; 4592 case S_EXEC: 4593 protchk = PROT_EXEC; 4594 break; 4595 case S_OTHER: 4596 default: 4597 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 4598 break; 4599 } 4600 4601 if ((svd->prot & protchk) == 0) { 4602 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4603 return (FC_PROT); /* illegal access type */ 4604 } 4605 } 4606 4607 /* 4608 * We can't allow the long term use of softlocks for vmpss segments, 4609 * because in some file truncation cases we should be able to demote 4610 * the segment, which requires that there are no softlocks. The 4611 * only case where it's ok to allow a SOFTLOCK fault against a vmpss 4612 * segment is S_READ_NOCOW, where the caller holds the address space 4613 * locked as writer and calls softunlock before dropping the as lock. 4614 * S_READ_NOCOW is used by /proc to read memory from another user. 4615 * 4616 * Another deadlock between SOFTLOCK and file truncation can happen 4617 * because segvn_fault_vnodepages() calls the FS one pagesize at 4618 * a time. A second VOP_GETPAGE() call by segvn_fault_vnodepages() 4619 * can cause a deadlock because the first set of page_t's remain 4620 * locked SE_SHARED. To avoid this, we demote segments on a first 4621 * SOFTLOCK if they have a length greater than the segment's 4622 * page size. 4623 * 4624 * So for now, we only avoid demoting a segment on a SOFTLOCK when 4625 * the access type is S_READ_NOCOW and the fault length is less than 4626 * or equal to the segment's page size. While this is quite restrictive, 4627 * it should be the most common case of SOFTLOCK against a vmpss 4628 * segment. 4629 * 4630 * For S_READ_NOCOW, it's safe not to do a copy on write because the 4631 * caller makes sure no COW will be caused by another thread for a 4632 * softlocked page. 4633 */ 4634 if (type == F_SOFTLOCK && svd->vp != NULL && seg->s_szc != 0) { 4635 int demote = 0; 4636 4637 if (rw != S_READ_NOCOW) { 4638 demote = 1; 4639 } 4640 if (!demote && len > PAGESIZE) { 4641 pgsz = page_get_pagesize(seg->s_szc); 4642 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, 4643 lpgeaddr); 4644 if (lpgeaddr - lpgaddr > pgsz) { 4645 demote = 1; 4646 } 4647 } 4648 4649 ASSERT(demote || AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 4650 4651 if (demote) { 4652 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4653 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4654 if (seg->s_szc != 0) { 4655 segvn_vmpss_clrszc_cnt++; 4656 ASSERT(svd->softlockcnt == 0); 4657 err = segvn_clrszc(seg); 4658 if (err) { 4659 segvn_vmpss_clrszc_err++; 4660 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4661 return (FC_MAKE_ERR(err)); 4662 } 4663 } 4664 ASSERT(seg->s_szc == 0); 4665 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4666 goto top; 4667 } 4668 } 4669 4670 /* 4671 * Check to see if we need to allocate an anon_map structure. 4672 */ 4673 if (svd->amp == NULL && (svd->vp == NULL || brkcow)) { 4674 /* 4675 * Drop the "read" lock on the segment and acquire 4676 * the "write" version since we have to allocate the 4677 * anon_map. 4678 */ 4679 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4680 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4681 4682 if (svd->amp == NULL) { 4683 svd->amp = anonmap_alloc(seg->s_size, 0); 4684 svd->amp->a_szc = seg->s_szc; 4685 } 4686 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4687 4688 /* 4689 * Start all over again since segment protections 4690 * may have changed after we dropped the "read" lock. 4691 */ 4692 goto top; 4693 } 4694 4695 /* 4696 * S_READ_NOCOW vs S_READ distinction was 4697 * only needed for the code above. After 4698 * that we treat it as S_READ. 4699 */ 4700 if (rw == S_READ_NOCOW) { 4701 ASSERT(type == F_SOFTLOCK); 4702 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 4703 rw = S_READ; 4704 } 4705 4706 amp = svd->amp; 4707 4708 /* 4709 * MADV_SEQUENTIAL work is ignored for large page segments. 4710 */ 4711 if (seg->s_szc != 0) { 4712 pgsz = page_get_pagesize(seg->s_szc); 4713 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 4714 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 4715 if (svd->vp == NULL) { 4716 err = segvn_fault_anonpages(hat, seg, lpgaddr, 4717 lpgeaddr, type, rw, addr, addr + len, brkcow); 4718 } else { 4719 err = segvn_fault_vnodepages(hat, seg, lpgaddr, 4720 lpgeaddr, type, rw, addr, addr + len, brkcow); 4721 if (err == IE_RETRY) { 4722 ASSERT(seg->s_szc == 0); 4723 ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock)); 4724 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4725 goto top; 4726 } 4727 } 4728 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4729 return (err); 4730 } 4731 4732 page = seg_page(seg, addr); 4733 if (amp != NULL) { 4734 anon_index = svd->anon_index + page; 4735 4736 if ((type == F_PROT) && (rw == S_READ) && 4737 svd->type == MAP_PRIVATE && svd->pageprot == 0) { 4738 size_t index = anon_index; 4739 struct anon *ap; 4740 4741 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4742 /* 4743 * The fast path could apply to S_WRITE also, except 4744 * that the protection fault could be caused by lazy 4745 * tlb flush when ro->rw. In this case, the pte is 4746 * RW already. But RO in the other cpu's tlb causes 4747 * the fault. Since hat_chgprot won't do anything if 4748 * pte doesn't change, we may end up faulting 4749 * indefinitely until the RO tlb entry gets replaced. 4750 */ 4751 for (a = addr; a < addr + len; a += PAGESIZE, index++) { 4752 anon_array_enter(amp, index, &cookie); 4753 ap = anon_get_ptr(amp->ahp, index); 4754 anon_array_exit(&cookie); 4755 if ((ap == NULL) || (ap->an_refcnt != 1)) { 4756 ANON_LOCK_EXIT(&->a_rwlock); 4757 goto slow; 4758 } 4759 } 4760 hat_chgprot(seg->s_as->a_hat, addr, len, svd->prot); 4761 ANON_LOCK_EXIT(&->a_rwlock); 4762 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4763 return (0); 4764 } 4765 } 4766 slow: 4767 4768 if (svd->vpage == NULL) 4769 vpage = NULL; 4770 else 4771 vpage = &svd->vpage[page]; 4772 4773 off = svd->offset + (uintptr_t)(addr - seg->s_base); 4774 4775 /* 4776 * If MADV_SEQUENTIAL has been set for the particular page we 4777 * are faulting on, free behind all pages in the segment and put 4778 * them on the free list. 4779 */ 4780 if ((page != 0) && fltadvice) { /* not if first page in segment */ 4781 struct vpage *vpp; 4782 ulong_t fanon_index; 4783 size_t fpage; 4784 u_offset_t pgoff, fpgoff; 4785 struct vnode *fvp; 4786 struct anon *fap = NULL; 4787 4788 if (svd->advice == MADV_SEQUENTIAL || 4789 (svd->pageadvice && 4790 VPP_ADVICE(vpage) == MADV_SEQUENTIAL)) { 4791 pgoff = off - PAGESIZE; 4792 fpage = page - 1; 4793 if (vpage != NULL) 4794 vpp = &svd->vpage[fpage]; 4795 if (amp != NULL) 4796 fanon_index = svd->anon_index + fpage; 4797 4798 while (pgoff > svd->offset) { 4799 if (svd->advice != MADV_SEQUENTIAL && 4800 (!svd->pageadvice || (vpage && 4801 VPP_ADVICE(vpp) != MADV_SEQUENTIAL))) 4802 break; 4803 4804 /* 4805 * If this is an anon page, we must find the 4806 * correct <vp, offset> for it 4807 */ 4808 fap = NULL; 4809 if (amp != NULL) { 4810 ANON_LOCK_ENTER(&->a_rwlock, 4811 RW_READER); 4812 anon_array_enter(amp, fanon_index, 4813 &cookie); 4814 fap = anon_get_ptr(amp->ahp, 4815 fanon_index); 4816 if (fap != NULL) { 4817 swap_xlate(fap, &fvp, &fpgoff); 4818 } else { 4819 fpgoff = pgoff; 4820 fvp = svd->vp; 4821 } 4822 anon_array_exit(&cookie); 4823 ANON_LOCK_EXIT(&->a_rwlock); 4824 } else { 4825 fpgoff = pgoff; 4826 fvp = svd->vp; 4827 } 4828 if (fvp == NULL) 4829 break; /* XXX */ 4830 /* 4831 * Skip pages that are free or have an 4832 * "exclusive" lock. 4833 */ 4834 pp = page_lookup_nowait(fvp, fpgoff, SE_SHARED); 4835 if (pp == NULL) 4836 break; 4837 /* 4838 * We don't need the page_struct_lock to test 4839 * as this is only advisory; even if we 4840 * acquire it someone might race in and lock 4841 * the page after we unlock and before the 4842 * PUTPAGE, then VOP_PUTPAGE will do nothing. 4843 */ 4844 if (pp->p_lckcnt == 0 && pp->p_cowcnt == 0) { 4845 /* 4846 * Hold the vnode before releasing 4847 * the page lock to prevent it from 4848 * being freed and re-used by some 4849 * other thread. 4850 */ 4851 VN_HOLD(fvp); 4852 page_unlock(pp); 4853 /* 4854 * We should build a page list 4855 * to kluster putpages XXX 4856 */ 4857 (void) VOP_PUTPAGE(fvp, 4858 (offset_t)fpgoff, PAGESIZE, 4859 (B_DONTNEED|B_FREE|B_ASYNC), 4860 svd->cred); 4861 VN_RELE(fvp); 4862 } else { 4863 /* 4864 * XXX - Should the loop terminate if 4865 * the page is `locked'? 4866 */ 4867 page_unlock(pp); 4868 } 4869 --vpp; 4870 --fanon_index; 4871 pgoff -= PAGESIZE; 4872 } 4873 } 4874 } 4875 4876 plp = pl; 4877 *plp = NULL; 4878 pl_alloc_sz = 0; 4879 4880 /* 4881 * See if we need to call VOP_GETPAGE for 4882 * *any* of the range being faulted on. 4883 * We can skip all of this work if there 4884 * was no original vnode. 4885 */ 4886 if (svd->vp != NULL) { 4887 u_offset_t vp_off; 4888 size_t vp_len; 4889 struct anon *ap; 4890 vnode_t *vp; 4891 4892 vp_off = off; 4893 vp_len = len; 4894 4895 if (amp == NULL) 4896 dogetpage = 1; 4897 else { 4898 /* 4899 * Only acquire reader lock to prevent amp->ahp 4900 * from being changed. It's ok to miss pages, 4901 * hence we don't do anon_array_enter 4902 */ 4903 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4904 ap = anon_get_ptr(amp->ahp, anon_index); 4905 4906 if (len <= PAGESIZE) 4907 /* inline non_anon() */ 4908 dogetpage = (ap == NULL); 4909 else 4910 dogetpage = non_anon(amp->ahp, anon_index, 4911 &vp_off, &vp_len); 4912 ANON_LOCK_EXIT(&->a_rwlock); 4913 } 4914 4915 if (dogetpage) { 4916 enum seg_rw arw; 4917 struct as *as = seg->s_as; 4918 4919 if (len > ptob((sizeof (pl) / sizeof (pl[0])) - 1)) { 4920 /* 4921 * Page list won't fit in local array, 4922 * allocate one of the needed size. 4923 */ 4924 pl_alloc_sz = 4925 (btop(len) + 1) * sizeof (page_t *); 4926 plp = kmem_alloc(pl_alloc_sz, KM_SLEEP); 4927 plp[0] = NULL; 4928 plsz = len; 4929 } else if (rw == S_WRITE && svd->type == MAP_PRIVATE || 4930 rw == S_OTHER || 4931 (((size_t)(addr + PAGESIZE) < 4932 (size_t)(seg->s_base + seg->s_size)) && 4933 hat_probe(as->a_hat, addr + PAGESIZE))) { 4934 /* 4935 * Ask VOP_GETPAGE to return the exact number 4936 * of pages if 4937 * (a) this is a COW fault, or 4938 * (b) this is a software fault, or 4939 * (c) next page is already mapped. 4940 */ 4941 plsz = len; 4942 } else { 4943 /* 4944 * Ask VOP_GETPAGE to return adjacent pages 4945 * within the segment. 4946 */ 4947 plsz = MIN((size_t)PVN_GETPAGE_SZ, (size_t) 4948 ((seg->s_base + seg->s_size) - addr)); 4949 ASSERT((addr + plsz) <= 4950 (seg->s_base + seg->s_size)); 4951 } 4952 4953 /* 4954 * Need to get some non-anonymous pages. 4955 * We need to make only one call to GETPAGE to do 4956 * this to prevent certain deadlocking conditions 4957 * when we are doing locking. In this case 4958 * non_anon() should have picked up the smallest 4959 * range which includes all the non-anonymous 4960 * pages in the requested range. We have to 4961 * be careful regarding which rw flag to pass in 4962 * because on a private mapping, the underlying 4963 * object is never allowed to be written. 4964 */ 4965 if (rw == S_WRITE && svd->type == MAP_PRIVATE) { 4966 arw = S_READ; 4967 } else { 4968 arw = rw; 4969 } 4970 vp = svd->vp; 4971 TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE, 4972 "segvn_getpage:seg %p addr %p vp %p", 4973 seg, addr, vp); 4974 err = VOP_GETPAGE(vp, (offset_t)vp_off, vp_len, 4975 &vpprot, plp, plsz, seg, addr + (vp_off - off), arw, 4976 svd->cred); 4977 if (err) { 4978 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4979 segvn_pagelist_rele(plp); 4980 if (pl_alloc_sz) 4981 kmem_free(plp, pl_alloc_sz); 4982 return (FC_MAKE_ERR(err)); 4983 } 4984 if (svd->type == MAP_PRIVATE) 4985 vpprot &= ~PROT_WRITE; 4986 } 4987 } 4988 4989 /* 4990 * N.B. at this time the plp array has all the needed non-anon 4991 * pages in addition to (possibly) having some adjacent pages. 4992 */ 4993 4994 /* 4995 * Always acquire the anon_array_lock to prevent 4996 * 2 threads from allocating separate anon slots for 4997 * the same "addr". 4998 * 4999 * If this is a copy-on-write fault and we don't already 5000 * have the anon_array_lock, acquire it to prevent the 5001 * fault routine from handling multiple copy-on-write faults 5002 * on the same "addr" in the same address space. 5003 * 5004 * Only one thread should deal with the fault since after 5005 * it is handled, the other threads can acquire a translation 5006 * to the newly created private page. This prevents two or 5007 * more threads from creating different private pages for the 5008 * same fault. 5009 * 5010 * We grab "serialization" lock here if this is a MAP_PRIVATE segment 5011 * to prevent deadlock between this thread and another thread 5012 * which has soft-locked this page and wants to acquire serial_lock. 5013 * ( bug 4026339 ) 5014 * 5015 * The fix for bug 4026339 becomes unnecessary when using the 5016 * locking scheme with per amp rwlock and a global set of hash 5017 * lock, anon_array_lock. If we steal a vnode page when low 5018 * on memory and upgrad the page lock through page_rename, 5019 * then the page is PAGE_HANDLED, nothing needs to be done 5020 * for this page after returning from segvn_faultpage. 5021 * 5022 * But really, the page lock should be downgraded after 5023 * the stolen page is page_rename'd. 5024 */ 5025 5026 if (amp != NULL) 5027 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5028 5029 /* 5030 * Ok, now loop over the address range and handle faults 5031 */ 5032 for (a = addr; a < addr + len; a += PAGESIZE, off += PAGESIZE) { 5033 err = segvn_faultpage(hat, seg, a, off, vpage, plp, vpprot, 5034 type, rw, brkcow, a == addr); 5035 if (err) { 5036 if (amp != NULL) 5037 ANON_LOCK_EXIT(&->a_rwlock); 5038 if (type == F_SOFTLOCK && a > addr) { 5039 segvn_softunlock(seg, addr, (a - addr), 5040 S_OTHER); 5041 } 5042 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5043 segvn_pagelist_rele(plp); 5044 if (pl_alloc_sz) 5045 kmem_free(plp, pl_alloc_sz); 5046 return (err); 5047 } 5048 if (vpage) { 5049 vpage++; 5050 } else if (svd->vpage) { 5051 page = seg_page(seg, addr); 5052 vpage = &svd->vpage[++page]; 5053 } 5054 } 5055 5056 /* Didn't get pages from the underlying fs so we're done */ 5057 if (!dogetpage) 5058 goto done; 5059 5060 /* 5061 * Now handle any other pages in the list returned. 5062 * If the page can be used, load up the translations now. 5063 * Note that the for loop will only be entered if "plp" 5064 * is pointing to a non-NULL page pointer which means that 5065 * VOP_GETPAGE() was called and vpprot has been initialized. 5066 */ 5067 if (svd->pageprot == 0) 5068 prot = svd->prot & vpprot; 5069 5070 5071 /* 5072 * Large Files: diff should be unsigned value because we started 5073 * supporting > 2GB segment sizes from 2.5.1 and when a 5074 * large file of size > 2GB gets mapped to address space 5075 * the diff value can be > 2GB. 5076 */ 5077 5078 for (ppp = plp; (pp = *ppp) != NULL; ppp++) { 5079 size_t diff; 5080 struct anon *ap; 5081 int anon_index; 5082 anon_sync_obj_t cookie; 5083 int hat_flag = HAT_LOAD_ADV; 5084 5085 if (svd->flags & MAP_TEXT) { 5086 hat_flag |= HAT_LOAD_TEXT; 5087 } 5088 5089 if (pp == PAGE_HANDLED) 5090 continue; 5091 5092 if (pp->p_offset >= svd->offset && 5093 (pp->p_offset < svd->offset + seg->s_size)) { 5094 5095 diff = pp->p_offset - svd->offset; 5096 5097 /* 5098 * Large Files: Following is the assertion 5099 * validating the above cast. 5100 */ 5101 ASSERT(svd->vp == pp->p_vnode); 5102 5103 page = btop(diff); 5104 if (svd->pageprot) 5105 prot = VPP_PROT(&svd->vpage[page]) & vpprot; 5106 5107 /* 5108 * Prevent other threads in the address space from 5109 * creating private pages (i.e., allocating anon slots) 5110 * while we are in the process of loading translations 5111 * to additional pages returned by the underlying 5112 * object. 5113 */ 5114 if (amp != NULL) { 5115 anon_index = svd->anon_index + page; 5116 anon_array_enter(amp, anon_index, &cookie); 5117 ap = anon_get_ptr(amp->ahp, anon_index); 5118 } 5119 if ((amp == NULL) || (ap == NULL)) { 5120 if (IS_VMODSORT(pp->p_vnode) || 5121 enable_mbit_wa) { 5122 if (rw == S_WRITE) 5123 hat_setmod(pp); 5124 else if (rw != S_OTHER && 5125 !hat_ismod(pp)) 5126 prot &= ~PROT_WRITE; 5127 } 5128 /* 5129 * Skip mapping read ahead pages marked 5130 * for migration, so they will get migrated 5131 * properly on fault 5132 */ 5133 if ((prot & PROT_READ) && !PP_ISMIGRATE(pp)) { 5134 hat_memload(hat, seg->s_base + diff, 5135 pp, prot, hat_flag); 5136 } 5137 } 5138 if (amp != NULL) 5139 anon_array_exit(&cookie); 5140 } 5141 page_unlock(pp); 5142 } 5143 done: 5144 if (amp != NULL) 5145 ANON_LOCK_EXIT(&->a_rwlock); 5146 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5147 if (pl_alloc_sz) 5148 kmem_free(plp, pl_alloc_sz); 5149 return (0); 5150 } 5151 5152 /* 5153 * This routine is used to start I/O on pages asynchronously. XXX it will 5154 * only create PAGESIZE pages. At fault time they will be relocated into 5155 * larger pages. 5156 */ 5157 static faultcode_t 5158 segvn_faulta(struct seg *seg, caddr_t addr) 5159 { 5160 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5161 int err; 5162 struct anon_map *amp; 5163 vnode_t *vp; 5164 5165 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5166 5167 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 5168 if ((amp = svd->amp) != NULL) { 5169 struct anon *ap; 5170 5171 /* 5172 * Reader lock to prevent amp->ahp from being changed. 5173 * This is advisory, it's ok to miss a page, so 5174 * we don't do anon_array_enter lock. 5175 */ 5176 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5177 if ((ap = anon_get_ptr(amp->ahp, 5178 svd->anon_index + seg_page(seg, addr))) != NULL) { 5179 5180 err = anon_getpage(&ap, NULL, NULL, 5181 0, seg, addr, S_READ, svd->cred); 5182 5183 ANON_LOCK_EXIT(&->a_rwlock); 5184 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5185 if (err) 5186 return (FC_MAKE_ERR(err)); 5187 return (0); 5188 } 5189 ANON_LOCK_EXIT(&->a_rwlock); 5190 } 5191 5192 if (svd->vp == NULL) { 5193 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5194 return (0); /* zfod page - do nothing now */ 5195 } 5196 5197 vp = svd->vp; 5198 TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE, 5199 "segvn_getpage:seg %p addr %p vp %p", seg, addr, vp); 5200 err = VOP_GETPAGE(vp, 5201 (offset_t)(svd->offset + (uintptr_t)(addr - seg->s_base)), 5202 PAGESIZE, NULL, NULL, 0, seg, addr, 5203 S_OTHER, svd->cred); 5204 5205 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5206 if (err) 5207 return (FC_MAKE_ERR(err)); 5208 return (0); 5209 } 5210 5211 static int 5212 segvn_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) 5213 { 5214 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5215 struct vpage *svp, *evp; 5216 struct vnode *vp; 5217 size_t pgsz; 5218 pgcnt_t pgcnt; 5219 anon_sync_obj_t cookie; 5220 5221 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5222 5223 if ((svd->maxprot & prot) != prot) 5224 return (EACCES); /* violated maxprot */ 5225 5226 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 5227 5228 /* return if prot is the same */ 5229 if (!svd->pageprot && svd->prot == prot) { 5230 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5231 return (0); 5232 } 5233 5234 /* 5235 * Since we change protections we first have to flush the cache. 5236 * This makes sure all the pagelock calls have to recheck 5237 * protections. 5238 */ 5239 if (svd->softlockcnt > 0) { 5240 /* 5241 * Since we do have the segvn writers lock nobody can fill 5242 * the cache with entries belonging to this seg during 5243 * the purge. The flush either succeeds or we still have 5244 * pending I/Os. 5245 */ 5246 segvn_purge(seg); 5247 if (svd->softlockcnt > 0) { 5248 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5249 return (EAGAIN); 5250 } 5251 } 5252 5253 if (seg->s_szc != 0) { 5254 int err; 5255 pgsz = page_get_pagesize(seg->s_szc); 5256 pgcnt = pgsz >> PAGESHIFT; 5257 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 5258 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { 5259 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5260 ASSERT(seg->s_base != addr || seg->s_size != len); 5261 /* 5262 * If we are holding the as lock as a reader then 5263 * we need to return IE_RETRY and let the as 5264 * layer drop and re-aquire the lock as a writer. 5265 */ 5266 if (AS_READ_HELD(seg->s_as, &seg->s_as->a_lock)) 5267 return (IE_RETRY); 5268 VM_STAT_ADD(segvnvmstats.demoterange[1]); 5269 if (svd->type == MAP_PRIVATE || svd->vp != NULL) { 5270 err = segvn_demote_range(seg, addr, len, 5271 SDR_END, 0); 5272 } else { 5273 uint_t szcvec = map_pgszcvec(seg->s_base, 5274 pgsz, (uintptr_t)seg->s_base, 5275 (svd->flags & MAP_TEXT), MAPPGSZC_SHM, 0); 5276 err = segvn_demote_range(seg, addr, len, 5277 SDR_END, szcvec); 5278 } 5279 if (err == 0) 5280 return (IE_RETRY); 5281 if (err == ENOMEM) 5282 return (IE_NOMEM); 5283 return (err); 5284 } 5285 } 5286 5287 5288 /* 5289 * If it's a private mapping and we're making it writable 5290 * and no swap space has been reserved, have to reserve 5291 * it all now. If it's a private mapping to a file (i.e., vp != NULL) 5292 * and we're removing write permission on the entire segment and 5293 * we haven't modified any pages, we can release the swap space. 5294 */ 5295 if (svd->type == MAP_PRIVATE) { 5296 if (prot & PROT_WRITE) { 5297 size_t sz; 5298 if (svd->swresv == 0 && !(svd->flags & MAP_NORESERVE)) { 5299 if (anon_resv(seg->s_size) == 0) { 5300 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5301 return (IE_NOMEM); 5302 } 5303 sz = svd->swresv = seg->s_size; 5304 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 5305 "anon proc:%p %lu %u", 5306 seg, sz, 1); 5307 } 5308 } else { 5309 /* 5310 * Swap space is released only if this segment 5311 * does not map anonymous memory, since read faults 5312 * on such segments still need an anon slot to read 5313 * in the data. 5314 */ 5315 if (svd->swresv != 0 && svd->vp != NULL && 5316 svd->amp == NULL && addr == seg->s_base && 5317 len == seg->s_size && svd->pageprot == 0) { 5318 anon_unresv(svd->swresv); 5319 svd->swresv = 0; 5320 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 5321 "anon proc:%p %lu %u", 5322 seg, 0, 0); 5323 } 5324 } 5325 } 5326 5327 if (addr == seg->s_base && len == seg->s_size && svd->pageprot == 0) { 5328 if (svd->prot == prot) { 5329 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5330 return (0); /* all done */ 5331 } 5332 svd->prot = (uchar_t)prot; 5333 } else if (svd->type == MAP_PRIVATE) { 5334 struct anon *ap = NULL; 5335 page_t *pp; 5336 u_offset_t offset, off; 5337 struct anon_map *amp; 5338 ulong_t anon_idx = 0; 5339 5340 /* 5341 * A vpage structure exists or else the change does not 5342 * involve the entire segment. Establish a vpage structure 5343 * if none is there. Then, for each page in the range, 5344 * adjust its individual permissions. Note that write- 5345 * enabling a MAP_PRIVATE page can affect the claims for 5346 * locked down memory. Overcommitting memory terminates 5347 * the operation. 5348 */ 5349 segvn_vpage(seg); 5350 if ((amp = svd->amp) != NULL) { 5351 anon_idx = svd->anon_index + seg_page(seg, addr); 5352 ASSERT(seg->s_szc == 0 || 5353 IS_P2ALIGNED(anon_idx, pgcnt)); 5354 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5355 } 5356 5357 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 5358 evp = &svd->vpage[seg_page(seg, addr + len)]; 5359 5360 /* 5361 * See Statement at the beginning of segvn_lockop regarding 5362 * the way cowcnts and lckcnts are handled. 5363 */ 5364 for (svp = &svd->vpage[seg_page(seg, addr)]; svp < evp; svp++) { 5365 5366 if (seg->s_szc != 0) { 5367 if (amp != NULL) { 5368 anon_array_enter(amp, anon_idx, 5369 &cookie); 5370 } 5371 if (IS_P2ALIGNED(anon_idx, pgcnt) && 5372 !segvn_claim_pages(seg, svp, offset, 5373 anon_idx, prot)) { 5374 if (amp != NULL) { 5375 anon_array_exit(&cookie); 5376 } 5377 break; 5378 } 5379 if (amp != NULL) { 5380 anon_array_exit(&cookie); 5381 } 5382 anon_idx++; 5383 } else { 5384 if (amp != NULL) { 5385 anon_array_enter(amp, anon_idx, 5386 &cookie); 5387 ap = anon_get_ptr(amp->ahp, anon_idx++); 5388 } 5389 5390 if (VPP_ISPPLOCK(svp) && 5391 VPP_PROT(svp) != prot) { 5392 5393 if (amp == NULL || ap == NULL) { 5394 vp = svd->vp; 5395 off = offset; 5396 } else 5397 swap_xlate(ap, &vp, &off); 5398 if (amp != NULL) 5399 anon_array_exit(&cookie); 5400 5401 if ((pp = page_lookup(vp, off, 5402 SE_SHARED)) == NULL) { 5403 panic("segvn_setprot: no page"); 5404 /*NOTREACHED*/ 5405 } 5406 ASSERT(seg->s_szc == 0); 5407 if ((VPP_PROT(svp) ^ prot) & 5408 PROT_WRITE) { 5409 if (prot & PROT_WRITE) { 5410 if (!page_addclaim(pp)) { 5411 page_unlock(pp); 5412 break; 5413 } 5414 } else { 5415 if (!page_subclaim(pp)) { 5416 page_unlock(pp); 5417 break; 5418 } 5419 } 5420 } 5421 page_unlock(pp); 5422 } else if (amp != NULL) 5423 anon_array_exit(&cookie); 5424 } 5425 VPP_SETPROT(svp, prot); 5426 offset += PAGESIZE; 5427 } 5428 if (amp != NULL) 5429 ANON_LOCK_EXIT(&->a_rwlock); 5430 5431 /* 5432 * Did we terminate prematurely? If so, simply unload 5433 * the translations to the things we've updated so far. 5434 */ 5435 if (svp != evp) { 5436 len = (svp - &svd->vpage[seg_page(seg, addr)]) * 5437 PAGESIZE; 5438 ASSERT(seg->s_szc == 0 || IS_P2ALIGNED(len, pgsz)); 5439 if (len != 0) 5440 hat_unload(seg->s_as->a_hat, addr, 5441 len, HAT_UNLOAD); 5442 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5443 return (IE_NOMEM); 5444 } 5445 } else { 5446 segvn_vpage(seg); 5447 evp = &svd->vpage[seg_page(seg, addr + len)]; 5448 for (svp = &svd->vpage[seg_page(seg, addr)]; svp < evp; svp++) { 5449 VPP_SETPROT(svp, prot); 5450 } 5451 } 5452 5453 if (((prot & PROT_WRITE) != 0 && 5454 (svd->vp != NULL || svd->type == MAP_PRIVATE)) || 5455 (prot & ~PROT_USER) == PROT_NONE) { 5456 /* 5457 * Either private or shared data with write access (in 5458 * which case we need to throw out all former translations 5459 * so that we get the right translations set up on fault 5460 * and we don't allow write access to any copy-on-write pages 5461 * that might be around or to prevent write access to pages 5462 * representing holes in a file), or we don't have permission 5463 * to access the memory at all (in which case we have to 5464 * unload any current translations that might exist). 5465 */ 5466 hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD); 5467 } else { 5468 /* 5469 * A shared mapping or a private mapping in which write 5470 * protection is going to be denied - just change all the 5471 * protections over the range of addresses in question. 5472 * segvn does not support any other attributes other 5473 * than prot so we can use hat_chgattr. 5474 */ 5475 hat_chgattr(seg->s_as->a_hat, addr, len, prot); 5476 } 5477 5478 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5479 5480 return (0); 5481 } 5482 5483 /* 5484 * segvn_setpagesize is called via SEGOP_SETPAGESIZE from as_setpagesize, 5485 * to determine if the seg is capable of mapping the requested szc. 5486 */ 5487 static int 5488 segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc) 5489 { 5490 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5491 struct segvn_data *nsvd; 5492 struct anon_map *amp = svd->amp; 5493 struct seg *nseg; 5494 caddr_t eaddr = addr + len, a; 5495 size_t pgsz = page_get_pagesize(szc); 5496 pgcnt_t pgcnt = page_get_pagecnt(szc); 5497 int err; 5498 u_offset_t off = svd->offset + (uintptr_t)(addr - seg->s_base); 5499 extern struct vnode kvp; 5500 5501 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 5502 ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size); 5503 5504 if (seg->s_szc == szc || segvn_lpg_disable != 0) { 5505 return (0); 5506 } 5507 5508 /* 5509 * addr should always be pgsz aligned but eaddr may be misaligned if 5510 * it's at the end of the segment. 5511 * 5512 * XXX we should assert this condition since as_setpagesize() logic 5513 * guarantees it. 5514 */ 5515 if (!IS_P2ALIGNED(addr, pgsz) || 5516 (!IS_P2ALIGNED(eaddr, pgsz) && 5517 eaddr != seg->s_base + seg->s_size)) { 5518 5519 segvn_setpgsz_align_err++; 5520 return (EINVAL); 5521 } 5522 5523 if (amp != NULL && svd->type == MAP_SHARED) { 5524 ulong_t an_idx = svd->anon_index + seg_page(seg, addr); 5525 if (!IS_P2ALIGNED(an_idx, pgcnt)) { 5526 5527 segvn_setpgsz_anon_align_err++; 5528 return (EINVAL); 5529 } 5530 } 5531 5532 if ((svd->flags & MAP_NORESERVE) || seg->s_as == &kas || 5533 szc > segvn_maxpgszc) { 5534 return (EINVAL); 5535 } 5536 5537 /* paranoid check */ 5538 if (svd->vp != NULL && 5539 (IS_SWAPFSVP(svd->vp) || svd->vp == &kvp)) { 5540 return (EINVAL); 5541 } 5542 5543 if (seg->s_szc == 0 && svd->vp != NULL && 5544 map_addr_vacalign_check(addr, off)) { 5545 return (EINVAL); 5546 } 5547 5548 /* 5549 * Check that protections are the same within new page 5550 * size boundaries. 5551 */ 5552 if (svd->pageprot) { 5553 for (a = addr; a < eaddr; a += pgsz) { 5554 if ((a + pgsz) > eaddr) { 5555 if (!sameprot(seg, a, eaddr - a)) { 5556 return (EINVAL); 5557 } 5558 } else { 5559 if (!sameprot(seg, a, pgsz)) { 5560 return (EINVAL); 5561 } 5562 } 5563 } 5564 } 5565 5566 /* 5567 * Since we are changing page size we first have to flush 5568 * the cache. This makes sure all the pagelock calls have 5569 * to recheck protections. 5570 */ 5571 if (svd->softlockcnt > 0) { 5572 /* 5573 * Since we do have the segvn writers lock nobody can fill 5574 * the cache with entries belonging to this seg during 5575 * the purge. The flush either succeeds or we still have 5576 * pending I/Os. 5577 */ 5578 segvn_purge(seg); 5579 if (svd->softlockcnt > 0) { 5580 return (EAGAIN); 5581 } 5582 } 5583 5584 /* 5585 * Operation for sub range of existing segment. 5586 */ 5587 if (addr != seg->s_base || eaddr != (seg->s_base + seg->s_size)) { 5588 if (szc < seg->s_szc) { 5589 VM_STAT_ADD(segvnvmstats.demoterange[2]); 5590 err = segvn_demote_range(seg, addr, len, SDR_RANGE, 0); 5591 if (err == 0) { 5592 return (IE_RETRY); 5593 } 5594 if (err == ENOMEM) { 5595 return (IE_NOMEM); 5596 } 5597 return (err); 5598 } 5599 if (addr != seg->s_base) { 5600 nseg = segvn_split_seg(seg, addr); 5601 if (eaddr != (nseg->s_base + nseg->s_size)) { 5602 /* eaddr is szc aligned */ 5603 (void) segvn_split_seg(nseg, eaddr); 5604 } 5605 return (IE_RETRY); 5606 } 5607 if (eaddr != (seg->s_base + seg->s_size)) { 5608 /* eaddr is szc aligned */ 5609 (void) segvn_split_seg(seg, eaddr); 5610 } 5611 return (IE_RETRY); 5612 } 5613 5614 /* 5615 * Break any low level sharing and reset seg->s_szc to 0. 5616 */ 5617 if ((err = segvn_clrszc(seg)) != 0) { 5618 if (err == ENOMEM) { 5619 err = IE_NOMEM; 5620 } 5621 return (err); 5622 } 5623 ASSERT(seg->s_szc == 0); 5624 5625 /* 5626 * If the end of the current segment is not pgsz aligned 5627 * then attempt to concatenate with the next segment. 5628 */ 5629 if (!IS_P2ALIGNED(eaddr, pgsz)) { 5630 nseg = AS_SEGNEXT(seg->s_as, seg); 5631 if (nseg == NULL || nseg == seg || eaddr != nseg->s_base) { 5632 return (ENOMEM); 5633 } 5634 if (nseg->s_ops != &segvn_ops) { 5635 return (EINVAL); 5636 } 5637 nsvd = (struct segvn_data *)nseg->s_data; 5638 if (nsvd->softlockcnt > 0) { 5639 segvn_purge(nseg); 5640 if (nsvd->softlockcnt > 0) { 5641 return (EAGAIN); 5642 } 5643 } 5644 err = segvn_clrszc(nseg); 5645 if (err == ENOMEM) { 5646 err = IE_NOMEM; 5647 } 5648 if (err != 0) { 5649 return (err); 5650 } 5651 err = segvn_concat(seg, nseg, 1); 5652 if (err == -1) { 5653 return (EINVAL); 5654 } 5655 if (err == -2) { 5656 return (IE_NOMEM); 5657 } 5658 return (IE_RETRY); 5659 } 5660 5661 /* 5662 * May need to re-align anon array to 5663 * new szc. 5664 */ 5665 if (amp != NULL) { 5666 if (!IS_P2ALIGNED(svd->anon_index, pgcnt)) { 5667 struct anon_hdr *nahp; 5668 5669 ASSERT(svd->type == MAP_PRIVATE); 5670 5671 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 5672 ASSERT(amp->refcnt == 1); 5673 nahp = anon_create(btop(amp->size), ANON_NOSLEEP); 5674 if (nahp == NULL) { 5675 ANON_LOCK_EXIT(&->a_rwlock); 5676 return (IE_NOMEM); 5677 } 5678 if (anon_copy_ptr(amp->ahp, svd->anon_index, 5679 nahp, 0, btop(seg->s_size), ANON_NOSLEEP)) { 5680 anon_release(nahp, btop(amp->size)); 5681 ANON_LOCK_EXIT(&->a_rwlock); 5682 return (IE_NOMEM); 5683 } 5684 anon_release(amp->ahp, btop(amp->size)); 5685 amp->ahp = nahp; 5686 svd->anon_index = 0; 5687 ANON_LOCK_EXIT(&->a_rwlock); 5688 } 5689 } 5690 if (svd->vp != NULL && szc != 0) { 5691 struct vattr va; 5692 u_offset_t eoffpage = svd->offset; 5693 va.va_mask = AT_SIZE; 5694 eoffpage += seg->s_size; 5695 eoffpage = btopr(eoffpage); 5696 if (VOP_GETATTR(svd->vp, &va, 0, svd->cred) != 0) { 5697 segvn_setpgsz_getattr_err++; 5698 return (EINVAL); 5699 } 5700 if (btopr(va.va_size) < eoffpage) { 5701 segvn_setpgsz_eof_err++; 5702 return (EINVAL); 5703 } 5704 if (amp != NULL) { 5705 /* 5706 * anon_fill_cow_holes() may call VOP_GETPAGE(). 5707 * don't take anon map lock here to avoid holding it 5708 * across VOP_GETPAGE() calls that may call back into 5709 * segvn for klsutering checks. We don't really need 5710 * anon map lock here since it's a private segment and 5711 * we hold as level lock as writers. 5712 */ 5713 if ((err = anon_fill_cow_holes(seg, seg->s_base, 5714 amp->ahp, svd->anon_index, svd->vp, svd->offset, 5715 seg->s_size, szc, svd->prot, svd->vpage, 5716 svd->cred)) != 0) { 5717 return (EINVAL); 5718 } 5719 } 5720 segvn_setvnode_mpss(svd->vp); 5721 } 5722 5723 if (amp != NULL) { 5724 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 5725 if (svd->type == MAP_PRIVATE) { 5726 amp->a_szc = szc; 5727 } else if (szc > amp->a_szc) { 5728 amp->a_szc = szc; 5729 } 5730 ANON_LOCK_EXIT(&->a_rwlock); 5731 } 5732 5733 seg->s_szc = szc; 5734 5735 return (0); 5736 } 5737 5738 static int 5739 segvn_clrszc(struct seg *seg) 5740 { 5741 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5742 struct anon_map *amp = svd->amp; 5743 size_t pgsz; 5744 pgcnt_t pages; 5745 int err = 0; 5746 caddr_t a = seg->s_base; 5747 caddr_t ea = a + seg->s_size; 5748 ulong_t an_idx = svd->anon_index; 5749 vnode_t *vp = svd->vp; 5750 struct vpage *vpage = svd->vpage; 5751 page_t *anon_pl[1 + 1], *pp; 5752 struct anon *ap, *oldap; 5753 uint_t prot = svd->prot, vpprot; 5754 5755 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) || 5756 SEGVN_WRITE_HELD(seg->s_as, &svd->lock)); 5757 5758 if (vp == NULL && amp == NULL) { 5759 seg->s_szc = 0; 5760 return (0); 5761 } 5762 5763 /* 5764 * do HAT_UNLOAD_UNMAP since we are changing the pagesize. 5765 * unload argument is 0 when we are freeing the segment 5766 * and unload was already done. 5767 */ 5768 hat_unload(seg->s_as->a_hat, seg->s_base, seg->s_size, 5769 HAT_UNLOAD_UNMAP); 5770 5771 if (amp == NULL || svd->type == MAP_SHARED) { 5772 seg->s_szc = 0; 5773 return (0); 5774 } 5775 5776 pgsz = page_get_pagesize(seg->s_szc); 5777 pages = btop(pgsz); 5778 5779 /* 5780 * XXX anon rwlock is not really needed because this is a 5781 * private segment and we are writers. 5782 */ 5783 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 5784 5785 for (; a < ea; a += pgsz, an_idx += pages) { 5786 if ((oldap = anon_get_ptr(amp->ahp, an_idx)) != NULL) { 5787 if (svd->pageprot != 0) { 5788 ASSERT(vpage != NULL); 5789 prot = VPP_PROT(vpage); 5790 ASSERT(sameprot(seg, a, pgsz)); 5791 } 5792 if (seg->s_szc != 0) { 5793 ASSERT(vp == NULL || anon_pages(amp->ahp, 5794 an_idx, pages) == pages); 5795 if ((err = anon_map_demotepages(amp, an_idx, 5796 seg, a, prot, vpage, svd->cred)) != 0) { 5797 goto out; 5798 } 5799 } else { 5800 if (oldap->an_refcnt == 1) { 5801 continue; 5802 } 5803 if ((err = anon_getpage(&oldap, &vpprot, 5804 anon_pl, PAGESIZE, seg, a, S_READ, 5805 svd->cred))) { 5806 goto out; 5807 } 5808 if ((pp = anon_private(&ap, seg, a, prot, 5809 anon_pl[0], 0, svd->cred)) == NULL) { 5810 err = ENOMEM; 5811 goto out; 5812 } 5813 anon_decref(oldap); 5814 (void) anon_set_ptr(amp->ahp, an_idx, ap, 5815 ANON_SLEEP); 5816 page_unlock(pp); 5817 } 5818 } 5819 vpage = (vpage == NULL) ? NULL : vpage + pages; 5820 } 5821 5822 amp->a_szc = 0; 5823 seg->s_szc = 0; 5824 out: 5825 ANON_LOCK_EXIT(&->a_rwlock); 5826 return (err); 5827 } 5828 5829 static int 5830 segvn_claim_pages( 5831 struct seg *seg, 5832 struct vpage *svp, 5833 u_offset_t off, 5834 ulong_t anon_idx, 5835 uint_t prot) 5836 { 5837 pgcnt_t pgcnt = page_get_pagecnt(seg->s_szc); 5838 size_t ppasize = (pgcnt + 1) * sizeof (page_t *); 5839 page_t **ppa; 5840 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5841 struct anon_map *amp = svd->amp; 5842 struct vpage *evp = svp + pgcnt; 5843 caddr_t addr = ((uintptr_t)(svp - svd->vpage) << PAGESHIFT) 5844 + seg->s_base; 5845 struct anon *ap; 5846 struct vnode *vp = svd->vp; 5847 page_t *pp; 5848 pgcnt_t pg_idx, i; 5849 int err = 0; 5850 anoff_t aoff; 5851 int anon = (amp != NULL) ? 1 : 0; 5852 5853 ASSERT(svd->type == MAP_PRIVATE); 5854 ASSERT(svd->vpage != NULL); 5855 ASSERT(seg->s_szc != 0); 5856 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 5857 ASSERT(amp == NULL || IS_P2ALIGNED(anon_idx, pgcnt)); 5858 ASSERT(sameprot(seg, addr, pgcnt << PAGESHIFT)); 5859 5860 if (VPP_PROT(svp) == prot) 5861 return (1); 5862 if (!((VPP_PROT(svp) ^ prot) & PROT_WRITE)) 5863 return (1); 5864 5865 ppa = kmem_alloc(ppasize, KM_SLEEP); 5866 if (anon && vp != NULL) { 5867 if (anon_get_ptr(amp->ahp, anon_idx) == NULL) { 5868 anon = 0; 5869 ASSERT(!anon_pages(amp->ahp, anon_idx, pgcnt)); 5870 } 5871 ASSERT(!anon || 5872 anon_pages(amp->ahp, anon_idx, pgcnt) == pgcnt); 5873 } 5874 5875 for (*ppa = NULL, pg_idx = 0; svp < evp; svp++, anon_idx++) { 5876 if (!VPP_ISPPLOCK(svp)) 5877 continue; 5878 if (anon) { 5879 ap = anon_get_ptr(amp->ahp, anon_idx); 5880 if (ap == NULL) { 5881 panic("segvn_claim_pages: no anon slot"); 5882 } 5883 swap_xlate(ap, &vp, &aoff); 5884 off = (u_offset_t)aoff; 5885 } 5886 ASSERT(vp != NULL); 5887 if ((pp = page_lookup(vp, 5888 (u_offset_t)off, SE_SHARED)) == NULL) { 5889 panic("segvn_claim_pages: no page"); 5890 } 5891 ppa[pg_idx++] = pp; 5892 off += PAGESIZE; 5893 } 5894 5895 if (ppa[0] == NULL) { 5896 kmem_free(ppa, ppasize); 5897 return (1); 5898 } 5899 5900 ASSERT(pg_idx <= pgcnt); 5901 ppa[pg_idx] = NULL; 5902 5903 if (prot & PROT_WRITE) 5904 err = page_addclaim_pages(ppa); 5905 else 5906 err = page_subclaim_pages(ppa); 5907 5908 for (i = 0; i < pg_idx; i++) { 5909 ASSERT(ppa[i] != NULL); 5910 page_unlock(ppa[i]); 5911 } 5912 5913 kmem_free(ppa, ppasize); 5914 return (err); 5915 } 5916 5917 /* 5918 * Returns right (upper address) segment if split occured. 5919 * If the address is equal to the beginning or end of its segment it returns 5920 * the current segment. 5921 */ 5922 static struct seg * 5923 segvn_split_seg(struct seg *seg, caddr_t addr) 5924 { 5925 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5926 struct seg *nseg; 5927 size_t nsize; 5928 struct segvn_data *nsvd; 5929 5930 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 5931 ASSERT(addr >= seg->s_base); 5932 ASSERT(addr <= seg->s_base + seg->s_size); 5933 5934 if (addr == seg->s_base || addr == seg->s_base + seg->s_size) 5935 return (seg); 5936 5937 nsize = seg->s_base + seg->s_size - addr; 5938 seg->s_size = addr - seg->s_base; 5939 nseg = seg_alloc(seg->s_as, addr, nsize); 5940 ASSERT(nseg != NULL); 5941 nseg->s_ops = seg->s_ops; 5942 nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 5943 nseg->s_data = (void *)nsvd; 5944 nseg->s_szc = seg->s_szc; 5945 *nsvd = *svd; 5946 rw_init(&nsvd->lock, NULL, RW_DEFAULT, NULL); 5947 5948 if (nsvd->vp != NULL) { 5949 VN_HOLD(nsvd->vp); 5950 nsvd->offset = svd->offset + 5951 (uintptr_t)(nseg->s_base - seg->s_base); 5952 if (nsvd->type == MAP_SHARED) 5953 lgrp_shm_policy_init(NULL, nsvd->vp); 5954 } else { 5955 /* 5956 * The offset for an anonymous segment has no signifigance in 5957 * terms of an offset into a file. If we were to use the above 5958 * calculation instead, the structures read out of 5959 * /proc/<pid>/xmap would be more difficult to decipher since 5960 * it would be unclear whether two seemingly contiguous 5961 * prxmap_t structures represented different segments or a 5962 * single segment that had been split up into multiple prxmap_t 5963 * structures (e.g. if some part of the segment had not yet 5964 * been faulted in). 5965 */ 5966 nsvd->offset = 0; 5967 } 5968 5969 ASSERT(svd->softlockcnt == 0); 5970 crhold(svd->cred); 5971 5972 if (svd->vpage != NULL) { 5973 size_t bytes = vpgtob(seg_pages(seg)); 5974 size_t nbytes = vpgtob(seg_pages(nseg)); 5975 struct vpage *ovpage = svd->vpage; 5976 5977 svd->vpage = kmem_alloc(bytes, KM_SLEEP); 5978 bcopy(ovpage, svd->vpage, bytes); 5979 nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP); 5980 bcopy(ovpage + seg_pages(seg), nsvd->vpage, nbytes); 5981 kmem_free(ovpage, bytes + nbytes); 5982 } 5983 if (svd->amp != NULL && svd->type == MAP_PRIVATE) { 5984 struct anon_map *oamp = svd->amp, *namp; 5985 struct anon_hdr *nahp; 5986 5987 ANON_LOCK_ENTER(&oamp->a_rwlock, RW_WRITER); 5988 ASSERT(oamp->refcnt == 1); 5989 nahp = anon_create(btop(seg->s_size), ANON_SLEEP); 5990 (void) anon_copy_ptr(oamp->ahp, svd->anon_index, 5991 nahp, 0, btop(seg->s_size), ANON_SLEEP); 5992 5993 namp = anonmap_alloc(nseg->s_size, 0); 5994 namp->a_szc = nseg->s_szc; 5995 (void) anon_copy_ptr(oamp->ahp, 5996 svd->anon_index + btop(seg->s_size), 5997 namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP); 5998 anon_release(oamp->ahp, btop(oamp->size)); 5999 oamp->ahp = nahp; 6000 oamp->size = seg->s_size; 6001 svd->anon_index = 0; 6002 nsvd->amp = namp; 6003 nsvd->anon_index = 0; 6004 ANON_LOCK_EXIT(&oamp->a_rwlock); 6005 } else if (svd->amp != NULL) { 6006 pgcnt_t pgcnt = page_get_pagecnt(seg->s_szc); 6007 ASSERT(svd->amp == nsvd->amp); 6008 ASSERT(seg->s_szc <= svd->amp->a_szc); 6009 nsvd->anon_index = svd->anon_index + seg_pages(seg); 6010 ASSERT(IS_P2ALIGNED(nsvd->anon_index, pgcnt)); 6011 ANON_LOCK_ENTER(&svd->amp->a_rwlock, RW_WRITER); 6012 svd->amp->refcnt++; 6013 ANON_LOCK_EXIT(&svd->amp->a_rwlock); 6014 } 6015 6016 /* 6017 * Split amount of swap reserve 6018 */ 6019 if (svd->swresv) { 6020 /* 6021 * For MAP_NORESERVE, only allocate swap reserve for pages 6022 * being used. Other segments get enough to cover whole 6023 * segment. 6024 */ 6025 if (svd->flags & MAP_NORESERVE) { 6026 size_t oswresv; 6027 6028 ASSERT(svd->amp); 6029 oswresv = svd->swresv; 6030 svd->swresv = ptob(anon_pages(svd->amp->ahp, 6031 svd->anon_index, btop(seg->s_size))); 6032 nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp, 6033 nsvd->anon_index, btop(nseg->s_size))); 6034 ASSERT(oswresv >= (svd->swresv + nsvd->swresv)); 6035 } else { 6036 ASSERT(svd->swresv == seg->s_size + nseg->s_size); 6037 svd->swresv = seg->s_size; 6038 nsvd->swresv = nseg->s_size; 6039 } 6040 } 6041 6042 return (nseg); 6043 } 6044 6045 /* 6046 * called on memory operations (unmap, setprot, setpagesize) for a subset 6047 * of a large page segment to either demote the memory range (SDR_RANGE) 6048 * or the ends (SDR_END) by addr/len. 6049 * 6050 * returns 0 on success. returns errno, including ENOMEM, on failure. 6051 */ 6052 static int 6053 segvn_demote_range( 6054 struct seg *seg, 6055 caddr_t addr, 6056 size_t len, 6057 int flag, 6058 uint_t szcvec) 6059 { 6060 caddr_t eaddr = addr + len; 6061 caddr_t lpgaddr, lpgeaddr; 6062 struct seg *nseg; 6063 struct seg *badseg1 = NULL; 6064 struct seg *badseg2 = NULL; 6065 size_t pgsz; 6066 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6067 int err; 6068 uint_t szc = seg->s_szc; 6069 uint_t tszcvec; 6070 6071 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 6072 ASSERT(szc != 0); 6073 pgsz = page_get_pagesize(szc); 6074 ASSERT(seg->s_base != addr || seg->s_size != len); 6075 ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size); 6076 ASSERT(svd->softlockcnt == 0); 6077 ASSERT(szcvec == 0 || (flag == SDR_END && svd->type == MAP_SHARED)); 6078 6079 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 6080 ASSERT(flag == SDR_RANGE || eaddr < lpgeaddr || addr > lpgaddr); 6081 if (flag == SDR_RANGE) { 6082 /* demote entire range */ 6083 badseg1 = nseg = segvn_split_seg(seg, lpgaddr); 6084 (void) segvn_split_seg(nseg, lpgeaddr); 6085 ASSERT(badseg1->s_base == lpgaddr); 6086 ASSERT(badseg1->s_size == lpgeaddr - lpgaddr); 6087 } else if (addr != lpgaddr) { 6088 ASSERT(flag == SDR_END); 6089 badseg1 = nseg = segvn_split_seg(seg, lpgaddr); 6090 if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz && 6091 eaddr < lpgaddr + 2 * pgsz) { 6092 (void) segvn_split_seg(nseg, lpgeaddr); 6093 ASSERT(badseg1->s_base == lpgaddr); 6094 ASSERT(badseg1->s_size == 2 * pgsz); 6095 } else { 6096 nseg = segvn_split_seg(nseg, lpgaddr + pgsz); 6097 ASSERT(badseg1->s_base == lpgaddr); 6098 ASSERT(badseg1->s_size == pgsz); 6099 if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz) { 6100 ASSERT(lpgeaddr - lpgaddr > 2 * pgsz); 6101 nseg = segvn_split_seg(nseg, lpgeaddr - pgsz); 6102 badseg2 = nseg; 6103 (void) segvn_split_seg(nseg, lpgeaddr); 6104 ASSERT(badseg2->s_base == lpgeaddr - pgsz); 6105 ASSERT(badseg2->s_size == pgsz); 6106 } 6107 } 6108 } else { 6109 ASSERT(flag == SDR_END); 6110 ASSERT(eaddr < lpgeaddr); 6111 badseg1 = nseg = segvn_split_seg(seg, lpgeaddr - pgsz); 6112 (void) segvn_split_seg(nseg, lpgeaddr); 6113 ASSERT(badseg1->s_base == lpgeaddr - pgsz); 6114 ASSERT(badseg1->s_size == pgsz); 6115 } 6116 6117 ASSERT(badseg1 != NULL); 6118 ASSERT(badseg1->s_szc == szc); 6119 ASSERT(flag == SDR_RANGE || badseg1->s_size == pgsz || 6120 badseg1->s_size == 2 * pgsz); 6121 ASSERT(sameprot(badseg1, badseg1->s_base, pgsz)); 6122 ASSERT(badseg1->s_size == pgsz || 6123 sameprot(badseg1, badseg1->s_base + pgsz, pgsz)); 6124 if (err = segvn_clrszc(badseg1)) { 6125 return (err); 6126 } 6127 ASSERT(badseg1->s_szc == 0); 6128 6129 if (szc > 1 && (tszcvec = P2PHASE(szcvec, 1 << szc)) > 1) { 6130 uint_t tszc = highbit(tszcvec) - 1; 6131 caddr_t ta = MAX(addr, badseg1->s_base); 6132 caddr_t te; 6133 size_t tpgsz = page_get_pagesize(tszc); 6134 6135 ASSERT(svd->type == MAP_SHARED); 6136 ASSERT(flag == SDR_END); 6137 ASSERT(tszc < szc && tszc > 0); 6138 6139 if (eaddr > badseg1->s_base + badseg1->s_size) { 6140 te = badseg1->s_base + badseg1->s_size; 6141 } else { 6142 te = eaddr; 6143 } 6144 6145 ASSERT(ta <= te); 6146 badseg1->s_szc = tszc; 6147 if (!IS_P2ALIGNED(ta, tpgsz) || !IS_P2ALIGNED(te, tpgsz)) { 6148 if (badseg2 != NULL) { 6149 err = segvn_demote_range(badseg1, ta, te - ta, 6150 SDR_END, tszcvec); 6151 if (err != 0) { 6152 return (err); 6153 } 6154 } else { 6155 return (segvn_demote_range(badseg1, ta, 6156 te - ta, SDR_END, tszcvec)); 6157 } 6158 } 6159 } 6160 6161 if (badseg2 == NULL) 6162 return (0); 6163 ASSERT(badseg2->s_szc == szc); 6164 ASSERT(badseg2->s_size == pgsz); 6165 ASSERT(sameprot(badseg2, badseg2->s_base, badseg2->s_size)); 6166 if (err = segvn_clrszc(badseg2)) { 6167 return (err); 6168 } 6169 ASSERT(badseg2->s_szc == 0); 6170 6171 if (szc > 1 && (tszcvec = P2PHASE(szcvec, 1 << szc)) > 1) { 6172 uint_t tszc = highbit(tszcvec) - 1; 6173 size_t tpgsz = page_get_pagesize(tszc); 6174 6175 ASSERT(svd->type == MAP_SHARED); 6176 ASSERT(flag == SDR_END); 6177 ASSERT(tszc < szc && tszc > 0); 6178 ASSERT(badseg2->s_base > addr); 6179 ASSERT(eaddr > badseg2->s_base); 6180 ASSERT(eaddr < badseg2->s_base + badseg2->s_size); 6181 6182 badseg2->s_szc = tszc; 6183 if (!IS_P2ALIGNED(eaddr, tpgsz)) { 6184 return (segvn_demote_range(badseg2, badseg2->s_base, 6185 eaddr - badseg2->s_base, SDR_END, tszcvec)); 6186 } 6187 } 6188 6189 return (0); 6190 } 6191 6192 static int 6193 segvn_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) 6194 { 6195 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6196 struct vpage *vp, *evp; 6197 6198 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6199 6200 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6201 /* 6202 * If segment protection can be used, simply check against them. 6203 */ 6204 if (svd->pageprot == 0) { 6205 int err; 6206 6207 err = ((svd->prot & prot) != prot) ? EACCES : 0; 6208 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6209 return (err); 6210 } 6211 6212 /* 6213 * Have to check down to the vpage level. 6214 */ 6215 evp = &svd->vpage[seg_page(seg, addr + len)]; 6216 for (vp = &svd->vpage[seg_page(seg, addr)]; vp < evp; vp++) { 6217 if ((VPP_PROT(vp) & prot) != prot) { 6218 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6219 return (EACCES); 6220 } 6221 } 6222 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6223 return (0); 6224 } 6225 6226 static int 6227 segvn_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv) 6228 { 6229 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6230 size_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1; 6231 6232 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6233 6234 if (pgno != 0) { 6235 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6236 if (svd->pageprot == 0) { 6237 do 6238 protv[--pgno] = svd->prot; 6239 while (pgno != 0); 6240 } else { 6241 size_t pgoff = seg_page(seg, addr); 6242 6243 do { 6244 pgno--; 6245 protv[pgno] = VPP_PROT(&svd->vpage[pgno+pgoff]); 6246 } while (pgno != 0); 6247 } 6248 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6249 } 6250 return (0); 6251 } 6252 6253 static u_offset_t 6254 segvn_getoffset(struct seg *seg, caddr_t addr) 6255 { 6256 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6257 6258 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6259 6260 return (svd->offset + (uintptr_t)(addr - seg->s_base)); 6261 } 6262 6263 /*ARGSUSED*/ 6264 static int 6265 segvn_gettype(struct seg *seg, caddr_t addr) 6266 { 6267 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6268 6269 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6270 6271 return (svd->type | (svd->flags & (MAP_NORESERVE | MAP_TEXT | 6272 MAP_INITDATA))); 6273 } 6274 6275 /*ARGSUSED*/ 6276 static int 6277 segvn_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp) 6278 { 6279 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6280 6281 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6282 6283 *vpp = svd->vp; 6284 return (0); 6285 } 6286 6287 /* 6288 * Check to see if it makes sense to do kluster/read ahead to 6289 * addr + delta relative to the mapping at addr. We assume here 6290 * that delta is a signed PAGESIZE'd multiple (which can be negative). 6291 * 6292 * For segvn, we currently "approve" of the action if we are 6293 * still in the segment and it maps from the same vp/off, 6294 * or if the advice stored in segvn_data or vpages allows it. 6295 * Currently, klustering is not allowed only if MADV_RANDOM is set. 6296 */ 6297 static int 6298 segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta) 6299 { 6300 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6301 struct anon *oap, *ap; 6302 ssize_t pd; 6303 size_t page; 6304 struct vnode *vp1, *vp2; 6305 u_offset_t off1, off2; 6306 struct anon_map *amp; 6307 6308 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6309 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) || 6310 SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 6311 6312 if (addr + delta < seg->s_base || 6313 addr + delta >= (seg->s_base + seg->s_size)) 6314 return (-1); /* exceeded segment bounds */ 6315 6316 pd = delta / (ssize_t)PAGESIZE; /* divide to preserve sign bit */ 6317 page = seg_page(seg, addr); 6318 6319 /* 6320 * Check to see if either of the pages addr or addr + delta 6321 * have advice set that prevents klustering (if MADV_RANDOM advice 6322 * is set for entire segment, or MADV_SEQUENTIAL is set and delta 6323 * is negative). 6324 */ 6325 if (svd->advice == MADV_RANDOM || 6326 svd->advice == MADV_SEQUENTIAL && delta < 0) 6327 return (-1); 6328 else if (svd->pageadvice && svd->vpage) { 6329 struct vpage *bvpp, *evpp; 6330 6331 bvpp = &svd->vpage[page]; 6332 evpp = &svd->vpage[page + pd]; 6333 if (VPP_ADVICE(bvpp) == MADV_RANDOM || 6334 VPP_ADVICE(evpp) == MADV_SEQUENTIAL && delta < 0) 6335 return (-1); 6336 if (VPP_ADVICE(bvpp) != VPP_ADVICE(evpp) && 6337 VPP_ADVICE(evpp) == MADV_RANDOM) 6338 return (-1); 6339 } 6340 6341 if (svd->type == MAP_SHARED) 6342 return (0); /* shared mapping - all ok */ 6343 6344 if ((amp = svd->amp) == NULL) 6345 return (0); /* off original vnode */ 6346 6347 page += svd->anon_index; 6348 6349 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 6350 6351 oap = anon_get_ptr(amp->ahp, page); 6352 ap = anon_get_ptr(amp->ahp, page + pd); 6353 6354 ANON_LOCK_EXIT(&->a_rwlock); 6355 6356 if ((oap == NULL && ap != NULL) || (oap != NULL && ap == NULL)) { 6357 return (-1); /* one with and one without an anon */ 6358 } 6359 6360 if (oap == NULL) { /* implies that ap == NULL */ 6361 return (0); /* off original vnode */ 6362 } 6363 6364 /* 6365 * Now we know we have two anon pointers - check to 6366 * see if they happen to be properly allocated. 6367 */ 6368 6369 /* 6370 * XXX We cheat here and don't lock the anon slots. We can't because 6371 * we may have been called from the anon layer which might already 6372 * have locked them. We are holding a refcnt on the slots so they 6373 * can't disappear. The worst that will happen is we'll get the wrong 6374 * names (vp, off) for the slots and make a poor klustering decision. 6375 */ 6376 swap_xlate(ap, &vp1, &off1); 6377 swap_xlate(oap, &vp2, &off2); 6378 6379 6380 if (!VOP_CMP(vp1, vp2) || off1 - off2 != delta) 6381 return (-1); 6382 return (0); 6383 } 6384 6385 /* 6386 * Swap the pages of seg out to secondary storage, returning the 6387 * number of bytes of storage freed. 6388 * 6389 * The basic idea is first to unload all translations and then to call 6390 * VOP_PUTPAGE() for all newly-unmapped pages, to push them out to the 6391 * swap device. Pages to which other segments have mappings will remain 6392 * mapped and won't be swapped. Our caller (as_swapout) has already 6393 * performed the unloading step. 6394 * 6395 * The value returned is intended to correlate well with the process's 6396 * memory requirements. However, there are some caveats: 6397 * 1) When given a shared segment as argument, this routine will 6398 * only succeed in swapping out pages for the last sharer of the 6399 * segment. (Previous callers will only have decremented mapping 6400 * reference counts.) 6401 * 2) We assume that the hat layer maintains a large enough translation 6402 * cache to capture process reference patterns. 6403 */ 6404 static size_t 6405 segvn_swapout(struct seg *seg) 6406 { 6407 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6408 struct anon_map *amp; 6409 pgcnt_t pgcnt = 0; 6410 pgcnt_t npages; 6411 pgcnt_t page; 6412 ulong_t anon_index; 6413 6414 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6415 6416 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6417 /* 6418 * Find pages unmapped by our caller and force them 6419 * out to the virtual swap device. 6420 */ 6421 if ((amp = svd->amp) != NULL) 6422 anon_index = svd->anon_index; 6423 npages = seg->s_size >> PAGESHIFT; 6424 for (page = 0; page < npages; page++) { 6425 page_t *pp; 6426 struct anon *ap; 6427 struct vnode *vp; 6428 u_offset_t off; 6429 anon_sync_obj_t cookie; 6430 6431 /* 6432 * Obtain <vp, off> pair for the page, then look it up. 6433 * 6434 * Note that this code is willing to consider regular 6435 * pages as well as anon pages. Is this appropriate here? 6436 */ 6437 ap = NULL; 6438 if (amp != NULL) { 6439 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 6440 if (anon_array_try_enter(amp, anon_index + page, 6441 &cookie)) { 6442 ANON_LOCK_EXIT(&->a_rwlock); 6443 continue; 6444 } 6445 ap = anon_get_ptr(amp->ahp, anon_index + page); 6446 if (ap != NULL) { 6447 swap_xlate(ap, &vp, &off); 6448 } else { 6449 vp = svd->vp; 6450 off = svd->offset + ptob(page); 6451 } 6452 anon_array_exit(&cookie); 6453 ANON_LOCK_EXIT(&->a_rwlock); 6454 } else { 6455 vp = svd->vp; 6456 off = svd->offset + ptob(page); 6457 } 6458 if (vp == NULL) { /* untouched zfod page */ 6459 ASSERT(ap == NULL); 6460 continue; 6461 } 6462 6463 pp = page_lookup_nowait(vp, off, SE_SHARED); 6464 if (pp == NULL) 6465 continue; 6466 6467 6468 /* 6469 * Examine the page to see whether it can be tossed out, 6470 * keeping track of how many we've found. 6471 */ 6472 if (!page_tryupgrade(pp)) { 6473 /* 6474 * If the page has an i/o lock and no mappings, 6475 * it's very likely that the page is being 6476 * written out as a result of klustering. 6477 * Assume this is so and take credit for it here. 6478 */ 6479 if (!page_io_trylock(pp)) { 6480 if (!hat_page_is_mapped(pp)) 6481 pgcnt++; 6482 } else { 6483 page_io_unlock(pp); 6484 } 6485 page_unlock(pp); 6486 continue; 6487 } 6488 ASSERT(!page_iolock_assert(pp)); 6489 6490 6491 /* 6492 * Skip if page is locked or has mappings. 6493 * We don't need the page_struct_lock to look at lckcnt 6494 * and cowcnt because the page is exclusive locked. 6495 */ 6496 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0 || 6497 hat_page_is_mapped(pp)) { 6498 page_unlock(pp); 6499 continue; 6500 } 6501 6502 /* 6503 * dispose skips large pages so try to demote first. 6504 */ 6505 if (pp->p_szc != 0 && !page_try_demote_pages(pp)) { 6506 page_unlock(pp); 6507 /* 6508 * XXX should skip the remaining page_t's of this 6509 * large page. 6510 */ 6511 continue; 6512 } 6513 6514 ASSERT(pp->p_szc == 0); 6515 6516 /* 6517 * No longer mapped -- we can toss it out. How 6518 * we do so depends on whether or not it's dirty. 6519 */ 6520 if (hat_ismod(pp) && pp->p_vnode) { 6521 /* 6522 * We must clean the page before it can be 6523 * freed. Setting B_FREE will cause pvn_done 6524 * to free the page when the i/o completes. 6525 * XXX: This also causes it to be accounted 6526 * as a pageout instead of a swap: need 6527 * B_SWAPOUT bit to use instead of B_FREE. 6528 * 6529 * Hold the vnode before releasing the page lock 6530 * to prevent it from being freed and re-used by 6531 * some other thread. 6532 */ 6533 VN_HOLD(vp); 6534 page_unlock(pp); 6535 6536 /* 6537 * Queue all i/o requests for the pageout thread 6538 * to avoid saturating the pageout devices. 6539 */ 6540 if (!queue_io_request(vp, off)) 6541 VN_RELE(vp); 6542 } else { 6543 /* 6544 * The page was clean, free it. 6545 * 6546 * XXX: Can we ever encounter modified pages 6547 * with no associated vnode here? 6548 */ 6549 ASSERT(pp->p_vnode != NULL); 6550 /*LINTED: constant in conditional context*/ 6551 VN_DISPOSE(pp, B_FREE, 0, kcred); 6552 } 6553 6554 /* 6555 * Credit now even if i/o is in progress. 6556 */ 6557 pgcnt++; 6558 } 6559 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6560 6561 /* 6562 * Wakeup pageout to initiate i/o on all queued requests. 6563 */ 6564 cv_signal_pageout(); 6565 return (ptob(pgcnt)); 6566 } 6567 6568 /* 6569 * Synchronize primary storage cache with real object in virtual memory. 6570 * 6571 * XXX - Anonymous pages should not be sync'ed out at all. 6572 */ 6573 static int 6574 segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) 6575 { 6576 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6577 struct vpage *vpp; 6578 page_t *pp; 6579 u_offset_t offset; 6580 struct vnode *vp; 6581 u_offset_t off; 6582 caddr_t eaddr; 6583 int bflags; 6584 int err = 0; 6585 int segtype; 6586 int pageprot; 6587 int prot; 6588 ulong_t anon_index; 6589 struct anon_map *amp; 6590 struct anon *ap; 6591 anon_sync_obj_t cookie; 6592 6593 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6594 6595 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6596 6597 if (svd->softlockcnt > 0) { 6598 /* 6599 * flush all pages from seg cache 6600 * otherwise we may deadlock in swap_putpage 6601 * for B_INVAL page (4175402). 6602 * 6603 * Even if we grab segvn WRITER's lock or segp_slock 6604 * here, there might be another thread which could've 6605 * successfully performed lookup/insert just before 6606 * we acquired the lock here. So, grabbing either 6607 * lock here is of not much use. Until we devise 6608 * a strategy at upper layers to solve the 6609 * synchronization issues completely, we expect 6610 * applications to handle this appropriately. 6611 */ 6612 segvn_purge(seg); 6613 if (svd->softlockcnt > 0) { 6614 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6615 return (EAGAIN); 6616 } 6617 } 6618 6619 vpp = svd->vpage; 6620 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 6621 bflags = ((flags & MS_ASYNC) ? B_ASYNC : 0) | 6622 ((flags & MS_INVALIDATE) ? B_INVAL : 0); 6623 6624 if (attr) { 6625 pageprot = attr & ~(SHARED|PRIVATE); 6626 segtype = (attr & SHARED) ? MAP_SHARED : MAP_PRIVATE; 6627 6628 /* 6629 * We are done if the segment types don't match 6630 * or if we have segment level protections and 6631 * they don't match. 6632 */ 6633 if (svd->type != segtype) { 6634 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6635 return (0); 6636 } 6637 if (vpp == NULL) { 6638 if (svd->prot != pageprot) { 6639 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6640 return (0); 6641 } 6642 prot = svd->prot; 6643 } else 6644 vpp = &svd->vpage[seg_page(seg, addr)]; 6645 6646 } else if (svd->vp && svd->amp == NULL && 6647 (flags & MS_INVALIDATE) == 0) { 6648 6649 /* 6650 * No attributes, no anonymous pages and MS_INVALIDATE flag 6651 * is not on, just use one big request. 6652 */ 6653 err = VOP_PUTPAGE(svd->vp, (offset_t)offset, len, 6654 bflags, svd->cred); 6655 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6656 return (err); 6657 } 6658 6659 if ((amp = svd->amp) != NULL) 6660 anon_index = svd->anon_index + seg_page(seg, addr); 6661 6662 for (eaddr = addr + len; addr < eaddr; addr += PAGESIZE) { 6663 ap = NULL; 6664 if (amp != NULL) { 6665 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 6666 anon_array_enter(amp, anon_index, &cookie); 6667 ap = anon_get_ptr(amp->ahp, anon_index++); 6668 if (ap != NULL) { 6669 swap_xlate(ap, &vp, &off); 6670 } else { 6671 vp = svd->vp; 6672 off = offset; 6673 } 6674 anon_array_exit(&cookie); 6675 ANON_LOCK_EXIT(&->a_rwlock); 6676 } else { 6677 vp = svd->vp; 6678 off = offset; 6679 } 6680 offset += PAGESIZE; 6681 6682 if (vp == NULL) /* untouched zfod page */ 6683 continue; 6684 6685 if (attr) { 6686 if (vpp) { 6687 prot = VPP_PROT(vpp); 6688 vpp++; 6689 } 6690 if (prot != pageprot) { 6691 continue; 6692 } 6693 } 6694 6695 /* 6696 * See if any of these pages are locked -- if so, then we 6697 * will have to truncate an invalidate request at the first 6698 * locked one. We don't need the page_struct_lock to test 6699 * as this is only advisory; even if we acquire it someone 6700 * might race in and lock the page after we unlock and before 6701 * we do the PUTPAGE, then PUTPAGE simply does nothing. 6702 */ 6703 if (flags & MS_INVALIDATE) { 6704 if ((pp = page_lookup(vp, off, SE_SHARED)) != NULL) { 6705 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 6706 page_unlock(pp); 6707 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6708 return (EBUSY); 6709 } 6710 if (ap != NULL && pp->p_szc != 0 && 6711 page_tryupgrade(pp)) { 6712 if (pp->p_lckcnt == 0 && 6713 pp->p_cowcnt == 0) { 6714 /* 6715 * swapfs VN_DISPOSE() won't 6716 * invalidate large pages. 6717 * Attempt to demote. 6718 * XXX can't help it if it 6719 * fails. But for swapfs 6720 * pages it is no big deal. 6721 */ 6722 (void) page_try_demote_pages( 6723 pp); 6724 } 6725 } 6726 page_unlock(pp); 6727 } 6728 } else if (svd->type == MAP_SHARED && amp != NULL) { 6729 /* 6730 * Avoid writting out to disk ISM's large pages 6731 * because segspt_free_pages() relies on NULL an_pvp 6732 * of anon slots of such pages. 6733 */ 6734 6735 ASSERT(svd->vp == NULL); 6736 /* 6737 * swapfs uses page_lookup_nowait if not freeing or 6738 * invalidating and skips a page if 6739 * page_lookup_nowait returns NULL. 6740 */ 6741 pp = page_lookup_nowait(vp, off, SE_SHARED); 6742 if (pp == NULL) { 6743 continue; 6744 } 6745 if (pp->p_szc != 0) { 6746 page_unlock(pp); 6747 continue; 6748 } 6749 6750 /* 6751 * Note ISM pages are created large so (vp, off)'s 6752 * page cannot suddenly become large after we unlock 6753 * pp. 6754 */ 6755 page_unlock(pp); 6756 } 6757 /* 6758 * XXX - Should ultimately try to kluster 6759 * calls to VOP_PUTPAGE() for performance. 6760 */ 6761 VN_HOLD(vp); 6762 err = VOP_PUTPAGE(vp, (offset_t)off, PAGESIZE, 6763 bflags, svd->cred); 6764 VN_RELE(vp); 6765 if (err) 6766 break; 6767 } 6768 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6769 return (err); 6770 } 6771 6772 /* 6773 * Determine if we have data corresponding to pages in the 6774 * primary storage virtual memory cache (i.e., "in core"). 6775 */ 6776 static size_t 6777 segvn_incore(struct seg *seg, caddr_t addr, size_t len, char *vec) 6778 { 6779 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6780 struct vnode *vp, *avp; 6781 u_offset_t offset, aoffset; 6782 size_t p, ep; 6783 int ret; 6784 struct vpage *vpp; 6785 page_t *pp; 6786 uint_t start; 6787 struct anon_map *amp; /* XXX - for locknest */ 6788 struct anon *ap; 6789 uint_t attr; 6790 anon_sync_obj_t cookie; 6791 6792 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6793 6794 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6795 if (svd->amp == NULL && svd->vp == NULL) { 6796 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6797 bzero(vec, btopr(len)); 6798 return (len); /* no anonymous pages created yet */ 6799 } 6800 6801 p = seg_page(seg, addr); 6802 ep = seg_page(seg, addr + len); 6803 start = svd->vp ? SEG_PAGE_VNODEBACKED : 0; 6804 6805 amp = svd->amp; 6806 for (; p < ep; p++, addr += PAGESIZE) { 6807 vpp = (svd->vpage) ? &svd->vpage[p]: NULL; 6808 ret = start; 6809 ap = NULL; 6810 avp = NULL; 6811 /* Grab the vnode/offset for the anon slot */ 6812 if (amp != NULL) { 6813 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 6814 anon_array_enter(amp, svd->anon_index + p, &cookie); 6815 ap = anon_get_ptr(amp->ahp, svd->anon_index + p); 6816 if (ap != NULL) { 6817 swap_xlate(ap, &avp, &aoffset); 6818 } 6819 anon_array_exit(&cookie); 6820 ANON_LOCK_EXIT(&->a_rwlock); 6821 } 6822 if ((avp != NULL) && page_exists(avp, aoffset)) { 6823 /* A page exists for the anon slot */ 6824 ret |= SEG_PAGE_INCORE; 6825 6826 /* 6827 * If page is mapped and writable 6828 */ 6829 attr = (uint_t)0; 6830 if ((hat_getattr(seg->s_as->a_hat, addr, 6831 &attr) != -1) && (attr & PROT_WRITE)) { 6832 ret |= SEG_PAGE_ANON; 6833 } 6834 /* 6835 * Don't get page_struct lock for lckcnt and cowcnt, 6836 * since this is purely advisory. 6837 */ 6838 if ((pp = page_lookup_nowait(avp, aoffset, 6839 SE_SHARED)) != NULL) { 6840 if (pp->p_lckcnt) 6841 ret |= SEG_PAGE_SOFTLOCK; 6842 if (pp->p_cowcnt) 6843 ret |= SEG_PAGE_HASCOW; 6844 page_unlock(pp); 6845 } 6846 } 6847 6848 /* Gather vnode statistics */ 6849 vp = svd->vp; 6850 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 6851 6852 if (vp != NULL) { 6853 /* 6854 * Try to obtain a "shared" lock on the page 6855 * without blocking. If this fails, determine 6856 * if the page is in memory. 6857 */ 6858 pp = page_lookup_nowait(vp, offset, SE_SHARED); 6859 if ((pp == NULL) && (page_exists(vp, offset))) { 6860 /* Page is incore, and is named */ 6861 ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE); 6862 } 6863 /* 6864 * Don't get page_struct lock for lckcnt and cowcnt, 6865 * since this is purely advisory. 6866 */ 6867 if (pp != NULL) { 6868 ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE); 6869 if (pp->p_lckcnt) 6870 ret |= SEG_PAGE_SOFTLOCK; 6871 if (pp->p_cowcnt) 6872 ret |= SEG_PAGE_HASCOW; 6873 page_unlock(pp); 6874 } 6875 } 6876 6877 /* Gather virtual page information */ 6878 if (vpp) { 6879 if (VPP_ISPPLOCK(vpp)) 6880 ret |= SEG_PAGE_LOCKED; 6881 vpp++; 6882 } 6883 6884 *vec++ = (char)ret; 6885 } 6886 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6887 return (len); 6888 } 6889 6890 /* 6891 * Statement for p_cowcnts/p_lckcnts. 6892 * 6893 * p_cowcnt is updated while mlock/munlocking MAP_PRIVATE and PROT_WRITE region 6894 * irrespective of the following factors or anything else: 6895 * 6896 * (1) anon slots are populated or not 6897 * (2) cow is broken or not 6898 * (3) refcnt on ap is 1 or greater than 1 6899 * 6900 * If it's not MAP_PRIVATE and PROT_WRITE, p_lckcnt is updated during mlock 6901 * and munlock. 6902 * 6903 * 6904 * Handling p_cowcnts/p_lckcnts during copy-on-write fault: 6905 * 6906 * if vpage has PROT_WRITE 6907 * transfer cowcnt on the oldpage -> cowcnt on the newpage 6908 * else 6909 * transfer lckcnt on the oldpage -> lckcnt on the newpage 6910 * 6911 * During copy-on-write, decrement p_cowcnt on the oldpage and increment 6912 * p_cowcnt on the newpage *if* the corresponding vpage has PROT_WRITE. 6913 * 6914 * We may also break COW if softlocking on read access in the physio case. 6915 * In this case, vpage may not have PROT_WRITE. So, we need to decrement 6916 * p_lckcnt on the oldpage and increment p_lckcnt on the newpage *if* the 6917 * vpage doesn't have PROT_WRITE. 6918 * 6919 * 6920 * Handling p_cowcnts/p_lckcnts during mprotect on mlocked region: 6921 * 6922 * If a MAP_PRIVATE region loses PROT_WRITE, we decrement p_cowcnt and 6923 * increment p_lckcnt by calling page_subclaim() which takes care of 6924 * availrmem accounting and p_lckcnt overflow. 6925 * 6926 * If a MAP_PRIVATE region gains PROT_WRITE, we decrement p_lckcnt and 6927 * increment p_cowcnt by calling page_addclaim() which takes care of 6928 * availrmem availability and p_cowcnt overflow. 6929 */ 6930 6931 /* 6932 * Lock down (or unlock) pages mapped by this segment. 6933 * 6934 * XXX only creates PAGESIZE pages if anon slots are not initialized. 6935 * At fault time they will be relocated into larger pages. 6936 */ 6937 static int 6938 segvn_lockop(struct seg *seg, caddr_t addr, size_t len, 6939 int attr, int op, ulong_t *lockmap, size_t pos) 6940 { 6941 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6942 struct vpage *vpp; 6943 struct vpage *evp; 6944 page_t *pp; 6945 u_offset_t offset; 6946 u_offset_t off; 6947 int segtype; 6948 int pageprot; 6949 int claim; 6950 struct vnode *vp; 6951 ulong_t anon_index; 6952 struct anon_map *amp; 6953 struct anon *ap; 6954 struct vattr va; 6955 anon_sync_obj_t cookie; 6956 struct kshmid *sp = NULL; 6957 struct proc *p = curproc; 6958 kproject_t *proj = NULL; 6959 int chargeproc = 1; 6960 size_t locked_bytes = 0; 6961 size_t unlocked_bytes = 0; 6962 int err = 0; 6963 6964 /* 6965 * Hold write lock on address space because may split or concatenate 6966 * segments 6967 */ 6968 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6969 6970 /* 6971 * If this is a shm, use shm's project and zone, else use 6972 * project and zone of calling process 6973 */ 6974 6975 /* Determine if this segment backs a sysV shm */ 6976 if (svd->amp != NULL && svd->amp->a_sp != NULL) { 6977 sp = svd->amp->a_sp; 6978 proj = sp->shm_perm.ipc_proj; 6979 chargeproc = 0; 6980 } 6981 6982 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 6983 if (attr) { 6984 pageprot = attr & ~(SHARED|PRIVATE); 6985 segtype = attr & SHARED ? MAP_SHARED : MAP_PRIVATE; 6986 6987 /* 6988 * We are done if the segment types don't match 6989 * or if we have segment level protections and 6990 * they don't match. 6991 */ 6992 if (svd->type != segtype) { 6993 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6994 return (0); 6995 } 6996 if (svd->pageprot == 0 && svd->prot != pageprot) { 6997 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6998 return (0); 6999 } 7000 } 7001 7002 /* 7003 * If we're locking, then we must create a vpage structure if 7004 * none exists. If we're unlocking, then check to see if there 7005 * is a vpage -- if not, then we could not have locked anything. 7006 */ 7007 7008 if ((vpp = svd->vpage) == NULL) { 7009 if (op == MC_LOCK) 7010 segvn_vpage(seg); 7011 else { 7012 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7013 return (0); 7014 } 7015 } 7016 7017 /* 7018 * The anonymous data vector (i.e., previously 7019 * unreferenced mapping to swap space) can be allocated 7020 * by lazily testing for its existence. 7021 */ 7022 if (op == MC_LOCK && svd->amp == NULL && svd->vp == NULL) { 7023 svd->amp = anonmap_alloc(seg->s_size, 0); 7024 svd->amp->a_szc = seg->s_szc; 7025 } 7026 7027 if ((amp = svd->amp) != NULL) { 7028 anon_index = svd->anon_index + seg_page(seg, addr); 7029 } 7030 7031 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 7032 evp = &svd->vpage[seg_page(seg, addr + len)]; 7033 7034 if (sp != NULL) 7035 mutex_enter(&sp->shm_mlock); 7036 7037 /* determine number of unlocked bytes in range for lock operation */ 7038 if (op == MC_LOCK) { 7039 7040 if (sp == NULL) { 7041 for (vpp = &svd->vpage[seg_page(seg, addr)]; vpp < evp; 7042 vpp++) { 7043 if (!VPP_ISPPLOCK(vpp)) 7044 unlocked_bytes += PAGESIZE; 7045 } 7046 } else { 7047 ulong_t i_idx, i_edx; 7048 anon_sync_obj_t i_cookie; 7049 struct anon *i_ap; 7050 struct vnode *i_vp; 7051 u_offset_t i_off; 7052 7053 /* Only count sysV pages once for locked memory */ 7054 i_edx = svd->anon_index + seg_page(seg, addr + len); 7055 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7056 for (i_idx = anon_index; i_idx < i_edx; i_idx++) { 7057 anon_array_enter(amp, i_idx, &i_cookie); 7058 i_ap = anon_get_ptr(amp->ahp, i_idx); 7059 if (i_ap == NULL) { 7060 unlocked_bytes += PAGESIZE; 7061 anon_array_exit(&i_cookie); 7062 continue; 7063 } 7064 swap_xlate(i_ap, &i_vp, &i_off); 7065 anon_array_exit(&i_cookie); 7066 pp = page_lookup(i_vp, i_off, SE_SHARED); 7067 if (pp == NULL) { 7068 unlocked_bytes += PAGESIZE; 7069 continue; 7070 } else if (pp->p_lckcnt == 0) 7071 unlocked_bytes += PAGESIZE; 7072 page_unlock(pp); 7073 } 7074 ANON_LOCK_EXIT(&->a_rwlock); 7075 } 7076 7077 mutex_enter(&p->p_lock); 7078 err = rctl_incr_locked_mem(p, proj, unlocked_bytes, 7079 chargeproc); 7080 mutex_exit(&p->p_lock); 7081 7082 if (err) { 7083 if (sp != NULL) 7084 mutex_exit(&sp->shm_mlock); 7085 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7086 return (err); 7087 } 7088 } 7089 /* 7090 * Loop over all pages in the range. Process if we're locking and 7091 * page has not already been locked in this mapping; or if we're 7092 * unlocking and the page has been locked. 7093 */ 7094 for (vpp = &svd->vpage[seg_page(seg, addr)]; vpp < evp; 7095 vpp++, pos++, addr += PAGESIZE, offset += PAGESIZE, anon_index++) { 7096 if ((attr == 0 || VPP_PROT(vpp) == pageprot) && 7097 ((op == MC_LOCK && !VPP_ISPPLOCK(vpp)) || 7098 (op == MC_UNLOCK && VPP_ISPPLOCK(vpp)))) { 7099 7100 if (amp != NULL) 7101 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7102 /* 7103 * If this isn't a MAP_NORESERVE segment and 7104 * we're locking, allocate anon slots if they 7105 * don't exist. The page is brought in later on. 7106 */ 7107 if (op == MC_LOCK && svd->vp == NULL && 7108 ((svd->flags & MAP_NORESERVE) == 0) && 7109 amp != NULL && 7110 ((ap = anon_get_ptr(amp->ahp, anon_index)) 7111 == NULL)) { 7112 anon_array_enter(amp, anon_index, &cookie); 7113 7114 if ((ap = anon_get_ptr(amp->ahp, 7115 anon_index)) == NULL) { 7116 pp = anon_zero(seg, addr, &ap, 7117 svd->cred); 7118 if (pp == NULL) { 7119 anon_array_exit(&cookie); 7120 ANON_LOCK_EXIT(&->a_rwlock); 7121 err = ENOMEM; 7122 goto out; 7123 } 7124 ASSERT(anon_get_ptr(amp->ahp, 7125 anon_index) == NULL); 7126 (void) anon_set_ptr(amp->ahp, 7127 anon_index, ap, ANON_SLEEP); 7128 page_unlock(pp); 7129 } 7130 anon_array_exit(&cookie); 7131 } 7132 7133 /* 7134 * Get name for page, accounting for 7135 * existence of private copy. 7136 */ 7137 ap = NULL; 7138 if (amp != NULL) { 7139 anon_array_enter(amp, anon_index, &cookie); 7140 ap = anon_get_ptr(amp->ahp, anon_index); 7141 if (ap != NULL) { 7142 swap_xlate(ap, &vp, &off); 7143 } else { 7144 if (svd->vp == NULL && 7145 (svd->flags & MAP_NORESERVE)) { 7146 anon_array_exit(&cookie); 7147 ANON_LOCK_EXIT(&->a_rwlock); 7148 continue; 7149 } 7150 vp = svd->vp; 7151 off = offset; 7152 } 7153 anon_array_exit(&cookie); 7154 ANON_LOCK_EXIT(&->a_rwlock); 7155 } else { 7156 vp = svd->vp; 7157 off = offset; 7158 } 7159 7160 /* 7161 * Get page frame. It's ok if the page is 7162 * not available when we're unlocking, as this 7163 * may simply mean that a page we locked got 7164 * truncated out of existence after we locked it. 7165 * 7166 * Invoke VOP_GETPAGE() to obtain the page struct 7167 * since we may need to read it from disk if its 7168 * been paged out. 7169 */ 7170 if (op != MC_LOCK) 7171 pp = page_lookup(vp, off, SE_SHARED); 7172 else { 7173 page_t *pl[1 + 1]; 7174 int error; 7175 7176 ASSERT(vp != NULL); 7177 7178 error = VOP_GETPAGE(vp, (offset_t)off, PAGESIZE, 7179 (uint_t *)NULL, pl, PAGESIZE, seg, addr, 7180 S_OTHER, svd->cred); 7181 7182 /* 7183 * If the error is EDEADLK then we must bounce 7184 * up and drop all vm subsystem locks and then 7185 * retry the operation later 7186 * This behavior is a temporary measure because 7187 * ufs/sds logging is badly designed and will 7188 * deadlock if we don't allow this bounce to 7189 * happen. The real solution is to re-design 7190 * the logging code to work properly. See bug 7191 * 4125102 for details of the problem. 7192 */ 7193 if (error == EDEADLK) { 7194 err = error; 7195 goto out; 7196 } 7197 /* 7198 * Quit if we fail to fault in the page. Treat 7199 * the failure as an error, unless the addr 7200 * is mapped beyond the end of a file. 7201 */ 7202 if (error && svd->vp) { 7203 va.va_mask = AT_SIZE; 7204 if (VOP_GETATTR(svd->vp, &va, 0, 7205 svd->cred) != 0) { 7206 err = EIO; 7207 goto out; 7208 } 7209 if (btopr(va.va_size) >= 7210 btopr(off + 1)) { 7211 err = EIO; 7212 goto out; 7213 } 7214 goto out; 7215 7216 } else if (error) { 7217 err = EIO; 7218 goto out; 7219 } 7220 pp = pl[0]; 7221 ASSERT(pp != NULL); 7222 } 7223 7224 /* 7225 * See Statement at the beginning of this routine. 7226 * 7227 * claim is always set if MAP_PRIVATE and PROT_WRITE 7228 * irrespective of following factors: 7229 * 7230 * (1) anon slots are populated or not 7231 * (2) cow is broken or not 7232 * (3) refcnt on ap is 1 or greater than 1 7233 * 7234 * See 4140683 for details 7235 */ 7236 claim = ((VPP_PROT(vpp) & PROT_WRITE) && 7237 (svd->type == MAP_PRIVATE)); 7238 7239 /* 7240 * Perform page-level operation appropriate to 7241 * operation. If locking, undo the SOFTLOCK 7242 * performed to bring the page into memory 7243 * after setting the lock. If unlocking, 7244 * and no page was found, account for the claim 7245 * separately. 7246 */ 7247 if (op == MC_LOCK) { 7248 int ret = 1; /* Assume success */ 7249 7250 ASSERT(!VPP_ISPPLOCK(vpp)); 7251 7252 ret = page_pp_lock(pp, claim, 0); 7253 if (ret == 0) { 7254 /* locking page failed */ 7255 page_unlock(pp); 7256 err = EAGAIN; 7257 goto out; 7258 } 7259 VPP_SETPPLOCK(vpp); 7260 if (sp != NULL) { 7261 if (pp->p_lckcnt == 1) 7262 locked_bytes += PAGESIZE; 7263 } else 7264 locked_bytes += PAGESIZE; 7265 7266 if (lockmap != (ulong_t *)NULL) 7267 BT_SET(lockmap, pos); 7268 7269 page_unlock(pp); 7270 } else { 7271 ASSERT(VPP_ISPPLOCK(vpp)); 7272 if (pp != NULL) { 7273 /* sysV pages should be locked */ 7274 ASSERT(sp == NULL || pp->p_lckcnt > 0); 7275 page_pp_unlock(pp, claim, 0); 7276 if (sp != NULL) { 7277 if (pp->p_lckcnt == 0) 7278 unlocked_bytes 7279 += PAGESIZE; 7280 } else 7281 unlocked_bytes += PAGESIZE; 7282 page_unlock(pp); 7283 } else { 7284 ASSERT(sp == NULL); 7285 unlocked_bytes += PAGESIZE; 7286 } 7287 VPP_CLRPPLOCK(vpp); 7288 } 7289 } 7290 } 7291 out: 7292 if (op == MC_LOCK) { 7293 /* Credit back bytes that did not get locked */ 7294 if ((unlocked_bytes - locked_bytes) > 0) { 7295 if (proj == NULL) 7296 mutex_enter(&p->p_lock); 7297 rctl_decr_locked_mem(p, proj, 7298 (unlocked_bytes - locked_bytes), chargeproc); 7299 if (proj == NULL) 7300 mutex_exit(&p->p_lock); 7301 } 7302 7303 } else { 7304 /* Account bytes that were unlocked */ 7305 if (unlocked_bytes > 0) { 7306 if (proj == NULL) 7307 mutex_enter(&p->p_lock); 7308 rctl_decr_locked_mem(p, proj, unlocked_bytes, 7309 chargeproc); 7310 if (proj == NULL) 7311 mutex_exit(&p->p_lock); 7312 } 7313 } 7314 if (sp != NULL) 7315 mutex_exit(&sp->shm_mlock); 7316 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7317 7318 return (err); 7319 } 7320 7321 /* 7322 * Set advice from user for specified pages 7323 * There are 5 types of advice: 7324 * MADV_NORMAL - Normal (default) behavior (whatever that is) 7325 * MADV_RANDOM - Random page references 7326 * do not allow readahead or 'klustering' 7327 * MADV_SEQUENTIAL - Sequential page references 7328 * Pages previous to the one currently being 7329 * accessed (determined by fault) are 'not needed' 7330 * and are freed immediately 7331 * MADV_WILLNEED - Pages are likely to be used (fault ahead in mctl) 7332 * MADV_DONTNEED - Pages are not needed (synced out in mctl) 7333 * MADV_FREE - Contents can be discarded 7334 * MADV_ACCESS_DEFAULT- Default access 7335 * MADV_ACCESS_LWP - Next LWP will access heavily 7336 * MADV_ACCESS_MANY- Many LWPs or processes will access heavily 7337 */ 7338 static int 7339 segvn_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav) 7340 { 7341 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7342 size_t page; 7343 int err = 0; 7344 int already_set; 7345 struct anon_map *amp; 7346 ulong_t anon_index; 7347 struct seg *next; 7348 lgrp_mem_policy_t policy; 7349 struct seg *prev; 7350 struct vnode *vp; 7351 7352 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 7353 7354 /* 7355 * In case of MADV_FREE, we won't be modifying any segment private 7356 * data structures; so, we only need to grab READER's lock 7357 */ 7358 if (behav != MADV_FREE) 7359 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 7360 else 7361 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7362 7363 /* 7364 * Large pages are assumed to be only turned on when accesses to the 7365 * segment's address range have spatial and temporal locality. That 7366 * justifies ignoring MADV_SEQUENTIAL for large page segments. 7367 * Also, ignore advice affecting lgroup memory allocation 7368 * if don't need to do lgroup optimizations on this system 7369 */ 7370 7371 if ((behav == MADV_SEQUENTIAL && seg->s_szc != 0) || 7372 (!lgrp_optimizations() && (behav == MADV_ACCESS_DEFAULT || 7373 behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY))) { 7374 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7375 return (0); 7376 } 7377 7378 if (behav == MADV_SEQUENTIAL || behav == MADV_ACCESS_DEFAULT || 7379 behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY) { 7380 /* 7381 * Since we are going to unload hat mappings 7382 * we first have to flush the cache. Otherwise 7383 * this might lead to system panic if another 7384 * thread is doing physio on the range whose 7385 * mappings are unloaded by madvise(3C). 7386 */ 7387 if (svd->softlockcnt > 0) { 7388 /* 7389 * Since we do have the segvn writers lock 7390 * nobody can fill the cache with entries 7391 * belonging to this seg during the purge. 7392 * The flush either succeeds or we still 7393 * have pending I/Os. In the later case, 7394 * madvise(3C) fails. 7395 */ 7396 segvn_purge(seg); 7397 if (svd->softlockcnt > 0) { 7398 /* 7399 * Since madvise(3C) is advisory and 7400 * it's not part of UNIX98, madvise(3C) 7401 * failure here doesn't cause any hardship. 7402 * Note that we don't block in "as" layer. 7403 */ 7404 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7405 return (EAGAIN); 7406 } 7407 } 7408 } 7409 7410 amp = svd->amp; 7411 vp = svd->vp; 7412 if (behav == MADV_FREE) { 7413 /* 7414 * MADV_FREE is not supported for segments with 7415 * underlying object; if anonmap is NULL, anon slots 7416 * are not yet populated and there is nothing for 7417 * us to do. As MADV_FREE is advisory, we don't 7418 * return error in either case. 7419 */ 7420 if (vp || amp == NULL) { 7421 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7422 return (0); 7423 } 7424 7425 page = seg_page(seg, addr); 7426 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7427 anon_disclaim(amp, svd->anon_index + page, len, 0); 7428 ANON_LOCK_EXIT(&->a_rwlock); 7429 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7430 return (0); 7431 } 7432 7433 /* 7434 * If advice is to be applied to entire segment, 7435 * use advice field in seg_data structure 7436 * otherwise use appropriate vpage entry. 7437 */ 7438 if ((addr == seg->s_base) && (len == seg->s_size)) { 7439 switch (behav) { 7440 case MADV_ACCESS_LWP: 7441 case MADV_ACCESS_MANY: 7442 case MADV_ACCESS_DEFAULT: 7443 /* 7444 * Set memory allocation policy for this segment 7445 */ 7446 policy = lgrp_madv_to_policy(behav, len, svd->type); 7447 if (svd->type == MAP_SHARED) 7448 already_set = lgrp_shm_policy_set(policy, amp, 7449 svd->anon_index, vp, svd->offset, len); 7450 else { 7451 /* 7452 * For private memory, need writers lock on 7453 * address space because the segment may be 7454 * split or concatenated when changing policy 7455 */ 7456 if (AS_READ_HELD(seg->s_as, 7457 &seg->s_as->a_lock)) { 7458 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7459 return (IE_RETRY); 7460 } 7461 7462 already_set = lgrp_privm_policy_set(policy, 7463 &svd->policy_info, len); 7464 } 7465 7466 /* 7467 * If policy set already and it shouldn't be reapplied, 7468 * don't do anything. 7469 */ 7470 if (already_set && 7471 !LGRP_MEM_POLICY_REAPPLICABLE(policy)) 7472 break; 7473 7474 /* 7475 * Mark any existing pages in given range for 7476 * migration 7477 */ 7478 page_mark_migrate(seg, addr, len, amp, svd->anon_index, 7479 vp, svd->offset, 1); 7480 7481 /* 7482 * If same policy set already or this is a shared 7483 * memory segment, don't need to try to concatenate 7484 * segment with adjacent ones. 7485 */ 7486 if (already_set || svd->type == MAP_SHARED) 7487 break; 7488 7489 /* 7490 * Try to concatenate this segment with previous 7491 * one and next one, since we changed policy for 7492 * this one and it may be compatible with adjacent 7493 * ones now. 7494 */ 7495 prev = AS_SEGPREV(seg->s_as, seg); 7496 next = AS_SEGNEXT(seg->s_as, seg); 7497 7498 if (next && next->s_ops == &segvn_ops && 7499 addr + len == next->s_base) 7500 (void) segvn_concat(seg, next, 1); 7501 7502 if (prev && prev->s_ops == &segvn_ops && 7503 addr == prev->s_base + prev->s_size) { 7504 /* 7505 * Drop lock for private data of current 7506 * segment before concatenating (deleting) it 7507 * and return IE_REATTACH to tell as_ctl() that 7508 * current segment has changed 7509 */ 7510 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7511 if (!segvn_concat(prev, seg, 1)) 7512 err = IE_REATTACH; 7513 7514 return (err); 7515 } 7516 break; 7517 7518 case MADV_SEQUENTIAL: 7519 /* 7520 * unloading mapping guarantees 7521 * detection in segvn_fault 7522 */ 7523 ASSERT(seg->s_szc == 0); 7524 hat_unload(seg->s_as->a_hat, addr, len, 7525 HAT_UNLOAD); 7526 /* FALLTHROUGH */ 7527 case MADV_NORMAL: 7528 case MADV_RANDOM: 7529 svd->advice = (uchar_t)behav; 7530 svd->pageadvice = 0; 7531 break; 7532 case MADV_WILLNEED: /* handled in memcntl */ 7533 case MADV_DONTNEED: /* handled in memcntl */ 7534 case MADV_FREE: /* handled above */ 7535 break; 7536 default: 7537 err = EINVAL; 7538 } 7539 } else { 7540 caddr_t eaddr; 7541 struct seg *new_seg; 7542 struct segvn_data *new_svd; 7543 u_offset_t off; 7544 caddr_t oldeaddr; 7545 7546 page = seg_page(seg, addr); 7547 7548 segvn_vpage(seg); 7549 7550 switch (behav) { 7551 struct vpage *bvpp, *evpp; 7552 7553 case MADV_ACCESS_LWP: 7554 case MADV_ACCESS_MANY: 7555 case MADV_ACCESS_DEFAULT: 7556 /* 7557 * Set memory allocation policy for portion of this 7558 * segment 7559 */ 7560 7561 /* 7562 * Align address and length of advice to page 7563 * boundaries for large pages 7564 */ 7565 if (seg->s_szc != 0) { 7566 size_t pgsz; 7567 7568 pgsz = page_get_pagesize(seg->s_szc); 7569 addr = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 7570 len = P2ROUNDUP(len, pgsz); 7571 } 7572 7573 /* 7574 * Check to see whether policy is set already 7575 */ 7576 policy = lgrp_madv_to_policy(behav, len, svd->type); 7577 7578 anon_index = svd->anon_index + page; 7579 off = svd->offset + (uintptr_t)(addr - seg->s_base); 7580 7581 if (svd->type == MAP_SHARED) 7582 already_set = lgrp_shm_policy_set(policy, amp, 7583 anon_index, vp, off, len); 7584 else 7585 already_set = 7586 (policy == svd->policy_info.mem_policy); 7587 7588 /* 7589 * If policy set already and it shouldn't be reapplied, 7590 * don't do anything. 7591 */ 7592 if (already_set && 7593 !LGRP_MEM_POLICY_REAPPLICABLE(policy)) 7594 break; 7595 7596 /* 7597 * For private memory, need writers lock on 7598 * address space because the segment may be 7599 * split or concatenated when changing policy 7600 */ 7601 if (svd->type == MAP_PRIVATE && 7602 AS_READ_HELD(seg->s_as, &seg->s_as->a_lock)) { 7603 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7604 return (IE_RETRY); 7605 } 7606 7607 /* 7608 * Mark any existing pages in given range for 7609 * migration 7610 */ 7611 page_mark_migrate(seg, addr, len, amp, svd->anon_index, 7612 vp, svd->offset, 1); 7613 7614 /* 7615 * Don't need to try to split or concatenate 7616 * segments, since policy is same or this is a shared 7617 * memory segment 7618 */ 7619 if (already_set || svd->type == MAP_SHARED) 7620 break; 7621 7622 /* 7623 * Split off new segment if advice only applies to a 7624 * portion of existing segment starting in middle 7625 */ 7626 new_seg = NULL; 7627 eaddr = addr + len; 7628 oldeaddr = seg->s_base + seg->s_size; 7629 if (addr > seg->s_base) { 7630 /* 7631 * Must flush I/O page cache 7632 * before splitting segment 7633 */ 7634 if (svd->softlockcnt > 0) 7635 segvn_purge(seg); 7636 7637 /* 7638 * Split segment and return IE_REATTACH to tell 7639 * as_ctl() that current segment changed 7640 */ 7641 new_seg = segvn_split_seg(seg, addr); 7642 new_svd = (struct segvn_data *)new_seg->s_data; 7643 err = IE_REATTACH; 7644 7645 /* 7646 * If new segment ends where old one 7647 * did, try to concatenate the new 7648 * segment with next one. 7649 */ 7650 if (eaddr == oldeaddr) { 7651 /* 7652 * Set policy for new segment 7653 */ 7654 (void) lgrp_privm_policy_set(policy, 7655 &new_svd->policy_info, 7656 new_seg->s_size); 7657 7658 next = AS_SEGNEXT(new_seg->s_as, 7659 new_seg); 7660 7661 if (next && 7662 next->s_ops == &segvn_ops && 7663 eaddr == next->s_base) 7664 (void) segvn_concat(new_seg, 7665 next, 1); 7666 } 7667 } 7668 7669 /* 7670 * Split off end of existing segment if advice only 7671 * applies to a portion of segment ending before 7672 * end of the existing segment 7673 */ 7674 if (eaddr < oldeaddr) { 7675 /* 7676 * Must flush I/O page cache 7677 * before splitting segment 7678 */ 7679 if (svd->softlockcnt > 0) 7680 segvn_purge(seg); 7681 7682 /* 7683 * If beginning of old segment was already 7684 * split off, use new segment to split end off 7685 * from. 7686 */ 7687 if (new_seg != NULL && new_seg != seg) { 7688 /* 7689 * Split segment 7690 */ 7691 (void) segvn_split_seg(new_seg, eaddr); 7692 7693 /* 7694 * Set policy for new segment 7695 */ 7696 (void) lgrp_privm_policy_set(policy, 7697 &new_svd->policy_info, 7698 new_seg->s_size); 7699 } else { 7700 /* 7701 * Split segment and return IE_REATTACH 7702 * to tell as_ctl() that current 7703 * segment changed 7704 */ 7705 (void) segvn_split_seg(seg, eaddr); 7706 err = IE_REATTACH; 7707 7708 (void) lgrp_privm_policy_set(policy, 7709 &svd->policy_info, seg->s_size); 7710 7711 /* 7712 * If new segment starts where old one 7713 * did, try to concatenate it with 7714 * previous segment. 7715 */ 7716 if (addr == seg->s_base) { 7717 prev = AS_SEGPREV(seg->s_as, 7718 seg); 7719 7720 /* 7721 * Drop lock for private data 7722 * of current segment before 7723 * concatenating (deleting) it 7724 */ 7725 if (prev && 7726 prev->s_ops == 7727 &segvn_ops && 7728 addr == prev->s_base + 7729 prev->s_size) { 7730 SEGVN_LOCK_EXIT( 7731 seg->s_as, 7732 &svd->lock); 7733 (void) segvn_concat( 7734 prev, seg, 1); 7735 return (err); 7736 } 7737 } 7738 } 7739 } 7740 break; 7741 case MADV_SEQUENTIAL: 7742 ASSERT(seg->s_szc == 0); 7743 hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD); 7744 /* FALLTHROUGH */ 7745 case MADV_NORMAL: 7746 case MADV_RANDOM: 7747 bvpp = &svd->vpage[page]; 7748 evpp = &svd->vpage[page + (len >> PAGESHIFT)]; 7749 for (; bvpp < evpp; bvpp++) 7750 VPP_SETADVICE(bvpp, behav); 7751 svd->advice = MADV_NORMAL; 7752 break; 7753 case MADV_WILLNEED: /* handled in memcntl */ 7754 case MADV_DONTNEED: /* handled in memcntl */ 7755 case MADV_FREE: /* handled above */ 7756 break; 7757 default: 7758 err = EINVAL; 7759 } 7760 } 7761 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7762 return (err); 7763 } 7764 7765 /* 7766 * Create a vpage structure for this seg. 7767 */ 7768 static void 7769 segvn_vpage(struct seg *seg) 7770 { 7771 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7772 struct vpage *vp, *evp; 7773 7774 ASSERT(SEGVN_WRITE_HELD(seg->s_as, &svd->lock)); 7775 7776 /* 7777 * If no vpage structure exists, allocate one. Copy the protections 7778 * and the advice from the segment itself to the individual pages. 7779 */ 7780 if (svd->vpage == NULL) { 7781 svd->pageprot = 1; 7782 svd->pageadvice = 1; 7783 svd->vpage = kmem_zalloc(seg_pages(seg) * sizeof (struct vpage), 7784 KM_SLEEP); 7785 evp = &svd->vpage[seg_page(seg, seg->s_base + seg->s_size)]; 7786 for (vp = svd->vpage; vp < evp; vp++) { 7787 VPP_SETPROT(vp, svd->prot); 7788 VPP_SETADVICE(vp, svd->advice); 7789 } 7790 } 7791 } 7792 7793 /* 7794 * Dump the pages belonging to this segvn segment. 7795 */ 7796 static void 7797 segvn_dump(struct seg *seg) 7798 { 7799 struct segvn_data *svd; 7800 page_t *pp; 7801 struct anon_map *amp; 7802 ulong_t anon_index; 7803 struct vnode *vp; 7804 u_offset_t off, offset; 7805 pfn_t pfn; 7806 pgcnt_t page, npages; 7807 caddr_t addr; 7808 7809 npages = seg_pages(seg); 7810 svd = (struct segvn_data *)seg->s_data; 7811 vp = svd->vp; 7812 off = offset = svd->offset; 7813 addr = seg->s_base; 7814 7815 if ((amp = svd->amp) != NULL) { 7816 anon_index = svd->anon_index; 7817 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7818 } 7819 7820 for (page = 0; page < npages; page++, offset += PAGESIZE) { 7821 struct anon *ap; 7822 int we_own_it = 0; 7823 7824 if (amp && (ap = anon_get_ptr(svd->amp->ahp, anon_index++))) { 7825 swap_xlate_nopanic(ap, &vp, &off); 7826 } else { 7827 vp = svd->vp; 7828 off = offset; 7829 } 7830 7831 /* 7832 * If pp == NULL, the page either does not exist 7833 * or is exclusively locked. So determine if it 7834 * exists before searching for it. 7835 */ 7836 7837 if ((pp = page_lookup_nowait(vp, off, SE_SHARED))) 7838 we_own_it = 1; 7839 else 7840 pp = page_exists(vp, off); 7841 7842 if (pp) { 7843 pfn = page_pptonum(pp); 7844 dump_addpage(seg->s_as, addr, pfn); 7845 if (we_own_it) 7846 page_unlock(pp); 7847 } 7848 addr += PAGESIZE; 7849 dump_timeleft = dump_timeout; 7850 } 7851 7852 if (amp != NULL) 7853 ANON_LOCK_EXIT(&->a_rwlock); 7854 } 7855 7856 /* 7857 * lock/unlock anon pages over a given range. Return shadow list 7858 */ 7859 static int 7860 segvn_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***ppp, 7861 enum lock_type type, enum seg_rw rw) 7862 { 7863 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7864 size_t np, adjustpages = 0, npages = (len >> PAGESHIFT); 7865 ulong_t anon_index; 7866 uint_t protchk; 7867 uint_t error; 7868 struct anon_map *amp; 7869 struct page **pplist, **pl, *pp; 7870 caddr_t a; 7871 size_t page; 7872 caddr_t lpgaddr, lpgeaddr; 7873 pgcnt_t szc0_npages = 0; 7874 7875 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_START, 7876 "segvn_pagelock: start seg %p addr %p", seg, addr); 7877 7878 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 7879 if (seg->s_szc != 0 && (type == L_PAGELOCK || type == L_PAGEUNLOCK)) { 7880 /* 7881 * We are adjusting the pagelock region to the large page size 7882 * boundary because the unlocked part of a large page cannot 7883 * be freed anyway unless all constituent pages of a large 7884 * page are locked. Therefore this adjustment allows us to 7885 * decrement availrmem by the right value (note we don't want 7886 * to just decrement availrem by the large page size without 7887 * adjusting addr and len because then we may end up 7888 * decrementing availrmem by large page size for every 7889 * constituent page locked by a new as_pagelock call). 7890 * as_pageunlock caller must always match as_pagelock call's 7891 * addr and len. 7892 * 7893 * Note segment's page size cannot change while we are holding 7894 * as lock. And then it cannot change while softlockcnt is 7895 * not 0. This will allow us to correctly recalculate large 7896 * page size region for the matching pageunlock/reclaim call. 7897 * 7898 * for pageunlock *ppp points to the pointer of page_t that 7899 * corresponds to the real unadjusted start address. Similar 7900 * for pagelock *ppp must point to the pointer of page_t that 7901 * corresponds to the real unadjusted start address. 7902 */ 7903 size_t pgsz = page_get_pagesize(seg->s_szc); 7904 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 7905 adjustpages = ((uintptr_t)(addr - lpgaddr)) >> PAGESHIFT; 7906 } 7907 7908 if (type == L_PAGEUNLOCK) { 7909 7910 /* 7911 * update hat ref bits for /proc. We need to make sure 7912 * that threads tracing the ref and mod bits of the 7913 * address space get the right data. 7914 * Note: page ref and mod bits are updated at reclaim time 7915 */ 7916 if (seg->s_as->a_vbits) { 7917 for (a = addr; a < addr + len; a += PAGESIZE) { 7918 if (rw == S_WRITE) { 7919 hat_setstat(seg->s_as, a, 7920 PAGESIZE, P_REF | P_MOD); 7921 } else { 7922 hat_setstat(seg->s_as, a, 7923 PAGESIZE, P_REF); 7924 } 7925 } 7926 } 7927 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7928 if (seg->s_szc != 0) { 7929 VM_STAT_ADD(segvnvmstats.pagelock[0]); 7930 seg_pinactive(seg, lpgaddr, lpgeaddr - lpgaddr, 7931 *ppp - adjustpages, rw, segvn_reclaim); 7932 } else { 7933 seg_pinactive(seg, addr, len, *ppp, rw, segvn_reclaim); 7934 } 7935 7936 /* 7937 * If someone is blocked while unmapping, we purge 7938 * segment page cache and thus reclaim pplist synchronously 7939 * without waiting for seg_pasync_thread. This speeds up 7940 * unmapping in cases where munmap(2) is called, while 7941 * raw async i/o is still in progress or where a thread 7942 * exits on data fault in a multithreaded application. 7943 */ 7944 if (AS_ISUNMAPWAIT(seg->s_as) && (svd->softlockcnt > 0)) { 7945 /* 7946 * Even if we grab segvn WRITER's lock or segp_slock 7947 * here, there might be another thread which could've 7948 * successfully performed lookup/insert just before 7949 * we acquired the lock here. So, grabbing either 7950 * lock here is of not much use. Until we devise 7951 * a strategy at upper layers to solve the 7952 * synchronization issues completely, we expect 7953 * applications to handle this appropriately. 7954 */ 7955 segvn_purge(seg); 7956 } 7957 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7958 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END, 7959 "segvn_pagelock: unlock seg %p addr %p", seg, addr); 7960 return (0); 7961 } else if (type == L_PAGERECLAIM) { 7962 VM_STAT_COND_ADD(seg->s_szc != 0, segvnvmstats.pagelock[1]); 7963 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7964 (void) segvn_reclaim(seg, addr, len, *ppp, rw); 7965 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7966 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END, 7967 "segvn_pagelock: reclaim seg %p addr %p", seg, addr); 7968 return (0); 7969 } 7970 7971 if (seg->s_szc != 0) { 7972 VM_STAT_ADD(segvnvmstats.pagelock[2]); 7973 addr = lpgaddr; 7974 len = lpgeaddr - lpgaddr; 7975 npages = (len >> PAGESHIFT); 7976 } 7977 7978 /* 7979 * for now we only support pagelock to anon memory. We've to check 7980 * protections for vnode objects and call into the vnode driver. 7981 * That's too much for a fast path. Let the fault entry point handle it. 7982 */ 7983 if (svd->vp != NULL) { 7984 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, 7985 "segvn_pagelock: mapped vnode seg %p addr %p", seg, addr); 7986 *ppp = NULL; 7987 return (ENOTSUP); 7988 } 7989 7990 /* 7991 * if anonmap is not yet created, let the fault entry point populate it 7992 * with anon ptrs. 7993 */ 7994 if ((amp = svd->amp) == NULL) { 7995 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, 7996 "segvn_pagelock: anonmap null seg %p addr %p", seg, addr); 7997 *ppp = NULL; 7998 return (EFAULT); 7999 } 8000 8001 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 8002 8003 /* 8004 * we acquire segp_slock to prevent duplicate entries 8005 * in seg_pcache 8006 */ 8007 mutex_enter(&svd->segp_slock); 8008 8009 /* 8010 * try to find pages in segment page cache 8011 */ 8012 pplist = seg_plookup(seg, addr, len, rw); 8013 if (pplist != NULL) { 8014 mutex_exit(&svd->segp_slock); 8015 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8016 *ppp = pplist + adjustpages; 8017 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_HIT_END, 8018 "segvn_pagelock: cache hit seg %p addr %p", seg, addr); 8019 return (0); 8020 } 8021 8022 if (rw == S_READ) { 8023 protchk = PROT_READ; 8024 } else { 8025 protchk = PROT_WRITE; 8026 } 8027 8028 if (svd->pageprot == 0) { 8029 if ((svd->prot & protchk) == 0) { 8030 mutex_exit(&svd->segp_slock); 8031 error = EFAULT; 8032 goto out; 8033 } 8034 } else { 8035 /* 8036 * check page protections 8037 */ 8038 for (a = addr; a < addr + len; a += PAGESIZE) { 8039 struct vpage *vp; 8040 8041 vp = &svd->vpage[seg_page(seg, a)]; 8042 if ((VPP_PROT(vp) & protchk) == 0) { 8043 mutex_exit(&svd->segp_slock); 8044 error = EFAULT; 8045 goto out; 8046 } 8047 } 8048 } 8049 8050 /* 8051 * Avoid per page overhead of segvn_pp_lock_anonpages() for small 8052 * pages. For large pages segvn_pp_lock_anonpages() only does real 8053 * work once per large page. The tradeoff is that we may decrement 8054 * availrmem more than once for the same page but this is ok 8055 * for small pages. 8056 */ 8057 if (seg->s_szc == 0) { 8058 mutex_enter(&freemem_lock); 8059 if (availrmem < tune.t_minarmem + npages) { 8060 mutex_exit(&freemem_lock); 8061 mutex_exit(&svd->segp_slock); 8062 error = ENOMEM; 8063 goto out; 8064 } 8065 availrmem -= npages; 8066 mutex_exit(&freemem_lock); 8067 } 8068 8069 pplist = kmem_alloc(sizeof (page_t *) * npages, KM_SLEEP); 8070 pl = pplist; 8071 *ppp = pplist + adjustpages; 8072 8073 page = seg_page(seg, addr); 8074 anon_index = svd->anon_index + page; 8075 8076 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 8077 for (a = addr; a < addr + len; a += PAGESIZE, anon_index++) { 8078 struct anon *ap; 8079 struct vnode *vp; 8080 u_offset_t off; 8081 anon_sync_obj_t cookie; 8082 8083 anon_array_enter(amp, anon_index, &cookie); 8084 ap = anon_get_ptr(amp->ahp, anon_index); 8085 if (ap == NULL) { 8086 anon_array_exit(&cookie); 8087 break; 8088 } else { 8089 /* 8090 * We must never use seg_pcache for COW pages 8091 * because we might end up with original page still 8092 * lying in seg_pcache even after private page is 8093 * created. This leads to data corruption as 8094 * aio_write refers to the page still in cache 8095 * while all other accesses refer to the private 8096 * page. 8097 */ 8098 if (ap->an_refcnt != 1) { 8099 anon_array_exit(&cookie); 8100 break; 8101 } 8102 } 8103 swap_xlate(ap, &vp, &off); 8104 anon_array_exit(&cookie); 8105 8106 pp = page_lookup_nowait(vp, off, SE_SHARED); 8107 if (pp == NULL) { 8108 break; 8109 } 8110 if (seg->s_szc != 0 || pp->p_szc != 0) { 8111 if (!segvn_pp_lock_anonpages(pp, a == addr)) { 8112 page_unlock(pp); 8113 break; 8114 } 8115 } else { 8116 szc0_npages++; 8117 } 8118 *pplist++ = pp; 8119 } 8120 ANON_LOCK_EXIT(&->a_rwlock); 8121 8122 ASSERT(npages >= szc0_npages); 8123 8124 if (a >= addr + len) { 8125 mutex_enter(&freemem_lock); 8126 if (seg->s_szc == 0 && npages != szc0_npages) { 8127 ASSERT(svd->type == MAP_SHARED && amp->a_szc > 0); 8128 availrmem += (npages - szc0_npages); 8129 } 8130 svd->softlockcnt += npages; 8131 segvn_pages_locked += npages; 8132 mutex_exit(&freemem_lock); 8133 (void) seg_pinsert(seg, addr, len, pl, rw, SEGP_ASYNC_FLUSH, 8134 segvn_reclaim); 8135 mutex_exit(&svd->segp_slock); 8136 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8137 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_FILL_END, 8138 "segvn_pagelock: cache fill seg %p addr %p", seg, addr); 8139 return (0); 8140 } 8141 8142 mutex_exit(&svd->segp_slock); 8143 if (seg->s_szc == 0) { 8144 mutex_enter(&freemem_lock); 8145 availrmem += npages; 8146 mutex_exit(&freemem_lock); 8147 } 8148 error = EFAULT; 8149 pplist = pl; 8150 np = ((uintptr_t)(a - addr)) >> PAGESHIFT; 8151 while (np > (uint_t)0) { 8152 ASSERT(PAGE_LOCKED(*pplist)); 8153 if (seg->s_szc != 0 || (*pplist)->p_szc != 0) { 8154 segvn_pp_unlock_anonpages(*pplist, pplist == pl); 8155 } 8156 page_unlock(*pplist); 8157 np--; 8158 pplist++; 8159 } 8160 kmem_free(pl, sizeof (page_t *) * npages); 8161 out: 8162 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8163 *ppp = NULL; 8164 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, 8165 "segvn_pagelock: cache miss seg %p addr %p", seg, addr); 8166 return (error); 8167 } 8168 8169 /* 8170 * purge any cached pages in the I/O page cache 8171 */ 8172 static void 8173 segvn_purge(struct seg *seg) 8174 { 8175 seg_ppurge(seg); 8176 } 8177 8178 static int 8179 segvn_reclaim(struct seg *seg, caddr_t addr, size_t len, struct page **pplist, 8180 enum seg_rw rw) 8181 { 8182 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 8183 pgcnt_t np, npages; 8184 struct page **pl; 8185 pgcnt_t szc0_npages = 0; 8186 8187 #ifdef lint 8188 addr = addr; 8189 #endif 8190 8191 npages = np = (len >> PAGESHIFT); 8192 ASSERT(npages); 8193 pl = pplist; 8194 if (seg->s_szc != 0) { 8195 size_t pgsz = page_get_pagesize(seg->s_szc); 8196 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { 8197 panic("segvn_reclaim: unaligned addr or len"); 8198 /*NOTREACHED*/ 8199 } 8200 } 8201 8202 ASSERT(svd->vp == NULL && svd->amp != NULL); 8203 8204 while (np > (uint_t)0) { 8205 if (rw == S_WRITE) { 8206 hat_setrefmod(*pplist); 8207 } else { 8208 hat_setref(*pplist); 8209 } 8210 if (seg->s_szc != 0 || (*pplist)->p_szc != 0) { 8211 segvn_pp_unlock_anonpages(*pplist, pplist == pl); 8212 } else { 8213 szc0_npages++; 8214 } 8215 page_unlock(*pplist); 8216 np--; 8217 pplist++; 8218 } 8219 kmem_free(pl, sizeof (page_t *) * npages); 8220 8221 mutex_enter(&freemem_lock); 8222 segvn_pages_locked -= npages; 8223 svd->softlockcnt -= npages; 8224 if (szc0_npages != 0) { 8225 availrmem += szc0_npages; 8226 } 8227 mutex_exit(&freemem_lock); 8228 if (svd->softlockcnt <= 0) { 8229 if (AS_ISUNMAPWAIT(seg->s_as)) { 8230 mutex_enter(&seg->s_as->a_contents); 8231 if (AS_ISUNMAPWAIT(seg->s_as)) { 8232 AS_CLRUNMAPWAIT(seg->s_as); 8233 cv_broadcast(&seg->s_as->a_cv); 8234 } 8235 mutex_exit(&seg->s_as->a_contents); 8236 } 8237 } 8238 return (0); 8239 } 8240 /* 8241 * get a memory ID for an addr in a given segment 8242 * 8243 * XXX only creates PAGESIZE pages if anon slots are not initialized. 8244 * At fault time they will be relocated into larger pages. 8245 */ 8246 static int 8247 segvn_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp) 8248 { 8249 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 8250 struct anon *ap = NULL; 8251 ulong_t anon_index; 8252 struct anon_map *amp; 8253 anon_sync_obj_t cookie; 8254 8255 if (svd->type == MAP_PRIVATE) { 8256 memidp->val[0] = (uintptr_t)seg->s_as; 8257 memidp->val[1] = (uintptr_t)addr; 8258 return (0); 8259 } 8260 8261 if (svd->type == MAP_SHARED) { 8262 if (svd->vp) { 8263 memidp->val[0] = (uintptr_t)svd->vp; 8264 memidp->val[1] = (u_longlong_t)svd->offset + 8265 (uintptr_t)(addr - seg->s_base); 8266 return (0); 8267 } else { 8268 8269 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 8270 if ((amp = svd->amp) != NULL) { 8271 anon_index = svd->anon_index + 8272 seg_page(seg, addr); 8273 } 8274 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8275 8276 ASSERT(amp != NULL); 8277 8278 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 8279 anon_array_enter(amp, anon_index, &cookie); 8280 ap = anon_get_ptr(amp->ahp, anon_index); 8281 if (ap == NULL) { 8282 page_t *pp; 8283 8284 pp = anon_zero(seg, addr, &ap, svd->cred); 8285 if (pp == NULL) { 8286 anon_array_exit(&cookie); 8287 ANON_LOCK_EXIT(&->a_rwlock); 8288 return (ENOMEM); 8289 } 8290 ASSERT(anon_get_ptr(amp->ahp, anon_index) 8291 == NULL); 8292 (void) anon_set_ptr(amp->ahp, anon_index, 8293 ap, ANON_SLEEP); 8294 page_unlock(pp); 8295 } 8296 8297 anon_array_exit(&cookie); 8298 ANON_LOCK_EXIT(&->a_rwlock); 8299 8300 memidp->val[0] = (uintptr_t)ap; 8301 memidp->val[1] = (uintptr_t)addr & PAGEOFFSET; 8302 return (0); 8303 } 8304 } 8305 return (EINVAL); 8306 } 8307 8308 static int 8309 sameprot(struct seg *seg, caddr_t a, size_t len) 8310 { 8311 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 8312 struct vpage *vpage; 8313 spgcnt_t pages = btop(len); 8314 uint_t prot; 8315 8316 if (svd->pageprot == 0) 8317 return (1); 8318 8319 ASSERT(svd->vpage != NULL); 8320 8321 vpage = &svd->vpage[seg_page(seg, a)]; 8322 prot = VPP_PROT(vpage); 8323 vpage++; 8324 pages--; 8325 while (pages-- > 0) { 8326 if (prot != VPP_PROT(vpage)) 8327 return (0); 8328 vpage++; 8329 } 8330 return (1); 8331 } 8332 8333 /* 8334 * Get memory allocation policy info for specified address in given segment 8335 */ 8336 static lgrp_mem_policy_info_t * 8337 segvn_getpolicy(struct seg *seg, caddr_t addr) 8338 { 8339 struct anon_map *amp; 8340 ulong_t anon_index; 8341 lgrp_mem_policy_info_t *policy_info; 8342 struct segvn_data *svn_data; 8343 u_offset_t vn_off; 8344 vnode_t *vp; 8345 8346 ASSERT(seg != NULL); 8347 8348 svn_data = (struct segvn_data *)seg->s_data; 8349 if (svn_data == NULL) 8350 return (NULL); 8351 8352 /* 8353 * Get policy info for private or shared memory 8354 */ 8355 if (svn_data->type != MAP_SHARED) 8356 policy_info = &svn_data->policy_info; 8357 else { 8358 amp = svn_data->amp; 8359 anon_index = svn_data->anon_index + seg_page(seg, addr); 8360 vp = svn_data->vp; 8361 vn_off = svn_data->offset + (uintptr_t)(addr - seg->s_base); 8362 policy_info = lgrp_shm_policy_get(amp, anon_index, vp, vn_off); 8363 } 8364 8365 return (policy_info); 8366 } 8367 8368 /*ARGSUSED*/ 8369 static int 8370 segvn_capable(struct seg *seg, segcapability_t capability) 8371 { 8372 return (0); 8373 } 8374