1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 37 */ 38 39 #pragma ident "%Z%%M% %I% %E% SMI" 40 41 /* 42 * VM - shared or copy-on-write from a vnode/anonymous memory. 43 */ 44 45 #include <sys/types.h> 46 #include <sys/param.h> 47 #include <sys/t_lock.h> 48 #include <sys/errno.h> 49 #include <sys/systm.h> 50 #include <sys/mman.h> 51 #include <sys/debug.h> 52 #include <sys/cred.h> 53 #include <sys/vmsystm.h> 54 #include <sys/tuneable.h> 55 #include <sys/bitmap.h> 56 #include <sys/swap.h> 57 #include <sys/kmem.h> 58 #include <sys/sysmacros.h> 59 #include <sys/vtrace.h> 60 #include <sys/cmn_err.h> 61 #include <sys/vm.h> 62 #include <sys/dumphdr.h> 63 #include <sys/lgrp.h> 64 65 #include <vm/hat.h> 66 #include <vm/as.h> 67 #include <vm/seg.h> 68 #include <vm/seg_vn.h> 69 #include <vm/pvn.h> 70 #include <vm/anon.h> 71 #include <vm/page.h> 72 #include <vm/vpage.h> 73 #include <sys/proc.h> 74 #include <sys/task.h> 75 #include <sys/project.h> 76 #include <sys/zone.h> 77 #include <sys/shm_impl.h> 78 /* 79 * Private seg op routines. 80 */ 81 static int segvn_dup(struct seg *seg, struct seg *newseg); 82 static int segvn_unmap(struct seg *seg, caddr_t addr, size_t len); 83 static void segvn_free(struct seg *seg); 84 static faultcode_t segvn_fault(struct hat *hat, struct seg *seg, 85 caddr_t addr, size_t len, enum fault_type type, 86 enum seg_rw rw); 87 static faultcode_t segvn_faulta(struct seg *seg, caddr_t addr); 88 static int segvn_setprot(struct seg *seg, caddr_t addr, 89 size_t len, uint_t prot); 90 static int segvn_checkprot(struct seg *seg, caddr_t addr, 91 size_t len, uint_t prot); 92 static int segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta); 93 static size_t segvn_swapout(struct seg *seg); 94 static int segvn_sync(struct seg *seg, caddr_t addr, size_t len, 95 int attr, uint_t flags); 96 static size_t segvn_incore(struct seg *seg, caddr_t addr, size_t len, 97 char *vec); 98 static int segvn_lockop(struct seg *seg, caddr_t addr, size_t len, 99 int attr, int op, ulong_t *lockmap, size_t pos); 100 static int segvn_getprot(struct seg *seg, caddr_t addr, size_t len, 101 uint_t *protv); 102 static u_offset_t segvn_getoffset(struct seg *seg, caddr_t addr); 103 static int segvn_gettype(struct seg *seg, caddr_t addr); 104 static int segvn_getvp(struct seg *seg, caddr_t addr, 105 struct vnode **vpp); 106 static int segvn_advise(struct seg *seg, caddr_t addr, size_t len, 107 uint_t behav); 108 static void segvn_dump(struct seg *seg); 109 static int segvn_pagelock(struct seg *seg, caddr_t addr, size_t len, 110 struct page ***ppp, enum lock_type type, enum seg_rw rw); 111 static int segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len, 112 uint_t szc); 113 static int segvn_getmemid(struct seg *seg, caddr_t addr, 114 memid_t *memidp); 115 static lgrp_mem_policy_info_t *segvn_getpolicy(struct seg *, caddr_t); 116 static int segvn_capable(struct seg *seg, segcapability_t capable); 117 118 struct seg_ops segvn_ops = { 119 segvn_dup, 120 segvn_unmap, 121 segvn_free, 122 segvn_fault, 123 segvn_faulta, 124 segvn_setprot, 125 segvn_checkprot, 126 segvn_kluster, 127 segvn_swapout, 128 segvn_sync, 129 segvn_incore, 130 segvn_lockop, 131 segvn_getprot, 132 segvn_getoffset, 133 segvn_gettype, 134 segvn_getvp, 135 segvn_advise, 136 segvn_dump, 137 segvn_pagelock, 138 segvn_setpagesize, 139 segvn_getmemid, 140 segvn_getpolicy, 141 segvn_capable, 142 }; 143 144 /* 145 * Common zfod structures, provided as a shorthand for others to use. 146 */ 147 static segvn_crargs_t zfod_segvn_crargs = 148 SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL); 149 static segvn_crargs_t kzfod_segvn_crargs = 150 SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_USER, 151 PROT_ALL & ~PROT_USER); 152 static segvn_crargs_t stack_noexec_crargs = 153 SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_EXEC, PROT_ALL); 154 155 caddr_t zfod_argsp = (caddr_t)&zfod_segvn_crargs; /* user zfod argsp */ 156 caddr_t kzfod_argsp = (caddr_t)&kzfod_segvn_crargs; /* kernel zfod argsp */ 157 caddr_t stack_exec_argsp = (caddr_t)&zfod_segvn_crargs; /* executable stack */ 158 caddr_t stack_noexec_argsp = (caddr_t)&stack_noexec_crargs; /* noexec stack */ 159 160 #define vpgtob(n) ((n) * sizeof (struct vpage)) /* For brevity */ 161 162 size_t segvn_comb_thrshld = UINT_MAX; /* patchable -- see 1196681 */ 163 164 static int segvn_concat(struct seg *, struct seg *, int); 165 static int segvn_extend_prev(struct seg *, struct seg *, 166 struct segvn_crargs *, size_t); 167 static int segvn_extend_next(struct seg *, struct seg *, 168 struct segvn_crargs *, size_t); 169 static void segvn_softunlock(struct seg *, caddr_t, size_t, enum seg_rw); 170 static void segvn_pagelist_rele(page_t **); 171 static void segvn_setvnode_mpss(vnode_t *); 172 static void segvn_relocate_pages(page_t **, page_t *); 173 static int segvn_full_szcpages(page_t **, uint_t, int *, uint_t *); 174 static int segvn_fill_vp_pages(struct segvn_data *, vnode_t *, u_offset_t, 175 uint_t, page_t **, page_t **, uint_t *, int *); 176 static faultcode_t segvn_fault_vnodepages(struct hat *, struct seg *, caddr_t, 177 caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int); 178 static faultcode_t segvn_fault_anonpages(struct hat *, struct seg *, caddr_t, 179 caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int); 180 static faultcode_t segvn_faultpage(struct hat *, struct seg *, caddr_t, 181 u_offset_t, struct vpage *, page_t **, uint_t, 182 enum fault_type, enum seg_rw, int, int); 183 static void segvn_vpage(struct seg *); 184 185 static void segvn_purge(struct seg *seg); 186 static int segvn_reclaim(struct seg *, caddr_t, size_t, struct page **, 187 enum seg_rw); 188 189 static int sameprot(struct seg *, caddr_t, size_t); 190 191 static int segvn_demote_range(struct seg *, caddr_t, size_t, int, uint_t); 192 static int segvn_clrszc(struct seg *); 193 static struct seg *segvn_split_seg(struct seg *, caddr_t); 194 static int segvn_claim_pages(struct seg *, struct vpage *, u_offset_t, 195 ulong_t, uint_t); 196 197 static int segvn_pp_lock_anonpages(page_t *, int); 198 static void segvn_pp_unlock_anonpages(page_t *, int); 199 200 static struct kmem_cache *segvn_cache; 201 202 #ifdef VM_STATS 203 static struct segvnvmstats_str { 204 ulong_t fill_vp_pages[31]; 205 ulong_t fltvnpages[49]; 206 ulong_t fullszcpages[10]; 207 ulong_t relocatepages[3]; 208 ulong_t fltanpages[17]; 209 ulong_t pagelock[3]; 210 ulong_t demoterange[3]; 211 } segvnvmstats; 212 #endif /* VM_STATS */ 213 214 #define SDR_RANGE 1 /* demote entire range */ 215 #define SDR_END 2 /* demote non aligned ends only */ 216 217 #define CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr) { \ 218 if ((len) != 0) { \ 219 lpgaddr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz); \ 220 ASSERT(lpgaddr >= (seg)->s_base); \ 221 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)((addr) + \ 222 (len)), pgsz); \ 223 ASSERT(lpgeaddr > lpgaddr); \ 224 ASSERT(lpgeaddr <= (seg)->s_base + (seg)->s_size); \ 225 } else { \ 226 lpgeaddr = lpgaddr = (addr); \ 227 } \ 228 } 229 230 /*ARGSUSED*/ 231 static int 232 segvn_cache_constructor(void *buf, void *cdrarg, int kmflags) 233 { 234 struct segvn_data *svd = buf; 235 236 rw_init(&svd->lock, NULL, RW_DEFAULT, NULL); 237 mutex_init(&svd->segp_slock, NULL, MUTEX_DEFAULT, NULL); 238 return (0); 239 } 240 241 /*ARGSUSED1*/ 242 static void 243 segvn_cache_destructor(void *buf, void *cdrarg) 244 { 245 struct segvn_data *svd = buf; 246 247 rw_destroy(&svd->lock); 248 mutex_destroy(&svd->segp_slock); 249 } 250 251 /* 252 * Patching this variable to non-zero allows the system to run with 253 * stacks marked as "not executable". It's a bit of a kludge, but is 254 * provided as a tweakable for platforms that export those ABIs 255 * (e.g. sparc V8) that have executable stacks enabled by default. 256 * There are also some restrictions for platforms that don't actually 257 * implement 'noexec' protections. 258 * 259 * Once enabled, the system is (therefore) unable to provide a fully 260 * ABI-compliant execution environment, though practically speaking, 261 * most everything works. The exceptions are generally some interpreters 262 * and debuggers that create executable code on the stack and jump 263 * into it (without explicitly mprotecting the address range to include 264 * PROT_EXEC). 265 * 266 * One important class of applications that are disabled are those 267 * that have been transformed into malicious agents using one of the 268 * numerous "buffer overflow" attacks. See 4007890. 269 */ 270 int noexec_user_stack = 0; 271 int noexec_user_stack_log = 1; 272 273 int segvn_lpg_disable = 0; 274 uint_t segvn_maxpgszc = 0; 275 276 ulong_t segvn_vmpss_clrszc_cnt; 277 ulong_t segvn_vmpss_clrszc_err; 278 ulong_t segvn_fltvnpages_clrszc_cnt; 279 ulong_t segvn_fltvnpages_clrszc_err; 280 ulong_t segvn_setpgsz_align_err; 281 ulong_t segvn_setpgsz_anon_align_err; 282 ulong_t segvn_setpgsz_getattr_err; 283 ulong_t segvn_setpgsz_eof_err; 284 ulong_t segvn_faultvnmpss_align_err1; 285 ulong_t segvn_faultvnmpss_align_err2; 286 ulong_t segvn_faultvnmpss_align_err3; 287 ulong_t segvn_faultvnmpss_align_err4; 288 ulong_t segvn_faultvnmpss_align_err5; 289 ulong_t segvn_vmpss_pageio_deadlk_err; 290 291 /* 292 * Initialize segvn data structures 293 */ 294 void 295 segvn_init(void) 296 { 297 uint_t maxszc; 298 uint_t szc; 299 size_t pgsz; 300 301 segvn_cache = kmem_cache_create("segvn_cache", 302 sizeof (struct segvn_data), 0, 303 segvn_cache_constructor, segvn_cache_destructor, NULL, 304 NULL, NULL, 0); 305 306 if (segvn_lpg_disable != 0) 307 return; 308 szc = maxszc = page_num_pagesizes() - 1; 309 if (szc == 0) { 310 segvn_lpg_disable = 1; 311 return; 312 } 313 if (page_get_pagesize(0) != PAGESIZE) { 314 panic("segvn_init: bad szc 0"); 315 /*NOTREACHED*/ 316 } 317 while (szc != 0) { 318 pgsz = page_get_pagesize(szc); 319 if (pgsz <= PAGESIZE || !IS_P2ALIGNED(pgsz, pgsz)) { 320 panic("segvn_init: bad szc %d", szc); 321 /*NOTREACHED*/ 322 } 323 szc--; 324 } 325 if (segvn_maxpgszc == 0 || segvn_maxpgszc > maxszc) 326 segvn_maxpgszc = maxszc; 327 } 328 329 #define SEGVN_PAGEIO ((void *)0x1) 330 #define SEGVN_NOPAGEIO ((void *)0x2) 331 332 static void 333 segvn_setvnode_mpss(vnode_t *vp) 334 { 335 int err; 336 337 ASSERT(vp->v_mpssdata == NULL || 338 vp->v_mpssdata == SEGVN_PAGEIO || 339 vp->v_mpssdata == SEGVN_NOPAGEIO); 340 341 if (vp->v_mpssdata == NULL) { 342 if (vn_vmpss_usepageio(vp)) { 343 err = VOP_PAGEIO(vp, (page_t *)NULL, 344 (u_offset_t)0, 0, 0, CRED()); 345 } else { 346 err = ENOSYS; 347 } 348 /* 349 * set v_mpssdata just once per vnode life 350 * so that it never changes. 351 */ 352 mutex_enter(&vp->v_lock); 353 if (vp->v_mpssdata == NULL) { 354 if (err == EINVAL) { 355 vp->v_mpssdata = SEGVN_PAGEIO; 356 } else { 357 vp->v_mpssdata = SEGVN_NOPAGEIO; 358 } 359 } 360 mutex_exit(&vp->v_lock); 361 } 362 } 363 364 int 365 segvn_create(struct seg *seg, void *argsp) 366 { 367 struct segvn_crargs *a = (struct segvn_crargs *)argsp; 368 struct segvn_data *svd; 369 size_t swresv = 0; 370 struct cred *cred; 371 struct anon_map *amp; 372 int error = 0; 373 size_t pgsz; 374 lgrp_mem_policy_t mpolicy = LGRP_MEM_POLICY_DEFAULT; 375 376 377 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 378 379 if (a->type != MAP_PRIVATE && a->type != MAP_SHARED) { 380 panic("segvn_create type"); 381 /*NOTREACHED*/ 382 } 383 384 /* 385 * Check arguments. If a shared anon structure is given then 386 * it is illegal to also specify a vp. 387 */ 388 if (a->amp != NULL && a->vp != NULL) { 389 panic("segvn_create anon_map"); 390 /*NOTREACHED*/ 391 } 392 393 /* MAP_NORESERVE on a MAP_SHARED segment is meaningless. */ 394 if (a->type == MAP_SHARED) 395 a->flags &= ~MAP_NORESERVE; 396 397 if (a->szc != 0) { 398 if (segvn_lpg_disable != 0 || (a->szc == AS_MAP_NO_LPOOB) || 399 (a->amp != NULL && a->type == MAP_PRIVATE) || 400 (a->flags & MAP_NORESERVE) || seg->s_as == &kas) { 401 a->szc = 0; 402 } else { 403 if (a->szc > segvn_maxpgszc) 404 a->szc = segvn_maxpgszc; 405 pgsz = page_get_pagesize(a->szc); 406 if (!IS_P2ALIGNED(seg->s_base, pgsz) || 407 !IS_P2ALIGNED(seg->s_size, pgsz)) { 408 a->szc = 0; 409 } else if (a->vp != NULL) { 410 extern struct vnode kvp; 411 if (IS_SWAPFSVP(a->vp) || VN_ISKAS(a->vp)) { 412 /* 413 * paranoid check. 414 * hat_page_demote() is not supported 415 * on swapfs pages. 416 */ 417 a->szc = 0; 418 } else if (map_addr_vacalign_check(seg->s_base, 419 a->offset & PAGEMASK)) { 420 a->szc = 0; 421 } 422 } else if (a->amp != NULL) { 423 pgcnt_t anum = btopr(a->offset); 424 pgcnt_t pgcnt = page_get_pagecnt(a->szc); 425 if (!IS_P2ALIGNED(anum, pgcnt)) { 426 a->szc = 0; 427 } 428 } 429 } 430 } 431 432 /* 433 * If segment may need private pages, reserve them now. 434 */ 435 if (!(a->flags & MAP_NORESERVE) && ((a->vp == NULL && a->amp == NULL) || 436 (a->type == MAP_PRIVATE && (a->prot & PROT_WRITE)))) { 437 if (anon_resv(seg->s_size) == 0) 438 return (EAGAIN); 439 swresv = seg->s_size; 440 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 441 seg, swresv, 1); 442 } 443 444 /* 445 * Reserve any mapping structures that may be required. 446 */ 447 hat_map(seg->s_as->a_hat, seg->s_base, seg->s_size, HAT_MAP); 448 449 if (a->cred) { 450 cred = a->cred; 451 crhold(cred); 452 } else { 453 crhold(cred = CRED()); 454 } 455 456 /* Inform the vnode of the new mapping */ 457 if (a->vp) { 458 error = VOP_ADDMAP(a->vp, a->offset & PAGEMASK, 459 seg->s_as, seg->s_base, seg->s_size, a->prot, 460 a->maxprot, a->type, cred); 461 if (error) { 462 if (swresv != 0) { 463 anon_unresv(swresv); 464 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 465 "anon proc:%p %lu %u", 466 seg, swresv, 0); 467 } 468 crfree(cred); 469 hat_unload(seg->s_as->a_hat, seg->s_base, 470 seg->s_size, HAT_UNLOAD_UNMAP); 471 return (error); 472 } 473 } 474 475 /* 476 * If more than one segment in the address space, and 477 * they're adjacent virtually, try to concatenate them. 478 * Don't concatenate if an explicit anon_map structure 479 * was supplied (e.g., SystemV shared memory). 480 */ 481 if (a->amp == NULL) { 482 struct seg *pseg, *nseg; 483 struct segvn_data *psvd, *nsvd; 484 lgrp_mem_policy_t ppolicy, npolicy; 485 uint_t lgrp_mem_policy_flags = 0; 486 extern lgrp_mem_policy_t lgrp_mem_default_policy; 487 488 /* 489 * Memory policy flags (lgrp_mem_policy_flags) is valid when 490 * extending stack/heap segments. 491 */ 492 if ((a->vp == NULL) && (a->type == MAP_PRIVATE) && 493 !(a->flags & MAP_NORESERVE) && (seg->s_as != &kas)) { 494 lgrp_mem_policy_flags = a->lgrp_mem_policy_flags; 495 } else { 496 /* 497 * Get policy when not extending it from another segment 498 */ 499 mpolicy = lgrp_mem_policy_default(seg->s_size, a->type); 500 } 501 502 /* 503 * First, try to concatenate the previous and new segments 504 */ 505 pseg = AS_SEGPREV(seg->s_as, seg); 506 if (pseg != NULL && 507 pseg->s_base + pseg->s_size == seg->s_base && 508 pseg->s_ops == &segvn_ops) { 509 /* 510 * Get memory allocation policy from previous segment. 511 * When extension is specified (e.g. for heap) apply 512 * this policy to the new segment regardless of the 513 * outcome of segment concatenation. Extension occurs 514 * for non-default policy otherwise default policy is 515 * used and is based on extended segment size. 516 */ 517 psvd = (struct segvn_data *)pseg->s_data; 518 ppolicy = psvd->policy_info.mem_policy; 519 if (lgrp_mem_policy_flags == 520 LGRP_MP_FLAG_EXTEND_UP) { 521 if (ppolicy != lgrp_mem_default_policy) { 522 mpolicy = ppolicy; 523 } else { 524 mpolicy = lgrp_mem_policy_default( 525 pseg->s_size + seg->s_size, 526 a->type); 527 } 528 } 529 530 if (mpolicy == ppolicy && 531 (pseg->s_size + seg->s_size <= 532 segvn_comb_thrshld || psvd->amp == NULL) && 533 segvn_extend_prev(pseg, seg, a, swresv) == 0) { 534 /* 535 * success! now try to concatenate 536 * with following seg 537 */ 538 crfree(cred); 539 nseg = AS_SEGNEXT(pseg->s_as, pseg); 540 if (nseg != NULL && 541 nseg != pseg && 542 nseg->s_ops == &segvn_ops && 543 pseg->s_base + pseg->s_size == 544 nseg->s_base) 545 (void) segvn_concat(pseg, nseg, 0); 546 ASSERT(pseg->s_szc == 0 || 547 (a->szc == pseg->s_szc && 548 IS_P2ALIGNED(pseg->s_base, pgsz) && 549 IS_P2ALIGNED(pseg->s_size, pgsz))); 550 return (0); 551 } 552 } 553 554 /* 555 * Failed, so try to concatenate with following seg 556 */ 557 nseg = AS_SEGNEXT(seg->s_as, seg); 558 if (nseg != NULL && 559 seg->s_base + seg->s_size == nseg->s_base && 560 nseg->s_ops == &segvn_ops) { 561 /* 562 * Get memory allocation policy from next segment. 563 * When extension is specified (e.g. for stack) apply 564 * this policy to the new segment regardless of the 565 * outcome of segment concatenation. Extension occurs 566 * for non-default policy otherwise default policy is 567 * used and is based on extended segment size. 568 */ 569 nsvd = (struct segvn_data *)nseg->s_data; 570 npolicy = nsvd->policy_info.mem_policy; 571 if (lgrp_mem_policy_flags == 572 LGRP_MP_FLAG_EXTEND_DOWN) { 573 if (npolicy != lgrp_mem_default_policy) { 574 mpolicy = npolicy; 575 } else { 576 mpolicy = lgrp_mem_policy_default( 577 nseg->s_size + seg->s_size, 578 a->type); 579 } 580 } 581 582 if (mpolicy == npolicy && 583 segvn_extend_next(seg, nseg, a, swresv) == 0) { 584 crfree(cred); 585 ASSERT(nseg->s_szc == 0 || 586 (a->szc == nseg->s_szc && 587 IS_P2ALIGNED(nseg->s_base, pgsz) && 588 IS_P2ALIGNED(nseg->s_size, pgsz))); 589 return (0); 590 } 591 } 592 } 593 594 if (a->vp != NULL) { 595 VN_HOLD(a->vp); 596 if (a->type == MAP_SHARED) 597 lgrp_shm_policy_init(NULL, a->vp); 598 } 599 svd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 600 601 seg->s_ops = &segvn_ops; 602 seg->s_data = (void *)svd; 603 seg->s_szc = a->szc; 604 605 svd->vp = a->vp; 606 /* 607 * Anonymous mappings have no backing file so the offset is meaningless. 608 */ 609 svd->offset = a->vp ? (a->offset & PAGEMASK) : 0; 610 svd->prot = a->prot; 611 svd->maxprot = a->maxprot; 612 svd->pageprot = 0; 613 svd->type = a->type; 614 svd->vpage = NULL; 615 svd->cred = cred; 616 svd->advice = MADV_NORMAL; 617 svd->pageadvice = 0; 618 svd->flags = (ushort_t)a->flags; 619 svd->softlockcnt = 0; 620 if (a->szc != 0 && a->vp != NULL) { 621 segvn_setvnode_mpss(a->vp); 622 } 623 624 amp = a->amp; 625 if ((svd->amp = amp) == NULL) { 626 svd->anon_index = 0; 627 if (svd->type == MAP_SHARED) { 628 svd->swresv = 0; 629 /* 630 * Shared mappings to a vp need no other setup. 631 * If we have a shared mapping to an anon_map object 632 * which hasn't been allocated yet, allocate the 633 * struct now so that it will be properly shared 634 * by remembering the swap reservation there. 635 */ 636 if (a->vp == NULL) { 637 svd->amp = anonmap_alloc(seg->s_size, swresv); 638 svd->amp->a_szc = seg->s_szc; 639 } 640 } else { 641 /* 642 * Private mapping (with or without a vp). 643 * Allocate anon_map when needed. 644 */ 645 svd->swresv = swresv; 646 } 647 } else { 648 pgcnt_t anon_num; 649 650 /* 651 * Mapping to an existing anon_map structure without a vp. 652 * For now we will insure that the segment size isn't larger 653 * than the size - offset gives us. Later on we may wish to 654 * have the anon array dynamically allocated itself so that 655 * we don't always have to allocate all the anon pointer slots. 656 * This of course involves adding extra code to check that we 657 * aren't trying to use an anon pointer slot beyond the end 658 * of the currently allocated anon array. 659 */ 660 if ((amp->size - a->offset) < seg->s_size) { 661 panic("segvn_create anon_map size"); 662 /*NOTREACHED*/ 663 } 664 665 anon_num = btopr(a->offset); 666 667 if (a->type == MAP_SHARED) { 668 /* 669 * SHARED mapping to a given anon_map. 670 */ 671 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 672 amp->refcnt++; 673 if (a->szc > amp->a_szc) { 674 amp->a_szc = a->szc; 675 } 676 ANON_LOCK_EXIT(&->a_rwlock); 677 svd->anon_index = anon_num; 678 svd->swresv = 0; 679 } else { 680 /* 681 * PRIVATE mapping to a given anon_map. 682 * Make sure that all the needed anon 683 * structures are created (so that we will 684 * share the underlying pages if nothing 685 * is written by this mapping) and then 686 * duplicate the anon array as is done 687 * when a privately mapped segment is dup'ed. 688 */ 689 struct anon *ap; 690 caddr_t addr; 691 caddr_t eaddr; 692 ulong_t anon_idx; 693 int hat_flag = HAT_LOAD; 694 695 if (svd->flags & MAP_TEXT) { 696 hat_flag |= HAT_LOAD_TEXT; 697 } 698 699 svd->amp = anonmap_alloc(seg->s_size, 0); 700 svd->amp->a_szc = seg->s_szc; 701 svd->anon_index = 0; 702 svd->swresv = swresv; 703 704 /* 705 * Prevent 2 threads from allocating anon 706 * slots simultaneously. 707 */ 708 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 709 eaddr = seg->s_base + seg->s_size; 710 711 for (anon_idx = anon_num, addr = seg->s_base; 712 addr < eaddr; addr += PAGESIZE, anon_idx++) { 713 page_t *pp; 714 715 if ((ap = anon_get_ptr(amp->ahp, 716 anon_idx)) != NULL) 717 continue; 718 719 /* 720 * Allocate the anon struct now. 721 * Might as well load up translation 722 * to the page while we're at it... 723 */ 724 pp = anon_zero(seg, addr, &ap, cred); 725 if (ap == NULL || pp == NULL) { 726 panic("segvn_create anon_zero"); 727 /*NOTREACHED*/ 728 } 729 730 /* 731 * Re-acquire the anon_map lock and 732 * initialize the anon array entry. 733 */ 734 ASSERT(anon_get_ptr(amp->ahp, 735 anon_idx) == NULL); 736 (void) anon_set_ptr(amp->ahp, anon_idx, ap, 737 ANON_SLEEP); 738 739 ASSERT(seg->s_szc == 0); 740 ASSERT(!IS_VMODSORT(pp->p_vnode)); 741 742 hat_memload(seg->s_as->a_hat, addr, pp, 743 svd->prot & ~PROT_WRITE, hat_flag); 744 745 page_unlock(pp); 746 } 747 ASSERT(seg->s_szc == 0); 748 anon_dup(amp->ahp, anon_num, svd->amp->ahp, 749 0, seg->s_size); 750 ANON_LOCK_EXIT(&->a_rwlock); 751 } 752 } 753 754 /* 755 * Set default memory allocation policy for segment 756 * 757 * Always set policy for private memory at least for initialization 758 * even if this is a shared memory segment 759 */ 760 (void) lgrp_privm_policy_set(mpolicy, &svd->policy_info, seg->s_size); 761 762 if (svd->type == MAP_SHARED) 763 (void) lgrp_shm_policy_set(mpolicy, svd->amp, svd->anon_index, 764 svd->vp, svd->offset, seg->s_size); 765 766 return (0); 767 } 768 769 /* 770 * Concatenate two existing segments, if possible. 771 * Return 0 on success, -1 if two segments are not compatible 772 * or -2 on memory allocation failure. 773 * If amp_cat == 1 then try and concat segments with anon maps 774 */ 775 static int 776 segvn_concat(struct seg *seg1, struct seg *seg2, int amp_cat) 777 { 778 struct segvn_data *svd1 = seg1->s_data; 779 struct segvn_data *svd2 = seg2->s_data; 780 struct anon_map *amp1 = svd1->amp; 781 struct anon_map *amp2 = svd2->amp; 782 struct vpage *vpage1 = svd1->vpage; 783 struct vpage *vpage2 = svd2->vpage, *nvpage = NULL; 784 size_t size, nvpsize; 785 pgcnt_t npages1, npages2; 786 787 ASSERT(seg1->s_as && seg2->s_as && seg1->s_as == seg2->s_as); 788 ASSERT(AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock)); 789 ASSERT(seg1->s_ops == seg2->s_ops); 790 791 /* both segments exist, try to merge them */ 792 #define incompat(x) (svd1->x != svd2->x) 793 if (incompat(vp) || incompat(maxprot) || 794 (!svd1->pageadvice && !svd2->pageadvice && incompat(advice)) || 795 (!svd1->pageprot && !svd2->pageprot && incompat(prot)) || 796 incompat(type) || incompat(cred) || incompat(flags) || 797 seg1->s_szc != seg2->s_szc || incompat(policy_info.mem_policy) || 798 (svd2->softlockcnt > 0)) 799 return (-1); 800 #undef incompat 801 802 /* 803 * vp == NULL implies zfod, offset doesn't matter 804 */ 805 if (svd1->vp != NULL && 806 svd1->offset + seg1->s_size != svd2->offset) { 807 return (-1); 808 } 809 810 /* 811 * Fail early if we're not supposed to concatenate 812 * segments with non NULL amp. 813 */ 814 if (amp_cat == 0 && (amp1 != NULL || amp2 != NULL)) { 815 return (-1); 816 } 817 818 if (svd1->vp == NULL && svd1->type == MAP_SHARED) { 819 if (amp1 != amp2) { 820 return (-1); 821 } 822 if (amp1 != NULL && svd1->anon_index + btop(seg1->s_size) != 823 svd2->anon_index) { 824 return (-1); 825 } 826 ASSERT(amp1 == NULL || amp1->refcnt >= 2); 827 } 828 829 /* 830 * If either seg has vpages, create a new merged vpage array. 831 */ 832 if (vpage1 != NULL || vpage2 != NULL) { 833 struct vpage *vp; 834 835 npages1 = seg_pages(seg1); 836 npages2 = seg_pages(seg2); 837 nvpsize = vpgtob(npages1 + npages2); 838 839 if ((nvpage = kmem_zalloc(nvpsize, KM_NOSLEEP)) == NULL) { 840 return (-2); 841 } 842 if (vpage1 != NULL) { 843 bcopy(vpage1, nvpage, vpgtob(npages1)); 844 } 845 if (vpage2 != NULL) { 846 bcopy(vpage2, nvpage + npages1, vpgtob(npages2)); 847 } 848 for (vp = nvpage; vp < nvpage + npages1; vp++) { 849 if (svd2->pageprot && !svd1->pageprot) { 850 VPP_SETPROT(vp, svd1->prot); 851 } 852 if (svd2->pageadvice && !svd1->pageadvice) { 853 VPP_SETADVICE(vp, svd1->advice); 854 } 855 } 856 for (vp = nvpage + npages1; 857 vp < nvpage + npages1 + npages2; vp++) { 858 if (svd1->pageprot && !svd2->pageprot) { 859 VPP_SETPROT(vp, svd2->prot); 860 } 861 if (svd1->pageadvice && !svd2->pageadvice) { 862 VPP_SETADVICE(vp, svd2->advice); 863 } 864 } 865 } 866 867 /* 868 * If either segment has private pages, create a new merged anon 869 * array. If mergeing shared anon segments just decrement anon map's 870 * refcnt. 871 */ 872 if (amp1 != NULL && svd1->type == MAP_SHARED) { 873 ASSERT(amp1 == amp2 && svd1->vp == NULL); 874 ANON_LOCK_ENTER(&1->a_rwlock, RW_WRITER); 875 ASSERT(amp1->refcnt >= 2); 876 amp1->refcnt--; 877 ANON_LOCK_EXIT(&1->a_rwlock); 878 svd2->amp = NULL; 879 } else if (amp1 != NULL || amp2 != NULL) { 880 struct anon_hdr *nahp; 881 struct anon_map *namp = NULL; 882 size_t asize; 883 884 ASSERT(svd1->type == MAP_PRIVATE); 885 886 asize = seg1->s_size + seg2->s_size; 887 if ((nahp = anon_create(btop(asize), ANON_NOSLEEP)) == NULL) { 888 if (nvpage != NULL) { 889 kmem_free(nvpage, nvpsize); 890 } 891 return (-2); 892 } 893 if (amp1 != NULL) { 894 /* 895 * XXX anon rwlock is not really needed because 896 * this is a private segment and we are writers. 897 */ 898 ANON_LOCK_ENTER(&1->a_rwlock, RW_WRITER); 899 ASSERT(amp1->refcnt == 1); 900 if (anon_copy_ptr(amp1->ahp, svd1->anon_index, 901 nahp, 0, btop(seg1->s_size), ANON_NOSLEEP)) { 902 anon_release(nahp, btop(asize)); 903 ANON_LOCK_EXIT(&1->a_rwlock); 904 if (nvpage != NULL) { 905 kmem_free(nvpage, nvpsize); 906 } 907 return (-2); 908 } 909 } 910 if (amp2 != NULL) { 911 ANON_LOCK_ENTER(&2->a_rwlock, RW_WRITER); 912 ASSERT(amp2->refcnt == 1); 913 if (anon_copy_ptr(amp2->ahp, svd2->anon_index, 914 nahp, btop(seg1->s_size), btop(seg2->s_size), 915 ANON_NOSLEEP)) { 916 anon_release(nahp, btop(asize)); 917 ANON_LOCK_EXIT(&2->a_rwlock); 918 if (amp1 != NULL) { 919 ANON_LOCK_EXIT(&1->a_rwlock); 920 } 921 if (nvpage != NULL) { 922 kmem_free(nvpage, nvpsize); 923 } 924 return (-2); 925 } 926 } 927 if (amp1 != NULL) { 928 namp = amp1; 929 anon_release(amp1->ahp, btop(amp1->size)); 930 } 931 if (amp2 != NULL) { 932 if (namp == NULL) { 933 ASSERT(amp1 == NULL); 934 namp = amp2; 935 anon_release(amp2->ahp, btop(amp2->size)); 936 } else { 937 amp2->refcnt--; 938 ANON_LOCK_EXIT(&2->a_rwlock); 939 anonmap_free(amp2); 940 } 941 svd2->amp = NULL; /* needed for seg_free */ 942 } 943 namp->ahp = nahp; 944 namp->size = asize; 945 svd1->amp = namp; 946 svd1->anon_index = 0; 947 ANON_LOCK_EXIT(&namp->a_rwlock); 948 } 949 /* 950 * Now free the old vpage structures. 951 */ 952 if (nvpage != NULL) { 953 if (vpage1 != NULL) { 954 kmem_free(vpage1, vpgtob(npages1)); 955 } 956 if (vpage2 != NULL) { 957 svd2->vpage = NULL; 958 kmem_free(vpage2, vpgtob(npages2)); 959 } 960 if (svd2->pageprot) { 961 svd1->pageprot = 1; 962 } 963 if (svd2->pageadvice) { 964 svd1->pageadvice = 1; 965 } 966 svd1->vpage = nvpage; 967 } 968 969 /* all looks ok, merge segments */ 970 svd1->swresv += svd2->swresv; 971 svd2->swresv = 0; /* so seg_free doesn't release swap space */ 972 size = seg2->s_size; 973 seg_free(seg2); 974 seg1->s_size += size; 975 return (0); 976 } 977 978 /* 979 * Extend the previous segment (seg1) to include the 980 * new segment (seg2 + a), if possible. 981 * Return 0 on success. 982 */ 983 static int 984 segvn_extend_prev(seg1, seg2, a, swresv) 985 struct seg *seg1, *seg2; 986 struct segvn_crargs *a; 987 size_t swresv; 988 { 989 struct segvn_data *svd1 = (struct segvn_data *)seg1->s_data; 990 size_t size; 991 struct anon_map *amp1; 992 struct vpage *new_vpage; 993 994 /* 995 * We don't need any segment level locks for "segvn" data 996 * since the address space is "write" locked. 997 */ 998 ASSERT(seg1->s_as && AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock)); 999 1000 /* second segment is new, try to extend first */ 1001 /* XXX - should also check cred */ 1002 if (svd1->vp != a->vp || svd1->maxprot != a->maxprot || 1003 (!svd1->pageprot && (svd1->prot != a->prot)) || 1004 svd1->type != a->type || svd1->flags != a->flags || 1005 seg1->s_szc != a->szc) 1006 return (-1); 1007 1008 /* vp == NULL implies zfod, offset doesn't matter */ 1009 if (svd1->vp != NULL && 1010 svd1->offset + seg1->s_size != (a->offset & PAGEMASK)) 1011 return (-1); 1012 1013 amp1 = svd1->amp; 1014 if (amp1) { 1015 pgcnt_t newpgs; 1016 1017 /* 1018 * Segment has private pages, can data structures 1019 * be expanded? 1020 * 1021 * Acquire the anon_map lock to prevent it from changing, 1022 * if it is shared. This ensures that the anon_map 1023 * will not change while a thread which has a read/write 1024 * lock on an address space references it. 1025 * XXX - Don't need the anon_map lock at all if "refcnt" 1026 * is 1. 1027 * 1028 * Can't grow a MAP_SHARED segment with an anonmap because 1029 * there may be existing anon slots where we want to extend 1030 * the segment and we wouldn't know what to do with them 1031 * (e.g., for tmpfs right thing is to just leave them there, 1032 * for /dev/zero they should be cleared out). 1033 */ 1034 if (svd1->type == MAP_SHARED) 1035 return (-1); 1036 1037 ANON_LOCK_ENTER(&1->a_rwlock, RW_WRITER); 1038 if (amp1->refcnt > 1) { 1039 ANON_LOCK_EXIT(&1->a_rwlock); 1040 return (-1); 1041 } 1042 newpgs = anon_grow(amp1->ahp, &svd1->anon_index, 1043 btop(seg1->s_size), btop(seg2->s_size), ANON_NOSLEEP); 1044 1045 if (newpgs == 0) { 1046 ANON_LOCK_EXIT(&1->a_rwlock); 1047 return (-1); 1048 } 1049 amp1->size = ptob(newpgs); 1050 ANON_LOCK_EXIT(&1->a_rwlock); 1051 } 1052 if (svd1->vpage != NULL) { 1053 new_vpage = 1054 kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)), 1055 KM_NOSLEEP); 1056 if (new_vpage == NULL) 1057 return (-1); 1058 bcopy(svd1->vpage, new_vpage, vpgtob(seg_pages(seg1))); 1059 kmem_free(svd1->vpage, vpgtob(seg_pages(seg1))); 1060 svd1->vpage = new_vpage; 1061 if (svd1->pageprot) { 1062 struct vpage *vp, *evp; 1063 1064 vp = new_vpage + seg_pages(seg1); 1065 evp = vp + seg_pages(seg2); 1066 for (; vp < evp; vp++) 1067 VPP_SETPROT(vp, a->prot); 1068 } 1069 } 1070 size = seg2->s_size; 1071 seg_free(seg2); 1072 seg1->s_size += size; 1073 svd1->swresv += swresv; 1074 return (0); 1075 } 1076 1077 /* 1078 * Extend the next segment (seg2) to include the 1079 * new segment (seg1 + a), if possible. 1080 * Return 0 on success. 1081 */ 1082 static int 1083 segvn_extend_next( 1084 struct seg *seg1, 1085 struct seg *seg2, 1086 struct segvn_crargs *a, 1087 size_t swresv) 1088 { 1089 struct segvn_data *svd2 = (struct segvn_data *)seg2->s_data; 1090 size_t size; 1091 struct anon_map *amp2; 1092 struct vpage *new_vpage; 1093 1094 /* 1095 * We don't need any segment level locks for "segvn" data 1096 * since the address space is "write" locked. 1097 */ 1098 ASSERT(seg2->s_as && AS_WRITE_HELD(seg2->s_as, &seg2->s_as->a_lock)); 1099 1100 /* first segment is new, try to extend second */ 1101 /* XXX - should also check cred */ 1102 if (svd2->vp != a->vp || svd2->maxprot != a->maxprot || 1103 (!svd2->pageprot && (svd2->prot != a->prot)) || 1104 svd2->type != a->type || svd2->flags != a->flags || 1105 seg2->s_szc != a->szc) 1106 return (-1); 1107 /* vp == NULL implies zfod, offset doesn't matter */ 1108 if (svd2->vp != NULL && 1109 (a->offset & PAGEMASK) + seg1->s_size != svd2->offset) 1110 return (-1); 1111 1112 amp2 = svd2->amp; 1113 if (amp2) { 1114 pgcnt_t newpgs; 1115 1116 /* 1117 * Segment has private pages, can data structures 1118 * be expanded? 1119 * 1120 * Acquire the anon_map lock to prevent it from changing, 1121 * if it is shared. This ensures that the anon_map 1122 * will not change while a thread which has a read/write 1123 * lock on an address space references it. 1124 * 1125 * XXX - Don't need the anon_map lock at all if "refcnt" 1126 * is 1. 1127 */ 1128 if (svd2->type == MAP_SHARED) 1129 return (-1); 1130 1131 ANON_LOCK_ENTER(&2->a_rwlock, RW_WRITER); 1132 if (amp2->refcnt > 1) { 1133 ANON_LOCK_EXIT(&2->a_rwlock); 1134 return (-1); 1135 } 1136 newpgs = anon_grow(amp2->ahp, &svd2->anon_index, 1137 btop(seg2->s_size), btop(seg1->s_size), 1138 ANON_NOSLEEP | ANON_GROWDOWN); 1139 1140 if (newpgs == 0) { 1141 ANON_LOCK_EXIT(&2->a_rwlock); 1142 return (-1); 1143 } 1144 amp2->size = ptob(newpgs); 1145 ANON_LOCK_EXIT(&2->a_rwlock); 1146 } 1147 if (svd2->vpage != NULL) { 1148 new_vpage = 1149 kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)), 1150 KM_NOSLEEP); 1151 if (new_vpage == NULL) { 1152 /* Not merging segments so adjust anon_index back */ 1153 if (amp2) 1154 svd2->anon_index += seg_pages(seg1); 1155 return (-1); 1156 } 1157 bcopy(svd2->vpage, new_vpage + seg_pages(seg1), 1158 vpgtob(seg_pages(seg2))); 1159 kmem_free(svd2->vpage, vpgtob(seg_pages(seg2))); 1160 svd2->vpage = new_vpage; 1161 if (svd2->pageprot) { 1162 struct vpage *vp, *evp; 1163 1164 vp = new_vpage; 1165 evp = vp + seg_pages(seg1); 1166 for (; vp < evp; vp++) 1167 VPP_SETPROT(vp, a->prot); 1168 } 1169 } 1170 size = seg1->s_size; 1171 seg_free(seg1); 1172 seg2->s_size += size; 1173 seg2->s_base -= size; 1174 svd2->offset -= size; 1175 svd2->swresv += swresv; 1176 return (0); 1177 } 1178 1179 static int 1180 segvn_dup(struct seg *seg, struct seg *newseg) 1181 { 1182 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1183 struct segvn_data *newsvd; 1184 pgcnt_t npages = seg_pages(seg); 1185 int error = 0; 1186 uint_t prot; 1187 size_t len; 1188 1189 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 1190 1191 /* 1192 * If segment has anon reserved, reserve more for the new seg. 1193 * For a MAP_NORESERVE segment swresv will be a count of all the 1194 * allocated anon slots; thus we reserve for the child as many slots 1195 * as the parent has allocated. This semantic prevents the child or 1196 * parent from dieing during a copy-on-write fault caused by trying 1197 * to write a shared pre-existing anon page. 1198 */ 1199 if ((len = svd->swresv) != 0) { 1200 if (anon_resv(svd->swresv) == 0) 1201 return (ENOMEM); 1202 1203 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 1204 seg, len, 0); 1205 } 1206 1207 newsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 1208 1209 newseg->s_ops = &segvn_ops; 1210 newseg->s_data = (void *)newsvd; 1211 newseg->s_szc = seg->s_szc; 1212 1213 if ((newsvd->vp = svd->vp) != NULL) { 1214 VN_HOLD(svd->vp); 1215 if (svd->type == MAP_SHARED) 1216 lgrp_shm_policy_init(NULL, svd->vp); 1217 } 1218 newsvd->offset = svd->offset; 1219 newsvd->prot = svd->prot; 1220 newsvd->maxprot = svd->maxprot; 1221 newsvd->pageprot = svd->pageprot; 1222 newsvd->type = svd->type; 1223 newsvd->cred = svd->cred; 1224 crhold(newsvd->cred); 1225 newsvd->advice = svd->advice; 1226 newsvd->pageadvice = svd->pageadvice; 1227 newsvd->swresv = svd->swresv; 1228 newsvd->flags = svd->flags; 1229 newsvd->softlockcnt = 0; 1230 newsvd->policy_info = svd->policy_info; 1231 if ((newsvd->amp = svd->amp) == NULL) { 1232 /* 1233 * Not attaching to a shared anon object. 1234 */ 1235 newsvd->anon_index = 0; 1236 } else { 1237 struct anon_map *amp; 1238 1239 amp = svd->amp; 1240 if (svd->type == MAP_SHARED) { 1241 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1242 amp->refcnt++; 1243 ANON_LOCK_EXIT(&->a_rwlock); 1244 newsvd->anon_index = svd->anon_index; 1245 } else { 1246 int reclaim = 1; 1247 1248 /* 1249 * Allocate and initialize new anon_map structure. 1250 */ 1251 newsvd->amp = anonmap_alloc(newseg->s_size, 0); 1252 newsvd->amp->a_szc = newseg->s_szc; 1253 newsvd->anon_index = 0; 1254 1255 /* 1256 * We don't have to acquire the anon_map lock 1257 * for the new segment (since it belongs to an 1258 * address space that is still not associated 1259 * with any process), or the segment in the old 1260 * address space (since all threads in it 1261 * are stopped while duplicating the address space). 1262 */ 1263 1264 /* 1265 * The goal of the following code is to make sure that 1266 * softlocked pages do not end up as copy on write 1267 * pages. This would cause problems where one 1268 * thread writes to a page that is COW and a different 1269 * thread in the same process has softlocked it. The 1270 * softlock lock would move away from this process 1271 * because the write would cause this process to get 1272 * a copy (without the softlock). 1273 * 1274 * The strategy here is to just break the 1275 * sharing on pages that could possibly be 1276 * softlocked. 1277 */ 1278 retry: 1279 if (svd->softlockcnt) { 1280 struct anon *ap, *newap; 1281 size_t i; 1282 uint_t vpprot; 1283 page_t *anon_pl[1+1], *pp; 1284 caddr_t addr; 1285 ulong_t anon_idx = 0; 1286 1287 /* 1288 * The softlock count might be non zero 1289 * because some pages are still stuck in the 1290 * cache for lazy reclaim. Flush the cache 1291 * now. This should drop the count to zero. 1292 * [or there is really I/O going on to these 1293 * pages]. Note, we have the writers lock so 1294 * nothing gets inserted during the flush. 1295 */ 1296 if (reclaim == 1) { 1297 segvn_purge(seg); 1298 reclaim = 0; 1299 goto retry; 1300 } 1301 i = btopr(seg->s_size); 1302 addr = seg->s_base; 1303 /* 1304 * XXX break cow sharing using PAGESIZE 1305 * pages. They will be relocated into larger 1306 * pages at fault time. 1307 */ 1308 while (i-- > 0) { 1309 if (ap = anon_get_ptr(amp->ahp, 1310 anon_idx)) { 1311 error = anon_getpage(&ap, 1312 &vpprot, anon_pl, PAGESIZE, 1313 seg, addr, S_READ, 1314 svd->cred); 1315 if (error) { 1316 newsvd->vpage = NULL; 1317 goto out; 1318 } 1319 /* 1320 * prot need not be computed 1321 * below 'cause anon_private is 1322 * going to ignore it anyway 1323 * as child doesn't inherit 1324 * pagelock from parent. 1325 */ 1326 prot = svd->pageprot ? 1327 VPP_PROT( 1328 &svd->vpage[ 1329 seg_page(seg, addr)]) 1330 : svd->prot; 1331 pp = anon_private(&newap, 1332 newseg, addr, prot, 1333 anon_pl[0], 0, 1334 newsvd->cred); 1335 if (pp == NULL) { 1336 /* no mem abort */ 1337 newsvd->vpage = NULL; 1338 error = ENOMEM; 1339 goto out; 1340 } 1341 (void) anon_set_ptr( 1342 newsvd->amp->ahp, anon_idx, 1343 newap, ANON_SLEEP); 1344 page_unlock(pp); 1345 } 1346 addr += PAGESIZE; 1347 anon_idx++; 1348 } 1349 } else { /* common case */ 1350 if (seg->s_szc != 0) { 1351 /* 1352 * If at least one of anon slots of a 1353 * large page exists then make sure 1354 * all anon slots of a large page 1355 * exist to avoid partial cow sharing 1356 * of a large page in the future. 1357 */ 1358 anon_dup_fill_holes(amp->ahp, 1359 svd->anon_index, newsvd->amp->ahp, 1360 0, seg->s_size, seg->s_szc, 1361 svd->vp != NULL); 1362 } else { 1363 anon_dup(amp->ahp, svd->anon_index, 1364 newsvd->amp->ahp, 0, seg->s_size); 1365 } 1366 1367 hat_clrattr(seg->s_as->a_hat, seg->s_base, 1368 seg->s_size, PROT_WRITE); 1369 } 1370 } 1371 } 1372 /* 1373 * If necessary, create a vpage structure for the new segment. 1374 * Do not copy any page lock indications. 1375 */ 1376 if (svd->vpage != NULL) { 1377 uint_t i; 1378 struct vpage *ovp = svd->vpage; 1379 struct vpage *nvp; 1380 1381 nvp = newsvd->vpage = 1382 kmem_alloc(vpgtob(npages), KM_SLEEP); 1383 for (i = 0; i < npages; i++) { 1384 *nvp = *ovp++; 1385 VPP_CLRPPLOCK(nvp++); 1386 } 1387 } else 1388 newsvd->vpage = NULL; 1389 1390 /* Inform the vnode of the new mapping */ 1391 if (newsvd->vp != NULL) { 1392 error = VOP_ADDMAP(newsvd->vp, (offset_t)newsvd->offset, 1393 newseg->s_as, newseg->s_base, newseg->s_size, newsvd->prot, 1394 newsvd->maxprot, newsvd->type, newsvd->cred); 1395 } 1396 out: 1397 return (error); 1398 } 1399 1400 1401 /* 1402 * callback function used by segvn_unmap to invoke free_vp_pages() for only 1403 * those pages actually processed by the HAT 1404 */ 1405 extern int free_pages; 1406 1407 static void 1408 segvn_hat_unload_callback(hat_callback_t *cb) 1409 { 1410 struct seg *seg = cb->hcb_data; 1411 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1412 size_t len; 1413 u_offset_t off; 1414 1415 ASSERT(svd->vp != NULL); 1416 ASSERT(cb->hcb_end_addr > cb->hcb_start_addr); 1417 ASSERT(cb->hcb_start_addr >= seg->s_base); 1418 1419 len = cb->hcb_end_addr - cb->hcb_start_addr; 1420 off = cb->hcb_start_addr - seg->s_base; 1421 free_vp_pages(svd->vp, svd->offset + off, len); 1422 } 1423 1424 1425 static int 1426 segvn_unmap(struct seg *seg, caddr_t addr, size_t len) 1427 { 1428 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1429 struct segvn_data *nsvd; 1430 struct seg *nseg; 1431 struct anon_map *amp; 1432 pgcnt_t opages; /* old segment size in pages */ 1433 pgcnt_t npages; /* new segment size in pages */ 1434 pgcnt_t dpages; /* pages being deleted (unmapped) */ 1435 hat_callback_t callback; /* used for free_vp_pages() */ 1436 hat_callback_t *cbp = NULL; 1437 caddr_t nbase; 1438 size_t nsize; 1439 size_t oswresv; 1440 int reclaim = 1; 1441 1442 /* 1443 * We don't need any segment level locks for "segvn" data 1444 * since the address space is "write" locked. 1445 */ 1446 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 1447 1448 /* 1449 * Fail the unmap if pages are SOFTLOCKed through this mapping. 1450 * softlockcnt is protected from change by the as write lock. 1451 */ 1452 retry: 1453 if (svd->softlockcnt > 0) { 1454 /* 1455 * since we do have the writers lock nobody can fill 1456 * the cache during the purge. The flush either succeeds 1457 * or we still have pending I/Os. 1458 */ 1459 if (reclaim == 1) { 1460 segvn_purge(seg); 1461 reclaim = 0; 1462 goto retry; 1463 } 1464 return (EAGAIN); 1465 } 1466 1467 /* 1468 * Check for bad sizes 1469 */ 1470 if (addr < seg->s_base || addr + len > seg->s_base + seg->s_size || 1471 (len & PAGEOFFSET) || ((uintptr_t)addr & PAGEOFFSET)) { 1472 panic("segvn_unmap"); 1473 /*NOTREACHED*/ 1474 } 1475 1476 if (seg->s_szc != 0) { 1477 size_t pgsz = page_get_pagesize(seg->s_szc); 1478 int err; 1479 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { 1480 ASSERT(seg->s_base != addr || seg->s_size != len); 1481 VM_STAT_ADD(segvnvmstats.demoterange[0]); 1482 err = segvn_demote_range(seg, addr, len, SDR_END, 0); 1483 if (err == 0) { 1484 return (IE_RETRY); 1485 } 1486 return (err); 1487 } 1488 } 1489 1490 /* Inform the vnode of the unmapping. */ 1491 if (svd->vp) { 1492 int error; 1493 1494 error = VOP_DELMAP(svd->vp, 1495 (offset_t)svd->offset + (uintptr_t)(addr - seg->s_base), 1496 seg->s_as, addr, len, svd->prot, svd->maxprot, 1497 svd->type, svd->cred); 1498 1499 if (error == EAGAIN) 1500 return (error); 1501 } 1502 /* 1503 * Remove any page locks set through this mapping. 1504 */ 1505 (void) segvn_lockop(seg, addr, len, 0, MC_UNLOCK, NULL, 0); 1506 1507 /* 1508 * Unload any hardware translations in the range to be taken out. 1509 * Use a callback to invoke free_vp_pages() effectively. 1510 */ 1511 if (svd->vp != NULL && free_pages != 0) { 1512 callback.hcb_data = seg; 1513 callback.hcb_function = segvn_hat_unload_callback; 1514 cbp = &callback; 1515 } 1516 hat_unload_callback(seg->s_as->a_hat, addr, len, HAT_UNLOAD_UNMAP, cbp); 1517 1518 /* 1519 * Check for entire segment 1520 */ 1521 if (addr == seg->s_base && len == seg->s_size) { 1522 seg_free(seg); 1523 return (0); 1524 } 1525 1526 opages = seg_pages(seg); 1527 dpages = btop(len); 1528 npages = opages - dpages; 1529 amp = svd->amp; 1530 ASSERT(amp == NULL || amp->a_szc >= seg->s_szc); 1531 1532 /* 1533 * Check for beginning of segment 1534 */ 1535 if (addr == seg->s_base) { 1536 if (svd->vpage != NULL) { 1537 size_t nbytes; 1538 struct vpage *ovpage; 1539 1540 ovpage = svd->vpage; /* keep pointer to vpage */ 1541 1542 nbytes = vpgtob(npages); 1543 svd->vpage = kmem_alloc(nbytes, KM_SLEEP); 1544 bcopy(&ovpage[dpages], svd->vpage, nbytes); 1545 1546 /* free up old vpage */ 1547 kmem_free(ovpage, vpgtob(opages)); 1548 } 1549 if (amp != NULL) { 1550 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1551 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { 1552 /* 1553 * Free up now unused parts of anon_map array. 1554 */ 1555 if (amp->a_szc == seg->s_szc) { 1556 if (seg->s_szc != 0) { 1557 anon_free_pages(amp->ahp, 1558 svd->anon_index, len, 1559 seg->s_szc); 1560 } else { 1561 anon_free(amp->ahp, 1562 svd->anon_index, 1563 len); 1564 } 1565 } else { 1566 ASSERT(svd->type == MAP_SHARED); 1567 ASSERT(amp->a_szc > seg->s_szc); 1568 anon_shmap_free_pages(amp, 1569 svd->anon_index, len); 1570 } 1571 1572 /* 1573 * Unreserve swap space for the 1574 * unmapped chunk of this segment in 1575 * case it's MAP_SHARED 1576 */ 1577 if (svd->type == MAP_SHARED) { 1578 anon_unresv(len); 1579 amp->swresv -= len; 1580 } 1581 } 1582 ANON_LOCK_EXIT(&->a_rwlock); 1583 svd->anon_index += dpages; 1584 } 1585 if (svd->vp != NULL) 1586 svd->offset += len; 1587 1588 if (svd->swresv) { 1589 if (svd->flags & MAP_NORESERVE) { 1590 ASSERT(amp); 1591 oswresv = svd->swresv; 1592 1593 svd->swresv = ptob(anon_pages(amp->ahp, 1594 svd->anon_index, npages)); 1595 anon_unresv(oswresv - svd->swresv); 1596 } else { 1597 anon_unresv(len); 1598 svd->swresv -= len; 1599 } 1600 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 1601 seg, len, 0); 1602 } 1603 1604 seg->s_base += len; 1605 seg->s_size -= len; 1606 return (0); 1607 } 1608 1609 /* 1610 * Check for end of segment 1611 */ 1612 if (addr + len == seg->s_base + seg->s_size) { 1613 if (svd->vpage != NULL) { 1614 size_t nbytes; 1615 struct vpage *ovpage; 1616 1617 ovpage = svd->vpage; /* keep pointer to vpage */ 1618 1619 nbytes = vpgtob(npages); 1620 svd->vpage = kmem_alloc(nbytes, KM_SLEEP); 1621 bcopy(ovpage, svd->vpage, nbytes); 1622 1623 /* free up old vpage */ 1624 kmem_free(ovpage, vpgtob(opages)); 1625 1626 } 1627 if (amp != NULL) { 1628 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1629 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { 1630 /* 1631 * Free up now unused parts of anon_map array. 1632 */ 1633 ulong_t an_idx = svd->anon_index + npages; 1634 if (amp->a_szc == seg->s_szc) { 1635 if (seg->s_szc != 0) { 1636 anon_free_pages(amp->ahp, 1637 an_idx, len, 1638 seg->s_szc); 1639 } else { 1640 anon_free(amp->ahp, an_idx, 1641 len); 1642 } 1643 } else { 1644 ASSERT(svd->type == MAP_SHARED); 1645 ASSERT(amp->a_szc > seg->s_szc); 1646 anon_shmap_free_pages(amp, 1647 an_idx, len); 1648 } 1649 1650 /* 1651 * Unreserve swap space for the 1652 * unmapped chunk of this segment in 1653 * case it's MAP_SHARED 1654 */ 1655 if (svd->type == MAP_SHARED) { 1656 anon_unresv(len); 1657 amp->swresv -= len; 1658 } 1659 } 1660 ANON_LOCK_EXIT(&->a_rwlock); 1661 } 1662 1663 if (svd->swresv) { 1664 if (svd->flags & MAP_NORESERVE) { 1665 ASSERT(amp); 1666 oswresv = svd->swresv; 1667 svd->swresv = ptob(anon_pages(amp->ahp, 1668 svd->anon_index, npages)); 1669 anon_unresv(oswresv - svd->swresv); 1670 } else { 1671 anon_unresv(len); 1672 svd->swresv -= len; 1673 } 1674 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 1675 "anon proc:%p %lu %u", seg, len, 0); 1676 } 1677 1678 seg->s_size -= len; 1679 return (0); 1680 } 1681 1682 /* 1683 * The section to go is in the middle of the segment, 1684 * have to make it into two segments. nseg is made for 1685 * the high end while seg is cut down at the low end. 1686 */ 1687 nbase = addr + len; /* new seg base */ 1688 nsize = (seg->s_base + seg->s_size) - nbase; /* new seg size */ 1689 seg->s_size = addr - seg->s_base; /* shrink old seg */ 1690 nseg = seg_alloc(seg->s_as, nbase, nsize); 1691 if (nseg == NULL) { 1692 panic("segvn_unmap seg_alloc"); 1693 /*NOTREACHED*/ 1694 } 1695 nseg->s_ops = seg->s_ops; 1696 nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 1697 nseg->s_data = (void *)nsvd; 1698 nseg->s_szc = seg->s_szc; 1699 *nsvd = *svd; 1700 nsvd->offset = svd->offset + (uintptr_t)(nseg->s_base - seg->s_base); 1701 nsvd->swresv = 0; 1702 nsvd->softlockcnt = 0; 1703 1704 if (svd->vp != NULL) { 1705 VN_HOLD(nsvd->vp); 1706 if (nsvd->type == MAP_SHARED) 1707 lgrp_shm_policy_init(NULL, nsvd->vp); 1708 } 1709 crhold(svd->cred); 1710 1711 if (svd->vpage == NULL) { 1712 nsvd->vpage = NULL; 1713 } else { 1714 /* need to split vpage into two arrays */ 1715 size_t nbytes; 1716 struct vpage *ovpage; 1717 1718 ovpage = svd->vpage; /* keep pointer to vpage */ 1719 1720 npages = seg_pages(seg); /* seg has shrunk */ 1721 nbytes = vpgtob(npages); 1722 svd->vpage = kmem_alloc(nbytes, KM_SLEEP); 1723 1724 bcopy(ovpage, svd->vpage, nbytes); 1725 1726 npages = seg_pages(nseg); 1727 nbytes = vpgtob(npages); 1728 nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP); 1729 1730 bcopy(&ovpage[opages - npages], nsvd->vpage, nbytes); 1731 1732 /* free up old vpage */ 1733 kmem_free(ovpage, vpgtob(opages)); 1734 } 1735 1736 if (amp == NULL) { 1737 nsvd->amp = NULL; 1738 nsvd->anon_index = 0; 1739 } else { 1740 /* 1741 * Need to create a new anon map for the new segment. 1742 * We'll also allocate a new smaller array for the old 1743 * smaller segment to save space. 1744 */ 1745 opages = btop((uintptr_t)(addr - seg->s_base)); 1746 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1747 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { 1748 /* 1749 * Free up now unused parts of anon_map array. 1750 */ 1751 ulong_t an_idx = svd->anon_index + opages; 1752 if (amp->a_szc == seg->s_szc) { 1753 if (seg->s_szc != 0) { 1754 anon_free_pages(amp->ahp, an_idx, len, 1755 seg->s_szc); 1756 } else { 1757 anon_free(amp->ahp, an_idx, 1758 len); 1759 } 1760 } else { 1761 ASSERT(svd->type == MAP_SHARED); 1762 ASSERT(amp->a_szc > seg->s_szc); 1763 anon_shmap_free_pages(amp, an_idx, len); 1764 } 1765 1766 /* 1767 * Unreserve swap space for the 1768 * unmapped chunk of this segment in 1769 * case it's MAP_SHARED 1770 */ 1771 if (svd->type == MAP_SHARED) { 1772 anon_unresv(len); 1773 amp->swresv -= len; 1774 } 1775 } 1776 nsvd->anon_index = svd->anon_index + 1777 btop((uintptr_t)(nseg->s_base - seg->s_base)); 1778 if (svd->type == MAP_SHARED) { 1779 amp->refcnt++; 1780 nsvd->amp = amp; 1781 } else { 1782 struct anon_map *namp; 1783 struct anon_hdr *nahp; 1784 1785 ASSERT(svd->type == MAP_PRIVATE); 1786 nahp = anon_create(btop(seg->s_size), ANON_SLEEP); 1787 namp = anonmap_alloc(nseg->s_size, 0); 1788 namp->a_szc = seg->s_szc; 1789 (void) anon_copy_ptr(amp->ahp, svd->anon_index, nahp, 1790 0, btop(seg->s_size), ANON_SLEEP); 1791 (void) anon_copy_ptr(amp->ahp, nsvd->anon_index, 1792 namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP); 1793 anon_release(amp->ahp, btop(amp->size)); 1794 svd->anon_index = 0; 1795 nsvd->anon_index = 0; 1796 amp->ahp = nahp; 1797 amp->size = seg->s_size; 1798 nsvd->amp = namp; 1799 } 1800 ANON_LOCK_EXIT(&->a_rwlock); 1801 } 1802 if (svd->swresv) { 1803 if (svd->flags & MAP_NORESERVE) { 1804 ASSERT(amp); 1805 oswresv = svd->swresv; 1806 svd->swresv = ptob(anon_pages(amp->ahp, 1807 svd->anon_index, btop(seg->s_size))); 1808 nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp, 1809 nsvd->anon_index, btop(nseg->s_size))); 1810 ASSERT(oswresv >= (svd->swresv + nsvd->swresv)); 1811 anon_unresv(oswresv - (svd->swresv + nsvd->swresv)); 1812 } else { 1813 if (seg->s_size + nseg->s_size + len != svd->swresv) { 1814 panic("segvn_unmap: " 1815 "cannot split swap reservation"); 1816 /*NOTREACHED*/ 1817 } 1818 anon_unresv(len); 1819 svd->swresv = seg->s_size; 1820 nsvd->swresv = nseg->s_size; 1821 } 1822 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 1823 seg, len, 0); 1824 } 1825 1826 return (0); /* I'm glad that's all over with! */ 1827 } 1828 1829 static void 1830 segvn_free(struct seg *seg) 1831 { 1832 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1833 pgcnt_t npages = seg_pages(seg); 1834 struct anon_map *amp; 1835 size_t len; 1836 1837 /* 1838 * We don't need any segment level locks for "segvn" data 1839 * since the address space is "write" locked. 1840 */ 1841 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 1842 1843 /* 1844 * Be sure to unlock pages. XXX Why do things get free'ed instead 1845 * of unmapped? XXX 1846 */ 1847 (void) segvn_lockop(seg, seg->s_base, seg->s_size, 1848 0, MC_UNLOCK, NULL, 0); 1849 1850 /* 1851 * Deallocate the vpage and anon pointers if necessary and possible. 1852 */ 1853 if (svd->vpage != NULL) { 1854 kmem_free(svd->vpage, vpgtob(npages)); 1855 svd->vpage = NULL; 1856 } 1857 if ((amp = svd->amp) != NULL) { 1858 /* 1859 * If there are no more references to this anon_map 1860 * structure, then deallocate the structure after freeing 1861 * up all the anon slot pointers that we can. 1862 */ 1863 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1864 ASSERT(amp->a_szc >= seg->s_szc); 1865 if (--amp->refcnt == 0) { 1866 if (svd->type == MAP_PRIVATE) { 1867 /* 1868 * Private - we only need to anon_free 1869 * the part that this segment refers to. 1870 */ 1871 if (seg->s_szc != 0) { 1872 anon_free_pages(amp->ahp, 1873 svd->anon_index, seg->s_size, 1874 seg->s_szc); 1875 } else { 1876 anon_free(amp->ahp, svd->anon_index, 1877 seg->s_size); 1878 } 1879 } else { 1880 /* 1881 * Shared - anon_free the entire 1882 * anon_map's worth of stuff and 1883 * release any swap reservation. 1884 */ 1885 if (amp->a_szc != 0) { 1886 anon_shmap_free_pages(amp, 0, 1887 amp->size); 1888 } else { 1889 anon_free(amp->ahp, 0, amp->size); 1890 } 1891 if ((len = amp->swresv) != 0) { 1892 anon_unresv(len); 1893 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 1894 "anon proc:%p %lu %u", 1895 seg, len, 0); 1896 } 1897 } 1898 svd->amp = NULL; 1899 ANON_LOCK_EXIT(&->a_rwlock); 1900 anonmap_free(amp); 1901 } else if (svd->type == MAP_PRIVATE) { 1902 /* 1903 * We had a private mapping which still has 1904 * a held anon_map so just free up all the 1905 * anon slot pointers that we were using. 1906 */ 1907 if (seg->s_szc != 0) { 1908 anon_free_pages(amp->ahp, svd->anon_index, 1909 seg->s_size, seg->s_szc); 1910 } else { 1911 anon_free(amp->ahp, svd->anon_index, 1912 seg->s_size); 1913 } 1914 ANON_LOCK_EXIT(&->a_rwlock); 1915 } else { 1916 ANON_LOCK_EXIT(&->a_rwlock); 1917 } 1918 } 1919 1920 /* 1921 * Release swap reservation. 1922 */ 1923 if ((len = svd->swresv) != 0) { 1924 anon_unresv(svd->swresv); 1925 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 1926 seg, len, 0); 1927 svd->swresv = 0; 1928 } 1929 /* 1930 * Release claim on vnode, credentials, and finally free the 1931 * private data. 1932 */ 1933 if (svd->vp != NULL) { 1934 if (svd->type == MAP_SHARED) 1935 lgrp_shm_policy_fini(NULL, svd->vp); 1936 VN_RELE(svd->vp); 1937 svd->vp = NULL; 1938 } 1939 crfree(svd->cred); 1940 svd->cred = NULL; 1941 1942 seg->s_data = NULL; 1943 kmem_cache_free(segvn_cache, svd); 1944 } 1945 1946 ulong_t segvn_lpglck_limit = 0; 1947 /* 1948 * Support routines used by segvn_pagelock() and softlock faults for anonymous 1949 * pages to implement availrmem accounting in a way that makes sure the 1950 * same memory is accounted just once for all softlock/pagelock purposes. 1951 * This prevents a bug when availrmem is quickly incorrectly exausted from 1952 * several pagelocks to different parts of the same large page since each 1953 * pagelock has to decrement availrmem by the size of the entire large 1954 * page. Note those pages are not COW shared until softunlock/pageunlock so 1955 * we don't need to use cow style accounting here. We also need to make sure 1956 * the entire large page is accounted even if softlock range is less than the 1957 * entire large page because large anon pages can't be demoted when any of 1958 * constituent pages is locked. The caller calls this routine for every page_t 1959 * it locks. The very first page in the range may not be the root page of a 1960 * large page. For all other pages it's guranteed we are going to visit the 1961 * root of a particular large page before any other constituent page as we are 1962 * locking sequential pages belonging to the same anon map. So we do all the 1963 * locking when the root is encountered except for the very first page. Since 1964 * softlocking is not supported (except S_READ_NOCOW special case) for vmpss 1965 * segments and since vnode pages can be demoted without locking all 1966 * constituent pages vnode pages don't come here. Unlocking relies on the 1967 * fact that pagesize can't change whenever any of constituent large pages is 1968 * locked at least SE_SHARED. This allows unlocking code to find the right 1969 * root and decrement availrmem by the same amount it was incremented when the 1970 * page was locked. 1971 */ 1972 static int 1973 segvn_pp_lock_anonpages(page_t *pp, int first) 1974 { 1975 pgcnt_t pages; 1976 pfn_t pfn; 1977 uchar_t szc = pp->p_szc; 1978 1979 ASSERT(PAGE_LOCKED(pp)); 1980 ASSERT(pp->p_vnode != NULL); 1981 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 1982 1983 /* 1984 * pagesize won't change as long as any constituent page is locked. 1985 */ 1986 pages = page_get_pagecnt(pp->p_szc); 1987 pfn = page_pptonum(pp); 1988 1989 if (!first) { 1990 if (!IS_P2ALIGNED(pfn, pages)) { 1991 #ifdef DEBUG 1992 pp = &pp[-(spgcnt_t)(pfn & (pages - 1))]; 1993 pfn = page_pptonum(pp); 1994 ASSERT(IS_P2ALIGNED(pfn, pages)); 1995 ASSERT(pp->p_szc == szc); 1996 ASSERT(pp->p_vnode != NULL); 1997 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 1998 ASSERT(pp->p_slckcnt != 0); 1999 #endif /* DEBUG */ 2000 return (1); 2001 } 2002 } else if (!IS_P2ALIGNED(pfn, pages)) { 2003 pp = &pp[-(spgcnt_t)(pfn & (pages - 1))]; 2004 #ifdef DEBUG 2005 pfn = page_pptonum(pp); 2006 ASSERT(IS_P2ALIGNED(pfn, pages)); 2007 ASSERT(pp->p_szc == szc); 2008 ASSERT(pp->p_vnode != NULL); 2009 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 2010 #endif /* DEBUG */ 2011 } 2012 2013 /* 2014 * pp is a root page. 2015 * We haven't locked this large page yet. 2016 */ 2017 page_struct_lock(pp); 2018 if (pp->p_slckcnt != 0) { 2019 if (pp->p_slckcnt < PAGE_SLOCK_MAXIMUM) { 2020 pp->p_slckcnt++; 2021 page_struct_unlock(pp); 2022 return (1); 2023 } 2024 page_struct_unlock(pp); 2025 segvn_lpglck_limit++; 2026 return (0); 2027 } 2028 mutex_enter(&freemem_lock); 2029 if (availrmem < tune.t_minarmem + pages) { 2030 mutex_exit(&freemem_lock); 2031 page_struct_unlock(pp); 2032 return (0); 2033 } 2034 pp->p_slckcnt++; 2035 availrmem -= pages; 2036 mutex_exit(&freemem_lock); 2037 page_struct_unlock(pp); 2038 return (1); 2039 } 2040 2041 static void 2042 segvn_pp_unlock_anonpages(page_t *pp, int first) 2043 { 2044 pgcnt_t pages; 2045 pfn_t pfn; 2046 2047 ASSERT(PAGE_LOCKED(pp)); 2048 ASSERT(pp->p_vnode != NULL); 2049 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 2050 2051 /* 2052 * pagesize won't change as long as any constituent page is locked. 2053 */ 2054 pages = page_get_pagecnt(pp->p_szc); 2055 pfn = page_pptonum(pp); 2056 2057 if (!first) { 2058 if (!IS_P2ALIGNED(pfn, pages)) { 2059 return; 2060 } 2061 } else if (!IS_P2ALIGNED(pfn, pages)) { 2062 pp = &pp[-(spgcnt_t)(pfn & (pages - 1))]; 2063 #ifdef DEBUG 2064 pfn = page_pptonum(pp); 2065 ASSERT(IS_P2ALIGNED(pfn, pages)); 2066 #endif /* DEBUG */ 2067 } 2068 ASSERT(pp->p_vnode != NULL); 2069 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 2070 ASSERT(pp->p_slckcnt != 0); 2071 page_struct_lock(pp); 2072 if (--pp->p_slckcnt == 0) { 2073 mutex_enter(&freemem_lock); 2074 availrmem += pages; 2075 mutex_exit(&freemem_lock); 2076 } 2077 page_struct_unlock(pp); 2078 } 2079 2080 /* 2081 * Do a F_SOFTUNLOCK call over the range requested. The range must have 2082 * already been F_SOFTLOCK'ed. 2083 * Caller must always match addr and len of a softunlock with a previous 2084 * softlock with exactly the same addr and len. 2085 */ 2086 static void 2087 segvn_softunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw) 2088 { 2089 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 2090 page_t *pp; 2091 caddr_t adr; 2092 struct vnode *vp; 2093 u_offset_t offset; 2094 ulong_t anon_index; 2095 struct anon_map *amp; 2096 struct anon *ap = NULL; 2097 2098 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 2099 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 2100 2101 if ((amp = svd->amp) != NULL) 2102 anon_index = svd->anon_index + seg_page(seg, addr); 2103 2104 hat_unlock(seg->s_as->a_hat, addr, len); 2105 for (adr = addr; adr < addr + len; adr += PAGESIZE) { 2106 if (amp != NULL) { 2107 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 2108 if ((ap = anon_get_ptr(amp->ahp, anon_index++)) 2109 != NULL) { 2110 swap_xlate(ap, &vp, &offset); 2111 } else { 2112 vp = svd->vp; 2113 offset = svd->offset + 2114 (uintptr_t)(adr - seg->s_base); 2115 } 2116 ANON_LOCK_EXIT(&->a_rwlock); 2117 } else { 2118 vp = svd->vp; 2119 offset = svd->offset + 2120 (uintptr_t)(adr - seg->s_base); 2121 } 2122 2123 /* 2124 * Use page_find() instead of page_lookup() to 2125 * find the page since we know that it is locked. 2126 */ 2127 pp = page_find(vp, offset); 2128 if (pp == NULL) { 2129 panic( 2130 "segvn_softunlock: addr %p, ap %p, vp %p, off %llx", 2131 (void *)adr, (void *)ap, (void *)vp, offset); 2132 /*NOTREACHED*/ 2133 } 2134 2135 if (rw == S_WRITE) { 2136 hat_setrefmod(pp); 2137 if (seg->s_as->a_vbits) 2138 hat_setstat(seg->s_as, adr, PAGESIZE, 2139 P_REF | P_MOD); 2140 } else if (rw != S_OTHER) { 2141 hat_setref(pp); 2142 if (seg->s_as->a_vbits) 2143 hat_setstat(seg->s_as, adr, PAGESIZE, P_REF); 2144 } 2145 TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT, 2146 "segvn_fault:pp %p vp %p offset %llx", pp, vp, offset); 2147 if (svd->vp == NULL) { 2148 segvn_pp_unlock_anonpages(pp, adr == addr); 2149 } 2150 page_unlock(pp); 2151 } 2152 mutex_enter(&freemem_lock); /* for availrmem */ 2153 if (svd->vp != NULL) { 2154 availrmem += btop(len); 2155 } 2156 segvn_pages_locked -= btop(len); 2157 svd->softlockcnt -= btop(len); 2158 mutex_exit(&freemem_lock); 2159 if (svd->softlockcnt == 0) { 2160 /* 2161 * All SOFTLOCKS are gone. Wakeup any waiting 2162 * unmappers so they can try again to unmap. 2163 * Check for waiters first without the mutex 2164 * held so we don't always grab the mutex on 2165 * softunlocks. 2166 */ 2167 if (AS_ISUNMAPWAIT(seg->s_as)) { 2168 mutex_enter(&seg->s_as->a_contents); 2169 if (AS_ISUNMAPWAIT(seg->s_as)) { 2170 AS_CLRUNMAPWAIT(seg->s_as); 2171 cv_broadcast(&seg->s_as->a_cv); 2172 } 2173 mutex_exit(&seg->s_as->a_contents); 2174 } 2175 } 2176 } 2177 2178 #define PAGE_HANDLED ((page_t *)-1) 2179 2180 /* 2181 * Release all the pages in the NULL terminated ppp list 2182 * which haven't already been converted to PAGE_HANDLED. 2183 */ 2184 static void 2185 segvn_pagelist_rele(page_t **ppp) 2186 { 2187 for (; *ppp != NULL; ppp++) { 2188 if (*ppp != PAGE_HANDLED) 2189 page_unlock(*ppp); 2190 } 2191 } 2192 2193 static int stealcow = 1; 2194 2195 /* 2196 * Workaround for viking chip bug. See bug id 1220902. 2197 * To fix this down in pagefault() would require importing so 2198 * much as and segvn code as to be unmaintainable. 2199 */ 2200 int enable_mbit_wa = 0; 2201 2202 /* 2203 * Handles all the dirty work of getting the right 2204 * anonymous pages and loading up the translations. 2205 * This routine is called only from segvn_fault() 2206 * when looping over the range of addresses requested. 2207 * 2208 * The basic algorithm here is: 2209 * If this is an anon_zero case 2210 * Call anon_zero to allocate page 2211 * Load up translation 2212 * Return 2213 * endif 2214 * If this is an anon page 2215 * Use anon_getpage to get the page 2216 * else 2217 * Find page in pl[] list passed in 2218 * endif 2219 * If not a cow 2220 * Load up the translation to the page 2221 * return 2222 * endif 2223 * Call anon_private to handle cow 2224 * Load up (writable) translation to new page 2225 */ 2226 static faultcode_t 2227 segvn_faultpage( 2228 struct hat *hat, /* the hat to use for mapping */ 2229 struct seg *seg, /* seg_vn of interest */ 2230 caddr_t addr, /* address in as */ 2231 u_offset_t off, /* offset in vp */ 2232 struct vpage *vpage, /* pointer to vpage for vp, off */ 2233 page_t *pl[], /* object source page pointer */ 2234 uint_t vpprot, /* access allowed to object pages */ 2235 enum fault_type type, /* type of fault */ 2236 enum seg_rw rw, /* type of access at fault */ 2237 int brkcow, /* we may need to break cow */ 2238 int first) /* first page for this fault if 1 */ 2239 { 2240 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 2241 page_t *pp, **ppp; 2242 uint_t pageflags = 0; 2243 page_t *anon_pl[1 + 1]; 2244 page_t *opp = NULL; /* original page */ 2245 uint_t prot; 2246 int err; 2247 int cow; 2248 int claim; 2249 int steal = 0; 2250 ulong_t anon_index; 2251 struct anon *ap, *oldap; 2252 struct anon_map *amp; 2253 int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD; 2254 int anon_lock = 0; 2255 anon_sync_obj_t cookie; 2256 2257 if (svd->flags & MAP_TEXT) { 2258 hat_flag |= HAT_LOAD_TEXT; 2259 } 2260 2261 ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock)); 2262 ASSERT(seg->s_szc == 0); 2263 2264 /* 2265 * Initialize protection value for this page. 2266 * If we have per page protection values check it now. 2267 */ 2268 if (svd->pageprot) { 2269 uint_t protchk; 2270 2271 switch (rw) { 2272 case S_READ: 2273 protchk = PROT_READ; 2274 break; 2275 case S_WRITE: 2276 protchk = PROT_WRITE; 2277 break; 2278 case S_EXEC: 2279 protchk = PROT_EXEC; 2280 break; 2281 case S_OTHER: 2282 default: 2283 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 2284 break; 2285 } 2286 2287 prot = VPP_PROT(vpage); 2288 if ((prot & protchk) == 0) 2289 return (FC_PROT); /* illegal access type */ 2290 } else { 2291 prot = svd->prot; 2292 } 2293 2294 if (type == F_SOFTLOCK && svd->vp != NULL) { 2295 mutex_enter(&freemem_lock); 2296 if (availrmem <= tune.t_minarmem) { 2297 mutex_exit(&freemem_lock); 2298 return (FC_MAKE_ERR(ENOMEM)); /* out of real memory */ 2299 } else { 2300 availrmem--; 2301 svd->softlockcnt++; 2302 segvn_pages_locked++; 2303 } 2304 mutex_exit(&freemem_lock); 2305 } 2306 2307 /* 2308 * Always acquire the anon array lock to prevent 2 threads from 2309 * allocating separate anon slots for the same "addr". 2310 */ 2311 2312 if ((amp = svd->amp) != NULL) { 2313 ASSERT(RW_READ_HELD(&->a_rwlock)); 2314 anon_index = svd->anon_index + seg_page(seg, addr); 2315 anon_array_enter(amp, anon_index, &cookie); 2316 anon_lock = 1; 2317 } 2318 2319 if (svd->vp == NULL && amp != NULL) { 2320 if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) { 2321 /* 2322 * Allocate a (normally) writable anonymous page of 2323 * zeroes. If no advance reservations, reserve now. 2324 */ 2325 if (svd->flags & MAP_NORESERVE) { 2326 if (anon_resv_zone(ptob(1), 2327 seg->s_as->a_proc->p_zone)) { 2328 atomic_add_long(&svd->swresv, ptob(1)); 2329 } else { 2330 err = ENOMEM; 2331 goto out; 2332 } 2333 } 2334 if ((pp = anon_zero(seg, addr, &ap, 2335 svd->cred)) == NULL) { 2336 err = ENOMEM; 2337 goto out; /* out of swap space */ 2338 } 2339 /* 2340 * Re-acquire the anon_map lock and 2341 * initialize the anon array entry. 2342 */ 2343 (void) anon_set_ptr(amp->ahp, anon_index, ap, 2344 ANON_SLEEP); 2345 2346 ASSERT(pp->p_szc == 0); 2347 2348 /* 2349 * Handle pages that have been marked for migration 2350 */ 2351 if (lgrp_optimizations()) 2352 page_migrate(seg, addr, &pp, 1); 2353 2354 if (type == F_SOFTLOCK) { 2355 if (!segvn_pp_lock_anonpages(pp, first)) { 2356 page_unlock(pp); 2357 err = ENOMEM; 2358 goto out; 2359 } else { 2360 mutex_enter(&freemem_lock); 2361 svd->softlockcnt++; 2362 segvn_pages_locked++; 2363 mutex_exit(&freemem_lock); 2364 } 2365 } 2366 2367 if (enable_mbit_wa) { 2368 if (rw == S_WRITE) 2369 hat_setmod(pp); 2370 else if (!hat_ismod(pp)) 2371 prot &= ~PROT_WRITE; 2372 } 2373 /* 2374 * If AS_PAGLCK is set in a_flags (via memcntl(2) 2375 * with MC_LOCKAS, MCL_FUTURE) and this is a 2376 * MAP_NORESERVE segment, we may need to 2377 * permanently lock the page as it is being faulted 2378 * for the first time. The following text applies 2379 * only to MAP_NORESERVE segments: 2380 * 2381 * As per memcntl(2), if this segment was created 2382 * after MCL_FUTURE was applied (a "future" 2383 * segment), its pages must be locked. If this 2384 * segment existed at MCL_FUTURE application (a 2385 * "past" segment), the interface is unclear. 2386 * 2387 * We decide to lock only if vpage is present: 2388 * 2389 * - "future" segments will have a vpage array (see 2390 * as_map), and so will be locked as required 2391 * 2392 * - "past" segments may not have a vpage array, 2393 * depending on whether events (such as 2394 * mprotect) have occurred. Locking if vpage 2395 * exists will preserve legacy behavior. Not 2396 * locking if vpage is absent, will not break 2397 * the interface or legacy behavior. Note that 2398 * allocating vpage here if it's absent requires 2399 * upgrading the segvn reader lock, the cost of 2400 * which does not seem worthwhile. 2401 * 2402 * Usually testing and setting VPP_ISPPLOCK and 2403 * VPP_SETPPLOCK requires holding the segvn lock as 2404 * writer, but in this case all readers are 2405 * serializing on the anon array lock. 2406 */ 2407 if (AS_ISPGLCK(seg->s_as) && vpage != NULL && 2408 (svd->flags & MAP_NORESERVE) && 2409 !VPP_ISPPLOCK(vpage)) { 2410 proc_t *p = seg->s_as->a_proc; 2411 ASSERT(svd->type == MAP_PRIVATE); 2412 mutex_enter(&p->p_lock); 2413 if (rctl_incr_locked_mem(p, NULL, PAGESIZE, 2414 1) == 0) { 2415 claim = VPP_PROT(vpage) & PROT_WRITE; 2416 if (page_pp_lock(pp, claim, 0)) { 2417 VPP_SETPPLOCK(vpage); 2418 } else { 2419 rctl_decr_locked_mem(p, NULL, 2420 PAGESIZE, 1); 2421 } 2422 } 2423 mutex_exit(&p->p_lock); 2424 } 2425 2426 hat_memload(hat, addr, pp, prot, hat_flag); 2427 2428 if (!(hat_flag & HAT_LOAD_LOCK)) 2429 page_unlock(pp); 2430 2431 anon_array_exit(&cookie); 2432 return (0); 2433 } 2434 } 2435 2436 /* 2437 * Obtain the page structure via anon_getpage() if it is 2438 * a private copy of an object (the result of a previous 2439 * copy-on-write). 2440 */ 2441 if (amp != NULL) { 2442 if ((ap = anon_get_ptr(amp->ahp, anon_index)) != NULL) { 2443 err = anon_getpage(&ap, &vpprot, anon_pl, PAGESIZE, 2444 seg, addr, rw, svd->cred); 2445 if (err) 2446 goto out; 2447 2448 if (svd->type == MAP_SHARED) { 2449 /* 2450 * If this is a shared mapping to an 2451 * anon_map, then ignore the write 2452 * permissions returned by anon_getpage(). 2453 * They apply to the private mappings 2454 * of this anon_map. 2455 */ 2456 vpprot |= PROT_WRITE; 2457 } 2458 opp = anon_pl[0]; 2459 } 2460 } 2461 2462 /* 2463 * Search the pl[] list passed in if it is from the 2464 * original object (i.e., not a private copy). 2465 */ 2466 if (opp == NULL) { 2467 /* 2468 * Find original page. We must be bringing it in 2469 * from the list in pl[]. 2470 */ 2471 for (ppp = pl; (opp = *ppp) != NULL; ppp++) { 2472 if (opp == PAGE_HANDLED) 2473 continue; 2474 ASSERT(opp->p_vnode == svd->vp); /* XXX */ 2475 if (opp->p_offset == off) 2476 break; 2477 } 2478 if (opp == NULL) { 2479 panic("segvn_faultpage not found"); 2480 /*NOTREACHED*/ 2481 } 2482 *ppp = PAGE_HANDLED; 2483 2484 } 2485 2486 ASSERT(PAGE_LOCKED(opp)); 2487 2488 TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT, 2489 "segvn_fault:pp %p vp %p offset %llx", 2490 opp, NULL, 0); 2491 2492 /* 2493 * The fault is treated as a copy-on-write fault if a 2494 * write occurs on a private segment and the object 2495 * page (i.e., mapping) is write protected. We assume 2496 * that fatal protection checks have already been made. 2497 */ 2498 2499 cow = brkcow && ((vpprot & PROT_WRITE) == 0); 2500 2501 /* 2502 * If not a copy-on-write case load the translation 2503 * and return. 2504 */ 2505 if (cow == 0) { 2506 2507 /* 2508 * Handle pages that have been marked for migration 2509 */ 2510 if (lgrp_optimizations()) 2511 page_migrate(seg, addr, &opp, 1); 2512 2513 if (type == F_SOFTLOCK && svd->vp == NULL) { 2514 2515 ASSERT(opp->p_szc == 0 || 2516 (svd->type == MAP_SHARED && 2517 amp != NULL && amp->a_szc != 0)); 2518 2519 if (!segvn_pp_lock_anonpages(opp, first)) { 2520 page_unlock(opp); 2521 err = ENOMEM; 2522 goto out; 2523 } else { 2524 mutex_enter(&freemem_lock); 2525 svd->softlockcnt++; 2526 segvn_pages_locked++; 2527 mutex_exit(&freemem_lock); 2528 } 2529 } 2530 if (IS_VMODSORT(opp->p_vnode) || enable_mbit_wa) { 2531 if (rw == S_WRITE) 2532 hat_setmod(opp); 2533 else if (rw != S_OTHER && !hat_ismod(opp)) 2534 prot &= ~PROT_WRITE; 2535 } 2536 2537 hat_memload(hat, addr, opp, prot & vpprot, hat_flag); 2538 2539 if (!(hat_flag & HAT_LOAD_LOCK)) 2540 page_unlock(opp); 2541 2542 if (anon_lock) { 2543 anon_array_exit(&cookie); 2544 } 2545 return (0); 2546 } 2547 2548 hat_setref(opp); 2549 2550 ASSERT(amp != NULL && anon_lock); 2551 2552 /* 2553 * Steal the page only if it isn't a private page 2554 * since stealing a private page is not worth the effort. 2555 */ 2556 if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) 2557 steal = 1; 2558 2559 /* 2560 * Steal the original page if the following conditions are true: 2561 * 2562 * We are low on memory, the page is not private, page is not large, 2563 * not shared, not modified, not `locked' or if we have it `locked' 2564 * (i.e., p_cowcnt == 1 and p_lckcnt == 0, which also implies 2565 * that the page is not shared) and if it doesn't have any 2566 * translations. page_struct_lock isn't needed to look at p_cowcnt 2567 * and p_lckcnt because we first get exclusive lock on page. 2568 */ 2569 (void) hat_pagesync(opp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD); 2570 2571 if (stealcow && freemem < minfree && steal && opp->p_szc == 0 && 2572 page_tryupgrade(opp) && !hat_ismod(opp) && 2573 ((opp->p_lckcnt == 0 && opp->p_cowcnt == 0) || 2574 (opp->p_lckcnt == 0 && opp->p_cowcnt == 1 && 2575 vpage != NULL && VPP_ISPPLOCK(vpage)))) { 2576 /* 2577 * Check if this page has other translations 2578 * after unloading our translation. 2579 */ 2580 if (hat_page_is_mapped(opp)) { 2581 hat_unload(seg->s_as->a_hat, addr, PAGESIZE, 2582 HAT_UNLOAD); 2583 } 2584 2585 /* 2586 * hat_unload() might sync back someone else's recent 2587 * modification, so check again. 2588 */ 2589 if (!hat_ismod(opp) && !hat_page_is_mapped(opp)) 2590 pageflags |= STEAL_PAGE; 2591 } 2592 2593 /* 2594 * If we have a vpage pointer, see if it indicates that we have 2595 * ``locked'' the page we map -- if so, tell anon_private to 2596 * transfer the locking resource to the new page. 2597 * 2598 * See Statement at the beginning of segvn_lockop regarding 2599 * the way lockcnts/cowcnts are handled during COW. 2600 * 2601 */ 2602 if (vpage != NULL && VPP_ISPPLOCK(vpage)) 2603 pageflags |= LOCK_PAGE; 2604 2605 /* 2606 * Allocate a private page and perform the copy. 2607 * For MAP_NORESERVE reserve swap space now, unless this 2608 * is a cow fault on an existing anon page in which case 2609 * MAP_NORESERVE will have made advance reservations. 2610 */ 2611 if ((svd->flags & MAP_NORESERVE) && (ap == NULL)) { 2612 if (anon_resv(ptob(1))) { 2613 svd->swresv += ptob(1); 2614 } else { 2615 page_unlock(opp); 2616 err = ENOMEM; 2617 goto out; 2618 } 2619 } 2620 oldap = ap; 2621 pp = anon_private(&ap, seg, addr, prot, opp, pageflags, svd->cred); 2622 if (pp == NULL) { 2623 err = ENOMEM; /* out of swap space */ 2624 goto out; 2625 } 2626 2627 /* 2628 * If we copied away from an anonymous page, then 2629 * we are one step closer to freeing up an anon slot. 2630 * 2631 * NOTE: The original anon slot must be released while 2632 * holding the "anon_map" lock. This is necessary to prevent 2633 * other threads from obtaining a pointer to the anon slot 2634 * which may be freed if its "refcnt" is 1. 2635 */ 2636 if (oldap != NULL) 2637 anon_decref(oldap); 2638 2639 (void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP); 2640 2641 /* 2642 * Handle pages that have been marked for migration 2643 */ 2644 if (lgrp_optimizations()) 2645 page_migrate(seg, addr, &pp, 1); 2646 2647 ASSERT(pp->p_szc == 0); 2648 if (type == F_SOFTLOCK && svd->vp == NULL) { 2649 if (!segvn_pp_lock_anonpages(pp, first)) { 2650 page_unlock(pp); 2651 err = ENOMEM; 2652 goto out; 2653 } else { 2654 mutex_enter(&freemem_lock); 2655 svd->softlockcnt++; 2656 segvn_pages_locked++; 2657 mutex_exit(&freemem_lock); 2658 } 2659 } 2660 2661 ASSERT(!IS_VMODSORT(pp->p_vnode)); 2662 if (enable_mbit_wa) { 2663 if (rw == S_WRITE) 2664 hat_setmod(pp); 2665 else if (!hat_ismod(pp)) 2666 prot &= ~PROT_WRITE; 2667 } 2668 2669 hat_memload(hat, addr, pp, prot, hat_flag); 2670 2671 if (!(hat_flag & HAT_LOAD_LOCK)) 2672 page_unlock(pp); 2673 2674 ASSERT(anon_lock); 2675 anon_array_exit(&cookie); 2676 return (0); 2677 out: 2678 if (anon_lock) 2679 anon_array_exit(&cookie); 2680 2681 if (type == F_SOFTLOCK && svd->vp != NULL) { 2682 mutex_enter(&freemem_lock); 2683 availrmem++; 2684 segvn_pages_locked--; 2685 svd->softlockcnt--; 2686 mutex_exit(&freemem_lock); 2687 } 2688 return (FC_MAKE_ERR(err)); 2689 } 2690 2691 /* 2692 * relocate a bunch of smaller targ pages into one large repl page. all targ 2693 * pages must be complete pages smaller than replacement pages. 2694 * it's assumed that no page's szc can change since they are all PAGESIZE or 2695 * complete large pages locked SHARED. 2696 */ 2697 static void 2698 segvn_relocate_pages(page_t **targ, page_t *replacement) 2699 { 2700 page_t *pp; 2701 pgcnt_t repl_npgs, curnpgs; 2702 pgcnt_t i; 2703 uint_t repl_szc = replacement->p_szc; 2704 page_t *first_repl = replacement; 2705 page_t *repl; 2706 spgcnt_t npgs; 2707 2708 VM_STAT_ADD(segvnvmstats.relocatepages[0]); 2709 2710 ASSERT(repl_szc != 0); 2711 npgs = repl_npgs = page_get_pagecnt(repl_szc); 2712 2713 i = 0; 2714 while (repl_npgs) { 2715 spgcnt_t nreloc; 2716 int err; 2717 ASSERT(replacement != NULL); 2718 pp = targ[i]; 2719 ASSERT(pp->p_szc < repl_szc); 2720 ASSERT(PAGE_EXCL(pp)); 2721 ASSERT(!PP_ISFREE(pp)); 2722 curnpgs = page_get_pagecnt(pp->p_szc); 2723 if (curnpgs == 1) { 2724 VM_STAT_ADD(segvnvmstats.relocatepages[1]); 2725 repl = replacement; 2726 page_sub(&replacement, repl); 2727 ASSERT(PAGE_EXCL(repl)); 2728 ASSERT(!PP_ISFREE(repl)); 2729 ASSERT(repl->p_szc == repl_szc); 2730 } else { 2731 page_t *repl_savepp; 2732 int j; 2733 VM_STAT_ADD(segvnvmstats.relocatepages[2]); 2734 repl_savepp = replacement; 2735 for (j = 0; j < curnpgs; j++) { 2736 repl = replacement; 2737 page_sub(&replacement, repl); 2738 ASSERT(PAGE_EXCL(repl)); 2739 ASSERT(!PP_ISFREE(repl)); 2740 ASSERT(repl->p_szc == repl_szc); 2741 ASSERT(page_pptonum(targ[i + j]) == 2742 page_pptonum(targ[i]) + j); 2743 } 2744 repl = repl_savepp; 2745 ASSERT(IS_P2ALIGNED(page_pptonum(repl), curnpgs)); 2746 } 2747 err = page_relocate(&pp, &repl, 0, 1, &nreloc, NULL); 2748 if (err || nreloc != curnpgs) { 2749 panic("segvn_relocate_pages: " 2750 "page_relocate failed err=%d curnpgs=%ld " 2751 "nreloc=%ld", err, curnpgs, nreloc); 2752 } 2753 ASSERT(curnpgs <= repl_npgs); 2754 repl_npgs -= curnpgs; 2755 i += curnpgs; 2756 } 2757 ASSERT(replacement == NULL); 2758 2759 repl = first_repl; 2760 repl_npgs = npgs; 2761 for (i = 0; i < repl_npgs; i++) { 2762 ASSERT(PAGE_EXCL(repl)); 2763 ASSERT(!PP_ISFREE(repl)); 2764 targ[i] = repl; 2765 page_downgrade(targ[i]); 2766 repl++; 2767 } 2768 } 2769 2770 /* 2771 * Check if all pages in ppa array are complete smaller than szc pages and 2772 * their roots will still be aligned relative to their current size if the 2773 * entire ppa array is relocated into one szc page. If these conditions are 2774 * not met return 0. 2775 * 2776 * If all pages are properly aligned attempt to upgrade their locks 2777 * to exclusive mode. If it fails set *upgrdfail to 1 and return 0. 2778 * upgrdfail was set to 0 by caller. 2779 * 2780 * Return 1 if all pages are aligned and locked exclusively. 2781 * 2782 * If all pages in ppa array happen to be physically contiguous to make one 2783 * szc page and all exclusive locks are successfully obtained promote the page 2784 * size to szc and set *pszc to szc. Return 1 with pages locked shared. 2785 */ 2786 static int 2787 segvn_full_szcpages(page_t **ppa, uint_t szc, int *upgrdfail, uint_t *pszc) 2788 { 2789 page_t *pp; 2790 pfn_t pfn; 2791 pgcnt_t totnpgs = page_get_pagecnt(szc); 2792 pfn_t first_pfn; 2793 int contig = 1; 2794 pgcnt_t i; 2795 pgcnt_t j; 2796 uint_t curszc; 2797 pgcnt_t curnpgs; 2798 int root = 0; 2799 2800 ASSERT(szc > 0); 2801 2802 VM_STAT_ADD(segvnvmstats.fullszcpages[0]); 2803 2804 for (i = 0; i < totnpgs; i++) { 2805 pp = ppa[i]; 2806 ASSERT(PAGE_SHARED(pp)); 2807 ASSERT(!PP_ISFREE(pp)); 2808 pfn = page_pptonum(pp); 2809 if (i == 0) { 2810 if (!IS_P2ALIGNED(pfn, totnpgs)) { 2811 contig = 0; 2812 } else { 2813 first_pfn = pfn; 2814 } 2815 } else if (contig && pfn != first_pfn + i) { 2816 contig = 0; 2817 } 2818 if (pp->p_szc == 0) { 2819 if (root) { 2820 VM_STAT_ADD(segvnvmstats.fullszcpages[1]); 2821 return (0); 2822 } 2823 } else if (!root) { 2824 if ((curszc = pp->p_szc) >= szc) { 2825 VM_STAT_ADD(segvnvmstats.fullszcpages[2]); 2826 return (0); 2827 } 2828 if (curszc == 0) { 2829 /* 2830 * p_szc changed means we don't have all pages 2831 * locked. return failure. 2832 */ 2833 VM_STAT_ADD(segvnvmstats.fullszcpages[3]); 2834 return (0); 2835 } 2836 curnpgs = page_get_pagecnt(curszc); 2837 if (!IS_P2ALIGNED(pfn, curnpgs) || 2838 !IS_P2ALIGNED(i, curnpgs)) { 2839 VM_STAT_ADD(segvnvmstats.fullszcpages[4]); 2840 return (0); 2841 } 2842 root = 1; 2843 } else { 2844 ASSERT(i > 0); 2845 VM_STAT_ADD(segvnvmstats.fullszcpages[5]); 2846 if (pp->p_szc != curszc) { 2847 VM_STAT_ADD(segvnvmstats.fullszcpages[6]); 2848 return (0); 2849 } 2850 if (pfn - 1 != page_pptonum(ppa[i - 1])) { 2851 panic("segvn_full_szcpages: " 2852 "large page not physically contiguous"); 2853 } 2854 if (P2PHASE(pfn, curnpgs) == curnpgs - 1) { 2855 root = 0; 2856 } 2857 } 2858 } 2859 2860 for (i = 0; i < totnpgs; i++) { 2861 ASSERT(ppa[i]->p_szc < szc); 2862 if (!page_tryupgrade(ppa[i])) { 2863 for (j = 0; j < i; j++) { 2864 page_downgrade(ppa[j]); 2865 } 2866 *pszc = ppa[i]->p_szc; 2867 *upgrdfail = 1; 2868 VM_STAT_ADD(segvnvmstats.fullszcpages[7]); 2869 return (0); 2870 } 2871 } 2872 2873 /* 2874 * When a page is put a free cachelist its szc is set to 0. if file 2875 * system reclaimed pages from cachelist targ pages will be physically 2876 * contiguous with 0 p_szc. in this case just upgrade szc of targ 2877 * pages without any relocations. 2878 * To avoid any hat issues with previous small mappings 2879 * hat_pageunload() the target pages first. 2880 */ 2881 if (contig) { 2882 VM_STAT_ADD(segvnvmstats.fullszcpages[8]); 2883 for (i = 0; i < totnpgs; i++) { 2884 (void) hat_pageunload(ppa[i], HAT_FORCE_PGUNLOAD); 2885 } 2886 for (i = 0; i < totnpgs; i++) { 2887 ppa[i]->p_szc = szc; 2888 } 2889 for (i = 0; i < totnpgs; i++) { 2890 ASSERT(PAGE_EXCL(ppa[i])); 2891 page_downgrade(ppa[i]); 2892 } 2893 if (pszc != NULL) { 2894 *pszc = szc; 2895 } 2896 } 2897 VM_STAT_ADD(segvnvmstats.fullszcpages[9]); 2898 return (1); 2899 } 2900 2901 /* 2902 * Create physically contiguous pages for [vp, off] - [vp, off + 2903 * page_size(szc)) range and for private segment return them in ppa array. 2904 * Pages are created either via IO or relocations. 2905 * 2906 * Return 1 on sucess and 0 on failure. 2907 * 2908 * If physically contiguos pages already exist for this range return 1 without 2909 * filling ppa array. Caller initializes ppa[0] as NULL to detect that ppa 2910 * array wasn't filled. In this case caller fills ppa array via VOP_GETPAGE(). 2911 */ 2912 2913 static int 2914 segvn_fill_vp_pages(struct segvn_data *svd, vnode_t *vp, u_offset_t off, 2915 uint_t szc, page_t **ppa, page_t **ppplist, uint_t *ret_pszc, 2916 int *downsize) 2917 2918 { 2919 page_t *pplist = *ppplist; 2920 size_t pgsz = page_get_pagesize(szc); 2921 pgcnt_t pages = btop(pgsz); 2922 ulong_t start_off = off; 2923 u_offset_t eoff = off + pgsz; 2924 spgcnt_t nreloc; 2925 u_offset_t io_off = off; 2926 size_t io_len; 2927 page_t *io_pplist = NULL; 2928 page_t *done_pplist = NULL; 2929 pgcnt_t pgidx = 0; 2930 page_t *pp; 2931 page_t *newpp; 2932 page_t *targpp; 2933 int io_err = 0; 2934 int i; 2935 pfn_t pfn; 2936 ulong_t ppages; 2937 page_t *targ_pplist = NULL; 2938 page_t *repl_pplist = NULL; 2939 page_t *tmp_pplist; 2940 int nios = 0; 2941 uint_t pszc; 2942 struct vattr va; 2943 2944 VM_STAT_ADD(segvnvmstats.fill_vp_pages[0]); 2945 2946 ASSERT(szc != 0); 2947 ASSERT(pplist->p_szc == szc); 2948 2949 /* 2950 * downsize will be set to 1 only if we fail to lock pages. this will 2951 * allow subsequent faults to try to relocate the page again. If we 2952 * fail due to misalignment don't downsize and let the caller map the 2953 * whole region with small mappings to avoid more faults into the area 2954 * where we can't get large pages anyway. 2955 */ 2956 *downsize = 0; 2957 2958 while (off < eoff) { 2959 newpp = pplist; 2960 ASSERT(newpp != NULL); 2961 ASSERT(PAGE_EXCL(newpp)); 2962 ASSERT(!PP_ISFREE(newpp)); 2963 /* 2964 * we pass NULL for nrelocp to page_lookup_create() 2965 * so that it doesn't relocate. We relocate here 2966 * later only after we make sure we can lock all 2967 * pages in the range we handle and they are all 2968 * aligned. 2969 */ 2970 pp = page_lookup_create(vp, off, SE_SHARED, newpp, NULL, 0); 2971 ASSERT(pp != NULL); 2972 ASSERT(!PP_ISFREE(pp)); 2973 ASSERT(pp->p_vnode == vp); 2974 ASSERT(pp->p_offset == off); 2975 if (pp == newpp) { 2976 VM_STAT_ADD(segvnvmstats.fill_vp_pages[1]); 2977 page_sub(&pplist, pp); 2978 ASSERT(PAGE_EXCL(pp)); 2979 ASSERT(page_iolock_assert(pp)); 2980 page_list_concat(&io_pplist, &pp); 2981 off += PAGESIZE; 2982 continue; 2983 } 2984 VM_STAT_ADD(segvnvmstats.fill_vp_pages[2]); 2985 pfn = page_pptonum(pp); 2986 pszc = pp->p_szc; 2987 if (pszc >= szc && targ_pplist == NULL && io_pplist == NULL && 2988 IS_P2ALIGNED(pfn, pages)) { 2989 ASSERT(repl_pplist == NULL); 2990 ASSERT(done_pplist == NULL); 2991 ASSERT(pplist == *ppplist); 2992 page_unlock(pp); 2993 page_free_replacement_page(pplist); 2994 page_create_putback(pages); 2995 *ppplist = NULL; 2996 VM_STAT_ADD(segvnvmstats.fill_vp_pages[3]); 2997 return (1); 2998 } 2999 if (pszc >= szc) { 3000 page_unlock(pp); 3001 segvn_faultvnmpss_align_err1++; 3002 goto out; 3003 } 3004 ppages = page_get_pagecnt(pszc); 3005 if (!IS_P2ALIGNED(pfn, ppages)) { 3006 ASSERT(pszc > 0); 3007 /* 3008 * sizing down to pszc won't help. 3009 */ 3010 page_unlock(pp); 3011 segvn_faultvnmpss_align_err2++; 3012 goto out; 3013 } 3014 pfn = page_pptonum(newpp); 3015 if (!IS_P2ALIGNED(pfn, ppages)) { 3016 ASSERT(pszc > 0); 3017 /* 3018 * sizing down to pszc won't help. 3019 */ 3020 page_unlock(pp); 3021 segvn_faultvnmpss_align_err3++; 3022 goto out; 3023 } 3024 if (!PAGE_EXCL(pp)) { 3025 VM_STAT_ADD(segvnvmstats.fill_vp_pages[4]); 3026 page_unlock(pp); 3027 *downsize = 1; 3028 *ret_pszc = pp->p_szc; 3029 goto out; 3030 } 3031 targpp = pp; 3032 if (io_pplist != NULL) { 3033 VM_STAT_ADD(segvnvmstats.fill_vp_pages[5]); 3034 io_len = off - io_off; 3035 /* 3036 * Some file systems like NFS don't check EOF 3037 * conditions in VOP_PAGEIO(). Check it here 3038 * now that pages are locked SE_EXCL. Any file 3039 * truncation will wait until the pages are 3040 * unlocked so no need to worry that file will 3041 * be truncated after we check its size here. 3042 * XXX fix NFS to remove this check. 3043 */ 3044 va.va_mask = AT_SIZE; 3045 if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred) != 0) { 3046 VM_STAT_ADD(segvnvmstats.fill_vp_pages[6]); 3047 page_unlock(targpp); 3048 goto out; 3049 } 3050 if (btopr(va.va_size) < btopr(io_off + io_len)) { 3051 VM_STAT_ADD(segvnvmstats.fill_vp_pages[7]); 3052 *downsize = 1; 3053 *ret_pszc = 0; 3054 page_unlock(targpp); 3055 goto out; 3056 } 3057 io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len, 3058 B_READ, svd->cred); 3059 if (io_err) { 3060 VM_STAT_ADD(segvnvmstats.fill_vp_pages[8]); 3061 page_unlock(targpp); 3062 if (io_err == EDEADLK) { 3063 segvn_vmpss_pageio_deadlk_err++; 3064 } 3065 goto out; 3066 } 3067 nios++; 3068 VM_STAT_ADD(segvnvmstats.fill_vp_pages[9]); 3069 while (io_pplist != NULL) { 3070 pp = io_pplist; 3071 page_sub(&io_pplist, pp); 3072 ASSERT(page_iolock_assert(pp)); 3073 page_io_unlock(pp); 3074 pgidx = (pp->p_offset - start_off) >> 3075 PAGESHIFT; 3076 ASSERT(pgidx < pages); 3077 ppa[pgidx] = pp; 3078 page_list_concat(&done_pplist, &pp); 3079 } 3080 } 3081 pp = targpp; 3082 ASSERT(PAGE_EXCL(pp)); 3083 ASSERT(pp->p_szc <= pszc); 3084 if (pszc != 0 && !group_page_trylock(pp, SE_EXCL)) { 3085 VM_STAT_ADD(segvnvmstats.fill_vp_pages[10]); 3086 page_unlock(pp); 3087 *downsize = 1; 3088 *ret_pszc = pp->p_szc; 3089 goto out; 3090 } 3091 VM_STAT_ADD(segvnvmstats.fill_vp_pages[11]); 3092 /* 3093 * page szc chould have changed before the entire group was 3094 * locked. reread page szc. 3095 */ 3096 pszc = pp->p_szc; 3097 ppages = page_get_pagecnt(pszc); 3098 3099 /* link just the roots */ 3100 page_list_concat(&targ_pplist, &pp); 3101 page_sub(&pplist, newpp); 3102 page_list_concat(&repl_pplist, &newpp); 3103 off += PAGESIZE; 3104 while (--ppages != 0) { 3105 newpp = pplist; 3106 page_sub(&pplist, newpp); 3107 off += PAGESIZE; 3108 } 3109 io_off = off; 3110 } 3111 if (io_pplist != NULL) { 3112 VM_STAT_ADD(segvnvmstats.fill_vp_pages[12]); 3113 io_len = eoff - io_off; 3114 va.va_mask = AT_SIZE; 3115 if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred) != 0) { 3116 VM_STAT_ADD(segvnvmstats.fill_vp_pages[13]); 3117 goto out; 3118 } 3119 if (btopr(va.va_size) < btopr(io_off + io_len)) { 3120 VM_STAT_ADD(segvnvmstats.fill_vp_pages[14]); 3121 *downsize = 1; 3122 *ret_pszc = 0; 3123 goto out; 3124 } 3125 io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len, 3126 B_READ, svd->cred); 3127 if (io_err) { 3128 VM_STAT_ADD(segvnvmstats.fill_vp_pages[15]); 3129 if (io_err == EDEADLK) { 3130 segvn_vmpss_pageio_deadlk_err++; 3131 } 3132 goto out; 3133 } 3134 nios++; 3135 while (io_pplist != NULL) { 3136 pp = io_pplist; 3137 page_sub(&io_pplist, pp); 3138 ASSERT(page_iolock_assert(pp)); 3139 page_io_unlock(pp); 3140 pgidx = (pp->p_offset - start_off) >> PAGESHIFT; 3141 ASSERT(pgidx < pages); 3142 ppa[pgidx] = pp; 3143 } 3144 } 3145 /* 3146 * we're now bound to succeed or panic. 3147 * remove pages from done_pplist. it's not needed anymore. 3148 */ 3149 while (done_pplist != NULL) { 3150 pp = done_pplist; 3151 page_sub(&done_pplist, pp); 3152 } 3153 VM_STAT_ADD(segvnvmstats.fill_vp_pages[16]); 3154 ASSERT(pplist == NULL); 3155 *ppplist = NULL; 3156 while (targ_pplist != NULL) { 3157 int ret; 3158 VM_STAT_ADD(segvnvmstats.fill_vp_pages[17]); 3159 ASSERT(repl_pplist); 3160 pp = targ_pplist; 3161 page_sub(&targ_pplist, pp); 3162 pgidx = (pp->p_offset - start_off) >> PAGESHIFT; 3163 newpp = repl_pplist; 3164 page_sub(&repl_pplist, newpp); 3165 #ifdef DEBUG 3166 pfn = page_pptonum(pp); 3167 pszc = pp->p_szc; 3168 ppages = page_get_pagecnt(pszc); 3169 ASSERT(IS_P2ALIGNED(pfn, ppages)); 3170 pfn = page_pptonum(newpp); 3171 ASSERT(IS_P2ALIGNED(pfn, ppages)); 3172 ASSERT(P2PHASE(pfn, pages) == pgidx); 3173 #endif 3174 nreloc = 0; 3175 ret = page_relocate(&pp, &newpp, 0, 1, &nreloc, NULL); 3176 if (ret != 0 || nreloc == 0) { 3177 panic("segvn_fill_vp_pages: " 3178 "page_relocate failed"); 3179 } 3180 pp = newpp; 3181 while (nreloc-- != 0) { 3182 ASSERT(PAGE_EXCL(pp)); 3183 ASSERT(pp->p_vnode == vp); 3184 ASSERT(pgidx == 3185 ((pp->p_offset - start_off) >> PAGESHIFT)); 3186 ppa[pgidx++] = pp; 3187 pp++; 3188 } 3189 } 3190 3191 if (svd->type == MAP_PRIVATE) { 3192 VM_STAT_ADD(segvnvmstats.fill_vp_pages[18]); 3193 for (i = 0; i < pages; i++) { 3194 ASSERT(ppa[i] != NULL); 3195 ASSERT(PAGE_EXCL(ppa[i])); 3196 ASSERT(ppa[i]->p_vnode == vp); 3197 ASSERT(ppa[i]->p_offset == 3198 start_off + (i << PAGESHIFT)); 3199 page_downgrade(ppa[i]); 3200 } 3201 ppa[pages] = NULL; 3202 } else { 3203 VM_STAT_ADD(segvnvmstats.fill_vp_pages[19]); 3204 /* 3205 * the caller will still call VOP_GETPAGE() for shared segments 3206 * to check FS write permissions. For private segments we map 3207 * file read only anyway. so no VOP_GETPAGE is needed. 3208 */ 3209 for (i = 0; i < pages; i++) { 3210 ASSERT(ppa[i] != NULL); 3211 ASSERT(PAGE_EXCL(ppa[i])); 3212 ASSERT(ppa[i]->p_vnode == vp); 3213 ASSERT(ppa[i]->p_offset == 3214 start_off + (i << PAGESHIFT)); 3215 page_unlock(ppa[i]); 3216 } 3217 ppa[0] = NULL; 3218 } 3219 3220 return (1); 3221 out: 3222 /* 3223 * Do the cleanup. Unlock target pages we didn't relocate. They are 3224 * linked on targ_pplist by root pages. reassemble unused replacement 3225 * and io pages back to pplist. 3226 */ 3227 if (io_pplist != NULL) { 3228 VM_STAT_ADD(segvnvmstats.fill_vp_pages[20]); 3229 pp = io_pplist; 3230 do { 3231 ASSERT(pp->p_vnode == vp); 3232 ASSERT(pp->p_offset == io_off); 3233 ASSERT(page_iolock_assert(pp)); 3234 page_io_unlock(pp); 3235 page_hashout(pp, NULL); 3236 io_off += PAGESIZE; 3237 } while ((pp = pp->p_next) != io_pplist); 3238 page_list_concat(&io_pplist, &pplist); 3239 pplist = io_pplist; 3240 } 3241 tmp_pplist = NULL; 3242 while (targ_pplist != NULL) { 3243 VM_STAT_ADD(segvnvmstats.fill_vp_pages[21]); 3244 pp = targ_pplist; 3245 ASSERT(PAGE_EXCL(pp)); 3246 page_sub(&targ_pplist, pp); 3247 3248 pszc = pp->p_szc; 3249 ppages = page_get_pagecnt(pszc); 3250 ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages)); 3251 3252 if (pszc != 0) { 3253 group_page_unlock(pp); 3254 } 3255 page_unlock(pp); 3256 3257 pp = repl_pplist; 3258 ASSERT(pp != NULL); 3259 ASSERT(PAGE_EXCL(pp)); 3260 ASSERT(pp->p_szc == szc); 3261 page_sub(&repl_pplist, pp); 3262 3263 ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages)); 3264 3265 /* relink replacement page */ 3266 page_list_concat(&tmp_pplist, &pp); 3267 while (--ppages != 0) { 3268 VM_STAT_ADD(segvnvmstats.fill_vp_pages[22]); 3269 pp++; 3270 ASSERT(PAGE_EXCL(pp)); 3271 ASSERT(pp->p_szc == szc); 3272 page_list_concat(&tmp_pplist, &pp); 3273 } 3274 } 3275 if (tmp_pplist != NULL) { 3276 VM_STAT_ADD(segvnvmstats.fill_vp_pages[23]); 3277 page_list_concat(&tmp_pplist, &pplist); 3278 pplist = tmp_pplist; 3279 } 3280 /* 3281 * at this point all pages are either on done_pplist or 3282 * pplist. They can't be all on done_pplist otherwise 3283 * we'd've been done. 3284 */ 3285 ASSERT(pplist != NULL); 3286 if (nios != 0) { 3287 VM_STAT_ADD(segvnvmstats.fill_vp_pages[24]); 3288 pp = pplist; 3289 do { 3290 VM_STAT_ADD(segvnvmstats.fill_vp_pages[25]); 3291 ASSERT(pp->p_szc == szc); 3292 ASSERT(PAGE_EXCL(pp)); 3293 ASSERT(pp->p_vnode != vp); 3294 pp->p_szc = 0; 3295 } while ((pp = pp->p_next) != pplist); 3296 3297 pp = done_pplist; 3298 do { 3299 VM_STAT_ADD(segvnvmstats.fill_vp_pages[26]); 3300 ASSERT(pp->p_szc == szc); 3301 ASSERT(PAGE_EXCL(pp)); 3302 ASSERT(pp->p_vnode == vp); 3303 pp->p_szc = 0; 3304 } while ((pp = pp->p_next) != done_pplist); 3305 3306 while (pplist != NULL) { 3307 VM_STAT_ADD(segvnvmstats.fill_vp_pages[27]); 3308 pp = pplist; 3309 page_sub(&pplist, pp); 3310 page_free(pp, 0); 3311 } 3312 3313 while (done_pplist != NULL) { 3314 VM_STAT_ADD(segvnvmstats.fill_vp_pages[28]); 3315 pp = done_pplist; 3316 page_sub(&done_pplist, pp); 3317 page_unlock(pp); 3318 } 3319 *ppplist = NULL; 3320 return (0); 3321 } 3322 ASSERT(pplist == *ppplist); 3323 if (io_err) { 3324 VM_STAT_ADD(segvnvmstats.fill_vp_pages[29]); 3325 /* 3326 * don't downsize on io error. 3327 * see if vop_getpage succeeds. 3328 * pplist may still be used in this case 3329 * for relocations. 3330 */ 3331 return (0); 3332 } 3333 VM_STAT_ADD(segvnvmstats.fill_vp_pages[30]); 3334 page_free_replacement_page(pplist); 3335 page_create_putback(pages); 3336 *ppplist = NULL; 3337 return (0); 3338 } 3339 3340 int segvn_anypgsz = 0; 3341 3342 #define SEGVN_RESTORE_SOFTLOCK(type, pages) \ 3343 if ((type) == F_SOFTLOCK) { \ 3344 mutex_enter(&freemem_lock); \ 3345 availrmem += (pages); \ 3346 segvn_pages_locked -= (pages); \ 3347 svd->softlockcnt -= (pages); \ 3348 mutex_exit(&freemem_lock); \ 3349 } 3350 3351 #define SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot) \ 3352 if (IS_VMODSORT((ppa)[0]->p_vnode)) { \ 3353 if ((rw) == S_WRITE) { \ 3354 for (i = 0; i < (pages); i++) { \ 3355 ASSERT((ppa)[i]->p_vnode == \ 3356 (ppa)[0]->p_vnode); \ 3357 hat_setmod((ppa)[i]); \ 3358 } \ 3359 } else if ((rw) != S_OTHER && \ 3360 ((prot) & (vpprot) & PROT_WRITE)) { \ 3361 for (i = 0; i < (pages); i++) { \ 3362 ASSERT((ppa)[i]->p_vnode == \ 3363 (ppa)[0]->p_vnode); \ 3364 if (!hat_ismod((ppa)[i])) { \ 3365 prot &= ~PROT_WRITE; \ 3366 break; \ 3367 } \ 3368 } \ 3369 } \ 3370 } 3371 3372 #ifdef VM_STATS 3373 3374 #define SEGVN_VMSTAT_FLTVNPAGES(idx) \ 3375 VM_STAT_ADD(segvnvmstats.fltvnpages[(idx)]); 3376 3377 #else /* VM_STATS */ 3378 3379 #define SEGVN_VMSTAT_FLTVNPAGES(idx) 3380 3381 #endif 3382 3383 static faultcode_t 3384 segvn_fault_vnodepages(struct hat *hat, struct seg *seg, caddr_t lpgaddr, 3385 caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr, 3386 caddr_t eaddr, int brkcow) 3387 { 3388 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 3389 struct anon_map *amp = svd->amp; 3390 uchar_t segtype = svd->type; 3391 uint_t szc = seg->s_szc; 3392 size_t pgsz = page_get_pagesize(szc); 3393 size_t maxpgsz = pgsz; 3394 pgcnt_t pages = btop(pgsz); 3395 pgcnt_t maxpages = pages; 3396 size_t ppasize = (pages + 1) * sizeof (page_t *); 3397 caddr_t a = lpgaddr; 3398 caddr_t maxlpgeaddr = lpgeaddr; 3399 u_offset_t off = svd->offset + (uintptr_t)(a - seg->s_base); 3400 ulong_t aindx = svd->anon_index + seg_page(seg, a); 3401 struct vpage *vpage = (svd->vpage != NULL) ? 3402 &svd->vpage[seg_page(seg, a)] : NULL; 3403 vnode_t *vp = svd->vp; 3404 page_t **ppa; 3405 uint_t pszc; 3406 size_t ppgsz; 3407 pgcnt_t ppages; 3408 faultcode_t err = 0; 3409 int ierr; 3410 int vop_size_err = 0; 3411 uint_t protchk, prot, vpprot; 3412 ulong_t i; 3413 int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD; 3414 anon_sync_obj_t an_cookie; 3415 enum seg_rw arw; 3416 int alloc_failed = 0; 3417 int adjszc_chk; 3418 struct vattr va; 3419 int xhat = 0; 3420 page_t *pplist; 3421 pfn_t pfn; 3422 int physcontig; 3423 int upgrdfail; 3424 int segvn_anypgsz_vnode = 0; /* for now map vnode with 2 page sizes */ 3425 3426 ASSERT(szc != 0); 3427 ASSERT(vp != NULL); 3428 ASSERT(brkcow == 0 || amp != NULL); 3429 ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */ 3430 ASSERT(!(svd->flags & MAP_NORESERVE)); 3431 ASSERT(type != F_SOFTUNLOCK); 3432 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 3433 ASSERT(amp == NULL || IS_P2ALIGNED(aindx, maxpages)); 3434 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 3435 ASSERT(seg->s_szc < NBBY * sizeof (int)); 3436 ASSERT(type != F_SOFTLOCK || lpgeaddr - a == maxpgsz); 3437 3438 VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltvnpages[0]); 3439 VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltvnpages[1]); 3440 3441 if (svd->flags & MAP_TEXT) { 3442 hat_flag |= HAT_LOAD_TEXT; 3443 } 3444 3445 if (svd->pageprot) { 3446 switch (rw) { 3447 case S_READ: 3448 protchk = PROT_READ; 3449 break; 3450 case S_WRITE: 3451 protchk = PROT_WRITE; 3452 break; 3453 case S_EXEC: 3454 protchk = PROT_EXEC; 3455 break; 3456 case S_OTHER: 3457 default: 3458 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 3459 break; 3460 } 3461 } else { 3462 prot = svd->prot; 3463 /* caller has already done segment level protection check. */ 3464 } 3465 3466 if (seg->s_as->a_hat != hat) { 3467 xhat = 1; 3468 } 3469 3470 if (rw == S_WRITE && segtype == MAP_PRIVATE) { 3471 SEGVN_VMSTAT_FLTVNPAGES(2); 3472 arw = S_READ; 3473 } else { 3474 arw = rw; 3475 } 3476 3477 ppa = kmem_alloc(ppasize, KM_SLEEP); 3478 3479 VM_STAT_COND_ADD(amp != NULL, segvnvmstats.fltvnpages[3]); 3480 3481 for (;;) { 3482 adjszc_chk = 0; 3483 for (; a < lpgeaddr; a += pgsz, off += pgsz, aindx += pages) { 3484 if (adjszc_chk) { 3485 while (szc < seg->s_szc) { 3486 uintptr_t e; 3487 uint_t tszc; 3488 tszc = segvn_anypgsz_vnode ? szc + 1 : 3489 seg->s_szc; 3490 ppgsz = page_get_pagesize(tszc); 3491 if (!IS_P2ALIGNED(a, ppgsz) || 3492 ((alloc_failed >> tszc) & 3493 0x1)) { 3494 break; 3495 } 3496 SEGVN_VMSTAT_FLTVNPAGES(4); 3497 szc = tszc; 3498 pgsz = ppgsz; 3499 pages = btop(pgsz); 3500 e = P2ROUNDUP((uintptr_t)eaddr, pgsz); 3501 lpgeaddr = (caddr_t)e; 3502 } 3503 } 3504 3505 again: 3506 if (IS_P2ALIGNED(a, maxpgsz) && amp != NULL) { 3507 ASSERT(IS_P2ALIGNED(aindx, maxpages)); 3508 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 3509 anon_array_enter(amp, aindx, &an_cookie); 3510 if (anon_get_ptr(amp->ahp, aindx) != NULL) { 3511 SEGVN_VMSTAT_FLTVNPAGES(5); 3512 if (anon_pages(amp->ahp, aindx, 3513 maxpages) != maxpages) { 3514 panic("segvn_fault_vnodepages:" 3515 " empty anon slots\n"); 3516 } 3517 anon_array_exit(&an_cookie); 3518 ANON_LOCK_EXIT(&->a_rwlock); 3519 err = segvn_fault_anonpages(hat, seg, 3520 a, a + maxpgsz, type, rw, 3521 MAX(a, addr), 3522 MIN(a + maxpgsz, eaddr), brkcow); 3523 if (err != 0) { 3524 SEGVN_VMSTAT_FLTVNPAGES(6); 3525 goto out; 3526 } 3527 if (szc < seg->s_szc) { 3528 szc = seg->s_szc; 3529 pgsz = maxpgsz; 3530 pages = maxpages; 3531 lpgeaddr = maxlpgeaddr; 3532 } 3533 goto next; 3534 } else if (anon_pages(amp->ahp, aindx, 3535 maxpages)) { 3536 panic("segvn_fault_vnodepages:" 3537 " non empty anon slots\n"); 3538 } else { 3539 SEGVN_VMSTAT_FLTVNPAGES(7); 3540 anon_array_exit(&an_cookie); 3541 ANON_LOCK_EXIT(&->a_rwlock); 3542 } 3543 } 3544 ASSERT(!brkcow || IS_P2ALIGNED(a, maxpgsz)); 3545 3546 if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) { 3547 ASSERT(vpage != NULL); 3548 prot = VPP_PROT(vpage); 3549 ASSERT(sameprot(seg, a, maxpgsz)); 3550 if ((prot & protchk) == 0) { 3551 SEGVN_VMSTAT_FLTVNPAGES(8); 3552 err = FC_PROT; 3553 goto out; 3554 } 3555 } 3556 if (type == F_SOFTLOCK) { 3557 mutex_enter(&freemem_lock); 3558 if (availrmem < tune.t_minarmem + pages) { 3559 mutex_exit(&freemem_lock); 3560 err = FC_MAKE_ERR(ENOMEM); 3561 goto out; 3562 } else { 3563 availrmem -= pages; 3564 segvn_pages_locked += pages; 3565 svd->softlockcnt += pages; 3566 } 3567 mutex_exit(&freemem_lock); 3568 } 3569 3570 pplist = NULL; 3571 physcontig = 0; 3572 ppa[0] = NULL; 3573 if (!brkcow && szc && 3574 !page_exists_physcontig(vp, off, szc, 3575 segtype == MAP_PRIVATE ? ppa : NULL)) { 3576 SEGVN_VMSTAT_FLTVNPAGES(9); 3577 if (page_alloc_pages(vp, seg, a, &pplist, NULL, 3578 szc, 0) && type != F_SOFTLOCK) { 3579 SEGVN_VMSTAT_FLTVNPAGES(10); 3580 pszc = 0; 3581 ierr = -1; 3582 alloc_failed |= (1 << szc); 3583 break; 3584 } 3585 if (pplist != NULL && 3586 vp->v_mpssdata == SEGVN_PAGEIO) { 3587 int downsize; 3588 SEGVN_VMSTAT_FLTVNPAGES(11); 3589 physcontig = segvn_fill_vp_pages(svd, 3590 vp, off, szc, ppa, &pplist, 3591 &pszc, &downsize); 3592 ASSERT(!physcontig || pplist == NULL); 3593 if (!physcontig && downsize && 3594 type != F_SOFTLOCK) { 3595 ASSERT(pplist == NULL); 3596 SEGVN_VMSTAT_FLTVNPAGES(12); 3597 ierr = -1; 3598 break; 3599 } 3600 ASSERT(!physcontig || 3601 segtype == MAP_PRIVATE || 3602 ppa[0] == NULL); 3603 if (physcontig && ppa[0] == NULL) { 3604 physcontig = 0; 3605 } 3606 } 3607 } else if (!brkcow && szc && ppa[0] != NULL) { 3608 SEGVN_VMSTAT_FLTVNPAGES(13); 3609 ASSERT(segtype == MAP_PRIVATE); 3610 physcontig = 1; 3611 } 3612 3613 if (!physcontig) { 3614 SEGVN_VMSTAT_FLTVNPAGES(14); 3615 ppa[0] = NULL; 3616 ierr = VOP_GETPAGE(vp, (offset_t)off, pgsz, 3617 &vpprot, ppa, pgsz, seg, a, arw, 3618 svd->cred); 3619 if (segtype == MAP_PRIVATE) { 3620 SEGVN_VMSTAT_FLTVNPAGES(15); 3621 vpprot &= ~PROT_WRITE; 3622 } 3623 } else { 3624 ASSERT(segtype == MAP_PRIVATE); 3625 SEGVN_VMSTAT_FLTVNPAGES(16); 3626 vpprot = PROT_ALL & ~PROT_WRITE; 3627 ierr = 0; 3628 } 3629 3630 if (ierr != 0) { 3631 SEGVN_VMSTAT_FLTVNPAGES(17); 3632 if (pplist != NULL) { 3633 SEGVN_VMSTAT_FLTVNPAGES(18); 3634 page_free_replacement_page(pplist); 3635 page_create_putback(pages); 3636 } 3637 SEGVN_RESTORE_SOFTLOCK(type, pages); 3638 if (a + pgsz <= eaddr) { 3639 SEGVN_VMSTAT_FLTVNPAGES(19); 3640 err = FC_MAKE_ERR(ierr); 3641 goto out; 3642 } 3643 va.va_mask = AT_SIZE; 3644 if (VOP_GETATTR(vp, &va, 0, svd->cred) != 0) { 3645 SEGVN_VMSTAT_FLTVNPAGES(20); 3646 err = FC_MAKE_ERR(EIO); 3647 goto out; 3648 } 3649 if (btopr(va.va_size) >= btopr(off + pgsz)) { 3650 SEGVN_VMSTAT_FLTVNPAGES(21); 3651 err = FC_MAKE_ERR(ierr); 3652 goto out; 3653 } 3654 if (btopr(va.va_size) < 3655 btopr(off + (eaddr - a))) { 3656 SEGVN_VMSTAT_FLTVNPAGES(22); 3657 err = FC_MAKE_ERR(ierr); 3658 goto out; 3659 } 3660 if (brkcow || type == F_SOFTLOCK) { 3661 /* can't reduce map area */ 3662 SEGVN_VMSTAT_FLTVNPAGES(23); 3663 vop_size_err = 1; 3664 goto out; 3665 } 3666 SEGVN_VMSTAT_FLTVNPAGES(24); 3667 ASSERT(szc != 0); 3668 pszc = 0; 3669 ierr = -1; 3670 break; 3671 } 3672 3673 if (amp != NULL) { 3674 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 3675 anon_array_enter(amp, aindx, &an_cookie); 3676 } 3677 if (amp != NULL && 3678 anon_get_ptr(amp->ahp, aindx) != NULL) { 3679 ulong_t taindx = P2ALIGN(aindx, maxpages); 3680 3681 SEGVN_VMSTAT_FLTVNPAGES(25); 3682 if (anon_pages(amp->ahp, taindx, maxpages) != 3683 maxpages) { 3684 panic("segvn_fault_vnodepages:" 3685 " empty anon slots\n"); 3686 } 3687 for (i = 0; i < pages; i++) { 3688 page_unlock(ppa[i]); 3689 } 3690 anon_array_exit(&an_cookie); 3691 ANON_LOCK_EXIT(&->a_rwlock); 3692 if (pplist != NULL) { 3693 page_free_replacement_page(pplist); 3694 page_create_putback(pages); 3695 } 3696 SEGVN_RESTORE_SOFTLOCK(type, pages); 3697 if (szc < seg->s_szc) { 3698 SEGVN_VMSTAT_FLTVNPAGES(26); 3699 /* 3700 * For private segments SOFTLOCK 3701 * either always breaks cow (any rw 3702 * type except S_READ_NOCOW) or 3703 * address space is locked as writer 3704 * (S_READ_NOCOW case) and anon slots 3705 * can't show up on second check. 3706 * Therefore if we are here for 3707 * SOFTLOCK case it must be a cow 3708 * break but cow break never reduces 3709 * szc. Thus the assert below. 3710 */ 3711 ASSERT(!brkcow && type != F_SOFTLOCK); 3712 pszc = seg->s_szc; 3713 ierr = -2; 3714 break; 3715 } 3716 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 3717 goto again; 3718 } 3719 #ifdef DEBUG 3720 if (amp != NULL) { 3721 ulong_t taindx = P2ALIGN(aindx, maxpages); 3722 ASSERT(!anon_pages(amp->ahp, taindx, maxpages)); 3723 } 3724 #endif /* DEBUG */ 3725 3726 if (brkcow) { 3727 ASSERT(amp != NULL); 3728 ASSERT(pplist == NULL); 3729 ASSERT(szc == seg->s_szc); 3730 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 3731 ASSERT(IS_P2ALIGNED(aindx, maxpages)); 3732 SEGVN_VMSTAT_FLTVNPAGES(27); 3733 ierr = anon_map_privatepages(amp, aindx, szc, 3734 seg, a, prot, ppa, vpage, segvn_anypgsz, 3735 svd->cred); 3736 if (ierr != 0) { 3737 SEGVN_VMSTAT_FLTVNPAGES(28); 3738 anon_array_exit(&an_cookie); 3739 ANON_LOCK_EXIT(&->a_rwlock); 3740 SEGVN_RESTORE_SOFTLOCK(type, pages); 3741 err = FC_MAKE_ERR(ierr); 3742 goto out; 3743 } 3744 3745 ASSERT(!IS_VMODSORT(ppa[0]->p_vnode)); 3746 /* 3747 * p_szc can't be changed for locked 3748 * swapfs pages. 3749 */ 3750 hat_memload_array(hat, a, pgsz, ppa, prot, 3751 hat_flag); 3752 3753 if (!(hat_flag & HAT_LOAD_LOCK)) { 3754 SEGVN_VMSTAT_FLTVNPAGES(29); 3755 for (i = 0; i < pages; i++) { 3756 page_unlock(ppa[i]); 3757 } 3758 } 3759 anon_array_exit(&an_cookie); 3760 ANON_LOCK_EXIT(&->a_rwlock); 3761 goto next; 3762 } 3763 3764 pfn = page_pptonum(ppa[0]); 3765 /* 3766 * hat_page_demote() needs an EXCl lock on one of 3767 * constituent page_t's and it decreases root's p_szc 3768 * last. This means if root's p_szc is equal szc and 3769 * all its constituent pages are locked 3770 * hat_page_demote() that could have changed p_szc to 3771 * szc is already done and no new have page_demote() 3772 * can start for this large page. 3773 */ 3774 3775 /* 3776 * we need to make sure same mapping size is used for 3777 * the same address range if there's a possibility the 3778 * adddress is already mapped because hat layer panics 3779 * when translation is loaded for the range already 3780 * mapped with a different page size. We achieve it 3781 * by always using largest page size possible subject 3782 * to the constraints of page size, segment page size 3783 * and page alignment. Since mappings are invalidated 3784 * when those constraints change and make it 3785 * impossible to use previously used mapping size no 3786 * mapping size conflicts should happen. 3787 */ 3788 3789 chkszc: 3790 if ((pszc = ppa[0]->p_szc) == szc && 3791 IS_P2ALIGNED(pfn, pages)) { 3792 3793 SEGVN_VMSTAT_FLTVNPAGES(30); 3794 #ifdef DEBUG 3795 for (i = 0; i < pages; i++) { 3796 ASSERT(PAGE_LOCKED(ppa[i])); 3797 ASSERT(!PP_ISFREE(ppa[i])); 3798 ASSERT(page_pptonum(ppa[i]) == 3799 pfn + i); 3800 ASSERT(ppa[i]->p_szc == szc); 3801 ASSERT(ppa[i]->p_vnode == vp); 3802 ASSERT(ppa[i]->p_offset == 3803 off + (i << PAGESHIFT)); 3804 } 3805 #endif /* DEBUG */ 3806 /* 3807 * All pages are of szc we need and they are 3808 * all locked so they can't change szc. load 3809 * translations. 3810 * 3811 * if page got promoted since last check 3812 * we don't need pplist. 3813 */ 3814 if (pplist != NULL) { 3815 page_free_replacement_page(pplist); 3816 page_create_putback(pages); 3817 } 3818 if (PP_ISMIGRATE(ppa[0])) { 3819 page_migrate(seg, a, ppa, pages); 3820 } 3821 SEGVN_UPDATE_MODBITS(ppa, pages, rw, 3822 prot, vpprot); 3823 if (!xhat) { 3824 hat_memload_array(hat, a, pgsz, ppa, 3825 prot & vpprot, hat_flag); 3826 } else { 3827 /* 3828 * avoid large xhat mappings to FS 3829 * pages so that hat_page_demote() 3830 * doesn't need to check for xhat 3831 * large mappings. 3832 */ 3833 for (i = 0; i < pages; i++) { 3834 hat_memload(hat, 3835 a + (i << PAGESHIFT), 3836 ppa[i], prot & vpprot, 3837 hat_flag); 3838 } 3839 } 3840 3841 if (!(hat_flag & HAT_LOAD_LOCK)) { 3842 for (i = 0; i < pages; i++) { 3843 page_unlock(ppa[i]); 3844 } 3845 } 3846 if (amp != NULL) { 3847 anon_array_exit(&an_cookie); 3848 ANON_LOCK_EXIT(&->a_rwlock); 3849 } 3850 goto next; 3851 } 3852 3853 /* 3854 * See if upsize is possible. 3855 */ 3856 if (pszc > szc && szc < seg->s_szc && 3857 (segvn_anypgsz_vnode || pszc >= seg->s_szc)) { 3858 pgcnt_t aphase; 3859 uint_t pszc1 = MIN(pszc, seg->s_szc); 3860 ppgsz = page_get_pagesize(pszc1); 3861 ppages = btop(ppgsz); 3862 aphase = btop(P2PHASE((uintptr_t)a, ppgsz)); 3863 3864 ASSERT(type != F_SOFTLOCK); 3865 3866 SEGVN_VMSTAT_FLTVNPAGES(31); 3867 if (aphase != P2PHASE(pfn, ppages)) { 3868 segvn_faultvnmpss_align_err4++; 3869 } else { 3870 SEGVN_VMSTAT_FLTVNPAGES(32); 3871 if (pplist != NULL) { 3872 page_t *pl = pplist; 3873 page_free_replacement_page(pl); 3874 page_create_putback(pages); 3875 } 3876 for (i = 0; i < pages; i++) { 3877 page_unlock(ppa[i]); 3878 } 3879 if (amp != NULL) { 3880 anon_array_exit(&an_cookie); 3881 ANON_LOCK_EXIT(&->a_rwlock); 3882 } 3883 pszc = pszc1; 3884 ierr = -2; 3885 break; 3886 } 3887 } 3888 3889 /* 3890 * check if we should use smallest mapping size. 3891 */ 3892 upgrdfail = 0; 3893 if (szc == 0 || xhat || 3894 (pszc >= szc && 3895 !IS_P2ALIGNED(pfn, pages)) || 3896 (pszc < szc && 3897 !segvn_full_szcpages(ppa, szc, &upgrdfail, 3898 &pszc))) { 3899 3900 if (upgrdfail && type != F_SOFTLOCK) { 3901 /* 3902 * segvn_full_szcpages failed to lock 3903 * all pages EXCL. Size down. 3904 */ 3905 ASSERT(pszc < szc); 3906 3907 SEGVN_VMSTAT_FLTVNPAGES(33); 3908 3909 if (pplist != NULL) { 3910 page_t *pl = pplist; 3911 page_free_replacement_page(pl); 3912 page_create_putback(pages); 3913 } 3914 3915 for (i = 0; i < pages; i++) { 3916 page_unlock(ppa[i]); 3917 } 3918 if (amp != NULL) { 3919 anon_array_exit(&an_cookie); 3920 ANON_LOCK_EXIT(&->a_rwlock); 3921 } 3922 ierr = -1; 3923 break; 3924 } 3925 if (szc != 0 && !xhat) { 3926 segvn_faultvnmpss_align_err5++; 3927 } 3928 SEGVN_VMSTAT_FLTVNPAGES(34); 3929 if (pplist != NULL) { 3930 page_free_replacement_page(pplist); 3931 page_create_putback(pages); 3932 } 3933 SEGVN_UPDATE_MODBITS(ppa, pages, rw, 3934 prot, vpprot); 3935 if (upgrdfail && segvn_anypgsz_vnode) { 3936 /* SOFTLOCK case */ 3937 hat_memload_array(hat, a, pgsz, 3938 ppa, prot & vpprot, hat_flag); 3939 } else { 3940 for (i = 0; i < pages; i++) { 3941 hat_memload(hat, 3942 a + (i << PAGESHIFT), 3943 ppa[i], prot & vpprot, 3944 hat_flag); 3945 } 3946 } 3947 if (!(hat_flag & HAT_LOAD_LOCK)) { 3948 for (i = 0; i < pages; i++) { 3949 page_unlock(ppa[i]); 3950 } 3951 } 3952 if (amp != NULL) { 3953 anon_array_exit(&an_cookie); 3954 ANON_LOCK_EXIT(&->a_rwlock); 3955 } 3956 goto next; 3957 } 3958 3959 if (pszc == szc) { 3960 /* 3961 * segvn_full_szcpages() upgraded pages szc. 3962 */ 3963 ASSERT(pszc == ppa[0]->p_szc); 3964 ASSERT(IS_P2ALIGNED(pfn, pages)); 3965 goto chkszc; 3966 } 3967 3968 if (pszc > szc) { 3969 kmutex_t *szcmtx; 3970 SEGVN_VMSTAT_FLTVNPAGES(35); 3971 /* 3972 * p_szc of ppa[0] can change since we haven't 3973 * locked all constituent pages. Call 3974 * page_lock_szc() to prevent szc changes. 3975 * This should be a rare case that happens when 3976 * multiple segments use a different page size 3977 * to map the same file offsets. 3978 */ 3979 szcmtx = page_szc_lock(ppa[0]); 3980 pszc = ppa[0]->p_szc; 3981 ASSERT(szcmtx != NULL || pszc == 0); 3982 ASSERT(ppa[0]->p_szc <= pszc); 3983 if (pszc <= szc) { 3984 SEGVN_VMSTAT_FLTVNPAGES(36); 3985 if (szcmtx != NULL) { 3986 mutex_exit(szcmtx); 3987 } 3988 goto chkszc; 3989 } 3990 if (pplist != NULL) { 3991 /* 3992 * page got promoted since last check. 3993 * we don't need preaalocated large 3994 * page. 3995 */ 3996 SEGVN_VMSTAT_FLTVNPAGES(37); 3997 page_free_replacement_page(pplist); 3998 page_create_putback(pages); 3999 } 4000 SEGVN_UPDATE_MODBITS(ppa, pages, rw, 4001 prot, vpprot); 4002 hat_memload_array(hat, a, pgsz, ppa, 4003 prot & vpprot, hat_flag); 4004 mutex_exit(szcmtx); 4005 if (!(hat_flag & HAT_LOAD_LOCK)) { 4006 for (i = 0; i < pages; i++) { 4007 page_unlock(ppa[i]); 4008 } 4009 } 4010 if (amp != NULL) { 4011 anon_array_exit(&an_cookie); 4012 ANON_LOCK_EXIT(&->a_rwlock); 4013 } 4014 goto next; 4015 } 4016 4017 /* 4018 * if page got demoted since last check 4019 * we could have not allocated larger page. 4020 * allocate now. 4021 */ 4022 if (pplist == NULL && 4023 page_alloc_pages(vp, seg, a, &pplist, NULL, 4024 szc, 0) && type != F_SOFTLOCK) { 4025 SEGVN_VMSTAT_FLTVNPAGES(38); 4026 for (i = 0; i < pages; i++) { 4027 page_unlock(ppa[i]); 4028 } 4029 if (amp != NULL) { 4030 anon_array_exit(&an_cookie); 4031 ANON_LOCK_EXIT(&->a_rwlock); 4032 } 4033 ierr = -1; 4034 alloc_failed |= (1 << szc); 4035 break; 4036 } 4037 4038 SEGVN_VMSTAT_FLTVNPAGES(39); 4039 4040 if (pplist != NULL) { 4041 segvn_relocate_pages(ppa, pplist); 4042 #ifdef DEBUG 4043 } else { 4044 ASSERT(type == F_SOFTLOCK); 4045 SEGVN_VMSTAT_FLTVNPAGES(40); 4046 #endif /* DEBUG */ 4047 } 4048 4049 SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot); 4050 4051 if (pplist == NULL && segvn_anypgsz_vnode == 0) { 4052 ASSERT(type == F_SOFTLOCK); 4053 for (i = 0; i < pages; i++) { 4054 ASSERT(ppa[i]->p_szc < szc); 4055 hat_memload(hat, a + (i << PAGESHIFT), 4056 ppa[i], prot & vpprot, hat_flag); 4057 } 4058 } else { 4059 ASSERT(pplist != NULL || type == F_SOFTLOCK); 4060 hat_memload_array(hat, a, pgsz, ppa, 4061 prot & vpprot, hat_flag); 4062 } 4063 if (!(hat_flag & HAT_LOAD_LOCK)) { 4064 for (i = 0; i < pages; i++) { 4065 ASSERT(PAGE_SHARED(ppa[i])); 4066 page_unlock(ppa[i]); 4067 } 4068 } 4069 if (amp != NULL) { 4070 anon_array_exit(&an_cookie); 4071 ANON_LOCK_EXIT(&->a_rwlock); 4072 } 4073 4074 next: 4075 if (vpage != NULL) { 4076 vpage += pages; 4077 } 4078 adjszc_chk = 1; 4079 } 4080 if (a == lpgeaddr) 4081 break; 4082 ASSERT(a < lpgeaddr); 4083 4084 ASSERT(!brkcow && type != F_SOFTLOCK); 4085 4086 /* 4087 * ierr == -1 means we failed to map with a large page. 4088 * (either due to allocation/relocation failures or 4089 * misalignment with other mappings to this file. 4090 * 4091 * ierr == -2 means some other thread allocated a large page 4092 * after we gave up tp map with a large page. retry with 4093 * larger mapping. 4094 */ 4095 ASSERT(ierr == -1 || ierr == -2); 4096 ASSERT(ierr == -2 || szc != 0); 4097 ASSERT(ierr == -1 || szc < seg->s_szc); 4098 if (ierr == -2) { 4099 SEGVN_VMSTAT_FLTVNPAGES(41); 4100 ASSERT(pszc > szc && pszc <= seg->s_szc); 4101 szc = pszc; 4102 } else if (segvn_anypgsz_vnode) { 4103 SEGVN_VMSTAT_FLTVNPAGES(42); 4104 szc--; 4105 } else { 4106 SEGVN_VMSTAT_FLTVNPAGES(43); 4107 ASSERT(pszc < szc); 4108 /* 4109 * other process created pszc large page. 4110 * but we still have to drop to 0 szc. 4111 */ 4112 szc = 0; 4113 } 4114 4115 pgsz = page_get_pagesize(szc); 4116 pages = btop(pgsz); 4117 if (ierr == -2) { 4118 /* 4119 * Size up case. Note lpgaddr may only be needed for 4120 * softlock case so we don't adjust it here. 4121 */ 4122 a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz); 4123 ASSERT(a >= lpgaddr); 4124 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4125 off = svd->offset + (uintptr_t)(a - seg->s_base); 4126 aindx = svd->anon_index + seg_page(seg, a); 4127 vpage = (svd->vpage != NULL) ? 4128 &svd->vpage[seg_page(seg, a)] : NULL; 4129 } else { 4130 /* 4131 * Size down case. Note lpgaddr may only be needed for 4132 * softlock case so we don't adjust it here. 4133 */ 4134 ASSERT(IS_P2ALIGNED(a, pgsz)); 4135 ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz)); 4136 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4137 ASSERT(a < lpgeaddr); 4138 if (a < addr) { 4139 SEGVN_VMSTAT_FLTVNPAGES(44); 4140 /* 4141 * The beginning of the large page region can 4142 * be pulled to the right to make a smaller 4143 * region. We haven't yet faulted a single 4144 * page. 4145 */ 4146 a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 4147 ASSERT(a >= lpgaddr); 4148 off = svd->offset + 4149 (uintptr_t)(a - seg->s_base); 4150 aindx = svd->anon_index + seg_page(seg, a); 4151 vpage = (svd->vpage != NULL) ? 4152 &svd->vpage[seg_page(seg, a)] : NULL; 4153 } 4154 } 4155 } 4156 out: 4157 kmem_free(ppa, ppasize); 4158 if (!err && !vop_size_err) { 4159 SEGVN_VMSTAT_FLTVNPAGES(45); 4160 return (0); 4161 } 4162 if (type == F_SOFTLOCK && a > lpgaddr) { 4163 SEGVN_VMSTAT_FLTVNPAGES(46); 4164 segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER); 4165 } 4166 if (!vop_size_err) { 4167 SEGVN_VMSTAT_FLTVNPAGES(47); 4168 return (err); 4169 } 4170 ASSERT(brkcow || type == F_SOFTLOCK); 4171 /* 4172 * Large page end is mapped beyond the end of file and it's a cow 4173 * fault or softlock so we can't reduce the map area. For now just 4174 * demote the segment. This should really only happen if the end of 4175 * the file changed after the mapping was established since when large 4176 * page segments are created we make sure they don't extend beyond the 4177 * end of the file. 4178 */ 4179 SEGVN_VMSTAT_FLTVNPAGES(48); 4180 4181 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4182 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4183 err = 0; 4184 if (seg->s_szc != 0) { 4185 segvn_fltvnpages_clrszc_cnt++; 4186 ASSERT(svd->softlockcnt == 0); 4187 err = segvn_clrszc(seg); 4188 if (err != 0) { 4189 segvn_fltvnpages_clrszc_err++; 4190 } 4191 } 4192 ASSERT(err || seg->s_szc == 0); 4193 SEGVN_LOCK_DOWNGRADE(seg->s_as, &svd->lock); 4194 /* segvn_fault will do its job as if szc had been zero to begin with */ 4195 return (err == 0 ? IE_RETRY : FC_MAKE_ERR(err)); 4196 } 4197 4198 /* 4199 * This routine will attempt to fault in one large page. 4200 * it will use smaller pages if that fails. 4201 * It should only be called for pure anonymous segments. 4202 */ 4203 static faultcode_t 4204 segvn_fault_anonpages(struct hat *hat, struct seg *seg, caddr_t lpgaddr, 4205 caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr, 4206 caddr_t eaddr, int brkcow) 4207 { 4208 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 4209 struct anon_map *amp = svd->amp; 4210 uchar_t segtype = svd->type; 4211 uint_t szc = seg->s_szc; 4212 size_t pgsz = page_get_pagesize(szc); 4213 size_t maxpgsz = pgsz; 4214 pgcnt_t pages = btop(pgsz); 4215 size_t ppasize = pages * sizeof (page_t *); 4216 caddr_t a = lpgaddr; 4217 ulong_t aindx = svd->anon_index + seg_page(seg, a); 4218 struct vpage *vpage = (svd->vpage != NULL) ? 4219 &svd->vpage[seg_page(seg, a)] : NULL; 4220 page_t **ppa; 4221 uint_t ppa_szc; 4222 faultcode_t err; 4223 int ierr; 4224 uint_t protchk, prot, vpprot; 4225 ulong_t i; 4226 int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD; 4227 anon_sync_obj_t cookie; 4228 int first = 1; 4229 int adjszc_chk; 4230 int purged = 0; 4231 4232 ASSERT(szc != 0); 4233 ASSERT(amp != NULL); 4234 ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */ 4235 ASSERT(!(svd->flags & MAP_NORESERVE)); 4236 ASSERT(type != F_SOFTUNLOCK); 4237 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 4238 4239 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 4240 4241 VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltanpages[0]); 4242 VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltanpages[1]); 4243 4244 if (svd->flags & MAP_TEXT) { 4245 hat_flag |= HAT_LOAD_TEXT; 4246 } 4247 4248 if (svd->pageprot) { 4249 switch (rw) { 4250 case S_READ: 4251 protchk = PROT_READ; 4252 break; 4253 case S_WRITE: 4254 protchk = PROT_WRITE; 4255 break; 4256 case S_EXEC: 4257 protchk = PROT_EXEC; 4258 break; 4259 case S_OTHER: 4260 default: 4261 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 4262 break; 4263 } 4264 VM_STAT_ADD(segvnvmstats.fltanpages[2]); 4265 } else { 4266 prot = svd->prot; 4267 /* caller has already done segment level protection check. */ 4268 } 4269 4270 ppa = kmem_alloc(ppasize, KM_SLEEP); 4271 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4272 for (;;) { 4273 adjszc_chk = 0; 4274 for (; a < lpgeaddr; a += pgsz, aindx += pages) { 4275 if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) { 4276 VM_STAT_ADD(segvnvmstats.fltanpages[3]); 4277 ASSERT(vpage != NULL); 4278 prot = VPP_PROT(vpage); 4279 ASSERT(sameprot(seg, a, maxpgsz)); 4280 if ((prot & protchk) == 0) { 4281 err = FC_PROT; 4282 goto error; 4283 } 4284 } 4285 if (adjszc_chk && IS_P2ALIGNED(a, maxpgsz) && 4286 pgsz < maxpgsz) { 4287 ASSERT(a > lpgaddr); 4288 szc = seg->s_szc; 4289 pgsz = maxpgsz; 4290 pages = btop(pgsz); 4291 ASSERT(IS_P2ALIGNED(aindx, pages)); 4292 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, 4293 pgsz); 4294 } 4295 if (type == F_SOFTLOCK && svd->vp != NULL) { 4296 mutex_enter(&freemem_lock); 4297 if (availrmem < tune.t_minarmem + pages) { 4298 mutex_exit(&freemem_lock); 4299 err = FC_MAKE_ERR(ENOMEM); 4300 goto error; 4301 } else { 4302 availrmem -= pages; 4303 segvn_pages_locked += pages; 4304 svd->softlockcnt += pages; 4305 } 4306 mutex_exit(&freemem_lock); 4307 } 4308 anon_array_enter(amp, aindx, &cookie); 4309 ppa_szc = (uint_t)-1; 4310 ierr = anon_map_getpages(amp, aindx, szc, seg, a, 4311 prot, &vpprot, ppa, &ppa_szc, vpage, rw, brkcow, 4312 segvn_anypgsz, svd->cred); 4313 if (ierr != 0) { 4314 anon_array_exit(&cookie); 4315 VM_STAT_ADD(segvnvmstats.fltanpages[4]); 4316 if (type == F_SOFTLOCK && svd->vp != NULL) { 4317 VM_STAT_ADD(segvnvmstats.fltanpages[5]); 4318 mutex_enter(&freemem_lock); 4319 availrmem += pages; 4320 segvn_pages_locked -= pages; 4321 svd->softlockcnt -= pages; 4322 mutex_exit(&freemem_lock); 4323 } 4324 if (ierr > 0) { 4325 VM_STAT_ADD(segvnvmstats.fltanpages[6]); 4326 err = FC_MAKE_ERR(ierr); 4327 goto error; 4328 } 4329 break; 4330 } 4331 4332 ASSERT(!IS_VMODSORT(ppa[0]->p_vnode)); 4333 4334 ASSERT(segtype == MAP_SHARED || 4335 ppa[0]->p_szc <= szc); 4336 ASSERT(segtype == MAP_PRIVATE || 4337 ppa[0]->p_szc >= szc); 4338 4339 /* 4340 * Handle pages that have been marked for migration 4341 */ 4342 if (lgrp_optimizations()) 4343 page_migrate(seg, a, ppa, pages); 4344 4345 if (type == F_SOFTLOCK && svd->vp == NULL) { 4346 /* 4347 * All pages in ppa array belong to the same 4348 * large page. This means it's ok to call 4349 * segvn_pp_lock_anonpages just for ppa[0]. 4350 */ 4351 if (!segvn_pp_lock_anonpages(ppa[0], first)) { 4352 for (i = 0; i < pages; i++) { 4353 page_unlock(ppa[i]); 4354 } 4355 err = FC_MAKE_ERR(ENOMEM); 4356 goto error; 4357 } 4358 first = 0; 4359 mutex_enter(&freemem_lock); 4360 svd->softlockcnt += pages; 4361 segvn_pages_locked += pages; 4362 mutex_exit(&freemem_lock); 4363 } 4364 4365 if (segtype == MAP_SHARED) { 4366 vpprot |= PROT_WRITE; 4367 } 4368 4369 hat_memload_array(hat, a, pgsz, ppa, 4370 prot & vpprot, hat_flag); 4371 4372 if (hat_flag & HAT_LOAD_LOCK) { 4373 VM_STAT_ADD(segvnvmstats.fltanpages[7]); 4374 } else { 4375 VM_STAT_ADD(segvnvmstats.fltanpages[8]); 4376 for (i = 0; i < pages; i++) 4377 page_unlock(ppa[i]); 4378 } 4379 if (vpage != NULL) 4380 vpage += pages; 4381 4382 anon_array_exit(&cookie); 4383 adjszc_chk = 1; 4384 } 4385 if (a == lpgeaddr) 4386 break; 4387 ASSERT(a < lpgeaddr); 4388 /* 4389 * ierr == -1 means we failed to allocate a large page. 4390 * so do a size down operation. 4391 * 4392 * ierr == -2 means some other process that privately shares 4393 * pages with this process has allocated a larger page and we 4394 * need to retry with larger pages. So do a size up 4395 * operation. This relies on the fact that large pages are 4396 * never partially shared i.e. if we share any constituent 4397 * page of a large page with another process we must share the 4398 * entire large page. Note this cannot happen for SOFTLOCK 4399 * case, unless current address (a) is at the beginning of the 4400 * next page size boundary because the other process couldn't 4401 * have relocated locked pages. 4402 */ 4403 ASSERT(ierr == -1 || ierr == -2); 4404 /* 4405 * For the very first relocation failure try to purge this 4406 * segment's cache so that the relocator can obtain an 4407 * exclusive lock on pages we want to relocate. 4408 */ 4409 if (!purged && ierr == -1 && ppa_szc != (uint_t)-1 && 4410 svd->softlockcnt != 0) { 4411 purged = 1; 4412 segvn_purge(seg); 4413 continue; 4414 } 4415 4416 if (segvn_anypgsz) { 4417 ASSERT(ierr == -2 || szc != 0); 4418 ASSERT(ierr == -1 || szc < seg->s_szc); 4419 szc = (ierr == -1) ? szc - 1 : szc + 1; 4420 } else { 4421 /* 4422 * For non COW faults and segvn_anypgsz == 0 4423 * we need to be careful not to loop forever 4424 * if existing page is found with szc other 4425 * than 0 or seg->s_szc. This could be due 4426 * to page relocations on behalf of DR or 4427 * more likely large page creation. For this 4428 * case simply re-size to existing page's szc 4429 * if returned by anon_map_getpages(). 4430 */ 4431 if (ppa_szc == (uint_t)-1) { 4432 szc = (ierr == -1) ? 0 : seg->s_szc; 4433 } else { 4434 ASSERT(ppa_szc <= seg->s_szc); 4435 ASSERT(ierr == -2 || ppa_szc < szc); 4436 ASSERT(ierr == -1 || ppa_szc > szc); 4437 szc = ppa_szc; 4438 } 4439 } 4440 4441 pgsz = page_get_pagesize(szc); 4442 pages = btop(pgsz); 4443 ASSERT(type != F_SOFTLOCK || ierr == -1 || 4444 (IS_P2ALIGNED(a, pgsz) && IS_P2ALIGNED(lpgeaddr, pgsz))); 4445 if (type == F_SOFTLOCK) { 4446 /* 4447 * For softlocks we cannot reduce the fault area 4448 * (calculated based on the largest page size for this 4449 * segment) for size down and a is already next 4450 * page size aligned as assertted above for size 4451 * ups. Therefore just continue in case of softlock. 4452 */ 4453 VM_STAT_ADD(segvnvmstats.fltanpages[9]); 4454 continue; /* keep lint happy */ 4455 } else if (ierr == -2) { 4456 4457 /* 4458 * Size up case. Note lpgaddr may only be needed for 4459 * softlock case so we don't adjust it here. 4460 */ 4461 VM_STAT_ADD(segvnvmstats.fltanpages[10]); 4462 a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz); 4463 ASSERT(a >= lpgaddr); 4464 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4465 aindx = svd->anon_index + seg_page(seg, a); 4466 vpage = (svd->vpage != NULL) ? 4467 &svd->vpage[seg_page(seg, a)] : NULL; 4468 } else { 4469 /* 4470 * Size down case. Note lpgaddr may only be needed for 4471 * softlock case so we don't adjust it here. 4472 */ 4473 VM_STAT_ADD(segvnvmstats.fltanpages[11]); 4474 ASSERT(IS_P2ALIGNED(a, pgsz)); 4475 ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz)); 4476 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4477 ASSERT(a < lpgeaddr); 4478 if (a < addr) { 4479 /* 4480 * The beginning of the large page region can 4481 * be pulled to the right to make a smaller 4482 * region. We haven't yet faulted a single 4483 * page. 4484 */ 4485 VM_STAT_ADD(segvnvmstats.fltanpages[12]); 4486 a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 4487 ASSERT(a >= lpgaddr); 4488 aindx = svd->anon_index + seg_page(seg, a); 4489 vpage = (svd->vpage != NULL) ? 4490 &svd->vpage[seg_page(seg, a)] : NULL; 4491 } 4492 } 4493 } 4494 VM_STAT_ADD(segvnvmstats.fltanpages[13]); 4495 ANON_LOCK_EXIT(&->a_rwlock); 4496 kmem_free(ppa, ppasize); 4497 return (0); 4498 error: 4499 VM_STAT_ADD(segvnvmstats.fltanpages[14]); 4500 ANON_LOCK_EXIT(&->a_rwlock); 4501 kmem_free(ppa, ppasize); 4502 if (type == F_SOFTLOCK && a > lpgaddr) { 4503 VM_STAT_ADD(segvnvmstats.fltanpages[15]); 4504 segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER); 4505 } 4506 return (err); 4507 } 4508 4509 int fltadvice = 1; /* set to free behind pages for sequential access */ 4510 4511 /* 4512 * This routine is called via a machine specific fault handling routine. 4513 * It is also called by software routines wishing to lock or unlock 4514 * a range of addresses. 4515 * 4516 * Here is the basic algorithm: 4517 * If unlocking 4518 * Call segvn_softunlock 4519 * Return 4520 * endif 4521 * Checking and set up work 4522 * If we will need some non-anonymous pages 4523 * Call VOP_GETPAGE over the range of non-anonymous pages 4524 * endif 4525 * Loop over all addresses requested 4526 * Call segvn_faultpage passing in page list 4527 * to load up translations and handle anonymous pages 4528 * endloop 4529 * Load up translation to any additional pages in page list not 4530 * already handled that fit into this segment 4531 */ 4532 static faultcode_t 4533 segvn_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len, 4534 enum fault_type type, enum seg_rw rw) 4535 { 4536 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 4537 page_t **plp, **ppp, *pp; 4538 u_offset_t off; 4539 caddr_t a; 4540 struct vpage *vpage; 4541 uint_t vpprot, prot; 4542 int err; 4543 page_t *pl[PVN_GETPAGE_NUM + 1]; 4544 size_t plsz, pl_alloc_sz; 4545 size_t page; 4546 ulong_t anon_index; 4547 struct anon_map *amp; 4548 int dogetpage = 0; 4549 caddr_t lpgaddr, lpgeaddr; 4550 size_t pgsz; 4551 anon_sync_obj_t cookie; 4552 int brkcow = BREAK_COW_SHARE(rw, type, svd->type); 4553 4554 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 4555 4556 /* 4557 * First handle the easy stuff 4558 */ 4559 if (type == F_SOFTUNLOCK) { 4560 if (rw == S_READ_NOCOW) { 4561 rw = S_READ; 4562 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 4563 } 4564 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 4565 pgsz = (seg->s_szc == 0) ? PAGESIZE : 4566 page_get_pagesize(seg->s_szc); 4567 VM_STAT_COND_ADD(pgsz > PAGESIZE, segvnvmstats.fltanpages[16]); 4568 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 4569 segvn_softunlock(seg, lpgaddr, lpgeaddr - lpgaddr, rw); 4570 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4571 return (0); 4572 } 4573 4574 top: 4575 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 4576 4577 /* 4578 * If we have the same protections for the entire segment, 4579 * insure that the access being attempted is legitimate. 4580 */ 4581 4582 if (svd->pageprot == 0) { 4583 uint_t protchk; 4584 4585 switch (rw) { 4586 case S_READ: 4587 case S_READ_NOCOW: 4588 protchk = PROT_READ; 4589 break; 4590 case S_WRITE: 4591 protchk = PROT_WRITE; 4592 break; 4593 case S_EXEC: 4594 protchk = PROT_EXEC; 4595 break; 4596 case S_OTHER: 4597 default: 4598 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 4599 break; 4600 } 4601 4602 if ((svd->prot & protchk) == 0) { 4603 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4604 return (FC_PROT); /* illegal access type */ 4605 } 4606 } 4607 4608 /* 4609 * We can't allow the long term use of softlocks for vmpss segments, 4610 * because in some file truncation cases we should be able to demote 4611 * the segment, which requires that there are no softlocks. The 4612 * only case where it's ok to allow a SOFTLOCK fault against a vmpss 4613 * segment is S_READ_NOCOW, where the caller holds the address space 4614 * locked as writer and calls softunlock before dropping the as lock. 4615 * S_READ_NOCOW is used by /proc to read memory from another user. 4616 * 4617 * Another deadlock between SOFTLOCK and file truncation can happen 4618 * because segvn_fault_vnodepages() calls the FS one pagesize at 4619 * a time. A second VOP_GETPAGE() call by segvn_fault_vnodepages() 4620 * can cause a deadlock because the first set of page_t's remain 4621 * locked SE_SHARED. To avoid this, we demote segments on a first 4622 * SOFTLOCK if they have a length greater than the segment's 4623 * page size. 4624 * 4625 * So for now, we only avoid demoting a segment on a SOFTLOCK when 4626 * the access type is S_READ_NOCOW and the fault length is less than 4627 * or equal to the segment's page size. While this is quite restrictive, 4628 * it should be the most common case of SOFTLOCK against a vmpss 4629 * segment. 4630 * 4631 * For S_READ_NOCOW, it's safe not to do a copy on write because the 4632 * caller makes sure no COW will be caused by another thread for a 4633 * softlocked page. 4634 */ 4635 if (type == F_SOFTLOCK && svd->vp != NULL && seg->s_szc != 0) { 4636 int demote = 0; 4637 4638 if (rw != S_READ_NOCOW) { 4639 demote = 1; 4640 } 4641 if (!demote && len > PAGESIZE) { 4642 pgsz = page_get_pagesize(seg->s_szc); 4643 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, 4644 lpgeaddr); 4645 if (lpgeaddr - lpgaddr > pgsz) { 4646 demote = 1; 4647 } 4648 } 4649 4650 ASSERT(demote || AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 4651 4652 if (demote) { 4653 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4654 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4655 if (seg->s_szc != 0) { 4656 segvn_vmpss_clrszc_cnt++; 4657 ASSERT(svd->softlockcnt == 0); 4658 err = segvn_clrszc(seg); 4659 if (err) { 4660 segvn_vmpss_clrszc_err++; 4661 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4662 return (FC_MAKE_ERR(err)); 4663 } 4664 } 4665 ASSERT(seg->s_szc == 0); 4666 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4667 goto top; 4668 } 4669 } 4670 4671 /* 4672 * Check to see if we need to allocate an anon_map structure. 4673 */ 4674 if (svd->amp == NULL && (svd->vp == NULL || brkcow)) { 4675 /* 4676 * Drop the "read" lock on the segment and acquire 4677 * the "write" version since we have to allocate the 4678 * anon_map. 4679 */ 4680 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4681 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4682 4683 if (svd->amp == NULL) { 4684 svd->amp = anonmap_alloc(seg->s_size, 0); 4685 svd->amp->a_szc = seg->s_szc; 4686 } 4687 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4688 4689 /* 4690 * Start all over again since segment protections 4691 * may have changed after we dropped the "read" lock. 4692 */ 4693 goto top; 4694 } 4695 4696 /* 4697 * S_READ_NOCOW vs S_READ distinction was 4698 * only needed for the code above. After 4699 * that we treat it as S_READ. 4700 */ 4701 if (rw == S_READ_NOCOW) { 4702 ASSERT(type == F_SOFTLOCK); 4703 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 4704 rw = S_READ; 4705 } 4706 4707 amp = svd->amp; 4708 4709 /* 4710 * MADV_SEQUENTIAL work is ignored for large page segments. 4711 */ 4712 if (seg->s_szc != 0) { 4713 pgsz = page_get_pagesize(seg->s_szc); 4714 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 4715 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 4716 if (svd->vp == NULL) { 4717 err = segvn_fault_anonpages(hat, seg, lpgaddr, 4718 lpgeaddr, type, rw, addr, addr + len, brkcow); 4719 } else { 4720 err = segvn_fault_vnodepages(hat, seg, lpgaddr, 4721 lpgeaddr, type, rw, addr, addr + len, brkcow); 4722 if (err == IE_RETRY) { 4723 ASSERT(seg->s_szc == 0); 4724 ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock)); 4725 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4726 goto top; 4727 } 4728 } 4729 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4730 return (err); 4731 } 4732 4733 page = seg_page(seg, addr); 4734 if (amp != NULL) { 4735 anon_index = svd->anon_index + page; 4736 4737 if ((type == F_PROT) && (rw == S_READ) && 4738 svd->type == MAP_PRIVATE && svd->pageprot == 0) { 4739 size_t index = anon_index; 4740 struct anon *ap; 4741 4742 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4743 /* 4744 * The fast path could apply to S_WRITE also, except 4745 * that the protection fault could be caused by lazy 4746 * tlb flush when ro->rw. In this case, the pte is 4747 * RW already. But RO in the other cpu's tlb causes 4748 * the fault. Since hat_chgprot won't do anything if 4749 * pte doesn't change, we may end up faulting 4750 * indefinitely until the RO tlb entry gets replaced. 4751 */ 4752 for (a = addr; a < addr + len; a += PAGESIZE, index++) { 4753 anon_array_enter(amp, index, &cookie); 4754 ap = anon_get_ptr(amp->ahp, index); 4755 anon_array_exit(&cookie); 4756 if ((ap == NULL) || (ap->an_refcnt != 1)) { 4757 ANON_LOCK_EXIT(&->a_rwlock); 4758 goto slow; 4759 } 4760 } 4761 hat_chgprot(seg->s_as->a_hat, addr, len, svd->prot); 4762 ANON_LOCK_EXIT(&->a_rwlock); 4763 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4764 return (0); 4765 } 4766 } 4767 slow: 4768 4769 if (svd->vpage == NULL) 4770 vpage = NULL; 4771 else 4772 vpage = &svd->vpage[page]; 4773 4774 off = svd->offset + (uintptr_t)(addr - seg->s_base); 4775 4776 /* 4777 * If MADV_SEQUENTIAL has been set for the particular page we 4778 * are faulting on, free behind all pages in the segment and put 4779 * them on the free list. 4780 */ 4781 if ((page != 0) && fltadvice) { /* not if first page in segment */ 4782 struct vpage *vpp; 4783 ulong_t fanon_index; 4784 size_t fpage; 4785 u_offset_t pgoff, fpgoff; 4786 struct vnode *fvp; 4787 struct anon *fap = NULL; 4788 4789 if (svd->advice == MADV_SEQUENTIAL || 4790 (svd->pageadvice && 4791 VPP_ADVICE(vpage) == MADV_SEQUENTIAL)) { 4792 pgoff = off - PAGESIZE; 4793 fpage = page - 1; 4794 if (vpage != NULL) 4795 vpp = &svd->vpage[fpage]; 4796 if (amp != NULL) 4797 fanon_index = svd->anon_index + fpage; 4798 4799 while (pgoff > svd->offset) { 4800 if (svd->advice != MADV_SEQUENTIAL && 4801 (!svd->pageadvice || (vpage && 4802 VPP_ADVICE(vpp) != MADV_SEQUENTIAL))) 4803 break; 4804 4805 /* 4806 * If this is an anon page, we must find the 4807 * correct <vp, offset> for it 4808 */ 4809 fap = NULL; 4810 if (amp != NULL) { 4811 ANON_LOCK_ENTER(&->a_rwlock, 4812 RW_READER); 4813 anon_array_enter(amp, fanon_index, 4814 &cookie); 4815 fap = anon_get_ptr(amp->ahp, 4816 fanon_index); 4817 if (fap != NULL) { 4818 swap_xlate(fap, &fvp, &fpgoff); 4819 } else { 4820 fpgoff = pgoff; 4821 fvp = svd->vp; 4822 } 4823 anon_array_exit(&cookie); 4824 ANON_LOCK_EXIT(&->a_rwlock); 4825 } else { 4826 fpgoff = pgoff; 4827 fvp = svd->vp; 4828 } 4829 if (fvp == NULL) 4830 break; /* XXX */ 4831 /* 4832 * Skip pages that are free or have an 4833 * "exclusive" lock. 4834 */ 4835 pp = page_lookup_nowait(fvp, fpgoff, SE_SHARED); 4836 if (pp == NULL) 4837 break; 4838 /* 4839 * We don't need the page_struct_lock to test 4840 * as this is only advisory; even if we 4841 * acquire it someone might race in and lock 4842 * the page after we unlock and before the 4843 * PUTPAGE, then VOP_PUTPAGE will do nothing. 4844 */ 4845 if (pp->p_lckcnt == 0 && pp->p_cowcnt == 0) { 4846 /* 4847 * Hold the vnode before releasing 4848 * the page lock to prevent it from 4849 * being freed and re-used by some 4850 * other thread. 4851 */ 4852 VN_HOLD(fvp); 4853 page_unlock(pp); 4854 /* 4855 * We should build a page list 4856 * to kluster putpages XXX 4857 */ 4858 (void) VOP_PUTPAGE(fvp, 4859 (offset_t)fpgoff, PAGESIZE, 4860 (B_DONTNEED|B_FREE|B_ASYNC), 4861 svd->cred); 4862 VN_RELE(fvp); 4863 } else { 4864 /* 4865 * XXX - Should the loop terminate if 4866 * the page is `locked'? 4867 */ 4868 page_unlock(pp); 4869 } 4870 --vpp; 4871 --fanon_index; 4872 pgoff -= PAGESIZE; 4873 } 4874 } 4875 } 4876 4877 plp = pl; 4878 *plp = NULL; 4879 pl_alloc_sz = 0; 4880 4881 /* 4882 * See if we need to call VOP_GETPAGE for 4883 * *any* of the range being faulted on. 4884 * We can skip all of this work if there 4885 * was no original vnode. 4886 */ 4887 if (svd->vp != NULL) { 4888 u_offset_t vp_off; 4889 size_t vp_len; 4890 struct anon *ap; 4891 vnode_t *vp; 4892 4893 vp_off = off; 4894 vp_len = len; 4895 4896 if (amp == NULL) 4897 dogetpage = 1; 4898 else { 4899 /* 4900 * Only acquire reader lock to prevent amp->ahp 4901 * from being changed. It's ok to miss pages, 4902 * hence we don't do anon_array_enter 4903 */ 4904 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4905 ap = anon_get_ptr(amp->ahp, anon_index); 4906 4907 if (len <= PAGESIZE) 4908 /* inline non_anon() */ 4909 dogetpage = (ap == NULL); 4910 else 4911 dogetpage = non_anon(amp->ahp, anon_index, 4912 &vp_off, &vp_len); 4913 ANON_LOCK_EXIT(&->a_rwlock); 4914 } 4915 4916 if (dogetpage) { 4917 enum seg_rw arw; 4918 struct as *as = seg->s_as; 4919 4920 if (len > ptob((sizeof (pl) / sizeof (pl[0])) - 1)) { 4921 /* 4922 * Page list won't fit in local array, 4923 * allocate one of the needed size. 4924 */ 4925 pl_alloc_sz = 4926 (btop(len) + 1) * sizeof (page_t *); 4927 plp = kmem_alloc(pl_alloc_sz, KM_SLEEP); 4928 plp[0] = NULL; 4929 plsz = len; 4930 } else if (rw == S_WRITE && svd->type == MAP_PRIVATE || 4931 rw == S_OTHER || 4932 (((size_t)(addr + PAGESIZE) < 4933 (size_t)(seg->s_base + seg->s_size)) && 4934 hat_probe(as->a_hat, addr + PAGESIZE))) { 4935 /* 4936 * Ask VOP_GETPAGE to return the exact number 4937 * of pages if 4938 * (a) this is a COW fault, or 4939 * (b) this is a software fault, or 4940 * (c) next page is already mapped. 4941 */ 4942 plsz = len; 4943 } else { 4944 /* 4945 * Ask VOP_GETPAGE to return adjacent pages 4946 * within the segment. 4947 */ 4948 plsz = MIN((size_t)PVN_GETPAGE_SZ, (size_t) 4949 ((seg->s_base + seg->s_size) - addr)); 4950 ASSERT((addr + plsz) <= 4951 (seg->s_base + seg->s_size)); 4952 } 4953 4954 /* 4955 * Need to get some non-anonymous pages. 4956 * We need to make only one call to GETPAGE to do 4957 * this to prevent certain deadlocking conditions 4958 * when we are doing locking. In this case 4959 * non_anon() should have picked up the smallest 4960 * range which includes all the non-anonymous 4961 * pages in the requested range. We have to 4962 * be careful regarding which rw flag to pass in 4963 * because on a private mapping, the underlying 4964 * object is never allowed to be written. 4965 */ 4966 if (rw == S_WRITE && svd->type == MAP_PRIVATE) { 4967 arw = S_READ; 4968 } else { 4969 arw = rw; 4970 } 4971 vp = svd->vp; 4972 TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE, 4973 "segvn_getpage:seg %p addr %p vp %p", 4974 seg, addr, vp); 4975 err = VOP_GETPAGE(vp, (offset_t)vp_off, vp_len, 4976 &vpprot, plp, plsz, seg, addr + (vp_off - off), arw, 4977 svd->cred); 4978 if (err) { 4979 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4980 segvn_pagelist_rele(plp); 4981 if (pl_alloc_sz) 4982 kmem_free(plp, pl_alloc_sz); 4983 return (FC_MAKE_ERR(err)); 4984 } 4985 if (svd->type == MAP_PRIVATE) 4986 vpprot &= ~PROT_WRITE; 4987 } 4988 } 4989 4990 /* 4991 * N.B. at this time the plp array has all the needed non-anon 4992 * pages in addition to (possibly) having some adjacent pages. 4993 */ 4994 4995 /* 4996 * Always acquire the anon_array_lock to prevent 4997 * 2 threads from allocating separate anon slots for 4998 * the same "addr". 4999 * 5000 * If this is a copy-on-write fault and we don't already 5001 * have the anon_array_lock, acquire it to prevent the 5002 * fault routine from handling multiple copy-on-write faults 5003 * on the same "addr" in the same address space. 5004 * 5005 * Only one thread should deal with the fault since after 5006 * it is handled, the other threads can acquire a translation 5007 * to the newly created private page. This prevents two or 5008 * more threads from creating different private pages for the 5009 * same fault. 5010 * 5011 * We grab "serialization" lock here if this is a MAP_PRIVATE segment 5012 * to prevent deadlock between this thread and another thread 5013 * which has soft-locked this page and wants to acquire serial_lock. 5014 * ( bug 4026339 ) 5015 * 5016 * The fix for bug 4026339 becomes unnecessary when using the 5017 * locking scheme with per amp rwlock and a global set of hash 5018 * lock, anon_array_lock. If we steal a vnode page when low 5019 * on memory and upgrad the page lock through page_rename, 5020 * then the page is PAGE_HANDLED, nothing needs to be done 5021 * for this page after returning from segvn_faultpage. 5022 * 5023 * But really, the page lock should be downgraded after 5024 * the stolen page is page_rename'd. 5025 */ 5026 5027 if (amp != NULL) 5028 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5029 5030 /* 5031 * Ok, now loop over the address range and handle faults 5032 */ 5033 for (a = addr; a < addr + len; a += PAGESIZE, off += PAGESIZE) { 5034 err = segvn_faultpage(hat, seg, a, off, vpage, plp, vpprot, 5035 type, rw, brkcow, a == addr); 5036 if (err) { 5037 if (amp != NULL) 5038 ANON_LOCK_EXIT(&->a_rwlock); 5039 if (type == F_SOFTLOCK && a > addr) { 5040 segvn_softunlock(seg, addr, (a - addr), 5041 S_OTHER); 5042 } 5043 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5044 segvn_pagelist_rele(plp); 5045 if (pl_alloc_sz) 5046 kmem_free(plp, pl_alloc_sz); 5047 return (err); 5048 } 5049 if (vpage) { 5050 vpage++; 5051 } else if (svd->vpage) { 5052 page = seg_page(seg, addr); 5053 vpage = &svd->vpage[++page]; 5054 } 5055 } 5056 5057 /* Didn't get pages from the underlying fs so we're done */ 5058 if (!dogetpage) 5059 goto done; 5060 5061 /* 5062 * Now handle any other pages in the list returned. 5063 * If the page can be used, load up the translations now. 5064 * Note that the for loop will only be entered if "plp" 5065 * is pointing to a non-NULL page pointer which means that 5066 * VOP_GETPAGE() was called and vpprot has been initialized. 5067 */ 5068 if (svd->pageprot == 0) 5069 prot = svd->prot & vpprot; 5070 5071 5072 /* 5073 * Large Files: diff should be unsigned value because we started 5074 * supporting > 2GB segment sizes from 2.5.1 and when a 5075 * large file of size > 2GB gets mapped to address space 5076 * the diff value can be > 2GB. 5077 */ 5078 5079 for (ppp = plp; (pp = *ppp) != NULL; ppp++) { 5080 size_t diff; 5081 struct anon *ap; 5082 int anon_index; 5083 anon_sync_obj_t cookie; 5084 int hat_flag = HAT_LOAD_ADV; 5085 5086 if (svd->flags & MAP_TEXT) { 5087 hat_flag |= HAT_LOAD_TEXT; 5088 } 5089 5090 if (pp == PAGE_HANDLED) 5091 continue; 5092 5093 if (pp->p_offset >= svd->offset && 5094 (pp->p_offset < svd->offset + seg->s_size)) { 5095 5096 diff = pp->p_offset - svd->offset; 5097 5098 /* 5099 * Large Files: Following is the assertion 5100 * validating the above cast. 5101 */ 5102 ASSERT(svd->vp == pp->p_vnode); 5103 5104 page = btop(diff); 5105 if (svd->pageprot) 5106 prot = VPP_PROT(&svd->vpage[page]) & vpprot; 5107 5108 /* 5109 * Prevent other threads in the address space from 5110 * creating private pages (i.e., allocating anon slots) 5111 * while we are in the process of loading translations 5112 * to additional pages returned by the underlying 5113 * object. 5114 */ 5115 if (amp != NULL) { 5116 anon_index = svd->anon_index + page; 5117 anon_array_enter(amp, anon_index, &cookie); 5118 ap = anon_get_ptr(amp->ahp, anon_index); 5119 } 5120 if ((amp == NULL) || (ap == NULL)) { 5121 if (IS_VMODSORT(pp->p_vnode) || 5122 enable_mbit_wa) { 5123 if (rw == S_WRITE) 5124 hat_setmod(pp); 5125 else if (rw != S_OTHER && 5126 !hat_ismod(pp)) 5127 prot &= ~PROT_WRITE; 5128 } 5129 /* 5130 * Skip mapping read ahead pages marked 5131 * for migration, so they will get migrated 5132 * properly on fault 5133 */ 5134 if ((prot & PROT_READ) && !PP_ISMIGRATE(pp)) { 5135 hat_memload(hat, seg->s_base + diff, 5136 pp, prot, hat_flag); 5137 } 5138 } 5139 if (amp != NULL) 5140 anon_array_exit(&cookie); 5141 } 5142 page_unlock(pp); 5143 } 5144 done: 5145 if (amp != NULL) 5146 ANON_LOCK_EXIT(&->a_rwlock); 5147 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5148 if (pl_alloc_sz) 5149 kmem_free(plp, pl_alloc_sz); 5150 return (0); 5151 } 5152 5153 /* 5154 * This routine is used to start I/O on pages asynchronously. XXX it will 5155 * only create PAGESIZE pages. At fault time they will be relocated into 5156 * larger pages. 5157 */ 5158 static faultcode_t 5159 segvn_faulta(struct seg *seg, caddr_t addr) 5160 { 5161 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5162 int err; 5163 struct anon_map *amp; 5164 vnode_t *vp; 5165 5166 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5167 5168 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 5169 if ((amp = svd->amp) != NULL) { 5170 struct anon *ap; 5171 5172 /* 5173 * Reader lock to prevent amp->ahp from being changed. 5174 * This is advisory, it's ok to miss a page, so 5175 * we don't do anon_array_enter lock. 5176 */ 5177 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5178 if ((ap = anon_get_ptr(amp->ahp, 5179 svd->anon_index + seg_page(seg, addr))) != NULL) { 5180 5181 err = anon_getpage(&ap, NULL, NULL, 5182 0, seg, addr, S_READ, svd->cred); 5183 5184 ANON_LOCK_EXIT(&->a_rwlock); 5185 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5186 if (err) 5187 return (FC_MAKE_ERR(err)); 5188 return (0); 5189 } 5190 ANON_LOCK_EXIT(&->a_rwlock); 5191 } 5192 5193 if (svd->vp == NULL) { 5194 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5195 return (0); /* zfod page - do nothing now */ 5196 } 5197 5198 vp = svd->vp; 5199 TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE, 5200 "segvn_getpage:seg %p addr %p vp %p", seg, addr, vp); 5201 err = VOP_GETPAGE(vp, 5202 (offset_t)(svd->offset + (uintptr_t)(addr - seg->s_base)), 5203 PAGESIZE, NULL, NULL, 0, seg, addr, 5204 S_OTHER, svd->cred); 5205 5206 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5207 if (err) 5208 return (FC_MAKE_ERR(err)); 5209 return (0); 5210 } 5211 5212 static int 5213 segvn_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) 5214 { 5215 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5216 struct vpage *svp, *evp; 5217 struct vnode *vp; 5218 size_t pgsz; 5219 pgcnt_t pgcnt; 5220 anon_sync_obj_t cookie; 5221 5222 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5223 5224 if ((svd->maxprot & prot) != prot) 5225 return (EACCES); /* violated maxprot */ 5226 5227 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 5228 5229 /* return if prot is the same */ 5230 if (!svd->pageprot && svd->prot == prot) { 5231 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5232 return (0); 5233 } 5234 5235 /* 5236 * Since we change protections we first have to flush the cache. 5237 * This makes sure all the pagelock calls have to recheck 5238 * protections. 5239 */ 5240 if (svd->softlockcnt > 0) { 5241 /* 5242 * Since we do have the segvn writers lock nobody can fill 5243 * the cache with entries belonging to this seg during 5244 * the purge. The flush either succeeds or we still have 5245 * pending I/Os. 5246 */ 5247 segvn_purge(seg); 5248 if (svd->softlockcnt > 0) { 5249 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5250 return (EAGAIN); 5251 } 5252 } 5253 5254 if (seg->s_szc != 0) { 5255 int err; 5256 pgsz = page_get_pagesize(seg->s_szc); 5257 pgcnt = pgsz >> PAGESHIFT; 5258 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 5259 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { 5260 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5261 ASSERT(seg->s_base != addr || seg->s_size != len); 5262 /* 5263 * If we are holding the as lock as a reader then 5264 * we need to return IE_RETRY and let the as 5265 * layer drop and re-aquire the lock as a writer. 5266 */ 5267 if (AS_READ_HELD(seg->s_as, &seg->s_as->a_lock)) 5268 return (IE_RETRY); 5269 VM_STAT_ADD(segvnvmstats.demoterange[1]); 5270 if (svd->type == MAP_PRIVATE || svd->vp != NULL) { 5271 err = segvn_demote_range(seg, addr, len, 5272 SDR_END, 0); 5273 } else { 5274 uint_t szcvec = map_pgszcvec(seg->s_base, 5275 pgsz, (uintptr_t)seg->s_base, 5276 (svd->flags & MAP_TEXT), MAPPGSZC_SHM, 0); 5277 err = segvn_demote_range(seg, addr, len, 5278 SDR_END, szcvec); 5279 } 5280 if (err == 0) 5281 return (IE_RETRY); 5282 if (err == ENOMEM) 5283 return (IE_NOMEM); 5284 return (err); 5285 } 5286 } 5287 5288 5289 /* 5290 * If it's a private mapping and we're making it writable 5291 * and no swap space has been reserved, have to reserve 5292 * it all now. If it's a private mapping to a file (i.e., vp != NULL) 5293 * and we're removing write permission on the entire segment and 5294 * we haven't modified any pages, we can release the swap space. 5295 */ 5296 if (svd->type == MAP_PRIVATE) { 5297 if (prot & PROT_WRITE) { 5298 size_t sz; 5299 if (svd->swresv == 0 && !(svd->flags & MAP_NORESERVE)) { 5300 if (anon_resv(seg->s_size) == 0) { 5301 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5302 return (IE_NOMEM); 5303 } 5304 sz = svd->swresv = seg->s_size; 5305 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 5306 "anon proc:%p %lu %u", 5307 seg, sz, 1); 5308 } 5309 } else { 5310 /* 5311 * Swap space is released only if this segment 5312 * does not map anonymous memory, since read faults 5313 * on such segments still need an anon slot to read 5314 * in the data. 5315 */ 5316 if (svd->swresv != 0 && svd->vp != NULL && 5317 svd->amp == NULL && addr == seg->s_base && 5318 len == seg->s_size && svd->pageprot == 0) { 5319 anon_unresv(svd->swresv); 5320 svd->swresv = 0; 5321 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 5322 "anon proc:%p %lu %u", 5323 seg, 0, 0); 5324 } 5325 } 5326 } 5327 5328 if (addr == seg->s_base && len == seg->s_size && svd->pageprot == 0) { 5329 if (svd->prot == prot) { 5330 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5331 return (0); /* all done */ 5332 } 5333 svd->prot = (uchar_t)prot; 5334 } else if (svd->type == MAP_PRIVATE) { 5335 struct anon *ap = NULL; 5336 page_t *pp; 5337 u_offset_t offset, off; 5338 struct anon_map *amp; 5339 ulong_t anon_idx = 0; 5340 5341 /* 5342 * A vpage structure exists or else the change does not 5343 * involve the entire segment. Establish a vpage structure 5344 * if none is there. Then, for each page in the range, 5345 * adjust its individual permissions. Note that write- 5346 * enabling a MAP_PRIVATE page can affect the claims for 5347 * locked down memory. Overcommitting memory terminates 5348 * the operation. 5349 */ 5350 segvn_vpage(seg); 5351 if ((amp = svd->amp) != NULL) { 5352 anon_idx = svd->anon_index + seg_page(seg, addr); 5353 ASSERT(seg->s_szc == 0 || 5354 IS_P2ALIGNED(anon_idx, pgcnt)); 5355 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5356 } 5357 5358 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 5359 evp = &svd->vpage[seg_page(seg, addr + len)]; 5360 5361 /* 5362 * See Statement at the beginning of segvn_lockop regarding 5363 * the way cowcnts and lckcnts are handled. 5364 */ 5365 for (svp = &svd->vpage[seg_page(seg, addr)]; svp < evp; svp++) { 5366 5367 if (seg->s_szc != 0) { 5368 if (amp != NULL) { 5369 anon_array_enter(amp, anon_idx, 5370 &cookie); 5371 } 5372 if (IS_P2ALIGNED(anon_idx, pgcnt) && 5373 !segvn_claim_pages(seg, svp, offset, 5374 anon_idx, prot)) { 5375 if (amp != NULL) { 5376 anon_array_exit(&cookie); 5377 } 5378 break; 5379 } 5380 if (amp != NULL) { 5381 anon_array_exit(&cookie); 5382 } 5383 anon_idx++; 5384 } else { 5385 if (amp != NULL) { 5386 anon_array_enter(amp, anon_idx, 5387 &cookie); 5388 ap = anon_get_ptr(amp->ahp, anon_idx++); 5389 } 5390 5391 if (VPP_ISPPLOCK(svp) && 5392 VPP_PROT(svp) != prot) { 5393 5394 if (amp == NULL || ap == NULL) { 5395 vp = svd->vp; 5396 off = offset; 5397 } else 5398 swap_xlate(ap, &vp, &off); 5399 if (amp != NULL) 5400 anon_array_exit(&cookie); 5401 5402 if ((pp = page_lookup(vp, off, 5403 SE_SHARED)) == NULL) { 5404 panic("segvn_setprot: no page"); 5405 /*NOTREACHED*/ 5406 } 5407 ASSERT(seg->s_szc == 0); 5408 if ((VPP_PROT(svp) ^ prot) & 5409 PROT_WRITE) { 5410 if (prot & PROT_WRITE) { 5411 if (!page_addclaim(pp)) { 5412 page_unlock(pp); 5413 break; 5414 } 5415 } else { 5416 if (!page_subclaim(pp)) { 5417 page_unlock(pp); 5418 break; 5419 } 5420 } 5421 } 5422 page_unlock(pp); 5423 } else if (amp != NULL) 5424 anon_array_exit(&cookie); 5425 } 5426 VPP_SETPROT(svp, prot); 5427 offset += PAGESIZE; 5428 } 5429 if (amp != NULL) 5430 ANON_LOCK_EXIT(&->a_rwlock); 5431 5432 /* 5433 * Did we terminate prematurely? If so, simply unload 5434 * the translations to the things we've updated so far. 5435 */ 5436 if (svp != evp) { 5437 len = (svp - &svd->vpage[seg_page(seg, addr)]) * 5438 PAGESIZE; 5439 ASSERT(seg->s_szc == 0 || IS_P2ALIGNED(len, pgsz)); 5440 if (len != 0) 5441 hat_unload(seg->s_as->a_hat, addr, 5442 len, HAT_UNLOAD); 5443 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5444 return (IE_NOMEM); 5445 } 5446 } else { 5447 segvn_vpage(seg); 5448 evp = &svd->vpage[seg_page(seg, addr + len)]; 5449 for (svp = &svd->vpage[seg_page(seg, addr)]; svp < evp; svp++) { 5450 VPP_SETPROT(svp, prot); 5451 } 5452 } 5453 5454 if (((prot & PROT_WRITE) != 0 && 5455 (svd->vp != NULL || svd->type == MAP_PRIVATE)) || 5456 (prot & ~PROT_USER) == PROT_NONE) { 5457 /* 5458 * Either private or shared data with write access (in 5459 * which case we need to throw out all former translations 5460 * so that we get the right translations set up on fault 5461 * and we don't allow write access to any copy-on-write pages 5462 * that might be around or to prevent write access to pages 5463 * representing holes in a file), or we don't have permission 5464 * to access the memory at all (in which case we have to 5465 * unload any current translations that might exist). 5466 */ 5467 hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD); 5468 } else { 5469 /* 5470 * A shared mapping or a private mapping in which write 5471 * protection is going to be denied - just change all the 5472 * protections over the range of addresses in question. 5473 * segvn does not support any other attributes other 5474 * than prot so we can use hat_chgattr. 5475 */ 5476 hat_chgattr(seg->s_as->a_hat, addr, len, prot); 5477 } 5478 5479 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5480 5481 return (0); 5482 } 5483 5484 /* 5485 * segvn_setpagesize is called via SEGOP_SETPAGESIZE from as_setpagesize, 5486 * to determine if the seg is capable of mapping the requested szc. 5487 */ 5488 static int 5489 segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc) 5490 { 5491 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5492 struct segvn_data *nsvd; 5493 struct anon_map *amp = svd->amp; 5494 struct seg *nseg; 5495 caddr_t eaddr = addr + len, a; 5496 size_t pgsz = page_get_pagesize(szc); 5497 pgcnt_t pgcnt = page_get_pagecnt(szc); 5498 int err; 5499 u_offset_t off = svd->offset + (uintptr_t)(addr - seg->s_base); 5500 extern struct vnode kvp; 5501 5502 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 5503 ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size); 5504 5505 if (seg->s_szc == szc || segvn_lpg_disable != 0) { 5506 return (0); 5507 } 5508 5509 /* 5510 * addr should always be pgsz aligned but eaddr may be misaligned if 5511 * it's at the end of the segment. 5512 * 5513 * XXX we should assert this condition since as_setpagesize() logic 5514 * guarantees it. 5515 */ 5516 if (!IS_P2ALIGNED(addr, pgsz) || 5517 (!IS_P2ALIGNED(eaddr, pgsz) && 5518 eaddr != seg->s_base + seg->s_size)) { 5519 5520 segvn_setpgsz_align_err++; 5521 return (EINVAL); 5522 } 5523 5524 if (amp != NULL && svd->type == MAP_SHARED) { 5525 ulong_t an_idx = svd->anon_index + seg_page(seg, addr); 5526 if (!IS_P2ALIGNED(an_idx, pgcnt)) { 5527 5528 segvn_setpgsz_anon_align_err++; 5529 return (EINVAL); 5530 } 5531 } 5532 5533 if ((svd->flags & MAP_NORESERVE) || seg->s_as == &kas || 5534 szc > segvn_maxpgszc) { 5535 return (EINVAL); 5536 } 5537 5538 /* paranoid check */ 5539 if (svd->vp != NULL && 5540 (IS_SWAPFSVP(svd->vp) || VN_ISKAS(svd->vp))) { 5541 return (EINVAL); 5542 } 5543 5544 if (seg->s_szc == 0 && svd->vp != NULL && 5545 map_addr_vacalign_check(addr, off)) { 5546 return (EINVAL); 5547 } 5548 5549 /* 5550 * Check that protections are the same within new page 5551 * size boundaries. 5552 */ 5553 if (svd->pageprot) { 5554 for (a = addr; a < eaddr; a += pgsz) { 5555 if ((a + pgsz) > eaddr) { 5556 if (!sameprot(seg, a, eaddr - a)) { 5557 return (EINVAL); 5558 } 5559 } else { 5560 if (!sameprot(seg, a, pgsz)) { 5561 return (EINVAL); 5562 } 5563 } 5564 } 5565 } 5566 5567 /* 5568 * Since we are changing page size we first have to flush 5569 * the cache. This makes sure all the pagelock calls have 5570 * to recheck protections. 5571 */ 5572 if (svd->softlockcnt > 0) { 5573 /* 5574 * Since we do have the segvn writers lock nobody can fill 5575 * the cache with entries belonging to this seg during 5576 * the purge. The flush either succeeds or we still have 5577 * pending I/Os. 5578 */ 5579 segvn_purge(seg); 5580 if (svd->softlockcnt > 0) { 5581 return (EAGAIN); 5582 } 5583 } 5584 5585 /* 5586 * Operation for sub range of existing segment. 5587 */ 5588 if (addr != seg->s_base || eaddr != (seg->s_base + seg->s_size)) { 5589 if (szc < seg->s_szc) { 5590 VM_STAT_ADD(segvnvmstats.demoterange[2]); 5591 err = segvn_demote_range(seg, addr, len, SDR_RANGE, 0); 5592 if (err == 0) { 5593 return (IE_RETRY); 5594 } 5595 if (err == ENOMEM) { 5596 return (IE_NOMEM); 5597 } 5598 return (err); 5599 } 5600 if (addr != seg->s_base) { 5601 nseg = segvn_split_seg(seg, addr); 5602 if (eaddr != (nseg->s_base + nseg->s_size)) { 5603 /* eaddr is szc aligned */ 5604 (void) segvn_split_seg(nseg, eaddr); 5605 } 5606 return (IE_RETRY); 5607 } 5608 if (eaddr != (seg->s_base + seg->s_size)) { 5609 /* eaddr is szc aligned */ 5610 (void) segvn_split_seg(seg, eaddr); 5611 } 5612 return (IE_RETRY); 5613 } 5614 5615 /* 5616 * Break any low level sharing and reset seg->s_szc to 0. 5617 */ 5618 if ((err = segvn_clrszc(seg)) != 0) { 5619 if (err == ENOMEM) { 5620 err = IE_NOMEM; 5621 } 5622 return (err); 5623 } 5624 ASSERT(seg->s_szc == 0); 5625 5626 /* 5627 * If the end of the current segment is not pgsz aligned 5628 * then attempt to concatenate with the next segment. 5629 */ 5630 if (!IS_P2ALIGNED(eaddr, pgsz)) { 5631 nseg = AS_SEGNEXT(seg->s_as, seg); 5632 if (nseg == NULL || nseg == seg || eaddr != nseg->s_base) { 5633 return (ENOMEM); 5634 } 5635 if (nseg->s_ops != &segvn_ops) { 5636 return (EINVAL); 5637 } 5638 nsvd = (struct segvn_data *)nseg->s_data; 5639 if (nsvd->softlockcnt > 0) { 5640 segvn_purge(nseg); 5641 if (nsvd->softlockcnt > 0) { 5642 return (EAGAIN); 5643 } 5644 } 5645 err = segvn_clrszc(nseg); 5646 if (err == ENOMEM) { 5647 err = IE_NOMEM; 5648 } 5649 if (err != 0) { 5650 return (err); 5651 } 5652 err = segvn_concat(seg, nseg, 1); 5653 if (err == -1) { 5654 return (EINVAL); 5655 } 5656 if (err == -2) { 5657 return (IE_NOMEM); 5658 } 5659 return (IE_RETRY); 5660 } 5661 5662 /* 5663 * May need to re-align anon array to 5664 * new szc. 5665 */ 5666 if (amp != NULL) { 5667 if (!IS_P2ALIGNED(svd->anon_index, pgcnt)) { 5668 struct anon_hdr *nahp; 5669 5670 ASSERT(svd->type == MAP_PRIVATE); 5671 5672 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 5673 ASSERT(amp->refcnt == 1); 5674 nahp = anon_create(btop(amp->size), ANON_NOSLEEP); 5675 if (nahp == NULL) { 5676 ANON_LOCK_EXIT(&->a_rwlock); 5677 return (IE_NOMEM); 5678 } 5679 if (anon_copy_ptr(amp->ahp, svd->anon_index, 5680 nahp, 0, btop(seg->s_size), ANON_NOSLEEP)) { 5681 anon_release(nahp, btop(amp->size)); 5682 ANON_LOCK_EXIT(&->a_rwlock); 5683 return (IE_NOMEM); 5684 } 5685 anon_release(amp->ahp, btop(amp->size)); 5686 amp->ahp = nahp; 5687 svd->anon_index = 0; 5688 ANON_LOCK_EXIT(&->a_rwlock); 5689 } 5690 } 5691 if (svd->vp != NULL && szc != 0) { 5692 struct vattr va; 5693 u_offset_t eoffpage = svd->offset; 5694 va.va_mask = AT_SIZE; 5695 eoffpage += seg->s_size; 5696 eoffpage = btopr(eoffpage); 5697 if (VOP_GETATTR(svd->vp, &va, 0, svd->cred) != 0) { 5698 segvn_setpgsz_getattr_err++; 5699 return (EINVAL); 5700 } 5701 if (btopr(va.va_size) < eoffpage) { 5702 segvn_setpgsz_eof_err++; 5703 return (EINVAL); 5704 } 5705 if (amp != NULL) { 5706 /* 5707 * anon_fill_cow_holes() may call VOP_GETPAGE(). 5708 * don't take anon map lock here to avoid holding it 5709 * across VOP_GETPAGE() calls that may call back into 5710 * segvn for klsutering checks. We don't really need 5711 * anon map lock here since it's a private segment and 5712 * we hold as level lock as writers. 5713 */ 5714 if ((err = anon_fill_cow_holes(seg, seg->s_base, 5715 amp->ahp, svd->anon_index, svd->vp, svd->offset, 5716 seg->s_size, szc, svd->prot, svd->vpage, 5717 svd->cred)) != 0) { 5718 return (EINVAL); 5719 } 5720 } 5721 segvn_setvnode_mpss(svd->vp); 5722 } 5723 5724 if (amp != NULL) { 5725 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 5726 if (svd->type == MAP_PRIVATE) { 5727 amp->a_szc = szc; 5728 } else if (szc > amp->a_szc) { 5729 amp->a_szc = szc; 5730 } 5731 ANON_LOCK_EXIT(&->a_rwlock); 5732 } 5733 5734 seg->s_szc = szc; 5735 5736 return (0); 5737 } 5738 5739 static int 5740 segvn_clrszc(struct seg *seg) 5741 { 5742 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5743 struct anon_map *amp = svd->amp; 5744 size_t pgsz; 5745 pgcnt_t pages; 5746 int err = 0; 5747 caddr_t a = seg->s_base; 5748 caddr_t ea = a + seg->s_size; 5749 ulong_t an_idx = svd->anon_index; 5750 vnode_t *vp = svd->vp; 5751 struct vpage *vpage = svd->vpage; 5752 page_t *anon_pl[1 + 1], *pp; 5753 struct anon *ap, *oldap; 5754 uint_t prot = svd->prot, vpprot; 5755 5756 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) || 5757 SEGVN_WRITE_HELD(seg->s_as, &svd->lock)); 5758 5759 if (vp == NULL && amp == NULL) { 5760 seg->s_szc = 0; 5761 return (0); 5762 } 5763 5764 /* 5765 * do HAT_UNLOAD_UNMAP since we are changing the pagesize. 5766 * unload argument is 0 when we are freeing the segment 5767 * and unload was already done. 5768 */ 5769 hat_unload(seg->s_as->a_hat, seg->s_base, seg->s_size, 5770 HAT_UNLOAD_UNMAP); 5771 5772 if (amp == NULL || svd->type == MAP_SHARED) { 5773 seg->s_szc = 0; 5774 return (0); 5775 } 5776 5777 pgsz = page_get_pagesize(seg->s_szc); 5778 pages = btop(pgsz); 5779 5780 /* 5781 * XXX anon rwlock is not really needed because this is a 5782 * private segment and we are writers. 5783 */ 5784 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 5785 5786 for (; a < ea; a += pgsz, an_idx += pages) { 5787 if ((oldap = anon_get_ptr(amp->ahp, an_idx)) != NULL) { 5788 if (svd->pageprot != 0) { 5789 ASSERT(vpage != NULL); 5790 prot = VPP_PROT(vpage); 5791 ASSERT(sameprot(seg, a, pgsz)); 5792 } 5793 if (seg->s_szc != 0) { 5794 ASSERT(vp == NULL || anon_pages(amp->ahp, 5795 an_idx, pages) == pages); 5796 if ((err = anon_map_demotepages(amp, an_idx, 5797 seg, a, prot, vpage, svd->cred)) != 0) { 5798 goto out; 5799 } 5800 } else { 5801 if (oldap->an_refcnt == 1) { 5802 continue; 5803 } 5804 if ((err = anon_getpage(&oldap, &vpprot, 5805 anon_pl, PAGESIZE, seg, a, S_READ, 5806 svd->cred))) { 5807 goto out; 5808 } 5809 if ((pp = anon_private(&ap, seg, a, prot, 5810 anon_pl[0], 0, svd->cred)) == NULL) { 5811 err = ENOMEM; 5812 goto out; 5813 } 5814 anon_decref(oldap); 5815 (void) anon_set_ptr(amp->ahp, an_idx, ap, 5816 ANON_SLEEP); 5817 page_unlock(pp); 5818 } 5819 } 5820 vpage = (vpage == NULL) ? NULL : vpage + pages; 5821 } 5822 5823 amp->a_szc = 0; 5824 seg->s_szc = 0; 5825 out: 5826 ANON_LOCK_EXIT(&->a_rwlock); 5827 return (err); 5828 } 5829 5830 static int 5831 segvn_claim_pages( 5832 struct seg *seg, 5833 struct vpage *svp, 5834 u_offset_t off, 5835 ulong_t anon_idx, 5836 uint_t prot) 5837 { 5838 pgcnt_t pgcnt = page_get_pagecnt(seg->s_szc); 5839 size_t ppasize = (pgcnt + 1) * sizeof (page_t *); 5840 page_t **ppa; 5841 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5842 struct anon_map *amp = svd->amp; 5843 struct vpage *evp = svp + pgcnt; 5844 caddr_t addr = ((uintptr_t)(svp - svd->vpage) << PAGESHIFT) 5845 + seg->s_base; 5846 struct anon *ap; 5847 struct vnode *vp = svd->vp; 5848 page_t *pp; 5849 pgcnt_t pg_idx, i; 5850 int err = 0; 5851 anoff_t aoff; 5852 int anon = (amp != NULL) ? 1 : 0; 5853 5854 ASSERT(svd->type == MAP_PRIVATE); 5855 ASSERT(svd->vpage != NULL); 5856 ASSERT(seg->s_szc != 0); 5857 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 5858 ASSERT(amp == NULL || IS_P2ALIGNED(anon_idx, pgcnt)); 5859 ASSERT(sameprot(seg, addr, pgcnt << PAGESHIFT)); 5860 5861 if (VPP_PROT(svp) == prot) 5862 return (1); 5863 if (!((VPP_PROT(svp) ^ prot) & PROT_WRITE)) 5864 return (1); 5865 5866 ppa = kmem_alloc(ppasize, KM_SLEEP); 5867 if (anon && vp != NULL) { 5868 if (anon_get_ptr(amp->ahp, anon_idx) == NULL) { 5869 anon = 0; 5870 ASSERT(!anon_pages(amp->ahp, anon_idx, pgcnt)); 5871 } 5872 ASSERT(!anon || 5873 anon_pages(amp->ahp, anon_idx, pgcnt) == pgcnt); 5874 } 5875 5876 for (*ppa = NULL, pg_idx = 0; svp < evp; svp++, anon_idx++) { 5877 if (!VPP_ISPPLOCK(svp)) 5878 continue; 5879 if (anon) { 5880 ap = anon_get_ptr(amp->ahp, anon_idx); 5881 if (ap == NULL) { 5882 panic("segvn_claim_pages: no anon slot"); 5883 } 5884 swap_xlate(ap, &vp, &aoff); 5885 off = (u_offset_t)aoff; 5886 } 5887 ASSERT(vp != NULL); 5888 if ((pp = page_lookup(vp, 5889 (u_offset_t)off, SE_SHARED)) == NULL) { 5890 panic("segvn_claim_pages: no page"); 5891 } 5892 ppa[pg_idx++] = pp; 5893 off += PAGESIZE; 5894 } 5895 5896 if (ppa[0] == NULL) { 5897 kmem_free(ppa, ppasize); 5898 return (1); 5899 } 5900 5901 ASSERT(pg_idx <= pgcnt); 5902 ppa[pg_idx] = NULL; 5903 5904 if (prot & PROT_WRITE) 5905 err = page_addclaim_pages(ppa); 5906 else 5907 err = page_subclaim_pages(ppa); 5908 5909 for (i = 0; i < pg_idx; i++) { 5910 ASSERT(ppa[i] != NULL); 5911 page_unlock(ppa[i]); 5912 } 5913 5914 kmem_free(ppa, ppasize); 5915 return (err); 5916 } 5917 5918 /* 5919 * Returns right (upper address) segment if split occured. 5920 * If the address is equal to the beginning or end of its segment it returns 5921 * the current segment. 5922 */ 5923 static struct seg * 5924 segvn_split_seg(struct seg *seg, caddr_t addr) 5925 { 5926 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5927 struct seg *nseg; 5928 size_t nsize; 5929 struct segvn_data *nsvd; 5930 5931 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 5932 ASSERT(addr >= seg->s_base); 5933 ASSERT(addr <= seg->s_base + seg->s_size); 5934 5935 if (addr == seg->s_base || addr == seg->s_base + seg->s_size) 5936 return (seg); 5937 5938 nsize = seg->s_base + seg->s_size - addr; 5939 seg->s_size = addr - seg->s_base; 5940 nseg = seg_alloc(seg->s_as, addr, nsize); 5941 ASSERT(nseg != NULL); 5942 nseg->s_ops = seg->s_ops; 5943 nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 5944 nseg->s_data = (void *)nsvd; 5945 nseg->s_szc = seg->s_szc; 5946 *nsvd = *svd; 5947 rw_init(&nsvd->lock, NULL, RW_DEFAULT, NULL); 5948 5949 if (nsvd->vp != NULL) { 5950 VN_HOLD(nsvd->vp); 5951 nsvd->offset = svd->offset + 5952 (uintptr_t)(nseg->s_base - seg->s_base); 5953 if (nsvd->type == MAP_SHARED) 5954 lgrp_shm_policy_init(NULL, nsvd->vp); 5955 } else { 5956 /* 5957 * The offset for an anonymous segment has no signifigance in 5958 * terms of an offset into a file. If we were to use the above 5959 * calculation instead, the structures read out of 5960 * /proc/<pid>/xmap would be more difficult to decipher since 5961 * it would be unclear whether two seemingly contiguous 5962 * prxmap_t structures represented different segments or a 5963 * single segment that had been split up into multiple prxmap_t 5964 * structures (e.g. if some part of the segment had not yet 5965 * been faulted in). 5966 */ 5967 nsvd->offset = 0; 5968 } 5969 5970 ASSERT(svd->softlockcnt == 0); 5971 crhold(svd->cred); 5972 5973 if (svd->vpage != NULL) { 5974 size_t bytes = vpgtob(seg_pages(seg)); 5975 size_t nbytes = vpgtob(seg_pages(nseg)); 5976 struct vpage *ovpage = svd->vpage; 5977 5978 svd->vpage = kmem_alloc(bytes, KM_SLEEP); 5979 bcopy(ovpage, svd->vpage, bytes); 5980 nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP); 5981 bcopy(ovpage + seg_pages(seg), nsvd->vpage, nbytes); 5982 kmem_free(ovpage, bytes + nbytes); 5983 } 5984 if (svd->amp != NULL && svd->type == MAP_PRIVATE) { 5985 struct anon_map *oamp = svd->amp, *namp; 5986 struct anon_hdr *nahp; 5987 5988 ANON_LOCK_ENTER(&oamp->a_rwlock, RW_WRITER); 5989 ASSERT(oamp->refcnt == 1); 5990 nahp = anon_create(btop(seg->s_size), ANON_SLEEP); 5991 (void) anon_copy_ptr(oamp->ahp, svd->anon_index, 5992 nahp, 0, btop(seg->s_size), ANON_SLEEP); 5993 5994 namp = anonmap_alloc(nseg->s_size, 0); 5995 namp->a_szc = nseg->s_szc; 5996 (void) anon_copy_ptr(oamp->ahp, 5997 svd->anon_index + btop(seg->s_size), 5998 namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP); 5999 anon_release(oamp->ahp, btop(oamp->size)); 6000 oamp->ahp = nahp; 6001 oamp->size = seg->s_size; 6002 svd->anon_index = 0; 6003 nsvd->amp = namp; 6004 nsvd->anon_index = 0; 6005 ANON_LOCK_EXIT(&oamp->a_rwlock); 6006 } else if (svd->amp != NULL) { 6007 pgcnt_t pgcnt = page_get_pagecnt(seg->s_szc); 6008 ASSERT(svd->amp == nsvd->amp); 6009 ASSERT(seg->s_szc <= svd->amp->a_szc); 6010 nsvd->anon_index = svd->anon_index + seg_pages(seg); 6011 ASSERT(IS_P2ALIGNED(nsvd->anon_index, pgcnt)); 6012 ANON_LOCK_ENTER(&svd->amp->a_rwlock, RW_WRITER); 6013 svd->amp->refcnt++; 6014 ANON_LOCK_EXIT(&svd->amp->a_rwlock); 6015 } 6016 6017 /* 6018 * Split amount of swap reserve 6019 */ 6020 if (svd->swresv) { 6021 /* 6022 * For MAP_NORESERVE, only allocate swap reserve for pages 6023 * being used. Other segments get enough to cover whole 6024 * segment. 6025 */ 6026 if (svd->flags & MAP_NORESERVE) { 6027 size_t oswresv; 6028 6029 ASSERT(svd->amp); 6030 oswresv = svd->swresv; 6031 svd->swresv = ptob(anon_pages(svd->amp->ahp, 6032 svd->anon_index, btop(seg->s_size))); 6033 nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp, 6034 nsvd->anon_index, btop(nseg->s_size))); 6035 ASSERT(oswresv >= (svd->swresv + nsvd->swresv)); 6036 } else { 6037 ASSERT(svd->swresv == seg->s_size + nseg->s_size); 6038 svd->swresv = seg->s_size; 6039 nsvd->swresv = nseg->s_size; 6040 } 6041 } 6042 6043 return (nseg); 6044 } 6045 6046 /* 6047 * called on memory operations (unmap, setprot, setpagesize) for a subset 6048 * of a large page segment to either demote the memory range (SDR_RANGE) 6049 * or the ends (SDR_END) by addr/len. 6050 * 6051 * returns 0 on success. returns errno, including ENOMEM, on failure. 6052 */ 6053 static int 6054 segvn_demote_range( 6055 struct seg *seg, 6056 caddr_t addr, 6057 size_t len, 6058 int flag, 6059 uint_t szcvec) 6060 { 6061 caddr_t eaddr = addr + len; 6062 caddr_t lpgaddr, lpgeaddr; 6063 struct seg *nseg; 6064 struct seg *badseg1 = NULL; 6065 struct seg *badseg2 = NULL; 6066 size_t pgsz; 6067 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6068 int err; 6069 uint_t szc = seg->s_szc; 6070 uint_t tszcvec; 6071 6072 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 6073 ASSERT(szc != 0); 6074 pgsz = page_get_pagesize(szc); 6075 ASSERT(seg->s_base != addr || seg->s_size != len); 6076 ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size); 6077 ASSERT(svd->softlockcnt == 0); 6078 ASSERT(szcvec == 0 || (flag == SDR_END && svd->type == MAP_SHARED)); 6079 6080 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 6081 ASSERT(flag == SDR_RANGE || eaddr < lpgeaddr || addr > lpgaddr); 6082 if (flag == SDR_RANGE) { 6083 /* demote entire range */ 6084 badseg1 = nseg = segvn_split_seg(seg, lpgaddr); 6085 (void) segvn_split_seg(nseg, lpgeaddr); 6086 ASSERT(badseg1->s_base == lpgaddr); 6087 ASSERT(badseg1->s_size == lpgeaddr - lpgaddr); 6088 } else if (addr != lpgaddr) { 6089 ASSERT(flag == SDR_END); 6090 badseg1 = nseg = segvn_split_seg(seg, lpgaddr); 6091 if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz && 6092 eaddr < lpgaddr + 2 * pgsz) { 6093 (void) segvn_split_seg(nseg, lpgeaddr); 6094 ASSERT(badseg1->s_base == lpgaddr); 6095 ASSERT(badseg1->s_size == 2 * pgsz); 6096 } else { 6097 nseg = segvn_split_seg(nseg, lpgaddr + pgsz); 6098 ASSERT(badseg1->s_base == lpgaddr); 6099 ASSERT(badseg1->s_size == pgsz); 6100 if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz) { 6101 ASSERT(lpgeaddr - lpgaddr > 2 * pgsz); 6102 nseg = segvn_split_seg(nseg, lpgeaddr - pgsz); 6103 badseg2 = nseg; 6104 (void) segvn_split_seg(nseg, lpgeaddr); 6105 ASSERT(badseg2->s_base == lpgeaddr - pgsz); 6106 ASSERT(badseg2->s_size == pgsz); 6107 } 6108 } 6109 } else { 6110 ASSERT(flag == SDR_END); 6111 ASSERT(eaddr < lpgeaddr); 6112 badseg1 = nseg = segvn_split_seg(seg, lpgeaddr - pgsz); 6113 (void) segvn_split_seg(nseg, lpgeaddr); 6114 ASSERT(badseg1->s_base == lpgeaddr - pgsz); 6115 ASSERT(badseg1->s_size == pgsz); 6116 } 6117 6118 ASSERT(badseg1 != NULL); 6119 ASSERT(badseg1->s_szc == szc); 6120 ASSERT(flag == SDR_RANGE || badseg1->s_size == pgsz || 6121 badseg1->s_size == 2 * pgsz); 6122 ASSERT(sameprot(badseg1, badseg1->s_base, pgsz)); 6123 ASSERT(badseg1->s_size == pgsz || 6124 sameprot(badseg1, badseg1->s_base + pgsz, pgsz)); 6125 if (err = segvn_clrszc(badseg1)) { 6126 return (err); 6127 } 6128 ASSERT(badseg1->s_szc == 0); 6129 6130 if (szc > 1 && (tszcvec = P2PHASE(szcvec, 1 << szc)) > 1) { 6131 uint_t tszc = highbit(tszcvec) - 1; 6132 caddr_t ta = MAX(addr, badseg1->s_base); 6133 caddr_t te; 6134 size_t tpgsz = page_get_pagesize(tszc); 6135 6136 ASSERT(svd->type == MAP_SHARED); 6137 ASSERT(flag == SDR_END); 6138 ASSERT(tszc < szc && tszc > 0); 6139 6140 if (eaddr > badseg1->s_base + badseg1->s_size) { 6141 te = badseg1->s_base + badseg1->s_size; 6142 } else { 6143 te = eaddr; 6144 } 6145 6146 ASSERT(ta <= te); 6147 badseg1->s_szc = tszc; 6148 if (!IS_P2ALIGNED(ta, tpgsz) || !IS_P2ALIGNED(te, tpgsz)) { 6149 if (badseg2 != NULL) { 6150 err = segvn_demote_range(badseg1, ta, te - ta, 6151 SDR_END, tszcvec); 6152 if (err != 0) { 6153 return (err); 6154 } 6155 } else { 6156 return (segvn_demote_range(badseg1, ta, 6157 te - ta, SDR_END, tszcvec)); 6158 } 6159 } 6160 } 6161 6162 if (badseg2 == NULL) 6163 return (0); 6164 ASSERT(badseg2->s_szc == szc); 6165 ASSERT(badseg2->s_size == pgsz); 6166 ASSERT(sameprot(badseg2, badseg2->s_base, badseg2->s_size)); 6167 if (err = segvn_clrszc(badseg2)) { 6168 return (err); 6169 } 6170 ASSERT(badseg2->s_szc == 0); 6171 6172 if (szc > 1 && (tszcvec = P2PHASE(szcvec, 1 << szc)) > 1) { 6173 uint_t tszc = highbit(tszcvec) - 1; 6174 size_t tpgsz = page_get_pagesize(tszc); 6175 6176 ASSERT(svd->type == MAP_SHARED); 6177 ASSERT(flag == SDR_END); 6178 ASSERT(tszc < szc && tszc > 0); 6179 ASSERT(badseg2->s_base > addr); 6180 ASSERT(eaddr > badseg2->s_base); 6181 ASSERT(eaddr < badseg2->s_base + badseg2->s_size); 6182 6183 badseg2->s_szc = tszc; 6184 if (!IS_P2ALIGNED(eaddr, tpgsz)) { 6185 return (segvn_demote_range(badseg2, badseg2->s_base, 6186 eaddr - badseg2->s_base, SDR_END, tszcvec)); 6187 } 6188 } 6189 6190 return (0); 6191 } 6192 6193 static int 6194 segvn_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) 6195 { 6196 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6197 struct vpage *vp, *evp; 6198 6199 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6200 6201 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6202 /* 6203 * If segment protection can be used, simply check against them. 6204 */ 6205 if (svd->pageprot == 0) { 6206 int err; 6207 6208 err = ((svd->prot & prot) != prot) ? EACCES : 0; 6209 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6210 return (err); 6211 } 6212 6213 /* 6214 * Have to check down to the vpage level. 6215 */ 6216 evp = &svd->vpage[seg_page(seg, addr + len)]; 6217 for (vp = &svd->vpage[seg_page(seg, addr)]; vp < evp; vp++) { 6218 if ((VPP_PROT(vp) & prot) != prot) { 6219 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6220 return (EACCES); 6221 } 6222 } 6223 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6224 return (0); 6225 } 6226 6227 static int 6228 segvn_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv) 6229 { 6230 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6231 size_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1; 6232 6233 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6234 6235 if (pgno != 0) { 6236 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6237 if (svd->pageprot == 0) { 6238 do 6239 protv[--pgno] = svd->prot; 6240 while (pgno != 0); 6241 } else { 6242 size_t pgoff = seg_page(seg, addr); 6243 6244 do { 6245 pgno--; 6246 protv[pgno] = VPP_PROT(&svd->vpage[pgno+pgoff]); 6247 } while (pgno != 0); 6248 } 6249 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6250 } 6251 return (0); 6252 } 6253 6254 static u_offset_t 6255 segvn_getoffset(struct seg *seg, caddr_t addr) 6256 { 6257 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6258 6259 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6260 6261 return (svd->offset + (uintptr_t)(addr - seg->s_base)); 6262 } 6263 6264 /*ARGSUSED*/ 6265 static int 6266 segvn_gettype(struct seg *seg, caddr_t addr) 6267 { 6268 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6269 6270 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6271 6272 return (svd->type | (svd->flags & (MAP_NORESERVE | MAP_TEXT | 6273 MAP_INITDATA))); 6274 } 6275 6276 /*ARGSUSED*/ 6277 static int 6278 segvn_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp) 6279 { 6280 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6281 6282 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6283 6284 *vpp = svd->vp; 6285 return (0); 6286 } 6287 6288 /* 6289 * Check to see if it makes sense to do kluster/read ahead to 6290 * addr + delta relative to the mapping at addr. We assume here 6291 * that delta is a signed PAGESIZE'd multiple (which can be negative). 6292 * 6293 * For segvn, we currently "approve" of the action if we are 6294 * still in the segment and it maps from the same vp/off, 6295 * or if the advice stored in segvn_data or vpages allows it. 6296 * Currently, klustering is not allowed only if MADV_RANDOM is set. 6297 */ 6298 static int 6299 segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta) 6300 { 6301 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6302 struct anon *oap, *ap; 6303 ssize_t pd; 6304 size_t page; 6305 struct vnode *vp1, *vp2; 6306 u_offset_t off1, off2; 6307 struct anon_map *amp; 6308 6309 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6310 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) || 6311 SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 6312 6313 if (addr + delta < seg->s_base || 6314 addr + delta >= (seg->s_base + seg->s_size)) 6315 return (-1); /* exceeded segment bounds */ 6316 6317 pd = delta / (ssize_t)PAGESIZE; /* divide to preserve sign bit */ 6318 page = seg_page(seg, addr); 6319 6320 /* 6321 * Check to see if either of the pages addr or addr + delta 6322 * have advice set that prevents klustering (if MADV_RANDOM advice 6323 * is set for entire segment, or MADV_SEQUENTIAL is set and delta 6324 * is negative). 6325 */ 6326 if (svd->advice == MADV_RANDOM || 6327 svd->advice == MADV_SEQUENTIAL && delta < 0) 6328 return (-1); 6329 else if (svd->pageadvice && svd->vpage) { 6330 struct vpage *bvpp, *evpp; 6331 6332 bvpp = &svd->vpage[page]; 6333 evpp = &svd->vpage[page + pd]; 6334 if (VPP_ADVICE(bvpp) == MADV_RANDOM || 6335 VPP_ADVICE(evpp) == MADV_SEQUENTIAL && delta < 0) 6336 return (-1); 6337 if (VPP_ADVICE(bvpp) != VPP_ADVICE(evpp) && 6338 VPP_ADVICE(evpp) == MADV_RANDOM) 6339 return (-1); 6340 } 6341 6342 if (svd->type == MAP_SHARED) 6343 return (0); /* shared mapping - all ok */ 6344 6345 if ((amp = svd->amp) == NULL) 6346 return (0); /* off original vnode */ 6347 6348 page += svd->anon_index; 6349 6350 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 6351 6352 oap = anon_get_ptr(amp->ahp, page); 6353 ap = anon_get_ptr(amp->ahp, page + pd); 6354 6355 ANON_LOCK_EXIT(&->a_rwlock); 6356 6357 if ((oap == NULL && ap != NULL) || (oap != NULL && ap == NULL)) { 6358 return (-1); /* one with and one without an anon */ 6359 } 6360 6361 if (oap == NULL) { /* implies that ap == NULL */ 6362 return (0); /* off original vnode */ 6363 } 6364 6365 /* 6366 * Now we know we have two anon pointers - check to 6367 * see if they happen to be properly allocated. 6368 */ 6369 6370 /* 6371 * XXX We cheat here and don't lock the anon slots. We can't because 6372 * we may have been called from the anon layer which might already 6373 * have locked them. We are holding a refcnt on the slots so they 6374 * can't disappear. The worst that will happen is we'll get the wrong 6375 * names (vp, off) for the slots and make a poor klustering decision. 6376 */ 6377 swap_xlate(ap, &vp1, &off1); 6378 swap_xlate(oap, &vp2, &off2); 6379 6380 6381 if (!VOP_CMP(vp1, vp2) || off1 - off2 != delta) 6382 return (-1); 6383 return (0); 6384 } 6385 6386 /* 6387 * Swap the pages of seg out to secondary storage, returning the 6388 * number of bytes of storage freed. 6389 * 6390 * The basic idea is first to unload all translations and then to call 6391 * VOP_PUTPAGE() for all newly-unmapped pages, to push them out to the 6392 * swap device. Pages to which other segments have mappings will remain 6393 * mapped and won't be swapped. Our caller (as_swapout) has already 6394 * performed the unloading step. 6395 * 6396 * The value returned is intended to correlate well with the process's 6397 * memory requirements. However, there are some caveats: 6398 * 1) When given a shared segment as argument, this routine will 6399 * only succeed in swapping out pages for the last sharer of the 6400 * segment. (Previous callers will only have decremented mapping 6401 * reference counts.) 6402 * 2) We assume that the hat layer maintains a large enough translation 6403 * cache to capture process reference patterns. 6404 */ 6405 static size_t 6406 segvn_swapout(struct seg *seg) 6407 { 6408 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6409 struct anon_map *amp; 6410 pgcnt_t pgcnt = 0; 6411 pgcnt_t npages; 6412 pgcnt_t page; 6413 ulong_t anon_index; 6414 6415 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6416 6417 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6418 /* 6419 * Find pages unmapped by our caller and force them 6420 * out to the virtual swap device. 6421 */ 6422 if ((amp = svd->amp) != NULL) 6423 anon_index = svd->anon_index; 6424 npages = seg->s_size >> PAGESHIFT; 6425 for (page = 0; page < npages; page++) { 6426 page_t *pp; 6427 struct anon *ap; 6428 struct vnode *vp; 6429 u_offset_t off; 6430 anon_sync_obj_t cookie; 6431 6432 /* 6433 * Obtain <vp, off> pair for the page, then look it up. 6434 * 6435 * Note that this code is willing to consider regular 6436 * pages as well as anon pages. Is this appropriate here? 6437 */ 6438 ap = NULL; 6439 if (amp != NULL) { 6440 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 6441 if (anon_array_try_enter(amp, anon_index + page, 6442 &cookie)) { 6443 ANON_LOCK_EXIT(&->a_rwlock); 6444 continue; 6445 } 6446 ap = anon_get_ptr(amp->ahp, anon_index + page); 6447 if (ap != NULL) { 6448 swap_xlate(ap, &vp, &off); 6449 } else { 6450 vp = svd->vp; 6451 off = svd->offset + ptob(page); 6452 } 6453 anon_array_exit(&cookie); 6454 ANON_LOCK_EXIT(&->a_rwlock); 6455 } else { 6456 vp = svd->vp; 6457 off = svd->offset + ptob(page); 6458 } 6459 if (vp == NULL) { /* untouched zfod page */ 6460 ASSERT(ap == NULL); 6461 continue; 6462 } 6463 6464 pp = page_lookup_nowait(vp, off, SE_SHARED); 6465 if (pp == NULL) 6466 continue; 6467 6468 6469 /* 6470 * Examine the page to see whether it can be tossed out, 6471 * keeping track of how many we've found. 6472 */ 6473 if (!page_tryupgrade(pp)) { 6474 /* 6475 * If the page has an i/o lock and no mappings, 6476 * it's very likely that the page is being 6477 * written out as a result of klustering. 6478 * Assume this is so and take credit for it here. 6479 */ 6480 if (!page_io_trylock(pp)) { 6481 if (!hat_page_is_mapped(pp)) 6482 pgcnt++; 6483 } else { 6484 page_io_unlock(pp); 6485 } 6486 page_unlock(pp); 6487 continue; 6488 } 6489 ASSERT(!page_iolock_assert(pp)); 6490 6491 6492 /* 6493 * Skip if page is locked or has mappings. 6494 * We don't need the page_struct_lock to look at lckcnt 6495 * and cowcnt because the page is exclusive locked. 6496 */ 6497 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0 || 6498 hat_page_is_mapped(pp)) { 6499 page_unlock(pp); 6500 continue; 6501 } 6502 6503 /* 6504 * dispose skips large pages so try to demote first. 6505 */ 6506 if (pp->p_szc != 0 && !page_try_demote_pages(pp)) { 6507 page_unlock(pp); 6508 /* 6509 * XXX should skip the remaining page_t's of this 6510 * large page. 6511 */ 6512 continue; 6513 } 6514 6515 ASSERT(pp->p_szc == 0); 6516 6517 /* 6518 * No longer mapped -- we can toss it out. How 6519 * we do so depends on whether or not it's dirty. 6520 */ 6521 if (hat_ismod(pp) && pp->p_vnode) { 6522 /* 6523 * We must clean the page before it can be 6524 * freed. Setting B_FREE will cause pvn_done 6525 * to free the page when the i/o completes. 6526 * XXX: This also causes it to be accounted 6527 * as a pageout instead of a swap: need 6528 * B_SWAPOUT bit to use instead of B_FREE. 6529 * 6530 * Hold the vnode before releasing the page lock 6531 * to prevent it from being freed and re-used by 6532 * some other thread. 6533 */ 6534 VN_HOLD(vp); 6535 page_unlock(pp); 6536 6537 /* 6538 * Queue all i/o requests for the pageout thread 6539 * to avoid saturating the pageout devices. 6540 */ 6541 if (!queue_io_request(vp, off)) 6542 VN_RELE(vp); 6543 } else { 6544 /* 6545 * The page was clean, free it. 6546 * 6547 * XXX: Can we ever encounter modified pages 6548 * with no associated vnode here? 6549 */ 6550 ASSERT(pp->p_vnode != NULL); 6551 /*LINTED: constant in conditional context*/ 6552 VN_DISPOSE(pp, B_FREE, 0, kcred); 6553 } 6554 6555 /* 6556 * Credit now even if i/o is in progress. 6557 */ 6558 pgcnt++; 6559 } 6560 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6561 6562 /* 6563 * Wakeup pageout to initiate i/o on all queued requests. 6564 */ 6565 cv_signal_pageout(); 6566 return (ptob(pgcnt)); 6567 } 6568 6569 /* 6570 * Synchronize primary storage cache with real object in virtual memory. 6571 * 6572 * XXX - Anonymous pages should not be sync'ed out at all. 6573 */ 6574 static int 6575 segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) 6576 { 6577 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6578 struct vpage *vpp; 6579 page_t *pp; 6580 u_offset_t offset; 6581 struct vnode *vp; 6582 u_offset_t off; 6583 caddr_t eaddr; 6584 int bflags; 6585 int err = 0; 6586 int segtype; 6587 int pageprot; 6588 int prot; 6589 ulong_t anon_index; 6590 struct anon_map *amp; 6591 struct anon *ap; 6592 anon_sync_obj_t cookie; 6593 6594 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6595 6596 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6597 6598 if (svd->softlockcnt > 0) { 6599 /* 6600 * flush all pages from seg cache 6601 * otherwise we may deadlock in swap_putpage 6602 * for B_INVAL page (4175402). 6603 * 6604 * Even if we grab segvn WRITER's lock or segp_slock 6605 * here, there might be another thread which could've 6606 * successfully performed lookup/insert just before 6607 * we acquired the lock here. So, grabbing either 6608 * lock here is of not much use. Until we devise 6609 * a strategy at upper layers to solve the 6610 * synchronization issues completely, we expect 6611 * applications to handle this appropriately. 6612 */ 6613 segvn_purge(seg); 6614 if (svd->softlockcnt > 0) { 6615 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6616 return (EAGAIN); 6617 } 6618 } 6619 6620 vpp = svd->vpage; 6621 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 6622 bflags = ((flags & MS_ASYNC) ? B_ASYNC : 0) | 6623 ((flags & MS_INVALIDATE) ? B_INVAL : 0); 6624 6625 if (attr) { 6626 pageprot = attr & ~(SHARED|PRIVATE); 6627 segtype = (attr & SHARED) ? MAP_SHARED : MAP_PRIVATE; 6628 6629 /* 6630 * We are done if the segment types don't match 6631 * or if we have segment level protections and 6632 * they don't match. 6633 */ 6634 if (svd->type != segtype) { 6635 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6636 return (0); 6637 } 6638 if (vpp == NULL) { 6639 if (svd->prot != pageprot) { 6640 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6641 return (0); 6642 } 6643 prot = svd->prot; 6644 } else 6645 vpp = &svd->vpage[seg_page(seg, addr)]; 6646 6647 } else if (svd->vp && svd->amp == NULL && 6648 (flags & MS_INVALIDATE) == 0) { 6649 6650 /* 6651 * No attributes, no anonymous pages and MS_INVALIDATE flag 6652 * is not on, just use one big request. 6653 */ 6654 err = VOP_PUTPAGE(svd->vp, (offset_t)offset, len, 6655 bflags, svd->cred); 6656 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6657 return (err); 6658 } 6659 6660 if ((amp = svd->amp) != NULL) 6661 anon_index = svd->anon_index + seg_page(seg, addr); 6662 6663 for (eaddr = addr + len; addr < eaddr; addr += PAGESIZE) { 6664 ap = NULL; 6665 if (amp != NULL) { 6666 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 6667 anon_array_enter(amp, anon_index, &cookie); 6668 ap = anon_get_ptr(amp->ahp, anon_index++); 6669 if (ap != NULL) { 6670 swap_xlate(ap, &vp, &off); 6671 } else { 6672 vp = svd->vp; 6673 off = offset; 6674 } 6675 anon_array_exit(&cookie); 6676 ANON_LOCK_EXIT(&->a_rwlock); 6677 } else { 6678 vp = svd->vp; 6679 off = offset; 6680 } 6681 offset += PAGESIZE; 6682 6683 if (vp == NULL) /* untouched zfod page */ 6684 continue; 6685 6686 if (attr) { 6687 if (vpp) { 6688 prot = VPP_PROT(vpp); 6689 vpp++; 6690 } 6691 if (prot != pageprot) { 6692 continue; 6693 } 6694 } 6695 6696 /* 6697 * See if any of these pages are locked -- if so, then we 6698 * will have to truncate an invalidate request at the first 6699 * locked one. We don't need the page_struct_lock to test 6700 * as this is only advisory; even if we acquire it someone 6701 * might race in and lock the page after we unlock and before 6702 * we do the PUTPAGE, then PUTPAGE simply does nothing. 6703 */ 6704 if (flags & MS_INVALIDATE) { 6705 if ((pp = page_lookup(vp, off, SE_SHARED)) != NULL) { 6706 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 6707 page_unlock(pp); 6708 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6709 return (EBUSY); 6710 } 6711 if (ap != NULL && pp->p_szc != 0 && 6712 page_tryupgrade(pp)) { 6713 if (pp->p_lckcnt == 0 && 6714 pp->p_cowcnt == 0) { 6715 /* 6716 * swapfs VN_DISPOSE() won't 6717 * invalidate large pages. 6718 * Attempt to demote. 6719 * XXX can't help it if it 6720 * fails. But for swapfs 6721 * pages it is no big deal. 6722 */ 6723 (void) page_try_demote_pages( 6724 pp); 6725 } 6726 } 6727 page_unlock(pp); 6728 } 6729 } else if (svd->type == MAP_SHARED && amp != NULL) { 6730 /* 6731 * Avoid writting out to disk ISM's large pages 6732 * because segspt_free_pages() relies on NULL an_pvp 6733 * of anon slots of such pages. 6734 */ 6735 6736 ASSERT(svd->vp == NULL); 6737 /* 6738 * swapfs uses page_lookup_nowait if not freeing or 6739 * invalidating and skips a page if 6740 * page_lookup_nowait returns NULL. 6741 */ 6742 pp = page_lookup_nowait(vp, off, SE_SHARED); 6743 if (pp == NULL) { 6744 continue; 6745 } 6746 if (pp->p_szc != 0) { 6747 page_unlock(pp); 6748 continue; 6749 } 6750 6751 /* 6752 * Note ISM pages are created large so (vp, off)'s 6753 * page cannot suddenly become large after we unlock 6754 * pp. 6755 */ 6756 page_unlock(pp); 6757 } 6758 /* 6759 * XXX - Should ultimately try to kluster 6760 * calls to VOP_PUTPAGE() for performance. 6761 */ 6762 VN_HOLD(vp); 6763 err = VOP_PUTPAGE(vp, (offset_t)off, PAGESIZE, 6764 bflags, svd->cred); 6765 VN_RELE(vp); 6766 if (err) 6767 break; 6768 } 6769 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6770 return (err); 6771 } 6772 6773 /* 6774 * Determine if we have data corresponding to pages in the 6775 * primary storage virtual memory cache (i.e., "in core"). 6776 */ 6777 static size_t 6778 segvn_incore(struct seg *seg, caddr_t addr, size_t len, char *vec) 6779 { 6780 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6781 struct vnode *vp, *avp; 6782 u_offset_t offset, aoffset; 6783 size_t p, ep; 6784 int ret; 6785 struct vpage *vpp; 6786 page_t *pp; 6787 uint_t start; 6788 struct anon_map *amp; /* XXX - for locknest */ 6789 struct anon *ap; 6790 uint_t attr; 6791 anon_sync_obj_t cookie; 6792 6793 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6794 6795 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6796 if (svd->amp == NULL && svd->vp == NULL) { 6797 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6798 bzero(vec, btopr(len)); 6799 return (len); /* no anonymous pages created yet */ 6800 } 6801 6802 p = seg_page(seg, addr); 6803 ep = seg_page(seg, addr + len); 6804 start = svd->vp ? SEG_PAGE_VNODEBACKED : 0; 6805 6806 amp = svd->amp; 6807 for (; p < ep; p++, addr += PAGESIZE) { 6808 vpp = (svd->vpage) ? &svd->vpage[p]: NULL; 6809 ret = start; 6810 ap = NULL; 6811 avp = NULL; 6812 /* Grab the vnode/offset for the anon slot */ 6813 if (amp != NULL) { 6814 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 6815 anon_array_enter(amp, svd->anon_index + p, &cookie); 6816 ap = anon_get_ptr(amp->ahp, svd->anon_index + p); 6817 if (ap != NULL) { 6818 swap_xlate(ap, &avp, &aoffset); 6819 } 6820 anon_array_exit(&cookie); 6821 ANON_LOCK_EXIT(&->a_rwlock); 6822 } 6823 if ((avp != NULL) && page_exists(avp, aoffset)) { 6824 /* A page exists for the anon slot */ 6825 ret |= SEG_PAGE_INCORE; 6826 6827 /* 6828 * If page is mapped and writable 6829 */ 6830 attr = (uint_t)0; 6831 if ((hat_getattr(seg->s_as->a_hat, addr, 6832 &attr) != -1) && (attr & PROT_WRITE)) { 6833 ret |= SEG_PAGE_ANON; 6834 } 6835 /* 6836 * Don't get page_struct lock for lckcnt and cowcnt, 6837 * since this is purely advisory. 6838 */ 6839 if ((pp = page_lookup_nowait(avp, aoffset, 6840 SE_SHARED)) != NULL) { 6841 if (pp->p_lckcnt) 6842 ret |= SEG_PAGE_SOFTLOCK; 6843 if (pp->p_cowcnt) 6844 ret |= SEG_PAGE_HASCOW; 6845 page_unlock(pp); 6846 } 6847 } 6848 6849 /* Gather vnode statistics */ 6850 vp = svd->vp; 6851 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 6852 6853 if (vp != NULL) { 6854 /* 6855 * Try to obtain a "shared" lock on the page 6856 * without blocking. If this fails, determine 6857 * if the page is in memory. 6858 */ 6859 pp = page_lookup_nowait(vp, offset, SE_SHARED); 6860 if ((pp == NULL) && (page_exists(vp, offset))) { 6861 /* Page is incore, and is named */ 6862 ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE); 6863 } 6864 /* 6865 * Don't get page_struct lock for lckcnt and cowcnt, 6866 * since this is purely advisory. 6867 */ 6868 if (pp != NULL) { 6869 ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE); 6870 if (pp->p_lckcnt) 6871 ret |= SEG_PAGE_SOFTLOCK; 6872 if (pp->p_cowcnt) 6873 ret |= SEG_PAGE_HASCOW; 6874 page_unlock(pp); 6875 } 6876 } 6877 6878 /* Gather virtual page information */ 6879 if (vpp) { 6880 if (VPP_ISPPLOCK(vpp)) 6881 ret |= SEG_PAGE_LOCKED; 6882 vpp++; 6883 } 6884 6885 *vec++ = (char)ret; 6886 } 6887 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6888 return (len); 6889 } 6890 6891 /* 6892 * Statement for p_cowcnts/p_lckcnts. 6893 * 6894 * p_cowcnt is updated while mlock/munlocking MAP_PRIVATE and PROT_WRITE region 6895 * irrespective of the following factors or anything else: 6896 * 6897 * (1) anon slots are populated or not 6898 * (2) cow is broken or not 6899 * (3) refcnt on ap is 1 or greater than 1 6900 * 6901 * If it's not MAP_PRIVATE and PROT_WRITE, p_lckcnt is updated during mlock 6902 * and munlock. 6903 * 6904 * 6905 * Handling p_cowcnts/p_lckcnts during copy-on-write fault: 6906 * 6907 * if vpage has PROT_WRITE 6908 * transfer cowcnt on the oldpage -> cowcnt on the newpage 6909 * else 6910 * transfer lckcnt on the oldpage -> lckcnt on the newpage 6911 * 6912 * During copy-on-write, decrement p_cowcnt on the oldpage and increment 6913 * p_cowcnt on the newpage *if* the corresponding vpage has PROT_WRITE. 6914 * 6915 * We may also break COW if softlocking on read access in the physio case. 6916 * In this case, vpage may not have PROT_WRITE. So, we need to decrement 6917 * p_lckcnt on the oldpage and increment p_lckcnt on the newpage *if* the 6918 * vpage doesn't have PROT_WRITE. 6919 * 6920 * 6921 * Handling p_cowcnts/p_lckcnts during mprotect on mlocked region: 6922 * 6923 * If a MAP_PRIVATE region loses PROT_WRITE, we decrement p_cowcnt and 6924 * increment p_lckcnt by calling page_subclaim() which takes care of 6925 * availrmem accounting and p_lckcnt overflow. 6926 * 6927 * If a MAP_PRIVATE region gains PROT_WRITE, we decrement p_lckcnt and 6928 * increment p_cowcnt by calling page_addclaim() which takes care of 6929 * availrmem availability and p_cowcnt overflow. 6930 */ 6931 6932 /* 6933 * Lock down (or unlock) pages mapped by this segment. 6934 * 6935 * XXX only creates PAGESIZE pages if anon slots are not initialized. 6936 * At fault time they will be relocated into larger pages. 6937 */ 6938 static int 6939 segvn_lockop(struct seg *seg, caddr_t addr, size_t len, 6940 int attr, int op, ulong_t *lockmap, size_t pos) 6941 { 6942 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6943 struct vpage *vpp; 6944 struct vpage *evp; 6945 page_t *pp; 6946 u_offset_t offset; 6947 u_offset_t off; 6948 int segtype; 6949 int pageprot; 6950 int claim; 6951 struct vnode *vp; 6952 ulong_t anon_index; 6953 struct anon_map *amp; 6954 struct anon *ap; 6955 struct vattr va; 6956 anon_sync_obj_t cookie; 6957 struct kshmid *sp = NULL; 6958 struct proc *p = curproc; 6959 kproject_t *proj = NULL; 6960 int chargeproc = 1; 6961 size_t locked_bytes = 0; 6962 size_t unlocked_bytes = 0; 6963 int err = 0; 6964 6965 /* 6966 * Hold write lock on address space because may split or concatenate 6967 * segments 6968 */ 6969 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6970 6971 /* 6972 * If this is a shm, use shm's project and zone, else use 6973 * project and zone of calling process 6974 */ 6975 6976 /* Determine if this segment backs a sysV shm */ 6977 if (svd->amp != NULL && svd->amp->a_sp != NULL) { 6978 sp = svd->amp->a_sp; 6979 proj = sp->shm_perm.ipc_proj; 6980 chargeproc = 0; 6981 } 6982 6983 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 6984 if (attr) { 6985 pageprot = attr & ~(SHARED|PRIVATE); 6986 segtype = attr & SHARED ? MAP_SHARED : MAP_PRIVATE; 6987 6988 /* 6989 * We are done if the segment types don't match 6990 * or if we have segment level protections and 6991 * they don't match. 6992 */ 6993 if (svd->type != segtype) { 6994 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6995 return (0); 6996 } 6997 if (svd->pageprot == 0 && svd->prot != pageprot) { 6998 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6999 return (0); 7000 } 7001 } 7002 7003 /* 7004 * If we're locking, then we must create a vpage structure if 7005 * none exists. If we're unlocking, then check to see if there 7006 * is a vpage -- if not, then we could not have locked anything. 7007 */ 7008 7009 if ((vpp = svd->vpage) == NULL) { 7010 if (op == MC_LOCK) 7011 segvn_vpage(seg); 7012 else { 7013 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7014 return (0); 7015 } 7016 } 7017 7018 /* 7019 * The anonymous data vector (i.e., previously 7020 * unreferenced mapping to swap space) can be allocated 7021 * by lazily testing for its existence. 7022 */ 7023 if (op == MC_LOCK && svd->amp == NULL && svd->vp == NULL) { 7024 svd->amp = anonmap_alloc(seg->s_size, 0); 7025 svd->amp->a_szc = seg->s_szc; 7026 } 7027 7028 if ((amp = svd->amp) != NULL) { 7029 anon_index = svd->anon_index + seg_page(seg, addr); 7030 } 7031 7032 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 7033 evp = &svd->vpage[seg_page(seg, addr + len)]; 7034 7035 if (sp != NULL) 7036 mutex_enter(&sp->shm_mlock); 7037 7038 /* determine number of unlocked bytes in range for lock operation */ 7039 if (op == MC_LOCK) { 7040 7041 if (sp == NULL) { 7042 for (vpp = &svd->vpage[seg_page(seg, addr)]; vpp < evp; 7043 vpp++) { 7044 if (!VPP_ISPPLOCK(vpp)) 7045 unlocked_bytes += PAGESIZE; 7046 } 7047 } else { 7048 ulong_t i_idx, i_edx; 7049 anon_sync_obj_t i_cookie; 7050 struct anon *i_ap; 7051 struct vnode *i_vp; 7052 u_offset_t i_off; 7053 7054 /* Only count sysV pages once for locked memory */ 7055 i_edx = svd->anon_index + seg_page(seg, addr + len); 7056 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7057 for (i_idx = anon_index; i_idx < i_edx; i_idx++) { 7058 anon_array_enter(amp, i_idx, &i_cookie); 7059 i_ap = anon_get_ptr(amp->ahp, i_idx); 7060 if (i_ap == NULL) { 7061 unlocked_bytes += PAGESIZE; 7062 anon_array_exit(&i_cookie); 7063 continue; 7064 } 7065 swap_xlate(i_ap, &i_vp, &i_off); 7066 anon_array_exit(&i_cookie); 7067 pp = page_lookup(i_vp, i_off, SE_SHARED); 7068 if (pp == NULL) { 7069 unlocked_bytes += PAGESIZE; 7070 continue; 7071 } else if (pp->p_lckcnt == 0) 7072 unlocked_bytes += PAGESIZE; 7073 page_unlock(pp); 7074 } 7075 ANON_LOCK_EXIT(&->a_rwlock); 7076 } 7077 7078 mutex_enter(&p->p_lock); 7079 err = rctl_incr_locked_mem(p, proj, unlocked_bytes, 7080 chargeproc); 7081 mutex_exit(&p->p_lock); 7082 7083 if (err) { 7084 if (sp != NULL) 7085 mutex_exit(&sp->shm_mlock); 7086 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7087 return (err); 7088 } 7089 } 7090 /* 7091 * Loop over all pages in the range. Process if we're locking and 7092 * page has not already been locked in this mapping; or if we're 7093 * unlocking and the page has been locked. 7094 */ 7095 for (vpp = &svd->vpage[seg_page(seg, addr)]; vpp < evp; 7096 vpp++, pos++, addr += PAGESIZE, offset += PAGESIZE, anon_index++) { 7097 if ((attr == 0 || VPP_PROT(vpp) == pageprot) && 7098 ((op == MC_LOCK && !VPP_ISPPLOCK(vpp)) || 7099 (op == MC_UNLOCK && VPP_ISPPLOCK(vpp)))) { 7100 7101 if (amp != NULL) 7102 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7103 /* 7104 * If this isn't a MAP_NORESERVE segment and 7105 * we're locking, allocate anon slots if they 7106 * don't exist. The page is brought in later on. 7107 */ 7108 if (op == MC_LOCK && svd->vp == NULL && 7109 ((svd->flags & MAP_NORESERVE) == 0) && 7110 amp != NULL && 7111 ((ap = anon_get_ptr(amp->ahp, anon_index)) 7112 == NULL)) { 7113 anon_array_enter(amp, anon_index, &cookie); 7114 7115 if ((ap = anon_get_ptr(amp->ahp, 7116 anon_index)) == NULL) { 7117 pp = anon_zero(seg, addr, &ap, 7118 svd->cred); 7119 if (pp == NULL) { 7120 anon_array_exit(&cookie); 7121 ANON_LOCK_EXIT(&->a_rwlock); 7122 err = ENOMEM; 7123 goto out; 7124 } 7125 ASSERT(anon_get_ptr(amp->ahp, 7126 anon_index) == NULL); 7127 (void) anon_set_ptr(amp->ahp, 7128 anon_index, ap, ANON_SLEEP); 7129 page_unlock(pp); 7130 } 7131 anon_array_exit(&cookie); 7132 } 7133 7134 /* 7135 * Get name for page, accounting for 7136 * existence of private copy. 7137 */ 7138 ap = NULL; 7139 if (amp != NULL) { 7140 anon_array_enter(amp, anon_index, &cookie); 7141 ap = anon_get_ptr(amp->ahp, anon_index); 7142 if (ap != NULL) { 7143 swap_xlate(ap, &vp, &off); 7144 } else { 7145 if (svd->vp == NULL && 7146 (svd->flags & MAP_NORESERVE)) { 7147 anon_array_exit(&cookie); 7148 ANON_LOCK_EXIT(&->a_rwlock); 7149 continue; 7150 } 7151 vp = svd->vp; 7152 off = offset; 7153 } 7154 anon_array_exit(&cookie); 7155 ANON_LOCK_EXIT(&->a_rwlock); 7156 } else { 7157 vp = svd->vp; 7158 off = offset; 7159 } 7160 7161 /* 7162 * Get page frame. It's ok if the page is 7163 * not available when we're unlocking, as this 7164 * may simply mean that a page we locked got 7165 * truncated out of existence after we locked it. 7166 * 7167 * Invoke VOP_GETPAGE() to obtain the page struct 7168 * since we may need to read it from disk if its 7169 * been paged out. 7170 */ 7171 if (op != MC_LOCK) 7172 pp = page_lookup(vp, off, SE_SHARED); 7173 else { 7174 page_t *pl[1 + 1]; 7175 int error; 7176 7177 ASSERT(vp != NULL); 7178 7179 error = VOP_GETPAGE(vp, (offset_t)off, PAGESIZE, 7180 (uint_t *)NULL, pl, PAGESIZE, seg, addr, 7181 S_OTHER, svd->cred); 7182 7183 /* 7184 * If the error is EDEADLK then we must bounce 7185 * up and drop all vm subsystem locks and then 7186 * retry the operation later 7187 * This behavior is a temporary measure because 7188 * ufs/sds logging is badly designed and will 7189 * deadlock if we don't allow this bounce to 7190 * happen. The real solution is to re-design 7191 * the logging code to work properly. See bug 7192 * 4125102 for details of the problem. 7193 */ 7194 if (error == EDEADLK) { 7195 err = error; 7196 goto out; 7197 } 7198 /* 7199 * Quit if we fail to fault in the page. Treat 7200 * the failure as an error, unless the addr 7201 * is mapped beyond the end of a file. 7202 */ 7203 if (error && svd->vp) { 7204 va.va_mask = AT_SIZE; 7205 if (VOP_GETATTR(svd->vp, &va, 0, 7206 svd->cred) != 0) { 7207 err = EIO; 7208 goto out; 7209 } 7210 if (btopr(va.va_size) >= 7211 btopr(off + 1)) { 7212 err = EIO; 7213 goto out; 7214 } 7215 goto out; 7216 7217 } else if (error) { 7218 err = EIO; 7219 goto out; 7220 } 7221 pp = pl[0]; 7222 ASSERT(pp != NULL); 7223 } 7224 7225 /* 7226 * See Statement at the beginning of this routine. 7227 * 7228 * claim is always set if MAP_PRIVATE and PROT_WRITE 7229 * irrespective of following factors: 7230 * 7231 * (1) anon slots are populated or not 7232 * (2) cow is broken or not 7233 * (3) refcnt on ap is 1 or greater than 1 7234 * 7235 * See 4140683 for details 7236 */ 7237 claim = ((VPP_PROT(vpp) & PROT_WRITE) && 7238 (svd->type == MAP_PRIVATE)); 7239 7240 /* 7241 * Perform page-level operation appropriate to 7242 * operation. If locking, undo the SOFTLOCK 7243 * performed to bring the page into memory 7244 * after setting the lock. If unlocking, 7245 * and no page was found, account for the claim 7246 * separately. 7247 */ 7248 if (op == MC_LOCK) { 7249 int ret = 1; /* Assume success */ 7250 7251 ASSERT(!VPP_ISPPLOCK(vpp)); 7252 7253 ret = page_pp_lock(pp, claim, 0); 7254 if (ret == 0) { 7255 /* locking page failed */ 7256 page_unlock(pp); 7257 err = EAGAIN; 7258 goto out; 7259 } 7260 VPP_SETPPLOCK(vpp); 7261 if (sp != NULL) { 7262 if (pp->p_lckcnt == 1) 7263 locked_bytes += PAGESIZE; 7264 } else 7265 locked_bytes += PAGESIZE; 7266 7267 if (lockmap != (ulong_t *)NULL) 7268 BT_SET(lockmap, pos); 7269 7270 page_unlock(pp); 7271 } else { 7272 ASSERT(VPP_ISPPLOCK(vpp)); 7273 if (pp != NULL) { 7274 /* sysV pages should be locked */ 7275 ASSERT(sp == NULL || pp->p_lckcnt > 0); 7276 page_pp_unlock(pp, claim, 0); 7277 if (sp != NULL) { 7278 if (pp->p_lckcnt == 0) 7279 unlocked_bytes 7280 += PAGESIZE; 7281 } else 7282 unlocked_bytes += PAGESIZE; 7283 page_unlock(pp); 7284 } else { 7285 ASSERT(sp == NULL); 7286 unlocked_bytes += PAGESIZE; 7287 } 7288 VPP_CLRPPLOCK(vpp); 7289 } 7290 } 7291 } 7292 out: 7293 if (op == MC_LOCK) { 7294 /* Credit back bytes that did not get locked */ 7295 if ((unlocked_bytes - locked_bytes) > 0) { 7296 if (proj == NULL) 7297 mutex_enter(&p->p_lock); 7298 rctl_decr_locked_mem(p, proj, 7299 (unlocked_bytes - locked_bytes), chargeproc); 7300 if (proj == NULL) 7301 mutex_exit(&p->p_lock); 7302 } 7303 7304 } else { 7305 /* Account bytes that were unlocked */ 7306 if (unlocked_bytes > 0) { 7307 if (proj == NULL) 7308 mutex_enter(&p->p_lock); 7309 rctl_decr_locked_mem(p, proj, unlocked_bytes, 7310 chargeproc); 7311 if (proj == NULL) 7312 mutex_exit(&p->p_lock); 7313 } 7314 } 7315 if (sp != NULL) 7316 mutex_exit(&sp->shm_mlock); 7317 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7318 7319 return (err); 7320 } 7321 7322 /* 7323 * Set advice from user for specified pages 7324 * There are 5 types of advice: 7325 * MADV_NORMAL - Normal (default) behavior (whatever that is) 7326 * MADV_RANDOM - Random page references 7327 * do not allow readahead or 'klustering' 7328 * MADV_SEQUENTIAL - Sequential page references 7329 * Pages previous to the one currently being 7330 * accessed (determined by fault) are 'not needed' 7331 * and are freed immediately 7332 * MADV_WILLNEED - Pages are likely to be used (fault ahead in mctl) 7333 * MADV_DONTNEED - Pages are not needed (synced out in mctl) 7334 * MADV_FREE - Contents can be discarded 7335 * MADV_ACCESS_DEFAULT- Default access 7336 * MADV_ACCESS_LWP - Next LWP will access heavily 7337 * MADV_ACCESS_MANY- Many LWPs or processes will access heavily 7338 */ 7339 static int 7340 segvn_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav) 7341 { 7342 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7343 size_t page; 7344 int err = 0; 7345 int already_set; 7346 struct anon_map *amp; 7347 ulong_t anon_index; 7348 struct seg *next; 7349 lgrp_mem_policy_t policy; 7350 struct seg *prev; 7351 struct vnode *vp; 7352 7353 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 7354 7355 /* 7356 * In case of MADV_FREE, we won't be modifying any segment private 7357 * data structures; so, we only need to grab READER's lock 7358 */ 7359 if (behav != MADV_FREE) 7360 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 7361 else 7362 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7363 7364 /* 7365 * Large pages are assumed to be only turned on when accesses to the 7366 * segment's address range have spatial and temporal locality. That 7367 * justifies ignoring MADV_SEQUENTIAL for large page segments. 7368 * Also, ignore advice affecting lgroup memory allocation 7369 * if don't need to do lgroup optimizations on this system 7370 */ 7371 7372 if ((behav == MADV_SEQUENTIAL && seg->s_szc != 0) || 7373 (!lgrp_optimizations() && (behav == MADV_ACCESS_DEFAULT || 7374 behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY))) { 7375 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7376 return (0); 7377 } 7378 7379 if (behav == MADV_SEQUENTIAL || behav == MADV_ACCESS_DEFAULT || 7380 behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY) { 7381 /* 7382 * Since we are going to unload hat mappings 7383 * we first have to flush the cache. Otherwise 7384 * this might lead to system panic if another 7385 * thread is doing physio on the range whose 7386 * mappings are unloaded by madvise(3C). 7387 */ 7388 if (svd->softlockcnt > 0) { 7389 /* 7390 * Since we do have the segvn writers lock 7391 * nobody can fill the cache with entries 7392 * belonging to this seg during the purge. 7393 * The flush either succeeds or we still 7394 * have pending I/Os. In the later case, 7395 * madvise(3C) fails. 7396 */ 7397 segvn_purge(seg); 7398 if (svd->softlockcnt > 0) { 7399 /* 7400 * Since madvise(3C) is advisory and 7401 * it's not part of UNIX98, madvise(3C) 7402 * failure here doesn't cause any hardship. 7403 * Note that we don't block in "as" layer. 7404 */ 7405 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7406 return (EAGAIN); 7407 } 7408 } 7409 } 7410 7411 amp = svd->amp; 7412 vp = svd->vp; 7413 if (behav == MADV_FREE) { 7414 /* 7415 * MADV_FREE is not supported for segments with 7416 * underlying object; if anonmap is NULL, anon slots 7417 * are not yet populated and there is nothing for 7418 * us to do. As MADV_FREE is advisory, we don't 7419 * return error in either case. 7420 */ 7421 if (vp || amp == NULL) { 7422 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7423 return (0); 7424 } 7425 7426 page = seg_page(seg, addr); 7427 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7428 anon_disclaim(amp, svd->anon_index + page, len, 0); 7429 ANON_LOCK_EXIT(&->a_rwlock); 7430 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7431 return (0); 7432 } 7433 7434 /* 7435 * If advice is to be applied to entire segment, 7436 * use advice field in seg_data structure 7437 * otherwise use appropriate vpage entry. 7438 */ 7439 if ((addr == seg->s_base) && (len == seg->s_size)) { 7440 switch (behav) { 7441 case MADV_ACCESS_LWP: 7442 case MADV_ACCESS_MANY: 7443 case MADV_ACCESS_DEFAULT: 7444 /* 7445 * Set memory allocation policy for this segment 7446 */ 7447 policy = lgrp_madv_to_policy(behav, len, svd->type); 7448 if (svd->type == MAP_SHARED) 7449 already_set = lgrp_shm_policy_set(policy, amp, 7450 svd->anon_index, vp, svd->offset, len); 7451 else { 7452 /* 7453 * For private memory, need writers lock on 7454 * address space because the segment may be 7455 * split or concatenated when changing policy 7456 */ 7457 if (AS_READ_HELD(seg->s_as, 7458 &seg->s_as->a_lock)) { 7459 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7460 return (IE_RETRY); 7461 } 7462 7463 already_set = lgrp_privm_policy_set(policy, 7464 &svd->policy_info, len); 7465 } 7466 7467 /* 7468 * If policy set already and it shouldn't be reapplied, 7469 * don't do anything. 7470 */ 7471 if (already_set && 7472 !LGRP_MEM_POLICY_REAPPLICABLE(policy)) 7473 break; 7474 7475 /* 7476 * Mark any existing pages in given range for 7477 * migration 7478 */ 7479 page_mark_migrate(seg, addr, len, amp, svd->anon_index, 7480 vp, svd->offset, 1); 7481 7482 /* 7483 * If same policy set already or this is a shared 7484 * memory segment, don't need to try to concatenate 7485 * segment with adjacent ones. 7486 */ 7487 if (already_set || svd->type == MAP_SHARED) 7488 break; 7489 7490 /* 7491 * Try to concatenate this segment with previous 7492 * one and next one, since we changed policy for 7493 * this one and it may be compatible with adjacent 7494 * ones now. 7495 */ 7496 prev = AS_SEGPREV(seg->s_as, seg); 7497 next = AS_SEGNEXT(seg->s_as, seg); 7498 7499 if (next && next->s_ops == &segvn_ops && 7500 addr + len == next->s_base) 7501 (void) segvn_concat(seg, next, 1); 7502 7503 if (prev && prev->s_ops == &segvn_ops && 7504 addr == prev->s_base + prev->s_size) { 7505 /* 7506 * Drop lock for private data of current 7507 * segment before concatenating (deleting) it 7508 * and return IE_REATTACH to tell as_ctl() that 7509 * current segment has changed 7510 */ 7511 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7512 if (!segvn_concat(prev, seg, 1)) 7513 err = IE_REATTACH; 7514 7515 return (err); 7516 } 7517 break; 7518 7519 case MADV_SEQUENTIAL: 7520 /* 7521 * unloading mapping guarantees 7522 * detection in segvn_fault 7523 */ 7524 ASSERT(seg->s_szc == 0); 7525 hat_unload(seg->s_as->a_hat, addr, len, 7526 HAT_UNLOAD); 7527 /* FALLTHROUGH */ 7528 case MADV_NORMAL: 7529 case MADV_RANDOM: 7530 svd->advice = (uchar_t)behav; 7531 svd->pageadvice = 0; 7532 break; 7533 case MADV_WILLNEED: /* handled in memcntl */ 7534 case MADV_DONTNEED: /* handled in memcntl */ 7535 case MADV_FREE: /* handled above */ 7536 break; 7537 default: 7538 err = EINVAL; 7539 } 7540 } else { 7541 caddr_t eaddr; 7542 struct seg *new_seg; 7543 struct segvn_data *new_svd; 7544 u_offset_t off; 7545 caddr_t oldeaddr; 7546 7547 page = seg_page(seg, addr); 7548 7549 segvn_vpage(seg); 7550 7551 switch (behav) { 7552 struct vpage *bvpp, *evpp; 7553 7554 case MADV_ACCESS_LWP: 7555 case MADV_ACCESS_MANY: 7556 case MADV_ACCESS_DEFAULT: 7557 /* 7558 * Set memory allocation policy for portion of this 7559 * segment 7560 */ 7561 7562 /* 7563 * Align address and length of advice to page 7564 * boundaries for large pages 7565 */ 7566 if (seg->s_szc != 0) { 7567 size_t pgsz; 7568 7569 pgsz = page_get_pagesize(seg->s_szc); 7570 addr = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 7571 len = P2ROUNDUP(len, pgsz); 7572 } 7573 7574 /* 7575 * Check to see whether policy is set already 7576 */ 7577 policy = lgrp_madv_to_policy(behav, len, svd->type); 7578 7579 anon_index = svd->anon_index + page; 7580 off = svd->offset + (uintptr_t)(addr - seg->s_base); 7581 7582 if (svd->type == MAP_SHARED) 7583 already_set = lgrp_shm_policy_set(policy, amp, 7584 anon_index, vp, off, len); 7585 else 7586 already_set = 7587 (policy == svd->policy_info.mem_policy); 7588 7589 /* 7590 * If policy set already and it shouldn't be reapplied, 7591 * don't do anything. 7592 */ 7593 if (already_set && 7594 !LGRP_MEM_POLICY_REAPPLICABLE(policy)) 7595 break; 7596 7597 /* 7598 * For private memory, need writers lock on 7599 * address space because the segment may be 7600 * split or concatenated when changing policy 7601 */ 7602 if (svd->type == MAP_PRIVATE && 7603 AS_READ_HELD(seg->s_as, &seg->s_as->a_lock)) { 7604 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7605 return (IE_RETRY); 7606 } 7607 7608 /* 7609 * Mark any existing pages in given range for 7610 * migration 7611 */ 7612 page_mark_migrate(seg, addr, len, amp, svd->anon_index, 7613 vp, svd->offset, 1); 7614 7615 /* 7616 * Don't need to try to split or concatenate 7617 * segments, since policy is same or this is a shared 7618 * memory segment 7619 */ 7620 if (already_set || svd->type == MAP_SHARED) 7621 break; 7622 7623 /* 7624 * Split off new segment if advice only applies to a 7625 * portion of existing segment starting in middle 7626 */ 7627 new_seg = NULL; 7628 eaddr = addr + len; 7629 oldeaddr = seg->s_base + seg->s_size; 7630 if (addr > seg->s_base) { 7631 /* 7632 * Must flush I/O page cache 7633 * before splitting segment 7634 */ 7635 if (svd->softlockcnt > 0) 7636 segvn_purge(seg); 7637 7638 /* 7639 * Split segment and return IE_REATTACH to tell 7640 * as_ctl() that current segment changed 7641 */ 7642 new_seg = segvn_split_seg(seg, addr); 7643 new_svd = (struct segvn_data *)new_seg->s_data; 7644 err = IE_REATTACH; 7645 7646 /* 7647 * If new segment ends where old one 7648 * did, try to concatenate the new 7649 * segment with next one. 7650 */ 7651 if (eaddr == oldeaddr) { 7652 /* 7653 * Set policy for new segment 7654 */ 7655 (void) lgrp_privm_policy_set(policy, 7656 &new_svd->policy_info, 7657 new_seg->s_size); 7658 7659 next = AS_SEGNEXT(new_seg->s_as, 7660 new_seg); 7661 7662 if (next && 7663 next->s_ops == &segvn_ops && 7664 eaddr == next->s_base) 7665 (void) segvn_concat(new_seg, 7666 next, 1); 7667 } 7668 } 7669 7670 /* 7671 * Split off end of existing segment if advice only 7672 * applies to a portion of segment ending before 7673 * end of the existing segment 7674 */ 7675 if (eaddr < oldeaddr) { 7676 /* 7677 * Must flush I/O page cache 7678 * before splitting segment 7679 */ 7680 if (svd->softlockcnt > 0) 7681 segvn_purge(seg); 7682 7683 /* 7684 * If beginning of old segment was already 7685 * split off, use new segment to split end off 7686 * from. 7687 */ 7688 if (new_seg != NULL && new_seg != seg) { 7689 /* 7690 * Split segment 7691 */ 7692 (void) segvn_split_seg(new_seg, eaddr); 7693 7694 /* 7695 * Set policy for new segment 7696 */ 7697 (void) lgrp_privm_policy_set(policy, 7698 &new_svd->policy_info, 7699 new_seg->s_size); 7700 } else { 7701 /* 7702 * Split segment and return IE_REATTACH 7703 * to tell as_ctl() that current 7704 * segment changed 7705 */ 7706 (void) segvn_split_seg(seg, eaddr); 7707 err = IE_REATTACH; 7708 7709 (void) lgrp_privm_policy_set(policy, 7710 &svd->policy_info, seg->s_size); 7711 7712 /* 7713 * If new segment starts where old one 7714 * did, try to concatenate it with 7715 * previous segment. 7716 */ 7717 if (addr == seg->s_base) { 7718 prev = AS_SEGPREV(seg->s_as, 7719 seg); 7720 7721 /* 7722 * Drop lock for private data 7723 * of current segment before 7724 * concatenating (deleting) it 7725 */ 7726 if (prev && 7727 prev->s_ops == 7728 &segvn_ops && 7729 addr == prev->s_base + 7730 prev->s_size) { 7731 SEGVN_LOCK_EXIT( 7732 seg->s_as, 7733 &svd->lock); 7734 (void) segvn_concat( 7735 prev, seg, 1); 7736 return (err); 7737 } 7738 } 7739 } 7740 } 7741 break; 7742 case MADV_SEQUENTIAL: 7743 ASSERT(seg->s_szc == 0); 7744 hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD); 7745 /* FALLTHROUGH */ 7746 case MADV_NORMAL: 7747 case MADV_RANDOM: 7748 bvpp = &svd->vpage[page]; 7749 evpp = &svd->vpage[page + (len >> PAGESHIFT)]; 7750 for (; bvpp < evpp; bvpp++) 7751 VPP_SETADVICE(bvpp, behav); 7752 svd->advice = MADV_NORMAL; 7753 break; 7754 case MADV_WILLNEED: /* handled in memcntl */ 7755 case MADV_DONTNEED: /* handled in memcntl */ 7756 case MADV_FREE: /* handled above */ 7757 break; 7758 default: 7759 err = EINVAL; 7760 } 7761 } 7762 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7763 return (err); 7764 } 7765 7766 /* 7767 * Create a vpage structure for this seg. 7768 */ 7769 static void 7770 segvn_vpage(struct seg *seg) 7771 { 7772 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7773 struct vpage *vp, *evp; 7774 7775 ASSERT(SEGVN_WRITE_HELD(seg->s_as, &svd->lock)); 7776 7777 /* 7778 * If no vpage structure exists, allocate one. Copy the protections 7779 * and the advice from the segment itself to the individual pages. 7780 */ 7781 if (svd->vpage == NULL) { 7782 svd->pageprot = 1; 7783 svd->pageadvice = 1; 7784 svd->vpage = kmem_zalloc(seg_pages(seg) * sizeof (struct vpage), 7785 KM_SLEEP); 7786 evp = &svd->vpage[seg_page(seg, seg->s_base + seg->s_size)]; 7787 for (vp = svd->vpage; vp < evp; vp++) { 7788 VPP_SETPROT(vp, svd->prot); 7789 VPP_SETADVICE(vp, svd->advice); 7790 } 7791 } 7792 } 7793 7794 /* 7795 * Dump the pages belonging to this segvn segment. 7796 */ 7797 static void 7798 segvn_dump(struct seg *seg) 7799 { 7800 struct segvn_data *svd; 7801 page_t *pp; 7802 struct anon_map *amp; 7803 ulong_t anon_index; 7804 struct vnode *vp; 7805 u_offset_t off, offset; 7806 pfn_t pfn; 7807 pgcnt_t page, npages; 7808 caddr_t addr; 7809 7810 npages = seg_pages(seg); 7811 svd = (struct segvn_data *)seg->s_data; 7812 vp = svd->vp; 7813 off = offset = svd->offset; 7814 addr = seg->s_base; 7815 7816 if ((amp = svd->amp) != NULL) { 7817 anon_index = svd->anon_index; 7818 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7819 } 7820 7821 for (page = 0; page < npages; page++, offset += PAGESIZE) { 7822 struct anon *ap; 7823 int we_own_it = 0; 7824 7825 if (amp && (ap = anon_get_ptr(svd->amp->ahp, anon_index++))) { 7826 swap_xlate_nopanic(ap, &vp, &off); 7827 } else { 7828 vp = svd->vp; 7829 off = offset; 7830 } 7831 7832 /* 7833 * If pp == NULL, the page either does not exist 7834 * or is exclusively locked. So determine if it 7835 * exists before searching for it. 7836 */ 7837 7838 if ((pp = page_lookup_nowait(vp, off, SE_SHARED))) 7839 we_own_it = 1; 7840 else 7841 pp = page_exists(vp, off); 7842 7843 if (pp) { 7844 pfn = page_pptonum(pp); 7845 dump_addpage(seg->s_as, addr, pfn); 7846 if (we_own_it) 7847 page_unlock(pp); 7848 } 7849 addr += PAGESIZE; 7850 dump_timeleft = dump_timeout; 7851 } 7852 7853 if (amp != NULL) 7854 ANON_LOCK_EXIT(&->a_rwlock); 7855 } 7856 7857 /* 7858 * lock/unlock anon pages over a given range. Return shadow list 7859 */ 7860 static int 7861 segvn_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***ppp, 7862 enum lock_type type, enum seg_rw rw) 7863 { 7864 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7865 size_t np, adjustpages = 0, npages = (len >> PAGESHIFT); 7866 ulong_t anon_index; 7867 uint_t protchk; 7868 uint_t error; 7869 struct anon_map *amp; 7870 struct page **pplist, **pl, *pp; 7871 caddr_t a; 7872 size_t page; 7873 caddr_t lpgaddr, lpgeaddr; 7874 pgcnt_t szc0_npages = 0; 7875 7876 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_START, 7877 "segvn_pagelock: start seg %p addr %p", seg, addr); 7878 7879 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 7880 if (seg->s_szc != 0 && (type == L_PAGELOCK || type == L_PAGEUNLOCK)) { 7881 /* 7882 * We are adjusting the pagelock region to the large page size 7883 * boundary because the unlocked part of a large page cannot 7884 * be freed anyway unless all constituent pages of a large 7885 * page are locked. Therefore this adjustment allows us to 7886 * decrement availrmem by the right value (note we don't want 7887 * to just decrement availrem by the large page size without 7888 * adjusting addr and len because then we may end up 7889 * decrementing availrmem by large page size for every 7890 * constituent page locked by a new as_pagelock call). 7891 * as_pageunlock caller must always match as_pagelock call's 7892 * addr and len. 7893 * 7894 * Note segment's page size cannot change while we are holding 7895 * as lock. And then it cannot change while softlockcnt is 7896 * not 0. This will allow us to correctly recalculate large 7897 * page size region for the matching pageunlock/reclaim call. 7898 * 7899 * for pageunlock *ppp points to the pointer of page_t that 7900 * corresponds to the real unadjusted start address. Similar 7901 * for pagelock *ppp must point to the pointer of page_t that 7902 * corresponds to the real unadjusted start address. 7903 */ 7904 size_t pgsz = page_get_pagesize(seg->s_szc); 7905 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 7906 adjustpages = ((uintptr_t)(addr - lpgaddr)) >> PAGESHIFT; 7907 } 7908 7909 if (type == L_PAGEUNLOCK) { 7910 7911 /* 7912 * update hat ref bits for /proc. We need to make sure 7913 * that threads tracing the ref and mod bits of the 7914 * address space get the right data. 7915 * Note: page ref and mod bits are updated at reclaim time 7916 */ 7917 if (seg->s_as->a_vbits) { 7918 for (a = addr; a < addr + len; a += PAGESIZE) { 7919 if (rw == S_WRITE) { 7920 hat_setstat(seg->s_as, a, 7921 PAGESIZE, P_REF | P_MOD); 7922 } else { 7923 hat_setstat(seg->s_as, a, 7924 PAGESIZE, P_REF); 7925 } 7926 } 7927 } 7928 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7929 if (seg->s_szc != 0) { 7930 VM_STAT_ADD(segvnvmstats.pagelock[0]); 7931 seg_pinactive(seg, lpgaddr, lpgeaddr - lpgaddr, 7932 *ppp - adjustpages, rw, segvn_reclaim); 7933 } else { 7934 seg_pinactive(seg, addr, len, *ppp, rw, segvn_reclaim); 7935 } 7936 7937 /* 7938 * If someone is blocked while unmapping, we purge 7939 * segment page cache and thus reclaim pplist synchronously 7940 * without waiting for seg_pasync_thread. This speeds up 7941 * unmapping in cases where munmap(2) is called, while 7942 * raw async i/o is still in progress or where a thread 7943 * exits on data fault in a multithreaded application. 7944 */ 7945 if (AS_ISUNMAPWAIT(seg->s_as) && (svd->softlockcnt > 0)) { 7946 /* 7947 * Even if we grab segvn WRITER's lock or segp_slock 7948 * here, there might be another thread which could've 7949 * successfully performed lookup/insert just before 7950 * we acquired the lock here. So, grabbing either 7951 * lock here is of not much use. Until we devise 7952 * a strategy at upper layers to solve the 7953 * synchronization issues completely, we expect 7954 * applications to handle this appropriately. 7955 */ 7956 segvn_purge(seg); 7957 } 7958 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7959 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END, 7960 "segvn_pagelock: unlock seg %p addr %p", seg, addr); 7961 return (0); 7962 } else if (type == L_PAGERECLAIM) { 7963 VM_STAT_COND_ADD(seg->s_szc != 0, segvnvmstats.pagelock[1]); 7964 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7965 (void) segvn_reclaim(seg, addr, len, *ppp, rw); 7966 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7967 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END, 7968 "segvn_pagelock: reclaim seg %p addr %p", seg, addr); 7969 return (0); 7970 } 7971 7972 if (seg->s_szc != 0) { 7973 VM_STAT_ADD(segvnvmstats.pagelock[2]); 7974 addr = lpgaddr; 7975 len = lpgeaddr - lpgaddr; 7976 npages = (len >> PAGESHIFT); 7977 } 7978 7979 /* 7980 * for now we only support pagelock to anon memory. We've to check 7981 * protections for vnode objects and call into the vnode driver. 7982 * That's too much for a fast path. Let the fault entry point handle it. 7983 */ 7984 if (svd->vp != NULL) { 7985 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, 7986 "segvn_pagelock: mapped vnode seg %p addr %p", seg, addr); 7987 *ppp = NULL; 7988 return (ENOTSUP); 7989 } 7990 7991 /* 7992 * if anonmap is not yet created, let the fault entry point populate it 7993 * with anon ptrs. 7994 */ 7995 if ((amp = svd->amp) == NULL) { 7996 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, 7997 "segvn_pagelock: anonmap null seg %p addr %p", seg, addr); 7998 *ppp = NULL; 7999 return (EFAULT); 8000 } 8001 8002 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 8003 8004 /* 8005 * we acquire segp_slock to prevent duplicate entries 8006 * in seg_pcache 8007 */ 8008 mutex_enter(&svd->segp_slock); 8009 8010 /* 8011 * try to find pages in segment page cache 8012 */ 8013 pplist = seg_plookup(seg, addr, len, rw); 8014 if (pplist != NULL) { 8015 mutex_exit(&svd->segp_slock); 8016 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8017 *ppp = pplist + adjustpages; 8018 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_HIT_END, 8019 "segvn_pagelock: cache hit seg %p addr %p", seg, addr); 8020 return (0); 8021 } 8022 8023 if (rw == S_READ) { 8024 protchk = PROT_READ; 8025 } else { 8026 protchk = PROT_WRITE; 8027 } 8028 8029 if (svd->pageprot == 0) { 8030 if ((svd->prot & protchk) == 0) { 8031 mutex_exit(&svd->segp_slock); 8032 error = EFAULT; 8033 goto out; 8034 } 8035 } else { 8036 /* 8037 * check page protections 8038 */ 8039 for (a = addr; a < addr + len; a += PAGESIZE) { 8040 struct vpage *vp; 8041 8042 vp = &svd->vpage[seg_page(seg, a)]; 8043 if ((VPP_PROT(vp) & protchk) == 0) { 8044 mutex_exit(&svd->segp_slock); 8045 error = EFAULT; 8046 goto out; 8047 } 8048 } 8049 } 8050 8051 /* 8052 * Avoid per page overhead of segvn_pp_lock_anonpages() for small 8053 * pages. For large pages segvn_pp_lock_anonpages() only does real 8054 * work once per large page. The tradeoff is that we may decrement 8055 * availrmem more than once for the same page but this is ok 8056 * for small pages. 8057 */ 8058 if (seg->s_szc == 0) { 8059 mutex_enter(&freemem_lock); 8060 if (availrmem < tune.t_minarmem + npages) { 8061 mutex_exit(&freemem_lock); 8062 mutex_exit(&svd->segp_slock); 8063 error = ENOMEM; 8064 goto out; 8065 } 8066 availrmem -= npages; 8067 mutex_exit(&freemem_lock); 8068 } 8069 8070 pplist = kmem_alloc(sizeof (page_t *) * npages, KM_SLEEP); 8071 pl = pplist; 8072 *ppp = pplist + adjustpages; 8073 8074 page = seg_page(seg, addr); 8075 anon_index = svd->anon_index + page; 8076 8077 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 8078 for (a = addr; a < addr + len; a += PAGESIZE, anon_index++) { 8079 struct anon *ap; 8080 struct vnode *vp; 8081 u_offset_t off; 8082 anon_sync_obj_t cookie; 8083 8084 anon_array_enter(amp, anon_index, &cookie); 8085 ap = anon_get_ptr(amp->ahp, anon_index); 8086 if (ap == NULL) { 8087 anon_array_exit(&cookie); 8088 break; 8089 } else { 8090 /* 8091 * We must never use seg_pcache for COW pages 8092 * because we might end up with original page still 8093 * lying in seg_pcache even after private page is 8094 * created. This leads to data corruption as 8095 * aio_write refers to the page still in cache 8096 * while all other accesses refer to the private 8097 * page. 8098 */ 8099 if (ap->an_refcnt != 1) { 8100 anon_array_exit(&cookie); 8101 break; 8102 } 8103 } 8104 swap_xlate(ap, &vp, &off); 8105 anon_array_exit(&cookie); 8106 8107 pp = page_lookup_nowait(vp, off, SE_SHARED); 8108 if (pp == NULL) { 8109 break; 8110 } 8111 if (seg->s_szc != 0 || pp->p_szc != 0) { 8112 if (!segvn_pp_lock_anonpages(pp, a == addr)) { 8113 page_unlock(pp); 8114 break; 8115 } 8116 } else { 8117 szc0_npages++; 8118 } 8119 *pplist++ = pp; 8120 } 8121 ANON_LOCK_EXIT(&->a_rwlock); 8122 8123 ASSERT(npages >= szc0_npages); 8124 8125 if (a >= addr + len) { 8126 mutex_enter(&freemem_lock); 8127 if (seg->s_szc == 0 && npages != szc0_npages) { 8128 ASSERT(svd->type == MAP_SHARED && amp->a_szc > 0); 8129 availrmem += (npages - szc0_npages); 8130 } 8131 svd->softlockcnt += npages; 8132 segvn_pages_locked += npages; 8133 mutex_exit(&freemem_lock); 8134 (void) seg_pinsert(seg, addr, len, pl, rw, SEGP_ASYNC_FLUSH, 8135 segvn_reclaim); 8136 mutex_exit(&svd->segp_slock); 8137 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8138 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_FILL_END, 8139 "segvn_pagelock: cache fill seg %p addr %p", seg, addr); 8140 return (0); 8141 } 8142 8143 mutex_exit(&svd->segp_slock); 8144 if (seg->s_szc == 0) { 8145 mutex_enter(&freemem_lock); 8146 availrmem += npages; 8147 mutex_exit(&freemem_lock); 8148 } 8149 error = EFAULT; 8150 pplist = pl; 8151 np = ((uintptr_t)(a - addr)) >> PAGESHIFT; 8152 while (np > (uint_t)0) { 8153 ASSERT(PAGE_LOCKED(*pplist)); 8154 if (seg->s_szc != 0 || (*pplist)->p_szc != 0) { 8155 segvn_pp_unlock_anonpages(*pplist, pplist == pl); 8156 } 8157 page_unlock(*pplist); 8158 np--; 8159 pplist++; 8160 } 8161 kmem_free(pl, sizeof (page_t *) * npages); 8162 out: 8163 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8164 *ppp = NULL; 8165 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, 8166 "segvn_pagelock: cache miss seg %p addr %p", seg, addr); 8167 return (error); 8168 } 8169 8170 /* 8171 * purge any cached pages in the I/O page cache 8172 */ 8173 static void 8174 segvn_purge(struct seg *seg) 8175 { 8176 seg_ppurge(seg); 8177 } 8178 8179 static int 8180 segvn_reclaim(struct seg *seg, caddr_t addr, size_t len, struct page **pplist, 8181 enum seg_rw rw) 8182 { 8183 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 8184 pgcnt_t np, npages; 8185 struct page **pl; 8186 pgcnt_t szc0_npages = 0; 8187 8188 #ifdef lint 8189 addr = addr; 8190 #endif 8191 8192 npages = np = (len >> PAGESHIFT); 8193 ASSERT(npages); 8194 pl = pplist; 8195 if (seg->s_szc != 0) { 8196 size_t pgsz = page_get_pagesize(seg->s_szc); 8197 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { 8198 panic("segvn_reclaim: unaligned addr or len"); 8199 /*NOTREACHED*/ 8200 } 8201 } 8202 8203 ASSERT(svd->vp == NULL && svd->amp != NULL); 8204 8205 while (np > (uint_t)0) { 8206 if (rw == S_WRITE) { 8207 hat_setrefmod(*pplist); 8208 } else { 8209 hat_setref(*pplist); 8210 } 8211 if (seg->s_szc != 0 || (*pplist)->p_szc != 0) { 8212 segvn_pp_unlock_anonpages(*pplist, pplist == pl); 8213 } else { 8214 szc0_npages++; 8215 } 8216 page_unlock(*pplist); 8217 np--; 8218 pplist++; 8219 } 8220 kmem_free(pl, sizeof (page_t *) * npages); 8221 8222 mutex_enter(&freemem_lock); 8223 segvn_pages_locked -= npages; 8224 svd->softlockcnt -= npages; 8225 if (szc0_npages != 0) { 8226 availrmem += szc0_npages; 8227 } 8228 mutex_exit(&freemem_lock); 8229 if (svd->softlockcnt <= 0) { 8230 if (AS_ISUNMAPWAIT(seg->s_as)) { 8231 mutex_enter(&seg->s_as->a_contents); 8232 if (AS_ISUNMAPWAIT(seg->s_as)) { 8233 AS_CLRUNMAPWAIT(seg->s_as); 8234 cv_broadcast(&seg->s_as->a_cv); 8235 } 8236 mutex_exit(&seg->s_as->a_contents); 8237 } 8238 } 8239 return (0); 8240 } 8241 /* 8242 * get a memory ID for an addr in a given segment 8243 * 8244 * XXX only creates PAGESIZE pages if anon slots are not initialized. 8245 * At fault time they will be relocated into larger pages. 8246 */ 8247 static int 8248 segvn_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp) 8249 { 8250 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 8251 struct anon *ap = NULL; 8252 ulong_t anon_index; 8253 struct anon_map *amp; 8254 anon_sync_obj_t cookie; 8255 8256 if (svd->type == MAP_PRIVATE) { 8257 memidp->val[0] = (uintptr_t)seg->s_as; 8258 memidp->val[1] = (uintptr_t)addr; 8259 return (0); 8260 } 8261 8262 if (svd->type == MAP_SHARED) { 8263 if (svd->vp) { 8264 memidp->val[0] = (uintptr_t)svd->vp; 8265 memidp->val[1] = (u_longlong_t)svd->offset + 8266 (uintptr_t)(addr - seg->s_base); 8267 return (0); 8268 } else { 8269 8270 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 8271 if ((amp = svd->amp) != NULL) { 8272 anon_index = svd->anon_index + 8273 seg_page(seg, addr); 8274 } 8275 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8276 8277 ASSERT(amp != NULL); 8278 8279 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 8280 anon_array_enter(amp, anon_index, &cookie); 8281 ap = anon_get_ptr(amp->ahp, anon_index); 8282 if (ap == NULL) { 8283 page_t *pp; 8284 8285 pp = anon_zero(seg, addr, &ap, svd->cred); 8286 if (pp == NULL) { 8287 anon_array_exit(&cookie); 8288 ANON_LOCK_EXIT(&->a_rwlock); 8289 return (ENOMEM); 8290 } 8291 ASSERT(anon_get_ptr(amp->ahp, anon_index) 8292 == NULL); 8293 (void) anon_set_ptr(amp->ahp, anon_index, 8294 ap, ANON_SLEEP); 8295 page_unlock(pp); 8296 } 8297 8298 anon_array_exit(&cookie); 8299 ANON_LOCK_EXIT(&->a_rwlock); 8300 8301 memidp->val[0] = (uintptr_t)ap; 8302 memidp->val[1] = (uintptr_t)addr & PAGEOFFSET; 8303 return (0); 8304 } 8305 } 8306 return (EINVAL); 8307 } 8308 8309 static int 8310 sameprot(struct seg *seg, caddr_t a, size_t len) 8311 { 8312 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 8313 struct vpage *vpage; 8314 spgcnt_t pages = btop(len); 8315 uint_t prot; 8316 8317 if (svd->pageprot == 0) 8318 return (1); 8319 8320 ASSERT(svd->vpage != NULL); 8321 8322 vpage = &svd->vpage[seg_page(seg, a)]; 8323 prot = VPP_PROT(vpage); 8324 vpage++; 8325 pages--; 8326 while (pages-- > 0) { 8327 if (prot != VPP_PROT(vpage)) 8328 return (0); 8329 vpage++; 8330 } 8331 return (1); 8332 } 8333 8334 /* 8335 * Get memory allocation policy info for specified address in given segment 8336 */ 8337 static lgrp_mem_policy_info_t * 8338 segvn_getpolicy(struct seg *seg, caddr_t addr) 8339 { 8340 struct anon_map *amp; 8341 ulong_t anon_index; 8342 lgrp_mem_policy_info_t *policy_info; 8343 struct segvn_data *svn_data; 8344 u_offset_t vn_off; 8345 vnode_t *vp; 8346 8347 ASSERT(seg != NULL); 8348 8349 svn_data = (struct segvn_data *)seg->s_data; 8350 if (svn_data == NULL) 8351 return (NULL); 8352 8353 /* 8354 * Get policy info for private or shared memory 8355 */ 8356 if (svn_data->type != MAP_SHARED) 8357 policy_info = &svn_data->policy_info; 8358 else { 8359 amp = svn_data->amp; 8360 anon_index = svn_data->anon_index + seg_page(seg, addr); 8361 vp = svn_data->vp; 8362 vn_off = svn_data->offset + (uintptr_t)(addr - seg->s_base); 8363 policy_info = lgrp_shm_policy_get(amp, anon_index, vp, vn_off); 8364 } 8365 8366 return (policy_info); 8367 } 8368 8369 /*ARGSUSED*/ 8370 static int 8371 segvn_capable(struct seg *seg, segcapability_t capability) 8372 { 8373 return (0); 8374 } 8375