1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 37 */ 38 39 #pragma ident "%Z%%M% %I% %E% SMI" 40 41 /* 42 * VM - shared or copy-on-write from a vnode/anonymous memory. 
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/debug.h>
#include <sys/cred.h>
#include <sys/vmsystm.h>
#include <sys/tuneable.h>
#include <sys/bitmap.h>
#include <sys/swap.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/vtrace.h>
#include <sys/cmn_err.h>
#include <sys/vm.h>
#include <sys/dumphdr.h>
#include <sys/lgrp.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_vn.h>
#include <vm/pvn.h>
#include <vm/anon.h>
#include <vm/page.h>
#include <vm/vpage.h>
#include <sys/proc.h>
#include <sys/task.h>
#include <sys/project.h>
#include <sys/zone.h>
#include <sys/shm_impl.h>
/*
 * Private seg op routines.
 *
 * These are declared in exactly the order in which they are placed in
 * the positional initializer of segvn_ops below.
 */
static int	segvn_dup(struct seg *seg, struct seg *newseg);
static int	segvn_unmap(struct seg *seg, caddr_t addr, size_t len);
static void	segvn_free(struct seg *seg);
static faultcode_t segvn_fault(struct hat *hat, struct seg *seg,
		    caddr_t addr, size_t len, enum fault_type type,
		    enum seg_rw rw);
static faultcode_t segvn_faulta(struct seg *seg, caddr_t addr);
static int	segvn_setprot(struct seg *seg, caddr_t addr,
		    size_t len, uint_t prot);
static int	segvn_checkprot(struct seg *seg, caddr_t addr,
		    size_t len, uint_t prot);
static int	segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta);
static size_t	segvn_swapout(struct seg *seg);
static int	segvn_sync(struct seg *seg, caddr_t addr, size_t len,
		    int attr, uint_t flags);
static size_t	segvn_incore(struct seg *seg, caddr_t addr, size_t len,
		    char *vec);
static int	segvn_lockop(struct seg *seg, caddr_t addr, size_t len,
		    int attr, int op, ulong_t *lockmap, size_t pos);
static int	segvn_getprot(struct seg *seg, caddr_t addr, size_t len,
		    uint_t *protv);
static u_offset_t	segvn_getoffset(struct seg *seg, caddr_t addr);
static int	segvn_gettype(struct seg *seg, caddr_t addr);
static int	segvn_getvp(struct seg *seg, caddr_t addr,
		    struct vnode **vpp);
static int	segvn_advise(struct seg *seg, caddr_t addr, size_t len,
		    uint_t behav);
static void	segvn_dump(struct seg *seg);
static int	segvn_pagelock(struct seg *seg, caddr_t addr, size_t len,
		    struct page ***ppp, enum lock_type type, enum seg_rw rw);
static int	segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len,
		    uint_t szc);
static int	segvn_getmemid(struct seg *seg, caddr_t addr,
		    memid_t *memidp);
static lgrp_mem_policy_info_t	*segvn_getpolicy(struct seg *, caddr_t);
static int	segvn_capable(struct seg *seg, segcapability_t capable);

/*
 * Operations vector installed in seg->s_ops for every segvn segment.
 * The initializer is positional: the order of the entries must match
 * the member order of struct seg_ops (declared outside this file), so
 * do not reorder, insert or remove entries here in isolation.
 */
struct	seg_ops segvn_ops = {
	segvn_dup,
	segvn_unmap,
	segvn_free,
	segvn_fault,
	segvn_faulta,
	segvn_setprot,
	segvn_checkprot,
	segvn_kluster,
	segvn_swapout,
	segvn_sync,
	segvn_incore,
	segvn_lockop,
	segvn_getprot,
	segvn_getoffset,
	segvn_gettype,
	segvn_getvp,
	segvn_advise,
	segvn_dump,
	segvn_pagelock,
	segvn_setpagesize,
	segvn_getmemid,
	segvn_getpolicy,
	segvn_capable,
};

/*
 * Common zfod structures, provided as a shorthand for others to use.
 *
 * Three canned argument sets are built with SEGVN_ZFOD_ARGS(prot, maxprot):
 * the user variant, a kernel variant with PROT_USER masked out of both
 * protections, and a stack variant with PROT_EXEC masked out of the
 * initial protection only.
 */
static segvn_crargs_t zfod_segvn_crargs =
	SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
static segvn_crargs_t kzfod_segvn_crargs =
	SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_USER,
	PROT_ALL & ~PROT_USER);
static segvn_crargs_t stack_noexec_crargs =
	SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_EXEC, PROT_ALL);

/*
 * Exported pointers to the canned argument sets above.  Note that
 * stack_exec_argsp deliberately aliases the same structure as zfod_argsp.
 */
caddr_t	zfod_argsp = (caddr_t)&zfod_segvn_crargs;	/* user zfod argsp */
caddr_t	kzfod_argsp = (caddr_t)&kzfod_segvn_crargs;	/* kernel zfod argsp */
caddr_t	stack_exec_argsp = (caddr_t)&zfod_segvn_crargs;	/* executable stack */
caddr_t	stack_noexec_argsp = (caddr_t)&stack_noexec_crargs; /* noexec stack */

/* Size in bytes of an array of n struct vpage entries. */
#define	vpgtob(n)	((n) * sizeof (struct vpage))	/* For brevity */

/*
 * Upper bound on the combined size of (previous segment + new segment)
 * for which segvn_create() will still try to extend a previous segment
 * that already has an anon_map.
 */
size_t	segvn_comb_thrshld = UINT_MAX;	/* patchable -- see 1196681 */

/*
 * Private helper routines (not part of the seg_ops vector).
 */
static int	segvn_concat(struct seg *, struct seg *, int);
static int	segvn_extend_prev(struct seg *, struct seg *,
		    struct segvn_crargs *, size_t);
static int	segvn_extend_next(struct seg *, struct seg *,
		    struct segvn_crargs *, size_t);
static void	segvn_softunlock(struct seg *, caddr_t, size_t, enum seg_rw);
static void	segvn_pagelist_rele(page_t **);
static void	segvn_setvnode_mpss(vnode_t *);
static void	segvn_relocate_pages(page_t **, page_t *);
static int	segvn_full_szcpages(page_t **, uint_t, int *, uint_t *);
static int	segvn_fill_vp_pages(struct segvn_data *, vnode_t *, u_offset_t,
    uint_t, page_t **, page_t **, uint_t *, int *);
static faultcode_t segvn_fault_vnodepages(struct hat *, struct seg *, caddr_t,
    caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int);
static faultcode_t segvn_fault_anonpages(struct hat *, struct seg *, caddr_t,
    caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int);
static faultcode_t segvn_faultpage(struct hat *, struct seg *, caddr_t,
    u_offset_t, struct vpage *, page_t **, uint_t,
    enum fault_type, enum seg_rw, int, int);
static void	segvn_vpage(struct seg *);

static void segvn_purge(struct seg *seg);
static int segvn_reclaim(struct seg *, caddr_t, size_t, struct page **,
    enum seg_rw);

static int sameprot(struct seg *, caddr_t, size_t);

static int segvn_demote_range(struct seg *, caddr_t, size_t, int, uint_t);
static int segvn_clrszc(struct seg *);
static struct seg *segvn_split_seg(struct seg *, caddr_t);
static int segvn_claim_pages(struct seg *, struct vpage *, u_offset_t,
    ulong_t, uint_t);

static int segvn_pp_lock_anonpages(page_t *, int);
static void segvn_pp_unlock_anonpages(page_t *, int);

/* kmem cache from which every struct segvn_data is allocated. */
static struct kmem_cache *segvn_cache;

#ifdef VM_STATS
/*
 * Event counters compiled in only when VM_STATS is defined.
 * NOTE(review): the array names suggest one array per instrumented
 * routine; the meaning of each index is not visible here -- confirm
 * against the code that increments them.
 */
static struct segvnvmstats_str {
	ulong_t	fill_vp_pages[31];
	ulong_t fltvnpages[49];
	ulong_t	fullszcpages[10];
	ulong_t	relocatepages[3];
	ulong_t	fltanpages[17];
	ulong_t pagelock[3];
	ulong_t	demoterange[3];
} segvnvmstats;
#endif /* VM_STATS */

#define	SDR_RANGE	1		/* demote entire range */
#define	SDR_END		2		/* demote non aligned ends only */

/*
 * Compute the pgsz-aligned region [lpgaddr, lpgeaddr) that encloses
 * [addr, addr + len): lpgaddr is addr aligned down to pgsz and lpgeaddr
 * is addr + len rounded up to pgsz.  The result is asserted to lie
 * within seg.  When len is 0 both outputs are simply set to addr.
 *
 * lpgaddr and lpgeaddr are assigned to, so they must be lvalues.  The
 * macro expands to a bare brace block (not do/while), and pgsz, lpgaddr
 * and lpgeaddr are expanded unparenthesized and more than once, so pass
 * only simple, side-effect-free expressions.
 */
#define	CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr) {	    \
		if ((len) != 0) { 		      	      		      \
			lpgaddr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz);  \
			ASSERT(lpgaddr >= (seg)->s_base);	      	      \
			lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)((addr) +    \
			    (len)), pgsz);				      \
			ASSERT(lpgeaddr > lpgaddr);		      	      \
			ASSERT(lpgeaddr <= (seg)->s_base + (seg)->s_size);    \
		} else {					      	      \
			lpgeaddr = lpgaddr = (addr);	      		      \
		}							      \
	}

/*
 * kmem cache constructor for struct segvn_data: initializes the
 * per-segment rwlock (svd->lock) and mutex (svd->segp_slock).
 * cdrarg and kmflags are unused.  Always returns 0.
 */
/*ARGSUSED*/
static int
segvn_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct segvn_data *svd = buf;

	rw_init(&svd->lock, NULL, RW_DEFAULT, NULL);
	mutex_init(&svd->segp_slock, NULL, MUTEX_DEFAULT, NULL);
	return (0);
}

/*
 * kmem cache destructor for struct segvn_data: tears down the two
 * locks set up by segvn_cache_constructor().  cdrarg is unused.
 */
/*ARGSUSED1*/
static void
segvn_cache_destructor(void *buf, void *cdrarg)
{
	struct segvn_data *svd = buf;

	rw_destroy(&svd->lock);
	mutex_destroy(&svd->segp_slock);
}

/* 252 * Patching this variable to non-zero allows the system to run with 253 * stacks marked as "not executable". It's a bit of a kludge, but is 254 * provided as a tweakable for platforms that export those ABIs 255 * (e.g. sparc V8) that have executable stacks enabled by default. 256 * There are also some restrictions for platforms that don't actually 257 * implement 'noexec' protections. 258 * 259 * Once enabled, the system is (therefore) unable to provide a fully 260 * ABI-compliant execution environment, though practically speaking, 261 * most everything works. The exceptions are generally some interpreters 262 * and debuggers that create executable code on the stack and jump 263 * into it (without explicitly mprotecting the address range to include 264 * PROT_EXEC). 265 * 266 * One important class of applications that are disabled are those 267 * that have been transformed into malicious agents using one of the 268 * numerous "buffer overflow" attacks. See 4007890. 269 */ 270 int noexec_user_stack = 0; 271 int noexec_user_stack_log = 1; 272 273 int segvn_lpg_disable = 0; 274 uint_t segvn_maxpgszc = 0; 275 276 ulong_t segvn_vmpss_clrszc_cnt; 277 ulong_t segvn_vmpss_clrszc_err; 278 ulong_t segvn_fltvnpages_clrszc_cnt; 279 ulong_t segvn_fltvnpages_clrszc_err; 280 ulong_t segvn_setpgsz_align_err; 281 ulong_t segvn_setpgsz_anon_align_err; 282 ulong_t segvn_setpgsz_getattr_err; 283 ulong_t segvn_setpgsz_eof_err; 284 ulong_t segvn_faultvnmpss_align_err1; 285 ulong_t segvn_faultvnmpss_align_err2; 286 ulong_t segvn_faultvnmpss_align_err3; 287 ulong_t segvn_faultvnmpss_align_err4; 288 ulong_t segvn_faultvnmpss_align_err5; 289 ulong_t segvn_vmpss_pageio_deadlk_err; 290 291 /* 292 * Initialize segvn data structures 293 */ 294 void 295 segvn_init(void) 296 { 297 uint_t maxszc; 298 uint_t szc; 299 size_t pgsz; 300 301 segvn_cache = kmem_cache_create("segvn_cache", 302 sizeof (struct segvn_data), 0, 303 segvn_cache_constructor, segvn_cache_destructor, NULL, 304 NULL, NULL, 
0); 305 306 if (segvn_lpg_disable != 0) 307 return; 308 szc = maxszc = page_num_pagesizes() - 1; 309 if (szc == 0) { 310 segvn_lpg_disable = 1; 311 return; 312 } 313 if (page_get_pagesize(0) != PAGESIZE) { 314 panic("segvn_init: bad szc 0"); 315 /*NOTREACHED*/ 316 } 317 while (szc != 0) { 318 pgsz = page_get_pagesize(szc); 319 if (pgsz <= PAGESIZE || !IS_P2ALIGNED(pgsz, pgsz)) { 320 panic("segvn_init: bad szc %d", szc); 321 /*NOTREACHED*/ 322 } 323 szc--; 324 } 325 if (segvn_maxpgszc == 0 || segvn_maxpgszc > maxszc) 326 segvn_maxpgszc = maxszc; 327 } 328 329 #define SEGVN_PAGEIO ((void *)0x1) 330 #define SEGVN_NOPAGEIO ((void *)0x2) 331 332 static void 333 segvn_setvnode_mpss(vnode_t *vp) 334 { 335 int err; 336 337 ASSERT(vp->v_mpssdata == NULL || 338 vp->v_mpssdata == SEGVN_PAGEIO || 339 vp->v_mpssdata == SEGVN_NOPAGEIO); 340 341 if (vp->v_mpssdata == NULL) { 342 if (vn_vmpss_usepageio(vp)) { 343 err = VOP_PAGEIO(vp, (page_t *)NULL, 344 (u_offset_t)0, 0, 0, CRED()); 345 } else { 346 err = ENOSYS; 347 } 348 /* 349 * set v_mpssdata just once per vnode life 350 * so that it never changes. 351 */ 352 mutex_enter(&vp->v_lock); 353 if (vp->v_mpssdata == NULL) { 354 if (err == EINVAL) { 355 vp->v_mpssdata = SEGVN_PAGEIO; 356 } else { 357 vp->v_mpssdata = SEGVN_NOPAGEIO; 358 } 359 } 360 mutex_exit(&vp->v_lock); 361 } 362 } 363 364 int 365 segvn_create(struct seg *seg, void *argsp) 366 { 367 struct segvn_crargs *a = (struct segvn_crargs *)argsp; 368 struct segvn_data *svd; 369 size_t swresv = 0; 370 struct cred *cred; 371 struct anon_map *amp; 372 int error = 0; 373 size_t pgsz; 374 lgrp_mem_policy_t mpolicy = LGRP_MEM_POLICY_DEFAULT; 375 376 377 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 378 379 if (a->type != MAP_PRIVATE && a->type != MAP_SHARED) { 380 panic("segvn_create type"); 381 /*NOTREACHED*/ 382 } 383 384 /* 385 * Check arguments. If a shared anon structure is given then 386 * it is illegal to also specify a vp. 
387 */ 388 if (a->amp != NULL && a->vp != NULL) { 389 panic("segvn_create anon_map"); 390 /*NOTREACHED*/ 391 } 392 393 /* MAP_NORESERVE on a MAP_SHARED segment is meaningless. */ 394 if (a->type == MAP_SHARED) 395 a->flags &= ~MAP_NORESERVE; 396 397 if (a->szc != 0) { 398 if (segvn_lpg_disable != 0 || 399 (a->amp != NULL && a->type == MAP_PRIVATE) || 400 (a->flags & MAP_NORESERVE) || seg->s_as == &kas) { 401 a->szc = 0; 402 } else { 403 if (a->szc > segvn_maxpgszc) 404 a->szc = segvn_maxpgszc; 405 pgsz = page_get_pagesize(a->szc); 406 if (!IS_P2ALIGNED(seg->s_base, pgsz) || 407 !IS_P2ALIGNED(seg->s_size, pgsz)) { 408 a->szc = 0; 409 } else if (a->vp != NULL) { 410 extern struct vnode kvp; 411 if (IS_SWAPFSVP(a->vp) || a->vp == &kvp) { 412 /* 413 * paranoid check. 414 * hat_page_demote() is not supported 415 * on swapfs pages. 416 */ 417 a->szc = 0; 418 } else if (map_addr_vacalign_check(seg->s_base, 419 a->offset & PAGEMASK)) { 420 a->szc = 0; 421 } 422 } else if (a->amp != NULL) { 423 pgcnt_t anum = btopr(a->offset); 424 pgcnt_t pgcnt = page_get_pagecnt(a->szc); 425 if (!IS_P2ALIGNED(anum, pgcnt)) { 426 a->szc = 0; 427 } 428 } 429 } 430 } 431 432 /* 433 * If segment may need private pages, reserve them now. 434 */ 435 if (!(a->flags & MAP_NORESERVE) && ((a->vp == NULL && a->amp == NULL) || 436 (a->type == MAP_PRIVATE && (a->prot & PROT_WRITE)))) { 437 if (anon_resv(seg->s_size) == 0) 438 return (EAGAIN); 439 swresv = seg->s_size; 440 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 441 seg, swresv, 1); 442 } 443 444 /* 445 * Reserve any mapping structures that may be required. 
446 */ 447 hat_map(seg->s_as->a_hat, seg->s_base, seg->s_size, HAT_MAP); 448 449 if (a->cred) { 450 cred = a->cred; 451 crhold(cred); 452 } else { 453 crhold(cred = CRED()); 454 } 455 456 /* Inform the vnode of the new mapping */ 457 if (a->vp) { 458 error = VOP_ADDMAP(a->vp, a->offset & PAGEMASK, 459 seg->s_as, seg->s_base, seg->s_size, a->prot, 460 a->maxprot, a->type, cred); 461 if (error) { 462 if (swresv != 0) { 463 anon_unresv(swresv); 464 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 465 "anon proc:%p %lu %u", 466 seg, swresv, 0); 467 } 468 crfree(cred); 469 hat_unload(seg->s_as->a_hat, seg->s_base, 470 seg->s_size, HAT_UNLOAD_UNMAP); 471 return (error); 472 } 473 } 474 475 /* 476 * If more than one segment in the address space, and 477 * they're adjacent virtually, try to concatenate them. 478 * Don't concatenate if an explicit anon_map structure 479 * was supplied (e.g., SystemV shared memory). 480 */ 481 if (a->amp == NULL) { 482 struct seg *pseg, *nseg; 483 struct segvn_data *psvd, *nsvd; 484 lgrp_mem_policy_t ppolicy, npolicy; 485 uint_t lgrp_mem_policy_flags = 0; 486 extern lgrp_mem_policy_t lgrp_mem_default_policy; 487 488 /* 489 * Memory policy flags (lgrp_mem_policy_flags) is valid when 490 * extending stack/heap segments. 491 */ 492 if ((a->vp == NULL) && (a->type == MAP_PRIVATE) && 493 !(a->flags & MAP_NORESERVE) && (seg->s_as != &kas)) { 494 lgrp_mem_policy_flags = a->lgrp_mem_policy_flags; 495 } else { 496 /* 497 * Get policy when not extending it from another segment 498 */ 499 mpolicy = lgrp_mem_policy_default(seg->s_size, a->type); 500 } 501 502 /* 503 * First, try to concatenate the previous and new segments 504 */ 505 pseg = AS_SEGPREV(seg->s_as, seg); 506 if (pseg != NULL && 507 pseg->s_base + pseg->s_size == seg->s_base && 508 pseg->s_ops == &segvn_ops) { 509 /* 510 * Get memory allocation policy from previous segment. 511 * When extension is specified (e.g. 
for heap) apply 512 * this policy to the new segment regardless of the 513 * outcome of segment concatenation. Extension occurs 514 * for non-default policy otherwise default policy is 515 * used and is based on extended segment size. 516 */ 517 psvd = (struct segvn_data *)pseg->s_data; 518 ppolicy = psvd->policy_info.mem_policy; 519 if (lgrp_mem_policy_flags == 520 LGRP_MP_FLAG_EXTEND_UP) { 521 if (ppolicy != lgrp_mem_default_policy) { 522 mpolicy = ppolicy; 523 } else { 524 mpolicy = lgrp_mem_policy_default( 525 pseg->s_size + seg->s_size, 526 a->type); 527 } 528 } 529 530 if (mpolicy == ppolicy && 531 (pseg->s_size + seg->s_size <= 532 segvn_comb_thrshld || psvd->amp == NULL) && 533 segvn_extend_prev(pseg, seg, a, swresv) == 0) { 534 /* 535 * success! now try to concatenate 536 * with following seg 537 */ 538 crfree(cred); 539 nseg = AS_SEGNEXT(pseg->s_as, pseg); 540 if (nseg != NULL && 541 nseg != pseg && 542 nseg->s_ops == &segvn_ops && 543 pseg->s_base + pseg->s_size == 544 nseg->s_base) 545 (void) segvn_concat(pseg, nseg, 0); 546 ASSERT(pseg->s_szc == 0 || 547 (a->szc == pseg->s_szc && 548 IS_P2ALIGNED(pseg->s_base, pgsz) && 549 IS_P2ALIGNED(pseg->s_size, pgsz))); 550 return (0); 551 } 552 } 553 554 /* 555 * Failed, so try to concatenate with following seg 556 */ 557 nseg = AS_SEGNEXT(seg->s_as, seg); 558 if (nseg != NULL && 559 seg->s_base + seg->s_size == nseg->s_base && 560 nseg->s_ops == &segvn_ops) { 561 /* 562 * Get memory allocation policy from next segment. 563 * When extension is specified (e.g. for stack) apply 564 * this policy to the new segment regardless of the 565 * outcome of segment concatenation. Extension occurs 566 * for non-default policy otherwise default policy is 567 * used and is based on extended segment size. 
568 */ 569 nsvd = (struct segvn_data *)nseg->s_data; 570 npolicy = nsvd->policy_info.mem_policy; 571 if (lgrp_mem_policy_flags == 572 LGRP_MP_FLAG_EXTEND_DOWN) { 573 if (npolicy != lgrp_mem_default_policy) { 574 mpolicy = npolicy; 575 } else { 576 mpolicy = lgrp_mem_policy_default( 577 nseg->s_size + seg->s_size, 578 a->type); 579 } 580 } 581 582 if (mpolicy == npolicy && 583 segvn_extend_next(seg, nseg, a, swresv) == 0) { 584 crfree(cred); 585 ASSERT(nseg->s_szc == 0 || 586 (a->szc == nseg->s_szc && 587 IS_P2ALIGNED(nseg->s_base, pgsz) && 588 IS_P2ALIGNED(nseg->s_size, pgsz))); 589 return (0); 590 } 591 } 592 } 593 594 if (a->vp != NULL) { 595 VN_HOLD(a->vp); 596 if (a->type == MAP_SHARED) 597 lgrp_shm_policy_init(NULL, a->vp); 598 } 599 svd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 600 601 seg->s_ops = &segvn_ops; 602 seg->s_data = (void *)svd; 603 seg->s_szc = a->szc; 604 605 svd->vp = a->vp; 606 /* 607 * Anonymous mappings have no backing file so the offset is meaningless. 608 */ 609 svd->offset = a->vp ? (a->offset & PAGEMASK) : 0; 610 svd->prot = a->prot; 611 svd->maxprot = a->maxprot; 612 svd->pageprot = 0; 613 svd->type = a->type; 614 svd->vpage = NULL; 615 svd->cred = cred; 616 svd->advice = MADV_NORMAL; 617 svd->pageadvice = 0; 618 svd->flags = (ushort_t)a->flags; 619 svd->softlockcnt = 0; 620 if (a->szc != 0 && a->vp != NULL) { 621 segvn_setvnode_mpss(a->vp); 622 } 623 624 amp = a->amp; 625 if ((svd->amp = amp) == NULL) { 626 svd->anon_index = 0; 627 if (svd->type == MAP_SHARED) { 628 svd->swresv = 0; 629 /* 630 * Shared mappings to a vp need no other setup. 631 * If we have a shared mapping to an anon_map object 632 * which hasn't been allocated yet, allocate the 633 * struct now so that it will be properly shared 634 * by remembering the swap reservation there. 635 */ 636 if (a->vp == NULL) { 637 svd->amp = anonmap_alloc(seg->s_size, swresv); 638 svd->amp->a_szc = seg->s_szc; 639 } 640 } else { 641 /* 642 * Private mapping (with or without a vp). 
643 * Allocate anon_map when needed. 644 */ 645 svd->swresv = swresv; 646 } 647 } else { 648 pgcnt_t anon_num; 649 650 /* 651 * Mapping to an existing anon_map structure without a vp. 652 * For now we will insure that the segment size isn't larger 653 * than the size - offset gives us. Later on we may wish to 654 * have the anon array dynamically allocated itself so that 655 * we don't always have to allocate all the anon pointer slots. 656 * This of course involves adding extra code to check that we 657 * aren't trying to use an anon pointer slot beyond the end 658 * of the currently allocated anon array. 659 */ 660 if ((amp->size - a->offset) < seg->s_size) { 661 panic("segvn_create anon_map size"); 662 /*NOTREACHED*/ 663 } 664 665 anon_num = btopr(a->offset); 666 667 if (a->type == MAP_SHARED) { 668 /* 669 * SHARED mapping to a given anon_map. 670 */ 671 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 672 amp->refcnt++; 673 if (a->szc > amp->a_szc) { 674 amp->a_szc = a->szc; 675 } 676 ANON_LOCK_EXIT(&->a_rwlock); 677 svd->anon_index = anon_num; 678 svd->swresv = 0; 679 } else { 680 /* 681 * PRIVATE mapping to a given anon_map. 682 * Make sure that all the needed anon 683 * structures are created (so that we will 684 * share the underlying pages if nothing 685 * is written by this mapping) and then 686 * duplicate the anon array as is done 687 * when a privately mapped segment is dup'ed. 688 */ 689 struct anon *ap; 690 caddr_t addr; 691 caddr_t eaddr; 692 ulong_t anon_idx; 693 int hat_flag = HAT_LOAD; 694 695 if (svd->flags & MAP_TEXT) { 696 hat_flag |= HAT_LOAD_TEXT; 697 } 698 699 svd->amp = anonmap_alloc(seg->s_size, 0); 700 svd->amp->a_szc = seg->s_szc; 701 svd->anon_index = 0; 702 svd->swresv = swresv; 703 704 /* 705 * Prevent 2 threads from allocating anon 706 * slots simultaneously. 
707 */ 708 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 709 eaddr = seg->s_base + seg->s_size; 710 711 for (anon_idx = anon_num, addr = seg->s_base; 712 addr < eaddr; addr += PAGESIZE, anon_idx++) { 713 page_t *pp; 714 715 if ((ap = anon_get_ptr(amp->ahp, 716 anon_idx)) != NULL) 717 continue; 718 719 /* 720 * Allocate the anon struct now. 721 * Might as well load up translation 722 * to the page while we're at it... 723 */ 724 pp = anon_zero(seg, addr, &ap, cred); 725 if (ap == NULL || pp == NULL) { 726 panic("segvn_create anon_zero"); 727 /*NOTREACHED*/ 728 } 729 730 /* 731 * Re-acquire the anon_map lock and 732 * initialize the anon array entry. 733 */ 734 ASSERT(anon_get_ptr(amp->ahp, 735 anon_idx) == NULL); 736 (void) anon_set_ptr(amp->ahp, anon_idx, ap, 737 ANON_SLEEP); 738 739 ASSERT(seg->s_szc == 0); 740 ASSERT(!IS_VMODSORT(pp->p_vnode)); 741 742 hat_memload(seg->s_as->a_hat, addr, pp, 743 svd->prot & ~PROT_WRITE, hat_flag); 744 745 page_unlock(pp); 746 } 747 ASSERT(seg->s_szc == 0); 748 anon_dup(amp->ahp, anon_num, svd->amp->ahp, 749 0, seg->s_size); 750 ANON_LOCK_EXIT(&->a_rwlock); 751 } 752 } 753 754 /* 755 * Set default memory allocation policy for segment 756 * 757 * Always set policy for private memory at least for initialization 758 * even if this is a shared memory segment 759 */ 760 (void) lgrp_privm_policy_set(mpolicy, &svd->policy_info, seg->s_size); 761 762 if (svd->type == MAP_SHARED) 763 (void) lgrp_shm_policy_set(mpolicy, svd->amp, svd->anon_index, 764 svd->vp, svd->offset, seg->s_size); 765 766 return (0); 767 } 768 769 /* 770 * Concatenate two existing segments, if possible. 771 * Return 0 on success, -1 if two segments are not compatible 772 * or -2 on memory allocation failure. 
773 * If amp_cat == 1 then try and concat segments with anon maps 774 */ 775 static int 776 segvn_concat(struct seg *seg1, struct seg *seg2, int amp_cat) 777 { 778 struct segvn_data *svd1 = seg1->s_data; 779 struct segvn_data *svd2 = seg2->s_data; 780 struct anon_map *amp1 = svd1->amp; 781 struct anon_map *amp2 = svd2->amp; 782 struct vpage *vpage1 = svd1->vpage; 783 struct vpage *vpage2 = svd2->vpage, *nvpage = NULL; 784 size_t size, nvpsize; 785 pgcnt_t npages1, npages2; 786 787 ASSERT(seg1->s_as && seg2->s_as && seg1->s_as == seg2->s_as); 788 ASSERT(AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock)); 789 ASSERT(seg1->s_ops == seg2->s_ops); 790 791 /* both segments exist, try to merge them */ 792 #define incompat(x) (svd1->x != svd2->x) 793 if (incompat(vp) || incompat(maxprot) || 794 (!svd1->pageadvice && !svd2->pageadvice && incompat(advice)) || 795 (!svd1->pageprot && !svd2->pageprot && incompat(prot)) || 796 incompat(type) || incompat(cred) || incompat(flags) || 797 seg1->s_szc != seg2->s_szc || incompat(policy_info.mem_policy) || 798 (svd2->softlockcnt > 0)) 799 return (-1); 800 #undef incompat 801 802 /* 803 * vp == NULL implies zfod, offset doesn't matter 804 */ 805 if (svd1->vp != NULL && 806 svd1->offset + seg1->s_size != svd2->offset) { 807 return (-1); 808 } 809 810 /* 811 * Fail early if we're not supposed to concatenate 812 * segments with non NULL amp. 813 */ 814 if (amp_cat == 0 && (amp1 != NULL || amp2 != NULL)) { 815 return (-1); 816 } 817 818 if (svd1->vp == NULL && svd1->type == MAP_SHARED) { 819 if (amp1 != amp2) { 820 return (-1); 821 } 822 if (amp1 != NULL && svd1->anon_index + btop(seg1->s_size) != 823 svd2->anon_index) { 824 return (-1); 825 } 826 ASSERT(amp1 == NULL || amp1->refcnt >= 2); 827 } 828 829 /* 830 * If either seg has vpages, create a new merged vpage array. 
831 */ 832 if (vpage1 != NULL || vpage2 != NULL) { 833 struct vpage *vp; 834 835 npages1 = seg_pages(seg1); 836 npages2 = seg_pages(seg2); 837 nvpsize = vpgtob(npages1 + npages2); 838 839 if ((nvpage = kmem_zalloc(nvpsize, KM_NOSLEEP)) == NULL) { 840 return (-2); 841 } 842 if (vpage1 != NULL) { 843 bcopy(vpage1, nvpage, vpgtob(npages1)); 844 } 845 if (vpage2 != NULL) { 846 bcopy(vpage2, nvpage + npages1, vpgtob(npages2)); 847 } 848 for (vp = nvpage; vp < nvpage + npages1; vp++) { 849 if (svd2->pageprot && !svd1->pageprot) { 850 VPP_SETPROT(vp, svd1->prot); 851 } 852 if (svd2->pageadvice && !svd1->pageadvice) { 853 VPP_SETADVICE(vp, svd1->advice); 854 } 855 } 856 for (vp = nvpage + npages1; 857 vp < nvpage + npages1 + npages2; vp++) { 858 if (svd1->pageprot && !svd2->pageprot) { 859 VPP_SETPROT(vp, svd2->prot); 860 } 861 if (svd1->pageadvice && !svd2->pageadvice) { 862 VPP_SETADVICE(vp, svd2->advice); 863 } 864 } 865 } 866 867 /* 868 * If either segment has private pages, create a new merged anon 869 * array. If mergeing shared anon segments just decrement anon map's 870 * refcnt. 871 */ 872 if (amp1 != NULL && svd1->type == MAP_SHARED) { 873 ASSERT(amp1 == amp2 && svd1->vp == NULL); 874 ANON_LOCK_ENTER(&1->a_rwlock, RW_WRITER); 875 ASSERT(amp1->refcnt >= 2); 876 amp1->refcnt--; 877 ANON_LOCK_EXIT(&1->a_rwlock); 878 svd2->amp = NULL; 879 } else if (amp1 != NULL || amp2 != NULL) { 880 struct anon_hdr *nahp; 881 struct anon_map *namp = NULL; 882 size_t asize; 883 884 ASSERT(svd1->type == MAP_PRIVATE); 885 886 asize = seg1->s_size + seg2->s_size; 887 if ((nahp = anon_create(btop(asize), ANON_NOSLEEP)) == NULL) { 888 if (nvpage != NULL) { 889 kmem_free(nvpage, nvpsize); 890 } 891 return (-2); 892 } 893 if (amp1 != NULL) { 894 /* 895 * XXX anon rwlock is not really needed because 896 * this is a private segment and we are writers. 
897 */ 898 ANON_LOCK_ENTER(&1->a_rwlock, RW_WRITER); 899 ASSERT(amp1->refcnt == 1); 900 if (anon_copy_ptr(amp1->ahp, svd1->anon_index, 901 nahp, 0, btop(seg1->s_size), ANON_NOSLEEP)) { 902 anon_release(nahp, btop(asize)); 903 ANON_LOCK_EXIT(&1->a_rwlock); 904 if (nvpage != NULL) { 905 kmem_free(nvpage, nvpsize); 906 } 907 return (-2); 908 } 909 } 910 if (amp2 != NULL) { 911 ANON_LOCK_ENTER(&2->a_rwlock, RW_WRITER); 912 ASSERT(amp2->refcnt == 1); 913 if (anon_copy_ptr(amp2->ahp, svd2->anon_index, 914 nahp, btop(seg1->s_size), btop(seg2->s_size), 915 ANON_NOSLEEP)) { 916 anon_release(nahp, btop(asize)); 917 ANON_LOCK_EXIT(&2->a_rwlock); 918 if (amp1 != NULL) { 919 ANON_LOCK_EXIT(&1->a_rwlock); 920 } 921 if (nvpage != NULL) { 922 kmem_free(nvpage, nvpsize); 923 } 924 return (-2); 925 } 926 } 927 if (amp1 != NULL) { 928 namp = amp1; 929 anon_release(amp1->ahp, btop(amp1->size)); 930 } 931 if (amp2 != NULL) { 932 if (namp == NULL) { 933 ASSERT(amp1 == NULL); 934 namp = amp2; 935 anon_release(amp2->ahp, btop(amp2->size)); 936 } else { 937 amp2->refcnt--; 938 ANON_LOCK_EXIT(&2->a_rwlock); 939 anonmap_free(amp2); 940 } 941 svd2->amp = NULL; /* needed for seg_free */ 942 } 943 namp->ahp = nahp; 944 namp->size = asize; 945 svd1->amp = namp; 946 svd1->anon_index = 0; 947 ANON_LOCK_EXIT(&namp->a_rwlock); 948 } 949 /* 950 * Now free the old vpage structures. 
951 */ 952 if (nvpage != NULL) { 953 if (vpage1 != NULL) { 954 kmem_free(vpage1, vpgtob(npages1)); 955 } 956 if (vpage2 != NULL) { 957 svd2->vpage = NULL; 958 kmem_free(vpage2, vpgtob(npages2)); 959 } 960 if (svd2->pageprot) { 961 svd1->pageprot = 1; 962 } 963 if (svd2->pageadvice) { 964 svd1->pageadvice = 1; 965 } 966 svd1->vpage = nvpage; 967 } 968 969 /* all looks ok, merge segments */ 970 svd1->swresv += svd2->swresv; 971 svd2->swresv = 0; /* so seg_free doesn't release swap space */ 972 size = seg2->s_size; 973 seg_free(seg2); 974 seg1->s_size += size; 975 return (0); 976 } 977 978 /* 979 * Extend the previous segment (seg1) to include the 980 * new segment (seg2 + a), if possible. 981 * Return 0 on success. 982 */ 983 static int 984 segvn_extend_prev(seg1, seg2, a, swresv) 985 struct seg *seg1, *seg2; 986 struct segvn_crargs *a; 987 size_t swresv; 988 { 989 struct segvn_data *svd1 = (struct segvn_data *)seg1->s_data; 990 size_t size; 991 struct anon_map *amp1; 992 struct vpage *new_vpage; 993 994 /* 995 * We don't need any segment level locks for "segvn" data 996 * since the address space is "write" locked. 997 */ 998 ASSERT(seg1->s_as && AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock)); 999 1000 /* second segment is new, try to extend first */ 1001 /* XXX - should also check cred */ 1002 if (svd1->vp != a->vp || svd1->maxprot != a->maxprot || 1003 (!svd1->pageprot && (svd1->prot != a->prot)) || 1004 svd1->type != a->type || svd1->flags != a->flags || 1005 seg1->s_szc != a->szc) 1006 return (-1); 1007 1008 /* vp == NULL implies zfod, offset doesn't matter */ 1009 if (svd1->vp != NULL && 1010 svd1->offset + seg1->s_size != (a->offset & PAGEMASK)) 1011 return (-1); 1012 1013 amp1 = svd1->amp; 1014 if (amp1) { 1015 pgcnt_t newpgs; 1016 1017 /* 1018 * Segment has private pages, can data structures 1019 * be expanded? 1020 * 1021 * Acquire the anon_map lock to prevent it from changing, 1022 * if it is shared. 
This ensures that the anon_map 1023 * will not change while a thread which has a read/write 1024 * lock on an address space references it. 1025 * XXX - Don't need the anon_map lock at all if "refcnt" 1026 * is 1. 1027 * 1028 * Can't grow a MAP_SHARED segment with an anonmap because 1029 * there may be existing anon slots where we want to extend 1030 * the segment and we wouldn't know what to do with them 1031 * (e.g., for tmpfs right thing is to just leave them there, 1032 * for /dev/zero they should be cleared out). 1033 */ 1034 if (svd1->type == MAP_SHARED) 1035 return (-1); 1036 1037 ANON_LOCK_ENTER(&1->a_rwlock, RW_WRITER); 1038 if (amp1->refcnt > 1) { 1039 ANON_LOCK_EXIT(&1->a_rwlock); 1040 return (-1); 1041 } 1042 newpgs = anon_grow(amp1->ahp, &svd1->anon_index, 1043 btop(seg1->s_size), btop(seg2->s_size), ANON_NOSLEEP); 1044 1045 if (newpgs == 0) { 1046 ANON_LOCK_EXIT(&1->a_rwlock); 1047 return (-1); 1048 } 1049 amp1->size = ptob(newpgs); 1050 ANON_LOCK_EXIT(&1->a_rwlock); 1051 } 1052 if (svd1->vpage != NULL) { 1053 new_vpage = 1054 kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)), 1055 KM_NOSLEEP); 1056 if (new_vpage == NULL) 1057 return (-1); 1058 bcopy(svd1->vpage, new_vpage, vpgtob(seg_pages(seg1))); 1059 kmem_free(svd1->vpage, vpgtob(seg_pages(seg1))); 1060 svd1->vpage = new_vpage; 1061 if (svd1->pageprot) { 1062 struct vpage *vp, *evp; 1063 1064 vp = new_vpage + seg_pages(seg1); 1065 evp = vp + seg_pages(seg2); 1066 for (; vp < evp; vp++) 1067 VPP_SETPROT(vp, a->prot); 1068 } 1069 } 1070 size = seg2->s_size; 1071 seg_free(seg2); 1072 seg1->s_size += size; 1073 svd1->swresv += swresv; 1074 return (0); 1075 } 1076 1077 /* 1078 * Extend the next segment (seg2) to include the 1079 * new segment (seg1 + a), if possible. 1080 * Return 0 on success. 
1081 */ 1082 static int 1083 segvn_extend_next( 1084 struct seg *seg1, 1085 struct seg *seg2, 1086 struct segvn_crargs *a, 1087 size_t swresv) 1088 { 1089 struct segvn_data *svd2 = (struct segvn_data *)seg2->s_data; 1090 size_t size; 1091 struct anon_map *amp2; 1092 struct vpage *new_vpage; 1093 1094 /* 1095 * We don't need any segment level locks for "segvn" data 1096 * since the address space is "write" locked. 1097 */ 1098 ASSERT(seg2->s_as && AS_WRITE_HELD(seg2->s_as, &seg2->s_as->a_lock)); 1099 1100 /* first segment is new, try to extend second */ 1101 /* XXX - should also check cred */ 1102 if (svd2->vp != a->vp || svd2->maxprot != a->maxprot || 1103 (!svd2->pageprot && (svd2->prot != a->prot)) || 1104 svd2->type != a->type || svd2->flags != a->flags || 1105 seg2->s_szc != a->szc) 1106 return (-1); 1107 /* vp == NULL implies zfod, offset doesn't matter */ 1108 if (svd2->vp != NULL && 1109 (a->offset & PAGEMASK) + seg1->s_size != svd2->offset) 1110 return (-1); 1111 1112 amp2 = svd2->amp; 1113 if (amp2) { 1114 pgcnt_t newpgs; 1115 1116 /* 1117 * Segment has private pages, can data structures 1118 * be expanded? 1119 * 1120 * Acquire the anon_map lock to prevent it from changing, 1121 * if it is shared. This ensures that the anon_map 1122 * will not change while a thread which has a read/write 1123 * lock on an address space references it. 1124 * 1125 * XXX - Don't need the anon_map lock at all if "refcnt" 1126 * is 1. 
1127 */ 1128 if (svd2->type == MAP_SHARED) 1129 return (-1); 1130 1131 ANON_LOCK_ENTER(&2->a_rwlock, RW_WRITER); 1132 if (amp2->refcnt > 1) { 1133 ANON_LOCK_EXIT(&2->a_rwlock); 1134 return (-1); 1135 } 1136 newpgs = anon_grow(amp2->ahp, &svd2->anon_index, 1137 btop(seg2->s_size), btop(seg1->s_size), 1138 ANON_NOSLEEP | ANON_GROWDOWN); 1139 1140 if (newpgs == 0) { 1141 ANON_LOCK_EXIT(&2->a_rwlock); 1142 return (-1); 1143 } 1144 amp2->size = ptob(newpgs); 1145 ANON_LOCK_EXIT(&2->a_rwlock); 1146 } 1147 if (svd2->vpage != NULL) { 1148 new_vpage = 1149 kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)), 1150 KM_NOSLEEP); 1151 if (new_vpage == NULL) { 1152 /* Not merging segments so adjust anon_index back */ 1153 if (amp2) 1154 svd2->anon_index += seg_pages(seg1); 1155 return (-1); 1156 } 1157 bcopy(svd2->vpage, new_vpage + seg_pages(seg1), 1158 vpgtob(seg_pages(seg2))); 1159 kmem_free(svd2->vpage, vpgtob(seg_pages(seg2))); 1160 svd2->vpage = new_vpage; 1161 if (svd2->pageprot) { 1162 struct vpage *vp, *evp; 1163 1164 vp = new_vpage; 1165 evp = vp + seg_pages(seg1); 1166 for (; vp < evp; vp++) 1167 VPP_SETPROT(vp, a->prot); 1168 } 1169 } 1170 size = seg1->s_size; 1171 seg_free(seg1); 1172 seg2->s_size += size; 1173 seg2->s_base -= size; 1174 svd2->offset -= size; 1175 svd2->swresv += swresv; 1176 return (0); 1177 } 1178 1179 static int 1180 segvn_dup(struct seg *seg, struct seg *newseg) 1181 { 1182 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1183 struct segvn_data *newsvd; 1184 pgcnt_t npages = seg_pages(seg); 1185 int error = 0; 1186 uint_t prot; 1187 size_t len; 1188 1189 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 1190 1191 /* 1192 * If segment has anon reserved, reserve more for the new seg. 1193 * For a MAP_NORESERVE segment swresv will be a count of all the 1194 * allocated anon slots; thus we reserve for the child as many slots 1195 * as the parent has allocated. 
This semantic prevents the child or 1196 * parent from dieing during a copy-on-write fault caused by trying 1197 * to write a shared pre-existing anon page. 1198 */ 1199 if ((len = svd->swresv) != 0) { 1200 if (anon_resv(svd->swresv) == 0) 1201 return (ENOMEM); 1202 1203 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 1204 seg, len, 0); 1205 } 1206 1207 newsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 1208 1209 newseg->s_ops = &segvn_ops; 1210 newseg->s_data = (void *)newsvd; 1211 newseg->s_szc = seg->s_szc; 1212 1213 if ((newsvd->vp = svd->vp) != NULL) { 1214 VN_HOLD(svd->vp); 1215 if (svd->type == MAP_SHARED) 1216 lgrp_shm_policy_init(NULL, svd->vp); 1217 } 1218 newsvd->offset = svd->offset; 1219 newsvd->prot = svd->prot; 1220 newsvd->maxprot = svd->maxprot; 1221 newsvd->pageprot = svd->pageprot; 1222 newsvd->type = svd->type; 1223 newsvd->cred = svd->cred; 1224 crhold(newsvd->cred); 1225 newsvd->advice = svd->advice; 1226 newsvd->pageadvice = svd->pageadvice; 1227 newsvd->swresv = svd->swresv; 1228 newsvd->flags = svd->flags; 1229 newsvd->softlockcnt = 0; 1230 newsvd->policy_info = svd->policy_info; 1231 if ((newsvd->amp = svd->amp) == NULL) { 1232 /* 1233 * Not attaching to a shared anon object. 1234 */ 1235 newsvd->anon_index = 0; 1236 } else { 1237 struct anon_map *amp; 1238 1239 amp = svd->amp; 1240 if (svd->type == MAP_SHARED) { 1241 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1242 amp->refcnt++; 1243 ANON_LOCK_EXIT(&->a_rwlock); 1244 newsvd->anon_index = svd->anon_index; 1245 } else { 1246 int reclaim = 1; 1247 1248 /* 1249 * Allocate and initialize new anon_map structure. 
1250 */ 1251 newsvd->amp = anonmap_alloc(newseg->s_size, 0); 1252 newsvd->amp->a_szc = newseg->s_szc; 1253 newsvd->anon_index = 0; 1254 1255 /* 1256 * We don't have to acquire the anon_map lock 1257 * for the new segment (since it belongs to an 1258 * address space that is still not associated 1259 * with any process), or the segment in the old 1260 * address space (since all threads in it 1261 * are stopped while duplicating the address space). 1262 */ 1263 1264 /* 1265 * The goal of the following code is to make sure that 1266 * softlocked pages do not end up as copy on write 1267 * pages. This would cause problems where one 1268 * thread writes to a page that is COW and a different 1269 * thread in the same process has softlocked it. The 1270 * softlock lock would move away from this process 1271 * because the write would cause this process to get 1272 * a copy (without the softlock). 1273 * 1274 * The strategy here is to just break the 1275 * sharing on pages that could possibly be 1276 * softlocked. 1277 */ 1278 retry: 1279 if (svd->softlockcnt) { 1280 struct anon *ap, *newap; 1281 size_t i; 1282 uint_t vpprot; 1283 page_t *anon_pl[1+1], *pp; 1284 caddr_t addr; 1285 ulong_t anon_idx = 0; 1286 1287 /* 1288 * The softlock count might be non zero 1289 * because some pages are still stuck in the 1290 * cache for lazy reclaim. Flush the cache 1291 * now. This should drop the count to zero. 1292 * [or there is really I/O going on to these 1293 * pages]. Note, we have the writers lock so 1294 * nothing gets inserted during the flush. 1295 */ 1296 if (reclaim == 1) { 1297 segvn_purge(seg); 1298 reclaim = 0; 1299 goto retry; 1300 } 1301 i = btopr(seg->s_size); 1302 addr = seg->s_base; 1303 /* 1304 * XXX break cow sharing using PAGESIZE 1305 * pages. They will be relocated into larger 1306 * pages at fault time. 
1307 */ 1308 while (i-- > 0) { 1309 if (ap = anon_get_ptr(amp->ahp, 1310 anon_idx)) { 1311 error = anon_getpage(&ap, 1312 &vpprot, anon_pl, PAGESIZE, 1313 seg, addr, S_READ, 1314 svd->cred); 1315 if (error) { 1316 newsvd->vpage = NULL; 1317 goto out; 1318 } 1319 /* 1320 * prot need not be computed 1321 * below 'cause anon_private is 1322 * going to ignore it anyway 1323 * as child doesn't inherit 1324 * pagelock from parent. 1325 */ 1326 prot = svd->pageprot ? 1327 VPP_PROT( 1328 &svd->vpage[ 1329 seg_page(seg, addr)]) 1330 : svd->prot; 1331 pp = anon_private(&newap, 1332 newseg, addr, prot, 1333 anon_pl[0], 0, 1334 newsvd->cred); 1335 if (pp == NULL) { 1336 /* no mem abort */ 1337 newsvd->vpage = NULL; 1338 error = ENOMEM; 1339 goto out; 1340 } 1341 (void) anon_set_ptr( 1342 newsvd->amp->ahp, anon_idx, 1343 newap, ANON_SLEEP); 1344 page_unlock(pp); 1345 } 1346 addr += PAGESIZE; 1347 anon_idx++; 1348 } 1349 } else { /* common case */ 1350 if (seg->s_szc != 0) { 1351 /* 1352 * If at least one of anon slots of a 1353 * large page exists then make sure 1354 * all anon slots of a large page 1355 * exist to avoid partial cow sharing 1356 * of a large page in the future. 1357 */ 1358 anon_dup_fill_holes(amp->ahp, 1359 svd->anon_index, newsvd->amp->ahp, 1360 0, seg->s_size, seg->s_szc, 1361 svd->vp != NULL); 1362 } else { 1363 anon_dup(amp->ahp, svd->anon_index, 1364 newsvd->amp->ahp, 0, seg->s_size); 1365 } 1366 1367 hat_clrattr(seg->s_as->a_hat, seg->s_base, 1368 seg->s_size, PROT_WRITE); 1369 } 1370 } 1371 } 1372 /* 1373 * If necessary, create a vpage structure for the new segment. 1374 * Do not copy any page lock indications. 
1375 */ 1376 if (svd->vpage != NULL) { 1377 uint_t i; 1378 struct vpage *ovp = svd->vpage; 1379 struct vpage *nvp; 1380 1381 nvp = newsvd->vpage = 1382 kmem_alloc(vpgtob(npages), KM_SLEEP); 1383 for (i = 0; i < npages; i++) { 1384 *nvp = *ovp++; 1385 VPP_CLRPPLOCK(nvp++); 1386 } 1387 } else 1388 newsvd->vpage = NULL; 1389 1390 /* Inform the vnode of the new mapping */ 1391 if (newsvd->vp != NULL) { 1392 error = VOP_ADDMAP(newsvd->vp, (offset_t)newsvd->offset, 1393 newseg->s_as, newseg->s_base, newseg->s_size, newsvd->prot, 1394 newsvd->maxprot, newsvd->type, newsvd->cred); 1395 } 1396 out: 1397 return (error); 1398 } 1399 1400 1401 /* 1402 * callback function used by segvn_unmap to invoke free_vp_pages() for only 1403 * those pages actually processed by the HAT 1404 */ 1405 extern int free_pages; 1406 1407 static void 1408 segvn_hat_unload_callback(hat_callback_t *cb) 1409 { 1410 struct seg *seg = cb->hcb_data; 1411 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1412 size_t len; 1413 u_offset_t off; 1414 1415 ASSERT(svd->vp != NULL); 1416 ASSERT(cb->hcb_end_addr > cb->hcb_start_addr); 1417 ASSERT(cb->hcb_start_addr >= seg->s_base); 1418 1419 len = cb->hcb_end_addr - cb->hcb_start_addr; 1420 off = cb->hcb_start_addr - seg->s_base; 1421 free_vp_pages(svd->vp, svd->offset + off, len); 1422 } 1423 1424 1425 static int 1426 segvn_unmap(struct seg *seg, caddr_t addr, size_t len) 1427 { 1428 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1429 struct segvn_data *nsvd; 1430 struct seg *nseg; 1431 struct anon_map *amp; 1432 pgcnt_t opages; /* old segment size in pages */ 1433 pgcnt_t npages; /* new segment size in pages */ 1434 pgcnt_t dpages; /* pages being deleted (unmapped) */ 1435 hat_callback_t callback; /* used for free_vp_pages() */ 1436 hat_callback_t *cbp = NULL; 1437 caddr_t nbase; 1438 size_t nsize; 1439 size_t oswresv; 1440 int reclaim = 1; 1441 1442 /* 1443 * We don't need any segment level locks for "segvn" data 1444 * since the address 
space is "write" locked. 1445 */ 1446 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 1447 1448 /* 1449 * Fail the unmap if pages are SOFTLOCKed through this mapping. 1450 * softlockcnt is protected from change by the as write lock. 1451 */ 1452 retry: 1453 if (svd->softlockcnt > 0) { 1454 /* 1455 * since we do have the writers lock nobody can fill 1456 * the cache during the purge. The flush either succeeds 1457 * or we still have pending I/Os. 1458 */ 1459 if (reclaim == 1) { 1460 segvn_purge(seg); 1461 reclaim = 0; 1462 goto retry; 1463 } 1464 return (EAGAIN); 1465 } 1466 1467 /* 1468 * Check for bad sizes 1469 */ 1470 if (addr < seg->s_base || addr + len > seg->s_base + seg->s_size || 1471 (len & PAGEOFFSET) || ((uintptr_t)addr & PAGEOFFSET)) { 1472 panic("segvn_unmap"); 1473 /*NOTREACHED*/ 1474 } 1475 1476 if (seg->s_szc != 0) { 1477 size_t pgsz = page_get_pagesize(seg->s_szc); 1478 int err; 1479 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { 1480 ASSERT(seg->s_base != addr || seg->s_size != len); 1481 VM_STAT_ADD(segvnvmstats.demoterange[0]); 1482 err = segvn_demote_range(seg, addr, len, SDR_END, 0); 1483 if (err == 0) { 1484 return (IE_RETRY); 1485 } 1486 return (err); 1487 } 1488 } 1489 1490 /* Inform the vnode of the unmapping. */ 1491 if (svd->vp) { 1492 int error; 1493 1494 error = VOP_DELMAP(svd->vp, 1495 (offset_t)svd->offset + (uintptr_t)(addr - seg->s_base), 1496 seg->s_as, addr, len, svd->prot, svd->maxprot, 1497 svd->type, svd->cred); 1498 1499 if (error == EAGAIN) 1500 return (error); 1501 } 1502 /* 1503 * Remove any page locks set through this mapping. 1504 */ 1505 (void) segvn_lockop(seg, addr, len, 0, MC_UNLOCK, NULL, 0); 1506 1507 /* 1508 * Unload any hardware translations in the range to be taken out. 1509 * Use a callback to invoke free_vp_pages() effectively. 
1510 */ 1511 if (svd->vp != NULL && free_pages != 0) { 1512 callback.hcb_data = seg; 1513 callback.hcb_function = segvn_hat_unload_callback; 1514 cbp = &callback; 1515 } 1516 hat_unload_callback(seg->s_as->a_hat, addr, len, HAT_UNLOAD_UNMAP, cbp); 1517 1518 /* 1519 * Check for entire segment 1520 */ 1521 if (addr == seg->s_base && len == seg->s_size) { 1522 seg_free(seg); 1523 return (0); 1524 } 1525 1526 opages = seg_pages(seg); 1527 dpages = btop(len); 1528 npages = opages - dpages; 1529 amp = svd->amp; 1530 ASSERT(amp == NULL || amp->a_szc >= seg->s_szc); 1531 1532 /* 1533 * Check for beginning of segment 1534 */ 1535 if (addr == seg->s_base) { 1536 if (svd->vpage != NULL) { 1537 size_t nbytes; 1538 struct vpage *ovpage; 1539 1540 ovpage = svd->vpage; /* keep pointer to vpage */ 1541 1542 nbytes = vpgtob(npages); 1543 svd->vpage = kmem_alloc(nbytes, KM_SLEEP); 1544 bcopy(&ovpage[dpages], svd->vpage, nbytes); 1545 1546 /* free up old vpage */ 1547 kmem_free(ovpage, vpgtob(opages)); 1548 } 1549 if (amp != NULL) { 1550 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1551 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { 1552 /* 1553 * Free up now unused parts of anon_map array. 
1554 */ 1555 if (amp->a_szc == seg->s_szc) { 1556 if (seg->s_szc != 0) { 1557 anon_free_pages(amp->ahp, 1558 svd->anon_index, len, 1559 seg->s_szc); 1560 } else { 1561 anon_free(amp->ahp, 1562 svd->anon_index, 1563 len); 1564 } 1565 } else { 1566 ASSERT(svd->type == MAP_SHARED); 1567 ASSERT(amp->a_szc > seg->s_szc); 1568 anon_shmap_free_pages(amp, 1569 svd->anon_index, len); 1570 } 1571 1572 /* 1573 * Unreserve swap space for the 1574 * unmapped chunk of this segment in 1575 * case it's MAP_SHARED 1576 */ 1577 if (svd->type == MAP_SHARED) { 1578 anon_unresv(len); 1579 amp->swresv -= len; 1580 } 1581 } 1582 ANON_LOCK_EXIT(&->a_rwlock); 1583 svd->anon_index += dpages; 1584 } 1585 if (svd->vp != NULL) 1586 svd->offset += len; 1587 1588 if (svd->swresv) { 1589 if (svd->flags & MAP_NORESERVE) { 1590 ASSERT(amp); 1591 oswresv = svd->swresv; 1592 1593 svd->swresv = ptob(anon_pages(amp->ahp, 1594 svd->anon_index, npages)); 1595 anon_unresv(oswresv - svd->swresv); 1596 } else { 1597 anon_unresv(len); 1598 svd->swresv -= len; 1599 } 1600 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 1601 seg, len, 0); 1602 } 1603 1604 seg->s_base += len; 1605 seg->s_size -= len; 1606 return (0); 1607 } 1608 1609 /* 1610 * Check for end of segment 1611 */ 1612 if (addr + len == seg->s_base + seg->s_size) { 1613 if (svd->vpage != NULL) { 1614 size_t nbytes; 1615 struct vpage *ovpage; 1616 1617 ovpage = svd->vpage; /* keep pointer to vpage */ 1618 1619 nbytes = vpgtob(npages); 1620 svd->vpage = kmem_alloc(nbytes, KM_SLEEP); 1621 bcopy(ovpage, svd->vpage, nbytes); 1622 1623 /* free up old vpage */ 1624 kmem_free(ovpage, vpgtob(opages)); 1625 1626 } 1627 if (amp != NULL) { 1628 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1629 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { 1630 /* 1631 * Free up now unused parts of anon_map array. 
1632 */ 1633 ulong_t an_idx = svd->anon_index + npages; 1634 if (amp->a_szc == seg->s_szc) { 1635 if (seg->s_szc != 0) { 1636 anon_free_pages(amp->ahp, 1637 an_idx, len, 1638 seg->s_szc); 1639 } else { 1640 anon_free(amp->ahp, an_idx, 1641 len); 1642 } 1643 } else { 1644 ASSERT(svd->type == MAP_SHARED); 1645 ASSERT(amp->a_szc > seg->s_szc); 1646 anon_shmap_free_pages(amp, 1647 an_idx, len); 1648 } 1649 1650 /* 1651 * Unreserve swap space for the 1652 * unmapped chunk of this segment in 1653 * case it's MAP_SHARED 1654 */ 1655 if (svd->type == MAP_SHARED) { 1656 anon_unresv(len); 1657 amp->swresv -= len; 1658 } 1659 } 1660 ANON_LOCK_EXIT(&->a_rwlock); 1661 } 1662 1663 if (svd->swresv) { 1664 if (svd->flags & MAP_NORESERVE) { 1665 ASSERT(amp); 1666 oswresv = svd->swresv; 1667 svd->swresv = ptob(anon_pages(amp->ahp, 1668 svd->anon_index, npages)); 1669 anon_unresv(oswresv - svd->swresv); 1670 } else { 1671 anon_unresv(len); 1672 svd->swresv -= len; 1673 } 1674 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 1675 "anon proc:%p %lu %u", seg, len, 0); 1676 } 1677 1678 seg->s_size -= len; 1679 return (0); 1680 } 1681 1682 /* 1683 * The section to go is in the middle of the segment, 1684 * have to make it into two segments. nseg is made for 1685 * the high end while seg is cut down at the low end. 
1686 */ 1687 nbase = addr + len; /* new seg base */ 1688 nsize = (seg->s_base + seg->s_size) - nbase; /* new seg size */ 1689 seg->s_size = addr - seg->s_base; /* shrink old seg */ 1690 nseg = seg_alloc(seg->s_as, nbase, nsize); 1691 if (nseg == NULL) { 1692 panic("segvn_unmap seg_alloc"); 1693 /*NOTREACHED*/ 1694 } 1695 nseg->s_ops = seg->s_ops; 1696 nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 1697 nseg->s_data = (void *)nsvd; 1698 nseg->s_szc = seg->s_szc; 1699 *nsvd = *svd; 1700 nsvd->offset = svd->offset + (uintptr_t)(nseg->s_base - seg->s_base); 1701 nsvd->swresv = 0; 1702 nsvd->softlockcnt = 0; 1703 1704 if (svd->vp != NULL) { 1705 VN_HOLD(nsvd->vp); 1706 if (nsvd->type == MAP_SHARED) 1707 lgrp_shm_policy_init(NULL, nsvd->vp); 1708 } 1709 crhold(svd->cred); 1710 1711 if (svd->vpage == NULL) { 1712 nsvd->vpage = NULL; 1713 } else { 1714 /* need to split vpage into two arrays */ 1715 size_t nbytes; 1716 struct vpage *ovpage; 1717 1718 ovpage = svd->vpage; /* keep pointer to vpage */ 1719 1720 npages = seg_pages(seg); /* seg has shrunk */ 1721 nbytes = vpgtob(npages); 1722 svd->vpage = kmem_alloc(nbytes, KM_SLEEP); 1723 1724 bcopy(ovpage, svd->vpage, nbytes); 1725 1726 npages = seg_pages(nseg); 1727 nbytes = vpgtob(npages); 1728 nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP); 1729 1730 bcopy(&ovpage[opages - npages], nsvd->vpage, nbytes); 1731 1732 /* free up old vpage */ 1733 kmem_free(ovpage, vpgtob(opages)); 1734 } 1735 1736 if (amp == NULL) { 1737 nsvd->amp = NULL; 1738 nsvd->anon_index = 0; 1739 } else { 1740 /* 1741 * Need to create a new anon map for the new segment. 1742 * We'll also allocate a new smaller array for the old 1743 * smaller segment to save space. 1744 */ 1745 opages = btop((uintptr_t)(addr - seg->s_base)); 1746 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1747 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { 1748 /* 1749 * Free up now unused parts of anon_map array. 
1750 */ 1751 ulong_t an_idx = svd->anon_index + opages; 1752 if (amp->a_szc == seg->s_szc) { 1753 if (seg->s_szc != 0) { 1754 anon_free_pages(amp->ahp, an_idx, len, 1755 seg->s_szc); 1756 } else { 1757 anon_free(amp->ahp, an_idx, 1758 len); 1759 } 1760 } else { 1761 ASSERT(svd->type == MAP_SHARED); 1762 ASSERT(amp->a_szc > seg->s_szc); 1763 anon_shmap_free_pages(amp, an_idx, len); 1764 } 1765 1766 /* 1767 * Unreserve swap space for the 1768 * unmapped chunk of this segment in 1769 * case it's MAP_SHARED 1770 */ 1771 if (svd->type == MAP_SHARED) { 1772 anon_unresv(len); 1773 amp->swresv -= len; 1774 } 1775 } 1776 nsvd->anon_index = svd->anon_index + 1777 btop((uintptr_t)(nseg->s_base - seg->s_base)); 1778 if (svd->type == MAP_SHARED) { 1779 amp->refcnt++; 1780 nsvd->amp = amp; 1781 } else { 1782 struct anon_map *namp; 1783 struct anon_hdr *nahp; 1784 1785 ASSERT(svd->type == MAP_PRIVATE); 1786 nahp = anon_create(btop(seg->s_size), ANON_SLEEP); 1787 namp = anonmap_alloc(nseg->s_size, 0); 1788 namp->a_szc = seg->s_szc; 1789 (void) anon_copy_ptr(amp->ahp, svd->anon_index, nahp, 1790 0, btop(seg->s_size), ANON_SLEEP); 1791 (void) anon_copy_ptr(amp->ahp, nsvd->anon_index, 1792 namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP); 1793 anon_release(amp->ahp, btop(amp->size)); 1794 svd->anon_index = 0; 1795 nsvd->anon_index = 0; 1796 amp->ahp = nahp; 1797 amp->size = seg->s_size; 1798 nsvd->amp = namp; 1799 } 1800 ANON_LOCK_EXIT(&->a_rwlock); 1801 } 1802 if (svd->swresv) { 1803 if (svd->flags & MAP_NORESERVE) { 1804 ASSERT(amp); 1805 oswresv = svd->swresv; 1806 svd->swresv = ptob(anon_pages(amp->ahp, 1807 svd->anon_index, btop(seg->s_size))); 1808 nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp, 1809 nsvd->anon_index, btop(nseg->s_size))); 1810 ASSERT(oswresv >= (svd->swresv + nsvd->swresv)); 1811 anon_unresv(oswresv - (svd->swresv + nsvd->swresv)); 1812 } else { 1813 if (seg->s_size + nseg->s_size + len != svd->swresv) { 1814 panic("segvn_unmap: " 1815 "cannot split swap 
reservation"); 1816 /*NOTREACHED*/ 1817 } 1818 anon_unresv(len); 1819 svd->swresv = seg->s_size; 1820 nsvd->swresv = nseg->s_size; 1821 } 1822 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 1823 seg, len, 0); 1824 } 1825 1826 return (0); /* I'm glad that's all over with! */ 1827 } 1828 1829 static void 1830 segvn_free(struct seg *seg) 1831 { 1832 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1833 pgcnt_t npages = seg_pages(seg); 1834 struct anon_map *amp; 1835 size_t len; 1836 1837 /* 1838 * We don't need any segment level locks for "segvn" data 1839 * since the address space is "write" locked. 1840 */ 1841 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 1842 1843 /* 1844 * Be sure to unlock pages. XXX Why do things get free'ed instead 1845 * of unmapped? XXX 1846 */ 1847 (void) segvn_lockop(seg, seg->s_base, seg->s_size, 1848 0, MC_UNLOCK, NULL, 0); 1849 1850 /* 1851 * Deallocate the vpage and anon pointers if necessary and possible. 1852 */ 1853 if (svd->vpage != NULL) { 1854 kmem_free(svd->vpage, vpgtob(npages)); 1855 svd->vpage = NULL; 1856 } 1857 if ((amp = svd->amp) != NULL) { 1858 /* 1859 * If there are no more references to this anon_map 1860 * structure, then deallocate the structure after freeing 1861 * up all the anon slot pointers that we can. 1862 */ 1863 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1864 ASSERT(amp->a_szc >= seg->s_szc); 1865 if (--amp->refcnt == 0) { 1866 if (svd->type == MAP_PRIVATE) { 1867 /* 1868 * Private - we only need to anon_free 1869 * the part that this segment refers to. 1870 */ 1871 if (seg->s_szc != 0) { 1872 anon_free_pages(amp->ahp, 1873 svd->anon_index, seg->s_size, 1874 seg->s_szc); 1875 } else { 1876 anon_free(amp->ahp, svd->anon_index, 1877 seg->s_size); 1878 } 1879 } else { 1880 /* 1881 * Shared - anon_free the entire 1882 * anon_map's worth of stuff and 1883 * release any swap reservation. 
1884 */ 1885 if (amp->a_szc != 0) { 1886 anon_shmap_free_pages(amp, 0, 1887 amp->size); 1888 } else { 1889 anon_free(amp->ahp, 0, amp->size); 1890 } 1891 if ((len = amp->swresv) != 0) { 1892 anon_unresv(len); 1893 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 1894 "anon proc:%p %lu %u", 1895 seg, len, 0); 1896 } 1897 } 1898 svd->amp = NULL; 1899 ANON_LOCK_EXIT(&->a_rwlock); 1900 anonmap_free(amp); 1901 } else if (svd->type == MAP_PRIVATE) { 1902 /* 1903 * We had a private mapping which still has 1904 * a held anon_map so just free up all the 1905 * anon slot pointers that we were using. 1906 */ 1907 if (seg->s_szc != 0) { 1908 anon_free_pages(amp->ahp, svd->anon_index, 1909 seg->s_size, seg->s_szc); 1910 } else { 1911 anon_free(amp->ahp, svd->anon_index, 1912 seg->s_size); 1913 } 1914 ANON_LOCK_EXIT(&->a_rwlock); 1915 } else { 1916 ANON_LOCK_EXIT(&->a_rwlock); 1917 } 1918 } 1919 1920 /* 1921 * Release swap reservation. 1922 */ 1923 if ((len = svd->swresv) != 0) { 1924 anon_unresv(svd->swresv); 1925 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 1926 seg, len, 0); 1927 svd->swresv = 0; 1928 } 1929 /* 1930 * Release claim on vnode, credentials, and finally free the 1931 * private data. 1932 */ 1933 if (svd->vp != NULL) { 1934 if (svd->type == MAP_SHARED) 1935 lgrp_shm_policy_fini(NULL, svd->vp); 1936 VN_RELE(svd->vp); 1937 svd->vp = NULL; 1938 } 1939 crfree(svd->cred); 1940 svd->cred = NULL; 1941 1942 seg->s_data = NULL; 1943 kmem_cache_free(segvn_cache, svd); 1944 } 1945 1946 ulong_t segvn_lpglck_limit = 0; 1947 /* 1948 * Support routines used by segvn_pagelock() and softlock faults for anonymous 1949 * pages to implement availrmem accounting in a way that makes sure the 1950 * same memory is accounted just once for all softlock/pagelock purposes. 
1951 * This prevents a bug when availrmem is quickly incorrectly exausted from 1952 * several pagelocks to different parts of the same large page since each 1953 * pagelock has to decrement availrmem by the size of the entire large 1954 * page. Note those pages are not COW shared until softunlock/pageunlock so 1955 * we don't need to use cow style accounting here. We also need to make sure 1956 * the entire large page is accounted even if softlock range is less than the 1957 * entire large page because large anon pages can't be demoted when any of 1958 * constituent pages is locked. The caller calls this routine for every page_t 1959 * it locks. The very first page in the range may not be the root page of a 1960 * large page. For all other pages it's guranteed we are going to visit the 1961 * root of a particular large page before any other constituent page as we are 1962 * locking sequential pages belonging to the same anon map. So we do all the 1963 * locking when the root is encountered except for the very first page. Since 1964 * softlocking is not supported (except S_READ_NOCOW special case) for vmpss 1965 * segments and since vnode pages can be demoted without locking all 1966 * constituent pages vnode pages don't come here. Unlocking relies on the 1967 * fact that pagesize can't change whenever any of constituent large pages is 1968 * locked at least SE_SHARED. This allows unlocking code to find the right 1969 * root and decrement availrmem by the same amount it was incremented when the 1970 * page was locked. 1971 */ 1972 static int 1973 segvn_pp_lock_anonpages(page_t *pp, int first) 1974 { 1975 pgcnt_t pages; 1976 pfn_t pfn; 1977 uchar_t szc = pp->p_szc; 1978 1979 ASSERT(PAGE_LOCKED(pp)); 1980 ASSERT(pp->p_vnode != NULL); 1981 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 1982 1983 /* 1984 * pagesize won't change as long as any constituent page is locked. 
1985 */ 1986 pages = page_get_pagecnt(pp->p_szc); 1987 pfn = page_pptonum(pp); 1988 1989 if (!first) { 1990 if (!IS_P2ALIGNED(pfn, pages)) { 1991 #ifdef DEBUG 1992 pp = &pp[-(spgcnt_t)(pfn & (pages - 1))]; 1993 pfn = page_pptonum(pp); 1994 ASSERT(IS_P2ALIGNED(pfn, pages)); 1995 ASSERT(pp->p_szc == szc); 1996 ASSERT(pp->p_vnode != NULL); 1997 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 1998 ASSERT(pp->p_slckcnt != 0); 1999 #endif /* DEBUG */ 2000 return (1); 2001 } 2002 } else if (!IS_P2ALIGNED(pfn, pages)) { 2003 pp = &pp[-(spgcnt_t)(pfn & (pages - 1))]; 2004 #ifdef DEBUG 2005 pfn = page_pptonum(pp); 2006 ASSERT(IS_P2ALIGNED(pfn, pages)); 2007 ASSERT(pp->p_szc == szc); 2008 ASSERT(pp->p_vnode != NULL); 2009 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 2010 #endif /* DEBUG */ 2011 } 2012 2013 /* 2014 * pp is a root page. 2015 * We haven't locked this large page yet. 2016 */ 2017 page_struct_lock(pp); 2018 if (pp->p_slckcnt != 0) { 2019 if (pp->p_slckcnt < PAGE_SLOCK_MAXIMUM) { 2020 pp->p_slckcnt++; 2021 page_struct_unlock(pp); 2022 return (1); 2023 } 2024 page_struct_unlock(pp); 2025 segvn_lpglck_limit++; 2026 return (0); 2027 } 2028 mutex_enter(&freemem_lock); 2029 if (availrmem < tune.t_minarmem + pages) { 2030 mutex_exit(&freemem_lock); 2031 page_struct_unlock(pp); 2032 return (0); 2033 } 2034 pp->p_slckcnt++; 2035 availrmem -= pages; 2036 mutex_exit(&freemem_lock); 2037 page_struct_unlock(pp); 2038 return (1); 2039 } 2040 2041 static void 2042 segvn_pp_unlock_anonpages(page_t *pp, int first) 2043 { 2044 pgcnt_t pages; 2045 pfn_t pfn; 2046 2047 ASSERT(PAGE_LOCKED(pp)); 2048 ASSERT(pp->p_vnode != NULL); 2049 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 2050 2051 /* 2052 * pagesize won't change as long as any constituent page is locked. 
2053 */ 2054 pages = page_get_pagecnt(pp->p_szc); 2055 pfn = page_pptonum(pp); 2056 2057 if (!first) { 2058 if (!IS_P2ALIGNED(pfn, pages)) { 2059 return; 2060 } 2061 } else if (!IS_P2ALIGNED(pfn, pages)) { 2062 pp = &pp[-(spgcnt_t)(pfn & (pages - 1))]; 2063 #ifdef DEBUG 2064 pfn = page_pptonum(pp); 2065 ASSERT(IS_P2ALIGNED(pfn, pages)); 2066 #endif /* DEBUG */ 2067 } 2068 ASSERT(pp->p_vnode != NULL); 2069 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 2070 ASSERT(pp->p_slckcnt != 0); 2071 page_struct_lock(pp); 2072 if (--pp->p_slckcnt == 0) { 2073 mutex_enter(&freemem_lock); 2074 availrmem += pages; 2075 mutex_exit(&freemem_lock); 2076 } 2077 page_struct_unlock(pp); 2078 } 2079 2080 /* 2081 * Do a F_SOFTUNLOCK call over the range requested. The range must have 2082 * already been F_SOFTLOCK'ed. 2083 * Caller must always match addr and len of a softunlock with a previous 2084 * softlock with exactly the same addr and len. 2085 */ 2086 static void 2087 segvn_softunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw) 2088 { 2089 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 2090 page_t *pp; 2091 caddr_t adr; 2092 struct vnode *vp; 2093 u_offset_t offset; 2094 ulong_t anon_index; 2095 struct anon_map *amp; 2096 struct anon *ap = NULL; 2097 2098 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 2099 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 2100 2101 if ((amp = svd->amp) != NULL) 2102 anon_index = svd->anon_index + seg_page(seg, addr); 2103 2104 hat_unlock(seg->s_as->a_hat, addr, len); 2105 for (adr = addr; adr < addr + len; adr += PAGESIZE) { 2106 if (amp != NULL) { 2107 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 2108 if ((ap = anon_get_ptr(amp->ahp, anon_index++)) 2109 != NULL) { 2110 swap_xlate(ap, &vp, &offset); 2111 } else { 2112 vp = svd->vp; 2113 offset = svd->offset + 2114 (uintptr_t)(adr - seg->s_base); 2115 } 2116 ANON_LOCK_EXIT(&->a_rwlock); 2117 } else { 2118 vp = svd->vp; 2119 offset = svd->offset + 2120 (uintptr_t)(adr - 
seg->s_base); 2121 } 2122 2123 /* 2124 * Use page_find() instead of page_lookup() to 2125 * find the page since we know that it is locked. 2126 */ 2127 pp = page_find(vp, offset); 2128 if (pp == NULL) { 2129 panic( 2130 "segvn_softunlock: addr %p, ap %p, vp %p, off %llx", 2131 (void *)adr, (void *)ap, (void *)vp, offset); 2132 /*NOTREACHED*/ 2133 } 2134 2135 if (rw == S_WRITE) { 2136 hat_setrefmod(pp); 2137 if (seg->s_as->a_vbits) 2138 hat_setstat(seg->s_as, adr, PAGESIZE, 2139 P_REF | P_MOD); 2140 } else if (rw != S_OTHER) { 2141 hat_setref(pp); 2142 if (seg->s_as->a_vbits) 2143 hat_setstat(seg->s_as, adr, PAGESIZE, P_REF); 2144 } 2145 TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT, 2146 "segvn_fault:pp %p vp %p offset %llx", pp, vp, offset); 2147 if (svd->vp == NULL) { 2148 segvn_pp_unlock_anonpages(pp, adr == addr); 2149 } 2150 page_unlock(pp); 2151 } 2152 mutex_enter(&freemem_lock); /* for availrmem */ 2153 if (svd->vp != NULL) { 2154 availrmem += btop(len); 2155 } 2156 segvn_pages_locked -= btop(len); 2157 svd->softlockcnt -= btop(len); 2158 mutex_exit(&freemem_lock); 2159 if (svd->softlockcnt == 0) { 2160 /* 2161 * All SOFTLOCKS are gone. Wakeup any waiting 2162 * unmappers so they can try again to unmap. 2163 * Check for waiters first without the mutex 2164 * held so we don't always grab the mutex on 2165 * softunlocks. 2166 */ 2167 if (AS_ISUNMAPWAIT(seg->s_as)) { 2168 mutex_enter(&seg->s_as->a_contents); 2169 if (AS_ISUNMAPWAIT(seg->s_as)) { 2170 AS_CLRUNMAPWAIT(seg->s_as); 2171 cv_broadcast(&seg->s_as->a_cv); 2172 } 2173 mutex_exit(&seg->s_as->a_contents); 2174 } 2175 } 2176 } 2177 2178 #define PAGE_HANDLED ((page_t *)-1) 2179 2180 /* 2181 * Release all the pages in the NULL terminated ppp list 2182 * which haven't already been converted to PAGE_HANDLED. 
2183 */ 2184 static void 2185 segvn_pagelist_rele(page_t **ppp) 2186 { 2187 for (; *ppp != NULL; ppp++) { 2188 if (*ppp != PAGE_HANDLED) 2189 page_unlock(*ppp); 2190 } 2191 } 2192 2193 static int stealcow = 1; 2194 2195 /* 2196 * Workaround for viking chip bug. See bug id 1220902. 2197 * To fix this down in pagefault() would require importing so 2198 * much as and segvn code as to be unmaintainable. 2199 */ 2200 int enable_mbit_wa = 0; 2201 2202 /* 2203 * Handles all the dirty work of getting the right 2204 * anonymous pages and loading up the translations. 2205 * This routine is called only from segvn_fault() 2206 * when looping over the range of addresses requested. 2207 * 2208 * The basic algorithm here is: 2209 * If this is an anon_zero case 2210 * Call anon_zero to allocate page 2211 * Load up translation 2212 * Return 2213 * endif 2214 * If this is an anon page 2215 * Use anon_getpage to get the page 2216 * else 2217 * Find page in pl[] list passed in 2218 * endif 2219 * If not a cow 2220 * Load up the translation to the page 2221 * return 2222 * endif 2223 * Call anon_private to handle cow 2224 * Load up (writable) translation to new page 2225 */ 2226 static faultcode_t 2227 segvn_faultpage( 2228 struct hat *hat, /* the hat to use for mapping */ 2229 struct seg *seg, /* seg_vn of interest */ 2230 caddr_t addr, /* address in as */ 2231 u_offset_t off, /* offset in vp */ 2232 struct vpage *vpage, /* pointer to vpage for vp, off */ 2233 page_t *pl[], /* object source page pointer */ 2234 uint_t vpprot, /* access allowed to object pages */ 2235 enum fault_type type, /* type of fault */ 2236 enum seg_rw rw, /* type of access at fault */ 2237 int brkcow, /* we may need to break cow */ 2238 int first) /* first page for this fault if 1 */ 2239 { 2240 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 2241 page_t *pp, **ppp; 2242 uint_t pageflags = 0; 2243 page_t *anon_pl[1 + 1]; 2244 page_t *opp = NULL; /* original page */ 2245 uint_t prot; 2246 int err; 
2247 int cow; 2248 int claim; 2249 int steal = 0; 2250 ulong_t anon_index; 2251 struct anon *ap, *oldap; 2252 struct anon_map *amp; 2253 int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD; 2254 int anon_lock = 0; 2255 anon_sync_obj_t cookie; 2256 2257 if (svd->flags & MAP_TEXT) { 2258 hat_flag |= HAT_LOAD_TEXT; 2259 } 2260 2261 ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock)); 2262 ASSERT(seg->s_szc == 0); 2263 2264 /* 2265 * Initialize protection value for this page. 2266 * If we have per page protection values check it now. 2267 */ 2268 if (svd->pageprot) { 2269 uint_t protchk; 2270 2271 switch (rw) { 2272 case S_READ: 2273 protchk = PROT_READ; 2274 break; 2275 case S_WRITE: 2276 protchk = PROT_WRITE; 2277 break; 2278 case S_EXEC: 2279 protchk = PROT_EXEC; 2280 break; 2281 case S_OTHER: 2282 default: 2283 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 2284 break; 2285 } 2286 2287 prot = VPP_PROT(vpage); 2288 if ((prot & protchk) == 0) 2289 return (FC_PROT); /* illegal access type */ 2290 } else { 2291 prot = svd->prot; 2292 } 2293 2294 if (type == F_SOFTLOCK && svd->vp != NULL) { 2295 mutex_enter(&freemem_lock); 2296 if (availrmem <= tune.t_minarmem) { 2297 mutex_exit(&freemem_lock); 2298 return (FC_MAKE_ERR(ENOMEM)); /* out of real memory */ 2299 } else { 2300 availrmem--; 2301 svd->softlockcnt++; 2302 segvn_pages_locked++; 2303 } 2304 mutex_exit(&freemem_lock); 2305 } 2306 2307 /* 2308 * Always acquire the anon array lock to prevent 2 threads from 2309 * allocating separate anon slots for the same "addr". 2310 */ 2311 2312 if ((amp = svd->amp) != NULL) { 2313 ASSERT(RW_READ_HELD(&->a_rwlock)); 2314 anon_index = svd->anon_index + seg_page(seg, addr); 2315 anon_array_enter(amp, anon_index, &cookie); 2316 anon_lock = 1; 2317 } 2318 2319 if (svd->vp == NULL && amp != NULL) { 2320 if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) { 2321 /* 2322 * Allocate a (normally) writable anonymous page of 2323 * zeroes. If no advance reservations, reserve now. 
2324 */ 2325 if (svd->flags & MAP_NORESERVE) { 2326 if (anon_resv(ptob(1))) { 2327 svd->swresv += ptob(1); 2328 } else { 2329 err = ENOMEM; 2330 goto out; 2331 } 2332 } 2333 if ((pp = anon_zero(seg, addr, &ap, 2334 svd->cred)) == NULL) { 2335 err = ENOMEM; 2336 goto out; /* out of swap space */ 2337 } 2338 /* 2339 * Re-acquire the anon_map lock and 2340 * initialize the anon array entry. 2341 */ 2342 (void) anon_set_ptr(amp->ahp, anon_index, ap, 2343 ANON_SLEEP); 2344 2345 ASSERT(pp->p_szc == 0); 2346 2347 /* 2348 * Handle pages that have been marked for migration 2349 */ 2350 if (lgrp_optimizations()) 2351 page_migrate(seg, addr, &pp, 1); 2352 2353 if (type == F_SOFTLOCK) { 2354 if (!segvn_pp_lock_anonpages(pp, first)) { 2355 page_unlock(pp); 2356 err = ENOMEM; 2357 goto out; 2358 } else { 2359 mutex_enter(&freemem_lock); 2360 svd->softlockcnt++; 2361 segvn_pages_locked++; 2362 mutex_exit(&freemem_lock); 2363 } 2364 } 2365 2366 if (enable_mbit_wa) { 2367 if (rw == S_WRITE) 2368 hat_setmod(pp); 2369 else if (!hat_ismod(pp)) 2370 prot &= ~PROT_WRITE; 2371 } 2372 /* 2373 * If AS_PAGLCK is set in a_flags (via memcntl(2) 2374 * with MC_LOCKAS, MCL_FUTURE) and this is a 2375 * MAP_NORESERVE segment, we may need to 2376 * permanently lock the page as it is being faulted 2377 * for the first time. The following text applies 2378 * only to MAP_NORESERVE segments: 2379 * 2380 * As per memcntl(2), if this segment was created 2381 * after MCL_FUTURE was applied (a "future" 2382 * segment), its pages must be locked. If this 2383 * segment existed at MCL_FUTURE application (a 2384 * "past" segment), the interface is unclear. 2385 * 2386 * We decide to lock only if vpage is present: 2387 * 2388 * - "future" segments will have a vpage array (see 2389 * as_map), and so will be locked as required 2390 * 2391 * - "past" segments may not have a vpage array, 2392 * depending on whether events (such as 2393 * mprotect) have occurred. 
Locking if vpage 2394 * exists will preserve legacy behavior. Not 2395 * locking if vpage is absent, will not break 2396 * the interface or legacy behavior. Note that 2397 * allocating vpage here if it's absent requires 2398 * upgrading the segvn reader lock, the cost of 2399 * which does not seem worthwhile. 2400 * 2401 * Usually testing and setting VPP_ISPPLOCK and 2402 * VPP_SETPPLOCK requires holding the segvn lock as 2403 * writer, but in this case all readers are 2404 * serializing on the anon array lock. 2405 */ 2406 if (AS_ISPGLCK(seg->s_as) && vpage != NULL && 2407 (svd->flags & MAP_NORESERVE) && 2408 !VPP_ISPPLOCK(vpage)) { 2409 proc_t *p = seg->s_as->a_proc; 2410 ASSERT(svd->type == MAP_PRIVATE); 2411 mutex_enter(&p->p_lock); 2412 if (rctl_incr_locked_mem(p, NULL, PAGESIZE, 2413 1) == 0) { 2414 claim = VPP_PROT(vpage) & PROT_WRITE; 2415 if (page_pp_lock(pp, claim, 0)) { 2416 VPP_SETPPLOCK(vpage); 2417 } else { 2418 rctl_decr_locked_mem(p, NULL, 2419 PAGESIZE, 1); 2420 } 2421 } 2422 mutex_exit(&p->p_lock); 2423 } 2424 2425 hat_memload(hat, addr, pp, prot, hat_flag); 2426 2427 if (!(hat_flag & HAT_LOAD_LOCK)) 2428 page_unlock(pp); 2429 2430 anon_array_exit(&cookie); 2431 return (0); 2432 } 2433 } 2434 2435 /* 2436 * Obtain the page structure via anon_getpage() if it is 2437 * a private copy of an object (the result of a previous 2438 * copy-on-write). 2439 */ 2440 if (amp != NULL) { 2441 if ((ap = anon_get_ptr(amp->ahp, anon_index)) != NULL) { 2442 err = anon_getpage(&ap, &vpprot, anon_pl, PAGESIZE, 2443 seg, addr, rw, svd->cred); 2444 if (err) 2445 goto out; 2446 2447 if (svd->type == MAP_SHARED) { 2448 /* 2449 * If this is a shared mapping to an 2450 * anon_map, then ignore the write 2451 * permissions returned by anon_getpage(). 2452 * They apply to the private mappings 2453 * of this anon_map. 
2454 */ 2455 vpprot |= PROT_WRITE; 2456 } 2457 opp = anon_pl[0]; 2458 } 2459 } 2460 2461 /* 2462 * Search the pl[] list passed in if it is from the 2463 * original object (i.e., not a private copy). 2464 */ 2465 if (opp == NULL) { 2466 /* 2467 * Find original page. We must be bringing it in 2468 * from the list in pl[]. 2469 */ 2470 for (ppp = pl; (opp = *ppp) != NULL; ppp++) { 2471 if (opp == PAGE_HANDLED) 2472 continue; 2473 ASSERT(opp->p_vnode == svd->vp); /* XXX */ 2474 if (opp->p_offset == off) 2475 break; 2476 } 2477 if (opp == NULL) { 2478 panic("segvn_faultpage not found"); 2479 /*NOTREACHED*/ 2480 } 2481 *ppp = PAGE_HANDLED; 2482 2483 } 2484 2485 ASSERT(PAGE_LOCKED(opp)); 2486 2487 TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT, 2488 "segvn_fault:pp %p vp %p offset %llx", 2489 opp, NULL, 0); 2490 2491 /* 2492 * The fault is treated as a copy-on-write fault if a 2493 * write occurs on a private segment and the object 2494 * page (i.e., mapping) is write protected. We assume 2495 * that fatal protection checks have already been made. 2496 */ 2497 2498 cow = brkcow && ((vpprot & PROT_WRITE) == 0); 2499 2500 /* 2501 * If not a copy-on-write case load the translation 2502 * and return. 
2503 */ 2504 if (cow == 0) { 2505 2506 /* 2507 * Handle pages that have been marked for migration 2508 */ 2509 if (lgrp_optimizations()) 2510 page_migrate(seg, addr, &opp, 1); 2511 2512 if (type == F_SOFTLOCK && svd->vp == NULL) { 2513 2514 ASSERT(opp->p_szc == 0 || 2515 (svd->type == MAP_SHARED && 2516 amp != NULL && amp->a_szc != 0)); 2517 2518 if (!segvn_pp_lock_anonpages(opp, first)) { 2519 page_unlock(opp); 2520 err = ENOMEM; 2521 goto out; 2522 } else { 2523 mutex_enter(&freemem_lock); 2524 svd->softlockcnt++; 2525 segvn_pages_locked++; 2526 mutex_exit(&freemem_lock); 2527 } 2528 } 2529 if (IS_VMODSORT(opp->p_vnode) || enable_mbit_wa) { 2530 if (rw == S_WRITE) 2531 hat_setmod(opp); 2532 else if (rw != S_OTHER && !hat_ismod(opp)) 2533 prot &= ~PROT_WRITE; 2534 } 2535 2536 hat_memload(hat, addr, opp, prot & vpprot, hat_flag); 2537 2538 if (!(hat_flag & HAT_LOAD_LOCK)) 2539 page_unlock(opp); 2540 2541 if (anon_lock) { 2542 anon_array_exit(&cookie); 2543 } 2544 return (0); 2545 } 2546 2547 hat_setref(opp); 2548 2549 ASSERT(amp != NULL && anon_lock); 2550 2551 /* 2552 * Steal the page only if it isn't a private page 2553 * since stealing a private page is not worth the effort. 2554 */ 2555 if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) 2556 steal = 1; 2557 2558 /* 2559 * Steal the original page if the following conditions are true: 2560 * 2561 * We are low on memory, the page is not private, page is not large, 2562 * not shared, not modified, not `locked' or if we have it `locked' 2563 * (i.e., p_cowcnt == 1 and p_lckcnt == 0, which also implies 2564 * that the page is not shared) and if it doesn't have any 2565 * translations. page_struct_lock isn't needed to look at p_cowcnt 2566 * and p_lckcnt because we first get exclusive lock on page. 
2567 */ 2568 (void) hat_pagesync(opp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD); 2569 2570 if (stealcow && freemem < minfree && steal && opp->p_szc == 0 && 2571 page_tryupgrade(opp) && !hat_ismod(opp) && 2572 ((opp->p_lckcnt == 0 && opp->p_cowcnt == 0) || 2573 (opp->p_lckcnt == 0 && opp->p_cowcnt == 1 && 2574 vpage != NULL && VPP_ISPPLOCK(vpage)))) { 2575 /* 2576 * Check if this page has other translations 2577 * after unloading our translation. 2578 */ 2579 if (hat_page_is_mapped(opp)) { 2580 hat_unload(seg->s_as->a_hat, addr, PAGESIZE, 2581 HAT_UNLOAD); 2582 } 2583 2584 /* 2585 * hat_unload() might sync back someone else's recent 2586 * modification, so check again. 2587 */ 2588 if (!hat_ismod(opp) && !hat_page_is_mapped(opp)) 2589 pageflags |= STEAL_PAGE; 2590 } 2591 2592 /* 2593 * If we have a vpage pointer, see if it indicates that we have 2594 * ``locked'' the page we map -- if so, tell anon_private to 2595 * transfer the locking resource to the new page. 2596 * 2597 * See Statement at the beginning of segvn_lockop regarding 2598 * the way lockcnts/cowcnts are handled during COW. 2599 * 2600 */ 2601 if (vpage != NULL && VPP_ISPPLOCK(vpage)) 2602 pageflags |= LOCK_PAGE; 2603 2604 /* 2605 * Allocate a private page and perform the copy. 2606 * For MAP_NORESERVE reserve swap space now, unless this 2607 * is a cow fault on an existing anon page in which case 2608 * MAP_NORESERVE will have made advance reservations. 2609 */ 2610 if ((svd->flags & MAP_NORESERVE) && (ap == NULL)) { 2611 if (anon_resv(ptob(1))) { 2612 svd->swresv += ptob(1); 2613 } else { 2614 page_unlock(opp); 2615 err = ENOMEM; 2616 goto out; 2617 } 2618 } 2619 oldap = ap; 2620 pp = anon_private(&ap, seg, addr, prot, opp, pageflags, svd->cred); 2621 if (pp == NULL) { 2622 err = ENOMEM; /* out of swap space */ 2623 goto out; 2624 } 2625 2626 /* 2627 * If we copied away from an anonymous page, then 2628 * we are one step closer to freeing up an anon slot. 
2629 * 2630 * NOTE: The original anon slot must be released while 2631 * holding the "anon_map" lock. This is necessary to prevent 2632 * other threads from obtaining a pointer to the anon slot 2633 * which may be freed if its "refcnt" is 1. 2634 */ 2635 if (oldap != NULL) 2636 anon_decref(oldap); 2637 2638 (void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP); 2639 2640 /* 2641 * Handle pages that have been marked for migration 2642 */ 2643 if (lgrp_optimizations()) 2644 page_migrate(seg, addr, &pp, 1); 2645 2646 ASSERT(pp->p_szc == 0); 2647 if (type == F_SOFTLOCK && svd->vp == NULL) { 2648 if (!segvn_pp_lock_anonpages(pp, first)) { 2649 page_unlock(pp); 2650 err = ENOMEM; 2651 goto out; 2652 } else { 2653 mutex_enter(&freemem_lock); 2654 svd->softlockcnt++; 2655 segvn_pages_locked++; 2656 mutex_exit(&freemem_lock); 2657 } 2658 } 2659 2660 ASSERT(!IS_VMODSORT(pp->p_vnode)); 2661 if (enable_mbit_wa) { 2662 if (rw == S_WRITE) 2663 hat_setmod(pp); 2664 else if (!hat_ismod(pp)) 2665 prot &= ~PROT_WRITE; 2666 } 2667 2668 hat_memload(hat, addr, pp, prot, hat_flag); 2669 2670 if (!(hat_flag & HAT_LOAD_LOCK)) 2671 page_unlock(pp); 2672 2673 ASSERT(anon_lock); 2674 anon_array_exit(&cookie); 2675 return (0); 2676 out: 2677 if (anon_lock) 2678 anon_array_exit(&cookie); 2679 2680 if (type == F_SOFTLOCK && svd->vp != NULL) { 2681 mutex_enter(&freemem_lock); 2682 availrmem++; 2683 segvn_pages_locked--; 2684 svd->softlockcnt--; 2685 mutex_exit(&freemem_lock); 2686 } 2687 return (FC_MAKE_ERR(err)); 2688 } 2689 2690 /* 2691 * relocate a bunch of smaller targ pages into one large repl page. all targ 2692 * pages must be complete pages smaller than replacement pages. 2693 * it's assumed that no page's szc can change since they are all PAGESIZE or 2694 * complete large pages locked SHARED. 
2695 */ 2696 static void 2697 segvn_relocate_pages(page_t **targ, page_t *replacement) 2698 { 2699 page_t *pp; 2700 pgcnt_t repl_npgs, curnpgs; 2701 pgcnt_t i; 2702 uint_t repl_szc = replacement->p_szc; 2703 page_t *first_repl = replacement; 2704 page_t *repl; 2705 spgcnt_t npgs; 2706 2707 VM_STAT_ADD(segvnvmstats.relocatepages[0]); 2708 2709 ASSERT(repl_szc != 0); 2710 npgs = repl_npgs = page_get_pagecnt(repl_szc); 2711 2712 i = 0; 2713 while (repl_npgs) { 2714 spgcnt_t nreloc; 2715 int err; 2716 ASSERT(replacement != NULL); 2717 pp = targ[i]; 2718 ASSERT(pp->p_szc < repl_szc); 2719 ASSERT(PAGE_EXCL(pp)); 2720 ASSERT(!PP_ISFREE(pp)); 2721 curnpgs = page_get_pagecnt(pp->p_szc); 2722 if (curnpgs == 1) { 2723 VM_STAT_ADD(segvnvmstats.relocatepages[1]); 2724 repl = replacement; 2725 page_sub(&replacement, repl); 2726 ASSERT(PAGE_EXCL(repl)); 2727 ASSERT(!PP_ISFREE(repl)); 2728 ASSERT(repl->p_szc == repl_szc); 2729 } else { 2730 page_t *repl_savepp; 2731 int j; 2732 VM_STAT_ADD(segvnvmstats.relocatepages[2]); 2733 repl_savepp = replacement; 2734 for (j = 0; j < curnpgs; j++) { 2735 repl = replacement; 2736 page_sub(&replacement, repl); 2737 ASSERT(PAGE_EXCL(repl)); 2738 ASSERT(!PP_ISFREE(repl)); 2739 ASSERT(repl->p_szc == repl_szc); 2740 ASSERT(page_pptonum(targ[i + j]) == 2741 page_pptonum(targ[i]) + j); 2742 } 2743 repl = repl_savepp; 2744 ASSERT(IS_P2ALIGNED(page_pptonum(repl), curnpgs)); 2745 } 2746 err = page_relocate(&pp, &repl, 0, 1, &nreloc, NULL); 2747 if (err || nreloc != curnpgs) { 2748 panic("segvn_relocate_pages: " 2749 "page_relocate failed err=%d curnpgs=%ld " 2750 "nreloc=%ld", err, curnpgs, nreloc); 2751 } 2752 ASSERT(curnpgs <= repl_npgs); 2753 repl_npgs -= curnpgs; 2754 i += curnpgs; 2755 } 2756 ASSERT(replacement == NULL); 2757 2758 repl = first_repl; 2759 repl_npgs = npgs; 2760 for (i = 0; i < repl_npgs; i++) { 2761 ASSERT(PAGE_EXCL(repl)); 2762 ASSERT(!PP_ISFREE(repl)); 2763 targ[i] = repl; 2764 page_downgrade(targ[i]); 2765 repl++; 2766 } 2767 } 
2768 2769 /* 2770 * Check if all pages in ppa array are complete smaller than szc pages and 2771 * their roots will still be aligned relative to their current size if the 2772 * entire ppa array is relocated into one szc page. If these conditions are 2773 * not met return 0. 2774 * 2775 * If all pages are properly aligned attempt to upgrade their locks 2776 * to exclusive mode. If it fails set *upgrdfail to 1 and return 0. 2777 * upgrdfail was set to 0 by caller. 2778 * 2779 * Return 1 if all pages are aligned and locked exclusively. 2780 * 2781 * If all pages in ppa array happen to be physically contiguous to make one 2782 * szc page and all exclusive locks are successfully obtained promote the page 2783 * size to szc and set *pszc to szc. Return 1 with pages locked shared. 2784 */ 2785 static int 2786 segvn_full_szcpages(page_t **ppa, uint_t szc, int *upgrdfail, uint_t *pszc) 2787 { 2788 page_t *pp; 2789 pfn_t pfn; 2790 pgcnt_t totnpgs = page_get_pagecnt(szc); 2791 pfn_t first_pfn; 2792 int contig = 1; 2793 pgcnt_t i; 2794 pgcnt_t j; 2795 uint_t curszc; 2796 pgcnt_t curnpgs; 2797 int root = 0; 2798 2799 ASSERT(szc > 0); 2800 2801 VM_STAT_ADD(segvnvmstats.fullszcpages[0]); 2802 2803 for (i = 0; i < totnpgs; i++) { 2804 pp = ppa[i]; 2805 ASSERT(PAGE_SHARED(pp)); 2806 ASSERT(!PP_ISFREE(pp)); 2807 pfn = page_pptonum(pp); 2808 if (i == 0) { 2809 if (!IS_P2ALIGNED(pfn, totnpgs)) { 2810 contig = 0; 2811 } else { 2812 first_pfn = pfn; 2813 } 2814 } else if (contig && pfn != first_pfn + i) { 2815 contig = 0; 2816 } 2817 if (pp->p_szc == 0) { 2818 if (root) { 2819 VM_STAT_ADD(segvnvmstats.fullszcpages[1]); 2820 return (0); 2821 } 2822 } else if (!root) { 2823 if ((curszc = pp->p_szc) >= szc) { 2824 VM_STAT_ADD(segvnvmstats.fullszcpages[2]); 2825 return (0); 2826 } 2827 if (curszc == 0) { 2828 /* 2829 * p_szc changed means we don't have all pages 2830 * locked. return failure. 
2831 */ 2832 VM_STAT_ADD(segvnvmstats.fullszcpages[3]); 2833 return (0); 2834 } 2835 curnpgs = page_get_pagecnt(curszc); 2836 if (!IS_P2ALIGNED(pfn, curnpgs) || 2837 !IS_P2ALIGNED(i, curnpgs)) { 2838 VM_STAT_ADD(segvnvmstats.fullszcpages[4]); 2839 return (0); 2840 } 2841 root = 1; 2842 } else { 2843 ASSERT(i > 0); 2844 VM_STAT_ADD(segvnvmstats.fullszcpages[5]); 2845 if (pp->p_szc != curszc) { 2846 VM_STAT_ADD(segvnvmstats.fullszcpages[6]); 2847 return (0); 2848 } 2849 if (pfn - 1 != page_pptonum(ppa[i - 1])) { 2850 panic("segvn_full_szcpages: " 2851 "large page not physically contiguous"); 2852 } 2853 if (P2PHASE(pfn, curnpgs) == curnpgs - 1) { 2854 root = 0; 2855 } 2856 } 2857 } 2858 2859 for (i = 0; i < totnpgs; i++) { 2860 ASSERT(ppa[i]->p_szc < szc); 2861 if (!page_tryupgrade(ppa[i])) { 2862 for (j = 0; j < i; j++) { 2863 page_downgrade(ppa[j]); 2864 } 2865 *pszc = ppa[i]->p_szc; 2866 *upgrdfail = 1; 2867 VM_STAT_ADD(segvnvmstats.fullszcpages[7]); 2868 return (0); 2869 } 2870 } 2871 2872 /* 2873 * When a page is put a free cachelist its szc is set to 0. if file 2874 * system reclaimed pages from cachelist targ pages will be physically 2875 * contiguous with 0 p_szc. in this case just upgrade szc of targ 2876 * pages without any relocations. 2877 * To avoid any hat issues with previous small mappings 2878 * hat_pageunload() the target pages first. 
2879 */ 2880 if (contig) { 2881 VM_STAT_ADD(segvnvmstats.fullszcpages[8]); 2882 for (i = 0; i < totnpgs; i++) { 2883 (void) hat_pageunload(ppa[i], HAT_FORCE_PGUNLOAD); 2884 } 2885 for (i = 0; i < totnpgs; i++) { 2886 ppa[i]->p_szc = szc; 2887 } 2888 for (i = 0; i < totnpgs; i++) { 2889 ASSERT(PAGE_EXCL(ppa[i])); 2890 page_downgrade(ppa[i]); 2891 } 2892 if (pszc != NULL) { 2893 *pszc = szc; 2894 } 2895 } 2896 VM_STAT_ADD(segvnvmstats.fullszcpages[9]); 2897 return (1); 2898 } 2899 2900 /* 2901 * Create physically contiguous pages for [vp, off] - [vp, off + 2902 * page_size(szc)) range and for private segment return them in ppa array. 2903 * Pages are created either via IO or relocations. 2904 * 2905 * Return 1 on sucess and 0 on failure. 2906 * 2907 * If physically contiguos pages already exist for this range return 1 without 2908 * filling ppa array. Caller initializes ppa[0] as NULL to detect that ppa 2909 * array wasn't filled. In this case caller fills ppa array via VOP_GETPAGE(). 2910 */ 2911 2912 static int 2913 segvn_fill_vp_pages(struct segvn_data *svd, vnode_t *vp, u_offset_t off, 2914 uint_t szc, page_t **ppa, page_t **ppplist, uint_t *ret_pszc, 2915 int *downsize) 2916 2917 { 2918 page_t *pplist = *ppplist; 2919 size_t pgsz = page_get_pagesize(szc); 2920 pgcnt_t pages = btop(pgsz); 2921 ulong_t start_off = off; 2922 u_offset_t eoff = off + pgsz; 2923 spgcnt_t nreloc; 2924 u_offset_t io_off = off; 2925 size_t io_len; 2926 page_t *io_pplist = NULL; 2927 page_t *done_pplist = NULL; 2928 pgcnt_t pgidx = 0; 2929 page_t *pp; 2930 page_t *newpp; 2931 page_t *targpp; 2932 int io_err = 0; 2933 int i; 2934 pfn_t pfn; 2935 ulong_t ppages; 2936 page_t *targ_pplist = NULL; 2937 page_t *repl_pplist = NULL; 2938 page_t *tmp_pplist; 2939 int nios = 0; 2940 uint_t pszc; 2941 struct vattr va; 2942 2943 VM_STAT_ADD(segvnvmstats.fill_vp_pages[0]); 2944 2945 ASSERT(szc != 0); 2946 ASSERT(pplist->p_szc == szc); 2947 2948 /* 2949 * downsize will be set to 1 only if we fail to 
lock pages. this will 2950 * allow subsequent faults to try to relocate the page again. If we 2951 * fail due to misalignment don't downsize and let the caller map the 2952 * whole region with small mappings to avoid more faults into the area 2953 * where we can't get large pages anyway. 2954 */ 2955 *downsize = 0; 2956 2957 while (off < eoff) { 2958 newpp = pplist; 2959 ASSERT(newpp != NULL); 2960 ASSERT(PAGE_EXCL(newpp)); 2961 ASSERT(!PP_ISFREE(newpp)); 2962 /* 2963 * we pass NULL for nrelocp to page_lookup_create() 2964 * so that it doesn't relocate. We relocate here 2965 * later only after we make sure we can lock all 2966 * pages in the range we handle and they are all 2967 * aligned. 2968 */ 2969 pp = page_lookup_create(vp, off, SE_SHARED, newpp, NULL, 0); 2970 ASSERT(pp != NULL); 2971 ASSERT(!PP_ISFREE(pp)); 2972 ASSERT(pp->p_vnode == vp); 2973 ASSERT(pp->p_offset == off); 2974 if (pp == newpp) { 2975 VM_STAT_ADD(segvnvmstats.fill_vp_pages[1]); 2976 page_sub(&pplist, pp); 2977 ASSERT(PAGE_EXCL(pp)); 2978 ASSERT(page_iolock_assert(pp)); 2979 page_list_concat(&io_pplist, &pp); 2980 off += PAGESIZE; 2981 continue; 2982 } 2983 VM_STAT_ADD(segvnvmstats.fill_vp_pages[2]); 2984 pfn = page_pptonum(pp); 2985 pszc = pp->p_szc; 2986 if (pszc >= szc && targ_pplist == NULL && io_pplist == NULL && 2987 IS_P2ALIGNED(pfn, pages)) { 2988 ASSERT(repl_pplist == NULL); 2989 ASSERT(done_pplist == NULL); 2990 ASSERT(pplist == *ppplist); 2991 page_unlock(pp); 2992 page_free_replacement_page(pplist); 2993 page_create_putback(pages); 2994 *ppplist = NULL; 2995 VM_STAT_ADD(segvnvmstats.fill_vp_pages[3]); 2996 return (1); 2997 } 2998 if (pszc >= szc) { 2999 page_unlock(pp); 3000 segvn_faultvnmpss_align_err1++; 3001 goto out; 3002 } 3003 ppages = page_get_pagecnt(pszc); 3004 if (!IS_P2ALIGNED(pfn, ppages)) { 3005 ASSERT(pszc > 0); 3006 /* 3007 * sizing down to pszc won't help. 
3008 */ 3009 page_unlock(pp); 3010 segvn_faultvnmpss_align_err2++; 3011 goto out; 3012 } 3013 pfn = page_pptonum(newpp); 3014 if (!IS_P2ALIGNED(pfn, ppages)) { 3015 ASSERT(pszc > 0); 3016 /* 3017 * sizing down to pszc won't help. 3018 */ 3019 page_unlock(pp); 3020 segvn_faultvnmpss_align_err3++; 3021 goto out; 3022 } 3023 if (!PAGE_EXCL(pp)) { 3024 VM_STAT_ADD(segvnvmstats.fill_vp_pages[4]); 3025 page_unlock(pp); 3026 *downsize = 1; 3027 *ret_pszc = pp->p_szc; 3028 goto out; 3029 } 3030 targpp = pp; 3031 if (io_pplist != NULL) { 3032 VM_STAT_ADD(segvnvmstats.fill_vp_pages[5]); 3033 io_len = off - io_off; 3034 /* 3035 * Some file systems like NFS don't check EOF 3036 * conditions in VOP_PAGEIO(). Check it here 3037 * now that pages are locked SE_EXCL. Any file 3038 * truncation will wait until the pages are 3039 * unlocked so no need to worry that file will 3040 * be truncated after we check its size here. 3041 * XXX fix NFS to remove this check. 3042 */ 3043 va.va_mask = AT_SIZE; 3044 if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred) != 0) { 3045 VM_STAT_ADD(segvnvmstats.fill_vp_pages[6]); 3046 page_unlock(targpp); 3047 goto out; 3048 } 3049 if (btopr(va.va_size) < btopr(io_off + io_len)) { 3050 VM_STAT_ADD(segvnvmstats.fill_vp_pages[7]); 3051 *downsize = 1; 3052 *ret_pszc = 0; 3053 page_unlock(targpp); 3054 goto out; 3055 } 3056 io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len, 3057 B_READ, svd->cred); 3058 if (io_err) { 3059 VM_STAT_ADD(segvnvmstats.fill_vp_pages[8]); 3060 page_unlock(targpp); 3061 if (io_err == EDEADLK) { 3062 segvn_vmpss_pageio_deadlk_err++; 3063 } 3064 goto out; 3065 } 3066 nios++; 3067 VM_STAT_ADD(segvnvmstats.fill_vp_pages[9]); 3068 while (io_pplist != NULL) { 3069 pp = io_pplist; 3070 page_sub(&io_pplist, pp); 3071 ASSERT(page_iolock_assert(pp)); 3072 page_io_unlock(pp); 3073 pgidx = (pp->p_offset - start_off) >> 3074 PAGESHIFT; 3075 ASSERT(pgidx < pages); 3076 ppa[pgidx] = pp; 3077 page_list_concat(&done_pplist, &pp); 3078 } 3079 } 3080 pp = 
targpp; 3081 ASSERT(PAGE_EXCL(pp)); 3082 ASSERT(pp->p_szc <= pszc); 3083 if (pszc != 0 && !group_page_trylock(pp, SE_EXCL)) { 3084 VM_STAT_ADD(segvnvmstats.fill_vp_pages[10]); 3085 page_unlock(pp); 3086 *downsize = 1; 3087 *ret_pszc = pp->p_szc; 3088 goto out; 3089 } 3090 VM_STAT_ADD(segvnvmstats.fill_vp_pages[11]); 3091 /* 3092 * page szc chould have changed before the entire group was 3093 * locked. reread page szc. 3094 */ 3095 pszc = pp->p_szc; 3096 ppages = page_get_pagecnt(pszc); 3097 3098 /* link just the roots */ 3099 page_list_concat(&targ_pplist, &pp); 3100 page_sub(&pplist, newpp); 3101 page_list_concat(&repl_pplist, &newpp); 3102 off += PAGESIZE; 3103 while (--ppages != 0) { 3104 newpp = pplist; 3105 page_sub(&pplist, newpp); 3106 off += PAGESIZE; 3107 } 3108 io_off = off; 3109 } 3110 if (io_pplist != NULL) { 3111 VM_STAT_ADD(segvnvmstats.fill_vp_pages[12]); 3112 io_len = eoff - io_off; 3113 va.va_mask = AT_SIZE; 3114 if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred) != 0) { 3115 VM_STAT_ADD(segvnvmstats.fill_vp_pages[13]); 3116 goto out; 3117 } 3118 if (btopr(va.va_size) < btopr(io_off + io_len)) { 3119 VM_STAT_ADD(segvnvmstats.fill_vp_pages[14]); 3120 *downsize = 1; 3121 *ret_pszc = 0; 3122 goto out; 3123 } 3124 io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len, 3125 B_READ, svd->cred); 3126 if (io_err) { 3127 VM_STAT_ADD(segvnvmstats.fill_vp_pages[15]); 3128 if (io_err == EDEADLK) { 3129 segvn_vmpss_pageio_deadlk_err++; 3130 } 3131 goto out; 3132 } 3133 nios++; 3134 while (io_pplist != NULL) { 3135 pp = io_pplist; 3136 page_sub(&io_pplist, pp); 3137 ASSERT(page_iolock_assert(pp)); 3138 page_io_unlock(pp); 3139 pgidx = (pp->p_offset - start_off) >> PAGESHIFT; 3140 ASSERT(pgidx < pages); 3141 ppa[pgidx] = pp; 3142 } 3143 } 3144 /* 3145 * we're now bound to succeed or panic. 3146 * remove pages from done_pplist. it's not needed anymore. 
3147 */ 3148 while (done_pplist != NULL) { 3149 pp = done_pplist; 3150 page_sub(&done_pplist, pp); 3151 } 3152 VM_STAT_ADD(segvnvmstats.fill_vp_pages[16]); 3153 ASSERT(pplist == NULL); 3154 *ppplist = NULL; 3155 while (targ_pplist != NULL) { 3156 int ret; 3157 VM_STAT_ADD(segvnvmstats.fill_vp_pages[17]); 3158 ASSERT(repl_pplist); 3159 pp = targ_pplist; 3160 page_sub(&targ_pplist, pp); 3161 pgidx = (pp->p_offset - start_off) >> PAGESHIFT; 3162 newpp = repl_pplist; 3163 page_sub(&repl_pplist, newpp); 3164 #ifdef DEBUG 3165 pfn = page_pptonum(pp); 3166 pszc = pp->p_szc; 3167 ppages = page_get_pagecnt(pszc); 3168 ASSERT(IS_P2ALIGNED(pfn, ppages)); 3169 pfn = page_pptonum(newpp); 3170 ASSERT(IS_P2ALIGNED(pfn, ppages)); 3171 ASSERT(P2PHASE(pfn, pages) == pgidx); 3172 #endif 3173 nreloc = 0; 3174 ret = page_relocate(&pp, &newpp, 0, 1, &nreloc, NULL); 3175 if (ret != 0 || nreloc == 0) { 3176 panic("segvn_fill_vp_pages: " 3177 "page_relocate failed"); 3178 } 3179 pp = newpp; 3180 while (nreloc-- != 0) { 3181 ASSERT(PAGE_EXCL(pp)); 3182 ASSERT(pp->p_vnode == vp); 3183 ASSERT(pgidx == 3184 ((pp->p_offset - start_off) >> PAGESHIFT)); 3185 ppa[pgidx++] = pp; 3186 pp++; 3187 } 3188 } 3189 3190 if (svd->type == MAP_PRIVATE) { 3191 VM_STAT_ADD(segvnvmstats.fill_vp_pages[18]); 3192 for (i = 0; i < pages; i++) { 3193 ASSERT(ppa[i] != NULL); 3194 ASSERT(PAGE_EXCL(ppa[i])); 3195 ASSERT(ppa[i]->p_vnode == vp); 3196 ASSERT(ppa[i]->p_offset == 3197 start_off + (i << PAGESHIFT)); 3198 page_downgrade(ppa[i]); 3199 } 3200 ppa[pages] = NULL; 3201 } else { 3202 VM_STAT_ADD(segvnvmstats.fill_vp_pages[19]); 3203 /* 3204 * the caller will still call VOP_GETPAGE() for shared segments 3205 * to check FS write permissions. For private segments we map 3206 * file read only anyway. so no VOP_GETPAGE is needed. 
3207 */ 3208 for (i = 0; i < pages; i++) { 3209 ASSERT(ppa[i] != NULL); 3210 ASSERT(PAGE_EXCL(ppa[i])); 3211 ASSERT(ppa[i]->p_vnode == vp); 3212 ASSERT(ppa[i]->p_offset == 3213 start_off + (i << PAGESHIFT)); 3214 page_unlock(ppa[i]); 3215 } 3216 ppa[0] = NULL; 3217 } 3218 3219 return (1); 3220 out: 3221 /* 3222 * Do the cleanup. Unlock target pages we didn't relocate. They are 3223 * linked on targ_pplist by root pages. reassemble unused replacement 3224 * and io pages back to pplist. 3225 */ 3226 if (io_pplist != NULL) { 3227 VM_STAT_ADD(segvnvmstats.fill_vp_pages[20]); 3228 pp = io_pplist; 3229 do { 3230 ASSERT(pp->p_vnode == vp); 3231 ASSERT(pp->p_offset == io_off); 3232 ASSERT(page_iolock_assert(pp)); 3233 page_io_unlock(pp); 3234 page_hashout(pp, NULL); 3235 io_off += PAGESIZE; 3236 } while ((pp = pp->p_next) != io_pplist); 3237 page_list_concat(&io_pplist, &pplist); 3238 pplist = io_pplist; 3239 } 3240 tmp_pplist = NULL; 3241 while (targ_pplist != NULL) { 3242 VM_STAT_ADD(segvnvmstats.fill_vp_pages[21]); 3243 pp = targ_pplist; 3244 ASSERT(PAGE_EXCL(pp)); 3245 page_sub(&targ_pplist, pp); 3246 3247 pszc = pp->p_szc; 3248 ppages = page_get_pagecnt(pszc); 3249 ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages)); 3250 3251 if (pszc != 0) { 3252 group_page_unlock(pp); 3253 } 3254 page_unlock(pp); 3255 3256 pp = repl_pplist; 3257 ASSERT(pp != NULL); 3258 ASSERT(PAGE_EXCL(pp)); 3259 ASSERT(pp->p_szc == szc); 3260 page_sub(&repl_pplist, pp); 3261 3262 ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages)); 3263 3264 /* relink replacement page */ 3265 page_list_concat(&tmp_pplist, &pp); 3266 while (--ppages != 0) { 3267 VM_STAT_ADD(segvnvmstats.fill_vp_pages[22]); 3268 pp++; 3269 ASSERT(PAGE_EXCL(pp)); 3270 ASSERT(pp->p_szc == szc); 3271 page_list_concat(&tmp_pplist, &pp); 3272 } 3273 } 3274 if (tmp_pplist != NULL) { 3275 VM_STAT_ADD(segvnvmstats.fill_vp_pages[23]); 3276 page_list_concat(&tmp_pplist, &pplist); 3277 pplist = tmp_pplist; 3278 } 3279 /* 3280 * at this point all pages 
are either on done_pplist or 3281 * pplist. They can't be all on done_pplist otherwise 3282 * we'd've been done. 3283 */ 3284 ASSERT(pplist != NULL); 3285 if (nios != 0) { 3286 VM_STAT_ADD(segvnvmstats.fill_vp_pages[24]); 3287 pp = pplist; 3288 do { 3289 VM_STAT_ADD(segvnvmstats.fill_vp_pages[25]); 3290 ASSERT(pp->p_szc == szc); 3291 ASSERT(PAGE_EXCL(pp)); 3292 ASSERT(pp->p_vnode != vp); 3293 pp->p_szc = 0; 3294 } while ((pp = pp->p_next) != pplist); 3295 3296 pp = done_pplist; 3297 do { 3298 VM_STAT_ADD(segvnvmstats.fill_vp_pages[26]); 3299 ASSERT(pp->p_szc == szc); 3300 ASSERT(PAGE_EXCL(pp)); 3301 ASSERT(pp->p_vnode == vp); 3302 pp->p_szc = 0; 3303 } while ((pp = pp->p_next) != done_pplist); 3304 3305 while (pplist != NULL) { 3306 VM_STAT_ADD(segvnvmstats.fill_vp_pages[27]); 3307 pp = pplist; 3308 page_sub(&pplist, pp); 3309 page_free(pp, 0); 3310 } 3311 3312 while (done_pplist != NULL) { 3313 VM_STAT_ADD(segvnvmstats.fill_vp_pages[28]); 3314 pp = done_pplist; 3315 page_sub(&done_pplist, pp); 3316 page_unlock(pp); 3317 } 3318 *ppplist = NULL; 3319 return (0); 3320 } 3321 ASSERT(pplist == *ppplist); 3322 if (io_err) { 3323 VM_STAT_ADD(segvnvmstats.fill_vp_pages[29]); 3324 /* 3325 * don't downsize on io error. 3326 * see if vop_getpage succeeds. 3327 * pplist may still be used in this case 3328 * for relocations. 
3329 */ 3330 return (0); 3331 } 3332 VM_STAT_ADD(segvnvmstats.fill_vp_pages[30]); 3333 page_free_replacement_page(pplist); 3334 page_create_putback(pages); 3335 *ppplist = NULL; 3336 return (0); 3337 } 3338 3339 int segvn_anypgsz = 0; 3340 3341 #define SEGVN_RESTORE_SOFTLOCK(type, pages) \ 3342 if ((type) == F_SOFTLOCK) { \ 3343 mutex_enter(&freemem_lock); \ 3344 availrmem += (pages); \ 3345 segvn_pages_locked -= (pages); \ 3346 svd->softlockcnt -= (pages); \ 3347 mutex_exit(&freemem_lock); \ 3348 } 3349 3350 #define SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot) \ 3351 if (IS_VMODSORT((ppa)[0]->p_vnode)) { \ 3352 if ((rw) == S_WRITE) { \ 3353 for (i = 0; i < (pages); i++) { \ 3354 ASSERT((ppa)[i]->p_vnode == \ 3355 (ppa)[0]->p_vnode); \ 3356 hat_setmod((ppa)[i]); \ 3357 } \ 3358 } else if ((rw) != S_OTHER && \ 3359 ((prot) & (vpprot) & PROT_WRITE)) { \ 3360 for (i = 0; i < (pages); i++) { \ 3361 ASSERT((ppa)[i]->p_vnode == \ 3362 (ppa)[0]->p_vnode); \ 3363 if (!hat_ismod((ppa)[i])) { \ 3364 prot &= ~PROT_WRITE; \ 3365 break; \ 3366 } \ 3367 } \ 3368 } \ 3369 } 3370 3371 #ifdef VM_STATS 3372 3373 #define SEGVN_VMSTAT_FLTVNPAGES(idx) \ 3374 VM_STAT_ADD(segvnvmstats.fltvnpages[(idx)]); 3375 3376 #else /* VM_STATS */ 3377 3378 #define SEGVN_VMSTAT_FLTVNPAGES(idx) 3379 3380 #endif 3381 3382 static faultcode_t 3383 segvn_fault_vnodepages(struct hat *hat, struct seg *seg, caddr_t lpgaddr, 3384 caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr, 3385 caddr_t eaddr, int brkcow) 3386 { 3387 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 3388 struct anon_map *amp = svd->amp; 3389 uchar_t segtype = svd->type; 3390 uint_t szc = seg->s_szc; 3391 size_t pgsz = page_get_pagesize(szc); 3392 size_t maxpgsz = pgsz; 3393 pgcnt_t pages = btop(pgsz); 3394 pgcnt_t maxpages = pages; 3395 size_t ppasize = (pages + 1) * sizeof (page_t *); 3396 caddr_t a = lpgaddr; 3397 caddr_t maxlpgeaddr = lpgeaddr; 3398 u_offset_t off = svd->offset + (uintptr_t)(a - 
seg->s_base); 3399 ulong_t aindx = svd->anon_index + seg_page(seg, a); 3400 struct vpage *vpage = (svd->vpage != NULL) ? 3401 &svd->vpage[seg_page(seg, a)] : NULL; 3402 vnode_t *vp = svd->vp; 3403 page_t **ppa; 3404 uint_t pszc; 3405 size_t ppgsz; 3406 pgcnt_t ppages; 3407 faultcode_t err = 0; 3408 int ierr; 3409 int vop_size_err = 0; 3410 uint_t protchk, prot, vpprot; 3411 ulong_t i; 3412 int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD; 3413 anon_sync_obj_t an_cookie; 3414 enum seg_rw arw; 3415 int alloc_failed = 0; 3416 int adjszc_chk; 3417 struct vattr va; 3418 int xhat = 0; 3419 page_t *pplist; 3420 pfn_t pfn; 3421 int physcontig; 3422 int upgrdfail; 3423 int segvn_anypgsz_vnode = 0; /* for now map vnode with 2 page sizes */ 3424 3425 ASSERT(szc != 0); 3426 ASSERT(vp != NULL); 3427 ASSERT(brkcow == 0 || amp != NULL); 3428 ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */ 3429 ASSERT(!(svd->flags & MAP_NORESERVE)); 3430 ASSERT(type != F_SOFTUNLOCK); 3431 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 3432 ASSERT(amp == NULL || IS_P2ALIGNED(aindx, maxpages)); 3433 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 3434 ASSERT(seg->s_szc < NBBY * sizeof (int)); 3435 ASSERT(type != F_SOFTLOCK || lpgeaddr - a == maxpgsz); 3436 3437 VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltvnpages[0]); 3438 VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltvnpages[1]); 3439 3440 if (svd->flags & MAP_TEXT) { 3441 hat_flag |= HAT_LOAD_TEXT; 3442 } 3443 3444 if (svd->pageprot) { 3445 switch (rw) { 3446 case S_READ: 3447 protchk = PROT_READ; 3448 break; 3449 case S_WRITE: 3450 protchk = PROT_WRITE; 3451 break; 3452 case S_EXEC: 3453 protchk = PROT_EXEC; 3454 break; 3455 case S_OTHER: 3456 default: 3457 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 3458 break; 3459 } 3460 } else { 3461 prot = svd->prot; 3462 /* caller has already done segment level protection check. 
*/ 3463 } 3464 3465 if (seg->s_as->a_hat != hat) { 3466 xhat = 1; 3467 } 3468 3469 if (rw == S_WRITE && segtype == MAP_PRIVATE) { 3470 SEGVN_VMSTAT_FLTVNPAGES(2); 3471 arw = S_READ; 3472 } else { 3473 arw = rw; 3474 } 3475 3476 ppa = kmem_alloc(ppasize, KM_SLEEP); 3477 3478 VM_STAT_COND_ADD(amp != NULL, segvnvmstats.fltvnpages[3]); 3479 3480 for (;;) { 3481 adjszc_chk = 0; 3482 for (; a < lpgeaddr; a += pgsz, off += pgsz, aindx += pages) { 3483 if (adjszc_chk) { 3484 while (szc < seg->s_szc) { 3485 uintptr_t e; 3486 uint_t tszc; 3487 tszc = segvn_anypgsz_vnode ? szc + 1 : 3488 seg->s_szc; 3489 ppgsz = page_get_pagesize(tszc); 3490 if (!IS_P2ALIGNED(a, ppgsz) || 3491 ((alloc_failed >> tszc) & 3492 0x1)) { 3493 break; 3494 } 3495 SEGVN_VMSTAT_FLTVNPAGES(4); 3496 szc = tszc; 3497 pgsz = ppgsz; 3498 pages = btop(pgsz); 3499 e = P2ROUNDUP((uintptr_t)eaddr, pgsz); 3500 lpgeaddr = (caddr_t)e; 3501 } 3502 } 3503 3504 again: 3505 if (IS_P2ALIGNED(a, maxpgsz) && amp != NULL) { 3506 ASSERT(IS_P2ALIGNED(aindx, maxpages)); 3507 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 3508 anon_array_enter(amp, aindx, &an_cookie); 3509 if (anon_get_ptr(amp->ahp, aindx) != NULL) { 3510 SEGVN_VMSTAT_FLTVNPAGES(5); 3511 if (anon_pages(amp->ahp, aindx, 3512 maxpages) != maxpages) { 3513 panic("segvn_fault_vnodepages:" 3514 " empty anon slots\n"); 3515 } 3516 anon_array_exit(&an_cookie); 3517 ANON_LOCK_EXIT(&->a_rwlock); 3518 err = segvn_fault_anonpages(hat, seg, 3519 a, a + maxpgsz, type, rw, 3520 MAX(a, addr), 3521 MIN(a + maxpgsz, eaddr), brkcow); 3522 if (err != 0) { 3523 SEGVN_VMSTAT_FLTVNPAGES(6); 3524 goto out; 3525 } 3526 if (szc < seg->s_szc) { 3527 szc = seg->s_szc; 3528 pgsz = maxpgsz; 3529 pages = maxpages; 3530 lpgeaddr = maxlpgeaddr; 3531 } 3532 goto next; 3533 } else if (anon_pages(amp->ahp, aindx, 3534 maxpages)) { 3535 panic("segvn_fault_vnodepages:" 3536 " non empty anon slots\n"); 3537 } else { 3538 SEGVN_VMSTAT_FLTVNPAGES(7); 3539 anon_array_exit(&an_cookie); 3540 
ANON_LOCK_EXIT(&->a_rwlock); 3541 } 3542 } 3543 ASSERT(!brkcow || IS_P2ALIGNED(a, maxpgsz)); 3544 3545 if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) { 3546 ASSERT(vpage != NULL); 3547 prot = VPP_PROT(vpage); 3548 ASSERT(sameprot(seg, a, maxpgsz)); 3549 if ((prot & protchk) == 0) { 3550 SEGVN_VMSTAT_FLTVNPAGES(8); 3551 err = FC_PROT; 3552 goto out; 3553 } 3554 } 3555 if (type == F_SOFTLOCK) { 3556 mutex_enter(&freemem_lock); 3557 if (availrmem < tune.t_minarmem + pages) { 3558 mutex_exit(&freemem_lock); 3559 err = FC_MAKE_ERR(ENOMEM); 3560 goto out; 3561 } else { 3562 availrmem -= pages; 3563 segvn_pages_locked += pages; 3564 svd->softlockcnt += pages; 3565 } 3566 mutex_exit(&freemem_lock); 3567 } 3568 3569 pplist = NULL; 3570 physcontig = 0; 3571 ppa[0] = NULL; 3572 if (!brkcow && szc && 3573 !page_exists_physcontig(vp, off, szc, 3574 segtype == MAP_PRIVATE ? ppa : NULL)) { 3575 SEGVN_VMSTAT_FLTVNPAGES(9); 3576 if (page_alloc_pages(vp, seg, a, &pplist, NULL, 3577 szc, 0) && type != F_SOFTLOCK) { 3578 SEGVN_VMSTAT_FLTVNPAGES(10); 3579 pszc = 0; 3580 ierr = -1; 3581 alloc_failed |= (1 << szc); 3582 break; 3583 } 3584 if (pplist != NULL && 3585 vp->v_mpssdata == SEGVN_PAGEIO) { 3586 int downsize; 3587 SEGVN_VMSTAT_FLTVNPAGES(11); 3588 physcontig = segvn_fill_vp_pages(svd, 3589 vp, off, szc, ppa, &pplist, 3590 &pszc, &downsize); 3591 ASSERT(!physcontig || pplist == NULL); 3592 if (!physcontig && downsize && 3593 type != F_SOFTLOCK) { 3594 ASSERT(pplist == NULL); 3595 SEGVN_VMSTAT_FLTVNPAGES(12); 3596 ierr = -1; 3597 break; 3598 } 3599 ASSERT(!physcontig || 3600 segtype == MAP_PRIVATE || 3601 ppa[0] == NULL); 3602 if (physcontig && ppa[0] == NULL) { 3603 physcontig = 0; 3604 } 3605 } 3606 } else if (!brkcow && szc && ppa[0] != NULL) { 3607 SEGVN_VMSTAT_FLTVNPAGES(13); 3608 ASSERT(segtype == MAP_PRIVATE); 3609 physcontig = 1; 3610 } 3611 3612 if (!physcontig) { 3613 SEGVN_VMSTAT_FLTVNPAGES(14); 3614 ppa[0] = NULL; 3615 ierr = VOP_GETPAGE(vp, (offset_t)off, pgsz, 
3616 &vpprot, ppa, pgsz, seg, a, arw, 3617 svd->cred); 3618 if (segtype == MAP_PRIVATE) { 3619 SEGVN_VMSTAT_FLTVNPAGES(15); 3620 vpprot &= ~PROT_WRITE; 3621 } 3622 } else { 3623 ASSERT(segtype == MAP_PRIVATE); 3624 SEGVN_VMSTAT_FLTVNPAGES(16); 3625 vpprot = PROT_ALL & ~PROT_WRITE; 3626 ierr = 0; 3627 } 3628 3629 if (ierr != 0) { 3630 SEGVN_VMSTAT_FLTVNPAGES(17); 3631 if (pplist != NULL) { 3632 SEGVN_VMSTAT_FLTVNPAGES(18); 3633 page_free_replacement_page(pplist); 3634 page_create_putback(pages); 3635 } 3636 SEGVN_RESTORE_SOFTLOCK(type, pages); 3637 if (a + pgsz <= eaddr) { 3638 SEGVN_VMSTAT_FLTVNPAGES(19); 3639 err = FC_MAKE_ERR(ierr); 3640 goto out; 3641 } 3642 va.va_mask = AT_SIZE; 3643 if (VOP_GETATTR(vp, &va, 0, svd->cred) != 0) { 3644 SEGVN_VMSTAT_FLTVNPAGES(20); 3645 err = FC_MAKE_ERR(EIO); 3646 goto out; 3647 } 3648 if (btopr(va.va_size) >= btopr(off + pgsz)) { 3649 SEGVN_VMSTAT_FLTVNPAGES(21); 3650 err = FC_MAKE_ERR(ierr); 3651 goto out; 3652 } 3653 if (btopr(va.va_size) < 3654 btopr(off + (eaddr - a))) { 3655 SEGVN_VMSTAT_FLTVNPAGES(22); 3656 err = FC_MAKE_ERR(ierr); 3657 goto out; 3658 } 3659 if (brkcow || type == F_SOFTLOCK) { 3660 /* can't reduce map area */ 3661 SEGVN_VMSTAT_FLTVNPAGES(23); 3662 vop_size_err = 1; 3663 goto out; 3664 } 3665 SEGVN_VMSTAT_FLTVNPAGES(24); 3666 ASSERT(szc != 0); 3667 pszc = 0; 3668 ierr = -1; 3669 break; 3670 } 3671 3672 if (amp != NULL) { 3673 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 3674 anon_array_enter(amp, aindx, &an_cookie); 3675 } 3676 if (amp != NULL && 3677 anon_get_ptr(amp->ahp, aindx) != NULL) { 3678 ulong_t taindx = P2ALIGN(aindx, maxpages); 3679 3680 SEGVN_VMSTAT_FLTVNPAGES(25); 3681 if (anon_pages(amp->ahp, taindx, maxpages) != 3682 maxpages) { 3683 panic("segvn_fault_vnodepages:" 3684 " empty anon slots\n"); 3685 } 3686 for (i = 0; i < pages; i++) { 3687 page_unlock(ppa[i]); 3688 } 3689 anon_array_exit(&an_cookie); 3690 ANON_LOCK_EXIT(&->a_rwlock); 3691 if (pplist != NULL) { 3692 
page_free_replacement_page(pplist); 3693 page_create_putback(pages); 3694 } 3695 SEGVN_RESTORE_SOFTLOCK(type, pages); 3696 if (szc < seg->s_szc) { 3697 SEGVN_VMSTAT_FLTVNPAGES(26); 3698 /* 3699 * For private segments SOFTLOCK 3700 * either always breaks cow (any rw 3701 * type except S_READ_NOCOW) or 3702 * address space is locked as writer 3703 * (S_READ_NOCOW case) and anon slots 3704 * can't show up on second check. 3705 * Therefore if we are here for 3706 * SOFTLOCK case it must be a cow 3707 * break but cow break never reduces 3708 * szc. Thus the assert below. 3709 */ 3710 ASSERT(!brkcow && type != F_SOFTLOCK); 3711 pszc = seg->s_szc; 3712 ierr = -2; 3713 break; 3714 } 3715 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 3716 goto again; 3717 } 3718 #ifdef DEBUG 3719 if (amp != NULL) { 3720 ulong_t taindx = P2ALIGN(aindx, maxpages); 3721 ASSERT(!anon_pages(amp->ahp, taindx, maxpages)); 3722 } 3723 #endif /* DEBUG */ 3724 3725 if (brkcow) { 3726 ASSERT(amp != NULL); 3727 ASSERT(pplist == NULL); 3728 ASSERT(szc == seg->s_szc); 3729 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 3730 ASSERT(IS_P2ALIGNED(aindx, maxpages)); 3731 SEGVN_VMSTAT_FLTVNPAGES(27); 3732 ierr = anon_map_privatepages(amp, aindx, szc, 3733 seg, a, prot, ppa, vpage, segvn_anypgsz, 3734 svd->cred); 3735 if (ierr != 0) { 3736 SEGVN_VMSTAT_FLTVNPAGES(28); 3737 anon_array_exit(&an_cookie); 3738 ANON_LOCK_EXIT(&->a_rwlock); 3739 SEGVN_RESTORE_SOFTLOCK(type, pages); 3740 err = FC_MAKE_ERR(ierr); 3741 goto out; 3742 } 3743 3744 ASSERT(!IS_VMODSORT(ppa[0]->p_vnode)); 3745 /* 3746 * p_szc can't be changed for locked 3747 * swapfs pages. 
3748 */ 3749 hat_memload_array(hat, a, pgsz, ppa, prot, 3750 hat_flag); 3751 3752 if (!(hat_flag & HAT_LOAD_LOCK)) { 3753 SEGVN_VMSTAT_FLTVNPAGES(29); 3754 for (i = 0; i < pages; i++) { 3755 page_unlock(ppa[i]); 3756 } 3757 } 3758 anon_array_exit(&an_cookie); 3759 ANON_LOCK_EXIT(&->a_rwlock); 3760 goto next; 3761 } 3762 3763 pfn = page_pptonum(ppa[0]); 3764 /* 3765 * hat_page_demote() needs an EXCl lock on one of 3766 * constituent page_t's and it decreases root's p_szc 3767 * last. This means if root's p_szc is equal szc and 3768 * all its constituent pages are locked 3769 * hat_page_demote() that could have changed p_szc to 3770 * szc is already done and no new have page_demote() 3771 * can start for this large page. 3772 */ 3773 3774 /* 3775 * we need to make sure same mapping size is used for 3776 * the same address range if there's a possibility the 3777 * adddress is already mapped because hat layer panics 3778 * when translation is loaded for the range already 3779 * mapped with a different page size. We achieve it 3780 * by always using largest page size possible subject 3781 * to the constraints of page size, segment page size 3782 * and page alignment. Since mappings are invalidated 3783 * when those constraints change and make it 3784 * impossible to use previously used mapping size no 3785 * mapping size conflicts should happen. 3786 */ 3787 3788 chkszc: 3789 if ((pszc = ppa[0]->p_szc) == szc && 3790 IS_P2ALIGNED(pfn, pages)) { 3791 3792 SEGVN_VMSTAT_FLTVNPAGES(30); 3793 #ifdef DEBUG 3794 for (i = 0; i < pages; i++) { 3795 ASSERT(PAGE_LOCKED(ppa[i])); 3796 ASSERT(!PP_ISFREE(ppa[i])); 3797 ASSERT(page_pptonum(ppa[i]) == 3798 pfn + i); 3799 ASSERT(ppa[i]->p_szc == szc); 3800 ASSERT(ppa[i]->p_vnode == vp); 3801 ASSERT(ppa[i]->p_offset == 3802 off + (i << PAGESHIFT)); 3803 } 3804 #endif /* DEBUG */ 3805 /* 3806 * All pages are of szc we need and they are 3807 * all locked so they can't change szc. load 3808 * translations. 
3809 * 3810 * if page got promoted since last check 3811 * we don't need pplist. 3812 */ 3813 if (pplist != NULL) { 3814 page_free_replacement_page(pplist); 3815 page_create_putback(pages); 3816 } 3817 if (PP_ISMIGRATE(ppa[0])) { 3818 page_migrate(seg, a, ppa, pages); 3819 } 3820 SEGVN_UPDATE_MODBITS(ppa, pages, rw, 3821 prot, vpprot); 3822 if (!xhat) { 3823 hat_memload_array(hat, a, pgsz, ppa, 3824 prot & vpprot, hat_flag); 3825 } else { 3826 /* 3827 * avoid large xhat mappings to FS 3828 * pages so that hat_page_demote() 3829 * doesn't need to check for xhat 3830 * large mappings. 3831 */ 3832 for (i = 0; i < pages; i++) { 3833 hat_memload(hat, 3834 a + (i << PAGESHIFT), 3835 ppa[i], prot & vpprot, 3836 hat_flag); 3837 } 3838 } 3839 3840 if (!(hat_flag & HAT_LOAD_LOCK)) { 3841 for (i = 0; i < pages; i++) { 3842 page_unlock(ppa[i]); 3843 } 3844 } 3845 if (amp != NULL) { 3846 anon_array_exit(&an_cookie); 3847 ANON_LOCK_EXIT(&->a_rwlock); 3848 } 3849 goto next; 3850 } 3851 3852 /* 3853 * See if upsize is possible. 3854 */ 3855 if (pszc > szc && szc < seg->s_szc && 3856 (segvn_anypgsz_vnode || pszc >= seg->s_szc)) { 3857 pgcnt_t aphase; 3858 uint_t pszc1 = MIN(pszc, seg->s_szc); 3859 ppgsz = page_get_pagesize(pszc1); 3860 ppages = btop(ppgsz); 3861 aphase = btop(P2PHASE((uintptr_t)a, ppgsz)); 3862 3863 ASSERT(type != F_SOFTLOCK); 3864 3865 SEGVN_VMSTAT_FLTVNPAGES(31); 3866 if (aphase != P2PHASE(pfn, ppages)) { 3867 segvn_faultvnmpss_align_err4++; 3868 } else { 3869 SEGVN_VMSTAT_FLTVNPAGES(32); 3870 if (pplist != NULL) { 3871 page_t *pl = pplist; 3872 page_free_replacement_page(pl); 3873 page_create_putback(pages); 3874 } 3875 for (i = 0; i < pages; i++) { 3876 page_unlock(ppa[i]); 3877 } 3878 if (amp != NULL) { 3879 anon_array_exit(&an_cookie); 3880 ANON_LOCK_EXIT(&->a_rwlock); 3881 } 3882 pszc = pszc1; 3883 ierr = -2; 3884 break; 3885 } 3886 } 3887 3888 /* 3889 * check if we should use smallest mapping size. 
3890 */ 3891 upgrdfail = 0; 3892 if (szc == 0 || xhat || 3893 (pszc >= szc && 3894 !IS_P2ALIGNED(pfn, pages)) || 3895 (pszc < szc && 3896 !segvn_full_szcpages(ppa, szc, &upgrdfail, 3897 &pszc))) { 3898 3899 if (upgrdfail && type != F_SOFTLOCK) { 3900 /* 3901 * segvn_full_szcpages failed to lock 3902 * all pages EXCL. Size down. 3903 */ 3904 ASSERT(pszc < szc); 3905 3906 SEGVN_VMSTAT_FLTVNPAGES(33); 3907 3908 if (pplist != NULL) { 3909 page_t *pl = pplist; 3910 page_free_replacement_page(pl); 3911 page_create_putback(pages); 3912 } 3913 3914 for (i = 0; i < pages; i++) { 3915 page_unlock(ppa[i]); 3916 } 3917 if (amp != NULL) { 3918 anon_array_exit(&an_cookie); 3919 ANON_LOCK_EXIT(&->a_rwlock); 3920 } 3921 ierr = -1; 3922 break; 3923 } 3924 if (szc != 0 && !xhat) { 3925 segvn_faultvnmpss_align_err5++; 3926 } 3927 SEGVN_VMSTAT_FLTVNPAGES(34); 3928 if (pplist != NULL) { 3929 page_free_replacement_page(pplist); 3930 page_create_putback(pages); 3931 } 3932 SEGVN_UPDATE_MODBITS(ppa, pages, rw, 3933 prot, vpprot); 3934 if (upgrdfail && segvn_anypgsz_vnode) { 3935 /* SOFTLOCK case */ 3936 hat_memload_array(hat, a, pgsz, 3937 ppa, prot & vpprot, hat_flag); 3938 } else { 3939 for (i = 0; i < pages; i++) { 3940 hat_memload(hat, 3941 a + (i << PAGESHIFT), 3942 ppa[i], prot & vpprot, 3943 hat_flag); 3944 } 3945 } 3946 if (!(hat_flag & HAT_LOAD_LOCK)) { 3947 for (i = 0; i < pages; i++) { 3948 page_unlock(ppa[i]); 3949 } 3950 } 3951 if (amp != NULL) { 3952 anon_array_exit(&an_cookie); 3953 ANON_LOCK_EXIT(&->a_rwlock); 3954 } 3955 goto next; 3956 } 3957 3958 if (pszc == szc) { 3959 /* 3960 * segvn_full_szcpages() upgraded pages szc. 3961 */ 3962 ASSERT(pszc == ppa[0]->p_szc); 3963 ASSERT(IS_P2ALIGNED(pfn, pages)); 3964 goto chkszc; 3965 } 3966 3967 if (pszc > szc) { 3968 kmutex_t *szcmtx; 3969 SEGVN_VMSTAT_FLTVNPAGES(35); 3970 /* 3971 * p_szc of ppa[0] can change since we haven't 3972 * locked all constituent pages. Call 3973 * page_lock_szc() to prevent szc changes. 
3974 * This should be a rare case that happens when 3975 * multiple segments use a different page size 3976 * to map the same file offsets. 3977 */ 3978 szcmtx = page_szc_lock(ppa[0]); 3979 pszc = ppa[0]->p_szc; 3980 ASSERT(szcmtx != NULL || pszc == 0); 3981 ASSERT(ppa[0]->p_szc <= pszc); 3982 if (pszc <= szc) { 3983 SEGVN_VMSTAT_FLTVNPAGES(36); 3984 if (szcmtx != NULL) { 3985 mutex_exit(szcmtx); 3986 } 3987 goto chkszc; 3988 } 3989 if (pplist != NULL) { 3990 /* 3991 * page got promoted since last check. 3992 * we don't need preaalocated large 3993 * page. 3994 */ 3995 SEGVN_VMSTAT_FLTVNPAGES(37); 3996 page_free_replacement_page(pplist); 3997 page_create_putback(pages); 3998 } 3999 SEGVN_UPDATE_MODBITS(ppa, pages, rw, 4000 prot, vpprot); 4001 hat_memload_array(hat, a, pgsz, ppa, 4002 prot & vpprot, hat_flag); 4003 mutex_exit(szcmtx); 4004 if (!(hat_flag & HAT_LOAD_LOCK)) { 4005 for (i = 0; i < pages; i++) { 4006 page_unlock(ppa[i]); 4007 } 4008 } 4009 if (amp != NULL) { 4010 anon_array_exit(&an_cookie); 4011 ANON_LOCK_EXIT(&->a_rwlock); 4012 } 4013 goto next; 4014 } 4015 4016 /* 4017 * if page got demoted since last check 4018 * we could have not allocated larger page. 4019 * allocate now. 
4020 */ 4021 if (pplist == NULL && 4022 page_alloc_pages(vp, seg, a, &pplist, NULL, 4023 szc, 0) && type != F_SOFTLOCK) { 4024 SEGVN_VMSTAT_FLTVNPAGES(38); 4025 for (i = 0; i < pages; i++) { 4026 page_unlock(ppa[i]); 4027 } 4028 if (amp != NULL) { 4029 anon_array_exit(&an_cookie); 4030 ANON_LOCK_EXIT(&->a_rwlock); 4031 } 4032 ierr = -1; 4033 alloc_failed |= (1 << szc); 4034 break; 4035 } 4036 4037 SEGVN_VMSTAT_FLTVNPAGES(39); 4038 4039 if (pplist != NULL) { 4040 segvn_relocate_pages(ppa, pplist); 4041 #ifdef DEBUG 4042 } else { 4043 ASSERT(type == F_SOFTLOCK); 4044 SEGVN_VMSTAT_FLTVNPAGES(40); 4045 #endif /* DEBUG */ 4046 } 4047 4048 SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot); 4049 4050 if (pplist == NULL && segvn_anypgsz_vnode == 0) { 4051 ASSERT(type == F_SOFTLOCK); 4052 for (i = 0; i < pages; i++) { 4053 ASSERT(ppa[i]->p_szc < szc); 4054 hat_memload(hat, a + (i << PAGESHIFT), 4055 ppa[i], prot & vpprot, hat_flag); 4056 } 4057 } else { 4058 ASSERT(pplist != NULL || type == F_SOFTLOCK); 4059 hat_memload_array(hat, a, pgsz, ppa, 4060 prot & vpprot, hat_flag); 4061 } 4062 if (!(hat_flag & HAT_LOAD_LOCK)) { 4063 for (i = 0; i < pages; i++) { 4064 ASSERT(PAGE_SHARED(ppa[i])); 4065 page_unlock(ppa[i]); 4066 } 4067 } 4068 if (amp != NULL) { 4069 anon_array_exit(&an_cookie); 4070 ANON_LOCK_EXIT(&->a_rwlock); 4071 } 4072 4073 next: 4074 if (vpage != NULL) { 4075 vpage += pages; 4076 } 4077 adjszc_chk = 1; 4078 } 4079 if (a == lpgeaddr) 4080 break; 4081 ASSERT(a < lpgeaddr); 4082 4083 ASSERT(!brkcow && type != F_SOFTLOCK); 4084 4085 /* 4086 * ierr == -1 means we failed to map with a large page. 4087 * (either due to allocation/relocation failures or 4088 * misalignment with other mappings to this file. 4089 * 4090 * ierr == -2 means some other thread allocated a large page 4091 * after we gave up tp map with a large page. retry with 4092 * larger mapping. 
4093 */ 4094 ASSERT(ierr == -1 || ierr == -2); 4095 ASSERT(ierr == -2 || szc != 0); 4096 ASSERT(ierr == -1 || szc < seg->s_szc); 4097 if (ierr == -2) { 4098 SEGVN_VMSTAT_FLTVNPAGES(41); 4099 ASSERT(pszc > szc && pszc <= seg->s_szc); 4100 szc = pszc; 4101 } else if (segvn_anypgsz_vnode) { 4102 SEGVN_VMSTAT_FLTVNPAGES(42); 4103 szc--; 4104 } else { 4105 SEGVN_VMSTAT_FLTVNPAGES(43); 4106 ASSERT(pszc < szc); 4107 /* 4108 * other process created pszc large page. 4109 * but we still have to drop to 0 szc. 4110 */ 4111 szc = 0; 4112 } 4113 4114 pgsz = page_get_pagesize(szc); 4115 pages = btop(pgsz); 4116 if (ierr == -2) { 4117 /* 4118 * Size up case. Note lpgaddr may only be needed for 4119 * softlock case so we don't adjust it here. 4120 */ 4121 a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz); 4122 ASSERT(a >= lpgaddr); 4123 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4124 off = svd->offset + (uintptr_t)(a - seg->s_base); 4125 aindx = svd->anon_index + seg_page(seg, a); 4126 vpage = (svd->vpage != NULL) ? 4127 &svd->vpage[seg_page(seg, a)] : NULL; 4128 } else { 4129 /* 4130 * Size down case. Note lpgaddr may only be needed for 4131 * softlock case so we don't adjust it here. 4132 */ 4133 ASSERT(IS_P2ALIGNED(a, pgsz)); 4134 ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz)); 4135 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4136 ASSERT(a < lpgeaddr); 4137 if (a < addr) { 4138 SEGVN_VMSTAT_FLTVNPAGES(44); 4139 /* 4140 * The beginning of the large page region can 4141 * be pulled to the right to make a smaller 4142 * region. We haven't yet faulted a single 4143 * page. 4144 */ 4145 a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 4146 ASSERT(a >= lpgaddr); 4147 off = svd->offset + 4148 (uintptr_t)(a - seg->s_base); 4149 aindx = svd->anon_index + seg_page(seg, a); 4150 vpage = (svd->vpage != NULL) ? 
4151 &svd->vpage[seg_page(seg, a)] : NULL; 4152 } 4153 } 4154 } 4155 out: 4156 kmem_free(ppa, ppasize); 4157 if (!err && !vop_size_err) { 4158 SEGVN_VMSTAT_FLTVNPAGES(45); 4159 return (0); 4160 } 4161 if (type == F_SOFTLOCK && a > lpgaddr) { 4162 SEGVN_VMSTAT_FLTVNPAGES(46); 4163 segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER); 4164 } 4165 if (!vop_size_err) { 4166 SEGVN_VMSTAT_FLTVNPAGES(47); 4167 return (err); 4168 } 4169 ASSERT(brkcow || type == F_SOFTLOCK); 4170 /* 4171 * Large page end is mapped beyond the end of file and it's a cow 4172 * fault or softlock so we can't reduce the map area. For now just 4173 * demote the segment. This should really only happen if the end of 4174 * the file changed after the mapping was established since when large 4175 * page segments are created we make sure they don't extend beyond the 4176 * end of the file. 4177 */ 4178 SEGVN_VMSTAT_FLTVNPAGES(48); 4179 4180 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4181 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4182 err = 0; 4183 if (seg->s_szc != 0) { 4184 segvn_fltvnpages_clrszc_cnt++; 4185 ASSERT(svd->softlockcnt == 0); 4186 err = segvn_clrszc(seg); 4187 if (err != 0) { 4188 segvn_fltvnpages_clrszc_err++; 4189 } 4190 } 4191 ASSERT(err || seg->s_szc == 0); 4192 SEGVN_LOCK_DOWNGRADE(seg->s_as, &svd->lock); 4193 /* segvn_fault will do its job as if szc had been zero to begin with */ 4194 return (err == 0 ? IE_RETRY : FC_MAKE_ERR(err)); 4195 } 4196 4197 /* 4198 * This routine will attempt to fault in one large page. 4199 * it will use smaller pages if that fails. 4200 * It should only be called for pure anonymous segments. 
4201 */ 4202 static faultcode_t 4203 segvn_fault_anonpages(struct hat *hat, struct seg *seg, caddr_t lpgaddr, 4204 caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr, 4205 caddr_t eaddr, int brkcow) 4206 { 4207 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 4208 struct anon_map *amp = svd->amp; 4209 uchar_t segtype = svd->type; 4210 uint_t szc = seg->s_szc; 4211 size_t pgsz = page_get_pagesize(szc); 4212 size_t maxpgsz = pgsz; 4213 pgcnt_t pages = btop(pgsz); 4214 size_t ppasize = pages * sizeof (page_t *); 4215 caddr_t a = lpgaddr; 4216 ulong_t aindx = svd->anon_index + seg_page(seg, a); 4217 struct vpage *vpage = (svd->vpage != NULL) ? 4218 &svd->vpage[seg_page(seg, a)] : NULL; 4219 page_t **ppa; 4220 uint_t ppa_szc; 4221 faultcode_t err; 4222 int ierr; 4223 uint_t protchk, prot, vpprot; 4224 ulong_t i; 4225 int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD; 4226 anon_sync_obj_t cookie; 4227 int first = 1; 4228 int adjszc_chk; 4229 int purged = 0; 4230 4231 ASSERT(szc != 0); 4232 ASSERT(amp != NULL); 4233 ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */ 4234 ASSERT(!(svd->flags & MAP_NORESERVE)); 4235 ASSERT(type != F_SOFTUNLOCK); 4236 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 4237 4238 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 4239 4240 VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltanpages[0]); 4241 VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltanpages[1]); 4242 4243 if (svd->flags & MAP_TEXT) { 4244 hat_flag |= HAT_LOAD_TEXT; 4245 } 4246 4247 if (svd->pageprot) { 4248 switch (rw) { 4249 case S_READ: 4250 protchk = PROT_READ; 4251 break; 4252 case S_WRITE: 4253 protchk = PROT_WRITE; 4254 break; 4255 case S_EXEC: 4256 protchk = PROT_EXEC; 4257 break; 4258 case S_OTHER: 4259 default: 4260 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 4261 break; 4262 } 4263 VM_STAT_ADD(segvnvmstats.fltanpages[2]); 4264 } else { 4265 prot = svd->prot; 4266 /* caller has already done segment level 
protection check. */ 4267 } 4268 4269 ppa = kmem_alloc(ppasize, KM_SLEEP); 4270 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4271 for (;;) { 4272 adjszc_chk = 0; 4273 for (; a < lpgeaddr; a += pgsz, aindx += pages) { 4274 if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) { 4275 VM_STAT_ADD(segvnvmstats.fltanpages[3]); 4276 ASSERT(vpage != NULL); 4277 prot = VPP_PROT(vpage); 4278 ASSERT(sameprot(seg, a, maxpgsz)); 4279 if ((prot & protchk) == 0) { 4280 err = FC_PROT; 4281 goto error; 4282 } 4283 } 4284 if (adjszc_chk && IS_P2ALIGNED(a, maxpgsz) && 4285 pgsz < maxpgsz) { 4286 ASSERT(a > lpgaddr); 4287 szc = seg->s_szc; 4288 pgsz = maxpgsz; 4289 pages = btop(pgsz); 4290 ASSERT(IS_P2ALIGNED(aindx, pages)); 4291 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, 4292 pgsz); 4293 } 4294 if (type == F_SOFTLOCK && svd->vp != NULL) { 4295 mutex_enter(&freemem_lock); 4296 if (availrmem < tune.t_minarmem + pages) { 4297 mutex_exit(&freemem_lock); 4298 err = FC_MAKE_ERR(ENOMEM); 4299 goto error; 4300 } else { 4301 availrmem -= pages; 4302 segvn_pages_locked += pages; 4303 svd->softlockcnt += pages; 4304 } 4305 mutex_exit(&freemem_lock); 4306 } 4307 anon_array_enter(amp, aindx, &cookie); 4308 ppa_szc = (uint_t)-1; 4309 ierr = anon_map_getpages(amp, aindx, szc, seg, a, 4310 prot, &vpprot, ppa, &ppa_szc, vpage, rw, brkcow, 4311 segvn_anypgsz, svd->cred); 4312 if (ierr != 0) { 4313 anon_array_exit(&cookie); 4314 VM_STAT_ADD(segvnvmstats.fltanpages[4]); 4315 if (type == F_SOFTLOCK && svd->vp != NULL) { 4316 VM_STAT_ADD(segvnvmstats.fltanpages[5]); 4317 mutex_enter(&freemem_lock); 4318 availrmem += pages; 4319 segvn_pages_locked -= pages; 4320 svd->softlockcnt -= pages; 4321 mutex_exit(&freemem_lock); 4322 } 4323 if (ierr > 0) { 4324 VM_STAT_ADD(segvnvmstats.fltanpages[6]); 4325 err = FC_MAKE_ERR(ierr); 4326 goto error; 4327 } 4328 break; 4329 } 4330 4331 ASSERT(!IS_VMODSORT(ppa[0]->p_vnode)); 4332 4333 ASSERT(segtype == MAP_SHARED || 4334 ppa[0]->p_szc <= szc); 4335 ASSERT(segtype == 
MAP_PRIVATE || 4336 ppa[0]->p_szc >= szc); 4337 4338 /* 4339 * Handle pages that have been marked for migration 4340 */ 4341 if (lgrp_optimizations()) 4342 page_migrate(seg, a, ppa, pages); 4343 4344 if (type == F_SOFTLOCK && svd->vp == NULL) { 4345 /* 4346 * All pages in ppa array belong to the same 4347 * large page. This means it's ok to call 4348 * segvn_pp_lock_anonpages just for ppa[0]. 4349 */ 4350 if (!segvn_pp_lock_anonpages(ppa[0], first)) { 4351 for (i = 0; i < pages; i++) { 4352 page_unlock(ppa[i]); 4353 } 4354 err = FC_MAKE_ERR(ENOMEM); 4355 goto error; 4356 } 4357 first = 0; 4358 mutex_enter(&freemem_lock); 4359 svd->softlockcnt += pages; 4360 segvn_pages_locked += pages; 4361 mutex_exit(&freemem_lock); 4362 } 4363 4364 if (segtype == MAP_SHARED) { 4365 vpprot |= PROT_WRITE; 4366 } 4367 4368 hat_memload_array(hat, a, pgsz, ppa, 4369 prot & vpprot, hat_flag); 4370 4371 if (hat_flag & HAT_LOAD_LOCK) { 4372 VM_STAT_ADD(segvnvmstats.fltanpages[7]); 4373 } else { 4374 VM_STAT_ADD(segvnvmstats.fltanpages[8]); 4375 for (i = 0; i < pages; i++) 4376 page_unlock(ppa[i]); 4377 } 4378 if (vpage != NULL) 4379 vpage += pages; 4380 4381 anon_array_exit(&cookie); 4382 adjszc_chk = 1; 4383 } 4384 if (a == lpgeaddr) 4385 break; 4386 ASSERT(a < lpgeaddr); 4387 /* 4388 * ierr == -1 means we failed to allocate a large page. 4389 * so do a size down operation. 4390 * 4391 * ierr == -2 means some other process that privately shares 4392 * pages with this process has allocated a larger page and we 4393 * need to retry with larger pages. So do a size up 4394 * operation. This relies on the fact that large pages are 4395 * never partially shared i.e. if we share any constituent 4396 * page of a large page with another process we must share the 4397 * entire large page. Note this cannot happen for SOFTLOCK 4398 * case, unless current address (a) is at the beginning of the 4399 * next page size boundary because the other process couldn't 4400 * have relocated locked pages. 
4401 */ 4402 ASSERT(ierr == -1 || ierr == -2); 4403 /* 4404 * For the very first relocation failure try to purge this 4405 * segment's cache so that the relocator can obtain an 4406 * exclusive lock on pages we want to relocate. 4407 */ 4408 if (!purged && ierr == -1 && ppa_szc != (uint_t)-1 && 4409 svd->softlockcnt != 0) { 4410 purged = 1; 4411 segvn_purge(seg); 4412 continue; 4413 } 4414 4415 if (segvn_anypgsz) { 4416 ASSERT(ierr == -2 || szc != 0); 4417 ASSERT(ierr == -1 || szc < seg->s_szc); 4418 szc = (ierr == -1) ? szc - 1 : szc + 1; 4419 } else { 4420 /* 4421 * For non COW faults and segvn_anypgsz == 0 4422 * we need to be careful not to loop forever 4423 * if existing page is found with szc other 4424 * than 0 or seg->s_szc. This could be due 4425 * to page relocations on behalf of DR or 4426 * more likely large page creation. For this 4427 * case simply re-size to existing page's szc 4428 * if returned by anon_map_getpages(). 4429 */ 4430 if (ppa_szc == (uint_t)-1) { 4431 szc = (ierr == -1) ? 0 : seg->s_szc; 4432 } else { 4433 ASSERT(ppa_szc <= seg->s_szc); 4434 ASSERT(ierr == -2 || ppa_szc < szc); 4435 ASSERT(ierr == -1 || ppa_szc > szc); 4436 szc = ppa_szc; 4437 } 4438 } 4439 4440 pgsz = page_get_pagesize(szc); 4441 pages = btop(pgsz); 4442 ASSERT(type != F_SOFTLOCK || ierr == -1 || 4443 (IS_P2ALIGNED(a, pgsz) && IS_P2ALIGNED(lpgeaddr, pgsz))); 4444 if (type == F_SOFTLOCK) { 4445 /* 4446 * For softlocks we cannot reduce the fault area 4447 * (calculated based on the largest page size for this 4448 * segment) for size down and a is already next 4449 * page size aligned as assertted above for size 4450 * ups. Therefore just continue in case of softlock. 4451 */ 4452 VM_STAT_ADD(segvnvmstats.fltanpages[9]); 4453 continue; /* keep lint happy */ 4454 } else if (ierr == -2) { 4455 4456 /* 4457 * Size up case. Note lpgaddr may only be needed for 4458 * softlock case so we don't adjust it here. 
4459 */ 4460 VM_STAT_ADD(segvnvmstats.fltanpages[10]); 4461 a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz); 4462 ASSERT(a >= lpgaddr); 4463 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4464 aindx = svd->anon_index + seg_page(seg, a); 4465 vpage = (svd->vpage != NULL) ? 4466 &svd->vpage[seg_page(seg, a)] : NULL; 4467 } else { 4468 /* 4469 * Size down case. Note lpgaddr may only be needed for 4470 * softlock case so we don't adjust it here. 4471 */ 4472 VM_STAT_ADD(segvnvmstats.fltanpages[11]); 4473 ASSERT(IS_P2ALIGNED(a, pgsz)); 4474 ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz)); 4475 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4476 ASSERT(a < lpgeaddr); 4477 if (a < addr) { 4478 /* 4479 * The beginning of the large page region can 4480 * be pulled to the right to make a smaller 4481 * region. We haven't yet faulted a single 4482 * page. 4483 */ 4484 VM_STAT_ADD(segvnvmstats.fltanpages[12]); 4485 a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 4486 ASSERT(a >= lpgaddr); 4487 aindx = svd->anon_index + seg_page(seg, a); 4488 vpage = (svd->vpage != NULL) ? 4489 &svd->vpage[seg_page(seg, a)] : NULL; 4490 } 4491 } 4492 } 4493 VM_STAT_ADD(segvnvmstats.fltanpages[13]); 4494 ANON_LOCK_EXIT(&->a_rwlock); 4495 kmem_free(ppa, ppasize); 4496 return (0); 4497 error: 4498 VM_STAT_ADD(segvnvmstats.fltanpages[14]); 4499 ANON_LOCK_EXIT(&->a_rwlock); 4500 kmem_free(ppa, ppasize); 4501 if (type == F_SOFTLOCK && a > lpgaddr) { 4502 VM_STAT_ADD(segvnvmstats.fltanpages[15]); 4503 segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER); 4504 } 4505 return (err); 4506 } 4507 4508 int fltadvice = 1; /* set to free behind pages for sequential access */ 4509 4510 /* 4511 * This routine is called via a machine specific fault handling routine. 4512 * It is also called by software routines wishing to lock or unlock 4513 * a range of addresses. 
4514 * 4515 * Here is the basic algorithm: 4516 * If unlocking 4517 * Call segvn_softunlock 4518 * Return 4519 * endif 4520 * Checking and set up work 4521 * If we will need some non-anonymous pages 4522 * Call VOP_GETPAGE over the range of non-anonymous pages 4523 * endif 4524 * Loop over all addresses requested 4525 * Call segvn_faultpage passing in page list 4526 * to load up translations and handle anonymous pages 4527 * endloop 4528 * Load up translation to any additional pages in page list not 4529 * already handled that fit into this segment 4530 */ 4531 static faultcode_t 4532 segvn_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len, 4533 enum fault_type type, enum seg_rw rw) 4534 { 4535 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 4536 page_t **plp, **ppp, *pp; 4537 u_offset_t off; 4538 caddr_t a; 4539 struct vpage *vpage; 4540 uint_t vpprot, prot; 4541 int err; 4542 page_t *pl[PVN_GETPAGE_NUM + 1]; 4543 size_t plsz, pl_alloc_sz; 4544 size_t page; 4545 ulong_t anon_index; 4546 struct anon_map *amp; 4547 int dogetpage = 0; 4548 caddr_t lpgaddr, lpgeaddr; 4549 size_t pgsz; 4550 anon_sync_obj_t cookie; 4551 int brkcow = BREAK_COW_SHARE(rw, type, svd->type); 4552 4553 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 4554 4555 /* 4556 * First handle the easy stuff 4557 */ 4558 if (type == F_SOFTUNLOCK) { 4559 if (rw == S_READ_NOCOW) { 4560 rw = S_READ; 4561 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 4562 } 4563 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 4564 pgsz = (seg->s_szc == 0) ? 
PAGESIZE : 4565 page_get_pagesize(seg->s_szc); 4566 VM_STAT_COND_ADD(pgsz > PAGESIZE, segvnvmstats.fltanpages[16]); 4567 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 4568 segvn_softunlock(seg, lpgaddr, lpgeaddr - lpgaddr, rw); 4569 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4570 return (0); 4571 } 4572 4573 top: 4574 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 4575 4576 /* 4577 * If we have the same protections for the entire segment, 4578 * insure that the access being attempted is legitimate. 4579 */ 4580 4581 if (svd->pageprot == 0) { 4582 uint_t protchk; 4583 4584 switch (rw) { 4585 case S_READ: 4586 case S_READ_NOCOW: 4587 protchk = PROT_READ; 4588 break; 4589 case S_WRITE: 4590 protchk = PROT_WRITE; 4591 break; 4592 case S_EXEC: 4593 protchk = PROT_EXEC; 4594 break; 4595 case S_OTHER: 4596 default: 4597 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 4598 break; 4599 } 4600 4601 if ((svd->prot & protchk) == 0) { 4602 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4603 return (FC_PROT); /* illegal access type */ 4604 } 4605 } 4606 4607 /* 4608 * We can't allow the long term use of softlocks for vmpss segments, 4609 * because in some file truncation cases we should be able to demote 4610 * the segment, which requires that there are no softlocks. The 4611 * only case where it's ok to allow a SOFTLOCK fault against a vmpss 4612 * segment is S_READ_NOCOW, where the caller holds the address space 4613 * locked as writer and calls softunlock before dropping the as lock. 4614 * S_READ_NOCOW is used by /proc to read memory from another user. 4615 * 4616 * Another deadlock between SOFTLOCK and file truncation can happen 4617 * because segvn_fault_vnodepages() calls the FS one pagesize at 4618 * a time. A second VOP_GETPAGE() call by segvn_fault_vnodepages() 4619 * can cause a deadlock because the first set of page_t's remain 4620 * locked SE_SHARED. 
To avoid this, we demote segments on a first 4621 * SOFTLOCK if they have a length greater than the segment's 4622 * page size. 4623 * 4624 * So for now, we only avoid demoting a segment on a SOFTLOCK when 4625 * the access type is S_READ_NOCOW and the fault length is less than 4626 * or equal to the segment's page size. While this is quite restrictive, 4627 * it should be the most common case of SOFTLOCK against a vmpss 4628 * segment. 4629 * 4630 * For S_READ_NOCOW, it's safe not to do a copy on write because the 4631 * caller makes sure no COW will be caused by another thread for a 4632 * softlocked page. 4633 */ 4634 if (type == F_SOFTLOCK && svd->vp != NULL && seg->s_szc != 0) { 4635 int demote = 0; 4636 4637 if (rw != S_READ_NOCOW) { 4638 demote = 1; 4639 } 4640 if (!demote && len > PAGESIZE) { 4641 pgsz = page_get_pagesize(seg->s_szc); 4642 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, 4643 lpgeaddr); 4644 if (lpgeaddr - lpgaddr > pgsz) { 4645 demote = 1; 4646 } 4647 } 4648 4649 ASSERT(demote || AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 4650 4651 if (demote) { 4652 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4653 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4654 if (seg->s_szc != 0) { 4655 segvn_vmpss_clrszc_cnt++; 4656 ASSERT(svd->softlockcnt == 0); 4657 err = segvn_clrszc(seg); 4658 if (err) { 4659 segvn_vmpss_clrszc_err++; 4660 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4661 return (FC_MAKE_ERR(err)); 4662 } 4663 } 4664 ASSERT(seg->s_szc == 0); 4665 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4666 goto top; 4667 } 4668 } 4669 4670 /* 4671 * Check to see if we need to allocate an anon_map structure. 4672 */ 4673 if (svd->amp == NULL && (svd->vp == NULL || brkcow)) { 4674 /* 4675 * Drop the "read" lock on the segment and acquire 4676 * the "write" version since we have to allocate the 4677 * anon_map. 
4678 */ 4679 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4680 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4681 4682 if (svd->amp == NULL) { 4683 svd->amp = anonmap_alloc(seg->s_size, 0); 4684 svd->amp->a_szc = seg->s_szc; 4685 } 4686 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4687 4688 /* 4689 * Start all over again since segment protections 4690 * may have changed after we dropped the "read" lock. 4691 */ 4692 goto top; 4693 } 4694 4695 /* 4696 * S_READ_NOCOW vs S_READ distinction was 4697 * only needed for the code above. After 4698 * that we treat it as S_READ. 4699 */ 4700 if (rw == S_READ_NOCOW) { 4701 ASSERT(type == F_SOFTLOCK); 4702 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 4703 rw = S_READ; 4704 } 4705 4706 amp = svd->amp; 4707 4708 /* 4709 * MADV_SEQUENTIAL work is ignored for large page segments. 4710 */ 4711 if (seg->s_szc != 0) { 4712 pgsz = page_get_pagesize(seg->s_szc); 4713 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 4714 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 4715 if (svd->vp == NULL) { 4716 err = segvn_fault_anonpages(hat, seg, lpgaddr, 4717 lpgeaddr, type, rw, addr, addr + len, brkcow); 4718 } else { 4719 err = segvn_fault_vnodepages(hat, seg, lpgaddr, 4720 lpgeaddr, type, rw, addr, addr + len, brkcow); 4721 if (err == IE_RETRY) { 4722 ASSERT(seg->s_szc == 0); 4723 ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock)); 4724 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4725 goto top; 4726 } 4727 } 4728 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4729 return (err); 4730 } 4731 4732 page = seg_page(seg, addr); 4733 if (amp != NULL) { 4734 anon_index = svd->anon_index + page; 4735 4736 if ((type == F_PROT) && (rw == S_READ) && 4737 svd->type == MAP_PRIVATE && svd->pageprot == 0) { 4738 size_t index = anon_index; 4739 struct anon *ap; 4740 4741 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4742 /* 4743 * The fast path could apply to S_WRITE also, except 4744 * that the protection fault could be caused by lazy 4745 * tlb flush when 
ro->rw. In this case, the pte is 4746 * RW already. But RO in the other cpu's tlb causes 4747 * the fault. Since hat_chgprot won't do anything if 4748 * pte doesn't change, we may end up faulting 4749 * indefinitely until the RO tlb entry gets replaced. 4750 */ 4751 for (a = addr; a < addr + len; a += PAGESIZE, index++) { 4752 anon_array_enter(amp, index, &cookie); 4753 ap = anon_get_ptr(amp->ahp, index); 4754 anon_array_exit(&cookie); 4755 if ((ap == NULL) || (ap->an_refcnt != 1)) { 4756 ANON_LOCK_EXIT(&->a_rwlock); 4757 goto slow; 4758 } 4759 } 4760 hat_chgprot(seg->s_as->a_hat, addr, len, svd->prot); 4761 ANON_LOCK_EXIT(&->a_rwlock); 4762 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4763 return (0); 4764 } 4765 } 4766 slow: 4767 4768 if (svd->vpage == NULL) 4769 vpage = NULL; 4770 else 4771 vpage = &svd->vpage[page]; 4772 4773 off = svd->offset + (uintptr_t)(addr - seg->s_base); 4774 4775 /* 4776 * If MADV_SEQUENTIAL has been set for the particular page we 4777 * are faulting on, free behind all pages in the segment and put 4778 * them on the free list. 
4779 */ 4780 if ((page != 0) && fltadvice) { /* not if first page in segment */ 4781 struct vpage *vpp; 4782 ulong_t fanon_index; 4783 size_t fpage; 4784 u_offset_t pgoff, fpgoff; 4785 struct vnode *fvp; 4786 struct anon *fap = NULL; 4787 4788 if (svd->advice == MADV_SEQUENTIAL || 4789 (svd->pageadvice && 4790 VPP_ADVICE(vpage) == MADV_SEQUENTIAL)) { 4791 pgoff = off - PAGESIZE; 4792 fpage = page - 1; 4793 if (vpage != NULL) 4794 vpp = &svd->vpage[fpage]; 4795 if (amp != NULL) 4796 fanon_index = svd->anon_index + fpage; 4797 4798 while (pgoff > svd->offset) { 4799 if (svd->advice != MADV_SEQUENTIAL && 4800 (!svd->pageadvice || (vpage && 4801 VPP_ADVICE(vpp) != MADV_SEQUENTIAL))) 4802 break; 4803 4804 /* 4805 * If this is an anon page, we must find the 4806 * correct <vp, offset> for it 4807 */ 4808 fap = NULL; 4809 if (amp != NULL) { 4810 ANON_LOCK_ENTER(&->a_rwlock, 4811 RW_READER); 4812 anon_array_enter(amp, fanon_index, 4813 &cookie); 4814 fap = anon_get_ptr(amp->ahp, 4815 fanon_index); 4816 if (fap != NULL) { 4817 swap_xlate(fap, &fvp, &fpgoff); 4818 } else { 4819 fpgoff = pgoff; 4820 fvp = svd->vp; 4821 } 4822 anon_array_exit(&cookie); 4823 ANON_LOCK_EXIT(&->a_rwlock); 4824 } else { 4825 fpgoff = pgoff; 4826 fvp = svd->vp; 4827 } 4828 if (fvp == NULL) 4829 break; /* XXX */ 4830 /* 4831 * Skip pages that are free or have an 4832 * "exclusive" lock. 4833 */ 4834 pp = page_lookup_nowait(fvp, fpgoff, SE_SHARED); 4835 if (pp == NULL) 4836 break; 4837 /* 4838 * We don't need the page_struct_lock to test 4839 * as this is only advisory; even if we 4840 * acquire it someone might race in and lock 4841 * the page after we unlock and before the 4842 * PUTPAGE, then VOP_PUTPAGE will do nothing. 4843 */ 4844 if (pp->p_lckcnt == 0 && pp->p_cowcnt == 0) { 4845 /* 4846 * Hold the vnode before releasing 4847 * the page lock to prevent it from 4848 * being freed and re-used by some 4849 * other thread. 
4850 */ 4851 VN_HOLD(fvp); 4852 page_unlock(pp); 4853 /* 4854 * We should build a page list 4855 * to kluster putpages XXX 4856 */ 4857 (void) VOP_PUTPAGE(fvp, 4858 (offset_t)fpgoff, PAGESIZE, 4859 (B_DONTNEED|B_FREE|B_ASYNC), 4860 svd->cred); 4861 VN_RELE(fvp); 4862 } else { 4863 /* 4864 * XXX - Should the loop terminate if 4865 * the page is `locked'? 4866 */ 4867 page_unlock(pp); 4868 } 4869 --vpp; 4870 --fanon_index; 4871 pgoff -= PAGESIZE; 4872 } 4873 } 4874 } 4875 4876 plp = pl; 4877 *plp = NULL; 4878 pl_alloc_sz = 0; 4879 4880 /* 4881 * See if we need to call VOP_GETPAGE for 4882 * *any* of the range being faulted on. 4883 * We can skip all of this work if there 4884 * was no original vnode. 4885 */ 4886 if (svd->vp != NULL) { 4887 u_offset_t vp_off; 4888 size_t vp_len; 4889 struct anon *ap; 4890 vnode_t *vp; 4891 4892 vp_off = off; 4893 vp_len = len; 4894 4895 if (amp == NULL) 4896 dogetpage = 1; 4897 else { 4898 /* 4899 * Only acquire reader lock to prevent amp->ahp 4900 * from being changed. It's ok to miss pages, 4901 * hence we don't do anon_array_enter 4902 */ 4903 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4904 ap = anon_get_ptr(amp->ahp, anon_index); 4905 4906 if (len <= PAGESIZE) 4907 /* inline non_anon() */ 4908 dogetpage = (ap == NULL); 4909 else 4910 dogetpage = non_anon(amp->ahp, anon_index, 4911 &vp_off, &vp_len); 4912 ANON_LOCK_EXIT(&->a_rwlock); 4913 } 4914 4915 if (dogetpage) { 4916 enum seg_rw arw; 4917 struct as *as = seg->s_as; 4918 4919 if (len > ptob((sizeof (pl) / sizeof (pl[0])) - 1)) { 4920 /* 4921 * Page list won't fit in local array, 4922 * allocate one of the needed size. 
4923 */ 4924 pl_alloc_sz = 4925 (btop(len) + 1) * sizeof (page_t *); 4926 plp = kmem_alloc(pl_alloc_sz, KM_SLEEP); 4927 plp[0] = NULL; 4928 plsz = len; 4929 } else if (rw == S_WRITE && svd->type == MAP_PRIVATE || 4930 rw == S_OTHER || 4931 (((size_t)(addr + PAGESIZE) < 4932 (size_t)(seg->s_base + seg->s_size)) && 4933 hat_probe(as->a_hat, addr + PAGESIZE))) { 4934 /* 4935 * Ask VOP_GETPAGE to return the exact number 4936 * of pages if 4937 * (a) this is a COW fault, or 4938 * (b) this is a software fault, or 4939 * (c) next page is already mapped. 4940 */ 4941 plsz = len; 4942 } else { 4943 /* 4944 * Ask VOP_GETPAGE to return adjacent pages 4945 * within the segment. 4946 */ 4947 plsz = MIN((size_t)PVN_GETPAGE_SZ, (size_t) 4948 ((seg->s_base + seg->s_size) - addr)); 4949 ASSERT((addr + plsz) <= 4950 (seg->s_base + seg->s_size)); 4951 } 4952 4953 /* 4954 * Need to get some non-anonymous pages. 4955 * We need to make only one call to GETPAGE to do 4956 * this to prevent certain deadlocking conditions 4957 * when we are doing locking. In this case 4958 * non_anon() should have picked up the smallest 4959 * range which includes all the non-anonymous 4960 * pages in the requested range. We have to 4961 * be careful regarding which rw flag to pass in 4962 * because on a private mapping, the underlying 4963 * object is never allowed to be written. 
4964 */ 4965 if (rw == S_WRITE && svd->type == MAP_PRIVATE) { 4966 arw = S_READ; 4967 } else { 4968 arw = rw; 4969 } 4970 vp = svd->vp; 4971 TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE, 4972 "segvn_getpage:seg %p addr %p vp %p", 4973 seg, addr, vp); 4974 err = VOP_GETPAGE(vp, (offset_t)vp_off, vp_len, 4975 &vpprot, plp, plsz, seg, addr + (vp_off - off), arw, 4976 svd->cred); 4977 if (err) { 4978 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4979 segvn_pagelist_rele(plp); 4980 if (pl_alloc_sz) 4981 kmem_free(plp, pl_alloc_sz); 4982 return (FC_MAKE_ERR(err)); 4983 } 4984 if (svd->type == MAP_PRIVATE) 4985 vpprot &= ~PROT_WRITE; 4986 } 4987 } 4988 4989 /* 4990 * N.B. at this time the plp array has all the needed non-anon 4991 * pages in addition to (possibly) having some adjacent pages. 4992 */ 4993 4994 /* 4995 * Always acquire the anon_array_lock to prevent 4996 * 2 threads from allocating separate anon slots for 4997 * the same "addr". 4998 * 4999 * If this is a copy-on-write fault and we don't already 5000 * have the anon_array_lock, acquire it to prevent the 5001 * fault routine from handling multiple copy-on-write faults 5002 * on the same "addr" in the same address space. 5003 * 5004 * Only one thread should deal with the fault since after 5005 * it is handled, the other threads can acquire a translation 5006 * to the newly created private page. This prevents two or 5007 * more threads from creating different private pages for the 5008 * same fault. 5009 * 5010 * We grab "serialization" lock here if this is a MAP_PRIVATE segment 5011 * to prevent deadlock between this thread and another thread 5012 * which has soft-locked this page and wants to acquire serial_lock. 5013 * ( bug 4026339 ) 5014 * 5015 * The fix for bug 4026339 becomes unnecessary when using the 5016 * locking scheme with per amp rwlock and a global set of hash 5017 * lock, anon_array_lock. 
If we steal a vnode page when low 5018 * on memory and upgrad the page lock through page_rename, 5019 * then the page is PAGE_HANDLED, nothing needs to be done 5020 * for this page after returning from segvn_faultpage. 5021 * 5022 * But really, the page lock should be downgraded after 5023 * the stolen page is page_rename'd. 5024 */ 5025 5026 if (amp != NULL) 5027 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5028 5029 /* 5030 * Ok, now loop over the address range and handle faults 5031 */ 5032 for (a = addr; a < addr + len; a += PAGESIZE, off += PAGESIZE) { 5033 err = segvn_faultpage(hat, seg, a, off, vpage, plp, vpprot, 5034 type, rw, brkcow, a == addr); 5035 if (err) { 5036 if (amp != NULL) 5037 ANON_LOCK_EXIT(&->a_rwlock); 5038 if (type == F_SOFTLOCK && a > addr) { 5039 segvn_softunlock(seg, addr, (a - addr), 5040 S_OTHER); 5041 } 5042 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5043 segvn_pagelist_rele(plp); 5044 if (pl_alloc_sz) 5045 kmem_free(plp, pl_alloc_sz); 5046 return (err); 5047 } 5048 if (vpage) { 5049 vpage++; 5050 } else if (svd->vpage) { 5051 page = seg_page(seg, addr); 5052 vpage = &svd->vpage[++page]; 5053 } 5054 } 5055 5056 /* Didn't get pages from the underlying fs so we're done */ 5057 if (!dogetpage) 5058 goto done; 5059 5060 /* 5061 * Now handle any other pages in the list returned. 5062 * If the page can be used, load up the translations now. 5063 * Note that the for loop will only be entered if "plp" 5064 * is pointing to a non-NULL page pointer which means that 5065 * VOP_GETPAGE() was called and vpprot has been initialized. 5066 */ 5067 if (svd->pageprot == 0) 5068 prot = svd->prot & vpprot; 5069 5070 5071 /* 5072 * Large Files: diff should be unsigned value because we started 5073 * supporting > 2GB segment sizes from 2.5.1 and when a 5074 * large file of size > 2GB gets mapped to address space 5075 * the diff value can be > 2GB. 
5076 */ 5077 5078 for (ppp = plp; (pp = *ppp) != NULL; ppp++) { 5079 size_t diff; 5080 struct anon *ap; 5081 int anon_index; 5082 anon_sync_obj_t cookie; 5083 int hat_flag = HAT_LOAD_ADV; 5084 5085 if (svd->flags & MAP_TEXT) { 5086 hat_flag |= HAT_LOAD_TEXT; 5087 } 5088 5089 if (pp == PAGE_HANDLED) 5090 continue; 5091 5092 if (pp->p_offset >= svd->offset && 5093 (pp->p_offset < svd->offset + seg->s_size)) { 5094 5095 diff = pp->p_offset - svd->offset; 5096 5097 /* 5098 * Large Files: Following is the assertion 5099 * validating the above cast. 5100 */ 5101 ASSERT(svd->vp == pp->p_vnode); 5102 5103 page = btop(diff); 5104 if (svd->pageprot) 5105 prot = VPP_PROT(&svd->vpage[page]) & vpprot; 5106 5107 /* 5108 * Prevent other threads in the address space from 5109 * creating private pages (i.e., allocating anon slots) 5110 * while we are in the process of loading translations 5111 * to additional pages returned by the underlying 5112 * object. 5113 */ 5114 if (amp != NULL) { 5115 anon_index = svd->anon_index + page; 5116 anon_array_enter(amp, anon_index, &cookie); 5117 ap = anon_get_ptr(amp->ahp, anon_index); 5118 } 5119 if ((amp == NULL) || (ap == NULL)) { 5120 if (IS_VMODSORT(pp->p_vnode) || 5121 enable_mbit_wa) { 5122 if (rw == S_WRITE) 5123 hat_setmod(pp); 5124 else if (rw != S_OTHER && 5125 !hat_ismod(pp)) 5126 prot &= ~PROT_WRITE; 5127 } 5128 /* 5129 * Skip mapping read ahead pages marked 5130 * for migration, so they will get migrated 5131 * properly on fault 5132 */ 5133 if ((prot & PROT_READ) && !PP_ISMIGRATE(pp)) { 5134 hat_memload(hat, seg->s_base + diff, 5135 pp, prot, hat_flag); 5136 } 5137 } 5138 if (amp != NULL) 5139 anon_array_exit(&cookie); 5140 } 5141 page_unlock(pp); 5142 } 5143 done: 5144 if (amp != NULL) 5145 ANON_LOCK_EXIT(&->a_rwlock); 5146 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5147 if (pl_alloc_sz) 5148 kmem_free(plp, pl_alloc_sz); 5149 return (0); 5150 } 5151 5152 /* 5153 * This routine is used to start I/O on pages asynchronously. 
XXX it will 5154 * only create PAGESIZE pages. At fault time they will be relocated into 5155 * larger pages. 5156 */ 5157 static faultcode_t 5158 segvn_faulta(struct seg *seg, caddr_t addr) 5159 { 5160 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5161 int err; 5162 struct anon_map *amp; 5163 vnode_t *vp; 5164 5165 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5166 5167 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 5168 if ((amp = svd->amp) != NULL) { 5169 struct anon *ap; 5170 5171 /* 5172 * Reader lock to prevent amp->ahp from being changed. 5173 * This is advisory, it's ok to miss a page, so 5174 * we don't do anon_array_enter lock. 5175 */ 5176 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5177 if ((ap = anon_get_ptr(amp->ahp, 5178 svd->anon_index + seg_page(seg, addr))) != NULL) { 5179 5180 err = anon_getpage(&ap, NULL, NULL, 5181 0, seg, addr, S_READ, svd->cred); 5182 5183 ANON_LOCK_EXIT(&->a_rwlock); 5184 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5185 if (err) 5186 return (FC_MAKE_ERR(err)); 5187 return (0); 5188 } 5189 ANON_LOCK_EXIT(&->a_rwlock); 5190 } 5191 5192 if (svd->vp == NULL) { 5193 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5194 return (0); /* zfod page - do nothing now */ 5195 } 5196 5197 vp = svd->vp; 5198 TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE, 5199 "segvn_getpage:seg %p addr %p vp %p", seg, addr, vp); 5200 err = VOP_GETPAGE(vp, 5201 (offset_t)(svd->offset + (uintptr_t)(addr - seg->s_base)), 5202 PAGESIZE, NULL, NULL, 0, seg, addr, 5203 S_OTHER, svd->cred); 5204 5205 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5206 if (err) 5207 return (FC_MAKE_ERR(err)); 5208 return (0); 5209 } 5210 5211 static int 5212 segvn_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) 5213 { 5214 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5215 struct vpage *svp, *evp; 5216 struct vnode *vp; 5217 size_t pgsz; 5218 pgcnt_t pgcnt; 5219 anon_sync_obj_t cookie; 5220 5221 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, 
&seg->s_as->a_lock)); 5222 5223 if ((svd->maxprot & prot) != prot) 5224 return (EACCES); /* violated maxprot */ 5225 5226 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 5227 5228 /* return if prot is the same */ 5229 if (!svd->pageprot && svd->prot == prot) { 5230 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5231 return (0); 5232 } 5233 5234 /* 5235 * Since we change protections we first have to flush the cache. 5236 * This makes sure all the pagelock calls have to recheck 5237 * protections. 5238 */ 5239 if (svd->softlockcnt > 0) { 5240 /* 5241 * Since we do have the segvn writers lock nobody can fill 5242 * the cache with entries belonging to this seg during 5243 * the purge. The flush either succeeds or we still have 5244 * pending I/Os. 5245 */ 5246 segvn_purge(seg); 5247 if (svd->softlockcnt > 0) { 5248 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5249 return (EAGAIN); 5250 } 5251 } 5252 5253 if (seg->s_szc != 0) { 5254 int err; 5255 pgsz = page_get_pagesize(seg->s_szc); 5256 pgcnt = pgsz >> PAGESHIFT; 5257 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 5258 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { 5259 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5260 ASSERT(seg->s_base != addr || seg->s_size != len); 5261 /* 5262 * If we are holding the as lock as a reader then 5263 * we need to return IE_RETRY and let the as 5264 * layer drop and re-aquire the lock as a writer. 
5265 */ 5266 if (AS_READ_HELD(seg->s_as, &seg->s_as->a_lock)) 5267 return (IE_RETRY); 5268 VM_STAT_ADD(segvnvmstats.demoterange[1]); 5269 if (svd->type == MAP_PRIVATE || svd->vp != NULL) { 5270 err = segvn_demote_range(seg, addr, len, 5271 SDR_END, 0); 5272 } else { 5273 uint_t szcvec = map_shm_pgszcvec(seg->s_base, 5274 pgsz, (uintptr_t)seg->s_base); 5275 err = segvn_demote_range(seg, addr, len, 5276 SDR_END, szcvec); 5277 } 5278 if (err == 0) 5279 return (IE_RETRY); 5280 if (err == ENOMEM) 5281 return (IE_NOMEM); 5282 return (err); 5283 } 5284 } 5285 5286 5287 /* 5288 * If it's a private mapping and we're making it writable 5289 * and no swap space has been reserved, have to reserve 5290 * it all now. If it's a private mapping to a file (i.e., vp != NULL) 5291 * and we're removing write permission on the entire segment and 5292 * we haven't modified any pages, we can release the swap space. 5293 */ 5294 if (svd->type == MAP_PRIVATE) { 5295 if (prot & PROT_WRITE) { 5296 size_t sz; 5297 if (svd->swresv == 0 && !(svd->flags & MAP_NORESERVE)) { 5298 if (anon_resv(seg->s_size) == 0) { 5299 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5300 return (IE_NOMEM); 5301 } 5302 sz = svd->swresv = seg->s_size; 5303 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 5304 "anon proc:%p %lu %u", 5305 seg, sz, 1); 5306 } 5307 } else { 5308 /* 5309 * Swap space is released only if this segment 5310 * does not map anonymous memory, since read faults 5311 * on such segments still need an anon slot to read 5312 * in the data. 
5313 */ 5314 if (svd->swresv != 0 && svd->vp != NULL && 5315 svd->amp == NULL && addr == seg->s_base && 5316 len == seg->s_size && svd->pageprot == 0) { 5317 anon_unresv(svd->swresv); 5318 svd->swresv = 0; 5319 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 5320 "anon proc:%p %lu %u", 5321 seg, 0, 0); 5322 } 5323 } 5324 } 5325 5326 if (addr == seg->s_base && len == seg->s_size && svd->pageprot == 0) { 5327 if (svd->prot == prot) { 5328 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5329 return (0); /* all done */ 5330 } 5331 svd->prot = (uchar_t)prot; 5332 } else if (svd->type == MAP_PRIVATE) { 5333 struct anon *ap = NULL; 5334 page_t *pp; 5335 u_offset_t offset, off; 5336 struct anon_map *amp; 5337 ulong_t anon_idx = 0; 5338 5339 /* 5340 * A vpage structure exists or else the change does not 5341 * involve the entire segment. Establish a vpage structure 5342 * if none is there. Then, for each page in the range, 5343 * adjust its individual permissions. Note that write- 5344 * enabling a MAP_PRIVATE page can affect the claims for 5345 * locked down memory. Overcommitting memory terminates 5346 * the operation. 5347 */ 5348 segvn_vpage(seg); 5349 if ((amp = svd->amp) != NULL) { 5350 anon_idx = svd->anon_index + seg_page(seg, addr); 5351 ASSERT(seg->s_szc == 0 || 5352 IS_P2ALIGNED(anon_idx, pgcnt)); 5353 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5354 } 5355 5356 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 5357 evp = &svd->vpage[seg_page(seg, addr + len)]; 5358 5359 /* 5360 * See Statement at the beginning of segvn_lockop regarding 5361 * the way cowcnts and lckcnts are handled. 
5362 */ 5363 for (svp = &svd->vpage[seg_page(seg, addr)]; svp < evp; svp++) { 5364 5365 if (seg->s_szc != 0) { 5366 if (amp != NULL) { 5367 anon_array_enter(amp, anon_idx, 5368 &cookie); 5369 } 5370 if (IS_P2ALIGNED(anon_idx, pgcnt) && 5371 !segvn_claim_pages(seg, svp, offset, 5372 anon_idx, prot)) { 5373 if (amp != NULL) { 5374 anon_array_exit(&cookie); 5375 } 5376 break; 5377 } 5378 if (amp != NULL) { 5379 anon_array_exit(&cookie); 5380 } 5381 anon_idx++; 5382 } else { 5383 if (amp != NULL) { 5384 anon_array_enter(amp, anon_idx, 5385 &cookie); 5386 ap = anon_get_ptr(amp->ahp, anon_idx++); 5387 } 5388 5389 if (VPP_ISPPLOCK(svp) && 5390 VPP_PROT(svp) != prot) { 5391 5392 if (amp == NULL || ap == NULL) { 5393 vp = svd->vp; 5394 off = offset; 5395 } else 5396 swap_xlate(ap, &vp, &off); 5397 if (amp != NULL) 5398 anon_array_exit(&cookie); 5399 5400 if ((pp = page_lookup(vp, off, 5401 SE_SHARED)) == NULL) { 5402 panic("segvn_setprot: no page"); 5403 /*NOTREACHED*/ 5404 } 5405 ASSERT(seg->s_szc == 0); 5406 if ((VPP_PROT(svp) ^ prot) & 5407 PROT_WRITE) { 5408 if (prot & PROT_WRITE) { 5409 if (!page_addclaim(pp)) { 5410 page_unlock(pp); 5411 break; 5412 } 5413 } else { 5414 if (!page_subclaim(pp)) { 5415 page_unlock(pp); 5416 break; 5417 } 5418 } 5419 } 5420 page_unlock(pp); 5421 } else if (amp != NULL) 5422 anon_array_exit(&cookie); 5423 } 5424 VPP_SETPROT(svp, prot); 5425 offset += PAGESIZE; 5426 } 5427 if (amp != NULL) 5428 ANON_LOCK_EXIT(&->a_rwlock); 5429 5430 /* 5431 * Did we terminate prematurely? If so, simply unload 5432 * the translations to the things we've updated so far. 
5433 */ 5434 if (svp != evp) { 5435 len = (svp - &svd->vpage[seg_page(seg, addr)]) * 5436 PAGESIZE; 5437 ASSERT(seg->s_szc == 0 || IS_P2ALIGNED(len, pgsz)); 5438 if (len != 0) 5439 hat_unload(seg->s_as->a_hat, addr, 5440 len, HAT_UNLOAD); 5441 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5442 return (IE_NOMEM); 5443 } 5444 } else { 5445 segvn_vpage(seg); 5446 evp = &svd->vpage[seg_page(seg, addr + len)]; 5447 for (svp = &svd->vpage[seg_page(seg, addr)]; svp < evp; svp++) { 5448 VPP_SETPROT(svp, prot); 5449 } 5450 } 5451 5452 if (((prot & PROT_WRITE) != 0 && 5453 (svd->vp != NULL || svd->type == MAP_PRIVATE)) || 5454 (prot & ~PROT_USER) == PROT_NONE) { 5455 /* 5456 * Either private or shared data with write access (in 5457 * which case we need to throw out all former translations 5458 * so that we get the right translations set up on fault 5459 * and we don't allow write access to any copy-on-write pages 5460 * that might be around or to prevent write access to pages 5461 * representing holes in a file), or we don't have permission 5462 * to access the memory at all (in which case we have to 5463 * unload any current translations that might exist). 5464 */ 5465 hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD); 5466 } else { 5467 /* 5468 * A shared mapping or a private mapping in which write 5469 * protection is going to be denied - just change all the 5470 * protections over the range of addresses in question. 5471 * segvn does not support any other attributes other 5472 * than prot so we can use hat_chgattr. 5473 */ 5474 hat_chgattr(seg->s_as->a_hat, addr, len, prot); 5475 } 5476 5477 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5478 5479 return (0); 5480 } 5481 5482 /* 5483 * segvn_setpagesize is called via SEGOP_SETPAGESIZE from as_setpagesize, 5484 * to determine if the seg is capable of mapping the requested szc. 
5485 */ 5486 static int 5487 segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc) 5488 { 5489 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5490 struct segvn_data *nsvd; 5491 struct anon_map *amp = svd->amp; 5492 struct seg *nseg; 5493 caddr_t eaddr = addr + len, a; 5494 size_t pgsz = page_get_pagesize(szc); 5495 pgcnt_t pgcnt = page_get_pagecnt(szc); 5496 int err; 5497 u_offset_t off = svd->offset + (uintptr_t)(addr - seg->s_base); 5498 extern struct vnode kvp; 5499 5500 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 5501 ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size); 5502 5503 if (seg->s_szc == szc || segvn_lpg_disable != 0) { 5504 return (0); 5505 } 5506 5507 /* 5508 * addr should always be pgsz aligned but eaddr may be misaligned if 5509 * it's at the end of the segment. 5510 * 5511 * XXX we should assert this condition since as_setpagesize() logic 5512 * guarantees it. 5513 */ 5514 if (!IS_P2ALIGNED(addr, pgsz) || 5515 (!IS_P2ALIGNED(eaddr, pgsz) && 5516 eaddr != seg->s_base + seg->s_size)) { 5517 5518 segvn_setpgsz_align_err++; 5519 return (EINVAL); 5520 } 5521 5522 if (amp != NULL && svd->type == MAP_SHARED) { 5523 ulong_t an_idx = svd->anon_index + seg_page(seg, addr); 5524 if (!IS_P2ALIGNED(an_idx, pgcnt)) { 5525 5526 segvn_setpgsz_anon_align_err++; 5527 return (EINVAL); 5528 } 5529 } 5530 5531 if ((svd->flags & MAP_NORESERVE) || seg->s_as == &kas || 5532 szc > segvn_maxpgszc) { 5533 return (EINVAL); 5534 } 5535 5536 /* paranoid check */ 5537 if (svd->vp != NULL && 5538 (IS_SWAPFSVP(svd->vp) || svd->vp == &kvp)) { 5539 return (EINVAL); 5540 } 5541 5542 if (seg->s_szc == 0 && svd->vp != NULL && 5543 map_addr_vacalign_check(addr, off)) { 5544 return (EINVAL); 5545 } 5546 5547 /* 5548 * Check that protections are the same within new page 5549 * size boundaries. 
5550 */ 5551 if (svd->pageprot) { 5552 for (a = addr; a < eaddr; a += pgsz) { 5553 if ((a + pgsz) > eaddr) { 5554 if (!sameprot(seg, a, eaddr - a)) { 5555 return (EINVAL); 5556 } 5557 } else { 5558 if (!sameprot(seg, a, pgsz)) { 5559 return (EINVAL); 5560 } 5561 } 5562 } 5563 } 5564 5565 /* 5566 * Since we are changing page size we first have to flush 5567 * the cache. This makes sure all the pagelock calls have 5568 * to recheck protections. 5569 */ 5570 if (svd->softlockcnt > 0) { 5571 /* 5572 * Since we do have the segvn writers lock nobody can fill 5573 * the cache with entries belonging to this seg during 5574 * the purge. The flush either succeeds or we still have 5575 * pending I/Os. 5576 */ 5577 segvn_purge(seg); 5578 if (svd->softlockcnt > 0) { 5579 return (EAGAIN); 5580 } 5581 } 5582 5583 /* 5584 * Operation for sub range of existing segment. 5585 */ 5586 if (addr != seg->s_base || eaddr != (seg->s_base + seg->s_size)) { 5587 if (szc < seg->s_szc) { 5588 VM_STAT_ADD(segvnvmstats.demoterange[2]); 5589 err = segvn_demote_range(seg, addr, len, SDR_RANGE, 0); 5590 if (err == 0) { 5591 return (IE_RETRY); 5592 } 5593 if (err == ENOMEM) { 5594 return (IE_NOMEM); 5595 } 5596 return (err); 5597 } 5598 if (addr != seg->s_base) { 5599 nseg = segvn_split_seg(seg, addr); 5600 if (eaddr != (nseg->s_base + nseg->s_size)) { 5601 /* eaddr is szc aligned */ 5602 (void) segvn_split_seg(nseg, eaddr); 5603 } 5604 return (IE_RETRY); 5605 } 5606 if (eaddr != (seg->s_base + seg->s_size)) { 5607 /* eaddr is szc aligned */ 5608 (void) segvn_split_seg(seg, eaddr); 5609 } 5610 return (IE_RETRY); 5611 } 5612 5613 /* 5614 * Break any low level sharing and reset seg->s_szc to 0. 5615 */ 5616 if ((err = segvn_clrszc(seg)) != 0) { 5617 if (err == ENOMEM) { 5618 err = IE_NOMEM; 5619 } 5620 return (err); 5621 } 5622 ASSERT(seg->s_szc == 0); 5623 5624 /* 5625 * If the end of the current segment is not pgsz aligned 5626 * then attempt to concatenate with the next segment. 
5627 */ 5628 if (!IS_P2ALIGNED(eaddr, pgsz)) { 5629 nseg = AS_SEGNEXT(seg->s_as, seg); 5630 if (nseg == NULL || nseg == seg || eaddr != nseg->s_base) { 5631 return (ENOMEM); 5632 } 5633 if (nseg->s_ops != &segvn_ops) { 5634 return (EINVAL); 5635 } 5636 nsvd = (struct segvn_data *)nseg->s_data; 5637 if (nsvd->softlockcnt > 0) { 5638 segvn_purge(nseg); 5639 if (nsvd->softlockcnt > 0) { 5640 return (EAGAIN); 5641 } 5642 } 5643 err = segvn_clrszc(nseg); 5644 if (err == ENOMEM) { 5645 err = IE_NOMEM; 5646 } 5647 if (err != 0) { 5648 return (err); 5649 } 5650 err = segvn_concat(seg, nseg, 1); 5651 if (err == -1) { 5652 return (EINVAL); 5653 } 5654 if (err == -2) { 5655 return (IE_NOMEM); 5656 } 5657 return (IE_RETRY); 5658 } 5659 5660 /* 5661 * May need to re-align anon array to 5662 * new szc. 5663 */ 5664 if (amp != NULL) { 5665 if (!IS_P2ALIGNED(svd->anon_index, pgcnt)) { 5666 struct anon_hdr *nahp; 5667 5668 ASSERT(svd->type == MAP_PRIVATE); 5669 5670 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 5671 ASSERT(amp->refcnt == 1); 5672 nahp = anon_create(btop(amp->size), ANON_NOSLEEP); 5673 if (nahp == NULL) { 5674 ANON_LOCK_EXIT(&->a_rwlock); 5675 return (IE_NOMEM); 5676 } 5677 if (anon_copy_ptr(amp->ahp, svd->anon_index, 5678 nahp, 0, btop(seg->s_size), ANON_NOSLEEP)) { 5679 anon_release(nahp, btop(amp->size)); 5680 ANON_LOCK_EXIT(&->a_rwlock); 5681 return (IE_NOMEM); 5682 } 5683 anon_release(amp->ahp, btop(amp->size)); 5684 amp->ahp = nahp; 5685 svd->anon_index = 0; 5686 ANON_LOCK_EXIT(&->a_rwlock); 5687 } 5688 } 5689 if (svd->vp != NULL && szc != 0) { 5690 struct vattr va; 5691 u_offset_t eoffpage = svd->offset; 5692 va.va_mask = AT_SIZE; 5693 eoffpage += seg->s_size; 5694 eoffpage = btopr(eoffpage); 5695 if (VOP_GETATTR(svd->vp, &va, 0, svd->cred) != 0) { 5696 segvn_setpgsz_getattr_err++; 5697 return (EINVAL); 5698 } 5699 if (btopr(va.va_size) < eoffpage) { 5700 segvn_setpgsz_eof_err++; 5701 return (EINVAL); 5702 } 5703 if (amp != NULL) { 5704 /* 5705 * 
anon_fill_cow_holes() may call VOP_GETPAGE(). 5706 * don't take anon map lock here to avoid holding it 5707 * across VOP_GETPAGE() calls that may call back into 5708 * segvn for klsutering checks. We don't really need 5709 * anon map lock here since it's a private segment and 5710 * we hold as level lock as writers. 5711 */ 5712 if ((err = anon_fill_cow_holes(seg, seg->s_base, 5713 amp->ahp, svd->anon_index, svd->vp, svd->offset, 5714 seg->s_size, szc, svd->prot, svd->vpage, 5715 svd->cred)) != 0) { 5716 return (EINVAL); 5717 } 5718 } 5719 segvn_setvnode_mpss(svd->vp); 5720 } 5721 5722 if (amp != NULL) { 5723 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 5724 if (svd->type == MAP_PRIVATE) { 5725 amp->a_szc = szc; 5726 } else if (szc > amp->a_szc) { 5727 amp->a_szc = szc; 5728 } 5729 ANON_LOCK_EXIT(&->a_rwlock); 5730 } 5731 5732 seg->s_szc = szc; 5733 5734 return (0); 5735 } 5736 5737 static int 5738 segvn_clrszc(struct seg *seg) 5739 { 5740 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5741 struct anon_map *amp = svd->amp; 5742 size_t pgsz; 5743 pgcnt_t pages; 5744 int err = 0; 5745 caddr_t a = seg->s_base; 5746 caddr_t ea = a + seg->s_size; 5747 ulong_t an_idx = svd->anon_index; 5748 vnode_t *vp = svd->vp; 5749 struct vpage *vpage = svd->vpage; 5750 page_t *anon_pl[1 + 1], *pp; 5751 struct anon *ap, *oldap; 5752 uint_t prot = svd->prot, vpprot; 5753 5754 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) || 5755 SEGVN_WRITE_HELD(seg->s_as, &svd->lock)); 5756 5757 if (vp == NULL && amp == NULL) { 5758 seg->s_szc = 0; 5759 return (0); 5760 } 5761 5762 /* 5763 * do HAT_UNLOAD_UNMAP since we are changing the pagesize. 5764 * unload argument is 0 when we are freeing the segment 5765 * and unload was already done. 
5766 */ 5767 hat_unload(seg->s_as->a_hat, seg->s_base, seg->s_size, 5768 HAT_UNLOAD_UNMAP); 5769 5770 if (amp == NULL || svd->type == MAP_SHARED) { 5771 seg->s_szc = 0; 5772 return (0); 5773 } 5774 5775 pgsz = page_get_pagesize(seg->s_szc); 5776 pages = btop(pgsz); 5777 5778 /* 5779 * XXX anon rwlock is not really needed because this is a 5780 * private segment and we are writers. 5781 */ 5782 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 5783 5784 for (; a < ea; a += pgsz, an_idx += pages) { 5785 if ((oldap = anon_get_ptr(amp->ahp, an_idx)) != NULL) { 5786 if (svd->pageprot != 0) { 5787 ASSERT(vpage != NULL); 5788 prot = VPP_PROT(vpage); 5789 ASSERT(sameprot(seg, a, pgsz)); 5790 } 5791 if (seg->s_szc != 0) { 5792 ASSERT(vp == NULL || anon_pages(amp->ahp, 5793 an_idx, pages) == pages); 5794 if ((err = anon_map_demotepages(amp, an_idx, 5795 seg, a, prot, vpage, svd->cred)) != 0) { 5796 goto out; 5797 } 5798 } else { 5799 if (oldap->an_refcnt == 1) { 5800 continue; 5801 } 5802 if ((err = anon_getpage(&oldap, &vpprot, 5803 anon_pl, PAGESIZE, seg, a, S_READ, 5804 svd->cred))) { 5805 goto out; 5806 } 5807 if ((pp = anon_private(&ap, seg, a, prot, 5808 anon_pl[0], 0, svd->cred)) == NULL) { 5809 err = ENOMEM; 5810 goto out; 5811 } 5812 anon_decref(oldap); 5813 (void) anon_set_ptr(amp->ahp, an_idx, ap, 5814 ANON_SLEEP); 5815 page_unlock(pp); 5816 } 5817 } 5818 vpage = (vpage == NULL) ? 
NULL : vpage + pages; 5819 } 5820 5821 amp->a_szc = 0; 5822 seg->s_szc = 0; 5823 out: 5824 ANON_LOCK_EXIT(&->a_rwlock); 5825 return (err); 5826 } 5827 5828 static int 5829 segvn_claim_pages( 5830 struct seg *seg, 5831 struct vpage *svp, 5832 u_offset_t off, 5833 ulong_t anon_idx, 5834 uint_t prot) 5835 { 5836 pgcnt_t pgcnt = page_get_pagecnt(seg->s_szc); 5837 size_t ppasize = (pgcnt + 1) * sizeof (page_t *); 5838 page_t **ppa; 5839 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5840 struct anon_map *amp = svd->amp; 5841 struct vpage *evp = svp + pgcnt; 5842 caddr_t addr = ((uintptr_t)(svp - svd->vpage) << PAGESHIFT) 5843 + seg->s_base; 5844 struct anon *ap; 5845 struct vnode *vp = svd->vp; 5846 page_t *pp; 5847 pgcnt_t pg_idx, i; 5848 int err = 0; 5849 anoff_t aoff; 5850 int anon = (amp != NULL) ? 1 : 0; 5851 5852 ASSERT(svd->type == MAP_PRIVATE); 5853 ASSERT(svd->vpage != NULL); 5854 ASSERT(seg->s_szc != 0); 5855 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 5856 ASSERT(amp == NULL || IS_P2ALIGNED(anon_idx, pgcnt)); 5857 ASSERT(sameprot(seg, addr, pgcnt << PAGESHIFT)); 5858 5859 if (VPP_PROT(svp) == prot) 5860 return (1); 5861 if (!((VPP_PROT(svp) ^ prot) & PROT_WRITE)) 5862 return (1); 5863 5864 ppa = kmem_alloc(ppasize, KM_SLEEP); 5865 if (anon && vp != NULL) { 5866 if (anon_get_ptr(amp->ahp, anon_idx) == NULL) { 5867 anon = 0; 5868 ASSERT(!anon_pages(amp->ahp, anon_idx, pgcnt)); 5869 } 5870 ASSERT(!anon || 5871 anon_pages(amp->ahp, anon_idx, pgcnt) == pgcnt); 5872 } 5873 5874 for (*ppa = NULL, pg_idx = 0; svp < evp; svp++, anon_idx++) { 5875 if (!VPP_ISPPLOCK(svp)) 5876 continue; 5877 if (anon) { 5878 ap = anon_get_ptr(amp->ahp, anon_idx); 5879 if (ap == NULL) { 5880 panic("segvn_claim_pages: no anon slot"); 5881 } 5882 swap_xlate(ap, &vp, &aoff); 5883 off = (u_offset_t)aoff; 5884 } 5885 ASSERT(vp != NULL); 5886 if ((pp = page_lookup(vp, 5887 (u_offset_t)off, SE_SHARED)) == NULL) { 5888 panic("segvn_claim_pages: no page"); 5889 } 5890 ppa[pg_idx++] = pp; 5891 
off += PAGESIZE; 5892 } 5893 5894 if (ppa[0] == NULL) { 5895 kmem_free(ppa, ppasize); 5896 return (1); 5897 } 5898 5899 ASSERT(pg_idx <= pgcnt); 5900 ppa[pg_idx] = NULL; 5901 5902 if (prot & PROT_WRITE) 5903 err = page_addclaim_pages(ppa); 5904 else 5905 err = page_subclaim_pages(ppa); 5906 5907 for (i = 0; i < pg_idx; i++) { 5908 ASSERT(ppa[i] != NULL); 5909 page_unlock(ppa[i]); 5910 } 5911 5912 kmem_free(ppa, ppasize); 5913 return (err); 5914 } 5915 5916 /* 5917 * Returns right (upper address) segment if split occured. 5918 * If the address is equal to the beginning or end of its segment it returns 5919 * the current segment. 5920 */ 5921 static struct seg * 5922 segvn_split_seg(struct seg *seg, caddr_t addr) 5923 { 5924 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5925 struct seg *nseg; 5926 size_t nsize; 5927 struct segvn_data *nsvd; 5928 5929 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 5930 ASSERT(addr >= seg->s_base); 5931 ASSERT(addr <= seg->s_base + seg->s_size); 5932 5933 if (addr == seg->s_base || addr == seg->s_base + seg->s_size) 5934 return (seg); 5935 5936 nsize = seg->s_base + seg->s_size - addr; 5937 seg->s_size = addr - seg->s_base; 5938 nseg = seg_alloc(seg->s_as, addr, nsize); 5939 ASSERT(nseg != NULL); 5940 nseg->s_ops = seg->s_ops; 5941 nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 5942 nseg->s_data = (void *)nsvd; 5943 nseg->s_szc = seg->s_szc; 5944 *nsvd = *svd; 5945 rw_init(&nsvd->lock, NULL, RW_DEFAULT, NULL); 5946 5947 if (nsvd->vp != NULL) { 5948 VN_HOLD(nsvd->vp); 5949 nsvd->offset = svd->offset + 5950 (uintptr_t)(nseg->s_base - seg->s_base); 5951 if (nsvd->type == MAP_SHARED) 5952 lgrp_shm_policy_init(NULL, nsvd->vp); 5953 } else { 5954 /* 5955 * The offset for an anonymous segment has no signifigance in 5956 * terms of an offset into a file. 
If we were to use the above 5957 * calculation instead, the structures read out of 5958 * /proc/<pid>/xmap would be more difficult to decipher since 5959 * it would be unclear whether two seemingly contiguous 5960 * prxmap_t structures represented different segments or a 5961 * single segment that had been split up into multiple prxmap_t 5962 * structures (e.g. if some part of the segment had not yet 5963 * been faulted in). 5964 */ 5965 nsvd->offset = 0; 5966 } 5967 5968 ASSERT(svd->softlockcnt == 0); 5969 crhold(svd->cred); 5970 5971 if (svd->vpage != NULL) { 5972 size_t bytes = vpgtob(seg_pages(seg)); 5973 size_t nbytes = vpgtob(seg_pages(nseg)); 5974 struct vpage *ovpage = svd->vpage; 5975 5976 svd->vpage = kmem_alloc(bytes, KM_SLEEP); 5977 bcopy(ovpage, svd->vpage, bytes); 5978 nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP); 5979 bcopy(ovpage + seg_pages(seg), nsvd->vpage, nbytes); 5980 kmem_free(ovpage, bytes + nbytes); 5981 } 5982 if (svd->amp != NULL && svd->type == MAP_PRIVATE) { 5983 struct anon_map *oamp = svd->amp, *namp; 5984 struct anon_hdr *nahp; 5985 5986 ANON_LOCK_ENTER(&oamp->a_rwlock, RW_WRITER); 5987 ASSERT(oamp->refcnt == 1); 5988 nahp = anon_create(btop(seg->s_size), ANON_SLEEP); 5989 (void) anon_copy_ptr(oamp->ahp, svd->anon_index, 5990 nahp, 0, btop(seg->s_size), ANON_SLEEP); 5991 5992 namp = anonmap_alloc(nseg->s_size, 0); 5993 namp->a_szc = nseg->s_szc; 5994 (void) anon_copy_ptr(oamp->ahp, 5995 svd->anon_index + btop(seg->s_size), 5996 namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP); 5997 anon_release(oamp->ahp, btop(oamp->size)); 5998 oamp->ahp = nahp; 5999 oamp->size = seg->s_size; 6000 svd->anon_index = 0; 6001 nsvd->amp = namp; 6002 nsvd->anon_index = 0; 6003 ANON_LOCK_EXIT(&oamp->a_rwlock); 6004 } else if (svd->amp != NULL) { 6005 pgcnt_t pgcnt = page_get_pagecnt(seg->s_szc); 6006 ASSERT(svd->amp == nsvd->amp); 6007 ASSERT(seg->s_szc <= svd->amp->a_szc); 6008 nsvd->anon_index = svd->anon_index + seg_pages(seg); 6009 
ASSERT(IS_P2ALIGNED(nsvd->anon_index, pgcnt)); 6010 ANON_LOCK_ENTER(&svd->amp->a_rwlock, RW_WRITER); 6011 svd->amp->refcnt++; 6012 ANON_LOCK_EXIT(&svd->amp->a_rwlock); 6013 } 6014 6015 /* 6016 * Split amount of swap reserve 6017 */ 6018 if (svd->swresv) { 6019 /* 6020 * For MAP_NORESERVE, only allocate swap reserve for pages 6021 * being used. Other segments get enough to cover whole 6022 * segment. 6023 */ 6024 if (svd->flags & MAP_NORESERVE) { 6025 size_t oswresv; 6026 6027 ASSERT(svd->amp); 6028 oswresv = svd->swresv; 6029 svd->swresv = ptob(anon_pages(svd->amp->ahp, 6030 svd->anon_index, btop(seg->s_size))); 6031 nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp, 6032 nsvd->anon_index, btop(nseg->s_size))); 6033 ASSERT(oswresv >= (svd->swresv + nsvd->swresv)); 6034 } else { 6035 ASSERT(svd->swresv == seg->s_size + nseg->s_size); 6036 svd->swresv = seg->s_size; 6037 nsvd->swresv = nseg->s_size; 6038 } 6039 } 6040 6041 return (nseg); 6042 } 6043 6044 /* 6045 * called on memory operations (unmap, setprot, setpagesize) for a subset 6046 * of a large page segment to either demote the memory range (SDR_RANGE) 6047 * or the ends (SDR_END) by addr/len. 6048 * 6049 * returns 0 on success. returns errno, including ENOMEM, on failure. 
6050 */ 6051 static int 6052 segvn_demote_range( 6053 struct seg *seg, 6054 caddr_t addr, 6055 size_t len, 6056 int flag, 6057 uint_t szcvec) 6058 { 6059 caddr_t eaddr = addr + len; 6060 caddr_t lpgaddr, lpgeaddr; 6061 struct seg *nseg; 6062 struct seg *badseg1 = NULL; 6063 struct seg *badseg2 = NULL; 6064 size_t pgsz; 6065 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6066 int err; 6067 uint_t szc = seg->s_szc; 6068 uint_t tszcvec; 6069 6070 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 6071 ASSERT(szc != 0); 6072 pgsz = page_get_pagesize(szc); 6073 ASSERT(seg->s_base != addr || seg->s_size != len); 6074 ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size); 6075 ASSERT(svd->softlockcnt == 0); 6076 ASSERT(szcvec == 0 || (flag == SDR_END && svd->type == MAP_SHARED)); 6077 6078 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 6079 ASSERT(flag == SDR_RANGE || eaddr < lpgeaddr || addr > lpgaddr); 6080 if (flag == SDR_RANGE) { 6081 /* demote entire range */ 6082 badseg1 = nseg = segvn_split_seg(seg, lpgaddr); 6083 (void) segvn_split_seg(nseg, lpgeaddr); 6084 ASSERT(badseg1->s_base == lpgaddr); 6085 ASSERT(badseg1->s_size == lpgeaddr - lpgaddr); 6086 } else if (addr != lpgaddr) { 6087 ASSERT(flag == SDR_END); 6088 badseg1 = nseg = segvn_split_seg(seg, lpgaddr); 6089 if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz && 6090 eaddr < lpgaddr + 2 * pgsz) { 6091 (void) segvn_split_seg(nseg, lpgeaddr); 6092 ASSERT(badseg1->s_base == lpgaddr); 6093 ASSERT(badseg1->s_size == 2 * pgsz); 6094 } else { 6095 nseg = segvn_split_seg(nseg, lpgaddr + pgsz); 6096 ASSERT(badseg1->s_base == lpgaddr); 6097 ASSERT(badseg1->s_size == pgsz); 6098 if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz) { 6099 ASSERT(lpgeaddr - lpgaddr > 2 * pgsz); 6100 nseg = segvn_split_seg(nseg, lpgeaddr - pgsz); 6101 badseg2 = nseg; 6102 (void) segvn_split_seg(nseg, lpgeaddr); 6103 ASSERT(badseg2->s_base == lpgeaddr - pgsz); 6104 ASSERT(badseg2->s_size == pgsz); 6105 } 6106 
} 6107 } else { 6108 ASSERT(flag == SDR_END); 6109 ASSERT(eaddr < lpgeaddr); 6110 badseg1 = nseg = segvn_split_seg(seg, lpgeaddr - pgsz); 6111 (void) segvn_split_seg(nseg, lpgeaddr); 6112 ASSERT(badseg1->s_base == lpgeaddr - pgsz); 6113 ASSERT(badseg1->s_size == pgsz); 6114 } 6115 6116 ASSERT(badseg1 != NULL); 6117 ASSERT(badseg1->s_szc == szc); 6118 ASSERT(flag == SDR_RANGE || badseg1->s_size == pgsz || 6119 badseg1->s_size == 2 * pgsz); 6120 ASSERT(sameprot(badseg1, badseg1->s_base, pgsz)); 6121 ASSERT(badseg1->s_size == pgsz || 6122 sameprot(badseg1, badseg1->s_base + pgsz, pgsz)); 6123 if (err = segvn_clrszc(badseg1)) { 6124 return (err); 6125 } 6126 ASSERT(badseg1->s_szc == 0); 6127 6128 if (szc > 1 && (tszcvec = P2PHASE(szcvec, 1 << szc)) > 1) { 6129 uint_t tszc = highbit(tszcvec) - 1; 6130 caddr_t ta = MAX(addr, badseg1->s_base); 6131 caddr_t te; 6132 size_t tpgsz = page_get_pagesize(tszc); 6133 6134 ASSERT(svd->type == MAP_SHARED); 6135 ASSERT(flag == SDR_END); 6136 ASSERT(tszc < szc && tszc > 0); 6137 6138 if (eaddr > badseg1->s_base + badseg1->s_size) { 6139 te = badseg1->s_base + badseg1->s_size; 6140 } else { 6141 te = eaddr; 6142 } 6143 6144 ASSERT(ta <= te); 6145 badseg1->s_szc = tszc; 6146 if (!IS_P2ALIGNED(ta, tpgsz) || !IS_P2ALIGNED(te, tpgsz)) { 6147 if (badseg2 != NULL) { 6148 err = segvn_demote_range(badseg1, ta, te - ta, 6149 SDR_END, tszcvec); 6150 if (err != 0) { 6151 return (err); 6152 } 6153 } else { 6154 return (segvn_demote_range(badseg1, ta, 6155 te - ta, SDR_END, tszcvec)); 6156 } 6157 } 6158 } 6159 6160 if (badseg2 == NULL) 6161 return (0); 6162 ASSERT(badseg2->s_szc == szc); 6163 ASSERT(badseg2->s_size == pgsz); 6164 ASSERT(sameprot(badseg2, badseg2->s_base, badseg2->s_size)); 6165 if (err = segvn_clrszc(badseg2)) { 6166 return (err); 6167 } 6168 ASSERT(badseg2->s_szc == 0); 6169 6170 if (szc > 1 && (tszcvec = P2PHASE(szcvec, 1 << szc)) > 1) { 6171 uint_t tszc = highbit(tszcvec) - 1; 6172 size_t tpgsz = page_get_pagesize(tszc); 6173 
6174 ASSERT(svd->type == MAP_SHARED); 6175 ASSERT(flag == SDR_END); 6176 ASSERT(tszc < szc && tszc > 0); 6177 ASSERT(badseg2->s_base > addr); 6178 ASSERT(eaddr > badseg2->s_base); 6179 ASSERT(eaddr < badseg2->s_base + badseg2->s_size); 6180 6181 badseg2->s_szc = tszc; 6182 if (!IS_P2ALIGNED(eaddr, tpgsz)) { 6183 return (segvn_demote_range(badseg2, badseg2->s_base, 6184 eaddr - badseg2->s_base, SDR_END, tszcvec)); 6185 } 6186 } 6187 6188 return (0); 6189 } 6190 6191 static int 6192 segvn_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) 6193 { 6194 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6195 struct vpage *vp, *evp; 6196 6197 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6198 6199 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6200 /* 6201 * If segment protection can be used, simply check against them. 6202 */ 6203 if (svd->pageprot == 0) { 6204 int err; 6205 6206 err = ((svd->prot & prot) != prot) ? EACCES : 0; 6207 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6208 return (err); 6209 } 6210 6211 /* 6212 * Have to check down to the vpage level. 
6213 */ 6214 evp = &svd->vpage[seg_page(seg, addr + len)]; 6215 for (vp = &svd->vpage[seg_page(seg, addr)]; vp < evp; vp++) { 6216 if ((VPP_PROT(vp) & prot) != prot) { 6217 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6218 return (EACCES); 6219 } 6220 } 6221 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6222 return (0); 6223 } 6224 6225 static int 6226 segvn_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv) 6227 { 6228 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6229 size_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1; 6230 6231 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6232 6233 if (pgno != 0) { 6234 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6235 if (svd->pageprot == 0) { 6236 do 6237 protv[--pgno] = svd->prot; 6238 while (pgno != 0); 6239 } else { 6240 size_t pgoff = seg_page(seg, addr); 6241 6242 do { 6243 pgno--; 6244 protv[pgno] = VPP_PROT(&svd->vpage[pgno+pgoff]); 6245 } while (pgno != 0); 6246 } 6247 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6248 } 6249 return (0); 6250 } 6251 6252 static u_offset_t 6253 segvn_getoffset(struct seg *seg, caddr_t addr) 6254 { 6255 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6256 6257 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6258 6259 return (svd->offset + (uintptr_t)(addr - seg->s_base)); 6260 } 6261 6262 /*ARGSUSED*/ 6263 static int 6264 segvn_gettype(struct seg *seg, caddr_t addr) 6265 { 6266 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6267 6268 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6269 6270 return (svd->type | (svd->flags & MAP_NORESERVE)); 6271 } 6272 6273 /*ARGSUSED*/ 6274 static int 6275 segvn_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp) 6276 { 6277 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6278 6279 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6280 6281 *vpp = svd->vp; 6282 return (0); 6283 } 6284 6285 /* 6286 * Check to 
see if it makes sense to do kluster/read ahead to 6287 * addr + delta relative to the mapping at addr. We assume here 6288 * that delta is a signed PAGESIZE'd multiple (which can be negative). 6289 * 6290 * For segvn, we currently "approve" of the action if we are 6291 * still in the segment and it maps from the same vp/off, 6292 * or if the advice stored in segvn_data or vpages allows it. 6293 * Currently, klustering is not allowed only if MADV_RANDOM is set. 6294 */ 6295 static int 6296 segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta) 6297 { 6298 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6299 struct anon *oap, *ap; 6300 ssize_t pd; 6301 size_t page; 6302 struct vnode *vp1, *vp2; 6303 u_offset_t off1, off2; 6304 struct anon_map *amp; 6305 6306 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6307 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) || 6308 SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 6309 6310 if (addr + delta < seg->s_base || 6311 addr + delta >= (seg->s_base + seg->s_size)) 6312 return (-1); /* exceeded segment bounds */ 6313 6314 pd = delta / (ssize_t)PAGESIZE; /* divide to preserve sign bit */ 6315 page = seg_page(seg, addr); 6316 6317 /* 6318 * Check to see if either of the pages addr or addr + delta 6319 * have advice set that prevents klustering (if MADV_RANDOM advice 6320 * is set for entire segment, or MADV_SEQUENTIAL is set and delta 6321 * is negative). 
6322 */ 6323 if (svd->advice == MADV_RANDOM || 6324 svd->advice == MADV_SEQUENTIAL && delta < 0) 6325 return (-1); 6326 else if (svd->pageadvice && svd->vpage) { 6327 struct vpage *bvpp, *evpp; 6328 6329 bvpp = &svd->vpage[page]; 6330 evpp = &svd->vpage[page + pd]; 6331 if (VPP_ADVICE(bvpp) == MADV_RANDOM || 6332 VPP_ADVICE(evpp) == MADV_SEQUENTIAL && delta < 0) 6333 return (-1); 6334 if (VPP_ADVICE(bvpp) != VPP_ADVICE(evpp) && 6335 VPP_ADVICE(evpp) == MADV_RANDOM) 6336 return (-1); 6337 } 6338 6339 if (svd->type == MAP_SHARED) 6340 return (0); /* shared mapping - all ok */ 6341 6342 if ((amp = svd->amp) == NULL) 6343 return (0); /* off original vnode */ 6344 6345 page += svd->anon_index; 6346 6347 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 6348 6349 oap = anon_get_ptr(amp->ahp, page); 6350 ap = anon_get_ptr(amp->ahp, page + pd); 6351 6352 ANON_LOCK_EXIT(&->a_rwlock); 6353 6354 if ((oap == NULL && ap != NULL) || (oap != NULL && ap == NULL)) { 6355 return (-1); /* one with and one without an anon */ 6356 } 6357 6358 if (oap == NULL) { /* implies that ap == NULL */ 6359 return (0); /* off original vnode */ 6360 } 6361 6362 /* 6363 * Now we know we have two anon pointers - check to 6364 * see if they happen to be properly allocated. 6365 */ 6366 6367 /* 6368 * XXX We cheat here and don't lock the anon slots. We can't because 6369 * we may have been called from the anon layer which might already 6370 * have locked them. We are holding a refcnt on the slots so they 6371 * can't disappear. The worst that will happen is we'll get the wrong 6372 * names (vp, off) for the slots and make a poor klustering decision. 6373 */ 6374 swap_xlate(ap, &vp1, &off1); 6375 swap_xlate(oap, &vp2, &off2); 6376 6377 6378 if (!VOP_CMP(vp1, vp2) || off1 - off2 != delta) 6379 return (-1); 6380 return (0); 6381 } 6382 6383 /* 6384 * Swap the pages of seg out to secondary storage, returning the 6385 * number of bytes of storage freed. 
6386 * 6387 * The basic idea is first to unload all translations and then to call 6388 * VOP_PUTPAGE() for all newly-unmapped pages, to push them out to the 6389 * swap device. Pages to which other segments have mappings will remain 6390 * mapped and won't be swapped. Our caller (as_swapout) has already 6391 * performed the unloading step. 6392 * 6393 * The value returned is intended to correlate well with the process's 6394 * memory requirements. However, there are some caveats: 6395 * 1) When given a shared segment as argument, this routine will 6396 * only succeed in swapping out pages for the last sharer of the 6397 * segment. (Previous callers will only have decremented mapping 6398 * reference counts.) 6399 * 2) We assume that the hat layer maintains a large enough translation 6400 * cache to capture process reference patterns. 6401 */ 6402 static size_t 6403 segvn_swapout(struct seg *seg) 6404 { 6405 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6406 struct anon_map *amp; 6407 pgcnt_t pgcnt = 0; 6408 pgcnt_t npages; 6409 pgcnt_t page; 6410 ulong_t anon_index; 6411 6412 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6413 6414 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6415 /* 6416 * Find pages unmapped by our caller and force them 6417 * out to the virtual swap device. 6418 */ 6419 if ((amp = svd->amp) != NULL) 6420 anon_index = svd->anon_index; 6421 npages = seg->s_size >> PAGESHIFT; 6422 for (page = 0; page < npages; page++) { 6423 page_t *pp; 6424 struct anon *ap; 6425 struct vnode *vp; 6426 u_offset_t off; 6427 anon_sync_obj_t cookie; 6428 6429 /* 6430 * Obtain <vp, off> pair for the page, then look it up. 6431 * 6432 * Note that this code is willing to consider regular 6433 * pages as well as anon pages. Is this appropriate here? 
6434 */ 6435 ap = NULL; 6436 if (amp != NULL) { 6437 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 6438 if (anon_array_try_enter(amp, anon_index + page, 6439 &cookie)) { 6440 ANON_LOCK_EXIT(&->a_rwlock); 6441 continue; 6442 } 6443 ap = anon_get_ptr(amp->ahp, anon_index + page); 6444 if (ap != NULL) { 6445 swap_xlate(ap, &vp, &off); 6446 } else { 6447 vp = svd->vp; 6448 off = svd->offset + ptob(page); 6449 } 6450 anon_array_exit(&cookie); 6451 ANON_LOCK_EXIT(&->a_rwlock); 6452 } else { 6453 vp = svd->vp; 6454 off = svd->offset + ptob(page); 6455 } 6456 if (vp == NULL) { /* untouched zfod page */ 6457 ASSERT(ap == NULL); 6458 continue; 6459 } 6460 6461 pp = page_lookup_nowait(vp, off, SE_SHARED); 6462 if (pp == NULL) 6463 continue; 6464 6465 6466 /* 6467 * Examine the page to see whether it can be tossed out, 6468 * keeping track of how many we've found. 6469 */ 6470 if (!page_tryupgrade(pp)) { 6471 /* 6472 * If the page has an i/o lock and no mappings, 6473 * it's very likely that the page is being 6474 * written out as a result of klustering. 6475 * Assume this is so and take credit for it here. 6476 */ 6477 if (!page_io_trylock(pp)) { 6478 if (!hat_page_is_mapped(pp)) 6479 pgcnt++; 6480 } else { 6481 page_io_unlock(pp); 6482 } 6483 page_unlock(pp); 6484 continue; 6485 } 6486 ASSERT(!page_iolock_assert(pp)); 6487 6488 6489 /* 6490 * Skip if page is locked or has mappings. 6491 * We don't need the page_struct_lock to look at lckcnt 6492 * and cowcnt because the page is exclusive locked. 6493 */ 6494 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0 || 6495 hat_page_is_mapped(pp)) { 6496 page_unlock(pp); 6497 continue; 6498 } 6499 6500 /* 6501 * dispose skips large pages so try to demote first. 6502 */ 6503 if (pp->p_szc != 0 && !page_try_demote_pages(pp)) { 6504 page_unlock(pp); 6505 /* 6506 * XXX should skip the remaining page_t's of this 6507 * large page. 
6508 */ 6509 continue; 6510 } 6511 6512 ASSERT(pp->p_szc == 0); 6513 6514 /* 6515 * No longer mapped -- we can toss it out. How 6516 * we do so depends on whether or not it's dirty. 6517 */ 6518 if (hat_ismod(pp) && pp->p_vnode) { 6519 /* 6520 * We must clean the page before it can be 6521 * freed. Setting B_FREE will cause pvn_done 6522 * to free the page when the i/o completes. 6523 * XXX: This also causes it to be accounted 6524 * as a pageout instead of a swap: need 6525 * B_SWAPOUT bit to use instead of B_FREE. 6526 * 6527 * Hold the vnode before releasing the page lock 6528 * to prevent it from being freed and re-used by 6529 * some other thread. 6530 */ 6531 VN_HOLD(vp); 6532 page_unlock(pp); 6533 6534 /* 6535 * Queue all i/o requests for the pageout thread 6536 * to avoid saturating the pageout devices. 6537 */ 6538 if (!queue_io_request(vp, off)) 6539 VN_RELE(vp); 6540 } else { 6541 /* 6542 * The page was clean, free it. 6543 * 6544 * XXX: Can we ever encounter modified pages 6545 * with no associated vnode here? 6546 */ 6547 ASSERT(pp->p_vnode != NULL); 6548 /*LINTED: constant in conditional context*/ 6549 VN_DISPOSE(pp, B_FREE, 0, kcred); 6550 } 6551 6552 /* 6553 * Credit now even if i/o is in progress. 6554 */ 6555 pgcnt++; 6556 } 6557 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6558 6559 /* 6560 * Wakeup pageout to initiate i/o on all queued requests. 6561 */ 6562 cv_signal_pageout(); 6563 return (ptob(pgcnt)); 6564 } 6565 6566 /* 6567 * Synchronize primary storage cache with real object in virtual memory. 6568 * 6569 * XXX - Anonymous pages should not be sync'ed out at all. 
6570 */ 6571 static int 6572 segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) 6573 { 6574 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6575 struct vpage *vpp; 6576 page_t *pp; 6577 u_offset_t offset; 6578 struct vnode *vp; 6579 u_offset_t off; 6580 caddr_t eaddr; 6581 int bflags; 6582 int err = 0; 6583 int segtype; 6584 int pageprot; 6585 int prot; 6586 ulong_t anon_index; 6587 struct anon_map *amp; 6588 struct anon *ap; 6589 anon_sync_obj_t cookie; 6590 6591 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6592 6593 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6594 6595 if (svd->softlockcnt > 0) { 6596 /* 6597 * flush all pages from seg cache 6598 * otherwise we may deadlock in swap_putpage 6599 * for B_INVAL page (4175402). 6600 * 6601 * Even if we grab segvn WRITER's lock or segp_slock 6602 * here, there might be another thread which could've 6603 * successfully performed lookup/insert just before 6604 * we acquired the lock here. So, grabbing either 6605 * lock here is of not much use. Until we devise 6606 * a strategy at upper layers to solve the 6607 * synchronization issues completely, we expect 6608 * applications to handle this appropriately. 6609 */ 6610 segvn_purge(seg); 6611 if (svd->softlockcnt > 0) { 6612 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6613 return (EAGAIN); 6614 } 6615 } 6616 6617 vpp = svd->vpage; 6618 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 6619 bflags = ((flags & MS_ASYNC) ? B_ASYNC : 0) | 6620 ((flags & MS_INVALIDATE) ? B_INVAL : 0); 6621 6622 if (attr) { 6623 pageprot = attr & ~(SHARED|PRIVATE); 6624 segtype = (attr & SHARED) ? MAP_SHARED : MAP_PRIVATE; 6625 6626 /* 6627 * We are done if the segment types don't match 6628 * or if we have segment level protections and 6629 * they don't match. 
6630 */ 6631 if (svd->type != segtype) { 6632 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6633 return (0); 6634 } 6635 if (vpp == NULL) { 6636 if (svd->prot != pageprot) { 6637 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6638 return (0); 6639 } 6640 prot = svd->prot; 6641 } else 6642 vpp = &svd->vpage[seg_page(seg, addr)]; 6643 6644 } else if (svd->vp && svd->amp == NULL && 6645 (flags & MS_INVALIDATE) == 0) { 6646 6647 /* 6648 * No attributes, no anonymous pages and MS_INVALIDATE flag 6649 * is not on, just use one big request. 6650 */ 6651 err = VOP_PUTPAGE(svd->vp, (offset_t)offset, len, 6652 bflags, svd->cred); 6653 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6654 return (err); 6655 } 6656 6657 if ((amp = svd->amp) != NULL) 6658 anon_index = svd->anon_index + seg_page(seg, addr); 6659 6660 for (eaddr = addr + len; addr < eaddr; addr += PAGESIZE) { 6661 ap = NULL; 6662 if (amp != NULL) { 6663 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 6664 anon_array_enter(amp, anon_index, &cookie); 6665 ap = anon_get_ptr(amp->ahp, anon_index++); 6666 if (ap != NULL) { 6667 swap_xlate(ap, &vp, &off); 6668 } else { 6669 vp = svd->vp; 6670 off = offset; 6671 } 6672 anon_array_exit(&cookie); 6673 ANON_LOCK_EXIT(&->a_rwlock); 6674 } else { 6675 vp = svd->vp; 6676 off = offset; 6677 } 6678 offset += PAGESIZE; 6679 6680 if (vp == NULL) /* untouched zfod page */ 6681 continue; 6682 6683 if (attr) { 6684 if (vpp) { 6685 prot = VPP_PROT(vpp); 6686 vpp++; 6687 } 6688 if (prot != pageprot) { 6689 continue; 6690 } 6691 } 6692 6693 /* 6694 * See if any of these pages are locked -- if so, then we 6695 * will have to truncate an invalidate request at the first 6696 * locked one. We don't need the page_struct_lock to test 6697 * as this is only advisory; even if we acquire it someone 6698 * might race in and lock the page after we unlock and before 6699 * we do the PUTPAGE, then PUTPAGE simply does nothing. 
6700 */ 6701 if (flags & MS_INVALIDATE) { 6702 if ((pp = page_lookup(vp, off, SE_SHARED)) != NULL) { 6703 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 6704 page_unlock(pp); 6705 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6706 return (EBUSY); 6707 } 6708 if (ap != NULL && pp->p_szc != 0 && 6709 page_tryupgrade(pp)) { 6710 if (pp->p_lckcnt == 0 && 6711 pp->p_cowcnt == 0) { 6712 /* 6713 * swapfs VN_DISPOSE() won't 6714 * invalidate large pages. 6715 * Attempt to demote. 6716 * XXX can't help it if it 6717 * fails. But for swapfs 6718 * pages it is no big deal. 6719 */ 6720 (void) page_try_demote_pages( 6721 pp); 6722 } 6723 } 6724 page_unlock(pp); 6725 } 6726 } else if (svd->type == MAP_SHARED && amp != NULL) { 6727 /* 6728 * Avoid writting out to disk ISM's large pages 6729 * because segspt_free_pages() relies on NULL an_pvp 6730 * of anon slots of such pages. 6731 */ 6732 6733 ASSERT(svd->vp == NULL); 6734 /* 6735 * swapfs uses page_lookup_nowait if not freeing or 6736 * invalidating and skips a page if 6737 * page_lookup_nowait returns NULL. 6738 */ 6739 pp = page_lookup_nowait(vp, off, SE_SHARED); 6740 if (pp == NULL) { 6741 continue; 6742 } 6743 if (pp->p_szc != 0) { 6744 page_unlock(pp); 6745 continue; 6746 } 6747 6748 /* 6749 * Note ISM pages are created large so (vp, off)'s 6750 * page cannot suddenly become large after we unlock 6751 * pp. 6752 */ 6753 page_unlock(pp); 6754 } 6755 /* 6756 * XXX - Should ultimately try to kluster 6757 * calls to VOP_PUTPAGE() for performance. 6758 */ 6759 VN_HOLD(vp); 6760 err = VOP_PUTPAGE(vp, (offset_t)off, PAGESIZE, 6761 bflags, svd->cred); 6762 VN_RELE(vp); 6763 if (err) 6764 break; 6765 } 6766 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6767 return (err); 6768 } 6769 6770 /* 6771 * Determine if we have data corresponding to pages in the 6772 * primary storage virtual memory cache (i.e., "in core"). 
6773 */ 6774 static size_t 6775 segvn_incore(struct seg *seg, caddr_t addr, size_t len, char *vec) 6776 { 6777 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6778 struct vnode *vp, *avp; 6779 u_offset_t offset, aoffset; 6780 size_t p, ep; 6781 int ret; 6782 struct vpage *vpp; 6783 page_t *pp; 6784 uint_t start; 6785 struct anon_map *amp; /* XXX - for locknest */ 6786 struct anon *ap; 6787 uint_t attr; 6788 anon_sync_obj_t cookie; 6789 6790 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6791 6792 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6793 if (svd->amp == NULL && svd->vp == NULL) { 6794 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6795 bzero(vec, btopr(len)); 6796 return (len); /* no anonymous pages created yet */ 6797 } 6798 6799 p = seg_page(seg, addr); 6800 ep = seg_page(seg, addr + len); 6801 start = svd->vp ? SEG_PAGE_VNODEBACKED : 0; 6802 6803 amp = svd->amp; 6804 for (; p < ep; p++, addr += PAGESIZE) { 6805 vpp = (svd->vpage) ? &svd->vpage[p]: NULL; 6806 ret = start; 6807 ap = NULL; 6808 avp = NULL; 6809 /* Grab the vnode/offset for the anon slot */ 6810 if (amp != NULL) { 6811 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 6812 anon_array_enter(amp, svd->anon_index + p, &cookie); 6813 ap = anon_get_ptr(amp->ahp, svd->anon_index + p); 6814 if (ap != NULL) { 6815 swap_xlate(ap, &avp, &aoffset); 6816 } 6817 anon_array_exit(&cookie); 6818 ANON_LOCK_EXIT(&->a_rwlock); 6819 } 6820 if ((avp != NULL) && page_exists(avp, aoffset)) { 6821 /* A page exists for the anon slot */ 6822 ret |= SEG_PAGE_INCORE; 6823 6824 /* 6825 * If page is mapped and writable 6826 */ 6827 attr = (uint_t)0; 6828 if ((hat_getattr(seg->s_as->a_hat, addr, 6829 &attr) != -1) && (attr & PROT_WRITE)) { 6830 ret |= SEG_PAGE_ANON; 6831 } 6832 /* 6833 * Don't get page_struct lock for lckcnt and cowcnt, 6834 * since this is purely advisory. 
6835 */ 6836 if ((pp = page_lookup_nowait(avp, aoffset, 6837 SE_SHARED)) != NULL) { 6838 if (pp->p_lckcnt) 6839 ret |= SEG_PAGE_SOFTLOCK; 6840 if (pp->p_cowcnt) 6841 ret |= SEG_PAGE_HASCOW; 6842 page_unlock(pp); 6843 } 6844 } 6845 6846 /* Gather vnode statistics */ 6847 vp = svd->vp; 6848 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 6849 6850 if (vp != NULL) { 6851 /* 6852 * Try to obtain a "shared" lock on the page 6853 * without blocking. If this fails, determine 6854 * if the page is in memory. 6855 */ 6856 pp = page_lookup_nowait(vp, offset, SE_SHARED); 6857 if ((pp == NULL) && (page_exists(vp, offset))) { 6858 /* Page is incore, and is named */ 6859 ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE); 6860 } 6861 /* 6862 * Don't get page_struct lock for lckcnt and cowcnt, 6863 * since this is purely advisory. 6864 */ 6865 if (pp != NULL) { 6866 ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE); 6867 if (pp->p_lckcnt) 6868 ret |= SEG_PAGE_SOFTLOCK; 6869 if (pp->p_cowcnt) 6870 ret |= SEG_PAGE_HASCOW; 6871 page_unlock(pp); 6872 } 6873 } 6874 6875 /* Gather virtual page information */ 6876 if (vpp) { 6877 if (VPP_ISPPLOCK(vpp)) 6878 ret |= SEG_PAGE_LOCKED; 6879 vpp++; 6880 } 6881 6882 *vec++ = (char)ret; 6883 } 6884 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6885 return (len); 6886 } 6887 6888 /* 6889 * Statement for p_cowcnts/p_lckcnts. 6890 * 6891 * p_cowcnt is updated while mlock/munlocking MAP_PRIVATE and PROT_WRITE region 6892 * irrespective of the following factors or anything else: 6893 * 6894 * (1) anon slots are populated or not 6895 * (2) cow is broken or not 6896 * (3) refcnt on ap is 1 or greater than 1 6897 * 6898 * If it's not MAP_PRIVATE and PROT_WRITE, p_lckcnt is updated during mlock 6899 * and munlock. 
 *
 *
 * Handling p_cowcnts/p_lckcnts during copy-on-write fault:
 *
 *	if vpage has PROT_WRITE
 *		transfer cowcnt on the oldpage -> cowcnt on the newpage
 *	else
 *		transfer lckcnt on the oldpage -> lckcnt on the newpage
 *
 *	During copy-on-write, decrement p_cowcnt on the oldpage and increment
 *	p_cowcnt on the newpage *if* the corresponding vpage has PROT_WRITE.
 *
 *	We may also break COW if softlocking on read access in the physio case.
 *	In this case, vpage may not have PROT_WRITE. So, we need to decrement
 *	p_lckcnt on the oldpage and increment p_lckcnt on the newpage *if* the
 *	vpage doesn't have PROT_WRITE.
 *
 *
 * Handling p_cowcnts/p_lckcnts during mprotect on mlocked region:
 *
 *	If a MAP_PRIVATE region loses PROT_WRITE, we decrement p_cowcnt and
 *	increment p_lckcnt by calling page_subclaim() which takes care of
 *	availrmem accounting and p_lckcnt overflow.
 *
 *	If a MAP_PRIVATE region gains PROT_WRITE, we decrement p_lckcnt and
 *	increment p_cowcnt by calling page_addclaim() which takes care of
 *	availrmem availability and p_cowcnt overflow.
 */

/*
 * Lock down (or unlock) pages mapped by this segment.
 *
 * XXX only creates PAGESIZE pages if anon slots are not initialized.
 * At fault time they will be relocated into larger pages.
6934 */ 6935 static int 6936 segvn_lockop(struct seg *seg, caddr_t addr, size_t len, 6937 int attr, int op, ulong_t *lockmap, size_t pos) 6938 { 6939 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6940 struct vpage *vpp; 6941 struct vpage *evp; 6942 page_t *pp; 6943 u_offset_t offset; 6944 u_offset_t off; 6945 int segtype; 6946 int pageprot; 6947 int claim; 6948 struct vnode *vp; 6949 ulong_t anon_index; 6950 struct anon_map *amp; 6951 struct anon *ap; 6952 struct vattr va; 6953 anon_sync_obj_t cookie; 6954 struct kshmid *sp = NULL; 6955 struct proc *p = curproc; 6956 kproject_t *proj = NULL; 6957 int chargeproc = 1; 6958 size_t locked_bytes = 0; 6959 size_t unlocked_bytes = 0; 6960 int err = 0; 6961 6962 /* 6963 * Hold write lock on address space because may split or concatenate 6964 * segments 6965 */ 6966 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6967 6968 /* 6969 * If this is a shm, use shm's project and zone, else use 6970 * project and zone of calling process 6971 */ 6972 6973 /* Determine if this segment backs a sysV shm */ 6974 if (svd->amp != NULL && svd->amp->a_sp != NULL) { 6975 sp = svd->amp->a_sp; 6976 proj = sp->shm_perm.ipc_proj; 6977 chargeproc = 0; 6978 } 6979 6980 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 6981 if (attr) { 6982 pageprot = attr & ~(SHARED|PRIVATE); 6983 segtype = attr & SHARED ? MAP_SHARED : MAP_PRIVATE; 6984 6985 /* 6986 * We are done if the segment types don't match 6987 * or if we have segment level protections and 6988 * they don't match. 6989 */ 6990 if (svd->type != segtype) { 6991 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6992 return (0); 6993 } 6994 if (svd->pageprot == 0 && svd->prot != pageprot) { 6995 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6996 return (0); 6997 } 6998 } 6999 7000 /* 7001 * If we're locking, then we must create a vpage structure if 7002 * none exists. 
If we're unlocking, then check to see if there 7003 * is a vpage -- if not, then we could not have locked anything. 7004 */ 7005 7006 if ((vpp = svd->vpage) == NULL) { 7007 if (op == MC_LOCK) 7008 segvn_vpage(seg); 7009 else { 7010 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7011 return (0); 7012 } 7013 } 7014 7015 /* 7016 * The anonymous data vector (i.e., previously 7017 * unreferenced mapping to swap space) can be allocated 7018 * by lazily testing for its existence. 7019 */ 7020 if (op == MC_LOCK && svd->amp == NULL && svd->vp == NULL) { 7021 svd->amp = anonmap_alloc(seg->s_size, 0); 7022 svd->amp->a_szc = seg->s_szc; 7023 } 7024 7025 if ((amp = svd->amp) != NULL) { 7026 anon_index = svd->anon_index + seg_page(seg, addr); 7027 } 7028 7029 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 7030 evp = &svd->vpage[seg_page(seg, addr + len)]; 7031 7032 if (sp != NULL) 7033 mutex_enter(&sp->shm_mlock); 7034 7035 /* determine number of unlocked bytes in range for lock operation */ 7036 if (op == MC_LOCK) { 7037 7038 if (sp == NULL) { 7039 for (vpp = &svd->vpage[seg_page(seg, addr)]; vpp < evp; 7040 vpp++) { 7041 if (!VPP_ISPPLOCK(vpp)) 7042 unlocked_bytes += PAGESIZE; 7043 } 7044 } else { 7045 ulong_t i_idx, i_edx; 7046 anon_sync_obj_t i_cookie; 7047 struct anon *i_ap; 7048 struct vnode *i_vp; 7049 u_offset_t i_off; 7050 7051 /* Only count sysV pages once for locked memory */ 7052 i_edx = svd->anon_index + seg_page(seg, addr + len); 7053 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7054 for (i_idx = anon_index; i_idx < i_edx; i_idx++) { 7055 anon_array_enter(amp, i_idx, &i_cookie); 7056 i_ap = anon_get_ptr(amp->ahp, i_idx); 7057 if (i_ap == NULL) { 7058 unlocked_bytes += PAGESIZE; 7059 anon_array_exit(&i_cookie); 7060 continue; 7061 } 7062 swap_xlate(i_ap, &i_vp, &i_off); 7063 anon_array_exit(&i_cookie); 7064 pp = page_lookup(i_vp, i_off, SE_SHARED); 7065 if (pp == NULL) { 7066 unlocked_bytes += PAGESIZE; 7067 continue; 7068 } else if (pp->p_lckcnt == 0) 7069 
unlocked_bytes += PAGESIZE; 7070 page_unlock(pp); 7071 } 7072 ANON_LOCK_EXIT(&->a_rwlock); 7073 } 7074 7075 mutex_enter(&p->p_lock); 7076 err = rctl_incr_locked_mem(p, proj, unlocked_bytes, 7077 chargeproc); 7078 mutex_exit(&p->p_lock); 7079 7080 if (err) { 7081 if (sp != NULL) 7082 mutex_exit(&sp->shm_mlock); 7083 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7084 return (err); 7085 } 7086 } 7087 /* 7088 * Loop over all pages in the range. Process if we're locking and 7089 * page has not already been locked in this mapping; or if we're 7090 * unlocking and the page has been locked. 7091 */ 7092 for (vpp = &svd->vpage[seg_page(seg, addr)]; vpp < evp; 7093 vpp++, pos++, addr += PAGESIZE, offset += PAGESIZE, anon_index++) { 7094 if ((attr == 0 || VPP_PROT(vpp) == pageprot) && 7095 ((op == MC_LOCK && !VPP_ISPPLOCK(vpp)) || 7096 (op == MC_UNLOCK && VPP_ISPPLOCK(vpp)))) { 7097 7098 if (amp != NULL) 7099 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7100 /* 7101 * If this isn't a MAP_NORESERVE segment and 7102 * we're locking, allocate anon slots if they 7103 * don't exist. The page is brought in later on. 7104 */ 7105 if (op == MC_LOCK && svd->vp == NULL && 7106 ((svd->flags & MAP_NORESERVE) == 0) && 7107 amp != NULL && 7108 ((ap = anon_get_ptr(amp->ahp, anon_index)) 7109 == NULL)) { 7110 anon_array_enter(amp, anon_index, &cookie); 7111 7112 if ((ap = anon_get_ptr(amp->ahp, 7113 anon_index)) == NULL) { 7114 pp = anon_zero(seg, addr, &ap, 7115 svd->cred); 7116 if (pp == NULL) { 7117 anon_array_exit(&cookie); 7118 ANON_LOCK_EXIT(&->a_rwlock); 7119 err = ENOMEM; 7120 goto out; 7121 } 7122 ASSERT(anon_get_ptr(amp->ahp, 7123 anon_index) == NULL); 7124 (void) anon_set_ptr(amp->ahp, 7125 anon_index, ap, ANON_SLEEP); 7126 page_unlock(pp); 7127 } 7128 anon_array_exit(&cookie); 7129 } 7130 7131 /* 7132 * Get name for page, accounting for 7133 * existence of private copy. 
7134 */ 7135 ap = NULL; 7136 if (amp != NULL) { 7137 anon_array_enter(amp, anon_index, &cookie); 7138 ap = anon_get_ptr(amp->ahp, anon_index); 7139 if (ap != NULL) { 7140 swap_xlate(ap, &vp, &off); 7141 } else { 7142 if (svd->vp == NULL && 7143 (svd->flags & MAP_NORESERVE)) { 7144 anon_array_exit(&cookie); 7145 ANON_LOCK_EXIT(&->a_rwlock); 7146 continue; 7147 } 7148 vp = svd->vp; 7149 off = offset; 7150 } 7151 anon_array_exit(&cookie); 7152 ANON_LOCK_EXIT(&->a_rwlock); 7153 } else { 7154 vp = svd->vp; 7155 off = offset; 7156 } 7157 7158 /* 7159 * Get page frame. It's ok if the page is 7160 * not available when we're unlocking, as this 7161 * may simply mean that a page we locked got 7162 * truncated out of existence after we locked it. 7163 * 7164 * Invoke VOP_GETPAGE() to obtain the page struct 7165 * since we may need to read it from disk if its 7166 * been paged out. 7167 */ 7168 if (op != MC_LOCK) 7169 pp = page_lookup(vp, off, SE_SHARED); 7170 else { 7171 page_t *pl[1 + 1]; 7172 int error; 7173 7174 ASSERT(vp != NULL); 7175 7176 error = VOP_GETPAGE(vp, (offset_t)off, PAGESIZE, 7177 (uint_t *)NULL, pl, PAGESIZE, seg, addr, 7178 S_OTHER, svd->cred); 7179 7180 /* 7181 * If the error is EDEADLK then we must bounce 7182 * up and drop all vm subsystem locks and then 7183 * retry the operation later 7184 * This behavior is a temporary measure because 7185 * ufs/sds logging is badly designed and will 7186 * deadlock if we don't allow this bounce to 7187 * happen. The real solution is to re-design 7188 * the logging code to work properly. See bug 7189 * 4125102 for details of the problem. 7190 */ 7191 if (error == EDEADLK) { 7192 err = error; 7193 goto out; 7194 } 7195 /* 7196 * Quit if we fail to fault in the page. Treat 7197 * the failure as an error, unless the addr 7198 * is mapped beyond the end of a file. 
7199 */ 7200 if (error && svd->vp) { 7201 va.va_mask = AT_SIZE; 7202 if (VOP_GETATTR(svd->vp, &va, 0, 7203 svd->cred) != 0) { 7204 err = EIO; 7205 goto out; 7206 } 7207 if (btopr(va.va_size) >= 7208 btopr(off + 1)) { 7209 err = EIO; 7210 goto out; 7211 } 7212 goto out; 7213 7214 } else if (error) { 7215 err = EIO; 7216 goto out; 7217 } 7218 pp = pl[0]; 7219 ASSERT(pp != NULL); 7220 } 7221 7222 /* 7223 * See Statement at the beginning of this routine. 7224 * 7225 * claim is always set if MAP_PRIVATE and PROT_WRITE 7226 * irrespective of following factors: 7227 * 7228 * (1) anon slots are populated or not 7229 * (2) cow is broken or not 7230 * (3) refcnt on ap is 1 or greater than 1 7231 * 7232 * See 4140683 for details 7233 */ 7234 claim = ((VPP_PROT(vpp) & PROT_WRITE) && 7235 (svd->type == MAP_PRIVATE)); 7236 7237 /* 7238 * Perform page-level operation appropriate to 7239 * operation. If locking, undo the SOFTLOCK 7240 * performed to bring the page into memory 7241 * after setting the lock. If unlocking, 7242 * and no page was found, account for the claim 7243 * separately. 
7244 */ 7245 if (op == MC_LOCK) { 7246 int ret = 1; /* Assume success */ 7247 7248 ASSERT(!VPP_ISPPLOCK(vpp)); 7249 7250 ret = page_pp_lock(pp, claim, 0); 7251 if (ret == 0) { 7252 /* locking page failed */ 7253 page_unlock(pp); 7254 err = EAGAIN; 7255 goto out; 7256 } 7257 VPP_SETPPLOCK(vpp); 7258 if (sp != NULL) { 7259 if (pp->p_lckcnt == 1) 7260 locked_bytes += PAGESIZE; 7261 } else 7262 locked_bytes += PAGESIZE; 7263 7264 if (lockmap != (ulong_t *)NULL) 7265 BT_SET(lockmap, pos); 7266 7267 page_unlock(pp); 7268 } else { 7269 ASSERT(VPP_ISPPLOCK(vpp)); 7270 if (pp != NULL) { 7271 /* sysV pages should be locked */ 7272 ASSERT(sp == NULL || pp->p_lckcnt > 0); 7273 page_pp_unlock(pp, claim, 0); 7274 if (sp != NULL) { 7275 if (pp->p_lckcnt == 0) 7276 unlocked_bytes 7277 += PAGESIZE; 7278 } else 7279 unlocked_bytes += PAGESIZE; 7280 page_unlock(pp); 7281 } else { 7282 ASSERT(sp != NULL); 7283 unlocked_bytes += PAGESIZE; 7284 } 7285 VPP_CLRPPLOCK(vpp); 7286 } 7287 } 7288 } 7289 out: 7290 if (op == MC_LOCK) { 7291 /* Credit back bytes that did not get locked */ 7292 if ((unlocked_bytes - locked_bytes) > 0) { 7293 if (proj == NULL) 7294 mutex_enter(&p->p_lock); 7295 rctl_decr_locked_mem(p, proj, 7296 (unlocked_bytes - locked_bytes), chargeproc); 7297 if (proj == NULL) 7298 mutex_exit(&p->p_lock); 7299 } 7300 7301 } else { 7302 /* Account bytes that were unlocked */ 7303 if (unlocked_bytes > 0) { 7304 if (proj == NULL) 7305 mutex_enter(&p->p_lock); 7306 rctl_decr_locked_mem(p, proj, unlocked_bytes, 7307 chargeproc); 7308 if (proj == NULL) 7309 mutex_exit(&p->p_lock); 7310 } 7311 } 7312 if (sp != NULL) 7313 mutex_exit(&sp->shm_mlock); 7314 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7315 7316 return (err); 7317 } 7318 7319 /* 7320 * Set advice from user for specified pages 7321 * There are 5 types of advice: 7322 * MADV_NORMAL - Normal (default) behavior (whatever that is) 7323 * MADV_RANDOM - Random page references 7324 * do not allow readahead or 'klustering' 7325 * 
MADV_SEQUENTIAL - Sequential page references 7326 * Pages previous to the one currently being 7327 * accessed (determined by fault) are 'not needed' 7328 * and are freed immediately 7329 * MADV_WILLNEED - Pages are likely to be used (fault ahead in mctl) 7330 * MADV_DONTNEED - Pages are not needed (synced out in mctl) 7331 * MADV_FREE - Contents can be discarded 7332 * MADV_ACCESS_DEFAULT- Default access 7333 * MADV_ACCESS_LWP - Next LWP will access heavily 7334 * MADV_ACCESS_MANY- Many LWPs or processes will access heavily 7335 */ 7336 static int 7337 segvn_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav) 7338 { 7339 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7340 size_t page; 7341 int err = 0; 7342 int already_set; 7343 struct anon_map *amp; 7344 ulong_t anon_index; 7345 struct seg *next; 7346 lgrp_mem_policy_t policy; 7347 struct seg *prev; 7348 struct vnode *vp; 7349 7350 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 7351 7352 /* 7353 * In case of MADV_FREE, we won't be modifying any segment private 7354 * data structures; so, we only need to grab READER's lock 7355 */ 7356 if (behav != MADV_FREE) 7357 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 7358 else 7359 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7360 7361 /* 7362 * Large pages are assumed to be only turned on when accesses to the 7363 * segment's address range have spatial and temporal locality. That 7364 * justifies ignoring MADV_SEQUENTIAL for large page segments. 
7365 * Also, ignore advice affecting lgroup memory allocation 7366 * if don't need to do lgroup optimizations on this system 7367 */ 7368 7369 if ((behav == MADV_SEQUENTIAL && seg->s_szc != 0) || 7370 (!lgrp_optimizations() && (behav == MADV_ACCESS_DEFAULT || 7371 behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY))) { 7372 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7373 return (0); 7374 } 7375 7376 if (behav == MADV_SEQUENTIAL || behav == MADV_ACCESS_DEFAULT || 7377 behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY) { 7378 /* 7379 * Since we are going to unload hat mappings 7380 * we first have to flush the cache. Otherwise 7381 * this might lead to system panic if another 7382 * thread is doing physio on the range whose 7383 * mappings are unloaded by madvise(3C). 7384 */ 7385 if (svd->softlockcnt > 0) { 7386 /* 7387 * Since we do have the segvn writers lock 7388 * nobody can fill the cache with entries 7389 * belonging to this seg during the purge. 7390 * The flush either succeeds or we still 7391 * have pending I/Os. In the later case, 7392 * madvise(3C) fails. 7393 */ 7394 segvn_purge(seg); 7395 if (svd->softlockcnt > 0) { 7396 /* 7397 * Since madvise(3C) is advisory and 7398 * it's not part of UNIX98, madvise(3C) 7399 * failure here doesn't cause any hardship. 7400 * Note that we don't block in "as" layer. 7401 */ 7402 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7403 return (EAGAIN); 7404 } 7405 } 7406 } 7407 7408 amp = svd->amp; 7409 vp = svd->vp; 7410 if (behav == MADV_FREE) { 7411 /* 7412 * MADV_FREE is not supported for segments with 7413 * underlying object; if anonmap is NULL, anon slots 7414 * are not yet populated and there is nothing for 7415 * us to do. As MADV_FREE is advisory, we don't 7416 * return error in either case. 
7417 */ 7418 if (vp || amp == NULL) { 7419 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7420 return (0); 7421 } 7422 7423 page = seg_page(seg, addr); 7424 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7425 anon_disclaim(amp, svd->anon_index + page, len, 0); 7426 ANON_LOCK_EXIT(&->a_rwlock); 7427 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7428 return (0); 7429 } 7430 7431 /* 7432 * If advice is to be applied to entire segment, 7433 * use advice field in seg_data structure 7434 * otherwise use appropriate vpage entry. 7435 */ 7436 if ((addr == seg->s_base) && (len == seg->s_size)) { 7437 switch (behav) { 7438 case MADV_ACCESS_LWP: 7439 case MADV_ACCESS_MANY: 7440 case MADV_ACCESS_DEFAULT: 7441 /* 7442 * Set memory allocation policy for this segment 7443 */ 7444 policy = lgrp_madv_to_policy(behav, len, svd->type); 7445 if (svd->type == MAP_SHARED) 7446 already_set = lgrp_shm_policy_set(policy, amp, 7447 svd->anon_index, vp, svd->offset, len); 7448 else { 7449 /* 7450 * For private memory, need writers lock on 7451 * address space because the segment may be 7452 * split or concatenated when changing policy 7453 */ 7454 if (AS_READ_HELD(seg->s_as, 7455 &seg->s_as->a_lock)) { 7456 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7457 return (IE_RETRY); 7458 } 7459 7460 already_set = lgrp_privm_policy_set(policy, 7461 &svd->policy_info, len); 7462 } 7463 7464 /* 7465 * If policy set already and it shouldn't be reapplied, 7466 * don't do anything. 7467 */ 7468 if (already_set && 7469 !LGRP_MEM_POLICY_REAPPLICABLE(policy)) 7470 break; 7471 7472 /* 7473 * Mark any existing pages in given range for 7474 * migration 7475 */ 7476 page_mark_migrate(seg, addr, len, amp, svd->anon_index, 7477 vp, svd->offset, 1); 7478 7479 /* 7480 * If same policy set already or this is a shared 7481 * memory segment, don't need to try to concatenate 7482 * segment with adjacent ones. 
7483 */ 7484 if (already_set || svd->type == MAP_SHARED) 7485 break; 7486 7487 /* 7488 * Try to concatenate this segment with previous 7489 * one and next one, since we changed policy for 7490 * this one and it may be compatible with adjacent 7491 * ones now. 7492 */ 7493 prev = AS_SEGPREV(seg->s_as, seg); 7494 next = AS_SEGNEXT(seg->s_as, seg); 7495 7496 if (next && next->s_ops == &segvn_ops && 7497 addr + len == next->s_base) 7498 (void) segvn_concat(seg, next, 1); 7499 7500 if (prev && prev->s_ops == &segvn_ops && 7501 addr == prev->s_base + prev->s_size) { 7502 /* 7503 * Drop lock for private data of current 7504 * segment before concatenating (deleting) it 7505 * and return IE_REATTACH to tell as_ctl() that 7506 * current segment has changed 7507 */ 7508 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7509 if (!segvn_concat(prev, seg, 1)) 7510 err = IE_REATTACH; 7511 7512 return (err); 7513 } 7514 break; 7515 7516 case MADV_SEQUENTIAL: 7517 /* 7518 * unloading mapping guarantees 7519 * detection in segvn_fault 7520 */ 7521 ASSERT(seg->s_szc == 0); 7522 hat_unload(seg->s_as->a_hat, addr, len, 7523 HAT_UNLOAD); 7524 /* FALLTHROUGH */ 7525 case MADV_NORMAL: 7526 case MADV_RANDOM: 7527 svd->advice = (uchar_t)behav; 7528 svd->pageadvice = 0; 7529 break; 7530 case MADV_WILLNEED: /* handled in memcntl */ 7531 case MADV_DONTNEED: /* handled in memcntl */ 7532 case MADV_FREE: /* handled above */ 7533 break; 7534 default: 7535 err = EINVAL; 7536 } 7537 } else { 7538 caddr_t eaddr; 7539 struct seg *new_seg; 7540 struct segvn_data *new_svd; 7541 u_offset_t off; 7542 caddr_t oldeaddr; 7543 7544 page = seg_page(seg, addr); 7545 7546 segvn_vpage(seg); 7547 7548 switch (behav) { 7549 struct vpage *bvpp, *evpp; 7550 7551 case MADV_ACCESS_LWP: 7552 case MADV_ACCESS_MANY: 7553 case MADV_ACCESS_DEFAULT: 7554 /* 7555 * Set memory allocation policy for portion of this 7556 * segment 7557 */ 7558 7559 /* 7560 * Align address and length of advice to page 7561 * boundaries for large pages 
7562 */ 7563 if (seg->s_szc != 0) { 7564 size_t pgsz; 7565 7566 pgsz = page_get_pagesize(seg->s_szc); 7567 addr = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 7568 len = P2ROUNDUP(len, pgsz); 7569 } 7570 7571 /* 7572 * Check to see whether policy is set already 7573 */ 7574 policy = lgrp_madv_to_policy(behav, len, svd->type); 7575 7576 anon_index = svd->anon_index + page; 7577 off = svd->offset + (uintptr_t)(addr - seg->s_base); 7578 7579 if (svd->type == MAP_SHARED) 7580 already_set = lgrp_shm_policy_set(policy, amp, 7581 anon_index, vp, off, len); 7582 else 7583 already_set = 7584 (policy == svd->policy_info.mem_policy); 7585 7586 /* 7587 * If policy set already and it shouldn't be reapplied, 7588 * don't do anything. 7589 */ 7590 if (already_set && 7591 !LGRP_MEM_POLICY_REAPPLICABLE(policy)) 7592 break; 7593 7594 /* 7595 * For private memory, need writers lock on 7596 * address space because the segment may be 7597 * split or concatenated when changing policy 7598 */ 7599 if (svd->type == MAP_PRIVATE && 7600 AS_READ_HELD(seg->s_as, &seg->s_as->a_lock)) { 7601 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7602 return (IE_RETRY); 7603 } 7604 7605 /* 7606 * Mark any existing pages in given range for 7607 * migration 7608 */ 7609 page_mark_migrate(seg, addr, len, amp, svd->anon_index, 7610 vp, svd->offset, 1); 7611 7612 /* 7613 * Don't need to try to split or concatenate 7614 * segments, since policy is same or this is a shared 7615 * memory segment 7616 */ 7617 if (already_set || svd->type == MAP_SHARED) 7618 break; 7619 7620 /* 7621 * Split off new segment if advice only applies to a 7622 * portion of existing segment starting in middle 7623 */ 7624 new_seg = NULL; 7625 eaddr = addr + len; 7626 oldeaddr = seg->s_base + seg->s_size; 7627 if (addr > seg->s_base) { 7628 /* 7629 * Must flush I/O page cache 7630 * before splitting segment 7631 */ 7632 if (svd->softlockcnt > 0) 7633 segvn_purge(seg); 7634 7635 /* 7636 * Split segment and return IE_REATTACH to tell 7637 * as_ctl() 
that current segment changed 7638 */ 7639 new_seg = segvn_split_seg(seg, addr); 7640 new_svd = (struct segvn_data *)new_seg->s_data; 7641 err = IE_REATTACH; 7642 7643 /* 7644 * If new segment ends where old one 7645 * did, try to concatenate the new 7646 * segment with next one. 7647 */ 7648 if (eaddr == oldeaddr) { 7649 /* 7650 * Set policy for new segment 7651 */ 7652 (void) lgrp_privm_policy_set(policy, 7653 &new_svd->policy_info, 7654 new_seg->s_size); 7655 7656 next = AS_SEGNEXT(new_seg->s_as, 7657 new_seg); 7658 7659 if (next && 7660 next->s_ops == &segvn_ops && 7661 eaddr == next->s_base) 7662 (void) segvn_concat(new_seg, 7663 next, 1); 7664 } 7665 } 7666 7667 /* 7668 * Split off end of existing segment if advice only 7669 * applies to a portion of segment ending before 7670 * end of the existing segment 7671 */ 7672 if (eaddr < oldeaddr) { 7673 /* 7674 * Must flush I/O page cache 7675 * before splitting segment 7676 */ 7677 if (svd->softlockcnt > 0) 7678 segvn_purge(seg); 7679 7680 /* 7681 * If beginning of old segment was already 7682 * split off, use new segment to split end off 7683 * from. 7684 */ 7685 if (new_seg != NULL && new_seg != seg) { 7686 /* 7687 * Split segment 7688 */ 7689 (void) segvn_split_seg(new_seg, eaddr); 7690 7691 /* 7692 * Set policy for new segment 7693 */ 7694 (void) lgrp_privm_policy_set(policy, 7695 &new_svd->policy_info, 7696 new_seg->s_size); 7697 } else { 7698 /* 7699 * Split segment and return IE_REATTACH 7700 * to tell as_ctl() that current 7701 * segment changed 7702 */ 7703 (void) segvn_split_seg(seg, eaddr); 7704 err = IE_REATTACH; 7705 7706 (void) lgrp_privm_policy_set(policy, 7707 &svd->policy_info, seg->s_size); 7708 7709 /* 7710 * If new segment starts where old one 7711 * did, try to concatenate it with 7712 * previous segment. 
7713 */ 7714 if (addr == seg->s_base) { 7715 prev = AS_SEGPREV(seg->s_as, 7716 seg); 7717 7718 /* 7719 * Drop lock for private data 7720 * of current segment before 7721 * concatenating (deleting) it 7722 */ 7723 if (prev && 7724 prev->s_ops == 7725 &segvn_ops && 7726 addr == prev->s_base + 7727 prev->s_size) { 7728 SEGVN_LOCK_EXIT( 7729 seg->s_as, 7730 &svd->lock); 7731 (void) segvn_concat( 7732 prev, seg, 1); 7733 return (err); 7734 } 7735 } 7736 } 7737 } 7738 break; 7739 case MADV_SEQUENTIAL: 7740 ASSERT(seg->s_szc == 0); 7741 hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD); 7742 /* FALLTHROUGH */ 7743 case MADV_NORMAL: 7744 case MADV_RANDOM: 7745 bvpp = &svd->vpage[page]; 7746 evpp = &svd->vpage[page + (len >> PAGESHIFT)]; 7747 for (; bvpp < evpp; bvpp++) 7748 VPP_SETADVICE(bvpp, behav); 7749 svd->advice = MADV_NORMAL; 7750 break; 7751 case MADV_WILLNEED: /* handled in memcntl */ 7752 case MADV_DONTNEED: /* handled in memcntl */ 7753 case MADV_FREE: /* handled above */ 7754 break; 7755 default: 7756 err = EINVAL; 7757 } 7758 } 7759 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7760 return (err); 7761 } 7762 7763 /* 7764 * Create a vpage structure for this seg. 7765 */ 7766 static void 7767 segvn_vpage(struct seg *seg) 7768 { 7769 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7770 struct vpage *vp, *evp; 7771 7772 ASSERT(SEGVN_WRITE_HELD(seg->s_as, &svd->lock)); 7773 7774 /* 7775 * If no vpage structure exists, allocate one. Copy the protections 7776 * and the advice from the segment itself to the individual pages. 7777 */ 7778 if (svd->vpage == NULL) { 7779 svd->pageprot = 1; 7780 svd->pageadvice = 1; 7781 svd->vpage = kmem_zalloc(seg_pages(seg) * sizeof (struct vpage), 7782 KM_SLEEP); 7783 evp = &svd->vpage[seg_page(seg, seg->s_base + seg->s_size)]; 7784 for (vp = svd->vpage; vp < evp; vp++) { 7785 VPP_SETPROT(vp, svd->prot); 7786 VPP_SETADVICE(vp, svd->advice); 7787 } 7788 } 7789 } 7790 7791 /* 7792 * Dump the pages belonging to this segvn segment. 
7793 */ 7794 static void 7795 segvn_dump(struct seg *seg) 7796 { 7797 struct segvn_data *svd; 7798 page_t *pp; 7799 struct anon_map *amp; 7800 ulong_t anon_index; 7801 struct vnode *vp; 7802 u_offset_t off, offset; 7803 pfn_t pfn; 7804 pgcnt_t page, npages; 7805 caddr_t addr; 7806 7807 npages = seg_pages(seg); 7808 svd = (struct segvn_data *)seg->s_data; 7809 vp = svd->vp; 7810 off = offset = svd->offset; 7811 addr = seg->s_base; 7812 7813 if ((amp = svd->amp) != NULL) { 7814 anon_index = svd->anon_index; 7815 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7816 } 7817 7818 for (page = 0; page < npages; page++, offset += PAGESIZE) { 7819 struct anon *ap; 7820 int we_own_it = 0; 7821 7822 if (amp && (ap = anon_get_ptr(svd->amp->ahp, anon_index++))) { 7823 swap_xlate_nopanic(ap, &vp, &off); 7824 } else { 7825 vp = svd->vp; 7826 off = offset; 7827 } 7828 7829 /* 7830 * If pp == NULL, the page either does not exist 7831 * or is exclusively locked. So determine if it 7832 * exists before searching for it. 7833 */ 7834 7835 if ((pp = page_lookup_nowait(vp, off, SE_SHARED))) 7836 we_own_it = 1; 7837 else 7838 pp = page_exists(vp, off); 7839 7840 if (pp) { 7841 pfn = page_pptonum(pp); 7842 dump_addpage(seg->s_as, addr, pfn); 7843 if (we_own_it) 7844 page_unlock(pp); 7845 } 7846 addr += PAGESIZE; 7847 dump_timeleft = dump_timeout; 7848 } 7849 7850 if (amp != NULL) 7851 ANON_LOCK_EXIT(&->a_rwlock); 7852 } 7853 7854 /* 7855 * lock/unlock anon pages over a given range. 
Return shadow list 7856 */ 7857 static int 7858 segvn_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***ppp, 7859 enum lock_type type, enum seg_rw rw) 7860 { 7861 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7862 size_t np, adjustpages = 0, npages = (len >> PAGESHIFT); 7863 ulong_t anon_index; 7864 uint_t protchk; 7865 uint_t error; 7866 struct anon_map *amp; 7867 struct page **pplist, **pl, *pp; 7868 caddr_t a; 7869 size_t page; 7870 caddr_t lpgaddr, lpgeaddr; 7871 pgcnt_t szc0_npages = 0; 7872 7873 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_START, 7874 "segvn_pagelock: start seg %p addr %p", seg, addr); 7875 7876 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 7877 if (seg->s_szc != 0 && (type == L_PAGELOCK || type == L_PAGEUNLOCK)) { 7878 /* 7879 * We are adjusting the pagelock region to the large page size 7880 * boundary because the unlocked part of a large page cannot 7881 * be freed anyway unless all constituent pages of a large 7882 * page are locked. Therefore this adjustment allows us to 7883 * decrement availrmem by the right value (note we don't want 7884 * to just decrement availrem by the large page size without 7885 * adjusting addr and len because then we may end up 7886 * decrementing availrmem by large page size for every 7887 * constituent page locked by a new as_pagelock call). 7888 * as_pageunlock caller must always match as_pagelock call's 7889 * addr and len. 7890 * 7891 * Note segment's page size cannot change while we are holding 7892 * as lock. And then it cannot change while softlockcnt is 7893 * not 0. This will allow us to correctly recalculate large 7894 * page size region for the matching pageunlock/reclaim call. 7895 * 7896 * for pageunlock *ppp points to the pointer of page_t that 7897 * corresponds to the real unadjusted start address. Similar 7898 * for pagelock *ppp must point to the pointer of page_t that 7899 * corresponds to the real unadjusted start address. 
7900 */ 7901 size_t pgsz = page_get_pagesize(seg->s_szc); 7902 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 7903 adjustpages = ((uintptr_t)(addr - lpgaddr)) >> PAGESHIFT; 7904 } 7905 7906 if (type == L_PAGEUNLOCK) { 7907 7908 /* 7909 * update hat ref bits for /proc. We need to make sure 7910 * that threads tracing the ref and mod bits of the 7911 * address space get the right data. 7912 * Note: page ref and mod bits are updated at reclaim time 7913 */ 7914 if (seg->s_as->a_vbits) { 7915 for (a = addr; a < addr + len; a += PAGESIZE) { 7916 if (rw == S_WRITE) { 7917 hat_setstat(seg->s_as, a, 7918 PAGESIZE, P_REF | P_MOD); 7919 } else { 7920 hat_setstat(seg->s_as, a, 7921 PAGESIZE, P_REF); 7922 } 7923 } 7924 } 7925 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7926 if (seg->s_szc != 0) { 7927 VM_STAT_ADD(segvnvmstats.pagelock[0]); 7928 seg_pinactive(seg, lpgaddr, lpgeaddr - lpgaddr, 7929 *ppp - adjustpages, rw, segvn_reclaim); 7930 } else { 7931 seg_pinactive(seg, addr, len, *ppp, rw, segvn_reclaim); 7932 } 7933 7934 /* 7935 * If someone is blocked while unmapping, we purge 7936 * segment page cache and thus reclaim pplist synchronously 7937 * without waiting for seg_pasync_thread. This speeds up 7938 * unmapping in cases where munmap(2) is called, while 7939 * raw async i/o is still in progress or where a thread 7940 * exits on data fault in a multithreaded application. 7941 */ 7942 if (AS_ISUNMAPWAIT(seg->s_as) && (svd->softlockcnt > 0)) { 7943 /* 7944 * Even if we grab segvn WRITER's lock or segp_slock 7945 * here, there might be another thread which could've 7946 * successfully performed lookup/insert just before 7947 * we acquired the lock here. So, grabbing either 7948 * lock here is of not much use. Until we devise 7949 * a strategy at upper layers to solve the 7950 * synchronization issues completely, we expect 7951 * applications to handle this appropriately. 
7952 */ 7953 segvn_purge(seg); 7954 } 7955 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7956 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END, 7957 "segvn_pagelock: unlock seg %p addr %p", seg, addr); 7958 return (0); 7959 } else if (type == L_PAGERECLAIM) { 7960 VM_STAT_COND_ADD(seg->s_szc != 0, segvnvmstats.pagelock[1]); 7961 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7962 (void) segvn_reclaim(seg, addr, len, *ppp, rw); 7963 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7964 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END, 7965 "segvn_pagelock: reclaim seg %p addr %p", seg, addr); 7966 return (0); 7967 } 7968 7969 if (seg->s_szc != 0) { 7970 VM_STAT_ADD(segvnvmstats.pagelock[2]); 7971 addr = lpgaddr; 7972 len = lpgeaddr - lpgaddr; 7973 npages = (len >> PAGESHIFT); 7974 } 7975 7976 /* 7977 * for now we only support pagelock to anon memory. We've to check 7978 * protections for vnode objects and call into the vnode driver. 7979 * That's too much for a fast path. Let the fault entry point handle it. 7980 */ 7981 if (svd->vp != NULL) { 7982 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, 7983 "segvn_pagelock: mapped vnode seg %p addr %p", seg, addr); 7984 *ppp = NULL; 7985 return (ENOTSUP); 7986 } 7987 7988 /* 7989 * if anonmap is not yet created, let the fault entry point populate it 7990 * with anon ptrs. 
7991 */ 7992 if ((amp = svd->amp) == NULL) { 7993 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, 7994 "segvn_pagelock: anonmap null seg %p addr %p", seg, addr); 7995 *ppp = NULL; 7996 return (EFAULT); 7997 } 7998 7999 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 8000 8001 /* 8002 * we acquire segp_slock to prevent duplicate entries 8003 * in seg_pcache 8004 */ 8005 mutex_enter(&svd->segp_slock); 8006 8007 /* 8008 * try to find pages in segment page cache 8009 */ 8010 pplist = seg_plookup(seg, addr, len, rw); 8011 if (pplist != NULL) { 8012 mutex_exit(&svd->segp_slock); 8013 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8014 *ppp = pplist + adjustpages; 8015 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_HIT_END, 8016 "segvn_pagelock: cache hit seg %p addr %p", seg, addr); 8017 return (0); 8018 } 8019 8020 if (rw == S_READ) { 8021 protchk = PROT_READ; 8022 } else { 8023 protchk = PROT_WRITE; 8024 } 8025 8026 if (svd->pageprot == 0) { 8027 if ((svd->prot & protchk) == 0) { 8028 mutex_exit(&svd->segp_slock); 8029 error = EFAULT; 8030 goto out; 8031 } 8032 } else { 8033 /* 8034 * check page protections 8035 */ 8036 for (a = addr; a < addr + len; a += PAGESIZE) { 8037 struct vpage *vp; 8038 8039 vp = &svd->vpage[seg_page(seg, a)]; 8040 if ((VPP_PROT(vp) & protchk) == 0) { 8041 mutex_exit(&svd->segp_slock); 8042 error = EFAULT; 8043 goto out; 8044 } 8045 } 8046 } 8047 8048 /* 8049 * Avoid per page overhead of segvn_pp_lock_anonpages() for small 8050 * pages. For large pages segvn_pp_lock_anonpages() only does real 8051 * work once per large page. The tradeoff is that we may decrement 8052 * availrmem more than once for the same page but this is ok 8053 * for small pages. 
8054 */ 8055 if (seg->s_szc == 0) { 8056 mutex_enter(&freemem_lock); 8057 if (availrmem < tune.t_minarmem + npages) { 8058 mutex_exit(&freemem_lock); 8059 mutex_exit(&svd->segp_slock); 8060 error = ENOMEM; 8061 goto out; 8062 } 8063 availrmem -= npages; 8064 mutex_exit(&freemem_lock); 8065 } 8066 8067 pplist = kmem_alloc(sizeof (page_t *) * npages, KM_SLEEP); 8068 pl = pplist; 8069 *ppp = pplist + adjustpages; 8070 8071 page = seg_page(seg, addr); 8072 anon_index = svd->anon_index + page; 8073 8074 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 8075 for (a = addr; a < addr + len; a += PAGESIZE, anon_index++) { 8076 struct anon *ap; 8077 struct vnode *vp; 8078 u_offset_t off; 8079 anon_sync_obj_t cookie; 8080 8081 anon_array_enter(amp, anon_index, &cookie); 8082 ap = anon_get_ptr(amp->ahp, anon_index); 8083 if (ap == NULL) { 8084 anon_array_exit(&cookie); 8085 break; 8086 } else { 8087 /* 8088 * We must never use seg_pcache for COW pages 8089 * because we might end up with original page still 8090 * lying in seg_pcache even after private page is 8091 * created. This leads to data corruption as 8092 * aio_write refers to the page still in cache 8093 * while all other accesses refer to the private 8094 * page. 
8095 */ 8096 if (ap->an_refcnt != 1) { 8097 anon_array_exit(&cookie); 8098 break; 8099 } 8100 } 8101 swap_xlate(ap, &vp, &off); 8102 anon_array_exit(&cookie); 8103 8104 pp = page_lookup_nowait(vp, off, SE_SHARED); 8105 if (pp == NULL) { 8106 break; 8107 } 8108 if (seg->s_szc != 0 || pp->p_szc != 0) { 8109 if (!segvn_pp_lock_anonpages(pp, a == addr)) { 8110 page_unlock(pp); 8111 break; 8112 } 8113 } else { 8114 szc0_npages++; 8115 } 8116 *pplist++ = pp; 8117 } 8118 ANON_LOCK_EXIT(&->a_rwlock); 8119 8120 ASSERT(npages >= szc0_npages); 8121 8122 if (a >= addr + len) { 8123 mutex_enter(&freemem_lock); 8124 if (seg->s_szc == 0 && npages != szc0_npages) { 8125 ASSERT(svd->type == MAP_SHARED && amp->a_szc > 0); 8126 availrmem += (npages - szc0_npages); 8127 } 8128 svd->softlockcnt += npages; 8129 segvn_pages_locked += npages; 8130 mutex_exit(&freemem_lock); 8131 (void) seg_pinsert(seg, addr, len, pl, rw, SEGP_ASYNC_FLUSH, 8132 segvn_reclaim); 8133 mutex_exit(&svd->segp_slock); 8134 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8135 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_FILL_END, 8136 "segvn_pagelock: cache fill seg %p addr %p", seg, addr); 8137 return (0); 8138 } 8139 8140 mutex_exit(&svd->segp_slock); 8141 if (seg->s_szc == 0) { 8142 mutex_enter(&freemem_lock); 8143 availrmem += npages; 8144 mutex_exit(&freemem_lock); 8145 } 8146 error = EFAULT; 8147 pplist = pl; 8148 np = ((uintptr_t)(a - addr)) >> PAGESHIFT; 8149 while (np > (uint_t)0) { 8150 ASSERT(PAGE_LOCKED(*pplist)); 8151 if (seg->s_szc != 0 || (*pplist)->p_szc != 0) { 8152 segvn_pp_unlock_anonpages(*pplist, pplist == pl); 8153 } 8154 page_unlock(*pplist); 8155 np--; 8156 pplist++; 8157 } 8158 kmem_free(pl, sizeof (page_t *) * npages); 8159 out: 8160 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8161 *ppp = NULL; 8162 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, 8163 "segvn_pagelock: cache miss seg %p addr %p", seg, addr); 8164 return (error); 8165 } 8166 8167 /* 8168 * purge any cached pages in the I/O page cache 8169 
*/ 8170 static void 8171 segvn_purge(struct seg *seg) 8172 { 8173 seg_ppurge(seg); 8174 } 8175 8176 static int 8177 segvn_reclaim(struct seg *seg, caddr_t addr, size_t len, struct page **pplist, 8178 enum seg_rw rw) 8179 { 8180 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 8181 pgcnt_t np, npages; 8182 struct page **pl; 8183 pgcnt_t szc0_npages = 0; 8184 8185 #ifdef lint 8186 addr = addr; 8187 #endif 8188 8189 npages = np = (len >> PAGESHIFT); 8190 ASSERT(npages); 8191 pl = pplist; 8192 if (seg->s_szc != 0) { 8193 size_t pgsz = page_get_pagesize(seg->s_szc); 8194 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { 8195 panic("segvn_reclaim: unaligned addr or len"); 8196 /*NOTREACHED*/ 8197 } 8198 } 8199 8200 ASSERT(svd->vp == NULL && svd->amp != NULL); 8201 8202 while (np > (uint_t)0) { 8203 if (rw == S_WRITE) { 8204 hat_setrefmod(*pplist); 8205 } else { 8206 hat_setref(*pplist); 8207 } 8208 if (seg->s_szc != 0 || (*pplist)->p_szc != 0) { 8209 segvn_pp_unlock_anonpages(*pplist, pplist == pl); 8210 } else { 8211 szc0_npages++; 8212 } 8213 page_unlock(*pplist); 8214 np--; 8215 pplist++; 8216 } 8217 kmem_free(pl, sizeof (page_t *) * npages); 8218 8219 mutex_enter(&freemem_lock); 8220 segvn_pages_locked -= npages; 8221 svd->softlockcnt -= npages; 8222 if (szc0_npages != 0) { 8223 availrmem += szc0_npages; 8224 } 8225 mutex_exit(&freemem_lock); 8226 if (svd->softlockcnt <= 0) { 8227 if (AS_ISUNMAPWAIT(seg->s_as)) { 8228 mutex_enter(&seg->s_as->a_contents); 8229 if (AS_ISUNMAPWAIT(seg->s_as)) { 8230 AS_CLRUNMAPWAIT(seg->s_as); 8231 cv_broadcast(&seg->s_as->a_cv); 8232 } 8233 mutex_exit(&seg->s_as->a_contents); 8234 } 8235 } 8236 return (0); 8237 } 8238 /* 8239 * get a memory ID for an addr in a given segment 8240 * 8241 * XXX only creates PAGESIZE pages if anon slots are not initialized. 8242 * At fault time they will be relocated into larger pages. 
8243 */ 8244 static int 8245 segvn_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp) 8246 { 8247 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 8248 struct anon *ap = NULL; 8249 ulong_t anon_index; 8250 struct anon_map *amp; 8251 anon_sync_obj_t cookie; 8252 8253 if (svd->type == MAP_PRIVATE) { 8254 memidp->val[0] = (uintptr_t)seg->s_as; 8255 memidp->val[1] = (uintptr_t)addr; 8256 return (0); 8257 } 8258 8259 if (svd->type == MAP_SHARED) { 8260 if (svd->vp) { 8261 memidp->val[0] = (uintptr_t)svd->vp; 8262 memidp->val[1] = (u_longlong_t)svd->offset + 8263 (uintptr_t)(addr - seg->s_base); 8264 return (0); 8265 } else { 8266 8267 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 8268 if ((amp = svd->amp) != NULL) { 8269 anon_index = svd->anon_index + 8270 seg_page(seg, addr); 8271 } 8272 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8273 8274 ASSERT(amp != NULL); 8275 8276 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 8277 anon_array_enter(amp, anon_index, &cookie); 8278 ap = anon_get_ptr(amp->ahp, anon_index); 8279 if (ap == NULL) { 8280 page_t *pp; 8281 8282 pp = anon_zero(seg, addr, &ap, svd->cred); 8283 if (pp == NULL) { 8284 anon_array_exit(&cookie); 8285 ANON_LOCK_EXIT(&->a_rwlock); 8286 return (ENOMEM); 8287 } 8288 ASSERT(anon_get_ptr(amp->ahp, anon_index) 8289 == NULL); 8290 (void) anon_set_ptr(amp->ahp, anon_index, 8291 ap, ANON_SLEEP); 8292 page_unlock(pp); 8293 } 8294 8295 anon_array_exit(&cookie); 8296 ANON_LOCK_EXIT(&->a_rwlock); 8297 8298 memidp->val[0] = (uintptr_t)ap; 8299 memidp->val[1] = (uintptr_t)addr & PAGEOFFSET; 8300 return (0); 8301 } 8302 } 8303 return (EINVAL); 8304 } 8305 8306 static int 8307 sameprot(struct seg *seg, caddr_t a, size_t len) 8308 { 8309 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 8310 struct vpage *vpage; 8311 spgcnt_t pages = btop(len); 8312 uint_t prot; 8313 8314 if (svd->pageprot == 0) 8315 return (1); 8316 8317 ASSERT(svd->vpage != NULL); 8318 8319 vpage = &svd->vpage[seg_page(seg, a)]; 8320 prot 
= VPP_PROT(vpage); 8321 vpage++; 8322 pages--; 8323 while (pages-- > 0) { 8324 if (prot != VPP_PROT(vpage)) 8325 return (0); 8326 vpage++; 8327 } 8328 return (1); 8329 } 8330 8331 /* 8332 * Get memory allocation policy info for specified address in given segment 8333 */ 8334 static lgrp_mem_policy_info_t * 8335 segvn_getpolicy(struct seg *seg, caddr_t addr) 8336 { 8337 struct anon_map *amp; 8338 ulong_t anon_index; 8339 lgrp_mem_policy_info_t *policy_info; 8340 struct segvn_data *svn_data; 8341 u_offset_t vn_off; 8342 vnode_t *vp; 8343 8344 ASSERT(seg != NULL); 8345 8346 svn_data = (struct segvn_data *)seg->s_data; 8347 if (svn_data == NULL) 8348 return (NULL); 8349 8350 /* 8351 * Get policy info for private or shared memory 8352 */ 8353 if (svn_data->type != MAP_SHARED) 8354 policy_info = &svn_data->policy_info; 8355 else { 8356 amp = svn_data->amp; 8357 anon_index = svn_data->anon_index + seg_page(seg, addr); 8358 vp = svn_data->vp; 8359 vn_off = svn_data->offset + (uintptr_t)(addr - seg->s_base); 8360 policy_info = lgrp_shm_policy_get(amp, anon_index, vp, vn_off); 8361 } 8362 8363 return (policy_info); 8364 } 8365 8366 /*ARGSUSED*/ 8367 static int 8368 segvn_capable(struct seg *seg, segcapability_t capability) 8369 { 8370 return (0); 8371 } 8372