1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 37 */ 38 39 #pragma ident "%Z%%M% %I% %E% SMI" 40 41 /* 42 * VM - shared or copy-on-write from a vnode/anonymous memory. 43 */ 44 45 #include <sys/types.h> 46 #include <sys/param.h> 47 #include <sys/t_lock.h> 48 #include <sys/errno.h> 49 #include <sys/systm.h> 50 #include <sys/mman.h> 51 #include <sys/debug.h> 52 #include <sys/cred.h> 53 #include <sys/vmsystm.h> 54 #include <sys/tuneable.h> 55 #include <sys/bitmap.h> 56 #include <sys/swap.h> 57 #include <sys/kmem.h> 58 #include <sys/sysmacros.h> 59 #include <sys/vtrace.h> 60 #include <sys/cmn_err.h> 61 #include <sys/vm.h> 62 #include <sys/dumphdr.h> 63 #include <sys/lgrp.h> 64 65 #include <vm/hat.h> 66 #include <vm/as.h> 67 #include <vm/seg.h> 68 #include <vm/seg_vn.h> 69 #include <vm/pvn.h> 70 #include <vm/anon.h> 71 #include <vm/page.h> 72 #include <vm/vpage.h> 73 74 /* 75 * Private seg op routines. 
76 */ 77 static int segvn_dup(struct seg *seg, struct seg *newseg); 78 static int segvn_unmap(struct seg *seg, caddr_t addr, size_t len); 79 static void segvn_free(struct seg *seg); 80 static faultcode_t segvn_fault(struct hat *hat, struct seg *seg, 81 caddr_t addr, size_t len, enum fault_type type, 82 enum seg_rw rw); 83 static faultcode_t segvn_faulta(struct seg *seg, caddr_t addr); 84 static int segvn_setprot(struct seg *seg, caddr_t addr, 85 size_t len, uint_t prot); 86 static int segvn_checkprot(struct seg *seg, caddr_t addr, 87 size_t len, uint_t prot); 88 static int segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta); 89 static size_t segvn_swapout(struct seg *seg); 90 static int segvn_sync(struct seg *seg, caddr_t addr, size_t len, 91 int attr, uint_t flags); 92 static size_t segvn_incore(struct seg *seg, caddr_t addr, size_t len, 93 char *vec); 94 static int segvn_lockop(struct seg *seg, caddr_t addr, size_t len, 95 int attr, int op, ulong_t *lockmap, size_t pos); 96 static int segvn_getprot(struct seg *seg, caddr_t addr, size_t len, 97 uint_t *protv); 98 static u_offset_t segvn_getoffset(struct seg *seg, caddr_t addr); 99 static int segvn_gettype(struct seg *seg, caddr_t addr); 100 static int segvn_getvp(struct seg *seg, caddr_t addr, 101 struct vnode **vpp); 102 static int segvn_advise(struct seg *seg, caddr_t addr, size_t len, 103 uint_t behav); 104 static void segvn_dump(struct seg *seg); 105 static int segvn_pagelock(struct seg *seg, caddr_t addr, size_t len, 106 struct page ***ppp, enum lock_type type, enum seg_rw rw); 107 static int segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len, 108 uint_t szc); 109 static int segvn_getmemid(struct seg *seg, caddr_t addr, 110 memid_t *memidp); 111 static lgrp_mem_policy_info_t *segvn_getpolicy(struct seg *, caddr_t); 112 static int segvn_capable(struct seg *seg, segcapability_t capable); 113 114 struct seg_ops segvn_ops = { 115 segvn_dup, 116 segvn_unmap, 117 segvn_free, 118 segvn_fault, 119 segvn_faulta, 120 segvn_setprot, 121 segvn_checkprot, 122 segvn_kluster, 123 segvn_swapout, 124 segvn_sync, 125 segvn_incore, 126 segvn_lockop, 127 segvn_getprot, 128 segvn_getoffset, 129 segvn_gettype, 130 segvn_getvp, 131 segvn_advise, 132 segvn_dump, 133 segvn_pagelock, 134 segvn_setpagesize, 135 segvn_getmemid, 136 segvn_getpolicy, 137 segvn_capable, 138 }; 139 140 /* 141 * Common zfod structures, provided as a shorthand for others to use. 
142 */ 143 static segvn_crargs_t zfod_segvn_crargs = 144 SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL); 145 static segvn_crargs_t kzfod_segvn_crargs = 146 SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_USER, 147 PROT_ALL & ~PROT_USER); 148 static segvn_crargs_t stack_noexec_crargs = 149 SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_EXEC, PROT_ALL); 150 151 caddr_t zfod_argsp = (caddr_t)&zfod_segvn_crargs; /* user zfod argsp */ 152 caddr_t kzfod_argsp = (caddr_t)&kzfod_segvn_crargs; /* kernel zfod argsp */ 153 caddr_t stack_exec_argsp = (caddr_t)&zfod_segvn_crargs; /* executable stack */ 154 caddr_t stack_noexec_argsp = (caddr_t)&stack_noexec_crargs; /* noexec stack */ 155 156 #define vpgtob(n) ((n) * sizeof (struct vpage)) /* For brevity */ 157 158 size_t segvn_comb_thrshld = UINT_MAX; /* patchable -- see 1196681 */ 159 160 static int segvn_concat(struct seg *, struct seg *, int); 161 static int segvn_extend_prev(struct seg *, struct seg *, 162 struct segvn_crargs *, size_t); 163 static int segvn_extend_next(struct seg *, struct seg *, 164 struct segvn_crargs *, size_t); 165 static void segvn_softunlock(struct seg *, caddr_t, size_t, enum seg_rw); 166 static void segvn_pagelist_rele(page_t **); 167 static void segvn_setvnode_mpss(vnode_t *); 168 static void segvn_relocate_pages(page_t **, page_t *); 169 static int segvn_full_szcpages(page_t **, uint_t, int *, uint_t *); 170 static int segvn_fill_vp_pages(struct segvn_data *, vnode_t *, u_offset_t, 171 uint_t, page_t **, page_t **, uint_t *, int *); 172 static faultcode_t segvn_fault_vnodepages(struct hat *, struct seg *, caddr_t, 173 caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int); 174 static faultcode_t segvn_fault_anonpages(struct hat *, struct seg *, caddr_t, 175 caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int); 176 static faultcode_t segvn_faultpage(struct hat *, struct seg *, caddr_t, 177 u_offset_t, struct vpage *, page_t **, uint_t, 178 enum fault_type, enum seg_rw, int, int); 179 static void segvn_vpage(struct seg *); 180 181 static void segvn_purge(struct seg *seg); 182 static int segvn_reclaim(struct seg *, caddr_t, size_t, struct page **, 183 enum seg_rw); 184 185 static int sameprot(struct seg *, caddr_t, size_t); 186 187 static int segvn_demote_range(struct seg *, caddr_t, size_t, int, uint_t); 188 static int segvn_clrszc(struct seg *); 189 static struct seg *segvn_split_seg(struct seg *, caddr_t); 190 static int segvn_claim_pages(struct seg *, struct vpage *, u_offset_t, 191 ulong_t, uint_t); 192 193 static int segvn_pp_lock_anonpages(page_t *, int); 194 static void segvn_pp_unlock_anonpages(page_t *, int); 195 196 static struct kmem_cache *segvn_cache; 197 198 #ifdef VM_STATS 199 static struct segvnvmstats_str { 200 ulong_t fill_vp_pages[31]; 201 ulong_t fltvnpages[49]; 202 ulong_t fullszcpages[10]; 203 ulong_t relocatepages[3]; 204 ulong_t fltanpages[17]; 205 ulong_t pagelock[3]; 206 ulong_t demoterange[3]; 207 } segvnvmstats; 208 #endif /* VM_STATS */ 209 210 #define SDR_RANGE 1 /* demote entire range */ 211 #define SDR_END 2 /* demote non aligned ends only */ 212 213 #define CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr) { \ 214 if ((len) != 0) { \ 215 lpgaddr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz); \ 216 ASSERT(lpgaddr >= (seg)->s_base); \ 217 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)((addr) + \ 218 (len)), pgsz); \ 219 ASSERT(lpgeaddr > lpgaddr); \ 220 ASSERT(lpgeaddr <= (seg)->s_base + (seg)->s_size); \ 221 } else { \ 222 lpgeaddr = lpgaddr = (addr); \ 223 } \ 224 } 225 226 /*ARGSUSED*/ 227 static int 228 
segvn_cache_constructor(void *buf, void *cdrarg, int kmflags) 229 { 230 struct segvn_data *svd = buf; 231 232 rw_init(&svd->lock, NULL, RW_DEFAULT, NULL); 233 mutex_init(&svd->segp_slock, NULL, MUTEX_DEFAULT, NULL); 234 return (0); 235 } 236 237 /*ARGSUSED1*/ 238 static void 239 segvn_cache_destructor(void *buf, void *cdrarg) 240 { 241 struct segvn_data *svd = buf; 242 243 rw_destroy(&svd->lock); 244 mutex_destroy(&svd->segp_slock); 245 } 246 247 /* 248 * Patching this variable to non-zero allows the system to run with 249 * stacks marked as "not executable". It's a bit of a kludge, but is 250 * provided as a tweakable for platforms that export those ABIs 251 * (e.g. sparc V8) that have executable stacks enabled by default. 252 * There are also some restrictions for platforms that don't actually 253 * implement 'noexec' protections. 254 * 255 * Once enabled, the system is (therefore) unable to provide a fully 256 * ABI-compliant execution environment, though practically speaking, 257 * most everything works. The exceptions are generally some interpreters 258 * and debuggers that create executable code on the stack and jump 259 * into it (without explicitly mprotecting the address range to include 260 * PROT_EXEC). 261 * 262 * One important class of applications that are disabled are those 263 * that have been transformed into malicious agents using one of the 264 * numerous "buffer overflow" attacks. See 4007890. 265 */ 266 int noexec_user_stack = 0; 267 int noexec_user_stack_log = 1; 268 269 int segvn_lpg_disable = 0; 270 uint_t segvn_maxpgszc = 0; 271 272 ulong_t segvn_vmpss_clrszc_cnt; 273 ulong_t segvn_vmpss_clrszc_err; 274 ulong_t segvn_fltvnpages_clrszc_cnt; 275 ulong_t segvn_fltvnpages_clrszc_err; 276 ulong_t segvn_setpgsz_align_err; 277 ulong_t segvn_setpgsz_anon_align_err; 278 ulong_t segvn_setpgsz_getattr_err; 279 ulong_t segvn_setpgsz_eof_err; 280 ulong_t segvn_faultvnmpss_align_err1; 281 ulong_t segvn_faultvnmpss_align_err2; 282 ulong_t segvn_faultvnmpss_align_err3; 283 ulong_t segvn_faultvnmpss_align_err4; 284 ulong_t segvn_faultvnmpss_align_err5; 285 ulong_t segvn_vmpss_pageio_deadlk_err; 286 287 /* 288 * Initialize segvn data structures 289 */ 290 void 291 segvn_init(void) 292 { 293 uint_t maxszc; 294 uint_t szc; 295 size_t pgsz; 296 297 segvn_cache = kmem_cache_create("segvn_cache", 298 sizeof (struct segvn_data), 0, 299 segvn_cache_constructor, segvn_cache_destructor, NULL, 300 NULL, NULL, 0); 301 302 if (segvn_lpg_disable != 0) 303 return; 304 szc = maxszc = page_num_pagesizes() - 1; 305 if (szc == 0) { 306 segvn_lpg_disable = 1; 307 return; 308 } 309 if (page_get_pagesize(0) != PAGESIZE) { 310 panic("segvn_init: bad szc 0"); 311 /*NOTREACHED*/ 312 } 313 while (szc != 0) { 314 pgsz = page_get_pagesize(szc); 315 if (pgsz <= PAGESIZE || !IS_P2ALIGNED(pgsz, pgsz)) { 316 panic("segvn_init: bad szc %d", szc); 317 /*NOTREACHED*/ 318 } 319 szc--; 320 } 321 if (segvn_maxpgszc == 0 || segvn_maxpgszc > maxszc) 322 segvn_maxpgszc = maxszc; 323 } 324 325 #define SEGVN_PAGEIO ((void *)0x1) 326 #define SEGVN_NOPAGEIO ((void *)0x2) 327 328 static void 329 segvn_setvnode_mpss(vnode_t *vp) 330 { 331 int err; 332 333 ASSERT(vp->v_mpssdata == NULL || 334 vp->v_mpssdata == SEGVN_PAGEIO || 335 vp->v_mpssdata == SEGVN_NOPAGEIO); 336 337 if (vp->v_mpssdata == NULL) { 338 if (vn_vmpss_usepageio(vp)) { 339 err = VOP_PAGEIO(vp, (page_t *)NULL, 340 (u_offset_t)0, 0, 0, CRED()); 341 } else { 342 err = ENOSYS; 343 } 344 /* 345 * set v_mpssdata just once per vnode life 346 * so that it never changes. 
347 */ 348 mutex_enter(&vp->v_lock); 349 if (vp->v_mpssdata == NULL) { 350 if (err == EINVAL) { 351 vp->v_mpssdata = SEGVN_PAGEIO; 352 } else { 353 vp->v_mpssdata = SEGVN_NOPAGEIO; 354 } 355 } 356 mutex_exit(&vp->v_lock); 357 } 358 } 359 360 int 361 segvn_create(struct seg *seg, void *argsp) 362 { 363 struct segvn_crargs *a = (struct segvn_crargs *)argsp; 364 struct segvn_data *svd; 365 size_t swresv = 0; 366 struct cred *cred; 367 struct anon_map *amp; 368 int error = 0; 369 size_t pgsz; 370 lgrp_mem_policy_t mpolicy = LGRP_MEM_POLICY_DEFAULT; 371 372 373 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 374 375 if (a->type != MAP_PRIVATE && a->type != MAP_SHARED) { 376 panic("segvn_create type"); 377 /*NOTREACHED*/ 378 } 379 380 /* 381 * Check arguments. If a shared anon structure is given then 382 * it is illegal to also specify a vp. 383 */ 384 if (a->amp != NULL && a->vp != NULL) { 385 panic("segvn_create anon_map"); 386 /*NOTREACHED*/ 387 } 388 389 /* MAP_NORESERVE on a MAP_SHARED segment is meaningless. */ 390 if (a->type == MAP_SHARED) 391 a->flags &= ~MAP_NORESERVE; 392 393 if (a->szc != 0) { 394 if (segvn_lpg_disable != 0 || 395 (a->amp != NULL && a->type == MAP_PRIVATE) || 396 (a->flags & MAP_NORESERVE) || seg->s_as == &kas) { 397 a->szc = 0; 398 } else { 399 if (a->szc > segvn_maxpgszc) 400 a->szc = segvn_maxpgszc; 401 pgsz = page_get_pagesize(a->szc); 402 if (!IS_P2ALIGNED(seg->s_base, pgsz) || 403 !IS_P2ALIGNED(seg->s_size, pgsz)) { 404 a->szc = 0; 405 } else if (a->vp != NULL) { 406 extern struct vnode kvp; 407 if (IS_SWAPFSVP(a->vp) || a->vp == &kvp) { 408 /* 409 * paranoid check. 410 * hat_page_demote() is not supported 411 * on swapfs pages. 412 */ 413 a->szc = 0; 414 } else if (map_addr_vacalign_check(seg->s_base, 415 a->offset & PAGEMASK)) { 416 a->szc = 0; 417 } 418 } else if (a->amp != NULL) { 419 pgcnt_t anum = btopr(a->offset); 420 pgcnt_t pgcnt = page_get_pagecnt(a->szc); 421 if (!IS_P2ALIGNED(anum, pgcnt)) { 422 a->szc = 0; 423 } 424 } 425 } 426 } 427 428 /* 429 * If segment may need private pages, reserve them now. 430 */ 431 if (!(a->flags & MAP_NORESERVE) && ((a->vp == NULL && a->amp == NULL) || 432 (a->type == MAP_PRIVATE && (a->prot & PROT_WRITE)))) { 433 if (anon_resv(seg->s_size) == 0) 434 return (EAGAIN); 435 swresv = seg->s_size; 436 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 437 seg, swresv, 1); 438 } 439 440 /* 441 * Reserve any mapping structures that may be required. 442 */ 443 hat_map(seg->s_as->a_hat, seg->s_base, seg->s_size, HAT_MAP); 444 445 if (a->cred) { 446 cred = a->cred; 447 crhold(cred); 448 } else { 449 crhold(cred = CRED()); 450 } 451 452 /* Inform the vnode of the new mapping */ 453 if (a->vp) { 454 error = VOP_ADDMAP(a->vp, a->offset & PAGEMASK, 455 seg->s_as, seg->s_base, seg->s_size, a->prot, 456 a->maxprot, a->type, cred); 457 if (error) { 458 if (swresv != 0) { 459 anon_unresv(swresv); 460 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 461 "anon proc:%p %lu %u", 462 seg, swresv, 0); 463 } 464 crfree(cred); 465 hat_unload(seg->s_as->a_hat, seg->s_base, 466 seg->s_size, HAT_UNLOAD_UNMAP); 467 return (error); 468 } 469 } 470 471 /* 472 * If more than one segment in the address space, and 473 * they're adjacent virtually, try to concatenate them. 474 * Don't concatenate if an explicit anon_map structure 475 * was supplied (e.g., SystemV shared memory). 
476 */ 477 if (a->amp == NULL) { 478 struct seg *pseg, *nseg; 479 struct segvn_data *psvd, *nsvd; 480 lgrp_mem_policy_t ppolicy, npolicy; 481 uint_t lgrp_mem_policy_flags = 0; 482 extern lgrp_mem_policy_t lgrp_mem_default_policy; 483 484 /* 485 * Memory policy flags (lgrp_mem_policy_flags) is valid when 486 * extending stack/heap segments. 487 */ 488 if ((a->vp == NULL) && (a->type == MAP_PRIVATE) && 489 !(a->flags & MAP_NORESERVE) && (seg->s_as != &kas)) { 490 lgrp_mem_policy_flags = a->lgrp_mem_policy_flags; 491 } else { 492 /* 493 * Get policy when not extending it from another segment 494 */ 495 mpolicy = lgrp_mem_policy_default(seg->s_size, a->type); 496 } 497 498 /* 499 * First, try to concatenate the previous and new segments 500 */ 501 pseg = AS_SEGPREV(seg->s_as, seg); 502 if (pseg != NULL && 503 pseg->s_base + pseg->s_size == seg->s_base && 504 pseg->s_ops == &segvn_ops) { 505 /* 506 * Get memory allocation policy from previous segment. 507 * When extension is specified (e.g. for heap) apply 508 * this policy to the new segment regardless of the 509 * outcome of segment concatenation. Extension occurs 510 * for non-default policy otherwise default policy is 511 * used and is based on extended segment size. 512 */ 513 psvd = (struct segvn_data *)pseg->s_data; 514 ppolicy = psvd->policy_info.mem_policy; 515 if (lgrp_mem_policy_flags == 516 LGRP_MP_FLAG_EXTEND_UP) { 517 if (ppolicy != lgrp_mem_default_policy) { 518 mpolicy = ppolicy; 519 } else { 520 mpolicy = lgrp_mem_policy_default( 521 pseg->s_size + seg->s_size, 522 a->type); 523 } 524 } 525 526 if (mpolicy == ppolicy && 527 (pseg->s_size + seg->s_size <= 528 segvn_comb_thrshld || psvd->amp == NULL) && 529 segvn_extend_prev(pseg, seg, a, swresv) == 0) { 530 /* 531 * success! now try to concatenate 532 * with following seg 533 */ 534 crfree(cred); 535 nseg = AS_SEGNEXT(pseg->s_as, pseg); 536 if (nseg != NULL && 537 nseg != pseg && 538 nseg->s_ops == &segvn_ops && 539 pseg->s_base + pseg->s_size == 540 nseg->s_base) 541 (void) segvn_concat(pseg, nseg, 0); 542 ASSERT(pseg->s_szc == 0 || 543 (a->szc == pseg->s_szc && 544 IS_P2ALIGNED(pseg->s_base, pgsz) && 545 IS_P2ALIGNED(pseg->s_size, pgsz))); 546 return (0); 547 } 548 } 549 550 /* 551 * Failed, so try to concatenate with following seg 552 */ 553 nseg = AS_SEGNEXT(seg->s_as, seg); 554 if (nseg != NULL && 555 seg->s_base + seg->s_size == nseg->s_base && 556 nseg->s_ops == &segvn_ops) { 557 /* 558 * Get memory allocation policy from next segment. 559 * When extension is specified (e.g. for stack) apply 560 * this policy to the new segment regardless of the 561 * outcome of segment concatenation. Extension occurs 562 * for non-default policy otherwise default policy is 563 * used and is based on extended segment size. 
564 */ 565 nsvd = (struct segvn_data *)nseg->s_data; 566 npolicy = nsvd->policy_info.mem_policy; 567 if (lgrp_mem_policy_flags == 568 LGRP_MP_FLAG_EXTEND_DOWN) { 569 if (npolicy != lgrp_mem_default_policy) { 570 mpolicy = npolicy; 571 } else { 572 mpolicy = lgrp_mem_policy_default( 573 nseg->s_size + seg->s_size, 574 a->type); 575 } 576 } 577 578 if (mpolicy == npolicy && 579 segvn_extend_next(seg, nseg, a, swresv) == 0) { 580 crfree(cred); 581 ASSERT(nseg->s_szc == 0 || 582 (a->szc == nseg->s_szc && 583 IS_P2ALIGNED(nseg->s_base, pgsz) && 584 IS_P2ALIGNED(nseg->s_size, pgsz))); 585 return (0); 586 } 587 } 588 } 589 590 if (a->vp != NULL) { 591 VN_HOLD(a->vp); 592 if (a->type == MAP_SHARED) 593 lgrp_shm_policy_init(NULL, a->vp); 594 } 595 svd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 596 597 seg->s_ops = &segvn_ops; 598 seg->s_data = (void *)svd; 599 seg->s_szc = a->szc; 600 601 svd->vp = a->vp; 602 /* 603 * Anonymous mappings have no backing file so the offset is meaningless. 604 */ 605 svd->offset = a->vp ? (a->offset & PAGEMASK) : 0; 606 svd->prot = a->prot; 607 svd->maxprot = a->maxprot; 608 svd->pageprot = 0; 609 svd->type = a->type; 610 svd->vpage = NULL; 611 svd->cred = cred; 612 svd->advice = MADV_NORMAL; 613 svd->pageadvice = 0; 614 svd->flags = (ushort_t)a->flags; 615 svd->softlockcnt = 0; 616 if (a->szc != 0 && a->vp != NULL) { 617 segvn_setvnode_mpss(a->vp); 618 } 619 620 amp = a->amp; 621 if ((svd->amp = amp) == NULL) { 622 svd->anon_index = 0; 623 if (svd->type == MAP_SHARED) { 624 svd->swresv = 0; 625 /* 626 * Shared mappings to a vp need no other setup. 627 * If we have a shared mapping to an anon_map object 628 * which hasn't been allocated yet, allocate the 629 * struct now so that it will be properly shared 630 * by remembering the swap reservation there. 631 */ 632 if (a->vp == NULL) { 633 svd->amp = anonmap_alloc(seg->s_size, swresv); 634 svd->amp->a_szc = seg->s_szc; 635 } 636 } else { 637 /* 638 * Private mapping (with or without a vp). 639 * Allocate anon_map when needed. 640 */ 641 svd->swresv = swresv; 642 } 643 } else { 644 pgcnt_t anon_num; 645 646 /* 647 * Mapping to an existing anon_map structure without a vp. 648 * For now we will insure that the segment size isn't larger 649 * than the size - offset gives us. Later on we may wish to 650 * have the anon array dynamically allocated itself so that 651 * we don't always have to allocate all the anon pointer slots. 652 * This of course involves adding extra code to check that we 653 * aren't trying to use an anon pointer slot beyond the end 654 * of the currently allocated anon array. 655 */ 656 if ((amp->size - a->offset) < seg->s_size) { 657 panic("segvn_create anon_map size"); 658 /*NOTREACHED*/ 659 } 660 661 anon_num = btopr(a->offset); 662 663 if (a->type == MAP_SHARED) { 664 /* 665 * SHARED mapping to a given anon_map. 666 */ 667 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 668 amp->refcnt++; 669 if (a->szc > amp->a_szc) { 670 amp->a_szc = a->szc; 671 } 672 ANON_LOCK_EXIT(&->a_rwlock); 673 svd->anon_index = anon_num; 674 svd->swresv = 0; 675 } else { 676 /* 677 * PRIVATE mapping to a given anon_map. 678 * Make sure that all the needed anon 679 * structures are created (so that we will 680 * share the underlying pages if nothing 681 * is written by this mapping) and then 682 * duplicate the anon array as is done 683 * when a privately mapped segment is dup'ed. 
684 */ 685 struct anon *ap; 686 caddr_t addr; 687 caddr_t eaddr; 688 ulong_t anon_idx; 689 int hat_flag = HAT_LOAD; 690 691 if (svd->flags & MAP_TEXT) { 692 hat_flag |= HAT_LOAD_TEXT; 693 } 694 695 svd->amp = anonmap_alloc(seg->s_size, 0); 696 svd->amp->a_szc = seg->s_szc; 697 svd->anon_index = 0; 698 svd->swresv = swresv; 699 700 /* 701 * Prevent 2 threads from allocating anon 702 * slots simultaneously. 703 */ 704 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 705 eaddr = seg->s_base + seg->s_size; 706 707 for (anon_idx = anon_num, addr = seg->s_base; 708 addr < eaddr; addr += PAGESIZE, anon_idx++) { 709 page_t *pp; 710 711 if ((ap = anon_get_ptr(amp->ahp, 712 anon_idx)) != NULL) 713 continue; 714 715 /* 716 * Allocate the anon struct now. 717 * Might as well load up translation 718 * to the page while we're at it... 719 */ 720 pp = anon_zero(seg, addr, &ap, cred); 721 if (ap == NULL || pp == NULL) { 722 panic("segvn_create anon_zero"); 723 /*NOTREACHED*/ 724 } 725 726 /* 727 * Re-acquire the anon_map lock and 728 * initialize the anon array entry. 729 */ 730 ASSERT(anon_get_ptr(amp->ahp, 731 anon_idx) == NULL); 732 (void) anon_set_ptr(amp->ahp, anon_idx, ap, 733 ANON_SLEEP); 734 735 ASSERT(seg->s_szc == 0); 736 ASSERT(!IS_VMODSORT(pp->p_vnode)); 737 738 hat_memload(seg->s_as->a_hat, addr, pp, 739 svd->prot & ~PROT_WRITE, hat_flag); 740 741 page_unlock(pp); 742 } 743 ASSERT(seg->s_szc == 0); 744 anon_dup(amp->ahp, anon_num, svd->amp->ahp, 745 0, seg->s_size); 746 ANON_LOCK_EXIT(&->a_rwlock); 747 } 748 } 749 750 /* 751 * Set default memory allocation policy for segment 752 * 753 * Always set policy for private memory at least for initialization 754 * even if this is a shared memory segment 755 */ 756 (void) lgrp_privm_policy_set(mpolicy, &svd->policy_info, seg->s_size); 757 758 if (svd->type == MAP_SHARED) 759 (void) lgrp_shm_policy_set(mpolicy, svd->amp, svd->anon_index, 760 svd->vp, svd->offset, seg->s_size); 761 762 return (0); 763 } 764 765 /* 766 * Concatenate two existing segments, if possible. 767 * Return 0 on success, -1 if two segments are not compatible 768 * or -2 on memory allocation failure. 769 * If amp_cat == 1 then try and concat segments with anon maps 770 */ 771 static int 772 segvn_concat(struct seg *seg1, struct seg *seg2, int amp_cat) 773 { 774 struct segvn_data *svd1 = seg1->s_data; 775 struct segvn_data *svd2 = seg2->s_data; 776 struct anon_map *amp1 = svd1->amp; 777 struct anon_map *amp2 = svd2->amp; 778 struct vpage *vpage1 = svd1->vpage; 779 struct vpage *vpage2 = svd2->vpage, *nvpage = NULL; 780 size_t size, nvpsize; 781 pgcnt_t npages1, npages2; 782 783 ASSERT(seg1->s_as && seg2->s_as && seg1->s_as == seg2->s_as); 784 ASSERT(AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock)); 785 ASSERT(seg1->s_ops == seg2->s_ops); 786 787 /* both segments exist, try to merge them */ 788 #define incompat(x) (svd1->x != svd2->x) 789 if (incompat(vp) || incompat(maxprot) || 790 (!svd1->pageadvice && !svd2->pageadvice && incompat(advice)) || 791 (!svd1->pageprot && !svd2->pageprot && incompat(prot)) || 792 incompat(type) || incompat(cred) || incompat(flags) || 793 seg1->s_szc != seg2->s_szc || incompat(policy_info.mem_policy) || 794 (svd2->softlockcnt > 0)) 795 return (-1); 796 #undef incompat 797 798 /* 799 * vp == NULL implies zfod, offset doesn't matter 800 */ 801 if (svd1->vp != NULL && 802 svd1->offset + seg1->s_size != svd2->offset) { 803 return (-1); 804 } 805 806 /* 807 * Fail early if we're not supposed to concatenate 808 * segments with non NULL amp. 
809 */ 810 if (amp_cat == 0 && (amp1 != NULL || amp2 != NULL)) { 811 return (-1); 812 } 813 814 if (svd1->vp == NULL && svd1->type == MAP_SHARED) { 815 if (amp1 != amp2) { 816 return (-1); 817 } 818 if (amp1 != NULL && svd1->anon_index + btop(seg1->s_size) != 819 svd2->anon_index) { 820 return (-1); 821 } 822 ASSERT(amp1 == NULL || amp1->refcnt >= 2); 823 } 824 825 /* 826 * If either seg has vpages, create a new merged vpage array. 827 */ 828 if (vpage1 != NULL || vpage2 != NULL) { 829 struct vpage *vp; 830 831 npages1 = seg_pages(seg1); 832 npages2 = seg_pages(seg2); 833 nvpsize = vpgtob(npages1 + npages2); 834 835 if ((nvpage = kmem_zalloc(nvpsize, KM_NOSLEEP)) == NULL) { 836 return (-2); 837 } 838 if (vpage1 != NULL) { 839 bcopy(vpage1, nvpage, vpgtob(npages1)); 840 } 841 if (vpage2 != NULL) { 842 bcopy(vpage2, nvpage + npages1, vpgtob(npages2)); 843 } 844 for (vp = nvpage; vp < nvpage + npages1; vp++) { 845 if (svd2->pageprot && !svd1->pageprot) { 846 VPP_SETPROT(vp, svd1->prot); 847 } 848 if (svd2->pageadvice && !svd1->pageadvice) { 849 VPP_SETADVICE(vp, svd1->advice); 850 } 851 } 852 for (vp = nvpage + npages1; 853 vp < nvpage + npages1 + npages2; vp++) { 854 if (svd1->pageprot && !svd2->pageprot) { 855 VPP_SETPROT(vp, svd2->prot); 856 } 857 if (svd1->pageadvice && !svd2->pageadvice) { 858 VPP_SETADVICE(vp, svd2->advice); 859 } 860 } 861 } 862 863 /* 864 * If either segment has private pages, create a new merged anon 865 * array. If mergeing shared anon segments just decrement anon map's 866 * refcnt. 867 */ 868 if (amp1 != NULL && svd1->type == MAP_SHARED) { 869 ASSERT(amp1 == amp2 && svd1->vp == NULL); 870 ANON_LOCK_ENTER(&1->a_rwlock, RW_WRITER); 871 ASSERT(amp1->refcnt >= 2); 872 amp1->refcnt--; 873 ANON_LOCK_EXIT(&1->a_rwlock); 874 svd2->amp = NULL; 875 } else if (amp1 != NULL || amp2 != NULL) { 876 struct anon_hdr *nahp; 877 struct anon_map *namp = NULL; 878 size_t asize; 879 880 ASSERT(svd1->type == MAP_PRIVATE); 881 882 asize = seg1->s_size + seg2->s_size; 883 if ((nahp = anon_create(btop(asize), ANON_NOSLEEP)) == NULL) { 884 if (nvpage != NULL) { 885 kmem_free(nvpage, nvpsize); 886 } 887 return (-2); 888 } 889 if (amp1 != NULL) { 890 /* 891 * XXX anon rwlock is not really needed because 892 * this is a private segment and we are writers. 
893 */ 894 ANON_LOCK_ENTER(&1->a_rwlock, RW_WRITER); 895 ASSERT(amp1->refcnt == 1); 896 if (anon_copy_ptr(amp1->ahp, svd1->anon_index, 897 nahp, 0, btop(seg1->s_size), ANON_NOSLEEP)) { 898 anon_release(nahp, btop(asize)); 899 ANON_LOCK_EXIT(&1->a_rwlock); 900 if (nvpage != NULL) { 901 kmem_free(nvpage, nvpsize); 902 } 903 return (-2); 904 } 905 } 906 if (amp2 != NULL) { 907 ANON_LOCK_ENTER(&2->a_rwlock, RW_WRITER); 908 ASSERT(amp2->refcnt == 1); 909 if (anon_copy_ptr(amp2->ahp, svd2->anon_index, 910 nahp, btop(seg1->s_size), btop(seg2->s_size), 911 ANON_NOSLEEP)) { 912 anon_release(nahp, btop(asize)); 913 ANON_LOCK_EXIT(&2->a_rwlock); 914 if (amp1 != NULL) { 915 ANON_LOCK_EXIT(&1->a_rwlock); 916 } 917 if (nvpage != NULL) { 918 kmem_free(nvpage, nvpsize); 919 } 920 return (-2); 921 } 922 } 923 if (amp1 != NULL) { 924 namp = amp1; 925 anon_release(amp1->ahp, btop(amp1->size)); 926 } 927 if (amp2 != NULL) { 928 if (namp == NULL) { 929 ASSERT(amp1 == NULL); 930 namp = amp2; 931 anon_release(amp2->ahp, btop(amp2->size)); 932 } else { 933 amp2->refcnt--; 934 ANON_LOCK_EXIT(&2->a_rwlock); 935 anonmap_free(amp2); 936 } 937 svd2->amp = NULL; /* needed for seg_free */ 938 } 939 namp->ahp = nahp; 940 namp->size = asize; 941 svd1->amp = namp; 942 svd1->anon_index = 0; 943 ANON_LOCK_EXIT(&namp->a_rwlock); 944 } 945 /* 946 * Now free the old vpage structures. 947 */ 948 if (nvpage != NULL) { 949 if (vpage1 != NULL) { 950 kmem_free(vpage1, vpgtob(npages1)); 951 } 952 if (vpage2 != NULL) { 953 svd2->vpage = NULL; 954 kmem_free(vpage2, vpgtob(npages2)); 955 } 956 if (svd2->pageprot) { 957 svd1->pageprot = 1; 958 } 959 if (svd2->pageadvice) { 960 svd1->pageadvice = 1; 961 } 962 svd1->vpage = nvpage; 963 } 964 965 /* all looks ok, merge segments */ 966 svd1->swresv += svd2->swresv; 967 svd2->swresv = 0; /* so seg_free doesn't release swap space */ 968 size = seg2->s_size; 969 seg_free(seg2); 970 seg1->s_size += size; 971 return (0); 972 } 973 974 /* 975 * Extend the previous segment (seg1) to include the 976 * new segment (seg2 + a), if possible. 977 * Return 0 on success. 978 */ 979 static int 980 segvn_extend_prev(seg1, seg2, a, swresv) 981 struct seg *seg1, *seg2; 982 struct segvn_crargs *a; 983 size_t swresv; 984 { 985 struct segvn_data *svd1 = (struct segvn_data *)seg1->s_data; 986 size_t size; 987 struct anon_map *amp1; 988 struct vpage *new_vpage; 989 990 /* 991 * We don't need any segment level locks for "segvn" data 992 * since the address space is "write" locked. 993 */ 994 ASSERT(seg1->s_as && AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock)); 995 996 /* second segment is new, try to extend first */ 997 /* XXX - should also check cred */ 998 if (svd1->vp != a->vp || svd1->maxprot != a->maxprot || 999 (!svd1->pageprot && (svd1->prot != a->prot)) || 1000 svd1->type != a->type || svd1->flags != a->flags || 1001 seg1->s_szc != a->szc) 1002 return (-1); 1003 1004 /* vp == NULL implies zfod, offset doesn't matter */ 1005 if (svd1->vp != NULL && 1006 svd1->offset + seg1->s_size != (a->offset & PAGEMASK)) 1007 return (-1); 1008 1009 amp1 = svd1->amp; 1010 if (amp1) { 1011 pgcnt_t newpgs; 1012 1013 /* 1014 * Segment has private pages, can data structures 1015 * be expanded? 1016 * 1017 * Acquire the anon_map lock to prevent it from changing, 1018 * if it is shared. This ensures that the anon_map 1019 * will not change while a thread which has a read/write 1020 * lock on an address space references it. 1021 * XXX - Don't need the anon_map lock at all if "refcnt" 1022 * is 1. 
1023 * 1024 * Can't grow a MAP_SHARED segment with an anonmap because 1025 * there may be existing anon slots where we want to extend 1026 * the segment and we wouldn't know what to do with them 1027 * (e.g., for tmpfs right thing is to just leave them there, 1028 * for /dev/zero they should be cleared out). 1029 */ 1030 if (svd1->type == MAP_SHARED) 1031 return (-1); 1032 1033 ANON_LOCK_ENTER(&1->a_rwlock, RW_WRITER); 1034 if (amp1->refcnt > 1) { 1035 ANON_LOCK_EXIT(&1->a_rwlock); 1036 return (-1); 1037 } 1038 newpgs = anon_grow(amp1->ahp, &svd1->anon_index, 1039 btop(seg1->s_size), btop(seg2->s_size), ANON_NOSLEEP); 1040 1041 if (newpgs == 0) { 1042 ANON_LOCK_EXIT(&1->a_rwlock); 1043 return (-1); 1044 } 1045 amp1->size = ptob(newpgs); 1046 ANON_LOCK_EXIT(&1->a_rwlock); 1047 } 1048 if (svd1->vpage != NULL) { 1049 new_vpage = 1050 kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)), 1051 KM_NOSLEEP); 1052 if (new_vpage == NULL) 1053 return (-1); 1054 bcopy(svd1->vpage, new_vpage, vpgtob(seg_pages(seg1))); 1055 kmem_free(svd1->vpage, vpgtob(seg_pages(seg1))); 1056 svd1->vpage = new_vpage; 1057 if (svd1->pageprot) { 1058 struct vpage *vp, *evp; 1059 1060 vp = new_vpage + seg_pages(seg1); 1061 evp = vp + seg_pages(seg2); 1062 for (; vp < evp; vp++) 1063 VPP_SETPROT(vp, a->prot); 1064 } 1065 } 1066 size = seg2->s_size; 1067 seg_free(seg2); 1068 seg1->s_size += size; 1069 svd1->swresv += swresv; 1070 return (0); 1071 } 1072 1073 /* 1074 * Extend the next segment (seg2) to include the 1075 * new segment (seg1 + a), if possible. 1076 * Return 0 on success. 1077 */ 1078 static int 1079 segvn_extend_next( 1080 struct seg *seg1, 1081 struct seg *seg2, 1082 struct segvn_crargs *a, 1083 size_t swresv) 1084 { 1085 struct segvn_data *svd2 = (struct segvn_data *)seg2->s_data; 1086 size_t size; 1087 struct anon_map *amp2; 1088 struct vpage *new_vpage; 1089 1090 /* 1091 * We don't need any segment level locks for "segvn" data 1092 * since the address space is "write" locked. 1093 */ 1094 ASSERT(seg2->s_as && AS_WRITE_HELD(seg2->s_as, &seg2->s_as->a_lock)); 1095 1096 /* first segment is new, try to extend second */ 1097 /* XXX - should also check cred */ 1098 if (svd2->vp != a->vp || svd2->maxprot != a->maxprot || 1099 (!svd2->pageprot && (svd2->prot != a->prot)) || 1100 svd2->type != a->type || svd2->flags != a->flags || 1101 seg2->s_szc != a->szc) 1102 return (-1); 1103 /* vp == NULL implies zfod, offset doesn't matter */ 1104 if (svd2->vp != NULL && 1105 (a->offset & PAGEMASK) + seg1->s_size != svd2->offset) 1106 return (-1); 1107 1108 amp2 = svd2->amp; 1109 if (amp2) { 1110 pgcnt_t newpgs; 1111 1112 /* 1113 * Segment has private pages, can data structures 1114 * be expanded? 1115 * 1116 * Acquire the anon_map lock to prevent it from changing, 1117 * if it is shared. This ensures that the anon_map 1118 * will not change while a thread which has a read/write 1119 * lock on an address space references it. 1120 * 1121 * XXX - Don't need the anon_map lock at all if "refcnt" 1122 * is 1. 
1123 */ 1124 if (svd2->type == MAP_SHARED) 1125 return (-1); 1126 1127 ANON_LOCK_ENTER(&2->a_rwlock, RW_WRITER); 1128 if (amp2->refcnt > 1) { 1129 ANON_LOCK_EXIT(&2->a_rwlock); 1130 return (-1); 1131 } 1132 newpgs = anon_grow(amp2->ahp, &svd2->anon_index, 1133 btop(seg2->s_size), btop(seg1->s_size), 1134 ANON_NOSLEEP | ANON_GROWDOWN); 1135 1136 if (newpgs == 0) { 1137 ANON_LOCK_EXIT(&2->a_rwlock); 1138 return (-1); 1139 } 1140 amp2->size = ptob(newpgs); 1141 ANON_LOCK_EXIT(&2->a_rwlock); 1142 } 1143 if (svd2->vpage != NULL) { 1144 new_vpage = 1145 kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)), 1146 KM_NOSLEEP); 1147 if (new_vpage == NULL) { 1148 /* Not merging segments so adjust anon_index back */ 1149 if (amp2) 1150 svd2->anon_index += seg_pages(seg1); 1151 return (-1); 1152 } 1153 bcopy(svd2->vpage, new_vpage + seg_pages(seg1), 1154 vpgtob(seg_pages(seg2))); 1155 kmem_free(svd2->vpage, vpgtob(seg_pages(seg2))); 1156 svd2->vpage = new_vpage; 1157 if (svd2->pageprot) { 1158 struct vpage *vp, *evp; 1159 1160 vp = new_vpage; 1161 evp = vp + seg_pages(seg1); 1162 for (; vp < evp; vp++) 1163 VPP_SETPROT(vp, a->prot); 1164 } 1165 } 1166 size = seg1->s_size; 1167 seg_free(seg1); 1168 seg2->s_size += size; 1169 seg2->s_base -= size; 1170 svd2->offset -= size; 1171 svd2->swresv += swresv; 1172 return (0); 1173 } 1174 1175 static int 1176 segvn_dup(struct seg *seg, struct seg *newseg) 1177 { 1178 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1179 struct segvn_data *newsvd; 1180 pgcnt_t npages = seg_pages(seg); 1181 int error = 0; 1182 uint_t prot; 1183 size_t len; 1184 1185 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 1186 1187 /* 1188 * If segment has anon reserved, reserve more for the new seg. 1189 * For a MAP_NORESERVE segment swresv will be a count of all the 1190 * allocated anon slots; thus we reserve for the child as many slots 1191 * as the parent has allocated. This semantic prevents the child or 1192 * parent from dieing during a copy-on-write fault caused by trying 1193 * to write a shared pre-existing anon page. 1194 */ 1195 if ((len = svd->swresv) != 0) { 1196 if (anon_resv(svd->swresv) == 0) 1197 return (ENOMEM); 1198 1199 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 1200 seg, len, 0); 1201 } 1202 1203 newsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 1204 1205 newseg->s_ops = &segvn_ops; 1206 newseg->s_data = (void *)newsvd; 1207 newseg->s_szc = seg->s_szc; 1208 1209 if ((newsvd->vp = svd->vp) != NULL) { 1210 VN_HOLD(svd->vp); 1211 if (svd->type == MAP_SHARED) 1212 lgrp_shm_policy_init(NULL, svd->vp); 1213 } 1214 newsvd->offset = svd->offset; 1215 newsvd->prot = svd->prot; 1216 newsvd->maxprot = svd->maxprot; 1217 newsvd->pageprot = svd->pageprot; 1218 newsvd->type = svd->type; 1219 newsvd->cred = svd->cred; 1220 crhold(newsvd->cred); 1221 newsvd->advice = svd->advice; 1222 newsvd->pageadvice = svd->pageadvice; 1223 newsvd->swresv = svd->swresv; 1224 newsvd->flags = svd->flags; 1225 newsvd->softlockcnt = 0; 1226 newsvd->policy_info = svd->policy_info; 1227 if ((newsvd->amp = svd->amp) == NULL) { 1228 /* 1229 * Not attaching to a shared anon object. 1230 */ 1231 newsvd->anon_index = 0; 1232 } else { 1233 struct anon_map *amp; 1234 1235 amp = svd->amp; 1236 if (svd->type == MAP_SHARED) { 1237 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1238 amp->refcnt++; 1239 ANON_LOCK_EXIT(&->a_rwlock); 1240 newsvd->anon_index = svd->anon_index; 1241 } else { 1242 int reclaim = 1; 1243 1244 /* 1245 * Allocate and initialize new anon_map structure. 
1246 */ 1247 newsvd->amp = anonmap_alloc(newseg->s_size, 0); 1248 newsvd->amp->a_szc = newseg->s_szc; 1249 newsvd->anon_index = 0; 1250 1251 /* 1252 * We don't have to acquire the anon_map lock 1253 * for the new segment (since it belongs to an 1254 * address space that is still not associated 1255 * with any process), or the segment in the old 1256 * address space (since all threads in it 1257 * are stopped while duplicating the address space). 1258 */ 1259 1260 /* 1261 * The goal of the following code is to make sure that 1262 * softlocked pages do not end up as copy on write 1263 * pages. This would cause problems where one 1264 * thread writes to a page that is COW and a different 1265 * thread in the same process has softlocked it. The 1266 * softlock lock would move away from this process 1267 * because the write would cause this process to get 1268 * a copy (without the softlock). 1269 * 1270 * The strategy here is to just break the 1271 * sharing on pages that could possibly be 1272 * softlocked. 1273 */ 1274 retry: 1275 if (svd->softlockcnt) { 1276 struct anon *ap, *newap; 1277 size_t i; 1278 uint_t vpprot; 1279 page_t *anon_pl[1+1], *pp; 1280 caddr_t addr; 1281 ulong_t anon_idx = 0; 1282 1283 /* 1284 * The softlock count might be non zero 1285 * because some pages are still stuck in the 1286 * cache for lazy reclaim. Flush the cache 1287 * now. This should drop the count to zero. 1288 * [or there is really I/O going on to these 1289 * pages]. Note, we have the writers lock so 1290 * nothing gets inserted during the flush. 1291 */ 1292 if (reclaim == 1) { 1293 segvn_purge(seg); 1294 reclaim = 0; 1295 goto retry; 1296 } 1297 i = btopr(seg->s_size); 1298 addr = seg->s_base; 1299 /* 1300 * XXX break cow sharing using PAGESIZE 1301 * pages. They will be relocated into larger 1302 * pages at fault time. 1303 */ 1304 while (i-- > 0) { 1305 if (ap = anon_get_ptr(amp->ahp, 1306 anon_idx)) { 1307 error = anon_getpage(&ap, 1308 &vpprot, anon_pl, PAGESIZE, 1309 seg, addr, S_READ, 1310 svd->cred); 1311 if (error) { 1312 newsvd->vpage = NULL; 1313 goto out; 1314 } 1315 /* 1316 * prot need not be computed 1317 * below 'cause anon_private is 1318 * going to ignore it anyway 1319 * as child doesn't inherit 1320 * pagelock from parent. 1321 */ 1322 prot = svd->pageprot ? 1323 VPP_PROT( 1324 &svd->vpage[ 1325 seg_page(seg, addr)]) 1326 : svd->prot; 1327 pp = anon_private(&newap, 1328 newseg, addr, prot, 1329 anon_pl[0], 0, 1330 newsvd->cred); 1331 if (pp == NULL) { 1332 /* no mem abort */ 1333 newsvd->vpage = NULL; 1334 error = ENOMEM; 1335 goto out; 1336 } 1337 (void) anon_set_ptr( 1338 newsvd->amp->ahp, anon_idx, 1339 newap, ANON_SLEEP); 1340 page_unlock(pp); 1341 } 1342 addr += PAGESIZE; 1343 anon_idx++; 1344 } 1345 } else { /* common case */ 1346 if (seg->s_szc != 0) { 1347 /* 1348 * If at least one of anon slots of a 1349 * large page exists then make sure 1350 * all anon slots of a large page 1351 * exist to avoid partial cow sharing 1352 * of a large page in the future. 1353 */ 1354 anon_dup_fill_holes(amp->ahp, 1355 svd->anon_index, newsvd->amp->ahp, 1356 0, seg->s_size, seg->s_szc, 1357 svd->vp != NULL); 1358 } else { 1359 anon_dup(amp->ahp, svd->anon_index, 1360 newsvd->amp->ahp, 0, seg->s_size); 1361 } 1362 1363 hat_clrattr(seg->s_as->a_hat, seg->s_base, 1364 seg->s_size, PROT_WRITE); 1365 } 1366 } 1367 } 1368 /* 1369 * If necessary, create a vpage structure for the new segment. 1370 * Do not copy any page lock indications. 
1371 */ 1372 if (svd->vpage != NULL) { 1373 uint_t i; 1374 struct vpage *ovp = svd->vpage; 1375 struct vpage *nvp; 1376 1377 nvp = newsvd->vpage = 1378 kmem_alloc(vpgtob(npages), KM_SLEEP); 1379 for (i = 0; i < npages; i++) { 1380 *nvp = *ovp++; 1381 VPP_CLRPPLOCK(nvp++); 1382 } 1383 } else 1384 newsvd->vpage = NULL; 1385 1386 /* Inform the vnode of the new mapping */ 1387 if (newsvd->vp != NULL) { 1388 error = VOP_ADDMAP(newsvd->vp, (offset_t)newsvd->offset, 1389 newseg->s_as, newseg->s_base, newseg->s_size, newsvd->prot, 1390 newsvd->maxprot, newsvd->type, newsvd->cred); 1391 } 1392 out: 1393 return (error); 1394 } 1395 1396 1397 /* 1398 * callback function used by segvn_unmap to invoke free_vp_pages() for only 1399 * those pages actually processed by the HAT 1400 */ 1401 extern int free_pages; 1402 1403 static void 1404 segvn_hat_unload_callback(hat_callback_t *cb) 1405 { 1406 struct seg *seg = cb->hcb_data; 1407 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1408 size_t len; 1409 u_offset_t off; 1410 1411 ASSERT(svd->vp != NULL); 1412 ASSERT(cb->hcb_end_addr > cb->hcb_start_addr); 1413 ASSERT(cb->hcb_start_addr >= seg->s_base); 1414 1415 len = cb->hcb_end_addr - cb->hcb_start_addr; 1416 off = cb->hcb_start_addr - seg->s_base; 1417 free_vp_pages(svd->vp, svd->offset + off, len); 1418 } 1419 1420 1421 static int 1422 segvn_unmap(struct seg *seg, caddr_t addr, size_t len) 1423 { 1424 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1425 struct segvn_data *nsvd; 1426 struct seg *nseg; 1427 struct anon_map *amp; 1428 pgcnt_t opages; /* old segment size in pages */ 1429 pgcnt_t npages; /* new segment size in pages */ 1430 pgcnt_t dpages; /* pages being deleted (unmapped) */ 1431 hat_callback_t callback; /* used for free_vp_pages() */ 1432 hat_callback_t *cbp = NULL; 1433 caddr_t nbase; 1434 size_t nsize; 1435 size_t oswresv; 1436 int reclaim = 1; 1437 1438 /* 1439 * We don't need any segment level locks for "segvn" data 1440 * since the address space is "write" locked. 1441 */ 1442 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 1443 1444 /* 1445 * Fail the unmap if pages are SOFTLOCKed through this mapping. 1446 * softlockcnt is protected from change by the as write lock. 1447 */ 1448 retry: 1449 if (svd->softlockcnt > 0) { 1450 /* 1451 * since we do have the writers lock nobody can fill 1452 * the cache during the purge. The flush either succeeds 1453 * or we still have pending I/Os. 1454 */ 1455 if (reclaim == 1) { 1456 segvn_purge(seg); 1457 reclaim = 0; 1458 goto retry; 1459 } 1460 return (EAGAIN); 1461 } 1462 1463 /* 1464 * Check for bad sizes 1465 */ 1466 if (addr < seg->s_base || addr + len > seg->s_base + seg->s_size || 1467 (len & PAGEOFFSET) || ((uintptr_t)addr & PAGEOFFSET)) { 1468 panic("segvn_unmap"); 1469 /*NOTREACHED*/ 1470 } 1471 1472 if (seg->s_szc != 0) { 1473 size_t pgsz = page_get_pagesize(seg->s_szc); 1474 int err; 1475 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { 1476 ASSERT(seg->s_base != addr || seg->s_size != len); 1477 VM_STAT_ADD(segvnvmstats.demoterange[0]); 1478 err = segvn_demote_range(seg, addr, len, SDR_END, 0); 1479 if (err == 0) { 1480 return (IE_RETRY); 1481 } 1482 return (err); 1483 } 1484 } 1485 1486 /* Inform the vnode of the unmapping. 
*/ 1487 if (svd->vp) { 1488 int error; 1489 1490 error = VOP_DELMAP(svd->vp, 1491 (offset_t)svd->offset + (uintptr_t)(addr - seg->s_base), 1492 seg->s_as, addr, len, svd->prot, svd->maxprot, 1493 svd->type, svd->cred); 1494 1495 if (error == EAGAIN) 1496 return (error); 1497 } 1498 /* 1499 * Remove any page locks set through this mapping. 1500 */ 1501 (void) segvn_lockop(seg, addr, len, 0, MC_UNLOCK, NULL, 0); 1502 1503 /* 1504 * Unload any hardware translations in the range to be taken out. 1505 * Use a callback to invoke free_vp_pages() effectively. 1506 */ 1507 if (svd->vp != NULL && free_pages != 0) { 1508 callback.hcb_data = seg; 1509 callback.hcb_function = segvn_hat_unload_callback; 1510 cbp = &callback; 1511 } 1512 hat_unload_callback(seg->s_as->a_hat, addr, len, HAT_UNLOAD_UNMAP, cbp); 1513 1514 /* 1515 * Check for entire segment 1516 */ 1517 if (addr == seg->s_base && len == seg->s_size) { 1518 seg_free(seg); 1519 return (0); 1520 } 1521 1522 opages = seg_pages(seg); 1523 dpages = btop(len); 1524 npages = opages - dpages; 1525 amp = svd->amp; 1526 ASSERT(amp == NULL || amp->a_szc >= seg->s_szc); 1527 1528 /* 1529 * Check for beginning of segment 1530 */ 1531 if (addr == seg->s_base) { 1532 if (svd->vpage != NULL) { 1533 size_t nbytes; 1534 struct vpage *ovpage; 1535 1536 ovpage = svd->vpage; /* keep pointer to vpage */ 1537 1538 nbytes = vpgtob(npages); 1539 svd->vpage = kmem_alloc(nbytes, KM_SLEEP); 1540 bcopy(&ovpage[dpages], svd->vpage, nbytes); 1541 1542 /* free up old vpage */ 1543 kmem_free(ovpage, vpgtob(opages)); 1544 } 1545 if (amp != NULL) { 1546 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1547 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { 1548 /* 1549 * Free up now unused parts of anon_map array. 1550 */ 1551 if (amp->a_szc == seg->s_szc) { 1552 if (seg->s_szc != 0) { 1553 anon_free_pages(amp->ahp, 1554 svd->anon_index, len, 1555 seg->s_szc); 1556 } else { 1557 anon_free(amp->ahp, 1558 svd->anon_index, 1559 len); 1560 } 1561 } else { 1562 ASSERT(svd->type == MAP_SHARED); 1563 ASSERT(amp->a_szc > seg->s_szc); 1564 anon_shmap_free_pages(amp, 1565 svd->anon_index, len); 1566 } 1567 1568 /* 1569 * Unreserve swap space for the 1570 * unmapped chunk of this segment in 1571 * case it's MAP_SHARED 1572 */ 1573 if (svd->type == MAP_SHARED) { 1574 anon_unresv(len); 1575 amp->swresv -= len; 1576 } 1577 } 1578 ANON_LOCK_EXIT(&->a_rwlock); 1579 svd->anon_index += dpages; 1580 } 1581 if (svd->vp != NULL) 1582 svd->offset += len; 1583 1584 if (svd->swresv) { 1585 if (svd->flags & MAP_NORESERVE) { 1586 ASSERT(amp); 1587 oswresv = svd->swresv; 1588 1589 svd->swresv = ptob(anon_pages(amp->ahp, 1590 svd->anon_index, npages)); 1591 anon_unresv(oswresv - svd->swresv); 1592 } else { 1593 anon_unresv(len); 1594 svd->swresv -= len; 1595 } 1596 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 1597 seg, len, 0); 1598 } 1599 1600 seg->s_base += len; 1601 seg->s_size -= len; 1602 return (0); 1603 } 1604 1605 /* 1606 * Check for end of segment 1607 */ 1608 if (addr + len == seg->s_base + seg->s_size) { 1609 if (svd->vpage != NULL) { 1610 size_t nbytes; 1611 struct vpage *ovpage; 1612 1613 ovpage = svd->vpage; /* keep pointer to vpage */ 1614 1615 nbytes = vpgtob(npages); 1616 svd->vpage = kmem_alloc(nbytes, KM_SLEEP); 1617 bcopy(ovpage, svd->vpage, nbytes); 1618 1619 /* free up old vpage */ 1620 kmem_free(ovpage, vpgtob(opages)); 1621 1622 } 1623 if (amp != NULL) { 1624 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1625 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { 1626 /* 1627 * Free 
up now unused parts of anon_map array. 1628 */ 1629 ulong_t an_idx = svd->anon_index + npages; 1630 if (amp->a_szc == seg->s_szc) { 1631 if (seg->s_szc != 0) { 1632 anon_free_pages(amp->ahp, 1633 an_idx, len, 1634 seg->s_szc); 1635 } else { 1636 anon_free(amp->ahp, an_idx, 1637 len); 1638 } 1639 } else { 1640 ASSERT(svd->type == MAP_SHARED); 1641 ASSERT(amp->a_szc > seg->s_szc); 1642 anon_shmap_free_pages(amp, 1643 an_idx, len); 1644 } 1645 1646 /* 1647 * Unreserve swap space for the 1648 * unmapped chunk of this segment in 1649 * case it's MAP_SHARED 1650 */ 1651 if (svd->type == MAP_SHARED) { 1652 anon_unresv(len); 1653 amp->swresv -= len; 1654 } 1655 } 1656 ANON_LOCK_EXIT(&->a_rwlock); 1657 } 1658 1659 if (svd->swresv) { 1660 if (svd->flags & MAP_NORESERVE) { 1661 ASSERT(amp); 1662 oswresv = svd->swresv; 1663 svd->swresv = ptob(anon_pages(amp->ahp, 1664 svd->anon_index, npages)); 1665 anon_unresv(oswresv - svd->swresv); 1666 } else { 1667 anon_unresv(len); 1668 svd->swresv -= len; 1669 } 1670 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 1671 "anon proc:%p %lu %u", seg, len, 0); 1672 } 1673 1674 seg->s_size -= len; 1675 return (0); 1676 } 1677 1678 /* 1679 * The section to go is in the middle of the segment, 1680 * have to make it into two segments. nseg is made for 1681 * the high end while seg is cut down at the low end. 1682 */ 1683 nbase = addr + len; /* new seg base */ 1684 nsize = (seg->s_base + seg->s_size) - nbase; /* new seg size */ 1685 seg->s_size = addr - seg->s_base; /* shrink old seg */ 1686 nseg = seg_alloc(seg->s_as, nbase, nsize); 1687 if (nseg == NULL) { 1688 panic("segvn_unmap seg_alloc"); 1689 /*NOTREACHED*/ 1690 } 1691 nseg->s_ops = seg->s_ops; 1692 nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 1693 nseg->s_data = (void *)nsvd; 1694 nseg->s_szc = seg->s_szc; 1695 *nsvd = *svd; 1696 nsvd->offset = svd->offset + (uintptr_t)(nseg->s_base - seg->s_base); 1697 nsvd->swresv = 0; 1698 nsvd->softlockcnt = 0; 1699 1700 if (svd->vp != NULL) { 1701 VN_HOLD(nsvd->vp); 1702 if (nsvd->type == MAP_SHARED) 1703 lgrp_shm_policy_init(NULL, nsvd->vp); 1704 } 1705 crhold(svd->cred); 1706 1707 if (svd->vpage == NULL) { 1708 nsvd->vpage = NULL; 1709 } else { 1710 /* need to split vpage into two arrays */ 1711 size_t nbytes; 1712 struct vpage *ovpage; 1713 1714 ovpage = svd->vpage; /* keep pointer to vpage */ 1715 1716 npages = seg_pages(seg); /* seg has shrunk */ 1717 nbytes = vpgtob(npages); 1718 svd->vpage = kmem_alloc(nbytes, KM_SLEEP); 1719 1720 bcopy(ovpage, svd->vpage, nbytes); 1721 1722 npages = seg_pages(nseg); 1723 nbytes = vpgtob(npages); 1724 nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP); 1725 1726 bcopy(&ovpage[opages - npages], nsvd->vpage, nbytes); 1727 1728 /* free up old vpage */ 1729 kmem_free(ovpage, vpgtob(opages)); 1730 } 1731 1732 if (amp == NULL) { 1733 nsvd->amp = NULL; 1734 nsvd->anon_index = 0; 1735 } else { 1736 /* 1737 * Need to create a new anon map for the new segment. 1738 * We'll also allocate a new smaller array for the old 1739 * smaller segment to save space. 1740 */ 1741 opages = btop((uintptr_t)(addr - seg->s_base)); 1742 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1743 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { 1744 /* 1745 * Free up now unused parts of anon_map array. 
1746 */ 1747 ulong_t an_idx = svd->anon_index + opages; 1748 if (amp->a_szc == seg->s_szc) { 1749 if (seg->s_szc != 0) { 1750 anon_free_pages(amp->ahp, an_idx, len, 1751 seg->s_szc); 1752 } else { 1753 anon_free(amp->ahp, an_idx, 1754 len); 1755 } 1756 } else { 1757 ASSERT(svd->type == MAP_SHARED); 1758 ASSERT(amp->a_szc > seg->s_szc); 1759 anon_shmap_free_pages(amp, an_idx, len); 1760 } 1761 1762 /* 1763 * Unreserve swap space for the 1764 * unmapped chunk of this segment in 1765 * case it's MAP_SHARED 1766 */ 1767 if (svd->type == MAP_SHARED) { 1768 anon_unresv(len); 1769 amp->swresv -= len; 1770 } 1771 } 1772 nsvd->anon_index = svd->anon_index + 1773 btop((uintptr_t)(nseg->s_base - seg->s_base)); 1774 if (svd->type == MAP_SHARED) { 1775 amp->refcnt++; 1776 nsvd->amp = amp; 1777 } else { 1778 struct anon_map *namp; 1779 struct anon_hdr *nahp; 1780 1781 ASSERT(svd->type == MAP_PRIVATE); 1782 nahp = anon_create(btop(seg->s_size), ANON_SLEEP); 1783 namp = anonmap_alloc(nseg->s_size, 0); 1784 namp->a_szc = seg->s_szc; 1785 (void) anon_copy_ptr(amp->ahp, svd->anon_index, nahp, 1786 0, btop(seg->s_size), ANON_SLEEP); 1787 (void) anon_copy_ptr(amp->ahp, nsvd->anon_index, 1788 namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP); 1789 anon_release(amp->ahp, btop(amp->size)); 1790 svd->anon_index = 0; 1791 nsvd->anon_index = 0; 1792 amp->ahp = nahp; 1793 amp->size = seg->s_size; 1794 nsvd->amp = namp; 1795 } 1796 ANON_LOCK_EXIT(&->a_rwlock); 1797 } 1798 if (svd->swresv) { 1799 if (svd->flags & MAP_NORESERVE) { 1800 ASSERT(amp); 1801 oswresv = svd->swresv; 1802 svd->swresv = ptob(anon_pages(amp->ahp, 1803 svd->anon_index, btop(seg->s_size))); 1804 nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp, 1805 nsvd->anon_index, btop(nseg->s_size))); 1806 ASSERT(oswresv >= (svd->swresv + nsvd->swresv)); 1807 anon_unresv(oswresv - (svd->swresv + nsvd->swresv)); 1808 } else { 1809 if (seg->s_size + nseg->s_size + len != svd->swresv) { 1810 panic("segvn_unmap: " 1811 "cannot split swap reservation"); 1812 /*NOTREACHED*/ 1813 } 1814 anon_unresv(len); 1815 svd->swresv = seg->s_size; 1816 nsvd->swresv = nseg->s_size; 1817 } 1818 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 1819 seg, len, 0); 1820 } 1821 1822 return (0); /* I'm glad that's all over with! */ 1823 } 1824 1825 static void 1826 segvn_free(struct seg *seg) 1827 { 1828 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1829 pgcnt_t npages = seg_pages(seg); 1830 struct anon_map *amp; 1831 size_t len; 1832 1833 /* 1834 * We don't need any segment level locks for "segvn" data 1835 * since the address space is "write" locked. 1836 */ 1837 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 1838 1839 /* 1840 * Be sure to unlock pages. XXX Why do things get free'ed instead 1841 * of unmapped? XXX 1842 */ 1843 (void) segvn_lockop(seg, seg->s_base, seg->s_size, 1844 0, MC_UNLOCK, NULL, 0); 1845 1846 /* 1847 * Deallocate the vpage and anon pointers if necessary and possible. 1848 */ 1849 if (svd->vpage != NULL) { 1850 kmem_free(svd->vpage, vpgtob(npages)); 1851 svd->vpage = NULL; 1852 } 1853 if ((amp = svd->amp) != NULL) { 1854 /* 1855 * If there are no more references to this anon_map 1856 * structure, then deallocate the structure after freeing 1857 * up all the anon slot pointers that we can. 
1858 */ 1859 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1860 ASSERT(amp->a_szc >= seg->s_szc); 1861 if (--amp->refcnt == 0) { 1862 if (svd->type == MAP_PRIVATE) { 1863 /* 1864 * Private - we only need to anon_free 1865 * the part that this segment refers to. 1866 */ 1867 if (seg->s_szc != 0) { 1868 anon_free_pages(amp->ahp, 1869 svd->anon_index, seg->s_size, 1870 seg->s_szc); 1871 } else { 1872 anon_free(amp->ahp, svd->anon_index, 1873 seg->s_size); 1874 } 1875 } else { 1876 /* 1877 * Shared - anon_free the entire 1878 * anon_map's worth of stuff and 1879 * release any swap reservation. 1880 */ 1881 if (amp->a_szc != 0) { 1882 anon_shmap_free_pages(amp, 0, 1883 amp->size); 1884 } else { 1885 anon_free(amp->ahp, 0, amp->size); 1886 } 1887 if ((len = amp->swresv) != 0) { 1888 anon_unresv(len); 1889 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 1890 "anon proc:%p %lu %u", 1891 seg, len, 0); 1892 } 1893 } 1894 svd->amp = NULL; 1895 ANON_LOCK_EXIT(&->a_rwlock); 1896 anonmap_free(amp); 1897 } else if (svd->type == MAP_PRIVATE) { 1898 /* 1899 * We had a private mapping which still has 1900 * a held anon_map so just free up all the 1901 * anon slot pointers that we were using. 1902 */ 1903 if (seg->s_szc != 0) { 1904 anon_free_pages(amp->ahp, svd->anon_index, 1905 seg->s_size, seg->s_szc); 1906 } else { 1907 anon_free(amp->ahp, svd->anon_index, 1908 seg->s_size); 1909 } 1910 ANON_LOCK_EXIT(&->a_rwlock); 1911 } else { 1912 ANON_LOCK_EXIT(&->a_rwlock); 1913 } 1914 } 1915 1916 /* 1917 * Release swap reservation. 1918 */ 1919 if ((len = svd->swresv) != 0) { 1920 anon_unresv(svd->swresv); 1921 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 1922 seg, len, 0); 1923 svd->swresv = 0; 1924 } 1925 /* 1926 * Release claim on vnode, credentials, and finally free the 1927 * private data. 1928 */ 1929 if (svd->vp != NULL) { 1930 if (svd->type == MAP_SHARED) 1931 lgrp_shm_policy_fini(NULL, svd->vp); 1932 VN_RELE(svd->vp); 1933 svd->vp = NULL; 1934 } 1935 crfree(svd->cred); 1936 svd->cred = NULL; 1937 1938 seg->s_data = NULL; 1939 kmem_cache_free(segvn_cache, svd); 1940 } 1941 1942 ulong_t segvn_lpglck_limit = 0; 1943 /* 1944 * Support routines used by segvn_pagelock() and softlock faults for anonymous 1945 * pages to implement availrmem accounting in a way that makes sure the 1946 * same memory is accounted just once for all softlock/pagelock purposes. 1947 * This prevents a bug when availrmem is quickly incorrectly exausted from 1948 * several pagelocks to different parts of the same large page since each 1949 * pagelock has to decrement availrmem by the size of the entire large 1950 * page. Note those pages are not COW shared until softunlock/pageunlock so 1951 * we don't need to use cow style accounting here. We also need to make sure 1952 * the entire large page is accounted even if softlock range is less than the 1953 * entire large page because large anon pages can't be demoted when any of 1954 * constituent pages is locked. The caller calls this routine for every page_t 1955 * it locks. The very first page in the range may not be the root page of a 1956 * large page. For all other pages it's guranteed we are going to visit the 1957 * root of a particular large page before any other constituent page as we are 1958 * locking sequential pages belonging to the same anon map. So we do all the 1959 * locking when the root is encountered except for the very first page. 
Since 1960 * softlocking is not supported (except S_READ_NOCOW special case) for vmpss 1961 * segments and since vnode pages can be demoted without locking all 1962 * constituent pages vnode pages don't come here. Unlocking relies on the 1963 * fact that pagesize can't change whenever any of constituent large pages is 1964 * locked at least SE_SHARED. This allows unlocking code to find the right 1965 * root and decrement availrmem by the same amount it was incremented when the 1966 * page was locked. 1967 */ 1968 static int 1969 segvn_pp_lock_anonpages(page_t *pp, int first) 1970 { 1971 pgcnt_t pages; 1972 pfn_t pfn; 1973 uchar_t szc = pp->p_szc; 1974 1975 ASSERT(PAGE_LOCKED(pp)); 1976 ASSERT(pp->p_vnode != NULL); 1977 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 1978 1979 /* 1980 * pagesize won't change as long as any constituent page is locked. 1981 */ 1982 pages = page_get_pagecnt(pp->p_szc); 1983 pfn = page_pptonum(pp); 1984 1985 if (!first) { 1986 if (!IS_P2ALIGNED(pfn, pages)) { 1987 #ifdef DEBUG 1988 pp = &pp[-(spgcnt_t)(pfn & (pages - 1))]; 1989 pfn = page_pptonum(pp); 1990 ASSERT(IS_P2ALIGNED(pfn, pages)); 1991 ASSERT(pp->p_szc == szc); 1992 ASSERT(pp->p_vnode != NULL); 1993 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 1994 ASSERT(pp->p_slckcnt != 0); 1995 #endif /* DEBUG */ 1996 return (1); 1997 } 1998 } else if (!IS_P2ALIGNED(pfn, pages)) { 1999 pp = &pp[-(spgcnt_t)(pfn & (pages - 1))]; 2000 #ifdef DEBUG 2001 pfn = page_pptonum(pp); 2002 ASSERT(IS_P2ALIGNED(pfn, pages)); 2003 ASSERT(pp->p_szc == szc); 2004 ASSERT(pp->p_vnode != NULL); 2005 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 2006 #endif /* DEBUG */ 2007 } 2008 2009 /* 2010 * pp is a root page. 2011 * We haven't locked this large page yet. 2012 */ 2013 page_struct_lock(pp); 2014 if (pp->p_slckcnt != 0) { 2015 if (pp->p_slckcnt < PAGE_SLOCK_MAXIMUM) { 2016 pp->p_slckcnt++; 2017 page_struct_unlock(pp); 2018 return (1); 2019 } 2020 page_struct_unlock(pp); 2021 segvn_lpglck_limit++; 2022 return (0); 2023 } 2024 mutex_enter(&freemem_lock); 2025 if (availrmem < tune.t_minarmem + pages) { 2026 mutex_exit(&freemem_lock); 2027 page_struct_unlock(pp); 2028 return (0); 2029 } 2030 pp->p_slckcnt++; 2031 availrmem -= pages; 2032 mutex_exit(&freemem_lock); 2033 page_struct_unlock(pp); 2034 return (1); 2035 } 2036 2037 static void 2038 segvn_pp_unlock_anonpages(page_t *pp, int first) 2039 { 2040 pgcnt_t pages; 2041 pfn_t pfn; 2042 2043 ASSERT(PAGE_LOCKED(pp)); 2044 ASSERT(pp->p_vnode != NULL); 2045 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 2046 2047 /* 2048 * pagesize won't change as long as any constituent page is locked. 2049 */ 2050 pages = page_get_pagecnt(pp->p_szc); 2051 pfn = page_pptonum(pp); 2052 2053 if (!first) { 2054 if (!IS_P2ALIGNED(pfn, pages)) { 2055 return; 2056 } 2057 } else if (!IS_P2ALIGNED(pfn, pages)) { 2058 pp = &pp[-(spgcnt_t)(pfn & (pages - 1))]; 2059 #ifdef DEBUG 2060 pfn = page_pptonum(pp); 2061 ASSERT(IS_P2ALIGNED(pfn, pages)); 2062 #endif /* DEBUG */ 2063 } 2064 ASSERT(pp->p_vnode != NULL); 2065 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 2066 ASSERT(pp->p_slckcnt != 0); 2067 page_struct_lock(pp); 2068 if (--pp->p_slckcnt == 0) { 2069 mutex_enter(&freemem_lock); 2070 availrmem += pages; 2071 mutex_exit(&freemem_lock); 2072 } 2073 page_struct_unlock(pp); 2074 } 2075 2076 /* 2077 * Do a F_SOFTUNLOCK call over the range requested. The range must have 2078 * already been F_SOFTLOCK'ed. 2079 * Caller must always match addr and len of a softunlock with a previous 2080 * softlock with exactly the same addr and len. 
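 *
 * Schematically, a range faulted with type == F_SOFTLOCK over
 * [addr, addr + len) must later be undone by a single matching call
 *
 *	segvn_softunlock(seg, addr, len, rw);
 *
 * so that the availrmem, segvn_pages_locked and svd->softlockcnt
 * adjustments made at softlock time are reversed for exactly the same
 * set of pages.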
2081 */ 2082 static void 2083 segvn_softunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw) 2084 { 2085 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 2086 page_t *pp; 2087 caddr_t adr; 2088 struct vnode *vp; 2089 u_offset_t offset; 2090 ulong_t anon_index; 2091 struct anon_map *amp; 2092 struct anon *ap = NULL; 2093 2094 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 2095 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 2096 2097 if ((amp = svd->amp) != NULL) 2098 anon_index = svd->anon_index + seg_page(seg, addr); 2099 2100 hat_unlock(seg->s_as->a_hat, addr, len); 2101 for (adr = addr; adr < addr + len; adr += PAGESIZE) { 2102 if (amp != NULL) { 2103 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 2104 if ((ap = anon_get_ptr(amp->ahp, anon_index++)) 2105 != NULL) { 2106 swap_xlate(ap, &vp, &offset); 2107 } else { 2108 vp = svd->vp; 2109 offset = svd->offset + 2110 (uintptr_t)(adr - seg->s_base); 2111 } 2112 ANON_LOCK_EXIT(&->a_rwlock); 2113 } else { 2114 vp = svd->vp; 2115 offset = svd->offset + 2116 (uintptr_t)(adr - seg->s_base); 2117 } 2118 2119 /* 2120 * Use page_find() instead of page_lookup() to 2121 * find the page since we know that it is locked. 2122 */ 2123 pp = page_find(vp, offset); 2124 if (pp == NULL) { 2125 panic( 2126 "segvn_softunlock: addr %p, ap %p, vp %p, off %llx", 2127 (void *)adr, (void *)ap, (void *)vp, offset); 2128 /*NOTREACHED*/ 2129 } 2130 2131 if (rw == S_WRITE) { 2132 hat_setrefmod(pp); 2133 if (seg->s_as->a_vbits) 2134 hat_setstat(seg->s_as, adr, PAGESIZE, 2135 P_REF | P_MOD); 2136 } else if (rw != S_OTHER) { 2137 hat_setref(pp); 2138 if (seg->s_as->a_vbits) 2139 hat_setstat(seg->s_as, adr, PAGESIZE, P_REF); 2140 } 2141 TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT, 2142 "segvn_fault:pp %p vp %p offset %llx", pp, vp, offset); 2143 if (svd->vp == NULL) { 2144 segvn_pp_unlock_anonpages(pp, adr == addr); 2145 } 2146 page_unlock(pp); 2147 } 2148 mutex_enter(&freemem_lock); /* for availrmem */ 2149 if (svd->vp != NULL) { 2150 availrmem += btop(len); 2151 } 2152 segvn_pages_locked -= btop(len); 2153 svd->softlockcnt -= btop(len); 2154 mutex_exit(&freemem_lock); 2155 if (svd->softlockcnt == 0) { 2156 /* 2157 * All SOFTLOCKS are gone. Wakeup any waiting 2158 * unmappers so they can try again to unmap. 2159 * Check for waiters first without the mutex 2160 * held so we don't always grab the mutex on 2161 * softunlocks. 2162 */ 2163 if (AS_ISUNMAPWAIT(seg->s_as)) { 2164 mutex_enter(&seg->s_as->a_contents); 2165 if (AS_ISUNMAPWAIT(seg->s_as)) { 2166 AS_CLRUNMAPWAIT(seg->s_as); 2167 cv_broadcast(&seg->s_as->a_cv); 2168 } 2169 mutex_exit(&seg->s_as->a_contents); 2170 } 2171 } 2172 } 2173 2174 #define PAGE_HANDLED ((page_t *)-1) 2175 2176 /* 2177 * Release all the pages in the NULL terminated ppp list 2178 * which haven't already been converted to PAGE_HANDLED. 2179 */ 2180 static void 2181 segvn_pagelist_rele(page_t **ppp) 2182 { 2183 for (; *ppp != NULL; ppp++) { 2184 if (*ppp != PAGE_HANDLED) 2185 page_unlock(*ppp); 2186 } 2187 } 2188 2189 static int stealcow = 1; 2190 2191 /* 2192 * Workaround for viking chip bug. See bug id 1220902. 2193 * To fix this down in pagefault() would require importing so 2194 * much as and segvn code as to be unmaintainable. 2195 */ 2196 int enable_mbit_wa = 0; 2197 2198 /* 2199 * Handles all the dirty work of getting the right 2200 * anonymous pages and loading up the translations. 2201 * This routine is called only from segvn_fault() 2202 * when looping over the range of addresses requested. 
2203 * 2204 * The basic algorithm here is: 2205 * If this is an anon_zero case 2206 * Call anon_zero to allocate page 2207 * Load up translation 2208 * Return 2209 * endif 2210 * If this is an anon page 2211 * Use anon_getpage to get the page 2212 * else 2213 * Find page in pl[] list passed in 2214 * endif 2215 * If not a cow 2216 * Load up the translation to the page 2217 * return 2218 * endif 2219 * Call anon_private to handle cow 2220 * Load up (writable) translation to new page 2221 */ 2222 static faultcode_t 2223 segvn_faultpage( 2224 struct hat *hat, /* the hat to use for mapping */ 2225 struct seg *seg, /* seg_vn of interest */ 2226 caddr_t addr, /* address in as */ 2227 u_offset_t off, /* offset in vp */ 2228 struct vpage *vpage, /* pointer to vpage for vp, off */ 2229 page_t *pl[], /* object source page pointer */ 2230 uint_t vpprot, /* access allowed to object pages */ 2231 enum fault_type type, /* type of fault */ 2232 enum seg_rw rw, /* type of access at fault */ 2233 int brkcow, /* we may need to break cow */ 2234 int first) /* first page for this fault if 1 */ 2235 { 2236 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 2237 page_t *pp, **ppp; 2238 uint_t pageflags = 0; 2239 page_t *anon_pl[1 + 1]; 2240 page_t *opp = NULL; /* original page */ 2241 uint_t prot; 2242 int err; 2243 int cow; 2244 int claim; 2245 int steal = 0; 2246 ulong_t anon_index; 2247 struct anon *ap, *oldap; 2248 struct anon_map *amp; 2249 int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD; 2250 int anon_lock = 0; 2251 anon_sync_obj_t cookie; 2252 2253 if (svd->flags & MAP_TEXT) { 2254 hat_flag |= HAT_LOAD_TEXT; 2255 } 2256 2257 ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock)); 2258 ASSERT(seg->s_szc == 0); 2259 2260 /* 2261 * Initialize protection value for this page. 2262 * If we have per page protection values check it now. 2263 */ 2264 if (svd->pageprot) { 2265 uint_t protchk; 2266 2267 switch (rw) { 2268 case S_READ: 2269 protchk = PROT_READ; 2270 break; 2271 case S_WRITE: 2272 protchk = PROT_WRITE; 2273 break; 2274 case S_EXEC: 2275 protchk = PROT_EXEC; 2276 break; 2277 case S_OTHER: 2278 default: 2279 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 2280 break; 2281 } 2282 2283 prot = VPP_PROT(vpage); 2284 if ((prot & protchk) == 0) 2285 return (FC_PROT); /* illegal access type */ 2286 } else { 2287 prot = svd->prot; 2288 } 2289 2290 if (type == F_SOFTLOCK && svd->vp != NULL) { 2291 mutex_enter(&freemem_lock); 2292 if (availrmem <= tune.t_minarmem) { 2293 mutex_exit(&freemem_lock); 2294 return (FC_MAKE_ERR(ENOMEM)); /* out of real memory */ 2295 } else { 2296 availrmem--; 2297 svd->softlockcnt++; 2298 segvn_pages_locked++; 2299 } 2300 mutex_exit(&freemem_lock); 2301 } 2302 2303 /* 2304 * Always acquire the anon array lock to prevent 2 threads from 2305 * allocating separate anon slots for the same "addr". 2306 */ 2307 2308 if ((amp = svd->amp) != NULL) { 2309 ASSERT(RW_READ_HELD(&->a_rwlock)); 2310 anon_index = svd->anon_index + seg_page(seg, addr); 2311 anon_array_enter(amp, anon_index, &cookie); 2312 anon_lock = 1; 2313 } 2314 2315 if (svd->vp == NULL && amp != NULL) { 2316 if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) { 2317 /* 2318 * Allocate a (normally) writable anonymous page of 2319 * zeroes. If no advance reservations, reserve now. 
2320 */ 2321 if (svd->flags & MAP_NORESERVE) { 2322 if (anon_resv(ptob(1))) { 2323 svd->swresv += ptob(1); 2324 } else { 2325 err = ENOMEM; 2326 goto out; 2327 } 2328 } 2329 if ((pp = anon_zero(seg, addr, &ap, 2330 svd->cred)) == NULL) { 2331 err = ENOMEM; 2332 goto out; /* out of swap space */ 2333 } 2334 /* 2335 * Re-acquire the anon_map lock and 2336 * initialize the anon array entry. 2337 */ 2338 (void) anon_set_ptr(amp->ahp, anon_index, ap, 2339 ANON_SLEEP); 2340 2341 ASSERT(pp->p_szc == 0); 2342 2343 /* 2344 * Handle pages that have been marked for migration 2345 */ 2346 if (lgrp_optimizations()) 2347 page_migrate(seg, addr, &pp, 1); 2348 2349 if (type == F_SOFTLOCK) { 2350 if (!segvn_pp_lock_anonpages(pp, first)) { 2351 page_unlock(pp); 2352 err = ENOMEM; 2353 goto out; 2354 } else { 2355 mutex_enter(&freemem_lock); 2356 svd->softlockcnt++; 2357 segvn_pages_locked++; 2358 mutex_exit(&freemem_lock); 2359 } 2360 } 2361 2362 if (enable_mbit_wa) { 2363 if (rw == S_WRITE) 2364 hat_setmod(pp); 2365 else if (!hat_ismod(pp)) 2366 prot &= ~PROT_WRITE; 2367 } 2368 /* 2369 * If AS_PAGLCK is set in a_flags (via memcntl(2) 2370 * with MC_LOCKAS, MCL_FUTURE) and this is a 2371 * MAP_NORESERVE segment, we may need to 2372 * permanently lock the page as it is being faulted 2373 * for the first time. The following text applies 2374 * only to MAP_NORESERVE segments: 2375 * 2376 * As per memcntl(2), if this segment was created 2377 * after MCL_FUTURE was applied (a "future" 2378 * segment), its pages must be locked. If this 2379 * segment existed at MCL_FUTURE application (a 2380 * "past" segment), the interface is unclear. 2381 * 2382 * We decide to lock only if vpage is present: 2383 * 2384 * - "future" segments will have a vpage array (see 2385 * as_map), and so will be locked as required 2386 * 2387 * - "past" segments may not have a vpage array, 2388 * depending on whether events (such as 2389 * mprotect) have occurred. Locking if vpage 2390 * exists will preserve legacy behavior. Not 2391 * locking if vpage is absent, will not break 2392 * the interface or legacy behavior. Note that 2393 * allocating vpage here if it's absent requires 2394 * upgrading the segvn reader lock, the cost of 2395 * which does not seem worthwhile. 2396 */ 2397 if (AS_ISPGLCK(seg->s_as) && vpage != NULL && 2398 (svd->flags & MAP_NORESERVE)) { 2399 claim = VPP_PROT(vpage) & PROT_WRITE; 2400 ASSERT(svd->type == MAP_PRIVATE); 2401 if (page_pp_lock(pp, claim, 0)) 2402 VPP_SETPPLOCK(vpage); 2403 } 2404 2405 hat_memload(hat, addr, pp, prot, hat_flag); 2406 2407 if (!(hat_flag & HAT_LOAD_LOCK)) 2408 page_unlock(pp); 2409 2410 anon_array_exit(&cookie); 2411 return (0); 2412 } 2413 } 2414 2415 /* 2416 * Obtain the page structure via anon_getpage() if it is 2417 * a private copy of an object (the result of a previous 2418 * copy-on-write). 2419 */ 2420 if (amp != NULL) { 2421 if ((ap = anon_get_ptr(amp->ahp, anon_index)) != NULL) { 2422 err = anon_getpage(&ap, &vpprot, anon_pl, PAGESIZE, 2423 seg, addr, rw, svd->cred); 2424 if (err) 2425 goto out; 2426 2427 if (svd->type == MAP_SHARED) { 2428 /* 2429 * If this is a shared mapping to an 2430 * anon_map, then ignore the write 2431 * permissions returned by anon_getpage(). 2432 * They apply to the private mappings 2433 * of this anon_map. 2434 */ 2435 vpprot |= PROT_WRITE; 2436 } 2437 opp = anon_pl[0]; 2438 } 2439 } 2440 2441 /* 2442 * Search the pl[] list passed in if it is from the 2443 * original object (i.e., not a private copy). 
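 *
 * pl[] is the NULL terminated list of object pages handed down from
 * segvn_fault(); entries consumed here are overwritten with the
 * PAGE_HANDLED sentinel so that segvn_pagelist_rele() skips them when
 * the remaining pages are released.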
2444 */ 2445 if (opp == NULL) { 2446 /* 2447 * Find original page. We must be bringing it in 2448 * from the list in pl[]. 2449 */ 2450 for (ppp = pl; (opp = *ppp) != NULL; ppp++) { 2451 if (opp == PAGE_HANDLED) 2452 continue; 2453 ASSERT(opp->p_vnode == svd->vp); /* XXX */ 2454 if (opp->p_offset == off) 2455 break; 2456 } 2457 if (opp == NULL) { 2458 panic("segvn_faultpage not found"); 2459 /*NOTREACHED*/ 2460 } 2461 *ppp = PAGE_HANDLED; 2462 2463 } 2464 2465 ASSERT(PAGE_LOCKED(opp)); 2466 2467 TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT, 2468 "segvn_fault:pp %p vp %p offset %llx", 2469 opp, NULL, 0); 2470 2471 /* 2472 * The fault is treated as a copy-on-write fault if a 2473 * write occurs on a private segment and the object 2474 * page (i.e., mapping) is write protected. We assume 2475 * that fatal protection checks have already been made. 2476 */ 2477 2478 cow = brkcow && ((vpprot & PROT_WRITE) == 0); 2479 2480 /* 2481 * If not a copy-on-write case load the translation 2482 * and return. 2483 */ 2484 if (cow == 0) { 2485 2486 /* 2487 * Handle pages that have been marked for migration 2488 */ 2489 if (lgrp_optimizations()) 2490 page_migrate(seg, addr, &opp, 1); 2491 2492 if (type == F_SOFTLOCK && svd->vp == NULL) { 2493 2494 ASSERT(opp->p_szc == 0 || 2495 (svd->type == MAP_SHARED && 2496 amp != NULL && amp->a_szc != 0)); 2497 2498 if (!segvn_pp_lock_anonpages(opp, first)) { 2499 page_unlock(opp); 2500 err = ENOMEM; 2501 goto out; 2502 } else { 2503 mutex_enter(&freemem_lock); 2504 svd->softlockcnt++; 2505 segvn_pages_locked++; 2506 mutex_exit(&freemem_lock); 2507 } 2508 } 2509 if (IS_VMODSORT(opp->p_vnode) || enable_mbit_wa) { 2510 if (rw == S_WRITE) 2511 hat_setmod(opp); 2512 else if (rw != S_OTHER && !hat_ismod(opp)) 2513 prot &= ~PROT_WRITE; 2514 } 2515 2516 hat_memload(hat, addr, opp, prot & vpprot, hat_flag); 2517 2518 if (!(hat_flag & HAT_LOAD_LOCK)) 2519 page_unlock(opp); 2520 2521 if (anon_lock) { 2522 anon_array_exit(&cookie); 2523 } 2524 return (0); 2525 } 2526 2527 hat_setref(opp); 2528 2529 ASSERT(amp != NULL && anon_lock); 2530 2531 /* 2532 * Steal the page only if it isn't a private page 2533 * since stealing a private page is not worth the effort. 2534 */ 2535 if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) 2536 steal = 1; 2537 2538 /* 2539 * Steal the original page if the following conditions are true: 2540 * 2541 * We are low on memory, the page is not private, page is not large, 2542 * not shared, not modified, not `locked' or if we have it `locked' 2543 * (i.e., p_cowcnt == 1 and p_lckcnt == 0, which also implies 2544 * that the page is not shared) and if it doesn't have any 2545 * translations. page_struct_lock isn't needed to look at p_cowcnt 2546 * and p_lckcnt because we first get exclusive lock on page. 2547 */ 2548 (void) hat_pagesync(opp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD); 2549 2550 if (stealcow && freemem < minfree && steal && opp->p_szc == 0 && 2551 page_tryupgrade(opp) && !hat_ismod(opp) && 2552 ((opp->p_lckcnt == 0 && opp->p_cowcnt == 0) || 2553 (opp->p_lckcnt == 0 && opp->p_cowcnt == 1 && 2554 vpage != NULL && VPP_ISPPLOCK(vpage)))) { 2555 /* 2556 * Check if this page has other translations 2557 * after unloading our translation. 2558 */ 2559 if (hat_page_is_mapped(opp)) { 2560 hat_unload(seg->s_as->a_hat, addr, PAGESIZE, 2561 HAT_UNLOAD); 2562 } 2563 2564 /* 2565 * hat_unload() might sync back someone else's recent 2566 * modification, so check again. 
2567 */ 2568 if (!hat_ismod(opp) && !hat_page_is_mapped(opp)) 2569 pageflags |= STEAL_PAGE; 2570 } 2571 2572 /* 2573 * If we have a vpage pointer, see if it indicates that we have 2574 * ``locked'' the page we map -- if so, tell anon_private to 2575 * transfer the locking resource to the new page. 2576 * 2577 * See Statement at the beginning of segvn_lockop regarding 2578 * the way lockcnts/cowcnts are handled during COW. 2579 * 2580 */ 2581 if (vpage != NULL && VPP_ISPPLOCK(vpage)) 2582 pageflags |= LOCK_PAGE; 2583 2584 /* 2585 * Allocate a private page and perform the copy. 2586 * For MAP_NORESERVE reserve swap space now, unless this 2587 * is a cow fault on an existing anon page in which case 2588 * MAP_NORESERVE will have made advance reservations. 2589 */ 2590 if ((svd->flags & MAP_NORESERVE) && (ap == NULL)) { 2591 if (anon_resv(ptob(1))) { 2592 svd->swresv += ptob(1); 2593 } else { 2594 page_unlock(opp); 2595 err = ENOMEM; 2596 goto out; 2597 } 2598 } 2599 oldap = ap; 2600 pp = anon_private(&ap, seg, addr, prot, opp, pageflags, svd->cred); 2601 if (pp == NULL) { 2602 err = ENOMEM; /* out of swap space */ 2603 goto out; 2604 } 2605 2606 /* 2607 * If we copied away from an anonymous page, then 2608 * we are one step closer to freeing up an anon slot. 2609 * 2610 * NOTE: The original anon slot must be released while 2611 * holding the "anon_map" lock. This is necessary to prevent 2612 * other threads from obtaining a pointer to the anon slot 2613 * which may be freed if its "refcnt" is 1. 2614 */ 2615 if (oldap != NULL) 2616 anon_decref(oldap); 2617 2618 (void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP); 2619 2620 /* 2621 * Handle pages that have been marked for migration 2622 */ 2623 if (lgrp_optimizations()) 2624 page_migrate(seg, addr, &pp, 1); 2625 2626 ASSERT(pp->p_szc == 0); 2627 if (type == F_SOFTLOCK && svd->vp == NULL) { 2628 if (!segvn_pp_lock_anonpages(pp, first)) { 2629 page_unlock(pp); 2630 err = ENOMEM; 2631 goto out; 2632 } else { 2633 mutex_enter(&freemem_lock); 2634 svd->softlockcnt++; 2635 segvn_pages_locked++; 2636 mutex_exit(&freemem_lock); 2637 } 2638 } 2639 2640 ASSERT(!IS_VMODSORT(pp->p_vnode)); 2641 if (enable_mbit_wa) { 2642 if (rw == S_WRITE) 2643 hat_setmod(pp); 2644 else if (!hat_ismod(pp)) 2645 prot &= ~PROT_WRITE; 2646 } 2647 2648 hat_memload(hat, addr, pp, prot, hat_flag); 2649 2650 if (!(hat_flag & HAT_LOAD_LOCK)) 2651 page_unlock(pp); 2652 2653 ASSERT(anon_lock); 2654 anon_array_exit(&cookie); 2655 return (0); 2656 out: 2657 if (anon_lock) 2658 anon_array_exit(&cookie); 2659 2660 if (type == F_SOFTLOCK && svd->vp != NULL) { 2661 mutex_enter(&freemem_lock); 2662 availrmem++; 2663 segvn_pages_locked--; 2664 svd->softlockcnt--; 2665 mutex_exit(&freemem_lock); 2666 } 2667 return (FC_MAKE_ERR(err)); 2668 } 2669 2670 /* 2671 * relocate a bunch of smaller targ pages into one large repl page. all targ 2672 * pages must be complete pages smaller than replacement pages. 2673 * it's assumed that no page's szc can change since they are all PAGESIZE or 2674 * complete large pages locked SHARED. 
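 *
 * On return every targ[] entry points at the corresponding constituent
 * of the single large replacement page and has been downgraded to a
 * shared lock; if page_relocate() fails or relocates an unexpected
 * number of pages the routine panics rather than trying to recover.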
2675 */ 2676 static void 2677 segvn_relocate_pages(page_t **targ, page_t *replacement) 2678 { 2679 page_t *pp; 2680 pgcnt_t repl_npgs, curnpgs; 2681 pgcnt_t i; 2682 uint_t repl_szc = replacement->p_szc; 2683 page_t *first_repl = replacement; 2684 page_t *repl; 2685 spgcnt_t npgs; 2686 2687 VM_STAT_ADD(segvnvmstats.relocatepages[0]); 2688 2689 ASSERT(repl_szc != 0); 2690 npgs = repl_npgs = page_get_pagecnt(repl_szc); 2691 2692 i = 0; 2693 while (repl_npgs) { 2694 spgcnt_t nreloc; 2695 int err; 2696 ASSERT(replacement != NULL); 2697 pp = targ[i]; 2698 ASSERT(pp->p_szc < repl_szc); 2699 ASSERT(PAGE_EXCL(pp)); 2700 ASSERT(!PP_ISFREE(pp)); 2701 curnpgs = page_get_pagecnt(pp->p_szc); 2702 if (curnpgs == 1) { 2703 VM_STAT_ADD(segvnvmstats.relocatepages[1]); 2704 repl = replacement; 2705 page_sub(&replacement, repl); 2706 ASSERT(PAGE_EXCL(repl)); 2707 ASSERT(!PP_ISFREE(repl)); 2708 ASSERT(repl->p_szc == repl_szc); 2709 } else { 2710 page_t *repl_savepp; 2711 int j; 2712 VM_STAT_ADD(segvnvmstats.relocatepages[2]); 2713 repl_savepp = replacement; 2714 for (j = 0; j < curnpgs; j++) { 2715 repl = replacement; 2716 page_sub(&replacement, repl); 2717 ASSERT(PAGE_EXCL(repl)); 2718 ASSERT(!PP_ISFREE(repl)); 2719 ASSERT(repl->p_szc == repl_szc); 2720 ASSERT(page_pptonum(targ[i + j]) == 2721 page_pptonum(targ[i]) + j); 2722 } 2723 repl = repl_savepp; 2724 ASSERT(IS_P2ALIGNED(page_pptonum(repl), curnpgs)); 2725 } 2726 err = page_relocate(&pp, &repl, 0, 1, &nreloc, NULL); 2727 if (err || nreloc != curnpgs) { 2728 panic("segvn_relocate_pages: " 2729 "page_relocate failed err=%d curnpgs=%ld " 2730 "nreloc=%ld", err, curnpgs, nreloc); 2731 } 2732 ASSERT(curnpgs <= repl_npgs); 2733 repl_npgs -= curnpgs; 2734 i += curnpgs; 2735 } 2736 ASSERT(replacement == NULL); 2737 2738 repl = first_repl; 2739 repl_npgs = npgs; 2740 for (i = 0; i < repl_npgs; i++) { 2741 ASSERT(PAGE_EXCL(repl)); 2742 ASSERT(!PP_ISFREE(repl)); 2743 targ[i] = repl; 2744 page_downgrade(targ[i]); 2745 repl++; 2746 } 2747 } 2748 2749 /* 2750 * Check if all pages in ppa array are complete smaller than szc pages and 2751 * their roots will still be aligned relative to their current size if the 2752 * entire ppa array is relocated into one szc page. If these conditions are 2753 * not met return 0. 2754 * 2755 * If all pages are properly aligned attempt to upgrade their locks 2756 * to exclusive mode. If it fails set *upgrdfail to 1 and return 0. 2757 * upgrdfail was set to 0 by caller. 2758 * 2759 * Return 1 if all pages are aligned and locked exclusively. 2760 * 2761 * If all pages in ppa array happen to be physically contiguous to make one 2762 * szc page and all exclusive locks are successfully obtained promote the page 2763 * size to szc and set *pszc to szc. Return 1 with pages locked shared. 
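 *
 * A caller typically looks something like this (sketch only, with
 * hypothetical local names):
 *
 *	int upgrdfail = 0;
 *	uint_t pszc;
 *
 *	if (!segvn_full_szcpages(ppa, szc, &upgrdfail, &pszc)) {
 *		if (upgrdfail)
 *			(size down and retry the fault)
 *		else
 *			(misaligned; map the range with small pages)
 *	}
 *
 * where ppa[] holds page_get_pagecnt(szc) pages locked SE_SHARED.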
2764 */ 2765 static int 2766 segvn_full_szcpages(page_t **ppa, uint_t szc, int *upgrdfail, uint_t *pszc) 2767 { 2768 page_t *pp; 2769 pfn_t pfn; 2770 pgcnt_t totnpgs = page_get_pagecnt(szc); 2771 pfn_t first_pfn; 2772 int contig = 1; 2773 pgcnt_t i; 2774 pgcnt_t j; 2775 uint_t curszc; 2776 pgcnt_t curnpgs; 2777 int root = 0; 2778 2779 ASSERT(szc > 0); 2780 2781 VM_STAT_ADD(segvnvmstats.fullszcpages[0]); 2782 2783 for (i = 0; i < totnpgs; i++) { 2784 pp = ppa[i]; 2785 ASSERT(PAGE_SHARED(pp)); 2786 ASSERT(!PP_ISFREE(pp)); 2787 pfn = page_pptonum(pp); 2788 if (i == 0) { 2789 if (!IS_P2ALIGNED(pfn, totnpgs)) { 2790 contig = 0; 2791 } else { 2792 first_pfn = pfn; 2793 } 2794 } else if (contig && pfn != first_pfn + i) { 2795 contig = 0; 2796 } 2797 if (pp->p_szc == 0) { 2798 if (root) { 2799 VM_STAT_ADD(segvnvmstats.fullszcpages[1]); 2800 return (0); 2801 } 2802 } else if (!root) { 2803 if ((curszc = pp->p_szc) >= szc) { 2804 VM_STAT_ADD(segvnvmstats.fullszcpages[2]); 2805 return (0); 2806 } 2807 if (curszc == 0) { 2808 /* 2809 * p_szc changed means we don't have all pages 2810 * locked. return failure. 2811 */ 2812 VM_STAT_ADD(segvnvmstats.fullszcpages[3]); 2813 return (0); 2814 } 2815 curnpgs = page_get_pagecnt(curszc); 2816 if (!IS_P2ALIGNED(pfn, curnpgs) || 2817 !IS_P2ALIGNED(i, curnpgs)) { 2818 VM_STAT_ADD(segvnvmstats.fullszcpages[4]); 2819 return (0); 2820 } 2821 root = 1; 2822 } else { 2823 ASSERT(i > 0); 2824 VM_STAT_ADD(segvnvmstats.fullszcpages[5]); 2825 if (pp->p_szc != curszc) { 2826 VM_STAT_ADD(segvnvmstats.fullszcpages[6]); 2827 return (0); 2828 } 2829 if (pfn - 1 != page_pptonum(ppa[i - 1])) { 2830 panic("segvn_full_szcpages: " 2831 "large page not physically contiguous"); 2832 } 2833 if (P2PHASE(pfn, curnpgs) == curnpgs - 1) { 2834 root = 0; 2835 } 2836 } 2837 } 2838 2839 for (i = 0; i < totnpgs; i++) { 2840 ASSERT(ppa[i]->p_szc < szc); 2841 if (!page_tryupgrade(ppa[i])) { 2842 for (j = 0; j < i; j++) { 2843 page_downgrade(ppa[j]); 2844 } 2845 *pszc = ppa[i]->p_szc; 2846 *upgrdfail = 1; 2847 VM_STAT_ADD(segvnvmstats.fullszcpages[7]); 2848 return (0); 2849 } 2850 } 2851 2852 /* 2853 * When a page is put a free cachelist its szc is set to 0. if file 2854 * system reclaimed pages from cachelist targ pages will be physically 2855 * contiguous with 0 p_szc. in this case just upgrade szc of targ 2856 * pages without any relocations. 2857 * To avoid any hat issues with previous small mappings 2858 * hat_pageunload() the target pages first. 2859 */ 2860 if (contig) { 2861 VM_STAT_ADD(segvnvmstats.fullszcpages[8]); 2862 for (i = 0; i < totnpgs; i++) { 2863 (void) hat_pageunload(ppa[i], HAT_FORCE_PGUNLOAD); 2864 } 2865 for (i = 0; i < totnpgs; i++) { 2866 ppa[i]->p_szc = szc; 2867 } 2868 for (i = 0; i < totnpgs; i++) { 2869 ASSERT(PAGE_EXCL(ppa[i])); 2870 page_downgrade(ppa[i]); 2871 } 2872 if (pszc != NULL) { 2873 *pszc = szc; 2874 } 2875 } 2876 VM_STAT_ADD(segvnvmstats.fullszcpages[9]); 2877 return (1); 2878 } 2879 2880 /* 2881 * Create physically contiguous pages for [vp, off] - [vp, off + 2882 * page_size(szc)) range and for private segment return them in ppa array. 2883 * Pages are created either via IO or relocations. 2884 * 2885 * Return 1 on sucess and 0 on failure. 2886 * 2887 * If physically contiguos pages already exist for this range return 1 without 2888 * filling ppa array. Caller initializes ppa[0] as NULL to detect that ppa 2889 * array wasn't filled. In this case caller fills ppa array via VOP_GETPAGE(). 
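 *
 * To summarize the contract: the caller passes ppa[] with ppa[0] set
 * to NULL and a preallocated large page list in *ppplist; on a return
 * of 1 either ppa[] has been filled with the constituent pages or
 * ppa[0] is still NULL (contiguous pages already existed and the
 * caller falls back to VOP_GETPAGE()); on a return of 0 with *downsize
 * set the caller may retry with a smaller page size, while 0 without
 * *downsize means the caller should fall back to VOP_GETPAGE() and
 * small mappings for this range.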
2890 */ 2891 2892 static int 2893 segvn_fill_vp_pages(struct segvn_data *svd, vnode_t *vp, u_offset_t off, 2894 uint_t szc, page_t **ppa, page_t **ppplist, uint_t *ret_pszc, 2895 int *downsize) 2896 2897 { 2898 page_t *pplist = *ppplist; 2899 size_t pgsz = page_get_pagesize(szc); 2900 pgcnt_t pages = btop(pgsz); 2901 ulong_t start_off = off; 2902 u_offset_t eoff = off + pgsz; 2903 spgcnt_t nreloc; 2904 u_offset_t io_off = off; 2905 size_t io_len; 2906 page_t *io_pplist = NULL; 2907 page_t *done_pplist = NULL; 2908 pgcnt_t pgidx = 0; 2909 page_t *pp; 2910 page_t *newpp; 2911 page_t *targpp; 2912 int io_err = 0; 2913 int i; 2914 pfn_t pfn; 2915 ulong_t ppages; 2916 page_t *targ_pplist = NULL; 2917 page_t *repl_pplist = NULL; 2918 page_t *tmp_pplist; 2919 int nios = 0; 2920 uint_t pszc; 2921 struct vattr va; 2922 2923 VM_STAT_ADD(segvnvmstats.fill_vp_pages[0]); 2924 2925 ASSERT(szc != 0); 2926 ASSERT(pplist->p_szc == szc); 2927 2928 /* 2929 * downsize will be set to 1 only if we fail to lock pages. this will 2930 * allow subsequent faults to try to relocate the page again. If we 2931 * fail due to misalignment don't downsize and let the caller map the 2932 * whole region with small mappings to avoid more faults into the area 2933 * where we can't get large pages anyway. 2934 */ 2935 *downsize = 0; 2936 2937 while (off < eoff) { 2938 newpp = pplist; 2939 ASSERT(newpp != NULL); 2940 ASSERT(PAGE_EXCL(newpp)); 2941 ASSERT(!PP_ISFREE(newpp)); 2942 /* 2943 * we pass NULL for nrelocp to page_lookup_create() 2944 * so that it doesn't relocate. We relocate here 2945 * later only after we make sure we can lock all 2946 * pages in the range we handle and they are all 2947 * aligned. 2948 */ 2949 pp = page_lookup_create(vp, off, SE_SHARED, newpp, NULL, 0); 2950 ASSERT(pp != NULL); 2951 ASSERT(!PP_ISFREE(pp)); 2952 ASSERT(pp->p_vnode == vp); 2953 ASSERT(pp->p_offset == off); 2954 if (pp == newpp) { 2955 VM_STAT_ADD(segvnvmstats.fill_vp_pages[1]); 2956 page_sub(&pplist, pp); 2957 ASSERT(PAGE_EXCL(pp)); 2958 ASSERT(page_iolock_assert(pp)); 2959 page_list_concat(&io_pplist, &pp); 2960 off += PAGESIZE; 2961 continue; 2962 } 2963 VM_STAT_ADD(segvnvmstats.fill_vp_pages[2]); 2964 pfn = page_pptonum(pp); 2965 pszc = pp->p_szc; 2966 if (pszc >= szc && targ_pplist == NULL && io_pplist == NULL && 2967 IS_P2ALIGNED(pfn, pages)) { 2968 ASSERT(repl_pplist == NULL); 2969 ASSERT(done_pplist == NULL); 2970 ASSERT(pplist == *ppplist); 2971 page_unlock(pp); 2972 page_free_replacement_page(pplist); 2973 page_create_putback(pages); 2974 *ppplist = NULL; 2975 VM_STAT_ADD(segvnvmstats.fill_vp_pages[3]); 2976 return (1); 2977 } 2978 if (pszc >= szc) { 2979 page_unlock(pp); 2980 segvn_faultvnmpss_align_err1++; 2981 goto out; 2982 } 2983 ppages = page_get_pagecnt(pszc); 2984 if (!IS_P2ALIGNED(pfn, ppages)) { 2985 ASSERT(pszc > 0); 2986 /* 2987 * sizing down to pszc won't help. 2988 */ 2989 page_unlock(pp); 2990 segvn_faultvnmpss_align_err2++; 2991 goto out; 2992 } 2993 pfn = page_pptonum(newpp); 2994 if (!IS_P2ALIGNED(pfn, ppages)) { 2995 ASSERT(pszc > 0); 2996 /* 2997 * sizing down to pszc won't help. 
2998 */ 2999 page_unlock(pp); 3000 segvn_faultvnmpss_align_err3++; 3001 goto out; 3002 } 3003 if (!PAGE_EXCL(pp)) { 3004 VM_STAT_ADD(segvnvmstats.fill_vp_pages[4]); 3005 page_unlock(pp); 3006 *downsize = 1; 3007 *ret_pszc = pp->p_szc; 3008 goto out; 3009 } 3010 targpp = pp; 3011 if (io_pplist != NULL) { 3012 VM_STAT_ADD(segvnvmstats.fill_vp_pages[5]); 3013 io_len = off - io_off; 3014 /* 3015 * Some file systems like NFS don't check EOF 3016 * conditions in VOP_PAGEIO(). Check it here 3017 * now that pages are locked SE_EXCL. Any file 3018 * truncation will wait until the pages are 3019 * unlocked so no need to worry that file will 3020 * be truncated after we check its size here. 3021 * XXX fix NFS to remove this check. 3022 */ 3023 va.va_mask = AT_SIZE; 3024 if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred) != 0) { 3025 VM_STAT_ADD(segvnvmstats.fill_vp_pages[6]); 3026 page_unlock(targpp); 3027 goto out; 3028 } 3029 if (btopr(va.va_size) < btopr(io_off + io_len)) { 3030 VM_STAT_ADD(segvnvmstats.fill_vp_pages[7]); 3031 *downsize = 1; 3032 *ret_pszc = 0; 3033 page_unlock(targpp); 3034 goto out; 3035 } 3036 io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len, 3037 B_READ, svd->cred); 3038 if (io_err) { 3039 VM_STAT_ADD(segvnvmstats.fill_vp_pages[8]); 3040 page_unlock(targpp); 3041 if (io_err == EDEADLK) { 3042 segvn_vmpss_pageio_deadlk_err++; 3043 } 3044 goto out; 3045 } 3046 nios++; 3047 VM_STAT_ADD(segvnvmstats.fill_vp_pages[9]); 3048 while (io_pplist != NULL) { 3049 pp = io_pplist; 3050 page_sub(&io_pplist, pp); 3051 ASSERT(page_iolock_assert(pp)); 3052 page_io_unlock(pp); 3053 pgidx = (pp->p_offset - start_off) >> 3054 PAGESHIFT; 3055 ASSERT(pgidx < pages); 3056 ppa[pgidx] = pp; 3057 page_list_concat(&done_pplist, &pp); 3058 } 3059 } 3060 pp = targpp; 3061 ASSERT(PAGE_EXCL(pp)); 3062 ASSERT(pp->p_szc <= pszc); 3063 if (pszc != 0 && !group_page_trylock(pp, SE_EXCL)) { 3064 VM_STAT_ADD(segvnvmstats.fill_vp_pages[10]); 3065 page_unlock(pp); 3066 *downsize = 1; 3067 *ret_pszc = pp->p_szc; 3068 goto out; 3069 } 3070 VM_STAT_ADD(segvnvmstats.fill_vp_pages[11]); 3071 /* 3072 * page szc chould have changed before the entire group was 3073 * locked. reread page szc. 
3074 */ 3075 pszc = pp->p_szc; 3076 ppages = page_get_pagecnt(pszc); 3077 3078 /* link just the roots */ 3079 page_list_concat(&targ_pplist, &pp); 3080 page_sub(&pplist, newpp); 3081 page_list_concat(&repl_pplist, &newpp); 3082 off += PAGESIZE; 3083 while (--ppages != 0) { 3084 newpp = pplist; 3085 page_sub(&pplist, newpp); 3086 off += PAGESIZE; 3087 } 3088 io_off = off; 3089 } 3090 if (io_pplist != NULL) { 3091 VM_STAT_ADD(segvnvmstats.fill_vp_pages[12]); 3092 io_len = eoff - io_off; 3093 va.va_mask = AT_SIZE; 3094 if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred) != 0) { 3095 VM_STAT_ADD(segvnvmstats.fill_vp_pages[13]); 3096 goto out; 3097 } 3098 if (btopr(va.va_size) < btopr(io_off + io_len)) { 3099 VM_STAT_ADD(segvnvmstats.fill_vp_pages[14]); 3100 *downsize = 1; 3101 *ret_pszc = 0; 3102 goto out; 3103 } 3104 io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len, 3105 B_READ, svd->cred); 3106 if (io_err) { 3107 VM_STAT_ADD(segvnvmstats.fill_vp_pages[15]); 3108 if (io_err == EDEADLK) { 3109 segvn_vmpss_pageio_deadlk_err++; 3110 } 3111 goto out; 3112 } 3113 nios++; 3114 while (io_pplist != NULL) { 3115 pp = io_pplist; 3116 page_sub(&io_pplist, pp); 3117 ASSERT(page_iolock_assert(pp)); 3118 page_io_unlock(pp); 3119 pgidx = (pp->p_offset - start_off) >> PAGESHIFT; 3120 ASSERT(pgidx < pages); 3121 ppa[pgidx] = pp; 3122 } 3123 } 3124 /* 3125 * we're now bound to succeed or panic. 3126 * remove pages from done_pplist. it's not needed anymore. 3127 */ 3128 while (done_pplist != NULL) { 3129 pp = done_pplist; 3130 page_sub(&done_pplist, pp); 3131 } 3132 VM_STAT_ADD(segvnvmstats.fill_vp_pages[16]); 3133 ASSERT(pplist == NULL); 3134 *ppplist = NULL; 3135 while (targ_pplist != NULL) { 3136 int ret; 3137 VM_STAT_ADD(segvnvmstats.fill_vp_pages[17]); 3138 ASSERT(repl_pplist); 3139 pp = targ_pplist; 3140 page_sub(&targ_pplist, pp); 3141 pgidx = (pp->p_offset - start_off) >> PAGESHIFT; 3142 newpp = repl_pplist; 3143 page_sub(&repl_pplist, newpp); 3144 #ifdef DEBUG 3145 pfn = page_pptonum(pp); 3146 pszc = pp->p_szc; 3147 ppages = page_get_pagecnt(pszc); 3148 ASSERT(IS_P2ALIGNED(pfn, ppages)); 3149 pfn = page_pptonum(newpp); 3150 ASSERT(IS_P2ALIGNED(pfn, ppages)); 3151 ASSERT(P2PHASE(pfn, pages) == pgidx); 3152 #endif 3153 nreloc = 0; 3154 ret = page_relocate(&pp, &newpp, 0, 1, &nreloc, NULL); 3155 if (ret != 0 || nreloc == 0) { 3156 panic("segvn_fill_vp_pages: " 3157 "page_relocate failed"); 3158 } 3159 pp = newpp; 3160 while (nreloc-- != 0) { 3161 ASSERT(PAGE_EXCL(pp)); 3162 ASSERT(pp->p_vnode == vp); 3163 ASSERT(pgidx == 3164 ((pp->p_offset - start_off) >> PAGESHIFT)); 3165 ppa[pgidx++] = pp; 3166 pp++; 3167 } 3168 } 3169 3170 if (svd->type == MAP_PRIVATE) { 3171 VM_STAT_ADD(segvnvmstats.fill_vp_pages[18]); 3172 for (i = 0; i < pages; i++) { 3173 ASSERT(ppa[i] != NULL); 3174 ASSERT(PAGE_EXCL(ppa[i])); 3175 ASSERT(ppa[i]->p_vnode == vp); 3176 ASSERT(ppa[i]->p_offset == 3177 start_off + (i << PAGESHIFT)); 3178 page_downgrade(ppa[i]); 3179 } 3180 ppa[pages] = NULL; 3181 } else { 3182 VM_STAT_ADD(segvnvmstats.fill_vp_pages[19]); 3183 /* 3184 * the caller will still call VOP_GETPAGE() for shared segments 3185 * to check FS write permissions. For private segments we map 3186 * file read only anyway. so no VOP_GETPAGE is needed. 
3187 */ 3188 for (i = 0; i < pages; i++) { 3189 ASSERT(ppa[i] != NULL); 3190 ASSERT(PAGE_EXCL(ppa[i])); 3191 ASSERT(ppa[i]->p_vnode == vp); 3192 ASSERT(ppa[i]->p_offset == 3193 start_off + (i << PAGESHIFT)); 3194 page_unlock(ppa[i]); 3195 } 3196 ppa[0] = NULL; 3197 } 3198 3199 return (1); 3200 out: 3201 /* 3202 * Do the cleanup. Unlock target pages we didn't relocate. They are 3203 * linked on targ_pplist by root pages. reassemble unused replacement 3204 * and io pages back to pplist. 3205 */ 3206 if (io_pplist != NULL) { 3207 VM_STAT_ADD(segvnvmstats.fill_vp_pages[20]); 3208 pp = io_pplist; 3209 do { 3210 ASSERT(pp->p_vnode == vp); 3211 ASSERT(pp->p_offset == io_off); 3212 ASSERT(page_iolock_assert(pp)); 3213 page_io_unlock(pp); 3214 page_hashout(pp, NULL); 3215 io_off += PAGESIZE; 3216 } while ((pp = pp->p_next) != io_pplist); 3217 page_list_concat(&io_pplist, &pplist); 3218 pplist = io_pplist; 3219 } 3220 tmp_pplist = NULL; 3221 while (targ_pplist != NULL) { 3222 VM_STAT_ADD(segvnvmstats.fill_vp_pages[21]); 3223 pp = targ_pplist; 3224 ASSERT(PAGE_EXCL(pp)); 3225 page_sub(&targ_pplist, pp); 3226 3227 pszc = pp->p_szc; 3228 ppages = page_get_pagecnt(pszc); 3229 ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages)); 3230 3231 if (pszc != 0) { 3232 group_page_unlock(pp); 3233 } 3234 page_unlock(pp); 3235 3236 pp = repl_pplist; 3237 ASSERT(pp != NULL); 3238 ASSERT(PAGE_EXCL(pp)); 3239 ASSERT(pp->p_szc == szc); 3240 page_sub(&repl_pplist, pp); 3241 3242 ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages)); 3243 3244 /* relink replacement page */ 3245 page_list_concat(&tmp_pplist, &pp); 3246 while (--ppages != 0) { 3247 VM_STAT_ADD(segvnvmstats.fill_vp_pages[22]); 3248 pp++; 3249 ASSERT(PAGE_EXCL(pp)); 3250 ASSERT(pp->p_szc == szc); 3251 page_list_concat(&tmp_pplist, &pp); 3252 } 3253 } 3254 if (tmp_pplist != NULL) { 3255 VM_STAT_ADD(segvnvmstats.fill_vp_pages[23]); 3256 page_list_concat(&tmp_pplist, &pplist); 3257 pplist = tmp_pplist; 3258 } 3259 /* 3260 * at this point all pages are either on done_pplist or 3261 * pplist. They can't be all on done_pplist otherwise 3262 * we'd've been done. 3263 */ 3264 ASSERT(pplist != NULL); 3265 if (nios != 0) { 3266 VM_STAT_ADD(segvnvmstats.fill_vp_pages[24]); 3267 pp = pplist; 3268 do { 3269 VM_STAT_ADD(segvnvmstats.fill_vp_pages[25]); 3270 ASSERT(pp->p_szc == szc); 3271 ASSERT(PAGE_EXCL(pp)); 3272 ASSERT(pp->p_vnode != vp); 3273 pp->p_szc = 0; 3274 } while ((pp = pp->p_next) != pplist); 3275 3276 pp = done_pplist; 3277 do { 3278 VM_STAT_ADD(segvnvmstats.fill_vp_pages[26]); 3279 ASSERT(pp->p_szc == szc); 3280 ASSERT(PAGE_EXCL(pp)); 3281 ASSERT(pp->p_vnode == vp); 3282 pp->p_szc = 0; 3283 } while ((pp = pp->p_next) != done_pplist); 3284 3285 while (pplist != NULL) { 3286 VM_STAT_ADD(segvnvmstats.fill_vp_pages[27]); 3287 pp = pplist; 3288 page_sub(&pplist, pp); 3289 page_free(pp, 0); 3290 } 3291 3292 while (done_pplist != NULL) { 3293 VM_STAT_ADD(segvnvmstats.fill_vp_pages[28]); 3294 pp = done_pplist; 3295 page_sub(&done_pplist, pp); 3296 page_unlock(pp); 3297 } 3298 *ppplist = NULL; 3299 return (0); 3300 } 3301 ASSERT(pplist == *ppplist); 3302 if (io_err) { 3303 VM_STAT_ADD(segvnvmstats.fill_vp_pages[29]); 3304 /* 3305 * don't downsize on io error. 3306 * see if vop_getpage succeeds. 3307 * pplist may still be used in this case 3308 * for relocations. 
3309 */ 3310 return (0); 3311 } 3312 VM_STAT_ADD(segvnvmstats.fill_vp_pages[30]); 3313 page_free_replacement_page(pplist); 3314 page_create_putback(pages); 3315 *ppplist = NULL; 3316 return (0); 3317 } 3318 3319 int segvn_anypgsz = 0; 3320 3321 #define SEGVN_RESTORE_SOFTLOCK(type, pages) \ 3322 if ((type) == F_SOFTLOCK) { \ 3323 mutex_enter(&freemem_lock); \ 3324 availrmem += (pages); \ 3325 segvn_pages_locked -= (pages); \ 3326 svd->softlockcnt -= (pages); \ 3327 mutex_exit(&freemem_lock); \ 3328 } 3329 3330 #define SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot) \ 3331 if (IS_VMODSORT((ppa)[0]->p_vnode)) { \ 3332 if ((rw) == S_WRITE) { \ 3333 for (i = 0; i < (pages); i++) { \ 3334 ASSERT((ppa)[i]->p_vnode == \ 3335 (ppa)[0]->p_vnode); \ 3336 hat_setmod((ppa)[i]); \ 3337 } \ 3338 } else if ((rw) != S_OTHER && \ 3339 ((prot) & (vpprot) & PROT_WRITE)) { \ 3340 for (i = 0; i < (pages); i++) { \ 3341 ASSERT((ppa)[i]->p_vnode == \ 3342 (ppa)[0]->p_vnode); \ 3343 if (!hat_ismod((ppa)[i])) { \ 3344 prot &= ~PROT_WRITE; \ 3345 break; \ 3346 } \ 3347 } \ 3348 } \ 3349 } 3350 3351 #ifdef VM_STATS 3352 3353 #define SEGVN_VMSTAT_FLTVNPAGES(idx) \ 3354 VM_STAT_ADD(segvnvmstats.fltvnpages[(idx)]); 3355 3356 #else /* VM_STATS */ 3357 3358 #define SEGVN_VMSTAT_FLTVNPAGES(idx) 3359 3360 #endif 3361 3362 static faultcode_t 3363 segvn_fault_vnodepages(struct hat *hat, struct seg *seg, caddr_t lpgaddr, 3364 caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr, 3365 caddr_t eaddr, int brkcow) 3366 { 3367 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 3368 struct anon_map *amp = svd->amp; 3369 uchar_t segtype = svd->type; 3370 uint_t szc = seg->s_szc; 3371 size_t pgsz = page_get_pagesize(szc); 3372 size_t maxpgsz = pgsz; 3373 pgcnt_t pages = btop(pgsz); 3374 pgcnt_t maxpages = pages; 3375 size_t ppasize = (pages + 1) * sizeof (page_t *); 3376 caddr_t a = lpgaddr; 3377 caddr_t maxlpgeaddr = lpgeaddr; 3378 u_offset_t off = svd->offset + (uintptr_t)(a - seg->s_base); 3379 ulong_t aindx = svd->anon_index + seg_page(seg, a); 3380 struct vpage *vpage = (svd->vpage != NULL) ? 3381 &svd->vpage[seg_page(seg, a)] : NULL; 3382 vnode_t *vp = svd->vp; 3383 page_t **ppa; 3384 uint_t pszc; 3385 size_t ppgsz; 3386 pgcnt_t ppages; 3387 faultcode_t err = 0; 3388 int ierr; 3389 int vop_size_err = 0; 3390 uint_t protchk, prot, vpprot; 3391 ulong_t i; 3392 int hat_flag = (type == F_SOFTLOCK) ? 
HAT_LOAD_LOCK : HAT_LOAD; 3393 anon_sync_obj_t an_cookie; 3394 enum seg_rw arw; 3395 int alloc_failed = 0; 3396 int adjszc_chk; 3397 struct vattr va; 3398 int xhat = 0; 3399 page_t *pplist; 3400 pfn_t pfn; 3401 int physcontig; 3402 int upgrdfail; 3403 int segvn_anypgsz_vnode = 0; /* for now map vnode with 2 page sizes */ 3404 3405 ASSERT(szc != 0); 3406 ASSERT(vp != NULL); 3407 ASSERT(brkcow == 0 || amp != NULL); 3408 ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */ 3409 ASSERT(!(svd->flags & MAP_NORESERVE)); 3410 ASSERT(type != F_SOFTUNLOCK); 3411 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 3412 ASSERT(amp == NULL || IS_P2ALIGNED(aindx, maxpages)); 3413 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 3414 ASSERT(seg->s_szc < NBBY * sizeof (int)); 3415 ASSERT(type != F_SOFTLOCK || lpgeaddr - a == maxpgsz); 3416 3417 VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltvnpages[0]); 3418 VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltvnpages[1]); 3419 3420 if (svd->flags & MAP_TEXT) { 3421 hat_flag |= HAT_LOAD_TEXT; 3422 } 3423 3424 if (svd->pageprot) { 3425 switch (rw) { 3426 case S_READ: 3427 protchk = PROT_READ; 3428 break; 3429 case S_WRITE: 3430 protchk = PROT_WRITE; 3431 break; 3432 case S_EXEC: 3433 protchk = PROT_EXEC; 3434 break; 3435 case S_OTHER: 3436 default: 3437 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 3438 break; 3439 } 3440 } else { 3441 prot = svd->prot; 3442 /* caller has already done segment level protection check. */ 3443 } 3444 3445 if (seg->s_as->a_hat != hat) { 3446 xhat = 1; 3447 } 3448 3449 if (rw == S_WRITE && segtype == MAP_PRIVATE) { 3450 SEGVN_VMSTAT_FLTVNPAGES(2); 3451 arw = S_READ; 3452 } else { 3453 arw = rw; 3454 } 3455 3456 ppa = kmem_alloc(ppasize, KM_SLEEP); 3457 3458 VM_STAT_COND_ADD(amp != NULL, segvnvmstats.fltvnpages[3]); 3459 3460 for (;;) { 3461 adjszc_chk = 0; 3462 for (; a < lpgeaddr; a += pgsz, off += pgsz, aindx += pages) { 3463 if (adjszc_chk) { 3464 while (szc < seg->s_szc) { 3465 uintptr_t e; 3466 uint_t tszc; 3467 tszc = segvn_anypgsz_vnode ? 
szc + 1 : 3468 seg->s_szc; 3469 ppgsz = page_get_pagesize(tszc); 3470 if (!IS_P2ALIGNED(a, ppgsz) || 3471 ((alloc_failed >> tszc) & 3472 0x1)) { 3473 break; 3474 } 3475 SEGVN_VMSTAT_FLTVNPAGES(4); 3476 szc = tszc; 3477 pgsz = ppgsz; 3478 pages = btop(pgsz); 3479 e = P2ROUNDUP((uintptr_t)eaddr, pgsz); 3480 lpgeaddr = (caddr_t)e; 3481 } 3482 } 3483 3484 again: 3485 if (IS_P2ALIGNED(a, maxpgsz) && amp != NULL) { 3486 ASSERT(IS_P2ALIGNED(aindx, maxpages)); 3487 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 3488 anon_array_enter(amp, aindx, &an_cookie); 3489 if (anon_get_ptr(amp->ahp, aindx) != NULL) { 3490 SEGVN_VMSTAT_FLTVNPAGES(5); 3491 if (anon_pages(amp->ahp, aindx, 3492 maxpages) != maxpages) { 3493 panic("segvn_fault_vnodepages:" 3494 " empty anon slots\n"); 3495 } 3496 anon_array_exit(&an_cookie); 3497 ANON_LOCK_EXIT(&->a_rwlock); 3498 err = segvn_fault_anonpages(hat, seg, 3499 a, a + maxpgsz, type, rw, 3500 MAX(a, addr), 3501 MIN(a + maxpgsz, eaddr), brkcow); 3502 if (err != 0) { 3503 SEGVN_VMSTAT_FLTVNPAGES(6); 3504 goto out; 3505 } 3506 if (szc < seg->s_szc) { 3507 szc = seg->s_szc; 3508 pgsz = maxpgsz; 3509 pages = maxpages; 3510 lpgeaddr = maxlpgeaddr; 3511 } 3512 goto next; 3513 } else if (anon_pages(amp->ahp, aindx, 3514 maxpages)) { 3515 panic("segvn_fault_vnodepages:" 3516 " non empty anon slots\n"); 3517 } else { 3518 SEGVN_VMSTAT_FLTVNPAGES(7); 3519 anon_array_exit(&an_cookie); 3520 ANON_LOCK_EXIT(&->a_rwlock); 3521 } 3522 } 3523 ASSERT(!brkcow || IS_P2ALIGNED(a, maxpgsz)); 3524 3525 if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) { 3526 ASSERT(vpage != NULL); 3527 prot = VPP_PROT(vpage); 3528 ASSERT(sameprot(seg, a, maxpgsz)); 3529 if ((prot & protchk) == 0) { 3530 SEGVN_VMSTAT_FLTVNPAGES(8); 3531 err = FC_PROT; 3532 goto out; 3533 } 3534 } 3535 if (type == F_SOFTLOCK) { 3536 mutex_enter(&freemem_lock); 3537 if (availrmem < tune.t_minarmem + pages) { 3538 mutex_exit(&freemem_lock); 3539 err = FC_MAKE_ERR(ENOMEM); 3540 goto out; 3541 } else { 3542 availrmem -= pages; 3543 segvn_pages_locked += pages; 3544 svd->softlockcnt += pages; 3545 } 3546 mutex_exit(&freemem_lock); 3547 } 3548 3549 pplist = NULL; 3550 physcontig = 0; 3551 ppa[0] = NULL; 3552 if (!brkcow && szc && 3553 !page_exists_physcontig(vp, off, szc, 3554 segtype == MAP_PRIVATE ? 
ppa : NULL)) { 3555 SEGVN_VMSTAT_FLTVNPAGES(9); 3556 if (page_alloc_pages(vp, seg, a, &pplist, NULL, 3557 szc, 0) && type != F_SOFTLOCK) { 3558 SEGVN_VMSTAT_FLTVNPAGES(10); 3559 pszc = 0; 3560 ierr = -1; 3561 alloc_failed |= (1 << szc); 3562 break; 3563 } 3564 if (pplist != NULL && 3565 vp->v_mpssdata == SEGVN_PAGEIO) { 3566 int downsize; 3567 SEGVN_VMSTAT_FLTVNPAGES(11); 3568 physcontig = segvn_fill_vp_pages(svd, 3569 vp, off, szc, ppa, &pplist, 3570 &pszc, &downsize); 3571 ASSERT(!physcontig || pplist == NULL); 3572 if (!physcontig && downsize && 3573 type != F_SOFTLOCK) { 3574 ASSERT(pplist == NULL); 3575 SEGVN_VMSTAT_FLTVNPAGES(12); 3576 ierr = -1; 3577 break; 3578 } 3579 ASSERT(!physcontig || 3580 segtype == MAP_PRIVATE || 3581 ppa[0] == NULL); 3582 if (physcontig && ppa[0] == NULL) { 3583 physcontig = 0; 3584 } 3585 } 3586 } else if (!brkcow && szc && ppa[0] != NULL) { 3587 SEGVN_VMSTAT_FLTVNPAGES(13); 3588 ASSERT(segtype == MAP_PRIVATE); 3589 physcontig = 1; 3590 } 3591 3592 if (!physcontig) { 3593 SEGVN_VMSTAT_FLTVNPAGES(14); 3594 ppa[0] = NULL; 3595 ierr = VOP_GETPAGE(vp, (offset_t)off, pgsz, 3596 &vpprot, ppa, pgsz, seg, a, arw, 3597 svd->cred); 3598 if (segtype == MAP_PRIVATE) { 3599 SEGVN_VMSTAT_FLTVNPAGES(15); 3600 vpprot &= ~PROT_WRITE; 3601 } 3602 } else { 3603 ASSERT(segtype == MAP_PRIVATE); 3604 SEGVN_VMSTAT_FLTVNPAGES(16); 3605 vpprot = PROT_ALL & ~PROT_WRITE; 3606 ierr = 0; 3607 } 3608 3609 if (ierr != 0) { 3610 SEGVN_VMSTAT_FLTVNPAGES(17); 3611 if (pplist != NULL) { 3612 SEGVN_VMSTAT_FLTVNPAGES(18); 3613 page_free_replacement_page(pplist); 3614 page_create_putback(pages); 3615 } 3616 SEGVN_RESTORE_SOFTLOCK(type, pages); 3617 if (a + pgsz <= eaddr) { 3618 SEGVN_VMSTAT_FLTVNPAGES(19); 3619 err = FC_MAKE_ERR(ierr); 3620 goto out; 3621 } 3622 va.va_mask = AT_SIZE; 3623 if (VOP_GETATTR(vp, &va, 0, svd->cred) != 0) { 3624 SEGVN_VMSTAT_FLTVNPAGES(20); 3625 err = FC_MAKE_ERR(EIO); 3626 goto out; 3627 } 3628 if (btopr(va.va_size) >= btopr(off + pgsz)) { 3629 SEGVN_VMSTAT_FLTVNPAGES(21); 3630 err = FC_MAKE_ERR(ierr); 3631 goto out; 3632 } 3633 if (btopr(va.va_size) < 3634 btopr(off + (eaddr - a))) { 3635 SEGVN_VMSTAT_FLTVNPAGES(22); 3636 err = FC_MAKE_ERR(ierr); 3637 goto out; 3638 } 3639 if (brkcow || type == F_SOFTLOCK) { 3640 /* can't reduce map area */ 3641 SEGVN_VMSTAT_FLTVNPAGES(23); 3642 vop_size_err = 1; 3643 goto out; 3644 } 3645 SEGVN_VMSTAT_FLTVNPAGES(24); 3646 ASSERT(szc != 0); 3647 pszc = 0; 3648 ierr = -1; 3649 break; 3650 } 3651 3652 if (amp != NULL) { 3653 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 3654 anon_array_enter(amp, aindx, &an_cookie); 3655 } 3656 if (amp != NULL && 3657 anon_get_ptr(amp->ahp, aindx) != NULL) { 3658 ulong_t taindx = P2ALIGN(aindx, maxpages); 3659 3660 SEGVN_VMSTAT_FLTVNPAGES(25); 3661 if (anon_pages(amp->ahp, taindx, maxpages) != 3662 maxpages) { 3663 panic("segvn_fault_vnodepages:" 3664 " empty anon slots\n"); 3665 } 3666 for (i = 0; i < pages; i++) { 3667 page_unlock(ppa[i]); 3668 } 3669 anon_array_exit(&an_cookie); 3670 ANON_LOCK_EXIT(&->a_rwlock); 3671 if (pplist != NULL) { 3672 page_free_replacement_page(pplist); 3673 page_create_putback(pages); 3674 } 3675 SEGVN_RESTORE_SOFTLOCK(type, pages); 3676 if (szc < seg->s_szc) { 3677 SEGVN_VMSTAT_FLTVNPAGES(26); 3678 /* 3679 * For private segments SOFTLOCK 3680 * either always breaks cow (any rw 3681 * type except S_READ_NOCOW) or 3682 * address space is locked as writer 3683 * (S_READ_NOCOW case) and anon slots 3684 * can't show up on second check. 
                     * Therefore if we are here for
                     * SOFTLOCK case it must be a cow
                     * break but cow break never reduces
                     * szc. Thus the assert below.
                     */
                    ASSERT(!brkcow && type != F_SOFTLOCK);
                    pszc = seg->s_szc;
                    ierr = -2;
                    break;
                }
                ASSERT(IS_P2ALIGNED(a, maxpgsz));
                goto again;
            }
#ifdef DEBUG
            if (amp != NULL) {
                ulong_t taindx = P2ALIGN(aindx, maxpages);
                ASSERT(!anon_pages(amp->ahp, taindx, maxpages));
            }
#endif /* DEBUG */

            if (brkcow) {
                ASSERT(amp != NULL);
                ASSERT(pplist == NULL);
                ASSERT(szc == seg->s_szc);
                ASSERT(IS_P2ALIGNED(a, maxpgsz));
                ASSERT(IS_P2ALIGNED(aindx, maxpages));
                SEGVN_VMSTAT_FLTVNPAGES(27);
                ierr = anon_map_privatepages(amp, aindx, szc,
                    seg, a, prot, ppa, vpage, segvn_anypgsz,
                    svd->cred);
                if (ierr != 0) {
                    SEGVN_VMSTAT_FLTVNPAGES(28);
                    anon_array_exit(&an_cookie);
                    ANON_LOCK_EXIT(&amp->a_rwlock);
                    SEGVN_RESTORE_SOFTLOCK(type, pages);
                    err = FC_MAKE_ERR(ierr);
                    goto out;
                }

                ASSERT(!IS_VMODSORT(ppa[0]->p_vnode));
                /*
                 * p_szc can't be changed for locked
                 * swapfs pages.
                 */
                hat_memload_array(hat, a, pgsz, ppa, prot,
                    hat_flag);

                if (!(hat_flag & HAT_LOAD_LOCK)) {
                    SEGVN_VMSTAT_FLTVNPAGES(29);
                    for (i = 0; i < pages; i++) {
                        page_unlock(ppa[i]);
                    }
                }
                anon_array_exit(&an_cookie);
                ANON_LOCK_EXIT(&amp->a_rwlock);
                goto next;
            }

            pfn = page_pptonum(ppa[0]);
            /*
             * hat_page_demote() needs an EXCL lock on one of the
             * constituent page_t's and it decreases root's p_szc
             * last. This means if root's p_szc is equal szc and
             * all its constituent pages are locked,
             * hat_page_demote() that could have changed p_szc to
             * szc is already done and no new hat_page_demote()
             * can start for this large page.
             */

            /*
             * we need to make sure same mapping size is used for
             * the same address range if there's a possibility the
             * address is already mapped because hat layer panics
             * when translation is loaded for the range already
             * mapped with a different page size. We achieve it
             * by always using largest page size possible subject
             * to the constraints of page size, segment page size
             * and page alignment. Since mappings are invalidated
             * when those constraints change, making it impossible
             * to use a previously used mapping size, no mapping
             * size conflicts should happen.
             */

        chkszc:
            if ((pszc = ppa[0]->p_szc) == szc &&
                IS_P2ALIGNED(pfn, pages)) {

                SEGVN_VMSTAT_FLTVNPAGES(30);
#ifdef DEBUG
                for (i = 0; i < pages; i++) {
                    ASSERT(PAGE_LOCKED(ppa[i]));
                    ASSERT(!PP_ISFREE(ppa[i]));
                    ASSERT(page_pptonum(ppa[i]) ==
                        pfn + i);
                    ASSERT(ppa[i]->p_szc == szc);
                    ASSERT(ppa[i]->p_vnode == vp);
                    ASSERT(ppa[i]->p_offset ==
                        off + (i << PAGESHIFT));
                }
#endif /* DEBUG */
                /*
                 * All pages are of the szc we need and they are
                 * all locked so they can't change szc. Load
                 * translations.
                 *
                 * if page got promoted since last check
                 * we don't need pplist.
3792 */ 3793 if (pplist != NULL) { 3794 page_free_replacement_page(pplist); 3795 page_create_putback(pages); 3796 } 3797 if (PP_ISMIGRATE(ppa[0])) { 3798 page_migrate(seg, a, ppa, pages); 3799 } 3800 SEGVN_UPDATE_MODBITS(ppa, pages, rw, 3801 prot, vpprot); 3802 if (!xhat) { 3803 hat_memload_array(hat, a, pgsz, ppa, 3804 prot & vpprot, hat_flag); 3805 } else { 3806 /* 3807 * avoid large xhat mappings to FS 3808 * pages so that hat_page_demote() 3809 * doesn't need to check for xhat 3810 * large mappings. 3811 */ 3812 for (i = 0; i < pages; i++) { 3813 hat_memload(hat, 3814 a + (i << PAGESHIFT), 3815 ppa[i], prot & vpprot, 3816 hat_flag); 3817 } 3818 } 3819 3820 if (!(hat_flag & HAT_LOAD_LOCK)) { 3821 for (i = 0; i < pages; i++) { 3822 page_unlock(ppa[i]); 3823 } 3824 } 3825 if (amp != NULL) { 3826 anon_array_exit(&an_cookie); 3827 ANON_LOCK_EXIT(&->a_rwlock); 3828 } 3829 goto next; 3830 } 3831 3832 /* 3833 * See if upsize is possible. 3834 */ 3835 if (pszc > szc && szc < seg->s_szc && 3836 (segvn_anypgsz_vnode || pszc >= seg->s_szc)) { 3837 pgcnt_t aphase; 3838 uint_t pszc1 = MIN(pszc, seg->s_szc); 3839 ppgsz = page_get_pagesize(pszc1); 3840 ppages = btop(ppgsz); 3841 aphase = btop(P2PHASE((uintptr_t)a, ppgsz)); 3842 3843 ASSERT(type != F_SOFTLOCK); 3844 3845 SEGVN_VMSTAT_FLTVNPAGES(31); 3846 if (aphase != P2PHASE(pfn, ppages)) { 3847 segvn_faultvnmpss_align_err4++; 3848 } else { 3849 SEGVN_VMSTAT_FLTVNPAGES(32); 3850 if (pplist != NULL) { 3851 page_t *pl = pplist; 3852 page_free_replacement_page(pl); 3853 page_create_putback(pages); 3854 } 3855 for (i = 0; i < pages; i++) { 3856 page_unlock(ppa[i]); 3857 } 3858 if (amp != NULL) { 3859 anon_array_exit(&an_cookie); 3860 ANON_LOCK_EXIT(&->a_rwlock); 3861 } 3862 pszc = pszc1; 3863 ierr = -2; 3864 break; 3865 } 3866 } 3867 3868 /* 3869 * check if we should use smallest mapping size. 3870 */ 3871 upgrdfail = 0; 3872 if (szc == 0 || xhat || 3873 (pszc >= szc && 3874 !IS_P2ALIGNED(pfn, pages)) || 3875 (pszc < szc && 3876 !segvn_full_szcpages(ppa, szc, &upgrdfail, 3877 &pszc))) { 3878 3879 if (upgrdfail && type != F_SOFTLOCK) { 3880 /* 3881 * segvn_full_szcpages failed to lock 3882 * all pages EXCL. Size down. 3883 */ 3884 ASSERT(pszc < szc); 3885 3886 SEGVN_VMSTAT_FLTVNPAGES(33); 3887 3888 if (pplist != NULL) { 3889 page_t *pl = pplist; 3890 page_free_replacement_page(pl); 3891 page_create_putback(pages); 3892 } 3893 3894 for (i = 0; i < pages; i++) { 3895 page_unlock(ppa[i]); 3896 } 3897 if (amp != NULL) { 3898 anon_array_exit(&an_cookie); 3899 ANON_LOCK_EXIT(&->a_rwlock); 3900 } 3901 ierr = -1; 3902 break; 3903 } 3904 if (szc != 0 && !xhat) { 3905 segvn_faultvnmpss_align_err5++; 3906 } 3907 SEGVN_VMSTAT_FLTVNPAGES(34); 3908 if (pplist != NULL) { 3909 page_free_replacement_page(pplist); 3910 page_create_putback(pages); 3911 } 3912 SEGVN_UPDATE_MODBITS(ppa, pages, rw, 3913 prot, vpprot); 3914 if (upgrdfail && segvn_anypgsz_vnode) { 3915 /* SOFTLOCK case */ 3916 hat_memload_array(hat, a, pgsz, 3917 ppa, prot & vpprot, hat_flag); 3918 } else { 3919 for (i = 0; i < pages; i++) { 3920 hat_memload(hat, 3921 a + (i << PAGESHIFT), 3922 ppa[i], prot & vpprot, 3923 hat_flag); 3924 } 3925 } 3926 if (!(hat_flag & HAT_LOAD_LOCK)) { 3927 for (i = 0; i < pages; i++) { 3928 page_unlock(ppa[i]); 3929 } 3930 } 3931 if (amp != NULL) { 3932 anon_array_exit(&an_cookie); 3933 ANON_LOCK_EXIT(&->a_rwlock); 3934 } 3935 goto next; 3936 } 3937 3938 if (pszc == szc) { 3939 /* 3940 * segvn_full_szcpages() upgraded pages szc. 
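                 * The promotion leaves the pages at size szc (and
                 * downgraded back to shared locks), so the recheck
                 * at chkszc will now take the pszc == szc branch
                 * and load the large translation directly.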
                 */
                ASSERT(pszc == ppa[0]->p_szc);
                ASSERT(IS_P2ALIGNED(pfn, pages));
                goto chkszc;
            }

            if (pszc > szc) {
                kmutex_t *szcmtx;
                SEGVN_VMSTAT_FLTVNPAGES(35);
                /*
                 * p_szc of ppa[0] can change since we haven't
                 * locked all constituent pages. Call
                 * page_lock_szc() to prevent szc changes.
                 * This should be a rare case that happens when
                 * multiple segments use a different page size
                 * to map the same file offsets.
                 */
                szcmtx = page_szc_lock(ppa[0]);
                pszc = ppa[0]->p_szc;
                ASSERT(szcmtx != NULL || pszc == 0);
                ASSERT(ppa[0]->p_szc <= pszc);
                if (pszc <= szc) {
                    SEGVN_VMSTAT_FLTVNPAGES(36);
                    if (szcmtx != NULL) {
                        mutex_exit(szcmtx);
                    }
                    goto chkszc;
                }
                if (pplist != NULL) {
                    /*
                     * page got promoted since last check.
                     * we don't need the preallocated large
                     * page.
                     */
                    SEGVN_VMSTAT_FLTVNPAGES(37);
                    page_free_replacement_page(pplist);
                    page_create_putback(pages);
                }
                SEGVN_UPDATE_MODBITS(ppa, pages, rw,
                    prot, vpprot);
                hat_memload_array(hat, a, pgsz, ppa,
                    prot & vpprot, hat_flag);
                mutex_exit(szcmtx);
                if (!(hat_flag & HAT_LOAD_LOCK)) {
                    for (i = 0; i < pages; i++) {
                        page_unlock(ppa[i]);
                    }
                }
                if (amp != NULL) {
                    anon_array_exit(&an_cookie);
                    ANON_LOCK_EXIT(&amp->a_rwlock);
                }
                goto next;
            }

            /*
             * if page got demoted since last check
             * we could have not allocated larger page.
             * allocate now.
             */
            if (pplist == NULL &&
                page_alloc_pages(vp, seg, a, &pplist, NULL,
                szc, 0) && type != F_SOFTLOCK) {
                SEGVN_VMSTAT_FLTVNPAGES(38);
                for (i = 0; i < pages; i++) {
                    page_unlock(ppa[i]);
                }
                if (amp != NULL) {
                    anon_array_exit(&an_cookie);
                    ANON_LOCK_EXIT(&amp->a_rwlock);
                }
                ierr = -1;
                alloc_failed |= (1 << szc);
                break;
            }

            SEGVN_VMSTAT_FLTVNPAGES(39);

            if (pplist != NULL) {
                segvn_relocate_pages(ppa, pplist);
#ifdef DEBUG
            } else {
                ASSERT(type == F_SOFTLOCK);
                SEGVN_VMSTAT_FLTVNPAGES(40);
#endif /* DEBUG */
            }

            SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot);

            if (pplist == NULL && segvn_anypgsz_vnode == 0) {
                ASSERT(type == F_SOFTLOCK);
                for (i = 0; i < pages; i++) {
                    ASSERT(ppa[i]->p_szc < szc);
                    hat_memload(hat, a + (i << PAGESHIFT),
                        ppa[i], prot & vpprot, hat_flag);
                }
            } else {
                ASSERT(pplist != NULL || type == F_SOFTLOCK);
                hat_memload_array(hat, a, pgsz, ppa,
                    prot & vpprot, hat_flag);
            }
            if (!(hat_flag & HAT_LOAD_LOCK)) {
                for (i = 0; i < pages; i++) {
                    ASSERT(PAGE_SHARED(ppa[i]));
                    page_unlock(ppa[i]);
                }
            }
            if (amp != NULL) {
                anon_array_exit(&an_cookie);
                ANON_LOCK_EXIT(&amp->a_rwlock);
            }

        next:
            if (vpage != NULL) {
                vpage += pages;
            }
            adjszc_chk = 1;
        }
        if (a == lpgeaddr)
            break;
        ASSERT(a < lpgeaddr);

        ASSERT(!brkcow && type != F_SOFTLOCK);

        /*
         * ierr == -1 means we failed to map with a large page
         * (either due to allocation/relocation failures or
         * misalignment with other mappings to this file).
         *
         * ierr == -2 means some other thread allocated a large page
         * after we gave up trying to map with a large page. retry with
         * larger mapping.
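         *
         * The retry logic below then re-sizes the mapping window:
         * for ierr == -2 it sizes up (szc = pszc), re-aligns 'a' down
         * to the new pgsz and recomputes off, aindx and vpage; for
         * ierr == -1 it sizes down (szc-- when segvn_anypgsz_vnode is
         * set, otherwise all the way to szc = 0) and, if nothing has
         * been faulted yet, may pull 'a' right up toward addr.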
4073 */ 4074 ASSERT(ierr == -1 || ierr == -2); 4075 ASSERT(ierr == -2 || szc != 0); 4076 ASSERT(ierr == -1 || szc < seg->s_szc); 4077 if (ierr == -2) { 4078 SEGVN_VMSTAT_FLTVNPAGES(41); 4079 ASSERT(pszc > szc && pszc <= seg->s_szc); 4080 szc = pszc; 4081 } else if (segvn_anypgsz_vnode) { 4082 SEGVN_VMSTAT_FLTVNPAGES(42); 4083 szc--; 4084 } else { 4085 SEGVN_VMSTAT_FLTVNPAGES(43); 4086 ASSERT(pszc < szc); 4087 /* 4088 * other process created pszc large page. 4089 * but we still have to drop to 0 szc. 4090 */ 4091 szc = 0; 4092 } 4093 4094 pgsz = page_get_pagesize(szc); 4095 pages = btop(pgsz); 4096 if (ierr == -2) { 4097 /* 4098 * Size up case. Note lpgaddr may only be needed for 4099 * softlock case so we don't adjust it here. 4100 */ 4101 a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz); 4102 ASSERT(a >= lpgaddr); 4103 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4104 off = svd->offset + (uintptr_t)(a - seg->s_base); 4105 aindx = svd->anon_index + seg_page(seg, a); 4106 vpage = (svd->vpage != NULL) ? 4107 &svd->vpage[seg_page(seg, a)] : NULL; 4108 } else { 4109 /* 4110 * Size down case. Note lpgaddr may only be needed for 4111 * softlock case so we don't adjust it here. 4112 */ 4113 ASSERT(IS_P2ALIGNED(a, pgsz)); 4114 ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz)); 4115 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4116 ASSERT(a < lpgeaddr); 4117 if (a < addr) { 4118 SEGVN_VMSTAT_FLTVNPAGES(44); 4119 /* 4120 * The beginning of the large page region can 4121 * be pulled to the right to make a smaller 4122 * region. We haven't yet faulted a single 4123 * page. 4124 */ 4125 a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 4126 ASSERT(a >= lpgaddr); 4127 off = svd->offset + 4128 (uintptr_t)(a - seg->s_base); 4129 aindx = svd->anon_index + seg_page(seg, a); 4130 vpage = (svd->vpage != NULL) ? 4131 &svd->vpage[seg_page(seg, a)] : NULL; 4132 } 4133 } 4134 } 4135 out: 4136 kmem_free(ppa, ppasize); 4137 if (!err && !vop_size_err) { 4138 SEGVN_VMSTAT_FLTVNPAGES(45); 4139 return (0); 4140 } 4141 if (type == F_SOFTLOCK && a > lpgaddr) { 4142 SEGVN_VMSTAT_FLTVNPAGES(46); 4143 segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER); 4144 } 4145 if (!vop_size_err) { 4146 SEGVN_VMSTAT_FLTVNPAGES(47); 4147 return (err); 4148 } 4149 ASSERT(brkcow || type == F_SOFTLOCK); 4150 /* 4151 * Large page end is mapped beyond the end of file and it's a cow 4152 * fault or softlock so we can't reduce the map area. For now just 4153 * demote the segment. This should really only happen if the end of 4154 * the file changed after the mapping was established since when large 4155 * page segments are created we make sure they don't extend beyond the 4156 * end of the file. 4157 */ 4158 SEGVN_VMSTAT_FLTVNPAGES(48); 4159 4160 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4161 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4162 err = 0; 4163 if (seg->s_szc != 0) { 4164 segvn_fltvnpages_clrszc_cnt++; 4165 ASSERT(svd->softlockcnt == 0); 4166 err = segvn_clrszc(seg); 4167 if (err != 0) { 4168 segvn_fltvnpages_clrszc_err++; 4169 } 4170 } 4171 ASSERT(err || seg->s_szc == 0); 4172 SEGVN_LOCK_DOWNGRADE(seg->s_as, &svd->lock); 4173 /* segvn_fault will do its job as if szc had been zero to begin with */ 4174 return (err == 0 ? IE_RETRY : FC_MAKE_ERR(err)); 4175 } 4176 4177 /* 4178 * This routine will attempt to fault in one large page. 4179 * it will use smaller pages if that fails. 4180 * It should only be called for pure anonymous segments. 
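 *
 * The [lpgaddr, lpgeaddr) range passed in by segvn_fault() is the
 * faulting range rounded out to the segment's large page size;
 * conceptually this is a simplified sketch of what CALC_LPG_REGION
 * computes for a large page segment (ignoring its segment boundary
 * asserts):
 *
 *	lpgaddr  = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz);
 *	lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)(addr + len), pgsz);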
4181 */ 4182 static faultcode_t 4183 segvn_fault_anonpages(struct hat *hat, struct seg *seg, caddr_t lpgaddr, 4184 caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr, 4185 caddr_t eaddr, int brkcow) 4186 { 4187 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 4188 struct anon_map *amp = svd->amp; 4189 uchar_t segtype = svd->type; 4190 uint_t szc = seg->s_szc; 4191 size_t pgsz = page_get_pagesize(szc); 4192 size_t maxpgsz = pgsz; 4193 pgcnt_t pages = btop(pgsz); 4194 size_t ppasize = pages * sizeof (page_t *); 4195 caddr_t a = lpgaddr; 4196 ulong_t aindx = svd->anon_index + seg_page(seg, a); 4197 struct vpage *vpage = (svd->vpage != NULL) ? 4198 &svd->vpage[seg_page(seg, a)] : NULL; 4199 page_t **ppa; 4200 uint_t ppa_szc; 4201 faultcode_t err; 4202 int ierr; 4203 uint_t protchk, prot, vpprot; 4204 ulong_t i; 4205 int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD; 4206 anon_sync_obj_t cookie; 4207 int first = 1; 4208 int adjszc_chk; 4209 int purged = 0; 4210 4211 ASSERT(szc != 0); 4212 ASSERT(amp != NULL); 4213 ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */ 4214 ASSERT(!(svd->flags & MAP_NORESERVE)); 4215 ASSERT(type != F_SOFTUNLOCK); 4216 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 4217 4218 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 4219 4220 VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltanpages[0]); 4221 VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltanpages[1]); 4222 4223 if (svd->flags & MAP_TEXT) { 4224 hat_flag |= HAT_LOAD_TEXT; 4225 } 4226 4227 if (svd->pageprot) { 4228 switch (rw) { 4229 case S_READ: 4230 protchk = PROT_READ; 4231 break; 4232 case S_WRITE: 4233 protchk = PROT_WRITE; 4234 break; 4235 case S_EXEC: 4236 protchk = PROT_EXEC; 4237 break; 4238 case S_OTHER: 4239 default: 4240 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 4241 break; 4242 } 4243 VM_STAT_ADD(segvnvmstats.fltanpages[2]); 4244 } else { 4245 prot = svd->prot; 4246 /* caller has already done segment level protection check. 
*/ 4247 } 4248 4249 ppa = kmem_alloc(ppasize, KM_SLEEP); 4250 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4251 for (;;) { 4252 adjszc_chk = 0; 4253 for (; a < lpgeaddr; a += pgsz, aindx += pages) { 4254 if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) { 4255 VM_STAT_ADD(segvnvmstats.fltanpages[3]); 4256 ASSERT(vpage != NULL); 4257 prot = VPP_PROT(vpage); 4258 ASSERT(sameprot(seg, a, maxpgsz)); 4259 if ((prot & protchk) == 0) { 4260 err = FC_PROT; 4261 goto error; 4262 } 4263 } 4264 if (adjszc_chk && IS_P2ALIGNED(a, maxpgsz) && 4265 pgsz < maxpgsz) { 4266 ASSERT(a > lpgaddr); 4267 szc = seg->s_szc; 4268 pgsz = maxpgsz; 4269 pages = btop(pgsz); 4270 ASSERT(IS_P2ALIGNED(aindx, pages)); 4271 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, 4272 pgsz); 4273 } 4274 if (type == F_SOFTLOCK && svd->vp != NULL) { 4275 mutex_enter(&freemem_lock); 4276 if (availrmem < tune.t_minarmem + pages) { 4277 mutex_exit(&freemem_lock); 4278 err = FC_MAKE_ERR(ENOMEM); 4279 goto error; 4280 } else { 4281 availrmem -= pages; 4282 segvn_pages_locked += pages; 4283 svd->softlockcnt += pages; 4284 } 4285 mutex_exit(&freemem_lock); 4286 } 4287 anon_array_enter(amp, aindx, &cookie); 4288 ppa_szc = (uint_t)-1; 4289 ierr = anon_map_getpages(amp, aindx, szc, seg, a, 4290 prot, &vpprot, ppa, &ppa_szc, vpage, rw, brkcow, 4291 segvn_anypgsz, svd->cred); 4292 if (ierr != 0) { 4293 anon_array_exit(&cookie); 4294 VM_STAT_ADD(segvnvmstats.fltanpages[4]); 4295 if (type == F_SOFTLOCK && svd->vp != NULL) { 4296 VM_STAT_ADD(segvnvmstats.fltanpages[5]); 4297 mutex_enter(&freemem_lock); 4298 availrmem += pages; 4299 segvn_pages_locked -= pages; 4300 svd->softlockcnt -= pages; 4301 mutex_exit(&freemem_lock); 4302 } 4303 if (ierr > 0) { 4304 VM_STAT_ADD(segvnvmstats.fltanpages[6]); 4305 err = FC_MAKE_ERR(ierr); 4306 goto error; 4307 } 4308 break; 4309 } 4310 4311 ASSERT(!IS_VMODSORT(ppa[0]->p_vnode)); 4312 4313 ASSERT(segtype == MAP_SHARED || 4314 ppa[0]->p_szc <= szc); 4315 ASSERT(segtype == MAP_PRIVATE || 4316 ppa[0]->p_szc >= szc); 4317 4318 /* 4319 * Handle pages that have been marked for migration 4320 */ 4321 if (lgrp_optimizations()) 4322 page_migrate(seg, a, ppa, pages); 4323 4324 if (type == F_SOFTLOCK && svd->vp == NULL) { 4325 /* 4326 * All pages in ppa array belong to the same 4327 * large page. This means it's ok to call 4328 * segvn_pp_lock_anonpages just for ppa[0]. 4329 */ 4330 if (!segvn_pp_lock_anonpages(ppa[0], first)) { 4331 for (i = 0; i < pages; i++) { 4332 page_unlock(ppa[i]); 4333 } 4334 err = FC_MAKE_ERR(ENOMEM); 4335 goto error; 4336 } 4337 first = 0; 4338 mutex_enter(&freemem_lock); 4339 svd->softlockcnt += pages; 4340 segvn_pages_locked += pages; 4341 mutex_exit(&freemem_lock); 4342 } 4343 4344 if (segtype == MAP_SHARED) { 4345 vpprot |= PROT_WRITE; 4346 } 4347 4348 hat_memload_array(hat, a, pgsz, ppa, 4349 prot & vpprot, hat_flag); 4350 4351 if (hat_flag & HAT_LOAD_LOCK) { 4352 VM_STAT_ADD(segvnvmstats.fltanpages[7]); 4353 } else { 4354 VM_STAT_ADD(segvnvmstats.fltanpages[8]); 4355 for (i = 0; i < pages; i++) 4356 page_unlock(ppa[i]); 4357 } 4358 if (vpage != NULL) 4359 vpage += pages; 4360 4361 anon_array_exit(&cookie); 4362 adjszc_chk = 1; 4363 } 4364 if (a == lpgeaddr) 4365 break; 4366 ASSERT(a < lpgeaddr); 4367 /* 4368 * ierr == -1 means we failed to allocate a large page. 4369 * so do a size down operation. 4370 * 4371 * ierr == -2 means some other process that privately shares 4372 * pages with this process has allocated a larger page and we 4373 * need to retry with larger pages. 
So do a size up 4374 * operation. This relies on the fact that large pages are 4375 * never partially shared i.e. if we share any constituent 4376 * page of a large page with another process we must share the 4377 * entire large page. Note this cannot happen for SOFTLOCK 4378 * case, unless current address (a) is at the beginning of the 4379 * next page size boundary because the other process couldn't 4380 * have relocated locked pages. 4381 */ 4382 ASSERT(ierr == -1 || ierr == -2); 4383 /* 4384 * For the very first relocation failure try to purge this 4385 * segment's cache so that the relocator can obtain an 4386 * exclusive lock on pages we want to relocate. 4387 */ 4388 if (!purged && ierr == -1 && ppa_szc != (uint_t)-1 && 4389 svd->softlockcnt != 0) { 4390 purged = 1; 4391 segvn_purge(seg); 4392 continue; 4393 } 4394 4395 if (segvn_anypgsz) { 4396 ASSERT(ierr == -2 || szc != 0); 4397 ASSERT(ierr == -1 || szc < seg->s_szc); 4398 szc = (ierr == -1) ? szc - 1 : szc + 1; 4399 } else { 4400 /* 4401 * For non COW faults and segvn_anypgsz == 0 4402 * we need to be careful not to loop forever 4403 * if existing page is found with szc other 4404 * than 0 or seg->s_szc. This could be due 4405 * to page relocations on behalf of DR or 4406 * more likely large page creation. For this 4407 * case simply re-size to existing page's szc 4408 * if returned by anon_map_getpages(). 4409 */ 4410 if (ppa_szc == (uint_t)-1) { 4411 szc = (ierr == -1) ? 0 : seg->s_szc; 4412 } else { 4413 ASSERT(ppa_szc <= seg->s_szc); 4414 ASSERT(ierr == -2 || ppa_szc < szc); 4415 ASSERT(ierr == -1 || ppa_szc > szc); 4416 szc = ppa_szc; 4417 } 4418 } 4419 4420 pgsz = page_get_pagesize(szc); 4421 pages = btop(pgsz); 4422 ASSERT(type != F_SOFTLOCK || ierr == -1 || 4423 (IS_P2ALIGNED(a, pgsz) && IS_P2ALIGNED(lpgeaddr, pgsz))); 4424 if (type == F_SOFTLOCK) { 4425 /* 4426 * For softlocks we cannot reduce the fault area 4427 * (calculated based on the largest page size for this 4428 * segment) for size down and a is already next 4429 * page size aligned as assertted above for size 4430 * ups. Therefore just continue in case of softlock. 4431 */ 4432 VM_STAT_ADD(segvnvmstats.fltanpages[9]); 4433 continue; /* keep lint happy */ 4434 } else if (ierr == -2) { 4435 4436 /* 4437 * Size up case. Note lpgaddr may only be needed for 4438 * softlock case so we don't adjust it here. 4439 */ 4440 VM_STAT_ADD(segvnvmstats.fltanpages[10]); 4441 a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz); 4442 ASSERT(a >= lpgaddr); 4443 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4444 aindx = svd->anon_index + seg_page(seg, a); 4445 vpage = (svd->vpage != NULL) ? 4446 &svd->vpage[seg_page(seg, a)] : NULL; 4447 } else { 4448 /* 4449 * Size down case. Note lpgaddr may only be needed for 4450 * softlock case so we don't adjust it here. 4451 */ 4452 VM_STAT_ADD(segvnvmstats.fltanpages[11]); 4453 ASSERT(IS_P2ALIGNED(a, pgsz)); 4454 ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz)); 4455 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4456 ASSERT(a < lpgeaddr); 4457 if (a < addr) { 4458 /* 4459 * The beginning of the large page region can 4460 * be pulled to the right to make a smaller 4461 * region. We haven't yet faulted a single 4462 * page. 4463 */ 4464 VM_STAT_ADD(segvnvmstats.fltanpages[12]); 4465 a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 4466 ASSERT(a >= lpgaddr); 4467 aindx = svd->anon_index + seg_page(seg, a); 4468 vpage = (svd->vpage != NULL) ? 
4469 &svd->vpage[seg_page(seg, a)] : NULL; 4470 } 4471 } 4472 } 4473 VM_STAT_ADD(segvnvmstats.fltanpages[13]); 4474 ANON_LOCK_EXIT(&->a_rwlock); 4475 kmem_free(ppa, ppasize); 4476 return (0); 4477 error: 4478 VM_STAT_ADD(segvnvmstats.fltanpages[14]); 4479 ANON_LOCK_EXIT(&->a_rwlock); 4480 kmem_free(ppa, ppasize); 4481 if (type == F_SOFTLOCK && a > lpgaddr) { 4482 VM_STAT_ADD(segvnvmstats.fltanpages[15]); 4483 segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER); 4484 } 4485 return (err); 4486 } 4487 4488 int fltadvice = 1; /* set to free behind pages for sequential access */ 4489 4490 /* 4491 * This routine is called via a machine specific fault handling routine. 4492 * It is also called by software routines wishing to lock or unlock 4493 * a range of addresses. 4494 * 4495 * Here is the basic algorithm: 4496 * If unlocking 4497 * Call segvn_softunlock 4498 * Return 4499 * endif 4500 * Checking and set up work 4501 * If we will need some non-anonymous pages 4502 * Call VOP_GETPAGE over the range of non-anonymous pages 4503 * endif 4504 * Loop over all addresses requested 4505 * Call segvn_faultpage passing in page list 4506 * to load up translations and handle anonymous pages 4507 * endloop 4508 * Load up translation to any additional pages in page list not 4509 * already handled that fit into this segment 4510 */ 4511 static faultcode_t 4512 segvn_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len, 4513 enum fault_type type, enum seg_rw rw) 4514 { 4515 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 4516 page_t **plp, **ppp, *pp; 4517 u_offset_t off; 4518 caddr_t a; 4519 struct vpage *vpage; 4520 uint_t vpprot, prot; 4521 int err; 4522 page_t *pl[PVN_GETPAGE_NUM + 1]; 4523 size_t plsz, pl_alloc_sz; 4524 size_t page; 4525 ulong_t anon_index; 4526 struct anon_map *amp; 4527 int dogetpage = 0; 4528 caddr_t lpgaddr, lpgeaddr; 4529 size_t pgsz; 4530 anon_sync_obj_t cookie; 4531 int brkcow = BREAK_COW_SHARE(rw, type, svd->type); 4532 4533 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 4534 4535 /* 4536 * First handle the easy stuff 4537 */ 4538 if (type == F_SOFTUNLOCK) { 4539 if (rw == S_READ_NOCOW) { 4540 rw = S_READ; 4541 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 4542 } 4543 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 4544 pgsz = (seg->s_szc == 0) ? PAGESIZE : 4545 page_get_pagesize(seg->s_szc); 4546 VM_STAT_COND_ADD(pgsz > PAGESIZE, segvnvmstats.fltanpages[16]); 4547 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 4548 segvn_softunlock(seg, lpgaddr, lpgeaddr - lpgaddr, rw); 4549 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4550 return (0); 4551 } 4552 4553 top: 4554 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 4555 4556 /* 4557 * If we have the same protections for the entire segment, 4558 * insure that the access being attempted is legitimate. 
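 *
 * For example, an S_WRITE fault against a segment whose protections
 * lack PROT_WRITE is rejected right below, before any page lookup or
 * translation work is done, essentially as:
 *
 *	if ((svd->prot & PROT_WRITE) == 0) {
 *		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
 *		return (FC_PROT);
 *	}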
4559 */ 4560 4561 if (svd->pageprot == 0) { 4562 uint_t protchk; 4563 4564 switch (rw) { 4565 case S_READ: 4566 case S_READ_NOCOW: 4567 protchk = PROT_READ; 4568 break; 4569 case S_WRITE: 4570 protchk = PROT_WRITE; 4571 break; 4572 case S_EXEC: 4573 protchk = PROT_EXEC; 4574 break; 4575 case S_OTHER: 4576 default: 4577 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 4578 break; 4579 } 4580 4581 if ((svd->prot & protchk) == 0) { 4582 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4583 return (FC_PROT); /* illegal access type */ 4584 } 4585 } 4586 4587 /* 4588 * We can't allow the long term use of softlocks for vmpss segments, 4589 * because in some file truncation cases we should be able to demote 4590 * the segment, which requires that there are no softlocks. The 4591 * only case where it's ok to allow a SOFTLOCK fault against a vmpss 4592 * segment is S_READ_NOCOW, where the caller holds the address space 4593 * locked as writer and calls softunlock before dropping the as lock. 4594 * S_READ_NOCOW is used by /proc to read memory from another user. 4595 * 4596 * Another deadlock between SOFTLOCK and file truncation can happen 4597 * because segvn_fault_vnodepages() calls the FS one pagesize at 4598 * a time. A second VOP_GETPAGE() call by segvn_fault_vnodepages() 4599 * can cause a deadlock because the first set of page_t's remain 4600 * locked SE_SHARED. To avoid this, we demote segments on a first 4601 * SOFTLOCK if they have a length greater than the segment's 4602 * page size. 4603 * 4604 * So for now, we only avoid demoting a segment on a SOFTLOCK when 4605 * the access type is S_READ_NOCOW and the fault length is less than 4606 * or equal to the segment's page size. While this is quite restrictive, 4607 * it should be the most common case of SOFTLOCK against a vmpss 4608 * segment. 4609 * 4610 * For S_READ_NOCOW, it's safe not to do a copy on write because the 4611 * caller makes sure no COW will be caused by another thread for a 4612 * softlocked page. 4613 */ 4614 if (type == F_SOFTLOCK && svd->vp != NULL && seg->s_szc != 0) { 4615 int demote = 0; 4616 4617 if (rw != S_READ_NOCOW) { 4618 demote = 1; 4619 } 4620 if (!demote && len > PAGESIZE) { 4621 pgsz = page_get_pagesize(seg->s_szc); 4622 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, 4623 lpgeaddr); 4624 if (lpgeaddr - lpgaddr > pgsz) { 4625 demote = 1; 4626 } 4627 } 4628 4629 ASSERT(demote || AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 4630 4631 if (demote) { 4632 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4633 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4634 if (seg->s_szc != 0) { 4635 segvn_vmpss_clrszc_cnt++; 4636 ASSERT(svd->softlockcnt == 0); 4637 err = segvn_clrszc(seg); 4638 if (err) { 4639 segvn_vmpss_clrszc_err++; 4640 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4641 return (FC_MAKE_ERR(err)); 4642 } 4643 } 4644 ASSERT(seg->s_szc == 0); 4645 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4646 goto top; 4647 } 4648 } 4649 4650 /* 4651 * Check to see if we need to allocate an anon_map structure. 4652 */ 4653 if (svd->amp == NULL && (svd->vp == NULL || brkcow)) { 4654 /* 4655 * Drop the "read" lock on the segment and acquire 4656 * the "write" version since we have to allocate the 4657 * anon_map. 
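 *
 * Since the reader lock cannot be upgraded in place, another thread
 * may allocate the anon_map in the window between dropping the
 * reader and reacquiring as writer, which is why the NULL check is
 * repeated under the writer lock below.  The general shape of the
 * pattern, with illustrative names only:
 *
 *	rw_exit(&lock);
 *	rw_enter(&lock, RW_WRITER);
 *	if (thing == NULL)
 *		thing = alloc_thing();
 *	rw_exit(&lock);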
4658 */ 4659 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4660 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4661 4662 if (svd->amp == NULL) { 4663 svd->amp = anonmap_alloc(seg->s_size, 0); 4664 svd->amp->a_szc = seg->s_szc; 4665 } 4666 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4667 4668 /* 4669 * Start all over again since segment protections 4670 * may have changed after we dropped the "read" lock. 4671 */ 4672 goto top; 4673 } 4674 4675 /* 4676 * S_READ_NOCOW vs S_READ distinction was 4677 * only needed for the code above. After 4678 * that we treat it as S_READ. 4679 */ 4680 if (rw == S_READ_NOCOW) { 4681 ASSERT(type == F_SOFTLOCK); 4682 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 4683 rw = S_READ; 4684 } 4685 4686 amp = svd->amp; 4687 4688 /* 4689 * MADV_SEQUENTIAL work is ignored for large page segments. 4690 */ 4691 if (seg->s_szc != 0) { 4692 pgsz = page_get_pagesize(seg->s_szc); 4693 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 4694 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 4695 if (svd->vp == NULL) { 4696 err = segvn_fault_anonpages(hat, seg, lpgaddr, 4697 lpgeaddr, type, rw, addr, addr + len, brkcow); 4698 } else { 4699 err = segvn_fault_vnodepages(hat, seg, lpgaddr, 4700 lpgeaddr, type, rw, addr, addr + len, brkcow); 4701 if (err == IE_RETRY) { 4702 ASSERT(seg->s_szc == 0); 4703 ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock)); 4704 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4705 goto top; 4706 } 4707 } 4708 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4709 return (err); 4710 } 4711 4712 page = seg_page(seg, addr); 4713 if (amp != NULL) { 4714 anon_index = svd->anon_index + page; 4715 4716 if ((type == F_PROT) && (rw == S_READ) && 4717 svd->type == MAP_PRIVATE && svd->pageprot == 0) { 4718 size_t index = anon_index; 4719 struct anon *ap; 4720 4721 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4722 /* 4723 * The fast path could apply to S_WRITE also, except 4724 * that the protection fault could be caused by lazy 4725 * tlb flush when ro->rw. In this case, the pte is 4726 * RW already. But RO in the other cpu's tlb causes 4727 * the fault. Since hat_chgprot won't do anything if 4728 * pte doesn't change, we may end up faulting 4729 * indefinitely until the RO tlb entry gets replaced. 4730 */ 4731 for (a = addr; a < addr + len; a += PAGESIZE, index++) { 4732 anon_array_enter(amp, index, &cookie); 4733 ap = anon_get_ptr(amp->ahp, index); 4734 anon_array_exit(&cookie); 4735 if ((ap == NULL) || (ap->an_refcnt != 1)) { 4736 ANON_LOCK_EXIT(&->a_rwlock); 4737 goto slow; 4738 } 4739 } 4740 hat_chgprot(seg->s_as->a_hat, addr, len, svd->prot); 4741 ANON_LOCK_EXIT(&->a_rwlock); 4742 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4743 return (0); 4744 } 4745 } 4746 slow: 4747 4748 if (svd->vpage == NULL) 4749 vpage = NULL; 4750 else 4751 vpage = &svd->vpage[page]; 4752 4753 off = svd->offset + (uintptr_t)(addr - seg->s_base); 4754 4755 /* 4756 * If MADV_SEQUENTIAL has been set for the particular page we 4757 * are faulting on, free behind all pages in the segment and put 4758 * them on the free list. 
4759 */ 4760 if ((page != 0) && fltadvice) { /* not if first page in segment */ 4761 struct vpage *vpp; 4762 ulong_t fanon_index; 4763 size_t fpage; 4764 u_offset_t pgoff, fpgoff; 4765 struct vnode *fvp; 4766 struct anon *fap = NULL; 4767 4768 if (svd->advice == MADV_SEQUENTIAL || 4769 (svd->pageadvice && 4770 VPP_ADVICE(vpage) == MADV_SEQUENTIAL)) { 4771 pgoff = off - PAGESIZE; 4772 fpage = page - 1; 4773 if (vpage != NULL) 4774 vpp = &svd->vpage[fpage]; 4775 if (amp != NULL) 4776 fanon_index = svd->anon_index + fpage; 4777 4778 while (pgoff > svd->offset) { 4779 if (svd->advice != MADV_SEQUENTIAL && 4780 (!svd->pageadvice || (vpage && 4781 VPP_ADVICE(vpp) != MADV_SEQUENTIAL))) 4782 break; 4783 4784 /* 4785 * If this is an anon page, we must find the 4786 * correct <vp, offset> for it 4787 */ 4788 fap = NULL; 4789 if (amp != NULL) { 4790 ANON_LOCK_ENTER(&->a_rwlock, 4791 RW_READER); 4792 anon_array_enter(amp, fanon_index, 4793 &cookie); 4794 fap = anon_get_ptr(amp->ahp, 4795 fanon_index); 4796 if (fap != NULL) { 4797 swap_xlate(fap, &fvp, &fpgoff); 4798 } else { 4799 fpgoff = pgoff; 4800 fvp = svd->vp; 4801 } 4802 anon_array_exit(&cookie); 4803 ANON_LOCK_EXIT(&->a_rwlock); 4804 } else { 4805 fpgoff = pgoff; 4806 fvp = svd->vp; 4807 } 4808 if (fvp == NULL) 4809 break; /* XXX */ 4810 /* 4811 * Skip pages that are free or have an 4812 * "exclusive" lock. 4813 */ 4814 pp = page_lookup_nowait(fvp, fpgoff, SE_SHARED); 4815 if (pp == NULL) 4816 break; 4817 /* 4818 * We don't need the page_struct_lock to test 4819 * as this is only advisory; even if we 4820 * acquire it someone might race in and lock 4821 * the page after we unlock and before the 4822 * PUTPAGE, then VOP_PUTPAGE will do nothing. 4823 */ 4824 if (pp->p_lckcnt == 0 && pp->p_cowcnt == 0) { 4825 /* 4826 * Hold the vnode before releasing 4827 * the page lock to prevent it from 4828 * being freed and re-used by some 4829 * other thread. 4830 */ 4831 VN_HOLD(fvp); 4832 page_unlock(pp); 4833 /* 4834 * We should build a page list 4835 * to kluster putpages XXX 4836 */ 4837 (void) VOP_PUTPAGE(fvp, 4838 (offset_t)fpgoff, PAGESIZE, 4839 (B_DONTNEED|B_FREE|B_ASYNC), 4840 svd->cred); 4841 VN_RELE(fvp); 4842 } else { 4843 /* 4844 * XXX - Should the loop terminate if 4845 * the page is `locked'? 4846 */ 4847 page_unlock(pp); 4848 } 4849 --vpp; 4850 --fanon_index; 4851 pgoff -= PAGESIZE; 4852 } 4853 } 4854 } 4855 4856 plp = pl; 4857 *plp = NULL; 4858 pl_alloc_sz = 0; 4859 4860 /* 4861 * See if we need to call VOP_GETPAGE for 4862 * *any* of the range being faulted on. 4863 * We can skip all of this work if there 4864 * was no original vnode. 4865 */ 4866 if (svd->vp != NULL) { 4867 u_offset_t vp_off; 4868 size_t vp_len; 4869 struct anon *ap; 4870 vnode_t *vp; 4871 4872 vp_off = off; 4873 vp_len = len; 4874 4875 if (amp == NULL) 4876 dogetpage = 1; 4877 else { 4878 /* 4879 * Only acquire reader lock to prevent amp->ahp 4880 * from being changed. 
It's ok to miss pages, 4881 * hence we don't do anon_array_enter 4882 */ 4883 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4884 ap = anon_get_ptr(amp->ahp, anon_index); 4885 4886 if (len <= PAGESIZE) 4887 /* inline non_anon() */ 4888 dogetpage = (ap == NULL); 4889 else 4890 dogetpage = non_anon(amp->ahp, anon_index, 4891 &vp_off, &vp_len); 4892 ANON_LOCK_EXIT(&->a_rwlock); 4893 } 4894 4895 if (dogetpage) { 4896 enum seg_rw arw; 4897 struct as *as = seg->s_as; 4898 4899 if (len > ptob((sizeof (pl) / sizeof (pl[0])) - 1)) { 4900 /* 4901 * Page list won't fit in local array, 4902 * allocate one of the needed size. 4903 */ 4904 pl_alloc_sz = 4905 (btop(len) + 1) * sizeof (page_t *); 4906 plp = kmem_alloc(pl_alloc_sz, KM_SLEEP); 4907 plp[0] = NULL; 4908 plsz = len; 4909 } else if (rw == S_WRITE && svd->type == MAP_PRIVATE || 4910 rw == S_OTHER || 4911 (((size_t)(addr + PAGESIZE) < 4912 (size_t)(seg->s_base + seg->s_size)) && 4913 hat_probe(as->a_hat, addr + PAGESIZE))) { 4914 /* 4915 * Ask VOP_GETPAGE to return the exact number 4916 * of pages if 4917 * (a) this is a COW fault, or 4918 * (b) this is a software fault, or 4919 * (c) next page is already mapped. 4920 */ 4921 plsz = len; 4922 } else { 4923 /* 4924 * Ask VOP_GETPAGE to return adjacent pages 4925 * within the segment. 4926 */ 4927 plsz = MIN((size_t)PVN_GETPAGE_SZ, (size_t) 4928 ((seg->s_base + seg->s_size) - addr)); 4929 ASSERT((addr + plsz) <= 4930 (seg->s_base + seg->s_size)); 4931 } 4932 4933 /* 4934 * Need to get some non-anonymous pages. 4935 * We need to make only one call to GETPAGE to do 4936 * this to prevent certain deadlocking conditions 4937 * when we are doing locking. In this case 4938 * non_anon() should have picked up the smallest 4939 * range which includes all the non-anonymous 4940 * pages in the requested range. We have to 4941 * be careful regarding which rw flag to pass in 4942 * because on a private mapping, the underlying 4943 * object is never allowed to be written. 4944 */ 4945 if (rw == S_WRITE && svd->type == MAP_PRIVATE) { 4946 arw = S_READ; 4947 } else { 4948 arw = rw; 4949 } 4950 vp = svd->vp; 4951 TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE, 4952 "segvn_getpage:seg %p addr %p vp %p", 4953 seg, addr, vp); 4954 err = VOP_GETPAGE(vp, (offset_t)vp_off, vp_len, 4955 &vpprot, plp, plsz, seg, addr + (vp_off - off), arw, 4956 svd->cred); 4957 if (err) { 4958 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4959 segvn_pagelist_rele(plp); 4960 if (pl_alloc_sz) 4961 kmem_free(plp, pl_alloc_sz); 4962 return (FC_MAKE_ERR(err)); 4963 } 4964 if (svd->type == MAP_PRIVATE) 4965 vpprot &= ~PROT_WRITE; 4966 } 4967 } 4968 4969 /* 4970 * N.B. at this time the plp array has all the needed non-anon 4971 * pages in addition to (possibly) having some adjacent pages. 4972 */ 4973 4974 /* 4975 * Always acquire the anon_array_lock to prevent 4976 * 2 threads from allocating separate anon slots for 4977 * the same "addr". 4978 * 4979 * If this is a copy-on-write fault and we don't already 4980 * have the anon_array_lock, acquire it to prevent the 4981 * fault routine from handling multiple copy-on-write faults 4982 * on the same "addr" in the same address space. 4983 * 4984 * Only one thread should deal with the fault since after 4985 * it is handled, the other threads can acquire a translation 4986 * to the newly created private page. This prevents two or 4987 * more threads from creating different private pages for the 4988 * same fault. 
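 *
 * (For example, if two threads write-fault the same MAP_PRIVATE
 * address at once, the anon_array_lock makes the second thread wait;
 * by the time it gets the lock an anon slot and private page already
 * exist, so it simply loads a translation to that page instead of
 * making a second copy.)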
4989 * 4990 * We grab "serialization" lock here if this is a MAP_PRIVATE segment 4991 * to prevent deadlock between this thread and another thread 4992 * which has soft-locked this page and wants to acquire serial_lock. 4993 * ( bug 4026339 ) 4994 * 4995 * The fix for bug 4026339 becomes unnecessary when using the 4996 * locking scheme with per amp rwlock and a global set of hash 4997 * lock, anon_array_lock. If we steal a vnode page when low 4998 * on memory and upgrad the page lock through page_rename, 4999 * then the page is PAGE_HANDLED, nothing needs to be done 5000 * for this page after returning from segvn_faultpage. 5001 * 5002 * But really, the page lock should be downgraded after 5003 * the stolen page is page_rename'd. 5004 */ 5005 5006 if (amp != NULL) 5007 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5008 5009 /* 5010 * Ok, now loop over the address range and handle faults 5011 */ 5012 for (a = addr; a < addr + len; a += PAGESIZE, off += PAGESIZE) { 5013 err = segvn_faultpage(hat, seg, a, off, vpage, plp, vpprot, 5014 type, rw, brkcow, a == addr); 5015 if (err) { 5016 if (amp != NULL) 5017 ANON_LOCK_EXIT(&->a_rwlock); 5018 if (type == F_SOFTLOCK && a > addr) { 5019 segvn_softunlock(seg, addr, (a - addr), 5020 S_OTHER); 5021 } 5022 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5023 segvn_pagelist_rele(plp); 5024 if (pl_alloc_sz) 5025 kmem_free(plp, pl_alloc_sz); 5026 return (err); 5027 } 5028 if (vpage) { 5029 vpage++; 5030 } else if (svd->vpage) { 5031 page = seg_page(seg, addr); 5032 vpage = &svd->vpage[++page]; 5033 } 5034 } 5035 5036 /* Didn't get pages from the underlying fs so we're done */ 5037 if (!dogetpage) 5038 goto done; 5039 5040 /* 5041 * Now handle any other pages in the list returned. 5042 * If the page can be used, load up the translations now. 5043 * Note that the for loop will only be entered if "plp" 5044 * is pointing to a non-NULL page pointer which means that 5045 * VOP_GETPAGE() was called and vpprot has been initialized. 5046 */ 5047 if (svd->pageprot == 0) 5048 prot = svd->prot & vpprot; 5049 5050 5051 /* 5052 * Large Files: diff should be unsigned value because we started 5053 * supporting > 2GB segment sizes from 2.5.1 and when a 5054 * large file of size > 2GB gets mapped to address space 5055 * the diff value can be > 2GB. 5056 */ 5057 5058 for (ppp = plp; (pp = *ppp) != NULL; ppp++) { 5059 size_t diff; 5060 struct anon *ap; 5061 int anon_index; 5062 anon_sync_obj_t cookie; 5063 int hat_flag = HAT_LOAD_ADV; 5064 5065 if (svd->flags & MAP_TEXT) { 5066 hat_flag |= HAT_LOAD_TEXT; 5067 } 5068 5069 if (pp == PAGE_HANDLED) 5070 continue; 5071 5072 if (pp->p_offset >= svd->offset && 5073 (pp->p_offset < svd->offset + seg->s_size)) { 5074 5075 diff = pp->p_offset - svd->offset; 5076 5077 /* 5078 * Large Files: Following is the assertion 5079 * validating the above cast. 5080 */ 5081 ASSERT(svd->vp == pp->p_vnode); 5082 5083 page = btop(diff); 5084 if (svd->pageprot) 5085 prot = VPP_PROT(&svd->vpage[page]) & vpprot; 5086 5087 /* 5088 * Prevent other threads in the address space from 5089 * creating private pages (i.e., allocating anon slots) 5090 * while we are in the process of loading translations 5091 * to additional pages returned by the underlying 5092 * object. 
5093 */ 5094 if (amp != NULL) { 5095 anon_index = svd->anon_index + page; 5096 anon_array_enter(amp, anon_index, &cookie); 5097 ap = anon_get_ptr(amp->ahp, anon_index); 5098 } 5099 if ((amp == NULL) || (ap == NULL)) { 5100 if (IS_VMODSORT(pp->p_vnode) || 5101 enable_mbit_wa) { 5102 if (rw == S_WRITE) 5103 hat_setmod(pp); 5104 else if (rw != S_OTHER && 5105 !hat_ismod(pp)) 5106 prot &= ~PROT_WRITE; 5107 } 5108 /* 5109 * Skip mapping read ahead pages marked 5110 * for migration, so they will get migrated 5111 * properly on fault 5112 */ 5113 if ((prot & PROT_READ) && !PP_ISMIGRATE(pp)) { 5114 hat_memload(hat, seg->s_base + diff, 5115 pp, prot, hat_flag); 5116 } 5117 } 5118 if (amp != NULL) 5119 anon_array_exit(&cookie); 5120 } 5121 page_unlock(pp); 5122 } 5123 done: 5124 if (amp != NULL) 5125 ANON_LOCK_EXIT(&->a_rwlock); 5126 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5127 if (pl_alloc_sz) 5128 kmem_free(plp, pl_alloc_sz); 5129 return (0); 5130 } 5131 5132 /* 5133 * This routine is used to start I/O on pages asynchronously. XXX it will 5134 * only create PAGESIZE pages. At fault time they will be relocated into 5135 * larger pages. 5136 */ 5137 static faultcode_t 5138 segvn_faulta(struct seg *seg, caddr_t addr) 5139 { 5140 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5141 int err; 5142 struct anon_map *amp; 5143 vnode_t *vp; 5144 5145 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5146 5147 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 5148 if ((amp = svd->amp) != NULL) { 5149 struct anon *ap; 5150 5151 /* 5152 * Reader lock to prevent amp->ahp from being changed. 5153 * This is advisory, it's ok to miss a page, so 5154 * we don't do anon_array_enter lock. 5155 */ 5156 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5157 if ((ap = anon_get_ptr(amp->ahp, 5158 svd->anon_index + seg_page(seg, addr))) != NULL) { 5159 5160 err = anon_getpage(&ap, NULL, NULL, 5161 0, seg, addr, S_READ, svd->cred); 5162 5163 ANON_LOCK_EXIT(&->a_rwlock); 5164 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5165 if (err) 5166 return (FC_MAKE_ERR(err)); 5167 return (0); 5168 } 5169 ANON_LOCK_EXIT(&->a_rwlock); 5170 } 5171 5172 if (svd->vp == NULL) { 5173 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5174 return (0); /* zfod page - do nothing now */ 5175 } 5176 5177 vp = svd->vp; 5178 TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE, 5179 "segvn_getpage:seg %p addr %p vp %p", seg, addr, vp); 5180 err = VOP_GETPAGE(vp, 5181 (offset_t)(svd->offset + (uintptr_t)(addr - seg->s_base)), 5182 PAGESIZE, NULL, NULL, 0, seg, addr, 5183 S_OTHER, svd->cred); 5184 5185 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5186 if (err) 5187 return (FC_MAKE_ERR(err)); 5188 return (0); 5189 } 5190 5191 static int 5192 segvn_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) 5193 { 5194 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5195 struct vpage *svp, *evp; 5196 struct vnode *vp; 5197 size_t pgsz; 5198 pgcnt_t pgcnt; 5199 anon_sync_obj_t cookie; 5200 5201 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5202 5203 if ((svd->maxprot & prot) != prot) 5204 return (EACCES); /* violated maxprot */ 5205 5206 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 5207 5208 /* return if prot is the same */ 5209 if (!svd->pageprot && svd->prot == prot) { 5210 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5211 return (0); 5212 } 5213 5214 /* 5215 * Since we change protections we first have to flush the cache. 5216 * This makes sure all the pagelock calls have to recheck 5217 * protections. 
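 *
 * (The shape of the check below: purge the per-segment pagelock
 * cache, and if softlockcnt is still nonzero there is outstanding
 * I/O against cached pagelocks, so the protection change fails with
 * EAGAIN rather than proceeding underneath that I/O.)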
5218 */ 5219 if (svd->softlockcnt > 0) { 5220 /* 5221 * Since we do have the segvn writers lock nobody can fill 5222 * the cache with entries belonging to this seg during 5223 * the purge. The flush either succeeds or we still have 5224 * pending I/Os. 5225 */ 5226 segvn_purge(seg); 5227 if (svd->softlockcnt > 0) { 5228 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5229 return (EAGAIN); 5230 } 5231 } 5232 5233 if (seg->s_szc != 0) { 5234 int err; 5235 pgsz = page_get_pagesize(seg->s_szc); 5236 pgcnt = pgsz >> PAGESHIFT; 5237 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 5238 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { 5239 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5240 ASSERT(seg->s_base != addr || seg->s_size != len); 5241 /* 5242 * If we are holding the as lock as a reader then 5243 * we need to return IE_RETRY and let the as 5244 * layer drop and re-aquire the lock as a writer. 5245 */ 5246 if (AS_READ_HELD(seg->s_as, &seg->s_as->a_lock)) 5247 return (IE_RETRY); 5248 VM_STAT_ADD(segvnvmstats.demoterange[1]); 5249 if (svd->type == MAP_PRIVATE || svd->vp != NULL) { 5250 err = segvn_demote_range(seg, addr, len, 5251 SDR_END, 0); 5252 } else { 5253 uint_t szcvec = map_shm_pgszcvec(seg->s_base, 5254 pgsz, (uintptr_t)seg->s_base); 5255 err = segvn_demote_range(seg, addr, len, 5256 SDR_END, szcvec); 5257 } 5258 if (err == 0) 5259 return (IE_RETRY); 5260 if (err == ENOMEM) 5261 return (IE_NOMEM); 5262 return (err); 5263 } 5264 } 5265 5266 5267 /* 5268 * If it's a private mapping and we're making it writable 5269 * and no swap space has been reserved, have to reserve 5270 * it all now. If it's a private mapping to a file (i.e., vp != NULL) 5271 * and we're removing write permission on the entire segment and 5272 * we haven't modified any pages, we can release the swap space. 5273 */ 5274 if (svd->type == MAP_PRIVATE) { 5275 if (prot & PROT_WRITE) { 5276 size_t sz; 5277 if (svd->swresv == 0 && !(svd->flags & MAP_NORESERVE)) { 5278 if (anon_resv(seg->s_size) == 0) { 5279 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5280 return (IE_NOMEM); 5281 } 5282 sz = svd->swresv = seg->s_size; 5283 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 5284 "anon proc:%p %lu %u", 5285 seg, sz, 1); 5286 } 5287 } else { 5288 /* 5289 * Swap space is released only if this segment 5290 * does not map anonymous memory, since read faults 5291 * on such segments still need an anon slot to read 5292 * in the data. 5293 */ 5294 if (svd->swresv != 0 && svd->vp != NULL && 5295 svd->amp == NULL && addr == seg->s_base && 5296 len == seg->s_size && svd->pageprot == 0) { 5297 anon_unresv(svd->swresv); 5298 svd->swresv = 0; 5299 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 5300 "anon proc:%p %lu %u", 5301 seg, 0, 0); 5302 } 5303 } 5304 } 5305 5306 if (addr == seg->s_base && len == seg->s_size && svd->pageprot == 0) { 5307 if (svd->prot == prot) { 5308 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5309 return (0); /* all done */ 5310 } 5311 svd->prot = (uchar_t)prot; 5312 } else if (svd->type == MAP_PRIVATE) { 5313 struct anon *ap = NULL; 5314 page_t *pp; 5315 u_offset_t offset, off; 5316 struct anon_map *amp; 5317 ulong_t anon_idx = 0; 5318 5319 /* 5320 * A vpage structure exists or else the change does not 5321 * involve the entire segment. Establish a vpage structure 5322 * if none is there. Then, for each page in the range, 5323 * adjust its individual permissions. Note that write- 5324 * enabling a MAP_PRIVATE page can affect the claims for 5325 * locked down memory. Overcommitting memory terminates 5326 * the operation. 
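 *
 * For a locked (VPP_ISPPLOCK) base page whose PROT_WRITE bit is
 * changing, the per-page claim accounting below is essentially the
 * following, with an illustrative "ok" flag (the real code also
 * resolves anon pages and handles large pages via
 * segvn_claim_pages()):
 *
 *	if ((VPP_PROT(svp) ^ prot) & PROT_WRITE) {
 *		ok = (prot & PROT_WRITE) ?
 *		    page_addclaim(pp) : page_subclaim(pp);
 *		if (!ok)
 *			break;			(overcommit: give up)
 *	}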
5327 */ 5328 segvn_vpage(seg); 5329 if ((amp = svd->amp) != NULL) { 5330 anon_idx = svd->anon_index + seg_page(seg, addr); 5331 ASSERT(seg->s_szc == 0 || 5332 IS_P2ALIGNED(anon_idx, pgcnt)); 5333 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5334 } 5335 5336 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 5337 evp = &svd->vpage[seg_page(seg, addr + len)]; 5338 5339 /* 5340 * See Statement at the beginning of segvn_lockop regarding 5341 * the way cowcnts and lckcnts are handled. 5342 */ 5343 for (svp = &svd->vpage[seg_page(seg, addr)]; svp < evp; svp++) { 5344 5345 if (seg->s_szc != 0) { 5346 if (amp != NULL) { 5347 anon_array_enter(amp, anon_idx, 5348 &cookie); 5349 } 5350 if (IS_P2ALIGNED(anon_idx, pgcnt) && 5351 !segvn_claim_pages(seg, svp, offset, 5352 anon_idx, prot)) { 5353 if (amp != NULL) { 5354 anon_array_exit(&cookie); 5355 } 5356 break; 5357 } 5358 if (amp != NULL) { 5359 anon_array_exit(&cookie); 5360 } 5361 anon_idx++; 5362 } else { 5363 if (amp != NULL) { 5364 anon_array_enter(amp, anon_idx, 5365 &cookie); 5366 ap = anon_get_ptr(amp->ahp, anon_idx++); 5367 } 5368 5369 if (VPP_ISPPLOCK(svp) && 5370 VPP_PROT(svp) != prot) { 5371 5372 if (amp == NULL || ap == NULL) { 5373 vp = svd->vp; 5374 off = offset; 5375 } else 5376 swap_xlate(ap, &vp, &off); 5377 if (amp != NULL) 5378 anon_array_exit(&cookie); 5379 5380 if ((pp = page_lookup(vp, off, 5381 SE_SHARED)) == NULL) { 5382 panic("segvn_setprot: no page"); 5383 /*NOTREACHED*/ 5384 } 5385 ASSERT(seg->s_szc == 0); 5386 if ((VPP_PROT(svp) ^ prot) & 5387 PROT_WRITE) { 5388 if (prot & PROT_WRITE) { 5389 if (!page_addclaim(pp)) { 5390 page_unlock(pp); 5391 break; 5392 } 5393 } else { 5394 if (!page_subclaim(pp)) { 5395 page_unlock(pp); 5396 break; 5397 } 5398 } 5399 } 5400 page_unlock(pp); 5401 } else if (amp != NULL) 5402 anon_array_exit(&cookie); 5403 } 5404 VPP_SETPROT(svp, prot); 5405 offset += PAGESIZE; 5406 } 5407 if (amp != NULL) 5408 ANON_LOCK_EXIT(&->a_rwlock); 5409 5410 /* 5411 * Did we terminate prematurely? If so, simply unload 5412 * the translations to the things we've updated so far. 5413 */ 5414 if (svp != evp) { 5415 len = (svp - &svd->vpage[seg_page(seg, addr)]) * 5416 PAGESIZE; 5417 ASSERT(seg->s_szc == 0 || IS_P2ALIGNED(len, pgsz)); 5418 if (len != 0) 5419 hat_unload(seg->s_as->a_hat, addr, 5420 len, HAT_UNLOAD); 5421 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5422 return (IE_NOMEM); 5423 } 5424 } else { 5425 segvn_vpage(seg); 5426 evp = &svd->vpage[seg_page(seg, addr + len)]; 5427 for (svp = &svd->vpage[seg_page(seg, addr)]; svp < evp; svp++) { 5428 VPP_SETPROT(svp, prot); 5429 } 5430 } 5431 5432 if (((prot & PROT_WRITE) != 0 && 5433 (svd->vp != NULL || svd->type == MAP_PRIVATE)) || 5434 (prot & ~PROT_USER) == PROT_NONE) { 5435 /* 5436 * Either private or shared data with write access (in 5437 * which case we need to throw out all former translations 5438 * so that we get the right translations set up on fault 5439 * and we don't allow write access to any copy-on-write pages 5440 * that might be around or to prevent write access to pages 5441 * representing holes in a file), or we don't have permission 5442 * to access the memory at all (in which case we have to 5443 * unload any current translations that might exist). 5444 */ 5445 hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD); 5446 } else { 5447 /* 5448 * A shared mapping or a private mapping in which write 5449 * protection is going to be denied - just change all the 5450 * protections over the range of addresses in question. 
5451 * segvn does not support any other attributes other 5452 * than prot so we can use hat_chgattr. 5453 */ 5454 hat_chgattr(seg->s_as->a_hat, addr, len, prot); 5455 } 5456 5457 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5458 5459 return (0); 5460 } 5461 5462 /* 5463 * segvn_setpagesize is called via SEGOP_SETPAGESIZE from as_setpagesize, 5464 * to determine if the seg is capable of mapping the requested szc. 5465 */ 5466 static int 5467 segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc) 5468 { 5469 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5470 struct segvn_data *nsvd; 5471 struct anon_map *amp = svd->amp; 5472 struct seg *nseg; 5473 caddr_t eaddr = addr + len, a; 5474 size_t pgsz = page_get_pagesize(szc); 5475 pgcnt_t pgcnt = page_get_pagecnt(szc); 5476 int err; 5477 u_offset_t off = svd->offset + (uintptr_t)(addr - seg->s_base); 5478 extern struct vnode kvp; 5479 5480 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 5481 ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size); 5482 5483 if (seg->s_szc == szc || segvn_lpg_disable != 0) { 5484 return (0); 5485 } 5486 5487 /* 5488 * addr should always be pgsz aligned but eaddr may be misaligned if 5489 * it's at the end of the segment. 5490 * 5491 * XXX we should assert this condition since as_setpagesize() logic 5492 * guarantees it. 5493 */ 5494 if (!IS_P2ALIGNED(addr, pgsz) || 5495 (!IS_P2ALIGNED(eaddr, pgsz) && 5496 eaddr != seg->s_base + seg->s_size)) { 5497 5498 segvn_setpgsz_align_err++; 5499 return (EINVAL); 5500 } 5501 5502 if (amp != NULL && svd->type == MAP_SHARED) { 5503 ulong_t an_idx = svd->anon_index + seg_page(seg, addr); 5504 if (!IS_P2ALIGNED(an_idx, pgcnt)) { 5505 5506 segvn_setpgsz_anon_align_err++; 5507 return (EINVAL); 5508 } 5509 } 5510 5511 if ((svd->flags & MAP_NORESERVE) || seg->s_as == &kas || 5512 szc > segvn_maxpgszc) { 5513 return (EINVAL); 5514 } 5515 5516 /* paranoid check */ 5517 if (svd->vp != NULL && 5518 (IS_SWAPFSVP(svd->vp) || svd->vp == &kvp)) { 5519 return (EINVAL); 5520 } 5521 5522 if (seg->s_szc == 0 && svd->vp != NULL && 5523 map_addr_vacalign_check(addr, off)) { 5524 return (EINVAL); 5525 } 5526 5527 /* 5528 * Check that protections are the same within new page 5529 * size boundaries. 5530 */ 5531 if (svd->pageprot) { 5532 for (a = addr; a < eaddr; a += pgsz) { 5533 if ((a + pgsz) > eaddr) { 5534 if (!sameprot(seg, a, eaddr - a)) { 5535 return (EINVAL); 5536 } 5537 } else { 5538 if (!sameprot(seg, a, pgsz)) { 5539 return (EINVAL); 5540 } 5541 } 5542 } 5543 } 5544 5545 /* 5546 * Since we are changing page size we first have to flush 5547 * the cache. This makes sure all the pagelock calls have 5548 * to recheck protections. 5549 */ 5550 if (svd->softlockcnt > 0) { 5551 /* 5552 * Since we do have the segvn writers lock nobody can fill 5553 * the cache with entries belonging to this seg during 5554 * the purge. The flush either succeeds or we still have 5555 * pending I/Os. 5556 */ 5557 segvn_purge(seg); 5558 if (svd->softlockcnt > 0) { 5559 return (EAGAIN); 5560 } 5561 } 5562 5563 /* 5564 * Operation for sub range of existing segment. 
5565 */ 5566 if (addr != seg->s_base || eaddr != (seg->s_base + seg->s_size)) { 5567 if (szc < seg->s_szc) { 5568 VM_STAT_ADD(segvnvmstats.demoterange[2]); 5569 err = segvn_demote_range(seg, addr, len, SDR_RANGE, 0); 5570 if (err == 0) { 5571 return (IE_RETRY); 5572 } 5573 if (err == ENOMEM) { 5574 return (IE_NOMEM); 5575 } 5576 return (err); 5577 } 5578 if (addr != seg->s_base) { 5579 nseg = segvn_split_seg(seg, addr); 5580 if (eaddr != (nseg->s_base + nseg->s_size)) { 5581 /* eaddr is szc aligned */ 5582 (void) segvn_split_seg(nseg, eaddr); 5583 } 5584 return (IE_RETRY); 5585 } 5586 if (eaddr != (seg->s_base + seg->s_size)) { 5587 /* eaddr is szc aligned */ 5588 (void) segvn_split_seg(seg, eaddr); 5589 } 5590 return (IE_RETRY); 5591 } 5592 5593 /* 5594 * Break any low level sharing and reset seg->s_szc to 0. 5595 */ 5596 if ((err = segvn_clrszc(seg)) != 0) { 5597 if (err == ENOMEM) { 5598 err = IE_NOMEM; 5599 } 5600 return (err); 5601 } 5602 ASSERT(seg->s_szc == 0); 5603 5604 /* 5605 * If the end of the current segment is not pgsz aligned 5606 * then attempt to concatenate with the next segment. 5607 */ 5608 if (!IS_P2ALIGNED(eaddr, pgsz)) { 5609 nseg = AS_SEGNEXT(seg->s_as, seg); 5610 if (nseg == NULL || nseg == seg || eaddr != nseg->s_base) { 5611 return (ENOMEM); 5612 } 5613 if (nseg->s_ops != &segvn_ops) { 5614 return (EINVAL); 5615 } 5616 nsvd = (struct segvn_data *)nseg->s_data; 5617 if (nsvd->softlockcnt > 0) { 5618 segvn_purge(nseg); 5619 if (nsvd->softlockcnt > 0) { 5620 return (EAGAIN); 5621 } 5622 } 5623 err = segvn_clrszc(nseg); 5624 if (err == ENOMEM) { 5625 err = IE_NOMEM; 5626 } 5627 if (err != 0) { 5628 return (err); 5629 } 5630 err = segvn_concat(seg, nseg, 1); 5631 if (err == -1) { 5632 return (EINVAL); 5633 } 5634 if (err == -2) { 5635 return (IE_NOMEM); 5636 } 5637 return (IE_RETRY); 5638 } 5639 5640 /* 5641 * May need to re-align anon array to 5642 * new szc. 5643 */ 5644 if (amp != NULL) { 5645 if (!IS_P2ALIGNED(svd->anon_index, pgcnt)) { 5646 struct anon_hdr *nahp; 5647 5648 ASSERT(svd->type == MAP_PRIVATE); 5649 5650 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 5651 ASSERT(amp->refcnt == 1); 5652 nahp = anon_create(btop(amp->size), ANON_NOSLEEP); 5653 if (nahp == NULL) { 5654 ANON_LOCK_EXIT(&->a_rwlock); 5655 return (IE_NOMEM); 5656 } 5657 if (anon_copy_ptr(amp->ahp, svd->anon_index, 5658 nahp, 0, btop(seg->s_size), ANON_NOSLEEP)) { 5659 anon_release(nahp, btop(amp->size)); 5660 ANON_LOCK_EXIT(&->a_rwlock); 5661 return (IE_NOMEM); 5662 } 5663 anon_release(amp->ahp, btop(amp->size)); 5664 amp->ahp = nahp; 5665 svd->anon_index = 0; 5666 ANON_LOCK_EXIT(&->a_rwlock); 5667 } 5668 } 5669 if (svd->vp != NULL && szc != 0) { 5670 struct vattr va; 5671 u_offset_t eoffpage = svd->offset; 5672 va.va_mask = AT_SIZE; 5673 eoffpage += seg->s_size; 5674 eoffpage = btopr(eoffpage); 5675 if (VOP_GETATTR(svd->vp, &va, 0, svd->cred) != 0) { 5676 segvn_setpgsz_getattr_err++; 5677 return (EINVAL); 5678 } 5679 if (btopr(va.va_size) < eoffpage) { 5680 segvn_setpgsz_eof_err++; 5681 return (EINVAL); 5682 } 5683 if (amp != NULL) { 5684 /* 5685 * anon_fill_cow_holes() may call VOP_GETPAGE(). 5686 * don't take anon map lock here to avoid holding it 5687 * across VOP_GETPAGE() calls that may call back into 5688 * segvn for klsutering checks. We don't really need 5689 * anon map lock here since it's a private segment and 5690 * we hold as level lock as writers. 
5691 */ 5692 if ((err = anon_fill_cow_holes(seg, seg->s_base, 5693 amp->ahp, svd->anon_index, svd->vp, svd->offset, 5694 seg->s_size, szc, svd->prot, svd->vpage, 5695 svd->cred)) != 0) { 5696 return (EINVAL); 5697 } 5698 } 5699 segvn_setvnode_mpss(svd->vp); 5700 } 5701 5702 if (amp != NULL) { 5703 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 5704 if (svd->type == MAP_PRIVATE) { 5705 amp->a_szc = szc; 5706 } else if (szc > amp->a_szc) { 5707 amp->a_szc = szc; 5708 } 5709 ANON_LOCK_EXIT(&->a_rwlock); 5710 } 5711 5712 seg->s_szc = szc; 5713 5714 return (0); 5715 } 5716 5717 static int 5718 segvn_clrszc(struct seg *seg) 5719 { 5720 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5721 struct anon_map *amp = svd->amp; 5722 size_t pgsz; 5723 pgcnt_t pages; 5724 int err = 0; 5725 caddr_t a = seg->s_base; 5726 caddr_t ea = a + seg->s_size; 5727 ulong_t an_idx = svd->anon_index; 5728 vnode_t *vp = svd->vp; 5729 struct vpage *vpage = svd->vpage; 5730 page_t *anon_pl[1 + 1], *pp; 5731 struct anon *ap, *oldap; 5732 uint_t prot = svd->prot, vpprot; 5733 5734 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) || 5735 SEGVN_WRITE_HELD(seg->s_as, &svd->lock)); 5736 5737 if (vp == NULL && amp == NULL) { 5738 seg->s_szc = 0; 5739 return (0); 5740 } 5741 5742 /* 5743 * do HAT_UNLOAD_UNMAP since we are changing the pagesize. 5744 * unload argument is 0 when we are freeing the segment 5745 * and unload was already done. 5746 */ 5747 hat_unload(seg->s_as->a_hat, seg->s_base, seg->s_size, 5748 HAT_UNLOAD_UNMAP); 5749 5750 if (amp == NULL || svd->type == MAP_SHARED) { 5751 seg->s_szc = 0; 5752 return (0); 5753 } 5754 5755 pgsz = page_get_pagesize(seg->s_szc); 5756 pages = btop(pgsz); 5757 5758 /* 5759 * XXX anon rwlock is not really needed because this is a 5760 * private segment and we are writers. 5761 */ 5762 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 5763 5764 for (; a < ea; a += pgsz, an_idx += pages) { 5765 if ((oldap = anon_get_ptr(amp->ahp, an_idx)) != NULL) { 5766 if (svd->pageprot != 0) { 5767 ASSERT(vpage != NULL); 5768 prot = VPP_PROT(vpage); 5769 ASSERT(sameprot(seg, a, pgsz)); 5770 } 5771 if (seg->s_szc != 0) { 5772 ASSERT(vp == NULL || anon_pages(amp->ahp, 5773 an_idx, pages) == pages); 5774 if ((err = anon_map_demotepages(amp, an_idx, 5775 seg, a, prot, vpage, svd->cred)) != 0) { 5776 goto out; 5777 } 5778 } else { 5779 if (oldap->an_refcnt == 1) { 5780 continue; 5781 } 5782 if ((err = anon_getpage(&oldap, &vpprot, 5783 anon_pl, PAGESIZE, seg, a, S_READ, 5784 svd->cred))) { 5785 goto out; 5786 } 5787 if ((pp = anon_private(&ap, seg, a, prot, 5788 anon_pl[0], 0, svd->cred)) == NULL) { 5789 err = ENOMEM; 5790 goto out; 5791 } 5792 anon_decref(oldap); 5793 (void) anon_set_ptr(amp->ahp, an_idx, ap, 5794 ANON_SLEEP); 5795 page_unlock(pp); 5796 } 5797 } 5798 vpage = (vpage == NULL) ? 
NULL : vpage + pages; 5799 } 5800 5801 amp->a_szc = 0; 5802 seg->s_szc = 0; 5803 out: 5804 ANON_LOCK_EXIT(&->a_rwlock); 5805 return (err); 5806 } 5807 5808 static int 5809 segvn_claim_pages( 5810 struct seg *seg, 5811 struct vpage *svp, 5812 u_offset_t off, 5813 ulong_t anon_idx, 5814 uint_t prot) 5815 { 5816 pgcnt_t pgcnt = page_get_pagecnt(seg->s_szc); 5817 size_t ppasize = (pgcnt + 1) * sizeof (page_t *); 5818 page_t **ppa; 5819 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5820 struct anon_map *amp = svd->amp; 5821 struct vpage *evp = svp + pgcnt; 5822 caddr_t addr = ((uintptr_t)(svp - svd->vpage) << PAGESHIFT) 5823 + seg->s_base; 5824 struct anon *ap; 5825 struct vnode *vp = svd->vp; 5826 page_t *pp; 5827 pgcnt_t pg_idx, i; 5828 int err = 0; 5829 anoff_t aoff; 5830 int anon = (amp != NULL) ? 1 : 0; 5831 5832 ASSERT(svd->type == MAP_PRIVATE); 5833 ASSERT(svd->vpage != NULL); 5834 ASSERT(seg->s_szc != 0); 5835 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 5836 ASSERT(amp == NULL || IS_P2ALIGNED(anon_idx, pgcnt)); 5837 ASSERT(sameprot(seg, addr, pgcnt << PAGESHIFT)); 5838 5839 if (VPP_PROT(svp) == prot) 5840 return (1); 5841 if (!((VPP_PROT(svp) ^ prot) & PROT_WRITE)) 5842 return (1); 5843 5844 ppa = kmem_alloc(ppasize, KM_SLEEP); 5845 if (anon && vp != NULL) { 5846 if (anon_get_ptr(amp->ahp, anon_idx) == NULL) { 5847 anon = 0; 5848 ASSERT(!anon_pages(amp->ahp, anon_idx, pgcnt)); 5849 } 5850 ASSERT(!anon || 5851 anon_pages(amp->ahp, anon_idx, pgcnt) == pgcnt); 5852 } 5853 5854 for (*ppa = NULL, pg_idx = 0; svp < evp; svp++, anon_idx++) { 5855 if (!VPP_ISPPLOCK(svp)) 5856 continue; 5857 if (anon) { 5858 ap = anon_get_ptr(amp->ahp, anon_idx); 5859 if (ap == NULL) { 5860 panic("segvn_claim_pages: no anon slot"); 5861 } 5862 swap_xlate(ap, &vp, &aoff); 5863 off = (u_offset_t)aoff; 5864 } 5865 ASSERT(vp != NULL); 5866 if ((pp = page_lookup(vp, 5867 (u_offset_t)off, SE_SHARED)) == NULL) { 5868 panic("segvn_claim_pages: no page"); 5869 } 5870 ppa[pg_idx++] = pp; 5871 off += PAGESIZE; 5872 } 5873 5874 if (ppa[0] == NULL) { 5875 kmem_free(ppa, ppasize); 5876 return (1); 5877 } 5878 5879 ASSERT(pg_idx <= pgcnt); 5880 ppa[pg_idx] = NULL; 5881 5882 if (prot & PROT_WRITE) 5883 err = page_addclaim_pages(ppa); 5884 else 5885 err = page_subclaim_pages(ppa); 5886 5887 for (i = 0; i < pg_idx; i++) { 5888 ASSERT(ppa[i] != NULL); 5889 page_unlock(ppa[i]); 5890 } 5891 5892 kmem_free(ppa, ppasize); 5893 return (err); 5894 } 5895 5896 /* 5897 * Returns right (upper address) segment if split occured. 5898 * If the address is equal to the beginning or end of its segment it returns 5899 * the current segment. 
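 *
 * Typical use (see segvn_demote_range() and segvn_setpagesize()) is a
 * pair of splits that carves a large-page aligned sub-range out into
 * its own segment:
 *
 *	nseg = segvn_split_seg(seg, lpgaddr);
 *	(void) segvn_split_seg(nseg, lpgeaddr);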
5900 */ 5901 static struct seg * 5902 segvn_split_seg(struct seg *seg, caddr_t addr) 5903 { 5904 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5905 struct seg *nseg; 5906 size_t nsize; 5907 struct segvn_data *nsvd; 5908 5909 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 5910 ASSERT(addr >= seg->s_base); 5911 ASSERT(addr <= seg->s_base + seg->s_size); 5912 5913 if (addr == seg->s_base || addr == seg->s_base + seg->s_size) 5914 return (seg); 5915 5916 nsize = seg->s_base + seg->s_size - addr; 5917 seg->s_size = addr - seg->s_base; 5918 nseg = seg_alloc(seg->s_as, addr, nsize); 5919 ASSERT(nseg != NULL); 5920 nseg->s_ops = seg->s_ops; 5921 nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 5922 nseg->s_data = (void *)nsvd; 5923 nseg->s_szc = seg->s_szc; 5924 *nsvd = *svd; 5925 rw_init(&nsvd->lock, NULL, RW_DEFAULT, NULL); 5926 5927 if (nsvd->vp != NULL) { 5928 VN_HOLD(nsvd->vp); 5929 nsvd->offset = svd->offset + 5930 (uintptr_t)(nseg->s_base - seg->s_base); 5931 if (nsvd->type == MAP_SHARED) 5932 lgrp_shm_policy_init(NULL, nsvd->vp); 5933 } else { 5934 /* 5935 * The offset for an anonymous segment has no signifigance in 5936 * terms of an offset into a file. If we were to use the above 5937 * calculation instead, the structures read out of 5938 * /proc/<pid>/xmap would be more difficult to decipher since 5939 * it would be unclear whether two seemingly contiguous 5940 * prxmap_t structures represented different segments or a 5941 * single segment that had been split up into multiple prxmap_t 5942 * structures (e.g. if some part of the segment had not yet 5943 * been faulted in). 5944 */ 5945 nsvd->offset = 0; 5946 } 5947 5948 ASSERT(svd->softlockcnt == 0); 5949 crhold(svd->cred); 5950 5951 if (svd->vpage != NULL) { 5952 size_t bytes = vpgtob(seg_pages(seg)); 5953 size_t nbytes = vpgtob(seg_pages(nseg)); 5954 struct vpage *ovpage = svd->vpage; 5955 5956 svd->vpage = kmem_alloc(bytes, KM_SLEEP); 5957 bcopy(ovpage, svd->vpage, bytes); 5958 nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP); 5959 bcopy(ovpage + seg_pages(seg), nsvd->vpage, nbytes); 5960 kmem_free(ovpage, bytes + nbytes); 5961 } 5962 if (svd->amp != NULL && svd->type == MAP_PRIVATE) { 5963 struct anon_map *oamp = svd->amp, *namp; 5964 struct anon_hdr *nahp; 5965 5966 ANON_LOCK_ENTER(&oamp->a_rwlock, RW_WRITER); 5967 ASSERT(oamp->refcnt == 1); 5968 nahp = anon_create(btop(seg->s_size), ANON_SLEEP); 5969 (void) anon_copy_ptr(oamp->ahp, svd->anon_index, 5970 nahp, 0, btop(seg->s_size), ANON_SLEEP); 5971 5972 namp = anonmap_alloc(nseg->s_size, 0); 5973 namp->a_szc = nseg->s_szc; 5974 (void) anon_copy_ptr(oamp->ahp, 5975 svd->anon_index + btop(seg->s_size), 5976 namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP); 5977 anon_release(oamp->ahp, btop(oamp->size)); 5978 oamp->ahp = nahp; 5979 oamp->size = seg->s_size; 5980 svd->anon_index = 0; 5981 nsvd->amp = namp; 5982 nsvd->anon_index = 0; 5983 ANON_LOCK_EXIT(&oamp->a_rwlock); 5984 } else if (svd->amp != NULL) { 5985 pgcnt_t pgcnt = page_get_pagecnt(seg->s_szc); 5986 ASSERT(svd->amp == nsvd->amp); 5987 ASSERT(seg->s_szc <= svd->amp->a_szc); 5988 nsvd->anon_index = svd->anon_index + seg_pages(seg); 5989 ASSERT(IS_P2ALIGNED(nsvd->anon_index, pgcnt)); 5990 ANON_LOCK_ENTER(&svd->amp->a_rwlock, RW_WRITER); 5991 svd->amp->refcnt++; 5992 ANON_LOCK_EXIT(&svd->amp->a_rwlock); 5993 } 5994 5995 /* 5996 * Split amount of swap reserve 5997 */ 5998 if (svd->swresv) { 5999 /* 6000 * For MAP_NORESERVE, only allocate swap reserve for pages 6001 * being used. 
Other segments get enough to cover whole 6002 * segment. 6003 */ 6004 if (svd->flags & MAP_NORESERVE) { 6005 size_t oswresv; 6006 6007 ASSERT(svd->amp); 6008 oswresv = svd->swresv; 6009 svd->swresv = ptob(anon_pages(svd->amp->ahp, 6010 svd->anon_index, btop(seg->s_size))); 6011 nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp, 6012 nsvd->anon_index, btop(nseg->s_size))); 6013 ASSERT(oswresv >= (svd->swresv + nsvd->swresv)); 6014 } else { 6015 ASSERT(svd->swresv == seg->s_size + nseg->s_size); 6016 svd->swresv = seg->s_size; 6017 nsvd->swresv = nseg->s_size; 6018 } 6019 } 6020 6021 return (nseg); 6022 } 6023 6024 /* 6025 * called on memory operations (unmap, setprot, setpagesize) for a subset 6026 * of a large page segment to either demote the memory range (SDR_RANGE) 6027 * or the ends (SDR_END) by addr/len. 6028 * 6029 * returns 0 on success. returns errno, including ENOMEM, on failure. 6030 */ 6031 static int 6032 segvn_demote_range( 6033 struct seg *seg, 6034 caddr_t addr, 6035 size_t len, 6036 int flag, 6037 uint_t szcvec) 6038 { 6039 caddr_t eaddr = addr + len; 6040 caddr_t lpgaddr, lpgeaddr; 6041 struct seg *nseg; 6042 struct seg *badseg1 = NULL; 6043 struct seg *badseg2 = NULL; 6044 size_t pgsz; 6045 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6046 int err; 6047 uint_t szc = seg->s_szc; 6048 uint_t tszcvec; 6049 6050 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 6051 ASSERT(szc != 0); 6052 pgsz = page_get_pagesize(szc); 6053 ASSERT(seg->s_base != addr || seg->s_size != len); 6054 ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size); 6055 ASSERT(svd->softlockcnt == 0); 6056 ASSERT(szcvec == 0 || (flag == SDR_END && svd->type == MAP_SHARED)); 6057 6058 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 6059 ASSERT(flag == SDR_RANGE || eaddr < lpgeaddr || addr > lpgaddr); 6060 if (flag == SDR_RANGE) { 6061 /* demote entire range */ 6062 badseg1 = nseg = segvn_split_seg(seg, lpgaddr); 6063 (void) segvn_split_seg(nseg, lpgeaddr); 6064 ASSERT(badseg1->s_base == lpgaddr); 6065 ASSERT(badseg1->s_size == lpgeaddr - lpgaddr); 6066 } else if (addr != lpgaddr) { 6067 ASSERT(flag == SDR_END); 6068 badseg1 = nseg = segvn_split_seg(seg, lpgaddr); 6069 if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz && 6070 eaddr < lpgaddr + 2 * pgsz) { 6071 (void) segvn_split_seg(nseg, lpgeaddr); 6072 ASSERT(badseg1->s_base == lpgaddr); 6073 ASSERT(badseg1->s_size == 2 * pgsz); 6074 } else { 6075 nseg = segvn_split_seg(nseg, lpgaddr + pgsz); 6076 ASSERT(badseg1->s_base == lpgaddr); 6077 ASSERT(badseg1->s_size == pgsz); 6078 if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz) { 6079 ASSERT(lpgeaddr - lpgaddr > 2 * pgsz); 6080 nseg = segvn_split_seg(nseg, lpgeaddr - pgsz); 6081 badseg2 = nseg; 6082 (void) segvn_split_seg(nseg, lpgeaddr); 6083 ASSERT(badseg2->s_base == lpgeaddr - pgsz); 6084 ASSERT(badseg2->s_size == pgsz); 6085 } 6086 } 6087 } else { 6088 ASSERT(flag == SDR_END); 6089 ASSERT(eaddr < lpgeaddr); 6090 badseg1 = nseg = segvn_split_seg(seg, lpgeaddr - pgsz); 6091 (void) segvn_split_seg(nseg, lpgeaddr); 6092 ASSERT(badseg1->s_base == lpgeaddr - pgsz); 6093 ASSERT(badseg1->s_size == pgsz); 6094 } 6095 6096 ASSERT(badseg1 != NULL); 6097 ASSERT(badseg1->s_szc == szc); 6098 ASSERT(flag == SDR_RANGE || badseg1->s_size == pgsz || 6099 badseg1->s_size == 2 * pgsz); 6100 ASSERT(sameprot(badseg1, badseg1->s_base, pgsz)); 6101 ASSERT(badseg1->s_size == pgsz || 6102 sameprot(badseg1, badseg1->s_base + pgsz, pgsz)); 6103 if (err = segvn_clrszc(badseg1)) { 6104 return (err); 6105 } 
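	/*
	 * Worked example for the SDR_END case handled above (added
	 * commentary, not part of the original logic): with a 4M large page
	 * size, unmapping [s_base + 1M, s_base + 9M) from a 12M segment gives
	 * lpgaddr == s_base and lpgeaddr == s_base + 12M; the partially
	 * covered first and last large pages become badseg1 and badseg2 and
	 * are demoted to PAGESIZE mappings, while the fully covered middle
	 * large page needs no demotion because it is unmapped in its
	 * entirety.
	 */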
6106 ASSERT(badseg1->s_szc == 0); 6107 6108 if (szc > 1 && (tszcvec = P2PHASE(szcvec, 1 << szc)) > 1) { 6109 uint_t tszc = highbit(tszcvec) - 1; 6110 caddr_t ta = MAX(addr, badseg1->s_base); 6111 caddr_t te; 6112 size_t tpgsz = page_get_pagesize(tszc); 6113 6114 ASSERT(svd->type == MAP_SHARED); 6115 ASSERT(flag == SDR_END); 6116 ASSERT(tszc < szc && tszc > 0); 6117 6118 if (eaddr > badseg1->s_base + badseg1->s_size) { 6119 te = badseg1->s_base + badseg1->s_size; 6120 } else { 6121 te = eaddr; 6122 } 6123 6124 ASSERT(ta <= te); 6125 badseg1->s_szc = tszc; 6126 if (!IS_P2ALIGNED(ta, tpgsz) || !IS_P2ALIGNED(te, tpgsz)) { 6127 if (badseg2 != NULL) { 6128 err = segvn_demote_range(badseg1, ta, te - ta, 6129 SDR_END, tszcvec); 6130 if (err != 0) { 6131 return (err); 6132 } 6133 } else { 6134 return (segvn_demote_range(badseg1, ta, 6135 te - ta, SDR_END, tszcvec)); 6136 } 6137 } 6138 } 6139 6140 if (badseg2 == NULL) 6141 return (0); 6142 ASSERT(badseg2->s_szc == szc); 6143 ASSERT(badseg2->s_size == pgsz); 6144 ASSERT(sameprot(badseg2, badseg2->s_base, badseg2->s_size)); 6145 if (err = segvn_clrszc(badseg2)) { 6146 return (err); 6147 } 6148 ASSERT(badseg2->s_szc == 0); 6149 6150 if (szc > 1 && (tszcvec = P2PHASE(szcvec, 1 << szc)) > 1) { 6151 uint_t tszc = highbit(tszcvec) - 1; 6152 size_t tpgsz = page_get_pagesize(tszc); 6153 6154 ASSERT(svd->type == MAP_SHARED); 6155 ASSERT(flag == SDR_END); 6156 ASSERT(tszc < szc && tszc > 0); 6157 ASSERT(badseg2->s_base > addr); 6158 ASSERT(eaddr > badseg2->s_base); 6159 ASSERT(eaddr < badseg2->s_base + badseg2->s_size); 6160 6161 badseg2->s_szc = tszc; 6162 if (!IS_P2ALIGNED(eaddr, tpgsz)) { 6163 return (segvn_demote_range(badseg2, badseg2->s_base, 6164 eaddr - badseg2->s_base, SDR_END, tszcvec)); 6165 } 6166 } 6167 6168 return (0); 6169 } 6170 6171 static int 6172 segvn_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) 6173 { 6174 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6175 struct vpage *vp, *evp; 6176 6177 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6178 6179 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6180 /* 6181 * If segment protection can be used, simply check against them. 6182 */ 6183 if (svd->pageprot == 0) { 6184 int err; 6185 6186 err = ((svd->prot & prot) != prot) ? EACCES : 0; 6187 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6188 return (err); 6189 } 6190 6191 /* 6192 * Have to check down to the vpage level. 
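 * For example, a check for PROT_READ|PROT_WRITE fails with EACCES as soon as
 * any vpage in the range grants only PROT_READ.  (Added commentary.)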
6193 */ 6194 evp = &svd->vpage[seg_page(seg, addr + len)]; 6195 for (vp = &svd->vpage[seg_page(seg, addr)]; vp < evp; vp++) { 6196 if ((VPP_PROT(vp) & prot) != prot) { 6197 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6198 return (EACCES); 6199 } 6200 } 6201 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6202 return (0); 6203 } 6204 6205 static int 6206 segvn_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv) 6207 { 6208 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6209 size_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1; 6210 6211 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6212 6213 if (pgno != 0) { 6214 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6215 if (svd->pageprot == 0) { 6216 do 6217 protv[--pgno] = svd->prot; 6218 while (pgno != 0); 6219 } else { 6220 size_t pgoff = seg_page(seg, addr); 6221 6222 do { 6223 pgno--; 6224 protv[pgno] = VPP_PROT(&svd->vpage[pgno+pgoff]); 6225 } while (pgno != 0); 6226 } 6227 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6228 } 6229 return (0); 6230 } 6231 6232 static u_offset_t 6233 segvn_getoffset(struct seg *seg, caddr_t addr) 6234 { 6235 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6236 6237 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6238 6239 return (svd->offset + (uintptr_t)(addr - seg->s_base)); 6240 } 6241 6242 /*ARGSUSED*/ 6243 static int 6244 segvn_gettype(struct seg *seg, caddr_t addr) 6245 { 6246 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6247 6248 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6249 6250 return (svd->type | (svd->flags & MAP_NORESERVE)); 6251 } 6252 6253 /*ARGSUSED*/ 6254 static int 6255 segvn_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp) 6256 { 6257 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6258 6259 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6260 6261 *vpp = svd->vp; 6262 return (0); 6263 } 6264 6265 /* 6266 * Check to see if it makes sense to do kluster/read ahead to 6267 * addr + delta relative to the mapping at addr. We assume here 6268 * that delta is a signed PAGESIZE'd multiple (which can be negative). 6269 * 6270 * For segvn, we currently "approve" of the action if we are 6271 * still in the segment and it maps from the same vp/off, 6272 * or if the advice stored in segvn_data or vpages allows it. 6273 * Currently, klustering is not allowed only if MADV_RANDOM is set. 6274 */ 6275 static int 6276 segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta) 6277 { 6278 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6279 struct anon *oap, *ap; 6280 ssize_t pd; 6281 size_t page; 6282 struct vnode *vp1, *vp2; 6283 u_offset_t off1, off2; 6284 struct anon_map *amp; 6285 6286 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6287 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) || 6288 SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 6289 6290 if (addr + delta < seg->s_base || 6291 addr + delta >= (seg->s_base + seg->s_size)) 6292 return (-1); /* exceeded segment bounds */ 6293 6294 pd = delta / (ssize_t)PAGESIZE; /* divide to preserve sign bit */ 6295 page = seg_page(seg, addr); 6296 6297 /* 6298 * Check to see if either of the pages addr or addr + delta 6299 * have advice set that prevents klustering (if MADV_RANDOM advice 6300 * is set for entire segment, or MADV_SEQUENTIAL is set and delta 6301 * is negative). 
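 *
 * Illustrative summary (added commentary, not part of the original text):
 * MADV_RANDOM on either page vetoes klustering, as does MADV_SEQUENTIAL on
 * the target page when delta is negative; otherwise a MAP_SHARED mapping
 * always klusters, and a MAP_PRIVATE mapping klusters only when both pages
 * resolve to the same vnode with offsets exactly delta apart.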
6302 */ 6303 if (svd->advice == MADV_RANDOM || 6304 svd->advice == MADV_SEQUENTIAL && delta < 0) 6305 return (-1); 6306 else if (svd->pageadvice && svd->vpage) { 6307 struct vpage *bvpp, *evpp; 6308 6309 bvpp = &svd->vpage[page]; 6310 evpp = &svd->vpage[page + pd]; 6311 if (VPP_ADVICE(bvpp) == MADV_RANDOM || 6312 VPP_ADVICE(evpp) == MADV_SEQUENTIAL && delta < 0) 6313 return (-1); 6314 if (VPP_ADVICE(bvpp) != VPP_ADVICE(evpp) && 6315 VPP_ADVICE(evpp) == MADV_RANDOM) 6316 return (-1); 6317 } 6318 6319 if (svd->type == MAP_SHARED) 6320 return (0); /* shared mapping - all ok */ 6321 6322 if ((amp = svd->amp) == NULL) 6323 return (0); /* off original vnode */ 6324 6325 page += svd->anon_index; 6326 6327 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 6328 6329 oap = anon_get_ptr(amp->ahp, page); 6330 ap = anon_get_ptr(amp->ahp, page + pd); 6331 6332 ANON_LOCK_EXIT(&->a_rwlock); 6333 6334 if ((oap == NULL && ap != NULL) || (oap != NULL && ap == NULL)) { 6335 return (-1); /* one with and one without an anon */ 6336 } 6337 6338 if (oap == NULL) { /* implies that ap == NULL */ 6339 return (0); /* off original vnode */ 6340 } 6341 6342 /* 6343 * Now we know we have two anon pointers - check to 6344 * see if they happen to be properly allocated. 6345 */ 6346 6347 /* 6348 * XXX We cheat here and don't lock the anon slots. We can't because 6349 * we may have been called from the anon layer which might already 6350 * have locked them. We are holding a refcnt on the slots so they 6351 * can't disappear. The worst that will happen is we'll get the wrong 6352 * names (vp, off) for the slots and make a poor klustering decision. 6353 */ 6354 swap_xlate(ap, &vp1, &off1); 6355 swap_xlate(oap, &vp2, &off2); 6356 6357 6358 if (!VOP_CMP(vp1, vp2) || off1 - off2 != delta) 6359 return (-1); 6360 return (0); 6361 } 6362 6363 /* 6364 * Swap the pages of seg out to secondary storage, returning the 6365 * number of bytes of storage freed. 6366 * 6367 * The basic idea is first to unload all translations and then to call 6368 * VOP_PUTPAGE() for all newly-unmapped pages, to push them out to the 6369 * swap device. Pages to which other segments have mappings will remain 6370 * mapped and won't be swapped. Our caller (as_swapout) has already 6371 * performed the unloading step. 6372 * 6373 * The value returned is intended to correlate well with the process's 6374 * memory requirements. However, there are some caveats: 6375 * 1) When given a shared segment as argument, this routine will 6376 * only succeed in swapping out pages for the last sharer of the 6377 * segment. (Previous callers will only have decremented mapping 6378 * reference counts.) 6379 * 2) We assume that the hat layer maintains a large enough translation 6380 * cache to capture process reference patterns. 6381 */ 6382 static size_t 6383 segvn_swapout(struct seg *seg) 6384 { 6385 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6386 struct anon_map *amp; 6387 pgcnt_t pgcnt = 0; 6388 pgcnt_t npages; 6389 pgcnt_t page; 6390 ulong_t anon_index; 6391 6392 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6393 6394 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6395 /* 6396 * Find pages unmapped by our caller and force them 6397 * out to the virtual swap device. 
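 *
 * Sketch of the per-page flow below (added commentary): resolve the
 * <vp, off> name for each page, look it up with page_lookup_nowait(),
 * try to upgrade to an exclusive lock, skip pages that are locked, still
 * mapped or cannot be demoted, and finally either VN_DISPOSE() clean
 * pages or queue dirty ones for the pageout thread.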
6398 */ 6399 if ((amp = svd->amp) != NULL) 6400 anon_index = svd->anon_index; 6401 npages = seg->s_size >> PAGESHIFT; 6402 for (page = 0; page < npages; page++) { 6403 page_t *pp; 6404 struct anon *ap; 6405 struct vnode *vp; 6406 u_offset_t off; 6407 anon_sync_obj_t cookie; 6408 6409 /* 6410 * Obtain <vp, off> pair for the page, then look it up. 6411 * 6412 * Note that this code is willing to consider regular 6413 * pages as well as anon pages. Is this appropriate here? 6414 */ 6415 ap = NULL; 6416 if (amp != NULL) { 6417 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 6418 if (anon_array_try_enter(amp, anon_index + page, 6419 &cookie)) { 6420 ANON_LOCK_EXIT(&->a_rwlock); 6421 continue; 6422 } 6423 ap = anon_get_ptr(amp->ahp, anon_index + page); 6424 if (ap != NULL) { 6425 swap_xlate(ap, &vp, &off); 6426 } else { 6427 vp = svd->vp; 6428 off = svd->offset + ptob(page); 6429 } 6430 anon_array_exit(&cookie); 6431 ANON_LOCK_EXIT(&->a_rwlock); 6432 } else { 6433 vp = svd->vp; 6434 off = svd->offset + ptob(page); 6435 } 6436 if (vp == NULL) { /* untouched zfod page */ 6437 ASSERT(ap == NULL); 6438 continue; 6439 } 6440 6441 pp = page_lookup_nowait(vp, off, SE_SHARED); 6442 if (pp == NULL) 6443 continue; 6444 6445 6446 /* 6447 * Examine the page to see whether it can be tossed out, 6448 * keeping track of how many we've found. 6449 */ 6450 if (!page_tryupgrade(pp)) { 6451 /* 6452 * If the page has an i/o lock and no mappings, 6453 * it's very likely that the page is being 6454 * written out as a result of klustering. 6455 * Assume this is so and take credit for it here. 6456 */ 6457 if (!page_io_trylock(pp)) { 6458 if (!hat_page_is_mapped(pp)) 6459 pgcnt++; 6460 } else { 6461 page_io_unlock(pp); 6462 } 6463 page_unlock(pp); 6464 continue; 6465 } 6466 ASSERT(!page_iolock_assert(pp)); 6467 6468 6469 /* 6470 * Skip if page is locked or has mappings. 6471 * We don't need the page_struct_lock to look at lckcnt 6472 * and cowcnt because the page is exclusive locked. 6473 */ 6474 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0 || 6475 hat_page_is_mapped(pp)) { 6476 page_unlock(pp); 6477 continue; 6478 } 6479 6480 /* 6481 * dispose skips large pages so try to demote first. 6482 */ 6483 if (pp->p_szc != 0 && !page_try_demote_pages(pp)) { 6484 page_unlock(pp); 6485 /* 6486 * XXX should skip the remaining page_t's of this 6487 * large page. 6488 */ 6489 continue; 6490 } 6491 6492 ASSERT(pp->p_szc == 0); 6493 6494 /* 6495 * No longer mapped -- we can toss it out. How 6496 * we do so depends on whether or not it's dirty. 6497 */ 6498 if (hat_ismod(pp) && pp->p_vnode) { 6499 /* 6500 * We must clean the page before it can be 6501 * freed. Setting B_FREE will cause pvn_done 6502 * to free the page when the i/o completes. 6503 * XXX: This also causes it to be accounted 6504 * as a pageout instead of a swap: need 6505 * B_SWAPOUT bit to use instead of B_FREE. 6506 * 6507 * Hold the vnode before releasing the page lock 6508 * to prevent it from being freed and re-used by 6509 * some other thread. 6510 */ 6511 VN_HOLD(vp); 6512 page_unlock(pp); 6513 6514 /* 6515 * Queue all i/o requests for the pageout thread 6516 * to avoid saturating the pageout devices. 6517 */ 6518 if (!queue_io_request(vp, off)) 6519 VN_RELE(vp); 6520 } else { 6521 /* 6522 * The page was clean, free it. 6523 * 6524 * XXX: Can we ever encounter modified pages 6525 * with no associated vnode here? 
6526 */ 6527 ASSERT(pp->p_vnode != NULL); 6528 /*LINTED: constant in conditional context*/ 6529 VN_DISPOSE(pp, B_FREE, 0, kcred); 6530 } 6531 6532 /* 6533 * Credit now even if i/o is in progress. 6534 */ 6535 pgcnt++; 6536 } 6537 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6538 6539 /* 6540 * Wakeup pageout to initiate i/o on all queued requests. 6541 */ 6542 cv_signal_pageout(); 6543 return (ptob(pgcnt)); 6544 } 6545 6546 /* 6547 * Synchronize primary storage cache with real object in virtual memory. 6548 * 6549 * XXX - Anonymous pages should not be sync'ed out at all. 6550 */ 6551 static int 6552 segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) 6553 { 6554 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6555 struct vpage *vpp; 6556 page_t *pp; 6557 u_offset_t offset; 6558 struct vnode *vp; 6559 u_offset_t off; 6560 caddr_t eaddr; 6561 int bflags; 6562 int err = 0; 6563 int segtype; 6564 int pageprot; 6565 int prot; 6566 ulong_t anon_index; 6567 struct anon_map *amp; 6568 struct anon *ap; 6569 anon_sync_obj_t cookie; 6570 6571 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6572 6573 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6574 6575 if (svd->softlockcnt > 0) { 6576 /* 6577 * flush all pages from seg cache 6578 * otherwise we may deadlock in swap_putpage 6579 * for B_INVAL page (4175402). 6580 * 6581 * Even if we grab segvn WRITER's lock or segp_slock 6582 * here, there might be another thread which could've 6583 * successfully performed lookup/insert just before 6584 * we acquired the lock here. So, grabbing either 6585 * lock here is of not much use. Until we devise 6586 * a strategy at upper layers to solve the 6587 * synchronization issues completely, we expect 6588 * applications to handle this appropriately. 6589 */ 6590 segvn_purge(seg); 6591 if (svd->softlockcnt > 0) { 6592 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6593 return (EAGAIN); 6594 } 6595 } 6596 6597 vpp = svd->vpage; 6598 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 6599 bflags = ((flags & MS_ASYNC) ? B_ASYNC : 0) | 6600 ((flags & MS_INVALIDATE) ? B_INVAL : 0); 6601 6602 if (attr) { 6603 pageprot = attr & ~(SHARED|PRIVATE); 6604 segtype = (attr & SHARED) ? MAP_SHARED : MAP_PRIVATE; 6605 6606 /* 6607 * We are done if the segment types don't match 6608 * or if we have segment level protections and 6609 * they don't match. 6610 */ 6611 if (svd->type != segtype) { 6612 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6613 return (0); 6614 } 6615 if (vpp == NULL) { 6616 if (svd->prot != pageprot) { 6617 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6618 return (0); 6619 } 6620 prot = svd->prot; 6621 } else 6622 vpp = &svd->vpage[seg_page(seg, addr)]; 6623 6624 } else if (svd->vp && svd->amp == NULL && 6625 (flags & MS_INVALIDATE) == 0) { 6626 6627 /* 6628 * No attributes, no anonymous pages and MS_INVALIDATE flag 6629 * is not on, just use one big request. 
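 * For example (added commentary), msync(MS_ASYNC) with no attributes
 * requested over a plain vnode mapping with no private anon copies
 * collapses into one asynchronous VOP_PUTPAGE() covering
 * [offset, offset + len) rather than a call per page.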
6630 */ 6631 err = VOP_PUTPAGE(svd->vp, (offset_t)offset, len, 6632 bflags, svd->cred); 6633 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6634 return (err); 6635 } 6636 6637 if ((amp = svd->amp) != NULL) 6638 anon_index = svd->anon_index + seg_page(seg, addr); 6639 6640 for (eaddr = addr + len; addr < eaddr; addr += PAGESIZE) { 6641 ap = NULL; 6642 if (amp != NULL) { 6643 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER); 6644 anon_array_enter(amp, anon_index, &cookie); 6645 ap = anon_get_ptr(amp->ahp, anon_index++); 6646 if (ap != NULL) { 6647 swap_xlate(ap, &vp, &off); 6648 } else { 6649 vp = svd->vp; 6650 off = offset; 6651 } 6652 anon_array_exit(&cookie); 6653 ANON_LOCK_EXIT(&amp->a_rwlock); 6654 } else { 6655 vp = svd->vp; 6656 off = offset; 6657 } 6658 offset += PAGESIZE; 6659 6660 if (vp == NULL) /* untouched zfod page */ 6661 continue; 6662 6663 if (attr) { 6664 if (vpp) { 6665 prot = VPP_PROT(vpp); 6666 vpp++; 6667 } 6668 if (prot != pageprot) { 6669 continue; 6670 } 6671 } 6672 6673 /* 6674 * See if any of these pages are locked -- if so, then we 6675 * will have to truncate an invalidate request at the first 6676 * locked one. We don't need the page_struct_lock to test 6677 * as this is only advisory; even if we acquire it someone 6678 * might race in and lock the page after we unlock and before 6679 * we do the PUTPAGE, then PUTPAGE simply does nothing. 6680 */ 6681 if (flags & MS_INVALIDATE) { 6682 if ((pp = page_lookup(vp, off, SE_SHARED)) != NULL) { 6683 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 6684 page_unlock(pp); 6685 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6686 return (EBUSY); 6687 } 6688 if (ap != NULL && pp->p_szc != 0 && 6689 page_tryupgrade(pp)) { 6690 if (pp->p_lckcnt == 0 && 6691 pp->p_cowcnt == 0) { 6692 /* 6693 * swapfs VN_DISPOSE() won't 6694 * invalidate large pages. 6695 * Attempt to demote. 6696 * XXX can't help it if it 6697 * fails. But for swapfs 6698 * pages it is no big deal. 6699 */ 6700 (void) page_try_demote_pages( 6701 pp); 6702 } 6703 } 6704 page_unlock(pp); 6705 } 6706 } else if (svd->type == MAP_SHARED && amp != NULL) { 6707 /* 6708 * Avoid writing out to disk ISM's large pages 6709 * because segspt_free_pages() relies on NULL an_pvp 6710 * of anon slots of such pages. 6711 */ 6712 6713 ASSERT(svd->vp == NULL); 6714 /* 6715 * swapfs uses page_lookup_nowait if not freeing or 6716 * invalidating and skips a page if 6717 * page_lookup_nowait returns NULL. 6718 */ 6719 pp = page_lookup_nowait(vp, off, SE_SHARED); 6720 if (pp == NULL) { 6721 continue; 6722 } 6723 if (pp->p_szc != 0) { 6724 page_unlock(pp); 6725 continue; 6726 } 6727 6728 /* 6729 * Note ISM pages are created large so (vp, off)'s 6730 * page cannot suddenly become large after we unlock 6731 * pp. 6732 */ 6733 page_unlock(pp); 6734 } 6735 /* 6736 * XXX - Should ultimately try to kluster 6737 * calls to VOP_PUTPAGE() for performance. 6738 */ 6739 VN_HOLD(vp); 6740 err = VOP_PUTPAGE(vp, (offset_t)off, PAGESIZE, 6741 bflags, svd->cred); 6742 VN_RELE(vp); 6743 if (err) 6744 break; 6745 } 6746 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6747 return (err); 6748 } 6749 6750 /* 6751 * Determine if we have data corresponding to pages in the 6752 * primary storage virtual memory cache (i.e., "in core").
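 *
 * Each byte of the result vector is a bitwise OR of SEG_PAGE_INCORE,
 * SEG_PAGE_VNODE, SEG_PAGE_VNODEBACKED, SEG_PAGE_ANON, SEG_PAGE_SOFTLOCK,
 * SEG_PAGE_HASCOW and SEG_PAGE_LOCKED, describing where the page lives and
 * how it is held.  (Added commentary.)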
6753 */ 6754 static size_t 6755 segvn_incore(struct seg *seg, caddr_t addr, size_t len, char *vec) 6756 { 6757 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6758 struct vnode *vp, *avp; 6759 u_offset_t offset, aoffset; 6760 size_t p, ep; 6761 int ret; 6762 struct vpage *vpp; 6763 page_t *pp; 6764 uint_t start; 6765 struct anon_map *amp; /* XXX - for locknest */ 6766 struct anon *ap; 6767 uint_t attr; 6768 anon_sync_obj_t cookie; 6769 6770 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6771 6772 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6773 if (svd->amp == NULL && svd->vp == NULL) { 6774 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6775 bzero(vec, btopr(len)); 6776 return (len); /* no anonymous pages created yet */ 6777 } 6778 6779 p = seg_page(seg, addr); 6780 ep = seg_page(seg, addr + len); 6781 start = svd->vp ? SEG_PAGE_VNODEBACKED : 0; 6782 6783 amp = svd->amp; 6784 for (; p < ep; p++, addr += PAGESIZE) { 6785 vpp = (svd->vpage) ? &svd->vpage[p]: NULL; 6786 ret = start; 6787 ap = NULL; 6788 avp = NULL; 6789 /* Grab the vnode/offset for the anon slot */ 6790 if (amp != NULL) { 6791 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 6792 anon_array_enter(amp, svd->anon_index + p, &cookie); 6793 ap = anon_get_ptr(amp->ahp, svd->anon_index + p); 6794 if (ap != NULL) { 6795 swap_xlate(ap, &avp, &aoffset); 6796 } 6797 anon_array_exit(&cookie); 6798 ANON_LOCK_EXIT(&->a_rwlock); 6799 } 6800 if ((avp != NULL) && page_exists(avp, aoffset)) { 6801 /* A page exists for the anon slot */ 6802 ret |= SEG_PAGE_INCORE; 6803 6804 /* 6805 * If page is mapped and writable 6806 */ 6807 attr = (uint_t)0; 6808 if ((hat_getattr(seg->s_as->a_hat, addr, 6809 &attr) != -1) && (attr & PROT_WRITE)) { 6810 ret |= SEG_PAGE_ANON; 6811 } 6812 /* 6813 * Don't get page_struct lock for lckcnt and cowcnt, 6814 * since this is purely advisory. 6815 */ 6816 if ((pp = page_lookup_nowait(avp, aoffset, 6817 SE_SHARED)) != NULL) { 6818 if (pp->p_lckcnt) 6819 ret |= SEG_PAGE_SOFTLOCK; 6820 if (pp->p_cowcnt) 6821 ret |= SEG_PAGE_HASCOW; 6822 page_unlock(pp); 6823 } 6824 } 6825 6826 /* Gather vnode statistics */ 6827 vp = svd->vp; 6828 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 6829 6830 if (vp != NULL) { 6831 /* 6832 * Try to obtain a "shared" lock on the page 6833 * without blocking. If this fails, determine 6834 * if the page is in memory. 6835 */ 6836 pp = page_lookup_nowait(vp, offset, SE_SHARED); 6837 if ((pp == NULL) && (page_exists(vp, offset))) { 6838 /* Page is incore, and is named */ 6839 ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE); 6840 } 6841 /* 6842 * Don't get page_struct lock for lckcnt and cowcnt, 6843 * since this is purely advisory. 6844 */ 6845 if (pp != NULL) { 6846 ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE); 6847 if (pp->p_lckcnt) 6848 ret |= SEG_PAGE_SOFTLOCK; 6849 if (pp->p_cowcnt) 6850 ret |= SEG_PAGE_HASCOW; 6851 page_unlock(pp); 6852 } 6853 } 6854 6855 /* Gather virtual page information */ 6856 if (vpp) { 6857 if (VPP_ISPPLOCK(vpp)) 6858 ret |= SEG_PAGE_LOCKED; 6859 vpp++; 6860 } 6861 6862 *vec++ = (char)ret; 6863 } 6864 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6865 return (len); 6866 } 6867 6868 /* 6869 * Statement for p_cowcnts/p_lckcnts. 
6870 * 6871 * p_cowcnt is updated while mlock/munlocking MAP_PRIVATE and PROT_WRITE region 6872 * irrespective of the following factors or anything else: 6873 * 6874 * (1) anon slots are populated or not 6875 * (2) cow is broken or not 6876 * (3) refcnt on ap is 1 or greater than 1 6877 * 6878 * If it's not MAP_PRIVATE and PROT_WRITE, p_lckcnt is updated during mlock 6879 * and munlock. 6880 * 6881 * 6882 * Handling p_cowcnts/p_lckcnts during copy-on-write fault: 6883 * 6884 * if vpage has PROT_WRITE 6885 * transfer cowcnt on the oldpage -> cowcnt on the newpage 6886 * else 6887 * transfer lckcnt on the oldpage -> lckcnt on the newpage 6888 * 6889 * During copy-on-write, decrement p_cowcnt on the oldpage and increment 6890 * p_cowcnt on the newpage *if* the corresponding vpage has PROT_WRITE. 6891 * 6892 * We may also break COW if softlocking on read access in the physio case. 6893 * In this case, vpage may not have PROT_WRITE. So, we need to decrement 6894 * p_lckcnt on the oldpage and increment p_lckcnt on the newpage *if* the 6895 * vpage doesn't have PROT_WRITE. 6896 * 6897 * 6898 * Handling p_cowcnts/p_lckcnts during mprotect on mlocked region: 6899 * 6900 * If a MAP_PRIVATE region loses PROT_WRITE, we decrement p_cowcnt and 6901 * increment p_lckcnt by calling page_subclaim() which takes care of 6902 * availrmem accounting and p_lckcnt overflow. 6903 * 6904 * If a MAP_PRIVATE region gains PROT_WRITE, we decrement p_lckcnt and 6905 * increment p_cowcnt by calling page_addclaim() which takes care of 6906 * availrmem availability and p_cowcnt overflow. 6907 */ 6908 6909 /* 6910 * Lock down (or unlock) pages mapped by this segment. 6911 * 6912 * XXX only creates PAGESIZE pages if anon slots are not initialized. 6913 * At fault time they will be relocated into larger pages. 6914 */ 6915 static int 6916 segvn_lockop(struct seg *seg, caddr_t addr, size_t len, 6917 int attr, int op, ulong_t *lockmap, size_t pos) 6918 { 6919 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6920 struct vpage *vpp; 6921 struct vpage *evp; 6922 page_t *pp; 6923 u_offset_t offset; 6924 u_offset_t off; 6925 int segtype; 6926 int pageprot; 6927 int claim; 6928 struct vnode *vp; 6929 ulong_t anon_index; 6930 struct anon_map *amp; 6931 struct anon *ap; 6932 struct vattr va; 6933 anon_sync_obj_t cookie; 6934 6935 /* 6936 * Hold write lock on address space because may split or concatenate 6937 * segments 6938 */ 6939 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6940 6941 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 6942 if (attr) { 6943 pageprot = attr & ~(SHARED|PRIVATE); 6944 segtype = attr & SHARED ? MAP_SHARED : MAP_PRIVATE; 6945 6946 /* 6947 * We are done if the segment types don't match 6948 * or if we have segment level protections and 6949 * they don't match. 6950 */ 6951 if (svd->type != segtype) { 6952 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6953 return (0); 6954 } 6955 if (svd->pageprot == 0 && svd->prot != pageprot) { 6956 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6957 return (0); 6958 } 6959 } 6960 6961 /* 6962 * If we're locking, then we must create a vpage structure if 6963 * none exists. If we're unlocking, then check to see if there 6964 * is a vpage -- if not, then we could not have locked anything. 
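 *
 * Worked example of the p_cowcnt/p_lckcnt statement above (added
 * commentary, not part of the original text): mlock() of a MAP_PRIVATE,
 * PROT_WRITE page reaches page_pp_lock(pp, 1, 0) and raises p_cowcnt; if
 * mprotect() later strips PROT_WRITE while the page stays locked,
 * segvn_claim_pages() calls page_subclaim_pages() to move the claim,
 * decrementing p_cowcnt and incrementing p_lckcnt; munlock() then drops
 * p_lckcnt through page_pp_unlock(pp, 0, 0).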
6965 */ 6966 6967 if ((vpp = svd->vpage) == NULL) { 6968 if (op == MC_LOCK) 6969 segvn_vpage(seg); 6970 else { 6971 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6972 return (0); 6973 } 6974 } 6975 6976 /* 6977 * The anonymous data vector (i.e., previously 6978 * unreferenced mapping to swap space) can be allocated 6979 * by lazily testing for its existence. 6980 */ 6981 if (op == MC_LOCK && svd->amp == NULL && svd->vp == NULL) { 6982 svd->amp = anonmap_alloc(seg->s_size, 0); 6983 svd->amp->a_szc = seg->s_szc; 6984 } 6985 6986 if ((amp = svd->amp) != NULL) { 6987 anon_index = svd->anon_index + seg_page(seg, addr); 6988 } 6989 6990 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 6991 evp = &svd->vpage[seg_page(seg, addr + len)]; 6992 6993 /* 6994 * Loop over all pages in the range. Process if we're locking and 6995 * page has not already been locked in this mapping; or if we're 6996 * unlocking and the page has been locked. 6997 */ 6998 for (vpp = &svd->vpage[seg_page(seg, addr)]; vpp < evp; 6999 vpp++, pos++, addr += PAGESIZE, offset += PAGESIZE, anon_index++) { 7000 if ((attr == 0 || VPP_PROT(vpp) == pageprot) && 7001 ((op == MC_LOCK && !VPP_ISPPLOCK(vpp)) || 7002 (op == MC_UNLOCK && VPP_ISPPLOCK(vpp)))) { 7003 7004 if (amp != NULL) 7005 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER); 7006 /* 7007 * If this isn't a MAP_NORESERVE segment and 7008 * we're locking, allocate anon slots if they 7009 * don't exist. The page is brought in later on. 7010 */ 7011 if (op == MC_LOCK && svd->vp == NULL && 7012 ((svd->flags & MAP_NORESERVE) == 0) && 7013 amp != NULL && 7014 ((ap = anon_get_ptr(amp->ahp, anon_index)) 7015 == NULL)) { 7016 anon_array_enter(amp, anon_index, &cookie); 7017 7018 if ((ap = anon_get_ptr(amp->ahp, 7019 anon_index)) == NULL) { 7020 pp = anon_zero(seg, addr, &ap, 7021 svd->cred); 7022 if (pp == NULL) { 7023 anon_array_exit(&cookie); 7024 ANON_LOCK_EXIT(&amp->a_rwlock); 7025 SEGVN_LOCK_EXIT(seg->s_as, 7026 &svd->lock); 7027 return (ENOMEM); 7028 } 7029 ASSERT(anon_get_ptr(amp->ahp, 7030 anon_index) == NULL); 7031 (void) anon_set_ptr(amp->ahp, 7032 anon_index, ap, ANON_SLEEP); 7033 page_unlock(pp); 7034 } 7035 anon_array_exit(&cookie); 7036 } 7037 7038 /* 7039 * Get name for page, accounting for 7040 * existence of private copy. 7041 */ 7042 ap = NULL; 7043 if (amp != NULL) { 7044 anon_array_enter(amp, anon_index, &cookie); 7045 ap = anon_get_ptr(amp->ahp, anon_index); 7046 if (ap != NULL) { 7047 swap_xlate(ap, &vp, &off); 7048 } else { 7049 if (svd->vp == NULL && 7050 (svd->flags & MAP_NORESERVE)) { 7051 anon_array_exit(&cookie); 7052 ANON_LOCK_EXIT(&amp->a_rwlock); 7053 continue; 7054 } 7055 vp = svd->vp; 7056 off = offset; 7057 } 7058 anon_array_exit(&cookie); 7059 ANON_LOCK_EXIT(&amp->a_rwlock); 7060 } else { 7061 vp = svd->vp; 7062 off = offset; 7063 } 7064 7065 /* 7066 * Get page frame. It's ok if the page is 7067 * not available when we're unlocking, as this 7068 * may simply mean that a page we locked got 7069 * truncated out of existence after we locked it. 7070 * 7071 * Invoke VOP_GETPAGE() to obtain the page struct 7072 * since we may need to read it from disk if it's 7073 * been paged out.
7074 */ 7075 if (op != MC_LOCK) 7076 pp = page_lookup(vp, off, SE_SHARED); 7077 else { 7078 page_t *pl[1 + 1]; 7079 int error; 7080 7081 ASSERT(vp != NULL); 7082 7083 error = VOP_GETPAGE(vp, (offset_t)off, PAGESIZE, 7084 (uint_t *)NULL, pl, PAGESIZE, seg, addr, 7085 S_OTHER, svd->cred); 7086 7087 /* 7088 * If the error is EDEADLK then we must bounce 7089 * up and drop all vm subsystem locks and then 7090 * retry the operation later 7091 * This behavior is a temporary measure because 7092 * ufs/sds logging is badly designed and will 7093 * deadlock if we don't allow this bounce to 7094 * happen. The real solution is to re-design 7095 * the logging code to work properly. See bug 7096 * 4125102 for details of the problem. 7097 */ 7098 if (error == EDEADLK) { 7099 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7100 return (error); 7101 } 7102 /* 7103 * Quit if we fail to fault in the page. Treat 7104 * the failure as an error, unless the addr 7105 * is mapped beyond the end of a file. 7106 */ 7107 if (error && svd->vp) { 7108 va.va_mask = AT_SIZE; 7109 if (VOP_GETATTR(svd->vp, &va, 0, 7110 svd->cred) != 0) { 7111 SEGVN_LOCK_EXIT(seg->s_as, 7112 &svd->lock); 7113 return (EIO); 7114 } 7115 if (btopr(va.va_size) >= 7116 btopr(off + 1)) { 7117 SEGVN_LOCK_EXIT(seg->s_as, 7118 &svd->lock); 7119 return (EIO); 7120 } 7121 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7122 return (0); 7123 } else if (error) { 7124 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7125 return (EIO); 7126 } 7127 pp = pl[0]; 7128 ASSERT(pp != NULL); 7129 } 7130 7131 /* 7132 * See Statement at the beginning of this routine. 7133 * 7134 * claim is always set if MAP_PRIVATE and PROT_WRITE 7135 * irrespective of following factors: 7136 * 7137 * (1) anon slots are populated or not 7138 * (2) cow is broken or not 7139 * (3) refcnt on ap is 1 or greater than 1 7140 * 7141 * See 4140683 for details 7142 */ 7143 claim = ((VPP_PROT(vpp) & PROT_WRITE) && 7144 (svd->type == MAP_PRIVATE)); 7145 7146 /* 7147 * Perform page-level operation appropriate to 7148 * operation. If locking, undo the SOFTLOCK 7149 * performed to bring the page into memory 7150 * after setting the lock. If unlocking, 7151 * and no page was found, account for the claim 7152 * separately. 7153 */ 7154 if (op == MC_LOCK) { 7155 int ret = 1; /* Assume success */ 7156 7157 /* 7158 * Make sure another thread didn't lock 7159 * the page after we released the segment 7160 * lock. 
7161 */ 7162 if ((attr == 0 || VPP_PROT(vpp) == pageprot) && 7163 !VPP_ISPPLOCK(vpp)) { 7164 ret = page_pp_lock(pp, claim, 0); 7165 if (ret != 0) { 7166 VPP_SETPPLOCK(vpp); 7167 if (lockmap != (ulong_t *)NULL) 7168 BT_SET(lockmap, pos); 7169 } 7170 } 7171 page_unlock(pp); 7172 if (ret == 0) { 7173 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7174 return (EAGAIN); 7175 } 7176 } else { 7177 if (pp != NULL) { 7178 if ((attr == 0 || 7179 VPP_PROT(vpp) == pageprot) && 7180 VPP_ISPPLOCK(vpp)) 7181 page_pp_unlock(pp, claim, 0); 7182 page_unlock(pp); 7183 } 7184 VPP_CLRPPLOCK(vpp); 7185 } 7186 } 7187 } 7188 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7189 return (0); 7190 } 7191 7192 /* 7193 * Set advice from user for specified pages 7194 * There are 5 types of advice: 7195 * MADV_NORMAL - Normal (default) behavior (whatever that is) 7196 * MADV_RANDOM - Random page references 7197 * do not allow readahead or 'klustering' 7198 * MADV_SEQUENTIAL - Sequential page references 7199 * Pages previous to the one currently being 7200 * accessed (determined by fault) are 'not needed' 7201 * and are freed immediately 7202 * MADV_WILLNEED - Pages are likely to be used (fault ahead in mctl) 7203 * MADV_DONTNEED - Pages are not needed (synced out in mctl) 7204 * MADV_FREE - Contents can be discarded 7205 * MADV_ACCESS_DEFAULT- Default access 7206 * MADV_ACCESS_LWP - Next LWP will access heavily 7207 * MADV_ACCESS_MANY- Many LWPs or processes will access heavily 7208 */ 7209 static int 7210 segvn_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav) 7211 { 7212 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7213 size_t page; 7214 int err = 0; 7215 int already_set; 7216 struct anon_map *amp; 7217 ulong_t anon_index; 7218 struct seg *next; 7219 lgrp_mem_policy_t policy; 7220 struct seg *prev; 7221 struct vnode *vp; 7222 7223 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 7224 7225 /* 7226 * In case of MADV_FREE, we won't be modifying any segment private 7227 * data structures; so, we only need to grab READER's lock 7228 */ 7229 if (behav != MADV_FREE) 7230 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 7231 else 7232 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7233 7234 /* 7235 * Large pages are assumed to be only turned on when accesses to the 7236 * segment's address range have spatial and temporal locality. That 7237 * justifies ignoring MADV_SEQUENTIAL for large page segments. 7238 * Also, ignore advice affecting lgroup memory allocation 7239 * if don't need to do lgroup optimizations on this system 7240 */ 7241 7242 if ((behav == MADV_SEQUENTIAL && seg->s_szc != 0) || 7243 (!lgrp_optimizations() && (behav == MADV_ACCESS_DEFAULT || 7244 behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY))) { 7245 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7246 return (0); 7247 } 7248 7249 if (behav == MADV_SEQUENTIAL || behav == MADV_ACCESS_DEFAULT || 7250 behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY) { 7251 /* 7252 * Since we are going to unload hat mappings 7253 * we first have to flush the cache. Otherwise 7254 * this might lead to system panic if another 7255 * thread is doing physio on the range whose 7256 * mappings are unloaded by madvise(3C). 7257 */ 7258 if (svd->softlockcnt > 0) { 7259 /* 7260 * Since we do have the segvn writers lock 7261 * nobody can fill the cache with entries 7262 * belonging to this seg during the purge. 7263 * The flush either succeeds or we still 7264 * have pending I/Os. In the later case, 7265 * madvise(3C) fails. 
7266 */ 7267 segvn_purge(seg); 7268 if (svd->softlockcnt > 0) { 7269 /* 7270 * Since madvise(3C) is advisory and 7271 * it's not part of UNIX98, madvise(3C) 7272 * failure here doesn't cause any hardship. 7273 * Note that we don't block in "as" layer. 7274 */ 7275 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7276 return (EAGAIN); 7277 } 7278 } 7279 } 7280 7281 amp = svd->amp; 7282 vp = svd->vp; 7283 if (behav == MADV_FREE) { 7284 /* 7285 * MADV_FREE is not supported for segments with 7286 * underlying object; if anonmap is NULL, anon slots 7287 * are not yet populated and there is nothing for 7288 * us to do. As MADV_FREE is advisory, we don't 7289 * return error in either case. 7290 */ 7291 if (vp || amp == NULL) { 7292 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7293 return (0); 7294 } 7295 7296 page = seg_page(seg, addr); 7297 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7298 anon_disclaim(amp, svd->anon_index + page, len, 0); 7299 ANON_LOCK_EXIT(&->a_rwlock); 7300 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7301 return (0); 7302 } 7303 7304 /* 7305 * If advice is to be applied to entire segment, 7306 * use advice field in seg_data structure 7307 * otherwise use appropriate vpage entry. 7308 */ 7309 if ((addr == seg->s_base) && (len == seg->s_size)) { 7310 switch (behav) { 7311 case MADV_ACCESS_LWP: 7312 case MADV_ACCESS_MANY: 7313 case MADV_ACCESS_DEFAULT: 7314 /* 7315 * Set memory allocation policy for this segment 7316 */ 7317 policy = lgrp_madv_to_policy(behav, len, svd->type); 7318 if (svd->type == MAP_SHARED) 7319 already_set = lgrp_shm_policy_set(policy, amp, 7320 svd->anon_index, vp, svd->offset, len); 7321 else { 7322 /* 7323 * For private memory, need writers lock on 7324 * address space because the segment may be 7325 * split or concatenated when changing policy 7326 */ 7327 if (AS_READ_HELD(seg->s_as, 7328 &seg->s_as->a_lock)) { 7329 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7330 return (IE_RETRY); 7331 } 7332 7333 already_set = lgrp_privm_policy_set(policy, 7334 &svd->policy_info, len); 7335 } 7336 7337 /* 7338 * If policy set already and it shouldn't be reapplied, 7339 * don't do anything. 7340 */ 7341 if (already_set && 7342 !LGRP_MEM_POLICY_REAPPLICABLE(policy)) 7343 break; 7344 7345 /* 7346 * Mark any existing pages in given range for 7347 * migration 7348 */ 7349 page_mark_migrate(seg, addr, len, amp, svd->anon_index, 7350 vp, svd->offset, 1); 7351 7352 /* 7353 * If same policy set already or this is a shared 7354 * memory segment, don't need to try to concatenate 7355 * segment with adjacent ones. 7356 */ 7357 if (already_set || svd->type == MAP_SHARED) 7358 break; 7359 7360 /* 7361 * Try to concatenate this segment with previous 7362 * one and next one, since we changed policy for 7363 * this one and it may be compatible with adjacent 7364 * ones now. 
7365 */ 7366 prev = AS_SEGPREV(seg->s_as, seg); 7367 next = AS_SEGNEXT(seg->s_as, seg); 7368 7369 if (next && next->s_ops == &segvn_ops && 7370 addr + len == next->s_base) 7371 (void) segvn_concat(seg, next, 1); 7372 7373 if (prev && prev->s_ops == &segvn_ops && 7374 addr == prev->s_base + prev->s_size) { 7375 /* 7376 * Drop lock for private data of current 7377 * segment before concatenating (deleting) it 7378 * and return IE_REATTACH to tell as_ctl() that 7379 * current segment has changed 7380 */ 7381 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7382 if (!segvn_concat(prev, seg, 1)) 7383 err = IE_REATTACH; 7384 7385 return (err); 7386 } 7387 break; 7388 7389 case MADV_SEQUENTIAL: 7390 /* 7391 * unloading mapping guarantees 7392 * detection in segvn_fault 7393 */ 7394 ASSERT(seg->s_szc == 0); 7395 hat_unload(seg->s_as->a_hat, addr, len, 7396 HAT_UNLOAD); 7397 /* FALLTHROUGH */ 7398 case MADV_NORMAL: 7399 case MADV_RANDOM: 7400 svd->advice = (uchar_t)behav; 7401 svd->pageadvice = 0; 7402 break; 7403 case MADV_WILLNEED: /* handled in memcntl */ 7404 case MADV_DONTNEED: /* handled in memcntl */ 7405 case MADV_FREE: /* handled above */ 7406 break; 7407 default: 7408 err = EINVAL; 7409 } 7410 } else { 7411 caddr_t eaddr; 7412 struct seg *new_seg; 7413 struct segvn_data *new_svd; 7414 u_offset_t off; 7415 caddr_t oldeaddr; 7416 7417 page = seg_page(seg, addr); 7418 7419 segvn_vpage(seg); 7420 7421 switch (behav) { 7422 struct vpage *bvpp, *evpp; 7423 7424 case MADV_ACCESS_LWP: 7425 case MADV_ACCESS_MANY: 7426 case MADV_ACCESS_DEFAULT: 7427 /* 7428 * Set memory allocation policy for portion of this 7429 * segment 7430 */ 7431 7432 /* 7433 * Align address and length of advice to page 7434 * boundaries for large pages 7435 */ 7436 if (seg->s_szc != 0) { 7437 size_t pgsz; 7438 7439 pgsz = page_get_pagesize(seg->s_szc); 7440 addr = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 7441 len = P2ROUNDUP(len, pgsz); 7442 } 7443 7444 /* 7445 * Check to see whether policy is set already 7446 */ 7447 policy = lgrp_madv_to_policy(behav, len, svd->type); 7448 7449 anon_index = svd->anon_index + page; 7450 off = svd->offset + (uintptr_t)(addr - seg->s_base); 7451 7452 if (svd->type == MAP_SHARED) 7453 already_set = lgrp_shm_policy_set(policy, amp, 7454 anon_index, vp, off, len); 7455 else 7456 already_set = 7457 (policy == svd->policy_info.mem_policy); 7458 7459 /* 7460 * If policy set already and it shouldn't be reapplied, 7461 * don't do anything. 
7462 */ 7463 if (already_set && 7464 !LGRP_MEM_POLICY_REAPPLICABLE(policy)) 7465 break; 7466 7467 /* 7468 * For private memory, need writers lock on 7469 * address space because the segment may be 7470 * split or concatenated when changing policy 7471 */ 7472 if (svd->type == MAP_PRIVATE && 7473 AS_READ_HELD(seg->s_as, &seg->s_as->a_lock)) { 7474 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7475 return (IE_RETRY); 7476 } 7477 7478 /* 7479 * Mark any existing pages in given range for 7480 * migration 7481 */ 7482 page_mark_migrate(seg, addr, len, amp, svd->anon_index, 7483 vp, svd->offset, 1); 7484 7485 /* 7486 * Don't need to try to split or concatenate 7487 * segments, since policy is same or this is a shared 7488 * memory segment 7489 */ 7490 if (already_set || svd->type == MAP_SHARED) 7491 break; 7492 7493 /* 7494 * Split off new segment if advice only applies to a 7495 * portion of existing segment starting in middle 7496 */ 7497 new_seg = NULL; 7498 eaddr = addr + len; 7499 oldeaddr = seg->s_base + seg->s_size; 7500 if (addr > seg->s_base) { 7501 /* 7502 * Must flush I/O page cache 7503 * before splitting segment 7504 */ 7505 if (svd->softlockcnt > 0) 7506 segvn_purge(seg); 7507 7508 /* 7509 * Split segment and return IE_REATTACH to tell 7510 * as_ctl() that current segment changed 7511 */ 7512 new_seg = segvn_split_seg(seg, addr); 7513 new_svd = (struct segvn_data *)new_seg->s_data; 7514 err = IE_REATTACH; 7515 7516 /* 7517 * If new segment ends where old one 7518 * did, try to concatenate the new 7519 * segment with next one. 7520 */ 7521 if (eaddr == oldeaddr) { 7522 /* 7523 * Set policy for new segment 7524 */ 7525 (void) lgrp_privm_policy_set(policy, 7526 &new_svd->policy_info, 7527 new_seg->s_size); 7528 7529 next = AS_SEGNEXT(new_seg->s_as, 7530 new_seg); 7531 7532 if (next && 7533 next->s_ops == &segvn_ops && 7534 eaddr == next->s_base) 7535 (void) segvn_concat(new_seg, 7536 next, 1); 7537 } 7538 } 7539 7540 /* 7541 * Split off end of existing segment if advice only 7542 * applies to a portion of segment ending before 7543 * end of the existing segment 7544 */ 7545 if (eaddr < oldeaddr) { 7546 /* 7547 * Must flush I/O page cache 7548 * before splitting segment 7549 */ 7550 if (svd->softlockcnt > 0) 7551 segvn_purge(seg); 7552 7553 /* 7554 * If beginning of old segment was already 7555 * split off, use new segment to split end off 7556 * from. 7557 */ 7558 if (new_seg != NULL && new_seg != seg) { 7559 /* 7560 * Split segment 7561 */ 7562 (void) segvn_split_seg(new_seg, eaddr); 7563 7564 /* 7565 * Set policy for new segment 7566 */ 7567 (void) lgrp_privm_policy_set(policy, 7568 &new_svd->policy_info, 7569 new_seg->s_size); 7570 } else { 7571 /* 7572 * Split segment and return IE_REATTACH 7573 * to tell as_ctl() that current 7574 * segment changed 7575 */ 7576 (void) segvn_split_seg(seg, eaddr); 7577 err = IE_REATTACH; 7578 7579 (void) lgrp_privm_policy_set(policy, 7580 &svd->policy_info, seg->s_size); 7581 7582 /* 7583 * If new segment starts where old one 7584 * did, try to concatenate it with 7585 * previous segment. 
7586 */ 7587 if (addr == seg->s_base) { 7588 prev = AS_SEGPREV(seg->s_as, 7589 seg); 7590 7591 /* 7592 * Drop lock for private data 7593 * of current segment before 7594 * concatenating (deleting) it 7595 */ 7596 if (prev && 7597 prev->s_ops == 7598 &segvn_ops && 7599 addr == prev->s_base + 7600 prev->s_size) { 7601 SEGVN_LOCK_EXIT( 7602 seg->s_as, 7603 &svd->lock); 7604 (void) segvn_concat( 7605 prev, seg, 1); 7606 return (err); 7607 } 7608 } 7609 } 7610 } 7611 break; 7612 case MADV_SEQUENTIAL: 7613 ASSERT(seg->s_szc == 0); 7614 hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD); 7615 /* FALLTHROUGH */ 7616 case MADV_NORMAL: 7617 case MADV_RANDOM: 7618 bvpp = &svd->vpage[page]; 7619 evpp = &svd->vpage[page + (len >> PAGESHIFT)]; 7620 for (; bvpp < evpp; bvpp++) 7621 VPP_SETADVICE(bvpp, behav); 7622 svd->advice = MADV_NORMAL; 7623 break; 7624 case MADV_WILLNEED: /* handled in memcntl */ 7625 case MADV_DONTNEED: /* handled in memcntl */ 7626 case MADV_FREE: /* handled above */ 7627 break; 7628 default: 7629 err = EINVAL; 7630 } 7631 } 7632 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7633 return (err); 7634 } 7635 7636 /* 7637 * Create a vpage structure for this seg. 7638 */ 7639 static void 7640 segvn_vpage(struct seg *seg) 7641 { 7642 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7643 struct vpage *vp, *evp; 7644 7645 ASSERT(SEGVN_WRITE_HELD(seg->s_as, &svd->lock)); 7646 7647 /* 7648 * If no vpage structure exists, allocate one. Copy the protections 7649 * and the advice from the segment itself to the individual pages. 7650 */ 7651 if (svd->vpage == NULL) { 7652 svd->pageprot = 1; 7653 svd->pageadvice = 1; 7654 svd->vpage = kmem_zalloc(seg_pages(seg) * sizeof (struct vpage), 7655 KM_SLEEP); 7656 evp = &svd->vpage[seg_page(seg, seg->s_base + seg->s_size)]; 7657 for (vp = svd->vpage; vp < evp; vp++) { 7658 VPP_SETPROT(vp, svd->prot); 7659 VPP_SETADVICE(vp, svd->advice); 7660 } 7661 } 7662 } 7663 7664 /* 7665 * Dump the pages belonging to this segvn segment. 7666 */ 7667 static void 7668 segvn_dump(struct seg *seg) 7669 { 7670 struct segvn_data *svd; 7671 page_t *pp; 7672 struct anon_map *amp; 7673 ulong_t anon_index; 7674 struct vnode *vp; 7675 u_offset_t off, offset; 7676 pfn_t pfn; 7677 pgcnt_t page, npages; 7678 caddr_t addr; 7679 7680 npages = seg_pages(seg); 7681 svd = (struct segvn_data *)seg->s_data; 7682 vp = svd->vp; 7683 off = offset = svd->offset; 7684 addr = seg->s_base; 7685 7686 if ((amp = svd->amp) != NULL) { 7687 anon_index = svd->anon_index; 7688 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7689 } 7690 7691 for (page = 0; page < npages; page++, offset += PAGESIZE) { 7692 struct anon *ap; 7693 int we_own_it = 0; 7694 7695 if (amp && (ap = anon_get_ptr(svd->amp->ahp, anon_index++))) { 7696 swap_xlate_nopanic(ap, &vp, &off); 7697 } else { 7698 vp = svd->vp; 7699 off = offset; 7700 } 7701 7702 /* 7703 * If pp == NULL, the page either does not exist 7704 * or is exclusively locked. So determine if it 7705 * exists before searching for it. 7706 */ 7707 7708 if ((pp = page_lookup_nowait(vp, off, SE_SHARED))) 7709 we_own_it = 1; 7710 else 7711 pp = page_exists(vp, off); 7712 7713 if (pp) { 7714 pfn = page_pptonum(pp); 7715 dump_addpage(seg->s_as, addr, pfn); 7716 if (we_own_it) 7717 page_unlock(pp); 7718 } 7719 addr += PAGESIZE; 7720 dump_timeleft = dump_timeout; 7721 } 7722 7723 if (amp != NULL) 7724 ANON_LOCK_EXIT(&->a_rwlock); 7725 } 7726 7727 /* 7728 * lock/unlock anon pages over a given range. 
Return shadow list 7729 */ 7730 static int 7731 segvn_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***ppp, 7732 enum lock_type type, enum seg_rw rw) 7733 { 7734 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7735 size_t np, adjustpages = 0, npages = (len >> PAGESHIFT); 7736 ulong_t anon_index; 7737 uint_t protchk; 7738 uint_t error; 7739 struct anon_map *amp; 7740 struct page **pplist, **pl, *pp; 7741 caddr_t a; 7742 size_t page; 7743 caddr_t lpgaddr, lpgeaddr; 7744 pgcnt_t szc0_npages = 0; 7745 7746 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_START, 7747 "segvn_pagelock: start seg %p addr %p", seg, addr); 7748 7749 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 7750 if (seg->s_szc != 0 && (type == L_PAGELOCK || type == L_PAGEUNLOCK)) { 7751 /* 7752 * We are adjusting the pagelock region to the large page size 7753 * boundary because the unlocked part of a large page cannot 7754 * be freed anyway unless all constituent pages of a large 7755 * page are locked. Therefore this adjustment allows us to 7756 * decrement availrmem by the right value (note we don't want 7757 * to just decrement availrem by the large page size without 7758 * adjusting addr and len because then we may end up 7759 * decrementing availrmem by large page size for every 7760 * constituent page locked by a new as_pagelock call). 7761 * as_pageunlock caller must always match as_pagelock call's 7762 * addr and len. 7763 * 7764 * Note segment's page size cannot change while we are holding 7765 * as lock. And then it cannot change while softlockcnt is 7766 * not 0. This will allow us to correctly recalculate large 7767 * page size region for the matching pageunlock/reclaim call. 7768 * 7769 * for pageunlock *ppp points to the pointer of page_t that 7770 * corresponds to the real unadjusted start address. Similar 7771 * for pagelock *ppp must point to the pointer of page_t that 7772 * corresponds to the real unadjusted start address. 7773 */ 7774 size_t pgsz = page_get_pagesize(seg->s_szc); 7775 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 7776 adjustpages = ((uintptr_t)(addr - lpgaddr)) >> PAGESHIFT; 7777 } 7778 7779 if (type == L_PAGEUNLOCK) { 7780 7781 /* 7782 * update hat ref bits for /proc. We need to make sure 7783 * that threads tracing the ref and mod bits of the 7784 * address space get the right data. 7785 * Note: page ref and mod bits are updated at reclaim time 7786 */ 7787 if (seg->s_as->a_vbits) { 7788 for (a = addr; a < addr + len; a += PAGESIZE) { 7789 if (rw == S_WRITE) { 7790 hat_setstat(seg->s_as, a, 7791 PAGESIZE, P_REF | P_MOD); 7792 } else { 7793 hat_setstat(seg->s_as, a, 7794 PAGESIZE, P_REF); 7795 } 7796 } 7797 } 7798 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7799 if (seg->s_szc != 0) { 7800 VM_STAT_ADD(segvnvmstats.pagelock[0]); 7801 seg_pinactive(seg, lpgaddr, lpgeaddr - lpgaddr, 7802 *ppp - adjustpages, rw, segvn_reclaim); 7803 } else { 7804 seg_pinactive(seg, addr, len, *ppp, rw, segvn_reclaim); 7805 } 7806 7807 /* 7808 * If someone is blocked while unmapping, we purge 7809 * segment page cache and thus reclaim pplist synchronously 7810 * without waiting for seg_pasync_thread. This speeds up 7811 * unmapping in cases where munmap(2) is called, while 7812 * raw async i/o is still in progress or where a thread 7813 * exits on data fault in a multithreaded application. 
7814 */ 7815 if (AS_ISUNMAPWAIT(seg->s_as) && (svd->softlockcnt > 0)) { 7816 /* 7817 * Even if we grab segvn WRITER's lock or segp_slock 7818 * here, there might be another thread which could've 7819 * successfully performed lookup/insert just before 7820 * we acquired the lock here. So, grabbing either 7821 * lock here is of not much use. Until we devise 7822 * a strategy at upper layers to solve the 7823 * synchronization issues completely, we expect 7824 * applications to handle this appropriately. 7825 */ 7826 segvn_purge(seg); 7827 } 7828 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7829 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END, 7830 "segvn_pagelock: unlock seg %p addr %p", seg, addr); 7831 return (0); 7832 } else if (type == L_PAGERECLAIM) { 7833 VM_STAT_COND_ADD(seg->s_szc != 0, segvnvmstats.pagelock[1]); 7834 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7835 (void) segvn_reclaim(seg, addr, len, *ppp, rw); 7836 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7837 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END, 7838 "segvn_pagelock: reclaim seg %p addr %p", seg, addr); 7839 return (0); 7840 } 7841 7842 if (seg->s_szc != 0) { 7843 VM_STAT_ADD(segvnvmstats.pagelock[2]); 7844 addr = lpgaddr; 7845 len = lpgeaddr - lpgaddr; 7846 npages = (len >> PAGESHIFT); 7847 } 7848 7849 /* 7850 * for now we only support pagelock to anon memory. We've to check 7851 * protections for vnode objects and call into the vnode driver. 7852 * That's too much for a fast path. Let the fault entry point handle it. 7853 */ 7854 if (svd->vp != NULL) { 7855 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, 7856 "segvn_pagelock: mapped vnode seg %p addr %p", seg, addr); 7857 *ppp = NULL; 7858 return (ENOTSUP); 7859 } 7860 7861 /* 7862 * if anonmap is not yet created, let the fault entry point populate it 7863 * with anon ptrs. 7864 */ 7865 if ((amp = svd->amp) == NULL) { 7866 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, 7867 "segvn_pagelock: anonmap null seg %p addr %p", seg, addr); 7868 *ppp = NULL; 7869 return (EFAULT); 7870 } 7871 7872 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7873 7874 /* 7875 * we acquire segp_slock to prevent duplicate entries 7876 * in seg_pcache 7877 */ 7878 mutex_enter(&svd->segp_slock); 7879 7880 /* 7881 * try to find pages in segment page cache 7882 */ 7883 pplist = seg_plookup(seg, addr, len, rw); 7884 if (pplist != NULL) { 7885 mutex_exit(&svd->segp_slock); 7886 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7887 *ppp = pplist + adjustpages; 7888 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_HIT_END, 7889 "segvn_pagelock: cache hit seg %p addr %p", seg, addr); 7890 return (0); 7891 } 7892 7893 if (rw == S_READ) { 7894 protchk = PROT_READ; 7895 } else { 7896 protchk = PROT_WRITE; 7897 } 7898 7899 if (svd->pageprot == 0) { 7900 if ((svd->prot & protchk) == 0) { 7901 mutex_exit(&svd->segp_slock); 7902 error = EFAULT; 7903 goto out; 7904 } 7905 } else { 7906 /* 7907 * check page protections 7908 */ 7909 for (a = addr; a < addr + len; a += PAGESIZE) { 7910 struct vpage *vp; 7911 7912 vp = &svd->vpage[seg_page(seg, a)]; 7913 if ((VPP_PROT(vp) & protchk) == 0) { 7914 mutex_exit(&svd->segp_slock); 7915 error = EFAULT; 7916 goto out; 7917 } 7918 } 7919 } 7920 7921 /* 7922 * Avoid per page overhead of segvn_pp_lock_anonpages() for small 7923 * pages. For large pages segvn_pp_lock_anonpages() only does real 7924 * work once per large page. The tradeoff is that we may decrement 7925 * availrmem more than once for the same page but this is ok 7926 * for small pages. 
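 *
 * For the szc == 0 case below (added commentary), availrmem is debited for
 * the whole request up front under freemem_lock; pages that turn out to be
 * part of a large-page shared anon mapping are accounted for by
 * segvn_pp_lock_anonpages() instead, so the surplus (npages - szc0_npages)
 * is credited back once the loop succeeds.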
7927 */ 7928 if (seg->s_szc == 0) { 7929 mutex_enter(&freemem_lock); 7930 if (availrmem < tune.t_minarmem + npages) { 7931 mutex_exit(&freemem_lock); 7932 mutex_exit(&svd->segp_slock); 7933 error = ENOMEM; 7934 goto out; 7935 } 7936 availrmem -= npages; 7937 mutex_exit(&freemem_lock); 7938 } 7939 7940 pplist = kmem_alloc(sizeof (page_t *) * npages, KM_SLEEP); 7941 pl = pplist; 7942 *ppp = pplist + adjustpages; 7943 7944 page = seg_page(seg, addr); 7945 anon_index = svd->anon_index + page; 7946 7947 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7948 for (a = addr; a < addr + len; a += PAGESIZE, anon_index++) { 7949 struct anon *ap; 7950 struct vnode *vp; 7951 u_offset_t off; 7952 anon_sync_obj_t cookie; 7953 7954 anon_array_enter(amp, anon_index, &cookie); 7955 ap = anon_get_ptr(amp->ahp, anon_index); 7956 if (ap == NULL) { 7957 anon_array_exit(&cookie); 7958 break; 7959 } else { 7960 /* 7961 * We must never use seg_pcache for COW pages 7962 * because we might end up with original page still 7963 * lying in seg_pcache even after private page is 7964 * created. This leads to data corruption as 7965 * aio_write refers to the page still in cache 7966 * while all other accesses refer to the private 7967 * page. 7968 */ 7969 if (ap->an_refcnt != 1) { 7970 anon_array_exit(&cookie); 7971 break; 7972 } 7973 } 7974 swap_xlate(ap, &vp, &off); 7975 anon_array_exit(&cookie); 7976 7977 pp = page_lookup_nowait(vp, off, SE_SHARED); 7978 if (pp == NULL) { 7979 break; 7980 } 7981 if (seg->s_szc != 0 || pp->p_szc != 0) { 7982 if (!segvn_pp_lock_anonpages(pp, a == addr)) { 7983 page_unlock(pp); 7984 break; 7985 } 7986 } else { 7987 szc0_npages++; 7988 } 7989 *pplist++ = pp; 7990 } 7991 ANON_LOCK_EXIT(&->a_rwlock); 7992 7993 ASSERT(npages >= szc0_npages); 7994 7995 if (a >= addr + len) { 7996 mutex_enter(&freemem_lock); 7997 if (seg->s_szc == 0 && npages != szc0_npages) { 7998 ASSERT(svd->type == MAP_SHARED && amp->a_szc > 0); 7999 availrmem += (npages - szc0_npages); 8000 } 8001 svd->softlockcnt += npages; 8002 segvn_pages_locked += npages; 8003 mutex_exit(&freemem_lock); 8004 (void) seg_pinsert(seg, addr, len, pl, rw, SEGP_ASYNC_FLUSH, 8005 segvn_reclaim); 8006 mutex_exit(&svd->segp_slock); 8007 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8008 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_FILL_END, 8009 "segvn_pagelock: cache fill seg %p addr %p", seg, addr); 8010 return (0); 8011 } 8012 8013 mutex_exit(&svd->segp_slock); 8014 if (seg->s_szc == 0) { 8015 mutex_enter(&freemem_lock); 8016 availrmem += npages; 8017 mutex_exit(&freemem_lock); 8018 } 8019 error = EFAULT; 8020 pplist = pl; 8021 np = ((uintptr_t)(a - addr)) >> PAGESHIFT; 8022 while (np > (uint_t)0) { 8023 ASSERT(PAGE_LOCKED(*pplist)); 8024 if (seg->s_szc != 0 || (*pplist)->p_szc != 0) { 8025 segvn_pp_unlock_anonpages(*pplist, pplist == pl); 8026 } 8027 page_unlock(*pplist); 8028 np--; 8029 pplist++; 8030 } 8031 kmem_free(pl, sizeof (page_t *) * npages); 8032 out: 8033 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8034 *ppp = NULL; 8035 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, 8036 "segvn_pagelock: cache miss seg %p addr %p", seg, addr); 8037 return (error); 8038 } 8039 8040 /* 8041 * purge any cached pages in the I/O page cache 8042 */ 8043 static void 8044 segvn_purge(struct seg *seg) 8045 { 8046 seg_ppurge(seg); 8047 } 8048 8049 static int 8050 segvn_reclaim(struct seg *seg, caddr_t addr, size_t len, struct page **pplist, 8051 enum seg_rw rw) 8052 { 8053 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 8054 pgcnt_t np, npages; 8055 struct page **pl; 
static int
segvn_reclaim(struct seg *seg, caddr_t addr, size_t len, struct page **pplist,
	enum seg_rw rw)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	pgcnt_t np, npages;
	struct page **pl;
	pgcnt_t szc0_npages = 0;

#ifdef lint
	addr = addr;
#endif

	npages = np = (len >> PAGESHIFT);
	ASSERT(npages);
	pl = pplist;
	if (seg->s_szc != 0) {
		size_t pgsz = page_get_pagesize(seg->s_szc);
		if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) {
			panic("segvn_reclaim: unaligned addr or len");
			/*NOTREACHED*/
		}
	}

	ASSERT(svd->vp == NULL && svd->amp != NULL);

	while (np > (uint_t)0) {
		if (rw == S_WRITE) {
			hat_setrefmod(*pplist);
		} else {
			hat_setref(*pplist);
		}
		if (seg->s_szc != 0 || (*pplist)->p_szc != 0) {
			segvn_pp_unlock_anonpages(*pplist, pplist == pl);
		} else {
			szc0_npages++;
		}
		page_unlock(*pplist);
		np--;
		pplist++;
	}
	kmem_free(pl, sizeof (page_t *) * npages);

	mutex_enter(&freemem_lock);
	segvn_pages_locked -= npages;
	svd->softlockcnt -= npages;
	if (szc0_npages != 0) {
		availrmem += szc0_npages;
	}
	mutex_exit(&freemem_lock);
	if (svd->softlockcnt <= 0) {
		if (AS_ISUNMAPWAIT(seg->s_as)) {
			mutex_enter(&seg->s_as->a_contents);
			if (AS_ISUNMAPWAIT(seg->s_as)) {
				AS_CLRUNMAPWAIT(seg->s_as);
				cv_broadcast(&seg->s_as->a_cv);
			}
			mutex_exit(&seg->s_as->a_contents);
		}
	}
	return (0);
}

/*
 * get a memory ID for an addr in a given segment
 *
 * XXX only creates PAGESIZE pages if anon slots are not initialized.
 * At fault time they will be relocated into larger pages.
 */
static int
segvn_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct anon *ap = NULL;
	ulong_t anon_index;
	struct anon_map *amp;
	anon_sync_obj_t cookie;

	if (svd->type == MAP_PRIVATE) {
		memidp->val[0] = (uintptr_t)seg->s_as;
		memidp->val[1] = (uintptr_t)addr;
		return (0);
	}

	if (svd->type == MAP_SHARED) {
		if (svd->vp) {
			memidp->val[0] = (uintptr_t)svd->vp;
			memidp->val[1] = (u_longlong_t)svd->offset +
			    (uintptr_t)(addr - seg->s_base);
			return (0);
		} else {

			SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
			if ((amp = svd->amp) != NULL) {
				anon_index = svd->anon_index +
				    seg_page(seg, addr);
			}
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);

			ASSERT(amp != NULL);

			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
			anon_array_enter(amp, anon_index, &cookie);
			ap = anon_get_ptr(amp->ahp, anon_index);
			if (ap == NULL) {
				page_t *pp;

				pp = anon_zero(seg, addr, &ap, svd->cred);
				if (pp == NULL) {
					anon_array_exit(&cookie);
					ANON_LOCK_EXIT(&amp->a_rwlock);
					return (ENOMEM);
				}
				ASSERT(anon_get_ptr(amp->ahp, anon_index)
				    == NULL);
				(void) anon_set_ptr(amp->ahp, anon_index,
				    ap, ANON_SLEEP);
				page_unlock(pp);
			}

			anon_array_exit(&cookie);
			ANON_LOCK_EXIT(&amp->a_rwlock);

			memidp->val[0] = (uintptr_t)ap;
			memidp->val[1] = (uintptr_t)addr & PAGEOFFSET;
			return (0);
		}
	}
	return (EINVAL);
}

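/*
 * Worked example for segvn_getmemid() (illustrative values only, chosen for
 * this comment): for a MAP_SHARED, vnode-backed mapping with
 * seg->s_base == 0x10000 and svd->offset == 0x2000, a call such as
 *
 *	memid_t memid;
 *	(void) segvn_getmemid(seg, (caddr_t)0x13000, &memid);
 *
 * fills in memid.val[0] = (uintptr_t)svd->vp and
 * memid.val[1] = 0x2000 + (0x13000 - 0x10000) = 0x5000, i.e. the backing
 * vnode and the byte offset within it.  For shared anonymous memory the
 * pair is instead the anon slot pointer and the offset of addr within its
 * page; for MAP_PRIVATE it is simply (address space, virtual address).
 */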
/*
 * Return 1 if every page in [a, a + len) has the same per-page protections
 * (trivially true when per-page protections are not in use), 0 otherwise.
 */
static int
sameprot(struct seg *seg, caddr_t a, size_t len)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct vpage *vpage;
	spgcnt_t pages = btop(len);
	uint_t prot;

	if (svd->pageprot == 0)
		return (1);

	ASSERT(svd->vpage != NULL);

	vpage = &svd->vpage[seg_page(seg, a)];
	prot = VPP_PROT(vpage);
	vpage++;
	pages--;
	while (pages-- > 0) {
		if (prot != VPP_PROT(vpage))
			return (0);
		vpage++;
	}
	return (1);
}

/*
 * Get memory allocation policy info for specified address in given segment
 */
static lgrp_mem_policy_info_t *
segvn_getpolicy(struct seg *seg, caddr_t addr)
{
	struct anon_map *amp;
	ulong_t anon_index;
	lgrp_mem_policy_info_t *policy_info;
	struct segvn_data *svn_data;
	u_offset_t vn_off;
	vnode_t *vp;

	ASSERT(seg != NULL);

	svn_data = (struct segvn_data *)seg->s_data;
	if (svn_data == NULL)
		return (NULL);

	/*
	 * Get policy info for private or shared memory
	 */
	if (svn_data->type != MAP_SHARED)
		policy_info = &svn_data->policy_info;
	else {
		amp = svn_data->amp;
		anon_index = svn_data->anon_index + seg_page(seg, addr);
		vp = svn_data->vp;
		vn_off = svn_data->offset + (uintptr_t)(addr - seg->s_base);
		policy_info = lgrp_shm_policy_get(amp, anon_index, vp, vn_off);
	}

	return (policy_info);
}

/*ARGSUSED*/
static int
segvn_capable(struct seg *seg, segcapability_t capability)
{
	return (0);
}

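/*
 * Illustrative sketch only (the SEGOP_GETPOLICY() dispatch macro and the
 * mem_policy field/LGRP_MEM_POLICY_NEXT constant are assumptions about
 * <vm/seg.h> and <sys/lgrp.h>, not defined in this file): generic VM code
 * is expected to reach the two routines above through the seg_ops vector
 * rather than by name, roughly:
 *
 *	lgrp_mem_policy_info_t *pi = SEGOP_GETPOLICY(seg, addr);
 *	if (pi != NULL && pi->mem_policy == LGRP_MEM_POLICY_NEXT)
 *		... prefer the allocating thread's home lgroup ...
 *
 * segvn advertises no optional segment capabilities, which is why
 * segvn_capable() unconditionally returns 0.
 */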