/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

/*
 * VM - shared or copy-on-write from a vnode/anonymous memory.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/debug.h>
#include <sys/cred.h>
#include <sys/vmsystm.h>
#include <sys/tuneable.h>
#include <sys/bitmap.h>
#include <sys/swap.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/vtrace.h>
#include <sys/cmn_err.h>
#include <sys/vm.h>
#include <sys/dumphdr.h>
#include <sys/lgrp.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_vn.h>
#include <vm/pvn.h>
#include <vm/anon.h>
#include <vm/page.h>
#include <vm/vpage.h>

/*
 * Private seg op routines.
 */
static int	segvn_dup(struct seg *seg, struct seg *newseg);
static int	segvn_unmap(struct seg *seg, caddr_t addr, size_t len);
static void	segvn_free(struct seg *seg);
static faultcode_t segvn_fault(struct hat *hat, struct seg *seg,
		    caddr_t addr, size_t len, enum fault_type type,
		    enum seg_rw rw);
static faultcode_t segvn_faulta(struct seg *seg, caddr_t addr);
static int	segvn_setprot(struct seg *seg, caddr_t addr,
		    size_t len, uint_t prot);
static int	segvn_checkprot(struct seg *seg, caddr_t addr,
		    size_t len, uint_t prot);
static int	segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta);
static size_t	segvn_swapout(struct seg *seg);
static int	segvn_sync(struct seg *seg, caddr_t addr, size_t len,
		    int attr, uint_t flags);
static size_t	segvn_incore(struct seg *seg, caddr_t addr, size_t len,
		    char *vec);
static int	segvn_lockop(struct seg *seg, caddr_t addr, size_t len,
		    int attr, int op, ulong_t *lockmap, size_t pos);
static int	segvn_getprot(struct seg *seg, caddr_t addr, size_t len,
		    uint_t *protv);
static u_offset_t	segvn_getoffset(struct seg *seg, caddr_t addr);
static int	segvn_gettype(struct seg *seg, caddr_t addr);
static int	segvn_getvp(struct seg *seg, caddr_t addr,
		    struct vnode **vpp);
static int	segvn_advise(struct seg *seg, caddr_t addr, size_t len,
		    uint_t behav);
static void	segvn_dump(struct seg *seg);
static int	segvn_pagelock(struct seg *seg, caddr_t addr, size_t len,
		    struct page ***ppp, enum lock_type type, enum seg_rw rw);
static int	segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len,
		    uint_t szc);
static int	segvn_getmemid(struct seg *seg, caddr_t addr,
		    memid_t *memidp);
static lgrp_mem_policy_info_t	*segvn_getpolicy(struct seg *, caddr_t);
static int	segvn_capable(struct seg *seg, segcapability_t capable);

struct seg_ops segvn_ops = {
	segvn_dup,
	segvn_unmap,
	segvn_free,
	segvn_fault,
	segvn_faulta,
	segvn_setprot,
	segvn_checkprot,
	segvn_kluster,
	segvn_swapout,
	segvn_sync,
	segvn_incore,
	segvn_lockop,
	segvn_getprot,
	segvn_getoffset,
	segvn_gettype,
	segvn_getvp,
	segvn_advise,
	segvn_dump,
	segvn_pagelock,
	segvn_setpagesize,
	segvn_getmemid,
	segvn_getpolicy,
	segvn_capable,
};

/*
 * Common zfod structures, provided as a shorthand for others to use.
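 *
 * Hypothetical usage sketch (not part of the original file): a caller
 * creates a zero-fill-on-demand mapping by handing one of the argsp
 * pointers below to as_map(), along the lines of
 *
 *	(void) as_map(as, addr, len, segvn_create, zfod_argsp);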
 */
static segvn_crargs_t zfod_segvn_crargs =
    SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
static segvn_crargs_t kzfod_segvn_crargs =
    SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_USER,
    PROT_ALL & ~PROT_USER);
static segvn_crargs_t stack_noexec_crargs =
    SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_EXEC, PROT_ALL);

caddr_t zfod_argsp = (caddr_t)&zfod_segvn_crargs;	/* user zfod argsp */
caddr_t kzfod_argsp = (caddr_t)&kzfod_segvn_crargs;	/* kernel zfod argsp */
caddr_t stack_exec_argsp = (caddr_t)&zfod_segvn_crargs;	/* executable stack */
caddr_t stack_noexec_argsp = (caddr_t)&stack_noexec_crargs; /* noexec stack */

#define	vpgtob(n)	((n) * sizeof (struct vpage))	/* For brevity */

size_t	segvn_comb_thrshld = UINT_MAX;	/* patchable -- see 1196681 */

static int	segvn_concat(struct seg *, struct seg *, int);
static int	segvn_extend_prev(struct seg *, struct seg *,
		    struct segvn_crargs *, size_t);
static int	segvn_extend_next(struct seg *, struct seg *,
		    struct segvn_crargs *, size_t);
static void	segvn_softunlock(struct seg *, caddr_t, size_t, enum seg_rw);
static void	segvn_pagelist_rele(page_t **);
static void	segvn_setvnode_mpss(vnode_t *);
static void	segvn_relocate_pages(page_t **, page_t *);
static int	segvn_full_szcpages(page_t **, uint_t, int *, uint_t *);
static int	segvn_fill_vp_pages(struct segvn_data *, vnode_t *, u_offset_t,
    uint_t, page_t **, page_t **, uint_t *, int *);
static faultcode_t segvn_fault_vnodepages(struct hat *, struct seg *, caddr_t,
    caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int);
static faultcode_t segvn_fault_anonpages(struct hat *, struct seg *, caddr_t,
    caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int);
static faultcode_t segvn_faultpage(struct hat *, struct seg *, caddr_t,
    u_offset_t, struct vpage *, page_t **, uint_t,
    enum fault_type, enum seg_rw, int, int);
static void	segvn_vpage(struct seg *);

static void segvn_purge(struct seg *seg);
static int segvn_reclaim(struct seg *, caddr_t, size_t, struct page **,
    enum seg_rw);

static int sameprot(struct seg *, caddr_t, size_t);

static int segvn_demote_range(struct seg *, caddr_t, size_t, int, uint_t);
static int segvn_clrszc(struct seg *);
static struct seg *segvn_split_seg(struct seg *, caddr_t);
static int segvn_claim_pages(struct seg *, struct vpage *, u_offset_t,
    ulong_t, uint_t);

static int segvn_pp_lock_anonpages(page_t *, int);
static void segvn_pp_unlock_anonpages(page_t *, int);

static struct kmem_cache *segvn_cache;

#ifdef VM_STATS
static struct segvnvmstats_str {
	ulong_t	fill_vp_pages[31];
	ulong_t	fltvnpages[49];
	ulong_t	fullszcpages[10];
	ulong_t	relocatepages[3];
	ulong_t	fltanpages[17];
	ulong_t	pagelock[3];
	ulong_t	demoterange[3];
} segvnvmstats;
#endif /* VM_STATS */

#define	SDR_RANGE	1		/* demote entire range */
#define	SDR_END		2		/* demote non aligned ends only */

#define	CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr) {	\
	if ((len) != 0) {						\
		lpgaddr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz);	\
		ASSERT(lpgaddr >= (seg)->s_base);			\
		lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)((addr) +	\
		    (len)), pgsz);					\
		ASSERT(lpgeaddr > lpgaddr);				\
		ASSERT(lpgeaddr <= (seg)->s_base + (seg)->s_size);	\
	} else {							\
		lpgeaddr = lpgaddr = (addr);				\
	}								\
}
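
/*
 * For example, with a 4M large page size (pgsz == 0x400000),
 * addr == 0x10123000 and len == 0x2000, CALC_LPG_REGION() yields
 * lpgaddr == 0x10000000 and lpgeaddr == 0x10400000, the smallest
 * pgsz-aligned region covering [addr, addr + len).
 */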
/*ARGSUSED*/
static int
segvn_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct segvn_data *svd = buf;

	rw_init(&svd->lock, NULL, RW_DEFAULT, NULL);
	mutex_init(&svd->segp_slock, NULL, MUTEX_DEFAULT, NULL);
	return (0);
}

/*ARGSUSED1*/
static void
segvn_cache_destructor(void *buf, void *cdrarg)
{
	struct segvn_data *svd = buf;

	rw_destroy(&svd->lock);
	mutex_destroy(&svd->segp_slock);
}

/*
 * Patching this variable to non-zero allows the system to run with
 * stacks marked as "not executable".  It's a bit of a kludge, but is
 * provided as a tweakable for platforms that export those ABIs
 * (e.g. sparc V8) that have executable stacks enabled by default.
 * There are also some restrictions for platforms that don't actually
 * implement 'noexec' protections.
 *
 * Once enabled, the system is (therefore) unable to provide a fully
 * ABI-compliant execution environment, though practically speaking,
 * most everything works.  The exceptions are generally some interpreters
 * and debuggers that create executable code on the stack and jump
 * into it (without explicitly mprotecting the address range to include
 * PROT_EXEC).
 *
 * One important class of applications that are disabled are those
 * that have been transformed into malicious agents using one of the
 * numerous "buffer overflow" attacks.  See 4007890.
 */
int noexec_user_stack = 0;
int noexec_user_stack_log = 1;

int segvn_lpg_disable = 0;
uint_t segvn_maxpgszc = 0;

ulong_t segvn_vmpss_clrszc_cnt;
ulong_t segvn_vmpss_clrszc_err;
ulong_t segvn_fltvnpages_clrszc_cnt;
ulong_t segvn_fltvnpages_clrszc_err;
ulong_t segvn_setpgsz_align_err;
ulong_t segvn_setpgsz_anon_align_err;
ulong_t segvn_setpgsz_getattr_err;
ulong_t segvn_setpgsz_eof_err;
ulong_t segvn_faultvnmpss_align_err1;
ulong_t segvn_faultvnmpss_align_err2;
ulong_t segvn_faultvnmpss_align_err3;
ulong_t segvn_faultvnmpss_align_err4;
ulong_t segvn_faultvnmpss_align_err5;
ulong_t segvn_vmpss_pageio_deadlk_err;

/*
 * Initialize segvn data structures
 */
void
segvn_init(void)
{
	uint_t maxszc;
	uint_t szc;
	size_t pgsz;

	segvn_cache = kmem_cache_create("segvn_cache",
	    sizeof (struct segvn_data), 0,
	    segvn_cache_constructor, segvn_cache_destructor, NULL,
	    NULL, NULL, 0);

	if (segvn_lpg_disable != 0)
		return;
	szc = maxszc = page_num_pagesizes() - 1;
	if (szc == 0) {
		segvn_lpg_disable = 1;
		return;
	}
	if (page_get_pagesize(0) != PAGESIZE) {
		panic("segvn_init: bad szc 0");
		/*NOTREACHED*/
	}
	while (szc != 0) {
		pgsz = page_get_pagesize(szc);
		if (pgsz <= PAGESIZE || !IS_P2ALIGNED(pgsz, pgsz)) {
			panic("segvn_init: bad szc %d", szc);
			/*NOTREACHED*/
		}
		szc--;
	}
	if (segvn_maxpgszc == 0 || segvn_maxpgszc > maxszc)
		segvn_maxpgszc = maxszc;
}

#define	SEGVN_PAGEIO	((void *)0x1)
#define	SEGVN_NOPAGEIO	((void *)0x2)

static void
segvn_setvnode_mpss(vnode_t *vp)
{
	int err;

	ASSERT(vp->v_mpssdata == NULL ||
	    vp->v_mpssdata == SEGVN_PAGEIO ||
	    vp->v_mpssdata == SEGVN_NOPAGEIO);

	if (vp->v_mpssdata == NULL) {
		if (vn_vmpss_usepageio(vp)) {
			err = VOP_PAGEIO(vp, (page_t *)NULL,
			    (u_offset_t)0, 0, 0, CRED());
		} else {
			err = ENOSYS;
		}
		/*
		 * set v_mpssdata just once per vnode life
		 * so that it never changes.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_mpssdata == NULL) {
			if (err == EINVAL) {
				vp->v_mpssdata = SEGVN_PAGEIO;
			} else {
				vp->v_mpssdata = SEGVN_NOPAGEIO;
			}
		}
		mutex_exit(&vp->v_lock);
	}
}

int
segvn_create(struct seg *seg, void *argsp)
{
	struct segvn_crargs *a = (struct segvn_crargs *)argsp;
	struct segvn_data *svd;
	size_t swresv = 0;
	struct cred *cred;
	struct anon_map *amp;
	int error = 0;
	size_t pgsz;
	lgrp_mem_policy_t mpolicy = LGRP_MEM_POLICY_DEFAULT;


	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));

	if (a->type != MAP_PRIVATE && a->type != MAP_SHARED) {
		panic("segvn_create type");
		/*NOTREACHED*/
	}

	/*
	 * Check arguments.  If a shared anon structure is given then
	 * it is illegal to also specify a vp.
	 */
	if (a->amp != NULL && a->vp != NULL) {
		panic("segvn_create anon_map");
		/*NOTREACHED*/
	}

	/* MAP_NORESERVE on a MAP_SHARED segment is meaningless. */
	if (a->type == MAP_SHARED)
		a->flags &= ~MAP_NORESERVE;

	if (a->szc != 0) {
		if (segvn_lpg_disable != 0 ||
		    (a->amp != NULL && a->type == MAP_PRIVATE) ||
		    (a->flags & MAP_NORESERVE) || seg->s_as == &kas) {
			a->szc = 0;
		} else {
			if (a->szc > segvn_maxpgszc)
				a->szc = segvn_maxpgszc;
			pgsz = page_get_pagesize(a->szc);
			if (!IS_P2ALIGNED(seg->s_base, pgsz) ||
			    !IS_P2ALIGNED(seg->s_size, pgsz)) {
				a->szc = 0;
			} else if (a->vp != NULL) {
				extern struct vnode kvp;
				if (IS_SWAPFSVP(a->vp) || a->vp == &kvp) {
					/*
					 * paranoid check.
					 * hat_page_demote() is not supported
					 * on swapfs pages.
					 */
					a->szc = 0;
				} else if (map_addr_vacalign_check(seg->s_base,
				    a->offset & PAGEMASK)) {
					a->szc = 0;
				}
			} else if (a->amp != NULL) {
				pgcnt_t anum = btopr(a->offset);
				pgcnt_t pgcnt = page_get_pagecnt(a->szc);
				if (!IS_P2ALIGNED(anum, pgcnt)) {
					a->szc = 0;
				}
			}
		}
	}

	/*
	 * If segment may need private pages, reserve them now.
	 */
	if (!(a->flags & MAP_NORESERVE) && ((a->vp == NULL && a->amp == NULL) ||
	    (a->type == MAP_PRIVATE && (a->prot & PROT_WRITE)))) {
		if (anon_resv(seg->s_size) == 0)
			return (EAGAIN);
		swresv = seg->s_size;
		TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
		    seg, swresv, 1);
	}

	/*
	 * Reserve any mapping structures that may be required.
	 */
	hat_map(seg->s_as->a_hat, seg->s_base, seg->s_size, HAT_MAP);

	if (a->cred) {
		cred = a->cred;
		crhold(cred);
	} else {
		crhold(cred = CRED());
	}

	/* Inform the vnode of the new mapping */
	if (a->vp) {
		error = VOP_ADDMAP(a->vp, a->offset & PAGEMASK,
		    seg->s_as, seg->s_base, seg->s_size, a->prot,
		    a->maxprot, a->type, cred);
		if (error) {
			if (swresv != 0) {
				anon_unresv(swresv);
				TRACE_3(TR_FAC_VM, TR_ANON_PROC,
				    "anon proc:%p %lu %u",
				    seg, swresv, 0);
			}
			crfree(cred);
			hat_unload(seg->s_as->a_hat, seg->s_base,
			    seg->s_size, HAT_UNLOAD_UNMAP);
			return (error);
		}
	}

	/*
	 * If more than one segment in the address space, and
	 * they're adjacent virtually, try to concatenate them.
	 * Don't concatenate if an explicit anon_map structure
	 * was supplied (e.g., SystemV shared memory).
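	 *
	 * (Concatenation keeps the number of segment structures small when
	 * an address space is grown by many small, virtually adjacent
	 * mappings, e.g. repeated heap or stack extensions.)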
	 */
	if (a->amp == NULL) {
		struct seg *pseg, *nseg;
		struct segvn_data *psvd, *nsvd;
		lgrp_mem_policy_t ppolicy, npolicy;
		uint_t lgrp_mem_policy_flags = 0;
		extern lgrp_mem_policy_t lgrp_mem_default_policy;

		/*
		 * Memory policy flags (lgrp_mem_policy_flags) is valid when
		 * extending stack/heap segments.
		 */
		if ((a->vp == NULL) && (a->type == MAP_PRIVATE) &&
		    !(a->flags & MAP_NORESERVE) && (seg->s_as != &kas)) {
			lgrp_mem_policy_flags = a->lgrp_mem_policy_flags;
		} else {
			/*
			 * Get policy when not extending it from another segment
			 */
			mpolicy = lgrp_mem_policy_default(seg->s_size, a->type);
		}

		/*
		 * First, try to concatenate the previous and new segments
		 */
		pseg = AS_SEGPREV(seg->s_as, seg);
		if (pseg != NULL &&
		    pseg->s_base + pseg->s_size == seg->s_base &&
		    pseg->s_ops == &segvn_ops) {
			/*
			 * Get memory allocation policy from previous segment.
			 * When extension is specified (e.g. for heap) apply
			 * this policy to the new segment regardless of the
			 * outcome of segment concatenation.  Extension occurs
			 * for non-default policy otherwise default policy is
			 * used and is based on extended segment size.
			 */
			psvd = (struct segvn_data *)pseg->s_data;
			ppolicy = psvd->policy_info.mem_policy;
			if (lgrp_mem_policy_flags ==
			    LGRP_MP_FLAG_EXTEND_UP) {
				if (ppolicy != lgrp_mem_default_policy) {
					mpolicy = ppolicy;
				} else {
					mpolicy = lgrp_mem_policy_default(
					    pseg->s_size + seg->s_size,
					    a->type);
				}
			}

			if (mpolicy == ppolicy &&
			    (pseg->s_size + seg->s_size <=
			    segvn_comb_thrshld || psvd->amp == NULL) &&
			    segvn_extend_prev(pseg, seg, a, swresv) == 0) {
				/*
				 * success! now try to concatenate
				 * with following seg
				 */
				crfree(cred);
				nseg = AS_SEGNEXT(pseg->s_as, pseg);
				if (nseg != NULL &&
				    nseg != pseg &&
				    nseg->s_ops == &segvn_ops &&
				    pseg->s_base + pseg->s_size ==
				    nseg->s_base)
					(void) segvn_concat(pseg, nseg, 0);
				ASSERT(pseg->s_szc == 0 ||
				    (a->szc == pseg->s_szc &&
				    IS_P2ALIGNED(pseg->s_base, pgsz) &&
				    IS_P2ALIGNED(pseg->s_size, pgsz)));
				return (0);
			}
		}

		/*
		 * Failed, so try to concatenate with following seg
		 */
		nseg = AS_SEGNEXT(seg->s_as, seg);
		if (nseg != NULL &&
		    seg->s_base + seg->s_size == nseg->s_base &&
		    nseg->s_ops == &segvn_ops) {
			/*
			 * Get memory allocation policy from next segment.
			 * When extension is specified (e.g. for stack) apply
			 * this policy to the new segment regardless of the
			 * outcome of segment concatenation.  Extension occurs
			 * for non-default policy otherwise default policy is
			 * used and is based on extended segment size.
			 */
			nsvd = (struct segvn_data *)nseg->s_data;
			npolicy = nsvd->policy_info.mem_policy;
			if (lgrp_mem_policy_flags ==
			    LGRP_MP_FLAG_EXTEND_DOWN) {
				if (npolicy != lgrp_mem_default_policy) {
					mpolicy = npolicy;
				} else {
					mpolicy = lgrp_mem_policy_default(
					    nseg->s_size + seg->s_size,
					    a->type);
				}
			}

			if (mpolicy == npolicy &&
			    segvn_extend_next(seg, nseg, a, swresv) == 0) {
				crfree(cred);
				ASSERT(nseg->s_szc == 0 ||
				    (a->szc == nseg->s_szc &&
				    IS_P2ALIGNED(nseg->s_base, pgsz) &&
				    IS_P2ALIGNED(nseg->s_size, pgsz)));
				return (0);
			}
		}
	}

	if (a->vp != NULL) {
		VN_HOLD(a->vp);
		if (a->type == MAP_SHARED)
			lgrp_shm_policy_init(NULL, a->vp);
	}
	svd = kmem_cache_alloc(segvn_cache, KM_SLEEP);

	seg->s_ops = &segvn_ops;
	seg->s_data = (void *)svd;
	seg->s_szc = a->szc;

	svd->vp = a->vp;
	/*
	 * Anonymous mappings have no backing file so the offset is meaningless.
	 */
	svd->offset = a->vp ? (a->offset & PAGEMASK) : 0;
	svd->prot = a->prot;
	svd->maxprot = a->maxprot;
	svd->pageprot = 0;
	svd->type = a->type;
	svd->vpage = NULL;
	svd->cred = cred;
	svd->advice = MADV_NORMAL;
	svd->pageadvice = 0;
	svd->flags = (ushort_t)a->flags;
	svd->softlockcnt = 0;
	if (a->szc != 0 && a->vp != NULL) {
		segvn_setvnode_mpss(a->vp);
	}

	amp = a->amp;
	if ((svd->amp = amp) == NULL) {
		svd->anon_index = 0;
		if (svd->type == MAP_SHARED) {
			svd->swresv = 0;
			/*
			 * Shared mappings to a vp need no other setup.
			 * If we have a shared mapping to an anon_map object
			 * which hasn't been allocated yet, allocate the
			 * struct now so that it will be properly shared
			 * by remembering the swap reservation there.
			 */
			if (a->vp == NULL) {
				svd->amp = anonmap_alloc(seg->s_size, swresv);
				svd->amp->a_szc = seg->s_szc;
			}
		} else {
			/*
			 * Private mapping (with or without a vp).
			 * Allocate anon_map when needed.
			 */
			svd->swresv = swresv;
		}
	} else {
		pgcnt_t anon_num;

		/*
		 * Mapping to an existing anon_map structure without a vp.
		 * For now we will insure that the segment size isn't larger
		 * than the size - offset gives us.  Later on we may wish to
		 * have the anon array dynamically allocated itself so that
		 * we don't always have to allocate all the anon pointer slots.
		 * This of course involves adding extra code to check that we
		 * aren't trying to use an anon pointer slot beyond the end
		 * of the currently allocated anon array.
		 */
		if ((amp->size - a->offset) < seg->s_size) {
			panic("segvn_create anon_map size");
			/*NOTREACHED*/
		}

		anon_num = btopr(a->offset);

		if (a->type == MAP_SHARED) {
			/*
			 * SHARED mapping to a given anon_map.
			 */
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
			amp->refcnt++;
			if (a->szc > amp->a_szc) {
				amp->a_szc = a->szc;
			}
			ANON_LOCK_EXIT(&amp->a_rwlock);
			svd->anon_index = anon_num;
			svd->swresv = 0;
		} else {
			/*
			 * PRIVATE mapping to a given anon_map.
			 * Make sure that all the needed anon
			 * structures are created (so that we will
			 * share the underlying pages if nothing
			 * is written by this mapping) and then
			 * duplicate the anon array as is done
			 * when a privately mapped segment is dup'ed.
			 */
			struct anon *ap;
			caddr_t addr;
			caddr_t eaddr;
			ulong_t anon_idx;
			int hat_flag = HAT_LOAD;

			if (svd->flags & MAP_TEXT) {
				hat_flag |= HAT_LOAD_TEXT;
			}

			svd->amp = anonmap_alloc(seg->s_size, 0);
			svd->amp->a_szc = seg->s_szc;
			svd->anon_index = 0;
			svd->swresv = swresv;

			/*
			 * Prevent 2 threads from allocating anon
			 * slots simultaneously.
			 */
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
			eaddr = seg->s_base + seg->s_size;

			for (anon_idx = anon_num, addr = seg->s_base;
			    addr < eaddr; addr += PAGESIZE, anon_idx++) {
				page_t *pp;

				if ((ap = anon_get_ptr(amp->ahp,
				    anon_idx)) != NULL)
					continue;

				/*
				 * Allocate the anon struct now.
				 * Might as well load up translation
				 * to the page while we're at it...
				 */
				pp = anon_zero(seg, addr, &ap, cred);
				if (ap == NULL || pp == NULL) {
					panic("segvn_create anon_zero");
					/*NOTREACHED*/
				}

				/*
				 * Re-acquire the anon_map lock and
				 * initialize the anon array entry.
				 */
				ASSERT(anon_get_ptr(amp->ahp,
				    anon_idx) == NULL);
				(void) anon_set_ptr(amp->ahp, anon_idx, ap,
				    ANON_SLEEP);

				ASSERT(seg->s_szc == 0);
				ASSERT(!IS_VMODSORT(pp->p_vnode));

				hat_memload(seg->s_as->a_hat, addr, pp,
				    svd->prot & ~PROT_WRITE, hat_flag);

				page_unlock(pp);
			}
			ASSERT(seg->s_szc == 0);
			anon_dup(amp->ahp, anon_num, svd->amp->ahp,
			    0, seg->s_size);
			ANON_LOCK_EXIT(&amp->a_rwlock);
		}
	}

	/*
	 * Set default memory allocation policy for segment
	 *
	 * Always set policy for private memory at least for initialization
	 * even if this is a shared memory segment
	 */
	(void) lgrp_privm_policy_set(mpolicy, &svd->policy_info, seg->s_size);

	if (svd->type == MAP_SHARED)
		(void) lgrp_shm_policy_set(mpolicy, svd->amp, svd->anon_index,
		    svd->vp, svd->offset, seg->s_size);

	return (0);
}

/*
 * Concatenate two existing segments, if possible.
 * Return 0 on success, -1 if two segments are not compatible
 * or -2 on memory allocation failure.
 * If amp_cat == 1 then try and concat segments with anon maps
 */
static int
segvn_concat(struct seg *seg1, struct seg *seg2, int amp_cat)
{
	struct segvn_data *svd1 = seg1->s_data;
	struct segvn_data *svd2 = seg2->s_data;
	struct anon_map *amp1 = svd1->amp;
	struct anon_map *amp2 = svd2->amp;
	struct vpage *vpage1 = svd1->vpage;
	struct vpage *vpage2 = svd2->vpage, *nvpage = NULL;
	size_t size, nvpsize;
	pgcnt_t npages1, npages2;

	ASSERT(seg1->s_as && seg2->s_as && seg1->s_as == seg2->s_as);
	ASSERT(AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock));
	ASSERT(seg1->s_ops == seg2->s_ops);

	/* both segments exist, try to merge them */
#define	incompat(x)	(svd1->x != svd2->x)
	if (incompat(vp) || incompat(maxprot) ||
	    (!svd1->pageadvice && !svd2->pageadvice && incompat(advice)) ||
	    (!svd1->pageprot && !svd2->pageprot && incompat(prot)) ||
	    incompat(type) || incompat(cred) || incompat(flags) ||
	    seg1->s_szc != seg2->s_szc || incompat(policy_info.mem_policy) ||
	    (svd2->softlockcnt > 0))
		return (-1);
#undef incompat

	/*
	 * vp == NULL implies zfod, offset doesn't matter
	 */
	if (svd1->vp != NULL &&
	    svd1->offset + seg1->s_size != svd2->offset) {
		return (-1);
	}

	/*
	 * Fail early if we're not supposed to concatenate
	 * segments with non NULL amp.
	 */
	if (amp_cat == 0 && (amp1 != NULL || amp2 != NULL)) {
		return (-1);
	}

	if (svd1->vp == NULL && svd1->type == MAP_SHARED) {
		if (amp1 != amp2) {
			return (-1);
		}
		if (amp1 != NULL && svd1->anon_index + btop(seg1->s_size) !=
		    svd2->anon_index) {
			return (-1);
		}
		ASSERT(amp1 == NULL || amp1->refcnt >= 2);
	}

	/*
	 * If either seg has vpages, create a new merged vpage array.
	 */
	if (vpage1 != NULL || vpage2 != NULL) {
		struct vpage *vp;

		npages1 = seg_pages(seg1);
		npages2 = seg_pages(seg2);
		nvpsize = vpgtob(npages1 + npages2);

		if ((nvpage = kmem_zalloc(nvpsize, KM_NOSLEEP)) == NULL) {
			return (-2);
		}
		if (vpage1 != NULL) {
			bcopy(vpage1, nvpage, vpgtob(npages1));
		}
		if (vpage2 != NULL) {
			bcopy(vpage2, nvpage + npages1, vpgtob(npages2));
		}
		for (vp = nvpage; vp < nvpage + npages1; vp++) {
			if (svd2->pageprot && !svd1->pageprot) {
				VPP_SETPROT(vp, svd1->prot);
			}
			if (svd2->pageadvice && !svd1->pageadvice) {
				VPP_SETADVICE(vp, svd1->advice);
			}
		}
		for (vp = nvpage + npages1;
		    vp < nvpage + npages1 + npages2; vp++) {
			if (svd1->pageprot && !svd2->pageprot) {
				VPP_SETPROT(vp, svd2->prot);
			}
			if (svd1->pageadvice && !svd2->pageadvice) {
				VPP_SETADVICE(vp, svd2->advice);
			}
		}
	}

	/*
	 * If either segment has private pages, create a new merged anon
	 * array. If merging shared anon segments just decrement anon map's
	 * refcnt.
	 */
	if (amp1 != NULL && svd1->type == MAP_SHARED) {
		ASSERT(amp1 == amp2 && svd1->vp == NULL);
		ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
		ASSERT(amp1->refcnt >= 2);
		amp1->refcnt--;
		ANON_LOCK_EXIT(&amp1->a_rwlock);
		svd2->amp = NULL;
	} else if (amp1 != NULL || amp2 != NULL) {
		struct anon_hdr *nahp;
		struct anon_map *namp = NULL;
		size_t asize;

		ASSERT(svd1->type == MAP_PRIVATE);

		asize = seg1->s_size + seg2->s_size;
		if ((nahp = anon_create(btop(asize), ANON_NOSLEEP)) == NULL) {
			if (nvpage != NULL) {
				kmem_free(nvpage, nvpsize);
			}
			return (-2);
		}
		if (amp1 != NULL) {
			/*
			 * XXX anon rwlock is not really needed because
			 * this is a private segment and we are writers.
			 */
			ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
			ASSERT(amp1->refcnt == 1);
			if (anon_copy_ptr(amp1->ahp, svd1->anon_index,
			    nahp, 0, btop(seg1->s_size), ANON_NOSLEEP)) {
				anon_release(nahp, btop(asize));
				ANON_LOCK_EXIT(&amp1->a_rwlock);
				if (nvpage != NULL) {
					kmem_free(nvpage, nvpsize);
				}
				return (-2);
			}
		}
		if (amp2 != NULL) {
			ANON_LOCK_ENTER(&amp2->a_rwlock, RW_WRITER);
			ASSERT(amp2->refcnt == 1);
			if (anon_copy_ptr(amp2->ahp, svd2->anon_index,
			    nahp, btop(seg1->s_size), btop(seg2->s_size),
			    ANON_NOSLEEP)) {
				anon_release(nahp, btop(asize));
				ANON_LOCK_EXIT(&amp2->a_rwlock);
				if (amp1 != NULL) {
					ANON_LOCK_EXIT(&amp1->a_rwlock);
				}
				if (nvpage != NULL) {
					kmem_free(nvpage, nvpsize);
				}
				return (-2);
			}
		}
		if (amp1 != NULL) {
			namp = amp1;
			anon_release(amp1->ahp, btop(amp1->size));
		}
		if (amp2 != NULL) {
			if (namp == NULL) {
				ASSERT(amp1 == NULL);
				namp = amp2;
				anon_release(amp2->ahp, btop(amp2->size));
			} else {
				amp2->refcnt--;
				ANON_LOCK_EXIT(&amp2->a_rwlock);
				anonmap_free(amp2);
			}
			svd2->amp = NULL; /* needed for seg_free */
		}
		namp->ahp = nahp;
		namp->size = asize;
		svd1->amp = namp;
		svd1->anon_index = 0;
		ANON_LOCK_EXIT(&namp->a_rwlock);
	}
	/*
	 * Now free the old vpage structures.
	 */
	if (nvpage != NULL) {
		if (vpage1 != NULL) {
			kmem_free(vpage1, vpgtob(npages1));
		}
		if (vpage2 != NULL) {
			svd2->vpage = NULL;
			kmem_free(vpage2, vpgtob(npages2));
		}
		if (svd2->pageprot) {
			svd1->pageprot = 1;
		}
		if (svd2->pageadvice) {
			svd1->pageadvice = 1;
		}
		svd1->vpage = nvpage;
	}

	/* all looks ok, merge segments */
	svd1->swresv += svd2->swresv;
	svd2->swresv = 0;	/* so seg_free doesn't release swap space */
	size = seg2->s_size;
	seg_free(seg2);
	seg1->s_size += size;
	return (0);
}

/*
 * Extend the previous segment (seg1) to include the
 * new segment (seg2 + a), if possible.
 * Return 0 on success.
 */
static int
segvn_extend_prev(seg1, seg2, a, swresv)
	struct seg *seg1, *seg2;
	struct segvn_crargs *a;
	size_t swresv;
{
	struct segvn_data *svd1 = (struct segvn_data *)seg1->s_data;
	size_t size;
	struct anon_map *amp1;
	struct vpage *new_vpage;

	/*
	 * We don't need any segment level locks for "segvn" data
	 * since the address space is "write" locked.
	 */
	ASSERT(seg1->s_as && AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock));

	/* second segment is new, try to extend first */
	/* XXX - should also check cred */
	if (svd1->vp != a->vp || svd1->maxprot != a->maxprot ||
	    (!svd1->pageprot && (svd1->prot != a->prot)) ||
	    svd1->type != a->type || svd1->flags != a->flags ||
	    seg1->s_szc != a->szc)
		return (-1);

	/* vp == NULL implies zfod, offset doesn't matter */
	if (svd1->vp != NULL &&
	    svd1->offset + seg1->s_size != (a->offset & PAGEMASK))
		return (-1);

	amp1 = svd1->amp;
	if (amp1) {
		pgcnt_t newpgs;

		/*
		 * Segment has private pages, can data structures
		 * be expanded?
		 *
		 * Acquire the anon_map lock to prevent it from changing,
		 * if it is shared.  This ensures that the anon_map
		 * will not change while a thread which has a read/write
		 * lock on an address space references it.
		 * XXX - Don't need the anon_map lock at all if "refcnt"
		 * is 1.
		 *
		 * Can't grow a MAP_SHARED segment with an anonmap because
		 * there may be existing anon slots where we want to extend
		 * the segment and we wouldn't know what to do with them
		 * (e.g., for tmpfs right thing is to just leave them there,
		 * for /dev/zero they should be cleared out).
		 */
		if (svd1->type == MAP_SHARED)
			return (-1);

		ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
		if (amp1->refcnt > 1) {
			ANON_LOCK_EXIT(&amp1->a_rwlock);
			return (-1);
		}
		newpgs = anon_grow(amp1->ahp, &svd1->anon_index,
		    btop(seg1->s_size), btop(seg2->s_size), ANON_NOSLEEP);

		if (newpgs == 0) {
			ANON_LOCK_EXIT(&amp1->a_rwlock);
			return (-1);
		}
		amp1->size = ptob(newpgs);
		ANON_LOCK_EXIT(&amp1->a_rwlock);
	}
	if (svd1->vpage != NULL) {
		new_vpage =
		    kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)),
		    KM_NOSLEEP);
		if (new_vpage == NULL)
			return (-1);
		bcopy(svd1->vpage, new_vpage, vpgtob(seg_pages(seg1)));
		kmem_free(svd1->vpage, vpgtob(seg_pages(seg1)));
		svd1->vpage = new_vpage;
		if (svd1->pageprot) {
			struct vpage *vp, *evp;

			vp = new_vpage + seg_pages(seg1);
			evp = vp + seg_pages(seg2);
			for (; vp < evp; vp++)
				VPP_SETPROT(vp, a->prot);
		}
	}
	size = seg2->s_size;
	seg_free(seg2);
	seg1->s_size += size;
	svd1->swresv += swresv;
	return (0);
}

/*
 * Extend the next segment (seg2) to include the
 * new segment (seg1 + a), if possible.
 * Return 0 on success.
 */
static int
segvn_extend_next(
	struct seg *seg1,
	struct seg *seg2,
	struct segvn_crargs *a,
	size_t swresv)
{
	struct segvn_data *svd2 = (struct segvn_data *)seg2->s_data;
	size_t size;
	struct anon_map *amp2;
	struct vpage *new_vpage;

	/*
	 * We don't need any segment level locks for "segvn" data
	 * since the address space is "write" locked.
	 */
	ASSERT(seg2->s_as && AS_WRITE_HELD(seg2->s_as, &seg2->s_as->a_lock));

	/* first segment is new, try to extend second */
	/* XXX - should also check cred */
	if (svd2->vp != a->vp || svd2->maxprot != a->maxprot ||
	    (!svd2->pageprot && (svd2->prot != a->prot)) ||
	    svd2->type != a->type || svd2->flags != a->flags ||
	    seg2->s_szc != a->szc)
		return (-1);
	/* vp == NULL implies zfod, offset doesn't matter */
	if (svd2->vp != NULL &&
	    (a->offset & PAGEMASK) + seg1->s_size != svd2->offset)
		return (-1);

	amp2 = svd2->amp;
	if (amp2) {
		pgcnt_t newpgs;

		/*
		 * Segment has private pages, can data structures
		 * be expanded?
		 *
		 * Acquire the anon_map lock to prevent it from changing,
		 * if it is shared.  This ensures that the anon_map
		 * will not change while a thread which has a read/write
		 * lock on an address space references it.
		 *
		 * XXX - Don't need the anon_map lock at all if "refcnt"
		 * is 1.
		 */
		if (svd2->type == MAP_SHARED)
			return (-1);

		ANON_LOCK_ENTER(&amp2->a_rwlock, RW_WRITER);
		if (amp2->refcnt > 1) {
			ANON_LOCK_EXIT(&amp2->a_rwlock);
			return (-1);
		}
		newpgs = anon_grow(amp2->ahp, &svd2->anon_index,
		    btop(seg2->s_size), btop(seg1->s_size),
		    ANON_NOSLEEP | ANON_GROWDOWN);

		if (newpgs == 0) {
			ANON_LOCK_EXIT(&amp2->a_rwlock);
			return (-1);
		}
		amp2->size = ptob(newpgs);
		ANON_LOCK_EXIT(&amp2->a_rwlock);
	}
	if (svd2->vpage != NULL) {
		new_vpage =
		    kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)),
		    KM_NOSLEEP);
		if (new_vpage == NULL) {
			/* Not merging segments so adjust anon_index back */
			if (amp2)
				svd2->anon_index += seg_pages(seg1);
			return (-1);
		}
		bcopy(svd2->vpage, new_vpage + seg_pages(seg1),
		    vpgtob(seg_pages(seg2)));
		kmem_free(svd2->vpage, vpgtob(seg_pages(seg2)));
		svd2->vpage = new_vpage;
		if (svd2->pageprot) {
			struct vpage *vp, *evp;

			vp = new_vpage;
			evp = vp + seg_pages(seg1);
			for (; vp < evp; vp++)
				VPP_SETPROT(vp, a->prot);
		}
	}
	size = seg1->s_size;
	seg_free(seg1);
	seg2->s_size += size;
	seg2->s_base -= size;
	svd2->offset -= size;
	svd2->swresv += swresv;
	return (0);
}

static int
segvn_dup(struct seg *seg, struct seg *newseg)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct segvn_data *newsvd;
	pgcnt_t npages = seg_pages(seg);
	int error = 0;
	uint_t prot;
	size_t len;

	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));

	/*
	 * If segment has anon reserved, reserve more for the new seg.
	 * For a MAP_NORESERVE segment swresv will be a count of all the
	 * allocated anon slots; thus we reserve for the child as many slots
	 * as the parent has allocated.  This semantic prevents the child or
	 * parent from dying during a copy-on-write fault caused by trying
	 * to write a shared pre-existing anon page.
	 */
	if ((len = svd->swresv) != 0) {
		if (anon_resv(svd->swresv) == 0)
			return (ENOMEM);

		TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
		    seg, len, 0);
	}

	newsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP);

	newseg->s_ops = &segvn_ops;
	newseg->s_data = (void *)newsvd;
	newseg->s_szc = seg->s_szc;

	if ((newsvd->vp = svd->vp) != NULL) {
		VN_HOLD(svd->vp);
		if (svd->type == MAP_SHARED)
			lgrp_shm_policy_init(NULL, svd->vp);
	}
	newsvd->offset = svd->offset;
	newsvd->prot = svd->prot;
	newsvd->maxprot = svd->maxprot;
	newsvd->pageprot = svd->pageprot;
	newsvd->type = svd->type;
	newsvd->cred = svd->cred;
	crhold(newsvd->cred);
	newsvd->advice = svd->advice;
	newsvd->pageadvice = svd->pageadvice;
	newsvd->swresv = svd->swresv;
	newsvd->flags = svd->flags;
	newsvd->softlockcnt = 0;
	newsvd->policy_info = svd->policy_info;
	if ((newsvd->amp = svd->amp) == NULL) {
		/*
		 * Not attaching to a shared anon object.
		 */
		newsvd->anon_index = 0;
	} else {
		struct anon_map *amp;

		amp = svd->amp;
		if (svd->type == MAP_SHARED) {
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
			amp->refcnt++;
			ANON_LOCK_EXIT(&amp->a_rwlock);
			newsvd->anon_index = svd->anon_index;
		} else {
			int reclaim = 1;

			/*
			 * Allocate and initialize new anon_map structure.
			 */
			newsvd->amp = anonmap_alloc(newseg->s_size, 0);
			newsvd->amp->a_szc = newseg->s_szc;
			newsvd->anon_index = 0;

			/*
			 * We don't have to acquire the anon_map lock
			 * for the new segment (since it belongs to an
			 * address space that is still not associated
			 * with any process), or the segment in the old
			 * address space (since all threads in it
			 * are stopped while duplicating the address space).
			 */

			/*
			 * The goal of the following code is to make sure that
			 * softlocked pages do not end up as copy on write
			 * pages.  This would cause problems where one
			 * thread writes to a page that is COW and a different
			 * thread in the same process has softlocked it.  The
			 * softlock lock would move away from this process
			 * because the write would cause this process to get
			 * a copy (without the softlock).
			 *
			 * The strategy here is to just break the
			 * sharing on pages that could possibly be
			 * softlocked.
			 */
retry:
			if (svd->softlockcnt) {
				struct anon *ap, *newap;
				size_t i;
				uint_t vpprot;
				page_t *anon_pl[1+1], *pp;
				caddr_t addr;
				ulong_t anon_idx = 0;

				/*
				 * The softlock count might be non zero
				 * because some pages are still stuck in the
				 * cache for lazy reclaim. Flush the cache
				 * now. This should drop the count to zero.
				 * [or there is really I/O going on to these
				 * pages]. Note, we have the writers lock so
				 * nothing gets inserted during the flush.
				 */
				if (reclaim == 1) {
					segvn_purge(seg);
					reclaim = 0;
					goto retry;
				}
				i = btopr(seg->s_size);
				addr = seg->s_base;
				/*
				 * XXX break cow sharing using PAGESIZE
				 * pages. They will be relocated into larger
				 * pages at fault time.
				 */
				while (i-- > 0) {
					if (ap = anon_get_ptr(amp->ahp,
					    anon_idx)) {
						error = anon_getpage(&ap,
						    &vpprot, anon_pl, PAGESIZE,
						    seg, addr, S_READ,
						    svd->cred);
						if (error) {
							newsvd->vpage = NULL;
							goto out;
						}
						/*
						 * prot need not be computed
						 * below 'cause anon_private is
						 * going to ignore it anyway
						 * as child doesn't inherit
						 * pagelock from parent.
						 */
						prot = svd->pageprot ?
						    VPP_PROT(
						    &svd->vpage[
						    seg_page(seg, addr)])
						    : svd->prot;
						pp = anon_private(&newap,
						    newseg, addr, prot,
						    anon_pl[0], 0,
						    newsvd->cred);
						if (pp == NULL) {
							/* no mem abort */
							newsvd->vpage = NULL;
							error = ENOMEM;
							goto out;
						}
						(void) anon_set_ptr(
						    newsvd->amp->ahp, anon_idx,
						    newap, ANON_SLEEP);
						page_unlock(pp);
					}
					addr += PAGESIZE;
					anon_idx++;
				}
			} else {	/* common case */
				if (seg->s_szc != 0) {
					/*
					 * If at least one of anon slots of a
					 * large page exists then make sure
					 * all anon slots of a large page
					 * exist to avoid partial cow sharing
					 * of a large page in the future.
					 */
					anon_dup_fill_holes(amp->ahp,
					    svd->anon_index, newsvd->amp->ahp,
					    0, seg->s_size, seg->s_szc,
					    svd->vp != NULL);
				} else {
					anon_dup(amp->ahp, svd->anon_index,
					    newsvd->amp->ahp, 0, seg->s_size);
				}

				hat_clrattr(seg->s_as->a_hat, seg->s_base,
				    seg->s_size, PROT_WRITE);
			}
		}
	}
	/*
	 * If necessary, create a vpage structure for the new segment.
	 * Do not copy any page lock indications.
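	 *
	 * (Editorial assumption: the page lock bits are dropped because
	 * memory locks established with mlock()/memcntl() are not inherited
	 * by the child across fork().)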
	 */
	if (svd->vpage != NULL) {
		uint_t i;
		struct vpage *ovp = svd->vpage;
		struct vpage *nvp;

		nvp = newsvd->vpage =
		    kmem_alloc(vpgtob(npages), KM_SLEEP);
		for (i = 0; i < npages; i++) {
			*nvp = *ovp++;
			VPP_CLRPPLOCK(nvp++);
		}
	} else
		newsvd->vpage = NULL;

	/* Inform the vnode of the new mapping */
	if (newsvd->vp != NULL) {
		error = VOP_ADDMAP(newsvd->vp, (offset_t)newsvd->offset,
		    newseg->s_as, newseg->s_base, newseg->s_size, newsvd->prot,
		    newsvd->maxprot, newsvd->type, newsvd->cred);
	}
out:
	return (error);
}


/*
 * callback function used by segvn_unmap to invoke free_vp_pages() for only
 * those pages actually processed by the HAT
 */
extern int free_pages;

static void
segvn_hat_unload_callback(hat_callback_t *cb)
{
	struct seg *seg = cb->hcb_data;
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	size_t len;
	u_offset_t off;

	ASSERT(svd->vp != NULL);
	ASSERT(cb->hcb_end_addr > cb->hcb_start_addr);
	ASSERT(cb->hcb_start_addr >= seg->s_base);

	len = cb->hcb_end_addr - cb->hcb_start_addr;
	off = cb->hcb_start_addr - seg->s_base;
	free_vp_pages(svd->vp, svd->offset + off, len);
}


static int
segvn_unmap(struct seg *seg, caddr_t addr, size_t len)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct segvn_data *nsvd;
	struct seg *nseg;
	struct anon_map *amp;
	pgcnt_t	opages;		/* old segment size in pages */
	pgcnt_t	npages;		/* new segment size in pages */
	pgcnt_t	dpages;		/* pages being deleted (unmapped) */
	hat_callback_t callback;	/* used for free_vp_pages() */
	hat_callback_t *cbp = NULL;
	caddr_t nbase;
	size_t nsize;
	size_t oswresv;
	int reclaim = 1;

	/*
	 * We don't need any segment level locks for "segvn" data
	 * since the address space is "write" locked.
	 */
	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));

	/*
	 * Fail the unmap if pages are SOFTLOCKed through this mapping.
	 * softlockcnt is protected from change by the as write lock.
	 */
retry:
	if (svd->softlockcnt > 0) {
		/*
		 * since we do have the writers lock nobody can fill
		 * the cache during the purge. The flush either succeeds
		 * or we still have pending I/Os.
		 */
		if (reclaim == 1) {
			segvn_purge(seg);
			reclaim = 0;
			goto retry;
		}
		return (EAGAIN);
	}

	/*
	 * Check for bad sizes
	 */
	if (addr < seg->s_base || addr + len > seg->s_base + seg->s_size ||
	    (len & PAGEOFFSET) || ((uintptr_t)addr & PAGEOFFSET)) {
		panic("segvn_unmap");
		/*NOTREACHED*/
	}

	if (seg->s_szc != 0) {
		size_t pgsz = page_get_pagesize(seg->s_szc);
		int err;
		if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) {
			ASSERT(seg->s_base != addr || seg->s_size != len);
			VM_STAT_ADD(segvnvmstats.demoterange[0]);
			err = segvn_demote_range(seg, addr, len, SDR_END, 0);
			if (err == 0) {
				return (IE_RETRY);
			}
			return (err);
		}
	}

	/* Inform the vnode of the unmapping. */
	if (svd->vp) {
		int error;

		error = VOP_DELMAP(svd->vp,
		    (offset_t)svd->offset + (uintptr_t)(addr - seg->s_base),
		    seg->s_as, addr, len, svd->prot, svd->maxprot,
		    svd->type, svd->cred);

		if (error == EAGAIN)
			return (error);
	}
	/*
	 * Remove any page locks set through this mapping.
	 */
	(void) segvn_lockop(seg, addr, len, 0, MC_UNLOCK, NULL, 0);

	/*
	 * Unload any hardware translations in the range to be taken out.
	 * Use a callback to invoke free_vp_pages() effectively.
	 */
	if (svd->vp != NULL && free_pages != 0) {
		callback.hcb_data = seg;
		callback.hcb_function = segvn_hat_unload_callback;
		cbp = &callback;
	}
	hat_unload_callback(seg->s_as->a_hat, addr, len, HAT_UNLOAD_UNMAP, cbp);

	/*
	 * Check for entire segment
	 */
	if (addr == seg->s_base && len == seg->s_size) {
		seg_free(seg);
		return (0);
	}

	opages = seg_pages(seg);
	dpages = btop(len);
	npages = opages - dpages;
	amp = svd->amp;
	ASSERT(amp == NULL || amp->a_szc >= seg->s_szc);

	/*
	 * Check for beginning of segment
	 */
	if (addr == seg->s_base) {
		if (svd->vpage != NULL) {
			size_t nbytes;
			struct vpage *ovpage;

			ovpage = svd->vpage;	/* keep pointer to vpage */

			nbytes = vpgtob(npages);
			svd->vpage = kmem_alloc(nbytes, KM_SLEEP);
			bcopy(&ovpage[dpages], svd->vpage, nbytes);

			/* free up old vpage */
			kmem_free(ovpage, vpgtob(opages));
		}
		if (amp != NULL) {
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
			if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) {
				/*
				 * Free up now unused parts of anon_map array.
				 */
				if (amp->a_szc == seg->s_szc) {
					if (seg->s_szc != 0) {
						anon_free_pages(amp->ahp,
						    svd->anon_index, len,
						    seg->s_szc);
					} else {
						anon_free(amp->ahp,
						    svd->anon_index,
						    len);
					}
				} else {
					ASSERT(svd->type == MAP_SHARED);
					ASSERT(amp->a_szc > seg->s_szc);
					anon_shmap_free_pages(amp,
					    svd->anon_index, len);
				}

				/*
				 * Unreserve swap space for the
				 * unmapped chunk of this segment in
				 * case it's MAP_SHARED
				 */
				if (svd->type == MAP_SHARED) {
					anon_unresv(len);
					amp->swresv -= len;
				}
			}
			ANON_LOCK_EXIT(&amp->a_rwlock);
			svd->anon_index += dpages;
		}
		if (svd->vp != NULL)
			svd->offset += len;

		if (svd->swresv) {
			if (svd->flags & MAP_NORESERVE) {
				ASSERT(amp);
				oswresv = svd->swresv;

				svd->swresv = ptob(anon_pages(amp->ahp,
				    svd->anon_index, npages));
				anon_unresv(oswresv - svd->swresv);
			} else {
				anon_unresv(len);
				svd->swresv -= len;
			}
			TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
			    seg, len, 0);
		}

		seg->s_base += len;
		seg->s_size -= len;
		return (0);
	}

	/*
	 * Check for end of segment
	 */
	if (addr + len == seg->s_base + seg->s_size) {
		if (svd->vpage != NULL) {
			size_t nbytes;
			struct vpage *ovpage;

			ovpage = svd->vpage;	/* keep pointer to vpage */

			nbytes = vpgtob(npages);
			svd->vpage = kmem_alloc(nbytes, KM_SLEEP);
			bcopy(ovpage, svd->vpage, nbytes);

			/* free up old vpage */
			kmem_free(ovpage, vpgtob(opages));

		}
		if (amp != NULL) {
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
			if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) {
				/*
				 * Free up now unused parts of anon_map array.
				 */
				ulong_t an_idx = svd->anon_index + npages;
				if (amp->a_szc == seg->s_szc) {
					if (seg->s_szc != 0) {
						anon_free_pages(amp->ahp,
						    an_idx, len,
						    seg->s_szc);
					} else {
						anon_free(amp->ahp, an_idx,
						    len);
					}
				} else {
					ASSERT(svd->type == MAP_SHARED);
					ASSERT(amp->a_szc > seg->s_szc);
					anon_shmap_free_pages(amp,
					    an_idx, len);
				}

				/*
				 * Unreserve swap space for the
				 * unmapped chunk of this segment in
				 * case it's MAP_SHARED
				 */
				if (svd->type == MAP_SHARED) {
					anon_unresv(len);
					amp->swresv -= len;
				}
			}
			ANON_LOCK_EXIT(&amp->a_rwlock);
		}

		if (svd->swresv) {
			if (svd->flags & MAP_NORESERVE) {
				ASSERT(amp);
				oswresv = svd->swresv;
				svd->swresv = ptob(anon_pages(amp->ahp,
				    svd->anon_index, npages));
				anon_unresv(oswresv - svd->swresv);
			} else {
				anon_unresv(len);
				svd->swresv -= len;
			}
			TRACE_3(TR_FAC_VM, TR_ANON_PROC,
			    "anon proc:%p %lu %u", seg, len, 0);
		}

		seg->s_size -= len;
		return (0);
	}

	/*
	 * The section to go is in the middle of the segment,
	 * have to make it into two segments.  nseg is made for
	 * the high end while seg is cut down at the low end.
	 */
	nbase = addr + len;				/* new seg base */
	nsize = (seg->s_base + seg->s_size) - nbase;	/* new seg size */
	seg->s_size = addr - seg->s_base;	/* shrink old seg */
	nseg = seg_alloc(seg->s_as, nbase, nsize);
	if (nseg == NULL) {
		panic("segvn_unmap seg_alloc");
		/*NOTREACHED*/
	}
	nseg->s_ops = seg->s_ops;
	nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP);
	nseg->s_data = (void *)nsvd;
	nseg->s_szc = seg->s_szc;
	*nsvd = *svd;
	nsvd->offset = svd->offset + (uintptr_t)(nseg->s_base - seg->s_base);
	nsvd->swresv = 0;
	nsvd->softlockcnt = 0;

	if (svd->vp != NULL) {
		VN_HOLD(nsvd->vp);
		if (nsvd->type == MAP_SHARED)
			lgrp_shm_policy_init(NULL, nsvd->vp);
	}
	crhold(svd->cred);

	if (svd->vpage == NULL) {
		nsvd->vpage = NULL;
	} else {
		/* need to split vpage into two arrays */
		size_t nbytes;
		struct vpage *ovpage;

		ovpage = svd->vpage;		/* keep pointer to vpage */

		npages = seg_pages(seg);	/* seg has shrunk */
		nbytes = vpgtob(npages);
		svd->vpage = kmem_alloc(nbytes, KM_SLEEP);

		bcopy(ovpage, svd->vpage, nbytes);

		npages = seg_pages(nseg);
		nbytes = vpgtob(npages);
		nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP);

		bcopy(&ovpage[opages - npages], nsvd->vpage, nbytes);

		/* free up old vpage */
		kmem_free(ovpage, vpgtob(opages));
	}

	if (amp == NULL) {
		nsvd->amp = NULL;
		nsvd->anon_index = 0;
	} else {
		/*
		 * Need to create a new anon map for the new segment.
		 * We'll also allocate a new smaller array for the old
		 * smaller segment to save space.
		 */
		opages = btop((uintptr_t)(addr - seg->s_base));
		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
		if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) {
			/*
			 * Free up now unused parts of anon_map array.
			 */
			ulong_t an_idx = svd->anon_index + opages;
			if (amp->a_szc == seg->s_szc) {
				if (seg->s_szc != 0) {
					anon_free_pages(amp->ahp, an_idx, len,
					    seg->s_szc);
				} else {
					anon_free(amp->ahp, an_idx,
					    len);
				}
			} else {
				ASSERT(svd->type == MAP_SHARED);
				ASSERT(amp->a_szc > seg->s_szc);
				anon_shmap_free_pages(amp, an_idx, len);
			}

			/*
			 * Unreserve swap space for the
			 * unmapped chunk of this segment in
			 * case it's MAP_SHARED
			 */
			if (svd->type == MAP_SHARED) {
				anon_unresv(len);
				amp->swresv -= len;
			}
		}
		nsvd->anon_index = svd->anon_index +
		    btop((uintptr_t)(nseg->s_base - seg->s_base));
		if (svd->type == MAP_SHARED) {
			amp->refcnt++;
			nsvd->amp = amp;
		} else {
			struct anon_map *namp;
			struct anon_hdr *nahp;

			ASSERT(svd->type == MAP_PRIVATE);
			nahp = anon_create(btop(seg->s_size), ANON_SLEEP);
			namp = anonmap_alloc(nseg->s_size, 0);
			namp->a_szc = seg->s_szc;
			(void) anon_copy_ptr(amp->ahp, svd->anon_index, nahp,
			    0, btop(seg->s_size), ANON_SLEEP);
			(void) anon_copy_ptr(amp->ahp, nsvd->anon_index,
			    namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP);
			anon_release(amp->ahp, btop(amp->size));
			svd->anon_index = 0;
			nsvd->anon_index = 0;
			amp->ahp = nahp;
			amp->size = seg->s_size;
			nsvd->amp = namp;
		}
		ANON_LOCK_EXIT(&amp->a_rwlock);
	}
	if (svd->swresv) {
		if (svd->flags & MAP_NORESERVE) {
			ASSERT(amp);
			oswresv = svd->swresv;
			svd->swresv = ptob(anon_pages(amp->ahp,
			    svd->anon_index, btop(seg->s_size)));
			nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp,
			    nsvd->anon_index, btop(nseg->s_size)));
			ASSERT(oswresv >= (svd->swresv + nsvd->swresv));
			anon_unresv(oswresv - (svd->swresv + nsvd->swresv));
		} else {
			if (seg->s_size + nseg->s_size + len != svd->swresv) {
				panic("segvn_unmap: "
				    "cannot split swap reservation");
				/*NOTREACHED*/
			}
			anon_unresv(len);
			svd->swresv = seg->s_size;
			nsvd->swresv = nseg->s_size;
		}
		TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
		    seg, len, 0);
	}

	return (0);			/* I'm glad that's all over with! */
}

static void
segvn_free(struct seg *seg)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	pgcnt_t npages = seg_pages(seg);
	struct anon_map *amp;
	size_t len;

	/*
	 * We don't need any segment level locks for "segvn" data
	 * since the address space is "write" locked.
	 */
	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));

	/*
	 * Be sure to unlock pages. XXX Why do things get free'ed instead
	 * of unmapped? XXX
	 */
	(void) segvn_lockop(seg, seg->s_base, seg->s_size,
	    0, MC_UNLOCK, NULL, 0);

	/*
	 * Deallocate the vpage and anon pointers if necessary and possible.
	 */
	if (svd->vpage != NULL) {
		kmem_free(svd->vpage, vpgtob(npages));
		svd->vpage = NULL;
	}
	if ((amp = svd->amp) != NULL) {
		/*
		 * If there are no more references to this anon_map
		 * structure, then deallocate the structure after freeing
		 * up all the anon slot pointers that we can.
		 */
		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
		ASSERT(amp->a_szc >= seg->s_szc);
		if (--amp->refcnt == 0) {
			if (svd->type == MAP_PRIVATE) {
				/*
				 * Private - we only need to anon_free
				 * the part that this segment refers to.
				 */
				if (seg->s_szc != 0) {
					anon_free_pages(amp->ahp,
					    svd->anon_index, seg->s_size,
					    seg->s_szc);
				} else {
					anon_free(amp->ahp, svd->anon_index,
					    seg->s_size);
				}
			} else {
				/*
				 * Shared - anon_free the entire
				 * anon_map's worth of stuff and
				 * release any swap reservation.
				 */
				if (amp->a_szc != 0) {
					anon_shmap_free_pages(amp, 0,
					    amp->size);
				} else {
					anon_free(amp->ahp, 0, amp->size);
				}
				if ((len = amp->swresv) != 0) {
					anon_unresv(len);
					TRACE_3(TR_FAC_VM, TR_ANON_PROC,
					    "anon proc:%p %lu %u",
					    seg, len, 0);
				}
			}
			svd->amp = NULL;
			ANON_LOCK_EXIT(&amp->a_rwlock);
			anonmap_free(amp);
		} else if (svd->type == MAP_PRIVATE) {
			/*
			 * We had a private mapping which still has
			 * a held anon_map so just free up all the
			 * anon slot pointers that we were using.
			 */
			if (seg->s_szc != 0) {
				anon_free_pages(amp->ahp, svd->anon_index,
				    seg->s_size, seg->s_szc);
			} else {
				anon_free(amp->ahp, svd->anon_index,
				    seg->s_size);
			}
			ANON_LOCK_EXIT(&amp->a_rwlock);
		} else {
			ANON_LOCK_EXIT(&amp->a_rwlock);
		}
	}

	/*
	 * Release swap reservation.
	 */
	if ((len = svd->swresv) != 0) {
		anon_unresv(svd->swresv);
		TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
		    seg, len, 0);
		svd->swresv = 0;
	}
	/*
	 * Release claim on vnode, credentials, and finally free the
	 * private data.
	 */
	if (svd->vp != NULL) {
		if (svd->type == MAP_SHARED)
			lgrp_shm_policy_fini(NULL, svd->vp);
		VN_RELE(svd->vp);
		svd->vp = NULL;
	}
	crfree(svd->cred);
	svd->cred = NULL;

	seg->s_data = NULL;
	kmem_cache_free(segvn_cache, svd);
}

ulong_t segvn_lpglck_limit = 0;
/*
 * Support routines used by segvn_pagelock() and softlock faults for anonymous
 * pages to implement availrmem accounting in a way that makes sure the
 * same memory is accounted just once for all softlock/pagelock purposes.
 * This prevents a bug when availrmem is quickly incorrectly exhausted from
 * several pagelocks to different parts of the same large page since each
 * pagelock has to decrement availrmem by the size of the entire large
 * page. Note those pages are not COW shared until softunlock/pageunlock so
 * we don't need to use cow style accounting here.  We also need to make sure
 * the entire large page is accounted even if softlock range is less than the
 * entire large page because large anon pages can't be demoted when any of
 * constituent pages is locked. The caller calls this routine for every page_t
 * it locks. The very first page in the range may not be the root page of a
 * large page. For all other pages it's guaranteed we are going to visit the
 * root of a particular large page before any other constituent page as we are
 * locking sequential pages belonging to the same anon map. So we do all the
 * locking when the root is encountered except for the very first page.
Since 1960 * softlocking is not supported (except S_READ_NOCOW special case) for vmpss 1961 * segments and since vnode pages can be demoted without locking all 1962 * constituent pages vnode pages don't come here. Unlocking relies on the 1963 * fact that pagesize can't change whenever any of constituent large pages is 1964 * locked at least SE_SHARED. This allows unlocking code to find the right 1965 * root and decrement availrmem by the same amount it was incremented when the 1966 * page was locked. 1967 */ 1968 static int 1969 segvn_pp_lock_anonpages(page_t *pp, int first) 1970 { 1971 pgcnt_t pages; 1972 pfn_t pfn; 1973 uchar_t szc = pp->p_szc; 1974 1975 ASSERT(PAGE_LOCKED(pp)); 1976 ASSERT(pp->p_vnode != NULL); 1977 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 1978 1979 /* 1980 * pagesize won't change as long as any constituent page is locked. 1981 */ 1982 pages = page_get_pagecnt(pp->p_szc); 1983 pfn = page_pptonum(pp); 1984 1985 if (!first) { 1986 if (!IS_P2ALIGNED(pfn, pages)) { 1987 #ifdef DEBUG 1988 pp = &pp[-(spgcnt_t)(pfn & (pages - 1))]; 1989 pfn = page_pptonum(pp); 1990 ASSERT(IS_P2ALIGNED(pfn, pages)); 1991 ASSERT(pp->p_szc == szc); 1992 ASSERT(pp->p_vnode != NULL); 1993 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 1994 ASSERT(pp->p_slckcnt != 0); 1995 #endif /* DEBUG */ 1996 return (1); 1997 } 1998 } else if (!IS_P2ALIGNED(pfn, pages)) { 1999 pp = &pp[-(spgcnt_t)(pfn & (pages - 1))]; 2000 #ifdef DEBUG 2001 pfn = page_pptonum(pp); 2002 ASSERT(IS_P2ALIGNED(pfn, pages)); 2003 ASSERT(pp->p_szc == szc); 2004 ASSERT(pp->p_vnode != NULL); 2005 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 2006 #endif /* DEBUG */ 2007 } 2008 2009 /* 2010 * pp is a root page. 2011 * We haven't locked this large page yet. 2012 */ 2013 page_struct_lock(pp); 2014 if (pp->p_slckcnt != 0) { 2015 if (pp->p_slckcnt < PAGE_SLOCK_MAXIMUM) { 2016 pp->p_slckcnt++; 2017 page_struct_unlock(pp); 2018 return (1); 2019 } 2020 page_struct_unlock(pp); 2021 segvn_lpglck_limit++; 2022 return (0); 2023 } 2024 mutex_enter(&freemem_lock); 2025 if (availrmem < tune.t_minarmem + pages) { 2026 mutex_exit(&freemem_lock); 2027 page_struct_unlock(pp); 2028 return (0); 2029 } 2030 pp->p_slckcnt++; 2031 availrmem -= pages; 2032 mutex_exit(&freemem_lock); 2033 page_struct_unlock(pp); 2034 return (1); 2035 } 2036 2037 static void 2038 segvn_pp_unlock_anonpages(page_t *pp, int first) 2039 { 2040 pgcnt_t pages; 2041 pfn_t pfn; 2042 2043 ASSERT(PAGE_LOCKED(pp)); 2044 ASSERT(pp->p_vnode != NULL); 2045 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 2046 2047 /* 2048 * pagesize won't change as long as any constituent page is locked. 2049 */ 2050 pages = page_get_pagecnt(pp->p_szc); 2051 pfn = page_pptonum(pp); 2052 2053 if (!first) { 2054 if (!IS_P2ALIGNED(pfn, pages)) { 2055 return; 2056 } 2057 } else if (!IS_P2ALIGNED(pfn, pages)) { 2058 pp = &pp[-(spgcnt_t)(pfn & (pages - 1))]; 2059 #ifdef DEBUG 2060 pfn = page_pptonum(pp); 2061 ASSERT(IS_P2ALIGNED(pfn, pages)); 2062 #endif /* DEBUG */ 2063 } 2064 ASSERT(pp->p_vnode != NULL); 2065 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 2066 ASSERT(pp->p_slckcnt != 0); 2067 page_struct_lock(pp); 2068 if (--pp->p_slckcnt == 0) { 2069 mutex_enter(&freemem_lock); 2070 availrmem += pages; 2071 mutex_exit(&freemem_lock); 2072 } 2073 page_struct_unlock(pp); 2074 } 2075 2076 /* 2077 * Do a F_SOFTUNLOCK call over the range requested. The range must have 2078 * already been F_SOFTLOCK'ed. 2079 * Caller must always match addr and len of a softunlock with a previous 2080 * softlock with exactly the same addr and len. 
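 *
 * Illustrative pairing only (not part of the original source); a caller
 * that softlocks a range through segvn_fault() is expected to unlock it
 * with exactly the same addr/len, e.g.:
 *
 *     err = segvn_fault(hat, seg, addr, len, F_SOFTLOCK, rw);
 *     ... on success, access the locked pages ...
 *     (void) segvn_fault(hat, seg, addr, len, F_SOFTUNLOCK, rw);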
2081 */ 2082 static void 2083 segvn_softunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw) 2084 { 2085 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 2086 page_t *pp; 2087 caddr_t adr; 2088 struct vnode *vp; 2089 u_offset_t offset; 2090 ulong_t anon_index; 2091 struct anon_map *amp; 2092 struct anon *ap = NULL; 2093 2094 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 2095 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 2096 2097 if ((amp = svd->amp) != NULL) 2098 anon_index = svd->anon_index + seg_page(seg, addr); 2099 2100 hat_unlock(seg->s_as->a_hat, addr, len); 2101 for (adr = addr; adr < addr + len; adr += PAGESIZE) { 2102 if (amp != NULL) { 2103 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 2104 if ((ap = anon_get_ptr(amp->ahp, anon_index++)) 2105 != NULL) { 2106 swap_xlate(ap, &vp, &offset); 2107 } else { 2108 vp = svd->vp; 2109 offset = svd->offset + 2110 (uintptr_t)(adr - seg->s_base); 2111 } 2112 ANON_LOCK_EXIT(&->a_rwlock); 2113 } else { 2114 vp = svd->vp; 2115 offset = svd->offset + 2116 (uintptr_t)(adr - seg->s_base); 2117 } 2118 2119 /* 2120 * Use page_find() instead of page_lookup() to 2121 * find the page since we know that it is locked. 2122 */ 2123 pp = page_find(vp, offset); 2124 if (pp == NULL) { 2125 panic( 2126 "segvn_softunlock: addr %p, ap %p, vp %p, off %llx", 2127 (void *)adr, (void *)ap, (void *)vp, offset); 2128 /*NOTREACHED*/ 2129 } 2130 2131 if (rw == S_WRITE) { 2132 hat_setrefmod(pp); 2133 if (seg->s_as->a_vbits) 2134 hat_setstat(seg->s_as, adr, PAGESIZE, 2135 P_REF | P_MOD); 2136 } else if (rw != S_OTHER) { 2137 hat_setref(pp); 2138 if (seg->s_as->a_vbits) 2139 hat_setstat(seg->s_as, adr, PAGESIZE, P_REF); 2140 } 2141 TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT, 2142 "segvn_fault:pp %p vp %p offset %llx", pp, vp, offset); 2143 if (svd->vp == NULL) { 2144 segvn_pp_unlock_anonpages(pp, adr == addr); 2145 } 2146 page_unlock(pp); 2147 } 2148 mutex_enter(&freemem_lock); /* for availrmem */ 2149 if (svd->vp != NULL) { 2150 availrmem += btop(len); 2151 } 2152 segvn_pages_locked -= btop(len); 2153 svd->softlockcnt -= btop(len); 2154 mutex_exit(&freemem_lock); 2155 if (svd->softlockcnt == 0) { 2156 /* 2157 * All SOFTLOCKS are gone. Wakeup any waiting 2158 * unmappers so they can try again to unmap. 2159 * Check for waiters first without the mutex 2160 * held so we don't always grab the mutex on 2161 * softunlocks. 2162 */ 2163 if (AS_ISUNMAPWAIT(seg->s_as)) { 2164 mutex_enter(&seg->s_as->a_contents); 2165 if (AS_ISUNMAPWAIT(seg->s_as)) { 2166 AS_CLRUNMAPWAIT(seg->s_as); 2167 cv_broadcast(&seg->s_as->a_cv); 2168 } 2169 mutex_exit(&seg->s_as->a_contents); 2170 } 2171 } 2172 } 2173 2174 #define PAGE_HANDLED ((page_t *)-1) 2175 2176 /* 2177 * Release all the pages in the NULL terminated ppp list 2178 * which haven't already been converted to PAGE_HANDLED. 2179 */ 2180 static void 2181 segvn_pagelist_rele(page_t **ppp) 2182 { 2183 for (; *ppp != NULL; ppp++) { 2184 if (*ppp != PAGE_HANDLED) 2185 page_unlock(*ppp); 2186 } 2187 } 2188 2189 static int stealcow = 1; 2190 2191 /* 2192 * Workaround for viking chip bug. See bug id 1220902. 2193 * To fix this down in pagefault() would require importing so 2194 * much as and segvn code as to be unmaintainable. 2195 */ 2196 int enable_mbit_wa = 0; 2197 2198 /* 2199 * Handles all the dirty work of getting the right 2200 * anonymous pages and loading up the translations. 2201 * This routine is called only from segvn_fault() 2202 * when looping over the range of addresses requested. 
2203 * 2204 * The basic algorithm here is: 2205 * If this is an anon_zero case 2206 * Call anon_zero to allocate page 2207 * Load up translation 2208 * Return 2209 * endif 2210 * If this is an anon page 2211 * Use anon_getpage to get the page 2212 * else 2213 * Find page in pl[] list passed in 2214 * endif 2215 * If not a cow 2216 * Load up the translation to the page 2217 * return 2218 * endif 2219 * Call anon_private to handle cow 2220 * Load up (writable) translation to new page 2221 */ 2222 static faultcode_t 2223 segvn_faultpage( 2224 struct hat *hat, /* the hat to use for mapping */ 2225 struct seg *seg, /* seg_vn of interest */ 2226 caddr_t addr, /* address in as */ 2227 u_offset_t off, /* offset in vp */ 2228 struct vpage *vpage, /* pointer to vpage for vp, off */ 2229 page_t *pl[], /* object source page pointer */ 2230 uint_t vpprot, /* access allowed to object pages */ 2231 enum fault_type type, /* type of fault */ 2232 enum seg_rw rw, /* type of access at fault */ 2233 int brkcow, /* we may need to break cow */ 2234 int first) /* first page for this fault if 1 */ 2235 { 2236 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 2237 page_t *pp, **ppp; 2238 uint_t pageflags = 0; 2239 page_t *anon_pl[1 + 1]; 2240 page_t *opp = NULL; /* original page */ 2241 uint_t prot; 2242 int err; 2243 int cow; 2244 int claim; 2245 int steal = 0; 2246 ulong_t anon_index; 2247 struct anon *ap, *oldap; 2248 struct anon_map *amp; 2249 int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD; 2250 int anon_lock = 0; 2251 anon_sync_obj_t cookie; 2252 2253 if (svd->flags & MAP_TEXT) { 2254 hat_flag |= HAT_LOAD_TEXT; 2255 } 2256 2257 ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock)); 2258 ASSERT(seg->s_szc == 0); 2259 2260 /* 2261 * Initialize protection value for this page. 2262 * If we have per page protection values check it now. 2263 */ 2264 if (svd->pageprot) { 2265 uint_t protchk; 2266 2267 switch (rw) { 2268 case S_READ: 2269 protchk = PROT_READ; 2270 break; 2271 case S_WRITE: 2272 protchk = PROT_WRITE; 2273 break; 2274 case S_EXEC: 2275 protchk = PROT_EXEC; 2276 break; 2277 case S_OTHER: 2278 default: 2279 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 2280 break; 2281 } 2282 2283 prot = VPP_PROT(vpage); 2284 if ((prot & protchk) == 0) 2285 return (FC_PROT); /* illegal access type */ 2286 } else { 2287 prot = svd->prot; 2288 } 2289 2290 if (type == F_SOFTLOCK && svd->vp != NULL) { 2291 mutex_enter(&freemem_lock); 2292 if (availrmem <= tune.t_minarmem) { 2293 mutex_exit(&freemem_lock); 2294 return (FC_MAKE_ERR(ENOMEM)); /* out of real memory */ 2295 } else { 2296 availrmem--; 2297 svd->softlockcnt++; 2298 segvn_pages_locked++; 2299 } 2300 mutex_exit(&freemem_lock); 2301 } 2302 2303 /* 2304 * Always acquire the anon array lock to prevent 2 threads from 2305 * allocating separate anon slots for the same "addr". 2306 */ 2307 2308 if ((amp = svd->amp) != NULL) { 2309 ASSERT(RW_READ_HELD(&->a_rwlock)); 2310 anon_index = svd->anon_index + seg_page(seg, addr); 2311 anon_array_enter(amp, anon_index, &cookie); 2312 anon_lock = 1; 2313 } 2314 2315 if (svd->vp == NULL && amp != NULL) { 2316 if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) { 2317 /* 2318 * Allocate a (normally) writable anonymous page of 2319 * zeroes. If no advance reservations, reserve now. 
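 * (For MAP_NORESERVE segments the swap reservation is made here, one
 * page at a time, as each anonymous page is first touched.)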
2320 */ 2321 if (svd->flags & MAP_NORESERVE) { 2322 if (anon_resv(ptob(1))) { 2323 svd->swresv += ptob(1); 2324 } else { 2325 err = ENOMEM; 2326 goto out; 2327 } 2328 } 2329 if ((pp = anon_zero(seg, addr, &ap, 2330 svd->cred)) == NULL) { 2331 err = ENOMEM; 2332 goto out; /* out of swap space */ 2333 } 2334 /* 2335 * Re-acquire the anon_map lock and 2336 * initialize the anon array entry. 2337 */ 2338 (void) anon_set_ptr(amp->ahp, anon_index, ap, 2339 ANON_SLEEP); 2340 2341 ASSERT(pp->p_szc == 0); 2342 if (type == F_SOFTLOCK) { 2343 if (!segvn_pp_lock_anonpages(pp, first)) { 2344 page_unlock(pp); 2345 err = ENOMEM; 2346 goto out; 2347 } else { 2348 mutex_enter(&freemem_lock); 2349 svd->softlockcnt++; 2350 segvn_pages_locked++; 2351 mutex_exit(&freemem_lock); 2352 } 2353 } 2354 2355 if (enable_mbit_wa) { 2356 if (rw == S_WRITE) 2357 hat_setmod(pp); 2358 else if (!hat_ismod(pp)) 2359 prot &= ~PROT_WRITE; 2360 } 2361 /* 2362 * If AS_PAGLCK is set in a_flags (via memcntl(2) 2363 * with MC_LOCKAS, MCL_FUTURE) and this is a 2364 * MAP_NORESERVE segment, we may need to 2365 * permanently lock the page as it is being faulted 2366 * for the first time. The following text applies 2367 * only to MAP_NORESERVE segments: 2368 * 2369 * As per memcntl(2), if this segment was created 2370 * after MCL_FUTURE was applied (a "future" 2371 * segment), its pages must be locked. If this 2372 * segment existed at MCL_FUTURE application (a 2373 * "past" segment), the interface is unclear. 2374 * 2375 * We decide to lock only if vpage is present: 2376 * 2377 * - "future" segments will have a vpage array (see 2378 * as_map), and so will be locked as required 2379 * 2380 * - "past" segments may not have a vpage array, 2381 * depending on whether events (such as 2382 * mprotect) have occurred. Locking if vpage 2383 * exists will preserve legacy behavior. Not 2384 * locking if vpage is absent, will not break 2385 * the interface or legacy behavior. Note that 2386 * allocating vpage here if it's absent requires 2387 * upgrading the segvn reader lock, the cost of 2388 * which does not seem worthwhile. 2389 */ 2390 if (AS_ISPGLCK(seg->s_as) && vpage != NULL && 2391 (svd->flags & MAP_NORESERVE)) { 2392 claim = VPP_PROT(vpage) & PROT_WRITE; 2393 ASSERT(svd->type == MAP_PRIVATE); 2394 if (page_pp_lock(pp, claim, 0)) 2395 VPP_SETPPLOCK(vpage); 2396 } 2397 2398 2399 /* 2400 * Handle pages that have been marked for migration 2401 */ 2402 if (lgrp_optimizations()) 2403 page_migrate(seg, addr, &pp, 1); 2404 hat_memload(hat, addr, pp, prot, hat_flag); 2405 2406 if (!(hat_flag & HAT_LOAD_LOCK)) 2407 page_unlock(pp); 2408 2409 anon_array_exit(&cookie); 2410 return (0); 2411 } 2412 } 2413 2414 /* 2415 * Obtain the page structure via anon_getpage() if it is 2416 * a private copy of an object (the result of a previous 2417 * copy-on-write). 2418 */ 2419 if (amp != NULL) { 2420 if ((ap = anon_get_ptr(amp->ahp, anon_index)) != NULL) { 2421 err = anon_getpage(&ap, &vpprot, anon_pl, PAGESIZE, 2422 seg, addr, rw, svd->cred); 2423 if (err) 2424 goto out; 2425 2426 if (svd->type == MAP_SHARED) { 2427 /* 2428 * If this is a shared mapping to an 2429 * anon_map, then ignore the write 2430 * permissions returned by anon_getpage(). 2431 * They apply to the private mappings 2432 * of this anon_map. 2433 */ 2434 vpprot |= PROT_WRITE; 2435 } 2436 opp = anon_pl[0]; 2437 } 2438 } 2439 2440 /* 2441 * Search the pl[] list passed in if it is from the 2442 * original object (i.e., not a private copy). 
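 *
 * As an illustration (not from the original source), after two entries
 * have been consumed a pl[] list passed in might look like:
 *
 *     { PAGE_HANDLED, PAGE_HANDLED, pp(off), pp(off + PAGESIZE), ..., NULL }
 *
 * Entries are overwritten with PAGE_HANDLED as they are used so that
 * segvn_pagelist_rele() later skips them.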
2443 */ 2444 if (opp == NULL) { 2445 /* 2446 * Find original page. We must be bringing it in 2447 * from the list in pl[]. 2448 */ 2449 for (ppp = pl; (opp = *ppp) != NULL; ppp++) { 2450 if (opp == PAGE_HANDLED) 2451 continue; 2452 ASSERT(opp->p_vnode == svd->vp); /* XXX */ 2453 if (opp->p_offset == off) 2454 break; 2455 } 2456 if (opp == NULL) { 2457 panic("segvn_faultpage not found"); 2458 /*NOTREACHED*/ 2459 } 2460 *ppp = PAGE_HANDLED; 2461 2462 } 2463 2464 ASSERT(PAGE_LOCKED(opp)); 2465 2466 TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT, 2467 "segvn_fault:pp %p vp %p offset %llx", 2468 opp, NULL, 0); 2469 2470 /* 2471 * The fault is treated as a copy-on-write fault if a 2472 * write occurs on a private segment and the object 2473 * page (i.e., mapping) is write protected. We assume 2474 * that fatal protection checks have already been made. 2475 */ 2476 2477 cow = brkcow && ((vpprot & PROT_WRITE) == 0); 2478 2479 /* 2480 * If not a copy-on-write case load the translation 2481 * and return. 2482 */ 2483 if (cow == 0) { 2484 if (type == F_SOFTLOCK && svd->vp == NULL) { 2485 2486 ASSERT(opp->p_szc == 0 || 2487 (svd->type == MAP_SHARED && 2488 amp != NULL && amp->a_szc != 0)); 2489 2490 if (!segvn_pp_lock_anonpages(opp, first)) { 2491 page_unlock(opp); 2492 err = ENOMEM; 2493 goto out; 2494 } else { 2495 mutex_enter(&freemem_lock); 2496 svd->softlockcnt++; 2497 segvn_pages_locked++; 2498 mutex_exit(&freemem_lock); 2499 } 2500 } 2501 if (IS_VMODSORT(opp->p_vnode) || enable_mbit_wa) { 2502 if (rw == S_WRITE) 2503 hat_setmod(opp); 2504 else if (rw != S_OTHER && !hat_ismod(opp)) 2505 prot &= ~PROT_WRITE; 2506 } 2507 2508 /* 2509 * Handle pages that have been marked for migration 2510 */ 2511 if (lgrp_optimizations()) 2512 page_migrate(seg, addr, &opp, 1); 2513 2514 hat_memload(hat, addr, opp, prot & vpprot, hat_flag); 2515 2516 if (!(hat_flag & HAT_LOAD_LOCK)) 2517 page_unlock(opp); 2518 2519 if (anon_lock) { 2520 anon_array_exit(&cookie); 2521 } 2522 return (0); 2523 } 2524 2525 hat_setref(opp); 2526 2527 ASSERT(amp != NULL && anon_lock); 2528 2529 /* 2530 * Steal the page only if it isn't a private page 2531 * since stealing a private page is not worth the effort. 2532 */ 2533 if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) 2534 steal = 1; 2535 2536 /* 2537 * Steal the original page if the following conditions are true: 2538 * 2539 * We are low on memory, the page is not private, page is not large, 2540 * not shared, not modified, not `locked' or if we have it `locked' 2541 * (i.e., p_cowcnt == 1 and p_lckcnt == 0, which also implies 2542 * that the page is not shared) and if it doesn't have any 2543 * translations. page_struct_lock isn't needed to look at p_cowcnt 2544 * and p_lckcnt because we first get exclusive lock on page. 2545 */ 2546 (void) hat_pagesync(opp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD); 2547 2548 if (stealcow && freemem < minfree && steal && opp->p_szc == 0 && 2549 page_tryupgrade(opp) && !hat_ismod(opp) && 2550 ((opp->p_lckcnt == 0 && opp->p_cowcnt == 0) || 2551 (opp->p_lckcnt == 0 && opp->p_cowcnt == 1 && 2552 vpage != NULL && VPP_ISPPLOCK(vpage)))) { 2553 /* 2554 * Check if this page has other translations 2555 * after unloading our translation. 2556 */ 2557 if (hat_page_is_mapped(opp)) { 2558 hat_unload(seg->s_as->a_hat, addr, PAGESIZE, 2559 HAT_UNLOAD); 2560 } 2561 2562 /* 2563 * hat_unload() might sync back someone else's recent 2564 * modification, so check again. 
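 * Only if the page is still unmodified and no longer mapped do we set
 * STEAL_PAGE below, which lets anon_private() take the page over
 * instead of copying it.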
2565 */ 2566 if (!hat_ismod(opp) && !hat_page_is_mapped(opp)) 2567 pageflags |= STEAL_PAGE; 2568 } 2569 2570 /* 2571 * If we have a vpage pointer, see if it indicates that we have 2572 * ``locked'' the page we map -- if so, tell anon_private to 2573 * transfer the locking resource to the new page. 2574 * 2575 * See Statement at the beginning of segvn_lockop regarding 2576 * the way lockcnts/cowcnts are handled during COW. 2577 * 2578 */ 2579 if (vpage != NULL && VPP_ISPPLOCK(vpage)) 2580 pageflags |= LOCK_PAGE; 2581 2582 /* 2583 * Allocate a private page and perform the copy. 2584 * For MAP_NORESERVE reserve swap space now, unless this 2585 * is a cow fault on an existing anon page in which case 2586 * MAP_NORESERVE will have made advance reservations. 2587 */ 2588 if ((svd->flags & MAP_NORESERVE) && (ap == NULL)) { 2589 if (anon_resv(ptob(1))) { 2590 svd->swresv += ptob(1); 2591 } else { 2592 page_unlock(opp); 2593 err = ENOMEM; 2594 goto out; 2595 } 2596 } 2597 oldap = ap; 2598 pp = anon_private(&ap, seg, addr, prot, opp, pageflags, svd->cred); 2599 if (pp == NULL) { 2600 err = ENOMEM; /* out of swap space */ 2601 goto out; 2602 } 2603 2604 /* 2605 * If we copied away from an anonymous page, then 2606 * we are one step closer to freeing up an anon slot. 2607 * 2608 * NOTE: The original anon slot must be released while 2609 * holding the "anon_map" lock. This is necessary to prevent 2610 * other threads from obtaining a pointer to the anon slot 2611 * which may be freed if its "refcnt" is 1. 2612 */ 2613 if (oldap != NULL) 2614 anon_decref(oldap); 2615 2616 (void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP); 2617 2618 ASSERT(pp->p_szc == 0); 2619 if (type == F_SOFTLOCK && svd->vp == NULL) { 2620 if (!segvn_pp_lock_anonpages(pp, first)) { 2621 page_unlock(pp); 2622 err = ENOMEM; 2623 goto out; 2624 } else { 2625 mutex_enter(&freemem_lock); 2626 svd->softlockcnt++; 2627 segvn_pages_locked++; 2628 mutex_exit(&freemem_lock); 2629 } 2630 } 2631 2632 ASSERT(!IS_VMODSORT(pp->p_vnode)); 2633 if (enable_mbit_wa) { 2634 if (rw == S_WRITE) 2635 hat_setmod(pp); 2636 else if (!hat_ismod(pp)) 2637 prot &= ~PROT_WRITE; 2638 } 2639 2640 2641 /* 2642 * Handle pages that have been marked for migration 2643 */ 2644 if (lgrp_optimizations()) 2645 page_migrate(seg, addr, &pp, 1); 2646 hat_memload(hat, addr, pp, prot, hat_flag); 2647 2648 if (!(hat_flag & HAT_LOAD_LOCK)) 2649 page_unlock(pp); 2650 2651 ASSERT(anon_lock); 2652 anon_array_exit(&cookie); 2653 return (0); 2654 out: 2655 if (anon_lock) 2656 anon_array_exit(&cookie); 2657 2658 if (type == F_SOFTLOCK && svd->vp != NULL) { 2659 mutex_enter(&freemem_lock); 2660 availrmem++; 2661 segvn_pages_locked--; 2662 svd->softlockcnt--; 2663 mutex_exit(&freemem_lock); 2664 } 2665 return (FC_MAKE_ERR(err)); 2666 } 2667 2668 /* 2669 * relocate a bunch of smaller targ pages into one large repl page. all targ 2670 * pages must be complete pages smaller than replacement pages. 2671 * it's assumed that no page's szc can change since they are all PAGESIZE or 2672 * complete large pages locked SHARED. 
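 *
 * Illustrative example (page sizes are hypothetical): if replacement is
 * one 64K page and targ[] holds eight complete 8K pages, each targ[i] is
 * relocated onto the i'th 8K constituent of the replacement and then
 * page_downgrade()d, so the caller ends up with targ[] pointing at the
 * constituents of a single physically contiguous 64K page, locked SHARED.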
2673 */ 2674 static void 2675 segvn_relocate_pages(page_t **targ, page_t *replacement) 2676 { 2677 page_t *pp; 2678 pgcnt_t repl_npgs, curnpgs; 2679 pgcnt_t i; 2680 uint_t repl_szc = replacement->p_szc; 2681 page_t *first_repl = replacement; 2682 page_t *repl; 2683 spgcnt_t npgs; 2684 2685 VM_STAT_ADD(segvnvmstats.relocatepages[0]); 2686 2687 ASSERT(repl_szc != 0); 2688 npgs = repl_npgs = page_get_pagecnt(repl_szc); 2689 2690 i = 0; 2691 while (repl_npgs) { 2692 spgcnt_t nreloc; 2693 int err; 2694 ASSERT(replacement != NULL); 2695 pp = targ[i]; 2696 ASSERT(pp->p_szc < repl_szc); 2697 ASSERT(PAGE_EXCL(pp)); 2698 ASSERT(!PP_ISFREE(pp)); 2699 curnpgs = page_get_pagecnt(pp->p_szc); 2700 if (curnpgs == 1) { 2701 VM_STAT_ADD(segvnvmstats.relocatepages[1]); 2702 repl = replacement; 2703 page_sub(&replacement, repl); 2704 ASSERT(PAGE_EXCL(repl)); 2705 ASSERT(!PP_ISFREE(repl)); 2706 ASSERT(repl->p_szc == repl_szc); 2707 } else { 2708 page_t *repl_savepp; 2709 int j; 2710 VM_STAT_ADD(segvnvmstats.relocatepages[2]); 2711 repl_savepp = replacement; 2712 for (j = 0; j < curnpgs; j++) { 2713 repl = replacement; 2714 page_sub(&replacement, repl); 2715 ASSERT(PAGE_EXCL(repl)); 2716 ASSERT(!PP_ISFREE(repl)); 2717 ASSERT(repl->p_szc == repl_szc); 2718 ASSERT(page_pptonum(targ[i + j]) == 2719 page_pptonum(targ[i]) + j); 2720 } 2721 repl = repl_savepp; 2722 ASSERT(IS_P2ALIGNED(page_pptonum(repl), curnpgs)); 2723 } 2724 err = page_relocate(&pp, &repl, 0, 1, &nreloc, NULL); 2725 if (err || nreloc != curnpgs) { 2726 panic("segvn_relocate_pages: " 2727 "page_relocate failed err=%d curnpgs=%ld " 2728 "nreloc=%ld", err, curnpgs, nreloc); 2729 } 2730 ASSERT(curnpgs <= repl_npgs); 2731 repl_npgs -= curnpgs; 2732 i += curnpgs; 2733 } 2734 ASSERT(replacement == NULL); 2735 2736 repl = first_repl; 2737 repl_npgs = npgs; 2738 for (i = 0; i < repl_npgs; i++) { 2739 ASSERT(PAGE_EXCL(repl)); 2740 ASSERT(!PP_ISFREE(repl)); 2741 targ[i] = repl; 2742 page_downgrade(targ[i]); 2743 repl++; 2744 } 2745 } 2746 2747 /* 2748 * Check if all pages in ppa array are complete smaller than szc pages and 2749 * their roots will still be aligned relative to their current size if the 2750 * entire ppa array is relocated into one szc page. If these conditions are 2751 * not met return 0. 2752 * 2753 * If all pages are properly aligned attempt to upgrade their locks 2754 * to exclusive mode. If it fails set *upgrdfail to 1 and return 0. 2755 * upgrdfail was set to 0 by caller. 2756 * 2757 * Return 1 if all pages are aligned and locked exclusively. 2758 * 2759 * If all pages in ppa array happen to be physically contiguous to make one 2760 * szc page and all exclusive locks are successfully obtained promote the page 2761 * size to szc and set *pszc to szc. Return 1 with pages locked shared. 
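 *
 * Sketch of the expected caller pattern (simplified, not a verbatim quote
 * of segvn_fault_vnodepages()):
 *
 *     upgrdfail = 0;
 *     if (!segvn_full_szcpages(ppa, szc, &upgrdfail, &pszc)) {
 *             if (upgrdfail)
 *                     ... retry the fault with a smaller page size ...
 *             else
 *                     ... fall back to loading small mappings ...
 *     }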
2762 */ 2763 static int 2764 segvn_full_szcpages(page_t **ppa, uint_t szc, int *upgrdfail, uint_t *pszc) 2765 { 2766 page_t *pp; 2767 pfn_t pfn; 2768 pgcnt_t totnpgs = page_get_pagecnt(szc); 2769 pfn_t first_pfn; 2770 int contig = 1; 2771 pgcnt_t i; 2772 pgcnt_t j; 2773 uint_t curszc; 2774 pgcnt_t curnpgs; 2775 int root = 0; 2776 2777 ASSERT(szc > 0); 2778 2779 VM_STAT_ADD(segvnvmstats.fullszcpages[0]); 2780 2781 for (i = 0; i < totnpgs; i++) { 2782 pp = ppa[i]; 2783 ASSERT(PAGE_SHARED(pp)); 2784 ASSERT(!PP_ISFREE(pp)); 2785 pfn = page_pptonum(pp); 2786 if (i == 0) { 2787 if (!IS_P2ALIGNED(pfn, totnpgs)) { 2788 contig = 0; 2789 } else { 2790 first_pfn = pfn; 2791 } 2792 } else if (contig && pfn != first_pfn + i) { 2793 contig = 0; 2794 } 2795 if (pp->p_szc == 0) { 2796 if (root) { 2797 VM_STAT_ADD(segvnvmstats.fullszcpages[1]); 2798 return (0); 2799 } 2800 } else if (!root) { 2801 if ((curszc = pp->p_szc) >= szc) { 2802 VM_STAT_ADD(segvnvmstats.fullszcpages[2]); 2803 return (0); 2804 } 2805 if (curszc == 0) { 2806 /* 2807 * p_szc changed means we don't have all pages 2808 * locked. return failure. 2809 */ 2810 VM_STAT_ADD(segvnvmstats.fullszcpages[3]); 2811 return (0); 2812 } 2813 curnpgs = page_get_pagecnt(curszc); 2814 if (!IS_P2ALIGNED(pfn, curnpgs) || 2815 !IS_P2ALIGNED(i, curnpgs)) { 2816 VM_STAT_ADD(segvnvmstats.fullszcpages[4]); 2817 return (0); 2818 } 2819 root = 1; 2820 } else { 2821 ASSERT(i > 0); 2822 VM_STAT_ADD(segvnvmstats.fullszcpages[5]); 2823 if (pp->p_szc != curszc) { 2824 VM_STAT_ADD(segvnvmstats.fullszcpages[6]); 2825 return (0); 2826 } 2827 if (pfn - 1 != page_pptonum(ppa[i - 1])) { 2828 panic("segvn_full_szcpages: " 2829 "large page not physically contiguous"); 2830 } 2831 if (P2PHASE(pfn, curnpgs) == curnpgs - 1) { 2832 root = 0; 2833 } 2834 } 2835 } 2836 2837 for (i = 0; i < totnpgs; i++) { 2838 ASSERT(ppa[i]->p_szc < szc); 2839 if (!page_tryupgrade(ppa[i])) { 2840 for (j = 0; j < i; j++) { 2841 page_downgrade(ppa[j]); 2842 } 2843 *pszc = ppa[i]->p_szc; 2844 *upgrdfail = 1; 2845 VM_STAT_ADD(segvnvmstats.fullszcpages[7]); 2846 return (0); 2847 } 2848 } 2849 2850 /* 2851 * When a page is put a free cachelist its szc is set to 0. if file 2852 * system reclaimed pages from cachelist targ pages will be physically 2853 * contiguous with 0 p_szc. in this case just upgrade szc of targ 2854 * pages without any relocations. 2855 * To avoid any hat issues with previous small mappings 2856 * hat_pageunload() the target pages first. 2857 */ 2858 if (contig) { 2859 VM_STAT_ADD(segvnvmstats.fullszcpages[8]); 2860 for (i = 0; i < totnpgs; i++) { 2861 (void) hat_pageunload(ppa[i], HAT_FORCE_PGUNLOAD); 2862 } 2863 for (i = 0; i < totnpgs; i++) { 2864 ppa[i]->p_szc = szc; 2865 } 2866 for (i = 0; i < totnpgs; i++) { 2867 ASSERT(PAGE_EXCL(ppa[i])); 2868 page_downgrade(ppa[i]); 2869 } 2870 if (pszc != NULL) { 2871 *pszc = szc; 2872 } 2873 } 2874 VM_STAT_ADD(segvnvmstats.fullszcpages[9]); 2875 return (1); 2876 } 2877 2878 /* 2879 * Create physically contiguous pages for [vp, off] - [vp, off + 2880 * page_size(szc)) range and for private segment return them in ppa array. 2881 * Pages are created either via IO or relocations. 2882 * 2883 * Return 1 on sucess and 0 on failure. 2884 * 2885 * If physically contiguos pages already exist for this range return 1 without 2886 * filling ppa array. Caller initializes ppa[0] as NULL to detect that ppa 2887 * array wasn't filled. In this case caller fills ppa array via VOP_GETPAGE(). 
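 *
 * Caller convention, sketched (see the use in segvn_fault_vnodepages()):
 *
 *     ppa[0] = NULL;
 *     physcontig = segvn_fill_vp_pages(svd, vp, off, szc, ppa, &pplist,
 *         &pszc, &downsize);
 *     if (physcontig && ppa[0] == NULL)
 *             ... contiguous pages already existed; fetch them with
 *                 VOP_GETPAGE() as usual ...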
2888 */ 2889 2890 static int 2891 segvn_fill_vp_pages(struct segvn_data *svd, vnode_t *vp, u_offset_t off, 2892 uint_t szc, page_t **ppa, page_t **ppplist, uint_t *ret_pszc, 2893 int *downsize) 2894 2895 { 2896 page_t *pplist = *ppplist; 2897 size_t pgsz = page_get_pagesize(szc); 2898 pgcnt_t pages = btop(pgsz); 2899 ulong_t start_off = off; 2900 u_offset_t eoff = off + pgsz; 2901 spgcnt_t nreloc; 2902 u_offset_t io_off = off; 2903 size_t io_len; 2904 page_t *io_pplist = NULL; 2905 page_t *done_pplist = NULL; 2906 pgcnt_t pgidx = 0; 2907 page_t *pp; 2908 page_t *newpp; 2909 page_t *targpp; 2910 int io_err = 0; 2911 int i; 2912 pfn_t pfn; 2913 ulong_t ppages; 2914 page_t *targ_pplist = NULL; 2915 page_t *repl_pplist = NULL; 2916 page_t *tmp_pplist; 2917 int nios = 0; 2918 uint_t pszc; 2919 struct vattr va; 2920 2921 VM_STAT_ADD(segvnvmstats.fill_vp_pages[0]); 2922 2923 ASSERT(szc != 0); 2924 ASSERT(pplist->p_szc == szc); 2925 2926 /* 2927 * downsize will be set to 1 only if we fail to lock pages. this will 2928 * allow subsequent faults to try to relocate the page again. If we 2929 * fail due to misalignment don't downsize and let the caller map the 2930 * whole region with small mappings to avoid more faults into the area 2931 * where we can't get large pages anyway. 2932 */ 2933 *downsize = 0; 2934 2935 while (off < eoff) { 2936 newpp = pplist; 2937 ASSERT(newpp != NULL); 2938 ASSERT(PAGE_EXCL(newpp)); 2939 ASSERT(!PP_ISFREE(newpp)); 2940 /* 2941 * we pass NULL for nrelocp to page_lookup_create() 2942 * so that it doesn't relocate. We relocate here 2943 * later only after we make sure we can lock all 2944 * pages in the range we handle and they are all 2945 * aligned. 2946 */ 2947 pp = page_lookup_create(vp, off, SE_SHARED, newpp, NULL, 0); 2948 ASSERT(pp != NULL); 2949 ASSERT(!PP_ISFREE(pp)); 2950 ASSERT(pp->p_vnode == vp); 2951 ASSERT(pp->p_offset == off); 2952 if (pp == newpp) { 2953 VM_STAT_ADD(segvnvmstats.fill_vp_pages[1]); 2954 page_sub(&pplist, pp); 2955 ASSERT(PAGE_EXCL(pp)); 2956 ASSERT(page_iolock_assert(pp)); 2957 page_list_concat(&io_pplist, &pp); 2958 off += PAGESIZE; 2959 continue; 2960 } 2961 VM_STAT_ADD(segvnvmstats.fill_vp_pages[2]); 2962 pfn = page_pptonum(pp); 2963 pszc = pp->p_szc; 2964 if (pszc >= szc && targ_pplist == NULL && io_pplist == NULL && 2965 IS_P2ALIGNED(pfn, pages)) { 2966 ASSERT(repl_pplist == NULL); 2967 ASSERT(done_pplist == NULL); 2968 ASSERT(pplist == *ppplist); 2969 page_unlock(pp); 2970 page_free_replacement_page(pplist); 2971 page_create_putback(pages); 2972 *ppplist = NULL; 2973 VM_STAT_ADD(segvnvmstats.fill_vp_pages[3]); 2974 return (1); 2975 } 2976 if (pszc >= szc) { 2977 page_unlock(pp); 2978 segvn_faultvnmpss_align_err1++; 2979 goto out; 2980 } 2981 ppages = page_get_pagecnt(pszc); 2982 if (!IS_P2ALIGNED(pfn, ppages)) { 2983 ASSERT(pszc > 0); 2984 /* 2985 * sizing down to pszc won't help. 2986 */ 2987 page_unlock(pp); 2988 segvn_faultvnmpss_align_err2++; 2989 goto out; 2990 } 2991 pfn = page_pptonum(newpp); 2992 if (!IS_P2ALIGNED(pfn, ppages)) { 2993 ASSERT(pszc > 0); 2994 /* 2995 * sizing down to pszc won't help. 
2996 */ 2997 page_unlock(pp); 2998 segvn_faultvnmpss_align_err3++; 2999 goto out; 3000 } 3001 if (!PAGE_EXCL(pp)) { 3002 VM_STAT_ADD(segvnvmstats.fill_vp_pages[4]); 3003 page_unlock(pp); 3004 *downsize = 1; 3005 *ret_pszc = pp->p_szc; 3006 goto out; 3007 } 3008 targpp = pp; 3009 if (io_pplist != NULL) { 3010 VM_STAT_ADD(segvnvmstats.fill_vp_pages[5]); 3011 io_len = off - io_off; 3012 /* 3013 * Some file systems like NFS don't check EOF 3014 * conditions in VOP_PAGEIO(). Check it here 3015 * now that pages are locked SE_EXCL. Any file 3016 * truncation will wait until the pages are 3017 * unlocked so no need to worry that file will 3018 * be truncated after we check its size here. 3019 * XXX fix NFS to remove this check. 3020 */ 3021 va.va_mask = AT_SIZE; 3022 if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred) != 0) { 3023 VM_STAT_ADD(segvnvmstats.fill_vp_pages[6]); 3024 page_unlock(targpp); 3025 goto out; 3026 } 3027 if (btopr(va.va_size) < btopr(io_off + io_len)) { 3028 VM_STAT_ADD(segvnvmstats.fill_vp_pages[7]); 3029 *downsize = 1; 3030 *ret_pszc = 0; 3031 page_unlock(targpp); 3032 goto out; 3033 } 3034 io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len, 3035 B_READ, svd->cred); 3036 if (io_err) { 3037 VM_STAT_ADD(segvnvmstats.fill_vp_pages[8]); 3038 page_unlock(targpp); 3039 if (io_err == EDEADLK) { 3040 segvn_vmpss_pageio_deadlk_err++; 3041 } 3042 goto out; 3043 } 3044 nios++; 3045 VM_STAT_ADD(segvnvmstats.fill_vp_pages[9]); 3046 while (io_pplist != NULL) { 3047 pp = io_pplist; 3048 page_sub(&io_pplist, pp); 3049 ASSERT(page_iolock_assert(pp)); 3050 page_io_unlock(pp); 3051 pgidx = (pp->p_offset - start_off) >> 3052 PAGESHIFT; 3053 ASSERT(pgidx < pages); 3054 ppa[pgidx] = pp; 3055 page_list_concat(&done_pplist, &pp); 3056 } 3057 } 3058 pp = targpp; 3059 ASSERT(PAGE_EXCL(pp)); 3060 ASSERT(pp->p_szc <= pszc); 3061 if (pszc != 0 && !group_page_trylock(pp, SE_EXCL)) { 3062 VM_STAT_ADD(segvnvmstats.fill_vp_pages[10]); 3063 page_unlock(pp); 3064 *downsize = 1; 3065 *ret_pszc = pp->p_szc; 3066 goto out; 3067 } 3068 VM_STAT_ADD(segvnvmstats.fill_vp_pages[11]); 3069 /* 3070 * page szc chould have changed before the entire group was 3071 * locked. reread page szc. 
3072 */ 3073 pszc = pp->p_szc; 3074 ppages = page_get_pagecnt(pszc); 3075 3076 /* link just the roots */ 3077 page_list_concat(&targ_pplist, &pp); 3078 page_sub(&pplist, newpp); 3079 page_list_concat(&repl_pplist, &newpp); 3080 off += PAGESIZE; 3081 while (--ppages != 0) { 3082 newpp = pplist; 3083 page_sub(&pplist, newpp); 3084 off += PAGESIZE; 3085 } 3086 io_off = off; 3087 } 3088 if (io_pplist != NULL) { 3089 VM_STAT_ADD(segvnvmstats.fill_vp_pages[12]); 3090 io_len = eoff - io_off; 3091 va.va_mask = AT_SIZE; 3092 if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred) != 0) { 3093 VM_STAT_ADD(segvnvmstats.fill_vp_pages[13]); 3094 goto out; 3095 } 3096 if (btopr(va.va_size) < btopr(io_off + io_len)) { 3097 VM_STAT_ADD(segvnvmstats.fill_vp_pages[14]); 3098 *downsize = 1; 3099 *ret_pszc = 0; 3100 goto out; 3101 } 3102 io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len, 3103 B_READ, svd->cred); 3104 if (io_err) { 3105 VM_STAT_ADD(segvnvmstats.fill_vp_pages[15]); 3106 if (io_err == EDEADLK) { 3107 segvn_vmpss_pageio_deadlk_err++; 3108 } 3109 goto out; 3110 } 3111 nios++; 3112 while (io_pplist != NULL) { 3113 pp = io_pplist; 3114 page_sub(&io_pplist, pp); 3115 ASSERT(page_iolock_assert(pp)); 3116 page_io_unlock(pp); 3117 pgidx = (pp->p_offset - start_off) >> PAGESHIFT; 3118 ASSERT(pgidx < pages); 3119 ppa[pgidx] = pp; 3120 } 3121 } 3122 /* 3123 * we're now bound to succeed or panic. 3124 * remove pages from done_pplist. it's not needed anymore. 3125 */ 3126 while (done_pplist != NULL) { 3127 pp = done_pplist; 3128 page_sub(&done_pplist, pp); 3129 } 3130 VM_STAT_ADD(segvnvmstats.fill_vp_pages[16]); 3131 ASSERT(pplist == NULL); 3132 *ppplist = NULL; 3133 while (targ_pplist != NULL) { 3134 int ret; 3135 VM_STAT_ADD(segvnvmstats.fill_vp_pages[17]); 3136 ASSERT(repl_pplist); 3137 pp = targ_pplist; 3138 page_sub(&targ_pplist, pp); 3139 pgidx = (pp->p_offset - start_off) >> PAGESHIFT; 3140 newpp = repl_pplist; 3141 page_sub(&repl_pplist, newpp); 3142 #ifdef DEBUG 3143 pfn = page_pptonum(pp); 3144 pszc = pp->p_szc; 3145 ppages = page_get_pagecnt(pszc); 3146 ASSERT(IS_P2ALIGNED(pfn, ppages)); 3147 pfn = page_pptonum(newpp); 3148 ASSERT(IS_P2ALIGNED(pfn, ppages)); 3149 ASSERT(P2PHASE(pfn, pages) == pgidx); 3150 #endif 3151 nreloc = 0; 3152 ret = page_relocate(&pp, &newpp, 0, 1, &nreloc, NULL); 3153 if (ret != 0 || nreloc == 0) { 3154 panic("segvn_fill_vp_pages: " 3155 "page_relocate failed"); 3156 } 3157 pp = newpp; 3158 while (nreloc-- != 0) { 3159 ASSERT(PAGE_EXCL(pp)); 3160 ASSERT(pp->p_vnode == vp); 3161 ASSERT(pgidx == 3162 ((pp->p_offset - start_off) >> PAGESHIFT)); 3163 ppa[pgidx++] = pp; 3164 pp++; 3165 } 3166 } 3167 3168 if (svd->type == MAP_PRIVATE) { 3169 VM_STAT_ADD(segvnvmstats.fill_vp_pages[18]); 3170 for (i = 0; i < pages; i++) { 3171 ASSERT(ppa[i] != NULL); 3172 ASSERT(PAGE_EXCL(ppa[i])); 3173 ASSERT(ppa[i]->p_vnode == vp); 3174 ASSERT(ppa[i]->p_offset == 3175 start_off + (i << PAGESHIFT)); 3176 page_downgrade(ppa[i]); 3177 } 3178 ppa[pages] = NULL; 3179 } else { 3180 VM_STAT_ADD(segvnvmstats.fill_vp_pages[19]); 3181 /* 3182 * the caller will still call VOP_GETPAGE() for shared segments 3183 * to check FS write permissions. For private segments we map 3184 * file read only anyway. so no VOP_GETPAGE is needed. 
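 * (The shared case below therefore drops the page locks and resets
 * ppa[0] to NULL so the caller can tell the array was not filled.)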
3185 */ 3186 for (i = 0; i < pages; i++) { 3187 ASSERT(ppa[i] != NULL); 3188 ASSERT(PAGE_EXCL(ppa[i])); 3189 ASSERT(ppa[i]->p_vnode == vp); 3190 ASSERT(ppa[i]->p_offset == 3191 start_off + (i << PAGESHIFT)); 3192 page_unlock(ppa[i]); 3193 } 3194 ppa[0] = NULL; 3195 } 3196 3197 return (1); 3198 out: 3199 /* 3200 * Do the cleanup. Unlock target pages we didn't relocate. They are 3201 * linked on targ_pplist by root pages. reassemble unused replacement 3202 * and io pages back to pplist. 3203 */ 3204 if (io_pplist != NULL) { 3205 VM_STAT_ADD(segvnvmstats.fill_vp_pages[20]); 3206 pp = io_pplist; 3207 do { 3208 ASSERT(pp->p_vnode == vp); 3209 ASSERT(pp->p_offset == io_off); 3210 ASSERT(page_iolock_assert(pp)); 3211 page_io_unlock(pp); 3212 page_hashout(pp, NULL); 3213 io_off += PAGESIZE; 3214 } while ((pp = pp->p_next) != io_pplist); 3215 page_list_concat(&io_pplist, &pplist); 3216 pplist = io_pplist; 3217 } 3218 tmp_pplist = NULL; 3219 while (targ_pplist != NULL) { 3220 VM_STAT_ADD(segvnvmstats.fill_vp_pages[21]); 3221 pp = targ_pplist; 3222 ASSERT(PAGE_EXCL(pp)); 3223 page_sub(&targ_pplist, pp); 3224 3225 pszc = pp->p_szc; 3226 ppages = page_get_pagecnt(pszc); 3227 ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages)); 3228 3229 if (pszc != 0) { 3230 group_page_unlock(pp); 3231 } 3232 page_unlock(pp); 3233 3234 pp = repl_pplist; 3235 ASSERT(pp != NULL); 3236 ASSERT(PAGE_EXCL(pp)); 3237 ASSERT(pp->p_szc == szc); 3238 page_sub(&repl_pplist, pp); 3239 3240 ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages)); 3241 3242 /* relink replacement page */ 3243 page_list_concat(&tmp_pplist, &pp); 3244 while (--ppages != 0) { 3245 VM_STAT_ADD(segvnvmstats.fill_vp_pages[22]); 3246 pp++; 3247 ASSERT(PAGE_EXCL(pp)); 3248 ASSERT(pp->p_szc == szc); 3249 page_list_concat(&tmp_pplist, &pp); 3250 } 3251 } 3252 if (tmp_pplist != NULL) { 3253 VM_STAT_ADD(segvnvmstats.fill_vp_pages[23]); 3254 page_list_concat(&tmp_pplist, &pplist); 3255 pplist = tmp_pplist; 3256 } 3257 /* 3258 * at this point all pages are either on done_pplist or 3259 * pplist. They can't be all on done_pplist otherwise 3260 * we'd've been done. 3261 */ 3262 ASSERT(pplist != NULL); 3263 if (nios != 0) { 3264 VM_STAT_ADD(segvnvmstats.fill_vp_pages[24]); 3265 pp = pplist; 3266 do { 3267 VM_STAT_ADD(segvnvmstats.fill_vp_pages[25]); 3268 ASSERT(pp->p_szc == szc); 3269 ASSERT(PAGE_EXCL(pp)); 3270 ASSERT(pp->p_vnode != vp); 3271 pp->p_szc = 0; 3272 } while ((pp = pp->p_next) != pplist); 3273 3274 pp = done_pplist; 3275 do { 3276 VM_STAT_ADD(segvnvmstats.fill_vp_pages[26]); 3277 ASSERT(pp->p_szc == szc); 3278 ASSERT(PAGE_EXCL(pp)); 3279 ASSERT(pp->p_vnode == vp); 3280 pp->p_szc = 0; 3281 } while ((pp = pp->p_next) != done_pplist); 3282 3283 while (pplist != NULL) { 3284 VM_STAT_ADD(segvnvmstats.fill_vp_pages[27]); 3285 pp = pplist; 3286 page_sub(&pplist, pp); 3287 page_free(pp, 0); 3288 } 3289 3290 while (done_pplist != NULL) { 3291 VM_STAT_ADD(segvnvmstats.fill_vp_pages[28]); 3292 pp = done_pplist; 3293 page_sub(&done_pplist, pp); 3294 page_unlock(pp); 3295 } 3296 *ppplist = NULL; 3297 return (0); 3298 } 3299 ASSERT(pplist == *ppplist); 3300 if (io_err) { 3301 VM_STAT_ADD(segvnvmstats.fill_vp_pages[29]); 3302 /* 3303 * don't downsize on io error. 3304 * see if vop_getpage succeeds. 3305 * pplist may still be used in this case 3306 * for relocations. 
3307 */ 3308 return (0); 3309 } 3310 VM_STAT_ADD(segvnvmstats.fill_vp_pages[30]); 3311 page_free_replacement_page(pplist); 3312 page_create_putback(pages); 3313 *ppplist = NULL; 3314 return (0); 3315 } 3316 3317 int segvn_anypgsz = 0; 3318 3319 #define SEGVN_RESTORE_SOFTLOCK(type, pages) \ 3320 if ((type) == F_SOFTLOCK) { \ 3321 mutex_enter(&freemem_lock); \ 3322 availrmem += (pages); \ 3323 segvn_pages_locked -= (pages); \ 3324 svd->softlockcnt -= (pages); \ 3325 mutex_exit(&freemem_lock); \ 3326 } 3327 3328 #define SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot) \ 3329 if (IS_VMODSORT((ppa)[0]->p_vnode)) { \ 3330 if ((rw) == S_WRITE) { \ 3331 for (i = 0; i < (pages); i++) { \ 3332 ASSERT((ppa)[i]->p_vnode == \ 3333 (ppa)[0]->p_vnode); \ 3334 hat_setmod((ppa)[i]); \ 3335 } \ 3336 } else if ((rw) != S_OTHER && \ 3337 ((prot) & (vpprot) & PROT_WRITE)) { \ 3338 for (i = 0; i < (pages); i++) { \ 3339 ASSERT((ppa)[i]->p_vnode == \ 3340 (ppa)[0]->p_vnode); \ 3341 if (!hat_ismod((ppa)[i])) { \ 3342 prot &= ~PROT_WRITE; \ 3343 break; \ 3344 } \ 3345 } \ 3346 } \ 3347 } 3348 3349 #ifdef VM_STATS 3350 3351 #define SEGVN_VMSTAT_FLTVNPAGES(idx) \ 3352 VM_STAT_ADD(segvnvmstats.fltvnpages[(idx)]); 3353 3354 #else /* VM_STATS */ 3355 3356 #define SEGVN_VMSTAT_FLTVNPAGES(idx) 3357 3358 #endif 3359 3360 static faultcode_t 3361 segvn_fault_vnodepages(struct hat *hat, struct seg *seg, caddr_t lpgaddr, 3362 caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr, 3363 caddr_t eaddr, int brkcow) 3364 { 3365 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 3366 struct anon_map *amp = svd->amp; 3367 uchar_t segtype = svd->type; 3368 uint_t szc = seg->s_szc; 3369 size_t pgsz = page_get_pagesize(szc); 3370 size_t maxpgsz = pgsz; 3371 pgcnt_t pages = btop(pgsz); 3372 pgcnt_t maxpages = pages; 3373 size_t ppasize = (pages + 1) * sizeof (page_t *); 3374 caddr_t a = lpgaddr; 3375 caddr_t maxlpgeaddr = lpgeaddr; 3376 u_offset_t off = svd->offset + (uintptr_t)(a - seg->s_base); 3377 ulong_t aindx = svd->anon_index + seg_page(seg, a); 3378 struct vpage *vpage = (svd->vpage != NULL) ? 3379 &svd->vpage[seg_page(seg, a)] : NULL; 3380 vnode_t *vp = svd->vp; 3381 page_t **ppa; 3382 uint_t pszc; 3383 size_t ppgsz; 3384 pgcnt_t ppages; 3385 faultcode_t err = 0; 3386 int ierr; 3387 int vop_size_err = 0; 3388 uint_t protchk, prot, vpprot; 3389 ulong_t i; 3390 int hat_flag = (type == F_SOFTLOCK) ? 
HAT_LOAD_LOCK : HAT_LOAD; 3391 anon_sync_obj_t an_cookie; 3392 enum seg_rw arw; 3393 int alloc_failed = 0; 3394 int adjszc_chk; 3395 struct vattr va; 3396 int xhat = 0; 3397 page_t *pplist; 3398 pfn_t pfn; 3399 int physcontig; 3400 int upgrdfail; 3401 int segvn_anypgsz_vnode = 0; /* for now map vnode with 2 page sizes */ 3402 3403 ASSERT(szc != 0); 3404 ASSERT(vp != NULL); 3405 ASSERT(brkcow == 0 || amp != NULL); 3406 ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */ 3407 ASSERT(!(svd->flags & MAP_NORESERVE)); 3408 ASSERT(type != F_SOFTUNLOCK); 3409 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 3410 ASSERT(amp == NULL || IS_P2ALIGNED(aindx, maxpages)); 3411 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 3412 ASSERT(seg->s_szc < NBBY * sizeof (int)); 3413 ASSERT(type != F_SOFTLOCK || lpgeaddr - a == maxpgsz); 3414 3415 VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltvnpages[0]); 3416 VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltvnpages[1]); 3417 3418 if (svd->flags & MAP_TEXT) { 3419 hat_flag |= HAT_LOAD_TEXT; 3420 } 3421 3422 if (svd->pageprot) { 3423 switch (rw) { 3424 case S_READ: 3425 protchk = PROT_READ; 3426 break; 3427 case S_WRITE: 3428 protchk = PROT_WRITE; 3429 break; 3430 case S_EXEC: 3431 protchk = PROT_EXEC; 3432 break; 3433 case S_OTHER: 3434 default: 3435 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 3436 break; 3437 } 3438 } else { 3439 prot = svd->prot; 3440 /* caller has already done segment level protection check. */ 3441 } 3442 3443 if (seg->s_as->a_hat != hat) { 3444 xhat = 1; 3445 } 3446 3447 if (rw == S_WRITE && segtype == MAP_PRIVATE) { 3448 SEGVN_VMSTAT_FLTVNPAGES(2); 3449 arw = S_READ; 3450 } else { 3451 arw = rw; 3452 } 3453 3454 ppa = kmem_alloc(ppasize, KM_SLEEP); 3455 3456 VM_STAT_COND_ADD(amp != NULL, segvnvmstats.fltvnpages[3]); 3457 3458 for (;;) { 3459 adjszc_chk = 0; 3460 for (; a < lpgeaddr; a += pgsz, off += pgsz, aindx += pages) { 3461 if (adjszc_chk) { 3462 while (szc < seg->s_szc) { 3463 uintptr_t e; 3464 uint_t tszc; 3465 tszc = segvn_anypgsz_vnode ? 
szc + 1 :
3466 seg->s_szc;
3467 ppgsz = page_get_pagesize(tszc);
3468 if (!IS_P2ALIGNED(a, ppgsz) ||
3469 ((alloc_failed >> tszc) &
3470 0x1)) {
3471 break;
3472 }
3473 SEGVN_VMSTAT_FLTVNPAGES(4);
3474 szc = tszc;
3475 pgsz = ppgsz;
3476 pages = btop(pgsz);
3477 e = P2ROUNDUP((uintptr_t)eaddr, pgsz);
3478 lpgeaddr = (caddr_t)e;
3479 }
3480 }
3481
3482 again:
3483 if (IS_P2ALIGNED(a, maxpgsz) && amp != NULL) {
3484 ASSERT(IS_P2ALIGNED(aindx, maxpages));
3485 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
3486 anon_array_enter(amp, aindx, &an_cookie);
3487 if (anon_get_ptr(amp->ahp, aindx) != NULL) {
3488 SEGVN_VMSTAT_FLTVNPAGES(5);
3489 if (anon_pages(amp->ahp, aindx,
3490 maxpages) != maxpages) {
3491 panic("segvn_fault_vnodepages:"
3492 " empty anon slots\n");
3493 }
3494 anon_array_exit(&an_cookie);
3495 ANON_LOCK_EXIT(&amp->a_rwlock);
3496 err = segvn_fault_anonpages(hat, seg,
3497 a, a + maxpgsz, type, rw,
3498 MAX(a, addr),
3499 MIN(a + maxpgsz, eaddr), brkcow);
3500 if (err != 0) {
3501 SEGVN_VMSTAT_FLTVNPAGES(6);
3502 goto out;
3503 }
3504 if (szc < seg->s_szc) {
3505 szc = seg->s_szc;
3506 pgsz = maxpgsz;
3507 pages = maxpages;
3508 lpgeaddr = maxlpgeaddr;
3509 }
3510 goto next;
3511 } else if (anon_pages(amp->ahp, aindx,
3512 maxpages)) {
3513 panic("segvn_fault_vnodepages:"
3514 " non empty anon slots\n");
3515 } else {
3516 SEGVN_VMSTAT_FLTVNPAGES(7);
3517 anon_array_exit(&an_cookie);
3518 ANON_LOCK_EXIT(&amp->a_rwlock);
3519 }
3520 }
3521 ASSERT(!brkcow || IS_P2ALIGNED(a, maxpgsz));
3522
3523 if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) {
3524 ASSERT(vpage != NULL);
3525 prot = VPP_PROT(vpage);
3526 ASSERT(sameprot(seg, a, maxpgsz));
3527 if ((prot & protchk) == 0) {
3528 SEGVN_VMSTAT_FLTVNPAGES(8);
3529 err = FC_PROT;
3530 goto out;
3531 }
3532 }
3533 if (type == F_SOFTLOCK) {
3534 mutex_enter(&freemem_lock);
3535 if (availrmem < tune.t_minarmem + pages) {
3536 mutex_exit(&freemem_lock);
3537 err = FC_MAKE_ERR(ENOMEM);
3538 goto out;
3539 } else {
3540 availrmem -= pages;
3541 segvn_pages_locked += pages;
3542 svd->softlockcnt += pages;
3543 }
3544 mutex_exit(&freemem_lock);
3545 }
3546
3547 pplist = NULL;
3548 physcontig = 0;
3549 ppa[0] = NULL;
3550 if (!brkcow && szc &&
3551 !page_exists_physcontig(vp, off, szc,
3552 segtype == MAP_PRIVATE ?
ppa : NULL)) { 3553 SEGVN_VMSTAT_FLTVNPAGES(9); 3554 if (page_alloc_pages(vp, seg, a, &pplist, NULL, 3555 szc, 0) && type != F_SOFTLOCK) { 3556 SEGVN_VMSTAT_FLTVNPAGES(10); 3557 pszc = 0; 3558 ierr = -1; 3559 alloc_failed |= (1 << szc); 3560 break; 3561 } 3562 if (pplist != NULL && 3563 vp->v_mpssdata == SEGVN_PAGEIO) { 3564 int downsize; 3565 SEGVN_VMSTAT_FLTVNPAGES(11); 3566 physcontig = segvn_fill_vp_pages(svd, 3567 vp, off, szc, ppa, &pplist, 3568 &pszc, &downsize); 3569 ASSERT(!physcontig || pplist == NULL); 3570 if (!physcontig && downsize && 3571 type != F_SOFTLOCK) { 3572 ASSERT(pplist == NULL); 3573 SEGVN_VMSTAT_FLTVNPAGES(12); 3574 ierr = -1; 3575 break; 3576 } 3577 ASSERT(!physcontig || 3578 segtype == MAP_PRIVATE || 3579 ppa[0] == NULL); 3580 if (physcontig && ppa[0] == NULL) { 3581 physcontig = 0; 3582 } 3583 } 3584 } else if (!brkcow && szc && ppa[0] != NULL) { 3585 SEGVN_VMSTAT_FLTVNPAGES(13); 3586 ASSERT(segtype == MAP_PRIVATE); 3587 physcontig = 1; 3588 } 3589 3590 if (!physcontig) { 3591 SEGVN_VMSTAT_FLTVNPAGES(14); 3592 ppa[0] = NULL; 3593 ierr = VOP_GETPAGE(vp, (offset_t)off, pgsz, 3594 &vpprot, ppa, pgsz, seg, a, arw, 3595 svd->cred); 3596 if (segtype == MAP_PRIVATE) { 3597 SEGVN_VMSTAT_FLTVNPAGES(15); 3598 vpprot &= ~PROT_WRITE; 3599 } 3600 } else { 3601 ASSERT(segtype == MAP_PRIVATE); 3602 SEGVN_VMSTAT_FLTVNPAGES(16); 3603 vpprot = PROT_ALL & ~PROT_WRITE; 3604 ierr = 0; 3605 } 3606 3607 if (ierr != 0) { 3608 SEGVN_VMSTAT_FLTVNPAGES(17); 3609 if (pplist != NULL) { 3610 SEGVN_VMSTAT_FLTVNPAGES(18); 3611 page_free_replacement_page(pplist); 3612 page_create_putback(pages); 3613 } 3614 SEGVN_RESTORE_SOFTLOCK(type, pages); 3615 if (a + pgsz <= eaddr) { 3616 SEGVN_VMSTAT_FLTVNPAGES(19); 3617 err = FC_MAKE_ERR(ierr); 3618 goto out; 3619 } 3620 va.va_mask = AT_SIZE; 3621 if (VOP_GETATTR(vp, &va, 0, svd->cred) != 0) { 3622 SEGVN_VMSTAT_FLTVNPAGES(20); 3623 err = FC_MAKE_ERR(EIO); 3624 goto out; 3625 } 3626 if (btopr(va.va_size) >= btopr(off + pgsz)) { 3627 SEGVN_VMSTAT_FLTVNPAGES(21); 3628 err = FC_MAKE_ERR(ierr); 3629 goto out; 3630 } 3631 if (btopr(va.va_size) < 3632 btopr(off + (eaddr - a))) { 3633 SEGVN_VMSTAT_FLTVNPAGES(22); 3634 err = FC_MAKE_ERR(ierr); 3635 goto out; 3636 } 3637 if (brkcow || type == F_SOFTLOCK) { 3638 /* can't reduce map area */ 3639 SEGVN_VMSTAT_FLTVNPAGES(23); 3640 vop_size_err = 1; 3641 goto out; 3642 } 3643 SEGVN_VMSTAT_FLTVNPAGES(24); 3644 ASSERT(szc != 0); 3645 pszc = 0; 3646 ierr = -1; 3647 break; 3648 } 3649 3650 if (amp != NULL) { 3651 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 3652 anon_array_enter(amp, aindx, &an_cookie); 3653 } 3654 if (amp != NULL && 3655 anon_get_ptr(amp->ahp, aindx) != NULL) { 3656 ulong_t taindx = P2ALIGN(aindx, maxpages); 3657 3658 SEGVN_VMSTAT_FLTVNPAGES(25); 3659 if (anon_pages(amp->ahp, taindx, maxpages) != 3660 maxpages) { 3661 panic("segvn_fault_vnodepages:" 3662 " empty anon slots\n"); 3663 } 3664 for (i = 0; i < pages; i++) { 3665 page_unlock(ppa[i]); 3666 } 3667 anon_array_exit(&an_cookie); 3668 ANON_LOCK_EXIT(&->a_rwlock); 3669 if (pplist != NULL) { 3670 page_free_replacement_page(pplist); 3671 page_create_putback(pages); 3672 } 3673 SEGVN_RESTORE_SOFTLOCK(type, pages); 3674 if (szc < seg->s_szc) { 3675 SEGVN_VMSTAT_FLTVNPAGES(26); 3676 /* 3677 * For private segments SOFTLOCK 3678 * either always breaks cow (any rw 3679 * type except S_READ_NOCOW) or 3680 * address space is locked as writer 3681 * (S_READ_NOCOW case) and anon slots 3682 * can't show up on second check. 
3683 * Therefore if we are here for
3684 * SOFTLOCK case it must be a cow
3685 * break but cow break never reduces
3686 * szc. Thus the assert below.
3687 */
3688 ASSERT(!brkcow && type != F_SOFTLOCK);
3689 pszc = seg->s_szc;
3690 ierr = -2;
3691 break;
3692 }
3693 ASSERT(IS_P2ALIGNED(a, maxpgsz));
3694 goto again;
3695 }
3696 #ifdef DEBUG
3697 if (amp != NULL) {
3698 ulong_t taindx = P2ALIGN(aindx, maxpages);
3699 ASSERT(!anon_pages(amp->ahp, taindx, maxpages));
3700 }
3701 #endif /* DEBUG */
3702
3703 if (brkcow) {
3704 ASSERT(amp != NULL);
3705 ASSERT(pplist == NULL);
3706 ASSERT(szc == seg->s_szc);
3707 ASSERT(IS_P2ALIGNED(a, maxpgsz));
3708 ASSERT(IS_P2ALIGNED(aindx, maxpages));
3709 SEGVN_VMSTAT_FLTVNPAGES(27);
3710 ierr = anon_map_privatepages(amp, aindx, szc,
3711 seg, a, prot, ppa, vpage, segvn_anypgsz,
3712 svd->cred);
3713 if (ierr != 0) {
3714 SEGVN_VMSTAT_FLTVNPAGES(28);
3715 anon_array_exit(&an_cookie);
3716 ANON_LOCK_EXIT(&amp->a_rwlock);
3717 SEGVN_RESTORE_SOFTLOCK(type, pages);
3718 err = FC_MAKE_ERR(ierr);
3719 goto out;
3720 }
3721
3722 ASSERT(!IS_VMODSORT(ppa[0]->p_vnode));
3723 /*
3724 * p_szc can't be changed for locked
3725 * swapfs pages.
3726 */
3727 hat_memload_array(hat, a, pgsz, ppa, prot,
3728 hat_flag);
3729
3730 if (!(hat_flag & HAT_LOAD_LOCK)) {
3731 SEGVN_VMSTAT_FLTVNPAGES(29);
3732 for (i = 0; i < pages; i++) {
3733 page_unlock(ppa[i]);
3734 }
3735 }
3736 anon_array_exit(&an_cookie);
3737 ANON_LOCK_EXIT(&amp->a_rwlock);
3738 goto next;
3739 }
3740
3741 pfn = page_pptonum(ppa[0]);
3742 /*
3743 * hat_page_demote() needs an EXCL lock on one of the
3744 * constituent page_t's and it decreases root's p_szc
3745 * last. This means that if root's p_szc is equal to szc and
3746 * all its constituent pages are locked, any
3747 * hat_page_demote() that could have changed p_szc to
3748 * szc is already done and no new hat_page_demote()
3749 * can start for this large page.
3750 */
3751
3752 /*
3753 * we need to make sure the same mapping size is used for
3754 * the same address range if there's a possibility the
3755 * address is already mapped because the hat layer panics
3756 * when a translation is loaded for a range already
3757 * mapped with a different page size. We achieve it
3758 * by always using the largest page size possible subject
3759 * to the constraints of page size, segment page size
3760 * and page alignment. Since mappings are invalidated
3761 * when those constraints change and make it
3762 * impossible to use a previously used mapping size, no
3763 * mapping size conflicts should happen.
3764 */
3765
3766 chkszc:
3767 if ((pszc = ppa[0]->p_szc) == szc &&
3768 IS_P2ALIGNED(pfn, pages)) {
3769
3770 SEGVN_VMSTAT_FLTVNPAGES(30);
3771 #ifdef DEBUG
3772 for (i = 0; i < pages; i++) {
3773 ASSERT(PAGE_LOCKED(ppa[i]));
3774 ASSERT(!PP_ISFREE(ppa[i]));
3775 ASSERT(page_pptonum(ppa[i]) ==
3776 pfn + i);
3777 ASSERT(ppa[i]->p_szc == szc);
3778 ASSERT(ppa[i]->p_vnode == vp);
3779 ASSERT(ppa[i]->p_offset ==
3780 off + (i << PAGESHIFT));
3781 }
3782 #endif /* DEBUG */
3783 /*
3784 * All pages are of the szc we need and they are
3785 * all locked so they can't change szc. Load
3786 * translations.
3787 *
3788 * if page got promoted since last check
3789 * we don't need pplist.
3790 */
3791 if (pplist != NULL) {
3792 page_free_replacement_page(pplist);
3793 page_create_putback(pages);
3794 }
3795 if (PP_ISMIGRATE(ppa[0])) {
3796 page_migrate(seg, a, ppa, pages);
3797 }
3798 SEGVN_UPDATE_MODBITS(ppa, pages, rw,
3799 prot, vpprot);
3800 if (!xhat) {
3801 hat_memload_array(hat, a, pgsz, ppa,
3802 prot & vpprot, hat_flag);
3803 } else {
3804 /*
3805 * avoid large xhat mappings to FS
3806 * pages so that hat_page_demote()
3807 * doesn't need to check for xhat
3808 * large mappings.
3809 */
3810 for (i = 0; i < pages; i++) {
3811 hat_memload(hat,
3812 a + (i << PAGESHIFT),
3813 ppa[i], prot & vpprot,
3814 hat_flag);
3815 }
3816 }
3817
3818 if (!(hat_flag & HAT_LOAD_LOCK)) {
3819 for (i = 0; i < pages; i++) {
3820 page_unlock(ppa[i]);
3821 }
3822 }
3823 if (amp != NULL) {
3824 anon_array_exit(&an_cookie);
3825 ANON_LOCK_EXIT(&amp->a_rwlock);
3826 }
3827 goto next;
3828 }
3829
3830 /*
3831 * See if upsize is possible.
3832 */
3833 if (pszc > szc && szc < seg->s_szc &&
3834 (segvn_anypgsz_vnode || pszc >= seg->s_szc)) {
3835 pgcnt_t aphase;
3836 uint_t pszc1 = MIN(pszc, seg->s_szc);
3837 ppgsz = page_get_pagesize(pszc1);
3838 ppages = btop(ppgsz);
3839 aphase = btop(P2PHASE((uintptr_t)a, ppgsz));
3840
3841 ASSERT(type != F_SOFTLOCK);
3842
3843 SEGVN_VMSTAT_FLTVNPAGES(31);
3844 if (aphase != P2PHASE(pfn, ppages)) {
3845 segvn_faultvnmpss_align_err4++;
3846 } else {
3847 SEGVN_VMSTAT_FLTVNPAGES(32);
3848 if (pplist != NULL) {
3849 page_t *pl = pplist;
3850 page_free_replacement_page(pl);
3851 page_create_putback(pages);
3852 }
3853 for (i = 0; i < pages; i++) {
3854 page_unlock(ppa[i]);
3855 }
3856 if (amp != NULL) {
3857 anon_array_exit(&an_cookie);
3858 ANON_LOCK_EXIT(&amp->a_rwlock);
3859 }
3860 pszc = pszc1;
3861 ierr = -2;
3862 break;
3863 }
3864 }
3865
3866 /*
3867 * check if we should use smallest mapping size.
3868 */
3869 upgrdfail = 0;
3870 if (szc == 0 || xhat ||
3871 (pszc >= szc &&
3872 !IS_P2ALIGNED(pfn, pages)) ||
3873 (pszc < szc &&
3874 !segvn_full_szcpages(ppa, szc, &upgrdfail,
3875 &pszc))) {
3876
3877 if (upgrdfail && type != F_SOFTLOCK) {
3878 /*
3879 * segvn_full_szcpages failed to lock
3880 * all pages EXCL. Size down.
3881 */
3882 ASSERT(pszc < szc);
3883
3884 SEGVN_VMSTAT_FLTVNPAGES(33);
3885
3886 if (pplist != NULL) {
3887 page_t *pl = pplist;
3888 page_free_replacement_page(pl);
3889 page_create_putback(pages);
3890 }
3891
3892 for (i = 0; i < pages; i++) {
3893 page_unlock(ppa[i]);
3894 }
3895 if (amp != NULL) {
3896 anon_array_exit(&an_cookie);
3897 ANON_LOCK_EXIT(&amp->a_rwlock);
3898 }
3899 ierr = -1;
3900 break;
3901 }
3902 if (szc != 0 && !xhat) {
3903 segvn_faultvnmpss_align_err5++;
3904 }
3905 SEGVN_VMSTAT_FLTVNPAGES(34);
3906 if (pplist != NULL) {
3907 page_free_replacement_page(pplist);
3908 page_create_putback(pages);
3909 }
3910 SEGVN_UPDATE_MODBITS(ppa, pages, rw,
3911 prot, vpprot);
3912 if (upgrdfail && segvn_anypgsz_vnode) {
3913 /* SOFTLOCK case */
3914 hat_memload_array(hat, a, pgsz,
3915 ppa, prot & vpprot, hat_flag);
3916 } else {
3917 for (i = 0; i < pages; i++) {
3918 hat_memload(hat,
3919 a + (i << PAGESHIFT),
3920 ppa[i], prot & vpprot,
3921 hat_flag);
3922 }
3923 }
3924 if (!(hat_flag & HAT_LOAD_LOCK)) {
3925 for (i = 0; i < pages; i++) {
3926 page_unlock(ppa[i]);
3927 }
3928 }
3929 if (amp != NULL) {
3930 anon_array_exit(&an_cookie);
3931 ANON_LOCK_EXIT(&amp->a_rwlock);
3932 }
3933 goto next;
3934 }
3935
3936 if (pszc == szc) {
3937 /*
3938 * segvn_full_szcpages() upgraded pages szc.
3939 */
3940 ASSERT(pszc == ppa[0]->p_szc);
3941 ASSERT(IS_P2ALIGNED(pfn, pages));
3942 goto chkszc;
3943 }
3944
3945 if (pszc > szc) {
3946 kmutex_t *szcmtx;
3947 SEGVN_VMSTAT_FLTVNPAGES(35);
3948 /*
3949 * p_szc of ppa[0] can change since we haven't
3950 * locked all constituent pages. Call
3951 * page_lock_szc() to prevent szc changes.
3952 * This should be a rare case that happens when
3953 * multiple segments use a different page size
3954 * to map the same file offsets.
3955 */
3956 szcmtx = page_szc_lock(ppa[0]);
3957 pszc = ppa[0]->p_szc;
3958 ASSERT(szcmtx != NULL || pszc == 0);
3959 ASSERT(ppa[0]->p_szc <= pszc);
3960 if (pszc <= szc) {
3961 SEGVN_VMSTAT_FLTVNPAGES(36);
3962 if (szcmtx != NULL) {
3963 mutex_exit(szcmtx);
3964 }
3965 goto chkszc;
3966 }
3967 if (pplist != NULL) {
3968 /*
3969 * page got promoted since last check.
3970 * we don't need the preallocated large
3971 * page.
3972 */
3973 SEGVN_VMSTAT_FLTVNPAGES(37);
3974 page_free_replacement_page(pplist);
3975 page_create_putback(pages);
3976 }
3977 SEGVN_UPDATE_MODBITS(ppa, pages, rw,
3978 prot, vpprot);
3979 hat_memload_array(hat, a, pgsz, ppa,
3980 prot & vpprot, hat_flag);
3981 mutex_exit(szcmtx);
3982 if (!(hat_flag & HAT_LOAD_LOCK)) {
3983 for (i = 0; i < pages; i++) {
3984 page_unlock(ppa[i]);
3985 }
3986 }
3987 if (amp != NULL) {
3988 anon_array_exit(&an_cookie);
3989 ANON_LOCK_EXIT(&amp->a_rwlock);
3990 }
3991 goto next;
3992 }
3993
3994 /*
3995 * if page got demoted since last check
3996 * we might not have allocated a larger page.
3997 * allocate one now.
3998 */
3999 if (pplist == NULL &&
4000 page_alloc_pages(vp, seg, a, &pplist, NULL,
4001 szc, 0) && type != F_SOFTLOCK) {
4002 SEGVN_VMSTAT_FLTVNPAGES(38);
4003 for (i = 0; i < pages; i++) {
4004 page_unlock(ppa[i]);
4005 }
4006 if (amp != NULL) {
4007 anon_array_exit(&an_cookie);
4008 ANON_LOCK_EXIT(&amp->a_rwlock);
4009 }
4010 ierr = -1;
4011 alloc_failed |= (1 << szc);
4012 break;
4013 }
4014
4015 SEGVN_VMSTAT_FLTVNPAGES(39);
4016
4017 if (pplist != NULL) {
4018 segvn_relocate_pages(ppa, pplist);
4019 #ifdef DEBUG
4020 } else {
4021 ASSERT(type == F_SOFTLOCK);
4022 SEGVN_VMSTAT_FLTVNPAGES(40);
4023 #endif /* DEBUG */
4024 }
4025
4026 SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot);
4027
4028 if (pplist == NULL && segvn_anypgsz_vnode == 0) {
4029 ASSERT(type == F_SOFTLOCK);
4030 for (i = 0; i < pages; i++) {
4031 ASSERT(ppa[i]->p_szc < szc);
4032 hat_memload(hat, a + (i << PAGESHIFT),
4033 ppa[i], prot & vpprot, hat_flag);
4034 }
4035 } else {
4036 ASSERT(pplist != NULL || type == F_SOFTLOCK);
4037 hat_memload_array(hat, a, pgsz, ppa,
4038 prot & vpprot, hat_flag);
4039 }
4040 if (!(hat_flag & HAT_LOAD_LOCK)) {
4041 for (i = 0; i < pages; i++) {
4042 ASSERT(PAGE_SHARED(ppa[i]));
4043 page_unlock(ppa[i]);
4044 }
4045 }
4046 if (amp != NULL) {
4047 anon_array_exit(&an_cookie);
4048 ANON_LOCK_EXIT(&amp->a_rwlock);
4049 }
4050
4051 next:
4052 if (vpage != NULL) {
4053 vpage += pages;
4054 }
4055 adjszc_chk = 1;
4056 }
4057 if (a == lpgeaddr)
4058 break;
4059 ASSERT(a < lpgeaddr);
4060
4061 ASSERT(!brkcow && type != F_SOFTLOCK);
4062
4063 /*
4064 * ierr == -1 means we failed to map with a large page
4065 * (either due to allocation/relocation failures or
4066 * misalignment with other mappings to this file).
4067 *
4068 * ierr == -2 means some other thread allocated a large page
4069 * after we gave up trying to map with a large page. Retry
4070 * with a larger mapping.
4071 */ 4072 ASSERT(ierr == -1 || ierr == -2); 4073 ASSERT(ierr == -2 || szc != 0); 4074 ASSERT(ierr == -1 || szc < seg->s_szc); 4075 if (ierr == -2) { 4076 SEGVN_VMSTAT_FLTVNPAGES(41); 4077 ASSERT(pszc > szc && pszc <= seg->s_szc); 4078 szc = pszc; 4079 } else if (segvn_anypgsz_vnode) { 4080 SEGVN_VMSTAT_FLTVNPAGES(42); 4081 szc--; 4082 } else { 4083 SEGVN_VMSTAT_FLTVNPAGES(43); 4084 ASSERT(pszc < szc); 4085 /* 4086 * other process created pszc large page. 4087 * but we still have to drop to 0 szc. 4088 */ 4089 szc = 0; 4090 } 4091 4092 pgsz = page_get_pagesize(szc); 4093 pages = btop(pgsz); 4094 if (ierr == -2) { 4095 /* 4096 * Size up case. Note lpgaddr may only be needed for 4097 * softlock case so we don't adjust it here. 4098 */ 4099 a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz); 4100 ASSERT(a >= lpgaddr); 4101 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4102 off = svd->offset + (uintptr_t)(a - seg->s_base); 4103 aindx = svd->anon_index + seg_page(seg, a); 4104 vpage = (svd->vpage != NULL) ? 4105 &svd->vpage[seg_page(seg, a)] : NULL; 4106 } else { 4107 /* 4108 * Size down case. Note lpgaddr may only be needed for 4109 * softlock case so we don't adjust it here. 4110 */ 4111 ASSERT(IS_P2ALIGNED(a, pgsz)); 4112 ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz)); 4113 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4114 ASSERT(a < lpgeaddr); 4115 if (a < addr) { 4116 SEGVN_VMSTAT_FLTVNPAGES(44); 4117 /* 4118 * The beginning of the large page region can 4119 * be pulled to the right to make a smaller 4120 * region. We haven't yet faulted a single 4121 * page. 4122 */ 4123 a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 4124 ASSERT(a >= lpgaddr); 4125 off = svd->offset + 4126 (uintptr_t)(a - seg->s_base); 4127 aindx = svd->anon_index + seg_page(seg, a); 4128 vpage = (svd->vpage != NULL) ? 4129 &svd->vpage[seg_page(seg, a)] : NULL; 4130 } 4131 } 4132 } 4133 out: 4134 kmem_free(ppa, ppasize); 4135 if (!err && !vop_size_err) { 4136 SEGVN_VMSTAT_FLTVNPAGES(45); 4137 return (0); 4138 } 4139 if (type == F_SOFTLOCK && a > lpgaddr) { 4140 SEGVN_VMSTAT_FLTVNPAGES(46); 4141 segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER); 4142 } 4143 if (!vop_size_err) { 4144 SEGVN_VMSTAT_FLTVNPAGES(47); 4145 return (err); 4146 } 4147 ASSERT(brkcow || type == F_SOFTLOCK); 4148 /* 4149 * Large page end is mapped beyond the end of file and it's a cow 4150 * fault or softlock so we can't reduce the map area. For now just 4151 * demote the segment. This should really only happen if the end of 4152 * the file changed after the mapping was established since when large 4153 * page segments are created we make sure they don't extend beyond the 4154 * end of the file. 4155 */ 4156 SEGVN_VMSTAT_FLTVNPAGES(48); 4157 4158 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4159 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4160 err = 0; 4161 if (seg->s_szc != 0) { 4162 segvn_fltvnpages_clrszc_cnt++; 4163 ASSERT(svd->softlockcnt == 0); 4164 err = segvn_clrszc(seg); 4165 if (err != 0) { 4166 segvn_fltvnpages_clrszc_err++; 4167 } 4168 } 4169 ASSERT(err || seg->s_szc == 0); 4170 SEGVN_LOCK_DOWNGRADE(seg->s_as, &svd->lock); 4171 /* segvn_fault will do its job as if szc had been zero to begin with */ 4172 return (err == 0 ? IE_RETRY : FC_MAKE_ERR(err)); 4173 } 4174 4175 /* 4176 * This routine will attempt to fault in one large page. 4177 * it will use smaller pages if that fails. 4178 * It should only be called for pure anonymous segments. 
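 *
 * A simplified sketch of the control flow below (illustrative only;
 * the real code also handles SOFTLOCK accounting, per-page protection
 * checks, lgroup migration and cache purging):
 *
 *	for (;;) {
 *		for (a = lpgaddr; a < lpgeaddr; a += pgsz) {
 *			ierr = anon_map_getpages(...);
 *			if (ierr > 0)
 *				return (FC_MAKE_ERR(ierr));
 *			if (ierr < 0)
 *				break;		(resize and retry)
 *			hat_memload_array(hat, a, pgsz, ppa, ...);
 *		}
 *		if (a == lpgeaddr)
 *			return (0);
 *		pick a new szc/pgsz from ierr (-1 sizes down, -2 sizes up)
 *	}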
4179 */ 4180 static faultcode_t 4181 segvn_fault_anonpages(struct hat *hat, struct seg *seg, caddr_t lpgaddr, 4182 caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr, 4183 caddr_t eaddr, int brkcow) 4184 { 4185 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 4186 struct anon_map *amp = svd->amp; 4187 uchar_t segtype = svd->type; 4188 uint_t szc = seg->s_szc; 4189 size_t pgsz = page_get_pagesize(szc); 4190 size_t maxpgsz = pgsz; 4191 pgcnt_t pages = btop(pgsz); 4192 size_t ppasize = pages * sizeof (page_t *); 4193 caddr_t a = lpgaddr; 4194 ulong_t aindx = svd->anon_index + seg_page(seg, a); 4195 struct vpage *vpage = (svd->vpage != NULL) ? 4196 &svd->vpage[seg_page(seg, a)] : NULL; 4197 page_t **ppa; 4198 uint_t ppa_szc; 4199 faultcode_t err; 4200 int ierr; 4201 uint_t protchk, prot, vpprot; 4202 ulong_t i; 4203 int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD; 4204 anon_sync_obj_t cookie; 4205 int first = 1; 4206 int adjszc_chk; 4207 int purged = 0; 4208 4209 ASSERT(szc != 0); 4210 ASSERT(amp != NULL); 4211 ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */ 4212 ASSERT(!(svd->flags & MAP_NORESERVE)); 4213 ASSERT(type != F_SOFTUNLOCK); 4214 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 4215 4216 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 4217 4218 VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltanpages[0]); 4219 VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltanpages[1]); 4220 4221 if (svd->flags & MAP_TEXT) { 4222 hat_flag |= HAT_LOAD_TEXT; 4223 } 4224 4225 if (svd->pageprot) { 4226 switch (rw) { 4227 case S_READ: 4228 protchk = PROT_READ; 4229 break; 4230 case S_WRITE: 4231 protchk = PROT_WRITE; 4232 break; 4233 case S_EXEC: 4234 protchk = PROT_EXEC; 4235 break; 4236 case S_OTHER: 4237 default: 4238 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 4239 break; 4240 } 4241 VM_STAT_ADD(segvnvmstats.fltanpages[2]); 4242 } else { 4243 prot = svd->prot; 4244 /* caller has already done segment level protection check. 
*/ 4245 } 4246 4247 ppa = kmem_alloc(ppasize, KM_SLEEP); 4248 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4249 for (;;) { 4250 adjszc_chk = 0; 4251 for (; a < lpgeaddr; a += pgsz, aindx += pages) { 4252 if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) { 4253 VM_STAT_ADD(segvnvmstats.fltanpages[3]); 4254 ASSERT(vpage != NULL); 4255 prot = VPP_PROT(vpage); 4256 ASSERT(sameprot(seg, a, maxpgsz)); 4257 if ((prot & protchk) == 0) { 4258 err = FC_PROT; 4259 goto error; 4260 } 4261 } 4262 if (adjszc_chk && IS_P2ALIGNED(a, maxpgsz) && 4263 pgsz < maxpgsz) { 4264 ASSERT(a > lpgaddr); 4265 szc = seg->s_szc; 4266 pgsz = maxpgsz; 4267 pages = btop(pgsz); 4268 ASSERT(IS_P2ALIGNED(aindx, pages)); 4269 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, 4270 pgsz); 4271 } 4272 if (type == F_SOFTLOCK && svd->vp != NULL) { 4273 mutex_enter(&freemem_lock); 4274 if (availrmem < tune.t_minarmem + pages) { 4275 mutex_exit(&freemem_lock); 4276 err = FC_MAKE_ERR(ENOMEM); 4277 goto error; 4278 } else { 4279 availrmem -= pages; 4280 segvn_pages_locked += pages; 4281 svd->softlockcnt += pages; 4282 } 4283 mutex_exit(&freemem_lock); 4284 } 4285 anon_array_enter(amp, aindx, &cookie); 4286 ppa_szc = (uint_t)-1; 4287 ierr = anon_map_getpages(amp, aindx, szc, seg, a, 4288 prot, &vpprot, ppa, &ppa_szc, vpage, rw, brkcow, 4289 segvn_anypgsz, svd->cred); 4290 if (ierr != 0) { 4291 anon_array_exit(&cookie); 4292 VM_STAT_ADD(segvnvmstats.fltanpages[4]); 4293 if (type == F_SOFTLOCK && svd->vp != NULL) { 4294 VM_STAT_ADD(segvnvmstats.fltanpages[5]); 4295 mutex_enter(&freemem_lock); 4296 availrmem += pages; 4297 segvn_pages_locked -= pages; 4298 svd->softlockcnt -= pages; 4299 mutex_exit(&freemem_lock); 4300 } 4301 if (ierr > 0) { 4302 VM_STAT_ADD(segvnvmstats.fltanpages[6]); 4303 err = FC_MAKE_ERR(ierr); 4304 goto error; 4305 } 4306 break; 4307 } 4308 4309 ASSERT(!IS_VMODSORT(ppa[0]->p_vnode)); 4310 4311 ASSERT(segtype == MAP_SHARED || 4312 ppa[0]->p_szc <= szc); 4313 ASSERT(segtype == MAP_PRIVATE || 4314 ppa[0]->p_szc >= szc); 4315 4316 if (type == F_SOFTLOCK && svd->vp == NULL) { 4317 /* 4318 * All pages in ppa array belong to the same 4319 * large page. This means it's ok to call 4320 * segvn_pp_lock_anonpages just for ppa[0]. 4321 */ 4322 if (!segvn_pp_lock_anonpages(ppa[0], first)) { 4323 for (i = 0; i < pages; i++) { 4324 page_unlock(ppa[i]); 4325 } 4326 err = FC_MAKE_ERR(ENOMEM); 4327 goto error; 4328 } 4329 first = 0; 4330 mutex_enter(&freemem_lock); 4331 svd->softlockcnt += pages; 4332 segvn_pages_locked += pages; 4333 mutex_exit(&freemem_lock); 4334 } 4335 4336 /* 4337 * Handle pages that have been marked for migration 4338 */ 4339 if (lgrp_optimizations()) 4340 page_migrate(seg, a, ppa, pages); 4341 4342 if (segtype == MAP_SHARED) { 4343 vpprot |= PROT_WRITE; 4344 } 4345 4346 hat_memload_array(hat, a, pgsz, ppa, 4347 prot & vpprot, hat_flag); 4348 4349 if (hat_flag & HAT_LOAD_LOCK) { 4350 VM_STAT_ADD(segvnvmstats.fltanpages[7]); 4351 } else { 4352 VM_STAT_ADD(segvnvmstats.fltanpages[8]); 4353 for (i = 0; i < pages; i++) 4354 page_unlock(ppa[i]); 4355 } 4356 if (vpage != NULL) 4357 vpage += pages; 4358 4359 anon_array_exit(&cookie); 4360 adjszc_chk = 1; 4361 } 4362 if (a == lpgeaddr) 4363 break; 4364 ASSERT(a < lpgeaddr); 4365 /* 4366 * ierr == -1 means we failed to allocate a large page. 4367 * so do a size down operation. 4368 * 4369 * ierr == -2 means some other process that privately shares 4370 * pages with this process has allocated a larger page and we 4371 * need to retry with larger pages. 
So do a size up 4372 * operation. This relies on the fact that large pages are 4373 * never partially shared i.e. if we share any constituent 4374 * page of a large page with another process we must share the 4375 * entire large page. Note this cannot happen for SOFTLOCK 4376 * case, unless current address (a) is at the beginning of the 4377 * next page size boundary because the other process couldn't 4378 * have relocated locked pages. 4379 */ 4380 ASSERT(ierr == -1 || ierr == -2); 4381 /* 4382 * For the very first relocation failure try to purge this 4383 * segment's cache so that the relocator can obtain an 4384 * exclusive lock on pages we want to relocate. 4385 */ 4386 if (!purged && ierr == -1 && ppa_szc != (uint_t)-1 && 4387 svd->softlockcnt != 0) { 4388 purged = 1; 4389 segvn_purge(seg); 4390 continue; 4391 } 4392 4393 if (segvn_anypgsz) { 4394 ASSERT(ierr == -2 || szc != 0); 4395 ASSERT(ierr == -1 || szc < seg->s_szc); 4396 szc = (ierr == -1) ? szc - 1 : szc + 1; 4397 } else { 4398 /* 4399 * For non COW faults and segvn_anypgsz == 0 4400 * we need to be careful not to loop forever 4401 * if existing page is found with szc other 4402 * than 0 or seg->s_szc. This could be due 4403 * to page relocations on behalf of DR or 4404 * more likely large page creation. For this 4405 * case simply re-size to existing page's szc 4406 * if returned by anon_map_getpages(). 4407 */ 4408 if (ppa_szc == (uint_t)-1) { 4409 szc = (ierr == -1) ? 0 : seg->s_szc; 4410 } else { 4411 ASSERT(ppa_szc <= seg->s_szc); 4412 ASSERT(ierr == -2 || ppa_szc < szc); 4413 ASSERT(ierr == -1 || ppa_szc > szc); 4414 szc = ppa_szc; 4415 } 4416 } 4417 4418 pgsz = page_get_pagesize(szc); 4419 pages = btop(pgsz); 4420 ASSERT(type != F_SOFTLOCK || ierr == -1 || 4421 (IS_P2ALIGNED(a, pgsz) && IS_P2ALIGNED(lpgeaddr, pgsz))); 4422 if (type == F_SOFTLOCK) { 4423 /* 4424 * For softlocks we cannot reduce the fault area 4425 * (calculated based on the largest page size for this 4426 * segment) for size down and a is already next 4427 * page size aligned as assertted above for size 4428 * ups. Therefore just continue in case of softlock. 4429 */ 4430 VM_STAT_ADD(segvnvmstats.fltanpages[9]); 4431 continue; /* keep lint happy */ 4432 } else if (ierr == -2) { 4433 4434 /* 4435 * Size up case. Note lpgaddr may only be needed for 4436 * softlock case so we don't adjust it here. 4437 */ 4438 VM_STAT_ADD(segvnvmstats.fltanpages[10]); 4439 a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz); 4440 ASSERT(a >= lpgaddr); 4441 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4442 aindx = svd->anon_index + seg_page(seg, a); 4443 vpage = (svd->vpage != NULL) ? 4444 &svd->vpage[seg_page(seg, a)] : NULL; 4445 } else { 4446 /* 4447 * Size down case. Note lpgaddr may only be needed for 4448 * softlock case so we don't adjust it here. 4449 */ 4450 VM_STAT_ADD(segvnvmstats.fltanpages[11]); 4451 ASSERT(IS_P2ALIGNED(a, pgsz)); 4452 ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz)); 4453 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4454 ASSERT(a < lpgeaddr); 4455 if (a < addr) { 4456 /* 4457 * The beginning of the large page region can 4458 * be pulled to the right to make a smaller 4459 * region. We haven't yet faulted a single 4460 * page. 4461 */ 4462 VM_STAT_ADD(segvnvmstats.fltanpages[12]); 4463 a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 4464 ASSERT(a >= lpgaddr); 4465 aindx = svd->anon_index + seg_page(seg, a); 4466 vpage = (svd->vpage != NULL) ? 
4467 &svd->vpage[seg_page(seg, a)] : NULL; 4468 } 4469 } 4470 } 4471 VM_STAT_ADD(segvnvmstats.fltanpages[13]); 4472 ANON_LOCK_EXIT(&->a_rwlock); 4473 kmem_free(ppa, ppasize); 4474 return (0); 4475 error: 4476 VM_STAT_ADD(segvnvmstats.fltanpages[14]); 4477 ANON_LOCK_EXIT(&->a_rwlock); 4478 kmem_free(ppa, ppasize); 4479 if (type == F_SOFTLOCK && a > lpgaddr) { 4480 VM_STAT_ADD(segvnvmstats.fltanpages[15]); 4481 segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER); 4482 } 4483 return (err); 4484 } 4485 4486 int fltadvice = 1; /* set to free behind pages for sequential access */ 4487 4488 /* 4489 * This routine is called via a machine specific fault handling routine. 4490 * It is also called by software routines wishing to lock or unlock 4491 * a range of addresses. 4492 * 4493 * Here is the basic algorithm: 4494 * If unlocking 4495 * Call segvn_softunlock 4496 * Return 4497 * endif 4498 * Checking and set up work 4499 * If we will need some non-anonymous pages 4500 * Call VOP_GETPAGE over the range of non-anonymous pages 4501 * endif 4502 * Loop over all addresses requested 4503 * Call segvn_faultpage passing in page list 4504 * to load up translations and handle anonymous pages 4505 * endloop 4506 * Load up translation to any additional pages in page list not 4507 * already handled that fit into this segment 4508 */ 4509 static faultcode_t 4510 segvn_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len, 4511 enum fault_type type, enum seg_rw rw) 4512 { 4513 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 4514 page_t **plp, **ppp, *pp; 4515 u_offset_t off; 4516 caddr_t a; 4517 struct vpage *vpage; 4518 uint_t vpprot, prot; 4519 int err; 4520 page_t *pl[PVN_GETPAGE_NUM + 1]; 4521 size_t plsz, pl_alloc_sz; 4522 size_t page; 4523 ulong_t anon_index; 4524 struct anon_map *amp; 4525 int dogetpage = 0; 4526 caddr_t lpgaddr, lpgeaddr; 4527 size_t pgsz; 4528 anon_sync_obj_t cookie; 4529 int brkcow = BREAK_COW_SHARE(rw, type, svd->type); 4530 4531 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 4532 4533 /* 4534 * First handle the easy stuff 4535 */ 4536 if (type == F_SOFTUNLOCK) { 4537 if (rw == S_READ_NOCOW) { 4538 rw = S_READ; 4539 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 4540 } 4541 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 4542 pgsz = (seg->s_szc == 0) ? PAGESIZE : 4543 page_get_pagesize(seg->s_szc); 4544 VM_STAT_COND_ADD(pgsz > PAGESIZE, segvnvmstats.fltanpages[16]); 4545 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 4546 segvn_softunlock(seg, lpgaddr, lpgeaddr - lpgaddr, rw); 4547 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4548 return (0); 4549 } 4550 4551 top: 4552 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 4553 4554 /* 4555 * If we have the same protections for the entire segment, 4556 * insure that the access being attempted is legitimate. 
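 *
 * For example, a write fault (rw == S_WRITE) against a segment whose
 * svd->prot is PROT_READ only fails this check and returns FC_PROT
 * right away, without looking at individual pages.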
4557 */ 4558 4559 if (svd->pageprot == 0) { 4560 uint_t protchk; 4561 4562 switch (rw) { 4563 case S_READ: 4564 case S_READ_NOCOW: 4565 protchk = PROT_READ; 4566 break; 4567 case S_WRITE: 4568 protchk = PROT_WRITE; 4569 break; 4570 case S_EXEC: 4571 protchk = PROT_EXEC; 4572 break; 4573 case S_OTHER: 4574 default: 4575 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 4576 break; 4577 } 4578 4579 if ((svd->prot & protchk) == 0) { 4580 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4581 return (FC_PROT); /* illegal access type */ 4582 } 4583 } 4584 4585 /* 4586 * We can't allow the long term use of softlocks for vmpss segments, 4587 * because in some file truncation cases we should be able to demote 4588 * the segment, which requires that there are no softlocks. The 4589 * only case where it's ok to allow a SOFTLOCK fault against a vmpss 4590 * segment is S_READ_NOCOW, where the caller holds the address space 4591 * locked as writer and calls softunlock before dropping the as lock. 4592 * S_READ_NOCOW is used by /proc to read memory from another user. 4593 * 4594 * Another deadlock between SOFTLOCK and file truncation can happen 4595 * because segvn_fault_vnodepages() calls the FS one pagesize at 4596 * a time. A second VOP_GETPAGE() call by segvn_fault_vnodepages() 4597 * can cause a deadlock because the first set of page_t's remain 4598 * locked SE_SHARED. To avoid this, we demote segments on a first 4599 * SOFTLOCK if they have a length greater than the segment's 4600 * page size. 4601 * 4602 * So for now, we only avoid demoting a segment on a SOFTLOCK when 4603 * the access type is S_READ_NOCOW and the fault length is less than 4604 * or equal to the segment's page size. While this is quite restrictive, 4605 * it should be the most common case of SOFTLOCK against a vmpss 4606 * segment. 4607 * 4608 * For S_READ_NOCOW, it's safe not to do a copy on write because the 4609 * caller makes sure no COW will be caused by another thread for a 4610 * softlocked page. 4611 */ 4612 if (type == F_SOFTLOCK && svd->vp != NULL && seg->s_szc != 0) { 4613 int demote = 0; 4614 4615 if (rw != S_READ_NOCOW) { 4616 demote = 1; 4617 } 4618 if (!demote && len > PAGESIZE) { 4619 pgsz = page_get_pagesize(seg->s_szc); 4620 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, 4621 lpgeaddr); 4622 if (lpgeaddr - lpgaddr > pgsz) { 4623 demote = 1; 4624 } 4625 } 4626 4627 ASSERT(demote || AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 4628 4629 if (demote) { 4630 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4631 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4632 if (seg->s_szc != 0) { 4633 segvn_vmpss_clrszc_cnt++; 4634 ASSERT(svd->softlockcnt == 0); 4635 err = segvn_clrszc(seg); 4636 if (err) { 4637 segvn_vmpss_clrszc_err++; 4638 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4639 return (FC_MAKE_ERR(err)); 4640 } 4641 } 4642 ASSERT(seg->s_szc == 0); 4643 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4644 goto top; 4645 } 4646 } 4647 4648 /* 4649 * Check to see if we need to allocate an anon_map structure. 4650 */ 4651 if (svd->amp == NULL && (svd->vp == NULL || brkcow)) { 4652 /* 4653 * Drop the "read" lock on the segment and acquire 4654 * the "write" version since we have to allocate the 4655 * anon_map. 
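 *
 * Note that svd->amp is re-tested under the writer lock below; another
 * thread may have allocated the anon_map while this one was blocked,
 * in which case the existing map is simply reused. Either way we then
 * restart from "top" so the protection checks are redone against the
 * current segment state.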
4656 */ 4657 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4658 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4659 4660 if (svd->amp == NULL) { 4661 svd->amp = anonmap_alloc(seg->s_size, 0); 4662 svd->amp->a_szc = seg->s_szc; 4663 } 4664 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4665 4666 /* 4667 * Start all over again since segment protections 4668 * may have changed after we dropped the "read" lock. 4669 */ 4670 goto top; 4671 } 4672 4673 /* 4674 * S_READ_NOCOW vs S_READ distinction was 4675 * only needed for the code above. After 4676 * that we treat it as S_READ. 4677 */ 4678 if (rw == S_READ_NOCOW) { 4679 ASSERT(type == F_SOFTLOCK); 4680 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 4681 rw = S_READ; 4682 } 4683 4684 amp = svd->amp; 4685 4686 /* 4687 * MADV_SEQUENTIAL work is ignored for large page segments. 4688 */ 4689 if (seg->s_szc != 0) { 4690 pgsz = page_get_pagesize(seg->s_szc); 4691 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 4692 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 4693 if (svd->vp == NULL) { 4694 err = segvn_fault_anonpages(hat, seg, lpgaddr, 4695 lpgeaddr, type, rw, addr, addr + len, brkcow); 4696 } else { 4697 err = segvn_fault_vnodepages(hat, seg, lpgaddr, 4698 lpgeaddr, type, rw, addr, addr + len, brkcow); 4699 if (err == IE_RETRY) { 4700 ASSERT(seg->s_szc == 0); 4701 ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock)); 4702 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4703 goto top; 4704 } 4705 } 4706 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4707 return (err); 4708 } 4709 4710 page = seg_page(seg, addr); 4711 if (amp != NULL) { 4712 anon_index = svd->anon_index + page; 4713 4714 if ((type == F_PROT) && (rw == S_READ) && 4715 svd->type == MAP_PRIVATE && svd->pageprot == 0) { 4716 size_t index = anon_index; 4717 struct anon *ap; 4718 4719 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4720 /* 4721 * The fast path could apply to S_WRITE also, except 4722 * that the protection fault could be caused by lazy 4723 * tlb flush when ro->rw. In this case, the pte is 4724 * RW already. But RO in the other cpu's tlb causes 4725 * the fault. Since hat_chgprot won't do anything if 4726 * pte doesn't change, we may end up faulting 4727 * indefinitely until the RO tlb entry gets replaced. 4728 */ 4729 for (a = addr; a < addr + len; a += PAGESIZE, index++) { 4730 anon_array_enter(amp, index, &cookie); 4731 ap = anon_get_ptr(amp->ahp, index); 4732 anon_array_exit(&cookie); 4733 if ((ap == NULL) || (ap->an_refcnt != 1)) { 4734 ANON_LOCK_EXIT(&->a_rwlock); 4735 goto slow; 4736 } 4737 } 4738 hat_chgprot(seg->s_as->a_hat, addr, len, svd->prot); 4739 ANON_LOCK_EXIT(&->a_rwlock); 4740 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4741 return (0); 4742 } 4743 } 4744 slow: 4745 4746 if (svd->vpage == NULL) 4747 vpage = NULL; 4748 else 4749 vpage = &svd->vpage[page]; 4750 4751 off = svd->offset + (uintptr_t)(addr - seg->s_base); 4752 4753 /* 4754 * If MADV_SEQUENTIAL has been set for the particular page we 4755 * are faulting on, free behind all pages in the segment and put 4756 * them on the free list. 
4757 */ 4758 if ((page != 0) && fltadvice) { /* not if first page in segment */ 4759 struct vpage *vpp; 4760 ulong_t fanon_index; 4761 size_t fpage; 4762 u_offset_t pgoff, fpgoff; 4763 struct vnode *fvp; 4764 struct anon *fap = NULL; 4765 4766 if (svd->advice == MADV_SEQUENTIAL || 4767 (svd->pageadvice && 4768 VPP_ADVICE(vpage) == MADV_SEQUENTIAL)) { 4769 pgoff = off - PAGESIZE; 4770 fpage = page - 1; 4771 if (vpage != NULL) 4772 vpp = &svd->vpage[fpage]; 4773 if (amp != NULL) 4774 fanon_index = svd->anon_index + fpage; 4775 4776 while (pgoff > svd->offset) { 4777 if (svd->advice != MADV_SEQUENTIAL && 4778 (!svd->pageadvice || (vpage && 4779 VPP_ADVICE(vpp) != MADV_SEQUENTIAL))) 4780 break; 4781 4782 /* 4783 * If this is an anon page, we must find the 4784 * correct <vp, offset> for it 4785 */ 4786 fap = NULL; 4787 if (amp != NULL) { 4788 ANON_LOCK_ENTER(&->a_rwlock, 4789 RW_READER); 4790 anon_array_enter(amp, fanon_index, 4791 &cookie); 4792 fap = anon_get_ptr(amp->ahp, 4793 fanon_index); 4794 if (fap != NULL) { 4795 swap_xlate(fap, &fvp, &fpgoff); 4796 } else { 4797 fpgoff = pgoff; 4798 fvp = svd->vp; 4799 } 4800 anon_array_exit(&cookie); 4801 ANON_LOCK_EXIT(&->a_rwlock); 4802 } else { 4803 fpgoff = pgoff; 4804 fvp = svd->vp; 4805 } 4806 if (fvp == NULL) 4807 break; /* XXX */ 4808 /* 4809 * Skip pages that are free or have an 4810 * "exclusive" lock. 4811 */ 4812 pp = page_lookup_nowait(fvp, fpgoff, SE_SHARED); 4813 if (pp == NULL) 4814 break; 4815 /* 4816 * We don't need the page_struct_lock to test 4817 * as this is only advisory; even if we 4818 * acquire it someone might race in and lock 4819 * the page after we unlock and before the 4820 * PUTPAGE, then VOP_PUTPAGE will do nothing. 4821 */ 4822 if (pp->p_lckcnt == 0 && pp->p_cowcnt == 0) { 4823 /* 4824 * Hold the vnode before releasing 4825 * the page lock to prevent it from 4826 * being freed and re-used by some 4827 * other thread. 4828 */ 4829 VN_HOLD(fvp); 4830 page_unlock(pp); 4831 /* 4832 * We should build a page list 4833 * to kluster putpages XXX 4834 */ 4835 (void) VOP_PUTPAGE(fvp, 4836 (offset_t)fpgoff, PAGESIZE, 4837 (B_DONTNEED|B_FREE|B_ASYNC), 4838 svd->cred); 4839 VN_RELE(fvp); 4840 } else { 4841 /* 4842 * XXX - Should the loop terminate if 4843 * the page is `locked'? 4844 */ 4845 page_unlock(pp); 4846 } 4847 --vpp; 4848 --fanon_index; 4849 pgoff -= PAGESIZE; 4850 } 4851 } 4852 } 4853 4854 plp = pl; 4855 *plp = NULL; 4856 pl_alloc_sz = 0; 4857 4858 /* 4859 * See if we need to call VOP_GETPAGE for 4860 * *any* of the range being faulted on. 4861 * We can skip all of this work if there 4862 * was no original vnode. 4863 */ 4864 if (svd->vp != NULL) { 4865 u_offset_t vp_off; 4866 size_t vp_len; 4867 struct anon *ap; 4868 vnode_t *vp; 4869 4870 vp_off = off; 4871 vp_len = len; 4872 4873 if (amp == NULL) 4874 dogetpage = 1; 4875 else { 4876 /* 4877 * Only acquire reader lock to prevent amp->ahp 4878 * from being changed. 
It's ok to miss pages, 4879 * hence we don't do anon_array_enter 4880 */ 4881 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4882 ap = anon_get_ptr(amp->ahp, anon_index); 4883 4884 if (len <= PAGESIZE) 4885 /* inline non_anon() */ 4886 dogetpage = (ap == NULL); 4887 else 4888 dogetpage = non_anon(amp->ahp, anon_index, 4889 &vp_off, &vp_len); 4890 ANON_LOCK_EXIT(&->a_rwlock); 4891 } 4892 4893 if (dogetpage) { 4894 enum seg_rw arw; 4895 struct as *as = seg->s_as; 4896 4897 if (len > ptob((sizeof (pl) / sizeof (pl[0])) - 1)) { 4898 /* 4899 * Page list won't fit in local array, 4900 * allocate one of the needed size. 4901 */ 4902 pl_alloc_sz = 4903 (btop(len) + 1) * sizeof (page_t *); 4904 plp = kmem_alloc(pl_alloc_sz, KM_SLEEP); 4905 plp[0] = NULL; 4906 plsz = len; 4907 } else if (rw == S_WRITE && svd->type == MAP_PRIVATE || 4908 rw == S_OTHER || 4909 (((size_t)(addr + PAGESIZE) < 4910 (size_t)(seg->s_base + seg->s_size)) && 4911 hat_probe(as->a_hat, addr + PAGESIZE))) { 4912 /* 4913 * Ask VOP_GETPAGE to return the exact number 4914 * of pages if 4915 * (a) this is a COW fault, or 4916 * (b) this is a software fault, or 4917 * (c) next page is already mapped. 4918 */ 4919 plsz = len; 4920 } else { 4921 /* 4922 * Ask VOP_GETPAGE to return adjacent pages 4923 * within the segment. 4924 */ 4925 plsz = MIN((size_t)PVN_GETPAGE_SZ, (size_t) 4926 ((seg->s_base + seg->s_size) - addr)); 4927 ASSERT((addr + plsz) <= 4928 (seg->s_base + seg->s_size)); 4929 } 4930 4931 /* 4932 * Need to get some non-anonymous pages. 4933 * We need to make only one call to GETPAGE to do 4934 * this to prevent certain deadlocking conditions 4935 * when we are doing locking. In this case 4936 * non_anon() should have picked up the smallest 4937 * range which includes all the non-anonymous 4938 * pages in the requested range. We have to 4939 * be careful regarding which rw flag to pass in 4940 * because on a private mapping, the underlying 4941 * object is never allowed to be written. 4942 */ 4943 if (rw == S_WRITE && svd->type == MAP_PRIVATE) { 4944 arw = S_READ; 4945 } else { 4946 arw = rw; 4947 } 4948 vp = svd->vp; 4949 TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE, 4950 "segvn_getpage:seg %p addr %p vp %p", 4951 seg, addr, vp); 4952 err = VOP_GETPAGE(vp, (offset_t)vp_off, vp_len, 4953 &vpprot, plp, plsz, seg, addr + (vp_off - off), arw, 4954 svd->cred); 4955 if (err) { 4956 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4957 segvn_pagelist_rele(plp); 4958 if (pl_alloc_sz) 4959 kmem_free(plp, pl_alloc_sz); 4960 return (FC_MAKE_ERR(err)); 4961 } 4962 if (svd->type == MAP_PRIVATE) 4963 vpprot &= ~PROT_WRITE; 4964 } 4965 } 4966 4967 /* 4968 * N.B. at this time the plp array has all the needed non-anon 4969 * pages in addition to (possibly) having some adjacent pages. 4970 */ 4971 4972 /* 4973 * Always acquire the anon_array_lock to prevent 4974 * 2 threads from allocating separate anon slots for 4975 * the same "addr". 4976 * 4977 * If this is a copy-on-write fault and we don't already 4978 * have the anon_array_lock, acquire it to prevent the 4979 * fault routine from handling multiple copy-on-write faults 4980 * on the same "addr" in the same address space. 4981 * 4982 * Only one thread should deal with the fault since after 4983 * it is handled, the other threads can acquire a translation 4984 * to the newly created private page. This prevents two or 4985 * more threads from creating different private pages for the 4986 * same fault. 
4987 * 4988 * We grab "serialization" lock here if this is a MAP_PRIVATE segment 4989 * to prevent deadlock between this thread and another thread 4990 * which has soft-locked this page and wants to acquire serial_lock. 4991 * ( bug 4026339 ) 4992 * 4993 * The fix for bug 4026339 becomes unnecessary when using the 4994 * locking scheme with per amp rwlock and a global set of hash 4995 * lock, anon_array_lock. If we steal a vnode page when low 4996 * on memory and upgrad the page lock through page_rename, 4997 * then the page is PAGE_HANDLED, nothing needs to be done 4998 * for this page after returning from segvn_faultpage. 4999 * 5000 * But really, the page lock should be downgraded after 5001 * the stolen page is page_rename'd. 5002 */ 5003 5004 if (amp != NULL) 5005 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5006 5007 /* 5008 * Ok, now loop over the address range and handle faults 5009 */ 5010 for (a = addr; a < addr + len; a += PAGESIZE, off += PAGESIZE) { 5011 err = segvn_faultpage(hat, seg, a, off, vpage, plp, vpprot, 5012 type, rw, brkcow, a == addr); 5013 if (err) { 5014 if (amp != NULL) 5015 ANON_LOCK_EXIT(&->a_rwlock); 5016 if (type == F_SOFTLOCK && a > addr) { 5017 segvn_softunlock(seg, addr, (a - addr), 5018 S_OTHER); 5019 } 5020 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5021 segvn_pagelist_rele(plp); 5022 if (pl_alloc_sz) 5023 kmem_free(plp, pl_alloc_sz); 5024 return (err); 5025 } 5026 if (vpage) { 5027 vpage++; 5028 } else if (svd->vpage) { 5029 page = seg_page(seg, addr); 5030 vpage = &svd->vpage[++page]; 5031 } 5032 } 5033 5034 /* Didn't get pages from the underlying fs so we're done */ 5035 if (!dogetpage) 5036 goto done; 5037 5038 /* 5039 * Now handle any other pages in the list returned. 5040 * If the page can be used, load up the translations now. 5041 * Note that the for loop will only be entered if "plp" 5042 * is pointing to a non-NULL page pointer which means that 5043 * VOP_GETPAGE() was called and vpprot has been initialized. 5044 */ 5045 if (svd->pageprot == 0) 5046 prot = svd->prot & vpprot; 5047 5048 5049 /* 5050 * Large Files: diff should be unsigned value because we started 5051 * supporting > 2GB segment sizes from 2.5.1 and when a 5052 * large file of size > 2GB gets mapped to address space 5053 * the diff value can be > 2GB. 5054 */ 5055 5056 for (ppp = plp; (pp = *ppp) != NULL; ppp++) { 5057 size_t diff; 5058 struct anon *ap; 5059 int anon_index; 5060 anon_sync_obj_t cookie; 5061 int hat_flag = HAT_LOAD_ADV; 5062 5063 if (svd->flags & MAP_TEXT) { 5064 hat_flag |= HAT_LOAD_TEXT; 5065 } 5066 5067 if (pp == PAGE_HANDLED) 5068 continue; 5069 5070 if (pp->p_offset >= svd->offset && 5071 (pp->p_offset < svd->offset + seg->s_size)) { 5072 5073 diff = pp->p_offset - svd->offset; 5074 5075 /* 5076 * Large Files: Following is the assertion 5077 * validating the above cast. 5078 */ 5079 ASSERT(svd->vp == pp->p_vnode); 5080 5081 page = btop(diff); 5082 if (svd->pageprot) 5083 prot = VPP_PROT(&svd->vpage[page]) & vpprot; 5084 5085 /* 5086 * Prevent other threads in the address space from 5087 * creating private pages (i.e., allocating anon slots) 5088 * while we are in the process of loading translations 5089 * to additional pages returned by the underlying 5090 * object. 
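 *
 * This is why the loop below brackets each extra page with
 * anon_array_enter()/anon_array_exit() and only loads a translation
 * with hat_memload() when no anon slot exists for that index; if an
 * anon slot is found, the vnode page no longer backs that address and
 * is simply unlocked.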
5091 */ 5092 if (amp != NULL) { 5093 anon_index = svd->anon_index + page; 5094 anon_array_enter(amp, anon_index, &cookie); 5095 ap = anon_get_ptr(amp->ahp, anon_index); 5096 } 5097 if ((amp == NULL) || (ap == NULL)) { 5098 if (IS_VMODSORT(pp->p_vnode) || 5099 enable_mbit_wa) { 5100 if (rw == S_WRITE) 5101 hat_setmod(pp); 5102 else if (rw != S_OTHER && 5103 !hat_ismod(pp)) 5104 prot &= ~PROT_WRITE; 5105 } 5106 /* 5107 * Skip mapping read ahead pages marked 5108 * for migration, so they will get migrated 5109 * properly on fault 5110 */ 5111 if ((prot & PROT_READ) && !PP_ISMIGRATE(pp)) { 5112 hat_memload(hat, seg->s_base + diff, 5113 pp, prot, hat_flag); 5114 } 5115 } 5116 if (amp != NULL) 5117 anon_array_exit(&cookie); 5118 } 5119 page_unlock(pp); 5120 } 5121 done: 5122 if (amp != NULL) 5123 ANON_LOCK_EXIT(&->a_rwlock); 5124 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5125 if (pl_alloc_sz) 5126 kmem_free(plp, pl_alloc_sz); 5127 return (0); 5128 } 5129 5130 /* 5131 * This routine is used to start I/O on pages asynchronously. XXX it will 5132 * only create PAGESIZE pages. At fault time they will be relocated into 5133 * larger pages. 5134 */ 5135 static faultcode_t 5136 segvn_faulta(struct seg *seg, caddr_t addr) 5137 { 5138 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5139 int err; 5140 struct anon_map *amp; 5141 vnode_t *vp; 5142 5143 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5144 5145 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 5146 if ((amp = svd->amp) != NULL) { 5147 struct anon *ap; 5148 5149 /* 5150 * Reader lock to prevent amp->ahp from being changed. 5151 * This is advisory, it's ok to miss a page, so 5152 * we don't do anon_array_enter lock. 5153 */ 5154 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5155 if ((ap = anon_get_ptr(amp->ahp, 5156 svd->anon_index + seg_page(seg, addr))) != NULL) { 5157 5158 err = anon_getpage(&ap, NULL, NULL, 5159 0, seg, addr, S_READ, svd->cred); 5160 5161 ANON_LOCK_EXIT(&->a_rwlock); 5162 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5163 if (err) 5164 return (FC_MAKE_ERR(err)); 5165 return (0); 5166 } 5167 ANON_LOCK_EXIT(&->a_rwlock); 5168 } 5169 5170 if (svd->vp == NULL) { 5171 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5172 return (0); /* zfod page - do nothing now */ 5173 } 5174 5175 vp = svd->vp; 5176 TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE, 5177 "segvn_getpage:seg %p addr %p vp %p", seg, addr, vp); 5178 err = VOP_GETPAGE(vp, 5179 (offset_t)(svd->offset + (uintptr_t)(addr - seg->s_base)), 5180 PAGESIZE, NULL, NULL, 0, seg, addr, 5181 S_OTHER, svd->cred); 5182 5183 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5184 if (err) 5185 return (FC_MAKE_ERR(err)); 5186 return (0); 5187 } 5188 5189 static int 5190 segvn_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) 5191 { 5192 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5193 struct vpage *svp, *evp; 5194 struct vnode *vp; 5195 size_t pgsz; 5196 pgcnt_t pgcnt; 5197 anon_sync_obj_t cookie; 5198 5199 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5200 5201 if ((svd->maxprot & prot) != prot) 5202 return (EACCES); /* violated maxprot */ 5203 5204 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 5205 5206 /* return if prot is the same */ 5207 if (!svd->pageprot && svd->prot == prot) { 5208 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5209 return (0); 5210 } 5211 5212 /* 5213 * Since we change protections we first have to flush the cache. 5214 * This makes sure all the pagelock calls have to recheck 5215 * protections. 
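 *
 * The pattern below (and again in segvn_setpagesize) is: if
 * softlockcnt is nonzero, purge this segment's entries from the
 * pagelock cache with segvn_purge() and re-test; if the count is
 * still nonzero there are pending I/Os, so back off with EAGAIN
 * rather than changing protections under an active pagelock.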
5216 */ 5217 if (svd->softlockcnt > 0) { 5218 /* 5219 * Since we do have the segvn writers lock nobody can fill 5220 * the cache with entries belonging to this seg during 5221 * the purge. The flush either succeeds or we still have 5222 * pending I/Os. 5223 */ 5224 segvn_purge(seg); 5225 if (svd->softlockcnt > 0) { 5226 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5227 return (EAGAIN); 5228 } 5229 } 5230 5231 if (seg->s_szc != 0) { 5232 int err; 5233 pgsz = page_get_pagesize(seg->s_szc); 5234 pgcnt = pgsz >> PAGESHIFT; 5235 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 5236 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { 5237 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5238 ASSERT(seg->s_base != addr || seg->s_size != len); 5239 /* 5240 * If we are holding the as lock as a reader then 5241 * we need to return IE_RETRY and let the as 5242 * layer drop and re-aquire the lock as a writer. 5243 */ 5244 if (AS_READ_HELD(seg->s_as, &seg->s_as->a_lock)) 5245 return (IE_RETRY); 5246 VM_STAT_ADD(segvnvmstats.demoterange[1]); 5247 if (svd->type == MAP_PRIVATE || svd->vp != NULL) { 5248 err = segvn_demote_range(seg, addr, len, 5249 SDR_END, 0); 5250 } else { 5251 uint_t szcvec = map_shm_pgszcvec(seg->s_base, 5252 pgsz, (uintptr_t)seg->s_base); 5253 err = segvn_demote_range(seg, addr, len, 5254 SDR_END, szcvec); 5255 } 5256 if (err == 0) 5257 return (IE_RETRY); 5258 if (err == ENOMEM) 5259 return (IE_NOMEM); 5260 return (err); 5261 } 5262 } 5263 5264 5265 /* 5266 * If it's a private mapping and we're making it writable 5267 * and no swap space has been reserved, have to reserve 5268 * it all now. If it's a private mapping to a file (i.e., vp != NULL) 5269 * and we're removing write permission on the entire segment and 5270 * we haven't modified any pages, we can release the swap space. 5271 */ 5272 if (svd->type == MAP_PRIVATE) { 5273 if (prot & PROT_WRITE) { 5274 size_t sz; 5275 if (svd->swresv == 0 && !(svd->flags & MAP_NORESERVE)) { 5276 if (anon_resv(seg->s_size) == 0) { 5277 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5278 return (IE_NOMEM); 5279 } 5280 sz = svd->swresv = seg->s_size; 5281 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 5282 "anon proc:%p %lu %u", 5283 seg, sz, 1); 5284 } 5285 } else { 5286 /* 5287 * Swap space is released only if this segment 5288 * does not map anonymous memory, since read faults 5289 * on such segments still need an anon slot to read 5290 * in the data. 5291 */ 5292 if (svd->swresv != 0 && svd->vp != NULL && 5293 svd->amp == NULL && addr == seg->s_base && 5294 len == seg->s_size && svd->pageprot == 0) { 5295 anon_unresv(svd->swresv); 5296 svd->swresv = 0; 5297 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 5298 "anon proc:%p %lu %u", 5299 seg, 0, 0); 5300 } 5301 } 5302 } 5303 5304 if (addr == seg->s_base && len == seg->s_size && svd->pageprot == 0) { 5305 if (svd->prot == prot) { 5306 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5307 return (0); /* all done */ 5308 } 5309 svd->prot = (uchar_t)prot; 5310 } else if (svd->type == MAP_PRIVATE) { 5311 struct anon *ap = NULL; 5312 page_t *pp; 5313 u_offset_t offset, off; 5314 struct anon_map *amp; 5315 ulong_t anon_idx = 0; 5316 5317 /* 5318 * A vpage structure exists or else the change does not 5319 * involve the entire segment. Establish a vpage structure 5320 * if none is there. Then, for each page in the range, 5321 * adjust its individual permissions. Note that write- 5322 * enabling a MAP_PRIVATE page can affect the claims for 5323 * locked down memory. Overcommitting memory terminates 5324 * the operation. 
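 *
 * Concretely, each locked page whose PROT_WRITE bit changes goes
 * through page_addclaim() (write being added) or page_subclaim()
 * (write being removed), or their _pages variants via
 * segvn_claim_pages() for large page segments. A failed claim breaks
 * out of the loop; the translations for the pages updated so far are
 * then unloaded and IE_NOMEM is returned.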
5325 */ 5326 segvn_vpage(seg); 5327 if ((amp = svd->amp) != NULL) { 5328 anon_idx = svd->anon_index + seg_page(seg, addr); 5329 ASSERT(seg->s_szc == 0 || 5330 IS_P2ALIGNED(anon_idx, pgcnt)); 5331 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5332 } 5333 5334 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 5335 evp = &svd->vpage[seg_page(seg, addr + len)]; 5336 5337 /* 5338 * See Statement at the beginning of segvn_lockop regarding 5339 * the way cowcnts and lckcnts are handled. 5340 */ 5341 for (svp = &svd->vpage[seg_page(seg, addr)]; svp < evp; svp++) { 5342 5343 if (seg->s_szc != 0) { 5344 if (amp != NULL) { 5345 anon_array_enter(amp, anon_idx, 5346 &cookie); 5347 } 5348 if (IS_P2ALIGNED(anon_idx, pgcnt) && 5349 !segvn_claim_pages(seg, svp, offset, 5350 anon_idx, prot)) { 5351 if (amp != NULL) { 5352 anon_array_exit(&cookie); 5353 } 5354 break; 5355 } 5356 if (amp != NULL) { 5357 anon_array_exit(&cookie); 5358 } 5359 anon_idx++; 5360 } else { 5361 if (amp != NULL) { 5362 anon_array_enter(amp, anon_idx, 5363 &cookie); 5364 ap = anon_get_ptr(amp->ahp, anon_idx++); 5365 } 5366 5367 if (VPP_ISPPLOCK(svp) && 5368 VPP_PROT(svp) != prot) { 5369 5370 if (amp == NULL || ap == NULL) { 5371 vp = svd->vp; 5372 off = offset; 5373 } else 5374 swap_xlate(ap, &vp, &off); 5375 if (amp != NULL) 5376 anon_array_exit(&cookie); 5377 5378 if ((pp = page_lookup(vp, off, 5379 SE_SHARED)) == NULL) { 5380 panic("segvn_setprot: no page"); 5381 /*NOTREACHED*/ 5382 } 5383 ASSERT(seg->s_szc == 0); 5384 if ((VPP_PROT(svp) ^ prot) & 5385 PROT_WRITE) { 5386 if (prot & PROT_WRITE) { 5387 if (!page_addclaim(pp)) { 5388 page_unlock(pp); 5389 break; 5390 } 5391 } else { 5392 if (!page_subclaim(pp)) { 5393 page_unlock(pp); 5394 break; 5395 } 5396 } 5397 } 5398 page_unlock(pp); 5399 } else if (amp != NULL) 5400 anon_array_exit(&cookie); 5401 } 5402 VPP_SETPROT(svp, prot); 5403 offset += PAGESIZE; 5404 } 5405 if (amp != NULL) 5406 ANON_LOCK_EXIT(&->a_rwlock); 5407 5408 /* 5409 * Did we terminate prematurely? If so, simply unload 5410 * the translations to the things we've updated so far. 5411 */ 5412 if (svp != evp) { 5413 len = (svp - &svd->vpage[seg_page(seg, addr)]) * 5414 PAGESIZE; 5415 ASSERT(seg->s_szc == 0 || IS_P2ALIGNED(len, pgsz)); 5416 if (len != 0) 5417 hat_unload(seg->s_as->a_hat, addr, 5418 len, HAT_UNLOAD); 5419 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5420 return (IE_NOMEM); 5421 } 5422 } else { 5423 segvn_vpage(seg); 5424 evp = &svd->vpage[seg_page(seg, addr + len)]; 5425 for (svp = &svd->vpage[seg_page(seg, addr)]; svp < evp; svp++) { 5426 VPP_SETPROT(svp, prot); 5427 } 5428 } 5429 5430 if (((prot & PROT_WRITE) != 0 && 5431 (svd->vp != NULL || svd->type == MAP_PRIVATE)) || 5432 (prot & ~PROT_USER) == PROT_NONE) { 5433 /* 5434 * Either private or shared data with write access (in 5435 * which case we need to throw out all former translations 5436 * so that we get the right translations set up on fault 5437 * and we don't allow write access to any copy-on-write pages 5438 * that might be around or to prevent write access to pages 5439 * representing holes in a file), or we don't have permission 5440 * to access the memory at all (in which case we have to 5441 * unload any current translations that might exist). 5442 */ 5443 hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD); 5444 } else { 5445 /* 5446 * A shared mapping or a private mapping in which write 5447 * protection is going to be denied - just change all the 5448 * protections over the range of addresses in question. 
5449 * segvn does not support any other attributes other 5450 * than prot so we can use hat_chgattr. 5451 */ 5452 hat_chgattr(seg->s_as->a_hat, addr, len, prot); 5453 } 5454 5455 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5456 5457 return (0); 5458 } 5459 5460 /* 5461 * segvn_setpagesize is called via SEGOP_SETPAGESIZE from as_setpagesize, 5462 * to determine if the seg is capable of mapping the requested szc. 5463 */ 5464 static int 5465 segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc) 5466 { 5467 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5468 struct segvn_data *nsvd; 5469 struct anon_map *amp = svd->amp; 5470 struct seg *nseg; 5471 caddr_t eaddr = addr + len, a; 5472 size_t pgsz = page_get_pagesize(szc); 5473 pgcnt_t pgcnt = page_get_pagecnt(szc); 5474 int err; 5475 u_offset_t off = svd->offset + (uintptr_t)(addr - seg->s_base); 5476 extern struct vnode kvp; 5477 5478 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 5479 ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size); 5480 5481 if (seg->s_szc == szc || segvn_lpg_disable != 0) { 5482 return (0); 5483 } 5484 5485 /* 5486 * addr should always be pgsz aligned but eaddr may be misaligned if 5487 * it's at the end of the segment. 5488 * 5489 * XXX we should assert this condition since as_setpagesize() logic 5490 * guarantees it. 5491 */ 5492 if (!IS_P2ALIGNED(addr, pgsz) || 5493 (!IS_P2ALIGNED(eaddr, pgsz) && 5494 eaddr != seg->s_base + seg->s_size)) { 5495 5496 segvn_setpgsz_align_err++; 5497 return (EINVAL); 5498 } 5499 5500 if (amp != NULL && svd->type == MAP_SHARED) { 5501 ulong_t an_idx = svd->anon_index + seg_page(seg, addr); 5502 if (!IS_P2ALIGNED(an_idx, pgcnt)) { 5503 5504 segvn_setpgsz_anon_align_err++; 5505 return (EINVAL); 5506 } 5507 } 5508 5509 if ((svd->flags & MAP_NORESERVE) || seg->s_as == &kas || 5510 szc > segvn_maxpgszc) { 5511 return (EINVAL); 5512 } 5513 5514 /* paranoid check */ 5515 if (svd->vp != NULL && 5516 (IS_SWAPFSVP(svd->vp) || svd->vp == &kvp)) { 5517 return (EINVAL); 5518 } 5519 5520 if (seg->s_szc == 0 && svd->vp != NULL && 5521 map_addr_vacalign_check(addr, off)) { 5522 return (EINVAL); 5523 } 5524 5525 /* 5526 * Check that protections are the same within new page 5527 * size boundaries. 5528 */ 5529 if (svd->pageprot) { 5530 for (a = addr; a < eaddr; a += pgsz) { 5531 if ((a + pgsz) > eaddr) { 5532 if (!sameprot(seg, a, eaddr - a)) { 5533 return (EINVAL); 5534 } 5535 } else { 5536 if (!sameprot(seg, a, pgsz)) { 5537 return (EINVAL); 5538 } 5539 } 5540 } 5541 } 5542 5543 /* 5544 * Since we are changing page size we first have to flush 5545 * the cache. This makes sure all the pagelock calls have 5546 * to recheck protections. 5547 */ 5548 if (svd->softlockcnt > 0) { 5549 /* 5550 * Since we do have the segvn writers lock nobody can fill 5551 * the cache with entries belonging to this seg during 5552 * the purge. The flush either succeeds or we still have 5553 * pending I/Os. 5554 */ 5555 segvn_purge(seg); 5556 if (svd->softlockcnt > 0) { 5557 return (EAGAIN); 5558 } 5559 } 5560 5561 /* 5562 * Operation for sub range of existing segment. 
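 *
 * For example (addresses hypothetical, all szc aligned): requesting a
 * larger page size for [base + 4M, base + 8M) of a 16M segment splits
 * off [base, base + 4M), then splits the remainder at base + 8M, and
 * returns IE_RETRY so the operation is retried against the segment
 * that now covers exactly the requested range.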
5563 */ 5564 if (addr != seg->s_base || eaddr != (seg->s_base + seg->s_size)) { 5565 if (szc < seg->s_szc) { 5566 VM_STAT_ADD(segvnvmstats.demoterange[2]); 5567 err = segvn_demote_range(seg, addr, len, SDR_RANGE, 0); 5568 if (err == 0) { 5569 return (IE_RETRY); 5570 } 5571 if (err == ENOMEM) { 5572 return (IE_NOMEM); 5573 } 5574 return (err); 5575 } 5576 if (addr != seg->s_base) { 5577 nseg = segvn_split_seg(seg, addr); 5578 if (eaddr != (nseg->s_base + nseg->s_size)) { 5579 /* eaddr is szc aligned */ 5580 (void) segvn_split_seg(nseg, eaddr); 5581 } 5582 return (IE_RETRY); 5583 } 5584 if (eaddr != (seg->s_base + seg->s_size)) { 5585 /* eaddr is szc aligned */ 5586 (void) segvn_split_seg(seg, eaddr); 5587 } 5588 return (IE_RETRY); 5589 } 5590 5591 /* 5592 * Break any low level sharing and reset seg->s_szc to 0. 5593 */ 5594 if ((err = segvn_clrszc(seg)) != 0) { 5595 if (err == ENOMEM) { 5596 err = IE_NOMEM; 5597 } 5598 return (err); 5599 } 5600 ASSERT(seg->s_szc == 0); 5601 5602 /* 5603 * If the end of the current segment is not pgsz aligned 5604 * then attempt to concatenate with the next segment. 5605 */ 5606 if (!IS_P2ALIGNED(eaddr, pgsz)) { 5607 nseg = AS_SEGNEXT(seg->s_as, seg); 5608 if (nseg == NULL || nseg == seg || eaddr != nseg->s_base) { 5609 return (ENOMEM); 5610 } 5611 if (nseg->s_ops != &segvn_ops) { 5612 return (EINVAL); 5613 } 5614 nsvd = (struct segvn_data *)nseg->s_data; 5615 if (nsvd->softlockcnt > 0) { 5616 segvn_purge(nseg); 5617 if (nsvd->softlockcnt > 0) { 5618 return (EAGAIN); 5619 } 5620 } 5621 err = segvn_clrszc(nseg); 5622 if (err == ENOMEM) { 5623 err = IE_NOMEM; 5624 } 5625 if (err != 0) { 5626 return (err); 5627 } 5628 err = segvn_concat(seg, nseg, 1); 5629 if (err == -1) { 5630 return (EINVAL); 5631 } 5632 if (err == -2) { 5633 return (IE_NOMEM); 5634 } 5635 return (IE_RETRY); 5636 } 5637 5638 /* 5639 * May need to re-align anon array to 5640 * new szc. 5641 */ 5642 if (amp != NULL) { 5643 if (!IS_P2ALIGNED(svd->anon_index, pgcnt)) { 5644 struct anon_hdr *nahp; 5645 5646 ASSERT(svd->type == MAP_PRIVATE); 5647 5648 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 5649 ASSERT(amp->refcnt == 1); 5650 nahp = anon_create(btop(amp->size), ANON_NOSLEEP); 5651 if (nahp == NULL) { 5652 ANON_LOCK_EXIT(&->a_rwlock); 5653 return (IE_NOMEM); 5654 } 5655 if (anon_copy_ptr(amp->ahp, svd->anon_index, 5656 nahp, 0, btop(seg->s_size), ANON_NOSLEEP)) { 5657 anon_release(nahp, btop(amp->size)); 5658 ANON_LOCK_EXIT(&->a_rwlock); 5659 return (IE_NOMEM); 5660 } 5661 anon_release(amp->ahp, btop(amp->size)); 5662 amp->ahp = nahp; 5663 svd->anon_index = 0; 5664 ANON_LOCK_EXIT(&->a_rwlock); 5665 } 5666 } 5667 if (svd->vp != NULL && szc != 0) { 5668 struct vattr va; 5669 u_offset_t eoffpage = svd->offset; 5670 va.va_mask = AT_SIZE; 5671 eoffpage += seg->s_size; 5672 eoffpage = btopr(eoffpage); 5673 if (VOP_GETATTR(svd->vp, &va, 0, svd->cred) != 0) { 5674 segvn_setpgsz_getattr_err++; 5675 return (EINVAL); 5676 } 5677 if (btopr(va.va_size) < eoffpage) { 5678 segvn_setpgsz_eof_err++; 5679 return (EINVAL); 5680 } 5681 if (amp != NULL) { 5682 /* 5683 * anon_fill_cow_holes() may call VOP_GETPAGE(). 5684 * don't take anon map lock here to avoid holding it 5685 * across VOP_GETPAGE() calls that may call back into 5686 * segvn for klsutering checks. We don't really need 5687 * anon map lock here since it's a private segment and 5688 * we hold as level lock as writers. 
5689 */ 5690 if ((err = anon_fill_cow_holes(seg, seg->s_base, 5691 amp->ahp, svd->anon_index, svd->vp, svd->offset, 5692 seg->s_size, szc, svd->prot, svd->vpage, 5693 svd->cred)) != 0) { 5694 return (EINVAL); 5695 } 5696 } 5697 segvn_setvnode_mpss(svd->vp); 5698 } 5699 5700 if (amp != NULL) { 5701 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 5702 if (svd->type == MAP_PRIVATE) { 5703 amp->a_szc = szc; 5704 } else if (szc > amp->a_szc) { 5705 amp->a_szc = szc; 5706 } 5707 ANON_LOCK_EXIT(&->a_rwlock); 5708 } 5709 5710 seg->s_szc = szc; 5711 5712 return (0); 5713 } 5714 5715 static int 5716 segvn_clrszc(struct seg *seg) 5717 { 5718 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5719 struct anon_map *amp = svd->amp; 5720 size_t pgsz; 5721 pgcnt_t pages; 5722 int err = 0; 5723 caddr_t a = seg->s_base; 5724 caddr_t ea = a + seg->s_size; 5725 ulong_t an_idx = svd->anon_index; 5726 vnode_t *vp = svd->vp; 5727 struct vpage *vpage = svd->vpage; 5728 page_t *anon_pl[1 + 1], *pp; 5729 struct anon *ap, *oldap; 5730 uint_t prot = svd->prot, vpprot; 5731 5732 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) || 5733 SEGVN_WRITE_HELD(seg->s_as, &svd->lock)); 5734 5735 if (vp == NULL && amp == NULL) { 5736 seg->s_szc = 0; 5737 return (0); 5738 } 5739 5740 /* 5741 * do HAT_UNLOAD_UNMAP since we are changing the pagesize. 5742 * unload argument is 0 when we are freeing the segment 5743 * and unload was already done. 5744 */ 5745 hat_unload(seg->s_as->a_hat, seg->s_base, seg->s_size, 5746 HAT_UNLOAD_UNMAP); 5747 5748 if (amp == NULL || svd->type == MAP_SHARED) { 5749 seg->s_szc = 0; 5750 return (0); 5751 } 5752 5753 pgsz = page_get_pagesize(seg->s_szc); 5754 pages = btop(pgsz); 5755 5756 /* 5757 * XXX anon rwlock is not really needed because this is a 5758 * private segment and we are writers. 5759 */ 5760 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 5761 5762 for (; a < ea; a += pgsz, an_idx += pages) { 5763 if ((oldap = anon_get_ptr(amp->ahp, an_idx)) != NULL) { 5764 if (svd->pageprot != 0) { 5765 ASSERT(vpage != NULL); 5766 prot = VPP_PROT(vpage); 5767 ASSERT(sameprot(seg, a, pgsz)); 5768 } 5769 if (seg->s_szc != 0) { 5770 ASSERT(vp == NULL || anon_pages(amp->ahp, 5771 an_idx, pages) == pages); 5772 if ((err = anon_map_demotepages(amp, an_idx, 5773 seg, a, prot, vpage, svd->cred)) != 0) { 5774 goto out; 5775 } 5776 } else { 5777 if (oldap->an_refcnt == 1) { 5778 continue; 5779 } 5780 if ((err = anon_getpage(&oldap, &vpprot, 5781 anon_pl, PAGESIZE, seg, a, S_READ, 5782 svd->cred))) { 5783 goto out; 5784 } 5785 if ((pp = anon_private(&ap, seg, a, prot, 5786 anon_pl[0], 0, svd->cred)) == NULL) { 5787 err = ENOMEM; 5788 goto out; 5789 } 5790 anon_decref(oldap); 5791 (void) anon_set_ptr(amp->ahp, an_idx, ap, 5792 ANON_SLEEP); 5793 page_unlock(pp); 5794 } 5795 } 5796 vpage = (vpage == NULL) ? 
NULL : vpage + pages; 5797 } 5798 5799 amp->a_szc = 0; 5800 seg->s_szc = 0; 5801 out: 5802 ANON_LOCK_EXIT(&->a_rwlock); 5803 return (err); 5804 } 5805 5806 static int 5807 segvn_claim_pages( 5808 struct seg *seg, 5809 struct vpage *svp, 5810 u_offset_t off, 5811 ulong_t anon_idx, 5812 uint_t prot) 5813 { 5814 pgcnt_t pgcnt = page_get_pagecnt(seg->s_szc); 5815 size_t ppasize = (pgcnt + 1) * sizeof (page_t *); 5816 page_t **ppa; 5817 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5818 struct anon_map *amp = svd->amp; 5819 struct vpage *evp = svp + pgcnt; 5820 caddr_t addr = ((uintptr_t)(svp - svd->vpage) << PAGESHIFT) 5821 + seg->s_base; 5822 struct anon *ap; 5823 struct vnode *vp = svd->vp; 5824 page_t *pp; 5825 pgcnt_t pg_idx, i; 5826 int err = 0; 5827 anoff_t aoff; 5828 int anon = (amp != NULL) ? 1 : 0; 5829 5830 ASSERT(svd->type == MAP_PRIVATE); 5831 ASSERT(svd->vpage != NULL); 5832 ASSERT(seg->s_szc != 0); 5833 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 5834 ASSERT(amp == NULL || IS_P2ALIGNED(anon_idx, pgcnt)); 5835 ASSERT(sameprot(seg, addr, pgcnt << PAGESHIFT)); 5836 5837 if (VPP_PROT(svp) == prot) 5838 return (1); 5839 if (!((VPP_PROT(svp) ^ prot) & PROT_WRITE)) 5840 return (1); 5841 5842 ppa = kmem_alloc(ppasize, KM_SLEEP); 5843 if (anon && vp != NULL) { 5844 if (anon_get_ptr(amp->ahp, anon_idx) == NULL) { 5845 anon = 0; 5846 ASSERT(!anon_pages(amp->ahp, anon_idx, pgcnt)); 5847 } 5848 ASSERT(!anon || 5849 anon_pages(amp->ahp, anon_idx, pgcnt) == pgcnt); 5850 } 5851 5852 for (*ppa = NULL, pg_idx = 0; svp < evp; svp++, anon_idx++) { 5853 if (!VPP_ISPPLOCK(svp)) 5854 continue; 5855 if (anon) { 5856 ap = anon_get_ptr(amp->ahp, anon_idx); 5857 if (ap == NULL) { 5858 panic("segvn_claim_pages: no anon slot"); 5859 } 5860 swap_xlate(ap, &vp, &aoff); 5861 off = (u_offset_t)aoff; 5862 } 5863 ASSERT(vp != NULL); 5864 if ((pp = page_lookup(vp, 5865 (u_offset_t)off, SE_SHARED)) == NULL) { 5866 panic("segvn_claim_pages: no page"); 5867 } 5868 ppa[pg_idx++] = pp; 5869 off += PAGESIZE; 5870 } 5871 5872 if (ppa[0] == NULL) { 5873 kmem_free(ppa, ppasize); 5874 return (1); 5875 } 5876 5877 ASSERT(pg_idx <= pgcnt); 5878 ppa[pg_idx] = NULL; 5879 5880 if (prot & PROT_WRITE) 5881 err = page_addclaim_pages(ppa); 5882 else 5883 err = page_subclaim_pages(ppa); 5884 5885 for (i = 0; i < pg_idx; i++) { 5886 ASSERT(ppa[i] != NULL); 5887 page_unlock(ppa[i]); 5888 } 5889 5890 kmem_free(ppa, ppasize); 5891 return (err); 5892 } 5893 5894 /* 5895 * Returns right (upper address) segment if split occured. 5896 * If the address is equal to the beginning or end of its segment it returns 5897 * the current segment. 
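 *
 * For example, splitting a segment covering [base, base + 8M) at
 * base + 2M shrinks the original segment to [base, base + 2M) and
 * returns a new segment for [base + 2M, base + 8M); the vpage array,
 * any private anon_map and the swap reservation are divided between
 * the two segments below.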
5898 */ 5899 static struct seg * 5900 segvn_split_seg(struct seg *seg, caddr_t addr) 5901 { 5902 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5903 struct seg *nseg; 5904 size_t nsize; 5905 struct segvn_data *nsvd; 5906 5907 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 5908 ASSERT(addr >= seg->s_base); 5909 ASSERT(addr <= seg->s_base + seg->s_size); 5910 5911 if (addr == seg->s_base || addr == seg->s_base + seg->s_size) 5912 return (seg); 5913 5914 nsize = seg->s_base + seg->s_size - addr; 5915 seg->s_size = addr - seg->s_base; 5916 nseg = seg_alloc(seg->s_as, addr, nsize); 5917 ASSERT(nseg != NULL); 5918 nseg->s_ops = seg->s_ops; 5919 nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 5920 nseg->s_data = (void *)nsvd; 5921 nseg->s_szc = seg->s_szc; 5922 *nsvd = *svd; 5923 rw_init(&nsvd->lock, NULL, RW_DEFAULT, NULL); 5924 5925 if (nsvd->vp != NULL) { 5926 VN_HOLD(nsvd->vp); 5927 nsvd->offset = svd->offset + 5928 (uintptr_t)(nseg->s_base - seg->s_base); 5929 if (nsvd->type == MAP_SHARED) 5930 lgrp_shm_policy_init(NULL, nsvd->vp); 5931 } else { 5932 /* 5933 * The offset for an anonymous segment has no signifigance in 5934 * terms of an offset into a file. If we were to use the above 5935 * calculation instead, the structures read out of 5936 * /proc/<pid>/xmap would be more difficult to decipher since 5937 * it would be unclear whether two seemingly contiguous 5938 * prxmap_t structures represented different segments or a 5939 * single segment that had been split up into multiple prxmap_t 5940 * structures (e.g. if some part of the segment had not yet 5941 * been faulted in). 5942 */ 5943 nsvd->offset = 0; 5944 } 5945 5946 ASSERT(svd->softlockcnt == 0); 5947 crhold(svd->cred); 5948 5949 if (svd->vpage != NULL) { 5950 size_t bytes = vpgtob(seg_pages(seg)); 5951 size_t nbytes = vpgtob(seg_pages(nseg)); 5952 struct vpage *ovpage = svd->vpage; 5953 5954 svd->vpage = kmem_alloc(bytes, KM_SLEEP); 5955 bcopy(ovpage, svd->vpage, bytes); 5956 nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP); 5957 bcopy(ovpage + seg_pages(seg), nsvd->vpage, nbytes); 5958 kmem_free(ovpage, bytes + nbytes); 5959 } 5960 if (svd->amp != NULL && svd->type == MAP_PRIVATE) { 5961 struct anon_map *oamp = svd->amp, *namp; 5962 struct anon_hdr *nahp; 5963 5964 ANON_LOCK_ENTER(&oamp->a_rwlock, RW_WRITER); 5965 ASSERT(oamp->refcnt == 1); 5966 nahp = anon_create(btop(seg->s_size), ANON_SLEEP); 5967 (void) anon_copy_ptr(oamp->ahp, svd->anon_index, 5968 nahp, 0, btop(seg->s_size), ANON_SLEEP); 5969 5970 namp = anonmap_alloc(nseg->s_size, 0); 5971 namp->a_szc = nseg->s_szc; 5972 (void) anon_copy_ptr(oamp->ahp, 5973 svd->anon_index + btop(seg->s_size), 5974 namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP); 5975 anon_release(oamp->ahp, btop(oamp->size)); 5976 oamp->ahp = nahp; 5977 oamp->size = seg->s_size; 5978 svd->anon_index = 0; 5979 nsvd->amp = namp; 5980 nsvd->anon_index = 0; 5981 ANON_LOCK_EXIT(&oamp->a_rwlock); 5982 } else if (svd->amp != NULL) { 5983 pgcnt_t pgcnt = page_get_pagecnt(seg->s_szc); 5984 ASSERT(svd->amp == nsvd->amp); 5985 ASSERT(seg->s_szc <= svd->amp->a_szc); 5986 nsvd->anon_index = svd->anon_index + seg_pages(seg); 5987 ASSERT(IS_P2ALIGNED(nsvd->anon_index, pgcnt)); 5988 ANON_LOCK_ENTER(&svd->amp->a_rwlock, RW_WRITER); 5989 svd->amp->refcnt++; 5990 ANON_LOCK_EXIT(&svd->amp->a_rwlock); 5991 } 5992 5993 /* 5994 * Split amount of swap reserve 5995 */ 5996 if (svd->swresv) { 5997 /* 5998 * For MAP_NORESERVE, only allocate swap reserve for pages 5999 * being used. 
Other segments get enough to cover whole 6000 * segment. 6001 */ 6002 if (svd->flags & MAP_NORESERVE) { 6003 size_t oswresv; 6004 6005 ASSERT(svd->amp); 6006 oswresv = svd->swresv; 6007 svd->swresv = ptob(anon_pages(svd->amp->ahp, 6008 svd->anon_index, btop(seg->s_size))); 6009 nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp, 6010 nsvd->anon_index, btop(nseg->s_size))); 6011 ASSERT(oswresv >= (svd->swresv + nsvd->swresv)); 6012 } else { 6013 ASSERT(svd->swresv == seg->s_size + nseg->s_size); 6014 svd->swresv = seg->s_size; 6015 nsvd->swresv = nseg->s_size; 6016 } 6017 } 6018 6019 return (nseg); 6020 } 6021 6022 /* 6023 * called on memory operations (unmap, setprot, setpagesize) for a subset 6024 * of a large page segment to either demote the memory range (SDR_RANGE) 6025 * or the ends (SDR_END) by addr/len. 6026 * 6027 * returns 0 on success. returns errno, including ENOMEM, on failure. 6028 */ 6029 static int 6030 segvn_demote_range( 6031 struct seg *seg, 6032 caddr_t addr, 6033 size_t len, 6034 int flag, 6035 uint_t szcvec) 6036 { 6037 caddr_t eaddr = addr + len; 6038 caddr_t lpgaddr, lpgeaddr; 6039 struct seg *nseg; 6040 struct seg *badseg1 = NULL; 6041 struct seg *badseg2 = NULL; 6042 size_t pgsz; 6043 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6044 int err; 6045 uint_t szc = seg->s_szc; 6046 uint_t tszcvec; 6047 6048 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 6049 ASSERT(szc != 0); 6050 pgsz = page_get_pagesize(szc); 6051 ASSERT(seg->s_base != addr || seg->s_size != len); 6052 ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size); 6053 ASSERT(svd->softlockcnt == 0); 6054 ASSERT(szcvec == 0 || (flag == SDR_END && svd->type == MAP_SHARED)); 6055 6056 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 6057 ASSERT(flag == SDR_RANGE || eaddr < lpgeaddr || addr > lpgaddr); 6058 if (flag == SDR_RANGE) { 6059 /* demote entire range */ 6060 badseg1 = nseg = segvn_split_seg(seg, lpgaddr); 6061 (void) segvn_split_seg(nseg, lpgeaddr); 6062 ASSERT(badseg1->s_base == lpgaddr); 6063 ASSERT(badseg1->s_size == lpgeaddr - lpgaddr); 6064 } else if (addr != lpgaddr) { 6065 ASSERT(flag == SDR_END); 6066 badseg1 = nseg = segvn_split_seg(seg, lpgaddr); 6067 if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz && 6068 eaddr < lpgaddr + 2 * pgsz) { 6069 (void) segvn_split_seg(nseg, lpgeaddr); 6070 ASSERT(badseg1->s_base == lpgaddr); 6071 ASSERT(badseg1->s_size == 2 * pgsz); 6072 } else { 6073 nseg = segvn_split_seg(nseg, lpgaddr + pgsz); 6074 ASSERT(badseg1->s_base == lpgaddr); 6075 ASSERT(badseg1->s_size == pgsz); 6076 if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz) { 6077 ASSERT(lpgeaddr - lpgaddr > 2 * pgsz); 6078 nseg = segvn_split_seg(nseg, lpgeaddr - pgsz); 6079 badseg2 = nseg; 6080 (void) segvn_split_seg(nseg, lpgeaddr); 6081 ASSERT(badseg2->s_base == lpgeaddr - pgsz); 6082 ASSERT(badseg2->s_size == pgsz); 6083 } 6084 } 6085 } else { 6086 ASSERT(flag == SDR_END); 6087 ASSERT(eaddr < lpgeaddr); 6088 badseg1 = nseg = segvn_split_seg(seg, lpgeaddr - pgsz); 6089 (void) segvn_split_seg(nseg, lpgeaddr); 6090 ASSERT(badseg1->s_base == lpgeaddr - pgsz); 6091 ASSERT(badseg1->s_size == pgsz); 6092 } 6093 6094 ASSERT(badseg1 != NULL); 6095 ASSERT(badseg1->s_szc == szc); 6096 ASSERT(flag == SDR_RANGE || badseg1->s_size == pgsz || 6097 badseg1->s_size == 2 * pgsz); 6098 ASSERT(sameprot(badseg1, badseg1->s_base, pgsz)); 6099 ASSERT(badseg1->s_size == pgsz || 6100 sameprot(badseg1, badseg1->s_base + pgsz, pgsz)); 6101 if (err = segvn_clrszc(badseg1)) { 6102 return (err); 6103 } 
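	/*
	 * Illustrative sketch (not part of the driver): how the SDR_END case
	 * above carves at most one large page off each end of the operation.
	 * Assuming CALC_LPG_REGION() rounds addr down and addr + len up to
	 * the large page size (clamped to the segment), then for a 4M large
	 * page (pgsz == 0x400000):
	 *
	 *	addr = 0x1234000, eaddr = 0x1c00000
	 *	lpgaddr  = P2ALIGN(addr, pgsz)    = 0x1000000
	 *	lpgeaddr = P2ROUNDUP(eaddr, pgsz) = 0x1c00000
	 *
	 * Only the start is misaligned, so badseg1 is split out to cover the
	 * single large page [0x1000000, 0x1400000), badseg2 stays NULL, and
	 * only that one large page has its size code cleared by
	 * segvn_clrszc() above.  The example addresses are hypothetical.
	 */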
6104 ASSERT(badseg1->s_szc == 0); 6105 6106 if (szc > 1 && (tszcvec = P2PHASE(szcvec, 1 << szc)) > 1) { 6107 uint_t tszc = highbit(tszcvec) - 1; 6108 caddr_t ta = MAX(addr, badseg1->s_base); 6109 caddr_t te; 6110 size_t tpgsz = page_get_pagesize(tszc); 6111 6112 ASSERT(svd->type == MAP_SHARED); 6113 ASSERT(flag == SDR_END); 6114 ASSERT(tszc < szc && tszc > 0); 6115 6116 if (eaddr > badseg1->s_base + badseg1->s_size) { 6117 te = badseg1->s_base + badseg1->s_size; 6118 } else { 6119 te = eaddr; 6120 } 6121 6122 ASSERT(ta <= te); 6123 badseg1->s_szc = tszc; 6124 if (!IS_P2ALIGNED(ta, tpgsz) || !IS_P2ALIGNED(te, tpgsz)) { 6125 if (badseg2 != NULL) { 6126 err = segvn_demote_range(badseg1, ta, te - ta, 6127 SDR_END, tszcvec); 6128 if (err != 0) { 6129 return (err); 6130 } 6131 } else { 6132 return (segvn_demote_range(badseg1, ta, 6133 te - ta, SDR_END, tszcvec)); 6134 } 6135 } 6136 } 6137 6138 if (badseg2 == NULL) 6139 return (0); 6140 ASSERT(badseg2->s_szc == szc); 6141 ASSERT(badseg2->s_size == pgsz); 6142 ASSERT(sameprot(badseg2, badseg2->s_base, badseg2->s_size)); 6143 if (err = segvn_clrszc(badseg2)) { 6144 return (err); 6145 } 6146 ASSERT(badseg2->s_szc == 0); 6147 6148 if (szc > 1 && (tszcvec = P2PHASE(szcvec, 1 << szc)) > 1) { 6149 uint_t tszc = highbit(tszcvec) - 1; 6150 size_t tpgsz = page_get_pagesize(tszc); 6151 6152 ASSERT(svd->type == MAP_SHARED); 6153 ASSERT(flag == SDR_END); 6154 ASSERT(tszc < szc && tszc > 0); 6155 ASSERT(badseg2->s_base > addr); 6156 ASSERT(eaddr > badseg2->s_base); 6157 ASSERT(eaddr < badseg2->s_base + badseg2->s_size); 6158 6159 badseg2->s_szc = tszc; 6160 if (!IS_P2ALIGNED(eaddr, tpgsz)) { 6161 return (segvn_demote_range(badseg2, badseg2->s_base, 6162 eaddr - badseg2->s_base, SDR_END, tszcvec)); 6163 } 6164 } 6165 6166 return (0); 6167 } 6168 6169 static int 6170 segvn_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) 6171 { 6172 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6173 struct vpage *vp, *evp; 6174 6175 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6176 6177 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6178 /* 6179 * If segment protection can be used, simply check against them. 6180 */ 6181 if (svd->pageprot == 0) { 6182 int err; 6183 6184 err = ((svd->prot & prot) != prot) ? EACCES : 0; 6185 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6186 return (err); 6187 } 6188 6189 /* 6190 * Have to check down to the vpage level. 
6191 */ 6192 evp = &svd->vpage[seg_page(seg, addr + len)]; 6193 for (vp = &svd->vpage[seg_page(seg, addr)]; vp < evp; vp++) { 6194 if ((VPP_PROT(vp) & prot) != prot) { 6195 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6196 return (EACCES); 6197 } 6198 } 6199 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6200 return (0); 6201 } 6202 6203 static int 6204 segvn_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv) 6205 { 6206 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6207 size_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1; 6208 6209 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6210 6211 if (pgno != 0) { 6212 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6213 if (svd->pageprot == 0) { 6214 do 6215 protv[--pgno] = svd->prot; 6216 while (pgno != 0); 6217 } else { 6218 size_t pgoff = seg_page(seg, addr); 6219 6220 do { 6221 pgno--; 6222 protv[pgno] = VPP_PROT(&svd->vpage[pgno+pgoff]); 6223 } while (pgno != 0); 6224 } 6225 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6226 } 6227 return (0); 6228 } 6229 6230 static u_offset_t 6231 segvn_getoffset(struct seg *seg, caddr_t addr) 6232 { 6233 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6234 6235 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6236 6237 return (svd->offset + (uintptr_t)(addr - seg->s_base)); 6238 } 6239 6240 /*ARGSUSED*/ 6241 static int 6242 segvn_gettype(struct seg *seg, caddr_t addr) 6243 { 6244 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6245 6246 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6247 6248 return (svd->type | (svd->flags & MAP_NORESERVE)); 6249 } 6250 6251 /*ARGSUSED*/ 6252 static int 6253 segvn_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp) 6254 { 6255 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6256 6257 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6258 6259 *vpp = svd->vp; 6260 return (0); 6261 } 6262 6263 /* 6264 * Check to see if it makes sense to do kluster/read ahead to 6265 * addr + delta relative to the mapping at addr. We assume here 6266 * that delta is a signed PAGESIZE'd multiple (which can be negative). 6267 * 6268 * For segvn, we currently "approve" of the action if we are 6269 * still in the segment and it maps from the same vp/off, 6270 * or if the advice stored in segvn_data or vpages allows it. 6271 * Currently, klustering is not allowed only if MADV_RANDOM is set. 6272 */ 6273 static int 6274 segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta) 6275 { 6276 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6277 struct anon *oap, *ap; 6278 ssize_t pd; 6279 size_t page; 6280 struct vnode *vp1, *vp2; 6281 u_offset_t off1, off2; 6282 struct anon_map *amp; 6283 6284 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6285 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) || 6286 SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 6287 6288 if (addr + delta < seg->s_base || 6289 addr + delta >= (seg->s_base + seg->s_size)) 6290 return (-1); /* exceeded segment bounds */ 6291 6292 pd = delta / (ssize_t)PAGESIZE; /* divide to preserve sign bit */ 6293 page = seg_page(seg, addr); 6294 6295 /* 6296 * Check to see if either of the pages addr or addr + delta 6297 * have advice set that prevents klustering (if MADV_RANDOM advice 6298 * is set for entire segment, or MADV_SEQUENTIAL is set and delta 6299 * is negative). 
6300 	 */
6301 	if (svd->advice == MADV_RANDOM ||
6302 	    svd->advice == MADV_SEQUENTIAL && delta < 0)
6303 		return (-1);
6304 	else if (svd->pageadvice && svd->vpage) {
6305 		struct vpage *bvpp, *evpp;
6306
6307 		bvpp = &svd->vpage[page];
6308 		evpp = &svd->vpage[page + pd];
6309 		if (VPP_ADVICE(bvpp) == MADV_RANDOM ||
6310 		    VPP_ADVICE(evpp) == MADV_SEQUENTIAL && delta < 0)
6311 			return (-1);
6312 		if (VPP_ADVICE(bvpp) != VPP_ADVICE(evpp) &&
6313 		    VPP_ADVICE(evpp) == MADV_RANDOM)
6314 			return (-1);
6315 	}
6316
6317 	if (svd->type == MAP_SHARED)
6318 		return (0);		/* shared mapping - all ok */
6319
6320 	if ((amp = svd->amp) == NULL)
6321 		return (0);		/* off original vnode */
6322
6323 	page += svd->anon_index;
6324
6325 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
6326
6327 	oap = anon_get_ptr(amp->ahp, page);
6328 	ap = anon_get_ptr(amp->ahp, page + pd);
6329
6330 	ANON_LOCK_EXIT(&amp->a_rwlock);
6331
6332 	if ((oap == NULL && ap != NULL) || (oap != NULL && ap == NULL)) {
6333 		return (-1);		/* one with and one without an anon */
6334 	}
6335
6336 	if (oap == NULL) {		/* implies that ap == NULL */
6337 		return (0);		/* off original vnode */
6338 	}
6339
6340 	/*
6341 	 * Now we know we have two anon pointers - check to
6342 	 * see if they happen to be properly allocated.
6343 	 */
6344
6345 	/*
6346 	 * XXX We cheat here and don't lock the anon slots. We can't because
6347 	 * we may have been called from the anon layer which might already
6348 	 * have locked them. We are holding a refcnt on the slots so they
6349 	 * can't disappear. The worst that will happen is we'll get the wrong
6350 	 * names (vp, off) for the slots and make a poor klustering decision.
6351 	 */
6352 	swap_xlate(ap, &vp1, &off1);
6353 	swap_xlate(oap, &vp2, &off2);
6354
6355
6356 	if (!VOP_CMP(vp1, vp2) || off1 - off2 != delta)
6357 		return (-1);
6358 	return (0);
6359 }
6360
6361 /*
6362  * Swap the pages of seg out to secondary storage, returning the
6363  * number of bytes of storage freed.
6364  *
6365  * The basic idea is first to unload all translations and then to call
6366  * VOP_PUTPAGE() for all newly-unmapped pages, to push them out to the
6367  * swap device. Pages to which other segments have mappings will remain
6368  * mapped and won't be swapped. Our caller (as_swapout) has already
6369  * performed the unloading step.
6370  *
6371  * The value returned is intended to correlate well with the process's
6372  * memory requirements. However, there are some caveats:
6373  *	1)	When given a shared segment as argument, this routine will
6374  *		only succeed in swapping out pages for the last sharer of the
6375  *		segment. (Previous callers will only have decremented mapping
6376  *		reference counts.)
6377  *	2)	We assume that the hat layer maintains a large enough translation
6378  *		cache to capture process reference patterns.
6379  */
6380 static size_t
6381 segvn_swapout(struct seg *seg)
6382 {
6383 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
6384 	struct anon_map *amp;
6385 	pgcnt_t pgcnt = 0;
6386 	pgcnt_t npages;
6387 	pgcnt_t page;
6388 	ulong_t anon_index;
6389
6390 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
6391
6392 	SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
6393 	/*
6394 	 * Find pages unmapped by our caller and force them
6395 	 * out to the virtual swap device.
6396 */ 6397 if ((amp = svd->amp) != NULL) 6398 anon_index = svd->anon_index; 6399 npages = seg->s_size >> PAGESHIFT; 6400 for (page = 0; page < npages; page++) { 6401 page_t *pp; 6402 struct anon *ap; 6403 struct vnode *vp; 6404 u_offset_t off; 6405 anon_sync_obj_t cookie; 6406 6407 /* 6408 * Obtain <vp, off> pair for the page, then look it up. 6409 * 6410 * Note that this code is willing to consider regular 6411 * pages as well as anon pages. Is this appropriate here? 6412 */ 6413 ap = NULL; 6414 if (amp != NULL) { 6415 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 6416 if (anon_array_try_enter(amp, anon_index + page, 6417 &cookie)) { 6418 ANON_LOCK_EXIT(&->a_rwlock); 6419 continue; 6420 } 6421 ap = anon_get_ptr(amp->ahp, anon_index + page); 6422 if (ap != NULL) { 6423 swap_xlate(ap, &vp, &off); 6424 } else { 6425 vp = svd->vp; 6426 off = svd->offset + ptob(page); 6427 } 6428 anon_array_exit(&cookie); 6429 ANON_LOCK_EXIT(&->a_rwlock); 6430 } else { 6431 vp = svd->vp; 6432 off = svd->offset + ptob(page); 6433 } 6434 if (vp == NULL) { /* untouched zfod page */ 6435 ASSERT(ap == NULL); 6436 continue; 6437 } 6438 6439 pp = page_lookup_nowait(vp, off, SE_SHARED); 6440 if (pp == NULL) 6441 continue; 6442 6443 6444 /* 6445 * Examine the page to see whether it can be tossed out, 6446 * keeping track of how many we've found. 6447 */ 6448 if (!page_tryupgrade(pp)) { 6449 /* 6450 * If the page has an i/o lock and no mappings, 6451 * it's very likely that the page is being 6452 * written out as a result of klustering. 6453 * Assume this is so and take credit for it here. 6454 */ 6455 if (!page_io_trylock(pp)) { 6456 if (!hat_page_is_mapped(pp)) 6457 pgcnt++; 6458 } else { 6459 page_io_unlock(pp); 6460 } 6461 page_unlock(pp); 6462 continue; 6463 } 6464 ASSERT(!page_iolock_assert(pp)); 6465 6466 6467 /* 6468 * Skip if page is locked or has mappings. 6469 * We don't need the page_struct_lock to look at lckcnt 6470 * and cowcnt because the page is exclusive locked. 6471 */ 6472 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0 || 6473 hat_page_is_mapped(pp)) { 6474 page_unlock(pp); 6475 continue; 6476 } 6477 6478 /* 6479 * dispose skips large pages so try to demote first. 6480 */ 6481 if (pp->p_szc != 0 && !page_try_demote_pages(pp)) { 6482 page_unlock(pp); 6483 /* 6484 * XXX should skip the remaining page_t's of this 6485 * large page. 6486 */ 6487 continue; 6488 } 6489 6490 ASSERT(pp->p_szc == 0); 6491 6492 /* 6493 * No longer mapped -- we can toss it out. How 6494 * we do so depends on whether or not it's dirty. 6495 */ 6496 if (hat_ismod(pp) && pp->p_vnode) { 6497 /* 6498 * We must clean the page before it can be 6499 * freed. Setting B_FREE will cause pvn_done 6500 * to free the page when the i/o completes. 6501 * XXX: This also causes it to be accounted 6502 * as a pageout instead of a swap: need 6503 * B_SWAPOUT bit to use instead of B_FREE. 6504 * 6505 * Hold the vnode before releasing the page lock 6506 * to prevent it from being freed and re-used by 6507 * some other thread. 6508 */ 6509 VN_HOLD(vp); 6510 page_unlock(pp); 6511 6512 /* 6513 * Queue all i/o requests for the pageout thread 6514 * to avoid saturating the pageout devices. 6515 */ 6516 if (!queue_io_request(vp, off)) 6517 VN_RELE(vp); 6518 } else { 6519 /* 6520 * The page was clean, free it. 6521 * 6522 * XXX: Can we ever encounter modified pages 6523 * with no associated vnode here? 
6524 */ 6525 ASSERT(pp->p_vnode != NULL); 6526 /*LINTED: constant in conditional context*/ 6527 VN_DISPOSE(pp, B_FREE, 0, kcred); 6528 } 6529 6530 /* 6531 * Credit now even if i/o is in progress. 6532 */ 6533 pgcnt++; 6534 } 6535 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6536 6537 /* 6538 * Wakeup pageout to initiate i/o on all queued requests. 6539 */ 6540 cv_signal_pageout(); 6541 return (ptob(pgcnt)); 6542 } 6543 6544 /* 6545 * Synchronize primary storage cache with real object in virtual memory. 6546 * 6547 * XXX - Anonymous pages should not be sync'ed out at all. 6548 */ 6549 static int 6550 segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) 6551 { 6552 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6553 struct vpage *vpp; 6554 page_t *pp; 6555 u_offset_t offset; 6556 struct vnode *vp; 6557 u_offset_t off; 6558 caddr_t eaddr; 6559 int bflags; 6560 int err = 0; 6561 int segtype; 6562 int pageprot; 6563 int prot; 6564 ulong_t anon_index; 6565 struct anon_map *amp; 6566 struct anon *ap; 6567 anon_sync_obj_t cookie; 6568 6569 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6570 6571 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6572 6573 if (svd->softlockcnt > 0) { 6574 /* 6575 * flush all pages from seg cache 6576 * otherwise we may deadlock in swap_putpage 6577 * for B_INVAL page (4175402). 6578 * 6579 * Even if we grab segvn WRITER's lock or segp_slock 6580 * here, there might be another thread which could've 6581 * successfully performed lookup/insert just before 6582 * we acquired the lock here. So, grabbing either 6583 * lock here is of not much use. Until we devise 6584 * a strategy at upper layers to solve the 6585 * synchronization issues completely, we expect 6586 * applications to handle this appropriately. 6587 */ 6588 segvn_purge(seg); 6589 if (svd->softlockcnt > 0) { 6590 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6591 return (EAGAIN); 6592 } 6593 } 6594 6595 vpp = svd->vpage; 6596 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 6597 bflags = ((flags & MS_ASYNC) ? B_ASYNC : 0) | 6598 ((flags & MS_INVALIDATE) ? B_INVAL : 0); 6599 6600 if (attr) { 6601 pageprot = attr & ~(SHARED|PRIVATE); 6602 segtype = (attr & SHARED) ? MAP_SHARED : MAP_PRIVATE; 6603 6604 /* 6605 * We are done if the segment types don't match 6606 * or if we have segment level protections and 6607 * they don't match. 6608 */ 6609 if (svd->type != segtype) { 6610 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6611 return (0); 6612 } 6613 if (vpp == NULL) { 6614 if (svd->prot != pageprot) { 6615 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6616 return (0); 6617 } 6618 prot = svd->prot; 6619 } else 6620 vpp = &svd->vpage[seg_page(seg, addr)]; 6621 6622 } else if (svd->vp && svd->amp == NULL && 6623 (flags & MS_INVALIDATE) == 0) { 6624 6625 /* 6626 * No attributes, no anonymous pages and MS_INVALIDATE flag 6627 * is not on, just use one big request. 
6628 		 */
6629 		err = VOP_PUTPAGE(svd->vp, (offset_t)offset, len,
6630 		    bflags, svd->cred);
6631 		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
6632 		return (err);
6633 	}
6634
6635 	if ((amp = svd->amp) != NULL)
6636 		anon_index = svd->anon_index + seg_page(seg, addr);
6637
6638 	for (eaddr = addr + len; addr < eaddr; addr += PAGESIZE) {
6639 		ap = NULL;
6640 		if (amp != NULL) {
6641 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
6642 			anon_array_enter(amp, anon_index, &cookie);
6643 			ap = anon_get_ptr(amp->ahp, anon_index++);
6644 			if (ap != NULL) {
6645 				swap_xlate(ap, &vp, &off);
6646 			} else {
6647 				vp = svd->vp;
6648 				off = offset;
6649 			}
6650 			anon_array_exit(&cookie);
6651 			ANON_LOCK_EXIT(&amp->a_rwlock);
6652 		} else {
6653 			vp = svd->vp;
6654 			off = offset;
6655 		}
6656 		offset += PAGESIZE;
6657
6658 		if (vp == NULL)		/* untouched zfod page */
6659 			continue;
6660
6661 		if (attr) {
6662 			if (vpp) {
6663 				prot = VPP_PROT(vpp);
6664 				vpp++;
6665 			}
6666 			if (prot != pageprot) {
6667 				continue;
6668 			}
6669 		}
6670
6671 		/*
6672 		 * See if any of these pages are locked -- if so, then we
6673 		 * will have to truncate an invalidate request at the first
6674 		 * locked one. We don't need the page_struct_lock to test
6675 		 * as this is only advisory; even if we acquire it someone
6676 		 * might race in and lock the page after we unlock and before
6677 		 * we do the PUTPAGE, then PUTPAGE simply does nothing.
6678 		 */
6679 		if (flags & MS_INVALIDATE) {
6680 			if ((pp = page_lookup(vp, off, SE_SHARED)) != NULL) {
6681 				if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
6682 					page_unlock(pp);
6683 					SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
6684 					return (EBUSY);
6685 				}
6686 				if (ap != NULL && pp->p_szc != 0 &&
6687 				    page_tryupgrade(pp)) {
6688 					if (pp->p_lckcnt == 0 &&
6689 					    pp->p_cowcnt == 0) {
6690 						/*
6691 						 * swapfs VN_DISPOSE() won't
6692 						 * invalidate large pages.
6693 						 * Attempt to demote.
6694 						 * XXX can't help it if it
6695 						 * fails. But for swapfs
6696 						 * pages it is no big deal.
6697 						 */
6698 						(void) page_try_demote_pages(
6699 						    pp);
6700 					}
6701 				}
6702 				page_unlock(pp);
6703 			}
6704 		} else if (svd->type == MAP_SHARED && amp != NULL) {
6705 			/*
6706 			 * Avoid writing out to disk ISM's large pages
6707 			 * because segspt_free_pages() relies on NULL an_pvp
6708 			 * of anon slots of such pages.
6709 			 */
6710
6711 			ASSERT(svd->vp == NULL);
6712 			/*
6713 			 * swapfs uses page_lookup_nowait if not freeing or
6714 			 * invalidating and skips a page if
6715 			 * page_lookup_nowait returns NULL.
6716 			 */
6717 			pp = page_lookup_nowait(vp, off, SE_SHARED);
6718 			if (pp == NULL) {
6719 				continue;
6720 			}
6721 			if (pp->p_szc != 0) {
6722 				page_unlock(pp);
6723 				continue;
6724 			}
6725
6726 			/*
6727 			 * Note ISM pages are created large so (vp, off)'s
6728 			 * page cannot suddenly become large after we unlock
6729 			 * pp.
6730 			 */
6731 			page_unlock(pp);
6732 		}
6733 		/*
6734 		 * XXX - Should ultimately try to kluster
6735 		 * calls to VOP_PUTPAGE() for performance.
6736 		 */
6737 		VN_HOLD(vp);
6738 		err = VOP_PUTPAGE(vp, (offset_t)off, PAGESIZE,
6739 		    bflags, svd->cred);
6740 		VN_RELE(vp);
6741 		if (err)
6742 			break;
6743 	}
6744 	SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
6745 	return (err);
6746 }
6747
6748 /*
6749  * Determine if we have data corresponding to pages in the
6750  * primary storage virtual memory cache (i.e., "in core").
6751  */
6752 static size_t
6753 segvn_incore(struct seg *seg, caddr_t addr, size_t len, char *vec)
6754 {
6755 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
6756 	struct vnode *vp, *avp;
6757 	u_offset_t offset, aoffset;
6758 	size_t p, ep;
6759 	int ret;
6760 	struct vpage *vpp;
6761 	page_t *pp;
6762 	uint_t start;
6763 	struct anon_map *amp;		/* XXX - for locknest */
6764 	struct anon *ap;
6765 	uint_t attr;
6766 	anon_sync_obj_t cookie;
6767
6768 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
6769
6770 	SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
6771 	if (svd->amp == NULL && svd->vp == NULL) {
6772 		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
6773 		bzero(vec, btopr(len));
6774 		return (len);	/* no anonymous pages created yet */
6775 	}
6776
6777 	p = seg_page(seg, addr);
6778 	ep = seg_page(seg, addr + len);
6779 	start = svd->vp ? SEG_PAGE_VNODEBACKED : 0;
6780
6781 	amp = svd->amp;
6782 	for (; p < ep; p++, addr += PAGESIZE) {
6783 		vpp = (svd->vpage) ? &svd->vpage[p]: NULL;
6784 		ret = start;
6785 		ap = NULL;
6786 		avp = NULL;
6787 		/* Grab the vnode/offset for the anon slot */
6788 		if (amp != NULL) {
6789 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
6790 			anon_array_enter(amp, svd->anon_index + p, &cookie);
6791 			ap = anon_get_ptr(amp->ahp, svd->anon_index + p);
6792 			if (ap != NULL) {
6793 				swap_xlate(ap, &avp, &aoffset);
6794 			}
6795 			anon_array_exit(&cookie);
6796 			ANON_LOCK_EXIT(&amp->a_rwlock);
6797 		}
6798 		if ((avp != NULL) && page_exists(avp, aoffset)) {
6799 			/* A page exists for the anon slot */
6800 			ret |= SEG_PAGE_INCORE;
6801
6802 			/*
6803 			 * If page is mapped and writable
6804 			 */
6805 			attr = (uint_t)0;
6806 			if ((hat_getattr(seg->s_as->a_hat, addr,
6807 			    &attr) != -1) && (attr & PROT_WRITE)) {
6808 				ret |= SEG_PAGE_ANON;
6809 			}
6810 			/*
6811 			 * Don't get page_struct lock for lckcnt and cowcnt,
6812 			 * since this is purely advisory.
6813 			 */
6814 			if ((pp = page_lookup_nowait(avp, aoffset,
6815 			    SE_SHARED)) != NULL) {
6816 				if (pp->p_lckcnt)
6817 					ret |= SEG_PAGE_SOFTLOCK;
6818 				if (pp->p_cowcnt)
6819 					ret |= SEG_PAGE_HASCOW;
6820 				page_unlock(pp);
6821 			}
6822 		}
6823
6824 		/* Gather vnode statistics */
6825 		vp = svd->vp;
6826 		offset = svd->offset + (uintptr_t)(addr - seg->s_base);
6827
6828 		if (vp != NULL) {
6829 			/*
6830 			 * Try to obtain a "shared" lock on the page
6831 			 * without blocking. If this fails, determine
6832 			 * if the page is in memory.
6833 			 */
6834 			pp = page_lookup_nowait(vp, offset, SE_SHARED);
6835 			if ((pp == NULL) && (page_exists(vp, offset))) {
6836 				/* Page is incore, and is named */
6837 				ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE);
6838 			}
6839 			/*
6840 			 * Don't get page_struct lock for lckcnt and cowcnt,
6841 			 * since this is purely advisory.
6842 			 */
6843 			if (pp != NULL) {
6844 				ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE);
6845 				if (pp->p_lckcnt)
6846 					ret |= SEG_PAGE_SOFTLOCK;
6847 				if (pp->p_cowcnt)
6848 					ret |= SEG_PAGE_HASCOW;
6849 				page_unlock(pp);
6850 			}
6851 		}
6852
6853 		/* Gather virtual page information */
6854 		if (vpp) {
6855 			if (VPP_ISPPLOCK(vpp))
6856 				ret |= SEG_PAGE_LOCKED;
6857 			vpp++;
6858 		}
6859
6860 		*vec++ = (char)ret;
6861 	}
6862 	SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
6863 	return (len);
6864 }
6865
6866 /*
6867  * Statement for p_cowcnts/p_lckcnts.
6868 * 6869 * p_cowcnt is updated while mlock/munlocking MAP_PRIVATE and PROT_WRITE region 6870 * irrespective of the following factors or anything else: 6871 * 6872 * (1) anon slots are populated or not 6873 * (2) cow is broken or not 6874 * (3) refcnt on ap is 1 or greater than 1 6875 * 6876 * If it's not MAP_PRIVATE and PROT_WRITE, p_lckcnt is updated during mlock 6877 * and munlock. 6878 * 6879 * 6880 * Handling p_cowcnts/p_lckcnts during copy-on-write fault: 6881 * 6882 * if vpage has PROT_WRITE 6883 * transfer cowcnt on the oldpage -> cowcnt on the newpage 6884 * else 6885 * transfer lckcnt on the oldpage -> lckcnt on the newpage 6886 * 6887 * During copy-on-write, decrement p_cowcnt on the oldpage and increment 6888 * p_cowcnt on the newpage *if* the corresponding vpage has PROT_WRITE. 6889 * 6890 * We may also break COW if softlocking on read access in the physio case. 6891 * In this case, vpage may not have PROT_WRITE. So, we need to decrement 6892 * p_lckcnt on the oldpage and increment p_lckcnt on the newpage *if* the 6893 * vpage doesn't have PROT_WRITE. 6894 * 6895 * 6896 * Handling p_cowcnts/p_lckcnts during mprotect on mlocked region: 6897 * 6898 * If a MAP_PRIVATE region loses PROT_WRITE, we decrement p_cowcnt and 6899 * increment p_lckcnt by calling page_subclaim() which takes care of 6900 * availrmem accounting and p_lckcnt overflow. 6901 * 6902 * If a MAP_PRIVATE region gains PROT_WRITE, we decrement p_lckcnt and 6903 * increment p_cowcnt by calling page_addclaim() which takes care of 6904 * availrmem availability and p_cowcnt overflow. 6905 */ 6906 6907 /* 6908 * Lock down (or unlock) pages mapped by this segment. 6909 * 6910 * XXX only creates PAGESIZE pages if anon slots are not initialized. 6911 * At fault time they will be relocated into larger pages. 6912 */ 6913 static int 6914 segvn_lockop(struct seg *seg, caddr_t addr, size_t len, 6915 int attr, int op, ulong_t *lockmap, size_t pos) 6916 { 6917 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6918 struct vpage *vpp; 6919 struct vpage *evp; 6920 page_t *pp; 6921 u_offset_t offset; 6922 u_offset_t off; 6923 int segtype; 6924 int pageprot; 6925 int claim; 6926 struct vnode *vp; 6927 ulong_t anon_index; 6928 struct anon_map *amp; 6929 struct anon *ap; 6930 struct vattr va; 6931 anon_sync_obj_t cookie; 6932 6933 /* 6934 * Hold write lock on address space because may split or concatenate 6935 * segments 6936 */ 6937 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6938 6939 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 6940 if (attr) { 6941 pageprot = attr & ~(SHARED|PRIVATE); 6942 segtype = attr & SHARED ? MAP_SHARED : MAP_PRIVATE; 6943 6944 /* 6945 * We are done if the segment types don't match 6946 * or if we have segment level protections and 6947 * they don't match. 6948 */ 6949 if (svd->type != segtype) { 6950 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6951 return (0); 6952 } 6953 if (svd->pageprot == 0 && svd->prot != pageprot) { 6954 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6955 return (0); 6956 } 6957 } 6958 6959 /* 6960 * If we're locking, then we must create a vpage structure if 6961 * none exists. If we're unlocking, then check to see if there 6962 * is a vpage -- if not, then we could not have locked anything. 
6963 	 */
6964
6965 	if ((vpp = svd->vpage) == NULL) {
6966 		if (op == MC_LOCK)
6967 			segvn_vpage(seg);
6968 		else {
6969 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
6970 			return (0);
6971 		}
6972 	}
6973
6974 	/*
6975 	 * The anonymous data vector (i.e., previously
6976 	 * unreferenced mapping to swap space) can be allocated
6977 	 * by lazily testing for its existence.
6978 	 */
6979 	if (op == MC_LOCK && svd->amp == NULL && svd->vp == NULL) {
6980 		svd->amp = anonmap_alloc(seg->s_size, 0);
6981 		svd->amp->a_szc = seg->s_szc;
6982 	}
6983
6984 	if ((amp = svd->amp) != NULL) {
6985 		anon_index = svd->anon_index + seg_page(seg, addr);
6986 	}
6987
6988 	offset = svd->offset + (uintptr_t)(addr - seg->s_base);
6989 	evp = &svd->vpage[seg_page(seg, addr + len)];
6990
6991 	/*
6992 	 * Loop over all pages in the range. Process if we're locking and
6993 	 * page has not already been locked in this mapping; or if we're
6994 	 * unlocking and the page has been locked.
6995 	 */
6996 	for (vpp = &svd->vpage[seg_page(seg, addr)]; vpp < evp;
6997 	    vpp++, pos++, addr += PAGESIZE, offset += PAGESIZE, anon_index++) {
6998 		if ((attr == 0 || VPP_PROT(vpp) == pageprot) &&
6999 		    ((op == MC_LOCK && !VPP_ISPPLOCK(vpp)) ||
7000 		    (op == MC_UNLOCK && VPP_ISPPLOCK(vpp)))) {
7001
7002 			if (amp != NULL)
7003 				ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
7004 			/*
7005 			 * If this isn't a MAP_NORESERVE segment and
7006 			 * we're locking, allocate anon slots if they
7007 			 * don't exist. The page is brought in later on.
7008 			 */
7009 			if (op == MC_LOCK && svd->vp == NULL &&
7010 			    ((svd->flags & MAP_NORESERVE) == 0) &&
7011 			    amp != NULL &&
7012 			    ((ap = anon_get_ptr(amp->ahp, anon_index))
7013 			    == NULL)) {
7014 				anon_array_enter(amp, anon_index, &cookie);
7015
7016 				if ((ap = anon_get_ptr(amp->ahp,
7017 				    anon_index)) == NULL) {
7018 					pp = anon_zero(seg, addr, &ap,
7019 					    svd->cred);
7020 					if (pp == NULL) {
7021 						anon_array_exit(&cookie);
7022 						ANON_LOCK_EXIT(&amp->a_rwlock);
7023 						SEGVN_LOCK_EXIT(seg->s_as,
7024 						    &svd->lock);
7025 						return (ENOMEM);
7026 					}
7027 					ASSERT(anon_get_ptr(amp->ahp,
7028 					    anon_index) == NULL);
7029 					(void) anon_set_ptr(amp->ahp,
7030 					    anon_index, ap, ANON_SLEEP);
7031 					page_unlock(pp);
7032 				}
7033 				anon_array_exit(&cookie);
7034 			}
7035
7036 			/*
7037 			 * Get name for page, accounting for
7038 			 * existence of private copy.
7039 			 */
7040 			ap = NULL;
7041 			if (amp != NULL) {
7042 				anon_array_enter(amp, anon_index, &cookie);
7043 				ap = anon_get_ptr(amp->ahp, anon_index);
7044 				if (ap != NULL) {
7045 					swap_xlate(ap, &vp, &off);
7046 				} else {
7047 					if (svd->vp == NULL &&
7048 					    (svd->flags & MAP_NORESERVE)) {
7049 						anon_array_exit(&cookie);
7050 						ANON_LOCK_EXIT(&amp->a_rwlock);
7051 						continue;
7052 					}
7053 					vp = svd->vp;
7054 					off = offset;
7055 				}
7056 				anon_array_exit(&cookie);
7057 				ANON_LOCK_EXIT(&amp->a_rwlock);
7058 			} else {
7059 				vp = svd->vp;
7060 				off = offset;
7061 			}
7062
7063 			/*
7064 			 * Get page frame. It's ok if the page is
7065 			 * not available when we're unlocking, as this
7066 			 * may simply mean that a page we locked got
7067 			 * truncated out of existence after we locked it.
7068 			 *
7069 			 * Invoke VOP_GETPAGE() to obtain the page struct
7070 			 * since we may need to read it from disk if it's
7071 			 * been paged out.
7072 */ 7073 if (op != MC_LOCK) 7074 pp = page_lookup(vp, off, SE_SHARED); 7075 else { 7076 page_t *pl[1 + 1]; 7077 int error; 7078 7079 ASSERT(vp != NULL); 7080 7081 error = VOP_GETPAGE(vp, (offset_t)off, PAGESIZE, 7082 (uint_t *)NULL, pl, PAGESIZE, seg, addr, 7083 S_OTHER, svd->cred); 7084 7085 /* 7086 * If the error is EDEADLK then we must bounce 7087 * up and drop all vm subsystem locks and then 7088 * retry the operation later 7089 * This behavior is a temporary measure because 7090 * ufs/sds logging is badly designed and will 7091 * deadlock if we don't allow this bounce to 7092 * happen. The real solution is to re-design 7093 * the logging code to work properly. See bug 7094 * 4125102 for details of the problem. 7095 */ 7096 if (error == EDEADLK) { 7097 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7098 return (error); 7099 } 7100 /* 7101 * Quit if we fail to fault in the page. Treat 7102 * the failure as an error, unless the addr 7103 * is mapped beyond the end of a file. 7104 */ 7105 if (error && svd->vp) { 7106 va.va_mask = AT_SIZE; 7107 if (VOP_GETATTR(svd->vp, &va, 0, 7108 svd->cred) != 0) { 7109 SEGVN_LOCK_EXIT(seg->s_as, 7110 &svd->lock); 7111 return (EIO); 7112 } 7113 if (btopr(va.va_size) >= 7114 btopr(off + 1)) { 7115 SEGVN_LOCK_EXIT(seg->s_as, 7116 &svd->lock); 7117 return (EIO); 7118 } 7119 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7120 return (0); 7121 } else if (error) { 7122 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7123 return (EIO); 7124 } 7125 pp = pl[0]; 7126 ASSERT(pp != NULL); 7127 } 7128 7129 /* 7130 * See Statement at the beginning of this routine. 7131 * 7132 * claim is always set if MAP_PRIVATE and PROT_WRITE 7133 * irrespective of following factors: 7134 * 7135 * (1) anon slots are populated or not 7136 * (2) cow is broken or not 7137 * (3) refcnt on ap is 1 or greater than 1 7138 * 7139 * See 4140683 for details 7140 */ 7141 claim = ((VPP_PROT(vpp) & PROT_WRITE) && 7142 (svd->type == MAP_PRIVATE)); 7143 7144 /* 7145 * Perform page-level operation appropriate to 7146 * operation. If locking, undo the SOFTLOCK 7147 * performed to bring the page into memory 7148 * after setting the lock. If unlocking, 7149 * and no page was found, account for the claim 7150 * separately. 7151 */ 7152 if (op == MC_LOCK) { 7153 int ret = 1; /* Assume success */ 7154 7155 /* 7156 * Make sure another thread didn't lock 7157 * the page after we released the segment 7158 * lock. 
7159 */ 7160 if ((attr == 0 || VPP_PROT(vpp) == pageprot) && 7161 !VPP_ISPPLOCK(vpp)) { 7162 ret = page_pp_lock(pp, claim, 0); 7163 if (ret != 0) { 7164 VPP_SETPPLOCK(vpp); 7165 if (lockmap != (ulong_t *)NULL) 7166 BT_SET(lockmap, pos); 7167 } 7168 } 7169 page_unlock(pp); 7170 if (ret == 0) { 7171 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7172 return (EAGAIN); 7173 } 7174 } else { 7175 if (pp != NULL) { 7176 if ((attr == 0 || 7177 VPP_PROT(vpp) == pageprot) && 7178 VPP_ISPPLOCK(vpp)) 7179 page_pp_unlock(pp, claim, 0); 7180 page_unlock(pp); 7181 } 7182 VPP_CLRPPLOCK(vpp); 7183 } 7184 } 7185 } 7186 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7187 return (0); 7188 } 7189 7190 /* 7191 * Set advice from user for specified pages 7192 * There are 5 types of advice: 7193 * MADV_NORMAL - Normal (default) behavior (whatever that is) 7194 * MADV_RANDOM - Random page references 7195 * do not allow readahead or 'klustering' 7196 * MADV_SEQUENTIAL - Sequential page references 7197 * Pages previous to the one currently being 7198 * accessed (determined by fault) are 'not needed' 7199 * and are freed immediately 7200 * MADV_WILLNEED - Pages are likely to be used (fault ahead in mctl) 7201 * MADV_DONTNEED - Pages are not needed (synced out in mctl) 7202 * MADV_FREE - Contents can be discarded 7203 * MADV_ACCESS_DEFAULT- Default access 7204 * MADV_ACCESS_LWP - Next LWP will access heavily 7205 * MADV_ACCESS_MANY- Many LWPs or processes will access heavily 7206 */ 7207 static int 7208 segvn_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav) 7209 { 7210 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7211 size_t page; 7212 int err = 0; 7213 int already_set; 7214 struct anon_map *amp; 7215 ulong_t anon_index; 7216 struct seg *next; 7217 lgrp_mem_policy_t policy; 7218 struct seg *prev; 7219 struct vnode *vp; 7220 7221 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 7222 7223 /* 7224 * In case of MADV_FREE, we won't be modifying any segment private 7225 * data structures; so, we only need to grab READER's lock 7226 */ 7227 if (behav != MADV_FREE) 7228 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 7229 else 7230 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7231 7232 /* 7233 * Large pages are assumed to be only turned on when accesses to the 7234 * segment's address range have spatial and temporal locality. That 7235 * justifies ignoring MADV_SEQUENTIAL for large page segments. 7236 * Also, ignore advice affecting lgroup memory allocation 7237 * if don't need to do lgroup optimizations on this system 7238 */ 7239 7240 if ((behav == MADV_SEQUENTIAL && seg->s_szc != 0) || 7241 (!lgrp_optimizations() && (behav == MADV_ACCESS_DEFAULT || 7242 behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY))) { 7243 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7244 return (0); 7245 } 7246 7247 if (behav == MADV_SEQUENTIAL || behav == MADV_ACCESS_DEFAULT || 7248 behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY) { 7249 /* 7250 * Since we are going to unload hat mappings 7251 * we first have to flush the cache. Otherwise 7252 * this might lead to system panic if another 7253 * thread is doing physio on the range whose 7254 * mappings are unloaded by madvise(3C). 7255 */ 7256 if (svd->softlockcnt > 0) { 7257 /* 7258 * Since we do have the segvn writers lock 7259 * nobody can fill the cache with entries 7260 * belonging to this seg during the purge. 7261 * The flush either succeeds or we still 7262 * have pending I/Os. In the later case, 7263 * madvise(3C) fails. 
7264 			 */
7265 			segvn_purge(seg);
7266 			if (svd->softlockcnt > 0) {
7267 				/*
7268 				 * Since madvise(3C) is advisory and
7269 				 * it's not part of UNIX98, madvise(3C)
7270 				 * failure here doesn't cause any hardship.
7271 				 * Note that we don't block in "as" layer.
7272 				 */
7273 				SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7274 				return (EAGAIN);
7275 			}
7276 		}
7277 	}
7278
7279 	amp = svd->amp;
7280 	vp = svd->vp;
7281 	if (behav == MADV_FREE) {
7282 		/*
7283 		 * MADV_FREE is not supported for segments with
7284 		 * underlying object; if anonmap is NULL, anon slots
7285 		 * are not yet populated and there is nothing for
7286 		 * us to do. As MADV_FREE is advisory, we don't
7287 		 * return error in either case.
7288 		 */
7289 		if (vp || amp == NULL) {
7290 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7291 			return (0);
7292 		}
7293
7294 		page = seg_page(seg, addr);
7295 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
7296 		anon_disclaim(amp, svd->anon_index + page, len, 0);
7297 		ANON_LOCK_EXIT(&amp->a_rwlock);
7298 		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7299 		return (0);
7300 	}
7301
7302 	/*
7303 	 * If advice is to be applied to entire segment,
7304 	 * use advice field in seg_data structure
7305 	 * otherwise use appropriate vpage entry.
7306 	 */
7307 	if ((addr == seg->s_base) && (len == seg->s_size)) {
7308 		switch (behav) {
7309 		case MADV_ACCESS_LWP:
7310 		case MADV_ACCESS_MANY:
7311 		case MADV_ACCESS_DEFAULT:
7312 			/*
7313 			 * Set memory allocation policy for this segment
7314 			 */
7315 			policy = lgrp_madv_to_policy(behav, len, svd->type);
7316 			if (svd->type == MAP_SHARED)
7317 				already_set = lgrp_shm_policy_set(policy, amp,
7318 				    svd->anon_index, vp, svd->offset, len);
7319 			else {
7320 				/*
7321 				 * For private memory, need writers lock on
7322 				 * address space because the segment may be
7323 				 * split or concatenated when changing policy
7324 				 */
7325 				if (AS_READ_HELD(seg->s_as,
7326 				    &seg->s_as->a_lock)) {
7327 					SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7328 					return (IE_RETRY);
7329 				}
7330
7331 				already_set = lgrp_privm_policy_set(policy,
7332 				    &svd->policy_info, len);
7333 			}
7334
7335 			/*
7336 			 * If policy set already and it shouldn't be reapplied,
7337 			 * don't do anything.
7338 			 */
7339 			if (already_set &&
7340 			    !LGRP_MEM_POLICY_REAPPLICABLE(policy))
7341 				break;
7342
7343 			/*
7344 			 * Mark any existing pages in given range for
7345 			 * migration
7346 			 */
7347 			page_mark_migrate(seg, addr, len, amp, svd->anon_index,
7348 			    vp, svd->offset, 1);
7349
7350 			/*
7351 			 * If same policy set already or this is a shared
7352 			 * memory segment, don't need to try to concatenate
7353 			 * segment with adjacent ones.
7354 			 */
7355 			if (already_set || svd->type == MAP_SHARED)
7356 				break;
7357
7358 			/*
7359 			 * Try to concatenate this segment with previous
7360 			 * one and next one, since we changed policy for
7361 			 * this one and it may be compatible with adjacent
7362 			 * ones now.
7363 */ 7364 prev = AS_SEGPREV(seg->s_as, seg); 7365 next = AS_SEGNEXT(seg->s_as, seg); 7366 7367 if (next && next->s_ops == &segvn_ops && 7368 addr + len == next->s_base) 7369 (void) segvn_concat(seg, next, 1); 7370 7371 if (prev && prev->s_ops == &segvn_ops && 7372 addr == prev->s_base + prev->s_size) { 7373 /* 7374 * Drop lock for private data of current 7375 * segment before concatenating (deleting) it 7376 * and return IE_REATTACH to tell as_ctl() that 7377 * current segment has changed 7378 */ 7379 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7380 if (!segvn_concat(prev, seg, 1)) 7381 err = IE_REATTACH; 7382 7383 return (err); 7384 } 7385 break; 7386 7387 case MADV_SEQUENTIAL: 7388 /* 7389 * unloading mapping guarantees 7390 * detection in segvn_fault 7391 */ 7392 ASSERT(seg->s_szc == 0); 7393 hat_unload(seg->s_as->a_hat, addr, len, 7394 HAT_UNLOAD); 7395 /* FALLTHROUGH */ 7396 case MADV_NORMAL: 7397 case MADV_RANDOM: 7398 svd->advice = (uchar_t)behav; 7399 svd->pageadvice = 0; 7400 break; 7401 case MADV_WILLNEED: /* handled in memcntl */ 7402 case MADV_DONTNEED: /* handled in memcntl */ 7403 case MADV_FREE: /* handled above */ 7404 break; 7405 default: 7406 err = EINVAL; 7407 } 7408 } else { 7409 caddr_t eaddr; 7410 struct seg *new_seg; 7411 struct segvn_data *new_svd; 7412 u_offset_t off; 7413 caddr_t oldeaddr; 7414 7415 page = seg_page(seg, addr); 7416 7417 segvn_vpage(seg); 7418 7419 switch (behav) { 7420 struct vpage *bvpp, *evpp; 7421 7422 case MADV_ACCESS_LWP: 7423 case MADV_ACCESS_MANY: 7424 case MADV_ACCESS_DEFAULT: 7425 /* 7426 * Set memory allocation policy for portion of this 7427 * segment 7428 */ 7429 7430 /* 7431 * Align address and length of advice to page 7432 * boundaries for large pages 7433 */ 7434 if (seg->s_szc != 0) { 7435 size_t pgsz; 7436 7437 pgsz = page_get_pagesize(seg->s_szc); 7438 addr = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 7439 len = P2ROUNDUP(len, pgsz); 7440 } 7441 7442 /* 7443 * Check to see whether policy is set already 7444 */ 7445 policy = lgrp_madv_to_policy(behav, len, svd->type); 7446 7447 anon_index = svd->anon_index + page; 7448 off = svd->offset + (uintptr_t)(addr - seg->s_base); 7449 7450 if (svd->type == MAP_SHARED) 7451 already_set = lgrp_shm_policy_set(policy, amp, 7452 anon_index, vp, off, len); 7453 else 7454 already_set = 7455 (policy == svd->policy_info.mem_policy); 7456 7457 /* 7458 * If policy set already and it shouldn't be reapplied, 7459 * don't do anything. 
7460 */ 7461 if (already_set && 7462 !LGRP_MEM_POLICY_REAPPLICABLE(policy)) 7463 break; 7464 7465 /* 7466 * For private memory, need writers lock on 7467 * address space because the segment may be 7468 * split or concatenated when changing policy 7469 */ 7470 if (svd->type == MAP_PRIVATE && 7471 AS_READ_HELD(seg->s_as, &seg->s_as->a_lock)) { 7472 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7473 return (IE_RETRY); 7474 } 7475 7476 /* 7477 * Mark any existing pages in given range for 7478 * migration 7479 */ 7480 page_mark_migrate(seg, addr, len, amp, svd->anon_index, 7481 vp, svd->offset, 1); 7482 7483 /* 7484 * Don't need to try to split or concatenate 7485 * segments, since policy is same or this is a shared 7486 * memory segment 7487 */ 7488 if (already_set || svd->type == MAP_SHARED) 7489 break; 7490 7491 /* 7492 * Split off new segment if advice only applies to a 7493 * portion of existing segment starting in middle 7494 */ 7495 new_seg = NULL; 7496 eaddr = addr + len; 7497 oldeaddr = seg->s_base + seg->s_size; 7498 if (addr > seg->s_base) { 7499 /* 7500 * Must flush I/O page cache 7501 * before splitting segment 7502 */ 7503 if (svd->softlockcnt > 0) 7504 segvn_purge(seg); 7505 7506 /* 7507 * Split segment and return IE_REATTACH to tell 7508 * as_ctl() that current segment changed 7509 */ 7510 new_seg = segvn_split_seg(seg, addr); 7511 new_svd = (struct segvn_data *)new_seg->s_data; 7512 err = IE_REATTACH; 7513 7514 /* 7515 * If new segment ends where old one 7516 * did, try to concatenate the new 7517 * segment with next one. 7518 */ 7519 if (eaddr == oldeaddr) { 7520 /* 7521 * Set policy for new segment 7522 */ 7523 (void) lgrp_privm_policy_set(policy, 7524 &new_svd->policy_info, 7525 new_seg->s_size); 7526 7527 next = AS_SEGNEXT(new_seg->s_as, 7528 new_seg); 7529 7530 if (next && 7531 next->s_ops == &segvn_ops && 7532 eaddr == next->s_base) 7533 (void) segvn_concat(new_seg, 7534 next, 1); 7535 } 7536 } 7537 7538 /* 7539 * Split off end of existing segment if advice only 7540 * applies to a portion of segment ending before 7541 * end of the existing segment 7542 */ 7543 if (eaddr < oldeaddr) { 7544 /* 7545 * Must flush I/O page cache 7546 * before splitting segment 7547 */ 7548 if (svd->softlockcnt > 0) 7549 segvn_purge(seg); 7550 7551 /* 7552 * If beginning of old segment was already 7553 * split off, use new segment to split end off 7554 * from. 7555 */ 7556 if (new_seg != NULL && new_seg != seg) { 7557 /* 7558 * Split segment 7559 */ 7560 (void) segvn_split_seg(new_seg, eaddr); 7561 7562 /* 7563 * Set policy for new segment 7564 */ 7565 (void) lgrp_privm_policy_set(policy, 7566 &new_svd->policy_info, 7567 new_seg->s_size); 7568 } else { 7569 /* 7570 * Split segment and return IE_REATTACH 7571 * to tell as_ctl() that current 7572 * segment changed 7573 */ 7574 (void) segvn_split_seg(seg, eaddr); 7575 err = IE_REATTACH; 7576 7577 (void) lgrp_privm_policy_set(policy, 7578 &svd->policy_info, seg->s_size); 7579 7580 /* 7581 * If new segment starts where old one 7582 * did, try to concatenate it with 7583 * previous segment. 
7584 */ 7585 if (addr == seg->s_base) { 7586 prev = AS_SEGPREV(seg->s_as, 7587 seg); 7588 7589 /* 7590 * Drop lock for private data 7591 * of current segment before 7592 * concatenating (deleting) it 7593 */ 7594 if (prev && 7595 prev->s_ops == 7596 &segvn_ops && 7597 addr == prev->s_base + 7598 prev->s_size) { 7599 SEGVN_LOCK_EXIT( 7600 seg->s_as, 7601 &svd->lock); 7602 (void) segvn_concat( 7603 prev, seg, 1); 7604 return (err); 7605 } 7606 } 7607 } 7608 } 7609 break; 7610 case MADV_SEQUENTIAL: 7611 ASSERT(seg->s_szc == 0); 7612 hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD); 7613 /* FALLTHROUGH */ 7614 case MADV_NORMAL: 7615 case MADV_RANDOM: 7616 bvpp = &svd->vpage[page]; 7617 evpp = &svd->vpage[page + (len >> PAGESHIFT)]; 7618 for (; bvpp < evpp; bvpp++) 7619 VPP_SETADVICE(bvpp, behav); 7620 svd->advice = MADV_NORMAL; 7621 break; 7622 case MADV_WILLNEED: /* handled in memcntl */ 7623 case MADV_DONTNEED: /* handled in memcntl */ 7624 case MADV_FREE: /* handled above */ 7625 break; 7626 default: 7627 err = EINVAL; 7628 } 7629 } 7630 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7631 return (err); 7632 } 7633 7634 /* 7635 * Create a vpage structure for this seg. 7636 */ 7637 static void 7638 segvn_vpage(struct seg *seg) 7639 { 7640 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7641 struct vpage *vp, *evp; 7642 7643 ASSERT(SEGVN_WRITE_HELD(seg->s_as, &svd->lock)); 7644 7645 /* 7646 * If no vpage structure exists, allocate one. Copy the protections 7647 * and the advice from the segment itself to the individual pages. 7648 */ 7649 if (svd->vpage == NULL) { 7650 svd->pageprot = 1; 7651 svd->pageadvice = 1; 7652 svd->vpage = kmem_zalloc(seg_pages(seg) * sizeof (struct vpage), 7653 KM_SLEEP); 7654 evp = &svd->vpage[seg_page(seg, seg->s_base + seg->s_size)]; 7655 for (vp = svd->vpage; vp < evp; vp++) { 7656 VPP_SETPROT(vp, svd->prot); 7657 VPP_SETADVICE(vp, svd->advice); 7658 } 7659 } 7660 } 7661 7662 /* 7663 * Dump the pages belonging to this segvn segment. 7664 */ 7665 static void 7666 segvn_dump(struct seg *seg) 7667 { 7668 struct segvn_data *svd; 7669 page_t *pp; 7670 struct anon_map *amp; 7671 ulong_t anon_index; 7672 struct vnode *vp; 7673 u_offset_t off, offset; 7674 pfn_t pfn; 7675 pgcnt_t page, npages; 7676 caddr_t addr; 7677 7678 npages = seg_pages(seg); 7679 svd = (struct segvn_data *)seg->s_data; 7680 vp = svd->vp; 7681 off = offset = svd->offset; 7682 addr = seg->s_base; 7683 7684 if ((amp = svd->amp) != NULL) { 7685 anon_index = svd->anon_index; 7686 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7687 } 7688 7689 for (page = 0; page < npages; page++, offset += PAGESIZE) { 7690 struct anon *ap; 7691 int we_own_it = 0; 7692 7693 if (amp && (ap = anon_get_ptr(svd->amp->ahp, anon_index++))) { 7694 swap_xlate_nopanic(ap, &vp, &off); 7695 } else { 7696 vp = svd->vp; 7697 off = offset; 7698 } 7699 7700 /* 7701 * If pp == NULL, the page either does not exist 7702 * or is exclusively locked. So determine if it 7703 * exists before searching for it. 7704 */ 7705 7706 if ((pp = page_lookup_nowait(vp, off, SE_SHARED))) 7707 we_own_it = 1; 7708 else 7709 pp = page_exists(vp, off); 7710 7711 if (pp) { 7712 pfn = page_pptonum(pp); 7713 dump_addpage(seg->s_as, addr, pfn); 7714 if (we_own_it) 7715 page_unlock(pp); 7716 } 7717 addr += PAGESIZE; 7718 dump_timeleft = dump_timeout; 7719 } 7720 7721 if (amp != NULL) 7722 ANON_LOCK_EXIT(&->a_rwlock); 7723 } 7724 7725 /* 7726 * lock/unlock anon pages over a given range. 
Return shadow list 7727 */ 7728 static int 7729 segvn_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***ppp, 7730 enum lock_type type, enum seg_rw rw) 7731 { 7732 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7733 size_t np, adjustpages = 0, npages = (len >> PAGESHIFT); 7734 ulong_t anon_index; 7735 uint_t protchk; 7736 uint_t error; 7737 struct anon_map *amp; 7738 struct page **pplist, **pl, *pp; 7739 caddr_t a; 7740 size_t page; 7741 caddr_t lpgaddr, lpgeaddr; 7742 pgcnt_t szc0_npages = 0; 7743 7744 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_START, 7745 "segvn_pagelock: start seg %p addr %p", seg, addr); 7746 7747 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 7748 if (seg->s_szc != 0 && (type == L_PAGELOCK || type == L_PAGEUNLOCK)) { 7749 /* 7750 * We are adjusting the pagelock region to the large page size 7751 * boundary because the unlocked part of a large page cannot 7752 * be freed anyway unless all constituent pages of a large 7753 * page are locked. Therefore this adjustment allows us to 7754 * decrement availrmem by the right value (note we don't want 7755 * to just decrement availrem by the large page size without 7756 * adjusting addr and len because then we may end up 7757 * decrementing availrmem by large page size for every 7758 * constituent page locked by a new as_pagelock call). 7759 * as_pageunlock caller must always match as_pagelock call's 7760 * addr and len. 7761 * 7762 * Note segment's page size cannot change while we are holding 7763 * as lock. And then it cannot change while softlockcnt is 7764 * not 0. This will allow us to correctly recalculate large 7765 * page size region for the matching pageunlock/reclaim call. 7766 * 7767 * for pageunlock *ppp points to the pointer of page_t that 7768 * corresponds to the real unadjusted start address. Similar 7769 * for pagelock *ppp must point to the pointer of page_t that 7770 * corresponds to the real unadjusted start address. 7771 */ 7772 size_t pgsz = page_get_pagesize(seg->s_szc); 7773 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 7774 adjustpages = ((uintptr_t)(addr - lpgaddr)) >> PAGESHIFT; 7775 } 7776 7777 if (type == L_PAGEUNLOCK) { 7778 7779 /* 7780 * update hat ref bits for /proc. We need to make sure 7781 * that threads tracing the ref and mod bits of the 7782 * address space get the right data. 7783 * Note: page ref and mod bits are updated at reclaim time 7784 */ 7785 if (seg->s_as->a_vbits) { 7786 for (a = addr; a < addr + len; a += PAGESIZE) { 7787 if (rw == S_WRITE) { 7788 hat_setstat(seg->s_as, a, 7789 PAGESIZE, P_REF | P_MOD); 7790 } else { 7791 hat_setstat(seg->s_as, a, 7792 PAGESIZE, P_REF); 7793 } 7794 } 7795 } 7796 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7797 if (seg->s_szc != 0) { 7798 VM_STAT_ADD(segvnvmstats.pagelock[0]); 7799 seg_pinactive(seg, lpgaddr, lpgeaddr - lpgaddr, 7800 *ppp - adjustpages, rw, segvn_reclaim); 7801 } else { 7802 seg_pinactive(seg, addr, len, *ppp, rw, segvn_reclaim); 7803 } 7804 7805 /* 7806 * If someone is blocked while unmapping, we purge 7807 * segment page cache and thus reclaim pplist synchronously 7808 * without waiting for seg_pasync_thread. This speeds up 7809 * unmapping in cases where munmap(2) is called, while 7810 * raw async i/o is still in progress or where a thread 7811 * exits on data fault in a multithreaded application. 
7812 */ 7813 if (AS_ISUNMAPWAIT(seg->s_as) && (svd->softlockcnt > 0)) { 7814 /* 7815 * Even if we grab segvn WRITER's lock or segp_slock 7816 * here, there might be another thread which could've 7817 * successfully performed lookup/insert just before 7818 * we acquired the lock here. So, grabbing either 7819 * lock here is of not much use. Until we devise 7820 * a strategy at upper layers to solve the 7821 * synchronization issues completely, we expect 7822 * applications to handle this appropriately. 7823 */ 7824 segvn_purge(seg); 7825 } 7826 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7827 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END, 7828 "segvn_pagelock: unlock seg %p addr %p", seg, addr); 7829 return (0); 7830 } else if (type == L_PAGERECLAIM) { 7831 VM_STAT_COND_ADD(seg->s_szc != 0, segvnvmstats.pagelock[1]); 7832 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7833 (void) segvn_reclaim(seg, addr, len, *ppp, rw); 7834 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7835 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END, 7836 "segvn_pagelock: reclaim seg %p addr %p", seg, addr); 7837 return (0); 7838 } 7839 7840 if (seg->s_szc != 0) { 7841 VM_STAT_ADD(segvnvmstats.pagelock[2]); 7842 addr = lpgaddr; 7843 len = lpgeaddr - lpgaddr; 7844 npages = (len >> PAGESHIFT); 7845 } 7846 7847 /* 7848 * for now we only support pagelock to anon memory. We've to check 7849 * protections for vnode objects and call into the vnode driver. 7850 * That's too much for a fast path. Let the fault entry point handle it. 7851 */ 7852 if (svd->vp != NULL) { 7853 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, 7854 "segvn_pagelock: mapped vnode seg %p addr %p", seg, addr); 7855 *ppp = NULL; 7856 return (ENOTSUP); 7857 } 7858 7859 /* 7860 * if anonmap is not yet created, let the fault entry point populate it 7861 * with anon ptrs. 7862 */ 7863 if ((amp = svd->amp) == NULL) { 7864 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, 7865 "segvn_pagelock: anonmap null seg %p addr %p", seg, addr); 7866 *ppp = NULL; 7867 return (EFAULT); 7868 } 7869 7870 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7871 7872 /* 7873 * we acquire segp_slock to prevent duplicate entries 7874 * in seg_pcache 7875 */ 7876 mutex_enter(&svd->segp_slock); 7877 7878 /* 7879 * try to find pages in segment page cache 7880 */ 7881 pplist = seg_plookup(seg, addr, len, rw); 7882 if (pplist != NULL) { 7883 mutex_exit(&svd->segp_slock); 7884 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7885 *ppp = pplist + adjustpages; 7886 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_HIT_END, 7887 "segvn_pagelock: cache hit seg %p addr %p", seg, addr); 7888 return (0); 7889 } 7890 7891 if (rw == S_READ) { 7892 protchk = PROT_READ; 7893 } else { 7894 protchk = PROT_WRITE; 7895 } 7896 7897 if (svd->pageprot == 0) { 7898 if ((svd->prot & protchk) == 0) { 7899 mutex_exit(&svd->segp_slock); 7900 error = EFAULT; 7901 goto out; 7902 } 7903 } else { 7904 /* 7905 * check page protections 7906 */ 7907 for (a = addr; a < addr + len; a += PAGESIZE) { 7908 struct vpage *vp; 7909 7910 vp = &svd->vpage[seg_page(seg, a)]; 7911 if ((VPP_PROT(vp) & protchk) == 0) { 7912 mutex_exit(&svd->segp_slock); 7913 error = EFAULT; 7914 goto out; 7915 } 7916 } 7917 } 7918 7919 /* 7920 * Avoid per page overhead of segvn_pp_lock_anonpages() for small 7921 * pages. For large pages segvn_pp_lock_anonpages() only does real 7922 * work once per large page. The tradeoff is that we may decrement 7923 * availrmem more than once for the same page but this is ok 7924 * for small pages. 
7925 */ 7926 if (seg->s_szc == 0) { 7927 mutex_enter(&freemem_lock); 7928 if (availrmem < tune.t_minarmem + npages) { 7929 mutex_exit(&freemem_lock); 7930 mutex_exit(&svd->segp_slock); 7931 error = ENOMEM; 7932 goto out; 7933 } 7934 availrmem -= npages; 7935 mutex_exit(&freemem_lock); 7936 } 7937 7938 pplist = kmem_alloc(sizeof (page_t *) * npages, KM_SLEEP); 7939 pl = pplist; 7940 *ppp = pplist + adjustpages; 7941 7942 page = seg_page(seg, addr); 7943 anon_index = svd->anon_index + page; 7944 7945 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7946 for (a = addr; a < addr + len; a += PAGESIZE, anon_index++) { 7947 struct anon *ap; 7948 struct vnode *vp; 7949 u_offset_t off; 7950 anon_sync_obj_t cookie; 7951 7952 anon_array_enter(amp, anon_index, &cookie); 7953 ap = anon_get_ptr(amp->ahp, anon_index); 7954 if (ap == NULL) { 7955 anon_array_exit(&cookie); 7956 break; 7957 } else { 7958 /* 7959 * We must never use seg_pcache for COW pages 7960 * because we might end up with original page still 7961 * lying in seg_pcache even after private page is 7962 * created. This leads to data corruption as 7963 * aio_write refers to the page still in cache 7964 * while all other accesses refer to the private 7965 * page. 7966 */ 7967 if (ap->an_refcnt != 1) { 7968 anon_array_exit(&cookie); 7969 break; 7970 } 7971 } 7972 swap_xlate(ap, &vp, &off); 7973 anon_array_exit(&cookie); 7974 7975 pp = page_lookup_nowait(vp, off, SE_SHARED); 7976 if (pp == NULL) { 7977 break; 7978 } 7979 if (seg->s_szc != 0 || pp->p_szc != 0) { 7980 if (!segvn_pp_lock_anonpages(pp, a == addr)) { 7981 page_unlock(pp); 7982 break; 7983 } 7984 } else { 7985 szc0_npages++; 7986 } 7987 *pplist++ = pp; 7988 } 7989 ANON_LOCK_EXIT(&->a_rwlock); 7990 7991 ASSERT(npages >= szc0_npages); 7992 7993 if (a >= addr + len) { 7994 mutex_enter(&freemem_lock); 7995 if (seg->s_szc == 0 && npages != szc0_npages) { 7996 ASSERT(svd->type == MAP_SHARED && amp->a_szc > 0); 7997 availrmem += (npages - szc0_npages); 7998 } 7999 svd->softlockcnt += npages; 8000 segvn_pages_locked += npages; 8001 mutex_exit(&freemem_lock); 8002 (void) seg_pinsert(seg, addr, len, pl, rw, SEGP_ASYNC_FLUSH, 8003 segvn_reclaim); 8004 mutex_exit(&svd->segp_slock); 8005 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8006 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_FILL_END, 8007 "segvn_pagelock: cache fill seg %p addr %p", seg, addr); 8008 return (0); 8009 } 8010 8011 mutex_exit(&svd->segp_slock); 8012 if (seg->s_szc == 0) { 8013 mutex_enter(&freemem_lock); 8014 availrmem += npages; 8015 mutex_exit(&freemem_lock); 8016 } 8017 error = EFAULT; 8018 pplist = pl; 8019 np = ((uintptr_t)(a - addr)) >> PAGESHIFT; 8020 while (np > (uint_t)0) { 8021 ASSERT(PAGE_LOCKED(*pplist)); 8022 if (seg->s_szc != 0 || (*pplist)->p_szc != 0) { 8023 segvn_pp_unlock_anonpages(*pplist, pplist == pl); 8024 } 8025 page_unlock(*pplist); 8026 np--; 8027 pplist++; 8028 } 8029 kmem_free(pl, sizeof (page_t *) * npages); 8030 out: 8031 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8032 *ppp = NULL; 8033 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, 8034 "segvn_pagelock: cache miss seg %p addr %p", seg, addr); 8035 return (error); 8036 } 8037 8038 /* 8039 * purge any cached pages in the I/O page cache 8040 */ 8041 static void 8042 segvn_purge(struct seg *seg) 8043 { 8044 seg_ppurge(seg); 8045 } 8046 8047 static int 8048 segvn_reclaim(struct seg *seg, caddr_t addr, size_t len, struct page **pplist, 8049 enum seg_rw rw) 8050 { 8051 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 8052 pgcnt_t np, npages; 8053 struct page **pl; 
static int
segvn_reclaim(struct seg *seg, caddr_t addr, size_t len, struct page **pplist,
	enum seg_rw rw)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	pgcnt_t np, npages;
	struct page **pl;
	pgcnt_t szc0_npages = 0;

#ifdef lint
	addr = addr;
#endif

	npages = np = (len >> PAGESHIFT);
	ASSERT(npages);
	pl = pplist;
	if (seg->s_szc != 0) {
		size_t pgsz = page_get_pagesize(seg->s_szc);
		if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) {
			panic("segvn_reclaim: unaligned addr or len");
			/*NOTREACHED*/
		}
	}

	ASSERT(svd->vp == NULL && svd->amp != NULL);

	while (np > (uint_t)0) {
		if (rw == S_WRITE) {
			hat_setrefmod(*pplist);
		} else {
			hat_setref(*pplist);
		}
		if (seg->s_szc != 0 || (*pplist)->p_szc != 0) {
			segvn_pp_unlock_anonpages(*pplist, pplist == pl);
		} else {
			szc0_npages++;
		}
		page_unlock(*pplist);
		np--;
		pplist++;
	}
	kmem_free(pl, sizeof (page_t *) * npages);

	mutex_enter(&freemem_lock);
	segvn_pages_locked -= npages;
	svd->softlockcnt -= npages;
	if (szc0_npages != 0) {
		availrmem += szc0_npages;
	}
	mutex_exit(&freemem_lock);
	if (svd->softlockcnt <= 0) {
		if (AS_ISUNMAPWAIT(seg->s_as)) {
			mutex_enter(&seg->s_as->a_contents);
			if (AS_ISUNMAPWAIT(seg->s_as)) {
				AS_CLRUNMAPWAIT(seg->s_as);
				cv_broadcast(&seg->s_as->a_cv);
			}
			mutex_exit(&seg->s_as->a_contents);
		}
	}
	return (0);
}

/*
 * get a memory ID for an addr in a given segment
 *
 * XXX only creates PAGESIZE pages if anon slots are not initialized.
 * At fault time they will be relocated into larger pages.
 */
static int
segvn_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct anon *ap = NULL;
	ulong_t anon_index;
	struct anon_map *amp;
	anon_sync_obj_t cookie;

	if (svd->type == MAP_PRIVATE) {
		memidp->val[0] = (uintptr_t)seg->s_as;
		memidp->val[1] = (uintptr_t)addr;
		return (0);
	}

	if (svd->type == MAP_SHARED) {
		if (svd->vp) {
			memidp->val[0] = (uintptr_t)svd->vp;
			memidp->val[1] = (u_longlong_t)svd->offset +
			    (uintptr_t)(addr - seg->s_base);
			return (0);
		} else {

			SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
			if ((amp = svd->amp) != NULL) {
				anon_index = svd->anon_index +
				    seg_page(seg, addr);
			}
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);

			ASSERT(amp != NULL);

			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
			anon_array_enter(amp, anon_index, &cookie);
			ap = anon_get_ptr(amp->ahp, anon_index);
			if (ap == NULL) {
				page_t *pp;

				pp = anon_zero(seg, addr, &ap, svd->cred);
				if (pp == NULL) {
					anon_array_exit(&cookie);
					ANON_LOCK_EXIT(&amp->a_rwlock);
					return (ENOMEM);
				}
				ASSERT(anon_get_ptr(amp->ahp, anon_index)
				    == NULL);
				(void) anon_set_ptr(amp->ahp, anon_index,
				    ap, ANON_SLEEP);
				page_unlock(pp);
			}

			anon_array_exit(&cookie);
			ANON_LOCK_EXIT(&amp->a_rwlock);

			memidp->val[0] = (uintptr_t)ap;
			memidp->val[1] = (uintptr_t)addr & PAGEOFFSET;
			return (0);
		}
	}
	return (EINVAL);
}
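
/*
 * Return 1 if every page in the range [a, a + len) carries the same
 * per-page protections, 0 otherwise.  Trivially true when the segment
 * keeps no per-page protection (vpage) array.
 */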
static int
sameprot(struct seg *seg, caddr_t a, size_t len)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct vpage *vpage;
	spgcnt_t pages = btop(len);
	uint_t prot;

	if (svd->pageprot == 0)
		return (1);

	ASSERT(svd->vpage != NULL);

	vpage = &svd->vpage[seg_page(seg, a)];
	prot = VPP_PROT(vpage);
	vpage++;
	pages--;
	while (pages-- > 0) {
		if (prot != VPP_PROT(vpage))
			return (0);
		vpage++;
	}
	return (1);
}

/*
 * Get memory allocation policy info for specified address in given segment
 */
static lgrp_mem_policy_info_t *
segvn_getpolicy(struct seg *seg, caddr_t addr)
{
	struct anon_map *amp;
	ulong_t anon_index;
	lgrp_mem_policy_info_t *policy_info;
	struct segvn_data *svn_data;
	u_offset_t vn_off;
	vnode_t *vp;

	ASSERT(seg != NULL);

	svn_data = (struct segvn_data *)seg->s_data;
	if (svn_data == NULL)
		return (NULL);

	/*
	 * Get policy info for private or shared memory
	 */
	if (svn_data->type != MAP_SHARED)
		policy_info = &svn_data->policy_info;
	else {
		amp = svn_data->amp;
		anon_index = svn_data->anon_index + seg_page(seg, addr);
		vp = svn_data->vp;
		vn_off = svn_data->offset + (uintptr_t)(addr - seg->s_base);
		policy_info = lgrp_shm_policy_get(amp, anon_index, vp, vn_off);
	}

	return (policy_info);
}

/*ARGSUSED*/
static int
segvn_capable(struct seg *seg, segcapability_t capability)
{
	return (0);
}