/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * VM - shared or copy-on-write from a vnode/anonymous memory.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/debug.h>
#include <sys/cred.h>
#include <sys/vmsystm.h>
#include <sys/tuneable.h>
#include <sys/bitmap.h>
#include <sys/swap.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/vtrace.h>
#include <sys/cmn_err.h>
#include <sys/vm.h>
#include <sys/dumphdr.h>
#include <sys/lgrp.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_vn.h>
#include <vm/pvn.h>
#include <vm/anon.h>
#include <vm/page.h>
#include <vm/vpage.h>

/*
 * Private seg op routines.
 */
static int	segvn_dup(struct seg *seg, struct seg *newseg);
static int	segvn_unmap(struct seg *seg, caddr_t addr, size_t len);
static void	segvn_free(struct seg *seg);
static faultcode_t segvn_fault(struct hat *hat, struct seg *seg,
    caddr_t addr, size_t len, enum fault_type type, enum seg_rw rw);
static faultcode_t segvn_faulta(struct seg *seg, caddr_t addr);
static int	segvn_setprot(struct seg *seg, caddr_t addr,
    size_t len, uint_t prot);
static int	segvn_checkprot(struct seg *seg, caddr_t addr,
    size_t len, uint_t prot);
static int	segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta);
static size_t	segvn_swapout(struct seg *seg);
static int	segvn_sync(struct seg *seg, caddr_t addr, size_t len,
    int attr, uint_t flags);
static size_t	segvn_incore(struct seg *seg, caddr_t addr, size_t len,
    char *vec);
static int	segvn_lockop(struct seg *seg, caddr_t addr, size_t len,
    int attr, int op, ulong_t *lockmap, size_t pos);
static int	segvn_getprot(struct seg *seg, caddr_t addr, size_t len,
    uint_t *protv);
static u_offset_t	segvn_getoffset(struct seg *seg, caddr_t addr);
static int	segvn_gettype(struct seg *seg, caddr_t addr);
static int	segvn_getvp(struct seg *seg, caddr_t addr,
    struct vnode **vpp);
static int	segvn_advise(struct seg *seg, caddr_t addr, size_t len,
    uint_t behav);
static void	segvn_dump(struct seg *seg);
static int	segvn_pagelock(struct seg *seg, caddr_t addr, size_t len,
    struct page ***ppp, enum lock_type type, enum seg_rw rw);
static int	segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len,
    uint_t szc);
static int	segvn_getmemid(struct seg *seg, caddr_t addr,
    memid_t *memidp);
static lgrp_mem_policy_info_t	*segvn_getpolicy(struct seg *, caddr_t);
static int	segvn_capable(struct seg *seg, segcapability_t capable);

struct seg_ops segvn_ops = {
    segvn_dup,
    segvn_unmap,
    segvn_free,
    segvn_fault,
    segvn_faulta,
    segvn_setprot,
    segvn_checkprot,
    segvn_kluster,
    segvn_swapout,
    segvn_sync,
    segvn_incore,
    segvn_lockop,
    segvn_getprot,
    segvn_getoffset,
    segvn_gettype,
    segvn_getvp,
    segvn_advise,
    segvn_dump,
    segvn_pagelock,
    segvn_setpagesize,
    segvn_getmemid,
    segvn_getpolicy,
    segvn_capable,
};

/*
 * Common zfod structures, provided as a shorthand for others to use.
 */
static segvn_crargs_t zfod_segvn_crargs =
    SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
static segvn_crargs_t kzfod_segvn_crargs =
    SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_USER,
    PROT_ALL & ~PROT_USER);
static segvn_crargs_t stack_noexec_crargs =
    SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_EXEC, PROT_ALL);

caddr_t zfod_argsp = (caddr_t)&zfod_segvn_crargs;	/* user zfod argsp */
caddr_t kzfod_argsp = (caddr_t)&kzfod_segvn_crargs;	/* kernel zfod argsp */
caddr_t stack_exec_argsp = (caddr_t)&zfod_segvn_crargs;	/* executable stack */
caddr_t stack_noexec_argsp = (caddr_t)&stack_noexec_crargs; /* noexec stack */

#define	vpgtob(n)	((n) * sizeof (struct vpage))	/* For brevity */

size_t	segvn_comb_thrshld = UINT_MAX;	/* patchable -- see 1196681 */

static int	segvn_concat(struct seg *, struct seg *, int);
static int	segvn_extend_prev(struct seg *, struct seg *,
    struct segvn_crargs *, size_t);
static int	segvn_extend_next(struct seg *, struct seg *,
    struct segvn_crargs *, size_t);
static void	segvn_softunlock(struct seg *, caddr_t, size_t, enum seg_rw);
static void	segvn_pagelist_rele(page_t **);
static void	segvn_setvnode_mpss(vnode_t *);
static void	segvn_relocate_pages(page_t **, page_t *);
static int	segvn_full_szcpages(page_t **, uint_t, int *, uint_t *);
static int	segvn_fill_vp_pages(struct segvn_data *, vnode_t *, u_offset_t,
    uint_t, page_t **, page_t **, uint_t *, int *);
static faultcode_t segvn_fault_vnodepages(struct hat *, struct seg *, caddr_t,
    caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int);
static faultcode_t segvn_fault_anonpages(struct hat *, struct seg *, caddr_t,
    caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int);
static faultcode_t segvn_faultpage(struct hat *, struct seg *, caddr_t,
    u_offset_t, struct vpage *, page_t **, uint_t,
    enum fault_type, enum seg_rw, int);
static void	segvn_vpage(struct seg *);

static void segvn_purge(struct seg *seg);
static int segvn_reclaim(struct seg *, caddr_t, size_t, struct page **,
    enum seg_rw);

static int sameprot(struct seg *, caddr_t, size_t);

static int segvn_demote_range(struct seg *, caddr_t, size_t, int);
static int segvn_clrszc(struct seg *);
static struct seg *segvn_split_seg(struct seg *, caddr_t);
static int segvn_claim_pages(struct seg *, struct vpage *, u_offset_t,
    ulong_t, uint_t);

static struct kmem_cache *segvn_cache;

#ifdef VM_STATS
static struct segvnvmstats_str {
    ulong_t	fill_vp_pages[31];
    ulong_t	fltvnpages[49];
    ulong_t	fullszcpages[10];
    ulong_t	relocatepages[3];
    ulong_t	fltanpages[17];
    ulong_t	pagelock[3];
    ulong_t	demoterange[3];
} segvnvmstats;
#endif /* VM_STATS */

#define	SDR_RANGE	1		/* demote entire range */
#define	SDR_END		2		/* demote non aligned ends only */

/*
 * Compute the large-page (pgsz) aligned region [lpgaddr, lpgeaddr) that
 * covers the requested range [addr, addr + len) within the segment.
 */
#define	CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr) {	\
	if ((len) != 0) {						\
		lpgaddr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz);	\
		ASSERT(lpgaddr >= (seg)->s_base);			\
		lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)((addr) +	\
		    (len)), pgsz);					\
		ASSERT(lpgeaddr > lpgaddr);				\
		ASSERT(lpgeaddr <= (seg)->s_base + (seg)->s_size);	\
	} else {							\
		lpgeaddr = lpgaddr = (addr);				\
	}								\
}

/*ARGSUSED*/
static int
segvn_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
    struct segvn_data *svd = buf;

    rw_init(&svd->lock, NULL,
        RW_DEFAULT, NULL);
    mutex_init(&svd->segp_slock, NULL, MUTEX_DEFAULT, NULL);
    return (0);
}

/*ARGSUSED1*/
static void
segvn_cache_destructor(void *buf, void *cdrarg)
{
    struct segvn_data *svd = buf;

    rw_destroy(&svd->lock);
    mutex_destroy(&svd->segp_slock);
}

/*
 * Patching this variable to non-zero allows the system to run with
 * stacks marked as "not executable".  It's a bit of a kludge, but is
 * provided as a tweakable for platforms that export those ABIs
 * (e.g. sparc V8) that have executable stacks enabled by default.
 * There are also some restrictions for platforms that don't actually
 * implement 'noexec' protections.
 *
 * Once enabled, the system is (therefore) unable to provide a fully
 * ABI-compliant execution environment, though practically speaking,
 * most everything works.  The exceptions are generally some interpreters
 * and debuggers that create executable code on the stack and jump
 * into it (without explicitly mprotecting the address range to include
 * PROT_EXEC).
 *
 * One important class of applications that are disabled are those
 * that have been transformed into malicious agents using one of the
 * numerous "buffer overflow" attacks.  See 4007890.
 */
int noexec_user_stack = 0;
int noexec_user_stack_log = 1;

int segvn_lpg_disable = 0;
uint_t segvn_maxpgszc = 0;

ulong_t segvn_vmpss_clrszc_cnt;
ulong_t segvn_vmpss_clrszc_err;
ulong_t segvn_fltvnpages_clrszc_cnt;
ulong_t segvn_fltvnpages_clrszc_err;
ulong_t segvn_setpgsz_align_err;
ulong_t segvn_setpgsz_getattr_err;
ulong_t segvn_setpgsz_eof_err;
ulong_t segvn_faultvnmpss_align_err1;
ulong_t segvn_faultvnmpss_align_err2;
ulong_t segvn_faultvnmpss_align_err3;
ulong_t segvn_faultvnmpss_align_err4;
ulong_t segvn_faultvnmpss_align_err5;
ulong_t	segvn_vmpss_pageio_deadlk_err;

/*
 * Initialize segvn data structures
 */
void
segvn_init(void)
{
    uint_t maxszc;
    uint_t szc;
    size_t pgsz;

    segvn_cache = kmem_cache_create("segvn_cache",
        sizeof (struct segvn_data), 0,
        segvn_cache_constructor, segvn_cache_destructor, NULL,
        NULL, NULL, 0);

    if (segvn_lpg_disable != 0)
        return;
    szc = maxszc = page_num_pagesizes() - 1;
    if (szc == 0) {
        segvn_lpg_disable = 1;
        return;
    }
    if (page_get_pagesize(0) != PAGESIZE) {
        panic("segvn_init: bad szc 0");
        /*NOTREACHED*/
    }
    while (szc != 0) {
        pgsz = page_get_pagesize(szc);
        if (pgsz <= PAGESIZE || !IS_P2ALIGNED(pgsz, pgsz)) {
            panic("segvn_init: bad szc %d", szc);
            /*NOTREACHED*/
        }
        szc--;
    }
    if (segvn_maxpgszc == 0 || segvn_maxpgszc > maxszc)
        segvn_maxpgszc = maxszc;
}

#define	SEGVN_PAGEIO	((void *)0x1)
#define	SEGVN_NOPAGEIO	((void *)0x2)

static void
segvn_setvnode_mpss(vnode_t *vp)
{
    int err;

    ASSERT(vp->v_mpssdata == NULL ||
        vp->v_mpssdata == SEGVN_PAGEIO ||
        vp->v_mpssdata == SEGVN_NOPAGEIO);

    if (vp->v_mpssdata == NULL) {
        if (vn_vmpss_usepageio(vp)) {
            err = VOP_PAGEIO(vp, (page_t *)NULL,
                (u_offset_t)0, 0, 0, CRED());
        } else {
            err = ENOSYS;
        }
        /*
         * set v_mpssdata just once per vnode life
         * so that it never changes.
         */
        mutex_enter(&vp->v_lock);
        if (vp->v_mpssdata == NULL) {
            if (err == EINVAL) {
                vp->v_mpssdata = SEGVN_PAGEIO;
            } else {
                vp->v_mpssdata = SEGVN_NOPAGEIO;
            }
        }
        mutex_exit(&vp->v_lock);
    }
}

int
segvn_create(struct seg *seg, void *argsp)
{
    struct segvn_crargs *a = (struct segvn_crargs *)argsp;
    struct segvn_data *svd;
    size_t swresv = 0;
    struct cred *cred;
    struct anon_map *amp;
    int error = 0;
    size_t pgsz;
    lgrp_mem_policy_t mpolicy = LGRP_MEM_POLICY_DEFAULT;


    ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));

    if (a->type != MAP_PRIVATE && a->type != MAP_SHARED) {
        panic("segvn_create type");
        /*NOTREACHED*/
    }

    /*
     * Check arguments.  If a shared anon structure is given then
     * it is illegal to also specify a vp.
     */
    if (a->amp != NULL && a->vp != NULL) {
        panic("segvn_create anon_map");
        /*NOTREACHED*/
    }

    /* MAP_NORESERVE on a MAP_SHARED segment is meaningless. */
    if (a->type == MAP_SHARED)
        a->flags &= ~MAP_NORESERVE;

    if (a->szc != 0) {
        if (segvn_lpg_disable != 0 || a->amp != NULL ||
            (a->type == MAP_SHARED && a->vp == NULL) ||
            (a->flags & MAP_NORESERVE) || seg->s_as == &kas) {
            a->szc = 0;
        } else {
            if (a->szc > segvn_maxpgszc)
                a->szc = segvn_maxpgszc;
            pgsz = page_get_pagesize(a->szc);
            if (!IS_P2ALIGNED(seg->s_base, pgsz) ||
                !IS_P2ALIGNED(seg->s_size, pgsz)) {
                a->szc = 0;
            } else if (a->vp != NULL) {
                extern struct vnode kvp;
                if (IS_SWAPFSVP(a->vp) || a->vp == &kvp) {
                    /*
                     * paranoid check.
                     * hat_page_demote() is not supported
                     * on swapfs pages.
                     */
                    a->szc = 0;
                } else if (map_addr_vacalign_check(seg->s_base,
                    a->offset & PAGEMASK)) {
                    a->szc = 0;
                }
            }
        }
    }

    /*
     * If segment may need private pages, reserve them now.
     */
    if (!(a->flags & MAP_NORESERVE) && ((a->vp == NULL && a->amp == NULL) ||
        (a->type == MAP_PRIVATE && (a->prot & PROT_WRITE)))) {
        if (anon_resv(seg->s_size) == 0)
            return (EAGAIN);
        swresv = seg->s_size;
        TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
            seg, swresv, 1);
    }

    /*
     * Reserve any mapping structures that may be required.
     */
    hat_map(seg->s_as->a_hat, seg->s_base, seg->s_size, HAT_MAP);

    if (a->cred) {
        cred = a->cred;
        crhold(cred);
    } else {
        crhold(cred = CRED());
    }

    /* Inform the vnode of the new mapping */
    if (a->vp) {
        error = VOP_ADDMAP(a->vp, a->offset & PAGEMASK,
            seg->s_as, seg->s_base, seg->s_size, a->prot,
            a->maxprot, a->type, cred);
        if (error) {
            if (swresv != 0) {
                anon_unresv(swresv);
                TRACE_3(TR_FAC_VM, TR_ANON_PROC,
                    "anon proc:%p %lu %u",
                    seg, swresv, 0);
            }
            crfree(cred);
            hat_unload(seg->s_as->a_hat, seg->s_base,
                seg->s_size, HAT_UNLOAD_UNMAP);
            return (error);
        }
    }

    /*
     * If more than one segment in the address space, and
     * they're adjacent virtually, try to concatenate them.
     * Don't concatenate if an explicit anon_map structure
     * was supplied (e.g., SystemV shared memory).
     */
    if (a->amp == NULL) {
        struct seg *pseg, *nseg;
        struct segvn_data *psvd, *nsvd;
        lgrp_mem_policy_t ppolicy, npolicy;
        uint_t	lgrp_mem_policy_flags = 0;
        extern lgrp_mem_policy_t lgrp_mem_default_policy;

        /*
         * Memory policy flags (lgrp_mem_policy_flags) are valid when
         * extending stack/heap segments.
         */
        if ((a->vp == NULL) && (a->type == MAP_PRIVATE) &&
            !(a->flags & MAP_NORESERVE) && (seg->s_as != &kas)) {
            lgrp_mem_policy_flags = a->lgrp_mem_policy_flags;
        } else {
            /*
             * Get policy when not extending it from another segment
             */
            mpolicy = lgrp_mem_policy_default(seg->s_size, a->type);
        }

        /*
         * First, try to concatenate the previous and new segments
         */
        pseg = AS_SEGPREV(seg->s_as, seg);
        if (pseg != NULL &&
            pseg->s_base + pseg->s_size == seg->s_base &&
            pseg->s_ops == &segvn_ops) {
            /*
             * Get memory allocation policy from previous segment.
             * When extension is specified (e.g. for heap) apply
             * this policy to the new segment regardless of the
             * outcome of segment concatenation.  Extension occurs
             * for a non-default policy; otherwise the default policy
             * is used and is based on the extended segment size.
             */
            psvd = (struct segvn_data *)pseg->s_data;
            ppolicy = psvd->policy_info.mem_policy;
            if (lgrp_mem_policy_flags ==
                LGRP_MP_FLAG_EXTEND_UP) {
                if (ppolicy != lgrp_mem_default_policy) {
                    mpolicy = ppolicy;
                } else {
                    mpolicy = lgrp_mem_policy_default(
                        pseg->s_size + seg->s_size,
                        a->type);
                }
            }

            if (mpolicy == ppolicy &&
                (pseg->s_size + seg->s_size <=
                segvn_comb_thrshld || psvd->amp == NULL) &&
                segvn_extend_prev(pseg, seg, a, swresv) == 0) {
                /*
                 * success! now try to concatenate
                 * with following seg
                 */
                crfree(cred);
                nseg = AS_SEGNEXT(pseg->s_as, pseg);
                if (nseg != NULL &&
                    nseg != pseg &&
                    nseg->s_ops == &segvn_ops &&
                    pseg->s_base + pseg->s_size ==
                    nseg->s_base)
                    (void) segvn_concat(pseg, nseg, 0);
                ASSERT(pseg->s_szc == 0 ||
                    (a->szc == pseg->s_szc &&
                    IS_P2ALIGNED(pseg->s_base, pgsz) &&
                    IS_P2ALIGNED(pseg->s_size, pgsz)));
                return (0);
            }
        }

        /*
         * Failed, so try to concatenate with following seg
         */
        nseg = AS_SEGNEXT(seg->s_as, seg);
        if (nseg != NULL &&
            seg->s_base + seg->s_size == nseg->s_base &&
            nseg->s_ops == &segvn_ops) {
            /*
             * Get memory allocation policy from next segment.
             * When extension is specified (e.g. for stack) apply
             * this policy to the new segment regardless of the
             * outcome of segment concatenation.  Extension occurs
             * for a non-default policy; otherwise the default policy
             * is used and is based on the extended segment size.
             */
            nsvd = (struct segvn_data *)nseg->s_data;
            npolicy = nsvd->policy_info.mem_policy;
            if (lgrp_mem_policy_flags ==
                LGRP_MP_FLAG_EXTEND_DOWN) {
                if (npolicy != lgrp_mem_default_policy) {
                    mpolicy = npolicy;
                } else {
                    mpolicy = lgrp_mem_policy_default(
                        nseg->s_size + seg->s_size,
                        a->type);
                }
            }

            if (mpolicy == npolicy &&
                segvn_extend_next(seg, nseg, a, swresv) == 0) {
                crfree(cred);
                ASSERT(nseg->s_szc == 0 ||
                    (a->szc == nseg->s_szc &&
                    IS_P2ALIGNED(nseg->s_base, pgsz) &&
                    IS_P2ALIGNED(nseg->s_size, pgsz)));
                return (0);
            }
        }
    }

    if (a->vp != NULL) {
        VN_HOLD(a->vp);
        if (a->type == MAP_SHARED)
            lgrp_shm_policy_init(NULL, a->vp);
    }
    svd = kmem_cache_alloc(segvn_cache, KM_SLEEP);

    seg->s_ops = &segvn_ops;
    seg->s_data = (void *)svd;
    seg->s_szc = a->szc;

    svd->vp = a->vp;
    /*
     * Anonymous mappings have no backing file so the offset is meaningless.
     */
    svd->offset = a->vp ? (a->offset & PAGEMASK) : 0;
    svd->prot = a->prot;
    svd->maxprot = a->maxprot;
    svd->pageprot = 0;
    svd->type = a->type;
    svd->vpage = NULL;
    svd->cred = cred;
    svd->advice = MADV_NORMAL;
    svd->pageadvice = 0;
    svd->flags = (ushort_t)a->flags;
    svd->softlockcnt = 0;
    if (a->szc != 0 && a->vp != NULL) {
        segvn_setvnode_mpss(a->vp);
    }

    amp = a->amp;
    if ((svd->amp = amp) == NULL) {
        svd->anon_index = 0;
        if (svd->type == MAP_SHARED) {
            svd->swresv = 0;
            /*
             * Shared mappings to a vp need no other setup.
             * If we have a shared mapping to an anon_map object
             * which hasn't been allocated yet, allocate the
             * struct now so that it will be properly shared
             * by remembering the swap reservation there.
             */
            if (a->vp == NULL) {
                svd->amp = anonmap_alloc(seg->s_size, swresv);
                svd->amp->a_szc = seg->s_szc;
            }
        } else {
            /*
             * Private mapping (with or without a vp).
             * Allocate anon_map when needed.
             */
            svd->swresv = swresv;
        }
    } else {
        pgcnt_t anon_num;

        /*
         * Mapping to an existing anon_map structure without a vp.
         * For now we will ensure that the segment size isn't larger
         * than the size - offset gives us.  Later on we may wish to
         * have the anon array dynamically allocated itself so that
         * we don't always have to allocate all the anon pointer slots.
         * This of course involves adding extra code to check that we
         * aren't trying to use an anon pointer slot beyond the end
         * of the currently allocated anon array.
         */
        if ((amp->size - a->offset) < seg->s_size) {
            panic("segvn_create anon_map size");
            /*NOTREACHED*/
        }

        anon_num = btopr(a->offset);

        if (a->type == MAP_SHARED) {
            /*
             * SHARED mapping to a given anon_map.
             */
            ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
            amp->refcnt++;
            ANON_LOCK_EXIT(&amp->a_rwlock);
            svd->anon_index = anon_num;
            svd->swresv = 0;
        } else {
            /*
             * PRIVATE mapping to a given anon_map.
             * Make sure that all the needed anon
             * structures are created (so that we will
             * share the underlying pages if nothing
             * is written by this mapping) and then
             * duplicate the anon array as is done
             * when a privately mapped segment is dup'ed.
             */
            struct anon *ap;
            caddr_t addr;
            caddr_t eaddr;
            ulong_t anon_idx;
            int hat_flag = HAT_LOAD;

            if (svd->flags & MAP_TEXT) {
                hat_flag |= HAT_LOAD_TEXT;
            }

            svd->amp = anonmap_alloc(seg->s_size, 0);
            svd->amp->a_szc = seg->s_szc;
            svd->anon_index = 0;
            svd->swresv = swresv;

            /*
             * Prevent 2 threads from allocating anon
             * slots simultaneously.
             */
            ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
            eaddr = seg->s_base + seg->s_size;

            for (anon_idx = anon_num, addr = seg->s_base;
                addr < eaddr; addr += PAGESIZE, anon_idx++) {
                page_t *pp;

                if ((ap = anon_get_ptr(amp->ahp,
                    anon_idx)) != NULL)
                    continue;

                /*
                 * Allocate the anon struct now.
                 * Might as well load up translation
                 * to the page while we're at it...
                 */
                pp = anon_zero(seg, addr, &ap, cred);
                if (ap == NULL || pp == NULL) {
                    panic("segvn_create anon_zero");
                    /*NOTREACHED*/
                }

                /*
                 * Re-acquire the anon_map lock and
                 * initialize the anon array entry.
                 */
                ASSERT(anon_get_ptr(amp->ahp,
                    anon_idx) == NULL);
                (void) anon_set_ptr(amp->ahp, anon_idx, ap,
                    ANON_SLEEP);

                ASSERT(seg->s_szc == 0);
                ASSERT(!IS_VMODSORT(pp->p_vnode));

                hat_memload(seg->s_as->a_hat, addr, pp,
                    svd->prot & ~PROT_WRITE, hat_flag);

                page_unlock(pp);
            }
            ASSERT(seg->s_szc == 0);
            anon_dup(amp->ahp, anon_num, svd->amp->ahp,
                0, seg->s_size);
            ANON_LOCK_EXIT(&amp->a_rwlock);
        }
    }

    /*
     * Set default memory allocation policy for segment
     *
     * Always set policy for private memory at least for initialization
     * even if this is a shared memory segment
     */
    (void) lgrp_privm_policy_set(mpolicy, &svd->policy_info, seg->s_size);

    if (svd->type == MAP_SHARED)
        (void) lgrp_shm_policy_set(mpolicy, svd->amp, svd->anon_index,
            svd->vp, svd->offset, seg->s_size);

    return (0);
}

/*
 * Concatenate two existing segments, if possible.
 * Return 0 on success, -1 if two segments are not compatible
 * or -2 on memory allocation failure.
 * If private == 1 then try and concat segments with private pages.
 */
static int
segvn_concat(struct seg *seg1, struct seg *seg2, int private)
{
    struct segvn_data *svd1 = seg1->s_data;
    struct segvn_data *svd2 = seg2->s_data;
    struct anon_map *amp1 = svd1->amp;
    struct anon_map *amp2 = svd2->amp;
    struct vpage *vpage1 = svd1->vpage;
    struct vpage *vpage2 = svd2->vpage, *nvpage = NULL;
    size_t size, nvpsize;
    pgcnt_t npages1, npages2;

    ASSERT(seg1->s_as && seg2->s_as && seg1->s_as == seg2->s_as);
    ASSERT(AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock));
    ASSERT(seg1->s_ops == seg2->s_ops);

    /* both segments exist, try to merge them */
#define	incompat(x)	(svd1->x != svd2->x)
    if (incompat(vp) || incompat(maxprot) ||
        (!svd1->pageadvice && !svd2->pageadvice && incompat(advice)) ||
        (!svd1->pageprot && !svd2->pageprot && incompat(prot)) ||
        incompat(type) || incompat(cred) || incompat(flags) ||
        seg1->s_szc != seg2->s_szc || incompat(policy_info.mem_policy) ||
        (svd2->softlockcnt > 0))
        return (-1);
#undef incompat

    /*
     * vp == NULL implies zfod, offset doesn't matter
     */
    if (svd1->vp != NULL &&
        svd1->offset + seg1->s_size != svd2->offset) {
        return (-1);
    }

    /*
     * Fail early if we're not supposed to concatenate
     * private pages.
     */
    if ((private == 0 || svd1->type != MAP_PRIVATE) &&
        (amp1 != NULL || amp2 != NULL)) {
        return (-1);
    }

    /*
     * If either seg has vpages, create a new merged vpage array.
     */
    if (vpage1 != NULL || vpage2 != NULL) {
        struct vpage *vp;

        npages1 = seg_pages(seg1);
        npages2 = seg_pages(seg2);
        nvpsize = vpgtob(npages1 + npages2);

        if ((nvpage = kmem_zalloc(nvpsize, KM_NOSLEEP)) == NULL) {
            return (-2);
        }
        if (vpage1 != NULL) {
            bcopy(vpage1, nvpage, vpgtob(npages1));
        }
        if (vpage2 != NULL) {
            bcopy(vpage2, nvpage + npages1, vpgtob(npages2));
        }
        for (vp = nvpage; vp < nvpage + npages1; vp++) {
            if (svd2->pageprot && !svd1->pageprot) {
                VPP_SETPROT(vp, svd1->prot);
            }
            if (svd2->pageadvice && !svd1->pageadvice) {
                VPP_SETADVICE(vp, svd1->advice);
            }
        }
        for (vp = nvpage + npages1;
            vp < nvpage + npages1 + npages2; vp++) {
            if (svd1->pageprot && !svd2->pageprot) {
                VPP_SETPROT(vp, svd2->prot);
            }
            if (svd1->pageadvice && !svd2->pageadvice) {
                VPP_SETADVICE(vp, svd2->advice);
            }
        }
    }

    /*
     * If either segment has private pages, create a new merged anon
     * array.
     */
    if (amp1 != NULL || amp2 != NULL) {
        struct anon_hdr *nahp;
        struct anon_map *namp = NULL;
        size_t asize = seg1->s_size + seg2->s_size;

        if ((nahp = anon_create(btop(asize), ANON_NOSLEEP)) == NULL) {
            if (nvpage != NULL) {
                kmem_free(nvpage, nvpsize);
            }
            return (-2);
        }
        if (amp1 != NULL) {
            /*
             * XXX anon rwlock is not really needed because
             * this is a private segment and we are writers.
             */
            ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
            ASSERT(amp1->refcnt == 1);
            if (anon_copy_ptr(amp1->ahp, svd1->anon_index,
                nahp, 0, btop(seg1->s_size), ANON_NOSLEEP)) {
                anon_release(nahp, btop(asize));
                ANON_LOCK_EXIT(&amp1->a_rwlock);
                if (nvpage != NULL) {
                    kmem_free(nvpage, nvpsize);
                }
                return (-2);
            }
        }
        if (amp2 != NULL) {
            ANON_LOCK_ENTER(&amp2->a_rwlock, RW_WRITER);
            ASSERT(amp2->refcnt == 1);
            if (anon_copy_ptr(amp2->ahp, svd2->anon_index,
                nahp, btop(seg1->s_size), btop(seg2->s_size),
                ANON_NOSLEEP)) {
                anon_release(nahp, btop(asize));
                ANON_LOCK_EXIT(&amp2->a_rwlock);
                if (amp1 != NULL) {
                    ANON_LOCK_EXIT(&amp1->a_rwlock);
                }
                if (nvpage != NULL) {
                    kmem_free(nvpage, nvpsize);
                }
                return (-2);
            }
        }
        if (amp1 != NULL) {
            namp = amp1;
            anon_release(amp1->ahp, btop(amp1->size));
        }
        if (amp2 != NULL) {
            if (namp == NULL) {
                ASSERT(amp1 == NULL);
                namp = amp2;
                anon_release(amp2->ahp, btop(amp2->size));
            } else {
                amp2->refcnt--;
                ANON_LOCK_EXIT(&amp2->a_rwlock);
                anonmap_free(amp2);
            }
            svd2->amp = NULL; /* needed for seg_free */
        }
        namp->ahp = nahp;
        namp->size = asize;
        svd1->amp = namp;
        svd1->anon_index = 0;
        ANON_LOCK_EXIT(&namp->a_rwlock);
    }
    /*
     * Now free the old vpage structures.
     */
    if (nvpage != NULL) {
        if (vpage1 != NULL) {
            kmem_free(vpage1, vpgtob(npages1));
        }
        if (vpage2 != NULL) {
            svd2->vpage = NULL;
            kmem_free(vpage2, vpgtob(npages2));
        }
        if (svd2->pageprot) {
            svd1->pageprot = 1;
        }
        if (svd2->pageadvice) {
            svd1->pageadvice = 1;
        }
        svd1->vpage = nvpage;
    }

    /* all looks ok, merge segments */
    svd1->swresv += svd2->swresv;
    svd2->swresv = 0;	/* so seg_free doesn't release swap space */
    size = seg2->s_size;
    seg_free(seg2);
    seg1->s_size += size;
    return (0);
}

/*
 * Extend the previous segment (seg1) to include the
 * new segment (seg2 + a), if possible.
 * Return 0 on success.
 */
static int
segvn_extend_prev(seg1, seg2, a, swresv)
    struct seg *seg1, *seg2;
    struct segvn_crargs *a;
    size_t swresv;
{
    struct segvn_data *svd1 = (struct segvn_data *)seg1->s_data;
    size_t size;
    struct anon_map *amp1;
    struct vpage *new_vpage;

    /*
     * We don't need any segment level locks for "segvn" data
     * since the address space is "write" locked.
     */
    ASSERT(seg1->s_as && AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock));

    /* second segment is new, try to extend first */
    /* XXX - should also check cred */
    if (svd1->vp != a->vp || svd1->maxprot != a->maxprot ||
        (!svd1->pageprot && (svd1->prot != a->prot)) ||
        svd1->type != a->type || svd1->flags != a->flags ||
        seg1->s_szc != a->szc)
        return (-1);

    /* vp == NULL implies zfod, offset doesn't matter */
    if (svd1->vp != NULL &&
        svd1->offset + seg1->s_size != (a->offset & PAGEMASK))
        return (-1);

    amp1 = svd1->amp;
    if (amp1) {
        pgcnt_t newpgs;

        /*
         * Segment has private pages, can data structures
         * be expanded?
         *
         * Acquire the anon_map lock to prevent it from changing,
         * if it is shared.  This ensures that the anon_map
         * will not change while a thread which has a read/write
         * lock on an address space references it.
         * XXX - Don't need the anon_map lock at all if "refcnt"
         * is 1.
         *
         * Can't grow a MAP_SHARED segment with an anonmap because
         * there may be existing anon slots where we want to extend
         * the segment and we wouldn't know what to do with them
         * (e.g., for tmpfs the right thing is to just leave them
         * there, for /dev/zero they should be cleared out).
         */
        if (svd1->type == MAP_SHARED)
            return (-1);

        ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
        if (amp1->refcnt > 1) {
            ANON_LOCK_EXIT(&amp1->a_rwlock);
            return (-1);
        }
        newpgs = anon_grow(amp1->ahp, &svd1->anon_index,
            btop(seg1->s_size), btop(seg2->s_size), ANON_NOSLEEP);

        if (newpgs == 0) {
            ANON_LOCK_EXIT(&amp1->a_rwlock);
            return (-1);
        }
        amp1->size = ptob(newpgs);
        ANON_LOCK_EXIT(&amp1->a_rwlock);
    }
    if (svd1->vpage != NULL) {
        new_vpage =
            kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)),
                KM_NOSLEEP);
        if (new_vpage == NULL)
            return (-1);
        bcopy(svd1->vpage, new_vpage, vpgtob(seg_pages(seg1)));
        kmem_free(svd1->vpage, vpgtob(seg_pages(seg1)));
        svd1->vpage = new_vpage;
        if (svd1->pageprot) {
            struct vpage *vp, *evp;

            vp = new_vpage + seg_pages(seg1);
            evp = vp + seg_pages(seg2);
            for (; vp < evp; vp++)
                VPP_SETPROT(vp, a->prot);
        }
    }
    size = seg2->s_size;
    seg_free(seg2);
    seg1->s_size += size;
    svd1->swresv += swresv;
    return (0);
}

/*
 * Extend the next segment (seg2) to include the
 * new segment (seg1 + a), if possible.
 * Return 0 on success.
 */
static int
segvn_extend_next(
    struct seg *seg1,
    struct seg *seg2,
    struct segvn_crargs *a,
    size_t swresv)
{
    struct segvn_data *svd2 = (struct segvn_data *)seg2->s_data;
    size_t size;
    struct anon_map *amp2;
    struct vpage *new_vpage;

    /*
     * We don't need any segment level locks for "segvn" data
     * since the address space is "write" locked.
     */
    ASSERT(seg2->s_as && AS_WRITE_HELD(seg2->s_as, &seg2->s_as->a_lock));

    /* first segment is new, try to extend second */
    /* XXX - should also check cred */
    if (svd2->vp != a->vp || svd2->maxprot != a->maxprot ||
        (!svd2->pageprot && (svd2->prot != a->prot)) ||
        svd2->type != a->type || svd2->flags != a->flags ||
        seg2->s_szc != a->szc)
        return (-1);
    /* vp == NULL implies zfod, offset doesn't matter */
    if (svd2->vp != NULL &&
        (a->offset & PAGEMASK) + seg1->s_size != svd2->offset)
        return (-1);

    amp2 = svd2->amp;
    if (amp2) {
        pgcnt_t newpgs;

        /*
         * Segment has private pages, can data structures
         * be expanded?
         *
         * Acquire the anon_map lock to prevent it from changing,
         * if it is shared.  This ensures that the anon_map
         * will not change while a thread which has a read/write
         * lock on an address space references it.
         *
         * XXX - Don't need the anon_map lock at all if "refcnt"
         * is 1.
         */
        if (svd2->type == MAP_SHARED)
            return (-1);

        ANON_LOCK_ENTER(&amp2->a_rwlock, RW_WRITER);
        if (amp2->refcnt > 1) {
            ANON_LOCK_EXIT(&amp2->a_rwlock);
            return (-1);
        }
        newpgs = anon_grow(amp2->ahp, &svd2->anon_index,
            btop(seg2->s_size), btop(seg1->s_size),
            ANON_NOSLEEP | ANON_GROWDOWN);

        if (newpgs == 0) {
            ANON_LOCK_EXIT(&amp2->a_rwlock);
            return (-1);
        }
        amp2->size = ptob(newpgs);
        ANON_LOCK_EXIT(&amp2->a_rwlock);
    }
    if (svd2->vpage != NULL) {
        new_vpage =
            kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)),
                KM_NOSLEEP);
        if (new_vpage == NULL) {
            /* Not merging segments so adjust anon_index back */
            if (amp2)
                svd2->anon_index += seg_pages(seg1);
            return (-1);
        }
        bcopy(svd2->vpage, new_vpage + seg_pages(seg1),
            vpgtob(seg_pages(seg2)));
        kmem_free(svd2->vpage, vpgtob(seg_pages(seg2)));
        svd2->vpage = new_vpage;
        if (svd2->pageprot) {
            struct vpage *vp, *evp;

            vp = new_vpage;
            evp = vp + seg_pages(seg1);
            for (; vp < evp; vp++)
                VPP_SETPROT(vp, a->prot);
        }
    }
    size = seg1->s_size;
    seg_free(seg1);
    seg2->s_size += size;
    seg2->s_base -= size;
    svd2->offset -= size;
    svd2->swresv += swresv;
    return (0);
}

static int
segvn_dup(struct seg *seg, struct seg *newseg)
{
    struct segvn_data *svd = (struct segvn_data *)seg->s_data;
    struct segvn_data *newsvd;
    pgcnt_t npages = seg_pages(seg);
    int error = 0;
    uint_t prot;
    size_t len;

    ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));

    /*
     * If segment has anon reserved, reserve more for the new seg.
     * For a MAP_NORESERVE segment swresv will be a count of all the
     * allocated anon slots; thus we reserve for the child as many slots
     * as the parent has allocated.  This semantic prevents the child or
     * parent from dying during a copy-on-write fault caused by trying
     * to write a shared pre-existing anon page.
     */
    if ((len = svd->swresv) != 0) {
        if (anon_resv(svd->swresv) == 0)
            return (ENOMEM);

        TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
            seg, len, 0);
    }

    newsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP);

    newseg->s_ops = &segvn_ops;
    newseg->s_data = (void *)newsvd;
    newseg->s_szc = seg->s_szc;

    if ((newsvd->vp = svd->vp) != NULL) {
        VN_HOLD(svd->vp);
        if (svd->type == MAP_SHARED)
            lgrp_shm_policy_init(NULL, svd->vp);
    }
    newsvd->offset = svd->offset;
    newsvd->prot = svd->prot;
    newsvd->maxprot = svd->maxprot;
    newsvd->pageprot = svd->pageprot;
    newsvd->type = svd->type;
    newsvd->cred = svd->cred;
    crhold(newsvd->cred);
    newsvd->advice = svd->advice;
    newsvd->pageadvice = svd->pageadvice;
    newsvd->swresv = svd->swresv;
    newsvd->flags = svd->flags;
    newsvd->softlockcnt = 0;
    newsvd->policy_info = svd->policy_info;
    if ((newsvd->amp = svd->amp) == NULL) {
        /*
         * Not attaching to a shared anon object.
         */
        newsvd->anon_index = 0;
    } else {
        struct anon_map *amp;

        amp = svd->amp;
        if (svd->type == MAP_SHARED) {
            ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
            amp->refcnt++;
            ANON_LOCK_EXIT(&amp->a_rwlock);
            newsvd->anon_index = svd->anon_index;
        } else {
            int reclaim = 1;

            /*
             * Allocate and initialize new anon_map structure.
             */
            newsvd->amp = anonmap_alloc(newseg->s_size, 0);
            newsvd->amp->a_szc = newseg->s_szc;
            newsvd->anon_index = 0;

            /*
             * We don't have to acquire the anon_map lock
             * for the new segment (since it belongs to an
             * address space that is still not associated
             * with any process), or the segment in the old
             * address space (since all threads in it
             * are stopped while duplicating the address space).
             */

            /*
             * The goal of the following code is to make sure that
             * softlocked pages do not end up as copy on write
             * pages.  This would cause problems where one
             * thread writes to a page that is COW and a different
             * thread in the same process has softlocked it.  The
             * softlock lock would move away from this process
             * because the write would cause this process to get
             * a copy (without the softlock).
             *
             * The strategy here is to just break the
             * sharing on pages that could possibly be
             * softlocked.
             */
        retry:
            if (svd->softlockcnt) {
                struct anon *ap, *newap;
                size_t i;
                uint_t vpprot;
                page_t *anon_pl[1+1], *pp;
                caddr_t addr;
                ulong_t anon_idx = 0;

                /*
                 * The softlock count might be non-zero
                 * because some pages are still stuck in the
                 * cache for lazy reclaim.  Flush the cache
                 * now.  This should drop the count to zero.
                 * [or there is really I/O going on to these
                 * pages].  Note, we have the writers lock so
                 * nothing gets inserted during the flush.
                 */
                if (reclaim == 1) {
                    segvn_purge(seg);
                    reclaim = 0;
                    goto retry;
                }
                i = btopr(seg->s_size);
                addr = seg->s_base;
                /*
                 * XXX break cow sharing using PAGESIZE
                 * pages.  They will be relocated into larger
                 * pages at fault time.
                 */
                while (i-- > 0) {
                    if (ap = anon_get_ptr(amp->ahp,
                        anon_idx)) {
                        error = anon_getpage(&ap,
                            &vpprot, anon_pl, PAGESIZE,
                            seg, addr, S_READ,
                            svd->cred);
                        if (error) {
                            newsvd->vpage = NULL;
                            goto out;
                        }
                        /*
                         * prot need not be computed
                         * below 'cause anon_private is
                         * going to ignore it anyway
                         * as child doesn't inherit
                         * pagelock from parent.
                         */
                        prot = svd->pageprot ?
                            VPP_PROT(
                            &svd->vpage[
                            seg_page(seg, addr)])
                            : svd->prot;
                        pp = anon_private(&newap,
                            newseg, addr, prot,
                            anon_pl[0], 0,
                            newsvd->cred);
                        if (pp == NULL) {
                            /* no mem abort */
                            newsvd->vpage = NULL;
                            error = ENOMEM;
                            goto out;
                        }
                        (void) anon_set_ptr(
                            newsvd->amp->ahp, anon_idx,
                            newap, ANON_SLEEP);
                        page_unlock(pp);
                    }
                    addr += PAGESIZE;
                    anon_idx++;
                }
            } else {	/* common case */
                if (seg->s_szc != 0) {
                    /*
                     * If at least one of anon slots of a
                     * large page exists then make sure
                     * all anon slots of a large page
                     * exist to avoid partial cow sharing
                     * of a large page in the future.
                     */
                    anon_dup_fill_holes(amp->ahp,
                        svd->anon_index, newsvd->amp->ahp,
                        0, seg->s_size, seg->s_szc,
                        svd->vp != NULL);
                } else {
                    anon_dup(amp->ahp, svd->anon_index,
                        newsvd->amp->ahp, 0, seg->s_size);
                }

                hat_clrattr(seg->s_as->a_hat, seg->s_base,
                    seg->s_size, PROT_WRITE);
            }
        }
    }
    /*
     * If necessary, create a vpage structure for the new segment.
     * Do not copy any page lock indications.
     */
    if (svd->vpage != NULL) {
        uint_t i;
        struct vpage *ovp = svd->vpage;
        struct vpage *nvp;

        nvp = newsvd->vpage =
            kmem_alloc(vpgtob(npages), KM_SLEEP);
        for (i = 0; i < npages; i++) {
            *nvp = *ovp++;
            VPP_CLRPPLOCK(nvp++);
        }
    } else
        newsvd->vpage = NULL;

    /* Inform the vnode of the new mapping */
    if (newsvd->vp != NULL) {
        error = VOP_ADDMAP(newsvd->vp, (offset_t)newsvd->offset,
            newseg->s_as, newseg->s_base, newseg->s_size, newsvd->prot,
            newsvd->maxprot, newsvd->type, newsvd->cred);
    }
out:
    return (error);
}


/*
 * callback function used by segvn_unmap to invoke free_vp_pages() for only
 * those pages actually processed by the HAT
 */
extern int free_pages;

static void
segvn_hat_unload_callback(hat_callback_t *cb)
{
    struct seg		*seg = cb->hcb_data;
    struct segvn_data	*svd = (struct segvn_data *)seg->s_data;
    size_t		len;
    u_offset_t		off;

    ASSERT(svd->vp != NULL);
    ASSERT(cb->hcb_end_addr > cb->hcb_start_addr);
    ASSERT(cb->hcb_start_addr >= seg->s_base);

    len = cb->hcb_end_addr - cb->hcb_start_addr;
    off = cb->hcb_start_addr - seg->s_base;
    free_vp_pages(svd->vp, svd->offset + off, len);
}


static int
segvn_unmap(struct seg *seg, caddr_t addr, size_t len)
{
    struct segvn_data *svd = (struct segvn_data *)seg->s_data;
    struct segvn_data *nsvd;
    struct seg *nseg;
    struct anon_map *amp;
    pgcnt_t	opages;		/* old segment size in pages */
    pgcnt_t	npages;		/* new segment size in pages */
    pgcnt_t	dpages;		/* pages being deleted (unmapped) */
    hat_callback_t callback;	/* used for free_vp_pages() */
    hat_callback_t *cbp = NULL;
    caddr_t nbase;
    size_t nsize;
    size_t oswresv;
    int reclaim = 1;

    /*
     * We don't need any segment level locks for "segvn" data
     * since the address space is "write" locked.
     */
    ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));

    /*
     * Fail the unmap if pages are SOFTLOCKed through this mapping.
     * softlockcnt is protected from change by the as write lock.
     */
retry:
    if (svd->softlockcnt > 0) {
        /*
         * since we do have the writers lock nobody can fill
         * the cache during the purge. The flush either succeeds
         * or we still have pending I/Os.
         */
        if (reclaim == 1) {
            segvn_purge(seg);
            reclaim = 0;
            goto retry;
        }
        return (EAGAIN);
    }

    /*
     * Check for bad sizes
     */
    if (addr < seg->s_base || addr + len > seg->s_base + seg->s_size ||
        (len & PAGEOFFSET) || ((uintptr_t)addr & PAGEOFFSET)) {
        panic("segvn_unmap");
        /*NOTREACHED*/
    }

    if (seg->s_szc != 0) {
        size_t pgsz = page_get_pagesize(seg->s_szc);
        int err;
        if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) {
            ASSERT(seg->s_base != addr || seg->s_size != len);
            VM_STAT_ADD(segvnvmstats.demoterange[0]);
            err = segvn_demote_range(seg, addr, len, SDR_END);
            if (err == 0) {
                return (IE_RETRY);
            }
            return (err);
        }
    }

    /* Inform the vnode of the unmapping. */
    if (svd->vp) {
        int error;

        error = VOP_DELMAP(svd->vp,
            (offset_t)svd->offset + (uintptr_t)(addr - seg->s_base),
            seg->s_as, addr, len, svd->prot, svd->maxprot,
            svd->type, svd->cred);

        if (error == EAGAIN)
            return (error);
    }
    /*
     * Remove any page locks set through this mapping.
     */
    (void) segvn_lockop(seg, addr, len, 0, MC_UNLOCK, NULL, 0);

    /*
     * Unload any hardware translations in the range to be taken out.
     * Use a callback to invoke free_vp_pages() effectively.
     */
    if (svd->vp != NULL && free_pages != 0) {
        callback.hcb_data = seg;
        callback.hcb_function = segvn_hat_unload_callback;
        cbp = &callback;
    }
    hat_unload_callback(seg->s_as->a_hat, addr, len, HAT_UNLOAD_UNMAP, cbp);

    /*
     * Check for entire segment
     */
    if (addr == seg->s_base && len == seg->s_size) {
        seg_free(seg);
        return (0);
    }

    opages = seg_pages(seg);
    dpages = btop(len);
    npages = opages - dpages;
    amp = svd->amp;

    /*
     * Check for beginning of segment
     */
    if (addr == seg->s_base) {
        if (svd->vpage != NULL) {
            size_t nbytes;
            struct vpage *ovpage;

            ovpage = svd->vpage;	/* keep pointer to vpage */

            nbytes = vpgtob(npages);
            svd->vpage = kmem_alloc(nbytes, KM_SLEEP);
            bcopy(&ovpage[dpages], svd->vpage, nbytes);

            /* free up old vpage */
            kmem_free(ovpage, vpgtob(opages));
        }
        if (amp != NULL) {
            ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
            if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) {
                /*
                 * Free up now unused parts of anon_map array.
                 */
                if (seg->s_szc != 0) {
                    anon_free_pages(amp->ahp,
                        svd->anon_index, len, seg->s_szc);
                } else {
                    anon_free(amp->ahp, svd->anon_index,
                        len);
                }

                /*
                 * Unreserve swap space for the unmapped chunk
                 * of this segment in case it's MAP_SHARED
                 */
                if (svd->type == MAP_SHARED) {
                    anon_unresv(len);
                    amp->swresv -= len;
                }
            }
            ANON_LOCK_EXIT(&amp->a_rwlock);
            svd->anon_index += dpages;
        }
        if (svd->vp != NULL)
            svd->offset += len;

        if (svd->swresv) {
            if (svd->flags & MAP_NORESERVE) {
                ASSERT(amp);
                oswresv = svd->swresv;

                svd->swresv = ptob(anon_pages(amp->ahp,
                    svd->anon_index, npages));
                anon_unresv(oswresv - svd->swresv);
            } else {
                anon_unresv(len);
                svd->swresv -= len;
            }
            TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
                seg, len, 0);
        }

        seg->s_base += len;
        seg->s_size -= len;
        return (0);
    }

    /*
     * Check for end of segment
     */
    if (addr + len == seg->s_base + seg->s_size) {
        if (svd->vpage != NULL) {
            size_t nbytes;
            struct vpage *ovpage;

            ovpage = svd->vpage;	/* keep pointer to vpage */

            nbytes = vpgtob(npages);
            svd->vpage = kmem_alloc(nbytes, KM_SLEEP);
            bcopy(ovpage, svd->vpage, nbytes);

            /* free up old vpage */
            kmem_free(ovpage, vpgtob(opages));

        }
        if (amp != NULL) {
            ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
            if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) {
                /*
                 * Free up now unused parts of anon_map array
                 */
                if (seg->s_szc != 0) {
                    ulong_t an_idx = svd->anon_index +
                        npages;
                    anon_free_pages(amp->ahp, an_idx,
                        len, seg->s_szc);
                } else {
                    anon_free(amp->ahp,
                        svd->anon_index + npages, len);
                }
                /*
                 * Unreserve swap space for the unmapped chunk
                 * of this segment in case it's MAP_SHARED
                 */
                if (svd->type == MAP_SHARED) {
                    anon_unresv(len);
                    amp->swresv -= len;
                }
            }
            ANON_LOCK_EXIT(&amp->a_rwlock);
        }

        if (svd->swresv) {
            if (svd->flags & MAP_NORESERVE) {
                ASSERT(amp);
                oswresv = svd->swresv;
                svd->swresv = ptob(anon_pages(amp->ahp,
                    svd->anon_index, npages));
                anon_unresv(oswresv - svd->swresv);
            } else {
                anon_unresv(len);
                svd->swresv -= len;
            }
            TRACE_3(TR_FAC_VM, TR_ANON_PROC,
                "anon proc:%p %lu %u", seg, len, 0);
        }

        seg->s_size -= len;
        return (0);
    }

    /*
     * The section to go is in the middle of the segment,
     * have to make it into two segments.  nseg is made for
     * the high end while seg is cut down at the low end.
     */
    nbase = addr + len;				/* new seg base */
    nsize = (seg->s_base + seg->s_size) - nbase;	/* new seg size */
    seg->s_size = addr - seg->s_base;	/* shrink old seg */
    nseg = seg_alloc(seg->s_as, nbase, nsize);
    if (nseg == NULL) {
        panic("segvn_unmap seg_alloc");
        /*NOTREACHED*/
    }
    nseg->s_ops = seg->s_ops;
    nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP);
    nseg->s_data = (void *)nsvd;
    nseg->s_szc = seg->s_szc;
    *nsvd = *svd;
    nsvd->offset = svd->offset + (uintptr_t)(nseg->s_base - seg->s_base);
    nsvd->swresv = 0;
    nsvd->softlockcnt = 0;

    if (svd->vp != NULL) {
        VN_HOLD(nsvd->vp);
        if (nsvd->type == MAP_SHARED)
            lgrp_shm_policy_init(NULL, nsvd->vp);
    }
    crhold(svd->cred);

    if (svd->vpage == NULL) {
        nsvd->vpage = NULL;
    } else {
        /* need to split vpage into two arrays */
        size_t nbytes;
        struct vpage *ovpage;

        ovpage = svd->vpage;		/* keep pointer to vpage */

        npages = seg_pages(seg);	/* seg has shrunk */
        nbytes = vpgtob(npages);
        svd->vpage = kmem_alloc(nbytes, KM_SLEEP);

        bcopy(ovpage, svd->vpage, nbytes);

        npages = seg_pages(nseg);
        nbytes = vpgtob(npages);
        nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP);

        bcopy(&ovpage[opages - npages], nsvd->vpage, nbytes);

        /* free up old vpage */
        kmem_free(ovpage, vpgtob(opages));
    }

    if (amp == NULL) {
        nsvd->amp = NULL;
        nsvd->anon_index = 0;
    } else {
        /*
         * Need to create a new anon map for the new segment.
         * We'll also allocate a new smaller array for the old
         * smaller segment to save space.
1687 */ 1688 opages = btop((uintptr_t)(addr - seg->s_base)); 1689 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1690 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { 1691 /* 1692 * Free up now unused parts of anon_map array 1693 */ 1694 if (seg->s_szc != 0) { 1695 ulong_t an_idx = svd->anon_index + opages; 1696 anon_free_pages(amp->ahp, an_idx, len, 1697 seg->s_szc); 1698 } else { 1699 anon_free(amp->ahp, svd->anon_index + opages, 1700 len); 1701 } 1702 1703 /* 1704 * Unreserve swap space for the unmapped chunk 1705 * of this segment in case it's MAP_SHARED 1706 */ 1707 if (svd->type == MAP_SHARED) { 1708 anon_unresv(len); 1709 amp->swresv -= len; 1710 } 1711 } 1712 1713 nsvd->anon_index = svd->anon_index + 1714 btop((uintptr_t)(nseg->s_base - seg->s_base)); 1715 if (svd->type == MAP_SHARED) { 1716 ASSERT(seg->s_szc == 0); 1717 amp->refcnt++; 1718 nsvd->amp = amp; 1719 } else { 1720 struct anon_map *namp; 1721 struct anon_hdr *nahp; 1722 1723 ASSERT(svd->type == MAP_PRIVATE); 1724 nahp = anon_create(btop(seg->s_size), ANON_SLEEP); 1725 namp = anonmap_alloc(nseg->s_size, 0); 1726 namp->a_szc = seg->s_szc; 1727 (void) anon_copy_ptr(amp->ahp, svd->anon_index, nahp, 1728 0, btop(seg->s_size), ANON_SLEEP); 1729 (void) anon_copy_ptr(amp->ahp, nsvd->anon_index, 1730 namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP); 1731 anon_release(amp->ahp, btop(amp->size)); 1732 svd->anon_index = 0; 1733 nsvd->anon_index = 0; 1734 amp->ahp = nahp; 1735 amp->size = seg->s_size; 1736 nsvd->amp = namp; 1737 } 1738 ANON_LOCK_EXIT(&->a_rwlock); 1739 } 1740 if (svd->swresv) { 1741 if (svd->flags & MAP_NORESERVE) { 1742 ASSERT(amp); 1743 oswresv = svd->swresv; 1744 svd->swresv = ptob(anon_pages(amp->ahp, 1745 svd->anon_index, btop(seg->s_size))); 1746 nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp, 1747 nsvd->anon_index, btop(nseg->s_size))); 1748 ASSERT(oswresv >= (svd->swresv + nsvd->swresv)); 1749 anon_unresv(oswresv - (svd->swresv + nsvd->swresv)); 1750 } else { 1751 if (seg->s_size + nseg->s_size + len != svd->swresv) { 1752 panic("segvn_unmap: " 1753 "cannot split swap reservation"); 1754 /*NOTREACHED*/ 1755 } 1756 anon_unresv(len); 1757 svd->swresv = seg->s_size; 1758 nsvd->swresv = nseg->s_size; 1759 } 1760 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 1761 seg, len, 0); 1762 } 1763 1764 return (0); /* I'm glad that's all over with! */ 1765 } 1766 1767 static void 1768 segvn_free(struct seg *seg) 1769 { 1770 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1771 pgcnt_t npages = seg_pages(seg); 1772 struct anon_map *amp; 1773 size_t len; 1774 1775 /* 1776 * We don't need any segment level locks for "segvn" data 1777 * since the address space is "write" locked. 1778 */ 1779 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 1780 1781 /* 1782 * Be sure to unlock pages. XXX Why do things get free'ed instead 1783 * of unmapped? XXX 1784 */ 1785 (void) segvn_lockop(seg, seg->s_base, seg->s_size, 1786 0, MC_UNLOCK, NULL, 0); 1787 1788 /* 1789 * Deallocate the vpage and anon pointers if necessary and possible. 1790 */ 1791 if (svd->vpage != NULL) { 1792 kmem_free(svd->vpage, vpgtob(npages)); 1793 svd->vpage = NULL; 1794 } 1795 if ((amp = svd->amp) != NULL) { 1796 /* 1797 * If there are no more references to this anon_map 1798 * structure, then deallocate the structure after freeing 1799 * up all the anon slot pointers that we can. 
1800 */ 1801 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1802 if (--amp->refcnt == 0) { 1803 if (svd->type == MAP_PRIVATE) { 1804 /* 1805 * Private - we only need to anon_free 1806 * the part that this segment refers to. 1807 */ 1808 if (seg->s_szc != 0) { 1809 anon_free_pages(amp->ahp, 1810 svd->anon_index, seg->s_size, 1811 seg->s_szc); 1812 } else { 1813 anon_free(amp->ahp, svd->anon_index, 1814 seg->s_size); 1815 } 1816 } else { 1817 /* 1818 * Shared - anon_free the entire 1819 * anon_map's worth of stuff and 1820 * release any swap reservation. 1821 */ 1822 ASSERT(seg->s_szc == 0); 1823 anon_free(amp->ahp, 0, amp->size); 1824 if ((len = amp->swresv) != 0) { 1825 anon_unresv(len); 1826 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 1827 "anon proc:%p %lu %u", 1828 seg, len, 0); 1829 } 1830 } 1831 svd->amp = NULL; 1832 ANON_LOCK_EXIT(&->a_rwlock); 1833 anonmap_free(amp); 1834 } else if (svd->type == MAP_PRIVATE) { 1835 /* 1836 * We had a private mapping which still has 1837 * a held anon_map so just free up all the 1838 * anon slot pointers that we were using. 1839 */ 1840 if (seg->s_szc != 0) { 1841 anon_free_pages(amp->ahp, svd->anon_index, 1842 seg->s_size, seg->s_szc); 1843 } else { 1844 anon_free(amp->ahp, svd->anon_index, 1845 seg->s_size); 1846 } 1847 ANON_LOCK_EXIT(&->a_rwlock); 1848 } else { 1849 ANON_LOCK_EXIT(&->a_rwlock); 1850 } 1851 } 1852 1853 /* 1854 * Release swap reservation. 1855 */ 1856 if ((len = svd->swresv) != 0) { 1857 anon_unresv(svd->swresv); 1858 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 1859 seg, len, 0); 1860 svd->swresv = 0; 1861 } 1862 /* 1863 * Release claim on vnode, credentials, and finally free the 1864 * private data. 1865 */ 1866 if (svd->vp != NULL) { 1867 if (svd->type == MAP_SHARED) 1868 lgrp_shm_policy_fini(NULL, svd->vp); 1869 VN_RELE(svd->vp); 1870 svd->vp = NULL; 1871 } 1872 crfree(svd->cred); 1873 svd->cred = NULL; 1874 1875 seg->s_data = NULL; 1876 kmem_cache_free(segvn_cache, svd); 1877 } 1878 1879 /* 1880 * Do a F_SOFTUNLOCK call over the range requested. The range must have 1881 * already been F_SOFTLOCK'ed. 1882 * Caller must always match addr and len of a softunlock with a previous 1883 * softlock with exactly the same addr and len. 1884 */ 1885 static void 1886 segvn_softunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw) 1887 { 1888 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1889 page_t *pp; 1890 caddr_t adr; 1891 struct vnode *vp; 1892 u_offset_t offset; 1893 ulong_t anon_index; 1894 struct anon_map *amp; 1895 struct anon *ap = NULL; 1896 1897 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 1898 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 1899 1900 if ((amp = svd->amp) != NULL) 1901 anon_index = svd->anon_index + seg_page(seg, addr); 1902 1903 hat_unlock(seg->s_as->a_hat, addr, len); 1904 for (adr = addr; adr < addr + len; adr += PAGESIZE) { 1905 if (amp != NULL) { 1906 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 1907 if ((ap = anon_get_ptr(amp->ahp, anon_index++)) 1908 != NULL) { 1909 swap_xlate(ap, &vp, &offset); 1910 } else { 1911 vp = svd->vp; 1912 offset = svd->offset + 1913 (uintptr_t)(adr - seg->s_base); 1914 } 1915 ANON_LOCK_EXIT(&->a_rwlock); 1916 } else { 1917 vp = svd->vp; 1918 offset = svd->offset + 1919 (uintptr_t)(adr - seg->s_base); 1920 } 1921 1922 /* 1923 * Use page_find() instead of page_lookup() to 1924 * find the page since we know that it is locked. 
1925 */ 1926 pp = page_find(vp, offset); 1927 if (pp == NULL) { 1928 panic( 1929 "segvn_softunlock: addr %p, ap %p, vp %p, off %llx", 1930 (void *)adr, (void *)ap, (void *)vp, offset); 1931 /*NOTREACHED*/ 1932 } 1933 1934 if (rw == S_WRITE) { 1935 hat_setrefmod(pp); 1936 if (seg->s_as->a_vbits) 1937 hat_setstat(seg->s_as, adr, PAGESIZE, 1938 P_REF | P_MOD); 1939 } else if (rw != S_OTHER) { 1940 hat_setref(pp); 1941 if (seg->s_as->a_vbits) 1942 hat_setstat(seg->s_as, adr, PAGESIZE, P_REF); 1943 } 1944 TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT, 1945 "segvn_fault:pp %p vp %p offset %llx", pp, vp, offset); 1946 page_unlock(pp); 1947 } 1948 mutex_enter(&freemem_lock); /* for availrmem */ 1949 availrmem += btop(len); 1950 segvn_pages_locked -= btop(len); 1951 svd->softlockcnt -= btop(len); 1952 mutex_exit(&freemem_lock); 1953 if (svd->softlockcnt == 0) { 1954 /* 1955 * All SOFTLOCKS are gone. Wakeup any waiting 1956 * unmappers so they can try again to unmap. 1957 * Check for waiters first without the mutex 1958 * held so we don't always grab the mutex on 1959 * softunlocks. 1960 */ 1961 if (AS_ISUNMAPWAIT(seg->s_as)) { 1962 mutex_enter(&seg->s_as->a_contents); 1963 if (AS_ISUNMAPWAIT(seg->s_as)) { 1964 AS_CLRUNMAPWAIT(seg->s_as); 1965 cv_broadcast(&seg->s_as->a_cv); 1966 } 1967 mutex_exit(&seg->s_as->a_contents); 1968 } 1969 } 1970 } 1971 1972 #define PAGE_HANDLED ((page_t *)-1) 1973 1974 /* 1975 * Release all the pages in the NULL terminated ppp list 1976 * which haven't already been converted to PAGE_HANDLED. 1977 */ 1978 static void 1979 segvn_pagelist_rele(page_t **ppp) 1980 { 1981 for (; *ppp != NULL; ppp++) { 1982 if (*ppp != PAGE_HANDLED) 1983 page_unlock(*ppp); 1984 } 1985 } 1986 1987 static int stealcow = 1; 1988 1989 /* 1990 * Workaround for viking chip bug. See bug id 1220902. 1991 * To fix this down in pagefault() would require importing so 1992 * much as and segvn code as to be unmaintainable. 1993 */ 1994 int enable_mbit_wa = 0; 1995 1996 /* 1997 * Handles all the dirty work of getting the right 1998 * anonymous pages and loading up the translations. 1999 * This routine is called only from segvn_fault() 2000 * when looping over the range of addresses requested. 
2001 * 2002 * The basic algorithm here is: 2003 * If this is an anon_zero case 2004 * Call anon_zero to allocate page 2005 * Load up translation 2006 * Return 2007 * endif 2008 * If this is an anon page 2009 * Use anon_getpage to get the page 2010 * else 2011 * Find page in pl[] list passed in 2012 * endif 2013 * If not a cow 2014 * Load up the translation to the page 2015 * return 2016 * endif 2017 * Call anon_private to handle cow 2018 * Load up (writable) translation to new page 2019 */ 2020 static faultcode_t 2021 segvn_faultpage( 2022 struct hat *hat, /* the hat to use for mapping */ 2023 struct seg *seg, /* seg_vn of interest */ 2024 caddr_t addr, /* address in as */ 2025 u_offset_t off, /* offset in vp */ 2026 struct vpage *vpage, /* pointer to vpage for vp, off */ 2027 page_t *pl[], /* object source page pointer */ 2028 uint_t vpprot, /* access allowed to object pages */ 2029 enum fault_type type, /* type of fault */ 2030 enum seg_rw rw, /* type of access at fault */ 2031 int brkcow) /* we may need to break cow */ 2032 { 2033 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 2034 page_t *pp, **ppp; 2035 uint_t pageflags = 0; 2036 page_t *anon_pl[1 + 1]; 2037 page_t *opp = NULL; /* original page */ 2038 uint_t prot; 2039 int err; 2040 int cow; 2041 int claim; 2042 int steal = 0; 2043 ulong_t anon_index; 2044 struct anon *ap, *oldap; 2045 struct anon_map *amp; 2046 int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD; 2047 int anon_lock = 0; 2048 anon_sync_obj_t cookie; 2049 2050 if (svd->flags & MAP_TEXT) { 2051 hat_flag |= HAT_LOAD_TEXT; 2052 } 2053 2054 ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock)); 2055 ASSERT(seg->s_szc == 0); 2056 2057 /* 2058 * Initialize protection value for this page. 2059 * If we have per page protection values check it now. 2060 */ 2061 if (svd->pageprot) { 2062 uint_t protchk; 2063 2064 switch (rw) { 2065 case S_READ: 2066 protchk = PROT_READ; 2067 break; 2068 case S_WRITE: 2069 protchk = PROT_WRITE; 2070 break; 2071 case S_EXEC: 2072 protchk = PROT_EXEC; 2073 break; 2074 case S_OTHER: 2075 default: 2076 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 2077 break; 2078 } 2079 2080 prot = VPP_PROT(vpage); 2081 if ((prot & protchk) == 0) 2082 return (FC_PROT); /* illegal access type */ 2083 } else { 2084 prot = svd->prot; 2085 } 2086 2087 if (type == F_SOFTLOCK) { 2088 mutex_enter(&freemem_lock); 2089 if (availrmem <= tune.t_minarmem) { 2090 mutex_exit(&freemem_lock); 2091 return (FC_MAKE_ERR(ENOMEM)); /* out of real memory */ 2092 } else { 2093 svd->softlockcnt++; 2094 availrmem--; 2095 segvn_pages_locked++; 2096 } 2097 mutex_exit(&freemem_lock); 2098 } 2099 2100 /* 2101 * Always acquire the anon array lock to prevent 2 threads from 2102 * allocating separate anon slots for the same "addr". 2103 */ 2104 2105 if ((amp = svd->amp) != NULL) { 2106 ASSERT(RW_READ_HELD(&->a_rwlock)); 2107 anon_index = svd->anon_index + seg_page(seg, addr); 2108 anon_array_enter(amp, anon_index, &cookie); 2109 anon_lock = 1; 2110 } 2111 2112 if (svd->vp == NULL && amp != NULL) { 2113 if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) { 2114 /* 2115 * Allocate a (normally) writable anonymous page of 2116 * zeroes. If no advance reservations, reserve now. 
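 *
 * (This is the zero-fill-on-demand case. For a MAP_NORESERVE
 * mapping -- e.g. one created with mmap(..., MAP_ANON |
 * MAP_NORESERVE, ...) -- no swap was reserved at map time, so the
 * reservation is taken here, one page at a time, as pages are
 * first touched.)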
2117 */ 2118 if (svd->flags & MAP_NORESERVE) { 2119 if (anon_resv(ptob(1))) { 2120 svd->swresv += ptob(1); 2121 } else { 2122 err = ENOMEM; 2123 goto out; 2124 } 2125 } 2126 if ((pp = anon_zero(seg, addr, &ap, 2127 svd->cred)) == NULL) { 2128 err = ENOMEM; 2129 goto out; /* out of swap space */ 2130 } 2131 /* 2132 * Re-acquire the anon_map lock and 2133 * initialize the anon array entry. 2134 */ 2135 (void) anon_set_ptr(amp->ahp, anon_index, ap, 2136 ANON_SLEEP); 2137 if (enable_mbit_wa) { 2138 if (rw == S_WRITE) 2139 hat_setmod(pp); 2140 else if (!hat_ismod(pp)) 2141 prot &= ~PROT_WRITE; 2142 } 2143 /* 2144 * If AS_PAGLCK is set in a_flags (via memcntl(2) 2145 * with MC_LOCKAS, MCL_FUTURE) and this is a 2146 * MAP_NORESERVE segment, we may need to 2147 * permanently lock the page as it is being faulted 2148 * for the first time. The following text applies 2149 * only to MAP_NORESERVE segments: 2150 * 2151 * As per memcntl(2), if this segment was created 2152 * after MCL_FUTURE was applied (a "future" 2153 * segment), its pages must be locked. If this 2154 * segment existed at MCL_FUTURE application (a 2155 * "past" segment), the interface is unclear. 2156 * 2157 * We decide to lock only if vpage is present: 2158 * 2159 * - "future" segments will have a vpage array (see 2160 * as_map), and so will be locked as required 2161 * 2162 * - "past" segments may not have a vpage array, 2163 * depending on whether events (such as 2164 * mprotect) have occurred. Locking if vpage 2165 * exists will preserve legacy behavior. Not 2166 * locking if vpage is absent, will not break 2167 * the interface or legacy behavior. Note that 2168 * allocating vpage here if it's absent requires 2169 * upgrading the segvn reader lock, the cost of 2170 * which does not seem worthwhile. 2171 */ 2172 if (AS_ISPGLCK(seg->s_as) && vpage != NULL && 2173 (svd->flags & MAP_NORESERVE)) { 2174 claim = VPP_PROT(vpage) & PROT_WRITE; 2175 ASSERT(svd->type == MAP_PRIVATE); 2176 if (page_pp_lock(pp, claim, 0)) 2177 VPP_SETPPLOCK(vpage); 2178 } 2179 2180 2181 /* 2182 * Handle pages that have been marked for migration 2183 */ 2184 if (lgrp_optimizations()) 2185 page_migrate(seg, addr, &pp, 1); 2186 hat_memload(hat, addr, pp, prot, hat_flag); 2187 2188 if (!(hat_flag & HAT_LOAD_LOCK)) 2189 page_unlock(pp); 2190 2191 anon_array_exit(&cookie); 2192 return (0); 2193 } 2194 } 2195 2196 /* 2197 * Obtain the page structure via anon_getpage() if it is 2198 * a private copy of an object (the result of a previous 2199 * copy-on-write). 2200 */ 2201 if (amp != NULL) { 2202 if ((ap = anon_get_ptr(amp->ahp, anon_index)) != NULL) { 2203 err = anon_getpage(&ap, &vpprot, anon_pl, PAGESIZE, 2204 seg, addr, rw, svd->cred); 2205 if (err) 2206 goto out; 2207 2208 if (svd->type == MAP_SHARED) { 2209 /* 2210 * If this is a shared mapping to an 2211 * anon_map, then ignore the write 2212 * permissions returned by anon_getpage(). 2213 * They apply to the private mappings 2214 * of this anon_map. 2215 */ 2216 vpprot |= PROT_WRITE; 2217 } 2218 opp = anon_pl[0]; 2219 } 2220 } 2221 2222 /* 2223 * Search the pl[] list passed in if it is from the 2224 * original object (i.e., not a private copy). 2225 */ 2226 if (opp == NULL) { 2227 /* 2228 * Find original page. We must be bringing it in 2229 * from the list in pl[]. 
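 * pl[] is the page list that segvn_fault() filled in via
 * VOP_GETPAGE() over the non-anonymous part of the range; entries
 * already consumed are overwritten with PAGE_HANDLED so that this
 * loop and segvn_pagelist_rele() both skip them.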
2230 */ 2231 for (ppp = pl; (opp = *ppp) != NULL; ppp++) { 2232 if (opp == PAGE_HANDLED) 2233 continue; 2234 ASSERT(opp->p_vnode == svd->vp); /* XXX */ 2235 if (opp->p_offset == off) 2236 break; 2237 } 2238 if (opp == NULL) { 2239 panic("segvn_faultpage not found"); 2240 /*NOTREACHED*/ 2241 } 2242 *ppp = PAGE_HANDLED; 2243 2244 } 2245 2246 ASSERT(PAGE_LOCKED(opp)); 2247 2248 TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT, 2249 "segvn_fault:pp %p vp %p offset %llx", 2250 opp, NULL, 0); 2251 2252 /* 2253 * The fault is treated as a copy-on-write fault if a 2254 * write occurs on a private segment and the object 2255 * page (i.e., mapping) is write protected. We assume 2256 * that fatal protection checks have already been made. 2257 */ 2258 2259 cow = brkcow && ((vpprot & PROT_WRITE) == 0); 2260 2261 /* 2262 * If not a copy-on-write case load the translation 2263 * and return. 2264 */ 2265 if (cow == 0) { 2266 if (IS_VMODSORT(opp->p_vnode) || enable_mbit_wa) { 2267 if (rw == S_WRITE) 2268 hat_setmod(opp); 2269 else if (rw != S_OTHER && !hat_ismod(opp)) 2270 prot &= ~PROT_WRITE; 2271 } 2272 2273 /* 2274 * Handle pages that have been marked for migration 2275 */ 2276 if (lgrp_optimizations()) 2277 page_migrate(seg, addr, &opp, 1); 2278 2279 hat_memload(hat, addr, opp, prot & vpprot, hat_flag); 2280 2281 if (!(hat_flag & HAT_LOAD_LOCK)) 2282 page_unlock(opp); 2283 2284 if (anon_lock) { 2285 anon_array_exit(&cookie); 2286 } 2287 return (0); 2288 } 2289 2290 hat_setref(opp); 2291 2292 ASSERT(amp != NULL && anon_lock); 2293 2294 /* 2295 * Steal the page only if it isn't a private page 2296 * since stealing a private page is not worth the effort. 2297 */ 2298 if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) 2299 steal = 1; 2300 2301 /* 2302 * Steal the original page if the following conditions are true: 2303 * 2304 * We are low on memory, the page is not private, page is not large, 2305 * not shared, not modified, not `locked' or if we have it `locked' 2306 * (i.e., p_cowcnt == 1 and p_lckcnt == 0, which also implies 2307 * that the page is not shared) and if it doesn't have any 2308 * translations. page_struct_lock isn't needed to look at p_cowcnt 2309 * and p_lckcnt because we first get exclusive lock on page. 2310 */ 2311 (void) hat_pagesync(opp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD); 2312 2313 if (stealcow && freemem < minfree && steal && opp->p_szc == 0 && 2314 page_tryupgrade(opp) && !hat_ismod(opp) && 2315 ((opp->p_lckcnt == 0 && opp->p_cowcnt == 0) || 2316 (opp->p_lckcnt == 0 && opp->p_cowcnt == 1 && 2317 vpage != NULL && VPP_ISPPLOCK(vpage)))) { 2318 /* 2319 * Check if this page has other translations 2320 * after unloading our translation. 2321 */ 2322 if (hat_page_is_mapped(opp)) { 2323 hat_unload(seg->s_as->a_hat, addr, PAGESIZE, 2324 HAT_UNLOAD); 2325 } 2326 2327 /* 2328 * hat_unload() might sync back someone else's recent 2329 * modification, so check again. 2330 */ 2331 if (!hat_ismod(opp) && !hat_page_is_mapped(opp)) 2332 pageflags |= STEAL_PAGE; 2333 } 2334 2335 /* 2336 * If we have a vpage pointer, see if it indicates that we have 2337 * ``locked'' the page we map -- if so, tell anon_private to 2338 * transfer the locking resource to the new page. 2339 * 2340 * See Statement at the beginning of segvn_lockop regarding 2341 * the way lockcnts/cowcnts are handled during COW. 2342 * 2343 */ 2344 if (vpage != NULL && VPP_ISPPLOCK(vpage)) 2345 pageflags |= LOCK_PAGE; 2346 2347 /* 2348 * Allocate a private page and perform the copy. 
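 * (anon_private() either copies opp into a newly allocated anon page
 * or, when STEAL_PAGE was set above, takes over opp itself; LOCK_PAGE
 * tells it to transfer the page-lock accounting to the new page.)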
2349 * For MAP_NORESERVE reserve swap space now, unless this 2350 * is a cow fault on an existing anon page in which case 2351 * MAP_NORESERVE will have made advance reservations. 2352 */ 2353 if ((svd->flags & MAP_NORESERVE) && (ap == NULL)) { 2354 if (anon_resv(ptob(1))) { 2355 svd->swresv += ptob(1); 2356 } else { 2357 page_unlock(opp); 2358 err = ENOMEM; 2359 goto out; 2360 } 2361 } 2362 oldap = ap; 2363 pp = anon_private(&ap, seg, addr, prot, opp, pageflags, svd->cred); 2364 if (pp == NULL) { 2365 err = ENOMEM; /* out of swap space */ 2366 goto out; 2367 } 2368 2369 /* 2370 * If we copied away from an anonymous page, then 2371 * we are one step closer to freeing up an anon slot. 2372 * 2373 * NOTE: The original anon slot must be released while 2374 * holding the "anon_map" lock. This is necessary to prevent 2375 * other threads from obtaining a pointer to the anon slot 2376 * which may be freed if its "refcnt" is 1. 2377 */ 2378 if (oldap != NULL) 2379 anon_decref(oldap); 2380 2381 (void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP); 2382 2383 ASSERT(!IS_VMODSORT(pp->p_vnode)); 2384 if (enable_mbit_wa) { 2385 if (rw == S_WRITE) 2386 hat_setmod(pp); 2387 else if (!hat_ismod(pp)) 2388 prot &= ~PROT_WRITE; 2389 } 2390 2391 2392 /* 2393 * Handle pages that have been marked for migration 2394 */ 2395 if (lgrp_optimizations()) 2396 page_migrate(seg, addr, &pp, 1); 2397 hat_memload(hat, addr, pp, prot, hat_flag); 2398 2399 if (!(hat_flag & HAT_LOAD_LOCK)) 2400 page_unlock(pp); 2401 2402 ASSERT(anon_lock); 2403 anon_array_exit(&cookie); 2404 return (0); 2405 out: 2406 if (anon_lock) 2407 anon_array_exit(&cookie); 2408 2409 if (type == F_SOFTLOCK) { 2410 mutex_enter(&freemem_lock); 2411 availrmem++; 2412 segvn_pages_locked--; 2413 svd->softlockcnt--; 2414 mutex_exit(&freemem_lock); 2415 } 2416 return (FC_MAKE_ERR(err)); 2417 } 2418 2419 /* 2420 * relocate a bunch of smaller targ pages into one large repl page. all targ 2421 * pages must be complete pages smaller than replacement pages. 2422 * it's assumed that no page's szc can change since they are all PAGESIZE or 2423 * complete large pages locked SHARED. 
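 *
 * Informal sketch of the operation:
 *	For each root page in targ[]
 *		Peel the matching constituent pages off the
 *		replacement list
 *		Call page_relocate() to move it into them
 *	endfor
 *	Re-point every targ[i] at the corresponding constituent of
 *	the replacement page and page_downgrade() it back to a
 *	shared lock for the caller.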
2424 */ 2425 static void 2426 segvn_relocate_pages(page_t **targ, page_t *replacement) 2427 { 2428 page_t *pp; 2429 pgcnt_t repl_npgs, curnpgs; 2430 pgcnt_t i; 2431 uint_t repl_szc = replacement->p_szc; 2432 page_t *first_repl = replacement; 2433 page_t *repl; 2434 spgcnt_t npgs; 2435 2436 VM_STAT_ADD(segvnvmstats.relocatepages[0]); 2437 2438 ASSERT(repl_szc != 0); 2439 npgs = repl_npgs = page_get_pagecnt(repl_szc); 2440 2441 i = 0; 2442 while (repl_npgs) { 2443 spgcnt_t nreloc; 2444 int err; 2445 ASSERT(replacement != NULL); 2446 pp = targ[i]; 2447 ASSERT(pp->p_szc < repl_szc); 2448 ASSERT(PAGE_EXCL(pp)); 2449 ASSERT(!PP_ISFREE(pp)); 2450 curnpgs = page_get_pagecnt(pp->p_szc); 2451 if (curnpgs == 1) { 2452 VM_STAT_ADD(segvnvmstats.relocatepages[1]); 2453 repl = replacement; 2454 page_sub(&replacement, repl); 2455 ASSERT(PAGE_EXCL(repl)); 2456 ASSERT(!PP_ISFREE(repl)); 2457 ASSERT(repl->p_szc == repl_szc); 2458 } else { 2459 page_t *repl_savepp; 2460 int j; 2461 VM_STAT_ADD(segvnvmstats.relocatepages[2]); 2462 repl_savepp = replacement; 2463 for (j = 0; j < curnpgs; j++) { 2464 repl = replacement; 2465 page_sub(&replacement, repl); 2466 ASSERT(PAGE_EXCL(repl)); 2467 ASSERT(!PP_ISFREE(repl)); 2468 ASSERT(repl->p_szc == repl_szc); 2469 ASSERT(page_pptonum(targ[i + j]) == 2470 page_pptonum(targ[i]) + j); 2471 } 2472 repl = repl_savepp; 2473 ASSERT(IS_P2ALIGNED(page_pptonum(repl), curnpgs)); 2474 } 2475 err = page_relocate(&pp, &repl, 0, 1, &nreloc, NULL); 2476 if (err || nreloc != curnpgs) { 2477 panic("segvn_relocate_pages: " 2478 "page_relocate failed err=%d curnpgs=%ld " 2479 "nreloc=%ld", err, curnpgs, nreloc); 2480 } 2481 ASSERT(curnpgs <= repl_npgs); 2482 repl_npgs -= curnpgs; 2483 i += curnpgs; 2484 } 2485 ASSERT(replacement == NULL); 2486 2487 repl = first_repl; 2488 repl_npgs = npgs; 2489 for (i = 0; i < repl_npgs; i++) { 2490 ASSERT(PAGE_EXCL(repl)); 2491 ASSERT(!PP_ISFREE(repl)); 2492 targ[i] = repl; 2493 page_downgrade(targ[i]); 2494 repl++; 2495 } 2496 } 2497 2498 /* 2499 * Check if all pages in ppa array are complete smaller than szc pages and 2500 * their roots will still be aligned relative to their current size if the 2501 * entire ppa array is relocated into one szc page. If these conditions are 2502 * not met return 0. 2503 * 2504 * If all pages are properly aligned attempt to upgrade their locks 2505 * to exclusive mode. If it fails set *upgrdfail to 1 and return 0. 2506 * upgrdfail was set to 0 by caller. 2507 * 2508 * Return 1 if all pages are aligned and locked exclusively. 2509 * 2510 * If all pages in ppa array happen to be physically contiguous to make one 2511 * szc page and all exclusive locks are successfully obtained promote the page 2512 * size to szc and set *pszc to szc. Return 1 with pages locked shared. 
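 *
 * Typical use by the vnode fault path (informal sketch):
 *	upgrdfail = 0;
 *	if (!segvn_full_szcpages(ppa, szc, &upgrdfail, &pszc)) {
 *		if (upgrdfail)
 *			size down and retry the fault
 *		else
 *			map the range with PAGESIZE mappings
 *	}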
2513 */ 2514 static int 2515 segvn_full_szcpages(page_t **ppa, uint_t szc, int *upgrdfail, uint_t *pszc) 2516 { 2517 page_t *pp; 2518 pfn_t pfn; 2519 pgcnt_t totnpgs = page_get_pagecnt(szc); 2520 pfn_t first_pfn; 2521 int contig = 1; 2522 pgcnt_t i; 2523 pgcnt_t j; 2524 uint_t curszc; 2525 pgcnt_t curnpgs; 2526 int root = 0; 2527 2528 ASSERT(szc > 0); 2529 2530 VM_STAT_ADD(segvnvmstats.fullszcpages[0]); 2531 2532 for (i = 0; i < totnpgs; i++) { 2533 pp = ppa[i]; 2534 ASSERT(PAGE_SHARED(pp)); 2535 ASSERT(!PP_ISFREE(pp)); 2536 pfn = page_pptonum(pp); 2537 if (i == 0) { 2538 if (!IS_P2ALIGNED(pfn, totnpgs)) { 2539 contig = 0; 2540 } else { 2541 first_pfn = pfn; 2542 } 2543 } else if (contig && pfn != first_pfn + i) { 2544 contig = 0; 2545 } 2546 if (pp->p_szc == 0) { 2547 if (root) { 2548 VM_STAT_ADD(segvnvmstats.fullszcpages[1]); 2549 return (0); 2550 } 2551 } else if (!root) { 2552 if ((curszc = pp->p_szc) >= szc) { 2553 VM_STAT_ADD(segvnvmstats.fullszcpages[2]); 2554 return (0); 2555 } 2556 if (curszc == 0) { 2557 /* 2558 * p_szc changed means we don't have all pages 2559 * locked. return failure. 2560 */ 2561 VM_STAT_ADD(segvnvmstats.fullszcpages[3]); 2562 return (0); 2563 } 2564 curnpgs = page_get_pagecnt(curszc); 2565 if (!IS_P2ALIGNED(pfn, curnpgs) || 2566 !IS_P2ALIGNED(i, curnpgs)) { 2567 VM_STAT_ADD(segvnvmstats.fullszcpages[4]); 2568 return (0); 2569 } 2570 root = 1; 2571 } else { 2572 ASSERT(i > 0); 2573 VM_STAT_ADD(segvnvmstats.fullszcpages[5]); 2574 if (pp->p_szc != curszc) { 2575 VM_STAT_ADD(segvnvmstats.fullszcpages[6]); 2576 return (0); 2577 } 2578 if (pfn - 1 != page_pptonum(ppa[i - 1])) { 2579 panic("segvn_full_szcpages: " 2580 "large page not physically contiguous"); 2581 } 2582 if (P2PHASE(pfn, curnpgs) == curnpgs - 1) { 2583 root = 0; 2584 } 2585 } 2586 } 2587 2588 for (i = 0; i < totnpgs; i++) { 2589 ASSERT(ppa[i]->p_szc < szc); 2590 if (!page_tryupgrade(ppa[i])) { 2591 for (j = 0; j < i; j++) { 2592 page_downgrade(ppa[j]); 2593 } 2594 *pszc = ppa[i]->p_szc; 2595 *upgrdfail = 1; 2596 VM_STAT_ADD(segvnvmstats.fullszcpages[7]); 2597 return (0); 2598 } 2599 } 2600 2601 /* 2602 * When a page is put a free cachelist its szc is set to 0. if file 2603 * system reclaimed pages from cachelist targ pages will be physically 2604 * contiguous with 0 p_szc. in this case just upgrade szc of targ 2605 * pages without any relocations. 2606 * To avoid any hat issues with previous small mappings 2607 * hat_pageunload() the target pages first. 2608 */ 2609 if (contig) { 2610 VM_STAT_ADD(segvnvmstats.fullszcpages[8]); 2611 for (i = 0; i < totnpgs; i++) { 2612 (void) hat_pageunload(ppa[i], HAT_FORCE_PGUNLOAD); 2613 } 2614 for (i = 0; i < totnpgs; i++) { 2615 ppa[i]->p_szc = szc; 2616 } 2617 for (i = 0; i < totnpgs; i++) { 2618 ASSERT(PAGE_EXCL(ppa[i])); 2619 page_downgrade(ppa[i]); 2620 } 2621 if (pszc != NULL) { 2622 *pszc = szc; 2623 } 2624 } 2625 VM_STAT_ADD(segvnvmstats.fullszcpages[9]); 2626 return (1); 2627 } 2628 2629 /* 2630 * Create physically contiguous pages for [vp, off] - [vp, off + 2631 * page_size(szc)) range and for private segment return them in ppa array. 2632 * Pages are created either via IO or relocations. 2633 * 2634 * Return 1 on sucess and 0 on failure. 2635 * 2636 * If physically contiguos pages already exist for this range return 1 without 2637 * filling ppa array. Caller initializes ppa[0] as NULL to detect that ppa 2638 * array wasn't filled. In this case caller fills ppa array via VOP_GETPAGE(). 
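 *
 * Typical call from segvn_fault_vnodepages() (informal sketch):
 *	ppa[0] = NULL;
 *	physcontig = segvn_fill_vp_pages(svd, vp, off, szc, ppa,
 *	    &pplist, &pszc, &downsize);
 *	if (!physcontig && downsize)
 *		size down and retry the fault with smaller pages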
2639 */ 2640 2641 static int 2642 segvn_fill_vp_pages(struct segvn_data *svd, vnode_t *vp, u_offset_t off, 2643 uint_t szc, page_t **ppa, page_t **ppplist, uint_t *ret_pszc, 2644 int *downsize) 2645 2646 { 2647 page_t *pplist = *ppplist; 2648 size_t pgsz = page_get_pagesize(szc); 2649 pgcnt_t pages = btop(pgsz); 2650 ulong_t start_off = off; 2651 u_offset_t eoff = off + pgsz; 2652 spgcnt_t nreloc; 2653 u_offset_t io_off = off; 2654 size_t io_len; 2655 page_t *io_pplist = NULL; 2656 page_t *done_pplist = NULL; 2657 pgcnt_t pgidx = 0; 2658 page_t *pp; 2659 page_t *newpp; 2660 page_t *targpp; 2661 int io_err = 0; 2662 int i; 2663 pfn_t pfn; 2664 ulong_t ppages; 2665 page_t *targ_pplist = NULL; 2666 page_t *repl_pplist = NULL; 2667 page_t *tmp_pplist; 2668 int nios = 0; 2669 uint_t pszc; 2670 struct vattr va; 2671 2672 VM_STAT_ADD(segvnvmstats.fill_vp_pages[0]); 2673 2674 ASSERT(szc != 0); 2675 ASSERT(pplist->p_szc == szc); 2676 2677 /* 2678 * downsize will be set to 1 only if we fail to lock pages. this will 2679 * allow subsequent faults to try to relocate the page again. If we 2680 * fail due to misalignment don't downsize and let the caller map the 2681 * whole region with small mappings to avoid more faults into the area 2682 * where we can't get large pages anyway. 2683 */ 2684 *downsize = 0; 2685 2686 while (off < eoff) { 2687 newpp = pplist; 2688 ASSERT(newpp != NULL); 2689 ASSERT(PAGE_EXCL(newpp)); 2690 ASSERT(!PP_ISFREE(newpp)); 2691 /* 2692 * we pass NULL for nrelocp to page_lookup_create() 2693 * so that it doesn't relocate. We relocate here 2694 * later only after we make sure we can lock all 2695 * pages in the range we handle and they are all 2696 * aligned. 2697 */ 2698 pp = page_lookup_create(vp, off, SE_SHARED, newpp, NULL, 0); 2699 ASSERT(pp != NULL); 2700 ASSERT(!PP_ISFREE(pp)); 2701 ASSERT(pp->p_vnode == vp); 2702 ASSERT(pp->p_offset == off); 2703 if (pp == newpp) { 2704 VM_STAT_ADD(segvnvmstats.fill_vp_pages[1]); 2705 page_sub(&pplist, pp); 2706 ASSERT(PAGE_EXCL(pp)); 2707 ASSERT(page_iolock_assert(pp)); 2708 page_list_concat(&io_pplist, &pp); 2709 off += PAGESIZE; 2710 continue; 2711 } 2712 VM_STAT_ADD(segvnvmstats.fill_vp_pages[2]); 2713 pfn = page_pptonum(pp); 2714 pszc = pp->p_szc; 2715 if (pszc >= szc && targ_pplist == NULL && io_pplist == NULL && 2716 IS_P2ALIGNED(pfn, pages)) { 2717 ASSERT(repl_pplist == NULL); 2718 ASSERT(done_pplist == NULL); 2719 ASSERT(pplist == *ppplist); 2720 page_unlock(pp); 2721 page_free_replacement_page(pplist); 2722 page_create_putback(pages); 2723 *ppplist = NULL; 2724 VM_STAT_ADD(segvnvmstats.fill_vp_pages[3]); 2725 return (1); 2726 } 2727 if (pszc >= szc) { 2728 page_unlock(pp); 2729 segvn_faultvnmpss_align_err1++; 2730 goto out; 2731 } 2732 ppages = page_get_pagecnt(pszc); 2733 if (!IS_P2ALIGNED(pfn, ppages)) { 2734 ASSERT(pszc > 0); 2735 /* 2736 * sizing down to pszc won't help. 2737 */ 2738 page_unlock(pp); 2739 segvn_faultvnmpss_align_err2++; 2740 goto out; 2741 } 2742 pfn = page_pptonum(newpp); 2743 if (!IS_P2ALIGNED(pfn, ppages)) { 2744 ASSERT(pszc > 0); 2745 /* 2746 * sizing down to pszc won't help. 
2747 */ 2748 page_unlock(pp); 2749 segvn_faultvnmpss_align_err3++; 2750 goto out; 2751 } 2752 if (!PAGE_EXCL(pp)) { 2753 VM_STAT_ADD(segvnvmstats.fill_vp_pages[4]); 2754 page_unlock(pp); 2755 *downsize = 1; 2756 *ret_pszc = pp->p_szc; 2757 goto out; 2758 } 2759 targpp = pp; 2760 if (io_pplist != NULL) { 2761 VM_STAT_ADD(segvnvmstats.fill_vp_pages[5]); 2762 io_len = off - io_off; 2763 /* 2764 * Some file systems like NFS don't check EOF 2765 * conditions in VOP_PAGEIO(). Check it here 2766 * now that pages are locked SE_EXCL. Any file 2767 * truncation will wait until the pages are 2768 * unlocked so no need to worry that file will 2769 * be truncated after we check its size here. 2770 * XXX fix NFS to remove this check. 2771 */ 2772 va.va_mask = AT_SIZE; 2773 if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred) != 0) { 2774 VM_STAT_ADD(segvnvmstats.fill_vp_pages[6]); 2775 page_unlock(targpp); 2776 goto out; 2777 } 2778 if (btopr(va.va_size) < btopr(io_off + io_len)) { 2779 VM_STAT_ADD(segvnvmstats.fill_vp_pages[7]); 2780 *downsize = 1; 2781 *ret_pszc = 0; 2782 page_unlock(targpp); 2783 goto out; 2784 } 2785 io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len, 2786 B_READ, svd->cred); 2787 if (io_err) { 2788 VM_STAT_ADD(segvnvmstats.fill_vp_pages[8]); 2789 page_unlock(targpp); 2790 if (io_err == EDEADLK) { 2791 segvn_vmpss_pageio_deadlk_err++; 2792 } 2793 goto out; 2794 } 2795 nios++; 2796 VM_STAT_ADD(segvnvmstats.fill_vp_pages[9]); 2797 while (io_pplist != NULL) { 2798 pp = io_pplist; 2799 page_sub(&io_pplist, pp); 2800 ASSERT(page_iolock_assert(pp)); 2801 page_io_unlock(pp); 2802 pgidx = (pp->p_offset - start_off) >> 2803 PAGESHIFT; 2804 ASSERT(pgidx < pages); 2805 ppa[pgidx] = pp; 2806 page_list_concat(&done_pplist, &pp); 2807 } 2808 } 2809 pp = targpp; 2810 ASSERT(PAGE_EXCL(pp)); 2811 ASSERT(pp->p_szc <= pszc); 2812 if (pszc != 0 && !group_page_trylock(pp, SE_EXCL)) { 2813 VM_STAT_ADD(segvnvmstats.fill_vp_pages[10]); 2814 page_unlock(pp); 2815 *downsize = 1; 2816 *ret_pszc = pp->p_szc; 2817 goto out; 2818 } 2819 VM_STAT_ADD(segvnvmstats.fill_vp_pages[11]); 2820 /* 2821 * page szc chould have changed before the entire group was 2822 * locked. reread page szc. 
2823 */ 2824 pszc = pp->p_szc; 2825 ppages = page_get_pagecnt(pszc); 2826 2827 /* link just the roots */ 2828 page_list_concat(&targ_pplist, &pp); 2829 page_sub(&pplist, newpp); 2830 page_list_concat(&repl_pplist, &newpp); 2831 off += PAGESIZE; 2832 while (--ppages != 0) { 2833 newpp = pplist; 2834 page_sub(&pplist, newpp); 2835 off += PAGESIZE; 2836 } 2837 io_off = off; 2838 } 2839 if (io_pplist != NULL) { 2840 VM_STAT_ADD(segvnvmstats.fill_vp_pages[12]); 2841 io_len = eoff - io_off; 2842 va.va_mask = AT_SIZE; 2843 if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred) != 0) { 2844 VM_STAT_ADD(segvnvmstats.fill_vp_pages[13]); 2845 goto out; 2846 } 2847 if (btopr(va.va_size) < btopr(io_off + io_len)) { 2848 VM_STAT_ADD(segvnvmstats.fill_vp_pages[14]); 2849 *downsize = 1; 2850 *ret_pszc = 0; 2851 goto out; 2852 } 2853 io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len, 2854 B_READ, svd->cred); 2855 if (io_err) { 2856 VM_STAT_ADD(segvnvmstats.fill_vp_pages[15]); 2857 if (io_err == EDEADLK) { 2858 segvn_vmpss_pageio_deadlk_err++; 2859 } 2860 goto out; 2861 } 2862 nios++; 2863 while (io_pplist != NULL) { 2864 pp = io_pplist; 2865 page_sub(&io_pplist, pp); 2866 ASSERT(page_iolock_assert(pp)); 2867 page_io_unlock(pp); 2868 pgidx = (pp->p_offset - start_off) >> PAGESHIFT; 2869 ASSERT(pgidx < pages); 2870 ppa[pgidx] = pp; 2871 } 2872 } 2873 /* 2874 * we're now bound to succeed or panic. 2875 * remove pages from done_pplist. it's not needed anymore. 2876 */ 2877 while (done_pplist != NULL) { 2878 pp = done_pplist; 2879 page_sub(&done_pplist, pp); 2880 } 2881 VM_STAT_ADD(segvnvmstats.fill_vp_pages[16]); 2882 ASSERT(pplist == NULL); 2883 *ppplist = NULL; 2884 while (targ_pplist != NULL) { 2885 int ret; 2886 VM_STAT_ADD(segvnvmstats.fill_vp_pages[17]); 2887 ASSERT(repl_pplist); 2888 pp = targ_pplist; 2889 page_sub(&targ_pplist, pp); 2890 pgidx = (pp->p_offset - start_off) >> PAGESHIFT; 2891 newpp = repl_pplist; 2892 page_sub(&repl_pplist, newpp); 2893 #ifdef DEBUG 2894 pfn = page_pptonum(pp); 2895 pszc = pp->p_szc; 2896 ppages = page_get_pagecnt(pszc); 2897 ASSERT(IS_P2ALIGNED(pfn, ppages)); 2898 pfn = page_pptonum(newpp); 2899 ASSERT(IS_P2ALIGNED(pfn, ppages)); 2900 ASSERT(P2PHASE(pfn, pages) == pgidx); 2901 #endif 2902 nreloc = 0; 2903 ret = page_relocate(&pp, &newpp, 0, 1, &nreloc, NULL); 2904 if (ret != 0 || nreloc == 0) { 2905 panic("segvn_fill_vp_pages: " 2906 "page_relocate failed"); 2907 } 2908 pp = newpp; 2909 while (nreloc-- != 0) { 2910 ASSERT(PAGE_EXCL(pp)); 2911 ASSERT(pp->p_vnode == vp); 2912 ASSERT(pgidx == 2913 ((pp->p_offset - start_off) >> PAGESHIFT)); 2914 ppa[pgidx++] = pp; 2915 pp++; 2916 } 2917 } 2918 2919 if (svd->type == MAP_PRIVATE) { 2920 VM_STAT_ADD(segvnvmstats.fill_vp_pages[18]); 2921 for (i = 0; i < pages; i++) { 2922 ASSERT(ppa[i] != NULL); 2923 ASSERT(PAGE_EXCL(ppa[i])); 2924 ASSERT(ppa[i]->p_vnode == vp); 2925 ASSERT(ppa[i]->p_offset == 2926 start_off + (i << PAGESHIFT)); 2927 page_downgrade(ppa[i]); 2928 } 2929 ppa[pages] = NULL; 2930 } else { 2931 VM_STAT_ADD(segvnvmstats.fill_vp_pages[19]); 2932 /* 2933 * the caller will still call VOP_GETPAGE() for shared segments 2934 * to check FS write permissions. For private segments we map 2935 * file read only anyway. so no VOP_GETPAGE is needed. 
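 * (Hence the pages are unlocked and ppa[0] is reset to NULL below so
 * the caller can see that the array was not filled, whereas the
 * MAP_PRIVATE branch above returns the pages in ppa[], merely
 * downgraded to shared locks.)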
2936 */ 2937 for (i = 0; i < pages; i++) { 2938 ASSERT(ppa[i] != NULL); 2939 ASSERT(PAGE_EXCL(ppa[i])); 2940 ASSERT(ppa[i]->p_vnode == vp); 2941 ASSERT(ppa[i]->p_offset == 2942 start_off + (i << PAGESHIFT)); 2943 page_unlock(ppa[i]); 2944 } 2945 ppa[0] = NULL; 2946 } 2947 2948 return (1); 2949 out: 2950 /* 2951 * Do the cleanup. Unlock target pages we didn't relocate. They are 2952 * linked on targ_pplist by root pages. reassemble unused replacement 2953 * and io pages back to pplist. 2954 */ 2955 if (io_pplist != NULL) { 2956 VM_STAT_ADD(segvnvmstats.fill_vp_pages[20]); 2957 pp = io_pplist; 2958 do { 2959 ASSERT(pp->p_vnode == vp); 2960 ASSERT(pp->p_offset == io_off); 2961 ASSERT(page_iolock_assert(pp)); 2962 page_io_unlock(pp); 2963 page_hashout(pp, NULL); 2964 io_off += PAGESIZE; 2965 } while ((pp = pp->p_next) != io_pplist); 2966 page_list_concat(&io_pplist, &pplist); 2967 pplist = io_pplist; 2968 } 2969 tmp_pplist = NULL; 2970 while (targ_pplist != NULL) { 2971 VM_STAT_ADD(segvnvmstats.fill_vp_pages[21]); 2972 pp = targ_pplist; 2973 ASSERT(PAGE_EXCL(pp)); 2974 page_sub(&targ_pplist, pp); 2975 2976 pszc = pp->p_szc; 2977 ppages = page_get_pagecnt(pszc); 2978 ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages)); 2979 2980 if (pszc != 0) { 2981 group_page_unlock(pp); 2982 } 2983 page_unlock(pp); 2984 2985 pp = repl_pplist; 2986 ASSERT(pp != NULL); 2987 ASSERT(PAGE_EXCL(pp)); 2988 ASSERT(pp->p_szc == szc); 2989 page_sub(&repl_pplist, pp); 2990 2991 ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages)); 2992 2993 /* relink replacement page */ 2994 page_list_concat(&tmp_pplist, &pp); 2995 while (--ppages != 0) { 2996 VM_STAT_ADD(segvnvmstats.fill_vp_pages[22]); 2997 pp++; 2998 ASSERT(PAGE_EXCL(pp)); 2999 ASSERT(pp->p_szc == szc); 3000 page_list_concat(&tmp_pplist, &pp); 3001 } 3002 } 3003 if (tmp_pplist != NULL) { 3004 VM_STAT_ADD(segvnvmstats.fill_vp_pages[23]); 3005 page_list_concat(&tmp_pplist, &pplist); 3006 pplist = tmp_pplist; 3007 } 3008 /* 3009 * at this point all pages are either on done_pplist or 3010 * pplist. They can't be all on done_pplist otherwise 3011 * we'd've been done. 3012 */ 3013 ASSERT(pplist != NULL); 3014 if (nios != 0) { 3015 VM_STAT_ADD(segvnvmstats.fill_vp_pages[24]); 3016 pp = pplist; 3017 do { 3018 VM_STAT_ADD(segvnvmstats.fill_vp_pages[25]); 3019 ASSERT(pp->p_szc == szc); 3020 ASSERT(PAGE_EXCL(pp)); 3021 ASSERT(pp->p_vnode != vp); 3022 pp->p_szc = 0; 3023 } while ((pp = pp->p_next) != pplist); 3024 3025 pp = done_pplist; 3026 do { 3027 VM_STAT_ADD(segvnvmstats.fill_vp_pages[26]); 3028 ASSERT(pp->p_szc == szc); 3029 ASSERT(PAGE_EXCL(pp)); 3030 ASSERT(pp->p_vnode == vp); 3031 pp->p_szc = 0; 3032 } while ((pp = pp->p_next) != done_pplist); 3033 3034 while (pplist != NULL) { 3035 VM_STAT_ADD(segvnvmstats.fill_vp_pages[27]); 3036 pp = pplist; 3037 page_sub(&pplist, pp); 3038 page_free(pp, 0); 3039 } 3040 3041 while (done_pplist != NULL) { 3042 VM_STAT_ADD(segvnvmstats.fill_vp_pages[28]); 3043 pp = done_pplist; 3044 page_sub(&done_pplist, pp); 3045 page_unlock(pp); 3046 } 3047 *ppplist = NULL; 3048 return (0); 3049 } 3050 ASSERT(pplist == *ppplist); 3051 if (io_err) { 3052 VM_STAT_ADD(segvnvmstats.fill_vp_pages[29]); 3053 /* 3054 * don't downsize on io error. 3055 * see if vop_getpage succeeds. 3056 * pplist may still be used in this case 3057 * for relocations. 
3058 */ 3059 return (0); 3060 } 3061 VM_STAT_ADD(segvnvmstats.fill_vp_pages[30]); 3062 page_free_replacement_page(pplist); 3063 page_create_putback(pages); 3064 *ppplist = NULL; 3065 return (0); 3066 } 3067 3068 int segvn_anypgsz = 0; 3069 3070 #define SEGVN_RESTORE_SOFTLOCK(type, pages) \ 3071 if ((type) == F_SOFTLOCK) { \ 3072 mutex_enter(&freemem_lock); \ 3073 availrmem += (pages); \ 3074 segvn_pages_locked -= (pages); \ 3075 svd->softlockcnt -= (pages); \ 3076 mutex_exit(&freemem_lock); \ 3077 } 3078 3079 #define SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot) \ 3080 if (IS_VMODSORT((ppa)[0]->p_vnode)) { \ 3081 if ((rw) == S_WRITE) { \ 3082 for (i = 0; i < (pages); i++) { \ 3083 ASSERT((ppa)[i]->p_vnode == \ 3084 (ppa)[0]->p_vnode); \ 3085 hat_setmod((ppa)[i]); \ 3086 } \ 3087 } else if ((rw) != S_OTHER && \ 3088 ((prot) & (vpprot) & PROT_WRITE)) { \ 3089 for (i = 0; i < (pages); i++) { \ 3090 ASSERT((ppa)[i]->p_vnode == \ 3091 (ppa)[0]->p_vnode); \ 3092 if (!hat_ismod((ppa)[i])) { \ 3093 prot &= ~PROT_WRITE; \ 3094 break; \ 3095 } \ 3096 } \ 3097 } \ 3098 } 3099 3100 #ifdef VM_STATS 3101 3102 #define SEGVN_VMSTAT_FLTVNPAGES(idx) \ 3103 VM_STAT_ADD(segvnvmstats.fltvnpages[(idx)]); 3104 3105 #else /* VM_STATS */ 3106 3107 #define SEGVN_VMSTAT_FLTVNPAGES(idx) 3108 3109 #endif 3110 3111 static faultcode_t 3112 segvn_fault_vnodepages(struct hat *hat, struct seg *seg, caddr_t lpgaddr, 3113 caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr, 3114 caddr_t eaddr, int brkcow) 3115 { 3116 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 3117 struct anon_map *amp = svd->amp; 3118 uchar_t segtype = svd->type; 3119 uint_t szc = seg->s_szc; 3120 size_t pgsz = page_get_pagesize(szc); 3121 size_t maxpgsz = pgsz; 3122 pgcnt_t pages = btop(pgsz); 3123 pgcnt_t maxpages = pages; 3124 size_t ppasize = (pages + 1) * sizeof (page_t *); 3125 caddr_t a = lpgaddr; 3126 caddr_t maxlpgeaddr = lpgeaddr; 3127 u_offset_t off = svd->offset + (uintptr_t)(a - seg->s_base); 3128 ulong_t aindx = svd->anon_index + seg_page(seg, a); 3129 struct vpage *vpage = (svd->vpage != NULL) ? 3130 &svd->vpage[seg_page(seg, a)] : NULL; 3131 vnode_t *vp = svd->vp; 3132 page_t **ppa; 3133 uint_t pszc; 3134 size_t ppgsz; 3135 pgcnt_t ppages; 3136 faultcode_t err = 0; 3137 int ierr; 3138 int vop_size_err = 0; 3139 uint_t protchk, prot, vpprot; 3140 ulong_t i; 3141 int hat_flag = (type == F_SOFTLOCK) ? 
HAT_LOAD_LOCK : HAT_LOAD; 3142 anon_sync_obj_t an_cookie; 3143 enum seg_rw arw; 3144 int alloc_failed = 0; 3145 int adjszc_chk; 3146 struct vattr va; 3147 int xhat = 0; 3148 page_t *pplist; 3149 pfn_t pfn; 3150 int physcontig; 3151 int upgrdfail; 3152 int segvn_anypgsz_vnode = 0; /* for now map vnode with 2 page sizes */ 3153 3154 ASSERT(szc != 0); 3155 ASSERT(vp != NULL); 3156 ASSERT(brkcow == 0 || amp != NULL); 3157 ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */ 3158 ASSERT(!(svd->flags & MAP_NORESERVE)); 3159 ASSERT(type != F_SOFTUNLOCK); 3160 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 3161 ASSERT(amp == NULL || IS_P2ALIGNED(aindx, maxpages)); 3162 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 3163 ASSERT(seg->s_szc < NBBY * sizeof (int)); 3164 ASSERT(type != F_SOFTLOCK || lpgeaddr - a == maxpgsz); 3165 3166 VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltvnpages[0]); 3167 VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltvnpages[1]); 3168 3169 if (svd->flags & MAP_TEXT) { 3170 hat_flag |= HAT_LOAD_TEXT; 3171 } 3172 3173 if (svd->pageprot) { 3174 switch (rw) { 3175 case S_READ: 3176 protchk = PROT_READ; 3177 break; 3178 case S_WRITE: 3179 protchk = PROT_WRITE; 3180 break; 3181 case S_EXEC: 3182 protchk = PROT_EXEC; 3183 break; 3184 case S_OTHER: 3185 default: 3186 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 3187 break; 3188 } 3189 } else { 3190 prot = svd->prot; 3191 /* caller has already done segment level protection check. */ 3192 } 3193 3194 if (seg->s_as->a_hat != hat) { 3195 xhat = 1; 3196 } 3197 3198 if (rw == S_WRITE && segtype == MAP_PRIVATE) { 3199 SEGVN_VMSTAT_FLTVNPAGES(2); 3200 arw = S_READ; 3201 } else { 3202 arw = rw; 3203 } 3204 3205 ppa = kmem_alloc(ppasize, KM_SLEEP); 3206 3207 VM_STAT_COND_ADD(amp != NULL, segvnvmstats.fltvnpages[3]); 3208 3209 for (;;) { 3210 adjszc_chk = 0; 3211 for (; a < lpgeaddr; a += pgsz, off += pgsz, aindx += pages) { 3212 if (adjszc_chk) { 3213 while (szc < seg->s_szc) { 3214 uintptr_t e; 3215 uint_t tszc; 3216 tszc = segvn_anypgsz_vnode ? 
szc + 1 : 3217 seg->s_szc; 3218 ppgsz = page_get_pagesize(tszc); 3219 if (!IS_P2ALIGNED(a, ppgsz) || 3220 ((alloc_failed >> tszc) & 3221 0x1)) { 3222 break; 3223 } 3224 SEGVN_VMSTAT_FLTVNPAGES(4); 3225 szc = tszc; 3226 pgsz = ppgsz; 3227 pages = btop(pgsz); 3228 e = P2ROUNDUP((uintptr_t)eaddr, pgsz); 3229 lpgeaddr = (caddr_t)e; 3230 } 3231 } 3232 3233 again: 3234 if (IS_P2ALIGNED(a, maxpgsz) && amp != NULL) { 3235 ASSERT(IS_P2ALIGNED(aindx, maxpages)); 3236 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 3237 anon_array_enter(amp, aindx, &an_cookie); 3238 if (anon_get_ptr(amp->ahp, aindx) != NULL) { 3239 SEGVN_VMSTAT_FLTVNPAGES(5); 3240 if (anon_pages(amp->ahp, aindx, 3241 maxpages) != maxpages) { 3242 panic("segvn_fault_vnodepages:" 3243 " empty anon slots\n"); 3244 } 3245 anon_array_exit(&an_cookie); 3246 ANON_LOCK_EXIT(&->a_rwlock); 3247 err = segvn_fault_anonpages(hat, seg, 3248 a, a + maxpgsz, type, rw, 3249 MAX(a, addr), 3250 MIN(a + maxpgsz, eaddr), brkcow); 3251 if (err != 0) { 3252 SEGVN_VMSTAT_FLTVNPAGES(6); 3253 goto out; 3254 } 3255 if (szc < seg->s_szc) { 3256 szc = seg->s_szc; 3257 pgsz = maxpgsz; 3258 pages = maxpages; 3259 lpgeaddr = maxlpgeaddr; 3260 } 3261 goto next; 3262 } else if (anon_pages(amp->ahp, aindx, 3263 maxpages)) { 3264 panic("segvn_fault_vnodepages:" 3265 " non empty anon slots\n"); 3266 } else { 3267 SEGVN_VMSTAT_FLTVNPAGES(7); 3268 anon_array_exit(&an_cookie); 3269 ANON_LOCK_EXIT(&->a_rwlock); 3270 } 3271 } 3272 ASSERT(!brkcow || IS_P2ALIGNED(a, maxpgsz)); 3273 3274 if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) { 3275 ASSERT(vpage != NULL); 3276 prot = VPP_PROT(vpage); 3277 ASSERT(sameprot(seg, a, maxpgsz)); 3278 if ((prot & protchk) == 0) { 3279 SEGVN_VMSTAT_FLTVNPAGES(8); 3280 err = FC_PROT; 3281 goto out; 3282 } 3283 } 3284 if (type == F_SOFTLOCK) { 3285 mutex_enter(&freemem_lock); 3286 if (availrmem < tune.t_minarmem + pages) { 3287 mutex_exit(&freemem_lock); 3288 err = FC_MAKE_ERR(ENOMEM); 3289 goto out; 3290 } else { 3291 availrmem -= pages; 3292 segvn_pages_locked += pages; 3293 svd->softlockcnt += pages; 3294 } 3295 mutex_exit(&freemem_lock); 3296 } 3297 3298 pplist = NULL; 3299 physcontig = 0; 3300 ppa[0] = NULL; 3301 if (!brkcow && szc && 3302 !page_exists_physcontig(vp, off, szc, 3303 segtype == MAP_PRIVATE ? 
ppa : NULL)) { 3304 SEGVN_VMSTAT_FLTVNPAGES(9); 3305 if (page_alloc_pages(vp, seg, a, &pplist, NULL, 3306 szc, 0) && type != F_SOFTLOCK) { 3307 SEGVN_VMSTAT_FLTVNPAGES(10); 3308 pszc = 0; 3309 ierr = -1; 3310 alloc_failed |= (1 << szc); 3311 break; 3312 } 3313 if (pplist != NULL && 3314 vp->v_mpssdata == SEGVN_PAGEIO) { 3315 int downsize; 3316 SEGVN_VMSTAT_FLTVNPAGES(11); 3317 physcontig = segvn_fill_vp_pages(svd, 3318 vp, off, szc, ppa, &pplist, 3319 &pszc, &downsize); 3320 ASSERT(!physcontig || pplist == NULL); 3321 if (!physcontig && downsize && 3322 type != F_SOFTLOCK) { 3323 ASSERT(pplist == NULL); 3324 SEGVN_VMSTAT_FLTVNPAGES(12); 3325 ierr = -1; 3326 break; 3327 } 3328 ASSERT(!physcontig || 3329 segtype == MAP_PRIVATE || 3330 ppa[0] == NULL); 3331 if (physcontig && ppa[0] == NULL) { 3332 physcontig = 0; 3333 } 3334 } 3335 } else if (!brkcow && szc && ppa[0] != NULL) { 3336 SEGVN_VMSTAT_FLTVNPAGES(13); 3337 ASSERT(segtype == MAP_PRIVATE); 3338 physcontig = 1; 3339 } 3340 3341 if (!physcontig) { 3342 SEGVN_VMSTAT_FLTVNPAGES(14); 3343 ppa[0] = NULL; 3344 ierr = VOP_GETPAGE(vp, (offset_t)off, pgsz, 3345 &vpprot, ppa, pgsz, seg, a, arw, 3346 svd->cred); 3347 if (segtype == MAP_PRIVATE) { 3348 SEGVN_VMSTAT_FLTVNPAGES(15); 3349 vpprot &= ~PROT_WRITE; 3350 } 3351 } else { 3352 ASSERT(segtype == MAP_PRIVATE); 3353 SEGVN_VMSTAT_FLTVNPAGES(16); 3354 vpprot = PROT_ALL & ~PROT_WRITE; 3355 ierr = 0; 3356 } 3357 3358 if (ierr != 0) { 3359 SEGVN_VMSTAT_FLTVNPAGES(17); 3360 if (pplist != NULL) { 3361 SEGVN_VMSTAT_FLTVNPAGES(18); 3362 page_free_replacement_page(pplist); 3363 page_create_putback(pages); 3364 } 3365 SEGVN_RESTORE_SOFTLOCK(type, pages); 3366 if (a + pgsz <= eaddr) { 3367 SEGVN_VMSTAT_FLTVNPAGES(19); 3368 err = FC_MAKE_ERR(ierr); 3369 goto out; 3370 } 3371 va.va_mask = AT_SIZE; 3372 if (VOP_GETATTR(vp, &va, 0, svd->cred) != 0) { 3373 SEGVN_VMSTAT_FLTVNPAGES(20); 3374 err = FC_MAKE_ERR(EIO); 3375 goto out; 3376 } 3377 if (btopr(va.va_size) >= btopr(off + pgsz)) { 3378 SEGVN_VMSTAT_FLTVNPAGES(21); 3379 err = FC_MAKE_ERR(ierr); 3380 goto out; 3381 } 3382 if (btopr(va.va_size) < 3383 btopr(off + (eaddr - a))) { 3384 SEGVN_VMSTAT_FLTVNPAGES(22); 3385 err = FC_MAKE_ERR(ierr); 3386 goto out; 3387 } 3388 if (brkcow || type == F_SOFTLOCK) { 3389 /* can't reduce map area */ 3390 SEGVN_VMSTAT_FLTVNPAGES(23); 3391 vop_size_err = 1; 3392 goto out; 3393 } 3394 SEGVN_VMSTAT_FLTVNPAGES(24); 3395 ASSERT(szc != 0); 3396 pszc = 0; 3397 ierr = -1; 3398 break; 3399 } 3400 3401 if (amp != NULL) { 3402 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 3403 anon_array_enter(amp, aindx, &an_cookie); 3404 } 3405 if (amp != NULL && 3406 anon_get_ptr(amp->ahp, aindx) != NULL) { 3407 ulong_t taindx = P2ALIGN(aindx, maxpages); 3408 3409 SEGVN_VMSTAT_FLTVNPAGES(25); 3410 if (anon_pages(amp->ahp, taindx, maxpages) != 3411 maxpages) { 3412 panic("segvn_fault_vnodepages:" 3413 " empty anon slots\n"); 3414 } 3415 for (i = 0; i < pages; i++) { 3416 page_unlock(ppa[i]); 3417 } 3418 anon_array_exit(&an_cookie); 3419 ANON_LOCK_EXIT(&->a_rwlock); 3420 if (pplist != NULL) { 3421 page_free_replacement_page(pplist); 3422 page_create_putback(pages); 3423 } 3424 SEGVN_RESTORE_SOFTLOCK(type, pages); 3425 if (szc < seg->s_szc) { 3426 SEGVN_VMSTAT_FLTVNPAGES(26); 3427 /* 3428 * For private segments SOFTLOCK 3429 * either always breaks cow (any rw 3430 * type except S_READ_NOCOW) or 3431 * address space is locked as writer 3432 * (S_READ_NOCOW case) and anon slots 3433 * can't show up on second check. 
3434 * Therefore if we are here for 3435 * SOFTLOCK case it must be a cow 3436 * break but cow break never reduces 3437 * szc. Thus the assert below. 3438 */ 3439 ASSERT(!brkcow && type != F_SOFTLOCK); 3440 pszc = seg->s_szc; 3441 ierr = -2; 3442 break; 3443 } 3444 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 3445 goto again; 3446 } 3447 #ifdef DEBUG 3448 if (amp != NULL) { 3449 ulong_t taindx = P2ALIGN(aindx, maxpages); 3450 ASSERT(!anon_pages(amp->ahp, taindx, maxpages)); 3451 } 3452 #endif /* DEBUG */ 3453 3454 if (brkcow) { 3455 ASSERT(amp != NULL); 3456 ASSERT(pplist == NULL); 3457 ASSERT(szc == seg->s_szc); 3458 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 3459 ASSERT(IS_P2ALIGNED(aindx, maxpages)); 3460 SEGVN_VMSTAT_FLTVNPAGES(27); 3461 ierr = anon_map_privatepages(amp, aindx, szc, 3462 seg, a, prot, ppa, vpage, segvn_anypgsz, 3463 svd->cred); 3464 if (ierr != 0) { 3465 SEGVN_VMSTAT_FLTVNPAGES(28); 3466 anon_array_exit(&an_cookie); 3467 ANON_LOCK_EXIT(&->a_rwlock); 3468 SEGVN_RESTORE_SOFTLOCK(type, pages); 3469 err = FC_MAKE_ERR(ierr); 3470 goto out; 3471 } 3472 3473 ASSERT(!IS_VMODSORT(ppa[0]->p_vnode)); 3474 /* 3475 * p_szc can't be changed for locked 3476 * swapfs pages. 3477 */ 3478 hat_memload_array(hat, a, pgsz, ppa, prot, 3479 hat_flag); 3480 3481 if (!(hat_flag & HAT_LOAD_LOCK)) { 3482 SEGVN_VMSTAT_FLTVNPAGES(29); 3483 for (i = 0; i < pages; i++) { 3484 page_unlock(ppa[i]); 3485 } 3486 } 3487 anon_array_exit(&an_cookie); 3488 ANON_LOCK_EXIT(&->a_rwlock); 3489 goto next; 3490 } 3491 3492 pfn = page_pptonum(ppa[0]); 3493 /* 3494 * hat_page_demote() needs an EXCl lock on one of 3495 * constituent page_t's and it decreases root's p_szc 3496 * last. This means if root's p_szc is equal szc and 3497 * all its constituent pages are locked 3498 * hat_page_demote() that could have changed p_szc to 3499 * szc is already done and no new have page_demote() 3500 * can start for this large page. 3501 */ 3502 3503 /* 3504 * we need to make sure same mapping size is used for 3505 * the same address range if there's a possibility the 3506 * adddress is already mapped because hat layer panics 3507 * when translation is loaded for the range already 3508 * mapped with a different page size. We achieve it 3509 * by always using largest page size possible subject 3510 * to the constraints of page size, segment page size 3511 * and page alignment. Since mappings are invalidated 3512 * when those constraints change and make it 3513 * impossible to use previously used mapping size no 3514 * mapping size conflicts should happen. 3515 */ 3516 3517 chkszc: 3518 if ((pszc = ppa[0]->p_szc) == szc && 3519 IS_P2ALIGNED(pfn, pages)) { 3520 3521 SEGVN_VMSTAT_FLTVNPAGES(30); 3522 #ifdef DEBUG 3523 for (i = 0; i < pages; i++) { 3524 ASSERT(PAGE_LOCKED(ppa[i])); 3525 ASSERT(!PP_ISFREE(ppa[i])); 3526 ASSERT(page_pptonum(ppa[i]) == 3527 pfn + i); 3528 ASSERT(ppa[i]->p_szc == szc); 3529 ASSERT(ppa[i]->p_vnode == vp); 3530 ASSERT(ppa[i]->p_offset == 3531 off + (i << PAGESHIFT)); 3532 } 3533 #endif /* DEBUG */ 3534 /* 3535 * All pages are of szc we need and they are 3536 * all locked so they can't change szc. load 3537 * translations. 3538 * 3539 * if page got promoted since last check 3540 * we don't need pplist. 
3541 */ 3542 if (pplist != NULL) { 3543 page_free_replacement_page(pplist); 3544 page_create_putback(pages); 3545 } 3546 if (PP_ISMIGRATE(ppa[0])) { 3547 page_migrate(seg, a, ppa, pages); 3548 } 3549 SEGVN_UPDATE_MODBITS(ppa, pages, rw, 3550 prot, vpprot); 3551 if (!xhat) { 3552 hat_memload_array(hat, a, pgsz, ppa, 3553 prot & vpprot, hat_flag); 3554 } else { 3555 /* 3556 * avoid large xhat mappings to FS 3557 * pages so that hat_page_demote() 3558 * doesn't need to check for xhat 3559 * large mappings. 3560 */ 3561 for (i = 0; i < pages; i++) { 3562 hat_memload(hat, 3563 a + (i << PAGESHIFT), 3564 ppa[i], prot & vpprot, 3565 hat_flag); 3566 } 3567 } 3568 3569 if (!(hat_flag & HAT_LOAD_LOCK)) { 3570 for (i = 0; i < pages; i++) { 3571 page_unlock(ppa[i]); 3572 } 3573 } 3574 if (amp != NULL) { 3575 anon_array_exit(&an_cookie); 3576 ANON_LOCK_EXIT(&->a_rwlock); 3577 } 3578 goto next; 3579 } 3580 3581 /* 3582 * See if upsize is possible. 3583 */ 3584 if (pszc > szc && szc < seg->s_szc && 3585 (segvn_anypgsz_vnode || pszc >= seg->s_szc)) { 3586 pgcnt_t aphase; 3587 uint_t pszc1 = MIN(pszc, seg->s_szc); 3588 ppgsz = page_get_pagesize(pszc1); 3589 ppages = btop(ppgsz); 3590 aphase = btop(P2PHASE((uintptr_t)a, ppgsz)); 3591 3592 ASSERT(type != F_SOFTLOCK); 3593 3594 SEGVN_VMSTAT_FLTVNPAGES(31); 3595 if (aphase != P2PHASE(pfn, ppages)) { 3596 segvn_faultvnmpss_align_err4++; 3597 } else { 3598 SEGVN_VMSTAT_FLTVNPAGES(32); 3599 if (pplist != NULL) { 3600 page_t *pl = pplist; 3601 page_free_replacement_page(pl); 3602 page_create_putback(pages); 3603 } 3604 for (i = 0; i < pages; i++) { 3605 page_unlock(ppa[i]); 3606 } 3607 if (amp != NULL) { 3608 anon_array_exit(&an_cookie); 3609 ANON_LOCK_EXIT(&->a_rwlock); 3610 } 3611 pszc = pszc1; 3612 ierr = -2; 3613 break; 3614 } 3615 } 3616 3617 /* 3618 * check if we should use smallest mapping size. 3619 */ 3620 upgrdfail = 0; 3621 if (szc == 0 || xhat || 3622 (pszc >= szc && 3623 !IS_P2ALIGNED(pfn, pages)) || 3624 (pszc < szc && 3625 !segvn_full_szcpages(ppa, szc, &upgrdfail, 3626 &pszc))) { 3627 3628 if (upgrdfail && type != F_SOFTLOCK) { 3629 /* 3630 * segvn_full_szcpages failed to lock 3631 * all pages EXCL. Size down. 3632 */ 3633 ASSERT(pszc < szc); 3634 3635 SEGVN_VMSTAT_FLTVNPAGES(33); 3636 3637 if (pplist != NULL) { 3638 page_t *pl = pplist; 3639 page_free_replacement_page(pl); 3640 page_create_putback(pages); 3641 } 3642 3643 for (i = 0; i < pages; i++) { 3644 page_unlock(ppa[i]); 3645 } 3646 if (amp != NULL) { 3647 anon_array_exit(&an_cookie); 3648 ANON_LOCK_EXIT(&->a_rwlock); 3649 } 3650 ierr = -1; 3651 break; 3652 } 3653 if (szc != 0 && !xhat) { 3654 segvn_faultvnmpss_align_err5++; 3655 } 3656 SEGVN_VMSTAT_FLTVNPAGES(34); 3657 if (pplist != NULL) { 3658 page_free_replacement_page(pplist); 3659 page_create_putback(pages); 3660 } 3661 SEGVN_UPDATE_MODBITS(ppa, pages, rw, 3662 prot, vpprot); 3663 for (i = 0; i < pages; i++) { 3664 hat_memload(hat, a + (i << PAGESHIFT), 3665 ppa[i], prot & vpprot, hat_flag); 3666 } 3667 if (!(hat_flag & HAT_LOAD_LOCK)) { 3668 for (i = 0; i < pages; i++) { 3669 page_unlock(ppa[i]); 3670 } 3671 } 3672 if (amp != NULL) { 3673 anon_array_exit(&an_cookie); 3674 ANON_LOCK_EXIT(&->a_rwlock); 3675 } 3676 goto next; 3677 } 3678 3679 if (pszc == szc) { 3680 /* 3681 * segvn_full_szcpages() upgraded pages szc. 
3682 */ 3683 ASSERT(pszc == ppa[0]->p_szc); 3684 ASSERT(IS_P2ALIGNED(pfn, pages)); 3685 goto chkszc; 3686 } 3687 3688 if (pszc > szc) { 3689 kmutex_t *szcmtx; 3690 SEGVN_VMSTAT_FLTVNPAGES(35); 3691 /* 3692 * p_szc of ppa[0] can change since we haven't 3693 * locked all constituent pages. Call 3694 * page_lock_szc() to prevent szc changes. 3695 * This should be a rare case that happens when 3696 * multiple segments use a different page size 3697 * to map the same file offsets. 3698 */ 3699 szcmtx = page_szc_lock(ppa[0]); 3700 pszc = ppa[0]->p_szc; 3701 ASSERT(szcmtx != NULL || pszc == 0); 3702 ASSERT(ppa[0]->p_szc <= pszc); 3703 if (pszc <= szc) { 3704 SEGVN_VMSTAT_FLTVNPAGES(36); 3705 if (szcmtx != NULL) { 3706 mutex_exit(szcmtx); 3707 } 3708 goto chkszc; 3709 } 3710 if (pplist != NULL) { 3711 /* 3712 * page got promoted since last check. 3713 * we don't need preaalocated large 3714 * page. 3715 */ 3716 SEGVN_VMSTAT_FLTVNPAGES(37); 3717 page_free_replacement_page(pplist); 3718 page_create_putback(pages); 3719 } 3720 SEGVN_UPDATE_MODBITS(ppa, pages, rw, 3721 prot, vpprot); 3722 hat_memload_array(hat, a, pgsz, ppa, 3723 prot & vpprot, hat_flag); 3724 mutex_exit(szcmtx); 3725 if (!(hat_flag & HAT_LOAD_LOCK)) { 3726 for (i = 0; i < pages; i++) { 3727 page_unlock(ppa[i]); 3728 } 3729 } 3730 if (amp != NULL) { 3731 anon_array_exit(&an_cookie); 3732 ANON_LOCK_EXIT(&->a_rwlock); 3733 } 3734 goto next; 3735 } 3736 3737 /* 3738 * if page got demoted since last check 3739 * we could have not allocated larger page. 3740 * allocate now. 3741 */ 3742 if (pplist == NULL && 3743 page_alloc_pages(vp, seg, a, &pplist, NULL, 3744 szc, 0) && type != F_SOFTLOCK) { 3745 SEGVN_VMSTAT_FLTVNPAGES(38); 3746 for (i = 0; i < pages; i++) { 3747 page_unlock(ppa[i]); 3748 } 3749 if (amp != NULL) { 3750 anon_array_exit(&an_cookie); 3751 ANON_LOCK_EXIT(&->a_rwlock); 3752 } 3753 ierr = -1; 3754 alloc_failed |= (1 << szc); 3755 break; 3756 } 3757 3758 SEGVN_VMSTAT_FLTVNPAGES(39); 3759 3760 if (pplist != NULL) { 3761 segvn_relocate_pages(ppa, pplist); 3762 #ifdef DEBUG 3763 } else { 3764 ASSERT(type == F_SOFTLOCK); 3765 SEGVN_VMSTAT_FLTVNPAGES(40); 3766 #endif /* DEBUG */ 3767 } 3768 3769 SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot); 3770 3771 if (pplist == NULL && segvn_anypgsz_vnode == 0) { 3772 ASSERT(type == F_SOFTLOCK); 3773 for (i = 0; i < pages; i++) { 3774 ASSERT(ppa[i]->p_szc < szc); 3775 hat_memload(hat, a + (i << PAGESHIFT), 3776 ppa[i], prot & vpprot, hat_flag); 3777 } 3778 } else { 3779 ASSERT(pplist != NULL || type == F_SOFTLOCK); 3780 hat_memload_array(hat, a, pgsz, ppa, 3781 prot & vpprot, hat_flag); 3782 } 3783 if (!(hat_flag & HAT_LOAD_LOCK)) { 3784 for (i = 0; i < pages; i++) { 3785 ASSERT(PAGE_SHARED(ppa[i])); 3786 page_unlock(ppa[i]); 3787 } 3788 } 3789 if (amp != NULL) { 3790 anon_array_exit(&an_cookie); 3791 ANON_LOCK_EXIT(&->a_rwlock); 3792 } 3793 3794 next: 3795 if (vpage != NULL) { 3796 vpage += pages; 3797 } 3798 adjszc_chk = 1; 3799 } 3800 if (a == lpgeaddr) 3801 break; 3802 ASSERT(a < lpgeaddr); 3803 3804 ASSERT(!brkcow && type != F_SOFTLOCK); 3805 3806 /* 3807 * ierr == -1 means we failed to map with a large page. 3808 * (either due to allocation/relocation failures or 3809 * misalignment with other mappings to this file. 3810 * 3811 * ierr == -2 means some other thread allocated a large page 3812 * after we gave up tp map with a large page. retry with 3813 * larger mapping. 
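 *
 * In short (informal summary of the code below):
 *	ierr == -1: size down, either to the next smaller size
 *	    (when segvn_anypgsz_vnode is set) or all the way to
 *	    PAGESIZE;
 *	ierr == -2: size up to the pszc found by the failed attempt.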
3814 */ 3815 ASSERT(ierr == -1 || ierr == -2); 3816 ASSERT(ierr == -2 || szc != 0); 3817 ASSERT(ierr == -1 || szc < seg->s_szc); 3818 if (ierr == -2) { 3819 SEGVN_VMSTAT_FLTVNPAGES(41); 3820 ASSERT(pszc > szc && pszc <= seg->s_szc); 3821 szc = pszc; 3822 } else if (segvn_anypgsz_vnode) { 3823 SEGVN_VMSTAT_FLTVNPAGES(42); 3824 szc--; 3825 } else { 3826 SEGVN_VMSTAT_FLTVNPAGES(43); 3827 ASSERT(pszc < szc); 3828 /* 3829 * other process created pszc large page. 3830 * but we still have to drop to 0 szc. 3831 */ 3832 szc = 0; 3833 } 3834 3835 pgsz = page_get_pagesize(szc); 3836 pages = btop(pgsz); 3837 if (ierr == -2) { 3838 /* 3839 * Size up case. Note lpgaddr may only be needed for 3840 * softlock case so we don't adjust it here. 3841 */ 3842 a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz); 3843 ASSERT(a >= lpgaddr); 3844 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 3845 off = svd->offset + (uintptr_t)(a - seg->s_base); 3846 aindx = svd->anon_index + seg_page(seg, a); 3847 vpage = (svd->vpage != NULL) ? 3848 &svd->vpage[seg_page(seg, a)] : NULL; 3849 } else { 3850 /* 3851 * Size down case. Note lpgaddr may only be needed for 3852 * softlock case so we don't adjust it here. 3853 */ 3854 ASSERT(IS_P2ALIGNED(a, pgsz)); 3855 ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz)); 3856 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 3857 ASSERT(a < lpgeaddr); 3858 if (a < addr) { 3859 SEGVN_VMSTAT_FLTVNPAGES(44); 3860 /* 3861 * The beginning of the large page region can 3862 * be pulled to the right to make a smaller 3863 * region. We haven't yet faulted a single 3864 * page. 3865 */ 3866 a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 3867 ASSERT(a >= lpgaddr); 3868 off = svd->offset + 3869 (uintptr_t)(a - seg->s_base); 3870 aindx = svd->anon_index + seg_page(seg, a); 3871 vpage = (svd->vpage != NULL) ? 3872 &svd->vpage[seg_page(seg, a)] : NULL; 3873 } 3874 } 3875 } 3876 out: 3877 kmem_free(ppa, ppasize); 3878 if (!err && !vop_size_err) { 3879 SEGVN_VMSTAT_FLTVNPAGES(45); 3880 return (0); 3881 } 3882 if (type == F_SOFTLOCK && a > lpgaddr) { 3883 SEGVN_VMSTAT_FLTVNPAGES(46); 3884 segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER); 3885 } 3886 if (!vop_size_err) { 3887 SEGVN_VMSTAT_FLTVNPAGES(47); 3888 return (err); 3889 } 3890 ASSERT(brkcow || type == F_SOFTLOCK); 3891 /* 3892 * Large page end is mapped beyond the end of file and it's a cow 3893 * fault or softlock so we can't reduce the map area. For now just 3894 * demote the segment. This should really only happen if the end of 3895 * the file changed after the mapping was established since when large 3896 * page segments are created we make sure they don't extend beyond the 3897 * end of the file. 3898 */ 3899 SEGVN_VMSTAT_FLTVNPAGES(48); 3900 3901 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 3902 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 3903 err = 0; 3904 if (seg->s_szc != 0) { 3905 segvn_fltvnpages_clrszc_cnt++; 3906 ASSERT(svd->softlockcnt == 0); 3907 err = segvn_clrszc(seg); 3908 if (err != 0) { 3909 segvn_fltvnpages_clrszc_err++; 3910 } 3911 } 3912 ASSERT(err || seg->s_szc == 0); 3913 SEGVN_LOCK_DOWNGRADE(seg->s_as, &svd->lock); 3914 /* segvn_fault will do its job as if szc had been zero to begin with */ 3915 return (err == 0 ? IE_RETRY : FC_MAKE_ERR(err)); 3916 } 3917 3918 /* 3919 * This routine will attempt to fault in one large page. 3920 * it will use smaller pages if that fails. 3921 * It should only be called for pure anonymous segments. 
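 *
 * (It is reached both for MAP_PRIVATE anonymous segments and, via
 * segvn_fault_vnodepages(), for a large-page region of a vnode
 * segment that is already entirely backed by anon (copy-on-write)
 * pages.)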
3922 */ 3923 static faultcode_t 3924 segvn_fault_anonpages(struct hat *hat, struct seg *seg, caddr_t lpgaddr, 3925 caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr, 3926 caddr_t eaddr, int brkcow) 3927 { 3928 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 3929 struct anon_map *amp = svd->amp; 3930 uchar_t segtype = svd->type; 3931 uint_t szc = seg->s_szc; 3932 size_t pgsz = page_get_pagesize(szc); 3933 size_t maxpgsz = pgsz; 3934 pgcnt_t pages = btop(pgsz); 3935 size_t ppasize = pages * sizeof (page_t *); 3936 caddr_t a = lpgaddr; 3937 ulong_t aindx = svd->anon_index + seg_page(seg, a); 3938 struct vpage *vpage = (svd->vpage != NULL) ? 3939 &svd->vpage[seg_page(seg, a)] : NULL; 3940 page_t **ppa; 3941 uint_t ppa_szc; 3942 faultcode_t err; 3943 int ierr; 3944 uint_t protchk, prot, vpprot; 3945 int i; 3946 int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD; 3947 anon_sync_obj_t cookie; 3948 3949 ASSERT(szc != 0); 3950 ASSERT(amp != NULL); 3951 ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */ 3952 ASSERT(!(svd->flags & MAP_NORESERVE)); 3953 ASSERT(type != F_SOFTUNLOCK); 3954 ASSERT(segtype == MAP_PRIVATE); 3955 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 3956 3957 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 3958 3959 VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltanpages[0]); 3960 VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltanpages[1]); 3961 3962 if (svd->flags & MAP_TEXT) { 3963 hat_flag |= HAT_LOAD_TEXT; 3964 } 3965 3966 if (svd->pageprot) { 3967 switch (rw) { 3968 case S_READ: 3969 protchk = PROT_READ; 3970 break; 3971 case S_WRITE: 3972 protchk = PROT_WRITE; 3973 break; 3974 case S_EXEC: 3975 protchk = PROT_EXEC; 3976 break; 3977 case S_OTHER: 3978 default: 3979 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 3980 break; 3981 } 3982 VM_STAT_ADD(segvnvmstats.fltanpages[2]); 3983 } else { 3984 prot = svd->prot; 3985 /* caller has already done segment level protection check. 
*/ 3986 } 3987 3988 ppa = kmem_alloc(ppasize, KM_SLEEP); 3989 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 3990 for (;;) { 3991 for (; a < lpgeaddr; a += pgsz, aindx += pages) { 3992 if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) { 3993 VM_STAT_ADD(segvnvmstats.fltanpages[3]); 3994 ASSERT(vpage != NULL); 3995 prot = VPP_PROT(vpage); 3996 ASSERT(sameprot(seg, a, maxpgsz)); 3997 if ((prot & protchk) == 0) { 3998 err = FC_PROT; 3999 goto error; 4000 } 4001 } 4002 if (type == F_SOFTLOCK) { 4003 mutex_enter(&freemem_lock); 4004 if (availrmem < tune.t_minarmem + pages) { 4005 mutex_exit(&freemem_lock); 4006 err = FC_MAKE_ERR(ENOMEM); 4007 goto error; 4008 } else { 4009 availrmem -= pages; 4010 segvn_pages_locked += pages; 4011 svd->softlockcnt += pages; 4012 } 4013 mutex_exit(&freemem_lock); 4014 } 4015 anon_array_enter(amp, aindx, &cookie); 4016 ppa_szc = (uint_t)-1; 4017 ierr = anon_map_getpages(amp, aindx, szc, seg, a, 4018 prot, &vpprot, ppa, &ppa_szc, vpage, rw, brkcow, 4019 segvn_anypgsz, svd->cred); 4020 if (ierr != 0) { 4021 anon_array_exit(&cookie); 4022 VM_STAT_ADD(segvnvmstats.fltanpages[4]); 4023 if (type == F_SOFTLOCK) { 4024 VM_STAT_ADD(segvnvmstats.fltanpages[5]); 4025 mutex_enter(&freemem_lock); 4026 availrmem += pages; 4027 segvn_pages_locked -= pages; 4028 svd->softlockcnt -= pages; 4029 mutex_exit(&freemem_lock); 4030 } 4031 if (ierr > 0) { 4032 VM_STAT_ADD(segvnvmstats.fltanpages[6]); 4033 err = FC_MAKE_ERR(ierr); 4034 goto error; 4035 } 4036 break; 4037 } 4038 4039 ASSERT(!IS_VMODSORT(ppa[0]->p_vnode)); 4040 4041 /* 4042 * Handle pages that have been marked for migration 4043 */ 4044 if (lgrp_optimizations()) 4045 page_migrate(seg, a, ppa, pages); 4046 4047 hat_memload_array(hat, a, pgsz, ppa, 4048 prot & vpprot, hat_flag); 4049 4050 if (hat_flag & HAT_LOAD_LOCK) { 4051 VM_STAT_ADD(segvnvmstats.fltanpages[7]); 4052 } else { 4053 VM_STAT_ADD(segvnvmstats.fltanpages[8]); 4054 for (i = 0; i < pages; i++) 4055 page_unlock(ppa[i]); 4056 } 4057 if (vpage != NULL) 4058 vpage += pages; 4059 4060 anon_array_exit(&cookie); 4061 } 4062 if (a == lpgeaddr) 4063 break; 4064 ASSERT(a < lpgeaddr); 4065 /* 4066 * ierr == -1 means we failed to allocate a large page. 4067 * so do a size down operation. 4068 * 4069 * ierr == -2 means some other process that privately shares 4070 * pages with this process has allocated a larger page and we 4071 * need to retry with larger pages. So do a size up 4072 * operation. This relies on the fact that large pages are 4073 * never partially shared i.e. if we share any constituent 4074 * page of a large page with another process we must share the 4075 * entire large page. Note this cannot happen for SOFTLOCK 4076 * case, unless current address (a) is at the beginning of the 4077 * next page size boundary because the other process couldn't 4078 * have relocated locked pages. 4079 */ 4080 ASSERT(ierr == -1 || ierr == -2); 4081 if (segvn_anypgsz) { 4082 ASSERT(ierr == -2 || szc != 0); 4083 ASSERT(ierr == -1 || szc < seg->s_szc); 4084 szc = (ierr == -1) ? szc - 1 : szc + 1; 4085 } else { 4086 /* 4087 * For non COW faults and segvn_anypgsz == 0 4088 * we need to be careful not to loop forever 4089 * if existing page is found with szc other 4090 * than 0 or seg->s_szc. This could be due 4091 * to page relocations on behalf of DR or 4092 * more likely large page creation. For this 4093 * case simply re-size to existing page's szc 4094 * if returned by anon_map_getpages(). 4095 */ 4096 if (ppa_szc == (uint_t)-1) { 4097 szc = (ierr == -1) ? 
0 : seg->s_szc; 4098 } else { 4099 ASSERT(ppa_szc <= seg->s_szc); 4100 ASSERT(ierr == -2 || ppa_szc < szc); 4101 ASSERT(ierr == -1 || ppa_szc > szc); 4102 szc = ppa_szc; 4103 } 4104 } 4105 4106 pgsz = page_get_pagesize(szc); 4107 pages = btop(pgsz); 4108 ASSERT(type != F_SOFTLOCK || ierr == -1 || 4109 (IS_P2ALIGNED(a, pgsz) && IS_P2ALIGNED(lpgeaddr, pgsz))); 4110 if (type == F_SOFTLOCK) { 4111 /* 4112 * For softlocks we cannot reduce the fault area 4113 * (calculated based on the largest page size for this 4114 * segment) for size down and a is already next 4115 * page size aligned as assertted above for size 4116 * ups. Therefore just continue in case of softlock. 4117 */ 4118 VM_STAT_ADD(segvnvmstats.fltanpages[9]); 4119 continue; /* keep lint happy */ 4120 } else if (ierr == -2) { 4121 4122 /* 4123 * Size up case. Note lpgaddr may only be needed for 4124 * softlock case so we don't adjust it here. 4125 */ 4126 VM_STAT_ADD(segvnvmstats.fltanpages[10]); 4127 a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz); 4128 ASSERT(a >= lpgaddr); 4129 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4130 aindx = svd->anon_index + seg_page(seg, a); 4131 vpage = (svd->vpage != NULL) ? 4132 &svd->vpage[seg_page(seg, a)] : NULL; 4133 } else { 4134 /* 4135 * Size down case. Note lpgaddr may only be needed for 4136 * softlock case so we don't adjust it here. 4137 */ 4138 VM_STAT_ADD(segvnvmstats.fltanpages[11]); 4139 ASSERT(IS_P2ALIGNED(a, pgsz)); 4140 ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz)); 4141 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4142 ASSERT(a < lpgeaddr); 4143 if (a < addr) { 4144 /* 4145 * The beginning of the large page region can 4146 * be pulled to the right to make a smaller 4147 * region. We haven't yet faulted a single 4148 * page. 4149 */ 4150 VM_STAT_ADD(segvnvmstats.fltanpages[12]); 4151 a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 4152 ASSERT(a >= lpgaddr); 4153 aindx = svd->anon_index + seg_page(seg, a); 4154 vpage = (svd->vpage != NULL) ? 4155 &svd->vpage[seg_page(seg, a)] : NULL; 4156 } 4157 } 4158 } 4159 VM_STAT_ADD(segvnvmstats.fltanpages[13]); 4160 ANON_LOCK_EXIT(&->a_rwlock); 4161 kmem_free(ppa, ppasize); 4162 return (0); 4163 error: 4164 VM_STAT_ADD(segvnvmstats.fltanpages[14]); 4165 ANON_LOCK_EXIT(&->a_rwlock); 4166 kmem_free(ppa, ppasize); 4167 if (type == F_SOFTLOCK && a > lpgaddr) { 4168 VM_STAT_ADD(segvnvmstats.fltanpages[15]); 4169 segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER); 4170 } 4171 return (err); 4172 } 4173 4174 int fltadvice = 1; /* set to free behind pages for sequential access */ 4175 4176 /* 4177 * This routine is called via a machine specific fault handling routine. 4178 * It is also called by software routines wishing to lock or unlock 4179 * a range of addresses. 
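 *
 * A rough sketch of how the fault types are dispatched below,
 * summarizing the code that follows: F_SOFTUNLOCK goes straight to
 * segvn_softunlock(); F_SOFTLOCK loads translations with HAT_LOAD_LOCK
 * and accounts for them in softlockcnt; F_PROT for a read on a private
 * segment with uniform protections may take the hat_chgprot() fast
 * path; everything else is resolved page by page via segvn_faultpage().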
4180 * 4181 * Here is the basic algorithm: 4182 * If unlocking 4183 * Call segvn_softunlock 4184 * Return 4185 * endif 4186 * Checking and set up work 4187 * If we will need some non-anonymous pages 4188 * Call VOP_GETPAGE over the range of non-anonymous pages 4189 * endif 4190 * Loop over all addresses requested 4191 * Call segvn_faultpage passing in page list 4192 * to load up translations and handle anonymous pages 4193 * endloop 4194 * Load up translation to any additional pages in page list not 4195 * already handled that fit into this segment 4196 */ 4197 static faultcode_t 4198 segvn_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len, 4199 enum fault_type type, enum seg_rw rw) 4200 { 4201 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 4202 page_t **plp, **ppp, *pp; 4203 u_offset_t off; 4204 caddr_t a; 4205 struct vpage *vpage; 4206 uint_t vpprot, prot; 4207 int err; 4208 page_t *pl[PVN_GETPAGE_NUM + 1]; 4209 size_t plsz, pl_alloc_sz; 4210 size_t page; 4211 ulong_t anon_index; 4212 struct anon_map *amp; 4213 int dogetpage = 0; 4214 caddr_t lpgaddr, lpgeaddr; 4215 size_t pgsz; 4216 anon_sync_obj_t cookie; 4217 int brkcow = BREAK_COW_SHARE(rw, type, svd->type); 4218 4219 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 4220 4221 /* 4222 * First handle the easy stuff 4223 */ 4224 if (type == F_SOFTUNLOCK) { 4225 if (rw == S_READ_NOCOW) { 4226 rw = S_READ; 4227 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 4228 } 4229 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 4230 pgsz = (seg->s_szc == 0) ? PAGESIZE : 4231 page_get_pagesize(seg->s_szc); 4232 VM_STAT_COND_ADD(pgsz > PAGESIZE, segvnvmstats.fltanpages[16]); 4233 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 4234 segvn_softunlock(seg, lpgaddr, lpgeaddr - lpgaddr, rw); 4235 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4236 return (0); 4237 } 4238 4239 top: 4240 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 4241 4242 /* 4243 * If we have the same protections for the entire segment, 4244 * insure that the access being attempted is legitimate. 4245 */ 4246 4247 if (svd->pageprot == 0) { 4248 uint_t protchk; 4249 4250 switch (rw) { 4251 case S_READ: 4252 case S_READ_NOCOW: 4253 protchk = PROT_READ; 4254 break; 4255 case S_WRITE: 4256 protchk = PROT_WRITE; 4257 break; 4258 case S_EXEC: 4259 protchk = PROT_EXEC; 4260 break; 4261 case S_OTHER: 4262 default: 4263 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 4264 break; 4265 } 4266 4267 if ((svd->prot & protchk) == 0) { 4268 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4269 return (FC_PROT); /* illegal access type */ 4270 } 4271 } 4272 4273 /* 4274 * We can't allow the long term use of softlocks for vmpss segments, 4275 * because in some file truncation cases we should be able to demote 4276 * the segment, which requires that there are no softlocks. The 4277 * only case where it's ok to allow a SOFTLOCK fault against a vmpss 4278 * segment is S_READ_NOCOW, where the caller holds the address space 4279 * locked as writer and calls softunlock before dropping the as lock. 4280 * S_READ_NOCOW is used by /proc to read memory from another user. 4281 * 4282 * Another deadlock between SOFTLOCK and file truncation can happen 4283 * because segvn_fault_vnodepages() calls the FS one pagesize at 4284 * a time. A second VOP_GETPAGE() call by segvn_fault_vnodepages() 4285 * can cause a deadlock because the first set of page_t's remain 4286 * locked SE_SHARED. 
To avoid this, we demote segments on a first 4287 * SOFTLOCK if they have a length greater than the segment's 4288 * page size. 4289 * 4290 * So for now, we only avoid demoting a segment on a SOFTLOCK when 4291 * the access type is S_READ_NOCOW and the fault length is less than 4292 * or equal to the segment's page size. While this is quite restrictive, 4293 * it should be the most common case of SOFTLOCK against a vmpss 4294 * segment. 4295 * 4296 * For S_READ_NOCOW, it's safe not to do a copy on write because the 4297 * caller makes sure no COW will be caused by another thread for a 4298 * softlocked page. 4299 */ 4300 if (type == F_SOFTLOCK && svd->vp != NULL && seg->s_szc != 0) { 4301 int demote = 0; 4302 4303 if (rw != S_READ_NOCOW) { 4304 demote = 1; 4305 } 4306 if (!demote && len > PAGESIZE) { 4307 pgsz = page_get_pagesize(seg->s_szc); 4308 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, 4309 lpgeaddr); 4310 if (lpgeaddr - lpgaddr > pgsz) { 4311 demote = 1; 4312 } 4313 } 4314 4315 ASSERT(demote || AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 4316 4317 if (demote) { 4318 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4319 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4320 if (seg->s_szc != 0) { 4321 segvn_vmpss_clrszc_cnt++; 4322 ASSERT(svd->softlockcnt == 0); 4323 err = segvn_clrszc(seg); 4324 if (err) { 4325 segvn_vmpss_clrszc_err++; 4326 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4327 return (FC_MAKE_ERR(err)); 4328 } 4329 } 4330 ASSERT(seg->s_szc == 0); 4331 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4332 goto top; 4333 } 4334 } 4335 4336 /* 4337 * Check to see if we need to allocate an anon_map structure. 4338 */ 4339 if (svd->amp == NULL && (svd->vp == NULL || brkcow)) { 4340 /* 4341 * Drop the "read" lock on the segment and acquire 4342 * the "write" version since we have to allocate the 4343 * anon_map. 4344 */ 4345 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4346 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4347 4348 if (svd->amp == NULL) { 4349 svd->amp = anonmap_alloc(seg->s_size, 0); 4350 svd->amp->a_szc = seg->s_szc; 4351 } 4352 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4353 4354 /* 4355 * Start all over again since segment protections 4356 * may have changed after we dropped the "read" lock. 4357 */ 4358 goto top; 4359 } 4360 4361 /* 4362 * S_READ_NOCOW vs S_READ distinction was 4363 * only needed for the code above. After 4364 * that we treat it as S_READ. 4365 */ 4366 if (rw == S_READ_NOCOW) { 4367 ASSERT(type == F_SOFTLOCK); 4368 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 4369 rw = S_READ; 4370 } 4371 4372 amp = svd->amp; 4373 4374 /* 4375 * MADV_SEQUENTIAL work is ignored for large page segments. 4376 */ 4377 if (seg->s_szc != 0) { 4378 pgsz = page_get_pagesize(seg->s_szc); 4379 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 4380 /* 4381 * We may need to do relocations so purge seg_pcache to allow 4382 * pages to be locked exclusively. 
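 *
 * For illustration only (hypothetical numbers): with a 4M large page
 * size, a fault at addr 0x12345000 for len 0x2000 is expanded by
 * CALC_LPG_REGION below to roughly lpgaddr = P2ALIGN(addr, pgsz) =
 * 0x12000000 and lpgeaddr = P2ROUNDUP(addr + len, pgsz) = 0x12400000,
 * clipped to the segment boundaries, so the enclosing large page is
 * faulted as a unit.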
4383 */ 4384 if (svd->softlockcnt != 0) 4385 segvn_purge(seg); 4386 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 4387 if (svd->vp == NULL) { 4388 ASSERT(svd->type == MAP_PRIVATE); 4389 err = segvn_fault_anonpages(hat, seg, lpgaddr, 4390 lpgeaddr, type, rw, addr, addr + len, brkcow); 4391 } else { 4392 err = segvn_fault_vnodepages(hat, seg, lpgaddr, 4393 lpgeaddr, type, rw, addr, addr + len, brkcow); 4394 if (err == IE_RETRY) { 4395 ASSERT(seg->s_szc == 0); 4396 ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock)); 4397 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4398 goto top; 4399 } 4400 } 4401 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4402 return (err); 4403 } 4404 4405 page = seg_page(seg, addr); 4406 if (amp != NULL) { 4407 anon_index = svd->anon_index + page; 4408 4409 if ((type == F_PROT) && (rw == S_READ) && 4410 svd->type == MAP_PRIVATE && svd->pageprot == 0) { 4411 size_t index = anon_index; 4412 struct anon *ap; 4413 4414 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4415 /* 4416 * The fast path could apply to S_WRITE also, except 4417 * that the protection fault could be caused by lazy 4418 * tlb flush when ro->rw. In this case, the pte is 4419 * RW already. But RO in the other cpu's tlb causes 4420 * the fault. Since hat_chgprot won't do anything if 4421 * pte doesn't change, we may end up faulting 4422 * indefinitely until the RO tlb entry gets replaced. 4423 */ 4424 for (a = addr; a < addr + len; a += PAGESIZE, index++) { 4425 anon_array_enter(amp, index, &cookie); 4426 ap = anon_get_ptr(amp->ahp, index); 4427 anon_array_exit(&cookie); 4428 if ((ap == NULL) || (ap->an_refcnt != 1)) { 4429 ANON_LOCK_EXIT(&->a_rwlock); 4430 goto slow; 4431 } 4432 } 4433 hat_chgprot(seg->s_as->a_hat, addr, len, svd->prot); 4434 ANON_LOCK_EXIT(&->a_rwlock); 4435 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4436 return (0); 4437 } 4438 } 4439 slow: 4440 4441 if (svd->vpage == NULL) 4442 vpage = NULL; 4443 else 4444 vpage = &svd->vpage[page]; 4445 4446 off = svd->offset + (uintptr_t)(addr - seg->s_base); 4447 4448 /* 4449 * If MADV_SEQUENTIAL has been set for the particular page we 4450 * are faulting on, free behind all pages in the segment and put 4451 * them on the free list. 
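 *
 * Sketch of the free-behind walk below: starting one page behind the
 * faulting page, step backwards PAGESIZE at a time; for an anon page
 * translate the slot to its <vp, off> with swap_xlate(); skip pages
 * that still carry lckcnt/cowcnt claims, stop at the first page that
 * cannot be looked up shared (free or exclusively locked), and push
 * each remaining page out with an asynchronous
 * VOP_PUTPAGE(B_DONTNEED|B_FREE|B_ASYNC) while holding its vnode.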
4452 */ 4453 if ((page != 0) && fltadvice) { /* not if first page in segment */ 4454 struct vpage *vpp; 4455 ulong_t fanon_index; 4456 size_t fpage; 4457 u_offset_t pgoff, fpgoff; 4458 struct vnode *fvp; 4459 struct anon *fap = NULL; 4460 4461 if (svd->advice == MADV_SEQUENTIAL || 4462 (svd->pageadvice && 4463 VPP_ADVICE(vpage) == MADV_SEQUENTIAL)) { 4464 pgoff = off - PAGESIZE; 4465 fpage = page - 1; 4466 if (vpage != NULL) 4467 vpp = &svd->vpage[fpage]; 4468 if (amp != NULL) 4469 fanon_index = svd->anon_index + fpage; 4470 4471 while (pgoff > svd->offset) { 4472 if (svd->advice != MADV_SEQUENTIAL && 4473 (!svd->pageadvice || (vpage && 4474 VPP_ADVICE(vpp) != MADV_SEQUENTIAL))) 4475 break; 4476 4477 /* 4478 * If this is an anon page, we must find the 4479 * correct <vp, offset> for it 4480 */ 4481 fap = NULL; 4482 if (amp != NULL) { 4483 ANON_LOCK_ENTER(&->a_rwlock, 4484 RW_READER); 4485 anon_array_enter(amp, fanon_index, 4486 &cookie); 4487 fap = anon_get_ptr(amp->ahp, 4488 fanon_index); 4489 if (fap != NULL) { 4490 swap_xlate(fap, &fvp, &fpgoff); 4491 } else { 4492 fpgoff = pgoff; 4493 fvp = svd->vp; 4494 } 4495 anon_array_exit(&cookie); 4496 ANON_LOCK_EXIT(&->a_rwlock); 4497 } else { 4498 fpgoff = pgoff; 4499 fvp = svd->vp; 4500 } 4501 if (fvp == NULL) 4502 break; /* XXX */ 4503 /* 4504 * Skip pages that are free or have an 4505 * "exclusive" lock. 4506 */ 4507 pp = page_lookup_nowait(fvp, fpgoff, SE_SHARED); 4508 if (pp == NULL) 4509 break; 4510 /* 4511 * We don't need the page_struct_lock to test 4512 * as this is only advisory; even if we 4513 * acquire it someone might race in and lock 4514 * the page after we unlock and before the 4515 * PUTPAGE, then VOP_PUTPAGE will do nothing. 4516 */ 4517 if (pp->p_lckcnt == 0 && pp->p_cowcnt == 0) { 4518 /* 4519 * Hold the vnode before releasing 4520 * the page lock to prevent it from 4521 * being freed and re-used by some 4522 * other thread. 4523 */ 4524 VN_HOLD(fvp); 4525 page_unlock(pp); 4526 /* 4527 * We should build a page list 4528 * to kluster putpages XXX 4529 */ 4530 (void) VOP_PUTPAGE(fvp, 4531 (offset_t)fpgoff, PAGESIZE, 4532 (B_DONTNEED|B_FREE|B_ASYNC), 4533 svd->cred); 4534 VN_RELE(fvp); 4535 } else { 4536 /* 4537 * XXX - Should the loop terminate if 4538 * the page is `locked'? 4539 */ 4540 page_unlock(pp); 4541 } 4542 --vpp; 4543 --fanon_index; 4544 pgoff -= PAGESIZE; 4545 } 4546 } 4547 } 4548 4549 plp = pl; 4550 *plp = NULL; 4551 pl_alloc_sz = 0; 4552 4553 /* 4554 * See if we need to call VOP_GETPAGE for 4555 * *any* of the range being faulted on. 4556 * We can skip all of this work if there 4557 * was no original vnode. 4558 */ 4559 if (svd->vp != NULL) { 4560 u_offset_t vp_off; 4561 size_t vp_len; 4562 struct anon *ap; 4563 vnode_t *vp; 4564 4565 vp_off = off; 4566 vp_len = len; 4567 4568 if (amp == NULL) 4569 dogetpage = 1; 4570 else { 4571 /* 4572 * Only acquire reader lock to prevent amp->ahp 4573 * from being changed. 
It's ok to miss pages, 4574 * hence we don't do anon_array_enter 4575 */ 4576 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4577 ap = anon_get_ptr(amp->ahp, anon_index); 4578 4579 if (len <= PAGESIZE) 4580 /* inline non_anon() */ 4581 dogetpage = (ap == NULL); 4582 else 4583 dogetpage = non_anon(amp->ahp, anon_index, 4584 &vp_off, &vp_len); 4585 ANON_LOCK_EXIT(&->a_rwlock); 4586 } 4587 4588 if (dogetpage) { 4589 enum seg_rw arw; 4590 struct as *as = seg->s_as; 4591 4592 if (len > ptob((sizeof (pl) / sizeof (pl[0])) - 1)) { 4593 /* 4594 * Page list won't fit in local array, 4595 * allocate one of the needed size. 4596 */ 4597 pl_alloc_sz = 4598 (btop(len) + 1) * sizeof (page_t *); 4599 plp = kmem_alloc(pl_alloc_sz, KM_SLEEP); 4600 plp[0] = NULL; 4601 plsz = len; 4602 } else if (rw == S_WRITE && svd->type == MAP_PRIVATE || 4603 rw == S_OTHER || 4604 (((size_t)(addr + PAGESIZE) < 4605 (size_t)(seg->s_base + seg->s_size)) && 4606 hat_probe(as->a_hat, addr + PAGESIZE))) { 4607 /* 4608 * Ask VOP_GETPAGE to return the exact number 4609 * of pages if 4610 * (a) this is a COW fault, or 4611 * (b) this is a software fault, or 4612 * (c) next page is already mapped. 4613 */ 4614 plsz = len; 4615 } else { 4616 /* 4617 * Ask VOP_GETPAGE to return adjacent pages 4618 * within the segment. 4619 */ 4620 plsz = MIN((size_t)PVN_GETPAGE_SZ, (size_t) 4621 ((seg->s_base + seg->s_size) - addr)); 4622 ASSERT((addr + plsz) <= 4623 (seg->s_base + seg->s_size)); 4624 } 4625 4626 /* 4627 * Need to get some non-anonymous pages. 4628 * We need to make only one call to GETPAGE to do 4629 * this to prevent certain deadlocking conditions 4630 * when we are doing locking. In this case 4631 * non_anon() should have picked up the smallest 4632 * range which includes all the non-anonymous 4633 * pages in the requested range. We have to 4634 * be careful regarding which rw flag to pass in 4635 * because on a private mapping, the underlying 4636 * object is never allowed to be written. 4637 */ 4638 if (rw == S_WRITE && svd->type == MAP_PRIVATE) { 4639 arw = S_READ; 4640 } else { 4641 arw = rw; 4642 } 4643 vp = svd->vp; 4644 TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE, 4645 "segvn_getpage:seg %p addr %p vp %p", 4646 seg, addr, vp); 4647 err = VOP_GETPAGE(vp, (offset_t)vp_off, vp_len, 4648 &vpprot, plp, plsz, seg, addr + (vp_off - off), arw, 4649 svd->cred); 4650 if (err) { 4651 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4652 segvn_pagelist_rele(plp); 4653 if (pl_alloc_sz) 4654 kmem_free(plp, pl_alloc_sz); 4655 return (FC_MAKE_ERR(err)); 4656 } 4657 if (svd->type == MAP_PRIVATE) 4658 vpprot &= ~PROT_WRITE; 4659 } 4660 } 4661 4662 /* 4663 * N.B. at this time the plp array has all the needed non-anon 4664 * pages in addition to (possibly) having some adjacent pages. 4665 */ 4666 4667 /* 4668 * Always acquire the anon_array_lock to prevent 4669 * 2 threads from allocating separate anon slots for 4670 * the same "addr". 4671 * 4672 * If this is a copy-on-write fault and we don't already 4673 * have the anon_array_lock, acquire it to prevent the 4674 * fault routine from handling multiple copy-on-write faults 4675 * on the same "addr" in the same address space. 4676 * 4677 * Only one thread should deal with the fault since after 4678 * it is handled, the other threads can acquire a translation 4679 * to the newly created private page. This prevents two or 4680 * more threads from creating different private pages for the 4681 * same fault. 
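 *
 * For example, if two threads fault on the same untouched private
 * page at once, only the first to enter the anon array slot allocates
 * the private page; the second finds the slot already filled and
 * simply loads a translation to the existing page.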
4682 * 4683 * We grab "serialization" lock here if this is a MAP_PRIVATE segment 4684 * to prevent deadlock between this thread and another thread 4685 * which has soft-locked this page and wants to acquire serial_lock. 4686 * ( bug 4026339 ) 4687 * 4688 * The fix for bug 4026339 becomes unnecessary when using the 4689 * locking scheme with per amp rwlock and a global set of hash 4690 * lock, anon_array_lock. If we steal a vnode page when low 4691 * on memory and upgrad the page lock through page_rename, 4692 * then the page is PAGE_HANDLED, nothing needs to be done 4693 * for this page after returning from segvn_faultpage. 4694 * 4695 * But really, the page lock should be downgraded after 4696 * the stolen page is page_rename'd. 4697 */ 4698 4699 if (amp != NULL) 4700 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4701 4702 /* 4703 * Ok, now loop over the address range and handle faults 4704 */ 4705 for (a = addr; a < addr + len; a += PAGESIZE, off += PAGESIZE) { 4706 err = segvn_faultpage(hat, seg, a, off, vpage, plp, vpprot, 4707 type, rw, brkcow); 4708 if (err) { 4709 if (amp != NULL) 4710 ANON_LOCK_EXIT(&->a_rwlock); 4711 if (type == F_SOFTLOCK && a > addr) 4712 segvn_softunlock(seg, addr, (a - addr), 4713 S_OTHER); 4714 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4715 segvn_pagelist_rele(plp); 4716 if (pl_alloc_sz) 4717 kmem_free(plp, pl_alloc_sz); 4718 return (err); 4719 } 4720 if (vpage) { 4721 vpage++; 4722 } else if (svd->vpage) { 4723 page = seg_page(seg, addr); 4724 vpage = &svd->vpage[++page]; 4725 } 4726 } 4727 4728 /* Didn't get pages from the underlying fs so we're done */ 4729 if (!dogetpage) 4730 goto done; 4731 4732 /* 4733 * Now handle any other pages in the list returned. 4734 * If the page can be used, load up the translations now. 4735 * Note that the for loop will only be entered if "plp" 4736 * is pointing to a non-NULL page pointer which means that 4737 * VOP_GETPAGE() was called and vpprot has been initialized. 4738 */ 4739 if (svd->pageprot == 0) 4740 prot = svd->prot & vpprot; 4741 4742 4743 /* 4744 * Large Files: diff should be unsigned value because we started 4745 * supporting > 2GB segment sizes from 2.5.1 and when a 4746 * large file of size > 2GB gets mapped to address space 4747 * the diff value can be > 2GB. 4748 */ 4749 4750 for (ppp = plp; (pp = *ppp) != NULL; ppp++) { 4751 size_t diff; 4752 struct anon *ap; 4753 int anon_index; 4754 anon_sync_obj_t cookie; 4755 int hat_flag = HAT_LOAD_ADV; 4756 4757 if (svd->flags & MAP_TEXT) { 4758 hat_flag |= HAT_LOAD_TEXT; 4759 } 4760 4761 if (pp == PAGE_HANDLED) 4762 continue; 4763 4764 if (pp->p_offset >= svd->offset && 4765 (pp->p_offset < svd->offset + seg->s_size)) { 4766 4767 diff = pp->p_offset - svd->offset; 4768 4769 /* 4770 * Large Files: Following is the assertion 4771 * validating the above cast. 4772 */ 4773 ASSERT(svd->vp == pp->p_vnode); 4774 4775 page = btop(diff); 4776 if (svd->pageprot) 4777 prot = VPP_PROT(&svd->vpage[page]) & vpprot; 4778 4779 /* 4780 * Prevent other threads in the address space from 4781 * creating private pages (i.e., allocating anon slots) 4782 * while we are in the process of loading translations 4783 * to additional pages returned by the underlying 4784 * object. 
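 *
 * Summarizing the loop below: each extra page returned by VOP_GETPAGE()
 * that lies inside this segment and has no anon slot gets its
 * translation loaded with HAT_LOAD_ADV; clean pages of VMODSORT vnodes
 * (or with mbit simulation) are mapped without PROT_WRITE so a later
 * store faults and marks them modified, and pages marked for migration
 * are skipped so they migrate on a later fault.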
4785 */ 4786 if (amp != NULL) { 4787 anon_index = svd->anon_index + page; 4788 anon_array_enter(amp, anon_index, &cookie); 4789 ap = anon_get_ptr(amp->ahp, anon_index); 4790 } 4791 if ((amp == NULL) || (ap == NULL)) { 4792 if (IS_VMODSORT(pp->p_vnode) || 4793 enable_mbit_wa) { 4794 if (rw == S_WRITE) 4795 hat_setmod(pp); 4796 else if (rw != S_OTHER && 4797 !hat_ismod(pp)) 4798 prot &= ~PROT_WRITE; 4799 } 4800 /* 4801 * Skip mapping read ahead pages marked 4802 * for migration, so they will get migrated 4803 * properly on fault 4804 */ 4805 if ((prot & PROT_READ) && !PP_ISMIGRATE(pp)) { 4806 hat_memload(hat, seg->s_base + diff, 4807 pp, prot, hat_flag); 4808 } 4809 } 4810 if (amp != NULL) 4811 anon_array_exit(&cookie); 4812 } 4813 page_unlock(pp); 4814 } 4815 done: 4816 if (amp != NULL) 4817 ANON_LOCK_EXIT(&->a_rwlock); 4818 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4819 if (pl_alloc_sz) 4820 kmem_free(plp, pl_alloc_sz); 4821 return (0); 4822 } 4823 4824 /* 4825 * This routine is used to start I/O on pages asynchronously. XXX it will 4826 * only create PAGESIZE pages. At fault time they will be relocated into 4827 * larger pages. 4828 */ 4829 static faultcode_t 4830 segvn_faulta(struct seg *seg, caddr_t addr) 4831 { 4832 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 4833 int err; 4834 struct anon_map *amp; 4835 vnode_t *vp; 4836 4837 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 4838 4839 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 4840 if ((amp = svd->amp) != NULL) { 4841 struct anon *ap; 4842 4843 /* 4844 * Reader lock to prevent amp->ahp from being changed. 4845 * This is advisory, it's ok to miss a page, so 4846 * we don't do anon_array_enter lock. 4847 */ 4848 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4849 if ((ap = anon_get_ptr(amp->ahp, 4850 svd->anon_index + seg_page(seg, addr))) != NULL) { 4851 4852 err = anon_getpage(&ap, NULL, NULL, 4853 0, seg, addr, S_READ, svd->cred); 4854 4855 ANON_LOCK_EXIT(&->a_rwlock); 4856 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4857 if (err) 4858 return (FC_MAKE_ERR(err)); 4859 return (0); 4860 } 4861 ANON_LOCK_EXIT(&->a_rwlock); 4862 } 4863 4864 if (svd->vp == NULL) { 4865 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4866 return (0); /* zfod page - do nothing now */ 4867 } 4868 4869 vp = svd->vp; 4870 TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE, 4871 "segvn_getpage:seg %p addr %p vp %p", seg, addr, vp); 4872 err = VOP_GETPAGE(vp, 4873 (offset_t)(svd->offset + (uintptr_t)(addr - seg->s_base)), 4874 PAGESIZE, NULL, NULL, 0, seg, addr, 4875 S_OTHER, svd->cred); 4876 4877 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4878 if (err) 4879 return (FC_MAKE_ERR(err)); 4880 return (0); 4881 } 4882 4883 static int 4884 segvn_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) 4885 { 4886 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 4887 struct vpage *svp, *evp; 4888 struct vnode *vp; 4889 size_t pgsz; 4890 pgcnt_t pgcnt; 4891 anon_sync_obj_t cookie; 4892 4893 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 4894 4895 if ((svd->maxprot & prot) != prot) 4896 return (EACCES); /* violated maxprot */ 4897 4898 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4899 4900 /* return if prot is the same */ 4901 if (!svd->pageprot && svd->prot == prot) { 4902 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4903 return (0); 4904 } 4905 4906 /* 4907 * Since we change protections we first have to flush the cache. 4908 * This makes sure all the pagelock calls have to recheck 4909 * protections. 
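 *
 * Put differently: if softlockcnt is still nonzero after segvn_purge(),
 * pagelock I/O is outstanding and the protection change is refused
 * with EAGAIN rather than racing with it, so the caller can retry.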
4910 */ 4911 if (svd->softlockcnt > 0) { 4912 /* 4913 * Since we do have the segvn writers lock nobody can fill 4914 * the cache with entries belonging to this seg during 4915 * the purge. The flush either succeeds or we still have 4916 * pending I/Os. 4917 */ 4918 segvn_purge(seg); 4919 if (svd->softlockcnt > 0) { 4920 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4921 return (EAGAIN); 4922 } 4923 } 4924 4925 if (seg->s_szc != 0) { 4926 int err; 4927 pgsz = page_get_pagesize(seg->s_szc); 4928 pgcnt = pgsz >> PAGESHIFT; 4929 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 4930 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { 4931 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4932 ASSERT(seg->s_base != addr || seg->s_size != len); 4933 /* 4934 * If we are holding the as lock as a reader then 4935 * we need to return IE_RETRY and let the as 4936 * layer drop and re-aquire the lock as a writer. 4937 */ 4938 if (AS_READ_HELD(seg->s_as, &seg->s_as->a_lock)) 4939 return (IE_RETRY); 4940 VM_STAT_ADD(segvnvmstats.demoterange[1]); 4941 err = segvn_demote_range(seg, addr, len, SDR_END); 4942 if (err == 0) 4943 return (IE_RETRY); 4944 if (err == ENOMEM) 4945 return (IE_NOMEM); 4946 return (err); 4947 } 4948 } 4949 4950 4951 /* 4952 * If it's a private mapping and we're making it writable 4953 * and no swap space has been reserved, have to reserve 4954 * it all now. If it's a private mapping to a file (i.e., vp != NULL) 4955 * and we're removing write permission on the entire segment and 4956 * we haven't modified any pages, we can release the swap space. 4957 */ 4958 if (svd->type == MAP_PRIVATE) { 4959 if (prot & PROT_WRITE) { 4960 size_t sz; 4961 if (svd->swresv == 0 && !(svd->flags & MAP_NORESERVE)) { 4962 if (anon_resv(seg->s_size) == 0) { 4963 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4964 return (IE_NOMEM); 4965 } 4966 sz = svd->swresv = seg->s_size; 4967 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 4968 "anon proc:%p %lu %u", 4969 seg, sz, 1); 4970 } 4971 } else { 4972 /* 4973 * Swap space is released only if this segment 4974 * does not map anonymous memory, since read faults 4975 * on such segments still need an anon slot to read 4976 * in the data. 4977 */ 4978 if (svd->swresv != 0 && svd->vp != NULL && 4979 svd->amp == NULL && addr == seg->s_base && 4980 len == seg->s_size && svd->pageprot == 0) { 4981 anon_unresv(svd->swresv); 4982 svd->swresv = 0; 4983 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 4984 "anon proc:%p %lu %u", 4985 seg, 0, 0); 4986 } 4987 } 4988 } 4989 4990 if (addr == seg->s_base && len == seg->s_size && svd->pageprot == 0) { 4991 if (svd->prot == prot) { 4992 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4993 return (0); /* all done */ 4994 } 4995 svd->prot = (uchar_t)prot; 4996 } else { 4997 struct anon *ap = NULL; 4998 page_t *pp; 4999 u_offset_t offset, off; 5000 struct anon_map *amp; 5001 ulong_t anon_idx = 0; 5002 5003 /* 5004 * A vpage structure exists or else the change does not 5005 * involve the entire segment. Establish a vpage structure 5006 * if none is there. Then, for each page in the range, 5007 * adjust its individual permissions. Note that write- 5008 * enabling a MAP_PRIVATE page can affect the claims for 5009 * locked down memory. Overcommitting memory terminates 5010 * the operation. 
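 *
 * Concretely, in the per-page loop that follows: for each locked
 * (VPP_ISPPLOCK) private page whose PROT_WRITE bit is changing, the
 * lock claim is adjusted via page_addclaim()/page_subclaim() (or
 * segvn_claim_pages() for large pages); if a claim cannot be granted
 * the loop stops early, the already-updated range is hat_unload()ed,
 * and IE_NOMEM is returned.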
5011 */ 5012 segvn_vpage(seg); 5013 if ((amp = svd->amp) != NULL) { 5014 anon_idx = svd->anon_index + seg_page(seg, addr); 5015 ASSERT(seg->s_szc == 0 || 5016 IS_P2ALIGNED(anon_idx, pgcnt)); 5017 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5018 } 5019 5020 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 5021 evp = &svd->vpage[seg_page(seg, addr + len)]; 5022 5023 /* 5024 * See Statement at the beginning of segvn_lockop regarding 5025 * the way cowcnts and lckcnts are handled. 5026 */ 5027 for (svp = &svd->vpage[seg_page(seg, addr)]; svp < evp; svp++) { 5028 5029 ASSERT(seg->s_szc == 0 || 5030 (svd->vp != NULL || svd->type == MAP_PRIVATE)); 5031 5032 if (seg->s_szc != 0 && svd->type == MAP_PRIVATE) { 5033 if (amp != NULL) { 5034 anon_array_enter(amp, anon_idx, 5035 &cookie); 5036 } 5037 if (IS_P2ALIGNED(anon_idx, pgcnt) && 5038 !segvn_claim_pages(seg, svp, offset, 5039 anon_idx, prot)) { 5040 if (amp != NULL) { 5041 anon_array_exit(&cookie); 5042 } 5043 break; 5044 } 5045 if (amp != NULL) { 5046 anon_array_exit(&cookie); 5047 } 5048 anon_idx++; 5049 } else { 5050 if (amp != NULL) { 5051 anon_array_enter(amp, anon_idx, 5052 &cookie); 5053 ap = anon_get_ptr(amp->ahp, anon_idx++); 5054 } 5055 5056 if (VPP_ISPPLOCK(svp) && 5057 (VPP_PROT(svp) != prot) && 5058 (svd->type == MAP_PRIVATE)) { 5059 5060 if (amp == NULL || ap == NULL) { 5061 vp = svd->vp; 5062 off = offset; 5063 } else 5064 swap_xlate(ap, &vp, &off); 5065 if (amp != NULL) 5066 anon_array_exit(&cookie); 5067 5068 if ((pp = page_lookup(vp, off, 5069 SE_SHARED)) == NULL) { 5070 panic("segvn_setprot: no page"); 5071 /*NOTREACHED*/ 5072 } 5073 ASSERT(seg->s_szc == 0); 5074 if ((VPP_PROT(svp) ^ prot) & 5075 PROT_WRITE) { 5076 if (prot & PROT_WRITE) { 5077 if (!page_addclaim(pp)) { 5078 page_unlock(pp); 5079 break; 5080 } 5081 } else { 5082 if (!page_subclaim(pp)) { 5083 page_unlock(pp); 5084 break; 5085 } 5086 } 5087 } 5088 page_unlock(pp); 5089 } else if (amp != NULL) 5090 anon_array_exit(&cookie); 5091 } 5092 VPP_SETPROT(svp, prot); 5093 offset += PAGESIZE; 5094 } 5095 if (amp != NULL) 5096 ANON_LOCK_EXIT(&->a_rwlock); 5097 5098 /* 5099 * Did we terminate prematurely? If so, simply unload 5100 * the translations to the things we've updated so far. 5101 */ 5102 if (svp != evp) { 5103 len = (svp - &svd->vpage[seg_page(seg, addr)]) * 5104 PAGESIZE; 5105 ASSERT(seg->s_szc == 0 || IS_P2ALIGNED(len, pgsz)); 5106 if (len != 0) 5107 hat_unload(seg->s_as->a_hat, addr, 5108 len, HAT_UNLOAD); 5109 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5110 return (IE_NOMEM); 5111 } 5112 } 5113 5114 if ((prot & PROT_WRITE) != 0 || (prot & ~PROT_USER) == PROT_NONE) { 5115 /* 5116 * Either private or shared data with write access (in 5117 * which case we need to throw out all former translations 5118 * so that we get the right translations set up on fault 5119 * and we don't allow write access to any copy-on-write pages 5120 * that might be around or to prevent write access to pages 5121 * representing holes in a file), or we don't have permission 5122 * to access the memory at all (in which case we have to 5123 * unload any current translations that might exist). 5124 */ 5125 hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD); 5126 } else { 5127 /* 5128 * A shared mapping or a private mapping in which write 5129 * protection is going to be denied - just change all the 5130 * protections over the range of addresses in question. 5131 * segvn does not support any other attributes other 5132 * than prot so we can use hat_chgattr. 
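 *
 * A hypothetical example of the split above: removing PROT_WRITE from
 * a shared mapping only narrows access, so hat_chgattr() on the
 * existing translations suffices; granting PROT_WRITE on a MAP_PRIVATE
 * mapping instead unloads the old translations so the next touch
 * faults and copy-on-write is resolved correctly.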
5133 */ 5134 hat_chgattr(seg->s_as->a_hat, addr, len, prot); 5135 } 5136 5137 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5138 5139 return (0); 5140 } 5141 5142 /* 5143 * segvn_setpagesize is called via SEGOP_SETPAGESIZE from as_setpagesize, 5144 * to determine if the seg is capable of mapping the requested szc. 5145 */ 5146 static int 5147 segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc) 5148 { 5149 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5150 struct segvn_data *nsvd; 5151 struct anon_map *amp = svd->amp; 5152 struct seg *nseg; 5153 caddr_t eaddr = addr + len, a; 5154 size_t pgsz = page_get_pagesize(szc); 5155 int err; 5156 u_offset_t off = svd->offset + (uintptr_t)(addr - seg->s_base); 5157 extern struct vnode kvp; 5158 5159 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 5160 ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size); 5161 5162 if (seg->s_szc == szc || segvn_lpg_disable != 0) { 5163 return (0); 5164 } 5165 5166 /* 5167 * addr should always be pgsz aligned but eaddr may be misaligned if 5168 * it's at the end of the segment. 5169 * 5170 * XXX we should assert this condition since as_setpagesize() logic 5171 * guarantees it. 5172 */ 5173 if (!IS_P2ALIGNED(addr, pgsz) || 5174 (!IS_P2ALIGNED(eaddr, pgsz) && 5175 eaddr != seg->s_base + seg->s_size)) { 5176 5177 segvn_setpgsz_align_err++; 5178 return (EINVAL); 5179 } 5180 5181 if ((svd->vp == NULL && svd->type == MAP_SHARED) || 5182 (svd->flags & MAP_NORESERVE) || seg->s_as == &kas || 5183 szc > segvn_maxpgszc) { 5184 return (EINVAL); 5185 } 5186 5187 /* paranoid check */ 5188 if (svd->vp != NULL && 5189 (IS_SWAPFSVP(svd->vp) || svd->vp == &kvp)) { 5190 return (EINVAL); 5191 } 5192 5193 if (seg->s_szc == 0 && svd->vp != NULL && 5194 map_addr_vacalign_check(addr, off)) { 5195 return (EINVAL); 5196 } 5197 5198 /* 5199 * Check that protections are the same within new page 5200 * size boundaries. 5201 */ 5202 if (svd->pageprot) { 5203 for (a = addr; a < eaddr; a += pgsz) { 5204 if ((a + pgsz) > eaddr) { 5205 if (!sameprot(seg, a, eaddr - a)) { 5206 return (EINVAL); 5207 } 5208 } else { 5209 if (!sameprot(seg, a, pgsz)) { 5210 return (EINVAL); 5211 } 5212 } 5213 } 5214 } 5215 5216 /* 5217 * Since we are changing page size we first have to flush 5218 * the cache. This makes sure all the pagelock calls have 5219 * to recheck protections. 5220 */ 5221 if (svd->softlockcnt > 0) { 5222 /* 5223 * Since we do have the segvn writers lock nobody can fill 5224 * the cache with entries belonging to this seg during 5225 * the purge. The flush either succeeds or we still have 5226 * pending I/Os. 5227 */ 5228 segvn_purge(seg); 5229 if (svd->softlockcnt > 0) { 5230 return (EAGAIN); 5231 } 5232 } 5233 5234 /* 5235 * Operation for sub range of existing segment. 
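 *
 * Sketch of the sub-range handling below: shrinking the page size
 * (szc < seg->s_szc) demotes just the affected range via
 * segvn_demote_range(); otherwise the segment is split at addr and/or
 * eaddr as needed, and IE_RETRY is returned so the caller can repeat
 * the operation against the now-exact segment.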
5236 */ 5237 if (addr != seg->s_base || eaddr != (seg->s_base + seg->s_size)) { 5238 if (szc < seg->s_szc) { 5239 VM_STAT_ADD(segvnvmstats.demoterange[2]); 5240 err = segvn_demote_range(seg, addr, len, SDR_RANGE); 5241 if (err == 0) { 5242 return (IE_RETRY); 5243 } 5244 if (err == ENOMEM) { 5245 return (IE_NOMEM); 5246 } 5247 return (err); 5248 } 5249 if (addr != seg->s_base) { 5250 nseg = segvn_split_seg(seg, addr); 5251 if (eaddr != (nseg->s_base + nseg->s_size)) { 5252 /* eaddr is szc aligned */ 5253 (void) segvn_split_seg(nseg, eaddr); 5254 } 5255 return (IE_RETRY); 5256 } 5257 if (eaddr != (seg->s_base + seg->s_size)) { 5258 /* eaddr is szc aligned */ 5259 (void) segvn_split_seg(seg, eaddr); 5260 } 5261 return (IE_RETRY); 5262 } 5263 5264 /* 5265 * Break any low level sharing and reset seg->s_szc to 0. 5266 */ 5267 if ((err = segvn_clrszc(seg)) != 0) { 5268 if (err == ENOMEM) { 5269 err = IE_NOMEM; 5270 } 5271 return (err); 5272 } 5273 ASSERT(seg->s_szc == 0); 5274 5275 /* 5276 * If the end of the current segment is not pgsz aligned 5277 * then attempt to concatenate with the next segment. 5278 */ 5279 if (!IS_P2ALIGNED(eaddr, pgsz)) { 5280 nseg = AS_SEGNEXT(seg->s_as, seg); 5281 if (nseg == NULL || nseg == seg || eaddr != nseg->s_base) { 5282 return (ENOMEM); 5283 } 5284 if (nseg->s_ops != &segvn_ops) { 5285 return (EINVAL); 5286 } 5287 nsvd = (struct segvn_data *)nseg->s_data; 5288 if (nsvd->softlockcnt > 0) { 5289 segvn_purge(nseg); 5290 if (nsvd->softlockcnt > 0) { 5291 return (EAGAIN); 5292 } 5293 } 5294 err = segvn_clrszc(nseg); 5295 if (err == ENOMEM) { 5296 err = IE_NOMEM; 5297 } 5298 if (err != 0) { 5299 return (err); 5300 } 5301 err = segvn_concat(seg, nseg, 1); 5302 if (err == -1) { 5303 return (EINVAL); 5304 } 5305 if (err == -2) { 5306 return (IE_NOMEM); 5307 } 5308 return (IE_RETRY); 5309 } 5310 5311 /* 5312 * May need to re-align anon array to 5313 * new szc. 5314 */ 5315 if (amp != NULL) { 5316 pgcnt_t pgcnt = pgsz >> PAGESHIFT; 5317 if (!IS_P2ALIGNED(svd->anon_index, pgcnt)) { 5318 struct anon_hdr *nahp; 5319 5320 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 5321 ASSERT(amp->refcnt == 1); 5322 nahp = anon_create(btop(amp->size), ANON_NOSLEEP); 5323 if (nahp == NULL) { 5324 ANON_LOCK_EXIT(&->a_rwlock); 5325 return (IE_NOMEM); 5326 } 5327 if (anon_copy_ptr(amp->ahp, svd->anon_index, 5328 nahp, 0, btop(seg->s_size), ANON_NOSLEEP)) { 5329 anon_release(nahp, btop(amp->size)); 5330 ANON_LOCK_EXIT(&->a_rwlock); 5331 return (IE_NOMEM); 5332 } 5333 anon_release(amp->ahp, btop(amp->size)); 5334 amp->ahp = nahp; 5335 svd->anon_index = 0; 5336 ANON_LOCK_EXIT(&->a_rwlock); 5337 } 5338 } 5339 if (svd->vp != NULL && szc != 0) { 5340 struct vattr va; 5341 u_offset_t eoffpage = svd->offset; 5342 va.va_mask = AT_SIZE; 5343 eoffpage += seg->s_size; 5344 eoffpage = btopr(eoffpage); 5345 if (VOP_GETATTR(svd->vp, &va, 0, svd->cred) != 0) { 5346 segvn_setpgsz_getattr_err++; 5347 return (EINVAL); 5348 } 5349 if (btopr(va.va_size) < eoffpage) { 5350 segvn_setpgsz_eof_err++; 5351 return (EINVAL); 5352 } 5353 if (amp != NULL) { 5354 /* 5355 * anon_fill_cow_holes() may call VOP_GETPAGE(). 5356 * don't take anon map lock here to avoid holding it 5357 * across VOP_GETPAGE() calls that may call back into 5358 * segvn for klsutering checks. We don't really need 5359 * anon map lock here since it's a private segment and 5360 * we hold as level lock as writers. 
5361 */ 5362 if ((err = anon_fill_cow_holes(seg, seg->s_base, 5363 amp->ahp, svd->anon_index, svd->vp, svd->offset, 5364 seg->s_size, szc, svd->prot, svd->vpage, 5365 svd->cred)) != 0) { 5366 return (EINVAL); 5367 } 5368 } 5369 segvn_setvnode_mpss(svd->vp); 5370 } 5371 5372 if (amp != NULL) { 5373 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 5374 amp->a_szc = szc; 5375 ANON_LOCK_EXIT(&->a_rwlock); 5376 } 5377 5378 seg->s_szc = szc; 5379 5380 return (0); 5381 } 5382 5383 static int 5384 segvn_clrszc(struct seg *seg) 5385 { 5386 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5387 struct anon_map *amp = svd->amp; 5388 size_t pgsz; 5389 pgcnt_t pages; 5390 int err = 0; 5391 caddr_t a = seg->s_base; 5392 caddr_t ea = a + seg->s_size; 5393 ulong_t an_idx = svd->anon_index; 5394 vnode_t *vp = svd->vp; 5395 struct vpage *vpage = svd->vpage; 5396 page_t *anon_pl[1 + 1], *pp; 5397 struct anon *ap, *oldap; 5398 uint_t prot = svd->prot, vpprot; 5399 5400 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) || 5401 SEGVN_WRITE_HELD(seg->s_as, &svd->lock)); 5402 ASSERT(svd->type == MAP_PRIVATE || 5403 (vp != NULL && svd->amp == NULL)); 5404 5405 if (vp == NULL && amp == NULL) { 5406 seg->s_szc = 0; 5407 return (0); 5408 } 5409 5410 /* 5411 * do HAT_UNLOAD_UNMAP since we are changing the pagesize. 5412 * unload argument is 0 when we are freeing the segment 5413 * and unload was already done. 5414 */ 5415 hat_unload(seg->s_as->a_hat, seg->s_base, seg->s_size, 5416 HAT_UNLOAD_UNMAP); 5417 5418 if (amp == NULL) { 5419 seg->s_szc = 0; 5420 return (0); 5421 } 5422 5423 pgsz = page_get_pagesize(seg->s_szc); 5424 pages = btop(pgsz); 5425 5426 /* 5427 * XXX anon rwlock is not really needed because this is a 5428 * private segment and we are writers. 5429 */ 5430 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 5431 5432 for (; a < ea; a += pgsz, an_idx += pages) { 5433 if ((oldap = anon_get_ptr(amp->ahp, an_idx)) != NULL) { 5434 if (svd->pageprot != 0) { 5435 ASSERT(vpage != NULL); 5436 prot = VPP_PROT(vpage); 5437 ASSERT(sameprot(seg, a, pgsz)); 5438 } 5439 if (seg->s_szc != 0) { 5440 ASSERT(vp == NULL || anon_pages(amp->ahp, 5441 an_idx, pages) == pages); 5442 if ((err = anon_map_demotepages(amp, an_idx, 5443 seg, a, prot, vpage, svd->cred)) != 0) { 5444 goto out; 5445 } 5446 } else { 5447 if (oldap->an_refcnt == 1) { 5448 continue; 5449 } 5450 if ((err = anon_getpage(&oldap, &vpprot, 5451 anon_pl, PAGESIZE, seg, a, S_READ, 5452 svd->cred))) { 5453 goto out; 5454 } 5455 if ((pp = anon_private(&ap, seg, a, prot, 5456 anon_pl[0], 0, svd->cred)) == NULL) { 5457 err = ENOMEM; 5458 goto out; 5459 } 5460 anon_decref(oldap); 5461 (void) anon_set_ptr(amp->ahp, an_idx, ap, 5462 ANON_SLEEP); 5463 page_unlock(pp); 5464 } 5465 } 5466 vpage = (vpage == NULL) ? 
NULL : vpage + pages; 5467 } 5468 5469 amp->a_szc = 0; 5470 seg->s_szc = 0; 5471 out: 5472 ANON_LOCK_EXIT(&->a_rwlock); 5473 return (err); 5474 } 5475 5476 static int 5477 segvn_claim_pages( 5478 struct seg *seg, 5479 struct vpage *svp, 5480 u_offset_t off, 5481 ulong_t anon_idx, 5482 uint_t prot) 5483 { 5484 pgcnt_t pgcnt = page_get_pagecnt(seg->s_szc); 5485 size_t ppasize = (pgcnt + 1) * sizeof (page_t *); 5486 page_t **ppa; 5487 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5488 struct anon_map *amp = svd->amp; 5489 struct vpage *evp = svp + pgcnt; 5490 caddr_t addr = ((uintptr_t)(svp - svd->vpage) << PAGESHIFT) 5491 + seg->s_base; 5492 struct anon *ap; 5493 struct vnode *vp = svd->vp; 5494 page_t *pp; 5495 pgcnt_t pg_idx, i; 5496 int err = 0; 5497 anoff_t aoff; 5498 int anon = (amp != NULL) ? 1 : 0; 5499 5500 ASSERT(svd->type == MAP_PRIVATE); 5501 ASSERT(svd->vpage != NULL); 5502 ASSERT(seg->s_szc != 0); 5503 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 5504 ASSERT(amp == NULL || IS_P2ALIGNED(anon_idx, pgcnt)); 5505 ASSERT(sameprot(seg, addr, pgcnt << PAGESHIFT)); 5506 5507 if (VPP_PROT(svp) == prot) 5508 return (1); 5509 if (!((VPP_PROT(svp) ^ prot) & PROT_WRITE)) 5510 return (1); 5511 5512 ppa = kmem_alloc(ppasize, KM_SLEEP); 5513 if (anon && vp != NULL) { 5514 if (anon_get_ptr(amp->ahp, anon_idx) == NULL) { 5515 anon = 0; 5516 ASSERT(!anon_pages(amp->ahp, anon_idx, pgcnt)); 5517 } 5518 ASSERT(!anon || 5519 anon_pages(amp->ahp, anon_idx, pgcnt) == pgcnt); 5520 } 5521 5522 for (*ppa = NULL, pg_idx = 0; svp < evp; svp++, anon_idx++) { 5523 if (!VPP_ISPPLOCK(svp)) 5524 continue; 5525 if (anon) { 5526 ap = anon_get_ptr(amp->ahp, anon_idx); 5527 if (ap == NULL) { 5528 panic("segvn_claim_pages: no anon slot"); 5529 } 5530 swap_xlate(ap, &vp, &aoff); 5531 off = (u_offset_t)aoff; 5532 } 5533 ASSERT(vp != NULL); 5534 if ((pp = page_lookup(vp, 5535 (u_offset_t)off, SE_SHARED)) == NULL) { 5536 panic("segvn_claim_pages: no page"); 5537 } 5538 ppa[pg_idx++] = pp; 5539 off += PAGESIZE; 5540 } 5541 5542 if (ppa[0] == NULL) { 5543 kmem_free(ppa, ppasize); 5544 return (1); 5545 } 5546 5547 ASSERT(pg_idx <= pgcnt); 5548 ppa[pg_idx] = NULL; 5549 5550 if (prot & PROT_WRITE) 5551 err = page_addclaim_pages(ppa); 5552 else 5553 err = page_subclaim_pages(ppa); 5554 5555 for (i = 0; i < pg_idx; i++) { 5556 ASSERT(ppa[i] != NULL); 5557 page_unlock(ppa[i]); 5558 } 5559 5560 kmem_free(ppa, ppasize); 5561 return (err); 5562 } 5563 5564 /* 5565 * Returns right (upper address) segment if split occured. 5566 * If the address is equal to the beginning or end of its segment it returns 5567 * the current segment. 
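 *
 * For illustration (hypothetical addresses): splitting a segment
 * covering [0x10000000, 0x10800000) at addr 0x10400000 shrinks the
 * original seg to [0x10000000, 0x10400000) and allocates a new
 * right-hand seg for [0x10400000, 0x10800000), copying vpage entries,
 * anon slots and swap reservation into the piece they now belong to;
 * the new (upper) segment is returned.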
5568 */ 5569 static struct seg * 5570 segvn_split_seg(struct seg *seg, caddr_t addr) 5571 { 5572 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5573 struct seg *nseg; 5574 size_t nsize; 5575 struct segvn_data *nsvd; 5576 5577 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 5578 ASSERT(svd->type == MAP_PRIVATE || svd->amp == NULL); 5579 ASSERT(addr >= seg->s_base); 5580 ASSERT(addr <= seg->s_base + seg->s_size); 5581 5582 if (addr == seg->s_base || addr == seg->s_base + seg->s_size) 5583 return (seg); 5584 5585 nsize = seg->s_base + seg->s_size - addr; 5586 seg->s_size = addr - seg->s_base; 5587 nseg = seg_alloc(seg->s_as, addr, nsize); 5588 ASSERT(nseg != NULL); 5589 nseg->s_ops = seg->s_ops; 5590 nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 5591 nseg->s_data = (void *)nsvd; 5592 nseg->s_szc = seg->s_szc; 5593 *nsvd = *svd; 5594 rw_init(&nsvd->lock, NULL, RW_DEFAULT, NULL); 5595 5596 if (nsvd->vp != NULL) { 5597 VN_HOLD(nsvd->vp); 5598 nsvd->offset = svd->offset + 5599 (uintptr_t)(nseg->s_base - seg->s_base); 5600 if (nsvd->type == MAP_SHARED) 5601 lgrp_shm_policy_init(NULL, nsvd->vp); 5602 } else { 5603 /* 5604 * The offset for an anonymous segment has no signifigance in 5605 * terms of an offset into a file. If we were to use the above 5606 * calculation instead, the structures read out of 5607 * /proc/<pid>/xmap would be more difficult to decipher since 5608 * it would be unclear whether two seemingly contiguous 5609 * prxmap_t structures represented different segments or a 5610 * single segment that had been split up into multiple prxmap_t 5611 * structures (e.g. if some part of the segment had not yet 5612 * been faulted in). 5613 */ 5614 nsvd->offset = 0; 5615 } 5616 5617 ASSERT(svd->softlockcnt == 0); 5618 crhold(svd->cred); 5619 5620 if (svd->vpage != NULL) { 5621 size_t bytes = vpgtob(seg_pages(seg)); 5622 size_t nbytes = vpgtob(seg_pages(nseg)); 5623 struct vpage *ovpage = svd->vpage; 5624 5625 svd->vpage = kmem_alloc(bytes, KM_SLEEP); 5626 bcopy(ovpage, svd->vpage, bytes); 5627 nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP); 5628 bcopy(ovpage + seg_pages(seg), nsvd->vpage, nbytes); 5629 kmem_free(ovpage, bytes + nbytes); 5630 } 5631 if (svd->amp != NULL) { 5632 struct anon_map *oamp = svd->amp, *namp; 5633 struct anon_hdr *nahp; 5634 5635 ANON_LOCK_ENTER(&oamp->a_rwlock, RW_WRITER); 5636 ASSERT(oamp->refcnt == 1); 5637 nahp = anon_create(btop(seg->s_size), ANON_SLEEP); 5638 (void) anon_copy_ptr(oamp->ahp, svd->anon_index, 5639 nahp, 0, btop(seg->s_size), ANON_SLEEP); 5640 5641 namp = anonmap_alloc(nseg->s_size, 0); 5642 namp->a_szc = nseg->s_szc; 5643 (void) anon_copy_ptr(oamp->ahp, 5644 svd->anon_index + btop(seg->s_size), 5645 namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP); 5646 anon_release(oamp->ahp, btop(oamp->size)); 5647 oamp->ahp = nahp; 5648 oamp->size = seg->s_size; 5649 svd->anon_index = 0; 5650 nsvd->amp = namp; 5651 nsvd->anon_index = 0; 5652 ANON_LOCK_EXIT(&oamp->a_rwlock); 5653 } 5654 5655 /* 5656 * Split amount of swap reserve 5657 */ 5658 if (svd->swresv) { 5659 /* 5660 * For MAP_NORESERVE, only allocate swap reserve for pages 5661 * being used. Other segments get enough to cover whole 5662 * segment. 
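 *
 * Hypothetical numbers to make the MAP_NORESERVE case concrete: if the
 * old reservation covered 100 touched pages and anon_pages() finds 60
 * of them under the left half and 30 under the right half after the
 * split, the halves get ptob(60) and ptob(30) respectively; the
 * assertion below only requires that the new reservations not exceed
 * the old one.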
5663 */ 5664 if (svd->flags & MAP_NORESERVE) { 5665 size_t oswresv; 5666 5667 ASSERT(svd->amp); 5668 oswresv = svd->swresv; 5669 svd->swresv = ptob(anon_pages(svd->amp->ahp, 5670 svd->anon_index, btop(seg->s_size))); 5671 nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp, 5672 nsvd->anon_index, btop(nseg->s_size))); 5673 ASSERT(oswresv >= (svd->swresv + nsvd->swresv)); 5674 } else { 5675 ASSERT(svd->swresv == seg->s_size + nseg->s_size); 5676 svd->swresv = seg->s_size; 5677 nsvd->swresv = nseg->s_size; 5678 } 5679 } 5680 5681 return (nseg); 5682 } 5683 5684 5685 /* 5686 * called on memory operations (unmap, setprot, setpagesize) for a subset 5687 * of a large page segment to either demote the memory range (SDR_RANGE) 5688 * or the ends (SDR_END) by addr/len. 5689 * 5690 * returns 0 on success. returns errno, including ENOMEM, on failure. 5691 */ 5692 static int 5693 segvn_demote_range(struct seg *seg, caddr_t addr, size_t len, int flag) 5694 { 5695 caddr_t eaddr = addr + len; 5696 caddr_t lpgaddr, lpgeaddr; 5697 struct seg *nseg; 5698 struct seg *badseg1 = NULL; 5699 struct seg *badseg2 = NULL; 5700 size_t pgsz; 5701 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5702 int err; 5703 5704 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 5705 ASSERT(seg->s_szc != 0); 5706 pgsz = page_get_pagesize(seg->s_szc); 5707 ASSERT(seg->s_base != addr || seg->s_size != len); 5708 ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size); 5709 ASSERT(svd->softlockcnt == 0); 5710 ASSERT(svd->type == MAP_PRIVATE || 5711 (svd->vp != NULL && svd->amp == NULL)); 5712 5713 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 5714 ASSERT(flag == SDR_RANGE || eaddr < lpgeaddr || addr > lpgaddr); 5715 if (flag == SDR_RANGE) { 5716 /* demote entire range */ 5717 badseg1 = nseg = segvn_split_seg(seg, lpgaddr); 5718 (void) segvn_split_seg(nseg, lpgeaddr); 5719 ASSERT(badseg1->s_base == lpgaddr); 5720 ASSERT(badseg1->s_size == lpgeaddr - lpgaddr); 5721 } else if (addr != lpgaddr) { 5722 ASSERT(flag == SDR_END); 5723 badseg1 = nseg = segvn_split_seg(seg, lpgaddr); 5724 if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz && 5725 eaddr < lpgaddr + 2 * pgsz) { 5726 (void) segvn_split_seg(nseg, lpgeaddr); 5727 ASSERT(badseg1->s_base == lpgaddr); 5728 ASSERT(badseg1->s_size == 2 * pgsz); 5729 } else { 5730 nseg = segvn_split_seg(nseg, lpgaddr + pgsz); 5731 ASSERT(badseg1->s_base == lpgaddr); 5732 ASSERT(badseg1->s_size == pgsz); 5733 if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz) { 5734 ASSERT(lpgeaddr - lpgaddr > 2 * pgsz); 5735 nseg = segvn_split_seg(nseg, lpgeaddr - pgsz); 5736 badseg2 = nseg; 5737 (void) segvn_split_seg(nseg, lpgeaddr); 5738 ASSERT(badseg2->s_base == lpgeaddr - pgsz); 5739 ASSERT(badseg2->s_size == pgsz); 5740 } 5741 } 5742 } else { 5743 ASSERT(flag == SDR_END); 5744 ASSERT(eaddr < lpgeaddr); 5745 badseg1 = nseg = segvn_split_seg(seg, lpgeaddr - pgsz); 5746 (void) segvn_split_seg(nseg, lpgeaddr); 5747 ASSERT(badseg1->s_base == lpgeaddr - pgsz); 5748 ASSERT(badseg1->s_size == pgsz); 5749 } 5750 5751 ASSERT(badseg1 != NULL); 5752 ASSERT(badseg1->s_szc != 0); 5753 ASSERT(page_get_pagesize(badseg1->s_szc) == pgsz); 5754 ASSERT(flag == SDR_RANGE || badseg1->s_size == pgsz || 5755 badseg1->s_size == 2 * pgsz); 5756 if (err = segvn_clrszc(badseg1)) { 5757 return (err); 5758 } 5759 ASSERT(badseg1->s_szc == 0); 5760 5761 if (badseg2 == NULL) 5762 return (0); 5763 ASSERT(badseg2->s_szc != 0); 5764 ASSERT(page_get_pagesize(badseg2->s_szc) == pgsz); 5765 ASSERT(badseg2->s_size == pgsz); 
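	/*
	 * Illustrative summary of the SDR_END geometry handled above
	 * (hypothetical layout): badseg1 is the large page, or two
	 * adjacent large pages, containing addr, and badseg2, when
	 * distinct, is the large page containing eaddr; only these
	 * boundary pieces are demoted to szc 0 below, while fully
	 * covered large pages in the middle of the range keep their
	 * page size.
	 */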
5766 ASSERT(sameprot(badseg2, badseg2->s_base, badseg2->s_size)); 5767 if (err = segvn_clrszc(badseg2)) { 5768 return (err); 5769 } 5770 ASSERT(badseg2->s_szc == 0); 5771 return (0); 5772 } 5773 5774 static int 5775 segvn_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) 5776 { 5777 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5778 struct vpage *vp, *evp; 5779 5780 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5781 5782 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 5783 /* 5784 * If segment protection can be used, simply check against them. 5785 */ 5786 if (svd->pageprot == 0) { 5787 int err; 5788 5789 err = ((svd->prot & prot) != prot) ? EACCES : 0; 5790 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5791 return (err); 5792 } 5793 5794 /* 5795 * Have to check down to the vpage level. 5796 */ 5797 evp = &svd->vpage[seg_page(seg, addr + len)]; 5798 for (vp = &svd->vpage[seg_page(seg, addr)]; vp < evp; vp++) { 5799 if ((VPP_PROT(vp) & prot) != prot) { 5800 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5801 return (EACCES); 5802 } 5803 } 5804 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5805 return (0); 5806 } 5807 5808 static int 5809 segvn_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv) 5810 { 5811 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5812 size_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1; 5813 5814 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5815 5816 if (pgno != 0) { 5817 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 5818 if (svd->pageprot == 0) { 5819 do 5820 protv[--pgno] = svd->prot; 5821 while (pgno != 0); 5822 } else { 5823 size_t pgoff = seg_page(seg, addr); 5824 5825 do { 5826 pgno--; 5827 protv[pgno] = VPP_PROT(&svd->vpage[pgno+pgoff]); 5828 } while (pgno != 0); 5829 } 5830 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5831 } 5832 return (0); 5833 } 5834 5835 static u_offset_t 5836 segvn_getoffset(struct seg *seg, caddr_t addr) 5837 { 5838 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5839 5840 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5841 5842 return (svd->offset + (uintptr_t)(addr - seg->s_base)); 5843 } 5844 5845 /*ARGSUSED*/ 5846 static int 5847 segvn_gettype(struct seg *seg, caddr_t addr) 5848 { 5849 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5850 5851 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5852 5853 return (svd->type | (svd->flags & MAP_NORESERVE)); 5854 } 5855 5856 /*ARGSUSED*/ 5857 static int 5858 segvn_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp) 5859 { 5860 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5861 5862 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5863 5864 *vpp = svd->vp; 5865 return (0); 5866 } 5867 5868 /* 5869 * Check to see if it makes sense to do kluster/read ahead to 5870 * addr + delta relative to the mapping at addr. We assume here 5871 * that delta is a signed PAGESIZE'd multiple (which can be negative). 5872 * 5873 * For segvn, we currently "approve" of the action if we are 5874 * still in the segment and it maps from the same vp/off, 5875 * or if the advice stored in segvn_data or vpages allows it. 5876 * Currently, klustering is not allowed only if MADV_RANDOM is set. 
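 *
 * Restating the private-mapping checks in the code below: klustering
 * is also refused when exactly one of the two pages has an anon slot,
 * or when both do but swap_xlate() maps them to different vnodes or to
 * offsets that do not differ by exactly delta.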
5877 */ 5878 static int 5879 segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta) 5880 { 5881 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5882 struct anon *oap, *ap; 5883 ssize_t pd; 5884 size_t page; 5885 struct vnode *vp1, *vp2; 5886 u_offset_t off1, off2; 5887 struct anon_map *amp; 5888 5889 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5890 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) || 5891 SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 5892 5893 if (addr + delta < seg->s_base || 5894 addr + delta >= (seg->s_base + seg->s_size)) 5895 return (-1); /* exceeded segment bounds */ 5896 5897 pd = delta / (ssize_t)PAGESIZE; /* divide to preserve sign bit */ 5898 page = seg_page(seg, addr); 5899 5900 /* 5901 * Check to see if either of the pages addr or addr + delta 5902 * have advice set that prevents klustering (if MADV_RANDOM advice 5903 * is set for entire segment, or MADV_SEQUENTIAL is set and delta 5904 * is negative). 5905 */ 5906 if (svd->advice == MADV_RANDOM || 5907 svd->advice == MADV_SEQUENTIAL && delta < 0) 5908 return (-1); 5909 else if (svd->pageadvice && svd->vpage) { 5910 struct vpage *bvpp, *evpp; 5911 5912 bvpp = &svd->vpage[page]; 5913 evpp = &svd->vpage[page + pd]; 5914 if (VPP_ADVICE(bvpp) == MADV_RANDOM || 5915 VPP_ADVICE(evpp) == MADV_SEQUENTIAL && delta < 0) 5916 return (-1); 5917 if (VPP_ADVICE(bvpp) != VPP_ADVICE(evpp) && 5918 VPP_ADVICE(evpp) == MADV_RANDOM) 5919 return (-1); 5920 } 5921 5922 if (svd->type == MAP_SHARED) 5923 return (0); /* shared mapping - all ok */ 5924 5925 if ((amp = svd->amp) == NULL) 5926 return (0); /* off original vnode */ 5927 5928 page += svd->anon_index; 5929 5930 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5931 5932 oap = anon_get_ptr(amp->ahp, page); 5933 ap = anon_get_ptr(amp->ahp, page + pd); 5934 5935 ANON_LOCK_EXIT(&->a_rwlock); 5936 5937 if ((oap == NULL && ap != NULL) || (oap != NULL && ap == NULL)) { 5938 return (-1); /* one with and one without an anon */ 5939 } 5940 5941 if (oap == NULL) { /* implies that ap == NULL */ 5942 return (0); /* off original vnode */ 5943 } 5944 5945 /* 5946 * Now we know we have two anon pointers - check to 5947 * see if they happen to be properly allocated. 5948 */ 5949 5950 /* 5951 * XXX We cheat here and don't lock the anon slots. We can't because 5952 * we may have been called from the anon layer which might already 5953 * have locked them. We are holding a refcnt on the slots so they 5954 * can't disappear. The worst that will happen is we'll get the wrong 5955 * names (vp, off) for the slots and make a poor klustering decision. 5956 */ 5957 swap_xlate(ap, &vp1, &off1); 5958 swap_xlate(oap, &vp2, &off2); 5959 5960 5961 if (!VOP_CMP(vp1, vp2) || off1 - off2 != delta) 5962 return (-1); 5963 return (0); 5964 } 5965 5966 /* 5967 * Swap the pages of seg out to secondary storage, returning the 5968 * number of bytes of storage freed. 5969 * 5970 * The basic idea is first to unload all translations and then to call 5971 * VOP_PUTPAGE() for all newly-unmapped pages, to push them out to the 5972 * swap device. Pages to which other segments have mappings will remain 5973 * mapped and won't be swapped. Our caller (as_swapout) has already 5974 * performed the unloading step. 5975 * 5976 * The value returned is intended to correlate well with the process's 5977 * memory requirements. However, there are some caveats: 5978 * 1) When given a shared segment as argument, this routine will 5979 * only succeed in swapping out pages for the last sharer of the 5980 * segment. 
(Previous callers will only have decremented mapping 5981 * reference counts.) 5982 * 2) We assume that the hat layer maintains a large enough translation 5983 * cache to capture process reference patterns. 5984 */ 5985 static size_t 5986 segvn_swapout(struct seg *seg) 5987 { 5988 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5989 struct anon_map *amp; 5990 pgcnt_t pgcnt = 0; 5991 pgcnt_t npages; 5992 pgcnt_t page; 5993 ulong_t anon_index; 5994 5995 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5996 5997 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 5998 /* 5999 * Find pages unmapped by our caller and force them 6000 * out to the virtual swap device. 6001 */ 6002 if ((amp = svd->amp) != NULL) 6003 anon_index = svd->anon_index; 6004 npages = seg->s_size >> PAGESHIFT; 6005 for (page = 0; page < npages; page++) { 6006 page_t *pp; 6007 struct anon *ap; 6008 struct vnode *vp; 6009 u_offset_t off; 6010 anon_sync_obj_t cookie; 6011 6012 /* 6013 * Obtain <vp, off> pair for the page, then look it up. 6014 * 6015 * Note that this code is willing to consider regular 6016 * pages as well as anon pages. Is this appropriate here? 6017 */ 6018 ap = NULL; 6019 if (amp != NULL) { 6020 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 6021 if (anon_array_try_enter(amp, anon_index + page, 6022 &cookie)) { 6023 ANON_LOCK_EXIT(&->a_rwlock); 6024 continue; 6025 } 6026 ap = anon_get_ptr(amp->ahp, anon_index + page); 6027 if (ap != NULL) { 6028 swap_xlate(ap, &vp, &off); 6029 } else { 6030 vp = svd->vp; 6031 off = svd->offset + ptob(page); 6032 } 6033 anon_array_exit(&cookie); 6034 ANON_LOCK_EXIT(&->a_rwlock); 6035 } else { 6036 vp = svd->vp; 6037 off = svd->offset + ptob(page); 6038 } 6039 if (vp == NULL) { /* untouched zfod page */ 6040 ASSERT(ap == NULL); 6041 continue; 6042 } 6043 6044 pp = page_lookup_nowait(vp, off, SE_SHARED); 6045 if (pp == NULL) 6046 continue; 6047 6048 6049 /* 6050 * Examine the page to see whether it can be tossed out, 6051 * keeping track of how many we've found. 6052 */ 6053 if (!page_tryupgrade(pp)) { 6054 /* 6055 * If the page has an i/o lock and no mappings, 6056 * it's very likely that the page is being 6057 * written out as a result of klustering. 6058 * Assume this is so and take credit for it here. 6059 */ 6060 if (!page_io_trylock(pp)) { 6061 if (!hat_page_is_mapped(pp)) 6062 pgcnt++; 6063 } else { 6064 page_io_unlock(pp); 6065 } 6066 page_unlock(pp); 6067 continue; 6068 } 6069 ASSERT(!page_iolock_assert(pp)); 6070 6071 6072 /* 6073 * Skip if page is locked or has mappings. 6074 * We don't need the page_struct_lock to look at lckcnt 6075 * and cowcnt because the page is exclusive locked. 6076 */ 6077 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0 || 6078 hat_page_is_mapped(pp)) { 6079 page_unlock(pp); 6080 continue; 6081 } 6082 6083 /* 6084 * dispose skips large pages so try to demote first. 6085 */ 6086 if (pp->p_szc != 0 && !page_try_demote_pages(pp)) { 6087 page_unlock(pp); 6088 /* 6089 * XXX should skip the remaining page_t's of this 6090 * large page. 6091 */ 6092 continue; 6093 } 6094 6095 ASSERT(pp->p_szc == 0); 6096 6097 /* 6098 * No longer mapped -- we can toss it out. How 6099 * we do so depends on whether or not it's dirty. 6100 */ 6101 if (hat_ismod(pp) && pp->p_vnode) { 6102 /* 6103 * We must clean the page before it can be 6104 * freed. Setting B_FREE will cause pvn_done 6105 * to free the page when the i/o completes. 
6106 * XXX: This also causes it to be accounted 6107 * as a pageout instead of a swap: need 6108 * B_SWAPOUT bit to use instead of B_FREE. 6109 * 6110 * Hold the vnode before releasing the page lock 6111 * to prevent it from being freed and re-used by 6112 * some other thread. 6113 */ 6114 VN_HOLD(vp); 6115 page_unlock(pp); 6116 6117 /* 6118 * Queue all i/o requests for the pageout thread 6119 * to avoid saturating the pageout devices. 6120 */ 6121 if (!queue_io_request(vp, off)) 6122 VN_RELE(vp); 6123 } else { 6124 /* 6125 * The page was clean, free it. 6126 * 6127 * XXX: Can we ever encounter modified pages 6128 * with no associated vnode here? 6129 */ 6130 ASSERT(pp->p_vnode != NULL); 6131 /*LINTED: constant in conditional context*/ 6132 VN_DISPOSE(pp, B_FREE, 0, kcred); 6133 } 6134 6135 /* 6136 * Credit now even if i/o is in progress. 6137 */ 6138 pgcnt++; 6139 } 6140 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6141 6142 /* 6143 * Wakeup pageout to initiate i/o on all queued requests. 6144 */ 6145 cv_signal_pageout(); 6146 return (ptob(pgcnt)); 6147 } 6148 6149 /* 6150 * Synchronize primary storage cache with real object in virtual memory. 6151 * 6152 * XXX - Anonymous pages should not be sync'ed out at all. 6153 */ 6154 static int 6155 segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) 6156 { 6157 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6158 struct vpage *vpp; 6159 page_t *pp; 6160 u_offset_t offset; 6161 struct vnode *vp; 6162 u_offset_t off; 6163 caddr_t eaddr; 6164 int bflags; 6165 int err = 0; 6166 int segtype; 6167 int pageprot; 6168 int prot; 6169 ulong_t anon_index; 6170 struct anon_map *amp; 6171 struct anon *ap; 6172 anon_sync_obj_t cookie; 6173 6174 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6175 6176 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6177 6178 if (svd->softlockcnt > 0) { 6179 /* 6180 * flush all pages from seg cache 6181 * otherwise we may deadlock in swap_putpage 6182 * for B_INVAL page (4175402). 6183 * 6184 * Even if we grab segvn WRITER's lock or segp_slock 6185 * here, there might be another thread which could've 6186 * successfully performed lookup/insert just before 6187 * we acquired the lock here. So, grabbing either 6188 * lock here is of not much use. Until we devise 6189 * a strategy at upper layers to solve the 6190 * synchronization issues completely, we expect 6191 * applications to handle this appropriately. 6192 */ 6193 segvn_purge(seg); 6194 if (svd->softlockcnt > 0) { 6195 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6196 return (EAGAIN); 6197 } 6198 } 6199 6200 vpp = svd->vpage; 6201 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 6202 bflags = ((flags & MS_ASYNC) ? B_ASYNC : 0) | 6203 ((flags & MS_INVALIDATE) ? B_INVAL : 0); 6204 6205 if (attr) { 6206 pageprot = attr & ~(SHARED|PRIVATE); 6207 segtype = (attr & SHARED) ? MAP_SHARED : MAP_PRIVATE; 6208 6209 /* 6210 * We are done if the segment types don't match 6211 * or if we have segment level protections and 6212 * they don't match. 
         */
        if (svd->type != segtype) {
            SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
            return (0);
        }
        if (vpp == NULL) {
            if (svd->prot != pageprot) {
                SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
                return (0);
            }
            prot = svd->prot;
        } else
            vpp = &svd->vpage[seg_page(seg, addr)];

    } else if (svd->vp && svd->amp == NULL &&
        (flags & MS_INVALIDATE) == 0) {

        /*
         * No attributes, no anonymous pages and MS_INVALIDATE flag
         * is not on, just use one big request.
         */
        err = VOP_PUTPAGE(svd->vp, (offset_t)offset, len,
            bflags, svd->cred);
        SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
        return (err);
    }

    if ((amp = svd->amp) != NULL)
        anon_index = svd->anon_index + seg_page(seg, addr);

    for (eaddr = addr + len; addr < eaddr; addr += PAGESIZE) {
        ap = NULL;
        if (amp != NULL) {
            ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
            anon_array_enter(amp, anon_index, &cookie);
            ap = anon_get_ptr(amp->ahp, anon_index++);
            if (ap != NULL) {
                swap_xlate(ap, &vp, &off);
            } else {
                vp = svd->vp;
                off = offset;
            }
            anon_array_exit(&cookie);
            ANON_LOCK_EXIT(&amp->a_rwlock);
        } else {
            vp = svd->vp;
            off = offset;
        }
        offset += PAGESIZE;

        if (vp == NULL)         /* untouched zfod page */
            continue;

        if (attr) {
            if (vpp) {
                prot = VPP_PROT(vpp);
                vpp++;
            }
            if (prot != pageprot) {
                continue;
            }
        }

        /*
         * See if any of these pages are locked -- if so, then we
         * will have to truncate an invalidate request at the first
         * locked one. We don't need the page_struct_lock to test
         * as this is only advisory; even if we acquire it someone
         * might race in and lock the page after we unlock and before
         * we do the PUTPAGE, then PUTPAGE simply does nothing.
         */
        if (flags & MS_INVALIDATE) {
            if ((pp = page_lookup(vp, off, SE_SHARED)) != NULL) {
                if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
                    page_unlock(pp);
                    SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
                    return (EBUSY);
                }
                if (ap != NULL && pp->p_szc != 0 &&
                    page_tryupgrade(pp)) {
                    if (pp->p_lckcnt == 0 &&
                        pp->p_cowcnt == 0) {
                        /*
                         * swapfs VN_DISPOSE() won't
                         * invalidate large pages.
                         * Attempt to demote.
                         * XXX can't help it if it
                         * fails. But for swapfs
                         * pages it is no big deal.
                         */
                        (void) page_try_demote_pages(
                            pp);
                    }
                }
                page_unlock(pp);
            }
        } else if (svd->type == MAP_SHARED && amp != NULL) {
            /*
             * Avoid writing ISM's large pages out to disk
             * because segspt_free_pages() relies on NULL an_pvp
             * of anon slots of such pages.
             */

            ASSERT(svd->vp == NULL);
            /*
             * swapfs uses page_lookup_nowait if not freeing or
             * invalidating and skips a page if
             * page_lookup_nowait returns NULL.
             */
            pp = page_lookup_nowait(vp, off, SE_SHARED);
            if (pp == NULL) {
                continue;
            }
            if (pp->p_szc != 0) {
                page_unlock(pp);
                continue;
            }

            /*
             * Note ISM pages are created large so (vp, off)'s
             * page cannot suddenly become large after we unlock
             * pp.
             */
            page_unlock(pp);
        }
        /*
         * XXX - Should ultimately try to kluster
         * calls to VOP_PUTPAGE() for performance.
         */
        VN_HOLD(vp);
        err = VOP_PUTPAGE(vp, (offset_t)off, PAGESIZE,
            bflags, svd->cred);
        VN_RELE(vp);
        if (err)
            break;
    }
    SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
    return (err);
}

/*
 * Determine if we have data corresponding to pages in the
 * primary storage virtual memory cache (i.e., "in core").
 */
static size_t
segvn_incore(struct seg *seg, caddr_t addr, size_t len, char *vec)
{
    struct segvn_data *svd = (struct segvn_data *)seg->s_data;
    struct vnode *vp, *avp;
    u_offset_t offset, aoffset;
    size_t p, ep;
    int ret;
    struct vpage *vpp;
    page_t *pp;
    uint_t start;
    struct anon_map *amp;       /* XXX - for locknest */
    struct anon *ap;
    uint_t attr;
    anon_sync_obj_t cookie;

    ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));

    SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
    if (svd->amp == NULL && svd->vp == NULL) {
        SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
        bzero(vec, btopr(len));
        return (len);   /* no anonymous pages created yet */
    }

    p = seg_page(seg, addr);
    ep = seg_page(seg, addr + len);
    start = svd->vp ? SEG_PAGE_VNODEBACKED : 0;

    amp = svd->amp;
    for (; p < ep; p++, addr += PAGESIZE) {
        vpp = (svd->vpage) ? &svd->vpage[p]: NULL;
        ret = start;
        ap = NULL;
        avp = NULL;
        /* Grab the vnode/offset for the anon slot */
        if (amp != NULL) {
            ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
            anon_array_enter(amp, svd->anon_index + p, &cookie);
            ap = anon_get_ptr(amp->ahp, svd->anon_index + p);
            if (ap != NULL) {
                swap_xlate(ap, &avp, &aoffset);
            }
            anon_array_exit(&cookie);
            ANON_LOCK_EXIT(&amp->a_rwlock);
        }
        if ((avp != NULL) && page_exists(avp, aoffset)) {
            /* A page exists for the anon slot */
            ret |= SEG_PAGE_INCORE;

            /*
             * If page is mapped and writable
             */
            attr = (uint_t)0;
            if ((hat_getattr(seg->s_as->a_hat, addr,
                &attr) != -1) && (attr & PROT_WRITE)) {
                ret |= SEG_PAGE_ANON;
            }
            /*
             * Don't get page_struct lock for lckcnt and cowcnt,
             * since this is purely advisory.
             */
            if ((pp = page_lookup_nowait(avp, aoffset,
                SE_SHARED)) != NULL) {
                if (pp->p_lckcnt)
                    ret |= SEG_PAGE_SOFTLOCK;
                if (pp->p_cowcnt)
                    ret |= SEG_PAGE_HASCOW;
                page_unlock(pp);
            }
        }

        /* Gather vnode statistics */
        vp = svd->vp;
        offset = svd->offset + (uintptr_t)(addr - seg->s_base);

        if (vp != NULL) {
            /*
             * Try to obtain a "shared" lock on the page
             * without blocking. If this fails, determine
             * if the page is in memory.
             */
            pp = page_lookup_nowait(vp, offset, SE_SHARED);
            if ((pp == NULL) && (page_exists(vp, offset))) {
                /* Page is incore, and is named */
                ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE);
            }
            /*
             * Don't get page_struct lock for lckcnt and cowcnt,
             * since this is purely advisory.
6447 */ 6448 if (pp != NULL) { 6449 ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE); 6450 if (pp->p_lckcnt) 6451 ret |= SEG_PAGE_SOFTLOCK; 6452 if (pp->p_cowcnt) 6453 ret |= SEG_PAGE_HASCOW; 6454 page_unlock(pp); 6455 } 6456 } 6457 6458 /* Gather virtual page information */ 6459 if (vpp) { 6460 if (VPP_ISPPLOCK(vpp)) 6461 ret |= SEG_PAGE_LOCKED; 6462 vpp++; 6463 } 6464 6465 *vec++ = (char)ret; 6466 } 6467 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6468 return (len); 6469 } 6470 6471 /* 6472 * Statement for p_cowcnts/p_lckcnts. 6473 * 6474 * p_cowcnt is updated while mlock/munlocking MAP_PRIVATE and PROT_WRITE region 6475 * irrespective of the following factors or anything else: 6476 * 6477 * (1) anon slots are populated or not 6478 * (2) cow is broken or not 6479 * (3) refcnt on ap is 1 or greater than 1 6480 * 6481 * If it's not MAP_PRIVATE and PROT_WRITE, p_lckcnt is updated during mlock 6482 * and munlock. 6483 * 6484 * 6485 * Handling p_cowcnts/p_lckcnts during copy-on-write fault: 6486 * 6487 * if vpage has PROT_WRITE 6488 * transfer cowcnt on the oldpage -> cowcnt on the newpage 6489 * else 6490 * transfer lckcnt on the oldpage -> lckcnt on the newpage 6491 * 6492 * During copy-on-write, decrement p_cowcnt on the oldpage and increment 6493 * p_cowcnt on the newpage *if* the corresponding vpage has PROT_WRITE. 6494 * 6495 * We may also break COW if softlocking on read access in the physio case. 6496 * In this case, vpage may not have PROT_WRITE. So, we need to decrement 6497 * p_lckcnt on the oldpage and increment p_lckcnt on the newpage *if* the 6498 * vpage doesn't have PROT_WRITE. 6499 * 6500 * 6501 * Handling p_cowcnts/p_lckcnts during mprotect on mlocked region: 6502 * 6503 * If a MAP_PRIVATE region loses PROT_WRITE, we decrement p_cowcnt and 6504 * increment p_lckcnt by calling page_subclaim() which takes care of 6505 * availrmem accounting and p_lckcnt overflow. 6506 * 6507 * If a MAP_PRIVATE region gains PROT_WRITE, we decrement p_lckcnt and 6508 * increment p_cowcnt by calling page_addclaim() which takes care of 6509 * availrmem availability and p_cowcnt overflow. 6510 */ 6511 6512 /* 6513 * Lock down (or unlock) pages mapped by this segment. 6514 * 6515 * XXX only creates PAGESIZE pages if anon slots are not initialized. 6516 * At fault time they will be relocated into larger pages. 6517 */ 6518 static int 6519 segvn_lockop(struct seg *seg, caddr_t addr, size_t len, 6520 int attr, int op, ulong_t *lockmap, size_t pos) 6521 { 6522 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6523 struct vpage *vpp; 6524 struct vpage *evp; 6525 page_t *pp; 6526 u_offset_t offset; 6527 u_offset_t off; 6528 int segtype; 6529 int pageprot; 6530 int claim; 6531 struct vnode *vp; 6532 ulong_t anon_index; 6533 struct anon_map *amp; 6534 struct anon *ap; 6535 struct vattr va; 6536 anon_sync_obj_t cookie; 6537 6538 /* 6539 * Hold write lock on address space because may split or concatenate 6540 * segments 6541 */ 6542 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6543 6544 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 6545 if (attr) { 6546 pageprot = attr & ~(SHARED|PRIVATE); 6547 segtype = attr & SHARED ? MAP_SHARED : MAP_PRIVATE; 6548 6549 /* 6550 * We are done if the segment types don't match 6551 * or if we have segment level protections and 6552 * they don't match. 
6553 */ 6554 if (svd->type != segtype) { 6555 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6556 return (0); 6557 } 6558 if (svd->pageprot == 0 && svd->prot != pageprot) { 6559 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6560 return (0); 6561 } 6562 } 6563 6564 /* 6565 * If we're locking, then we must create a vpage structure if 6566 * none exists. If we're unlocking, then check to see if there 6567 * is a vpage -- if not, then we could not have locked anything. 6568 */ 6569 6570 if ((vpp = svd->vpage) == NULL) { 6571 if (op == MC_LOCK) 6572 segvn_vpage(seg); 6573 else { 6574 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6575 return (0); 6576 } 6577 } 6578 6579 /* 6580 * The anonymous data vector (i.e., previously 6581 * unreferenced mapping to swap space) can be allocated 6582 * by lazily testing for its existence. 6583 */ 6584 if (op == MC_LOCK && svd->amp == NULL && svd->vp == NULL) { 6585 svd->amp = anonmap_alloc(seg->s_size, 0); 6586 svd->amp->a_szc = seg->s_szc; 6587 } 6588 6589 if ((amp = svd->amp) != NULL) { 6590 anon_index = svd->anon_index + seg_page(seg, addr); 6591 } 6592 6593 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 6594 evp = &svd->vpage[seg_page(seg, addr + len)]; 6595 6596 /* 6597 * Loop over all pages in the range. Process if we're locking and 6598 * page has not already been locked in this mapping; or if we're 6599 * unlocking and the page has been locked. 6600 */ 6601 for (vpp = &svd->vpage[seg_page(seg, addr)]; vpp < evp; 6602 vpp++, pos++, addr += PAGESIZE, offset += PAGESIZE, anon_index++) { 6603 if ((attr == 0 || VPP_PROT(vpp) == pageprot) && 6604 ((op == MC_LOCK && !VPP_ISPPLOCK(vpp)) || 6605 (op == MC_UNLOCK && VPP_ISPPLOCK(vpp)))) { 6606 6607 if (amp != NULL) 6608 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 6609 /* 6610 * If this isn't a MAP_NORESERVE segment and 6611 * we're locking, allocate anon slots if they 6612 * don't exist. The page is brought in later on. 6613 */ 6614 if (op == MC_LOCK && svd->vp == NULL && 6615 ((svd->flags & MAP_NORESERVE) == 0) && 6616 amp != NULL && 6617 ((ap = anon_get_ptr(amp->ahp, anon_index)) 6618 == NULL)) { 6619 anon_array_enter(amp, anon_index, &cookie); 6620 6621 if ((ap = anon_get_ptr(amp->ahp, 6622 anon_index)) == NULL) { 6623 pp = anon_zero(seg, addr, &ap, 6624 svd->cred); 6625 if (pp == NULL) { 6626 anon_array_exit(&cookie); 6627 ANON_LOCK_EXIT(&->a_rwlock); 6628 SEGVN_LOCK_EXIT(seg->s_as, 6629 &svd->lock); 6630 return (ENOMEM); 6631 } 6632 ASSERT(anon_get_ptr(amp->ahp, 6633 anon_index) == NULL); 6634 (void) anon_set_ptr(amp->ahp, 6635 anon_index, ap, ANON_SLEEP); 6636 page_unlock(pp); 6637 } 6638 anon_array_exit(&cookie); 6639 } 6640 6641 /* 6642 * Get name for page, accounting for 6643 * existence of private copy. 6644 */ 6645 ap = NULL; 6646 if (amp != NULL) { 6647 anon_array_enter(amp, anon_index, &cookie); 6648 ap = anon_get_ptr(amp->ahp, anon_index); 6649 if (ap != NULL) { 6650 swap_xlate(ap, &vp, &off); 6651 } else { 6652 if (svd->vp == NULL && 6653 (svd->flags & MAP_NORESERVE)) { 6654 anon_array_exit(&cookie); 6655 ANON_LOCK_EXIT(&->a_rwlock); 6656 continue; 6657 } 6658 vp = svd->vp; 6659 off = offset; 6660 } 6661 anon_array_exit(&cookie); 6662 ANON_LOCK_EXIT(&->a_rwlock); 6663 } else { 6664 vp = svd->vp; 6665 off = offset; 6666 } 6667 6668 /* 6669 * Get page frame. It's ok if the page is 6670 * not available when we're unlocking, as this 6671 * may simply mean that a page we locked got 6672 * truncated out of existence after we locked it. 
6673 * 6674 * Invoke VOP_GETPAGE() to obtain the page struct 6675 * since we may need to read it from disk if its 6676 * been paged out. 6677 */ 6678 if (op != MC_LOCK) 6679 pp = page_lookup(vp, off, SE_SHARED); 6680 else { 6681 page_t *pl[1 + 1]; 6682 int error; 6683 6684 ASSERT(vp != NULL); 6685 6686 error = VOP_GETPAGE(vp, (offset_t)off, PAGESIZE, 6687 (uint_t *)NULL, pl, PAGESIZE, seg, addr, 6688 S_OTHER, svd->cred); 6689 6690 /* 6691 * If the error is EDEADLK then we must bounce 6692 * up and drop all vm subsystem locks and then 6693 * retry the operation later 6694 * This behavior is a temporary measure because 6695 * ufs/sds logging is badly designed and will 6696 * deadlock if we don't allow this bounce to 6697 * happen. The real solution is to re-design 6698 * the logging code to work properly. See bug 6699 * 4125102 for details of the problem. 6700 */ 6701 if (error == EDEADLK) { 6702 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6703 return (error); 6704 } 6705 /* 6706 * Quit if we fail to fault in the page. Treat 6707 * the failure as an error, unless the addr 6708 * is mapped beyond the end of a file. 6709 */ 6710 if (error && svd->vp) { 6711 va.va_mask = AT_SIZE; 6712 if (VOP_GETATTR(svd->vp, &va, 0, 6713 svd->cred) != 0) { 6714 SEGVN_LOCK_EXIT(seg->s_as, 6715 &svd->lock); 6716 return (EIO); 6717 } 6718 if (btopr(va.va_size) >= 6719 btopr(off + 1)) { 6720 SEGVN_LOCK_EXIT(seg->s_as, 6721 &svd->lock); 6722 return (EIO); 6723 } 6724 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6725 return (0); 6726 } else if (error) { 6727 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6728 return (EIO); 6729 } 6730 pp = pl[0]; 6731 ASSERT(pp != NULL); 6732 } 6733 6734 /* 6735 * See Statement at the beginning of this routine. 6736 * 6737 * claim is always set if MAP_PRIVATE and PROT_WRITE 6738 * irrespective of following factors: 6739 * 6740 * (1) anon slots are populated or not 6741 * (2) cow is broken or not 6742 * (3) refcnt on ap is 1 or greater than 1 6743 * 6744 * See 4140683 for details 6745 */ 6746 claim = ((VPP_PROT(vpp) & PROT_WRITE) && 6747 (svd->type == MAP_PRIVATE)); 6748 6749 /* 6750 * Perform page-level operation appropriate to 6751 * operation. If locking, undo the SOFTLOCK 6752 * performed to bring the page into memory 6753 * after setting the lock. If unlocking, 6754 * and no page was found, account for the claim 6755 * separately. 6756 */ 6757 if (op == MC_LOCK) { 6758 int ret = 1; /* Assume success */ 6759 6760 /* 6761 * Make sure another thread didn't lock 6762 * the page after we released the segment 6763 * lock. 
6764 */ 6765 if ((attr == 0 || VPP_PROT(vpp) == pageprot) && 6766 !VPP_ISPPLOCK(vpp)) { 6767 ret = page_pp_lock(pp, claim, 0); 6768 if (ret != 0) { 6769 VPP_SETPPLOCK(vpp); 6770 if (lockmap != (ulong_t *)NULL) 6771 BT_SET(lockmap, pos); 6772 } 6773 } 6774 page_unlock(pp); 6775 if (ret == 0) { 6776 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6777 return (EAGAIN); 6778 } 6779 } else { 6780 if (pp != NULL) { 6781 if ((attr == 0 || 6782 VPP_PROT(vpp) == pageprot) && 6783 VPP_ISPPLOCK(vpp)) 6784 page_pp_unlock(pp, claim, 0); 6785 page_unlock(pp); 6786 } 6787 VPP_CLRPPLOCK(vpp); 6788 } 6789 } 6790 } 6791 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6792 return (0); 6793 } 6794 6795 /* 6796 * Set advice from user for specified pages 6797 * There are 5 types of advice: 6798 * MADV_NORMAL - Normal (default) behavior (whatever that is) 6799 * MADV_RANDOM - Random page references 6800 * do not allow readahead or 'klustering' 6801 * MADV_SEQUENTIAL - Sequential page references 6802 * Pages previous to the one currently being 6803 * accessed (determined by fault) are 'not needed' 6804 * and are freed immediately 6805 * MADV_WILLNEED - Pages are likely to be used (fault ahead in mctl) 6806 * MADV_DONTNEED - Pages are not needed (synced out in mctl) 6807 * MADV_FREE - Contents can be discarded 6808 * MADV_ACCESS_DEFAULT- Default access 6809 * MADV_ACCESS_LWP - Next LWP will access heavily 6810 * MADV_ACCESS_MANY- Many LWPs or processes will access heavily 6811 */ 6812 static int 6813 segvn_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav) 6814 { 6815 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6816 size_t page; 6817 int err = 0; 6818 int already_set; 6819 struct anon_map *amp; 6820 ulong_t anon_index; 6821 struct seg *next; 6822 lgrp_mem_policy_t policy; 6823 struct seg *prev; 6824 struct vnode *vp; 6825 6826 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6827 6828 /* 6829 * In case of MADV_FREE, we won't be modifying any segment private 6830 * data structures; so, we only need to grab READER's lock 6831 */ 6832 if (behav != MADV_FREE) 6833 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 6834 else 6835 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6836 6837 /* 6838 * Large pages are assumed to be only turned on when accesses to the 6839 * segment's address range have spatial and temporal locality. That 6840 * justifies ignoring MADV_SEQUENTIAL for large page segments. 6841 * Also, ignore advice affecting lgroup memory allocation 6842 * if don't need to do lgroup optimizations on this system 6843 */ 6844 6845 if ((behav == MADV_SEQUENTIAL && seg->s_szc != 0) || 6846 (!lgrp_optimizations() && (behav == MADV_ACCESS_DEFAULT || 6847 behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY))) { 6848 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6849 return (0); 6850 } 6851 6852 if (behav == MADV_SEQUENTIAL || behav == MADV_ACCESS_DEFAULT || 6853 behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY) { 6854 /* 6855 * Since we are going to unload hat mappings 6856 * we first have to flush the cache. Otherwise 6857 * this might lead to system panic if another 6858 * thread is doing physio on the range whose 6859 * mappings are unloaded by madvise(3C). 6860 */ 6861 if (svd->softlockcnt > 0) { 6862 /* 6863 * Since we do have the segvn writers lock 6864 * nobody can fill the cache with entries 6865 * belonging to this seg during the purge. 6866 * The flush either succeeds or we still 6867 * have pending I/Os. In the later case, 6868 * madvise(3C) fails. 
6869 */ 6870 segvn_purge(seg); 6871 if (svd->softlockcnt > 0) { 6872 /* 6873 * Since madvise(3C) is advisory and 6874 * it's not part of UNIX98, madvise(3C) 6875 * failure here doesn't cause any hardship. 6876 * Note that we don't block in "as" layer. 6877 */ 6878 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6879 return (EAGAIN); 6880 } 6881 } 6882 } 6883 6884 amp = svd->amp; 6885 vp = svd->vp; 6886 if (behav == MADV_FREE) { 6887 /* 6888 * MADV_FREE is not supported for segments with 6889 * underlying object; if anonmap is NULL, anon slots 6890 * are not yet populated and there is nothing for 6891 * us to do. As MADV_FREE is advisory, we don't 6892 * return error in either case. 6893 */ 6894 if (vp || amp == NULL) { 6895 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6896 return (0); 6897 } 6898 6899 page = seg_page(seg, addr); 6900 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 6901 anon_disclaim(amp, svd->anon_index + page, len, 0); 6902 ANON_LOCK_EXIT(&->a_rwlock); 6903 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6904 return (0); 6905 } 6906 6907 /* 6908 * If advice is to be applied to entire segment, 6909 * use advice field in seg_data structure 6910 * otherwise use appropriate vpage entry. 6911 */ 6912 if ((addr == seg->s_base) && (len == seg->s_size)) { 6913 switch (behav) { 6914 case MADV_ACCESS_LWP: 6915 case MADV_ACCESS_MANY: 6916 case MADV_ACCESS_DEFAULT: 6917 /* 6918 * Set memory allocation policy for this segment 6919 */ 6920 policy = lgrp_madv_to_policy(behav, len, svd->type); 6921 if (svd->type == MAP_SHARED) 6922 already_set = lgrp_shm_policy_set(policy, amp, 6923 svd->anon_index, vp, svd->offset, len); 6924 else { 6925 /* 6926 * For private memory, need writers lock on 6927 * address space because the segment may be 6928 * split or concatenated when changing policy 6929 */ 6930 if (AS_READ_HELD(seg->s_as, 6931 &seg->s_as->a_lock)) { 6932 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6933 return (IE_RETRY); 6934 } 6935 6936 already_set = lgrp_privm_policy_set(policy, 6937 &svd->policy_info, len); 6938 } 6939 6940 /* 6941 * If policy set already and it shouldn't be reapplied, 6942 * don't do anything. 6943 */ 6944 if (already_set && 6945 !LGRP_MEM_POLICY_REAPPLICABLE(policy)) 6946 break; 6947 6948 /* 6949 * Mark any existing pages in given range for 6950 * migration 6951 */ 6952 page_mark_migrate(seg, addr, len, amp, svd->anon_index, 6953 vp, svd->offset, 1); 6954 6955 /* 6956 * If same policy set already or this is a shared 6957 * memory segment, don't need to try to concatenate 6958 * segment with adjacent ones. 6959 */ 6960 if (already_set || svd->type == MAP_SHARED) 6961 break; 6962 6963 /* 6964 * Try to concatenate this segment with previous 6965 * one and next one, since we changed policy for 6966 * this one and it may be compatible with adjacent 6967 * ones now. 
6968 */ 6969 prev = AS_SEGPREV(seg->s_as, seg); 6970 next = AS_SEGNEXT(seg->s_as, seg); 6971 6972 if (next && next->s_ops == &segvn_ops && 6973 addr + len == next->s_base) 6974 (void) segvn_concat(seg, next, 1); 6975 6976 if (prev && prev->s_ops == &segvn_ops && 6977 addr == prev->s_base + prev->s_size) { 6978 /* 6979 * Drop lock for private data of current 6980 * segment before concatenating (deleting) it 6981 * and return IE_REATTACH to tell as_ctl() that 6982 * current segment has changed 6983 */ 6984 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6985 if (!segvn_concat(prev, seg, 1)) 6986 err = IE_REATTACH; 6987 6988 return (err); 6989 } 6990 break; 6991 6992 case MADV_SEQUENTIAL: 6993 /* 6994 * unloading mapping guarantees 6995 * detection in segvn_fault 6996 */ 6997 ASSERT(seg->s_szc == 0); 6998 hat_unload(seg->s_as->a_hat, addr, len, 6999 HAT_UNLOAD); 7000 /* FALLTHROUGH */ 7001 case MADV_NORMAL: 7002 case MADV_RANDOM: 7003 svd->advice = (uchar_t)behav; 7004 svd->pageadvice = 0; 7005 break; 7006 case MADV_WILLNEED: /* handled in memcntl */ 7007 case MADV_DONTNEED: /* handled in memcntl */ 7008 case MADV_FREE: /* handled above */ 7009 break; 7010 default: 7011 err = EINVAL; 7012 } 7013 } else { 7014 caddr_t eaddr; 7015 struct seg *new_seg; 7016 struct segvn_data *new_svd; 7017 u_offset_t off; 7018 caddr_t oldeaddr; 7019 7020 page = seg_page(seg, addr); 7021 7022 segvn_vpage(seg); 7023 7024 switch (behav) { 7025 struct vpage *bvpp, *evpp; 7026 7027 case MADV_ACCESS_LWP: 7028 case MADV_ACCESS_MANY: 7029 case MADV_ACCESS_DEFAULT: 7030 /* 7031 * Set memory allocation policy for portion of this 7032 * segment 7033 */ 7034 7035 /* 7036 * Align address and length of advice to page 7037 * boundaries for large pages 7038 */ 7039 if (seg->s_szc != 0) { 7040 size_t pgsz; 7041 7042 pgsz = page_get_pagesize(seg->s_szc); 7043 addr = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 7044 len = P2ROUNDUP(len, pgsz); 7045 } 7046 7047 /* 7048 * Check to see whether policy is set already 7049 */ 7050 policy = lgrp_madv_to_policy(behav, len, svd->type); 7051 7052 anon_index = svd->anon_index + page; 7053 off = svd->offset + (uintptr_t)(addr - seg->s_base); 7054 7055 if (svd->type == MAP_SHARED) 7056 already_set = lgrp_shm_policy_set(policy, amp, 7057 anon_index, vp, off, len); 7058 else 7059 already_set = 7060 (policy == svd->policy_info.mem_policy); 7061 7062 /* 7063 * If policy set already and it shouldn't be reapplied, 7064 * don't do anything. 
7065 */ 7066 if (already_set && 7067 !LGRP_MEM_POLICY_REAPPLICABLE(policy)) 7068 break; 7069 7070 /* 7071 * For private memory, need writers lock on 7072 * address space because the segment may be 7073 * split or concatenated when changing policy 7074 */ 7075 if (svd->type == MAP_PRIVATE && 7076 AS_READ_HELD(seg->s_as, &seg->s_as->a_lock)) { 7077 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7078 return (IE_RETRY); 7079 } 7080 7081 /* 7082 * Mark any existing pages in given range for 7083 * migration 7084 */ 7085 page_mark_migrate(seg, addr, len, amp, svd->anon_index, 7086 vp, svd->offset, 1); 7087 7088 /* 7089 * Don't need to try to split or concatenate 7090 * segments, since policy is same or this is a shared 7091 * memory segment 7092 */ 7093 if (already_set || svd->type == MAP_SHARED) 7094 break; 7095 7096 /* 7097 * Split off new segment if advice only applies to a 7098 * portion of existing segment starting in middle 7099 */ 7100 new_seg = NULL; 7101 eaddr = addr + len; 7102 oldeaddr = seg->s_base + seg->s_size; 7103 if (addr > seg->s_base) { 7104 /* 7105 * Must flush I/O page cache 7106 * before splitting segment 7107 */ 7108 if (svd->softlockcnt > 0) 7109 segvn_purge(seg); 7110 7111 /* 7112 * Split segment and return IE_REATTACH to tell 7113 * as_ctl() that current segment changed 7114 */ 7115 new_seg = segvn_split_seg(seg, addr); 7116 new_svd = (struct segvn_data *)new_seg->s_data; 7117 err = IE_REATTACH; 7118 7119 /* 7120 * If new segment ends where old one 7121 * did, try to concatenate the new 7122 * segment with next one. 7123 */ 7124 if (eaddr == oldeaddr) { 7125 /* 7126 * Set policy for new segment 7127 */ 7128 (void) lgrp_privm_policy_set(policy, 7129 &new_svd->policy_info, 7130 new_seg->s_size); 7131 7132 next = AS_SEGNEXT(new_seg->s_as, 7133 new_seg); 7134 7135 if (next && 7136 next->s_ops == &segvn_ops && 7137 eaddr == next->s_base) 7138 (void) segvn_concat(new_seg, 7139 next, 1); 7140 } 7141 } 7142 7143 /* 7144 * Split off end of existing segment if advice only 7145 * applies to a portion of segment ending before 7146 * end of the existing segment 7147 */ 7148 if (eaddr < oldeaddr) { 7149 /* 7150 * Must flush I/O page cache 7151 * before splitting segment 7152 */ 7153 if (svd->softlockcnt > 0) 7154 segvn_purge(seg); 7155 7156 /* 7157 * If beginning of old segment was already 7158 * split off, use new segment to split end off 7159 * from. 7160 */ 7161 if (new_seg != NULL && new_seg != seg) { 7162 /* 7163 * Split segment 7164 */ 7165 (void) segvn_split_seg(new_seg, eaddr); 7166 7167 /* 7168 * Set policy for new segment 7169 */ 7170 (void) lgrp_privm_policy_set(policy, 7171 &new_svd->policy_info, 7172 new_seg->s_size); 7173 } else { 7174 /* 7175 * Split segment and return IE_REATTACH 7176 * to tell as_ctl() that current 7177 * segment changed 7178 */ 7179 (void) segvn_split_seg(seg, eaddr); 7180 err = IE_REATTACH; 7181 7182 (void) lgrp_privm_policy_set(policy, 7183 &svd->policy_info, seg->s_size); 7184 7185 /* 7186 * If new segment starts where old one 7187 * did, try to concatenate it with 7188 * previous segment. 
7189 */ 7190 if (addr == seg->s_base) { 7191 prev = AS_SEGPREV(seg->s_as, 7192 seg); 7193 7194 /* 7195 * Drop lock for private data 7196 * of current segment before 7197 * concatenating (deleting) it 7198 */ 7199 if (prev && 7200 prev->s_ops == 7201 &segvn_ops && 7202 addr == prev->s_base + 7203 prev->s_size) { 7204 SEGVN_LOCK_EXIT( 7205 seg->s_as, 7206 &svd->lock); 7207 (void) segvn_concat( 7208 prev, seg, 1); 7209 return (err); 7210 } 7211 } 7212 } 7213 } 7214 break; 7215 case MADV_SEQUENTIAL: 7216 ASSERT(seg->s_szc == 0); 7217 hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD); 7218 /* FALLTHROUGH */ 7219 case MADV_NORMAL: 7220 case MADV_RANDOM: 7221 bvpp = &svd->vpage[page]; 7222 evpp = &svd->vpage[page + (len >> PAGESHIFT)]; 7223 for (; bvpp < evpp; bvpp++) 7224 VPP_SETADVICE(bvpp, behav); 7225 svd->advice = MADV_NORMAL; 7226 break; 7227 case MADV_WILLNEED: /* handled in memcntl */ 7228 case MADV_DONTNEED: /* handled in memcntl */ 7229 case MADV_FREE: /* handled above */ 7230 break; 7231 default: 7232 err = EINVAL; 7233 } 7234 } 7235 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7236 return (err); 7237 } 7238 7239 /* 7240 * Create a vpage structure for this seg. 7241 */ 7242 static void 7243 segvn_vpage(struct seg *seg) 7244 { 7245 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7246 struct vpage *vp, *evp; 7247 7248 ASSERT(SEGVN_WRITE_HELD(seg->s_as, &svd->lock)); 7249 7250 /* 7251 * If no vpage structure exists, allocate one. Copy the protections 7252 * and the advice from the segment itself to the individual pages. 7253 */ 7254 if (svd->vpage == NULL) { 7255 svd->pageprot = 1; 7256 svd->pageadvice = 1; 7257 svd->vpage = kmem_zalloc(seg_pages(seg) * sizeof (struct vpage), 7258 KM_SLEEP); 7259 evp = &svd->vpage[seg_page(seg, seg->s_base + seg->s_size)]; 7260 for (vp = svd->vpage; vp < evp; vp++) { 7261 VPP_SETPROT(vp, svd->prot); 7262 VPP_SETADVICE(vp, svd->advice); 7263 } 7264 } 7265 } 7266 7267 /* 7268 * Dump the pages belonging to this segvn segment. 7269 */ 7270 static void 7271 segvn_dump(struct seg *seg) 7272 { 7273 struct segvn_data *svd; 7274 page_t *pp; 7275 struct anon_map *amp; 7276 ulong_t anon_index; 7277 struct vnode *vp; 7278 u_offset_t off, offset; 7279 pfn_t pfn; 7280 pgcnt_t page, npages; 7281 caddr_t addr; 7282 7283 npages = seg_pages(seg); 7284 svd = (struct segvn_data *)seg->s_data; 7285 vp = svd->vp; 7286 off = offset = svd->offset; 7287 addr = seg->s_base; 7288 7289 if ((amp = svd->amp) != NULL) { 7290 anon_index = svd->anon_index; 7291 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7292 } 7293 7294 for (page = 0; page < npages; page++, offset += PAGESIZE) { 7295 struct anon *ap; 7296 int we_own_it = 0; 7297 7298 if (amp && (ap = anon_get_ptr(svd->amp->ahp, anon_index++))) { 7299 swap_xlate_nopanic(ap, &vp, &off); 7300 } else { 7301 vp = svd->vp; 7302 off = offset; 7303 } 7304 7305 /* 7306 * If pp == NULL, the page either does not exist 7307 * or is exclusively locked. So determine if it 7308 * exists before searching for it. 7309 */ 7310 7311 if ((pp = page_lookup_nowait(vp, off, SE_SHARED))) 7312 we_own_it = 1; 7313 else 7314 pp = page_exists(vp, off); 7315 7316 if (pp) { 7317 pfn = page_pptonum(pp); 7318 dump_addpage(seg->s_as, addr, pfn); 7319 if (we_own_it) 7320 page_unlock(pp); 7321 } 7322 addr += PAGESIZE; 7323 dump_timeleft = dump_timeout; 7324 } 7325 7326 if (amp != NULL) 7327 ANON_LOCK_EXIT(&->a_rwlock); 7328 } 7329 7330 /* 7331 * lock/unlock anon pages over a given range. 
Return shadow list 7332 */ 7333 static int 7334 segvn_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***ppp, 7335 enum lock_type type, enum seg_rw rw) 7336 { 7337 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7338 size_t np, adjustpages = 0, npages = (len >> PAGESHIFT); 7339 ulong_t anon_index; 7340 uint_t protchk; 7341 uint_t error; 7342 struct anon_map *amp; 7343 struct page **pplist, **pl, *pp; 7344 caddr_t a; 7345 size_t page; 7346 caddr_t lpgaddr, lpgeaddr; 7347 7348 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_START, 7349 "segvn_pagelock: start seg %p addr %p", seg, addr); 7350 7351 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 7352 if (seg->s_szc != 0 && (type == L_PAGELOCK || type == L_PAGEUNLOCK)) { 7353 /* 7354 * We are adjusting the pagelock region to the large page size 7355 * boundary because the unlocked part of a large page cannot 7356 * be freed anyway unless all constituent pages of a large 7357 * page are locked. Therefore this adjustment allows us to 7358 * decrement availrmem by the right value (note we don't want 7359 * to just decrement availrem by the large page size without 7360 * adjusting addr and len because then we may end up 7361 * decrementing availrmem by large page size for every 7362 * constituent page locked by a new as_pagelock call). 7363 * as_pageunlock caller must always match as_pagelock call's 7364 * addr and len. 7365 * 7366 * Note segment's page size cannot change while we are holding 7367 * as lock. And then it cannot change while softlockcnt is 7368 * not 0. This will allow us to correctly recalculate large 7369 * page size region for the matching pageunlock/reclaim call. 7370 * 7371 * for pageunlock *ppp points to the pointer of page_t that 7372 * corresponds to the real unadjusted start address. Similar 7373 * for pagelock *ppp must point to the pointer of page_t that 7374 * corresponds to the real unadjusted start address. 7375 */ 7376 size_t pgsz = page_get_pagesize(seg->s_szc); 7377 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 7378 adjustpages = ((uintptr_t)(addr - lpgaddr)) >> PAGESHIFT; 7379 } 7380 7381 if (type == L_PAGEUNLOCK) { 7382 7383 /* 7384 * update hat ref bits for /proc. We need to make sure 7385 * that threads tracing the ref and mod bits of the 7386 * address space get the right data. 7387 * Note: page ref and mod bits are updated at reclaim time 7388 */ 7389 if (seg->s_as->a_vbits) { 7390 for (a = addr; a < addr + len; a += PAGESIZE) { 7391 if (rw == S_WRITE) { 7392 hat_setstat(seg->s_as, a, 7393 PAGESIZE, P_REF | P_MOD); 7394 } else { 7395 hat_setstat(seg->s_as, a, 7396 PAGESIZE, P_REF); 7397 } 7398 } 7399 } 7400 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7401 if (seg->s_szc != 0) { 7402 VM_STAT_ADD(segvnvmstats.pagelock[0]); 7403 seg_pinactive(seg, lpgaddr, lpgeaddr - lpgaddr, 7404 *ppp - adjustpages, rw, segvn_reclaim); 7405 } else { 7406 seg_pinactive(seg, addr, len, *ppp, rw, segvn_reclaim); 7407 } 7408 7409 /* 7410 * If someone is blocked while unmapping, we purge 7411 * segment page cache and thus reclaim pplist synchronously 7412 * without waiting for seg_pasync_thread. This speeds up 7413 * unmapping in cases where munmap(2) is called, while 7414 * raw async i/o is still in progress or where a thread 7415 * exits on data fault in a multithreaded application. 
7416 */ 7417 if (AS_ISUNMAPWAIT(seg->s_as) && (svd->softlockcnt > 0)) { 7418 /* 7419 * Even if we grab segvn WRITER's lock or segp_slock 7420 * here, there might be another thread which could've 7421 * successfully performed lookup/insert just before 7422 * we acquired the lock here. So, grabbing either 7423 * lock here is of not much use. Until we devise 7424 * a strategy at upper layers to solve the 7425 * synchronization issues completely, we expect 7426 * applications to handle this appropriately. 7427 */ 7428 segvn_purge(seg); 7429 } 7430 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7431 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END, 7432 "segvn_pagelock: unlock seg %p addr %p", seg, addr); 7433 return (0); 7434 } else if (type == L_PAGERECLAIM) { 7435 VM_STAT_COND_ADD(seg->s_szc != 0, segvnvmstats.pagelock[1]); 7436 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7437 (void) segvn_reclaim(seg, addr, len, *ppp, rw); 7438 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7439 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END, 7440 "segvn_pagelock: reclaim seg %p addr %p", seg, addr); 7441 return (0); 7442 } 7443 7444 if (seg->s_szc != 0) { 7445 VM_STAT_ADD(segvnvmstats.pagelock[2]); 7446 addr = lpgaddr; 7447 len = lpgeaddr - lpgaddr; 7448 npages = (len >> PAGESHIFT); 7449 } 7450 7451 /* 7452 * for now we only support pagelock to anon memory. We've to check 7453 * protections for vnode objects and call into the vnode driver. 7454 * That's too much for a fast path. Let the fault entry point handle it. 7455 */ 7456 if (svd->vp != NULL) { 7457 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, 7458 "segvn_pagelock: mapped vnode seg %p addr %p", seg, addr); 7459 *ppp = NULL; 7460 return (ENOTSUP); 7461 } 7462 7463 /* 7464 * if anonmap is not yet created, let the fault entry point populate it 7465 * with anon ptrs. 
7466 */ 7467 if ((amp = svd->amp) == NULL) { 7468 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, 7469 "segvn_pagelock: anonmap null seg %p addr %p", seg, addr); 7470 *ppp = NULL; 7471 return (EFAULT); 7472 } 7473 7474 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7475 7476 /* 7477 * we acquire segp_slock to prevent duplicate entries 7478 * in seg_pcache 7479 */ 7480 mutex_enter(&svd->segp_slock); 7481 7482 /* 7483 * try to find pages in segment page cache 7484 */ 7485 pplist = seg_plookup(seg, addr, len, rw); 7486 if (pplist != NULL) { 7487 mutex_exit(&svd->segp_slock); 7488 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7489 *ppp = pplist + adjustpages; 7490 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_HIT_END, 7491 "segvn_pagelock: cache hit seg %p addr %p", seg, addr); 7492 return (0); 7493 } 7494 7495 if (rw == S_READ) { 7496 protchk = PROT_READ; 7497 } else { 7498 protchk = PROT_WRITE; 7499 } 7500 7501 if (svd->pageprot == 0) { 7502 if ((svd->prot & protchk) == 0) { 7503 mutex_exit(&svd->segp_slock); 7504 error = EFAULT; 7505 goto out; 7506 } 7507 } else { 7508 /* 7509 * check page protections 7510 */ 7511 for (a = addr; a < addr + len; a += PAGESIZE) { 7512 struct vpage *vp; 7513 7514 vp = &svd->vpage[seg_page(seg, a)]; 7515 if ((VPP_PROT(vp) & protchk) == 0) { 7516 mutex_exit(&svd->segp_slock); 7517 error = EFAULT; 7518 goto out; 7519 } 7520 } 7521 } 7522 7523 mutex_enter(&freemem_lock); 7524 if (availrmem < tune.t_minarmem + npages) { 7525 mutex_exit(&freemem_lock); 7526 mutex_exit(&svd->segp_slock); 7527 error = ENOMEM; 7528 goto out; 7529 } else { 7530 svd->softlockcnt += npages; 7531 availrmem -= npages; 7532 segvn_pages_locked += npages; 7533 } 7534 mutex_exit(&freemem_lock); 7535 7536 pplist = kmem_alloc(sizeof (page_t *) * npages, KM_SLEEP); 7537 pl = pplist; 7538 *ppp = pplist + adjustpages; 7539 7540 page = seg_page(seg, addr); 7541 anon_index = svd->anon_index + page; 7542 7543 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7544 for (a = addr; a < addr + len; a += PAGESIZE, anon_index++) { 7545 struct anon *ap; 7546 struct vnode *vp; 7547 u_offset_t off; 7548 anon_sync_obj_t cookie; 7549 7550 anon_array_enter(amp, anon_index, &cookie); 7551 ap = anon_get_ptr(amp->ahp, anon_index); 7552 if (ap == NULL) { 7553 anon_array_exit(&cookie); 7554 break; 7555 } else { 7556 /* 7557 * We must never use seg_pcache for COW pages 7558 * because we might end up with original page still 7559 * lying in seg_pcache even after private page is 7560 * created. This leads to data corruption as 7561 * aio_write refers to the page still in cache 7562 * while all other accesses refer to the private 7563 * page. 
7564 */ 7565 if (ap->an_refcnt != 1) { 7566 anon_array_exit(&cookie); 7567 break; 7568 } 7569 } 7570 swap_xlate(ap, &vp, &off); 7571 anon_array_exit(&cookie); 7572 7573 pp = page_lookup_nowait(vp, off, SE_SHARED); 7574 if (pp == NULL) { 7575 break; 7576 } 7577 *pplist++ = pp; 7578 } 7579 ANON_LOCK_EXIT(&->a_rwlock); 7580 7581 if (a >= addr + len) { 7582 (void) seg_pinsert(seg, addr, len, pl, rw, SEGP_ASYNC_FLUSH, 7583 segvn_reclaim); 7584 mutex_exit(&svd->segp_slock); 7585 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7586 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_FILL_END, 7587 "segvn_pagelock: cache fill seg %p addr %p", seg, addr); 7588 return (0); 7589 } 7590 7591 mutex_exit(&svd->segp_slock); 7592 error = EFAULT; 7593 pplist = pl; 7594 np = ((uintptr_t)(a - addr)) >> PAGESHIFT; 7595 while (np > (uint_t)0) { 7596 page_unlock(*pplist); 7597 np--; 7598 pplist++; 7599 } 7600 kmem_free(pl, sizeof (page_t *) * npages); 7601 mutex_enter(&freemem_lock); 7602 svd->softlockcnt -= npages; 7603 availrmem += npages; 7604 segvn_pages_locked -= npages; 7605 mutex_exit(&freemem_lock); 7606 if (svd->softlockcnt <= 0) { 7607 if (AS_ISUNMAPWAIT(seg->s_as)) { 7608 mutex_enter(&seg->s_as->a_contents); 7609 if (AS_ISUNMAPWAIT(seg->s_as)) { 7610 AS_CLRUNMAPWAIT(seg->s_as); 7611 cv_broadcast(&seg->s_as->a_cv); 7612 } 7613 mutex_exit(&seg->s_as->a_contents); 7614 } 7615 } 7616 7617 out: 7618 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7619 *ppp = NULL; 7620 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, 7621 "segvn_pagelock: cache miss seg %p addr %p", seg, addr); 7622 return (error); 7623 } 7624 7625 /* 7626 * purge any cached pages in the I/O page cache 7627 */ 7628 static void 7629 segvn_purge(struct seg *seg) 7630 { 7631 seg_ppurge(seg); 7632 } 7633 7634 static int 7635 segvn_reclaim(struct seg *seg, caddr_t addr, size_t len, struct page **pplist, 7636 enum seg_rw rw) 7637 { 7638 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7639 pgcnt_t np, npages; 7640 struct page **pl; 7641 7642 #ifdef lint 7643 addr = addr; 7644 #endif 7645 7646 npages = np = (len >> PAGESHIFT); 7647 ASSERT(npages); 7648 pl = pplist; 7649 if (seg->s_szc != 0) { 7650 size_t pgsz = page_get_pagesize(seg->s_szc); 7651 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { 7652 panic("segvn_reclaim: unaligned addr or len"); 7653 /*NOTREACHED*/ 7654 } 7655 } 7656 7657 while (np > (uint_t)0) { 7658 if (rw == S_WRITE) { 7659 hat_setrefmod(*pplist); 7660 } else { 7661 hat_setref(*pplist); 7662 } 7663 page_unlock(*pplist); 7664 np--; 7665 pplist++; 7666 } 7667 kmem_free(pl, sizeof (page_t *) * npages); 7668 7669 mutex_enter(&freemem_lock); 7670 availrmem += npages; 7671 segvn_pages_locked -= npages; 7672 svd->softlockcnt -= npages; 7673 mutex_exit(&freemem_lock); 7674 if (svd->softlockcnt <= 0) { 7675 if (AS_ISUNMAPWAIT(seg->s_as)) { 7676 mutex_enter(&seg->s_as->a_contents); 7677 if (AS_ISUNMAPWAIT(seg->s_as)) { 7678 AS_CLRUNMAPWAIT(seg->s_as); 7679 cv_broadcast(&seg->s_as->a_cv); 7680 } 7681 mutex_exit(&seg->s_as->a_contents); 7682 } 7683 } 7684 return (0); 7685 } 7686 /* 7687 * get a memory ID for an addr in a given segment 7688 * 7689 * XXX only creates PAGESIZE pages if anon slots are not initialized. 7690 * At fault time they will be relocated into larger pages. 
 */
static int
segvn_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
{
    struct segvn_data *svd = (struct segvn_data *)seg->s_data;
    struct anon *ap = NULL;
    ulong_t anon_index;
    struct anon_map *amp;
    anon_sync_obj_t cookie;

    if (svd->type == MAP_PRIVATE) {
        memidp->val[0] = (uintptr_t)seg->s_as;
        memidp->val[1] = (uintptr_t)addr;
        return (0);
    }

    if (svd->type == MAP_SHARED) {
        if (svd->vp) {
            memidp->val[0] = (uintptr_t)svd->vp;
            memidp->val[1] = (u_longlong_t)svd->offset +
                (uintptr_t)(addr - seg->s_base);
            return (0);
        } else {

            SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
            if ((amp = svd->amp) != NULL) {
                anon_index = svd->anon_index +
                    seg_page(seg, addr);
            }
            SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);

            ASSERT(amp != NULL);

            ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
            anon_array_enter(amp, anon_index, &cookie);
            ap = anon_get_ptr(amp->ahp, anon_index);
            if (ap == NULL) {
                page_t *pp;

                pp = anon_zero(seg, addr, &ap, svd->cred);
                if (pp == NULL) {
                    anon_array_exit(&cookie);
                    ANON_LOCK_EXIT(&amp->a_rwlock);
                    return (ENOMEM);
                }
                ASSERT(anon_get_ptr(amp->ahp, anon_index)
                    == NULL);
                (void) anon_set_ptr(amp->ahp, anon_index,
                    ap, ANON_SLEEP);
                page_unlock(pp);
            }

            anon_array_exit(&cookie);
            ANON_LOCK_EXIT(&amp->a_rwlock);

            memidp->val[0] = (uintptr_t)ap;
            memidp->val[1] = (uintptr_t)addr & PAGEOFFSET;
            return (0);
        }
    }
    return (EINVAL);
}

static int
sameprot(struct seg *seg, caddr_t a, size_t len)
{
    struct segvn_data *svd = (struct segvn_data *)seg->s_data;
    struct vpage *vpage;
    spgcnt_t pages = btop(len);
    uint_t prot;

    if (svd->pageprot == 0)
        return (1);

    ASSERT(svd->vpage != NULL);

    vpage = &svd->vpage[seg_page(seg, a)];
    prot = VPP_PROT(vpage);
    vpage++;
    pages--;
    while (pages-- > 0) {
        if (prot != VPP_PROT(vpage))
            return (0);
        vpage++;
    }
    return (1);
}

/*
 * Get memory allocation policy info for specified address in given segment
 */
static lgrp_mem_policy_info_t *
segvn_getpolicy(struct seg *seg, caddr_t addr)
{
    struct anon_map *amp;
    ulong_t anon_index;
    lgrp_mem_policy_info_t *policy_info;
    struct segvn_data *svn_data;
    u_offset_t vn_off;
    vnode_t *vp;

    ASSERT(seg != NULL);

    svn_data = (struct segvn_data *)seg->s_data;
    if (svn_data == NULL)
        return (NULL);

    /*
     * Get policy info for private or shared memory
     */
    if (svn_data->type != MAP_SHARED)
        policy_info = &svn_data->policy_info;
    else {
        amp = svn_data->amp;
        anon_index = svn_data->anon_index + seg_page(seg, addr);
        vp = svn_data->vp;
        vn_off = svn_data->offset + (uintptr_t)(addr - seg->s_base);
        policy_info = lgrp_shm_policy_get(amp, anon_index, vp, vn_off);
    }

    return (policy_info);
}

/*ARGSUSED*/
static int
segvn_capable(struct seg *seg, segcapability_t capability)
{
    return (0);
}
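/*
 * Editor's note: the sketch below is illustrative only and is not part of
 * this file. It shows, at user level, how the advice interpreted by
 * segvn_advise() above is normally requested through madvise(3C) on an
 * anonymous MAP_PRIVATE mapping; MADV_SEQUENTIAL here is what ultimately
 * sets svd->advice for the whole segment. The function name and page count
 * are hypothetical, and error handling is kept minimal.
 *
 *      #include <sys/types.h>
 *      #include <sys/mman.h>
 *      #include <unistd.h>
 *
 *      int
 *      advise_sequential(size_t npages)
 *      {
 *              size_t len = npages * (size_t)sysconf(_SC_PAGESIZE);
 *              void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *                  MAP_PRIVATE | MAP_ANON, -1, 0);
 *
 *              if (p == MAP_FAILED)
 *                      return (-1);
 *
 *              (void) madvise((caddr_t)p, len, MADV_SEQUENTIAL);
 *
 *              return (munmap((caddr_t)p, len));
 *      }
 */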