/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * VM - shared or copy-on-write from a vnode/anonymous memory.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/debug.h>
#include <sys/cred.h>
#include <sys/vmsystm.h>
#include <sys/tuneable.h>
#include <sys/bitmap.h>
#include <sys/swap.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/vtrace.h>
#include <sys/cmn_err.h>
#include <sys/vm.h>
#include <sys/dumphdr.h>
#include <sys/lgrp.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_vn.h>
#include <vm/pvn.h>
#include <vm/anon.h>
#include <vm/page.h>
#include <vm/vpage.h>

/*
 * Private seg op routines.
 */
static int	segvn_dup(struct seg *seg, struct seg *newseg);
static int	segvn_unmap(struct seg *seg, caddr_t addr, size_t len);
static void	segvn_free(struct seg *seg);
static faultcode_t segvn_fault(struct hat *hat, struct seg *seg,
		    caddr_t addr, size_t len, enum fault_type type,
		    enum seg_rw rw);
static faultcode_t segvn_faulta(struct seg *seg, caddr_t addr);
static int	segvn_setprot(struct seg *seg, caddr_t addr,
		    size_t len, uint_t prot);
static int	segvn_checkprot(struct seg *seg, caddr_t addr,
		    size_t len, uint_t prot);
static int	segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta);
static size_t	segvn_swapout(struct seg *seg);
static int	segvn_sync(struct seg *seg, caddr_t addr, size_t len,
		    int attr, uint_t flags);
static size_t	segvn_incore(struct seg *seg, caddr_t addr, size_t len,
		    char *vec);
static int	segvn_lockop(struct seg *seg, caddr_t addr, size_t len,
		    int attr, int op, ulong_t *lockmap, size_t pos);
static int	segvn_getprot(struct seg *seg, caddr_t addr, size_t len,
		    uint_t *protv);
static u_offset_t	segvn_getoffset(struct seg *seg, caddr_t addr);
static int	segvn_gettype(struct seg *seg, caddr_t addr);
static int	segvn_getvp(struct seg *seg, caddr_t addr,
		    struct vnode **vpp);
static int	segvn_advise(struct seg *seg, caddr_t addr, size_t len,
		    uint_t behav);
static void	segvn_dump(struct seg *seg);
static int	segvn_pagelock(struct seg *seg, caddr_t addr, size_t len,
		    struct page ***ppp, enum lock_type type, enum seg_rw rw);
static int	segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len,
		    uint_t szc);
static int	segvn_getmemid(struct seg *seg, caddr_t addr,
		    memid_t *memidp);
static lgrp_mem_policy_info_t	*segvn_getpolicy(struct seg *, caddr_t);

struct seg_ops segvn_ops = {
	segvn_dup,
	segvn_unmap,
	segvn_free,
	segvn_fault,
	segvn_faulta,
	segvn_setprot,
	segvn_checkprot,
	segvn_kluster,
	segvn_swapout,
	segvn_sync,
	segvn_incore,
	segvn_lockop,
	segvn_getprot,
	segvn_getoffset,
	segvn_gettype,
	segvn_getvp,
	segvn_advise,
	segvn_dump,
	segvn_pagelock,
	segvn_setpagesize,
	segvn_getmemid,
	segvn_getpolicy,
};
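
/*
 * Illustrative note (not part of the original interface comments): callers
 * create segvn segments by handing segvn_create() and a pointer to a
 * struct segvn_crargs to as_map(); for a plain anonymous zero-fill mapping
 * this typically looks something like
 *	(void) as_map(as, addr, len, segvn_create, zfod_argsp);
 * using one of the shorthand argument blocks defined below.
 */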
/*
 * Common zfod structures, provided as a shorthand for others to use.
 */
static segvn_crargs_t zfod_segvn_crargs =
	SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
static segvn_crargs_t kzfod_segvn_crargs =
	SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_USER,
	PROT_ALL & ~PROT_USER);
static segvn_crargs_t stack_noexec_crargs =
	SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_EXEC, PROT_ALL);

caddr_t zfod_argsp = (caddr_t)&zfod_segvn_crargs;	/* user zfod argsp */
caddr_t kzfod_argsp = (caddr_t)&kzfod_segvn_crargs;	/* kernel zfod argsp */
caddr_t stack_exec_argsp = (caddr_t)&zfod_segvn_crargs;	/* executable stack */
caddr_t stack_noexec_argsp = (caddr_t)&stack_noexec_crargs; /* noexec stack */

#define	vpgtob(n)	((n) * sizeof (struct vpage))	/* For brevity */

size_t	segvn_comb_thrshld = UINT_MAX;	/* patchable -- see 1196681 */

static int	segvn_concat(struct seg *, struct seg *, int);
static int	segvn_extend_prev(struct seg *, struct seg *,
		    struct segvn_crargs *, size_t);
static int	segvn_extend_next(struct seg *, struct seg *,
		    struct segvn_crargs *, size_t);
static void	segvn_softunlock(struct seg *, caddr_t, size_t, enum seg_rw);
static void	segvn_pagelist_rele(page_t **);
static void	segvn_setvnode_mpss(vnode_t *);
static void	segvn_relocate_pages(page_t **, page_t *);
static int	segvn_full_szcpages(page_t **, uint_t, int *, uint_t *);
static int	segvn_fill_vp_pages(struct segvn_data *, vnode_t *, u_offset_t,
    uint_t, page_t **, page_t **, uint_t *, int *);
static faultcode_t segvn_fault_vnodepages(struct hat *, struct seg *, caddr_t,
    caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int);
static faultcode_t segvn_fault_anonpages(struct hat *, struct seg *, caddr_t,
    caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int);
static faultcode_t segvn_faultpage(struct hat *, struct seg *, caddr_t,
    u_offset_t, struct vpage *, page_t **, uint_t,
    enum fault_type, enum seg_rw, int);
static void	segvn_vpage(struct seg *);

static void segvn_purge(struct seg *seg);
static int segvn_reclaim(struct seg *, caddr_t, size_t, struct page **,
    enum seg_rw);

static int sameprot(struct seg *, caddr_t, size_t);

static int segvn_demote_range(struct seg *, caddr_t, size_t, int);
static int segvn_clrszc(struct seg *);
static struct seg *segvn_split_seg(struct seg *, caddr_t);
static int segvn_claim_pages(struct seg *, struct vpage *, u_offset_t,
    ulong_t, uint_t);

static struct kmem_cache *segvn_cache;

#ifdef VM_STATS
static struct segvnvmstats_str {
	ulong_t	fill_vp_pages[31];
	ulong_t	fltvnpages[49];
	ulong_t	fullszcpages[10];
	ulong_t	relocatepages[3];
	ulong_t	fltanpages[17];
	ulong_t	pagelock[3];
	ulong_t	demoterange[3];
} segvnvmstats;
#endif /* VM_STATS */

#define	SDR_RANGE	1		/* demote entire range */
#define	SDR_END		2		/* demote non aligned ends only */

#define	CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr) {	\
	if ((len) != 0) {						\
		lpgaddr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz);	\
		ASSERT(lpgaddr >= (seg)->s_base);			\
		lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)((addr) +	\
		    (len)), pgsz);					\
		ASSERT(lpgeaddr > lpgaddr);				\
		ASSERT(lpgeaddr <= (seg)->s_base + (seg)->s_size);	\
	} else {							\
		lpgeaddr = lpgaddr = (addr);				\
	}								\
}
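
/*
 * Note: CALC_LPG_REGION rounds the request [addr, addr + len) outward to
 * the enclosing pgsz-aligned (large page) region, asserting that the result
 * still lies within the segment; a zero length collapses both bounds to addr.
 */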
/*ARGSUSED*/
static int
segvn_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct segvn_data *svd = buf;

	rw_init(&svd->lock, NULL, RW_DEFAULT, NULL);
	mutex_init(&svd->segp_slock, NULL, MUTEX_DEFAULT, NULL);
	return (0);
}

/*ARGSUSED1*/
static void
segvn_cache_destructor(void *buf, void *cdrarg)
{
	struct segvn_data *svd = buf;

	rw_destroy(&svd->lock);
	mutex_destroy(&svd->segp_slock);
}

/*
 * Patching this variable to non-zero allows the system to run with
 * stacks marked as "not executable".  It's a bit of a kludge, but is
 * provided as a tweakable for platforms that export those ABIs
 * (e.g. sparc V8) that have executable stacks enabled by default.
 * There are also some restrictions for platforms that don't actually
 * implement 'noexec' protections.
 *
 * Once enabled, the system is (therefore) unable to provide a fully
 * ABI-compliant execution environment, though practically speaking,
 * most everything works.  The exceptions are generally some interpreters
 * and debuggers that create executable code on the stack and jump
 * into it (without explicitly mprotecting the address range to include
 * PROT_EXEC).
 *
 * One important class of applications that are disabled are those
 * that have been transformed into malicious agents using one of the
 * numerous "buffer overflow" attacks.  See 4007890.
 */
int noexec_user_stack = 0;
int noexec_user_stack_log = 1;

int segvn_lpg_disable = 0;
uint_t segvn_maxpgszc = 0;

ulong_t segvn_vmpss_clrszc_cnt;
ulong_t segvn_vmpss_clrszc_err;
ulong_t segvn_fltvnpages_clrszc_cnt;
ulong_t segvn_fltvnpages_clrszc_err;
ulong_t segvn_setpgsz_align_err;
ulong_t segvn_setpgsz_getattr_err;
ulong_t segvn_setpgsz_eof_err;
ulong_t segvn_faultvnmpss_align_err1;
ulong_t segvn_faultvnmpss_align_err2;
ulong_t segvn_faultvnmpss_align_err3;
ulong_t segvn_faultvnmpss_align_err4;
ulong_t segvn_faultvnmpss_align_err5;
ulong_t	segvn_vmpss_pageio_deadlk_err;

/*
 * Initialize segvn data structures
 */
void
segvn_init(void)
{
	uint_t maxszc;
	uint_t szc;
	size_t pgsz;

	segvn_cache = kmem_cache_create("segvn_cache",
	    sizeof (struct segvn_data), 0,
	    segvn_cache_constructor, segvn_cache_destructor, NULL,
	    NULL, NULL, 0);

	if (segvn_lpg_disable != 0)
		return;
	szc = maxszc = page_num_pagesizes() - 1;
	if (szc == 0) {
		segvn_lpg_disable = 1;
		return;
	}
	if (page_get_pagesize(0) != PAGESIZE) {
		panic("segvn_init: bad szc 0");
		/*NOTREACHED*/
	}
	while (szc != 0) {
		pgsz = page_get_pagesize(szc);
		if (pgsz <= PAGESIZE || !IS_P2ALIGNED(pgsz, pgsz)) {
			panic("segvn_init: bad szc %d", szc);
			/*NOTREACHED*/
		}
		szc--;
	}
	if (segvn_maxpgszc == 0 || segvn_maxpgszc > maxszc)
		segvn_maxpgszc = maxszc;
}

#define	SEGVN_PAGEIO	((void *)0x1)
#define	SEGVN_NOPAGEIO	((void *)0x2)

static void
segvn_setvnode_mpss(vnode_t *vp)
{
	int err;

	ASSERT(vp->v_mpssdata == NULL ||
	    vp->v_mpssdata == SEGVN_PAGEIO ||
	    vp->v_mpssdata == SEGVN_NOPAGEIO);

	if (vp->v_mpssdata == NULL) {
		if (vn_vmpss_usepageio(vp)) {
			err = VOP_PAGEIO(vp, (page_t *)NULL,
			    (u_offset_t)0, 0, 0, CRED());
		} else {
			err = ENOSYS;
		}
		/*
		 * set v_mpssdata just once per vnode life
		 * so that it never changes.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_mpssdata == NULL) {
			if (err == EINVAL) {
				vp->v_mpssdata = SEGVN_PAGEIO;
			} else {
				vp->v_mpssdata = SEGVN_NOPAGEIO;
			}
		}
		mutex_exit(&vp->v_lock);
	}
}
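
/*
 * Create a new segment from the segvn_crargs description passed through
 * as_map().  On success the segment is initialized (s_ops, s_data, s_szc)
 * and 0 is returned; on failure an error code is returned and nothing is
 * left mapped.
 */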
int
segvn_create(struct seg *seg, void *argsp)
{
	struct segvn_crargs *a = (struct segvn_crargs *)argsp;
	struct segvn_data *svd;
	size_t swresv = 0;
	struct cred *cred;
	struct anon_map *amp;
	int error = 0;
	size_t pgsz;
	lgrp_mem_policy_t mpolicy = LGRP_MEM_POLICY_DEFAULT;


	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));

	if (a->type != MAP_PRIVATE && a->type != MAP_SHARED) {
		panic("segvn_create type");
		/*NOTREACHED*/
	}

	/*
	 * Check arguments.  If a shared anon structure is given then
	 * it is illegal to also specify a vp.
	 */
	if (a->amp != NULL && a->vp != NULL) {
		panic("segvn_create anon_map");
		/*NOTREACHED*/
	}

	/* MAP_NORESERVE on a MAP_SHARED segment is meaningless. */
	if (a->type == MAP_SHARED)
		a->flags &= ~MAP_NORESERVE;

	if (a->szc != 0) {
		if (segvn_lpg_disable != 0 || a->amp != NULL ||
		    (a->type == MAP_SHARED && a->vp == NULL) ||
		    (a->flags & MAP_NORESERVE) || seg->s_as == &kas) {
			a->szc = 0;
		} else {
			if (a->szc > segvn_maxpgszc)
				a->szc = segvn_maxpgszc;
			pgsz = page_get_pagesize(a->szc);
			if (!IS_P2ALIGNED(seg->s_base, pgsz) ||
			    !IS_P2ALIGNED(seg->s_size, pgsz)) {
				a->szc = 0;
			} else if (a->vp != NULL) {
				extern struct vnode kvp;
				if (IS_SWAPFSVP(a->vp) || a->vp == &kvp) {
					/*
					 * paranoid check.
					 * hat_page_demote() is not supported
					 * on swapfs pages.
					 */
					a->szc = 0;
				} else if (map_addr_vacalign_check(seg->s_base,
				    a->offset & PAGEMASK)) {
					a->szc = 0;
				}
			}
		}
	}

	/*
	 * If segment may need private pages, reserve them now.
	 */
	if (!(a->flags & MAP_NORESERVE) && ((a->vp == NULL && a->amp == NULL) ||
	    (a->type == MAP_PRIVATE && (a->prot & PROT_WRITE)))) {
		if (anon_resv(seg->s_size) == 0)
			return (EAGAIN);
		swresv = seg->s_size;
		TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
		    seg, swresv, 1);
	}

	/*
	 * Reserve any mapping structures that may be required.
	 */
	hat_map(seg->s_as->a_hat, seg->s_base, seg->s_size, HAT_MAP);

	if (a->cred) {
		cred = a->cred;
		crhold(cred);
	} else {
		crhold(cred = CRED());
	}

	/* Inform the vnode of the new mapping */
	if (a->vp) {
		error = VOP_ADDMAP(a->vp, a->offset & PAGEMASK,
		    seg->s_as, seg->s_base, seg->s_size, a->prot,
		    a->maxprot, a->type, cred);
		if (error) {
			if (swresv != 0) {
				anon_unresv(swresv);
				TRACE_3(TR_FAC_VM, TR_ANON_PROC,
				    "anon proc:%p %lu %u",
				    seg, swresv, 0);
			}
			crfree(cred);
			hat_unload(seg->s_as->a_hat, seg->s_base,
			    seg->s_size, HAT_UNLOAD_UNMAP);
			return (error);
		}
	}

	/*
	 * If more than one segment in the address space, and
	 * they're adjacent virtually, try to concatenate them.
	 * Don't concatenate if an explicit anon_map structure
	 * was supplied (e.g., SystemV shared memory).
	 */
	if (a->amp == NULL) {
		struct seg *pseg, *nseg;
		struct segvn_data *psvd, *nsvd;
		lgrp_mem_policy_t ppolicy, npolicy;
		uint_t	lgrp_mem_policy_flags = 0;
		extern lgrp_mem_policy_t lgrp_mem_default_policy;

		/*
		 * Memory policy flags (lgrp_mem_policy_flags) is valid when
		 * extending stack/heap segments.
		 */
		if ((a->vp == NULL) && (a->type == MAP_PRIVATE) &&
		    !(a->flags & MAP_NORESERVE) && (seg->s_as != &kas)) {
			lgrp_mem_policy_flags = a->lgrp_mem_policy_flags;
		} else {
			/*
			 * Get policy when not extending it from another segment
			 */
			mpolicy = lgrp_mem_policy_default(seg->s_size, a->type);
		}

		/*
		 * First, try to concatenate the previous and new segments
		 */
		pseg = AS_SEGPREV(seg->s_as, seg);
		if (pseg != NULL &&
		    pseg->s_base + pseg->s_size == seg->s_base &&
		    pseg->s_ops == &segvn_ops) {
			/*
			 * Get memory allocation policy from previous segment.
			 * When extension is specified (e.g. for heap) apply
			 * this policy to the new segment regardless of the
			 * outcome of segment concatenation.  Extension occurs
			 * for non-default policy otherwise default policy is
			 * used and is based on extended segment size.
			 */
			psvd = (struct segvn_data *)pseg->s_data;
			ppolicy = psvd->policy_info.mem_policy;
			if (lgrp_mem_policy_flags ==
			    LGRP_MP_FLAG_EXTEND_UP) {
				if (ppolicy != lgrp_mem_default_policy) {
					mpolicy = ppolicy;
				} else {
					mpolicy = lgrp_mem_policy_default(
					    pseg->s_size + seg->s_size,
					    a->type);
				}
			}

			if (mpolicy == ppolicy &&
			    (pseg->s_size + seg->s_size <=
			    segvn_comb_thrshld || psvd->amp == NULL) &&
			    segvn_extend_prev(pseg, seg, a, swresv) == 0) {
				/*
				 * success! now try to concatenate
				 * with following seg
				 */
				crfree(cred);
				nseg = AS_SEGNEXT(pseg->s_as, pseg);
				if (nseg != NULL &&
				    nseg != pseg &&
				    nseg->s_ops == &segvn_ops &&
				    pseg->s_base + pseg->s_size ==
				    nseg->s_base)
					(void) segvn_concat(pseg, nseg, 0);
				ASSERT(pseg->s_szc == 0 ||
				    (a->szc == pseg->s_szc &&
				    IS_P2ALIGNED(pseg->s_base, pgsz) &&
				    IS_P2ALIGNED(pseg->s_size, pgsz)));
				return (0);
			}
		}

		/*
		 * Failed, so try to concatenate with following seg
		 */
		nseg = AS_SEGNEXT(seg->s_as, seg);
		if (nseg != NULL &&
		    seg->s_base + seg->s_size == nseg->s_base &&
		    nseg->s_ops == &segvn_ops) {
			/*
			 * Get memory allocation policy from next segment.
			 * When extension is specified (e.g. for stack) apply
			 * this policy to the new segment regardless of the
			 * outcome of segment concatenation.  Extension occurs
			 * for non-default policy otherwise default policy is
			 * used and is based on extended segment size.
			 */
			nsvd = (struct segvn_data *)nseg->s_data;
			npolicy = nsvd->policy_info.mem_policy;
			if (lgrp_mem_policy_flags ==
			    LGRP_MP_FLAG_EXTEND_DOWN) {
				if (npolicy != lgrp_mem_default_policy) {
					mpolicy = npolicy;
				} else {
					mpolicy = lgrp_mem_policy_default(
					    nseg->s_size + seg->s_size,
					    a->type);
				}
			}

			if (mpolicy == npolicy &&
			    segvn_extend_next(seg, nseg, a, swresv) == 0) {
				crfree(cred);
				ASSERT(nseg->s_szc == 0 ||
				    (a->szc == nseg->s_szc &&
				    IS_P2ALIGNED(nseg->s_base, pgsz) &&
				    IS_P2ALIGNED(nseg->s_size, pgsz)));
				return (0);
			}
		}
	}

	if (a->vp != NULL) {
		VN_HOLD(a->vp);
		if (a->type == MAP_SHARED)
			lgrp_shm_policy_init(NULL, a->vp);
	}
	svd = kmem_cache_alloc(segvn_cache, KM_SLEEP);

	seg->s_ops = &segvn_ops;
	seg->s_data = (void *)svd;
	seg->s_szc = a->szc;

	svd->vp = a->vp;
	/*
	 * Anonymous mappings have no backing file so the offset is meaningless.
	 */
	svd->offset = a->vp ? (a->offset & PAGEMASK) : 0;
	svd->prot = a->prot;
	svd->maxprot = a->maxprot;
	svd->pageprot = 0;
	svd->type = a->type;
	svd->vpage = NULL;
	svd->cred = cred;
	svd->advice = MADV_NORMAL;
	svd->pageadvice = 0;
	svd->flags = (ushort_t)a->flags;
	svd->softlockcnt = 0;
	if (a->szc != 0 && a->vp != NULL) {
		segvn_setvnode_mpss(a->vp);
	}

	amp = a->amp;
	if ((svd->amp = amp) == NULL) {
		svd->anon_index = 0;
		if (svd->type == MAP_SHARED) {
			svd->swresv = 0;
			/*
			 * Shared mappings to a vp need no other setup.
			 * If we have a shared mapping to an anon_map object
			 * which hasn't been allocated yet, allocate the
			 * struct now so that it will be properly shared
			 * by remembering the swap reservation there.
			 */
			if (a->vp == NULL) {
				svd->amp = anonmap_alloc(seg->s_size, swresv);
				svd->amp->a_szc = seg->s_szc;
			}
		} else {
			/*
			 * Private mapping (with or without a vp).
			 * Allocate anon_map when needed.
			 */
			svd->swresv = swresv;
		}
	} else {
		pgcnt_t anon_num;

		/*
		 * Mapping to an existing anon_map structure without a vp.
		 * For now we will insure that the segment size isn't larger
		 * than the size - offset gives us.  Later on we may wish to
		 * have the anon array dynamically allocated itself so that
		 * we don't always have to allocate all the anon pointer slots.
		 * This of course involves adding extra code to check that we
		 * aren't trying to use an anon pointer slot beyond the end
		 * of the currently allocated anon array.
		 */
		if ((amp->size - a->offset) < seg->s_size) {
			panic("segvn_create anon_map size");
			/*NOTREACHED*/
		}

		anon_num = btopr(a->offset);

		if (a->type == MAP_SHARED) {
			/*
			 * SHARED mapping to a given anon_map.
			 */
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
			amp->refcnt++;
			ANON_LOCK_EXIT(&amp->a_rwlock);
			svd->anon_index = anon_num;
			svd->swresv = 0;
		} else {
			/*
			 * PRIVATE mapping to a given anon_map.
			 * Make sure that all the needed anon
			 * structures are created (so that we will
			 * share the underlying pages if nothing
			 * is written by this mapping) and then
			 * duplicate the anon array as is done
			 * when a privately mapped segment is dup'ed.
			 */
			struct anon *ap;
			caddr_t addr;
			caddr_t eaddr;
			ulong_t anon_idx;
			int hat_flag = HAT_LOAD;

			if (svd->flags & MAP_TEXT) {
				hat_flag |= HAT_LOAD_TEXT;
			}

			svd->amp = anonmap_alloc(seg->s_size, 0);
			svd->amp->a_szc = seg->s_szc;
			svd->anon_index = 0;
			svd->swresv = swresv;

			/*
			 * Prevent 2 threads from allocating anon
			 * slots simultaneously.
			 */
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
			eaddr = seg->s_base + seg->s_size;

			for (anon_idx = anon_num, addr = seg->s_base;
			    addr < eaddr; addr += PAGESIZE, anon_idx++) {
				page_t *pp;

				if ((ap = anon_get_ptr(amp->ahp,
				    anon_idx)) != NULL)
					continue;

				/*
				 * Allocate the anon struct now.
				 * Might as well load up translation
				 * to the page while we're at it...
				 */
				pp = anon_zero(seg, addr, &ap, cred);
				if (ap == NULL || pp == NULL) {
					panic("segvn_create anon_zero");
					/*NOTREACHED*/
				}

				/*
				 * Re-acquire the anon_map lock and
				 * initialize the anon array entry.
				 */
				ASSERT(anon_get_ptr(amp->ahp,
				    anon_idx) == NULL);
				(void) anon_set_ptr(amp->ahp, anon_idx, ap,
				    ANON_SLEEP);

				ASSERT(seg->s_szc == 0);
				ASSERT(!IS_VMODSORT(pp->p_vnode));

				hat_memload(seg->s_as->a_hat, addr, pp,
				    svd->prot & ~PROT_WRITE, hat_flag);

				page_unlock(pp);
			}
			ASSERT(seg->s_szc == 0);
			anon_dup(amp->ahp, anon_num, svd->amp->ahp,
			    0, seg->s_size);
			ANON_LOCK_EXIT(&amp->a_rwlock);
		}
	}

	/*
	 * Set default memory allocation policy for segment
	 *
	 * Always set policy for private memory at least for initialization
	 * even if this is a shared memory segment
	 */
	(void) lgrp_privm_policy_set(mpolicy, &svd->policy_info, seg->s_size);

	if (svd->type == MAP_SHARED)
		(void) lgrp_shm_policy_set(mpolicy, svd->amp, svd->anon_index,
		    svd->vp, svd->offset, seg->s_size);

	return (0);
}

/*
 * Concatenate two existing segments, if possible.
 * Return 0 on success, -1 if two segments are not compatible
 * or -2 on memory allocation failure.
 * If private == 1 then try and concat segments with private pages.
 */
static int
segvn_concat(struct seg *seg1, struct seg *seg2, int private)
{
	struct segvn_data *svd1 = seg1->s_data;
	struct segvn_data *svd2 = seg2->s_data;
	struct anon_map *amp1 = svd1->amp;
	struct anon_map *amp2 = svd2->amp;
	struct vpage *vpage1 = svd1->vpage;
	struct vpage *vpage2 = svd2->vpage, *nvpage = NULL;
	size_t size, nvpsize;
	pgcnt_t npages1, npages2;

	ASSERT(seg1->s_as && seg2->s_as && seg1->s_as == seg2->s_as);
	ASSERT(AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock));
	ASSERT(seg1->s_ops == seg2->s_ops);

	/* both segments exist, try to merge them */
#define	incompat(x)	(svd1->x != svd2->x)
	if (incompat(vp) || incompat(maxprot) ||
	    (!svd1->pageadvice && !svd2->pageadvice && incompat(advice)) ||
	    (!svd1->pageprot && !svd2->pageprot && incompat(prot)) ||
	    incompat(type) || incompat(cred) || incompat(flags) ||
	    seg1->s_szc != seg2->s_szc || incompat(policy_info.mem_policy) ||
	    (svd2->softlockcnt > 0))
		return (-1);
#undef incompat

	/*
	 * vp == NULL implies zfod, offset doesn't matter
	 */
	if (svd1->vp != NULL &&
	    svd1->offset + seg1->s_size != svd2->offset) {
		return (-1);
	}

	/*
	 * Fail early if we're not supposed to concatenate
	 * private pages.
	 */
	if ((private == 0 || svd1->type != MAP_PRIVATE) &&
	    (amp1 != NULL || amp2 != NULL)) {
		return (-1);
	}

	/*
	 * If either seg has vpages, create a new merged vpage array.
	 */
	if (vpage1 != NULL || vpage2 != NULL) {
		struct vpage *vp;

		npages1 = seg_pages(seg1);
		npages2 = seg_pages(seg2);
		nvpsize = vpgtob(npages1 + npages2);

		if ((nvpage = kmem_zalloc(nvpsize, KM_NOSLEEP)) == NULL) {
			return (-2);
		}
		if (vpage1 != NULL) {
			bcopy(vpage1, nvpage, vpgtob(npages1));
		}
		if (vpage2 != NULL) {
			bcopy(vpage2, nvpage + npages1, vpgtob(npages2));
		}
		for (vp = nvpage; vp < nvpage + npages1; vp++) {
			if (svd2->pageprot && !svd1->pageprot) {
				VPP_SETPROT(vp, svd1->prot);
			}
			if (svd2->pageadvice && !svd1->pageadvice) {
				VPP_SETADVICE(vp, svd1->advice);
			}
		}
		for (vp = nvpage + npages1;
		    vp < nvpage + npages1 + npages2; vp++) {
			if (svd1->pageprot && !svd2->pageprot) {
				VPP_SETPROT(vp, svd2->prot);
			}
			if (svd1->pageadvice && !svd2->pageadvice) {
				VPP_SETADVICE(vp, svd2->advice);
			}
		}
	}

	/*
	 * If either segment has private pages, create a new merged anon
	 * array.
	 */
	if (amp1 != NULL || amp2 != NULL) {
		struct anon_hdr *nahp;
		struct anon_map *namp = NULL;
		size_t asize = seg1->s_size + seg2->s_size;

		if ((nahp = anon_create(btop(asize), ANON_NOSLEEP)) == NULL) {
			if (nvpage != NULL) {
				kmem_free(nvpage, nvpsize);
			}
			return (-2);
		}
		if (amp1 != NULL) {
			/*
			 * XXX anon rwlock is not really needed because
			 * this is a private segment and we are writers.
			 */
			ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
			ASSERT(amp1->refcnt == 1);
			if (anon_copy_ptr(amp1->ahp, svd1->anon_index,
			    nahp, 0, btop(seg1->s_size), ANON_NOSLEEP)) {
				anon_release(nahp, btop(asize));
				ANON_LOCK_EXIT(&amp1->a_rwlock);
				if (nvpage != NULL) {
					kmem_free(nvpage, nvpsize);
				}
				return (-2);
			}
		}
		if (amp2 != NULL) {
			ANON_LOCK_ENTER(&amp2->a_rwlock, RW_WRITER);
			ASSERT(amp2->refcnt == 1);
			if (anon_copy_ptr(amp2->ahp, svd2->anon_index,
			    nahp, btop(seg1->s_size), btop(seg2->s_size),
			    ANON_NOSLEEP)) {
				anon_release(nahp, btop(asize));
				ANON_LOCK_EXIT(&amp2->a_rwlock);
				if (amp1 != NULL) {
					ANON_LOCK_EXIT(&amp1->a_rwlock);
				}
				if (nvpage != NULL) {
					kmem_free(nvpage, nvpsize);
				}
				return (-2);
			}
		}
		if (amp1 != NULL) {
			namp = amp1;
			anon_release(amp1->ahp, btop(amp1->size));
		}
		if (amp2 != NULL) {
			if (namp == NULL) {
				ASSERT(amp1 == NULL);
				namp = amp2;
				anon_release(amp2->ahp, btop(amp2->size));
			} else {
				amp2->refcnt--;
				ANON_LOCK_EXIT(&amp2->a_rwlock);
				anonmap_free(amp2);
			}
			svd2->amp = NULL; /* needed for seg_free */
		}
		namp->ahp = nahp;
		namp->size = asize;
		svd1->amp = namp;
		svd1->anon_index = 0;
		ANON_LOCK_EXIT(&namp->a_rwlock);
	}
	/*
	 * Now free the old vpage structures.
	 */
	if (nvpage != NULL) {
		if (vpage1 != NULL) {
			kmem_free(vpage1, vpgtob(npages1));
		}
		if (vpage2 != NULL) {
			svd2->vpage = NULL;
			kmem_free(vpage2, vpgtob(npages2));
		}
		if (svd2->pageprot) {
			svd1->pageprot = 1;
		}
		if (svd2->pageadvice) {
			svd1->pageadvice = 1;
		}
		svd1->vpage = nvpage;
	}

	/* all looks ok, merge segments */
	svd1->swresv += svd2->swresv;
	svd2->swresv = 0;	/* so seg_free doesn't release swap space */
	size = seg2->s_size;
	seg_free(seg2);
	seg1->s_size += size;
	return (0);
}

/*
 * Extend the previous segment (seg1) to include the
 * new segment (seg2 + a), if possible.
 * Return 0 on success.
 */
static int
segvn_extend_prev(
	struct seg *seg1,
	struct seg *seg2,
	struct segvn_crargs *a,
	size_t swresv)
{
	struct segvn_data *svd1 = (struct segvn_data *)seg1->s_data;
	size_t size;
	struct anon_map *amp1;
	struct vpage *new_vpage;

	/*
	 * We don't need any segment level locks for "segvn" data
	 * since the address space is "write" locked.
	 */
	ASSERT(seg1->s_as && AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock));

	/* second segment is new, try to extend first */
	/* XXX - should also check cred */
	if (svd1->vp != a->vp || svd1->maxprot != a->maxprot ||
	    (!svd1->pageprot && (svd1->prot != a->prot)) ||
	    svd1->type != a->type || svd1->flags != a->flags ||
	    seg1->s_szc != a->szc)
		return (-1);

	/* vp == NULL implies zfod, offset doesn't matter */
	if (svd1->vp != NULL &&
	    svd1->offset + seg1->s_size != (a->offset & PAGEMASK))
		return (-1);

	amp1 = svd1->amp;
	if (amp1) {
		pgcnt_t newpgs;

		/*
		 * Segment has private pages, can data structures
		 * be expanded?
		 *
		 * Acquire the anon_map lock to prevent it from changing,
		 * if it is shared.  This ensures that the anon_map
		 * will not change while a thread which has a read/write
		 * lock on an address space references it.
		 * XXX - Don't need the anon_map lock at all if "refcnt"
		 * is 1.
		 *
		 * Can't grow a MAP_SHARED segment with an anonmap because
		 * there may be existing anon slots where we want to extend
		 * the segment and we wouldn't know what to do with them
		 * (e.g., for tmpfs right thing is to just leave them there,
		 * for /dev/zero they should be cleared out).
		 */
		if (svd1->type == MAP_SHARED)
			return (-1);

		ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
		if (amp1->refcnt > 1) {
			ANON_LOCK_EXIT(&amp1->a_rwlock);
			return (-1);
		}
		newpgs = anon_grow(amp1->ahp, &svd1->anon_index,
		    btop(seg1->s_size), btop(seg2->s_size), ANON_NOSLEEP);

		if (newpgs == 0) {
			ANON_LOCK_EXIT(&amp1->a_rwlock);
			return (-1);
		}
		amp1->size = ptob(newpgs);
		ANON_LOCK_EXIT(&amp1->a_rwlock);
	}
	if (svd1->vpage != NULL) {
		new_vpage =
		    kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)),
		    KM_NOSLEEP);
		if (new_vpage == NULL)
			return (-1);
		bcopy(svd1->vpage, new_vpage, vpgtob(seg_pages(seg1)));
		kmem_free(svd1->vpage, vpgtob(seg_pages(seg1)));
		svd1->vpage = new_vpage;
		if (svd1->pageprot) {
			struct vpage *vp, *evp;

			vp = new_vpage + seg_pages(seg1);
			evp = vp + seg_pages(seg2);
			for (; vp < evp; vp++)
				VPP_SETPROT(vp, a->prot);
		}
	}
	size = seg2->s_size;
	seg_free(seg2);
	seg1->s_size += size;
	svd1->swresv += swresv;
	return (0);
}

/*
 * Extend the next segment (seg2) to include the
 * new segment (seg1 + a), if possible.
 * Return 0 on success.
 */
static int
segvn_extend_next(
	struct seg *seg1,
	struct seg *seg2,
	struct segvn_crargs *a,
	size_t swresv)
{
	struct segvn_data *svd2 = (struct segvn_data *)seg2->s_data;
	size_t size;
	struct anon_map *amp2;
	struct vpage *new_vpage;

	/*
	 * We don't need any segment level locks for "segvn" data
	 * since the address space is "write" locked.
	 */
	ASSERT(seg2->s_as && AS_WRITE_HELD(seg2->s_as, &seg2->s_as->a_lock));

	/* first segment is new, try to extend second */
	/* XXX - should also check cred */
	if (svd2->vp != a->vp || svd2->maxprot != a->maxprot ||
	    (!svd2->pageprot && (svd2->prot != a->prot)) ||
	    svd2->type != a->type || svd2->flags != a->flags ||
	    seg2->s_szc != a->szc)
		return (-1);
	/* vp == NULL implies zfod, offset doesn't matter */
	if (svd2->vp != NULL &&
	    (a->offset & PAGEMASK) + seg1->s_size != svd2->offset)
		return (-1);

	amp2 = svd2->amp;
	if (amp2) {
		pgcnt_t newpgs;

		/*
		 * Segment has private pages, can data structures
		 * be expanded?
		 *
		 * Acquire the anon_map lock to prevent it from changing,
		 * if it is shared.  This ensures that the anon_map
		 * will not change while a thread which has a read/write
		 * lock on an address space references it.
		 *
		 * XXX - Don't need the anon_map lock at all if "refcnt"
		 * is 1.
		 */
		if (svd2->type == MAP_SHARED)
			return (-1);

		ANON_LOCK_ENTER(&amp2->a_rwlock, RW_WRITER);
		if (amp2->refcnt > 1) {
			ANON_LOCK_EXIT(&amp2->a_rwlock);
			return (-1);
		}
		newpgs = anon_grow(amp2->ahp, &svd2->anon_index,
		    btop(seg2->s_size), btop(seg1->s_size),
		    ANON_NOSLEEP | ANON_GROWDOWN);

		if (newpgs == 0) {
			ANON_LOCK_EXIT(&amp2->a_rwlock);
			return (-1);
		}
		amp2->size = ptob(newpgs);
		ANON_LOCK_EXIT(&amp2->a_rwlock);
	}
	if (svd2->vpage != NULL) {
		new_vpage =
		    kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)),
		    KM_NOSLEEP);
		if (new_vpage == NULL) {
			/* Not merging segments so adjust anon_index back */
			if (amp2)
				svd2->anon_index += seg_pages(seg1);
			return (-1);
		}
		bcopy(svd2->vpage, new_vpage + seg_pages(seg1),
		    vpgtob(seg_pages(seg2)));
		kmem_free(svd2->vpage, vpgtob(seg_pages(seg2)));
		svd2->vpage = new_vpage;
		if (svd2->pageprot) {
			struct vpage *vp, *evp;

			vp = new_vpage;
			evp = vp + seg_pages(seg1);
			for (; vp < evp; vp++)
				VPP_SETPROT(vp, a->prot);
		}
	}
	size = seg1->s_size;
	seg_free(seg1);
	seg2->s_size += size;
	seg2->s_base -= size;
	svd2->offset -= size;
	svd2->swresv += swresv;
	return (0);
}

static int
segvn_dup(struct seg *seg, struct seg *newseg)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct segvn_data *newsvd;
	pgcnt_t npages = seg_pages(seg);
	int error = 0;
	uint_t prot;
	size_t len;

	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));

	/*
	 * If segment has anon reserved, reserve more for the new seg.
	 * For a MAP_NORESERVE segment swresv will be a count of all the
	 * allocated anon slots; thus we reserve for the child as many slots
	 * as the parent has allocated.  This semantic prevents the child or
	 * parent from dying during a copy-on-write fault caused by trying
	 * to write a shared pre-existing anon page.
	 */
	if ((len = svd->swresv) != 0) {
		if (anon_resv(svd->swresv) == 0)
			return (ENOMEM);

		TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
		    seg, len, 0);
	}

	newsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP);

	newseg->s_ops = &segvn_ops;
	newseg->s_data = (void *)newsvd;
	newseg->s_szc = seg->s_szc;

	if ((newsvd->vp = svd->vp) != NULL) {
		VN_HOLD(svd->vp);
		if (svd->type == MAP_SHARED)
			lgrp_shm_policy_init(NULL, svd->vp);
	}
	newsvd->offset = svd->offset;
	newsvd->prot = svd->prot;
	newsvd->maxprot = svd->maxprot;
	newsvd->pageprot = svd->pageprot;
	newsvd->type = svd->type;
	newsvd->cred = svd->cred;
	crhold(newsvd->cred);
	newsvd->advice = svd->advice;
	newsvd->pageadvice = svd->pageadvice;
	newsvd->swresv = svd->swresv;
	newsvd->flags = svd->flags;
	newsvd->softlockcnt = 0;
	newsvd->policy_info = svd->policy_info;
	if ((newsvd->amp = svd->amp) == NULL) {
		/*
		 * Not attaching to a shared anon object.
		 */
		newsvd->anon_index = 0;
	} else {
		struct anon_map *amp;

		amp = svd->amp;
		if (svd->type == MAP_SHARED) {
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
			amp->refcnt++;
			ANON_LOCK_EXIT(&amp->a_rwlock);
			newsvd->anon_index = svd->anon_index;
		} else {
			int reclaim = 1;

			/*
			 * Allocate and initialize new anon_map structure.
			 */
			newsvd->amp = anonmap_alloc(newseg->s_size, 0);
			newsvd->amp->a_szc = newseg->s_szc;
			newsvd->anon_index = 0;

			/*
			 * We don't have to acquire the anon_map lock
			 * for the new segment (since it belongs to an
			 * address space that is still not associated
			 * with any process), or the segment in the old
			 * address space (since all threads in it
			 * are stopped while duplicating the address space).
			 */

			/*
			 * The goal of the following code is to make sure that
			 * softlocked pages do not end up as copy on write
			 * pages.  This would cause problems where one
			 * thread writes to a page that is COW and a different
			 * thread in the same process has softlocked it.  The
			 * softlock lock would move away from this process
			 * because the write would cause this process to get
			 * a copy (without the softlock).
			 *
			 * The strategy here is to just break the
			 * sharing on pages that could possibly be
			 * softlocked.
			 */
retry:
			if (svd->softlockcnt) {
				struct anon *ap, *newap;
				size_t i;
				uint_t vpprot;
				page_t *anon_pl[1+1], *pp;
				caddr_t addr;
				ulong_t anon_idx = 0;

				/*
				 * The softlock count might be non zero
				 * because some pages are still stuck in the
				 * cache for lazy reclaim. Flush the cache
				 * now. This should drop the count to zero.
				 * [or there is really I/O going on to these
				 * pages]. Note, we have the writers lock so
				 * nothing gets inserted during the flush.
				 */
				if (reclaim == 1) {
					segvn_purge(seg);
					reclaim = 0;
					goto retry;
				}
				i = btopr(seg->s_size);
				addr = seg->s_base;
				/*
				 * XXX break cow sharing using PAGESIZE
				 * pages. They will be relocated into larger
				 * pages at fault time.
				 */
				while (i-- > 0) {
					if (ap = anon_get_ptr(amp->ahp,
					    anon_idx)) {
						error = anon_getpage(&ap,
						    &vpprot, anon_pl, PAGESIZE,
						    seg, addr, S_READ,
						    svd->cred);
						if (error) {
							newsvd->vpage = NULL;
							goto out;
						}
						/*
						 * prot need not be computed
						 * below 'cause anon_private is
						 * going to ignore it anyway
						 * as child doesn't inherit
						 * pagelock from parent.
						 */
						prot = svd->pageprot ?
						    VPP_PROT(
						    &svd->vpage[
						    seg_page(seg, addr)])
						    : svd->prot;
						pp = anon_private(&newap,
						    newseg, addr, prot,
						    anon_pl[0], 0,
						    newsvd->cred);
						if (pp == NULL) {
							/* no mem abort */
							newsvd->vpage = NULL;
							error = ENOMEM;
							goto out;
						}
						(void) anon_set_ptr(
						    newsvd->amp->ahp, anon_idx,
						    newap, ANON_SLEEP);
						page_unlock(pp);
					}
					addr += PAGESIZE;
					anon_idx++;
				}
			} else {	/* common case */
				if (seg->s_szc != 0) {
					/*
					 * If at least one of anon slots of a
					 * large page exists then make sure
					 * all anon slots of a large page
					 * exist to avoid partial cow sharing
					 * of a large page in the future.
					 */
					anon_dup_fill_holes(amp->ahp,
					    svd->anon_index, newsvd->amp->ahp,
					    0, seg->s_size, seg->s_szc,
					    svd->vp != NULL);
				} else {
					anon_dup(amp->ahp, svd->anon_index,
					    newsvd->amp->ahp, 0, seg->s_size);
				}

				hat_clrattr(seg->s_as->a_hat, seg->s_base,
				    seg->s_size, PROT_WRITE);
			}
		}
	}
	/*
	 * If necessary, create a vpage structure for the new segment.
	 * Do not copy any page lock indications.
	 */
	if (svd->vpage != NULL) {
		uint_t i;
		struct vpage *ovp = svd->vpage;
		struct vpage *nvp;

		nvp = newsvd->vpage =
		    kmem_alloc(vpgtob(npages), KM_SLEEP);
		for (i = 0; i < npages; i++) {
			*nvp = *ovp++;
			VPP_CLRPPLOCK(nvp++);
		}
	} else
		newsvd->vpage = NULL;

	/* Inform the vnode of the new mapping */
	if (newsvd->vp != NULL) {
		error = VOP_ADDMAP(newsvd->vp, (offset_t)newsvd->offset,
		    newseg->s_as, newseg->s_base, newseg->s_size, newsvd->prot,
		    newsvd->maxprot, newsvd->type, newsvd->cred);
	}
out:
	return (error);
}


/*
 * callback function used by segvn_unmap to invoke free_vp_pages() for only
 * those pages actually processed by the HAT
 */
extern int free_pages;

static void
segvn_hat_unload_callback(hat_callback_t *cb)
{
	struct seg		*seg = cb->hcb_data;
	struct segvn_data	*svd = (struct segvn_data *)seg->s_data;
	size_t			len;
	u_offset_t		off;

	ASSERT(svd->vp != NULL);
	ASSERT(cb->hcb_end_addr > cb->hcb_start_addr);
	ASSERT(cb->hcb_start_addr >= seg->s_base);

	len = cb->hcb_end_addr - cb->hcb_start_addr;
	off = cb->hcb_start_addr - seg->s_base;
	free_vp_pages(svd->vp, svd->offset + off, len);
}


static int
segvn_unmap(struct seg *seg, caddr_t addr, size_t len)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct segvn_data *nsvd;
	struct seg *nseg;
	struct anon_map *amp;
	pgcnt_t	opages;		/* old segment size in pages */
	pgcnt_t	npages;		/* new segment size in pages */
	pgcnt_t	dpages;		/* pages being deleted (unmapped) */
	hat_callback_t callback;	/* used for free_vp_pages() */
	hat_callback_t *cbp = NULL;
	caddr_t nbase;
	size_t nsize;
	size_t oswresv;
	int reclaim = 1;

	/*
	 * We don't need any segment level locks for "segvn" data
	 * since the address space is "write" locked.
	 */
	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));

	/*
	 * Fail the unmap if pages are SOFTLOCKed through this mapping.
	 * softlockcnt is protected from change by the as write lock.
	 */
retry:
	if (svd->softlockcnt > 0) {
		/*
		 * since we do have the writers lock nobody can fill
		 * the cache during the purge. The flush either succeeds
		 * or we still have pending I/Os.
		 */
		if (reclaim == 1) {
			segvn_purge(seg);
			reclaim = 0;
			goto retry;
		}
		return (EAGAIN);
	}

	/*
	 * Check for bad sizes
	 */
	if (addr < seg->s_base || addr + len > seg->s_base + seg->s_size ||
	    (len & PAGEOFFSET) || ((uintptr_t)addr & PAGEOFFSET)) {
		panic("segvn_unmap");
		/*NOTREACHED*/
	}

	if (seg->s_szc != 0) {
		size_t pgsz = page_get_pagesize(seg->s_szc);
		int err;
		if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) {
			ASSERT(seg->s_base != addr || seg->s_size != len);
			VM_STAT_ADD(segvnvmstats.demoterange[0]);
			err = segvn_demote_range(seg, addr, len, SDR_END);
			if (err == 0) {
				return (IE_RETRY);
			}
			return (err);
		}
	}

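	/*
	 * At this point addr and len are PAGESIZE aligned and, for a
	 * large-page segment, also aligned to the segment's large page size
	 * (otherwise the range was demoted above and IE_RETRY returned).
	 */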
	/*
	 * Inform the vnode of the unmapping.
	 */
	if (svd->vp) {
		int error;

		error = VOP_DELMAP(svd->vp,
		    (offset_t)svd->offset + (uintptr_t)(addr - seg->s_base),
		    seg->s_as, addr, len, svd->prot, svd->maxprot,
		    svd->type, svd->cred);

		if (error == EAGAIN)
			return (error);
	}
	/*
	 * Remove any page locks set through this mapping.
	 */
	(void) segvn_lockop(seg, addr, len, 0, MC_UNLOCK, NULL, 0);

	/*
	 * Unload any hardware translations in the range to be taken out.
	 * Use a callback to invoke free_vp_pages() effectively.
	 */
	if (svd->vp != NULL && free_pages != 0) {
		callback.hcb_data = seg;
		callback.hcb_function = segvn_hat_unload_callback;
		cbp = &callback;
	}
	hat_unload_callback(seg->s_as->a_hat, addr, len, HAT_UNLOAD_UNMAP, cbp);

	/*
	 * Check for entire segment
	 */
	if (addr == seg->s_base && len == seg->s_size) {
		seg_free(seg);
		return (0);
	}

	opages = seg_pages(seg);
	dpages = btop(len);
	npages = opages - dpages;
	amp = svd->amp;

	/*
	 * Check for beginning of segment
	 */
	if (addr == seg->s_base) {
		if (svd->vpage != NULL) {
			size_t nbytes;
			struct vpage *ovpage;

			ovpage = svd->vpage;	/* keep pointer to vpage */

			nbytes = vpgtob(npages);
			svd->vpage = kmem_alloc(nbytes, KM_SLEEP);
			bcopy(&ovpage[dpages], svd->vpage, nbytes);

			/* free up old vpage */
			kmem_free(ovpage, vpgtob(opages));
		}
		if (amp != NULL) {
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
			if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) {
				/*
				 * Free up now unused parts of anon_map array.
				 */
				if (seg->s_szc != 0) {
					anon_free_pages(amp->ahp,
					    svd->anon_index, len, seg->s_szc);
				} else {
					anon_free(amp->ahp, svd->anon_index,
					    len);
				}

				/*
				 * Unreserve swap space for the unmapped chunk
				 * of this segment in case it's MAP_SHARED
				 */
				if (svd->type == MAP_SHARED) {
					anon_unresv(len);
					amp->swresv -= len;
				}
			}
			ANON_LOCK_EXIT(&amp->a_rwlock);
			svd->anon_index += dpages;
		}
		if (svd->vp != NULL)
			svd->offset += len;

		if (svd->swresv) {
			if (svd->flags & MAP_NORESERVE) {
				ASSERT(amp);
				oswresv = svd->swresv;

				svd->swresv = ptob(anon_pages(amp->ahp,
				    svd->anon_index, npages));
				anon_unresv(oswresv - svd->swresv);
			} else {
				anon_unresv(len);
				svd->swresv -= len;
			}
			TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
			    seg, len, 0);
		}

		seg->s_base += len;
		seg->s_size -= len;
		return (0);
	}

	/*
	 * Check for end of segment
	 */
	if (addr + len == seg->s_base + seg->s_size) {
		if (svd->vpage != NULL) {
			size_t nbytes;
			struct vpage *ovpage;

			ovpage = svd->vpage;	/* keep pointer to vpage */

			nbytes = vpgtob(npages);
			svd->vpage = kmem_alloc(nbytes, KM_SLEEP);
			bcopy(ovpage, svd->vpage, nbytes);

			/* free up old vpage */
			kmem_free(ovpage, vpgtob(opages));

		}
		if (amp != NULL) {
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
			if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) {
				/*
				 * Free up now unused parts of anon_map array
				 */
				if (seg->s_szc != 0) {
					ulong_t an_idx = svd->anon_index +
					    npages;
					anon_free_pages(amp->ahp, an_idx,
					    len, seg->s_szc);
				} else {
					anon_free(amp->ahp,
					    svd->anon_index + npages, len);
				}
				/*
				 * Unreserve swap space for the unmapped chunk
				 * of this segment in case it's MAP_SHARED
				 */
				if (svd->type == MAP_SHARED) {
					anon_unresv(len);
					amp->swresv -= len;
				}
			}
			ANON_LOCK_EXIT(&amp->a_rwlock);
		}

		if (svd->swresv) {
			if (svd->flags & MAP_NORESERVE) {
				ASSERT(amp);
				oswresv = svd->swresv;
				svd->swresv = ptob(anon_pages(amp->ahp,
				    svd->anon_index, npages));
				anon_unresv(oswresv - svd->swresv);
			} else {
				anon_unresv(len);
				svd->swresv -= len;
			}
			TRACE_3(TR_FAC_VM, TR_ANON_PROC,
			    "anon proc:%p %lu %u", seg, len, 0);
		}

		seg->s_size -= len;
		return (0);
	}

	/*
	 * The section to go is in the middle of the segment,
	 * have to make it into two segments.  nseg is made for
	 * the high end while seg is cut down at the low end.
	 */
	nbase = addr + len;				/* new seg base */
	nsize = (seg->s_base + seg->s_size) - nbase;	/* new seg size */
	seg->s_size = addr - seg->s_base;	/* shrink old seg */
	nseg = seg_alloc(seg->s_as, nbase, nsize);
	if (nseg == NULL) {
		panic("segvn_unmap seg_alloc");
		/*NOTREACHED*/
	}
	nseg->s_ops = seg->s_ops;
	nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP);
	nseg->s_data = (void *)nsvd;
	nseg->s_szc = seg->s_szc;
	*nsvd = *svd;
	nsvd->offset = svd->offset + (uintptr_t)(nseg->s_base - seg->s_base);
	nsvd->swresv = 0;
	nsvd->softlockcnt = 0;

	if (svd->vp != NULL) {
		VN_HOLD(nsvd->vp);
		if (nsvd->type == MAP_SHARED)
			lgrp_shm_policy_init(NULL, nsvd->vp);
	}
	crhold(svd->cred);

	if (svd->vpage == NULL) {
		nsvd->vpage = NULL;
	} else {
		/* need to split vpage into two arrays */
		size_t nbytes;
		struct vpage *ovpage;

		ovpage = svd->vpage;		/* keep pointer to vpage */

		npages = seg_pages(seg);	/* seg has shrunk */
		nbytes = vpgtob(npages);
		svd->vpage = kmem_alloc(nbytes, KM_SLEEP);

		bcopy(ovpage, svd->vpage, nbytes);

		npages = seg_pages(nseg);
		nbytes = vpgtob(npages);
		nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP);

		bcopy(&ovpage[opages - npages], nsvd->vpage, nbytes);

		/* free up old vpage */
		kmem_free(ovpage, vpgtob(opages));
	}

	if (amp == NULL) {
		nsvd->amp = NULL;
		nsvd->anon_index = 0;
	} else {
		/*
		 * Need to create a new anon map for the new segment.
		 * We'll also allocate a new smaller array for the old
		 * smaller segment to save space.
		 */
		opages = btop((uintptr_t)(addr - seg->s_base));
		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
		if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) {
			/*
			 * Free up now unused parts of anon_map array
			 */
			if (seg->s_szc != 0) {
				ulong_t an_idx = svd->anon_index + opages;
				anon_free_pages(amp->ahp, an_idx, len,
				    seg->s_szc);
			} else {
				anon_free(amp->ahp, svd->anon_index + opages,
				    len);
			}

			/*
			 * Unreserve swap space for the unmapped chunk
			 * of this segment in case it's MAP_SHARED
			 */
			if (svd->type == MAP_SHARED) {
				anon_unresv(len);
				amp->swresv -= len;
			}
		}

		nsvd->anon_index = svd->anon_index +
		    btop((uintptr_t)(nseg->s_base - seg->s_base));
		if (svd->type == MAP_SHARED) {
			ASSERT(seg->s_szc == 0);
			amp->refcnt++;
			nsvd->amp = amp;
		} else {
			struct anon_map *namp;
			struct anon_hdr *nahp;

			ASSERT(svd->type == MAP_PRIVATE);
			nahp = anon_create(btop(seg->s_size), ANON_SLEEP);
			namp = anonmap_alloc(nseg->s_size, 0);
			namp->a_szc = seg->s_szc;
			(void) anon_copy_ptr(amp->ahp, svd->anon_index, nahp,
			    0, btop(seg->s_size), ANON_SLEEP);
			(void) anon_copy_ptr(amp->ahp, nsvd->anon_index,
			    namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP);
			anon_release(amp->ahp, btop(amp->size));
			svd->anon_index = 0;
			nsvd->anon_index = 0;
			amp->ahp = nahp;
			amp->size = seg->s_size;
			nsvd->amp = namp;
		}
		ANON_LOCK_EXIT(&amp->a_rwlock);
	}
	if (svd->swresv) {
		if (svd->flags & MAP_NORESERVE) {
			ASSERT(amp);
			oswresv = svd->swresv;
			svd->swresv = ptob(anon_pages(amp->ahp,
			    svd->anon_index, btop(seg->s_size)));
			nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp,
			    nsvd->anon_index, btop(nseg->s_size)));
			ASSERT(oswresv >= (svd->swresv + nsvd->swresv));
			anon_unresv(oswresv - (svd->swresv + nsvd->swresv));
		} else {
			if (seg->s_size + nseg->s_size + len != svd->swresv) {
				panic("segvn_unmap: "
				    "cannot split swap reservation");
				/*NOTREACHED*/
			}
			anon_unresv(len);
			svd->swresv = seg->s_size;
			nsvd->swresv = nseg->s_size;
		}
		TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
		    seg, len, 0);
	}

	return (0);			/* I'm glad that's all over with! */
}

static void
segvn_free(struct seg *seg)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	pgcnt_t npages = seg_pages(seg);
	struct anon_map *amp;
	size_t len;

	/*
	 * We don't need any segment level locks for "segvn" data
	 * since the address space is "write" locked.
	 */
	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));

	/*
	 * Be sure to unlock pages. XXX Why do things get free'ed instead
	 * of unmapped? XXX
	 */
	(void) segvn_lockop(seg, seg->s_base, seg->s_size,
	    0, MC_UNLOCK, NULL, 0);

	/*
	 * Deallocate the vpage and anon pointers if necessary and possible.
	 */
	if (svd->vpage != NULL) {
		kmem_free(svd->vpage, vpgtob(npages));
		svd->vpage = NULL;
	}
	if ((amp = svd->amp) != NULL) {
		/*
		 * If there are no more references to this anon_map
		 * structure, then deallocate the structure after freeing
		 * up all the anon slot pointers that we can.
		 */
		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
		if (--amp->refcnt == 0) {
			if (svd->type == MAP_PRIVATE) {
				/*
				 * Private - we only need to anon_free
				 * the part that this segment refers to.
				 */
				if (seg->s_szc != 0) {
					anon_free_pages(amp->ahp,
					    svd->anon_index, seg->s_size,
					    seg->s_szc);
				} else {
					anon_free(amp->ahp, svd->anon_index,
					    seg->s_size);
				}
			} else {
				/*
				 * Shared - anon_free the entire
				 * anon_map's worth of stuff and
				 * release any swap reservation.
				 */
				ASSERT(seg->s_szc == 0);
				anon_free(amp->ahp, 0, amp->size);
				if ((len = amp->swresv) != 0) {
					anon_unresv(len);
					TRACE_3(TR_FAC_VM, TR_ANON_PROC,
					    "anon proc:%p %lu %u",
					    seg, len, 0);
				}
			}
			svd->amp = NULL;
			ANON_LOCK_EXIT(&amp->a_rwlock);
			anonmap_free(amp);
		} else if (svd->type == MAP_PRIVATE) {
			/*
			 * We had a private mapping which still has
			 * a held anon_map so just free up all the
			 * anon slot pointers that we were using.
			 */
			if (seg->s_szc != 0) {
				anon_free_pages(amp->ahp, svd->anon_index,
				    seg->s_size, seg->s_szc);
			} else {
				anon_free(amp->ahp, svd->anon_index,
				    seg->s_size);
			}
			ANON_LOCK_EXIT(&amp->a_rwlock);
		} else {
			ANON_LOCK_EXIT(&amp->a_rwlock);
		}
	}

	/*
	 * Release swap reservation.
	 */
	if ((len = svd->swresv) != 0) {
		anon_unresv(svd->swresv);
		TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
		    seg, len, 0);
		svd->swresv = 0;
	}
	/*
	 * Release claim on vnode, credentials, and finally free the
	 * private data.
	 */
	if (svd->vp != NULL) {
		if (svd->type == MAP_SHARED)
			lgrp_shm_policy_fini(NULL, svd->vp);
		VN_RELE(svd->vp);
		svd->vp = NULL;
	}
	crfree(svd->cred);
	svd->cred = NULL;

	seg->s_data = NULL;
	kmem_cache_free(segvn_cache, svd);
}

/*
 * Do a F_SOFTUNLOCK call over the range requested.  The range must have
 * already been F_SOFTLOCK'ed.
 * Caller must always match addr and len of a softunlock with a previous
 * softlock with exactly the same addr and len.
 */
static void
segvn_softunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	page_t *pp;
	caddr_t adr;
	struct vnode *vp;
	u_offset_t offset;
	ulong_t anon_index;
	struct anon_map *amp;
	struct anon *ap = NULL;

	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
	ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock));

	if ((amp = svd->amp) != NULL)
		anon_index = svd->anon_index + seg_page(seg, addr);

	hat_unlock(seg->s_as->a_hat, addr, len);
	for (adr = addr; adr < addr + len; adr += PAGESIZE) {
		if (amp != NULL) {
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
			if ((ap = anon_get_ptr(amp->ahp, anon_index++))
			    != NULL) {
				swap_xlate(ap, &vp, &offset);
			} else {
				vp = svd->vp;
				offset = svd->offset +
				    (uintptr_t)(adr - seg->s_base);
			}
			ANON_LOCK_EXIT(&amp->a_rwlock);
		} else {
			vp = svd->vp;
			offset = svd->offset +
			    (uintptr_t)(adr - seg->s_base);
		}

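		/*
		 * At this point vp and offset name the object actually backing
		 * this address: the anon (swap) page if an anon slot exists,
		 * otherwise the underlying vnode page.
		 */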
		/*
		 * Use page_find() instead of page_lookup() to
		 * find the page since we know that it is locked.
		 */
		pp = page_find(vp, offset);
		if (pp == NULL) {
			panic(
			    "segvn_softunlock: addr %p, ap %p, vp %p, off %llx",
			    (void *)adr, (void *)ap, (void *)vp, offset);
			/*NOTREACHED*/
		}

		if (rw == S_WRITE) {
			hat_setrefmod(pp);
			if (seg->s_as->a_vbits)
				hat_setstat(seg->s_as, adr, PAGESIZE,
				    P_REF | P_MOD);
		} else if (rw != S_OTHER) {
			hat_setref(pp);
			if (seg->s_as->a_vbits)
				hat_setstat(seg->s_as, adr, PAGESIZE, P_REF);
		}
		TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT,
		    "segvn_fault:pp %p vp %p offset %llx", pp, vp, offset);
		page_unlock(pp);
	}
	mutex_enter(&freemem_lock); /* for availrmem */
	availrmem += btop(len);
	segvn_pages_locked -= btop(len);
	svd->softlockcnt -= btop(len);
	mutex_exit(&freemem_lock);
	if (svd->softlockcnt == 0) {
		/*
		 * All SOFTLOCKS are gone. Wakeup any waiting
		 * unmappers so they can try again to unmap.
		 * Check for waiters first without the mutex
		 * held so we don't always grab the mutex on
		 * softunlocks.
		 */
		if (AS_ISUNMAPWAIT(seg->s_as)) {
			mutex_enter(&seg->s_as->a_contents);
			if (AS_ISUNMAPWAIT(seg->s_as)) {
				AS_CLRUNMAPWAIT(seg->s_as);
				cv_broadcast(&seg->s_as->a_cv);
			}
			mutex_exit(&seg->s_as->a_contents);
		}
	}
}

#define	PAGE_HANDLED	((page_t *)-1)

/*
 * Release all the pages in the NULL terminated ppp list
 * which haven't already been converted to PAGE_HANDLED.
 */
static void
segvn_pagelist_rele(page_t **ppp)
{
	for (; *ppp != NULL; ppp++) {
		if (*ppp != PAGE_HANDLED)
			page_unlock(*ppp);
	}
}

static int stealcow = 1;

/*
 * Workaround for viking chip bug.  See bug id 1220902.
 * To fix this down in pagefault() would require importing so
 * much as and segvn code as to be unmaintainable.
 */
int enable_mbit_wa = 0;

/*
 * Handles all the dirty work of getting the right
 * anonymous pages and loading up the translations.
 * This routine is called only from segvn_fault()
 * when looping over the range of addresses requested.
1999 * 2000 * The basic algorithm here is: 2001 * If this is an anon_zero case 2002 * Call anon_zero to allocate page 2003 * Load up translation 2004 * Return 2005 * endif 2006 * If this is an anon page 2007 * Use anon_getpage to get the page 2008 * else 2009 * Find page in pl[] list passed in 2010 * endif 2011 * If not a cow 2012 * Load up the translation to the page 2013 * return 2014 * endif 2015 * Call anon_private to handle cow 2016 * Load up (writable) translation to new page 2017 */ 2018 static faultcode_t 2019 segvn_faultpage( 2020 struct hat *hat, /* the hat to use for mapping */ 2021 struct seg *seg, /* seg_vn of interest */ 2022 caddr_t addr, /* address in as */ 2023 u_offset_t off, /* offset in vp */ 2024 struct vpage *vpage, /* pointer to vpage for vp, off */ 2025 page_t *pl[], /* object source page pointer */ 2026 uint_t vpprot, /* access allowed to object pages */ 2027 enum fault_type type, /* type of fault */ 2028 enum seg_rw rw, /* type of access at fault */ 2029 int brkcow) /* we may need to break cow */ 2030 { 2031 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 2032 page_t *pp, **ppp; 2033 uint_t pageflags = 0; 2034 page_t *anon_pl[1 + 1]; 2035 page_t *opp = NULL; /* original page */ 2036 uint_t prot; 2037 int err; 2038 int cow; 2039 int claim; 2040 int steal = 0; 2041 ulong_t anon_index; 2042 struct anon *ap, *oldap; 2043 struct anon_map *amp; 2044 int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD; 2045 int anon_lock = 0; 2046 anon_sync_obj_t cookie; 2047 2048 if (svd->flags & MAP_TEXT) { 2049 hat_flag |= HAT_LOAD_TEXT; 2050 } 2051 2052 ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock)); 2053 ASSERT(seg->s_szc == 0); 2054 2055 /* 2056 * Initialize protection value for this page. 2057 * If we have per page protection values check it now. 2058 */ 2059 if (svd->pageprot) { 2060 uint_t protchk; 2061 2062 switch (rw) { 2063 case S_READ: 2064 protchk = PROT_READ; 2065 break; 2066 case S_WRITE: 2067 protchk = PROT_WRITE; 2068 break; 2069 case S_EXEC: 2070 protchk = PROT_EXEC; 2071 break; 2072 case S_OTHER: 2073 default: 2074 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 2075 break; 2076 } 2077 2078 prot = VPP_PROT(vpage); 2079 if ((prot & protchk) == 0) 2080 return (FC_PROT); /* illegal access type */ 2081 } else { 2082 prot = svd->prot; 2083 } 2084 2085 if (type == F_SOFTLOCK) { 2086 mutex_enter(&freemem_lock); 2087 if (availrmem <= tune.t_minarmem) { 2088 mutex_exit(&freemem_lock); 2089 return (FC_MAKE_ERR(ENOMEM)); /* out of real memory */ 2090 } else { 2091 svd->softlockcnt++; 2092 availrmem--; 2093 segvn_pages_locked++; 2094 } 2095 mutex_exit(&freemem_lock); 2096 } 2097 2098 /* 2099 * Always acquire the anon array lock to prevent 2 threads from 2100 * allocating separate anon slots for the same "addr". 2101 */ 2102 2103 if ((amp = svd->amp) != NULL) { 2104 ASSERT(RW_READ_HELD(&->a_rwlock)); 2105 anon_index = svd->anon_index + seg_page(seg, addr); 2106 anon_array_enter(amp, anon_index, &cookie); 2107 anon_lock = 1; 2108 } 2109 2110 if (svd->vp == NULL && amp != NULL) { 2111 if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) { 2112 /* 2113 * Allocate a (normally) writable anonymous page of 2114 * zeroes. If no advance reservations, reserve now. 
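 *
 * The zero-fill path below boils down to the following (simplified
 * sketch; error handling and the MAP_NORESERVE and vpage locking
 * details are in the real code):
 *
 *	if (svd->flags & MAP_NORESERVE)
 *		(void) anon_resv(ptob(1));	-- late swap reservation
 *	pp = anon_zero(seg, addr, &ap, svd->cred);
 *	(void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP);
 *	hat_memload(hat, addr, pp, prot, hat_flag);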
2115 */ 2116 if (svd->flags & MAP_NORESERVE) { 2117 if (anon_resv(ptob(1))) { 2118 svd->swresv += ptob(1); 2119 } else { 2120 err = ENOMEM; 2121 goto out; 2122 } 2123 } 2124 if ((pp = anon_zero(seg, addr, &ap, 2125 svd->cred)) == NULL) { 2126 err = ENOMEM; 2127 goto out; /* out of swap space */ 2128 } 2129 /* 2130 * Re-acquire the anon_map lock and 2131 * initialize the anon array entry. 2132 */ 2133 (void) anon_set_ptr(amp->ahp, anon_index, ap, 2134 ANON_SLEEP); 2135 if (enable_mbit_wa) { 2136 if (rw == S_WRITE) 2137 hat_setmod(pp); 2138 else if (!hat_ismod(pp)) 2139 prot &= ~PROT_WRITE; 2140 } 2141 /* 2142 * If AS_PAGLCK is set in a_flags (via memcntl(2) 2143 * with MC_LOCKAS, MCL_FUTURE) and this is a 2144 * MAP_NORESERVE segment, we may need to 2145 * permanently lock the page as it is being faulted 2146 * for the first time. The following text applies 2147 * only to MAP_NORESERVE segments: 2148 * 2149 * As per memcntl(2), if this segment was created 2150 * after MCL_FUTURE was applied (a "future" 2151 * segment), its pages must be locked. If this 2152 * segment existed at MCL_FUTURE application (a 2153 * "past" segment), the interface is unclear. 2154 * 2155 * We decide to lock only if vpage is present: 2156 * 2157 * - "future" segments will have a vpage array (see 2158 * as_map), and so will be locked as required 2159 * 2160 * - "past" segments may not have a vpage array, 2161 * depending on whether events (such as 2162 * mprotect) have occurred. Locking if vpage 2163 * exists will preserve legacy behavior. Not 2164 * locking if vpage is absent, will not break 2165 * the interface or legacy behavior. Note that 2166 * allocating vpage here if it's absent requires 2167 * upgrading the segvn reader lock, the cost of 2168 * which does not seem worthwhile. 2169 */ 2170 if (AS_ISPGLCK(seg->s_as) && vpage != NULL && 2171 (svd->flags & MAP_NORESERVE)) { 2172 claim = VPP_PROT(vpage) & PROT_WRITE; 2173 ASSERT(svd->type == MAP_PRIVATE); 2174 if (page_pp_lock(pp, claim, 0)) 2175 VPP_SETPPLOCK(vpage); 2176 } 2177 2178 2179 /* 2180 * Handle pages that have been marked for migration 2181 */ 2182 if (lgrp_optimizations()) 2183 page_migrate(seg, addr, &pp, 1); 2184 hat_memload(hat, addr, pp, prot, hat_flag); 2185 2186 if (!(hat_flag & HAT_LOAD_LOCK)) 2187 page_unlock(pp); 2188 2189 anon_array_exit(&cookie); 2190 return (0); 2191 } 2192 } 2193 2194 /* 2195 * Obtain the page structure via anon_getpage() if it is 2196 * a private copy of an object (the result of a previous 2197 * copy-on-write). 2198 */ 2199 if (amp != NULL) { 2200 if ((ap = anon_get_ptr(amp->ahp, anon_index)) != NULL) { 2201 err = anon_getpage(&ap, &vpprot, anon_pl, PAGESIZE, 2202 seg, addr, rw, svd->cred); 2203 if (err) 2204 goto out; 2205 2206 if (svd->type == MAP_SHARED) { 2207 /* 2208 * If this is a shared mapping to an 2209 * anon_map, then ignore the write 2210 * permissions returned by anon_getpage(). 2211 * They apply to the private mappings 2212 * of this anon_map. 2213 */ 2214 vpprot |= PROT_WRITE; 2215 } 2216 opp = anon_pl[0]; 2217 } 2218 } 2219 2220 /* 2221 * Search the pl[] list passed in if it is from the 2222 * original object (i.e., not a private copy). 2223 */ 2224 if (opp == NULL) { 2225 /* 2226 * Find original page. We must be bringing it in 2227 * from the list in pl[]. 
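 *
 * Entries consumed from pl[] are overwritten with the PAGE_HANDLED
 * sentinel (defined above as (page_t *)-1) so that
 * segvn_pagelist_rele() will skip them when the caller releases
 * whatever is left on the list.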
2228 */ 2229 for (ppp = pl; (opp = *ppp) != NULL; ppp++) { 2230 if (opp == PAGE_HANDLED) 2231 continue; 2232 ASSERT(opp->p_vnode == svd->vp); /* XXX */ 2233 if (opp->p_offset == off) 2234 break; 2235 } 2236 if (opp == NULL) { 2237 panic("segvn_faultpage not found"); 2238 /*NOTREACHED*/ 2239 } 2240 *ppp = PAGE_HANDLED; 2241 2242 } 2243 2244 ASSERT(PAGE_LOCKED(opp)); 2245 2246 TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT, 2247 "segvn_fault:pp %p vp %p offset %llx", 2248 opp, NULL, 0); 2249 2250 /* 2251 * The fault is treated as a copy-on-write fault if a 2252 * write occurs on a private segment and the object 2253 * page (i.e., mapping) is write protected. We assume 2254 * that fatal protection checks have already been made. 2255 */ 2256 2257 cow = brkcow && ((vpprot & PROT_WRITE) == 0); 2258 2259 /* 2260 * If not a copy-on-write case load the translation 2261 * and return. 2262 */ 2263 if (cow == 0) { 2264 if (IS_VMODSORT(opp->p_vnode) || enable_mbit_wa) { 2265 if (rw == S_WRITE) 2266 hat_setmod(opp); 2267 else if (rw != S_OTHER && !hat_ismod(opp)) 2268 prot &= ~PROT_WRITE; 2269 } 2270 2271 /* 2272 * Handle pages that have been marked for migration 2273 */ 2274 if (lgrp_optimizations()) 2275 page_migrate(seg, addr, &opp, 1); 2276 2277 hat_memload(hat, addr, opp, prot & vpprot, hat_flag); 2278 2279 if (!(hat_flag & HAT_LOAD_LOCK)) 2280 page_unlock(opp); 2281 2282 if (anon_lock) { 2283 anon_array_exit(&cookie); 2284 } 2285 return (0); 2286 } 2287 2288 hat_setref(opp); 2289 2290 ASSERT(amp != NULL && anon_lock); 2291 2292 /* 2293 * Steal the page only if it isn't a private page 2294 * since stealing a private page is not worth the effort. 2295 */ 2296 if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) 2297 steal = 1; 2298 2299 /* 2300 * Steal the original page if the following conditions are true: 2301 * 2302 * We are low on memory, the page is not private, page is not large, 2303 * not shared, not modified, not `locked' or if we have it `locked' 2304 * (i.e., p_cowcnt == 1 and p_lckcnt == 0, which also implies 2305 * that the page is not shared) and if it doesn't have any 2306 * translations. page_struct_lock isn't needed to look at p_cowcnt 2307 * and p_lckcnt because we first get exclusive lock on page. 2308 */ 2309 (void) hat_pagesync(opp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD); 2310 2311 if (stealcow && freemem < minfree && steal && opp->p_szc == 0 && 2312 page_tryupgrade(opp) && !hat_ismod(opp) && 2313 ((opp->p_lckcnt == 0 && opp->p_cowcnt == 0) || 2314 (opp->p_lckcnt == 0 && opp->p_cowcnt == 1 && 2315 vpage != NULL && VPP_ISPPLOCK(vpage)))) { 2316 /* 2317 * Check if this page has other translations 2318 * after unloading our translation. 2319 */ 2320 if (hat_page_is_mapped(opp)) { 2321 hat_unload(seg->s_as->a_hat, addr, PAGESIZE, 2322 HAT_UNLOAD); 2323 } 2324 2325 /* 2326 * hat_unload() might sync back someone else's recent 2327 * modification, so check again. 2328 */ 2329 if (!hat_ismod(opp) && !hat_page_is_mapped(opp)) 2330 pageflags |= STEAL_PAGE; 2331 } 2332 2333 /* 2334 * If we have a vpage pointer, see if it indicates that we have 2335 * ``locked'' the page we map -- if so, tell anon_private to 2336 * transfer the locking resource to the new page. 2337 * 2338 * See Statement at the beginning of segvn_lockop regarding 2339 * the way lockcnts/cowcnts are handled during COW. 2340 * 2341 */ 2342 if (vpage != NULL && VPP_ISPPLOCK(vpage)) 2343 pageflags |= LOCK_PAGE; 2344 2345 /* 2346 * Allocate a private page and perform the copy. 
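 *
 * In outline, the copy-on-write resolution below is (simplified
 * sketch; the swap reservation described next and the page stealing
 * and locking details are omitted):
 *
 *	oldap = ap;
 *	pp = anon_private(&ap, seg, addr, prot, opp, pageflags, svd->cred);
 *	if (oldap != NULL)
 *		anon_decref(oldap);	-- drop the old anon slot
 *	(void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP);
 *	hat_memload(hat, addr, pp, prot, hat_flag);
 *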
2347 * For MAP_NORESERVE reserve swap space now, unless this 2348 * is a cow fault on an existing anon page in which case 2349 * MAP_NORESERVE will have made advance reservations. 2350 */ 2351 if ((svd->flags & MAP_NORESERVE) && (ap == NULL)) { 2352 if (anon_resv(ptob(1))) { 2353 svd->swresv += ptob(1); 2354 } else { 2355 page_unlock(opp); 2356 err = ENOMEM; 2357 goto out; 2358 } 2359 } 2360 oldap = ap; 2361 pp = anon_private(&ap, seg, addr, prot, opp, pageflags, svd->cred); 2362 if (pp == NULL) { 2363 err = ENOMEM; /* out of swap space */ 2364 goto out; 2365 } 2366 2367 /* 2368 * If we copied away from an anonymous page, then 2369 * we are one step closer to freeing up an anon slot. 2370 * 2371 * NOTE: The original anon slot must be released while 2372 * holding the "anon_map" lock. This is necessary to prevent 2373 * other threads from obtaining a pointer to the anon slot 2374 * which may be freed if its "refcnt" is 1. 2375 */ 2376 if (oldap != NULL) 2377 anon_decref(oldap); 2378 2379 (void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP); 2380 2381 ASSERT(!IS_VMODSORT(pp->p_vnode)); 2382 if (enable_mbit_wa) { 2383 if (rw == S_WRITE) 2384 hat_setmod(pp); 2385 else if (!hat_ismod(pp)) 2386 prot &= ~PROT_WRITE; 2387 } 2388 2389 2390 /* 2391 * Handle pages that have been marked for migration 2392 */ 2393 if (lgrp_optimizations()) 2394 page_migrate(seg, addr, &pp, 1); 2395 hat_memload(hat, addr, pp, prot, hat_flag); 2396 2397 if (!(hat_flag & HAT_LOAD_LOCK)) 2398 page_unlock(pp); 2399 2400 ASSERT(anon_lock); 2401 anon_array_exit(&cookie); 2402 return (0); 2403 out: 2404 if (anon_lock) 2405 anon_array_exit(&cookie); 2406 2407 if (type == F_SOFTLOCK) { 2408 mutex_enter(&freemem_lock); 2409 availrmem++; 2410 segvn_pages_locked--; 2411 svd->softlockcnt--; 2412 mutex_exit(&freemem_lock); 2413 } 2414 return (FC_MAKE_ERR(err)); 2415 } 2416 2417 /* 2418 * relocate a bunch of smaller targ pages into one large repl page. all targ 2419 * pages must be complete pages smaller than replacement pages. 2420 * it's assumed that no page's szc can change since they are all PAGESIZE or 2421 * complete large pages locked SHARED. 
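 *
 * For illustration only (the counts are hypothetical): if
 * page_get_pagecnt(repl_szc) is 8, targ[] may hold eight PAGESIZE
 * pages, or two complete 4-constituent pages, or any mix of complete
 * smaller pages adding up to 8.  Each loop iteration below consumes
 * page_get_pagecnt(targ[i]->p_szc) pages from the replacement list
 * and moves them with a single page_relocate() call.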
2422 */ 2423 static void 2424 segvn_relocate_pages(page_t **targ, page_t *replacement) 2425 { 2426 page_t *pp; 2427 pgcnt_t repl_npgs, curnpgs; 2428 pgcnt_t i; 2429 uint_t repl_szc = replacement->p_szc; 2430 page_t *first_repl = replacement; 2431 page_t *repl; 2432 spgcnt_t npgs; 2433 2434 VM_STAT_ADD(segvnvmstats.relocatepages[0]); 2435 2436 ASSERT(repl_szc != 0); 2437 npgs = repl_npgs = page_get_pagecnt(repl_szc); 2438 2439 i = 0; 2440 while (repl_npgs) { 2441 spgcnt_t nreloc; 2442 int err; 2443 ASSERT(replacement != NULL); 2444 pp = targ[i]; 2445 ASSERT(pp->p_szc < repl_szc); 2446 ASSERT(PAGE_EXCL(pp)); 2447 ASSERT(!PP_ISFREE(pp)); 2448 curnpgs = page_get_pagecnt(pp->p_szc); 2449 if (curnpgs == 1) { 2450 VM_STAT_ADD(segvnvmstats.relocatepages[1]); 2451 repl = replacement; 2452 page_sub(&replacement, repl); 2453 ASSERT(PAGE_EXCL(repl)); 2454 ASSERT(!PP_ISFREE(repl)); 2455 ASSERT(repl->p_szc == repl_szc); 2456 } else { 2457 page_t *repl_savepp; 2458 int j; 2459 VM_STAT_ADD(segvnvmstats.relocatepages[2]); 2460 repl_savepp = replacement; 2461 for (j = 0; j < curnpgs; j++) { 2462 repl = replacement; 2463 page_sub(&replacement, repl); 2464 ASSERT(PAGE_EXCL(repl)); 2465 ASSERT(!PP_ISFREE(repl)); 2466 ASSERT(repl->p_szc == repl_szc); 2467 ASSERT(page_pptonum(targ[i + j]) == 2468 page_pptonum(targ[i]) + j); 2469 } 2470 repl = repl_savepp; 2471 ASSERT(IS_P2ALIGNED(page_pptonum(repl), curnpgs)); 2472 } 2473 err = page_relocate(&pp, &repl, 0, 1, &nreloc, NULL); 2474 if (err || nreloc != curnpgs) { 2475 panic("segvn_relocate_pages: " 2476 "page_relocate failed err=%d curnpgs=%ld " 2477 "nreloc=%ld", err, curnpgs, nreloc); 2478 } 2479 ASSERT(curnpgs <= repl_npgs); 2480 repl_npgs -= curnpgs; 2481 i += curnpgs; 2482 } 2483 ASSERT(replacement == NULL); 2484 2485 repl = first_repl; 2486 repl_npgs = npgs; 2487 for (i = 0; i < repl_npgs; i++) { 2488 ASSERT(PAGE_EXCL(repl)); 2489 ASSERT(!PP_ISFREE(repl)); 2490 targ[i] = repl; 2491 page_downgrade(targ[i]); 2492 repl = page_next(repl); 2493 } 2494 } 2495 2496 /* 2497 * Check if all pages in ppa array are complete smaller than szc pages and 2498 * their roots will still be aligned relative to their current size if the 2499 * entire ppa array is relocated into one szc page. If these conditions are 2500 * not met return 0. 2501 * 2502 * If all pages are properly aligned attempt to upgrade their locks 2503 * to exclusive mode. If it fails set *upgrdfail to 1 and return 0. 2504 * upgrdfail was set to 0 by caller. 2505 * 2506 * Return 1 if all pages are aligned and locked exclusively. 2507 * 2508 * If all pages in ppa array happen to be physically contiguous to make one 2509 * szc page and all exclusive locks are successfully obtained promote the page 2510 * size to szc and set *pszc to szc. Return 1 with pages locked shared. 
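 *
 * A minimal usage sketch (the real caller below also handles the
 * downsize and retry logic):
 *
 *	upgrdfail = 0;
 *	if (segvn_full_szcpages(ppa, szc, &upgrdfail, &pszc)) {
 *		-- either already promoted to szc and downgraded to
 *		-- shared, or all locked EXCL and ready to be relocated
 *		-- into the preallocated large page
 *	} else if (upgrdfail) {
 *		-- lock upgrade failed; caller sizes down to pszc
 *	} else {
 *		-- misalignment; caller falls back to small mappings
 *	}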
2511 */ 2512 static int 2513 segvn_full_szcpages(page_t **ppa, uint_t szc, int *upgrdfail, uint_t *pszc) 2514 { 2515 page_t *pp; 2516 pfn_t pfn; 2517 pgcnt_t totnpgs = page_get_pagecnt(szc); 2518 pfn_t first_pfn; 2519 int contig = 1; 2520 pgcnt_t i; 2521 pgcnt_t j; 2522 uint_t curszc; 2523 pgcnt_t curnpgs; 2524 int root = 0; 2525 2526 ASSERT(szc > 0); 2527 2528 VM_STAT_ADD(segvnvmstats.fullszcpages[0]); 2529 2530 for (i = 0; i < totnpgs; i++) { 2531 pp = ppa[i]; 2532 ASSERT(PAGE_SHARED(pp)); 2533 ASSERT(!PP_ISFREE(pp)); 2534 pfn = page_pptonum(pp); 2535 if (i == 0) { 2536 if (!IS_P2ALIGNED(pfn, totnpgs)) { 2537 contig = 0; 2538 } else { 2539 first_pfn = pfn; 2540 } 2541 } else if (contig && pfn != first_pfn + i) { 2542 contig = 0; 2543 } 2544 if (pp->p_szc == 0) { 2545 if (root) { 2546 VM_STAT_ADD(segvnvmstats.fullszcpages[1]); 2547 return (0); 2548 } 2549 } else if (!root) { 2550 if ((curszc = pp->p_szc) >= szc) { 2551 VM_STAT_ADD(segvnvmstats.fullszcpages[2]); 2552 return (0); 2553 } 2554 if (curszc == 0) { 2555 /* 2556 * p_szc changed means we don't have all pages 2557 * locked. return failure. 2558 */ 2559 VM_STAT_ADD(segvnvmstats.fullszcpages[3]); 2560 return (0); 2561 } 2562 curnpgs = page_get_pagecnt(curszc); 2563 if (!IS_P2ALIGNED(pfn, curnpgs) || 2564 !IS_P2ALIGNED(i, curnpgs)) { 2565 VM_STAT_ADD(segvnvmstats.fullszcpages[4]); 2566 return (0); 2567 } 2568 root = 1; 2569 } else { 2570 ASSERT(i > 0); 2571 VM_STAT_ADD(segvnvmstats.fullszcpages[5]); 2572 if (pp->p_szc != curszc) { 2573 VM_STAT_ADD(segvnvmstats.fullszcpages[6]); 2574 return (0); 2575 } 2576 if (pfn - 1 != page_pptonum(ppa[i - 1])) { 2577 panic("segvn_full_szcpages: " 2578 "large page not physically contiguous"); 2579 } 2580 if (P2PHASE(pfn, curnpgs) == curnpgs - 1) { 2581 root = 0; 2582 } 2583 } 2584 } 2585 2586 for (i = 0; i < totnpgs; i++) { 2587 ASSERT(ppa[i]->p_szc < szc); 2588 if (!page_tryupgrade(ppa[i])) { 2589 for (j = 0; j < i; j++) { 2590 page_downgrade(ppa[j]); 2591 } 2592 *pszc = ppa[i]->p_szc; 2593 *upgrdfail = 1; 2594 VM_STAT_ADD(segvnvmstats.fullszcpages[7]); 2595 return (0); 2596 } 2597 } 2598 2599 /* 2600 * When a page is put a free cachelist its szc is set to 0. if file 2601 * system reclaimed pages from cachelist targ pages will be physically 2602 * contiguous with 0 p_szc. in this case just upgrade szc of targ 2603 * pages without any relocations. 2604 * To avoid any hat issues with previous small mappings 2605 * hat_pageunload() the target pages first. 2606 */ 2607 if (contig) { 2608 VM_STAT_ADD(segvnvmstats.fullszcpages[8]); 2609 for (i = 0; i < totnpgs; i++) { 2610 (void) hat_pageunload(ppa[i], HAT_FORCE_PGUNLOAD); 2611 } 2612 for (i = 0; i < totnpgs; i++) { 2613 ppa[i]->p_szc = szc; 2614 } 2615 for (i = 0; i < totnpgs; i++) { 2616 ASSERT(PAGE_EXCL(ppa[i])); 2617 page_downgrade(ppa[i]); 2618 } 2619 if (pszc != NULL) { 2620 *pszc = szc; 2621 } 2622 } 2623 VM_STAT_ADD(segvnvmstats.fullszcpages[9]); 2624 return (1); 2625 } 2626 2627 /* 2628 * Create physically contiguous pages for [vp, off] - [vp, off + 2629 * page_size(szc)) range and for private segment return them in ppa array. 2630 * Pages are created either via IO or relocations. 2631 * 2632 * Return 1 on sucess and 0 on failure. 2633 * 2634 * If physically contiguos pages already exist for this range return 1 without 2635 * filling ppa array. Caller initializes ppa[0] as NULL to detect that ppa 2636 * array wasn't filled. In this case caller fills ppa array via VOP_GETPAGE(). 
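 *
 * A simplified sketch of the caller's use of this routine (see
 * segvn_fault_vnodepages() below for the real error handling):
 *
 *	ppa[0] = NULL;
 *	physcontig = segvn_fill_vp_pages(svd, vp, off, szc, ppa,
 *	    &pplist, &pszc, &downsize);
 *	if (physcontig && ppa[0] == NULL)
 *		physcontig = 0;	-- contiguous pages already existed
 *	if (!physcontig)
 *		-- fall back to VOP_GETPAGE() to fill ppa[]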
2637 */ 2638 2639 static int 2640 segvn_fill_vp_pages(struct segvn_data *svd, vnode_t *vp, u_offset_t off, 2641 uint_t szc, page_t **ppa, page_t **ppplist, uint_t *ret_pszc, 2642 int *downsize) 2643 2644 { 2645 page_t *pplist = *ppplist; 2646 size_t pgsz = page_get_pagesize(szc); 2647 pgcnt_t pages = btop(pgsz); 2648 ulong_t start_off = off; 2649 u_offset_t eoff = off + pgsz; 2650 spgcnt_t nreloc; 2651 u_offset_t io_off = off; 2652 size_t io_len; 2653 page_t *io_pplist = NULL; 2654 page_t *done_pplist = NULL; 2655 pgcnt_t pgidx = 0; 2656 page_t *pp; 2657 page_t *newpp; 2658 page_t *targpp; 2659 int io_err = 0; 2660 int i; 2661 pfn_t pfn; 2662 ulong_t ppages; 2663 page_t *targ_pplist = NULL; 2664 page_t *repl_pplist = NULL; 2665 page_t *tmp_pplist; 2666 int nios = 0; 2667 uint_t pszc; 2668 struct vattr va; 2669 2670 VM_STAT_ADD(segvnvmstats.fill_vp_pages[0]); 2671 2672 ASSERT(szc != 0); 2673 ASSERT(pplist->p_szc == szc); 2674 2675 /* 2676 * downsize will be set to 1 only if we fail to lock pages. this will 2677 * allow subsequent faults to try to relocate the page again. If we 2678 * fail due to misalignment don't downsize and let the caller map the 2679 * whole region with small mappings to avoid more faults into the area 2680 * where we can't get large pages anyway. 2681 */ 2682 *downsize = 0; 2683 2684 while (off < eoff) { 2685 newpp = pplist; 2686 ASSERT(newpp != NULL); 2687 ASSERT(PAGE_EXCL(newpp)); 2688 ASSERT(!PP_ISFREE(newpp)); 2689 /* 2690 * we pass NULL for nrelocp to page_lookup_create() 2691 * so that it doesn't relocate. We relocate here 2692 * later only after we make sure we can lock all 2693 * pages in the range we handle and they are all 2694 * aligned. 2695 */ 2696 pp = page_lookup_create(vp, off, SE_SHARED, newpp, NULL, 0); 2697 ASSERT(pp != NULL); 2698 ASSERT(!PP_ISFREE(pp)); 2699 ASSERT(pp->p_vnode == vp); 2700 ASSERT(pp->p_offset == off); 2701 if (pp == newpp) { 2702 VM_STAT_ADD(segvnvmstats.fill_vp_pages[1]); 2703 page_sub(&pplist, pp); 2704 ASSERT(PAGE_EXCL(pp)); 2705 ASSERT(page_iolock_assert(pp)); 2706 page_list_concat(&io_pplist, &pp); 2707 off += PAGESIZE; 2708 continue; 2709 } 2710 VM_STAT_ADD(segvnvmstats.fill_vp_pages[2]); 2711 pfn = page_pptonum(pp); 2712 pszc = pp->p_szc; 2713 if (pszc >= szc && targ_pplist == NULL && io_pplist == NULL && 2714 IS_P2ALIGNED(pfn, pages)) { 2715 ASSERT(repl_pplist == NULL); 2716 ASSERT(done_pplist == NULL); 2717 ASSERT(pplist == *ppplist); 2718 page_unlock(pp); 2719 page_free_replacement_page(pplist); 2720 page_create_putback(pages); 2721 *ppplist = NULL; 2722 VM_STAT_ADD(segvnvmstats.fill_vp_pages[3]); 2723 return (1); 2724 } 2725 if (pszc >= szc) { 2726 page_unlock(pp); 2727 segvn_faultvnmpss_align_err1++; 2728 goto out; 2729 } 2730 ppages = page_get_pagecnt(pszc); 2731 if (!IS_P2ALIGNED(pfn, ppages)) { 2732 ASSERT(pszc > 0); 2733 /* 2734 * sizing down to pszc won't help. 2735 */ 2736 page_unlock(pp); 2737 segvn_faultvnmpss_align_err2++; 2738 goto out; 2739 } 2740 pfn = page_pptonum(newpp); 2741 if (!IS_P2ALIGNED(pfn, ppages)) { 2742 ASSERT(pszc > 0); 2743 /* 2744 * sizing down to pszc won't help. 
2745 */ 2746 page_unlock(pp); 2747 segvn_faultvnmpss_align_err3++; 2748 goto out; 2749 } 2750 if (!PAGE_EXCL(pp)) { 2751 VM_STAT_ADD(segvnvmstats.fill_vp_pages[4]); 2752 page_unlock(pp); 2753 *downsize = 1; 2754 *ret_pszc = pp->p_szc; 2755 goto out; 2756 } 2757 targpp = pp; 2758 if (io_pplist != NULL) { 2759 VM_STAT_ADD(segvnvmstats.fill_vp_pages[5]); 2760 io_len = off - io_off; 2761 /* 2762 * Some file systems like NFS don't check EOF 2763 * conditions in VOP_PAGEIO(). Check it here 2764 * now that pages are locked SE_EXCL. Any file 2765 * truncation will wait until the pages are 2766 * unlocked so no need to worry that file will 2767 * be truncated after we check its size here. 2768 * XXX fix NFS to remove this check. 2769 */ 2770 va.va_mask = AT_SIZE; 2771 if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred) != 0) { 2772 VM_STAT_ADD(segvnvmstats.fill_vp_pages[6]); 2773 page_unlock(targpp); 2774 goto out; 2775 } 2776 if (btopr(va.va_size) < btopr(io_off + io_len)) { 2777 VM_STAT_ADD(segvnvmstats.fill_vp_pages[7]); 2778 *downsize = 1; 2779 *ret_pszc = 0; 2780 page_unlock(targpp); 2781 goto out; 2782 } 2783 io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len, 2784 B_READ, svd->cred); 2785 if (io_err) { 2786 VM_STAT_ADD(segvnvmstats.fill_vp_pages[8]); 2787 page_unlock(targpp); 2788 if (io_err == EDEADLK) { 2789 segvn_vmpss_pageio_deadlk_err++; 2790 } 2791 goto out; 2792 } 2793 nios++; 2794 VM_STAT_ADD(segvnvmstats.fill_vp_pages[9]); 2795 while (io_pplist != NULL) { 2796 pp = io_pplist; 2797 page_sub(&io_pplist, pp); 2798 ASSERT(page_iolock_assert(pp)); 2799 page_io_unlock(pp); 2800 pgidx = (pp->p_offset - start_off) >> 2801 PAGESHIFT; 2802 ASSERT(pgidx < pages); 2803 ppa[pgidx] = pp; 2804 page_list_concat(&done_pplist, &pp); 2805 } 2806 } 2807 pp = targpp; 2808 ASSERT(PAGE_EXCL(pp)); 2809 ASSERT(pp->p_szc <= pszc); 2810 if (pszc != 0 && !group_page_trylock(pp, SE_EXCL)) { 2811 VM_STAT_ADD(segvnvmstats.fill_vp_pages[10]); 2812 page_unlock(pp); 2813 *downsize = 1; 2814 *ret_pszc = pp->p_szc; 2815 goto out; 2816 } 2817 VM_STAT_ADD(segvnvmstats.fill_vp_pages[11]); 2818 /* 2819 * page szc could have changed before the entire group was 2820 * locked. reread page szc.
2821 */ 2822 pszc = pp->p_szc; 2823 ppages = page_get_pagecnt(pszc); 2824 2825 /* link just the roots */ 2826 page_list_concat(&targ_pplist, &pp); 2827 page_sub(&pplist, newpp); 2828 page_list_concat(&repl_pplist, &newpp); 2829 off += PAGESIZE; 2830 while (--ppages != 0) { 2831 newpp = pplist; 2832 page_sub(&pplist, newpp); 2833 off += PAGESIZE; 2834 } 2835 io_off = off; 2836 } 2837 if (io_pplist != NULL) { 2838 VM_STAT_ADD(segvnvmstats.fill_vp_pages[12]); 2839 io_len = eoff - io_off; 2840 va.va_mask = AT_SIZE; 2841 if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred) != 0) { 2842 VM_STAT_ADD(segvnvmstats.fill_vp_pages[13]); 2843 goto out; 2844 } 2845 if (btopr(va.va_size) < btopr(io_off + io_len)) { 2846 VM_STAT_ADD(segvnvmstats.fill_vp_pages[14]); 2847 *downsize = 1; 2848 *ret_pszc = 0; 2849 goto out; 2850 } 2851 io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len, 2852 B_READ, svd->cred); 2853 if (io_err) { 2854 VM_STAT_ADD(segvnvmstats.fill_vp_pages[15]); 2855 if (io_err == EDEADLK) { 2856 segvn_vmpss_pageio_deadlk_err++; 2857 } 2858 goto out; 2859 } 2860 nios++; 2861 while (io_pplist != NULL) { 2862 pp = io_pplist; 2863 page_sub(&io_pplist, pp); 2864 ASSERT(page_iolock_assert(pp)); 2865 page_io_unlock(pp); 2866 pgidx = (pp->p_offset - start_off) >> PAGESHIFT; 2867 ASSERT(pgidx < pages); 2868 ppa[pgidx] = pp; 2869 } 2870 } 2871 /* 2872 * we're now bound to succeed or panic. 2873 * remove pages from done_pplist. it's not needed anymore. 2874 */ 2875 while (done_pplist != NULL) { 2876 pp = done_pplist; 2877 page_sub(&done_pplist, pp); 2878 } 2879 VM_STAT_ADD(segvnvmstats.fill_vp_pages[16]); 2880 ASSERT(pplist == NULL); 2881 *ppplist = NULL; 2882 while (targ_pplist != NULL) { 2883 int ret; 2884 VM_STAT_ADD(segvnvmstats.fill_vp_pages[17]); 2885 ASSERT(repl_pplist); 2886 pp = targ_pplist; 2887 page_sub(&targ_pplist, pp); 2888 pgidx = (pp->p_offset - start_off) >> PAGESHIFT; 2889 newpp = repl_pplist; 2890 page_sub(&repl_pplist, newpp); 2891 #ifdef DEBUG 2892 pfn = page_pptonum(pp); 2893 pszc = pp->p_szc; 2894 ppages = page_get_pagecnt(pszc); 2895 ASSERT(IS_P2ALIGNED(pfn, ppages)); 2896 pfn = page_pptonum(newpp); 2897 ASSERT(IS_P2ALIGNED(pfn, ppages)); 2898 ASSERT(P2PHASE(pfn, pages) == pgidx); 2899 #endif 2900 nreloc = 0; 2901 ret = page_relocate(&pp, &newpp, 0, 1, &nreloc, NULL); 2902 if (ret != 0 || nreloc == 0) { 2903 panic("segvn_fill_vp_pages: " 2904 "page_relocate failed"); 2905 } 2906 pp = newpp; 2907 while (nreloc-- != 0) { 2908 ASSERT(PAGE_EXCL(pp)); 2909 ASSERT(pp->p_vnode == vp); 2910 ASSERT(pgidx == 2911 ((pp->p_offset - start_off) >> PAGESHIFT)); 2912 ppa[pgidx++] = pp; 2913 pp = page_next(pp); 2914 } 2915 } 2916 2917 if (svd->type == MAP_PRIVATE) { 2918 VM_STAT_ADD(segvnvmstats.fill_vp_pages[18]); 2919 for (i = 0; i < pages; i++) { 2920 ASSERT(ppa[i] != NULL); 2921 ASSERT(PAGE_EXCL(ppa[i])); 2922 ASSERT(ppa[i]->p_vnode == vp); 2923 ASSERT(ppa[i]->p_offset == 2924 start_off + (i << PAGESHIFT)); 2925 page_downgrade(ppa[i]); 2926 } 2927 ppa[pages] = NULL; 2928 } else { 2929 VM_STAT_ADD(segvnvmstats.fill_vp_pages[19]); 2930 /* 2931 * the caller will still call VOP_GETPAGE() for shared segments 2932 * to check FS write permissions. For private segments we map 2933 * file read only anyway. so no VOP_GETPAGE is needed. 
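 *
 * Resetting ppa[0] to NULL below is what tells the caller that the
 * array was deliberately left unfilled (see the contract described
 * above this function), so it knows to do the VOP_GETPAGE() itself.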
2934 */ 2935 for (i = 0; i < pages; i++) { 2936 ASSERT(ppa[i] != NULL); 2937 ASSERT(PAGE_EXCL(ppa[i])); 2938 ASSERT(ppa[i]->p_vnode == vp); 2939 ASSERT(ppa[i]->p_offset == 2940 start_off + (i << PAGESHIFT)); 2941 page_unlock(ppa[i]); 2942 } 2943 ppa[0] = NULL; 2944 } 2945 2946 return (1); 2947 out: 2948 /* 2949 * Do the cleanup. Unlock target pages we didn't relocate. They are 2950 * linked on targ_pplist by root pages. reassemble unused replacement 2951 * and io pages back to pplist. 2952 */ 2953 if (io_pplist != NULL) { 2954 VM_STAT_ADD(segvnvmstats.fill_vp_pages[20]); 2955 pp = io_pplist; 2956 do { 2957 ASSERT(pp->p_vnode == vp); 2958 ASSERT(pp->p_offset == io_off); 2959 ASSERT(page_iolock_assert(pp)); 2960 page_io_unlock(pp); 2961 page_hashout(pp, NULL); 2962 io_off += PAGESIZE; 2963 } while ((pp = pp->p_next) != io_pplist); 2964 page_list_concat(&io_pplist, &pplist); 2965 pplist = io_pplist; 2966 } 2967 tmp_pplist = NULL; 2968 while (targ_pplist != NULL) { 2969 VM_STAT_ADD(segvnvmstats.fill_vp_pages[21]); 2970 pp = targ_pplist; 2971 ASSERT(PAGE_EXCL(pp)); 2972 page_sub(&targ_pplist, pp); 2973 2974 pszc = pp->p_szc; 2975 ppages = page_get_pagecnt(pszc); 2976 ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages)); 2977 2978 if (pszc != 0) { 2979 group_page_unlock(pp); 2980 } 2981 page_unlock(pp); 2982 2983 pp = repl_pplist; 2984 ASSERT(pp != NULL); 2985 ASSERT(PAGE_EXCL(pp)); 2986 ASSERT(pp->p_szc == szc); 2987 page_sub(&repl_pplist, pp); 2988 2989 ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages)); 2990 2991 /* relink replacement page */ 2992 page_list_concat(&tmp_pplist, &pp); 2993 while (--ppages != 0) { 2994 VM_STAT_ADD(segvnvmstats.fill_vp_pages[22]); 2995 pp = page_next(pp); 2996 ASSERT(PAGE_EXCL(pp)); 2997 ASSERT(pp->p_szc == szc); 2998 page_list_concat(&tmp_pplist, &pp); 2999 } 3000 } 3001 if (tmp_pplist != NULL) { 3002 VM_STAT_ADD(segvnvmstats.fill_vp_pages[23]); 3003 page_list_concat(&tmp_pplist, &pplist); 3004 pplist = tmp_pplist; 3005 } 3006 /* 3007 * at this point all pages are either on done_pplist or 3008 * pplist. They can't be all on done_pplist otherwise 3009 * we'd've been done. 3010 */ 3011 ASSERT(pplist != NULL); 3012 if (nios != 0) { 3013 VM_STAT_ADD(segvnvmstats.fill_vp_pages[24]); 3014 pp = pplist; 3015 do { 3016 VM_STAT_ADD(segvnvmstats.fill_vp_pages[25]); 3017 ASSERT(pp->p_szc == szc); 3018 ASSERT(PAGE_EXCL(pp)); 3019 ASSERT(pp->p_vnode != vp); 3020 pp->p_szc = 0; 3021 } while ((pp = pp->p_next) != pplist); 3022 3023 pp = done_pplist; 3024 do { 3025 VM_STAT_ADD(segvnvmstats.fill_vp_pages[26]); 3026 ASSERT(pp->p_szc == szc); 3027 ASSERT(PAGE_EXCL(pp)); 3028 ASSERT(pp->p_vnode == vp); 3029 pp->p_szc = 0; 3030 } while ((pp = pp->p_next) != done_pplist); 3031 3032 while (pplist != NULL) { 3033 VM_STAT_ADD(segvnvmstats.fill_vp_pages[27]); 3034 pp = pplist; 3035 page_sub(&pplist, pp); 3036 page_free(pp, 0); 3037 } 3038 3039 while (done_pplist != NULL) { 3040 VM_STAT_ADD(segvnvmstats.fill_vp_pages[28]); 3041 pp = done_pplist; 3042 page_sub(&done_pplist, pp); 3043 page_unlock(pp); 3044 } 3045 *ppplist = NULL; 3046 return (0); 3047 } 3048 ASSERT(pplist == *ppplist); 3049 if (io_err) { 3050 VM_STAT_ADD(segvnvmstats.fill_vp_pages[29]); 3051 /* 3052 * don't downsize on io error. 3053 * see if vop_getpage succeeds. 3054 * pplist may still be used in this case 3055 * for relocations. 
3056 */ 3057 return (0); 3058 } 3059 VM_STAT_ADD(segvnvmstats.fill_vp_pages[30]); 3060 page_free_replacement_page(pplist); 3061 page_create_putback(pages); 3062 *ppplist = NULL; 3063 return (0); 3064 } 3065 3066 int segvn_anypgsz = 0; 3067 3068 #define SEGVN_RESTORE_SOFTLOCK(type, pages) \ 3069 if ((type) == F_SOFTLOCK) { \ 3070 mutex_enter(&freemem_lock); \ 3071 availrmem += (pages); \ 3072 segvn_pages_locked -= (pages); \ 3073 svd->softlockcnt -= (pages); \ 3074 mutex_exit(&freemem_lock); \ 3075 } 3076 3077 #define SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot) \ 3078 if (IS_VMODSORT((ppa)[0]->p_vnode)) { \ 3079 if ((rw) == S_WRITE) { \ 3080 for (i = 0; i < (pages); i++) { \ 3081 ASSERT((ppa)[i]->p_vnode == \ 3082 (ppa)[0]->p_vnode); \ 3083 hat_setmod((ppa)[i]); \ 3084 } \ 3085 } else if ((rw) != S_OTHER && \ 3086 ((prot) & (vpprot) & PROT_WRITE)) { \ 3087 for (i = 0; i < (pages); i++) { \ 3088 ASSERT((ppa)[i]->p_vnode == \ 3089 (ppa)[0]->p_vnode); \ 3090 if (!hat_ismod((ppa)[i])) { \ 3091 prot &= ~PROT_WRITE; \ 3092 break; \ 3093 } \ 3094 } \ 3095 } \ 3096 } 3097 3098 #ifdef VM_STATS 3099 3100 #define SEGVN_VMSTAT_FLTVNPAGES(idx) \ 3101 VM_STAT_ADD(segvnvmstats.fltvnpages[(idx)]); 3102 3103 #else /* VM_STATS */ 3104 3105 #define SEGVN_VMSTAT_FLTVNPAGES(idx) 3106 3107 #endif 3108 3109 static faultcode_t 3110 segvn_fault_vnodepages(struct hat *hat, struct seg *seg, caddr_t lpgaddr, 3111 caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr, 3112 caddr_t eaddr, int brkcow) 3113 { 3114 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 3115 struct anon_map *amp = svd->amp; 3116 uchar_t segtype = svd->type; 3117 uint_t szc = seg->s_szc; 3118 size_t pgsz = page_get_pagesize(szc); 3119 size_t maxpgsz = pgsz; 3120 pgcnt_t pages = btop(pgsz); 3121 pgcnt_t maxpages = pages; 3122 size_t ppasize = (pages + 1) * sizeof (page_t *); 3123 caddr_t a = lpgaddr; 3124 caddr_t maxlpgeaddr = lpgeaddr; 3125 u_offset_t off = svd->offset + (uintptr_t)(a - seg->s_base); 3126 ulong_t aindx = svd->anon_index + seg_page(seg, a); 3127 struct vpage *vpage = (svd->vpage != NULL) ? 3128 &svd->vpage[seg_page(seg, a)] : NULL; 3129 vnode_t *vp = svd->vp; 3130 page_t **ppa; 3131 uint_t pszc; 3132 size_t ppgsz; 3133 pgcnt_t ppages; 3134 faultcode_t err = 0; 3135 int ierr; 3136 int vop_size_err = 0; 3137 uint_t protchk, prot, vpprot; 3138 ulong_t i; 3139 int hat_flag = (type == F_SOFTLOCK) ? 
HAT_LOAD_LOCK : HAT_LOAD; 3140 anon_sync_obj_t an_cookie; 3141 enum seg_rw arw; 3142 int alloc_failed = 0; 3143 int adjszc_chk; 3144 struct vattr va; 3145 int xhat = 0; 3146 page_t *pplist; 3147 pfn_t pfn; 3148 int physcontig; 3149 int upgrdfail; 3150 int segvn_anypgsz_vnode = 0; /* for now map vnode with 2 page sizes */ 3151 3152 ASSERT(szc != 0); 3153 ASSERT(vp != NULL); 3154 ASSERT(brkcow == 0 || amp != NULL); 3155 ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */ 3156 ASSERT(!(svd->flags & MAP_NORESERVE)); 3157 ASSERT(type != F_SOFTUNLOCK); 3158 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 3159 ASSERT(amp == NULL || IS_P2ALIGNED(aindx, maxpages)); 3160 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 3161 ASSERT(seg->s_szc < NBBY * sizeof (int)); 3162 ASSERT(type != F_SOFTLOCK || lpgeaddr - a == maxpgsz); 3163 3164 VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltvnpages[0]); 3165 VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltvnpages[1]); 3166 3167 if (svd->flags & MAP_TEXT) { 3168 hat_flag |= HAT_LOAD_TEXT; 3169 } 3170 3171 if (svd->pageprot) { 3172 switch (rw) { 3173 case S_READ: 3174 protchk = PROT_READ; 3175 break; 3176 case S_WRITE: 3177 protchk = PROT_WRITE; 3178 break; 3179 case S_EXEC: 3180 protchk = PROT_EXEC; 3181 break; 3182 case S_OTHER: 3183 default: 3184 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 3185 break; 3186 } 3187 } else { 3188 prot = svd->prot; 3189 /* caller has already done segment level protection check. */ 3190 } 3191 3192 if (seg->s_as->a_hat != hat) { 3193 xhat = 1; 3194 } 3195 3196 if (rw == S_WRITE && segtype == MAP_PRIVATE) { 3197 SEGVN_VMSTAT_FLTVNPAGES(2); 3198 arw = S_READ; 3199 } else { 3200 arw = rw; 3201 } 3202 3203 ppa = kmem_alloc(ppasize, KM_SLEEP); 3204 3205 VM_STAT_COND_ADD(amp != NULL, segvnvmstats.fltvnpages[3]); 3206 3207 for (;;) { 3208 adjszc_chk = 0; 3209 for (; a < lpgeaddr; a += pgsz, off += pgsz, aindx += pages) { 3210 if (adjszc_chk) { 3211 while (szc < seg->s_szc) { 3212 uintptr_t e; 3213 uint_t tszc; 3214 tszc = segvn_anypgsz_vnode ? 
szc + 1 : 3215 seg->s_szc; 3216 ppgsz = page_get_pagesize(tszc); 3217 if (!IS_P2ALIGNED(a, ppgsz) || 3218 ((alloc_failed >> tszc) & 3219 0x1)) { 3220 break; 3221 } 3222 SEGVN_VMSTAT_FLTVNPAGES(4); 3223 szc = tszc; 3224 pgsz = ppgsz; 3225 pages = btop(pgsz); 3226 e = P2ROUNDUP((uintptr_t)eaddr, pgsz); 3227 lpgeaddr = (caddr_t)e; 3228 } 3229 } 3230 3231 again: 3232 if (IS_P2ALIGNED(a, maxpgsz) && amp != NULL) { 3233 ASSERT(IS_P2ALIGNED(aindx, maxpages)); 3234 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 3235 anon_array_enter(amp, aindx, &an_cookie); 3236 if (anon_get_ptr(amp->ahp, aindx) != NULL) { 3237 SEGVN_VMSTAT_FLTVNPAGES(5); 3238 if (anon_pages(amp->ahp, aindx, 3239 maxpages) != maxpages) { 3240 panic("segvn_fault_vnodepages:" 3241 " empty anon slots\n"); 3242 } 3243 anon_array_exit(&an_cookie); 3244 ANON_LOCK_EXIT(&->a_rwlock); 3245 err = segvn_fault_anonpages(hat, seg, 3246 a, a + maxpgsz, type, rw, 3247 MAX(a, addr), 3248 MIN(a + maxpgsz, eaddr), brkcow); 3249 if (err != 0) { 3250 SEGVN_VMSTAT_FLTVNPAGES(6); 3251 goto out; 3252 } 3253 if (szc < seg->s_szc) { 3254 szc = seg->s_szc; 3255 pgsz = maxpgsz; 3256 pages = maxpages; 3257 lpgeaddr = maxlpgeaddr; 3258 } 3259 goto next; 3260 } else if (anon_pages(amp->ahp, aindx, 3261 maxpages)) { 3262 panic("segvn_fault_vnodepages:" 3263 " non empty anon slots\n"); 3264 } else { 3265 SEGVN_VMSTAT_FLTVNPAGES(7); 3266 anon_array_exit(&an_cookie); 3267 ANON_LOCK_EXIT(&->a_rwlock); 3268 } 3269 } 3270 ASSERT(!brkcow || IS_P2ALIGNED(a, maxpgsz)); 3271 3272 if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) { 3273 ASSERT(vpage != NULL); 3274 prot = VPP_PROT(vpage); 3275 ASSERT(sameprot(seg, a, maxpgsz)); 3276 if ((prot & protchk) == 0) { 3277 SEGVN_VMSTAT_FLTVNPAGES(8); 3278 err = FC_PROT; 3279 goto out; 3280 } 3281 } 3282 if (type == F_SOFTLOCK) { 3283 mutex_enter(&freemem_lock); 3284 if (availrmem < tune.t_minarmem + pages) { 3285 mutex_exit(&freemem_lock); 3286 err = FC_MAKE_ERR(ENOMEM); 3287 goto out; 3288 } else { 3289 availrmem -= pages; 3290 segvn_pages_locked += pages; 3291 svd->softlockcnt += pages; 3292 } 3293 mutex_exit(&freemem_lock); 3294 } 3295 3296 pplist = NULL; 3297 physcontig = 0; 3298 ppa[0] = NULL; 3299 if (!brkcow && szc && 3300 !page_exists_physcontig(vp, off, szc, 3301 segtype == MAP_PRIVATE ? 
ppa : NULL)) { 3302 SEGVN_VMSTAT_FLTVNPAGES(9); 3303 if (page_alloc_pages(seg, a, &pplist, NULL, 3304 szc, 0) && type != F_SOFTLOCK) { 3305 SEGVN_VMSTAT_FLTVNPAGES(10); 3306 pszc = 0; 3307 ierr = -1; 3308 alloc_failed |= (1 << szc); 3309 break; 3310 } 3311 if (pplist != NULL && 3312 vp->v_mpssdata == SEGVN_PAGEIO) { 3313 int downsize; 3314 SEGVN_VMSTAT_FLTVNPAGES(11); 3315 physcontig = segvn_fill_vp_pages(svd, 3316 vp, off, szc, ppa, &pplist, 3317 &pszc, &downsize); 3318 ASSERT(!physcontig || pplist == NULL); 3319 if (!physcontig && downsize && 3320 type != F_SOFTLOCK) { 3321 ASSERT(pplist == NULL); 3322 SEGVN_VMSTAT_FLTVNPAGES(12); 3323 ierr = -1; 3324 break; 3325 } 3326 ASSERT(!physcontig || 3327 segtype == MAP_PRIVATE || 3328 ppa[0] == NULL); 3329 if (physcontig && ppa[0] == NULL) { 3330 physcontig = 0; 3331 } 3332 } 3333 } else if (!brkcow && szc && ppa[0] != NULL) { 3334 SEGVN_VMSTAT_FLTVNPAGES(13); 3335 ASSERT(segtype == MAP_PRIVATE); 3336 physcontig = 1; 3337 } 3338 3339 if (!physcontig) { 3340 SEGVN_VMSTAT_FLTVNPAGES(14); 3341 ppa[0] = NULL; 3342 ierr = VOP_GETPAGE(vp, (offset_t)off, pgsz, 3343 &vpprot, ppa, pgsz, seg, a, arw, 3344 svd->cred); 3345 if (segtype == MAP_PRIVATE) { 3346 SEGVN_VMSTAT_FLTVNPAGES(15); 3347 vpprot &= ~PROT_WRITE; 3348 } 3349 } else { 3350 ASSERT(segtype == MAP_PRIVATE); 3351 SEGVN_VMSTAT_FLTVNPAGES(16); 3352 vpprot = PROT_ALL & ~PROT_WRITE; 3353 ierr = 0; 3354 } 3355 3356 if (ierr != 0) { 3357 SEGVN_VMSTAT_FLTVNPAGES(17); 3358 if (pplist != NULL) { 3359 SEGVN_VMSTAT_FLTVNPAGES(18); 3360 page_free_replacement_page(pplist); 3361 page_create_putback(pages); 3362 } 3363 SEGVN_RESTORE_SOFTLOCK(type, pages); 3364 if (a + pgsz <= eaddr) { 3365 SEGVN_VMSTAT_FLTVNPAGES(19); 3366 err = FC_MAKE_ERR(ierr); 3367 goto out; 3368 } 3369 va.va_mask = AT_SIZE; 3370 if (VOP_GETATTR(vp, &va, 0, svd->cred) != 0) { 3371 SEGVN_VMSTAT_FLTVNPAGES(20); 3372 err = FC_MAKE_ERR(EIO); 3373 goto out; 3374 } 3375 if (btopr(va.va_size) >= btopr(off + pgsz)) { 3376 SEGVN_VMSTAT_FLTVNPAGES(21); 3377 err = FC_MAKE_ERR(EIO); 3378 goto out; 3379 } 3380 if (btopr(va.va_size) < 3381 btopr(off + (eaddr - a))) { 3382 SEGVN_VMSTAT_FLTVNPAGES(22); 3383 err = FC_MAKE_ERR(EIO); 3384 goto out; 3385 } 3386 if (brkcow || type == F_SOFTLOCK) { 3387 /* can't reduce map area */ 3388 SEGVN_VMSTAT_FLTVNPAGES(23); 3389 vop_size_err = 1; 3390 goto out; 3391 } 3392 SEGVN_VMSTAT_FLTVNPAGES(24); 3393 ASSERT(szc != 0); 3394 pszc = 0; 3395 ierr = -1; 3396 break; 3397 } 3398 3399 if (amp != NULL) { 3400 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 3401 anon_array_enter(amp, aindx, &an_cookie); 3402 } 3403 if (amp != NULL && 3404 anon_get_ptr(amp->ahp, aindx) != NULL) { 3405 ulong_t taindx = P2ALIGN(aindx, maxpages); 3406 3407 SEGVN_VMSTAT_FLTVNPAGES(25); 3408 if (anon_pages(amp->ahp, taindx, maxpages) != 3409 maxpages) { 3410 panic("segvn_fault_vnodepages:" 3411 " empty anon slots\n"); 3412 } 3413 for (i = 0; i < pages; i++) { 3414 page_unlock(ppa[i]); 3415 } 3416 anon_array_exit(&an_cookie); 3417 ANON_LOCK_EXIT(&->a_rwlock); 3418 if (pplist != NULL) { 3419 page_free_replacement_page(pplist); 3420 page_create_putback(pages); 3421 } 3422 SEGVN_RESTORE_SOFTLOCK(type, pages); 3423 if (szc < seg->s_szc) { 3424 SEGVN_VMSTAT_FLTVNPAGES(26); 3425 /* 3426 * For private segments SOFTLOCK 3427 * either always breaks cow (any rw 3428 * type except S_READ_NOCOW) or 3429 * address space is locked as writer 3430 * (S_READ_NOCOW case) and anon slots 3431 * can't show up on second check. 
3432 * Therefore if we are here for 3433 * SOFTLOCK case it must be a cow 3434 * break but cow break never reduces 3435 * szc. Thus the assert below. 3436 */ 3437 ASSERT(!brkcow && type != F_SOFTLOCK); 3438 pszc = seg->s_szc; 3439 ierr = -2; 3440 break; 3441 } 3442 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 3443 goto again; 3444 } 3445 #ifdef DEBUG 3446 if (amp != NULL) { 3447 ulong_t taindx = P2ALIGN(aindx, maxpages); 3448 ASSERT(!anon_pages(amp->ahp, taindx, maxpages)); 3449 } 3450 #endif /* DEBUG */ 3451 3452 if (brkcow) { 3453 ASSERT(amp != NULL); 3454 ASSERT(pplist == NULL); 3455 ASSERT(szc == seg->s_szc); 3456 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 3457 ASSERT(IS_P2ALIGNED(aindx, maxpages)); 3458 SEGVN_VMSTAT_FLTVNPAGES(27); 3459 ierr = anon_map_privatepages(amp, aindx, szc, 3460 seg, a, prot, ppa, vpage, segvn_anypgsz, 3461 svd->cred); 3462 if (ierr != 0) { 3463 SEGVN_VMSTAT_FLTVNPAGES(28); 3464 anon_array_exit(&an_cookie); 3465 ANON_LOCK_EXIT(&->a_rwlock); 3466 SEGVN_RESTORE_SOFTLOCK(type, pages); 3467 err = FC_MAKE_ERR(ierr); 3468 goto out; 3469 } 3470 3471 ASSERT(!IS_VMODSORT(ppa[0]->p_vnode)); 3472 /* 3473 * p_szc can't be changed for locked 3474 * swapfs pages. 3475 */ 3476 hat_memload_array(hat, a, pgsz, ppa, prot, 3477 hat_flag); 3478 3479 if (!(hat_flag & HAT_LOAD_LOCK)) { 3480 SEGVN_VMSTAT_FLTVNPAGES(29); 3481 for (i = 0; i < pages; i++) { 3482 page_unlock(ppa[i]); 3483 } 3484 } 3485 anon_array_exit(&an_cookie); 3486 ANON_LOCK_EXIT(&->a_rwlock); 3487 goto next; 3488 } 3489 3490 pfn = page_pptonum(ppa[0]); 3491 /* 3492 * hat_page_demote() needs an EXCl lock on one of 3493 * constituent page_t's and it decreases root's p_szc 3494 * last. This means if root's p_szc is equal szc and 3495 * all its constituent pages are locked 3496 * hat_page_demote() that could have changed p_szc to 3497 * szc is already done and no new have page_demote() 3498 * can start for this large page. 3499 */ 3500 3501 /* 3502 * we need to make sure same mapping size is used for 3503 * the same address range if there's a possibility the 3504 * adddress is already mapped because hat layer panics 3505 * when translation is loaded for the range already 3506 * mapped with a different page size. We achieve it 3507 * by always using largest page size possible subject 3508 * to the constraints of page size, segment page size 3509 * and page alignment. Since mappings are invalidated 3510 * when those constraints change and make it 3511 * impossible to use previously used mapping size no 3512 * mapping size conflicts should happen. 3513 */ 3514 3515 chkszc: 3516 if ((pszc = ppa[0]->p_szc) == szc && 3517 IS_P2ALIGNED(pfn, pages)) { 3518 3519 SEGVN_VMSTAT_FLTVNPAGES(30); 3520 #ifdef DEBUG 3521 for (i = 0; i < pages; i++) { 3522 ASSERT(PAGE_LOCKED(ppa[i])); 3523 ASSERT(!PP_ISFREE(ppa[i])); 3524 ASSERT(page_pptonum(ppa[i]) == 3525 pfn + i); 3526 ASSERT(ppa[i]->p_szc == szc); 3527 ASSERT(ppa[i]->p_vnode == vp); 3528 ASSERT(ppa[i]->p_offset == 3529 off + (i << PAGESHIFT)); 3530 } 3531 #endif /* DEBUG */ 3532 /* 3533 * All pages are of szc we need and they are 3534 * all locked so they can't change szc. load 3535 * translations. 3536 * 3537 * if page got promoted since last check 3538 * we don't need pplist. 
3539 */ 3540 if (pplist != NULL) { 3541 page_free_replacement_page(pplist); 3542 page_create_putback(pages); 3543 } 3544 if (PP_ISMIGRATE(ppa[0])) { 3545 page_migrate(seg, a, ppa, pages); 3546 } 3547 SEGVN_UPDATE_MODBITS(ppa, pages, rw, 3548 prot, vpprot); 3549 if (!xhat) { 3550 hat_memload_array(hat, a, pgsz, ppa, 3551 prot & vpprot, hat_flag); 3552 } else { 3553 /* 3554 * avoid large xhat mappings to FS 3555 * pages so that hat_page_demote() 3556 * doesn't need to check for xhat 3557 * large mappings. 3558 */ 3559 for (i = 0; i < pages; i++) { 3560 hat_memload(hat, 3561 a + (i << PAGESHIFT), 3562 ppa[i], prot & vpprot, 3563 hat_flag); 3564 } 3565 } 3566 3567 if (!(hat_flag & HAT_LOAD_LOCK)) { 3568 for (i = 0; i < pages; i++) { 3569 page_unlock(ppa[i]); 3570 } 3571 } 3572 if (amp != NULL) { 3573 anon_array_exit(&an_cookie); 3574 ANON_LOCK_EXIT(&->a_rwlock); 3575 } 3576 goto next; 3577 } 3578 3579 /* 3580 * See if upsize is possible. 3581 */ 3582 if (pszc > szc && szc < seg->s_szc && 3583 (segvn_anypgsz_vnode || pszc >= seg->s_szc)) { 3584 pgcnt_t aphase; 3585 uint_t pszc1 = MIN(pszc, seg->s_szc); 3586 ppgsz = page_get_pagesize(pszc1); 3587 ppages = btop(ppgsz); 3588 aphase = btop(P2PHASE((uintptr_t)a, ppgsz)); 3589 3590 ASSERT(type != F_SOFTLOCK); 3591 3592 SEGVN_VMSTAT_FLTVNPAGES(31); 3593 if (aphase != P2PHASE(pfn, ppages)) { 3594 segvn_faultvnmpss_align_err4++; 3595 } else { 3596 SEGVN_VMSTAT_FLTVNPAGES(32); 3597 if (pplist != NULL) { 3598 page_t *pl = pplist; 3599 page_free_replacement_page(pl); 3600 page_create_putback(pages); 3601 } 3602 for (i = 0; i < pages; i++) { 3603 page_unlock(ppa[i]); 3604 } 3605 if (amp != NULL) { 3606 anon_array_exit(&an_cookie); 3607 ANON_LOCK_EXIT(&->a_rwlock); 3608 } 3609 pszc = pszc1; 3610 ierr = -2; 3611 break; 3612 } 3613 } 3614 3615 /* 3616 * check if we should use smallest mapping size. 3617 */ 3618 upgrdfail = 0; 3619 if (szc == 0 || xhat || 3620 (pszc >= szc && 3621 !IS_P2ALIGNED(pfn, pages)) || 3622 (pszc < szc && 3623 !segvn_full_szcpages(ppa, szc, &upgrdfail, 3624 &pszc))) { 3625 3626 if (upgrdfail && type != F_SOFTLOCK) { 3627 /* 3628 * segvn_full_szcpages failed to lock 3629 * all pages EXCL. Size down. 3630 */ 3631 ASSERT(pszc < szc); 3632 3633 SEGVN_VMSTAT_FLTVNPAGES(33); 3634 3635 if (pplist != NULL) { 3636 page_t *pl = pplist; 3637 page_free_replacement_page(pl); 3638 page_create_putback(pages); 3639 } 3640 3641 for (i = 0; i < pages; i++) { 3642 page_unlock(ppa[i]); 3643 } 3644 if (amp != NULL) { 3645 anon_array_exit(&an_cookie); 3646 ANON_LOCK_EXIT(&->a_rwlock); 3647 } 3648 ierr = -1; 3649 break; 3650 } 3651 if (szc != 0 && !xhat) { 3652 segvn_faultvnmpss_align_err5++; 3653 } 3654 SEGVN_VMSTAT_FLTVNPAGES(34); 3655 if (pplist != NULL) { 3656 page_free_replacement_page(pplist); 3657 page_create_putback(pages); 3658 } 3659 SEGVN_UPDATE_MODBITS(ppa, pages, rw, 3660 prot, vpprot); 3661 for (i = 0; i < pages; i++) { 3662 hat_memload(hat, a + (i << PAGESHIFT), 3663 ppa[i], prot & vpprot, hat_flag); 3664 } 3665 if (!(hat_flag & HAT_LOAD_LOCK)) { 3666 for (i = 0; i < pages; i++) { 3667 page_unlock(ppa[i]); 3668 } 3669 } 3670 if (amp != NULL) { 3671 anon_array_exit(&an_cookie); 3672 ANON_LOCK_EXIT(&->a_rwlock); 3673 } 3674 goto next; 3675 } 3676 3677 if (pszc == szc) { 3678 /* 3679 * segvn_full_szcpages() upgraded pages szc. 
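 *
 * This is the physically contiguous case handled inside
 * segvn_full_szcpages(): every constituent page had its p_szc raised
 * in place, with no relocation, so retry the chkszc checks and load
 * the large translation from there.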
3680 */ 3681 ASSERT(pszc == ppa[0]->p_szc); 3682 ASSERT(IS_P2ALIGNED(pfn, pages)); 3683 goto chkszc; 3684 } 3685 3686 if (pszc > szc) { 3687 kmutex_t *szcmtx; 3688 SEGVN_VMSTAT_FLTVNPAGES(35); 3689 /* 3690 * p_szc of ppa[0] can change since we haven't 3691 * locked all constituent pages. Call 3692 * page_lock_szc() to prevent szc changes. 3693 * This should be a rare case that happens when 3694 * multiple segments use a different page size 3695 * to map the same file offsets. 3696 */ 3697 szcmtx = page_szc_lock(ppa[0]); 3698 pszc = ppa[0]->p_szc; 3699 ASSERT(szcmtx != NULL || pszc == 0); 3700 ASSERT(ppa[0]->p_szc <= pszc); 3701 if (pszc <= szc) { 3702 SEGVN_VMSTAT_FLTVNPAGES(36); 3703 if (szcmtx != NULL) { 3704 mutex_exit(szcmtx); 3705 } 3706 goto chkszc; 3707 } 3708 if (pplist != NULL) { 3709 /* 3710 * page got promoted since last check. 3711 * we don't need preaalocated large 3712 * page. 3713 */ 3714 SEGVN_VMSTAT_FLTVNPAGES(37); 3715 page_free_replacement_page(pplist); 3716 page_create_putback(pages); 3717 } 3718 SEGVN_UPDATE_MODBITS(ppa, pages, rw, 3719 prot, vpprot); 3720 hat_memload_array(hat, a, pgsz, ppa, 3721 prot & vpprot, hat_flag); 3722 mutex_exit(szcmtx); 3723 if (!(hat_flag & HAT_LOAD_LOCK)) { 3724 for (i = 0; i < pages; i++) { 3725 page_unlock(ppa[i]); 3726 } 3727 } 3728 if (amp != NULL) { 3729 anon_array_exit(&an_cookie); 3730 ANON_LOCK_EXIT(&->a_rwlock); 3731 } 3732 goto next; 3733 } 3734 3735 /* 3736 * if page got demoted since last check 3737 * we could have not allocated larger page. 3738 * allocate now. 3739 */ 3740 if (pplist == NULL && 3741 page_alloc_pages(seg, a, &pplist, NULL, szc, 0) && 3742 type != F_SOFTLOCK) { 3743 SEGVN_VMSTAT_FLTVNPAGES(38); 3744 for (i = 0; i < pages; i++) { 3745 page_unlock(ppa[i]); 3746 } 3747 if (amp != NULL) { 3748 anon_array_exit(&an_cookie); 3749 ANON_LOCK_EXIT(&->a_rwlock); 3750 } 3751 ierr = -1; 3752 alloc_failed |= (1 << szc); 3753 break; 3754 } 3755 3756 SEGVN_VMSTAT_FLTVNPAGES(39); 3757 3758 if (pplist != NULL) { 3759 segvn_relocate_pages(ppa, pplist); 3760 #ifdef DEBUG 3761 } else { 3762 ASSERT(type == F_SOFTLOCK); 3763 SEGVN_VMSTAT_FLTVNPAGES(40); 3764 #endif /* DEBUG */ 3765 } 3766 3767 SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot); 3768 3769 if (pplist == NULL && segvn_anypgsz_vnode == 0) { 3770 ASSERT(type == F_SOFTLOCK); 3771 for (i = 0; i < pages; i++) { 3772 ASSERT(ppa[i]->p_szc < szc); 3773 hat_memload(hat, a + (i << PAGESHIFT), 3774 ppa[i], prot & vpprot, hat_flag); 3775 } 3776 } else { 3777 ASSERT(pplist != NULL || type == F_SOFTLOCK); 3778 hat_memload_array(hat, a, pgsz, ppa, 3779 prot & vpprot, hat_flag); 3780 } 3781 if (!(hat_flag & HAT_LOAD_LOCK)) { 3782 for (i = 0; i < pages; i++) { 3783 ASSERT(PAGE_SHARED(ppa[i])); 3784 page_unlock(ppa[i]); 3785 } 3786 } 3787 if (amp != NULL) { 3788 anon_array_exit(&an_cookie); 3789 ANON_LOCK_EXIT(&->a_rwlock); 3790 } 3791 3792 next: 3793 if (vpage != NULL) { 3794 vpage += pages; 3795 } 3796 adjszc_chk = 1; 3797 } 3798 if (a == lpgeaddr) 3799 break; 3800 ASSERT(a < lpgeaddr); 3801 3802 ASSERT(!brkcow && type != F_SOFTLOCK); 3803 3804 /* 3805 * ierr == -1 means we failed to map with a large page. 3806 * (either due to allocation/relocation failures or 3807 * misalignment with other mappings to this file. 3808 * 3809 * ierr == -2 means some other thread allocated a large page 3810 * after we gave up tp map with a large page. retry with 3811 * larger mapping. 
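 *
 * For illustration (hypothetical values): with seg->s_szc == 2 and
 * segvn_anypgsz_vnode == 0, a -1 failure at szc == 2 drops straight
 * to szc 0, while a -2 retry adopts the pszc another thread created,
 * which the assertions below require to be larger than the current
 * szc and no larger than seg->s_szc.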
3812 */ 3813 ASSERT(ierr == -1 || ierr == -2); 3814 ASSERT(ierr == -2 || szc != 0); 3815 ASSERT(ierr == -1 || szc < seg->s_szc); 3816 if (ierr == -2) { 3817 SEGVN_VMSTAT_FLTVNPAGES(41); 3818 ASSERT(pszc > szc && pszc <= seg->s_szc); 3819 szc = pszc; 3820 } else if (segvn_anypgsz_vnode) { 3821 SEGVN_VMSTAT_FLTVNPAGES(42); 3822 szc--; 3823 } else { 3824 SEGVN_VMSTAT_FLTVNPAGES(43); 3825 ASSERT(pszc < szc); 3826 /* 3827 * other process created pszc large page. 3828 * but we still have to drop to 0 szc. 3829 */ 3830 szc = 0; 3831 } 3832 3833 pgsz = page_get_pagesize(szc); 3834 pages = btop(pgsz); 3835 if (ierr == -2) { 3836 /* 3837 * Size up case. Note lpgaddr may only be needed for 3838 * softlock case so we don't adjust it here. 3839 */ 3840 a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz); 3841 ASSERT(a >= lpgaddr); 3842 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 3843 off = svd->offset + (uintptr_t)(a - seg->s_base); 3844 aindx = svd->anon_index + seg_page(seg, a); 3845 vpage = (svd->vpage != NULL) ? 3846 &svd->vpage[seg_page(seg, a)] : NULL; 3847 } else { 3848 /* 3849 * Size down case. Note lpgaddr may only be needed for 3850 * softlock case so we don't adjust it here. 3851 */ 3852 ASSERT(IS_P2ALIGNED(a, pgsz)); 3853 ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz)); 3854 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 3855 ASSERT(a < lpgeaddr); 3856 if (a < addr) { 3857 SEGVN_VMSTAT_FLTVNPAGES(44); 3858 /* 3859 * The beginning of the large page region can 3860 * be pulled to the right to make a smaller 3861 * region. We haven't yet faulted a single 3862 * page. 3863 */ 3864 a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 3865 ASSERT(a >= lpgaddr); 3866 off = svd->offset + 3867 (uintptr_t)(a - seg->s_base); 3868 aindx = svd->anon_index + seg_page(seg, a); 3869 vpage = (svd->vpage != NULL) ? 3870 &svd->vpage[seg_page(seg, a)] : NULL; 3871 } 3872 } 3873 } 3874 out: 3875 kmem_free(ppa, ppasize); 3876 if (!err && !vop_size_err) { 3877 SEGVN_VMSTAT_FLTVNPAGES(45); 3878 return (0); 3879 } 3880 if (type == F_SOFTLOCK && a > lpgaddr) { 3881 SEGVN_VMSTAT_FLTVNPAGES(46); 3882 segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER); 3883 } 3884 if (!vop_size_err) { 3885 SEGVN_VMSTAT_FLTVNPAGES(47); 3886 return (err); 3887 } 3888 ASSERT(brkcow || type == F_SOFTLOCK); 3889 /* 3890 * Large page end is mapped beyond the end of file and it's a cow 3891 * fault or softlock so we can't reduce the map area. For now just 3892 * demote the segment. This should really only happen if the end of 3893 * the file changed after the mapping was established since when large 3894 * page segments are created we make sure they don't extend beyond the 3895 * end of the file. 3896 */ 3897 SEGVN_VMSTAT_FLTVNPAGES(48); 3898 3899 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 3900 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 3901 err = 0; 3902 if (seg->s_szc != 0) { 3903 segvn_fltvnpages_clrszc_cnt++; 3904 ASSERT(svd->softlockcnt == 0); 3905 err = segvn_clrszc(seg); 3906 if (err != 0) { 3907 segvn_fltvnpages_clrszc_err++; 3908 } 3909 } 3910 ASSERT(err || seg->s_szc == 0); 3911 SEGVN_LOCK_DOWNGRADE(seg->s_as, &svd->lock); 3912 /* segvn_fault will do its job as if szc had been zero to begin with */ 3913 return (err == 0 ? IE_RETRY : FC_MAKE_ERR(err)); 3914 } 3915 3916 /* 3917 * This routine will attempt to fault in one large page. 3918 * it will use smaller pages if that fails. 3919 * It should only be called for pure anonymous segments. 
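 *
 * At its core each iteration of the loop below does the following
 * (simplified sketch; the SOFTLOCK accounting, protection checks and
 * the size up/down retry logic are omitted):
 *
 *	anon_array_enter(amp, aindx, &cookie);
 *	ierr = anon_map_getpages(amp, aindx, szc, seg, a, prot,
 *	    &vpprot, ppa, &ppa_szc, vpage, rw, brkcow,
 *	    segvn_anypgsz, svd->cred);
 *	hat_memload_array(hat, a, pgsz, ppa, prot & vpprot, hat_flag);
 *	anon_array_exit(&cookie);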
3920 */ 3921 static faultcode_t 3922 segvn_fault_anonpages(struct hat *hat, struct seg *seg, caddr_t lpgaddr, 3923 caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr, 3924 caddr_t eaddr, int brkcow) 3925 { 3926 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 3927 struct anon_map *amp = svd->amp; 3928 uchar_t segtype = svd->type; 3929 uint_t szc = seg->s_szc; 3930 size_t pgsz = page_get_pagesize(szc); 3931 size_t maxpgsz = pgsz; 3932 pgcnt_t pages = btop(pgsz); 3933 size_t ppasize = pages * sizeof (page_t *); 3934 caddr_t a = lpgaddr; 3935 ulong_t aindx = svd->anon_index + seg_page(seg, a); 3936 struct vpage *vpage = (svd->vpage != NULL) ? 3937 &svd->vpage[seg_page(seg, a)] : NULL; 3938 page_t **ppa; 3939 uint_t ppa_szc; 3940 faultcode_t err; 3941 int ierr; 3942 uint_t protchk, prot, vpprot; 3943 int i; 3944 int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD; 3945 anon_sync_obj_t cookie; 3946 3947 ASSERT(szc != 0); 3948 ASSERT(amp != NULL); 3949 ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */ 3950 ASSERT(!(svd->flags & MAP_NORESERVE)); 3951 ASSERT(type != F_SOFTUNLOCK); 3952 ASSERT(segtype == MAP_PRIVATE); 3953 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 3954 3955 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 3956 3957 VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltanpages[0]); 3958 VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltanpages[1]); 3959 3960 if (svd->flags & MAP_TEXT) { 3961 hat_flag |= HAT_LOAD_TEXT; 3962 } 3963 3964 if (svd->pageprot) { 3965 switch (rw) { 3966 case S_READ: 3967 protchk = PROT_READ; 3968 break; 3969 case S_WRITE: 3970 protchk = PROT_WRITE; 3971 break; 3972 case S_EXEC: 3973 protchk = PROT_EXEC; 3974 break; 3975 case S_OTHER: 3976 default: 3977 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 3978 break; 3979 } 3980 VM_STAT_ADD(segvnvmstats.fltanpages[2]); 3981 } else { 3982 prot = svd->prot; 3983 /* caller has already done segment level protection check. 
*/ 3984 } 3985 3986 ppa = kmem_alloc(ppasize, KM_SLEEP); 3987 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 3988 for (;;) { 3989 for (; a < lpgeaddr; a += pgsz, aindx += pages) { 3990 if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) { 3991 VM_STAT_ADD(segvnvmstats.fltanpages[3]); 3992 ASSERT(vpage != NULL); 3993 prot = VPP_PROT(vpage); 3994 ASSERT(sameprot(seg, a, maxpgsz)); 3995 if ((prot & protchk) == 0) { 3996 err = FC_PROT; 3997 goto error; 3998 } 3999 } 4000 if (type == F_SOFTLOCK) { 4001 mutex_enter(&freemem_lock); 4002 if (availrmem < tune.t_minarmem + pages) { 4003 mutex_exit(&freemem_lock); 4004 err = FC_MAKE_ERR(ENOMEM); 4005 goto error; 4006 } else { 4007 availrmem -= pages; 4008 segvn_pages_locked += pages; 4009 svd->softlockcnt += pages; 4010 } 4011 mutex_exit(&freemem_lock); 4012 } 4013 anon_array_enter(amp, aindx, &cookie); 4014 ppa_szc = (uint_t)-1; 4015 ierr = anon_map_getpages(amp, aindx, szc, seg, a, 4016 prot, &vpprot, ppa, &ppa_szc, vpage, rw, brkcow, 4017 segvn_anypgsz, svd->cred); 4018 if (ierr != 0) { 4019 anon_array_exit(&cookie); 4020 VM_STAT_ADD(segvnvmstats.fltanpages[4]); 4021 if (type == F_SOFTLOCK) { 4022 VM_STAT_ADD(segvnvmstats.fltanpages[5]); 4023 mutex_enter(&freemem_lock); 4024 availrmem += pages; 4025 segvn_pages_locked -= pages; 4026 svd->softlockcnt -= pages; 4027 mutex_exit(&freemem_lock); 4028 } 4029 if (ierr > 0) { 4030 VM_STAT_ADD(segvnvmstats.fltanpages[6]); 4031 err = FC_MAKE_ERR(ierr); 4032 goto error; 4033 } 4034 break; 4035 } 4036 4037 ASSERT(!IS_VMODSORT(ppa[0]->p_vnode)); 4038 4039 /* 4040 * Handle pages that have been marked for migration 4041 */ 4042 if (lgrp_optimizations()) 4043 page_migrate(seg, a, ppa, pages); 4044 4045 hat_memload_array(hat, a, pgsz, ppa, 4046 prot & vpprot, hat_flag); 4047 4048 if (hat_flag & HAT_LOAD_LOCK) { 4049 VM_STAT_ADD(segvnvmstats.fltanpages[7]); 4050 } else { 4051 VM_STAT_ADD(segvnvmstats.fltanpages[8]); 4052 for (i = 0; i < pages; i++) 4053 page_unlock(ppa[i]); 4054 } 4055 if (vpage != NULL) 4056 vpage += pages; 4057 4058 anon_array_exit(&cookie); 4059 } 4060 if (a == lpgeaddr) 4061 break; 4062 ASSERT(a < lpgeaddr); 4063 /* 4064 * ierr == -1 means we failed to allocate a large page. 4065 * so do a size down operation. 4066 * 4067 * ierr == -2 means some other process that privately shares 4068 * pages with this process has allocated a larger page and we 4069 * need to retry with larger pages. So do a size up 4070 * operation. This relies on the fact that large pages are 4071 * never partially shared i.e. if we share any constituent 4072 * page of a large page with another process we must share the 4073 * entire large page. Note this cannot happen for SOFTLOCK 4074 * case, unless current address (a) is at the beginning of the 4075 * next page size boundary because the other process couldn't 4076 * have relocated locked pages. 4077 */ 4078 ASSERT(ierr == -1 || ierr == -2); 4079 if (segvn_anypgsz) { 4080 ASSERT(ierr == -2 || szc != 0); 4081 ASSERT(ierr == -1 || szc < seg->s_szc); 4082 szc = (ierr == -1) ? szc - 1 : szc + 1; 4083 } else { 4084 /* 4085 * For non COW faults and segvn_anypgsz == 0 4086 * we need to be careful not to loop forever 4087 * if existing page is found with szc other 4088 * than 0 or seg->s_szc. This could be due 4089 * to page relocations on behalf of DR or 4090 * more likely large page creation. For this 4091 * case simply re-size to existing page's szc 4092 * if returned by anon_map_getpages(). 4093 */ 4094 if (ppa_szc == (uint_t)-1) { 4095 szc = (ierr == -1) ? 
0 : seg->s_szc; 4096 } else { 4097 ASSERT(ppa_szc <= seg->s_szc); 4098 ASSERT(ierr == -2 || ppa_szc < szc); 4099 ASSERT(ierr == -1 || ppa_szc > szc); 4100 szc = ppa_szc; 4101 } 4102 } 4103 4104 pgsz = page_get_pagesize(szc); 4105 pages = btop(pgsz); 4106 ASSERT(type != F_SOFTLOCK || ierr == -1 || 4107 (IS_P2ALIGNED(a, pgsz) && IS_P2ALIGNED(lpgeaddr, pgsz))); 4108 if (type == F_SOFTLOCK) { 4109 /* 4110 * For softlocks we cannot reduce the fault area 4111 * (calculated based on the largest page size for this 4112 * segment) for size down and a is already next 4113 * page size aligned as assertted above for size 4114 * ups. Therefore just continue in case of softlock. 4115 */ 4116 VM_STAT_ADD(segvnvmstats.fltanpages[9]); 4117 continue; /* keep lint happy */ 4118 } else if (ierr == -2) { 4119 4120 /* 4121 * Size up case. Note lpgaddr may only be needed for 4122 * softlock case so we don't adjust it here. 4123 */ 4124 VM_STAT_ADD(segvnvmstats.fltanpages[10]); 4125 a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz); 4126 ASSERT(a >= lpgaddr); 4127 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4128 aindx = svd->anon_index + seg_page(seg, a); 4129 vpage = (svd->vpage != NULL) ? 4130 &svd->vpage[seg_page(seg, a)] : NULL; 4131 } else { 4132 /* 4133 * Size down case. Note lpgaddr may only be needed for 4134 * softlock case so we don't adjust it here. 4135 */ 4136 VM_STAT_ADD(segvnvmstats.fltanpages[11]); 4137 ASSERT(IS_P2ALIGNED(a, pgsz)); 4138 ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz)); 4139 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4140 ASSERT(a < lpgeaddr); 4141 if (a < addr) { 4142 /* 4143 * The beginning of the large page region can 4144 * be pulled to the right to make a smaller 4145 * region. We haven't yet faulted a single 4146 * page. 4147 */ 4148 VM_STAT_ADD(segvnvmstats.fltanpages[12]); 4149 a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 4150 ASSERT(a >= lpgaddr); 4151 aindx = svd->anon_index + seg_page(seg, a); 4152 vpage = (svd->vpage != NULL) ? 4153 &svd->vpage[seg_page(seg, a)] : NULL; 4154 } 4155 } 4156 } 4157 VM_STAT_ADD(segvnvmstats.fltanpages[13]); 4158 ANON_LOCK_EXIT(&->a_rwlock); 4159 kmem_free(ppa, ppasize); 4160 return (0); 4161 error: 4162 VM_STAT_ADD(segvnvmstats.fltanpages[14]); 4163 ANON_LOCK_EXIT(&->a_rwlock); 4164 kmem_free(ppa, ppasize); 4165 if (type == F_SOFTLOCK && a > lpgaddr) { 4166 VM_STAT_ADD(segvnvmstats.fltanpages[15]); 4167 segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER); 4168 } 4169 return (err); 4170 } 4171 4172 int fltadvice = 1; /* set to free behind pages for sequential access */ 4173 4174 /* 4175 * This routine is called via a machine specific fault handling routine. 4176 * It is also called by software routines wishing to lock or unlock 4177 * a range of addresses. 
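 * (Locking corresponds to F_SOFTLOCK faults; unlocking to F_SOFTUNLOCK,
 * which is the first case handled below via segvn_softunlock().)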
4178 * 4179 * Here is the basic algorithm: 4180 * If unlocking 4181 * Call segvn_softunlock 4182 * Return 4183 * endif 4184 * Checking and set up work 4185 * If we will need some non-anonymous pages 4186 * Call VOP_GETPAGE over the range of non-anonymous pages 4187 * endif 4188 * Loop over all addresses requested 4189 * Call segvn_faultpage passing in page list 4190 * to load up translations and handle anonymous pages 4191 * endloop 4192 * Load up translation to any additional pages in page list not 4193 * already handled that fit into this segment 4194 */ 4195 static faultcode_t 4196 segvn_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len, 4197 enum fault_type type, enum seg_rw rw) 4198 { 4199 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 4200 page_t **plp, **ppp, *pp; 4201 u_offset_t off; 4202 caddr_t a; 4203 struct vpage *vpage; 4204 uint_t vpprot, prot; 4205 int err; 4206 page_t *pl[PVN_GETPAGE_NUM + 1]; 4207 size_t plsz, pl_alloc_sz; 4208 size_t page; 4209 ulong_t anon_index; 4210 struct anon_map *amp; 4211 int dogetpage = 0; 4212 caddr_t lpgaddr, lpgeaddr; 4213 size_t pgsz; 4214 anon_sync_obj_t cookie; 4215 int brkcow = BREAK_COW_SHARE(rw, type, svd->type); 4216 4217 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 4218 4219 /* 4220 * First handle the easy stuff 4221 */ 4222 if (type == F_SOFTUNLOCK) { 4223 if (rw == S_READ_NOCOW) { 4224 rw = S_READ; 4225 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 4226 } 4227 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 4228 pgsz = (seg->s_szc == 0) ? PAGESIZE : 4229 page_get_pagesize(seg->s_szc); 4230 VM_STAT_COND_ADD(pgsz > PAGESIZE, segvnvmstats.fltanpages[16]); 4231 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 4232 segvn_softunlock(seg, lpgaddr, lpgeaddr - lpgaddr, rw); 4233 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4234 return (0); 4235 } 4236 4237 top: 4238 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 4239 4240 /* 4241 * If we have the same protections for the entire segment, 4242 * insure that the access being attempted is legitimate. 4243 */ 4244 4245 if (svd->pageprot == 0) { 4246 uint_t protchk; 4247 4248 switch (rw) { 4249 case S_READ: 4250 case S_READ_NOCOW: 4251 protchk = PROT_READ; 4252 break; 4253 case S_WRITE: 4254 protchk = PROT_WRITE; 4255 break; 4256 case S_EXEC: 4257 protchk = PROT_EXEC; 4258 break; 4259 case S_OTHER: 4260 default: 4261 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 4262 break; 4263 } 4264 4265 if ((svd->prot & protchk) == 0) { 4266 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4267 return (FC_PROT); /* illegal access type */ 4268 } 4269 } 4270 4271 /* 4272 * We can't allow the long term use of softlocks for vmpss segments, 4273 * because in some file truncation cases we should be able to demote 4274 * the segment, which requires that there are no softlocks. The 4275 * only case where it's ok to allow a SOFTLOCK fault against a vmpss 4276 * segment is S_READ_NOCOW, where the caller holds the address space 4277 * locked as writer and calls softunlock before dropping the as lock. 4278 * S_READ_NOCOW is used by /proc to read memory from another user. 4279 * 4280 * Another deadlock between SOFTLOCK and file truncation can happen 4281 * because segvn_fault_vnodepages() calls the FS one pagesize at 4282 * a time. A second VOP_GETPAGE() call by segvn_fault_vnodepages() 4283 * can cause a deadlock because the first set of page_t's remain 4284 * locked SE_SHARED. 
To avoid this, we demote segments on a first 4285 * SOFTLOCK if they have a length greater than the segment's 4286 * page size. 4287 * 4288 * So for now, we only avoid demoting a segment on a SOFTLOCK when 4289 * the access type is S_READ_NOCOW and the fault length is less than 4290 * or equal to the segment's page size. While this is quite restrictive, 4291 * it should be the most common case of SOFTLOCK against a vmpss 4292 * segment. 4293 * 4294 * For S_READ_NOCOW, it's safe not to do a copy on write because the 4295 * caller makes sure no COW will be caused by another thread for a 4296 * softlocked page. 4297 */ 4298 if (type == F_SOFTLOCK && svd->vp != NULL && seg->s_szc != 0) { 4299 int demote = 0; 4300 4301 if (rw != S_READ_NOCOW) { 4302 demote = 1; 4303 } 4304 if (!demote && len > PAGESIZE) { 4305 pgsz = page_get_pagesize(seg->s_szc); 4306 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, 4307 lpgeaddr); 4308 if (lpgeaddr - lpgaddr > pgsz) { 4309 demote = 1; 4310 } 4311 } 4312 4313 ASSERT(demote || AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 4314 4315 if (demote) { 4316 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4317 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4318 if (seg->s_szc != 0) { 4319 segvn_vmpss_clrszc_cnt++; 4320 ASSERT(svd->softlockcnt == 0); 4321 err = segvn_clrszc(seg); 4322 if (err) { 4323 segvn_vmpss_clrszc_err++; 4324 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4325 return (FC_MAKE_ERR(err)); 4326 } 4327 } 4328 ASSERT(seg->s_szc == 0); 4329 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4330 goto top; 4331 } 4332 } 4333 4334 /* 4335 * S_READ_NOCOW vs S_READ distinction was 4336 * only needed for the code above. After 4337 * that we treat it as S_READ. 4338 */ 4339 if (rw == S_READ_NOCOW) { 4340 ASSERT(type == F_SOFTLOCK); 4341 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 4342 rw = S_READ; 4343 } 4344 4345 /* 4346 * Check to see if we need to allocate an anon_map structure. 4347 */ 4348 if (svd->amp == NULL && (svd->vp == NULL || brkcow)) { 4349 /* 4350 * Drop the "read" lock on the segment and acquire 4351 * the "write" version since we have to allocate the 4352 * anon_map. 4353 */ 4354 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4355 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4356 4357 if (svd->amp == NULL) { 4358 svd->amp = anonmap_alloc(seg->s_size, 0); 4359 svd->amp->a_szc = seg->s_szc; 4360 } 4361 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4362 4363 /* 4364 * Start all over again since segment protections 4365 * may have changed after we dropped the "read" lock. 4366 */ 4367 goto top; 4368 } 4369 4370 amp = svd->amp; 4371 4372 /* 4373 * MADV_SEQUENTIAL work is ignored for large page segments. 4374 */ 4375 if (seg->s_szc != 0) { 4376 pgsz = page_get_pagesize(seg->s_szc); 4377 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 4378 /* 4379 * We may need to do relocations so purge seg_pcache to allow 4380 * pages to be locked exclusively. 
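 * (A nonzero softlockcnt indicates cached pagelock entries for this
 * segment; segvn_purge() below flushes them from seg_pcache.)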
4381		 */
4382		if (svd->softlockcnt != 0)
4383			segvn_purge(seg);
4384		CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr);
4385		if (svd->vp == NULL) {
4386			ASSERT(svd->type == MAP_PRIVATE);
4387			err = segvn_fault_anonpages(hat, seg, lpgaddr,
4388			    lpgeaddr, type, rw, addr, addr + len, brkcow);
4389		} else {
4390			err = segvn_fault_vnodepages(hat, seg, lpgaddr,
4391			    lpgeaddr, type, rw, addr, addr + len, brkcow);
4392			if (err == IE_RETRY) {
4393				ASSERT(seg->s_szc == 0);
4394				ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock));
4395				SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
4396				goto top;
4397			}
4398		}
4399		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
4400		return (err);
4401	}
4402
4403	page = seg_page(seg, addr);
4404	if (amp != NULL) {
4405		anon_index = svd->anon_index + page;
4406
4407		if ((type == F_PROT) && (rw == S_READ) &&
4408		    svd->type == MAP_PRIVATE && svd->pageprot == 0) {
4409			size_t index = anon_index;
4410			struct anon *ap;
4411
4412			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
4413			/*
4414			 * The fast path could apply to S_WRITE also, except
4415			 * that the protection fault could be caused by lazy
4416			 * tlb flush when ro->rw. In this case, the pte is
4417			 * RW already. But RO in the other cpu's tlb causes
4418			 * the fault. Since hat_chgprot won't do anything if
4419			 * pte doesn't change, we may end up faulting
4420			 * indefinitely until the RO tlb entry gets replaced.
4421			 */
4422			for (a = addr; a < addr + len; a += PAGESIZE, index++) {
4423				anon_array_enter(amp, index, &cookie);
4424				ap = anon_get_ptr(amp->ahp, index);
4425				anon_array_exit(&cookie);
4426				if ((ap == NULL) || (ap->an_refcnt != 1)) {
4427					ANON_LOCK_EXIT(&amp->a_rwlock);
4428					goto slow;
4429				}
4430			}
4431			hat_chgprot(seg->s_as->a_hat, addr, len, svd->prot);
4432			ANON_LOCK_EXIT(&amp->a_rwlock);
4433			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
4434			return (0);
4435		}
4436	}
4437 slow:
4438
4439	if (svd->vpage == NULL)
4440		vpage = NULL;
4441	else
4442		vpage = &svd->vpage[page];
4443
4444	off = svd->offset + (uintptr_t)(addr - seg->s_base);
4445
4446	/*
4447	 * If MADV_SEQUENTIAL has been set for the particular page we
4448	 * are faulting on, free behind all pages in the segment and put
4449	 * them on the free list.
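	 * The loop below walks backwards from the faulting page, skipping
	 * locked pages, and pushes each preceding page out with
	 * VOP_PUTPAGE(B_DONTNEED | B_FREE | B_ASYNC).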
4450 */ 4451 if ((page != 0) && fltadvice) { /* not if first page in segment */ 4452 struct vpage *vpp; 4453 ulong_t fanon_index; 4454 size_t fpage; 4455 u_offset_t pgoff, fpgoff; 4456 struct vnode *fvp; 4457 struct anon *fap = NULL; 4458 4459 if (svd->advice == MADV_SEQUENTIAL || 4460 (svd->pageadvice && 4461 VPP_ADVICE(vpage) == MADV_SEQUENTIAL)) { 4462 pgoff = off - PAGESIZE; 4463 fpage = page - 1; 4464 if (vpage != NULL) 4465 vpp = &svd->vpage[fpage]; 4466 if (amp != NULL) 4467 fanon_index = svd->anon_index + fpage; 4468 4469 while (pgoff > svd->offset) { 4470 if (svd->advice != MADV_SEQUENTIAL && 4471 (!svd->pageadvice || (vpage && 4472 VPP_ADVICE(vpp) != MADV_SEQUENTIAL))) 4473 break; 4474 4475 /* 4476 * If this is an anon page, we must find the 4477 * correct <vp, offset> for it 4478 */ 4479 fap = NULL; 4480 if (amp != NULL) { 4481 ANON_LOCK_ENTER(&->a_rwlock, 4482 RW_READER); 4483 anon_array_enter(amp, fanon_index, 4484 &cookie); 4485 fap = anon_get_ptr(amp->ahp, 4486 fanon_index); 4487 if (fap != NULL) { 4488 swap_xlate(fap, &fvp, &fpgoff); 4489 } else { 4490 fpgoff = pgoff; 4491 fvp = svd->vp; 4492 } 4493 anon_array_exit(&cookie); 4494 ANON_LOCK_EXIT(&->a_rwlock); 4495 } else { 4496 fpgoff = pgoff; 4497 fvp = svd->vp; 4498 } 4499 if (fvp == NULL) 4500 break; /* XXX */ 4501 /* 4502 * Skip pages that are free or have an 4503 * "exclusive" lock. 4504 */ 4505 pp = page_lookup_nowait(fvp, fpgoff, SE_SHARED); 4506 if (pp == NULL) 4507 break; 4508 /* 4509 * We don't need the page_struct_lock to test 4510 * as this is only advisory; even if we 4511 * acquire it someone might race in and lock 4512 * the page after we unlock and before the 4513 * PUTPAGE, then VOP_PUTPAGE will do nothing. 4514 */ 4515 if (pp->p_lckcnt == 0 && pp->p_cowcnt == 0) { 4516 /* 4517 * Hold the vnode before releasing 4518 * the page lock to prevent it from 4519 * being freed and re-used by some 4520 * other thread. 4521 */ 4522 VN_HOLD(fvp); 4523 page_unlock(pp); 4524 /* 4525 * We should build a page list 4526 * to kluster putpages XXX 4527 */ 4528 (void) VOP_PUTPAGE(fvp, 4529 (offset_t)fpgoff, PAGESIZE, 4530 (B_DONTNEED|B_FREE|B_ASYNC), 4531 svd->cred); 4532 VN_RELE(fvp); 4533 } else { 4534 /* 4535 * XXX - Should the loop terminate if 4536 * the page is `locked'? 4537 */ 4538 page_unlock(pp); 4539 } 4540 --vpp; 4541 --fanon_index; 4542 pgoff -= PAGESIZE; 4543 } 4544 } 4545 } 4546 4547 plp = pl; 4548 *plp = NULL; 4549 pl_alloc_sz = 0; 4550 4551 /* 4552 * See if we need to call VOP_GETPAGE for 4553 * *any* of the range being faulted on. 4554 * We can skip all of this work if there 4555 * was no original vnode. 4556 */ 4557 if (svd->vp != NULL) { 4558 u_offset_t vp_off; 4559 size_t vp_len; 4560 struct anon *ap; 4561 vnode_t *vp; 4562 4563 vp_off = off; 4564 vp_len = len; 4565 4566 if (amp == NULL) 4567 dogetpage = 1; 4568 else { 4569 /* 4570 * Only acquire reader lock to prevent amp->ahp 4571 * from being changed. 
It's ok to miss pages, 4572 * hence we don't do anon_array_enter 4573 */ 4574 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4575 ap = anon_get_ptr(amp->ahp, anon_index); 4576 4577 if (len <= PAGESIZE) 4578 /* inline non_anon() */ 4579 dogetpage = (ap == NULL); 4580 else 4581 dogetpage = non_anon(amp->ahp, anon_index, 4582 &vp_off, &vp_len); 4583 ANON_LOCK_EXIT(&->a_rwlock); 4584 } 4585 4586 if (dogetpage) { 4587 enum seg_rw arw; 4588 struct as *as = seg->s_as; 4589 4590 if (len > ptob((sizeof (pl) / sizeof (pl[0])) - 1)) { 4591 /* 4592 * Page list won't fit in local array, 4593 * allocate one of the needed size. 4594 */ 4595 pl_alloc_sz = 4596 (btop(len) + 1) * sizeof (page_t *); 4597 plp = kmem_alloc(pl_alloc_sz, KM_SLEEP); 4598 plp[0] = NULL; 4599 plsz = len; 4600 } else if (rw == S_WRITE && svd->type == MAP_PRIVATE || 4601 rw == S_OTHER || 4602 (((size_t)(addr + PAGESIZE) < 4603 (size_t)(seg->s_base + seg->s_size)) && 4604 hat_probe(as->a_hat, addr + PAGESIZE))) { 4605 /* 4606 * Ask VOP_GETPAGE to return the exact number 4607 * of pages if 4608 * (a) this is a COW fault, or 4609 * (b) this is a software fault, or 4610 * (c) next page is already mapped. 4611 */ 4612 plsz = len; 4613 } else { 4614 /* 4615 * Ask VOP_GETPAGE to return adjacent pages 4616 * within the segment. 4617 */ 4618 plsz = MIN((size_t)PVN_GETPAGE_SZ, (size_t) 4619 ((seg->s_base + seg->s_size) - addr)); 4620 ASSERT((addr + plsz) <= 4621 (seg->s_base + seg->s_size)); 4622 } 4623 4624 /* 4625 * Need to get some non-anonymous pages. 4626 * We need to make only one call to GETPAGE to do 4627 * this to prevent certain deadlocking conditions 4628 * when we are doing locking. In this case 4629 * non_anon() should have picked up the smallest 4630 * range which includes all the non-anonymous 4631 * pages in the requested range. We have to 4632 * be careful regarding which rw flag to pass in 4633 * because on a private mapping, the underlying 4634 * object is never allowed to be written. 4635 */ 4636 if (rw == S_WRITE && svd->type == MAP_PRIVATE) { 4637 arw = S_READ; 4638 } else { 4639 arw = rw; 4640 } 4641 vp = svd->vp; 4642 TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE, 4643 "segvn_getpage:seg %p addr %p vp %p", 4644 seg, addr, vp); 4645 err = VOP_GETPAGE(vp, (offset_t)vp_off, vp_len, 4646 &vpprot, plp, plsz, seg, addr + (vp_off - off), arw, 4647 svd->cred); 4648 if (err) { 4649 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4650 segvn_pagelist_rele(plp); 4651 if (pl_alloc_sz) 4652 kmem_free(plp, pl_alloc_sz); 4653 return (FC_MAKE_ERR(err)); 4654 } 4655 if (svd->type == MAP_PRIVATE) 4656 vpprot &= ~PROT_WRITE; 4657 } 4658 } 4659 4660 /* 4661 * N.B. at this time the plp array has all the needed non-anon 4662 * pages in addition to (possibly) having some adjacent pages. 4663 */ 4664 4665 /* 4666 * Always acquire the anon_array_lock to prevent 4667 * 2 threads from allocating separate anon slots for 4668 * the same "addr". 4669 * 4670 * If this is a copy-on-write fault and we don't already 4671 * have the anon_array_lock, acquire it to prevent the 4672 * fault routine from handling multiple copy-on-write faults 4673 * on the same "addr" in the same address space. 4674 * 4675 * Only one thread should deal with the fault since after 4676 * it is handled, the other threads can acquire a translation 4677 * to the newly created private page. This prevents two or 4678 * more threads from creating different private pages for the 4679 * same fault. 
4680 * 4681 * We grab "serialization" lock here if this is a MAP_PRIVATE segment 4682 * to prevent deadlock between this thread and another thread 4683 * which has soft-locked this page and wants to acquire serial_lock. 4684 * ( bug 4026339 ) 4685 * 4686 * The fix for bug 4026339 becomes unnecessary when using the 4687 * locking scheme with per amp rwlock and a global set of hash 4688 * lock, anon_array_lock. If we steal a vnode page when low 4689 * on memory and upgrad the page lock through page_rename, 4690 * then the page is PAGE_HANDLED, nothing needs to be done 4691 * for this page after returning from segvn_faultpage. 4692 * 4693 * But really, the page lock should be downgraded after 4694 * the stolen page is page_rename'd. 4695 */ 4696 4697 if (amp != NULL) 4698 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4699 4700 /* 4701 * Ok, now loop over the address range and handle faults 4702 */ 4703 for (a = addr; a < addr + len; a += PAGESIZE, off += PAGESIZE) { 4704 err = segvn_faultpage(hat, seg, a, off, vpage, plp, vpprot, 4705 type, rw, brkcow); 4706 if (err) { 4707 if (amp != NULL) 4708 ANON_LOCK_EXIT(&->a_rwlock); 4709 if (type == F_SOFTLOCK && a > addr) 4710 segvn_softunlock(seg, addr, (a - addr), 4711 S_OTHER); 4712 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4713 segvn_pagelist_rele(plp); 4714 if (pl_alloc_sz) 4715 kmem_free(plp, pl_alloc_sz); 4716 return (err); 4717 } 4718 if (vpage) { 4719 vpage++; 4720 } else if (svd->vpage) { 4721 page = seg_page(seg, addr); 4722 vpage = &svd->vpage[++page]; 4723 } 4724 } 4725 4726 /* Didn't get pages from the underlying fs so we're done */ 4727 if (!dogetpage) 4728 goto done; 4729 4730 /* 4731 * Now handle any other pages in the list returned. 4732 * If the page can be used, load up the translations now. 4733 * Note that the for loop will only be entered if "plp" 4734 * is pointing to a non-NULL page pointer which means that 4735 * VOP_GETPAGE() was called and vpprot has been initialized. 4736 */ 4737 if (svd->pageprot == 0) 4738 prot = svd->prot & vpprot; 4739 4740 4741 /* 4742 * Large Files: diff should be unsigned value because we started 4743 * supporting > 2GB segment sizes from 2.5.1 and when a 4744 * large file of size > 2GB gets mapped to address space 4745 * the diff value can be > 2GB. 4746 */ 4747 4748 for (ppp = plp; (pp = *ppp) != NULL; ppp++) { 4749 size_t diff; 4750 struct anon *ap; 4751 int anon_index; 4752 anon_sync_obj_t cookie; 4753 int hat_flag = HAT_LOAD_ADV; 4754 4755 if (svd->flags & MAP_TEXT) { 4756 hat_flag |= HAT_LOAD_TEXT; 4757 } 4758 4759 if (pp == PAGE_HANDLED) 4760 continue; 4761 4762 if (pp->p_offset >= svd->offset && 4763 (pp->p_offset < svd->offset + seg->s_size)) { 4764 4765 diff = pp->p_offset - svd->offset; 4766 4767 /* 4768 * Large Files: Following is the assertion 4769 * validating the above cast. 4770 */ 4771 ASSERT(svd->vp == pp->p_vnode); 4772 4773 page = btop(diff); 4774 if (svd->pageprot) 4775 prot = VPP_PROT(&svd->vpage[page]) & vpprot; 4776 4777 /* 4778 * Prevent other threads in the address space from 4779 * creating private pages (i.e., allocating anon slots) 4780 * while we are in the process of loading translations 4781 * to additional pages returned by the underlying 4782 * object. 
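 * (anon_array_enter() below takes the per-index anon array lock, so
 * another thread faulting on the same anon index waits until we are done.)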
4783 */ 4784 if (amp != NULL) { 4785 anon_index = svd->anon_index + page; 4786 anon_array_enter(amp, anon_index, &cookie); 4787 ap = anon_get_ptr(amp->ahp, anon_index); 4788 } 4789 if ((amp == NULL) || (ap == NULL)) { 4790 if (IS_VMODSORT(pp->p_vnode) || 4791 enable_mbit_wa) { 4792 if (rw == S_WRITE) 4793 hat_setmod(pp); 4794 else if (rw != S_OTHER && 4795 !hat_ismod(pp)) 4796 prot &= ~PROT_WRITE; 4797 } 4798 /* 4799 * Skip mapping read ahead pages marked 4800 * for migration, so they will get migrated 4801 * properly on fault 4802 */ 4803 if ((prot & PROT_READ) && !PP_ISMIGRATE(pp)) { 4804 hat_memload(hat, seg->s_base + diff, 4805 pp, prot, hat_flag); 4806 } 4807 } 4808 if (amp != NULL) 4809 anon_array_exit(&cookie); 4810 } 4811 page_unlock(pp); 4812 } 4813 done: 4814 if (amp != NULL) 4815 ANON_LOCK_EXIT(&->a_rwlock); 4816 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4817 if (pl_alloc_sz) 4818 kmem_free(plp, pl_alloc_sz); 4819 return (0); 4820 } 4821 4822 /* 4823 * This routine is used to start I/O on pages asynchronously. XXX it will 4824 * only create PAGESIZE pages. At fault time they will be relocated into 4825 * larger pages. 4826 */ 4827 static faultcode_t 4828 segvn_faulta(struct seg *seg, caddr_t addr) 4829 { 4830 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 4831 int err; 4832 struct anon_map *amp; 4833 vnode_t *vp; 4834 4835 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 4836 4837 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 4838 if ((amp = svd->amp) != NULL) { 4839 struct anon *ap; 4840 4841 /* 4842 * Reader lock to prevent amp->ahp from being changed. 4843 * This is advisory, it's ok to miss a page, so 4844 * we don't do anon_array_enter lock. 4845 */ 4846 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4847 if ((ap = anon_get_ptr(amp->ahp, 4848 svd->anon_index + seg_page(seg, addr))) != NULL) { 4849 4850 err = anon_getpage(&ap, NULL, NULL, 4851 0, seg, addr, S_READ, svd->cred); 4852 4853 ANON_LOCK_EXIT(&->a_rwlock); 4854 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4855 if (err) 4856 return (FC_MAKE_ERR(err)); 4857 return (0); 4858 } 4859 ANON_LOCK_EXIT(&->a_rwlock); 4860 } 4861 4862 if (svd->vp == NULL) { 4863 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4864 return (0); /* zfod page - do nothing now */ 4865 } 4866 4867 vp = svd->vp; 4868 TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE, 4869 "segvn_getpage:seg %p addr %p vp %p", seg, addr, vp); 4870 err = VOP_GETPAGE(vp, 4871 (offset_t)(svd->offset + (uintptr_t)(addr - seg->s_base)), 4872 PAGESIZE, NULL, NULL, 0, seg, addr, 4873 S_OTHER, svd->cred); 4874 4875 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4876 if (err) 4877 return (FC_MAKE_ERR(err)); 4878 return (0); 4879 } 4880 4881 static int 4882 segvn_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) 4883 { 4884 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 4885 struct vpage *svp, *evp; 4886 struct vnode *vp; 4887 size_t pgsz; 4888 pgcnt_t pgcnt; 4889 anon_sync_obj_t cookie; 4890 4891 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 4892 4893 if ((svd->maxprot & prot) != prot) 4894 return (EACCES); /* violated maxprot */ 4895 4896 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4897 4898 /* return if prot is the same */ 4899 if (!svd->pageprot && svd->prot == prot) { 4900 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4901 return (0); 4902 } 4903 4904 /* 4905 * Since we change protections we first have to flush the cache. 4906 * This makes sure all the pagelock calls have to recheck 4907 * protections. 
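 * If softlockcnt is still nonzero after the purge, pagelocked I/O is
 * pending and the request fails with EAGAIN.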
4908 */ 4909 if (svd->softlockcnt > 0) { 4910 /* 4911 * Since we do have the segvn writers lock nobody can fill 4912 * the cache with entries belonging to this seg during 4913 * the purge. The flush either succeeds or we still have 4914 * pending I/Os. 4915 */ 4916 segvn_purge(seg); 4917 if (svd->softlockcnt > 0) { 4918 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4919 return (EAGAIN); 4920 } 4921 } 4922 4923 if (seg->s_szc != 0) { 4924 int err; 4925 pgsz = page_get_pagesize(seg->s_szc); 4926 pgcnt = pgsz >> PAGESHIFT; 4927 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 4928 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { 4929 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4930 ASSERT(seg->s_base != addr || seg->s_size != len); 4931 /* 4932 * If we are holding the as lock as a reader then 4933 * we need to return IE_RETRY and let the as 4934 * layer drop and re-aquire the lock as a writer. 4935 */ 4936 if (AS_READ_HELD(seg->s_as, &seg->s_as->a_lock)) 4937 return (IE_RETRY); 4938 VM_STAT_ADD(segvnvmstats.demoterange[1]); 4939 err = segvn_demote_range(seg, addr, len, SDR_END); 4940 if (err == 0) 4941 return (IE_RETRY); 4942 if (err == ENOMEM) 4943 return (IE_NOMEM); 4944 return (err); 4945 } 4946 } 4947 4948 4949 /* 4950 * If it's a private mapping and we're making it writable 4951 * and no swap space has been reserved, have to reserve 4952 * it all now. If it's a private mapping to a file (i.e., vp != NULL) 4953 * and we're removing write permission on the entire segment and 4954 * we haven't modified any pages, we can release the swap space. 4955 */ 4956 if (svd->type == MAP_PRIVATE) { 4957 if (prot & PROT_WRITE) { 4958 size_t sz; 4959 if (svd->swresv == 0 && !(svd->flags & MAP_NORESERVE)) { 4960 if (anon_resv(seg->s_size) == 0) { 4961 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4962 return (IE_NOMEM); 4963 } 4964 sz = svd->swresv = seg->s_size; 4965 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 4966 "anon proc:%p %lu %u", 4967 seg, sz, 1); 4968 } 4969 } else { 4970 /* 4971 * Swap space is released only if this segment 4972 * does not map anonymous memory, since read faults 4973 * on such segments still need an anon slot to read 4974 * in the data. 4975 */ 4976 if (svd->swresv != 0 && svd->vp != NULL && 4977 svd->amp == NULL && addr == seg->s_base && 4978 len == seg->s_size && svd->pageprot == 0) { 4979 anon_unresv(svd->swresv); 4980 svd->swresv = 0; 4981 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 4982 "anon proc:%p %lu %u", 4983 seg, 0, 0); 4984 } 4985 } 4986 } 4987 4988 if (addr == seg->s_base && len == seg->s_size && svd->pageprot == 0) { 4989 if (svd->prot == prot) { 4990 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4991 return (0); /* all done */ 4992 } 4993 svd->prot = (uchar_t)prot; 4994 } else { 4995 struct anon *ap = NULL; 4996 page_t *pp; 4997 u_offset_t offset, off; 4998 struct anon_map *amp; 4999 ulong_t anon_idx = 0; 5000 5001 /* 5002 * A vpage structure exists or else the change does not 5003 * involve the entire segment. Establish a vpage structure 5004 * if none is there. Then, for each page in the range, 5005 * adjust its individual permissions. Note that write- 5006 * enabling a MAP_PRIVATE page can affect the claims for 5007 * locked down memory. Overcommitting memory terminates 5008 * the operation. 
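 * (Overcommit shows up as a failed segvn_claim_pages()/page_addclaim()
 * in the loop below; the loop then stops early and IE_NOMEM is returned.)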
5009 */ 5010 segvn_vpage(seg); 5011 if ((amp = svd->amp) != NULL) { 5012 anon_idx = svd->anon_index + seg_page(seg, addr); 5013 ASSERT(seg->s_szc == 0 || 5014 IS_P2ALIGNED(anon_idx, pgcnt)); 5015 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5016 } 5017 5018 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 5019 evp = &svd->vpage[seg_page(seg, addr + len)]; 5020 5021 /* 5022 * See Statement at the beginning of segvn_lockop regarding 5023 * the way cowcnts and lckcnts are handled. 5024 */ 5025 for (svp = &svd->vpage[seg_page(seg, addr)]; svp < evp; svp++) { 5026 5027 ASSERT(seg->s_szc == 0 || 5028 (svd->vp != NULL || svd->type == MAP_PRIVATE)); 5029 5030 if (seg->s_szc != 0 && svd->type == MAP_PRIVATE) { 5031 if (amp != NULL) { 5032 anon_array_enter(amp, anon_idx, 5033 &cookie); 5034 } 5035 if (IS_P2ALIGNED(anon_idx, pgcnt) && 5036 !segvn_claim_pages(seg, svp, offset, 5037 anon_idx, prot)) { 5038 if (amp != NULL) { 5039 anon_array_exit(&cookie); 5040 } 5041 break; 5042 } 5043 if (amp != NULL) { 5044 anon_array_exit(&cookie); 5045 } 5046 anon_idx++; 5047 } else { 5048 if (amp != NULL) { 5049 anon_array_enter(amp, anon_idx, 5050 &cookie); 5051 ap = anon_get_ptr(amp->ahp, anon_idx++); 5052 } 5053 5054 if (VPP_ISPPLOCK(svp) && 5055 (VPP_PROT(svp) != prot) && 5056 (svd->type == MAP_PRIVATE)) { 5057 5058 if (amp == NULL || ap == NULL) { 5059 vp = svd->vp; 5060 off = offset; 5061 } else 5062 swap_xlate(ap, &vp, &off); 5063 if (amp != NULL) 5064 anon_array_exit(&cookie); 5065 5066 if ((pp = page_lookup(vp, off, 5067 SE_SHARED)) == NULL) { 5068 panic("segvn_setprot: no page"); 5069 /*NOTREACHED*/ 5070 } 5071 ASSERT(seg->s_szc == 0); 5072 if ((VPP_PROT(svp) ^ prot) & 5073 PROT_WRITE) { 5074 if (prot & PROT_WRITE) { 5075 if (!page_addclaim(pp)) { 5076 page_unlock(pp); 5077 break; 5078 } 5079 } else { 5080 if (!page_subclaim(pp)) { 5081 page_unlock(pp); 5082 break; 5083 } 5084 } 5085 } 5086 page_unlock(pp); 5087 } else if (amp != NULL) 5088 anon_array_exit(&cookie); 5089 } 5090 VPP_SETPROT(svp, prot); 5091 offset += PAGESIZE; 5092 } 5093 if (amp != NULL) 5094 ANON_LOCK_EXIT(&->a_rwlock); 5095 5096 /* 5097 * Did we terminate prematurely? If so, simply unload 5098 * the translations to the things we've updated so far. 5099 */ 5100 if (svp != evp) { 5101 len = (svp - &svd->vpage[seg_page(seg, addr)]) * 5102 PAGESIZE; 5103 ASSERT(seg->s_szc == 0 || IS_P2ALIGNED(len, pgsz)); 5104 if (len != 0) 5105 hat_unload(seg->s_as->a_hat, addr, 5106 len, HAT_UNLOAD); 5107 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5108 return (IE_NOMEM); 5109 } 5110 } 5111 5112 if ((prot & PROT_WRITE) != 0 || (prot & ~PROT_USER) == PROT_NONE) { 5113 /* 5114 * Either private or shared data with write access (in 5115 * which case we need to throw out all former translations 5116 * so that we get the right translations set up on fault 5117 * and we don't allow write access to any copy-on-write pages 5118 * that might be around or to prevent write access to pages 5119 * representing holes in a file), or we don't have permission 5120 * to access the memory at all (in which case we have to 5121 * unload any current translations that might exist). 5122 */ 5123 hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD); 5124 } else { 5125 /* 5126 * A shared mapping or a private mapping in which write 5127 * protection is going to be denied - just change all the 5128 * protections over the range of addresses in question. 5129 * segvn does not support any other attributes other 5130 * than prot so we can use hat_chgattr. 
5131 */ 5132 hat_chgattr(seg->s_as->a_hat, addr, len, prot); 5133 } 5134 5135 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5136 5137 return (0); 5138 } 5139 5140 /* 5141 * segvn_setpagesize is called via SEGOP_SETPAGESIZE from as_setpagesize, 5142 * to determine if the seg is capable of mapping the requested szc. 5143 */ 5144 static int 5145 segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc) 5146 { 5147 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5148 struct segvn_data *nsvd; 5149 struct anon_map *amp = svd->amp; 5150 struct seg *nseg; 5151 caddr_t eaddr = addr + len, a; 5152 size_t pgsz = page_get_pagesize(szc); 5153 int err; 5154 u_offset_t off = svd->offset + (uintptr_t)(addr - seg->s_base); 5155 extern struct vnode kvp; 5156 5157 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 5158 ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size); 5159 5160 if (seg->s_szc == szc || segvn_lpg_disable != 0) { 5161 return (0); 5162 } 5163 5164 /* 5165 * addr should always be pgsz aligned but eaddr may be misaligned if 5166 * it's at the end of the segment. 5167 * 5168 * XXX we should assert this condition since as_setpagesize() logic 5169 * guarantees it. 5170 */ 5171 if (!IS_P2ALIGNED(addr, pgsz) || 5172 (!IS_P2ALIGNED(eaddr, pgsz) && 5173 eaddr != seg->s_base + seg->s_size)) { 5174 5175 segvn_setpgsz_align_err++; 5176 return (EINVAL); 5177 } 5178 5179 if ((svd->vp == NULL && svd->type == MAP_SHARED) || 5180 (svd->flags & MAP_NORESERVE) || seg->s_as == &kas || 5181 szc > segvn_maxpgszc) { 5182 return (EINVAL); 5183 } 5184 5185 /* paranoid check */ 5186 if (svd->vp != NULL && 5187 (IS_SWAPFSVP(svd->vp) || svd->vp == &kvp)) { 5188 return (EINVAL); 5189 } 5190 5191 if (seg->s_szc == 0 && svd->vp != NULL && 5192 map_addr_vacalign_check(addr, off)) { 5193 return (EINVAL); 5194 } 5195 5196 /* 5197 * Check that protections are the same within new page 5198 * size boundaries. 5199 */ 5200 if (svd->pageprot) { 5201 for (a = addr; a < eaddr; a += pgsz) { 5202 if ((a + pgsz) > eaddr) { 5203 if (!sameprot(seg, a, eaddr - a)) { 5204 return (EINVAL); 5205 } 5206 } else { 5207 if (!sameprot(seg, a, pgsz)) { 5208 return (EINVAL); 5209 } 5210 } 5211 } 5212 } 5213 5214 /* 5215 * Since we are changing page size we first have to flush 5216 * the cache. This makes sure all the pagelock calls have 5217 * to recheck protections. 5218 */ 5219 if (svd->softlockcnt > 0) { 5220 /* 5221 * Since we do have the segvn writers lock nobody can fill 5222 * the cache with entries belonging to this seg during 5223 * the purge. The flush either succeeds or we still have 5224 * pending I/Os. 5225 */ 5226 segvn_purge(seg); 5227 if (svd->softlockcnt > 0) { 5228 return (EAGAIN); 5229 } 5230 } 5231 5232 /* 5233 * Operation for sub range of existing segment. 
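 * Either demote the range to the smaller page size, or split the segment
 * at the large-page-aligned boundaries and let the caller retry (IE_RETRY).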
5234 */ 5235 if (addr != seg->s_base || eaddr != (seg->s_base + seg->s_size)) { 5236 if (szc < seg->s_szc) { 5237 VM_STAT_ADD(segvnvmstats.demoterange[2]); 5238 err = segvn_demote_range(seg, addr, len, SDR_RANGE); 5239 if (err == 0) { 5240 return (IE_RETRY); 5241 } 5242 if (err == ENOMEM) { 5243 return (IE_NOMEM); 5244 } 5245 return (err); 5246 } 5247 if (addr != seg->s_base) { 5248 nseg = segvn_split_seg(seg, addr); 5249 if (eaddr != (nseg->s_base + nseg->s_size)) { 5250 /* eaddr is szc aligned */ 5251 (void) segvn_split_seg(nseg, eaddr); 5252 } 5253 return (IE_RETRY); 5254 } 5255 if (eaddr != (seg->s_base + seg->s_size)) { 5256 /* eaddr is szc aligned */ 5257 (void) segvn_split_seg(seg, eaddr); 5258 } 5259 return (IE_RETRY); 5260 } 5261 5262 /* 5263 * Break any low level sharing and reset seg->s_szc to 0. 5264 */ 5265 if ((err = segvn_clrszc(seg)) != 0) { 5266 if (err == ENOMEM) { 5267 err = IE_NOMEM; 5268 } 5269 return (err); 5270 } 5271 ASSERT(seg->s_szc == 0); 5272 5273 /* 5274 * If the end of the current segment is not pgsz aligned 5275 * then attempt to concatenate with the next segment. 5276 */ 5277 if (!IS_P2ALIGNED(eaddr, pgsz)) { 5278 nseg = AS_SEGNEXT(seg->s_as, seg); 5279 if (nseg == NULL || nseg == seg || eaddr != nseg->s_base) { 5280 return (ENOMEM); 5281 } 5282 if (nseg->s_ops != &segvn_ops) { 5283 return (EINVAL); 5284 } 5285 nsvd = (struct segvn_data *)nseg->s_data; 5286 if (nsvd->softlockcnt > 0) { 5287 segvn_purge(nseg); 5288 if (nsvd->softlockcnt > 0) { 5289 return (EAGAIN); 5290 } 5291 } 5292 err = segvn_clrszc(nseg); 5293 if (err == ENOMEM) { 5294 err = IE_NOMEM; 5295 } 5296 if (err != 0) { 5297 return (err); 5298 } 5299 err = segvn_concat(seg, nseg, 1); 5300 if (err == -1) { 5301 return (EINVAL); 5302 } 5303 if (err == -2) { 5304 return (IE_NOMEM); 5305 } 5306 return (IE_RETRY); 5307 } 5308 5309 /* 5310 * May need to re-align anon array to 5311 * new szc. 5312 */ 5313 if (amp != NULL) { 5314 pgcnt_t pgcnt = pgsz >> PAGESHIFT; 5315 if (!IS_P2ALIGNED(svd->anon_index, pgcnt)) { 5316 struct anon_hdr *nahp; 5317 5318 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 5319 ASSERT(amp->refcnt == 1); 5320 nahp = anon_create(btop(amp->size), ANON_NOSLEEP); 5321 if (nahp == NULL) { 5322 ANON_LOCK_EXIT(&->a_rwlock); 5323 return (IE_NOMEM); 5324 } 5325 if (anon_copy_ptr(amp->ahp, svd->anon_index, 5326 nahp, 0, btop(seg->s_size), ANON_NOSLEEP)) { 5327 anon_release(nahp, btop(amp->size)); 5328 ANON_LOCK_EXIT(&->a_rwlock); 5329 return (IE_NOMEM); 5330 } 5331 anon_release(amp->ahp, btop(amp->size)); 5332 amp->ahp = nahp; 5333 svd->anon_index = 0; 5334 ANON_LOCK_EXIT(&->a_rwlock); 5335 } 5336 } 5337 if (svd->vp != NULL && szc != 0) { 5338 struct vattr va; 5339 u_offset_t eoffpage = svd->offset; 5340 va.va_mask = AT_SIZE; 5341 eoffpage += seg->s_size; 5342 eoffpage = btopr(eoffpage); 5343 if (VOP_GETATTR(svd->vp, &va, 0, svd->cred) != 0) { 5344 segvn_setpgsz_getattr_err++; 5345 return (EINVAL); 5346 } 5347 if (btopr(va.va_size) < eoffpage) { 5348 segvn_setpgsz_eof_err++; 5349 return (EINVAL); 5350 } 5351 if (amp != NULL) { 5352 /* 5353 * anon_fill_cow_holes() may call VOP_GETPAGE(). 5354 * don't take anon map lock here to avoid holding it 5355 * across VOP_GETPAGE() calls that may call back into 5356 * segvn for klsutering checks. We don't really need 5357 * anon map lock here since it's a private segment and 5358 * we hold as level lock as writers. 
5359 */ 5360 if ((err = anon_fill_cow_holes(seg, seg->s_base, 5361 amp->ahp, svd->anon_index, svd->vp, svd->offset, 5362 seg->s_size, szc, svd->prot, svd->vpage, 5363 svd->cred)) != 0) { 5364 return (EINVAL); 5365 } 5366 } 5367 segvn_setvnode_mpss(svd->vp); 5368 } 5369 5370 if (amp != NULL) { 5371 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 5372 amp->a_szc = szc; 5373 ANON_LOCK_EXIT(&->a_rwlock); 5374 } 5375 5376 seg->s_szc = szc; 5377 5378 return (0); 5379 } 5380 5381 static int 5382 segvn_clrszc(struct seg *seg) 5383 { 5384 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5385 struct anon_map *amp = svd->amp; 5386 size_t pgsz; 5387 pgcnt_t pages; 5388 int err = 0; 5389 caddr_t a = seg->s_base; 5390 caddr_t ea = a + seg->s_size; 5391 ulong_t an_idx = svd->anon_index; 5392 vnode_t *vp = svd->vp; 5393 struct vpage *vpage = svd->vpage; 5394 page_t *anon_pl[1 + 1], *pp; 5395 struct anon *ap, *oldap; 5396 uint_t prot = svd->prot, vpprot; 5397 5398 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) || 5399 SEGVN_WRITE_HELD(seg->s_as, &svd->lock)); 5400 ASSERT(svd->type == MAP_PRIVATE || 5401 (vp != NULL && svd->amp == NULL)); 5402 5403 if (vp == NULL && amp == NULL) { 5404 seg->s_szc = 0; 5405 return (0); 5406 } 5407 5408 /* 5409 * do HAT_UNLOAD_UNMAP since we are changing the pagesize. 5410 * unload argument is 0 when we are freeing the segment 5411 * and unload was already done. 5412 */ 5413 hat_unload(seg->s_as->a_hat, seg->s_base, seg->s_size, 5414 HAT_UNLOAD_UNMAP); 5415 5416 if (amp == NULL) { 5417 seg->s_szc = 0; 5418 return (0); 5419 } 5420 5421 pgsz = page_get_pagesize(seg->s_szc); 5422 pages = btop(pgsz); 5423 5424 /* 5425 * XXX anon rwlock is not really needed because this is a 5426 * private segment and we are writers. 5427 */ 5428 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 5429 5430 for (; a < ea; a += pgsz, an_idx += pages) { 5431 if ((oldap = anon_get_ptr(amp->ahp, an_idx)) != NULL) { 5432 if (svd->pageprot != 0) { 5433 ASSERT(vpage != NULL); 5434 prot = VPP_PROT(vpage); 5435 ASSERT(sameprot(seg, a, pgsz)); 5436 } 5437 if (seg->s_szc != 0) { 5438 ASSERT(vp == NULL || anon_pages(amp->ahp, 5439 an_idx, pages) == pages); 5440 if ((err = anon_map_demotepages(amp, an_idx, 5441 seg, a, prot, vpage, svd->cred)) != 0) { 5442 goto out; 5443 } 5444 } else { 5445 if (oldap->an_refcnt == 1) { 5446 continue; 5447 } 5448 if ((err = anon_getpage(&oldap, &vpprot, 5449 anon_pl, PAGESIZE, seg, a, S_READ, 5450 svd->cred))) { 5451 goto out; 5452 } 5453 if ((pp = anon_private(&ap, seg, a, prot, 5454 anon_pl[0], 0, svd->cred)) == NULL) { 5455 err = ENOMEM; 5456 goto out; 5457 } 5458 anon_decref(oldap); 5459 (void) anon_set_ptr(amp->ahp, an_idx, ap, 5460 ANON_SLEEP); 5461 page_unlock(pp); 5462 } 5463 } 5464 vpage = (vpage == NULL) ? 
NULL : vpage + pages; 5465 } 5466 5467 amp->a_szc = 0; 5468 seg->s_szc = 0; 5469 out: 5470 ANON_LOCK_EXIT(&->a_rwlock); 5471 return (err); 5472 } 5473 5474 static int 5475 segvn_claim_pages( 5476 struct seg *seg, 5477 struct vpage *svp, 5478 u_offset_t off, 5479 ulong_t anon_idx, 5480 uint_t prot) 5481 { 5482 pgcnt_t pgcnt = page_get_pagecnt(seg->s_szc); 5483 size_t ppasize = (pgcnt + 1) * sizeof (page_t *); 5484 page_t **ppa; 5485 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5486 struct anon_map *amp = svd->amp; 5487 struct vpage *evp = svp + pgcnt; 5488 caddr_t addr = ((uintptr_t)(svp - svd->vpage) << PAGESHIFT) 5489 + seg->s_base; 5490 struct anon *ap; 5491 struct vnode *vp = svd->vp; 5492 page_t *pp; 5493 pgcnt_t pg_idx, i; 5494 int err = 0; 5495 anoff_t aoff; 5496 int anon = (amp != NULL) ? 1 : 0; 5497 5498 ASSERT(svd->type == MAP_PRIVATE); 5499 ASSERT(svd->vpage != NULL); 5500 ASSERT(seg->s_szc != 0); 5501 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 5502 ASSERT(amp == NULL || IS_P2ALIGNED(anon_idx, pgcnt)); 5503 ASSERT(sameprot(seg, addr, pgcnt << PAGESHIFT)); 5504 5505 if (VPP_PROT(svp) == prot) 5506 return (1); 5507 if (!((VPP_PROT(svp) ^ prot) & PROT_WRITE)) 5508 return (1); 5509 5510 ppa = kmem_alloc(ppasize, KM_SLEEP); 5511 if (anon && vp != NULL) { 5512 if (anon_get_ptr(amp->ahp, anon_idx) == NULL) { 5513 anon = 0; 5514 ASSERT(!anon_pages(amp->ahp, anon_idx, pgcnt)); 5515 } 5516 ASSERT(!anon || 5517 anon_pages(amp->ahp, anon_idx, pgcnt) == pgcnt); 5518 } 5519 5520 for (*ppa = NULL, pg_idx = 0; svp < evp; svp++, anon_idx++) { 5521 if (!VPP_ISPPLOCK(svp)) 5522 continue; 5523 if (anon) { 5524 ap = anon_get_ptr(amp->ahp, anon_idx); 5525 if (ap == NULL) { 5526 panic("segvn_claim_pages: no anon slot"); 5527 } 5528 swap_xlate(ap, &vp, &aoff); 5529 off = (u_offset_t)aoff; 5530 } 5531 ASSERT(vp != NULL); 5532 if ((pp = page_lookup(vp, 5533 (u_offset_t)off, SE_SHARED)) == NULL) { 5534 panic("segvn_claim_pages: no page"); 5535 } 5536 ppa[pg_idx++] = pp; 5537 off += PAGESIZE; 5538 } 5539 5540 if (ppa[0] == NULL) { 5541 kmem_free(ppa, ppasize); 5542 return (1); 5543 } 5544 5545 ASSERT(pg_idx <= pgcnt); 5546 ppa[pg_idx] = NULL; 5547 5548 if (prot & PROT_WRITE) 5549 err = page_addclaim_pages(ppa); 5550 else 5551 err = page_subclaim_pages(ppa); 5552 5553 for (i = 0; i < pg_idx; i++) { 5554 ASSERT(ppa[i] != NULL); 5555 page_unlock(ppa[i]); 5556 } 5557 5558 kmem_free(ppa, ppasize); 5559 return (err); 5560 } 5561 5562 /* 5563 * Returns right (upper address) segment if split occured. 5564 * If the address is equal to the beginning or end of its segment it returns 5565 * the current segment. 
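 * The caller must hold the address space lock as writer; the original
 * segment keeps [s_base, addr) and the new segment covers [addr, old end).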
5566 */ 5567 static struct seg * 5568 segvn_split_seg(struct seg *seg, caddr_t addr) 5569 { 5570 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5571 struct seg *nseg; 5572 size_t nsize; 5573 struct segvn_data *nsvd; 5574 5575 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 5576 ASSERT(svd->type == MAP_PRIVATE || svd->amp == NULL); 5577 ASSERT(addr >= seg->s_base); 5578 ASSERT(addr <= seg->s_base + seg->s_size); 5579 5580 if (addr == seg->s_base || addr == seg->s_base + seg->s_size) 5581 return (seg); 5582 5583 nsize = seg->s_base + seg->s_size - addr; 5584 seg->s_size = addr - seg->s_base; 5585 nseg = seg_alloc(seg->s_as, addr, nsize); 5586 ASSERT(nseg != NULL); 5587 nseg->s_ops = seg->s_ops; 5588 nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 5589 nseg->s_data = (void *)nsvd; 5590 nseg->s_szc = seg->s_szc; 5591 *nsvd = *svd; 5592 rw_init(&nsvd->lock, NULL, RW_DEFAULT, NULL); 5593 5594 if (nsvd->vp != NULL) { 5595 VN_HOLD(nsvd->vp); 5596 nsvd->offset = svd->offset + 5597 (uintptr_t)(nseg->s_base - seg->s_base); 5598 if (nsvd->type == MAP_SHARED) 5599 lgrp_shm_policy_init(NULL, nsvd->vp); 5600 } else { 5601 /* 5602 * The offset for an anonymous segment has no signifigance in 5603 * terms of an offset into a file. If we were to use the above 5604 * calculation instead, the structures read out of 5605 * /proc/<pid>/xmap would be more difficult to decipher since 5606 * it would be unclear whether two seemingly contiguous 5607 * prxmap_t structures represented different segments or a 5608 * single segment that had been split up into multiple prxmap_t 5609 * structures (e.g. if some part of the segment had not yet 5610 * been faulted in). 5611 */ 5612 nsvd->offset = 0; 5613 } 5614 5615 ASSERT(svd->softlockcnt == 0); 5616 crhold(svd->cred); 5617 5618 if (svd->vpage != NULL) { 5619 size_t bytes = vpgtob(seg_pages(seg)); 5620 size_t nbytes = vpgtob(seg_pages(nseg)); 5621 struct vpage *ovpage = svd->vpage; 5622 5623 svd->vpage = kmem_alloc(bytes, KM_SLEEP); 5624 bcopy(ovpage, svd->vpage, bytes); 5625 nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP); 5626 bcopy(ovpage + seg_pages(seg), nsvd->vpage, nbytes); 5627 kmem_free(ovpage, bytes + nbytes); 5628 } 5629 if (svd->amp != NULL) { 5630 struct anon_map *oamp = svd->amp, *namp; 5631 struct anon_hdr *nahp; 5632 5633 ANON_LOCK_ENTER(&oamp->a_rwlock, RW_WRITER); 5634 ASSERT(oamp->refcnt == 1); 5635 nahp = anon_create(btop(seg->s_size), ANON_SLEEP); 5636 (void) anon_copy_ptr(oamp->ahp, svd->anon_index, 5637 nahp, 0, btop(seg->s_size), ANON_SLEEP); 5638 5639 namp = anonmap_alloc(nseg->s_size, 0); 5640 namp->a_szc = nseg->s_szc; 5641 (void) anon_copy_ptr(oamp->ahp, 5642 svd->anon_index + btop(seg->s_size), 5643 namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP); 5644 anon_release(oamp->ahp, btop(oamp->size)); 5645 oamp->ahp = nahp; 5646 oamp->size = seg->s_size; 5647 svd->anon_index = 0; 5648 nsvd->amp = namp; 5649 nsvd->anon_index = 0; 5650 ANON_LOCK_EXIT(&oamp->a_rwlock); 5651 } 5652 5653 /* 5654 * Split amount of swap reserve 5655 */ 5656 if (svd->swresv) { 5657 /* 5658 * For MAP_NORESERVE, only allocate swap reserve for pages 5659 * being used. Other segments get enough to cover whole 5660 * segment. 
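 * (For MAP_NORESERVE the reserve on each side is recomputed from the anon
 * pages actually allocated there; otherwise each side gets a reserve equal
 * to its full size.)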
5661 */ 5662 if (svd->flags & MAP_NORESERVE) { 5663 size_t oswresv; 5664 5665 ASSERT(svd->amp); 5666 oswresv = svd->swresv; 5667 svd->swresv = ptob(anon_pages(svd->amp->ahp, 5668 svd->anon_index, btop(seg->s_size))); 5669 nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp, 5670 nsvd->anon_index, btop(nseg->s_size))); 5671 ASSERT(oswresv >= (svd->swresv + nsvd->swresv)); 5672 } else { 5673 ASSERT(svd->swresv == seg->s_size + nseg->s_size); 5674 svd->swresv = seg->s_size; 5675 nsvd->swresv = nseg->s_size; 5676 } 5677 } 5678 5679 return (nseg); 5680 } 5681 5682 5683 /* 5684 * called on memory operations (unmap, setprot, setpagesize) for a subset 5685 * of a large page segment to either demote the memory range (SDR_RANGE) 5686 * or the ends (SDR_END) by addr/len. 5687 * 5688 * returns 0 on success. returns errno, including ENOMEM, on failure. 5689 */ 5690 static int 5691 segvn_demote_range(struct seg *seg, caddr_t addr, size_t len, int flag) 5692 { 5693 caddr_t eaddr = addr + len; 5694 caddr_t lpgaddr, lpgeaddr; 5695 struct seg *nseg; 5696 struct seg *badseg1 = NULL; 5697 struct seg *badseg2 = NULL; 5698 size_t pgsz; 5699 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5700 int err; 5701 5702 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 5703 ASSERT(seg->s_szc != 0); 5704 pgsz = page_get_pagesize(seg->s_szc); 5705 ASSERT(seg->s_base != addr || seg->s_size != len); 5706 ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size); 5707 ASSERT(svd->softlockcnt == 0); 5708 ASSERT(svd->type == MAP_PRIVATE || 5709 (svd->vp != NULL && svd->amp == NULL)); 5710 5711 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 5712 ASSERT(flag == SDR_RANGE || eaddr < lpgeaddr || addr > lpgaddr); 5713 if (flag == SDR_RANGE) { 5714 /* demote entire range */ 5715 badseg1 = nseg = segvn_split_seg(seg, lpgaddr); 5716 (void) segvn_split_seg(nseg, lpgeaddr); 5717 ASSERT(badseg1->s_base == lpgaddr); 5718 ASSERT(badseg1->s_size == lpgeaddr - lpgaddr); 5719 } else if (addr != lpgaddr) { 5720 ASSERT(flag == SDR_END); 5721 badseg1 = nseg = segvn_split_seg(seg, lpgaddr); 5722 if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz && 5723 eaddr < lpgaddr + 2 * pgsz) { 5724 (void) segvn_split_seg(nseg, lpgeaddr); 5725 ASSERT(badseg1->s_base == lpgaddr); 5726 ASSERT(badseg1->s_size == 2 * pgsz); 5727 } else { 5728 nseg = segvn_split_seg(nseg, lpgaddr + pgsz); 5729 ASSERT(badseg1->s_base == lpgaddr); 5730 ASSERT(badseg1->s_size == pgsz); 5731 if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz) { 5732 ASSERT(lpgeaddr - lpgaddr > 2 * pgsz); 5733 nseg = segvn_split_seg(nseg, lpgeaddr - pgsz); 5734 badseg2 = nseg; 5735 (void) segvn_split_seg(nseg, lpgeaddr); 5736 ASSERT(badseg2->s_base == lpgeaddr - pgsz); 5737 ASSERT(badseg2->s_size == pgsz); 5738 } 5739 } 5740 } else { 5741 ASSERT(flag == SDR_END); 5742 ASSERT(eaddr < lpgeaddr); 5743 badseg1 = nseg = segvn_split_seg(seg, lpgeaddr - pgsz); 5744 (void) segvn_split_seg(nseg, lpgeaddr); 5745 ASSERT(badseg1->s_base == lpgeaddr - pgsz); 5746 ASSERT(badseg1->s_size == pgsz); 5747 } 5748 5749 ASSERT(badseg1 != NULL); 5750 ASSERT(badseg1->s_szc != 0); 5751 ASSERT(page_get_pagesize(badseg1->s_szc) == pgsz); 5752 ASSERT(flag == SDR_RANGE || badseg1->s_size == pgsz || 5753 badseg1->s_size == 2 * pgsz); 5754 if (err = segvn_clrszc(badseg1)) { 5755 return (err); 5756 } 5757 ASSERT(badseg1->s_szc == 0); 5758 5759 if (badseg2 == NULL) 5760 return (0); 5761 ASSERT(badseg2->s_szc != 0); 5762 ASSERT(page_get_pagesize(badseg2->s_szc) == pgsz); 5763 ASSERT(badseg2->s_size == pgsz); 
5764 ASSERT(sameprot(badseg2, badseg2->s_base, badseg2->s_size)); 5765 if (err = segvn_clrszc(badseg2)) { 5766 return (err); 5767 } 5768 ASSERT(badseg2->s_szc == 0); 5769 return (0); 5770 } 5771 5772 static int 5773 segvn_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) 5774 { 5775 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5776 struct vpage *vp, *evp; 5777 5778 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5779 5780 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 5781 /* 5782 * If segment protection can be used, simply check against them. 5783 */ 5784 if (svd->pageprot == 0) { 5785 int err; 5786 5787 err = ((svd->prot & prot) != prot) ? EACCES : 0; 5788 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5789 return (err); 5790 } 5791 5792 /* 5793 * Have to check down to the vpage level. 5794 */ 5795 evp = &svd->vpage[seg_page(seg, addr + len)]; 5796 for (vp = &svd->vpage[seg_page(seg, addr)]; vp < evp; vp++) { 5797 if ((VPP_PROT(vp) & prot) != prot) { 5798 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5799 return (EACCES); 5800 } 5801 } 5802 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5803 return (0); 5804 } 5805 5806 static int 5807 segvn_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv) 5808 { 5809 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5810 size_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1; 5811 5812 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5813 5814 if (pgno != 0) { 5815 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 5816 if (svd->pageprot == 0) { 5817 do 5818 protv[--pgno] = svd->prot; 5819 while (pgno != 0); 5820 } else { 5821 size_t pgoff = seg_page(seg, addr); 5822 5823 do { 5824 pgno--; 5825 protv[pgno] = VPP_PROT(&svd->vpage[pgno+pgoff]); 5826 } while (pgno != 0); 5827 } 5828 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5829 } 5830 return (0); 5831 } 5832 5833 static u_offset_t 5834 segvn_getoffset(struct seg *seg, caddr_t addr) 5835 { 5836 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5837 5838 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5839 5840 return (svd->offset + (uintptr_t)(addr - seg->s_base)); 5841 } 5842 5843 /*ARGSUSED*/ 5844 static int 5845 segvn_gettype(struct seg *seg, caddr_t addr) 5846 { 5847 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5848 5849 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5850 5851 return (svd->type | (svd->flags & MAP_NORESERVE)); 5852 } 5853 5854 /*ARGSUSED*/ 5855 static int 5856 segvn_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp) 5857 { 5858 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5859 5860 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5861 5862 *vpp = svd->vp; 5863 return (0); 5864 } 5865 5866 /* 5867 * Check to see if it makes sense to do kluster/read ahead to 5868 * addr + delta relative to the mapping at addr. We assume here 5869 * that delta is a signed PAGESIZE'd multiple (which can be negative). 5870 * 5871 * For segvn, we currently "approve" of the action if we are 5872 * still in the segment and it maps from the same vp/off, 5873 * or if the advice stored in segvn_data or vpages allows it. 5874 * Currently, klustering is not allowed only if MADV_RANDOM is set. 
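 * Returns 0 if klustering to addr + delta is acceptable, -1 if not.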
5875 */ 5876 static int 5877 segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta) 5878 { 5879 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5880 struct anon *oap, *ap; 5881 ssize_t pd; 5882 size_t page; 5883 struct vnode *vp1, *vp2; 5884 u_offset_t off1, off2; 5885 struct anon_map *amp; 5886 5887 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5888 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) || 5889 SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 5890 5891 if (addr + delta < seg->s_base || 5892 addr + delta >= (seg->s_base + seg->s_size)) 5893 return (-1); /* exceeded segment bounds */ 5894 5895 pd = delta / (ssize_t)PAGESIZE; /* divide to preserve sign bit */ 5896 page = seg_page(seg, addr); 5897 5898 /* 5899 * Check to see if either of the pages addr or addr + delta 5900 * have advice set that prevents klustering (if MADV_RANDOM advice 5901 * is set for entire segment, or MADV_SEQUENTIAL is set and delta 5902 * is negative). 5903 */ 5904 if (svd->advice == MADV_RANDOM || 5905 svd->advice == MADV_SEQUENTIAL && delta < 0) 5906 return (-1); 5907 else if (svd->pageadvice && svd->vpage) { 5908 struct vpage *bvpp, *evpp; 5909 5910 bvpp = &svd->vpage[page]; 5911 evpp = &svd->vpage[page + pd]; 5912 if (VPP_ADVICE(bvpp) == MADV_RANDOM || 5913 VPP_ADVICE(evpp) == MADV_SEQUENTIAL && delta < 0) 5914 return (-1); 5915 if (VPP_ADVICE(bvpp) != VPP_ADVICE(evpp) && 5916 VPP_ADVICE(evpp) == MADV_RANDOM) 5917 return (-1); 5918 } 5919 5920 if (svd->type == MAP_SHARED) 5921 return (0); /* shared mapping - all ok */ 5922 5923 if ((amp = svd->amp) == NULL) 5924 return (0); /* off original vnode */ 5925 5926 page += svd->anon_index; 5927 5928 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5929 5930 oap = anon_get_ptr(amp->ahp, page); 5931 ap = anon_get_ptr(amp->ahp, page + pd); 5932 5933 ANON_LOCK_EXIT(&->a_rwlock); 5934 5935 if ((oap == NULL && ap != NULL) || (oap != NULL && ap == NULL)) { 5936 return (-1); /* one with and one without an anon */ 5937 } 5938 5939 if (oap == NULL) { /* implies that ap == NULL */ 5940 return (0); /* off original vnode */ 5941 } 5942 5943 /* 5944 * Now we know we have two anon pointers - check to 5945 * see if they happen to be properly allocated. 5946 */ 5947 5948 /* 5949 * XXX We cheat here and don't lock the anon slots. We can't because 5950 * we may have been called from the anon layer which might already 5951 * have locked them. We are holding a refcnt on the slots so they 5952 * can't disappear. The worst that will happen is we'll get the wrong 5953 * names (vp, off) for the slots and make a poor klustering decision. 5954 */ 5955 swap_xlate(ap, &vp1, &off1); 5956 swap_xlate(oap, &vp2, &off2); 5957 5958 5959 if (!VOP_CMP(vp1, vp2) || off1 - off2 != delta) 5960 return (-1); 5961 return (0); 5962 } 5963 5964 /* 5965 * Swap the pages of seg out to secondary storage, returning the 5966 * number of bytes of storage freed. 5967 * 5968 * The basic idea is first to unload all translations and then to call 5969 * VOP_PUTPAGE() for all newly-unmapped pages, to push them out to the 5970 * swap device. Pages to which other segments have mappings will remain 5971 * mapped and won't be swapped. Our caller (as_swapout) has already 5972 * performed the unloading step. 5973 * 5974 * The value returned is intended to correlate well with the process's 5975 * memory requirements. However, there are some caveats: 5976 * 1) When given a shared segment as argument, this routine will 5977 * only succeed in swapping out pages for the last sharer of the 5978 * segment. 
(Previous callers will only have decremented mapping 5979 * reference counts.) 5980 * 2) We assume that the hat layer maintains a large enough translation 5981 * cache to capture process reference patterns. 5982 */ 5983 static size_t 5984 segvn_swapout(struct seg *seg) 5985 { 5986 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5987 struct anon_map *amp; 5988 pgcnt_t pgcnt = 0; 5989 pgcnt_t npages; 5990 pgcnt_t page; 5991 ulong_t anon_index; 5992 5993 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5994 5995 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 5996 /* 5997 * Find pages unmapped by our caller and force them 5998 * out to the virtual swap device. 5999 */ 6000 if ((amp = svd->amp) != NULL) 6001 anon_index = svd->anon_index; 6002 npages = seg->s_size >> PAGESHIFT; 6003 for (page = 0; page < npages; page++) { 6004 page_t *pp; 6005 struct anon *ap; 6006 struct vnode *vp; 6007 u_offset_t off; 6008 anon_sync_obj_t cookie; 6009 6010 /* 6011 * Obtain <vp, off> pair for the page, then look it up. 6012 * 6013 * Note that this code is willing to consider regular 6014 * pages as well as anon pages. Is this appropriate here? 6015 */ 6016 ap = NULL; 6017 if (amp != NULL) { 6018 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 6019 anon_array_enter(amp, anon_index + page, &cookie); 6020 ap = anon_get_ptr(amp->ahp, anon_index + page); 6021 if (ap != NULL) { 6022 swap_xlate(ap, &vp, &off); 6023 } else { 6024 vp = svd->vp; 6025 off = svd->offset + ptob(page); 6026 } 6027 anon_array_exit(&cookie); 6028 ANON_LOCK_EXIT(&->a_rwlock); 6029 } else { 6030 vp = svd->vp; 6031 off = svd->offset + ptob(page); 6032 } 6033 if (vp == NULL) { /* untouched zfod page */ 6034 ASSERT(ap == NULL); 6035 continue; 6036 } 6037 6038 pp = page_lookup_nowait(vp, off, SE_SHARED); 6039 if (pp == NULL) 6040 continue; 6041 6042 6043 /* 6044 * Examine the page to see whether it can be tossed out, 6045 * keeping track of how many we've found. 6046 */ 6047 if (!page_tryupgrade(pp)) { 6048 /* 6049 * If the page has an i/o lock and no mappings, 6050 * it's very likely that the page is being 6051 * written out as a result of klustering. 6052 * Assume this is so and take credit for it here. 6053 */ 6054 if (!page_io_trylock(pp)) { 6055 if (!hat_page_is_mapped(pp)) 6056 pgcnt++; 6057 } else { 6058 page_io_unlock(pp); 6059 } 6060 page_unlock(pp); 6061 continue; 6062 } 6063 ASSERT(!page_iolock_assert(pp)); 6064 6065 6066 /* 6067 * Skip if page is locked or has mappings. 6068 * We don't need the page_struct_lock to look at lckcnt 6069 * and cowcnt because the page is exclusive locked. 6070 */ 6071 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0 || 6072 hat_page_is_mapped(pp)) { 6073 page_unlock(pp); 6074 continue; 6075 } 6076 6077 /* 6078 * dispose skips large pages so try to demote first. 6079 */ 6080 if (pp->p_szc != 0 && !page_try_demote_pages(pp)) { 6081 page_unlock(pp); 6082 /* 6083 * XXX should skip the remaining page_t's of this 6084 * large page. 6085 */ 6086 continue; 6087 } 6088 6089 ASSERT(pp->p_szc == 0); 6090 6091 /* 6092 * No longer mapped -- we can toss it out. How 6093 * we do so depends on whether or not it's dirty. 6094 */ 6095 if (hat_ismod(pp) && pp->p_vnode) { 6096 /* 6097 * We must clean the page before it can be 6098 * freed. Setting B_FREE will cause pvn_done 6099 * to free the page when the i/o completes. 6100 * XXX: This also causes it to be accounted 6101 * as a pageout instead of a swap: need 6102 * B_SWAPOUT bit to use instead of B_FREE. 
6103 * 6104 * Hold the vnode before releasing the page lock 6105 * to prevent it from being freed and re-used by 6106 * some other thread. 6107 */ 6108 VN_HOLD(vp); 6109 page_unlock(pp); 6110 6111 /* 6112 * Queue all i/o requests for the pageout thread 6113 * to avoid saturating the pageout devices. 6114 */ 6115 if (!queue_io_request(vp, off)) 6116 VN_RELE(vp); 6117 } else { 6118 /* 6119 * The page was clean, free it. 6120 * 6121 * XXX: Can we ever encounter modified pages 6122 * with no associated vnode here? 6123 */ 6124 ASSERT(pp->p_vnode != NULL); 6125 /*LINTED: constant in conditional context*/ 6126 VN_DISPOSE(pp, B_FREE, 0, kcred); 6127 } 6128 6129 /* 6130 * Credit now even if i/o is in progress. 6131 */ 6132 pgcnt++; 6133 } 6134 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6135 6136 /* 6137 * Wakeup pageout to initiate i/o on all queued requests. 6138 */ 6139 cv_signal_pageout(); 6140 return (ptob(pgcnt)); 6141 } 6142 6143 /* 6144 * Synchronize primary storage cache with real object in virtual memory. 6145 * 6146 * XXX - Anonymous pages should not be sync'ed out at all. 6147 */ 6148 static int 6149 segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) 6150 { 6151 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6152 struct vpage *vpp; 6153 page_t *pp; 6154 u_offset_t offset; 6155 struct vnode *vp; 6156 u_offset_t off; 6157 caddr_t eaddr; 6158 int bflags; 6159 int err = 0; 6160 int segtype; 6161 int pageprot; 6162 int prot; 6163 ulong_t anon_index; 6164 struct anon_map *amp; 6165 struct anon *ap; 6166 anon_sync_obj_t cookie; 6167 6168 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6169 6170 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6171 6172 if (svd->softlockcnt > 0) { 6173 /* 6174 * flush all pages from seg cache 6175 * otherwise we may deadlock in swap_putpage 6176 * for B_INVAL page (4175402). 6177 * 6178 * Even if we grab segvn WRITER's lock or segp_slock 6179 * here, there might be another thread which could've 6180 * successfully performed lookup/insert just before 6181 * we acquired the lock here. So, grabbing either 6182 * lock here is of not much use. Until we devise 6183 * a strategy at upper layers to solve the 6184 * synchronization issues completely, we expect 6185 * applications to handle this appropriately. 6186 */ 6187 segvn_purge(seg); 6188 if (svd->softlockcnt > 0) { 6189 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6190 return (EAGAIN); 6191 } 6192 } 6193 6194 vpp = svd->vpage; 6195 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 6196 bflags = ((flags & MS_ASYNC) ? B_ASYNC : 0) | 6197 ((flags & MS_INVALIDATE) ? B_INVAL : 0); 6198 6199 if (attr) { 6200 pageprot = attr & ~(SHARED|PRIVATE); 6201 segtype = (attr & SHARED) ? MAP_SHARED : MAP_PRIVATE; 6202 6203 /* 6204 * We are done if the segment types don't match 6205 * or if we have segment level protections and 6206 * they don't match. 6207 */ 6208 if (svd->type != segtype) { 6209 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6210 return (0); 6211 } 6212 if (vpp == NULL) { 6213 if (svd->prot != pageprot) { 6214 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6215 return (0); 6216 } 6217 prot = svd->prot; 6218 } else 6219 vpp = &svd->vpage[seg_page(seg, addr)]; 6220 6221 } else if (svd->vp && svd->amp == NULL && 6222 (flags & MS_INVALIDATE) == 0) { 6223 6224 /* 6225 * No attributes, no anonymous pages and MS_INVALIDATE flag 6226 * is not on, just use one big request. 
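 *
 * Illustrative sketch only (not part of this driver): the B_ASYNC and
 * B_INVAL bflags above come straight from the MS_* flags a caller hands
 * to msync(3C).  A hypothetical userland helper that would normally end
 * up in this single-VOP_PUTPAGE path (plain file mapping, no private
 * anon copies, no MS_INVALIDATE) might look like:
 *
 *	#include <sys/mman.h>
 *	#include <errno.h>
 *
 *	// Synchronously flush a mapped file window back to its vnode.
 *	static int
 *	flush_file_window(void *addr, size_t len)
 *	{
 *		return (msync(addr, len, MS_SYNC) == 0 ? 0 : errno);
 *	}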
6227 */ 6228 err = VOP_PUTPAGE(svd->vp, (offset_t)offset, len, 6229 bflags, svd->cred); 6230 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6231 return (err); 6232 } 6233 6234 if ((amp = svd->amp) != NULL) 6235 anon_index = svd->anon_index + seg_page(seg, addr); 6236 6237 for (eaddr = addr + len; addr < eaddr; addr += PAGESIZE) { 6238 ap = NULL; 6239 if (amp != NULL) { 6240 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER); 6241 anon_array_enter(amp, anon_index, &cookie); 6242 ap = anon_get_ptr(amp->ahp, anon_index++); 6243 if (ap != NULL) { 6244 swap_xlate(ap, &vp, &off); 6245 } else { 6246 vp = svd->vp; 6247 off = offset; 6248 } 6249 anon_array_exit(&cookie); 6250 ANON_LOCK_EXIT(&amp->a_rwlock); 6251 } else { 6252 vp = svd->vp; 6253 off = offset; 6254 } 6255 offset += PAGESIZE; 6256 6257 if (vp == NULL) /* untouched zfod page */ 6258 continue; 6259 6260 if (attr) { 6261 if (vpp) { 6262 prot = VPP_PROT(vpp); 6263 vpp++; 6264 } 6265 if (prot != pageprot) { 6266 continue; 6267 } 6268 } 6269 6270 /* 6271 * See if any of these pages are locked -- if so, then we 6272 * will have to truncate an invalidate request at the first 6273 * locked one. We don't need the page_struct_lock to test 6274 * as this is only advisory; even if we acquire it someone 6275 * might race in and lock the page after we unlock and before 6276 * we do the PUTPAGE, then PUTPAGE simply does nothing. 6277 */ 6278 if (flags & MS_INVALIDATE) { 6279 if ((pp = page_lookup(vp, off, SE_SHARED)) != NULL) { 6280 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 6281 page_unlock(pp); 6282 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6283 return (EBUSY); 6284 } 6285 if (ap != NULL && pp->p_szc != 0 && 6286 page_tryupgrade(pp)) { 6287 if (pp->p_lckcnt == 0 && 6288 pp->p_cowcnt == 0) { 6289 /* 6290 * swapfs VN_DISPOSE() won't 6291 * invalidate large pages. 6292 * Attempt to demote. 6293 * XXX can't help it if it 6294 * fails. But for swapfs 6295 * pages it is no big deal. 6296 */ 6297 (void) page_try_demote_pages( 6298 pp); 6299 } 6300 } 6301 page_unlock(pp); 6302 } 6303 } else if (svd->type == MAP_SHARED && amp != NULL) { 6304 /* 6305 * Avoid writing out to disk ISM's large pages 6306 * because segspt_free_pages() relies on NULL an_pvp 6307 * of anon slots of such pages. 6308 */ 6309 6310 ASSERT(svd->vp == NULL); 6311 /* 6312 * swapfs uses page_lookup_nowait if not freeing or 6313 * invalidating and skips a page if 6314 * page_lookup_nowait returns NULL. 6315 */ 6316 pp = page_lookup_nowait(vp, off, SE_SHARED); 6317 if (pp == NULL) { 6318 continue; 6319 } 6320 if (pp->p_szc != 0) { 6321 page_unlock(pp); 6322 continue; 6323 } 6324 6325 /* 6326 * Note ISM pages are created large so (vp, off)'s 6327 * page cannot suddenly become large after we unlock 6328 * pp. 6329 */ 6330 page_unlock(pp); 6331 } 6332 /* 6333 * XXX - Should ultimately try to kluster 6334 * calls to VOP_PUTPAGE() for performance. 6335 */ 6336 VN_HOLD(vp); 6337 err = VOP_PUTPAGE(vp, (offset_t)off, PAGESIZE, 6338 bflags, svd->cred); 6339 VN_RELE(vp); 6340 if (err) 6341 break; 6342 } 6343 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6344 return (err); 6345 } 6346 6347 /* 6348 * Determine if we have data corresponding to pages in the 6349 * primary storage virtual memory cache (i.e., "in core").
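 *
 * This routine backs the mincore(2) residency query (among other
 * callers).  A hedged userland sketch follows; the helper name, header
 * set and the low-bit residency test are illustrative assumptions, see
 * mincore(2) for the authoritative interface:
 *
 *	#include <sys/types.h>
 *	#include <sys/mman.h>
 *	#include <unistd.h>
 *	#include <stdlib.h>
 *
 *	// Count how many pages of [addr, addr+len) are resident.
 *	static size_t
 *	count_resident_pages(caddr_t addr, size_t len)
 *	{
 *		long pagesize = sysconf(_SC_PAGESIZE);
 *		size_t npages = (len + pagesize - 1) / pagesize;
 *		char *vec = malloc(npages);
 *		size_t i, resident = 0;
 *
 *		if (vec == NULL)
 *			return (0);
 *		if (mincore(addr, len, vec) == 0) {
 *			for (i = 0; i < npages; i++)
 *				if (vec[i] & 1)	// low bit: page is in core
 *					resident++;
 *		}
 *		free(vec);
 *		return (resident);
 *	}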
6350 */ 6351 static size_t 6352 segvn_incore(struct seg *seg, caddr_t addr, size_t len, char *vec) 6353 { 6354 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6355 struct vnode *vp, *avp; 6356 u_offset_t offset, aoffset; 6357 size_t p, ep; 6358 int ret; 6359 struct vpage *vpp; 6360 page_t *pp; 6361 uint_t start; 6362 struct anon_map *amp; /* XXX - for locknest */ 6363 struct anon *ap; 6364 uint_t attr; 6365 anon_sync_obj_t cookie; 6366 6367 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6368 6369 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6370 if (svd->amp == NULL && svd->vp == NULL) { 6371 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6372 bzero(vec, btopr(len)); 6373 return (len); /* no anonymous pages created yet */ 6374 } 6375 6376 p = seg_page(seg, addr); 6377 ep = seg_page(seg, addr + len); 6378 start = svd->vp ? SEG_PAGE_VNODEBACKED : 0; 6379 6380 amp = svd->amp; 6381 for (; p < ep; p++, addr += PAGESIZE) { 6382 vpp = (svd->vpage) ? &svd->vpage[p]: NULL; 6383 ret = start; 6384 ap = NULL; 6385 avp = NULL; 6386 /* Grab the vnode/offset for the anon slot */ 6387 if (amp != NULL) { 6388 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 6389 anon_array_enter(amp, svd->anon_index + p, &cookie); 6390 ap = anon_get_ptr(amp->ahp, svd->anon_index + p); 6391 if (ap != NULL) { 6392 swap_xlate(ap, &avp, &aoffset); 6393 } 6394 anon_array_exit(&cookie); 6395 ANON_LOCK_EXIT(&->a_rwlock); 6396 } 6397 if ((avp != NULL) && page_exists(avp, aoffset)) { 6398 /* A page exists for the anon slot */ 6399 ret |= SEG_PAGE_INCORE; 6400 6401 /* 6402 * If page is mapped and writable 6403 */ 6404 attr = (uint_t)0; 6405 if ((hat_getattr(seg->s_as->a_hat, addr, 6406 &attr) != -1) && (attr & PROT_WRITE)) { 6407 ret |= SEG_PAGE_ANON; 6408 } 6409 /* 6410 * Don't get page_struct lock for lckcnt and cowcnt, 6411 * since this is purely advisory. 6412 */ 6413 if ((pp = page_lookup_nowait(avp, aoffset, 6414 SE_SHARED)) != NULL) { 6415 if (pp->p_lckcnt) 6416 ret |= SEG_PAGE_SOFTLOCK; 6417 if (pp->p_cowcnt) 6418 ret |= SEG_PAGE_HASCOW; 6419 page_unlock(pp); 6420 } 6421 } 6422 6423 /* Gather vnode statistics */ 6424 vp = svd->vp; 6425 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 6426 6427 if (vp != NULL) { 6428 /* 6429 * Try to obtain a "shared" lock on the page 6430 * without blocking. If this fails, determine 6431 * if the page is in memory. 6432 */ 6433 pp = page_lookup_nowait(vp, offset, SE_SHARED); 6434 if ((pp == NULL) && (page_exists(vp, offset))) { 6435 /* Page is incore, and is named */ 6436 ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE); 6437 } 6438 /* 6439 * Don't get page_struct lock for lckcnt and cowcnt, 6440 * since this is purely advisory. 6441 */ 6442 if (pp != NULL) { 6443 ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE); 6444 if (pp->p_lckcnt) 6445 ret |= SEG_PAGE_SOFTLOCK; 6446 if (pp->p_cowcnt) 6447 ret |= SEG_PAGE_HASCOW; 6448 page_unlock(pp); 6449 } 6450 } 6451 6452 /* Gather virtual page information */ 6453 if (vpp) { 6454 if (VPP_ISPPLOCK(vpp)) 6455 ret |= SEG_PAGE_LOCKED; 6456 vpp++; 6457 } 6458 6459 *vec++ = (char)ret; 6460 } 6461 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6462 return (len); 6463 } 6464 6465 /* 6466 * Statement for p_cowcnts/p_lckcnts. 
6467 * 6468 * p_cowcnt is updated while mlock/munlocking MAP_PRIVATE and PROT_WRITE region 6469 * irrespective of the following factors or anything else: 6470 * 6471 * (1) anon slots are populated or not 6472 * (2) cow is broken or not 6473 * (3) refcnt on ap is 1 or greater than 1 6474 * 6475 * If it's not MAP_PRIVATE and PROT_WRITE, p_lckcnt is updated during mlock 6476 * and munlock. 6477 * 6478 * 6479 * Handling p_cowcnts/p_lckcnts during copy-on-write fault: 6480 * 6481 * if vpage has PROT_WRITE 6482 * transfer cowcnt on the oldpage -> cowcnt on the newpage 6483 * else 6484 * transfer lckcnt on the oldpage -> lckcnt on the newpage 6485 * 6486 * During copy-on-write, decrement p_cowcnt on the oldpage and increment 6487 * p_cowcnt on the newpage *if* the corresponding vpage has PROT_WRITE. 6488 * 6489 * We may also break COW if softlocking on read access in the physio case. 6490 * In this case, vpage may not have PROT_WRITE. So, we need to decrement 6491 * p_lckcnt on the oldpage and increment p_lckcnt on the newpage *if* the 6492 * vpage doesn't have PROT_WRITE. 6493 * 6494 * 6495 * Handling p_cowcnts/p_lckcnts during mprotect on mlocked region: 6496 * 6497 * If a MAP_PRIVATE region loses PROT_WRITE, we decrement p_cowcnt and 6498 * increment p_lckcnt by calling page_subclaim() which takes care of 6499 * availrmem accounting and p_lckcnt overflow. 6500 * 6501 * If a MAP_PRIVATE region gains PROT_WRITE, we decrement p_lckcnt and 6502 * increment p_cowcnt by calling page_addclaim() which takes care of 6503 * availrmem availability and p_cowcnt overflow. 6504 */ 6505 6506 /* 6507 * Lock down (or unlock) pages mapped by this segment. 6508 * 6509 * XXX only creates PAGESIZE pages if anon slots are not initialized. 6510 * At fault time they will be relocated into larger pages. 6511 */ 6512 static int 6513 segvn_lockop(struct seg *seg, caddr_t addr, size_t len, 6514 int attr, int op, ulong_t *lockmap, size_t pos) 6515 { 6516 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6517 struct vpage *vpp; 6518 struct vpage *evp; 6519 page_t *pp; 6520 u_offset_t offset; 6521 u_offset_t off; 6522 int segtype; 6523 int pageprot; 6524 int claim; 6525 struct vnode *vp; 6526 ulong_t anon_index; 6527 struct anon_map *amp; 6528 struct anon *ap; 6529 struct vattr va; 6530 anon_sync_obj_t cookie; 6531 6532 /* 6533 * Hold write lock on address space because may split or concatenate 6534 * segments 6535 */ 6536 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6537 6538 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 6539 if (attr) { 6540 pageprot = attr & ~(SHARED|PRIVATE); 6541 segtype = attr & SHARED ? MAP_SHARED : MAP_PRIVATE; 6542 6543 /* 6544 * We are done if the segment types don't match 6545 * or if we have segment level protections and 6546 * they don't match. 6547 */ 6548 if (svd->type != segtype) { 6549 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6550 return (0); 6551 } 6552 if (svd->pageprot == 0 && svd->prot != pageprot) { 6553 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6554 return (0); 6555 } 6556 } 6557 6558 /* 6559 * If we're locking, then we must create a vpage structure if 6560 * none exists. If we're unlocking, then check to see if there 6561 * is a vpage -- if not, then we could not have locked anything. 
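 *
 * Illustrative sketch only: this op is driven by mlock(3C), munlock(3C)
 * and memcntl(2) MC_LOCK/MC_UNLOCK requests from userland.  A
 * hypothetical caller that pins a buffer for the duration of some
 * latency-critical work might do:
 *
 *	#include <sys/mman.h>
 *
 *	// Lock the buffer's pages in memory, run the work, then unlock.
 *	static int
 *	with_pinned_buffer(void *buf, size_t len, void (*work)(void *))
 *	{
 *		if (mlock(buf, len) != 0)
 *			return (-1);
 *		work(buf);
 *		return (munlock(buf, len));
 *	}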
6562 */ 6563 6564 if ((vpp = svd->vpage) == NULL) { 6565 if (op == MC_LOCK) 6566 segvn_vpage(seg); 6567 else { 6568 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6569 return (0); 6570 } 6571 } 6572 6573 /* 6574 * The anonymous data vector (i.e., previously 6575 * unreferenced mapping to swap space) can be allocated 6576 * by lazily testing for its existence. 6577 */ 6578 if (op == MC_LOCK && svd->amp == NULL && svd->vp == NULL) { 6579 svd->amp = anonmap_alloc(seg->s_size, 0); 6580 svd->amp->a_szc = seg->s_szc; 6581 } 6582 6583 if ((amp = svd->amp) != NULL) { 6584 anon_index = svd->anon_index + seg_page(seg, addr); 6585 } 6586 6587 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 6588 evp = &svd->vpage[seg_page(seg, addr + len)]; 6589 6590 /* 6591 * Loop over all pages in the range. Process if we're locking and 6592 * page has not already been locked in this mapping; or if we're 6593 * unlocking and the page has been locked. 6594 */ 6595 for (vpp = &svd->vpage[seg_page(seg, addr)]; vpp < evp; 6596 vpp++, pos++, addr += PAGESIZE, offset += PAGESIZE, anon_index++) { 6597 if ((attr == 0 || VPP_PROT(vpp) == pageprot) && 6598 ((op == MC_LOCK && !VPP_ISPPLOCK(vpp)) || 6599 (op == MC_UNLOCK && VPP_ISPPLOCK(vpp)))) { 6600 6601 if (amp != NULL) 6602 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 6603 /* 6604 * If this isn't a MAP_NORESERVE segment and 6605 * we're locking, allocate anon slots if they 6606 * don't exist. The page is brought in later on. 6607 */ 6608 if (op == MC_LOCK && svd->vp == NULL && 6609 ((svd->flags & MAP_NORESERVE) == 0) && 6610 amp != NULL && 6611 ((ap = anon_get_ptr(amp->ahp, anon_index)) 6612 == NULL)) { 6613 anon_array_enter(amp, anon_index, &cookie); 6614 6615 if ((ap = anon_get_ptr(amp->ahp, 6616 anon_index)) == NULL) { 6617 pp = anon_zero(seg, addr, &ap, 6618 svd->cred); 6619 if (pp == NULL) { 6620 anon_array_exit(&cookie); 6621 ANON_LOCK_EXIT(&->a_rwlock); 6622 SEGVN_LOCK_EXIT(seg->s_as, 6623 &svd->lock); 6624 return (ENOMEM); 6625 } 6626 ASSERT(anon_get_ptr(amp->ahp, 6627 anon_index) == NULL); 6628 (void) anon_set_ptr(amp->ahp, 6629 anon_index, ap, ANON_SLEEP); 6630 page_unlock(pp); 6631 } 6632 anon_array_exit(&cookie); 6633 } 6634 6635 /* 6636 * Get name for page, accounting for 6637 * existence of private copy. 6638 */ 6639 ap = NULL; 6640 if (amp != NULL) { 6641 anon_array_enter(amp, anon_index, &cookie); 6642 ap = anon_get_ptr(amp->ahp, anon_index); 6643 if (ap != NULL) { 6644 swap_xlate(ap, &vp, &off); 6645 } else { 6646 if (svd->vp == NULL && 6647 (svd->flags & MAP_NORESERVE)) { 6648 anon_array_exit(&cookie); 6649 ANON_LOCK_EXIT(&->a_rwlock); 6650 continue; 6651 } 6652 vp = svd->vp; 6653 off = offset; 6654 } 6655 anon_array_exit(&cookie); 6656 ANON_LOCK_EXIT(&->a_rwlock); 6657 } else { 6658 vp = svd->vp; 6659 off = offset; 6660 } 6661 6662 /* 6663 * Get page frame. It's ok if the page is 6664 * not available when we're unlocking, as this 6665 * may simply mean that a page we locked got 6666 * truncated out of existence after we locked it. 6667 * 6668 * Invoke VOP_GETPAGE() to obtain the page struct 6669 * since we may need to read it from disk if its 6670 * been paged out. 
6671 */ 6672 if (op != MC_LOCK) 6673 pp = page_lookup(vp, off, SE_SHARED); 6674 else { 6675 page_t *pl[1 + 1]; 6676 int error; 6677 6678 ASSERT(vp != NULL); 6679 6680 error = VOP_GETPAGE(vp, (offset_t)off, PAGESIZE, 6681 (uint_t *)NULL, pl, PAGESIZE, seg, addr, 6682 S_OTHER, svd->cred); 6683 6684 /* 6685 * If the error is EDEADLK then we must bounce 6686 * up and drop all vm subsystem locks and then 6687 * retry the operation later 6688 * This behavior is a temporary measure because 6689 * ufs/sds logging is badly designed and will 6690 * deadlock if we don't allow this bounce to 6691 * happen. The real solution is to re-design 6692 * the logging code to work properly. See bug 6693 * 4125102 for details of the problem. 6694 */ 6695 if (error == EDEADLK) { 6696 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6697 return (error); 6698 } 6699 /* 6700 * Quit if we fail to fault in the page. Treat 6701 * the failure as an error, unless the addr 6702 * is mapped beyond the end of a file. 6703 */ 6704 if (error && svd->vp) { 6705 va.va_mask = AT_SIZE; 6706 if (VOP_GETATTR(svd->vp, &va, 0, 6707 svd->cred) != 0) { 6708 SEGVN_LOCK_EXIT(seg->s_as, 6709 &svd->lock); 6710 return (EIO); 6711 } 6712 if (btopr(va.va_size) >= 6713 btopr(off + 1)) { 6714 SEGVN_LOCK_EXIT(seg->s_as, 6715 &svd->lock); 6716 return (EIO); 6717 } 6718 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6719 return (0); 6720 } else if (error) { 6721 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6722 return (EIO); 6723 } 6724 pp = pl[0]; 6725 ASSERT(pp != NULL); 6726 } 6727 6728 /* 6729 * See Statement at the beginning of this routine. 6730 * 6731 * claim is always set if MAP_PRIVATE and PROT_WRITE 6732 * irrespective of following factors: 6733 * 6734 * (1) anon slots are populated or not 6735 * (2) cow is broken or not 6736 * (3) refcnt on ap is 1 or greater than 1 6737 * 6738 * See 4140683 for details 6739 */ 6740 claim = ((VPP_PROT(vpp) & PROT_WRITE) && 6741 (svd->type == MAP_PRIVATE)); 6742 6743 /* 6744 * Perform page-level operation appropriate to 6745 * operation. If locking, undo the SOFTLOCK 6746 * performed to bring the page into memory 6747 * after setting the lock. If unlocking, 6748 * and no page was found, account for the claim 6749 * separately. 6750 */ 6751 if (op == MC_LOCK) { 6752 int ret = 1; /* Assume success */ 6753 6754 /* 6755 * Make sure another thread didn't lock 6756 * the page after we released the segment 6757 * lock. 
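 *
 * The recheck that follows is the usual drop-the-lock, do blocking work,
 * reacquire, then revalidate pattern.  A generic userland analogue of the
 * same idea (all names here are illustrative, not part of this driver):
 *
 *	#include <pthread.h>
 *
 *	static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;
 *	static int resource_ready;		// protected by lk
 *
 *	static void
 *	ensure_ready(void (*slow_init)(void))
 *	{
 *		pthread_mutex_lock(&lk);
 *		while (!resource_ready) {
 *			pthread_mutex_unlock(&lk);
 *			slow_init();		// may block; lock is dropped
 *			pthread_mutex_lock(&lk);
 *			if (!resource_ready)	// recheck: another thread may
 *				resource_ready = 1;	// have finished first
 *		}
 *		pthread_mutex_unlock(&lk);
 *	}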
6758 */ 6759 if ((attr == 0 || VPP_PROT(vpp) == pageprot) && 6760 !VPP_ISPPLOCK(vpp)) { 6761 ret = page_pp_lock(pp, claim, 0); 6762 if (ret != 0) { 6763 VPP_SETPPLOCK(vpp); 6764 if (lockmap != (ulong_t *)NULL) 6765 BT_SET(lockmap, pos); 6766 } 6767 } 6768 page_unlock(pp); 6769 if (ret == 0) { 6770 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6771 return (EAGAIN); 6772 } 6773 } else { 6774 if (pp != NULL) { 6775 if ((attr == 0 || 6776 VPP_PROT(vpp) == pageprot) && 6777 VPP_ISPPLOCK(vpp)) 6778 page_pp_unlock(pp, claim, 0); 6779 page_unlock(pp); 6780 } 6781 VPP_CLRPPLOCK(vpp); 6782 } 6783 } 6784 } 6785 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6786 return (0); 6787 } 6788 6789 /* 6790 * Set advice from user for specified pages 6791 * There are 5 types of advice: 6792 * MADV_NORMAL - Normal (default) behavior (whatever that is) 6793 * MADV_RANDOM - Random page references 6794 * do not allow readahead or 'klustering' 6795 * MADV_SEQUENTIAL - Sequential page references 6796 * Pages previous to the one currently being 6797 * accessed (determined by fault) are 'not needed' 6798 * and are freed immediately 6799 * MADV_WILLNEED - Pages are likely to be used (fault ahead in mctl) 6800 * MADV_DONTNEED - Pages are not needed (synced out in mctl) 6801 * MADV_FREE - Contents can be discarded 6802 * MADV_ACCESS_DEFAULT- Default access 6803 * MADV_ACCESS_LWP - Next LWP will access heavily 6804 * MADV_ACCESS_MANY- Many LWPs or processes will access heavily 6805 */ 6806 static int 6807 segvn_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav) 6808 { 6809 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6810 size_t page; 6811 int err = 0; 6812 int already_set; 6813 struct anon_map *amp; 6814 ulong_t anon_index; 6815 struct seg *next; 6816 lgrp_mem_policy_t policy; 6817 struct seg *prev; 6818 struct vnode *vp; 6819 6820 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6821 6822 /* 6823 * In case of MADV_FREE, we won't be modifying any segment private 6824 * data structures; so, we only need to grab READER's lock 6825 */ 6826 if (behav != MADV_FREE) 6827 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 6828 else 6829 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6830 6831 /* 6832 * Large pages are assumed to be only turned on when accesses to the 6833 * segment's address range have spatial and temporal locality. That 6834 * justifies ignoring MADV_SEQUENTIAL for large page segments. 6835 * Also, ignore advice affecting lgroup memory allocation 6836 * if don't need to do lgroup optimizations on this system 6837 */ 6838 6839 if ((behav == MADV_SEQUENTIAL && seg->s_szc != 0) || 6840 (!lgrp_optimizations() && (behav == MADV_ACCESS_DEFAULT || 6841 behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY))) { 6842 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6843 return (0); 6844 } 6845 6846 if (behav == MADV_SEQUENTIAL || behav == MADV_ACCESS_DEFAULT || 6847 behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY) { 6848 /* 6849 * Since we are going to unload hat mappings 6850 * we first have to flush the cache. Otherwise 6851 * this might lead to system panic if another 6852 * thread is doing physio on the range whose 6853 * mappings are unloaded by madvise(3C). 6854 */ 6855 if (svd->softlockcnt > 0) { 6856 /* 6857 * Since we do have the segvn writers lock 6858 * nobody can fill the cache with entries 6859 * belonging to this seg during the purge. 6860 * The flush either succeeds or we still 6861 * have pending I/Os. In the later case, 6862 * madvise(3C) fails. 
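 *
 * Illustrative sketch only: from userland this failure surfaces as
 * madvise(3C) returning -1 with errno set to EAGAIN, so a caller that
 * cares can simply retry (the helper name and retry count are
 * hypothetical):
 *
 *	#include <sys/mman.h>
 *	#include <errno.h>
 *
 *	static int
 *	advise_sequential(caddr_t addr, size_t len)
 *	{
 *		int tries = 3;
 *
 *		while (madvise(addr, len, MADV_SEQUENTIAL) != 0) {
 *			if (errno != EAGAIN || --tries == 0)
 *				return (-1);
 *		}
 *		return (0);
 *	}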
6863 */ 6864 segvn_purge(seg); 6865 if (svd->softlockcnt > 0) { 6866 /* 6867 * Since madvise(3C) is advisory and 6868 * it's not part of UNIX98, madvise(3C) 6869 * failure here doesn't cause any hardship. 6870 * Note that we don't block in "as" layer. 6871 */ 6872 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6873 return (EAGAIN); 6874 } 6875 } 6876 } 6877 6878 amp = svd->amp; 6879 vp = svd->vp; 6880 if (behav == MADV_FREE) { 6881 /* 6882 * MADV_FREE is not supported for segments with 6883 * underlying object; if anonmap is NULL, anon slots 6884 * are not yet populated and there is nothing for 6885 * us to do. As MADV_FREE is advisory, we don't 6886 * return error in either case. 6887 */ 6888 if (vp || amp == NULL) { 6889 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6890 return (0); 6891 } 6892 6893 page = seg_page(seg, addr); 6894 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 6895 anon_disclaim(amp, svd->anon_index + page, len, 0); 6896 ANON_LOCK_EXIT(&->a_rwlock); 6897 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6898 return (0); 6899 } 6900 6901 /* 6902 * If advice is to be applied to entire segment, 6903 * use advice field in seg_data structure 6904 * otherwise use appropriate vpage entry. 6905 */ 6906 if ((addr == seg->s_base) && (len == seg->s_size)) { 6907 switch (behav) { 6908 case MADV_ACCESS_LWP: 6909 case MADV_ACCESS_MANY: 6910 case MADV_ACCESS_DEFAULT: 6911 /* 6912 * Set memory allocation policy for this segment 6913 */ 6914 policy = lgrp_madv_to_policy(behav, len, svd->type); 6915 if (svd->type == MAP_SHARED) 6916 already_set = lgrp_shm_policy_set(policy, amp, 6917 svd->anon_index, vp, svd->offset, len); 6918 else { 6919 /* 6920 * For private memory, need writers lock on 6921 * address space because the segment may be 6922 * split or concatenated when changing policy 6923 */ 6924 if (AS_READ_HELD(seg->s_as, 6925 &seg->s_as->a_lock)) { 6926 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6927 return (IE_RETRY); 6928 } 6929 6930 already_set = lgrp_privm_policy_set(policy, 6931 &svd->policy_info, len); 6932 } 6933 6934 /* 6935 * If policy set already and it shouldn't be reapplied, 6936 * don't do anything. 6937 */ 6938 if (already_set && 6939 !LGRP_MEM_POLICY_REAPPLICABLE(policy)) 6940 break; 6941 6942 /* 6943 * Mark any existing pages in given range for 6944 * migration 6945 */ 6946 page_mark_migrate(seg, addr, len, amp, svd->anon_index, 6947 vp, svd->offset, 1); 6948 6949 /* 6950 * If same policy set already or this is a shared 6951 * memory segment, don't need to try to concatenate 6952 * segment with adjacent ones. 6953 */ 6954 if (already_set || svd->type == MAP_SHARED) 6955 break; 6956 6957 /* 6958 * Try to concatenate this segment with previous 6959 * one and next one, since we changed policy for 6960 * this one and it may be compatible with adjacent 6961 * ones now. 
6962 */ 6963 prev = AS_SEGPREV(seg->s_as, seg); 6964 next = AS_SEGNEXT(seg->s_as, seg); 6965 6966 if (next && next->s_ops == &segvn_ops && 6967 addr + len == next->s_base) 6968 (void) segvn_concat(seg, next, 1); 6969 6970 if (prev && prev->s_ops == &segvn_ops && 6971 addr == prev->s_base + prev->s_size) { 6972 /* 6973 * Drop lock for private data of current 6974 * segment before concatenating (deleting) it 6975 * and return IE_REATTACH to tell as_ctl() that 6976 * current segment has changed 6977 */ 6978 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6979 if (!segvn_concat(prev, seg, 1)) 6980 err = IE_REATTACH; 6981 6982 return (err); 6983 } 6984 break; 6985 6986 case MADV_SEQUENTIAL: 6987 /* 6988 * unloading mapping guarantees 6989 * detection in segvn_fault 6990 */ 6991 ASSERT(seg->s_szc == 0); 6992 hat_unload(seg->s_as->a_hat, addr, len, 6993 HAT_UNLOAD); 6994 /* FALLTHROUGH */ 6995 case MADV_NORMAL: 6996 case MADV_RANDOM: 6997 svd->advice = (uchar_t)behav; 6998 svd->pageadvice = 0; 6999 break; 7000 case MADV_WILLNEED: /* handled in memcntl */ 7001 case MADV_DONTNEED: /* handled in memcntl */ 7002 case MADV_FREE: /* handled above */ 7003 break; 7004 default: 7005 err = EINVAL; 7006 } 7007 } else { 7008 caddr_t eaddr; 7009 struct seg *new_seg; 7010 struct segvn_data *new_svd; 7011 u_offset_t off; 7012 caddr_t oldeaddr; 7013 7014 page = seg_page(seg, addr); 7015 7016 segvn_vpage(seg); 7017 7018 switch (behav) { 7019 struct vpage *bvpp, *evpp; 7020 7021 case MADV_ACCESS_LWP: 7022 case MADV_ACCESS_MANY: 7023 case MADV_ACCESS_DEFAULT: 7024 /* 7025 * Set memory allocation policy for portion of this 7026 * segment 7027 */ 7028 7029 /* 7030 * Align address and length of advice to page 7031 * boundaries for large pages 7032 */ 7033 if (seg->s_szc != 0) { 7034 size_t pgsz; 7035 7036 pgsz = page_get_pagesize(seg->s_szc); 7037 addr = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 7038 len = P2ROUNDUP(len, pgsz); 7039 } 7040 7041 /* 7042 * Check to see whether policy is set already 7043 */ 7044 policy = lgrp_madv_to_policy(behav, len, svd->type); 7045 7046 anon_index = svd->anon_index + page; 7047 off = svd->offset + (uintptr_t)(addr - seg->s_base); 7048 7049 if (svd->type == MAP_SHARED) 7050 already_set = lgrp_shm_policy_set(policy, amp, 7051 anon_index, vp, off, len); 7052 else 7053 already_set = 7054 (policy == svd->policy_info.mem_policy); 7055 7056 /* 7057 * If policy set already and it shouldn't be reapplied, 7058 * don't do anything. 
7059 */ 7060 if (already_set && 7061 !LGRP_MEM_POLICY_REAPPLICABLE(policy)) 7062 break; 7063 7064 /* 7065 * For private memory, need writers lock on 7066 * address space because the segment may be 7067 * split or concatenated when changing policy 7068 */ 7069 if (svd->type == MAP_PRIVATE && 7070 AS_READ_HELD(seg->s_as, &seg->s_as->a_lock)) { 7071 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7072 return (IE_RETRY); 7073 } 7074 7075 /* 7076 * Mark any existing pages in given range for 7077 * migration 7078 */ 7079 page_mark_migrate(seg, addr, len, amp, svd->anon_index, 7080 vp, svd->offset, 1); 7081 7082 /* 7083 * Don't need to try to split or concatenate 7084 * segments, since policy is same or this is a shared 7085 * memory segment 7086 */ 7087 if (already_set || svd->type == MAP_SHARED) 7088 break; 7089 7090 /* 7091 * Split off new segment if advice only applies to a 7092 * portion of existing segment starting in middle 7093 */ 7094 new_seg = NULL; 7095 eaddr = addr + len; 7096 oldeaddr = seg->s_base + seg->s_size; 7097 if (addr > seg->s_base) { 7098 /* 7099 * Must flush I/O page cache 7100 * before splitting segment 7101 */ 7102 if (svd->softlockcnt > 0) 7103 segvn_purge(seg); 7104 7105 /* 7106 * Split segment and return IE_REATTACH to tell 7107 * as_ctl() that current segment changed 7108 */ 7109 new_seg = segvn_split_seg(seg, addr); 7110 new_svd = (struct segvn_data *)new_seg->s_data; 7111 err = IE_REATTACH; 7112 7113 /* 7114 * If new segment ends where old one 7115 * did, try to concatenate the new 7116 * segment with next one. 7117 */ 7118 if (eaddr == oldeaddr) { 7119 /* 7120 * Set policy for new segment 7121 */ 7122 (void) lgrp_privm_policy_set(policy, 7123 &new_svd->policy_info, 7124 new_seg->s_size); 7125 7126 next = AS_SEGNEXT(new_seg->s_as, 7127 new_seg); 7128 7129 if (next && 7130 next->s_ops == &segvn_ops && 7131 eaddr == next->s_base) 7132 (void) segvn_concat(new_seg, 7133 next, 1); 7134 } 7135 } 7136 7137 /* 7138 * Split off end of existing segment if advice only 7139 * applies to a portion of segment ending before 7140 * end of the existing segment 7141 */ 7142 if (eaddr < oldeaddr) { 7143 /* 7144 * Must flush I/O page cache 7145 * before splitting segment 7146 */ 7147 if (svd->softlockcnt > 0) 7148 segvn_purge(seg); 7149 7150 /* 7151 * If beginning of old segment was already 7152 * split off, use new segment to split end off 7153 * from. 7154 */ 7155 if (new_seg != NULL && new_seg != seg) { 7156 /* 7157 * Split segment 7158 */ 7159 (void) segvn_split_seg(new_seg, eaddr); 7160 7161 /* 7162 * Set policy for new segment 7163 */ 7164 (void) lgrp_privm_policy_set(policy, 7165 &new_svd->policy_info, 7166 new_seg->s_size); 7167 } else { 7168 /* 7169 * Split segment and return IE_REATTACH 7170 * to tell as_ctl() that current 7171 * segment changed 7172 */ 7173 (void) segvn_split_seg(seg, eaddr); 7174 err = IE_REATTACH; 7175 7176 (void) lgrp_privm_policy_set(policy, 7177 &svd->policy_info, seg->s_size); 7178 7179 /* 7180 * If new segment starts where old one 7181 * did, try to concatenate it with 7182 * previous segment. 
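 *
 * Illustrative sketch only: callers never see these splits and
 * concatenations.  Applying lgroup advice to a sub-range of a larger
 * private mapping, as below, simply succeeds while segvn rearranges the
 * segments under the covers (names are hypothetical):
 *
 *	#include <sys/mman.h>
 *
 *	// Hint that the calling LWP will heavily access the middle
 *	// portion [base+off, base+off+len) of an existing mapping.
 *	static int
 *	advise_hot_window(caddr_t base, size_t off, size_t len)
 *	{
 *		return (madvise(base + off, len, MADV_ACCESS_LWP));
 *	}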
7183 */ 7184 if (addr == seg->s_base) { 7185 prev = AS_SEGPREV(seg->s_as, 7186 seg); 7187 7188 /* 7189 * Drop lock for private data 7190 * of current segment before 7191 * concatenating (deleting) it 7192 */ 7193 if (prev && 7194 prev->s_ops == 7195 &segvn_ops && 7196 addr == prev->s_base + 7197 prev->s_size) { 7198 SEGVN_LOCK_EXIT( 7199 seg->s_as, 7200 &svd->lock); 7201 (void) segvn_concat( 7202 prev, seg, 1); 7203 return (err); 7204 } 7205 } 7206 } 7207 } 7208 break; 7209 case MADV_SEQUENTIAL: 7210 ASSERT(seg->s_szc == 0); 7211 hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD); 7212 /* FALLTHROUGH */ 7213 case MADV_NORMAL: 7214 case MADV_RANDOM: 7215 bvpp = &svd->vpage[page]; 7216 evpp = &svd->vpage[page + (len >> PAGESHIFT)]; 7217 for (; bvpp < evpp; bvpp++) 7218 VPP_SETADVICE(bvpp, behav); 7219 svd->advice = MADV_NORMAL; 7220 break; 7221 case MADV_WILLNEED: /* handled in memcntl */ 7222 case MADV_DONTNEED: /* handled in memcntl */ 7223 case MADV_FREE: /* handled above */ 7224 break; 7225 default: 7226 err = EINVAL; 7227 } 7228 } 7229 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7230 return (err); 7231 } 7232 7233 /* 7234 * Create a vpage structure for this seg. 7235 */ 7236 static void 7237 segvn_vpage(struct seg *seg) 7238 { 7239 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7240 struct vpage *vp, *evp; 7241 7242 ASSERT(SEGVN_WRITE_HELD(seg->s_as, &svd->lock)); 7243 7244 /* 7245 * If no vpage structure exists, allocate one. Copy the protections 7246 * and the advice from the segment itself to the individual pages. 7247 */ 7248 if (svd->vpage == NULL) { 7249 svd->pageprot = 1; 7250 svd->pageadvice = 1; 7251 svd->vpage = kmem_zalloc(seg_pages(seg) * sizeof (struct vpage), 7252 KM_SLEEP); 7253 evp = &svd->vpage[seg_page(seg, seg->s_base + seg->s_size)]; 7254 for (vp = svd->vpage; vp < evp; vp++) { 7255 VPP_SETPROT(vp, svd->prot); 7256 VPP_SETADVICE(vp, svd->advice); 7257 } 7258 } 7259 } 7260 7261 /* 7262 * Dump the pages belonging to this segvn segment. 7263 */ 7264 static void 7265 segvn_dump(struct seg *seg) 7266 { 7267 struct segvn_data *svd; 7268 page_t *pp; 7269 struct anon_map *amp; 7270 ulong_t anon_index; 7271 struct vnode *vp; 7272 u_offset_t off, offset; 7273 pfn_t pfn; 7274 pgcnt_t page, npages; 7275 caddr_t addr; 7276 7277 npages = seg_pages(seg); 7278 svd = (struct segvn_data *)seg->s_data; 7279 vp = svd->vp; 7280 off = offset = svd->offset; 7281 addr = seg->s_base; 7282 7283 if ((amp = svd->amp) != NULL) { 7284 anon_index = svd->anon_index; 7285 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7286 } 7287 7288 for (page = 0; page < npages; page++, offset += PAGESIZE) { 7289 struct anon *ap; 7290 int we_own_it = 0; 7291 7292 if (amp && (ap = anon_get_ptr(svd->amp->ahp, anon_index++))) { 7293 swap_xlate_nopanic(ap, &vp, &off); 7294 } else { 7295 vp = svd->vp; 7296 off = offset; 7297 } 7298 7299 /* 7300 * If pp == NULL, the page either does not exist 7301 * or is exclusively locked. So determine if it 7302 * exists before searching for it. 7303 */ 7304 7305 if ((pp = page_lookup_nowait(vp, off, SE_SHARED))) 7306 we_own_it = 1; 7307 else 7308 pp = page_exists(vp, off); 7309 7310 if (pp) { 7311 pfn = page_pptonum(pp); 7312 dump_addpage(seg->s_as, addr, pfn); 7313 if (we_own_it) 7314 page_unlock(pp); 7315 } 7316 addr += PAGESIZE; 7317 dump_timeleft = dump_timeout; 7318 } 7319 7320 if (amp != NULL) 7321 ANON_LOCK_EXIT(&->a_rwlock); 7322 } 7323 7324 /* 7325 * lock/unlock anon pages over a given range. 
Return shadow list 7326 */ 7327 static int 7328 segvn_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***ppp, 7329 enum lock_type type, enum seg_rw rw) 7330 { 7331 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7332 size_t np, adjustpages = 0, npages = (len >> PAGESHIFT); 7333 ulong_t anon_index; 7334 uint_t protchk; 7335 uint_t error; 7336 struct anon_map *amp; 7337 struct page **pplist, **pl, *pp; 7338 caddr_t a; 7339 size_t page; 7340 caddr_t lpgaddr, lpgeaddr; 7341 7342 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_START, 7343 "segvn_pagelock: start seg %p addr %p", seg, addr); 7344 7345 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 7346 if (seg->s_szc != 0 && (type == L_PAGELOCK || type == L_PAGEUNLOCK)) { 7347 /* 7348 * We are adjusting the pagelock region to the large page size 7349 * boundary because the unlocked part of a large page cannot 7350 * be freed anyway unless all constituent pages of a large 7351 * page are locked. Therefore this adjustment allows us to 7352 * decrement availrmem by the right value (note we don't want 7353 * to just decrement availrem by the large page size without 7354 * adjusting addr and len because then we may end up 7355 * decrementing availrmem by large page size for every 7356 * constituent page locked by a new as_pagelock call). 7357 * as_pageunlock caller must always match as_pagelock call's 7358 * addr and len. 7359 * 7360 * Note segment's page size cannot change while we are holding 7361 * as lock. And then it cannot change while softlockcnt is 7362 * not 0. This will allow us to correctly recalculate large 7363 * page size region for the matching pageunlock/reclaim call. 7364 * 7365 * for pageunlock *ppp points to the pointer of page_t that 7366 * corresponds to the real unadjusted start address. Similar 7367 * for pagelock *ppp must point to the pointer of page_t that 7368 * corresponds to the real unadjusted start address. 7369 */ 7370 size_t pgsz = page_get_pagesize(seg->s_szc); 7371 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 7372 adjustpages = ((uintptr_t)(addr - lpgaddr)) >> PAGESHIFT; 7373 } 7374 7375 if (type == L_PAGEUNLOCK) { 7376 7377 /* 7378 * update hat ref bits for /proc. We need to make sure 7379 * that threads tracing the ref and mod bits of the 7380 * address space get the right data. 7381 * Note: page ref and mod bits are updated at reclaim time 7382 */ 7383 if (seg->s_as->a_vbits) { 7384 for (a = addr; a < addr + len; a += PAGESIZE) { 7385 if (rw == S_WRITE) { 7386 hat_setstat(seg->s_as, a, 7387 PAGESIZE, P_REF | P_MOD); 7388 } else { 7389 hat_setstat(seg->s_as, a, 7390 PAGESIZE, P_REF); 7391 } 7392 } 7393 } 7394 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7395 if (seg->s_szc != 0) { 7396 VM_STAT_ADD(segvnvmstats.pagelock[0]); 7397 seg_pinactive(seg, lpgaddr, lpgeaddr - lpgaddr, 7398 *ppp - adjustpages, rw, segvn_reclaim); 7399 } else { 7400 seg_pinactive(seg, addr, len, *ppp, rw, segvn_reclaim); 7401 } 7402 7403 /* 7404 * If someone is blocked while unmapping, we purge 7405 * segment page cache and thus reclaim pplist synchronously 7406 * without waiting for seg_pasync_thread. This speeds up 7407 * unmapping in cases where munmap(2) is called, while 7408 * raw async i/o is still in progress or where a thread 7409 * exits on data fault in a multithreaded application. 
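 *
 * Illustrative sketch only: one common way to end up in this path is
 * raw asynchronous I/O, which pins the user buffer via as_pagelock()
 * for the duration of the transfer.  A minimal POSIX AIO write,
 * assuming fd refers to a character (raw) device already opened by the
 * caller:
 *
 *	#include <sys/types.h>
 *	#include <aio.h>
 *	#include <string.h>
 *
 *	static int
 *	start_raw_write(int fd, void *buf, size_t len, off_t off)
 *	{
 *		static struct aiocb cb;	// must outlive the request
 *
 *		(void) memset(&cb, 0, sizeof (cb));
 *		cb.aio_fildes = fd;
 *		cb.aio_buf = buf;
 *		cb.aio_nbytes = len;
 *		cb.aio_offset = off;
 *		return (aio_write(&cb));	// completion via aio_error()
 *	}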
7410 */ 7411 if (AS_ISUNMAPWAIT(seg->s_as) && (svd->softlockcnt > 0)) { 7412 /* 7413 * Even if we grab segvn WRITER's lock or segp_slock 7414 * here, there might be another thread which could've 7415 * successfully performed lookup/insert just before 7416 * we acquired the lock here. So, grabbing either 7417 * lock here is of not much use. Until we devise 7418 * a strategy at upper layers to solve the 7419 * synchronization issues completely, we expect 7420 * applications to handle this appropriately. 7421 */ 7422 segvn_purge(seg); 7423 } 7424 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7425 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END, 7426 "segvn_pagelock: unlock seg %p addr %p", seg, addr); 7427 return (0); 7428 } else if (type == L_PAGERECLAIM) { 7429 VM_STAT_COND_ADD(seg->s_szc != 0, segvnvmstats.pagelock[1]); 7430 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7431 (void) segvn_reclaim(seg, addr, len, *ppp, rw); 7432 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7433 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END, 7434 "segvn_pagelock: reclaim seg %p addr %p", seg, addr); 7435 return (0); 7436 } 7437 7438 if (seg->s_szc != 0) { 7439 VM_STAT_ADD(segvnvmstats.pagelock[2]); 7440 addr = lpgaddr; 7441 len = lpgeaddr - lpgaddr; 7442 npages = (len >> PAGESHIFT); 7443 } 7444 7445 /* 7446 * For now we only support pagelock for anon memory. We would have to 7447 * check protections for vnode objects and call into the vnode driver; 7448 * that's too much for a fast path, so let the fault entry point handle it. 7449 */ 7450 if (svd->vp != NULL) { 7451 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, 7452 "segvn_pagelock: mapped vnode seg %p addr %p", seg, addr); 7453 *ppp = NULL; 7454 return (ENOTSUP); 7455 } 7456 7457 /* 7458 * if anonmap is not yet created, let the fault entry point populate it 7459 * with anon ptrs.
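 *
 * Illustrative sketch only: a caller who wants the fast path on the very
 * first I/O can pre-create the anon slots simply by touching (or locking)
 * the buffer before handing it to the raw I/O interface:
 *
 *	#include <sys/mman.h>
 *	#include <string.h>
 *
 *	static void *
 *	alloc_io_buffer(size_t len)
 *	{
 *		void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		    MAP_PRIVATE | MAP_ANON, -1, 0);
 *
 *		if (buf == MAP_FAILED)
 *			return (NULL);
 *		// Fault every page in so the anon map gets populated.
 *		(void) memset(buf, 0, len);
 *		return (buf);
 *	}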
7460 */ 7461 if ((amp = svd->amp) == NULL) { 7462 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, 7463 "segvn_pagelock: anonmap null seg %p addr %p", seg, addr); 7464 *ppp = NULL; 7465 return (EFAULT); 7466 } 7467 7468 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7469 7470 /* 7471 * we acquire segp_slock to prevent duplicate entries 7472 * in seg_pcache 7473 */ 7474 mutex_enter(&svd->segp_slock); 7475 7476 /* 7477 * try to find pages in segment page cache 7478 */ 7479 pplist = seg_plookup(seg, addr, len, rw); 7480 if (pplist != NULL) { 7481 mutex_exit(&svd->segp_slock); 7482 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7483 *ppp = pplist + adjustpages; 7484 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_HIT_END, 7485 "segvn_pagelock: cache hit seg %p addr %p", seg, addr); 7486 return (0); 7487 } 7488 7489 if (rw == S_READ) { 7490 protchk = PROT_READ; 7491 } else { 7492 protchk = PROT_WRITE; 7493 } 7494 7495 if (svd->pageprot == 0) { 7496 if ((svd->prot & protchk) == 0) { 7497 mutex_exit(&svd->segp_slock); 7498 error = EFAULT; 7499 goto out; 7500 } 7501 } else { 7502 /* 7503 * check page protections 7504 */ 7505 for (a = addr; a < addr + len; a += PAGESIZE) { 7506 struct vpage *vp; 7507 7508 vp = &svd->vpage[seg_page(seg, a)]; 7509 if ((VPP_PROT(vp) & protchk) == 0) { 7510 mutex_exit(&svd->segp_slock); 7511 error = EFAULT; 7512 goto out; 7513 } 7514 } 7515 } 7516 7517 mutex_enter(&freemem_lock); 7518 if (availrmem < tune.t_minarmem + npages) { 7519 mutex_exit(&freemem_lock); 7520 mutex_exit(&svd->segp_slock); 7521 error = ENOMEM; 7522 goto out; 7523 } else { 7524 svd->softlockcnt += npages; 7525 availrmem -= npages; 7526 segvn_pages_locked += npages; 7527 } 7528 mutex_exit(&freemem_lock); 7529 7530 pplist = kmem_alloc(sizeof (page_t *) * npages, KM_SLEEP); 7531 pl = pplist; 7532 *ppp = pplist + adjustpages; 7533 7534 page = seg_page(seg, addr); 7535 anon_index = svd->anon_index + page; 7536 7537 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7538 for (a = addr; a < addr + len; a += PAGESIZE, anon_index++) { 7539 struct anon *ap; 7540 struct vnode *vp; 7541 u_offset_t off; 7542 anon_sync_obj_t cookie; 7543 7544 anon_array_enter(amp, anon_index, &cookie); 7545 ap = anon_get_ptr(amp->ahp, anon_index); 7546 if (ap == NULL) { 7547 anon_array_exit(&cookie); 7548 break; 7549 } else { 7550 /* 7551 * We must never use seg_pcache for COW pages 7552 * because we might end up with original page still 7553 * lying in seg_pcache even after private page is 7554 * created. This leads to data corruption as 7555 * aio_write refers to the page still in cache 7556 * while all other accesses refer to the private 7557 * page. 
7558 */ 7559 if (ap->an_refcnt != 1) { 7560 anon_array_exit(&cookie); 7561 break; 7562 } 7563 } 7564 swap_xlate(ap, &vp, &off); 7565 anon_array_exit(&cookie); 7566 7567 pp = page_lookup_nowait(vp, off, SE_SHARED); 7568 if (pp == NULL) { 7569 break; 7570 } 7571 *pplist++ = pp; 7572 } 7573 ANON_LOCK_EXIT(&->a_rwlock); 7574 7575 if (a >= addr + len) { 7576 (void) seg_pinsert(seg, addr, len, pl, rw, SEGP_ASYNC_FLUSH, 7577 segvn_reclaim); 7578 mutex_exit(&svd->segp_slock); 7579 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7580 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_FILL_END, 7581 "segvn_pagelock: cache fill seg %p addr %p", seg, addr); 7582 return (0); 7583 } 7584 7585 mutex_exit(&svd->segp_slock); 7586 error = EFAULT; 7587 pplist = pl; 7588 np = ((uintptr_t)(a - addr)) >> PAGESHIFT; 7589 while (np > (uint_t)0) { 7590 page_unlock(*pplist); 7591 np--; 7592 pplist++; 7593 } 7594 kmem_free(pl, sizeof (page_t *) * npages); 7595 mutex_enter(&freemem_lock); 7596 svd->softlockcnt -= npages; 7597 availrmem += npages; 7598 segvn_pages_locked -= npages; 7599 mutex_exit(&freemem_lock); 7600 if (svd->softlockcnt <= 0) { 7601 if (AS_ISUNMAPWAIT(seg->s_as)) { 7602 mutex_enter(&seg->s_as->a_contents); 7603 if (AS_ISUNMAPWAIT(seg->s_as)) { 7604 AS_CLRUNMAPWAIT(seg->s_as); 7605 cv_broadcast(&seg->s_as->a_cv); 7606 } 7607 mutex_exit(&seg->s_as->a_contents); 7608 } 7609 } 7610 7611 out: 7612 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7613 *ppp = NULL; 7614 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, 7615 "segvn_pagelock: cache miss seg %p addr %p", seg, addr); 7616 return (error); 7617 } 7618 7619 /* 7620 * purge any cached pages in the I/O page cache 7621 */ 7622 static void 7623 segvn_purge(struct seg *seg) 7624 { 7625 seg_ppurge(seg); 7626 } 7627 7628 static int 7629 segvn_reclaim(struct seg *seg, caddr_t addr, size_t len, struct page **pplist, 7630 enum seg_rw rw) 7631 { 7632 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7633 pgcnt_t np, npages; 7634 struct page **pl; 7635 7636 #ifdef lint 7637 addr = addr; 7638 #endif 7639 7640 npages = np = (len >> PAGESHIFT); 7641 ASSERT(npages); 7642 pl = pplist; 7643 if (seg->s_szc != 0) { 7644 size_t pgsz = page_get_pagesize(seg->s_szc); 7645 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { 7646 panic("segvn_reclaim: unaligned addr or len"); 7647 /*NOTREACHED*/ 7648 } 7649 } 7650 7651 while (np > (uint_t)0) { 7652 if (rw == S_WRITE) { 7653 hat_setrefmod(*pplist); 7654 } else { 7655 hat_setref(*pplist); 7656 } 7657 page_unlock(*pplist); 7658 np--; 7659 pplist++; 7660 } 7661 kmem_free(pl, sizeof (page_t *) * npages); 7662 7663 mutex_enter(&freemem_lock); 7664 availrmem += npages; 7665 segvn_pages_locked -= npages; 7666 svd->softlockcnt -= npages; 7667 mutex_exit(&freemem_lock); 7668 if (svd->softlockcnt <= 0) { 7669 if (AS_ISUNMAPWAIT(seg->s_as)) { 7670 mutex_enter(&seg->s_as->a_contents); 7671 if (AS_ISUNMAPWAIT(seg->s_as)) { 7672 AS_CLRUNMAPWAIT(seg->s_as); 7673 cv_broadcast(&seg->s_as->a_cv); 7674 } 7675 mutex_exit(&seg->s_as->a_contents); 7676 } 7677 } 7678 return (0); 7679 } 7680 /* 7681 * get a memory ID for an addr in a given segment 7682 * 7683 * XXX only creates PAGESIZE pages if anon slots are not initialized. 7684 * At fault time they will be relocated into larger pages. 
7685 */ 7686 static int 7687 segvn_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp) 7688 { 7689 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7690 struct anon *ap = NULL; 7691 ulong_t anon_index; 7692 struct anon_map *amp; 7693 anon_sync_obj_t cookie; 7694 7695 if (svd->type == MAP_PRIVATE) { 7696 memidp->val[0] = (uintptr_t)seg->s_as; 7697 memidp->val[1] = (uintptr_t)addr; 7698 return (0); 7699 } 7700 7701 if (svd->type == MAP_SHARED) { 7702 if (svd->vp) { 7703 memidp->val[0] = (uintptr_t)svd->vp; 7704 memidp->val[1] = (u_longlong_t)svd->offset + 7705 (uintptr_t)(addr - seg->s_base); 7706 return (0); 7707 } else { 7708 7709 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7710 if ((amp = svd->amp) != NULL) { 7711 anon_index = svd->anon_index + 7712 seg_page(seg, addr); 7713 } 7714 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7715 7716 ASSERT(amp != NULL); 7717 7718 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7719 anon_array_enter(amp, anon_index, &cookie); 7720 ap = anon_get_ptr(amp->ahp, anon_index); 7721 if (ap == NULL) { 7722 page_t *pp; 7723 7724 pp = anon_zero(seg, addr, &ap, svd->cred); 7725 if (pp == NULL) { 7726 anon_array_exit(&cookie); 7727 ANON_LOCK_EXIT(&->a_rwlock); 7728 return (ENOMEM); 7729 } 7730 ASSERT(anon_get_ptr(amp->ahp, anon_index) 7731 == NULL); 7732 (void) anon_set_ptr(amp->ahp, anon_index, 7733 ap, ANON_SLEEP); 7734 page_unlock(pp); 7735 } 7736 7737 anon_array_exit(&cookie); 7738 ANON_LOCK_EXIT(&->a_rwlock); 7739 7740 memidp->val[0] = (uintptr_t)ap; 7741 memidp->val[1] = (uintptr_t)addr & PAGEOFFSET; 7742 return (0); 7743 } 7744 } 7745 return (EINVAL); 7746 } 7747 7748 static int 7749 sameprot(struct seg *seg, caddr_t a, size_t len) 7750 { 7751 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7752 struct vpage *vpage; 7753 spgcnt_t pages = btop(len); 7754 uint_t prot; 7755 7756 if (svd->pageprot == 0) 7757 return (1); 7758 7759 ASSERT(svd->vpage != NULL); 7760 7761 vpage = &svd->vpage[seg_page(seg, a)]; 7762 prot = VPP_PROT(vpage); 7763 vpage++; 7764 pages--; 7765 while (pages-- > 0) { 7766 if (prot != VPP_PROT(vpage)) 7767 return (0); 7768 vpage++; 7769 } 7770 return (1); 7771 } 7772 7773 /* 7774 * Get memory allocation policy info for specified address in given segment 7775 */ 7776 static lgrp_mem_policy_info_t * 7777 segvn_getpolicy(struct seg *seg, caddr_t addr) 7778 { 7779 struct anon_map *amp; 7780 ulong_t anon_index; 7781 lgrp_mem_policy_info_t *policy_info; 7782 struct segvn_data *svn_data; 7783 u_offset_t vn_off; 7784 vnode_t *vp; 7785 7786 ASSERT(seg != NULL); 7787 7788 svn_data = (struct segvn_data *)seg->s_data; 7789 if (svn_data == NULL) 7790 return (NULL); 7791 7792 /* 7793 * Get policy info for private or shared memory 7794 */ 7795 if (svn_data->type != MAP_SHARED) 7796 policy_info = &svn_data->policy_info; 7797 else { 7798 amp = svn_data->amp; 7799 anon_index = svn_data->anon_index + seg_page(seg, addr); 7800 vp = svn_data->vp; 7801 vn_off = svn_data->offset + (uintptr_t)(addr - seg->s_base); 7802 policy_info = lgrp_shm_policy_get(amp, anon_index, vp, vn_off); 7803 } 7804 7805 return (policy_info); 7806 } 7807
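/*
 * Illustrative sketch only (not part of this driver): the policy that
 * segvn_getpolicy() reports is typically established from userland with
 * the MADV_ACCESS_* advice handled in segvn_advise() above.  Assuming a
 * private anonymous mapping, a hypothetical helper might look like:
 *
 *	#include <sys/mman.h>
 *
 *	// Hint that many LWPs or processes will access this range heavily,
 *	// letting the kernel spread the backing pages across lgroups.
 *	static int
 *	spread_across_lgroups(caddr_t addr, size_t len)
 *	{
 *		return (madvise(addr, len, MADV_ACCESS_MANY));
 *	}
 */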