1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 37 */ 38 39 #pragma ident "%Z%%M% %I% %E% SMI" 40 41 /* 42 * VM - shared or copy-on-write from a vnode/anonymous memory. 
43 */ 44 45 #include <sys/types.h> 46 #include <sys/param.h> 47 #include <sys/t_lock.h> 48 #include <sys/errno.h> 49 #include <sys/systm.h> 50 #include <sys/mman.h> 51 #include <sys/debug.h> 52 #include <sys/cred.h> 53 #include <sys/vmsystm.h> 54 #include <sys/tuneable.h> 55 #include <sys/bitmap.h> 56 #include <sys/swap.h> 57 #include <sys/kmem.h> 58 #include <sys/sysmacros.h> 59 #include <sys/vtrace.h> 60 #include <sys/cmn_err.h> 61 #include <sys/vm.h> 62 #include <sys/dumphdr.h> 63 #include <sys/lgrp.h> 64 65 #include <vm/hat.h> 66 #include <vm/as.h> 67 #include <vm/seg.h> 68 #include <vm/seg_vn.h> 69 #include <vm/pvn.h> 70 #include <vm/anon.h> 71 #include <vm/page.h> 72 #include <vm/vpage.h> 73 #include <sys/proc.h> 74 #include <sys/task.h> 75 #include <sys/project.h> 76 #include <sys/zone.h> 77 #include <sys/shm_impl.h> 78 /* 79 * Private seg op routines. 80 */ 81 static int segvn_dup(struct seg *seg, struct seg *newseg); 82 static int segvn_unmap(struct seg *seg, caddr_t addr, size_t len); 83 static void segvn_free(struct seg *seg); 84 static faultcode_t segvn_fault(struct hat *hat, struct seg *seg, 85 caddr_t addr, size_t len, enum fault_type type, 86 enum seg_rw rw); 87 static faultcode_t segvn_faulta(struct seg *seg, caddr_t addr); 88 static int segvn_setprot(struct seg *seg, caddr_t addr, 89 size_t len, uint_t prot); 90 static int segvn_checkprot(struct seg *seg, caddr_t addr, 91 size_t len, uint_t prot); 92 static int segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta); 93 static size_t segvn_swapout(struct seg *seg); 94 static int segvn_sync(struct seg *seg, caddr_t addr, size_t len, 95 int attr, uint_t flags); 96 static size_t segvn_incore(struct seg *seg, caddr_t addr, size_t len, 97 char *vec); 98 static int segvn_lockop(struct seg *seg, caddr_t addr, size_t len, 99 int attr, int op, ulong_t *lockmap, size_t pos); 100 static int segvn_getprot(struct seg *seg, caddr_t addr, size_t len, 101 uint_t *protv); 102 static u_offset_t 
segvn_getoffset(struct seg *seg, caddr_t addr); 103 static int segvn_gettype(struct seg *seg, caddr_t addr); 104 static int segvn_getvp(struct seg *seg, caddr_t addr, 105 struct vnode **vpp); 106 static int segvn_advise(struct seg *seg, caddr_t addr, size_t len, 107 uint_t behav); 108 static void segvn_dump(struct seg *seg); 109 static int segvn_pagelock(struct seg *seg, caddr_t addr, size_t len, 110 struct page ***ppp, enum lock_type type, enum seg_rw rw); 111 static int segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len, 112 uint_t szc); 113 static int segvn_getmemid(struct seg *seg, caddr_t addr, 114 memid_t *memidp); 115 static lgrp_mem_policy_info_t *segvn_getpolicy(struct seg *, caddr_t); 116 static int segvn_capable(struct seg *seg, segcapability_t capable); 117 118 struct seg_ops segvn_ops = { 119 segvn_dup, 120 segvn_unmap, 121 segvn_free, 122 segvn_fault, 123 segvn_faulta, 124 segvn_setprot, 125 segvn_checkprot, 126 segvn_kluster, 127 segvn_swapout, 128 segvn_sync, 129 segvn_incore, 130 segvn_lockop, 131 segvn_getprot, 132 segvn_getoffset, 133 segvn_gettype, 134 segvn_getvp, 135 segvn_advise, 136 segvn_dump, 137 segvn_pagelock, 138 segvn_setpagesize, 139 segvn_getmemid, 140 segvn_getpolicy, 141 segvn_capable, 142 }; 143 144 /* 145 * Common zfod structures, provided as a shorthand for others to use. 
146 */ 147 static segvn_crargs_t zfod_segvn_crargs = 148 SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL); 149 static segvn_crargs_t kzfod_segvn_crargs = 150 SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_USER, 151 PROT_ALL & ~PROT_USER); 152 static segvn_crargs_t stack_noexec_crargs = 153 SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_EXEC, PROT_ALL); 154 155 caddr_t zfod_argsp = (caddr_t)&zfod_segvn_crargs; /* user zfod argsp */ 156 caddr_t kzfod_argsp = (caddr_t)&kzfod_segvn_crargs; /* kernel zfod argsp */ 157 caddr_t stack_exec_argsp = (caddr_t)&zfod_segvn_crargs; /* executable stack */ 158 caddr_t stack_noexec_argsp = (caddr_t)&stack_noexec_crargs; /* noexec stack */ 159 160 #define vpgtob(n) ((n) * sizeof (struct vpage)) /* For brevity */ 161 162 size_t segvn_comb_thrshld = UINT_MAX; /* patchable -- see 1196681 */ 163 164 static int segvn_concat(struct seg *, struct seg *, int); 165 static int segvn_extend_prev(struct seg *, struct seg *, 166 struct segvn_crargs *, size_t); 167 static int segvn_extend_next(struct seg *, struct seg *, 168 struct segvn_crargs *, size_t); 169 static void segvn_softunlock(struct seg *, caddr_t, size_t, enum seg_rw); 170 static void segvn_pagelist_rele(page_t **); 171 static void segvn_setvnode_mpss(vnode_t *); 172 static void segvn_relocate_pages(page_t **, page_t *); 173 static int segvn_full_szcpages(page_t **, uint_t, int *, uint_t *); 174 static int segvn_fill_vp_pages(struct segvn_data *, vnode_t *, u_offset_t, 175 uint_t, page_t **, page_t **, uint_t *, int *); 176 static faultcode_t segvn_fault_vnodepages(struct hat *, struct seg *, caddr_t, 177 caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int); 178 static faultcode_t segvn_fault_anonpages(struct hat *, struct seg *, caddr_t, 179 caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int); 180 static faultcode_t segvn_faultpage(struct hat *, struct seg *, caddr_t, 181 u_offset_t, struct vpage *, page_t **, uint_t, 182 enum fault_type, enum seg_rw, int, int); 183 static void segvn_vpage(struct seg 
*);

/* Pagelock (seg_pcache) maintenance and reclaim callbacks. */
static void segvn_purge(struct seg *seg);
static int segvn_reclaim(struct seg *, caddr_t, size_t, struct page **,
    enum seg_rw);

static int sameprot(struct seg *, caddr_t, size_t);

/* Large-page demotion / size-code manipulation helpers. */
static int segvn_demote_range(struct seg *, caddr_t, size_t, int, uint_t);
static int segvn_clrszc(struct seg *);
static struct seg *segvn_split_seg(struct seg *, caddr_t);
static int segvn_claim_pages(struct seg *, struct vpage *, u_offset_t,
    ulong_t, uint_t);

static int segvn_pp_lock_anonpages(page_t *, int);
static void segvn_pp_unlock_anonpages(page_t *, int);

/* kmem cache from which all segvn_data structures are allocated. */
static struct kmem_cache *segvn_cache;

#ifdef VM_STATS
/*
 * Event counters for the various fault/relocation paths; array sizes
 * match the number of distinct count points in each code path.
 */
static struct segvnvmstats_str {
	ulong_t	fill_vp_pages[31];
	ulong_t	fltvnpages[49];
	ulong_t	fullszcpages[10];
	ulong_t	relocatepages[3];
	ulong_t	fltanpages[17];
	ulong_t	pagelock[3];
	ulong_t	demoterange[3];
} segvnvmstats;
#endif /* VM_STATS */

#define	SDR_RANGE	1		/* demote entire range */
#define	SDR_END		2		/* demote non aligned ends only */

/*
 * Compute the large-page-aligned region [lpgaddr, lpgeaddr) that fully
 * covers [addr, addr + len) for page size pgsz.  A zero len collapses
 * both bounds to addr.
 */
#define	CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr) {	\
	if ((len) != 0) {						\
		lpgaddr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz);	\
		ASSERT(lpgaddr >= (seg)->s_base);			\
		lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)((addr) +	\
		    (len)), pgsz);					\
		ASSERT(lpgeaddr > lpgaddr);				\
		ASSERT(lpgeaddr <= (seg)->s_base + (seg)->s_size);	\
	} else {							\
		lpgeaddr = lpgaddr = (addr);				\
	}								\
}

/*
 * kmem cache constructor: initialize the locks embedded in a
 * segvn_data once per cache object; segvn_create() fills in the rest.
 */
/*ARGSUSED*/
static int
segvn_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct segvn_data *svd = buf;

	/* per-segment rwlock protecting the segvn_data fields */
	rw_init(&svd->lock, NULL, RW_DEFAULT, NULL);
	/* NOTE(review): name suggests it serializes seg_pcache ops — verify */
	mutex_init(&svd->segp_slock, NULL, MUTEX_DEFAULT, NULL);
	return (0);
}

/*
 * kmem cache destructor: tear down the locks set up by the constructor.
 */
/*ARGSUSED1*/
static void
segvn_cache_destructor(void *buf, void *cdrarg)
{
	struct segvn_data *svd = buf;

	rw_destroy(&svd->lock);
	mutex_destroy(&svd->segp_slock);
}

/* 252 * Patching this variable to non-zero allows the system to run with 253 * stacks marked as "not executable". It's a bit of a kludge, but is 254 * provided as a tweakable for platforms that export those ABIs 255 * (e.g. sparc V8) that have executable stacks enabled by default. 256 * There are also some restrictions for platforms that don't actually 257 * implement 'noexec' protections. 258 * 259 * Once enabled, the system is (therefore) unable to provide a fully 260 * ABI-compliant execution environment, though practically speaking, 261 * most everything works. The exceptions are generally some interpreters 262 * and debuggers that create executable code on the stack and jump 263 * into it (without explicitly mprotecting the address range to include 264 * PROT_EXEC). 265 * 266 * One important class of applications that are disabled are those 267 * that have been transformed into malicious agents using one of the 268 * numerous "buffer overflow" attacks. See 4007890. 269 */ 270 int noexec_user_stack = 0; 271 int noexec_user_stack_log = 1; 272 273 int segvn_lpg_disable = 0; 274 uint_t segvn_maxpgszc = 0; 275 276 ulong_t segvn_vmpss_clrszc_cnt; 277 ulong_t segvn_vmpss_clrszc_err; 278 ulong_t segvn_fltvnpages_clrszc_cnt; 279 ulong_t segvn_fltvnpages_clrszc_err; 280 ulong_t segvn_setpgsz_align_err; 281 ulong_t segvn_setpgsz_anon_align_err; 282 ulong_t segvn_setpgsz_getattr_err; 283 ulong_t segvn_setpgsz_eof_err; 284 ulong_t segvn_faultvnmpss_align_err1; 285 ulong_t segvn_faultvnmpss_align_err2; 286 ulong_t segvn_faultvnmpss_align_err3; 287 ulong_t segvn_faultvnmpss_align_err4; 288 ulong_t segvn_faultvnmpss_align_err5; 289 ulong_t segvn_vmpss_pageio_deadlk_err; 290 291 /* 292 * Initialize segvn data structures 293 */ 294 void 295 segvn_init(void) 296 { 297 uint_t maxszc; 298 uint_t szc; 299 size_t pgsz; 300 301 segvn_cache = kmem_cache_create("segvn_cache", 302 sizeof (struct segvn_data), 0, 303 segvn_cache_constructor, segvn_cache_destructor, NULL, 304 NULL, NULL, 
0); 305 306 if (segvn_lpg_disable != 0) 307 return; 308 szc = maxszc = page_num_pagesizes() - 1; 309 if (szc == 0) { 310 segvn_lpg_disable = 1; 311 return; 312 } 313 if (page_get_pagesize(0) != PAGESIZE) { 314 panic("segvn_init: bad szc 0"); 315 /*NOTREACHED*/ 316 } 317 while (szc != 0) { 318 pgsz = page_get_pagesize(szc); 319 if (pgsz <= PAGESIZE || !IS_P2ALIGNED(pgsz, pgsz)) { 320 panic("segvn_init: bad szc %d", szc); 321 /*NOTREACHED*/ 322 } 323 szc--; 324 } 325 if (segvn_maxpgszc == 0 || segvn_maxpgszc > maxszc) 326 segvn_maxpgszc = maxszc; 327 } 328 329 #define SEGVN_PAGEIO ((void *)0x1) 330 #define SEGVN_NOPAGEIO ((void *)0x2) 331 332 static void 333 segvn_setvnode_mpss(vnode_t *vp) 334 { 335 int err; 336 337 ASSERT(vp->v_mpssdata == NULL || 338 vp->v_mpssdata == SEGVN_PAGEIO || 339 vp->v_mpssdata == SEGVN_NOPAGEIO); 340 341 if (vp->v_mpssdata == NULL) { 342 if (vn_vmpss_usepageio(vp)) { 343 err = VOP_PAGEIO(vp, (page_t *)NULL, 344 (u_offset_t)0, 0, 0, CRED()); 345 } else { 346 err = ENOSYS; 347 } 348 /* 349 * set v_mpssdata just once per vnode life 350 * so that it never changes. 351 */ 352 mutex_enter(&vp->v_lock); 353 if (vp->v_mpssdata == NULL) { 354 if (err == EINVAL) { 355 vp->v_mpssdata = SEGVN_PAGEIO; 356 } else { 357 vp->v_mpssdata = SEGVN_NOPAGEIO; 358 } 359 } 360 mutex_exit(&vp->v_lock); 361 } 362 } 363 364 int 365 segvn_create(struct seg *seg, void *argsp) 366 { 367 struct segvn_crargs *a = (struct segvn_crargs *)argsp; 368 struct segvn_data *svd; 369 size_t swresv = 0; 370 struct cred *cred; 371 struct anon_map *amp; 372 int error = 0; 373 size_t pgsz; 374 lgrp_mem_policy_t mpolicy = LGRP_MEM_POLICY_DEFAULT; 375 376 377 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 378 379 if (a->type != MAP_PRIVATE && a->type != MAP_SHARED) { 380 panic("segvn_create type"); 381 /*NOTREACHED*/ 382 } 383 384 /* 385 * Check arguments. If a shared anon structure is given then 386 * it is illegal to also specify a vp. 
387 */ 388 if (a->amp != NULL && a->vp != NULL) { 389 panic("segvn_create anon_map"); 390 /*NOTREACHED*/ 391 } 392 393 /* MAP_NORESERVE on a MAP_SHARED segment is meaningless. */ 394 if (a->type == MAP_SHARED) 395 a->flags &= ~MAP_NORESERVE; 396 397 if (a->szc != 0) { 398 if (segvn_lpg_disable != 0 || (a->szc == AS_MAP_NO_LPOOB) || 399 (a->amp != NULL && a->type == MAP_PRIVATE) || 400 (a->flags & MAP_NORESERVE) || seg->s_as == &kas) { 401 a->szc = 0; 402 } else { 403 if (a->szc > segvn_maxpgszc) 404 a->szc = segvn_maxpgszc; 405 pgsz = page_get_pagesize(a->szc); 406 if (!IS_P2ALIGNED(seg->s_base, pgsz) || 407 !IS_P2ALIGNED(seg->s_size, pgsz)) { 408 a->szc = 0; 409 } else if (a->vp != NULL) { 410 extern struct vnode kvp; 411 if (IS_SWAPFSVP(a->vp) || VN_ISKAS(a->vp)) { 412 /* 413 * paranoid check. 414 * hat_page_demote() is not supported 415 * on swapfs pages. 416 */ 417 a->szc = 0; 418 } else if (map_addr_vacalign_check(seg->s_base, 419 a->offset & PAGEMASK)) { 420 a->szc = 0; 421 } 422 } else if (a->amp != NULL) { 423 pgcnt_t anum = btopr(a->offset); 424 pgcnt_t pgcnt = page_get_pagecnt(a->szc); 425 if (!IS_P2ALIGNED(anum, pgcnt)) { 426 a->szc = 0; 427 } 428 } 429 } 430 } 431 432 /* 433 * If segment may need private pages, reserve them now. 434 */ 435 if (!(a->flags & MAP_NORESERVE) && ((a->vp == NULL && a->amp == NULL) || 436 (a->type == MAP_PRIVATE && (a->prot & PROT_WRITE)))) { 437 if (anon_resv(seg->s_size) == 0) 438 return (EAGAIN); 439 swresv = seg->s_size; 440 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 441 seg, swresv, 1); 442 } 443 444 /* 445 * Reserve any mapping structures that may be required. 
446 */ 447 hat_map(seg->s_as->a_hat, seg->s_base, seg->s_size, HAT_MAP); 448 449 if (a->cred) { 450 cred = a->cred; 451 crhold(cred); 452 } else { 453 crhold(cred = CRED()); 454 } 455 456 /* Inform the vnode of the new mapping */ 457 if (a->vp) { 458 error = VOP_ADDMAP(a->vp, a->offset & PAGEMASK, 459 seg->s_as, seg->s_base, seg->s_size, a->prot, 460 a->maxprot, a->type, cred); 461 if (error) { 462 if (swresv != 0) { 463 anon_unresv(swresv); 464 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 465 "anon proc:%p %lu %u", 466 seg, swresv, 0); 467 } 468 crfree(cred); 469 hat_unload(seg->s_as->a_hat, seg->s_base, 470 seg->s_size, HAT_UNLOAD_UNMAP); 471 return (error); 472 } 473 } 474 475 /* 476 * If more than one segment in the address space, and 477 * they're adjacent virtually, try to concatenate them. 478 * Don't concatenate if an explicit anon_map structure 479 * was supplied (e.g., SystemV shared memory). 480 */ 481 if (a->amp == NULL) { 482 struct seg *pseg, *nseg; 483 struct segvn_data *psvd, *nsvd; 484 lgrp_mem_policy_t ppolicy, npolicy; 485 uint_t lgrp_mem_policy_flags = 0; 486 extern lgrp_mem_policy_t lgrp_mem_default_policy; 487 488 /* 489 * Memory policy flags (lgrp_mem_policy_flags) is valid when 490 * extending stack/heap segments. 491 */ 492 if ((a->vp == NULL) && (a->type == MAP_PRIVATE) && 493 !(a->flags & MAP_NORESERVE) && (seg->s_as != &kas)) { 494 lgrp_mem_policy_flags = a->lgrp_mem_policy_flags; 495 } else { 496 /* 497 * Get policy when not extending it from another segment 498 */ 499 mpolicy = lgrp_mem_policy_default(seg->s_size, a->type); 500 } 501 502 /* 503 * First, try to concatenate the previous and new segments 504 */ 505 pseg = AS_SEGPREV(seg->s_as, seg); 506 if (pseg != NULL && 507 pseg->s_base + pseg->s_size == seg->s_base && 508 pseg->s_ops == &segvn_ops) { 509 /* 510 * Get memory allocation policy from previous segment. 511 * When extension is specified (e.g. 
for heap) apply 512 * this policy to the new segment regardless of the 513 * outcome of segment concatenation. Extension occurs 514 * for non-default policy otherwise default policy is 515 * used and is based on extended segment size. 516 */ 517 psvd = (struct segvn_data *)pseg->s_data; 518 ppolicy = psvd->policy_info.mem_policy; 519 if (lgrp_mem_policy_flags == 520 LGRP_MP_FLAG_EXTEND_UP) { 521 if (ppolicy != lgrp_mem_default_policy) { 522 mpolicy = ppolicy; 523 } else { 524 mpolicy = lgrp_mem_policy_default( 525 pseg->s_size + seg->s_size, 526 a->type); 527 } 528 } 529 530 if (mpolicy == ppolicy && 531 (pseg->s_size + seg->s_size <= 532 segvn_comb_thrshld || psvd->amp == NULL) && 533 segvn_extend_prev(pseg, seg, a, swresv) == 0) { 534 /* 535 * success! now try to concatenate 536 * with following seg 537 */ 538 crfree(cred); 539 nseg = AS_SEGNEXT(pseg->s_as, pseg); 540 if (nseg != NULL && 541 nseg != pseg && 542 nseg->s_ops == &segvn_ops && 543 pseg->s_base + pseg->s_size == 544 nseg->s_base) 545 (void) segvn_concat(pseg, nseg, 0); 546 ASSERT(pseg->s_szc == 0 || 547 (a->szc == pseg->s_szc && 548 IS_P2ALIGNED(pseg->s_base, pgsz) && 549 IS_P2ALIGNED(pseg->s_size, pgsz))); 550 return (0); 551 } 552 } 553 554 /* 555 * Failed, so try to concatenate with following seg 556 */ 557 nseg = AS_SEGNEXT(seg->s_as, seg); 558 if (nseg != NULL && 559 seg->s_base + seg->s_size == nseg->s_base && 560 nseg->s_ops == &segvn_ops) { 561 /* 562 * Get memory allocation policy from next segment. 563 * When extension is specified (e.g. for stack) apply 564 * this policy to the new segment regardless of the 565 * outcome of segment concatenation. Extension occurs 566 * for non-default policy otherwise default policy is 567 * used and is based on extended segment size. 
568 */ 569 nsvd = (struct segvn_data *)nseg->s_data; 570 npolicy = nsvd->policy_info.mem_policy; 571 if (lgrp_mem_policy_flags == 572 LGRP_MP_FLAG_EXTEND_DOWN) { 573 if (npolicy != lgrp_mem_default_policy) { 574 mpolicy = npolicy; 575 } else { 576 mpolicy = lgrp_mem_policy_default( 577 nseg->s_size + seg->s_size, 578 a->type); 579 } 580 } 581 582 if (mpolicy == npolicy && 583 segvn_extend_next(seg, nseg, a, swresv) == 0) { 584 crfree(cred); 585 ASSERT(nseg->s_szc == 0 || 586 (a->szc == nseg->s_szc && 587 IS_P2ALIGNED(nseg->s_base, pgsz) && 588 IS_P2ALIGNED(nseg->s_size, pgsz))); 589 return (0); 590 } 591 } 592 } 593 594 if (a->vp != NULL) { 595 VN_HOLD(a->vp); 596 if (a->type == MAP_SHARED) 597 lgrp_shm_policy_init(NULL, a->vp); 598 } 599 svd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 600 601 seg->s_ops = &segvn_ops; 602 seg->s_data = (void *)svd; 603 seg->s_szc = a->szc; 604 605 svd->vp = a->vp; 606 /* 607 * Anonymous mappings have no backing file so the offset is meaningless. 608 */ 609 svd->offset = a->vp ? (a->offset & PAGEMASK) : 0; 610 svd->prot = a->prot; 611 svd->maxprot = a->maxprot; 612 svd->pageprot = 0; 613 svd->type = a->type; 614 svd->vpage = NULL; 615 svd->cred = cred; 616 svd->advice = MADV_NORMAL; 617 svd->pageadvice = 0; 618 svd->flags = (ushort_t)a->flags; 619 svd->softlockcnt = 0; 620 if (a->szc != 0 && a->vp != NULL) { 621 segvn_setvnode_mpss(a->vp); 622 } 623 624 amp = a->amp; 625 if ((svd->amp = amp) == NULL) { 626 svd->anon_index = 0; 627 if (svd->type == MAP_SHARED) { 628 svd->swresv = 0; 629 /* 630 * Shared mappings to a vp need no other setup. 631 * If we have a shared mapping to an anon_map object 632 * which hasn't been allocated yet, allocate the 633 * struct now so that it will be properly shared 634 * by remembering the swap reservation there. 635 */ 636 if (a->vp == NULL) { 637 svd->amp = anonmap_alloc(seg->s_size, swresv); 638 svd->amp->a_szc = seg->s_szc; 639 } 640 } else { 641 /* 642 * Private mapping (with or without a vp). 
643 * Allocate anon_map when needed. 644 */ 645 svd->swresv = swresv; 646 } 647 } else { 648 pgcnt_t anon_num; 649 650 /* 651 * Mapping to an existing anon_map structure without a vp. 652 * For now we will insure that the segment size isn't larger 653 * than the size - offset gives us. Later on we may wish to 654 * have the anon array dynamically allocated itself so that 655 * we don't always have to allocate all the anon pointer slots. 656 * This of course involves adding extra code to check that we 657 * aren't trying to use an anon pointer slot beyond the end 658 * of the currently allocated anon array. 659 */ 660 if ((amp->size - a->offset) < seg->s_size) { 661 panic("segvn_create anon_map size"); 662 /*NOTREACHED*/ 663 } 664 665 anon_num = btopr(a->offset); 666 667 if (a->type == MAP_SHARED) { 668 /* 669 * SHARED mapping to a given anon_map. 670 */ 671 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 672 amp->refcnt++; 673 if (a->szc > amp->a_szc) { 674 amp->a_szc = a->szc; 675 } 676 ANON_LOCK_EXIT(&->a_rwlock); 677 svd->anon_index = anon_num; 678 svd->swresv = 0; 679 } else { 680 /* 681 * PRIVATE mapping to a given anon_map. 682 * Make sure that all the needed anon 683 * structures are created (so that we will 684 * share the underlying pages if nothing 685 * is written by this mapping) and then 686 * duplicate the anon array as is done 687 * when a privately mapped segment is dup'ed. 688 */ 689 struct anon *ap; 690 caddr_t addr; 691 caddr_t eaddr; 692 ulong_t anon_idx; 693 int hat_flag = HAT_LOAD; 694 695 if (svd->flags & MAP_TEXT) { 696 hat_flag |= HAT_LOAD_TEXT; 697 } 698 699 svd->amp = anonmap_alloc(seg->s_size, 0); 700 svd->amp->a_szc = seg->s_szc; 701 svd->anon_index = 0; 702 svd->swresv = swresv; 703 704 /* 705 * Prevent 2 threads from allocating anon 706 * slots simultaneously. 
707 */ 708 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 709 eaddr = seg->s_base + seg->s_size; 710 711 for (anon_idx = anon_num, addr = seg->s_base; 712 addr < eaddr; addr += PAGESIZE, anon_idx++) { 713 page_t *pp; 714 715 if ((ap = anon_get_ptr(amp->ahp, 716 anon_idx)) != NULL) 717 continue; 718 719 /* 720 * Allocate the anon struct now. 721 * Might as well load up translation 722 * to the page while we're at it... 723 */ 724 pp = anon_zero(seg, addr, &ap, cred); 725 if (ap == NULL || pp == NULL) { 726 panic("segvn_create anon_zero"); 727 /*NOTREACHED*/ 728 } 729 730 /* 731 * Re-acquire the anon_map lock and 732 * initialize the anon array entry. 733 */ 734 ASSERT(anon_get_ptr(amp->ahp, 735 anon_idx) == NULL); 736 (void) anon_set_ptr(amp->ahp, anon_idx, ap, 737 ANON_SLEEP); 738 739 ASSERT(seg->s_szc == 0); 740 ASSERT(!IS_VMODSORT(pp->p_vnode)); 741 742 hat_memload(seg->s_as->a_hat, addr, pp, 743 svd->prot & ~PROT_WRITE, hat_flag); 744 745 page_unlock(pp); 746 } 747 ASSERT(seg->s_szc == 0); 748 anon_dup(amp->ahp, anon_num, svd->amp->ahp, 749 0, seg->s_size); 750 ANON_LOCK_EXIT(&->a_rwlock); 751 } 752 } 753 754 /* 755 * Set default memory allocation policy for segment 756 * 757 * Always set policy for private memory at least for initialization 758 * even if this is a shared memory segment 759 */ 760 (void) lgrp_privm_policy_set(mpolicy, &svd->policy_info, seg->s_size); 761 762 if (svd->type == MAP_SHARED) 763 (void) lgrp_shm_policy_set(mpolicy, svd->amp, svd->anon_index, 764 svd->vp, svd->offset, seg->s_size); 765 766 return (0); 767 } 768 769 /* 770 * Concatenate two existing segments, if possible. 771 * Return 0 on success, -1 if two segments are not compatible 772 * or -2 on memory allocation failure. 
773 * If amp_cat == 1 then try and concat segments with anon maps 774 */ 775 static int 776 segvn_concat(struct seg *seg1, struct seg *seg2, int amp_cat) 777 { 778 struct segvn_data *svd1 = seg1->s_data; 779 struct segvn_data *svd2 = seg2->s_data; 780 struct anon_map *amp1 = svd1->amp; 781 struct anon_map *amp2 = svd2->amp; 782 struct vpage *vpage1 = svd1->vpage; 783 struct vpage *vpage2 = svd2->vpage, *nvpage = NULL; 784 size_t size, nvpsize; 785 pgcnt_t npages1, npages2; 786 787 ASSERT(seg1->s_as && seg2->s_as && seg1->s_as == seg2->s_as); 788 ASSERT(AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock)); 789 ASSERT(seg1->s_ops == seg2->s_ops); 790 791 /* both segments exist, try to merge them */ 792 #define incompat(x) (svd1->x != svd2->x) 793 if (incompat(vp) || incompat(maxprot) || 794 (!svd1->pageadvice && !svd2->pageadvice && incompat(advice)) || 795 (!svd1->pageprot && !svd2->pageprot && incompat(prot)) || 796 incompat(type) || incompat(cred) || incompat(flags) || 797 seg1->s_szc != seg2->s_szc || incompat(policy_info.mem_policy) || 798 (svd2->softlockcnt > 0)) 799 return (-1); 800 #undef incompat 801 802 /* 803 * vp == NULL implies zfod, offset doesn't matter 804 */ 805 if (svd1->vp != NULL && 806 svd1->offset + seg1->s_size != svd2->offset) { 807 return (-1); 808 } 809 810 /* 811 * Fail early if we're not supposed to concatenate 812 * segments with non NULL amp. 813 */ 814 if (amp_cat == 0 && (amp1 != NULL || amp2 != NULL)) { 815 return (-1); 816 } 817 818 if (svd1->vp == NULL && svd1->type == MAP_SHARED) { 819 if (amp1 != amp2) { 820 return (-1); 821 } 822 if (amp1 != NULL && svd1->anon_index + btop(seg1->s_size) != 823 svd2->anon_index) { 824 return (-1); 825 } 826 ASSERT(amp1 == NULL || amp1->refcnt >= 2); 827 } 828 829 /* 830 * If either seg has vpages, create a new merged vpage array. 
831 */ 832 if (vpage1 != NULL || vpage2 != NULL) { 833 struct vpage *vp; 834 835 npages1 = seg_pages(seg1); 836 npages2 = seg_pages(seg2); 837 nvpsize = vpgtob(npages1 + npages2); 838 839 if ((nvpage = kmem_zalloc(nvpsize, KM_NOSLEEP)) == NULL) { 840 return (-2); 841 } 842 if (vpage1 != NULL) { 843 bcopy(vpage1, nvpage, vpgtob(npages1)); 844 } 845 if (vpage2 != NULL) { 846 bcopy(vpage2, nvpage + npages1, vpgtob(npages2)); 847 } 848 for (vp = nvpage; vp < nvpage + npages1; vp++) { 849 if (svd2->pageprot && !svd1->pageprot) { 850 VPP_SETPROT(vp, svd1->prot); 851 } 852 if (svd2->pageadvice && !svd1->pageadvice) { 853 VPP_SETADVICE(vp, svd1->advice); 854 } 855 } 856 for (vp = nvpage + npages1; 857 vp < nvpage + npages1 + npages2; vp++) { 858 if (svd1->pageprot && !svd2->pageprot) { 859 VPP_SETPROT(vp, svd2->prot); 860 } 861 if (svd1->pageadvice && !svd2->pageadvice) { 862 VPP_SETADVICE(vp, svd2->advice); 863 } 864 } 865 } 866 867 /* 868 * If either segment has private pages, create a new merged anon 869 * array. If mergeing shared anon segments just decrement anon map's 870 * refcnt. 871 */ 872 if (amp1 != NULL && svd1->type == MAP_SHARED) { 873 ASSERT(amp1 == amp2 && svd1->vp == NULL); 874 ANON_LOCK_ENTER(&1->a_rwlock, RW_WRITER); 875 ASSERT(amp1->refcnt >= 2); 876 amp1->refcnt--; 877 ANON_LOCK_EXIT(&1->a_rwlock); 878 svd2->amp = NULL; 879 } else if (amp1 != NULL || amp2 != NULL) { 880 struct anon_hdr *nahp; 881 struct anon_map *namp = NULL; 882 size_t asize; 883 884 ASSERT(svd1->type == MAP_PRIVATE); 885 886 asize = seg1->s_size + seg2->s_size; 887 if ((nahp = anon_create(btop(asize), ANON_NOSLEEP)) == NULL) { 888 if (nvpage != NULL) { 889 kmem_free(nvpage, nvpsize); 890 } 891 return (-2); 892 } 893 if (amp1 != NULL) { 894 /* 895 * XXX anon rwlock is not really needed because 896 * this is a private segment and we are writers. 
897 */ 898 ANON_LOCK_ENTER(&1->a_rwlock, RW_WRITER); 899 ASSERT(amp1->refcnt == 1); 900 if (anon_copy_ptr(amp1->ahp, svd1->anon_index, 901 nahp, 0, btop(seg1->s_size), ANON_NOSLEEP)) { 902 anon_release(nahp, btop(asize)); 903 ANON_LOCK_EXIT(&1->a_rwlock); 904 if (nvpage != NULL) { 905 kmem_free(nvpage, nvpsize); 906 } 907 return (-2); 908 } 909 } 910 if (amp2 != NULL) { 911 ANON_LOCK_ENTER(&2->a_rwlock, RW_WRITER); 912 ASSERT(amp2->refcnt == 1); 913 if (anon_copy_ptr(amp2->ahp, svd2->anon_index, 914 nahp, btop(seg1->s_size), btop(seg2->s_size), 915 ANON_NOSLEEP)) { 916 anon_release(nahp, btop(asize)); 917 ANON_LOCK_EXIT(&2->a_rwlock); 918 if (amp1 != NULL) { 919 ANON_LOCK_EXIT(&1->a_rwlock); 920 } 921 if (nvpage != NULL) { 922 kmem_free(nvpage, nvpsize); 923 } 924 return (-2); 925 } 926 } 927 if (amp1 != NULL) { 928 namp = amp1; 929 anon_release(amp1->ahp, btop(amp1->size)); 930 } 931 if (amp2 != NULL) { 932 if (namp == NULL) { 933 ASSERT(amp1 == NULL); 934 namp = amp2; 935 anon_release(amp2->ahp, btop(amp2->size)); 936 } else { 937 amp2->refcnt--; 938 ANON_LOCK_EXIT(&2->a_rwlock); 939 anonmap_free(amp2); 940 } 941 svd2->amp = NULL; /* needed for seg_free */ 942 } 943 namp->ahp = nahp; 944 namp->size = asize; 945 svd1->amp = namp; 946 svd1->anon_index = 0; 947 ANON_LOCK_EXIT(&namp->a_rwlock); 948 } 949 /* 950 * Now free the old vpage structures. 
951 */ 952 if (nvpage != NULL) { 953 if (vpage1 != NULL) { 954 kmem_free(vpage1, vpgtob(npages1)); 955 } 956 if (vpage2 != NULL) { 957 svd2->vpage = NULL; 958 kmem_free(vpage2, vpgtob(npages2)); 959 } 960 if (svd2->pageprot) { 961 svd1->pageprot = 1; 962 } 963 if (svd2->pageadvice) { 964 svd1->pageadvice = 1; 965 } 966 svd1->vpage = nvpage; 967 } 968 969 /* all looks ok, merge segments */ 970 svd1->swresv += svd2->swresv; 971 svd2->swresv = 0; /* so seg_free doesn't release swap space */ 972 size = seg2->s_size; 973 seg_free(seg2); 974 seg1->s_size += size; 975 return (0); 976 } 977 978 /* 979 * Extend the previous segment (seg1) to include the 980 * new segment (seg2 + a), if possible. 981 * Return 0 on success. 982 */ 983 static int 984 segvn_extend_prev(seg1, seg2, a, swresv) 985 struct seg *seg1, *seg2; 986 struct segvn_crargs *a; 987 size_t swresv; 988 { 989 struct segvn_data *svd1 = (struct segvn_data *)seg1->s_data; 990 size_t size; 991 struct anon_map *amp1; 992 struct vpage *new_vpage; 993 994 /* 995 * We don't need any segment level locks for "segvn" data 996 * since the address space is "write" locked. 997 */ 998 ASSERT(seg1->s_as && AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock)); 999 1000 /* second segment is new, try to extend first */ 1001 /* XXX - should also check cred */ 1002 if (svd1->vp != a->vp || svd1->maxprot != a->maxprot || 1003 (!svd1->pageprot && (svd1->prot != a->prot)) || 1004 svd1->type != a->type || svd1->flags != a->flags || 1005 seg1->s_szc != a->szc) 1006 return (-1); 1007 1008 /* vp == NULL implies zfod, offset doesn't matter */ 1009 if (svd1->vp != NULL && 1010 svd1->offset + seg1->s_size != (a->offset & PAGEMASK)) 1011 return (-1); 1012 1013 amp1 = svd1->amp; 1014 if (amp1) { 1015 pgcnt_t newpgs; 1016 1017 /* 1018 * Segment has private pages, can data structures 1019 * be expanded? 1020 * 1021 * Acquire the anon_map lock to prevent it from changing, 1022 * if it is shared. 
This ensures that the anon_map 1023 * will not change while a thread which has a read/write 1024 * lock on an address space references it. 1025 * XXX - Don't need the anon_map lock at all if "refcnt" 1026 * is 1. 1027 * 1028 * Can't grow a MAP_SHARED segment with an anonmap because 1029 * there may be existing anon slots where we want to extend 1030 * the segment and we wouldn't know what to do with them 1031 * (e.g., for tmpfs right thing is to just leave them there, 1032 * for /dev/zero they should be cleared out). 1033 */ 1034 if (svd1->type == MAP_SHARED) 1035 return (-1); 1036 1037 ANON_LOCK_ENTER(&1->a_rwlock, RW_WRITER); 1038 if (amp1->refcnt > 1) { 1039 ANON_LOCK_EXIT(&1->a_rwlock); 1040 return (-1); 1041 } 1042 newpgs = anon_grow(amp1->ahp, &svd1->anon_index, 1043 btop(seg1->s_size), btop(seg2->s_size), ANON_NOSLEEP); 1044 1045 if (newpgs == 0) { 1046 ANON_LOCK_EXIT(&1->a_rwlock); 1047 return (-1); 1048 } 1049 amp1->size = ptob(newpgs); 1050 ANON_LOCK_EXIT(&1->a_rwlock); 1051 } 1052 if (svd1->vpage != NULL) { 1053 new_vpage = 1054 kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)), 1055 KM_NOSLEEP); 1056 if (new_vpage == NULL) 1057 return (-1); 1058 bcopy(svd1->vpage, new_vpage, vpgtob(seg_pages(seg1))); 1059 kmem_free(svd1->vpage, vpgtob(seg_pages(seg1))); 1060 svd1->vpage = new_vpage; 1061 if (svd1->pageprot) { 1062 struct vpage *vp, *evp; 1063 1064 vp = new_vpage + seg_pages(seg1); 1065 evp = vp + seg_pages(seg2); 1066 for (; vp < evp; vp++) 1067 VPP_SETPROT(vp, a->prot); 1068 } 1069 } 1070 size = seg2->s_size; 1071 seg_free(seg2); 1072 seg1->s_size += size; 1073 svd1->swresv += swresv; 1074 return (0); 1075 } 1076 1077 /* 1078 * Extend the next segment (seg2) to include the 1079 * new segment (seg1 + a), if possible. 1080 * Return 0 on success. 
1081 */ 1082 static int 1083 segvn_extend_next( 1084 struct seg *seg1, 1085 struct seg *seg2, 1086 struct segvn_crargs *a, 1087 size_t swresv) 1088 { 1089 struct segvn_data *svd2 = (struct segvn_data *)seg2->s_data; 1090 size_t size; 1091 struct anon_map *amp2; 1092 struct vpage *new_vpage; 1093 1094 /* 1095 * We don't need any segment level locks for "segvn" data 1096 * since the address space is "write" locked. 1097 */ 1098 ASSERT(seg2->s_as && AS_WRITE_HELD(seg2->s_as, &seg2->s_as->a_lock)); 1099 1100 /* first segment is new, try to extend second */ 1101 /* XXX - should also check cred */ 1102 if (svd2->vp != a->vp || svd2->maxprot != a->maxprot || 1103 (!svd2->pageprot && (svd2->prot != a->prot)) || 1104 svd2->type != a->type || svd2->flags != a->flags || 1105 seg2->s_szc != a->szc) 1106 return (-1); 1107 /* vp == NULL implies zfod, offset doesn't matter */ 1108 if (svd2->vp != NULL && 1109 (a->offset & PAGEMASK) + seg1->s_size != svd2->offset) 1110 return (-1); 1111 1112 amp2 = svd2->amp; 1113 if (amp2) { 1114 pgcnt_t newpgs; 1115 1116 /* 1117 * Segment has private pages, can data structures 1118 * be expanded? 1119 * 1120 * Acquire the anon_map lock to prevent it from changing, 1121 * if it is shared. This ensures that the anon_map 1122 * will not change while a thread which has a read/write 1123 * lock on an address space references it. 1124 * 1125 * XXX - Don't need the anon_map lock at all if "refcnt" 1126 * is 1. 
1127 */ 1128 if (svd2->type == MAP_SHARED) 1129 return (-1); 1130 1131 ANON_LOCK_ENTER(&2->a_rwlock, RW_WRITER); 1132 if (amp2->refcnt > 1) { 1133 ANON_LOCK_EXIT(&2->a_rwlock); 1134 return (-1); 1135 } 1136 newpgs = anon_grow(amp2->ahp, &svd2->anon_index, 1137 btop(seg2->s_size), btop(seg1->s_size), 1138 ANON_NOSLEEP | ANON_GROWDOWN); 1139 1140 if (newpgs == 0) { 1141 ANON_LOCK_EXIT(&2->a_rwlock); 1142 return (-1); 1143 } 1144 amp2->size = ptob(newpgs); 1145 ANON_LOCK_EXIT(&2->a_rwlock); 1146 } 1147 if (svd2->vpage != NULL) { 1148 new_vpage = 1149 kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)), 1150 KM_NOSLEEP); 1151 if (new_vpage == NULL) { 1152 /* Not merging segments so adjust anon_index back */ 1153 if (amp2) 1154 svd2->anon_index += seg_pages(seg1); 1155 return (-1); 1156 } 1157 bcopy(svd2->vpage, new_vpage + seg_pages(seg1), 1158 vpgtob(seg_pages(seg2))); 1159 kmem_free(svd2->vpage, vpgtob(seg_pages(seg2))); 1160 svd2->vpage = new_vpage; 1161 if (svd2->pageprot) { 1162 struct vpage *vp, *evp; 1163 1164 vp = new_vpage; 1165 evp = vp + seg_pages(seg1); 1166 for (; vp < evp; vp++) 1167 VPP_SETPROT(vp, a->prot); 1168 } 1169 } 1170 size = seg1->s_size; 1171 seg_free(seg1); 1172 seg2->s_size += size; 1173 seg2->s_base -= size; 1174 svd2->offset -= size; 1175 svd2->swresv += swresv; 1176 return (0); 1177 } 1178 1179 static int 1180 segvn_dup(struct seg *seg, struct seg *newseg) 1181 { 1182 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1183 struct segvn_data *newsvd; 1184 pgcnt_t npages = seg_pages(seg); 1185 int error = 0; 1186 uint_t prot; 1187 size_t len; 1188 1189 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 1190 1191 /* 1192 * If segment has anon reserved, reserve more for the new seg. 1193 * For a MAP_NORESERVE segment swresv will be a count of all the 1194 * allocated anon slots; thus we reserve for the child as many slots 1195 * as the parent has allocated. 
This semantic prevents the child or 1196 * parent from dieing during a copy-on-write fault caused by trying 1197 * to write a shared pre-existing anon page. 1198 */ 1199 if ((len = svd->swresv) != 0) { 1200 if (anon_resv(svd->swresv) == 0) 1201 return (ENOMEM); 1202 1203 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 1204 seg, len, 0); 1205 } 1206 1207 newsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 1208 1209 newseg->s_ops = &segvn_ops; 1210 newseg->s_data = (void *)newsvd; 1211 newseg->s_szc = seg->s_szc; 1212 1213 if ((newsvd->vp = svd->vp) != NULL) { 1214 VN_HOLD(svd->vp); 1215 if (svd->type == MAP_SHARED) 1216 lgrp_shm_policy_init(NULL, svd->vp); 1217 } 1218 newsvd->offset = svd->offset; 1219 newsvd->prot = svd->prot; 1220 newsvd->maxprot = svd->maxprot; 1221 newsvd->pageprot = svd->pageprot; 1222 newsvd->type = svd->type; 1223 newsvd->cred = svd->cred; 1224 crhold(newsvd->cred); 1225 newsvd->advice = svd->advice; 1226 newsvd->pageadvice = svd->pageadvice; 1227 newsvd->swresv = svd->swresv; 1228 newsvd->flags = svd->flags; 1229 newsvd->softlockcnt = 0; 1230 newsvd->policy_info = svd->policy_info; 1231 if ((newsvd->amp = svd->amp) == NULL) { 1232 /* 1233 * Not attaching to a shared anon object. 1234 */ 1235 newsvd->anon_index = 0; 1236 } else { 1237 struct anon_map *amp; 1238 1239 amp = svd->amp; 1240 if (svd->type == MAP_SHARED) { 1241 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1242 amp->refcnt++; 1243 ANON_LOCK_EXIT(&->a_rwlock); 1244 newsvd->anon_index = svd->anon_index; 1245 } else { 1246 int reclaim = 1; 1247 1248 /* 1249 * Allocate and initialize new anon_map structure. 
1250 */ 1251 newsvd->amp = anonmap_alloc(newseg->s_size, 0); 1252 newsvd->amp->a_szc = newseg->s_szc; 1253 newsvd->anon_index = 0; 1254 1255 /* 1256 * We don't have to acquire the anon_map lock 1257 * for the new segment (since it belongs to an 1258 * address space that is still not associated 1259 * with any process), or the segment in the old 1260 * address space (since all threads in it 1261 * are stopped while duplicating the address space). 1262 */ 1263 1264 /* 1265 * The goal of the following code is to make sure that 1266 * softlocked pages do not end up as copy on write 1267 * pages. This would cause problems where one 1268 * thread writes to a page that is COW and a different 1269 * thread in the same process has softlocked it. The 1270 * softlock lock would move away from this process 1271 * because the write would cause this process to get 1272 * a copy (without the softlock). 1273 * 1274 * The strategy here is to just break the 1275 * sharing on pages that could possibly be 1276 * softlocked. 1277 */ 1278 retry: 1279 if (svd->softlockcnt) { 1280 struct anon *ap, *newap; 1281 size_t i; 1282 uint_t vpprot; 1283 page_t *anon_pl[1+1], *pp; 1284 caddr_t addr; 1285 ulong_t anon_idx = 0; 1286 1287 /* 1288 * The softlock count might be non zero 1289 * because some pages are still stuck in the 1290 * cache for lazy reclaim. Flush the cache 1291 * now. This should drop the count to zero. 1292 * [or there is really I/O going on to these 1293 * pages]. Note, we have the writers lock so 1294 * nothing gets inserted during the flush. 1295 */ 1296 if (reclaim == 1) { 1297 segvn_purge(seg); 1298 reclaim = 0; 1299 goto retry; 1300 } 1301 i = btopr(seg->s_size); 1302 addr = seg->s_base; 1303 /* 1304 * XXX break cow sharing using PAGESIZE 1305 * pages. They will be relocated into larger 1306 * pages at fault time. 
1307 */ 1308 while (i-- > 0) { 1309 if (ap = anon_get_ptr(amp->ahp, 1310 anon_idx)) { 1311 error = anon_getpage(&ap, 1312 &vpprot, anon_pl, PAGESIZE, 1313 seg, addr, S_READ, 1314 svd->cred); 1315 if (error) { 1316 newsvd->vpage = NULL; 1317 goto out; 1318 } 1319 /* 1320 * prot need not be computed 1321 * below 'cause anon_private is 1322 * going to ignore it anyway 1323 * as child doesn't inherit 1324 * pagelock from parent. 1325 */ 1326 prot = svd->pageprot ? 1327 VPP_PROT( 1328 &svd->vpage[ 1329 seg_page(seg, addr)]) 1330 : svd->prot; 1331 pp = anon_private(&newap, 1332 newseg, addr, prot, 1333 anon_pl[0], 0, 1334 newsvd->cred); 1335 if (pp == NULL) { 1336 /* no mem abort */ 1337 newsvd->vpage = NULL; 1338 error = ENOMEM; 1339 goto out; 1340 } 1341 (void) anon_set_ptr( 1342 newsvd->amp->ahp, anon_idx, 1343 newap, ANON_SLEEP); 1344 page_unlock(pp); 1345 } 1346 addr += PAGESIZE; 1347 anon_idx++; 1348 } 1349 } else { /* common case */ 1350 if (seg->s_szc != 0) { 1351 /* 1352 * If at least one of anon slots of a 1353 * large page exists then make sure 1354 * all anon slots of a large page 1355 * exist to avoid partial cow sharing 1356 * of a large page in the future. 1357 */ 1358 anon_dup_fill_holes(amp->ahp, 1359 svd->anon_index, newsvd->amp->ahp, 1360 0, seg->s_size, seg->s_szc, 1361 svd->vp != NULL); 1362 } else { 1363 anon_dup(amp->ahp, svd->anon_index, 1364 newsvd->amp->ahp, 0, seg->s_size); 1365 } 1366 1367 hat_clrattr(seg->s_as->a_hat, seg->s_base, 1368 seg->s_size, PROT_WRITE); 1369 } 1370 } 1371 } 1372 /* 1373 * If necessary, create a vpage structure for the new segment. 1374 * Do not copy any page lock indications. 
1375 */ 1376 if (svd->vpage != NULL) { 1377 uint_t i; 1378 struct vpage *ovp = svd->vpage; 1379 struct vpage *nvp; 1380 1381 nvp = newsvd->vpage = 1382 kmem_alloc(vpgtob(npages), KM_SLEEP); 1383 for (i = 0; i < npages; i++) { 1384 *nvp = *ovp++; 1385 VPP_CLRPPLOCK(nvp++); 1386 } 1387 } else 1388 newsvd->vpage = NULL; 1389 1390 /* Inform the vnode of the new mapping */ 1391 if (newsvd->vp != NULL) { 1392 error = VOP_ADDMAP(newsvd->vp, (offset_t)newsvd->offset, 1393 newseg->s_as, newseg->s_base, newseg->s_size, newsvd->prot, 1394 newsvd->maxprot, newsvd->type, newsvd->cred); 1395 } 1396 out: 1397 return (error); 1398 } 1399 1400 1401 /* 1402 * callback function used by segvn_unmap to invoke free_vp_pages() for only 1403 * those pages actually processed by the HAT 1404 */ 1405 extern int free_pages; 1406 1407 static void 1408 segvn_hat_unload_callback(hat_callback_t *cb) 1409 { 1410 struct seg *seg = cb->hcb_data; 1411 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1412 size_t len; 1413 u_offset_t off; 1414 1415 ASSERT(svd->vp != NULL); 1416 ASSERT(cb->hcb_end_addr > cb->hcb_start_addr); 1417 ASSERT(cb->hcb_start_addr >= seg->s_base); 1418 1419 len = cb->hcb_end_addr - cb->hcb_start_addr; 1420 off = cb->hcb_start_addr - seg->s_base; 1421 free_vp_pages(svd->vp, svd->offset + off, len); 1422 } 1423 1424 1425 static int 1426 segvn_unmap(struct seg *seg, caddr_t addr, size_t len) 1427 { 1428 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1429 struct segvn_data *nsvd; 1430 struct seg *nseg; 1431 struct anon_map *amp; 1432 pgcnt_t opages; /* old segment size in pages */ 1433 pgcnt_t npages; /* new segment size in pages */ 1434 pgcnt_t dpages; /* pages being deleted (unmapped) */ 1435 hat_callback_t callback; /* used for free_vp_pages() */ 1436 hat_callback_t *cbp = NULL; 1437 caddr_t nbase; 1438 size_t nsize; 1439 size_t oswresv; 1440 int reclaim = 1; 1441 1442 /* 1443 * We don't need any segment level locks for "segvn" data 1444 * since the address 
space is "write" locked. 1445 */ 1446 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 1447 1448 /* 1449 * Fail the unmap if pages are SOFTLOCKed through this mapping. 1450 * softlockcnt is protected from change by the as write lock. 1451 */ 1452 retry: 1453 if (svd->softlockcnt > 0) { 1454 /* 1455 * since we do have the writers lock nobody can fill 1456 * the cache during the purge. The flush either succeeds 1457 * or we still have pending I/Os. 1458 */ 1459 if (reclaim == 1) { 1460 segvn_purge(seg); 1461 reclaim = 0; 1462 goto retry; 1463 } 1464 return (EAGAIN); 1465 } 1466 1467 /* 1468 * Check for bad sizes 1469 */ 1470 if (addr < seg->s_base || addr + len > seg->s_base + seg->s_size || 1471 (len & PAGEOFFSET) || ((uintptr_t)addr & PAGEOFFSET)) { 1472 panic("segvn_unmap"); 1473 /*NOTREACHED*/ 1474 } 1475 1476 if (seg->s_szc != 0) { 1477 size_t pgsz = page_get_pagesize(seg->s_szc); 1478 int err; 1479 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { 1480 ASSERT(seg->s_base != addr || seg->s_size != len); 1481 VM_STAT_ADD(segvnvmstats.demoterange[0]); 1482 err = segvn_demote_range(seg, addr, len, SDR_END, 0); 1483 if (err == 0) { 1484 return (IE_RETRY); 1485 } 1486 return (err); 1487 } 1488 } 1489 1490 /* Inform the vnode of the unmapping. */ 1491 if (svd->vp) { 1492 int error; 1493 1494 error = VOP_DELMAP(svd->vp, 1495 (offset_t)svd->offset + (uintptr_t)(addr - seg->s_base), 1496 seg->s_as, addr, len, svd->prot, svd->maxprot, 1497 svd->type, svd->cred); 1498 1499 if (error == EAGAIN) 1500 return (error); 1501 } 1502 /* 1503 * Remove any page locks set through this mapping. 1504 */ 1505 (void) segvn_lockop(seg, addr, len, 0, MC_UNLOCK, NULL, 0); 1506 1507 /* 1508 * Unload any hardware translations in the range to be taken out. 1509 * Use a callback to invoke free_vp_pages() effectively. 
1510 */ 1511 if (svd->vp != NULL && free_pages != 0) { 1512 callback.hcb_data = seg; 1513 callback.hcb_function = segvn_hat_unload_callback; 1514 cbp = &callback; 1515 } 1516 hat_unload_callback(seg->s_as->a_hat, addr, len, HAT_UNLOAD_UNMAP, cbp); 1517 1518 /* 1519 * Check for entire segment 1520 */ 1521 if (addr == seg->s_base && len == seg->s_size) { 1522 seg_free(seg); 1523 return (0); 1524 } 1525 1526 opages = seg_pages(seg); 1527 dpages = btop(len); 1528 npages = opages - dpages; 1529 amp = svd->amp; 1530 ASSERT(amp == NULL || amp->a_szc >= seg->s_szc); 1531 1532 /* 1533 * Check for beginning of segment 1534 */ 1535 if (addr == seg->s_base) { 1536 if (svd->vpage != NULL) { 1537 size_t nbytes; 1538 struct vpage *ovpage; 1539 1540 ovpage = svd->vpage; /* keep pointer to vpage */ 1541 1542 nbytes = vpgtob(npages); 1543 svd->vpage = kmem_alloc(nbytes, KM_SLEEP); 1544 bcopy(&ovpage[dpages], svd->vpage, nbytes); 1545 1546 /* free up old vpage */ 1547 kmem_free(ovpage, vpgtob(opages)); 1548 } 1549 if (amp != NULL) { 1550 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1551 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { 1552 /* 1553 * Free up now unused parts of anon_map array. 
1554 */ 1555 if (amp->a_szc == seg->s_szc) { 1556 if (seg->s_szc != 0) { 1557 anon_free_pages(amp->ahp, 1558 svd->anon_index, len, 1559 seg->s_szc); 1560 } else { 1561 anon_free(amp->ahp, 1562 svd->anon_index, 1563 len); 1564 } 1565 } else { 1566 ASSERT(svd->type == MAP_SHARED); 1567 ASSERT(amp->a_szc > seg->s_szc); 1568 anon_shmap_free_pages(amp, 1569 svd->anon_index, len); 1570 } 1571 1572 /* 1573 * Unreserve swap space for the 1574 * unmapped chunk of this segment in 1575 * case it's MAP_SHARED 1576 */ 1577 if (svd->type == MAP_SHARED) { 1578 anon_unresv(len); 1579 amp->swresv -= len; 1580 } 1581 } 1582 ANON_LOCK_EXIT(&->a_rwlock); 1583 svd->anon_index += dpages; 1584 } 1585 if (svd->vp != NULL) 1586 svd->offset += len; 1587 1588 if (svd->swresv) { 1589 if (svd->flags & MAP_NORESERVE) { 1590 ASSERT(amp); 1591 oswresv = svd->swresv; 1592 1593 svd->swresv = ptob(anon_pages(amp->ahp, 1594 svd->anon_index, npages)); 1595 anon_unresv(oswresv - svd->swresv); 1596 } else { 1597 anon_unresv(len); 1598 svd->swresv -= len; 1599 } 1600 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 1601 seg, len, 0); 1602 } 1603 1604 seg->s_base += len; 1605 seg->s_size -= len; 1606 return (0); 1607 } 1608 1609 /* 1610 * Check for end of segment 1611 */ 1612 if (addr + len == seg->s_base + seg->s_size) { 1613 if (svd->vpage != NULL) { 1614 size_t nbytes; 1615 struct vpage *ovpage; 1616 1617 ovpage = svd->vpage; /* keep pointer to vpage */ 1618 1619 nbytes = vpgtob(npages); 1620 svd->vpage = kmem_alloc(nbytes, KM_SLEEP); 1621 bcopy(ovpage, svd->vpage, nbytes); 1622 1623 /* free up old vpage */ 1624 kmem_free(ovpage, vpgtob(opages)); 1625 1626 } 1627 if (amp != NULL) { 1628 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1629 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { 1630 /* 1631 * Free up now unused parts of anon_map array. 
1632 */ 1633 ulong_t an_idx = svd->anon_index + npages; 1634 if (amp->a_szc == seg->s_szc) { 1635 if (seg->s_szc != 0) { 1636 anon_free_pages(amp->ahp, 1637 an_idx, len, 1638 seg->s_szc); 1639 } else { 1640 anon_free(amp->ahp, an_idx, 1641 len); 1642 } 1643 } else { 1644 ASSERT(svd->type == MAP_SHARED); 1645 ASSERT(amp->a_szc > seg->s_szc); 1646 anon_shmap_free_pages(amp, 1647 an_idx, len); 1648 } 1649 1650 /* 1651 * Unreserve swap space for the 1652 * unmapped chunk of this segment in 1653 * case it's MAP_SHARED 1654 */ 1655 if (svd->type == MAP_SHARED) { 1656 anon_unresv(len); 1657 amp->swresv -= len; 1658 } 1659 } 1660 ANON_LOCK_EXIT(&->a_rwlock); 1661 } 1662 1663 if (svd->swresv) { 1664 if (svd->flags & MAP_NORESERVE) { 1665 ASSERT(amp); 1666 oswresv = svd->swresv; 1667 svd->swresv = ptob(anon_pages(amp->ahp, 1668 svd->anon_index, npages)); 1669 anon_unresv(oswresv - svd->swresv); 1670 } else { 1671 anon_unresv(len); 1672 svd->swresv -= len; 1673 } 1674 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 1675 "anon proc:%p %lu %u", seg, len, 0); 1676 } 1677 1678 seg->s_size -= len; 1679 return (0); 1680 } 1681 1682 /* 1683 * The section to go is in the middle of the segment, 1684 * have to make it into two segments. nseg is made for 1685 * the high end while seg is cut down at the low end. 
1686 */ 1687 nbase = addr + len; /* new seg base */ 1688 nsize = (seg->s_base + seg->s_size) - nbase; /* new seg size */ 1689 seg->s_size = addr - seg->s_base; /* shrink old seg */ 1690 nseg = seg_alloc(seg->s_as, nbase, nsize); 1691 if (nseg == NULL) { 1692 panic("segvn_unmap seg_alloc"); 1693 /*NOTREACHED*/ 1694 } 1695 nseg->s_ops = seg->s_ops; 1696 nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 1697 nseg->s_data = (void *)nsvd; 1698 nseg->s_szc = seg->s_szc; 1699 *nsvd = *svd; 1700 nsvd->offset = svd->offset + (uintptr_t)(nseg->s_base - seg->s_base); 1701 nsvd->swresv = 0; 1702 nsvd->softlockcnt = 0; 1703 1704 if (svd->vp != NULL) { 1705 VN_HOLD(nsvd->vp); 1706 if (nsvd->type == MAP_SHARED) 1707 lgrp_shm_policy_init(NULL, nsvd->vp); 1708 } 1709 crhold(svd->cred); 1710 1711 if (svd->vpage == NULL) { 1712 nsvd->vpage = NULL; 1713 } else { 1714 /* need to split vpage into two arrays */ 1715 size_t nbytes; 1716 struct vpage *ovpage; 1717 1718 ovpage = svd->vpage; /* keep pointer to vpage */ 1719 1720 npages = seg_pages(seg); /* seg has shrunk */ 1721 nbytes = vpgtob(npages); 1722 svd->vpage = kmem_alloc(nbytes, KM_SLEEP); 1723 1724 bcopy(ovpage, svd->vpage, nbytes); 1725 1726 npages = seg_pages(nseg); 1727 nbytes = vpgtob(npages); 1728 nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP); 1729 1730 bcopy(&ovpage[opages - npages], nsvd->vpage, nbytes); 1731 1732 /* free up old vpage */ 1733 kmem_free(ovpage, vpgtob(opages)); 1734 } 1735 1736 if (amp == NULL) { 1737 nsvd->amp = NULL; 1738 nsvd->anon_index = 0; 1739 } else { 1740 /* 1741 * Need to create a new anon map for the new segment. 1742 * We'll also allocate a new smaller array for the old 1743 * smaller segment to save space. 1744 */ 1745 opages = btop((uintptr_t)(addr - seg->s_base)); 1746 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1747 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { 1748 /* 1749 * Free up now unused parts of anon_map array. 
1750 */ 1751 ulong_t an_idx = svd->anon_index + opages; 1752 if (amp->a_szc == seg->s_szc) { 1753 if (seg->s_szc != 0) { 1754 anon_free_pages(amp->ahp, an_idx, len, 1755 seg->s_szc); 1756 } else { 1757 anon_free(amp->ahp, an_idx, 1758 len); 1759 } 1760 } else { 1761 ASSERT(svd->type == MAP_SHARED); 1762 ASSERT(amp->a_szc > seg->s_szc); 1763 anon_shmap_free_pages(amp, an_idx, len); 1764 } 1765 1766 /* 1767 * Unreserve swap space for the 1768 * unmapped chunk of this segment in 1769 * case it's MAP_SHARED 1770 */ 1771 if (svd->type == MAP_SHARED) { 1772 anon_unresv(len); 1773 amp->swresv -= len; 1774 } 1775 } 1776 nsvd->anon_index = svd->anon_index + 1777 btop((uintptr_t)(nseg->s_base - seg->s_base)); 1778 if (svd->type == MAP_SHARED) { 1779 amp->refcnt++; 1780 nsvd->amp = amp; 1781 } else { 1782 struct anon_map *namp; 1783 struct anon_hdr *nahp; 1784 1785 ASSERT(svd->type == MAP_PRIVATE); 1786 nahp = anon_create(btop(seg->s_size), ANON_SLEEP); 1787 namp = anonmap_alloc(nseg->s_size, 0); 1788 namp->a_szc = seg->s_szc; 1789 (void) anon_copy_ptr(amp->ahp, svd->anon_index, nahp, 1790 0, btop(seg->s_size), ANON_SLEEP); 1791 (void) anon_copy_ptr(amp->ahp, nsvd->anon_index, 1792 namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP); 1793 anon_release(amp->ahp, btop(amp->size)); 1794 svd->anon_index = 0; 1795 nsvd->anon_index = 0; 1796 amp->ahp = nahp; 1797 amp->size = seg->s_size; 1798 nsvd->amp = namp; 1799 } 1800 ANON_LOCK_EXIT(&->a_rwlock); 1801 } 1802 if (svd->swresv) { 1803 if (svd->flags & MAP_NORESERVE) { 1804 ASSERT(amp); 1805 oswresv = svd->swresv; 1806 svd->swresv = ptob(anon_pages(amp->ahp, 1807 svd->anon_index, btop(seg->s_size))); 1808 nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp, 1809 nsvd->anon_index, btop(nseg->s_size))); 1810 ASSERT(oswresv >= (svd->swresv + nsvd->swresv)); 1811 anon_unresv(oswresv - (svd->swresv + nsvd->swresv)); 1812 } else { 1813 if (seg->s_size + nseg->s_size + len != svd->swresv) { 1814 panic("segvn_unmap: " 1815 "cannot split swap 
reservation"); 1816 /*NOTREACHED*/ 1817 } 1818 anon_unresv(len); 1819 svd->swresv = seg->s_size; 1820 nsvd->swresv = nseg->s_size; 1821 } 1822 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 1823 seg, len, 0); 1824 } 1825 1826 return (0); /* I'm glad that's all over with! */ 1827 } 1828 1829 static void 1830 segvn_free(struct seg *seg) 1831 { 1832 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1833 pgcnt_t npages = seg_pages(seg); 1834 struct anon_map *amp; 1835 size_t len; 1836 1837 /* 1838 * We don't need any segment level locks for "segvn" data 1839 * since the address space is "write" locked. 1840 */ 1841 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 1842 1843 /* 1844 * Be sure to unlock pages. XXX Why do things get free'ed instead 1845 * of unmapped? XXX 1846 */ 1847 (void) segvn_lockop(seg, seg->s_base, seg->s_size, 1848 0, MC_UNLOCK, NULL, 0); 1849 1850 /* 1851 * Deallocate the vpage and anon pointers if necessary and possible. 1852 */ 1853 if (svd->vpage != NULL) { 1854 kmem_free(svd->vpage, vpgtob(npages)); 1855 svd->vpage = NULL; 1856 } 1857 if ((amp = svd->amp) != NULL) { 1858 /* 1859 * If there are no more references to this anon_map 1860 * structure, then deallocate the structure after freeing 1861 * up all the anon slot pointers that we can. 1862 */ 1863 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1864 ASSERT(amp->a_szc >= seg->s_szc); 1865 if (--amp->refcnt == 0) { 1866 if (svd->type == MAP_PRIVATE) { 1867 /* 1868 * Private - we only need to anon_free 1869 * the part that this segment refers to. 1870 */ 1871 if (seg->s_szc != 0) { 1872 anon_free_pages(amp->ahp, 1873 svd->anon_index, seg->s_size, 1874 seg->s_szc); 1875 } else { 1876 anon_free(amp->ahp, svd->anon_index, 1877 seg->s_size); 1878 } 1879 } else { 1880 /* 1881 * Shared - anon_free the entire 1882 * anon_map's worth of stuff and 1883 * release any swap reservation. 
1884 */ 1885 if (amp->a_szc != 0) { 1886 anon_shmap_free_pages(amp, 0, 1887 amp->size); 1888 } else { 1889 anon_free(amp->ahp, 0, amp->size); 1890 } 1891 if ((len = amp->swresv) != 0) { 1892 anon_unresv(len); 1893 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 1894 "anon proc:%p %lu %u", 1895 seg, len, 0); 1896 } 1897 } 1898 svd->amp = NULL; 1899 ANON_LOCK_EXIT(&->a_rwlock); 1900 anonmap_free(amp); 1901 } else if (svd->type == MAP_PRIVATE) { 1902 /* 1903 * We had a private mapping which still has 1904 * a held anon_map so just free up all the 1905 * anon slot pointers that we were using. 1906 */ 1907 if (seg->s_szc != 0) { 1908 anon_free_pages(amp->ahp, svd->anon_index, 1909 seg->s_size, seg->s_szc); 1910 } else { 1911 anon_free(amp->ahp, svd->anon_index, 1912 seg->s_size); 1913 } 1914 ANON_LOCK_EXIT(&->a_rwlock); 1915 } else { 1916 ANON_LOCK_EXIT(&->a_rwlock); 1917 } 1918 } 1919 1920 /* 1921 * Release swap reservation. 1922 */ 1923 if ((len = svd->swresv) != 0) { 1924 anon_unresv(svd->swresv); 1925 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 1926 seg, len, 0); 1927 svd->swresv = 0; 1928 } 1929 /* 1930 * Release claim on vnode, credentials, and finally free the 1931 * private data. 1932 */ 1933 if (svd->vp != NULL) { 1934 if (svd->type == MAP_SHARED) 1935 lgrp_shm_policy_fini(NULL, svd->vp); 1936 VN_RELE(svd->vp); 1937 svd->vp = NULL; 1938 } 1939 crfree(svd->cred); 1940 svd->cred = NULL; 1941 1942 seg->s_data = NULL; 1943 kmem_cache_free(segvn_cache, svd); 1944 } 1945 1946 ulong_t segvn_lpglck_limit = 0; 1947 /* 1948 * Support routines used by segvn_pagelock() and softlock faults for anonymous 1949 * pages to implement availrmem accounting in a way that makes sure the 1950 * same memory is accounted just once for all softlock/pagelock purposes. 
1951 * This prevents a bug when availrmem is quickly incorrectly exausted from 1952 * several pagelocks to different parts of the same large page since each 1953 * pagelock has to decrement availrmem by the size of the entire large 1954 * page. Note those pages are not COW shared until softunlock/pageunlock so 1955 * we don't need to use cow style accounting here. We also need to make sure 1956 * the entire large page is accounted even if softlock range is less than the 1957 * entire large page because large anon pages can't be demoted when any of 1958 * constituent pages is locked. The caller calls this routine for every page_t 1959 * it locks. The very first page in the range may not be the root page of a 1960 * large page. For all other pages it's guranteed we are going to visit the 1961 * root of a particular large page before any other constituent page as we are 1962 * locking sequential pages belonging to the same anon map. So we do all the 1963 * locking when the root is encountered except for the very first page. Since 1964 * softlocking is not supported (except S_READ_NOCOW special case) for vmpss 1965 * segments and since vnode pages can be demoted without locking all 1966 * constituent pages vnode pages don't come here. Unlocking relies on the 1967 * fact that pagesize can't change whenever any of constituent large pages is 1968 * locked at least SE_SHARED. This allows unlocking code to find the right 1969 * root and decrement availrmem by the same amount it was incremented when the 1970 * page was locked. 1971 */ 1972 static int 1973 segvn_pp_lock_anonpages(page_t *pp, int first) 1974 { 1975 pgcnt_t pages; 1976 pfn_t pfn; 1977 uchar_t szc = pp->p_szc; 1978 1979 ASSERT(PAGE_LOCKED(pp)); 1980 ASSERT(pp->p_vnode != NULL); 1981 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 1982 1983 /* 1984 * pagesize won't change as long as any constituent page is locked. 
1985 */ 1986 pages = page_get_pagecnt(pp->p_szc); 1987 pfn = page_pptonum(pp); 1988 1989 if (!first) { 1990 if (!IS_P2ALIGNED(pfn, pages)) { 1991 #ifdef DEBUG 1992 pp = &pp[-(spgcnt_t)(pfn & (pages - 1))]; 1993 pfn = page_pptonum(pp); 1994 ASSERT(IS_P2ALIGNED(pfn, pages)); 1995 ASSERT(pp->p_szc == szc); 1996 ASSERT(pp->p_vnode != NULL); 1997 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 1998 ASSERT(pp->p_slckcnt != 0); 1999 #endif /* DEBUG */ 2000 return (1); 2001 } 2002 } else if (!IS_P2ALIGNED(pfn, pages)) { 2003 pp = &pp[-(spgcnt_t)(pfn & (pages - 1))]; 2004 #ifdef DEBUG 2005 pfn = page_pptonum(pp); 2006 ASSERT(IS_P2ALIGNED(pfn, pages)); 2007 ASSERT(pp->p_szc == szc); 2008 ASSERT(pp->p_vnode != NULL); 2009 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 2010 #endif /* DEBUG */ 2011 } 2012 2013 /* 2014 * pp is a root page. 2015 * We haven't locked this large page yet. 2016 */ 2017 page_struct_lock(pp); 2018 if (pp->p_slckcnt != 0) { 2019 if (pp->p_slckcnt < PAGE_SLOCK_MAXIMUM) { 2020 pp->p_slckcnt++; 2021 page_struct_unlock(pp); 2022 return (1); 2023 } 2024 page_struct_unlock(pp); 2025 segvn_lpglck_limit++; 2026 return (0); 2027 } 2028 mutex_enter(&freemem_lock); 2029 if (availrmem < tune.t_minarmem + pages) { 2030 mutex_exit(&freemem_lock); 2031 page_struct_unlock(pp); 2032 return (0); 2033 } 2034 pp->p_slckcnt++; 2035 availrmem -= pages; 2036 mutex_exit(&freemem_lock); 2037 page_struct_unlock(pp); 2038 return (1); 2039 } 2040 2041 static void 2042 segvn_pp_unlock_anonpages(page_t *pp, int first) 2043 { 2044 pgcnt_t pages; 2045 pfn_t pfn; 2046 2047 ASSERT(PAGE_LOCKED(pp)); 2048 ASSERT(pp->p_vnode != NULL); 2049 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 2050 2051 /* 2052 * pagesize won't change as long as any constituent page is locked. 
2053 */ 2054 pages = page_get_pagecnt(pp->p_szc); 2055 pfn = page_pptonum(pp); 2056 2057 if (!first) { 2058 if (!IS_P2ALIGNED(pfn, pages)) { 2059 return; 2060 } 2061 } else if (!IS_P2ALIGNED(pfn, pages)) { 2062 pp = &pp[-(spgcnt_t)(pfn & (pages - 1))]; 2063 #ifdef DEBUG 2064 pfn = page_pptonum(pp); 2065 ASSERT(IS_P2ALIGNED(pfn, pages)); 2066 #endif /* DEBUG */ 2067 } 2068 ASSERT(pp->p_vnode != NULL); 2069 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 2070 ASSERT(pp->p_slckcnt != 0); 2071 page_struct_lock(pp); 2072 if (--pp->p_slckcnt == 0) { 2073 mutex_enter(&freemem_lock); 2074 availrmem += pages; 2075 mutex_exit(&freemem_lock); 2076 } 2077 page_struct_unlock(pp); 2078 } 2079 2080 /* 2081 * Do a F_SOFTUNLOCK call over the range requested. The range must have 2082 * already been F_SOFTLOCK'ed. 2083 * Caller must always match addr and len of a softunlock with a previous 2084 * softlock with exactly the same addr and len. 2085 */ 2086 static void 2087 segvn_softunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw) 2088 { 2089 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 2090 page_t *pp; 2091 caddr_t adr; 2092 struct vnode *vp; 2093 u_offset_t offset; 2094 ulong_t anon_index; 2095 struct anon_map *amp; 2096 struct anon *ap = NULL; 2097 2098 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 2099 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 2100 2101 if ((amp = svd->amp) != NULL) 2102 anon_index = svd->anon_index + seg_page(seg, addr); 2103 2104 hat_unlock(seg->s_as->a_hat, addr, len); 2105 for (adr = addr; adr < addr + len; adr += PAGESIZE) { 2106 if (amp != NULL) { 2107 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 2108 if ((ap = anon_get_ptr(amp->ahp, anon_index++)) 2109 != NULL) { 2110 swap_xlate(ap, &vp, &offset); 2111 } else { 2112 vp = svd->vp; 2113 offset = svd->offset + 2114 (uintptr_t)(adr - seg->s_base); 2115 } 2116 ANON_LOCK_EXIT(&->a_rwlock); 2117 } else { 2118 vp = svd->vp; 2119 offset = svd->offset + 2120 (uintptr_t)(adr - 
seg->s_base); 2121 } 2122 2123 /* 2124 * Use page_find() instead of page_lookup() to 2125 * find the page since we know that it is locked. 2126 */ 2127 pp = page_find(vp, offset); 2128 if (pp == NULL) { 2129 panic( 2130 "segvn_softunlock: addr %p, ap %p, vp %p, off %llx", 2131 (void *)adr, (void *)ap, (void *)vp, offset); 2132 /*NOTREACHED*/ 2133 } 2134 2135 if (rw == S_WRITE) { 2136 hat_setrefmod(pp); 2137 if (seg->s_as->a_vbits) 2138 hat_setstat(seg->s_as, adr, PAGESIZE, 2139 P_REF | P_MOD); 2140 } else if (rw != S_OTHER) { 2141 hat_setref(pp); 2142 if (seg->s_as->a_vbits) 2143 hat_setstat(seg->s_as, adr, PAGESIZE, P_REF); 2144 } 2145 TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT, 2146 "segvn_fault:pp %p vp %p offset %llx", pp, vp, offset); 2147 if (svd->vp == NULL) { 2148 segvn_pp_unlock_anonpages(pp, adr == addr); 2149 } 2150 page_unlock(pp); 2151 } 2152 mutex_enter(&freemem_lock); /* for availrmem */ 2153 if (svd->vp != NULL) { 2154 availrmem += btop(len); 2155 } 2156 segvn_pages_locked -= btop(len); 2157 svd->softlockcnt -= btop(len); 2158 mutex_exit(&freemem_lock); 2159 if (svd->softlockcnt == 0) { 2160 /* 2161 * All SOFTLOCKS are gone. Wakeup any waiting 2162 * unmappers so they can try again to unmap. 2163 * Check for waiters first without the mutex 2164 * held so we don't always grab the mutex on 2165 * softunlocks. 2166 */ 2167 if (AS_ISUNMAPWAIT(seg->s_as)) { 2168 mutex_enter(&seg->s_as->a_contents); 2169 if (AS_ISUNMAPWAIT(seg->s_as)) { 2170 AS_CLRUNMAPWAIT(seg->s_as); 2171 cv_broadcast(&seg->s_as->a_cv); 2172 } 2173 mutex_exit(&seg->s_as->a_contents); 2174 } 2175 } 2176 } 2177 2178 #define PAGE_HANDLED ((page_t *)-1) 2179 2180 /* 2181 * Release all the pages in the NULL terminated ppp list 2182 * which haven't already been converted to PAGE_HANDLED. 
 */
static void
segvn_pagelist_rele(page_t **ppp)
{
	for (; *ppp != NULL; ppp++) {
		if (*ppp != PAGE_HANDLED)
			page_unlock(*ppp);
	}
}

static int stealcow = 1;

/*
 * Workaround for viking chip bug.  See bug id 1220902.
 * To fix this down in pagefault() would require importing so
 * much as and segvn code as to be unmaintainable.
 */
int enable_mbit_wa = 0;

/*
 * Handles all the dirty work of getting the right
 * anonymous pages and loading up the translations.
 * This routine is called only from segvn_fault()
 * when looping over the range of addresses requested.
 *
 * The basic algorithm here is:
 *	If this is an anon_zero case
 *		Call anon_zero to allocate page
 *		Load up translation
 *		Return
 *	endif
 *	If this is an anon page
 *		Use anon_getpage to get the page
 *	else
 *		Find page in pl[] list passed in
 *	endif
 *	If not a cow
 *		Load up the translation to the page
 *		return
 *	endif
 *	Call anon_private to handle cow
 *	Load up (writable) translation to new page
 */
static faultcode_t
segvn_faultpage(
	struct hat *hat,		/* the hat to use for mapping */
	struct seg *seg,		/* seg_vn of interest */
	caddr_t addr,			/* address in as */
	u_offset_t off,			/* offset in vp */
	struct vpage *vpage,		/* pointer to vpage for vp, off */
	page_t *pl[],			/* object source page pointer */
	uint_t vpprot,			/* access allowed to object pages */
	enum fault_type type,		/* type of fault */
	enum seg_rw rw,			/* type of access at fault */
	int brkcow,			/* we may need to break cow */
	int first)			/* first page for this fault if 1 */
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	page_t *pp, **ppp;
	uint_t pageflags = 0;
	page_t *anon_pl[1 + 1];
	page_t *opp = NULL;		/* original page */
	uint_t prot;
	int err;
2247 int cow; 2248 int claim; 2249 int steal = 0; 2250 ulong_t anon_index; 2251 struct anon *ap, *oldap; 2252 struct anon_map *amp; 2253 int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD; 2254 int anon_lock = 0; 2255 anon_sync_obj_t cookie; 2256 2257 if (svd->flags & MAP_TEXT) { 2258 hat_flag |= HAT_LOAD_TEXT; 2259 } 2260 2261 ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock)); 2262 ASSERT(seg->s_szc == 0); 2263 2264 /* 2265 * Initialize protection value for this page. 2266 * If we have per page protection values check it now. 2267 */ 2268 if (svd->pageprot) { 2269 uint_t protchk; 2270 2271 switch (rw) { 2272 case S_READ: 2273 protchk = PROT_READ; 2274 break; 2275 case S_WRITE: 2276 protchk = PROT_WRITE; 2277 break; 2278 case S_EXEC: 2279 protchk = PROT_EXEC; 2280 break; 2281 case S_OTHER: 2282 default: 2283 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 2284 break; 2285 } 2286 2287 prot = VPP_PROT(vpage); 2288 if ((prot & protchk) == 0) 2289 return (FC_PROT); /* illegal access type */ 2290 } else { 2291 prot = svd->prot; 2292 } 2293 2294 if (type == F_SOFTLOCK && svd->vp != NULL) { 2295 mutex_enter(&freemem_lock); 2296 if (availrmem <= tune.t_minarmem) { 2297 mutex_exit(&freemem_lock); 2298 return (FC_MAKE_ERR(ENOMEM)); /* out of real memory */ 2299 } else { 2300 availrmem--; 2301 svd->softlockcnt++; 2302 segvn_pages_locked++; 2303 } 2304 mutex_exit(&freemem_lock); 2305 } 2306 2307 /* 2308 * Always acquire the anon array lock to prevent 2 threads from 2309 * allocating separate anon slots for the same "addr". 2310 */ 2311 2312 if ((amp = svd->amp) != NULL) { 2313 ASSERT(RW_READ_HELD(&->a_rwlock)); 2314 anon_index = svd->anon_index + seg_page(seg, addr); 2315 anon_array_enter(amp, anon_index, &cookie); 2316 anon_lock = 1; 2317 } 2318 2319 if (svd->vp == NULL && amp != NULL) { 2320 if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) { 2321 /* 2322 * Allocate a (normally) writable anonymous page of 2323 * zeroes. If no advance reservations, reserve now. 
2324 */ 2325 if (svd->flags & MAP_NORESERVE) { 2326 if (anon_resv_zone(ptob(1), 2327 seg->s_as->a_proc->p_zone)) { 2328 atomic_add_long(&svd->swresv, ptob(1)); 2329 } else { 2330 err = ENOMEM; 2331 goto out; 2332 } 2333 } 2334 if ((pp = anon_zero(seg, addr, &ap, 2335 svd->cred)) == NULL) { 2336 err = ENOMEM; 2337 goto out; /* out of swap space */ 2338 } 2339 /* 2340 * Re-acquire the anon_map lock and 2341 * initialize the anon array entry. 2342 */ 2343 (void) anon_set_ptr(amp->ahp, anon_index, ap, 2344 ANON_SLEEP); 2345 2346 ASSERT(pp->p_szc == 0); 2347 2348 /* 2349 * Handle pages that have been marked for migration 2350 */ 2351 if (lgrp_optimizations()) 2352 page_migrate(seg, addr, &pp, 1); 2353 2354 if (type == F_SOFTLOCK) { 2355 if (!segvn_pp_lock_anonpages(pp, first)) { 2356 page_unlock(pp); 2357 err = ENOMEM; 2358 goto out; 2359 } else { 2360 mutex_enter(&freemem_lock); 2361 svd->softlockcnt++; 2362 segvn_pages_locked++; 2363 mutex_exit(&freemem_lock); 2364 } 2365 } 2366 2367 if (enable_mbit_wa) { 2368 if (rw == S_WRITE) 2369 hat_setmod(pp); 2370 else if (!hat_ismod(pp)) 2371 prot &= ~PROT_WRITE; 2372 } 2373 /* 2374 * If AS_PAGLCK is set in a_flags (via memcntl(2) 2375 * with MC_LOCKAS, MCL_FUTURE) and this is a 2376 * MAP_NORESERVE segment, we may need to 2377 * permanently lock the page as it is being faulted 2378 * for the first time. The following text applies 2379 * only to MAP_NORESERVE segments: 2380 * 2381 * As per memcntl(2), if this segment was created 2382 * after MCL_FUTURE was applied (a "future" 2383 * segment), its pages must be locked. If this 2384 * segment existed at MCL_FUTURE application (a 2385 * "past" segment), the interface is unclear. 2386 * 2387 * We decide to lock only if vpage is present: 2388 * 2389 * - "future" segments will have a vpage array (see 2390 * as_map), and so will be locked as required 2391 * 2392 * - "past" segments may not have a vpage array, 2393 * depending on whether events (such as 2394 * mprotect) have occurred. 
Locking if vpage 2395 * exists will preserve legacy behavior. Not 2396 * locking if vpage is absent, will not break 2397 * the interface or legacy behavior. Note that 2398 * allocating vpage here if it's absent requires 2399 * upgrading the segvn reader lock, the cost of 2400 * which does not seem worthwhile. 2401 * 2402 * Usually testing and setting VPP_ISPPLOCK and 2403 * VPP_SETPPLOCK requires holding the segvn lock as 2404 * writer, but in this case all readers are 2405 * serializing on the anon array lock. 2406 */ 2407 if (AS_ISPGLCK(seg->s_as) && vpage != NULL && 2408 (svd->flags & MAP_NORESERVE) && 2409 !VPP_ISPPLOCK(vpage)) { 2410 proc_t *p = seg->s_as->a_proc; 2411 ASSERT(svd->type == MAP_PRIVATE); 2412 mutex_enter(&p->p_lock); 2413 if (rctl_incr_locked_mem(p, NULL, PAGESIZE, 2414 1) == 0) { 2415 claim = VPP_PROT(vpage) & PROT_WRITE; 2416 if (page_pp_lock(pp, claim, 0)) { 2417 VPP_SETPPLOCK(vpage); 2418 } else { 2419 rctl_decr_locked_mem(p, NULL, 2420 PAGESIZE, 1); 2421 } 2422 } 2423 mutex_exit(&p->p_lock); 2424 } 2425 2426 hat_memload(hat, addr, pp, prot, hat_flag); 2427 2428 if (!(hat_flag & HAT_LOAD_LOCK)) 2429 page_unlock(pp); 2430 2431 anon_array_exit(&cookie); 2432 return (0); 2433 } 2434 } 2435 2436 /* 2437 * Obtain the page structure via anon_getpage() if it is 2438 * a private copy of an object (the result of a previous 2439 * copy-on-write). 2440 */ 2441 if (amp != NULL) { 2442 if ((ap = anon_get_ptr(amp->ahp, anon_index)) != NULL) { 2443 err = anon_getpage(&ap, &vpprot, anon_pl, PAGESIZE, 2444 seg, addr, rw, svd->cred); 2445 if (err) 2446 goto out; 2447 2448 if (svd->type == MAP_SHARED) { 2449 /* 2450 * If this is a shared mapping to an 2451 * anon_map, then ignore the write 2452 * permissions returned by anon_getpage(). 2453 * They apply to the private mappings 2454 * of this anon_map. 
2455 */ 2456 vpprot |= PROT_WRITE; 2457 } 2458 opp = anon_pl[0]; 2459 } 2460 } 2461 2462 /* 2463 * Search the pl[] list passed in if it is from the 2464 * original object (i.e., not a private copy). 2465 */ 2466 if (opp == NULL) { 2467 /* 2468 * Find original page. We must be bringing it in 2469 * from the list in pl[]. 2470 */ 2471 for (ppp = pl; (opp = *ppp) != NULL; ppp++) { 2472 if (opp == PAGE_HANDLED) 2473 continue; 2474 ASSERT(opp->p_vnode == svd->vp); /* XXX */ 2475 if (opp->p_offset == off) 2476 break; 2477 } 2478 if (opp == NULL) { 2479 panic("segvn_faultpage not found"); 2480 /*NOTREACHED*/ 2481 } 2482 *ppp = PAGE_HANDLED; 2483 2484 } 2485 2486 ASSERT(PAGE_LOCKED(opp)); 2487 2488 TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT, 2489 "segvn_fault:pp %p vp %p offset %llx", 2490 opp, NULL, 0); 2491 2492 /* 2493 * The fault is treated as a copy-on-write fault if a 2494 * write occurs on a private segment and the object 2495 * page (i.e., mapping) is write protected. We assume 2496 * that fatal protection checks have already been made. 2497 */ 2498 2499 cow = brkcow && ((vpprot & PROT_WRITE) == 0); 2500 2501 /* 2502 * If not a copy-on-write case load the translation 2503 * and return. 
2504 */ 2505 if (cow == 0) { 2506 2507 /* 2508 * Handle pages that have been marked for migration 2509 */ 2510 if (lgrp_optimizations()) 2511 page_migrate(seg, addr, &opp, 1); 2512 2513 if (type == F_SOFTLOCK && svd->vp == NULL) { 2514 2515 ASSERT(opp->p_szc == 0 || 2516 (svd->type == MAP_SHARED && 2517 amp != NULL && amp->a_szc != 0)); 2518 2519 if (!segvn_pp_lock_anonpages(opp, first)) { 2520 page_unlock(opp); 2521 err = ENOMEM; 2522 goto out; 2523 } else { 2524 mutex_enter(&freemem_lock); 2525 svd->softlockcnt++; 2526 segvn_pages_locked++; 2527 mutex_exit(&freemem_lock); 2528 } 2529 } 2530 if (IS_VMODSORT(opp->p_vnode) || enable_mbit_wa) { 2531 if (rw == S_WRITE) 2532 hat_setmod(opp); 2533 else if (rw != S_OTHER && !hat_ismod(opp)) 2534 prot &= ~PROT_WRITE; 2535 } 2536 2537 hat_memload(hat, addr, opp, prot & vpprot, hat_flag); 2538 2539 if (!(hat_flag & HAT_LOAD_LOCK)) 2540 page_unlock(opp); 2541 2542 if (anon_lock) { 2543 anon_array_exit(&cookie); 2544 } 2545 return (0); 2546 } 2547 2548 hat_setref(opp); 2549 2550 ASSERT(amp != NULL && anon_lock); 2551 2552 /* 2553 * Steal the page only if it isn't a private page 2554 * since stealing a private page is not worth the effort. 2555 */ 2556 if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) 2557 steal = 1; 2558 2559 /* 2560 * Steal the original page if the following conditions are true: 2561 * 2562 * We are low on memory, the page is not private, page is not large, 2563 * not shared, not modified, not `locked' or if we have it `locked' 2564 * (i.e., p_cowcnt == 1 and p_lckcnt == 0, which also implies 2565 * that the page is not shared) and if it doesn't have any 2566 * translations. page_struct_lock isn't needed to look at p_cowcnt 2567 * and p_lckcnt because we first get exclusive lock on page. 
2568 */ 2569 (void) hat_pagesync(opp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD); 2570 2571 if (stealcow && freemem < minfree && steal && opp->p_szc == 0 && 2572 page_tryupgrade(opp) && !hat_ismod(opp) && 2573 ((opp->p_lckcnt == 0 && opp->p_cowcnt == 0) || 2574 (opp->p_lckcnt == 0 && opp->p_cowcnt == 1 && 2575 vpage != NULL && VPP_ISPPLOCK(vpage)))) { 2576 /* 2577 * Check if this page has other translations 2578 * after unloading our translation. 2579 */ 2580 if (hat_page_is_mapped(opp)) { 2581 hat_unload(seg->s_as->a_hat, addr, PAGESIZE, 2582 HAT_UNLOAD); 2583 } 2584 2585 /* 2586 * hat_unload() might sync back someone else's recent 2587 * modification, so check again. 2588 */ 2589 if (!hat_ismod(opp) && !hat_page_is_mapped(opp)) 2590 pageflags |= STEAL_PAGE; 2591 } 2592 2593 /* 2594 * If we have a vpage pointer, see if it indicates that we have 2595 * ``locked'' the page we map -- if so, tell anon_private to 2596 * transfer the locking resource to the new page. 2597 * 2598 * See Statement at the beginning of segvn_lockop regarding 2599 * the way lockcnts/cowcnts are handled during COW. 2600 * 2601 */ 2602 if (vpage != NULL && VPP_ISPPLOCK(vpage)) 2603 pageflags |= LOCK_PAGE; 2604 2605 /* 2606 * Allocate a private page and perform the copy. 2607 * For MAP_NORESERVE reserve swap space now, unless this 2608 * is a cow fault on an existing anon page in which case 2609 * MAP_NORESERVE will have made advance reservations. 2610 */ 2611 if ((svd->flags & MAP_NORESERVE) && (ap == NULL)) { 2612 if (anon_resv(ptob(1))) { 2613 svd->swresv += ptob(1); 2614 } else { 2615 page_unlock(opp); 2616 err = ENOMEM; 2617 goto out; 2618 } 2619 } 2620 oldap = ap; 2621 pp = anon_private(&ap, seg, addr, prot, opp, pageflags, svd->cred); 2622 if (pp == NULL) { 2623 err = ENOMEM; /* out of swap space */ 2624 goto out; 2625 } 2626 2627 /* 2628 * If we copied away from an anonymous page, then 2629 * we are one step closer to freeing up an anon slot. 
2630 * 2631 * NOTE: The original anon slot must be released while 2632 * holding the "anon_map" lock. This is necessary to prevent 2633 * other threads from obtaining a pointer to the anon slot 2634 * which may be freed if its "refcnt" is 1. 2635 */ 2636 if (oldap != NULL) 2637 anon_decref(oldap); 2638 2639 (void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP); 2640 2641 /* 2642 * Handle pages that have been marked for migration 2643 */ 2644 if (lgrp_optimizations()) 2645 page_migrate(seg, addr, &pp, 1); 2646 2647 ASSERT(pp->p_szc == 0); 2648 if (type == F_SOFTLOCK && svd->vp == NULL) { 2649 if (!segvn_pp_lock_anonpages(pp, first)) { 2650 page_unlock(pp); 2651 err = ENOMEM; 2652 goto out; 2653 } else { 2654 mutex_enter(&freemem_lock); 2655 svd->softlockcnt++; 2656 segvn_pages_locked++; 2657 mutex_exit(&freemem_lock); 2658 } 2659 } 2660 2661 ASSERT(!IS_VMODSORT(pp->p_vnode)); 2662 if (enable_mbit_wa) { 2663 if (rw == S_WRITE) 2664 hat_setmod(pp); 2665 else if (!hat_ismod(pp)) 2666 prot &= ~PROT_WRITE; 2667 } 2668 2669 hat_memload(hat, addr, pp, prot, hat_flag); 2670 2671 if (!(hat_flag & HAT_LOAD_LOCK)) 2672 page_unlock(pp); 2673 2674 ASSERT(anon_lock); 2675 anon_array_exit(&cookie); 2676 return (0); 2677 out: 2678 if (anon_lock) 2679 anon_array_exit(&cookie); 2680 2681 if (type == F_SOFTLOCK && svd->vp != NULL) { 2682 mutex_enter(&freemem_lock); 2683 availrmem++; 2684 segvn_pages_locked--; 2685 svd->softlockcnt--; 2686 mutex_exit(&freemem_lock); 2687 } 2688 return (FC_MAKE_ERR(err)); 2689 } 2690 2691 /* 2692 * relocate a bunch of smaller targ pages into one large repl page. all targ 2693 * pages must be complete pages smaller than replacement pages. 2694 * it's assumed that no page's szc can change since they are all PAGESIZE or 2695 * complete large pages locked SHARED. 
2696 */ 2697 static void 2698 segvn_relocate_pages(page_t **targ, page_t *replacement) 2699 { 2700 page_t *pp; 2701 pgcnt_t repl_npgs, curnpgs; 2702 pgcnt_t i; 2703 uint_t repl_szc = replacement->p_szc; 2704 page_t *first_repl = replacement; 2705 page_t *repl; 2706 spgcnt_t npgs; 2707 2708 VM_STAT_ADD(segvnvmstats.relocatepages[0]); 2709 2710 ASSERT(repl_szc != 0); 2711 npgs = repl_npgs = page_get_pagecnt(repl_szc); 2712 2713 i = 0; 2714 while (repl_npgs) { 2715 spgcnt_t nreloc; 2716 int err; 2717 ASSERT(replacement != NULL); 2718 pp = targ[i]; 2719 ASSERT(pp->p_szc < repl_szc); 2720 ASSERT(PAGE_EXCL(pp)); 2721 ASSERT(!PP_ISFREE(pp)); 2722 curnpgs = page_get_pagecnt(pp->p_szc); 2723 if (curnpgs == 1) { 2724 VM_STAT_ADD(segvnvmstats.relocatepages[1]); 2725 repl = replacement; 2726 page_sub(&replacement, repl); 2727 ASSERT(PAGE_EXCL(repl)); 2728 ASSERT(!PP_ISFREE(repl)); 2729 ASSERT(repl->p_szc == repl_szc); 2730 } else { 2731 page_t *repl_savepp; 2732 int j; 2733 VM_STAT_ADD(segvnvmstats.relocatepages[2]); 2734 repl_savepp = replacement; 2735 for (j = 0; j < curnpgs; j++) { 2736 repl = replacement; 2737 page_sub(&replacement, repl); 2738 ASSERT(PAGE_EXCL(repl)); 2739 ASSERT(!PP_ISFREE(repl)); 2740 ASSERT(repl->p_szc == repl_szc); 2741 ASSERT(page_pptonum(targ[i + j]) == 2742 page_pptonum(targ[i]) + j); 2743 } 2744 repl = repl_savepp; 2745 ASSERT(IS_P2ALIGNED(page_pptonum(repl), curnpgs)); 2746 } 2747 err = page_relocate(&pp, &repl, 0, 1, &nreloc, NULL); 2748 if (err || nreloc != curnpgs) { 2749 panic("segvn_relocate_pages: " 2750 "page_relocate failed err=%d curnpgs=%ld " 2751 "nreloc=%ld", err, curnpgs, nreloc); 2752 } 2753 ASSERT(curnpgs <= repl_npgs); 2754 repl_npgs -= curnpgs; 2755 i += curnpgs; 2756 } 2757 ASSERT(replacement == NULL); 2758 2759 repl = first_repl; 2760 repl_npgs = npgs; 2761 for (i = 0; i < repl_npgs; i++) { 2762 ASSERT(PAGE_EXCL(repl)); 2763 ASSERT(!PP_ISFREE(repl)); 2764 targ[i] = repl; 2765 page_downgrade(targ[i]); 2766 repl++; 2767 } 2768 } 
/*
 * Check if all pages in ppa array are complete smaller than szc pages and
 * their roots will still be aligned relative to their current size if the
 * entire ppa array is relocated into one szc page. If these conditions are
 * not met return 0.
 *
 * If all pages are properly aligned attempt to upgrade their locks
 * to exclusive mode. If it fails set *upgrdfail to 1 and return 0.
 * upgrdfail was set to 0 by caller.
 *
 * Return 1 if all pages are aligned and locked exclusively.
 *
 * If all pages in ppa array happen to be physically contiguous to make one
 * szc page and all exclusive locks are successfully obtained promote the page
 * size to szc and set *pszc to szc. Return 1 with pages locked shared.
 */
static int
segvn_full_szcpages(page_t **ppa, uint_t szc, int *upgrdfail, uint_t *pszc)
{
	page_t *pp;
	pfn_t pfn;
	pgcnt_t totnpgs = page_get_pagecnt(szc);
	pfn_t first_pfn;
	int contig = 1;
	pgcnt_t i;
	pgcnt_t j;
	uint_t curszc;
	pgcnt_t curnpgs;
	int root = 0;

	ASSERT(szc > 0);

	VM_STAT_ADD(segvnvmstats.fullszcpages[0]);

	/*
	 * First pass: verify alignment/contiguity of every constituent
	 * page without taking any additional locks.  "root" tracks
	 * whether we are inside a large page whose root we've already
	 * validated.
	 */
	for (i = 0; i < totnpgs; i++) {
		pp = ppa[i];
		ASSERT(PAGE_SHARED(pp));
		ASSERT(!PP_ISFREE(pp));
		pfn = page_pptonum(pp);
		if (i == 0) {
			if (!IS_P2ALIGNED(pfn, totnpgs)) {
				contig = 0;
			} else {
				first_pfn = pfn;
			}
		} else if (contig && pfn != first_pfn + i) {
			contig = 0;
		}
		if (pp->p_szc == 0) {
			if (root) {
				VM_STAT_ADD(segvnvmstats.fullszcpages[1]);
				return (0);
			}
		} else if (!root) {
			if ((curszc = pp->p_szc) >= szc) {
				VM_STAT_ADD(segvnvmstats.fullszcpages[2]);
				return (0);
			}
			if (curszc == 0) {
				/*
				 * p_szc changed means we don't have all pages
				 * locked. return failure.
				 */
				VM_STAT_ADD(segvnvmstats.fullszcpages[3]);
				return (0);
			}
			curnpgs = page_get_pagecnt(curszc);
			if (!IS_P2ALIGNED(pfn, curnpgs) ||
			    !IS_P2ALIGNED(i, curnpgs)) {
				VM_STAT_ADD(segvnvmstats.fullszcpages[4]);
				return (0);
			}
			root = 1;
		} else {
			ASSERT(i > 0);
			VM_STAT_ADD(segvnvmstats.fullszcpages[5]);
			if (pp->p_szc != curszc) {
				VM_STAT_ADD(segvnvmstats.fullszcpages[6]);
				return (0);
			}
			if (pfn - 1 != page_pptonum(ppa[i - 1])) {
				panic("segvn_full_szcpages: "
				    "large page not physically contiguous");
			}
			if (P2PHASE(pfn, curnpgs) == curnpgs - 1) {
				root = 0;
			}
		}
	}

	/*
	 * Second pass: upgrade all locks to exclusive; on the first
	 * failure downgrade the ones already upgraded and bail.
	 */
	for (i = 0; i < totnpgs; i++) {
		ASSERT(ppa[i]->p_szc < szc);
		if (!page_tryupgrade(ppa[i])) {
			for (j = 0; j < i; j++) {
				page_downgrade(ppa[j]);
			}
			*pszc = ppa[i]->p_szc;
			*upgrdfail = 1;
			VM_STAT_ADD(segvnvmstats.fullszcpages[7]);
			return (0);
		}
	}

	/*
	 * When a page is put on a free cachelist its szc is set to 0. if file
	 * system reclaimed pages from cachelist targ pages will be physically
	 * contiguous with 0 p_szc. in this case just upgrade szc of targ
	 * pages without any relocations.
	 * To avoid any hat issues with previous small mappings
	 * hat_pageunload() the target pages first.
	 */
	if (contig) {
		VM_STAT_ADD(segvnvmstats.fullszcpages[8]);
		for (i = 0; i < totnpgs; i++) {
			(void) hat_pageunload(ppa[i], HAT_FORCE_PGUNLOAD);
		}
		for (i = 0; i < totnpgs; i++) {
			ppa[i]->p_szc = szc;
		}
		for (i = 0; i < totnpgs; i++) {
			ASSERT(PAGE_EXCL(ppa[i]));
			page_downgrade(ppa[i]);
		}
		if (pszc != NULL) {
			*pszc = szc;
		}
	}
	VM_STAT_ADD(segvnvmstats.fullszcpages[9]);
	return (1);
}

/*
 * Create physically contiguous pages for [vp, off] - [vp, off +
 * page_size(szc)) range and for private segment return them in ppa array.
 * Pages are created either via IO or relocations.
 *
 * Return 1 on success and 0 on failure.
 *
 * If physically contiguous pages already exist for this range return 1 without
 * filling ppa array. Caller initializes ppa[0] as NULL to detect that ppa
 * array wasn't filled. In this case caller fills ppa array via VOP_GETPAGE().
 */

static int
segvn_fill_vp_pages(struct segvn_data *svd, vnode_t *vp, u_offset_t off,
    uint_t szc, page_t **ppa, page_t **ppplist, uint_t *ret_pszc,
    int *downsize)

{
	page_t *pplist = *ppplist;
	size_t pgsz = page_get_pagesize(szc);
	pgcnt_t pages = btop(pgsz);
	ulong_t start_off = off;
	u_offset_t eoff = off + pgsz;
	spgcnt_t nreloc;
	u_offset_t io_off = off;
	size_t io_len;
	page_t *io_pplist = NULL;
	page_t *done_pplist = NULL;
	pgcnt_t pgidx = 0;
	page_t *pp;
	page_t *newpp;
	page_t *targpp;
	int io_err = 0;
	int i;
	pfn_t pfn;
	ulong_t ppages;
	page_t *targ_pplist = NULL;
	page_t *repl_pplist = NULL;
	page_t *tmp_pplist;
	int nios = 0;
	uint_t pszc;
	struct vattr va;

	VM_STAT_ADD(segvnvmstats.fill_vp_pages[0]);

	ASSERT(szc != 0);
	ASSERT(pplist->p_szc == szc);

	/*
	 * downsize will be set to 1 only if we fail to lock pages. this will
	 * allow subsequent faults to try to relocate the page again. If we
	 * fail due to misalignment don't downsize and let the caller map the
	 * whole region with small mappings to avoid more faults into the area
	 * where we can't get large pages anyway.
	 */
	*downsize = 0;

	while (off < eoff) {
		newpp = pplist;
		ASSERT(newpp != NULL);
		ASSERT(PAGE_EXCL(newpp));
		ASSERT(!PP_ISFREE(newpp));
		/*
		 * we pass NULL for nrelocp to page_lookup_create()
		 * so that it doesn't relocate. We relocate here
		 * later only after we make sure we can lock all
		 * pages in the range we handle and they are all
		 * aligned.
		 */
		pp = page_lookup_create(vp, off, SE_SHARED, newpp, NULL, 0);
		ASSERT(pp != NULL);
		ASSERT(!PP_ISFREE(pp));
		ASSERT(pp->p_vnode == vp);
		ASSERT(pp->p_offset == off);
		if (pp == newpp) {
			/* page didn't exist: our new page was inserted */
			VM_STAT_ADD(segvnvmstats.fill_vp_pages[1]);
			page_sub(&pplist, pp);
			ASSERT(PAGE_EXCL(pp));
			ASSERT(page_iolock_assert(pp));
			page_list_concat(&io_pplist, &pp);
			off += PAGESIZE;
			continue;
		}
		VM_STAT_ADD(segvnvmstats.fill_vp_pages[2]);
		pfn = page_pptonum(pp);
		pszc = pp->p_szc;
		if (pszc >= szc && targ_pplist == NULL && io_pplist == NULL &&
		    IS_P2ALIGNED(pfn, pages)) {
			/*
			 * Existing pages are already large enough and
			 * aligned: give back the unused replacement pages.
			 */
			ASSERT(repl_pplist == NULL);
			ASSERT(done_pplist == NULL);
			ASSERT(pplist == *ppplist);
			page_unlock(pp);
			page_free_replacement_page(pplist);
			page_create_putback(pages);
			*ppplist = NULL;
			VM_STAT_ADD(segvnvmstats.fill_vp_pages[3]);
			return (1);
		}
		if (pszc >= szc) {
			page_unlock(pp);
			segvn_faultvnmpss_align_err1++;
			goto out;
		}
		ppages = page_get_pagecnt(pszc);
		if (!IS_P2ALIGNED(pfn, ppages)) {
			ASSERT(pszc > 0);
			/*
			 * sizing down to pszc won't help.
			 */
			page_unlock(pp);
			segvn_faultvnmpss_align_err2++;
			goto out;
		}
		pfn = page_pptonum(newpp);
		if (!IS_P2ALIGNED(pfn, ppages)) {
			ASSERT(pszc > 0);
			/*
			 * sizing down to pszc won't help.
			 */
			page_unlock(pp);
			segvn_faultvnmpss_align_err3++;
			goto out;
		}
		if (!PAGE_EXCL(pp)) {
			VM_STAT_ADD(segvnvmstats.fill_vp_pages[4]);
			page_unlock(pp);
			*downsize = 1;
			*ret_pszc = pp->p_szc;
			goto out;
		}
		targpp = pp;
		if (io_pplist != NULL) {
			VM_STAT_ADD(segvnvmstats.fill_vp_pages[5]);
			io_len = off - io_off;
			/*
			 * Some file systems like NFS don't check EOF
			 * conditions in VOP_PAGEIO(). Check it here
			 * now that pages are locked SE_EXCL. Any file
			 * truncation will wait until the pages are
			 * unlocked so no need to worry that file will
			 * be truncated after we check its size here.
			 * XXX fix NFS to remove this check.
			 */
			va.va_mask = AT_SIZE;
			if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred) != 0) {
				VM_STAT_ADD(segvnvmstats.fill_vp_pages[6]);
				page_unlock(targpp);
				goto out;
			}
			if (btopr(va.va_size) < btopr(io_off + io_len)) {
				VM_STAT_ADD(segvnvmstats.fill_vp_pages[7]);
				*downsize = 1;
				*ret_pszc = 0;
				page_unlock(targpp);
				goto out;
			}
			io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len,
				B_READ, svd->cred);
			if (io_err) {
				VM_STAT_ADD(segvnvmstats.fill_vp_pages[8]);
				page_unlock(targpp);
				if (io_err == EDEADLK) {
					segvn_vmpss_pageio_deadlk_err++;
				}
				goto out;
			}
			nios++;
			VM_STAT_ADD(segvnvmstats.fill_vp_pages[9]);
			while (io_pplist != NULL) {
				pp = io_pplist;
				page_sub(&io_pplist, pp);
				ASSERT(page_iolock_assert(pp));
				page_io_unlock(pp);
				pgidx = (pp->p_offset - start_off) >>
				    PAGESHIFT;
				ASSERT(pgidx < pages);
				ppa[pgidx] = pp;
				page_list_concat(&done_pplist, &pp);
			}
		}
		pp = targpp;
		ASSERT(PAGE_EXCL(pp));
		ASSERT(pp->p_szc <= pszc);
		if (pszc != 0 && !group_page_trylock(pp, SE_EXCL)) {
			VM_STAT_ADD(segvnvmstats.fill_vp_pages[10]);
			page_unlock(pp);
			*downsize = 1;
			*ret_pszc = pp->p_szc;
			goto out;
		}
		VM_STAT_ADD(segvnvmstats.fill_vp_pages[11]);
		/*
		 * page szc could have changed before the entire group was
		 * locked. reread page szc.
		 */
		pszc = pp->p_szc;
		ppages = page_get_pagecnt(pszc);

		/* link just the roots */
		page_list_concat(&targ_pplist, &pp);
		page_sub(&pplist, newpp);
		page_list_concat(&repl_pplist, &newpp);
		off += PAGESIZE;
		while (--ppages != 0) {
			newpp = pplist;
			page_sub(&pplist, newpp);
			off += PAGESIZE;
		}
		io_off = off;
	}
	if (io_pplist != NULL) {
		/* read in any remaining trailing pages */
		VM_STAT_ADD(segvnvmstats.fill_vp_pages[12]);
		io_len = eoff - io_off;
		va.va_mask = AT_SIZE;
		if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred) != 0) {
			VM_STAT_ADD(segvnvmstats.fill_vp_pages[13]);
			goto out;
		}
		if (btopr(va.va_size) < btopr(io_off + io_len)) {
			VM_STAT_ADD(segvnvmstats.fill_vp_pages[14]);
			*downsize = 1;
			*ret_pszc = 0;
			goto out;
		}
		io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len,
		    B_READ, svd->cred);
		if (io_err) {
			VM_STAT_ADD(segvnvmstats.fill_vp_pages[15]);
			if (io_err == EDEADLK) {
				segvn_vmpss_pageio_deadlk_err++;
			}
			goto out;
		}
		nios++;
		while (io_pplist != NULL) {
			pp = io_pplist;
			page_sub(&io_pplist, pp);
			ASSERT(page_iolock_assert(pp));
			page_io_unlock(pp);
			pgidx = (pp->p_offset - start_off) >> PAGESHIFT;
			ASSERT(pgidx < pages);
			ppa[pgidx] = pp;
		}
	}
	/*
	 * we're now bound to succeed or panic.
	 * remove pages from done_pplist. it's not needed anymore.
	 */
	while (done_pplist != NULL) {
		pp = done_pplist;
		page_sub(&done_pplist, pp);
	}
	VM_STAT_ADD(segvnvmstats.fill_vp_pages[16]);
	ASSERT(pplist == NULL);
	*ppplist = NULL;
	while (targ_pplist != NULL) {
		int ret;
		VM_STAT_ADD(segvnvmstats.fill_vp_pages[17]);
		ASSERT(repl_pplist);
		pp = targ_pplist;
		page_sub(&targ_pplist, pp);
		pgidx = (pp->p_offset - start_off) >> PAGESHIFT;
		newpp = repl_pplist;
		page_sub(&repl_pplist, newpp);
#ifdef DEBUG
		pfn = page_pptonum(pp);
		pszc = pp->p_szc;
		ppages = page_get_pagecnt(pszc);
		ASSERT(IS_P2ALIGNED(pfn, ppages));
		pfn = page_pptonum(newpp);
		ASSERT(IS_P2ALIGNED(pfn, ppages));
		ASSERT(P2PHASE(pfn, pages) == pgidx);
#endif
		nreloc = 0;
		ret = page_relocate(&pp, &newpp, 0, 1, &nreloc, NULL);
		if (ret != 0 || nreloc == 0) {
			panic("segvn_fill_vp_pages: "
			    "page_relocate failed");
		}
		pp = newpp;
		while (nreloc-- != 0) {
			ASSERT(PAGE_EXCL(pp));
			ASSERT(pp->p_vnode == vp);
			ASSERT(pgidx ==
			    ((pp->p_offset - start_off) >> PAGESHIFT));
			ppa[pgidx++] = pp;
			pp++;
		}
	}

	if (svd->type == MAP_PRIVATE) {
		VM_STAT_ADD(segvnvmstats.fill_vp_pages[18]);
		for (i = 0; i < pages; i++) {
			ASSERT(ppa[i] != NULL);
			ASSERT(PAGE_EXCL(ppa[i]));
			ASSERT(ppa[i]->p_vnode == vp);
			ASSERT(ppa[i]->p_offset ==
			    start_off + (i << PAGESHIFT));
			page_downgrade(ppa[i]);
		}
		ppa[pages] = NULL;
	} else {
		VM_STAT_ADD(segvnvmstats.fill_vp_pages[19]);
		/*
		 * the caller will still call VOP_GETPAGE() for shared segments
		 * to check FS write permissions. For private segments we map
		 * file read only anyway. so no VOP_GETPAGE is needed.
		 */
		for (i = 0; i < pages; i++) {
			ASSERT(ppa[i] != NULL);
			ASSERT(PAGE_EXCL(ppa[i]));
			ASSERT(ppa[i]->p_vnode == vp);
			ASSERT(ppa[i]->p_offset ==
			    start_off + (i << PAGESHIFT));
			page_unlock(ppa[i]);
		}
		ppa[0] = NULL;
	}

	return (1);
out:
	/*
	 * Do the cleanup. Unlock target pages we didn't relocate. They are
	 * linked on targ_pplist by root pages. reassemble unused replacement
	 * and io pages back to pplist.
	 */
	if (io_pplist != NULL) {
		VM_STAT_ADD(segvnvmstats.fill_vp_pages[20]);
		pp = io_pplist;
		do {
			ASSERT(pp->p_vnode == vp);
			ASSERT(pp->p_offset == io_off);
			ASSERT(page_iolock_assert(pp));
			page_io_unlock(pp);
			page_hashout(pp, NULL);
			io_off += PAGESIZE;
		} while ((pp = pp->p_next) != io_pplist);
		page_list_concat(&io_pplist, &pplist);
		pplist = io_pplist;
	}
	tmp_pplist = NULL;
	while (targ_pplist != NULL) {
		VM_STAT_ADD(segvnvmstats.fill_vp_pages[21]);
		pp = targ_pplist;
		ASSERT(PAGE_EXCL(pp));
		page_sub(&targ_pplist, pp);

		pszc = pp->p_szc;
		ppages = page_get_pagecnt(pszc);
		ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages));

		if (pszc != 0) {
			group_page_unlock(pp);
		}
		page_unlock(pp);

		pp = repl_pplist;
		ASSERT(pp != NULL);
		ASSERT(PAGE_EXCL(pp));
		ASSERT(pp->p_szc == szc);
		page_sub(&repl_pplist, pp);

		ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages));

		/* relink replacement page */
		page_list_concat(&tmp_pplist, &pp);
		while (--ppages != 0) {
			VM_STAT_ADD(segvnvmstats.fill_vp_pages[22]);
			pp++;
			ASSERT(PAGE_EXCL(pp));
			ASSERT(pp->p_szc == szc);
			page_list_concat(&tmp_pplist, &pp);
		}
	}
	if (tmp_pplist != NULL) {
		VM_STAT_ADD(segvnvmstats.fill_vp_pages[23]);
		page_list_concat(&tmp_pplist, &pplist);
		pplist = tmp_pplist;
	}
	/*
	 * at this point all pages are either on done_pplist or
	 * pplist. They can't be all on done_pplist otherwise
	 * we'd've been done.
	 */
	ASSERT(pplist != NULL);
	if (nios != 0) {
		VM_STAT_ADD(segvnvmstats.fill_vp_pages[24]);
		pp = pplist;
		do {
			VM_STAT_ADD(segvnvmstats.fill_vp_pages[25]);
			ASSERT(pp->p_szc == szc);
			ASSERT(PAGE_EXCL(pp));
			ASSERT(pp->p_vnode != vp);
			pp->p_szc = 0;
		} while ((pp = pp->p_next) != pplist);

		pp = done_pplist;
		do {
			VM_STAT_ADD(segvnvmstats.fill_vp_pages[26]);
			ASSERT(pp->p_szc == szc);
			ASSERT(PAGE_EXCL(pp));
			ASSERT(pp->p_vnode == vp);
			pp->p_szc = 0;
		} while ((pp = pp->p_next) != done_pplist);

		while (pplist != NULL) {
			VM_STAT_ADD(segvnvmstats.fill_vp_pages[27]);
			pp = pplist;
			page_sub(&pplist, pp);
			page_free(pp, 0);
		}

		while (done_pplist != NULL) {
			VM_STAT_ADD(segvnvmstats.fill_vp_pages[28]);
			pp = done_pplist;
			page_sub(&done_pplist, pp);
			page_unlock(pp);
		}
		*ppplist = NULL;
		return (0);
	}
	ASSERT(pplist == *ppplist);
	if (io_err) {
		VM_STAT_ADD(segvnvmstats.fill_vp_pages[29]);
		/*
		 * don't downsize on io error.
		 * see if vop_getpage succeeds.
		 * pplist may still be used in this case
		 * for relocations.
		 */
		return (0);
	}
	VM_STAT_ADD(segvnvmstats.fill_vp_pages[30]);
	page_free_replacement_page(pplist);
	page_create_putback(pages);
	*ppplist = NULL;
	return (0);
}

int segvn_anypgsz = 0;

/*
 * Undo the availrmem/softlockcnt accounting done when a SOFTLOCK
 * fault on (pages) pages fails partway through.
 */
#define	SEGVN_RESTORE_SOFTLOCK(type, pages) 		\
	if ((type) == F_SOFTLOCK) { 			\
		mutex_enter(&freemem_lock); 		\
		availrmem += (pages); 			\
		segvn_pages_locked -= (pages); 		\
		svd->softlockcnt -= (pages); 		\
		mutex_exit(&freemem_lock); 		\
	}

/*
 * For VMODSORT vnodes either mark all (pages) pages modified on a
 * write access, or drop PROT_WRITE from prot if any page is still
 * clean so a later write faults and gets accounted.
 */
#define	SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot) 	\
	if (IS_VMODSORT((ppa)[0]->p_vnode)) { 			\
		if ((rw) == S_WRITE) { 				\
			for (i = 0; i < (pages); i++) { 	\
				ASSERT((ppa)[i]->p_vnode == 	\
				    (ppa)[0]->p_vnode); 	\
				hat_setmod((ppa)[i]); 		\
			} 					\
		} else if ((rw) != S_OTHER && 			\
		    ((prot) & (vpprot) & PROT_WRITE)) { 	\
			for (i = 0; i < (pages); i++) { 	\
				ASSERT((ppa)[i]->p_vnode == 	\
				    (ppa)[0]->p_vnode); 	\
				if (!hat_ismod((ppa)[i])) { 	\
					prot &= ~PROT_WRITE; 	\
					break; 			\
				} 				\
			} 					\
		} 						\
	}

#ifdef  VM_STATS

#define	SEGVN_VMSTAT_FLTVNPAGES(idx)					\
		VM_STAT_ADD(segvnvmstats.fltvnpages[(idx)]);

#else /* VM_STATS */

#define	SEGVN_VMSTAT_FLTVNPAGES(idx)

#endif

/*
 * Large-page fault handling for a vnode backed segment (definition
 * continues below; asserts require szc != 0 and vp != NULL).
 */
static faultcode_t
segvn_fault_vnodepages(struct hat *hat, struct seg *seg, caddr_t lpgaddr,
    caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr,
    caddr_t eaddr, int brkcow)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct anon_map *amp = svd->amp;
	uchar_t segtype = svd->type;
	uint_t szc = seg->s_szc;
	size_t pgsz = page_get_pagesize(szc);
	size_t maxpgsz = pgsz;
	pgcnt_t pages = btop(pgsz);
	pgcnt_t maxpages = pages;
	size_t ppasize = (pages + 1) * sizeof (page_t *);
	caddr_t a = lpgaddr;
	caddr_t maxlpgeaddr = lpgeaddr;
	u_offset_t off = svd->offset + (uintptr_t)(a -
seg->s_base); 3400 ulong_t aindx = svd->anon_index + seg_page(seg, a); 3401 struct vpage *vpage = (svd->vpage != NULL) ? 3402 &svd->vpage[seg_page(seg, a)] : NULL; 3403 vnode_t *vp = svd->vp; 3404 page_t **ppa; 3405 uint_t pszc; 3406 size_t ppgsz; 3407 pgcnt_t ppages; 3408 faultcode_t err = 0; 3409 int ierr; 3410 int vop_size_err = 0; 3411 uint_t protchk, prot, vpprot; 3412 ulong_t i; 3413 int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD; 3414 anon_sync_obj_t an_cookie; 3415 enum seg_rw arw; 3416 int alloc_failed = 0; 3417 int adjszc_chk; 3418 struct vattr va; 3419 int xhat = 0; 3420 page_t *pplist; 3421 pfn_t pfn; 3422 int physcontig; 3423 int upgrdfail; 3424 int segvn_anypgsz_vnode = 0; /* for now map vnode with 2 page sizes */ 3425 3426 ASSERT(szc != 0); 3427 ASSERT(vp != NULL); 3428 ASSERT(brkcow == 0 || amp != NULL); 3429 ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */ 3430 ASSERT(!(svd->flags & MAP_NORESERVE)); 3431 ASSERT(type != F_SOFTUNLOCK); 3432 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 3433 ASSERT(amp == NULL || IS_P2ALIGNED(aindx, maxpages)); 3434 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 3435 ASSERT(seg->s_szc < NBBY * sizeof (int)); 3436 ASSERT(type != F_SOFTLOCK || lpgeaddr - a == maxpgsz); 3437 3438 VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltvnpages[0]); 3439 VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltvnpages[1]); 3440 3441 if (svd->flags & MAP_TEXT) { 3442 hat_flag |= HAT_LOAD_TEXT; 3443 } 3444 3445 if (svd->pageprot) { 3446 switch (rw) { 3447 case S_READ: 3448 protchk = PROT_READ; 3449 break; 3450 case S_WRITE: 3451 protchk = PROT_WRITE; 3452 break; 3453 case S_EXEC: 3454 protchk = PROT_EXEC; 3455 break; 3456 case S_OTHER: 3457 default: 3458 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 3459 break; 3460 } 3461 } else { 3462 prot = svd->prot; 3463 /* caller has already done segment level protection check. 
*/ 3464 } 3465 3466 if (seg->s_as->a_hat != hat) { 3467 xhat = 1; 3468 } 3469 3470 if (rw == S_WRITE && segtype == MAP_PRIVATE) { 3471 SEGVN_VMSTAT_FLTVNPAGES(2); 3472 arw = S_READ; 3473 } else { 3474 arw = rw; 3475 } 3476 3477 ppa = kmem_alloc(ppasize, KM_SLEEP); 3478 3479 VM_STAT_COND_ADD(amp != NULL, segvnvmstats.fltvnpages[3]); 3480 3481 for (;;) { 3482 adjszc_chk = 0; 3483 for (; a < lpgeaddr; a += pgsz, off += pgsz, aindx += pages) { 3484 if (adjszc_chk) { 3485 while (szc < seg->s_szc) { 3486 uintptr_t e; 3487 uint_t tszc; 3488 tszc = segvn_anypgsz_vnode ? szc + 1 : 3489 seg->s_szc; 3490 ppgsz = page_get_pagesize(tszc); 3491 if (!IS_P2ALIGNED(a, ppgsz) || 3492 ((alloc_failed >> tszc) & 3493 0x1)) { 3494 break; 3495 } 3496 SEGVN_VMSTAT_FLTVNPAGES(4); 3497 szc = tszc; 3498 pgsz = ppgsz; 3499 pages = btop(pgsz); 3500 e = P2ROUNDUP((uintptr_t)eaddr, pgsz); 3501 lpgeaddr = (caddr_t)e; 3502 } 3503 } 3504 3505 again: 3506 if (IS_P2ALIGNED(a, maxpgsz) && amp != NULL) { 3507 ASSERT(IS_P2ALIGNED(aindx, maxpages)); 3508 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 3509 anon_array_enter(amp, aindx, &an_cookie); 3510 if (anon_get_ptr(amp->ahp, aindx) != NULL) { 3511 SEGVN_VMSTAT_FLTVNPAGES(5); 3512 if (anon_pages(amp->ahp, aindx, 3513 maxpages) != maxpages) { 3514 panic("segvn_fault_vnodepages:" 3515 " empty anon slots\n"); 3516 } 3517 anon_array_exit(&an_cookie); 3518 ANON_LOCK_EXIT(&->a_rwlock); 3519 err = segvn_fault_anonpages(hat, seg, 3520 a, a + maxpgsz, type, rw, 3521 MAX(a, addr), 3522 MIN(a + maxpgsz, eaddr), brkcow); 3523 if (err != 0) { 3524 SEGVN_VMSTAT_FLTVNPAGES(6); 3525 goto out; 3526 } 3527 if (szc < seg->s_szc) { 3528 szc = seg->s_szc; 3529 pgsz = maxpgsz; 3530 pages = maxpages; 3531 lpgeaddr = maxlpgeaddr; 3532 } 3533 goto next; 3534 } else if (anon_pages(amp->ahp, aindx, 3535 maxpages)) { 3536 panic("segvn_fault_vnodepages:" 3537 " non empty anon slots\n"); 3538 } else { 3539 SEGVN_VMSTAT_FLTVNPAGES(7); 3540 anon_array_exit(&an_cookie); 3541 
ANON_LOCK_EXIT(&->a_rwlock); 3542 } 3543 } 3544 ASSERT(!brkcow || IS_P2ALIGNED(a, maxpgsz)); 3545 3546 if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) { 3547 ASSERT(vpage != NULL); 3548 prot = VPP_PROT(vpage); 3549 ASSERT(sameprot(seg, a, maxpgsz)); 3550 if ((prot & protchk) == 0) { 3551 SEGVN_VMSTAT_FLTVNPAGES(8); 3552 err = FC_PROT; 3553 goto out; 3554 } 3555 } 3556 if (type == F_SOFTLOCK) { 3557 mutex_enter(&freemem_lock); 3558 if (availrmem < tune.t_minarmem + pages) { 3559 mutex_exit(&freemem_lock); 3560 err = FC_MAKE_ERR(ENOMEM); 3561 goto out; 3562 } else { 3563 availrmem -= pages; 3564 segvn_pages_locked += pages; 3565 svd->softlockcnt += pages; 3566 } 3567 mutex_exit(&freemem_lock); 3568 } 3569 3570 pplist = NULL; 3571 physcontig = 0; 3572 ppa[0] = NULL; 3573 if (!brkcow && szc && 3574 !page_exists_physcontig(vp, off, szc, 3575 segtype == MAP_PRIVATE ? ppa : NULL)) { 3576 SEGVN_VMSTAT_FLTVNPAGES(9); 3577 if (page_alloc_pages(vp, seg, a, &pplist, NULL, 3578 szc, 0) && type != F_SOFTLOCK) { 3579 SEGVN_VMSTAT_FLTVNPAGES(10); 3580 pszc = 0; 3581 ierr = -1; 3582 alloc_failed |= (1 << szc); 3583 break; 3584 } 3585 if (pplist != NULL && 3586 vp->v_mpssdata == SEGVN_PAGEIO) { 3587 int downsize; 3588 SEGVN_VMSTAT_FLTVNPAGES(11); 3589 physcontig = segvn_fill_vp_pages(svd, 3590 vp, off, szc, ppa, &pplist, 3591 &pszc, &downsize); 3592 ASSERT(!physcontig || pplist == NULL); 3593 if (!physcontig && downsize && 3594 type != F_SOFTLOCK) { 3595 ASSERT(pplist == NULL); 3596 SEGVN_VMSTAT_FLTVNPAGES(12); 3597 ierr = -1; 3598 break; 3599 } 3600 ASSERT(!physcontig || 3601 segtype == MAP_PRIVATE || 3602 ppa[0] == NULL); 3603 if (physcontig && ppa[0] == NULL) { 3604 physcontig = 0; 3605 } 3606 } 3607 } else if (!brkcow && szc && ppa[0] != NULL) { 3608 SEGVN_VMSTAT_FLTVNPAGES(13); 3609 ASSERT(segtype == MAP_PRIVATE); 3610 physcontig = 1; 3611 } 3612 3613 if (!physcontig) { 3614 SEGVN_VMSTAT_FLTVNPAGES(14); 3615 ppa[0] = NULL; 3616 ierr = VOP_GETPAGE(vp, (offset_t)off, pgsz, 
3617 &vpprot, ppa, pgsz, seg, a, arw, 3618 svd->cred); 3619 #ifdef DEBUG 3620 if (ierr == 0) { 3621 for (i = 0; i < pages; i++) { 3622 ASSERT(PAGE_LOCKED(ppa[i])); 3623 ASSERT(!PP_ISFREE(ppa[i])); 3624 ASSERT(ppa[i]->p_vnode == vp); 3625 ASSERT(ppa[i]->p_offset == 3626 off + (i << PAGESHIFT)); 3627 } 3628 } 3629 #endif /* DEBUG */ 3630 if (segtype == MAP_PRIVATE) { 3631 SEGVN_VMSTAT_FLTVNPAGES(15); 3632 vpprot &= ~PROT_WRITE; 3633 } 3634 } else { 3635 ASSERT(segtype == MAP_PRIVATE); 3636 SEGVN_VMSTAT_FLTVNPAGES(16); 3637 vpprot = PROT_ALL & ~PROT_WRITE; 3638 ierr = 0; 3639 } 3640 3641 if (ierr != 0) { 3642 SEGVN_VMSTAT_FLTVNPAGES(17); 3643 if (pplist != NULL) { 3644 SEGVN_VMSTAT_FLTVNPAGES(18); 3645 page_free_replacement_page(pplist); 3646 page_create_putback(pages); 3647 } 3648 SEGVN_RESTORE_SOFTLOCK(type, pages); 3649 if (a + pgsz <= eaddr) { 3650 SEGVN_VMSTAT_FLTVNPAGES(19); 3651 err = FC_MAKE_ERR(ierr); 3652 goto out; 3653 } 3654 va.va_mask = AT_SIZE; 3655 if (VOP_GETATTR(vp, &va, 0, svd->cred) != 0) { 3656 SEGVN_VMSTAT_FLTVNPAGES(20); 3657 err = FC_MAKE_ERR(EIO); 3658 goto out; 3659 } 3660 if (btopr(va.va_size) >= btopr(off + pgsz)) { 3661 SEGVN_VMSTAT_FLTVNPAGES(21); 3662 err = FC_MAKE_ERR(ierr); 3663 goto out; 3664 } 3665 if (btopr(va.va_size) < 3666 btopr(off + (eaddr - a))) { 3667 SEGVN_VMSTAT_FLTVNPAGES(22); 3668 err = FC_MAKE_ERR(ierr); 3669 goto out; 3670 } 3671 if (brkcow || type == F_SOFTLOCK) { 3672 /* can't reduce map area */ 3673 SEGVN_VMSTAT_FLTVNPAGES(23); 3674 vop_size_err = 1; 3675 goto out; 3676 } 3677 SEGVN_VMSTAT_FLTVNPAGES(24); 3678 ASSERT(szc != 0); 3679 pszc = 0; 3680 ierr = -1; 3681 break; 3682 } 3683 3684 if (amp != NULL) { 3685 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 3686 anon_array_enter(amp, aindx, &an_cookie); 3687 } 3688 if (amp != NULL && 3689 anon_get_ptr(amp->ahp, aindx) != NULL) { 3690 ulong_t taindx = P2ALIGN(aindx, maxpages); 3691 3692 SEGVN_VMSTAT_FLTVNPAGES(25); 3693 if (anon_pages(amp->ahp, taindx, maxpages) != 3694 
maxpages) { 3695 panic("segvn_fault_vnodepages:" 3696 " empty anon slots\n"); 3697 } 3698 for (i = 0; i < pages; i++) { 3699 page_unlock(ppa[i]); 3700 } 3701 anon_array_exit(&an_cookie); 3702 ANON_LOCK_EXIT(&->a_rwlock); 3703 if (pplist != NULL) { 3704 page_free_replacement_page(pplist); 3705 page_create_putback(pages); 3706 } 3707 SEGVN_RESTORE_SOFTLOCK(type, pages); 3708 if (szc < seg->s_szc) { 3709 SEGVN_VMSTAT_FLTVNPAGES(26); 3710 /* 3711 * For private segments SOFTLOCK 3712 * either always breaks cow (any rw 3713 * type except S_READ_NOCOW) or 3714 * address space is locked as writer 3715 * (S_READ_NOCOW case) and anon slots 3716 * can't show up on second check. 3717 * Therefore if we are here for 3718 * SOFTLOCK case it must be a cow 3719 * break but cow break never reduces 3720 * szc. Thus the assert below. 3721 */ 3722 ASSERT(!brkcow && type != F_SOFTLOCK); 3723 pszc = seg->s_szc; 3724 ierr = -2; 3725 break; 3726 } 3727 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 3728 goto again; 3729 } 3730 #ifdef DEBUG 3731 if (amp != NULL) { 3732 ulong_t taindx = P2ALIGN(aindx, maxpages); 3733 ASSERT(!anon_pages(amp->ahp, taindx, maxpages)); 3734 } 3735 #endif /* DEBUG */ 3736 3737 if (brkcow) { 3738 ASSERT(amp != NULL); 3739 ASSERT(pplist == NULL); 3740 ASSERT(szc == seg->s_szc); 3741 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 3742 ASSERT(IS_P2ALIGNED(aindx, maxpages)); 3743 SEGVN_VMSTAT_FLTVNPAGES(27); 3744 ierr = anon_map_privatepages(amp, aindx, szc, 3745 seg, a, prot, ppa, vpage, segvn_anypgsz, 3746 svd->cred); 3747 if (ierr != 0) { 3748 SEGVN_VMSTAT_FLTVNPAGES(28); 3749 anon_array_exit(&an_cookie); 3750 ANON_LOCK_EXIT(&->a_rwlock); 3751 SEGVN_RESTORE_SOFTLOCK(type, pages); 3752 err = FC_MAKE_ERR(ierr); 3753 goto out; 3754 } 3755 3756 ASSERT(!IS_VMODSORT(ppa[0]->p_vnode)); 3757 /* 3758 * p_szc can't be changed for locked 3759 * swapfs pages. 
3760 */ 3761 hat_memload_array(hat, a, pgsz, ppa, prot, 3762 hat_flag); 3763 3764 if (!(hat_flag & HAT_LOAD_LOCK)) { 3765 SEGVN_VMSTAT_FLTVNPAGES(29); 3766 for (i = 0; i < pages; i++) { 3767 page_unlock(ppa[i]); 3768 } 3769 } 3770 anon_array_exit(&an_cookie); 3771 ANON_LOCK_EXIT(&->a_rwlock); 3772 goto next; 3773 } 3774 3775 pfn = page_pptonum(ppa[0]); 3776 /* 3777 * hat_page_demote() needs an EXCl lock on one of 3778 * constituent page_t's and it decreases root's p_szc 3779 * last. This means if root's p_szc is equal szc and 3780 * all its constituent pages are locked 3781 * hat_page_demote() that could have changed p_szc to 3782 * szc is already done and no new have page_demote() 3783 * can start for this large page. 3784 */ 3785 3786 /* 3787 * we need to make sure same mapping size is used for 3788 * the same address range if there's a possibility the 3789 * adddress is already mapped because hat layer panics 3790 * when translation is loaded for the range already 3791 * mapped with a different page size. We achieve it 3792 * by always using largest page size possible subject 3793 * to the constraints of page size, segment page size 3794 * and page alignment. Since mappings are invalidated 3795 * when those constraints change and make it 3796 * impossible to use previously used mapping size no 3797 * mapping size conflicts should happen. 3798 */ 3799 3800 chkszc: 3801 if ((pszc = ppa[0]->p_szc) == szc && 3802 IS_P2ALIGNED(pfn, pages)) { 3803 3804 SEGVN_VMSTAT_FLTVNPAGES(30); 3805 #ifdef DEBUG 3806 for (i = 0; i < pages; i++) { 3807 ASSERT(PAGE_LOCKED(ppa[i])); 3808 ASSERT(!PP_ISFREE(ppa[i])); 3809 ASSERT(page_pptonum(ppa[i]) == 3810 pfn + i); 3811 ASSERT(ppa[i]->p_szc == szc); 3812 ASSERT(ppa[i]->p_vnode == vp); 3813 ASSERT(ppa[i]->p_offset == 3814 off + (i << PAGESHIFT)); 3815 } 3816 #endif /* DEBUG */ 3817 /* 3818 * All pages are of szc we need and they are 3819 * all locked so they can't change szc. load 3820 * translations. 
3821 * 3822 * if page got promoted since last check 3823 * we don't need pplist. 3824 */ 3825 if (pplist != NULL) { 3826 page_free_replacement_page(pplist); 3827 page_create_putback(pages); 3828 } 3829 if (PP_ISMIGRATE(ppa[0])) { 3830 page_migrate(seg, a, ppa, pages); 3831 } 3832 SEGVN_UPDATE_MODBITS(ppa, pages, rw, 3833 prot, vpprot); 3834 if (!xhat) { 3835 hat_memload_array(hat, a, pgsz, ppa, 3836 prot & vpprot, hat_flag); 3837 } else { 3838 /* 3839 * avoid large xhat mappings to FS 3840 * pages so that hat_page_demote() 3841 * doesn't need to check for xhat 3842 * large mappings. 3843 */ 3844 for (i = 0; i < pages; i++) { 3845 hat_memload(hat, 3846 a + (i << PAGESHIFT), 3847 ppa[i], prot & vpprot, 3848 hat_flag); 3849 } 3850 } 3851 3852 if (!(hat_flag & HAT_LOAD_LOCK)) { 3853 for (i = 0; i < pages; i++) { 3854 page_unlock(ppa[i]); 3855 } 3856 } 3857 if (amp != NULL) { 3858 anon_array_exit(&an_cookie); 3859 ANON_LOCK_EXIT(&->a_rwlock); 3860 } 3861 goto next; 3862 } 3863 3864 /* 3865 * See if upsize is possible. 3866 */ 3867 if (pszc > szc && szc < seg->s_szc && 3868 (segvn_anypgsz_vnode || pszc >= seg->s_szc)) { 3869 pgcnt_t aphase; 3870 uint_t pszc1 = MIN(pszc, seg->s_szc); 3871 ppgsz = page_get_pagesize(pszc1); 3872 ppages = btop(ppgsz); 3873 aphase = btop(P2PHASE((uintptr_t)a, ppgsz)); 3874 3875 ASSERT(type != F_SOFTLOCK); 3876 3877 SEGVN_VMSTAT_FLTVNPAGES(31); 3878 if (aphase != P2PHASE(pfn, ppages)) { 3879 segvn_faultvnmpss_align_err4++; 3880 } else { 3881 SEGVN_VMSTAT_FLTVNPAGES(32); 3882 if (pplist != NULL) { 3883 page_t *pl = pplist; 3884 page_free_replacement_page(pl); 3885 page_create_putback(pages); 3886 } 3887 for (i = 0; i < pages; i++) { 3888 page_unlock(ppa[i]); 3889 } 3890 if (amp != NULL) { 3891 anon_array_exit(&an_cookie); 3892 ANON_LOCK_EXIT(&->a_rwlock); 3893 } 3894 pszc = pszc1; 3895 ierr = -2; 3896 break; 3897 } 3898 } 3899 3900 /* 3901 * check if we should use smallest mapping size. 
3902 */ 3903 upgrdfail = 0; 3904 if (szc == 0 || xhat || 3905 (pszc >= szc && 3906 !IS_P2ALIGNED(pfn, pages)) || 3907 (pszc < szc && 3908 !segvn_full_szcpages(ppa, szc, &upgrdfail, 3909 &pszc))) { 3910 3911 if (upgrdfail && type != F_SOFTLOCK) { 3912 /* 3913 * segvn_full_szcpages failed to lock 3914 * all pages EXCL. Size down. 3915 */ 3916 ASSERT(pszc < szc); 3917 3918 SEGVN_VMSTAT_FLTVNPAGES(33); 3919 3920 if (pplist != NULL) { 3921 page_t *pl = pplist; 3922 page_free_replacement_page(pl); 3923 page_create_putback(pages); 3924 } 3925 3926 for (i = 0; i < pages; i++) { 3927 page_unlock(ppa[i]); 3928 } 3929 if (amp != NULL) { 3930 anon_array_exit(&an_cookie); 3931 ANON_LOCK_EXIT(&->a_rwlock); 3932 } 3933 ierr = -1; 3934 break; 3935 } 3936 if (szc != 0 && !xhat && !upgrdfail) { 3937 segvn_faultvnmpss_align_err5++; 3938 } 3939 SEGVN_VMSTAT_FLTVNPAGES(34); 3940 if (pplist != NULL) { 3941 page_free_replacement_page(pplist); 3942 page_create_putback(pages); 3943 } 3944 SEGVN_UPDATE_MODBITS(ppa, pages, rw, 3945 prot, vpprot); 3946 if (upgrdfail && segvn_anypgsz_vnode) { 3947 /* SOFTLOCK case */ 3948 hat_memload_array(hat, a, pgsz, 3949 ppa, prot & vpprot, hat_flag); 3950 } else { 3951 for (i = 0; i < pages; i++) { 3952 hat_memload(hat, 3953 a + (i << PAGESHIFT), 3954 ppa[i], prot & vpprot, 3955 hat_flag); 3956 } 3957 } 3958 if (!(hat_flag & HAT_LOAD_LOCK)) { 3959 for (i = 0; i < pages; i++) { 3960 page_unlock(ppa[i]); 3961 } 3962 } 3963 if (amp != NULL) { 3964 anon_array_exit(&an_cookie); 3965 ANON_LOCK_EXIT(&->a_rwlock); 3966 } 3967 goto next; 3968 } 3969 3970 if (pszc == szc) { 3971 /* 3972 * segvn_full_szcpages() upgraded pages szc. 3973 */ 3974 ASSERT(pszc == ppa[0]->p_szc); 3975 ASSERT(IS_P2ALIGNED(pfn, pages)); 3976 goto chkszc; 3977 } 3978 3979 if (pszc > szc) { 3980 kmutex_t *szcmtx; 3981 SEGVN_VMSTAT_FLTVNPAGES(35); 3982 /* 3983 * p_szc of ppa[0] can change since we haven't 3984 * locked all constituent pages. Call 3985 * page_lock_szc() to prevent szc changes. 
3986 * This should be a rare case that happens when 3987 * multiple segments use a different page size 3988 * to map the same file offsets. 3989 */ 3990 szcmtx = page_szc_lock(ppa[0]); 3991 pszc = ppa[0]->p_szc; 3992 ASSERT(szcmtx != NULL || pszc == 0); 3993 ASSERT(ppa[0]->p_szc <= pszc); 3994 if (pszc <= szc) { 3995 SEGVN_VMSTAT_FLTVNPAGES(36); 3996 if (szcmtx != NULL) { 3997 mutex_exit(szcmtx); 3998 } 3999 goto chkszc; 4000 } 4001 if (pplist != NULL) { 4002 /* 4003 * page got promoted since last check. 4004 * we don't need preaalocated large 4005 * page. 4006 */ 4007 SEGVN_VMSTAT_FLTVNPAGES(37); 4008 page_free_replacement_page(pplist); 4009 page_create_putback(pages); 4010 } 4011 SEGVN_UPDATE_MODBITS(ppa, pages, rw, 4012 prot, vpprot); 4013 hat_memload_array(hat, a, pgsz, ppa, 4014 prot & vpprot, hat_flag); 4015 mutex_exit(szcmtx); 4016 if (!(hat_flag & HAT_LOAD_LOCK)) { 4017 for (i = 0; i < pages; i++) { 4018 page_unlock(ppa[i]); 4019 } 4020 } 4021 if (amp != NULL) { 4022 anon_array_exit(&an_cookie); 4023 ANON_LOCK_EXIT(&->a_rwlock); 4024 } 4025 goto next; 4026 } 4027 4028 /* 4029 * if page got demoted since last check 4030 * we could have not allocated larger page. 4031 * allocate now. 
4032 */ 4033 if (pplist == NULL && 4034 page_alloc_pages(vp, seg, a, &pplist, NULL, 4035 szc, 0) && type != F_SOFTLOCK) { 4036 SEGVN_VMSTAT_FLTVNPAGES(38); 4037 for (i = 0; i < pages; i++) { 4038 page_unlock(ppa[i]); 4039 } 4040 if (amp != NULL) { 4041 anon_array_exit(&an_cookie); 4042 ANON_LOCK_EXIT(&->a_rwlock); 4043 } 4044 ierr = -1; 4045 alloc_failed |= (1 << szc); 4046 break; 4047 } 4048 4049 SEGVN_VMSTAT_FLTVNPAGES(39); 4050 4051 if (pplist != NULL) { 4052 segvn_relocate_pages(ppa, pplist); 4053 #ifdef DEBUG 4054 } else { 4055 ASSERT(type == F_SOFTLOCK); 4056 SEGVN_VMSTAT_FLTVNPAGES(40); 4057 #endif /* DEBUG */ 4058 } 4059 4060 SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot); 4061 4062 if (pplist == NULL && segvn_anypgsz_vnode == 0) { 4063 ASSERT(type == F_SOFTLOCK); 4064 for (i = 0; i < pages; i++) { 4065 ASSERT(ppa[i]->p_szc < szc); 4066 hat_memload(hat, a + (i << PAGESHIFT), 4067 ppa[i], prot & vpprot, hat_flag); 4068 } 4069 } else { 4070 ASSERT(pplist != NULL || type == F_SOFTLOCK); 4071 hat_memload_array(hat, a, pgsz, ppa, 4072 prot & vpprot, hat_flag); 4073 } 4074 if (!(hat_flag & HAT_LOAD_LOCK)) { 4075 for (i = 0; i < pages; i++) { 4076 ASSERT(PAGE_SHARED(ppa[i])); 4077 page_unlock(ppa[i]); 4078 } 4079 } 4080 if (amp != NULL) { 4081 anon_array_exit(&an_cookie); 4082 ANON_LOCK_EXIT(&->a_rwlock); 4083 } 4084 4085 next: 4086 if (vpage != NULL) { 4087 vpage += pages; 4088 } 4089 adjszc_chk = 1; 4090 } 4091 if (a == lpgeaddr) 4092 break; 4093 ASSERT(a < lpgeaddr); 4094 4095 ASSERT(!brkcow && type != F_SOFTLOCK); 4096 4097 /* 4098 * ierr == -1 means we failed to map with a large page. 4099 * (either due to allocation/relocation failures or 4100 * misalignment with other mappings to this file. 4101 * 4102 * ierr == -2 means some other thread allocated a large page 4103 * after we gave up tp map with a large page. retry with 4104 * larger mapping. 
4105 */ 4106 ASSERT(ierr == -1 || ierr == -2); 4107 ASSERT(ierr == -2 || szc != 0); 4108 ASSERT(ierr == -1 || szc < seg->s_szc); 4109 if (ierr == -2) { 4110 SEGVN_VMSTAT_FLTVNPAGES(41); 4111 ASSERT(pszc > szc && pszc <= seg->s_szc); 4112 szc = pszc; 4113 } else if (segvn_anypgsz_vnode) { 4114 SEGVN_VMSTAT_FLTVNPAGES(42); 4115 szc--; 4116 } else { 4117 SEGVN_VMSTAT_FLTVNPAGES(43); 4118 ASSERT(pszc < szc); 4119 /* 4120 * other process created pszc large page. 4121 * but we still have to drop to 0 szc. 4122 */ 4123 szc = 0; 4124 } 4125 4126 pgsz = page_get_pagesize(szc); 4127 pages = btop(pgsz); 4128 if (ierr == -2) { 4129 /* 4130 * Size up case. Note lpgaddr may only be needed for 4131 * softlock case so we don't adjust it here. 4132 */ 4133 a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz); 4134 ASSERT(a >= lpgaddr); 4135 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4136 off = svd->offset + (uintptr_t)(a - seg->s_base); 4137 aindx = svd->anon_index + seg_page(seg, a); 4138 vpage = (svd->vpage != NULL) ? 4139 &svd->vpage[seg_page(seg, a)] : NULL; 4140 } else { 4141 /* 4142 * Size down case. Note lpgaddr may only be needed for 4143 * softlock case so we don't adjust it here. 4144 */ 4145 ASSERT(IS_P2ALIGNED(a, pgsz)); 4146 ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz)); 4147 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4148 ASSERT(a < lpgeaddr); 4149 if (a < addr) { 4150 SEGVN_VMSTAT_FLTVNPAGES(44); 4151 /* 4152 * The beginning of the large page region can 4153 * be pulled to the right to make a smaller 4154 * region. We haven't yet faulted a single 4155 * page. 4156 */ 4157 a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 4158 ASSERT(a >= lpgaddr); 4159 off = svd->offset + 4160 (uintptr_t)(a - seg->s_base); 4161 aindx = svd->anon_index + seg_page(seg, a); 4162 vpage = (svd->vpage != NULL) ? 
4163 &svd->vpage[seg_page(seg, a)] : NULL; 4164 } 4165 } 4166 } 4167 out: 4168 kmem_free(ppa, ppasize); 4169 if (!err && !vop_size_err) { 4170 SEGVN_VMSTAT_FLTVNPAGES(45); 4171 return (0); 4172 } 4173 if (type == F_SOFTLOCK && a > lpgaddr) { 4174 SEGVN_VMSTAT_FLTVNPAGES(46); 4175 segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER); 4176 } 4177 if (!vop_size_err) { 4178 SEGVN_VMSTAT_FLTVNPAGES(47); 4179 return (err); 4180 } 4181 ASSERT(brkcow || type == F_SOFTLOCK); 4182 /* 4183 * Large page end is mapped beyond the end of file and it's a cow 4184 * fault or softlock so we can't reduce the map area. For now just 4185 * demote the segment. This should really only happen if the end of 4186 * the file changed after the mapping was established since when large 4187 * page segments are created we make sure they don't extend beyond the 4188 * end of the file. 4189 */ 4190 SEGVN_VMSTAT_FLTVNPAGES(48); 4191 4192 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4193 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4194 err = 0; 4195 if (seg->s_szc != 0) { 4196 segvn_fltvnpages_clrszc_cnt++; 4197 ASSERT(svd->softlockcnt == 0); 4198 err = segvn_clrszc(seg); 4199 if (err != 0) { 4200 segvn_fltvnpages_clrszc_err++; 4201 } 4202 } 4203 ASSERT(err || seg->s_szc == 0); 4204 SEGVN_LOCK_DOWNGRADE(seg->s_as, &svd->lock); 4205 /* segvn_fault will do its job as if szc had been zero to begin with */ 4206 return (err == 0 ? IE_RETRY : FC_MAKE_ERR(err)); 4207 } 4208 4209 /* 4210 * This routine will attempt to fault in one large page. 4211 * it will use smaller pages if that fails. 4212 * It should only be called for pure anonymous segments. 
 */
/*
 * [lpgaddr, lpgeaddr) is the large-page-aligned fault range and
 * [addr, eaddr) the range actually faulted on; 'type' is the fault
 * type (F_SOFTLOCK regions are never reduced), 'rw' the access type
 * and 'brkcow' is non-zero when copy-on-write must be broken.
 * Returns 0 on success or an FC_* fault code.
 */
static faultcode_t
segvn_fault_anonpages(struct hat *hat, struct seg *seg, caddr_t lpgaddr,
    caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr,
    caddr_t eaddr, int brkcow)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct anon_map *amp = svd->amp;
	uchar_t segtype = svd->type;
	uint_t szc = seg->s_szc;
	size_t pgsz = page_get_pagesize(szc);
	size_t maxpgsz = pgsz;
	pgcnt_t pages = btop(pgsz);
	size_t ppasize = pages * sizeof (page_t *);
	caddr_t a = lpgaddr;
	ulong_t aindx = svd->anon_index + seg_page(seg, a);
	struct vpage *vpage = (svd->vpage != NULL) ?
	    &svd->vpage[seg_page(seg, a)] : NULL;
	page_t **ppa;
	uint_t ppa_szc;
	faultcode_t err;
	int ierr;
	uint_t protchk, prot, vpprot;
	ulong_t i;
	int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD;
	anon_sync_obj_t cookie;
	int first = 1;		/* first iteration for anon page locking */
	int adjszc_chk;
	int purged = 0;		/* segment cache purged once already */

	ASSERT(szc != 0);
	ASSERT(amp != NULL);
	ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */
	ASSERT(!(svd->flags & MAP_NORESERVE));
	ASSERT(type != F_SOFTUNLOCK);
	ASSERT(IS_P2ALIGNED(a, maxpgsz));

	ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock));

	VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltanpages[0]);
	VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltanpages[1]);

	if (svd->flags & MAP_TEXT) {
		hat_flag |= HAT_LOAD_TEXT;
	}

	if (svd->pageprot) {
		switch (rw) {
		case S_READ:
			protchk = PROT_READ;
			break;
		case S_WRITE:
			protchk = PROT_WRITE;
			break;
		case S_EXEC:
			protchk = PROT_EXEC;
			break;
		case S_OTHER:
		default:
			protchk = PROT_READ | PROT_WRITE | PROT_EXEC;
			break;
		}
		VM_STAT_ADD(segvnvmstats.fltanpages[2]);
	} else {
		prot = svd->prot;
		/* caller has already done segment level protection check. */
	}

	ppa = kmem_alloc(ppasize, KM_SLEEP);
	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
	for (;;) {
		adjszc_chk = 0;
		for (; a < lpgeaddr; a += pgsz, aindx += pages) {
			if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) {
				VM_STAT_ADD(segvnvmstats.fltanpages[3]);
				ASSERT(vpage != NULL);
				prot = VPP_PROT(vpage);
				ASSERT(sameprot(seg, a, maxpgsz));
				if ((prot & protchk) == 0) {
					err = FC_PROT;
					goto error;
				}
			}
			/*
			 * Size the mapping back up to the segment's page
			 * size as soon as the address is aligned for it.
			 */
			if (adjszc_chk && IS_P2ALIGNED(a, maxpgsz) &&
			    pgsz < maxpgsz) {
				ASSERT(a > lpgaddr);
				szc = seg->s_szc;
				pgsz = maxpgsz;
				pages = btop(pgsz);
				ASSERT(IS_P2ALIGNED(aindx, pages));
				lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr,
				    pgsz);
			}
			if (type == F_SOFTLOCK && svd->vp != NULL) {
				mutex_enter(&freemem_lock);
				if (availrmem < tune.t_minarmem + pages) {
					mutex_exit(&freemem_lock);
					err = FC_MAKE_ERR(ENOMEM);
					goto error;
				} else {
					availrmem -= pages;
					segvn_pages_locked += pages;
					svd->softlockcnt += pages;
				}
				mutex_exit(&freemem_lock);
			}
			anon_array_enter(amp, aindx, &cookie);
			ppa_szc = (uint_t)-1;
			ierr = anon_map_getpages(amp, aindx, szc, seg, a,
			    prot, &vpprot, ppa, &ppa_szc, vpage, rw, brkcow,
			    segvn_anypgsz, svd->cred);
			if (ierr != 0) {
				anon_array_exit(&cookie);
				VM_STAT_ADD(segvnvmstats.fltanpages[4]);
				if (type == F_SOFTLOCK && svd->vp != NULL) {
					VM_STAT_ADD(segvnvmstats.fltanpages[5]);
					mutex_enter(&freemem_lock);
					availrmem += pages;
					segvn_pages_locked -= pages;
					svd->softlockcnt -= pages;
					mutex_exit(&freemem_lock);
				}
				if (ierr > 0) {
					VM_STAT_ADD(segvnvmstats.fltanpages[6]);
					err = FC_MAKE_ERR(ierr);
					goto error;
				}
				/* ierr < 0: retry with a different szc */
				break;
			}

			ASSERT(!IS_VMODSORT(ppa[0]->p_vnode));

			ASSERT(segtype == MAP_SHARED ||
			    ppa[0]->p_szc <= szc);
			ASSERT(segtype == MAP_PRIVATE ||
			    ppa[0]->p_szc >= szc);

			/*
			 * Handle pages that have been marked for migration
			 */
			if (lgrp_optimizations())
				page_migrate(seg, a, ppa, pages);

			if (type == F_SOFTLOCK && svd->vp == NULL) {
				/*
				 * All pages in ppa array belong to the same
				 * large page. This means it's ok to call
				 * segvn_pp_lock_anonpages just for ppa[0].
				 */
				if (!segvn_pp_lock_anonpages(ppa[0], first)) {
					for (i = 0; i < pages; i++) {
						page_unlock(ppa[i]);
					}
					err = FC_MAKE_ERR(ENOMEM);
					goto error;
				}
				first = 0;
				mutex_enter(&freemem_lock);
				svd->softlockcnt += pages;
				segvn_pages_locked += pages;
				mutex_exit(&freemem_lock);
			}

			if (segtype == MAP_SHARED) {
				vpprot |= PROT_WRITE;
			}

			hat_memload_array(hat, a, pgsz, ppa,
			    prot & vpprot, hat_flag);

			if (hat_flag & HAT_LOAD_LOCK) {
				VM_STAT_ADD(segvnvmstats.fltanpages[7]);
			} else {
				VM_STAT_ADD(segvnvmstats.fltanpages[8]);
				for (i = 0; i < pages; i++)
					page_unlock(ppa[i]);
			}
			if (vpage != NULL)
				vpage += pages;

			anon_array_exit(&cookie);
			adjszc_chk = 1;
		}
		if (a == lpgeaddr)
			break;
		ASSERT(a < lpgeaddr);
		/*
		 * ierr == -1 means we failed to allocate a large page.
		 * so do a size down operation.
		 *
		 * ierr == -2 means some other process that privately shares
		 * pages with this process has allocated a larger page and we
		 * need to retry with larger pages. So do a size up
		 * operation. This relies on the fact that large pages are
		 * never partially shared i.e. if we share any constituent
		 * page of a large page with another process we must share the
		 * entire large page. Note this cannot happen for SOFTLOCK
		 * case, unless current address (a) is at the beginning of the
		 * next page size boundary because the other process couldn't
		 * have relocated locked pages.
		 */
		ASSERT(ierr == -1 || ierr == -2);
		/*
		 * For the very first relocation failure try to purge this
		 * segment's cache so that the relocator can obtain an
		 * exclusive lock on pages we want to relocate.
		 */
		if (!purged && ierr == -1 && ppa_szc != (uint_t)-1 &&
		    svd->softlockcnt != 0) {
			purged = 1;
			segvn_purge(seg);
			continue;
		}

		if (segvn_anypgsz) {
			ASSERT(ierr == -2 || szc != 0);
			ASSERT(ierr == -1 || szc < seg->s_szc);
			szc = (ierr == -1) ? szc - 1 : szc + 1;
		} else {
			/*
			 * For non COW faults and segvn_anypgsz == 0
			 * we need to be careful not to loop forever
			 * if existing page is found with szc other
			 * than 0 or seg->s_szc. This could be due
			 * to page relocations on behalf of DR or
			 * more likely large page creation. For this
			 * case simply re-size to existing page's szc
			 * if returned by anon_map_getpages().
			 */
			if (ppa_szc == (uint_t)-1) {
				szc = (ierr == -1) ? 0 : seg->s_szc;
			} else {
				ASSERT(ppa_szc <= seg->s_szc);
				ASSERT(ierr == -2 || ppa_szc < szc);
				ASSERT(ierr == -1 || ppa_szc > szc);
				szc = ppa_szc;
			}
		}

		pgsz = page_get_pagesize(szc);
		pages = btop(pgsz);
		ASSERT(type != F_SOFTLOCK || ierr == -1 ||
		    (IS_P2ALIGNED(a, pgsz) && IS_P2ALIGNED(lpgeaddr, pgsz)));
		if (type == F_SOFTLOCK) {
			/*
			 * For softlocks we cannot reduce the fault area
			 * (calculated based on the largest page size for this
			 * segment) for size down and a is already next
			 * page size aligned as asserted above for size
			 * ups. Therefore just continue in case of softlock.
			 */
			VM_STAT_ADD(segvnvmstats.fltanpages[9]);
			continue; /* keep lint happy */
		} else if (ierr == -2) {

			/*
			 * Size up case. Note lpgaddr may only be needed for
			 * softlock case so we don't adjust it here.
			 */
			VM_STAT_ADD(segvnvmstats.fltanpages[10]);
			a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz);
			ASSERT(a >= lpgaddr);
			lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz);
			aindx = svd->anon_index + seg_page(seg, a);
			vpage = (svd->vpage != NULL) ?
			    &svd->vpage[seg_page(seg, a)] : NULL;
		} else {
			/*
			 * Size down case. Note lpgaddr may only be needed for
			 * softlock case so we don't adjust it here.
			 */
			VM_STAT_ADD(segvnvmstats.fltanpages[11]);
			ASSERT(IS_P2ALIGNED(a, pgsz));
			ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz));
			lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz);
			ASSERT(a < lpgeaddr);
			if (a < addr) {
				/*
				 * The beginning of the large page region can
				 * be pulled to the right to make a smaller
				 * region. We haven't yet faulted a single
				 * page.
				 */
				VM_STAT_ADD(segvnvmstats.fltanpages[12]);
				a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz);
				ASSERT(a >= lpgaddr);
				aindx = svd->anon_index + seg_page(seg, a);
				vpage = (svd->vpage != NULL) ?
				    &svd->vpage[seg_page(seg, a)] : NULL;
			}
		}
	}
	VM_STAT_ADD(segvnvmstats.fltanpages[13]);
	ANON_LOCK_EXIT(&amp->a_rwlock);
	kmem_free(ppa, ppasize);
	return (0);
error:
	VM_STAT_ADD(segvnvmstats.fltanpages[14]);
	ANON_LOCK_EXIT(&amp->a_rwlock);
	kmem_free(ppa, ppasize);
	if (type == F_SOFTLOCK && a > lpgaddr) {
		VM_STAT_ADD(segvnvmstats.fltanpages[15]);
		segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER);
	}
	return (err);
}

int fltadvice = 1;	/* set to free behind pages for sequential access */

/*
 * This routine is called via a machine specific fault handling routine.
 * It is also called by software routines wishing to lock or unlock
 * a range of addresses.
4526 * 4527 * Here is the basic algorithm: 4528 * If unlocking 4529 * Call segvn_softunlock 4530 * Return 4531 * endif 4532 * Checking and set up work 4533 * If we will need some non-anonymous pages 4534 * Call VOP_GETPAGE over the range of non-anonymous pages 4535 * endif 4536 * Loop over all addresses requested 4537 * Call segvn_faultpage passing in page list 4538 * to load up translations and handle anonymous pages 4539 * endloop 4540 * Load up translation to any additional pages in page list not 4541 * already handled that fit into this segment 4542 */ 4543 static faultcode_t 4544 segvn_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len, 4545 enum fault_type type, enum seg_rw rw) 4546 { 4547 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 4548 page_t **plp, **ppp, *pp; 4549 u_offset_t off; 4550 caddr_t a; 4551 struct vpage *vpage; 4552 uint_t vpprot, prot; 4553 int err; 4554 page_t *pl[PVN_GETPAGE_NUM + 1]; 4555 size_t plsz, pl_alloc_sz; 4556 size_t page; 4557 ulong_t anon_index; 4558 struct anon_map *amp; 4559 int dogetpage = 0; 4560 caddr_t lpgaddr, lpgeaddr; 4561 size_t pgsz; 4562 anon_sync_obj_t cookie; 4563 int brkcow = BREAK_COW_SHARE(rw, type, svd->type); 4564 4565 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 4566 4567 /* 4568 * First handle the easy stuff 4569 */ 4570 if (type == F_SOFTUNLOCK) { 4571 if (rw == S_READ_NOCOW) { 4572 rw = S_READ; 4573 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 4574 } 4575 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 4576 pgsz = (seg->s_szc == 0) ? 
PAGESIZE : 4577 page_get_pagesize(seg->s_szc); 4578 VM_STAT_COND_ADD(pgsz > PAGESIZE, segvnvmstats.fltanpages[16]); 4579 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 4580 segvn_softunlock(seg, lpgaddr, lpgeaddr - lpgaddr, rw); 4581 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4582 return (0); 4583 } 4584 4585 top: 4586 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 4587 4588 /* 4589 * If we have the same protections for the entire segment, 4590 * insure that the access being attempted is legitimate. 4591 */ 4592 4593 if (svd->pageprot == 0) { 4594 uint_t protchk; 4595 4596 switch (rw) { 4597 case S_READ: 4598 case S_READ_NOCOW: 4599 protchk = PROT_READ; 4600 break; 4601 case S_WRITE: 4602 protchk = PROT_WRITE; 4603 break; 4604 case S_EXEC: 4605 protchk = PROT_EXEC; 4606 break; 4607 case S_OTHER: 4608 default: 4609 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 4610 break; 4611 } 4612 4613 if ((svd->prot & protchk) == 0) { 4614 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4615 return (FC_PROT); /* illegal access type */ 4616 } 4617 } 4618 4619 /* 4620 * We can't allow the long term use of softlocks for vmpss segments, 4621 * because in some file truncation cases we should be able to demote 4622 * the segment, which requires that there are no softlocks. The 4623 * only case where it's ok to allow a SOFTLOCK fault against a vmpss 4624 * segment is S_READ_NOCOW, where the caller holds the address space 4625 * locked as writer and calls softunlock before dropping the as lock. 4626 * S_READ_NOCOW is used by /proc to read memory from another user. 4627 * 4628 * Another deadlock between SOFTLOCK and file truncation can happen 4629 * because segvn_fault_vnodepages() calls the FS one pagesize at 4630 * a time. A second VOP_GETPAGE() call by segvn_fault_vnodepages() 4631 * can cause a deadlock because the first set of page_t's remain 4632 * locked SE_SHARED. 
To avoid this, we demote segments on a first 4633 * SOFTLOCK if they have a length greater than the segment's 4634 * page size. 4635 * 4636 * So for now, we only avoid demoting a segment on a SOFTLOCK when 4637 * the access type is S_READ_NOCOW and the fault length is less than 4638 * or equal to the segment's page size. While this is quite restrictive, 4639 * it should be the most common case of SOFTLOCK against a vmpss 4640 * segment. 4641 * 4642 * For S_READ_NOCOW, it's safe not to do a copy on write because the 4643 * caller makes sure no COW will be caused by another thread for a 4644 * softlocked page. 4645 */ 4646 if (type == F_SOFTLOCK && svd->vp != NULL && seg->s_szc != 0) { 4647 int demote = 0; 4648 4649 if (rw != S_READ_NOCOW) { 4650 demote = 1; 4651 } 4652 if (!demote && len > PAGESIZE) { 4653 pgsz = page_get_pagesize(seg->s_szc); 4654 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, 4655 lpgeaddr); 4656 if (lpgeaddr - lpgaddr > pgsz) { 4657 demote = 1; 4658 } 4659 } 4660 4661 ASSERT(demote || AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 4662 4663 if (demote) { 4664 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4665 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4666 if (seg->s_szc != 0) { 4667 segvn_vmpss_clrszc_cnt++; 4668 ASSERT(svd->softlockcnt == 0); 4669 err = segvn_clrszc(seg); 4670 if (err) { 4671 segvn_vmpss_clrszc_err++; 4672 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4673 return (FC_MAKE_ERR(err)); 4674 } 4675 } 4676 ASSERT(seg->s_szc == 0); 4677 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4678 goto top; 4679 } 4680 } 4681 4682 /* 4683 * Check to see if we need to allocate an anon_map structure. 4684 */ 4685 if (svd->amp == NULL && (svd->vp == NULL || brkcow)) { 4686 /* 4687 * Drop the "read" lock on the segment and acquire 4688 * the "write" version since we have to allocate the 4689 * anon_map. 
4690 */ 4691 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4692 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4693 4694 if (svd->amp == NULL) { 4695 svd->amp = anonmap_alloc(seg->s_size, 0); 4696 svd->amp->a_szc = seg->s_szc; 4697 } 4698 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4699 4700 /* 4701 * Start all over again since segment protections 4702 * may have changed after we dropped the "read" lock. 4703 */ 4704 goto top; 4705 } 4706 4707 /* 4708 * S_READ_NOCOW vs S_READ distinction was 4709 * only needed for the code above. After 4710 * that we treat it as S_READ. 4711 */ 4712 if (rw == S_READ_NOCOW) { 4713 ASSERT(type == F_SOFTLOCK); 4714 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 4715 rw = S_READ; 4716 } 4717 4718 amp = svd->amp; 4719 4720 /* 4721 * MADV_SEQUENTIAL work is ignored for large page segments. 4722 */ 4723 if (seg->s_szc != 0) { 4724 pgsz = page_get_pagesize(seg->s_szc); 4725 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 4726 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 4727 if (svd->vp == NULL) { 4728 err = segvn_fault_anonpages(hat, seg, lpgaddr, 4729 lpgeaddr, type, rw, addr, addr + len, brkcow); 4730 } else { 4731 err = segvn_fault_vnodepages(hat, seg, lpgaddr, 4732 lpgeaddr, type, rw, addr, addr + len, brkcow); 4733 if (err == IE_RETRY) { 4734 ASSERT(seg->s_szc == 0); 4735 ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock)); 4736 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4737 goto top; 4738 } 4739 } 4740 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4741 return (err); 4742 } 4743 4744 page = seg_page(seg, addr); 4745 if (amp != NULL) { 4746 anon_index = svd->anon_index + page; 4747 4748 if ((type == F_PROT) && (rw == S_READ) && 4749 svd->type == MAP_PRIVATE && svd->pageprot == 0) { 4750 size_t index = anon_index; 4751 struct anon *ap; 4752 4753 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4754 /* 4755 * The fast path could apply to S_WRITE also, except 4756 * that the protection fault could be caused by lazy 4757 * tlb flush when 
ro->rw. In this case, the pte is 4758 * RW already. But RO in the other cpu's tlb causes 4759 * the fault. Since hat_chgprot won't do anything if 4760 * pte doesn't change, we may end up faulting 4761 * indefinitely until the RO tlb entry gets replaced. 4762 */ 4763 for (a = addr; a < addr + len; a += PAGESIZE, index++) { 4764 anon_array_enter(amp, index, &cookie); 4765 ap = anon_get_ptr(amp->ahp, index); 4766 anon_array_exit(&cookie); 4767 if ((ap == NULL) || (ap->an_refcnt != 1)) { 4768 ANON_LOCK_EXIT(&->a_rwlock); 4769 goto slow; 4770 } 4771 } 4772 hat_chgprot(seg->s_as->a_hat, addr, len, svd->prot); 4773 ANON_LOCK_EXIT(&->a_rwlock); 4774 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4775 return (0); 4776 } 4777 } 4778 slow: 4779 4780 if (svd->vpage == NULL) 4781 vpage = NULL; 4782 else 4783 vpage = &svd->vpage[page]; 4784 4785 off = svd->offset + (uintptr_t)(addr - seg->s_base); 4786 4787 /* 4788 * If MADV_SEQUENTIAL has been set for the particular page we 4789 * are faulting on, free behind all pages in the segment and put 4790 * them on the free list. 
4791 */ 4792 if ((page != 0) && fltadvice) { /* not if first page in segment */ 4793 struct vpage *vpp; 4794 ulong_t fanon_index; 4795 size_t fpage; 4796 u_offset_t pgoff, fpgoff; 4797 struct vnode *fvp; 4798 struct anon *fap = NULL; 4799 4800 if (svd->advice == MADV_SEQUENTIAL || 4801 (svd->pageadvice && 4802 VPP_ADVICE(vpage) == MADV_SEQUENTIAL)) { 4803 pgoff = off - PAGESIZE; 4804 fpage = page - 1; 4805 if (vpage != NULL) 4806 vpp = &svd->vpage[fpage]; 4807 if (amp != NULL) 4808 fanon_index = svd->anon_index + fpage; 4809 4810 while (pgoff > svd->offset) { 4811 if (svd->advice != MADV_SEQUENTIAL && 4812 (!svd->pageadvice || (vpage && 4813 VPP_ADVICE(vpp) != MADV_SEQUENTIAL))) 4814 break; 4815 4816 /* 4817 * If this is an anon page, we must find the 4818 * correct <vp, offset> for it 4819 */ 4820 fap = NULL; 4821 if (amp != NULL) { 4822 ANON_LOCK_ENTER(&->a_rwlock, 4823 RW_READER); 4824 anon_array_enter(amp, fanon_index, 4825 &cookie); 4826 fap = anon_get_ptr(amp->ahp, 4827 fanon_index); 4828 if (fap != NULL) { 4829 swap_xlate(fap, &fvp, &fpgoff); 4830 } else { 4831 fpgoff = pgoff; 4832 fvp = svd->vp; 4833 } 4834 anon_array_exit(&cookie); 4835 ANON_LOCK_EXIT(&->a_rwlock); 4836 } else { 4837 fpgoff = pgoff; 4838 fvp = svd->vp; 4839 } 4840 if (fvp == NULL) 4841 break; /* XXX */ 4842 /* 4843 * Skip pages that are free or have an 4844 * "exclusive" lock. 4845 */ 4846 pp = page_lookup_nowait(fvp, fpgoff, SE_SHARED); 4847 if (pp == NULL) 4848 break; 4849 /* 4850 * We don't need the page_struct_lock to test 4851 * as this is only advisory; even if we 4852 * acquire it someone might race in and lock 4853 * the page after we unlock and before the 4854 * PUTPAGE, then VOP_PUTPAGE will do nothing. 4855 */ 4856 if (pp->p_lckcnt == 0 && pp->p_cowcnt == 0) { 4857 /* 4858 * Hold the vnode before releasing 4859 * the page lock to prevent it from 4860 * being freed and re-used by some 4861 * other thread. 
4862 */ 4863 VN_HOLD(fvp); 4864 page_unlock(pp); 4865 /* 4866 * We should build a page list 4867 * to kluster putpages XXX 4868 */ 4869 (void) VOP_PUTPAGE(fvp, 4870 (offset_t)fpgoff, PAGESIZE, 4871 (B_DONTNEED|B_FREE|B_ASYNC), 4872 svd->cred); 4873 VN_RELE(fvp); 4874 } else { 4875 /* 4876 * XXX - Should the loop terminate if 4877 * the page is `locked'? 4878 */ 4879 page_unlock(pp); 4880 } 4881 --vpp; 4882 --fanon_index; 4883 pgoff -= PAGESIZE; 4884 } 4885 } 4886 } 4887 4888 plp = pl; 4889 *plp = NULL; 4890 pl_alloc_sz = 0; 4891 4892 /* 4893 * See if we need to call VOP_GETPAGE for 4894 * *any* of the range being faulted on. 4895 * We can skip all of this work if there 4896 * was no original vnode. 4897 */ 4898 if (svd->vp != NULL) { 4899 u_offset_t vp_off; 4900 size_t vp_len; 4901 struct anon *ap; 4902 vnode_t *vp; 4903 4904 vp_off = off; 4905 vp_len = len; 4906 4907 if (amp == NULL) 4908 dogetpage = 1; 4909 else { 4910 /* 4911 * Only acquire reader lock to prevent amp->ahp 4912 * from being changed. It's ok to miss pages, 4913 * hence we don't do anon_array_enter 4914 */ 4915 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4916 ap = anon_get_ptr(amp->ahp, anon_index); 4917 4918 if (len <= PAGESIZE) 4919 /* inline non_anon() */ 4920 dogetpage = (ap == NULL); 4921 else 4922 dogetpage = non_anon(amp->ahp, anon_index, 4923 &vp_off, &vp_len); 4924 ANON_LOCK_EXIT(&->a_rwlock); 4925 } 4926 4927 if (dogetpage) { 4928 enum seg_rw arw; 4929 struct as *as = seg->s_as; 4930 4931 if (len > ptob((sizeof (pl) / sizeof (pl[0])) - 1)) { 4932 /* 4933 * Page list won't fit in local array, 4934 * allocate one of the needed size. 
4935 */ 4936 pl_alloc_sz = 4937 (btop(len) + 1) * sizeof (page_t *); 4938 plp = kmem_alloc(pl_alloc_sz, KM_SLEEP); 4939 plp[0] = NULL; 4940 plsz = len; 4941 } else if (rw == S_WRITE && svd->type == MAP_PRIVATE || 4942 rw == S_OTHER || 4943 (((size_t)(addr + PAGESIZE) < 4944 (size_t)(seg->s_base + seg->s_size)) && 4945 hat_probe(as->a_hat, addr + PAGESIZE))) { 4946 /* 4947 * Ask VOP_GETPAGE to return the exact number 4948 * of pages if 4949 * (a) this is a COW fault, or 4950 * (b) this is a software fault, or 4951 * (c) next page is already mapped. 4952 */ 4953 plsz = len; 4954 } else { 4955 /* 4956 * Ask VOP_GETPAGE to return adjacent pages 4957 * within the segment. 4958 */ 4959 plsz = MIN((size_t)PVN_GETPAGE_SZ, (size_t) 4960 ((seg->s_base + seg->s_size) - addr)); 4961 ASSERT((addr + plsz) <= 4962 (seg->s_base + seg->s_size)); 4963 } 4964 4965 /* 4966 * Need to get some non-anonymous pages. 4967 * We need to make only one call to GETPAGE to do 4968 * this to prevent certain deadlocking conditions 4969 * when we are doing locking. In this case 4970 * non_anon() should have picked up the smallest 4971 * range which includes all the non-anonymous 4972 * pages in the requested range. We have to 4973 * be careful regarding which rw flag to pass in 4974 * because on a private mapping, the underlying 4975 * object is never allowed to be written. 
4976 */ 4977 if (rw == S_WRITE && svd->type == MAP_PRIVATE) { 4978 arw = S_READ; 4979 } else { 4980 arw = rw; 4981 } 4982 vp = svd->vp; 4983 TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE, 4984 "segvn_getpage:seg %p addr %p vp %p", 4985 seg, addr, vp); 4986 err = VOP_GETPAGE(vp, (offset_t)vp_off, vp_len, 4987 &vpprot, plp, plsz, seg, addr + (vp_off - off), arw, 4988 svd->cred); 4989 if (err) { 4990 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4991 segvn_pagelist_rele(plp); 4992 if (pl_alloc_sz) 4993 kmem_free(plp, pl_alloc_sz); 4994 return (FC_MAKE_ERR(err)); 4995 } 4996 if (svd->type == MAP_PRIVATE) 4997 vpprot &= ~PROT_WRITE; 4998 } 4999 } 5000 5001 /* 5002 * N.B. at this time the plp array has all the needed non-anon 5003 * pages in addition to (possibly) having some adjacent pages. 5004 */ 5005 5006 /* 5007 * Always acquire the anon_array_lock to prevent 5008 * 2 threads from allocating separate anon slots for 5009 * the same "addr". 5010 * 5011 * If this is a copy-on-write fault and we don't already 5012 * have the anon_array_lock, acquire it to prevent the 5013 * fault routine from handling multiple copy-on-write faults 5014 * on the same "addr" in the same address space. 5015 * 5016 * Only one thread should deal with the fault since after 5017 * it is handled, the other threads can acquire a translation 5018 * to the newly created private page. This prevents two or 5019 * more threads from creating different private pages for the 5020 * same fault. 5021 * 5022 * We grab "serialization" lock here if this is a MAP_PRIVATE segment 5023 * to prevent deadlock between this thread and another thread 5024 * which has soft-locked this page and wants to acquire serial_lock. 5025 * ( bug 4026339 ) 5026 * 5027 * The fix for bug 4026339 becomes unnecessary when using the 5028 * locking scheme with per amp rwlock and a global set of hash 5029 * lock, anon_array_lock. 
If we steal a vnode page when low 5030 * on memory and upgrad the page lock through page_rename, 5031 * then the page is PAGE_HANDLED, nothing needs to be done 5032 * for this page after returning from segvn_faultpage. 5033 * 5034 * But really, the page lock should be downgraded after 5035 * the stolen page is page_rename'd. 5036 */ 5037 5038 if (amp != NULL) 5039 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5040 5041 /* 5042 * Ok, now loop over the address range and handle faults 5043 */ 5044 for (a = addr; a < addr + len; a += PAGESIZE, off += PAGESIZE) { 5045 err = segvn_faultpage(hat, seg, a, off, vpage, plp, vpprot, 5046 type, rw, brkcow, a == addr); 5047 if (err) { 5048 if (amp != NULL) 5049 ANON_LOCK_EXIT(&->a_rwlock); 5050 if (type == F_SOFTLOCK && a > addr) { 5051 segvn_softunlock(seg, addr, (a - addr), 5052 S_OTHER); 5053 } 5054 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5055 segvn_pagelist_rele(plp); 5056 if (pl_alloc_sz) 5057 kmem_free(plp, pl_alloc_sz); 5058 return (err); 5059 } 5060 if (vpage) { 5061 vpage++; 5062 } else if (svd->vpage) { 5063 page = seg_page(seg, addr); 5064 vpage = &svd->vpage[++page]; 5065 } 5066 } 5067 5068 /* Didn't get pages from the underlying fs so we're done */ 5069 if (!dogetpage) 5070 goto done; 5071 5072 /* 5073 * Now handle any other pages in the list returned. 5074 * If the page can be used, load up the translations now. 5075 * Note that the for loop will only be entered if "plp" 5076 * is pointing to a non-NULL page pointer which means that 5077 * VOP_GETPAGE() was called and vpprot has been initialized. 5078 */ 5079 if (svd->pageprot == 0) 5080 prot = svd->prot & vpprot; 5081 5082 5083 /* 5084 * Large Files: diff should be unsigned value because we started 5085 * supporting > 2GB segment sizes from 2.5.1 and when a 5086 * large file of size > 2GB gets mapped to address space 5087 * the diff value can be > 2GB. 
5088 */ 5089 5090 for (ppp = plp; (pp = *ppp) != NULL; ppp++) { 5091 size_t diff; 5092 struct anon *ap; 5093 int anon_index; 5094 anon_sync_obj_t cookie; 5095 int hat_flag = HAT_LOAD_ADV; 5096 5097 if (svd->flags & MAP_TEXT) { 5098 hat_flag |= HAT_LOAD_TEXT; 5099 } 5100 5101 if (pp == PAGE_HANDLED) 5102 continue; 5103 5104 if (pp->p_offset >= svd->offset && 5105 (pp->p_offset < svd->offset + seg->s_size)) { 5106 5107 diff = pp->p_offset - svd->offset; 5108 5109 /* 5110 * Large Files: Following is the assertion 5111 * validating the above cast. 5112 */ 5113 ASSERT(svd->vp == pp->p_vnode); 5114 5115 page = btop(diff); 5116 if (svd->pageprot) 5117 prot = VPP_PROT(&svd->vpage[page]) & vpprot; 5118 5119 /* 5120 * Prevent other threads in the address space from 5121 * creating private pages (i.e., allocating anon slots) 5122 * while we are in the process of loading translations 5123 * to additional pages returned by the underlying 5124 * object. 5125 */ 5126 if (amp != NULL) { 5127 anon_index = svd->anon_index + page; 5128 anon_array_enter(amp, anon_index, &cookie); 5129 ap = anon_get_ptr(amp->ahp, anon_index); 5130 } 5131 if ((amp == NULL) || (ap == NULL)) { 5132 if (IS_VMODSORT(pp->p_vnode) || 5133 enable_mbit_wa) { 5134 if (rw == S_WRITE) 5135 hat_setmod(pp); 5136 else if (rw != S_OTHER && 5137 !hat_ismod(pp)) 5138 prot &= ~PROT_WRITE; 5139 } 5140 /* 5141 * Skip mapping read ahead pages marked 5142 * for migration, so they will get migrated 5143 * properly on fault 5144 */ 5145 if ((prot & PROT_READ) && !PP_ISMIGRATE(pp)) { 5146 hat_memload(hat, seg->s_base + diff, 5147 pp, prot, hat_flag); 5148 } 5149 } 5150 if (amp != NULL) 5151 anon_array_exit(&cookie); 5152 } 5153 page_unlock(pp); 5154 } 5155 done: 5156 if (amp != NULL) 5157 ANON_LOCK_EXIT(&->a_rwlock); 5158 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5159 if (pl_alloc_sz) 5160 kmem_free(plp, pl_alloc_sz); 5161 return (0); 5162 } 5163 5164 /* 5165 * This routine is used to start I/O on pages asynchronously. 
XXX it will 5166 * only create PAGESIZE pages. At fault time they will be relocated into 5167 * larger pages. 5168 */ 5169 static faultcode_t 5170 segvn_faulta(struct seg *seg, caddr_t addr) 5171 { 5172 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5173 int err; 5174 struct anon_map *amp; 5175 vnode_t *vp; 5176 5177 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5178 5179 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 5180 if ((amp = svd->amp) != NULL) { 5181 struct anon *ap; 5182 5183 /* 5184 * Reader lock to prevent amp->ahp from being changed. 5185 * This is advisory, it's ok to miss a page, so 5186 * we don't do anon_array_enter lock. 5187 */ 5188 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5189 if ((ap = anon_get_ptr(amp->ahp, 5190 svd->anon_index + seg_page(seg, addr))) != NULL) { 5191 5192 err = anon_getpage(&ap, NULL, NULL, 5193 0, seg, addr, S_READ, svd->cred); 5194 5195 ANON_LOCK_EXIT(&->a_rwlock); 5196 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5197 if (err) 5198 return (FC_MAKE_ERR(err)); 5199 return (0); 5200 } 5201 ANON_LOCK_EXIT(&->a_rwlock); 5202 } 5203 5204 if (svd->vp == NULL) { 5205 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5206 return (0); /* zfod page - do nothing now */ 5207 } 5208 5209 vp = svd->vp; 5210 TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE, 5211 "segvn_getpage:seg %p addr %p vp %p", seg, addr, vp); 5212 err = VOP_GETPAGE(vp, 5213 (offset_t)(svd->offset + (uintptr_t)(addr - seg->s_base)), 5214 PAGESIZE, NULL, NULL, 0, seg, addr, 5215 S_OTHER, svd->cred); 5216 5217 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5218 if (err) 5219 return (FC_MAKE_ERR(err)); 5220 return (0); 5221 } 5222 5223 static int 5224 segvn_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) 5225 { 5226 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5227 struct vpage *svp, *evp; 5228 struct vnode *vp; 5229 size_t pgsz; 5230 pgcnt_t pgcnt; 5231 anon_sync_obj_t cookie; 5232 5233 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, 
&seg->s_as->a_lock)); 5234 5235 if ((svd->maxprot & prot) != prot) 5236 return (EACCES); /* violated maxprot */ 5237 5238 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 5239 5240 /* return if prot is the same */ 5241 if (!svd->pageprot && svd->prot == prot) { 5242 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5243 return (0); 5244 } 5245 5246 /* 5247 * Since we change protections we first have to flush the cache. 5248 * This makes sure all the pagelock calls have to recheck 5249 * protections. 5250 */ 5251 if (svd->softlockcnt > 0) { 5252 /* 5253 * Since we do have the segvn writers lock nobody can fill 5254 * the cache with entries belonging to this seg during 5255 * the purge. The flush either succeeds or we still have 5256 * pending I/Os. 5257 */ 5258 segvn_purge(seg); 5259 if (svd->softlockcnt > 0) { 5260 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5261 return (EAGAIN); 5262 } 5263 } 5264 5265 if (seg->s_szc != 0) { 5266 int err; 5267 pgsz = page_get_pagesize(seg->s_szc); 5268 pgcnt = pgsz >> PAGESHIFT; 5269 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 5270 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { 5271 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5272 ASSERT(seg->s_base != addr || seg->s_size != len); 5273 /* 5274 * If we are holding the as lock as a reader then 5275 * we need to return IE_RETRY and let the as 5276 * layer drop and re-aquire the lock as a writer. 
5277 */ 5278 if (AS_READ_HELD(seg->s_as, &seg->s_as->a_lock)) 5279 return (IE_RETRY); 5280 VM_STAT_ADD(segvnvmstats.demoterange[1]); 5281 if (svd->type == MAP_PRIVATE || svd->vp != NULL) { 5282 err = segvn_demote_range(seg, addr, len, 5283 SDR_END, 0); 5284 } else { 5285 uint_t szcvec = map_pgszcvec(seg->s_base, 5286 pgsz, (uintptr_t)seg->s_base, 5287 (svd->flags & MAP_TEXT), MAPPGSZC_SHM, 0); 5288 err = segvn_demote_range(seg, addr, len, 5289 SDR_END, szcvec); 5290 } 5291 if (err == 0) 5292 return (IE_RETRY); 5293 if (err == ENOMEM) 5294 return (IE_NOMEM); 5295 return (err); 5296 } 5297 } 5298 5299 5300 /* 5301 * If it's a private mapping and we're making it writable 5302 * and no swap space has been reserved, have to reserve 5303 * it all now. If it's a private mapping to a file (i.e., vp != NULL) 5304 * and we're removing write permission on the entire segment and 5305 * we haven't modified any pages, we can release the swap space. 5306 */ 5307 if (svd->type == MAP_PRIVATE) { 5308 if (prot & PROT_WRITE) { 5309 size_t sz; 5310 if (svd->swresv == 0 && !(svd->flags & MAP_NORESERVE)) { 5311 if (anon_resv(seg->s_size) == 0) { 5312 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5313 return (IE_NOMEM); 5314 } 5315 sz = svd->swresv = seg->s_size; 5316 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 5317 "anon proc:%p %lu %u", 5318 seg, sz, 1); 5319 } 5320 } else { 5321 /* 5322 * Swap space is released only if this segment 5323 * does not map anonymous memory, since read faults 5324 * on such segments still need an anon slot to read 5325 * in the data. 
5326 */ 5327 if (svd->swresv != 0 && svd->vp != NULL && 5328 svd->amp == NULL && addr == seg->s_base && 5329 len == seg->s_size && svd->pageprot == 0) { 5330 anon_unresv(svd->swresv); 5331 svd->swresv = 0; 5332 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 5333 "anon proc:%p %lu %u", 5334 seg, 0, 0); 5335 } 5336 } 5337 } 5338 5339 if (addr == seg->s_base && len == seg->s_size && svd->pageprot == 0) { 5340 if (svd->prot == prot) { 5341 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5342 return (0); /* all done */ 5343 } 5344 svd->prot = (uchar_t)prot; 5345 } else if (svd->type == MAP_PRIVATE) { 5346 struct anon *ap = NULL; 5347 page_t *pp; 5348 u_offset_t offset, off; 5349 struct anon_map *amp; 5350 ulong_t anon_idx = 0; 5351 5352 /* 5353 * A vpage structure exists or else the change does not 5354 * involve the entire segment. Establish a vpage structure 5355 * if none is there. Then, for each page in the range, 5356 * adjust its individual permissions. Note that write- 5357 * enabling a MAP_PRIVATE page can affect the claims for 5358 * locked down memory. Overcommitting memory terminates 5359 * the operation. 5360 */ 5361 segvn_vpage(seg); 5362 if ((amp = svd->amp) != NULL) { 5363 anon_idx = svd->anon_index + seg_page(seg, addr); 5364 ASSERT(seg->s_szc == 0 || 5365 IS_P2ALIGNED(anon_idx, pgcnt)); 5366 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5367 } 5368 5369 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 5370 evp = &svd->vpage[seg_page(seg, addr + len)]; 5371 5372 /* 5373 * See Statement at the beginning of segvn_lockop regarding 5374 * the way cowcnts and lckcnts are handled. 
5375 */ 5376 for (svp = &svd->vpage[seg_page(seg, addr)]; svp < evp; svp++) { 5377 5378 if (seg->s_szc != 0) { 5379 if (amp != NULL) { 5380 anon_array_enter(amp, anon_idx, 5381 &cookie); 5382 } 5383 if (IS_P2ALIGNED(anon_idx, pgcnt) && 5384 !segvn_claim_pages(seg, svp, offset, 5385 anon_idx, prot)) { 5386 if (amp != NULL) { 5387 anon_array_exit(&cookie); 5388 } 5389 break; 5390 } 5391 if (amp != NULL) { 5392 anon_array_exit(&cookie); 5393 } 5394 anon_idx++; 5395 } else { 5396 if (amp != NULL) { 5397 anon_array_enter(amp, anon_idx, 5398 &cookie); 5399 ap = anon_get_ptr(amp->ahp, anon_idx++); 5400 } 5401 5402 if (VPP_ISPPLOCK(svp) && 5403 VPP_PROT(svp) != prot) { 5404 5405 if (amp == NULL || ap == NULL) { 5406 vp = svd->vp; 5407 off = offset; 5408 } else 5409 swap_xlate(ap, &vp, &off); 5410 if (amp != NULL) 5411 anon_array_exit(&cookie); 5412 5413 if ((pp = page_lookup(vp, off, 5414 SE_SHARED)) == NULL) { 5415 panic("segvn_setprot: no page"); 5416 /*NOTREACHED*/ 5417 } 5418 ASSERT(seg->s_szc == 0); 5419 if ((VPP_PROT(svp) ^ prot) & 5420 PROT_WRITE) { 5421 if (prot & PROT_WRITE) { 5422 if (!page_addclaim(pp)) { 5423 page_unlock(pp); 5424 break; 5425 } 5426 } else { 5427 if (!page_subclaim(pp)) { 5428 page_unlock(pp); 5429 break; 5430 } 5431 } 5432 } 5433 page_unlock(pp); 5434 } else if (amp != NULL) 5435 anon_array_exit(&cookie); 5436 } 5437 VPP_SETPROT(svp, prot); 5438 offset += PAGESIZE; 5439 } 5440 if (amp != NULL) 5441 ANON_LOCK_EXIT(&->a_rwlock); 5442 5443 /* 5444 * Did we terminate prematurely? If so, simply unload 5445 * the translations to the things we've updated so far. 
5446 */ 5447 if (svp != evp) { 5448 len = (svp - &svd->vpage[seg_page(seg, addr)]) * 5449 PAGESIZE; 5450 ASSERT(seg->s_szc == 0 || IS_P2ALIGNED(len, pgsz)); 5451 if (len != 0) 5452 hat_unload(seg->s_as->a_hat, addr, 5453 len, HAT_UNLOAD); 5454 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5455 return (IE_NOMEM); 5456 } 5457 } else { 5458 segvn_vpage(seg); 5459 evp = &svd->vpage[seg_page(seg, addr + len)]; 5460 for (svp = &svd->vpage[seg_page(seg, addr)]; svp < evp; svp++) { 5461 VPP_SETPROT(svp, prot); 5462 } 5463 } 5464 5465 if (((prot & PROT_WRITE) != 0 && 5466 (svd->vp != NULL || svd->type == MAP_PRIVATE)) || 5467 (prot & ~PROT_USER) == PROT_NONE) { 5468 /* 5469 * Either private or shared data with write access (in 5470 * which case we need to throw out all former translations 5471 * so that we get the right translations set up on fault 5472 * and we don't allow write access to any copy-on-write pages 5473 * that might be around or to prevent write access to pages 5474 * representing holes in a file), or we don't have permission 5475 * to access the memory at all (in which case we have to 5476 * unload any current translations that might exist). 5477 */ 5478 hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD); 5479 } else { 5480 /* 5481 * A shared mapping or a private mapping in which write 5482 * protection is going to be denied - just change all the 5483 * protections over the range of addresses in question. 5484 * segvn does not support any other attributes other 5485 * than prot so we can use hat_chgattr. 5486 */ 5487 hat_chgattr(seg->s_as->a_hat, addr, len, prot); 5488 } 5489 5490 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5491 5492 return (0); 5493 } 5494 5495 /* 5496 * segvn_setpagesize is called via SEGOP_SETPAGESIZE from as_setpagesize, 5497 * to determine if the seg is capable of mapping the requested szc. 
5498 */ 5499 static int 5500 segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc) 5501 { 5502 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5503 struct segvn_data *nsvd; 5504 struct anon_map *amp = svd->amp; 5505 struct seg *nseg; 5506 caddr_t eaddr = addr + len, a; 5507 size_t pgsz = page_get_pagesize(szc); 5508 pgcnt_t pgcnt = page_get_pagecnt(szc); 5509 int err; 5510 u_offset_t off = svd->offset + (uintptr_t)(addr - seg->s_base); 5511 extern struct vnode kvp; 5512 5513 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 5514 ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size); 5515 5516 if (seg->s_szc == szc || segvn_lpg_disable != 0) { 5517 return (0); 5518 } 5519 5520 /* 5521 * addr should always be pgsz aligned but eaddr may be misaligned if 5522 * it's at the end of the segment. 5523 * 5524 * XXX we should assert this condition since as_setpagesize() logic 5525 * guarantees it. 5526 */ 5527 if (!IS_P2ALIGNED(addr, pgsz) || 5528 (!IS_P2ALIGNED(eaddr, pgsz) && 5529 eaddr != seg->s_base + seg->s_size)) { 5530 5531 segvn_setpgsz_align_err++; 5532 return (EINVAL); 5533 } 5534 5535 if (amp != NULL && svd->type == MAP_SHARED) { 5536 ulong_t an_idx = svd->anon_index + seg_page(seg, addr); 5537 if (!IS_P2ALIGNED(an_idx, pgcnt)) { 5538 5539 segvn_setpgsz_anon_align_err++; 5540 return (EINVAL); 5541 } 5542 } 5543 5544 if ((svd->flags & MAP_NORESERVE) || seg->s_as == &kas || 5545 szc > segvn_maxpgszc) { 5546 return (EINVAL); 5547 } 5548 5549 /* paranoid check */ 5550 if (svd->vp != NULL && 5551 (IS_SWAPFSVP(svd->vp) || VN_ISKAS(svd->vp))) { 5552 return (EINVAL); 5553 } 5554 5555 if (seg->s_szc == 0 && svd->vp != NULL && 5556 map_addr_vacalign_check(addr, off)) { 5557 return (EINVAL); 5558 } 5559 5560 /* 5561 * Check that protections are the same within new page 5562 * size boundaries. 
5563 */ 5564 if (svd->pageprot) { 5565 for (a = addr; a < eaddr; a += pgsz) { 5566 if ((a + pgsz) > eaddr) { 5567 if (!sameprot(seg, a, eaddr - a)) { 5568 return (EINVAL); 5569 } 5570 } else { 5571 if (!sameprot(seg, a, pgsz)) { 5572 return (EINVAL); 5573 } 5574 } 5575 } 5576 } 5577 5578 /* 5579 * Since we are changing page size we first have to flush 5580 * the cache. This makes sure all the pagelock calls have 5581 * to recheck protections. 5582 */ 5583 if (svd->softlockcnt > 0) { 5584 /* 5585 * Since we do have the segvn writers lock nobody can fill 5586 * the cache with entries belonging to this seg during 5587 * the purge. The flush either succeeds or we still have 5588 * pending I/Os. 5589 */ 5590 segvn_purge(seg); 5591 if (svd->softlockcnt > 0) { 5592 return (EAGAIN); 5593 } 5594 } 5595 5596 /* 5597 * Operation for sub range of existing segment. 5598 */ 5599 if (addr != seg->s_base || eaddr != (seg->s_base + seg->s_size)) { 5600 if (szc < seg->s_szc) { 5601 VM_STAT_ADD(segvnvmstats.demoterange[2]); 5602 err = segvn_demote_range(seg, addr, len, SDR_RANGE, 0); 5603 if (err == 0) { 5604 return (IE_RETRY); 5605 } 5606 if (err == ENOMEM) { 5607 return (IE_NOMEM); 5608 } 5609 return (err); 5610 } 5611 if (addr != seg->s_base) { 5612 nseg = segvn_split_seg(seg, addr); 5613 if (eaddr != (nseg->s_base + nseg->s_size)) { 5614 /* eaddr is szc aligned */ 5615 (void) segvn_split_seg(nseg, eaddr); 5616 } 5617 return (IE_RETRY); 5618 } 5619 if (eaddr != (seg->s_base + seg->s_size)) { 5620 /* eaddr is szc aligned */ 5621 (void) segvn_split_seg(seg, eaddr); 5622 } 5623 return (IE_RETRY); 5624 } 5625 5626 /* 5627 * Break any low level sharing and reset seg->s_szc to 0. 5628 */ 5629 if ((err = segvn_clrszc(seg)) != 0) { 5630 if (err == ENOMEM) { 5631 err = IE_NOMEM; 5632 } 5633 return (err); 5634 } 5635 ASSERT(seg->s_szc == 0); 5636 5637 /* 5638 * If the end of the current segment is not pgsz aligned 5639 * then attempt to concatenate with the next segment. 
5640 */ 5641 if (!IS_P2ALIGNED(eaddr, pgsz)) { 5642 nseg = AS_SEGNEXT(seg->s_as, seg); 5643 if (nseg == NULL || nseg == seg || eaddr != nseg->s_base) { 5644 return (ENOMEM); 5645 } 5646 if (nseg->s_ops != &segvn_ops) { 5647 return (EINVAL); 5648 } 5649 nsvd = (struct segvn_data *)nseg->s_data; 5650 if (nsvd->softlockcnt > 0) { 5651 segvn_purge(nseg); 5652 if (nsvd->softlockcnt > 0) { 5653 return (EAGAIN); 5654 } 5655 } 5656 err = segvn_clrszc(nseg); 5657 if (err == ENOMEM) { 5658 err = IE_NOMEM; 5659 } 5660 if (err != 0) { 5661 return (err); 5662 } 5663 err = segvn_concat(seg, nseg, 1); 5664 if (err == -1) { 5665 return (EINVAL); 5666 } 5667 if (err == -2) { 5668 return (IE_NOMEM); 5669 } 5670 return (IE_RETRY); 5671 } 5672 5673 /* 5674 * May need to re-align anon array to 5675 * new szc. 5676 */ 5677 if (amp != NULL) { 5678 if (!IS_P2ALIGNED(svd->anon_index, pgcnt)) { 5679 struct anon_hdr *nahp; 5680 5681 ASSERT(svd->type == MAP_PRIVATE); 5682 5683 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 5684 ASSERT(amp->refcnt == 1); 5685 nahp = anon_create(btop(amp->size), ANON_NOSLEEP); 5686 if (nahp == NULL) { 5687 ANON_LOCK_EXIT(&->a_rwlock); 5688 return (IE_NOMEM); 5689 } 5690 if (anon_copy_ptr(amp->ahp, svd->anon_index, 5691 nahp, 0, btop(seg->s_size), ANON_NOSLEEP)) { 5692 anon_release(nahp, btop(amp->size)); 5693 ANON_LOCK_EXIT(&->a_rwlock); 5694 return (IE_NOMEM); 5695 } 5696 anon_release(amp->ahp, btop(amp->size)); 5697 amp->ahp = nahp; 5698 svd->anon_index = 0; 5699 ANON_LOCK_EXIT(&->a_rwlock); 5700 } 5701 } 5702 if (svd->vp != NULL && szc != 0) { 5703 struct vattr va; 5704 u_offset_t eoffpage = svd->offset; 5705 va.va_mask = AT_SIZE; 5706 eoffpage += seg->s_size; 5707 eoffpage = btopr(eoffpage); 5708 if (VOP_GETATTR(svd->vp, &va, 0, svd->cred) != 0) { 5709 segvn_setpgsz_getattr_err++; 5710 return (EINVAL); 5711 } 5712 if (btopr(va.va_size) < eoffpage) { 5713 segvn_setpgsz_eof_err++; 5714 return (EINVAL); 5715 } 5716 if (amp != NULL) { 5717 /* 5718 * 
anon_fill_cow_holes() may call VOP_GETPAGE(). 5719 * don't take anon map lock here to avoid holding it 5720 * across VOP_GETPAGE() calls that may call back into 5721 * segvn for klsutering checks. We don't really need 5722 * anon map lock here since it's a private segment and 5723 * we hold as level lock as writers. 5724 */ 5725 if ((err = anon_fill_cow_holes(seg, seg->s_base, 5726 amp->ahp, svd->anon_index, svd->vp, svd->offset, 5727 seg->s_size, szc, svd->prot, svd->vpage, 5728 svd->cred)) != 0) { 5729 return (EINVAL); 5730 } 5731 } 5732 segvn_setvnode_mpss(svd->vp); 5733 } 5734 5735 if (amp != NULL) { 5736 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 5737 if (svd->type == MAP_PRIVATE) { 5738 amp->a_szc = szc; 5739 } else if (szc > amp->a_szc) { 5740 amp->a_szc = szc; 5741 } 5742 ANON_LOCK_EXIT(&->a_rwlock); 5743 } 5744 5745 seg->s_szc = szc; 5746 5747 return (0); 5748 } 5749 5750 static int 5751 segvn_clrszc(struct seg *seg) 5752 { 5753 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5754 struct anon_map *amp = svd->amp; 5755 size_t pgsz; 5756 pgcnt_t pages; 5757 int err = 0; 5758 caddr_t a = seg->s_base; 5759 caddr_t ea = a + seg->s_size; 5760 ulong_t an_idx = svd->anon_index; 5761 vnode_t *vp = svd->vp; 5762 struct vpage *vpage = svd->vpage; 5763 page_t *anon_pl[1 + 1], *pp; 5764 struct anon *ap, *oldap; 5765 uint_t prot = svd->prot, vpprot; 5766 int pageflag = 0; 5767 5768 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) || 5769 SEGVN_WRITE_HELD(seg->s_as, &svd->lock)); 5770 5771 if (vp == NULL && amp == NULL) { 5772 seg->s_szc = 0; 5773 return (0); 5774 } 5775 5776 /* 5777 * do HAT_UNLOAD_UNMAP since we are changing the pagesize. 5778 * unload argument is 0 when we are freeing the segment 5779 * and unload was already done. 
5780 */ 5781 hat_unload(seg->s_as->a_hat, seg->s_base, seg->s_size, 5782 HAT_UNLOAD_UNMAP); 5783 5784 if (amp == NULL || svd->type == MAP_SHARED) { 5785 seg->s_szc = 0; 5786 return (0); 5787 } 5788 5789 pgsz = page_get_pagesize(seg->s_szc); 5790 pages = btop(pgsz); 5791 5792 /* 5793 * XXX anon rwlock is not really needed because this is a 5794 * private segment and we are writers. 5795 */ 5796 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 5797 5798 for (; a < ea; a += pgsz, an_idx += pages) { 5799 if ((oldap = anon_get_ptr(amp->ahp, an_idx)) != NULL) { 5800 ASSERT(vpage != NULL || svd->pageprot == 0); 5801 if (vpage != NULL) { 5802 ASSERT(sameprot(seg, a, pgsz)); 5803 prot = VPP_PROT(vpage); 5804 pageflag = VPP_ISPPLOCK(vpage) ? LOCK_PAGE : 0; 5805 } 5806 if (seg->s_szc != 0) { 5807 ASSERT(vp == NULL || anon_pages(amp->ahp, 5808 an_idx, pages) == pages); 5809 if ((err = anon_map_demotepages(amp, an_idx, 5810 seg, a, prot, vpage, svd->cred)) != 0) { 5811 goto out; 5812 } 5813 } else { 5814 if (oldap->an_refcnt == 1) { 5815 continue; 5816 } 5817 if ((err = anon_getpage(&oldap, &vpprot, 5818 anon_pl, PAGESIZE, seg, a, S_READ, 5819 svd->cred))) { 5820 goto out; 5821 } 5822 if ((pp = anon_private(&ap, seg, a, prot, 5823 anon_pl[0], pageflag, svd->cred)) == NULL) { 5824 err = ENOMEM; 5825 goto out; 5826 } 5827 anon_decref(oldap); 5828 (void) anon_set_ptr(amp->ahp, an_idx, ap, 5829 ANON_SLEEP); 5830 page_unlock(pp); 5831 } 5832 } 5833 vpage = (vpage == NULL) ? 
NULL : vpage + pages; 5834 } 5835 5836 amp->a_szc = 0; 5837 seg->s_szc = 0; 5838 out: 5839 ANON_LOCK_EXIT(&->a_rwlock); 5840 return (err); 5841 } 5842 5843 static int 5844 segvn_claim_pages( 5845 struct seg *seg, 5846 struct vpage *svp, 5847 u_offset_t off, 5848 ulong_t anon_idx, 5849 uint_t prot) 5850 { 5851 pgcnt_t pgcnt = page_get_pagecnt(seg->s_szc); 5852 size_t ppasize = (pgcnt + 1) * sizeof (page_t *); 5853 page_t **ppa; 5854 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5855 struct anon_map *amp = svd->amp; 5856 struct vpage *evp = svp + pgcnt; 5857 caddr_t addr = ((uintptr_t)(svp - svd->vpage) << PAGESHIFT) 5858 + seg->s_base; 5859 struct anon *ap; 5860 struct vnode *vp = svd->vp; 5861 page_t *pp; 5862 pgcnt_t pg_idx, i; 5863 int err = 0; 5864 anoff_t aoff; 5865 int anon = (amp != NULL) ? 1 : 0; 5866 5867 ASSERT(svd->type == MAP_PRIVATE); 5868 ASSERT(svd->vpage != NULL); 5869 ASSERT(seg->s_szc != 0); 5870 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 5871 ASSERT(amp == NULL || IS_P2ALIGNED(anon_idx, pgcnt)); 5872 ASSERT(sameprot(seg, addr, pgcnt << PAGESHIFT)); 5873 5874 if (VPP_PROT(svp) == prot) 5875 return (1); 5876 if (!((VPP_PROT(svp) ^ prot) & PROT_WRITE)) 5877 return (1); 5878 5879 ppa = kmem_alloc(ppasize, KM_SLEEP); 5880 if (anon && vp != NULL) { 5881 if (anon_get_ptr(amp->ahp, anon_idx) == NULL) { 5882 anon = 0; 5883 ASSERT(!anon_pages(amp->ahp, anon_idx, pgcnt)); 5884 } 5885 ASSERT(!anon || 5886 anon_pages(amp->ahp, anon_idx, pgcnt) == pgcnt); 5887 } 5888 5889 for (*ppa = NULL, pg_idx = 0; svp < evp; svp++, anon_idx++) { 5890 if (!VPP_ISPPLOCK(svp)) 5891 continue; 5892 if (anon) { 5893 ap = anon_get_ptr(amp->ahp, anon_idx); 5894 if (ap == NULL) { 5895 panic("segvn_claim_pages: no anon slot"); 5896 } 5897 swap_xlate(ap, &vp, &aoff); 5898 off = (u_offset_t)aoff; 5899 } 5900 ASSERT(vp != NULL); 5901 if ((pp = page_lookup(vp, 5902 (u_offset_t)off, SE_SHARED)) == NULL) { 5903 panic("segvn_claim_pages: no page"); 5904 } 5905 ppa[pg_idx++] = pp; 5906 
		off += PAGESIZE;
	}

	/* Nothing was PPLOCK'ed -- no claims to move. */
	if (ppa[0] == NULL) {
		kmem_free(ppa, ppasize);
		return (1);
	}

	ASSERT(pg_idx <= pgcnt);
	ppa[pg_idx] = NULL;

	if (prot & PROT_WRITE)
		err = page_addclaim_pages(ppa);
	else
		err = page_subclaim_pages(ppa);

	for (i = 0; i < pg_idx; i++) {
		ASSERT(ppa[i] != NULL);
		page_unlock(ppa[i]);
	}

	kmem_free(ppa, ppasize);
	return (err);
}

/*
 * Returns right (upper address) segment if split occurred.
 * If the address is equal to the beginning or end of its segment it returns
 * the current segment.
 */
static struct seg *
segvn_split_seg(struct seg *seg, caddr_t addr)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct seg *nseg;
	size_t nsize;
	struct segvn_data *nsvd;

	ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
	ASSERT(addr >= seg->s_base);
	ASSERT(addr <= seg->s_base + seg->s_size);

	if (addr == seg->s_base || addr == seg->s_base + seg->s_size)
		return (seg);

	/* Shrink the original seg and allocate a new seg for the remainder. */
	nsize = seg->s_base + seg->s_size - addr;
	seg->s_size = addr - seg->s_base;
	nseg = seg_alloc(seg->s_as, addr, nsize);
	ASSERT(nseg != NULL);
	nseg->s_ops = seg->s_ops;
	nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP);
	nseg->s_data = (void *)nsvd;
	nseg->s_szc = seg->s_szc;
	*nsvd = *svd;
	rw_init(&nsvd->lock, NULL, RW_DEFAULT, NULL);

	if (nsvd->vp != NULL) {
		VN_HOLD(nsvd->vp);
		nsvd->offset = svd->offset +
		    (uintptr_t)(nseg->s_base - seg->s_base);
		if (nsvd->type == MAP_SHARED)
			lgrp_shm_policy_init(NULL, nsvd->vp);
	} else {
		/*
		 * The offset for an anonymous segment has no significance in
		 * terms of an offset into a file. If we were to use the above
		 * calculation instead, the structures read out of
		 * /proc/<pid>/xmap would be more difficult to decipher since
		 * it would be unclear whether two seemingly contiguous
		 * prxmap_t structures represented different segments or a
		 * single segment that had been split up into multiple prxmap_t
		 * structures (e.g. if some part of the segment had not yet
		 * been faulted in).
		 */
		nsvd->offset = 0;
	}

	ASSERT(svd->softlockcnt == 0);
	crhold(svd->cred);

	/* Split the per-page vpage array between the two segments. */
	if (svd->vpage != NULL) {
		size_t bytes = vpgtob(seg_pages(seg));
		size_t nbytes = vpgtob(seg_pages(nseg));
		struct vpage *ovpage = svd->vpage;

		svd->vpage = kmem_alloc(bytes, KM_SLEEP);
		bcopy(ovpage, svd->vpage, bytes);
		nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP);
		bcopy(ovpage + seg_pages(seg), nsvd->vpage, nbytes);
		kmem_free(ovpage, bytes + nbytes);
	}
	if (svd->amp != NULL && svd->type == MAP_PRIVATE) {
		/*
		 * Private anon map: split the anon array into two new
		 * arrays, one per segment; each segment gets its own
		 * anon_map with anon_index 0.
		 */
		struct anon_map *oamp = svd->amp, *namp;
		struct anon_hdr *nahp;

		ANON_LOCK_ENTER(&oamp->a_rwlock, RW_WRITER);
		ASSERT(oamp->refcnt == 1);
		nahp = anon_create(btop(seg->s_size), ANON_SLEEP);
		(void) anon_copy_ptr(oamp->ahp, svd->anon_index,
		    nahp, 0, btop(seg->s_size), ANON_SLEEP);

		namp = anonmap_alloc(nseg->s_size, 0);
		namp->a_szc = nseg->s_szc;
		(void) anon_copy_ptr(oamp->ahp,
		    svd->anon_index + btop(seg->s_size),
		    namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP);
		anon_release(oamp->ahp, btop(oamp->size));
		oamp->ahp = nahp;
		oamp->size = seg->s_size;
		svd->anon_index = 0;
		nsvd->amp = namp;
		nsvd->anon_index = 0;
		ANON_LOCK_EXIT(&oamp->a_rwlock);
	} else if (svd->amp != NULL) {
		/*
		 * Shared anon map: both segments keep referencing the
		 * same anon_map; the new segment just starts at a
		 * higher anon_index and takes an extra refcnt.
		 */
		pgcnt_t pgcnt = page_get_pagecnt(seg->s_szc);
		ASSERT(svd->amp == nsvd->amp);
		ASSERT(seg->s_szc <= svd->amp->a_szc);
		nsvd->anon_index = svd->anon_index + seg_pages(seg);
		ASSERT(IS_P2ALIGNED(nsvd->anon_index, pgcnt));
		ANON_LOCK_ENTER(&svd->amp->a_rwlock, RW_WRITER);
		svd->amp->refcnt++;
		ANON_LOCK_EXIT(&svd->amp->a_rwlock);
	}

	/*
	 * Split amount of swap reserve
	 */
	if (svd->swresv) {
		/*
		 * For MAP_NORESERVE, only allocate swap reserve for pages
		 * being used. Other segments get enough to cover whole
		 * segment.
		 */
		if (svd->flags & MAP_NORESERVE) {
			size_t	oswresv;

			ASSERT(svd->amp);
			oswresv = svd->swresv;
			svd->swresv = ptob(anon_pages(svd->amp->ahp,
			    svd->anon_index, btop(seg->s_size)));
			nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp,
			    nsvd->anon_index, btop(nseg->s_size)));
			ASSERT(oswresv >= (svd->swresv + nsvd->swresv));
		} else {
			ASSERT(svd->swresv == seg->s_size + nseg->s_size);
			svd->swresv = seg->s_size;
			nsvd->swresv = nseg->s_size;
		}
	}

	return (nseg);
}

/*
 * called on memory operations (unmap, setprot, setpagesize) for a subset
 * of a large page segment to either demote the memory range (SDR_RANGE)
 * or the ends (SDR_END) by addr/len.
 *
 * returns 0 on success. returns errno, including ENOMEM, on failure.
 */
static int
segvn_demote_range(
	struct seg *seg,
	caddr_t addr,
	size_t len,
	int flag,
	uint_t szcvec)
{
	caddr_t eaddr = addr + len;
	caddr_t lpgaddr, lpgeaddr;
	struct seg *nseg;
	struct seg *badseg1 = NULL;	/* seg covering the unaligned start */
	struct seg *badseg2 = NULL;	/* seg covering the unaligned end */
	size_t pgsz;
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	int err;
	uint_t szc = seg->s_szc;
	uint_t tszcvec;

	ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
	ASSERT(szc != 0);
	pgsz = page_get_pagesize(szc);
	ASSERT(seg->s_base != addr || seg->s_size != len);
	ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size);
	ASSERT(svd->softlockcnt == 0);
	ASSERT(szcvec == 0 || (flag == SDR_END && svd->type == MAP_SHARED));

	CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr);
	ASSERT(flag == SDR_RANGE || eaddr < lpgeaddr || addr > lpgaddr);
	if (flag == SDR_RANGE) {
		/* demote entire range */
		badseg1 = nseg = segvn_split_seg(seg, lpgaddr);
		(void) segvn_split_seg(nseg, lpgeaddr);
		ASSERT(badseg1->s_base == lpgaddr);
		ASSERT(badseg1->s_size == lpgeaddr - lpgaddr);
	} else if (addr != lpgaddr) {
		/*
		 * SDR_END: carve out the large page(s) containing the
		 * unaligned start (and, if close enough, the unaligned
		 * end as well, as a single 2-page seg).
		 */
		ASSERT(flag == SDR_END);
		badseg1 = nseg = segvn_split_seg(seg, lpgaddr);
		if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz &&
		    eaddr < lpgaddr + 2 * pgsz) {
			(void) segvn_split_seg(nseg, lpgeaddr);
			ASSERT(badseg1->s_base == lpgaddr);
			ASSERT(badseg1->s_size == 2 * pgsz);
		} else {
			nseg = segvn_split_seg(nseg, lpgaddr + pgsz);
			ASSERT(badseg1->s_base == lpgaddr);
			ASSERT(badseg1->s_size == pgsz);
			if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz) {
				ASSERT(lpgeaddr - lpgaddr > 2 * pgsz);
				nseg = segvn_split_seg(nseg, lpgeaddr - pgsz);
				badseg2 = nseg;
				(void) segvn_split_seg(nseg, lpgeaddr);
				ASSERT(badseg2->s_base == lpgeaddr - pgsz);
				ASSERT(badseg2->s_size == pgsz);
			}
		}
	} else {
		/* SDR_END with only the end unaligned. */
		ASSERT(flag == SDR_END);
		ASSERT(eaddr < lpgeaddr);
		badseg1 = nseg = segvn_split_seg(seg, lpgeaddr - pgsz);
		(void) segvn_split_seg(nseg, lpgeaddr);
		ASSERT(badseg1->s_base == lpgeaddr - pgsz);
		ASSERT(badseg1->s_size == pgsz);
	}

	ASSERT(badseg1 != NULL);
	ASSERT(badseg1->s_szc == szc);
	ASSERT(flag == SDR_RANGE || badseg1->s_size == pgsz ||
	    badseg1->s_size == 2 * pgsz);
	ASSERT(sameprot(badseg1, badseg1->s_base, pgsz));
	ASSERT(badseg1->s_size == pgsz ||
	    sameprot(badseg1, badseg1->s_base + pgsz, pgsz));
	if (err = segvn_clrszc(badseg1)) {
		return (err);
	}
	ASSERT(badseg1->s_szc == 0);

	/*
	 * If szcvec allows intermediate page sizes, re-promote badseg1 to
	 * the largest allowed smaller size and recurse for any remaining
	 * unaligned portion.
	 */
	if (szc > 1 && (tszcvec = P2PHASE(szcvec, 1 << szc)) > 1) {
		uint_t tszc = highbit(tszcvec) - 1;
		caddr_t ta = MAX(addr, badseg1->s_base);
		caddr_t te;
		size_t tpgsz = page_get_pagesize(tszc);

		ASSERT(svd->type == MAP_SHARED);
		ASSERT(flag == SDR_END);
		ASSERT(tszc < szc && tszc > 0);

		if (eaddr > badseg1->s_base + badseg1->s_size) {
			te = badseg1->s_base + badseg1->s_size;
		} else {
			te = eaddr;
		}

		ASSERT(ta <= te);
		badseg1->s_szc = tszc;
		if (!IS_P2ALIGNED(ta, tpgsz) || !IS_P2ALIGNED(te, tpgsz)) {
			if (badseg2 != NULL) {
				err = segvn_demote_range(badseg1, ta, te - ta,
				    SDR_END, tszcvec);
				if (err != 0) {
					return (err);
				}
			} else {
				return (segvn_demote_range(badseg1, ta,
				    te - ta, SDR_END, tszcvec));
			}
		}
	}

	if (badseg2 == NULL)
		return (0);
	ASSERT(badseg2->s_szc == szc);
	ASSERT(badseg2->s_size == pgsz);
	ASSERT(sameprot(badseg2, badseg2->s_base, badseg2->s_size));
	if (err = segvn_clrszc(badseg2)) {
		return (err);
	}
	ASSERT(badseg2->s_szc == 0);

	/* Same intermediate-size re-promotion for the end segment. */
	if (szc > 1 && (tszcvec = P2PHASE(szcvec, 1 << szc)) > 1) {
		uint_t tszc = highbit(tszcvec) - 1;
		size_t tpgsz = page_get_pagesize(tszc);

		ASSERT(svd->type == MAP_SHARED);
		ASSERT(flag == SDR_END);
		ASSERT(tszc < szc && tszc > 0);
		ASSERT(badseg2->s_base > addr);
		ASSERT(eaddr > badseg2->s_base);
		ASSERT(eaddr < badseg2->s_base + badseg2->s_size);

		badseg2->s_szc = tszc;
		if (!IS_P2ALIGNED(eaddr, tpgsz)) {
			return (segvn_demote_range(badseg2, badseg2->s_base,
			    eaddr - badseg2->s_base, SDR_END, tszcvec));
		}
	}

	return (0);
}

/*
 * Check that the requested access (prot) is permitted for the range
 * [addr, addr + len) of this segment.  Returns 0 or EACCES.
 */
static int
segvn_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct vpage *vp, *evp;

	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));

	SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
	/*
	 * If segment protection can be used, simply check against them.
	 */
	if (svd->pageprot == 0) {
		int err;

		err = ((svd->prot & prot) != prot) ? EACCES : 0;
		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
		return (err);
	}

	/*
	 * Have to check down to the vpage level.
6228 */ 6229 evp = &svd->vpage[seg_page(seg, addr + len)]; 6230 for (vp = &svd->vpage[seg_page(seg, addr)]; vp < evp; vp++) { 6231 if ((VPP_PROT(vp) & prot) != prot) { 6232 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6233 return (EACCES); 6234 } 6235 } 6236 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6237 return (0); 6238 } 6239 6240 static int 6241 segvn_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv) 6242 { 6243 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6244 size_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1; 6245 6246 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6247 6248 if (pgno != 0) { 6249 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6250 if (svd->pageprot == 0) { 6251 do 6252 protv[--pgno] = svd->prot; 6253 while (pgno != 0); 6254 } else { 6255 size_t pgoff = seg_page(seg, addr); 6256 6257 do { 6258 pgno--; 6259 protv[pgno] = VPP_PROT(&svd->vpage[pgno+pgoff]); 6260 } while (pgno != 0); 6261 } 6262 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6263 } 6264 return (0); 6265 } 6266 6267 static u_offset_t 6268 segvn_getoffset(struct seg *seg, caddr_t addr) 6269 { 6270 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6271 6272 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6273 6274 return (svd->offset + (uintptr_t)(addr - seg->s_base)); 6275 } 6276 6277 /*ARGSUSED*/ 6278 static int 6279 segvn_gettype(struct seg *seg, caddr_t addr) 6280 { 6281 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6282 6283 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6284 6285 return (svd->type | (svd->flags & (MAP_NORESERVE | MAP_TEXT | 6286 MAP_INITDATA))); 6287 } 6288 6289 /*ARGSUSED*/ 6290 static int 6291 segvn_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp) 6292 { 6293 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6294 6295 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6296 6297 *vpp = svd->vp; 6298 return (0); 6299 
} 6300 6301 /* 6302 * Check to see if it makes sense to do kluster/read ahead to 6303 * addr + delta relative to the mapping at addr. We assume here 6304 * that delta is a signed PAGESIZE'd multiple (which can be negative). 6305 * 6306 * For segvn, we currently "approve" of the action if we are 6307 * still in the segment and it maps from the same vp/off, 6308 * or if the advice stored in segvn_data or vpages allows it. 6309 * Currently, klustering is not allowed only if MADV_RANDOM is set. 6310 */ 6311 static int 6312 segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta) 6313 { 6314 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6315 struct anon *oap, *ap; 6316 ssize_t pd; 6317 size_t page; 6318 struct vnode *vp1, *vp2; 6319 u_offset_t off1, off2; 6320 struct anon_map *amp; 6321 6322 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6323 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) || 6324 SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 6325 6326 if (addr + delta < seg->s_base || 6327 addr + delta >= (seg->s_base + seg->s_size)) 6328 return (-1); /* exceeded segment bounds */ 6329 6330 pd = delta / (ssize_t)PAGESIZE; /* divide to preserve sign bit */ 6331 page = seg_page(seg, addr); 6332 6333 /* 6334 * Check to see if either of the pages addr or addr + delta 6335 * have advice set that prevents klustering (if MADV_RANDOM advice 6336 * is set for entire segment, or MADV_SEQUENTIAL is set and delta 6337 * is negative). 
6338 */ 6339 if (svd->advice == MADV_RANDOM || 6340 svd->advice == MADV_SEQUENTIAL && delta < 0) 6341 return (-1); 6342 else if (svd->pageadvice && svd->vpage) { 6343 struct vpage *bvpp, *evpp; 6344 6345 bvpp = &svd->vpage[page]; 6346 evpp = &svd->vpage[page + pd]; 6347 if (VPP_ADVICE(bvpp) == MADV_RANDOM || 6348 VPP_ADVICE(evpp) == MADV_SEQUENTIAL && delta < 0) 6349 return (-1); 6350 if (VPP_ADVICE(bvpp) != VPP_ADVICE(evpp) && 6351 VPP_ADVICE(evpp) == MADV_RANDOM) 6352 return (-1); 6353 } 6354 6355 if (svd->type == MAP_SHARED) 6356 return (0); /* shared mapping - all ok */ 6357 6358 if ((amp = svd->amp) == NULL) 6359 return (0); /* off original vnode */ 6360 6361 page += svd->anon_index; 6362 6363 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 6364 6365 oap = anon_get_ptr(amp->ahp, page); 6366 ap = anon_get_ptr(amp->ahp, page + pd); 6367 6368 ANON_LOCK_EXIT(&->a_rwlock); 6369 6370 if ((oap == NULL && ap != NULL) || (oap != NULL && ap == NULL)) { 6371 return (-1); /* one with and one without an anon */ 6372 } 6373 6374 if (oap == NULL) { /* implies that ap == NULL */ 6375 return (0); /* off original vnode */ 6376 } 6377 6378 /* 6379 * Now we know we have two anon pointers - check to 6380 * see if they happen to be properly allocated. 6381 */ 6382 6383 /* 6384 * XXX We cheat here and don't lock the anon slots. We can't because 6385 * we may have been called from the anon layer which might already 6386 * have locked them. We are holding a refcnt on the slots so they 6387 * can't disappear. The worst that will happen is we'll get the wrong 6388 * names (vp, off) for the slots and make a poor klustering decision. 6389 */ 6390 swap_xlate(ap, &vp1, &off1); 6391 swap_xlate(oap, &vp2, &off2); 6392 6393 6394 if (!VOP_CMP(vp1, vp2) || off1 - off2 != delta) 6395 return (-1); 6396 return (0); 6397 } 6398 6399 /* 6400 * Swap the pages of seg out to secondary storage, returning the 6401 * number of bytes of storage freed. 
6402 * 6403 * The basic idea is first to unload all translations and then to call 6404 * VOP_PUTPAGE() for all newly-unmapped pages, to push them out to the 6405 * swap device. Pages to which other segments have mappings will remain 6406 * mapped and won't be swapped. Our caller (as_swapout) has already 6407 * performed the unloading step. 6408 * 6409 * The value returned is intended to correlate well with the process's 6410 * memory requirements. However, there are some caveats: 6411 * 1) When given a shared segment as argument, this routine will 6412 * only succeed in swapping out pages for the last sharer of the 6413 * segment. (Previous callers will only have decremented mapping 6414 * reference counts.) 6415 * 2) We assume that the hat layer maintains a large enough translation 6416 * cache to capture process reference patterns. 6417 */ 6418 static size_t 6419 segvn_swapout(struct seg *seg) 6420 { 6421 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6422 struct anon_map *amp; 6423 pgcnt_t pgcnt = 0; 6424 pgcnt_t npages; 6425 pgcnt_t page; 6426 ulong_t anon_index; 6427 6428 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6429 6430 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6431 /* 6432 * Find pages unmapped by our caller and force them 6433 * out to the virtual swap device. 6434 */ 6435 if ((amp = svd->amp) != NULL) 6436 anon_index = svd->anon_index; 6437 npages = seg->s_size >> PAGESHIFT; 6438 for (page = 0; page < npages; page++) { 6439 page_t *pp; 6440 struct anon *ap; 6441 struct vnode *vp; 6442 u_offset_t off; 6443 anon_sync_obj_t cookie; 6444 6445 /* 6446 * Obtain <vp, off> pair for the page, then look it up. 6447 * 6448 * Note that this code is willing to consider regular 6449 * pages as well as anon pages. Is this appropriate here? 
6450 */ 6451 ap = NULL; 6452 if (amp != NULL) { 6453 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 6454 if (anon_array_try_enter(amp, anon_index + page, 6455 &cookie)) { 6456 ANON_LOCK_EXIT(&->a_rwlock); 6457 continue; 6458 } 6459 ap = anon_get_ptr(amp->ahp, anon_index + page); 6460 if (ap != NULL) { 6461 swap_xlate(ap, &vp, &off); 6462 } else { 6463 vp = svd->vp; 6464 off = svd->offset + ptob(page); 6465 } 6466 anon_array_exit(&cookie); 6467 ANON_LOCK_EXIT(&->a_rwlock); 6468 } else { 6469 vp = svd->vp; 6470 off = svd->offset + ptob(page); 6471 } 6472 if (vp == NULL) { /* untouched zfod page */ 6473 ASSERT(ap == NULL); 6474 continue; 6475 } 6476 6477 pp = page_lookup_nowait(vp, off, SE_SHARED); 6478 if (pp == NULL) 6479 continue; 6480 6481 6482 /* 6483 * Examine the page to see whether it can be tossed out, 6484 * keeping track of how many we've found. 6485 */ 6486 if (!page_tryupgrade(pp)) { 6487 /* 6488 * If the page has an i/o lock and no mappings, 6489 * it's very likely that the page is being 6490 * written out as a result of klustering. 6491 * Assume this is so and take credit for it here. 6492 */ 6493 if (!page_io_trylock(pp)) { 6494 if (!hat_page_is_mapped(pp)) 6495 pgcnt++; 6496 } else { 6497 page_io_unlock(pp); 6498 } 6499 page_unlock(pp); 6500 continue; 6501 } 6502 ASSERT(!page_iolock_assert(pp)); 6503 6504 6505 /* 6506 * Skip if page is locked or has mappings. 6507 * We don't need the page_struct_lock to look at lckcnt 6508 * and cowcnt because the page is exclusive locked. 6509 */ 6510 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0 || 6511 hat_page_is_mapped(pp)) { 6512 page_unlock(pp); 6513 continue; 6514 } 6515 6516 /* 6517 * dispose skips large pages so try to demote first. 6518 */ 6519 if (pp->p_szc != 0 && !page_try_demote_pages(pp)) { 6520 page_unlock(pp); 6521 /* 6522 * XXX should skip the remaining page_t's of this 6523 * large page. 
6524 */ 6525 continue; 6526 } 6527 6528 ASSERT(pp->p_szc == 0); 6529 6530 /* 6531 * No longer mapped -- we can toss it out. How 6532 * we do so depends on whether or not it's dirty. 6533 */ 6534 if (hat_ismod(pp) && pp->p_vnode) { 6535 /* 6536 * We must clean the page before it can be 6537 * freed. Setting B_FREE will cause pvn_done 6538 * to free the page when the i/o completes. 6539 * XXX: This also causes it to be accounted 6540 * as a pageout instead of a swap: need 6541 * B_SWAPOUT bit to use instead of B_FREE. 6542 * 6543 * Hold the vnode before releasing the page lock 6544 * to prevent it from being freed and re-used by 6545 * some other thread. 6546 */ 6547 VN_HOLD(vp); 6548 page_unlock(pp); 6549 6550 /* 6551 * Queue all i/o requests for the pageout thread 6552 * to avoid saturating the pageout devices. 6553 */ 6554 if (!queue_io_request(vp, off)) 6555 VN_RELE(vp); 6556 } else { 6557 /* 6558 * The page was clean, free it. 6559 * 6560 * XXX: Can we ever encounter modified pages 6561 * with no associated vnode here? 6562 */ 6563 ASSERT(pp->p_vnode != NULL); 6564 /*LINTED: constant in conditional context*/ 6565 VN_DISPOSE(pp, B_FREE, 0, kcred); 6566 } 6567 6568 /* 6569 * Credit now even if i/o is in progress. 6570 */ 6571 pgcnt++; 6572 } 6573 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6574 6575 /* 6576 * Wakeup pageout to initiate i/o on all queued requests. 6577 */ 6578 cv_signal_pageout(); 6579 return (ptob(pgcnt)); 6580 } 6581 6582 /* 6583 * Synchronize primary storage cache with real object in virtual memory. 6584 * 6585 * XXX - Anonymous pages should not be sync'ed out at all. 
6586 */ 6587 static int 6588 segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) 6589 { 6590 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6591 struct vpage *vpp; 6592 page_t *pp; 6593 u_offset_t offset; 6594 struct vnode *vp; 6595 u_offset_t off; 6596 caddr_t eaddr; 6597 int bflags; 6598 int err = 0; 6599 int segtype; 6600 int pageprot; 6601 int prot; 6602 ulong_t anon_index; 6603 struct anon_map *amp; 6604 struct anon *ap; 6605 anon_sync_obj_t cookie; 6606 6607 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6608 6609 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6610 6611 if (svd->softlockcnt > 0) { 6612 /* 6613 * flush all pages from seg cache 6614 * otherwise we may deadlock in swap_putpage 6615 * for B_INVAL page (4175402). 6616 * 6617 * Even if we grab segvn WRITER's lock or segp_slock 6618 * here, there might be another thread which could've 6619 * successfully performed lookup/insert just before 6620 * we acquired the lock here. So, grabbing either 6621 * lock here is of not much use. Until we devise 6622 * a strategy at upper layers to solve the 6623 * synchronization issues completely, we expect 6624 * applications to handle this appropriately. 6625 */ 6626 segvn_purge(seg); 6627 if (svd->softlockcnt > 0) { 6628 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6629 return (EAGAIN); 6630 } 6631 } 6632 6633 vpp = svd->vpage; 6634 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 6635 bflags = ((flags & MS_ASYNC) ? B_ASYNC : 0) | 6636 ((flags & MS_INVALIDATE) ? B_INVAL : 0); 6637 6638 if (attr) { 6639 pageprot = attr & ~(SHARED|PRIVATE); 6640 segtype = (attr & SHARED) ? MAP_SHARED : MAP_PRIVATE; 6641 6642 /* 6643 * We are done if the segment types don't match 6644 * or if we have segment level protections and 6645 * they don't match. 
6646 */ 6647 if (svd->type != segtype) { 6648 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6649 return (0); 6650 } 6651 if (vpp == NULL) { 6652 if (svd->prot != pageprot) { 6653 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6654 return (0); 6655 } 6656 prot = svd->prot; 6657 } else 6658 vpp = &svd->vpage[seg_page(seg, addr)]; 6659 6660 } else if (svd->vp && svd->amp == NULL && 6661 (flags & MS_INVALIDATE) == 0) { 6662 6663 /* 6664 * No attributes, no anonymous pages and MS_INVALIDATE flag 6665 * is not on, just use one big request. 6666 */ 6667 err = VOP_PUTPAGE(svd->vp, (offset_t)offset, len, 6668 bflags, svd->cred); 6669 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6670 return (err); 6671 } 6672 6673 if ((amp = svd->amp) != NULL) 6674 anon_index = svd->anon_index + seg_page(seg, addr); 6675 6676 for (eaddr = addr + len; addr < eaddr; addr += PAGESIZE) { 6677 ap = NULL; 6678 if (amp != NULL) { 6679 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 6680 anon_array_enter(amp, anon_index, &cookie); 6681 ap = anon_get_ptr(amp->ahp, anon_index++); 6682 if (ap != NULL) { 6683 swap_xlate(ap, &vp, &off); 6684 } else { 6685 vp = svd->vp; 6686 off = offset; 6687 } 6688 anon_array_exit(&cookie); 6689 ANON_LOCK_EXIT(&->a_rwlock); 6690 } else { 6691 vp = svd->vp; 6692 off = offset; 6693 } 6694 offset += PAGESIZE; 6695 6696 if (vp == NULL) /* untouched zfod page */ 6697 continue; 6698 6699 if (attr) { 6700 if (vpp) { 6701 prot = VPP_PROT(vpp); 6702 vpp++; 6703 } 6704 if (prot != pageprot) { 6705 continue; 6706 } 6707 } 6708 6709 /* 6710 * See if any of these pages are locked -- if so, then we 6711 * will have to truncate an invalidate request at the first 6712 * locked one. We don't need the page_struct_lock to test 6713 * as this is only advisory; even if we acquire it someone 6714 * might race in and lock the page after we unlock and before 6715 * we do the PUTPAGE, then PUTPAGE simply does nothing. 
6716 */ 6717 if (flags & MS_INVALIDATE) { 6718 if ((pp = page_lookup(vp, off, SE_SHARED)) != NULL) { 6719 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 6720 page_unlock(pp); 6721 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6722 return (EBUSY); 6723 } 6724 if (ap != NULL && pp->p_szc != 0 && 6725 page_tryupgrade(pp)) { 6726 if (pp->p_lckcnt == 0 && 6727 pp->p_cowcnt == 0) { 6728 /* 6729 * swapfs VN_DISPOSE() won't 6730 * invalidate large pages. 6731 * Attempt to demote. 6732 * XXX can't help it if it 6733 * fails. But for swapfs 6734 * pages it is no big deal. 6735 */ 6736 (void) page_try_demote_pages( 6737 pp); 6738 } 6739 } 6740 page_unlock(pp); 6741 } 6742 } else if (svd->type == MAP_SHARED && amp != NULL) { 6743 /* 6744 * Avoid writting out to disk ISM's large pages 6745 * because segspt_free_pages() relies on NULL an_pvp 6746 * of anon slots of such pages. 6747 */ 6748 6749 ASSERT(svd->vp == NULL); 6750 /* 6751 * swapfs uses page_lookup_nowait if not freeing or 6752 * invalidating and skips a page if 6753 * page_lookup_nowait returns NULL. 6754 */ 6755 pp = page_lookup_nowait(vp, off, SE_SHARED); 6756 if (pp == NULL) { 6757 continue; 6758 } 6759 if (pp->p_szc != 0) { 6760 page_unlock(pp); 6761 continue; 6762 } 6763 6764 /* 6765 * Note ISM pages are created large so (vp, off)'s 6766 * page cannot suddenly become large after we unlock 6767 * pp. 6768 */ 6769 page_unlock(pp); 6770 } 6771 /* 6772 * XXX - Should ultimately try to kluster 6773 * calls to VOP_PUTPAGE() for performance. 6774 */ 6775 VN_HOLD(vp); 6776 err = VOP_PUTPAGE(vp, (offset_t)off, PAGESIZE, 6777 bflags, svd->cred); 6778 VN_RELE(vp); 6779 if (err) 6780 break; 6781 } 6782 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6783 return (err); 6784 } 6785 6786 /* 6787 * Determine if we have data corresponding to pages in the 6788 * primary storage virtual memory cache (i.e., "in core"). 
6789 */ 6790 static size_t 6791 segvn_incore(struct seg *seg, caddr_t addr, size_t len, char *vec) 6792 { 6793 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6794 struct vnode *vp, *avp; 6795 u_offset_t offset, aoffset; 6796 size_t p, ep; 6797 int ret; 6798 struct vpage *vpp; 6799 page_t *pp; 6800 uint_t start; 6801 struct anon_map *amp; /* XXX - for locknest */ 6802 struct anon *ap; 6803 uint_t attr; 6804 anon_sync_obj_t cookie; 6805 6806 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6807 6808 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6809 if (svd->amp == NULL && svd->vp == NULL) { 6810 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6811 bzero(vec, btopr(len)); 6812 return (len); /* no anonymous pages created yet */ 6813 } 6814 6815 p = seg_page(seg, addr); 6816 ep = seg_page(seg, addr + len); 6817 start = svd->vp ? SEG_PAGE_VNODEBACKED : 0; 6818 6819 amp = svd->amp; 6820 for (; p < ep; p++, addr += PAGESIZE) { 6821 vpp = (svd->vpage) ? &svd->vpage[p]: NULL; 6822 ret = start; 6823 ap = NULL; 6824 avp = NULL; 6825 /* Grab the vnode/offset for the anon slot */ 6826 if (amp != NULL) { 6827 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 6828 anon_array_enter(amp, svd->anon_index + p, &cookie); 6829 ap = anon_get_ptr(amp->ahp, svd->anon_index + p); 6830 if (ap != NULL) { 6831 swap_xlate(ap, &avp, &aoffset); 6832 } 6833 anon_array_exit(&cookie); 6834 ANON_LOCK_EXIT(&->a_rwlock); 6835 } 6836 if ((avp != NULL) && page_exists(avp, aoffset)) { 6837 /* A page exists for the anon slot */ 6838 ret |= SEG_PAGE_INCORE; 6839 6840 /* 6841 * If page is mapped and writable 6842 */ 6843 attr = (uint_t)0; 6844 if ((hat_getattr(seg->s_as->a_hat, addr, 6845 &attr) != -1) && (attr & PROT_WRITE)) { 6846 ret |= SEG_PAGE_ANON; 6847 } 6848 /* 6849 * Don't get page_struct lock for lckcnt and cowcnt, 6850 * since this is purely advisory. 
6851 */ 6852 if ((pp = page_lookup_nowait(avp, aoffset, 6853 SE_SHARED)) != NULL) { 6854 if (pp->p_lckcnt) 6855 ret |= SEG_PAGE_SOFTLOCK; 6856 if (pp->p_cowcnt) 6857 ret |= SEG_PAGE_HASCOW; 6858 page_unlock(pp); 6859 } 6860 } 6861 6862 /* Gather vnode statistics */ 6863 vp = svd->vp; 6864 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 6865 6866 if (vp != NULL) { 6867 /* 6868 * Try to obtain a "shared" lock on the page 6869 * without blocking. If this fails, determine 6870 * if the page is in memory. 6871 */ 6872 pp = page_lookup_nowait(vp, offset, SE_SHARED); 6873 if ((pp == NULL) && (page_exists(vp, offset))) { 6874 /* Page is incore, and is named */ 6875 ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE); 6876 } 6877 /* 6878 * Don't get page_struct lock for lckcnt and cowcnt, 6879 * since this is purely advisory. 6880 */ 6881 if (pp != NULL) { 6882 ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE); 6883 if (pp->p_lckcnt) 6884 ret |= SEG_PAGE_SOFTLOCK; 6885 if (pp->p_cowcnt) 6886 ret |= SEG_PAGE_HASCOW; 6887 page_unlock(pp); 6888 } 6889 } 6890 6891 /* Gather virtual page information */ 6892 if (vpp) { 6893 if (VPP_ISPPLOCK(vpp)) 6894 ret |= SEG_PAGE_LOCKED; 6895 vpp++; 6896 } 6897 6898 *vec++ = (char)ret; 6899 } 6900 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6901 return (len); 6902 } 6903 6904 /* 6905 * Statement for p_cowcnts/p_lckcnts. 6906 * 6907 * p_cowcnt is updated while mlock/munlocking MAP_PRIVATE and PROT_WRITE region 6908 * irrespective of the following factors or anything else: 6909 * 6910 * (1) anon slots are populated or not 6911 * (2) cow is broken or not 6912 * (3) refcnt on ap is 1 or greater than 1 6913 * 6914 * If it's not MAP_PRIVATE and PROT_WRITE, p_lckcnt is updated during mlock 6915 * and munlock. 
6916 * 6917 * 6918 * Handling p_cowcnts/p_lckcnts during copy-on-write fault: 6919 * 6920 * if vpage has PROT_WRITE 6921 * transfer cowcnt on the oldpage -> cowcnt on the newpage 6922 * else 6923 * transfer lckcnt on the oldpage -> lckcnt on the newpage 6924 * 6925 * During copy-on-write, decrement p_cowcnt on the oldpage and increment 6926 * p_cowcnt on the newpage *if* the corresponding vpage has PROT_WRITE. 6927 * 6928 * We may also break COW if softlocking on read access in the physio case. 6929 * In this case, vpage may not have PROT_WRITE. So, we need to decrement 6930 * p_lckcnt on the oldpage and increment p_lckcnt on the newpage *if* the 6931 * vpage doesn't have PROT_WRITE. 6932 * 6933 * 6934 * Handling p_cowcnts/p_lckcnts during mprotect on mlocked region: 6935 * 6936 * If a MAP_PRIVATE region loses PROT_WRITE, we decrement p_cowcnt and 6937 * increment p_lckcnt by calling page_subclaim() which takes care of 6938 * availrmem accounting and p_lckcnt overflow. 6939 * 6940 * If a MAP_PRIVATE region gains PROT_WRITE, we decrement p_lckcnt and 6941 * increment p_cowcnt by calling page_addclaim() which takes care of 6942 * availrmem availability and p_cowcnt overflow. 6943 */ 6944 6945 /* 6946 * Lock down (or unlock) pages mapped by this segment. 6947 * 6948 * XXX only creates PAGESIZE pages if anon slots are not initialized. 6949 * At fault time they will be relocated into larger pages. 
6950 */ 6951 static int 6952 segvn_lockop(struct seg *seg, caddr_t addr, size_t len, 6953 int attr, int op, ulong_t *lockmap, size_t pos) 6954 { 6955 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6956 struct vpage *vpp; 6957 struct vpage *evp; 6958 page_t *pp; 6959 u_offset_t offset; 6960 u_offset_t off; 6961 int segtype; 6962 int pageprot; 6963 int claim; 6964 struct vnode *vp; 6965 ulong_t anon_index; 6966 struct anon_map *amp; 6967 struct anon *ap; 6968 struct vattr va; 6969 anon_sync_obj_t cookie; 6970 struct kshmid *sp = NULL; 6971 struct proc *p = curproc; 6972 kproject_t *proj = NULL; 6973 int chargeproc = 1; 6974 size_t locked_bytes = 0; 6975 size_t unlocked_bytes = 0; 6976 int err = 0; 6977 6978 /* 6979 * Hold write lock on address space because may split or concatenate 6980 * segments 6981 */ 6982 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6983 6984 /* 6985 * If this is a shm, use shm's project and zone, else use 6986 * project and zone of calling process 6987 */ 6988 6989 /* Determine if this segment backs a sysV shm */ 6990 if (svd->amp != NULL && svd->amp->a_sp != NULL) { 6991 sp = svd->amp->a_sp; 6992 proj = sp->shm_perm.ipc_proj; 6993 chargeproc = 0; 6994 } 6995 6996 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 6997 if (attr) { 6998 pageprot = attr & ~(SHARED|PRIVATE); 6999 segtype = attr & SHARED ? MAP_SHARED : MAP_PRIVATE; 7000 7001 /* 7002 * We are done if the segment types don't match 7003 * or if we have segment level protections and 7004 * they don't match. 7005 */ 7006 if (svd->type != segtype) { 7007 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7008 return (0); 7009 } 7010 if (svd->pageprot == 0 && svd->prot != pageprot) { 7011 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7012 return (0); 7013 } 7014 } 7015 7016 /* 7017 * If we're locking, then we must create a vpage structure if 7018 * none exists. 
If we're unlocking, then check to see if there 7019 * is a vpage -- if not, then we could not have locked anything. 7020 */ 7021 7022 if ((vpp = svd->vpage) == NULL) { 7023 if (op == MC_LOCK) 7024 segvn_vpage(seg); 7025 else { 7026 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7027 return (0); 7028 } 7029 } 7030 7031 /* 7032 * The anonymous data vector (i.e., previously 7033 * unreferenced mapping to swap space) can be allocated 7034 * by lazily testing for its existence. 7035 */ 7036 if (op == MC_LOCK && svd->amp == NULL && svd->vp == NULL) { 7037 svd->amp = anonmap_alloc(seg->s_size, 0); 7038 svd->amp->a_szc = seg->s_szc; 7039 } 7040 7041 if ((amp = svd->amp) != NULL) { 7042 anon_index = svd->anon_index + seg_page(seg, addr); 7043 } 7044 7045 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 7046 evp = &svd->vpage[seg_page(seg, addr + len)]; 7047 7048 if (sp != NULL) 7049 mutex_enter(&sp->shm_mlock); 7050 7051 /* determine number of unlocked bytes in range for lock operation */ 7052 if (op == MC_LOCK) { 7053 7054 if (sp == NULL) { 7055 for (vpp = &svd->vpage[seg_page(seg, addr)]; vpp < evp; 7056 vpp++) { 7057 if (!VPP_ISPPLOCK(vpp)) 7058 unlocked_bytes += PAGESIZE; 7059 } 7060 } else { 7061 ulong_t i_idx, i_edx; 7062 anon_sync_obj_t i_cookie; 7063 struct anon *i_ap; 7064 struct vnode *i_vp; 7065 u_offset_t i_off; 7066 7067 /* Only count sysV pages once for locked memory */ 7068 i_edx = svd->anon_index + seg_page(seg, addr + len); 7069 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7070 for (i_idx = anon_index; i_idx < i_edx; i_idx++) { 7071 anon_array_enter(amp, i_idx, &i_cookie); 7072 i_ap = anon_get_ptr(amp->ahp, i_idx); 7073 if (i_ap == NULL) { 7074 unlocked_bytes += PAGESIZE; 7075 anon_array_exit(&i_cookie); 7076 continue; 7077 } 7078 swap_xlate(i_ap, &i_vp, &i_off); 7079 anon_array_exit(&i_cookie); 7080 pp = page_lookup(i_vp, i_off, SE_SHARED); 7081 if (pp == NULL) { 7082 unlocked_bytes += PAGESIZE; 7083 continue; 7084 } else if (pp->p_lckcnt == 0) 7085 
unlocked_bytes += PAGESIZE; 7086 page_unlock(pp); 7087 } 7088 ANON_LOCK_EXIT(&->a_rwlock); 7089 } 7090 7091 mutex_enter(&p->p_lock); 7092 err = rctl_incr_locked_mem(p, proj, unlocked_bytes, 7093 chargeproc); 7094 mutex_exit(&p->p_lock); 7095 7096 if (err) { 7097 if (sp != NULL) 7098 mutex_exit(&sp->shm_mlock); 7099 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7100 return (err); 7101 } 7102 } 7103 /* 7104 * Loop over all pages in the range. Process if we're locking and 7105 * page has not already been locked in this mapping; or if we're 7106 * unlocking and the page has been locked. 7107 */ 7108 for (vpp = &svd->vpage[seg_page(seg, addr)]; vpp < evp; 7109 vpp++, pos++, addr += PAGESIZE, offset += PAGESIZE, anon_index++) { 7110 if ((attr == 0 || VPP_PROT(vpp) == pageprot) && 7111 ((op == MC_LOCK && !VPP_ISPPLOCK(vpp)) || 7112 (op == MC_UNLOCK && VPP_ISPPLOCK(vpp)))) { 7113 7114 if (amp != NULL) 7115 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7116 /* 7117 * If this isn't a MAP_NORESERVE segment and 7118 * we're locking, allocate anon slots if they 7119 * don't exist. The page is brought in later on. 7120 */ 7121 if (op == MC_LOCK && svd->vp == NULL && 7122 ((svd->flags & MAP_NORESERVE) == 0) && 7123 amp != NULL && 7124 ((ap = anon_get_ptr(amp->ahp, anon_index)) 7125 == NULL)) { 7126 anon_array_enter(amp, anon_index, &cookie); 7127 7128 if ((ap = anon_get_ptr(amp->ahp, 7129 anon_index)) == NULL) { 7130 pp = anon_zero(seg, addr, &ap, 7131 svd->cred); 7132 if (pp == NULL) { 7133 anon_array_exit(&cookie); 7134 ANON_LOCK_EXIT(&->a_rwlock); 7135 err = ENOMEM; 7136 goto out; 7137 } 7138 ASSERT(anon_get_ptr(amp->ahp, 7139 anon_index) == NULL); 7140 (void) anon_set_ptr(amp->ahp, 7141 anon_index, ap, ANON_SLEEP); 7142 page_unlock(pp); 7143 } 7144 anon_array_exit(&cookie); 7145 } 7146 7147 /* 7148 * Get name for page, accounting for 7149 * existence of private copy. 
7150 */ 7151 ap = NULL; 7152 if (amp != NULL) { 7153 anon_array_enter(amp, anon_index, &cookie); 7154 ap = anon_get_ptr(amp->ahp, anon_index); 7155 if (ap != NULL) { 7156 swap_xlate(ap, &vp, &off); 7157 } else { 7158 if (svd->vp == NULL && 7159 (svd->flags & MAP_NORESERVE)) { 7160 anon_array_exit(&cookie); 7161 ANON_LOCK_EXIT(&->a_rwlock); 7162 continue; 7163 } 7164 vp = svd->vp; 7165 off = offset; 7166 } 7167 anon_array_exit(&cookie); 7168 ANON_LOCK_EXIT(&->a_rwlock); 7169 } else { 7170 vp = svd->vp; 7171 off = offset; 7172 } 7173 7174 /* 7175 * Get page frame. It's ok if the page is 7176 * not available when we're unlocking, as this 7177 * may simply mean that a page we locked got 7178 * truncated out of existence after we locked it. 7179 * 7180 * Invoke VOP_GETPAGE() to obtain the page struct 7181 * since we may need to read it from disk if its 7182 * been paged out. 7183 */ 7184 if (op != MC_LOCK) 7185 pp = page_lookup(vp, off, SE_SHARED); 7186 else { 7187 page_t *pl[1 + 1]; 7188 int error; 7189 7190 ASSERT(vp != NULL); 7191 7192 error = VOP_GETPAGE(vp, (offset_t)off, PAGESIZE, 7193 (uint_t *)NULL, pl, PAGESIZE, seg, addr, 7194 S_OTHER, svd->cred); 7195 7196 /* 7197 * If the error is EDEADLK then we must bounce 7198 * up and drop all vm subsystem locks and then 7199 * retry the operation later 7200 * This behavior is a temporary measure because 7201 * ufs/sds logging is badly designed and will 7202 * deadlock if we don't allow this bounce to 7203 * happen. The real solution is to re-design 7204 * the logging code to work properly. See bug 7205 * 4125102 for details of the problem. 7206 */ 7207 if (error == EDEADLK) { 7208 err = error; 7209 goto out; 7210 } 7211 /* 7212 * Quit if we fail to fault in the page. Treat 7213 * the failure as an error, unless the addr 7214 * is mapped beyond the end of a file. 
7215 */ 7216 if (error && svd->vp) { 7217 va.va_mask = AT_SIZE; 7218 if (VOP_GETATTR(svd->vp, &va, 0, 7219 svd->cred) != 0) { 7220 err = EIO; 7221 goto out; 7222 } 7223 if (btopr(va.va_size) >= 7224 btopr(off + 1)) { 7225 err = EIO; 7226 goto out; 7227 } 7228 goto out; 7229 7230 } else if (error) { 7231 err = EIO; 7232 goto out; 7233 } 7234 pp = pl[0]; 7235 ASSERT(pp != NULL); 7236 } 7237 7238 /* 7239 * See Statement at the beginning of this routine. 7240 * 7241 * claim is always set if MAP_PRIVATE and PROT_WRITE 7242 * irrespective of following factors: 7243 * 7244 * (1) anon slots are populated or not 7245 * (2) cow is broken or not 7246 * (3) refcnt on ap is 1 or greater than 1 7247 * 7248 * See 4140683 for details 7249 */ 7250 claim = ((VPP_PROT(vpp) & PROT_WRITE) && 7251 (svd->type == MAP_PRIVATE)); 7252 7253 /* 7254 * Perform page-level operation appropriate to 7255 * operation. If locking, undo the SOFTLOCK 7256 * performed to bring the page into memory 7257 * after setting the lock. If unlocking, 7258 * and no page was found, account for the claim 7259 * separately. 
7260 */ 7261 if (op == MC_LOCK) { 7262 int ret = 1; /* Assume success */ 7263 7264 ASSERT(!VPP_ISPPLOCK(vpp)); 7265 7266 ret = page_pp_lock(pp, claim, 0); 7267 if (ret == 0) { 7268 /* locking page failed */ 7269 page_unlock(pp); 7270 err = EAGAIN; 7271 goto out; 7272 } 7273 VPP_SETPPLOCK(vpp); 7274 if (sp != NULL) { 7275 if (pp->p_lckcnt == 1) 7276 locked_bytes += PAGESIZE; 7277 } else 7278 locked_bytes += PAGESIZE; 7279 7280 if (lockmap != (ulong_t *)NULL) 7281 BT_SET(lockmap, pos); 7282 7283 page_unlock(pp); 7284 } else { 7285 ASSERT(VPP_ISPPLOCK(vpp)); 7286 if (pp != NULL) { 7287 /* sysV pages should be locked */ 7288 ASSERT(sp == NULL || pp->p_lckcnt > 0); 7289 page_pp_unlock(pp, claim, 0); 7290 if (sp != NULL) { 7291 if (pp->p_lckcnt == 0) 7292 unlocked_bytes 7293 += PAGESIZE; 7294 } else 7295 unlocked_bytes += PAGESIZE; 7296 page_unlock(pp); 7297 } else { 7298 ASSERT(sp == NULL); 7299 unlocked_bytes += PAGESIZE; 7300 } 7301 VPP_CLRPPLOCK(vpp); 7302 } 7303 } 7304 } 7305 out: 7306 if (op == MC_LOCK) { 7307 /* Credit back bytes that did not get locked */ 7308 if ((unlocked_bytes - locked_bytes) > 0) { 7309 if (proj == NULL) 7310 mutex_enter(&p->p_lock); 7311 rctl_decr_locked_mem(p, proj, 7312 (unlocked_bytes - locked_bytes), chargeproc); 7313 if (proj == NULL) 7314 mutex_exit(&p->p_lock); 7315 } 7316 7317 } else { 7318 /* Account bytes that were unlocked */ 7319 if (unlocked_bytes > 0) { 7320 if (proj == NULL) 7321 mutex_enter(&p->p_lock); 7322 rctl_decr_locked_mem(p, proj, unlocked_bytes, 7323 chargeproc); 7324 if (proj == NULL) 7325 mutex_exit(&p->p_lock); 7326 } 7327 } 7328 if (sp != NULL) 7329 mutex_exit(&sp->shm_mlock); 7330 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7331 7332 return (err); 7333 } 7334 7335 /* 7336 * Set advice from user for specified pages 7337 * There are 5 types of advice: 7338 * MADV_NORMAL - Normal (default) behavior (whatever that is) 7339 * MADV_RANDOM - Random page references 7340 * do not allow readahead or 'klustering' 7341 * 
MADV_SEQUENTIAL - Sequential page references 7342 * Pages previous to the one currently being 7343 * accessed (determined by fault) are 'not needed' 7344 * and are freed immediately 7345 * MADV_WILLNEED - Pages are likely to be used (fault ahead in mctl) 7346 * MADV_DONTNEED - Pages are not needed (synced out in mctl) 7347 * MADV_FREE - Contents can be discarded 7348 * MADV_ACCESS_DEFAULT- Default access 7349 * MADV_ACCESS_LWP - Next LWP will access heavily 7350 * MADV_ACCESS_MANY- Many LWPs or processes will access heavily 7351 */ 7352 static int 7353 segvn_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav) 7354 { 7355 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7356 size_t page; 7357 int err = 0; 7358 int already_set; 7359 struct anon_map *amp; 7360 ulong_t anon_index; 7361 struct seg *next; 7362 lgrp_mem_policy_t policy; 7363 struct seg *prev; 7364 struct vnode *vp; 7365 7366 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 7367 7368 /* 7369 * In case of MADV_FREE, we won't be modifying any segment private 7370 * data structures; so, we only need to grab READER's lock 7371 */ 7372 if (behav != MADV_FREE) 7373 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 7374 else 7375 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7376 7377 /* 7378 * Large pages are assumed to be only turned on when accesses to the 7379 * segment's address range have spatial and temporal locality. That 7380 * justifies ignoring MADV_SEQUENTIAL for large page segments. 
7381 * Also, ignore advice affecting lgroup memory allocation 7382 * if don't need to do lgroup optimizations on this system 7383 */ 7384 7385 if ((behav == MADV_SEQUENTIAL && seg->s_szc != 0) || 7386 (!lgrp_optimizations() && (behav == MADV_ACCESS_DEFAULT || 7387 behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY))) { 7388 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7389 return (0); 7390 } 7391 7392 if (behav == MADV_SEQUENTIAL || behav == MADV_ACCESS_DEFAULT || 7393 behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY) { 7394 /* 7395 * Since we are going to unload hat mappings 7396 * we first have to flush the cache. Otherwise 7397 * this might lead to system panic if another 7398 * thread is doing physio on the range whose 7399 * mappings are unloaded by madvise(3C). 7400 */ 7401 if (svd->softlockcnt > 0) { 7402 /* 7403 * Since we do have the segvn writers lock 7404 * nobody can fill the cache with entries 7405 * belonging to this seg during the purge. 7406 * The flush either succeeds or we still 7407 * have pending I/Os. In the later case, 7408 * madvise(3C) fails. 7409 */ 7410 segvn_purge(seg); 7411 if (svd->softlockcnt > 0) { 7412 /* 7413 * Since madvise(3C) is advisory and 7414 * it's not part of UNIX98, madvise(3C) 7415 * failure here doesn't cause any hardship. 7416 * Note that we don't block in "as" layer. 7417 */ 7418 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7419 return (EAGAIN); 7420 } 7421 } 7422 } 7423 7424 amp = svd->amp; 7425 vp = svd->vp; 7426 if (behav == MADV_FREE) { 7427 /* 7428 * MADV_FREE is not supported for segments with 7429 * underlying object; if anonmap is NULL, anon slots 7430 * are not yet populated and there is nothing for 7431 * us to do. As MADV_FREE is advisory, we don't 7432 * return error in either case. 
7433 */ 7434 if (vp || amp == NULL) { 7435 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7436 return (0); 7437 } 7438 7439 page = seg_page(seg, addr); 7440 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7441 anon_disclaim(amp, svd->anon_index + page, len, 0); 7442 ANON_LOCK_EXIT(&->a_rwlock); 7443 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7444 return (0); 7445 } 7446 7447 /* 7448 * If advice is to be applied to entire segment, 7449 * use advice field in seg_data structure 7450 * otherwise use appropriate vpage entry. 7451 */ 7452 if ((addr == seg->s_base) && (len == seg->s_size)) { 7453 switch (behav) { 7454 case MADV_ACCESS_LWP: 7455 case MADV_ACCESS_MANY: 7456 case MADV_ACCESS_DEFAULT: 7457 /* 7458 * Set memory allocation policy for this segment 7459 */ 7460 policy = lgrp_madv_to_policy(behav, len, svd->type); 7461 if (svd->type == MAP_SHARED) 7462 already_set = lgrp_shm_policy_set(policy, amp, 7463 svd->anon_index, vp, svd->offset, len); 7464 else { 7465 /* 7466 * For private memory, need writers lock on 7467 * address space because the segment may be 7468 * split or concatenated when changing policy 7469 */ 7470 if (AS_READ_HELD(seg->s_as, 7471 &seg->s_as->a_lock)) { 7472 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7473 return (IE_RETRY); 7474 } 7475 7476 already_set = lgrp_privm_policy_set(policy, 7477 &svd->policy_info, len); 7478 } 7479 7480 /* 7481 * If policy set already and it shouldn't be reapplied, 7482 * don't do anything. 7483 */ 7484 if (already_set && 7485 !LGRP_MEM_POLICY_REAPPLICABLE(policy)) 7486 break; 7487 7488 /* 7489 * Mark any existing pages in given range for 7490 * migration 7491 */ 7492 page_mark_migrate(seg, addr, len, amp, svd->anon_index, 7493 vp, svd->offset, 1); 7494 7495 /* 7496 * If same policy set already or this is a shared 7497 * memory segment, don't need to try to concatenate 7498 * segment with adjacent ones. 
7499 */ 7500 if (already_set || svd->type == MAP_SHARED) 7501 break; 7502 7503 /* 7504 * Try to concatenate this segment with previous 7505 * one and next one, since we changed policy for 7506 * this one and it may be compatible with adjacent 7507 * ones now. 7508 */ 7509 prev = AS_SEGPREV(seg->s_as, seg); 7510 next = AS_SEGNEXT(seg->s_as, seg); 7511 7512 if (next && next->s_ops == &segvn_ops && 7513 addr + len == next->s_base) 7514 (void) segvn_concat(seg, next, 1); 7515 7516 if (prev && prev->s_ops == &segvn_ops && 7517 addr == prev->s_base + prev->s_size) { 7518 /* 7519 * Drop lock for private data of current 7520 * segment before concatenating (deleting) it 7521 * and return IE_REATTACH to tell as_ctl() that 7522 * current segment has changed 7523 */ 7524 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7525 if (!segvn_concat(prev, seg, 1)) 7526 err = IE_REATTACH; 7527 7528 return (err); 7529 } 7530 break; 7531 7532 case MADV_SEQUENTIAL: 7533 /* 7534 * unloading mapping guarantees 7535 * detection in segvn_fault 7536 */ 7537 ASSERT(seg->s_szc == 0); 7538 hat_unload(seg->s_as->a_hat, addr, len, 7539 HAT_UNLOAD); 7540 /* FALLTHROUGH */ 7541 case MADV_NORMAL: 7542 case MADV_RANDOM: 7543 svd->advice = (uchar_t)behav; 7544 svd->pageadvice = 0; 7545 break; 7546 case MADV_WILLNEED: /* handled in memcntl */ 7547 case MADV_DONTNEED: /* handled in memcntl */ 7548 case MADV_FREE: /* handled above */ 7549 break; 7550 default: 7551 err = EINVAL; 7552 } 7553 } else { 7554 caddr_t eaddr; 7555 struct seg *new_seg; 7556 struct segvn_data *new_svd; 7557 u_offset_t off; 7558 caddr_t oldeaddr; 7559 7560 page = seg_page(seg, addr); 7561 7562 segvn_vpage(seg); 7563 7564 switch (behav) { 7565 struct vpage *bvpp, *evpp; 7566 7567 case MADV_ACCESS_LWP: 7568 case MADV_ACCESS_MANY: 7569 case MADV_ACCESS_DEFAULT: 7570 /* 7571 * Set memory allocation policy for portion of this 7572 * segment 7573 */ 7574 7575 /* 7576 * Align address and length of advice to page 7577 * boundaries for large pages 
7578 */ 7579 if (seg->s_szc != 0) { 7580 size_t pgsz; 7581 7582 pgsz = page_get_pagesize(seg->s_szc); 7583 addr = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 7584 len = P2ROUNDUP(len, pgsz); 7585 } 7586 7587 /* 7588 * Check to see whether policy is set already 7589 */ 7590 policy = lgrp_madv_to_policy(behav, len, svd->type); 7591 7592 anon_index = svd->anon_index + page; 7593 off = svd->offset + (uintptr_t)(addr - seg->s_base); 7594 7595 if (svd->type == MAP_SHARED) 7596 already_set = lgrp_shm_policy_set(policy, amp, 7597 anon_index, vp, off, len); 7598 else 7599 already_set = 7600 (policy == svd->policy_info.mem_policy); 7601 7602 /* 7603 * If policy set already and it shouldn't be reapplied, 7604 * don't do anything. 7605 */ 7606 if (already_set && 7607 !LGRP_MEM_POLICY_REAPPLICABLE(policy)) 7608 break; 7609 7610 /* 7611 * For private memory, need writers lock on 7612 * address space because the segment may be 7613 * split or concatenated when changing policy 7614 */ 7615 if (svd->type == MAP_PRIVATE && 7616 AS_READ_HELD(seg->s_as, &seg->s_as->a_lock)) { 7617 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7618 return (IE_RETRY); 7619 } 7620 7621 /* 7622 * Mark any existing pages in given range for 7623 * migration 7624 */ 7625 page_mark_migrate(seg, addr, len, amp, svd->anon_index, 7626 vp, svd->offset, 1); 7627 7628 /* 7629 * Don't need to try to split or concatenate 7630 * segments, since policy is same or this is a shared 7631 * memory segment 7632 */ 7633 if (already_set || svd->type == MAP_SHARED) 7634 break; 7635 7636 /* 7637 * Split off new segment if advice only applies to a 7638 * portion of existing segment starting in middle 7639 */ 7640 new_seg = NULL; 7641 eaddr = addr + len; 7642 oldeaddr = seg->s_base + seg->s_size; 7643 if (addr > seg->s_base) { 7644 /* 7645 * Must flush I/O page cache 7646 * before splitting segment 7647 */ 7648 if (svd->softlockcnt > 0) 7649 segvn_purge(seg); 7650 7651 /* 7652 * Split segment and return IE_REATTACH to tell 7653 * as_ctl() 
that current segment changed 7654 */ 7655 new_seg = segvn_split_seg(seg, addr); 7656 new_svd = (struct segvn_data *)new_seg->s_data; 7657 err = IE_REATTACH; 7658 7659 /* 7660 * If new segment ends where old one 7661 * did, try to concatenate the new 7662 * segment with next one. 7663 */ 7664 if (eaddr == oldeaddr) { 7665 /* 7666 * Set policy for new segment 7667 */ 7668 (void) lgrp_privm_policy_set(policy, 7669 &new_svd->policy_info, 7670 new_seg->s_size); 7671 7672 next = AS_SEGNEXT(new_seg->s_as, 7673 new_seg); 7674 7675 if (next && 7676 next->s_ops == &segvn_ops && 7677 eaddr == next->s_base) 7678 (void) segvn_concat(new_seg, 7679 next, 1); 7680 } 7681 } 7682 7683 /* 7684 * Split off end of existing segment if advice only 7685 * applies to a portion of segment ending before 7686 * end of the existing segment 7687 */ 7688 if (eaddr < oldeaddr) { 7689 /* 7690 * Must flush I/O page cache 7691 * before splitting segment 7692 */ 7693 if (svd->softlockcnt > 0) 7694 segvn_purge(seg); 7695 7696 /* 7697 * If beginning of old segment was already 7698 * split off, use new segment to split end off 7699 * from. 7700 */ 7701 if (new_seg != NULL && new_seg != seg) { 7702 /* 7703 * Split segment 7704 */ 7705 (void) segvn_split_seg(new_seg, eaddr); 7706 7707 /* 7708 * Set policy for new segment 7709 */ 7710 (void) lgrp_privm_policy_set(policy, 7711 &new_svd->policy_info, 7712 new_seg->s_size); 7713 } else { 7714 /* 7715 * Split segment and return IE_REATTACH 7716 * to tell as_ctl() that current 7717 * segment changed 7718 */ 7719 (void) segvn_split_seg(seg, eaddr); 7720 err = IE_REATTACH; 7721 7722 (void) lgrp_privm_policy_set(policy, 7723 &svd->policy_info, seg->s_size); 7724 7725 /* 7726 * If new segment starts where old one 7727 * did, try to concatenate it with 7728 * previous segment. 
7729 */ 7730 if (addr == seg->s_base) { 7731 prev = AS_SEGPREV(seg->s_as, 7732 seg); 7733 7734 /* 7735 * Drop lock for private data 7736 * of current segment before 7737 * concatenating (deleting) it 7738 */ 7739 if (prev && 7740 prev->s_ops == 7741 &segvn_ops && 7742 addr == prev->s_base + 7743 prev->s_size) { 7744 SEGVN_LOCK_EXIT( 7745 seg->s_as, 7746 &svd->lock); 7747 (void) segvn_concat( 7748 prev, seg, 1); 7749 return (err); 7750 } 7751 } 7752 } 7753 } 7754 break; 7755 case MADV_SEQUENTIAL: 7756 ASSERT(seg->s_szc == 0); 7757 hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD); 7758 /* FALLTHROUGH */ 7759 case MADV_NORMAL: 7760 case MADV_RANDOM: 7761 bvpp = &svd->vpage[page]; 7762 evpp = &svd->vpage[page + (len >> PAGESHIFT)]; 7763 for (; bvpp < evpp; bvpp++) 7764 VPP_SETADVICE(bvpp, behav); 7765 svd->advice = MADV_NORMAL; 7766 break; 7767 case MADV_WILLNEED: /* handled in memcntl */ 7768 case MADV_DONTNEED: /* handled in memcntl */ 7769 case MADV_FREE: /* handled above */ 7770 break; 7771 default: 7772 err = EINVAL; 7773 } 7774 } 7775 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7776 return (err); 7777 } 7778 7779 /* 7780 * Create a vpage structure for this seg. 7781 */ 7782 static void 7783 segvn_vpage(struct seg *seg) 7784 { 7785 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7786 struct vpage *vp, *evp; 7787 7788 ASSERT(SEGVN_WRITE_HELD(seg->s_as, &svd->lock)); 7789 7790 /* 7791 * If no vpage structure exists, allocate one. Copy the protections 7792 * and the advice from the segment itself to the individual pages. 7793 */ 7794 if (svd->vpage == NULL) { 7795 svd->pageprot = 1; 7796 svd->pageadvice = 1; 7797 svd->vpage = kmem_zalloc(seg_pages(seg) * sizeof (struct vpage), 7798 KM_SLEEP); 7799 evp = &svd->vpage[seg_page(seg, seg->s_base + seg->s_size)]; 7800 for (vp = svd->vpage; vp < evp; vp++) { 7801 VPP_SETPROT(vp, svd->prot); 7802 VPP_SETADVICE(vp, svd->advice); 7803 } 7804 } 7805 } 7806 7807 /* 7808 * Dump the pages belonging to this segvn segment. 
7809 */ 7810 static void 7811 segvn_dump(struct seg *seg) 7812 { 7813 struct segvn_data *svd; 7814 page_t *pp; 7815 struct anon_map *amp; 7816 ulong_t anon_index; 7817 struct vnode *vp; 7818 u_offset_t off, offset; 7819 pfn_t pfn; 7820 pgcnt_t page, npages; 7821 caddr_t addr; 7822 7823 npages = seg_pages(seg); 7824 svd = (struct segvn_data *)seg->s_data; 7825 vp = svd->vp; 7826 off = offset = svd->offset; 7827 addr = seg->s_base; 7828 7829 if ((amp = svd->amp) != NULL) { 7830 anon_index = svd->anon_index; 7831 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7832 } 7833 7834 for (page = 0; page < npages; page++, offset += PAGESIZE) { 7835 struct anon *ap; 7836 int we_own_it = 0; 7837 7838 if (amp && (ap = anon_get_ptr(svd->amp->ahp, anon_index++))) { 7839 swap_xlate_nopanic(ap, &vp, &off); 7840 } else { 7841 vp = svd->vp; 7842 off = offset; 7843 } 7844 7845 /* 7846 * If pp == NULL, the page either does not exist 7847 * or is exclusively locked. So determine if it 7848 * exists before searching for it. 7849 */ 7850 7851 if ((pp = page_lookup_nowait(vp, off, SE_SHARED))) 7852 we_own_it = 1; 7853 else 7854 pp = page_exists(vp, off); 7855 7856 if (pp) { 7857 pfn = page_pptonum(pp); 7858 dump_addpage(seg->s_as, addr, pfn); 7859 if (we_own_it) 7860 page_unlock(pp); 7861 } 7862 addr += PAGESIZE; 7863 dump_timeleft = dump_timeout; 7864 } 7865 7866 if (amp != NULL) 7867 ANON_LOCK_EXIT(&->a_rwlock); 7868 } 7869 7870 /* 7871 * lock/unlock anon pages over a given range. 
Return shadow list 7872 */ 7873 static int 7874 segvn_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***ppp, 7875 enum lock_type type, enum seg_rw rw) 7876 { 7877 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7878 size_t np, adjustpages = 0, npages = (len >> PAGESHIFT); 7879 ulong_t anon_index; 7880 uint_t protchk; 7881 uint_t error; 7882 struct anon_map *amp; 7883 struct page **pplist, **pl, *pp; 7884 caddr_t a; 7885 size_t page; 7886 caddr_t lpgaddr, lpgeaddr; 7887 pgcnt_t szc0_npages = 0; 7888 7889 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_START, 7890 "segvn_pagelock: start seg %p addr %p", seg, addr); 7891 7892 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 7893 if (seg->s_szc != 0 && (type == L_PAGELOCK || type == L_PAGEUNLOCK)) { 7894 /* 7895 * We are adjusting the pagelock region to the large page size 7896 * boundary because the unlocked part of a large page cannot 7897 * be freed anyway unless all constituent pages of a large 7898 * page are locked. Therefore this adjustment allows us to 7899 * decrement availrmem by the right value (note we don't want 7900 * to just decrement availrem by the large page size without 7901 * adjusting addr and len because then we may end up 7902 * decrementing availrmem by large page size for every 7903 * constituent page locked by a new as_pagelock call). 7904 * as_pageunlock caller must always match as_pagelock call's 7905 * addr and len. 7906 * 7907 * Note segment's page size cannot change while we are holding 7908 * as lock. And then it cannot change while softlockcnt is 7909 * not 0. This will allow us to correctly recalculate large 7910 * page size region for the matching pageunlock/reclaim call. 7911 * 7912 * for pageunlock *ppp points to the pointer of page_t that 7913 * corresponds to the real unadjusted start address. Similar 7914 * for pagelock *ppp must point to the pointer of page_t that 7915 * corresponds to the real unadjusted start address. 
7916 */ 7917 size_t pgsz = page_get_pagesize(seg->s_szc); 7918 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 7919 adjustpages = ((uintptr_t)(addr - lpgaddr)) >> PAGESHIFT; 7920 } 7921 7922 if (type == L_PAGEUNLOCK) { 7923 7924 /* 7925 * update hat ref bits for /proc. We need to make sure 7926 * that threads tracing the ref and mod bits of the 7927 * address space get the right data. 7928 * Note: page ref and mod bits are updated at reclaim time 7929 */ 7930 if (seg->s_as->a_vbits) { 7931 for (a = addr; a < addr + len; a += PAGESIZE) { 7932 if (rw == S_WRITE) { 7933 hat_setstat(seg->s_as, a, 7934 PAGESIZE, P_REF | P_MOD); 7935 } else { 7936 hat_setstat(seg->s_as, a, 7937 PAGESIZE, P_REF); 7938 } 7939 } 7940 } 7941 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7942 if (seg->s_szc != 0) { 7943 VM_STAT_ADD(segvnvmstats.pagelock[0]); 7944 seg_pinactive(seg, lpgaddr, lpgeaddr - lpgaddr, 7945 *ppp - adjustpages, rw, segvn_reclaim); 7946 } else { 7947 seg_pinactive(seg, addr, len, *ppp, rw, segvn_reclaim); 7948 } 7949 7950 /* 7951 * If someone is blocked while unmapping, we purge 7952 * segment page cache and thus reclaim pplist synchronously 7953 * without waiting for seg_pasync_thread. This speeds up 7954 * unmapping in cases where munmap(2) is called, while 7955 * raw async i/o is still in progress or where a thread 7956 * exits on data fault in a multithreaded application. 7957 */ 7958 if (AS_ISUNMAPWAIT(seg->s_as) && (svd->softlockcnt > 0)) { 7959 /* 7960 * Even if we grab segvn WRITER's lock or segp_slock 7961 * here, there might be another thread which could've 7962 * successfully performed lookup/insert just before 7963 * we acquired the lock here. So, grabbing either 7964 * lock here is of not much use. Until we devise 7965 * a strategy at upper layers to solve the 7966 * synchronization issues completely, we expect 7967 * applications to handle this appropriately. 
7968 */ 7969 segvn_purge(seg); 7970 } 7971 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7972 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END, 7973 "segvn_pagelock: unlock seg %p addr %p", seg, addr); 7974 return (0); 7975 } else if (type == L_PAGERECLAIM) { 7976 VM_STAT_COND_ADD(seg->s_szc != 0, segvnvmstats.pagelock[1]); 7977 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7978 (void) segvn_reclaim(seg, addr, len, *ppp, rw); 7979 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7980 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END, 7981 "segvn_pagelock: reclaim seg %p addr %p", seg, addr); 7982 return (0); 7983 } 7984 7985 if (seg->s_szc != 0) { 7986 VM_STAT_ADD(segvnvmstats.pagelock[2]); 7987 addr = lpgaddr; 7988 len = lpgeaddr - lpgaddr; 7989 npages = (len >> PAGESHIFT); 7990 } 7991 7992 /* 7993 * for now we only support pagelock to anon memory. We've to check 7994 * protections for vnode objects and call into the vnode driver. 7995 * That's too much for a fast path. Let the fault entry point handle it. 7996 */ 7997 if (svd->vp != NULL) { 7998 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, 7999 "segvn_pagelock: mapped vnode seg %p addr %p", seg, addr); 8000 *ppp = NULL; 8001 return (ENOTSUP); 8002 } 8003 8004 /* 8005 * if anonmap is not yet created, let the fault entry point populate it 8006 * with anon ptrs. 
8007 */ 8008 if ((amp = svd->amp) == NULL) { 8009 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, 8010 "segvn_pagelock: anonmap null seg %p addr %p", seg, addr); 8011 *ppp = NULL; 8012 return (EFAULT); 8013 } 8014 8015 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 8016 8017 /* 8018 * we acquire segp_slock to prevent duplicate entries 8019 * in seg_pcache 8020 */ 8021 mutex_enter(&svd->segp_slock); 8022 8023 /* 8024 * try to find pages in segment page cache 8025 */ 8026 pplist = seg_plookup(seg, addr, len, rw); 8027 if (pplist != NULL) { 8028 mutex_exit(&svd->segp_slock); 8029 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8030 *ppp = pplist + adjustpages; 8031 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_HIT_END, 8032 "segvn_pagelock: cache hit seg %p addr %p", seg, addr); 8033 return (0); 8034 } 8035 8036 if (rw == S_READ) { 8037 protchk = PROT_READ; 8038 } else { 8039 protchk = PROT_WRITE; 8040 } 8041 8042 if (svd->pageprot == 0) { 8043 if ((svd->prot & protchk) == 0) { 8044 mutex_exit(&svd->segp_slock); 8045 error = EFAULT; 8046 goto out; 8047 } 8048 } else { 8049 /* 8050 * check page protections 8051 */ 8052 for (a = addr; a < addr + len; a += PAGESIZE) { 8053 struct vpage *vp; 8054 8055 vp = &svd->vpage[seg_page(seg, a)]; 8056 if ((VPP_PROT(vp) & protchk) == 0) { 8057 mutex_exit(&svd->segp_slock); 8058 error = EFAULT; 8059 goto out; 8060 } 8061 } 8062 } 8063 8064 /* 8065 * Avoid per page overhead of segvn_pp_lock_anonpages() for small 8066 * pages. For large pages segvn_pp_lock_anonpages() only does real 8067 * work once per large page. The tradeoff is that we may decrement 8068 * availrmem more than once for the same page but this is ok 8069 * for small pages. 
8070 */ 8071 if (seg->s_szc == 0) { 8072 mutex_enter(&freemem_lock); 8073 if (availrmem < tune.t_minarmem + npages) { 8074 mutex_exit(&freemem_lock); 8075 mutex_exit(&svd->segp_slock); 8076 error = ENOMEM; 8077 goto out; 8078 } 8079 availrmem -= npages; 8080 mutex_exit(&freemem_lock); 8081 } 8082 8083 pplist = kmem_alloc(sizeof (page_t *) * npages, KM_SLEEP); 8084 pl = pplist; 8085 *ppp = pplist + adjustpages; 8086 8087 page = seg_page(seg, addr); 8088 anon_index = svd->anon_index + page; 8089 8090 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 8091 for (a = addr; a < addr + len; a += PAGESIZE, anon_index++) { 8092 struct anon *ap; 8093 struct vnode *vp; 8094 u_offset_t off; 8095 anon_sync_obj_t cookie; 8096 8097 anon_array_enter(amp, anon_index, &cookie); 8098 ap = anon_get_ptr(amp->ahp, anon_index); 8099 if (ap == NULL) { 8100 anon_array_exit(&cookie); 8101 break; 8102 } else { 8103 /* 8104 * We must never use seg_pcache for COW pages 8105 * because we might end up with original page still 8106 * lying in seg_pcache even after private page is 8107 * created. This leads to data corruption as 8108 * aio_write refers to the page still in cache 8109 * while all other accesses refer to the private 8110 * page. 
8111 */ 8112 if (ap->an_refcnt != 1) { 8113 anon_array_exit(&cookie); 8114 break; 8115 } 8116 } 8117 swap_xlate(ap, &vp, &off); 8118 anon_array_exit(&cookie); 8119 8120 pp = page_lookup_nowait(vp, off, SE_SHARED); 8121 if (pp == NULL) { 8122 break; 8123 } 8124 if (seg->s_szc != 0 || pp->p_szc != 0) { 8125 if (!segvn_pp_lock_anonpages(pp, a == addr)) { 8126 page_unlock(pp); 8127 break; 8128 } 8129 } else { 8130 szc0_npages++; 8131 } 8132 *pplist++ = pp; 8133 } 8134 ANON_LOCK_EXIT(&->a_rwlock); 8135 8136 ASSERT(npages >= szc0_npages); 8137 8138 if (a >= addr + len) { 8139 mutex_enter(&freemem_lock); 8140 if (seg->s_szc == 0 && npages != szc0_npages) { 8141 ASSERT(svd->type == MAP_SHARED && amp->a_szc > 0); 8142 availrmem += (npages - szc0_npages); 8143 } 8144 svd->softlockcnt += npages; 8145 segvn_pages_locked += npages; 8146 mutex_exit(&freemem_lock); 8147 (void) seg_pinsert(seg, addr, len, pl, rw, SEGP_ASYNC_FLUSH, 8148 segvn_reclaim); 8149 mutex_exit(&svd->segp_slock); 8150 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8151 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_FILL_END, 8152 "segvn_pagelock: cache fill seg %p addr %p", seg, addr); 8153 return (0); 8154 } 8155 8156 mutex_exit(&svd->segp_slock); 8157 if (seg->s_szc == 0) { 8158 mutex_enter(&freemem_lock); 8159 availrmem += npages; 8160 mutex_exit(&freemem_lock); 8161 } 8162 error = EFAULT; 8163 pplist = pl; 8164 np = ((uintptr_t)(a - addr)) >> PAGESHIFT; 8165 while (np > (uint_t)0) { 8166 ASSERT(PAGE_LOCKED(*pplist)); 8167 if (seg->s_szc != 0 || (*pplist)->p_szc != 0) { 8168 segvn_pp_unlock_anonpages(*pplist, pplist == pl); 8169 } 8170 page_unlock(*pplist); 8171 np--; 8172 pplist++; 8173 } 8174 kmem_free(pl, sizeof (page_t *) * npages); 8175 out: 8176 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8177 *ppp = NULL; 8178 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, 8179 "segvn_pagelock: cache miss seg %p addr %p", seg, addr); 8180 return (error); 8181 } 8182 8183 /* 8184 * purge any cached pages in the I/O page cache 8185 
 */
static void
segvn_purge(struct seg *seg)
{
	seg_ppurge(seg);
}

/*
 * Reclaim callback for the segment page cache (registered via
 * seg_pinsert()/seg_pinactive() in segvn_pagelock()): propagate ref/mod
 * bits to the hat, drop the shared page locks taken at pagelock time,
 * return the availrmem charged for small pages, and wake up anyone
 * waiting in as_unmap() once softlockcnt drains to zero.
 */
static int
segvn_reclaim(struct seg *seg, caddr_t addr, size_t len, struct page **pplist,
    enum seg_rw rw)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	pgcnt_t np, npages;
	struct page **pl;
	pgcnt_t szc0_npages = 0;	/* small pages: availrmem to refund */

#ifdef lint
	addr = addr;	/* addr only used in the large-page alignment check */
#endif

	npages = np = (len >> PAGESHIFT);
	ASSERT(npages);
	pl = pplist;
	if (seg->s_szc != 0) {
		/* pagelock adjusted the region to large page boundaries */
		size_t pgsz = page_get_pagesize(seg->s_szc);
		if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) {
			panic("segvn_reclaim: unaligned addr or len");
			/*NOTREACHED*/
		}
	}

	/* pagelock only caches anon-backed segments */
	ASSERT(svd->vp == NULL && svd->amp != NULL);

	while (np > (uint_t)0) {
		/* ref/mod bits are recorded now, at reclaim time */
		if (rw == S_WRITE) {
			hat_setrefmod(*pplist);
		} else {
			hat_setref(*pplist);
		}
		if (seg->s_szc != 0 || (*pplist)->p_szc != 0) {
			segvn_pp_unlock_anonpages(*pplist, pplist == pl);
		} else {
			szc0_npages++;
		}
		page_unlock(*pplist);
		np--;
		pplist++;
	}
	kmem_free(pl, sizeof (page_t *) * npages);

	mutex_enter(&freemem_lock);
	segvn_pages_locked -= npages;
	svd->softlockcnt -= npages;
	if (szc0_npages != 0) {
		availrmem += szc0_npages;
	}
	mutex_exit(&freemem_lock);
	if (svd->softlockcnt <= 0) {
		/* re-check under a_contents to avoid a lost wakeup */
		if (AS_ISUNMAPWAIT(seg->s_as)) {
			mutex_enter(&seg->s_as->a_contents);
			if (AS_ISUNMAPWAIT(seg->s_as)) {
				AS_CLRUNMAPWAIT(seg->s_as);
				cv_broadcast(&seg->s_as->a_cv);
			}
			mutex_exit(&seg->s_as->a_contents);
		}
	}
	return (0);
}
/*
 * get a memory ID for an addr in a given segment
 *
 * XXX only creates PAGESIZE pages if anon slots are not initialized.
 * At fault time they will be relocated into larger pages.
8259 */ 8260 static int 8261 segvn_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp) 8262 { 8263 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 8264 struct anon *ap = NULL; 8265 ulong_t anon_index; 8266 struct anon_map *amp; 8267 anon_sync_obj_t cookie; 8268 8269 if (svd->type == MAP_PRIVATE) { 8270 memidp->val[0] = (uintptr_t)seg->s_as; 8271 memidp->val[1] = (uintptr_t)addr; 8272 return (0); 8273 } 8274 8275 if (svd->type == MAP_SHARED) { 8276 if (svd->vp) { 8277 memidp->val[0] = (uintptr_t)svd->vp; 8278 memidp->val[1] = (u_longlong_t)svd->offset + 8279 (uintptr_t)(addr - seg->s_base); 8280 return (0); 8281 } else { 8282 8283 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 8284 if ((amp = svd->amp) != NULL) { 8285 anon_index = svd->anon_index + 8286 seg_page(seg, addr); 8287 } 8288 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8289 8290 ASSERT(amp != NULL); 8291 8292 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 8293 anon_array_enter(amp, anon_index, &cookie); 8294 ap = anon_get_ptr(amp->ahp, anon_index); 8295 if (ap == NULL) { 8296 page_t *pp; 8297 8298 pp = anon_zero(seg, addr, &ap, svd->cred); 8299 if (pp == NULL) { 8300 anon_array_exit(&cookie); 8301 ANON_LOCK_EXIT(&->a_rwlock); 8302 return (ENOMEM); 8303 } 8304 ASSERT(anon_get_ptr(amp->ahp, anon_index) 8305 == NULL); 8306 (void) anon_set_ptr(amp->ahp, anon_index, 8307 ap, ANON_SLEEP); 8308 page_unlock(pp); 8309 } 8310 8311 anon_array_exit(&cookie); 8312 ANON_LOCK_EXIT(&->a_rwlock); 8313 8314 memidp->val[0] = (uintptr_t)ap; 8315 memidp->val[1] = (uintptr_t)addr & PAGEOFFSET; 8316 return (0); 8317 } 8318 } 8319 return (EINVAL); 8320 } 8321 8322 static int 8323 sameprot(struct seg *seg, caddr_t a, size_t len) 8324 { 8325 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 8326 struct vpage *vpage; 8327 spgcnt_t pages = btop(len); 8328 uint_t prot; 8329 8330 if (svd->pageprot == 0) 8331 return (1); 8332 8333 ASSERT(svd->vpage != NULL); 8334 8335 vpage = &svd->vpage[seg_page(seg, a)]; 8336 prot 
= VPP_PROT(vpage); 8337 vpage++; 8338 pages--; 8339 while (pages-- > 0) { 8340 if (prot != VPP_PROT(vpage)) 8341 return (0); 8342 vpage++; 8343 } 8344 return (1); 8345 } 8346 8347 /* 8348 * Get memory allocation policy info for specified address in given segment 8349 */ 8350 static lgrp_mem_policy_info_t * 8351 segvn_getpolicy(struct seg *seg, caddr_t addr) 8352 { 8353 struct anon_map *amp; 8354 ulong_t anon_index; 8355 lgrp_mem_policy_info_t *policy_info; 8356 struct segvn_data *svn_data; 8357 u_offset_t vn_off; 8358 vnode_t *vp; 8359 8360 ASSERT(seg != NULL); 8361 8362 svn_data = (struct segvn_data *)seg->s_data; 8363 if (svn_data == NULL) 8364 return (NULL); 8365 8366 /* 8367 * Get policy info for private or shared memory 8368 */ 8369 if (svn_data->type != MAP_SHARED) 8370 policy_info = &svn_data->policy_info; 8371 else { 8372 amp = svn_data->amp; 8373 anon_index = svn_data->anon_index + seg_page(seg, addr); 8374 vp = svn_data->vp; 8375 vn_off = svn_data->offset + (uintptr_t)(addr - seg->s_base); 8376 policy_info = lgrp_shm_policy_get(amp, anon_index, vp, vn_off); 8377 } 8378 8379 return (policy_info); 8380 } 8381 8382 /*ARGSUSED*/ 8383 static int 8384 segvn_capable(struct seg *seg, segcapability_t capability) 8385 { 8386 return (0); 8387 } 8388