1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 37 */ 38 39 #pragma ident "%Z%%M% %I% %E% SMI" 40 41 /* 42 * VM - shared or copy-on-write from a vnode/anonymous memory. 43 */ 44 45 #include <sys/types.h> 46 #include <sys/param.h> 47 #include <sys/t_lock.h> 48 #include <sys/errno.h> 49 #include <sys/systm.h> 50 #include <sys/mman.h> 51 #include <sys/debug.h> 52 #include <sys/cred.h> 53 #include <sys/vmsystm.h> 54 #include <sys/tuneable.h> 55 #include <sys/bitmap.h> 56 #include <sys/swap.h> 57 #include <sys/kmem.h> 58 #include <sys/sysmacros.h> 59 #include <sys/vtrace.h> 60 #include <sys/cmn_err.h> 61 #include <sys/vm.h> 62 #include <sys/dumphdr.h> 63 #include <sys/lgrp.h> 64 65 #include <vm/hat.h> 66 #include <vm/as.h> 67 #include <vm/seg.h> 68 #include <vm/seg_vn.h> 69 #include <vm/pvn.h> 70 #include <vm/anon.h> 71 #include <vm/page.h> 72 #include <vm/vpage.h> 73 #include <sys/proc.h> 74 #include <sys/task.h> 75 #include <sys/project.h> 76 #include <sys/zone.h> 77 #include <sys/shm_impl.h> 78 /* 79 * Private seg op routines. 
80 */ 81 static int segvn_dup(struct seg *seg, struct seg *newseg); 82 static int segvn_unmap(struct seg *seg, caddr_t addr, size_t len); 83 static void segvn_free(struct seg *seg); 84 static faultcode_t segvn_fault(struct hat *hat, struct seg *seg, 85 caddr_t addr, size_t len, enum fault_type type, 86 enum seg_rw rw); 87 static faultcode_t segvn_faulta(struct seg *seg, caddr_t addr); 88 static int segvn_setprot(struct seg *seg, caddr_t addr, 89 size_t len, uint_t prot); 90 static int segvn_checkprot(struct seg *seg, caddr_t addr, 91 size_t len, uint_t prot); 92 static int segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta); 93 static size_t segvn_swapout(struct seg *seg); 94 static int segvn_sync(struct seg *seg, caddr_t addr, size_t len, 95 int attr, uint_t flags); 96 static size_t segvn_incore(struct seg *seg, caddr_t addr, size_t len, 97 char *vec); 98 static int segvn_lockop(struct seg *seg, caddr_t addr, size_t len, 99 int attr, int op, ulong_t *lockmap, size_t pos); 100 static int segvn_getprot(struct seg *seg, caddr_t addr, size_t len, 101 uint_t *protv); 102 static u_offset_t segvn_getoffset(struct seg *seg, caddr_t addr); 103 static int segvn_gettype(struct seg *seg, caddr_t addr); 104 static int segvn_getvp(struct seg *seg, caddr_t addr, 105 struct vnode **vpp); 106 static int segvn_advise(struct seg *seg, caddr_t addr, size_t len, 107 uint_t behav); 108 static void segvn_dump(struct seg *seg); 109 static int segvn_pagelock(struct seg *seg, caddr_t addr, size_t len, 110 struct page ***ppp, enum lock_type type, enum seg_rw rw); 111 static int segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len, 112 uint_t szc); 113 static int segvn_getmemid(struct seg *seg, caddr_t addr, 114 memid_t *memidp); 115 static lgrp_mem_policy_info_t *segvn_getpolicy(struct seg *, caddr_t); 116 static int segvn_capable(struct seg *seg, segcapability_t capable); 117 118 struct seg_ops segvn_ops = { 119 segvn_dup, 120 segvn_unmap, 121 segvn_free, 122 segvn_fault, 123 segvn_faulta, 124 segvn_setprot, 125 segvn_checkprot, 126 segvn_kluster, 127 segvn_swapout, 128 segvn_sync, 129 segvn_incore, 130 segvn_lockop, 131 segvn_getprot, 132 segvn_getoffset, 133 segvn_gettype, 134 segvn_getvp, 135 segvn_advise, 136 segvn_dump, 137 segvn_pagelock, 138 segvn_setpagesize, 139 segvn_getmemid, 140 segvn_getpolicy, 141 segvn_capable, 142 }; 143 144 /* 145 * Common zfod structures, provided as a shorthand for others to use. 
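 *
 * For example (illustrative sketch only, not a quote from any caller):
 * a caller that wants an anonymous zero-fill-on-demand mapping simply
 * hands zfod_argsp to as_map():
 *
 *	error = as_map(as, addr, len, segvn_create, zfod_argsp);
 *
 * kzfod_argsp and stack_noexec_argsp are intended to be used the same
 * way for kernel zfod mappings and non-executable user stacks.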
146 */ 147 static segvn_crargs_t zfod_segvn_crargs = 148 SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL); 149 static segvn_crargs_t kzfod_segvn_crargs = 150 SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_USER, 151 PROT_ALL & ~PROT_USER); 152 static segvn_crargs_t stack_noexec_crargs = 153 SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_EXEC, PROT_ALL); 154 155 caddr_t zfod_argsp = (caddr_t)&zfod_segvn_crargs; /* user zfod argsp */ 156 caddr_t kzfod_argsp = (caddr_t)&kzfod_segvn_crargs; /* kernel zfod argsp */ 157 caddr_t stack_exec_argsp = (caddr_t)&zfod_segvn_crargs; /* executable stack */ 158 caddr_t stack_noexec_argsp = (caddr_t)&stack_noexec_crargs; /* noexec stack */ 159 160 #define vpgtob(n) ((n) * sizeof (struct vpage)) /* For brevity */ 161 162 size_t segvn_comb_thrshld = UINT_MAX; /* patchable -- see 1196681 */ 163 164 static int segvn_concat(struct seg *, struct seg *, int); 165 static int segvn_extend_prev(struct seg *, struct seg *, 166 struct segvn_crargs *, size_t); 167 static int segvn_extend_next(struct seg *, struct seg *, 168 struct segvn_crargs *, size_t); 169 static void segvn_softunlock(struct seg *, caddr_t, size_t, enum seg_rw); 170 static void segvn_pagelist_rele(page_t **); 171 static void segvn_setvnode_mpss(vnode_t *); 172 static void segvn_relocate_pages(page_t **, page_t *); 173 static int segvn_full_szcpages(page_t **, uint_t, int *, uint_t *); 174 static int segvn_fill_vp_pages(struct segvn_data *, vnode_t *, u_offset_t, 175 uint_t, page_t **, page_t **, uint_t *, int *); 176 static faultcode_t segvn_fault_vnodepages(struct hat *, struct seg *, caddr_t, 177 caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int); 178 static faultcode_t segvn_fault_anonpages(struct hat *, struct seg *, caddr_t, 179 caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int); 180 static faultcode_t segvn_faultpage(struct hat *, struct seg *, caddr_t, 181 u_offset_t, struct vpage *, page_t **, uint_t, 182 enum fault_type, enum seg_rw, int, int); 183 static void segvn_vpage(struct seg *); 184 185 static void segvn_purge(struct seg *seg); 186 static int segvn_reclaim(struct seg *, caddr_t, size_t, struct page **, 187 enum seg_rw); 188 189 static int sameprot(struct seg *, caddr_t, size_t); 190 191 static int segvn_demote_range(struct seg *, caddr_t, size_t, int, uint_t); 192 static int segvn_clrszc(struct seg *); 193 static struct seg *segvn_split_seg(struct seg *, caddr_t); 194 static int segvn_claim_pages(struct seg *, struct vpage *, u_offset_t, 195 ulong_t, uint_t); 196 197 static int segvn_pp_lock_anonpages(page_t *, int); 198 static void segvn_pp_unlock_anonpages(page_t *, int); 199 200 static struct kmem_cache *segvn_cache; 201 202 #ifdef VM_STATS 203 static struct segvnvmstats_str { 204 ulong_t fill_vp_pages[31]; 205 ulong_t fltvnpages[49]; 206 ulong_t fullszcpages[10]; 207 ulong_t relocatepages[3]; 208 ulong_t fltanpages[17]; 209 ulong_t pagelock[3]; 210 ulong_t demoterange[3]; 211 } segvnvmstats; 212 #endif /* VM_STATS */ 213 214 #define SDR_RANGE 1 /* demote entire range */ 215 #define SDR_END 2 /* demote non aligned ends only */ 216 217 #define CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr) { \ 218 if ((len) != 0) { \ 219 lpgaddr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz); \ 220 ASSERT(lpgaddr >= (seg)->s_base); \ 221 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)((addr) + \ 222 (len)), pgsz); \ 223 ASSERT(lpgeaddr > lpgaddr); \ 224 ASSERT(lpgeaddr <= (seg)->s_base + (seg)->s_size); \ 225 } else { \ 226 lpgeaddr = lpgaddr = (addr); \ 227 } \ 228 } 229 230 /*ARGSUSED*/ 231 static int 232 
segvn_cache_constructor(void *buf, void *cdrarg, int kmflags) 233 { 234 struct segvn_data *svd = buf; 235 236 rw_init(&svd->lock, NULL, RW_DEFAULT, NULL); 237 mutex_init(&svd->segp_slock, NULL, MUTEX_DEFAULT, NULL); 238 return (0); 239 } 240 241 /*ARGSUSED1*/ 242 static void 243 segvn_cache_destructor(void *buf, void *cdrarg) 244 { 245 struct segvn_data *svd = buf; 246 247 rw_destroy(&svd->lock); 248 mutex_destroy(&svd->segp_slock); 249 } 250 251 /* 252 * Patching this variable to non-zero allows the system to run with 253 * stacks marked as "not executable". It's a bit of a kludge, but is 254 * provided as a tweakable for platforms that export those ABIs 255 * (e.g. sparc V8) that have executable stacks enabled by default. 256 * There are also some restrictions for platforms that don't actually 257 * implement 'noexec' protections. 258 * 259 * Once enabled, the system is (therefore) unable to provide a fully 260 * ABI-compliant execution environment, though practically speaking, 261 * most everything works. The exceptions are generally some interpreters 262 * and debuggers that create executable code on the stack and jump 263 * into it (without explicitly mprotecting the address range to include 264 * PROT_EXEC). 265 * 266 * One important class of applications that are disabled are those 267 * that have been transformed into malicious agents using one of the 268 * numerous "buffer overflow" attacks. See 4007890. 269 */ 270 int noexec_user_stack = 0; 271 int noexec_user_stack_log = 1; 272 273 int segvn_lpg_disable = 0; 274 uint_t segvn_maxpgszc = 0; 275 276 ulong_t segvn_vmpss_clrszc_cnt; 277 ulong_t segvn_vmpss_clrszc_err; 278 ulong_t segvn_fltvnpages_clrszc_cnt; 279 ulong_t segvn_fltvnpages_clrszc_err; 280 ulong_t segvn_setpgsz_align_err; 281 ulong_t segvn_setpgsz_anon_align_err; 282 ulong_t segvn_setpgsz_getattr_err; 283 ulong_t segvn_setpgsz_eof_err; 284 ulong_t segvn_faultvnmpss_align_err1; 285 ulong_t segvn_faultvnmpss_align_err2; 286 ulong_t segvn_faultvnmpss_align_err3; 287 ulong_t segvn_faultvnmpss_align_err4; 288 ulong_t segvn_faultvnmpss_align_err5; 289 ulong_t segvn_vmpss_pageio_deadlk_err; 290 291 /* 292 * Initialize segvn data structures 293 */ 294 void 295 segvn_init(void) 296 { 297 uint_t maxszc; 298 uint_t szc; 299 size_t pgsz; 300 301 segvn_cache = kmem_cache_create("segvn_cache", 302 sizeof (struct segvn_data), 0, 303 segvn_cache_constructor, segvn_cache_destructor, NULL, 304 NULL, NULL, 0); 305 306 if (segvn_lpg_disable != 0) 307 return; 308 szc = maxszc = page_num_pagesizes() - 1; 309 if (szc == 0) { 310 segvn_lpg_disable = 1; 311 return; 312 } 313 if (page_get_pagesize(0) != PAGESIZE) { 314 panic("segvn_init: bad szc 0"); 315 /*NOTREACHED*/ 316 } 317 while (szc != 0) { 318 pgsz = page_get_pagesize(szc); 319 if (pgsz <= PAGESIZE || !IS_P2ALIGNED(pgsz, pgsz)) { 320 panic("segvn_init: bad szc %d", szc); 321 /*NOTREACHED*/ 322 } 323 szc--; 324 } 325 if (segvn_maxpgszc == 0 || segvn_maxpgszc > maxszc) 326 segvn_maxpgszc = maxszc; 327 } 328 329 #define SEGVN_PAGEIO ((void *)0x1) 330 #define SEGVN_NOPAGEIO ((void *)0x2) 331 332 static void 333 segvn_setvnode_mpss(vnode_t *vp) 334 { 335 int err; 336 337 ASSERT(vp->v_mpssdata == NULL || 338 vp->v_mpssdata == SEGVN_PAGEIO || 339 vp->v_mpssdata == SEGVN_NOPAGEIO); 340 341 if (vp->v_mpssdata == NULL) { 342 if (vn_vmpss_usepageio(vp)) { 343 err = VOP_PAGEIO(vp, (page_t *)NULL, 344 (u_offset_t)0, 0, 0, CRED()); 345 } else { 346 err = ENOSYS; 347 } 348 /* 349 * set v_mpssdata just once per vnode life 350 * so that it never changes. 
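		 *
		 * (Descriptive note: an EINVAL return from the zero-length
		 * VOP_PAGEIO() probe above is taken to mean the filesystem
		 * supports pageio, so SEGVN_PAGEIO is recorded; any other
		 * result, including the ENOSYS default, records
		 * SEGVN_NOPAGEIO.)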
351 */ 352 mutex_enter(&vp->v_lock); 353 if (vp->v_mpssdata == NULL) { 354 if (err == EINVAL) { 355 vp->v_mpssdata = SEGVN_PAGEIO; 356 } else { 357 vp->v_mpssdata = SEGVN_NOPAGEIO; 358 } 359 } 360 mutex_exit(&vp->v_lock); 361 } 362 } 363 364 int 365 segvn_create(struct seg *seg, void *argsp) 366 { 367 struct segvn_crargs *a = (struct segvn_crargs *)argsp; 368 struct segvn_data *svd; 369 size_t swresv = 0; 370 struct cred *cred; 371 struct anon_map *amp; 372 int error = 0; 373 size_t pgsz; 374 lgrp_mem_policy_t mpolicy = LGRP_MEM_POLICY_DEFAULT; 375 376 377 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 378 379 if (a->type != MAP_PRIVATE && a->type != MAP_SHARED) { 380 panic("segvn_create type"); 381 /*NOTREACHED*/ 382 } 383 384 /* 385 * Check arguments. If a shared anon structure is given then 386 * it is illegal to also specify a vp. 387 */ 388 if (a->amp != NULL && a->vp != NULL) { 389 panic("segvn_create anon_map"); 390 /*NOTREACHED*/ 391 } 392 393 /* MAP_NORESERVE on a MAP_SHARED segment is meaningless. */ 394 if (a->type == MAP_SHARED) 395 a->flags &= ~MAP_NORESERVE; 396 397 if (a->szc != 0) { 398 if (segvn_lpg_disable != 0 || (a->szc == AS_MAP_NO_LPOOB) || 399 (a->amp != NULL && a->type == MAP_PRIVATE) || 400 (a->flags & MAP_NORESERVE) || seg->s_as == &kas) { 401 a->szc = 0; 402 } else { 403 if (a->szc > segvn_maxpgszc) 404 a->szc = segvn_maxpgszc; 405 pgsz = page_get_pagesize(a->szc); 406 if (!IS_P2ALIGNED(seg->s_base, pgsz) || 407 !IS_P2ALIGNED(seg->s_size, pgsz)) { 408 a->szc = 0; 409 } else if (a->vp != NULL) { 410 extern struct vnode kvp; 411 if (IS_SWAPFSVP(a->vp) || VN_ISKAS(a->vp)) { 412 /* 413 * paranoid check. 414 * hat_page_demote() is not supported 415 * on swapfs pages. 416 */ 417 a->szc = 0; 418 } else if (map_addr_vacalign_check(seg->s_base, 419 a->offset & PAGEMASK)) { 420 a->szc = 0; 421 } 422 } else if (a->amp != NULL) { 423 pgcnt_t anum = btopr(a->offset); 424 pgcnt_t pgcnt = page_get_pagecnt(a->szc); 425 if (!IS_P2ALIGNED(anum, pgcnt)) { 426 a->szc = 0; 427 } 428 } 429 } 430 } 431 432 /* 433 * If segment may need private pages, reserve them now. 434 */ 435 if (!(a->flags & MAP_NORESERVE) && ((a->vp == NULL && a->amp == NULL) || 436 (a->type == MAP_PRIVATE && (a->prot & PROT_WRITE)))) { 437 if (anon_resv(seg->s_size) == 0) 438 return (EAGAIN); 439 swresv = seg->s_size; 440 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 441 seg, swresv, 1); 442 } 443 444 /* 445 * Reserve any mapping structures that may be required. 446 */ 447 hat_map(seg->s_as->a_hat, seg->s_base, seg->s_size, HAT_MAP); 448 449 if (a->cred) { 450 cred = a->cred; 451 crhold(cred); 452 } else { 453 crhold(cred = CRED()); 454 } 455 456 /* Inform the vnode of the new mapping */ 457 if (a->vp) { 458 error = VOP_ADDMAP(a->vp, a->offset & PAGEMASK, 459 seg->s_as, seg->s_base, seg->s_size, a->prot, 460 a->maxprot, a->type, cred); 461 if (error) { 462 if (swresv != 0) { 463 anon_unresv(swresv); 464 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 465 "anon proc:%p %lu %u", 466 seg, swresv, 0); 467 } 468 crfree(cred); 469 hat_unload(seg->s_as->a_hat, seg->s_base, 470 seg->s_size, HAT_UNLOAD_UNMAP); 471 return (error); 472 } 473 } 474 475 /* 476 * If more than one segment in the address space, and 477 * they're adjacent virtually, try to concatenate them. 478 * Don't concatenate if an explicit anon_map structure 479 * was supplied (e.g., SystemV shared memory). 
480 */ 481 if (a->amp == NULL) { 482 struct seg *pseg, *nseg; 483 struct segvn_data *psvd, *nsvd; 484 lgrp_mem_policy_t ppolicy, npolicy; 485 uint_t lgrp_mem_policy_flags = 0; 486 extern lgrp_mem_policy_t lgrp_mem_default_policy; 487 488 /* 489 * Memory policy flags (lgrp_mem_policy_flags) is valid when 490 * extending stack/heap segments. 491 */ 492 if ((a->vp == NULL) && (a->type == MAP_PRIVATE) && 493 !(a->flags & MAP_NORESERVE) && (seg->s_as != &kas)) { 494 lgrp_mem_policy_flags = a->lgrp_mem_policy_flags; 495 } else { 496 /* 497 * Get policy when not extending it from another segment 498 */ 499 mpolicy = lgrp_mem_policy_default(seg->s_size, a->type); 500 } 501 502 /* 503 * First, try to concatenate the previous and new segments 504 */ 505 pseg = AS_SEGPREV(seg->s_as, seg); 506 if (pseg != NULL && 507 pseg->s_base + pseg->s_size == seg->s_base && 508 pseg->s_ops == &segvn_ops) { 509 /* 510 * Get memory allocation policy from previous segment. 511 * When extension is specified (e.g. for heap) apply 512 * this policy to the new segment regardless of the 513 * outcome of segment concatenation. Extension occurs 514 * for non-default policy otherwise default policy is 515 * used and is based on extended segment size. 516 */ 517 psvd = (struct segvn_data *)pseg->s_data; 518 ppolicy = psvd->policy_info.mem_policy; 519 if (lgrp_mem_policy_flags == 520 LGRP_MP_FLAG_EXTEND_UP) { 521 if (ppolicy != lgrp_mem_default_policy) { 522 mpolicy = ppolicy; 523 } else { 524 mpolicy = lgrp_mem_policy_default( 525 pseg->s_size + seg->s_size, 526 a->type); 527 } 528 } 529 530 if (mpolicy == ppolicy && 531 (pseg->s_size + seg->s_size <= 532 segvn_comb_thrshld || psvd->amp == NULL) && 533 segvn_extend_prev(pseg, seg, a, swresv) == 0) { 534 /* 535 * success! now try to concatenate 536 * with following seg 537 */ 538 crfree(cred); 539 nseg = AS_SEGNEXT(pseg->s_as, pseg); 540 if (nseg != NULL && 541 nseg != pseg && 542 nseg->s_ops == &segvn_ops && 543 pseg->s_base + pseg->s_size == 544 nseg->s_base) 545 (void) segvn_concat(pseg, nseg, 0); 546 ASSERT(pseg->s_szc == 0 || 547 (a->szc == pseg->s_szc && 548 IS_P2ALIGNED(pseg->s_base, pgsz) && 549 IS_P2ALIGNED(pseg->s_size, pgsz))); 550 return (0); 551 } 552 } 553 554 /* 555 * Failed, so try to concatenate with following seg 556 */ 557 nseg = AS_SEGNEXT(seg->s_as, seg); 558 if (nseg != NULL && 559 seg->s_base + seg->s_size == nseg->s_base && 560 nseg->s_ops == &segvn_ops) { 561 /* 562 * Get memory allocation policy from next segment. 563 * When extension is specified (e.g. for stack) apply 564 * this policy to the new segment regardless of the 565 * outcome of segment concatenation. Extension occurs 566 * for non-default policy otherwise default policy is 567 * used and is based on extended segment size. 
568 */ 569 nsvd = (struct segvn_data *)nseg->s_data; 570 npolicy = nsvd->policy_info.mem_policy; 571 if (lgrp_mem_policy_flags == 572 LGRP_MP_FLAG_EXTEND_DOWN) { 573 if (npolicy != lgrp_mem_default_policy) { 574 mpolicy = npolicy; 575 } else { 576 mpolicy = lgrp_mem_policy_default( 577 nseg->s_size + seg->s_size, 578 a->type); 579 } 580 } 581 582 if (mpolicy == npolicy && 583 segvn_extend_next(seg, nseg, a, swresv) == 0) { 584 crfree(cred); 585 ASSERT(nseg->s_szc == 0 || 586 (a->szc == nseg->s_szc && 587 IS_P2ALIGNED(nseg->s_base, pgsz) && 588 IS_P2ALIGNED(nseg->s_size, pgsz))); 589 return (0); 590 } 591 } 592 } 593 594 if (a->vp != NULL) { 595 VN_HOLD(a->vp); 596 if (a->type == MAP_SHARED) 597 lgrp_shm_policy_init(NULL, a->vp); 598 } 599 svd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 600 601 seg->s_ops = &segvn_ops; 602 seg->s_data = (void *)svd; 603 seg->s_szc = a->szc; 604 605 svd->vp = a->vp; 606 /* 607 * Anonymous mappings have no backing file so the offset is meaningless. 608 */ 609 svd->offset = a->vp ? (a->offset & PAGEMASK) : 0; 610 svd->prot = a->prot; 611 svd->maxprot = a->maxprot; 612 svd->pageprot = 0; 613 svd->type = a->type; 614 svd->vpage = NULL; 615 svd->cred = cred; 616 svd->advice = MADV_NORMAL; 617 svd->pageadvice = 0; 618 svd->flags = (ushort_t)a->flags; 619 svd->softlockcnt = 0; 620 if (a->szc != 0 && a->vp != NULL) { 621 segvn_setvnode_mpss(a->vp); 622 } 623 624 amp = a->amp; 625 if ((svd->amp = amp) == NULL) { 626 svd->anon_index = 0; 627 if (svd->type == MAP_SHARED) { 628 svd->swresv = 0; 629 /* 630 * Shared mappings to a vp need no other setup. 631 * If we have a shared mapping to an anon_map object 632 * which hasn't been allocated yet, allocate the 633 * struct now so that it will be properly shared 634 * by remembering the swap reservation there. 635 */ 636 if (a->vp == NULL) { 637 svd->amp = anonmap_alloc(seg->s_size, swresv); 638 svd->amp->a_szc = seg->s_szc; 639 } 640 } else { 641 /* 642 * Private mapping (with or without a vp). 643 * Allocate anon_map when needed. 644 */ 645 svd->swresv = swresv; 646 } 647 } else { 648 pgcnt_t anon_num; 649 650 /* 651 * Mapping to an existing anon_map structure without a vp. 652 * For now we will insure that the segment size isn't larger 653 * than the size - offset gives us. Later on we may wish to 654 * have the anon array dynamically allocated itself so that 655 * we don't always have to allocate all the anon pointer slots. 656 * This of course involves adding extra code to check that we 657 * aren't trying to use an anon pointer slot beyond the end 658 * of the currently allocated anon array. 659 */ 660 if ((amp->size - a->offset) < seg->s_size) { 661 panic("segvn_create anon_map size"); 662 /*NOTREACHED*/ 663 } 664 665 anon_num = btopr(a->offset); 666 667 if (a->type == MAP_SHARED) { 668 /* 669 * SHARED mapping to a given anon_map. 670 */ 671 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 672 amp->refcnt++; 673 if (a->szc > amp->a_szc) { 674 amp->a_szc = a->szc; 675 } 676 ANON_LOCK_EXIT(&->a_rwlock); 677 svd->anon_index = anon_num; 678 svd->swresv = 0; 679 } else { 680 /* 681 * PRIVATE mapping to a given anon_map. 682 * Make sure that all the needed anon 683 * structures are created (so that we will 684 * share the underlying pages if nothing 685 * is written by this mapping) and then 686 * duplicate the anon array as is done 687 * when a privately mapped segment is dup'ed. 
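			 *
			 * (Descriptive note: the loop below fills each
			 * missing slot with anon_zero(), preloads the
			 * translation with hat_memload() with PROT_WRITE
			 * removed, and then anon_dup() copies the slot
			 * pointers into this segment's private anon_map.)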
688 */ 689 struct anon *ap; 690 caddr_t addr; 691 caddr_t eaddr; 692 ulong_t anon_idx; 693 int hat_flag = HAT_LOAD; 694 695 if (svd->flags & MAP_TEXT) { 696 hat_flag |= HAT_LOAD_TEXT; 697 } 698 699 svd->amp = anonmap_alloc(seg->s_size, 0); 700 svd->amp->a_szc = seg->s_szc; 701 svd->anon_index = 0; 702 svd->swresv = swresv; 703 704 /* 705 * Prevent 2 threads from allocating anon 706 * slots simultaneously. 707 */ 708 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 709 eaddr = seg->s_base + seg->s_size; 710 711 for (anon_idx = anon_num, addr = seg->s_base; 712 addr < eaddr; addr += PAGESIZE, anon_idx++) { 713 page_t *pp; 714 715 if ((ap = anon_get_ptr(amp->ahp, 716 anon_idx)) != NULL) 717 continue; 718 719 /* 720 * Allocate the anon struct now. 721 * Might as well load up translation 722 * to the page while we're at it... 723 */ 724 pp = anon_zero(seg, addr, &ap, cred); 725 if (ap == NULL || pp == NULL) { 726 panic("segvn_create anon_zero"); 727 /*NOTREACHED*/ 728 } 729 730 /* 731 * Re-acquire the anon_map lock and 732 * initialize the anon array entry. 733 */ 734 ASSERT(anon_get_ptr(amp->ahp, 735 anon_idx) == NULL); 736 (void) anon_set_ptr(amp->ahp, anon_idx, ap, 737 ANON_SLEEP); 738 739 ASSERT(seg->s_szc == 0); 740 ASSERT(!IS_VMODSORT(pp->p_vnode)); 741 742 hat_memload(seg->s_as->a_hat, addr, pp, 743 svd->prot & ~PROT_WRITE, hat_flag); 744 745 page_unlock(pp); 746 } 747 ASSERT(seg->s_szc == 0); 748 anon_dup(amp->ahp, anon_num, svd->amp->ahp, 749 0, seg->s_size); 750 ANON_LOCK_EXIT(&->a_rwlock); 751 } 752 } 753 754 /* 755 * Set default memory allocation policy for segment 756 * 757 * Always set policy for private memory at least for initialization 758 * even if this is a shared memory segment 759 */ 760 (void) lgrp_privm_policy_set(mpolicy, &svd->policy_info, seg->s_size); 761 762 if (svd->type == MAP_SHARED) 763 (void) lgrp_shm_policy_set(mpolicy, svd->amp, svd->anon_index, 764 svd->vp, svd->offset, seg->s_size); 765 766 return (0); 767 } 768 769 /* 770 * Concatenate two existing segments, if possible. 771 * Return 0 on success, -1 if two segments are not compatible 772 * or -2 on memory allocation failure. 773 * If amp_cat == 1 then try and concat segments with anon maps 774 */ 775 static int 776 segvn_concat(struct seg *seg1, struct seg *seg2, int amp_cat) 777 { 778 struct segvn_data *svd1 = seg1->s_data; 779 struct segvn_data *svd2 = seg2->s_data; 780 struct anon_map *amp1 = svd1->amp; 781 struct anon_map *amp2 = svd2->amp; 782 struct vpage *vpage1 = svd1->vpage; 783 struct vpage *vpage2 = svd2->vpage, *nvpage = NULL; 784 size_t size, nvpsize; 785 pgcnt_t npages1, npages2; 786 787 ASSERT(seg1->s_as && seg2->s_as && seg1->s_as == seg2->s_as); 788 ASSERT(AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock)); 789 ASSERT(seg1->s_ops == seg2->s_ops); 790 791 /* both segments exist, try to merge them */ 792 #define incompat(x) (svd1->x != svd2->x) 793 if (incompat(vp) || incompat(maxprot) || 794 (!svd1->pageadvice && !svd2->pageadvice && incompat(advice)) || 795 (!svd1->pageprot && !svd2->pageprot && incompat(prot)) || 796 incompat(type) || incompat(cred) || incompat(flags) || 797 seg1->s_szc != seg2->s_szc || incompat(policy_info.mem_policy) || 798 (svd2->softlockcnt > 0)) 799 return (-1); 800 #undef incompat 801 802 /* 803 * vp == NULL implies zfod, offset doesn't matter 804 */ 805 if (svd1->vp != NULL && 806 svd1->offset + seg1->s_size != svd2->offset) { 807 return (-1); 808 } 809 810 /* 811 * Fail early if we're not supposed to concatenate 812 * segments with non NULL amp. 
	 */
	if (amp_cat == 0 && (amp1 != NULL || amp2 != NULL)) {
		return (-1);
	}

	if (svd1->vp == NULL && svd1->type == MAP_SHARED) {
		if (amp1 != amp2) {
			return (-1);
		}
		if (amp1 != NULL && svd1->anon_index + btop(seg1->s_size) !=
		    svd2->anon_index) {
			return (-1);
		}
		ASSERT(amp1 == NULL || amp1->refcnt >= 2);
	}

	/*
	 * If either seg has vpages, create a new merged vpage array.
	 */
	if (vpage1 != NULL || vpage2 != NULL) {
		struct vpage *vp;

		npages1 = seg_pages(seg1);
		npages2 = seg_pages(seg2);
		nvpsize = vpgtob(npages1 + npages2);

		if ((nvpage = kmem_zalloc(nvpsize, KM_NOSLEEP)) == NULL) {
			return (-2);
		}
		if (vpage1 != NULL) {
			bcopy(vpage1, nvpage, vpgtob(npages1));
		}
		if (vpage2 != NULL) {
			bcopy(vpage2, nvpage + npages1, vpgtob(npages2));
		}
		for (vp = nvpage; vp < nvpage + npages1; vp++) {
			if (svd2->pageprot && !svd1->pageprot) {
				VPP_SETPROT(vp, svd1->prot);
			}
			if (svd2->pageadvice && !svd1->pageadvice) {
				VPP_SETADVICE(vp, svd1->advice);
			}
		}
		for (vp = nvpage + npages1;
		    vp < nvpage + npages1 + npages2; vp++) {
			if (svd1->pageprot && !svd2->pageprot) {
				VPP_SETPROT(vp, svd2->prot);
			}
			if (svd1->pageadvice && !svd2->pageadvice) {
				VPP_SETADVICE(vp, svd2->advice);
			}
		}
	}

	/*
	 * If either segment has private pages, create a new merged anon
	 * array. If merging shared anon segments, just decrement the anon
	 * map's refcnt.
	 */
	if (amp1 != NULL && svd1->type == MAP_SHARED) {
		ASSERT(amp1 == amp2 && svd1->vp == NULL);
		ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
		ASSERT(amp1->refcnt >= 2);
		amp1->refcnt--;
		ANON_LOCK_EXIT(&amp1->a_rwlock);
		svd2->amp = NULL;
	} else if (amp1 != NULL || amp2 != NULL) {
		struct anon_hdr *nahp;
		struct anon_map *namp = NULL;
		size_t asize;

		ASSERT(svd1->type == MAP_PRIVATE);

		asize = seg1->s_size + seg2->s_size;
		if ((nahp = anon_create(btop(asize), ANON_NOSLEEP)) == NULL) {
			if (nvpage != NULL) {
				kmem_free(nvpage, nvpsize);
			}
			return (-2);
		}
		if (amp1 != NULL) {
			/*
			 * XXX anon rwlock is not really needed because
			 * this is a private segment and we are writers.
897 */ 898 ANON_LOCK_ENTER(&1->a_rwlock, RW_WRITER); 899 ASSERT(amp1->refcnt == 1); 900 if (anon_copy_ptr(amp1->ahp, svd1->anon_index, 901 nahp, 0, btop(seg1->s_size), ANON_NOSLEEP)) { 902 anon_release(nahp, btop(asize)); 903 ANON_LOCK_EXIT(&1->a_rwlock); 904 if (nvpage != NULL) { 905 kmem_free(nvpage, nvpsize); 906 } 907 return (-2); 908 } 909 } 910 if (amp2 != NULL) { 911 ANON_LOCK_ENTER(&2->a_rwlock, RW_WRITER); 912 ASSERT(amp2->refcnt == 1); 913 if (anon_copy_ptr(amp2->ahp, svd2->anon_index, 914 nahp, btop(seg1->s_size), btop(seg2->s_size), 915 ANON_NOSLEEP)) { 916 anon_release(nahp, btop(asize)); 917 ANON_LOCK_EXIT(&2->a_rwlock); 918 if (amp1 != NULL) { 919 ANON_LOCK_EXIT(&1->a_rwlock); 920 } 921 if (nvpage != NULL) { 922 kmem_free(nvpage, nvpsize); 923 } 924 return (-2); 925 } 926 } 927 if (amp1 != NULL) { 928 namp = amp1; 929 anon_release(amp1->ahp, btop(amp1->size)); 930 } 931 if (amp2 != NULL) { 932 if (namp == NULL) { 933 ASSERT(amp1 == NULL); 934 namp = amp2; 935 anon_release(amp2->ahp, btop(amp2->size)); 936 } else { 937 amp2->refcnt--; 938 ANON_LOCK_EXIT(&2->a_rwlock); 939 anonmap_free(amp2); 940 } 941 svd2->amp = NULL; /* needed for seg_free */ 942 } 943 namp->ahp = nahp; 944 namp->size = asize; 945 svd1->amp = namp; 946 svd1->anon_index = 0; 947 ANON_LOCK_EXIT(&namp->a_rwlock); 948 } 949 /* 950 * Now free the old vpage structures. 951 */ 952 if (nvpage != NULL) { 953 if (vpage1 != NULL) { 954 kmem_free(vpage1, vpgtob(npages1)); 955 } 956 if (vpage2 != NULL) { 957 svd2->vpage = NULL; 958 kmem_free(vpage2, vpgtob(npages2)); 959 } 960 if (svd2->pageprot) { 961 svd1->pageprot = 1; 962 } 963 if (svd2->pageadvice) { 964 svd1->pageadvice = 1; 965 } 966 svd1->vpage = nvpage; 967 } 968 969 /* all looks ok, merge segments */ 970 svd1->swresv += svd2->swresv; 971 svd2->swresv = 0; /* so seg_free doesn't release swap space */ 972 size = seg2->s_size; 973 seg_free(seg2); 974 seg1->s_size += size; 975 return (0); 976 } 977 978 /* 979 * Extend the previous segment (seg1) to include the 980 * new segment (seg2 + a), if possible. 981 * Return 0 on success. 982 */ 983 static int 984 segvn_extend_prev(seg1, seg2, a, swresv) 985 struct seg *seg1, *seg2; 986 struct segvn_crargs *a; 987 size_t swresv; 988 { 989 struct segvn_data *svd1 = (struct segvn_data *)seg1->s_data; 990 size_t size; 991 struct anon_map *amp1; 992 struct vpage *new_vpage; 993 994 /* 995 * We don't need any segment level locks for "segvn" data 996 * since the address space is "write" locked. 997 */ 998 ASSERT(seg1->s_as && AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock)); 999 1000 /* second segment is new, try to extend first */ 1001 /* XXX - should also check cred */ 1002 if (svd1->vp != a->vp || svd1->maxprot != a->maxprot || 1003 (!svd1->pageprot && (svd1->prot != a->prot)) || 1004 svd1->type != a->type || svd1->flags != a->flags || 1005 seg1->s_szc != a->szc) 1006 return (-1); 1007 1008 /* vp == NULL implies zfod, offset doesn't matter */ 1009 if (svd1->vp != NULL && 1010 svd1->offset + seg1->s_size != (a->offset & PAGEMASK)) 1011 return (-1); 1012 1013 amp1 = svd1->amp; 1014 if (amp1) { 1015 pgcnt_t newpgs; 1016 1017 /* 1018 * Segment has private pages, can data structures 1019 * be expanded? 1020 * 1021 * Acquire the anon_map lock to prevent it from changing, 1022 * if it is shared. This ensures that the anon_map 1023 * will not change while a thread which has a read/write 1024 * lock on an address space references it. 1025 * XXX - Don't need the anon_map lock at all if "refcnt" 1026 * is 1. 
1027 * 1028 * Can't grow a MAP_SHARED segment with an anonmap because 1029 * there may be existing anon slots where we want to extend 1030 * the segment and we wouldn't know what to do with them 1031 * (e.g., for tmpfs right thing is to just leave them there, 1032 * for /dev/zero they should be cleared out). 1033 */ 1034 if (svd1->type == MAP_SHARED) 1035 return (-1); 1036 1037 ANON_LOCK_ENTER(&1->a_rwlock, RW_WRITER); 1038 if (amp1->refcnt > 1) { 1039 ANON_LOCK_EXIT(&1->a_rwlock); 1040 return (-1); 1041 } 1042 newpgs = anon_grow(amp1->ahp, &svd1->anon_index, 1043 btop(seg1->s_size), btop(seg2->s_size), ANON_NOSLEEP); 1044 1045 if (newpgs == 0) { 1046 ANON_LOCK_EXIT(&1->a_rwlock); 1047 return (-1); 1048 } 1049 amp1->size = ptob(newpgs); 1050 ANON_LOCK_EXIT(&1->a_rwlock); 1051 } 1052 if (svd1->vpage != NULL) { 1053 new_vpage = 1054 kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)), 1055 KM_NOSLEEP); 1056 if (new_vpage == NULL) 1057 return (-1); 1058 bcopy(svd1->vpage, new_vpage, vpgtob(seg_pages(seg1))); 1059 kmem_free(svd1->vpage, vpgtob(seg_pages(seg1))); 1060 svd1->vpage = new_vpage; 1061 if (svd1->pageprot) { 1062 struct vpage *vp, *evp; 1063 1064 vp = new_vpage + seg_pages(seg1); 1065 evp = vp + seg_pages(seg2); 1066 for (; vp < evp; vp++) 1067 VPP_SETPROT(vp, a->prot); 1068 } 1069 } 1070 size = seg2->s_size; 1071 seg_free(seg2); 1072 seg1->s_size += size; 1073 svd1->swresv += swresv; 1074 return (0); 1075 } 1076 1077 /* 1078 * Extend the next segment (seg2) to include the 1079 * new segment (seg1 + a), if possible. 1080 * Return 0 on success. 1081 */ 1082 static int 1083 segvn_extend_next( 1084 struct seg *seg1, 1085 struct seg *seg2, 1086 struct segvn_crargs *a, 1087 size_t swresv) 1088 { 1089 struct segvn_data *svd2 = (struct segvn_data *)seg2->s_data; 1090 size_t size; 1091 struct anon_map *amp2; 1092 struct vpage *new_vpage; 1093 1094 /* 1095 * We don't need any segment level locks for "segvn" data 1096 * since the address space is "write" locked. 1097 */ 1098 ASSERT(seg2->s_as && AS_WRITE_HELD(seg2->s_as, &seg2->s_as->a_lock)); 1099 1100 /* first segment is new, try to extend second */ 1101 /* XXX - should also check cred */ 1102 if (svd2->vp != a->vp || svd2->maxprot != a->maxprot || 1103 (!svd2->pageprot && (svd2->prot != a->prot)) || 1104 svd2->type != a->type || svd2->flags != a->flags || 1105 seg2->s_szc != a->szc) 1106 return (-1); 1107 /* vp == NULL implies zfod, offset doesn't matter */ 1108 if (svd2->vp != NULL && 1109 (a->offset & PAGEMASK) + seg1->s_size != svd2->offset) 1110 return (-1); 1111 1112 amp2 = svd2->amp; 1113 if (amp2) { 1114 pgcnt_t newpgs; 1115 1116 /* 1117 * Segment has private pages, can data structures 1118 * be expanded? 1119 * 1120 * Acquire the anon_map lock to prevent it from changing, 1121 * if it is shared. This ensures that the anon_map 1122 * will not change while a thread which has a read/write 1123 * lock on an address space references it. 1124 * 1125 * XXX - Don't need the anon_map lock at all if "refcnt" 1126 * is 1. 
		 */
		if (svd2->type == MAP_SHARED)
			return (-1);

		ANON_LOCK_ENTER(&amp2->a_rwlock, RW_WRITER);
		if (amp2->refcnt > 1) {
			ANON_LOCK_EXIT(&amp2->a_rwlock);
			return (-1);
		}
		newpgs = anon_grow(amp2->ahp, &svd2->anon_index,
		    btop(seg2->s_size), btop(seg1->s_size),
		    ANON_NOSLEEP | ANON_GROWDOWN);

		if (newpgs == 0) {
			ANON_LOCK_EXIT(&amp2->a_rwlock);
			return (-1);
		}
		amp2->size = ptob(newpgs);
		ANON_LOCK_EXIT(&amp2->a_rwlock);
	}
	if (svd2->vpage != NULL) {
		new_vpage =
		    kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)),
		    KM_NOSLEEP);
		if (new_vpage == NULL) {
			/* Not merging segments so adjust anon_index back */
			if (amp2)
				svd2->anon_index += seg_pages(seg1);
			return (-1);
		}
		bcopy(svd2->vpage, new_vpage + seg_pages(seg1),
		    vpgtob(seg_pages(seg2)));
		kmem_free(svd2->vpage, vpgtob(seg_pages(seg2)));
		svd2->vpage = new_vpage;
		if (svd2->pageprot) {
			struct vpage *vp, *evp;

			vp = new_vpage;
			evp = vp + seg_pages(seg1);
			for (; vp < evp; vp++)
				VPP_SETPROT(vp, a->prot);
		}
	}
	size = seg1->s_size;
	seg_free(seg1);
	seg2->s_size += size;
	seg2->s_base -= size;
	svd2->offset -= size;
	svd2->swresv += swresv;
	return (0);
}

static int
segvn_dup(struct seg *seg, struct seg *newseg)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct segvn_data *newsvd;
	pgcnt_t npages = seg_pages(seg);
	int error = 0;
	uint_t prot;
	size_t len;

	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));

	/*
	 * If segment has anon reserved, reserve more for the new seg.
	 * For a MAP_NORESERVE segment swresv will be a count of all the
	 * allocated anon slots; thus we reserve for the child as many slots
	 * as the parent has allocated. This semantic prevents the child or
	 * parent from dying during a copy-on-write fault caused by trying
	 * to write a shared pre-existing anon page.
	 */
	if ((len = svd->swresv) != 0) {
		if (anon_resv(svd->swresv) == 0)
			return (ENOMEM);

		TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
		    seg, len, 0);
	}

	newsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP);

	newseg->s_ops = &segvn_ops;
	newseg->s_data = (void *)newsvd;
	newseg->s_szc = seg->s_szc;

	if ((newsvd->vp = svd->vp) != NULL) {
		VN_HOLD(svd->vp);
		if (svd->type == MAP_SHARED)
			lgrp_shm_policy_init(NULL, svd->vp);
	}
	newsvd->offset = svd->offset;
	newsvd->prot = svd->prot;
	newsvd->maxprot = svd->maxprot;
	newsvd->pageprot = svd->pageprot;
	newsvd->type = svd->type;
	newsvd->cred = svd->cred;
	crhold(newsvd->cred);
	newsvd->advice = svd->advice;
	newsvd->pageadvice = svd->pageadvice;
	newsvd->swresv = svd->swresv;
	newsvd->flags = svd->flags;
	newsvd->softlockcnt = 0;
	newsvd->policy_info = svd->policy_info;
	if ((newsvd->amp = svd->amp) == NULL) {
		/*
		 * Not attaching to a shared anon object.
		 */
		newsvd->anon_index = 0;
	} else {
		struct anon_map *amp;

		amp = svd->amp;
		if (svd->type == MAP_SHARED) {
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
			amp->refcnt++;
			ANON_LOCK_EXIT(&amp->a_rwlock);
			newsvd->anon_index = svd->anon_index;
		} else {
			int reclaim = 1;

			/*
			 * Allocate and initialize new anon_map structure.
1250 */ 1251 newsvd->amp = anonmap_alloc(newseg->s_size, 0); 1252 newsvd->amp->a_szc = newseg->s_szc; 1253 newsvd->anon_index = 0; 1254 1255 /* 1256 * We don't have to acquire the anon_map lock 1257 * for the new segment (since it belongs to an 1258 * address space that is still not associated 1259 * with any process), or the segment in the old 1260 * address space (since all threads in it 1261 * are stopped while duplicating the address space). 1262 */ 1263 1264 /* 1265 * The goal of the following code is to make sure that 1266 * softlocked pages do not end up as copy on write 1267 * pages. This would cause problems where one 1268 * thread writes to a page that is COW and a different 1269 * thread in the same process has softlocked it. The 1270 * softlock lock would move away from this process 1271 * because the write would cause this process to get 1272 * a copy (without the softlock). 1273 * 1274 * The strategy here is to just break the 1275 * sharing on pages that could possibly be 1276 * softlocked. 1277 */ 1278 retry: 1279 if (svd->softlockcnt) { 1280 struct anon *ap, *newap; 1281 size_t i; 1282 uint_t vpprot; 1283 page_t *anon_pl[1+1], *pp; 1284 caddr_t addr; 1285 ulong_t anon_idx = 0; 1286 1287 /* 1288 * The softlock count might be non zero 1289 * because some pages are still stuck in the 1290 * cache for lazy reclaim. Flush the cache 1291 * now. This should drop the count to zero. 1292 * [or there is really I/O going on to these 1293 * pages]. Note, we have the writers lock so 1294 * nothing gets inserted during the flush. 1295 */ 1296 if (reclaim == 1) { 1297 segvn_purge(seg); 1298 reclaim = 0; 1299 goto retry; 1300 } 1301 i = btopr(seg->s_size); 1302 addr = seg->s_base; 1303 /* 1304 * XXX break cow sharing using PAGESIZE 1305 * pages. They will be relocated into larger 1306 * pages at fault time. 1307 */ 1308 while (i-- > 0) { 1309 if (ap = anon_get_ptr(amp->ahp, 1310 anon_idx)) { 1311 error = anon_getpage(&ap, 1312 &vpprot, anon_pl, PAGESIZE, 1313 seg, addr, S_READ, 1314 svd->cred); 1315 if (error) { 1316 newsvd->vpage = NULL; 1317 goto out; 1318 } 1319 /* 1320 * prot need not be computed 1321 * below 'cause anon_private is 1322 * going to ignore it anyway 1323 * as child doesn't inherit 1324 * pagelock from parent. 1325 */ 1326 prot = svd->pageprot ? 1327 VPP_PROT( 1328 &svd->vpage[ 1329 seg_page(seg, addr)]) 1330 : svd->prot; 1331 pp = anon_private(&newap, 1332 newseg, addr, prot, 1333 anon_pl[0], 0, 1334 newsvd->cred); 1335 if (pp == NULL) { 1336 /* no mem abort */ 1337 newsvd->vpage = NULL; 1338 error = ENOMEM; 1339 goto out; 1340 } 1341 (void) anon_set_ptr( 1342 newsvd->amp->ahp, anon_idx, 1343 newap, ANON_SLEEP); 1344 page_unlock(pp); 1345 } 1346 addr += PAGESIZE; 1347 anon_idx++; 1348 } 1349 } else { /* common case */ 1350 if (seg->s_szc != 0) { 1351 /* 1352 * If at least one of anon slots of a 1353 * large page exists then make sure 1354 * all anon slots of a large page 1355 * exist to avoid partial cow sharing 1356 * of a large page in the future. 1357 */ 1358 anon_dup_fill_holes(amp->ahp, 1359 svd->anon_index, newsvd->amp->ahp, 1360 0, seg->s_size, seg->s_szc, 1361 svd->vp != NULL); 1362 } else { 1363 anon_dup(amp->ahp, svd->anon_index, 1364 newsvd->amp->ahp, 0, seg->s_size); 1365 } 1366 1367 hat_clrattr(seg->s_as->a_hat, seg->s_base, 1368 seg->s_size, PROT_WRITE); 1369 } 1370 } 1371 } 1372 /* 1373 * If necessary, create a vpage structure for the new segment. 1374 * Do not copy any page lock indications. 
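	 * (The child does not inherit page locks from the parent, which is
	 * why VPP_CLRPPLOCK() is applied to each vpage entry copied below.)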
1375 */ 1376 if (svd->vpage != NULL) { 1377 uint_t i; 1378 struct vpage *ovp = svd->vpage; 1379 struct vpage *nvp; 1380 1381 nvp = newsvd->vpage = 1382 kmem_alloc(vpgtob(npages), KM_SLEEP); 1383 for (i = 0; i < npages; i++) { 1384 *nvp = *ovp++; 1385 VPP_CLRPPLOCK(nvp++); 1386 } 1387 } else 1388 newsvd->vpage = NULL; 1389 1390 /* Inform the vnode of the new mapping */ 1391 if (newsvd->vp != NULL) { 1392 error = VOP_ADDMAP(newsvd->vp, (offset_t)newsvd->offset, 1393 newseg->s_as, newseg->s_base, newseg->s_size, newsvd->prot, 1394 newsvd->maxprot, newsvd->type, newsvd->cred); 1395 } 1396 out: 1397 return (error); 1398 } 1399 1400 1401 /* 1402 * callback function used by segvn_unmap to invoke free_vp_pages() for only 1403 * those pages actually processed by the HAT 1404 */ 1405 extern int free_pages; 1406 1407 static void 1408 segvn_hat_unload_callback(hat_callback_t *cb) 1409 { 1410 struct seg *seg = cb->hcb_data; 1411 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1412 size_t len; 1413 u_offset_t off; 1414 1415 ASSERT(svd->vp != NULL); 1416 ASSERT(cb->hcb_end_addr > cb->hcb_start_addr); 1417 ASSERT(cb->hcb_start_addr >= seg->s_base); 1418 1419 len = cb->hcb_end_addr - cb->hcb_start_addr; 1420 off = cb->hcb_start_addr - seg->s_base; 1421 free_vp_pages(svd->vp, svd->offset + off, len); 1422 } 1423 1424 1425 static int 1426 segvn_unmap(struct seg *seg, caddr_t addr, size_t len) 1427 { 1428 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1429 struct segvn_data *nsvd; 1430 struct seg *nseg; 1431 struct anon_map *amp; 1432 pgcnt_t opages; /* old segment size in pages */ 1433 pgcnt_t npages; /* new segment size in pages */ 1434 pgcnt_t dpages; /* pages being deleted (unmapped) */ 1435 hat_callback_t callback; /* used for free_vp_pages() */ 1436 hat_callback_t *cbp = NULL; 1437 caddr_t nbase; 1438 size_t nsize; 1439 size_t oswresv; 1440 int reclaim = 1; 1441 1442 /* 1443 * We don't need any segment level locks for "segvn" data 1444 * since the address space is "write" locked. 1445 */ 1446 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 1447 1448 /* 1449 * Fail the unmap if pages are SOFTLOCKed through this mapping. 1450 * softlockcnt is protected from change by the as write lock. 1451 */ 1452 retry: 1453 if (svd->softlockcnt > 0) { 1454 /* 1455 * since we do have the writers lock nobody can fill 1456 * the cache during the purge. The flush either succeeds 1457 * or we still have pending I/Os. 1458 */ 1459 if (reclaim == 1) { 1460 segvn_purge(seg); 1461 reclaim = 0; 1462 goto retry; 1463 } 1464 return (EAGAIN); 1465 } 1466 1467 /* 1468 * Check for bad sizes 1469 */ 1470 if (addr < seg->s_base || addr + len > seg->s_base + seg->s_size || 1471 (len & PAGEOFFSET) || ((uintptr_t)addr & PAGEOFFSET)) { 1472 panic("segvn_unmap"); 1473 /*NOTREACHED*/ 1474 } 1475 1476 if (seg->s_szc != 0) { 1477 size_t pgsz = page_get_pagesize(seg->s_szc); 1478 int err; 1479 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { 1480 ASSERT(seg->s_base != addr || seg->s_size != len); 1481 VM_STAT_ADD(segvnvmstats.demoterange[0]); 1482 err = segvn_demote_range(seg, addr, len, SDR_END, 0); 1483 if (err == 0) { 1484 return (IE_RETRY); 1485 } 1486 return (err); 1487 } 1488 } 1489 1490 /* Inform the vnode of the unmapping. 
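	 * (If VOP_DELMAP() returns EAGAIN the unmap is abandoned and EAGAIN
	 * is returned to the caller; other VOP_DELMAP() errors are ignored
	 * here.)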
*/ 1491 if (svd->vp) { 1492 int error; 1493 1494 error = VOP_DELMAP(svd->vp, 1495 (offset_t)svd->offset + (uintptr_t)(addr - seg->s_base), 1496 seg->s_as, addr, len, svd->prot, svd->maxprot, 1497 svd->type, svd->cred); 1498 1499 if (error == EAGAIN) 1500 return (error); 1501 } 1502 /* 1503 * Remove any page locks set through this mapping. 1504 */ 1505 (void) segvn_lockop(seg, addr, len, 0, MC_UNLOCK, NULL, 0); 1506 1507 /* 1508 * Unload any hardware translations in the range to be taken out. 1509 * Use a callback to invoke free_vp_pages() effectively. 1510 */ 1511 if (svd->vp != NULL && free_pages != 0) { 1512 callback.hcb_data = seg; 1513 callback.hcb_function = segvn_hat_unload_callback; 1514 cbp = &callback; 1515 } 1516 hat_unload_callback(seg->s_as->a_hat, addr, len, HAT_UNLOAD_UNMAP, cbp); 1517 1518 /* 1519 * Check for entire segment 1520 */ 1521 if (addr == seg->s_base && len == seg->s_size) { 1522 seg_free(seg); 1523 return (0); 1524 } 1525 1526 opages = seg_pages(seg); 1527 dpages = btop(len); 1528 npages = opages - dpages; 1529 amp = svd->amp; 1530 ASSERT(amp == NULL || amp->a_szc >= seg->s_szc); 1531 1532 /* 1533 * Check for beginning of segment 1534 */ 1535 if (addr == seg->s_base) { 1536 if (svd->vpage != NULL) { 1537 size_t nbytes; 1538 struct vpage *ovpage; 1539 1540 ovpage = svd->vpage; /* keep pointer to vpage */ 1541 1542 nbytes = vpgtob(npages); 1543 svd->vpage = kmem_alloc(nbytes, KM_SLEEP); 1544 bcopy(&ovpage[dpages], svd->vpage, nbytes); 1545 1546 /* free up old vpage */ 1547 kmem_free(ovpage, vpgtob(opages)); 1548 } 1549 if (amp != NULL) { 1550 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1551 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { 1552 /* 1553 * Free up now unused parts of anon_map array. 1554 */ 1555 if (amp->a_szc == seg->s_szc) { 1556 if (seg->s_szc != 0) { 1557 anon_free_pages(amp->ahp, 1558 svd->anon_index, len, 1559 seg->s_szc); 1560 } else { 1561 anon_free(amp->ahp, 1562 svd->anon_index, 1563 len); 1564 } 1565 } else { 1566 ASSERT(svd->type == MAP_SHARED); 1567 ASSERT(amp->a_szc > seg->s_szc); 1568 anon_shmap_free_pages(amp, 1569 svd->anon_index, len); 1570 } 1571 1572 /* 1573 * Unreserve swap space for the 1574 * unmapped chunk of this segment in 1575 * case it's MAP_SHARED 1576 */ 1577 if (svd->type == MAP_SHARED) { 1578 anon_unresv(len); 1579 amp->swresv -= len; 1580 } 1581 } 1582 ANON_LOCK_EXIT(&->a_rwlock); 1583 svd->anon_index += dpages; 1584 } 1585 if (svd->vp != NULL) 1586 svd->offset += len; 1587 1588 if (svd->swresv) { 1589 if (svd->flags & MAP_NORESERVE) { 1590 ASSERT(amp); 1591 oswresv = svd->swresv; 1592 1593 svd->swresv = ptob(anon_pages(amp->ahp, 1594 svd->anon_index, npages)); 1595 anon_unresv(oswresv - svd->swresv); 1596 } else { 1597 anon_unresv(len); 1598 svd->swresv -= len; 1599 } 1600 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 1601 seg, len, 0); 1602 } 1603 1604 seg->s_base += len; 1605 seg->s_size -= len; 1606 return (0); 1607 } 1608 1609 /* 1610 * Check for end of segment 1611 */ 1612 if (addr + len == seg->s_base + seg->s_size) { 1613 if (svd->vpage != NULL) { 1614 size_t nbytes; 1615 struct vpage *ovpage; 1616 1617 ovpage = svd->vpage; /* keep pointer to vpage */ 1618 1619 nbytes = vpgtob(npages); 1620 svd->vpage = kmem_alloc(nbytes, KM_SLEEP); 1621 bcopy(ovpage, svd->vpage, nbytes); 1622 1623 /* free up old vpage */ 1624 kmem_free(ovpage, vpgtob(opages)); 1625 1626 } 1627 if (amp != NULL) { 1628 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1629 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { 1630 /* 1631 * Free 
up now unused parts of anon_map array. 1632 */ 1633 ulong_t an_idx = svd->anon_index + npages; 1634 if (amp->a_szc == seg->s_szc) { 1635 if (seg->s_szc != 0) { 1636 anon_free_pages(amp->ahp, 1637 an_idx, len, 1638 seg->s_szc); 1639 } else { 1640 anon_free(amp->ahp, an_idx, 1641 len); 1642 } 1643 } else { 1644 ASSERT(svd->type == MAP_SHARED); 1645 ASSERT(amp->a_szc > seg->s_szc); 1646 anon_shmap_free_pages(amp, 1647 an_idx, len); 1648 } 1649 1650 /* 1651 * Unreserve swap space for the 1652 * unmapped chunk of this segment in 1653 * case it's MAP_SHARED 1654 */ 1655 if (svd->type == MAP_SHARED) { 1656 anon_unresv(len); 1657 amp->swresv -= len; 1658 } 1659 } 1660 ANON_LOCK_EXIT(&->a_rwlock); 1661 } 1662 1663 if (svd->swresv) { 1664 if (svd->flags & MAP_NORESERVE) { 1665 ASSERT(amp); 1666 oswresv = svd->swresv; 1667 svd->swresv = ptob(anon_pages(amp->ahp, 1668 svd->anon_index, npages)); 1669 anon_unresv(oswresv - svd->swresv); 1670 } else { 1671 anon_unresv(len); 1672 svd->swresv -= len; 1673 } 1674 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 1675 "anon proc:%p %lu %u", seg, len, 0); 1676 } 1677 1678 seg->s_size -= len; 1679 return (0); 1680 } 1681 1682 /* 1683 * The section to go is in the middle of the segment, 1684 * have to make it into two segments. nseg is made for 1685 * the high end while seg is cut down at the low end. 1686 */ 1687 nbase = addr + len; /* new seg base */ 1688 nsize = (seg->s_base + seg->s_size) - nbase; /* new seg size */ 1689 seg->s_size = addr - seg->s_base; /* shrink old seg */ 1690 nseg = seg_alloc(seg->s_as, nbase, nsize); 1691 if (nseg == NULL) { 1692 panic("segvn_unmap seg_alloc"); 1693 /*NOTREACHED*/ 1694 } 1695 nseg->s_ops = seg->s_ops; 1696 nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 1697 nseg->s_data = (void *)nsvd; 1698 nseg->s_szc = seg->s_szc; 1699 *nsvd = *svd; 1700 nsvd->offset = svd->offset + (uintptr_t)(nseg->s_base - seg->s_base); 1701 nsvd->swresv = 0; 1702 nsvd->softlockcnt = 0; 1703 1704 if (svd->vp != NULL) { 1705 VN_HOLD(nsvd->vp); 1706 if (nsvd->type == MAP_SHARED) 1707 lgrp_shm_policy_init(NULL, nsvd->vp); 1708 } 1709 crhold(svd->cred); 1710 1711 if (svd->vpage == NULL) { 1712 nsvd->vpage = NULL; 1713 } else { 1714 /* need to split vpage into two arrays */ 1715 size_t nbytes; 1716 struct vpage *ovpage; 1717 1718 ovpage = svd->vpage; /* keep pointer to vpage */ 1719 1720 npages = seg_pages(seg); /* seg has shrunk */ 1721 nbytes = vpgtob(npages); 1722 svd->vpage = kmem_alloc(nbytes, KM_SLEEP); 1723 1724 bcopy(ovpage, svd->vpage, nbytes); 1725 1726 npages = seg_pages(nseg); 1727 nbytes = vpgtob(npages); 1728 nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP); 1729 1730 bcopy(&ovpage[opages - npages], nsvd->vpage, nbytes); 1731 1732 /* free up old vpage */ 1733 kmem_free(ovpage, vpgtob(opages)); 1734 } 1735 1736 if (amp == NULL) { 1737 nsvd->amp = NULL; 1738 nsvd->anon_index = 0; 1739 } else { 1740 /* 1741 * Need to create a new anon map for the new segment. 1742 * We'll also allocate a new smaller array for the old 1743 * smaller segment to save space. 1744 */ 1745 opages = btop((uintptr_t)(addr - seg->s_base)); 1746 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1747 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { 1748 /* 1749 * Free up now unused parts of anon_map array. 
1750 */ 1751 ulong_t an_idx = svd->anon_index + opages; 1752 if (amp->a_szc == seg->s_szc) { 1753 if (seg->s_szc != 0) { 1754 anon_free_pages(amp->ahp, an_idx, len, 1755 seg->s_szc); 1756 } else { 1757 anon_free(amp->ahp, an_idx, 1758 len); 1759 } 1760 } else { 1761 ASSERT(svd->type == MAP_SHARED); 1762 ASSERT(amp->a_szc > seg->s_szc); 1763 anon_shmap_free_pages(amp, an_idx, len); 1764 } 1765 1766 /* 1767 * Unreserve swap space for the 1768 * unmapped chunk of this segment in 1769 * case it's MAP_SHARED 1770 */ 1771 if (svd->type == MAP_SHARED) { 1772 anon_unresv(len); 1773 amp->swresv -= len; 1774 } 1775 } 1776 nsvd->anon_index = svd->anon_index + 1777 btop((uintptr_t)(nseg->s_base - seg->s_base)); 1778 if (svd->type == MAP_SHARED) { 1779 amp->refcnt++; 1780 nsvd->amp = amp; 1781 } else { 1782 struct anon_map *namp; 1783 struct anon_hdr *nahp; 1784 1785 ASSERT(svd->type == MAP_PRIVATE); 1786 nahp = anon_create(btop(seg->s_size), ANON_SLEEP); 1787 namp = anonmap_alloc(nseg->s_size, 0); 1788 namp->a_szc = seg->s_szc; 1789 (void) anon_copy_ptr(amp->ahp, svd->anon_index, nahp, 1790 0, btop(seg->s_size), ANON_SLEEP); 1791 (void) anon_copy_ptr(amp->ahp, nsvd->anon_index, 1792 namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP); 1793 anon_release(amp->ahp, btop(amp->size)); 1794 svd->anon_index = 0; 1795 nsvd->anon_index = 0; 1796 amp->ahp = nahp; 1797 amp->size = seg->s_size; 1798 nsvd->amp = namp; 1799 } 1800 ANON_LOCK_EXIT(&->a_rwlock); 1801 } 1802 if (svd->swresv) { 1803 if (svd->flags & MAP_NORESERVE) { 1804 ASSERT(amp); 1805 oswresv = svd->swresv; 1806 svd->swresv = ptob(anon_pages(amp->ahp, 1807 svd->anon_index, btop(seg->s_size))); 1808 nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp, 1809 nsvd->anon_index, btop(nseg->s_size))); 1810 ASSERT(oswresv >= (svd->swresv + nsvd->swresv)); 1811 anon_unresv(oswresv - (svd->swresv + nsvd->swresv)); 1812 } else { 1813 if (seg->s_size + nseg->s_size + len != svd->swresv) { 1814 panic("segvn_unmap: " 1815 "cannot split swap reservation"); 1816 /*NOTREACHED*/ 1817 } 1818 anon_unresv(len); 1819 svd->swresv = seg->s_size; 1820 nsvd->swresv = nseg->s_size; 1821 } 1822 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 1823 seg, len, 0); 1824 } 1825 1826 return (0); /* I'm glad that's all over with! */ 1827 } 1828 1829 static void 1830 segvn_free(struct seg *seg) 1831 { 1832 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1833 pgcnt_t npages = seg_pages(seg); 1834 struct anon_map *amp; 1835 size_t len; 1836 1837 /* 1838 * We don't need any segment level locks for "segvn" data 1839 * since the address space is "write" locked. 1840 */ 1841 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 1842 1843 /* 1844 * Be sure to unlock pages. XXX Why do things get free'ed instead 1845 * of unmapped? XXX 1846 */ 1847 (void) segvn_lockop(seg, seg->s_base, seg->s_size, 1848 0, MC_UNLOCK, NULL, 0); 1849 1850 /* 1851 * Deallocate the vpage and anon pointers if necessary and possible. 1852 */ 1853 if (svd->vpage != NULL) { 1854 kmem_free(svd->vpage, vpgtob(npages)); 1855 svd->vpage = NULL; 1856 } 1857 if ((amp = svd->amp) != NULL) { 1858 /* 1859 * If there are no more references to this anon_map 1860 * structure, then deallocate the structure after freeing 1861 * up all the anon slot pointers that we can. 
		 */
		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
		ASSERT(amp->a_szc >= seg->s_szc);
		if (--amp->refcnt == 0) {
			if (svd->type == MAP_PRIVATE) {
				/*
				 * Private - we only need to anon_free
				 * the part that this segment refers to.
				 */
				if (seg->s_szc != 0) {
					anon_free_pages(amp->ahp,
					    svd->anon_index, seg->s_size,
					    seg->s_szc);
				} else {
					anon_free(amp->ahp, svd->anon_index,
					    seg->s_size);
				}
			} else {
				/*
				 * Shared - anon_free the entire
				 * anon_map's worth of stuff and
				 * release any swap reservation.
				 */
				if (amp->a_szc != 0) {
					anon_shmap_free_pages(amp, 0,
					    amp->size);
				} else {
					anon_free(amp->ahp, 0, amp->size);
				}
				if ((len = amp->swresv) != 0) {
					anon_unresv(len);
					TRACE_3(TR_FAC_VM, TR_ANON_PROC,
					    "anon proc:%p %lu %u",
					    seg, len, 0);
				}
			}
			svd->amp = NULL;
			ANON_LOCK_EXIT(&amp->a_rwlock);
			anonmap_free(amp);
		} else if (svd->type == MAP_PRIVATE) {
			/*
			 * We had a private mapping which still has
			 * a held anon_map so just free up all the
			 * anon slot pointers that we were using.
			 */
			if (seg->s_szc != 0) {
				anon_free_pages(amp->ahp, svd->anon_index,
				    seg->s_size, seg->s_szc);
			} else {
				anon_free(amp->ahp, svd->anon_index,
				    seg->s_size);
			}
			ANON_LOCK_EXIT(&amp->a_rwlock);
		} else {
			ANON_LOCK_EXIT(&amp->a_rwlock);
		}
	}

	/*
	 * Release swap reservation.
	 */
	if ((len = svd->swresv) != 0) {
		anon_unresv(svd->swresv);
		TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
		    seg, len, 0);
		svd->swresv = 0;
	}
	/*
	 * Release claim on vnode, credentials, and finally free the
	 * private data.
	 */
	if (svd->vp != NULL) {
		if (svd->type == MAP_SHARED)
			lgrp_shm_policy_fini(NULL, svd->vp);
		VN_RELE(svd->vp);
		svd->vp = NULL;
	}
	crfree(svd->cred);
	svd->cred = NULL;

	seg->s_data = NULL;
	kmem_cache_free(segvn_cache, svd);
}

ulong_t segvn_lpglck_limit = 0;
/*
 * Support routines used by segvn_pagelock() and softlock faults for anonymous
 * pages to implement availrmem accounting in a way that makes sure the
 * same memory is accounted just once for all softlock/pagelock purposes.
 * This prevents a bug where availrmem could quickly and incorrectly be
 * exhausted by several pagelocks against different parts of the same large
 * page, since each pagelock would otherwise have to decrement availrmem by
 * the size of the entire large page. Note that those pages are not COW
 * shared until softunlock/pageunlock, so we don't need to use COW style
 * accounting here. We also need to make sure the entire large page is
 * accounted for even if the softlock range covers less than the entire
 * large page, because large anon pages can't be demoted while any of their
 * constituent pages is locked. The caller invokes this routine for every
 * page_t it locks. The very first page in the range may not be the root
 * page of a large page. For all other pages it is guaranteed that we will
 * visit the root of a particular large page before any other constituent
 * page, as we are locking sequential pages belonging to the same anon map.
 * So we do all the locking when the root is encountered, except for the
 * very first page.
Since 1964 * softlocking is not supported (except S_READ_NOCOW special case) for vmpss 1965 * segments and since vnode pages can be demoted without locking all 1966 * constituent pages vnode pages don't come here. Unlocking relies on the 1967 * fact that pagesize can't change whenever any of constituent large pages is 1968 * locked at least SE_SHARED. This allows unlocking code to find the right 1969 * root and decrement availrmem by the same amount it was incremented when the 1970 * page was locked. 1971 */ 1972 static int 1973 segvn_pp_lock_anonpages(page_t *pp, int first) 1974 { 1975 pgcnt_t pages; 1976 pfn_t pfn; 1977 uchar_t szc = pp->p_szc; 1978 1979 ASSERT(PAGE_LOCKED(pp)); 1980 ASSERT(pp->p_vnode != NULL); 1981 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 1982 1983 /* 1984 * pagesize won't change as long as any constituent page is locked. 1985 */ 1986 pages = page_get_pagecnt(pp->p_szc); 1987 pfn = page_pptonum(pp); 1988 1989 if (!first) { 1990 if (!IS_P2ALIGNED(pfn, pages)) { 1991 #ifdef DEBUG 1992 pp = &pp[-(spgcnt_t)(pfn & (pages - 1))]; 1993 pfn = page_pptonum(pp); 1994 ASSERT(IS_P2ALIGNED(pfn, pages)); 1995 ASSERT(pp->p_szc == szc); 1996 ASSERT(pp->p_vnode != NULL); 1997 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 1998 ASSERT(pp->p_slckcnt != 0); 1999 #endif /* DEBUG */ 2000 return (1); 2001 } 2002 } else if (!IS_P2ALIGNED(pfn, pages)) { 2003 pp = &pp[-(spgcnt_t)(pfn & (pages - 1))]; 2004 #ifdef DEBUG 2005 pfn = page_pptonum(pp); 2006 ASSERT(IS_P2ALIGNED(pfn, pages)); 2007 ASSERT(pp->p_szc == szc); 2008 ASSERT(pp->p_vnode != NULL); 2009 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 2010 #endif /* DEBUG */ 2011 } 2012 2013 /* 2014 * pp is a root page. 2015 * We haven't locked this large page yet. 2016 */ 2017 page_struct_lock(pp); 2018 if (pp->p_slckcnt != 0) { 2019 if (pp->p_slckcnt < PAGE_SLOCK_MAXIMUM) { 2020 pp->p_slckcnt++; 2021 page_struct_unlock(pp); 2022 return (1); 2023 } 2024 page_struct_unlock(pp); 2025 segvn_lpglck_limit++; 2026 return (0); 2027 } 2028 mutex_enter(&freemem_lock); 2029 if (availrmem < tune.t_minarmem + pages) { 2030 mutex_exit(&freemem_lock); 2031 page_struct_unlock(pp); 2032 return (0); 2033 } 2034 pp->p_slckcnt++; 2035 availrmem -= pages; 2036 mutex_exit(&freemem_lock); 2037 page_struct_unlock(pp); 2038 return (1); 2039 } 2040 2041 static void 2042 segvn_pp_unlock_anonpages(page_t *pp, int first) 2043 { 2044 pgcnt_t pages; 2045 pfn_t pfn; 2046 2047 ASSERT(PAGE_LOCKED(pp)); 2048 ASSERT(pp->p_vnode != NULL); 2049 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 2050 2051 /* 2052 * pagesize won't change as long as any constituent page is locked. 2053 */ 2054 pages = page_get_pagecnt(pp->p_szc); 2055 pfn = page_pptonum(pp); 2056 2057 if (!first) { 2058 if (!IS_P2ALIGNED(pfn, pages)) { 2059 return; 2060 } 2061 } else if (!IS_P2ALIGNED(pfn, pages)) { 2062 pp = &pp[-(spgcnt_t)(pfn & (pages - 1))]; 2063 #ifdef DEBUG 2064 pfn = page_pptonum(pp); 2065 ASSERT(IS_P2ALIGNED(pfn, pages)); 2066 #endif /* DEBUG */ 2067 } 2068 ASSERT(pp->p_vnode != NULL); 2069 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 2070 ASSERT(pp->p_slckcnt != 0); 2071 page_struct_lock(pp); 2072 if (--pp->p_slckcnt == 0) { 2073 mutex_enter(&freemem_lock); 2074 availrmem += pages; 2075 mutex_exit(&freemem_lock); 2076 } 2077 page_struct_unlock(pp); 2078 } 2079 2080 /* 2081 * Do a F_SOFTUNLOCK call over the range requested. The range must have 2082 * already been F_SOFTLOCK'ed. 2083 * Caller must always match addr and len of a softunlock with a previous 2084 * softlock with exactly the same addr and len. 
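 */

/*
 * Illustrative sketch only; nothing in the driver calls it and the name
 * is hypothetical. It spells out the root-page arithmetic that
 * segvn_pp_lock_anonpages() and segvn_pp_unlock_anonpages() above rely
 * on: the constituent page count of a large page is a power of two, so
 * masking the low bits of any constituent pfn yields the root pfn
 * (equivalent to P2ALIGN(pfn, pages)), which is what lets availrmem be
 * charged and released exactly once per large page.
 */
static pfn_t
segvn_example_root_pfn(pfn_t pfn, pgcnt_t pages)
{
	/* pages is a power of two; clear the low log2(pages) bits */
	return (pfn & ~((pfn_t)pages - 1));
}

/*
 * segvn_softunlock() below undoes the F_SOFTLOCK accounting one base
 * page at a time over the range described above.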
2085 */ 2086 static void 2087 segvn_softunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw) 2088 { 2089 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 2090 page_t *pp; 2091 caddr_t adr; 2092 struct vnode *vp; 2093 u_offset_t offset; 2094 ulong_t anon_index; 2095 struct anon_map *amp; 2096 struct anon *ap = NULL; 2097 2098 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 2099 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 2100 2101 if ((amp = svd->amp) != NULL) 2102 anon_index = svd->anon_index + seg_page(seg, addr); 2103 2104 hat_unlock(seg->s_as->a_hat, addr, len); 2105 for (adr = addr; adr < addr + len; adr += PAGESIZE) { 2106 if (amp != NULL) { 2107 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 2108 if ((ap = anon_get_ptr(amp->ahp, anon_index++)) 2109 != NULL) { 2110 swap_xlate(ap, &vp, &offset); 2111 } else { 2112 vp = svd->vp; 2113 offset = svd->offset + 2114 (uintptr_t)(adr - seg->s_base); 2115 } 2116 ANON_LOCK_EXIT(&->a_rwlock); 2117 } else { 2118 vp = svd->vp; 2119 offset = svd->offset + 2120 (uintptr_t)(adr - seg->s_base); 2121 } 2122 2123 /* 2124 * Use page_find() instead of page_lookup() to 2125 * find the page since we know that it is locked. 2126 */ 2127 pp = page_find(vp, offset); 2128 if (pp == NULL) { 2129 panic( 2130 "segvn_softunlock: addr %p, ap %p, vp %p, off %llx", 2131 (void *)adr, (void *)ap, (void *)vp, offset); 2132 /*NOTREACHED*/ 2133 } 2134 2135 if (rw == S_WRITE) { 2136 hat_setrefmod(pp); 2137 if (seg->s_as->a_vbits) 2138 hat_setstat(seg->s_as, adr, PAGESIZE, 2139 P_REF | P_MOD); 2140 } else if (rw != S_OTHER) { 2141 hat_setref(pp); 2142 if (seg->s_as->a_vbits) 2143 hat_setstat(seg->s_as, adr, PAGESIZE, P_REF); 2144 } 2145 TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT, 2146 "segvn_fault:pp %p vp %p offset %llx", pp, vp, offset); 2147 if (svd->vp == NULL) { 2148 segvn_pp_unlock_anonpages(pp, adr == addr); 2149 } 2150 page_unlock(pp); 2151 } 2152 mutex_enter(&freemem_lock); /* for availrmem */ 2153 if (svd->vp != NULL) { 2154 availrmem += btop(len); 2155 } 2156 segvn_pages_locked -= btop(len); 2157 svd->softlockcnt -= btop(len); 2158 mutex_exit(&freemem_lock); 2159 if (svd->softlockcnt == 0) { 2160 /* 2161 * All SOFTLOCKS are gone. Wakeup any waiting 2162 * unmappers so they can try again to unmap. 2163 * Check for waiters first without the mutex 2164 * held so we don't always grab the mutex on 2165 * softunlocks. 2166 */ 2167 if (AS_ISUNMAPWAIT(seg->s_as)) { 2168 mutex_enter(&seg->s_as->a_contents); 2169 if (AS_ISUNMAPWAIT(seg->s_as)) { 2170 AS_CLRUNMAPWAIT(seg->s_as); 2171 cv_broadcast(&seg->s_as->a_cv); 2172 } 2173 mutex_exit(&seg->s_as->a_contents); 2174 } 2175 } 2176 } 2177 2178 #define PAGE_HANDLED ((page_t *)-1) 2179 2180 /* 2181 * Release all the pages in the NULL terminated ppp list 2182 * which haven't already been converted to PAGE_HANDLED. 2183 */ 2184 static void 2185 segvn_pagelist_rele(page_t **ppp) 2186 { 2187 for (; *ppp != NULL; ppp++) { 2188 if (*ppp != PAGE_HANDLED) 2189 page_unlock(*ppp); 2190 } 2191 } 2192 2193 static int stealcow = 1; 2194 2195 /* 2196 * Workaround for viking chip bug. See bug id 1220902. 2197 * To fix this down in pagefault() would require importing so 2198 * much as and segvn code as to be unmaintainable. 2199 */ 2200 int enable_mbit_wa = 0; 2201 2202 /* 2203 * Handles all the dirty work of getting the right 2204 * anonymous pages and loading up the translations. 2205 * This routine is called only from segvn_fault() 2206 * when looping over the range of addresses requested. 
2207 * 2208 * The basic algorithm here is: 2209 * If this is an anon_zero case 2210 * Call anon_zero to allocate page 2211 * Load up translation 2212 * Return 2213 * endif 2214 * If this is an anon page 2215 * Use anon_getpage to get the page 2216 * else 2217 * Find page in pl[] list passed in 2218 * endif 2219 * If not a cow 2220 * Load up the translation to the page 2221 * return 2222 * endif 2223 * Call anon_private to handle cow 2224 * Load up (writable) translation to new page 2225 */ 2226 static faultcode_t 2227 segvn_faultpage( 2228 struct hat *hat, /* the hat to use for mapping */ 2229 struct seg *seg, /* seg_vn of interest */ 2230 caddr_t addr, /* address in as */ 2231 u_offset_t off, /* offset in vp */ 2232 struct vpage *vpage, /* pointer to vpage for vp, off */ 2233 page_t *pl[], /* object source page pointer */ 2234 uint_t vpprot, /* access allowed to object pages */ 2235 enum fault_type type, /* type of fault */ 2236 enum seg_rw rw, /* type of access at fault */ 2237 int brkcow, /* we may need to break cow */ 2238 int first) /* first page for this fault if 1 */ 2239 { 2240 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 2241 page_t *pp, **ppp; 2242 uint_t pageflags = 0; 2243 page_t *anon_pl[1 + 1]; 2244 page_t *opp = NULL; /* original page */ 2245 uint_t prot; 2246 int err; 2247 int cow; 2248 int claim; 2249 int steal = 0; 2250 ulong_t anon_index; 2251 struct anon *ap, *oldap; 2252 struct anon_map *amp; 2253 int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD; 2254 int anon_lock = 0; 2255 anon_sync_obj_t cookie; 2256 2257 if (svd->flags & MAP_TEXT) { 2258 hat_flag |= HAT_LOAD_TEXT; 2259 } 2260 2261 ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock)); 2262 ASSERT(seg->s_szc == 0); 2263 2264 /* 2265 * Initialize protection value for this page. 2266 * If we have per page protection values check it now. 2267 */ 2268 if (svd->pageprot) { 2269 uint_t protchk; 2270 2271 switch (rw) { 2272 case S_READ: 2273 protchk = PROT_READ; 2274 break; 2275 case S_WRITE: 2276 protchk = PROT_WRITE; 2277 break; 2278 case S_EXEC: 2279 protchk = PROT_EXEC; 2280 break; 2281 case S_OTHER: 2282 default: 2283 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 2284 break; 2285 } 2286 2287 prot = VPP_PROT(vpage); 2288 if ((prot & protchk) == 0) 2289 return (FC_PROT); /* illegal access type */ 2290 } else { 2291 prot = svd->prot; 2292 } 2293 2294 if (type == F_SOFTLOCK && svd->vp != NULL) { 2295 mutex_enter(&freemem_lock); 2296 if (availrmem <= tune.t_minarmem) { 2297 mutex_exit(&freemem_lock); 2298 return (FC_MAKE_ERR(ENOMEM)); /* out of real memory */ 2299 } else { 2300 availrmem--; 2301 svd->softlockcnt++; 2302 segvn_pages_locked++; 2303 } 2304 mutex_exit(&freemem_lock); 2305 } 2306 2307 /* 2308 * Always acquire the anon array lock to prevent 2 threads from 2309 * allocating separate anon slots for the same "addr". 2310 */ 2311 2312 if ((amp = svd->amp) != NULL) { 2313 ASSERT(RW_READ_HELD(&->a_rwlock)); 2314 anon_index = svd->anon_index + seg_page(seg, addr); 2315 anon_array_enter(amp, anon_index, &cookie); 2316 anon_lock = 1; 2317 } 2318 2319 if (svd->vp == NULL && amp != NULL) { 2320 if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) { 2321 /* 2322 * Allocate a (normally) writable anonymous page of 2323 * zeroes. If no advance reservations, reserve now. 
2324 */ 2325 if (svd->flags & MAP_NORESERVE) { 2326 if (anon_resv_zone(ptob(1), 2327 seg->s_as->a_proc->p_zone)) { 2328 atomic_add_long(&svd->swresv, ptob(1)); 2329 } else { 2330 err = ENOMEM; 2331 goto out; 2332 } 2333 } 2334 if ((pp = anon_zero(seg, addr, &ap, 2335 svd->cred)) == NULL) { 2336 err = ENOMEM; 2337 goto out; /* out of swap space */ 2338 } 2339 /* 2340 * Re-acquire the anon_map lock and 2341 * initialize the anon array entry. 2342 */ 2343 (void) anon_set_ptr(amp->ahp, anon_index, ap, 2344 ANON_SLEEP); 2345 2346 ASSERT(pp->p_szc == 0); 2347 2348 /* 2349 * Handle pages that have been marked for migration 2350 */ 2351 if (lgrp_optimizations()) 2352 page_migrate(seg, addr, &pp, 1); 2353 2354 if (type == F_SOFTLOCK) { 2355 if (!segvn_pp_lock_anonpages(pp, first)) { 2356 page_unlock(pp); 2357 err = ENOMEM; 2358 goto out; 2359 } else { 2360 mutex_enter(&freemem_lock); 2361 svd->softlockcnt++; 2362 segvn_pages_locked++; 2363 mutex_exit(&freemem_lock); 2364 } 2365 } 2366 2367 if (enable_mbit_wa) { 2368 if (rw == S_WRITE) 2369 hat_setmod(pp); 2370 else if (!hat_ismod(pp)) 2371 prot &= ~PROT_WRITE; 2372 } 2373 /* 2374 * If AS_PAGLCK is set in a_flags (via memcntl(2) 2375 * with MC_LOCKAS, MCL_FUTURE) and this is a 2376 * MAP_NORESERVE segment, we may need to 2377 * permanently lock the page as it is being faulted 2378 * for the first time. The following text applies 2379 * only to MAP_NORESERVE segments: 2380 * 2381 * As per memcntl(2), if this segment was created 2382 * after MCL_FUTURE was applied (a "future" 2383 * segment), its pages must be locked. If this 2384 * segment existed at MCL_FUTURE application (a 2385 * "past" segment), the interface is unclear. 2386 * 2387 * We decide to lock only if vpage is present: 2388 * 2389 * - "future" segments will have a vpage array (see 2390 * as_map), and so will be locked as required 2391 * 2392 * - "past" segments may not have a vpage array, 2393 * depending on whether events (such as 2394 * mprotect) have occurred. Locking if vpage 2395 * exists will preserve legacy behavior. Not 2396 * locking if vpage is absent, will not break 2397 * the interface or legacy behavior. Note that 2398 * allocating vpage here if it's absent requires 2399 * upgrading the segvn reader lock, the cost of 2400 * which does not seem worthwhile. 2401 * 2402 * Usually testing and setting VPP_ISPPLOCK and 2403 * VPP_SETPPLOCK requires holding the segvn lock as 2404 * writer, but in this case all readers are 2405 * serializing on the anon array lock. 2406 */ 2407 if (AS_ISPGLCK(seg->s_as) && vpage != NULL && 2408 (svd->flags & MAP_NORESERVE) && 2409 !VPP_ISPPLOCK(vpage)) { 2410 proc_t *p = seg->s_as->a_proc; 2411 ASSERT(svd->type == MAP_PRIVATE); 2412 mutex_enter(&p->p_lock); 2413 if (rctl_incr_locked_mem(p, NULL, PAGESIZE, 2414 1) == 0) { 2415 claim = VPP_PROT(vpage) & PROT_WRITE; 2416 if (page_pp_lock(pp, claim, 0)) { 2417 VPP_SETPPLOCK(vpage); 2418 } else { 2419 rctl_decr_locked_mem(p, NULL, 2420 PAGESIZE, 1); 2421 } 2422 } 2423 mutex_exit(&p->p_lock); 2424 } 2425 2426 hat_memload(hat, addr, pp, prot, hat_flag); 2427 2428 if (!(hat_flag & HAT_LOAD_LOCK)) 2429 page_unlock(pp); 2430 2431 anon_array_exit(&cookie); 2432 return (0); 2433 } 2434 } 2435 2436 /* 2437 * Obtain the page structure via anon_getpage() if it is 2438 * a private copy of an object (the result of a previous 2439 * copy-on-write). 
2440 */ 2441 if (amp != NULL) { 2442 if ((ap = anon_get_ptr(amp->ahp, anon_index)) != NULL) { 2443 err = anon_getpage(&ap, &vpprot, anon_pl, PAGESIZE, 2444 seg, addr, rw, svd->cred); 2445 if (err) 2446 goto out; 2447 2448 if (svd->type == MAP_SHARED) { 2449 /* 2450 * If this is a shared mapping to an 2451 * anon_map, then ignore the write 2452 * permissions returned by anon_getpage(). 2453 * They apply to the private mappings 2454 * of this anon_map. 2455 */ 2456 vpprot |= PROT_WRITE; 2457 } 2458 opp = anon_pl[0]; 2459 } 2460 } 2461 2462 /* 2463 * Search the pl[] list passed in if it is from the 2464 * original object (i.e., not a private copy). 2465 */ 2466 if (opp == NULL) { 2467 /* 2468 * Find original page. We must be bringing it in 2469 * from the list in pl[]. 2470 */ 2471 for (ppp = pl; (opp = *ppp) != NULL; ppp++) { 2472 if (opp == PAGE_HANDLED) 2473 continue; 2474 ASSERT(opp->p_vnode == svd->vp); /* XXX */ 2475 if (opp->p_offset == off) 2476 break; 2477 } 2478 if (opp == NULL) { 2479 panic("segvn_faultpage not found"); 2480 /*NOTREACHED*/ 2481 } 2482 *ppp = PAGE_HANDLED; 2483 2484 } 2485 2486 ASSERT(PAGE_LOCKED(opp)); 2487 2488 TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT, 2489 "segvn_fault:pp %p vp %p offset %llx", 2490 opp, NULL, 0); 2491 2492 /* 2493 * The fault is treated as a copy-on-write fault if a 2494 * write occurs on a private segment and the object 2495 * page (i.e., mapping) is write protected. We assume 2496 * that fatal protection checks have already been made. 2497 */ 2498 2499 cow = brkcow && ((vpprot & PROT_WRITE) == 0); 2500 2501 /* 2502 * If not a copy-on-write case load the translation 2503 * and return. 2504 */ 2505 if (cow == 0) { 2506 2507 /* 2508 * Handle pages that have been marked for migration 2509 */ 2510 if (lgrp_optimizations()) 2511 page_migrate(seg, addr, &opp, 1); 2512 2513 if (type == F_SOFTLOCK && svd->vp == NULL) { 2514 2515 ASSERT(opp->p_szc == 0 || 2516 (svd->type == MAP_SHARED && 2517 amp != NULL && amp->a_szc != 0)); 2518 2519 if (!segvn_pp_lock_anonpages(opp, first)) { 2520 page_unlock(opp); 2521 err = ENOMEM; 2522 goto out; 2523 } else { 2524 mutex_enter(&freemem_lock); 2525 svd->softlockcnt++; 2526 segvn_pages_locked++; 2527 mutex_exit(&freemem_lock); 2528 } 2529 } 2530 if (IS_VMODSORT(opp->p_vnode) || enable_mbit_wa) { 2531 if (rw == S_WRITE) 2532 hat_setmod(opp); 2533 else if (rw != S_OTHER && !hat_ismod(opp)) 2534 prot &= ~PROT_WRITE; 2535 } 2536 2537 hat_memload(hat, addr, opp, prot & vpprot, hat_flag); 2538 2539 if (!(hat_flag & HAT_LOAD_LOCK)) 2540 page_unlock(opp); 2541 2542 if (anon_lock) { 2543 anon_array_exit(&cookie); 2544 } 2545 return (0); 2546 } 2547 2548 hat_setref(opp); 2549 2550 ASSERT(amp != NULL && anon_lock); 2551 2552 /* 2553 * Steal the page only if it isn't a private page 2554 * since stealing a private page is not worth the effort. 2555 */ 2556 if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) 2557 steal = 1; 2558 2559 /* 2560 * Steal the original page if the following conditions are true: 2561 * 2562 * We are low on memory, the page is not private, page is not large, 2563 * not shared, not modified, not `locked' or if we have it `locked' 2564 * (i.e., p_cowcnt == 1 and p_lckcnt == 0, which also implies 2565 * that the page is not shared) and if it doesn't have any 2566 * translations. page_struct_lock isn't needed to look at p_cowcnt 2567 * and p_lckcnt because we first get exclusive lock on page. 
2568 */ 2569 (void) hat_pagesync(opp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD); 2570 2571 if (stealcow && freemem < minfree && steal && opp->p_szc == 0 && 2572 page_tryupgrade(opp) && !hat_ismod(opp) && 2573 ((opp->p_lckcnt == 0 && opp->p_cowcnt == 0) || 2574 (opp->p_lckcnt == 0 && opp->p_cowcnt == 1 && 2575 vpage != NULL && VPP_ISPPLOCK(vpage)))) { 2576 /* 2577 * Check if this page has other translations 2578 * after unloading our translation. 2579 */ 2580 if (hat_page_is_mapped(opp)) { 2581 hat_unload(seg->s_as->a_hat, addr, PAGESIZE, 2582 HAT_UNLOAD); 2583 } 2584 2585 /* 2586 * hat_unload() might sync back someone else's recent 2587 * modification, so check again. 2588 */ 2589 if (!hat_ismod(opp) && !hat_page_is_mapped(opp)) 2590 pageflags |= STEAL_PAGE; 2591 } 2592 2593 /* 2594 * If we have a vpage pointer, see if it indicates that we have 2595 * ``locked'' the page we map -- if so, tell anon_private to 2596 * transfer the locking resource to the new page. 2597 * 2598 * See Statement at the beginning of segvn_lockop regarding 2599 * the way lockcnts/cowcnts are handled during COW. 2600 * 2601 */ 2602 if (vpage != NULL && VPP_ISPPLOCK(vpage)) 2603 pageflags |= LOCK_PAGE; 2604 2605 /* 2606 * Allocate a private page and perform the copy. 2607 * For MAP_NORESERVE reserve swap space now, unless this 2608 * is a cow fault on an existing anon page in which case 2609 * MAP_NORESERVE will have made advance reservations. 2610 */ 2611 if ((svd->flags & MAP_NORESERVE) && (ap == NULL)) { 2612 if (anon_resv_zone(ptob(1), seg->s_as->a_proc->p_zone)) { 2613 atomic_add_long(&svd->swresv, ptob(1)); 2614 } else { 2615 page_unlock(opp); 2616 err = ENOMEM; 2617 goto out; 2618 } 2619 } 2620 oldap = ap; 2621 pp = anon_private(&ap, seg, addr, prot, opp, pageflags, svd->cred); 2622 if (pp == NULL) { 2623 err = ENOMEM; /* out of swap space */ 2624 goto out; 2625 } 2626 2627 /* 2628 * If we copied away from an anonymous page, then 2629 * we are one step closer to freeing up an anon slot. 2630 * 2631 * NOTE: The original anon slot must be released while 2632 * holding the "anon_map" lock. This is necessary to prevent 2633 * other threads from obtaining a pointer to the anon slot 2634 * which may be freed if its "refcnt" is 1. 
2635 */ 2636 if (oldap != NULL) 2637 anon_decref(oldap); 2638 2639 (void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP); 2640 2641 /* 2642 * Handle pages that have been marked for migration 2643 */ 2644 if (lgrp_optimizations()) 2645 page_migrate(seg, addr, &pp, 1); 2646 2647 ASSERT(pp->p_szc == 0); 2648 if (type == F_SOFTLOCK && svd->vp == NULL) { 2649 if (!segvn_pp_lock_anonpages(pp, first)) { 2650 page_unlock(pp); 2651 err = ENOMEM; 2652 goto out; 2653 } else { 2654 mutex_enter(&freemem_lock); 2655 svd->softlockcnt++; 2656 segvn_pages_locked++; 2657 mutex_exit(&freemem_lock); 2658 } 2659 } 2660 2661 ASSERT(!IS_VMODSORT(pp->p_vnode)); 2662 if (enable_mbit_wa) { 2663 if (rw == S_WRITE) 2664 hat_setmod(pp); 2665 else if (!hat_ismod(pp)) 2666 prot &= ~PROT_WRITE; 2667 } 2668 2669 hat_memload(hat, addr, pp, prot, hat_flag); 2670 2671 if (!(hat_flag & HAT_LOAD_LOCK)) 2672 page_unlock(pp); 2673 2674 ASSERT(anon_lock); 2675 anon_array_exit(&cookie); 2676 return (0); 2677 out: 2678 if (anon_lock) 2679 anon_array_exit(&cookie); 2680 2681 if (type == F_SOFTLOCK && svd->vp != NULL) { 2682 mutex_enter(&freemem_lock); 2683 availrmem++; 2684 segvn_pages_locked--; 2685 svd->softlockcnt--; 2686 mutex_exit(&freemem_lock); 2687 } 2688 return (FC_MAKE_ERR(err)); 2689 } 2690 2691 /* 2692 * relocate a bunch of smaller targ pages into one large repl page. all targ 2693 * pages must be complete pages smaller than replacement pages. 2694 * it's assumed that no page's szc can change since they are all PAGESIZE or 2695 * complete large pages locked SHARED. 2696 */ 2697 static void 2698 segvn_relocate_pages(page_t **targ, page_t *replacement) 2699 { 2700 page_t *pp; 2701 pgcnt_t repl_npgs, curnpgs; 2702 pgcnt_t i; 2703 uint_t repl_szc = replacement->p_szc; 2704 page_t *first_repl = replacement; 2705 page_t *repl; 2706 spgcnt_t npgs; 2707 2708 VM_STAT_ADD(segvnvmstats.relocatepages[0]); 2709 2710 ASSERT(repl_szc != 0); 2711 npgs = repl_npgs = page_get_pagecnt(repl_szc); 2712 2713 i = 0; 2714 while (repl_npgs) { 2715 spgcnt_t nreloc; 2716 int err; 2717 ASSERT(replacement != NULL); 2718 pp = targ[i]; 2719 ASSERT(pp->p_szc < repl_szc); 2720 ASSERT(PAGE_EXCL(pp)); 2721 ASSERT(!PP_ISFREE(pp)); 2722 curnpgs = page_get_pagecnt(pp->p_szc); 2723 if (curnpgs == 1) { 2724 VM_STAT_ADD(segvnvmstats.relocatepages[1]); 2725 repl = replacement; 2726 page_sub(&replacement, repl); 2727 ASSERT(PAGE_EXCL(repl)); 2728 ASSERT(!PP_ISFREE(repl)); 2729 ASSERT(repl->p_szc == repl_szc); 2730 } else { 2731 page_t *repl_savepp; 2732 int j; 2733 VM_STAT_ADD(segvnvmstats.relocatepages[2]); 2734 repl_savepp = replacement; 2735 for (j = 0; j < curnpgs; j++) { 2736 repl = replacement; 2737 page_sub(&replacement, repl); 2738 ASSERT(PAGE_EXCL(repl)); 2739 ASSERT(!PP_ISFREE(repl)); 2740 ASSERT(repl->p_szc == repl_szc); 2741 ASSERT(page_pptonum(targ[i + j]) == 2742 page_pptonum(targ[i]) + j); 2743 } 2744 repl = repl_savepp; 2745 ASSERT(IS_P2ALIGNED(page_pptonum(repl), curnpgs)); 2746 } 2747 err = page_relocate(&pp, &repl, 0, 1, &nreloc, NULL); 2748 if (err || nreloc != curnpgs) { 2749 panic("segvn_relocate_pages: " 2750 "page_relocate failed err=%d curnpgs=%ld " 2751 "nreloc=%ld", err, curnpgs, nreloc); 2752 } 2753 ASSERT(curnpgs <= repl_npgs); 2754 repl_npgs -= curnpgs; 2755 i += curnpgs; 2756 } 2757 ASSERT(replacement == NULL); 2758 2759 repl = first_repl; 2760 repl_npgs = npgs; 2761 for (i = 0; i < repl_npgs; i++) { 2762 ASSERT(PAGE_EXCL(repl)); 2763 ASSERT(!PP_ISFREE(repl)); 2764 targ[i] = repl; 2765 page_downgrade(targ[i]); 2766 repl++; 2767 
} 2768 } 2769 2770 /* 2771 * Check if all pages in ppa array are complete smaller than szc pages and 2772 * their roots will still be aligned relative to their current size if the 2773 * entire ppa array is relocated into one szc page. If these conditions are 2774 * not met return 0. 2775 * 2776 * If all pages are properly aligned attempt to upgrade their locks 2777 * to exclusive mode. If it fails set *upgrdfail to 1 and return 0. 2778 * upgrdfail was set to 0 by caller. 2779 * 2780 * Return 1 if all pages are aligned and locked exclusively. 2781 * 2782 * If all pages in ppa array happen to be physically contiguous to make one 2783 * szc page and all exclusive locks are successfully obtained promote the page 2784 * size to szc and set *pszc to szc. Return 1 with pages locked shared. 2785 */ 2786 static int 2787 segvn_full_szcpages(page_t **ppa, uint_t szc, int *upgrdfail, uint_t *pszc) 2788 { 2789 page_t *pp; 2790 pfn_t pfn; 2791 pgcnt_t totnpgs = page_get_pagecnt(szc); 2792 pfn_t first_pfn; 2793 int contig = 1; 2794 pgcnt_t i; 2795 pgcnt_t j; 2796 uint_t curszc; 2797 pgcnt_t curnpgs; 2798 int root = 0; 2799 2800 ASSERT(szc > 0); 2801 2802 VM_STAT_ADD(segvnvmstats.fullszcpages[0]); 2803 2804 for (i = 0; i < totnpgs; i++) { 2805 pp = ppa[i]; 2806 ASSERT(PAGE_SHARED(pp)); 2807 ASSERT(!PP_ISFREE(pp)); 2808 pfn = page_pptonum(pp); 2809 if (i == 0) { 2810 if (!IS_P2ALIGNED(pfn, totnpgs)) { 2811 contig = 0; 2812 } else { 2813 first_pfn = pfn; 2814 } 2815 } else if (contig && pfn != first_pfn + i) { 2816 contig = 0; 2817 } 2818 if (pp->p_szc == 0) { 2819 if (root) { 2820 VM_STAT_ADD(segvnvmstats.fullszcpages[1]); 2821 return (0); 2822 } 2823 } else if (!root) { 2824 if ((curszc = pp->p_szc) >= szc) { 2825 VM_STAT_ADD(segvnvmstats.fullszcpages[2]); 2826 return (0); 2827 } 2828 if (curszc == 0) { 2829 /* 2830 * p_szc changed means we don't have all pages 2831 * locked. return failure. 2832 */ 2833 VM_STAT_ADD(segvnvmstats.fullszcpages[3]); 2834 return (0); 2835 } 2836 curnpgs = page_get_pagecnt(curszc); 2837 if (!IS_P2ALIGNED(pfn, curnpgs) || 2838 !IS_P2ALIGNED(i, curnpgs)) { 2839 VM_STAT_ADD(segvnvmstats.fullszcpages[4]); 2840 return (0); 2841 } 2842 root = 1; 2843 } else { 2844 ASSERT(i > 0); 2845 VM_STAT_ADD(segvnvmstats.fullszcpages[5]); 2846 if (pp->p_szc != curszc) { 2847 VM_STAT_ADD(segvnvmstats.fullszcpages[6]); 2848 return (0); 2849 } 2850 if (pfn - 1 != page_pptonum(ppa[i - 1])) { 2851 panic("segvn_full_szcpages: " 2852 "large page not physically contiguous"); 2853 } 2854 if (P2PHASE(pfn, curnpgs) == curnpgs - 1) { 2855 root = 0; 2856 } 2857 } 2858 } 2859 2860 for (i = 0; i < totnpgs; i++) { 2861 ASSERT(ppa[i]->p_szc < szc); 2862 if (!page_tryupgrade(ppa[i])) { 2863 for (j = 0; j < i; j++) { 2864 page_downgrade(ppa[j]); 2865 } 2866 *pszc = ppa[i]->p_szc; 2867 *upgrdfail = 1; 2868 VM_STAT_ADD(segvnvmstats.fullszcpages[7]); 2869 return (0); 2870 } 2871 } 2872 2873 /* 2874 * When a page is put a free cachelist its szc is set to 0. if file 2875 * system reclaimed pages from cachelist targ pages will be physically 2876 * contiguous with 0 p_szc. in this case just upgrade szc of targ 2877 * pages without any relocations. 2878 * To avoid any hat issues with previous small mappings 2879 * hat_pageunload() the target pages first. 
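 *
 * Example: if every targ page passed the contiguity check above (first
 * pfn aligned to totnpgs, each later pfn one greater than its
 * predecessor) and all of them still have p_szc 0, the block below
 * simply unloads any existing small hat mappings, sets p_szc to szc on
 * each constituent page and downgrades the locks back to shared; no
 * relocation is needed.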
2880 */ 2881 if (contig) { 2882 VM_STAT_ADD(segvnvmstats.fullszcpages[8]); 2883 for (i = 0; i < totnpgs; i++) { 2884 (void) hat_pageunload(ppa[i], HAT_FORCE_PGUNLOAD); 2885 } 2886 for (i = 0; i < totnpgs; i++) { 2887 ppa[i]->p_szc = szc; 2888 } 2889 for (i = 0; i < totnpgs; i++) { 2890 ASSERT(PAGE_EXCL(ppa[i])); 2891 page_downgrade(ppa[i]); 2892 } 2893 if (pszc != NULL) { 2894 *pszc = szc; 2895 } 2896 } 2897 VM_STAT_ADD(segvnvmstats.fullszcpages[9]); 2898 return (1); 2899 } 2900 2901 /* 2902 * Create physically contiguous pages for [vp, off] - [vp, off + 2903 * page_size(szc)) range and for private segment return them in ppa array. 2904 * Pages are created either via IO or relocations. 2905 * 2906 * Return 1 on sucess and 0 on failure. 2907 * 2908 * If physically contiguos pages already exist for this range return 1 without 2909 * filling ppa array. Caller initializes ppa[0] as NULL to detect that ppa 2910 * array wasn't filled. In this case caller fills ppa array via VOP_GETPAGE(). 2911 */ 2912 2913 static int 2914 segvn_fill_vp_pages(struct segvn_data *svd, vnode_t *vp, u_offset_t off, 2915 uint_t szc, page_t **ppa, page_t **ppplist, uint_t *ret_pszc, 2916 int *downsize) 2917 2918 { 2919 page_t *pplist = *ppplist; 2920 size_t pgsz = page_get_pagesize(szc); 2921 pgcnt_t pages = btop(pgsz); 2922 ulong_t start_off = off; 2923 u_offset_t eoff = off + pgsz; 2924 spgcnt_t nreloc; 2925 u_offset_t io_off = off; 2926 size_t io_len; 2927 page_t *io_pplist = NULL; 2928 page_t *done_pplist = NULL; 2929 pgcnt_t pgidx = 0; 2930 page_t *pp; 2931 page_t *newpp; 2932 page_t *targpp; 2933 int io_err = 0; 2934 int i; 2935 pfn_t pfn; 2936 ulong_t ppages; 2937 page_t *targ_pplist = NULL; 2938 page_t *repl_pplist = NULL; 2939 page_t *tmp_pplist; 2940 int nios = 0; 2941 uint_t pszc; 2942 struct vattr va; 2943 2944 VM_STAT_ADD(segvnvmstats.fill_vp_pages[0]); 2945 2946 ASSERT(szc != 0); 2947 ASSERT(pplist->p_szc == szc); 2948 2949 /* 2950 * downsize will be set to 1 only if we fail to lock pages. this will 2951 * allow subsequent faults to try to relocate the page again. If we 2952 * fail due to misalignment don't downsize and let the caller map the 2953 * whole region with small mappings to avoid more faults into the area 2954 * where we can't get large pages anyway. 2955 */ 2956 *downsize = 0; 2957 2958 while (off < eoff) { 2959 newpp = pplist; 2960 ASSERT(newpp != NULL); 2961 ASSERT(PAGE_EXCL(newpp)); 2962 ASSERT(!PP_ISFREE(newpp)); 2963 /* 2964 * we pass NULL for nrelocp to page_lookup_create() 2965 * so that it doesn't relocate. We relocate here 2966 * later only after we make sure we can lock all 2967 * pages in the range we handle and they are all 2968 * aligned. 
2969 */ 2970 pp = page_lookup_create(vp, off, SE_SHARED, newpp, NULL, 0); 2971 ASSERT(pp != NULL); 2972 ASSERT(!PP_ISFREE(pp)); 2973 ASSERT(pp->p_vnode == vp); 2974 ASSERT(pp->p_offset == off); 2975 if (pp == newpp) { 2976 VM_STAT_ADD(segvnvmstats.fill_vp_pages[1]); 2977 page_sub(&pplist, pp); 2978 ASSERT(PAGE_EXCL(pp)); 2979 ASSERT(page_iolock_assert(pp)); 2980 page_list_concat(&io_pplist, &pp); 2981 off += PAGESIZE; 2982 continue; 2983 } 2984 VM_STAT_ADD(segvnvmstats.fill_vp_pages[2]); 2985 pfn = page_pptonum(pp); 2986 pszc = pp->p_szc; 2987 if (pszc >= szc && targ_pplist == NULL && io_pplist == NULL && 2988 IS_P2ALIGNED(pfn, pages)) { 2989 ASSERT(repl_pplist == NULL); 2990 ASSERT(done_pplist == NULL); 2991 ASSERT(pplist == *ppplist); 2992 page_unlock(pp); 2993 page_free_replacement_page(pplist); 2994 page_create_putback(pages); 2995 *ppplist = NULL; 2996 VM_STAT_ADD(segvnvmstats.fill_vp_pages[3]); 2997 return (1); 2998 } 2999 if (pszc >= szc) { 3000 page_unlock(pp); 3001 segvn_faultvnmpss_align_err1++; 3002 goto out; 3003 } 3004 ppages = page_get_pagecnt(pszc); 3005 if (!IS_P2ALIGNED(pfn, ppages)) { 3006 ASSERT(pszc > 0); 3007 /* 3008 * sizing down to pszc won't help. 3009 */ 3010 page_unlock(pp); 3011 segvn_faultvnmpss_align_err2++; 3012 goto out; 3013 } 3014 pfn = page_pptonum(newpp); 3015 if (!IS_P2ALIGNED(pfn, ppages)) { 3016 ASSERT(pszc > 0); 3017 /* 3018 * sizing down to pszc won't help. 3019 */ 3020 page_unlock(pp); 3021 segvn_faultvnmpss_align_err3++; 3022 goto out; 3023 } 3024 if (!PAGE_EXCL(pp)) { 3025 VM_STAT_ADD(segvnvmstats.fill_vp_pages[4]); 3026 page_unlock(pp); 3027 *downsize = 1; 3028 *ret_pszc = pp->p_szc; 3029 goto out; 3030 } 3031 targpp = pp; 3032 if (io_pplist != NULL) { 3033 VM_STAT_ADD(segvnvmstats.fill_vp_pages[5]); 3034 io_len = off - io_off; 3035 /* 3036 * Some file systems like NFS don't check EOF 3037 * conditions in VOP_PAGEIO(). Check it here 3038 * now that pages are locked SE_EXCL. Any file 3039 * truncation will wait until the pages are 3040 * unlocked so no need to worry that file will 3041 * be truncated after we check its size here. 3042 * XXX fix NFS to remove this check. 
3043 */ 3044 va.va_mask = AT_SIZE; 3045 if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred) != 0) { 3046 VM_STAT_ADD(segvnvmstats.fill_vp_pages[6]); 3047 page_unlock(targpp); 3048 goto out; 3049 } 3050 if (btopr(va.va_size) < btopr(io_off + io_len)) { 3051 VM_STAT_ADD(segvnvmstats.fill_vp_pages[7]); 3052 *downsize = 1; 3053 *ret_pszc = 0; 3054 page_unlock(targpp); 3055 goto out; 3056 } 3057 io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len, 3058 B_READ, svd->cred); 3059 if (io_err) { 3060 VM_STAT_ADD(segvnvmstats.fill_vp_pages[8]); 3061 page_unlock(targpp); 3062 if (io_err == EDEADLK) { 3063 segvn_vmpss_pageio_deadlk_err++; 3064 } 3065 goto out; 3066 } 3067 nios++; 3068 VM_STAT_ADD(segvnvmstats.fill_vp_pages[9]); 3069 while (io_pplist != NULL) { 3070 pp = io_pplist; 3071 page_sub(&io_pplist, pp); 3072 ASSERT(page_iolock_assert(pp)); 3073 page_io_unlock(pp); 3074 pgidx = (pp->p_offset - start_off) >> 3075 PAGESHIFT; 3076 ASSERT(pgidx < pages); 3077 ppa[pgidx] = pp; 3078 page_list_concat(&done_pplist, &pp); 3079 } 3080 } 3081 pp = targpp; 3082 ASSERT(PAGE_EXCL(pp)); 3083 ASSERT(pp->p_szc <= pszc); 3084 if (pszc != 0 && !group_page_trylock(pp, SE_EXCL)) { 3085 VM_STAT_ADD(segvnvmstats.fill_vp_pages[10]); 3086 page_unlock(pp); 3087 *downsize = 1; 3088 *ret_pszc = pp->p_szc; 3089 goto out; 3090 } 3091 VM_STAT_ADD(segvnvmstats.fill_vp_pages[11]); 3092 /* 3093 * page szc chould have changed before the entire group was 3094 * locked. reread page szc. 3095 */ 3096 pszc = pp->p_szc; 3097 ppages = page_get_pagecnt(pszc); 3098 3099 /* link just the roots */ 3100 page_list_concat(&targ_pplist, &pp); 3101 page_sub(&pplist, newpp); 3102 page_list_concat(&repl_pplist, &newpp); 3103 off += PAGESIZE; 3104 while (--ppages != 0) { 3105 newpp = pplist; 3106 page_sub(&pplist, newpp); 3107 off += PAGESIZE; 3108 } 3109 io_off = off; 3110 } 3111 if (io_pplist != NULL) { 3112 VM_STAT_ADD(segvnvmstats.fill_vp_pages[12]); 3113 io_len = eoff - io_off; 3114 va.va_mask = AT_SIZE; 3115 if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred) != 0) { 3116 VM_STAT_ADD(segvnvmstats.fill_vp_pages[13]); 3117 goto out; 3118 } 3119 if (btopr(va.va_size) < btopr(io_off + io_len)) { 3120 VM_STAT_ADD(segvnvmstats.fill_vp_pages[14]); 3121 *downsize = 1; 3122 *ret_pszc = 0; 3123 goto out; 3124 } 3125 io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len, 3126 B_READ, svd->cred); 3127 if (io_err) { 3128 VM_STAT_ADD(segvnvmstats.fill_vp_pages[15]); 3129 if (io_err == EDEADLK) { 3130 segvn_vmpss_pageio_deadlk_err++; 3131 } 3132 goto out; 3133 } 3134 nios++; 3135 while (io_pplist != NULL) { 3136 pp = io_pplist; 3137 page_sub(&io_pplist, pp); 3138 ASSERT(page_iolock_assert(pp)); 3139 page_io_unlock(pp); 3140 pgidx = (pp->p_offset - start_off) >> PAGESHIFT; 3141 ASSERT(pgidx < pages); 3142 ppa[pgidx] = pp; 3143 } 3144 } 3145 /* 3146 * we're now bound to succeed or panic. 3147 * remove pages from done_pplist. it's not needed anymore. 
3148 */ 3149 while (done_pplist != NULL) { 3150 pp = done_pplist; 3151 page_sub(&done_pplist, pp); 3152 } 3153 VM_STAT_ADD(segvnvmstats.fill_vp_pages[16]); 3154 ASSERT(pplist == NULL); 3155 *ppplist = NULL; 3156 while (targ_pplist != NULL) { 3157 int ret; 3158 VM_STAT_ADD(segvnvmstats.fill_vp_pages[17]); 3159 ASSERT(repl_pplist); 3160 pp = targ_pplist; 3161 page_sub(&targ_pplist, pp); 3162 pgidx = (pp->p_offset - start_off) >> PAGESHIFT; 3163 newpp = repl_pplist; 3164 page_sub(&repl_pplist, newpp); 3165 #ifdef DEBUG 3166 pfn = page_pptonum(pp); 3167 pszc = pp->p_szc; 3168 ppages = page_get_pagecnt(pszc); 3169 ASSERT(IS_P2ALIGNED(pfn, ppages)); 3170 pfn = page_pptonum(newpp); 3171 ASSERT(IS_P2ALIGNED(pfn, ppages)); 3172 ASSERT(P2PHASE(pfn, pages) == pgidx); 3173 #endif 3174 nreloc = 0; 3175 ret = page_relocate(&pp, &newpp, 0, 1, &nreloc, NULL); 3176 if (ret != 0 || nreloc == 0) { 3177 panic("segvn_fill_vp_pages: " 3178 "page_relocate failed"); 3179 } 3180 pp = newpp; 3181 while (nreloc-- != 0) { 3182 ASSERT(PAGE_EXCL(pp)); 3183 ASSERT(pp->p_vnode == vp); 3184 ASSERT(pgidx == 3185 ((pp->p_offset - start_off) >> PAGESHIFT)); 3186 ppa[pgidx++] = pp; 3187 pp++; 3188 } 3189 } 3190 3191 if (svd->type == MAP_PRIVATE) { 3192 VM_STAT_ADD(segvnvmstats.fill_vp_pages[18]); 3193 for (i = 0; i < pages; i++) { 3194 ASSERT(ppa[i] != NULL); 3195 ASSERT(PAGE_EXCL(ppa[i])); 3196 ASSERT(ppa[i]->p_vnode == vp); 3197 ASSERT(ppa[i]->p_offset == 3198 start_off + (i << PAGESHIFT)); 3199 page_downgrade(ppa[i]); 3200 } 3201 ppa[pages] = NULL; 3202 } else { 3203 VM_STAT_ADD(segvnvmstats.fill_vp_pages[19]); 3204 /* 3205 * the caller will still call VOP_GETPAGE() for shared segments 3206 * to check FS write permissions. For private segments we map 3207 * file read only anyway. so no VOP_GETPAGE is needed. 3208 */ 3209 for (i = 0; i < pages; i++) { 3210 ASSERT(ppa[i] != NULL); 3211 ASSERT(PAGE_EXCL(ppa[i])); 3212 ASSERT(ppa[i]->p_vnode == vp); 3213 ASSERT(ppa[i]->p_offset == 3214 start_off + (i << PAGESHIFT)); 3215 page_unlock(ppa[i]); 3216 } 3217 ppa[0] = NULL; 3218 } 3219 3220 return (1); 3221 out: 3222 /* 3223 * Do the cleanup. Unlock target pages we didn't relocate. They are 3224 * linked on targ_pplist by root pages. reassemble unused replacement 3225 * and io pages back to pplist. 
3226 */ 3227 if (io_pplist != NULL) { 3228 VM_STAT_ADD(segvnvmstats.fill_vp_pages[20]); 3229 pp = io_pplist; 3230 do { 3231 ASSERT(pp->p_vnode == vp); 3232 ASSERT(pp->p_offset == io_off); 3233 ASSERT(page_iolock_assert(pp)); 3234 page_io_unlock(pp); 3235 page_hashout(pp, NULL); 3236 io_off += PAGESIZE; 3237 } while ((pp = pp->p_next) != io_pplist); 3238 page_list_concat(&io_pplist, &pplist); 3239 pplist = io_pplist; 3240 } 3241 tmp_pplist = NULL; 3242 while (targ_pplist != NULL) { 3243 VM_STAT_ADD(segvnvmstats.fill_vp_pages[21]); 3244 pp = targ_pplist; 3245 ASSERT(PAGE_EXCL(pp)); 3246 page_sub(&targ_pplist, pp); 3247 3248 pszc = pp->p_szc; 3249 ppages = page_get_pagecnt(pszc); 3250 ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages)); 3251 3252 if (pszc != 0) { 3253 group_page_unlock(pp); 3254 } 3255 page_unlock(pp); 3256 3257 pp = repl_pplist; 3258 ASSERT(pp != NULL); 3259 ASSERT(PAGE_EXCL(pp)); 3260 ASSERT(pp->p_szc == szc); 3261 page_sub(&repl_pplist, pp); 3262 3263 ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages)); 3264 3265 /* relink replacement page */ 3266 page_list_concat(&tmp_pplist, &pp); 3267 while (--ppages != 0) { 3268 VM_STAT_ADD(segvnvmstats.fill_vp_pages[22]); 3269 pp++; 3270 ASSERT(PAGE_EXCL(pp)); 3271 ASSERT(pp->p_szc == szc); 3272 page_list_concat(&tmp_pplist, &pp); 3273 } 3274 } 3275 if (tmp_pplist != NULL) { 3276 VM_STAT_ADD(segvnvmstats.fill_vp_pages[23]); 3277 page_list_concat(&tmp_pplist, &pplist); 3278 pplist = tmp_pplist; 3279 } 3280 /* 3281 * at this point all pages are either on done_pplist or 3282 * pplist. They can't be all on done_pplist otherwise 3283 * we'd've been done. 3284 */ 3285 ASSERT(pplist != NULL); 3286 if (nios != 0) { 3287 VM_STAT_ADD(segvnvmstats.fill_vp_pages[24]); 3288 pp = pplist; 3289 do { 3290 VM_STAT_ADD(segvnvmstats.fill_vp_pages[25]); 3291 ASSERT(pp->p_szc == szc); 3292 ASSERT(PAGE_EXCL(pp)); 3293 ASSERT(pp->p_vnode != vp); 3294 pp->p_szc = 0; 3295 } while ((pp = pp->p_next) != pplist); 3296 3297 pp = done_pplist; 3298 do { 3299 VM_STAT_ADD(segvnvmstats.fill_vp_pages[26]); 3300 ASSERT(pp->p_szc == szc); 3301 ASSERT(PAGE_EXCL(pp)); 3302 ASSERT(pp->p_vnode == vp); 3303 pp->p_szc = 0; 3304 } while ((pp = pp->p_next) != done_pplist); 3305 3306 while (pplist != NULL) { 3307 VM_STAT_ADD(segvnvmstats.fill_vp_pages[27]); 3308 pp = pplist; 3309 page_sub(&pplist, pp); 3310 page_free(pp, 0); 3311 } 3312 3313 while (done_pplist != NULL) { 3314 VM_STAT_ADD(segvnvmstats.fill_vp_pages[28]); 3315 pp = done_pplist; 3316 page_sub(&done_pplist, pp); 3317 page_unlock(pp); 3318 } 3319 *ppplist = NULL; 3320 return (0); 3321 } 3322 ASSERT(pplist == *ppplist); 3323 if (io_err) { 3324 VM_STAT_ADD(segvnvmstats.fill_vp_pages[29]); 3325 /* 3326 * don't downsize on io error. 3327 * see if vop_getpage succeeds. 3328 * pplist may still be used in this case 3329 * for relocations. 
3330 */ 3331 return (0); 3332 } 3333 VM_STAT_ADD(segvnvmstats.fill_vp_pages[30]); 3334 page_free_replacement_page(pplist); 3335 page_create_putback(pages); 3336 *ppplist = NULL; 3337 return (0); 3338 } 3339 3340 int segvn_anypgsz = 0; 3341 3342 #define SEGVN_RESTORE_SOFTLOCK(type, pages) \ 3343 if ((type) == F_SOFTLOCK) { \ 3344 mutex_enter(&freemem_lock); \ 3345 availrmem += (pages); \ 3346 segvn_pages_locked -= (pages); \ 3347 svd->softlockcnt -= (pages); \ 3348 mutex_exit(&freemem_lock); \ 3349 } 3350 3351 #define SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot) \ 3352 if (IS_VMODSORT((ppa)[0]->p_vnode)) { \ 3353 if ((rw) == S_WRITE) { \ 3354 for (i = 0; i < (pages); i++) { \ 3355 ASSERT((ppa)[i]->p_vnode == \ 3356 (ppa)[0]->p_vnode); \ 3357 hat_setmod((ppa)[i]); \ 3358 } \ 3359 } else if ((rw) != S_OTHER && \ 3360 ((prot) & (vpprot) & PROT_WRITE)) { \ 3361 for (i = 0; i < (pages); i++) { \ 3362 ASSERT((ppa)[i]->p_vnode == \ 3363 (ppa)[0]->p_vnode); \ 3364 if (!hat_ismod((ppa)[i])) { \ 3365 prot &= ~PROT_WRITE; \ 3366 break; \ 3367 } \ 3368 } \ 3369 } \ 3370 } 3371 3372 #ifdef VM_STATS 3373 3374 #define SEGVN_VMSTAT_FLTVNPAGES(idx) \ 3375 VM_STAT_ADD(segvnvmstats.fltvnpages[(idx)]); 3376 3377 #else /* VM_STATS */ 3378 3379 #define SEGVN_VMSTAT_FLTVNPAGES(idx) 3380 3381 #endif 3382 3383 static faultcode_t 3384 segvn_fault_vnodepages(struct hat *hat, struct seg *seg, caddr_t lpgaddr, 3385 caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr, 3386 caddr_t eaddr, int brkcow) 3387 { 3388 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 3389 struct anon_map *amp = svd->amp; 3390 uchar_t segtype = svd->type; 3391 uint_t szc = seg->s_szc; 3392 size_t pgsz = page_get_pagesize(szc); 3393 size_t maxpgsz = pgsz; 3394 pgcnt_t pages = btop(pgsz); 3395 pgcnt_t maxpages = pages; 3396 size_t ppasize = (pages + 1) * sizeof (page_t *); 3397 caddr_t a = lpgaddr; 3398 caddr_t maxlpgeaddr = lpgeaddr; 3399 u_offset_t off = svd->offset + (uintptr_t)(a - seg->s_base); 3400 ulong_t aindx = svd->anon_index + seg_page(seg, a); 3401 struct vpage *vpage = (svd->vpage != NULL) ? 3402 &svd->vpage[seg_page(seg, a)] : NULL; 3403 vnode_t *vp = svd->vp; 3404 page_t **ppa; 3405 uint_t pszc; 3406 size_t ppgsz; 3407 pgcnt_t ppages; 3408 faultcode_t err = 0; 3409 int ierr; 3410 int vop_size_err = 0; 3411 uint_t protchk, prot, vpprot; 3412 ulong_t i; 3413 int hat_flag = (type == F_SOFTLOCK) ? 
HAT_LOAD_LOCK : HAT_LOAD; 3414 anon_sync_obj_t an_cookie; 3415 enum seg_rw arw; 3416 int alloc_failed = 0; 3417 int adjszc_chk; 3418 struct vattr va; 3419 int xhat = 0; 3420 page_t *pplist; 3421 pfn_t pfn; 3422 int physcontig; 3423 int upgrdfail; 3424 int segvn_anypgsz_vnode = 0; /* for now map vnode with 2 page sizes */ 3425 3426 ASSERT(szc != 0); 3427 ASSERT(vp != NULL); 3428 ASSERT(brkcow == 0 || amp != NULL); 3429 ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */ 3430 ASSERT(!(svd->flags & MAP_NORESERVE)); 3431 ASSERT(type != F_SOFTUNLOCK); 3432 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 3433 ASSERT(amp == NULL || IS_P2ALIGNED(aindx, maxpages)); 3434 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 3435 ASSERT(seg->s_szc < NBBY * sizeof (int)); 3436 ASSERT(type != F_SOFTLOCK || lpgeaddr - a == maxpgsz); 3437 3438 VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltvnpages[0]); 3439 VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltvnpages[1]); 3440 3441 if (svd->flags & MAP_TEXT) { 3442 hat_flag |= HAT_LOAD_TEXT; 3443 } 3444 3445 if (svd->pageprot) { 3446 switch (rw) { 3447 case S_READ: 3448 protchk = PROT_READ; 3449 break; 3450 case S_WRITE: 3451 protchk = PROT_WRITE; 3452 break; 3453 case S_EXEC: 3454 protchk = PROT_EXEC; 3455 break; 3456 case S_OTHER: 3457 default: 3458 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 3459 break; 3460 } 3461 } else { 3462 prot = svd->prot; 3463 /* caller has already done segment level protection check. */ 3464 } 3465 3466 if (seg->s_as->a_hat != hat) { 3467 xhat = 1; 3468 } 3469 3470 if (rw == S_WRITE && segtype == MAP_PRIVATE) { 3471 SEGVN_VMSTAT_FLTVNPAGES(2); 3472 arw = S_READ; 3473 } else { 3474 arw = rw; 3475 } 3476 3477 ppa = kmem_alloc(ppasize, KM_SLEEP); 3478 3479 VM_STAT_COND_ADD(amp != NULL, segvnvmstats.fltvnpages[3]); 3480 3481 for (;;) { 3482 adjszc_chk = 0; 3483 for (; a < lpgeaddr; a += pgsz, off += pgsz, aindx += pages) { 3484 if (adjszc_chk) { 3485 while (szc < seg->s_szc) { 3486 uintptr_t e; 3487 uint_t tszc; 3488 tszc = segvn_anypgsz_vnode ? 
szc + 1 : 3489 seg->s_szc; 3490 ppgsz = page_get_pagesize(tszc); 3491 if (!IS_P2ALIGNED(a, ppgsz) || 3492 ((alloc_failed >> tszc) & 3493 0x1)) { 3494 break; 3495 } 3496 SEGVN_VMSTAT_FLTVNPAGES(4); 3497 szc = tszc; 3498 pgsz = ppgsz; 3499 pages = btop(pgsz); 3500 e = P2ROUNDUP((uintptr_t)eaddr, pgsz); 3501 lpgeaddr = (caddr_t)e; 3502 } 3503 } 3504 3505 again: 3506 if (IS_P2ALIGNED(a, maxpgsz) && amp != NULL) { 3507 ASSERT(IS_P2ALIGNED(aindx, maxpages)); 3508 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 3509 anon_array_enter(amp, aindx, &an_cookie); 3510 if (anon_get_ptr(amp->ahp, aindx) != NULL) { 3511 SEGVN_VMSTAT_FLTVNPAGES(5); 3512 if (anon_pages(amp->ahp, aindx, 3513 maxpages) != maxpages) { 3514 panic("segvn_fault_vnodepages:" 3515 " empty anon slots\n"); 3516 } 3517 anon_array_exit(&an_cookie); 3518 ANON_LOCK_EXIT(&->a_rwlock); 3519 err = segvn_fault_anonpages(hat, seg, 3520 a, a + maxpgsz, type, rw, 3521 MAX(a, addr), 3522 MIN(a + maxpgsz, eaddr), brkcow); 3523 if (err != 0) { 3524 SEGVN_VMSTAT_FLTVNPAGES(6); 3525 goto out; 3526 } 3527 if (szc < seg->s_szc) { 3528 szc = seg->s_szc; 3529 pgsz = maxpgsz; 3530 pages = maxpages; 3531 lpgeaddr = maxlpgeaddr; 3532 } 3533 goto next; 3534 } else if (anon_pages(amp->ahp, aindx, 3535 maxpages)) { 3536 panic("segvn_fault_vnodepages:" 3537 " non empty anon slots\n"); 3538 } else { 3539 SEGVN_VMSTAT_FLTVNPAGES(7); 3540 anon_array_exit(&an_cookie); 3541 ANON_LOCK_EXIT(&->a_rwlock); 3542 } 3543 } 3544 ASSERT(!brkcow || IS_P2ALIGNED(a, maxpgsz)); 3545 3546 if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) { 3547 ASSERT(vpage != NULL); 3548 prot = VPP_PROT(vpage); 3549 ASSERT(sameprot(seg, a, maxpgsz)); 3550 if ((prot & protchk) == 0) { 3551 SEGVN_VMSTAT_FLTVNPAGES(8); 3552 err = FC_PROT; 3553 goto out; 3554 } 3555 } 3556 if (type == F_SOFTLOCK) { 3557 mutex_enter(&freemem_lock); 3558 if (availrmem < tune.t_minarmem + pages) { 3559 mutex_exit(&freemem_lock); 3560 err = FC_MAKE_ERR(ENOMEM); 3561 goto out; 3562 } else { 3563 availrmem -= pages; 3564 segvn_pages_locked += pages; 3565 svd->softlockcnt += pages; 3566 } 3567 mutex_exit(&freemem_lock); 3568 } 3569 3570 pplist = NULL; 3571 physcontig = 0; 3572 ppa[0] = NULL; 3573 if (!brkcow && szc && 3574 !page_exists_physcontig(vp, off, szc, 3575 segtype == MAP_PRIVATE ? 
ppa : NULL)) { 3576 SEGVN_VMSTAT_FLTVNPAGES(9); 3577 if (page_alloc_pages(vp, seg, a, &pplist, NULL, 3578 szc, 0) && type != F_SOFTLOCK) { 3579 SEGVN_VMSTAT_FLTVNPAGES(10); 3580 pszc = 0; 3581 ierr = -1; 3582 alloc_failed |= (1 << szc); 3583 break; 3584 } 3585 if (pplist != NULL && 3586 vp->v_mpssdata == SEGVN_PAGEIO) { 3587 int downsize; 3588 SEGVN_VMSTAT_FLTVNPAGES(11); 3589 physcontig = segvn_fill_vp_pages(svd, 3590 vp, off, szc, ppa, &pplist, 3591 &pszc, &downsize); 3592 ASSERT(!physcontig || pplist == NULL); 3593 if (!physcontig && downsize && 3594 type != F_SOFTLOCK) { 3595 ASSERT(pplist == NULL); 3596 SEGVN_VMSTAT_FLTVNPAGES(12); 3597 ierr = -1; 3598 break; 3599 } 3600 ASSERT(!physcontig || 3601 segtype == MAP_PRIVATE || 3602 ppa[0] == NULL); 3603 if (physcontig && ppa[0] == NULL) { 3604 physcontig = 0; 3605 } 3606 } 3607 } else if (!brkcow && szc && ppa[0] != NULL) { 3608 SEGVN_VMSTAT_FLTVNPAGES(13); 3609 ASSERT(segtype == MAP_PRIVATE); 3610 physcontig = 1; 3611 } 3612 3613 if (!physcontig) { 3614 SEGVN_VMSTAT_FLTVNPAGES(14); 3615 ppa[0] = NULL; 3616 ierr = VOP_GETPAGE(vp, (offset_t)off, pgsz, 3617 &vpprot, ppa, pgsz, seg, a, arw, 3618 svd->cred); 3619 #ifdef DEBUG 3620 if (ierr == 0) { 3621 for (i = 0; i < pages; i++) { 3622 ASSERT(PAGE_LOCKED(ppa[i])); 3623 ASSERT(!PP_ISFREE(ppa[i])); 3624 ASSERT(ppa[i]->p_vnode == vp); 3625 ASSERT(ppa[i]->p_offset == 3626 off + (i << PAGESHIFT)); 3627 } 3628 } 3629 #endif /* DEBUG */ 3630 if (segtype == MAP_PRIVATE) { 3631 SEGVN_VMSTAT_FLTVNPAGES(15); 3632 vpprot &= ~PROT_WRITE; 3633 } 3634 } else { 3635 ASSERT(segtype == MAP_PRIVATE); 3636 SEGVN_VMSTAT_FLTVNPAGES(16); 3637 vpprot = PROT_ALL & ~PROT_WRITE; 3638 ierr = 0; 3639 } 3640 3641 if (ierr != 0) { 3642 SEGVN_VMSTAT_FLTVNPAGES(17); 3643 if (pplist != NULL) { 3644 SEGVN_VMSTAT_FLTVNPAGES(18); 3645 page_free_replacement_page(pplist); 3646 page_create_putback(pages); 3647 } 3648 SEGVN_RESTORE_SOFTLOCK(type, pages); 3649 if (a + pgsz <= eaddr) { 3650 SEGVN_VMSTAT_FLTVNPAGES(19); 3651 err = FC_MAKE_ERR(ierr); 3652 goto out; 3653 } 3654 va.va_mask = AT_SIZE; 3655 if (VOP_GETATTR(vp, &va, 0, svd->cred) != 0) { 3656 SEGVN_VMSTAT_FLTVNPAGES(20); 3657 err = FC_MAKE_ERR(EIO); 3658 goto out; 3659 } 3660 if (btopr(va.va_size) >= btopr(off + pgsz)) { 3661 SEGVN_VMSTAT_FLTVNPAGES(21); 3662 err = FC_MAKE_ERR(ierr); 3663 goto out; 3664 } 3665 if (btopr(va.va_size) < 3666 btopr(off + (eaddr - a))) { 3667 SEGVN_VMSTAT_FLTVNPAGES(22); 3668 err = FC_MAKE_ERR(ierr); 3669 goto out; 3670 } 3671 if (brkcow || type == F_SOFTLOCK) { 3672 /* can't reduce map area */ 3673 SEGVN_VMSTAT_FLTVNPAGES(23); 3674 vop_size_err = 1; 3675 goto out; 3676 } 3677 SEGVN_VMSTAT_FLTVNPAGES(24); 3678 ASSERT(szc != 0); 3679 pszc = 0; 3680 ierr = -1; 3681 break; 3682 } 3683 3684 if (amp != NULL) { 3685 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 3686 anon_array_enter(amp, aindx, &an_cookie); 3687 } 3688 if (amp != NULL && 3689 anon_get_ptr(amp->ahp, aindx) != NULL) { 3690 ulong_t taindx = P2ALIGN(aindx, maxpages); 3691 3692 SEGVN_VMSTAT_FLTVNPAGES(25); 3693 if (anon_pages(amp->ahp, taindx, maxpages) != 3694 maxpages) { 3695 panic("segvn_fault_vnodepages:" 3696 " empty anon slots\n"); 3697 } 3698 for (i = 0; i < pages; i++) { 3699 page_unlock(ppa[i]); 3700 } 3701 anon_array_exit(&an_cookie); 3702 ANON_LOCK_EXIT(&->a_rwlock); 3703 if (pplist != NULL) { 3704 page_free_replacement_page(pplist); 3705 page_create_putback(pages); 3706 } 3707 SEGVN_RESTORE_SOFTLOCK(type, pages); 3708 if (szc < seg->s_szc) { 3709 SEGVN_VMSTAT_FLTVNPAGES(26); 3710 /* 
3711 * For private segments SOFTLOCK 3712 * either always breaks cow (any rw 3713 * type except S_READ_NOCOW) or 3714 * address space is locked as writer 3715 * (S_READ_NOCOW case) and anon slots 3716 * can't show up on second check. 3717 * Therefore if we are here for 3718 * SOFTLOCK case it must be a cow 3719 * break but cow break never reduces 3720 * szc. Thus the assert below. 3721 */ 3722 ASSERT(!brkcow && type != F_SOFTLOCK); 3723 pszc = seg->s_szc; 3724 ierr = -2; 3725 break; 3726 } 3727 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 3728 goto again; 3729 } 3730 #ifdef DEBUG 3731 if (amp != NULL) { 3732 ulong_t taindx = P2ALIGN(aindx, maxpages); 3733 ASSERT(!anon_pages(amp->ahp, taindx, maxpages)); 3734 } 3735 #endif /* DEBUG */ 3736 3737 if (brkcow) { 3738 ASSERT(amp != NULL); 3739 ASSERT(pplist == NULL); 3740 ASSERT(szc == seg->s_szc); 3741 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 3742 ASSERT(IS_P2ALIGNED(aindx, maxpages)); 3743 SEGVN_VMSTAT_FLTVNPAGES(27); 3744 ierr = anon_map_privatepages(amp, aindx, szc, 3745 seg, a, prot, ppa, vpage, segvn_anypgsz, 3746 svd->cred); 3747 if (ierr != 0) { 3748 SEGVN_VMSTAT_FLTVNPAGES(28); 3749 anon_array_exit(&an_cookie); 3750 ANON_LOCK_EXIT(&->a_rwlock); 3751 SEGVN_RESTORE_SOFTLOCK(type, pages); 3752 err = FC_MAKE_ERR(ierr); 3753 goto out; 3754 } 3755 3756 ASSERT(!IS_VMODSORT(ppa[0]->p_vnode)); 3757 /* 3758 * p_szc can't be changed for locked 3759 * swapfs pages. 3760 */ 3761 hat_memload_array(hat, a, pgsz, ppa, prot, 3762 hat_flag); 3763 3764 if (!(hat_flag & HAT_LOAD_LOCK)) { 3765 SEGVN_VMSTAT_FLTVNPAGES(29); 3766 for (i = 0; i < pages; i++) { 3767 page_unlock(ppa[i]); 3768 } 3769 } 3770 anon_array_exit(&an_cookie); 3771 ANON_LOCK_EXIT(&->a_rwlock); 3772 goto next; 3773 } 3774 3775 pfn = page_pptonum(ppa[0]); 3776 /* 3777 * hat_page_demote() needs an EXCl lock on one of 3778 * constituent page_t's and it decreases root's p_szc 3779 * last. This means if root's p_szc is equal szc and 3780 * all its constituent pages are locked 3781 * hat_page_demote() that could have changed p_szc to 3782 * szc is already done and no new have page_demote() 3783 * can start for this large page. 3784 */ 3785 3786 /* 3787 * we need to make sure same mapping size is used for 3788 * the same address range if there's a possibility the 3789 * adddress is already mapped because hat layer panics 3790 * when translation is loaded for the range already 3791 * mapped with a different page size. We achieve it 3792 * by always using largest page size possible subject 3793 * to the constraints of page size, segment page size 3794 * and page alignment. Since mappings are invalidated 3795 * when those constraints change and make it 3796 * impossible to use previously used mapping size no 3797 * mapping size conflicts should happen. 3798 */ 3799 3800 chkszc: 3801 if ((pszc = ppa[0]->p_szc) == szc && 3802 IS_P2ALIGNED(pfn, pages)) { 3803 3804 SEGVN_VMSTAT_FLTVNPAGES(30); 3805 #ifdef DEBUG 3806 for (i = 0; i < pages; i++) { 3807 ASSERT(PAGE_LOCKED(ppa[i])); 3808 ASSERT(!PP_ISFREE(ppa[i])); 3809 ASSERT(page_pptonum(ppa[i]) == 3810 pfn + i); 3811 ASSERT(ppa[i]->p_szc == szc); 3812 ASSERT(ppa[i]->p_vnode == vp); 3813 ASSERT(ppa[i]->p_offset == 3814 off + (i << PAGESHIFT)); 3815 } 3816 #endif /* DEBUG */ 3817 /* 3818 * All pages are of szc we need and they are 3819 * all locked so they can't change szc. load 3820 * translations. 3821 * 3822 * if page got promoted since last check 3823 * we don't need pplist. 
3824 */ 3825 if (pplist != NULL) { 3826 page_free_replacement_page(pplist); 3827 page_create_putback(pages); 3828 } 3829 if (PP_ISMIGRATE(ppa[0])) { 3830 page_migrate(seg, a, ppa, pages); 3831 } 3832 SEGVN_UPDATE_MODBITS(ppa, pages, rw, 3833 prot, vpprot); 3834 if (!xhat) { 3835 hat_memload_array(hat, a, pgsz, ppa, 3836 prot & vpprot, hat_flag); 3837 } else { 3838 /* 3839 * avoid large xhat mappings to FS 3840 * pages so that hat_page_demote() 3841 * doesn't need to check for xhat 3842 * large mappings. 3843 */ 3844 for (i = 0; i < pages; i++) { 3845 hat_memload(hat, 3846 a + (i << PAGESHIFT), 3847 ppa[i], prot & vpprot, 3848 hat_flag); 3849 } 3850 } 3851 3852 if (!(hat_flag & HAT_LOAD_LOCK)) { 3853 for (i = 0; i < pages; i++) { 3854 page_unlock(ppa[i]); 3855 } 3856 } 3857 if (amp != NULL) { 3858 anon_array_exit(&an_cookie); 3859 ANON_LOCK_EXIT(&->a_rwlock); 3860 } 3861 goto next; 3862 } 3863 3864 /* 3865 * See if upsize is possible. 3866 */ 3867 if (pszc > szc && szc < seg->s_szc && 3868 (segvn_anypgsz_vnode || pszc >= seg->s_szc)) { 3869 pgcnt_t aphase; 3870 uint_t pszc1 = MIN(pszc, seg->s_szc); 3871 ppgsz = page_get_pagesize(pszc1); 3872 ppages = btop(ppgsz); 3873 aphase = btop(P2PHASE((uintptr_t)a, ppgsz)); 3874 3875 ASSERT(type != F_SOFTLOCK); 3876 3877 SEGVN_VMSTAT_FLTVNPAGES(31); 3878 if (aphase != P2PHASE(pfn, ppages)) { 3879 segvn_faultvnmpss_align_err4++; 3880 } else { 3881 SEGVN_VMSTAT_FLTVNPAGES(32); 3882 if (pplist != NULL) { 3883 page_t *pl = pplist; 3884 page_free_replacement_page(pl); 3885 page_create_putback(pages); 3886 } 3887 for (i = 0; i < pages; i++) { 3888 page_unlock(ppa[i]); 3889 } 3890 if (amp != NULL) { 3891 anon_array_exit(&an_cookie); 3892 ANON_LOCK_EXIT(&->a_rwlock); 3893 } 3894 pszc = pszc1; 3895 ierr = -2; 3896 break; 3897 } 3898 } 3899 3900 /* 3901 * check if we should use smallest mapping size. 3902 */ 3903 upgrdfail = 0; 3904 if (szc == 0 || xhat || 3905 (pszc >= szc && 3906 !IS_P2ALIGNED(pfn, pages)) || 3907 (pszc < szc && 3908 !segvn_full_szcpages(ppa, szc, &upgrdfail, 3909 &pszc))) { 3910 3911 if (upgrdfail && type != F_SOFTLOCK) { 3912 /* 3913 * segvn_full_szcpages failed to lock 3914 * all pages EXCL. Size down. 3915 */ 3916 ASSERT(pszc < szc); 3917 3918 SEGVN_VMSTAT_FLTVNPAGES(33); 3919 3920 if (pplist != NULL) { 3921 page_t *pl = pplist; 3922 page_free_replacement_page(pl); 3923 page_create_putback(pages); 3924 } 3925 3926 for (i = 0; i < pages; i++) { 3927 page_unlock(ppa[i]); 3928 } 3929 if (amp != NULL) { 3930 anon_array_exit(&an_cookie); 3931 ANON_LOCK_EXIT(&->a_rwlock); 3932 } 3933 ierr = -1; 3934 break; 3935 } 3936 if (szc != 0 && !xhat && !upgrdfail) { 3937 segvn_faultvnmpss_align_err5++; 3938 } 3939 SEGVN_VMSTAT_FLTVNPAGES(34); 3940 if (pplist != NULL) { 3941 page_free_replacement_page(pplist); 3942 page_create_putback(pages); 3943 } 3944 SEGVN_UPDATE_MODBITS(ppa, pages, rw, 3945 prot, vpprot); 3946 if (upgrdfail && segvn_anypgsz_vnode) { 3947 /* SOFTLOCK case */ 3948 hat_memload_array(hat, a, pgsz, 3949 ppa, prot & vpprot, hat_flag); 3950 } else { 3951 for (i = 0; i < pages; i++) { 3952 hat_memload(hat, 3953 a + (i << PAGESHIFT), 3954 ppa[i], prot & vpprot, 3955 hat_flag); 3956 } 3957 } 3958 if (!(hat_flag & HAT_LOAD_LOCK)) { 3959 for (i = 0; i < pages; i++) { 3960 page_unlock(ppa[i]); 3961 } 3962 } 3963 if (amp != NULL) { 3964 anon_array_exit(&an_cookie); 3965 ANON_LOCK_EXIT(&->a_rwlock); 3966 } 3967 goto next; 3968 } 3969 3970 if (pszc == szc) { 3971 /* 3972 * segvn_full_szcpages() upgraded pages szc. 
3973 */ 3974 ASSERT(pszc == ppa[0]->p_szc); 3975 ASSERT(IS_P2ALIGNED(pfn, pages)); 3976 goto chkszc; 3977 } 3978 3979 if (pszc > szc) { 3980 kmutex_t *szcmtx; 3981 SEGVN_VMSTAT_FLTVNPAGES(35); 3982 /* 3983 * p_szc of ppa[0] can change since we haven't 3984 * locked all constituent pages. Call 3985 * page_lock_szc() to prevent szc changes. 3986 * This should be a rare case that happens when 3987 * multiple segments use a different page size 3988 * to map the same file offsets. 3989 */ 3990 szcmtx = page_szc_lock(ppa[0]); 3991 pszc = ppa[0]->p_szc; 3992 ASSERT(szcmtx != NULL || pszc == 0); 3993 ASSERT(ppa[0]->p_szc <= pszc); 3994 if (pszc <= szc) { 3995 SEGVN_VMSTAT_FLTVNPAGES(36); 3996 if (szcmtx != NULL) { 3997 mutex_exit(szcmtx); 3998 } 3999 goto chkszc; 4000 } 4001 if (pplist != NULL) { 4002 /* 4003 * page got promoted since last check. 4004 * we don't need preaalocated large 4005 * page. 4006 */ 4007 SEGVN_VMSTAT_FLTVNPAGES(37); 4008 page_free_replacement_page(pplist); 4009 page_create_putback(pages); 4010 } 4011 SEGVN_UPDATE_MODBITS(ppa, pages, rw, 4012 prot, vpprot); 4013 hat_memload_array(hat, a, pgsz, ppa, 4014 prot & vpprot, hat_flag); 4015 mutex_exit(szcmtx); 4016 if (!(hat_flag & HAT_LOAD_LOCK)) { 4017 for (i = 0; i < pages; i++) { 4018 page_unlock(ppa[i]); 4019 } 4020 } 4021 if (amp != NULL) { 4022 anon_array_exit(&an_cookie); 4023 ANON_LOCK_EXIT(&->a_rwlock); 4024 } 4025 goto next; 4026 } 4027 4028 /* 4029 * if page got demoted since last check 4030 * we could have not allocated larger page. 4031 * allocate now. 4032 */ 4033 if (pplist == NULL && 4034 page_alloc_pages(vp, seg, a, &pplist, NULL, 4035 szc, 0) && type != F_SOFTLOCK) { 4036 SEGVN_VMSTAT_FLTVNPAGES(38); 4037 for (i = 0; i < pages; i++) { 4038 page_unlock(ppa[i]); 4039 } 4040 if (amp != NULL) { 4041 anon_array_exit(&an_cookie); 4042 ANON_LOCK_EXIT(&->a_rwlock); 4043 } 4044 ierr = -1; 4045 alloc_failed |= (1 << szc); 4046 break; 4047 } 4048 4049 SEGVN_VMSTAT_FLTVNPAGES(39); 4050 4051 if (pplist != NULL) { 4052 segvn_relocate_pages(ppa, pplist); 4053 #ifdef DEBUG 4054 } else { 4055 ASSERT(type == F_SOFTLOCK); 4056 SEGVN_VMSTAT_FLTVNPAGES(40); 4057 #endif /* DEBUG */ 4058 } 4059 4060 SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot); 4061 4062 if (pplist == NULL && segvn_anypgsz_vnode == 0) { 4063 ASSERT(type == F_SOFTLOCK); 4064 for (i = 0; i < pages; i++) { 4065 ASSERT(ppa[i]->p_szc < szc); 4066 hat_memload(hat, a + (i << PAGESHIFT), 4067 ppa[i], prot & vpprot, hat_flag); 4068 } 4069 } else { 4070 ASSERT(pplist != NULL || type == F_SOFTLOCK); 4071 hat_memload_array(hat, a, pgsz, ppa, 4072 prot & vpprot, hat_flag); 4073 } 4074 if (!(hat_flag & HAT_LOAD_LOCK)) { 4075 for (i = 0; i < pages; i++) { 4076 ASSERT(PAGE_SHARED(ppa[i])); 4077 page_unlock(ppa[i]); 4078 } 4079 } 4080 if (amp != NULL) { 4081 anon_array_exit(&an_cookie); 4082 ANON_LOCK_EXIT(&->a_rwlock); 4083 } 4084 4085 next: 4086 if (vpage != NULL) { 4087 vpage += pages; 4088 } 4089 adjszc_chk = 1; 4090 } 4091 if (a == lpgeaddr) 4092 break; 4093 ASSERT(a < lpgeaddr); 4094 4095 ASSERT(!brkcow && type != F_SOFTLOCK); 4096 4097 /* 4098 * ierr == -1 means we failed to map with a large page. 4099 * (either due to allocation/relocation failures or 4100 * misalignment with other mappings to this file. 4101 * 4102 * ierr == -2 means some other thread allocated a large page 4103 * after we gave up tp map with a large page. retry with 4104 * larger mapping. 
4105 */ 4106 ASSERT(ierr == -1 || ierr == -2); 4107 ASSERT(ierr == -2 || szc != 0); 4108 ASSERT(ierr == -1 || szc < seg->s_szc); 4109 if (ierr == -2) { 4110 SEGVN_VMSTAT_FLTVNPAGES(41); 4111 ASSERT(pszc > szc && pszc <= seg->s_szc); 4112 szc = pszc; 4113 } else if (segvn_anypgsz_vnode) { 4114 SEGVN_VMSTAT_FLTVNPAGES(42); 4115 szc--; 4116 } else { 4117 SEGVN_VMSTAT_FLTVNPAGES(43); 4118 ASSERT(pszc < szc); 4119 /* 4120 * other process created pszc large page. 4121 * but we still have to drop to 0 szc. 4122 */ 4123 szc = 0; 4124 } 4125 4126 pgsz = page_get_pagesize(szc); 4127 pages = btop(pgsz); 4128 if (ierr == -2) { 4129 /* 4130 * Size up case. Note lpgaddr may only be needed for 4131 * softlock case so we don't adjust it here. 4132 */ 4133 a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz); 4134 ASSERT(a >= lpgaddr); 4135 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4136 off = svd->offset + (uintptr_t)(a - seg->s_base); 4137 aindx = svd->anon_index + seg_page(seg, a); 4138 vpage = (svd->vpage != NULL) ? 4139 &svd->vpage[seg_page(seg, a)] : NULL; 4140 } else { 4141 /* 4142 * Size down case. Note lpgaddr may only be needed for 4143 * softlock case so we don't adjust it here. 4144 */ 4145 ASSERT(IS_P2ALIGNED(a, pgsz)); 4146 ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz)); 4147 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4148 ASSERT(a < lpgeaddr); 4149 if (a < addr) { 4150 SEGVN_VMSTAT_FLTVNPAGES(44); 4151 /* 4152 * The beginning of the large page region can 4153 * be pulled to the right to make a smaller 4154 * region. We haven't yet faulted a single 4155 * page. 4156 */ 4157 a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 4158 ASSERT(a >= lpgaddr); 4159 off = svd->offset + 4160 (uintptr_t)(a - seg->s_base); 4161 aindx = svd->anon_index + seg_page(seg, a); 4162 vpage = (svd->vpage != NULL) ? 4163 &svd->vpage[seg_page(seg, a)] : NULL; 4164 } 4165 } 4166 } 4167 out: 4168 kmem_free(ppa, ppasize); 4169 if (!err && !vop_size_err) { 4170 SEGVN_VMSTAT_FLTVNPAGES(45); 4171 return (0); 4172 } 4173 if (type == F_SOFTLOCK && a > lpgaddr) { 4174 SEGVN_VMSTAT_FLTVNPAGES(46); 4175 segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER); 4176 } 4177 if (!vop_size_err) { 4178 SEGVN_VMSTAT_FLTVNPAGES(47); 4179 return (err); 4180 } 4181 ASSERT(brkcow || type == F_SOFTLOCK); 4182 /* 4183 * Large page end is mapped beyond the end of file and it's a cow 4184 * fault or softlock so we can't reduce the map area. For now just 4185 * demote the segment. This should really only happen if the end of 4186 * the file changed after the mapping was established since when large 4187 * page segments are created we make sure they don't extend beyond the 4188 * end of the file. 4189 */ 4190 SEGVN_VMSTAT_FLTVNPAGES(48); 4191 4192 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4193 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4194 err = 0; 4195 if (seg->s_szc != 0) { 4196 segvn_fltvnpages_clrszc_cnt++; 4197 ASSERT(svd->softlockcnt == 0); 4198 err = segvn_clrszc(seg); 4199 if (err != 0) { 4200 segvn_fltvnpages_clrszc_err++; 4201 } 4202 } 4203 ASSERT(err || seg->s_szc == 0); 4204 SEGVN_LOCK_DOWNGRADE(seg->s_as, &svd->lock); 4205 /* segvn_fault will do its job as if szc had been zero to begin with */ 4206 return (err == 0 ? IE_RETRY : FC_MAKE_ERR(err)); 4207 } 4208 4209 /* 4210 * This routine will attempt to fault in one large page. 4211 * it will use smaller pages if that fails. 4212 * It should only be called for pure anonymous segments. 
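 *
 * Both this routine and segvn_fault_vnodepages() recompute their working
 * window the same way after switching page size: align the current
 * address down and the requested end up to the new size.  A minimal
 * sketch with simplified stand-ins for the P2ALIGN/P2ROUNDUP macros
 * (the names here are assumptions for illustration only):
 */
#if 0	/* illustrative sketch only; never compiled */
#include <stdint.h>
#include <stddef.h>

#define	MY_P2ALIGN(x, a)	((uintptr_t)(x) & ~((uintptr_t)(a) - 1))
#define	MY_P2ROUNDUP(x, a)	(-(-(uintptr_t)(x) & ~((uintptr_t)(a) - 1)))

static void
recompute_window(uintptr_t *start, uintptr_t *end, uintptr_t fault_end,
    size_t new_pgsz)
{
    *start = MY_P2ALIGN(*start, new_pgsz);	/* align down to the new size */
    *end = MY_P2ROUNDUP(fault_end, new_pgsz);	/* re-derive from the fault end */
}
#endif
/*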
4213 */ 4214 static faultcode_t 4215 segvn_fault_anonpages(struct hat *hat, struct seg *seg, caddr_t lpgaddr, 4216 caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr, 4217 caddr_t eaddr, int brkcow) 4218 { 4219 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 4220 struct anon_map *amp = svd->amp; 4221 uchar_t segtype = svd->type; 4222 uint_t szc = seg->s_szc; 4223 size_t pgsz = page_get_pagesize(szc); 4224 size_t maxpgsz = pgsz; 4225 pgcnt_t pages = btop(pgsz); 4226 size_t ppasize = pages * sizeof (page_t *); 4227 caddr_t a = lpgaddr; 4228 ulong_t aindx = svd->anon_index + seg_page(seg, a); 4229 struct vpage *vpage = (svd->vpage != NULL) ? 4230 &svd->vpage[seg_page(seg, a)] : NULL; 4231 page_t **ppa; 4232 uint_t ppa_szc; 4233 faultcode_t err; 4234 int ierr; 4235 uint_t protchk, prot, vpprot; 4236 ulong_t i; 4237 int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD; 4238 anon_sync_obj_t cookie; 4239 int first = 1; 4240 int adjszc_chk; 4241 int purged = 0; 4242 4243 ASSERT(szc != 0); 4244 ASSERT(amp != NULL); 4245 ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */ 4246 ASSERT(!(svd->flags & MAP_NORESERVE)); 4247 ASSERT(type != F_SOFTUNLOCK); 4248 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 4249 4250 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 4251 4252 VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltanpages[0]); 4253 VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltanpages[1]); 4254 4255 if (svd->flags & MAP_TEXT) { 4256 hat_flag |= HAT_LOAD_TEXT; 4257 } 4258 4259 if (svd->pageprot) { 4260 switch (rw) { 4261 case S_READ: 4262 protchk = PROT_READ; 4263 break; 4264 case S_WRITE: 4265 protchk = PROT_WRITE; 4266 break; 4267 case S_EXEC: 4268 protchk = PROT_EXEC; 4269 break; 4270 case S_OTHER: 4271 default: 4272 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 4273 break; 4274 } 4275 VM_STAT_ADD(segvnvmstats.fltanpages[2]); 4276 } else { 4277 prot = svd->prot; 4278 /* caller has already done segment level protection check. 
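 *
 * The switch above boils down to: which protection bit must be present
 * for this kind of access?  A standalone restatement (the PR_* values and
 * the helper name are stand-ins for the real PROT_* flags, not kernel
 * code):
 */
#if 0	/* illustrative sketch only; never compiled */
enum access { ACC_READ, ACC_WRITE, ACC_EXEC, ACC_OTHER };

#define	PR_READ		0x1
#define	PR_WRITE	0x2
#define	PR_EXEC		0x4

static unsigned int
required_prot(enum access rw)
{
    switch (rw) {
    case ACC_READ:	return (PR_READ);
    case ACC_WRITE:	return (PR_WRITE);
    case ACC_EXEC:	return (PR_EXEC);
    default:		return (PR_READ | PR_WRITE | PR_EXEC);
    }
}
/* A fault is then rejected when (prot & required_prot(rw)) == 0. */
#endif
/*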
*/ 4279 } 4280 4281 ppa = kmem_alloc(ppasize, KM_SLEEP); 4282 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4283 for (;;) { 4284 adjszc_chk = 0; 4285 for (; a < lpgeaddr; a += pgsz, aindx += pages) { 4286 if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) { 4287 VM_STAT_ADD(segvnvmstats.fltanpages[3]); 4288 ASSERT(vpage != NULL); 4289 prot = VPP_PROT(vpage); 4290 ASSERT(sameprot(seg, a, maxpgsz)); 4291 if ((prot & protchk) == 0) { 4292 err = FC_PROT; 4293 goto error; 4294 } 4295 } 4296 if (adjszc_chk && IS_P2ALIGNED(a, maxpgsz) && 4297 pgsz < maxpgsz) { 4298 ASSERT(a > lpgaddr); 4299 szc = seg->s_szc; 4300 pgsz = maxpgsz; 4301 pages = btop(pgsz); 4302 ASSERT(IS_P2ALIGNED(aindx, pages)); 4303 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, 4304 pgsz); 4305 } 4306 if (type == F_SOFTLOCK && svd->vp != NULL) { 4307 mutex_enter(&freemem_lock); 4308 if (availrmem < tune.t_minarmem + pages) { 4309 mutex_exit(&freemem_lock); 4310 err = FC_MAKE_ERR(ENOMEM); 4311 goto error; 4312 } else { 4313 availrmem -= pages; 4314 segvn_pages_locked += pages; 4315 svd->softlockcnt += pages; 4316 } 4317 mutex_exit(&freemem_lock); 4318 } 4319 anon_array_enter(amp, aindx, &cookie); 4320 ppa_szc = (uint_t)-1; 4321 ierr = anon_map_getpages(amp, aindx, szc, seg, a, 4322 prot, &vpprot, ppa, &ppa_szc, vpage, rw, brkcow, 4323 segvn_anypgsz, svd->cred); 4324 if (ierr != 0) { 4325 anon_array_exit(&cookie); 4326 VM_STAT_ADD(segvnvmstats.fltanpages[4]); 4327 if (type == F_SOFTLOCK && svd->vp != NULL) { 4328 VM_STAT_ADD(segvnvmstats.fltanpages[5]); 4329 mutex_enter(&freemem_lock); 4330 availrmem += pages; 4331 segvn_pages_locked -= pages; 4332 svd->softlockcnt -= pages; 4333 mutex_exit(&freemem_lock); 4334 } 4335 if (ierr > 0) { 4336 VM_STAT_ADD(segvnvmstats.fltanpages[6]); 4337 err = FC_MAKE_ERR(ierr); 4338 goto error; 4339 } 4340 break; 4341 } 4342 4343 ASSERT(!IS_VMODSORT(ppa[0]->p_vnode)); 4344 4345 ASSERT(segtype == MAP_SHARED || 4346 ppa[0]->p_szc <= szc); 4347 ASSERT(segtype == MAP_PRIVATE || 4348 ppa[0]->p_szc >= szc); 4349 4350 /* 4351 * Handle pages that have been marked for migration 4352 */ 4353 if (lgrp_optimizations()) 4354 page_migrate(seg, a, ppa, pages); 4355 4356 if (type == F_SOFTLOCK && svd->vp == NULL) { 4357 /* 4358 * All pages in ppa array belong to the same 4359 * large page. This means it's ok to call 4360 * segvn_pp_lock_anonpages just for ppa[0]. 4361 */ 4362 if (!segvn_pp_lock_anonpages(ppa[0], first)) { 4363 for (i = 0; i < pages; i++) { 4364 page_unlock(ppa[i]); 4365 } 4366 err = FC_MAKE_ERR(ENOMEM); 4367 goto error; 4368 } 4369 first = 0; 4370 mutex_enter(&freemem_lock); 4371 svd->softlockcnt += pages; 4372 segvn_pages_locked += pages; 4373 mutex_exit(&freemem_lock); 4374 } 4375 4376 if (segtype == MAP_SHARED) { 4377 vpprot |= PROT_WRITE; 4378 } 4379 4380 hat_memload_array(hat, a, pgsz, ppa, 4381 prot & vpprot, hat_flag); 4382 4383 if (hat_flag & HAT_LOAD_LOCK) { 4384 VM_STAT_ADD(segvnvmstats.fltanpages[7]); 4385 } else { 4386 VM_STAT_ADD(segvnvmstats.fltanpages[8]); 4387 for (i = 0; i < pages; i++) 4388 page_unlock(ppa[i]); 4389 } 4390 if (vpage != NULL) 4391 vpage += pages; 4392 4393 anon_array_exit(&cookie); 4394 adjszc_chk = 1; 4395 } 4396 if (a == lpgeaddr) 4397 break; 4398 ASSERT(a < lpgeaddr); 4399 /* 4400 * ierr == -1 means we failed to allocate a large page. 4401 * so do a size down operation. 4402 * 4403 * ierr == -2 means some other process that privately shares 4404 * pages with this process has allocated a larger page and we 4405 * need to retry with larger pages. 
So do a size up 4406 * operation. This relies on the fact that large pages are 4407 * never partially shared i.e. if we share any constituent 4408 * page of a large page with another process we must share the 4409 * entire large page. Note this cannot happen for SOFTLOCK 4410 * case, unless current address (a) is at the beginning of the 4411 * next page size boundary because the other process couldn't 4412 * have relocated locked pages. 4413 */ 4414 ASSERT(ierr == -1 || ierr == -2); 4415 /* 4416 * For the very first relocation failure try to purge this 4417 * segment's cache so that the relocator can obtain an 4418 * exclusive lock on pages we want to relocate. 4419 */ 4420 if (!purged && ierr == -1 && ppa_szc != (uint_t)-1 && 4421 svd->softlockcnt != 0) { 4422 purged = 1; 4423 segvn_purge(seg); 4424 continue; 4425 } 4426 4427 if (segvn_anypgsz) { 4428 ASSERT(ierr == -2 || szc != 0); 4429 ASSERT(ierr == -1 || szc < seg->s_szc); 4430 szc = (ierr == -1) ? szc - 1 : szc + 1; 4431 } else { 4432 /* 4433 * For non COW faults and segvn_anypgsz == 0 4434 * we need to be careful not to loop forever 4435 * if existing page is found with szc other 4436 * than 0 or seg->s_szc. This could be due 4437 * to page relocations on behalf of DR or 4438 * more likely large page creation. For this 4439 * case simply re-size to existing page's szc 4440 * if returned by anon_map_getpages(). 4441 */ 4442 if (ppa_szc == (uint_t)-1) { 4443 szc = (ierr == -1) ? 0 : seg->s_szc; 4444 } else { 4445 ASSERT(ppa_szc <= seg->s_szc); 4446 ASSERT(ierr == -2 || ppa_szc < szc); 4447 ASSERT(ierr == -1 || ppa_szc > szc); 4448 szc = ppa_szc; 4449 } 4450 } 4451 4452 pgsz = page_get_pagesize(szc); 4453 pages = btop(pgsz); 4454 ASSERT(type != F_SOFTLOCK || ierr == -1 || 4455 (IS_P2ALIGNED(a, pgsz) && IS_P2ALIGNED(lpgeaddr, pgsz))); 4456 if (type == F_SOFTLOCK) { 4457 /* 4458 * For softlocks we cannot reduce the fault area 4459 * (calculated based on the largest page size for this 4460 * segment) for size down and a is already next 4461 * page size aligned as assertted above for size 4462 * ups. Therefore just continue in case of softlock. 4463 */ 4464 VM_STAT_ADD(segvnvmstats.fltanpages[9]); 4465 continue; /* keep lint happy */ 4466 } else if (ierr == -2) { 4467 4468 /* 4469 * Size up case. Note lpgaddr may only be needed for 4470 * softlock case so we don't adjust it here. 4471 */ 4472 VM_STAT_ADD(segvnvmstats.fltanpages[10]); 4473 a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz); 4474 ASSERT(a >= lpgaddr); 4475 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4476 aindx = svd->anon_index + seg_page(seg, a); 4477 vpage = (svd->vpage != NULL) ? 4478 &svd->vpage[seg_page(seg, a)] : NULL; 4479 } else { 4480 /* 4481 * Size down case. Note lpgaddr may only be needed for 4482 * softlock case so we don't adjust it here. 4483 */ 4484 VM_STAT_ADD(segvnvmstats.fltanpages[11]); 4485 ASSERT(IS_P2ALIGNED(a, pgsz)); 4486 ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz)); 4487 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4488 ASSERT(a < lpgeaddr); 4489 if (a < addr) { 4490 /* 4491 * The beginning of the large page region can 4492 * be pulled to the right to make a smaller 4493 * region. We haven't yet faulted a single 4494 * page. 4495 */ 4496 VM_STAT_ADD(segvnvmstats.fltanpages[12]); 4497 a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 4498 ASSERT(a >= lpgaddr); 4499 aindx = svd->anon_index + seg_page(seg, a); 4500 vpage = (svd->vpage != NULL) ? 
4501 &svd->vpage[seg_page(seg, a)] : NULL; 4502 } 4503 } 4504 } 4505 VM_STAT_ADD(segvnvmstats.fltanpages[13]); 4506 ANON_LOCK_EXIT(&->a_rwlock); 4507 kmem_free(ppa, ppasize); 4508 return (0); 4509 error: 4510 VM_STAT_ADD(segvnvmstats.fltanpages[14]); 4511 ANON_LOCK_EXIT(&->a_rwlock); 4512 kmem_free(ppa, ppasize); 4513 if (type == F_SOFTLOCK && a > lpgaddr) { 4514 VM_STAT_ADD(segvnvmstats.fltanpages[15]); 4515 segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER); 4516 } 4517 return (err); 4518 } 4519 4520 int fltadvice = 1; /* set to free behind pages for sequential access */ 4521 4522 /* 4523 * This routine is called via a machine specific fault handling routine. 4524 * It is also called by software routines wishing to lock or unlock 4525 * a range of addresses. 4526 * 4527 * Here is the basic algorithm: 4528 * If unlocking 4529 * Call segvn_softunlock 4530 * Return 4531 * endif 4532 * Checking and set up work 4533 * If we will need some non-anonymous pages 4534 * Call VOP_GETPAGE over the range of non-anonymous pages 4535 * endif 4536 * Loop over all addresses requested 4537 * Call segvn_faultpage passing in page list 4538 * to load up translations and handle anonymous pages 4539 * endloop 4540 * Load up translation to any additional pages in page list not 4541 * already handled that fit into this segment 4542 */ 4543 static faultcode_t 4544 segvn_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len, 4545 enum fault_type type, enum seg_rw rw) 4546 { 4547 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 4548 page_t **plp, **ppp, *pp; 4549 u_offset_t off; 4550 caddr_t a; 4551 struct vpage *vpage; 4552 uint_t vpprot, prot; 4553 int err; 4554 page_t *pl[PVN_GETPAGE_NUM + 1]; 4555 size_t plsz, pl_alloc_sz; 4556 size_t page; 4557 ulong_t anon_index; 4558 struct anon_map *amp; 4559 int dogetpage = 0; 4560 caddr_t lpgaddr, lpgeaddr; 4561 size_t pgsz; 4562 anon_sync_obj_t cookie; 4563 int brkcow = BREAK_COW_SHARE(rw, type, svd->type); 4564 4565 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 4566 4567 /* 4568 * First handle the easy stuff 4569 */ 4570 if (type == F_SOFTUNLOCK) { 4571 if (rw == S_READ_NOCOW) { 4572 rw = S_READ; 4573 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 4574 } 4575 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 4576 pgsz = (seg->s_szc == 0) ? PAGESIZE : 4577 page_get_pagesize(seg->s_szc); 4578 VM_STAT_COND_ADD(pgsz > PAGESIZE, segvnvmstats.fltanpages[16]); 4579 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 4580 segvn_softunlock(seg, lpgaddr, lpgeaddr - lpgaddr, rw); 4581 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4582 return (0); 4583 } 4584 4585 top: 4586 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 4587 4588 /* 4589 * If we have the same protections for the entire segment, 4590 * insure that the access being attempted is legitimate. 
4591 */ 4592 4593 if (svd->pageprot == 0) { 4594 uint_t protchk; 4595 4596 switch (rw) { 4597 case S_READ: 4598 case S_READ_NOCOW: 4599 protchk = PROT_READ; 4600 break; 4601 case S_WRITE: 4602 protchk = PROT_WRITE; 4603 break; 4604 case S_EXEC: 4605 protchk = PROT_EXEC; 4606 break; 4607 case S_OTHER: 4608 default: 4609 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 4610 break; 4611 } 4612 4613 if ((svd->prot & protchk) == 0) { 4614 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4615 return (FC_PROT); /* illegal access type */ 4616 } 4617 } 4618 4619 /* 4620 * We can't allow the long term use of softlocks for vmpss segments, 4621 * because in some file truncation cases we should be able to demote 4622 * the segment, which requires that there are no softlocks. The 4623 * only case where it's ok to allow a SOFTLOCK fault against a vmpss 4624 * segment is S_READ_NOCOW, where the caller holds the address space 4625 * locked as writer and calls softunlock before dropping the as lock. 4626 * S_READ_NOCOW is used by /proc to read memory from another user. 4627 * 4628 * Another deadlock between SOFTLOCK and file truncation can happen 4629 * because segvn_fault_vnodepages() calls the FS one pagesize at 4630 * a time. A second VOP_GETPAGE() call by segvn_fault_vnodepages() 4631 * can cause a deadlock because the first set of page_t's remain 4632 * locked SE_SHARED. To avoid this, we demote segments on a first 4633 * SOFTLOCK if they have a length greater than the segment's 4634 * page size. 4635 * 4636 * So for now, we only avoid demoting a segment on a SOFTLOCK when 4637 * the access type is S_READ_NOCOW and the fault length is less than 4638 * or equal to the segment's page size. While this is quite restrictive, 4639 * it should be the most common case of SOFTLOCK against a vmpss 4640 * segment. 4641 * 4642 * For S_READ_NOCOW, it's safe not to do a copy on write because the 4643 * caller makes sure no COW will be caused by another thread for a 4644 * softlocked page. 4645 */ 4646 if (type == F_SOFTLOCK && svd->vp != NULL && seg->s_szc != 0) { 4647 int demote = 0; 4648 4649 if (rw != S_READ_NOCOW) { 4650 demote = 1; 4651 } 4652 if (!demote && len > PAGESIZE) { 4653 pgsz = page_get_pagesize(seg->s_szc); 4654 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, 4655 lpgeaddr); 4656 if (lpgeaddr - lpgaddr > pgsz) { 4657 demote = 1; 4658 } 4659 } 4660 4661 ASSERT(demote || AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 4662 4663 if (demote) { 4664 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4665 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4666 if (seg->s_szc != 0) { 4667 segvn_vmpss_clrszc_cnt++; 4668 ASSERT(svd->softlockcnt == 0); 4669 err = segvn_clrszc(seg); 4670 if (err) { 4671 segvn_vmpss_clrszc_err++; 4672 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4673 return (FC_MAKE_ERR(err)); 4674 } 4675 } 4676 ASSERT(seg->s_szc == 0); 4677 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4678 goto top; 4679 } 4680 } 4681 4682 /* 4683 * Check to see if we need to allocate an anon_map structure. 4684 */ 4685 if (svd->amp == NULL && (svd->vp == NULL || brkcow)) { 4686 /* 4687 * Drop the "read" lock on the segment and acquire 4688 * the "write" version since we have to allocate the 4689 * anon_map. 
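 *
 * Since a reader/writer lock cannot be upgraded in place, the code below
 * drops the read lock, takes the write lock, re-checks that the anon_map
 * is still missing, and then restarts the whole fault.  The same pattern
 * in a self-contained user-level sketch (pthread names; `struct lazy' and
 * the helper are hypothetical):
 */
#if 0	/* illustrative sketch only; never compiled */
#include <pthread.h>
#include <stdlib.h>

struct lazy {
    void *resource;			/* lazily allocated, like svd->amp */
    pthread_rwlock_t lock;
};

static void
ensure_allocated(struct lazy *l)
{
    pthread_rwlock_rdlock(&l->lock);
    if (l->resource == NULL) {
        pthread_rwlock_unlock(&l->lock);	/* cannot upgrade: drop ... */
        pthread_rwlock_wrlock(&l->lock);	/* ... and reacquire as writer */
        if (l->resource == NULL)		/* re-check: we may have lost a race */
            l->resource = malloc(64);		/* error handling elided */
    }
    pthread_rwlock_unlock(&l->lock);
    /* Callers then start over, as segvn_fault() does with `goto top'. */
}
#endif
/*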
4690 */ 4691 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4692 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4693 4694 if (svd->amp == NULL) { 4695 svd->amp = anonmap_alloc(seg->s_size, 0); 4696 svd->amp->a_szc = seg->s_szc; 4697 } 4698 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4699 4700 /* 4701 * Start all over again since segment protections 4702 * may have changed after we dropped the "read" lock. 4703 */ 4704 goto top; 4705 } 4706 4707 /* 4708 * S_READ_NOCOW vs S_READ distinction was 4709 * only needed for the code above. After 4710 * that we treat it as S_READ. 4711 */ 4712 if (rw == S_READ_NOCOW) { 4713 ASSERT(type == F_SOFTLOCK); 4714 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 4715 rw = S_READ; 4716 } 4717 4718 amp = svd->amp; 4719 4720 /* 4721 * MADV_SEQUENTIAL work is ignored for large page segments. 4722 */ 4723 if (seg->s_szc != 0) { 4724 pgsz = page_get_pagesize(seg->s_szc); 4725 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 4726 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 4727 if (svd->vp == NULL) { 4728 err = segvn_fault_anonpages(hat, seg, lpgaddr, 4729 lpgeaddr, type, rw, addr, addr + len, brkcow); 4730 } else { 4731 err = segvn_fault_vnodepages(hat, seg, lpgaddr, 4732 lpgeaddr, type, rw, addr, addr + len, brkcow); 4733 if (err == IE_RETRY) { 4734 ASSERT(seg->s_szc == 0); 4735 ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock)); 4736 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4737 goto top; 4738 } 4739 } 4740 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4741 return (err); 4742 } 4743 4744 page = seg_page(seg, addr); 4745 if (amp != NULL) { 4746 anon_index = svd->anon_index + page; 4747 4748 if ((type == F_PROT) && (rw == S_READ) && 4749 svd->type == MAP_PRIVATE && svd->pageprot == 0) { 4750 size_t index = anon_index; 4751 struct anon *ap; 4752 4753 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4754 /* 4755 * The fast path could apply to S_WRITE also, except 4756 * that the protection fault could be caused by lazy 4757 * tlb flush when ro->rw. In this case, the pte is 4758 * RW already. But RO in the other cpu's tlb causes 4759 * the fault. Since hat_chgprot won't do anything if 4760 * pte doesn't change, we may end up faulting 4761 * indefinitely until the RO tlb entry gets replaced. 4762 */ 4763 for (a = addr; a < addr + len; a += PAGESIZE, index++) { 4764 anon_array_enter(amp, index, &cookie); 4765 ap = anon_get_ptr(amp->ahp, index); 4766 anon_array_exit(&cookie); 4767 if ((ap == NULL) || (ap->an_refcnt != 1)) { 4768 ANON_LOCK_EXIT(&->a_rwlock); 4769 goto slow; 4770 } 4771 } 4772 hat_chgprot(seg->s_as->a_hat, addr, len, svd->prot); 4773 ANON_LOCK_EXIT(&->a_rwlock); 4774 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4775 return (0); 4776 } 4777 } 4778 slow: 4779 4780 if (svd->vpage == NULL) 4781 vpage = NULL; 4782 else 4783 vpage = &svd->vpage[page]; 4784 4785 off = svd->offset + (uintptr_t)(addr - seg->s_base); 4786 4787 /* 4788 * If MADV_SEQUENTIAL has been set for the particular page we 4789 * are faulting on, free behind all pages in the segment and put 4790 * them on the free list. 
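 *
 * The walk implemented just below, reduced to its shape: starting at the
 * page before the faulting one, step backward toward the start of the
 * segment and hand each already-present, unlocked page back to the
 * filesystem asynchronously, stopping as soon as a page is missing,
 * locked, or the sequential hint no longer applies.  (Hypothetical
 * callback; not the kernel interfaces.)
 */
#if 0	/* illustrative sketch only; never compiled */
#include <stdint.h>

#define	MY_PAGESIZE	4096UL		/* illustration; real value is per-platform */

static void
free_behind(uint64_t fault_off, uint64_t seg_start_off,
    int (*try_release_page)(uint64_t off))
{
    uint64_t off;

    if (fault_off <= seg_start_off)	/* first page: nothing behind us */
        return;
    for (off = fault_off - MY_PAGESIZE; off > seg_start_off;
        off -= MY_PAGESIZE) {
        if (!try_release_page(off))
            break;			/* missing, locked, or hint gone */
    }
}
#endif
/*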
4791 */ 4792 if ((page != 0) && fltadvice) { /* not if first page in segment */ 4793 struct vpage *vpp; 4794 ulong_t fanon_index; 4795 size_t fpage; 4796 u_offset_t pgoff, fpgoff; 4797 struct vnode *fvp; 4798 struct anon *fap = NULL; 4799 4800 if (svd->advice == MADV_SEQUENTIAL || 4801 (svd->pageadvice && 4802 VPP_ADVICE(vpage) == MADV_SEQUENTIAL)) { 4803 pgoff = off - PAGESIZE; 4804 fpage = page - 1; 4805 if (vpage != NULL) 4806 vpp = &svd->vpage[fpage]; 4807 if (amp != NULL) 4808 fanon_index = svd->anon_index + fpage; 4809 4810 while (pgoff > svd->offset) { 4811 if (svd->advice != MADV_SEQUENTIAL && 4812 (!svd->pageadvice || (vpage && 4813 VPP_ADVICE(vpp) != MADV_SEQUENTIAL))) 4814 break; 4815 4816 /* 4817 * If this is an anon page, we must find the 4818 * correct <vp, offset> for it 4819 */ 4820 fap = NULL; 4821 if (amp != NULL) { 4822 ANON_LOCK_ENTER(&->a_rwlock, 4823 RW_READER); 4824 anon_array_enter(amp, fanon_index, 4825 &cookie); 4826 fap = anon_get_ptr(amp->ahp, 4827 fanon_index); 4828 if (fap != NULL) { 4829 swap_xlate(fap, &fvp, &fpgoff); 4830 } else { 4831 fpgoff = pgoff; 4832 fvp = svd->vp; 4833 } 4834 anon_array_exit(&cookie); 4835 ANON_LOCK_EXIT(&->a_rwlock); 4836 } else { 4837 fpgoff = pgoff; 4838 fvp = svd->vp; 4839 } 4840 if (fvp == NULL) 4841 break; /* XXX */ 4842 /* 4843 * Skip pages that are free or have an 4844 * "exclusive" lock. 4845 */ 4846 pp = page_lookup_nowait(fvp, fpgoff, SE_SHARED); 4847 if (pp == NULL) 4848 break; 4849 /* 4850 * We don't need the page_struct_lock to test 4851 * as this is only advisory; even if we 4852 * acquire it someone might race in and lock 4853 * the page after we unlock and before the 4854 * PUTPAGE, then VOP_PUTPAGE will do nothing. 4855 */ 4856 if (pp->p_lckcnt == 0 && pp->p_cowcnt == 0) { 4857 /* 4858 * Hold the vnode before releasing 4859 * the page lock to prevent it from 4860 * being freed and re-used by some 4861 * other thread. 4862 */ 4863 VN_HOLD(fvp); 4864 page_unlock(pp); 4865 /* 4866 * We should build a page list 4867 * to kluster putpages XXX 4868 */ 4869 (void) VOP_PUTPAGE(fvp, 4870 (offset_t)fpgoff, PAGESIZE, 4871 (B_DONTNEED|B_FREE|B_ASYNC), 4872 svd->cred); 4873 VN_RELE(fvp); 4874 } else { 4875 /* 4876 * XXX - Should the loop terminate if 4877 * the page is `locked'? 4878 */ 4879 page_unlock(pp); 4880 } 4881 --vpp; 4882 --fanon_index; 4883 pgoff -= PAGESIZE; 4884 } 4885 } 4886 } 4887 4888 plp = pl; 4889 *plp = NULL; 4890 pl_alloc_sz = 0; 4891 4892 /* 4893 * See if we need to call VOP_GETPAGE for 4894 * *any* of the range being faulted on. 4895 * We can skip all of this work if there 4896 * was no original vnode. 4897 */ 4898 if (svd->vp != NULL) { 4899 u_offset_t vp_off; 4900 size_t vp_len; 4901 struct anon *ap; 4902 vnode_t *vp; 4903 4904 vp_off = off; 4905 vp_len = len; 4906 4907 if (amp == NULL) 4908 dogetpage = 1; 4909 else { 4910 /* 4911 * Only acquire reader lock to prevent amp->ahp 4912 * from being changed. 
It's ok to miss pages, 4913 * hence we don't do anon_array_enter 4914 */ 4915 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4916 ap = anon_get_ptr(amp->ahp, anon_index); 4917 4918 if (len <= PAGESIZE) 4919 /* inline non_anon() */ 4920 dogetpage = (ap == NULL); 4921 else 4922 dogetpage = non_anon(amp->ahp, anon_index, 4923 &vp_off, &vp_len); 4924 ANON_LOCK_EXIT(&->a_rwlock); 4925 } 4926 4927 if (dogetpage) { 4928 enum seg_rw arw; 4929 struct as *as = seg->s_as; 4930 4931 if (len > ptob((sizeof (pl) / sizeof (pl[0])) - 1)) { 4932 /* 4933 * Page list won't fit in local array, 4934 * allocate one of the needed size. 4935 */ 4936 pl_alloc_sz = 4937 (btop(len) + 1) * sizeof (page_t *); 4938 plp = kmem_alloc(pl_alloc_sz, KM_SLEEP); 4939 plp[0] = NULL; 4940 plsz = len; 4941 } else if (rw == S_WRITE && svd->type == MAP_PRIVATE || 4942 rw == S_OTHER || 4943 (((size_t)(addr + PAGESIZE) < 4944 (size_t)(seg->s_base + seg->s_size)) && 4945 hat_probe(as->a_hat, addr + PAGESIZE))) { 4946 /* 4947 * Ask VOP_GETPAGE to return the exact number 4948 * of pages if 4949 * (a) this is a COW fault, or 4950 * (b) this is a software fault, or 4951 * (c) next page is already mapped. 4952 */ 4953 plsz = len; 4954 } else { 4955 /* 4956 * Ask VOP_GETPAGE to return adjacent pages 4957 * within the segment. 4958 */ 4959 plsz = MIN((size_t)PVN_GETPAGE_SZ, (size_t) 4960 ((seg->s_base + seg->s_size) - addr)); 4961 ASSERT((addr + plsz) <= 4962 (seg->s_base + seg->s_size)); 4963 } 4964 4965 /* 4966 * Need to get some non-anonymous pages. 4967 * We need to make only one call to GETPAGE to do 4968 * this to prevent certain deadlocking conditions 4969 * when we are doing locking. In this case 4970 * non_anon() should have picked up the smallest 4971 * range which includes all the non-anonymous 4972 * pages in the requested range. We have to 4973 * be careful regarding which rw flag to pass in 4974 * because on a private mapping, the underlying 4975 * object is never allowed to be written. 4976 */ 4977 if (rw == S_WRITE && svd->type == MAP_PRIVATE) { 4978 arw = S_READ; 4979 } else { 4980 arw = rw; 4981 } 4982 vp = svd->vp; 4983 TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE, 4984 "segvn_getpage:seg %p addr %p vp %p", 4985 seg, addr, vp); 4986 err = VOP_GETPAGE(vp, (offset_t)vp_off, vp_len, 4987 &vpprot, plp, plsz, seg, addr + (vp_off - off), arw, 4988 svd->cred); 4989 if (err) { 4990 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4991 segvn_pagelist_rele(plp); 4992 if (pl_alloc_sz) 4993 kmem_free(plp, pl_alloc_sz); 4994 return (FC_MAKE_ERR(err)); 4995 } 4996 if (svd->type == MAP_PRIVATE) 4997 vpprot &= ~PROT_WRITE; 4998 } 4999 } 5000 5001 /* 5002 * N.B. at this time the plp array has all the needed non-anon 5003 * pages in addition to (possibly) having some adjacent pages. 5004 */ 5005 5006 /* 5007 * Always acquire the anon_array_lock to prevent 5008 * 2 threads from allocating separate anon slots for 5009 * the same "addr". 5010 * 5011 * If this is a copy-on-write fault and we don't already 5012 * have the anon_array_lock, acquire it to prevent the 5013 * fault routine from handling multiple copy-on-write faults 5014 * on the same "addr" in the same address space. 5015 * 5016 * Only one thread should deal with the fault since after 5017 * it is handled, the other threads can acquire a translation 5018 * to the newly created private page. This prevents two or 5019 * more threads from creating different private pages for the 5020 * same fault. 
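 *
 * A little above, the size of the page list handed to VOP_GETPAGE() is
 * chosen: exactly the faulting range when read-ahead would be wasted
 * (COW or software faults, or the next page is already mapped), and
 * otherwise a klustered amount capped and clipped at the end of the
 * segment.  Condensed restatement (the cap value and helper name are
 * assumptions for illustration):
 */
#if 0	/* illustrative sketch only; never compiled */
#include <stddef.h>

#define	MY_KLUSTER_MAX	(16UL * 4096UL)		/* stand-in for PVN_GETPAGE_SZ */

static size_t
getpage_list_size(size_t fault_len, size_t bytes_left_in_seg, int want_exact)
{
    if (want_exact)
        return (fault_len);
    return (bytes_left_in_seg < MY_KLUSTER_MAX ?
        bytes_left_in_seg : MY_KLUSTER_MAX);
}
#endif
/*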
5021 * 5022 * We grab "serialization" lock here if this is a MAP_PRIVATE segment 5023 * to prevent deadlock between this thread and another thread 5024 * which has soft-locked this page and wants to acquire serial_lock. 5025 * ( bug 4026339 ) 5026 * 5027 * The fix for bug 4026339 becomes unnecessary when using the 5028 * locking scheme with per amp rwlock and a global set of hash 5029 * lock, anon_array_lock. If we steal a vnode page when low 5030 * on memory and upgrad the page lock through page_rename, 5031 * then the page is PAGE_HANDLED, nothing needs to be done 5032 * for this page after returning from segvn_faultpage. 5033 * 5034 * But really, the page lock should be downgraded after 5035 * the stolen page is page_rename'd. 5036 */ 5037 5038 if (amp != NULL) 5039 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5040 5041 /* 5042 * Ok, now loop over the address range and handle faults 5043 */ 5044 for (a = addr; a < addr + len; a += PAGESIZE, off += PAGESIZE) { 5045 err = segvn_faultpage(hat, seg, a, off, vpage, plp, vpprot, 5046 type, rw, brkcow, a == addr); 5047 if (err) { 5048 if (amp != NULL) 5049 ANON_LOCK_EXIT(&->a_rwlock); 5050 if (type == F_SOFTLOCK && a > addr) { 5051 segvn_softunlock(seg, addr, (a - addr), 5052 S_OTHER); 5053 } 5054 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5055 segvn_pagelist_rele(plp); 5056 if (pl_alloc_sz) 5057 kmem_free(plp, pl_alloc_sz); 5058 return (err); 5059 } 5060 if (vpage) { 5061 vpage++; 5062 } else if (svd->vpage) { 5063 page = seg_page(seg, addr); 5064 vpage = &svd->vpage[++page]; 5065 } 5066 } 5067 5068 /* Didn't get pages from the underlying fs so we're done */ 5069 if (!dogetpage) 5070 goto done; 5071 5072 /* 5073 * Now handle any other pages in the list returned. 5074 * If the page can be used, load up the translations now. 5075 * Note that the for loop will only be entered if "plp" 5076 * is pointing to a non-NULL page pointer which means that 5077 * VOP_GETPAGE() was called and vpprot has been initialized. 5078 */ 5079 if (svd->pageprot == 0) 5080 prot = svd->prot & vpprot; 5081 5082 5083 /* 5084 * Large Files: diff should be unsigned value because we started 5085 * supporting > 2GB segment sizes from 2.5.1 and when a 5086 * large file of size > 2GB gets mapped to address space 5087 * the diff value can be > 2GB. 5088 */ 5089 5090 for (ppp = plp; (pp = *ppp) != NULL; ppp++) { 5091 size_t diff; 5092 struct anon *ap; 5093 int anon_index; 5094 anon_sync_obj_t cookie; 5095 int hat_flag = HAT_LOAD_ADV; 5096 5097 if (svd->flags & MAP_TEXT) { 5098 hat_flag |= HAT_LOAD_TEXT; 5099 } 5100 5101 if (pp == PAGE_HANDLED) 5102 continue; 5103 5104 if (pp->p_offset >= svd->offset && 5105 (pp->p_offset < svd->offset + seg->s_size)) { 5106 5107 diff = pp->p_offset - svd->offset; 5108 5109 /* 5110 * Large Files: Following is the assertion 5111 * validating the above cast. 5112 */ 5113 ASSERT(svd->vp == pp->p_vnode); 5114 5115 page = btop(diff); 5116 if (svd->pageprot) 5117 prot = VPP_PROT(&svd->vpage[page]) & vpprot; 5118 5119 /* 5120 * Prevent other threads in the address space from 5121 * creating private pages (i.e., allocating anon slots) 5122 * while we are in the process of loading translations 5123 * to additional pages returned by the underlying 5124 * object. 
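 *
 * The remainder of this loop, reduced to its shape: for each extra page
 * the filesystem returned, load a translation only when the page's offset
 * falls inside the segment and no private (anonymous) copy already
 * shadows it, then drop the page lock.  (Hypothetical callbacks; details
 * such as PAGE_HANDLED entries and mod-bit handling are omitted.)
 */
#if 0	/* illustrative sketch only; never compiled */
#include <stdint.h>
#include <stddef.h>

static void
map_extra_pages(const uint64_t *page_off, size_t npages,
    uint64_t seg_off, uint64_t seg_len,
    int (*has_anon_copy)(uint64_t off),
    void (*load_translation)(uint64_t off),
    void (*unlock_page)(size_t idx))
{
    size_t i;

    for (i = 0; i < npages; i++) {
        uint64_t off = page_off[i];

        if (off >= seg_off && off < seg_off + seg_len &&
            !has_anon_copy(off))
            load_translation(off);
        unlock_page(i);
    }
}
#endif
/*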
5125 */ 5126 if (amp != NULL) { 5127 anon_index = svd->anon_index + page; 5128 anon_array_enter(amp, anon_index, &cookie); 5129 ap = anon_get_ptr(amp->ahp, anon_index); 5130 } 5131 if ((amp == NULL) || (ap == NULL)) { 5132 if (IS_VMODSORT(pp->p_vnode) || 5133 enable_mbit_wa) { 5134 if (rw == S_WRITE) 5135 hat_setmod(pp); 5136 else if (rw != S_OTHER && 5137 !hat_ismod(pp)) 5138 prot &= ~PROT_WRITE; 5139 } 5140 /* 5141 * Skip mapping read ahead pages marked 5142 * for migration, so they will get migrated 5143 * properly on fault 5144 */ 5145 if ((prot & PROT_READ) && !PP_ISMIGRATE(pp)) { 5146 hat_memload(hat, seg->s_base + diff, 5147 pp, prot, hat_flag); 5148 } 5149 } 5150 if (amp != NULL) 5151 anon_array_exit(&cookie); 5152 } 5153 page_unlock(pp); 5154 } 5155 done: 5156 if (amp != NULL) 5157 ANON_LOCK_EXIT(&->a_rwlock); 5158 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5159 if (pl_alloc_sz) 5160 kmem_free(plp, pl_alloc_sz); 5161 return (0); 5162 } 5163 5164 /* 5165 * This routine is used to start I/O on pages asynchronously. XXX it will 5166 * only create PAGESIZE pages. At fault time they will be relocated into 5167 * larger pages. 5168 */ 5169 static faultcode_t 5170 segvn_faulta(struct seg *seg, caddr_t addr) 5171 { 5172 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5173 int err; 5174 struct anon_map *amp; 5175 vnode_t *vp; 5176 5177 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5178 5179 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 5180 if ((amp = svd->amp) != NULL) { 5181 struct anon *ap; 5182 5183 /* 5184 * Reader lock to prevent amp->ahp from being changed. 5185 * This is advisory, it's ok to miss a page, so 5186 * we don't do anon_array_enter lock. 5187 */ 5188 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5189 if ((ap = anon_get_ptr(amp->ahp, 5190 svd->anon_index + seg_page(seg, addr))) != NULL) { 5191 5192 err = anon_getpage(&ap, NULL, NULL, 5193 0, seg, addr, S_READ, svd->cred); 5194 5195 ANON_LOCK_EXIT(&->a_rwlock); 5196 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5197 if (err) 5198 return (FC_MAKE_ERR(err)); 5199 return (0); 5200 } 5201 ANON_LOCK_EXIT(&->a_rwlock); 5202 } 5203 5204 if (svd->vp == NULL) { 5205 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5206 return (0); /* zfod page - do nothing now */ 5207 } 5208 5209 vp = svd->vp; 5210 TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE, 5211 "segvn_getpage:seg %p addr %p vp %p", seg, addr, vp); 5212 err = VOP_GETPAGE(vp, 5213 (offset_t)(svd->offset + (uintptr_t)(addr - seg->s_base)), 5214 PAGESIZE, NULL, NULL, 0, seg, addr, 5215 S_OTHER, svd->cred); 5216 5217 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5218 if (err) 5219 return (FC_MAKE_ERR(err)); 5220 return (0); 5221 } 5222 5223 static int 5224 segvn_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) 5225 { 5226 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5227 struct vpage *svp, *evp; 5228 struct vnode *vp; 5229 size_t pgsz; 5230 pgcnt_t pgcnt; 5231 anon_sync_obj_t cookie; 5232 5233 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5234 5235 if ((svd->maxprot & prot) != prot) 5236 return (EACCES); /* violated maxprot */ 5237 5238 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 5239 5240 /* return if prot is the same */ 5241 if (!svd->pageprot && svd->prot == prot) { 5242 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5243 return (0); 5244 } 5245 5246 /* 5247 * Since we change protections we first have to flush the cache. 5248 * This makes sure all the pagelock calls have to recheck 5249 * protections. 
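 *
 * The pattern used just below (and again in segvn_setpagesize): flush the
 * per-segment pagelock cache once, and if softlocked pages still remain,
 * give up with EAGAIN instead of waiting for the outstanding I/O to
 * drain.  Minimal restatement (names are assumptions, not kernel code):
 */
#if 0	/* illustrative sketch only; never compiled */
static int
flush_then_recheck(volatile unsigned long *softlocked,
    void (*purge_cache)(void))
{
    if (*softlocked > 0) {
        purge_cache();			/* may release cached pagelocks */
        if (*softlocked > 0)
            return (-1);		/* caller maps this to EAGAIN */
    }
    return (0);
}
#endif
/*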
5250 */ 5251 if (svd->softlockcnt > 0) { 5252 /* 5253 * Since we do have the segvn writers lock nobody can fill 5254 * the cache with entries belonging to this seg during 5255 * the purge. The flush either succeeds or we still have 5256 * pending I/Os. 5257 */ 5258 segvn_purge(seg); 5259 if (svd->softlockcnt > 0) { 5260 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5261 return (EAGAIN); 5262 } 5263 } 5264 5265 if (seg->s_szc != 0) { 5266 int err; 5267 pgsz = page_get_pagesize(seg->s_szc); 5268 pgcnt = pgsz >> PAGESHIFT; 5269 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 5270 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { 5271 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5272 ASSERT(seg->s_base != addr || seg->s_size != len); 5273 /* 5274 * If we are holding the as lock as a reader then 5275 * we need to return IE_RETRY and let the as 5276 * layer drop and re-aquire the lock as a writer. 5277 */ 5278 if (AS_READ_HELD(seg->s_as, &seg->s_as->a_lock)) 5279 return (IE_RETRY); 5280 VM_STAT_ADD(segvnvmstats.demoterange[1]); 5281 if (svd->type == MAP_PRIVATE || svd->vp != NULL) { 5282 err = segvn_demote_range(seg, addr, len, 5283 SDR_END, 0); 5284 } else { 5285 uint_t szcvec = map_pgszcvec(seg->s_base, 5286 pgsz, (uintptr_t)seg->s_base, 5287 (svd->flags & MAP_TEXT), MAPPGSZC_SHM, 0); 5288 err = segvn_demote_range(seg, addr, len, 5289 SDR_END, szcvec); 5290 } 5291 if (err == 0) 5292 return (IE_RETRY); 5293 if (err == ENOMEM) 5294 return (IE_NOMEM); 5295 return (err); 5296 } 5297 } 5298 5299 5300 /* 5301 * If it's a private mapping and we're making it writable 5302 * and no swap space has been reserved, have to reserve 5303 * it all now. If it's a private mapping to a file (i.e., vp != NULL) 5304 * and we're removing write permission on the entire segment and 5305 * we haven't modified any pages, we can release the swap space. 5306 */ 5307 if (svd->type == MAP_PRIVATE) { 5308 if (prot & PROT_WRITE) { 5309 size_t sz; 5310 if (svd->swresv == 0 && !(svd->flags & MAP_NORESERVE)) { 5311 if (anon_resv_zone(seg->s_size, 5312 seg->s_as->a_proc->p_zone) == 0) { 5313 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5314 return (IE_NOMEM); 5315 } 5316 sz = svd->swresv = seg->s_size; 5317 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 5318 "anon proc:%p %lu %u", 5319 seg, sz, 1); 5320 } 5321 } else { 5322 /* 5323 * Swap space is released only if this segment 5324 * does not map anonymous memory, since read faults 5325 * on such segments still need an anon slot to read 5326 * in the data. 5327 */ 5328 if (svd->swresv != 0 && svd->vp != NULL && 5329 svd->amp == NULL && addr == seg->s_base && 5330 len == seg->s_size && svd->pageprot == 0) { 5331 anon_unresv_zone(svd->swresv, 5332 seg->s_as->a_proc->p_zone); 5333 svd->swresv = 0; 5334 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 5335 "anon proc:%p %lu %u", 5336 seg, 0, 0); 5337 } 5338 } 5339 } 5340 5341 if (addr == seg->s_base && len == seg->s_size && svd->pageprot == 0) { 5342 if (svd->prot == prot) { 5343 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5344 return (0); /* all done */ 5345 } 5346 svd->prot = (uchar_t)prot; 5347 } else if (svd->type == MAP_PRIVATE) { 5348 struct anon *ap = NULL; 5349 page_t *pp; 5350 u_offset_t offset, off; 5351 struct anon_map *amp; 5352 ulong_t anon_idx = 0; 5353 5354 /* 5355 * A vpage structure exists or else the change does not 5356 * involve the entire segment. Establish a vpage structure 5357 * if none is there. Then, for each page in the range, 5358 * adjust its individual permissions. 
Note that write- 5359 * enabling a MAP_PRIVATE page can affect the claims for 5360 * locked down memory. Overcommitting memory terminates 5361 * the operation. 5362 */ 5363 segvn_vpage(seg); 5364 if ((amp = svd->amp) != NULL) { 5365 anon_idx = svd->anon_index + seg_page(seg, addr); 5366 ASSERT(seg->s_szc == 0 || 5367 IS_P2ALIGNED(anon_idx, pgcnt)); 5368 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5369 } 5370 5371 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 5372 evp = &svd->vpage[seg_page(seg, addr + len)]; 5373 5374 /* 5375 * See Statement at the beginning of segvn_lockop regarding 5376 * the way cowcnts and lckcnts are handled. 5377 */ 5378 for (svp = &svd->vpage[seg_page(seg, addr)]; svp < evp; svp++) { 5379 5380 if (seg->s_szc != 0) { 5381 if (amp != NULL) { 5382 anon_array_enter(amp, anon_idx, 5383 &cookie); 5384 } 5385 if (IS_P2ALIGNED(anon_idx, pgcnt) && 5386 !segvn_claim_pages(seg, svp, offset, 5387 anon_idx, prot)) { 5388 if (amp != NULL) { 5389 anon_array_exit(&cookie); 5390 } 5391 break; 5392 } 5393 if (amp != NULL) { 5394 anon_array_exit(&cookie); 5395 } 5396 anon_idx++; 5397 } else { 5398 if (amp != NULL) { 5399 anon_array_enter(amp, anon_idx, 5400 &cookie); 5401 ap = anon_get_ptr(amp->ahp, anon_idx++); 5402 } 5403 5404 if (VPP_ISPPLOCK(svp) && 5405 VPP_PROT(svp) != prot) { 5406 5407 if (amp == NULL || ap == NULL) { 5408 vp = svd->vp; 5409 off = offset; 5410 } else 5411 swap_xlate(ap, &vp, &off); 5412 if (amp != NULL) 5413 anon_array_exit(&cookie); 5414 5415 if ((pp = page_lookup(vp, off, 5416 SE_SHARED)) == NULL) { 5417 panic("segvn_setprot: no page"); 5418 /*NOTREACHED*/ 5419 } 5420 ASSERT(seg->s_szc == 0); 5421 if ((VPP_PROT(svp) ^ prot) & 5422 PROT_WRITE) { 5423 if (prot & PROT_WRITE) { 5424 if (!page_addclaim(pp)) { 5425 page_unlock(pp); 5426 break; 5427 } 5428 } else { 5429 if (!page_subclaim(pp)) { 5430 page_unlock(pp); 5431 break; 5432 } 5433 } 5434 } 5435 page_unlock(pp); 5436 } else if (amp != NULL) 5437 anon_array_exit(&cookie); 5438 } 5439 VPP_SETPROT(svp, prot); 5440 offset += PAGESIZE; 5441 } 5442 if (amp != NULL) 5443 ANON_LOCK_EXIT(&->a_rwlock); 5444 5445 /* 5446 * Did we terminate prematurely? If so, simply unload 5447 * the translations to the things we've updated so far. 5448 */ 5449 if (svp != evp) { 5450 len = (svp - &svd->vpage[seg_page(seg, addr)]) * 5451 PAGESIZE; 5452 ASSERT(seg->s_szc == 0 || IS_P2ALIGNED(len, pgsz)); 5453 if (len != 0) 5454 hat_unload(seg->s_as->a_hat, addr, 5455 len, HAT_UNLOAD); 5456 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5457 return (IE_NOMEM); 5458 } 5459 } else { 5460 segvn_vpage(seg); 5461 evp = &svd->vpage[seg_page(seg, addr + len)]; 5462 for (svp = &svd->vpage[seg_page(seg, addr)]; svp < evp; svp++) { 5463 VPP_SETPROT(svp, prot); 5464 } 5465 } 5466 5467 if (((prot & PROT_WRITE) != 0 && 5468 (svd->vp != NULL || svd->type == MAP_PRIVATE)) || 5469 (prot & ~PROT_USER) == PROT_NONE) { 5470 /* 5471 * Either private or shared data with write access (in 5472 * which case we need to throw out all former translations 5473 * so that we get the right translations set up on fault 5474 * and we don't allow write access to any copy-on-write pages 5475 * that might be around or to prevent write access to pages 5476 * representing holes in a file), or we don't have permission 5477 * to access the memory at all (in which case we have to 5478 * unload any current translations that might exist). 
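 *
 * The decision made immediately below, restated as a predicate: existing
 * translations must be thrown away when write access is being granted to
 * a mapping backed by a file or marked private (so COW pages and file
 * holes are re-faulted), or when all access is being revoked; otherwise
 * the translations can simply have their attributes changed in place.
 * (The PR_* values are stand-ins for the PROT_* flags.)
 */
#if 0	/* illustrative sketch only; never compiled */
#define	PR_WRITE	0x2
#define	PR_USER		0x8		/* stand-in for PROT_USER */

static int
must_unload_translations(unsigned int newprot, int is_private, int has_vnode)
{
    if ((newprot & PR_WRITE) != 0 && (has_vnode || is_private))
        return (1);
    if ((newprot & ~PR_USER) == 0)	/* effectively PROT_NONE */
        return (1);
    return (0);
}
#endif
/*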
5479 */ 5480 hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD); 5481 } else { 5482 /* 5483 * A shared mapping or a private mapping in which write 5484 * protection is going to be denied - just change all the 5485 * protections over the range of addresses in question. 5486 * segvn does not support any other attributes other 5487 * than prot so we can use hat_chgattr. 5488 */ 5489 hat_chgattr(seg->s_as->a_hat, addr, len, prot); 5490 } 5491 5492 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5493 5494 return (0); 5495 } 5496 5497 /* 5498 * segvn_setpagesize is called via SEGOP_SETPAGESIZE from as_setpagesize, 5499 * to determine if the seg is capable of mapping the requested szc. 5500 */ 5501 static int 5502 segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc) 5503 { 5504 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5505 struct segvn_data *nsvd; 5506 struct anon_map *amp = svd->amp; 5507 struct seg *nseg; 5508 caddr_t eaddr = addr + len, a; 5509 size_t pgsz = page_get_pagesize(szc); 5510 pgcnt_t pgcnt = page_get_pagecnt(szc); 5511 int err; 5512 u_offset_t off = svd->offset + (uintptr_t)(addr - seg->s_base); 5513 extern struct vnode kvp; 5514 5515 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 5516 ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size); 5517 5518 if (seg->s_szc == szc || segvn_lpg_disable != 0) { 5519 return (0); 5520 } 5521 5522 /* 5523 * addr should always be pgsz aligned but eaddr may be misaligned if 5524 * it's at the end of the segment. 5525 * 5526 * XXX we should assert this condition since as_setpagesize() logic 5527 * guarantees it. 5528 */ 5529 if (!IS_P2ALIGNED(addr, pgsz) || 5530 (!IS_P2ALIGNED(eaddr, pgsz) && 5531 eaddr != seg->s_base + seg->s_size)) { 5532 5533 segvn_setpgsz_align_err++; 5534 return (EINVAL); 5535 } 5536 5537 if (amp != NULL && svd->type == MAP_SHARED) { 5538 ulong_t an_idx = svd->anon_index + seg_page(seg, addr); 5539 if (!IS_P2ALIGNED(an_idx, pgcnt)) { 5540 5541 segvn_setpgsz_anon_align_err++; 5542 return (EINVAL); 5543 } 5544 } 5545 5546 if ((svd->flags & MAP_NORESERVE) || seg->s_as == &kas || 5547 szc > segvn_maxpgszc) { 5548 return (EINVAL); 5549 } 5550 5551 /* paranoid check */ 5552 if (svd->vp != NULL && 5553 (IS_SWAPFSVP(svd->vp) || VN_ISKAS(svd->vp))) { 5554 return (EINVAL); 5555 } 5556 5557 if (seg->s_szc == 0 && svd->vp != NULL && 5558 map_addr_vacalign_check(addr, off)) { 5559 return (EINVAL); 5560 } 5561 5562 /* 5563 * Check that protections are the same within new page 5564 * size boundaries. 5565 */ 5566 if (svd->pageprot) { 5567 for (a = addr; a < eaddr; a += pgsz) { 5568 if ((a + pgsz) > eaddr) { 5569 if (!sameprot(seg, a, eaddr - a)) { 5570 return (EINVAL); 5571 } 5572 } else { 5573 if (!sameprot(seg, a, pgsz)) { 5574 return (EINVAL); 5575 } 5576 } 5577 } 5578 } 5579 5580 /* 5581 * Since we are changing page size we first have to flush 5582 * the cache. This makes sure all the pagelock calls have 5583 * to recheck protections. 5584 */ 5585 if (svd->softlockcnt > 0) { 5586 /* 5587 * Since we do have the segvn writers lock nobody can fill 5588 * the cache with entries belonging to this seg during 5589 * the purge. The flush either succeeds or we still have 5590 * pending I/Os. 5591 */ 5592 segvn_purge(seg); 5593 if (svd->softlockcnt > 0) { 5594 return (EAGAIN); 5595 } 5596 } 5597 5598 /* 5599 * Operation for sub range of existing segment. 
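 *
 * When the request covers only part of the segment and the page size is
 * growing, the code below isolates the request by splitting at its
 * (aligned) boundaries and then retries on the carved-out piece; a
 * shrinking page size demotes the range in place instead.  The splitting
 * step, reduced to its shape (`split_at' is a hypothetical stand-in for
 * segvn_split_seg()):
 */
#if 0	/* illustrative sketch only; never compiled */
#include <stdint.h>

struct range {
    uintptr_t base;
    uintptr_t size;
};

static void
isolate_subrange(struct range *seg, uintptr_t addr, uintptr_t eaddr,
    struct range *(*split_at)(struct range *, uintptr_t))
{
    struct range *target = seg;

    if (addr != seg->base)
        target = split_at(seg, addr);		/* right piece begins at addr */
    if (eaddr != target->base + target->size)
        (void) split_at(target, eaddr);		/* trim everything past eaddr */
}
#endif
/*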
5600 */ 5601 if (addr != seg->s_base || eaddr != (seg->s_base + seg->s_size)) { 5602 if (szc < seg->s_szc) { 5603 VM_STAT_ADD(segvnvmstats.demoterange[2]); 5604 err = segvn_demote_range(seg, addr, len, SDR_RANGE, 0); 5605 if (err == 0) { 5606 return (IE_RETRY); 5607 } 5608 if (err == ENOMEM) { 5609 return (IE_NOMEM); 5610 } 5611 return (err); 5612 } 5613 if (addr != seg->s_base) { 5614 nseg = segvn_split_seg(seg, addr); 5615 if (eaddr != (nseg->s_base + nseg->s_size)) { 5616 /* eaddr is szc aligned */ 5617 (void) segvn_split_seg(nseg, eaddr); 5618 } 5619 return (IE_RETRY); 5620 } 5621 if (eaddr != (seg->s_base + seg->s_size)) { 5622 /* eaddr is szc aligned */ 5623 (void) segvn_split_seg(seg, eaddr); 5624 } 5625 return (IE_RETRY); 5626 } 5627 5628 /* 5629 * Break any low level sharing and reset seg->s_szc to 0. 5630 */ 5631 if ((err = segvn_clrszc(seg)) != 0) { 5632 if (err == ENOMEM) { 5633 err = IE_NOMEM; 5634 } 5635 return (err); 5636 } 5637 ASSERT(seg->s_szc == 0); 5638 5639 /* 5640 * If the end of the current segment is not pgsz aligned 5641 * then attempt to concatenate with the next segment. 5642 */ 5643 if (!IS_P2ALIGNED(eaddr, pgsz)) { 5644 nseg = AS_SEGNEXT(seg->s_as, seg); 5645 if (nseg == NULL || nseg == seg || eaddr != nseg->s_base) { 5646 return (ENOMEM); 5647 } 5648 if (nseg->s_ops != &segvn_ops) { 5649 return (EINVAL); 5650 } 5651 nsvd = (struct segvn_data *)nseg->s_data; 5652 if (nsvd->softlockcnt > 0) { 5653 segvn_purge(nseg); 5654 if (nsvd->softlockcnt > 0) { 5655 return (EAGAIN); 5656 } 5657 } 5658 err = segvn_clrszc(nseg); 5659 if (err == ENOMEM) { 5660 err = IE_NOMEM; 5661 } 5662 if (err != 0) { 5663 return (err); 5664 } 5665 err = segvn_concat(seg, nseg, 1); 5666 if (err == -1) { 5667 return (EINVAL); 5668 } 5669 if (err == -2) { 5670 return (IE_NOMEM); 5671 } 5672 return (IE_RETRY); 5673 } 5674 5675 /* 5676 * May need to re-align anon array to 5677 * new szc. 5678 */ 5679 if (amp != NULL) { 5680 if (!IS_P2ALIGNED(svd->anon_index, pgcnt)) { 5681 struct anon_hdr *nahp; 5682 5683 ASSERT(svd->type == MAP_PRIVATE); 5684 5685 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 5686 ASSERT(amp->refcnt == 1); 5687 nahp = anon_create(btop(amp->size), ANON_NOSLEEP); 5688 if (nahp == NULL) { 5689 ANON_LOCK_EXIT(&->a_rwlock); 5690 return (IE_NOMEM); 5691 } 5692 if (anon_copy_ptr(amp->ahp, svd->anon_index, 5693 nahp, 0, btop(seg->s_size), ANON_NOSLEEP)) { 5694 anon_release(nahp, btop(amp->size)); 5695 ANON_LOCK_EXIT(&->a_rwlock); 5696 return (IE_NOMEM); 5697 } 5698 anon_release(amp->ahp, btop(amp->size)); 5699 amp->ahp = nahp; 5700 svd->anon_index = 0; 5701 ANON_LOCK_EXIT(&->a_rwlock); 5702 } 5703 } 5704 if (svd->vp != NULL && szc != 0) { 5705 struct vattr va; 5706 u_offset_t eoffpage = svd->offset; 5707 va.va_mask = AT_SIZE; 5708 eoffpage += seg->s_size; 5709 eoffpage = btopr(eoffpage); 5710 if (VOP_GETATTR(svd->vp, &va, 0, svd->cred) != 0) { 5711 segvn_setpgsz_getattr_err++; 5712 return (EINVAL); 5713 } 5714 if (btopr(va.va_size) < eoffpage) { 5715 segvn_setpgsz_eof_err++; 5716 return (EINVAL); 5717 } 5718 if (amp != NULL) { 5719 /* 5720 * anon_fill_cow_holes() may call VOP_GETPAGE(). 5721 * don't take anon map lock here to avoid holding it 5722 * across VOP_GETPAGE() calls that may call back into 5723 * segvn for klsutering checks. We don't really need 5724 * anon map lock here since it's a private segment and 5725 * we hold as level lock as writers. 
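 *
 * The VOP_GETATTR() check a few lines above, in miniature: the mapping's
 * end offset, rounded up to whole pages, must not extend past the current
 * end of file, or the large-page request is refused.  (MY_PAGESHIFT and
 * the helper names are assumptions for illustration.)
 */
#if 0	/* illustrative sketch only; never compiled */
#include <stdint.h>

#define	MY_PAGESHIFT	12		/* illustration; real value is per-platform */
#define	MY_PAGESIZE	(1ULL << MY_PAGESHIFT)

/* Round a byte count up to whole pages, like btopr(). */
static uint64_t
pages_roundup(uint64_t bytes)
{
    return ((bytes + MY_PAGESIZE - 1) >> MY_PAGESHIFT);
}

static int
mapping_within_file(uint64_t file_size, uint64_t map_off, uint64_t map_len)
{
    return (pages_roundup(map_off + map_len) <= pages_roundup(file_size));
}
#endif
/*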
5726 */ 5727 if ((err = anon_fill_cow_holes(seg, seg->s_base, 5728 amp->ahp, svd->anon_index, svd->vp, svd->offset, 5729 seg->s_size, szc, svd->prot, svd->vpage, 5730 svd->cred)) != 0) { 5731 return (EINVAL); 5732 } 5733 } 5734 segvn_setvnode_mpss(svd->vp); 5735 } 5736 5737 if (amp != NULL) { 5738 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 5739 if (svd->type == MAP_PRIVATE) { 5740 amp->a_szc = szc; 5741 } else if (szc > amp->a_szc) { 5742 amp->a_szc = szc; 5743 } 5744 ANON_LOCK_EXIT(&->a_rwlock); 5745 } 5746 5747 seg->s_szc = szc; 5748 5749 return (0); 5750 } 5751 5752 static int 5753 segvn_clrszc(struct seg *seg) 5754 { 5755 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5756 struct anon_map *amp = svd->amp; 5757 size_t pgsz; 5758 pgcnt_t pages; 5759 int err = 0; 5760 caddr_t a = seg->s_base; 5761 caddr_t ea = a + seg->s_size; 5762 ulong_t an_idx = svd->anon_index; 5763 vnode_t *vp = svd->vp; 5764 struct vpage *vpage = svd->vpage; 5765 page_t *anon_pl[1 + 1], *pp; 5766 struct anon *ap, *oldap; 5767 uint_t prot = svd->prot, vpprot; 5768 int pageflag = 0; 5769 5770 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) || 5771 SEGVN_WRITE_HELD(seg->s_as, &svd->lock)); 5772 5773 if (vp == NULL && amp == NULL) { 5774 seg->s_szc = 0; 5775 return (0); 5776 } 5777 5778 /* 5779 * do HAT_UNLOAD_UNMAP since we are changing the pagesize. 5780 * unload argument is 0 when we are freeing the segment 5781 * and unload was already done. 5782 */ 5783 hat_unload(seg->s_as->a_hat, seg->s_base, seg->s_size, 5784 HAT_UNLOAD_UNMAP); 5785 5786 if (amp == NULL || svd->type == MAP_SHARED) { 5787 seg->s_szc = 0; 5788 return (0); 5789 } 5790 5791 pgsz = page_get_pagesize(seg->s_szc); 5792 pages = btop(pgsz); 5793 5794 /* 5795 * XXX anon rwlock is not really needed because this is a 5796 * private segment and we are writers. 5797 */ 5798 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 5799 5800 for (; a < ea; a += pgsz, an_idx += pages) { 5801 if ((oldap = anon_get_ptr(amp->ahp, an_idx)) != NULL) { 5802 ASSERT(vpage != NULL || svd->pageprot == 0); 5803 if (vpage != NULL) { 5804 ASSERT(sameprot(seg, a, pgsz)); 5805 prot = VPP_PROT(vpage); 5806 pageflag = VPP_ISPPLOCK(vpage) ? LOCK_PAGE : 0; 5807 } 5808 if (seg->s_szc != 0) { 5809 ASSERT(vp == NULL || anon_pages(amp->ahp, 5810 an_idx, pages) == pages); 5811 if ((err = anon_map_demotepages(amp, an_idx, 5812 seg, a, prot, vpage, svd->cred)) != 0) { 5813 goto out; 5814 } 5815 } else { 5816 if (oldap->an_refcnt == 1) { 5817 continue; 5818 } 5819 if ((err = anon_getpage(&oldap, &vpprot, 5820 anon_pl, PAGESIZE, seg, a, S_READ, 5821 svd->cred))) { 5822 goto out; 5823 } 5824 if ((pp = anon_private(&ap, seg, a, prot, 5825 anon_pl[0], pageflag, svd->cred)) == NULL) { 5826 err = ENOMEM; 5827 goto out; 5828 } 5829 anon_decref(oldap); 5830 (void) anon_set_ptr(amp->ahp, an_idx, ap, 5831 ANON_SLEEP); 5832 page_unlock(pp); 5833 } 5834 } 5835 vpage = (vpage == NULL) ? 
NULL : vpage + pages; 5836 } 5837 5838 amp->a_szc = 0; 5839 seg->s_szc = 0; 5840 out: 5841 ANON_LOCK_EXIT(&->a_rwlock); 5842 return (err); 5843 } 5844 5845 static int 5846 segvn_claim_pages( 5847 struct seg *seg, 5848 struct vpage *svp, 5849 u_offset_t off, 5850 ulong_t anon_idx, 5851 uint_t prot) 5852 { 5853 pgcnt_t pgcnt = page_get_pagecnt(seg->s_szc); 5854 size_t ppasize = (pgcnt + 1) * sizeof (page_t *); 5855 page_t **ppa; 5856 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5857 struct anon_map *amp = svd->amp; 5858 struct vpage *evp = svp + pgcnt; 5859 caddr_t addr = ((uintptr_t)(svp - svd->vpage) << PAGESHIFT) 5860 + seg->s_base; 5861 struct anon *ap; 5862 struct vnode *vp = svd->vp; 5863 page_t *pp; 5864 pgcnt_t pg_idx, i; 5865 int err = 0; 5866 anoff_t aoff; 5867 int anon = (amp != NULL) ? 1 : 0; 5868 5869 ASSERT(svd->type == MAP_PRIVATE); 5870 ASSERT(svd->vpage != NULL); 5871 ASSERT(seg->s_szc != 0); 5872 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 5873 ASSERT(amp == NULL || IS_P2ALIGNED(anon_idx, pgcnt)); 5874 ASSERT(sameprot(seg, addr, pgcnt << PAGESHIFT)); 5875 5876 if (VPP_PROT(svp) == prot) 5877 return (1); 5878 if (!((VPP_PROT(svp) ^ prot) & PROT_WRITE)) 5879 return (1); 5880 5881 ppa = kmem_alloc(ppasize, KM_SLEEP); 5882 if (anon && vp != NULL) { 5883 if (anon_get_ptr(amp->ahp, anon_idx) == NULL) { 5884 anon = 0; 5885 ASSERT(!anon_pages(amp->ahp, anon_idx, pgcnt)); 5886 } 5887 ASSERT(!anon || 5888 anon_pages(amp->ahp, anon_idx, pgcnt) == pgcnt); 5889 } 5890 5891 for (*ppa = NULL, pg_idx = 0; svp < evp; svp++, anon_idx++) { 5892 if (!VPP_ISPPLOCK(svp)) 5893 continue; 5894 if (anon) { 5895 ap = anon_get_ptr(amp->ahp, anon_idx); 5896 if (ap == NULL) { 5897 panic("segvn_claim_pages: no anon slot"); 5898 } 5899 swap_xlate(ap, &vp, &aoff); 5900 off = (u_offset_t)aoff; 5901 } 5902 ASSERT(vp != NULL); 5903 if ((pp = page_lookup(vp, 5904 (u_offset_t)off, SE_SHARED)) == NULL) { 5905 panic("segvn_claim_pages: no page"); 5906 } 5907 ppa[pg_idx++] = pp; 5908 off += PAGESIZE; 5909 } 5910 5911 if (ppa[0] == NULL) { 5912 kmem_free(ppa, ppasize); 5913 return (1); 5914 } 5915 5916 ASSERT(pg_idx <= pgcnt); 5917 ppa[pg_idx] = NULL; 5918 5919 if (prot & PROT_WRITE) 5920 err = page_addclaim_pages(ppa); 5921 else 5922 err = page_subclaim_pages(ppa); 5923 5924 for (i = 0; i < pg_idx; i++) { 5925 ASSERT(ppa[i] != NULL); 5926 page_unlock(ppa[i]); 5927 } 5928 5929 kmem_free(ppa, ppasize); 5930 return (err); 5931 } 5932 5933 /* 5934 * Returns right (upper address) segment if split occured. 5935 * If the address is equal to the beginning or end of its segment it returns 5936 * the current segment. 
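 *
 * segvn_claim_pages() above gathers, from one large page's worth of vpage
 * entries, only the pages that are lock-claimed and then applies the
 * claim (or unclaim) to that batch; an empty batch is trivially
 * successful.  The gathering step, reduced to a sketch (the callback and
 * the array bound are assumptions for illustration):
 */
#if 0	/* illustrative sketch only; never compiled */
#include <stddef.h>

static int
claim_locked_subset(const int *is_locked, size_t npages,
    int (*apply_claim_batch)(const size_t *idx, size_t n))
{
    size_t batch[64];		/* assumes npages <= 64 for illustration */
    size_t i, n = 0;

    for (i = 0; i < npages; i++) {
        if (is_locked[i])
            batch[n++] = i;
    }
    if (n == 0)
        return (1);		/* nothing locked: nothing to adjust */
    return (apply_claim_batch(batch, n));
}
#endif
/*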
5937 */ 5938 static struct seg * 5939 segvn_split_seg(struct seg *seg, caddr_t addr) 5940 { 5941 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5942 struct seg *nseg; 5943 size_t nsize; 5944 struct segvn_data *nsvd; 5945 5946 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 5947 ASSERT(addr >= seg->s_base); 5948 ASSERT(addr <= seg->s_base + seg->s_size); 5949 5950 if (addr == seg->s_base || addr == seg->s_base + seg->s_size) 5951 return (seg); 5952 5953 nsize = seg->s_base + seg->s_size - addr; 5954 seg->s_size = addr - seg->s_base; 5955 nseg = seg_alloc(seg->s_as, addr, nsize); 5956 ASSERT(nseg != NULL); 5957 nseg->s_ops = seg->s_ops; 5958 nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 5959 nseg->s_data = (void *)nsvd; 5960 nseg->s_szc = seg->s_szc; 5961 *nsvd = *svd; 5962 rw_init(&nsvd->lock, NULL, RW_DEFAULT, NULL); 5963 5964 if (nsvd->vp != NULL) { 5965 VN_HOLD(nsvd->vp); 5966 nsvd->offset = svd->offset + 5967 (uintptr_t)(nseg->s_base - seg->s_base); 5968 if (nsvd->type == MAP_SHARED) 5969 lgrp_shm_policy_init(NULL, nsvd->vp); 5970 } else { 5971 /* 5972 * The offset for an anonymous segment has no signifigance in 5973 * terms of an offset into a file. If we were to use the above 5974 * calculation instead, the structures read out of 5975 * /proc/<pid>/xmap would be more difficult to decipher since 5976 * it would be unclear whether two seemingly contiguous 5977 * prxmap_t structures represented different segments or a 5978 * single segment that had been split up into multiple prxmap_t 5979 * structures (e.g. if some part of the segment had not yet 5980 * been faulted in). 5981 */ 5982 nsvd->offset = 0; 5983 } 5984 5985 ASSERT(svd->softlockcnt == 0); 5986 crhold(svd->cred); 5987 5988 if (svd->vpage != NULL) { 5989 size_t bytes = vpgtob(seg_pages(seg)); 5990 size_t nbytes = vpgtob(seg_pages(nseg)); 5991 struct vpage *ovpage = svd->vpage; 5992 5993 svd->vpage = kmem_alloc(bytes, KM_SLEEP); 5994 bcopy(ovpage, svd->vpage, bytes); 5995 nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP); 5996 bcopy(ovpage + seg_pages(seg), nsvd->vpage, nbytes); 5997 kmem_free(ovpage, bytes + nbytes); 5998 } 5999 if (svd->amp != NULL && svd->type == MAP_PRIVATE) { 6000 struct anon_map *oamp = svd->amp, *namp; 6001 struct anon_hdr *nahp; 6002 6003 ANON_LOCK_ENTER(&oamp->a_rwlock, RW_WRITER); 6004 ASSERT(oamp->refcnt == 1); 6005 nahp = anon_create(btop(seg->s_size), ANON_SLEEP); 6006 (void) anon_copy_ptr(oamp->ahp, svd->anon_index, 6007 nahp, 0, btop(seg->s_size), ANON_SLEEP); 6008 6009 namp = anonmap_alloc(nseg->s_size, 0); 6010 namp->a_szc = nseg->s_szc; 6011 (void) anon_copy_ptr(oamp->ahp, 6012 svd->anon_index + btop(seg->s_size), 6013 namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP); 6014 anon_release(oamp->ahp, btop(oamp->size)); 6015 oamp->ahp = nahp; 6016 oamp->size = seg->s_size; 6017 svd->anon_index = 0; 6018 nsvd->amp = namp; 6019 nsvd->anon_index = 0; 6020 ANON_LOCK_EXIT(&oamp->a_rwlock); 6021 } else if (svd->amp != NULL) { 6022 pgcnt_t pgcnt = page_get_pagecnt(seg->s_szc); 6023 ASSERT(svd->amp == nsvd->amp); 6024 ASSERT(seg->s_szc <= svd->amp->a_szc); 6025 nsvd->anon_index = svd->anon_index + seg_pages(seg); 6026 ASSERT(IS_P2ALIGNED(nsvd->anon_index, pgcnt)); 6027 ANON_LOCK_ENTER(&svd->amp->a_rwlock, RW_WRITER); 6028 svd->amp->refcnt++; 6029 ANON_LOCK_EXIT(&svd->amp->a_rwlock); 6030 } 6031 6032 /* 6033 * Split amount of swap reserve 6034 */ 6035 if (svd->swresv) { 6036 /* 6037 * For MAP_NORESERVE, only allocate swap reserve for pages 6038 * being used. 
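/*
 * Geometry used by the SDR_END case above, in miniature: the requested
 * range is expanded outward to large-page boundaries, and only the large
 * pages that straddle the original start and end need to be carved off
 * into separate segments and demoted.  (Macro stand-ins and names are
 * assumptions for illustration; this is not kernel code.)
 */
#if 0	/* illustrative sketch only; never compiled */
#include <stdint.h>
#include <stddef.h>

#define	MY_P2ALIGN(x, a)	((uintptr_t)(x) & ~((uintptr_t)(a) - 1))
#define	MY_P2ROUNDUP(x, a)	(-(-(uintptr_t)(x) & ~((uintptr_t)(a) - 1)))

struct demote_ends {
    uintptr_t lpgaddr, lpgeaddr;	/* range expanded to large-page bounds */
    int head_misaligned;		/* demote the large page at the start */
    int tail_misaligned;		/* demote the large page at the end */
};

static struct demote_ends
compute_demote_ends(uintptr_t addr, uintptr_t eaddr, size_t pgsz)
{
    struct demote_ends d;

    d.lpgaddr = MY_P2ALIGN(addr, pgsz);
    d.lpgeaddr = MY_P2ROUNDUP(eaddr, pgsz);
    d.head_misaligned = (addr != d.lpgaddr);
    d.tail_misaligned = (eaddr != d.lpgeaddr);
    return (d);
}
#endif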
Other segments get enough to cover whole 6039 * segment. 6040 */ 6041 if (svd->flags & MAP_NORESERVE) { 6042 size_t oswresv; 6043 6044 ASSERT(svd->amp); 6045 oswresv = svd->swresv; 6046 svd->swresv = ptob(anon_pages(svd->amp->ahp, 6047 svd->anon_index, btop(seg->s_size))); 6048 nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp, 6049 nsvd->anon_index, btop(nseg->s_size))); 6050 ASSERT(oswresv >= (svd->swresv + nsvd->swresv)); 6051 } else { 6052 ASSERT(svd->swresv == seg->s_size + nseg->s_size); 6053 svd->swresv = seg->s_size; 6054 nsvd->swresv = nseg->s_size; 6055 } 6056 } 6057 6058 return (nseg); 6059 } 6060 6061 /* 6062 * called on memory operations (unmap, setprot, setpagesize) for a subset 6063 * of a large page segment to either demote the memory range (SDR_RANGE) 6064 * or the ends (SDR_END) by addr/len. 6065 * 6066 * returns 0 on success. returns errno, including ENOMEM, on failure. 6067 */ 6068 static int 6069 segvn_demote_range( 6070 struct seg *seg, 6071 caddr_t addr, 6072 size_t len, 6073 int flag, 6074 uint_t szcvec) 6075 { 6076 caddr_t eaddr = addr + len; 6077 caddr_t lpgaddr, lpgeaddr; 6078 struct seg *nseg; 6079 struct seg *badseg1 = NULL; 6080 struct seg *badseg2 = NULL; 6081 size_t pgsz; 6082 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6083 int err; 6084 uint_t szc = seg->s_szc; 6085 uint_t tszcvec; 6086 6087 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 6088 ASSERT(szc != 0); 6089 pgsz = page_get_pagesize(szc); 6090 ASSERT(seg->s_base != addr || seg->s_size != len); 6091 ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size); 6092 ASSERT(svd->softlockcnt == 0); 6093 ASSERT(szcvec == 0 || (flag == SDR_END && svd->type == MAP_SHARED)); 6094 6095 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 6096 ASSERT(flag == SDR_RANGE || eaddr < lpgeaddr || addr > lpgaddr); 6097 if (flag == SDR_RANGE) { 6098 /* demote entire range */ 6099 badseg1 = nseg = segvn_split_seg(seg, lpgaddr); 6100 (void) segvn_split_seg(nseg, lpgeaddr); 6101 ASSERT(badseg1->s_base == lpgaddr); 6102 ASSERT(badseg1->s_size == lpgeaddr - lpgaddr); 6103 } else if (addr != lpgaddr) { 6104 ASSERT(flag == SDR_END); 6105 badseg1 = nseg = segvn_split_seg(seg, lpgaddr); 6106 if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz && 6107 eaddr < lpgaddr + 2 * pgsz) { 6108 (void) segvn_split_seg(nseg, lpgeaddr); 6109 ASSERT(badseg1->s_base == lpgaddr); 6110 ASSERT(badseg1->s_size == 2 * pgsz); 6111 } else { 6112 nseg = segvn_split_seg(nseg, lpgaddr + pgsz); 6113 ASSERT(badseg1->s_base == lpgaddr); 6114 ASSERT(badseg1->s_size == pgsz); 6115 if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz) { 6116 ASSERT(lpgeaddr - lpgaddr > 2 * pgsz); 6117 nseg = segvn_split_seg(nseg, lpgeaddr - pgsz); 6118 badseg2 = nseg; 6119 (void) segvn_split_seg(nseg, lpgeaddr); 6120 ASSERT(badseg2->s_base == lpgeaddr - pgsz); 6121 ASSERT(badseg2->s_size == pgsz); 6122 } 6123 } 6124 } else { 6125 ASSERT(flag == SDR_END); 6126 ASSERT(eaddr < lpgeaddr); 6127 badseg1 = nseg = segvn_split_seg(seg, lpgeaddr - pgsz); 6128 (void) segvn_split_seg(nseg, lpgeaddr); 6129 ASSERT(badseg1->s_base == lpgeaddr - pgsz); 6130 ASSERT(badseg1->s_size == pgsz); 6131 } 6132 6133 ASSERT(badseg1 != NULL); 6134 ASSERT(badseg1->s_szc == szc); 6135 ASSERT(flag == SDR_RANGE || badseg1->s_size == pgsz || 6136 badseg1->s_size == 2 * pgsz); 6137 ASSERT(sameprot(badseg1, badseg1->s_base, pgsz)); 6138 ASSERT(badseg1->s_size == pgsz || 6139 sameprot(badseg1, badseg1->s_base + pgsz, pgsz)); 6140 if (err = segvn_clrszc(badseg1)) { 6141 return (err); 6142 } 
6143 ASSERT(badseg1->s_szc == 0); 6144 6145 if (szc > 1 && (tszcvec = P2PHASE(szcvec, 1 << szc)) > 1) { 6146 uint_t tszc = highbit(tszcvec) - 1; 6147 caddr_t ta = MAX(addr, badseg1->s_base); 6148 caddr_t te; 6149 size_t tpgsz = page_get_pagesize(tszc); 6150 6151 ASSERT(svd->type == MAP_SHARED); 6152 ASSERT(flag == SDR_END); 6153 ASSERT(tszc < szc && tszc > 0); 6154 6155 if (eaddr > badseg1->s_base + badseg1->s_size) { 6156 te = badseg1->s_base + badseg1->s_size; 6157 } else { 6158 te = eaddr; 6159 } 6160 6161 ASSERT(ta <= te); 6162 badseg1->s_szc = tszc; 6163 if (!IS_P2ALIGNED(ta, tpgsz) || !IS_P2ALIGNED(te, tpgsz)) { 6164 if (badseg2 != NULL) { 6165 err = segvn_demote_range(badseg1, ta, te - ta, 6166 SDR_END, tszcvec); 6167 if (err != 0) { 6168 return (err); 6169 } 6170 } else { 6171 return (segvn_demote_range(badseg1, ta, 6172 te - ta, SDR_END, tszcvec)); 6173 } 6174 } 6175 } 6176 6177 if (badseg2 == NULL) 6178 return (0); 6179 ASSERT(badseg2->s_szc == szc); 6180 ASSERT(badseg2->s_size == pgsz); 6181 ASSERT(sameprot(badseg2, badseg2->s_base, badseg2->s_size)); 6182 if (err = segvn_clrszc(badseg2)) { 6183 return (err); 6184 } 6185 ASSERT(badseg2->s_szc == 0); 6186 6187 if (szc > 1 && (tszcvec = P2PHASE(szcvec, 1 << szc)) > 1) { 6188 uint_t tszc = highbit(tszcvec) - 1; 6189 size_t tpgsz = page_get_pagesize(tszc); 6190 6191 ASSERT(svd->type == MAP_SHARED); 6192 ASSERT(flag == SDR_END); 6193 ASSERT(tszc < szc && tszc > 0); 6194 ASSERT(badseg2->s_base > addr); 6195 ASSERT(eaddr > badseg2->s_base); 6196 ASSERT(eaddr < badseg2->s_base + badseg2->s_size); 6197 6198 badseg2->s_szc = tszc; 6199 if (!IS_P2ALIGNED(eaddr, tpgsz)) { 6200 return (segvn_demote_range(badseg2, badseg2->s_base, 6201 eaddr - badseg2->s_base, SDR_END, tszcvec)); 6202 } 6203 } 6204 6205 return (0); 6206 } 6207 6208 static int 6209 segvn_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) 6210 { 6211 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6212 struct vpage *vp, *evp; 6213 6214 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6215 6216 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6217 /* 6218 * If segment protection can be used, simply check against them. 6219 */ 6220 if (svd->pageprot == 0) { 6221 int err; 6222 6223 err = ((svd->prot & prot) != prot) ? EACCES : 0; 6224 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6225 return (err); 6226 } 6227 6228 /* 6229 * Have to check down to the vpage level. 
6230 */ 6231 evp = &svd->vpage[seg_page(seg, addr + len)]; 6232 for (vp = &svd->vpage[seg_page(seg, addr)]; vp < evp; vp++) { 6233 if ((VPP_PROT(vp) & prot) != prot) { 6234 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6235 return (EACCES); 6236 } 6237 } 6238 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6239 return (0); 6240 } 6241 6242 static int 6243 segvn_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv) 6244 { 6245 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6246 size_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1; 6247 6248 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6249 6250 if (pgno != 0) { 6251 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6252 if (svd->pageprot == 0) { 6253 do 6254 protv[--pgno] = svd->prot; 6255 while (pgno != 0); 6256 } else { 6257 size_t pgoff = seg_page(seg, addr); 6258 6259 do { 6260 pgno--; 6261 protv[pgno] = VPP_PROT(&svd->vpage[pgno+pgoff]); 6262 } while (pgno != 0); 6263 } 6264 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6265 } 6266 return (0); 6267 } 6268 6269 static u_offset_t 6270 segvn_getoffset(struct seg *seg, caddr_t addr) 6271 { 6272 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6273 6274 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6275 6276 return (svd->offset + (uintptr_t)(addr - seg->s_base)); 6277 } 6278 6279 /*ARGSUSED*/ 6280 static int 6281 segvn_gettype(struct seg *seg, caddr_t addr) 6282 { 6283 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6284 6285 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6286 6287 return (svd->type | (svd->flags & (MAP_NORESERVE | MAP_TEXT | 6288 MAP_INITDATA))); 6289 } 6290 6291 /*ARGSUSED*/ 6292 static int 6293 segvn_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp) 6294 { 6295 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6296 6297 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6298 6299 *vpp = svd->vp; 6300 return (0); 6301 } 6302 6303 /* 6304 * Check to see if it makes sense to do kluster/read ahead to 6305 * addr + delta relative to the mapping at addr. We assume here 6306 * that delta is a signed PAGESIZE'd multiple (which can be negative). 6307 * 6308 * For segvn, we currently "approve" of the action if we are 6309 * still in the segment and it maps from the same vp/off, 6310 * or if the advice stored in segvn_data or vpages allows it. 6311 * Currently, klustering is not allowed only if MADV_RANDOM is set. 6312 */ 6313 static int 6314 segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta) 6315 { 6316 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6317 struct anon *oap, *ap; 6318 ssize_t pd; 6319 size_t page; 6320 struct vnode *vp1, *vp2; 6321 u_offset_t off1, off2; 6322 struct anon_map *amp; 6323 6324 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6325 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) || 6326 SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 6327 6328 if (addr + delta < seg->s_base || 6329 addr + delta >= (seg->s_base + seg->s_size)) 6330 return (-1); /* exceeded segment bounds */ 6331 6332 pd = delta / (ssize_t)PAGESIZE; /* divide to preserve sign bit */ 6333 page = seg_page(seg, addr); 6334 6335 /* 6336 * Check to see if either of the pages addr or addr + delta 6337 * have advice set that prevents klustering (if MADV_RANDOM advice 6338 * is set for entire segment, or MADV_SEQUENTIAL is set and delta 6339 * is negative). 
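 *
 * A minimal userland sketch of how that advice gets established
 * (illustrative only; fd and len are placeholders, the constants are
 * the standard <sys/mman.h> ones):
 *
 *	char *p = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
 *	(void) madvise(p, len, MADV_RANDOM);
 *
 * After the madvise() call the whole-segment advice (svd->advice) is
 * MADV_RANDOM and klustering is refused below; MADV_SEQUENTIAL instead
 * refuses only backward (delta < 0) klustering.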
6340 */ 6341 if (svd->advice == MADV_RANDOM || 6342 svd->advice == MADV_SEQUENTIAL && delta < 0) 6343 return (-1); 6344 else if (svd->pageadvice && svd->vpage) { 6345 struct vpage *bvpp, *evpp; 6346 6347 bvpp = &svd->vpage[page]; 6348 evpp = &svd->vpage[page + pd]; 6349 if (VPP_ADVICE(bvpp) == MADV_RANDOM || 6350 VPP_ADVICE(evpp) == MADV_SEQUENTIAL && delta < 0) 6351 return (-1); 6352 if (VPP_ADVICE(bvpp) != VPP_ADVICE(evpp) && 6353 VPP_ADVICE(evpp) == MADV_RANDOM) 6354 return (-1); 6355 } 6356 6357 if (svd->type == MAP_SHARED) 6358 return (0); /* shared mapping - all ok */ 6359 6360 if ((amp = svd->amp) == NULL) 6361 return (0); /* off original vnode */ 6362 6363 page += svd->anon_index; 6364 6365 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 6366 6367 oap = anon_get_ptr(amp->ahp, page); 6368 ap = anon_get_ptr(amp->ahp, page + pd); 6369 6370 ANON_LOCK_EXIT(&->a_rwlock); 6371 6372 if ((oap == NULL && ap != NULL) || (oap != NULL && ap == NULL)) { 6373 return (-1); /* one with and one without an anon */ 6374 } 6375 6376 if (oap == NULL) { /* implies that ap == NULL */ 6377 return (0); /* off original vnode */ 6378 } 6379 6380 /* 6381 * Now we know we have two anon pointers - check to 6382 * see if they happen to be properly allocated. 6383 */ 6384 6385 /* 6386 * XXX We cheat here and don't lock the anon slots. We can't because 6387 * we may have been called from the anon layer which might already 6388 * have locked them. We are holding a refcnt on the slots so they 6389 * can't disappear. The worst that will happen is we'll get the wrong 6390 * names (vp, off) for the slots and make a poor klustering decision. 6391 */ 6392 swap_xlate(ap, &vp1, &off1); 6393 swap_xlate(oap, &vp2, &off2); 6394 6395 6396 if (!VOP_CMP(vp1, vp2) || off1 - off2 != delta) 6397 return (-1); 6398 return (0); 6399 } 6400 6401 /* 6402 * Swap the pages of seg out to secondary storage, returning the 6403 * number of bytes of storage freed. 6404 * 6405 * The basic idea is first to unload all translations and then to call 6406 * VOP_PUTPAGE() for all newly-unmapped pages, to push them out to the 6407 * swap device. Pages to which other segments have mappings will remain 6408 * mapped and won't be swapped. Our caller (as_swapout) has already 6409 * performed the unloading step. 6410 * 6411 * The value returned is intended to correlate well with the process's 6412 * memory requirements. However, there are some caveats: 6413 * 1) When given a shared segment as argument, this routine will 6414 * only succeed in swapping out pages for the last sharer of the 6415 * segment. (Previous callers will only have decremented mapping 6416 * reference counts.) 6417 * 2) We assume that the hat layer maintains a large enough translation 6418 * cache to capture process reference patterns. 6419 */ 6420 static size_t 6421 segvn_swapout(struct seg *seg) 6422 { 6423 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6424 struct anon_map *amp; 6425 pgcnt_t pgcnt = 0; 6426 pgcnt_t npages; 6427 pgcnt_t page; 6428 ulong_t anon_index; 6429 6430 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6431 6432 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6433 /* 6434 * Find pages unmapped by our caller and force them 6435 * out to the virtual swap device. 
6436 */ 6437 if ((amp = svd->amp) != NULL) 6438 anon_index = svd->anon_index; 6439 npages = seg->s_size >> PAGESHIFT; 6440 for (page = 0; page < npages; page++) { 6441 page_t *pp; 6442 struct anon *ap; 6443 struct vnode *vp; 6444 u_offset_t off; 6445 anon_sync_obj_t cookie; 6446 6447 /* 6448 * Obtain <vp, off> pair for the page, then look it up. 6449 * 6450 * Note that this code is willing to consider regular 6451 * pages as well as anon pages. Is this appropriate here? 6452 */ 6453 ap = NULL; 6454 if (amp != NULL) { 6455 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 6456 if (anon_array_try_enter(amp, anon_index + page, 6457 &cookie)) { 6458 ANON_LOCK_EXIT(&->a_rwlock); 6459 continue; 6460 } 6461 ap = anon_get_ptr(amp->ahp, anon_index + page); 6462 if (ap != NULL) { 6463 swap_xlate(ap, &vp, &off); 6464 } else { 6465 vp = svd->vp; 6466 off = svd->offset + ptob(page); 6467 } 6468 anon_array_exit(&cookie); 6469 ANON_LOCK_EXIT(&->a_rwlock); 6470 } else { 6471 vp = svd->vp; 6472 off = svd->offset + ptob(page); 6473 } 6474 if (vp == NULL) { /* untouched zfod page */ 6475 ASSERT(ap == NULL); 6476 continue; 6477 } 6478 6479 pp = page_lookup_nowait(vp, off, SE_SHARED); 6480 if (pp == NULL) 6481 continue; 6482 6483 6484 /* 6485 * Examine the page to see whether it can be tossed out, 6486 * keeping track of how many we've found. 6487 */ 6488 if (!page_tryupgrade(pp)) { 6489 /* 6490 * If the page has an i/o lock and no mappings, 6491 * it's very likely that the page is being 6492 * written out as a result of klustering. 6493 * Assume this is so and take credit for it here. 6494 */ 6495 if (!page_io_trylock(pp)) { 6496 if (!hat_page_is_mapped(pp)) 6497 pgcnt++; 6498 } else { 6499 page_io_unlock(pp); 6500 } 6501 page_unlock(pp); 6502 continue; 6503 } 6504 ASSERT(!page_iolock_assert(pp)); 6505 6506 6507 /* 6508 * Skip if page is locked or has mappings. 6509 * We don't need the page_struct_lock to look at lckcnt 6510 * and cowcnt because the page is exclusive locked. 6511 */ 6512 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0 || 6513 hat_page_is_mapped(pp)) { 6514 page_unlock(pp); 6515 continue; 6516 } 6517 6518 /* 6519 * dispose skips large pages so try to demote first. 6520 */ 6521 if (pp->p_szc != 0 && !page_try_demote_pages(pp)) { 6522 page_unlock(pp); 6523 /* 6524 * XXX should skip the remaining page_t's of this 6525 * large page. 6526 */ 6527 continue; 6528 } 6529 6530 ASSERT(pp->p_szc == 0); 6531 6532 /* 6533 * No longer mapped -- we can toss it out. How 6534 * we do so depends on whether or not it's dirty. 6535 */ 6536 if (hat_ismod(pp) && pp->p_vnode) { 6537 /* 6538 * We must clean the page before it can be 6539 * freed. Setting B_FREE will cause pvn_done 6540 * to free the page when the i/o completes. 6541 * XXX: This also causes it to be accounted 6542 * as a pageout instead of a swap: need 6543 * B_SWAPOUT bit to use instead of B_FREE. 6544 * 6545 * Hold the vnode before releasing the page lock 6546 * to prevent it from being freed and re-used by 6547 * some other thread. 6548 */ 6549 VN_HOLD(vp); 6550 page_unlock(pp); 6551 6552 /* 6553 * Queue all i/o requests for the pageout thread 6554 * to avoid saturating the pageout devices. 6555 */ 6556 if (!queue_io_request(vp, off)) 6557 VN_RELE(vp); 6558 } else { 6559 /* 6560 * The page was clean, free it. 6561 * 6562 * XXX: Can we ever encounter modified pages 6563 * with no associated vnode here? 
6564 */ 6565 ASSERT(pp->p_vnode != NULL); 6566 /*LINTED: constant in conditional context*/ 6567 VN_DISPOSE(pp, B_FREE, 0, kcred); 6568 } 6569 6570 /* 6571 * Credit now even if i/o is in progress. 6572 */ 6573 pgcnt++; 6574 } 6575 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6576 6577 /* 6578 * Wakeup pageout to initiate i/o on all queued requests. 6579 */ 6580 cv_signal_pageout(); 6581 return (ptob(pgcnt)); 6582 } 6583 6584 /* 6585 * Synchronize primary storage cache with real object in virtual memory. 6586 * 6587 * XXX - Anonymous pages should not be sync'ed out at all. 6588 */ 6589 static int 6590 segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) 6591 { 6592 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6593 struct vpage *vpp; 6594 page_t *pp; 6595 u_offset_t offset; 6596 struct vnode *vp; 6597 u_offset_t off; 6598 caddr_t eaddr; 6599 int bflags; 6600 int err = 0; 6601 int segtype; 6602 int pageprot; 6603 int prot; 6604 ulong_t anon_index; 6605 struct anon_map *amp; 6606 struct anon *ap; 6607 anon_sync_obj_t cookie; 6608 6609 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6610 6611 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6612 6613 if (svd->softlockcnt > 0) { 6614 /* 6615 * flush all pages from seg cache 6616 * otherwise we may deadlock in swap_putpage 6617 * for B_INVAL page (4175402). 6618 * 6619 * Even if we grab segvn WRITER's lock or segp_slock 6620 * here, there might be another thread which could've 6621 * successfully performed lookup/insert just before 6622 * we acquired the lock here. So, grabbing either 6623 * lock here is of not much use. Until we devise 6624 * a strategy at upper layers to solve the 6625 * synchronization issues completely, we expect 6626 * applications to handle this appropriately. 6627 */ 6628 segvn_purge(seg); 6629 if (svd->softlockcnt > 0) { 6630 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6631 return (EAGAIN); 6632 } 6633 } 6634 6635 vpp = svd->vpage; 6636 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 6637 bflags = ((flags & MS_ASYNC) ? B_ASYNC : 0) | 6638 ((flags & MS_INVALIDATE) ? B_INVAL : 0); 6639 6640 if (attr) { 6641 pageprot = attr & ~(SHARED|PRIVATE); 6642 segtype = (attr & SHARED) ? MAP_SHARED : MAP_PRIVATE; 6643 6644 /* 6645 * We are done if the segment types don't match 6646 * or if we have segment level protections and 6647 * they don't match. 6648 */ 6649 if (svd->type != segtype) { 6650 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6651 return (0); 6652 } 6653 if (vpp == NULL) { 6654 if (svd->prot != pageprot) { 6655 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6656 return (0); 6657 } 6658 prot = svd->prot; 6659 } else 6660 vpp = &svd->vpage[seg_page(seg, addr)]; 6661 6662 } else if (svd->vp && svd->amp == NULL && 6663 (flags & MS_INVALIDATE) == 0) { 6664 6665 /* 6666 * No attributes, no anonymous pages and MS_INVALIDATE flag 6667 * is not on, just use one big request. 
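 *
 * For reference, a plain msync(3C) call on such a mapping, e.g.
 * (sketch only, addr and len are placeholders)
 *
 *	(void) msync(addr, len, MS_ASYNC);
 *
 * arrives here with attr == 0 and bflags == B_ASYNC, so the whole
 * range is pushed with the single VOP_PUTPAGE() call below.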
6668 */ 6669 err = VOP_PUTPAGE(svd->vp, (offset_t)offset, len, 6670 bflags, svd->cred); 6671 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6672 return (err); 6673 } 6674 6675 if ((amp = svd->amp) != NULL) 6676 anon_index = svd->anon_index + seg_page(seg, addr); 6677 6678 for (eaddr = addr + len; addr < eaddr; addr += PAGESIZE) { 6679 ap = NULL; 6680 if (amp != NULL) { 6681 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 6682 anon_array_enter(amp, anon_index, &cookie); 6683 ap = anon_get_ptr(amp->ahp, anon_index++); 6684 if (ap != NULL) { 6685 swap_xlate(ap, &vp, &off); 6686 } else { 6687 vp = svd->vp; 6688 off = offset; 6689 } 6690 anon_array_exit(&cookie); 6691 ANON_LOCK_EXIT(&->a_rwlock); 6692 } else { 6693 vp = svd->vp; 6694 off = offset; 6695 } 6696 offset += PAGESIZE; 6697 6698 if (vp == NULL) /* untouched zfod page */ 6699 continue; 6700 6701 if (attr) { 6702 if (vpp) { 6703 prot = VPP_PROT(vpp); 6704 vpp++; 6705 } 6706 if (prot != pageprot) { 6707 continue; 6708 } 6709 } 6710 6711 /* 6712 * See if any of these pages are locked -- if so, then we 6713 * will have to truncate an invalidate request at the first 6714 * locked one. We don't need the page_struct_lock to test 6715 * as this is only advisory; even if we acquire it someone 6716 * might race in and lock the page after we unlock and before 6717 * we do the PUTPAGE, then PUTPAGE simply does nothing. 6718 */ 6719 if (flags & MS_INVALIDATE) { 6720 if ((pp = page_lookup(vp, off, SE_SHARED)) != NULL) { 6721 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 6722 page_unlock(pp); 6723 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6724 return (EBUSY); 6725 } 6726 if (ap != NULL && pp->p_szc != 0 && 6727 page_tryupgrade(pp)) { 6728 if (pp->p_lckcnt == 0 && 6729 pp->p_cowcnt == 0) { 6730 /* 6731 * swapfs VN_DISPOSE() won't 6732 * invalidate large pages. 6733 * Attempt to demote. 6734 * XXX can't help it if it 6735 * fails. But for swapfs 6736 * pages it is no big deal. 6737 */ 6738 (void) page_try_demote_pages( 6739 pp); 6740 } 6741 } 6742 page_unlock(pp); 6743 } 6744 } else if (svd->type == MAP_SHARED && amp != NULL) { 6745 /* 6746 * Avoid writting out to disk ISM's large pages 6747 * because segspt_free_pages() relies on NULL an_pvp 6748 * of anon slots of such pages. 6749 */ 6750 6751 ASSERT(svd->vp == NULL); 6752 /* 6753 * swapfs uses page_lookup_nowait if not freeing or 6754 * invalidating and skips a page if 6755 * page_lookup_nowait returns NULL. 6756 */ 6757 pp = page_lookup_nowait(vp, off, SE_SHARED); 6758 if (pp == NULL) { 6759 continue; 6760 } 6761 if (pp->p_szc != 0) { 6762 page_unlock(pp); 6763 continue; 6764 } 6765 6766 /* 6767 * Note ISM pages are created large so (vp, off)'s 6768 * page cannot suddenly become large after we unlock 6769 * pp. 6770 */ 6771 page_unlock(pp); 6772 } 6773 /* 6774 * XXX - Should ultimately try to kluster 6775 * calls to VOP_PUTPAGE() for performance. 6776 */ 6777 VN_HOLD(vp); 6778 err = VOP_PUTPAGE(vp, (offset_t)off, PAGESIZE, 6779 bflags, svd->cred); 6780 VN_RELE(vp); 6781 if (err) 6782 break; 6783 } 6784 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6785 return (err); 6786 } 6787 6788 /* 6789 * Determine if we have data corresponding to pages in the 6790 * primary storage virtual memory cache (i.e., "in core"). 
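 *
 * This is the segment-level half of mincore(2); roughly, a caller such
 * as (sketch only; addr, len and NPAGES are placeholders)
 *
 *	char vec[NPAGES];
 *	(void) mincore(addr, len, vec);
 *
 * ends up (via the address-space layer) in this routine, and the
 * SEG_PAGE_* bits assembled in the loop below are the per-page status
 * that report is built from.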
6791 */ 6792 static size_t 6793 segvn_incore(struct seg *seg, caddr_t addr, size_t len, char *vec) 6794 { 6795 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6796 struct vnode *vp, *avp; 6797 u_offset_t offset, aoffset; 6798 size_t p, ep; 6799 int ret; 6800 struct vpage *vpp; 6801 page_t *pp; 6802 uint_t start; 6803 struct anon_map *amp; /* XXX - for locknest */ 6804 struct anon *ap; 6805 uint_t attr; 6806 anon_sync_obj_t cookie; 6807 6808 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6809 6810 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6811 if (svd->amp == NULL && svd->vp == NULL) { 6812 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6813 bzero(vec, btopr(len)); 6814 return (len); /* no anonymous pages created yet */ 6815 } 6816 6817 p = seg_page(seg, addr); 6818 ep = seg_page(seg, addr + len); 6819 start = svd->vp ? SEG_PAGE_VNODEBACKED : 0; 6820 6821 amp = svd->amp; 6822 for (; p < ep; p++, addr += PAGESIZE) { 6823 vpp = (svd->vpage) ? &svd->vpage[p]: NULL; 6824 ret = start; 6825 ap = NULL; 6826 avp = NULL; 6827 /* Grab the vnode/offset for the anon slot */ 6828 if (amp != NULL) { 6829 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 6830 anon_array_enter(amp, svd->anon_index + p, &cookie); 6831 ap = anon_get_ptr(amp->ahp, svd->anon_index + p); 6832 if (ap != NULL) { 6833 swap_xlate(ap, &avp, &aoffset); 6834 } 6835 anon_array_exit(&cookie); 6836 ANON_LOCK_EXIT(&->a_rwlock); 6837 } 6838 if ((avp != NULL) && page_exists(avp, aoffset)) { 6839 /* A page exists for the anon slot */ 6840 ret |= SEG_PAGE_INCORE; 6841 6842 /* 6843 * If page is mapped and writable 6844 */ 6845 attr = (uint_t)0; 6846 if ((hat_getattr(seg->s_as->a_hat, addr, 6847 &attr) != -1) && (attr & PROT_WRITE)) { 6848 ret |= SEG_PAGE_ANON; 6849 } 6850 /* 6851 * Don't get page_struct lock for lckcnt and cowcnt, 6852 * since this is purely advisory. 6853 */ 6854 if ((pp = page_lookup_nowait(avp, aoffset, 6855 SE_SHARED)) != NULL) { 6856 if (pp->p_lckcnt) 6857 ret |= SEG_PAGE_SOFTLOCK; 6858 if (pp->p_cowcnt) 6859 ret |= SEG_PAGE_HASCOW; 6860 page_unlock(pp); 6861 } 6862 } 6863 6864 /* Gather vnode statistics */ 6865 vp = svd->vp; 6866 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 6867 6868 if (vp != NULL) { 6869 /* 6870 * Try to obtain a "shared" lock on the page 6871 * without blocking. If this fails, determine 6872 * if the page is in memory. 6873 */ 6874 pp = page_lookup_nowait(vp, offset, SE_SHARED); 6875 if ((pp == NULL) && (page_exists(vp, offset))) { 6876 /* Page is incore, and is named */ 6877 ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE); 6878 } 6879 /* 6880 * Don't get page_struct lock for lckcnt and cowcnt, 6881 * since this is purely advisory. 6882 */ 6883 if (pp != NULL) { 6884 ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE); 6885 if (pp->p_lckcnt) 6886 ret |= SEG_PAGE_SOFTLOCK; 6887 if (pp->p_cowcnt) 6888 ret |= SEG_PAGE_HASCOW; 6889 page_unlock(pp); 6890 } 6891 } 6892 6893 /* Gather virtual page information */ 6894 if (vpp) { 6895 if (VPP_ISPPLOCK(vpp)) 6896 ret |= SEG_PAGE_LOCKED; 6897 vpp++; 6898 } 6899 6900 *vec++ = (char)ret; 6901 } 6902 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6903 return (len); 6904 } 6905 6906 /* 6907 * Statement for p_cowcnts/p_lckcnts. 
6908 * 6909 * p_cowcnt is updated while mlock/munlocking MAP_PRIVATE and PROT_WRITE region 6910 * irrespective of the following factors or anything else: 6911 * 6912 * (1) anon slots are populated or not 6913 * (2) cow is broken or not 6914 * (3) refcnt on ap is 1 or greater than 1 6915 * 6916 * If it's not MAP_PRIVATE and PROT_WRITE, p_lckcnt is updated during mlock 6917 * and munlock. 6918 * 6919 * 6920 * Handling p_cowcnts/p_lckcnts during copy-on-write fault: 6921 * 6922 * if vpage has PROT_WRITE 6923 * transfer cowcnt on the oldpage -> cowcnt on the newpage 6924 * else 6925 * transfer lckcnt on the oldpage -> lckcnt on the newpage 6926 * 6927 * During copy-on-write, decrement p_cowcnt on the oldpage and increment 6928 * p_cowcnt on the newpage *if* the corresponding vpage has PROT_WRITE. 6929 * 6930 * We may also break COW if softlocking on read access in the physio case. 6931 * In this case, vpage may not have PROT_WRITE. So, we need to decrement 6932 * p_lckcnt on the oldpage and increment p_lckcnt on the newpage *if* the 6933 * vpage doesn't have PROT_WRITE. 6934 * 6935 * 6936 * Handling p_cowcnts/p_lckcnts during mprotect on mlocked region: 6937 * 6938 * If a MAP_PRIVATE region loses PROT_WRITE, we decrement p_cowcnt and 6939 * increment p_lckcnt by calling page_subclaim() which takes care of 6940 * availrmem accounting and p_lckcnt overflow. 6941 * 6942 * If a MAP_PRIVATE region gains PROT_WRITE, we decrement p_lckcnt and 6943 * increment p_cowcnt by calling page_addclaim() which takes care of 6944 * availrmem availability and p_cowcnt overflow. 6945 */ 6946 6947 /* 6948 * Lock down (or unlock) pages mapped by this segment. 6949 * 6950 * XXX only creates PAGESIZE pages if anon slots are not initialized. 6951 * At fault time they will be relocated into larger pages. 6952 */ 6953 static int 6954 segvn_lockop(struct seg *seg, caddr_t addr, size_t len, 6955 int attr, int op, ulong_t *lockmap, size_t pos) 6956 { 6957 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6958 struct vpage *vpp; 6959 struct vpage *evp; 6960 page_t *pp; 6961 u_offset_t offset; 6962 u_offset_t off; 6963 int segtype; 6964 int pageprot; 6965 int claim; 6966 struct vnode *vp; 6967 ulong_t anon_index; 6968 struct anon_map *amp; 6969 struct anon *ap; 6970 struct vattr va; 6971 anon_sync_obj_t cookie; 6972 struct kshmid *sp = NULL; 6973 struct proc *p = curproc; 6974 kproject_t *proj = NULL; 6975 int chargeproc = 1; 6976 size_t locked_bytes = 0; 6977 size_t unlocked_bytes = 0; 6978 int err = 0; 6979 6980 /* 6981 * Hold write lock on address space because may split or concatenate 6982 * segments 6983 */ 6984 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6985 6986 /* 6987 * If this is a shm, use shm's project and zone, else use 6988 * project and zone of calling process 6989 */ 6990 6991 /* Determine if this segment backs a sysV shm */ 6992 if (svd->amp != NULL && svd->amp->a_sp != NULL) { 6993 sp = svd->amp->a_sp; 6994 proj = sp->shm_perm.ipc_proj; 6995 chargeproc = 0; 6996 } 6997 6998 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 6999 if (attr) { 7000 pageprot = attr & ~(SHARED|PRIVATE); 7001 segtype = attr & SHARED ? MAP_SHARED : MAP_PRIVATE; 7002 7003 /* 7004 * We are done if the segment types don't match 7005 * or if we have segment level protections and 7006 * they don't match. 
7007 */ 7008 if (svd->type != segtype) { 7009 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7010 return (0); 7011 } 7012 if (svd->pageprot == 0 && svd->prot != pageprot) { 7013 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7014 return (0); 7015 } 7016 } 7017 7018 /* 7019 * If we're locking, then we must create a vpage structure if 7020 * none exists. If we're unlocking, then check to see if there 7021 * is a vpage -- if not, then we could not have locked anything. 7022 */ 7023 7024 if ((vpp = svd->vpage) == NULL) { 7025 if (op == MC_LOCK) 7026 segvn_vpage(seg); 7027 else { 7028 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7029 return (0); 7030 } 7031 } 7032 7033 /* 7034 * The anonymous data vector (i.e., previously 7035 * unreferenced mapping to swap space) can be allocated 7036 * by lazily testing for its existence. 7037 */ 7038 if (op == MC_LOCK && svd->amp == NULL && svd->vp == NULL) { 7039 svd->amp = anonmap_alloc(seg->s_size, 0); 7040 svd->amp->a_szc = seg->s_szc; 7041 } 7042 7043 if ((amp = svd->amp) != NULL) { 7044 anon_index = svd->anon_index + seg_page(seg, addr); 7045 } 7046 7047 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 7048 evp = &svd->vpage[seg_page(seg, addr + len)]; 7049 7050 if (sp != NULL) 7051 mutex_enter(&sp->shm_mlock); 7052 7053 /* determine number of unlocked bytes in range for lock operation */ 7054 if (op == MC_LOCK) { 7055 7056 if (sp == NULL) { 7057 for (vpp = &svd->vpage[seg_page(seg, addr)]; vpp < evp; 7058 vpp++) { 7059 if (!VPP_ISPPLOCK(vpp)) 7060 unlocked_bytes += PAGESIZE; 7061 } 7062 } else { 7063 ulong_t i_idx, i_edx; 7064 anon_sync_obj_t i_cookie; 7065 struct anon *i_ap; 7066 struct vnode *i_vp; 7067 u_offset_t i_off; 7068 7069 /* Only count sysV pages once for locked memory */ 7070 i_edx = svd->anon_index + seg_page(seg, addr + len); 7071 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7072 for (i_idx = anon_index; i_idx < i_edx; i_idx++) { 7073 anon_array_enter(amp, i_idx, &i_cookie); 7074 i_ap = anon_get_ptr(amp->ahp, i_idx); 7075 if (i_ap == NULL) { 7076 unlocked_bytes += PAGESIZE; 7077 anon_array_exit(&i_cookie); 7078 continue; 7079 } 7080 swap_xlate(i_ap, &i_vp, &i_off); 7081 anon_array_exit(&i_cookie); 7082 pp = page_lookup(i_vp, i_off, SE_SHARED); 7083 if (pp == NULL) { 7084 unlocked_bytes += PAGESIZE; 7085 continue; 7086 } else if (pp->p_lckcnt == 0) 7087 unlocked_bytes += PAGESIZE; 7088 page_unlock(pp); 7089 } 7090 ANON_LOCK_EXIT(&->a_rwlock); 7091 } 7092 7093 mutex_enter(&p->p_lock); 7094 err = rctl_incr_locked_mem(p, proj, unlocked_bytes, 7095 chargeproc); 7096 mutex_exit(&p->p_lock); 7097 7098 if (err) { 7099 if (sp != NULL) 7100 mutex_exit(&sp->shm_mlock); 7101 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7102 return (err); 7103 } 7104 } 7105 /* 7106 * Loop over all pages in the range. Process if we're locking and 7107 * page has not already been locked in this mapping; or if we're 7108 * unlocking and the page has been locked. 7109 */ 7110 for (vpp = &svd->vpage[seg_page(seg, addr)]; vpp < evp; 7111 vpp++, pos++, addr += PAGESIZE, offset += PAGESIZE, anon_index++) { 7112 if ((attr == 0 || VPP_PROT(vpp) == pageprot) && 7113 ((op == MC_LOCK && !VPP_ISPPLOCK(vpp)) || 7114 (op == MC_UNLOCK && VPP_ISPPLOCK(vpp)))) { 7115 7116 if (amp != NULL) 7117 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7118 /* 7119 * If this isn't a MAP_NORESERVE segment and 7120 * we're locking, allocate anon slots if they 7121 * don't exist. The page is brought in later on. 
7122 */ 7123 if (op == MC_LOCK && svd->vp == NULL && 7124 ((svd->flags & MAP_NORESERVE) == 0) && 7125 amp != NULL && 7126 ((ap = anon_get_ptr(amp->ahp, anon_index)) 7127 == NULL)) { 7128 anon_array_enter(amp, anon_index, &cookie); 7129 7130 if ((ap = anon_get_ptr(amp->ahp, 7131 anon_index)) == NULL) { 7132 pp = anon_zero(seg, addr, &ap, 7133 svd->cred); 7134 if (pp == NULL) { 7135 anon_array_exit(&cookie); 7136 ANON_LOCK_EXIT(&->a_rwlock); 7137 err = ENOMEM; 7138 goto out; 7139 } 7140 ASSERT(anon_get_ptr(amp->ahp, 7141 anon_index) == NULL); 7142 (void) anon_set_ptr(amp->ahp, 7143 anon_index, ap, ANON_SLEEP); 7144 page_unlock(pp); 7145 } 7146 anon_array_exit(&cookie); 7147 } 7148 7149 /* 7150 * Get name for page, accounting for 7151 * existence of private copy. 7152 */ 7153 ap = NULL; 7154 if (amp != NULL) { 7155 anon_array_enter(amp, anon_index, &cookie); 7156 ap = anon_get_ptr(amp->ahp, anon_index); 7157 if (ap != NULL) { 7158 swap_xlate(ap, &vp, &off); 7159 } else { 7160 if (svd->vp == NULL && 7161 (svd->flags & MAP_NORESERVE)) { 7162 anon_array_exit(&cookie); 7163 ANON_LOCK_EXIT(&->a_rwlock); 7164 continue; 7165 } 7166 vp = svd->vp; 7167 off = offset; 7168 } 7169 anon_array_exit(&cookie); 7170 ANON_LOCK_EXIT(&->a_rwlock); 7171 } else { 7172 vp = svd->vp; 7173 off = offset; 7174 } 7175 7176 /* 7177 * Get page frame. It's ok if the page is 7178 * not available when we're unlocking, as this 7179 * may simply mean that a page we locked got 7180 * truncated out of existence after we locked it. 7181 * 7182 * Invoke VOP_GETPAGE() to obtain the page struct 7183 * since we may need to read it from disk if its 7184 * been paged out. 7185 */ 7186 if (op != MC_LOCK) 7187 pp = page_lookup(vp, off, SE_SHARED); 7188 else { 7189 page_t *pl[1 + 1]; 7190 int error; 7191 7192 ASSERT(vp != NULL); 7193 7194 error = VOP_GETPAGE(vp, (offset_t)off, PAGESIZE, 7195 (uint_t *)NULL, pl, PAGESIZE, seg, addr, 7196 S_OTHER, svd->cred); 7197 7198 /* 7199 * If the error is EDEADLK then we must bounce 7200 * up and drop all vm subsystem locks and then 7201 * retry the operation later 7202 * This behavior is a temporary measure because 7203 * ufs/sds logging is badly designed and will 7204 * deadlock if we don't allow this bounce to 7205 * happen. The real solution is to re-design 7206 * the logging code to work properly. See bug 7207 * 4125102 for details of the problem. 7208 */ 7209 if (error == EDEADLK) { 7210 err = error; 7211 goto out; 7212 } 7213 /* 7214 * Quit if we fail to fault in the page. Treat 7215 * the failure as an error, unless the addr 7216 * is mapped beyond the end of a file. 7217 */ 7218 if (error && svd->vp) { 7219 va.va_mask = AT_SIZE; 7220 if (VOP_GETATTR(svd->vp, &va, 0, 7221 svd->cred) != 0) { 7222 err = EIO; 7223 goto out; 7224 } 7225 if (btopr(va.va_size) >= 7226 btopr(off + 1)) { 7227 err = EIO; 7228 goto out; 7229 } 7230 goto out; 7231 7232 } else if (error) { 7233 err = EIO; 7234 goto out; 7235 } 7236 pp = pl[0]; 7237 ASSERT(pp != NULL); 7238 } 7239 7240 /* 7241 * See Statement at the beginning of this routine. 7242 * 7243 * claim is always set if MAP_PRIVATE and PROT_WRITE 7244 * irrespective of following factors: 7245 * 7246 * (1) anon slots are populated or not 7247 * (2) cow is broken or not 7248 * (3) refcnt on ap is 1 or greater than 1 7249 * 7250 * See 4140683 for details 7251 */ 7252 claim = ((VPP_PROT(vpp) & PROT_WRITE) && 7253 (svd->type == MAP_PRIVATE)); 7254 7255 /* 7256 * Perform page-level operation appropriate to 7257 * operation. 
If locking, undo the SOFTLOCK 7258 * performed to bring the page into memory 7259 * after setting the lock. If unlocking, 7260 * and no page was found, account for the claim 7261 * separately. 7262 */ 7263 if (op == MC_LOCK) { 7264 int ret = 1; /* Assume success */ 7265 7266 ASSERT(!VPP_ISPPLOCK(vpp)); 7267 7268 ret = page_pp_lock(pp, claim, 0); 7269 if (ret == 0) { 7270 /* locking page failed */ 7271 page_unlock(pp); 7272 err = EAGAIN; 7273 goto out; 7274 } 7275 VPP_SETPPLOCK(vpp); 7276 if (sp != NULL) { 7277 if (pp->p_lckcnt == 1) 7278 locked_bytes += PAGESIZE; 7279 } else 7280 locked_bytes += PAGESIZE; 7281 7282 if (lockmap != (ulong_t *)NULL) 7283 BT_SET(lockmap, pos); 7284 7285 page_unlock(pp); 7286 } else { 7287 ASSERT(VPP_ISPPLOCK(vpp)); 7288 if (pp != NULL) { 7289 /* sysV pages should be locked */ 7290 ASSERT(sp == NULL || pp->p_lckcnt > 0); 7291 page_pp_unlock(pp, claim, 0); 7292 if (sp != NULL) { 7293 if (pp->p_lckcnt == 0) 7294 unlocked_bytes 7295 += PAGESIZE; 7296 } else 7297 unlocked_bytes += PAGESIZE; 7298 page_unlock(pp); 7299 } else { 7300 ASSERT(sp == NULL); 7301 unlocked_bytes += PAGESIZE; 7302 } 7303 VPP_CLRPPLOCK(vpp); 7304 } 7305 } 7306 } 7307 out: 7308 if (op == MC_LOCK) { 7309 /* Credit back bytes that did not get locked */ 7310 if ((unlocked_bytes - locked_bytes) > 0) { 7311 if (proj == NULL) 7312 mutex_enter(&p->p_lock); 7313 rctl_decr_locked_mem(p, proj, 7314 (unlocked_bytes - locked_bytes), chargeproc); 7315 if (proj == NULL) 7316 mutex_exit(&p->p_lock); 7317 } 7318 7319 } else { 7320 /* Account bytes that were unlocked */ 7321 if (unlocked_bytes > 0) { 7322 if (proj == NULL) 7323 mutex_enter(&p->p_lock); 7324 rctl_decr_locked_mem(p, proj, unlocked_bytes, 7325 chargeproc); 7326 if (proj == NULL) 7327 mutex_exit(&p->p_lock); 7328 } 7329 } 7330 if (sp != NULL) 7331 mutex_exit(&sp->shm_mlock); 7332 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7333 7334 return (err); 7335 } 7336 7337 /* 7338 * Set advice from user for specified pages 7339 * There are 5 types of advice: 7340 * MADV_NORMAL - Normal (default) behavior (whatever that is) 7341 * MADV_RANDOM - Random page references 7342 * do not allow readahead or 'klustering' 7343 * MADV_SEQUENTIAL - Sequential page references 7344 * Pages previous to the one currently being 7345 * accessed (determined by fault) are 'not needed' 7346 * and are freed immediately 7347 * MADV_WILLNEED - Pages are likely to be used (fault ahead in mctl) 7348 * MADV_DONTNEED - Pages are not needed (synced out in mctl) 7349 * MADV_FREE - Contents can be discarded 7350 * MADV_ACCESS_DEFAULT- Default access 7351 * MADV_ACCESS_LWP - Next LWP will access heavily 7352 * MADV_ACCESS_MANY- Many LWPs or processes will access heavily 7353 */ 7354 static int 7355 segvn_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav) 7356 { 7357 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7358 size_t page; 7359 int err = 0; 7360 int already_set; 7361 struct anon_map *amp; 7362 ulong_t anon_index; 7363 struct seg *next; 7364 lgrp_mem_policy_t policy; 7365 struct seg *prev; 7366 struct vnode *vp; 7367 7368 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 7369 7370 /* 7371 * In case of MADV_FREE, we won't be modifying any segment private 7372 * data structures; so, we only need to grab READER's lock 7373 */ 7374 if (behav != MADV_FREE) 7375 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 7376 else 7377 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7378 7379 /* 7380 * Large pages are assumed to be only turned on when 
accesses to the 7381 * segment's address range have spatial and temporal locality. That 7382 * justifies ignoring MADV_SEQUENTIAL for large page segments. 7383 * Also, ignore advice affecting lgroup memory allocation 7384 * if we don't need to do lgroup optimizations on this system 7385 */ 7386 7387 if ((behav == MADV_SEQUENTIAL && seg->s_szc != 0) || 7388 (!lgrp_optimizations() && (behav == MADV_ACCESS_DEFAULT || 7389 behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY))) { 7390 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7391 return (0); 7392 } 7393 7394 if (behav == MADV_SEQUENTIAL || behav == MADV_ACCESS_DEFAULT || 7395 behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY) { 7396 /* 7397 * Since we are going to unload hat mappings 7398 * we first have to flush the cache. Otherwise 7399 * this might lead to system panic if another 7400 * thread is doing physio on the range whose 7401 * mappings are unloaded by madvise(3C). 7402 */ 7403 if (svd->softlockcnt > 0) { 7404 /* 7405 * Since we do have the segvn writers lock 7406 * nobody can fill the cache with entries 7407 * belonging to this seg during the purge. 7408 * The flush either succeeds or we still 7409 * have pending I/Os. In the latter case, 7410 * madvise(3C) fails. 7411 */ 7412 segvn_purge(seg); 7413 if (svd->softlockcnt > 0) { 7414 /* 7415 * Since madvise(3C) is advisory and 7416 * it's not part of UNIX98, madvise(3C) 7417 * failure here doesn't cause any hardship. 7418 * Note that we don't block in "as" layer. 7419 */ 7420 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7421 return (EAGAIN); 7422 } 7423 } 7424 } 7425 7426 amp = svd->amp; 7427 vp = svd->vp; 7428 if (behav == MADV_FREE) { 7429 /* 7430 * MADV_FREE is not supported for segments with 7431 * underlying object; if anonmap is NULL, anon slots 7432 * are not yet populated and there is nothing for 7433 * us to do. As MADV_FREE is advisory, we don't 7434 * return error in either case. 7435 */ 7436 if (vp || amp == NULL) { 7437 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7438 return (0); 7439 } 7440 7441 page = seg_page(seg, addr); 7442 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER); 7443 anon_disclaim(amp, svd->anon_index + page, len, 0); 7444 ANON_LOCK_EXIT(&amp->a_rwlock); 7445 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7446 return (0); 7447 } 7448 7449 /* 7450 * If advice is to be applied to entire segment, 7451 * use advice field in seg_data structure 7452 * otherwise use appropriate vpage entry. 7453 */ 7454 if ((addr == seg->s_base) && (len == seg->s_size)) { 7455 switch (behav) { 7456 case MADV_ACCESS_LWP: 7457 case MADV_ACCESS_MANY: 7458 case MADV_ACCESS_DEFAULT: 7459 /* 7460 * Set memory allocation policy for this segment 7461 */ 7462 policy = lgrp_madv_to_policy(behav, len, svd->type); 7463 if (svd->type == MAP_SHARED) 7464 already_set = lgrp_shm_policy_set(policy, amp, 7465 svd->anon_index, vp, svd->offset, len); 7466 else { 7467 /* 7468 * For private memory, need writers lock on 7469 * address space because the segment may be 7470 * split or concatenated when changing policy 7471 */ 7472 if (AS_READ_HELD(seg->s_as, 7473 &seg->s_as->a_lock)) { 7474 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7475 return (IE_RETRY); 7476 } 7477 7478 already_set = lgrp_privm_policy_set(policy, 7479 &svd->policy_info, len); 7480 } 7481 7482 /* 7483 * If policy set already and it shouldn't be reapplied, 7484 * don't do anything.
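 *
 * (Userland sketch of what drives this case, placeholders only:
 *
 *	(void) madvise(base, seg_len, MADV_ACCESS_LWP);
 *
 * asks that the segment's memory be placed close to the next lwp that
 * will access it heavily; lgrp_madv_to_policy() above maps that advice
 * to an lgroup allocation policy, and the check below avoids redoing
 * the work when the policy is already in effect.)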
7485 */ 7486 if (already_set && 7487 !LGRP_MEM_POLICY_REAPPLICABLE(policy)) 7488 break; 7489 7490 /* 7491 * Mark any existing pages in given range for 7492 * migration 7493 */ 7494 page_mark_migrate(seg, addr, len, amp, svd->anon_index, 7495 vp, svd->offset, 1); 7496 7497 /* 7498 * If same policy set already or this is a shared 7499 * memory segment, don't need to try to concatenate 7500 * segment with adjacent ones. 7501 */ 7502 if (already_set || svd->type == MAP_SHARED) 7503 break; 7504 7505 /* 7506 * Try to concatenate this segment with previous 7507 * one and next one, since we changed policy for 7508 * this one and it may be compatible with adjacent 7509 * ones now. 7510 */ 7511 prev = AS_SEGPREV(seg->s_as, seg); 7512 next = AS_SEGNEXT(seg->s_as, seg); 7513 7514 if (next && next->s_ops == &segvn_ops && 7515 addr + len == next->s_base) 7516 (void) segvn_concat(seg, next, 1); 7517 7518 if (prev && prev->s_ops == &segvn_ops && 7519 addr == prev->s_base + prev->s_size) { 7520 /* 7521 * Drop lock for private data of current 7522 * segment before concatenating (deleting) it 7523 * and return IE_REATTACH to tell as_ctl() that 7524 * current segment has changed 7525 */ 7526 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7527 if (!segvn_concat(prev, seg, 1)) 7528 err = IE_REATTACH; 7529 7530 return (err); 7531 } 7532 break; 7533 7534 case MADV_SEQUENTIAL: 7535 /* 7536 * unloading mapping guarantees 7537 * detection in segvn_fault 7538 */ 7539 ASSERT(seg->s_szc == 0); 7540 hat_unload(seg->s_as->a_hat, addr, len, 7541 HAT_UNLOAD); 7542 /* FALLTHROUGH */ 7543 case MADV_NORMAL: 7544 case MADV_RANDOM: 7545 svd->advice = (uchar_t)behav; 7546 svd->pageadvice = 0; 7547 break; 7548 case MADV_WILLNEED: /* handled in memcntl */ 7549 case MADV_DONTNEED: /* handled in memcntl */ 7550 case MADV_FREE: /* handled above */ 7551 break; 7552 default: 7553 err = EINVAL; 7554 } 7555 } else { 7556 caddr_t eaddr; 7557 struct seg *new_seg; 7558 struct segvn_data *new_svd; 7559 u_offset_t off; 7560 caddr_t oldeaddr; 7561 7562 page = seg_page(seg, addr); 7563 7564 segvn_vpage(seg); 7565 7566 switch (behav) { 7567 struct vpage *bvpp, *evpp; 7568 7569 case MADV_ACCESS_LWP: 7570 case MADV_ACCESS_MANY: 7571 case MADV_ACCESS_DEFAULT: 7572 /* 7573 * Set memory allocation policy for portion of this 7574 * segment 7575 */ 7576 7577 /* 7578 * Align address and length of advice to page 7579 * boundaries for large pages 7580 */ 7581 if (seg->s_szc != 0) { 7582 size_t pgsz; 7583 7584 pgsz = page_get_pagesize(seg->s_szc); 7585 addr = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 7586 len = P2ROUNDUP(len, pgsz); 7587 } 7588 7589 /* 7590 * Check to see whether policy is set already 7591 */ 7592 policy = lgrp_madv_to_policy(behav, len, svd->type); 7593 7594 anon_index = svd->anon_index + page; 7595 off = svd->offset + (uintptr_t)(addr - seg->s_base); 7596 7597 if (svd->type == MAP_SHARED) 7598 already_set = lgrp_shm_policy_set(policy, amp, 7599 anon_index, vp, off, len); 7600 else 7601 already_set = 7602 (policy == svd->policy_info.mem_policy); 7603 7604 /* 7605 * If policy set already and it shouldn't be reapplied, 7606 * don't do anything. 
7607 */ 7608 if (already_set && 7609 !LGRP_MEM_POLICY_REAPPLICABLE(policy)) 7610 break; 7611 7612 /* 7613 * For private memory, need writers lock on 7614 * address space because the segment may be 7615 * split or concatenated when changing policy 7616 */ 7617 if (svd->type == MAP_PRIVATE && 7618 AS_READ_HELD(seg->s_as, &seg->s_as->a_lock)) { 7619 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7620 return (IE_RETRY); 7621 } 7622 7623 /* 7624 * Mark any existing pages in given range for 7625 * migration 7626 */ 7627 page_mark_migrate(seg, addr, len, amp, svd->anon_index, 7628 vp, svd->offset, 1); 7629 7630 /* 7631 * Don't need to try to split or concatenate 7632 * segments, since policy is same or this is a shared 7633 * memory segment 7634 */ 7635 if (already_set || svd->type == MAP_SHARED) 7636 break; 7637 7638 /* 7639 * Split off new segment if advice only applies to a 7640 * portion of existing segment starting in middle 7641 */ 7642 new_seg = NULL; 7643 eaddr = addr + len; 7644 oldeaddr = seg->s_base + seg->s_size; 7645 if (addr > seg->s_base) { 7646 /* 7647 * Must flush I/O page cache 7648 * before splitting segment 7649 */ 7650 if (svd->softlockcnt > 0) 7651 segvn_purge(seg); 7652 7653 /* 7654 * Split segment and return IE_REATTACH to tell 7655 * as_ctl() that current segment changed 7656 */ 7657 new_seg = segvn_split_seg(seg, addr); 7658 new_svd = (struct segvn_data *)new_seg->s_data; 7659 err = IE_REATTACH; 7660 7661 /* 7662 * If new segment ends where old one 7663 * did, try to concatenate the new 7664 * segment with next one. 7665 */ 7666 if (eaddr == oldeaddr) { 7667 /* 7668 * Set policy for new segment 7669 */ 7670 (void) lgrp_privm_policy_set(policy, 7671 &new_svd->policy_info, 7672 new_seg->s_size); 7673 7674 next = AS_SEGNEXT(new_seg->s_as, 7675 new_seg); 7676 7677 if (next && 7678 next->s_ops == &segvn_ops && 7679 eaddr == next->s_base) 7680 (void) segvn_concat(new_seg, 7681 next, 1); 7682 } 7683 } 7684 7685 /* 7686 * Split off end of existing segment if advice only 7687 * applies to a portion of segment ending before 7688 * end of the existing segment 7689 */ 7690 if (eaddr < oldeaddr) { 7691 /* 7692 * Must flush I/O page cache 7693 * before splitting segment 7694 */ 7695 if (svd->softlockcnt > 0) 7696 segvn_purge(seg); 7697 7698 /* 7699 * If beginning of old segment was already 7700 * split off, use new segment to split end off 7701 * from. 7702 */ 7703 if (new_seg != NULL && new_seg != seg) { 7704 /* 7705 * Split segment 7706 */ 7707 (void) segvn_split_seg(new_seg, eaddr); 7708 7709 /* 7710 * Set policy for new segment 7711 */ 7712 (void) lgrp_privm_policy_set(policy, 7713 &new_svd->policy_info, 7714 new_seg->s_size); 7715 } else { 7716 /* 7717 * Split segment and return IE_REATTACH 7718 * to tell as_ctl() that current 7719 * segment changed 7720 */ 7721 (void) segvn_split_seg(seg, eaddr); 7722 err = IE_REATTACH; 7723 7724 (void) lgrp_privm_policy_set(policy, 7725 &svd->policy_info, seg->s_size); 7726 7727 /* 7728 * If new segment starts where old one 7729 * did, try to concatenate it with 7730 * previous segment. 
7731 */ 7732 if (addr == seg->s_base) { 7733 prev = AS_SEGPREV(seg->s_as, 7734 seg); 7735 7736 /* 7737 * Drop lock for private data 7738 * of current segment before 7739 * concatenating (deleting) it 7740 */ 7741 if (prev && 7742 prev->s_ops == 7743 &segvn_ops && 7744 addr == prev->s_base + 7745 prev->s_size) { 7746 SEGVN_LOCK_EXIT( 7747 seg->s_as, 7748 &svd->lock); 7749 (void) segvn_concat( 7750 prev, seg, 1); 7751 return (err); 7752 } 7753 } 7754 } 7755 } 7756 break; 7757 case MADV_SEQUENTIAL: 7758 ASSERT(seg->s_szc == 0); 7759 hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD); 7760 /* FALLTHROUGH */ 7761 case MADV_NORMAL: 7762 case MADV_RANDOM: 7763 bvpp = &svd->vpage[page]; 7764 evpp = &svd->vpage[page + (len >> PAGESHIFT)]; 7765 for (; bvpp < evpp; bvpp++) 7766 VPP_SETADVICE(bvpp, behav); 7767 svd->advice = MADV_NORMAL; 7768 break; 7769 case MADV_WILLNEED: /* handled in memcntl */ 7770 case MADV_DONTNEED: /* handled in memcntl */ 7771 case MADV_FREE: /* handled above */ 7772 break; 7773 default: 7774 err = EINVAL; 7775 } 7776 } 7777 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7778 return (err); 7779 } 7780 7781 /* 7782 * Create a vpage structure for this seg. 7783 */ 7784 static void 7785 segvn_vpage(struct seg *seg) 7786 { 7787 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7788 struct vpage *vp, *evp; 7789 7790 ASSERT(SEGVN_WRITE_HELD(seg->s_as, &svd->lock)); 7791 7792 /* 7793 * If no vpage structure exists, allocate one. Copy the protections 7794 * and the advice from the segment itself to the individual pages. 7795 */ 7796 if (svd->vpage == NULL) { 7797 svd->pageprot = 1; 7798 svd->pageadvice = 1; 7799 svd->vpage = kmem_zalloc(seg_pages(seg) * sizeof (struct vpage), 7800 KM_SLEEP); 7801 evp = &svd->vpage[seg_page(seg, seg->s_base + seg->s_size)]; 7802 for (vp = svd->vpage; vp < evp; vp++) { 7803 VPP_SETPROT(vp, svd->prot); 7804 VPP_SETADVICE(vp, svd->advice); 7805 } 7806 } 7807 } 7808 7809 /* 7810 * Dump the pages belonging to this segvn segment. 7811 */ 7812 static void 7813 segvn_dump(struct seg *seg) 7814 { 7815 struct segvn_data *svd; 7816 page_t *pp; 7817 struct anon_map *amp; 7818 ulong_t anon_index; 7819 struct vnode *vp; 7820 u_offset_t off, offset; 7821 pfn_t pfn; 7822 pgcnt_t page, npages; 7823 caddr_t addr; 7824 7825 npages = seg_pages(seg); 7826 svd = (struct segvn_data *)seg->s_data; 7827 vp = svd->vp; 7828 off = offset = svd->offset; 7829 addr = seg->s_base; 7830 7831 if ((amp = svd->amp) != NULL) { 7832 anon_index = svd->anon_index; 7833 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7834 } 7835 7836 for (page = 0; page < npages; page++, offset += PAGESIZE) { 7837 struct anon *ap; 7838 int we_own_it = 0; 7839 7840 if (amp && (ap = anon_get_ptr(svd->amp->ahp, anon_index++))) { 7841 swap_xlate_nopanic(ap, &vp, &off); 7842 } else { 7843 vp = svd->vp; 7844 off = offset; 7845 } 7846 7847 /* 7848 * If pp == NULL, the page either does not exist 7849 * or is exclusively locked. So determine if it 7850 * exists before searching for it. 7851 */ 7852 7853 if ((pp = page_lookup_nowait(vp, off, SE_SHARED))) 7854 we_own_it = 1; 7855 else 7856 pp = page_exists(vp, off); 7857 7858 if (pp) { 7859 pfn = page_pptonum(pp); 7860 dump_addpage(seg->s_as, addr, pfn); 7861 if (we_own_it) 7862 page_unlock(pp); 7863 } 7864 addr += PAGESIZE; 7865 dump_timeleft = dump_timeout; 7866 } 7867 7868 if (amp != NULL) 7869 ANON_LOCK_EXIT(&->a_rwlock); 7870 } 7871 7872 /* 7873 * lock/unlock anon pages over a given range. 
Return shadow list 7874 */ 7875 static int 7876 segvn_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***ppp, 7877 enum lock_type type, enum seg_rw rw) 7878 { 7879 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7880 size_t np, adjustpages = 0, npages = (len >> PAGESHIFT); 7881 ulong_t anon_index; 7882 uint_t protchk; 7883 uint_t error; 7884 struct anon_map *amp; 7885 struct page **pplist, **pl, *pp; 7886 caddr_t a; 7887 size_t page; 7888 caddr_t lpgaddr, lpgeaddr; 7889 pgcnt_t szc0_npages = 0; 7890 7891 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_START, 7892 "segvn_pagelock: start seg %p addr %p", seg, addr); 7893 7894 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 7895 if (seg->s_szc != 0 && (type == L_PAGELOCK || type == L_PAGEUNLOCK)) { 7896 /* 7897 * We are adjusting the pagelock region to the large page size 7898 * boundary because the unlocked part of a large page cannot 7899 * be freed anyway unless all constituent pages of a large 7900 * page are locked. Therefore this adjustment allows us to 7901 * decrement availrmem by the right value (note we don't want 7902 * to just decrement availrem by the large page size without 7903 * adjusting addr and len because then we may end up 7904 * decrementing availrmem by large page size for every 7905 * constituent page locked by a new as_pagelock call). 7906 * as_pageunlock caller must always match as_pagelock call's 7907 * addr and len. 7908 * 7909 * Note segment's page size cannot change while we are holding 7910 * as lock. And then it cannot change while softlockcnt is 7911 * not 0. This will allow us to correctly recalculate large 7912 * page size region for the matching pageunlock/reclaim call. 7913 * 7914 * for pageunlock *ppp points to the pointer of page_t that 7915 * corresponds to the real unadjusted start address. Similar 7916 * for pagelock *ppp must point to the pointer of page_t that 7917 * corresponds to the real unadjusted start address. 7918 */ 7919 size_t pgsz = page_get_pagesize(seg->s_szc); 7920 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 7921 adjustpages = ((uintptr_t)(addr - lpgaddr)) >> PAGESHIFT; 7922 } 7923 7924 if (type == L_PAGEUNLOCK) { 7925 7926 /* 7927 * update hat ref bits for /proc. We need to make sure 7928 * that threads tracing the ref and mod bits of the 7929 * address space get the right data. 7930 * Note: page ref and mod bits are updated at reclaim time 7931 */ 7932 if (seg->s_as->a_vbits) { 7933 for (a = addr; a < addr + len; a += PAGESIZE) { 7934 if (rw == S_WRITE) { 7935 hat_setstat(seg->s_as, a, 7936 PAGESIZE, P_REF | P_MOD); 7937 } else { 7938 hat_setstat(seg->s_as, a, 7939 PAGESIZE, P_REF); 7940 } 7941 } 7942 } 7943 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7944 if (seg->s_szc != 0) { 7945 VM_STAT_ADD(segvnvmstats.pagelock[0]); 7946 seg_pinactive(seg, lpgaddr, lpgeaddr - lpgaddr, 7947 *ppp - adjustpages, rw, segvn_reclaim); 7948 } else { 7949 seg_pinactive(seg, addr, len, *ppp, rw, segvn_reclaim); 7950 } 7951 7952 /* 7953 * If someone is blocked while unmapping, we purge 7954 * segment page cache and thus reclaim pplist synchronously 7955 * without waiting for seg_pasync_thread. This speeds up 7956 * unmapping in cases where munmap(2) is called, while 7957 * raw async i/o is still in progress or where a thread 7958 * exits on data fault in a multithreaded application. 
7959 */ 7960 if (AS_ISUNMAPWAIT(seg->s_as) && (svd->softlockcnt > 0)) { 7961 /* 7962 * Even if we grab segvn WRITER's lock or segp_slock 7963 * here, there might be another thread which could've 7964 * successfully performed lookup/insert just before 7965 * we acquired the lock here. So, grabbing either 7966 * lock here is of not much use. Until we devise 7967 * a strategy at upper layers to solve the 7968 * synchronization issues completely, we expect 7969 * applications to handle this appropriately. 7970 */ 7971 segvn_purge(seg); 7972 } 7973 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7974 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END, 7975 "segvn_pagelock: unlock seg %p addr %p", seg, addr); 7976 return (0); 7977 } else if (type == L_PAGERECLAIM) { 7978 VM_STAT_COND_ADD(seg->s_szc != 0, segvnvmstats.pagelock[1]); 7979 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7980 (void) segvn_reclaim(seg, addr, len, *ppp, rw); 7981 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7982 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END, 7983 "segvn_pagelock: reclaim seg %p addr %p", seg, addr); 7984 return (0); 7985 } 7986 7987 if (seg->s_szc != 0) { 7988 VM_STAT_ADD(segvnvmstats.pagelock[2]); 7989 addr = lpgaddr; 7990 len = lpgeaddr - lpgaddr; 7991 npages = (len >> PAGESHIFT); 7992 } 7993 7994 /* 7995 * for now we only support pagelock to anon memory. We've to check 7996 * protections for vnode objects and call into the vnode driver. 7997 * That's too much for a fast path. Let the fault entry point handle it. 7998 */ 7999 if (svd->vp != NULL) { 8000 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, 8001 "segvn_pagelock: mapped vnode seg %p addr %p", seg, addr); 8002 *ppp = NULL; 8003 return (ENOTSUP); 8004 } 8005 8006 /* 8007 * if anonmap is not yet created, let the fault entry point populate it 8008 * with anon ptrs. 8009 */ 8010 if ((amp = svd->amp) == NULL) { 8011 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, 8012 "segvn_pagelock: anonmap null seg %p addr %p", seg, addr); 8013 *ppp = NULL; 8014 return (EFAULT); 8015 } 8016 8017 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 8018 8019 /* 8020 * we acquire segp_slock to prevent duplicate entries 8021 * in seg_pcache 8022 */ 8023 mutex_enter(&svd->segp_slock); 8024 8025 /* 8026 * try to find pages in segment page cache 8027 */ 8028 pplist = seg_plookup(seg, addr, len, rw); 8029 if (pplist != NULL) { 8030 mutex_exit(&svd->segp_slock); 8031 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8032 *ppp = pplist + adjustpages; 8033 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_HIT_END, 8034 "segvn_pagelock: cache hit seg %p addr %p", seg, addr); 8035 return (0); 8036 } 8037 8038 if (rw == S_READ) { 8039 protchk = PROT_READ; 8040 } else { 8041 protchk = PROT_WRITE; 8042 } 8043 8044 if (svd->pageprot == 0) { 8045 if ((svd->prot & protchk) == 0) { 8046 mutex_exit(&svd->segp_slock); 8047 error = EFAULT; 8048 goto out; 8049 } 8050 } else { 8051 /* 8052 * check page protections 8053 */ 8054 for (a = addr; a < addr + len; a += PAGESIZE) { 8055 struct vpage *vp; 8056 8057 vp = &svd->vpage[seg_page(seg, a)]; 8058 if ((VPP_PROT(vp) & protchk) == 0) { 8059 mutex_exit(&svd->segp_slock); 8060 error = EFAULT; 8061 goto out; 8062 } 8063 } 8064 } 8065 8066 /* 8067 * Avoid per page overhead of segvn_pp_lock_anonpages() for small 8068 * pages. For large pages segvn_pp_lock_anonpages() only does real 8069 * work once per large page. The tradeoff is that we may decrement 8070 * availrmem more than once for the same page but this is ok 8071 * for small pages. 
	if (seg->s_szc != 0) {
		VM_STAT_ADD(segvnvmstats.pagelock[2]);
		addr = lpgaddr;
		len = lpgeaddr - lpgaddr;
		npages = (len >> PAGESHIFT);
	}

	/*
	 * For now we only support pagelock to anon memory. We would have
	 * to check protections for vnode objects and call into the vnode
	 * driver; that is too much for a fast path, so let the fault entry
	 * point handle it.
	 */
	if (svd->vp != NULL) {
		TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END,
		    "segvn_pagelock: mapped vnode seg %p addr %p", seg, addr);
		*ppp = NULL;
		return (ENOTSUP);
	}

	/*
	 * If the anon map is not yet created, let the fault entry point
	 * populate it with anon ptrs.
	 */
	if ((amp = svd->amp) == NULL) {
		TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END,
		    "segvn_pagelock: anonmap null seg %p addr %p", seg, addr);
		*ppp = NULL;
		return (EFAULT);
	}

	SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);

	/*
	 * We acquire segp_slock to prevent duplicate entries in seg_pcache.
	 */
	mutex_enter(&svd->segp_slock);

	/*
	 * Try to find the pages in the segment page cache.
	 */
	pplist = seg_plookup(seg, addr, len, rw);
	if (pplist != NULL) {
		mutex_exit(&svd->segp_slock);
		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
		*ppp = pplist + adjustpages;
		TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_HIT_END,
		    "segvn_pagelock: cache hit seg %p addr %p", seg, addr);
		return (0);
	}

	if (rw == S_READ) {
		protchk = PROT_READ;
	} else {
		protchk = PROT_WRITE;
	}

	if (svd->pageprot == 0) {
		if ((svd->prot & protchk) == 0) {
			mutex_exit(&svd->segp_slock);
			error = EFAULT;
			goto out;
		}
	} else {
		/*
		 * Check per-page protections.
		 */
		for (a = addr; a < addr + len; a += PAGESIZE) {
			struct vpage *vp;

			vp = &svd->vpage[seg_page(seg, a)];
			if ((VPP_PROT(vp) & protchk) == 0) {
				mutex_exit(&svd->segp_slock);
				error = EFAULT;
				goto out;
			}
		}
	}

	/*
	 * Avoid the per-page overhead of segvn_pp_lock_anonpages() for
	 * small pages. For large pages segvn_pp_lock_anonpages() only does
	 * real work once per large page. The tradeoff is that we may
	 * decrement availrmem more than once for the same page, but that
	 * is acceptable for small pages.
	 */
	if (seg->s_szc == 0) {
		mutex_enter(&freemem_lock);
		if (availrmem < tune.t_minarmem + npages) {
			mutex_exit(&freemem_lock);
			mutex_exit(&svd->segp_slock);
			error = ENOMEM;
			goto out;
		}
		availrmem -= npages;
		mutex_exit(&freemem_lock);
	}

	pplist = kmem_alloc(sizeof (page_t *) * npages, KM_SLEEP);
	pl = pplist;
	*ppp = pplist + adjustpages;

	page = seg_page(seg, addr);
	anon_index = svd->anon_index + page;

	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
	for (a = addr; a < addr + len; a += PAGESIZE, anon_index++) {
		struct anon *ap;
		struct vnode *vp;
		u_offset_t off;
		anon_sync_obj_t cookie;

		anon_array_enter(amp, anon_index, &cookie);
		ap = anon_get_ptr(amp->ahp, anon_index);
		if (ap == NULL) {
			anon_array_exit(&cookie);
			break;
		} else {
			/*
			 * We must never use seg_pcache for COW pages
			 * because we might end up with the original page
			 * still lying in seg_pcache even after the private
			 * page is created. This leads to data corruption,
			 * as aio_write would refer to the page still in
			 * the cache while all other accesses refer to the
			 * private page.
			 */
			if (ap->an_refcnt != 1) {
				anon_array_exit(&cookie);
				break;
			}
		}
		swap_xlate(ap, &vp, &off);
		anon_array_exit(&cookie);

		pp = page_lookup_nowait(vp, off, SE_SHARED);
		if (pp == NULL) {
			break;
		}
		if (seg->s_szc != 0 || pp->p_szc != 0) {
			if (!segvn_pp_lock_anonpages(pp, a == addr)) {
				page_unlock(pp);
				break;
			}
		} else {
			szc0_npages++;
		}
		*pplist++ = pp;
	}
	ANON_LOCK_EXIT(&amp->a_rwlock);

	ASSERT(npages >= szc0_npages);

	if (a >= addr + len) {
		mutex_enter(&freemem_lock);
		if (seg->s_szc == 0 && npages != szc0_npages) {
			ASSERT(svd->type == MAP_SHARED && amp->a_szc > 0);
			availrmem += (npages - szc0_npages);
		}
		svd->softlockcnt += npages;
		segvn_pages_locked += npages;
		mutex_exit(&freemem_lock);
		(void) seg_pinsert(seg, addr, len, pl, rw, SEGP_ASYNC_FLUSH,
		    segvn_reclaim);
		mutex_exit(&svd->segp_slock);
		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
		TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_FILL_END,
		    "segvn_pagelock: cache fill seg %p addr %p", seg, addr);
		return (0);
	}

	mutex_exit(&svd->segp_slock);
	if (seg->s_szc == 0) {
		mutex_enter(&freemem_lock);
		availrmem += npages;
		mutex_exit(&freemem_lock);
	}
	error = EFAULT;
	pplist = pl;
	np = ((uintptr_t)(a - addr)) >> PAGESHIFT;
	while (np > (uint_t)0) {
		ASSERT(PAGE_LOCKED(*pplist));
		if (seg->s_szc != 0 || (*pplist)->p_szc != 0) {
			segvn_pp_unlock_anonpages(*pplist, pplist == pl);
		}
		page_unlock(*pplist);
		np--;
		pplist++;
	}
	kmem_free(pl, sizeof (page_t *) * npages);
out:
	SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
	*ppp = NULL;
	TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END,
	    "segvn_pagelock: cache miss seg %p addr %p", seg, addr);
	return (error);
}

/*
 * Purge any cached pages in the I/O page cache.
 */
static void
segvn_purge(struct seg *seg)
{
	seg_ppurge(seg);
}

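/*
 * segvn_reclaim() is the callback passed to seg_pinsert() and
 * seg_pinactive() above; seg_pcache invokes it (synchronously or from
 * seg_pasync_thread) when a cached shadow list is released.  It updates
 * the hat ref/mod bits, drops the per-page locks taken in
 * segvn_pagelock(), and restores the availrmem/softlockcnt accounting.
 */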
static int
segvn_reclaim(struct seg *seg, caddr_t addr, size_t len, struct page **pplist,
    enum seg_rw rw)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	pgcnt_t np, npages;
	struct page **pl;
	pgcnt_t szc0_npages = 0;

#ifdef lint
	addr = addr;
#endif

	npages = np = (len >> PAGESHIFT);
	ASSERT(npages);
	pl = pplist;
	if (seg->s_szc != 0) {
		size_t pgsz = page_get_pagesize(seg->s_szc);
		if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) {
			panic("segvn_reclaim: unaligned addr or len");
			/*NOTREACHED*/
		}
	}

	ASSERT(svd->vp == NULL && svd->amp != NULL);

	while (np > (uint_t)0) {
		if (rw == S_WRITE) {
			hat_setrefmod(*pplist);
		} else {
			hat_setref(*pplist);
		}
		if (seg->s_szc != 0 || (*pplist)->p_szc != 0) {
			segvn_pp_unlock_anonpages(*pplist, pplist == pl);
		} else {
			szc0_npages++;
		}
		page_unlock(*pplist);
		np--;
		pplist++;
	}
	kmem_free(pl, sizeof (page_t *) * npages);

	mutex_enter(&freemem_lock);
	segvn_pages_locked -= npages;
	svd->softlockcnt -= npages;
	if (szc0_npages != 0) {
		availrmem += szc0_npages;
	}
	mutex_exit(&freemem_lock);
	if (svd->softlockcnt <= 0) {
		if (AS_ISUNMAPWAIT(seg->s_as)) {
			mutex_enter(&seg->s_as->a_contents);
			if (AS_ISUNMAPWAIT(seg->s_as)) {
				AS_CLRUNMAPWAIT(seg->s_as);
				cv_broadcast(&seg->s_as->a_cv);
			}
			mutex_exit(&seg->s_as->a_contents);
		}
	}
	return (0);
}

/*
 * Get a memory ID for an addr in a given segment.
 *
 * XXX only creates PAGESIZE pages if anon slots are not initialized.
 * At fault time they will be relocated into larger pages.
 */
static int
segvn_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct anon *ap = NULL;
	ulong_t anon_index;
	struct anon_map *amp;
	anon_sync_obj_t cookie;

	if (svd->type == MAP_PRIVATE) {
		memidp->val[0] = (uintptr_t)seg->s_as;
		memidp->val[1] = (uintptr_t)addr;
		return (0);
	}

	if (svd->type == MAP_SHARED) {
		if (svd->vp) {
			memidp->val[0] = (uintptr_t)svd->vp;
			memidp->val[1] = (u_longlong_t)svd->offset +
			    (uintptr_t)(addr - seg->s_base);
			return (0);
		} else {

			SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
			if ((amp = svd->amp) != NULL) {
				anon_index = svd->anon_index +
				    seg_page(seg, addr);
			}
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);

			ASSERT(amp != NULL);

			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
			anon_array_enter(amp, anon_index, &cookie);
			ap = anon_get_ptr(amp->ahp, anon_index);
			if (ap == NULL) {
				page_t *pp;

				pp = anon_zero(seg, addr, &ap, svd->cred);
				if (pp == NULL) {
					anon_array_exit(&cookie);
					ANON_LOCK_EXIT(&amp->a_rwlock);
					return (ENOMEM);
				}
				ASSERT(anon_get_ptr(amp->ahp, anon_index)
				    == NULL);
				(void) anon_set_ptr(amp->ahp, anon_index,
				    ap, ANON_SLEEP);
				page_unlock(pp);
			}

			anon_array_exit(&cookie);
			ANON_LOCK_EXIT(&amp->a_rwlock);

			memidp->val[0] = (uintptr_t)ap;
			memidp->val[1] = (uintptr_t)addr & PAGEOFFSET;
			return (0);
		}
	}
	return (EINVAL);
}

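/*
 * Return 1 if every page in the range [a, a + len) is mapped with the same
 * protections, 0 otherwise.  If the segment does not use per-page
 * protections (svd->pageprot == 0) the answer is trivially 1.
 */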
static int
sameprot(struct seg *seg, caddr_t a, size_t len)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct vpage *vpage;
	spgcnt_t pages = btop(len);
	uint_t prot;

	if (svd->pageprot == 0)
		return (1);

	ASSERT(svd->vpage != NULL);

	vpage = &svd->vpage[seg_page(seg, a)];
	prot = VPP_PROT(vpage);
	vpage++;
	pages--;
	while (pages-- > 0) {
		if (prot != VPP_PROT(vpage))
			return (0);
		vpage++;
	}
	return (1);
}

/*
 * Get memory allocation policy info for specified address in given segment
 */
static lgrp_mem_policy_info_t *
segvn_getpolicy(struct seg *seg, caddr_t addr)
{
	struct anon_map *amp;
	ulong_t anon_index;
	lgrp_mem_policy_info_t *policy_info;
	struct segvn_data *svn_data;
	u_offset_t vn_off;
	vnode_t *vp;

	ASSERT(seg != NULL);

	svn_data = (struct segvn_data *)seg->s_data;
	if (svn_data == NULL)
		return (NULL);

	/*
	 * Get policy info for private or shared memory
	 */
	if (svn_data->type != MAP_SHARED)
		policy_info = &svn_data->policy_info;
	else {
		amp = svn_data->amp;
		anon_index = svn_data->anon_index + seg_page(seg, addr);
		vp = svn_data->vp;
		vn_off = svn_data->offset + (uintptr_t)(addr - seg->s_base);
		policy_info = lgrp_shm_policy_get(amp, anon_index, vp, vn_off);
	}

	return (policy_info);
}

/*ARGSUSED*/
static int
segvn_capable(struct seg *seg, segcapability_t capability)
{
	return (0);
}