1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 37 */ 38 39 #pragma ident "%Z%%M% %I% %E% SMI" 40 41 /* 42 * VM - shared or copy-on-write from a vnode/anonymous memory. 43 */ 44 45 #include <sys/types.h> 46 #include <sys/param.h> 47 #include <sys/t_lock.h> 48 #include <sys/errno.h> 49 #include <sys/systm.h> 50 #include <sys/mman.h> 51 #include <sys/debug.h> 52 #include <sys/cred.h> 53 #include <sys/vmsystm.h> 54 #include <sys/tuneable.h> 55 #include <sys/bitmap.h> 56 #include <sys/swap.h> 57 #include <sys/kmem.h> 58 #include <sys/sysmacros.h> 59 #include <sys/vtrace.h> 60 #include <sys/cmn_err.h> 61 #include <sys/callb.h> 62 #include <sys/vm.h> 63 #include <sys/dumphdr.h> 64 #include <sys/lgrp.h> 65 66 #include <vm/hat.h> 67 #include <vm/as.h> 68 #include <vm/seg.h> 69 #include <vm/seg_vn.h> 70 #include <vm/pvn.h> 71 #include <vm/anon.h> 72 #include <vm/page.h> 73 #include <vm/vpage.h> 74 #include <sys/proc.h> 75 #include <sys/task.h> 76 #include <sys/project.h> 77 #include <sys/zone.h> 78 #include <sys/shm_impl.h> 79 /* 80 * Private seg op routines. 
81 */ 82 static int segvn_dup(struct seg *seg, struct seg *newseg); 83 static int segvn_unmap(struct seg *seg, caddr_t addr, size_t len); 84 static void segvn_free(struct seg *seg); 85 static faultcode_t segvn_fault(struct hat *hat, struct seg *seg, 86 caddr_t addr, size_t len, enum fault_type type, 87 enum seg_rw rw); 88 static faultcode_t segvn_faulta(struct seg *seg, caddr_t addr); 89 static int segvn_setprot(struct seg *seg, caddr_t addr, 90 size_t len, uint_t prot); 91 static int segvn_checkprot(struct seg *seg, caddr_t addr, 92 size_t len, uint_t prot); 93 static int segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta); 94 static size_t segvn_swapout(struct seg *seg); 95 static int segvn_sync(struct seg *seg, caddr_t addr, size_t len, 96 int attr, uint_t flags); 97 static size_t segvn_incore(struct seg *seg, caddr_t addr, size_t len, 98 char *vec); 99 static int segvn_lockop(struct seg *seg, caddr_t addr, size_t len, 100 int attr, int op, ulong_t *lockmap, size_t pos); 101 static int segvn_getprot(struct seg *seg, caddr_t addr, size_t len, 102 uint_t *protv); 103 static u_offset_t segvn_getoffset(struct seg *seg, caddr_t addr); 104 static int segvn_gettype(struct seg *seg, caddr_t addr); 105 static int segvn_getvp(struct seg *seg, caddr_t addr, 106 struct vnode **vpp); 107 static int segvn_advise(struct seg *seg, caddr_t addr, size_t len, 108 uint_t behav); 109 static void segvn_dump(struct seg *seg); 110 static int segvn_pagelock(struct seg *seg, caddr_t addr, size_t len, 111 struct page ***ppp, enum lock_type type, enum seg_rw rw); 112 static int segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len, 113 uint_t szc); 114 static int segvn_getmemid(struct seg *seg, caddr_t addr, 115 memid_t *memidp); 116 static lgrp_mem_policy_info_t *segvn_getpolicy(struct seg *, caddr_t); 117 static int segvn_capable(struct seg *seg, segcapability_t capable); 118 119 struct seg_ops segvn_ops = { 120 segvn_dup, 121 segvn_unmap, 122 segvn_free, 123 segvn_fault, 124 segvn_faulta, 125 segvn_setprot, 126 segvn_checkprot, 127 segvn_kluster, 128 segvn_swapout, 129 segvn_sync, 130 segvn_incore, 131 segvn_lockop, 132 segvn_getprot, 133 segvn_getoffset, 134 segvn_gettype, 135 segvn_getvp, 136 segvn_advise, 137 segvn_dump, 138 segvn_pagelock, 139 segvn_setpagesize, 140 segvn_getmemid, 141 segvn_getpolicy, 142 segvn_capable, 143 }; 144 145 /* 146 * Common zfod structures, provided as a shorthand for others to use. 
147 */ 148 static segvn_crargs_t zfod_segvn_crargs = 149 SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL); 150 static segvn_crargs_t kzfod_segvn_crargs = 151 SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_USER, 152 PROT_ALL & ~PROT_USER); 153 static segvn_crargs_t stack_noexec_crargs = 154 SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_EXEC, PROT_ALL); 155 156 caddr_t zfod_argsp = (caddr_t)&zfod_segvn_crargs; /* user zfod argsp */ 157 caddr_t kzfod_argsp = (caddr_t)&kzfod_segvn_crargs; /* kernel zfod argsp */ 158 caddr_t stack_exec_argsp = (caddr_t)&zfod_segvn_crargs; /* executable stack */ 159 caddr_t stack_noexec_argsp = (caddr_t)&stack_noexec_crargs; /* noexec stack */ 160 161 #define vpgtob(n) ((n) * sizeof (struct vpage)) /* For brevity */ 162 163 size_t segvn_comb_thrshld = UINT_MAX; /* patchable -- see 1196681 */ 164 165 static int segvn_concat(struct seg *, struct seg *, int); 166 static int segvn_extend_prev(struct seg *, struct seg *, 167 struct segvn_crargs *, size_t); 168 static int segvn_extend_next(struct seg *, struct seg *, 169 struct segvn_crargs *, size_t); 170 static void segvn_softunlock(struct seg *, caddr_t, size_t, enum seg_rw); 171 static void segvn_pagelist_rele(page_t **); 172 static void segvn_setvnode_mpss(vnode_t *); 173 static void segvn_relocate_pages(page_t **, page_t *); 174 static int segvn_full_szcpages(page_t **, uint_t, int *, uint_t *); 175 static int segvn_fill_vp_pages(struct segvn_data *, vnode_t *, u_offset_t, 176 uint_t, page_t **, page_t **, uint_t *, int *); 177 static faultcode_t segvn_fault_vnodepages(struct hat *, struct seg *, caddr_t, 178 caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int); 179 static faultcode_t segvn_fault_anonpages(struct hat *, struct seg *, caddr_t, 180 caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int); 181 static faultcode_t segvn_faultpage(struct hat *, struct seg *, caddr_t, 182 u_offset_t, struct vpage *, page_t **, uint_t, 183 enum fault_type, enum seg_rw, int, int); 184 static void segvn_vpage(struct seg *); 185 186 static void segvn_purge(struct seg *seg); 187 static int segvn_reclaim(struct seg *, caddr_t, size_t, struct page **, 188 enum seg_rw); 189 190 static int sameprot(struct seg *, caddr_t, size_t); 191 192 static int segvn_demote_range(struct seg *, caddr_t, size_t, int, uint_t); 193 static int segvn_clrszc(struct seg *); 194 static struct seg *segvn_split_seg(struct seg *, caddr_t); 195 static int segvn_claim_pages(struct seg *, struct vpage *, u_offset_t, 196 ulong_t, uint_t); 197 198 static int segvn_pp_lock_anonpages(page_t *, int); 199 static void segvn_pp_unlock_anonpages(page_t *, int); 200 201 static struct kmem_cache *segvn_cache; 202 203 #ifdef VM_STATS 204 static struct segvnvmstats_str { 205 ulong_t fill_vp_pages[31]; 206 ulong_t fltvnpages[49]; 207 ulong_t fullszcpages[10]; 208 ulong_t relocatepages[3]; 209 ulong_t fltanpages[17]; 210 ulong_t pagelock[3]; 211 ulong_t demoterange[3]; 212 } segvnvmstats; 213 #endif /* VM_STATS */ 214 215 #define SDR_RANGE 1 /* demote entire range */ 216 #define SDR_END 2 /* demote non aligned ends only */ 217 218 #define CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr) { \ 219 if ((len) != 0) { \ 220 lpgaddr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz); \ 221 ASSERT(lpgaddr >= (seg)->s_base); \ 222 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)((addr) + \ 223 (len)), pgsz); \ 224 ASSERT(lpgeaddr > lpgaddr); \ 225 ASSERT(lpgeaddr <= (seg)->s_base + (seg)->s_size); \ 226 } else { \ 227 lpgeaddr = lpgaddr = (addr); \ 228 } \ 229 } 230 231 /*ARGSUSED*/ 232 static int 233 
segvn_cache_constructor(void *buf, void *cdrarg, int kmflags) 234 { 235 struct segvn_data *svd = buf; 236 237 rw_init(&svd->lock, NULL, RW_DEFAULT, NULL); 238 mutex_init(&svd->segp_slock, NULL, MUTEX_DEFAULT, NULL); 239 svd->svn_trnext = svd->svn_trprev = NULL; 240 return (0); 241 } 242 243 /*ARGSUSED1*/ 244 static void 245 segvn_cache_destructor(void *buf, void *cdrarg) 246 { 247 struct segvn_data *svd = buf; 248 249 rw_destroy(&svd->lock); 250 mutex_destroy(&svd->segp_slock); 251 } 252 253 /*ARGSUSED*/ 254 static int 255 svntr_cache_constructor(void *buf, void *cdrarg, int kmflags) 256 { 257 bzero(buf, sizeof (svntr_t)); 258 return (0); 259 } 260 261 /* 262 * Patching this variable to non-zero allows the system to run with 263 * stacks marked as "not executable". It's a bit of a kludge, but is 264 * provided as a tweakable for platforms that export those ABIs 265 * (e.g. sparc V8) that have executable stacks enabled by default. 266 * There are also some restrictions for platforms that don't actually 267 * implement 'noexec' protections. 268 * 269 * Once enabled, the system is (therefore) unable to provide a fully 270 * ABI-compliant execution environment, though practically speaking, 271 * most everything works. The exceptions are generally some interpreters 272 * and debuggers that create executable code on the stack and jump 273 * into it (without explicitly mprotecting the address range to include 274 * PROT_EXEC). 275 * 276 * One important class of applications that are disabled are those 277 * that have been transformed into malicious agents using one of the 278 * numerous "buffer overflow" attacks. See 4007890. 279 */ 280 int noexec_user_stack = 0; 281 int noexec_user_stack_log = 1; 282 283 int segvn_lpg_disable = 0; 284 uint_t segvn_maxpgszc = 0; 285 286 ulong_t segvn_vmpss_clrszc_cnt; 287 ulong_t segvn_vmpss_clrszc_err; 288 ulong_t segvn_fltvnpages_clrszc_cnt; 289 ulong_t segvn_fltvnpages_clrszc_err; 290 ulong_t segvn_setpgsz_align_err; 291 ulong_t segvn_setpgsz_anon_align_err; 292 ulong_t segvn_setpgsz_getattr_err; 293 ulong_t segvn_setpgsz_eof_err; 294 ulong_t segvn_faultvnmpss_align_err1; 295 ulong_t segvn_faultvnmpss_align_err2; 296 ulong_t segvn_faultvnmpss_align_err3; 297 ulong_t segvn_faultvnmpss_align_err4; 298 ulong_t segvn_faultvnmpss_align_err5; 299 ulong_t segvn_vmpss_pageio_deadlk_err; 300 301 /* 302 * Segvn supports text replication optimization for NUMA platforms. Text 303 * replica's are represented by anon maps (amp). There's one amp per text file 304 * region per lgroup. A process chooses the amp for each of its text mappings 305 * based on the lgroup assignment of its main thread (t_tid = 1). All 306 * processes that want a replica on a particular lgroup for the same text file 307 * mapping share the same amp. amp's are looked up in svntr_hashtab hash table 308 * with vp,off,size,szc used as a key. Text replication segments are read only 309 * MAP_PRIVATE|MAP_TEXT segments that map vnode. Replication is achieved by 310 * forcing COW faults from vnode to amp and mapping amp pages instead of vnode 311 * pages. Replication amp is assigned to a segment when it gets its first 312 * pagefault. To handle main thread lgroup rehoming segvn_trasync_thread 313 * rechecks periodically if the process still maps an amp local to the main 314 * thread. If not async thread forces process to remap to an amp in the new 315 * home lgroup of the main thread. 
Current text replication implementation 316 * only provides the benefit to workloads that do most of their work in the 317 * main thread of a process or all the threads of a process run in the same 318 * lgroup. To extend text replication benefit to different types of 319 * multithreaded workloads further work would be needed in the hat layer to 320 * allow the same virtual address in the same hat to simultaneously map 321 * different physical addresses (i.e. page table replication would be needed 322 * for x86). 323 * 324 * amp pages are used instead of vnode pages as long as segment has a very 325 * simple life cycle. It's created via segvn_create(), handles S_EXEC 326 * (S_READ) pagefaults and is fully unmapped. If anything more complicated 327 * happens such as protection is changed, real COW fault happens, pagesize is 328 * changed, MC_LOCK is requested or segment is partially unmapped we turn off 329 * text replication by converting the segment back to vnode only segment 330 * (unmap segment's address range and set svd->amp to NULL). 331 * 332 * The original file can be changed after amp is inserted into 333 * svntr_hashtab. Processes that are launched after the file is already 334 * changed can't use the replica's created prior to the file change. To 335 * implement this functionality hash entries are timestamped. Replica's can 336 * only be used if current file modification time is the same as the timestamp 337 * saved when hash entry was created. However just timestamps alone are not 338 * sufficient to detect file modification via mmap(MAP_SHARED) mappings. We 339 * deal with file changes via MAP_SHARED mappings differently. When writable 340 * MAP_SHARED mappings are created to vnodes marked as executable we mark all 341 * existing replica's for this vnode as not usable for future text 342 * mappings. And we don't create new replica's for files that currently have 343 * potentially writable MAP_SHARED mappings (i.e. vn_is_mapped(V_WRITE) is 344 * true). 
345 */ 346 347 #define SEGVN_TEXTREPL_MAXBYTES_FACTOR (20) 348 size_t segvn_textrepl_max_bytes_factor = SEGVN_TEXTREPL_MAXBYTES_FACTOR; 349 350 static ulong_t svntr_hashtab_sz = 512; 351 static svntr_bucket_t *svntr_hashtab = NULL; 352 static struct kmem_cache *svntr_cache; 353 static svntr_stats_t *segvn_textrepl_stats; 354 static ksema_t segvn_trasync_sem; 355 356 int segvn_disable_textrepl = 0; 357 size_t textrepl_size_thresh = (size_t)-1; 358 size_t segvn_textrepl_bytes = 0; 359 size_t segvn_textrepl_max_bytes = 0; 360 clock_t segvn_update_textrepl_interval = 0; 361 int segvn_update_tr_time = 10; 362 int segvn_disable_textrepl_update = 0; 363 364 static void segvn_textrepl(struct seg *); 365 static void segvn_textunrepl(struct seg *, int); 366 static void segvn_inval_trcache(vnode_t *); 367 static void segvn_trasync_thread(void); 368 static void segvn_trupdate_wakeup(void *); 369 static void segvn_trupdate(void); 370 static void segvn_trupdate_seg(struct seg *, segvn_data_t *, svntr_t *, 371 ulong_t); 372 373 /* 374 * Initialize segvn data structures 375 */ 376 void 377 segvn_init(void) 378 { 379 uint_t maxszc; 380 uint_t szc; 381 size_t pgsz; 382 383 segvn_cache = kmem_cache_create("segvn_cache", 384 sizeof (struct segvn_data), 0, 385 segvn_cache_constructor, segvn_cache_destructor, NULL, 386 NULL, NULL, 0); 387 388 if (segvn_lpg_disable != 0) 389 return; 390 szc = maxszc = page_num_pagesizes() - 1; 391 if (szc == 0) { 392 segvn_lpg_disable = 1; 393 return; 394 } 395 if (page_get_pagesize(0) != PAGESIZE) { 396 panic("segvn_init: bad szc 0"); 397 /*NOTREACHED*/ 398 } 399 while (szc != 0) { 400 pgsz = page_get_pagesize(szc); 401 if (pgsz <= PAGESIZE || !IS_P2ALIGNED(pgsz, pgsz)) { 402 panic("segvn_init: bad szc %d", szc); 403 /*NOTREACHED*/ 404 } 405 szc--; 406 } 407 if (segvn_maxpgszc == 0 || segvn_maxpgszc > maxszc) 408 segvn_maxpgszc = maxszc; 409 410 if (lgrp_optimizations() && textrepl_size_thresh != (size_t)-1 && 411 !segvn_disable_textrepl) { 412 ulong_t i; 413 size_t hsz = svntr_hashtab_sz * sizeof (svntr_bucket_t); 414 415 svntr_cache = kmem_cache_create("svntr_cache", 416 sizeof (svntr_t), 0, svntr_cache_constructor, NULL, 417 NULL, NULL, NULL, 0); 418 svntr_hashtab = kmem_zalloc(hsz, KM_SLEEP); 419 for (i = 0; i < svntr_hashtab_sz; i++) { 420 mutex_init(&svntr_hashtab[i].tr_lock, NULL, 421 MUTEX_DEFAULT, NULL); 422 } 423 segvn_textrepl_max_bytes = ptob(physmem) / 424 segvn_textrepl_max_bytes_factor; 425 segvn_textrepl_stats = kmem_zalloc(NCPU * 426 sizeof (svntr_stats_t), KM_SLEEP); 427 sema_init(&segvn_trasync_sem, 0, NULL, SEMA_DEFAULT, NULL); 428 (void) thread_create(NULL, 0, segvn_trasync_thread, 429 NULL, 0, &p0, TS_RUN, minclsyspri); 430 } 431 } 432 433 #define SEGVN_PAGEIO ((void *)0x1) 434 #define SEGVN_NOPAGEIO ((void *)0x2) 435 436 static void 437 segvn_setvnode_mpss(vnode_t *vp) 438 { 439 int err; 440 441 ASSERT(vp->v_mpssdata == NULL || 442 vp->v_mpssdata == SEGVN_PAGEIO || 443 vp->v_mpssdata == SEGVN_NOPAGEIO); 444 445 if (vp->v_mpssdata == NULL) { 446 if (vn_vmpss_usepageio(vp)) { 447 err = VOP_PAGEIO(vp, (page_t *)NULL, 448 (u_offset_t)0, 0, 0, CRED()); 449 } else { 450 err = ENOSYS; 451 } 452 /* 453 * set v_mpssdata just once per vnode life 454 * so that it never changes. 
455 */ 456 mutex_enter(&vp->v_lock); 457 if (vp->v_mpssdata == NULL) { 458 if (err == EINVAL) { 459 vp->v_mpssdata = SEGVN_PAGEIO; 460 } else { 461 vp->v_mpssdata = SEGVN_NOPAGEIO; 462 } 463 } 464 mutex_exit(&vp->v_lock); 465 } 466 } 467 468 int 469 segvn_create(struct seg *seg, void *argsp) 470 { 471 struct segvn_crargs *a = (struct segvn_crargs *)argsp; 472 struct segvn_data *svd; 473 size_t swresv = 0; 474 struct cred *cred; 475 struct anon_map *amp; 476 int error = 0; 477 size_t pgsz; 478 lgrp_mem_policy_t mpolicy = LGRP_MEM_POLICY_DEFAULT; 479 int trok = 0; 480 481 482 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 483 484 if (a->type != MAP_PRIVATE && a->type != MAP_SHARED) { 485 panic("segvn_create type"); 486 /*NOTREACHED*/ 487 } 488 489 /* 490 * Check arguments. If a shared anon structure is given then 491 * it is illegal to also specify a vp. 492 */ 493 if (a->amp != NULL && a->vp != NULL) { 494 panic("segvn_create anon_map"); 495 /*NOTREACHED*/ 496 } 497 498 /* MAP_NORESERVE on a MAP_SHARED segment is meaningless. */ 499 if (a->type == MAP_SHARED) 500 a->flags &= ~MAP_NORESERVE; 501 502 if (a->szc != 0) { 503 if (segvn_lpg_disable != 0 || (a->szc == AS_MAP_NO_LPOOB) || 504 (a->amp != NULL && a->type == MAP_PRIVATE) || 505 (a->flags & MAP_NORESERVE) || seg->s_as == &kas) { 506 a->szc = 0; 507 } else { 508 if (a->szc > segvn_maxpgszc) 509 a->szc = segvn_maxpgszc; 510 pgsz = page_get_pagesize(a->szc); 511 if (!IS_P2ALIGNED(seg->s_base, pgsz) || 512 !IS_P2ALIGNED(seg->s_size, pgsz)) { 513 a->szc = 0; 514 } else if (a->vp != NULL) { 515 extern struct vnode kvp; 516 if (IS_SWAPFSVP(a->vp) || VN_ISKAS(a->vp)) { 517 /* 518 * paranoid check. 519 * hat_page_demote() is not supported 520 * on swapfs pages. 521 */ 522 a->szc = 0; 523 } else if (map_addr_vacalign_check(seg->s_base, 524 a->offset & PAGEMASK)) { 525 a->szc = 0; 526 } 527 } else if (a->amp != NULL) { 528 pgcnt_t anum = btopr(a->offset); 529 pgcnt_t pgcnt = page_get_pagecnt(a->szc); 530 if (!IS_P2ALIGNED(anum, pgcnt)) { 531 a->szc = 0; 532 } 533 } 534 } 535 } 536 537 /* 538 * If segment may need private pages, reserve them now. 539 */ 540 if (!(a->flags & MAP_NORESERVE) && ((a->vp == NULL && a->amp == NULL) || 541 (a->type == MAP_PRIVATE && (a->prot & PROT_WRITE)))) { 542 if (anon_resv(seg->s_size) == 0) 543 return (EAGAIN); 544 swresv = seg->s_size; 545 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 546 seg, swresv, 1); 547 } 548 549 /* 550 * Reserve any mapping structures that may be required. 
551 */ 552 hat_map(seg->s_as->a_hat, seg->s_base, seg->s_size, HAT_MAP); 553 554 if (a->cred) { 555 cred = a->cred; 556 crhold(cred); 557 } else { 558 crhold(cred = CRED()); 559 } 560 561 /* Inform the vnode of the new mapping */ 562 if (a->vp != NULL) { 563 error = VOP_ADDMAP(a->vp, a->offset & PAGEMASK, 564 seg->s_as, seg->s_base, seg->s_size, a->prot, 565 a->maxprot, a->type, cred); 566 if (error) { 567 if (swresv != 0) { 568 anon_unresv(swresv); 569 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 570 "anon proc:%p %lu %u", 571 seg, swresv, 0); 572 } 573 crfree(cred); 574 hat_unload(seg->s_as->a_hat, seg->s_base, 575 seg->s_size, HAT_UNLOAD_UNMAP); 576 return (error); 577 } 578 trok = ((a->flags & MAP_TEXT) && 579 (seg->s_size > textrepl_size_thresh || 580 (a->flags & _MAP_TEXTREPL)) && 581 lgrp_optimizations() && svntr_hashtab != NULL && 582 a->type == MAP_PRIVATE && swresv == 0 && 583 !(a->flags & MAP_NORESERVE) && 584 seg->s_as != &kas && a->vp->v_type == VREG); 585 } 586 587 /* 588 * If more than one segment in the address space, and they're adjacent 589 * virtually, try to concatenate them. Don't concatenate if an 590 * explicit anon_map structure was supplied (e.g., SystemV shared 591 * memory) or if we'll use text replication for this segment. 592 */ 593 if (a->amp == NULL && !trok) { 594 struct seg *pseg, *nseg; 595 struct segvn_data *psvd, *nsvd; 596 lgrp_mem_policy_t ppolicy, npolicy; 597 uint_t lgrp_mem_policy_flags = 0; 598 extern lgrp_mem_policy_t lgrp_mem_default_policy; 599 600 /* 601 * Memory policy flags (lgrp_mem_policy_flags) is valid when 602 * extending stack/heap segments. 603 */ 604 if ((a->vp == NULL) && (a->type == MAP_PRIVATE) && 605 !(a->flags & MAP_NORESERVE) && (seg->s_as != &kas)) { 606 lgrp_mem_policy_flags = a->lgrp_mem_policy_flags; 607 } else { 608 /* 609 * Get policy when not extending it from another segment 610 */ 611 mpolicy = lgrp_mem_policy_default(seg->s_size, a->type); 612 } 613 614 /* 615 * First, try to concatenate the previous and new segments 616 */ 617 pseg = AS_SEGPREV(seg->s_as, seg); 618 if (pseg != NULL && 619 pseg->s_base + pseg->s_size == seg->s_base && 620 pseg->s_ops == &segvn_ops) { 621 /* 622 * Get memory allocation policy from previous segment. 623 * When extension is specified (e.g. for heap) apply 624 * this policy to the new segment regardless of the 625 * outcome of segment concatenation. Extension occurs 626 * for non-default policy otherwise default policy is 627 * used and is based on extended segment size. 628 */ 629 psvd = (struct segvn_data *)pseg->s_data; 630 ppolicy = psvd->policy_info.mem_policy; 631 if (lgrp_mem_policy_flags == 632 LGRP_MP_FLAG_EXTEND_UP) { 633 if (ppolicy != lgrp_mem_default_policy) { 634 mpolicy = ppolicy; 635 } else { 636 mpolicy = lgrp_mem_policy_default( 637 pseg->s_size + seg->s_size, 638 a->type); 639 } 640 } 641 642 if (mpolicy == ppolicy && 643 (pseg->s_size + seg->s_size <= 644 segvn_comb_thrshld || psvd->amp == NULL) && 645 segvn_extend_prev(pseg, seg, a, swresv) == 0) { 646 /* 647 * success! 
now try to concatenate 648 * with following seg 649 */ 650 crfree(cred); 651 nseg = AS_SEGNEXT(pseg->s_as, pseg); 652 if (nseg != NULL && 653 nseg != pseg && 654 nseg->s_ops == &segvn_ops && 655 pseg->s_base + pseg->s_size == 656 nseg->s_base) 657 (void) segvn_concat(pseg, nseg, 0); 658 ASSERT(pseg->s_szc == 0 || 659 (a->szc == pseg->s_szc && 660 IS_P2ALIGNED(pseg->s_base, pgsz) && 661 IS_P2ALIGNED(pseg->s_size, pgsz))); 662 return (0); 663 } 664 } 665 666 /* 667 * Failed, so try to concatenate with following seg 668 */ 669 nseg = AS_SEGNEXT(seg->s_as, seg); 670 if (nseg != NULL && 671 seg->s_base + seg->s_size == nseg->s_base && 672 nseg->s_ops == &segvn_ops) { 673 /* 674 * Get memory allocation policy from next segment. 675 * When extension is specified (e.g. for stack) apply 676 * this policy to the new segment regardless of the 677 * outcome of segment concatenation. Extension occurs 678 * for non-default policy otherwise default policy is 679 * used and is based on extended segment size. 680 */ 681 nsvd = (struct segvn_data *)nseg->s_data; 682 npolicy = nsvd->policy_info.mem_policy; 683 if (lgrp_mem_policy_flags == 684 LGRP_MP_FLAG_EXTEND_DOWN) { 685 if (npolicy != lgrp_mem_default_policy) { 686 mpolicy = npolicy; 687 } else { 688 mpolicy = lgrp_mem_policy_default( 689 nseg->s_size + seg->s_size, 690 a->type); 691 } 692 } 693 694 if (mpolicy == npolicy && 695 segvn_extend_next(seg, nseg, a, swresv) == 0) { 696 crfree(cred); 697 ASSERT(nseg->s_szc == 0 || 698 (a->szc == nseg->s_szc && 699 IS_P2ALIGNED(nseg->s_base, pgsz) && 700 IS_P2ALIGNED(nseg->s_size, pgsz))); 701 return (0); 702 } 703 } 704 } 705 706 if (a->vp != NULL) { 707 VN_HOLD(a->vp); 708 if (a->type == MAP_SHARED) 709 lgrp_shm_policy_init(NULL, a->vp); 710 } 711 svd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 712 713 seg->s_ops = &segvn_ops; 714 seg->s_data = (void *)svd; 715 seg->s_szc = a->szc; 716 717 svd->seg = seg; 718 svd->vp = a->vp; 719 /* 720 * Anonymous mappings have no backing file so the offset is meaningless. 721 */ 722 svd->offset = a->vp ? (a->offset & PAGEMASK) : 0; 723 svd->prot = a->prot; 724 svd->maxprot = a->maxprot; 725 svd->pageprot = 0; 726 svd->type = a->type; 727 svd->vpage = NULL; 728 svd->cred = cred; 729 svd->advice = MADV_NORMAL; 730 svd->pageadvice = 0; 731 svd->flags = (ushort_t)a->flags; 732 svd->softlockcnt = 0; 733 if (a->szc != 0 && a->vp != NULL) { 734 segvn_setvnode_mpss(a->vp); 735 } 736 if (svd->type == MAP_SHARED && svd->vp != NULL && 737 (svd->vp->v_flag & VVMEXEC) && (svd->prot & PROT_WRITE)) { 738 ASSERT(vn_is_mapped(svd->vp, V_WRITE)); 739 segvn_inval_trcache(svd->vp); 740 } 741 742 amp = a->amp; 743 if ((svd->amp = amp) == NULL) { 744 svd->anon_index = 0; 745 if (svd->type == MAP_SHARED) { 746 svd->swresv = 0; 747 /* 748 * Shared mappings to a vp need no other setup. 749 * If we have a shared mapping to an anon_map object 750 * which hasn't been allocated yet, allocate the 751 * struct now so that it will be properly shared 752 * by remembering the swap reservation there. 753 */ 754 if (a->vp == NULL) { 755 svd->amp = anonmap_alloc(seg->s_size, swresv, 756 ANON_SLEEP); 757 svd->amp->a_szc = seg->s_szc; 758 } 759 } else { 760 /* 761 * Private mapping (with or without a vp). 762 * Allocate anon_map when needed. 763 */ 764 svd->swresv = swresv; 765 } 766 } else { 767 pgcnt_t anon_num; 768 769 /* 770 * Mapping to an existing anon_map structure without a vp. 771 * For now we will insure that the segment size isn't larger 772 * than the size - offset gives us. 
Later on we may wish to 773 * have the anon array dynamically allocated itself so that 774 * we don't always have to allocate all the anon pointer slots. 775 * This of course involves adding extra code to check that we 776 * aren't trying to use an anon pointer slot beyond the end 777 * of the currently allocated anon array. 778 */ 779 if ((amp->size - a->offset) < seg->s_size) { 780 panic("segvn_create anon_map size"); 781 /*NOTREACHED*/ 782 } 783 784 anon_num = btopr(a->offset); 785 786 if (a->type == MAP_SHARED) { 787 /* 788 * SHARED mapping to a given anon_map. 789 */ 790 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 791 amp->refcnt++; 792 if (a->szc > amp->a_szc) { 793 amp->a_szc = a->szc; 794 } 795 ANON_LOCK_EXIT(&->a_rwlock); 796 svd->anon_index = anon_num; 797 svd->swresv = 0; 798 } else { 799 /* 800 * PRIVATE mapping to a given anon_map. 801 * Make sure that all the needed anon 802 * structures are created (so that we will 803 * share the underlying pages if nothing 804 * is written by this mapping) and then 805 * duplicate the anon array as is done 806 * when a privately mapped segment is dup'ed. 807 */ 808 struct anon *ap; 809 caddr_t addr; 810 caddr_t eaddr; 811 ulong_t anon_idx; 812 int hat_flag = HAT_LOAD; 813 814 if (svd->flags & MAP_TEXT) { 815 hat_flag |= HAT_LOAD_TEXT; 816 } 817 818 svd->amp = anonmap_alloc(seg->s_size, 0, ANON_SLEEP); 819 svd->amp->a_szc = seg->s_szc; 820 svd->anon_index = 0; 821 svd->swresv = swresv; 822 823 /* 824 * Prevent 2 threads from allocating anon 825 * slots simultaneously. 826 */ 827 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 828 eaddr = seg->s_base + seg->s_size; 829 830 for (anon_idx = anon_num, addr = seg->s_base; 831 addr < eaddr; addr += PAGESIZE, anon_idx++) { 832 page_t *pp; 833 834 if ((ap = anon_get_ptr(amp->ahp, 835 anon_idx)) != NULL) 836 continue; 837 838 /* 839 * Allocate the anon struct now. 840 * Might as well load up translation 841 * to the page while we're at it... 842 */ 843 pp = anon_zero(seg, addr, &ap, cred); 844 if (ap == NULL || pp == NULL) { 845 panic("segvn_create anon_zero"); 846 /*NOTREACHED*/ 847 } 848 849 /* 850 * Re-acquire the anon_map lock and 851 * initialize the anon array entry. 852 */ 853 ASSERT(anon_get_ptr(amp->ahp, 854 anon_idx) == NULL); 855 (void) anon_set_ptr(amp->ahp, anon_idx, ap, 856 ANON_SLEEP); 857 858 ASSERT(seg->s_szc == 0); 859 ASSERT(!IS_VMODSORT(pp->p_vnode)); 860 861 hat_memload(seg->s_as->a_hat, addr, pp, 862 svd->prot & ~PROT_WRITE, hat_flag); 863 864 page_unlock(pp); 865 } 866 ASSERT(seg->s_szc == 0); 867 anon_dup(amp->ahp, anon_num, svd->amp->ahp, 868 0, seg->s_size); 869 ANON_LOCK_EXIT(&->a_rwlock); 870 } 871 } 872 873 /* 874 * Set default memory allocation policy for segment 875 * 876 * Always set policy for private memory at least for initialization 877 * even if this is a shared memory segment 878 */ 879 (void) lgrp_privm_policy_set(mpolicy, &svd->policy_info, seg->s_size); 880 881 if (svd->type == MAP_SHARED) 882 (void) lgrp_shm_policy_set(mpolicy, svd->amp, svd->anon_index, 883 svd->vp, svd->offset, seg->s_size); 884 885 ASSERT(!trok || !(svd->prot & PROT_WRITE)); 886 svd->tr_state = trok ? SEGVN_TR_INIT : SEGVN_TR_OFF; 887 888 return (0); 889 } 890 891 /* 892 * Concatenate two existing segments, if possible. 893 * Return 0 on success, -1 if two segments are not compatible 894 * or -2 on memory allocation failure. 
895 * If amp_cat == 1 then try and concat segments with anon maps 896 */ 897 static int 898 segvn_concat(struct seg *seg1, struct seg *seg2, int amp_cat) 899 { 900 struct segvn_data *svd1 = seg1->s_data; 901 struct segvn_data *svd2 = seg2->s_data; 902 struct anon_map *amp1 = svd1->amp; 903 struct anon_map *amp2 = svd2->amp; 904 struct vpage *vpage1 = svd1->vpage; 905 struct vpage *vpage2 = svd2->vpage, *nvpage = NULL; 906 size_t size, nvpsize; 907 pgcnt_t npages1, npages2; 908 909 ASSERT(seg1->s_as && seg2->s_as && seg1->s_as == seg2->s_as); 910 ASSERT(AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock)); 911 ASSERT(seg1->s_ops == seg2->s_ops); 912 913 /* both segments exist, try to merge them */ 914 #define incompat(x) (svd1->x != svd2->x) 915 if (incompat(vp) || incompat(maxprot) || 916 (!svd1->pageadvice && !svd2->pageadvice && incompat(advice)) || 917 (!svd1->pageprot && !svd2->pageprot && incompat(prot)) || 918 incompat(type) || incompat(cred) || incompat(flags) || 919 seg1->s_szc != seg2->s_szc || incompat(policy_info.mem_policy) || 920 (svd2->softlockcnt > 0)) 921 return (-1); 922 #undef incompat 923 924 /* 925 * vp == NULL implies zfod, offset doesn't matter 926 */ 927 if (svd1->vp != NULL && 928 svd1->offset + seg1->s_size != svd2->offset) { 929 return (-1); 930 } 931 932 /* 933 * Don't concatenate if either segment uses text replication. 934 */ 935 if (svd1->tr_state != SEGVN_TR_OFF || svd2->tr_state != SEGVN_TR_OFF) { 936 return (-1); 937 } 938 939 /* 940 * Fail early if we're not supposed to concatenate 941 * segments with non NULL amp. 942 */ 943 if (amp_cat == 0 && (amp1 != NULL || amp2 != NULL)) { 944 return (-1); 945 } 946 947 if (svd1->vp == NULL && svd1->type == MAP_SHARED) { 948 if (amp1 != amp2) { 949 return (-1); 950 } 951 if (amp1 != NULL && svd1->anon_index + btop(seg1->s_size) != 952 svd2->anon_index) { 953 return (-1); 954 } 955 ASSERT(amp1 == NULL || amp1->refcnt >= 2); 956 } 957 958 /* 959 * If either seg has vpages, create a new merged vpage array. 960 */ 961 if (vpage1 != NULL || vpage2 != NULL) { 962 struct vpage *vp; 963 964 npages1 = seg_pages(seg1); 965 npages2 = seg_pages(seg2); 966 nvpsize = vpgtob(npages1 + npages2); 967 968 if ((nvpage = kmem_zalloc(nvpsize, KM_NOSLEEP)) == NULL) { 969 return (-2); 970 } 971 if (vpage1 != NULL) { 972 bcopy(vpage1, nvpage, vpgtob(npages1)); 973 } 974 if (vpage2 != NULL) { 975 bcopy(vpage2, nvpage + npages1, vpgtob(npages2)); 976 } 977 for (vp = nvpage; vp < nvpage + npages1; vp++) { 978 if (svd2->pageprot && !svd1->pageprot) { 979 VPP_SETPROT(vp, svd1->prot); 980 } 981 if (svd2->pageadvice && !svd1->pageadvice) { 982 VPP_SETADVICE(vp, svd1->advice); 983 } 984 } 985 for (vp = nvpage + npages1; 986 vp < nvpage + npages1 + npages2; vp++) { 987 if (svd1->pageprot && !svd2->pageprot) { 988 VPP_SETPROT(vp, svd2->prot); 989 } 990 if (svd1->pageadvice && !svd2->pageadvice) { 991 VPP_SETADVICE(vp, svd2->advice); 992 } 993 } 994 } 995 996 /* 997 * If either segment has private pages, create a new merged anon 998 * array. If mergeing shared anon segments just decrement anon map's 999 * refcnt. 
1000 */ 1001 if (amp1 != NULL && svd1->type == MAP_SHARED) { 1002 ASSERT(amp1 == amp2 && svd1->vp == NULL); 1003 ANON_LOCK_ENTER(&1->a_rwlock, RW_WRITER); 1004 ASSERT(amp1->refcnt >= 2); 1005 amp1->refcnt--; 1006 ANON_LOCK_EXIT(&1->a_rwlock); 1007 svd2->amp = NULL; 1008 } else if (amp1 != NULL || amp2 != NULL) { 1009 struct anon_hdr *nahp; 1010 struct anon_map *namp = NULL; 1011 size_t asize; 1012 1013 ASSERT(svd1->type == MAP_PRIVATE); 1014 1015 asize = seg1->s_size + seg2->s_size; 1016 if ((nahp = anon_create(btop(asize), ANON_NOSLEEP)) == NULL) { 1017 if (nvpage != NULL) { 1018 kmem_free(nvpage, nvpsize); 1019 } 1020 return (-2); 1021 } 1022 if (amp1 != NULL) { 1023 /* 1024 * XXX anon rwlock is not really needed because 1025 * this is a private segment and we are writers. 1026 */ 1027 ANON_LOCK_ENTER(&1->a_rwlock, RW_WRITER); 1028 ASSERT(amp1->refcnt == 1); 1029 if (anon_copy_ptr(amp1->ahp, svd1->anon_index, 1030 nahp, 0, btop(seg1->s_size), ANON_NOSLEEP)) { 1031 anon_release(nahp, btop(asize)); 1032 ANON_LOCK_EXIT(&1->a_rwlock); 1033 if (nvpage != NULL) { 1034 kmem_free(nvpage, nvpsize); 1035 } 1036 return (-2); 1037 } 1038 } 1039 if (amp2 != NULL) { 1040 ANON_LOCK_ENTER(&2->a_rwlock, RW_WRITER); 1041 ASSERT(amp2->refcnt == 1); 1042 if (anon_copy_ptr(amp2->ahp, svd2->anon_index, 1043 nahp, btop(seg1->s_size), btop(seg2->s_size), 1044 ANON_NOSLEEP)) { 1045 anon_release(nahp, btop(asize)); 1046 ANON_LOCK_EXIT(&2->a_rwlock); 1047 if (amp1 != NULL) { 1048 ANON_LOCK_EXIT(&1->a_rwlock); 1049 } 1050 if (nvpage != NULL) { 1051 kmem_free(nvpage, nvpsize); 1052 } 1053 return (-2); 1054 } 1055 } 1056 if (amp1 != NULL) { 1057 namp = amp1; 1058 anon_release(amp1->ahp, btop(amp1->size)); 1059 } 1060 if (amp2 != NULL) { 1061 if (namp == NULL) { 1062 ASSERT(amp1 == NULL); 1063 namp = amp2; 1064 anon_release(amp2->ahp, btop(amp2->size)); 1065 } else { 1066 amp2->refcnt--; 1067 ANON_LOCK_EXIT(&2->a_rwlock); 1068 anonmap_free(amp2); 1069 } 1070 svd2->amp = NULL; /* needed for seg_free */ 1071 } 1072 namp->ahp = nahp; 1073 namp->size = asize; 1074 svd1->amp = namp; 1075 svd1->anon_index = 0; 1076 ANON_LOCK_EXIT(&namp->a_rwlock); 1077 } 1078 /* 1079 * Now free the old vpage structures. 1080 */ 1081 if (nvpage != NULL) { 1082 if (vpage1 != NULL) { 1083 kmem_free(vpage1, vpgtob(npages1)); 1084 } 1085 if (vpage2 != NULL) { 1086 svd2->vpage = NULL; 1087 kmem_free(vpage2, vpgtob(npages2)); 1088 } 1089 if (svd2->pageprot) { 1090 svd1->pageprot = 1; 1091 } 1092 if (svd2->pageadvice) { 1093 svd1->pageadvice = 1; 1094 } 1095 svd1->vpage = nvpage; 1096 } 1097 1098 /* all looks ok, merge segments */ 1099 svd1->swresv += svd2->swresv; 1100 svd2->swresv = 0; /* so seg_free doesn't release swap space */ 1101 size = seg2->s_size; 1102 seg_free(seg2); 1103 seg1->s_size += size; 1104 return (0); 1105 } 1106 1107 /* 1108 * Extend the previous segment (seg1) to include the 1109 * new segment (seg2 + a), if possible. 1110 * Return 0 on success. 1111 */ 1112 static int 1113 segvn_extend_prev(seg1, seg2, a, swresv) 1114 struct seg *seg1, *seg2; 1115 struct segvn_crargs *a; 1116 size_t swresv; 1117 { 1118 struct segvn_data *svd1 = (struct segvn_data *)seg1->s_data; 1119 size_t size; 1120 struct anon_map *amp1; 1121 struct vpage *new_vpage; 1122 1123 /* 1124 * We don't need any segment level locks for "segvn" data 1125 * since the address space is "write" locked. 
1126 */ 1127 ASSERT(seg1->s_as && AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock)); 1128 1129 /* second segment is new, try to extend first */ 1130 /* XXX - should also check cred */ 1131 if (svd1->vp != a->vp || svd1->maxprot != a->maxprot || 1132 (!svd1->pageprot && (svd1->prot != a->prot)) || 1133 svd1->type != a->type || svd1->flags != a->flags || 1134 seg1->s_szc != a->szc) 1135 return (-1); 1136 1137 /* vp == NULL implies zfod, offset doesn't matter */ 1138 if (svd1->vp != NULL && 1139 svd1->offset + seg1->s_size != (a->offset & PAGEMASK)) 1140 return (-1); 1141 1142 if (svd1->tr_state != SEGVN_TR_OFF) { 1143 return (-1); 1144 } 1145 1146 amp1 = svd1->amp; 1147 if (amp1) { 1148 pgcnt_t newpgs; 1149 1150 /* 1151 * Segment has private pages, can data structures 1152 * be expanded? 1153 * 1154 * Acquire the anon_map lock to prevent it from changing, 1155 * if it is shared. This ensures that the anon_map 1156 * will not change while a thread which has a read/write 1157 * lock on an address space references it. 1158 * XXX - Don't need the anon_map lock at all if "refcnt" 1159 * is 1. 1160 * 1161 * Can't grow a MAP_SHARED segment with an anonmap because 1162 * there may be existing anon slots where we want to extend 1163 * the segment and we wouldn't know what to do with them 1164 * (e.g., for tmpfs right thing is to just leave them there, 1165 * for /dev/zero they should be cleared out). 1166 */ 1167 if (svd1->type == MAP_SHARED) 1168 return (-1); 1169 1170 ANON_LOCK_ENTER(&1->a_rwlock, RW_WRITER); 1171 if (amp1->refcnt > 1) { 1172 ANON_LOCK_EXIT(&1->a_rwlock); 1173 return (-1); 1174 } 1175 newpgs = anon_grow(amp1->ahp, &svd1->anon_index, 1176 btop(seg1->s_size), btop(seg2->s_size), ANON_NOSLEEP); 1177 1178 if (newpgs == 0) { 1179 ANON_LOCK_EXIT(&1->a_rwlock); 1180 return (-1); 1181 } 1182 amp1->size = ptob(newpgs); 1183 ANON_LOCK_EXIT(&1->a_rwlock); 1184 } 1185 if (svd1->vpage != NULL) { 1186 new_vpage = 1187 kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)), 1188 KM_NOSLEEP); 1189 if (new_vpage == NULL) 1190 return (-1); 1191 bcopy(svd1->vpage, new_vpage, vpgtob(seg_pages(seg1))); 1192 kmem_free(svd1->vpage, vpgtob(seg_pages(seg1))); 1193 svd1->vpage = new_vpage; 1194 if (svd1->pageprot) { 1195 struct vpage *vp, *evp; 1196 1197 vp = new_vpage + seg_pages(seg1); 1198 evp = vp + seg_pages(seg2); 1199 for (; vp < evp; vp++) 1200 VPP_SETPROT(vp, a->prot); 1201 } 1202 } 1203 size = seg2->s_size; 1204 seg_free(seg2); 1205 seg1->s_size += size; 1206 svd1->swresv += swresv; 1207 if (svd1->pageprot && (a->prot & PROT_WRITE) && 1208 svd1->type == MAP_SHARED && svd1->vp != NULL && 1209 (svd1->vp->v_flag & VVMEXEC)) { 1210 ASSERT(vn_is_mapped(svd1->vp, V_WRITE)); 1211 segvn_inval_trcache(svd1->vp); 1212 } 1213 return (0); 1214 } 1215 1216 /* 1217 * Extend the next segment (seg2) to include the 1218 * new segment (seg1 + a), if possible. 1219 * Return 0 on success. 1220 */ 1221 static int 1222 segvn_extend_next( 1223 struct seg *seg1, 1224 struct seg *seg2, 1225 struct segvn_crargs *a, 1226 size_t swresv) 1227 { 1228 struct segvn_data *svd2 = (struct segvn_data *)seg2->s_data; 1229 size_t size; 1230 struct anon_map *amp2; 1231 struct vpage *new_vpage; 1232 1233 /* 1234 * We don't need any segment level locks for "segvn" data 1235 * since the address space is "write" locked. 
1236 */ 1237 ASSERT(seg2->s_as && AS_WRITE_HELD(seg2->s_as, &seg2->s_as->a_lock)); 1238 1239 /* first segment is new, try to extend second */ 1240 /* XXX - should also check cred */ 1241 if (svd2->vp != a->vp || svd2->maxprot != a->maxprot || 1242 (!svd2->pageprot && (svd2->prot != a->prot)) || 1243 svd2->type != a->type || svd2->flags != a->flags || 1244 seg2->s_szc != a->szc) 1245 return (-1); 1246 /* vp == NULL implies zfod, offset doesn't matter */ 1247 if (svd2->vp != NULL && 1248 (a->offset & PAGEMASK) + seg1->s_size != svd2->offset) 1249 return (-1); 1250 1251 if (svd2->tr_state != SEGVN_TR_OFF) { 1252 return (-1); 1253 } 1254 1255 amp2 = svd2->amp; 1256 if (amp2) { 1257 pgcnt_t newpgs; 1258 1259 /* 1260 * Segment has private pages, can data structures 1261 * be expanded? 1262 * 1263 * Acquire the anon_map lock to prevent it from changing, 1264 * if it is shared. This ensures that the anon_map 1265 * will not change while a thread which has a read/write 1266 * lock on an address space references it. 1267 * 1268 * XXX - Don't need the anon_map lock at all if "refcnt" 1269 * is 1. 1270 */ 1271 if (svd2->type == MAP_SHARED) 1272 return (-1); 1273 1274 ANON_LOCK_ENTER(&2->a_rwlock, RW_WRITER); 1275 if (amp2->refcnt > 1) { 1276 ANON_LOCK_EXIT(&2->a_rwlock); 1277 return (-1); 1278 } 1279 newpgs = anon_grow(amp2->ahp, &svd2->anon_index, 1280 btop(seg2->s_size), btop(seg1->s_size), 1281 ANON_NOSLEEP | ANON_GROWDOWN); 1282 1283 if (newpgs == 0) { 1284 ANON_LOCK_EXIT(&2->a_rwlock); 1285 return (-1); 1286 } 1287 amp2->size = ptob(newpgs); 1288 ANON_LOCK_EXIT(&2->a_rwlock); 1289 } 1290 if (svd2->vpage != NULL) { 1291 new_vpage = 1292 kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)), 1293 KM_NOSLEEP); 1294 if (new_vpage == NULL) { 1295 /* Not merging segments so adjust anon_index back */ 1296 if (amp2) 1297 svd2->anon_index += seg_pages(seg1); 1298 return (-1); 1299 } 1300 bcopy(svd2->vpage, new_vpage + seg_pages(seg1), 1301 vpgtob(seg_pages(seg2))); 1302 kmem_free(svd2->vpage, vpgtob(seg_pages(seg2))); 1303 svd2->vpage = new_vpage; 1304 if (svd2->pageprot) { 1305 struct vpage *vp, *evp; 1306 1307 vp = new_vpage; 1308 evp = vp + seg_pages(seg1); 1309 for (; vp < evp; vp++) 1310 VPP_SETPROT(vp, a->prot); 1311 } 1312 } 1313 size = seg1->s_size; 1314 seg_free(seg1); 1315 seg2->s_size += size; 1316 seg2->s_base -= size; 1317 svd2->offset -= size; 1318 svd2->swresv += swresv; 1319 if (svd2->pageprot && (a->prot & PROT_WRITE) && 1320 svd2->type == MAP_SHARED && svd2->vp != NULL && 1321 (svd2->vp->v_flag & VVMEXEC)) { 1322 ASSERT(vn_is_mapped(svd2->vp, V_WRITE)); 1323 segvn_inval_trcache(svd2->vp); 1324 } 1325 return (0); 1326 } 1327 1328 static int 1329 segvn_dup(struct seg *seg, struct seg *newseg) 1330 { 1331 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1332 struct segvn_data *newsvd; 1333 pgcnt_t npages = seg_pages(seg); 1334 int error = 0; 1335 uint_t prot; 1336 size_t len; 1337 struct anon_map *amp; 1338 1339 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 1340 1341 /* 1342 * If segment has anon reserved, reserve more for the new seg. 1343 * For a MAP_NORESERVE segment swresv will be a count of all the 1344 * allocated anon slots; thus we reserve for the child as many slots 1345 * as the parent has allocated. This semantic prevents the child or 1346 * parent from dieing during a copy-on-write fault caused by trying 1347 * to write a shared pre-existing anon page. 
1348 */ 1349 if ((len = svd->swresv) != 0) { 1350 if (anon_resv(svd->swresv) == 0) 1351 return (ENOMEM); 1352 1353 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 1354 seg, len, 0); 1355 } 1356 1357 newsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 1358 1359 newseg->s_ops = &segvn_ops; 1360 newseg->s_data = (void *)newsvd; 1361 newseg->s_szc = seg->s_szc; 1362 1363 newsvd->seg = newseg; 1364 if ((newsvd->vp = svd->vp) != NULL) { 1365 VN_HOLD(svd->vp); 1366 if (svd->type == MAP_SHARED) 1367 lgrp_shm_policy_init(NULL, svd->vp); 1368 } 1369 newsvd->offset = svd->offset; 1370 newsvd->prot = svd->prot; 1371 newsvd->maxprot = svd->maxprot; 1372 newsvd->pageprot = svd->pageprot; 1373 newsvd->type = svd->type; 1374 newsvd->cred = svd->cred; 1375 crhold(newsvd->cred); 1376 newsvd->advice = svd->advice; 1377 newsvd->pageadvice = svd->pageadvice; 1378 newsvd->swresv = svd->swresv; 1379 newsvd->flags = svd->flags; 1380 newsvd->softlockcnt = 0; 1381 newsvd->policy_info = svd->policy_info; 1382 if ((amp = svd->amp) == NULL || svd->tr_state == SEGVN_TR_ON) { 1383 /* 1384 * Not attaching to a shared anon object. 1385 */ 1386 if (svd->tr_state == SEGVN_TR_ON) { 1387 ASSERT(newsvd->vp != NULL && amp != NULL); 1388 newsvd->tr_state = SEGVN_TR_INIT; 1389 } else { 1390 newsvd->tr_state = svd->tr_state; 1391 } 1392 newsvd->amp = NULL; 1393 newsvd->anon_index = 0; 1394 } else { 1395 ASSERT(svd->tr_state == SEGVN_TR_OFF); 1396 newsvd->tr_state = SEGVN_TR_OFF; 1397 if (svd->type == MAP_SHARED) { 1398 newsvd->amp = amp; 1399 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1400 amp->refcnt++; 1401 ANON_LOCK_EXIT(&->a_rwlock); 1402 newsvd->anon_index = svd->anon_index; 1403 } else { 1404 int reclaim = 1; 1405 1406 /* 1407 * Allocate and initialize new anon_map structure. 1408 */ 1409 newsvd->amp = anonmap_alloc(newseg->s_size, 0, 1410 ANON_SLEEP); 1411 newsvd->amp->a_szc = newseg->s_szc; 1412 newsvd->anon_index = 0; 1413 1414 /* 1415 * We don't have to acquire the anon_map lock 1416 * for the new segment (since it belongs to an 1417 * address space that is still not associated 1418 * with any process), or the segment in the old 1419 * address space (since all threads in it 1420 * are stopped while duplicating the address space). 1421 */ 1422 1423 /* 1424 * The goal of the following code is to make sure that 1425 * softlocked pages do not end up as copy on write 1426 * pages. This would cause problems where one 1427 * thread writes to a page that is COW and a different 1428 * thread in the same process has softlocked it. The 1429 * softlock lock would move away from this process 1430 * because the write would cause this process to get 1431 * a copy (without the softlock). 1432 * 1433 * The strategy here is to just break the 1434 * sharing on pages that could possibly be 1435 * softlocked. 1436 */ 1437 retry: 1438 if (svd->softlockcnt) { 1439 struct anon *ap, *newap; 1440 size_t i; 1441 uint_t vpprot; 1442 page_t *anon_pl[1+1], *pp; 1443 caddr_t addr; 1444 ulong_t old_idx = svd->anon_index; 1445 ulong_t new_idx = 0; 1446 1447 /* 1448 * The softlock count might be non zero 1449 * because some pages are still stuck in the 1450 * cache for lazy reclaim. Flush the cache 1451 * now. This should drop the count to zero. 1452 * [or there is really I/O going on to these 1453 * pages]. Note, we have the writers lock so 1454 * nothing gets inserted during the flush. 
1455 */ 1456 if (reclaim == 1) { 1457 segvn_purge(seg); 1458 reclaim = 0; 1459 goto retry; 1460 } 1461 i = btopr(seg->s_size); 1462 addr = seg->s_base; 1463 /* 1464 * XXX break cow sharing using PAGESIZE 1465 * pages. They will be relocated into larger 1466 * pages at fault time. 1467 */ 1468 while (i-- > 0) { 1469 if (ap = anon_get_ptr(amp->ahp, 1470 old_idx)) { 1471 error = anon_getpage(&ap, 1472 &vpprot, anon_pl, PAGESIZE, 1473 seg, addr, S_READ, 1474 svd->cred); 1475 if (error) { 1476 newsvd->vpage = NULL; 1477 goto out; 1478 } 1479 /* 1480 * prot need not be computed 1481 * below 'cause anon_private is 1482 * going to ignore it anyway 1483 * as child doesn't inherit 1484 * pagelock from parent. 1485 */ 1486 prot = svd->pageprot ? 1487 VPP_PROT( 1488 &svd->vpage[ 1489 seg_page(seg, addr)]) 1490 : svd->prot; 1491 pp = anon_private(&newap, 1492 newseg, addr, prot, 1493 anon_pl[0], 0, 1494 newsvd->cred); 1495 if (pp == NULL) { 1496 /* no mem abort */ 1497 newsvd->vpage = NULL; 1498 error = ENOMEM; 1499 goto out; 1500 } 1501 (void) anon_set_ptr( 1502 newsvd->amp->ahp, new_idx, 1503 newap, ANON_SLEEP); 1504 page_unlock(pp); 1505 } 1506 addr += PAGESIZE; 1507 old_idx++; 1508 new_idx++; 1509 } 1510 } else { /* common case */ 1511 if (seg->s_szc != 0) { 1512 /* 1513 * If at least one of anon slots of a 1514 * large page exists then make sure 1515 * all anon slots of a large page 1516 * exist to avoid partial cow sharing 1517 * of a large page in the future. 1518 */ 1519 anon_dup_fill_holes(amp->ahp, 1520 svd->anon_index, newsvd->amp->ahp, 1521 0, seg->s_size, seg->s_szc, 1522 svd->vp != NULL); 1523 } else { 1524 anon_dup(amp->ahp, svd->anon_index, 1525 newsvd->amp->ahp, 0, seg->s_size); 1526 } 1527 1528 hat_clrattr(seg->s_as->a_hat, seg->s_base, 1529 seg->s_size, PROT_WRITE); 1530 } 1531 } 1532 } 1533 /* 1534 * If necessary, create a vpage structure for the new segment. 1535 * Do not copy any page lock indications. 
1536 */ 1537 if (svd->vpage != NULL) { 1538 uint_t i; 1539 struct vpage *ovp = svd->vpage; 1540 struct vpage *nvp; 1541 1542 nvp = newsvd->vpage = 1543 kmem_alloc(vpgtob(npages), KM_SLEEP); 1544 for (i = 0; i < npages; i++) { 1545 *nvp = *ovp++; 1546 VPP_CLRPPLOCK(nvp++); 1547 } 1548 } else 1549 newsvd->vpage = NULL; 1550 1551 /* Inform the vnode of the new mapping */ 1552 if (newsvd->vp != NULL) { 1553 error = VOP_ADDMAP(newsvd->vp, (offset_t)newsvd->offset, 1554 newseg->s_as, newseg->s_base, newseg->s_size, newsvd->prot, 1555 newsvd->maxprot, newsvd->type, newsvd->cred); 1556 } 1557 out: 1558 return (error); 1559 } 1560 1561 1562 /* 1563 * callback function used by segvn_unmap to invoke free_vp_pages() for only 1564 * those pages actually processed by the HAT 1565 */ 1566 extern int free_pages; 1567 1568 static void 1569 segvn_hat_unload_callback(hat_callback_t *cb) 1570 { 1571 struct seg *seg = cb->hcb_data; 1572 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1573 size_t len; 1574 u_offset_t off; 1575 1576 ASSERT(svd->vp != NULL); 1577 ASSERT(cb->hcb_end_addr > cb->hcb_start_addr); 1578 ASSERT(cb->hcb_start_addr >= seg->s_base); 1579 1580 len = cb->hcb_end_addr - cb->hcb_start_addr; 1581 off = cb->hcb_start_addr - seg->s_base; 1582 free_vp_pages(svd->vp, svd->offset + off, len); 1583 } 1584 1585 1586 static int 1587 segvn_unmap(struct seg *seg, caddr_t addr, size_t len) 1588 { 1589 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1590 struct segvn_data *nsvd; 1591 struct seg *nseg; 1592 struct anon_map *amp; 1593 pgcnt_t opages; /* old segment size in pages */ 1594 pgcnt_t npages; /* new segment size in pages */ 1595 pgcnt_t dpages; /* pages being deleted (unmapped) */ 1596 hat_callback_t callback; /* used for free_vp_pages() */ 1597 hat_callback_t *cbp = NULL; 1598 caddr_t nbase; 1599 size_t nsize; 1600 size_t oswresv; 1601 int reclaim = 1; 1602 int unmap = 1; 1603 1604 /* 1605 * We don't need any segment level locks for "segvn" data 1606 * since the address space is "write" locked. 1607 */ 1608 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 1609 1610 /* 1611 * Fail the unmap if pages are SOFTLOCKed through this mapping. 1612 * softlockcnt is protected from change by the as write lock. 1613 */ 1614 retry: 1615 if (svd->softlockcnt > 0) { 1616 ASSERT(svd->tr_state == SEGVN_TR_OFF); 1617 /* 1618 * since we do have the writers lock nobody can fill 1619 * the cache during the purge. The flush either succeeds 1620 * or we still have pending I/Os. 
1621 */ 1622 if (reclaim == 1) { 1623 segvn_purge(seg); 1624 reclaim = 0; 1625 goto retry; 1626 } 1627 return (EAGAIN); 1628 } 1629 1630 /* 1631 * Check for bad sizes 1632 */ 1633 if (addr < seg->s_base || addr + len > seg->s_base + seg->s_size || 1634 (len & PAGEOFFSET) || ((uintptr_t)addr & PAGEOFFSET)) { 1635 panic("segvn_unmap"); 1636 /*NOTREACHED*/ 1637 } 1638 1639 if (seg->s_szc != 0) { 1640 size_t pgsz = page_get_pagesize(seg->s_szc); 1641 int err; 1642 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { 1643 ASSERT(seg->s_base != addr || seg->s_size != len); 1644 if (svd->tr_state == SEGVN_TR_INIT) { 1645 svd->tr_state = SEGVN_TR_OFF; 1646 } else if (svd->tr_state == SEGVN_TR_ON) { 1647 ASSERT(svd->amp != NULL); 1648 segvn_textunrepl(seg, 1); 1649 ASSERT(svd->amp == NULL); 1650 ASSERT(svd->tr_state == SEGVN_TR_OFF); 1651 } 1652 VM_STAT_ADD(segvnvmstats.demoterange[0]); 1653 err = segvn_demote_range(seg, addr, len, SDR_END, 0); 1654 if (err == 0) { 1655 return (IE_RETRY); 1656 } 1657 return (err); 1658 } 1659 } 1660 1661 /* Inform the vnode of the unmapping. */ 1662 if (svd->vp) { 1663 int error; 1664 1665 error = VOP_DELMAP(svd->vp, 1666 (offset_t)svd->offset + (uintptr_t)(addr - seg->s_base), 1667 seg->s_as, addr, len, svd->prot, svd->maxprot, 1668 svd->type, svd->cred); 1669 1670 if (error == EAGAIN) 1671 return (error); 1672 } 1673 1674 if (svd->tr_state == SEGVN_TR_INIT) { 1675 svd->tr_state = SEGVN_TR_OFF; 1676 } else if (svd->tr_state == SEGVN_TR_ON) { 1677 ASSERT(svd->amp != NULL); 1678 ASSERT(svd->pageprot == 0 && !(svd->prot & PROT_WRITE)); 1679 segvn_textunrepl(seg, 1); 1680 ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF); 1681 unmap = 0; 1682 } 1683 1684 /* 1685 * Remove any page locks set through this mapping. 1686 */ 1687 (void) segvn_lockop(seg, addr, len, 0, MC_UNLOCK, NULL, 0); 1688 1689 if (unmap) { 1690 /* 1691 * Unload any hardware translations in the range to be taken 1692 * out. Use a callback to invoke free_vp_pages() effectively. 1693 */ 1694 if (svd->vp != NULL && free_pages != 0) { 1695 callback.hcb_data = seg; 1696 callback.hcb_function = segvn_hat_unload_callback; 1697 cbp = &callback; 1698 } 1699 hat_unload_callback(seg->s_as->a_hat, addr, len, 1700 HAT_UNLOAD_UNMAP, cbp); 1701 1702 if (svd->type == MAP_SHARED && svd->vp != NULL && 1703 (svd->vp->v_flag & VVMEXEC) && 1704 ((svd->prot & PROT_WRITE) || svd->pageprot)) { 1705 segvn_inval_trcache(svd->vp); 1706 } 1707 } 1708 1709 /* 1710 * Check for entire segment 1711 */ 1712 if (addr == seg->s_base && len == seg->s_size) { 1713 seg_free(seg); 1714 return (0); 1715 } 1716 1717 opages = seg_pages(seg); 1718 dpages = btop(len); 1719 npages = opages - dpages; 1720 amp = svd->amp; 1721 ASSERT(amp == NULL || amp->a_szc >= seg->s_szc); 1722 1723 /* 1724 * Check for beginning of segment 1725 */ 1726 if (addr == seg->s_base) { 1727 if (svd->vpage != NULL) { 1728 size_t nbytes; 1729 struct vpage *ovpage; 1730 1731 ovpage = svd->vpage; /* keep pointer to vpage */ 1732 1733 nbytes = vpgtob(npages); 1734 svd->vpage = kmem_alloc(nbytes, KM_SLEEP); 1735 bcopy(&ovpage[dpages], svd->vpage, nbytes); 1736 1737 /* free up old vpage */ 1738 kmem_free(ovpage, vpgtob(opages)); 1739 } 1740 if (amp != NULL) { 1741 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1742 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { 1743 /* 1744 * Free up now unused parts of anon_map array. 
1745 */ 1746 if (amp->a_szc == seg->s_szc) { 1747 if (seg->s_szc != 0) { 1748 anon_free_pages(amp->ahp, 1749 svd->anon_index, len, 1750 seg->s_szc); 1751 } else { 1752 anon_free(amp->ahp, 1753 svd->anon_index, 1754 len); 1755 } 1756 } else { 1757 ASSERT(svd->type == MAP_SHARED); 1758 ASSERT(amp->a_szc > seg->s_szc); 1759 anon_shmap_free_pages(amp, 1760 svd->anon_index, len); 1761 } 1762 1763 /* 1764 * Unreserve swap space for the 1765 * unmapped chunk of this segment in 1766 * case it's MAP_SHARED 1767 */ 1768 if (svd->type == MAP_SHARED) { 1769 anon_unresv(len); 1770 amp->swresv -= len; 1771 } 1772 } 1773 ANON_LOCK_EXIT(&->a_rwlock); 1774 svd->anon_index += dpages; 1775 } 1776 if (svd->vp != NULL) 1777 svd->offset += len; 1778 1779 if (svd->swresv) { 1780 if (svd->flags & MAP_NORESERVE) { 1781 ASSERT(amp); 1782 oswresv = svd->swresv; 1783 1784 svd->swresv = ptob(anon_pages(amp->ahp, 1785 svd->anon_index, npages)); 1786 anon_unresv(oswresv - svd->swresv); 1787 } else { 1788 anon_unresv(len); 1789 svd->swresv -= len; 1790 } 1791 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 1792 seg, len, 0); 1793 } 1794 1795 seg->s_base += len; 1796 seg->s_size -= len; 1797 return (0); 1798 } 1799 1800 /* 1801 * Check for end of segment 1802 */ 1803 if (addr + len == seg->s_base + seg->s_size) { 1804 if (svd->vpage != NULL) { 1805 size_t nbytes; 1806 struct vpage *ovpage; 1807 1808 ovpage = svd->vpage; /* keep pointer to vpage */ 1809 1810 nbytes = vpgtob(npages); 1811 svd->vpage = kmem_alloc(nbytes, KM_SLEEP); 1812 bcopy(ovpage, svd->vpage, nbytes); 1813 1814 /* free up old vpage */ 1815 kmem_free(ovpage, vpgtob(opages)); 1816 1817 } 1818 if (amp != NULL) { 1819 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1820 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { 1821 /* 1822 * Free up now unused parts of anon_map array. 1823 */ 1824 ulong_t an_idx = svd->anon_index + npages; 1825 if (amp->a_szc == seg->s_szc) { 1826 if (seg->s_szc != 0) { 1827 anon_free_pages(amp->ahp, 1828 an_idx, len, 1829 seg->s_szc); 1830 } else { 1831 anon_free(amp->ahp, an_idx, 1832 len); 1833 } 1834 } else { 1835 ASSERT(svd->type == MAP_SHARED); 1836 ASSERT(amp->a_szc > seg->s_szc); 1837 anon_shmap_free_pages(amp, 1838 an_idx, len); 1839 } 1840 1841 /* 1842 * Unreserve swap space for the 1843 * unmapped chunk of this segment in 1844 * case it's MAP_SHARED 1845 */ 1846 if (svd->type == MAP_SHARED) { 1847 anon_unresv(len); 1848 amp->swresv -= len; 1849 } 1850 } 1851 ANON_LOCK_EXIT(&->a_rwlock); 1852 } 1853 1854 if (svd->swresv) { 1855 if (svd->flags & MAP_NORESERVE) { 1856 ASSERT(amp); 1857 oswresv = svd->swresv; 1858 svd->swresv = ptob(anon_pages(amp->ahp, 1859 svd->anon_index, npages)); 1860 anon_unresv(oswresv - svd->swresv); 1861 } else { 1862 anon_unresv(len); 1863 svd->swresv -= len; 1864 } 1865 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 1866 "anon proc:%p %lu %u", seg, len, 0); 1867 } 1868 1869 seg->s_size -= len; 1870 return (0); 1871 } 1872 1873 /* 1874 * The section to go is in the middle of the segment, 1875 * have to make it into two segments. nseg is made for 1876 * the high end while seg is cut down at the low end. 
1877 */
1878 nbase = addr + len; /* new seg base */
1879 nsize = (seg->s_base + seg->s_size) - nbase; /* new seg size */
1880 seg->s_size = addr - seg->s_base; /* shrink old seg */
1881 nseg = seg_alloc(seg->s_as, nbase, nsize);
1882 if (nseg == NULL) {
1883 panic("segvn_unmap seg_alloc");
1884 /*NOTREACHED*/
1885 }
1886 nseg->s_ops = seg->s_ops;
1887 nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP);
1888 nseg->s_data = (void *)nsvd;
1889 nseg->s_szc = seg->s_szc;
1890 *nsvd = *svd;
1891 nsvd->seg = nseg;
1892 nsvd->offset = svd->offset + (uintptr_t)(nseg->s_base - seg->s_base);
1893 nsvd->swresv = 0;
1894 nsvd->softlockcnt = 0;
1895
1896 if (svd->vp != NULL) {
1897 VN_HOLD(nsvd->vp);
1898 if (nsvd->type == MAP_SHARED)
1899 lgrp_shm_policy_init(NULL, nsvd->vp);
1900 }
1901 crhold(svd->cred);
1902
1903 if (svd->vpage == NULL) {
1904 nsvd->vpage = NULL;
1905 } else {
1906 /* need to split vpage into two arrays */
1907 size_t nbytes;
1908 struct vpage *ovpage;
1909
1910 ovpage = svd->vpage; /* keep pointer to vpage */
1911
1912 npages = seg_pages(seg); /* seg has shrunk */
1913 nbytes = vpgtob(npages);
1914 svd->vpage = kmem_alloc(nbytes, KM_SLEEP);
1915
1916 bcopy(ovpage, svd->vpage, nbytes);
1917
1918 npages = seg_pages(nseg);
1919 nbytes = vpgtob(npages);
1920 nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP);
1921
1922 bcopy(&ovpage[opages - npages], nsvd->vpage, nbytes);
1923
1924 /* free up old vpage */
1925 kmem_free(ovpage, vpgtob(opages));
1926 }
1927
1928 if (amp == NULL) {
1929 nsvd->amp = NULL;
1930 nsvd->anon_index = 0;
1931 } else {
1932 /*
1933 * Need to create a new anon map for the new segment.
1934 * We'll also allocate a new smaller array for the old
1935 * smaller segment to save space.
1936 */
1937 opages = btop((uintptr_t)(addr - seg->s_base));
1938 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
1939 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) {
1940 /*
1941 * Free up now unused parts of anon_map array.
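 * Here the freed range is the hole in the middle: len bytes worth
 * of slots starting at svd->anon_index + opages, where opages was
 * just recomputed as the page count of the shrunken low segment.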
1942 */ 1943 ulong_t an_idx = svd->anon_index + opages; 1944 if (amp->a_szc == seg->s_szc) { 1945 if (seg->s_szc != 0) { 1946 anon_free_pages(amp->ahp, an_idx, len, 1947 seg->s_szc); 1948 } else { 1949 anon_free(amp->ahp, an_idx, 1950 len); 1951 } 1952 } else { 1953 ASSERT(svd->type == MAP_SHARED); 1954 ASSERT(amp->a_szc > seg->s_szc); 1955 anon_shmap_free_pages(amp, an_idx, len); 1956 } 1957 1958 /* 1959 * Unreserve swap space for the 1960 * unmapped chunk of this segment in 1961 * case it's MAP_SHARED 1962 */ 1963 if (svd->type == MAP_SHARED) { 1964 anon_unresv(len); 1965 amp->swresv -= len; 1966 } 1967 } 1968 nsvd->anon_index = svd->anon_index + 1969 btop((uintptr_t)(nseg->s_base - seg->s_base)); 1970 if (svd->type == MAP_SHARED) { 1971 amp->refcnt++; 1972 nsvd->amp = amp; 1973 } else { 1974 struct anon_map *namp; 1975 struct anon_hdr *nahp; 1976 1977 ASSERT(svd->type == MAP_PRIVATE); 1978 nahp = anon_create(btop(seg->s_size), ANON_SLEEP); 1979 namp = anonmap_alloc(nseg->s_size, 0, ANON_SLEEP); 1980 namp->a_szc = seg->s_szc; 1981 (void) anon_copy_ptr(amp->ahp, svd->anon_index, nahp, 1982 0, btop(seg->s_size), ANON_SLEEP); 1983 (void) anon_copy_ptr(amp->ahp, nsvd->anon_index, 1984 namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP); 1985 anon_release(amp->ahp, btop(amp->size)); 1986 svd->anon_index = 0; 1987 nsvd->anon_index = 0; 1988 amp->ahp = nahp; 1989 amp->size = seg->s_size; 1990 nsvd->amp = namp; 1991 } 1992 ANON_LOCK_EXIT(&->a_rwlock); 1993 } 1994 if (svd->swresv) { 1995 if (svd->flags & MAP_NORESERVE) { 1996 ASSERT(amp); 1997 oswresv = svd->swresv; 1998 svd->swresv = ptob(anon_pages(amp->ahp, 1999 svd->anon_index, btop(seg->s_size))); 2000 nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp, 2001 nsvd->anon_index, btop(nseg->s_size))); 2002 ASSERT(oswresv >= (svd->swresv + nsvd->swresv)); 2003 anon_unresv(oswresv - (svd->swresv + nsvd->swresv)); 2004 } else { 2005 if (seg->s_size + nseg->s_size + len != svd->swresv) { 2006 panic("segvn_unmap: " 2007 "cannot split swap reservation"); 2008 /*NOTREACHED*/ 2009 } 2010 anon_unresv(len); 2011 svd->swresv = seg->s_size; 2012 nsvd->swresv = nseg->s_size; 2013 } 2014 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 2015 seg, len, 0); 2016 } 2017 2018 return (0); /* I'm glad that's all over with! */ 2019 } 2020 2021 static void 2022 segvn_free(struct seg *seg) 2023 { 2024 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 2025 pgcnt_t npages = seg_pages(seg); 2026 struct anon_map *amp; 2027 size_t len; 2028 2029 /* 2030 * We don't need any segment level locks for "segvn" data 2031 * since the address space is "write" locked. 2032 */ 2033 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 2034 ASSERT(svd->tr_state == SEGVN_TR_OFF); 2035 2036 /* 2037 * Be sure to unlock pages. XXX Why do things get free'ed instead 2038 * of unmapped? XXX 2039 */ 2040 (void) segvn_lockop(seg, seg->s_base, seg->s_size, 2041 0, MC_UNLOCK, NULL, 0); 2042 2043 /* 2044 * Deallocate the vpage and anon pointers if necessary and possible. 2045 */ 2046 if (svd->vpage != NULL) { 2047 kmem_free(svd->vpage, vpgtob(npages)); 2048 svd->vpage = NULL; 2049 } 2050 if ((amp = svd->amp) != NULL) { 2051 /* 2052 * If there are no more references to this anon_map 2053 * structure, then deallocate the structure after freeing 2054 * up all the anon slot pointers that we can. 
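 * The reference count is dropped under the anon_map's rwlock; only
 * the holder of the last reference tears the whole map down. A
 * private mapping that is not the last reference still releases
 * the slot range it was using.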
2055 */
2056 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
2057 ASSERT(amp->a_szc >= seg->s_szc);
2058 if (--amp->refcnt == 0) {
2059 if (svd->type == MAP_PRIVATE) {
2060 /*
2061 * Private - we only need to anon_free
2062 * the part that this segment refers to.
2063 */
2064 if (seg->s_szc != 0) {
2065 anon_free_pages(amp->ahp,
2066 svd->anon_index, seg->s_size,
2067 seg->s_szc);
2068 } else {
2069 anon_free(amp->ahp, svd->anon_index,
2070 seg->s_size);
2071 }
2072 } else {
2073 /*
2074 * Shared - anon_free the entire
2075 * anon_map's worth of stuff and
2076 * release any swap reservation.
2077 */
2078 if (amp->a_szc != 0) {
2079 anon_shmap_free_pages(amp, 0,
2080 amp->size);
2081 } else {
2082 anon_free(amp->ahp, 0, amp->size);
2083 }
2084 if ((len = amp->swresv) != 0) {
2085 anon_unresv(len);
2086 TRACE_3(TR_FAC_VM, TR_ANON_PROC,
2087 "anon proc:%p %lu %u",
2088 seg, len, 0);
2089 }
2090 }
2091 svd->amp = NULL;
2092 ANON_LOCK_EXIT(&amp->a_rwlock);
2093 anonmap_free(amp);
2094 } else if (svd->type == MAP_PRIVATE) {
2095 /*
2096 * We had a private mapping which still has
2097 * a held anon_map so just free up all the
2098 * anon slot pointers that we were using.
2099 */
2100 if (seg->s_szc != 0) {
2101 anon_free_pages(amp->ahp, svd->anon_index,
2102 seg->s_size, seg->s_szc);
2103 } else {
2104 anon_free(amp->ahp, svd->anon_index,
2105 seg->s_size);
2106 }
2107 ANON_LOCK_EXIT(&amp->a_rwlock);
2108 } else {
2109 ANON_LOCK_EXIT(&amp->a_rwlock);
2110 }
2111 }
2112
2113 /*
2114 * Release swap reservation.
2115 */
2116 if ((len = svd->swresv) != 0) {
2117 anon_unresv(svd->swresv);
2118 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
2119 seg, len, 0);
2120 svd->swresv = 0;
2121 }
2122 /*
2123 * Release claim on vnode, credentials, and finally free the
2124 * private data.
2125 */
2126 if (svd->vp != NULL) {
2127 if (svd->type == MAP_SHARED)
2128 lgrp_shm_policy_fini(NULL, svd->vp);
2129 VN_RELE(svd->vp);
2130 svd->vp = NULL;
2131 }
2132 crfree(svd->cred);
2133 svd->cred = NULL;
2134
2135 seg->s_data = NULL;
2136 kmem_cache_free(segvn_cache, svd);
2137 }
2138
2139 ulong_t segvn_lpglck_limit = 0;
2140 /*
2141 * Support routines used by segvn_pagelock() and softlock faults for anonymous
2142 * pages to implement availrmem accounting in a way that makes sure the
2143 * same memory is accounted just once for all softlock/pagelock purposes.
2144 * This prevents a bug when availrmem is quickly incorrectly exhausted from
2145 * several pagelocks to different parts of the same large page since each
2146 * pagelock has to decrement availrmem by the size of the entire large
2147 * page. Note those pages are not COW shared until softunlock/pageunlock so
2148 * we don't need to use cow style accounting here. We also need to make sure
2149 * the entire large page is accounted even if softlock range is less than the
2150 * entire large page because large anon pages can't be demoted when any of
2151 * constituent pages is locked. The caller calls this routine for every page_t
2152 * it locks. The very first page in the range may not be the root page of a
2153 * large page. For all other pages it's guaranteed we are going to visit the
2154 * root of a particular large page before any other constituent page as we are
2155 * locking sequential pages belonging to the same anon map. So we do all the
2156 * locking when the root is encountered except for the very first page.
Since 2157 * softlocking is not supported (except S_READ_NOCOW special case) for vmpss 2158 * segments and since vnode pages can be demoted without locking all 2159 * constituent pages vnode pages don't come here. Unlocking relies on the 2160 * fact that pagesize can't change whenever any of constituent large pages is 2161 * locked at least SE_SHARED. This allows unlocking code to find the right 2162 * root and decrement availrmem by the same amount it was incremented when the 2163 * page was locked. 2164 */ 2165 static int 2166 segvn_pp_lock_anonpages(page_t *pp, int first) 2167 { 2168 pgcnt_t pages; 2169 pfn_t pfn; 2170 uchar_t szc = pp->p_szc; 2171 2172 ASSERT(PAGE_LOCKED(pp)); 2173 ASSERT(pp->p_vnode != NULL); 2174 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 2175 2176 /* 2177 * pagesize won't change as long as any constituent page is locked. 2178 */ 2179 pages = page_get_pagecnt(pp->p_szc); 2180 pfn = page_pptonum(pp); 2181 2182 if (!first) { 2183 if (!IS_P2ALIGNED(pfn, pages)) { 2184 #ifdef DEBUG 2185 pp = &pp[-(spgcnt_t)(pfn & (pages - 1))]; 2186 pfn = page_pptonum(pp); 2187 ASSERT(IS_P2ALIGNED(pfn, pages)); 2188 ASSERT(pp->p_szc == szc); 2189 ASSERT(pp->p_vnode != NULL); 2190 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 2191 ASSERT(pp->p_slckcnt != 0); 2192 #endif /* DEBUG */ 2193 return (1); 2194 } 2195 } else if (!IS_P2ALIGNED(pfn, pages)) { 2196 pp = &pp[-(spgcnt_t)(pfn & (pages - 1))]; 2197 #ifdef DEBUG 2198 pfn = page_pptonum(pp); 2199 ASSERT(IS_P2ALIGNED(pfn, pages)); 2200 ASSERT(pp->p_szc == szc); 2201 ASSERT(pp->p_vnode != NULL); 2202 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 2203 #endif /* DEBUG */ 2204 } 2205 2206 /* 2207 * pp is a root page. 2208 * We haven't locked this large page yet. 2209 */ 2210 page_struct_lock(pp); 2211 if (pp->p_slckcnt != 0) { 2212 if (pp->p_slckcnt < PAGE_SLOCK_MAXIMUM) { 2213 pp->p_slckcnt++; 2214 page_struct_unlock(pp); 2215 return (1); 2216 } 2217 page_struct_unlock(pp); 2218 segvn_lpglck_limit++; 2219 return (0); 2220 } 2221 mutex_enter(&freemem_lock); 2222 if (availrmem < tune.t_minarmem + pages) { 2223 mutex_exit(&freemem_lock); 2224 page_struct_unlock(pp); 2225 return (0); 2226 } 2227 pp->p_slckcnt++; 2228 availrmem -= pages; 2229 mutex_exit(&freemem_lock); 2230 page_struct_unlock(pp); 2231 return (1); 2232 } 2233 2234 static void 2235 segvn_pp_unlock_anonpages(page_t *pp, int first) 2236 { 2237 pgcnt_t pages; 2238 pfn_t pfn; 2239 2240 ASSERT(PAGE_LOCKED(pp)); 2241 ASSERT(pp->p_vnode != NULL); 2242 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 2243 2244 /* 2245 * pagesize won't change as long as any constituent page is locked. 2246 */ 2247 pages = page_get_pagecnt(pp->p_szc); 2248 pfn = page_pptonum(pp); 2249 2250 if (!first) { 2251 if (!IS_P2ALIGNED(pfn, pages)) { 2252 return; 2253 } 2254 } else if (!IS_P2ALIGNED(pfn, pages)) { 2255 pp = &pp[-(spgcnt_t)(pfn & (pages - 1))]; 2256 #ifdef DEBUG 2257 pfn = page_pptonum(pp); 2258 ASSERT(IS_P2ALIGNED(pfn, pages)); 2259 #endif /* DEBUG */ 2260 } 2261 ASSERT(pp->p_vnode != NULL); 2262 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 2263 ASSERT(pp->p_slckcnt != 0); 2264 page_struct_lock(pp); 2265 if (--pp->p_slckcnt == 0) { 2266 mutex_enter(&freemem_lock); 2267 availrmem += pages; 2268 mutex_exit(&freemem_lock); 2269 } 2270 page_struct_unlock(pp); 2271 } 2272 2273 /* 2274 * Do a F_SOFTUNLOCK call over the range requested. The range must have 2275 * already been F_SOFTLOCK'ed. 2276 * Caller must always match addr and len of a softunlock with a previous 2277 * softlock with exactly the same addr and len. 
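 * A mismatched range would leave svd->softlockcnt and the global
 * availrmem/segvn_pages_locked counters out of sync with what the
 * corresponding softlock charged.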
2278 */ 2279 static void 2280 segvn_softunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw) 2281 { 2282 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 2283 page_t *pp; 2284 caddr_t adr; 2285 struct vnode *vp; 2286 u_offset_t offset; 2287 ulong_t anon_index; 2288 struct anon_map *amp; 2289 struct anon *ap = NULL; 2290 2291 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 2292 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 2293 2294 if ((amp = svd->amp) != NULL) 2295 anon_index = svd->anon_index + seg_page(seg, addr); 2296 2297 hat_unlock(seg->s_as->a_hat, addr, len); 2298 for (adr = addr; adr < addr + len; adr += PAGESIZE) { 2299 if (amp != NULL) { 2300 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 2301 if ((ap = anon_get_ptr(amp->ahp, anon_index++)) 2302 != NULL) { 2303 swap_xlate(ap, &vp, &offset); 2304 } else { 2305 vp = svd->vp; 2306 offset = svd->offset + 2307 (uintptr_t)(adr - seg->s_base); 2308 } 2309 ANON_LOCK_EXIT(&->a_rwlock); 2310 } else { 2311 vp = svd->vp; 2312 offset = svd->offset + 2313 (uintptr_t)(adr - seg->s_base); 2314 } 2315 2316 /* 2317 * Use page_find() instead of page_lookup() to 2318 * find the page since we know that it is locked. 2319 */ 2320 pp = page_find(vp, offset); 2321 if (pp == NULL) { 2322 panic( 2323 "segvn_softunlock: addr %p, ap %p, vp %p, off %llx", 2324 (void *)adr, (void *)ap, (void *)vp, offset); 2325 /*NOTREACHED*/ 2326 } 2327 2328 if (rw == S_WRITE) { 2329 hat_setrefmod(pp); 2330 if (seg->s_as->a_vbits) 2331 hat_setstat(seg->s_as, adr, PAGESIZE, 2332 P_REF | P_MOD); 2333 } else if (rw != S_OTHER) { 2334 hat_setref(pp); 2335 if (seg->s_as->a_vbits) 2336 hat_setstat(seg->s_as, adr, PAGESIZE, P_REF); 2337 } 2338 TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT, 2339 "segvn_fault:pp %p vp %p offset %llx", pp, vp, offset); 2340 if (svd->vp == NULL) { 2341 segvn_pp_unlock_anonpages(pp, adr == addr); 2342 } 2343 page_unlock(pp); 2344 } 2345 mutex_enter(&freemem_lock); /* for availrmem */ 2346 if (svd->vp != NULL) { 2347 availrmem += btop(len); 2348 } 2349 segvn_pages_locked -= btop(len); 2350 svd->softlockcnt -= btop(len); 2351 mutex_exit(&freemem_lock); 2352 if (svd->softlockcnt == 0) { 2353 /* 2354 * All SOFTLOCKS are gone. Wakeup any waiting 2355 * unmappers so they can try again to unmap. 2356 * Check for waiters first without the mutex 2357 * held so we don't always grab the mutex on 2358 * softunlocks. 2359 */ 2360 if (AS_ISUNMAPWAIT(seg->s_as)) { 2361 mutex_enter(&seg->s_as->a_contents); 2362 if (AS_ISUNMAPWAIT(seg->s_as)) { 2363 AS_CLRUNMAPWAIT(seg->s_as); 2364 cv_broadcast(&seg->s_as->a_cv); 2365 } 2366 mutex_exit(&seg->s_as->a_contents); 2367 } 2368 } 2369 } 2370 2371 #define PAGE_HANDLED ((page_t *)-1) 2372 2373 /* 2374 * Release all the pages in the NULL terminated ppp list 2375 * which haven't already been converted to PAGE_HANDLED. 2376 */ 2377 static void 2378 segvn_pagelist_rele(page_t **ppp) 2379 { 2380 for (; *ppp != NULL; ppp++) { 2381 if (*ppp != PAGE_HANDLED) 2382 page_unlock(*ppp); 2383 } 2384 } 2385 2386 static int stealcow = 1; 2387 2388 /* 2389 * Workaround for viking chip bug. See bug id 1220902. 2390 * To fix this down in pagefault() would require importing so 2391 * much as and segvn code as to be unmaintainable. 2392 */ 2393 int enable_mbit_wa = 0; 2394 2395 /* 2396 * Handles all the dirty work of getting the right 2397 * anonymous pages and loading up the translations. 2398 * This routine is called only from segvn_fault() 2399 * when looping over the range of addresses requested. 
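 * It handles a single PAGESIZE page per call (the segment's s_szc
 * is asserted to be 0 here); large page faults go through the
 * segvn_fault_vnodepages()/segvn_fault_anonpages() paths instead.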
2400 * 2401 * The basic algorithm here is: 2402 * If this is an anon_zero case 2403 * Call anon_zero to allocate page 2404 * Load up translation 2405 * Return 2406 * endif 2407 * If this is an anon page 2408 * Use anon_getpage to get the page 2409 * else 2410 * Find page in pl[] list passed in 2411 * endif 2412 * If not a cow 2413 * Load up the translation to the page 2414 * return 2415 * endif 2416 * Call anon_private to handle cow 2417 * Load up (writable) translation to new page 2418 */ 2419 static faultcode_t 2420 segvn_faultpage( 2421 struct hat *hat, /* the hat to use for mapping */ 2422 struct seg *seg, /* seg_vn of interest */ 2423 caddr_t addr, /* address in as */ 2424 u_offset_t off, /* offset in vp */ 2425 struct vpage *vpage, /* pointer to vpage for vp, off */ 2426 page_t *pl[], /* object source page pointer */ 2427 uint_t vpprot, /* access allowed to object pages */ 2428 enum fault_type type, /* type of fault */ 2429 enum seg_rw rw, /* type of access at fault */ 2430 int brkcow, /* we may need to break cow */ 2431 int first) /* first page for this fault if 1 */ 2432 { 2433 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 2434 page_t *pp, **ppp; 2435 uint_t pageflags = 0; 2436 page_t *anon_pl[1 + 1]; 2437 page_t *opp = NULL; /* original page */ 2438 uint_t prot; 2439 int err; 2440 int cow; 2441 int claim; 2442 int steal = 0; 2443 ulong_t anon_index; 2444 struct anon *ap, *oldap; 2445 struct anon_map *amp; 2446 int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD; 2447 int anon_lock = 0; 2448 anon_sync_obj_t cookie; 2449 2450 if (svd->flags & MAP_TEXT) { 2451 hat_flag |= HAT_LOAD_TEXT; 2452 } 2453 2454 ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock)); 2455 ASSERT(seg->s_szc == 0); 2456 2457 /* 2458 * Initialize protection value for this page. 2459 * If we have per page protection values check it now. 2460 */ 2461 if (svd->pageprot) { 2462 uint_t protchk; 2463 2464 switch (rw) { 2465 case S_READ: 2466 protchk = PROT_READ; 2467 break; 2468 case S_WRITE: 2469 protchk = PROT_WRITE; 2470 break; 2471 case S_EXEC: 2472 protchk = PROT_EXEC; 2473 break; 2474 case S_OTHER: 2475 default: 2476 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 2477 break; 2478 } 2479 2480 prot = VPP_PROT(vpage); 2481 if ((prot & protchk) == 0) 2482 return (FC_PROT); /* illegal access type */ 2483 } else { 2484 prot = svd->prot; 2485 } 2486 2487 if (type == F_SOFTLOCK && svd->vp != NULL) { 2488 mutex_enter(&freemem_lock); 2489 if (availrmem <= tune.t_minarmem) { 2490 mutex_exit(&freemem_lock); 2491 return (FC_MAKE_ERR(ENOMEM)); /* out of real memory */ 2492 } else { 2493 availrmem--; 2494 svd->softlockcnt++; 2495 segvn_pages_locked++; 2496 } 2497 mutex_exit(&freemem_lock); 2498 } 2499 2500 /* 2501 * Always acquire the anon array lock to prevent 2 threads from 2502 * allocating separate anon slots for the same "addr". 2503 */ 2504 2505 if ((amp = svd->amp) != NULL) { 2506 ASSERT(RW_READ_HELD(&->a_rwlock)); 2507 anon_index = svd->anon_index + seg_page(seg, addr); 2508 anon_array_enter(amp, anon_index, &cookie); 2509 anon_lock = 1; 2510 } 2511 2512 if (svd->vp == NULL && amp != NULL) { 2513 if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) { 2514 /* 2515 * Allocate a (normally) writable anonymous page of 2516 * zeroes. If no advance reservations, reserve now. 
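 * For MAP_NORESERVE segments the swap reservation is made lazily,
 * one page at a time as each page is first touched, and svd->swresv
 * is bumped to match.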
2517 */ 2518 if (svd->flags & MAP_NORESERVE) { 2519 if (anon_resv_zone(ptob(1), 2520 seg->s_as->a_proc->p_zone)) { 2521 atomic_add_long(&svd->swresv, ptob(1)); 2522 } else { 2523 err = ENOMEM; 2524 goto out; 2525 } 2526 } 2527 if ((pp = anon_zero(seg, addr, &ap, 2528 svd->cred)) == NULL) { 2529 err = ENOMEM; 2530 goto out; /* out of swap space */ 2531 } 2532 /* 2533 * Re-acquire the anon_map lock and 2534 * initialize the anon array entry. 2535 */ 2536 (void) anon_set_ptr(amp->ahp, anon_index, ap, 2537 ANON_SLEEP); 2538 2539 ASSERT(pp->p_szc == 0); 2540 2541 /* 2542 * Handle pages that have been marked for migration 2543 */ 2544 if (lgrp_optimizations()) 2545 page_migrate(seg, addr, &pp, 1); 2546 2547 if (type == F_SOFTLOCK) { 2548 if (!segvn_pp_lock_anonpages(pp, first)) { 2549 page_unlock(pp); 2550 err = ENOMEM; 2551 goto out; 2552 } else { 2553 mutex_enter(&freemem_lock); 2554 svd->softlockcnt++; 2555 segvn_pages_locked++; 2556 mutex_exit(&freemem_lock); 2557 } 2558 } 2559 2560 if (enable_mbit_wa) { 2561 if (rw == S_WRITE) 2562 hat_setmod(pp); 2563 else if (!hat_ismod(pp)) 2564 prot &= ~PROT_WRITE; 2565 } 2566 /* 2567 * If AS_PAGLCK is set in a_flags (via memcntl(2) 2568 * with MC_LOCKAS, MCL_FUTURE) and this is a 2569 * MAP_NORESERVE segment, we may need to 2570 * permanently lock the page as it is being faulted 2571 * for the first time. The following text applies 2572 * only to MAP_NORESERVE segments: 2573 * 2574 * As per memcntl(2), if this segment was created 2575 * after MCL_FUTURE was applied (a "future" 2576 * segment), its pages must be locked. If this 2577 * segment existed at MCL_FUTURE application (a 2578 * "past" segment), the interface is unclear. 2579 * 2580 * We decide to lock only if vpage is present: 2581 * 2582 * - "future" segments will have a vpage array (see 2583 * as_map), and so will be locked as required 2584 * 2585 * - "past" segments may not have a vpage array, 2586 * depending on whether events (such as 2587 * mprotect) have occurred. Locking if vpage 2588 * exists will preserve legacy behavior. Not 2589 * locking if vpage is absent, will not break 2590 * the interface or legacy behavior. Note that 2591 * allocating vpage here if it's absent requires 2592 * upgrading the segvn reader lock, the cost of 2593 * which does not seem worthwhile. 2594 * 2595 * Usually testing and setting VPP_ISPPLOCK and 2596 * VPP_SETPPLOCK requires holding the segvn lock as 2597 * writer, but in this case all readers are 2598 * serializing on the anon array lock. 2599 */ 2600 if (AS_ISPGLCK(seg->s_as) && vpage != NULL && 2601 (svd->flags & MAP_NORESERVE) && 2602 !VPP_ISPPLOCK(vpage)) { 2603 proc_t *p = seg->s_as->a_proc; 2604 ASSERT(svd->type == MAP_PRIVATE); 2605 mutex_enter(&p->p_lock); 2606 if (rctl_incr_locked_mem(p, NULL, PAGESIZE, 2607 1) == 0) { 2608 claim = VPP_PROT(vpage) & PROT_WRITE; 2609 if (page_pp_lock(pp, claim, 0)) { 2610 VPP_SETPPLOCK(vpage); 2611 } else { 2612 rctl_decr_locked_mem(p, NULL, 2613 PAGESIZE, 1); 2614 } 2615 } 2616 mutex_exit(&p->p_lock); 2617 } 2618 2619 hat_memload(hat, addr, pp, prot, hat_flag); 2620 2621 if (!(hat_flag & HAT_LOAD_LOCK)) 2622 page_unlock(pp); 2623 2624 anon_array_exit(&cookie); 2625 return (0); 2626 } 2627 } 2628 2629 /* 2630 * Obtain the page structure via anon_getpage() if it is 2631 * a private copy of an object (the result of a previous 2632 * copy-on-write). 
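 * If an anon slot already exists it supplies the source page (opp);
 * otherwise the original object page is picked out of the pl[]
 * list the caller filled in via VOP_GETPAGE().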
2633 */ 2634 if (amp != NULL) { 2635 if ((ap = anon_get_ptr(amp->ahp, anon_index)) != NULL) { 2636 err = anon_getpage(&ap, &vpprot, anon_pl, PAGESIZE, 2637 seg, addr, rw, svd->cred); 2638 if (err) 2639 goto out; 2640 2641 if (svd->type == MAP_SHARED) { 2642 /* 2643 * If this is a shared mapping to an 2644 * anon_map, then ignore the write 2645 * permissions returned by anon_getpage(). 2646 * They apply to the private mappings 2647 * of this anon_map. 2648 */ 2649 vpprot |= PROT_WRITE; 2650 } 2651 opp = anon_pl[0]; 2652 } 2653 } 2654 2655 /* 2656 * Search the pl[] list passed in if it is from the 2657 * original object (i.e., not a private copy). 2658 */ 2659 if (opp == NULL) { 2660 /* 2661 * Find original page. We must be bringing it in 2662 * from the list in pl[]. 2663 */ 2664 for (ppp = pl; (opp = *ppp) != NULL; ppp++) { 2665 if (opp == PAGE_HANDLED) 2666 continue; 2667 ASSERT(opp->p_vnode == svd->vp); /* XXX */ 2668 if (opp->p_offset == off) 2669 break; 2670 } 2671 if (opp == NULL) { 2672 panic("segvn_faultpage not found"); 2673 /*NOTREACHED*/ 2674 } 2675 *ppp = PAGE_HANDLED; 2676 2677 } 2678 2679 ASSERT(PAGE_LOCKED(opp)); 2680 2681 TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT, 2682 "segvn_fault:pp %p vp %p offset %llx", 2683 opp, NULL, 0); 2684 2685 /* 2686 * The fault is treated as a copy-on-write fault if a 2687 * write occurs on a private segment and the object 2688 * page (i.e., mapping) is write protected. We assume 2689 * that fatal protection checks have already been made. 2690 */ 2691 2692 if (brkcow) { 2693 ASSERT(svd->tr_state == SEGVN_TR_OFF); 2694 cow = !(vpprot & PROT_WRITE); 2695 } else if (svd->tr_state == SEGVN_TR_ON) { 2696 /* 2697 * If we are doing text replication COW on first touch. 2698 */ 2699 ASSERT(amp != NULL); 2700 ASSERT(svd->vp != NULL); 2701 ASSERT(rw != S_WRITE); 2702 cow = (ap == NULL); 2703 } else { 2704 cow = 0; 2705 } 2706 2707 /* 2708 * If not a copy-on-write case load the translation 2709 * and return. 2710 */ 2711 if (cow == 0) { 2712 2713 /* 2714 * Handle pages that have been marked for migration 2715 */ 2716 if (lgrp_optimizations()) 2717 page_migrate(seg, addr, &opp, 1); 2718 2719 if (type == F_SOFTLOCK && svd->vp == NULL) { 2720 2721 ASSERT(opp->p_szc == 0 || 2722 (svd->type == MAP_SHARED && 2723 amp != NULL && amp->a_szc != 0)); 2724 2725 if (!segvn_pp_lock_anonpages(opp, first)) { 2726 page_unlock(opp); 2727 err = ENOMEM; 2728 goto out; 2729 } else { 2730 mutex_enter(&freemem_lock); 2731 svd->softlockcnt++; 2732 segvn_pages_locked++; 2733 mutex_exit(&freemem_lock); 2734 } 2735 } 2736 if (IS_VMODSORT(opp->p_vnode) || enable_mbit_wa) { 2737 if (rw == S_WRITE) 2738 hat_setmod(opp); 2739 else if (rw != S_OTHER && !hat_ismod(opp)) 2740 prot &= ~PROT_WRITE; 2741 } 2742 2743 hat_memload(hat, addr, opp, prot & vpprot, hat_flag); 2744 2745 if (!(hat_flag & HAT_LOAD_LOCK)) 2746 page_unlock(opp); 2747 2748 if (anon_lock) { 2749 anon_array_exit(&cookie); 2750 } 2751 return (0); 2752 } 2753 2754 hat_setref(opp); 2755 2756 ASSERT(amp != NULL && anon_lock); 2757 2758 /* 2759 * Steal the page only if it isn't a private page 2760 * since stealing a private page is not worth the effort. 
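 * "Stealing" means handing the original page itself to anon_private()
 * (via the STEAL_PAGE flag) instead of allocating a new page and
 * copying; it is only attempted under memory pressure, subject to
 * the checks below.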
2761 */ 2762 if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) 2763 steal = 1; 2764 2765 /* 2766 * Steal the original page if the following conditions are true: 2767 * 2768 * We are low on memory, the page is not private, page is not large, 2769 * not shared, not modified, not `locked' or if we have it `locked' 2770 * (i.e., p_cowcnt == 1 and p_lckcnt == 0, which also implies 2771 * that the page is not shared) and if it doesn't have any 2772 * translations. page_struct_lock isn't needed to look at p_cowcnt 2773 * and p_lckcnt because we first get exclusive lock on page. 2774 */ 2775 (void) hat_pagesync(opp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD); 2776 2777 if (stealcow && freemem < minfree && steal && opp->p_szc == 0 && 2778 page_tryupgrade(opp) && !hat_ismod(opp) && 2779 ((opp->p_lckcnt == 0 && opp->p_cowcnt == 0) || 2780 (opp->p_lckcnt == 0 && opp->p_cowcnt == 1 && 2781 vpage != NULL && VPP_ISPPLOCK(vpage)))) { 2782 /* 2783 * Check if this page has other translations 2784 * after unloading our translation. 2785 */ 2786 if (hat_page_is_mapped(opp)) { 2787 hat_unload(seg->s_as->a_hat, addr, PAGESIZE, 2788 HAT_UNLOAD); 2789 } 2790 2791 /* 2792 * hat_unload() might sync back someone else's recent 2793 * modification, so check again. 2794 */ 2795 if (!hat_ismod(opp) && !hat_page_is_mapped(opp)) 2796 pageflags |= STEAL_PAGE; 2797 } 2798 2799 /* 2800 * If we have a vpage pointer, see if it indicates that we have 2801 * ``locked'' the page we map -- if so, tell anon_private to 2802 * transfer the locking resource to the new page. 2803 * 2804 * See Statement at the beginning of segvn_lockop regarding 2805 * the way lockcnts/cowcnts are handled during COW. 2806 * 2807 */ 2808 if (vpage != NULL && VPP_ISPPLOCK(vpage)) 2809 pageflags |= LOCK_PAGE; 2810 2811 /* 2812 * Allocate a private page and perform the copy. 2813 * For MAP_NORESERVE reserve swap space now, unless this 2814 * is a cow fault on an existing anon page in which case 2815 * MAP_NORESERVE will have made advance reservations. 2816 */ 2817 if ((svd->flags & MAP_NORESERVE) && (ap == NULL)) { 2818 if (anon_resv_zone(ptob(1), seg->s_as->a_proc->p_zone)) { 2819 atomic_add_long(&svd->swresv, ptob(1)); 2820 } else { 2821 page_unlock(opp); 2822 err = ENOMEM; 2823 goto out; 2824 } 2825 } 2826 oldap = ap; 2827 pp = anon_private(&ap, seg, addr, prot, opp, pageflags, svd->cred); 2828 if (pp == NULL) { 2829 err = ENOMEM; /* out of swap space */ 2830 goto out; 2831 } 2832 2833 /* 2834 * If we copied away from an anonymous page, then 2835 * we are one step closer to freeing up an anon slot. 2836 * 2837 * NOTE: The original anon slot must be released while 2838 * holding the "anon_map" lock. This is necessary to prevent 2839 * other threads from obtaining a pointer to the anon slot 2840 * which may be freed if its "refcnt" is 1. 
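 * The per-index lock taken by anon_array_enter() above serializes
 * faulting threads on this slot, so the anon_decref() of the old
 * slot and the anon_set_ptr() of the new one appear atomic to
 * other faulters.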
2841 */ 2842 if (oldap != NULL) 2843 anon_decref(oldap); 2844 2845 (void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP); 2846 2847 /* 2848 * Handle pages that have been marked for migration 2849 */ 2850 if (lgrp_optimizations()) 2851 page_migrate(seg, addr, &pp, 1); 2852 2853 ASSERT(pp->p_szc == 0); 2854 if (type == F_SOFTLOCK && svd->vp == NULL) { 2855 if (!segvn_pp_lock_anonpages(pp, first)) { 2856 page_unlock(pp); 2857 err = ENOMEM; 2858 goto out; 2859 } else { 2860 mutex_enter(&freemem_lock); 2861 svd->softlockcnt++; 2862 segvn_pages_locked++; 2863 mutex_exit(&freemem_lock); 2864 } 2865 } 2866 2867 ASSERT(!IS_VMODSORT(pp->p_vnode)); 2868 if (enable_mbit_wa) { 2869 if (rw == S_WRITE) 2870 hat_setmod(pp); 2871 else if (!hat_ismod(pp)) 2872 prot &= ~PROT_WRITE; 2873 } 2874 2875 hat_memload(hat, addr, pp, prot, hat_flag); 2876 2877 if (!(hat_flag & HAT_LOAD_LOCK)) 2878 page_unlock(pp); 2879 2880 ASSERT(anon_lock); 2881 anon_array_exit(&cookie); 2882 return (0); 2883 out: 2884 if (anon_lock) 2885 anon_array_exit(&cookie); 2886 2887 if (type == F_SOFTLOCK && svd->vp != NULL) { 2888 mutex_enter(&freemem_lock); 2889 availrmem++; 2890 segvn_pages_locked--; 2891 svd->softlockcnt--; 2892 mutex_exit(&freemem_lock); 2893 } 2894 return (FC_MAKE_ERR(err)); 2895 } 2896 2897 /* 2898 * relocate a bunch of smaller targ pages into one large repl page. all targ 2899 * pages must be complete pages smaller than replacement pages. 2900 * it's assumed that no page's szc can change since they are all PAGESIZE or 2901 * complete large pages locked SHARED. 2902 */ 2903 static void 2904 segvn_relocate_pages(page_t **targ, page_t *replacement) 2905 { 2906 page_t *pp; 2907 pgcnt_t repl_npgs, curnpgs; 2908 pgcnt_t i; 2909 uint_t repl_szc = replacement->p_szc; 2910 page_t *first_repl = replacement; 2911 page_t *repl; 2912 spgcnt_t npgs; 2913 2914 VM_STAT_ADD(segvnvmstats.relocatepages[0]); 2915 2916 ASSERT(repl_szc != 0); 2917 npgs = repl_npgs = page_get_pagecnt(repl_szc); 2918 2919 i = 0; 2920 while (repl_npgs) { 2921 spgcnt_t nreloc; 2922 int err; 2923 ASSERT(replacement != NULL); 2924 pp = targ[i]; 2925 ASSERT(pp->p_szc < repl_szc); 2926 ASSERT(PAGE_EXCL(pp)); 2927 ASSERT(!PP_ISFREE(pp)); 2928 curnpgs = page_get_pagecnt(pp->p_szc); 2929 if (curnpgs == 1) { 2930 VM_STAT_ADD(segvnvmstats.relocatepages[1]); 2931 repl = replacement; 2932 page_sub(&replacement, repl); 2933 ASSERT(PAGE_EXCL(repl)); 2934 ASSERT(!PP_ISFREE(repl)); 2935 ASSERT(repl->p_szc == repl_szc); 2936 } else { 2937 page_t *repl_savepp; 2938 int j; 2939 VM_STAT_ADD(segvnvmstats.relocatepages[2]); 2940 repl_savepp = replacement; 2941 for (j = 0; j < curnpgs; j++) { 2942 repl = replacement; 2943 page_sub(&replacement, repl); 2944 ASSERT(PAGE_EXCL(repl)); 2945 ASSERT(!PP_ISFREE(repl)); 2946 ASSERT(repl->p_szc == repl_szc); 2947 ASSERT(page_pptonum(targ[i + j]) == 2948 page_pptonum(targ[i]) + j); 2949 } 2950 repl = repl_savepp; 2951 ASSERT(IS_P2ALIGNED(page_pptonum(repl), curnpgs)); 2952 } 2953 err = page_relocate(&pp, &repl, 0, 1, &nreloc, NULL); 2954 if (err || nreloc != curnpgs) { 2955 panic("segvn_relocate_pages: " 2956 "page_relocate failed err=%d curnpgs=%ld " 2957 "nreloc=%ld", err, curnpgs, nreloc); 2958 } 2959 ASSERT(curnpgs <= repl_npgs); 2960 repl_npgs -= curnpgs; 2961 i += curnpgs; 2962 } 2963 ASSERT(replacement == NULL); 2964 2965 repl = first_repl; 2966 repl_npgs = npgs; 2967 for (i = 0; i < repl_npgs; i++) { 2968 ASSERT(PAGE_EXCL(repl)); 2969 ASSERT(!PP_ISFREE(repl)); 2970 targ[i] = repl; 2971 page_downgrade(targ[i]); 2972 repl++; 2973 
} 2974 } 2975 2976 /* 2977 * Check if all pages in ppa array are complete smaller than szc pages and 2978 * their roots will still be aligned relative to their current size if the 2979 * entire ppa array is relocated into one szc page. If these conditions are 2980 * not met return 0. 2981 * 2982 * If all pages are properly aligned attempt to upgrade their locks 2983 * to exclusive mode. If it fails set *upgrdfail to 1 and return 0. 2984 * upgrdfail was set to 0 by caller. 2985 * 2986 * Return 1 if all pages are aligned and locked exclusively. 2987 * 2988 * If all pages in ppa array happen to be physically contiguous to make one 2989 * szc page and all exclusive locks are successfully obtained promote the page 2990 * size to szc and set *pszc to szc. Return 1 with pages locked shared. 2991 */ 2992 static int 2993 segvn_full_szcpages(page_t **ppa, uint_t szc, int *upgrdfail, uint_t *pszc) 2994 { 2995 page_t *pp; 2996 pfn_t pfn; 2997 pgcnt_t totnpgs = page_get_pagecnt(szc); 2998 pfn_t first_pfn; 2999 int contig = 1; 3000 pgcnt_t i; 3001 pgcnt_t j; 3002 uint_t curszc; 3003 pgcnt_t curnpgs; 3004 int root = 0; 3005 3006 ASSERT(szc > 0); 3007 3008 VM_STAT_ADD(segvnvmstats.fullszcpages[0]); 3009 3010 for (i = 0; i < totnpgs; i++) { 3011 pp = ppa[i]; 3012 ASSERT(PAGE_SHARED(pp)); 3013 ASSERT(!PP_ISFREE(pp)); 3014 pfn = page_pptonum(pp); 3015 if (i == 0) { 3016 if (!IS_P2ALIGNED(pfn, totnpgs)) { 3017 contig = 0; 3018 } else { 3019 first_pfn = pfn; 3020 } 3021 } else if (contig && pfn != first_pfn + i) { 3022 contig = 0; 3023 } 3024 if (pp->p_szc == 0) { 3025 if (root) { 3026 VM_STAT_ADD(segvnvmstats.fullszcpages[1]); 3027 return (0); 3028 } 3029 } else if (!root) { 3030 if ((curszc = pp->p_szc) >= szc) { 3031 VM_STAT_ADD(segvnvmstats.fullszcpages[2]); 3032 return (0); 3033 } 3034 if (curszc == 0) { 3035 /* 3036 * p_szc changed means we don't have all pages 3037 * locked. return failure. 3038 */ 3039 VM_STAT_ADD(segvnvmstats.fullszcpages[3]); 3040 return (0); 3041 } 3042 curnpgs = page_get_pagecnt(curszc); 3043 if (!IS_P2ALIGNED(pfn, curnpgs) || 3044 !IS_P2ALIGNED(i, curnpgs)) { 3045 VM_STAT_ADD(segvnvmstats.fullszcpages[4]); 3046 return (0); 3047 } 3048 root = 1; 3049 } else { 3050 ASSERT(i > 0); 3051 VM_STAT_ADD(segvnvmstats.fullszcpages[5]); 3052 if (pp->p_szc != curszc) { 3053 VM_STAT_ADD(segvnvmstats.fullszcpages[6]); 3054 return (0); 3055 } 3056 if (pfn - 1 != page_pptonum(ppa[i - 1])) { 3057 panic("segvn_full_szcpages: " 3058 "large page not physically contiguous"); 3059 } 3060 if (P2PHASE(pfn, curnpgs) == curnpgs - 1) { 3061 root = 0; 3062 } 3063 } 3064 } 3065 3066 for (i = 0; i < totnpgs; i++) { 3067 ASSERT(ppa[i]->p_szc < szc); 3068 if (!page_tryupgrade(ppa[i])) { 3069 for (j = 0; j < i; j++) { 3070 page_downgrade(ppa[j]); 3071 } 3072 *pszc = ppa[i]->p_szc; 3073 *upgrdfail = 1; 3074 VM_STAT_ADD(segvnvmstats.fullszcpages[7]); 3075 return (0); 3076 } 3077 } 3078 3079 /* 3080 * When a page is put a free cachelist its szc is set to 0. if file 3081 * system reclaimed pages from cachelist targ pages will be physically 3082 * contiguous with 0 p_szc. in this case just upgrade szc of targ 3083 * pages without any relocations. 3084 * To avoid any hat issues with previous small mappings 3085 * hat_pageunload() the target pages first. 
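 * The in-place promotion below is done while every constituent page
 * is still held EXCL; each page is then downgraded back to SHARED
 * before returning, matching the lock state promised to the caller.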
3086 */ 3087 if (contig) { 3088 VM_STAT_ADD(segvnvmstats.fullszcpages[8]); 3089 for (i = 0; i < totnpgs; i++) { 3090 (void) hat_pageunload(ppa[i], HAT_FORCE_PGUNLOAD); 3091 } 3092 for (i = 0; i < totnpgs; i++) { 3093 ppa[i]->p_szc = szc; 3094 } 3095 for (i = 0; i < totnpgs; i++) { 3096 ASSERT(PAGE_EXCL(ppa[i])); 3097 page_downgrade(ppa[i]); 3098 } 3099 if (pszc != NULL) { 3100 *pszc = szc; 3101 } 3102 } 3103 VM_STAT_ADD(segvnvmstats.fullszcpages[9]); 3104 return (1); 3105 } 3106 3107 /* 3108 * Create physically contiguous pages for [vp, off] - [vp, off + 3109 * page_size(szc)) range and for private segment return them in ppa array. 3110 * Pages are created either via IO or relocations. 3111 * 3112 * Return 1 on sucess and 0 on failure. 3113 * 3114 * If physically contiguos pages already exist for this range return 1 without 3115 * filling ppa array. Caller initializes ppa[0] as NULL to detect that ppa 3116 * array wasn't filled. In this case caller fills ppa array via VOP_GETPAGE(). 3117 */ 3118 3119 static int 3120 segvn_fill_vp_pages(struct segvn_data *svd, vnode_t *vp, u_offset_t off, 3121 uint_t szc, page_t **ppa, page_t **ppplist, uint_t *ret_pszc, 3122 int *downsize) 3123 3124 { 3125 page_t *pplist = *ppplist; 3126 size_t pgsz = page_get_pagesize(szc); 3127 pgcnt_t pages = btop(pgsz); 3128 ulong_t start_off = off; 3129 u_offset_t eoff = off + pgsz; 3130 spgcnt_t nreloc; 3131 u_offset_t io_off = off; 3132 size_t io_len; 3133 page_t *io_pplist = NULL; 3134 page_t *done_pplist = NULL; 3135 pgcnt_t pgidx = 0; 3136 page_t *pp; 3137 page_t *newpp; 3138 page_t *targpp; 3139 int io_err = 0; 3140 int i; 3141 pfn_t pfn; 3142 ulong_t ppages; 3143 page_t *targ_pplist = NULL; 3144 page_t *repl_pplist = NULL; 3145 page_t *tmp_pplist; 3146 int nios = 0; 3147 uint_t pszc; 3148 struct vattr va; 3149 3150 VM_STAT_ADD(segvnvmstats.fill_vp_pages[0]); 3151 3152 ASSERT(szc != 0); 3153 ASSERT(pplist->p_szc == szc); 3154 3155 /* 3156 * downsize will be set to 1 only if we fail to lock pages. this will 3157 * allow subsequent faults to try to relocate the page again. If we 3158 * fail due to misalignment don't downsize and let the caller map the 3159 * whole region with small mappings to avoid more faults into the area 3160 * where we can't get large pages anyway. 3161 */ 3162 *downsize = 0; 3163 3164 while (off < eoff) { 3165 newpp = pplist; 3166 ASSERT(newpp != NULL); 3167 ASSERT(PAGE_EXCL(newpp)); 3168 ASSERT(!PP_ISFREE(newpp)); 3169 /* 3170 * we pass NULL for nrelocp to page_lookup_create() 3171 * so that it doesn't relocate. We relocate here 3172 * later only after we make sure we can lock all 3173 * pages in the range we handle and they are all 3174 * aligned. 
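 * Pre-existing pages found here are only queued (targ_pplist and
 * repl_pplist); the page_relocate() calls happen in one pass at the
 * bottom of the function, once the whole range is known to be
 * lockable and properly aligned.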
3175 */ 3176 pp = page_lookup_create(vp, off, SE_SHARED, newpp, NULL, 0); 3177 ASSERT(pp != NULL); 3178 ASSERT(!PP_ISFREE(pp)); 3179 ASSERT(pp->p_vnode == vp); 3180 ASSERT(pp->p_offset == off); 3181 if (pp == newpp) { 3182 VM_STAT_ADD(segvnvmstats.fill_vp_pages[1]); 3183 page_sub(&pplist, pp); 3184 ASSERT(PAGE_EXCL(pp)); 3185 ASSERT(page_iolock_assert(pp)); 3186 page_list_concat(&io_pplist, &pp); 3187 off += PAGESIZE; 3188 continue; 3189 } 3190 VM_STAT_ADD(segvnvmstats.fill_vp_pages[2]); 3191 pfn = page_pptonum(pp); 3192 pszc = pp->p_szc; 3193 if (pszc >= szc && targ_pplist == NULL && io_pplist == NULL && 3194 IS_P2ALIGNED(pfn, pages)) { 3195 ASSERT(repl_pplist == NULL); 3196 ASSERT(done_pplist == NULL); 3197 ASSERT(pplist == *ppplist); 3198 page_unlock(pp); 3199 page_free_replacement_page(pplist); 3200 page_create_putback(pages); 3201 *ppplist = NULL; 3202 VM_STAT_ADD(segvnvmstats.fill_vp_pages[3]); 3203 return (1); 3204 } 3205 if (pszc >= szc) { 3206 page_unlock(pp); 3207 segvn_faultvnmpss_align_err1++; 3208 goto out; 3209 } 3210 ppages = page_get_pagecnt(pszc); 3211 if (!IS_P2ALIGNED(pfn, ppages)) { 3212 ASSERT(pszc > 0); 3213 /* 3214 * sizing down to pszc won't help. 3215 */ 3216 page_unlock(pp); 3217 segvn_faultvnmpss_align_err2++; 3218 goto out; 3219 } 3220 pfn = page_pptonum(newpp); 3221 if (!IS_P2ALIGNED(pfn, ppages)) { 3222 ASSERT(pszc > 0); 3223 /* 3224 * sizing down to pszc won't help. 3225 */ 3226 page_unlock(pp); 3227 segvn_faultvnmpss_align_err3++; 3228 goto out; 3229 } 3230 if (!PAGE_EXCL(pp)) { 3231 VM_STAT_ADD(segvnvmstats.fill_vp_pages[4]); 3232 page_unlock(pp); 3233 *downsize = 1; 3234 *ret_pszc = pp->p_szc; 3235 goto out; 3236 } 3237 targpp = pp; 3238 if (io_pplist != NULL) { 3239 VM_STAT_ADD(segvnvmstats.fill_vp_pages[5]); 3240 io_len = off - io_off; 3241 /* 3242 * Some file systems like NFS don't check EOF 3243 * conditions in VOP_PAGEIO(). Check it here 3244 * now that pages are locked SE_EXCL. Any file 3245 * truncation will wait until the pages are 3246 * unlocked so no need to worry that file will 3247 * be truncated after we check its size here. 3248 * XXX fix NFS to remove this check. 
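 * If the file turns out to be too short to cover the pending I/O,
 * *downsize is set so the fault is retried with a smaller page size
 * rather than reading past EOF.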
3249 */ 3250 va.va_mask = AT_SIZE; 3251 if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred) != 0) { 3252 VM_STAT_ADD(segvnvmstats.fill_vp_pages[6]); 3253 page_unlock(targpp); 3254 goto out; 3255 } 3256 if (btopr(va.va_size) < btopr(io_off + io_len)) { 3257 VM_STAT_ADD(segvnvmstats.fill_vp_pages[7]); 3258 *downsize = 1; 3259 *ret_pszc = 0; 3260 page_unlock(targpp); 3261 goto out; 3262 } 3263 io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len, 3264 B_READ, svd->cred); 3265 if (io_err) { 3266 VM_STAT_ADD(segvnvmstats.fill_vp_pages[8]); 3267 page_unlock(targpp); 3268 if (io_err == EDEADLK) { 3269 segvn_vmpss_pageio_deadlk_err++; 3270 } 3271 goto out; 3272 } 3273 nios++; 3274 VM_STAT_ADD(segvnvmstats.fill_vp_pages[9]); 3275 while (io_pplist != NULL) { 3276 pp = io_pplist; 3277 page_sub(&io_pplist, pp); 3278 ASSERT(page_iolock_assert(pp)); 3279 page_io_unlock(pp); 3280 pgidx = (pp->p_offset - start_off) >> 3281 PAGESHIFT; 3282 ASSERT(pgidx < pages); 3283 ppa[pgidx] = pp; 3284 page_list_concat(&done_pplist, &pp); 3285 } 3286 } 3287 pp = targpp; 3288 ASSERT(PAGE_EXCL(pp)); 3289 ASSERT(pp->p_szc <= pszc); 3290 if (pszc != 0 && !group_page_trylock(pp, SE_EXCL)) { 3291 VM_STAT_ADD(segvnvmstats.fill_vp_pages[10]); 3292 page_unlock(pp); 3293 *downsize = 1; 3294 *ret_pszc = pp->p_szc; 3295 goto out; 3296 } 3297 VM_STAT_ADD(segvnvmstats.fill_vp_pages[11]); 3298 /* 3299 * page szc chould have changed before the entire group was 3300 * locked. reread page szc. 3301 */ 3302 pszc = pp->p_szc; 3303 ppages = page_get_pagecnt(pszc); 3304 3305 /* link just the roots */ 3306 page_list_concat(&targ_pplist, &pp); 3307 page_sub(&pplist, newpp); 3308 page_list_concat(&repl_pplist, &newpp); 3309 off += PAGESIZE; 3310 while (--ppages != 0) { 3311 newpp = pplist; 3312 page_sub(&pplist, newpp); 3313 off += PAGESIZE; 3314 } 3315 io_off = off; 3316 } 3317 if (io_pplist != NULL) { 3318 VM_STAT_ADD(segvnvmstats.fill_vp_pages[12]); 3319 io_len = eoff - io_off; 3320 va.va_mask = AT_SIZE; 3321 if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred) != 0) { 3322 VM_STAT_ADD(segvnvmstats.fill_vp_pages[13]); 3323 goto out; 3324 } 3325 if (btopr(va.va_size) < btopr(io_off + io_len)) { 3326 VM_STAT_ADD(segvnvmstats.fill_vp_pages[14]); 3327 *downsize = 1; 3328 *ret_pszc = 0; 3329 goto out; 3330 } 3331 io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len, 3332 B_READ, svd->cred); 3333 if (io_err) { 3334 VM_STAT_ADD(segvnvmstats.fill_vp_pages[15]); 3335 if (io_err == EDEADLK) { 3336 segvn_vmpss_pageio_deadlk_err++; 3337 } 3338 goto out; 3339 } 3340 nios++; 3341 while (io_pplist != NULL) { 3342 pp = io_pplist; 3343 page_sub(&io_pplist, pp); 3344 ASSERT(page_iolock_assert(pp)); 3345 page_io_unlock(pp); 3346 pgidx = (pp->p_offset - start_off) >> PAGESHIFT; 3347 ASSERT(pgidx < pages); 3348 ppa[pgidx] = pp; 3349 } 3350 } 3351 /* 3352 * we're now bound to succeed or panic. 3353 * remove pages from done_pplist. it's not needed anymore. 
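 * The pages already read in were stored at their final slots in
 * ppa[] as each I/O completed, so the list linkage can simply be
 * dismantled here.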
3354 */ 3355 while (done_pplist != NULL) { 3356 pp = done_pplist; 3357 page_sub(&done_pplist, pp); 3358 } 3359 VM_STAT_ADD(segvnvmstats.fill_vp_pages[16]); 3360 ASSERT(pplist == NULL); 3361 *ppplist = NULL; 3362 while (targ_pplist != NULL) { 3363 int ret; 3364 VM_STAT_ADD(segvnvmstats.fill_vp_pages[17]); 3365 ASSERT(repl_pplist); 3366 pp = targ_pplist; 3367 page_sub(&targ_pplist, pp); 3368 pgidx = (pp->p_offset - start_off) >> PAGESHIFT; 3369 newpp = repl_pplist; 3370 page_sub(&repl_pplist, newpp); 3371 #ifdef DEBUG 3372 pfn = page_pptonum(pp); 3373 pszc = pp->p_szc; 3374 ppages = page_get_pagecnt(pszc); 3375 ASSERT(IS_P2ALIGNED(pfn, ppages)); 3376 pfn = page_pptonum(newpp); 3377 ASSERT(IS_P2ALIGNED(pfn, ppages)); 3378 ASSERT(P2PHASE(pfn, pages) == pgidx); 3379 #endif 3380 nreloc = 0; 3381 ret = page_relocate(&pp, &newpp, 0, 1, &nreloc, NULL); 3382 if (ret != 0 || nreloc == 0) { 3383 panic("segvn_fill_vp_pages: " 3384 "page_relocate failed"); 3385 } 3386 pp = newpp; 3387 while (nreloc-- != 0) { 3388 ASSERT(PAGE_EXCL(pp)); 3389 ASSERT(pp->p_vnode == vp); 3390 ASSERT(pgidx == 3391 ((pp->p_offset - start_off) >> PAGESHIFT)); 3392 ppa[pgidx++] = pp; 3393 pp++; 3394 } 3395 } 3396 3397 if (svd->type == MAP_PRIVATE) { 3398 VM_STAT_ADD(segvnvmstats.fill_vp_pages[18]); 3399 for (i = 0; i < pages; i++) { 3400 ASSERT(ppa[i] != NULL); 3401 ASSERT(PAGE_EXCL(ppa[i])); 3402 ASSERT(ppa[i]->p_vnode == vp); 3403 ASSERT(ppa[i]->p_offset == 3404 start_off + (i << PAGESHIFT)); 3405 page_downgrade(ppa[i]); 3406 } 3407 ppa[pages] = NULL; 3408 } else { 3409 VM_STAT_ADD(segvnvmstats.fill_vp_pages[19]); 3410 /* 3411 * the caller will still call VOP_GETPAGE() for shared segments 3412 * to check FS write permissions. For private segments we map 3413 * file read only anyway. so no VOP_GETPAGE is needed. 3414 */ 3415 for (i = 0; i < pages; i++) { 3416 ASSERT(ppa[i] != NULL); 3417 ASSERT(PAGE_EXCL(ppa[i])); 3418 ASSERT(ppa[i]->p_vnode == vp); 3419 ASSERT(ppa[i]->p_offset == 3420 start_off + (i << PAGESHIFT)); 3421 page_unlock(ppa[i]); 3422 } 3423 ppa[0] = NULL; 3424 } 3425 3426 return (1); 3427 out: 3428 /* 3429 * Do the cleanup. Unlock target pages we didn't relocate. They are 3430 * linked on targ_pplist by root pages. reassemble unused replacement 3431 * and io pages back to pplist. 
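 * Pages that were created for I/O but never read are hashed out of
 * the vnode first, since their contents were never made valid, and
 * are then merged back into the replacement list.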
3432 */ 3433 if (io_pplist != NULL) { 3434 VM_STAT_ADD(segvnvmstats.fill_vp_pages[20]); 3435 pp = io_pplist; 3436 do { 3437 ASSERT(pp->p_vnode == vp); 3438 ASSERT(pp->p_offset == io_off); 3439 ASSERT(page_iolock_assert(pp)); 3440 page_io_unlock(pp); 3441 page_hashout(pp, NULL); 3442 io_off += PAGESIZE; 3443 } while ((pp = pp->p_next) != io_pplist); 3444 page_list_concat(&io_pplist, &pplist); 3445 pplist = io_pplist; 3446 } 3447 tmp_pplist = NULL; 3448 while (targ_pplist != NULL) { 3449 VM_STAT_ADD(segvnvmstats.fill_vp_pages[21]); 3450 pp = targ_pplist; 3451 ASSERT(PAGE_EXCL(pp)); 3452 page_sub(&targ_pplist, pp); 3453 3454 pszc = pp->p_szc; 3455 ppages = page_get_pagecnt(pszc); 3456 ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages)); 3457 3458 if (pszc != 0) { 3459 group_page_unlock(pp); 3460 } 3461 page_unlock(pp); 3462 3463 pp = repl_pplist; 3464 ASSERT(pp != NULL); 3465 ASSERT(PAGE_EXCL(pp)); 3466 ASSERT(pp->p_szc == szc); 3467 page_sub(&repl_pplist, pp); 3468 3469 ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages)); 3470 3471 /* relink replacement page */ 3472 page_list_concat(&tmp_pplist, &pp); 3473 while (--ppages != 0) { 3474 VM_STAT_ADD(segvnvmstats.fill_vp_pages[22]); 3475 pp++; 3476 ASSERT(PAGE_EXCL(pp)); 3477 ASSERT(pp->p_szc == szc); 3478 page_list_concat(&tmp_pplist, &pp); 3479 } 3480 } 3481 if (tmp_pplist != NULL) { 3482 VM_STAT_ADD(segvnvmstats.fill_vp_pages[23]); 3483 page_list_concat(&tmp_pplist, &pplist); 3484 pplist = tmp_pplist; 3485 } 3486 /* 3487 * at this point all pages are either on done_pplist or 3488 * pplist. They can't be all on done_pplist otherwise 3489 * we'd've been done. 3490 */ 3491 ASSERT(pplist != NULL); 3492 if (nios != 0) { 3493 VM_STAT_ADD(segvnvmstats.fill_vp_pages[24]); 3494 pp = pplist; 3495 do { 3496 VM_STAT_ADD(segvnvmstats.fill_vp_pages[25]); 3497 ASSERT(pp->p_szc == szc); 3498 ASSERT(PAGE_EXCL(pp)); 3499 ASSERT(pp->p_vnode != vp); 3500 pp->p_szc = 0; 3501 } while ((pp = pp->p_next) != pplist); 3502 3503 pp = done_pplist; 3504 do { 3505 VM_STAT_ADD(segvnvmstats.fill_vp_pages[26]); 3506 ASSERT(pp->p_szc == szc); 3507 ASSERT(PAGE_EXCL(pp)); 3508 ASSERT(pp->p_vnode == vp); 3509 pp->p_szc = 0; 3510 } while ((pp = pp->p_next) != done_pplist); 3511 3512 while (pplist != NULL) { 3513 VM_STAT_ADD(segvnvmstats.fill_vp_pages[27]); 3514 pp = pplist; 3515 page_sub(&pplist, pp); 3516 page_free(pp, 0); 3517 } 3518 3519 while (done_pplist != NULL) { 3520 VM_STAT_ADD(segvnvmstats.fill_vp_pages[28]); 3521 pp = done_pplist; 3522 page_sub(&done_pplist, pp); 3523 page_unlock(pp); 3524 } 3525 *ppplist = NULL; 3526 return (0); 3527 } 3528 ASSERT(pplist == *ppplist); 3529 if (io_err) { 3530 VM_STAT_ADD(segvnvmstats.fill_vp_pages[29]); 3531 /* 3532 * don't downsize on io error. 3533 * see if vop_getpage succeeds. 3534 * pplist may still be used in this case 3535 * for relocations. 
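 * Returning 0 with *ppplist left intact lets the caller fall back
 * to VOP_GETPAGE(); only the no-I/O failure path below hands the
 * preallocated replacement page back to the allocator.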
3536 */ 3537 return (0); 3538 } 3539 VM_STAT_ADD(segvnvmstats.fill_vp_pages[30]); 3540 page_free_replacement_page(pplist); 3541 page_create_putback(pages); 3542 *ppplist = NULL; 3543 return (0); 3544 } 3545 3546 int segvn_anypgsz = 0; 3547 3548 #define SEGVN_RESTORE_SOFTLOCK(type, pages) \ 3549 if ((type) == F_SOFTLOCK) { \ 3550 mutex_enter(&freemem_lock); \ 3551 availrmem += (pages); \ 3552 segvn_pages_locked -= (pages); \ 3553 svd->softlockcnt -= (pages); \ 3554 mutex_exit(&freemem_lock); \ 3555 } 3556 3557 #define SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot) \ 3558 if (IS_VMODSORT((ppa)[0]->p_vnode)) { \ 3559 if ((rw) == S_WRITE) { \ 3560 for (i = 0; i < (pages); i++) { \ 3561 ASSERT((ppa)[i]->p_vnode == \ 3562 (ppa)[0]->p_vnode); \ 3563 hat_setmod((ppa)[i]); \ 3564 } \ 3565 } else if ((rw) != S_OTHER && \ 3566 ((prot) & (vpprot) & PROT_WRITE)) { \ 3567 for (i = 0; i < (pages); i++) { \ 3568 ASSERT((ppa)[i]->p_vnode == \ 3569 (ppa)[0]->p_vnode); \ 3570 if (!hat_ismod((ppa)[i])) { \ 3571 prot &= ~PROT_WRITE; \ 3572 break; \ 3573 } \ 3574 } \ 3575 } \ 3576 } 3577 3578 #ifdef VM_STATS 3579 3580 #define SEGVN_VMSTAT_FLTVNPAGES(idx) \ 3581 VM_STAT_ADD(segvnvmstats.fltvnpages[(idx)]); 3582 3583 #else /* VM_STATS */ 3584 3585 #define SEGVN_VMSTAT_FLTVNPAGES(idx) 3586 3587 #endif 3588 3589 static faultcode_t 3590 segvn_fault_vnodepages(struct hat *hat, struct seg *seg, caddr_t lpgaddr, 3591 caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr, 3592 caddr_t eaddr, int brkcow) 3593 { 3594 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 3595 struct anon_map *amp = svd->amp; 3596 uchar_t segtype = svd->type; 3597 uint_t szc = seg->s_szc; 3598 size_t pgsz = page_get_pagesize(szc); 3599 size_t maxpgsz = pgsz; 3600 pgcnt_t pages = btop(pgsz); 3601 pgcnt_t maxpages = pages; 3602 size_t ppasize = (pages + 1) * sizeof (page_t *); 3603 caddr_t a = lpgaddr; 3604 caddr_t maxlpgeaddr = lpgeaddr; 3605 u_offset_t off = svd->offset + (uintptr_t)(a - seg->s_base); 3606 ulong_t aindx = svd->anon_index + seg_page(seg, a); 3607 struct vpage *vpage = (svd->vpage != NULL) ? 3608 &svd->vpage[seg_page(seg, a)] : NULL; 3609 vnode_t *vp = svd->vp; 3610 page_t **ppa; 3611 uint_t pszc; 3612 size_t ppgsz; 3613 pgcnt_t ppages; 3614 faultcode_t err = 0; 3615 int ierr; 3616 int vop_size_err = 0; 3617 uint_t protchk, prot, vpprot; 3618 ulong_t i; 3619 int hat_flag = (type == F_SOFTLOCK) ? 
HAT_LOAD_LOCK : HAT_LOAD; 3620 anon_sync_obj_t an_cookie; 3621 enum seg_rw arw; 3622 int alloc_failed = 0; 3623 int adjszc_chk; 3624 struct vattr va; 3625 int xhat = 0; 3626 page_t *pplist; 3627 pfn_t pfn; 3628 int physcontig; 3629 int upgrdfail; 3630 int segvn_anypgsz_vnode = 0; /* for now map vnode with 2 page sizes */ 3631 int tron = (svd->tr_state == SEGVN_TR_ON); 3632 3633 ASSERT(szc != 0); 3634 ASSERT(vp != NULL); 3635 ASSERT(brkcow == 0 || amp != NULL); 3636 ASSERT(tron == 0 || amp != NULL); 3637 ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */ 3638 ASSERT(!(svd->flags & MAP_NORESERVE)); 3639 ASSERT(type != F_SOFTUNLOCK); 3640 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 3641 ASSERT(amp == NULL || IS_P2ALIGNED(aindx, maxpages)); 3642 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 3643 ASSERT(seg->s_szc < NBBY * sizeof (int)); 3644 ASSERT(type != F_SOFTLOCK || lpgeaddr - a == maxpgsz); 3645 3646 VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltvnpages[0]); 3647 VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltvnpages[1]); 3648 3649 if (svd->flags & MAP_TEXT) { 3650 hat_flag |= HAT_LOAD_TEXT; 3651 } 3652 3653 if (svd->pageprot) { 3654 switch (rw) { 3655 case S_READ: 3656 protchk = PROT_READ; 3657 break; 3658 case S_WRITE: 3659 protchk = PROT_WRITE; 3660 break; 3661 case S_EXEC: 3662 protchk = PROT_EXEC; 3663 break; 3664 case S_OTHER: 3665 default: 3666 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 3667 break; 3668 } 3669 } else { 3670 prot = svd->prot; 3671 /* caller has already done segment level protection check. */ 3672 } 3673 3674 if (seg->s_as->a_hat != hat) { 3675 xhat = 1; 3676 } 3677 3678 if (rw == S_WRITE && segtype == MAP_PRIVATE) { 3679 SEGVN_VMSTAT_FLTVNPAGES(2); 3680 arw = S_READ; 3681 } else { 3682 arw = rw; 3683 } 3684 3685 ppa = kmem_alloc(ppasize, KM_SLEEP); 3686 3687 VM_STAT_COND_ADD(amp != NULL, segvnvmstats.fltvnpages[3]); 3688 3689 for (;;) { 3690 adjszc_chk = 0; 3691 for (; a < lpgeaddr; a += pgsz, off += pgsz, aindx += pages) { 3692 if (adjszc_chk) { 3693 while (szc < seg->s_szc) { 3694 uintptr_t e; 3695 uint_t tszc; 3696 tszc = segvn_anypgsz_vnode ? 
szc + 1 : 3697 seg->s_szc; 3698 ppgsz = page_get_pagesize(tszc); 3699 if (!IS_P2ALIGNED(a, ppgsz) || 3700 ((alloc_failed >> tszc) & 3701 0x1)) { 3702 break; 3703 } 3704 SEGVN_VMSTAT_FLTVNPAGES(4); 3705 szc = tszc; 3706 pgsz = ppgsz; 3707 pages = btop(pgsz); 3708 e = P2ROUNDUP((uintptr_t)eaddr, pgsz); 3709 lpgeaddr = (caddr_t)e; 3710 } 3711 } 3712 3713 again: 3714 if (IS_P2ALIGNED(a, maxpgsz) && amp != NULL) { 3715 ASSERT(IS_P2ALIGNED(aindx, maxpages)); 3716 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 3717 anon_array_enter(amp, aindx, &an_cookie); 3718 if (anon_get_ptr(amp->ahp, aindx) != NULL) { 3719 SEGVN_VMSTAT_FLTVNPAGES(5); 3720 ASSERT(anon_pages(amp->ahp, aindx, 3721 maxpages) == maxpages); 3722 anon_array_exit(&an_cookie); 3723 ANON_LOCK_EXIT(&->a_rwlock); 3724 err = segvn_fault_anonpages(hat, seg, 3725 a, a + maxpgsz, type, rw, 3726 MAX(a, addr), 3727 MIN(a + maxpgsz, eaddr), brkcow); 3728 if (err != 0) { 3729 SEGVN_VMSTAT_FLTVNPAGES(6); 3730 goto out; 3731 } 3732 if (szc < seg->s_szc) { 3733 szc = seg->s_szc; 3734 pgsz = maxpgsz; 3735 pages = maxpages; 3736 lpgeaddr = maxlpgeaddr; 3737 } 3738 goto next; 3739 } else { 3740 ASSERT(anon_pages(amp->ahp, aindx, 3741 maxpages) == 0); 3742 SEGVN_VMSTAT_FLTVNPAGES(7); 3743 anon_array_exit(&an_cookie); 3744 ANON_LOCK_EXIT(&->a_rwlock); 3745 } 3746 } 3747 ASSERT(!brkcow || IS_P2ALIGNED(a, maxpgsz)); 3748 ASSERT(!tron || IS_P2ALIGNED(a, maxpgsz)); 3749 3750 if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) { 3751 ASSERT(vpage != NULL); 3752 prot = VPP_PROT(vpage); 3753 ASSERT(sameprot(seg, a, maxpgsz)); 3754 if ((prot & protchk) == 0) { 3755 SEGVN_VMSTAT_FLTVNPAGES(8); 3756 err = FC_PROT; 3757 goto out; 3758 } 3759 } 3760 if (type == F_SOFTLOCK) { 3761 mutex_enter(&freemem_lock); 3762 if (availrmem < tune.t_minarmem + pages) { 3763 mutex_exit(&freemem_lock); 3764 err = FC_MAKE_ERR(ENOMEM); 3765 goto out; 3766 } else { 3767 availrmem -= pages; 3768 segvn_pages_locked += pages; 3769 svd->softlockcnt += pages; 3770 } 3771 mutex_exit(&freemem_lock); 3772 } 3773 3774 pplist = NULL; 3775 physcontig = 0; 3776 ppa[0] = NULL; 3777 if (!brkcow && !tron && szc && 3778 !page_exists_physcontig(vp, off, szc, 3779 segtype == MAP_PRIVATE ? 
ppa : NULL)) { 3780 SEGVN_VMSTAT_FLTVNPAGES(9); 3781 if (page_alloc_pages(vp, seg, a, &pplist, NULL, 3782 szc, 0, 0) && type != F_SOFTLOCK) { 3783 SEGVN_VMSTAT_FLTVNPAGES(10); 3784 pszc = 0; 3785 ierr = -1; 3786 alloc_failed |= (1 << szc); 3787 break; 3788 } 3789 if (pplist != NULL && 3790 vp->v_mpssdata == SEGVN_PAGEIO) { 3791 int downsize; 3792 SEGVN_VMSTAT_FLTVNPAGES(11); 3793 physcontig = segvn_fill_vp_pages(svd, 3794 vp, off, szc, ppa, &pplist, 3795 &pszc, &downsize); 3796 ASSERT(!physcontig || pplist == NULL); 3797 if (!physcontig && downsize && 3798 type != F_SOFTLOCK) { 3799 ASSERT(pplist == NULL); 3800 SEGVN_VMSTAT_FLTVNPAGES(12); 3801 ierr = -1; 3802 break; 3803 } 3804 ASSERT(!physcontig || 3805 segtype == MAP_PRIVATE || 3806 ppa[0] == NULL); 3807 if (physcontig && ppa[0] == NULL) { 3808 physcontig = 0; 3809 } 3810 } 3811 } else if (!brkcow && !tron && szc && ppa[0] != NULL) { 3812 SEGVN_VMSTAT_FLTVNPAGES(13); 3813 ASSERT(segtype == MAP_PRIVATE); 3814 physcontig = 1; 3815 } 3816 3817 if (!physcontig) { 3818 SEGVN_VMSTAT_FLTVNPAGES(14); 3819 ppa[0] = NULL; 3820 ierr = VOP_GETPAGE(vp, (offset_t)off, pgsz, 3821 &vpprot, ppa, pgsz, seg, a, arw, 3822 svd->cred); 3823 #ifdef DEBUG 3824 if (ierr == 0) { 3825 for (i = 0; i < pages; i++) { 3826 ASSERT(PAGE_LOCKED(ppa[i])); 3827 ASSERT(!PP_ISFREE(ppa[i])); 3828 ASSERT(ppa[i]->p_vnode == vp); 3829 ASSERT(ppa[i]->p_offset == 3830 off + (i << PAGESHIFT)); 3831 } 3832 } 3833 #endif /* DEBUG */ 3834 if (segtype == MAP_PRIVATE) { 3835 SEGVN_VMSTAT_FLTVNPAGES(15); 3836 vpprot &= ~PROT_WRITE; 3837 } 3838 } else { 3839 ASSERT(segtype == MAP_PRIVATE); 3840 SEGVN_VMSTAT_FLTVNPAGES(16); 3841 vpprot = PROT_ALL & ~PROT_WRITE; 3842 ierr = 0; 3843 } 3844 3845 if (ierr != 0) { 3846 SEGVN_VMSTAT_FLTVNPAGES(17); 3847 if (pplist != NULL) { 3848 SEGVN_VMSTAT_FLTVNPAGES(18); 3849 page_free_replacement_page(pplist); 3850 page_create_putback(pages); 3851 } 3852 SEGVN_RESTORE_SOFTLOCK(type, pages); 3853 if (a + pgsz <= eaddr) { 3854 SEGVN_VMSTAT_FLTVNPAGES(19); 3855 err = FC_MAKE_ERR(ierr); 3856 goto out; 3857 } 3858 va.va_mask = AT_SIZE; 3859 if (VOP_GETATTR(vp, &va, 0, svd->cred) != 0) { 3860 SEGVN_VMSTAT_FLTVNPAGES(20); 3861 err = FC_MAKE_ERR(EIO); 3862 goto out; 3863 } 3864 if (btopr(va.va_size) >= btopr(off + pgsz)) { 3865 SEGVN_VMSTAT_FLTVNPAGES(21); 3866 err = FC_MAKE_ERR(ierr); 3867 goto out; 3868 } 3869 if (btopr(va.va_size) < 3870 btopr(off + (eaddr - a))) { 3871 SEGVN_VMSTAT_FLTVNPAGES(22); 3872 err = FC_MAKE_ERR(ierr); 3873 goto out; 3874 } 3875 if (brkcow || tron || type == F_SOFTLOCK) { 3876 /* can't reduce map area */ 3877 SEGVN_VMSTAT_FLTVNPAGES(23); 3878 vop_size_err = 1; 3879 goto out; 3880 } 3881 SEGVN_VMSTAT_FLTVNPAGES(24); 3882 ASSERT(szc != 0); 3883 pszc = 0; 3884 ierr = -1; 3885 break; 3886 } 3887 3888 if (amp != NULL) { 3889 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 3890 anon_array_enter(amp, aindx, &an_cookie); 3891 } 3892 if (amp != NULL && 3893 anon_get_ptr(amp->ahp, aindx) != NULL) { 3894 ulong_t taindx = P2ALIGN(aindx, maxpages); 3895 3896 SEGVN_VMSTAT_FLTVNPAGES(25); 3897 ASSERT(anon_pages(amp->ahp, taindx, 3898 maxpages) == maxpages); 3899 for (i = 0; i < pages; i++) { 3900 page_unlock(ppa[i]); 3901 } 3902 anon_array_exit(&an_cookie); 3903 ANON_LOCK_EXIT(&->a_rwlock); 3904 if (pplist != NULL) { 3905 page_free_replacement_page(pplist); 3906 page_create_putback(pages); 3907 } 3908 SEGVN_RESTORE_SOFTLOCK(type, pages); 3909 if (szc < seg->s_szc) { 3910 SEGVN_VMSTAT_FLTVNPAGES(26); 3911 /* 3912 * For private segments SOFTLOCK 3913 * either 
always breaks cow (any rw
3914 * type except S_READ_NOCOW) or
3915 * address space is locked as writer
3916 * (S_READ_NOCOW case) and anon slots
3917 * can't show up on second check.
3918 * Therefore if we are here for
3919 * SOFTLOCK case it must be a cow
3920 * break but cow break never reduces
3921 * szc. Text replication (tron) in
3922 * this case works as cow break.
3923 * Thus the assert below.
3924 */
3925 ASSERT(!brkcow && !tron &&
3926 type != F_SOFTLOCK);
3927 pszc = seg->s_szc;
3928 ierr = -2;
3929 break;
3930 }
3931 ASSERT(IS_P2ALIGNED(a, maxpgsz));
3932 goto again;
3933 }
3934 #ifdef DEBUG
3935 if (amp != NULL) {
3936 ulong_t taindx = P2ALIGN(aindx, maxpages);
3937 ASSERT(!anon_pages(amp->ahp, taindx, maxpages));
3938 }
3939 #endif /* DEBUG */
3940
3941 if (brkcow || tron) {
3942 ASSERT(amp != NULL);
3943 ASSERT(pplist == NULL);
3944 ASSERT(szc == seg->s_szc);
3945 ASSERT(IS_P2ALIGNED(a, maxpgsz));
3946 ASSERT(IS_P2ALIGNED(aindx, maxpages));
3947 SEGVN_VMSTAT_FLTVNPAGES(27);
3948 ierr = anon_map_privatepages(amp, aindx, szc,
3949 seg, a, prot, ppa, vpage, segvn_anypgsz,
3950 tron ? PG_LOCAL : 0, svd->cred);
3951 if (ierr != 0) {
3952 SEGVN_VMSTAT_FLTVNPAGES(28);
3953 anon_array_exit(&an_cookie);
3954 ANON_LOCK_EXIT(&amp->a_rwlock);
3955 SEGVN_RESTORE_SOFTLOCK(type, pages);
3956 err = FC_MAKE_ERR(ierr);
3957 goto out;
3958 }
3959
3960 ASSERT(!IS_VMODSORT(ppa[0]->p_vnode));
3961 /*
3962 * p_szc can't be changed for locked
3963 * swapfs pages.
3964 */
3965 hat_memload_array(hat, a, pgsz, ppa, prot,
3966 hat_flag);
3967
3968 if (!(hat_flag & HAT_LOAD_LOCK)) {
3969 SEGVN_VMSTAT_FLTVNPAGES(29);
3970 for (i = 0; i < pages; i++) {
3971 page_unlock(ppa[i]);
3972 }
3973 }
3974 anon_array_exit(&an_cookie);
3975 ANON_LOCK_EXIT(&amp->a_rwlock);
3976 goto next;
3977 }
3978
3979 pfn = page_pptonum(ppa[0]);
3980 /*
3981 * hat_page_demote() needs an EXCL lock on one of
3982 * constituent page_t's and it decreases root's p_szc
3983 * last. This means if root's p_szc is equal to szc and
3984 * all its constituent pages are locked
3985 * hat_page_demote() that could have changed p_szc to
3986 * szc is already done and no new hat_page_demote()
3987 * can start for this large page.
3988 */
3989
3990 /*
3991 * we need to make sure the same mapping size is used for
3992 * the same address range if there's a possibility the
3993 * address is already mapped because the hat layer panics
3994 * when a translation is loaded for a range already
3995 * mapped with a different page size. We achieve it
3996 * by always using the largest page size possible subject
3997 * to the constraints of page size, segment page size
3998 * and page alignment. Since mappings are invalidated
3999 * when those constraints change and make it
4000 * impossible to use a previously used mapping size, no
4001 * mapping size conflicts should happen.
4002 */
4003
4004 chkszc:
4005 if ((pszc = ppa[0]->p_szc) == szc &&
4006 IS_P2ALIGNED(pfn, pages)) {
4007
4008 SEGVN_VMSTAT_FLTVNPAGES(30);
4009 #ifdef DEBUG
4010 for (i = 0; i < pages; i++) {
4011 ASSERT(PAGE_LOCKED(ppa[i]));
4012 ASSERT(!PP_ISFREE(ppa[i]));
4013 ASSERT(page_pptonum(ppa[i]) ==
4014 pfn + i);
4015 ASSERT(ppa[i]->p_szc == szc);
4016 ASSERT(ppa[i]->p_vnode == vp);
4017 ASSERT(ppa[i]->p_offset ==
4018 off + (i << PAGESHIFT));
4019 }
4020 #endif /* DEBUG */
4021 /*
4022 * All pages are of the szc we need and they are
4023 * all locked so they can't change szc. Load
4024 * translations.
4025 *
4026 * if page got promoted since last check
4027 * we don't need pplist.
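 * (If pplist was preallocated it is released just below via
 * page_free_replacement_page()/page_create_putback().)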
4028 */ 4029 if (pplist != NULL) { 4030 page_free_replacement_page(pplist); 4031 page_create_putback(pages); 4032 } 4033 if (PP_ISMIGRATE(ppa[0])) { 4034 page_migrate(seg, a, ppa, pages); 4035 } 4036 SEGVN_UPDATE_MODBITS(ppa, pages, rw, 4037 prot, vpprot); 4038 if (!xhat) { 4039 hat_memload_array(hat, a, pgsz, ppa, 4040 prot & vpprot, hat_flag); 4041 } else { 4042 /* 4043 * avoid large xhat mappings to FS 4044 * pages so that hat_page_demote() 4045 * doesn't need to check for xhat 4046 * large mappings. 4047 */ 4048 for (i = 0; i < pages; i++) { 4049 hat_memload(hat, 4050 a + (i << PAGESHIFT), 4051 ppa[i], prot & vpprot, 4052 hat_flag); 4053 } 4054 } 4055 4056 if (!(hat_flag & HAT_LOAD_LOCK)) { 4057 for (i = 0; i < pages; i++) { 4058 page_unlock(ppa[i]); 4059 } 4060 } 4061 if (amp != NULL) { 4062 anon_array_exit(&an_cookie); 4063 ANON_LOCK_EXIT(&->a_rwlock); 4064 } 4065 goto next; 4066 } 4067 4068 /* 4069 * See if upsize is possible. 4070 */ 4071 if (pszc > szc && szc < seg->s_szc && 4072 (segvn_anypgsz_vnode || pszc >= seg->s_szc)) { 4073 pgcnt_t aphase; 4074 uint_t pszc1 = MIN(pszc, seg->s_szc); 4075 ppgsz = page_get_pagesize(pszc1); 4076 ppages = btop(ppgsz); 4077 aphase = btop(P2PHASE((uintptr_t)a, ppgsz)); 4078 4079 ASSERT(type != F_SOFTLOCK); 4080 4081 SEGVN_VMSTAT_FLTVNPAGES(31); 4082 if (aphase != P2PHASE(pfn, ppages)) { 4083 segvn_faultvnmpss_align_err4++; 4084 } else { 4085 SEGVN_VMSTAT_FLTVNPAGES(32); 4086 if (pplist != NULL) { 4087 page_t *pl = pplist; 4088 page_free_replacement_page(pl); 4089 page_create_putback(pages); 4090 } 4091 for (i = 0; i < pages; i++) { 4092 page_unlock(ppa[i]); 4093 } 4094 if (amp != NULL) { 4095 anon_array_exit(&an_cookie); 4096 ANON_LOCK_EXIT(&->a_rwlock); 4097 } 4098 pszc = pszc1; 4099 ierr = -2; 4100 break; 4101 } 4102 } 4103 4104 /* 4105 * check if we should use smallest mapping size. 4106 */ 4107 upgrdfail = 0; 4108 if (szc == 0 || xhat || 4109 (pszc >= szc && 4110 !IS_P2ALIGNED(pfn, pages)) || 4111 (pszc < szc && 4112 !segvn_full_szcpages(ppa, szc, &upgrdfail, 4113 &pszc))) { 4114 4115 if (upgrdfail && type != F_SOFTLOCK) { 4116 /* 4117 * segvn_full_szcpages failed to lock 4118 * all pages EXCL. Size down. 4119 */ 4120 ASSERT(pszc < szc); 4121 4122 SEGVN_VMSTAT_FLTVNPAGES(33); 4123 4124 if (pplist != NULL) { 4125 page_t *pl = pplist; 4126 page_free_replacement_page(pl); 4127 page_create_putback(pages); 4128 } 4129 4130 for (i = 0; i < pages; i++) { 4131 page_unlock(ppa[i]); 4132 } 4133 if (amp != NULL) { 4134 anon_array_exit(&an_cookie); 4135 ANON_LOCK_EXIT(&->a_rwlock); 4136 } 4137 ierr = -1; 4138 break; 4139 } 4140 if (szc != 0 && !xhat && !upgrdfail) { 4141 segvn_faultvnmpss_align_err5++; 4142 } 4143 SEGVN_VMSTAT_FLTVNPAGES(34); 4144 if (pplist != NULL) { 4145 page_free_replacement_page(pplist); 4146 page_create_putback(pages); 4147 } 4148 SEGVN_UPDATE_MODBITS(ppa, pages, rw, 4149 prot, vpprot); 4150 if (upgrdfail && segvn_anypgsz_vnode) { 4151 /* SOFTLOCK case */ 4152 hat_memload_array(hat, a, pgsz, 4153 ppa, prot & vpprot, hat_flag); 4154 } else { 4155 for (i = 0; i < pages; i++) { 4156 hat_memload(hat, 4157 a + (i << PAGESHIFT), 4158 ppa[i], prot & vpprot, 4159 hat_flag); 4160 } 4161 } 4162 if (!(hat_flag & HAT_LOAD_LOCK)) { 4163 for (i = 0; i < pages; i++) { 4164 page_unlock(ppa[i]); 4165 } 4166 } 4167 if (amp != NULL) { 4168 anon_array_exit(&an_cookie); 4169 ANON_LOCK_EXIT(&->a_rwlock); 4170 } 4171 goto next; 4172 } 4173 4174 if (pszc == szc) { 4175 /* 4176 * segvn_full_szcpages() upgraded pages szc. 
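 * Jump back to chkszc to redo the checks with the upgraded p_szc.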
4177 */
4178 ASSERT(pszc == ppa[0]->p_szc);
4179 ASSERT(IS_P2ALIGNED(pfn, pages));
4180 goto chkszc;
4181 }
4182
4183 if (pszc > szc) {
4184 kmutex_t *szcmtx;
4185 SEGVN_VMSTAT_FLTVNPAGES(35);
4186 /*
4187 * p_szc of ppa[0] can change since we haven't
4188 * locked all constituent pages. Call
4189 * page_lock_szc() to prevent szc changes.
4190 * This should be a rare case that happens when
4191 * multiple segments use a different page size
4192 * to map the same file offsets.
4193 */
4194 szcmtx = page_szc_lock(ppa[0]);
4195 pszc = ppa[0]->p_szc;
4196 ASSERT(szcmtx != NULL || pszc == 0);
4197 ASSERT(ppa[0]->p_szc <= pszc);
4198 if (pszc <= szc) {
4199 SEGVN_VMSTAT_FLTVNPAGES(36);
4200 if (szcmtx != NULL) {
4201 mutex_exit(szcmtx);
4202 }
4203 goto chkszc;
4204 }
4205 if (pplist != NULL) {
4206 /*
4207 * page got promoted since last check.
4208 * we don't need the preallocated large
4209 * page.
4210 */
4211 SEGVN_VMSTAT_FLTVNPAGES(37);
4212 page_free_replacement_page(pplist);
4213 page_create_putback(pages);
4214 }
4215 SEGVN_UPDATE_MODBITS(ppa, pages, rw,
4216 prot, vpprot);
4217 hat_memload_array(hat, a, pgsz, ppa,
4218 prot & vpprot, hat_flag);
4219 mutex_exit(szcmtx);
4220 if (!(hat_flag & HAT_LOAD_LOCK)) {
4221 for (i = 0; i < pages; i++) {
4222 page_unlock(ppa[i]);
4223 }
4224 }
4225 if (amp != NULL) {
4226 anon_array_exit(&an_cookie);
4227 ANON_LOCK_EXIT(&amp->a_rwlock);
4228 }
4229 goto next;
4230 }
4231
4232 /*
4233 * if the page got demoted since the last check
4234 * we may not have allocated a larger page.
4235 * allocate now.
4236 */
4237 if (pplist == NULL &&
4238 page_alloc_pages(vp, seg, a, &pplist, NULL,
4239 szc, 0, 0) && type != F_SOFTLOCK) {
4240 SEGVN_VMSTAT_FLTVNPAGES(38);
4241 for (i = 0; i < pages; i++) {
4242 page_unlock(ppa[i]);
4243 }
4244 if (amp != NULL) {
4245 anon_array_exit(&an_cookie);
4246 ANON_LOCK_EXIT(&amp->a_rwlock);
4247 }
4248 ierr = -1;
4249 alloc_failed |= (1 << szc);
4250 break;
4251 }
4252
4253 SEGVN_VMSTAT_FLTVNPAGES(39);
4254
4255 if (pplist != NULL) {
4256 segvn_relocate_pages(ppa, pplist);
4257 #ifdef DEBUG
4258 } else {
4259 ASSERT(type == F_SOFTLOCK);
4260 SEGVN_VMSTAT_FLTVNPAGES(40);
4261 #endif /* DEBUG */
4262 }
4263
4264 SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot);
4265
4266 if (pplist == NULL && segvn_anypgsz_vnode == 0) {
4267 ASSERT(type == F_SOFTLOCK);
4268 for (i = 0; i < pages; i++) {
4269 ASSERT(ppa[i]->p_szc < szc);
4270 hat_memload(hat, a + (i << PAGESHIFT),
4271 ppa[i], prot & vpprot, hat_flag);
4272 }
4273 } else {
4274 ASSERT(pplist != NULL || type == F_SOFTLOCK);
4275 hat_memload_array(hat, a, pgsz, ppa,
4276 prot & vpprot, hat_flag);
4277 }
4278 if (!(hat_flag & HAT_LOAD_LOCK)) {
4279 for (i = 0; i < pages; i++) {
4280 ASSERT(PAGE_SHARED(ppa[i]));
4281 page_unlock(ppa[i]);
4282 }
4283 }
4284 if (amp != NULL) {
4285 anon_array_exit(&an_cookie);
4286 ANON_LOCK_EXIT(&amp->a_rwlock);
4287 }
4288
4289 next:
4290 if (vpage != NULL) {
4291 vpage += pages;
4292 }
4293 adjszc_chk = 1;
4294 }
4295 if (a == lpgeaddr)
4296 break;
4297 ASSERT(a < lpgeaddr);
4298
4299 ASSERT(!brkcow && !tron && type != F_SOFTLOCK);
4300
4301 /*
4302 * ierr == -1 means we failed to map with a large page
4303 * (either due to allocation/relocation failures or
4304 * misalignment with other mappings to this file).
4305 *
4306 * ierr == -2 means some other thread allocated a large page
4307 * after we gave up trying to map with a large page. Retry with a
4308 * larger mapping.
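 * (Below, szc is raised to pszc for ierr == -2, and lowered one step,
 * or to 0 when segvn_anypgsz_vnode is not set, for ierr == -1.)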
4309 */ 4310 ASSERT(ierr == -1 || ierr == -2); 4311 ASSERT(ierr == -2 || szc != 0); 4312 ASSERT(ierr == -1 || szc < seg->s_szc); 4313 if (ierr == -2) { 4314 SEGVN_VMSTAT_FLTVNPAGES(41); 4315 ASSERT(pszc > szc && pszc <= seg->s_szc); 4316 szc = pszc; 4317 } else if (segvn_anypgsz_vnode) { 4318 SEGVN_VMSTAT_FLTVNPAGES(42); 4319 szc--; 4320 } else { 4321 SEGVN_VMSTAT_FLTVNPAGES(43); 4322 ASSERT(pszc < szc); 4323 /* 4324 * other process created pszc large page. 4325 * but we still have to drop to 0 szc. 4326 */ 4327 szc = 0; 4328 } 4329 4330 pgsz = page_get_pagesize(szc); 4331 pages = btop(pgsz); 4332 if (ierr == -2) { 4333 /* 4334 * Size up case. Note lpgaddr may only be needed for 4335 * softlock case so we don't adjust it here. 4336 */ 4337 a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz); 4338 ASSERT(a >= lpgaddr); 4339 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4340 off = svd->offset + (uintptr_t)(a - seg->s_base); 4341 aindx = svd->anon_index + seg_page(seg, a); 4342 vpage = (svd->vpage != NULL) ? 4343 &svd->vpage[seg_page(seg, a)] : NULL; 4344 } else { 4345 /* 4346 * Size down case. Note lpgaddr may only be needed for 4347 * softlock case so we don't adjust it here. 4348 */ 4349 ASSERT(IS_P2ALIGNED(a, pgsz)); 4350 ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz)); 4351 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4352 ASSERT(a < lpgeaddr); 4353 if (a < addr) { 4354 SEGVN_VMSTAT_FLTVNPAGES(44); 4355 /* 4356 * The beginning of the large page region can 4357 * be pulled to the right to make a smaller 4358 * region. We haven't yet faulted a single 4359 * page. 4360 */ 4361 a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 4362 ASSERT(a >= lpgaddr); 4363 off = svd->offset + 4364 (uintptr_t)(a - seg->s_base); 4365 aindx = svd->anon_index + seg_page(seg, a); 4366 vpage = (svd->vpage != NULL) ? 4367 &svd->vpage[seg_page(seg, a)] : NULL; 4368 } 4369 } 4370 } 4371 out: 4372 kmem_free(ppa, ppasize); 4373 if (!err && !vop_size_err) { 4374 SEGVN_VMSTAT_FLTVNPAGES(45); 4375 return (0); 4376 } 4377 if (type == F_SOFTLOCK && a > lpgaddr) { 4378 SEGVN_VMSTAT_FLTVNPAGES(46); 4379 segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER); 4380 } 4381 if (!vop_size_err) { 4382 SEGVN_VMSTAT_FLTVNPAGES(47); 4383 return (err); 4384 } 4385 ASSERT(brkcow || tron || type == F_SOFTLOCK); 4386 /* 4387 * Large page end is mapped beyond the end of file and it's a cow 4388 * fault (can be a text replication induced cow) or softlock so we can't 4389 * reduce the map area. For now just demote the segment. This should 4390 * really only happen if the end of the file changed after the mapping 4391 * was established since when large page segments are created we make 4392 * sure they don't extend beyond the end of the file. 4393 */ 4394 SEGVN_VMSTAT_FLTVNPAGES(48); 4395 4396 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4397 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4398 err = 0; 4399 if (seg->s_szc != 0) { 4400 segvn_fltvnpages_clrszc_cnt++; 4401 ASSERT(svd->softlockcnt == 0); 4402 err = segvn_clrszc(seg); 4403 if (err != 0) { 4404 segvn_fltvnpages_clrszc_err++; 4405 } 4406 } 4407 ASSERT(err || seg->s_szc == 0); 4408 SEGVN_LOCK_DOWNGRADE(seg->s_as, &svd->lock); 4409 /* segvn_fault will do its job as if szc had been zero to begin with */ 4410 return (err == 0 ? IE_RETRY : FC_MAKE_ERR(err)); 4411 } 4412 4413 /* 4414 * This routine will attempt to fault in one large page. 4415 * it will use smaller pages if that fails. 4416 * It should only be called for pure anonymous segments. 
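 * The lpgaddr/lpgeaddr arguments bound the large page aligned region
 * that contains the original [addr, eaddr) fault range.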
4417 */ 4418 static faultcode_t 4419 segvn_fault_anonpages(struct hat *hat, struct seg *seg, caddr_t lpgaddr, 4420 caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr, 4421 caddr_t eaddr, int brkcow) 4422 { 4423 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 4424 struct anon_map *amp = svd->amp; 4425 uchar_t segtype = svd->type; 4426 uint_t szc = seg->s_szc; 4427 size_t pgsz = page_get_pagesize(szc); 4428 size_t maxpgsz = pgsz; 4429 pgcnt_t pages = btop(pgsz); 4430 size_t ppasize = pages * sizeof (page_t *); 4431 caddr_t a = lpgaddr; 4432 ulong_t aindx = svd->anon_index + seg_page(seg, a); 4433 struct vpage *vpage = (svd->vpage != NULL) ? 4434 &svd->vpage[seg_page(seg, a)] : NULL; 4435 page_t **ppa; 4436 uint_t ppa_szc; 4437 faultcode_t err; 4438 int ierr; 4439 uint_t protchk, prot, vpprot; 4440 ulong_t i; 4441 int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD; 4442 anon_sync_obj_t cookie; 4443 int first = 1; 4444 int adjszc_chk; 4445 int purged = 0; 4446 int pgflags = (svd->tr_state == SEGVN_TR_ON) ? PG_LOCAL : 0; 4447 4448 ASSERT(szc != 0); 4449 ASSERT(amp != NULL); 4450 ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */ 4451 ASSERT(!(svd->flags & MAP_NORESERVE)); 4452 ASSERT(type != F_SOFTUNLOCK); 4453 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 4454 ASSERT(!brkcow || svd->tr_state == SEGVN_TR_OFF); 4455 4456 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 4457 4458 VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltanpages[0]); 4459 VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltanpages[1]); 4460 4461 if (svd->flags & MAP_TEXT) { 4462 hat_flag |= HAT_LOAD_TEXT; 4463 } 4464 4465 if (svd->pageprot) { 4466 switch (rw) { 4467 case S_READ: 4468 protchk = PROT_READ; 4469 break; 4470 case S_WRITE: 4471 protchk = PROT_WRITE; 4472 break; 4473 case S_EXEC: 4474 protchk = PROT_EXEC; 4475 break; 4476 case S_OTHER: 4477 default: 4478 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 4479 break; 4480 } 4481 VM_STAT_ADD(segvnvmstats.fltanpages[2]); 4482 } else { 4483 prot = svd->prot; 4484 /* caller has already done segment level protection check. 
*/ 4485 } 4486 4487 ppa = kmem_alloc(ppasize, KM_SLEEP); 4488 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4489 for (;;) { 4490 adjszc_chk = 0; 4491 for (; a < lpgeaddr; a += pgsz, aindx += pages) { 4492 if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) { 4493 VM_STAT_ADD(segvnvmstats.fltanpages[3]); 4494 ASSERT(vpage != NULL); 4495 prot = VPP_PROT(vpage); 4496 ASSERT(sameprot(seg, a, maxpgsz)); 4497 if ((prot & protchk) == 0) { 4498 err = FC_PROT; 4499 goto error; 4500 } 4501 } 4502 if (adjszc_chk && IS_P2ALIGNED(a, maxpgsz) && 4503 pgsz < maxpgsz) { 4504 ASSERT(a > lpgaddr); 4505 szc = seg->s_szc; 4506 pgsz = maxpgsz; 4507 pages = btop(pgsz); 4508 ASSERT(IS_P2ALIGNED(aindx, pages)); 4509 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, 4510 pgsz); 4511 } 4512 if (type == F_SOFTLOCK && svd->vp != NULL) { 4513 mutex_enter(&freemem_lock); 4514 if (availrmem < tune.t_minarmem + pages) { 4515 mutex_exit(&freemem_lock); 4516 err = FC_MAKE_ERR(ENOMEM); 4517 goto error; 4518 } else { 4519 availrmem -= pages; 4520 segvn_pages_locked += pages; 4521 svd->softlockcnt += pages; 4522 } 4523 mutex_exit(&freemem_lock); 4524 } 4525 anon_array_enter(amp, aindx, &cookie); 4526 ppa_szc = (uint_t)-1; 4527 ierr = anon_map_getpages(amp, aindx, szc, seg, a, 4528 prot, &vpprot, ppa, &ppa_szc, vpage, rw, brkcow, 4529 segvn_anypgsz, pgflags, svd->cred); 4530 if (ierr != 0) { 4531 anon_array_exit(&cookie); 4532 VM_STAT_ADD(segvnvmstats.fltanpages[4]); 4533 if (type == F_SOFTLOCK && svd->vp != NULL) { 4534 VM_STAT_ADD(segvnvmstats.fltanpages[5]); 4535 mutex_enter(&freemem_lock); 4536 availrmem += pages; 4537 segvn_pages_locked -= pages; 4538 svd->softlockcnt -= pages; 4539 mutex_exit(&freemem_lock); 4540 } 4541 if (ierr > 0) { 4542 VM_STAT_ADD(segvnvmstats.fltanpages[6]); 4543 err = FC_MAKE_ERR(ierr); 4544 goto error; 4545 } 4546 break; 4547 } 4548 4549 ASSERT(!IS_VMODSORT(ppa[0]->p_vnode)); 4550 4551 ASSERT(segtype == MAP_SHARED || 4552 ppa[0]->p_szc <= szc); 4553 ASSERT(segtype == MAP_PRIVATE || 4554 ppa[0]->p_szc >= szc); 4555 4556 /* 4557 * Handle pages that have been marked for migration 4558 */ 4559 if (lgrp_optimizations()) 4560 page_migrate(seg, a, ppa, pages); 4561 4562 if (type == F_SOFTLOCK && svd->vp == NULL) { 4563 /* 4564 * All pages in ppa array belong to the same 4565 * large page. This means it's ok to call 4566 * segvn_pp_lock_anonpages just for ppa[0]. 4567 */ 4568 if (!segvn_pp_lock_anonpages(ppa[0], first)) { 4569 for (i = 0; i < pages; i++) { 4570 page_unlock(ppa[i]); 4571 } 4572 err = FC_MAKE_ERR(ENOMEM); 4573 goto error; 4574 } 4575 first = 0; 4576 mutex_enter(&freemem_lock); 4577 svd->softlockcnt += pages; 4578 segvn_pages_locked += pages; 4579 mutex_exit(&freemem_lock); 4580 } 4581 4582 if (segtype == MAP_SHARED) { 4583 vpprot |= PROT_WRITE; 4584 } 4585 4586 hat_memload_array(hat, a, pgsz, ppa, 4587 prot & vpprot, hat_flag); 4588 4589 if (hat_flag & HAT_LOAD_LOCK) { 4590 VM_STAT_ADD(segvnvmstats.fltanpages[7]); 4591 } else { 4592 VM_STAT_ADD(segvnvmstats.fltanpages[8]); 4593 for (i = 0; i < pages; i++) 4594 page_unlock(ppa[i]); 4595 } 4596 if (vpage != NULL) 4597 vpage += pages; 4598 4599 anon_array_exit(&cookie); 4600 adjszc_chk = 1; 4601 } 4602 if (a == lpgeaddr) 4603 break; 4604 ASSERT(a < lpgeaddr); 4605 /* 4606 * ierr == -1 means we failed to allocate a large page. 4607 * so do a size down operation. 4608 * 4609 * ierr == -2 means some other process that privately shares 4610 * pages with this process has allocated a larger page and we 4611 * need to retry with larger pages. 
So do a size up 4612 * operation. This relies on the fact that large pages are 4613 * never partially shared i.e. if we share any constituent 4614 * page of a large page with another process we must share the 4615 * entire large page. Note this cannot happen for SOFTLOCK 4616 * case, unless current address (a) is at the beginning of the 4617 * next page size boundary because the other process couldn't 4618 * have relocated locked pages. 4619 */ 4620 ASSERT(ierr == -1 || ierr == -2); 4621 /* 4622 * For the very first relocation failure try to purge this 4623 * segment's cache so that the relocator can obtain an 4624 * exclusive lock on pages we want to relocate. 4625 */ 4626 if (!purged && ierr == -1 && ppa_szc != (uint_t)-1 && 4627 svd->softlockcnt != 0) { 4628 purged = 1; 4629 segvn_purge(seg); 4630 continue; 4631 } 4632 4633 if (segvn_anypgsz) { 4634 ASSERT(ierr == -2 || szc != 0); 4635 ASSERT(ierr == -1 || szc < seg->s_szc); 4636 szc = (ierr == -1) ? szc - 1 : szc + 1; 4637 } else { 4638 /* 4639 * For non COW faults and segvn_anypgsz == 0 4640 * we need to be careful not to loop forever 4641 * if existing page is found with szc other 4642 * than 0 or seg->s_szc. This could be due 4643 * to page relocations on behalf of DR or 4644 * more likely large page creation. For this 4645 * case simply re-size to existing page's szc 4646 * if returned by anon_map_getpages(). 4647 */ 4648 if (ppa_szc == (uint_t)-1) { 4649 szc = (ierr == -1) ? 0 : seg->s_szc; 4650 } else { 4651 ASSERT(ppa_szc <= seg->s_szc); 4652 ASSERT(ierr == -2 || ppa_szc < szc); 4653 ASSERT(ierr == -1 || ppa_szc > szc); 4654 szc = ppa_szc; 4655 } 4656 } 4657 4658 pgsz = page_get_pagesize(szc); 4659 pages = btop(pgsz); 4660 ASSERT(type != F_SOFTLOCK || ierr == -1 || 4661 (IS_P2ALIGNED(a, pgsz) && IS_P2ALIGNED(lpgeaddr, pgsz))); 4662 if (type == F_SOFTLOCK) { 4663 /* 4664 * For softlocks we cannot reduce the fault area 4665 * (calculated based on the largest page size for this 4666 * segment) for size down and a is already next 4667 * page size aligned as assertted above for size 4668 * ups. Therefore just continue in case of softlock. 4669 */ 4670 VM_STAT_ADD(segvnvmstats.fltanpages[9]); 4671 continue; /* keep lint happy */ 4672 } else if (ierr == -2) { 4673 4674 /* 4675 * Size up case. Note lpgaddr may only be needed for 4676 * softlock case so we don't adjust it here. 4677 */ 4678 VM_STAT_ADD(segvnvmstats.fltanpages[10]); 4679 a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz); 4680 ASSERT(a >= lpgaddr); 4681 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4682 aindx = svd->anon_index + seg_page(seg, a); 4683 vpage = (svd->vpage != NULL) ? 4684 &svd->vpage[seg_page(seg, a)] : NULL; 4685 } else { 4686 /* 4687 * Size down case. Note lpgaddr may only be needed for 4688 * softlock case so we don't adjust it here. 4689 */ 4690 VM_STAT_ADD(segvnvmstats.fltanpages[11]); 4691 ASSERT(IS_P2ALIGNED(a, pgsz)); 4692 ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz)); 4693 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4694 ASSERT(a < lpgeaddr); 4695 if (a < addr) { 4696 /* 4697 * The beginning of the large page region can 4698 * be pulled to the right to make a smaller 4699 * region. We haven't yet faulted a single 4700 * page. 4701 */ 4702 VM_STAT_ADD(segvnvmstats.fltanpages[12]); 4703 a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 4704 ASSERT(a >= lpgaddr); 4705 aindx = svd->anon_index + seg_page(seg, a); 4706 vpage = (svd->vpage != NULL) ? 
4707 &svd->vpage[seg_page(seg, a)] : NULL; 4708 } 4709 } 4710 } 4711 VM_STAT_ADD(segvnvmstats.fltanpages[13]); 4712 ANON_LOCK_EXIT(&->a_rwlock); 4713 kmem_free(ppa, ppasize); 4714 return (0); 4715 error: 4716 VM_STAT_ADD(segvnvmstats.fltanpages[14]); 4717 ANON_LOCK_EXIT(&->a_rwlock); 4718 kmem_free(ppa, ppasize); 4719 if (type == F_SOFTLOCK && a > lpgaddr) { 4720 VM_STAT_ADD(segvnvmstats.fltanpages[15]); 4721 segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER); 4722 } 4723 return (err); 4724 } 4725 4726 int fltadvice = 1; /* set to free behind pages for sequential access */ 4727 4728 /* 4729 * This routine is called via a machine specific fault handling routine. 4730 * It is also called by software routines wishing to lock or unlock 4731 * a range of addresses. 4732 * 4733 * Here is the basic algorithm: 4734 * If unlocking 4735 * Call segvn_softunlock 4736 * Return 4737 * endif 4738 * Checking and set up work 4739 * If we will need some non-anonymous pages 4740 * Call VOP_GETPAGE over the range of non-anonymous pages 4741 * endif 4742 * Loop over all addresses requested 4743 * Call segvn_faultpage passing in page list 4744 * to load up translations and handle anonymous pages 4745 * endloop 4746 * Load up translation to any additional pages in page list not 4747 * already handled that fit into this segment 4748 */ 4749 static faultcode_t 4750 segvn_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len, 4751 enum fault_type type, enum seg_rw rw) 4752 { 4753 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 4754 page_t **plp, **ppp, *pp; 4755 u_offset_t off; 4756 caddr_t a; 4757 struct vpage *vpage; 4758 uint_t vpprot, prot; 4759 int err; 4760 page_t *pl[PVN_GETPAGE_NUM + 1]; 4761 size_t plsz, pl_alloc_sz; 4762 size_t page; 4763 ulong_t anon_index; 4764 struct anon_map *amp; 4765 int dogetpage = 0; 4766 caddr_t lpgaddr, lpgeaddr; 4767 size_t pgsz; 4768 anon_sync_obj_t cookie; 4769 int brkcow = BREAK_COW_SHARE(rw, type, svd->type); 4770 4771 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 4772 4773 /* 4774 * First handle the easy stuff 4775 */ 4776 if (type == F_SOFTUNLOCK) { 4777 if (rw == S_READ_NOCOW) { 4778 rw = S_READ; 4779 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 4780 } 4781 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 4782 pgsz = (seg->s_szc == 0) ? 
PAGESIZE : 4783 page_get_pagesize(seg->s_szc); 4784 VM_STAT_COND_ADD(pgsz > PAGESIZE, segvnvmstats.fltanpages[16]); 4785 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 4786 segvn_softunlock(seg, lpgaddr, lpgeaddr - lpgaddr, rw); 4787 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4788 return (0); 4789 } 4790 4791 if (brkcow == 0) { 4792 if (svd->tr_state == SEGVN_TR_INIT) { 4793 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4794 if (svd->tr_state == SEGVN_TR_INIT) { 4795 ASSERT(svd->vp != NULL && svd->amp == NULL); 4796 ASSERT(svd->flags & MAP_TEXT); 4797 ASSERT(svd->type == MAP_PRIVATE); 4798 segvn_textrepl(seg); 4799 ASSERT(svd->tr_state != SEGVN_TR_INIT); 4800 ASSERT(svd->tr_state != SEGVN_TR_ON || 4801 svd->amp != NULL); 4802 } 4803 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4804 } 4805 } else if (svd->tr_state != SEGVN_TR_OFF) { 4806 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4807 if (svd->tr_state == SEGVN_TR_ON) { 4808 ASSERT(svd->vp != NULL && svd->amp != NULL); 4809 segvn_textunrepl(seg, 0); 4810 ASSERT(svd->amp == NULL && 4811 svd->tr_state == SEGVN_TR_OFF); 4812 } else if (svd->tr_state != SEGVN_TR_OFF) { 4813 svd->tr_state = SEGVN_TR_OFF; 4814 } 4815 ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF); 4816 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4817 } 4818 4819 top: 4820 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 4821 4822 /* 4823 * If we have the same protections for the entire segment, 4824 * insure that the access being attempted is legitimate. 4825 */ 4826 4827 if (svd->pageprot == 0) { 4828 uint_t protchk; 4829 4830 switch (rw) { 4831 case S_READ: 4832 case S_READ_NOCOW: 4833 protchk = PROT_READ; 4834 break; 4835 case S_WRITE: 4836 protchk = PROT_WRITE; 4837 break; 4838 case S_EXEC: 4839 protchk = PROT_EXEC; 4840 break; 4841 case S_OTHER: 4842 default: 4843 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 4844 break; 4845 } 4846 4847 if ((svd->prot & protchk) == 0) { 4848 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4849 return (FC_PROT); /* illegal access type */ 4850 } 4851 } 4852 4853 /* 4854 * We can't allow the long term use of softlocks for vmpss segments, 4855 * because in some file truncation cases we should be able to demote 4856 * the segment, which requires that there are no softlocks. The 4857 * only case where it's ok to allow a SOFTLOCK fault against a vmpss 4858 * segment is S_READ_NOCOW, where the caller holds the address space 4859 * locked as writer and calls softunlock before dropping the as lock. 4860 * S_READ_NOCOW is used by /proc to read memory from another user. 4861 * 4862 * Another deadlock between SOFTLOCK and file truncation can happen 4863 * because segvn_fault_vnodepages() calls the FS one pagesize at 4864 * a time. A second VOP_GETPAGE() call by segvn_fault_vnodepages() 4865 * can cause a deadlock because the first set of page_t's remain 4866 * locked SE_SHARED. To avoid this, we demote segments on a first 4867 * SOFTLOCK if they have a length greater than the segment's 4868 * page size. 4869 * 4870 * So for now, we only avoid demoting a segment on a SOFTLOCK when 4871 * the access type is S_READ_NOCOW and the fault length is less than 4872 * or equal to the segment's page size. While this is quite restrictive, 4873 * it should be the most common case of SOFTLOCK against a vmpss 4874 * segment. 4875 * 4876 * For S_READ_NOCOW, it's safe not to do a copy on write because the 4877 * caller makes sure no COW will be caused by another thread for a 4878 * softlocked page. 
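 * The check below therefore demotes the segment for any SOFTLOCK fault
 * unless rw is S_READ_NOCOW and the faulted range fits within a single
 * large page.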
4879 */ 4880 if (type == F_SOFTLOCK && svd->vp != NULL && seg->s_szc != 0) { 4881 int demote = 0; 4882 4883 if (rw != S_READ_NOCOW) { 4884 demote = 1; 4885 } 4886 if (!demote && len > PAGESIZE) { 4887 pgsz = page_get_pagesize(seg->s_szc); 4888 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, 4889 lpgeaddr); 4890 if (lpgeaddr - lpgaddr > pgsz) { 4891 demote = 1; 4892 } 4893 } 4894 4895 ASSERT(demote || AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 4896 4897 if (demote) { 4898 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4899 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4900 if (seg->s_szc != 0) { 4901 segvn_vmpss_clrszc_cnt++; 4902 ASSERT(svd->softlockcnt == 0); 4903 err = segvn_clrszc(seg); 4904 if (err) { 4905 segvn_vmpss_clrszc_err++; 4906 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4907 return (FC_MAKE_ERR(err)); 4908 } 4909 } 4910 ASSERT(seg->s_szc == 0); 4911 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4912 goto top; 4913 } 4914 } 4915 4916 /* 4917 * Check to see if we need to allocate an anon_map structure. 4918 */ 4919 if (svd->amp == NULL && (svd->vp == NULL || brkcow)) { 4920 /* 4921 * Drop the "read" lock on the segment and acquire 4922 * the "write" version since we have to allocate the 4923 * anon_map. 4924 */ 4925 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4926 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4927 4928 if (svd->amp == NULL) { 4929 svd->amp = anonmap_alloc(seg->s_size, 0, ANON_SLEEP); 4930 svd->amp->a_szc = seg->s_szc; 4931 } 4932 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4933 4934 /* 4935 * Start all over again since segment protections 4936 * may have changed after we dropped the "read" lock. 4937 */ 4938 goto top; 4939 } 4940 4941 /* 4942 * S_READ_NOCOW vs S_READ distinction was 4943 * only needed for the code above. After 4944 * that we treat it as S_READ. 4945 */ 4946 if (rw == S_READ_NOCOW) { 4947 ASSERT(type == F_SOFTLOCK); 4948 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 4949 rw = S_READ; 4950 } 4951 4952 amp = svd->amp; 4953 4954 /* 4955 * MADV_SEQUENTIAL work is ignored for large page segments. 4956 */ 4957 if (seg->s_szc != 0) { 4958 pgsz = page_get_pagesize(seg->s_szc); 4959 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 4960 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 4961 if (svd->vp == NULL) { 4962 err = segvn_fault_anonpages(hat, seg, lpgaddr, 4963 lpgeaddr, type, rw, addr, addr + len, brkcow); 4964 } else { 4965 err = segvn_fault_vnodepages(hat, seg, lpgaddr, 4966 lpgeaddr, type, rw, addr, addr + len, brkcow); 4967 if (err == IE_RETRY) { 4968 ASSERT(seg->s_szc == 0); 4969 ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock)); 4970 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4971 goto top; 4972 } 4973 } 4974 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4975 return (err); 4976 } 4977 4978 page = seg_page(seg, addr); 4979 if (amp != NULL) { 4980 anon_index = svd->anon_index + page; 4981 4982 if (type == F_PROT && rw == S_READ && 4983 svd->tr_state == SEGVN_TR_OFF && 4984 svd->type == MAP_PRIVATE && svd->pageprot == 0) { 4985 size_t index = anon_index; 4986 struct anon *ap; 4987 4988 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4989 /* 4990 * The fast path could apply to S_WRITE also, except 4991 * that the protection fault could be caused by lazy 4992 * tlb flush when ro->rw. In this case, the pte is 4993 * RW already. But RO in the other cpu's tlb causes 4994 * the fault. Since hat_chgprot won't do anything if 4995 * pte doesn't change, we may end up faulting 4996 * indefinitely until the RO tlb entry gets replaced. 
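 * So the fast path below is only taken for S_READ protection faults,
 * and only when every anon slot in the range exists with a reference
 * count of 1; otherwise we fall through to the slow path.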
4997 */ 4998 for (a = addr; a < addr + len; a += PAGESIZE, index++) { 4999 anon_array_enter(amp, index, &cookie); 5000 ap = anon_get_ptr(amp->ahp, index); 5001 anon_array_exit(&cookie); 5002 if ((ap == NULL) || (ap->an_refcnt != 1)) { 5003 ANON_LOCK_EXIT(&->a_rwlock); 5004 goto slow; 5005 } 5006 } 5007 hat_chgprot(seg->s_as->a_hat, addr, len, svd->prot); 5008 ANON_LOCK_EXIT(&->a_rwlock); 5009 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5010 return (0); 5011 } 5012 } 5013 slow: 5014 5015 if (svd->vpage == NULL) 5016 vpage = NULL; 5017 else 5018 vpage = &svd->vpage[page]; 5019 5020 off = svd->offset + (uintptr_t)(addr - seg->s_base); 5021 5022 /* 5023 * If MADV_SEQUENTIAL has been set for the particular page we 5024 * are faulting on, free behind all pages in the segment and put 5025 * them on the free list. 5026 */ 5027 5028 if ((page != 0) && fltadvice && svd->tr_state != SEGVN_TR_ON) { 5029 struct vpage *vpp; 5030 ulong_t fanon_index; 5031 size_t fpage; 5032 u_offset_t pgoff, fpgoff; 5033 struct vnode *fvp; 5034 struct anon *fap = NULL; 5035 5036 if (svd->advice == MADV_SEQUENTIAL || 5037 (svd->pageadvice && 5038 VPP_ADVICE(vpage) == MADV_SEQUENTIAL)) { 5039 pgoff = off - PAGESIZE; 5040 fpage = page - 1; 5041 if (vpage != NULL) 5042 vpp = &svd->vpage[fpage]; 5043 if (amp != NULL) 5044 fanon_index = svd->anon_index + fpage; 5045 5046 while (pgoff > svd->offset) { 5047 if (svd->advice != MADV_SEQUENTIAL && 5048 (!svd->pageadvice || (vpage && 5049 VPP_ADVICE(vpp) != MADV_SEQUENTIAL))) 5050 break; 5051 5052 /* 5053 * If this is an anon page, we must find the 5054 * correct <vp, offset> for it 5055 */ 5056 fap = NULL; 5057 if (amp != NULL) { 5058 ANON_LOCK_ENTER(&->a_rwlock, 5059 RW_READER); 5060 anon_array_enter(amp, fanon_index, 5061 &cookie); 5062 fap = anon_get_ptr(amp->ahp, 5063 fanon_index); 5064 if (fap != NULL) { 5065 swap_xlate(fap, &fvp, &fpgoff); 5066 } else { 5067 fpgoff = pgoff; 5068 fvp = svd->vp; 5069 } 5070 anon_array_exit(&cookie); 5071 ANON_LOCK_EXIT(&->a_rwlock); 5072 } else { 5073 fpgoff = pgoff; 5074 fvp = svd->vp; 5075 } 5076 if (fvp == NULL) 5077 break; /* XXX */ 5078 /* 5079 * Skip pages that are free or have an 5080 * "exclusive" lock. 5081 */ 5082 pp = page_lookup_nowait(fvp, fpgoff, SE_SHARED); 5083 if (pp == NULL) 5084 break; 5085 /* 5086 * We don't need the page_struct_lock to test 5087 * as this is only advisory; even if we 5088 * acquire it someone might race in and lock 5089 * the page after we unlock and before the 5090 * PUTPAGE, then VOP_PUTPAGE will do nothing. 5091 */ 5092 if (pp->p_lckcnt == 0 && pp->p_cowcnt == 0) { 5093 /* 5094 * Hold the vnode before releasing 5095 * the page lock to prevent it from 5096 * being freed and re-used by some 5097 * other thread. 5098 */ 5099 VN_HOLD(fvp); 5100 page_unlock(pp); 5101 /* 5102 * We should build a page list 5103 * to kluster putpages XXX 5104 */ 5105 (void) VOP_PUTPAGE(fvp, 5106 (offset_t)fpgoff, PAGESIZE, 5107 (B_DONTNEED|B_FREE|B_ASYNC), 5108 svd->cred); 5109 VN_RELE(fvp); 5110 } else { 5111 /* 5112 * XXX - Should the loop terminate if 5113 * the page is `locked'? 5114 */ 5115 page_unlock(pp); 5116 } 5117 --vpp; 5118 --fanon_index; 5119 pgoff -= PAGESIZE; 5120 } 5121 } 5122 } 5123 5124 plp = pl; 5125 *plp = NULL; 5126 pl_alloc_sz = 0; 5127 5128 /* 5129 * See if we need to call VOP_GETPAGE for 5130 * *any* of the range being faulted on. 5131 * We can skip all of this work if there 5132 * was no original vnode. 
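 * (Purely anonymous mappings have no vnode and skip straight to the
 * per-page fault loop below.)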
5133 */ 5134 if (svd->vp != NULL) { 5135 u_offset_t vp_off; 5136 size_t vp_len; 5137 struct anon *ap; 5138 vnode_t *vp; 5139 5140 vp_off = off; 5141 vp_len = len; 5142 5143 if (amp == NULL) 5144 dogetpage = 1; 5145 else { 5146 /* 5147 * Only acquire reader lock to prevent amp->ahp 5148 * from being changed. It's ok to miss pages, 5149 * hence we don't do anon_array_enter 5150 */ 5151 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5152 ap = anon_get_ptr(amp->ahp, anon_index); 5153 5154 if (len <= PAGESIZE) 5155 /* inline non_anon() */ 5156 dogetpage = (ap == NULL); 5157 else 5158 dogetpage = non_anon(amp->ahp, anon_index, 5159 &vp_off, &vp_len); 5160 ANON_LOCK_EXIT(&->a_rwlock); 5161 } 5162 5163 if (dogetpage) { 5164 enum seg_rw arw; 5165 struct as *as = seg->s_as; 5166 5167 if (len > ptob((sizeof (pl) / sizeof (pl[0])) - 1)) { 5168 /* 5169 * Page list won't fit in local array, 5170 * allocate one of the needed size. 5171 */ 5172 pl_alloc_sz = 5173 (btop(len) + 1) * sizeof (page_t *); 5174 plp = kmem_alloc(pl_alloc_sz, KM_SLEEP); 5175 plp[0] = NULL; 5176 plsz = len; 5177 } else if (rw == S_WRITE && svd->type == MAP_PRIVATE || 5178 svd->tr_state == SEGVN_TR_ON || rw == S_OTHER || 5179 (((size_t)(addr + PAGESIZE) < 5180 (size_t)(seg->s_base + seg->s_size)) && 5181 hat_probe(as->a_hat, addr + PAGESIZE))) { 5182 /* 5183 * Ask VOP_GETPAGE to return the exact number 5184 * of pages if 5185 * (a) this is a COW fault, or 5186 * (b) this is a software fault, or 5187 * (c) next page is already mapped. 5188 */ 5189 plsz = len; 5190 } else { 5191 /* 5192 * Ask VOP_GETPAGE to return adjacent pages 5193 * within the segment. 5194 */ 5195 plsz = MIN((size_t)PVN_GETPAGE_SZ, (size_t) 5196 ((seg->s_base + seg->s_size) - addr)); 5197 ASSERT((addr + plsz) <= 5198 (seg->s_base + seg->s_size)); 5199 } 5200 5201 /* 5202 * Need to get some non-anonymous pages. 5203 * We need to make only one call to GETPAGE to do 5204 * this to prevent certain deadlocking conditions 5205 * when we are doing locking. In this case 5206 * non_anon() should have picked up the smallest 5207 * range which includes all the non-anonymous 5208 * pages in the requested range. We have to 5209 * be careful regarding which rw flag to pass in 5210 * because on a private mapping, the underlying 5211 * object is never allowed to be written. 5212 */ 5213 if (rw == S_WRITE && svd->type == MAP_PRIVATE) { 5214 arw = S_READ; 5215 } else { 5216 arw = rw; 5217 } 5218 vp = svd->vp; 5219 TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE, 5220 "segvn_getpage:seg %p addr %p vp %p", 5221 seg, addr, vp); 5222 err = VOP_GETPAGE(vp, (offset_t)vp_off, vp_len, 5223 &vpprot, plp, plsz, seg, addr + (vp_off - off), arw, 5224 svd->cred); 5225 if (err) { 5226 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5227 segvn_pagelist_rele(plp); 5228 if (pl_alloc_sz) 5229 kmem_free(plp, pl_alloc_sz); 5230 return (FC_MAKE_ERR(err)); 5231 } 5232 if (svd->type == MAP_PRIVATE) 5233 vpprot &= ~PROT_WRITE; 5234 } 5235 } 5236 5237 /* 5238 * N.B. at this time the plp array has all the needed non-anon 5239 * pages in addition to (possibly) having some adjacent pages. 5240 */ 5241 5242 /* 5243 * Always acquire the anon_array_lock to prevent 5244 * 2 threads from allocating separate anon slots for 5245 * the same "addr". 5246 * 5247 * If this is a copy-on-write fault and we don't already 5248 * have the anon_array_lock, acquire it to prevent the 5249 * fault routine from handling multiple copy-on-write faults 5250 * on the same "addr" in the same address space. 
5251 * 5252 * Only one thread should deal with the fault since after 5253 * it is handled, the other threads can acquire a translation 5254 * to the newly created private page. This prevents two or 5255 * more threads from creating different private pages for the 5256 * same fault. 5257 * 5258 * We grab "serialization" lock here if this is a MAP_PRIVATE segment 5259 * to prevent deadlock between this thread and another thread 5260 * which has soft-locked this page and wants to acquire serial_lock. 5261 * ( bug 4026339 ) 5262 * 5263 * The fix for bug 4026339 becomes unnecessary when using the 5264 * locking scheme with per amp rwlock and a global set of hash 5265 * lock, anon_array_lock. If we steal a vnode page when low 5266 * on memory and upgrad the page lock through page_rename, 5267 * then the page is PAGE_HANDLED, nothing needs to be done 5268 * for this page after returning from segvn_faultpage. 5269 * 5270 * But really, the page lock should be downgraded after 5271 * the stolen page is page_rename'd. 5272 */ 5273 5274 if (amp != NULL) 5275 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5276 5277 /* 5278 * Ok, now loop over the address range and handle faults 5279 */ 5280 for (a = addr; a < addr + len; a += PAGESIZE, off += PAGESIZE) { 5281 err = segvn_faultpage(hat, seg, a, off, vpage, plp, vpprot, 5282 type, rw, brkcow, a == addr); 5283 if (err) { 5284 if (amp != NULL) 5285 ANON_LOCK_EXIT(&->a_rwlock); 5286 if (type == F_SOFTLOCK && a > addr) { 5287 segvn_softunlock(seg, addr, (a - addr), 5288 S_OTHER); 5289 } 5290 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5291 segvn_pagelist_rele(plp); 5292 if (pl_alloc_sz) 5293 kmem_free(plp, pl_alloc_sz); 5294 return (err); 5295 } 5296 if (vpage) { 5297 vpage++; 5298 } else if (svd->vpage) { 5299 page = seg_page(seg, addr); 5300 vpage = &svd->vpage[++page]; 5301 } 5302 } 5303 5304 /* Didn't get pages from the underlying fs so we're done */ 5305 if (!dogetpage) 5306 goto done; 5307 5308 /* 5309 * Now handle any other pages in the list returned. 5310 * If the page can be used, load up the translations now. 5311 * Note that the for loop will only be entered if "plp" 5312 * is pointing to a non-NULL page pointer which means that 5313 * VOP_GETPAGE() was called and vpprot has been initialized. 5314 */ 5315 if (svd->pageprot == 0) 5316 prot = svd->prot & vpprot; 5317 5318 5319 /* 5320 * Large Files: diff should be unsigned value because we started 5321 * supporting > 2GB segment sizes from 2.5.1 and when a 5322 * large file of size > 2GB gets mapped to address space 5323 * the diff value can be > 2GB. 5324 */ 5325 5326 for (ppp = plp; (pp = *ppp) != NULL; ppp++) { 5327 size_t diff; 5328 struct anon *ap; 5329 int anon_index; 5330 anon_sync_obj_t cookie; 5331 int hat_flag = HAT_LOAD_ADV; 5332 5333 if (svd->flags & MAP_TEXT) { 5334 hat_flag |= HAT_LOAD_TEXT; 5335 } 5336 5337 if (pp == PAGE_HANDLED) 5338 continue; 5339 5340 if (svd->tr_state != SEGVN_TR_ON && 5341 pp->p_offset >= svd->offset && 5342 pp->p_offset < svd->offset + seg->s_size) { 5343 5344 diff = pp->p_offset - svd->offset; 5345 5346 /* 5347 * Large Files: Following is the assertion 5348 * validating the above cast. 
5349 */ 5350 ASSERT(svd->vp == pp->p_vnode); 5351 5352 page = btop(diff); 5353 if (svd->pageprot) 5354 prot = VPP_PROT(&svd->vpage[page]) & vpprot; 5355 5356 /* 5357 * Prevent other threads in the address space from 5358 * creating private pages (i.e., allocating anon slots) 5359 * while we are in the process of loading translations 5360 * to additional pages returned by the underlying 5361 * object. 5362 */ 5363 if (amp != NULL) { 5364 anon_index = svd->anon_index + page; 5365 anon_array_enter(amp, anon_index, &cookie); 5366 ap = anon_get_ptr(amp->ahp, anon_index); 5367 } 5368 if ((amp == NULL) || (ap == NULL)) { 5369 if (IS_VMODSORT(pp->p_vnode) || 5370 enable_mbit_wa) { 5371 if (rw == S_WRITE) 5372 hat_setmod(pp); 5373 else if (rw != S_OTHER && 5374 !hat_ismod(pp)) 5375 prot &= ~PROT_WRITE; 5376 } 5377 /* 5378 * Skip mapping read ahead pages marked 5379 * for migration, so they will get migrated 5380 * properly on fault 5381 */ 5382 if ((prot & PROT_READ) && !PP_ISMIGRATE(pp)) { 5383 hat_memload(hat, seg->s_base + diff, 5384 pp, prot, hat_flag); 5385 } 5386 } 5387 if (amp != NULL) 5388 anon_array_exit(&cookie); 5389 } 5390 page_unlock(pp); 5391 } 5392 done: 5393 if (amp != NULL) 5394 ANON_LOCK_EXIT(&->a_rwlock); 5395 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5396 if (pl_alloc_sz) 5397 kmem_free(plp, pl_alloc_sz); 5398 return (0); 5399 } 5400 5401 /* 5402 * This routine is used to start I/O on pages asynchronously. XXX it will 5403 * only create PAGESIZE pages. At fault time they will be relocated into 5404 * larger pages. 5405 */ 5406 static faultcode_t 5407 segvn_faulta(struct seg *seg, caddr_t addr) 5408 { 5409 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5410 int err; 5411 struct anon_map *amp; 5412 vnode_t *vp; 5413 5414 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5415 5416 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 5417 if ((amp = svd->amp) != NULL) { 5418 struct anon *ap; 5419 5420 /* 5421 * Reader lock to prevent amp->ahp from being changed. 5422 * This is advisory, it's ok to miss a page, so 5423 * we don't do anon_array_enter lock. 
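 * If an anon slot exists the read is started via anon_getpage();
 * otherwise the I/O is started with VOP_GETPAGE() on the backing
 * vnode (or nothing is done for zfod pages).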
5424 */ 5425 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5426 if ((ap = anon_get_ptr(amp->ahp, 5427 svd->anon_index + seg_page(seg, addr))) != NULL) { 5428 5429 err = anon_getpage(&ap, NULL, NULL, 5430 0, seg, addr, S_READ, svd->cred); 5431 5432 ANON_LOCK_EXIT(&->a_rwlock); 5433 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5434 if (err) 5435 return (FC_MAKE_ERR(err)); 5436 return (0); 5437 } 5438 ANON_LOCK_EXIT(&->a_rwlock); 5439 } 5440 5441 if (svd->vp == NULL) { 5442 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5443 return (0); /* zfod page - do nothing now */ 5444 } 5445 5446 vp = svd->vp; 5447 TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE, 5448 "segvn_getpage:seg %p addr %p vp %p", seg, addr, vp); 5449 err = VOP_GETPAGE(vp, 5450 (offset_t)(svd->offset + (uintptr_t)(addr - seg->s_base)), 5451 PAGESIZE, NULL, NULL, 0, seg, addr, 5452 S_OTHER, svd->cred); 5453 5454 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5455 if (err) 5456 return (FC_MAKE_ERR(err)); 5457 return (0); 5458 } 5459 5460 static int 5461 segvn_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) 5462 { 5463 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5464 struct vpage *svp, *evp; 5465 struct vnode *vp; 5466 size_t pgsz; 5467 pgcnt_t pgcnt; 5468 anon_sync_obj_t cookie; 5469 5470 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5471 5472 if ((svd->maxprot & prot) != prot) 5473 return (EACCES); /* violated maxprot */ 5474 5475 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 5476 5477 /* return if prot is the same */ 5478 if (!svd->pageprot && svd->prot == prot) { 5479 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5480 return (0); 5481 } 5482 5483 /* 5484 * Since we change protections we first have to flush the cache. 5485 * This makes sure all the pagelock calls have to recheck 5486 * protections. 5487 */ 5488 if (svd->softlockcnt > 0) { 5489 ASSERT(svd->tr_state == SEGVN_TR_OFF); 5490 /* 5491 * Since we do have the segvn writers lock nobody can fill 5492 * the cache with entries belonging to this seg during 5493 * the purge. The flush either succeeds or we still have 5494 * pending I/Os. 5495 */ 5496 segvn_purge(seg); 5497 if (svd->softlockcnt > 0) { 5498 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5499 return (EAGAIN); 5500 } 5501 } 5502 5503 if (svd->tr_state == SEGVN_TR_INIT) { 5504 svd->tr_state = SEGVN_TR_OFF; 5505 } else if (svd->tr_state == SEGVN_TR_ON) { 5506 ASSERT(svd->amp != NULL); 5507 segvn_textunrepl(seg, 0); 5508 ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF); 5509 } 5510 5511 if ((prot & PROT_WRITE) && svd->type == MAP_SHARED && 5512 svd->vp != NULL && (svd->vp->v_flag & VVMEXEC)) { 5513 ASSERT(vn_is_mapped(svd->vp, V_WRITE)); 5514 segvn_inval_trcache(svd->vp); 5515 } 5516 5517 if (seg->s_szc != 0) { 5518 int err; 5519 pgsz = page_get_pagesize(seg->s_szc); 5520 pgcnt = pgsz >> PAGESHIFT; 5521 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 5522 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { 5523 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5524 ASSERT(seg->s_base != addr || seg->s_size != len); 5525 /* 5526 * If we are holding the as lock as a reader then 5527 * we need to return IE_RETRY and let the as 5528 * layer drop and re-aquire the lock as a writer. 
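 * With the as lock held as writer the range is demoted below via
 * segvn_demote_range() and IE_RETRY is returned on success so the
 * protection change is retried on the demoted segment.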
5529 */ 5530 if (AS_READ_HELD(seg->s_as, &seg->s_as->a_lock)) 5531 return (IE_RETRY); 5532 VM_STAT_ADD(segvnvmstats.demoterange[1]); 5533 if (svd->type == MAP_PRIVATE || svd->vp != NULL) { 5534 err = segvn_demote_range(seg, addr, len, 5535 SDR_END, 0); 5536 } else { 5537 uint_t szcvec = map_pgszcvec(seg->s_base, 5538 pgsz, (uintptr_t)seg->s_base, 5539 (svd->flags & MAP_TEXT), MAPPGSZC_SHM, 0); 5540 err = segvn_demote_range(seg, addr, len, 5541 SDR_END, szcvec); 5542 } 5543 if (err == 0) 5544 return (IE_RETRY); 5545 if (err == ENOMEM) 5546 return (IE_NOMEM); 5547 return (err); 5548 } 5549 } 5550 5551 5552 /* 5553 * If it's a private mapping and we're making it writable 5554 * and no swap space has been reserved, have to reserve 5555 * it all now. If it's a private mapping to a file (i.e., vp != NULL) 5556 * and we're removing write permission on the entire segment and 5557 * we haven't modified any pages, we can release the swap space. 5558 */ 5559 if (svd->type == MAP_PRIVATE) { 5560 if (prot & PROT_WRITE) { 5561 size_t sz; 5562 if (svd->swresv == 0 && !(svd->flags & MAP_NORESERVE)) { 5563 if (anon_resv_zone(seg->s_size, 5564 seg->s_as->a_proc->p_zone) == 0) { 5565 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5566 return (IE_NOMEM); 5567 } 5568 sz = svd->swresv = seg->s_size; 5569 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 5570 "anon proc:%p %lu %u", 5571 seg, sz, 1); 5572 } 5573 } else { 5574 /* 5575 * Swap space is released only if this segment 5576 * does not map anonymous memory, since read faults 5577 * on such segments still need an anon slot to read 5578 * in the data. 5579 */ 5580 if (svd->swresv != 0 && svd->vp != NULL && 5581 svd->amp == NULL && addr == seg->s_base && 5582 len == seg->s_size && svd->pageprot == 0) { 5583 anon_unresv_zone(svd->swresv, 5584 seg->s_as->a_proc->p_zone); 5585 svd->swresv = 0; 5586 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 5587 "anon proc:%p %lu %u", 5588 seg, 0, 0); 5589 } 5590 } 5591 } 5592 5593 if (addr == seg->s_base && len == seg->s_size && svd->pageprot == 0) { 5594 if (svd->prot == prot) { 5595 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5596 return (0); /* all done */ 5597 } 5598 svd->prot = (uchar_t)prot; 5599 } else if (svd->type == MAP_PRIVATE) { 5600 struct anon *ap = NULL; 5601 page_t *pp; 5602 u_offset_t offset, off; 5603 struct anon_map *amp; 5604 ulong_t anon_idx = 0; 5605 5606 /* 5607 * A vpage structure exists or else the change does not 5608 * involve the entire segment. Establish a vpage structure 5609 * if none is there. Then, for each page in the range, 5610 * adjust its individual permissions. Note that write- 5611 * enabling a MAP_PRIVATE page can affect the claims for 5612 * locked down memory. Overcommitting memory terminates 5613 * the operation. 5614 */ 5615 segvn_vpage(seg); 5616 if ((amp = svd->amp) != NULL) { 5617 anon_idx = svd->anon_index + seg_page(seg, addr); 5618 ASSERT(seg->s_szc == 0 || 5619 IS_P2ALIGNED(anon_idx, pgcnt)); 5620 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5621 } 5622 5623 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 5624 evp = &svd->vpage[seg_page(seg, addr + len)]; 5625 5626 /* 5627 * See Statement at the beginning of segvn_lockop regarding 5628 * the way cowcnts and lckcnts are handled. 
5629 */ 5630 for (svp = &svd->vpage[seg_page(seg, addr)]; svp < evp; svp++) { 5631 5632 if (seg->s_szc != 0) { 5633 if (amp != NULL) { 5634 anon_array_enter(amp, anon_idx, 5635 &cookie); 5636 } 5637 if (IS_P2ALIGNED(anon_idx, pgcnt) && 5638 !segvn_claim_pages(seg, svp, offset, 5639 anon_idx, prot)) { 5640 if (amp != NULL) { 5641 anon_array_exit(&cookie); 5642 } 5643 break; 5644 } 5645 if (amp != NULL) { 5646 anon_array_exit(&cookie); 5647 } 5648 anon_idx++; 5649 } else { 5650 if (amp != NULL) { 5651 anon_array_enter(amp, anon_idx, 5652 &cookie); 5653 ap = anon_get_ptr(amp->ahp, anon_idx++); 5654 } 5655 5656 if (VPP_ISPPLOCK(svp) && 5657 VPP_PROT(svp) != prot) { 5658 5659 if (amp == NULL || ap == NULL) { 5660 vp = svd->vp; 5661 off = offset; 5662 } else 5663 swap_xlate(ap, &vp, &off); 5664 if (amp != NULL) 5665 anon_array_exit(&cookie); 5666 5667 if ((pp = page_lookup(vp, off, 5668 SE_SHARED)) == NULL) { 5669 panic("segvn_setprot: no page"); 5670 /*NOTREACHED*/ 5671 } 5672 ASSERT(seg->s_szc == 0); 5673 if ((VPP_PROT(svp) ^ prot) & 5674 PROT_WRITE) { 5675 if (prot & PROT_WRITE) { 5676 if (!page_addclaim(pp)) { 5677 page_unlock(pp); 5678 break; 5679 } 5680 } else { 5681 if (!page_subclaim(pp)) { 5682 page_unlock(pp); 5683 break; 5684 } 5685 } 5686 } 5687 page_unlock(pp); 5688 } else if (amp != NULL) 5689 anon_array_exit(&cookie); 5690 } 5691 VPP_SETPROT(svp, prot); 5692 offset += PAGESIZE; 5693 } 5694 if (amp != NULL) 5695 ANON_LOCK_EXIT(&->a_rwlock); 5696 5697 /* 5698 * Did we terminate prematurely? If so, simply unload 5699 * the translations to the things we've updated so far. 5700 */ 5701 if (svp != evp) { 5702 len = (svp - &svd->vpage[seg_page(seg, addr)]) * 5703 PAGESIZE; 5704 ASSERT(seg->s_szc == 0 || IS_P2ALIGNED(len, pgsz)); 5705 if (len != 0) 5706 hat_unload(seg->s_as->a_hat, addr, 5707 len, HAT_UNLOAD); 5708 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5709 return (IE_NOMEM); 5710 } 5711 } else { 5712 segvn_vpage(seg); 5713 evp = &svd->vpage[seg_page(seg, addr + len)]; 5714 for (svp = &svd->vpage[seg_page(seg, addr)]; svp < evp; svp++) { 5715 VPP_SETPROT(svp, prot); 5716 } 5717 } 5718 5719 if (((prot & PROT_WRITE) != 0 && 5720 (svd->vp != NULL || svd->type == MAP_PRIVATE)) || 5721 (prot & ~PROT_USER) == PROT_NONE) { 5722 /* 5723 * Either private or shared data with write access (in 5724 * which case we need to throw out all former translations 5725 * so that we get the right translations set up on fault 5726 * and we don't allow write access to any copy-on-write pages 5727 * that might be around or to prevent write access to pages 5728 * representing holes in a file), or we don't have permission 5729 * to access the memory at all (in which case we have to 5730 * unload any current translations that might exist). 5731 */ 5732 hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD); 5733 } else { 5734 /* 5735 * A shared mapping or a private mapping in which write 5736 * protection is going to be denied - just change all the 5737 * protections over the range of addresses in question. 5738 * segvn does not support any other attributes other 5739 * than prot so we can use hat_chgattr. 5740 */ 5741 hat_chgattr(seg->s_as->a_hat, addr, len, prot); 5742 } 5743 5744 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5745 5746 return (0); 5747 } 5748 5749 /* 5750 * segvn_setpagesize is called via SEGOP_SETPAGESIZE from as_setpagesize, 5751 * to determine if the seg is capable of mapping the requested szc. 
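 * Returns 0 on success, IE_RETRY after the segment has been split or
 * demoted so the caller can retry, or an error (e.g. EINVAL, EAGAIN,
 * IE_NOMEM) when the request cannot be satisfied.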
5752 */ 5753 static int 5754 segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc) 5755 { 5756 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5757 struct segvn_data *nsvd; 5758 struct anon_map *amp = svd->amp; 5759 struct seg *nseg; 5760 caddr_t eaddr = addr + len, a; 5761 size_t pgsz = page_get_pagesize(szc); 5762 pgcnt_t pgcnt = page_get_pagecnt(szc); 5763 int err; 5764 u_offset_t off = svd->offset + (uintptr_t)(addr - seg->s_base); 5765 extern struct vnode kvp; 5766 5767 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 5768 ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size); 5769 5770 if (seg->s_szc == szc || segvn_lpg_disable != 0) { 5771 return (0); 5772 } 5773 5774 /* 5775 * addr should always be pgsz aligned but eaddr may be misaligned if 5776 * it's at the end of the segment. 5777 * 5778 * XXX we should assert this condition since as_setpagesize() logic 5779 * guarantees it. 5780 */ 5781 if (!IS_P2ALIGNED(addr, pgsz) || 5782 (!IS_P2ALIGNED(eaddr, pgsz) && 5783 eaddr != seg->s_base + seg->s_size)) { 5784 5785 segvn_setpgsz_align_err++; 5786 return (EINVAL); 5787 } 5788 5789 if (amp != NULL && svd->type == MAP_SHARED) { 5790 ulong_t an_idx = svd->anon_index + seg_page(seg, addr); 5791 if (!IS_P2ALIGNED(an_idx, pgcnt)) { 5792 5793 segvn_setpgsz_anon_align_err++; 5794 return (EINVAL); 5795 } 5796 } 5797 5798 if ((svd->flags & MAP_NORESERVE) || seg->s_as == &kas || 5799 szc > segvn_maxpgszc) { 5800 return (EINVAL); 5801 } 5802 5803 /* paranoid check */ 5804 if (svd->vp != NULL && 5805 (IS_SWAPFSVP(svd->vp) || VN_ISKAS(svd->vp))) { 5806 return (EINVAL); 5807 } 5808 5809 if (seg->s_szc == 0 && svd->vp != NULL && 5810 map_addr_vacalign_check(addr, off)) { 5811 return (EINVAL); 5812 } 5813 5814 /* 5815 * Check that protections are the same within new page 5816 * size boundaries. 5817 */ 5818 if (svd->pageprot) { 5819 for (a = addr; a < eaddr; a += pgsz) { 5820 if ((a + pgsz) > eaddr) { 5821 if (!sameprot(seg, a, eaddr - a)) { 5822 return (EINVAL); 5823 } 5824 } else { 5825 if (!sameprot(seg, a, pgsz)) { 5826 return (EINVAL); 5827 } 5828 } 5829 } 5830 } 5831 5832 /* 5833 * Since we are changing page size we first have to flush 5834 * the cache. This makes sure all the pagelock calls have 5835 * to recheck protections. 5836 */ 5837 if (svd->softlockcnt > 0) { 5838 ASSERT(svd->tr_state == SEGVN_TR_OFF); 5839 /* 5840 * Since we do have the segvn writers lock nobody can fill 5841 * the cache with entries belonging to this seg during 5842 * the purge. The flush either succeeds or we still have 5843 * pending I/Os. 5844 */ 5845 segvn_purge(seg); 5846 if (svd->softlockcnt > 0) { 5847 return (EAGAIN); 5848 } 5849 } 5850 5851 if (svd->tr_state == SEGVN_TR_INIT) { 5852 svd->tr_state = SEGVN_TR_OFF; 5853 } else if (svd->tr_state == SEGVN_TR_ON) { 5854 ASSERT(svd->amp != NULL); 5855 segvn_textunrepl(seg, 1); 5856 ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF); 5857 amp = NULL; 5858 } 5859 5860 /* 5861 * Operation for sub range of existing segment. 
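 * The range is demoted and/or split off into its own segment(s) and
 * IE_RETRY is returned so the caller retries the operation on the
 * resulting segment.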
5862 */ 5863 if (addr != seg->s_base || eaddr != (seg->s_base + seg->s_size)) { 5864 if (szc < seg->s_szc) { 5865 VM_STAT_ADD(segvnvmstats.demoterange[2]); 5866 err = segvn_demote_range(seg, addr, len, SDR_RANGE, 0); 5867 if (err == 0) { 5868 return (IE_RETRY); 5869 } 5870 if (err == ENOMEM) { 5871 return (IE_NOMEM); 5872 } 5873 return (err); 5874 } 5875 if (addr != seg->s_base) { 5876 nseg = segvn_split_seg(seg, addr); 5877 if (eaddr != (nseg->s_base + nseg->s_size)) { 5878 /* eaddr is szc aligned */ 5879 (void) segvn_split_seg(nseg, eaddr); 5880 } 5881 return (IE_RETRY); 5882 } 5883 if (eaddr != (seg->s_base + seg->s_size)) { 5884 /* eaddr is szc aligned */ 5885 (void) segvn_split_seg(seg, eaddr); 5886 } 5887 return (IE_RETRY); 5888 } 5889 5890 /* 5891 * Break any low level sharing and reset seg->s_szc to 0. 5892 */ 5893 if ((err = segvn_clrszc(seg)) != 0) { 5894 if (err == ENOMEM) { 5895 err = IE_NOMEM; 5896 } 5897 return (err); 5898 } 5899 ASSERT(seg->s_szc == 0); 5900 5901 /* 5902 * If the end of the current segment is not pgsz aligned 5903 * then attempt to concatenate with the next segment. 5904 */ 5905 if (!IS_P2ALIGNED(eaddr, pgsz)) { 5906 nseg = AS_SEGNEXT(seg->s_as, seg); 5907 if (nseg == NULL || nseg == seg || eaddr != nseg->s_base) { 5908 return (ENOMEM); 5909 } 5910 if (nseg->s_ops != &segvn_ops) { 5911 return (EINVAL); 5912 } 5913 nsvd = (struct segvn_data *)nseg->s_data; 5914 if (nsvd->softlockcnt > 0) { 5915 segvn_purge(nseg); 5916 if (nsvd->softlockcnt > 0) { 5917 return (EAGAIN); 5918 } 5919 } 5920 err = segvn_clrszc(nseg); 5921 if (err == ENOMEM) { 5922 err = IE_NOMEM; 5923 } 5924 if (err != 0) { 5925 return (err); 5926 } 5927 err = segvn_concat(seg, nseg, 1); 5928 if (err == -1) { 5929 return (EINVAL); 5930 } 5931 if (err == -2) { 5932 return (IE_NOMEM); 5933 } 5934 return (IE_RETRY); 5935 } 5936 5937 /* 5938 * May need to re-align anon array to 5939 * new szc. 5940 */ 5941 if (amp != NULL) { 5942 if (!IS_P2ALIGNED(svd->anon_index, pgcnt)) { 5943 struct anon_hdr *nahp; 5944 5945 ASSERT(svd->type == MAP_PRIVATE); 5946 5947 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER); 5948 ASSERT(amp->refcnt == 1); 5949 nahp = anon_create(btop(amp->size), ANON_NOSLEEP); 5950 if (nahp == NULL) { 5951 ANON_LOCK_EXIT(&amp->a_rwlock); 5952 return (IE_NOMEM); 5953 } 5954 if (anon_copy_ptr(amp->ahp, svd->anon_index, 5955 nahp, 0, btop(seg->s_size), ANON_NOSLEEP)) { 5956 anon_release(nahp, btop(amp->size)); 5957 ANON_LOCK_EXIT(&amp->a_rwlock); 5958 return (IE_NOMEM); 5959 } 5960 anon_release(amp->ahp, btop(amp->size)); 5961 amp->ahp = nahp; 5962 svd->anon_index = 0; 5963 ANON_LOCK_EXIT(&amp->a_rwlock); 5964 } 5965 } 5966 if (svd->vp != NULL && szc != 0) { 5967 struct vattr va; 5968 u_offset_t eoffpage = svd->offset; 5969 va.va_mask = AT_SIZE; 5970 eoffpage += seg->s_size; 5971 eoffpage = btopr(eoffpage); 5972 if (VOP_GETATTR(svd->vp, &va, 0, svd->cred) != 0) { 5973 segvn_setpgsz_getattr_err++; 5974 return (EINVAL); 5975 } 5976 if (btopr(va.va_size) < eoffpage) { 5977 segvn_setpgsz_eof_err++; 5978 return (EINVAL); 5979 } 5980 if (amp != NULL) { 5981 /* 5982 * anon_fill_cow_holes() may call VOP_GETPAGE(). 5983 * don't take anon map lock here to avoid holding it 5984 * across VOP_GETPAGE() calls that may call back into 5985 * segvn for klustering checks. We don't really need 5986 * anon map lock here since it's a private segment and 5987 * we hold as level lock as writers.
5988 */ 5989 if ((err = anon_fill_cow_holes(seg, seg->s_base, 5990 amp->ahp, svd->anon_index, svd->vp, svd->offset, 5991 seg->s_size, szc, svd->prot, svd->vpage, 5992 svd->cred)) != 0) { 5993 return (EINVAL); 5994 } 5995 } 5996 segvn_setvnode_mpss(svd->vp); 5997 } 5998 5999 if (amp != NULL) { 6000 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 6001 if (svd->type == MAP_PRIVATE) { 6002 amp->a_szc = szc; 6003 } else if (szc > amp->a_szc) { 6004 amp->a_szc = szc; 6005 } 6006 ANON_LOCK_EXIT(&->a_rwlock); 6007 } 6008 6009 seg->s_szc = szc; 6010 6011 return (0); 6012 } 6013 6014 static int 6015 segvn_clrszc(struct seg *seg) 6016 { 6017 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6018 struct anon_map *amp = svd->amp; 6019 size_t pgsz; 6020 pgcnt_t pages; 6021 int err = 0; 6022 caddr_t a = seg->s_base; 6023 caddr_t ea = a + seg->s_size; 6024 ulong_t an_idx = svd->anon_index; 6025 vnode_t *vp = svd->vp; 6026 struct vpage *vpage = svd->vpage; 6027 page_t *anon_pl[1 + 1], *pp; 6028 struct anon *ap, *oldap; 6029 uint_t prot = svd->prot, vpprot; 6030 int pageflag = 0; 6031 int unmap = 1; 6032 6033 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) || 6034 SEGVN_WRITE_HELD(seg->s_as, &svd->lock)); 6035 6036 if (vp == NULL && amp == NULL) { 6037 seg->s_szc = 0; 6038 return (0); 6039 } 6040 6041 if (svd->tr_state == SEGVN_TR_INIT) { 6042 svd->tr_state = SEGVN_TR_OFF; 6043 } else if (svd->tr_state == SEGVN_TR_ON) { 6044 ASSERT(svd->amp != NULL); 6045 segvn_textunrepl(seg, 1); 6046 ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF); 6047 amp = NULL; 6048 unmap = 0; 6049 } 6050 6051 if (unmap) { 6052 /* 6053 * do HAT_UNLOAD_UNMAP since we are changing the pagesize. 6054 * unload argument is 0 when we are freeing the segment 6055 * and unload was already done. 6056 */ 6057 hat_unload(seg->s_as->a_hat, seg->s_base, seg->s_size, 6058 HAT_UNLOAD_UNMAP); 6059 } 6060 6061 if (amp == NULL || svd->type == MAP_SHARED) { 6062 seg->s_szc = 0; 6063 return (0); 6064 } 6065 6066 pgsz = page_get_pagesize(seg->s_szc); 6067 pages = btop(pgsz); 6068 6069 /* 6070 * XXX anon rwlock is not really needed because this is a 6071 * private segment and we are writers. 6072 */ 6073 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 6074 6075 for (; a < ea; a += pgsz, an_idx += pages) { 6076 if ((oldap = anon_get_ptr(amp->ahp, an_idx)) != NULL) { 6077 ASSERT(vpage != NULL || svd->pageprot == 0); 6078 if (vpage != NULL) { 6079 ASSERT(sameprot(seg, a, pgsz)); 6080 prot = VPP_PROT(vpage); 6081 pageflag = VPP_ISPPLOCK(vpage) ? LOCK_PAGE : 0; 6082 } 6083 if (seg->s_szc != 0) { 6084 ASSERT(vp == NULL || anon_pages(amp->ahp, 6085 an_idx, pages) == pages); 6086 if ((err = anon_map_demotepages(amp, an_idx, 6087 seg, a, prot, vpage, svd->cred)) != 0) { 6088 goto out; 6089 } 6090 } else { 6091 if (oldap->an_refcnt == 1) { 6092 continue; 6093 } 6094 if ((err = anon_getpage(&oldap, &vpprot, 6095 anon_pl, PAGESIZE, seg, a, S_READ, 6096 svd->cred))) { 6097 goto out; 6098 } 6099 if ((pp = anon_private(&ap, seg, a, prot, 6100 anon_pl[0], pageflag, svd->cred)) == NULL) { 6101 err = ENOMEM; 6102 goto out; 6103 } 6104 anon_decref(oldap); 6105 (void) anon_set_ptr(amp->ahp, an_idx, ap, 6106 ANON_SLEEP); 6107 page_unlock(pp); 6108 } 6109 } 6110 vpage = (vpage == NULL) ? 
NULL : vpage + pages; 6111 } 6112 6113 amp->a_szc = 0; 6114 seg->s_szc = 0; 6115 out: 6116 ANON_LOCK_EXIT(&amp->a_rwlock); 6117 return (err); 6118 } 6119 6120 static int 6121 segvn_claim_pages( 6122 struct seg *seg, 6123 struct vpage *svp, 6124 u_offset_t off, 6125 ulong_t anon_idx, 6126 uint_t prot) 6127 { 6128 pgcnt_t pgcnt = page_get_pagecnt(seg->s_szc); 6129 size_t ppasize = (pgcnt + 1) * sizeof (page_t *); 6130 page_t **ppa; 6131 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6132 struct anon_map *amp = svd->amp; 6133 struct vpage *evp = svp + pgcnt; 6134 caddr_t addr = ((uintptr_t)(svp - svd->vpage) << PAGESHIFT) 6135 + seg->s_base; 6136 struct anon *ap; 6137 struct vnode *vp = svd->vp; 6138 page_t *pp; 6139 pgcnt_t pg_idx, i; 6140 int err = 0; 6141 anoff_t aoff; 6142 int anon = (amp != NULL) ? 1 : 0; 6143 6144 ASSERT(svd->type == MAP_PRIVATE); 6145 ASSERT(svd->vpage != NULL); 6146 ASSERT(seg->s_szc != 0); 6147 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 6148 ASSERT(amp == NULL || IS_P2ALIGNED(anon_idx, pgcnt)); 6149 ASSERT(sameprot(seg, addr, pgcnt << PAGESHIFT)); 6150 6151 if (VPP_PROT(svp) == prot) 6152 return (1); 6153 if (!((VPP_PROT(svp) ^ prot) & PROT_WRITE)) 6154 return (1); 6155 6156 ppa = kmem_alloc(ppasize, KM_SLEEP); 6157 if (anon && vp != NULL) { 6158 if (anon_get_ptr(amp->ahp, anon_idx) == NULL) { 6159 anon = 0; 6160 ASSERT(!anon_pages(amp->ahp, anon_idx, pgcnt)); 6161 } 6162 ASSERT(!anon || 6163 anon_pages(amp->ahp, anon_idx, pgcnt) == pgcnt); 6164 } 6165 6166 for (*ppa = NULL, pg_idx = 0; svp < evp; svp++, anon_idx++) { 6167 if (!VPP_ISPPLOCK(svp)) 6168 continue; 6169 if (anon) { 6170 ap = anon_get_ptr(amp->ahp, anon_idx); 6171 if (ap == NULL) { 6172 panic("segvn_claim_pages: no anon slot"); 6173 } 6174 swap_xlate(ap, &vp, &aoff); 6175 off = (u_offset_t)aoff; 6176 } 6177 ASSERT(vp != NULL); 6178 if ((pp = page_lookup(vp, 6179 (u_offset_t)off, SE_SHARED)) == NULL) { 6180 panic("segvn_claim_pages: no page"); 6181 } 6182 ppa[pg_idx++] = pp; 6183 off += PAGESIZE; 6184 } 6185 6186 if (ppa[0] == NULL) { 6187 kmem_free(ppa, ppasize); 6188 return (1); 6189 } 6190 6191 ASSERT(pg_idx <= pgcnt); 6192 ppa[pg_idx] = NULL; 6193 6194 if (prot & PROT_WRITE) 6195 err = page_addclaim_pages(ppa); 6196 else 6197 err = page_subclaim_pages(ppa); 6198 6199 for (i = 0; i < pg_idx; i++) { 6200 ASSERT(ppa[i] != NULL); 6201 page_unlock(ppa[i]); 6202 } 6203 6204 kmem_free(ppa, ppasize); 6205 return (err); 6206 } 6207 6208 /* 6209 * Returns right (upper address) segment if split occurred. 6210 * If the address is equal to the beginning or end of its segment it returns 6211 * the current segment.
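 *
 * Editorial example (not in the original source): splitting a segment
 * that covers [base, base + 0x20000) at base + 0x10000 shrinks the
 * original seg to [base, base + 0x10000) and returns a new segment
 * covering [base + 0x10000, base + 0x20000); splitting exactly at
 * base or at base + 0x20000 leaves the segment alone and returns seg
 * itself.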
6212 */ 6213 static struct seg * 6214 segvn_split_seg(struct seg *seg, caddr_t addr) 6215 { 6216 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6217 struct seg *nseg; 6218 size_t nsize; 6219 struct segvn_data *nsvd; 6220 6221 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 6222 ASSERT(svd->tr_state == SEGVN_TR_OFF); 6223 6224 ASSERT(addr >= seg->s_base); 6225 ASSERT(addr <= seg->s_base + seg->s_size); 6226 6227 if (addr == seg->s_base || addr == seg->s_base + seg->s_size) 6228 return (seg); 6229 6230 nsize = seg->s_base + seg->s_size - addr; 6231 seg->s_size = addr - seg->s_base; 6232 nseg = seg_alloc(seg->s_as, addr, nsize); 6233 ASSERT(nseg != NULL); 6234 nseg->s_ops = seg->s_ops; 6235 nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 6236 nseg->s_data = (void *)nsvd; 6237 nseg->s_szc = seg->s_szc; 6238 *nsvd = *svd; 6239 nsvd->seg = nseg; 6240 rw_init(&nsvd->lock, NULL, RW_DEFAULT, NULL); 6241 6242 if (nsvd->vp != NULL) { 6243 VN_HOLD(nsvd->vp); 6244 nsvd->offset = svd->offset + 6245 (uintptr_t)(nseg->s_base - seg->s_base); 6246 if (nsvd->type == MAP_SHARED) 6247 lgrp_shm_policy_init(NULL, nsvd->vp); 6248 } else { 6249 /* 6250 * The offset for an anonymous segment has no signifigance in 6251 * terms of an offset into a file. If we were to use the above 6252 * calculation instead, the structures read out of 6253 * /proc/<pid>/xmap would be more difficult to decipher since 6254 * it would be unclear whether two seemingly contiguous 6255 * prxmap_t structures represented different segments or a 6256 * single segment that had been split up into multiple prxmap_t 6257 * structures (e.g. if some part of the segment had not yet 6258 * been faulted in). 6259 */ 6260 nsvd->offset = 0; 6261 } 6262 6263 ASSERT(svd->softlockcnt == 0); 6264 crhold(svd->cred); 6265 6266 if (svd->vpage != NULL) { 6267 size_t bytes = vpgtob(seg_pages(seg)); 6268 size_t nbytes = vpgtob(seg_pages(nseg)); 6269 struct vpage *ovpage = svd->vpage; 6270 6271 svd->vpage = kmem_alloc(bytes, KM_SLEEP); 6272 bcopy(ovpage, svd->vpage, bytes); 6273 nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP); 6274 bcopy(ovpage + seg_pages(seg), nsvd->vpage, nbytes); 6275 kmem_free(ovpage, bytes + nbytes); 6276 } 6277 if (svd->amp != NULL && svd->type == MAP_PRIVATE) { 6278 struct anon_map *oamp = svd->amp, *namp; 6279 struct anon_hdr *nahp; 6280 6281 ANON_LOCK_ENTER(&oamp->a_rwlock, RW_WRITER); 6282 ASSERT(oamp->refcnt == 1); 6283 nahp = anon_create(btop(seg->s_size), ANON_SLEEP); 6284 (void) anon_copy_ptr(oamp->ahp, svd->anon_index, 6285 nahp, 0, btop(seg->s_size), ANON_SLEEP); 6286 6287 namp = anonmap_alloc(nseg->s_size, 0, ANON_SLEEP); 6288 namp->a_szc = nseg->s_szc; 6289 (void) anon_copy_ptr(oamp->ahp, 6290 svd->anon_index + btop(seg->s_size), 6291 namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP); 6292 anon_release(oamp->ahp, btop(oamp->size)); 6293 oamp->ahp = nahp; 6294 oamp->size = seg->s_size; 6295 svd->anon_index = 0; 6296 nsvd->amp = namp; 6297 nsvd->anon_index = 0; 6298 ANON_LOCK_EXIT(&oamp->a_rwlock); 6299 } else if (svd->amp != NULL) { 6300 pgcnt_t pgcnt = page_get_pagecnt(seg->s_szc); 6301 ASSERT(svd->amp == nsvd->amp); 6302 ASSERT(seg->s_szc <= svd->amp->a_szc); 6303 nsvd->anon_index = svd->anon_index + seg_pages(seg); 6304 ASSERT(IS_P2ALIGNED(nsvd->anon_index, pgcnt)); 6305 ANON_LOCK_ENTER(&svd->amp->a_rwlock, RW_WRITER); 6306 svd->amp->refcnt++; 6307 ANON_LOCK_EXIT(&svd->amp->a_rwlock); 6308 } 6309 6310 /* 6311 * Split amount of swap reserve 6312 */ 6313 if (svd->swresv) { 6314 /* 6315 * For MAP_NORESERVE, only allocate 
swap reserve for pages 6316 * being used. Other segments get enough to cover whole 6317 * segment. 6318 */ 6319 if (svd->flags & MAP_NORESERVE) { 6320 size_t oswresv; 6321 6322 ASSERT(svd->amp); 6323 oswresv = svd->swresv; 6324 svd->swresv = ptob(anon_pages(svd->amp->ahp, 6325 svd->anon_index, btop(seg->s_size))); 6326 nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp, 6327 nsvd->anon_index, btop(nseg->s_size))); 6328 ASSERT(oswresv >= (svd->swresv + nsvd->swresv)); 6329 } else { 6330 ASSERT(svd->swresv == seg->s_size + nseg->s_size); 6331 svd->swresv = seg->s_size; 6332 nsvd->swresv = nseg->s_size; 6333 } 6334 } 6335 6336 return (nseg); 6337 } 6338 6339 /* 6340 * called on memory operations (unmap, setprot, setpagesize) for a subset 6341 * of a large page segment to either demote the memory range (SDR_RANGE) 6342 * or the ends (SDR_END) by addr/len. 6343 * 6344 * returns 0 on success. returns errno, including ENOMEM, on failure. 6345 */ 6346 static int 6347 segvn_demote_range( 6348 struct seg *seg, 6349 caddr_t addr, 6350 size_t len, 6351 int flag, 6352 uint_t szcvec) 6353 { 6354 caddr_t eaddr = addr + len; 6355 caddr_t lpgaddr, lpgeaddr; 6356 struct seg *nseg; 6357 struct seg *badseg1 = NULL; 6358 struct seg *badseg2 = NULL; 6359 size_t pgsz; 6360 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6361 int err; 6362 uint_t szc = seg->s_szc; 6363 uint_t tszcvec; 6364 6365 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 6366 ASSERT(svd->tr_state == SEGVN_TR_OFF); 6367 ASSERT(szc != 0); 6368 pgsz = page_get_pagesize(szc); 6369 ASSERT(seg->s_base != addr || seg->s_size != len); 6370 ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size); 6371 ASSERT(svd->softlockcnt == 0); 6372 ASSERT(szcvec == 0 || (flag == SDR_END && svd->type == MAP_SHARED)); 6373 6374 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 6375 ASSERT(flag == SDR_RANGE || eaddr < lpgeaddr || addr > lpgaddr); 6376 if (flag == SDR_RANGE) { 6377 /* demote entire range */ 6378 badseg1 = nseg = segvn_split_seg(seg, lpgaddr); 6379 (void) segvn_split_seg(nseg, lpgeaddr); 6380 ASSERT(badseg1->s_base == lpgaddr); 6381 ASSERT(badseg1->s_size == lpgeaddr - lpgaddr); 6382 } else if (addr != lpgaddr) { 6383 ASSERT(flag == SDR_END); 6384 badseg1 = nseg = segvn_split_seg(seg, lpgaddr); 6385 if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz && 6386 eaddr < lpgaddr + 2 * pgsz) { 6387 (void) segvn_split_seg(nseg, lpgeaddr); 6388 ASSERT(badseg1->s_base == lpgaddr); 6389 ASSERT(badseg1->s_size == 2 * pgsz); 6390 } else { 6391 nseg = segvn_split_seg(nseg, lpgaddr + pgsz); 6392 ASSERT(badseg1->s_base == lpgaddr); 6393 ASSERT(badseg1->s_size == pgsz); 6394 if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz) { 6395 ASSERT(lpgeaddr - lpgaddr > 2 * pgsz); 6396 nseg = segvn_split_seg(nseg, lpgeaddr - pgsz); 6397 badseg2 = nseg; 6398 (void) segvn_split_seg(nseg, lpgeaddr); 6399 ASSERT(badseg2->s_base == lpgeaddr - pgsz); 6400 ASSERT(badseg2->s_size == pgsz); 6401 } 6402 } 6403 } else { 6404 ASSERT(flag == SDR_END); 6405 ASSERT(eaddr < lpgeaddr); 6406 badseg1 = nseg = segvn_split_seg(seg, lpgeaddr - pgsz); 6407 (void) segvn_split_seg(nseg, lpgeaddr); 6408 ASSERT(badseg1->s_base == lpgeaddr - pgsz); 6409 ASSERT(badseg1->s_size == pgsz); 6410 } 6411 6412 ASSERT(badseg1 != NULL); 6413 ASSERT(badseg1->s_szc == szc); 6414 ASSERT(flag == SDR_RANGE || badseg1->s_size == pgsz || 6415 badseg1->s_size == 2 * pgsz); 6416 ASSERT(sameprot(badseg1, badseg1->s_base, pgsz)); 6417 ASSERT(badseg1->s_size == pgsz || 6418 sameprot(badseg1, 
badseg1->s_base + pgsz, pgsz)); 6419 if (err = segvn_clrszc(badseg1)) { 6420 return (err); 6421 } 6422 ASSERT(badseg1->s_szc == 0); 6423 6424 if (szc > 1 && (tszcvec = P2PHASE(szcvec, 1 << szc)) > 1) { 6425 uint_t tszc = highbit(tszcvec) - 1; 6426 caddr_t ta = MAX(addr, badseg1->s_base); 6427 caddr_t te; 6428 size_t tpgsz = page_get_pagesize(tszc); 6429 6430 ASSERT(svd->type == MAP_SHARED); 6431 ASSERT(flag == SDR_END); 6432 ASSERT(tszc < szc && tszc > 0); 6433 6434 if (eaddr > badseg1->s_base + badseg1->s_size) { 6435 te = badseg1->s_base + badseg1->s_size; 6436 } else { 6437 te = eaddr; 6438 } 6439 6440 ASSERT(ta <= te); 6441 badseg1->s_szc = tszc; 6442 if (!IS_P2ALIGNED(ta, tpgsz) || !IS_P2ALIGNED(te, tpgsz)) { 6443 if (badseg2 != NULL) { 6444 err = segvn_demote_range(badseg1, ta, te - ta, 6445 SDR_END, tszcvec); 6446 if (err != 0) { 6447 return (err); 6448 } 6449 } else { 6450 return (segvn_demote_range(badseg1, ta, 6451 te - ta, SDR_END, tszcvec)); 6452 } 6453 } 6454 } 6455 6456 if (badseg2 == NULL) 6457 return (0); 6458 ASSERT(badseg2->s_szc == szc); 6459 ASSERT(badseg2->s_size == pgsz); 6460 ASSERT(sameprot(badseg2, badseg2->s_base, badseg2->s_size)); 6461 if (err = segvn_clrszc(badseg2)) { 6462 return (err); 6463 } 6464 ASSERT(badseg2->s_szc == 0); 6465 6466 if (szc > 1 && (tszcvec = P2PHASE(szcvec, 1 << szc)) > 1) { 6467 uint_t tszc = highbit(tszcvec) - 1; 6468 size_t tpgsz = page_get_pagesize(tszc); 6469 6470 ASSERT(svd->type == MAP_SHARED); 6471 ASSERT(flag == SDR_END); 6472 ASSERT(tszc < szc && tszc > 0); 6473 ASSERT(badseg2->s_base > addr); 6474 ASSERT(eaddr > badseg2->s_base); 6475 ASSERT(eaddr < badseg2->s_base + badseg2->s_size); 6476 6477 badseg2->s_szc = tszc; 6478 if (!IS_P2ALIGNED(eaddr, tpgsz)) { 6479 return (segvn_demote_range(badseg2, badseg2->s_base, 6480 eaddr - badseg2->s_base, SDR_END, tszcvec)); 6481 } 6482 } 6483 6484 return (0); 6485 } 6486 6487 static int 6488 segvn_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) 6489 { 6490 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6491 struct vpage *vp, *evp; 6492 6493 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6494 6495 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6496 /* 6497 * If segment protection can be used, simply check against them. 6498 */ 6499 if (svd->pageprot == 0) { 6500 int err; 6501 6502 err = ((svd->prot & prot) != prot) ? EACCES : 0; 6503 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6504 return (err); 6505 } 6506 6507 /* 6508 * Have to check down to the vpage level. 
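 *
 * Illustrative userland sketch (not from the original source):
 * per-page protections normally appear after an mprotect(2) that
 * covers only part of a mapping, e.g.
 *
 *	long pgsz = sysconf(_SC_PAGESIZE);
 *	char *base = mmap(NULL, 8 * pgsz, PROT_READ | PROT_WRITE,
 *	    MAP_PRIVATE | MAP_ANON, -1, 0);
 *	(void) mprotect(base + pgsz, pgsz, PROT_READ);
 *
 * That sets svd->pageprot through segvn_setprot()/segvn_vpage(), so a
 * later SEGOP_CHECKPROT() of the range has to walk the vpage array
 * below instead of testing the single segment-wide svd->prot.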
6509 */ 6510 evp = &svd->vpage[seg_page(seg, addr + len)]; 6511 for (vp = &svd->vpage[seg_page(seg, addr)]; vp < evp; vp++) { 6512 if ((VPP_PROT(vp) & prot) != prot) { 6513 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6514 return (EACCES); 6515 } 6516 } 6517 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6518 return (0); 6519 } 6520 6521 static int 6522 segvn_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv) 6523 { 6524 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6525 size_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1; 6526 6527 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6528 6529 if (pgno != 0) { 6530 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6531 if (svd->pageprot == 0) { 6532 do 6533 protv[--pgno] = svd->prot; 6534 while (pgno != 0); 6535 } else { 6536 size_t pgoff = seg_page(seg, addr); 6537 6538 do { 6539 pgno--; 6540 protv[pgno] = VPP_PROT(&svd->vpage[pgno+pgoff]); 6541 } while (pgno != 0); 6542 } 6543 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6544 } 6545 return (0); 6546 } 6547 6548 static u_offset_t 6549 segvn_getoffset(struct seg *seg, caddr_t addr) 6550 { 6551 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6552 6553 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6554 6555 return (svd->offset + (uintptr_t)(addr - seg->s_base)); 6556 } 6557 6558 /*ARGSUSED*/ 6559 static int 6560 segvn_gettype(struct seg *seg, caddr_t addr) 6561 { 6562 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6563 6564 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6565 6566 return (svd->type | (svd->flags & (MAP_NORESERVE | MAP_TEXT | 6567 MAP_INITDATA))); 6568 } 6569 6570 /*ARGSUSED*/ 6571 static int 6572 segvn_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp) 6573 { 6574 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6575 6576 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6577 6578 *vpp = svd->vp; 6579 return (0); 6580 } 6581 6582 /* 6583 * Check to see if it makes sense to do kluster/read ahead to 6584 * addr + delta relative to the mapping at addr. We assume here 6585 * that delta is a signed PAGESIZE'd multiple (which can be negative). 6586 * 6587 * For segvn, we currently "approve" of the action if we are 6588 * still in the segment and it maps from the same vp/off, 6589 * or if the advice stored in segvn_data or vpages allows it. 6590 * Currently, klustering is not allowed only if MADV_RANDOM is set. 6591 */ 6592 static int 6593 segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta) 6594 { 6595 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6596 struct anon *oap, *ap; 6597 ssize_t pd; 6598 size_t page; 6599 struct vnode *vp1, *vp2; 6600 u_offset_t off1, off2; 6601 struct anon_map *amp; 6602 6603 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6604 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) || 6605 SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 6606 6607 if (addr + delta < seg->s_base || 6608 addr + delta >= (seg->s_base + seg->s_size)) 6609 return (-1); /* exceeded segment bounds */ 6610 6611 pd = delta / (ssize_t)PAGESIZE; /* divide to preserve sign bit */ 6612 page = seg_page(seg, addr); 6613 6614 /* 6615 * Check to see if either of the pages addr or addr + delta 6616 * have advice set that prevents klustering (if MADV_RANDOM advice 6617 * is set for entire segment, or MADV_SEQUENTIAL is set and delta 6618 * is negative). 
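 *
 * Illustrative sketch (not from the original source): the advice
 * tested here is whatever madvise(3C) stored earlier through
 * segvn_advise(), e.g.
 *
 *	(void) madvise(base, len, MADV_RANDOM);
 *
 * after which this routine returns -1 for the mapping and
 * VOP_GETPAGE() klustering/read-ahead is effectively suppressed.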
6619 */ 6620 if (svd->advice == MADV_RANDOM || 6621 svd->advice == MADV_SEQUENTIAL && delta < 0) 6622 return (-1); 6623 else if (svd->pageadvice && svd->vpage) { 6624 struct vpage *bvpp, *evpp; 6625 6626 bvpp = &svd->vpage[page]; 6627 evpp = &svd->vpage[page + pd]; 6628 if (VPP_ADVICE(bvpp) == MADV_RANDOM || 6629 VPP_ADVICE(evpp) == MADV_SEQUENTIAL && delta < 0) 6630 return (-1); 6631 if (VPP_ADVICE(bvpp) != VPP_ADVICE(evpp) && 6632 VPP_ADVICE(evpp) == MADV_RANDOM) 6633 return (-1); 6634 } 6635 6636 if (svd->type == MAP_SHARED) 6637 return (0); /* shared mapping - all ok */ 6638 6639 if ((amp = svd->amp) == NULL) 6640 return (0); /* off original vnode */ 6641 6642 page += svd->anon_index; 6643 6644 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 6645 6646 oap = anon_get_ptr(amp->ahp, page); 6647 ap = anon_get_ptr(amp->ahp, page + pd); 6648 6649 ANON_LOCK_EXIT(&->a_rwlock); 6650 6651 if ((oap == NULL && ap != NULL) || (oap != NULL && ap == NULL)) { 6652 return (-1); /* one with and one without an anon */ 6653 } 6654 6655 if (oap == NULL) { /* implies that ap == NULL */ 6656 return (0); /* off original vnode */ 6657 } 6658 6659 /* 6660 * Now we know we have two anon pointers - check to 6661 * see if they happen to be properly allocated. 6662 */ 6663 6664 /* 6665 * XXX We cheat here and don't lock the anon slots. We can't because 6666 * we may have been called from the anon layer which might already 6667 * have locked them. We are holding a refcnt on the slots so they 6668 * can't disappear. The worst that will happen is we'll get the wrong 6669 * names (vp, off) for the slots and make a poor klustering decision. 6670 */ 6671 swap_xlate(ap, &vp1, &off1); 6672 swap_xlate(oap, &vp2, &off2); 6673 6674 6675 if (!VOP_CMP(vp1, vp2) || off1 - off2 != delta) 6676 return (-1); 6677 return (0); 6678 } 6679 6680 /* 6681 * Swap the pages of seg out to secondary storage, returning the 6682 * number of bytes of storage freed. 6683 * 6684 * The basic idea is first to unload all translations and then to call 6685 * VOP_PUTPAGE() for all newly-unmapped pages, to push them out to the 6686 * swap device. Pages to which other segments have mappings will remain 6687 * mapped and won't be swapped. Our caller (as_swapout) has already 6688 * performed the unloading step. 6689 * 6690 * The value returned is intended to correlate well with the process's 6691 * memory requirements. However, there are some caveats: 6692 * 1) When given a shared segment as argument, this routine will 6693 * only succeed in swapping out pages for the last sharer of the 6694 * segment. (Previous callers will only have decremented mapping 6695 * reference counts.) 6696 * 2) We assume that the hat layer maintains a large enough translation 6697 * cache to capture process reference patterns. 6698 */ 6699 static size_t 6700 segvn_swapout(struct seg *seg) 6701 { 6702 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6703 struct anon_map *amp; 6704 pgcnt_t pgcnt = 0; 6705 pgcnt_t npages; 6706 pgcnt_t page; 6707 ulong_t anon_index; 6708 6709 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6710 6711 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6712 /* 6713 * Find pages unmapped by our caller and force them 6714 * out to the virtual swap device. 
6715 */ 6716 if ((amp = svd->amp) != NULL) 6717 anon_index = svd->anon_index; 6718 npages = seg->s_size >> PAGESHIFT; 6719 for (page = 0; page < npages; page++) { 6720 page_t *pp; 6721 struct anon *ap; 6722 struct vnode *vp; 6723 u_offset_t off; 6724 anon_sync_obj_t cookie; 6725 6726 /* 6727 * Obtain <vp, off> pair for the page, then look it up. 6728 * 6729 * Note that this code is willing to consider regular 6730 * pages as well as anon pages. Is this appropriate here? 6731 */ 6732 ap = NULL; 6733 if (amp != NULL) { 6734 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 6735 if (anon_array_try_enter(amp, anon_index + page, 6736 &cookie)) { 6737 ANON_LOCK_EXIT(&->a_rwlock); 6738 continue; 6739 } 6740 ap = anon_get_ptr(amp->ahp, anon_index + page); 6741 if (ap != NULL) { 6742 swap_xlate(ap, &vp, &off); 6743 } else { 6744 vp = svd->vp; 6745 off = svd->offset + ptob(page); 6746 } 6747 anon_array_exit(&cookie); 6748 ANON_LOCK_EXIT(&->a_rwlock); 6749 } else { 6750 vp = svd->vp; 6751 off = svd->offset + ptob(page); 6752 } 6753 if (vp == NULL) { /* untouched zfod page */ 6754 ASSERT(ap == NULL); 6755 continue; 6756 } 6757 6758 pp = page_lookup_nowait(vp, off, SE_SHARED); 6759 if (pp == NULL) 6760 continue; 6761 6762 6763 /* 6764 * Examine the page to see whether it can be tossed out, 6765 * keeping track of how many we've found. 6766 */ 6767 if (!page_tryupgrade(pp)) { 6768 /* 6769 * If the page has an i/o lock and no mappings, 6770 * it's very likely that the page is being 6771 * written out as a result of klustering. 6772 * Assume this is so and take credit for it here. 6773 */ 6774 if (!page_io_trylock(pp)) { 6775 if (!hat_page_is_mapped(pp)) 6776 pgcnt++; 6777 } else { 6778 page_io_unlock(pp); 6779 } 6780 page_unlock(pp); 6781 continue; 6782 } 6783 ASSERT(!page_iolock_assert(pp)); 6784 6785 6786 /* 6787 * Skip if page is locked or has mappings. 6788 * We don't need the page_struct_lock to look at lckcnt 6789 * and cowcnt because the page is exclusive locked. 6790 */ 6791 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0 || 6792 hat_page_is_mapped(pp)) { 6793 page_unlock(pp); 6794 continue; 6795 } 6796 6797 /* 6798 * dispose skips large pages so try to demote first. 6799 */ 6800 if (pp->p_szc != 0 && !page_try_demote_pages(pp)) { 6801 page_unlock(pp); 6802 /* 6803 * XXX should skip the remaining page_t's of this 6804 * large page. 6805 */ 6806 continue; 6807 } 6808 6809 ASSERT(pp->p_szc == 0); 6810 6811 /* 6812 * No longer mapped -- we can toss it out. How 6813 * we do so depends on whether or not it's dirty. 6814 */ 6815 if (hat_ismod(pp) && pp->p_vnode) { 6816 /* 6817 * We must clean the page before it can be 6818 * freed. Setting B_FREE will cause pvn_done 6819 * to free the page when the i/o completes. 6820 * XXX: This also causes it to be accounted 6821 * as a pageout instead of a swap: need 6822 * B_SWAPOUT bit to use instead of B_FREE. 6823 * 6824 * Hold the vnode before releasing the page lock 6825 * to prevent it from being freed and re-used by 6826 * some other thread. 6827 */ 6828 VN_HOLD(vp); 6829 page_unlock(pp); 6830 6831 /* 6832 * Queue all i/o requests for the pageout thread 6833 * to avoid saturating the pageout devices. 6834 */ 6835 if (!queue_io_request(vp, off)) 6836 VN_RELE(vp); 6837 } else { 6838 /* 6839 * The page was clean, free it. 6840 * 6841 * XXX: Can we ever encounter modified pages 6842 * with no associated vnode here? 
6843 */ 6844 ASSERT(pp->p_vnode != NULL); 6845 /*LINTED: constant in conditional context*/ 6846 VN_DISPOSE(pp, B_FREE, 0, kcred); 6847 } 6848 6849 /* 6850 * Credit now even if i/o is in progress. 6851 */ 6852 pgcnt++; 6853 } 6854 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6855 6856 /* 6857 * Wakeup pageout to initiate i/o on all queued requests. 6858 */ 6859 cv_signal_pageout(); 6860 return (ptob(pgcnt)); 6861 } 6862 6863 /* 6864 * Synchronize primary storage cache with real object in virtual memory. 6865 * 6866 * XXX - Anonymous pages should not be sync'ed out at all. 6867 */ 6868 static int 6869 segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) 6870 { 6871 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6872 struct vpage *vpp; 6873 page_t *pp; 6874 u_offset_t offset; 6875 struct vnode *vp; 6876 u_offset_t off; 6877 caddr_t eaddr; 6878 int bflags; 6879 int err = 0; 6880 int segtype; 6881 int pageprot; 6882 int prot; 6883 ulong_t anon_index; 6884 struct anon_map *amp; 6885 struct anon *ap; 6886 anon_sync_obj_t cookie; 6887 6888 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6889 6890 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6891 6892 if (svd->softlockcnt > 0) { 6893 /* 6894 * flush all pages from seg cache 6895 * otherwise we may deadlock in swap_putpage 6896 * for B_INVAL page (4175402). 6897 * 6898 * Even if we grab segvn WRITER's lock or segp_slock 6899 * here, there might be another thread which could've 6900 * successfully performed lookup/insert just before 6901 * we acquired the lock here. So, grabbing either 6902 * lock here is of not much use. Until we devise 6903 * a strategy at upper layers to solve the 6904 * synchronization issues completely, we expect 6905 * applications to handle this appropriately. 6906 */ 6907 segvn_purge(seg); 6908 if (svd->softlockcnt > 0) { 6909 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6910 return (EAGAIN); 6911 } 6912 } 6913 6914 vpp = svd->vpage; 6915 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 6916 bflags = ((flags & MS_ASYNC) ? B_ASYNC : 0) | 6917 ((flags & MS_INVALIDATE) ? B_INVAL : 0); 6918 6919 if (attr) { 6920 pageprot = attr & ~(SHARED|PRIVATE); 6921 segtype = (attr & SHARED) ? MAP_SHARED : MAP_PRIVATE; 6922 6923 /* 6924 * We are done if the segment types don't match 6925 * or if we have segment level protections and 6926 * they don't match. 6927 */ 6928 if (svd->type != segtype) { 6929 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6930 return (0); 6931 } 6932 if (vpp == NULL) { 6933 if (svd->prot != pageprot) { 6934 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6935 return (0); 6936 } 6937 prot = svd->prot; 6938 } else 6939 vpp = &svd->vpage[seg_page(seg, addr)]; 6940 6941 } else if (svd->vp && svd->amp == NULL && 6942 (flags & MS_INVALIDATE) == 0) { 6943 6944 /* 6945 * No attributes, no anonymous pages and MS_INVALIDATE flag 6946 * is not on, just use one big request. 
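 *
 * Illustrative userland sketch (not from the original source; base,
 * len and fd are placeholders): this fast path is what a plain
 * msync(3C) of a file mapping typically exercises:
 *
 *	char *base = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *	    MAP_SHARED, fd, 0);
 *	base[0] = 1;				(dirty a page)
 *	(void) msync(base, len, MS_SYNC);	(push it to the file)
 *
 * MS_ASYNC and MS_INVALIDATE were translated to B_ASYNC and B_INVAL
 * in bflags above before the pages are pushed with VOP_PUTPAGE().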
6947 */ 6948 err = VOP_PUTPAGE(svd->vp, (offset_t)offset, len, 6949 bflags, svd->cred); 6950 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6951 return (err); 6952 } 6953 6954 if ((amp = svd->amp) != NULL) 6955 anon_index = svd->anon_index + seg_page(seg, addr); 6956 6957 for (eaddr = addr + len; addr < eaddr; addr += PAGESIZE) { 6958 ap = NULL; 6959 if (amp != NULL) { 6960 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 6961 anon_array_enter(amp, anon_index, &cookie); 6962 ap = anon_get_ptr(amp->ahp, anon_index++); 6963 if (ap != NULL) { 6964 swap_xlate(ap, &vp, &off); 6965 } else { 6966 vp = svd->vp; 6967 off = offset; 6968 } 6969 anon_array_exit(&cookie); 6970 ANON_LOCK_EXIT(&->a_rwlock); 6971 } else { 6972 vp = svd->vp; 6973 off = offset; 6974 } 6975 offset += PAGESIZE; 6976 6977 if (vp == NULL) /* untouched zfod page */ 6978 continue; 6979 6980 if (attr) { 6981 if (vpp) { 6982 prot = VPP_PROT(vpp); 6983 vpp++; 6984 } 6985 if (prot != pageprot) { 6986 continue; 6987 } 6988 } 6989 6990 /* 6991 * See if any of these pages are locked -- if so, then we 6992 * will have to truncate an invalidate request at the first 6993 * locked one. We don't need the page_struct_lock to test 6994 * as this is only advisory; even if we acquire it someone 6995 * might race in and lock the page after we unlock and before 6996 * we do the PUTPAGE, then PUTPAGE simply does nothing. 6997 */ 6998 if (flags & MS_INVALIDATE) { 6999 if ((pp = page_lookup(vp, off, SE_SHARED)) != NULL) { 7000 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 7001 page_unlock(pp); 7002 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7003 return (EBUSY); 7004 } 7005 if (ap != NULL && pp->p_szc != 0 && 7006 page_tryupgrade(pp)) { 7007 if (pp->p_lckcnt == 0 && 7008 pp->p_cowcnt == 0) { 7009 /* 7010 * swapfs VN_DISPOSE() won't 7011 * invalidate large pages. 7012 * Attempt to demote. 7013 * XXX can't help it if it 7014 * fails. But for swapfs 7015 * pages it is no big deal. 7016 */ 7017 (void) page_try_demote_pages( 7018 pp); 7019 } 7020 } 7021 page_unlock(pp); 7022 } 7023 } else if (svd->type == MAP_SHARED && amp != NULL) { 7024 /* 7025 * Avoid writting out to disk ISM's large pages 7026 * because segspt_free_pages() relies on NULL an_pvp 7027 * of anon slots of such pages. 7028 */ 7029 7030 ASSERT(svd->vp == NULL); 7031 /* 7032 * swapfs uses page_lookup_nowait if not freeing or 7033 * invalidating and skips a page if 7034 * page_lookup_nowait returns NULL. 7035 */ 7036 pp = page_lookup_nowait(vp, off, SE_SHARED); 7037 if (pp == NULL) { 7038 continue; 7039 } 7040 if (pp->p_szc != 0) { 7041 page_unlock(pp); 7042 continue; 7043 } 7044 7045 /* 7046 * Note ISM pages are created large so (vp, off)'s 7047 * page cannot suddenly become large after we unlock 7048 * pp. 7049 */ 7050 page_unlock(pp); 7051 } 7052 /* 7053 * XXX - Should ultimately try to kluster 7054 * calls to VOP_PUTPAGE() for performance. 7055 */ 7056 VN_HOLD(vp); 7057 err = VOP_PUTPAGE(vp, (offset_t)off, PAGESIZE, 7058 bflags, svd->cred); 7059 VN_RELE(vp); 7060 if (err) 7061 break; 7062 } 7063 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7064 return (err); 7065 } 7066 7067 /* 7068 * Determine if we have data corresponding to pages in the 7069 * primary storage virtual memory cache (i.e., "in core"). 
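 *
 * Illustrative sketch (not from the original source): this routine is
 * the segment-driver side of mincore(2)-style residency queries, e.g.
 *
 *	char *vec = malloc((len + pgsz - 1) / pgsz);
 *	if (mincore(base, len, vec) == 0 && (vec[0] & 1))
 *		... the first page is resident ...
 *
 * The per-page bytes built below carry the richer SEG_PAGE_* flags
 * (INCORE, ANON, VNODE, SOFTLOCK, HASCOW, LOCKED); mincore(2) itself
 * only guarantees the low-order residency bit to applications.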
7070 */ 7071 static size_t 7072 segvn_incore(struct seg *seg, caddr_t addr, size_t len, char *vec) 7073 { 7074 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7075 struct vnode *vp, *avp; 7076 u_offset_t offset, aoffset; 7077 size_t p, ep; 7078 int ret; 7079 struct vpage *vpp; 7080 page_t *pp; 7081 uint_t start; 7082 struct anon_map *amp; /* XXX - for locknest */ 7083 struct anon *ap; 7084 uint_t attr; 7085 anon_sync_obj_t cookie; 7086 7087 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 7088 7089 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7090 if (svd->amp == NULL && svd->vp == NULL) { 7091 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7092 bzero(vec, btopr(len)); 7093 return (len); /* no anonymous pages created yet */ 7094 } 7095 7096 p = seg_page(seg, addr); 7097 ep = seg_page(seg, addr + len); 7098 start = svd->vp ? SEG_PAGE_VNODEBACKED : 0; 7099 7100 amp = svd->amp; 7101 for (; p < ep; p++, addr += PAGESIZE) { 7102 vpp = (svd->vpage) ? &svd->vpage[p]: NULL; 7103 ret = start; 7104 ap = NULL; 7105 avp = NULL; 7106 /* Grab the vnode/offset for the anon slot */ 7107 if (amp != NULL) { 7108 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7109 anon_array_enter(amp, svd->anon_index + p, &cookie); 7110 ap = anon_get_ptr(amp->ahp, svd->anon_index + p); 7111 if (ap != NULL) { 7112 swap_xlate(ap, &avp, &aoffset); 7113 } 7114 anon_array_exit(&cookie); 7115 ANON_LOCK_EXIT(&->a_rwlock); 7116 } 7117 if ((avp != NULL) && page_exists(avp, aoffset)) { 7118 /* A page exists for the anon slot */ 7119 ret |= SEG_PAGE_INCORE; 7120 7121 /* 7122 * If page is mapped and writable 7123 */ 7124 attr = (uint_t)0; 7125 if ((hat_getattr(seg->s_as->a_hat, addr, 7126 &attr) != -1) && (attr & PROT_WRITE)) { 7127 ret |= SEG_PAGE_ANON; 7128 } 7129 /* 7130 * Don't get page_struct lock for lckcnt and cowcnt, 7131 * since this is purely advisory. 7132 */ 7133 if ((pp = page_lookup_nowait(avp, aoffset, 7134 SE_SHARED)) != NULL) { 7135 if (pp->p_lckcnt) 7136 ret |= SEG_PAGE_SOFTLOCK; 7137 if (pp->p_cowcnt) 7138 ret |= SEG_PAGE_HASCOW; 7139 page_unlock(pp); 7140 } 7141 } 7142 7143 /* Gather vnode statistics */ 7144 vp = svd->vp; 7145 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 7146 7147 if (vp != NULL) { 7148 /* 7149 * Try to obtain a "shared" lock on the page 7150 * without blocking. If this fails, determine 7151 * if the page is in memory. 7152 */ 7153 pp = page_lookup_nowait(vp, offset, SE_SHARED); 7154 if ((pp == NULL) && (page_exists(vp, offset))) { 7155 /* Page is incore, and is named */ 7156 ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE); 7157 } 7158 /* 7159 * Don't get page_struct lock for lckcnt and cowcnt, 7160 * since this is purely advisory. 7161 */ 7162 if (pp != NULL) { 7163 ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE); 7164 if (pp->p_lckcnt) 7165 ret |= SEG_PAGE_SOFTLOCK; 7166 if (pp->p_cowcnt) 7167 ret |= SEG_PAGE_HASCOW; 7168 page_unlock(pp); 7169 } 7170 } 7171 7172 /* Gather virtual page information */ 7173 if (vpp) { 7174 if (VPP_ISPPLOCK(vpp)) 7175 ret |= SEG_PAGE_LOCKED; 7176 vpp++; 7177 } 7178 7179 *vec++ = (char)ret; 7180 } 7181 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7182 return (len); 7183 } 7184 7185 /* 7186 * Statement for p_cowcnts/p_lckcnts. 
7187 * 7188 * p_cowcnt is updated while mlock/munlocking MAP_PRIVATE and PROT_WRITE region 7189 * irrespective of the following factors or anything else: 7190 * 7191 * (1) anon slots are populated or not 7192 * (2) cow is broken or not 7193 * (3) refcnt on ap is 1 or greater than 1 7194 * 7195 * If it's not MAP_PRIVATE and PROT_WRITE, p_lckcnt is updated during mlock 7196 * and munlock. 7197 * 7198 * 7199 * Handling p_cowcnts/p_lckcnts during copy-on-write fault: 7200 * 7201 * if vpage has PROT_WRITE 7202 * transfer cowcnt on the oldpage -> cowcnt on the newpage 7203 * else 7204 * transfer lckcnt on the oldpage -> lckcnt on the newpage 7205 * 7206 * During copy-on-write, decrement p_cowcnt on the oldpage and increment 7207 * p_cowcnt on the newpage *if* the corresponding vpage has PROT_WRITE. 7208 * 7209 * We may also break COW if softlocking on read access in the physio case. 7210 * In this case, vpage may not have PROT_WRITE. So, we need to decrement 7211 * p_lckcnt on the oldpage and increment p_lckcnt on the newpage *if* the 7212 * vpage doesn't have PROT_WRITE. 7213 * 7214 * 7215 * Handling p_cowcnts/p_lckcnts during mprotect on mlocked region: 7216 * 7217 * If a MAP_PRIVATE region loses PROT_WRITE, we decrement p_cowcnt and 7218 * increment p_lckcnt by calling page_subclaim() which takes care of 7219 * availrmem accounting and p_lckcnt overflow. 7220 * 7221 * If a MAP_PRIVATE region gains PROT_WRITE, we decrement p_lckcnt and 7222 * increment p_cowcnt by calling page_addclaim() which takes care of 7223 * availrmem availability and p_cowcnt overflow. 7224 */ 7225 7226 /* 7227 * Lock down (or unlock) pages mapped by this segment. 7228 * 7229 * XXX only creates PAGESIZE pages if anon slots are not initialized. 7230 * At fault time they will be relocated into larger pages. 7231 */ 7232 static int 7233 segvn_lockop(struct seg *seg, caddr_t addr, size_t len, 7234 int attr, int op, ulong_t *lockmap, size_t pos) 7235 { 7236 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7237 struct vpage *vpp; 7238 struct vpage *evp; 7239 page_t *pp; 7240 u_offset_t offset; 7241 u_offset_t off; 7242 int segtype; 7243 int pageprot; 7244 int claim; 7245 struct vnode *vp; 7246 ulong_t anon_index; 7247 struct anon_map *amp; 7248 struct anon *ap; 7249 struct vattr va; 7250 anon_sync_obj_t cookie; 7251 struct kshmid *sp = NULL; 7252 struct proc *p = curproc; 7253 kproject_t *proj = NULL; 7254 int chargeproc = 1; 7255 size_t locked_bytes = 0; 7256 size_t unlocked_bytes = 0; 7257 int err = 0; 7258 7259 /* 7260 * Hold write lock on address space because may split or concatenate 7261 * segments 7262 */ 7263 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 7264 7265 /* 7266 * If this is a shm, use shm's project and zone, else use 7267 * project and zone of calling process 7268 */ 7269 7270 /* Determine if this segment backs a sysV shm */ 7271 if (svd->amp != NULL && svd->amp->a_sp != NULL) { 7272 ASSERT(svd->type == MAP_SHARED); 7273 ASSERT(svd->tr_state == SEGVN_TR_OFF); 7274 sp = svd->amp->a_sp; 7275 proj = sp->shm_perm.ipc_proj; 7276 chargeproc = 0; 7277 } 7278 7279 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 7280 if (attr) { 7281 pageprot = attr & ~(SHARED|PRIVATE); 7282 segtype = attr & SHARED ? MAP_SHARED : MAP_PRIVATE; 7283 7284 /* 7285 * We are done if the segment types don't match 7286 * or if we have segment level protections and 7287 * they don't match. 
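 *
 * Illustrative sketch (not from the original source): MC_LOCK and
 * MC_UNLOCK normally arrive here from mlock(3C)/munlock(3C) or from
 * memcntl(2) MC_LOCK/MC_UNLOCK, e.g.
 *
 *	if (mlock(base, len) == 0) {
 *		... pages are faulted in and pp-locked below ...
 *		(void) munlock(base, len);
 *	}
 *
 * A non-zero attr from memcntl(2) (for example SHARED | PROT_READ)
 * restricts the operation to pages whose segment type and per-page
 * protections match, which is what the checks here enforce.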
7288 */ 7289 if (svd->type != segtype) { 7290 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7291 return (0); 7292 } 7293 if (svd->pageprot == 0 && svd->prot != pageprot) { 7294 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7295 return (0); 7296 } 7297 } 7298 7299 if (op == MC_LOCK) { 7300 if (svd->tr_state == SEGVN_TR_INIT) { 7301 svd->tr_state = SEGVN_TR_OFF; 7302 } else if (svd->tr_state == SEGVN_TR_ON) { 7303 ASSERT(svd->amp != NULL); 7304 segvn_textunrepl(seg, 0); 7305 ASSERT(svd->amp == NULL && 7306 svd->tr_state == SEGVN_TR_OFF); 7307 } 7308 } 7309 7310 /* 7311 * If we're locking, then we must create a vpage structure if 7312 * none exists. If we're unlocking, then check to see if there 7313 * is a vpage -- if not, then we could not have locked anything. 7314 */ 7315 7316 if ((vpp = svd->vpage) == NULL) { 7317 if (op == MC_LOCK) 7318 segvn_vpage(seg); 7319 else { 7320 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7321 return (0); 7322 } 7323 } 7324 7325 /* 7326 * The anonymous data vector (i.e., previously 7327 * unreferenced mapping to swap space) can be allocated 7328 * by lazily testing for its existence. 7329 */ 7330 if (op == MC_LOCK && svd->amp == NULL && svd->vp == NULL) { 7331 svd->amp = anonmap_alloc(seg->s_size, 0, ANON_SLEEP); 7332 svd->amp->a_szc = seg->s_szc; 7333 } 7334 7335 if ((amp = svd->amp) != NULL) { 7336 anon_index = svd->anon_index + seg_page(seg, addr); 7337 } 7338 7339 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 7340 evp = &svd->vpage[seg_page(seg, addr + len)]; 7341 7342 if (sp != NULL) 7343 mutex_enter(&sp->shm_mlock); 7344 7345 /* determine number of unlocked bytes in range for lock operation */ 7346 if (op == MC_LOCK) { 7347 7348 if (sp == NULL) { 7349 for (vpp = &svd->vpage[seg_page(seg, addr)]; vpp < evp; 7350 vpp++) { 7351 if (!VPP_ISPPLOCK(vpp)) 7352 unlocked_bytes += PAGESIZE; 7353 } 7354 } else { 7355 ulong_t i_idx, i_edx; 7356 anon_sync_obj_t i_cookie; 7357 struct anon *i_ap; 7358 struct vnode *i_vp; 7359 u_offset_t i_off; 7360 7361 /* Only count sysV pages once for locked memory */ 7362 i_edx = svd->anon_index + seg_page(seg, addr + len); 7363 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7364 for (i_idx = anon_index; i_idx < i_edx; i_idx++) { 7365 anon_array_enter(amp, i_idx, &i_cookie); 7366 i_ap = anon_get_ptr(amp->ahp, i_idx); 7367 if (i_ap == NULL) { 7368 unlocked_bytes += PAGESIZE; 7369 anon_array_exit(&i_cookie); 7370 continue; 7371 } 7372 swap_xlate(i_ap, &i_vp, &i_off); 7373 anon_array_exit(&i_cookie); 7374 pp = page_lookup(i_vp, i_off, SE_SHARED); 7375 if (pp == NULL) { 7376 unlocked_bytes += PAGESIZE; 7377 continue; 7378 } else if (pp->p_lckcnt == 0) 7379 unlocked_bytes += PAGESIZE; 7380 page_unlock(pp); 7381 } 7382 ANON_LOCK_EXIT(&->a_rwlock); 7383 } 7384 7385 mutex_enter(&p->p_lock); 7386 err = rctl_incr_locked_mem(p, proj, unlocked_bytes, 7387 chargeproc); 7388 mutex_exit(&p->p_lock); 7389 7390 if (err) { 7391 if (sp != NULL) 7392 mutex_exit(&sp->shm_mlock); 7393 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7394 return (err); 7395 } 7396 } 7397 /* 7398 * Loop over all pages in the range. Process if we're locking and 7399 * page has not already been locked in this mapping; or if we're 7400 * unlocking and the page has been locked. 
7401 */ 7402 for (vpp = &svd->vpage[seg_page(seg, addr)]; vpp < evp; 7403 vpp++, pos++, addr += PAGESIZE, offset += PAGESIZE, anon_index++) { 7404 if ((attr == 0 || VPP_PROT(vpp) == pageprot) && 7405 ((op == MC_LOCK && !VPP_ISPPLOCK(vpp)) || 7406 (op == MC_UNLOCK && VPP_ISPPLOCK(vpp)))) { 7407 7408 if (amp != NULL) 7409 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER); 7410 /* 7411 * If this isn't a MAP_NORESERVE segment and 7412 * we're locking, allocate anon slots if they 7413 * don't exist. The page is brought in later on. 7414 */ 7415 if (op == MC_LOCK && svd->vp == NULL && 7416 ((svd->flags & MAP_NORESERVE) == 0) && 7417 amp != NULL && 7418 ((ap = anon_get_ptr(amp->ahp, anon_index)) 7419 == NULL)) { 7420 anon_array_enter(amp, anon_index, &cookie); 7421 7422 if ((ap = anon_get_ptr(amp->ahp, 7423 anon_index)) == NULL) { 7424 pp = anon_zero(seg, addr, &ap, 7425 svd->cred); 7426 if (pp == NULL) { 7427 anon_array_exit(&cookie); 7428 ANON_LOCK_EXIT(&amp->a_rwlock); 7429 err = ENOMEM; 7430 goto out; 7431 } 7432 ASSERT(anon_get_ptr(amp->ahp, 7433 anon_index) == NULL); 7434 (void) anon_set_ptr(amp->ahp, 7435 anon_index, ap, ANON_SLEEP); 7436 page_unlock(pp); 7437 } 7438 anon_array_exit(&cookie); 7439 } 7440 7441 /* 7442 * Get name for page, accounting for 7443 * existence of private copy. 7444 */ 7445 ap = NULL; 7446 if (amp != NULL) { 7447 anon_array_enter(amp, anon_index, &cookie); 7448 ap = anon_get_ptr(amp->ahp, anon_index); 7449 if (ap != NULL) { 7450 swap_xlate(ap, &vp, &off); 7451 } else { 7452 if (svd->vp == NULL && 7453 (svd->flags & MAP_NORESERVE)) { 7454 anon_array_exit(&cookie); 7455 ANON_LOCK_EXIT(&amp->a_rwlock); 7456 continue; 7457 } 7458 vp = svd->vp; 7459 off = offset; 7460 } 7461 anon_array_exit(&cookie); 7462 ANON_LOCK_EXIT(&amp->a_rwlock); 7463 } else { 7464 vp = svd->vp; 7465 off = offset; 7466 } 7467 7468 /* 7469 * Get page frame. It's ok if the page is 7470 * not available when we're unlocking, as this 7471 * may simply mean that a page we locked got 7472 * truncated out of existence after we locked it. 7473 * 7474 * Invoke VOP_GETPAGE() to obtain the page struct 7475 * since we may need to read it from disk if it's 7476 * been paged out. 7477 */ 7478 if (op != MC_LOCK) 7479 pp = page_lookup(vp, off, SE_SHARED); 7480 else { 7481 page_t *pl[1 + 1]; 7482 int error; 7483 7484 ASSERT(vp != NULL); 7485 7486 error = VOP_GETPAGE(vp, (offset_t)off, PAGESIZE, 7487 (uint_t *)NULL, pl, PAGESIZE, seg, addr, 7488 S_OTHER, svd->cred); 7489 7490 /* 7491 * If the error is EDEADLK then we must bounce 7492 * up and drop all vm subsystem locks and then 7493 * retry the operation later. 7494 * This behavior is a temporary measure because 7495 * ufs/sds logging is badly designed and will 7496 * deadlock if we don't allow this bounce to 7497 * happen. The real solution is to re-design 7498 * the logging code to work properly. See bug 7499 * 4125102 for details of the problem. 7500 */ 7501 if (error == EDEADLK) { 7502 err = error; 7503 goto out; 7504 } 7505 /* 7506 * Quit if we fail to fault in the page. Treat 7507 * the failure as an error, unless the addr 7508 * is mapped beyond the end of a file.
7509 */ 7510 if (error && svd->vp) { 7511 va.va_mask = AT_SIZE; 7512 if (VOP_GETATTR(svd->vp, &va, 0, 7513 svd->cred) != 0) { 7514 err = EIO; 7515 goto out; 7516 } 7517 if (btopr(va.va_size) >= 7518 btopr(off + 1)) { 7519 err = EIO; 7520 goto out; 7521 } 7522 goto out; 7523 7524 } else if (error) { 7525 err = EIO; 7526 goto out; 7527 } 7528 pp = pl[0]; 7529 ASSERT(pp != NULL); 7530 } 7531 7532 /* 7533 * See Statement at the beginning of this routine. 7534 * 7535 * claim is always set if MAP_PRIVATE and PROT_WRITE 7536 * irrespective of following factors: 7537 * 7538 * (1) anon slots are populated or not 7539 * (2) cow is broken or not 7540 * (3) refcnt on ap is 1 or greater than 1 7541 * 7542 * See 4140683 for details 7543 */ 7544 claim = ((VPP_PROT(vpp) & PROT_WRITE) && 7545 (svd->type == MAP_PRIVATE)); 7546 7547 /* 7548 * Perform page-level operation appropriate to 7549 * operation. If locking, undo the SOFTLOCK 7550 * performed to bring the page into memory 7551 * after setting the lock. If unlocking, 7552 * and no page was found, account for the claim 7553 * separately. 7554 */ 7555 if (op == MC_LOCK) { 7556 int ret = 1; /* Assume success */ 7557 7558 ASSERT(!VPP_ISPPLOCK(vpp)); 7559 7560 ret = page_pp_lock(pp, claim, 0); 7561 if (ret == 0) { 7562 /* locking page failed */ 7563 page_unlock(pp); 7564 err = EAGAIN; 7565 goto out; 7566 } 7567 VPP_SETPPLOCK(vpp); 7568 if (sp != NULL) { 7569 if (pp->p_lckcnt == 1) 7570 locked_bytes += PAGESIZE; 7571 } else 7572 locked_bytes += PAGESIZE; 7573 7574 if (lockmap != (ulong_t *)NULL) 7575 BT_SET(lockmap, pos); 7576 7577 page_unlock(pp); 7578 } else { 7579 ASSERT(VPP_ISPPLOCK(vpp)); 7580 if (pp != NULL) { 7581 /* sysV pages should be locked */ 7582 ASSERT(sp == NULL || pp->p_lckcnt > 0); 7583 page_pp_unlock(pp, claim, 0); 7584 if (sp != NULL) { 7585 if (pp->p_lckcnt == 0) 7586 unlocked_bytes 7587 += PAGESIZE; 7588 } else 7589 unlocked_bytes += PAGESIZE; 7590 page_unlock(pp); 7591 } else { 7592 ASSERT(sp == NULL); 7593 unlocked_bytes += PAGESIZE; 7594 } 7595 VPP_CLRPPLOCK(vpp); 7596 } 7597 } 7598 } 7599 out: 7600 if (op == MC_LOCK) { 7601 /* Credit back bytes that did not get locked */ 7602 if ((unlocked_bytes - locked_bytes) > 0) { 7603 if (proj == NULL) 7604 mutex_enter(&p->p_lock); 7605 rctl_decr_locked_mem(p, proj, 7606 (unlocked_bytes - locked_bytes), chargeproc); 7607 if (proj == NULL) 7608 mutex_exit(&p->p_lock); 7609 } 7610 7611 } else { 7612 /* Account bytes that were unlocked */ 7613 if (unlocked_bytes > 0) { 7614 if (proj == NULL) 7615 mutex_enter(&p->p_lock); 7616 rctl_decr_locked_mem(p, proj, unlocked_bytes, 7617 chargeproc); 7618 if (proj == NULL) 7619 mutex_exit(&p->p_lock); 7620 } 7621 } 7622 if (sp != NULL) 7623 mutex_exit(&sp->shm_mlock); 7624 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7625 7626 return (err); 7627 } 7628 7629 /* 7630 * Set advice from user for specified pages 7631 * There are 5 types of advice: 7632 * MADV_NORMAL - Normal (default) behavior (whatever that is) 7633 * MADV_RANDOM - Random page references 7634 * do not allow readahead or 'klustering' 7635 * MADV_SEQUENTIAL - Sequential page references 7636 * Pages previous to the one currently being 7637 * accessed (determined by fault) are 'not needed' 7638 * and are freed immediately 7639 * MADV_WILLNEED - Pages are likely to be used (fault ahead in mctl) 7640 * MADV_DONTNEED - Pages are not needed (synced out in mctl) 7641 * MADV_FREE - Contents can be discarded 7642 * MADV_ACCESS_DEFAULT- Default access 7643 * MADV_ACCESS_LWP - Next LWP will access heavily 7644 * 
MADV_ACCESS_MANY - Many LWPs or processes will access heavily 7645 */ 7646 static int 7647 segvn_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav) 7648 { 7649 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7650 size_t page; 7651 int err = 0; 7652 int already_set; 7653 struct anon_map *amp; 7654 ulong_t anon_index; 7655 struct seg *next; 7656 lgrp_mem_policy_t policy; 7657 struct seg *prev; 7658 struct vnode *vp; 7659 7660 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 7661 7662 /* 7663 * In case of MADV_FREE, we won't be modifying any segment private 7664 * data structures; so, we only need to grab READER's lock 7665 */ 7666 if (behav != MADV_FREE) { 7667 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 7668 if (svd->tr_state != SEGVN_TR_OFF) { 7669 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7670 return (0); 7671 } 7672 } else { 7673 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7674 } 7675 7676 /* 7677 * Large pages are assumed to be only turned on when accesses to the 7678 * segment's address range have spatial and temporal locality. That 7679 * justifies ignoring MADV_SEQUENTIAL for large page segments. 7680 * Also, ignore advice affecting lgroup memory allocation 7681 * if we don't need to do lgroup optimizations on this system 7682 */ 7683 7684 if ((behav == MADV_SEQUENTIAL && seg->s_szc != 0) || 7685 (!lgrp_optimizations() && (behav == MADV_ACCESS_DEFAULT || 7686 behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY))) { 7687 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7688 return (0); 7689 } 7690 7691 if (behav == MADV_SEQUENTIAL || behav == MADV_ACCESS_DEFAULT || 7692 behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY) { 7693 /* 7694 * Since we are going to unload hat mappings 7695 * we first have to flush the cache. Otherwise 7696 * this might lead to system panic if another 7697 * thread is doing physio on the range whose 7698 * mappings are unloaded by madvise(3C). 7699 */ 7700 if (svd->softlockcnt > 0) { 7701 /* 7702 * Since we do have the segvn writers lock 7703 * nobody can fill the cache with entries 7704 * belonging to this seg during the purge. 7705 * The flush either succeeds or we still 7706 * have pending I/Os. In the latter case, 7707 * madvise(3C) fails. 7708 */ 7709 segvn_purge(seg); 7710 if (svd->softlockcnt > 0) { 7711 /* 7712 * Since madvise(3C) is advisory and 7713 * it's not part of UNIX98, madvise(3C) 7714 * failure here doesn't cause any hardship. 7715 * Note that we don't block in "as" layer. 7716 */ 7717 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7718 return (EAGAIN); 7719 } 7720 } 7721 } 7722 7723 amp = svd->amp; 7724 vp = svd->vp; 7725 if (behav == MADV_FREE) { 7726 /* 7727 * MADV_FREE is not supported for segments with 7728 * underlying object; if anonmap is NULL, anon slots 7729 * are not yet populated and there is nothing for 7730 * us to do. As MADV_FREE is advisory, we don't 7731 * return error in either case. 7732 */ 7733 if (vp != NULL || amp == NULL) { 7734 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7735 return (0); 7736 } 7737 7738 page = seg_page(seg, addr); 7739 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER); 7740 anon_disclaim(amp, svd->anon_index + page, len, 0); 7741 ANON_LOCK_EXIT(&amp->a_rwlock); 7742 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7743 return (0); 7744 } 7745 7746 /* 7747 * If advice is to be applied to entire segment, 7748 * use advice field in seg_data structure 7749 * otherwise use appropriate vpage entry.
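 *
 * Illustrative sketch (not from the original source): the distinction
 * is visible from madvise(3C), e.g.
 *
 *	(void) madvise(base, seg_len, MADV_SEQUENTIAL);	(whole segment:
 *							 kept in svd->advice)
 *	(void) madvise(base + pgsz, pgsz, MADV_RANDOM);	(sub-range: kept in
 *							 the vpage entries)
 *
 * Advice on a sub-range forces allocation of the vpage array and may
 * split the segment, which is why several cases below hand IE_RETRY
 * or IE_REATTACH back to as_ctl().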
7750 */ 7751 if ((addr == seg->s_base) && (len == seg->s_size)) { 7752 switch (behav) { 7753 case MADV_ACCESS_LWP: 7754 case MADV_ACCESS_MANY: 7755 case MADV_ACCESS_DEFAULT: 7756 /* 7757 * Set memory allocation policy for this segment 7758 */ 7759 policy = lgrp_madv_to_policy(behav, len, svd->type); 7760 if (svd->type == MAP_SHARED) 7761 already_set = lgrp_shm_policy_set(policy, amp, 7762 svd->anon_index, vp, svd->offset, len); 7763 else { 7764 /* 7765 * For private memory, need writers lock on 7766 * address space because the segment may be 7767 * split or concatenated when changing policy 7768 */ 7769 if (AS_READ_HELD(seg->s_as, 7770 &seg->s_as->a_lock)) { 7771 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7772 return (IE_RETRY); 7773 } 7774 7775 already_set = lgrp_privm_policy_set(policy, 7776 &svd->policy_info, len); 7777 } 7778 7779 /* 7780 * If policy set already and it shouldn't be reapplied, 7781 * don't do anything. 7782 */ 7783 if (already_set && 7784 !LGRP_MEM_POLICY_REAPPLICABLE(policy)) 7785 break; 7786 7787 /* 7788 * Mark any existing pages in given range for 7789 * migration 7790 */ 7791 page_mark_migrate(seg, addr, len, amp, svd->anon_index, 7792 vp, svd->offset, 1); 7793 7794 /* 7795 * If same policy set already or this is a shared 7796 * memory segment, don't need to try to concatenate 7797 * segment with adjacent ones. 7798 */ 7799 if (already_set || svd->type == MAP_SHARED) 7800 break; 7801 7802 /* 7803 * Try to concatenate this segment with previous 7804 * one and next one, since we changed policy for 7805 * this one and it may be compatible with adjacent 7806 * ones now. 7807 */ 7808 prev = AS_SEGPREV(seg->s_as, seg); 7809 next = AS_SEGNEXT(seg->s_as, seg); 7810 7811 if (next && next->s_ops == &segvn_ops && 7812 addr + len == next->s_base) 7813 (void) segvn_concat(seg, next, 1); 7814 7815 if (prev && prev->s_ops == &segvn_ops && 7816 addr == prev->s_base + prev->s_size) { 7817 /* 7818 * Drop lock for private data of current 7819 * segment before concatenating (deleting) it 7820 * and return IE_REATTACH to tell as_ctl() that 7821 * current segment has changed 7822 */ 7823 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7824 if (!segvn_concat(prev, seg, 1)) 7825 err = IE_REATTACH; 7826 7827 return (err); 7828 } 7829 break; 7830 7831 case MADV_SEQUENTIAL: 7832 /* 7833 * unloading mapping guarantees 7834 * detection in segvn_fault 7835 */ 7836 ASSERT(seg->s_szc == 0); 7837 hat_unload(seg->s_as->a_hat, addr, len, 7838 HAT_UNLOAD); 7839 /* FALLTHROUGH */ 7840 case MADV_NORMAL: 7841 case MADV_RANDOM: 7842 svd->advice = (uchar_t)behav; 7843 svd->pageadvice = 0; 7844 break; 7845 case MADV_WILLNEED: /* handled in memcntl */ 7846 case MADV_DONTNEED: /* handled in memcntl */ 7847 case MADV_FREE: /* handled above */ 7848 break; 7849 default: 7850 err = EINVAL; 7851 } 7852 } else { 7853 caddr_t eaddr; 7854 struct seg *new_seg; 7855 struct segvn_data *new_svd; 7856 u_offset_t off; 7857 caddr_t oldeaddr; 7858 7859 page = seg_page(seg, addr); 7860 7861 segvn_vpage(seg); 7862 7863 switch (behav) { 7864 struct vpage *bvpp, *evpp; 7865 7866 case MADV_ACCESS_LWP: 7867 case MADV_ACCESS_MANY: 7868 case MADV_ACCESS_DEFAULT: 7869 /* 7870 * Set memory allocation policy for portion of this 7871 * segment 7872 */ 7873 7874 /* 7875 * Align address and length of advice to page 7876 * boundaries for large pages 7877 */ 7878 if (seg->s_szc != 0) { 7879 size_t pgsz; 7880 7881 pgsz = page_get_pagesize(seg->s_szc); 7882 addr = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 7883 len = P2ROUNDUP(len, pgsz); 7884 } 7885 7886 
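		/*
		 * Editorial example (not in the original source): for a
		 * segment using 4M pages, advice given for a 0x2000-byte
		 * range that lies inside one 4M page is widened by the
		 * P2ALIGN/P2ROUNDUP above to cover the whole 4M page, so
		 * the advice never applies to just part of a large page.
		 */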
/* 7887 * Check to see whether policy is set already 7888 */ 7889 policy = lgrp_madv_to_policy(behav, len, svd->type); 7890 7891 anon_index = svd->anon_index + page; 7892 off = svd->offset + (uintptr_t)(addr - seg->s_base); 7893 7894 if (svd->type == MAP_SHARED) 7895 already_set = lgrp_shm_policy_set(policy, amp, 7896 anon_index, vp, off, len); 7897 else 7898 already_set = 7899 (policy == svd->policy_info.mem_policy); 7900 7901 /* 7902 * If policy set already and it shouldn't be reapplied, 7903 * don't do anything. 7904 */ 7905 if (already_set && 7906 !LGRP_MEM_POLICY_REAPPLICABLE(policy)) 7907 break; 7908 7909 /* 7910 * For private memory, need writers lock on 7911 * address space because the segment may be 7912 * split or concatenated when changing policy 7913 */ 7914 if (svd->type == MAP_PRIVATE && 7915 AS_READ_HELD(seg->s_as, &seg->s_as->a_lock)) { 7916 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7917 return (IE_RETRY); 7918 } 7919 7920 /* 7921 * Mark any existing pages in given range for 7922 * migration 7923 */ 7924 page_mark_migrate(seg, addr, len, amp, svd->anon_index, 7925 vp, svd->offset, 1); 7926 7927 /* 7928 * Don't need to try to split or concatenate 7929 * segments, since policy is same or this is a shared 7930 * memory segment 7931 */ 7932 if (already_set || svd->type == MAP_SHARED) 7933 break; 7934 7935 /* 7936 * Split off new segment if advice only applies to a 7937 * portion of existing segment starting in middle 7938 */ 7939 new_seg = NULL; 7940 eaddr = addr + len; 7941 oldeaddr = seg->s_base + seg->s_size; 7942 if (addr > seg->s_base) { 7943 /* 7944 * Must flush I/O page cache 7945 * before splitting segment 7946 */ 7947 if (svd->softlockcnt > 0) 7948 segvn_purge(seg); 7949 7950 /* 7951 * Split segment and return IE_REATTACH to tell 7952 * as_ctl() that current segment changed 7953 */ 7954 new_seg = segvn_split_seg(seg, addr); 7955 new_svd = (struct segvn_data *)new_seg->s_data; 7956 err = IE_REATTACH; 7957 7958 /* 7959 * If new segment ends where old one 7960 * did, try to concatenate the new 7961 * segment with next one. 7962 */ 7963 if (eaddr == oldeaddr) { 7964 /* 7965 * Set policy for new segment 7966 */ 7967 (void) lgrp_privm_policy_set(policy, 7968 &new_svd->policy_info, 7969 new_seg->s_size); 7970 7971 next = AS_SEGNEXT(new_seg->s_as, 7972 new_seg); 7973 7974 if (next && 7975 next->s_ops == &segvn_ops && 7976 eaddr == next->s_base) 7977 (void) segvn_concat(new_seg, 7978 next, 1); 7979 } 7980 } 7981 7982 /* 7983 * Split off end of existing segment if advice only 7984 * applies to a portion of segment ending before 7985 * end of the existing segment 7986 */ 7987 if (eaddr < oldeaddr) { 7988 /* 7989 * Must flush I/O page cache 7990 * before splitting segment 7991 */ 7992 if (svd->softlockcnt > 0) 7993 segvn_purge(seg); 7994 7995 /* 7996 * If beginning of old segment was already 7997 * split off, use new segment to split end off 7998 * from. 
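 * (In the general case the advice range may carve the original segment
 * into up to three segvn segments; only the piece covering
 * [addr, addr + len) has the new policy applied to it.)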
7999 */ 8000 if (new_seg != NULL && new_seg != seg) { 8001 /* 8002 * Split segment 8003 */ 8004 (void) segvn_split_seg(new_seg, eaddr); 8005 8006 /* 8007 * Set policy for new segment 8008 */ 8009 (void) lgrp_privm_policy_set(policy, 8010 &new_svd->policy_info, 8011 new_seg->s_size); 8012 } else { 8013 /* 8014 * Split segment and return IE_REATTACH 8015 * to tell as_ctl() that current 8016 * segment changed 8017 */ 8018 (void) segvn_split_seg(seg, eaddr); 8019 err = IE_REATTACH; 8020 8021 (void) lgrp_privm_policy_set(policy, 8022 &svd->policy_info, seg->s_size); 8023 8024 /* 8025 * If new segment starts where old one 8026 * did, try to concatenate it with 8027 * previous segment. 8028 */ 8029 if (addr == seg->s_base) { 8030 prev = AS_SEGPREV(seg->s_as, 8031 seg); 8032 8033 /* 8034 * Drop lock for private data 8035 * of current segment before 8036 * concatenating (deleting) it 8037 */ 8038 if (prev && 8039 prev->s_ops == 8040 &segvn_ops && 8041 addr == prev->s_base + 8042 prev->s_size) { 8043 SEGVN_LOCK_EXIT( 8044 seg->s_as, 8045 &svd->lock); 8046 (void) segvn_concat( 8047 prev, seg, 1); 8048 return (err); 8049 } 8050 } 8051 } 8052 } 8053 break; 8054 case MADV_SEQUENTIAL: 8055 ASSERT(seg->s_szc == 0); 8056 hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD); 8057 /* FALLTHROUGH */ 8058 case MADV_NORMAL: 8059 case MADV_RANDOM: 8060 bvpp = &svd->vpage[page]; 8061 evpp = &svd->vpage[page + (len >> PAGESHIFT)]; 8062 for (; bvpp < evpp; bvpp++) 8063 VPP_SETADVICE(bvpp, behav); 8064 svd->advice = MADV_NORMAL; 8065 break; 8066 case MADV_WILLNEED: /* handled in memcntl */ 8067 case MADV_DONTNEED: /* handled in memcntl */ 8068 case MADV_FREE: /* handled above */ 8069 break; 8070 default: 8071 err = EINVAL; 8072 } 8073 } 8074 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8075 return (err); 8076 } 8077 8078 /* 8079 * Create a vpage structure for this seg. 8080 */ 8081 static void 8082 segvn_vpage(struct seg *seg) 8083 { 8084 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 8085 struct vpage *vp, *evp; 8086 8087 ASSERT(SEGVN_WRITE_HELD(seg->s_as, &svd->lock)); 8088 8089 /* 8090 * If no vpage structure exists, allocate one. Copy the protections 8091 * and the advice from the segment itself to the individual pages. 8092 */ 8093 if (svd->vpage == NULL) { 8094 svd->pageprot = 1; 8095 svd->pageadvice = 1; 8096 svd->vpage = kmem_zalloc(seg_pages(seg) * sizeof (struct vpage), 8097 KM_SLEEP); 8098 evp = &svd->vpage[seg_page(seg, seg->s_base + seg->s_size)]; 8099 for (vp = svd->vpage; vp < evp; vp++) { 8100 VPP_SETPROT(vp, svd->prot); 8101 VPP_SETADVICE(vp, svd->advice); 8102 } 8103 } 8104 } 8105 8106 /* 8107 * Dump the pages belonging to this segvn segment. 
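 * Each page that is resident (found via its anon slot or the backing
 * vnode/offset) is translated to a pfn with page_pptonum() and handed
 * to dump_addpage() for inclusion in the crash dump.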
8108 */
8109 static void
8110 segvn_dump(struct seg *seg)
8111 {
8112 struct segvn_data *svd;
8113 page_t *pp;
8114 struct anon_map *amp;
8115 ulong_t anon_index;
8116 struct vnode *vp;
8117 u_offset_t off, offset;
8118 pfn_t pfn;
8119 pgcnt_t page, npages;
8120 caddr_t addr;
8121
8122 npages = seg_pages(seg);
8123 svd = (struct segvn_data *)seg->s_data;
8124 vp = svd->vp;
8125 off = offset = svd->offset;
8126 addr = seg->s_base;
8127
8128 if ((amp = svd->amp) != NULL) {
8129 anon_index = svd->anon_index;
8130 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
8131 }
8132
8133 for (page = 0; page < npages; page++, offset += PAGESIZE) {
8134 struct anon *ap;
8135 int we_own_it = 0;
8136
8137 if (amp && (ap = anon_get_ptr(svd->amp->ahp, anon_index++))) {
8138 swap_xlate_nopanic(ap, &vp, &off);
8139 } else {
8140 vp = svd->vp;
8141 off = offset;
8142 }
8143
8144 /*
8145 * If pp == NULL, the page either does not exist
8146 * or is exclusively locked. So determine if it
8147 * exists before searching for it.
8148 */
8149
8150 if ((pp = page_lookup_nowait(vp, off, SE_SHARED)))
8151 we_own_it = 1;
8152 else
8153 pp = page_exists(vp, off);
8154
8155 if (pp) {
8156 pfn = page_pptonum(pp);
8157 dump_addpage(seg->s_as, addr, pfn);
8158 if (we_own_it)
8159 page_unlock(pp);
8160 }
8161 addr += PAGESIZE;
8162 dump_timeleft = dump_timeout;
8163 }
8164
8165 if (amp != NULL)
8166 ANON_LOCK_EXIT(&amp->a_rwlock);
8167 }
8168
8169 /*
8170 * lock/unlock anon pages over a given range. Return shadow list
8171 */
8172 static int
8173 segvn_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***ppp,
8174 enum lock_type type, enum seg_rw rw)
8175 {
8176 struct segvn_data *svd = (struct segvn_data *)seg->s_data;
8177 size_t np, adjustpages = 0, npages = (len >> PAGESHIFT);
8178 ulong_t anon_index;
8179 uint_t protchk;
8180 uint_t error;
8181 struct anon_map *amp;
8182 struct page **pplist, **pl, *pp;
8183 caddr_t a;
8184 size_t page;
8185 caddr_t lpgaddr, lpgeaddr;
8186 pgcnt_t szc0_npages = 0;
8187
8188 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_START,
8189 "segvn_pagelock: start seg %p addr %p", seg, addr);
8190
8191 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
8192 if (seg->s_szc != 0 && (type == L_PAGELOCK || type == L_PAGEUNLOCK)) {
8193 /*
8194 * We are adjusting the pagelock region to the large page size
8195 * boundary because the unlocked part of a large page cannot
8196 * be freed anyway unless all constituent pages of a large
8197 * page are locked. Therefore this adjustment allows us to
8198 * decrement availrmem by the right value (note we don't want
8199 * to just decrement availrmem by the large page size without
8200 * adjusting addr and len because then we may end up
8201 * decrementing availrmem by large page size for every
8202 * constituent page locked by a new as_pagelock call).
8203 * as_pageunlock caller must always match as_pagelock call's
8204 * addr and len.
8205 *
8206 * Note segment's page size cannot change while we are holding
8207 * as lock. And then it cannot change while softlockcnt is
8208 * not 0. This will allow us to correctly recalculate large
8209 * page size region for the matching pageunlock/reclaim call.
8210 *
8211 * for pageunlock *ppp points to the pointer of page_t that
8212 * corresponds to the real unadjusted start address. Similarly,
8213 * for pagelock *ppp must point to the pointer of page_t that
8214 * corresponds to the real unadjusted start address.
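 * (adjustpages below records how many base pages addr lies beyond
 * lpgaddr, so the shadow list pointer handed back in *ppp can still
 * reference the caller's original, unadjusted start page.)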
8215 */ 8216 size_t pgsz = page_get_pagesize(seg->s_szc); 8217 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 8218 adjustpages = ((uintptr_t)(addr - lpgaddr)) >> PAGESHIFT; 8219 } 8220 8221 if (type == L_PAGEUNLOCK) { 8222 8223 /* 8224 * update hat ref bits for /proc. We need to make sure 8225 * that threads tracing the ref and mod bits of the 8226 * address space get the right data. 8227 * Note: page ref and mod bits are updated at reclaim time 8228 */ 8229 if (seg->s_as->a_vbits) { 8230 for (a = addr; a < addr + len; a += PAGESIZE) { 8231 if (rw == S_WRITE) { 8232 hat_setstat(seg->s_as, a, 8233 PAGESIZE, P_REF | P_MOD); 8234 } else { 8235 hat_setstat(seg->s_as, a, 8236 PAGESIZE, P_REF); 8237 } 8238 } 8239 } 8240 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 8241 if (seg->s_szc != 0) { 8242 VM_STAT_ADD(segvnvmstats.pagelock[0]); 8243 seg_pinactive(seg, lpgaddr, lpgeaddr - lpgaddr, 8244 *ppp - adjustpages, rw, segvn_reclaim); 8245 } else { 8246 seg_pinactive(seg, addr, len, *ppp, rw, segvn_reclaim); 8247 } 8248 8249 /* 8250 * If someone is blocked while unmapping, we purge 8251 * segment page cache and thus reclaim pplist synchronously 8252 * without waiting for seg_pasync_thread. This speeds up 8253 * unmapping in cases where munmap(2) is called, while 8254 * raw async i/o is still in progress or where a thread 8255 * exits on data fault in a multithreaded application. 8256 */ 8257 if (AS_ISUNMAPWAIT(seg->s_as) && (svd->softlockcnt > 0)) { 8258 /* 8259 * Even if we grab segvn WRITER's lock or segp_slock 8260 * here, there might be another thread which could've 8261 * successfully performed lookup/insert just before 8262 * we acquired the lock here. So, grabbing either 8263 * lock here is of not much use. Until we devise 8264 * a strategy at upper layers to solve the 8265 * synchronization issues completely, we expect 8266 * applications to handle this appropriately. 8267 */ 8268 segvn_purge(seg); 8269 } 8270 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8271 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END, 8272 "segvn_pagelock: unlock seg %p addr %p", seg, addr); 8273 return (0); 8274 } else if (type == L_PAGERECLAIM) { 8275 VM_STAT_COND_ADD(seg->s_szc != 0, segvnvmstats.pagelock[1]); 8276 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 8277 (void) segvn_reclaim(seg, addr, len, *ppp, rw); 8278 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8279 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END, 8280 "segvn_pagelock: reclaim seg %p addr %p", seg, addr); 8281 return (0); 8282 } 8283 8284 if (seg->s_szc != 0) { 8285 VM_STAT_ADD(segvnvmstats.pagelock[2]); 8286 addr = lpgaddr; 8287 len = lpgeaddr - lpgaddr; 8288 npages = (len >> PAGESHIFT); 8289 } 8290 8291 /* 8292 * for now we only support pagelock to anon memory. We've to check 8293 * protections for vnode objects and call into the vnode driver. 8294 * That's too much for a fast path. Let the fault entry point handle it. 8295 */ 8296 if (svd->vp != NULL) { 8297 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, 8298 "segvn_pagelock: mapped vnode seg %p addr %p", seg, addr); 8299 *ppp = NULL; 8300 return (ENOTSUP); 8301 } 8302 8303 /* 8304 * if anonmap is not yet created, let the fault entry point populate it 8305 * with anon ptrs. 
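 * (Returning EFAULT here is expected to make the caller fall back to
 * the slower fault/softlock path, which allocates the anon map on
 * demand.)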
8306 */
8307 if ((amp = svd->amp) == NULL) {
8308 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END,
8309 "segvn_pagelock: anonmap null seg %p addr %p", seg, addr);
8310 *ppp = NULL;
8311 return (EFAULT);
8312 }
8313
8314 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
8315
8316 /*
8317 * we acquire segp_slock to prevent duplicate entries
8318 * in seg_pcache
8319 */
8320 mutex_enter(&svd->segp_slock);
8321
8322 /*
8323 * try to find pages in segment page cache
8324 */
8325 pplist = seg_plookup(seg, addr, len, rw);
8326 if (pplist != NULL) {
8327 mutex_exit(&svd->segp_slock);
8328 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
8329 *ppp = pplist + adjustpages;
8330 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_HIT_END,
8331 "segvn_pagelock: cache hit seg %p addr %p", seg, addr);
8332 return (0);
8333 }
8334
8335 if (rw == S_READ) {
8336 protchk = PROT_READ;
8337 } else {
8338 protchk = PROT_WRITE;
8339 }
8340
8341 if (svd->pageprot == 0) {
8342 if ((svd->prot & protchk) == 0) {
8343 mutex_exit(&svd->segp_slock);
8344 error = EFAULT;
8345 goto out;
8346 }
8347 } else {
8348 /*
8349 * check page protections
8350 */
8351 for (a = addr; a < addr + len; a += PAGESIZE) {
8352 struct vpage *vp;
8353
8354 vp = &svd->vpage[seg_page(seg, a)];
8355 if ((VPP_PROT(vp) & protchk) == 0) {
8356 mutex_exit(&svd->segp_slock);
8357 error = EFAULT;
8358 goto out;
8359 }
8360 }
8361 }
8362
8363 /*
8364 * Avoid per page overhead of segvn_pp_lock_anonpages() for small
8365 * pages. For large pages segvn_pp_lock_anonpages() only does real
8366 * work once per large page. The tradeoff is that we may decrement
8367 * availrmem more than once for the same page but this is ok
8368 * for small pages.
8369 */
8370 if (seg->s_szc == 0) {
8371 mutex_enter(&freemem_lock);
8372 if (availrmem < tune.t_minarmem + npages) {
8373 mutex_exit(&freemem_lock);
8374 mutex_exit(&svd->segp_slock);
8375 error = ENOMEM;
8376 goto out;
8377 }
8378 availrmem -= npages;
8379 mutex_exit(&freemem_lock);
8380 }
8381
8382 pplist = kmem_alloc(sizeof (page_t *) * npages, KM_SLEEP);
8383 pl = pplist;
8384 *ppp = pplist + adjustpages;
8385
8386 page = seg_page(seg, addr);
8387 anon_index = svd->anon_index + page;
8388
8389 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
8390 for (a = addr; a < addr + len; a += PAGESIZE, anon_index++) {
8391 struct anon *ap;
8392 struct vnode *vp;
8393 u_offset_t off;
8394 anon_sync_obj_t cookie;
8395
8396 anon_array_enter(amp, anon_index, &cookie);
8397 ap = anon_get_ptr(amp->ahp, anon_index);
8398 if (ap == NULL) {
8399 anon_array_exit(&cookie);
8400 break;
8401 } else {
8402 /*
8403 * We must never use seg_pcache for COW pages
8404 * because we might end up with original page still
8405 * lying in seg_pcache even after private page is
8406 * created. This leads to data corruption as
8407 * aio_write refers to the page still in cache
8408 * while all other accesses refer to the private
8409 * page.
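 * (Hence the an_refcnt != 1 check that follows: anon pages that may
 * still be shared copy-on-write are never cached here and we bail out
 * to the slow path instead.)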
8410 */
8411 if (ap->an_refcnt != 1) {
8412 anon_array_exit(&cookie);
8413 break;
8414 }
8415 }
8416 swap_xlate(ap, &vp, &off);
8417 anon_array_exit(&cookie);
8418
8419 pp = page_lookup_nowait(vp, off, SE_SHARED);
8420 if (pp == NULL) {
8421 break;
8422 }
8423 if (seg->s_szc != 0 || pp->p_szc != 0) {
8424 if (!segvn_pp_lock_anonpages(pp, a == addr)) {
8425 page_unlock(pp);
8426 break;
8427 }
8428 } else {
8429 szc0_npages++;
8430 }
8431 *pplist++ = pp;
8432 }
8433 ANON_LOCK_EXIT(&amp->a_rwlock);
8434
8435 ASSERT(npages >= szc0_npages);
8436
8437 if (a >= addr + len) {
8438 mutex_enter(&freemem_lock);
8439 if (seg->s_szc == 0 && npages != szc0_npages) {
8440 ASSERT(svd->type == MAP_SHARED && amp->a_szc > 0);
8441 availrmem += (npages - szc0_npages);
8442 }
8443 svd->softlockcnt += npages;
8444 segvn_pages_locked += npages;
8445 mutex_exit(&freemem_lock);
8446 (void) seg_pinsert(seg, addr, len, pl, rw, SEGP_ASYNC_FLUSH,
8447 segvn_reclaim);
8448 mutex_exit(&svd->segp_slock);
8449 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
8450 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_FILL_END,
8451 "segvn_pagelock: cache fill seg %p addr %p", seg, addr);
8452 return (0);
8453 }
8454
8455 mutex_exit(&svd->segp_slock);
8456 if (seg->s_szc == 0) {
8457 mutex_enter(&freemem_lock);
8458 availrmem += npages;
8459 mutex_exit(&freemem_lock);
8460 }
8461 error = EFAULT;
8462 pplist = pl;
8463 np = ((uintptr_t)(a - addr)) >> PAGESHIFT;
8464 while (np > (uint_t)0) {
8465 ASSERT(PAGE_LOCKED(*pplist));
8466 if (seg->s_szc != 0 || (*pplist)->p_szc != 0) {
8467 segvn_pp_unlock_anonpages(*pplist, pplist == pl);
8468 }
8469 page_unlock(*pplist);
8470 np--;
8471 pplist++;
8472 }
8473 kmem_free(pl, sizeof (page_t *) * npages);
8474 out:
8475 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
8476 *ppp = NULL;
8477 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END,
8478 "segvn_pagelock: cache miss seg %p addr %p", seg, addr);
8479 return (error);
8480 }
8481
8482 /*
8483 * purge any cached pages in the I/O page cache
8484 */
8485 static void
8486 segvn_purge(struct seg *seg)
8487 {
8488 seg_ppurge(seg);
8489 }
8490
/*
 * Reclaim callback registered with the seg_pcache layer (via seg_pinsert()
 * above): update hat ref/mod bits, unlock the shadow list built by
 * segvn_pagelock(), return availrmem for small pages and wake up any
 * threads waiting in as_unmap() once softlockcnt drains to zero.
 */
8491 static int
8492 segvn_reclaim(struct seg *seg, caddr_t addr, size_t len, struct page **pplist,
8493 enum seg_rw rw)
8494 {
8495 struct segvn_data *svd = (struct segvn_data *)seg->s_data;
8496 pgcnt_t np, npages;
8497 struct page **pl;
8498 pgcnt_t szc0_npages = 0;
8499
8500 #ifdef lint
8501 addr = addr;
8502 #endif
8503
8504 npages = np = (len >> PAGESHIFT);
8505 ASSERT(npages);
8506 pl = pplist;
8507 if (seg->s_szc != 0) {
8508 size_t pgsz = page_get_pagesize(seg->s_szc);
8509 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) {
8510 panic("segvn_reclaim: unaligned addr or len");
8511 /*NOTREACHED*/
8512 }
8513 }
8514
8515 ASSERT(svd->vp == NULL && svd->amp != NULL);
8516
8517 while (np > (uint_t)0) {
8518 if (rw == S_WRITE) {
8519 hat_setrefmod(*pplist);
8520 } else {
8521 hat_setref(*pplist);
8522 }
8523 if (seg->s_szc != 0 || (*pplist)->p_szc != 0) {
8524 segvn_pp_unlock_anonpages(*pplist, pplist == pl);
8525 } else {
8526 szc0_npages++;
8527 }
8528 page_unlock(*pplist);
8529 np--;
8530 pplist++;
8531 }
8532 kmem_free(pl, sizeof (page_t *) * npages);
8533
8534 mutex_enter(&freemem_lock);
8535 segvn_pages_locked -= npages;
8536 svd->softlockcnt -= npages;
8537 if (szc0_npages != 0) {
8538 availrmem += szc0_npages;
8539 }
8540 mutex_exit(&freemem_lock);
8541 if (svd->softlockcnt <= 0) {
8542 if (AS_ISUNMAPWAIT(seg->s_as)) {
8543 mutex_enter(&seg->s_as->a_contents);
8544 if (AS_ISUNMAPWAIT(seg->s_as)) {
8545
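/*
 * Flag re-verified while holding a_contents: clear it and wake any
 * threads waiting in as_unmap() for the softlocks to drain.
 */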
AS_CLRUNMAPWAIT(seg->s_as);
8546 cv_broadcast(&seg->s_as->a_cv);
8547 }
8548 mutex_exit(&seg->s_as->a_contents);
8549 }
8550 }
8551 return (0);
8552 }
8553 /*
8554 * get a memory ID for an addr in a given segment
8555 *
8556 * XXX only creates PAGESIZE pages if anon slots are not initialized.
8557 * At fault time they will be relocated into larger pages.
8558 */
8559 static int
8560 segvn_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
8561 {
8562 struct segvn_data *svd = (struct segvn_data *)seg->s_data;
8563 struct anon *ap = NULL;
8564 ulong_t anon_index;
8565 struct anon_map *amp;
8566 anon_sync_obj_t cookie;
8567
8568 if (svd->type == MAP_PRIVATE) {
8569 memidp->val[0] = (uintptr_t)seg->s_as;
8570 memidp->val[1] = (uintptr_t)addr;
8571 return (0);
8572 }
8573
8574 if (svd->type == MAP_SHARED) {
8575 if (svd->vp) {
8576 memidp->val[0] = (uintptr_t)svd->vp;
8577 memidp->val[1] = (u_longlong_t)svd->offset +
8578 (uintptr_t)(addr - seg->s_base);
8579 return (0);
8580 } else {
8581
8582 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
8583 if ((amp = svd->amp) != NULL) {
8584 anon_index = svd->anon_index +
8585 seg_page(seg, addr);
8586 }
8587 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
8588
8589 ASSERT(amp != NULL);
8590
8591 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
8592 anon_array_enter(amp, anon_index, &cookie);
8593 ap = anon_get_ptr(amp->ahp, anon_index);
8594 if (ap == NULL) {
8595 page_t *pp;
8596
8597 pp = anon_zero(seg, addr, &ap, svd->cred);
8598 if (pp == NULL) {
8599 anon_array_exit(&cookie);
8600 ANON_LOCK_EXIT(&amp->a_rwlock);
8601 return (ENOMEM);
8602 }
8603 ASSERT(anon_get_ptr(amp->ahp, anon_index)
8604 == NULL);
8605 (void) anon_set_ptr(amp->ahp, anon_index,
8606 ap, ANON_SLEEP);
8607 page_unlock(pp);
8608 }
8609
8610 anon_array_exit(&cookie);
8611 ANON_LOCK_EXIT(&amp->a_rwlock);
8612
8613 memidp->val[0] = (uintptr_t)ap;
8614 memidp->val[1] = (uintptr_t)addr & PAGEOFFSET;
8615 return (0);
8616 }
8617 }
8618 return (EINVAL);
8619 }
8620
/*
 * Return 1 if every page in [a, a + len) carries the same per-page
 * protections, 0 otherwise.
 */
8621 static int
8622 sameprot(struct seg *seg, caddr_t a, size_t len)
8623 {
8624 struct segvn_data *svd = (struct segvn_data *)seg->s_data;
8625 struct vpage *vpage;
8626 spgcnt_t pages = btop(len);
8627 uint_t prot;
8628
8629 if (svd->pageprot == 0)
8630 return (1);
8631
8632 ASSERT(svd->vpage != NULL);
8633
8634 vpage = &svd->vpage[seg_page(seg, a)];
8635 prot = VPP_PROT(vpage);
8636 vpage++;
8637 pages--;
8638 while (pages-- > 0) {
8639 if (prot != VPP_PROT(vpage))
8640 return (0);
8641 vpage++;
8642 }
8643 return (1);
8644 }
8645
8646 /*
8647 * Get memory allocation policy info for specified address in given segment
8648 */
8649 static lgrp_mem_policy_info_t *
8650 segvn_getpolicy(struct seg *seg, caddr_t addr)
8651 {
8652 struct anon_map *amp;
8653 ulong_t anon_index;
8654 lgrp_mem_policy_info_t *policy_info;
8655 struct segvn_data *svn_data;
8656 u_offset_t vn_off;
8657 vnode_t *vp;
8658
8659 ASSERT(seg != NULL);
8660
8661 svn_data = (struct segvn_data *)seg->s_data;
8662 if (svn_data == NULL)
8663 return (NULL);
8664
8665 /*
8666 * Get policy info for private or shared memory
8667 */
8668 if (svn_data->type != MAP_SHARED) {
8669 if (svn_data->tr_state != SEGVN_TR_ON) {
8670 policy_info = &svn_data->policy_info;
8671 } else {
8672 policy_info = &svn_data->tr_policy_info;
8673 ASSERT(policy_info->mem_policy ==
8674 LGRP_MEM_POLICY_NEXT_SEG);
8675 }
8676 } else {
8677 amp = svn_data->amp;
8678 anon_index = svn_data->anon_index + seg_page(seg, addr);
8679 vp = svn_data->vp;
8680 vn_off = svn_data->offset + (uintptr_t)(addr -
seg->s_base); 8681 policy_info = lgrp_shm_policy_get(amp, anon_index, vp, vn_off); 8682 } 8683 8684 return (policy_info); 8685 } 8686 8687 /*ARGSUSED*/ 8688 static int 8689 segvn_capable(struct seg *seg, segcapability_t capability) 8690 { 8691 return (0); 8692 } 8693 8694 /* 8695 * Bind text vnode segment to an amp. If we bind successfully mappings will be 8696 * established to per vnode mapping per lgroup amp pages instead of to vnode 8697 * pages. There's one amp per vnode text mapping per lgroup. Many processes 8698 * may share the same text replication amp. If a suitable amp doesn't already 8699 * exist in svntr hash table create a new one. We may fail to bind to amp if 8700 * segment is not eligible for text replication. Code below first checks for 8701 * these conditions. If binding is successful segment tr_state is set to on 8702 * and svd->amp points to the amp to use. Otherwise tr_state is set to off and 8703 * svd->amp remains as NULL. 8704 */ 8705 static void 8706 segvn_textrepl(struct seg *seg) 8707 { 8708 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 8709 vnode_t *vp = svd->vp; 8710 u_offset_t off = svd->offset; 8711 size_t size = seg->s_size; 8712 u_offset_t eoff = off + size; 8713 uint_t szc = seg->s_szc; 8714 ulong_t hash = SVNTR_HASH_FUNC(vp); 8715 svntr_t *svntrp; 8716 struct vattr va; 8717 proc_t *p = seg->s_as->a_proc; 8718 lgrp_id_t lgrp_id; 8719 lgrp_id_t olid; 8720 int first; 8721 struct anon_map *amp; 8722 8723 ASSERT(AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 8724 ASSERT(SEGVN_WRITE_HELD(seg->s_as, &svd->lock)); 8725 ASSERT(p != NULL); 8726 ASSERT(svd->tr_state == SEGVN_TR_INIT); 8727 ASSERT(svd->flags & MAP_TEXT); 8728 ASSERT(svd->type == MAP_PRIVATE); 8729 ASSERT(vp != NULL && svd->amp == NULL); 8730 ASSERT(!svd->pageprot && !(svd->prot & PROT_WRITE)); 8731 ASSERT(!(svd->flags & MAP_NORESERVE) && svd->swresv == 0); 8732 ASSERT(seg->s_as != &kas); 8733 ASSERT(off < eoff); 8734 ASSERT(svntr_hashtab != NULL); 8735 8736 /* 8737 * If numa optimizations are no longer desired bail out. 8738 */ 8739 if (!lgrp_optimizations()) { 8740 svd->tr_state = SEGVN_TR_OFF; 8741 return; 8742 } 8743 8744 /* 8745 * Avoid creating anon maps with size bigger than the file size. 8746 * If VOP_GETATTR() call fails bail out. 8747 */ 8748 va.va_mask = AT_SIZE | AT_MTIME; 8749 if (VOP_GETATTR(vp, &va, 0, svd->cred) != 0) { 8750 svd->tr_state = SEGVN_TR_OFF; 8751 SEGVN_TR_ADDSTAT(gaerr); 8752 return; 8753 } 8754 if (btopr(va.va_size) < btopr(eoff)) { 8755 svd->tr_state = SEGVN_TR_OFF; 8756 SEGVN_TR_ADDSTAT(overmap); 8757 return; 8758 } 8759 8760 /* 8761 * VVMEXEC may not be set yet if exec() prefaults text segment. Set 8762 * this flag now before vn_is_mapped(V_WRITE) so that MAP_SHARED 8763 * mapping that checks if trcache for this vnode needs to be 8764 * invalidated can't miss us. 8765 */ 8766 if (!(vp->v_flag & VVMEXEC)) { 8767 mutex_enter(&vp->v_lock); 8768 vp->v_flag |= VVMEXEC; 8769 mutex_exit(&vp->v_lock); 8770 } 8771 mutex_enter(&svntr_hashtab[hash].tr_lock); 8772 /* 8773 * Bail out if potentially MAP_SHARED writable mappings exist to this 8774 * vnode. We don't want to use old file contents from existing 8775 * replicas if this mapping was established after the original file 8776 * was changed. 
8777 */
8778 if (vn_is_mapped(vp, V_WRITE)) {
8779 mutex_exit(&svntr_hashtab[hash].tr_lock);
8780 svd->tr_state = SEGVN_TR_OFF;
8781 SEGVN_TR_ADDSTAT(wrcnt);
8782 return;
8783 }
8784 svntrp = svntr_hashtab[hash].tr_head;
8785 for (; svntrp != NULL; svntrp = svntrp->tr_next) {
8786 ASSERT(svntrp->tr_refcnt != 0);
8787 if (svntrp->tr_vp != vp) {
8788 continue;
8789 }
8790 /*
8791 * Bail out if file was changed after this replication entry
8792 * was created since we need to use the latest file contents.
8793 */
8794 if (!svntrp->tr_valid ||
8795 svntrp->tr_mtime.tv_sec != va.va_mtime.tv_sec ||
8796 svntrp->tr_mtime.tv_nsec != va.va_mtime.tv_nsec) {
8797 mutex_exit(&svntr_hashtab[hash].tr_lock);
8798 svd->tr_state = SEGVN_TR_OFF;
8799 SEGVN_TR_ADDSTAT(stale);
8800 return;
8801 }
8802 /*
8803 * If off, eoff and szc match the current segment, we found an
8804 * existing entry we can use.
8805 */
8806 if (svntrp->tr_off == off && svntrp->tr_eoff == eoff &&
8807 svntrp->tr_szc == szc) {
8808 break;
8809 }
8810 /*
8811 * Don't create entries that differ but overlap in file
8812 * offsets, to avoid replicating the same file pages more
8813 * than once per lgroup.
8814 */
8815 if ((off >= svntrp->tr_off && off < svntrp->tr_eoff) ||
8816 (eoff > svntrp->tr_off && eoff <= svntrp->tr_eoff)) {
8817 mutex_exit(&svntr_hashtab[hash].tr_lock);
8818 svd->tr_state = SEGVN_TR_OFF;
8819 SEGVN_TR_ADDSTAT(overlap);
8820 return;
8821 }
8822 }
8823 /*
8824 * If we didn't find an existing entry, create a new one.
8825 */
8826 if (svntrp == NULL) {
8827 svntrp = kmem_cache_alloc(svntr_cache, KM_NOSLEEP);
8828 if (svntrp == NULL) {
8829 mutex_exit(&svntr_hashtab[hash].tr_lock);
8830 svd->tr_state = SEGVN_TR_OFF;
8831 SEGVN_TR_ADDSTAT(nokmem);
8832 return;
8833 }
8834 #ifdef DEBUG
8835 {
8836 lgrp_id_t i;
8837 for (i = 0; i < NLGRPS_MAX; i++) {
8838 ASSERT(svntrp->tr_amp[i] == NULL);
8839 }
8840 }
8841 #endif /* DEBUG */
8842 svntrp->tr_vp = vp;
8843 svntrp->tr_off = off;
8844 svntrp->tr_eoff = eoff;
8845 svntrp->tr_szc = szc;
8846 svntrp->tr_valid = 1;
8847 svntrp->tr_mtime = va.va_mtime;
8848 svntrp->tr_refcnt = 0;
8849 svntrp->tr_next = svntr_hashtab[hash].tr_head;
8850 svntr_hashtab[hash].tr_head = svntrp;
8851 }
8852 first = 1;
8853 again:
8854 /*
8855 * We want to pick a replica with pages on main thread's (t_tid = 1,
8856 * aka T1) lgrp. Currently text replication is only optimized for
8857 * workloads that either have all threads of a process on the same
8858 * lgrp or execute their large text primarily on main thread.
8859 */
8860 lgrp_id = p->p_t1_lgrpid;
8861 if (lgrp_id == LGRP_NONE) {
8862 /*
8863 * In case exec() prefaults text on a non-main thread, use the
8864 * current thread's lgrpid. It will become the main thread
8865 * anyway soon.
8866 */
8867 lgrp_id = lgrp_home_id(curthread);
8868 }
8869 /*
8870 * Set p_tr_lgrpid to lgrpid if it hasn't been set yet. Otherwise
8871 * just set it to NLGRPS_MAX if it's different from current process T1
8872 * home lgrp. p_tr_lgrpid is used to detect if process uses text
8873 * replication and T1 new home is different from lgrp used for text
8874 * replication. When this happens the asynchronous segvn thread rechecks
8875 * if segments should change lgrps used for text replication. If we fail
8876 * to set p_tr_lgrpid with cas32 then set it to NLGRPS_MAX without cas
8877 * if it's not already NLGRPS_MAX and not equal to the lgrp_id we want to
8878 * use.
We don't need to use cas in this case because another thread 8879 * that races in between our non atomic check and set may only change 8880 * p_tr_lgrpid to NLGRPS_MAX at this point. 8881 */ 8882 ASSERT(lgrp_id != LGRP_NONE && lgrp_id < NLGRPS_MAX); 8883 olid = p->p_tr_lgrpid; 8884 if (lgrp_id != olid && olid != NLGRPS_MAX) { 8885 lgrp_id_t nlid = (olid == LGRP_NONE) ? lgrp_id : NLGRPS_MAX; 8886 if (cas32((uint32_t *)&p->p_tr_lgrpid, olid, nlid) != olid) { 8887 olid = p->p_tr_lgrpid; 8888 ASSERT(olid != LGRP_NONE); 8889 if (olid != lgrp_id && olid != NLGRPS_MAX) { 8890 p->p_tr_lgrpid = NLGRPS_MAX; 8891 } 8892 } 8893 ASSERT(p->p_tr_lgrpid != LGRP_NONE); 8894 membar_producer(); 8895 /* 8896 * lgrp_move_thread() won't schedule async recheck after 8897 * p->p_t1_lgrpid update unless p->p_tr_lgrpid is not 8898 * LGRP_NONE. Recheck p_t1_lgrpid once now that p->p_tr_lgrpid 8899 * is not LGRP_NONE. 8900 */ 8901 if (first && p->p_t1_lgrpid != LGRP_NONE && 8902 p->p_t1_lgrpid != lgrp_id) { 8903 first = 0; 8904 goto again; 8905 } 8906 } 8907 /* 8908 * If no amp was created yet for lgrp_id create a new one as long as 8909 * we have enough memory to afford it. 8910 */ 8911 if ((amp = svntrp->tr_amp[lgrp_id]) == NULL) { 8912 size_t trmem = atomic_add_long_nv(&segvn_textrepl_bytes, size); 8913 if (trmem > segvn_textrepl_max_bytes) { 8914 SEGVN_TR_ADDSTAT(normem); 8915 goto fail; 8916 } 8917 if (anon_try_resv_zone(size, NULL) == 0) { 8918 SEGVN_TR_ADDSTAT(noanon); 8919 goto fail; 8920 } 8921 amp = anonmap_alloc(size, size, ANON_NOSLEEP); 8922 if (amp == NULL) { 8923 anon_unresv_zone(size, NULL); 8924 SEGVN_TR_ADDSTAT(nokmem); 8925 goto fail; 8926 } 8927 ASSERT(amp->refcnt == 1); 8928 amp->a_szc = szc; 8929 svntrp->tr_amp[lgrp_id] = amp; 8930 SEGVN_TR_ADDSTAT(newamp); 8931 } 8932 svntrp->tr_refcnt++; 8933 ASSERT(svd->svn_trnext == NULL); 8934 ASSERT(svd->svn_trprev == NULL); 8935 svd->svn_trnext = svntrp->tr_svnhead; 8936 svd->svn_trprev = NULL; 8937 if (svntrp->tr_svnhead != NULL) { 8938 svntrp->tr_svnhead->svn_trprev = svd; 8939 } 8940 svntrp->tr_svnhead = svd; 8941 ASSERT(amp->a_szc == szc && amp->size == size && amp->swresv == size); 8942 ASSERT(amp->refcnt >= 1); 8943 svd->amp = amp; 8944 svd->anon_index = 0; 8945 svd->tr_policy_info.mem_policy = LGRP_MEM_POLICY_NEXT_SEG; 8946 svd->tr_policy_info.mem_lgrpid = lgrp_id; 8947 svd->tr_state = SEGVN_TR_ON; 8948 mutex_exit(&svntr_hashtab[hash].tr_lock); 8949 SEGVN_TR_ADDSTAT(repl); 8950 return; 8951 fail: 8952 ASSERT(segvn_textrepl_bytes >= size); 8953 atomic_add_long(&segvn_textrepl_bytes, -size); 8954 ASSERT(svntrp != NULL); 8955 ASSERT(svntrp->tr_amp[lgrp_id] == NULL); 8956 if (svntrp->tr_refcnt == 0) { 8957 ASSERT(svntrp == svntr_hashtab[hash].tr_head); 8958 svntr_hashtab[hash].tr_head = svntrp->tr_next; 8959 mutex_exit(&svntr_hashtab[hash].tr_lock); 8960 kmem_cache_free(svntr_cache, svntrp); 8961 } else { 8962 mutex_exit(&svntr_hashtab[hash].tr_lock); 8963 } 8964 svd->tr_state = SEGVN_TR_OFF; 8965 } 8966 8967 /* 8968 * Convert seg back to regular vnode mapping seg by unbinding it from its text 8969 * replication amp. This routine is most typically called when segment is 8970 * unmapped but can also be called when segment no longer qualifies for text 8971 * replication (e.g. due to protection changes). If unload_unmap is set use 8972 * HAT_UNLOAD_UNMAP flag in hat_unload_callback(). If we are the last user of 8973 * svntr free all its anon maps and remove it from the hash table. 
8974 */
8975 static void
8976 segvn_textunrepl(struct seg *seg, int unload_unmap)
8977 {
8978 struct segvn_data *svd = (struct segvn_data *)seg->s_data;
8979 vnode_t *vp = svd->vp;
8980 u_offset_t off = svd->offset;
8981 size_t size = seg->s_size;
8982 u_offset_t eoff = off + size;
8983 uint_t szc = seg->s_szc;
8984 ulong_t hash = SVNTR_HASH_FUNC(vp);
8985 svntr_t *svntrp;
8986 svntr_t **prv_svntrp;
8987 lgrp_id_t lgrp_id = svd->tr_policy_info.mem_lgrpid;
8988 lgrp_id_t i;
8989
8990 ASSERT(AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
8991 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) ||
8992 SEGVN_WRITE_HELD(seg->s_as, &svd->lock));
8993 ASSERT(svd->tr_state == SEGVN_TR_ON);
8994 ASSERT(svd->amp != NULL);
8995 ASSERT(svd->amp->refcnt >= 1);
8996 ASSERT(svd->anon_index == 0);
8997 ASSERT(lgrp_id != LGRP_NONE && lgrp_id < NLGRPS_MAX);
8998 ASSERT(svntr_hashtab != NULL);
8999
9000 mutex_enter(&svntr_hashtab[hash].tr_lock);
9001 prv_svntrp = &svntr_hashtab[hash].tr_head;
9002 for (; (svntrp = *prv_svntrp) != NULL; prv_svntrp = &svntrp->tr_next) {
9003 ASSERT(svntrp->tr_refcnt != 0);
9004 if (svntrp->tr_vp == vp && svntrp->tr_off == off &&
9005 svntrp->tr_eoff == eoff && svntrp->tr_szc == szc) {
9006 break;
9007 }
9008 }
9009 if (svntrp == NULL) {
9010 panic("segvn_textunrepl: svntr record not found");
9011 }
9012 if (svntrp->tr_amp[lgrp_id] != svd->amp) {
9013 panic("segvn_textunrepl: amp mismatch");
9014 }
9015 svd->tr_state = SEGVN_TR_OFF;
9016 svd->amp = NULL;
9017 if (svd->svn_trprev == NULL) {
9018 ASSERT(svntrp->tr_svnhead == svd);
9019 svntrp->tr_svnhead = svd->svn_trnext;
9020 if (svntrp->tr_svnhead != NULL) {
9021 svntrp->tr_svnhead->svn_trprev = NULL;
9022 }
9023 svd->svn_trnext = NULL;
9024 } else {
9025 svd->svn_trprev->svn_trnext = svd->svn_trnext;
9026 if (svd->svn_trnext != NULL) {
9027 svd->svn_trnext->svn_trprev = svd->svn_trprev;
9028 svd->svn_trnext = NULL;
9029 }
9030 svd->svn_trprev = NULL;
9031 }
9032 if (--svntrp->tr_refcnt) {
9033 mutex_exit(&svntr_hashtab[hash].tr_lock);
9034 goto done;
9035 }
9036 *prv_svntrp = svntrp->tr_next;
9037 mutex_exit(&svntr_hashtab[hash].tr_lock);
9038 for (i = 0; i < NLGRPS_MAX; i++) {
9039 struct anon_map *amp = svntrp->tr_amp[i];
9040 if (amp == NULL) {
9041 continue;
9042 }
9043 ASSERT(amp->refcnt == 1);
9044 ASSERT(amp->swresv == size);
9045 ASSERT(amp->size == size);
9046 ASSERT(amp->a_szc == szc);
9047 if (amp->a_szc != 0) {
9048 anon_free_pages(amp->ahp, 0, size, szc);
9049 } else {
9050 anon_free(amp->ahp, 0, size);
9051 }
9052 svntrp->tr_amp[i] = NULL;
9053 ASSERT(segvn_textrepl_bytes >= size);
9054 atomic_add_long(&segvn_textrepl_bytes, -size);
9055 anon_unresv_zone(amp->swresv, NULL);
9056 amp->refcnt = 0;
9057 anonmap_free(amp);
9058 }
9059 kmem_cache_free(svntr_cache, svntrp);
9060 done:
9061 hat_unload_callback(seg->s_as->a_hat, seg->s_base, size,
9062 unload_unmap ? HAT_UNLOAD_UNMAP : 0, NULL);
9063 }
9064
9065 /*
9066 * This is called when a MAP_SHARED writable mapping is created to a vnode
9067 * that is currently used for execution (VVMEXEC flag is set). In this case we
9068 * need to prevent further use of existing replicas.
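 * Invalidation simply clears tr_valid on every matching svntr entry so
 * that no new segment binds to it; segments already using a replica
 * keep their amp until it is torn down via segvn_textunrepl().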
9069 */ 9070 static void 9071 segvn_inval_trcache(vnode_t *vp) 9072 { 9073 ulong_t hash = SVNTR_HASH_FUNC(vp); 9074 svntr_t *svntrp; 9075 9076 ASSERT(vp->v_flag & VVMEXEC); 9077 9078 if (svntr_hashtab == NULL) { 9079 return; 9080 } 9081 9082 mutex_enter(&svntr_hashtab[hash].tr_lock); 9083 svntrp = svntr_hashtab[hash].tr_head; 9084 for (; svntrp != NULL; svntrp = svntrp->tr_next) { 9085 ASSERT(svntrp->tr_refcnt != 0); 9086 if (svntrp->tr_vp == vp && svntrp->tr_valid) { 9087 svntrp->tr_valid = 0; 9088 } 9089 } 9090 mutex_exit(&svntr_hashtab[hash].tr_lock); 9091 } 9092 9093 static void 9094 segvn_trasync_thread(void) 9095 { 9096 callb_cpr_t cpr_info; 9097 kmutex_t cpr_lock; /* just for CPR stuff */ 9098 9099 mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL); 9100 9101 CALLB_CPR_INIT(&cpr_info, &cpr_lock, 9102 callb_generic_cpr, "segvn_async"); 9103 9104 if (segvn_update_textrepl_interval == 0) { 9105 segvn_update_textrepl_interval = segvn_update_tr_time * hz; 9106 } else { 9107 segvn_update_textrepl_interval *= hz; 9108 } 9109 (void) timeout(segvn_trupdate_wakeup, NULL, 9110 segvn_update_textrepl_interval); 9111 9112 for (;;) { 9113 mutex_enter(&cpr_lock); 9114 CALLB_CPR_SAFE_BEGIN(&cpr_info); 9115 mutex_exit(&cpr_lock); 9116 sema_p(&segvn_trasync_sem); 9117 mutex_enter(&cpr_lock); 9118 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock); 9119 mutex_exit(&cpr_lock); 9120 segvn_trupdate(); 9121 } 9122 } 9123 9124 static uint64_t segvn_lgrp_trthr_migrs_snpsht = 0; 9125 9126 static void 9127 segvn_trupdate_wakeup(void *dummy) 9128 { 9129 uint64_t cur_lgrp_trthr_migrs = lgrp_get_trthr_migrations(); 9130 9131 if (cur_lgrp_trthr_migrs != segvn_lgrp_trthr_migrs_snpsht) { 9132 segvn_lgrp_trthr_migrs_snpsht = cur_lgrp_trthr_migrs; 9133 sema_v(&segvn_trasync_sem); 9134 } 9135 9136 if (!segvn_disable_textrepl_update && 9137 segvn_update_textrepl_interval != 0) { 9138 (void) timeout(segvn_trupdate_wakeup, dummy, 9139 segvn_update_textrepl_interval); 9140 } 9141 } 9142 9143 static void 9144 segvn_trupdate(void) 9145 { 9146 ulong_t hash; 9147 svntr_t *svntrp; 9148 segvn_data_t *svd; 9149 9150 ASSERT(svntr_hashtab != NULL); 9151 9152 for (hash = 0; hash < svntr_hashtab_sz; hash++) { 9153 mutex_enter(&svntr_hashtab[hash].tr_lock); 9154 svntrp = svntr_hashtab[hash].tr_head; 9155 for (; svntrp != NULL; svntrp = svntrp->tr_next) { 9156 ASSERT(svntrp->tr_refcnt != 0); 9157 svd = svntrp->tr_svnhead; 9158 for (; svd != NULL; svd = svd->svn_trnext) { 9159 segvn_trupdate_seg(svd->seg, svd, svntrp, 9160 hash); 9161 } 9162 } 9163 mutex_exit(&svntr_hashtab[hash].tr_lock); 9164 } 9165 } 9166 9167 static void 9168 segvn_trupdate_seg(struct seg *seg, 9169 segvn_data_t *svd, 9170 svntr_t *svntrp, 9171 ulong_t hash) 9172 { 9173 proc_t *p; 9174 lgrp_id_t lgrp_id; 9175 struct as *as; 9176 size_t size; 9177 struct anon_map *amp; 9178 9179 ASSERT(svd->vp != NULL); 9180 ASSERT(svd->vp == svntrp->tr_vp); 9181 ASSERT(svd->offset == svntrp->tr_off); 9182 ASSERT(svd->offset + seg->s_size == svntrp->tr_eoff); 9183 ASSERT(seg != NULL); 9184 ASSERT(svd->seg == seg); 9185 ASSERT(seg->s_data == (void *)svd); 9186 ASSERT(seg->s_szc == svntrp->tr_szc); 9187 ASSERT(svd->tr_state == SEGVN_TR_ON); 9188 ASSERT(svd->amp != NULL); 9189 ASSERT(svd->tr_policy_info.mem_policy == LGRP_MEM_POLICY_NEXT_SEG); 9190 ASSERT(svd->tr_policy_info.mem_lgrpid != LGRP_NONE); 9191 ASSERT(svd->tr_policy_info.mem_lgrpid < NLGRPS_MAX); 9192 ASSERT(svntrp->tr_amp[svd->tr_policy_info.mem_lgrpid] == svd->amp); 9193 ASSERT(svntrp->tr_refcnt != 0); 9194 
ASSERT(mutex_owned(&svntr_hashtab[hash].tr_lock));
9195
9196 as = seg->s_as;
9197 ASSERT(as != NULL && as != &kas);
9198 p = as->a_proc;
9199 ASSERT(p != NULL);
9200 ASSERT(p->p_tr_lgrpid != LGRP_NONE);
9201 lgrp_id = p->p_t1_lgrpid;
9202 if (lgrp_id == LGRP_NONE) {
9203 return;
9204 }
9205 ASSERT(lgrp_id < NLGRPS_MAX);
9206 if (svd->tr_policy_info.mem_lgrpid == lgrp_id) {
9207 return;
9208 }
9209
9210 /*
9211 * Use tryenter locking since we are locking as/seg and svntr hash
9212 * lock in reverse from synchronous thread order.
9213 */
9214 if (!AS_LOCK_TRYENTER(as, &as->a_lock, RW_READER)) {
9215 SEGVN_TR_ADDSTAT(nolock);
9216 if (segvn_lgrp_trthr_migrs_snpsht) {
9217 segvn_lgrp_trthr_migrs_snpsht = 0;
9218 }
9219 return;
9220 }
9221 if (!SEGVN_LOCK_TRYENTER(seg->s_as, &svd->lock, RW_WRITER)) {
9222 AS_LOCK_EXIT(as, &as->a_lock);
9223 SEGVN_TR_ADDSTAT(nolock);
9224 if (segvn_lgrp_trthr_migrs_snpsht) {
9225 segvn_lgrp_trthr_migrs_snpsht = 0;
9226 }
9227 return;
9228 }
9229 size = seg->s_size;
9230 if (svntrp->tr_amp[lgrp_id] == NULL) {
9231 size_t trmem = atomic_add_long_nv(&segvn_textrepl_bytes, size);
9232 if (trmem > segvn_textrepl_max_bytes) {
9233 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
9234 AS_LOCK_EXIT(as, &as->a_lock);
9235 atomic_add_long(&segvn_textrepl_bytes, -size);
9236 SEGVN_TR_ADDSTAT(normem);
9237 return;
9238 }
9239 if (anon_try_resv_zone(size, NULL) == 0) {
9240 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
9241 AS_LOCK_EXIT(as, &as->a_lock);
9242 atomic_add_long(&segvn_textrepl_bytes, -size);
9243 SEGVN_TR_ADDSTAT(noanon);
9244 return;
9245 }
9246 amp = anonmap_alloc(size, size, KM_NOSLEEP);
9247 if (amp == NULL) {
9248 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
9249 AS_LOCK_EXIT(as, &as->a_lock);
9250 atomic_add_long(&segvn_textrepl_bytes, -size);
9251 anon_unresv_zone(size, NULL);
9252 SEGVN_TR_ADDSTAT(nokmem);
9253 return;
9254 }
9255 ASSERT(amp->refcnt == 1);
9256 amp->a_szc = seg->s_szc;
9257 svntrp->tr_amp[lgrp_id] = amp;
9258 }
9259 /*
9260 * We don't need to drop the bucket lock but here we give other
9261 * threads a chance. svntr and svd can't be unlinked as long as
9262 * segment lock is held as a writer and AS held as well. After we
9263 * retake the bucket lock we'll continue from where we left off. We'll
9264 * be able to reach the end of either list since new entries are always
9265 * added to the beginning of the lists.
9266 */
9267 mutex_exit(&svntr_hashtab[hash].tr_lock);
9268 hat_unload_callback(as->a_hat, seg->s_base, size, 0, NULL);
9269 mutex_enter(&svntr_hashtab[hash].tr_lock);
9270
9271 ASSERT(svd->tr_state == SEGVN_TR_ON);
9272 ASSERT(svd->amp != NULL);
9273 ASSERT(svd->tr_policy_info.mem_policy == LGRP_MEM_POLICY_NEXT_SEG);
9274 ASSERT(svd->tr_policy_info.mem_lgrpid != lgrp_id);
9275 ASSERT(svd->amp != svntrp->tr_amp[lgrp_id]);
9276
9277 svd->tr_policy_info.mem_lgrpid = lgrp_id;
9278 svd->amp = svntrp->tr_amp[lgrp_id];
9279 p->p_tr_lgrpid = NLGRPS_MAX;
9280 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
9281 AS_LOCK_EXIT(as, &as->a_lock);
9282
9283 ASSERT(svntrp->tr_refcnt != 0);
9284 ASSERT(svd->vp == svntrp->tr_vp);
9285 ASSERT(svd->tr_policy_info.mem_lgrpid == lgrp_id);
9286 ASSERT(svd->amp != NULL && svd->amp == svntrp->tr_amp[lgrp_id]);
9287 ASSERT(svd->seg == seg);
9288 ASSERT(svd->tr_state == SEGVN_TR_ON);
9289
9290 SEGVN_TR_ADDSTAT(asyncrepl);
9291 }
9292