/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * VM - shared or copy-on-write from a vnode/anonymous memory.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/debug.h>
#include <sys/cred.h>
#include <sys/vmsystm.h>
#include <sys/tuneable.h>
#include <sys/bitmap.h>
#include <sys/swap.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/vtrace.h>
#include <sys/cmn_err.h>
#include <sys/callb.h>
#include <sys/vm.h>
#include <sys/dumphdr.h>
#include <sys/lgrp.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_vn.h>
#include <vm/pvn.h>
#include <vm/anon.h>
#include <vm/page.h>
#include <vm/vpage.h>
#include <sys/proc.h>
#include <sys/task.h>
#include <sys/project.h>
#include <sys/zone.h>
#include <sys/shm_impl.h>
/*
 * Private seg op routines.
 */
static int	segvn_dup(struct seg *seg, struct seg *newseg);
static int	segvn_unmap(struct seg *seg, caddr_t addr, size_t len);
static void	segvn_free(struct seg *seg);
static faultcode_t segvn_fault(struct hat *hat, struct seg *seg,
		    caddr_t addr, size_t len, enum fault_type type,
		    enum seg_rw rw);
static faultcode_t segvn_faulta(struct seg *seg, caddr_t addr);
static int	segvn_setprot(struct seg *seg, caddr_t addr,
		    size_t len, uint_t prot);
static int	segvn_checkprot(struct seg *seg, caddr_t addr,
		    size_t len, uint_t prot);
static int	segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta);
static size_t	segvn_swapout(struct seg *seg);
static int	segvn_sync(struct seg *seg, caddr_t addr, size_t len,
		    int attr, uint_t flags);
static size_t	segvn_incore(struct seg *seg, caddr_t addr, size_t len,
		    char *vec);
static int	segvn_lockop(struct seg *seg, caddr_t addr, size_t len,
		    int attr, int op, ulong_t *lockmap, size_t pos);
static int	segvn_getprot(struct seg *seg, caddr_t addr, size_t len,
		    uint_t *protv);
static u_offset_t	segvn_getoffset(struct seg *seg, caddr_t addr);
static int	segvn_gettype(struct seg *seg, caddr_t addr);
static int	segvn_getvp(struct seg *seg, caddr_t addr,
		    struct vnode **vpp);
static int	segvn_advise(struct seg *seg, caddr_t addr, size_t len,
		    uint_t behav);
static void	segvn_dump(struct seg *seg);
static int	segvn_pagelock(struct seg *seg, caddr_t addr, size_t len,
		    struct page ***ppp, enum lock_type type, enum seg_rw rw);
static int	segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len,
		    uint_t szc);
static int	segvn_getmemid(struct seg *seg, caddr_t addr,
		    memid_t *memidp);
static lgrp_mem_policy_info_t	*segvn_getpolicy(struct seg *, caddr_t);
static int	segvn_capable(struct seg *seg, segcapability_t capable);

struct seg_ops segvn_ops = {
	segvn_dup,
	segvn_unmap,
	segvn_free,
	segvn_fault,
	segvn_faulta,
	segvn_setprot,
	segvn_checkprot,
	segvn_kluster,
	segvn_swapout,
	segvn_sync,
	segvn_incore,
	segvn_lockop,
	segvn_getprot,
	segvn_getoffset,
	segvn_gettype,
	segvn_getvp,
	segvn_advise,
	segvn_dump,
	segvn_pagelock,
	segvn_setpagesize,
	segvn_getmemid,
	segvn_getpolicy,
	segvn_capable,
};
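/*
 * Illustrative sketch, not part of the original driver: how the generic
 * segment layer reaches this driver through the segvn_ops vector above.
 * The wrapper name below is hypothetical; it simplifies what as_fault()
 * does once it has located the segment covering a faulting address.
 */
static faultcode_t
example_segop_fault(struct hat *hat, struct seg *seg, caddr_t addr,
    size_t len, enum fault_type type, enum seg_rw rw)
{
	/* For segvn-backed mappings seg->s_ops points at segvn_ops. */
	return ((*seg->s_ops->fault)(hat, seg, addr, len, type, rw));
}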
/*
 * Common zfod structures, provided as a shorthand for others to use.
 */
static segvn_crargs_t zfod_segvn_crargs =
	SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
static segvn_crargs_t kzfod_segvn_crargs =
	SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_USER,
	PROT_ALL & ~PROT_USER);
static segvn_crargs_t stack_noexec_crargs =
	SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_EXEC, PROT_ALL);

caddr_t zfod_argsp = (caddr_t)&zfod_segvn_crargs;	/* user zfod argsp */
caddr_t kzfod_argsp = (caddr_t)&kzfod_segvn_crargs;	/* kernel zfod argsp */
caddr_t stack_exec_argsp = (caddr_t)&zfod_segvn_crargs;	/* executable stack */
caddr_t stack_noexec_argsp = (caddr_t)&stack_noexec_crargs; /* noexec stack */

#define	vpgtob(n)	((n) * sizeof (struct vpage))	/* For brevity */

size_t	segvn_comb_thrshld = UINT_MAX;	/* patchable -- see 1196681 */

static int	segvn_concat(struct seg *, struct seg *, int);
static int	segvn_extend_prev(struct seg *, struct seg *,
		    struct segvn_crargs *, size_t);
static int	segvn_extend_next(struct seg *, struct seg *,
		    struct segvn_crargs *, size_t);
static void	segvn_softunlock(struct seg *, caddr_t, size_t, enum seg_rw);
static void	segvn_pagelist_rele(page_t **);
static void	segvn_setvnode_mpss(vnode_t *);
static void	segvn_relocate_pages(page_t **, page_t *);
static int	segvn_full_szcpages(page_t **, uint_t, int *, uint_t *);
static int	segvn_fill_vp_pages(struct segvn_data *, vnode_t *, u_offset_t,
		    uint_t, page_t **, page_t **, uint_t *, int *);
static faultcode_t segvn_fault_vnodepages(struct hat *, struct seg *, caddr_t,
    caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int);
static faultcode_t segvn_fault_anonpages(struct hat *, struct seg *, caddr_t,
    caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int);
static faultcode_t segvn_faultpage(struct hat *, struct seg *, caddr_t,
    u_offset_t, struct vpage *, page_t **, uint_t,
    enum fault_type, enum seg_rw, int, int);
static void	segvn_vpage(struct seg *);

static void segvn_purge(struct seg *seg);
static int segvn_reclaim(struct seg *, caddr_t, size_t, struct page **,
	enum seg_rw);

static int sameprot(struct seg *, caddr_t, size_t);

static int	segvn_demote_range(struct seg *, caddr_t, size_t, int, uint_t);
static int	segvn_clrszc(struct seg *);
static struct seg *segvn_split_seg(struct seg *, caddr_t);
static int	segvn_claim_pages(struct seg *, struct vpage *, u_offset_t,
    ulong_t, uint_t);

static void segvn_hat_rgn_unload_callback(caddr_t, caddr_t, caddr_t,
    size_t, void *, u_offset_t);

static int segvn_slock_anonpages(page_t *, int);
static void segvn_sunlock_anonpages(page_t *, int);

static struct kmem_cache *segvn_cache;

#ifdef VM_STATS
static struct segvnvmstats_str {
	ulong_t	fill_vp_pages[31];
	ulong_t fltvnpages[49];
	ulong_t	fullszcpages[10];
	ulong_t	relocatepages[3];
	ulong_t	fltanpages[17];
	ulong_t pagelock[3];
	ulong_t	demoterange[3];
} segvnvmstats;
#endif /* VM_STATS */

#define	SDR_RANGE	1		/* demote entire range */
#define	SDR_END		2		/* demote non aligned ends only */

#define	CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr) {	\
	if ((len) != 0) {						\
		lpgaddr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz);	\
		ASSERT(lpgaddr >= (seg)->s_base);			\
		lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)((addr) +	\
		    (len)), pgsz);					\
		ASSERT(lpgeaddr > lpgaddr);				\
		ASSERT(lpgeaddr <= (seg)->s_base + (seg)->s_size);	\
} else { \ 230 lpgeaddr = lpgaddr = (addr); \ 231 } \ 232 } 233 234 /*ARGSUSED*/ 235 static int 236 segvn_cache_constructor(void *buf, void *cdrarg, int kmflags) 237 { 238 struct segvn_data *svd = buf; 239 240 rw_init(&svd->lock, NULL, RW_DEFAULT, NULL); 241 mutex_init(&svd->segp_slock, NULL, MUTEX_DEFAULT, NULL); 242 svd->svn_trnext = svd->svn_trprev = NULL; 243 return (0); 244 } 245 246 /*ARGSUSED1*/ 247 static void 248 segvn_cache_destructor(void *buf, void *cdrarg) 249 { 250 struct segvn_data *svd = buf; 251 252 rw_destroy(&svd->lock); 253 mutex_destroy(&svd->segp_slock); 254 } 255 256 /*ARGSUSED*/ 257 static int 258 svntr_cache_constructor(void *buf, void *cdrarg, int kmflags) 259 { 260 bzero(buf, sizeof (svntr_t)); 261 return (0); 262 } 263 264 /* 265 * Patching this variable to non-zero allows the system to run with 266 * stacks marked as "not executable". It's a bit of a kludge, but is 267 * provided as a tweakable for platforms that export those ABIs 268 * (e.g. sparc V8) that have executable stacks enabled by default. 269 * There are also some restrictions for platforms that don't actually 270 * implement 'noexec' protections. 271 * 272 * Once enabled, the system is (therefore) unable to provide a fully 273 * ABI-compliant execution environment, though practically speaking, 274 * most everything works. The exceptions are generally some interpreters 275 * and debuggers that create executable code on the stack and jump 276 * into it (without explicitly mprotecting the address range to include 277 * PROT_EXEC). 278 * 279 * One important class of applications that are disabled are those 280 * that have been transformed into malicious agents using one of the 281 * numerous "buffer overflow" attacks. See 4007890. 282 */ 283 int noexec_user_stack = 0; 284 int noexec_user_stack_log = 1; 285 286 int segvn_lpg_disable = 0; 287 uint_t segvn_maxpgszc = 0; 288 289 ulong_t segvn_vmpss_clrszc_cnt; 290 ulong_t segvn_vmpss_clrszc_err; 291 ulong_t segvn_fltvnpages_clrszc_cnt; 292 ulong_t segvn_fltvnpages_clrszc_err; 293 ulong_t segvn_setpgsz_align_err; 294 ulong_t segvn_setpgsz_anon_align_err; 295 ulong_t segvn_setpgsz_getattr_err; 296 ulong_t segvn_setpgsz_eof_err; 297 ulong_t segvn_faultvnmpss_align_err1; 298 ulong_t segvn_faultvnmpss_align_err2; 299 ulong_t segvn_faultvnmpss_align_err3; 300 ulong_t segvn_faultvnmpss_align_err4; 301 ulong_t segvn_faultvnmpss_align_err5; 302 ulong_t segvn_vmpss_pageio_deadlk_err; 303 304 int segvn_use_regions = 1; 305 306 /* 307 * Segvn supports text replication optimization for NUMA platforms. Text 308 * replica's are represented by anon maps (amp). There's one amp per text file 309 * region per lgroup. A process chooses the amp for each of its text mappings 310 * based on the lgroup assignment of its main thread (t_tid = 1). All 311 * processes that want a replica on a particular lgroup for the same text file 312 * mapping share the same amp. amp's are looked up in svntr_hashtab hash table 313 * with vp,off,size,szc used as a key. Text replication segments are read only 314 * MAP_PRIVATE|MAP_TEXT segments that map vnode. Replication is achieved by 315 * forcing COW faults from vnode to amp and mapping amp pages instead of vnode 316 * pages. Replication amp is assigned to a segment when it gets its first 317 * pagefault. To handle main thread lgroup rehoming segvn_trasync_thread 318 * rechecks periodically if the process still maps an amp local to the main 319 * thread. 
 * If not, the async thread forces the process to remap to an amp in the new
 * home lgroup of the main thread. The current text replication implementation
 * only benefits workloads that do most of their work in the main thread of a
 * process, or whose threads all run in the same lgroup. To extend the text
 * replication benefit to other types of multithreaded workloads, further work
 * would be needed in the hat layer to allow the same virtual address in the
 * same hat to simultaneously map different physical addresses (i.e. page
 * table replication would be needed for x86).
 *
 * amp pages are used instead of vnode pages only as long as the segment has a
 * very simple life cycle: it's created via segvn_create(), handles S_EXEC
 * (S_READ) pagefaults and is fully unmapped. If anything more complicated
 * happens, such as a protection change, a real COW fault, a pagesize change,
 * an MC_LOCK request or a partial unmap, we turn off text replication by
 * converting the segment back to a vnode-only segment (unmap the segment's
 * address range and set svd->amp to NULL).
 *
 * The original file can be changed after an amp is inserted into
 * svntr_hashtab. Processes that are launched after the file has already
 * changed can't use the replicas created prior to the file change. To
 * implement this, hash entries are timestamped: a replica can only be used if
 * the current file modification time is the same as the timestamp saved when
 * the hash entry was created. However, timestamps alone are not sufficient to
 * detect file modification via mmap(MAP_SHARED) mappings, so file changes via
 * MAP_SHARED mappings are handled differently. When writable MAP_SHARED
 * mappings are created to vnodes marked as executable, we mark all existing
 * replicas for that vnode as not usable for future text mappings, and we
 * don't create new replicas for files that currently have potentially
 * writable MAP_SHARED mappings (i.e. vn_is_mapped(V_WRITE) is
 * true).
350 */ 351 352 #define SEGVN_TEXTREPL_MAXBYTES_FACTOR (20) 353 size_t segvn_textrepl_max_bytes_factor = SEGVN_TEXTREPL_MAXBYTES_FACTOR; 354 355 static ulong_t svntr_hashtab_sz = 512; 356 static svntr_bucket_t *svntr_hashtab = NULL; 357 static struct kmem_cache *svntr_cache; 358 static svntr_stats_t *segvn_textrepl_stats; 359 static ksema_t segvn_trasync_sem; 360 361 int segvn_disable_textrepl = 1; 362 size_t textrepl_size_thresh = (size_t)-1; 363 size_t segvn_textrepl_bytes = 0; 364 size_t segvn_textrepl_max_bytes = 0; 365 clock_t segvn_update_textrepl_interval = 0; 366 int segvn_update_tr_time = 10; 367 int segvn_disable_textrepl_update = 0; 368 369 static void segvn_textrepl(struct seg *); 370 static void segvn_textunrepl(struct seg *, int); 371 static void segvn_inval_trcache(vnode_t *); 372 static void segvn_trasync_thread(void); 373 static void segvn_trupdate_wakeup(void *); 374 static void segvn_trupdate(void); 375 static void segvn_trupdate_seg(struct seg *, segvn_data_t *, svntr_t *, 376 ulong_t); 377 378 /* 379 * Initialize segvn data structures 380 */ 381 void 382 segvn_init(void) 383 { 384 uint_t maxszc; 385 uint_t szc; 386 size_t pgsz; 387 388 segvn_cache = kmem_cache_create("segvn_cache", 389 sizeof (struct segvn_data), 0, 390 segvn_cache_constructor, segvn_cache_destructor, NULL, 391 NULL, NULL, 0); 392 393 if (segvn_lpg_disable == 0) { 394 szc = maxszc = page_num_pagesizes() - 1; 395 if (szc == 0) { 396 segvn_lpg_disable = 1; 397 } 398 if (page_get_pagesize(0) != PAGESIZE) { 399 panic("segvn_init: bad szc 0"); 400 /*NOTREACHED*/ 401 } 402 while (szc != 0) { 403 pgsz = page_get_pagesize(szc); 404 if (pgsz <= PAGESIZE || !IS_P2ALIGNED(pgsz, pgsz)) { 405 panic("segvn_init: bad szc %d", szc); 406 /*NOTREACHED*/ 407 } 408 szc--; 409 } 410 if (segvn_maxpgszc == 0 || segvn_maxpgszc > maxszc) 411 segvn_maxpgszc = maxszc; 412 } 413 414 if (segvn_use_regions && !hat_supported(HAT_SHARED_REGIONS, NULL)) 415 segvn_use_regions = 0; 416 417 /* 418 * For now shared regions and text replication segvn support 419 * are mutually exclusive. This is acceptable because 420 * currently significant benefit from text replication was 421 * only observed on AMD64 NUMA platforms (due to relatively 422 * small L2$ size) and currently we don't support shared 423 * regions on x86. 
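/*
 * Illustrative sketch, not the hash actually used by the text replication
 * code: conceptually, replication amps are found by hashing the identity
 * of the mapped file region (vp, off, size, szc) into svntr_hashtab and
 * walking that bucket under its tr_lock.  The hash expression below is an
 * assumption made purely for illustration.
 */
static ulong_t
example_svntr_hash(vnode_t *vp, u_offset_t off)
{
	return ((((uintptr_t)vp >> 4) ^ (ulong_t)(off >> PAGESHIFT)) %
	    svntr_hashtab_sz);
}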
424 */ 425 if (segvn_use_regions && !segvn_disable_textrepl) { 426 segvn_disable_textrepl = 1; 427 } 428 429 #if defined(_LP64) 430 if (lgrp_optimizations() && textrepl_size_thresh != (size_t)-1 && 431 !segvn_disable_textrepl) { 432 ulong_t i; 433 size_t hsz = svntr_hashtab_sz * sizeof (svntr_bucket_t); 434 435 svntr_cache = kmem_cache_create("svntr_cache", 436 sizeof (svntr_t), 0, svntr_cache_constructor, NULL, 437 NULL, NULL, NULL, 0); 438 svntr_hashtab = kmem_zalloc(hsz, KM_SLEEP); 439 for (i = 0; i < svntr_hashtab_sz; i++) { 440 mutex_init(&svntr_hashtab[i].tr_lock, NULL, 441 MUTEX_DEFAULT, NULL); 442 } 443 segvn_textrepl_max_bytes = ptob(physmem) / 444 segvn_textrepl_max_bytes_factor; 445 segvn_textrepl_stats = kmem_zalloc(NCPU * 446 sizeof (svntr_stats_t), KM_SLEEP); 447 sema_init(&segvn_trasync_sem, 0, NULL, SEMA_DEFAULT, NULL); 448 (void) thread_create(NULL, 0, segvn_trasync_thread, 449 NULL, 0, &p0, TS_RUN, minclsyspri); 450 } 451 #endif 452 } 453 454 #define SEGVN_PAGEIO ((void *)0x1) 455 #define SEGVN_NOPAGEIO ((void *)0x2) 456 457 static void 458 segvn_setvnode_mpss(vnode_t *vp) 459 { 460 int err; 461 462 ASSERT(vp->v_mpssdata == NULL || 463 vp->v_mpssdata == SEGVN_PAGEIO || 464 vp->v_mpssdata == SEGVN_NOPAGEIO); 465 466 if (vp->v_mpssdata == NULL) { 467 if (vn_vmpss_usepageio(vp)) { 468 err = VOP_PAGEIO(vp, (page_t *)NULL, 469 (u_offset_t)0, 0, 0, CRED()); 470 } else { 471 err = ENOSYS; 472 } 473 /* 474 * set v_mpssdata just once per vnode life 475 * so that it never changes. 476 */ 477 mutex_enter(&vp->v_lock); 478 if (vp->v_mpssdata == NULL) { 479 if (err == EINVAL) { 480 vp->v_mpssdata = SEGVN_PAGEIO; 481 } else { 482 vp->v_mpssdata = SEGVN_NOPAGEIO; 483 } 484 } 485 mutex_exit(&vp->v_lock); 486 } 487 } 488 489 int 490 segvn_create(struct seg *seg, void *argsp) 491 { 492 struct segvn_crargs *a = (struct segvn_crargs *)argsp; 493 struct segvn_data *svd; 494 size_t swresv = 0; 495 struct cred *cred; 496 struct anon_map *amp; 497 int error = 0; 498 size_t pgsz; 499 lgrp_mem_policy_t mpolicy = LGRP_MEM_POLICY_DEFAULT; 500 int use_rgn = 0; 501 int trok = 0; 502 503 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 504 505 if (a->type != MAP_PRIVATE && a->type != MAP_SHARED) { 506 panic("segvn_create type"); 507 /*NOTREACHED*/ 508 } 509 510 /* 511 * Check arguments. If a shared anon structure is given then 512 * it is illegal to also specify a vp. 513 */ 514 if (a->amp != NULL && a->vp != NULL) { 515 panic("segvn_create anon_map"); 516 /*NOTREACHED*/ 517 } 518 519 if (a->type == MAP_PRIVATE && (a->flags & MAP_TEXT) && 520 a->vp != NULL && a->prot == (PROT_USER | PROT_READ | PROT_EXEC) && 521 segvn_use_regions) { 522 use_rgn = 1; 523 } 524 525 /* MAP_NORESERVE on a MAP_SHARED segment is meaningless. */ 526 if (a->type == MAP_SHARED) 527 a->flags &= ~MAP_NORESERVE; 528 529 if (a->szc != 0) { 530 if (segvn_lpg_disable != 0 || (a->szc == AS_MAP_NO_LPOOB) || 531 (a->amp != NULL && a->type == MAP_PRIVATE) || 532 (a->flags & MAP_NORESERVE) || seg->s_as == &kas) { 533 a->szc = 0; 534 } else { 535 if (a->szc > segvn_maxpgszc) 536 a->szc = segvn_maxpgszc; 537 pgsz = page_get_pagesize(a->szc); 538 if (!IS_P2ALIGNED(seg->s_base, pgsz) || 539 !IS_P2ALIGNED(seg->s_size, pgsz)) { 540 a->szc = 0; 541 } else if (a->vp != NULL) { 542 extern struct vnode kvp; 543 if (IS_SWAPFSVP(a->vp) || VN_ISKAS(a->vp)) { 544 /* 545 * paranoid check. 546 * hat_page_demote() is not supported 547 * on swapfs pages. 
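/*
 * Illustrative sketch (hypothetical helper, not code from this driver):
 * the large page eligibility test applied to a->szc above boils down to
 * "are the segment base and size both aligned to the candidate page
 * size?".  When the answer is no, the mapping silently falls back to
 * szc 0.
 */
static int
example_szc_usable(struct seg *seg, uint_t szc)
{
	size_t pgsz = page_get_pagesize(szc);

	return (IS_P2ALIGNED(seg->s_base, pgsz) &&
	    IS_P2ALIGNED(seg->s_size, pgsz));
}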
548 */ 549 a->szc = 0; 550 } else if (map_addr_vacalign_check(seg->s_base, 551 a->offset & PAGEMASK)) { 552 a->szc = 0; 553 } 554 } else if (a->amp != NULL) { 555 pgcnt_t anum = btopr(a->offset); 556 pgcnt_t pgcnt = page_get_pagecnt(a->szc); 557 if (!IS_P2ALIGNED(anum, pgcnt)) { 558 a->szc = 0; 559 } 560 } 561 } 562 } 563 564 /* 565 * If segment may need private pages, reserve them now. 566 */ 567 if (!(a->flags & MAP_NORESERVE) && ((a->vp == NULL && a->amp == NULL) || 568 (a->type == MAP_PRIVATE && (a->prot & PROT_WRITE)))) { 569 if (anon_resv(seg->s_size) == 0) 570 return (EAGAIN); 571 swresv = seg->s_size; 572 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 573 seg, swresv, 1); 574 } 575 576 /* 577 * Reserve any mapping structures that may be required. 578 * 579 * Don't do it for segments that may use regions. It's currently a 580 * noop in the hat implementations anyway. 581 */ 582 if (!use_rgn) { 583 hat_map(seg->s_as->a_hat, seg->s_base, seg->s_size, HAT_MAP); 584 } 585 586 if (a->cred) { 587 cred = a->cred; 588 crhold(cred); 589 } else { 590 crhold(cred = CRED()); 591 } 592 593 /* Inform the vnode of the new mapping */ 594 if (a->vp != NULL) { 595 error = VOP_ADDMAP(a->vp, a->offset & PAGEMASK, 596 seg->s_as, seg->s_base, seg->s_size, a->prot, 597 a->maxprot, a->type, cred); 598 if (error) { 599 if (swresv != 0) { 600 anon_unresv(swresv); 601 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 602 "anon proc:%p %lu %u", 603 seg, swresv, 0); 604 } 605 crfree(cred); 606 if (!use_rgn) { 607 hat_unload(seg->s_as->a_hat, seg->s_base, 608 seg->s_size, HAT_UNLOAD_UNMAP); 609 } 610 return (error); 611 } 612 /* 613 * svntr_hashtab will be NULL if we support shared regions. 614 */ 615 trok = ((a->flags & MAP_TEXT) && 616 (seg->s_size > textrepl_size_thresh || 617 (a->flags & _MAP_TEXTREPL)) && 618 lgrp_optimizations() && svntr_hashtab != NULL && 619 a->type == MAP_PRIVATE && swresv == 0 && 620 !(a->flags & MAP_NORESERVE) && 621 seg->s_as != &kas && a->vp->v_type == VREG); 622 623 ASSERT(!trok || !use_rgn); 624 } 625 626 /* 627 * If more than one segment in the address space, and they're adjacent 628 * virtually, try to concatenate them. Don't concatenate if an 629 * explicit anon_map structure was supplied (e.g., SystemV shared 630 * memory) or if we'll use text replication for this segment. 631 */ 632 if (a->amp == NULL && !use_rgn && !trok) { 633 struct seg *pseg, *nseg; 634 struct segvn_data *psvd, *nsvd; 635 lgrp_mem_policy_t ppolicy, npolicy; 636 uint_t lgrp_mem_policy_flags = 0; 637 extern lgrp_mem_policy_t lgrp_mem_default_policy; 638 639 /* 640 * Memory policy flags (lgrp_mem_policy_flags) is valid when 641 * extending stack/heap segments. 642 */ 643 if ((a->vp == NULL) && (a->type == MAP_PRIVATE) && 644 !(a->flags & MAP_NORESERVE) && (seg->s_as != &kas)) { 645 lgrp_mem_policy_flags = a->lgrp_mem_policy_flags; 646 } else { 647 /* 648 * Get policy when not extending it from another segment 649 */ 650 mpolicy = lgrp_mem_policy_default(seg->s_size, a->type); 651 } 652 653 /* 654 * First, try to concatenate the previous and new segments 655 */ 656 pseg = AS_SEGPREV(seg->s_as, seg); 657 if (pseg != NULL && 658 pseg->s_base + pseg->s_size == seg->s_base && 659 pseg->s_ops == &segvn_ops) { 660 /* 661 * Get memory allocation policy from previous segment. 662 * When extension is specified (e.g. for heap) apply 663 * this policy to the new segment regardless of the 664 * outcome of segment concatenation. 
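/*
 * Illustrative sketch (hypothetical helper): the swap reservation rule
 * applied earlier in segvn_create().  Private pages may be needed, and
 * hence swap is reserved up front, when the mapping is anonymous or is a
 * writable MAP_PRIVATE mapping of a vnode, unless MAP_NORESERVE was
 * requested.
 */
static int
example_needs_swresv(struct segvn_crargs *a)
{
	if (a->flags & MAP_NORESERVE)
		return (0);
	return ((a->vp == NULL && a->amp == NULL) ||
	    (a->type == MAP_PRIVATE && (a->prot & PROT_WRITE)));
}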
Extension occurs 665 * for non-default policy otherwise default policy is 666 * used and is based on extended segment size. 667 */ 668 psvd = (struct segvn_data *)pseg->s_data; 669 ppolicy = psvd->policy_info.mem_policy; 670 if (lgrp_mem_policy_flags == 671 LGRP_MP_FLAG_EXTEND_UP) { 672 if (ppolicy != lgrp_mem_default_policy) { 673 mpolicy = ppolicy; 674 } else { 675 mpolicy = lgrp_mem_policy_default( 676 pseg->s_size + seg->s_size, 677 a->type); 678 } 679 } 680 681 if (mpolicy == ppolicy && 682 (pseg->s_size + seg->s_size <= 683 segvn_comb_thrshld || psvd->amp == NULL) && 684 segvn_extend_prev(pseg, seg, a, swresv) == 0) { 685 /* 686 * success! now try to concatenate 687 * with following seg 688 */ 689 crfree(cred); 690 nseg = AS_SEGNEXT(pseg->s_as, pseg); 691 if (nseg != NULL && 692 nseg != pseg && 693 nseg->s_ops == &segvn_ops && 694 pseg->s_base + pseg->s_size == 695 nseg->s_base) 696 (void) segvn_concat(pseg, nseg, 0); 697 ASSERT(pseg->s_szc == 0 || 698 (a->szc == pseg->s_szc && 699 IS_P2ALIGNED(pseg->s_base, pgsz) && 700 IS_P2ALIGNED(pseg->s_size, pgsz))); 701 return (0); 702 } 703 } 704 705 /* 706 * Failed, so try to concatenate with following seg 707 */ 708 nseg = AS_SEGNEXT(seg->s_as, seg); 709 if (nseg != NULL && 710 seg->s_base + seg->s_size == nseg->s_base && 711 nseg->s_ops == &segvn_ops) { 712 /* 713 * Get memory allocation policy from next segment. 714 * When extension is specified (e.g. for stack) apply 715 * this policy to the new segment regardless of the 716 * outcome of segment concatenation. Extension occurs 717 * for non-default policy otherwise default policy is 718 * used and is based on extended segment size. 719 */ 720 nsvd = (struct segvn_data *)nseg->s_data; 721 npolicy = nsvd->policy_info.mem_policy; 722 if (lgrp_mem_policy_flags == 723 LGRP_MP_FLAG_EXTEND_DOWN) { 724 if (npolicy != lgrp_mem_default_policy) { 725 mpolicy = npolicy; 726 } else { 727 mpolicy = lgrp_mem_policy_default( 728 nseg->s_size + seg->s_size, 729 a->type); 730 } 731 } 732 733 if (mpolicy == npolicy && 734 segvn_extend_next(seg, nseg, a, swresv) == 0) { 735 crfree(cred); 736 ASSERT(nseg->s_szc == 0 || 737 (a->szc == nseg->s_szc && 738 IS_P2ALIGNED(nseg->s_base, pgsz) && 739 IS_P2ALIGNED(nseg->s_size, pgsz))); 740 return (0); 741 } 742 } 743 } 744 745 if (a->vp != NULL) { 746 VN_HOLD(a->vp); 747 if (a->type == MAP_SHARED) 748 lgrp_shm_policy_init(NULL, a->vp); 749 } 750 svd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 751 752 seg->s_ops = &segvn_ops; 753 seg->s_data = (void *)svd; 754 seg->s_szc = a->szc; 755 756 svd->seg = seg; 757 svd->vp = a->vp; 758 /* 759 * Anonymous mappings have no backing file so the offset is meaningless. 760 */ 761 svd->offset = a->vp ? (a->offset & PAGEMASK) : 0; 762 svd->prot = a->prot; 763 svd->maxprot = a->maxprot; 764 svd->pageprot = 0; 765 svd->type = a->type; 766 svd->vpage = NULL; 767 svd->cred = cred; 768 svd->advice = MADV_NORMAL; 769 svd->pageadvice = 0; 770 svd->flags = (ushort_t)a->flags; 771 svd->softlockcnt = 0; 772 svd->rcookie = HAT_INVALID_REGION_COOKIE; 773 774 if (a->szc != 0 && a->vp != NULL) { 775 segvn_setvnode_mpss(a->vp); 776 } 777 if (svd->type == MAP_SHARED && svd->vp != NULL && 778 (svd->vp->v_flag & VVMEXEC) && (svd->prot & PROT_WRITE)) { 779 ASSERT(vn_is_mapped(svd->vp, V_WRITE)); 780 segvn_inval_trcache(svd->vp); 781 } 782 783 amp = a->amp; 784 if ((svd->amp = amp) == NULL) { 785 svd->anon_index = 0; 786 if (svd->type == MAP_SHARED) { 787 svd->swresv = 0; 788 /* 789 * Shared mappings to a vp need no other setup. 
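/*
 * Illustrative sketch (hypothetical helper): the basic adjacency test used
 * above when deciding whether a new segment is worth merging with its
 * virtual neighbor.  The full compatibility checks live in segvn_concat()
 * and segvn_extend_prev()/segvn_extend_next().
 */
static int
example_virtually_adjacent(struct seg *pseg, struct seg *seg)
{
	return (pseg != NULL && pseg->s_ops == &segvn_ops &&
	    pseg->s_base + pseg->s_size == seg->s_base);
}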
790 * If we have a shared mapping to an anon_map object 791 * which hasn't been allocated yet, allocate the 792 * struct now so that it will be properly shared 793 * by remembering the swap reservation there. 794 */ 795 if (a->vp == NULL) { 796 svd->amp = anonmap_alloc(seg->s_size, swresv, 797 ANON_SLEEP); 798 svd->amp->a_szc = seg->s_szc; 799 } 800 } else { 801 /* 802 * Private mapping (with or without a vp). 803 * Allocate anon_map when needed. 804 */ 805 svd->swresv = swresv; 806 } 807 } else { 808 pgcnt_t anon_num; 809 810 /* 811 * Mapping to an existing anon_map structure without a vp. 812 * For now we will insure that the segment size isn't larger 813 * than the size - offset gives us. Later on we may wish to 814 * have the anon array dynamically allocated itself so that 815 * we don't always have to allocate all the anon pointer slots. 816 * This of course involves adding extra code to check that we 817 * aren't trying to use an anon pointer slot beyond the end 818 * of the currently allocated anon array. 819 */ 820 if ((amp->size - a->offset) < seg->s_size) { 821 panic("segvn_create anon_map size"); 822 /*NOTREACHED*/ 823 } 824 825 anon_num = btopr(a->offset); 826 827 if (a->type == MAP_SHARED) { 828 /* 829 * SHARED mapping to a given anon_map. 830 */ 831 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 832 amp->refcnt++; 833 if (a->szc > amp->a_szc) { 834 amp->a_szc = a->szc; 835 } 836 ANON_LOCK_EXIT(&->a_rwlock); 837 svd->anon_index = anon_num; 838 svd->swresv = 0; 839 } else { 840 /* 841 * PRIVATE mapping to a given anon_map. 842 * Make sure that all the needed anon 843 * structures are created (so that we will 844 * share the underlying pages if nothing 845 * is written by this mapping) and then 846 * duplicate the anon array as is done 847 * when a privately mapped segment is dup'ed. 848 */ 849 struct anon *ap; 850 caddr_t addr; 851 caddr_t eaddr; 852 ulong_t anon_idx; 853 int hat_flag = HAT_LOAD; 854 855 if (svd->flags & MAP_TEXT) { 856 hat_flag |= HAT_LOAD_TEXT; 857 } 858 859 svd->amp = anonmap_alloc(seg->s_size, 0, ANON_SLEEP); 860 svd->amp->a_szc = seg->s_szc; 861 svd->anon_index = 0; 862 svd->swresv = swresv; 863 864 /* 865 * Prevent 2 threads from allocating anon 866 * slots simultaneously. 867 */ 868 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 869 eaddr = seg->s_base + seg->s_size; 870 871 for (anon_idx = anon_num, addr = seg->s_base; 872 addr < eaddr; addr += PAGESIZE, anon_idx++) { 873 page_t *pp; 874 875 if ((ap = anon_get_ptr(amp->ahp, 876 anon_idx)) != NULL) 877 continue; 878 879 /* 880 * Allocate the anon struct now. 881 * Might as well load up translation 882 * to the page while we're at it... 883 */ 884 pp = anon_zero(seg, addr, &ap, cred); 885 if (ap == NULL || pp == NULL) { 886 panic("segvn_create anon_zero"); 887 /*NOTREACHED*/ 888 } 889 890 /* 891 * Re-acquire the anon_map lock and 892 * initialize the anon array entry. 
893 */ 894 ASSERT(anon_get_ptr(amp->ahp, 895 anon_idx) == NULL); 896 (void) anon_set_ptr(amp->ahp, anon_idx, ap, 897 ANON_SLEEP); 898 899 ASSERT(seg->s_szc == 0); 900 ASSERT(!IS_VMODSORT(pp->p_vnode)); 901 902 ASSERT(use_rgn == 0); 903 hat_memload(seg->s_as->a_hat, addr, pp, 904 svd->prot & ~PROT_WRITE, hat_flag); 905 906 page_unlock(pp); 907 } 908 ASSERT(seg->s_szc == 0); 909 anon_dup(amp->ahp, anon_num, svd->amp->ahp, 910 0, seg->s_size); 911 ANON_LOCK_EXIT(&->a_rwlock); 912 } 913 } 914 915 /* 916 * Set default memory allocation policy for segment 917 * 918 * Always set policy for private memory at least for initialization 919 * even if this is a shared memory segment 920 */ 921 (void) lgrp_privm_policy_set(mpolicy, &svd->policy_info, seg->s_size); 922 923 if (svd->type == MAP_SHARED) 924 (void) lgrp_shm_policy_set(mpolicy, svd->amp, svd->anon_index, 925 svd->vp, svd->offset, seg->s_size); 926 927 if (use_rgn) { 928 ASSERT(!trok); 929 ASSERT(svd->amp == NULL); 930 svd->rcookie = hat_join_region(seg->s_as->a_hat, seg->s_base, 931 seg->s_size, (void *)svd->vp, svd->offset, svd->prot, 932 (uchar_t)seg->s_szc, segvn_hat_rgn_unload_callback, 933 HAT_REGION_TEXT); 934 } 935 936 ASSERT(!trok || !(svd->prot & PROT_WRITE)); 937 svd->tr_state = trok ? SEGVN_TR_INIT : SEGVN_TR_OFF; 938 939 return (0); 940 } 941 942 /* 943 * Concatenate two existing segments, if possible. 944 * Return 0 on success, -1 if two segments are not compatible 945 * or -2 on memory allocation failure. 946 * If amp_cat == 1 then try and concat segments with anon maps 947 */ 948 static int 949 segvn_concat(struct seg *seg1, struct seg *seg2, int amp_cat) 950 { 951 struct segvn_data *svd1 = seg1->s_data; 952 struct segvn_data *svd2 = seg2->s_data; 953 struct anon_map *amp1 = svd1->amp; 954 struct anon_map *amp2 = svd2->amp; 955 struct vpage *vpage1 = svd1->vpage; 956 struct vpage *vpage2 = svd2->vpage, *nvpage = NULL; 957 size_t size, nvpsize; 958 pgcnt_t npages1, npages2; 959 960 ASSERT(seg1->s_as && seg2->s_as && seg1->s_as == seg2->s_as); 961 ASSERT(AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock)); 962 ASSERT(seg1->s_ops == seg2->s_ops); 963 964 if (HAT_IS_REGION_COOKIE_VALID(svd1->rcookie) || 965 HAT_IS_REGION_COOKIE_VALID(svd2->rcookie)) { 966 return (-1); 967 } 968 969 /* both segments exist, try to merge them */ 970 #define incompat(x) (svd1->x != svd2->x) 971 if (incompat(vp) || incompat(maxprot) || 972 (!svd1->pageadvice && !svd2->pageadvice && incompat(advice)) || 973 (!svd1->pageprot && !svd2->pageprot && incompat(prot)) || 974 incompat(type) || incompat(cred) || incompat(flags) || 975 seg1->s_szc != seg2->s_szc || incompat(policy_info.mem_policy) || 976 (svd2->softlockcnt > 0)) 977 return (-1); 978 #undef incompat 979 980 /* 981 * vp == NULL implies zfod, offset doesn't matter 982 */ 983 if (svd1->vp != NULL && 984 svd1->offset + seg1->s_size != svd2->offset) { 985 return (-1); 986 } 987 988 /* 989 * Don't concatenate if either segment uses text replication. 990 */ 991 if (svd1->tr_state != SEGVN_TR_OFF || svd2->tr_state != SEGVN_TR_OFF) { 992 return (-1); 993 } 994 995 /* 996 * Fail early if we're not supposed to concatenate 997 * segments with non NULL amp. 
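/*
 * Illustrative sketch (hypothetical caller): segvn_concat() returns 0 on
 * success, -1 if the two segments are not compatible and -2 if memory for
 * the merged vpage/anon structures could not be allocated, in which case
 * both segments are left untouched.
 */
static void
example_try_concat(struct seg *s1, struct seg *s2)
{
	switch (segvn_concat(s1, s2, 0)) {
	case 0:		/* merged; s2 has been freed */
		break;
	case -1:	/* incompatible; leave both segments alone */
		break;
	case -2:	/* allocation failure; may be retried later */
		break;
	}
}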
998 */ 999 if (amp_cat == 0 && (amp1 != NULL || amp2 != NULL)) { 1000 return (-1); 1001 } 1002 1003 if (svd1->vp == NULL && svd1->type == MAP_SHARED) { 1004 if (amp1 != amp2) { 1005 return (-1); 1006 } 1007 if (amp1 != NULL && svd1->anon_index + btop(seg1->s_size) != 1008 svd2->anon_index) { 1009 return (-1); 1010 } 1011 ASSERT(amp1 == NULL || amp1->refcnt >= 2); 1012 } 1013 1014 /* 1015 * If either seg has vpages, create a new merged vpage array. 1016 */ 1017 if (vpage1 != NULL || vpage2 != NULL) { 1018 struct vpage *vp; 1019 1020 npages1 = seg_pages(seg1); 1021 npages2 = seg_pages(seg2); 1022 nvpsize = vpgtob(npages1 + npages2); 1023 1024 if ((nvpage = kmem_zalloc(nvpsize, KM_NOSLEEP)) == NULL) { 1025 return (-2); 1026 } 1027 1028 if (vpage1 != NULL) { 1029 bcopy(vpage1, nvpage, vpgtob(npages1)); 1030 } else { 1031 for (vp = nvpage; vp < nvpage + npages1; vp++) { 1032 VPP_SETPROT(vp, svd1->prot); 1033 VPP_SETADVICE(vp, svd1->advice); 1034 } 1035 } 1036 1037 if (vpage2 != NULL) { 1038 bcopy(vpage2, nvpage + npages1, vpgtob(npages2)); 1039 } else { 1040 for (vp = nvpage + npages1; 1041 vp < nvpage + npages1 + npages2; vp++) { 1042 VPP_SETPROT(vp, svd2->prot); 1043 VPP_SETADVICE(vp, svd2->advice); 1044 } 1045 } 1046 } 1047 1048 /* 1049 * If either segment has private pages, create a new merged anon 1050 * array. If mergeing shared anon segments just decrement anon map's 1051 * refcnt. 1052 */ 1053 if (amp1 != NULL && svd1->type == MAP_SHARED) { 1054 ASSERT(amp1 == amp2 && svd1->vp == NULL); 1055 ANON_LOCK_ENTER(&1->a_rwlock, RW_WRITER); 1056 ASSERT(amp1->refcnt >= 2); 1057 amp1->refcnt--; 1058 ANON_LOCK_EXIT(&1->a_rwlock); 1059 svd2->amp = NULL; 1060 } else if (amp1 != NULL || amp2 != NULL) { 1061 struct anon_hdr *nahp; 1062 struct anon_map *namp = NULL; 1063 size_t asize; 1064 1065 ASSERT(svd1->type == MAP_PRIVATE); 1066 1067 asize = seg1->s_size + seg2->s_size; 1068 if ((nahp = anon_create(btop(asize), ANON_NOSLEEP)) == NULL) { 1069 if (nvpage != NULL) { 1070 kmem_free(nvpage, nvpsize); 1071 } 1072 return (-2); 1073 } 1074 if (amp1 != NULL) { 1075 /* 1076 * XXX anon rwlock is not really needed because 1077 * this is a private segment and we are writers. 
1078 */ 1079 ANON_LOCK_ENTER(&1->a_rwlock, RW_WRITER); 1080 ASSERT(amp1->refcnt == 1); 1081 if (anon_copy_ptr(amp1->ahp, svd1->anon_index, 1082 nahp, 0, btop(seg1->s_size), ANON_NOSLEEP)) { 1083 anon_release(nahp, btop(asize)); 1084 ANON_LOCK_EXIT(&1->a_rwlock); 1085 if (nvpage != NULL) { 1086 kmem_free(nvpage, nvpsize); 1087 } 1088 return (-2); 1089 } 1090 } 1091 if (amp2 != NULL) { 1092 ANON_LOCK_ENTER(&2->a_rwlock, RW_WRITER); 1093 ASSERT(amp2->refcnt == 1); 1094 if (anon_copy_ptr(amp2->ahp, svd2->anon_index, 1095 nahp, btop(seg1->s_size), btop(seg2->s_size), 1096 ANON_NOSLEEP)) { 1097 anon_release(nahp, btop(asize)); 1098 ANON_LOCK_EXIT(&2->a_rwlock); 1099 if (amp1 != NULL) { 1100 ANON_LOCK_EXIT(&1->a_rwlock); 1101 } 1102 if (nvpage != NULL) { 1103 kmem_free(nvpage, nvpsize); 1104 } 1105 return (-2); 1106 } 1107 } 1108 if (amp1 != NULL) { 1109 namp = amp1; 1110 anon_release(amp1->ahp, btop(amp1->size)); 1111 } 1112 if (amp2 != NULL) { 1113 if (namp == NULL) { 1114 ASSERT(amp1 == NULL); 1115 namp = amp2; 1116 anon_release(amp2->ahp, btop(amp2->size)); 1117 } else { 1118 amp2->refcnt--; 1119 ANON_LOCK_EXIT(&2->a_rwlock); 1120 anonmap_free(amp2); 1121 } 1122 svd2->amp = NULL; /* needed for seg_free */ 1123 } 1124 namp->ahp = nahp; 1125 namp->size = asize; 1126 svd1->amp = namp; 1127 svd1->anon_index = 0; 1128 ANON_LOCK_EXIT(&namp->a_rwlock); 1129 } 1130 /* 1131 * Now free the old vpage structures. 1132 */ 1133 if (nvpage != NULL) { 1134 if (vpage1 != NULL) { 1135 kmem_free(vpage1, vpgtob(npages1)); 1136 } 1137 if (vpage2 != NULL) { 1138 svd2->vpage = NULL; 1139 kmem_free(vpage2, vpgtob(npages2)); 1140 } 1141 if (svd2->pageprot) { 1142 svd1->pageprot = 1; 1143 } 1144 if (svd2->pageadvice) { 1145 svd1->pageadvice = 1; 1146 } 1147 svd1->vpage = nvpage; 1148 } 1149 1150 /* all looks ok, merge segments */ 1151 svd1->swresv += svd2->swresv; 1152 svd2->swresv = 0; /* so seg_free doesn't release swap space */ 1153 size = seg2->s_size; 1154 seg_free(seg2); 1155 seg1->s_size += size; 1156 return (0); 1157 } 1158 1159 /* 1160 * Extend the previous segment (seg1) to include the 1161 * new segment (seg2 + a), if possible. 1162 * Return 0 on success. 1163 */ 1164 static int 1165 segvn_extend_prev(seg1, seg2, a, swresv) 1166 struct seg *seg1, *seg2; 1167 struct segvn_crargs *a; 1168 size_t swresv; 1169 { 1170 struct segvn_data *svd1 = (struct segvn_data *)seg1->s_data; 1171 size_t size; 1172 struct anon_map *amp1; 1173 struct vpage *new_vpage; 1174 1175 /* 1176 * We don't need any segment level locks for "segvn" data 1177 * since the address space is "write" locked. 1178 */ 1179 ASSERT(seg1->s_as && AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock)); 1180 1181 if (HAT_IS_REGION_COOKIE_VALID(svd1->rcookie)) { 1182 return (-1); 1183 } 1184 1185 /* second segment is new, try to extend first */ 1186 /* XXX - should also check cred */ 1187 if (svd1->vp != a->vp || svd1->maxprot != a->maxprot || 1188 (!svd1->pageprot && (svd1->prot != a->prot)) || 1189 svd1->type != a->type || svd1->flags != a->flags || 1190 seg1->s_szc != a->szc) 1191 return (-1); 1192 1193 /* vp == NULL implies zfod, offset doesn't matter */ 1194 if (svd1->vp != NULL && 1195 svd1->offset + seg1->s_size != (a->offset & PAGEMASK)) 1196 return (-1); 1197 1198 if (svd1->tr_state != SEGVN_TR_OFF) { 1199 return (-1); 1200 } 1201 1202 amp1 = svd1->amp; 1203 if (amp1) { 1204 pgcnt_t newpgs; 1205 1206 /* 1207 * Segment has private pages, can data structures 1208 * be expanded? 
1209 * 1210 * Acquire the anon_map lock to prevent it from changing, 1211 * if it is shared. This ensures that the anon_map 1212 * will not change while a thread which has a read/write 1213 * lock on an address space references it. 1214 * XXX - Don't need the anon_map lock at all if "refcnt" 1215 * is 1. 1216 * 1217 * Can't grow a MAP_SHARED segment with an anonmap because 1218 * there may be existing anon slots where we want to extend 1219 * the segment and we wouldn't know what to do with them 1220 * (e.g., for tmpfs right thing is to just leave them there, 1221 * for /dev/zero they should be cleared out). 1222 */ 1223 if (svd1->type == MAP_SHARED) 1224 return (-1); 1225 1226 ANON_LOCK_ENTER(&1->a_rwlock, RW_WRITER); 1227 if (amp1->refcnt > 1) { 1228 ANON_LOCK_EXIT(&1->a_rwlock); 1229 return (-1); 1230 } 1231 newpgs = anon_grow(amp1->ahp, &svd1->anon_index, 1232 btop(seg1->s_size), btop(seg2->s_size), ANON_NOSLEEP); 1233 1234 if (newpgs == 0) { 1235 ANON_LOCK_EXIT(&1->a_rwlock); 1236 return (-1); 1237 } 1238 amp1->size = ptob(newpgs); 1239 ANON_LOCK_EXIT(&1->a_rwlock); 1240 } 1241 if (svd1->vpage != NULL) { 1242 struct vpage *vp, *evp; 1243 new_vpage = 1244 kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)), 1245 KM_NOSLEEP); 1246 if (new_vpage == NULL) 1247 return (-1); 1248 bcopy(svd1->vpage, new_vpage, vpgtob(seg_pages(seg1))); 1249 kmem_free(svd1->vpage, vpgtob(seg_pages(seg1))); 1250 svd1->vpage = new_vpage; 1251 1252 vp = new_vpage + seg_pages(seg1); 1253 evp = vp + seg_pages(seg2); 1254 for (; vp < evp; vp++) 1255 VPP_SETPROT(vp, a->prot); 1256 } 1257 size = seg2->s_size; 1258 seg_free(seg2); 1259 seg1->s_size += size; 1260 svd1->swresv += swresv; 1261 if (svd1->pageprot && (a->prot & PROT_WRITE) && 1262 svd1->type == MAP_SHARED && svd1->vp != NULL && 1263 (svd1->vp->v_flag & VVMEXEC)) { 1264 ASSERT(vn_is_mapped(svd1->vp, V_WRITE)); 1265 segvn_inval_trcache(svd1->vp); 1266 } 1267 return (0); 1268 } 1269 1270 /* 1271 * Extend the next segment (seg2) to include the 1272 * new segment (seg1 + a), if possible. 1273 * Return 0 on success. 1274 */ 1275 static int 1276 segvn_extend_next( 1277 struct seg *seg1, 1278 struct seg *seg2, 1279 struct segvn_crargs *a, 1280 size_t swresv) 1281 { 1282 struct segvn_data *svd2 = (struct segvn_data *)seg2->s_data; 1283 size_t size; 1284 struct anon_map *amp2; 1285 struct vpage *new_vpage; 1286 1287 /* 1288 * We don't need any segment level locks for "segvn" data 1289 * since the address space is "write" locked. 1290 */ 1291 ASSERT(seg2->s_as && AS_WRITE_HELD(seg2->s_as, &seg2->s_as->a_lock)); 1292 1293 if (HAT_IS_REGION_COOKIE_VALID(svd2->rcookie)) { 1294 return (-1); 1295 } 1296 1297 /* first segment is new, try to extend second */ 1298 /* XXX - should also check cred */ 1299 if (svd2->vp != a->vp || svd2->maxprot != a->maxprot || 1300 (!svd2->pageprot && (svd2->prot != a->prot)) || 1301 svd2->type != a->type || svd2->flags != a->flags || 1302 seg2->s_szc != a->szc) 1303 return (-1); 1304 /* vp == NULL implies zfod, offset doesn't matter */ 1305 if (svd2->vp != NULL && 1306 (a->offset & PAGEMASK) + seg1->s_size != svd2->offset) 1307 return (-1); 1308 1309 if (svd2->tr_state != SEGVN_TR_OFF) { 1310 return (-1); 1311 } 1312 1313 amp2 = svd2->amp; 1314 if (amp2) { 1315 pgcnt_t newpgs; 1316 1317 /* 1318 * Segment has private pages, can data structures 1319 * be expanded? 1320 * 1321 * Acquire the anon_map lock to prevent it from changing, 1322 * if it is shared. 
This ensures that the anon_map 1323 * will not change while a thread which has a read/write 1324 * lock on an address space references it. 1325 * 1326 * XXX - Don't need the anon_map lock at all if "refcnt" 1327 * is 1. 1328 */ 1329 if (svd2->type == MAP_SHARED) 1330 return (-1); 1331 1332 ANON_LOCK_ENTER(&2->a_rwlock, RW_WRITER); 1333 if (amp2->refcnt > 1) { 1334 ANON_LOCK_EXIT(&2->a_rwlock); 1335 return (-1); 1336 } 1337 newpgs = anon_grow(amp2->ahp, &svd2->anon_index, 1338 btop(seg2->s_size), btop(seg1->s_size), 1339 ANON_NOSLEEP | ANON_GROWDOWN); 1340 1341 if (newpgs == 0) { 1342 ANON_LOCK_EXIT(&2->a_rwlock); 1343 return (-1); 1344 } 1345 amp2->size = ptob(newpgs); 1346 ANON_LOCK_EXIT(&2->a_rwlock); 1347 } 1348 if (svd2->vpage != NULL) { 1349 struct vpage *vp, *evp; 1350 new_vpage = 1351 kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)), 1352 KM_NOSLEEP); 1353 if (new_vpage == NULL) { 1354 /* Not merging segments so adjust anon_index back */ 1355 if (amp2) 1356 svd2->anon_index += seg_pages(seg1); 1357 return (-1); 1358 } 1359 bcopy(svd2->vpage, new_vpage + seg_pages(seg1), 1360 vpgtob(seg_pages(seg2))); 1361 kmem_free(svd2->vpage, vpgtob(seg_pages(seg2))); 1362 svd2->vpage = new_vpage; 1363 1364 vp = new_vpage; 1365 evp = vp + seg_pages(seg1); 1366 for (; vp < evp; vp++) 1367 VPP_SETPROT(vp, a->prot); 1368 } 1369 size = seg1->s_size; 1370 seg_free(seg1); 1371 seg2->s_size += size; 1372 seg2->s_base -= size; 1373 svd2->offset -= size; 1374 svd2->swresv += swresv; 1375 if (svd2->pageprot && (a->prot & PROT_WRITE) && 1376 svd2->type == MAP_SHARED && svd2->vp != NULL && 1377 (svd2->vp->v_flag & VVMEXEC)) { 1378 ASSERT(vn_is_mapped(svd2->vp, V_WRITE)); 1379 segvn_inval_trcache(svd2->vp); 1380 } 1381 return (0); 1382 } 1383 1384 static int 1385 segvn_dup(struct seg *seg, struct seg *newseg) 1386 { 1387 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1388 struct segvn_data *newsvd; 1389 pgcnt_t npages = seg_pages(seg); 1390 int error = 0; 1391 uint_t prot; 1392 size_t len; 1393 struct anon_map *amp; 1394 1395 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 1396 1397 /* 1398 * If segment has anon reserved, reserve more for the new seg. 1399 * For a MAP_NORESERVE segment swresv will be a count of all the 1400 * allocated anon slots; thus we reserve for the child as many slots 1401 * as the parent has allocated. This semantic prevents the child or 1402 * parent from dieing during a copy-on-write fault caused by trying 1403 * to write a shared pre-existing anon page. 
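/*
 * Illustrative sketch (hypothetical helper): the fork-time rule applied at
 * the top of segvn_dup() below -- reserve for the child exactly as much
 * anon space as the parent currently has reserved, and fail the dup with
 * ENOMEM if that reservation cannot be made.
 */
static int
example_dup_reserve(struct segvn_data *svd)
{
	if (svd->swresv != 0 && anon_resv(svd->swresv) == 0)
		return (ENOMEM);
	return (0);
}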
1404 */ 1405 if ((len = svd->swresv) != 0) { 1406 if (anon_resv(svd->swresv) == 0) 1407 return (ENOMEM); 1408 1409 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 1410 seg, len, 0); 1411 } 1412 1413 newsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 1414 1415 newseg->s_ops = &segvn_ops; 1416 newseg->s_data = (void *)newsvd; 1417 newseg->s_szc = seg->s_szc; 1418 1419 newsvd->seg = newseg; 1420 if ((newsvd->vp = svd->vp) != NULL) { 1421 VN_HOLD(svd->vp); 1422 if (svd->type == MAP_SHARED) 1423 lgrp_shm_policy_init(NULL, svd->vp); 1424 } 1425 newsvd->offset = svd->offset; 1426 newsvd->prot = svd->prot; 1427 newsvd->maxprot = svd->maxprot; 1428 newsvd->pageprot = svd->pageprot; 1429 newsvd->type = svd->type; 1430 newsvd->cred = svd->cred; 1431 crhold(newsvd->cred); 1432 newsvd->advice = svd->advice; 1433 newsvd->pageadvice = svd->pageadvice; 1434 newsvd->swresv = svd->swresv; 1435 newsvd->flags = svd->flags; 1436 newsvd->softlockcnt = 0; 1437 newsvd->policy_info = svd->policy_info; 1438 newsvd->rcookie = HAT_INVALID_REGION_COOKIE; 1439 1440 if ((amp = svd->amp) == NULL || svd->tr_state == SEGVN_TR_ON) { 1441 /* 1442 * Not attaching to a shared anon object. 1443 */ 1444 ASSERT(!HAT_IS_REGION_COOKIE_VALID(svd->rcookie) || 1445 svd->tr_state == SEGVN_TR_OFF); 1446 if (svd->tr_state == SEGVN_TR_ON) { 1447 ASSERT(newsvd->vp != NULL && amp != NULL); 1448 newsvd->tr_state = SEGVN_TR_INIT; 1449 } else { 1450 newsvd->tr_state = svd->tr_state; 1451 } 1452 newsvd->amp = NULL; 1453 newsvd->anon_index = 0; 1454 } else { 1455 /* regions for now are only used on pure vnode segments */ 1456 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 1457 ASSERT(svd->tr_state == SEGVN_TR_OFF); 1458 newsvd->tr_state = SEGVN_TR_OFF; 1459 if (svd->type == MAP_SHARED) { 1460 newsvd->amp = amp; 1461 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1462 amp->refcnt++; 1463 ANON_LOCK_EXIT(&->a_rwlock); 1464 newsvd->anon_index = svd->anon_index; 1465 } else { 1466 int reclaim = 1; 1467 1468 /* 1469 * Allocate and initialize new anon_map structure. 1470 */ 1471 newsvd->amp = anonmap_alloc(newseg->s_size, 0, 1472 ANON_SLEEP); 1473 newsvd->amp->a_szc = newseg->s_szc; 1474 newsvd->anon_index = 0; 1475 1476 /* 1477 * We don't have to acquire the anon_map lock 1478 * for the new segment (since it belongs to an 1479 * address space that is still not associated 1480 * with any process), or the segment in the old 1481 * address space (since all threads in it 1482 * are stopped while duplicating the address space). 1483 */ 1484 1485 /* 1486 * The goal of the following code is to make sure that 1487 * softlocked pages do not end up as copy on write 1488 * pages. This would cause problems where one 1489 * thread writes to a page that is COW and a different 1490 * thread in the same process has softlocked it. The 1491 * softlock lock would move away from this process 1492 * because the write would cause this process to get 1493 * a copy (without the softlock). 1494 * 1495 * The strategy here is to just break the 1496 * sharing on pages that could possibly be 1497 * softlocked. 1498 */ 1499 retry: 1500 if (svd->softlockcnt) { 1501 struct anon *ap, *newap; 1502 size_t i; 1503 uint_t vpprot; 1504 page_t *anon_pl[1+1], *pp; 1505 caddr_t addr; 1506 ulong_t old_idx = svd->anon_index; 1507 ulong_t new_idx = 0; 1508 1509 /* 1510 * The softlock count might be non zero 1511 * because some pages are still stuck in the 1512 * cache for lazy reclaim. Flush the cache 1513 * now. This should drop the count to zero. 
1514 * [or there is really I/O going on to these 1515 * pages]. Note, we have the writers lock so 1516 * nothing gets inserted during the flush. 1517 */ 1518 if (reclaim == 1) { 1519 segvn_purge(seg); 1520 reclaim = 0; 1521 goto retry; 1522 } 1523 i = btopr(seg->s_size); 1524 addr = seg->s_base; 1525 /* 1526 * XXX break cow sharing using PAGESIZE 1527 * pages. They will be relocated into larger 1528 * pages at fault time. 1529 */ 1530 while (i-- > 0) { 1531 if (ap = anon_get_ptr(amp->ahp, 1532 old_idx)) { 1533 error = anon_getpage(&ap, 1534 &vpprot, anon_pl, PAGESIZE, 1535 seg, addr, S_READ, 1536 svd->cred); 1537 if (error) { 1538 newsvd->vpage = NULL; 1539 goto out; 1540 } 1541 /* 1542 * prot need not be computed 1543 * below 'cause anon_private is 1544 * going to ignore it anyway 1545 * as child doesn't inherit 1546 * pagelock from parent. 1547 */ 1548 prot = svd->pageprot ? 1549 VPP_PROT( 1550 &svd->vpage[ 1551 seg_page(seg, addr)]) 1552 : svd->prot; 1553 pp = anon_private(&newap, 1554 newseg, addr, prot, 1555 anon_pl[0], 0, 1556 newsvd->cred); 1557 if (pp == NULL) { 1558 /* no mem abort */ 1559 newsvd->vpage = NULL; 1560 error = ENOMEM; 1561 goto out; 1562 } 1563 (void) anon_set_ptr( 1564 newsvd->amp->ahp, new_idx, 1565 newap, ANON_SLEEP); 1566 page_unlock(pp); 1567 } 1568 addr += PAGESIZE; 1569 old_idx++; 1570 new_idx++; 1571 } 1572 } else { /* common case */ 1573 if (seg->s_szc != 0) { 1574 /* 1575 * If at least one of anon slots of a 1576 * large page exists then make sure 1577 * all anon slots of a large page 1578 * exist to avoid partial cow sharing 1579 * of a large page in the future. 1580 */ 1581 anon_dup_fill_holes(amp->ahp, 1582 svd->anon_index, newsvd->amp->ahp, 1583 0, seg->s_size, seg->s_szc, 1584 svd->vp != NULL); 1585 } else { 1586 anon_dup(amp->ahp, svd->anon_index, 1587 newsvd->amp->ahp, 0, seg->s_size); 1588 } 1589 1590 hat_clrattr(seg->s_as->a_hat, seg->s_base, 1591 seg->s_size, PROT_WRITE); 1592 } 1593 } 1594 } 1595 /* 1596 * If necessary, create a vpage structure for the new segment. 1597 * Do not copy any page lock indications. 
1598 */ 1599 if (svd->vpage != NULL) { 1600 uint_t i; 1601 struct vpage *ovp = svd->vpage; 1602 struct vpage *nvp; 1603 1604 nvp = newsvd->vpage = 1605 kmem_alloc(vpgtob(npages), KM_SLEEP); 1606 for (i = 0; i < npages; i++) { 1607 *nvp = *ovp++; 1608 VPP_CLRPPLOCK(nvp++); 1609 } 1610 } else 1611 newsvd->vpage = NULL; 1612 1613 /* Inform the vnode of the new mapping */ 1614 if (newsvd->vp != NULL) { 1615 error = VOP_ADDMAP(newsvd->vp, (offset_t)newsvd->offset, 1616 newseg->s_as, newseg->s_base, newseg->s_size, newsvd->prot, 1617 newsvd->maxprot, newsvd->type, newsvd->cred); 1618 } 1619 out: 1620 if (error == 0 && HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 1621 ASSERT(newsvd->amp == NULL); 1622 ASSERT(newsvd->tr_state == SEGVN_TR_OFF); 1623 newsvd->rcookie = svd->rcookie; 1624 hat_dup_region(newseg->s_as->a_hat, newsvd->rcookie); 1625 } 1626 return (error); 1627 } 1628 1629 1630 /* 1631 * callback function used by segvn_unmap to invoke free_vp_pages() for only 1632 * those pages actually processed by the HAT 1633 */ 1634 extern int free_pages; 1635 1636 static void 1637 segvn_hat_rgn_unload_callback(caddr_t saddr, caddr_t eaddr, caddr_t r_saddr, 1638 size_t r_size, void *r_obj, u_offset_t r_objoff) 1639 { 1640 u_offset_t off; 1641 size_t len; 1642 vnode_t *vp = (vnode_t *)r_obj; 1643 1644 ASSERT(eaddr > saddr); 1645 ASSERT(saddr >= r_saddr); 1646 ASSERT(saddr < r_saddr + r_size); 1647 ASSERT(eaddr > r_saddr); 1648 ASSERT(eaddr <= r_saddr + r_size); 1649 ASSERT(vp != NULL); 1650 1651 if (!free_pages) { 1652 return; 1653 } 1654 1655 len = eaddr - saddr; 1656 off = (saddr - r_saddr) + r_objoff; 1657 free_vp_pages(vp, off, len); 1658 } 1659 1660 static void 1661 segvn_hat_unload_callback(hat_callback_t *cb) 1662 { 1663 struct seg *seg = cb->hcb_data; 1664 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1665 size_t len; 1666 u_offset_t off; 1667 1668 ASSERT(svd->vp != NULL); 1669 ASSERT(cb->hcb_end_addr > cb->hcb_start_addr); 1670 ASSERT(cb->hcb_start_addr >= seg->s_base); 1671 1672 len = cb->hcb_end_addr - cb->hcb_start_addr; 1673 off = cb->hcb_start_addr - seg->s_base; 1674 free_vp_pages(svd->vp, svd->offset + off, len); 1675 } 1676 1677 static int 1678 segvn_unmap(struct seg *seg, caddr_t addr, size_t len) 1679 { 1680 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1681 struct segvn_data *nsvd; 1682 struct seg *nseg; 1683 struct anon_map *amp; 1684 pgcnt_t opages; /* old segment size in pages */ 1685 pgcnt_t npages; /* new segment size in pages */ 1686 pgcnt_t dpages; /* pages being deleted (unmapped) */ 1687 hat_callback_t callback; /* used for free_vp_pages() */ 1688 hat_callback_t *cbp = NULL; 1689 caddr_t nbase; 1690 size_t nsize; 1691 size_t oswresv; 1692 int reclaim = 1; 1693 1694 /* 1695 * We don't need any segment level locks for "segvn" data 1696 * since the address space is "write" locked. 1697 */ 1698 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 1699 1700 /* 1701 * Fail the unmap if pages are SOFTLOCKed through this mapping. 1702 * softlockcnt is protected from change by the as write lock. 1703 */ 1704 retry: 1705 if (svd->softlockcnt > 0) { 1706 ASSERT(svd->tr_state == SEGVN_TR_OFF); 1707 /* 1708 * since we do have the writers lock nobody can fill 1709 * the cache during the purge. The flush either succeeds 1710 * or we still have pending I/Os. 
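/*
 * Illustrative sketch: the purge-and-retry idiom used by both segvn_dup()
 * and segvn_unmap().  A non-zero softlockcnt may only mean pages are
 * parked in the pagelock cache for lazy reclaim, so the cache is flushed
 * once before the count is believed.  The helper below is a hypothetical
 * restatement of that idiom, not code from the driver.
 */
static int
example_softlock_busy(struct seg *seg, struct segvn_data *svd)
{
	int reclaim = 1;

retry:
	if (svd->softlockcnt > 0) {
		if (reclaim) {
			segvn_purge(seg);	/* flush pagelock cache */
			reclaim = 0;
			goto retry;
		}
		return (1);		/* still busy; I/O may be pending */
	}
	return (0);
}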
1711 */ 1712 if (reclaim == 1) { 1713 segvn_purge(seg); 1714 reclaim = 0; 1715 goto retry; 1716 } 1717 return (EAGAIN); 1718 } 1719 1720 /* 1721 * Check for bad sizes 1722 */ 1723 if (addr < seg->s_base || addr + len > seg->s_base + seg->s_size || 1724 (len & PAGEOFFSET) || ((uintptr_t)addr & PAGEOFFSET)) { 1725 panic("segvn_unmap"); 1726 /*NOTREACHED*/ 1727 } 1728 1729 if (seg->s_szc != 0) { 1730 size_t pgsz = page_get_pagesize(seg->s_szc); 1731 int err; 1732 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { 1733 ASSERT(seg->s_base != addr || seg->s_size != len); 1734 if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 1735 ASSERT(svd->amp == NULL); 1736 ASSERT(svd->tr_state == SEGVN_TR_OFF); 1737 hat_leave_region(seg->s_as->a_hat, 1738 svd->rcookie, HAT_REGION_TEXT); 1739 svd->rcookie = HAT_INVALID_REGION_COOKIE; 1740 /* 1741 * could pass a flag to segvn_demote_range() 1742 * below to tell it not to do any unloads but 1743 * this case is rare enough to not bother for 1744 * now. 1745 */ 1746 } else if (svd->tr_state == SEGVN_TR_INIT) { 1747 svd->tr_state = SEGVN_TR_OFF; 1748 } else if (svd->tr_state == SEGVN_TR_ON) { 1749 ASSERT(svd->amp != NULL); 1750 segvn_textunrepl(seg, 1); 1751 ASSERT(svd->amp == NULL); 1752 ASSERT(svd->tr_state == SEGVN_TR_OFF); 1753 } 1754 VM_STAT_ADD(segvnvmstats.demoterange[0]); 1755 err = segvn_demote_range(seg, addr, len, SDR_END, 0); 1756 if (err == 0) { 1757 return (IE_RETRY); 1758 } 1759 return (err); 1760 } 1761 } 1762 1763 /* Inform the vnode of the unmapping. */ 1764 if (svd->vp) { 1765 int error; 1766 1767 error = VOP_DELMAP(svd->vp, 1768 (offset_t)svd->offset + (uintptr_t)(addr - seg->s_base), 1769 seg->s_as, addr, len, svd->prot, svd->maxprot, 1770 svd->type, svd->cred); 1771 1772 if (error == EAGAIN) 1773 return (error); 1774 } 1775 1776 /* 1777 * Remove any page locks set through this mapping. 1778 * If text replication is not off no page locks could have been 1779 * established via this mapping. 1780 */ 1781 if (svd->tr_state == SEGVN_TR_OFF) { 1782 (void) segvn_lockop(seg, addr, len, 0, MC_UNLOCK, NULL, 0); 1783 } 1784 1785 if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 1786 ASSERT(svd->amp == NULL); 1787 ASSERT(svd->tr_state == SEGVN_TR_OFF); 1788 ASSERT(svd->type == MAP_PRIVATE); 1789 hat_leave_region(seg->s_as->a_hat, svd->rcookie, 1790 HAT_REGION_TEXT); 1791 svd->rcookie = HAT_INVALID_REGION_COOKIE; 1792 } else if (svd->tr_state == SEGVN_TR_ON) { 1793 ASSERT(svd->amp != NULL); 1794 ASSERT(svd->pageprot == 0 && !(svd->prot & PROT_WRITE)); 1795 segvn_textunrepl(seg, 1); 1796 ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF); 1797 } else { 1798 if (svd->tr_state != SEGVN_TR_OFF) { 1799 ASSERT(svd->tr_state == SEGVN_TR_INIT); 1800 svd->tr_state = SEGVN_TR_OFF; 1801 } 1802 /* 1803 * Unload any hardware translations in the range to be taken 1804 * out. Use a callback to invoke free_vp_pages() effectively. 
1805 */ 1806 if (svd->vp != NULL && free_pages != 0) { 1807 callback.hcb_data = seg; 1808 callback.hcb_function = segvn_hat_unload_callback; 1809 cbp = &callback; 1810 } 1811 hat_unload_callback(seg->s_as->a_hat, addr, len, 1812 HAT_UNLOAD_UNMAP, cbp); 1813 1814 if (svd->type == MAP_SHARED && svd->vp != NULL && 1815 (svd->vp->v_flag & VVMEXEC) && 1816 ((svd->prot & PROT_WRITE) || svd->pageprot)) { 1817 segvn_inval_trcache(svd->vp); 1818 } 1819 } 1820 1821 /* 1822 * Check for entire segment 1823 */ 1824 if (addr == seg->s_base && len == seg->s_size) { 1825 seg_free(seg); 1826 return (0); 1827 } 1828 1829 opages = seg_pages(seg); 1830 dpages = btop(len); 1831 npages = opages - dpages; 1832 amp = svd->amp; 1833 ASSERT(amp == NULL || amp->a_szc >= seg->s_szc); 1834 1835 /* 1836 * Check for beginning of segment 1837 */ 1838 if (addr == seg->s_base) { 1839 if (svd->vpage != NULL) { 1840 size_t nbytes; 1841 struct vpage *ovpage; 1842 1843 ovpage = svd->vpage; /* keep pointer to vpage */ 1844 1845 nbytes = vpgtob(npages); 1846 svd->vpage = kmem_alloc(nbytes, KM_SLEEP); 1847 bcopy(&ovpage[dpages], svd->vpage, nbytes); 1848 1849 /* free up old vpage */ 1850 kmem_free(ovpage, vpgtob(opages)); 1851 } 1852 if (amp != NULL) { 1853 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1854 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { 1855 /* 1856 * Free up now unused parts of anon_map array. 1857 */ 1858 if (amp->a_szc == seg->s_szc) { 1859 if (seg->s_szc != 0) { 1860 anon_free_pages(amp->ahp, 1861 svd->anon_index, len, 1862 seg->s_szc); 1863 } else { 1864 anon_free(amp->ahp, 1865 svd->anon_index, 1866 len); 1867 } 1868 } else { 1869 ASSERT(svd->type == MAP_SHARED); 1870 ASSERT(amp->a_szc > seg->s_szc); 1871 anon_shmap_free_pages(amp, 1872 svd->anon_index, len); 1873 } 1874 1875 /* 1876 * Unreserve swap space for the 1877 * unmapped chunk of this segment in 1878 * case it's MAP_SHARED 1879 */ 1880 if (svd->type == MAP_SHARED) { 1881 anon_unresv(len); 1882 amp->swresv -= len; 1883 } 1884 } 1885 ANON_LOCK_EXIT(&->a_rwlock); 1886 svd->anon_index += dpages; 1887 } 1888 if (svd->vp != NULL) 1889 svd->offset += len; 1890 1891 if (svd->swresv) { 1892 if (svd->flags & MAP_NORESERVE) { 1893 ASSERT(amp); 1894 oswresv = svd->swresv; 1895 1896 svd->swresv = ptob(anon_pages(amp->ahp, 1897 svd->anon_index, npages)); 1898 anon_unresv(oswresv - svd->swresv); 1899 } else { 1900 anon_unresv(len); 1901 svd->swresv -= len; 1902 } 1903 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 1904 seg, len, 0); 1905 } 1906 1907 seg->s_base += len; 1908 seg->s_size -= len; 1909 return (0); 1910 } 1911 1912 /* 1913 * Check for end of segment 1914 */ 1915 if (addr + len == seg->s_base + seg->s_size) { 1916 if (svd->vpage != NULL) { 1917 size_t nbytes; 1918 struct vpage *ovpage; 1919 1920 ovpage = svd->vpage; /* keep pointer to vpage */ 1921 1922 nbytes = vpgtob(npages); 1923 svd->vpage = kmem_alloc(nbytes, KM_SLEEP); 1924 bcopy(ovpage, svd->vpage, nbytes); 1925 1926 /* free up old vpage */ 1927 kmem_free(ovpage, vpgtob(opages)); 1928 1929 } 1930 if (amp != NULL) { 1931 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1932 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { 1933 /* 1934 * Free up now unused parts of anon_map array. 
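/*
 * Illustrative sketch (hypothetical helper): how swap reservation is
 * trimmed when part of a MAP_NORESERVE segment is unmapped.  Only the anon
 * slots that remain allocated are counted, and the difference between the
 * old reservation and that amount is released.
 */
static void
example_trim_swresv(struct segvn_data *svd, struct anon_map *amp,
    pgcnt_t npages)
{
	size_t oswresv = svd->swresv;

	svd->swresv = ptob(anon_pages(amp->ahp, svd->anon_index, npages));
	anon_unresv(oswresv - svd->swresv);
}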
1935 */ 1936 ulong_t an_idx = svd->anon_index + npages; 1937 if (amp->a_szc == seg->s_szc) { 1938 if (seg->s_szc != 0) { 1939 anon_free_pages(amp->ahp, 1940 an_idx, len, 1941 seg->s_szc); 1942 } else { 1943 anon_free(amp->ahp, an_idx, 1944 len); 1945 } 1946 } else { 1947 ASSERT(svd->type == MAP_SHARED); 1948 ASSERT(amp->a_szc > seg->s_szc); 1949 anon_shmap_free_pages(amp, 1950 an_idx, len); 1951 } 1952 1953 /* 1954 * Unreserve swap space for the 1955 * unmapped chunk of this segment in 1956 * case it's MAP_SHARED 1957 */ 1958 if (svd->type == MAP_SHARED) { 1959 anon_unresv(len); 1960 amp->swresv -= len; 1961 } 1962 } 1963 ANON_LOCK_EXIT(&->a_rwlock); 1964 } 1965 1966 if (svd->swresv) { 1967 if (svd->flags & MAP_NORESERVE) { 1968 ASSERT(amp); 1969 oswresv = svd->swresv; 1970 svd->swresv = ptob(anon_pages(amp->ahp, 1971 svd->anon_index, npages)); 1972 anon_unresv(oswresv - svd->swresv); 1973 } else { 1974 anon_unresv(len); 1975 svd->swresv -= len; 1976 } 1977 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 1978 "anon proc:%p %lu %u", seg, len, 0); 1979 } 1980 1981 seg->s_size -= len; 1982 return (0); 1983 } 1984 1985 /* 1986 * The section to go is in the middle of the segment, 1987 * have to make it into two segments. nseg is made for 1988 * the high end while seg is cut down at the low end. 1989 */ 1990 nbase = addr + len; /* new seg base */ 1991 nsize = (seg->s_base + seg->s_size) - nbase; /* new seg size */ 1992 seg->s_size = addr - seg->s_base; /* shrink old seg */ 1993 nseg = seg_alloc(seg->s_as, nbase, nsize); 1994 if (nseg == NULL) { 1995 panic("segvn_unmap seg_alloc"); 1996 /*NOTREACHED*/ 1997 } 1998 nseg->s_ops = seg->s_ops; 1999 nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 2000 nseg->s_data = (void *)nsvd; 2001 nseg->s_szc = seg->s_szc; 2002 *nsvd = *svd; 2003 nsvd->seg = nseg; 2004 nsvd->offset = svd->offset + (uintptr_t)(nseg->s_base - seg->s_base); 2005 nsvd->swresv = 0; 2006 nsvd->softlockcnt = 0; 2007 ASSERT(nsvd->rcookie == HAT_INVALID_REGION_COOKIE); 2008 2009 if (svd->vp != NULL) { 2010 VN_HOLD(nsvd->vp); 2011 if (nsvd->type == MAP_SHARED) 2012 lgrp_shm_policy_init(NULL, nsvd->vp); 2013 } 2014 crhold(svd->cred); 2015 2016 if (svd->vpage == NULL) { 2017 nsvd->vpage = NULL; 2018 } else { 2019 /* need to split vpage into two arrays */ 2020 size_t nbytes; 2021 struct vpage *ovpage; 2022 2023 ovpage = svd->vpage; /* keep pointer to vpage */ 2024 2025 npages = seg_pages(seg); /* seg has shrunk */ 2026 nbytes = vpgtob(npages); 2027 svd->vpage = kmem_alloc(nbytes, KM_SLEEP); 2028 2029 bcopy(ovpage, svd->vpage, nbytes); 2030 2031 npages = seg_pages(nseg); 2032 nbytes = vpgtob(npages); 2033 nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP); 2034 2035 bcopy(&ovpage[opages - npages], nsvd->vpage, nbytes); 2036 2037 /* free up old vpage */ 2038 kmem_free(ovpage, vpgtob(opages)); 2039 } 2040 2041 if (amp == NULL) { 2042 nsvd->amp = NULL; 2043 nsvd->anon_index = 0; 2044 } else { 2045 /* 2046 * Need to create a new anon map for the new segment. 2047 * We'll also allocate a new smaller array for the old 2048 * smaller segment to save space. 2049 */ 2050 opages = btop((uintptr_t)(addr - seg->s_base)); 2051 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 2052 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { 2053 /* 2054 * Free up now unused parts of anon_map array. 
2055 */ 2056 ulong_t an_idx = svd->anon_index + opages; 2057 if (amp->a_szc == seg->s_szc) { 2058 if (seg->s_szc != 0) { 2059 anon_free_pages(amp->ahp, an_idx, len, 2060 seg->s_szc); 2061 } else { 2062 anon_free(amp->ahp, an_idx, 2063 len); 2064 } 2065 } else { 2066 ASSERT(svd->type == MAP_SHARED); 2067 ASSERT(amp->a_szc > seg->s_szc); 2068 anon_shmap_free_pages(amp, an_idx, len); 2069 } 2070 2071 /* 2072 * Unreserve swap space for the 2073 * unmapped chunk of this segment in 2074 * case it's MAP_SHARED 2075 */ 2076 if (svd->type == MAP_SHARED) { 2077 anon_unresv(len); 2078 amp->swresv -= len; 2079 } 2080 } 2081 nsvd->anon_index = svd->anon_index + 2082 btop((uintptr_t)(nseg->s_base - seg->s_base)); 2083 if (svd->type == MAP_SHARED) { 2084 amp->refcnt++; 2085 nsvd->amp = amp; 2086 } else { 2087 struct anon_map *namp; 2088 struct anon_hdr *nahp; 2089 2090 ASSERT(svd->type == MAP_PRIVATE); 2091 nahp = anon_create(btop(seg->s_size), ANON_SLEEP); 2092 namp = anonmap_alloc(nseg->s_size, 0, ANON_SLEEP); 2093 namp->a_szc = seg->s_szc; 2094 (void) anon_copy_ptr(amp->ahp, svd->anon_index, nahp, 2095 0, btop(seg->s_size), ANON_SLEEP); 2096 (void) anon_copy_ptr(amp->ahp, nsvd->anon_index, 2097 namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP); 2098 anon_release(amp->ahp, btop(amp->size)); 2099 svd->anon_index = 0; 2100 nsvd->anon_index = 0; 2101 amp->ahp = nahp; 2102 amp->size = seg->s_size; 2103 nsvd->amp = namp; 2104 } 2105 ANON_LOCK_EXIT(&->a_rwlock); 2106 } 2107 if (svd->swresv) { 2108 if (svd->flags & MAP_NORESERVE) { 2109 ASSERT(amp); 2110 oswresv = svd->swresv; 2111 svd->swresv = ptob(anon_pages(amp->ahp, 2112 svd->anon_index, btop(seg->s_size))); 2113 nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp, 2114 nsvd->anon_index, btop(nseg->s_size))); 2115 ASSERT(oswresv >= (svd->swresv + nsvd->swresv)); 2116 anon_unresv(oswresv - (svd->swresv + nsvd->swresv)); 2117 } else { 2118 if (seg->s_size + nseg->s_size + len != svd->swresv) { 2119 panic("segvn_unmap: " 2120 "cannot split swap reservation"); 2121 /*NOTREACHED*/ 2122 } 2123 anon_unresv(len); 2124 svd->swresv = seg->s_size; 2125 nsvd->swresv = nseg->s_size; 2126 } 2127 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 2128 seg, len, 0); 2129 } 2130 2131 return (0); /* I'm glad that's all over with! */ 2132 } 2133 2134 static void 2135 segvn_free(struct seg *seg) 2136 { 2137 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 2138 pgcnt_t npages = seg_pages(seg); 2139 struct anon_map *amp; 2140 size_t len; 2141 2142 /* 2143 * We don't need any segment level locks for "segvn" data 2144 * since the address space is "write" locked. 2145 */ 2146 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 2147 ASSERT(svd->tr_state == SEGVN_TR_OFF); 2148 2149 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 2150 2151 /* 2152 * Be sure to unlock pages. XXX Why do things get free'ed instead 2153 * of unmapped? XXX 2154 */ 2155 (void) segvn_lockop(seg, seg->s_base, seg->s_size, 2156 0, MC_UNLOCK, NULL, 0); 2157 2158 /* 2159 * Deallocate the vpage and anon pointers if necessary and possible. 2160 */ 2161 if (svd->vpage != NULL) { 2162 kmem_free(svd->vpage, vpgtob(npages)); 2163 svd->vpage = NULL; 2164 } 2165 if ((amp = svd->amp) != NULL) { 2166 /* 2167 * If there are no more references to this anon_map 2168 * structure, then deallocate the structure after freeing 2169 * up all the anon slot pointers that we can. 
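         * For example (sketch of the cases below): dropping the last
         * reference to a MAP_SHARED anon_map frees the whole map and
         * returns its swap reservation, while a MAP_PRIVATE segment frees
         * only the anon slots covering its own range, whether or not it
         * holds the last reference.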
        */
        ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
        ASSERT(amp->a_szc >= seg->s_szc);
        if (--amp->refcnt == 0) {
            if (svd->type == MAP_PRIVATE) {
                /*
                 * Private - we only need to anon_free
                 * the part that this segment refers to.
                 */
                if (seg->s_szc != 0) {
                    anon_free_pages(amp->ahp,
                        svd->anon_index, seg->s_size,
                        seg->s_szc);
                } else {
                    anon_free(amp->ahp, svd->anon_index,
                        seg->s_size);
                }
            } else {
                /*
                 * Shared - anon_free the entire
                 * anon_map's worth of stuff and
                 * release any swap reservation.
                 */
                if (amp->a_szc != 0) {
                    anon_shmap_free_pages(amp, 0,
                        amp->size);
                } else {
                    anon_free(amp->ahp, 0, amp->size);
                }
                if ((len = amp->swresv) != 0) {
                    anon_unresv(len);
                    TRACE_3(TR_FAC_VM, TR_ANON_PROC,
                        "anon proc:%p %lu %u",
                        seg, len, 0);
                }
            }
            svd->amp = NULL;
            ANON_LOCK_EXIT(&amp->a_rwlock);
            anonmap_free(amp);
        } else if (svd->type == MAP_PRIVATE) {
            /*
             * We had a private mapping which still has
             * a held anon_map so just free up all the
             * anon slot pointers that we were using.
             */
            if (seg->s_szc != 0) {
                anon_free_pages(amp->ahp, svd->anon_index,
                    seg->s_size, seg->s_szc);
            } else {
                anon_free(amp->ahp, svd->anon_index,
                    seg->s_size);
            }
            ANON_LOCK_EXIT(&amp->a_rwlock);
        } else {
            ANON_LOCK_EXIT(&amp->a_rwlock);
        }
    }

    /*
     * Release swap reservation.
     */
    if ((len = svd->swresv) != 0) {
        anon_unresv(svd->swresv);
        TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
            seg, len, 0);
        svd->swresv = 0;
    }
    /*
     * Release claim on vnode, credentials, and finally free the
     * private data.
     */
    if (svd->vp != NULL) {
        if (svd->type == MAP_SHARED)
            lgrp_shm_policy_fini(NULL, svd->vp);
        VN_RELE(svd->vp);
        svd->vp = NULL;
    }
    crfree(svd->cred);
    svd->cred = NULL;

    seg->s_data = NULL;
    kmem_cache_free(segvn_cache, svd);
}

#ifdef DEBUG
uint32_t segvn_slock_mtbf = 0;
#endif

ulong_t segvn_lpglck_limit = 0;

/*
 * Support routines used by segvn_pagelock() and softlock faults for anonymous
 * pages to implement availrmem accounting in a way that makes sure the
 * same memory is accounted just once for all softlock/pagelock purposes.
 * This prevents a bug where availrmem is quickly and incorrectly exhausted
 * by several pagelocks to different parts of the same large page, since each
 * pagelock has to decrement availrmem by the size of the entire large
 * page. Note those pages are not COW shared until softunlock/pageunlock so
 * we don't need to use cow style accounting here. We also need to make sure
 * the entire large page is accounted even if the softlock range is less than
 * the entire large page because large anon pages can't be demoted when any of
 * the constituent pages is locked. The caller calls this routine for every
 * page_t it locks. The very first page in the range may not be the root page
 * of a large page. For all other pages it's guaranteed we are going to visit
 * the root of a particular large page before any other constituent page as we
 * are locking sequential pages belonging to the same anon map. So we do all
 * the locking when the root is encountered except for the very first page.
Since 2277 * softlocking is not supported (except S_READ_NOCOW special case) for vmpss 2278 * segments and since vnode pages can be demoted without locking all 2279 * constituent pages vnode pages don't come here. Unlocking relies on the 2280 * fact that pagesize can't change whenever any of constituent large pages is 2281 * locked at least SE_SHARED. This allows unlocking code to find the right 2282 * root and decrement availrmem by the same amount it was incremented when the 2283 * page was locked. 2284 */ 2285 static int 2286 segvn_slock_anonpages(page_t *pp, int first) 2287 { 2288 pgcnt_t pages; 2289 pfn_t pfn; 2290 uchar_t szc = pp->p_szc; 2291 2292 ASSERT(PAGE_LOCKED(pp)); 2293 ASSERT(pp->p_vnode != NULL); 2294 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 2295 2296 /* 2297 * pagesize won't change as long as any constituent page is locked. 2298 */ 2299 pages = page_get_pagecnt(pp->p_szc); 2300 pfn = page_pptonum(pp); 2301 2302 if (!first) { 2303 if (!IS_P2ALIGNED(pfn, pages)) { 2304 #ifdef DEBUG 2305 pp = &pp[-(spgcnt_t)(pfn & (pages - 1))]; 2306 pfn = page_pptonum(pp); 2307 ASSERT(IS_P2ALIGNED(pfn, pages)); 2308 ASSERT(pp->p_szc == szc); 2309 ASSERT(pp->p_vnode != NULL); 2310 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 2311 ASSERT(pp->p_slckcnt != 0); 2312 #endif /* DEBUG */ 2313 return (1); 2314 } 2315 } else if (!IS_P2ALIGNED(pfn, pages)) { 2316 pp = &pp[-(spgcnt_t)(pfn & (pages - 1))]; 2317 #ifdef DEBUG 2318 pfn = page_pptonum(pp); 2319 ASSERT(IS_P2ALIGNED(pfn, pages)); 2320 ASSERT(pp->p_szc == szc); 2321 ASSERT(pp->p_vnode != NULL); 2322 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 2323 #endif /* DEBUG */ 2324 } 2325 2326 #ifdef DEBUG 2327 if (segvn_slock_mtbf && !(gethrtime() % segvn_slock_mtbf)) { 2328 return (0); 2329 } 2330 #endif /* DEBUG */ 2331 2332 /* 2333 * pp is a root page. 2334 * We haven't locked this large page yet. 2335 */ 2336 page_struct_lock(pp); 2337 if (pp->p_slckcnt != 0) { 2338 if (pp->p_slckcnt < PAGE_SLOCK_MAXIMUM) { 2339 pp->p_slckcnt++; 2340 page_struct_unlock(pp); 2341 return (1); 2342 } 2343 page_struct_unlock(pp); 2344 segvn_lpglck_limit++; 2345 return (0); 2346 } 2347 mutex_enter(&freemem_lock); 2348 if (availrmem < tune.t_minarmem + pages) { 2349 mutex_exit(&freemem_lock); 2350 page_struct_unlock(pp); 2351 return (0); 2352 } 2353 pp->p_slckcnt++; 2354 availrmem -= pages; 2355 mutex_exit(&freemem_lock); 2356 page_struct_unlock(pp); 2357 return (1); 2358 } 2359 2360 static void 2361 segvn_sunlock_anonpages(page_t *pp, int first) 2362 { 2363 pgcnt_t pages; 2364 pfn_t pfn; 2365 2366 ASSERT(PAGE_LOCKED(pp)); 2367 ASSERT(pp->p_vnode != NULL); 2368 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 2369 2370 /* 2371 * pagesize won't change as long as any constituent page is locked. 2372 */ 2373 pages = page_get_pagecnt(pp->p_szc); 2374 pfn = page_pptonum(pp); 2375 2376 if (!first) { 2377 if (!IS_P2ALIGNED(pfn, pages)) { 2378 return; 2379 } 2380 } else if (!IS_P2ALIGNED(pfn, pages)) { 2381 pp = &pp[-(spgcnt_t)(pfn & (pages - 1))]; 2382 #ifdef DEBUG 2383 pfn = page_pptonum(pp); 2384 ASSERT(IS_P2ALIGNED(pfn, pages)); 2385 #endif /* DEBUG */ 2386 } 2387 ASSERT(pp->p_vnode != NULL); 2388 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 2389 ASSERT(pp->p_slckcnt != 0); 2390 page_struct_lock(pp); 2391 if (--pp->p_slckcnt == 0) { 2392 mutex_enter(&freemem_lock); 2393 availrmem += pages; 2394 mutex_exit(&freemem_lock); 2395 } 2396 page_struct_unlock(pp); 2397 } 2398 2399 /* 2400 * Do a F_SOFTUNLOCK call over the range requested. The range must have 2401 * already been F_SOFTLOCK'ed. 
2402 * Caller must always match addr and len of a softunlock with a previous 2403 * softlock with exactly the same addr and len. 2404 */ 2405 static void 2406 segvn_softunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw) 2407 { 2408 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 2409 page_t *pp; 2410 caddr_t adr; 2411 struct vnode *vp; 2412 u_offset_t offset; 2413 ulong_t anon_index; 2414 struct anon_map *amp; 2415 struct anon *ap = NULL; 2416 2417 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 2418 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 2419 2420 if ((amp = svd->amp) != NULL) 2421 anon_index = svd->anon_index + seg_page(seg, addr); 2422 2423 if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 2424 ASSERT(svd->tr_state == SEGVN_TR_OFF); 2425 hat_unlock_region(seg->s_as->a_hat, addr, len, svd->rcookie); 2426 } else { 2427 hat_unlock(seg->s_as->a_hat, addr, len); 2428 } 2429 for (adr = addr; adr < addr + len; adr += PAGESIZE) { 2430 if (amp != NULL) { 2431 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 2432 if ((ap = anon_get_ptr(amp->ahp, anon_index++)) 2433 != NULL) { 2434 swap_xlate(ap, &vp, &offset); 2435 } else { 2436 vp = svd->vp; 2437 offset = svd->offset + 2438 (uintptr_t)(adr - seg->s_base); 2439 } 2440 ANON_LOCK_EXIT(&->a_rwlock); 2441 } else { 2442 vp = svd->vp; 2443 offset = svd->offset + 2444 (uintptr_t)(adr - seg->s_base); 2445 } 2446 2447 /* 2448 * Use page_find() instead of page_lookup() to 2449 * find the page since we know that it is locked. 2450 */ 2451 pp = page_find(vp, offset); 2452 if (pp == NULL) { 2453 panic( 2454 "segvn_softunlock: addr %p, ap %p, vp %p, off %llx", 2455 (void *)adr, (void *)ap, (void *)vp, offset); 2456 /*NOTREACHED*/ 2457 } 2458 2459 if (rw == S_WRITE) { 2460 hat_setrefmod(pp); 2461 if (seg->s_as->a_vbits) 2462 hat_setstat(seg->s_as, adr, PAGESIZE, 2463 P_REF | P_MOD); 2464 } else if (rw != S_OTHER) { 2465 hat_setref(pp); 2466 if (seg->s_as->a_vbits) 2467 hat_setstat(seg->s_as, adr, PAGESIZE, P_REF); 2468 } 2469 TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT, 2470 "segvn_fault:pp %p vp %p offset %llx", pp, vp, offset); 2471 if (svd->vp == NULL) { 2472 segvn_sunlock_anonpages(pp, adr == addr); 2473 } 2474 page_unlock(pp); 2475 } 2476 mutex_enter(&freemem_lock); /* for availrmem */ 2477 if (svd->vp != NULL) { 2478 availrmem += btop(len); 2479 } 2480 segvn_pages_locked -= btop(len); 2481 svd->softlockcnt -= btop(len); 2482 mutex_exit(&freemem_lock); 2483 if (svd->softlockcnt == 0) { 2484 /* 2485 * All SOFTLOCKS are gone. Wakeup any waiting 2486 * unmappers so they can try again to unmap. 2487 * Check for waiters first without the mutex 2488 * held so we don't always grab the mutex on 2489 * softunlocks. 2490 */ 2491 if (AS_ISUNMAPWAIT(seg->s_as)) { 2492 mutex_enter(&seg->s_as->a_contents); 2493 if (AS_ISUNMAPWAIT(seg->s_as)) { 2494 AS_CLRUNMAPWAIT(seg->s_as); 2495 cv_broadcast(&seg->s_as->a_cv); 2496 } 2497 mutex_exit(&seg->s_as->a_contents); 2498 } 2499 } 2500 } 2501 2502 #define PAGE_HANDLED ((page_t *)-1) 2503 2504 /* 2505 * Release all the pages in the NULL terminated ppp list 2506 * which haven't already been converted to PAGE_HANDLED. 2507 */ 2508 static void 2509 segvn_pagelist_rele(page_t **ppp) 2510 { 2511 for (; *ppp != NULL; ppp++) { 2512 if (*ppp != PAGE_HANDLED) 2513 page_unlock(*ppp); 2514 } 2515 } 2516 2517 static int stealcow = 1; 2518 2519 /* 2520 * Workaround for viking chip bug. See bug id 1220902. 
2521 * To fix this down in pagefault() would require importing so 2522 * much as and segvn code as to be unmaintainable. 2523 */ 2524 int enable_mbit_wa = 0; 2525 2526 /* 2527 * Handles all the dirty work of getting the right 2528 * anonymous pages and loading up the translations. 2529 * This routine is called only from segvn_fault() 2530 * when looping over the range of addresses requested. 2531 * 2532 * The basic algorithm here is: 2533 * If this is an anon_zero case 2534 * Call anon_zero to allocate page 2535 * Load up translation 2536 * Return 2537 * endif 2538 * If this is an anon page 2539 * Use anon_getpage to get the page 2540 * else 2541 * Find page in pl[] list passed in 2542 * endif 2543 * If not a cow 2544 * Load up the translation to the page 2545 * return 2546 * endif 2547 * Call anon_private to handle cow 2548 * Load up (writable) translation to new page 2549 */ 2550 static faultcode_t 2551 segvn_faultpage( 2552 struct hat *hat, /* the hat to use for mapping */ 2553 struct seg *seg, /* seg_vn of interest */ 2554 caddr_t addr, /* address in as */ 2555 u_offset_t off, /* offset in vp */ 2556 struct vpage *vpage, /* pointer to vpage for vp, off */ 2557 page_t *pl[], /* object source page pointer */ 2558 uint_t vpprot, /* access allowed to object pages */ 2559 enum fault_type type, /* type of fault */ 2560 enum seg_rw rw, /* type of access at fault */ 2561 int brkcow, /* we may need to break cow */ 2562 int first) /* first page for this fault if 1 */ 2563 { 2564 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 2565 page_t *pp, **ppp; 2566 uint_t pageflags = 0; 2567 page_t *anon_pl[1 + 1]; 2568 page_t *opp = NULL; /* original page */ 2569 uint_t prot; 2570 int err; 2571 int cow; 2572 int claim; 2573 int steal = 0; 2574 ulong_t anon_index; 2575 struct anon *ap, *oldap; 2576 struct anon_map *amp; 2577 int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD; 2578 int anon_lock = 0; 2579 anon_sync_obj_t cookie; 2580 2581 if (svd->flags & MAP_TEXT) { 2582 hat_flag |= HAT_LOAD_TEXT; 2583 } 2584 2585 ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock)); 2586 ASSERT(seg->s_szc == 0); 2587 ASSERT(svd->tr_state != SEGVN_TR_INIT); 2588 2589 /* 2590 * Initialize protection value for this page. 2591 * If we have per page protection values check it now. 2592 */ 2593 if (svd->pageprot) { 2594 uint_t protchk; 2595 2596 switch (rw) { 2597 case S_READ: 2598 protchk = PROT_READ; 2599 break; 2600 case S_WRITE: 2601 protchk = PROT_WRITE; 2602 break; 2603 case S_EXEC: 2604 protchk = PROT_EXEC; 2605 break; 2606 case S_OTHER: 2607 default: 2608 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 2609 break; 2610 } 2611 2612 prot = VPP_PROT(vpage); 2613 if ((prot & protchk) == 0) 2614 return (FC_PROT); /* illegal access type */ 2615 } else { 2616 prot = svd->prot; 2617 } 2618 2619 if (type == F_SOFTLOCK && svd->vp != NULL) { 2620 mutex_enter(&freemem_lock); 2621 if (availrmem <= tune.t_minarmem) { 2622 mutex_exit(&freemem_lock); 2623 return (FC_MAKE_ERR(ENOMEM)); /* out of real memory */ 2624 } else { 2625 availrmem--; 2626 svd->softlockcnt++; 2627 segvn_pages_locked++; 2628 } 2629 mutex_exit(&freemem_lock); 2630 } 2631 2632 /* 2633 * Always acquire the anon array lock to prevent 2 threads from 2634 * allocating separate anon slots for the same "addr". 
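     * (Sketch of the race this closes: two threads fault on the same
     * unbacked address, both see anon_get_ptr() return NULL, and both call
     * anon_zero(); serializing on anon_array_enter() for the index ensures
     * only one of them installs a slot with anon_set_ptr().)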
2635 */ 2636 2637 if ((amp = svd->amp) != NULL) { 2638 ASSERT(RW_READ_HELD(&->a_rwlock)); 2639 anon_index = svd->anon_index + seg_page(seg, addr); 2640 anon_array_enter(amp, anon_index, &cookie); 2641 anon_lock = 1; 2642 } 2643 2644 if (svd->vp == NULL && amp != NULL) { 2645 if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) { 2646 /* 2647 * Allocate a (normally) writable anonymous page of 2648 * zeroes. If no advance reservations, reserve now. 2649 */ 2650 if (svd->flags & MAP_NORESERVE) { 2651 if (anon_resv_zone(ptob(1), 2652 seg->s_as->a_proc->p_zone)) { 2653 atomic_add_long(&svd->swresv, ptob(1)); 2654 } else { 2655 err = ENOMEM; 2656 goto out; 2657 } 2658 } 2659 if ((pp = anon_zero(seg, addr, &ap, 2660 svd->cred)) == NULL) { 2661 err = ENOMEM; 2662 goto out; /* out of swap space */ 2663 } 2664 /* 2665 * Re-acquire the anon_map lock and 2666 * initialize the anon array entry. 2667 */ 2668 (void) anon_set_ptr(amp->ahp, anon_index, ap, 2669 ANON_SLEEP); 2670 2671 ASSERT(pp->p_szc == 0); 2672 2673 /* 2674 * Handle pages that have been marked for migration 2675 */ 2676 if (lgrp_optimizations()) 2677 page_migrate(seg, addr, &pp, 1); 2678 2679 if (type == F_SOFTLOCK) { 2680 if (!segvn_slock_anonpages(pp, first)) { 2681 page_unlock(pp); 2682 err = ENOMEM; 2683 goto out; 2684 } else { 2685 mutex_enter(&freemem_lock); 2686 svd->softlockcnt++; 2687 segvn_pages_locked++; 2688 mutex_exit(&freemem_lock); 2689 } 2690 } 2691 2692 if (enable_mbit_wa) { 2693 if (rw == S_WRITE) 2694 hat_setmod(pp); 2695 else if (!hat_ismod(pp)) 2696 prot &= ~PROT_WRITE; 2697 } 2698 /* 2699 * If AS_PAGLCK is set in a_flags (via memcntl(2) 2700 * with MC_LOCKAS, MCL_FUTURE) and this is a 2701 * MAP_NORESERVE segment, we may need to 2702 * permanently lock the page as it is being faulted 2703 * for the first time. The following text applies 2704 * only to MAP_NORESERVE segments: 2705 * 2706 * As per memcntl(2), if this segment was created 2707 * after MCL_FUTURE was applied (a "future" 2708 * segment), its pages must be locked. If this 2709 * segment existed at MCL_FUTURE application (a 2710 * "past" segment), the interface is unclear. 2711 * 2712 * We decide to lock only if vpage is present: 2713 * 2714 * - "future" segments will have a vpage array (see 2715 * as_map), and so will be locked as required 2716 * 2717 * - "past" segments may not have a vpage array, 2718 * depending on whether events (such as 2719 * mprotect) have occurred. Locking if vpage 2720 * exists will preserve legacy behavior. Not 2721 * locking if vpage is absent, will not break 2722 * the interface or legacy behavior. Note that 2723 * allocating vpage here if it's absent requires 2724 * upgrading the segvn reader lock, the cost of 2725 * which does not seem worthwhile. 2726 * 2727 * Usually testing and setting VPP_ISPPLOCK and 2728 * VPP_SETPPLOCK requires holding the segvn lock as 2729 * writer, but in this case all readers are 2730 * serializing on the anon array lock. 
2731 */ 2732 if (AS_ISPGLCK(seg->s_as) && vpage != NULL && 2733 (svd->flags & MAP_NORESERVE) && 2734 !VPP_ISPPLOCK(vpage)) { 2735 proc_t *p = seg->s_as->a_proc; 2736 ASSERT(svd->type == MAP_PRIVATE); 2737 mutex_enter(&p->p_lock); 2738 if (rctl_incr_locked_mem(p, NULL, PAGESIZE, 2739 1) == 0) { 2740 claim = VPP_PROT(vpage) & PROT_WRITE; 2741 if (page_pp_lock(pp, claim, 0)) { 2742 VPP_SETPPLOCK(vpage); 2743 } else { 2744 rctl_decr_locked_mem(p, NULL, 2745 PAGESIZE, 1); 2746 } 2747 } 2748 mutex_exit(&p->p_lock); 2749 } 2750 2751 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 2752 hat_memload(hat, addr, pp, prot, hat_flag); 2753 2754 if (!(hat_flag & HAT_LOAD_LOCK)) 2755 page_unlock(pp); 2756 2757 anon_array_exit(&cookie); 2758 return (0); 2759 } 2760 } 2761 2762 /* 2763 * Obtain the page structure via anon_getpage() if it is 2764 * a private copy of an object (the result of a previous 2765 * copy-on-write). 2766 */ 2767 if (amp != NULL) { 2768 if ((ap = anon_get_ptr(amp->ahp, anon_index)) != NULL) { 2769 err = anon_getpage(&ap, &vpprot, anon_pl, PAGESIZE, 2770 seg, addr, rw, svd->cred); 2771 if (err) 2772 goto out; 2773 2774 if (svd->type == MAP_SHARED) { 2775 /* 2776 * If this is a shared mapping to an 2777 * anon_map, then ignore the write 2778 * permissions returned by anon_getpage(). 2779 * They apply to the private mappings 2780 * of this anon_map. 2781 */ 2782 vpprot |= PROT_WRITE; 2783 } 2784 opp = anon_pl[0]; 2785 } 2786 } 2787 2788 /* 2789 * Search the pl[] list passed in if it is from the 2790 * original object (i.e., not a private copy). 2791 */ 2792 if (opp == NULL) { 2793 /* 2794 * Find original page. We must be bringing it in 2795 * from the list in pl[]. 2796 */ 2797 for (ppp = pl; (opp = *ppp) != NULL; ppp++) { 2798 if (opp == PAGE_HANDLED) 2799 continue; 2800 ASSERT(opp->p_vnode == svd->vp); /* XXX */ 2801 if (opp->p_offset == off) 2802 break; 2803 } 2804 if (opp == NULL) { 2805 panic("segvn_faultpage not found"); 2806 /*NOTREACHED*/ 2807 } 2808 *ppp = PAGE_HANDLED; 2809 2810 } 2811 2812 ASSERT(PAGE_LOCKED(opp)); 2813 2814 TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT, 2815 "segvn_fault:pp %p vp %p offset %llx", 2816 opp, NULL, 0); 2817 2818 /* 2819 * The fault is treated as a copy-on-write fault if a 2820 * write occurs on a private segment and the object 2821 * page (i.e., mapping) is write protected. We assume 2822 * that fatal protection checks have already been made. 2823 */ 2824 2825 if (brkcow) { 2826 ASSERT(svd->tr_state == SEGVN_TR_OFF); 2827 cow = !(vpprot & PROT_WRITE); 2828 } else if (svd->tr_state == SEGVN_TR_ON) { 2829 /* 2830 * If we are doing text replication COW on first touch. 2831 */ 2832 ASSERT(amp != NULL); 2833 ASSERT(svd->vp != NULL); 2834 ASSERT(rw != S_WRITE); 2835 cow = (ap == NULL); 2836 } else { 2837 cow = 0; 2838 } 2839 2840 /* 2841 * If not a copy-on-write case load the translation 2842 * and return. 
2843 */ 2844 if (cow == 0) { 2845 2846 /* 2847 * Handle pages that have been marked for migration 2848 */ 2849 if (lgrp_optimizations()) 2850 page_migrate(seg, addr, &opp, 1); 2851 2852 if (type == F_SOFTLOCK && svd->vp == NULL) { 2853 2854 ASSERT(opp->p_szc == 0 || 2855 (svd->type == MAP_SHARED && 2856 amp != NULL && amp->a_szc != 0)); 2857 2858 if (!segvn_slock_anonpages(opp, first)) { 2859 page_unlock(opp); 2860 err = ENOMEM; 2861 goto out; 2862 } else { 2863 mutex_enter(&freemem_lock); 2864 svd->softlockcnt++; 2865 segvn_pages_locked++; 2866 mutex_exit(&freemem_lock); 2867 } 2868 } 2869 if (IS_VMODSORT(opp->p_vnode) || enable_mbit_wa) { 2870 if (rw == S_WRITE) 2871 hat_setmod(opp); 2872 else if (rw != S_OTHER && !hat_ismod(opp)) 2873 prot &= ~PROT_WRITE; 2874 } 2875 2876 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE || 2877 (!svd->pageprot && svd->prot == (prot & vpprot))); 2878 ASSERT(amp == NULL || 2879 svd->rcookie == HAT_INVALID_REGION_COOKIE); 2880 hat_memload_region(hat, addr, opp, prot & vpprot, hat_flag, 2881 svd->rcookie); 2882 2883 if (!(hat_flag & HAT_LOAD_LOCK)) 2884 page_unlock(opp); 2885 2886 if (anon_lock) { 2887 anon_array_exit(&cookie); 2888 } 2889 return (0); 2890 } 2891 2892 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 2893 2894 hat_setref(opp); 2895 2896 ASSERT(amp != NULL && anon_lock); 2897 2898 /* 2899 * Steal the page only if it isn't a private page 2900 * since stealing a private page is not worth the effort. 2901 */ 2902 if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) 2903 steal = 1; 2904 2905 /* 2906 * Steal the original page if the following conditions are true: 2907 * 2908 * We are low on memory, the page is not private, page is not large, 2909 * not shared, not modified, not `locked' or if we have it `locked' 2910 * (i.e., p_cowcnt == 1 and p_lckcnt == 0, which also implies 2911 * that the page is not shared) and if it doesn't have any 2912 * translations. page_struct_lock isn't needed to look at p_cowcnt 2913 * and p_lckcnt because we first get exclusive lock on page. 2914 */ 2915 (void) hat_pagesync(opp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD); 2916 2917 if (stealcow && freemem < minfree && steal && opp->p_szc == 0 && 2918 page_tryupgrade(opp) && !hat_ismod(opp) && 2919 ((opp->p_lckcnt == 0 && opp->p_cowcnt == 0) || 2920 (opp->p_lckcnt == 0 && opp->p_cowcnt == 1 && 2921 vpage != NULL && VPP_ISPPLOCK(vpage)))) { 2922 /* 2923 * Check if this page has other translations 2924 * after unloading our translation. 2925 */ 2926 if (hat_page_is_mapped(opp)) { 2927 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 2928 hat_unload(seg->s_as->a_hat, addr, PAGESIZE, 2929 HAT_UNLOAD); 2930 } 2931 2932 /* 2933 * hat_unload() might sync back someone else's recent 2934 * modification, so check again. 2935 */ 2936 if (!hat_ismod(opp) && !hat_page_is_mapped(opp)) 2937 pageflags |= STEAL_PAGE; 2938 } 2939 2940 /* 2941 * If we have a vpage pointer, see if it indicates that we have 2942 * ``locked'' the page we map -- if so, tell anon_private to 2943 * transfer the locking resource to the new page. 2944 * 2945 * See Statement at the beginning of segvn_lockop regarding 2946 * the way lockcnts/cowcnts are handled during COW. 2947 * 2948 */ 2949 if (vpage != NULL && VPP_ISPPLOCK(vpage)) 2950 pageflags |= LOCK_PAGE; 2951 2952 /* 2953 * Allocate a private page and perform the copy. 2954 * For MAP_NORESERVE reserve swap space now, unless this 2955 * is a cow fault on an existing anon page in which case 2956 * MAP_NORESERVE will have made advance reservations. 
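     * (That is, if ap != NULL the swap for this page was already reserved
     * when the existing anon slot was created, so only the ap == NULL case
     * reserves another page here.)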
2957 */ 2958 if ((svd->flags & MAP_NORESERVE) && (ap == NULL)) { 2959 if (anon_resv_zone(ptob(1), seg->s_as->a_proc->p_zone)) { 2960 atomic_add_long(&svd->swresv, ptob(1)); 2961 } else { 2962 page_unlock(opp); 2963 err = ENOMEM; 2964 goto out; 2965 } 2966 } 2967 oldap = ap; 2968 pp = anon_private(&ap, seg, addr, prot, opp, pageflags, svd->cred); 2969 if (pp == NULL) { 2970 err = ENOMEM; /* out of swap space */ 2971 goto out; 2972 } 2973 2974 /* 2975 * If we copied away from an anonymous page, then 2976 * we are one step closer to freeing up an anon slot. 2977 * 2978 * NOTE: The original anon slot must be released while 2979 * holding the "anon_map" lock. This is necessary to prevent 2980 * other threads from obtaining a pointer to the anon slot 2981 * which may be freed if its "refcnt" is 1. 2982 */ 2983 if (oldap != NULL) 2984 anon_decref(oldap); 2985 2986 (void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP); 2987 2988 /* 2989 * Handle pages that have been marked for migration 2990 */ 2991 if (lgrp_optimizations()) 2992 page_migrate(seg, addr, &pp, 1); 2993 2994 ASSERT(pp->p_szc == 0); 2995 if (type == F_SOFTLOCK && svd->vp == NULL) { 2996 if (!segvn_slock_anonpages(pp, first)) { 2997 page_unlock(pp); 2998 err = ENOMEM; 2999 goto out; 3000 } else { 3001 mutex_enter(&freemem_lock); 3002 svd->softlockcnt++; 3003 segvn_pages_locked++; 3004 mutex_exit(&freemem_lock); 3005 } 3006 } 3007 3008 ASSERT(!IS_VMODSORT(pp->p_vnode)); 3009 if (enable_mbit_wa) { 3010 if (rw == S_WRITE) 3011 hat_setmod(pp); 3012 else if (!hat_ismod(pp)) 3013 prot &= ~PROT_WRITE; 3014 } 3015 3016 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 3017 hat_memload(hat, addr, pp, prot, hat_flag); 3018 3019 if (!(hat_flag & HAT_LOAD_LOCK)) 3020 page_unlock(pp); 3021 3022 ASSERT(anon_lock); 3023 anon_array_exit(&cookie); 3024 return (0); 3025 out: 3026 if (anon_lock) 3027 anon_array_exit(&cookie); 3028 3029 if (type == F_SOFTLOCK && svd->vp != NULL) { 3030 mutex_enter(&freemem_lock); 3031 availrmem++; 3032 segvn_pages_locked--; 3033 svd->softlockcnt--; 3034 mutex_exit(&freemem_lock); 3035 } 3036 return (FC_MAKE_ERR(err)); 3037 } 3038 3039 /* 3040 * relocate a bunch of smaller targ pages into one large repl page. all targ 3041 * pages must be complete pages smaller than replacement pages. 3042 * it's assumed that no page's szc can change since they are all PAGESIZE or 3043 * complete large pages locked SHARED. 
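 * (Illustration, assuming 8K base pages and a 4M replacement page: the
 * existing smaller pages are relocated group by group into the 512
 * constituents of the replacement via page_relocate(), and each constituent
 * is then downgraded to a shared lock before returning.)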
3044 */ 3045 static void 3046 segvn_relocate_pages(page_t **targ, page_t *replacement) 3047 { 3048 page_t *pp; 3049 pgcnt_t repl_npgs, curnpgs; 3050 pgcnt_t i; 3051 uint_t repl_szc = replacement->p_szc; 3052 page_t *first_repl = replacement; 3053 page_t *repl; 3054 spgcnt_t npgs; 3055 3056 VM_STAT_ADD(segvnvmstats.relocatepages[0]); 3057 3058 ASSERT(repl_szc != 0); 3059 npgs = repl_npgs = page_get_pagecnt(repl_szc); 3060 3061 i = 0; 3062 while (repl_npgs) { 3063 spgcnt_t nreloc; 3064 int err; 3065 ASSERT(replacement != NULL); 3066 pp = targ[i]; 3067 ASSERT(pp->p_szc < repl_szc); 3068 ASSERT(PAGE_EXCL(pp)); 3069 ASSERT(!PP_ISFREE(pp)); 3070 curnpgs = page_get_pagecnt(pp->p_szc); 3071 if (curnpgs == 1) { 3072 VM_STAT_ADD(segvnvmstats.relocatepages[1]); 3073 repl = replacement; 3074 page_sub(&replacement, repl); 3075 ASSERT(PAGE_EXCL(repl)); 3076 ASSERT(!PP_ISFREE(repl)); 3077 ASSERT(repl->p_szc == repl_szc); 3078 } else { 3079 page_t *repl_savepp; 3080 int j; 3081 VM_STAT_ADD(segvnvmstats.relocatepages[2]); 3082 repl_savepp = replacement; 3083 for (j = 0; j < curnpgs; j++) { 3084 repl = replacement; 3085 page_sub(&replacement, repl); 3086 ASSERT(PAGE_EXCL(repl)); 3087 ASSERT(!PP_ISFREE(repl)); 3088 ASSERT(repl->p_szc == repl_szc); 3089 ASSERT(page_pptonum(targ[i + j]) == 3090 page_pptonum(targ[i]) + j); 3091 } 3092 repl = repl_savepp; 3093 ASSERT(IS_P2ALIGNED(page_pptonum(repl), curnpgs)); 3094 } 3095 err = page_relocate(&pp, &repl, 0, 1, &nreloc, NULL); 3096 if (err || nreloc != curnpgs) { 3097 panic("segvn_relocate_pages: " 3098 "page_relocate failed err=%d curnpgs=%ld " 3099 "nreloc=%ld", err, curnpgs, nreloc); 3100 } 3101 ASSERT(curnpgs <= repl_npgs); 3102 repl_npgs -= curnpgs; 3103 i += curnpgs; 3104 } 3105 ASSERT(replacement == NULL); 3106 3107 repl = first_repl; 3108 repl_npgs = npgs; 3109 for (i = 0; i < repl_npgs; i++) { 3110 ASSERT(PAGE_EXCL(repl)); 3111 ASSERT(!PP_ISFREE(repl)); 3112 targ[i] = repl; 3113 page_downgrade(targ[i]); 3114 repl++; 3115 } 3116 } 3117 3118 /* 3119 * Check if all pages in ppa array are complete smaller than szc pages and 3120 * their roots will still be aligned relative to their current size if the 3121 * entire ppa array is relocated into one szc page. If these conditions are 3122 * not met return 0. 3123 * 3124 * If all pages are properly aligned attempt to upgrade their locks 3125 * to exclusive mode. If it fails set *upgrdfail to 1 and return 0. 3126 * upgrdfail was set to 0 by caller. 3127 * 3128 * Return 1 if all pages are aligned and locked exclusively. 3129 * 3130 * If all pages in ppa array happen to be physically contiguous to make one 3131 * szc page and all exclusive locks are successfully obtained promote the page 3132 * size to szc and set *pszc to szc. Return 1 with pages locked shared. 
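 * (The alignment check means, roughly, that each existing smaller large
 * page must sit at a pfn and a ppa index that are both aligned to its own
 * page count, so it would still be a whole, correctly placed sub-page
 * inside the promoted szc page.)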
3133 */ 3134 static int 3135 segvn_full_szcpages(page_t **ppa, uint_t szc, int *upgrdfail, uint_t *pszc) 3136 { 3137 page_t *pp; 3138 pfn_t pfn; 3139 pgcnt_t totnpgs = page_get_pagecnt(szc); 3140 pfn_t first_pfn; 3141 int contig = 1; 3142 pgcnt_t i; 3143 pgcnt_t j; 3144 uint_t curszc; 3145 pgcnt_t curnpgs; 3146 int root = 0; 3147 3148 ASSERT(szc > 0); 3149 3150 VM_STAT_ADD(segvnvmstats.fullszcpages[0]); 3151 3152 for (i = 0; i < totnpgs; i++) { 3153 pp = ppa[i]; 3154 ASSERT(PAGE_SHARED(pp)); 3155 ASSERT(!PP_ISFREE(pp)); 3156 pfn = page_pptonum(pp); 3157 if (i == 0) { 3158 if (!IS_P2ALIGNED(pfn, totnpgs)) { 3159 contig = 0; 3160 } else { 3161 first_pfn = pfn; 3162 } 3163 } else if (contig && pfn != first_pfn + i) { 3164 contig = 0; 3165 } 3166 if (pp->p_szc == 0) { 3167 if (root) { 3168 VM_STAT_ADD(segvnvmstats.fullszcpages[1]); 3169 return (0); 3170 } 3171 } else if (!root) { 3172 if ((curszc = pp->p_szc) >= szc) { 3173 VM_STAT_ADD(segvnvmstats.fullszcpages[2]); 3174 return (0); 3175 } 3176 if (curszc == 0) { 3177 /* 3178 * p_szc changed means we don't have all pages 3179 * locked. return failure. 3180 */ 3181 VM_STAT_ADD(segvnvmstats.fullszcpages[3]); 3182 return (0); 3183 } 3184 curnpgs = page_get_pagecnt(curszc); 3185 if (!IS_P2ALIGNED(pfn, curnpgs) || 3186 !IS_P2ALIGNED(i, curnpgs)) { 3187 VM_STAT_ADD(segvnvmstats.fullszcpages[4]); 3188 return (0); 3189 } 3190 root = 1; 3191 } else { 3192 ASSERT(i > 0); 3193 VM_STAT_ADD(segvnvmstats.fullszcpages[5]); 3194 if (pp->p_szc != curszc) { 3195 VM_STAT_ADD(segvnvmstats.fullszcpages[6]); 3196 return (0); 3197 } 3198 if (pfn - 1 != page_pptonum(ppa[i - 1])) { 3199 panic("segvn_full_szcpages: " 3200 "large page not physically contiguous"); 3201 } 3202 if (P2PHASE(pfn, curnpgs) == curnpgs - 1) { 3203 root = 0; 3204 } 3205 } 3206 } 3207 3208 for (i = 0; i < totnpgs; i++) { 3209 ASSERT(ppa[i]->p_szc < szc); 3210 if (!page_tryupgrade(ppa[i])) { 3211 for (j = 0; j < i; j++) { 3212 page_downgrade(ppa[j]); 3213 } 3214 *pszc = ppa[i]->p_szc; 3215 *upgrdfail = 1; 3216 VM_STAT_ADD(segvnvmstats.fullszcpages[7]); 3217 return (0); 3218 } 3219 } 3220 3221 /* 3222 * When a page is put a free cachelist its szc is set to 0. if file 3223 * system reclaimed pages from cachelist targ pages will be physically 3224 * contiguous with 0 p_szc. in this case just upgrade szc of targ 3225 * pages without any relocations. 3226 * To avoid any hat issues with previous small mappings 3227 * hat_pageunload() the target pages first. 3228 */ 3229 if (contig) { 3230 VM_STAT_ADD(segvnvmstats.fullszcpages[8]); 3231 for (i = 0; i < totnpgs; i++) { 3232 (void) hat_pageunload(ppa[i], HAT_FORCE_PGUNLOAD); 3233 } 3234 for (i = 0; i < totnpgs; i++) { 3235 ppa[i]->p_szc = szc; 3236 } 3237 for (i = 0; i < totnpgs; i++) { 3238 ASSERT(PAGE_EXCL(ppa[i])); 3239 page_downgrade(ppa[i]); 3240 } 3241 if (pszc != NULL) { 3242 *pszc = szc; 3243 } 3244 } 3245 VM_STAT_ADD(segvnvmstats.fullszcpages[9]); 3246 return (1); 3247 } 3248 3249 /* 3250 * Create physically contiguous pages for [vp, off] - [vp, off + 3251 * page_size(szc)) range and for private segment return them in ppa array. 3252 * Pages are created either via IO or relocations. 3253 * 3254 * Return 1 on sucess and 0 on failure. 3255 * 3256 * If physically contiguos pages already exist for this range return 1 without 3257 * filling ppa array. Caller initializes ppa[0] as NULL to detect that ppa 3258 * array wasn't filled. In this case caller fills ppa array via VOP_GETPAGE(). 
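 * (Mechanism, roughly: pages missing from the page cache are taken from the
 * preallocated replacement list and read in with VOP_PAGEIO(), pages already
 * cached are relocated into the large page with page_relocate(), and a
 * failure to lock everything exclusively sets *downsize so the caller can
 * retry the fault with a smaller page size.)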
3259 */ 3260 3261 static int 3262 segvn_fill_vp_pages(struct segvn_data *svd, vnode_t *vp, u_offset_t off, 3263 uint_t szc, page_t **ppa, page_t **ppplist, uint_t *ret_pszc, 3264 int *downsize) 3265 3266 { 3267 page_t *pplist = *ppplist; 3268 size_t pgsz = page_get_pagesize(szc); 3269 pgcnt_t pages = btop(pgsz); 3270 ulong_t start_off = off; 3271 u_offset_t eoff = off + pgsz; 3272 spgcnt_t nreloc; 3273 u_offset_t io_off = off; 3274 size_t io_len; 3275 page_t *io_pplist = NULL; 3276 page_t *done_pplist = NULL; 3277 pgcnt_t pgidx = 0; 3278 page_t *pp; 3279 page_t *newpp; 3280 page_t *targpp; 3281 int io_err = 0; 3282 int i; 3283 pfn_t pfn; 3284 ulong_t ppages; 3285 page_t *targ_pplist = NULL; 3286 page_t *repl_pplist = NULL; 3287 page_t *tmp_pplist; 3288 int nios = 0; 3289 uint_t pszc; 3290 struct vattr va; 3291 3292 VM_STAT_ADD(segvnvmstats.fill_vp_pages[0]); 3293 3294 ASSERT(szc != 0); 3295 ASSERT(pplist->p_szc == szc); 3296 3297 /* 3298 * downsize will be set to 1 only if we fail to lock pages. this will 3299 * allow subsequent faults to try to relocate the page again. If we 3300 * fail due to misalignment don't downsize and let the caller map the 3301 * whole region with small mappings to avoid more faults into the area 3302 * where we can't get large pages anyway. 3303 */ 3304 *downsize = 0; 3305 3306 while (off < eoff) { 3307 newpp = pplist; 3308 ASSERT(newpp != NULL); 3309 ASSERT(PAGE_EXCL(newpp)); 3310 ASSERT(!PP_ISFREE(newpp)); 3311 /* 3312 * we pass NULL for nrelocp to page_lookup_create() 3313 * so that it doesn't relocate. We relocate here 3314 * later only after we make sure we can lock all 3315 * pages in the range we handle and they are all 3316 * aligned. 3317 */ 3318 pp = page_lookup_create(vp, off, SE_SHARED, newpp, NULL, 0); 3319 ASSERT(pp != NULL); 3320 ASSERT(!PP_ISFREE(pp)); 3321 ASSERT(pp->p_vnode == vp); 3322 ASSERT(pp->p_offset == off); 3323 if (pp == newpp) { 3324 VM_STAT_ADD(segvnvmstats.fill_vp_pages[1]); 3325 page_sub(&pplist, pp); 3326 ASSERT(PAGE_EXCL(pp)); 3327 ASSERT(page_iolock_assert(pp)); 3328 page_list_concat(&io_pplist, &pp); 3329 off += PAGESIZE; 3330 continue; 3331 } 3332 VM_STAT_ADD(segvnvmstats.fill_vp_pages[2]); 3333 pfn = page_pptonum(pp); 3334 pszc = pp->p_szc; 3335 if (pszc >= szc && targ_pplist == NULL && io_pplist == NULL && 3336 IS_P2ALIGNED(pfn, pages)) { 3337 ASSERT(repl_pplist == NULL); 3338 ASSERT(done_pplist == NULL); 3339 ASSERT(pplist == *ppplist); 3340 page_unlock(pp); 3341 page_free_replacement_page(pplist); 3342 page_create_putback(pages); 3343 *ppplist = NULL; 3344 VM_STAT_ADD(segvnvmstats.fill_vp_pages[3]); 3345 return (1); 3346 } 3347 if (pszc >= szc) { 3348 page_unlock(pp); 3349 segvn_faultvnmpss_align_err1++; 3350 goto out; 3351 } 3352 ppages = page_get_pagecnt(pszc); 3353 if (!IS_P2ALIGNED(pfn, ppages)) { 3354 ASSERT(pszc > 0); 3355 /* 3356 * sizing down to pszc won't help. 3357 */ 3358 page_unlock(pp); 3359 segvn_faultvnmpss_align_err2++; 3360 goto out; 3361 } 3362 pfn = page_pptonum(newpp); 3363 if (!IS_P2ALIGNED(pfn, ppages)) { 3364 ASSERT(pszc > 0); 3365 /* 3366 * sizing down to pszc won't help. 
3367 */ 3368 page_unlock(pp); 3369 segvn_faultvnmpss_align_err3++; 3370 goto out; 3371 } 3372 if (!PAGE_EXCL(pp)) { 3373 VM_STAT_ADD(segvnvmstats.fill_vp_pages[4]); 3374 page_unlock(pp); 3375 *downsize = 1; 3376 *ret_pszc = pp->p_szc; 3377 goto out; 3378 } 3379 targpp = pp; 3380 if (io_pplist != NULL) { 3381 VM_STAT_ADD(segvnvmstats.fill_vp_pages[5]); 3382 io_len = off - io_off; 3383 /* 3384 * Some file systems like NFS don't check EOF 3385 * conditions in VOP_PAGEIO(). Check it here 3386 * now that pages are locked SE_EXCL. Any file 3387 * truncation will wait until the pages are 3388 * unlocked so no need to worry that file will 3389 * be truncated after we check its size here. 3390 * XXX fix NFS to remove this check. 3391 */ 3392 va.va_mask = AT_SIZE; 3393 if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred) != 0) { 3394 VM_STAT_ADD(segvnvmstats.fill_vp_pages[6]); 3395 page_unlock(targpp); 3396 goto out; 3397 } 3398 if (btopr(va.va_size) < btopr(io_off + io_len)) { 3399 VM_STAT_ADD(segvnvmstats.fill_vp_pages[7]); 3400 *downsize = 1; 3401 *ret_pszc = 0; 3402 page_unlock(targpp); 3403 goto out; 3404 } 3405 io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len, 3406 B_READ, svd->cred); 3407 if (io_err) { 3408 VM_STAT_ADD(segvnvmstats.fill_vp_pages[8]); 3409 page_unlock(targpp); 3410 if (io_err == EDEADLK) { 3411 segvn_vmpss_pageio_deadlk_err++; 3412 } 3413 goto out; 3414 } 3415 nios++; 3416 VM_STAT_ADD(segvnvmstats.fill_vp_pages[9]); 3417 while (io_pplist != NULL) { 3418 pp = io_pplist; 3419 page_sub(&io_pplist, pp); 3420 ASSERT(page_iolock_assert(pp)); 3421 page_io_unlock(pp); 3422 pgidx = (pp->p_offset - start_off) >> 3423 PAGESHIFT; 3424 ASSERT(pgidx < pages); 3425 ppa[pgidx] = pp; 3426 page_list_concat(&done_pplist, &pp); 3427 } 3428 } 3429 pp = targpp; 3430 ASSERT(PAGE_EXCL(pp)); 3431 ASSERT(pp->p_szc <= pszc); 3432 if (pszc != 0 && !group_page_trylock(pp, SE_EXCL)) { 3433 VM_STAT_ADD(segvnvmstats.fill_vp_pages[10]); 3434 page_unlock(pp); 3435 *downsize = 1; 3436 *ret_pszc = pp->p_szc; 3437 goto out; 3438 } 3439 VM_STAT_ADD(segvnvmstats.fill_vp_pages[11]); 3440 /* 3441 * page szc chould have changed before the entire group was 3442 * locked. reread page szc. 
3443 */ 3444 pszc = pp->p_szc; 3445 ppages = page_get_pagecnt(pszc); 3446 3447 /* link just the roots */ 3448 page_list_concat(&targ_pplist, &pp); 3449 page_sub(&pplist, newpp); 3450 page_list_concat(&repl_pplist, &newpp); 3451 off += PAGESIZE; 3452 while (--ppages != 0) { 3453 newpp = pplist; 3454 page_sub(&pplist, newpp); 3455 off += PAGESIZE; 3456 } 3457 io_off = off; 3458 } 3459 if (io_pplist != NULL) { 3460 VM_STAT_ADD(segvnvmstats.fill_vp_pages[12]); 3461 io_len = eoff - io_off; 3462 va.va_mask = AT_SIZE; 3463 if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred) != 0) { 3464 VM_STAT_ADD(segvnvmstats.fill_vp_pages[13]); 3465 goto out; 3466 } 3467 if (btopr(va.va_size) < btopr(io_off + io_len)) { 3468 VM_STAT_ADD(segvnvmstats.fill_vp_pages[14]); 3469 *downsize = 1; 3470 *ret_pszc = 0; 3471 goto out; 3472 } 3473 io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len, 3474 B_READ, svd->cred); 3475 if (io_err) { 3476 VM_STAT_ADD(segvnvmstats.fill_vp_pages[15]); 3477 if (io_err == EDEADLK) { 3478 segvn_vmpss_pageio_deadlk_err++; 3479 } 3480 goto out; 3481 } 3482 nios++; 3483 while (io_pplist != NULL) { 3484 pp = io_pplist; 3485 page_sub(&io_pplist, pp); 3486 ASSERT(page_iolock_assert(pp)); 3487 page_io_unlock(pp); 3488 pgidx = (pp->p_offset - start_off) >> PAGESHIFT; 3489 ASSERT(pgidx < pages); 3490 ppa[pgidx] = pp; 3491 } 3492 } 3493 /* 3494 * we're now bound to succeed or panic. 3495 * remove pages from done_pplist. it's not needed anymore. 3496 */ 3497 while (done_pplist != NULL) { 3498 pp = done_pplist; 3499 page_sub(&done_pplist, pp); 3500 } 3501 VM_STAT_ADD(segvnvmstats.fill_vp_pages[16]); 3502 ASSERT(pplist == NULL); 3503 *ppplist = NULL; 3504 while (targ_pplist != NULL) { 3505 int ret; 3506 VM_STAT_ADD(segvnvmstats.fill_vp_pages[17]); 3507 ASSERT(repl_pplist); 3508 pp = targ_pplist; 3509 page_sub(&targ_pplist, pp); 3510 pgidx = (pp->p_offset - start_off) >> PAGESHIFT; 3511 newpp = repl_pplist; 3512 page_sub(&repl_pplist, newpp); 3513 #ifdef DEBUG 3514 pfn = page_pptonum(pp); 3515 pszc = pp->p_szc; 3516 ppages = page_get_pagecnt(pszc); 3517 ASSERT(IS_P2ALIGNED(pfn, ppages)); 3518 pfn = page_pptonum(newpp); 3519 ASSERT(IS_P2ALIGNED(pfn, ppages)); 3520 ASSERT(P2PHASE(pfn, pages) == pgidx); 3521 #endif 3522 nreloc = 0; 3523 ret = page_relocate(&pp, &newpp, 0, 1, &nreloc, NULL); 3524 if (ret != 0 || nreloc == 0) { 3525 panic("segvn_fill_vp_pages: " 3526 "page_relocate failed"); 3527 } 3528 pp = newpp; 3529 while (nreloc-- != 0) { 3530 ASSERT(PAGE_EXCL(pp)); 3531 ASSERT(pp->p_vnode == vp); 3532 ASSERT(pgidx == 3533 ((pp->p_offset - start_off) >> PAGESHIFT)); 3534 ppa[pgidx++] = pp; 3535 pp++; 3536 } 3537 } 3538 3539 if (svd->type == MAP_PRIVATE) { 3540 VM_STAT_ADD(segvnvmstats.fill_vp_pages[18]); 3541 for (i = 0; i < pages; i++) { 3542 ASSERT(ppa[i] != NULL); 3543 ASSERT(PAGE_EXCL(ppa[i])); 3544 ASSERT(ppa[i]->p_vnode == vp); 3545 ASSERT(ppa[i]->p_offset == 3546 start_off + (i << PAGESHIFT)); 3547 page_downgrade(ppa[i]); 3548 } 3549 ppa[pages] = NULL; 3550 } else { 3551 VM_STAT_ADD(segvnvmstats.fill_vp_pages[19]); 3552 /* 3553 * the caller will still call VOP_GETPAGE() for shared segments 3554 * to check FS write permissions. For private segments we map 3555 * file read only anyway. so no VOP_GETPAGE is needed. 
3556 */ 3557 for (i = 0; i < pages; i++) { 3558 ASSERT(ppa[i] != NULL); 3559 ASSERT(PAGE_EXCL(ppa[i])); 3560 ASSERT(ppa[i]->p_vnode == vp); 3561 ASSERT(ppa[i]->p_offset == 3562 start_off + (i << PAGESHIFT)); 3563 page_unlock(ppa[i]); 3564 } 3565 ppa[0] = NULL; 3566 } 3567 3568 return (1); 3569 out: 3570 /* 3571 * Do the cleanup. Unlock target pages we didn't relocate. They are 3572 * linked on targ_pplist by root pages. reassemble unused replacement 3573 * and io pages back to pplist. 3574 */ 3575 if (io_pplist != NULL) { 3576 VM_STAT_ADD(segvnvmstats.fill_vp_pages[20]); 3577 pp = io_pplist; 3578 do { 3579 ASSERT(pp->p_vnode == vp); 3580 ASSERT(pp->p_offset == io_off); 3581 ASSERT(page_iolock_assert(pp)); 3582 page_io_unlock(pp); 3583 page_hashout(pp, NULL); 3584 io_off += PAGESIZE; 3585 } while ((pp = pp->p_next) != io_pplist); 3586 page_list_concat(&io_pplist, &pplist); 3587 pplist = io_pplist; 3588 } 3589 tmp_pplist = NULL; 3590 while (targ_pplist != NULL) { 3591 VM_STAT_ADD(segvnvmstats.fill_vp_pages[21]); 3592 pp = targ_pplist; 3593 ASSERT(PAGE_EXCL(pp)); 3594 page_sub(&targ_pplist, pp); 3595 3596 pszc = pp->p_szc; 3597 ppages = page_get_pagecnt(pszc); 3598 ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages)); 3599 3600 if (pszc != 0) { 3601 group_page_unlock(pp); 3602 } 3603 page_unlock(pp); 3604 3605 pp = repl_pplist; 3606 ASSERT(pp != NULL); 3607 ASSERT(PAGE_EXCL(pp)); 3608 ASSERT(pp->p_szc == szc); 3609 page_sub(&repl_pplist, pp); 3610 3611 ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages)); 3612 3613 /* relink replacement page */ 3614 page_list_concat(&tmp_pplist, &pp); 3615 while (--ppages != 0) { 3616 VM_STAT_ADD(segvnvmstats.fill_vp_pages[22]); 3617 pp++; 3618 ASSERT(PAGE_EXCL(pp)); 3619 ASSERT(pp->p_szc == szc); 3620 page_list_concat(&tmp_pplist, &pp); 3621 } 3622 } 3623 if (tmp_pplist != NULL) { 3624 VM_STAT_ADD(segvnvmstats.fill_vp_pages[23]); 3625 page_list_concat(&tmp_pplist, &pplist); 3626 pplist = tmp_pplist; 3627 } 3628 /* 3629 * at this point all pages are either on done_pplist or 3630 * pplist. They can't be all on done_pplist otherwise 3631 * we'd've been done. 3632 */ 3633 ASSERT(pplist != NULL); 3634 if (nios != 0) { 3635 VM_STAT_ADD(segvnvmstats.fill_vp_pages[24]); 3636 pp = pplist; 3637 do { 3638 VM_STAT_ADD(segvnvmstats.fill_vp_pages[25]); 3639 ASSERT(pp->p_szc == szc); 3640 ASSERT(PAGE_EXCL(pp)); 3641 ASSERT(pp->p_vnode != vp); 3642 pp->p_szc = 0; 3643 } while ((pp = pp->p_next) != pplist); 3644 3645 pp = done_pplist; 3646 do { 3647 VM_STAT_ADD(segvnvmstats.fill_vp_pages[26]); 3648 ASSERT(pp->p_szc == szc); 3649 ASSERT(PAGE_EXCL(pp)); 3650 ASSERT(pp->p_vnode == vp); 3651 pp->p_szc = 0; 3652 } while ((pp = pp->p_next) != done_pplist); 3653 3654 while (pplist != NULL) { 3655 VM_STAT_ADD(segvnvmstats.fill_vp_pages[27]); 3656 pp = pplist; 3657 page_sub(&pplist, pp); 3658 page_free(pp, 0); 3659 } 3660 3661 while (done_pplist != NULL) { 3662 VM_STAT_ADD(segvnvmstats.fill_vp_pages[28]); 3663 pp = done_pplist; 3664 page_sub(&done_pplist, pp); 3665 page_unlock(pp); 3666 } 3667 *ppplist = NULL; 3668 return (0); 3669 } 3670 ASSERT(pplist == *ppplist); 3671 if (io_err) { 3672 VM_STAT_ADD(segvnvmstats.fill_vp_pages[29]); 3673 /* 3674 * don't downsize on io error. 3675 * see if vop_getpage succeeds. 3676 * pplist may still be used in this case 3677 * for relocations. 
3678 */ 3679 return (0); 3680 } 3681 VM_STAT_ADD(segvnvmstats.fill_vp_pages[30]); 3682 page_free_replacement_page(pplist); 3683 page_create_putback(pages); 3684 *ppplist = NULL; 3685 return (0); 3686 } 3687 3688 int segvn_anypgsz = 0; 3689 3690 #define SEGVN_RESTORE_SOFTLOCK(type, pages) \ 3691 if ((type) == F_SOFTLOCK) { \ 3692 mutex_enter(&freemem_lock); \ 3693 availrmem += (pages); \ 3694 segvn_pages_locked -= (pages); \ 3695 svd->softlockcnt -= (pages); \ 3696 mutex_exit(&freemem_lock); \ 3697 } 3698 3699 #define SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot) \ 3700 if (IS_VMODSORT((ppa)[0]->p_vnode)) { \ 3701 if ((rw) == S_WRITE) { \ 3702 for (i = 0; i < (pages); i++) { \ 3703 ASSERT((ppa)[i]->p_vnode == \ 3704 (ppa)[0]->p_vnode); \ 3705 hat_setmod((ppa)[i]); \ 3706 } \ 3707 } else if ((rw) != S_OTHER && \ 3708 ((prot) & (vpprot) & PROT_WRITE)) { \ 3709 for (i = 0; i < (pages); i++) { \ 3710 ASSERT((ppa)[i]->p_vnode == \ 3711 (ppa)[0]->p_vnode); \ 3712 if (!hat_ismod((ppa)[i])) { \ 3713 prot &= ~PROT_WRITE; \ 3714 break; \ 3715 } \ 3716 } \ 3717 } \ 3718 } 3719 3720 #ifdef VM_STATS 3721 3722 #define SEGVN_VMSTAT_FLTVNPAGES(idx) \ 3723 VM_STAT_ADD(segvnvmstats.fltvnpages[(idx)]); 3724 3725 #else /* VM_STATS */ 3726 3727 #define SEGVN_VMSTAT_FLTVNPAGES(idx) 3728 3729 #endif 3730 3731 static faultcode_t 3732 segvn_fault_vnodepages(struct hat *hat, struct seg *seg, caddr_t lpgaddr, 3733 caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr, 3734 caddr_t eaddr, int brkcow) 3735 { 3736 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 3737 struct anon_map *amp = svd->amp; 3738 uchar_t segtype = svd->type; 3739 uint_t szc = seg->s_szc; 3740 size_t pgsz = page_get_pagesize(szc); 3741 size_t maxpgsz = pgsz; 3742 pgcnt_t pages = btop(pgsz); 3743 pgcnt_t maxpages = pages; 3744 size_t ppasize = (pages + 1) * sizeof (page_t *); 3745 caddr_t a = lpgaddr; 3746 caddr_t maxlpgeaddr = lpgeaddr; 3747 u_offset_t off = svd->offset + (uintptr_t)(a - seg->s_base); 3748 ulong_t aindx = svd->anon_index + seg_page(seg, a); 3749 struct vpage *vpage = (svd->vpage != NULL) ? 3750 &svd->vpage[seg_page(seg, a)] : NULL; 3751 vnode_t *vp = svd->vp; 3752 page_t **ppa; 3753 uint_t pszc; 3754 size_t ppgsz; 3755 pgcnt_t ppages; 3756 faultcode_t err = 0; 3757 int ierr; 3758 int vop_size_err = 0; 3759 uint_t protchk, prot, vpprot; 3760 ulong_t i; 3761 int hat_flag = (type == F_SOFTLOCK) ? 
HAT_LOAD_LOCK : HAT_LOAD; 3762 anon_sync_obj_t an_cookie; 3763 enum seg_rw arw; 3764 int alloc_failed = 0; 3765 int adjszc_chk; 3766 struct vattr va; 3767 int xhat = 0; 3768 page_t *pplist; 3769 pfn_t pfn; 3770 int physcontig; 3771 int upgrdfail; 3772 int segvn_anypgsz_vnode = 0; /* for now map vnode with 2 page sizes */ 3773 int tron = (svd->tr_state == SEGVN_TR_ON); 3774 3775 ASSERT(szc != 0); 3776 ASSERT(vp != NULL); 3777 ASSERT(brkcow == 0 || amp != NULL); 3778 ASSERT(tron == 0 || amp != NULL); 3779 ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */ 3780 ASSERT(!(svd->flags & MAP_NORESERVE)); 3781 ASSERT(type != F_SOFTUNLOCK); 3782 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 3783 ASSERT(amp == NULL || IS_P2ALIGNED(aindx, maxpages)); 3784 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 3785 ASSERT(seg->s_szc < NBBY * sizeof (int)); 3786 ASSERT(type != F_SOFTLOCK || lpgeaddr - a == maxpgsz); 3787 ASSERT(svd->tr_state != SEGVN_TR_INIT); 3788 3789 VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltvnpages[0]); 3790 VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltvnpages[1]); 3791 3792 if (svd->flags & MAP_TEXT) { 3793 hat_flag |= HAT_LOAD_TEXT; 3794 } 3795 3796 if (svd->pageprot) { 3797 switch (rw) { 3798 case S_READ: 3799 protchk = PROT_READ; 3800 break; 3801 case S_WRITE: 3802 protchk = PROT_WRITE; 3803 break; 3804 case S_EXEC: 3805 protchk = PROT_EXEC; 3806 break; 3807 case S_OTHER: 3808 default: 3809 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 3810 break; 3811 } 3812 } else { 3813 prot = svd->prot; 3814 /* caller has already done segment level protection check. */ 3815 } 3816 3817 if (seg->s_as->a_hat != hat) { 3818 xhat = 1; 3819 } 3820 3821 if (rw == S_WRITE && segtype == MAP_PRIVATE) { 3822 SEGVN_VMSTAT_FLTVNPAGES(2); 3823 arw = S_READ; 3824 } else { 3825 arw = rw; 3826 } 3827 3828 ppa = kmem_alloc(ppasize, KM_SLEEP); 3829 3830 VM_STAT_COND_ADD(amp != NULL, segvnvmstats.fltvnpages[3]); 3831 3832 for (;;) { 3833 adjszc_chk = 0; 3834 for (; a < lpgeaddr; a += pgsz, off += pgsz, aindx += pages) { 3835 if (adjszc_chk) { 3836 while (szc < seg->s_szc) { 3837 uintptr_t e; 3838 uint_t tszc; 3839 tszc = segvn_anypgsz_vnode ? 
                        szc + 1 :
                        seg->s_szc;
                    ppgsz = page_get_pagesize(tszc);
                    if (!IS_P2ALIGNED(a, ppgsz) ||
                        ((alloc_failed >> tszc) &
                        0x1)) {
                        break;
                    }
                    SEGVN_VMSTAT_FLTVNPAGES(4);
                    szc = tszc;
                    pgsz = ppgsz;
                    pages = btop(pgsz);
                    e = P2ROUNDUP((uintptr_t)eaddr, pgsz);
                    lpgeaddr = (caddr_t)e;
                }
            }

again:
            if (IS_P2ALIGNED(a, maxpgsz) && amp != NULL) {
                ASSERT(IS_P2ALIGNED(aindx, maxpages));
                ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
                anon_array_enter(amp, aindx, &an_cookie);
                if (anon_get_ptr(amp->ahp, aindx) != NULL) {
                    SEGVN_VMSTAT_FLTVNPAGES(5);
                    ASSERT(anon_pages(amp->ahp, aindx,
                        maxpages) == maxpages);
                    anon_array_exit(&an_cookie);
                    ANON_LOCK_EXIT(&amp->a_rwlock);
                    err = segvn_fault_anonpages(hat, seg,
                        a, a + maxpgsz, type, rw,
                        MAX(a, addr),
                        MIN(a + maxpgsz, eaddr), brkcow);
                    if (err != 0) {
                        SEGVN_VMSTAT_FLTVNPAGES(6);
                        goto out;
                    }
                    if (szc < seg->s_szc) {
                        szc = seg->s_szc;
                        pgsz = maxpgsz;
                        pages = maxpages;
                        lpgeaddr = maxlpgeaddr;
                    }
                    goto next;
                } else {
                    ASSERT(anon_pages(amp->ahp, aindx,
                        maxpages) == 0);
                    SEGVN_VMSTAT_FLTVNPAGES(7);
                    anon_array_exit(&an_cookie);
                    ANON_LOCK_EXIT(&amp->a_rwlock);
                }
            }
            ASSERT(!brkcow || IS_P2ALIGNED(a, maxpgsz));
            ASSERT(!tron || IS_P2ALIGNED(a, maxpgsz));

            if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) {
                ASSERT(vpage != NULL);
                prot = VPP_PROT(vpage);
                ASSERT(sameprot(seg, a, maxpgsz));
                if ((prot & protchk) == 0) {
                    SEGVN_VMSTAT_FLTVNPAGES(8);
                    err = FC_PROT;
                    goto out;
                }
            }
            if (type == F_SOFTLOCK) {
                mutex_enter(&freemem_lock);
                if (availrmem < tune.t_minarmem + pages) {
                    mutex_exit(&freemem_lock);
                    err = FC_MAKE_ERR(ENOMEM);
                    goto out;
                } else {
                    availrmem -= pages;
                    segvn_pages_locked += pages;
                    svd->softlockcnt += pages;
                }
                mutex_exit(&freemem_lock);
            }

            pplist = NULL;
            physcontig = 0;
            ppa[0] = NULL;
            if (!brkcow && !tron && szc &&
                !page_exists_physcontig(vp, off, szc,
                segtype == MAP_PRIVATE ?
ppa : NULL)) { 3923 SEGVN_VMSTAT_FLTVNPAGES(9); 3924 if (page_alloc_pages(vp, seg, a, &pplist, NULL, 3925 szc, 0, 0) && type != F_SOFTLOCK) { 3926 SEGVN_VMSTAT_FLTVNPAGES(10); 3927 pszc = 0; 3928 ierr = -1; 3929 alloc_failed |= (1 << szc); 3930 break; 3931 } 3932 if (pplist != NULL && 3933 vp->v_mpssdata == SEGVN_PAGEIO) { 3934 int downsize; 3935 SEGVN_VMSTAT_FLTVNPAGES(11); 3936 physcontig = segvn_fill_vp_pages(svd, 3937 vp, off, szc, ppa, &pplist, 3938 &pszc, &downsize); 3939 ASSERT(!physcontig || pplist == NULL); 3940 if (!physcontig && downsize && 3941 type != F_SOFTLOCK) { 3942 ASSERT(pplist == NULL); 3943 SEGVN_VMSTAT_FLTVNPAGES(12); 3944 ierr = -1; 3945 break; 3946 } 3947 ASSERT(!physcontig || 3948 segtype == MAP_PRIVATE || 3949 ppa[0] == NULL); 3950 if (physcontig && ppa[0] == NULL) { 3951 physcontig = 0; 3952 } 3953 } 3954 } else if (!brkcow && !tron && szc && ppa[0] != NULL) { 3955 SEGVN_VMSTAT_FLTVNPAGES(13); 3956 ASSERT(segtype == MAP_PRIVATE); 3957 physcontig = 1; 3958 } 3959 3960 if (!physcontig) { 3961 SEGVN_VMSTAT_FLTVNPAGES(14); 3962 ppa[0] = NULL; 3963 ierr = VOP_GETPAGE(vp, (offset_t)off, pgsz, 3964 &vpprot, ppa, pgsz, seg, a, arw, 3965 svd->cred); 3966 #ifdef DEBUG 3967 if (ierr == 0) { 3968 for (i = 0; i < pages; i++) { 3969 ASSERT(PAGE_LOCKED(ppa[i])); 3970 ASSERT(!PP_ISFREE(ppa[i])); 3971 ASSERT(ppa[i]->p_vnode == vp); 3972 ASSERT(ppa[i]->p_offset == 3973 off + (i << PAGESHIFT)); 3974 } 3975 } 3976 #endif /* DEBUG */ 3977 if (segtype == MAP_PRIVATE) { 3978 SEGVN_VMSTAT_FLTVNPAGES(15); 3979 vpprot &= ~PROT_WRITE; 3980 } 3981 } else { 3982 ASSERT(segtype == MAP_PRIVATE); 3983 SEGVN_VMSTAT_FLTVNPAGES(16); 3984 vpprot = PROT_ALL & ~PROT_WRITE; 3985 ierr = 0; 3986 } 3987 3988 if (ierr != 0) { 3989 SEGVN_VMSTAT_FLTVNPAGES(17); 3990 if (pplist != NULL) { 3991 SEGVN_VMSTAT_FLTVNPAGES(18); 3992 page_free_replacement_page(pplist); 3993 page_create_putback(pages); 3994 } 3995 SEGVN_RESTORE_SOFTLOCK(type, pages); 3996 if (a + pgsz <= eaddr) { 3997 SEGVN_VMSTAT_FLTVNPAGES(19); 3998 err = FC_MAKE_ERR(ierr); 3999 goto out; 4000 } 4001 va.va_mask = AT_SIZE; 4002 if (VOP_GETATTR(vp, &va, 0, svd->cred) != 0) { 4003 SEGVN_VMSTAT_FLTVNPAGES(20); 4004 err = FC_MAKE_ERR(EIO); 4005 goto out; 4006 } 4007 if (btopr(va.va_size) >= btopr(off + pgsz)) { 4008 SEGVN_VMSTAT_FLTVNPAGES(21); 4009 err = FC_MAKE_ERR(ierr); 4010 goto out; 4011 } 4012 if (btopr(va.va_size) < 4013 btopr(off + (eaddr - a))) { 4014 SEGVN_VMSTAT_FLTVNPAGES(22); 4015 err = FC_MAKE_ERR(ierr); 4016 goto out; 4017 } 4018 if (brkcow || tron || type == F_SOFTLOCK) { 4019 /* can't reduce map area */ 4020 SEGVN_VMSTAT_FLTVNPAGES(23); 4021 vop_size_err = 1; 4022 goto out; 4023 } 4024 SEGVN_VMSTAT_FLTVNPAGES(24); 4025 ASSERT(szc != 0); 4026 pszc = 0; 4027 ierr = -1; 4028 break; 4029 } 4030 4031 if (amp != NULL) { 4032 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4033 anon_array_enter(amp, aindx, &an_cookie); 4034 } 4035 if (amp != NULL && 4036 anon_get_ptr(amp->ahp, aindx) != NULL) { 4037 ulong_t taindx = P2ALIGN(aindx, maxpages); 4038 4039 SEGVN_VMSTAT_FLTVNPAGES(25); 4040 ASSERT(anon_pages(amp->ahp, taindx, 4041 maxpages) == maxpages); 4042 for (i = 0; i < pages; i++) { 4043 page_unlock(ppa[i]); 4044 } 4045 anon_array_exit(&an_cookie); 4046 ANON_LOCK_EXIT(&->a_rwlock); 4047 if (pplist != NULL) { 4048 page_free_replacement_page(pplist); 4049 page_create_putback(pages); 4050 } 4051 SEGVN_RESTORE_SOFTLOCK(type, pages); 4052 if (szc < seg->s_szc) { 4053 SEGVN_VMSTAT_FLTVNPAGES(26); 4054 /* 4055 * For private segments SOFTLOCK 4056 * either 
always breaks cow (any rw 4057 * type except S_READ_NOCOW) or 4058 * the address space is locked as writer 4059 * (S_READ_NOCOW case) and anon slots 4060 * can't show up on the second check. 4061 * Therefore, if we are here for 4062 * the SOFTLOCK case it must be a cow 4063 * break, but a cow break never reduces 4064 * szc. Text replication (tron) in 4065 * this case works as a cow break. 4066 * Thus the assert below. 4067 */ 4068 ASSERT(!brkcow && !tron && 4069 type != F_SOFTLOCK); 4070 pszc = seg->s_szc; 4071 ierr = -2; 4072 break; 4073 } 4074 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 4075 goto again; 4076 } 4077 #ifdef DEBUG 4078 if (amp != NULL) { 4079 ulong_t taindx = P2ALIGN(aindx, maxpages); 4080 ASSERT(!anon_pages(amp->ahp, taindx, maxpages)); 4081 } 4082 #endif /* DEBUG */ 4083 4084 if (brkcow || tron) { 4085 ASSERT(amp != NULL); 4086 ASSERT(pplist == NULL); 4087 ASSERT(szc == seg->s_szc); 4088 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 4089 ASSERT(IS_P2ALIGNED(aindx, maxpages)); 4090 SEGVN_VMSTAT_FLTVNPAGES(27); 4091 ierr = anon_map_privatepages(amp, aindx, szc, 4092 seg, a, prot, ppa, vpage, segvn_anypgsz, 4093 tron ? PG_LOCAL : 0, svd->cred); 4094 if (ierr != 0) { 4095 SEGVN_VMSTAT_FLTVNPAGES(28); 4096 anon_array_exit(&an_cookie); 4097 ANON_LOCK_EXIT(&amp->a_rwlock); 4098 SEGVN_RESTORE_SOFTLOCK(type, pages); 4099 err = FC_MAKE_ERR(ierr); 4100 goto out; 4101 } 4102 4103 ASSERT(!IS_VMODSORT(ppa[0]->p_vnode)); 4104 /* 4105 * p_szc can't be changed for locked 4106 * swapfs pages. 4107 */ 4108 ASSERT(svd->rcookie == 4109 HAT_INVALID_REGION_COOKIE); 4110 hat_memload_array(hat, a, pgsz, ppa, prot, 4111 hat_flag); 4112 4113 if (!(hat_flag & HAT_LOAD_LOCK)) { 4114 SEGVN_VMSTAT_FLTVNPAGES(29); 4115 for (i = 0; i < pages; i++) { 4116 page_unlock(ppa[i]); 4117 } 4118 } 4119 anon_array_exit(&an_cookie); 4120 ANON_LOCK_EXIT(&amp->a_rwlock); 4121 goto next; 4122 } 4123 4124 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE || 4125 (!svd->pageprot && svd->prot == (prot & vpprot))); 4126 4127 pfn = page_pptonum(ppa[0]); 4128 /* 4129 * hat_page_demote() needs an SE_EXCL lock on one of the 4130 * constituent page_t's and it decreases the root's p_szc 4131 * last. This means that if the root's p_szc is equal to szc and 4132 * all its constituent pages are locked, any 4133 * hat_page_demote() that could have changed p_szc to 4134 * szc is already done and no new hat_page_demote() 4135 * can start for this large page. 4136 */ 4137 4138 /* 4139 * We need to make sure the same mapping size is used for 4140 * the same address range if there's a possibility the 4141 * address is already mapped, because the hat layer panics 4142 * when a translation is loaded for a range already 4143 * mapped with a different page size. We achieve this 4144 * by always using the largest page size possible subject 4145 * to the constraints of page size, segment page size 4146 * and page alignment. Since mappings are invalidated 4147 * when those constraints change, making it 4148 * impossible to reuse the previously used mapping size, no 4149 * mapping size conflicts should happen.
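* (Editorial note, not part of the original source: the loop below enforces this with a simple retry protocol: ierr is set to -1 when a large mapping cannot be used so the outer loop sizes down, and to -2 when a larger page is discovered so it sizes up, while the alloc_failed bitmask remembers page sizes that already failed allocation so the same constraints always resolve to the same mapping size.)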
4150 */ 4151 4152 chkszc: 4153 if ((pszc = ppa[0]->p_szc) == szc && 4154 IS_P2ALIGNED(pfn, pages)) { 4155 4156 SEGVN_VMSTAT_FLTVNPAGES(30); 4157 #ifdef DEBUG 4158 for (i = 0; i < pages; i++) { 4159 ASSERT(PAGE_LOCKED(ppa[i])); 4160 ASSERT(!PP_ISFREE(ppa[i])); 4161 ASSERT(page_pptonum(ppa[i]) == 4162 pfn + i); 4163 ASSERT(ppa[i]->p_szc == szc); 4164 ASSERT(ppa[i]->p_vnode == vp); 4165 ASSERT(ppa[i]->p_offset == 4166 off + (i << PAGESHIFT)); 4167 } 4168 #endif /* DEBUG */ 4169 /* 4170 * All pages are of szc we need and they are 4171 * all locked so they can't change szc. load 4172 * translations. 4173 * 4174 * if page got promoted since last check 4175 * we don't need pplist. 4176 */ 4177 if (pplist != NULL) { 4178 page_free_replacement_page(pplist); 4179 page_create_putback(pages); 4180 } 4181 if (PP_ISMIGRATE(ppa[0])) { 4182 page_migrate(seg, a, ppa, pages); 4183 } 4184 SEGVN_UPDATE_MODBITS(ppa, pages, rw, 4185 prot, vpprot); 4186 if (!xhat) { 4187 hat_memload_array_region(hat, a, pgsz, 4188 ppa, prot & vpprot, hat_flag, 4189 svd->rcookie); 4190 } else { 4191 /* 4192 * avoid large xhat mappings to FS 4193 * pages so that hat_page_demote() 4194 * doesn't need to check for xhat 4195 * large mappings. 4196 * Don't use regions with xhats. 4197 */ 4198 for (i = 0; i < pages; i++) { 4199 hat_memload(hat, 4200 a + (i << PAGESHIFT), 4201 ppa[i], prot & vpprot, 4202 hat_flag); 4203 } 4204 } 4205 4206 if (!(hat_flag & HAT_LOAD_LOCK)) { 4207 for (i = 0; i < pages; i++) { 4208 page_unlock(ppa[i]); 4209 } 4210 } 4211 if (amp != NULL) { 4212 anon_array_exit(&an_cookie); 4213 ANON_LOCK_EXIT(&->a_rwlock); 4214 } 4215 goto next; 4216 } 4217 4218 /* 4219 * See if upsize is possible. 4220 */ 4221 if (pszc > szc && szc < seg->s_szc && 4222 (segvn_anypgsz_vnode || pszc >= seg->s_szc)) { 4223 pgcnt_t aphase; 4224 uint_t pszc1 = MIN(pszc, seg->s_szc); 4225 ppgsz = page_get_pagesize(pszc1); 4226 ppages = btop(ppgsz); 4227 aphase = btop(P2PHASE((uintptr_t)a, ppgsz)); 4228 4229 ASSERT(type != F_SOFTLOCK); 4230 4231 SEGVN_VMSTAT_FLTVNPAGES(31); 4232 if (aphase != P2PHASE(pfn, ppages)) { 4233 segvn_faultvnmpss_align_err4++; 4234 } else { 4235 SEGVN_VMSTAT_FLTVNPAGES(32); 4236 if (pplist != NULL) { 4237 page_t *pl = pplist; 4238 page_free_replacement_page(pl); 4239 page_create_putback(pages); 4240 } 4241 for (i = 0; i < pages; i++) { 4242 page_unlock(ppa[i]); 4243 } 4244 if (amp != NULL) { 4245 anon_array_exit(&an_cookie); 4246 ANON_LOCK_EXIT(&->a_rwlock); 4247 } 4248 pszc = pszc1; 4249 ierr = -2; 4250 break; 4251 } 4252 } 4253 4254 /* 4255 * check if we should use smallest mapping size. 4256 */ 4257 upgrdfail = 0; 4258 if (szc == 0 || xhat || 4259 (pszc >= szc && 4260 !IS_P2ALIGNED(pfn, pages)) || 4261 (pszc < szc && 4262 !segvn_full_szcpages(ppa, szc, &upgrdfail, 4263 &pszc))) { 4264 4265 if (upgrdfail && type != F_SOFTLOCK) { 4266 /* 4267 * segvn_full_szcpages failed to lock 4268 * all pages EXCL. Size down. 
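* (Editorial note, not part of the original source: this size-down path is taken only for non-F_SOFTLOCK faults; for F_SOFTLOCK the fault area cannot be re-sized, so the code below falls through and loads translations for the pages as they are instead of retrying.)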
4269 */ 4270 ASSERT(pszc < szc); 4271 4272 SEGVN_VMSTAT_FLTVNPAGES(33); 4273 4274 if (pplist != NULL) { 4275 page_t *pl = pplist; 4276 page_free_replacement_page(pl); 4277 page_create_putback(pages); 4278 } 4279 4280 for (i = 0; i < pages; i++) { 4281 page_unlock(ppa[i]); 4282 } 4283 if (amp != NULL) { 4284 anon_array_exit(&an_cookie); 4285 ANON_LOCK_EXIT(&->a_rwlock); 4286 } 4287 ierr = -1; 4288 break; 4289 } 4290 if (szc != 0 && !xhat && !upgrdfail) { 4291 segvn_faultvnmpss_align_err5++; 4292 } 4293 SEGVN_VMSTAT_FLTVNPAGES(34); 4294 if (pplist != NULL) { 4295 page_free_replacement_page(pplist); 4296 page_create_putback(pages); 4297 } 4298 SEGVN_UPDATE_MODBITS(ppa, pages, rw, 4299 prot, vpprot); 4300 if (upgrdfail && segvn_anypgsz_vnode) { 4301 /* SOFTLOCK case */ 4302 hat_memload_array_region(hat, a, pgsz, 4303 ppa, prot & vpprot, hat_flag, 4304 svd->rcookie); 4305 } else { 4306 for (i = 0; i < pages; i++) { 4307 hat_memload_region(hat, 4308 a + (i << PAGESHIFT), 4309 ppa[i], prot & vpprot, 4310 hat_flag, svd->rcookie); 4311 } 4312 } 4313 if (!(hat_flag & HAT_LOAD_LOCK)) { 4314 for (i = 0; i < pages; i++) { 4315 page_unlock(ppa[i]); 4316 } 4317 } 4318 if (amp != NULL) { 4319 anon_array_exit(&an_cookie); 4320 ANON_LOCK_EXIT(&->a_rwlock); 4321 } 4322 goto next; 4323 } 4324 4325 if (pszc == szc) { 4326 /* 4327 * segvn_full_szcpages() upgraded pages szc. 4328 */ 4329 ASSERT(pszc == ppa[0]->p_szc); 4330 ASSERT(IS_P2ALIGNED(pfn, pages)); 4331 goto chkszc; 4332 } 4333 4334 if (pszc > szc) { 4335 kmutex_t *szcmtx; 4336 SEGVN_VMSTAT_FLTVNPAGES(35); 4337 /* 4338 * p_szc of ppa[0] can change since we haven't 4339 * locked all constituent pages. Call 4340 * page_lock_szc() to prevent szc changes. 4341 * This should be a rare case that happens when 4342 * multiple segments use a different page size 4343 * to map the same file offsets. 4344 */ 4345 szcmtx = page_szc_lock(ppa[0]); 4346 pszc = ppa[0]->p_szc; 4347 ASSERT(szcmtx != NULL || pszc == 0); 4348 ASSERT(ppa[0]->p_szc <= pszc); 4349 if (pszc <= szc) { 4350 SEGVN_VMSTAT_FLTVNPAGES(36); 4351 if (szcmtx != NULL) { 4352 mutex_exit(szcmtx); 4353 } 4354 goto chkszc; 4355 } 4356 if (pplist != NULL) { 4357 /* 4358 * page got promoted since last check. 4359 * we don't need preaalocated large 4360 * page. 4361 */ 4362 SEGVN_VMSTAT_FLTVNPAGES(37); 4363 page_free_replacement_page(pplist); 4364 page_create_putback(pages); 4365 } 4366 SEGVN_UPDATE_MODBITS(ppa, pages, rw, 4367 prot, vpprot); 4368 hat_memload_array_region(hat, a, pgsz, ppa, 4369 prot & vpprot, hat_flag, svd->rcookie); 4370 mutex_exit(szcmtx); 4371 if (!(hat_flag & HAT_LOAD_LOCK)) { 4372 for (i = 0; i < pages; i++) { 4373 page_unlock(ppa[i]); 4374 } 4375 } 4376 if (amp != NULL) { 4377 anon_array_exit(&an_cookie); 4378 ANON_LOCK_EXIT(&->a_rwlock); 4379 } 4380 goto next; 4381 } 4382 4383 /* 4384 * if page got demoted since last check 4385 * we could have not allocated larger page. 4386 * allocate now. 
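* (Editorial note, not part of the original source: this handles the race where another thread demoted the page after the earlier physical-contiguity check; a replacement large page is allocated here and, if that succeeds, segvn_relocate_pages() below moves the constituent pages into it.)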
4387 */ 4388 if (pplist == NULL && 4389 page_alloc_pages(vp, seg, a, &pplist, NULL, 4390 szc, 0, 0) && type != F_SOFTLOCK) { 4391 SEGVN_VMSTAT_FLTVNPAGES(38); 4392 for (i = 0; i < pages; i++) { 4393 page_unlock(ppa[i]); 4394 } 4395 if (amp != NULL) { 4396 anon_array_exit(&an_cookie); 4397 ANON_LOCK_EXIT(&->a_rwlock); 4398 } 4399 ierr = -1; 4400 alloc_failed |= (1 << szc); 4401 break; 4402 } 4403 4404 SEGVN_VMSTAT_FLTVNPAGES(39); 4405 4406 if (pplist != NULL) { 4407 segvn_relocate_pages(ppa, pplist); 4408 #ifdef DEBUG 4409 } else { 4410 ASSERT(type == F_SOFTLOCK); 4411 SEGVN_VMSTAT_FLTVNPAGES(40); 4412 #endif /* DEBUG */ 4413 } 4414 4415 SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot); 4416 4417 if (pplist == NULL && segvn_anypgsz_vnode == 0) { 4418 ASSERT(type == F_SOFTLOCK); 4419 for (i = 0; i < pages; i++) { 4420 ASSERT(ppa[i]->p_szc < szc); 4421 hat_memload_region(hat, 4422 a + (i << PAGESHIFT), 4423 ppa[i], prot & vpprot, hat_flag, 4424 svd->rcookie); 4425 } 4426 } else { 4427 ASSERT(pplist != NULL || type == F_SOFTLOCK); 4428 hat_memload_array_region(hat, a, pgsz, ppa, 4429 prot & vpprot, hat_flag, svd->rcookie); 4430 } 4431 if (!(hat_flag & HAT_LOAD_LOCK)) { 4432 for (i = 0; i < pages; i++) { 4433 ASSERT(PAGE_SHARED(ppa[i])); 4434 page_unlock(ppa[i]); 4435 } 4436 } 4437 if (amp != NULL) { 4438 anon_array_exit(&an_cookie); 4439 ANON_LOCK_EXIT(&->a_rwlock); 4440 } 4441 4442 next: 4443 if (vpage != NULL) { 4444 vpage += pages; 4445 } 4446 adjszc_chk = 1; 4447 } 4448 if (a == lpgeaddr) 4449 break; 4450 ASSERT(a < lpgeaddr); 4451 4452 ASSERT(!brkcow && !tron && type != F_SOFTLOCK); 4453 4454 /* 4455 * ierr == -1 means we failed to map with a large page. 4456 * (either due to allocation/relocation failures or 4457 * misalignment with other mappings to this file. 4458 * 4459 * ierr == -2 means some other thread allocated a large page 4460 * after we gave up tp map with a large page. retry with 4461 * larger mapping. 4462 */ 4463 ASSERT(ierr == -1 || ierr == -2); 4464 ASSERT(ierr == -2 || szc != 0); 4465 ASSERT(ierr == -1 || szc < seg->s_szc); 4466 if (ierr == -2) { 4467 SEGVN_VMSTAT_FLTVNPAGES(41); 4468 ASSERT(pszc > szc && pszc <= seg->s_szc); 4469 szc = pszc; 4470 } else if (segvn_anypgsz_vnode) { 4471 SEGVN_VMSTAT_FLTVNPAGES(42); 4472 szc--; 4473 } else { 4474 SEGVN_VMSTAT_FLTVNPAGES(43); 4475 ASSERT(pszc < szc); 4476 /* 4477 * other process created pszc large page. 4478 * but we still have to drop to 0 szc. 4479 */ 4480 szc = 0; 4481 } 4482 4483 pgsz = page_get_pagesize(szc); 4484 pages = btop(pgsz); 4485 if (ierr == -2) { 4486 /* 4487 * Size up case. Note lpgaddr may only be needed for 4488 * softlock case so we don't adjust it here. 4489 */ 4490 a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz); 4491 ASSERT(a >= lpgaddr); 4492 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4493 off = svd->offset + (uintptr_t)(a - seg->s_base); 4494 aindx = svd->anon_index + seg_page(seg, a); 4495 vpage = (svd->vpage != NULL) ? 4496 &svd->vpage[seg_page(seg, a)] : NULL; 4497 } else { 4498 /* 4499 * Size down case. Note lpgaddr may only be needed for 4500 * softlock case so we don't adjust it here. 4501 */ 4502 ASSERT(IS_P2ALIGNED(a, pgsz)); 4503 ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz)); 4504 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4505 ASSERT(a < lpgeaddr); 4506 if (a < addr) { 4507 SEGVN_VMSTAT_FLTVNPAGES(44); 4508 /* 4509 * The beginning of the large page region can 4510 * be pulled to the right to make a smaller 4511 * region. We haven't yet faulted a single 4512 * page. 
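* (Illustrative example, not from the original source and using made-up sizes: if the faulting address lies, say, 3MB into a 4MB large-page region and we drop to 64KB pages, the loop can restart at the faulting address rounded down to 64KB rather than back at the old 4MB boundary, because nothing in the region has been faulted in yet.)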
4513 */ 4514 a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 4515 ASSERT(a >= lpgaddr); 4516 off = svd->offset + 4517 (uintptr_t)(a - seg->s_base); 4518 aindx = svd->anon_index + seg_page(seg, a); 4519 vpage = (svd->vpage != NULL) ? 4520 &svd->vpage[seg_page(seg, a)] : NULL; 4521 } 4522 } 4523 } 4524 out: 4525 kmem_free(ppa, ppasize); 4526 if (!err && !vop_size_err) { 4527 SEGVN_VMSTAT_FLTVNPAGES(45); 4528 return (0); 4529 } 4530 if (type == F_SOFTLOCK && a > lpgaddr) { 4531 SEGVN_VMSTAT_FLTVNPAGES(46); 4532 segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER); 4533 } 4534 if (!vop_size_err) { 4535 SEGVN_VMSTAT_FLTVNPAGES(47); 4536 return (err); 4537 } 4538 ASSERT(brkcow || tron || type == F_SOFTLOCK); 4539 /* 4540 * Large page end is mapped beyond the end of file and it's a cow 4541 * fault (can be a text replication induced cow) or softlock so we can't 4542 * reduce the map area. For now just demote the segment. This should 4543 * really only happen if the end of the file changed after the mapping 4544 * was established since when large page segments are created we make 4545 * sure they don't extend beyond the end of the file. 4546 */ 4547 SEGVN_VMSTAT_FLTVNPAGES(48); 4548 4549 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4550 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4551 err = 0; 4552 if (seg->s_szc != 0) { 4553 segvn_fltvnpages_clrszc_cnt++; 4554 ASSERT(svd->softlockcnt == 0); 4555 err = segvn_clrszc(seg); 4556 if (err != 0) { 4557 segvn_fltvnpages_clrszc_err++; 4558 } 4559 } 4560 ASSERT(err || seg->s_szc == 0); 4561 SEGVN_LOCK_DOWNGRADE(seg->s_as, &svd->lock); 4562 /* segvn_fault will do its job as if szc had been zero to begin with */ 4563 return (err == 0 ? IE_RETRY : FC_MAKE_ERR(err)); 4564 } 4565 4566 /* 4567 * This routine will attempt to fault in one large page. 4568 * it will use smaller pages if that fails. 4569 * It should only be called for pure anonymous segments. 4570 */ 4571 static faultcode_t 4572 segvn_fault_anonpages(struct hat *hat, struct seg *seg, caddr_t lpgaddr, 4573 caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr, 4574 caddr_t eaddr, int brkcow) 4575 { 4576 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 4577 struct anon_map *amp = svd->amp; 4578 uchar_t segtype = svd->type; 4579 uint_t szc = seg->s_szc; 4580 size_t pgsz = page_get_pagesize(szc); 4581 size_t maxpgsz = pgsz; 4582 pgcnt_t pages = btop(pgsz); 4583 size_t ppasize = pages * sizeof (page_t *); 4584 caddr_t a = lpgaddr; 4585 ulong_t aindx = svd->anon_index + seg_page(seg, a); 4586 struct vpage *vpage = (svd->vpage != NULL) ? 4587 &svd->vpage[seg_page(seg, a)] : NULL; 4588 page_t **ppa; 4589 uint_t ppa_szc; 4590 faultcode_t err; 4591 int ierr; 4592 uint_t protchk, prot, vpprot; 4593 ulong_t i; 4594 int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD; 4595 anon_sync_obj_t cookie; 4596 int first = 1; 4597 int adjszc_chk; 4598 int purged = 0; 4599 int pgflags = (svd->tr_state == SEGVN_TR_ON) ? 
PG_LOCAL : 0; 4600 4601 ASSERT(szc != 0); 4602 ASSERT(amp != NULL); 4603 ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */ 4604 ASSERT(!(svd->flags & MAP_NORESERVE)); 4605 ASSERT(type != F_SOFTUNLOCK); 4606 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 4607 ASSERT(!brkcow || svd->tr_state == SEGVN_TR_OFF); 4608 ASSERT(svd->tr_state != SEGVN_TR_INIT); 4609 4610 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 4611 4612 VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltanpages[0]); 4613 VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltanpages[1]); 4614 4615 if (svd->flags & MAP_TEXT) { 4616 hat_flag |= HAT_LOAD_TEXT; 4617 } 4618 4619 if (svd->pageprot) { 4620 switch (rw) { 4621 case S_READ: 4622 protchk = PROT_READ; 4623 break; 4624 case S_WRITE: 4625 protchk = PROT_WRITE; 4626 break; 4627 case S_EXEC: 4628 protchk = PROT_EXEC; 4629 break; 4630 case S_OTHER: 4631 default: 4632 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 4633 break; 4634 } 4635 VM_STAT_ADD(segvnvmstats.fltanpages[2]); 4636 } else { 4637 prot = svd->prot; 4638 /* caller has already done segment level protection check. */ 4639 } 4640 4641 ppa = kmem_alloc(ppasize, KM_SLEEP); 4642 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4643 for (;;) { 4644 adjszc_chk = 0; 4645 for (; a < lpgeaddr; a += pgsz, aindx += pages) { 4646 if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) { 4647 VM_STAT_ADD(segvnvmstats.fltanpages[3]); 4648 ASSERT(vpage != NULL); 4649 prot = VPP_PROT(vpage); 4650 ASSERT(sameprot(seg, a, maxpgsz)); 4651 if ((prot & protchk) == 0) { 4652 err = FC_PROT; 4653 goto error; 4654 } 4655 } 4656 if (adjszc_chk && IS_P2ALIGNED(a, maxpgsz) && 4657 pgsz < maxpgsz) { 4658 ASSERT(a > lpgaddr); 4659 szc = seg->s_szc; 4660 pgsz = maxpgsz; 4661 pages = btop(pgsz); 4662 ASSERT(IS_P2ALIGNED(aindx, pages)); 4663 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, 4664 pgsz); 4665 } 4666 if (type == F_SOFTLOCK && svd->vp != NULL) { 4667 mutex_enter(&freemem_lock); 4668 if (availrmem < tune.t_minarmem + pages) { 4669 mutex_exit(&freemem_lock); 4670 err = FC_MAKE_ERR(ENOMEM); 4671 goto error; 4672 } else { 4673 availrmem -= pages; 4674 segvn_pages_locked += pages; 4675 svd->softlockcnt += pages; 4676 } 4677 mutex_exit(&freemem_lock); 4678 } 4679 anon_array_enter(amp, aindx, &cookie); 4680 ppa_szc = (uint_t)-1; 4681 ierr = anon_map_getpages(amp, aindx, szc, seg, a, 4682 prot, &vpprot, ppa, &ppa_szc, vpage, rw, brkcow, 4683 segvn_anypgsz, pgflags, svd->cred); 4684 if (ierr != 0) { 4685 anon_array_exit(&cookie); 4686 VM_STAT_ADD(segvnvmstats.fltanpages[4]); 4687 if (type == F_SOFTLOCK && svd->vp != NULL) { 4688 VM_STAT_ADD(segvnvmstats.fltanpages[5]); 4689 mutex_enter(&freemem_lock); 4690 availrmem += pages; 4691 segvn_pages_locked -= pages; 4692 svd->softlockcnt -= pages; 4693 mutex_exit(&freemem_lock); 4694 } 4695 if (ierr > 0) { 4696 VM_STAT_ADD(segvnvmstats.fltanpages[6]); 4697 err = FC_MAKE_ERR(ierr); 4698 goto error; 4699 } 4700 break; 4701 } 4702 4703 ASSERT(!IS_VMODSORT(ppa[0]->p_vnode)); 4704 4705 ASSERT(segtype == MAP_SHARED || 4706 ppa[0]->p_szc <= szc); 4707 ASSERT(segtype == MAP_PRIVATE || 4708 ppa[0]->p_szc >= szc); 4709 4710 /* 4711 * Handle pages that have been marked for migration 4712 */ 4713 if (lgrp_optimizations()) 4714 page_migrate(seg, a, ppa, pages); 4715 4716 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 4717 if (type == F_SOFTLOCK && svd->vp == NULL) { 4718 /* 4719 * If all pages in ppa array belong to the same 4720 * large page call segvn_slock_anonpages() 4721 * just for ppa[0]. 
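* (Editorial note, not part of the original source: the loop below detects this by checking on the first pass whether ppa[0] is the root of a page of size greater than or equal to szc; if so it breaks out after locking just ppa[0], and that single lock covers the whole large page.)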
4722 */ 4723 for (i = 0; i < pages; i++) { 4724 if (!segvn_slock_anonpages(ppa[i], 4725 i == 0 && first)) { 4726 ulong_t j; 4727 for (j = 0; j < i; j++) { 4728 segvn_sunlock_anonpages( 4729 ppa[j], 4730 j == 0 && 4731 first); 4732 page_unlock(ppa[j]); 4733 } 4734 for (j = i; j < pages; j++) { 4735 page_unlock(ppa[j]); 4736 } 4737 anon_array_exit(&cookie); 4738 err = FC_MAKE_ERR(ENOMEM); 4739 goto error; 4740 } 4741 if (i == 0 && ppa[0]->p_szc >= szc) { 4742 ASSERT(!(page_pptonum(ppa[0]) & 4743 (pages - 1))); 4744 break; 4745 } 4746 } 4747 first = 0; 4748 mutex_enter(&freemem_lock); 4749 svd->softlockcnt += pages; 4750 segvn_pages_locked += pages; 4751 mutex_exit(&freemem_lock); 4752 } 4753 4754 if (segtype == MAP_SHARED) { 4755 vpprot |= PROT_WRITE; 4756 } 4757 4758 hat_memload_array(hat, a, pgsz, ppa, 4759 prot & vpprot, hat_flag); 4760 4761 if (hat_flag & HAT_LOAD_LOCK) { 4762 VM_STAT_ADD(segvnvmstats.fltanpages[7]); 4763 } else { 4764 VM_STAT_ADD(segvnvmstats.fltanpages[8]); 4765 for (i = 0; i < pages; i++) 4766 page_unlock(ppa[i]); 4767 } 4768 if (vpage != NULL) 4769 vpage += pages; 4770 4771 anon_array_exit(&cookie); 4772 adjszc_chk = 1; 4773 } 4774 if (a == lpgeaddr) 4775 break; 4776 ASSERT(a < lpgeaddr); 4777 /* 4778 * ierr == -1 means we failed to allocate a large page. 4779 * so do a size down operation. 4780 * 4781 * ierr == -2 means some other process that privately shares 4782 * pages with this process has allocated a larger page and we 4783 * need to retry with larger pages. So do a size up 4784 * operation. This relies on the fact that large pages are 4785 * never partially shared i.e. if we share any constituent 4786 * page of a large page with another process we must share the 4787 * entire large page. Note this cannot happen for SOFTLOCK 4788 * case, unless current address (a) is at the beginning of the 4789 * next page size boundary because the other process couldn't 4790 * have relocated locked pages. 4791 */ 4792 ASSERT(ierr == -1 || ierr == -2); 4793 /* 4794 * For the very first relocation failure try to purge this 4795 * segment's cache so that the relocator can obtain an 4796 * exclusive lock on pages we want to relocate. 4797 */ 4798 if (!purged && ierr == -1 && ppa_szc != (uint_t)-1 && 4799 svd->softlockcnt != 0) { 4800 purged = 1; 4801 segvn_purge(seg); 4802 continue; 4803 } 4804 4805 if (segvn_anypgsz) { 4806 ASSERT(ierr == -2 || szc != 0); 4807 ASSERT(ierr == -1 || szc < seg->s_szc); 4808 szc = (ierr == -1) ? szc - 1 : szc + 1; 4809 } else { 4810 /* 4811 * For non COW faults and segvn_anypgsz == 0 4812 * we need to be careful not to loop forever 4813 * if existing page is found with szc other 4814 * than 0 or seg->s_szc. This could be due 4815 * to page relocations on behalf of DR or 4816 * more likely large page creation. For this 4817 * case simply re-size to existing page's szc 4818 * if returned by anon_map_getpages(). 4819 */ 4820 if (ppa_szc == (uint_t)-1) { 4821 szc = (ierr == -1) ? 
0 : seg->s_szc; 4822 } else { 4823 ASSERT(ppa_szc <= seg->s_szc); 4824 ASSERT(ierr == -2 || ppa_szc < szc); 4825 ASSERT(ierr == -1 || ppa_szc > szc); 4826 szc = ppa_szc; 4827 } 4828 } 4829 4830 pgsz = page_get_pagesize(szc); 4831 pages = btop(pgsz); 4832 ASSERT(type != F_SOFTLOCK || ierr == -1 || 4833 (IS_P2ALIGNED(a, pgsz) && IS_P2ALIGNED(lpgeaddr, pgsz))); 4834 if (type == F_SOFTLOCK) { 4835 /* 4836 * For softlocks we cannot reduce the fault area 4837 * (calculated based on the largest page size for this 4838 * segment) for size down and a is already next 4839 * page size aligned as assertted above for size 4840 * ups. Therefore just continue in case of softlock. 4841 */ 4842 VM_STAT_ADD(segvnvmstats.fltanpages[9]); 4843 continue; /* keep lint happy */ 4844 } else if (ierr == -2) { 4845 4846 /* 4847 * Size up case. Note lpgaddr may only be needed for 4848 * softlock case so we don't adjust it here. 4849 */ 4850 VM_STAT_ADD(segvnvmstats.fltanpages[10]); 4851 a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz); 4852 ASSERT(a >= lpgaddr); 4853 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4854 aindx = svd->anon_index + seg_page(seg, a); 4855 vpage = (svd->vpage != NULL) ? 4856 &svd->vpage[seg_page(seg, a)] : NULL; 4857 } else { 4858 /* 4859 * Size down case. Note lpgaddr may only be needed for 4860 * softlock case so we don't adjust it here. 4861 */ 4862 VM_STAT_ADD(segvnvmstats.fltanpages[11]); 4863 ASSERT(IS_P2ALIGNED(a, pgsz)); 4864 ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz)); 4865 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4866 ASSERT(a < lpgeaddr); 4867 if (a < addr) { 4868 /* 4869 * The beginning of the large page region can 4870 * be pulled to the right to make a smaller 4871 * region. We haven't yet faulted a single 4872 * page. 4873 */ 4874 VM_STAT_ADD(segvnvmstats.fltanpages[12]); 4875 a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 4876 ASSERT(a >= lpgaddr); 4877 aindx = svd->anon_index + seg_page(seg, a); 4878 vpage = (svd->vpage != NULL) ? 4879 &svd->vpage[seg_page(seg, a)] : NULL; 4880 } 4881 } 4882 } 4883 VM_STAT_ADD(segvnvmstats.fltanpages[13]); 4884 ANON_LOCK_EXIT(&->a_rwlock); 4885 kmem_free(ppa, ppasize); 4886 return (0); 4887 error: 4888 VM_STAT_ADD(segvnvmstats.fltanpages[14]); 4889 ANON_LOCK_EXIT(&->a_rwlock); 4890 kmem_free(ppa, ppasize); 4891 if (type == F_SOFTLOCK && a > lpgaddr) { 4892 VM_STAT_ADD(segvnvmstats.fltanpages[15]); 4893 segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER); 4894 } 4895 return (err); 4896 } 4897 4898 int fltadvice = 1; /* set to free behind pages for sequential access */ 4899 4900 /* 4901 * This routine is called via a machine specific fault handling routine. 4902 * It is also called by software routines wishing to lock or unlock 4903 * a range of addresses. 
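* (Editorial note, not part of the original source: "lock or unlock" here refers to the F_SOFTLOCK and F_SOFTUNLOCK fault types, used for example when pages must stay resident while physical I/O is in progress.)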
4904 * 4905 * Here is the basic algorithm: 4906 * If unlocking 4907 * Call segvn_softunlock 4908 * Return 4909 * endif 4910 * Checking and set up work 4911 * If we will need some non-anonymous pages 4912 * Call VOP_GETPAGE over the range of non-anonymous pages 4913 * endif 4914 * Loop over all addresses requested 4915 * Call segvn_faultpage passing in page list 4916 * to load up translations and handle anonymous pages 4917 * endloop 4918 * Load up translation to any additional pages in page list not 4919 * already handled that fit into this segment 4920 */ 4921 static faultcode_t 4922 segvn_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len, 4923 enum fault_type type, enum seg_rw rw) 4924 { 4925 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 4926 page_t **plp, **ppp, *pp; 4927 u_offset_t off; 4928 caddr_t a; 4929 struct vpage *vpage; 4930 uint_t vpprot, prot; 4931 int err; 4932 page_t *pl[PVN_GETPAGE_NUM + 1]; 4933 size_t plsz, pl_alloc_sz; 4934 size_t page; 4935 ulong_t anon_index; 4936 struct anon_map *amp; 4937 int dogetpage = 0; 4938 caddr_t lpgaddr, lpgeaddr; 4939 size_t pgsz; 4940 anon_sync_obj_t cookie; 4941 int brkcow = BREAK_COW_SHARE(rw, type, svd->type); 4942 4943 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 4944 ASSERT(svd->amp == NULL || svd->rcookie == HAT_INVALID_REGION_COOKIE); 4945 4946 /* 4947 * First handle the easy stuff 4948 */ 4949 if (type == F_SOFTUNLOCK) { 4950 if (rw == S_READ_NOCOW) { 4951 rw = S_READ; 4952 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 4953 } 4954 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 4955 pgsz = (seg->s_szc == 0) ? PAGESIZE : 4956 page_get_pagesize(seg->s_szc); 4957 VM_STAT_COND_ADD(pgsz > PAGESIZE, segvnvmstats.fltanpages[16]); 4958 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 4959 segvn_softunlock(seg, lpgaddr, lpgeaddr - lpgaddr, rw); 4960 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4961 return (0); 4962 } 4963 4964 ASSERT(svd->tr_state == SEGVN_TR_OFF || 4965 !HAT_IS_REGION_COOKIE_VALID(svd->rcookie)); 4966 if (brkcow == 0) { 4967 if (svd->tr_state == SEGVN_TR_INIT) { 4968 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4969 if (svd->tr_state == SEGVN_TR_INIT) { 4970 ASSERT(svd->vp != NULL && svd->amp == NULL); 4971 ASSERT(svd->flags & MAP_TEXT); 4972 ASSERT(svd->type == MAP_PRIVATE); 4973 segvn_textrepl(seg); 4974 ASSERT(svd->tr_state != SEGVN_TR_INIT); 4975 ASSERT(svd->tr_state != SEGVN_TR_ON || 4976 svd->amp != NULL); 4977 } 4978 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4979 } 4980 } else if (svd->tr_state != SEGVN_TR_OFF) { 4981 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4982 4983 if (rw == S_WRITE && svd->tr_state != SEGVN_TR_OFF) { 4984 ASSERT(!svd->pageprot && !(svd->prot & PROT_WRITE)); 4985 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4986 return (FC_PROT); 4987 } 4988 4989 if (svd->tr_state == SEGVN_TR_ON) { 4990 ASSERT(svd->vp != NULL && svd->amp != NULL); 4991 segvn_textunrepl(seg, 0); 4992 ASSERT(svd->amp == NULL && 4993 svd->tr_state == SEGVN_TR_OFF); 4994 } else if (svd->tr_state != SEGVN_TR_OFF) { 4995 svd->tr_state = SEGVN_TR_OFF; 4996 } 4997 ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF); 4998 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4999 } 5000 5001 top: 5002 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 5003 5004 /* 5005 * If we have the same protections for the entire segment, 5006 * insure that the access being attempted is legitimate. 
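* (Editorial note, not part of the original source: S_READ_NOCOW is checked exactly like S_READ here; as noted further down, the distinction only matters for the SOFTLOCK handling that follows and the two are treated identically after that.)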
5007 */ 5008 5009 if (svd->pageprot == 0) { 5010 uint_t protchk; 5011 5012 switch (rw) { 5013 case S_READ: 5014 case S_READ_NOCOW: 5015 protchk = PROT_READ; 5016 break; 5017 case S_WRITE: 5018 protchk = PROT_WRITE; 5019 break; 5020 case S_EXEC: 5021 protchk = PROT_EXEC; 5022 break; 5023 case S_OTHER: 5024 default: 5025 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 5026 break; 5027 } 5028 5029 if ((svd->prot & protchk) == 0) { 5030 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5031 return (FC_PROT); /* illegal access type */ 5032 } 5033 } 5034 5035 if (brkcow && HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 5036 /* this must be SOFTLOCK S_READ fault */ 5037 ASSERT(svd->amp == NULL); 5038 ASSERT(svd->tr_state == SEGVN_TR_OFF); 5039 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5040 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 5041 if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 5042 /* 5043 * this must be the first ever non S_READ_NOCOW 5044 * softlock for this segment. 5045 */ 5046 ASSERT(svd->softlockcnt == 0); 5047 hat_leave_region(seg->s_as->a_hat, svd->rcookie, 5048 HAT_REGION_TEXT); 5049 svd->rcookie = HAT_INVALID_REGION_COOKIE; 5050 } 5051 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5052 goto top; 5053 } 5054 5055 /* 5056 * We can't allow the long term use of softlocks for vmpss segments, 5057 * because in some file truncation cases we should be able to demote 5058 * the segment, which requires that there are no softlocks. The 5059 * only case where it's ok to allow a SOFTLOCK fault against a vmpss 5060 * segment is S_READ_NOCOW, where the caller holds the address space 5061 * locked as writer and calls softunlock before dropping the as lock. 5062 * S_READ_NOCOW is used by /proc to read memory from another user. 5063 * 5064 * Another deadlock between SOFTLOCK and file truncation can happen 5065 * because segvn_fault_vnodepages() calls the FS one pagesize at 5066 * a time. A second VOP_GETPAGE() call by segvn_fault_vnodepages() 5067 * can cause a deadlock because the first set of page_t's remain 5068 * locked SE_SHARED. To avoid this, we demote segments on a first 5069 * SOFTLOCK if they have a length greater than the segment's 5070 * page size. 5071 * 5072 * So for now, we only avoid demoting a segment on a SOFTLOCK when 5073 * the access type is S_READ_NOCOW and the fault length is less than 5074 * or equal to the segment's page size. While this is quite restrictive, 5075 * it should be the most common case of SOFTLOCK against a vmpss 5076 * segment. 5077 * 5078 * For S_READ_NOCOW, it's safe not to do a copy on write because the 5079 * caller makes sure no COW will be caused by another thread for a 5080 * softlocked page. 
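* (Editorial summary, not original text: the check that follows demotes the segment on F_SOFTLOCK unless rw == S_READ_NOCOW and the fault either is no larger than PAGESIZE or, when rounded out to large-page boundaries, spans a single large page.)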
5081 */ 5082 if (type == F_SOFTLOCK && svd->vp != NULL && seg->s_szc != 0) { 5083 int demote = 0; 5084 5085 if (rw != S_READ_NOCOW) { 5086 demote = 1; 5087 } 5088 if (!demote && len > PAGESIZE) { 5089 pgsz = page_get_pagesize(seg->s_szc); 5090 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, 5091 lpgeaddr); 5092 if (lpgeaddr - lpgaddr > pgsz) { 5093 demote = 1; 5094 } 5095 } 5096 5097 ASSERT(demote || AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 5098 5099 if (demote) { 5100 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5101 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 5102 if (seg->s_szc != 0) { 5103 segvn_vmpss_clrszc_cnt++; 5104 ASSERT(svd->softlockcnt == 0); 5105 err = segvn_clrszc(seg); 5106 if (err) { 5107 segvn_vmpss_clrszc_err++; 5108 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5109 return (FC_MAKE_ERR(err)); 5110 } 5111 } 5112 ASSERT(seg->s_szc == 0); 5113 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5114 goto top; 5115 } 5116 } 5117 5118 /* 5119 * Check to see if we need to allocate an anon_map structure. 5120 */ 5121 if (svd->amp == NULL && (svd->vp == NULL || brkcow)) { 5122 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 5123 /* 5124 * Drop the "read" lock on the segment and acquire 5125 * the "write" version since we have to allocate the 5126 * anon_map. 5127 */ 5128 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5129 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 5130 5131 if (svd->amp == NULL) { 5132 svd->amp = anonmap_alloc(seg->s_size, 0, ANON_SLEEP); 5133 svd->amp->a_szc = seg->s_szc; 5134 } 5135 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5136 5137 /* 5138 * Start all over again since segment protections 5139 * may have changed after we dropped the "read" lock. 5140 */ 5141 goto top; 5142 } 5143 5144 /* 5145 * S_READ_NOCOW vs S_READ distinction was 5146 * only needed for the code above. After 5147 * that we treat it as S_READ. 5148 */ 5149 if (rw == S_READ_NOCOW) { 5150 ASSERT(type == F_SOFTLOCK); 5151 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 5152 rw = S_READ; 5153 } 5154 5155 amp = svd->amp; 5156 5157 /* 5158 * MADV_SEQUENTIAL work is ignored for large page segments. 5159 */ 5160 if (seg->s_szc != 0) { 5161 pgsz = page_get_pagesize(seg->s_szc); 5162 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 5163 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 5164 if (svd->vp == NULL) { 5165 err = segvn_fault_anonpages(hat, seg, lpgaddr, 5166 lpgeaddr, type, rw, addr, addr + len, brkcow); 5167 } else { 5168 err = segvn_fault_vnodepages(hat, seg, lpgaddr, 5169 lpgeaddr, type, rw, addr, addr + len, brkcow); 5170 if (err == IE_RETRY) { 5171 ASSERT(seg->s_szc == 0); 5172 ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock)); 5173 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5174 goto top; 5175 } 5176 } 5177 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5178 return (err); 5179 } 5180 5181 page = seg_page(seg, addr); 5182 if (amp != NULL) { 5183 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 5184 anon_index = svd->anon_index + page; 5185 5186 if (type == F_PROT && rw == S_READ && 5187 svd->tr_state == SEGVN_TR_OFF && 5188 svd->type == MAP_PRIVATE && svd->pageprot == 0) { 5189 size_t index = anon_index; 5190 struct anon *ap; 5191 5192 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5193 /* 5194 * The fast path could apply to S_WRITE also, except 5195 * that the protection fault could be caused by lazy 5196 * tlb flush when ro->rw. In this case, the pte is 5197 * RW already. But RO in the other cpu's tlb causes 5198 * the fault. 
Since hat_chgprot won't do anything if 5199 * pte doesn't change, we may end up faulting 5200 * indefinitely until the RO tlb entry gets replaced. 5201 */ 5202 for (a = addr; a < addr + len; a += PAGESIZE, index++) { 5203 anon_array_enter(amp, index, &cookie); 5204 ap = anon_get_ptr(amp->ahp, index); 5205 anon_array_exit(&cookie); 5206 if ((ap == NULL) || (ap->an_refcnt != 1)) { 5207 ANON_LOCK_EXIT(&->a_rwlock); 5208 goto slow; 5209 } 5210 } 5211 hat_chgprot(seg->s_as->a_hat, addr, len, svd->prot); 5212 ANON_LOCK_EXIT(&->a_rwlock); 5213 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5214 return (0); 5215 } 5216 } 5217 slow: 5218 5219 if (svd->vpage == NULL) 5220 vpage = NULL; 5221 else 5222 vpage = &svd->vpage[page]; 5223 5224 off = svd->offset + (uintptr_t)(addr - seg->s_base); 5225 5226 /* 5227 * If MADV_SEQUENTIAL has been set for the particular page we 5228 * are faulting on, free behind all pages in the segment and put 5229 * them on the free list. 5230 */ 5231 5232 if ((page != 0) && fltadvice && svd->tr_state != SEGVN_TR_ON) { 5233 struct vpage *vpp; 5234 ulong_t fanon_index; 5235 size_t fpage; 5236 u_offset_t pgoff, fpgoff; 5237 struct vnode *fvp; 5238 struct anon *fap = NULL; 5239 5240 if (svd->advice == MADV_SEQUENTIAL || 5241 (svd->pageadvice && 5242 VPP_ADVICE(vpage) == MADV_SEQUENTIAL)) { 5243 pgoff = off - PAGESIZE; 5244 fpage = page - 1; 5245 if (vpage != NULL) 5246 vpp = &svd->vpage[fpage]; 5247 if (amp != NULL) 5248 fanon_index = svd->anon_index + fpage; 5249 5250 while (pgoff > svd->offset) { 5251 if (svd->advice != MADV_SEQUENTIAL && 5252 (!svd->pageadvice || (vpage && 5253 VPP_ADVICE(vpp) != MADV_SEQUENTIAL))) 5254 break; 5255 5256 /* 5257 * If this is an anon page, we must find the 5258 * correct <vp, offset> for it 5259 */ 5260 fap = NULL; 5261 if (amp != NULL) { 5262 ANON_LOCK_ENTER(&->a_rwlock, 5263 RW_READER); 5264 anon_array_enter(amp, fanon_index, 5265 &cookie); 5266 fap = anon_get_ptr(amp->ahp, 5267 fanon_index); 5268 if (fap != NULL) { 5269 swap_xlate(fap, &fvp, &fpgoff); 5270 } else { 5271 fpgoff = pgoff; 5272 fvp = svd->vp; 5273 } 5274 anon_array_exit(&cookie); 5275 ANON_LOCK_EXIT(&->a_rwlock); 5276 } else { 5277 fpgoff = pgoff; 5278 fvp = svd->vp; 5279 } 5280 if (fvp == NULL) 5281 break; /* XXX */ 5282 /* 5283 * Skip pages that are free or have an 5284 * "exclusive" lock. 5285 */ 5286 pp = page_lookup_nowait(fvp, fpgoff, SE_SHARED); 5287 if (pp == NULL) 5288 break; 5289 /* 5290 * We don't need the page_struct_lock to test 5291 * as this is only advisory; even if we 5292 * acquire it someone might race in and lock 5293 * the page after we unlock and before the 5294 * PUTPAGE, then VOP_PUTPAGE will do nothing. 5295 */ 5296 if (pp->p_lckcnt == 0 && pp->p_cowcnt == 0) { 5297 /* 5298 * Hold the vnode before releasing 5299 * the page lock to prevent it from 5300 * being freed and re-used by some 5301 * other thread. 5302 */ 5303 VN_HOLD(fvp); 5304 page_unlock(pp); 5305 /* 5306 * We should build a page list 5307 * to kluster putpages XXX 5308 */ 5309 (void) VOP_PUTPAGE(fvp, 5310 (offset_t)fpgoff, PAGESIZE, 5311 (B_DONTNEED|B_FREE|B_ASYNC), 5312 svd->cred); 5313 VN_RELE(fvp); 5314 } else { 5315 /* 5316 * XXX - Should the loop terminate if 5317 * the page is `locked'? 5318 */ 5319 page_unlock(pp); 5320 } 5321 --vpp; 5322 --fanon_index; 5323 pgoff -= PAGESIZE; 5324 } 5325 } 5326 } 5327 5328 plp = pl; 5329 *plp = NULL; 5330 pl_alloc_sz = 0; 5331 5332 /* 5333 * See if we need to call VOP_GETPAGE for 5334 * *any* of the range being faulted on. 
5335 * We can skip all of this work if there 5336 * was no original vnode. 5337 */ 5338 if (svd->vp != NULL) { 5339 u_offset_t vp_off; 5340 size_t vp_len; 5341 struct anon *ap; 5342 vnode_t *vp; 5343 5344 vp_off = off; 5345 vp_len = len; 5346 5347 if (amp == NULL) 5348 dogetpage = 1; 5349 else { 5350 /* 5351 * Only acquire reader lock to prevent amp->ahp 5352 * from being changed. It's ok to miss pages, 5353 * hence we don't do anon_array_enter 5354 */ 5355 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5356 ap = anon_get_ptr(amp->ahp, anon_index); 5357 5358 if (len <= PAGESIZE) 5359 /* inline non_anon() */ 5360 dogetpage = (ap == NULL); 5361 else 5362 dogetpage = non_anon(amp->ahp, anon_index, 5363 &vp_off, &vp_len); 5364 ANON_LOCK_EXIT(&->a_rwlock); 5365 } 5366 5367 if (dogetpage) { 5368 enum seg_rw arw; 5369 struct as *as = seg->s_as; 5370 5371 if (len > ptob((sizeof (pl) / sizeof (pl[0])) - 1)) { 5372 /* 5373 * Page list won't fit in local array, 5374 * allocate one of the needed size. 5375 */ 5376 pl_alloc_sz = 5377 (btop(len) + 1) * sizeof (page_t *); 5378 plp = kmem_alloc(pl_alloc_sz, KM_SLEEP); 5379 plp[0] = NULL; 5380 plsz = len; 5381 } else if (rw == S_WRITE && svd->type == MAP_PRIVATE || 5382 svd->tr_state == SEGVN_TR_ON || rw == S_OTHER || 5383 (((size_t)(addr + PAGESIZE) < 5384 (size_t)(seg->s_base + seg->s_size)) && 5385 hat_probe(as->a_hat, addr + PAGESIZE))) { 5386 /* 5387 * Ask VOP_GETPAGE to return the exact number 5388 * of pages if 5389 * (a) this is a COW fault, or 5390 * (b) this is a software fault, or 5391 * (c) next page is already mapped. 5392 */ 5393 plsz = len; 5394 } else { 5395 /* 5396 * Ask VOP_GETPAGE to return adjacent pages 5397 * within the segment. 5398 */ 5399 plsz = MIN((size_t)PVN_GETPAGE_SZ, (size_t) 5400 ((seg->s_base + seg->s_size) - addr)); 5401 ASSERT((addr + plsz) <= 5402 (seg->s_base + seg->s_size)); 5403 } 5404 5405 /* 5406 * Need to get some non-anonymous pages. 5407 * We need to make only one call to GETPAGE to do 5408 * this to prevent certain deadlocking conditions 5409 * when we are doing locking. In this case 5410 * non_anon() should have picked up the smallest 5411 * range which includes all the non-anonymous 5412 * pages in the requested range. We have to 5413 * be careful regarding which rw flag to pass in 5414 * because on a private mapping, the underlying 5415 * object is never allowed to be written. 5416 */ 5417 if (rw == S_WRITE && svd->type == MAP_PRIVATE) { 5418 arw = S_READ; 5419 } else { 5420 arw = rw; 5421 } 5422 vp = svd->vp; 5423 TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE, 5424 "segvn_getpage:seg %p addr %p vp %p", 5425 seg, addr, vp); 5426 err = VOP_GETPAGE(vp, (offset_t)vp_off, vp_len, 5427 &vpprot, plp, plsz, seg, addr + (vp_off - off), arw, 5428 svd->cred); 5429 if (err) { 5430 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5431 segvn_pagelist_rele(plp); 5432 if (pl_alloc_sz) 5433 kmem_free(plp, pl_alloc_sz); 5434 return (FC_MAKE_ERR(err)); 5435 } 5436 if (svd->type == MAP_PRIVATE) 5437 vpprot &= ~PROT_WRITE; 5438 } 5439 } 5440 5441 /* 5442 * N.B. at this time the plp array has all the needed non-anon 5443 * pages in addition to (possibly) having some adjacent pages. 5444 */ 5445 5446 /* 5447 * Always acquire the anon_array_lock to prevent 5448 * 2 threads from allocating separate anon slots for 5449 * the same "addr". 
5450 * 5451 * If this is a copy-on-write fault and we don't already 5452 * have the anon_array_lock, acquire it to prevent the 5453 * fault routine from handling multiple copy-on-write faults 5454 * on the same "addr" in the same address space. 5455 * 5456 * Only one thread should deal with the fault since after 5457 * it is handled, the other threads can acquire a translation 5458 * to the newly created private page. This prevents two or 5459 * more threads from creating different private pages for the 5460 * same fault. 5461 * 5462 * We grab "serialization" lock here if this is a MAP_PRIVATE segment 5463 * to prevent deadlock between this thread and another thread 5464 * which has soft-locked this page and wants to acquire serial_lock. 5465 * ( bug 4026339 ) 5466 * 5467 * The fix for bug 4026339 becomes unnecessary when using the 5468 * locking scheme with per amp rwlock and a global set of hash 5469 * lock, anon_array_lock. If we steal a vnode page when low 5470 * on memory and upgrad the page lock through page_rename, 5471 * then the page is PAGE_HANDLED, nothing needs to be done 5472 * for this page after returning from segvn_faultpage. 5473 * 5474 * But really, the page lock should be downgraded after 5475 * the stolen page is page_rename'd. 5476 */ 5477 5478 if (amp != NULL) 5479 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5480 5481 /* 5482 * Ok, now loop over the address range and handle faults 5483 */ 5484 for (a = addr; a < addr + len; a += PAGESIZE, off += PAGESIZE) { 5485 err = segvn_faultpage(hat, seg, a, off, vpage, plp, vpprot, 5486 type, rw, brkcow, a == addr); 5487 if (err) { 5488 if (amp != NULL) 5489 ANON_LOCK_EXIT(&->a_rwlock); 5490 if (type == F_SOFTLOCK && a > addr) { 5491 segvn_softunlock(seg, addr, (a - addr), 5492 S_OTHER); 5493 } 5494 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5495 segvn_pagelist_rele(plp); 5496 if (pl_alloc_sz) 5497 kmem_free(plp, pl_alloc_sz); 5498 return (err); 5499 } 5500 if (vpage) { 5501 vpage++; 5502 } else if (svd->vpage) { 5503 page = seg_page(seg, addr); 5504 vpage = &svd->vpage[++page]; 5505 } 5506 } 5507 5508 /* Didn't get pages from the underlying fs so we're done */ 5509 if (!dogetpage) 5510 goto done; 5511 5512 /* 5513 * Now handle any other pages in the list returned. 5514 * If the page can be used, load up the translations now. 5515 * Note that the for loop will only be entered if "plp" 5516 * is pointing to a non-NULL page pointer which means that 5517 * VOP_GETPAGE() was called and vpprot has been initialized. 5518 */ 5519 if (svd->pageprot == 0) 5520 prot = svd->prot & vpprot; 5521 5522 5523 /* 5524 * Large Files: diff should be unsigned value because we started 5525 * supporting > 2GB segment sizes from 2.5.1 and when a 5526 * large file of size > 2GB gets mapped to address space 5527 * the diff value can be > 2GB. 5528 */ 5529 5530 for (ppp = plp; (pp = *ppp) != NULL; ppp++) { 5531 size_t diff; 5532 struct anon *ap; 5533 int anon_index; 5534 anon_sync_obj_t cookie; 5535 int hat_flag = HAT_LOAD_ADV; 5536 5537 if (svd->flags & MAP_TEXT) { 5538 hat_flag |= HAT_LOAD_TEXT; 5539 } 5540 5541 if (pp == PAGE_HANDLED) 5542 continue; 5543 5544 if (svd->tr_state != SEGVN_TR_ON && 5545 pp->p_offset >= svd->offset && 5546 pp->p_offset < svd->offset + seg->s_size) { 5547 5548 diff = pp->p_offset - svd->offset; 5549 5550 /* 5551 * Large Files: Following is the assertion 5552 * validating the above cast. 
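* (Editorial note, not part of the original source: diff is declared as a size_t so that offsets beyond 2GB in a mapped large file do not overflow a signed 32-bit type; btop(diff) then yields the page index within the segment.)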
5553 */ 5554 ASSERT(svd->vp == pp->p_vnode); 5555 5556 page = btop(diff); 5557 if (svd->pageprot) 5558 prot = VPP_PROT(&svd->vpage[page]) & vpprot; 5559 5560 /* 5561 * Prevent other threads in the address space from 5562 * creating private pages (i.e., allocating anon slots) 5563 * while we are in the process of loading translations 5564 * to additional pages returned by the underlying 5565 * object. 5566 */ 5567 if (amp != NULL) { 5568 anon_index = svd->anon_index + page; 5569 anon_array_enter(amp, anon_index, &cookie); 5570 ap = anon_get_ptr(amp->ahp, anon_index); 5571 } 5572 if ((amp == NULL) || (ap == NULL)) { 5573 if (IS_VMODSORT(pp->p_vnode) || 5574 enable_mbit_wa) { 5575 if (rw == S_WRITE) 5576 hat_setmod(pp); 5577 else if (rw != S_OTHER && 5578 !hat_ismod(pp)) 5579 prot &= ~PROT_WRITE; 5580 } 5581 /* 5582 * Skip mapping read ahead pages marked 5583 * for migration, so they will get migrated 5584 * properly on fault 5585 */ 5586 ASSERT(amp == NULL || 5587 svd->rcookie == HAT_INVALID_REGION_COOKIE); 5588 if ((prot & PROT_READ) && !PP_ISMIGRATE(pp)) { 5589 hat_memload_region(hat, 5590 seg->s_base + diff, 5591 pp, prot, hat_flag, 5592 svd->rcookie); 5593 } 5594 } 5595 if (amp != NULL) 5596 anon_array_exit(&cookie); 5597 } 5598 page_unlock(pp); 5599 } 5600 done: 5601 if (amp != NULL) 5602 ANON_LOCK_EXIT(&->a_rwlock); 5603 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5604 if (pl_alloc_sz) 5605 kmem_free(plp, pl_alloc_sz); 5606 return (0); 5607 } 5608 5609 /* 5610 * This routine is used to start I/O on pages asynchronously. XXX it will 5611 * only create PAGESIZE pages. At fault time they will be relocated into 5612 * larger pages. 5613 */ 5614 static faultcode_t 5615 segvn_faulta(struct seg *seg, caddr_t addr) 5616 { 5617 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5618 int err; 5619 struct anon_map *amp; 5620 vnode_t *vp; 5621 5622 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5623 5624 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 5625 if ((amp = svd->amp) != NULL) { 5626 struct anon *ap; 5627 5628 /* 5629 * Reader lock to prevent amp->ahp from being changed. 5630 * This is advisory, it's ok to miss a page, so 5631 * we don't do anon_array_enter lock. 
5632 */ 5633 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5634 if ((ap = anon_get_ptr(amp->ahp, 5635 svd->anon_index + seg_page(seg, addr))) != NULL) { 5636 5637 err = anon_getpage(&ap, NULL, NULL, 5638 0, seg, addr, S_READ, svd->cred); 5639 5640 ANON_LOCK_EXIT(&->a_rwlock); 5641 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5642 if (err) 5643 return (FC_MAKE_ERR(err)); 5644 return (0); 5645 } 5646 ANON_LOCK_EXIT(&->a_rwlock); 5647 } 5648 5649 if (svd->vp == NULL) { 5650 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5651 return (0); /* zfod page - do nothing now */ 5652 } 5653 5654 vp = svd->vp; 5655 TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE, 5656 "segvn_getpage:seg %p addr %p vp %p", seg, addr, vp); 5657 err = VOP_GETPAGE(vp, 5658 (offset_t)(svd->offset + (uintptr_t)(addr - seg->s_base)), 5659 PAGESIZE, NULL, NULL, 0, seg, addr, 5660 S_OTHER, svd->cred); 5661 5662 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5663 if (err) 5664 return (FC_MAKE_ERR(err)); 5665 return (0); 5666 } 5667 5668 static int 5669 segvn_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) 5670 { 5671 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5672 struct vpage *svp, *evp; 5673 struct vnode *vp; 5674 size_t pgsz; 5675 pgcnt_t pgcnt; 5676 anon_sync_obj_t cookie; 5677 int unload_done = 0; 5678 5679 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5680 5681 if ((svd->maxprot & prot) != prot) 5682 return (EACCES); /* violated maxprot */ 5683 5684 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 5685 5686 /* return if prot is the same */ 5687 if (!svd->pageprot && svd->prot == prot) { 5688 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5689 return (0); 5690 } 5691 5692 /* 5693 * Since we change protections we first have to flush the cache. 5694 * This makes sure all the pagelock calls have to recheck 5695 * protections. 5696 */ 5697 if (svd->softlockcnt > 0) { 5698 ASSERT(svd->tr_state == SEGVN_TR_OFF); 5699 /* 5700 * Since we do have the segvn writers lock nobody can fill 5701 * the cache with entries belonging to this seg during 5702 * the purge. The flush either succeeds or we still have 5703 * pending I/Os. 5704 */ 5705 segvn_purge(seg); 5706 if (svd->softlockcnt > 0) { 5707 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5708 return (EAGAIN); 5709 } 5710 } 5711 5712 if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 5713 ASSERT(svd->amp == NULL); 5714 ASSERT(svd->tr_state == SEGVN_TR_OFF); 5715 hat_leave_region(seg->s_as->a_hat, svd->rcookie, 5716 HAT_REGION_TEXT); 5717 svd->rcookie = HAT_INVALID_REGION_COOKIE; 5718 unload_done = 1; 5719 } else if (svd->tr_state == SEGVN_TR_INIT) { 5720 svd->tr_state = SEGVN_TR_OFF; 5721 } else if (svd->tr_state == SEGVN_TR_ON) { 5722 ASSERT(svd->amp != NULL); 5723 segvn_textunrepl(seg, 0); 5724 ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF); 5725 unload_done = 1; 5726 } 5727 5728 if ((prot & PROT_WRITE) && svd->type == MAP_SHARED && 5729 svd->vp != NULL && (svd->vp->v_flag & VVMEXEC)) { 5730 ASSERT(vn_is_mapped(svd->vp, V_WRITE)); 5731 segvn_inval_trcache(svd->vp); 5732 } 5733 if (seg->s_szc != 0) { 5734 int err; 5735 pgsz = page_get_pagesize(seg->s_szc); 5736 pgcnt = pgsz >> PAGESHIFT; 5737 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 5738 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { 5739 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5740 ASSERT(seg->s_base != addr || seg->s_size != len); 5741 /* 5742 * If we are holding the as lock as a reader then 5743 * we need to return IE_RETRY and let the as 5744 * layer drop and re-aquire the lock as a writer. 
5745 */ 5746 if (AS_READ_HELD(seg->s_as, &seg->s_as->a_lock)) 5747 return (IE_RETRY); 5748 VM_STAT_ADD(segvnvmstats.demoterange[1]); 5749 if (svd->type == MAP_PRIVATE || svd->vp != NULL) { 5750 err = segvn_demote_range(seg, addr, len, 5751 SDR_END, 0); 5752 } else { 5753 uint_t szcvec = map_pgszcvec(seg->s_base, 5754 pgsz, (uintptr_t)seg->s_base, 5755 (svd->flags & MAP_TEXT), MAPPGSZC_SHM, 0); 5756 err = segvn_demote_range(seg, addr, len, 5757 SDR_END, szcvec); 5758 } 5759 if (err == 0) 5760 return (IE_RETRY); 5761 if (err == ENOMEM) 5762 return (IE_NOMEM); 5763 return (err); 5764 } 5765 } 5766 5767 5768 /* 5769 * If it's a private mapping and we're making it writable 5770 * and no swap space has been reserved, have to reserve 5771 * it all now. If it's a private mapping to a file (i.e., vp != NULL) 5772 * and we're removing write permission on the entire segment and 5773 * we haven't modified any pages, we can release the swap space. 5774 */ 5775 if (svd->type == MAP_PRIVATE) { 5776 if (prot & PROT_WRITE) { 5777 size_t sz; 5778 if (svd->swresv == 0 && !(svd->flags & MAP_NORESERVE)) { 5779 if (anon_resv_zone(seg->s_size, 5780 seg->s_as->a_proc->p_zone) == 0) { 5781 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5782 return (IE_NOMEM); 5783 } 5784 sz = svd->swresv = seg->s_size; 5785 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 5786 "anon proc:%p %lu %u", 5787 seg, sz, 1); 5788 } 5789 } else { 5790 /* 5791 * Swap space is released only if this segment 5792 * does not map anonymous memory, since read faults 5793 * on such segments still need an anon slot to read 5794 * in the data. 5795 */ 5796 if (svd->swresv != 0 && svd->vp != NULL && 5797 svd->amp == NULL && addr == seg->s_base && 5798 len == seg->s_size && svd->pageprot == 0) { 5799 anon_unresv_zone(svd->swresv, 5800 seg->s_as->a_proc->p_zone); 5801 svd->swresv = 0; 5802 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 5803 "anon proc:%p %lu %u", 5804 seg, 0, 0); 5805 } 5806 } 5807 } 5808 5809 if (addr == seg->s_base && len == seg->s_size && svd->vpage == NULL) { 5810 if (svd->prot == prot) { 5811 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5812 return (0); /* all done */ 5813 } 5814 svd->prot = (uchar_t)prot; 5815 } else if (svd->type == MAP_PRIVATE) { 5816 struct anon *ap = NULL; 5817 page_t *pp; 5818 u_offset_t offset, off; 5819 struct anon_map *amp; 5820 ulong_t anon_idx = 0; 5821 5822 /* 5823 * A vpage structure exists or else the change does not 5824 * involve the entire segment. Establish a vpage structure 5825 * if none is there. Then, for each page in the range, 5826 * adjust its individual permissions. Note that write- 5827 * enabling a MAP_PRIVATE page can affect the claims for 5828 * locked down memory. Overcommitting memory terminates 5829 * the operation. 5830 */ 5831 segvn_vpage(seg); 5832 svd->pageprot = 1; 5833 if ((amp = svd->amp) != NULL) { 5834 anon_idx = svd->anon_index + seg_page(seg, addr); 5835 ASSERT(seg->s_szc == 0 || 5836 IS_P2ALIGNED(anon_idx, pgcnt)); 5837 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5838 } 5839 5840 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 5841 evp = &svd->vpage[seg_page(seg, addr + len)]; 5842 5843 /* 5844 * See Statement at the beginning of segvn_lockop regarding 5845 * the way cowcnts and lckcnts are handled. 
5846 */ 5847 for (svp = &svd->vpage[seg_page(seg, addr)]; svp < evp; svp++) { 5848 5849 if (seg->s_szc != 0) { 5850 if (amp != NULL) { 5851 anon_array_enter(amp, anon_idx, 5852 &cookie); 5853 } 5854 if (IS_P2ALIGNED(anon_idx, pgcnt) && 5855 !segvn_claim_pages(seg, svp, offset, 5856 anon_idx, prot)) { 5857 if (amp != NULL) { 5858 anon_array_exit(&cookie); 5859 } 5860 break; 5861 } 5862 if (amp != NULL) { 5863 anon_array_exit(&cookie); 5864 } 5865 anon_idx++; 5866 } else { 5867 if (amp != NULL) { 5868 anon_array_enter(amp, anon_idx, 5869 &cookie); 5870 ap = anon_get_ptr(amp->ahp, anon_idx++); 5871 } 5872 5873 if (VPP_ISPPLOCK(svp) && 5874 VPP_PROT(svp) != prot) { 5875 5876 if (amp == NULL || ap == NULL) { 5877 vp = svd->vp; 5878 off = offset; 5879 } else 5880 swap_xlate(ap, &vp, &off); 5881 if (amp != NULL) 5882 anon_array_exit(&cookie); 5883 5884 if ((pp = page_lookup(vp, off, 5885 SE_SHARED)) == NULL) { 5886 panic("segvn_setprot: no page"); 5887 /*NOTREACHED*/ 5888 } 5889 ASSERT(seg->s_szc == 0); 5890 if ((VPP_PROT(svp) ^ prot) & 5891 PROT_WRITE) { 5892 if (prot & PROT_WRITE) { 5893 if (!page_addclaim(pp)) { 5894 page_unlock(pp); 5895 break; 5896 } 5897 } else { 5898 if (!page_subclaim(pp)) { 5899 page_unlock(pp); 5900 break; 5901 } 5902 } 5903 } 5904 page_unlock(pp); 5905 } else if (amp != NULL) 5906 anon_array_exit(&cookie); 5907 } 5908 VPP_SETPROT(svp, prot); 5909 offset += PAGESIZE; 5910 } 5911 if (amp != NULL) 5912 ANON_LOCK_EXIT(&->a_rwlock); 5913 5914 /* 5915 * Did we terminate prematurely? If so, simply unload 5916 * the translations to the things we've updated so far. 5917 */ 5918 if (svp != evp) { 5919 if (unload_done) { 5920 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5921 return (IE_NOMEM); 5922 } 5923 len = (svp - &svd->vpage[seg_page(seg, addr)]) * 5924 PAGESIZE; 5925 ASSERT(seg->s_szc == 0 || IS_P2ALIGNED(len, pgsz)); 5926 if (len != 0) 5927 hat_unload(seg->s_as->a_hat, addr, 5928 len, HAT_UNLOAD); 5929 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5930 return (IE_NOMEM); 5931 } 5932 } else { 5933 segvn_vpage(seg); 5934 svd->pageprot = 1; 5935 evp = &svd->vpage[seg_page(seg, addr + len)]; 5936 for (svp = &svd->vpage[seg_page(seg, addr)]; svp < evp; svp++) { 5937 VPP_SETPROT(svp, prot); 5938 } 5939 } 5940 5941 if (unload_done) { 5942 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5943 return (0); 5944 } 5945 5946 if (((prot & PROT_WRITE) != 0 && 5947 (svd->vp != NULL || svd->type == MAP_PRIVATE)) || 5948 (prot & ~PROT_USER) == PROT_NONE) { 5949 /* 5950 * Either private or shared data with write access (in 5951 * which case we need to throw out all former translations 5952 * so that we get the right translations set up on fault 5953 * and we don't allow write access to any copy-on-write pages 5954 * that might be around or to prevent write access to pages 5955 * representing holes in a file), or we don't have permission 5956 * to access the memory at all (in which case we have to 5957 * unload any current translations that might exist). 5958 */ 5959 hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD); 5960 } else { 5961 /* 5962 * A shared mapping or a private mapping in which write 5963 * protection is going to be denied - just change all the 5964 * protections over the range of addresses in question. 5965 * segvn does not support any other attributes other 5966 * than prot so we can use hat_chgattr. 
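 *
 * For example (illustrative only): mprotect()ing a MAP_SHARED file
 * mapping down to PROT_READ only needs the protection bits in the
 * existing translations changed, so hat_chgattr() suffices; whereas
 * write-enabling a MAP_PRIVATE mapping must discard the old
 * read-only translations (hat_unload() above) so that the next
 * write fault goes through segvn_fault() and gets a private
 * copy-on-write page.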
5967 */ 5968 hat_chgattr(seg->s_as->a_hat, addr, len, prot); 5969 } 5970 5971 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5972 5973 return (0); 5974 } 5975 5976 /* 5977 * segvn_setpagesize is called via SEGOP_SETPAGESIZE from as_setpagesize, 5978 * to determine if the seg is capable of mapping the requested szc. 5979 */ 5980 static int 5981 segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc) 5982 { 5983 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5984 struct segvn_data *nsvd; 5985 struct anon_map *amp = svd->amp; 5986 struct seg *nseg; 5987 caddr_t eaddr = addr + len, a; 5988 size_t pgsz = page_get_pagesize(szc); 5989 pgcnt_t pgcnt = page_get_pagecnt(szc); 5990 int err; 5991 u_offset_t off = svd->offset + (uintptr_t)(addr - seg->s_base); 5992 extern struct vnode kvp; 5993 5994 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 5995 ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size); 5996 5997 if (seg->s_szc == szc || segvn_lpg_disable != 0) { 5998 return (0); 5999 } 6000 6001 /* 6002 * addr should always be pgsz aligned but eaddr may be misaligned if 6003 * it's at the end of the segment. 6004 * 6005 * XXX we should assert this condition since as_setpagesize() logic 6006 * guarantees it. 6007 */ 6008 if (!IS_P2ALIGNED(addr, pgsz) || 6009 (!IS_P2ALIGNED(eaddr, pgsz) && 6010 eaddr != seg->s_base + seg->s_size)) { 6011 6012 segvn_setpgsz_align_err++; 6013 return (EINVAL); 6014 } 6015 6016 if (amp != NULL && svd->type == MAP_SHARED) { 6017 ulong_t an_idx = svd->anon_index + seg_page(seg, addr); 6018 if (!IS_P2ALIGNED(an_idx, pgcnt)) { 6019 6020 segvn_setpgsz_anon_align_err++; 6021 return (EINVAL); 6022 } 6023 } 6024 6025 if ((svd->flags & MAP_NORESERVE) || seg->s_as == &kas || 6026 szc > segvn_maxpgszc) { 6027 return (EINVAL); 6028 } 6029 6030 /* paranoid check */ 6031 if (svd->vp != NULL && 6032 (IS_SWAPFSVP(svd->vp) || VN_ISKAS(svd->vp))) { 6033 return (EINVAL); 6034 } 6035 6036 if (seg->s_szc == 0 && svd->vp != NULL && 6037 map_addr_vacalign_check(addr, off)) { 6038 return (EINVAL); 6039 } 6040 6041 /* 6042 * Check that protections are the same within new page 6043 * size boundaries. 6044 */ 6045 if (svd->pageprot) { 6046 for (a = addr; a < eaddr; a += pgsz) { 6047 if ((a + pgsz) > eaddr) { 6048 if (!sameprot(seg, a, eaddr - a)) { 6049 return (EINVAL); 6050 } 6051 } else { 6052 if (!sameprot(seg, a, pgsz)) { 6053 return (EINVAL); 6054 } 6055 } 6056 } 6057 } 6058 6059 /* 6060 * Since we are changing page size we first have to flush 6061 * the cache. This makes sure all the pagelock calls have 6062 * to recheck protections. 6063 */ 6064 if (svd->softlockcnt > 0) { 6065 ASSERT(svd->tr_state == SEGVN_TR_OFF); 6066 /* 6067 * Since we do have the segvn writers lock nobody can fill 6068 * the cache with entries belonging to this seg during 6069 * the purge. The flush either succeeds or we still have 6070 * pending I/Os. 
6071 */ 6072 segvn_purge(seg); 6073 if (svd->softlockcnt > 0) { 6074 return (EAGAIN); 6075 } 6076 } 6077 6078 if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 6079 ASSERT(svd->amp == NULL); 6080 ASSERT(svd->tr_state == SEGVN_TR_OFF); 6081 hat_leave_region(seg->s_as->a_hat, svd->rcookie, 6082 HAT_REGION_TEXT); 6083 svd->rcookie = HAT_INVALID_REGION_COOKIE; 6084 } else if (svd->tr_state == SEGVN_TR_INIT) { 6085 svd->tr_state = SEGVN_TR_OFF; 6086 } else if (svd->tr_state == SEGVN_TR_ON) { 6087 ASSERT(svd->amp != NULL); 6088 segvn_textunrepl(seg, 1); 6089 ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF); 6090 amp = NULL; 6091 } 6092 6093 /* 6094 * Operation for sub range of existing segment. 6095 */ 6096 if (addr != seg->s_base || eaddr != (seg->s_base + seg->s_size)) { 6097 if (szc < seg->s_szc) { 6098 VM_STAT_ADD(segvnvmstats.demoterange[2]); 6099 err = segvn_demote_range(seg, addr, len, SDR_RANGE, 0); 6100 if (err == 0) { 6101 return (IE_RETRY); 6102 } 6103 if (err == ENOMEM) { 6104 return (IE_NOMEM); 6105 } 6106 return (err); 6107 } 6108 if (addr != seg->s_base) { 6109 nseg = segvn_split_seg(seg, addr); 6110 if (eaddr != (nseg->s_base + nseg->s_size)) { 6111 /* eaddr is szc aligned */ 6112 (void) segvn_split_seg(nseg, eaddr); 6113 } 6114 return (IE_RETRY); 6115 } 6116 if (eaddr != (seg->s_base + seg->s_size)) { 6117 /* eaddr is szc aligned */ 6118 (void) segvn_split_seg(seg, eaddr); 6119 } 6120 return (IE_RETRY); 6121 } 6122 6123 /* 6124 * Break any low level sharing and reset seg->s_szc to 0. 6125 */ 6126 if ((err = segvn_clrszc(seg)) != 0) { 6127 if (err == ENOMEM) { 6128 err = IE_NOMEM; 6129 } 6130 return (err); 6131 } 6132 ASSERT(seg->s_szc == 0); 6133 6134 /* 6135 * If the end of the current segment is not pgsz aligned 6136 * then attempt to concatenate with the next segment. 6137 */ 6138 if (!IS_P2ALIGNED(eaddr, pgsz)) { 6139 nseg = AS_SEGNEXT(seg->s_as, seg); 6140 if (nseg == NULL || nseg == seg || eaddr != nseg->s_base) { 6141 return (ENOMEM); 6142 } 6143 if (nseg->s_ops != &segvn_ops) { 6144 return (EINVAL); 6145 } 6146 nsvd = (struct segvn_data *)nseg->s_data; 6147 if (nsvd->softlockcnt > 0) { 6148 segvn_purge(nseg); 6149 if (nsvd->softlockcnt > 0) { 6150 return (EAGAIN); 6151 } 6152 } 6153 err = segvn_clrszc(nseg); 6154 if (err == ENOMEM) { 6155 err = IE_NOMEM; 6156 } 6157 if (err != 0) { 6158 return (err); 6159 } 6160 ASSERT(nsvd->rcookie == HAT_INVALID_REGION_COOKIE); 6161 err = segvn_concat(seg, nseg, 1); 6162 if (err == -1) { 6163 return (EINVAL); 6164 } 6165 if (err == -2) { 6166 return (IE_NOMEM); 6167 } 6168 return (IE_RETRY); 6169 } 6170 6171 /* 6172 * May need to re-align anon array to 6173 * new szc. 
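 *
 * Illustrative example: with szc giving pgcnt == 8 and an existing
 * svd->anon_index of, say, 3, large-page faults could not treat each
 * group of 8 anon slots as one unit.  The code below (private
 * segments only) therefore allocates a fresh anon_hdr, copies the
 * slots over starting at index 0, and resets svd->anon_index to 0 so
 * that the index is pgcnt-aligned afterwards.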
6174 */ 6175 if (amp != NULL) { 6176 if (!IS_P2ALIGNED(svd->anon_index, pgcnt)) { 6177 struct anon_hdr *nahp; 6178 6179 ASSERT(svd->type == MAP_PRIVATE); 6180 6181 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER); 6182 ASSERT(amp->refcnt == 1); 6183 nahp = anon_create(btop(amp->size), ANON_NOSLEEP); 6184 if (nahp == NULL) { 6185 ANON_LOCK_EXIT(&amp->a_rwlock); 6186 return (IE_NOMEM); 6187 } 6188 if (anon_copy_ptr(amp->ahp, svd->anon_index, 6189 nahp, 0, btop(seg->s_size), ANON_NOSLEEP)) { 6190 anon_release(nahp, btop(amp->size)); 6191 ANON_LOCK_EXIT(&amp->a_rwlock); 6192 return (IE_NOMEM); 6193 } 6194 anon_release(amp->ahp, btop(amp->size)); 6195 amp->ahp = nahp; 6196 svd->anon_index = 0; 6197 ANON_LOCK_EXIT(&amp->a_rwlock); 6198 } 6199 } 6200 if (svd->vp != NULL && szc != 0) { 6201 struct vattr va; 6202 u_offset_t eoffpage = svd->offset; 6203 va.va_mask = AT_SIZE; 6204 eoffpage += seg->s_size; 6205 eoffpage = btopr(eoffpage); 6206 if (VOP_GETATTR(svd->vp, &va, 0, svd->cred) != 0) { 6207 segvn_setpgsz_getattr_err++; 6208 return (EINVAL); 6209 } 6210 if (btopr(va.va_size) < eoffpage) { 6211 segvn_setpgsz_eof_err++; 6212 return (EINVAL); 6213 } 6214 if (amp != NULL) { 6215 /* 6216 * anon_fill_cow_holes() may call VOP_GETPAGE(). 6217 * don't take anon map lock here to avoid holding it 6218 * across VOP_GETPAGE() calls that may call back into 6219 * segvn for klustering checks. We don't really need 6220 * anon map lock here since it's a private segment and 6221 * we hold as level lock as writers. 6222 */ 6223 if ((err = anon_fill_cow_holes(seg, seg->s_base, 6224 amp->ahp, svd->anon_index, svd->vp, svd->offset, 6225 seg->s_size, szc, svd->prot, svd->vpage, 6226 svd->cred)) != 0) { 6227 return (EINVAL); 6228 } 6229 } 6230 segvn_setvnode_mpss(svd->vp); 6231 } 6232 6233 if (amp != NULL) { 6234 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER); 6235 if (svd->type == MAP_PRIVATE) { 6236 amp->a_szc = szc; 6237 } else if (szc > amp->a_szc) { 6238 amp->a_szc = szc; 6239 } 6240 ANON_LOCK_EXIT(&amp->a_rwlock); 6241 } 6242 6243 seg->s_szc = szc; 6244 6245 return (0); 6246 } 6247 6248 static int 6249 segvn_clrszc(struct seg *seg) 6250 { 6251 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6252 struct anon_map *amp = svd->amp; 6253 size_t pgsz; 6254 pgcnt_t pages; 6255 int err = 0; 6256 caddr_t a = seg->s_base; 6257 caddr_t ea = a + seg->s_size; 6258 ulong_t an_idx = svd->anon_index; 6259 vnode_t *vp = svd->vp; 6260 struct vpage *vpage = svd->vpage; 6261 page_t *anon_pl[1 + 1], *pp; 6262 struct anon *ap, *oldap; 6263 uint_t prot = svd->prot, vpprot; 6264 int pageflag = 0; 6265 6266 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) || 6267 SEGVN_WRITE_HELD(seg->s_as, &svd->lock)); 6268 ASSERT(svd->softlockcnt == 0); 6269 6270 if (vp == NULL && amp == NULL) { 6271 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 6272 seg->s_szc = 0; 6273 return (0); 6274 } 6275 6276 if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 6277 ASSERT(svd->amp == NULL); 6278 ASSERT(svd->tr_state == SEGVN_TR_OFF); 6279 hat_leave_region(seg->s_as->a_hat, svd->rcookie, 6280 HAT_REGION_TEXT); 6281 svd->rcookie = HAT_INVALID_REGION_COOKIE; 6282 } else if (svd->tr_state == SEGVN_TR_ON) { 6283 ASSERT(svd->amp != NULL); 6284 segvn_textunrepl(seg, 1); 6285 ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF); 6286 amp = NULL; 6287 } else { 6288 if (svd->tr_state != SEGVN_TR_OFF) { 6289 ASSERT(svd->tr_state == SEGVN_TR_INIT); 6290 svd->tr_state = SEGVN_TR_OFF; 6291 } 6292 6293 /* 6294 * do HAT_UNLOAD_UNMAP since we are changing the pagesize.
6295 * unload argument is 0 when we are freeing the segment 6296 * and unload was already done. 6297 */ 6298 hat_unload(seg->s_as->a_hat, seg->s_base, seg->s_size, 6299 HAT_UNLOAD_UNMAP); 6300 } 6301 6302 if (amp == NULL || svd->type == MAP_SHARED) { 6303 seg->s_szc = 0; 6304 return (0); 6305 } 6306 6307 pgsz = page_get_pagesize(seg->s_szc); 6308 pages = btop(pgsz); 6309 6310 /* 6311 * XXX anon rwlock is not really needed because this is a 6312 * private segment and we are writers. 6313 */ 6314 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 6315 6316 for (; a < ea; a += pgsz, an_idx += pages) { 6317 if ((oldap = anon_get_ptr(amp->ahp, an_idx)) != NULL) { 6318 ASSERT(vpage != NULL || svd->pageprot == 0); 6319 if (vpage != NULL) { 6320 ASSERT(sameprot(seg, a, pgsz)); 6321 prot = VPP_PROT(vpage); 6322 pageflag = VPP_ISPPLOCK(vpage) ? LOCK_PAGE : 0; 6323 } 6324 if (seg->s_szc != 0) { 6325 ASSERT(vp == NULL || anon_pages(amp->ahp, 6326 an_idx, pages) == pages); 6327 if ((err = anon_map_demotepages(amp, an_idx, 6328 seg, a, prot, vpage, svd->cred)) != 0) { 6329 goto out; 6330 } 6331 } else { 6332 if (oldap->an_refcnt == 1) { 6333 continue; 6334 } 6335 if ((err = anon_getpage(&oldap, &vpprot, 6336 anon_pl, PAGESIZE, seg, a, S_READ, 6337 svd->cred))) { 6338 goto out; 6339 } 6340 if ((pp = anon_private(&ap, seg, a, prot, 6341 anon_pl[0], pageflag, svd->cred)) == NULL) { 6342 err = ENOMEM; 6343 goto out; 6344 } 6345 anon_decref(oldap); 6346 (void) anon_set_ptr(amp->ahp, an_idx, ap, 6347 ANON_SLEEP); 6348 page_unlock(pp); 6349 } 6350 } 6351 vpage = (vpage == NULL) ? NULL : vpage + pages; 6352 } 6353 6354 amp->a_szc = 0; 6355 seg->s_szc = 0; 6356 out: 6357 ANON_LOCK_EXIT(&->a_rwlock); 6358 return (err); 6359 } 6360 6361 static int 6362 segvn_claim_pages( 6363 struct seg *seg, 6364 struct vpage *svp, 6365 u_offset_t off, 6366 ulong_t anon_idx, 6367 uint_t prot) 6368 { 6369 pgcnt_t pgcnt = page_get_pagecnt(seg->s_szc); 6370 size_t ppasize = (pgcnt + 1) * sizeof (page_t *); 6371 page_t **ppa; 6372 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6373 struct anon_map *amp = svd->amp; 6374 struct vpage *evp = svp + pgcnt; 6375 caddr_t addr = ((uintptr_t)(svp - svd->vpage) << PAGESHIFT) 6376 + seg->s_base; 6377 struct anon *ap; 6378 struct vnode *vp = svd->vp; 6379 page_t *pp; 6380 pgcnt_t pg_idx, i; 6381 int err = 0; 6382 anoff_t aoff; 6383 int anon = (amp != NULL) ? 
1 : 0; 6384 6385 ASSERT(svd->type == MAP_PRIVATE); 6386 ASSERT(svd->vpage != NULL); 6387 ASSERT(seg->s_szc != 0); 6388 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 6389 ASSERT(amp == NULL || IS_P2ALIGNED(anon_idx, pgcnt)); 6390 ASSERT(sameprot(seg, addr, pgcnt << PAGESHIFT)); 6391 6392 if (VPP_PROT(svp) == prot) 6393 return (1); 6394 if (!((VPP_PROT(svp) ^ prot) & PROT_WRITE)) 6395 return (1); 6396 6397 ppa = kmem_alloc(ppasize, KM_SLEEP); 6398 if (anon && vp != NULL) { 6399 if (anon_get_ptr(amp->ahp, anon_idx) == NULL) { 6400 anon = 0; 6401 ASSERT(!anon_pages(amp->ahp, anon_idx, pgcnt)); 6402 } 6403 ASSERT(!anon || 6404 anon_pages(amp->ahp, anon_idx, pgcnt) == pgcnt); 6405 } 6406 6407 for (*ppa = NULL, pg_idx = 0; svp < evp; svp++, anon_idx++) { 6408 if (!VPP_ISPPLOCK(svp)) 6409 continue; 6410 if (anon) { 6411 ap = anon_get_ptr(amp->ahp, anon_idx); 6412 if (ap == NULL) { 6413 panic("segvn_claim_pages: no anon slot"); 6414 } 6415 swap_xlate(ap, &vp, &aoff); 6416 off = (u_offset_t)aoff; 6417 } 6418 ASSERT(vp != NULL); 6419 if ((pp = page_lookup(vp, 6420 (u_offset_t)off, SE_SHARED)) == NULL) { 6421 panic("segvn_claim_pages: no page"); 6422 } 6423 ppa[pg_idx++] = pp; 6424 off += PAGESIZE; 6425 } 6426 6427 if (ppa[0] == NULL) { 6428 kmem_free(ppa, ppasize); 6429 return (1); 6430 } 6431 6432 ASSERT(pg_idx <= pgcnt); 6433 ppa[pg_idx] = NULL; 6434 6435 if (prot & PROT_WRITE) 6436 err = page_addclaim_pages(ppa); 6437 else 6438 err = page_subclaim_pages(ppa); 6439 6440 for (i = 0; i < pg_idx; i++) { 6441 ASSERT(ppa[i] != NULL); 6442 page_unlock(ppa[i]); 6443 } 6444 6445 kmem_free(ppa, ppasize); 6446 return (err); 6447 } 6448 6449 /* 6450 * Returns right (upper address) segment if split occurred. 6451 * If the address is equal to the beginning or end of its segment it returns 6452 * the current segment. 6453 */ 6454 static struct seg * 6455 segvn_split_seg(struct seg *seg, caddr_t addr) 6456 { 6457 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6458 struct seg *nseg; 6459 size_t nsize; 6460 struct segvn_data *nsvd; 6461 6462 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 6463 ASSERT(svd->tr_state == SEGVN_TR_OFF); 6464 6465 ASSERT(addr >= seg->s_base); 6466 ASSERT(addr <= seg->s_base + seg->s_size); 6467 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 6468 6469 if (addr == seg->s_base || addr == seg->s_base + seg->s_size) 6470 return (seg); 6471 6472 nsize = seg->s_base + seg->s_size - addr; 6473 seg->s_size = addr - seg->s_base; 6474 nseg = seg_alloc(seg->s_as, addr, nsize); 6475 ASSERT(nseg != NULL); 6476 nseg->s_ops = seg->s_ops; 6477 nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 6478 nseg->s_data = (void *)nsvd; 6479 nseg->s_szc = seg->s_szc; 6480 *nsvd = *svd; 6481 ASSERT(nsvd->rcookie == HAT_INVALID_REGION_COOKIE); 6482 nsvd->seg = nseg; 6483 rw_init(&nsvd->lock, NULL, RW_DEFAULT, NULL); 6484 6485 if (nsvd->vp != NULL) { 6486 VN_HOLD(nsvd->vp); 6487 nsvd->offset = svd->offset + 6488 (uintptr_t)(nseg->s_base - seg->s_base); 6489 if (nsvd->type == MAP_SHARED) 6490 lgrp_shm_policy_init(NULL, nsvd->vp); 6491 } else { 6492 /* 6493 * The offset for an anonymous segment has no significance in 6494 * terms of an offset into a file.
If we were to use the above 6495 * calculation instead, the structures read out of 6496 * /proc/<pid>/xmap would be more difficult to decipher since 6497 * it would be unclear whether two seemingly contiguous 6498 * prxmap_t structures represented different segments or a 6499 * single segment that had been split up into multiple prxmap_t 6500 * structures (e.g. if some part of the segment had not yet 6501 * been faulted in). 6502 */ 6503 nsvd->offset = 0; 6504 } 6505 6506 ASSERT(svd->softlockcnt == 0); 6507 crhold(svd->cred); 6508 6509 if (svd->vpage != NULL) { 6510 size_t bytes = vpgtob(seg_pages(seg)); 6511 size_t nbytes = vpgtob(seg_pages(nseg)); 6512 struct vpage *ovpage = svd->vpage; 6513 6514 svd->vpage = kmem_alloc(bytes, KM_SLEEP); 6515 bcopy(ovpage, svd->vpage, bytes); 6516 nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP); 6517 bcopy(ovpage + seg_pages(seg), nsvd->vpage, nbytes); 6518 kmem_free(ovpage, bytes + nbytes); 6519 } 6520 if (svd->amp != NULL && svd->type == MAP_PRIVATE) { 6521 struct anon_map *oamp = svd->amp, *namp; 6522 struct anon_hdr *nahp; 6523 6524 ANON_LOCK_ENTER(&oamp->a_rwlock, RW_WRITER); 6525 ASSERT(oamp->refcnt == 1); 6526 nahp = anon_create(btop(seg->s_size), ANON_SLEEP); 6527 (void) anon_copy_ptr(oamp->ahp, svd->anon_index, 6528 nahp, 0, btop(seg->s_size), ANON_SLEEP); 6529 6530 namp = anonmap_alloc(nseg->s_size, 0, ANON_SLEEP); 6531 namp->a_szc = nseg->s_szc; 6532 (void) anon_copy_ptr(oamp->ahp, 6533 svd->anon_index + btop(seg->s_size), 6534 namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP); 6535 anon_release(oamp->ahp, btop(oamp->size)); 6536 oamp->ahp = nahp; 6537 oamp->size = seg->s_size; 6538 svd->anon_index = 0; 6539 nsvd->amp = namp; 6540 nsvd->anon_index = 0; 6541 ANON_LOCK_EXIT(&oamp->a_rwlock); 6542 } else if (svd->amp != NULL) { 6543 pgcnt_t pgcnt = page_get_pagecnt(seg->s_szc); 6544 ASSERT(svd->amp == nsvd->amp); 6545 ASSERT(seg->s_szc <= svd->amp->a_szc); 6546 nsvd->anon_index = svd->anon_index + seg_pages(seg); 6547 ASSERT(IS_P2ALIGNED(nsvd->anon_index, pgcnt)); 6548 ANON_LOCK_ENTER(&svd->amp->a_rwlock, RW_WRITER); 6549 svd->amp->refcnt++; 6550 ANON_LOCK_EXIT(&svd->amp->a_rwlock); 6551 } 6552 6553 /* 6554 * Split amount of swap reserve 6555 */ 6556 if (svd->swresv) { 6557 /* 6558 * For MAP_NORESERVE, only allocate swap reserve for pages 6559 * being used. Other segments get enough to cover whole 6560 * segment. 6561 */ 6562 if (svd->flags & MAP_NORESERVE) { 6563 size_t oswresv; 6564 6565 ASSERT(svd->amp); 6566 oswresv = svd->swresv; 6567 svd->swresv = ptob(anon_pages(svd->amp->ahp, 6568 svd->anon_index, btop(seg->s_size))); 6569 nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp, 6570 nsvd->anon_index, btop(nseg->s_size))); 6571 ASSERT(oswresv >= (svd->swresv + nsvd->swresv)); 6572 } else { 6573 ASSERT(svd->swresv == seg->s_size + nseg->s_size); 6574 svd->swresv = seg->s_size; 6575 nsvd->swresv = nseg->s_size; 6576 } 6577 } 6578 6579 return (nseg); 6580 } 6581 6582 /* 6583 * called on memory operations (unmap, setprot, setpagesize) for a subset 6584 * of a large page segment to either demote the memory range (SDR_RANGE) 6585 * or the ends (SDR_END) by addr/len. 6586 * 6587 * returns 0 on success. returns errno, including ENOMEM, on failure. 
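 *
 * Rough picture (illustration only), for a segment made up of large
 * pages of size pgsz and an operation on [addr, addr + len):
 *
 *	SDR_RANGE:  the whole large-page-aligned region containing
 *		    [addr, eaddr) is split off and demoted to szc 0.
 *	SDR_END:    only the (at most two) boundary large pages that
 *		    addr and eaddr fall inside are split off and
 *		    demoted; the interior keeps its large page size.
 *
 * szcvec is only used for SDR_END on MAP_SHARED segments, where the
 * demoted ends may end up at a smaller large page size taken from
 * the vector rather than at PAGESIZE.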
6588 */ 6589 static int 6590 segvn_demote_range( 6591 struct seg *seg, 6592 caddr_t addr, 6593 size_t len, 6594 int flag, 6595 uint_t szcvec) 6596 { 6597 caddr_t eaddr = addr + len; 6598 caddr_t lpgaddr, lpgeaddr; 6599 struct seg *nseg; 6600 struct seg *badseg1 = NULL; 6601 struct seg *badseg2 = NULL; 6602 size_t pgsz; 6603 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6604 int err; 6605 uint_t szc = seg->s_szc; 6606 uint_t tszcvec; 6607 6608 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 6609 ASSERT(svd->tr_state == SEGVN_TR_OFF); 6610 ASSERT(szc != 0); 6611 pgsz = page_get_pagesize(szc); 6612 ASSERT(seg->s_base != addr || seg->s_size != len); 6613 ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size); 6614 ASSERT(svd->softlockcnt == 0); 6615 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 6616 ASSERT(szcvec == 0 || (flag == SDR_END && svd->type == MAP_SHARED)); 6617 6618 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 6619 ASSERT(flag == SDR_RANGE || eaddr < lpgeaddr || addr > lpgaddr); 6620 if (flag == SDR_RANGE) { 6621 /* demote entire range */ 6622 badseg1 = nseg = segvn_split_seg(seg, lpgaddr); 6623 (void) segvn_split_seg(nseg, lpgeaddr); 6624 ASSERT(badseg1->s_base == lpgaddr); 6625 ASSERT(badseg1->s_size == lpgeaddr - lpgaddr); 6626 } else if (addr != lpgaddr) { 6627 ASSERT(flag == SDR_END); 6628 badseg1 = nseg = segvn_split_seg(seg, lpgaddr); 6629 if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz && 6630 eaddr < lpgaddr + 2 * pgsz) { 6631 (void) segvn_split_seg(nseg, lpgeaddr); 6632 ASSERT(badseg1->s_base == lpgaddr); 6633 ASSERT(badseg1->s_size == 2 * pgsz); 6634 } else { 6635 nseg = segvn_split_seg(nseg, lpgaddr + pgsz); 6636 ASSERT(badseg1->s_base == lpgaddr); 6637 ASSERT(badseg1->s_size == pgsz); 6638 if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz) { 6639 ASSERT(lpgeaddr - lpgaddr > 2 * pgsz); 6640 nseg = segvn_split_seg(nseg, lpgeaddr - pgsz); 6641 badseg2 = nseg; 6642 (void) segvn_split_seg(nseg, lpgeaddr); 6643 ASSERT(badseg2->s_base == lpgeaddr - pgsz); 6644 ASSERT(badseg2->s_size == pgsz); 6645 } 6646 } 6647 } else { 6648 ASSERT(flag == SDR_END); 6649 ASSERT(eaddr < lpgeaddr); 6650 badseg1 = nseg = segvn_split_seg(seg, lpgeaddr - pgsz); 6651 (void) segvn_split_seg(nseg, lpgeaddr); 6652 ASSERT(badseg1->s_base == lpgeaddr - pgsz); 6653 ASSERT(badseg1->s_size == pgsz); 6654 } 6655 6656 ASSERT(badseg1 != NULL); 6657 ASSERT(badseg1->s_szc == szc); 6658 ASSERT(flag == SDR_RANGE || badseg1->s_size == pgsz || 6659 badseg1->s_size == 2 * pgsz); 6660 ASSERT(sameprot(badseg1, badseg1->s_base, pgsz)); 6661 ASSERT(badseg1->s_size == pgsz || 6662 sameprot(badseg1, badseg1->s_base + pgsz, pgsz)); 6663 if (err = segvn_clrszc(badseg1)) { 6664 return (err); 6665 } 6666 ASSERT(badseg1->s_szc == 0); 6667 6668 if (szc > 1 && (tszcvec = P2PHASE(szcvec, 1 << szc)) > 1) { 6669 uint_t tszc = highbit(tszcvec) - 1; 6670 caddr_t ta = MAX(addr, badseg1->s_base); 6671 caddr_t te; 6672 size_t tpgsz = page_get_pagesize(tszc); 6673 6674 ASSERT(svd->type == MAP_SHARED); 6675 ASSERT(flag == SDR_END); 6676 ASSERT(tszc < szc && tszc > 0); 6677 6678 if (eaddr > badseg1->s_base + badseg1->s_size) { 6679 te = badseg1->s_base + badseg1->s_size; 6680 } else { 6681 te = eaddr; 6682 } 6683 6684 ASSERT(ta <= te); 6685 badseg1->s_szc = tszc; 6686 if (!IS_P2ALIGNED(ta, tpgsz) || !IS_P2ALIGNED(te, tpgsz)) { 6687 if (badseg2 != NULL) { 6688 err = segvn_demote_range(badseg1, ta, te - ta, 6689 SDR_END, tszcvec); 6690 if (err != 0) { 6691 return (err); 6692 } 6693 } else { 6694 
return (segvn_demote_range(badseg1, ta, 6695 te - ta, SDR_END, tszcvec)); 6696 } 6697 } 6698 } 6699 6700 if (badseg2 == NULL) 6701 return (0); 6702 ASSERT(badseg2->s_szc == szc); 6703 ASSERT(badseg2->s_size == pgsz); 6704 ASSERT(sameprot(badseg2, badseg2->s_base, badseg2->s_size)); 6705 if (err = segvn_clrszc(badseg2)) { 6706 return (err); 6707 } 6708 ASSERT(badseg2->s_szc == 0); 6709 6710 if (szc > 1 && (tszcvec = P2PHASE(szcvec, 1 << szc)) > 1) { 6711 uint_t tszc = highbit(tszcvec) - 1; 6712 size_t tpgsz = page_get_pagesize(tszc); 6713 6714 ASSERT(svd->type == MAP_SHARED); 6715 ASSERT(flag == SDR_END); 6716 ASSERT(tszc < szc && tszc > 0); 6717 ASSERT(badseg2->s_base > addr); 6718 ASSERT(eaddr > badseg2->s_base); 6719 ASSERT(eaddr < badseg2->s_base + badseg2->s_size); 6720 6721 badseg2->s_szc = tszc; 6722 if (!IS_P2ALIGNED(eaddr, tpgsz)) { 6723 return (segvn_demote_range(badseg2, badseg2->s_base, 6724 eaddr - badseg2->s_base, SDR_END, tszcvec)); 6725 } 6726 } 6727 6728 return (0); 6729 } 6730 6731 static int 6732 segvn_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) 6733 { 6734 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6735 struct vpage *vp, *evp; 6736 6737 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6738 6739 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6740 /* 6741 * If segment protection can be used, simply check against them. 6742 */ 6743 if (svd->pageprot == 0) { 6744 int err; 6745 6746 err = ((svd->prot & prot) != prot) ? EACCES : 0; 6747 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6748 return (err); 6749 } 6750 6751 /* 6752 * Have to check down to the vpage level. 6753 */ 6754 evp = &svd->vpage[seg_page(seg, addr + len)]; 6755 for (vp = &svd->vpage[seg_page(seg, addr)]; vp < evp; vp++) { 6756 if ((VPP_PROT(vp) & prot) != prot) { 6757 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6758 return (EACCES); 6759 } 6760 } 6761 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6762 return (0); 6763 } 6764 6765 static int 6766 segvn_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv) 6767 { 6768 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6769 size_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1; 6770 6771 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6772 6773 if (pgno != 0) { 6774 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6775 if (svd->pageprot == 0) { 6776 do 6777 protv[--pgno] = svd->prot; 6778 while (pgno != 0); 6779 } else { 6780 size_t pgoff = seg_page(seg, addr); 6781 6782 do { 6783 pgno--; 6784 protv[pgno] = VPP_PROT(&svd->vpage[pgno+pgoff]); 6785 } while (pgno != 0); 6786 } 6787 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6788 } 6789 return (0); 6790 } 6791 6792 static u_offset_t 6793 segvn_getoffset(struct seg *seg, caddr_t addr) 6794 { 6795 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6796 6797 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6798 6799 return (svd->offset + (uintptr_t)(addr - seg->s_base)); 6800 } 6801 6802 /*ARGSUSED*/ 6803 static int 6804 segvn_gettype(struct seg *seg, caddr_t addr) 6805 { 6806 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6807 6808 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6809 6810 return (svd->type | (svd->flags & (MAP_NORESERVE | MAP_TEXT | 6811 MAP_INITDATA))); 6812 } 6813 6814 /*ARGSUSED*/ 6815 static int 6816 segvn_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp) 6817 { 6818 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6819 6820 
ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6821 6822 *vpp = svd->vp; 6823 return (0); 6824 } 6825 6826 /* 6827 * Check to see if it makes sense to do kluster/read ahead to 6828 * addr + delta relative to the mapping at addr. We assume here 6829 * that delta is a signed PAGESIZE'd multiple (which can be negative). 6830 * 6831 * For segvn, we currently "approve" of the action if we are 6832 * still in the segment and it maps from the same vp/off, 6833 * or if the advice stored in segvn_data or vpages allows it. 6834 * Currently, klustering is not allowed only if MADV_RANDOM is set. 6835 */ 6836 static int 6837 segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta) 6838 { 6839 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6840 struct anon *oap, *ap; 6841 ssize_t pd; 6842 size_t page; 6843 struct vnode *vp1, *vp2; 6844 u_offset_t off1, off2; 6845 struct anon_map *amp; 6846 6847 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6848 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) || 6849 SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 6850 6851 if (addr + delta < seg->s_base || 6852 addr + delta >= (seg->s_base + seg->s_size)) 6853 return (-1); /* exceeded segment bounds */ 6854 6855 pd = delta / (ssize_t)PAGESIZE; /* divide to preserve sign bit */ 6856 page = seg_page(seg, addr); 6857 6858 /* 6859 * Check to see if either of the pages addr or addr + delta 6860 * have advice set that prevents klustering (if MADV_RANDOM advice 6861 * is set for entire segment, or MADV_SEQUENTIAL is set and delta 6862 * is negative). 6863 */ 6864 if (svd->advice == MADV_RANDOM || 6865 svd->advice == MADV_SEQUENTIAL && delta < 0) 6866 return (-1); 6867 else if (svd->pageadvice && svd->vpage) { 6868 struct vpage *bvpp, *evpp; 6869 6870 bvpp = &svd->vpage[page]; 6871 evpp = &svd->vpage[page + pd]; 6872 if (VPP_ADVICE(bvpp) == MADV_RANDOM || 6873 VPP_ADVICE(evpp) == MADV_SEQUENTIAL && delta < 0) 6874 return (-1); 6875 if (VPP_ADVICE(bvpp) != VPP_ADVICE(evpp) && 6876 VPP_ADVICE(evpp) == MADV_RANDOM) 6877 return (-1); 6878 } 6879 6880 if (svd->type == MAP_SHARED) 6881 return (0); /* shared mapping - all ok */ 6882 6883 if ((amp = svd->amp) == NULL) 6884 return (0); /* off original vnode */ 6885 6886 page += svd->anon_index; 6887 6888 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 6889 6890 oap = anon_get_ptr(amp->ahp, page); 6891 ap = anon_get_ptr(amp->ahp, page + pd); 6892 6893 ANON_LOCK_EXIT(&->a_rwlock); 6894 6895 if ((oap == NULL && ap != NULL) || (oap != NULL && ap == NULL)) { 6896 return (-1); /* one with and one without an anon */ 6897 } 6898 6899 if (oap == NULL) { /* implies that ap == NULL */ 6900 return (0); /* off original vnode */ 6901 } 6902 6903 /* 6904 * Now we know we have two anon pointers - check to 6905 * see if they happen to be properly allocated. 6906 */ 6907 6908 /* 6909 * XXX We cheat here and don't lock the anon slots. We can't because 6910 * we may have been called from the anon layer which might already 6911 * have locked them. We are holding a refcnt on the slots so they 6912 * can't disappear. The worst that will happen is we'll get the wrong 6913 * names (vp, off) for the slots and make a poor klustering decision. 6914 */ 6915 swap_xlate(ap, &vp1, &off1); 6916 swap_xlate(oap, &vp2, &off2); 6917 6918 6919 if (!VOP_CMP(vp1, vp2) || off1 - off2 != delta) 6920 return (-1); 6921 return (0); 6922 } 6923 6924 /* 6925 * Swap the pages of seg out to secondary storage, returning the 6926 * number of bytes of storage freed. 
6927 * 6928 * The basic idea is first to unload all translations and then to call 6929 * VOP_PUTPAGE() for all newly-unmapped pages, to push them out to the 6930 * swap device. Pages to which other segments have mappings will remain 6931 * mapped and won't be swapped. Our caller (as_swapout) has already 6932 * performed the unloading step. 6933 * 6934 * The value returned is intended to correlate well with the process's 6935 * memory requirements. However, there are some caveats: 6936 * 1) When given a shared segment as argument, this routine will 6937 * only succeed in swapping out pages for the last sharer of the 6938 * segment. (Previous callers will only have decremented mapping 6939 * reference counts.) 6940 * 2) We assume that the hat layer maintains a large enough translation 6941 * cache to capture process reference patterns. 6942 */ 6943 static size_t 6944 segvn_swapout(struct seg *seg) 6945 { 6946 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6947 struct anon_map *amp; 6948 pgcnt_t pgcnt = 0; 6949 pgcnt_t npages; 6950 pgcnt_t page; 6951 ulong_t anon_index; 6952 6953 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6954 6955 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6956 /* 6957 * Find pages unmapped by our caller and force them 6958 * out to the virtual swap device. 6959 */ 6960 if ((amp = svd->amp) != NULL) 6961 anon_index = svd->anon_index; 6962 npages = seg->s_size >> PAGESHIFT; 6963 for (page = 0; page < npages; page++) { 6964 page_t *pp; 6965 struct anon *ap; 6966 struct vnode *vp; 6967 u_offset_t off; 6968 anon_sync_obj_t cookie; 6969 6970 /* 6971 * Obtain <vp, off> pair for the page, then look it up. 6972 * 6973 * Note that this code is willing to consider regular 6974 * pages as well as anon pages. Is this appropriate here? 6975 */ 6976 ap = NULL; 6977 if (amp != NULL) { 6978 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 6979 if (anon_array_try_enter(amp, anon_index + page, 6980 &cookie)) { 6981 ANON_LOCK_EXIT(&->a_rwlock); 6982 continue; 6983 } 6984 ap = anon_get_ptr(amp->ahp, anon_index + page); 6985 if (ap != NULL) { 6986 swap_xlate(ap, &vp, &off); 6987 } else { 6988 vp = svd->vp; 6989 off = svd->offset + ptob(page); 6990 } 6991 anon_array_exit(&cookie); 6992 ANON_LOCK_EXIT(&->a_rwlock); 6993 } else { 6994 vp = svd->vp; 6995 off = svd->offset + ptob(page); 6996 } 6997 if (vp == NULL) { /* untouched zfod page */ 6998 ASSERT(ap == NULL); 6999 continue; 7000 } 7001 7002 pp = page_lookup_nowait(vp, off, SE_SHARED); 7003 if (pp == NULL) 7004 continue; 7005 7006 7007 /* 7008 * Examine the page to see whether it can be tossed out, 7009 * keeping track of how many we've found. 7010 */ 7011 if (!page_tryupgrade(pp)) { 7012 /* 7013 * If the page has an i/o lock and no mappings, 7014 * it's very likely that the page is being 7015 * written out as a result of klustering. 7016 * Assume this is so and take credit for it here. 7017 */ 7018 if (!page_io_trylock(pp)) { 7019 if (!hat_page_is_mapped(pp)) 7020 pgcnt++; 7021 } else { 7022 page_io_unlock(pp); 7023 } 7024 page_unlock(pp); 7025 continue; 7026 } 7027 ASSERT(!page_iolock_assert(pp)); 7028 7029 7030 /* 7031 * Skip if page is locked or has mappings. 7032 * We don't need the page_struct_lock to look at lckcnt 7033 * and cowcnt because the page is exclusive locked. 7034 */ 7035 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0 || 7036 hat_page_is_mapped(pp)) { 7037 page_unlock(pp); 7038 continue; 7039 } 7040 7041 /* 7042 * dispose skips large pages so try to demote first. 
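 * (VN_DISPOSE() frees PAGESIZE pages, so a large page must first be
 * broken up by page_try_demote_pages(); if demotion fails the page
 * is simply left alone for this pass.)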
7043 */ 7044 if (pp->p_szc != 0 && !page_try_demote_pages(pp)) { 7045 page_unlock(pp); 7046 /* 7047 * XXX should skip the remaining page_t's of this 7048 * large page. 7049 */ 7050 continue; 7051 } 7052 7053 ASSERT(pp->p_szc == 0); 7054 7055 /* 7056 * No longer mapped -- we can toss it out. How 7057 * we do so depends on whether or not it's dirty. 7058 */ 7059 if (hat_ismod(pp) && pp->p_vnode) { 7060 /* 7061 * We must clean the page before it can be 7062 * freed. Setting B_FREE will cause pvn_done 7063 * to free the page when the i/o completes. 7064 * XXX: This also causes it to be accounted 7065 * as a pageout instead of a swap: need 7066 * B_SWAPOUT bit to use instead of B_FREE. 7067 * 7068 * Hold the vnode before releasing the page lock 7069 * to prevent it from being freed and re-used by 7070 * some other thread. 7071 */ 7072 VN_HOLD(vp); 7073 page_unlock(pp); 7074 7075 /* 7076 * Queue all i/o requests for the pageout thread 7077 * to avoid saturating the pageout devices. 7078 */ 7079 if (!queue_io_request(vp, off)) 7080 VN_RELE(vp); 7081 } else { 7082 /* 7083 * The page was clean, free it. 7084 * 7085 * XXX: Can we ever encounter modified pages 7086 * with no associated vnode here? 7087 */ 7088 ASSERT(pp->p_vnode != NULL); 7089 /*LINTED: constant in conditional context*/ 7090 VN_DISPOSE(pp, B_FREE, 0, kcred); 7091 } 7092 7093 /* 7094 * Credit now even if i/o is in progress. 7095 */ 7096 pgcnt++; 7097 } 7098 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7099 7100 /* 7101 * Wakeup pageout to initiate i/o on all queued requests. 7102 */ 7103 cv_signal_pageout(); 7104 return (ptob(pgcnt)); 7105 } 7106 7107 /* 7108 * Synchronize primary storage cache with real object in virtual memory. 7109 * 7110 * XXX - Anonymous pages should not be sync'ed out at all. 7111 */ 7112 static int 7113 segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) 7114 { 7115 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7116 struct vpage *vpp; 7117 page_t *pp; 7118 u_offset_t offset; 7119 struct vnode *vp; 7120 u_offset_t off; 7121 caddr_t eaddr; 7122 int bflags; 7123 int err = 0; 7124 int segtype; 7125 int pageprot; 7126 int prot; 7127 ulong_t anon_index; 7128 struct anon_map *amp; 7129 struct anon *ap; 7130 anon_sync_obj_t cookie; 7131 7132 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 7133 7134 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7135 7136 if (svd->softlockcnt > 0) { 7137 /* 7138 * flush all pages from seg cache 7139 * otherwise we may deadlock in swap_putpage 7140 * for B_INVAL page (4175402). 7141 * 7142 * Even if we grab segvn WRITER's lock or segp_slock 7143 * here, there might be another thread which could've 7144 * successfully performed lookup/insert just before 7145 * we acquired the lock here. So, grabbing either 7146 * lock here is of not much use. Until we devise 7147 * a strategy at upper layers to solve the 7148 * synchronization issues completely, we expect 7149 * applications to handle this appropriately. 7150 */ 7151 segvn_purge(seg); 7152 if (svd->softlockcnt > 0) { 7153 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7154 return (EAGAIN); 7155 } 7156 } 7157 7158 vpp = svd->vpage; 7159 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 7160 bflags = ((flags & MS_ASYNC) ? B_ASYNC : 0) | 7161 ((flags & MS_INVALIDATE) ? B_INVAL : 0); 7162 7163 if (attr) { 7164 pageprot = attr & ~(SHARED|PRIVATE); 7165 segtype = (attr & SHARED) ? 
MAP_SHARED : MAP_PRIVATE; 7166 7167 /* 7168 * We are done if the segment types don't match 7169 * or if we have segment level protections and 7170 * they don't match. 7171 */ 7172 if (svd->type != segtype) { 7173 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7174 return (0); 7175 } 7176 if (vpp == NULL) { 7177 if (svd->prot != pageprot) { 7178 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7179 return (0); 7180 } 7181 prot = svd->prot; 7182 } else 7183 vpp = &svd->vpage[seg_page(seg, addr)]; 7184 7185 } else if (svd->vp && svd->amp == NULL && 7186 (flags & MS_INVALIDATE) == 0) { 7187 7188 /* 7189 * No attributes, no anonymous pages and MS_INVALIDATE flag 7190 * is not on, just use one big request. 7191 */ 7192 err = VOP_PUTPAGE(svd->vp, (offset_t)offset, len, 7193 bflags, svd->cred); 7194 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7195 return (err); 7196 } 7197 7198 if ((amp = svd->amp) != NULL) 7199 anon_index = svd->anon_index + seg_page(seg, addr); 7200 7201 for (eaddr = addr + len; addr < eaddr; addr += PAGESIZE) { 7202 ap = NULL; 7203 if (amp != NULL) { 7204 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER); 7205 anon_array_enter(amp, anon_index, &cookie); 7206 ap = anon_get_ptr(amp->ahp, anon_index++); 7207 if (ap != NULL) { 7208 swap_xlate(ap, &vp, &off); 7209 } else { 7210 vp = svd->vp; 7211 off = offset; 7212 } 7213 anon_array_exit(&cookie); 7214 ANON_LOCK_EXIT(&amp->a_rwlock); 7215 } else { 7216 vp = svd->vp; 7217 off = offset; 7218 } 7219 offset += PAGESIZE; 7220 7221 if (vp == NULL) /* untouched zfod page */ 7222 continue; 7223 7224 if (attr) { 7225 if (vpp) { 7226 prot = VPP_PROT(vpp); 7227 vpp++; 7228 } 7229 if (prot != pageprot) { 7230 continue; 7231 } 7232 } 7233 7234 /* 7235 * See if any of these pages are locked -- if so, then we 7236 * will have to truncate an invalidate request at the first 7237 * locked one. We don't need the page_struct_lock to test 7238 * as this is only advisory; even if we acquire it someone 7239 * might race in and lock the page after we unlock and before 7240 * we do the PUTPAGE, then PUTPAGE simply does nothing. 7241 */ 7242 if (flags & MS_INVALIDATE) { 7243 if ((pp = page_lookup(vp, off, SE_SHARED)) != NULL) { 7244 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 7245 page_unlock(pp); 7246 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7247 return (EBUSY); 7248 } 7249 if (ap != NULL && pp->p_szc != 0 && 7250 page_tryupgrade(pp)) { 7251 if (pp->p_lckcnt == 0 && 7252 pp->p_cowcnt == 0) { 7253 /* 7254 * swapfs VN_DISPOSE() won't 7255 * invalidate large pages. 7256 * Attempt to demote. 7257 * XXX can't help it if it 7258 * fails. But for swapfs 7259 * pages it is no big deal. 7260 */ 7261 (void) page_try_demote_pages( 7262 pp); 7263 } 7264 } 7265 page_unlock(pp); 7266 } 7267 } else if (svd->type == MAP_SHARED && amp != NULL) { 7268 /* 7269 * Avoid writing ISM's large pages out to disk 7270 * because segspt_free_pages() relies on NULL an_pvp 7271 * of anon slots of such pages. 7272 */ 7273 7274 ASSERT(svd->vp == NULL); 7275 /* 7276 * swapfs uses page_lookup_nowait if not freeing or 7277 * invalidating and skips a page if 7278 * page_lookup_nowait returns NULL. 7279 */ 7280 pp = page_lookup_nowait(vp, off, SE_SHARED); 7281 if (pp == NULL) { 7282 continue; 7283 } 7284 if (pp->p_szc != 0) { 7285 page_unlock(pp); 7286 continue; 7287 } 7288 7289 /* 7290 * Note ISM pages are created large so (vp, off)'s 7291 * page cannot suddenly become large after we unlock 7292 * pp.
7293 */ 7294 page_unlock(pp); 7295 } 7296 /* 7297 * XXX - Should ultimately try to kluster 7298 * calls to VOP_PUTPAGE() for performance. 7299 */ 7300 VN_HOLD(vp); 7301 err = VOP_PUTPAGE(vp, (offset_t)off, PAGESIZE, 7302 bflags, svd->cred); 7303 VN_RELE(vp); 7304 if (err) 7305 break; 7306 } 7307 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7308 return (err); 7309 } 7310 7311 /* 7312 * Determine if we have data corresponding to pages in the 7313 * primary storage virtual memory cache (i.e., "in core"). 7314 */ 7315 static size_t 7316 segvn_incore(struct seg *seg, caddr_t addr, size_t len, char *vec) 7317 { 7318 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7319 struct vnode *vp, *avp; 7320 u_offset_t offset, aoffset; 7321 size_t p, ep; 7322 int ret; 7323 struct vpage *vpp; 7324 page_t *pp; 7325 uint_t start; 7326 struct anon_map *amp; /* XXX - for locknest */ 7327 struct anon *ap; 7328 uint_t attr; 7329 anon_sync_obj_t cookie; 7330 7331 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 7332 7333 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7334 if (svd->amp == NULL && svd->vp == NULL) { 7335 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7336 bzero(vec, btopr(len)); 7337 return (len); /* no anonymous pages created yet */ 7338 } 7339 7340 p = seg_page(seg, addr); 7341 ep = seg_page(seg, addr + len); 7342 start = svd->vp ? SEG_PAGE_VNODEBACKED : 0; 7343 7344 amp = svd->amp; 7345 for (; p < ep; p++, addr += PAGESIZE) { 7346 vpp = (svd->vpage) ? &svd->vpage[p]: NULL; 7347 ret = start; 7348 ap = NULL; 7349 avp = NULL; 7350 /* Grab the vnode/offset for the anon slot */ 7351 if (amp != NULL) { 7352 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7353 anon_array_enter(amp, svd->anon_index + p, &cookie); 7354 ap = anon_get_ptr(amp->ahp, svd->anon_index + p); 7355 if (ap != NULL) { 7356 swap_xlate(ap, &avp, &aoffset); 7357 } 7358 anon_array_exit(&cookie); 7359 ANON_LOCK_EXIT(&->a_rwlock); 7360 } 7361 if ((avp != NULL) && page_exists(avp, aoffset)) { 7362 /* A page exists for the anon slot */ 7363 ret |= SEG_PAGE_INCORE; 7364 7365 /* 7366 * If page is mapped and writable 7367 */ 7368 attr = (uint_t)0; 7369 if ((hat_getattr(seg->s_as->a_hat, addr, 7370 &attr) != -1) && (attr & PROT_WRITE)) { 7371 ret |= SEG_PAGE_ANON; 7372 } 7373 /* 7374 * Don't get page_struct lock for lckcnt and cowcnt, 7375 * since this is purely advisory. 7376 */ 7377 if ((pp = page_lookup_nowait(avp, aoffset, 7378 SE_SHARED)) != NULL) { 7379 if (pp->p_lckcnt) 7380 ret |= SEG_PAGE_SOFTLOCK; 7381 if (pp->p_cowcnt) 7382 ret |= SEG_PAGE_HASCOW; 7383 page_unlock(pp); 7384 } 7385 } 7386 7387 /* Gather vnode statistics */ 7388 vp = svd->vp; 7389 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 7390 7391 if (vp != NULL) { 7392 /* 7393 * Try to obtain a "shared" lock on the page 7394 * without blocking. If this fails, determine 7395 * if the page is in memory. 7396 */ 7397 pp = page_lookup_nowait(vp, offset, SE_SHARED); 7398 if ((pp == NULL) && (page_exists(vp, offset))) { 7399 /* Page is incore, and is named */ 7400 ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE); 7401 } 7402 /* 7403 * Don't get page_struct lock for lckcnt and cowcnt, 7404 * since this is purely advisory. 
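 *
 * (As with the anon case above, the result may already be stale by
 * the time it is copied out; mincore(2)-style callers only get a
 * snapshot/hint, never a guarantee.)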
7405 */ 7406 if (pp != NULL) { 7407 ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE); 7408 if (pp->p_lckcnt) 7409 ret |= SEG_PAGE_SOFTLOCK; 7410 if (pp->p_cowcnt) 7411 ret |= SEG_PAGE_HASCOW; 7412 page_unlock(pp); 7413 } 7414 } 7415 7416 /* Gather virtual page information */ 7417 if (vpp) { 7418 if (VPP_ISPPLOCK(vpp)) 7419 ret |= SEG_PAGE_LOCKED; 7420 vpp++; 7421 } 7422 7423 *vec++ = (char)ret; 7424 } 7425 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7426 return (len); 7427 } 7428 7429 /* 7430 * Statement for p_cowcnts/p_lckcnts. 7431 * 7432 * p_cowcnt is updated while mlock/munlocking MAP_PRIVATE and PROT_WRITE region 7433 * irrespective of the following factors or anything else: 7434 * 7435 * (1) anon slots are populated or not 7436 * (2) cow is broken or not 7437 * (3) refcnt on ap is 1 or greater than 1 7438 * 7439 * If it's not MAP_PRIVATE and PROT_WRITE, p_lckcnt is updated during mlock 7440 * and munlock. 7441 * 7442 * 7443 * Handling p_cowcnts/p_lckcnts during copy-on-write fault: 7444 * 7445 * if vpage has PROT_WRITE 7446 * transfer cowcnt on the oldpage -> cowcnt on the newpage 7447 * else 7448 * transfer lckcnt on the oldpage -> lckcnt on the newpage 7449 * 7450 * During copy-on-write, decrement p_cowcnt on the oldpage and increment 7451 * p_cowcnt on the newpage *if* the corresponding vpage has PROT_WRITE. 7452 * 7453 * We may also break COW if softlocking on read access in the physio case. 7454 * In this case, vpage may not have PROT_WRITE. So, we need to decrement 7455 * p_lckcnt on the oldpage and increment p_lckcnt on the newpage *if* the 7456 * vpage doesn't have PROT_WRITE. 7457 * 7458 * 7459 * Handling p_cowcnts/p_lckcnts during mprotect on mlocked region: 7460 * 7461 * If a MAP_PRIVATE region loses PROT_WRITE, we decrement p_cowcnt and 7462 * increment p_lckcnt by calling page_subclaim() which takes care of 7463 * availrmem accounting and p_lckcnt overflow. 7464 * 7465 * If a MAP_PRIVATE region gains PROT_WRITE, we decrement p_lckcnt and 7466 * increment p_cowcnt by calling page_addclaim() which takes care of 7467 * availrmem availability and p_cowcnt overflow. 7468 */ 7469 7470 /* 7471 * Lock down (or unlock) pages mapped by this segment. 7472 * 7473 * XXX only creates PAGESIZE pages if anon slots are not initialized. 7474 * At fault time they will be relocated into larger pages. 
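 *
 * A condensed example of the p_cowcnt/p_lckcnt rules described in
 * the Statement above, for a single mlock()ed page:
 *
 *	MAP_PRIVATE and vpage has PROT_WRITE	-> p_cowcnt is bumped
 *	anything else (shared, or no write)	-> p_lckcnt is bumped
 *
 * and an mprotect() that later adds or removes PROT_WRITE on that
 * locked page moves the claim between the two counters via
 * page_addclaim()/page_subclaim() (see segvn_setprot() and
 * segvn_claim_pages()).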
7475 */ 7476 static int 7477 segvn_lockop(struct seg *seg, caddr_t addr, size_t len, 7478 int attr, int op, ulong_t *lockmap, size_t pos) 7479 { 7480 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7481 struct vpage *vpp; 7482 struct vpage *evp; 7483 page_t *pp; 7484 u_offset_t offset; 7485 u_offset_t off; 7486 int segtype; 7487 int pageprot; 7488 int claim; 7489 struct vnode *vp; 7490 ulong_t anon_index; 7491 struct anon_map *amp; 7492 struct anon *ap; 7493 struct vattr va; 7494 anon_sync_obj_t cookie; 7495 struct kshmid *sp = NULL; 7496 struct proc *p = curproc; 7497 kproject_t *proj = NULL; 7498 int chargeproc = 1; 7499 size_t locked_bytes = 0; 7500 size_t unlocked_bytes = 0; 7501 int err = 0; 7502 7503 /* 7504 * Hold write lock on address space because may split or concatenate 7505 * segments 7506 */ 7507 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 7508 7509 /* 7510 * If this is a shm, use shm's project and zone, else use 7511 * project and zone of calling process 7512 */ 7513 7514 /* Determine if this segment backs a sysV shm */ 7515 if (svd->amp != NULL && svd->amp->a_sp != NULL) { 7516 ASSERT(svd->type == MAP_SHARED); 7517 ASSERT(svd->tr_state == SEGVN_TR_OFF); 7518 sp = svd->amp->a_sp; 7519 proj = sp->shm_perm.ipc_proj; 7520 chargeproc = 0; 7521 } 7522 7523 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 7524 if (attr) { 7525 pageprot = attr & ~(SHARED|PRIVATE); 7526 segtype = attr & SHARED ? MAP_SHARED : MAP_PRIVATE; 7527 7528 /* 7529 * We are done if the segment types don't match 7530 * or if we have segment level protections and 7531 * they don't match. 7532 */ 7533 if (svd->type != segtype) { 7534 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7535 return (0); 7536 } 7537 if (svd->pageprot == 0 && svd->prot != pageprot) { 7538 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7539 return (0); 7540 } 7541 } 7542 7543 if (op == MC_LOCK) { 7544 if (svd->tr_state == SEGVN_TR_INIT) { 7545 svd->tr_state = SEGVN_TR_OFF; 7546 } else if (svd->tr_state == SEGVN_TR_ON) { 7547 ASSERT(svd->amp != NULL); 7548 segvn_textunrepl(seg, 0); 7549 ASSERT(svd->amp == NULL && 7550 svd->tr_state == SEGVN_TR_OFF); 7551 } 7552 } 7553 7554 /* 7555 * If we're locking, then we must create a vpage structure if 7556 * none exists. If we're unlocking, then check to see if there 7557 * is a vpage -- if not, then we could not have locked anything. 7558 */ 7559 7560 if ((vpp = svd->vpage) == NULL) { 7561 if (op == MC_LOCK) 7562 segvn_vpage(seg); 7563 else { 7564 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7565 return (0); 7566 } 7567 } 7568 7569 /* 7570 * The anonymous data vector (i.e., previously 7571 * unreferenced mapping to swap space) can be allocated 7572 * by lazily testing for its existence. 
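 * (i.e. an anon map is only created here, on the first MC_LOCK of a
 * segment that has neither a vnode nor an existing amp, rather than
 * at mmap() time; the pages themselves are still only filled in
 * further down via anon_zero()/VOP_GETPAGE().)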
7573 */ 7574 if (op == MC_LOCK && svd->amp == NULL && svd->vp == NULL) { 7575 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 7576 svd->amp = anonmap_alloc(seg->s_size, 0, ANON_SLEEP); 7577 svd->amp->a_szc = seg->s_szc; 7578 } 7579 7580 if ((amp = svd->amp) != NULL) { 7581 anon_index = svd->anon_index + seg_page(seg, addr); 7582 } 7583 7584 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 7585 evp = &svd->vpage[seg_page(seg, addr + len)]; 7586 7587 if (sp != NULL) 7588 mutex_enter(&sp->shm_mlock); 7589 7590 /* determine number of unlocked bytes in range for lock operation */ 7591 if (op == MC_LOCK) { 7592 7593 if (sp == NULL) { 7594 for (vpp = &svd->vpage[seg_page(seg, addr)]; vpp < evp; 7595 vpp++) { 7596 if (!VPP_ISPPLOCK(vpp)) 7597 unlocked_bytes += PAGESIZE; 7598 } 7599 } else { 7600 ulong_t i_idx, i_edx; 7601 anon_sync_obj_t i_cookie; 7602 struct anon *i_ap; 7603 struct vnode *i_vp; 7604 u_offset_t i_off; 7605 7606 /* Only count sysV pages once for locked memory */ 7607 i_edx = svd->anon_index + seg_page(seg, addr + len); 7608 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER); 7609 for (i_idx = anon_index; i_idx < i_edx; i_idx++) { 7610 anon_array_enter(amp, i_idx, &i_cookie); 7611 i_ap = anon_get_ptr(amp->ahp, i_idx); 7612 if (i_ap == NULL) { 7613 unlocked_bytes += PAGESIZE; 7614 anon_array_exit(&i_cookie); 7615 continue; 7616 } 7617 swap_xlate(i_ap, &i_vp, &i_off); 7618 anon_array_exit(&i_cookie); 7619 pp = page_lookup(i_vp, i_off, SE_SHARED); 7620 if (pp == NULL) { 7621 unlocked_bytes += PAGESIZE; 7622 continue; 7623 } else if (pp->p_lckcnt == 0) 7624 unlocked_bytes += PAGESIZE; 7625 page_unlock(pp); 7626 } 7627 ANON_LOCK_EXIT(&amp->a_rwlock); 7628 } 7629 7630 mutex_enter(&p->p_lock); 7631 err = rctl_incr_locked_mem(p, proj, unlocked_bytes, 7632 chargeproc); 7633 mutex_exit(&p->p_lock); 7634 7635 if (err) { 7636 if (sp != NULL) 7637 mutex_exit(&sp->shm_mlock); 7638 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7639 return (err); 7640 } 7641 } 7642 /* 7643 * Loop over all pages in the range. Process if we're locking and 7644 * page has not already been locked in this mapping; or if we're 7645 * unlocking and the page has been locked. 7646 */ 7647 for (vpp = &svd->vpage[seg_page(seg, addr)]; vpp < evp; 7648 vpp++, pos++, addr += PAGESIZE, offset += PAGESIZE, anon_index++) { 7649 if ((attr == 0 || VPP_PROT(vpp) == pageprot) && 7650 ((op == MC_LOCK && !VPP_ISPPLOCK(vpp)) || 7651 (op == MC_UNLOCK && VPP_ISPPLOCK(vpp)))) { 7652 7653 if (amp != NULL) 7654 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER); 7655 /* 7656 * If this isn't a MAP_NORESERVE segment and 7657 * we're locking, allocate anon slots if they 7658 * don't exist. The page is brought in later on. 7659 */ 7660 if (op == MC_LOCK && svd->vp == NULL && 7661 ((svd->flags & MAP_NORESERVE) == 0) && 7662 amp != NULL && 7663 ((ap = anon_get_ptr(amp->ahp, anon_index)) 7664 == NULL)) { 7665 anon_array_enter(amp, anon_index, &cookie); 7666 7667 if ((ap = anon_get_ptr(amp->ahp, 7668 anon_index)) == NULL) { 7669 pp = anon_zero(seg, addr, &ap, 7670 svd->cred); 7671 if (pp == NULL) { 7672 anon_array_exit(&cookie); 7673 ANON_LOCK_EXIT(&amp->a_rwlock); 7674 err = ENOMEM; 7675 goto out; 7676 } 7677 ASSERT(anon_get_ptr(amp->ahp, 7678 anon_index) == NULL); 7679 (void) anon_set_ptr(amp->ahp, 7680 anon_index, ap, ANON_SLEEP); 7681 page_unlock(pp); 7682 } 7683 anon_array_exit(&cookie); 7684 } 7685 7686 /* 7687 * Get name for page, accounting for 7688 * existence of private copy.
7689 */ 7690 ap = NULL; 7691 if (amp != NULL) { 7692 anon_array_enter(amp, anon_index, &cookie); 7693 ap = anon_get_ptr(amp->ahp, anon_index); 7694 if (ap != NULL) { 7695 swap_xlate(ap, &vp, &off); 7696 } else { 7697 if (svd->vp == NULL && 7698 (svd->flags & MAP_NORESERVE)) { 7699 anon_array_exit(&cookie); 7700 ANON_LOCK_EXIT(&->a_rwlock); 7701 continue; 7702 } 7703 vp = svd->vp; 7704 off = offset; 7705 } 7706 anon_array_exit(&cookie); 7707 ANON_LOCK_EXIT(&->a_rwlock); 7708 } else { 7709 vp = svd->vp; 7710 off = offset; 7711 } 7712 7713 /* 7714 * Get page frame. It's ok if the page is 7715 * not available when we're unlocking, as this 7716 * may simply mean that a page we locked got 7717 * truncated out of existence after we locked it. 7718 * 7719 * Invoke VOP_GETPAGE() to obtain the page struct 7720 * since we may need to read it from disk if its 7721 * been paged out. 7722 */ 7723 if (op != MC_LOCK) 7724 pp = page_lookup(vp, off, SE_SHARED); 7725 else { 7726 page_t *pl[1 + 1]; 7727 int error; 7728 7729 ASSERT(vp != NULL); 7730 7731 error = VOP_GETPAGE(vp, (offset_t)off, PAGESIZE, 7732 (uint_t *)NULL, pl, PAGESIZE, seg, addr, 7733 S_OTHER, svd->cred); 7734 7735 /* 7736 * If the error is EDEADLK then we must bounce 7737 * up and drop all vm subsystem locks and then 7738 * retry the operation later 7739 * This behavior is a temporary measure because 7740 * ufs/sds logging is badly designed and will 7741 * deadlock if we don't allow this bounce to 7742 * happen. The real solution is to re-design 7743 * the logging code to work properly. See bug 7744 * 4125102 for details of the problem. 7745 */ 7746 if (error == EDEADLK) { 7747 err = error; 7748 goto out; 7749 } 7750 /* 7751 * Quit if we fail to fault in the page. Treat 7752 * the failure as an error, unless the addr 7753 * is mapped beyond the end of a file. 7754 */ 7755 if (error && svd->vp) { 7756 va.va_mask = AT_SIZE; 7757 if (VOP_GETATTR(svd->vp, &va, 0, 7758 svd->cred) != 0) { 7759 err = EIO; 7760 goto out; 7761 } 7762 if (btopr(va.va_size) >= 7763 btopr(off + 1)) { 7764 err = EIO; 7765 goto out; 7766 } 7767 goto out; 7768 7769 } else if (error) { 7770 err = EIO; 7771 goto out; 7772 } 7773 pp = pl[0]; 7774 ASSERT(pp != NULL); 7775 } 7776 7777 /* 7778 * See Statement at the beginning of this routine. 7779 * 7780 * claim is always set if MAP_PRIVATE and PROT_WRITE 7781 * irrespective of following factors: 7782 * 7783 * (1) anon slots are populated or not 7784 * (2) cow is broken or not 7785 * (3) refcnt on ap is 1 or greater than 1 7786 * 7787 * See 4140683 for details 7788 */ 7789 claim = ((VPP_PROT(vpp) & PROT_WRITE) && 7790 (svd->type == MAP_PRIVATE)); 7791 7792 /* 7793 * Perform page-level operation appropriate to 7794 * operation. If locking, undo the SOFTLOCK 7795 * performed to bring the page into memory 7796 * after setting the lock. If unlocking, 7797 * and no page was found, account for the claim 7798 * separately. 
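 *
 * (A missing page on MC_UNLOCK typically means the object was
 * truncated after the page was locked; the locked-memory rctl still
 * has to be credited back for it, which is why unlocked_bytes is
 * counted below even when pp is NULL.)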
7799 */ 7800 if (op == MC_LOCK) { 7801 int ret = 1; /* Assume success */ 7802 7803 ASSERT(!VPP_ISPPLOCK(vpp)); 7804 7805 ret = page_pp_lock(pp, claim, 0); 7806 if (ret == 0) { 7807 /* locking page failed */ 7808 page_unlock(pp); 7809 err = EAGAIN; 7810 goto out; 7811 } 7812 VPP_SETPPLOCK(vpp); 7813 if (sp != NULL) { 7814 if (pp->p_lckcnt == 1) 7815 locked_bytes += PAGESIZE; 7816 } else 7817 locked_bytes += PAGESIZE; 7818 7819 if (lockmap != (ulong_t *)NULL) 7820 BT_SET(lockmap, pos); 7821 7822 page_unlock(pp); 7823 } else { 7824 ASSERT(VPP_ISPPLOCK(vpp)); 7825 if (pp != NULL) { 7826 /* sysV pages should be locked */ 7827 ASSERT(sp == NULL || pp->p_lckcnt > 0); 7828 page_pp_unlock(pp, claim, 0); 7829 if (sp != NULL) { 7830 if (pp->p_lckcnt == 0) 7831 unlocked_bytes 7832 += PAGESIZE; 7833 } else 7834 unlocked_bytes += PAGESIZE; 7835 page_unlock(pp); 7836 } else { 7837 ASSERT(sp == NULL); 7838 unlocked_bytes += PAGESIZE; 7839 } 7840 VPP_CLRPPLOCK(vpp); 7841 } 7842 } 7843 } 7844 out: 7845 if (op == MC_LOCK) { 7846 /* Credit back bytes that did not get locked */ 7847 if ((unlocked_bytes - locked_bytes) > 0) { 7848 if (proj == NULL) 7849 mutex_enter(&p->p_lock); 7850 rctl_decr_locked_mem(p, proj, 7851 (unlocked_bytes - locked_bytes), chargeproc); 7852 if (proj == NULL) 7853 mutex_exit(&p->p_lock); 7854 } 7855 7856 } else { 7857 /* Account bytes that were unlocked */ 7858 if (unlocked_bytes > 0) { 7859 if (proj == NULL) 7860 mutex_enter(&p->p_lock); 7861 rctl_decr_locked_mem(p, proj, unlocked_bytes, 7862 chargeproc); 7863 if (proj == NULL) 7864 mutex_exit(&p->p_lock); 7865 } 7866 } 7867 if (sp != NULL) 7868 mutex_exit(&sp->shm_mlock); 7869 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7870 7871 return (err); 7872 } 7873 7874 /* 7875 * Set advice from user for specified pages 7876 * There are 5 types of advice: 7877 * MADV_NORMAL - Normal (default) behavior (whatever that is) 7878 * MADV_RANDOM - Random page references 7879 * do not allow readahead or 'klustering' 7880 * MADV_SEQUENTIAL - Sequential page references 7881 * Pages previous to the one currently being 7882 * accessed (determined by fault) are 'not needed' 7883 * and are freed immediately 7884 * MADV_WILLNEED - Pages are likely to be used (fault ahead in mctl) 7885 * MADV_DONTNEED - Pages are not needed (synced out in mctl) 7886 * MADV_FREE - Contents can be discarded 7887 * MADV_ACCESS_DEFAULT- Default access 7888 * MADV_ACCESS_LWP - Next LWP will access heavily 7889 * MADV_ACCESS_MANY- Many LWPs or processes will access heavily 7890 */ 7891 static int 7892 segvn_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav) 7893 { 7894 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7895 size_t page; 7896 int err = 0; 7897 int already_set; 7898 struct anon_map *amp; 7899 ulong_t anon_index; 7900 struct seg *next; 7901 lgrp_mem_policy_t policy; 7902 struct seg *prev; 7903 struct vnode *vp; 7904 7905 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 7906 7907 /* 7908 * In case of MADV_FREE, we won't be modifying any segment private 7909 * data structures; so, we only need to grab READER's lock 7910 */ 7911 if (behav != MADV_FREE) { 7912 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 7913 if (svd->tr_state != SEGVN_TR_OFF) { 7914 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7915 return (0); 7916 } 7917 } else { 7918 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7919 } 7920 7921 /* 7922 * Large pages are assumed to be only turned on when accesses to the 7923 * segment's address range have spatial and temporal 
locality. That 7924 * justifies ignoring MADV_SEQUENTIAL for large page segments. 7925 * Also, ignore advice affecting lgroup memory allocation 7926 * if we don't need to do lgroup optimizations on this system 7927 */ 7928 7929 if ((behav == MADV_SEQUENTIAL && 7930 (seg->s_szc != 0 || HAT_IS_REGION_COOKIE_VALID(svd->rcookie))) || 7931 (!lgrp_optimizations() && (behav == MADV_ACCESS_DEFAULT || 7932 behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY))) { 7933 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7934 return (0); 7935 } 7936 7937 if (behav == MADV_SEQUENTIAL || behav == MADV_ACCESS_DEFAULT || 7938 behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY) { 7939 /* 7940 * Since we are going to unload hat mappings 7941 * we first have to flush the cache. Otherwise 7942 * this might lead to system panic if another 7943 * thread is doing physio on the range whose 7944 * mappings are unloaded by madvise(3C). 7945 */ 7946 if (svd->softlockcnt > 0) { 7947 /* 7948 * Since we do have the segvn writers lock 7949 * nobody can fill the cache with entries 7950 * belonging to this seg during the purge. 7951 * The flush either succeeds or we still 7952 * have pending I/Os. In the latter case, 7953 * madvise(3C) fails. 7954 */ 7955 segvn_purge(seg); 7956 if (svd->softlockcnt > 0) { 7957 /* 7958 * Since madvise(3C) is advisory and 7959 * it's not part of UNIX98, madvise(3C) 7960 * failure here doesn't cause any hardship. 7961 * Note that we don't block in "as" layer. 7962 */ 7963 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7964 return (EAGAIN); 7965 } 7966 } 7967 } 7968 7969 amp = svd->amp; 7970 vp = svd->vp; 7971 if (behav == MADV_FREE) { 7972 /* 7973 * MADV_FREE is not supported for segments with 7974 * underlying object; if anonmap is NULL, anon slots 7975 * are not yet populated and there is nothing for 7976 * us to do. As MADV_FREE is advisory, we don't 7977 * return error in either case. 7978 */ 7979 if (vp != NULL || amp == NULL) { 7980 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7981 return (0); 7982 } 7983 7984 page = seg_page(seg, addr); 7985 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER); 7986 anon_disclaim(amp, svd->anon_index + page, len, 0); 7987 ANON_LOCK_EXIT(&amp->a_rwlock); 7988 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7989 return (0); 7990 } 7991 7992 /* 7993 * If advice is to be applied to entire segment, 7994 * use advice field in seg_data structure 7995 * otherwise use appropriate vpage entry. 7996 */ 7997 if ((addr == seg->s_base) && (len == seg->s_size)) { 7998 switch (behav) { 7999 case MADV_ACCESS_LWP: 8000 case MADV_ACCESS_MANY: 8001 case MADV_ACCESS_DEFAULT: 8002 /* 8003 * Set memory allocation policy for this segment 8004 */ 8005 policy = lgrp_madv_to_policy(behav, len, svd->type); 8006 if (svd->type == MAP_SHARED) 8007 already_set = lgrp_shm_policy_set(policy, amp, 8008 svd->anon_index, vp, svd->offset, len); 8009 else { 8010 /* 8011 * For private memory, need writers lock on 8012 * address space because the segment may be 8013 * split or concatenated when changing policy 8014 */ 8015 if (AS_READ_HELD(seg->s_as, 8016 &seg->s_as->a_lock)) { 8017 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8018 return (IE_RETRY); 8019 } 8020 8021 already_set = lgrp_privm_policy_set(policy, 8022 &svd->policy_info, len); 8023 } 8024 8025 /* 8026 * If policy set already and it shouldn't be reapplied, 8027 * don't do anything.
8028 */ 8029 if (already_set && 8030 !LGRP_MEM_POLICY_REAPPLICABLE(policy)) 8031 break; 8032 8033 /* 8034 * Mark any existing pages in given range for 8035 * migration 8036 */ 8037 page_mark_migrate(seg, addr, len, amp, svd->anon_index, 8038 vp, svd->offset, 1); 8039 8040 /* 8041 * If same policy set already or this is a shared 8042 * memory segment, don't need to try to concatenate 8043 * segment with adjacent ones. 8044 */ 8045 if (already_set || svd->type == MAP_SHARED) 8046 break; 8047 8048 /* 8049 * Try to concatenate this segment with previous 8050 * one and next one, since we changed policy for 8051 * this one and it may be compatible with adjacent 8052 * ones now. 8053 */ 8054 prev = AS_SEGPREV(seg->s_as, seg); 8055 next = AS_SEGNEXT(seg->s_as, seg); 8056 8057 if (next && next->s_ops == &segvn_ops && 8058 addr + len == next->s_base) 8059 (void) segvn_concat(seg, next, 1); 8060 8061 if (prev && prev->s_ops == &segvn_ops && 8062 addr == prev->s_base + prev->s_size) { 8063 /* 8064 * Drop lock for private data of current 8065 * segment before concatenating (deleting) it 8066 * and return IE_REATTACH to tell as_ctl() that 8067 * current segment has changed 8068 */ 8069 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8070 if (!segvn_concat(prev, seg, 1)) 8071 err = IE_REATTACH; 8072 8073 return (err); 8074 } 8075 break; 8076 8077 case MADV_SEQUENTIAL: 8078 /* 8079 * unloading mapping guarantees 8080 * detection in segvn_fault 8081 */ 8082 ASSERT(seg->s_szc == 0); 8083 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 8084 hat_unload(seg->s_as->a_hat, addr, len, 8085 HAT_UNLOAD); 8086 /* FALLTHROUGH */ 8087 case MADV_NORMAL: 8088 case MADV_RANDOM: 8089 svd->advice = (uchar_t)behav; 8090 svd->pageadvice = 0; 8091 break; 8092 case MADV_WILLNEED: /* handled in memcntl */ 8093 case MADV_DONTNEED: /* handled in memcntl */ 8094 case MADV_FREE: /* handled above */ 8095 break; 8096 default: 8097 err = EINVAL; 8098 } 8099 } else { 8100 caddr_t eaddr; 8101 struct seg *new_seg; 8102 struct segvn_data *new_svd; 8103 u_offset_t off; 8104 caddr_t oldeaddr; 8105 8106 page = seg_page(seg, addr); 8107 8108 segvn_vpage(seg); 8109 8110 switch (behav) { 8111 struct vpage *bvpp, *evpp; 8112 8113 case MADV_ACCESS_LWP: 8114 case MADV_ACCESS_MANY: 8115 case MADV_ACCESS_DEFAULT: 8116 /* 8117 * Set memory allocation policy for portion of this 8118 * segment 8119 */ 8120 8121 /* 8122 * Align address and length of advice to page 8123 * boundaries for large pages 8124 */ 8125 if (seg->s_szc != 0) { 8126 size_t pgsz; 8127 8128 pgsz = page_get_pagesize(seg->s_szc); 8129 addr = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 8130 len = P2ROUNDUP(len, pgsz); 8131 } 8132 8133 /* 8134 * Check to see whether policy is set already 8135 */ 8136 policy = lgrp_madv_to_policy(behav, len, svd->type); 8137 8138 anon_index = svd->anon_index + page; 8139 off = svd->offset + (uintptr_t)(addr - seg->s_base); 8140 8141 if (svd->type == MAP_SHARED) 8142 already_set = lgrp_shm_policy_set(policy, amp, 8143 anon_index, vp, off, len); 8144 else 8145 already_set = 8146 (policy == svd->policy_info.mem_policy); 8147 8148 /* 8149 * If policy set already and it shouldn't be reapplied, 8150 * don't do anything. 
8151 */ 8152 if (already_set && 8153 !LGRP_MEM_POLICY_REAPPLICABLE(policy)) 8154 break; 8155 8156 /* 8157 * For private memory, need writers lock on 8158 * address space because the segment may be 8159 * split or concatenated when changing policy 8160 */ 8161 if (svd->type == MAP_PRIVATE && 8162 AS_READ_HELD(seg->s_as, &seg->s_as->a_lock)) { 8163 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8164 return (IE_RETRY); 8165 } 8166 8167 /* 8168 * Mark any existing pages in given range for 8169 * migration 8170 */ 8171 page_mark_migrate(seg, addr, len, amp, svd->anon_index, 8172 vp, svd->offset, 1); 8173 8174 /* 8175 * Don't need to try to split or concatenate 8176 * segments, since policy is same or this is a shared 8177 * memory segment 8178 */ 8179 if (already_set || svd->type == MAP_SHARED) 8180 break; 8181 8182 if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 8183 ASSERT(svd->amp == NULL); 8184 ASSERT(svd->tr_state == SEGVN_TR_OFF); 8185 ASSERT(svd->softlockcnt == 0); 8186 hat_leave_region(seg->s_as->a_hat, svd->rcookie, 8187 HAT_REGION_TEXT); 8188 svd->rcookie = HAT_INVALID_REGION_COOKIE; 8189 } 8190 8191 /* 8192 * Split off new segment if advice only applies to a 8193 * portion of existing segment starting in middle 8194 */ 8195 new_seg = NULL; 8196 eaddr = addr + len; 8197 oldeaddr = seg->s_base + seg->s_size; 8198 if (addr > seg->s_base) { 8199 /* 8200 * Must flush I/O page cache 8201 * before splitting segment 8202 */ 8203 if (svd->softlockcnt > 0) 8204 segvn_purge(seg); 8205 8206 /* 8207 * Split segment and return IE_REATTACH to tell 8208 * as_ctl() that current segment changed 8209 */ 8210 new_seg = segvn_split_seg(seg, addr); 8211 new_svd = (struct segvn_data *)new_seg->s_data; 8212 err = IE_REATTACH; 8213 8214 /* 8215 * If new segment ends where old one 8216 * did, try to concatenate the new 8217 * segment with next one. 8218 */ 8219 if (eaddr == oldeaddr) { 8220 /* 8221 * Set policy for new segment 8222 */ 8223 (void) lgrp_privm_policy_set(policy, 8224 &new_svd->policy_info, 8225 new_seg->s_size); 8226 8227 next = AS_SEGNEXT(new_seg->s_as, 8228 new_seg); 8229 8230 if (next && 8231 next->s_ops == &segvn_ops && 8232 eaddr == next->s_base) 8233 (void) segvn_concat(new_seg, 8234 next, 1); 8235 } 8236 } 8237 8238 /* 8239 * Split off end of existing segment if advice only 8240 * applies to a portion of segment ending before 8241 * end of the existing segment 8242 */ 8243 if (eaddr < oldeaddr) { 8244 /* 8245 * Must flush I/O page cache 8246 * before splitting segment 8247 */ 8248 if (svd->softlockcnt > 0) 8249 segvn_purge(seg); 8250 8251 /* 8252 * If beginning of old segment was already 8253 * split off, use new segment to split end off 8254 * from. 8255 */ 8256 if (new_seg != NULL && new_seg != seg) { 8257 /* 8258 * Split segment 8259 */ 8260 (void) segvn_split_seg(new_seg, eaddr); 8261 8262 /* 8263 * Set policy for new segment 8264 */ 8265 (void) lgrp_privm_policy_set(policy, 8266 &new_svd->policy_info, 8267 new_seg->s_size); 8268 } else { 8269 /* 8270 * Split segment and return IE_REATTACH 8271 * to tell as_ctl() that current 8272 * segment changed 8273 */ 8274 (void) segvn_split_seg(seg, eaddr); 8275 err = IE_REATTACH; 8276 8277 (void) lgrp_privm_policy_set(policy, 8278 &svd->policy_info, seg->s_size); 8279 8280 /* 8281 * If new segment starts where old one 8282 * did, try to concatenate it with 8283 * previous segment. 
8284 */ 8285 if (addr == seg->s_base) { 8286 prev = AS_SEGPREV(seg->s_as, 8287 seg); 8288 8289 /* 8290 * Drop lock for private data 8291 * of current segment before 8292 * concatenating (deleting) it 8293 */ 8294 if (prev && 8295 prev->s_ops == 8296 &segvn_ops && 8297 addr == prev->s_base + 8298 prev->s_size) { 8299 SEGVN_LOCK_EXIT( 8300 seg->s_as, 8301 &svd->lock); 8302 (void) segvn_concat( 8303 prev, seg, 1); 8304 return (err); 8305 } 8306 } 8307 } 8308 } 8309 break; 8310 case MADV_SEQUENTIAL: 8311 ASSERT(seg->s_szc == 0); 8312 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 8313 hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD); 8314 /* FALLTHROUGH */ 8315 case MADV_NORMAL: 8316 case MADV_RANDOM: 8317 bvpp = &svd->vpage[page]; 8318 evpp = &svd->vpage[page + (len >> PAGESHIFT)]; 8319 for (; bvpp < evpp; bvpp++) 8320 VPP_SETADVICE(bvpp, behav); 8321 svd->advice = MADV_NORMAL; 8322 break; 8323 case MADV_WILLNEED: /* handled in memcntl */ 8324 case MADV_DONTNEED: /* handled in memcntl */ 8325 case MADV_FREE: /* handled above */ 8326 break; 8327 default: 8328 err = EINVAL; 8329 } 8330 } 8331 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8332 return (err); 8333 } 8334 8335 /* 8336 * Create a vpage structure for this seg. 8337 */ 8338 static void 8339 segvn_vpage(struct seg *seg) 8340 { 8341 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 8342 struct vpage *vp, *evp; 8343 8344 ASSERT(SEGVN_WRITE_HELD(seg->s_as, &svd->lock)); 8345 8346 /* 8347 * If no vpage structure exists, allocate one. Copy the protections 8348 * and the advice from the segment itself to the individual pages. 8349 */ 8350 if (svd->vpage == NULL) { 8351 svd->pageadvice = 1; 8352 svd->vpage = kmem_zalloc(seg_pages(seg) * sizeof (struct vpage), 8353 KM_SLEEP); 8354 evp = &svd->vpage[seg_page(seg, seg->s_base + seg->s_size)]; 8355 for (vp = svd->vpage; vp < evp; vp++) { 8356 VPP_SETPROT(vp, svd->prot); 8357 VPP_SETADVICE(vp, svd->advice); 8358 } 8359 } 8360 } 8361 8362 /* 8363 * Dump the pages belonging to this segvn segment. 8364 */ 8365 static void 8366 segvn_dump(struct seg *seg) 8367 { 8368 struct segvn_data *svd; 8369 page_t *pp; 8370 struct anon_map *amp; 8371 ulong_t anon_index; 8372 struct vnode *vp; 8373 u_offset_t off, offset; 8374 pfn_t pfn; 8375 pgcnt_t page, npages; 8376 caddr_t addr; 8377 8378 npages = seg_pages(seg); 8379 svd = (struct segvn_data *)seg->s_data; 8380 vp = svd->vp; 8381 off = offset = svd->offset; 8382 addr = seg->s_base; 8383 8384 if ((amp = svd->amp) != NULL) { 8385 anon_index = svd->anon_index; 8386 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER); 8387 } 8388 8389 for (page = 0; page < npages; page++, offset += PAGESIZE) { 8390 struct anon *ap; 8391 int we_own_it = 0; 8392 8393 if (amp && (ap = anon_get_ptr(svd->amp->ahp, anon_index++))) { 8394 swap_xlate_nopanic(ap, &vp, &off); 8395 } else { 8396 vp = svd->vp; 8397 off = offset; 8398 } 8399 8400 /* 8401 * If pp == NULL, the page either does not exist 8402 * or is exclusively locked. So determine if it 8403 * exists before searching for it. 8404 */ 8405 8406 if ((pp = page_lookup_nowait(vp, off, SE_SHARED))) 8407 we_own_it = 1; 8408 else 8409 pp = page_exists(vp, off); 8410 8411 if (pp) { 8412 pfn = page_pptonum(pp); 8413 dump_addpage(seg->s_as, addr, pfn); 8414 if (we_own_it) 8415 page_unlock(pp); 8416 } 8417 addr += PAGESIZE; 8418 dump_timeleft = dump_timeout; 8419 } 8420 8421 if (amp != NULL) 8422 ANON_LOCK_EXIT(&amp->a_rwlock); 8423 } 8424 8425 /* 8426 * lock/unlock anon pages over a given range.
Return shadow list 8427 */ 8428 static int 8429 segvn_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***ppp, 8430 enum lock_type type, enum seg_rw rw) 8431 { 8432 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 8433 size_t np, adjustpages = 0, npages = (len >> PAGESHIFT); 8434 ulong_t anon_index; 8435 uint_t protchk; 8436 uint_t error; 8437 struct anon_map *amp; 8438 struct page **pplist, **pl, *pp; 8439 caddr_t a; 8440 size_t page; 8441 caddr_t lpgaddr, lpgeaddr; 8442 pgcnt_t szc0_npages = 0; 8443 8444 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_START, 8445 "segvn_pagelock: start seg %p addr %p", seg, addr); 8446 8447 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 8448 if (seg->s_szc != 0 && (type == L_PAGELOCK || type == L_PAGEUNLOCK)) { 8449 /* 8450 * We are adjusting the pagelock region to the large page size 8451 * boundary because the unlocked part of a large page cannot 8452 * be freed anyway unless all constituent pages of a large 8453 * page are locked. Therefore this adjustment allows us to 8454 * decrement availrmem by the right value (note we don't want 8455 * to just decrement availrem by the large page size without 8456 * adjusting addr and len because then we may end up 8457 * decrementing availrmem by large page size for every 8458 * constituent page locked by a new as_pagelock call). 8459 * as_pageunlock caller must always match as_pagelock call's 8460 * addr and len. 8461 * 8462 * Note segment's page size cannot change while we are holding 8463 * as lock. And then it cannot change while softlockcnt is 8464 * not 0. This will allow us to correctly recalculate large 8465 * page size region for the matching pageunlock/reclaim call. 8466 * 8467 * for pageunlock *ppp points to the pointer of page_t that 8468 * corresponds to the real unadjusted start address. Similar 8469 * for pagelock *ppp must point to the pointer of page_t that 8470 * corresponds to the real unadjusted start address. 8471 */ 8472 size_t pgsz = page_get_pagesize(seg->s_szc); 8473 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 8474 adjustpages = ((uintptr_t)(addr - lpgaddr)) >> PAGESHIFT; 8475 } 8476 8477 if (type == L_PAGEUNLOCK) { 8478 8479 /* 8480 * update hat ref bits for /proc. We need to make sure 8481 * that threads tracing the ref and mod bits of the 8482 * address space get the right data. 8483 * Note: page ref and mod bits are updated at reclaim time 8484 */ 8485 if (seg->s_as->a_vbits) { 8486 for (a = addr; a < addr + len; a += PAGESIZE) { 8487 if (rw == S_WRITE) { 8488 hat_setstat(seg->s_as, a, 8489 PAGESIZE, P_REF | P_MOD); 8490 } else { 8491 hat_setstat(seg->s_as, a, 8492 PAGESIZE, P_REF); 8493 } 8494 } 8495 } 8496 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 8497 if (seg->s_szc != 0) { 8498 VM_STAT_ADD(segvnvmstats.pagelock[0]); 8499 seg_pinactive(seg, lpgaddr, lpgeaddr - lpgaddr, 8500 *ppp - adjustpages, rw, segvn_reclaim); 8501 } else { 8502 seg_pinactive(seg, addr, len, *ppp, rw, segvn_reclaim); 8503 } 8504 8505 /* 8506 * If someone is blocked while unmapping, we purge 8507 * segment page cache and thus reclaim pplist synchronously 8508 * without waiting for seg_pasync_thread. This speeds up 8509 * unmapping in cases where munmap(2) is called, while 8510 * raw async i/o is still in progress or where a thread 8511 * exits on data fault in a multithreaded application. 
8512 */ 8513 if (AS_ISUNMAPWAIT(seg->s_as) && (svd->softlockcnt > 0)) { 8514 /* 8515 * Even if we grab segvn WRITER's lock or segp_slock 8516 * here, there might be another thread which could've 8517 * successfully performed lookup/insert just before 8518 * we acquired the lock here. So, grabbing either 8519 * lock here is of not much use. Until we devise 8520 * a strategy at upper layers to solve the 8521 * synchronization issues completely, we expect 8522 * applications to handle this appropriately. 8523 */ 8524 segvn_purge(seg); 8525 } 8526 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8527 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END, 8528 "segvn_pagelock: unlock seg %p addr %p", seg, addr); 8529 return (0); 8530 } else if (type == L_PAGERECLAIM) { 8531 VM_STAT_COND_ADD(seg->s_szc != 0, segvnvmstats.pagelock[1]); 8532 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 8533 (void) segvn_reclaim(seg, addr, len, *ppp, rw); 8534 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8535 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END, 8536 "segvn_pagelock: reclaim seg %p addr %p", seg, addr); 8537 return (0); 8538 } 8539 8540 if (seg->s_szc != 0) { 8541 VM_STAT_ADD(segvnvmstats.pagelock[2]); 8542 addr = lpgaddr; 8543 len = lpgeaddr - lpgaddr; 8544 npages = (len >> PAGESHIFT); 8545 } 8546 8547 /* 8548 * for now we only support pagelock to anon memory. We've to check 8549 * protections for vnode objects and call into the vnode driver. 8550 * That's too much for a fast path. Let the fault entry point handle it. 8551 */ 8552 if (svd->vp != NULL) { 8553 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, 8554 "segvn_pagelock: mapped vnode seg %p addr %p", seg, addr); 8555 *ppp = NULL; 8556 return (ENOTSUP); 8557 } 8558 8559 /* 8560 * if anonmap is not yet created, let the fault entry point populate it 8561 * with anon ptrs. 8562 */ 8563 if ((amp = svd->amp) == NULL) { 8564 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, 8565 "segvn_pagelock: anonmap null seg %p addr %p", seg, addr); 8566 *ppp = NULL; 8567 return (EFAULT); 8568 } 8569 8570 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 8571 8572 /* 8573 * we acquire segp_slock to prevent duplicate entries 8574 * in seg_pcache 8575 */ 8576 mutex_enter(&svd->segp_slock); 8577 8578 /* 8579 * try to find pages in segment page cache 8580 */ 8581 pplist = seg_plookup(seg, addr, len, rw); 8582 if (pplist != NULL) { 8583 mutex_exit(&svd->segp_slock); 8584 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8585 *ppp = pplist + adjustpages; 8586 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_HIT_END, 8587 "segvn_pagelock: cache hit seg %p addr %p", seg, addr); 8588 return (0); 8589 } 8590 8591 if (rw == S_READ) { 8592 protchk = PROT_READ; 8593 } else { 8594 protchk = PROT_WRITE; 8595 } 8596 8597 if (svd->pageprot == 0) { 8598 if ((svd->prot & protchk) == 0) { 8599 mutex_exit(&svd->segp_slock); 8600 error = EFAULT; 8601 goto out; 8602 } 8603 } else { 8604 /* 8605 * check page protections 8606 */ 8607 for (a = addr; a < addr + len; a += PAGESIZE) { 8608 struct vpage *vp; 8609 8610 vp = &svd->vpage[seg_page(seg, a)]; 8611 if ((VPP_PROT(vp) & protchk) == 0) { 8612 mutex_exit(&svd->segp_slock); 8613 error = EFAULT; 8614 goto out; 8615 } 8616 } 8617 } 8618 8619 /* 8620 * Avoid per page overhead of segvn_slock_anonpages() for small 8621 * pages. For large pages segvn_slock_anonpages() only does real 8622 * work once per large page. The tradeoff is that we may decrement 8623 * availrmem more than once for the same page but this is ok 8624 * for small pages. 
8625 */ 8626 if (seg->s_szc == 0) { 8627 mutex_enter(&freemem_lock); 8628 if (availrmem < tune.t_minarmem + npages) { 8629 mutex_exit(&freemem_lock); 8630 mutex_exit(&svd->segp_slock); 8631 error = ENOMEM; 8632 goto out; 8633 } 8634 availrmem -= npages; 8635 mutex_exit(&freemem_lock); 8636 } 8637 8638 pplist = kmem_alloc(sizeof (page_t *) * npages, KM_SLEEP); 8639 pl = pplist; 8640 *ppp = pplist + adjustpages; 8641 8642 page = seg_page(seg, addr); 8643 anon_index = svd->anon_index + page; 8644 8645 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER); 8646 for (a = addr; a < addr + len; a += PAGESIZE, anon_index++) { 8647 struct anon *ap; 8648 struct vnode *vp; 8649 u_offset_t off; 8650 anon_sync_obj_t cookie; 8651 8652 anon_array_enter(amp, anon_index, &cookie); 8653 ap = anon_get_ptr(amp->ahp, anon_index); 8654 if (ap == NULL) { 8655 anon_array_exit(&cookie); 8656 break; 8657 } else { 8658 /* 8659 * We must never use seg_pcache for COW pages 8660 * because we might end up with original page still 8661 * lying in seg_pcache even after private page is 8662 * created. This leads to data corruption as 8663 * aio_write refers to the page still in cache 8664 * while all other accesses refer to the private 8665 * page. 8666 */ 8667 if (ap->an_refcnt != 1) { 8668 anon_array_exit(&cookie); 8669 break; 8670 } 8671 } 8672 swap_xlate(ap, &vp, &off); 8673 anon_array_exit(&cookie); 8674 8675 pp = page_lookup_nowait(vp, off, SE_SHARED); 8676 if (pp == NULL) { 8677 break; 8678 } 8679 if (seg->s_szc != 0 || pp->p_szc != 0) { 8680 if (!segvn_slock_anonpages(pp, a == addr)) { 8681 page_unlock(pp); 8682 break; 8683 } 8684 } else { 8685 szc0_npages++; 8686 } 8687 *pplist++ = pp; 8688 } 8689 ANON_LOCK_EXIT(&amp->a_rwlock); 8690 8691 ASSERT(npages >= szc0_npages); 8692 8693 if (a >= addr + len) { 8694 mutex_enter(&freemem_lock); 8695 if (seg->s_szc == 0 && npages != szc0_npages) { 8696 ASSERT(svd->type == MAP_SHARED && amp->a_szc > 0); 8697 availrmem += (npages - szc0_npages); 8698 } 8699 svd->softlockcnt += npages; 8700 segvn_pages_locked += npages; 8701 mutex_exit(&freemem_lock); 8702 (void) seg_pinsert(seg, addr, len, pl, rw, SEGP_ASYNC_FLUSH, 8703 segvn_reclaim); 8704 mutex_exit(&svd->segp_slock); 8705 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8706 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_FILL_END, 8707 "segvn_pagelock: cache fill seg %p addr %p", seg, addr); 8708 return (0); 8709 } 8710 8711 mutex_exit(&svd->segp_slock); 8712 if (seg->s_szc == 0) { 8713 mutex_enter(&freemem_lock); 8714 availrmem += npages; 8715 mutex_exit(&freemem_lock); 8716 } 8717 error = EFAULT; 8718 pplist = pl; 8719 np = ((uintptr_t)(a - addr)) >> PAGESHIFT; 8720 while (np > (uint_t)0) { 8721 ASSERT(PAGE_LOCKED(*pplist)); 8722 if (seg->s_szc != 0 || (*pplist)->p_szc != 0) { 8723 segvn_sunlock_anonpages(*pplist, pplist == pl); 8724 } 8725 page_unlock(*pplist); 8726 np--; 8727 pplist++; 8728 } 8729 kmem_free(pl, sizeof (page_t *) * npages); 8730 out: 8731 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8732 *ppp = NULL; 8733 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, 8734 "segvn_pagelock: cache miss seg %p addr %p", seg, addr); 8735 return (error); 8736 } 8737 8738 /* 8739 * purge any cached pages in the I/O page cache 8740 */ 8741 static void 8742 segvn_purge(struct seg *seg) 8743 { 8744 seg_ppurge(seg); 8745 } 8746 8747 static int 8748 segvn_reclaim(struct seg *seg, caddr_t addr, size_t len, struct page **pplist, 8749 enum seg_rw rw) 8750 { 8751 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 8752 pgcnt_t np, npages; 8753 struct page **pl; 8754
pgcnt_t szc0_npages = 0; 8755 8756 #ifdef lint 8757 addr = addr; 8758 #endif 8759 8760 npages = np = (len >> PAGESHIFT); 8761 ASSERT(npages); 8762 pl = pplist; 8763 if (seg->s_szc != 0) { 8764 size_t pgsz = page_get_pagesize(seg->s_szc); 8765 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { 8766 panic("segvn_reclaim: unaligned addr or len"); 8767 /*NOTREACHED*/ 8768 } 8769 } 8770 8771 ASSERT(svd->vp == NULL && svd->amp != NULL); 8772 8773 while (np > (uint_t)0) { 8774 if (rw == S_WRITE) { 8775 hat_setrefmod(*pplist); 8776 } else { 8777 hat_setref(*pplist); 8778 } 8779 if (seg->s_szc != 0 || (*pplist)->p_szc != 0) { 8780 segvn_sunlock_anonpages(*pplist, pplist == pl); 8781 } else { 8782 szc0_npages++; 8783 } 8784 page_unlock(*pplist); 8785 np--; 8786 pplist++; 8787 } 8788 kmem_free(pl, sizeof (page_t *) * npages); 8789 8790 mutex_enter(&freemem_lock); 8791 segvn_pages_locked -= npages; 8792 svd->softlockcnt -= npages; 8793 if (szc0_npages != 0) { 8794 availrmem += szc0_npages; 8795 } 8796 mutex_exit(&freemem_lock); 8797 if (svd->softlockcnt <= 0) { 8798 if (AS_ISUNMAPWAIT(seg->s_as)) { 8799 mutex_enter(&seg->s_as->a_contents); 8800 if (AS_ISUNMAPWAIT(seg->s_as)) { 8801 AS_CLRUNMAPWAIT(seg->s_as); 8802 cv_broadcast(&seg->s_as->a_cv); 8803 } 8804 mutex_exit(&seg->s_as->a_contents); 8805 } 8806 } 8807 return (0); 8808 } 8809 /* 8810 * get a memory ID for an addr in a given segment 8811 * 8812 * XXX only creates PAGESIZE pages if anon slots are not initialized. 8813 * At fault time they will be relocated into larger pages. 8814 */ 8815 static int 8816 segvn_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp) 8817 { 8818 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 8819 struct anon *ap = NULL; 8820 ulong_t anon_index; 8821 struct anon_map *amp; 8822 anon_sync_obj_t cookie; 8823 8824 if (svd->type == MAP_PRIVATE) { 8825 memidp->val[0] = (uintptr_t)seg->s_as; 8826 memidp->val[1] = (uintptr_t)addr; 8827 return (0); 8828 } 8829 8830 if (svd->type == MAP_SHARED) { 8831 if (svd->vp) { 8832 memidp->val[0] = (uintptr_t)svd->vp; 8833 memidp->val[1] = (u_longlong_t)svd->offset + 8834 (uintptr_t)(addr - seg->s_base); 8835 return (0); 8836 } else { 8837 8838 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 8839 if ((amp = svd->amp) != NULL) { 8840 anon_index = svd->anon_index + 8841 seg_page(seg, addr); 8842 } 8843 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8844 8845 ASSERT(amp != NULL); 8846 8847 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER); 8848 anon_array_enter(amp, anon_index, &cookie); 8849 ap = anon_get_ptr(amp->ahp, anon_index); 8850 if (ap == NULL) { 8851 page_t *pp; 8852 8853 pp = anon_zero(seg, addr, &ap, svd->cred); 8854 if (pp == NULL) { 8855 anon_array_exit(&cookie); 8856 ANON_LOCK_EXIT(&amp->a_rwlock); 8857 return (ENOMEM); 8858 } 8859 ASSERT(anon_get_ptr(amp->ahp, anon_index) 8860 == NULL); 8861 (void) anon_set_ptr(amp->ahp, anon_index, 8862 ap, ANON_SLEEP); 8863 page_unlock(pp); 8864 } 8865 8866 anon_array_exit(&cookie); 8867 ANON_LOCK_EXIT(&amp->a_rwlock); 8868 8869 memidp->val[0] = (uintptr_t)ap; 8870 memidp->val[1] = (uintptr_t)addr & PAGEOFFSET; 8871 return (0); 8872 } 8873 } 8874 return (EINVAL); 8875 } 8876 8877 static int 8878 sameprot(struct seg *seg, caddr_t a, size_t len) 8879 { 8880 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 8881 struct vpage *vpage; 8882 spgcnt_t pages = btop(len); 8883 uint_t prot; 8884 8885 if (svd->pageprot == 0) 8886 return (1); 8887 8888 ASSERT(svd->vpage != NULL); 8889 8890 vpage = &svd->vpage[seg_page(seg, a)]; 8891 prot
= VPP_PROT(vpage); 8892 vpage++; 8893 pages--; 8894 while (pages-- > 0) { 8895 if (prot != VPP_PROT(vpage)) 8896 return (0); 8897 vpage++; 8898 } 8899 return (1); 8900 } 8901 8902 /* 8903 * Get memory allocation policy info for specified address in given segment 8904 */ 8905 static lgrp_mem_policy_info_t * 8906 segvn_getpolicy(struct seg *seg, caddr_t addr) 8907 { 8908 struct anon_map *amp; 8909 ulong_t anon_index; 8910 lgrp_mem_policy_info_t *policy_info; 8911 struct segvn_data *svn_data; 8912 u_offset_t vn_off; 8913 vnode_t *vp; 8914 8915 ASSERT(seg != NULL); 8916 8917 svn_data = (struct segvn_data *)seg->s_data; 8918 if (svn_data == NULL) 8919 return (NULL); 8920 8921 /* 8922 * Get policy info for private or shared memory 8923 */ 8924 if (svn_data->type != MAP_SHARED) { 8925 if (svn_data->tr_state != SEGVN_TR_ON) { 8926 policy_info = &svn_data->policy_info; 8927 } else { 8928 policy_info = &svn_data->tr_policy_info; 8929 ASSERT(policy_info->mem_policy == 8930 LGRP_MEM_POLICY_NEXT_SEG); 8931 } 8932 } else { 8933 amp = svn_data->amp; 8934 anon_index = svn_data->anon_index + seg_page(seg, addr); 8935 vp = svn_data->vp; 8936 vn_off = svn_data->offset + (uintptr_t)(addr - seg->s_base); 8937 policy_info = lgrp_shm_policy_get(amp, anon_index, vp, vn_off); 8938 } 8939 8940 return (policy_info); 8941 } 8942 8943 /*ARGSUSED*/ 8944 static int 8945 segvn_capable(struct seg *seg, segcapability_t capability) 8946 { 8947 return (0); 8948 } 8949 8950 /* 8951 * Bind text vnode segment to an amp. If we bind successfully mappings will be 8952 * established to per vnode mapping per lgroup amp pages instead of to vnode 8953 * pages. There's one amp per vnode text mapping per lgroup. Many processes 8954 * may share the same text replication amp. If a suitable amp doesn't already 8955 * exist in svntr hash table create a new one. We may fail to bind to amp if 8956 * segment is not eligible for text replication. Code below first checks for 8957 * these conditions. If binding is successful segment tr_state is set to on 8958 * and svd->amp points to the amp to use. Otherwise tr_state is set to off and 8959 * svd->amp remains as NULL. 8960 */ 8961 static void 8962 segvn_textrepl(struct seg *seg) 8963 { 8964 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 8965 vnode_t *vp = svd->vp; 8966 u_offset_t off = svd->offset; 8967 size_t size = seg->s_size; 8968 u_offset_t eoff = off + size; 8969 uint_t szc = seg->s_szc; 8970 ulong_t hash = SVNTR_HASH_FUNC(vp); 8971 svntr_t *svntrp; 8972 struct vattr va; 8973 proc_t *p = seg->s_as->a_proc; 8974 lgrp_id_t lgrp_id; 8975 lgrp_id_t olid; 8976 int first; 8977 struct anon_map *amp; 8978 8979 ASSERT(AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 8980 ASSERT(SEGVN_WRITE_HELD(seg->s_as, &svd->lock)); 8981 ASSERT(p != NULL); 8982 ASSERT(svd->tr_state == SEGVN_TR_INIT); 8983 ASSERT(!HAT_IS_REGION_COOKIE_VALID(svd->rcookie)); 8984 ASSERT(svd->flags & MAP_TEXT); 8985 ASSERT(svd->type == MAP_PRIVATE); 8986 ASSERT(vp != NULL && svd->amp == NULL); 8987 ASSERT(!svd->pageprot && !(svd->prot & PROT_WRITE)); 8988 ASSERT(!(svd->flags & MAP_NORESERVE) && svd->swresv == 0); 8989 ASSERT(seg->s_as != &kas); 8990 ASSERT(off < eoff); 8991 ASSERT(svntr_hashtab != NULL); 8992 8993 /* 8994 * If numa optimizations are no longer desired bail out. 8995 */ 8996 if (!lgrp_optimizations()) { 8997 svd->tr_state = SEGVN_TR_OFF; 8998 return; 8999 } 9000 9001 /* 9002 * Avoid creating anon maps with size bigger than the file size. 9003 * If VOP_GETATTR() call fails bail out. 
9004 */ 9005 va.va_mask = AT_SIZE | AT_MTIME | AT_CTIME; 9006 if (VOP_GETATTR(vp, &va, 0, svd->cred) != 0) { 9007 svd->tr_state = SEGVN_TR_OFF; 9008 SEGVN_TR_ADDSTAT(gaerr); 9009 return; 9010 } 9011 if (btopr(va.va_size) < btopr(eoff)) { 9012 svd->tr_state = SEGVN_TR_OFF; 9013 SEGVN_TR_ADDSTAT(overmap); 9014 return; 9015 } 9016 9017 /* 9018 * VVMEXEC may not be set yet if exec() prefaults text segment. Set 9019 * this flag now before vn_is_mapped(V_WRITE) so that MAP_SHARED 9020 * mapping that checks if trcache for this vnode needs to be 9021 * invalidated can't miss us. 9022 */ 9023 if (!(vp->v_flag & VVMEXEC)) { 9024 mutex_enter(&vp->v_lock); 9025 vp->v_flag |= VVMEXEC; 9026 mutex_exit(&vp->v_lock); 9027 } 9028 mutex_enter(&svntr_hashtab[hash].tr_lock); 9029 /* 9030 * Bail out if potentially MAP_SHARED writable mappings exist to this 9031 * vnode. We don't want to use old file contents from existing 9032 * replicas if this mapping was established after the original file 9033 * was changed. 9034 */ 9035 if (vn_is_mapped(vp, V_WRITE)) { 9036 mutex_exit(&svntr_hashtab[hash].tr_lock); 9037 svd->tr_state = SEGVN_TR_OFF; 9038 SEGVN_TR_ADDSTAT(wrcnt); 9039 return; 9040 } 9041 svntrp = svntr_hashtab[hash].tr_head; 9042 for (; svntrp != NULL; svntrp = svntrp->tr_next) { 9043 ASSERT(svntrp->tr_refcnt != 0); 9044 if (svntrp->tr_vp != vp) { 9045 continue; 9046 } 9047 9048 /* 9049 * Bail out if the file or its attributes were changed after 9050 * this replication entry was created since we need to use the 9051 * latest file contents. Note that mtime test alone is not 9052 * sufficient because a user can explicitly change mtime via 9053 * utimes(2) interfaces back to the old value after modifiying 9054 * the file contents. To detect this case we also have to test 9055 * ctime which among other things records the time of the last 9056 * mtime change by utimes(2). ctime is not changed when the file 9057 * is only read or executed so we expect that typically existing 9058 * replication amp's can be used most of the time. 9059 */ 9060 if (!svntrp->tr_valid || 9061 svntrp->tr_mtime.tv_sec != va.va_mtime.tv_sec || 9062 svntrp->tr_mtime.tv_nsec != va.va_mtime.tv_nsec || 9063 svntrp->tr_ctime.tv_sec != va.va_ctime.tv_sec || 9064 svntrp->tr_ctime.tv_nsec != va.va_ctime.tv_nsec) { 9065 mutex_exit(&svntr_hashtab[hash].tr_lock); 9066 svd->tr_state = SEGVN_TR_OFF; 9067 SEGVN_TR_ADDSTAT(stale); 9068 return; 9069 } 9070 /* 9071 * if off, eoff and szc match current segment we found the 9072 * existing entry we can use. 9073 */ 9074 if (svntrp->tr_off == off && svntrp->tr_eoff == eoff && 9075 svntrp->tr_szc == szc) { 9076 break; 9077 } 9078 /* 9079 * Don't create different but overlapping in file offsets 9080 * entries to avoid replication of the same file pages more 9081 * than once per lgroup. 9082 */ 9083 if ((off >= svntrp->tr_off && off < svntrp->tr_eoff) || 9084 (eoff > svntrp->tr_off && eoff <= svntrp->tr_eoff)) { 9085 mutex_exit(&svntr_hashtab[hash].tr_lock); 9086 svd->tr_state = SEGVN_TR_OFF; 9087 SEGVN_TR_ADDSTAT(overlap); 9088 return; 9089 } 9090 } 9091 /* 9092 * If we didn't find existing entry create a new one. 
9093 */ 9094 if (svntrp == NULL) { 9095 svntrp = kmem_cache_alloc(svntr_cache, KM_NOSLEEP); 9096 if (svntrp == NULL) { 9097 mutex_exit(&svntr_hashtab[hash].tr_lock); 9098 svd->tr_state = SEGVN_TR_OFF; 9099 SEGVN_TR_ADDSTAT(nokmem); 9100 return; 9101 } 9102 #ifdef DEBUG 9103 { 9104 lgrp_id_t i; 9105 for (i = 0; i < NLGRPS_MAX; i++) { 9106 ASSERT(svntrp->tr_amp[i] == NULL); 9107 } 9108 } 9109 #endif /* DEBUG */ 9110 svntrp->tr_vp = vp; 9111 svntrp->tr_off = off; 9112 svntrp->tr_eoff = eoff; 9113 svntrp->tr_szc = szc; 9114 svntrp->tr_valid = 1; 9115 svntrp->tr_mtime = va.va_mtime; 9116 svntrp->tr_ctime = va.va_ctime; 9117 svntrp->tr_refcnt = 0; 9118 svntrp->tr_next = svntr_hashtab[hash].tr_head; 9119 svntr_hashtab[hash].tr_head = svntrp; 9120 } 9121 first = 1; 9122 again: 9123 /* 9124 * We want to pick a replica with pages on main thread's (t_tid = 1, 9125 * aka T1) lgrp. Currently text replication is only optimized for 9126 * workloads that either have all threads of a process on the same 9127 * lgrp or execute their large text primarily on main thread. 9128 */ 9129 lgrp_id = p->p_t1_lgrpid; 9130 if (lgrp_id == LGRP_NONE) { 9131 /* 9132 * In case exec() prefaults text on non main thread use 9133 * current thread lgrpid. It will become main thread anyway 9134 * soon. 9135 */ 9136 lgrp_id = lgrp_home_id(curthread); 9137 } 9138 /* 9139 * Set p_tr_lgrpid to lgrpid if it hasn't been set yet. Otherwise 9140 * just set it to NLGRPS_MAX if it's different from current process T1 9141 * home lgrp. p_tr_lgrpid is used to detect if process uses text 9142 * replication and T1 new home is different from lgrp used for text 9143 * replication. When this happens asyncronous segvn thread rechecks if 9144 * segments should change lgrps used for text replication. If we fail 9145 * to set p_tr_lgrpid with cas32 then set it to NLGRPS_MAX without cas 9146 * if it's not already NLGRPS_MAX and not equal lgrp_id we want to 9147 * use. We don't need to use cas in this case because another thread 9148 * that races in between our non atomic check and set may only change 9149 * p_tr_lgrpid to NLGRPS_MAX at this point. 9150 */ 9151 ASSERT(lgrp_id != LGRP_NONE && lgrp_id < NLGRPS_MAX); 9152 olid = p->p_tr_lgrpid; 9153 if (lgrp_id != olid && olid != NLGRPS_MAX) { 9154 lgrp_id_t nlid = (olid == LGRP_NONE) ? lgrp_id : NLGRPS_MAX; 9155 if (cas32((uint32_t *)&p->p_tr_lgrpid, olid, nlid) != olid) { 9156 olid = p->p_tr_lgrpid; 9157 ASSERT(olid != LGRP_NONE); 9158 if (olid != lgrp_id && olid != NLGRPS_MAX) { 9159 p->p_tr_lgrpid = NLGRPS_MAX; 9160 } 9161 } 9162 ASSERT(p->p_tr_lgrpid != LGRP_NONE); 9163 membar_producer(); 9164 /* 9165 * lgrp_move_thread() won't schedule async recheck after 9166 * p->p_t1_lgrpid update unless p->p_tr_lgrpid is not 9167 * LGRP_NONE. Recheck p_t1_lgrpid once now that p->p_tr_lgrpid 9168 * is not LGRP_NONE. 9169 */ 9170 if (first && p->p_t1_lgrpid != LGRP_NONE && 9171 p->p_t1_lgrpid != lgrp_id) { 9172 first = 0; 9173 goto again; 9174 } 9175 } 9176 /* 9177 * If no amp was created yet for lgrp_id create a new one as long as 9178 * we have enough memory to afford it. 
9179 */ 9180 if ((amp = svntrp->tr_amp[lgrp_id]) == NULL) { 9181 size_t trmem = atomic_add_long_nv(&segvn_textrepl_bytes, size); 9182 if (trmem > segvn_textrepl_max_bytes) { 9183 SEGVN_TR_ADDSTAT(normem); 9184 goto fail; 9185 } 9186 if (anon_try_resv_zone(size, NULL) == 0) { 9187 SEGVN_TR_ADDSTAT(noanon); 9188 goto fail; 9189 } 9190 amp = anonmap_alloc(size, size, ANON_NOSLEEP); 9191 if (amp == NULL) { 9192 anon_unresv_zone(size, NULL); 9193 SEGVN_TR_ADDSTAT(nokmem); 9194 goto fail; 9195 } 9196 ASSERT(amp->refcnt == 1); 9197 amp->a_szc = szc; 9198 svntrp->tr_amp[lgrp_id] = amp; 9199 SEGVN_TR_ADDSTAT(newamp); 9200 } 9201 svntrp->tr_refcnt++; 9202 ASSERT(svd->svn_trnext == NULL); 9203 ASSERT(svd->svn_trprev == NULL); 9204 svd->svn_trnext = svntrp->tr_svnhead; 9205 svd->svn_trprev = NULL; 9206 if (svntrp->tr_svnhead != NULL) { 9207 svntrp->tr_svnhead->svn_trprev = svd; 9208 } 9209 svntrp->tr_svnhead = svd; 9210 ASSERT(amp->a_szc == szc && amp->size == size && amp->swresv == size); 9211 ASSERT(amp->refcnt >= 1); 9212 svd->amp = amp; 9213 svd->anon_index = 0; 9214 svd->tr_policy_info.mem_policy = LGRP_MEM_POLICY_NEXT_SEG; 9215 svd->tr_policy_info.mem_lgrpid = lgrp_id; 9216 svd->tr_state = SEGVN_TR_ON; 9217 mutex_exit(&svntr_hashtab[hash].tr_lock); 9218 SEGVN_TR_ADDSTAT(repl); 9219 return; 9220 fail: 9221 ASSERT(segvn_textrepl_bytes >= size); 9222 atomic_add_long(&segvn_textrepl_bytes, -size); 9223 ASSERT(svntrp != NULL); 9224 ASSERT(svntrp->tr_amp[lgrp_id] == NULL); 9225 if (svntrp->tr_refcnt == 0) { 9226 ASSERT(svntrp == svntr_hashtab[hash].tr_head); 9227 svntr_hashtab[hash].tr_head = svntrp->tr_next; 9228 mutex_exit(&svntr_hashtab[hash].tr_lock); 9229 kmem_cache_free(svntr_cache, svntrp); 9230 } else { 9231 mutex_exit(&svntr_hashtab[hash].tr_lock); 9232 } 9233 svd->tr_state = SEGVN_TR_OFF; 9234 } 9235 9236 /* 9237 * Convert seg back to regular vnode mapping seg by unbinding it from its text 9238 * replication amp. This routine is most typically called when segment is 9239 * unmapped but can also be called when segment no longer qualifies for text 9240 * replication (e.g. due to protection changes). If unload_unmap is set use 9241 * HAT_UNLOAD_UNMAP flag in hat_unload_callback(). If we are the last user of 9242 * svntr free all its anon maps and remove it from the hash table. 
9243 */ 9244 static void 9245 segvn_textunrepl(struct seg *seg, int unload_unmap) 9246 { 9247 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 9248 vnode_t *vp = svd->vp; 9249 u_offset_t off = svd->offset; 9250 size_t size = seg->s_size; 9251 u_offset_t eoff = off + size; 9252 uint_t szc = seg->s_szc; 9253 ulong_t hash = SVNTR_HASH_FUNC(vp); 9254 svntr_t *svntrp; 9255 svntr_t **prv_svntrp; 9256 lgrp_id_t lgrp_id = svd->tr_policy_info.mem_lgrpid; 9257 lgrp_id_t i; 9258 9259 ASSERT(AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 9260 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) || 9261 SEGVN_WRITE_HELD(seg->s_as, &svd->lock)); 9262 ASSERT(svd->tr_state == SEGVN_TR_ON); 9263 ASSERT(!HAT_IS_REGION_COOKIE_VALID(svd->rcookie)); 9264 ASSERT(svd->amp != NULL); 9265 ASSERT(svd->amp->refcnt >= 1); 9266 ASSERT(svd->anon_index == 0); 9267 ASSERT(lgrp_id != LGRP_NONE && lgrp_id < NLGRPS_MAX); 9268 ASSERT(svntr_hashtab != NULL); 9269 9270 mutex_enter(&svntr_hashtab[hash].tr_lock); 9271 prv_svntrp = &svntr_hashtab[hash].tr_head; 9272 for (; (svntrp = *prv_svntrp) != NULL; prv_svntrp = &svntrp->tr_next) { 9273 ASSERT(svntrp->tr_refcnt != 0); 9274 if (svntrp->tr_vp == vp && svntrp->tr_off == off && 9275 svntrp->tr_eoff == eoff && svntrp->tr_szc == szc) { 9276 break; 9277 } 9278 } 9279 if (svntrp == NULL) { 9280 panic("segvn_textunrepl: svntr record not found"); 9281 } 9282 if (svntrp->tr_amp[lgrp_id] != svd->amp) { 9283 panic("segvn_textunrepl: amp mismatch"); 9284 } 9285 svd->tr_state = SEGVN_TR_OFF; 9286 svd->amp = NULL; 9287 if (svd->svn_trprev == NULL) { 9288 ASSERT(svntrp->tr_svnhead == svd); 9289 svntrp->tr_svnhead = svd->svn_trnext; 9290 if (svntrp->tr_svnhead != NULL) { 9291 svntrp->tr_svnhead->svn_trprev = NULL; 9292 } 9293 svd->svn_trnext = NULL; 9294 } else { 9295 svd->svn_trprev->svn_trnext = svd->svn_trnext; 9296 if (svd->svn_trnext != NULL) { 9297 svd->svn_trnext->svn_trprev = svd->svn_trprev; 9298 svd->svn_trnext = NULL; 9299 } 9300 svd->svn_trprev = NULL; 9301 } 9302 if (--svntrp->tr_refcnt) { 9303 mutex_exit(&svntr_hashtab[hash].tr_lock); 9304 goto done; 9305 } 9306 *prv_svntrp = svntrp->tr_next; 9307 mutex_exit(&svntr_hashtab[hash].tr_lock); 9308 for (i = 0; i < NLGRPS_MAX; i++) { 9309 struct anon_map *amp = svntrp->tr_amp[i]; 9310 if (amp == NULL) { 9311 continue; 9312 } 9313 ASSERT(amp->refcnt == 1); 9314 ASSERT(amp->swresv == size); 9315 ASSERT(amp->size == size); 9316 ASSERT(amp->a_szc == szc); 9317 if (amp->a_szc != 0) { 9318 anon_free_pages(amp->ahp, 0, size, szc); 9319 } else { 9320 anon_free(amp->ahp, 0, size); 9321 } 9322 svntrp->tr_amp[i] = NULL; 9323 ASSERT(segvn_textrepl_bytes >= size); 9324 atomic_add_long(&segvn_textrepl_bytes, -size); 9325 anon_unresv_zone(amp->swresv, NULL); 9326 amp->refcnt = 0; 9327 anonmap_free(amp); 9328 } 9329 kmem_cache_free(svntr_cache, svntrp); 9330 done: 9331 hat_unload_callback(seg->s_as->a_hat, seg->s_base, size, 9332 unload_unmap ? HAT_UNLOAD_UNMAP : 0, NULL); 9333 } 9334 9335 /* 9336 * This is called when a MAP_SHARED writabble mapping is created to a vnode 9337 * that is currently used for execution (VVMEXEC flag is set). In this case we 9338 * need to prevent further use of existing replicas. 
9339 */ 9340 static void 9341 segvn_inval_trcache(vnode_t *vp) 9342 { 9343 ulong_t hash = SVNTR_HASH_FUNC(vp); 9344 svntr_t *svntrp; 9345 9346 ASSERT(vp->v_flag & VVMEXEC); 9347 9348 if (svntr_hashtab == NULL) { 9349 return; 9350 } 9351 9352 mutex_enter(&svntr_hashtab[hash].tr_lock); 9353 svntrp = svntr_hashtab[hash].tr_head; 9354 for (; svntrp != NULL; svntrp = svntrp->tr_next) { 9355 ASSERT(svntrp->tr_refcnt != 0); 9356 if (svntrp->tr_vp == vp && svntrp->tr_valid) { 9357 svntrp->tr_valid = 0; 9358 } 9359 } 9360 mutex_exit(&svntr_hashtab[hash].tr_lock); 9361 } 9362 9363 static void 9364 segvn_trasync_thread(void) 9365 { 9366 callb_cpr_t cpr_info; 9367 kmutex_t cpr_lock; /* just for CPR stuff */ 9368 9369 mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL); 9370 9371 CALLB_CPR_INIT(&cpr_info, &cpr_lock, 9372 callb_generic_cpr, "segvn_async"); 9373 9374 if (segvn_update_textrepl_interval == 0) { 9375 segvn_update_textrepl_interval = segvn_update_tr_time * hz; 9376 } else { 9377 segvn_update_textrepl_interval *= hz; 9378 } 9379 (void) timeout(segvn_trupdate_wakeup, NULL, 9380 segvn_update_textrepl_interval); 9381 9382 for (;;) { 9383 mutex_enter(&cpr_lock); 9384 CALLB_CPR_SAFE_BEGIN(&cpr_info); 9385 mutex_exit(&cpr_lock); 9386 sema_p(&segvn_trasync_sem); 9387 mutex_enter(&cpr_lock); 9388 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock); 9389 mutex_exit(&cpr_lock); 9390 segvn_trupdate(); 9391 } 9392 } 9393 9394 static uint64_t segvn_lgrp_trthr_migrs_snpsht = 0; 9395 9396 static void 9397 segvn_trupdate_wakeup(void *dummy) 9398 { 9399 uint64_t cur_lgrp_trthr_migrs = lgrp_get_trthr_migrations(); 9400 9401 if (cur_lgrp_trthr_migrs != segvn_lgrp_trthr_migrs_snpsht) { 9402 segvn_lgrp_trthr_migrs_snpsht = cur_lgrp_trthr_migrs; 9403 sema_v(&segvn_trasync_sem); 9404 } 9405 9406 if (!segvn_disable_textrepl_update && 9407 segvn_update_textrepl_interval != 0) { 9408 (void) timeout(segvn_trupdate_wakeup, dummy, 9409 segvn_update_textrepl_interval); 9410 } 9411 } 9412 9413 static void 9414 segvn_trupdate(void) 9415 { 9416 ulong_t hash; 9417 svntr_t *svntrp; 9418 segvn_data_t *svd; 9419 9420 ASSERT(svntr_hashtab != NULL); 9421 9422 for (hash = 0; hash < svntr_hashtab_sz; hash++) { 9423 mutex_enter(&svntr_hashtab[hash].tr_lock); 9424 svntrp = svntr_hashtab[hash].tr_head; 9425 for (; svntrp != NULL; svntrp = svntrp->tr_next) { 9426 ASSERT(svntrp->tr_refcnt != 0); 9427 svd = svntrp->tr_svnhead; 9428 for (; svd != NULL; svd = svd->svn_trnext) { 9429 segvn_trupdate_seg(svd->seg, svd, svntrp, 9430 hash); 9431 } 9432 } 9433 mutex_exit(&svntr_hashtab[hash].tr_lock); 9434 } 9435 } 9436 9437 static void 9438 segvn_trupdate_seg(struct seg *seg, 9439 segvn_data_t *svd, 9440 svntr_t *svntrp, 9441 ulong_t hash) 9442 { 9443 proc_t *p; 9444 lgrp_id_t lgrp_id; 9445 struct as *as; 9446 size_t size; 9447 struct anon_map *amp; 9448 9449 ASSERT(svd->vp != NULL); 9450 ASSERT(svd->vp == svntrp->tr_vp); 9451 ASSERT(svd->offset == svntrp->tr_off); 9452 ASSERT(svd->offset + seg->s_size == svntrp->tr_eoff); 9453 ASSERT(seg != NULL); 9454 ASSERT(svd->seg == seg); 9455 ASSERT(seg->s_data == (void *)svd); 9456 ASSERT(seg->s_szc == svntrp->tr_szc); 9457 ASSERT(svd->tr_state == SEGVN_TR_ON); 9458 ASSERT(!HAT_IS_REGION_COOKIE_VALID(svd->rcookie)); 9459 ASSERT(svd->amp != NULL); 9460 ASSERT(svd->tr_policy_info.mem_policy == LGRP_MEM_POLICY_NEXT_SEG); 9461 ASSERT(svd->tr_policy_info.mem_lgrpid != LGRP_NONE); 9462 ASSERT(svd->tr_policy_info.mem_lgrpid < NLGRPS_MAX); 9463 ASSERT(svntrp->tr_amp[svd->tr_policy_info.mem_lgrpid] == svd->amp); 9464 
ASSERT(svntrp->tr_refcnt != 0); 9465 ASSERT(mutex_owned(&svntr_hashtab[hash].tr_lock)); 9466 9467 as = seg->s_as; 9468 ASSERT(as != NULL && as != &kas); 9469 p = as->a_proc; 9470 ASSERT(p != NULL); 9471 ASSERT(p->p_tr_lgrpid != LGRP_NONE); 9472 lgrp_id = p->p_t1_lgrpid; 9473 if (lgrp_id == LGRP_NONE) { 9474 return; 9475 } 9476 ASSERT(lgrp_id < NLGRPS_MAX); 9477 if (svd->tr_policy_info.mem_lgrpid == lgrp_id) { 9478 return; 9479 } 9480 9481 /* 9482 * Use tryenter locking since we are locking as/seg and svntr hash 9483 * lock in reverse from syncrounous thread order. 9484 */ 9485 if (!AS_LOCK_TRYENTER(as, &as->a_lock, RW_READER)) { 9486 SEGVN_TR_ADDSTAT(nolock); 9487 if (segvn_lgrp_trthr_migrs_snpsht) { 9488 segvn_lgrp_trthr_migrs_snpsht = 0; 9489 } 9490 return; 9491 } 9492 if (!SEGVN_LOCK_TRYENTER(seg->s_as, &svd->lock, RW_WRITER)) { 9493 AS_LOCK_EXIT(as, &as->a_lock); 9494 SEGVN_TR_ADDSTAT(nolock); 9495 if (segvn_lgrp_trthr_migrs_snpsht) { 9496 segvn_lgrp_trthr_migrs_snpsht = 0; 9497 } 9498 return; 9499 } 9500 size = seg->s_size; 9501 if (svntrp->tr_amp[lgrp_id] == NULL) { 9502 size_t trmem = atomic_add_long_nv(&segvn_textrepl_bytes, size); 9503 if (trmem > segvn_textrepl_max_bytes) { 9504 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 9505 AS_LOCK_EXIT(as, &as->a_lock); 9506 atomic_add_long(&segvn_textrepl_bytes, -size); 9507 SEGVN_TR_ADDSTAT(normem); 9508 return; 9509 } 9510 if (anon_try_resv_zone(size, NULL) == 0) { 9511 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 9512 AS_LOCK_EXIT(as, &as->a_lock); 9513 atomic_add_long(&segvn_textrepl_bytes, -size); 9514 SEGVN_TR_ADDSTAT(noanon); 9515 return; 9516 } 9517 amp = anonmap_alloc(size, size, KM_NOSLEEP); 9518 if (amp == NULL) { 9519 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 9520 AS_LOCK_EXIT(as, &as->a_lock); 9521 atomic_add_long(&segvn_textrepl_bytes, -size); 9522 anon_unresv_zone(size, NULL); 9523 SEGVN_TR_ADDSTAT(nokmem); 9524 return; 9525 } 9526 ASSERT(amp->refcnt == 1); 9527 amp->a_szc = seg->s_szc; 9528 svntrp->tr_amp[lgrp_id] = amp; 9529 } 9530 /* 9531 * We don't need to drop the bucket lock but here we give other 9532 * threads a chance. svntr and svd can't be unlinked as long as 9533 * segment lock is held as a writer and AS held as well. After we 9534 * retake bucket lock we'll continue from where we left. We'll be able 9535 * to reach the end of either list since new entries are always added 9536 * to the beginning of the lists. 9537 */ 9538 mutex_exit(&svntr_hashtab[hash].tr_lock); 9539 hat_unload_callback(as->a_hat, seg->s_base, size, 0, NULL); 9540 mutex_enter(&svntr_hashtab[hash].tr_lock); 9541 9542 ASSERT(svd->tr_state == SEGVN_TR_ON); 9543 ASSERT(svd->amp != NULL); 9544 ASSERT(svd->tr_policy_info.mem_policy == LGRP_MEM_POLICY_NEXT_SEG); 9545 ASSERT(svd->tr_policy_info.mem_lgrpid != lgrp_id); 9546 ASSERT(svd->amp != svntrp->tr_amp[lgrp_id]); 9547 9548 svd->tr_policy_info.mem_lgrpid = lgrp_id; 9549 svd->amp = svntrp->tr_amp[lgrp_id]; 9550 p->p_tr_lgrpid = NLGRPS_MAX; 9551 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 9552 AS_LOCK_EXIT(as, &as->a_lock); 9553 9554 ASSERT(svntrp->tr_refcnt != 0); 9555 ASSERT(svd->vp == svntrp->tr_vp); 9556 ASSERT(svd->tr_policy_info.mem_lgrpid == lgrp_id); 9557 ASSERT(svd->amp != NULL && svd->amp == svntrp->tr_amp[lgrp_id]); 9558 ASSERT(svd->seg == seg); 9559 ASSERT(svd->tr_state == SEGVN_TR_ON); 9560 9561 SEGVN_TR_ADDSTAT(asyncrepl); 9562 } 9563
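/*
 * Illustrative user-level sketch (for exposition only; the routine names
 * hint_and_lock, fd and len are hypothetical): the segment operations in
 * this file are normally reached through memcntl(2), which as_ctl()
 * dispatches to the segment driver, so madvise(3C) lands in segvn_advise()
 * and mlock(3C)/munlock(3C) land in segvn_lockop() with MC_LOCK/MC_UNLOCK.
 * Assuming an open file descriptor fd and sufficient privilege to lock
 * memory:
 *
 *	#include <sys/types.h>
 *	#include <sys/mman.h>
 *
 *	void
 *	hint_and_lock(int fd, size_t len)
 *	{
 *		void *p = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
 *
 *		if (p == MAP_FAILED)
 *			return;
 *		(void) madvise(p, len, MADV_SEQUENTIAL);
 *		(void) mlock(p, len);
 *		(void) munlock(p, len);
 *		(void) munmap(p, len);
 *	}
 */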