1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 37 */ 38 39 #pragma ident "%Z%%M% %I% %E% SMI" 40 41 /* 42 * VM - shared or copy-on-write from a vnode/anonymous memory. 43 */ 44 45 #include <sys/types.h> 46 #include <sys/param.h> 47 #include <sys/t_lock.h> 48 #include <sys/errno.h> 49 #include <sys/systm.h> 50 #include <sys/mman.h> 51 #include <sys/debug.h> 52 #include <sys/cred.h> 53 #include <sys/vmsystm.h> 54 #include <sys/tuneable.h> 55 #include <sys/bitmap.h> 56 #include <sys/swap.h> 57 #include <sys/kmem.h> 58 #include <sys/sysmacros.h> 59 #include <sys/vtrace.h> 60 #include <sys/cmn_err.h> 61 #include <sys/callb.h> 62 #include <sys/vm.h> 63 #include <sys/dumphdr.h> 64 #include <sys/lgrp.h> 65 66 #include <vm/hat.h> 67 #include <vm/as.h> 68 #include <vm/seg.h> 69 #include <vm/seg_vn.h> 70 #include <vm/pvn.h> 71 #include <vm/anon.h> 72 #include <vm/page.h> 73 #include <vm/vpage.h> 74 #include <sys/proc.h> 75 #include <sys/task.h> 76 #include <sys/project.h> 77 #include <sys/zone.h> 78 #include <sys/shm_impl.h> 79 /* 80 * Private seg op routines. 
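 *
 * These routines are not meant to be called directly; they are installed
 * in the segvn_ops vector below and reached through the generic seg layer
 * (for example via the SEGOP_FAULT()/SEGOP_DUP() style dispatch macros in
 * <vm/seg.h>).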
 */
static int	segvn_dup(struct seg *seg, struct seg *newseg);
static int	segvn_unmap(struct seg *seg, caddr_t addr, size_t len);
static void	segvn_free(struct seg *seg);
static faultcode_t segvn_fault(struct hat *hat, struct seg *seg,
		    caddr_t addr, size_t len, enum fault_type type,
		    enum seg_rw rw);
static faultcode_t segvn_faulta(struct seg *seg, caddr_t addr);
static int	segvn_setprot(struct seg *seg, caddr_t addr,
		    size_t len, uint_t prot);
static int	segvn_checkprot(struct seg *seg, caddr_t addr,
		    size_t len, uint_t prot);
static int	segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta);
static size_t	segvn_swapout(struct seg *seg);
static int	segvn_sync(struct seg *seg, caddr_t addr, size_t len,
		    int attr, uint_t flags);
static size_t	segvn_incore(struct seg *seg, caddr_t addr, size_t len,
		    char *vec);
static int	segvn_lockop(struct seg *seg, caddr_t addr, size_t len,
		    int attr, int op, ulong_t *lockmap, size_t pos);
static int	segvn_getprot(struct seg *seg, caddr_t addr, size_t len,
		    uint_t *protv);
static u_offset_t	segvn_getoffset(struct seg *seg, caddr_t addr);
static int	segvn_gettype(struct seg *seg, caddr_t addr);
static int	segvn_getvp(struct seg *seg, caddr_t addr,
		    struct vnode **vpp);
static int	segvn_advise(struct seg *seg, caddr_t addr, size_t len,
		    uint_t behav);
static void	segvn_dump(struct seg *seg);
static int	segvn_pagelock(struct seg *seg, caddr_t addr, size_t len,
		    struct page ***ppp, enum lock_type type, enum seg_rw rw);
static int	segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len,
		    uint_t szc);
static int	segvn_getmemid(struct seg *seg, caddr_t addr,
		    memid_t *memidp);
static lgrp_mem_policy_info_t	*segvn_getpolicy(struct seg *, caddr_t);
static int	segvn_capable(struct seg *seg, segcapability_t capable);

struct seg_ops segvn_ops = {
	segvn_dup,
	segvn_unmap,
	segvn_free,
	segvn_fault,
	segvn_faulta,
	segvn_setprot,
	segvn_checkprot,
	segvn_kluster,
	segvn_swapout,
	segvn_sync,
	segvn_incore,
	segvn_lockop,
	segvn_getprot,
	segvn_getoffset,
	segvn_gettype,
	segvn_getvp,
	segvn_advise,
	segvn_dump,
	segvn_pagelock,
	segvn_setpagesize,
	segvn_getmemid,
	segvn_getpolicy,
	segvn_capable,
};

/*
 * Common zfod structures, provided as a shorthand for others to use.
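 *
 * A typical use (illustrative sketch; the actual callers live elsewhere in
 * the VM code) is to hand one of the *_argsp pointers below to as_map() as
 * the create argument for an anonymous zero-fill-on-demand mapping, e.g.
 *
 *	(void) as_map(as, addr, len, segvn_create, zfod_argsp);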
147 */ 148 static segvn_crargs_t zfod_segvn_crargs = 149 SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL); 150 static segvn_crargs_t kzfod_segvn_crargs = 151 SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_USER, 152 PROT_ALL & ~PROT_USER); 153 static segvn_crargs_t stack_noexec_crargs = 154 SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_EXEC, PROT_ALL); 155 156 caddr_t zfod_argsp = (caddr_t)&zfod_segvn_crargs; /* user zfod argsp */ 157 caddr_t kzfod_argsp = (caddr_t)&kzfod_segvn_crargs; /* kernel zfod argsp */ 158 caddr_t stack_exec_argsp = (caddr_t)&zfod_segvn_crargs; /* executable stack */ 159 caddr_t stack_noexec_argsp = (caddr_t)&stack_noexec_crargs; /* noexec stack */ 160 161 #define vpgtob(n) ((n) * sizeof (struct vpage)) /* For brevity */ 162 163 size_t segvn_comb_thrshld = UINT_MAX; /* patchable -- see 1196681 */ 164 165 static int segvn_concat(struct seg *, struct seg *, int); 166 static int segvn_extend_prev(struct seg *, struct seg *, 167 struct segvn_crargs *, size_t); 168 static int segvn_extend_next(struct seg *, struct seg *, 169 struct segvn_crargs *, size_t); 170 static void segvn_softunlock(struct seg *, caddr_t, size_t, enum seg_rw); 171 static void segvn_pagelist_rele(page_t **); 172 static void segvn_setvnode_mpss(vnode_t *); 173 static void segvn_relocate_pages(page_t **, page_t *); 174 static int segvn_full_szcpages(page_t **, uint_t, int *, uint_t *); 175 static int segvn_fill_vp_pages(struct segvn_data *, vnode_t *, u_offset_t, 176 uint_t, page_t **, page_t **, uint_t *, int *); 177 static faultcode_t segvn_fault_vnodepages(struct hat *, struct seg *, caddr_t, 178 caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int); 179 static faultcode_t segvn_fault_anonpages(struct hat *, struct seg *, caddr_t, 180 caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int); 181 static faultcode_t segvn_faultpage(struct hat *, struct seg *, caddr_t, 182 u_offset_t, struct vpage *, page_t **, uint_t, 183 enum fault_type, enum seg_rw, int, int); 184 static void segvn_vpage(struct seg *); 185 186 static void segvn_purge(struct seg *seg); 187 static int segvn_reclaim(struct seg *, caddr_t, size_t, struct page **, 188 enum seg_rw); 189 190 static int sameprot(struct seg *, caddr_t, size_t); 191 192 static int segvn_demote_range(struct seg *, caddr_t, size_t, int, uint_t); 193 static int segvn_clrszc(struct seg *); 194 static struct seg *segvn_split_seg(struct seg *, caddr_t); 195 static int segvn_claim_pages(struct seg *, struct vpage *, u_offset_t, 196 ulong_t, uint_t); 197 198 static void segvn_hat_rgn_unload_callback(caddr_t, caddr_t, caddr_t, 199 size_t, void *, u_offset_t); 200 201 static int segvn_slock_anonpages(page_t *, int); 202 static void segvn_sunlock_anonpages(page_t *, int); 203 204 static struct kmem_cache *segvn_cache; 205 206 #ifdef VM_STATS 207 static struct segvnvmstats_str { 208 ulong_t fill_vp_pages[31]; 209 ulong_t fltvnpages[49]; 210 ulong_t fullszcpages[10]; 211 ulong_t relocatepages[3]; 212 ulong_t fltanpages[17]; 213 ulong_t pagelock[3]; 214 ulong_t demoterange[3]; 215 } segvnvmstats; 216 #endif /* VM_STATS */ 217 218 #define SDR_RANGE 1 /* demote entire range */ 219 #define SDR_END 2 /* demote non aligned ends only */ 220 221 #define CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr) { \ 222 if ((len) != 0) { \ 223 lpgaddr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz); \ 224 ASSERT(lpgaddr >= (seg)->s_base); \ 225 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)((addr) + \ 226 (len)), pgsz); \ 227 ASSERT(lpgeaddr > lpgaddr); \ 228 ASSERT(lpgeaddr <= (seg)->s_base + (seg)->s_size); \ 229 
} else { \ 230 lpgeaddr = lpgaddr = (addr); \ 231 } \ 232 } 233 234 /*ARGSUSED*/ 235 static int 236 segvn_cache_constructor(void *buf, void *cdrarg, int kmflags) 237 { 238 struct segvn_data *svd = buf; 239 240 rw_init(&svd->lock, NULL, RW_DEFAULT, NULL); 241 mutex_init(&svd->segp_slock, NULL, MUTEX_DEFAULT, NULL); 242 svd->svn_trnext = svd->svn_trprev = NULL; 243 return (0); 244 } 245 246 /*ARGSUSED1*/ 247 static void 248 segvn_cache_destructor(void *buf, void *cdrarg) 249 { 250 struct segvn_data *svd = buf; 251 252 rw_destroy(&svd->lock); 253 mutex_destroy(&svd->segp_slock); 254 } 255 256 /*ARGSUSED*/ 257 static int 258 svntr_cache_constructor(void *buf, void *cdrarg, int kmflags) 259 { 260 bzero(buf, sizeof (svntr_t)); 261 return (0); 262 } 263 264 /* 265 * Patching this variable to non-zero allows the system to run with 266 * stacks marked as "not executable". It's a bit of a kludge, but is 267 * provided as a tweakable for platforms that export those ABIs 268 * (e.g. sparc V8) that have executable stacks enabled by default. 269 * There are also some restrictions for platforms that don't actually 270 * implement 'noexec' protections. 271 * 272 * Once enabled, the system is (therefore) unable to provide a fully 273 * ABI-compliant execution environment, though practically speaking, 274 * most everything works. The exceptions are generally some interpreters 275 * and debuggers that create executable code on the stack and jump 276 * into it (without explicitly mprotecting the address range to include 277 * PROT_EXEC). 278 * 279 * One important class of applications that are disabled are those 280 * that have been transformed into malicious agents using one of the 281 * numerous "buffer overflow" attacks. See 4007890. 282 */ 283 int noexec_user_stack = 0; 284 int noexec_user_stack_log = 1; 285 286 int segvn_lpg_disable = 0; 287 uint_t segvn_maxpgszc = 0; 288 289 ulong_t segvn_vmpss_clrszc_cnt; 290 ulong_t segvn_vmpss_clrszc_err; 291 ulong_t segvn_fltvnpages_clrszc_cnt; 292 ulong_t segvn_fltvnpages_clrszc_err; 293 ulong_t segvn_setpgsz_align_err; 294 ulong_t segvn_setpgsz_anon_align_err; 295 ulong_t segvn_setpgsz_getattr_err; 296 ulong_t segvn_setpgsz_eof_err; 297 ulong_t segvn_faultvnmpss_align_err1; 298 ulong_t segvn_faultvnmpss_align_err2; 299 ulong_t segvn_faultvnmpss_align_err3; 300 ulong_t segvn_faultvnmpss_align_err4; 301 ulong_t segvn_faultvnmpss_align_err5; 302 ulong_t segvn_vmpss_pageio_deadlk_err; 303 304 int segvn_use_regions = 1; 305 306 /* 307 * Segvn supports text replication optimization for NUMA platforms. Text 308 * replica's are represented by anon maps (amp). There's one amp per text file 309 * region per lgroup. A process chooses the amp for each of its text mappings 310 * based on the lgroup assignment of its main thread (t_tid = 1). All 311 * processes that want a replica on a particular lgroup for the same text file 312 * mapping share the same amp. amp's are looked up in svntr_hashtab hash table 313 * with vp,off,size,szc used as a key. Text replication segments are read only 314 * MAP_PRIVATE|MAP_TEXT segments that map vnode. Replication is achieved by 315 * forcing COW faults from vnode to amp and mapping amp pages instead of vnode 316 * pages. Replication amp is assigned to a segment when it gets its first 317 * pagefault. To handle main thread lgroup rehoming segvn_trasync_thread 318 * rechecks periodically if the process still maps an amp local to the main 319 * thread. 
 * If not, the async thread forces the process to remap to an amp in the new
 * home lgroup of the main thread. The current text replication implementation
 * only benefits workloads that do most of their work in the main thread of a
 * process, or whose threads all run in the same lgroup. To extend the text
 * replication benefit to other types of multithreaded workloads, further work
 * would be needed in the hat layer to allow the same virtual address in the
 * same hat to simultaneously map different physical addresses (i.e. page
 * table replication would be needed for x86).
 *
 * amp pages are used instead of vnode pages as long as the segment has a very
 * simple life cycle.  It's created via segvn_create(), handles S_EXEC
 * (S_READ) pagefaults and is fully unmapped.  If anything more complicated
 * happens, such as the protection being changed, a real COW fault occurring,
 * the pagesize being changed, MC_LOCK being requested or the segment being
 * partially unmapped, we turn off text replication by converting the segment
 * back to a vnode-only segment (unmap the segment's address range and set
 * svd->amp to NULL).
 *
 * The original file can be changed after an amp is inserted into
 * svntr_hashtab.  Processes that are launched after the file has already
 * changed can't use the replicas created prior to the file change.  To
 * implement this functionality hash entries are timestamped.  A replica can
 * only be used if the current file modification time is the same as the
 * timestamp saved when the hash entry was created.  However timestamps alone
 * are not sufficient to detect file modification via mmap(MAP_SHARED)
 * mappings, so we deal with file changes via MAP_SHARED mappings differently.
 * When writable MAP_SHARED mappings are created to vnodes marked as
 * executable we mark all existing replicas for this vnode as not usable for
 * future text mappings.  And we don't create new replicas for files that
 * currently have potentially writable MAP_SHARED mappings (i.e.
 * vn_is_mapped(V_WRITE) is true).
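 *
 * In practice (see the "trok" test in segvn_create() below) a replica is
 * only set up for MAP_PRIVATE|MAP_TEXT mappings of regular files when
 * lgroup optimizations are enabled and the mapping is either larger than
 * textrepl_size_thresh or explicitly requested with _MAP_TEXTREPL.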
350 */ 351 352 #define SEGVN_TEXTREPL_MAXBYTES_FACTOR (20) 353 size_t segvn_textrepl_max_bytes_factor = SEGVN_TEXTREPL_MAXBYTES_FACTOR; 354 355 static ulong_t svntr_hashtab_sz = 512; 356 static svntr_bucket_t *svntr_hashtab = NULL; 357 static struct kmem_cache *svntr_cache; 358 static svntr_stats_t *segvn_textrepl_stats; 359 static ksema_t segvn_trasync_sem; 360 361 int segvn_disable_textrepl = 1; 362 size_t textrepl_size_thresh = (size_t)-1; 363 size_t segvn_textrepl_bytes = 0; 364 size_t segvn_textrepl_max_bytes = 0; 365 clock_t segvn_update_textrepl_interval = 0; 366 int segvn_update_tr_time = 10; 367 int segvn_disable_textrepl_update = 0; 368 369 static void segvn_textrepl(struct seg *); 370 static void segvn_textunrepl(struct seg *, int); 371 static void segvn_inval_trcache(vnode_t *); 372 static void segvn_trasync_thread(void); 373 static void segvn_trupdate_wakeup(void *); 374 static void segvn_trupdate(void); 375 static void segvn_trupdate_seg(struct seg *, segvn_data_t *, svntr_t *, 376 ulong_t); 377 378 /* 379 * Initialize segvn data structures 380 */ 381 void 382 segvn_init(void) 383 { 384 uint_t maxszc; 385 uint_t szc; 386 size_t pgsz; 387 388 segvn_cache = kmem_cache_create("segvn_cache", 389 sizeof (struct segvn_data), 0, 390 segvn_cache_constructor, segvn_cache_destructor, NULL, 391 NULL, NULL, 0); 392 393 if (segvn_lpg_disable == 0) { 394 szc = maxszc = page_num_pagesizes() - 1; 395 if (szc == 0) { 396 segvn_lpg_disable = 1; 397 } 398 if (page_get_pagesize(0) != PAGESIZE) { 399 panic("segvn_init: bad szc 0"); 400 /*NOTREACHED*/ 401 } 402 while (szc != 0) { 403 pgsz = page_get_pagesize(szc); 404 if (pgsz <= PAGESIZE || !IS_P2ALIGNED(pgsz, pgsz)) { 405 panic("segvn_init: bad szc %d", szc); 406 /*NOTREACHED*/ 407 } 408 szc--; 409 } 410 if (segvn_maxpgszc == 0 || segvn_maxpgszc > maxszc) 411 segvn_maxpgszc = maxszc; 412 } 413 414 if (segvn_use_regions && !hat_supported(HAT_SHARED_REGIONS, NULL)) 415 segvn_use_regions = 0; 416 417 /* 418 * For now shared regions and text replication segvn support 419 * are mutually exclusive. This is acceptable because 420 * currently significant benefit from text replication was 421 * only observed on AMD64 NUMA platforms (due to relatively 422 * small L2$ size) and currently we don't support shared 423 * regions on x86. 
424 */ 425 if (segvn_use_regions && !segvn_disable_textrepl) { 426 segvn_disable_textrepl = 1; 427 } 428 429 #if defined(_LP64) 430 if (lgrp_optimizations() && textrepl_size_thresh != (size_t)-1 && 431 !segvn_disable_textrepl) { 432 ulong_t i; 433 size_t hsz = svntr_hashtab_sz * sizeof (svntr_bucket_t); 434 435 svntr_cache = kmem_cache_create("svntr_cache", 436 sizeof (svntr_t), 0, svntr_cache_constructor, NULL, 437 NULL, NULL, NULL, 0); 438 svntr_hashtab = kmem_zalloc(hsz, KM_SLEEP); 439 for (i = 0; i < svntr_hashtab_sz; i++) { 440 mutex_init(&svntr_hashtab[i].tr_lock, NULL, 441 MUTEX_DEFAULT, NULL); 442 } 443 segvn_textrepl_max_bytes = ptob(physmem) / 444 segvn_textrepl_max_bytes_factor; 445 segvn_textrepl_stats = kmem_zalloc(NCPU * 446 sizeof (svntr_stats_t), KM_SLEEP); 447 sema_init(&segvn_trasync_sem, 0, NULL, SEMA_DEFAULT, NULL); 448 (void) thread_create(NULL, 0, segvn_trasync_thread, 449 NULL, 0, &p0, TS_RUN, minclsyspri); 450 } 451 #endif 452 } 453 454 #define SEGVN_PAGEIO ((void *)0x1) 455 #define SEGVN_NOPAGEIO ((void *)0x2) 456 457 static void 458 segvn_setvnode_mpss(vnode_t *vp) 459 { 460 int err; 461 462 ASSERT(vp->v_mpssdata == NULL || 463 vp->v_mpssdata == SEGVN_PAGEIO || 464 vp->v_mpssdata == SEGVN_NOPAGEIO); 465 466 if (vp->v_mpssdata == NULL) { 467 if (vn_vmpss_usepageio(vp)) { 468 err = VOP_PAGEIO(vp, (page_t *)NULL, 469 (u_offset_t)0, 0, 0, CRED(), NULL); 470 } else { 471 err = ENOSYS; 472 } 473 /* 474 * set v_mpssdata just once per vnode life 475 * so that it never changes. 476 */ 477 mutex_enter(&vp->v_lock); 478 if (vp->v_mpssdata == NULL) { 479 if (err == EINVAL) { 480 vp->v_mpssdata = SEGVN_PAGEIO; 481 } else { 482 vp->v_mpssdata = SEGVN_NOPAGEIO; 483 } 484 } 485 mutex_exit(&vp->v_lock); 486 } 487 } 488 489 int 490 segvn_create(struct seg *seg, void *argsp) 491 { 492 struct segvn_crargs *a = (struct segvn_crargs *)argsp; 493 struct segvn_data *svd; 494 size_t swresv = 0; 495 struct cred *cred; 496 struct anon_map *amp; 497 int error = 0; 498 size_t pgsz; 499 lgrp_mem_policy_t mpolicy = LGRP_MEM_POLICY_DEFAULT; 500 int use_rgn = 0; 501 int trok = 0; 502 503 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 504 505 if (a->type != MAP_PRIVATE && a->type != MAP_SHARED) { 506 panic("segvn_create type"); 507 /*NOTREACHED*/ 508 } 509 510 /* 511 * Check arguments. If a shared anon structure is given then 512 * it is illegal to also specify a vp. 513 */ 514 if (a->amp != NULL && a->vp != NULL) { 515 panic("segvn_create anon_map"); 516 /*NOTREACHED*/ 517 } 518 519 if (a->type == MAP_PRIVATE && (a->flags & MAP_TEXT) && 520 a->vp != NULL && a->prot == (PROT_USER | PROT_READ | PROT_EXEC) && 521 segvn_use_regions) { 522 use_rgn = 1; 523 } 524 525 /* MAP_NORESERVE on a MAP_SHARED segment is meaningless. */ 526 if (a->type == MAP_SHARED) 527 a->flags &= ~MAP_NORESERVE; 528 529 if (a->szc != 0) { 530 if (segvn_lpg_disable != 0 || (a->szc == AS_MAP_NO_LPOOB) || 531 (a->amp != NULL && a->type == MAP_PRIVATE) || 532 (a->flags & MAP_NORESERVE) || seg->s_as == &kas) { 533 a->szc = 0; 534 } else { 535 if (a->szc > segvn_maxpgszc) 536 a->szc = segvn_maxpgszc; 537 pgsz = page_get_pagesize(a->szc); 538 if (!IS_P2ALIGNED(seg->s_base, pgsz) || 539 !IS_P2ALIGNED(seg->s_size, pgsz)) { 540 a->szc = 0; 541 } else if (a->vp != NULL) { 542 extern struct vnode kvp; 543 if (IS_SWAPFSVP(a->vp) || VN_ISKAS(a->vp)) { 544 /* 545 * paranoid check. 546 * hat_page_demote() is not supported 547 * on swapfs pages. 
548 */ 549 a->szc = 0; 550 } else if (map_addr_vacalign_check(seg->s_base, 551 a->offset & PAGEMASK)) { 552 a->szc = 0; 553 } 554 } else if (a->amp != NULL) { 555 pgcnt_t anum = btopr(a->offset); 556 pgcnt_t pgcnt = page_get_pagecnt(a->szc); 557 if (!IS_P2ALIGNED(anum, pgcnt)) { 558 a->szc = 0; 559 } 560 } 561 } 562 } 563 564 /* 565 * If segment may need private pages, reserve them now. 566 */ 567 if (!(a->flags & MAP_NORESERVE) && ((a->vp == NULL && a->amp == NULL) || 568 (a->type == MAP_PRIVATE && (a->prot & PROT_WRITE)))) { 569 if (anon_resv(seg->s_size) == 0) 570 return (EAGAIN); 571 swresv = seg->s_size; 572 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 573 seg, swresv, 1); 574 } 575 576 /* 577 * Reserve any mapping structures that may be required. 578 * 579 * Don't do it for segments that may use regions. It's currently a 580 * noop in the hat implementations anyway. 581 */ 582 if (!use_rgn) { 583 hat_map(seg->s_as->a_hat, seg->s_base, seg->s_size, HAT_MAP); 584 } 585 586 if (a->cred) { 587 cred = a->cred; 588 crhold(cred); 589 } else { 590 crhold(cred = CRED()); 591 } 592 593 /* Inform the vnode of the new mapping */ 594 if (a->vp != NULL) { 595 error = VOP_ADDMAP(a->vp, a->offset & PAGEMASK, 596 seg->s_as, seg->s_base, seg->s_size, a->prot, 597 a->maxprot, a->type, cred, NULL); 598 if (error) { 599 if (swresv != 0) { 600 anon_unresv(swresv); 601 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 602 "anon proc:%p %lu %u", 603 seg, swresv, 0); 604 } 605 crfree(cred); 606 if (!use_rgn) { 607 hat_unload(seg->s_as->a_hat, seg->s_base, 608 seg->s_size, HAT_UNLOAD_UNMAP); 609 } 610 return (error); 611 } 612 /* 613 * svntr_hashtab will be NULL if we support shared regions. 614 */ 615 trok = ((a->flags & MAP_TEXT) && 616 (seg->s_size > textrepl_size_thresh || 617 (a->flags & _MAP_TEXTREPL)) && 618 lgrp_optimizations() && svntr_hashtab != NULL && 619 a->type == MAP_PRIVATE && swresv == 0 && 620 !(a->flags & MAP_NORESERVE) && 621 seg->s_as != &kas && a->vp->v_type == VREG); 622 623 ASSERT(!trok || !use_rgn); 624 } 625 626 /* 627 * If more than one segment in the address space, and they're adjacent 628 * virtually, try to concatenate them. Don't concatenate if an 629 * explicit anon_map structure was supplied (e.g., SystemV shared 630 * memory) or if we'll use text replication for this segment. 631 */ 632 if (a->amp == NULL && !use_rgn && !trok) { 633 struct seg *pseg, *nseg; 634 struct segvn_data *psvd, *nsvd; 635 lgrp_mem_policy_t ppolicy, npolicy; 636 uint_t lgrp_mem_policy_flags = 0; 637 extern lgrp_mem_policy_t lgrp_mem_default_policy; 638 639 /* 640 * Memory policy flags (lgrp_mem_policy_flags) is valid when 641 * extending stack/heap segments. 642 */ 643 if ((a->vp == NULL) && (a->type == MAP_PRIVATE) && 644 !(a->flags & MAP_NORESERVE) && (seg->s_as != &kas)) { 645 lgrp_mem_policy_flags = a->lgrp_mem_policy_flags; 646 } else { 647 /* 648 * Get policy when not extending it from another segment 649 */ 650 mpolicy = lgrp_mem_policy_default(seg->s_size, a->type); 651 } 652 653 /* 654 * First, try to concatenate the previous and new segments 655 */ 656 pseg = AS_SEGPREV(seg->s_as, seg); 657 if (pseg != NULL && 658 pseg->s_base + pseg->s_size == seg->s_base && 659 pseg->s_ops == &segvn_ops) { 660 /* 661 * Get memory allocation policy from previous segment. 662 * When extension is specified (e.g. for heap) apply 663 * this policy to the new segment regardless of the 664 * outcome of segment concatenation. 
Extension occurs 665 * for non-default policy otherwise default policy is 666 * used and is based on extended segment size. 667 */ 668 psvd = (struct segvn_data *)pseg->s_data; 669 ppolicy = psvd->policy_info.mem_policy; 670 if (lgrp_mem_policy_flags == 671 LGRP_MP_FLAG_EXTEND_UP) { 672 if (ppolicy != lgrp_mem_default_policy) { 673 mpolicy = ppolicy; 674 } else { 675 mpolicy = lgrp_mem_policy_default( 676 pseg->s_size + seg->s_size, 677 a->type); 678 } 679 } 680 681 if (mpolicy == ppolicy && 682 (pseg->s_size + seg->s_size <= 683 segvn_comb_thrshld || psvd->amp == NULL) && 684 segvn_extend_prev(pseg, seg, a, swresv) == 0) { 685 /* 686 * success! now try to concatenate 687 * with following seg 688 */ 689 crfree(cred); 690 nseg = AS_SEGNEXT(pseg->s_as, pseg); 691 if (nseg != NULL && 692 nseg != pseg && 693 nseg->s_ops == &segvn_ops && 694 pseg->s_base + pseg->s_size == 695 nseg->s_base) 696 (void) segvn_concat(pseg, nseg, 0); 697 ASSERT(pseg->s_szc == 0 || 698 (a->szc == pseg->s_szc && 699 IS_P2ALIGNED(pseg->s_base, pgsz) && 700 IS_P2ALIGNED(pseg->s_size, pgsz))); 701 return (0); 702 } 703 } 704 705 /* 706 * Failed, so try to concatenate with following seg 707 */ 708 nseg = AS_SEGNEXT(seg->s_as, seg); 709 if (nseg != NULL && 710 seg->s_base + seg->s_size == nseg->s_base && 711 nseg->s_ops == &segvn_ops) { 712 /* 713 * Get memory allocation policy from next segment. 714 * When extension is specified (e.g. for stack) apply 715 * this policy to the new segment regardless of the 716 * outcome of segment concatenation. Extension occurs 717 * for non-default policy otherwise default policy is 718 * used and is based on extended segment size. 719 */ 720 nsvd = (struct segvn_data *)nseg->s_data; 721 npolicy = nsvd->policy_info.mem_policy; 722 if (lgrp_mem_policy_flags == 723 LGRP_MP_FLAG_EXTEND_DOWN) { 724 if (npolicy != lgrp_mem_default_policy) { 725 mpolicy = npolicy; 726 } else { 727 mpolicy = lgrp_mem_policy_default( 728 nseg->s_size + seg->s_size, 729 a->type); 730 } 731 } 732 733 if (mpolicy == npolicy && 734 segvn_extend_next(seg, nseg, a, swresv) == 0) { 735 crfree(cred); 736 ASSERT(nseg->s_szc == 0 || 737 (a->szc == nseg->s_szc && 738 IS_P2ALIGNED(nseg->s_base, pgsz) && 739 IS_P2ALIGNED(nseg->s_size, pgsz))); 740 return (0); 741 } 742 } 743 } 744 745 if (a->vp != NULL) { 746 VN_HOLD(a->vp); 747 if (a->type == MAP_SHARED) 748 lgrp_shm_policy_init(NULL, a->vp); 749 } 750 svd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 751 752 seg->s_ops = &segvn_ops; 753 seg->s_data = (void *)svd; 754 seg->s_szc = a->szc; 755 756 svd->seg = seg; 757 svd->vp = a->vp; 758 /* 759 * Anonymous mappings have no backing file so the offset is meaningless. 760 */ 761 svd->offset = a->vp ? (a->offset & PAGEMASK) : 0; 762 svd->prot = a->prot; 763 svd->maxprot = a->maxprot; 764 svd->pageprot = 0; 765 svd->type = a->type; 766 svd->vpage = NULL; 767 svd->cred = cred; 768 svd->advice = MADV_NORMAL; 769 svd->pageadvice = 0; 770 svd->flags = (ushort_t)a->flags; 771 svd->softlockcnt = 0; 772 svd->rcookie = HAT_INVALID_REGION_COOKIE; 773 774 if (a->szc != 0 && a->vp != NULL) { 775 segvn_setvnode_mpss(a->vp); 776 } 777 if (svd->type == MAP_SHARED && svd->vp != NULL && 778 (svd->vp->v_flag & VVMEXEC) && (svd->prot & PROT_WRITE)) { 779 ASSERT(vn_is_mapped(svd->vp, V_WRITE)); 780 segvn_inval_trcache(svd->vp); 781 } 782 783 amp = a->amp; 784 if ((svd->amp = amp) == NULL) { 785 svd->anon_index = 0; 786 if (svd->type == MAP_SHARED) { 787 svd->swresv = 0; 788 /* 789 * Shared mappings to a vp need no other setup. 
			 * If we have a shared mapping to an anon_map object
			 * which hasn't been allocated yet, allocate the
			 * struct now so that it will be properly shared
			 * by remembering the swap reservation there.
			 */
			if (a->vp == NULL) {
				svd->amp = anonmap_alloc(seg->s_size, swresv,
				    ANON_SLEEP);
				svd->amp->a_szc = seg->s_szc;
			}
		} else {
			/*
			 * Private mapping (with or without a vp).
			 * Allocate anon_map when needed.
			 */
			svd->swresv = swresv;
		}
	} else {
		pgcnt_t anon_num;

		/*
		 * Mapping to an existing anon_map structure without a vp.
		 * For now we will ensure that the segment size isn't larger
		 * than the size - offset gives us.  Later on we may wish to
		 * have the anon array dynamically allocated itself so that
		 * we don't always have to allocate all the anon pointer slots.
		 * This of course involves adding extra code to check that we
		 * aren't trying to use an anon pointer slot beyond the end
		 * of the currently allocated anon array.
		 */
		if ((amp->size - a->offset) < seg->s_size) {
			panic("segvn_create anon_map size");
			/*NOTREACHED*/
		}

		anon_num = btopr(a->offset);

		if (a->type == MAP_SHARED) {
			/*
			 * SHARED mapping to a given anon_map.
			 */
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
			amp->refcnt++;
			if (a->szc > amp->a_szc) {
				amp->a_szc = a->szc;
			}
			ANON_LOCK_EXIT(&amp->a_rwlock);
			svd->anon_index = anon_num;
			svd->swresv = 0;
		} else {
			/*
			 * PRIVATE mapping to a given anon_map.
			 * Make sure that all the needed anon
			 * structures are created (so that we will
			 * share the underlying pages if nothing
			 * is written by this mapping) and then
			 * duplicate the anon array as is done
			 * when a privately mapped segment is dup'ed.
			 */
			struct anon *ap;
			caddr_t addr;
			caddr_t eaddr;
			ulong_t anon_idx;
			int hat_flag = HAT_LOAD;

			if (svd->flags & MAP_TEXT) {
				hat_flag |= HAT_LOAD_TEXT;
			}

			svd->amp = anonmap_alloc(seg->s_size, 0, ANON_SLEEP);
			svd->amp->a_szc = seg->s_szc;
			svd->anon_index = 0;
			svd->swresv = swresv;

			/*
			 * Prevent 2 threads from allocating anon
			 * slots simultaneously.
			 */
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
			eaddr = seg->s_base + seg->s_size;

			for (anon_idx = anon_num, addr = seg->s_base;
			    addr < eaddr; addr += PAGESIZE, anon_idx++) {
				page_t *pp;

				if ((ap = anon_get_ptr(amp->ahp,
				    anon_idx)) != NULL)
					continue;

				/*
				 * Allocate the anon struct now.
				 * Might as well load up translation
				 * to the page while we're at it...
				 */
				pp = anon_zero(seg, addr, &ap, cred);
				if (ap == NULL || pp == NULL) {
					panic("segvn_create anon_zero");
					/*NOTREACHED*/
				}

				/*
				 * Re-acquire the anon_map lock and
				 * initialize the anon array entry.
893 */ 894 ASSERT(anon_get_ptr(amp->ahp, 895 anon_idx) == NULL); 896 (void) anon_set_ptr(amp->ahp, anon_idx, ap, 897 ANON_SLEEP); 898 899 ASSERT(seg->s_szc == 0); 900 ASSERT(!IS_VMODSORT(pp->p_vnode)); 901 902 ASSERT(use_rgn == 0); 903 hat_memload(seg->s_as->a_hat, addr, pp, 904 svd->prot & ~PROT_WRITE, hat_flag); 905 906 page_unlock(pp); 907 } 908 ASSERT(seg->s_szc == 0); 909 anon_dup(amp->ahp, anon_num, svd->amp->ahp, 910 0, seg->s_size); 911 ANON_LOCK_EXIT(&->a_rwlock); 912 } 913 } 914 915 /* 916 * Set default memory allocation policy for segment 917 * 918 * Always set policy for private memory at least for initialization 919 * even if this is a shared memory segment 920 */ 921 (void) lgrp_privm_policy_set(mpolicy, &svd->policy_info, seg->s_size); 922 923 if (svd->type == MAP_SHARED) 924 (void) lgrp_shm_policy_set(mpolicy, svd->amp, svd->anon_index, 925 svd->vp, svd->offset, seg->s_size); 926 927 if (use_rgn) { 928 ASSERT(!trok); 929 ASSERT(svd->amp == NULL); 930 svd->rcookie = hat_join_region(seg->s_as->a_hat, seg->s_base, 931 seg->s_size, (void *)svd->vp, svd->offset, svd->prot, 932 (uchar_t)seg->s_szc, segvn_hat_rgn_unload_callback, 933 HAT_REGION_TEXT); 934 } 935 936 ASSERT(!trok || !(svd->prot & PROT_WRITE)); 937 svd->tr_state = trok ? SEGVN_TR_INIT : SEGVN_TR_OFF; 938 939 return (0); 940 } 941 942 /* 943 * Concatenate two existing segments, if possible. 944 * Return 0 on success, -1 if two segments are not compatible 945 * or -2 on memory allocation failure. 946 * If amp_cat == 1 then try and concat segments with anon maps 947 */ 948 static int 949 segvn_concat(struct seg *seg1, struct seg *seg2, int amp_cat) 950 { 951 struct segvn_data *svd1 = seg1->s_data; 952 struct segvn_data *svd2 = seg2->s_data; 953 struct anon_map *amp1 = svd1->amp; 954 struct anon_map *amp2 = svd2->amp; 955 struct vpage *vpage1 = svd1->vpage; 956 struct vpage *vpage2 = svd2->vpage, *nvpage = NULL; 957 size_t size, nvpsize; 958 pgcnt_t npages1, npages2; 959 960 ASSERT(seg1->s_as && seg2->s_as && seg1->s_as == seg2->s_as); 961 ASSERT(AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock)); 962 ASSERT(seg1->s_ops == seg2->s_ops); 963 964 if (HAT_IS_REGION_COOKIE_VALID(svd1->rcookie) || 965 HAT_IS_REGION_COOKIE_VALID(svd2->rcookie)) { 966 return (-1); 967 } 968 969 /* both segments exist, try to merge them */ 970 #define incompat(x) (svd1->x != svd2->x) 971 if (incompat(vp) || incompat(maxprot) || 972 (!svd1->pageadvice && !svd2->pageadvice && incompat(advice)) || 973 (!svd1->pageprot && !svd2->pageprot && incompat(prot)) || 974 incompat(type) || incompat(cred) || incompat(flags) || 975 seg1->s_szc != seg2->s_szc || incompat(policy_info.mem_policy) || 976 (svd2->softlockcnt > 0)) 977 return (-1); 978 #undef incompat 979 980 /* 981 * vp == NULL implies zfod, offset doesn't matter 982 */ 983 if (svd1->vp != NULL && 984 svd1->offset + seg1->s_size != svd2->offset) { 985 return (-1); 986 } 987 988 /* 989 * Don't concatenate if either segment uses text replication. 990 */ 991 if (svd1->tr_state != SEGVN_TR_OFF || svd2->tr_state != SEGVN_TR_OFF) { 992 return (-1); 993 } 994 995 /* 996 * Fail early if we're not supposed to concatenate 997 * segments with non NULL amp. 
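	 *
	 * (segvn_create(), for instance, calls segvn_concat() with
	 * amp_cat == 0 when merging neighbors of a newly created segment,
	 * so only purely vnode-backed segments are combined there.)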
	 */
	if (amp_cat == 0 && (amp1 != NULL || amp2 != NULL)) {
		return (-1);
	}

	if (svd1->vp == NULL && svd1->type == MAP_SHARED) {
		if (amp1 != amp2) {
			return (-1);
		}
		if (amp1 != NULL && svd1->anon_index + btop(seg1->s_size) !=
		    svd2->anon_index) {
			return (-1);
		}
		ASSERT(amp1 == NULL || amp1->refcnt >= 2);
	}

	/*
	 * If either seg has vpages, create a new merged vpage array.
	 */
	if (vpage1 != NULL || vpage2 != NULL) {
		struct vpage *vp;

		npages1 = seg_pages(seg1);
		npages2 = seg_pages(seg2);
		nvpsize = vpgtob(npages1 + npages2);

		if ((nvpage = kmem_zalloc(nvpsize, KM_NOSLEEP)) == NULL) {
			return (-2);
		}

		if (vpage1 != NULL) {
			bcopy(vpage1, nvpage, vpgtob(npages1));
		} else {
			for (vp = nvpage; vp < nvpage + npages1; vp++) {
				VPP_SETPROT(vp, svd1->prot);
				VPP_SETADVICE(vp, svd1->advice);
			}
		}

		if (vpage2 != NULL) {
			bcopy(vpage2, nvpage + npages1, vpgtob(npages2));
		} else {
			for (vp = nvpage + npages1;
			    vp < nvpage + npages1 + npages2; vp++) {
				VPP_SETPROT(vp, svd2->prot);
				VPP_SETADVICE(vp, svd2->advice);
			}
		}
	}

	/*
	 * If either segment has private pages, create a new merged anon
	 * array.  If merging shared anon segments just decrement the anon
	 * map's refcnt.
	 */
	if (amp1 != NULL && svd1->type == MAP_SHARED) {
		ASSERT(amp1 == amp2 && svd1->vp == NULL);
		ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
		ASSERT(amp1->refcnt >= 2);
		amp1->refcnt--;
		ANON_LOCK_EXIT(&amp1->a_rwlock);
		svd2->amp = NULL;
	} else if (amp1 != NULL || amp2 != NULL) {
		struct anon_hdr *nahp;
		struct anon_map *namp = NULL;
		size_t asize;

		ASSERT(svd1->type == MAP_PRIVATE);

		asize = seg1->s_size + seg2->s_size;
		if ((nahp = anon_create(btop(asize), ANON_NOSLEEP)) == NULL) {
			if (nvpage != NULL) {
				kmem_free(nvpage, nvpsize);
			}
			return (-2);
		}
		if (amp1 != NULL) {
			/*
			 * XXX anon rwlock is not really needed because
			 * this is a private segment and we are writers.
1078 */ 1079 ANON_LOCK_ENTER(&1->a_rwlock, RW_WRITER); 1080 ASSERT(amp1->refcnt == 1); 1081 if (anon_copy_ptr(amp1->ahp, svd1->anon_index, 1082 nahp, 0, btop(seg1->s_size), ANON_NOSLEEP)) { 1083 anon_release(nahp, btop(asize)); 1084 ANON_LOCK_EXIT(&1->a_rwlock); 1085 if (nvpage != NULL) { 1086 kmem_free(nvpage, nvpsize); 1087 } 1088 return (-2); 1089 } 1090 } 1091 if (amp2 != NULL) { 1092 ANON_LOCK_ENTER(&2->a_rwlock, RW_WRITER); 1093 ASSERT(amp2->refcnt == 1); 1094 if (anon_copy_ptr(amp2->ahp, svd2->anon_index, 1095 nahp, btop(seg1->s_size), btop(seg2->s_size), 1096 ANON_NOSLEEP)) { 1097 anon_release(nahp, btop(asize)); 1098 ANON_LOCK_EXIT(&2->a_rwlock); 1099 if (amp1 != NULL) { 1100 ANON_LOCK_EXIT(&1->a_rwlock); 1101 } 1102 if (nvpage != NULL) { 1103 kmem_free(nvpage, nvpsize); 1104 } 1105 return (-2); 1106 } 1107 } 1108 if (amp1 != NULL) { 1109 namp = amp1; 1110 anon_release(amp1->ahp, btop(amp1->size)); 1111 } 1112 if (amp2 != NULL) { 1113 if (namp == NULL) { 1114 ASSERT(amp1 == NULL); 1115 namp = amp2; 1116 anon_release(amp2->ahp, btop(amp2->size)); 1117 } else { 1118 amp2->refcnt--; 1119 ANON_LOCK_EXIT(&2->a_rwlock); 1120 anonmap_free(amp2); 1121 } 1122 svd2->amp = NULL; /* needed for seg_free */ 1123 } 1124 namp->ahp = nahp; 1125 namp->size = asize; 1126 svd1->amp = namp; 1127 svd1->anon_index = 0; 1128 ANON_LOCK_EXIT(&namp->a_rwlock); 1129 } 1130 /* 1131 * Now free the old vpage structures. 1132 */ 1133 if (nvpage != NULL) { 1134 if (vpage1 != NULL) { 1135 kmem_free(vpage1, vpgtob(npages1)); 1136 } 1137 if (vpage2 != NULL) { 1138 svd2->vpage = NULL; 1139 kmem_free(vpage2, vpgtob(npages2)); 1140 } 1141 if (svd2->pageprot) { 1142 svd1->pageprot = 1; 1143 } 1144 if (svd2->pageadvice) { 1145 svd1->pageadvice = 1; 1146 } 1147 svd1->vpage = nvpage; 1148 } 1149 1150 /* all looks ok, merge segments */ 1151 svd1->swresv += svd2->swresv; 1152 svd2->swresv = 0; /* so seg_free doesn't release swap space */ 1153 size = seg2->s_size; 1154 seg_free(seg2); 1155 seg1->s_size += size; 1156 return (0); 1157 } 1158 1159 /* 1160 * Extend the previous segment (seg1) to include the 1161 * new segment (seg2 + a), if possible. 1162 * Return 0 on success. 1163 */ 1164 static int 1165 segvn_extend_prev(seg1, seg2, a, swresv) 1166 struct seg *seg1, *seg2; 1167 struct segvn_crargs *a; 1168 size_t swresv; 1169 { 1170 struct segvn_data *svd1 = (struct segvn_data *)seg1->s_data; 1171 size_t size; 1172 struct anon_map *amp1; 1173 struct vpage *new_vpage; 1174 1175 /* 1176 * We don't need any segment level locks for "segvn" data 1177 * since the address space is "write" locked. 1178 */ 1179 ASSERT(seg1->s_as && AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock)); 1180 1181 if (HAT_IS_REGION_COOKIE_VALID(svd1->rcookie)) { 1182 return (-1); 1183 } 1184 1185 /* second segment is new, try to extend first */ 1186 /* XXX - should also check cred */ 1187 if (svd1->vp != a->vp || svd1->maxprot != a->maxprot || 1188 (!svd1->pageprot && (svd1->prot != a->prot)) || 1189 svd1->type != a->type || svd1->flags != a->flags || 1190 seg1->s_szc != a->szc) 1191 return (-1); 1192 1193 /* vp == NULL implies zfod, offset doesn't matter */ 1194 if (svd1->vp != NULL && 1195 svd1->offset + seg1->s_size != (a->offset & PAGEMASK)) 1196 return (-1); 1197 1198 if (svd1->tr_state != SEGVN_TR_OFF) { 1199 return (-1); 1200 } 1201 1202 amp1 = svd1->amp; 1203 if (amp1) { 1204 pgcnt_t newpgs; 1205 1206 /* 1207 * Segment has private pages, can data structures 1208 * be expanded? 
		 *
		 * Acquire the anon_map lock to prevent it from changing,
		 * if it is shared.  This ensures that the anon_map
		 * will not change while a thread which has a read/write
		 * lock on an address space references it.
		 * XXX - Don't need the anon_map lock at all if "refcnt"
		 * is 1.
		 *
		 * Can't grow a MAP_SHARED segment with an anonmap because
		 * there may be existing anon slots where we want to extend
		 * the segment and we wouldn't know what to do with them
		 * (e.g., for tmpfs the right thing is to just leave them
		 * there, for /dev/zero they should be cleared out).
		 */
		if (svd1->type == MAP_SHARED)
			return (-1);

		ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
		if (amp1->refcnt > 1) {
			ANON_LOCK_EXIT(&amp1->a_rwlock);
			return (-1);
		}
		newpgs = anon_grow(amp1->ahp, &svd1->anon_index,
		    btop(seg1->s_size), btop(seg2->s_size), ANON_NOSLEEP);

		if (newpgs == 0) {
			ANON_LOCK_EXIT(&amp1->a_rwlock);
			return (-1);
		}
		amp1->size = ptob(newpgs);
		ANON_LOCK_EXIT(&amp1->a_rwlock);
	}
	if (svd1->vpage != NULL) {
		struct vpage *vp, *evp;
		new_vpage =
		    kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)),
		    KM_NOSLEEP);
		if (new_vpage == NULL)
			return (-1);
		bcopy(svd1->vpage, new_vpage, vpgtob(seg_pages(seg1)));
		kmem_free(svd1->vpage, vpgtob(seg_pages(seg1)));
		svd1->vpage = new_vpage;

		vp = new_vpage + seg_pages(seg1);
		evp = vp + seg_pages(seg2);
		for (; vp < evp; vp++)
			VPP_SETPROT(vp, a->prot);
	}
	size = seg2->s_size;
	seg_free(seg2);
	seg1->s_size += size;
	svd1->swresv += swresv;
	if (svd1->pageprot && (a->prot & PROT_WRITE) &&
	    svd1->type == MAP_SHARED && svd1->vp != NULL &&
	    (svd1->vp->v_flag & VVMEXEC)) {
		ASSERT(vn_is_mapped(svd1->vp, V_WRITE));
		segvn_inval_trcache(svd1->vp);
	}
	return (0);
}

/*
 * Extend the next segment (seg2) to include the
 * new segment (seg1 + a), if possible.
 * Return 0 on success.
 */
static int
segvn_extend_next(
	struct seg *seg1,
	struct seg *seg2,
	struct segvn_crargs *a,
	size_t swresv)
{
	struct segvn_data *svd2 = (struct segvn_data *)seg2->s_data;
	size_t size;
	struct anon_map *amp2;
	struct vpage *new_vpage;

	/*
	 * We don't need any segment level locks for "segvn" data
	 * since the address space is "write" locked.
	 */
	ASSERT(seg2->s_as && AS_WRITE_HELD(seg2->s_as, &seg2->s_as->a_lock));

	if (HAT_IS_REGION_COOKIE_VALID(svd2->rcookie)) {
		return (-1);
	}

	/* first segment is new, try to extend second */
	/* XXX - should also check cred */
	if (svd2->vp != a->vp || svd2->maxprot != a->maxprot ||
	    (!svd2->pageprot && (svd2->prot != a->prot)) ||
	    svd2->type != a->type || svd2->flags != a->flags ||
	    seg2->s_szc != a->szc)
		return (-1);
	/* vp == NULL implies zfod, offset doesn't matter */
	if (svd2->vp != NULL &&
	    (a->offset & PAGEMASK) + seg1->s_size != svd2->offset)
		return (-1);

	if (svd2->tr_state != SEGVN_TR_OFF) {
		return (-1);
	}

	amp2 = svd2->amp;
	if (amp2) {
		pgcnt_t newpgs;

		/*
		 * Segment has private pages, can data structures
		 * be expanded?
		 *
		 * Acquire the anon_map lock to prevent it from changing,
		 * if it is shared.  This ensures that the anon_map
		 * will not change while a thread which has a read/write
		 * lock on an address space references it.
		 *
		 * XXX - Don't need the anon_map lock at all if "refcnt"
		 * is 1.
		 */
		if (svd2->type == MAP_SHARED)
			return (-1);

		ANON_LOCK_ENTER(&amp2->a_rwlock, RW_WRITER);
		if (amp2->refcnt > 1) {
			ANON_LOCK_EXIT(&amp2->a_rwlock);
			return (-1);
		}
		newpgs = anon_grow(amp2->ahp, &svd2->anon_index,
		    btop(seg2->s_size), btop(seg1->s_size),
		    ANON_NOSLEEP | ANON_GROWDOWN);

		if (newpgs == 0) {
			ANON_LOCK_EXIT(&amp2->a_rwlock);
			return (-1);
		}
		amp2->size = ptob(newpgs);
		ANON_LOCK_EXIT(&amp2->a_rwlock);
	}
	if (svd2->vpage != NULL) {
		struct vpage *vp, *evp;
		new_vpage =
		    kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)),
		    KM_NOSLEEP);
		if (new_vpage == NULL) {
			/* Not merging segments so adjust anon_index back */
			if (amp2)
				svd2->anon_index += seg_pages(seg1);
			return (-1);
		}
		bcopy(svd2->vpage, new_vpage + seg_pages(seg1),
		    vpgtob(seg_pages(seg2)));
		kmem_free(svd2->vpage, vpgtob(seg_pages(seg2)));
		svd2->vpage = new_vpage;

		vp = new_vpage;
		evp = vp + seg_pages(seg1);
		for (; vp < evp; vp++)
			VPP_SETPROT(vp, a->prot);
	}
	size = seg1->s_size;
	seg_free(seg1);
	seg2->s_size += size;
	seg2->s_base -= size;
	svd2->offset -= size;
	svd2->swresv += swresv;
	if (svd2->pageprot && (a->prot & PROT_WRITE) &&
	    svd2->type == MAP_SHARED && svd2->vp != NULL &&
	    (svd2->vp->v_flag & VVMEXEC)) {
		ASSERT(vn_is_mapped(svd2->vp, V_WRITE));
		segvn_inval_trcache(svd2->vp);
	}
	return (0);
}

static int
segvn_dup(struct seg *seg, struct seg *newseg)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct segvn_data *newsvd;
	pgcnt_t npages = seg_pages(seg);
	int error = 0;
	uint_t prot;
	size_t len;
	struct anon_map *amp;

	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));

	/*
	 * If segment has anon reserved, reserve more for the new seg.
	 * For a MAP_NORESERVE segment swresv will be a count of all the
	 * allocated anon slots; thus we reserve for the child as many slots
	 * as the parent has allocated.  This semantic prevents the child or
	 * parent from dying during a copy-on-write fault caused by trying
	 * to write a shared pre-existing anon page.
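	 *
	 * For example, a MAP_NORESERVE parent that has faulted in N
	 * anonymous pages carries swresv of roughly ptob(N), so the child
	 * reserves that amount here rather than the full segment size.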
	 */
	if ((len = svd->swresv) != 0) {
		if (anon_resv(svd->swresv) == 0)
			return (ENOMEM);

		TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
		    seg, len, 0);
	}

	newsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP);

	newseg->s_ops = &segvn_ops;
	newseg->s_data = (void *)newsvd;
	newseg->s_szc = seg->s_szc;

	newsvd->seg = newseg;
	if ((newsvd->vp = svd->vp) != NULL) {
		VN_HOLD(svd->vp);
		if (svd->type == MAP_SHARED)
			lgrp_shm_policy_init(NULL, svd->vp);
	}
	newsvd->offset = svd->offset;
	newsvd->prot = svd->prot;
	newsvd->maxprot = svd->maxprot;
	newsvd->pageprot = svd->pageprot;
	newsvd->type = svd->type;
	newsvd->cred = svd->cred;
	crhold(newsvd->cred);
	newsvd->advice = svd->advice;
	newsvd->pageadvice = svd->pageadvice;
	newsvd->swresv = svd->swresv;
	newsvd->flags = svd->flags;
	newsvd->softlockcnt = 0;
	newsvd->policy_info = svd->policy_info;
	newsvd->rcookie = HAT_INVALID_REGION_COOKIE;

	if ((amp = svd->amp) == NULL || svd->tr_state == SEGVN_TR_ON) {
		/*
		 * Not attaching to a shared anon object.
		 */
		ASSERT(!HAT_IS_REGION_COOKIE_VALID(svd->rcookie) ||
		    svd->tr_state == SEGVN_TR_OFF);
		if (svd->tr_state == SEGVN_TR_ON) {
			ASSERT(newsvd->vp != NULL && amp != NULL);
			newsvd->tr_state = SEGVN_TR_INIT;
		} else {
			newsvd->tr_state = svd->tr_state;
		}
		newsvd->amp = NULL;
		newsvd->anon_index = 0;
	} else {
		/* regions for now are only used on pure vnode segments */
		ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
		ASSERT(svd->tr_state == SEGVN_TR_OFF);
		newsvd->tr_state = SEGVN_TR_OFF;
		if (svd->type == MAP_SHARED) {
			newsvd->amp = amp;
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
			amp->refcnt++;
			ANON_LOCK_EXIT(&amp->a_rwlock);
			newsvd->anon_index = svd->anon_index;
		} else {
			int reclaim = 1;

			/*
			 * Allocate and initialize new anon_map structure.
			 */
			newsvd->amp = anonmap_alloc(newseg->s_size, 0,
			    ANON_SLEEP);
			newsvd->amp->a_szc = newseg->s_szc;
			newsvd->anon_index = 0;

			/*
			 * We don't have to acquire the anon_map lock
			 * for the new segment (since it belongs to an
			 * address space that is still not associated
			 * with any process), or the segment in the old
			 * address space (since all threads in it
			 * are stopped while duplicating the address space).
			 */

			/*
			 * The goal of the following code is to make sure that
			 * softlocked pages do not end up as copy on write
			 * pages.  This would cause problems where one
			 * thread writes to a page that is COW and a different
			 * thread in the same process has softlocked it.  The
			 * softlock lock would move away from this process
			 * because the write would cause this process to get
			 * a copy (without the softlock).
			 *
			 * The strategy here is to just break the
			 * sharing on pages that could possibly be
			 * softlocked.
			 */
		retry:
			if (svd->softlockcnt) {
				struct anon *ap, *newap;
				size_t i;
				uint_t vpprot;
				page_t *anon_pl[1+1], *pp;
				caddr_t addr;
				ulong_t old_idx = svd->anon_index;
				ulong_t new_idx = 0;

				/*
				 * The softlock count might be nonzero
				 * because some pages are still stuck in the
				 * cache for lazy reclaim.  Flush the cache
				 * now.  This should drop the count to zero.
1514 * [or there is really I/O going on to these 1515 * pages]. Note, we have the writers lock so 1516 * nothing gets inserted during the flush. 1517 */ 1518 if (reclaim == 1) { 1519 segvn_purge(seg); 1520 reclaim = 0; 1521 goto retry; 1522 } 1523 i = btopr(seg->s_size); 1524 addr = seg->s_base; 1525 /* 1526 * XXX break cow sharing using PAGESIZE 1527 * pages. They will be relocated into larger 1528 * pages at fault time. 1529 */ 1530 while (i-- > 0) { 1531 if (ap = anon_get_ptr(amp->ahp, 1532 old_idx)) { 1533 error = anon_getpage(&ap, 1534 &vpprot, anon_pl, PAGESIZE, 1535 seg, addr, S_READ, 1536 svd->cred); 1537 if (error) { 1538 newsvd->vpage = NULL; 1539 goto out; 1540 } 1541 /* 1542 * prot need not be computed 1543 * below 'cause anon_private is 1544 * going to ignore it anyway 1545 * as child doesn't inherit 1546 * pagelock from parent. 1547 */ 1548 prot = svd->pageprot ? 1549 VPP_PROT( 1550 &svd->vpage[ 1551 seg_page(seg, addr)]) 1552 : svd->prot; 1553 pp = anon_private(&newap, 1554 newseg, addr, prot, 1555 anon_pl[0], 0, 1556 newsvd->cred); 1557 if (pp == NULL) { 1558 /* no mem abort */ 1559 newsvd->vpage = NULL; 1560 error = ENOMEM; 1561 goto out; 1562 } 1563 (void) anon_set_ptr( 1564 newsvd->amp->ahp, new_idx, 1565 newap, ANON_SLEEP); 1566 page_unlock(pp); 1567 } 1568 addr += PAGESIZE; 1569 old_idx++; 1570 new_idx++; 1571 } 1572 } else { /* common case */ 1573 if (seg->s_szc != 0) { 1574 /* 1575 * If at least one of anon slots of a 1576 * large page exists then make sure 1577 * all anon slots of a large page 1578 * exist to avoid partial cow sharing 1579 * of a large page in the future. 1580 */ 1581 anon_dup_fill_holes(amp->ahp, 1582 svd->anon_index, newsvd->amp->ahp, 1583 0, seg->s_size, seg->s_szc, 1584 svd->vp != NULL); 1585 } else { 1586 anon_dup(amp->ahp, svd->anon_index, 1587 newsvd->amp->ahp, 0, seg->s_size); 1588 } 1589 1590 hat_clrattr(seg->s_as->a_hat, seg->s_base, 1591 seg->s_size, PROT_WRITE); 1592 } 1593 } 1594 } 1595 /* 1596 * If necessary, create a vpage structure for the new segment. 1597 * Do not copy any page lock indications. 1598 */ 1599 if (svd->vpage != NULL) { 1600 uint_t i; 1601 struct vpage *ovp = svd->vpage; 1602 struct vpage *nvp; 1603 1604 nvp = newsvd->vpage = 1605 kmem_alloc(vpgtob(npages), KM_SLEEP); 1606 for (i = 0; i < npages; i++) { 1607 *nvp = *ovp++; 1608 VPP_CLRPPLOCK(nvp++); 1609 } 1610 } else 1611 newsvd->vpage = NULL; 1612 1613 /* Inform the vnode of the new mapping */ 1614 if (newsvd->vp != NULL) { 1615 error = VOP_ADDMAP(newsvd->vp, (offset_t)newsvd->offset, 1616 newseg->s_as, newseg->s_base, newseg->s_size, newsvd->prot, 1617 newsvd->maxprot, newsvd->type, newsvd->cred, NULL); 1618 } 1619 out: 1620 if (error == 0 && HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 1621 ASSERT(newsvd->amp == NULL); 1622 ASSERT(newsvd->tr_state == SEGVN_TR_OFF); 1623 newsvd->rcookie = svd->rcookie; 1624 hat_dup_region(newseg->s_as->a_hat, newsvd->rcookie); 1625 } 1626 return (error); 1627 } 1628 1629 1630 /* 1631 * callback function to invoke free_vp_pages() for only those pages actually 1632 * processed by the HAT when a shared region is destroyed. 
1633 */ 1634 extern int free_pages; 1635 1636 static void 1637 segvn_hat_rgn_unload_callback(caddr_t saddr, caddr_t eaddr, caddr_t r_saddr, 1638 size_t r_size, void *r_obj, u_offset_t r_objoff) 1639 { 1640 u_offset_t off; 1641 size_t len; 1642 vnode_t *vp = (vnode_t *)r_obj; 1643 1644 ASSERT(eaddr > saddr); 1645 ASSERT(saddr >= r_saddr); 1646 ASSERT(saddr < r_saddr + r_size); 1647 ASSERT(eaddr > r_saddr); 1648 ASSERT(eaddr <= r_saddr + r_size); 1649 ASSERT(vp != NULL); 1650 1651 if (!free_pages) { 1652 return; 1653 } 1654 1655 len = eaddr - saddr; 1656 off = (saddr - r_saddr) + r_objoff; 1657 free_vp_pages(vp, off, len); 1658 } 1659 1660 /* 1661 * callback function used by segvn_unmap to invoke free_vp_pages() for only 1662 * those pages actually processed by the HAT 1663 */ 1664 static void 1665 segvn_hat_unload_callback(hat_callback_t *cb) 1666 { 1667 struct seg *seg = cb->hcb_data; 1668 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1669 size_t len; 1670 u_offset_t off; 1671 1672 ASSERT(svd->vp != NULL); 1673 ASSERT(cb->hcb_end_addr > cb->hcb_start_addr); 1674 ASSERT(cb->hcb_start_addr >= seg->s_base); 1675 1676 len = cb->hcb_end_addr - cb->hcb_start_addr; 1677 off = cb->hcb_start_addr - seg->s_base; 1678 free_vp_pages(svd->vp, svd->offset + off, len); 1679 } 1680 1681 static int 1682 segvn_unmap(struct seg *seg, caddr_t addr, size_t len) 1683 { 1684 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1685 struct segvn_data *nsvd; 1686 struct seg *nseg; 1687 struct anon_map *amp; 1688 pgcnt_t opages; /* old segment size in pages */ 1689 pgcnt_t npages; /* new segment size in pages */ 1690 pgcnt_t dpages; /* pages being deleted (unmapped) */ 1691 hat_callback_t callback; /* used for free_vp_pages() */ 1692 hat_callback_t *cbp = NULL; 1693 caddr_t nbase; 1694 size_t nsize; 1695 size_t oswresv; 1696 int reclaim = 1; 1697 1698 /* 1699 * We don't need any segment level locks for "segvn" data 1700 * since the address space is "write" locked. 1701 */ 1702 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 1703 1704 /* 1705 * Fail the unmap if pages are SOFTLOCKed through this mapping. 1706 * softlockcnt is protected from change by the as write lock. 1707 */ 1708 retry: 1709 if (svd->softlockcnt > 0) { 1710 ASSERT(svd->tr_state == SEGVN_TR_OFF); 1711 /* 1712 * since we do have the writers lock nobody can fill 1713 * the cache during the purge. The flush either succeeds 1714 * or we still have pending I/Os. 1715 */ 1716 if (reclaim == 1) { 1717 segvn_purge(seg); 1718 reclaim = 0; 1719 goto retry; 1720 } 1721 return (EAGAIN); 1722 } 1723 1724 /* 1725 * Check for bad sizes 1726 */ 1727 if (addr < seg->s_base || addr + len > seg->s_base + seg->s_size || 1728 (len & PAGEOFFSET) || ((uintptr_t)addr & PAGEOFFSET)) { 1729 panic("segvn_unmap"); 1730 /*NOTREACHED*/ 1731 } 1732 1733 if (seg->s_szc != 0) { 1734 size_t pgsz = page_get_pagesize(seg->s_szc); 1735 int err; 1736 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { 1737 ASSERT(seg->s_base != addr || seg->s_size != len); 1738 if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 1739 ASSERT(svd->amp == NULL); 1740 ASSERT(svd->tr_state == SEGVN_TR_OFF); 1741 hat_leave_region(seg->s_as->a_hat, 1742 svd->rcookie, HAT_REGION_TEXT); 1743 svd->rcookie = HAT_INVALID_REGION_COOKIE; 1744 /* 1745 * could pass a flag to segvn_demote_range() 1746 * below to tell it not to do any unloads but 1747 * this case is rare enough to not bother for 1748 * now. 
1749 */ 1750 } else if (svd->tr_state == SEGVN_TR_INIT) { 1751 svd->tr_state = SEGVN_TR_OFF; 1752 } else if (svd->tr_state == SEGVN_TR_ON) { 1753 ASSERT(svd->amp != NULL); 1754 segvn_textunrepl(seg, 1); 1755 ASSERT(svd->amp == NULL); 1756 ASSERT(svd->tr_state == SEGVN_TR_OFF); 1757 } 1758 VM_STAT_ADD(segvnvmstats.demoterange[0]); 1759 err = segvn_demote_range(seg, addr, len, SDR_END, 0); 1760 if (err == 0) { 1761 return (IE_RETRY); 1762 } 1763 return (err); 1764 } 1765 } 1766 1767 /* Inform the vnode of the unmapping. */ 1768 if (svd->vp) { 1769 int error; 1770 1771 error = VOP_DELMAP(svd->vp, 1772 (offset_t)svd->offset + (uintptr_t)(addr - seg->s_base), 1773 seg->s_as, addr, len, svd->prot, svd->maxprot, 1774 svd->type, svd->cred, NULL); 1775 1776 if (error == EAGAIN) 1777 return (error); 1778 } 1779 1780 /* 1781 * Remove any page locks set through this mapping. 1782 * If text replication is not off no page locks could have been 1783 * established via this mapping. 1784 */ 1785 if (svd->tr_state == SEGVN_TR_OFF) { 1786 (void) segvn_lockop(seg, addr, len, 0, MC_UNLOCK, NULL, 0); 1787 } 1788 1789 if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 1790 ASSERT(svd->amp == NULL); 1791 ASSERT(svd->tr_state == SEGVN_TR_OFF); 1792 ASSERT(svd->type == MAP_PRIVATE); 1793 hat_leave_region(seg->s_as->a_hat, svd->rcookie, 1794 HAT_REGION_TEXT); 1795 svd->rcookie = HAT_INVALID_REGION_COOKIE; 1796 } else if (svd->tr_state == SEGVN_TR_ON) { 1797 ASSERT(svd->amp != NULL); 1798 ASSERT(svd->pageprot == 0 && !(svd->prot & PROT_WRITE)); 1799 segvn_textunrepl(seg, 1); 1800 ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF); 1801 } else { 1802 if (svd->tr_state != SEGVN_TR_OFF) { 1803 ASSERT(svd->tr_state == SEGVN_TR_INIT); 1804 svd->tr_state = SEGVN_TR_OFF; 1805 } 1806 /* 1807 * Unload any hardware translations in the range to be taken 1808 * out. Use a callback to invoke free_vp_pages() effectively. 1809 */ 1810 if (svd->vp != NULL && free_pages != 0) { 1811 callback.hcb_data = seg; 1812 callback.hcb_function = segvn_hat_unload_callback; 1813 cbp = &callback; 1814 } 1815 hat_unload_callback(seg->s_as->a_hat, addr, len, 1816 HAT_UNLOAD_UNMAP, cbp); 1817 1818 if (svd->type == MAP_SHARED && svd->vp != NULL && 1819 (svd->vp->v_flag & VVMEXEC) && 1820 ((svd->prot & PROT_WRITE) || svd->pageprot)) { 1821 segvn_inval_trcache(svd->vp); 1822 } 1823 } 1824 1825 /* 1826 * Check for entire segment 1827 */ 1828 if (addr == seg->s_base && len == seg->s_size) { 1829 seg_free(seg); 1830 return (0); 1831 } 1832 1833 opages = seg_pages(seg); 1834 dpages = btop(len); 1835 npages = opages - dpages; 1836 amp = svd->amp; 1837 ASSERT(amp == NULL || amp->a_szc >= seg->s_szc); 1838 1839 /* 1840 * Check for beginning of segment 1841 */ 1842 if (addr == seg->s_base) { 1843 if (svd->vpage != NULL) { 1844 size_t nbytes; 1845 struct vpage *ovpage; 1846 1847 ovpage = svd->vpage; /* keep pointer to vpage */ 1848 1849 nbytes = vpgtob(npages); 1850 svd->vpage = kmem_alloc(nbytes, KM_SLEEP); 1851 bcopy(&ovpage[dpages], svd->vpage, nbytes); 1852 1853 /* free up old vpage */ 1854 kmem_free(ovpage, vpgtob(opages)); 1855 } 1856 if (amp != NULL) { 1857 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1858 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { 1859 /* 1860 * Free up now unused parts of anon_map array. 
1861 */
1862 if (amp->a_szc == seg->s_szc) {
1863 if (seg->s_szc != 0) {
1864 anon_free_pages(amp->ahp,
1865 svd->anon_index, len,
1866 seg->s_szc);
1867 } else {
1868 anon_free(amp->ahp,
1869 svd->anon_index,
1870 len);
1871 }
1872 } else {
1873 ASSERT(svd->type == MAP_SHARED);
1874 ASSERT(amp->a_szc > seg->s_szc);
1875 anon_shmap_free_pages(amp,
1876 svd->anon_index, len);
1877 }
1878
1879 /*
1880 * Unreserve swap space for the
1881 * unmapped chunk of this segment in
1882 * case it's MAP_SHARED
1883 */
1884 if (svd->type == MAP_SHARED) {
1885 anon_unresv(len);
1886 amp->swresv -= len;
1887 }
1888 }
1889 ANON_LOCK_EXIT(&amp->a_rwlock);
1890 svd->anon_index += dpages;
1891 }
1892 if (svd->vp != NULL)
1893 svd->offset += len;
1894
1895 if (svd->swresv) {
1896 if (svd->flags & MAP_NORESERVE) {
1897 ASSERT(amp);
1898 oswresv = svd->swresv;
1899
1900 svd->swresv = ptob(anon_pages(amp->ahp,
1901 svd->anon_index, npages));
1902 anon_unresv(oswresv - svd->swresv);
1903 } else {
1904 anon_unresv(len);
1905 svd->swresv -= len;
1906 }
1907 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
1908 seg, len, 0);
1909 }
1910
1911 seg->s_base += len;
1912 seg->s_size -= len;
1913 return (0);
1914 }
1915
1916 /*
1917 * Check for end of segment
1918 */
1919 if (addr + len == seg->s_base + seg->s_size) {
1920 if (svd->vpage != NULL) {
1921 size_t nbytes;
1922 struct vpage *ovpage;
1923
1924 ovpage = svd->vpage; /* keep pointer to vpage */
1925
1926 nbytes = vpgtob(npages);
1927 svd->vpage = kmem_alloc(nbytes, KM_SLEEP);
1928 bcopy(ovpage, svd->vpage, nbytes);
1929
1930 /* free up old vpage */
1931 kmem_free(ovpage, vpgtob(opages));
1932
1933 }
1934 if (amp != NULL) {
1935 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
1936 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) {
1937 /*
1938 * Free up now unused parts of anon_map array.
1939 */
1940 ulong_t an_idx = svd->anon_index + npages;
1941 if (amp->a_szc == seg->s_szc) {
1942 if (seg->s_szc != 0) {
1943 anon_free_pages(amp->ahp,
1944 an_idx, len,
1945 seg->s_szc);
1946 } else {
1947 anon_free(amp->ahp, an_idx,
1948 len);
1949 }
1950 } else {
1951 ASSERT(svd->type == MAP_SHARED);
1952 ASSERT(amp->a_szc > seg->s_szc);
1953 anon_shmap_free_pages(amp,
1954 an_idx, len);
1955 }
1956
1957 /*
1958 * Unreserve swap space for the
1959 * unmapped chunk of this segment in
1960 * case it's MAP_SHARED
1961 */
1962 if (svd->type == MAP_SHARED) {
1963 anon_unresv(len);
1964 amp->swresv -= len;
1965 }
1966 }
1967 ANON_LOCK_EXIT(&amp->a_rwlock);
1968 }
1969
1970 if (svd->swresv) {
1971 if (svd->flags & MAP_NORESERVE) {
1972 ASSERT(amp);
1973 oswresv = svd->swresv;
1974 svd->swresv = ptob(anon_pages(amp->ahp,
1975 svd->anon_index, npages));
1976 anon_unresv(oswresv - svd->swresv);
1977 } else {
1978 anon_unresv(len);
1979 svd->swresv -= len;
1980 }
1981 TRACE_3(TR_FAC_VM, TR_ANON_PROC,
1982 "anon proc:%p %lu %u", seg, len, 0);
1983 }
1984
1985 seg->s_size -= len;
1986 return (0);
1987 }
1988
1989 /*
1990 * The section to go is in the middle of the segment,
1991 * have to make it into two segments. nseg is made for
1992 * the high end while seg is cut down at the low end.
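 *
 * A rough sketch of the split performed below (layout only, addresses
 * are illustrative):
 *
 *	s_base          addr      addr + len          s_base + s_size
 *	  |--- seg keeps ---|--- unmapped ---|----- nseg gets -----|
 *
 * so nbase = addr + len, nsize = (old end) - nbase, and seg->s_size
 * shrinks to addr - s_base before nseg is allocated.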
1993 */
1994 nbase = addr + len; /* new seg base */
1995 nsize = (seg->s_base + seg->s_size) - nbase; /* new seg size */
1996 seg->s_size = addr - seg->s_base; /* shrink old seg */
1997 nseg = seg_alloc(seg->s_as, nbase, nsize);
1998 if (nseg == NULL) {
1999 panic("segvn_unmap seg_alloc");
2000 /*NOTREACHED*/
2001 }
2002 nseg->s_ops = seg->s_ops;
2003 nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP);
2004 nseg->s_data = (void *)nsvd;
2005 nseg->s_szc = seg->s_szc;
2006 *nsvd = *svd;
2007 nsvd->seg = nseg;
2008 nsvd->offset = svd->offset + (uintptr_t)(nseg->s_base - seg->s_base);
2009 nsvd->swresv = 0;
2010 nsvd->softlockcnt = 0;
2011 ASSERT(nsvd->rcookie == HAT_INVALID_REGION_COOKIE);
2012
2013 if (svd->vp != NULL) {
2014 VN_HOLD(nsvd->vp);
2015 if (nsvd->type == MAP_SHARED)
2016 lgrp_shm_policy_init(NULL, nsvd->vp);
2017 }
2018 crhold(svd->cred);
2019
2020 if (svd->vpage == NULL) {
2021 nsvd->vpage = NULL;
2022 } else {
2023 /* need to split vpage into two arrays */
2024 size_t nbytes;
2025 struct vpage *ovpage;
2026
2027 ovpage = svd->vpage; /* keep pointer to vpage */
2028
2029 npages = seg_pages(seg); /* seg has shrunk */
2030 nbytes = vpgtob(npages);
2031 svd->vpage = kmem_alloc(nbytes, KM_SLEEP);
2032
2033 bcopy(ovpage, svd->vpage, nbytes);
2034
2035 npages = seg_pages(nseg);
2036 nbytes = vpgtob(npages);
2037 nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP);
2038
2039 bcopy(&ovpage[opages - npages], nsvd->vpage, nbytes);
2040
2041 /* free up old vpage */
2042 kmem_free(ovpage, vpgtob(opages));
2043 }
2044
2045 if (amp == NULL) {
2046 nsvd->amp = NULL;
2047 nsvd->anon_index = 0;
2048 } else {
2049 /*
2050 * Need to create a new anon map for the new segment.
2051 * We'll also allocate a new smaller array for the old
2052 * smaller segment to save space.
2053 */
2054 opages = btop((uintptr_t)(addr - seg->s_base));
2055 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
2056 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) {
2057 /*
2058 * Free up now unused parts of anon_map array.
2059 */
2060 ulong_t an_idx = svd->anon_index + opages;
2061 if (amp->a_szc == seg->s_szc) {
2062 if (seg->s_szc != 0) {
2063 anon_free_pages(amp->ahp, an_idx, len,
2064 seg->s_szc);
2065 } else {
2066 anon_free(amp->ahp, an_idx,
2067 len);
2068 }
2069 } else {
2070 ASSERT(svd->type == MAP_SHARED);
2071 ASSERT(amp->a_szc > seg->s_szc);
2072 anon_shmap_free_pages(amp, an_idx, len);
2073 }
2074
2075 /*
2076 * Unreserve swap space for the
2077 * unmapped chunk of this segment in
2078 * case it's MAP_SHARED
2079 */
2080 if (svd->type == MAP_SHARED) {
2081 anon_unresv(len);
2082 amp->swresv -= len;
2083 }
2084 }
2085 nsvd->anon_index = svd->anon_index +
2086 btop((uintptr_t)(nseg->s_base - seg->s_base));
2087 if (svd->type == MAP_SHARED) {
2088 amp->refcnt++;
2089 nsvd->amp = amp;
2090 } else {
2091 struct anon_map *namp;
2092 struct anon_hdr *nahp;
2093
2094 ASSERT(svd->type == MAP_PRIVATE);
2095 nahp = anon_create(btop(seg->s_size), ANON_SLEEP);
2096 namp = anonmap_alloc(nseg->s_size, 0, ANON_SLEEP);
2097 namp->a_szc = seg->s_szc;
2098 (void) anon_copy_ptr(amp->ahp, svd->anon_index, nahp,
2099 0, btop(seg->s_size), ANON_SLEEP);
2100 (void) anon_copy_ptr(amp->ahp, nsvd->anon_index,
2101 namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP);
2102 anon_release(amp->ahp, btop(amp->size));
2103 svd->anon_index = 0;
2104 nsvd->anon_index = 0;
2105 amp->ahp = nahp;
2106 amp->size = seg->s_size;
2107 nsvd->amp = namp;
2108 }
2109 ANON_LOCK_EXIT(&amp->a_rwlock);
2110 }
2111 if (svd->swresv) {
2112 if (svd->flags & MAP_NORESERVE) {
2113 ASSERT(amp);
2114 oswresv = svd->swresv;
2115 svd->swresv = ptob(anon_pages(amp->ahp,
2116 svd->anon_index, btop(seg->s_size)));
2117 nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp,
2118 nsvd->anon_index, btop(nseg->s_size)));
2119 ASSERT(oswresv >= (svd->swresv + nsvd->swresv));
2120 anon_unresv(oswresv - (svd->swresv + nsvd->swresv));
2121 } else {
2122 if (seg->s_size + nseg->s_size + len != svd->swresv) {
2123 panic("segvn_unmap: "
2124 "cannot split swap reservation");
2125 /*NOTREACHED*/
2126 }
2127 anon_unresv(len);
2128 svd->swresv = seg->s_size;
2129 nsvd->swresv = nseg->s_size;
2130 }
2131 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
2132 seg, len, 0);
2133 }
2134
2135 return (0); /* I'm glad that's all over with! */
2136 }
2137
2138 static void
2139 segvn_free(struct seg *seg)
2140 {
2141 struct segvn_data *svd = (struct segvn_data *)seg->s_data;
2142 pgcnt_t npages = seg_pages(seg);
2143 struct anon_map *amp;
2144 size_t len;
2145
2146 /*
2147 * We don't need any segment level locks for "segvn" data
2148 * since the address space is "write" locked.
2149 */
2150 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
2151 ASSERT(svd->tr_state == SEGVN_TR_OFF);
2152
2153 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
2154
2155 /*
2156 * Be sure to unlock pages. XXX Why do things get free'ed instead
2157 * of unmapped? XXX
2158 */
2159 (void) segvn_lockop(seg, seg->s_base, seg->s_size,
2160 0, MC_UNLOCK, NULL, 0);
2161
2162 /*
2163 * Deallocate the vpage and anon pointers if necessary and possible.
2164 */
2165 if (svd->vpage != NULL) {
2166 kmem_free(svd->vpage, vpgtob(npages));
2167 svd->vpage = NULL;
2168 }
2169 if ((amp = svd->amp) != NULL) {
2170 /*
2171 * If there are no more references to this anon_map
2172 * structure, then deallocate the structure after freeing
2173 * up all the anon slot pointers that we can.
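 * Roughly three cases are handled below: if this drops the last
 * reference, the anon slots this mapping covers (for MAP_PRIVATE) or
 * the entire anon_map's worth plus its swap reservation (for
 * MAP_SHARED) are freed and the anon_map itself is destroyed; if
 * references remain on a MAP_PRIVATE mapping, only this segment's
 * range of slot pointers is freed; if references remain on a
 * MAP_SHARED mapping, nothing is freed here.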
2174 */
2175 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
2176 ASSERT(amp->a_szc >= seg->s_szc);
2177 if (--amp->refcnt == 0) {
2178 if (svd->type == MAP_PRIVATE) {
2179 /*
2180 * Private - we only need to anon_free
2181 * the part that this segment refers to.
2182 */
2183 if (seg->s_szc != 0) {
2184 anon_free_pages(amp->ahp,
2185 svd->anon_index, seg->s_size,
2186 seg->s_szc);
2187 } else {
2188 anon_free(amp->ahp, svd->anon_index,
2189 seg->s_size);
2190 }
2191 } else {
2192 /*
2193 * Shared - anon_free the entire
2194 * anon_map's worth of stuff and
2195 * release any swap reservation.
2196 */
2197 if (amp->a_szc != 0) {
2198 anon_shmap_free_pages(amp, 0,
2199 amp->size);
2200 } else {
2201 anon_free(amp->ahp, 0, amp->size);
2202 }
2203 if ((len = amp->swresv) != 0) {
2204 anon_unresv(len);
2205 TRACE_3(TR_FAC_VM, TR_ANON_PROC,
2206 "anon proc:%p %lu %u",
2207 seg, len, 0);
2208 }
2209 }
2210 svd->amp = NULL;
2211 ANON_LOCK_EXIT(&amp->a_rwlock);
2212 anonmap_free(amp);
2213 } else if (svd->type == MAP_PRIVATE) {
2214 /*
2215 * We had a private mapping which still has
2216 * a held anon_map so just free up all the
2217 * anon slot pointers that we were using.
2218 */
2219 if (seg->s_szc != 0) {
2220 anon_free_pages(amp->ahp, svd->anon_index,
2221 seg->s_size, seg->s_szc);
2222 } else {
2223 anon_free(amp->ahp, svd->anon_index,
2224 seg->s_size);
2225 }
2226 ANON_LOCK_EXIT(&amp->a_rwlock);
2227 } else {
2228 ANON_LOCK_EXIT(&amp->a_rwlock);
2229 }
2230 }
2231
2232 /*
2233 * Release swap reservation.
2234 */
2235 if ((len = svd->swresv) != 0) {
2236 anon_unresv(svd->swresv);
2237 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
2238 seg, len, 0);
2239 svd->swresv = 0;
2240 }
2241 /*
2242 * Release claim on vnode, credentials, and finally free the
2243 * private data.
2244 */
2245 if (svd->vp != NULL) {
2246 if (svd->type == MAP_SHARED)
2247 lgrp_shm_policy_fini(NULL, svd->vp);
2248 VN_RELE(svd->vp);
2249 svd->vp = NULL;
2250 }
2251 crfree(svd->cred);
2252 svd->cred = NULL;
2253
2254 seg->s_data = NULL;
2255 kmem_cache_free(segvn_cache, svd);
2256 }
2257
2258 #ifdef DEBUG
2259 uint32_t segvn_slock_mtbf = 0;
2260 #endif
2261
2262 ulong_t segvn_lpglck_limit = 0;
2263
2264 /*
2265 * Support routines used by segvn_pagelock() and softlock faults for anonymous
2266 * pages to implement availrmem accounting in a way that makes sure the
2267 * same memory is accounted just once for all softlock/pagelock purposes.
2268 * This prevents a bug when availrmem is quickly incorrectly exhausted from
2269 * several pagelocks to different parts of the same large page since each
2270 * pagelock has to decrement availrmem by the size of the entire large
2271 * page. Note those pages are not COW shared until softunlock/pageunlock so
2272 * we don't need to use cow style accounting here. We also need to make sure
2273 * the entire large page is accounted even if softlock range is less than the
2274 * entire large page because large anon pages can't be demoted when any of
2275 * constituent pages is locked. The caller calls this routine for every page_t
2276 * it locks. The very first page in the range may not be the root page of a
2277 * large page. For all other pages it's guaranteed we are going to visit the
2278 * root of a particular large page before any other constituent page as we are
2279 * locking sequential pages belonging to the same anon map. So we do all the
2280 * locking when the root is encountered except for the very first page.
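 * (For example, if page_get_pagecnt(pp->p_szc) is 8 and a caller
 * softlocks only three of the constituent pages, availrmem is still
 * charged for all 8 pages, and it is charged exactly once, when the
 * root is reached via the first page; later constituent pages do no
 * further accounting.)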
Since 2281 * softlocking is not supported (except S_READ_NOCOW special case) for vmpss 2282 * segments and since vnode pages can be demoted without locking all 2283 * constituent pages vnode pages don't come here. Unlocking relies on the 2284 * fact that pagesize can't change whenever any of constituent large pages is 2285 * locked at least SE_SHARED. This allows unlocking code to find the right 2286 * root and decrement availrmem by the same amount it was incremented when the 2287 * page was locked. 2288 */ 2289 static int 2290 segvn_slock_anonpages(page_t *pp, int first) 2291 { 2292 pgcnt_t pages; 2293 pfn_t pfn; 2294 uchar_t szc = pp->p_szc; 2295 2296 ASSERT(PAGE_LOCKED(pp)); 2297 ASSERT(pp->p_vnode != NULL); 2298 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 2299 2300 /* 2301 * pagesize won't change as long as any constituent page is locked. 2302 */ 2303 pages = page_get_pagecnt(pp->p_szc); 2304 pfn = page_pptonum(pp); 2305 2306 if (!first) { 2307 if (!IS_P2ALIGNED(pfn, pages)) { 2308 #ifdef DEBUG 2309 pp = &pp[-(spgcnt_t)(pfn & (pages - 1))]; 2310 pfn = page_pptonum(pp); 2311 ASSERT(IS_P2ALIGNED(pfn, pages)); 2312 ASSERT(pp->p_szc == szc); 2313 ASSERT(pp->p_vnode != NULL); 2314 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 2315 ASSERT(pp->p_slckcnt != 0); 2316 #endif /* DEBUG */ 2317 return (1); 2318 } 2319 } else if (!IS_P2ALIGNED(pfn, pages)) { 2320 pp = &pp[-(spgcnt_t)(pfn & (pages - 1))]; 2321 #ifdef DEBUG 2322 pfn = page_pptonum(pp); 2323 ASSERT(IS_P2ALIGNED(pfn, pages)); 2324 ASSERT(pp->p_szc == szc); 2325 ASSERT(pp->p_vnode != NULL); 2326 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 2327 #endif /* DEBUG */ 2328 } 2329 2330 #ifdef DEBUG 2331 if (segvn_slock_mtbf && !(gethrtime() % segvn_slock_mtbf)) { 2332 return (0); 2333 } 2334 #endif /* DEBUG */ 2335 2336 /* 2337 * pp is a root page. 2338 * We haven't locked this large page yet. 2339 */ 2340 page_struct_lock(pp); 2341 if (pp->p_slckcnt != 0) { 2342 if (pp->p_slckcnt < PAGE_SLOCK_MAXIMUM) { 2343 pp->p_slckcnt++; 2344 page_struct_unlock(pp); 2345 return (1); 2346 } 2347 page_struct_unlock(pp); 2348 segvn_lpglck_limit++; 2349 return (0); 2350 } 2351 mutex_enter(&freemem_lock); 2352 if (availrmem < tune.t_minarmem + pages) { 2353 mutex_exit(&freemem_lock); 2354 page_struct_unlock(pp); 2355 return (0); 2356 } 2357 pp->p_slckcnt++; 2358 availrmem -= pages; 2359 mutex_exit(&freemem_lock); 2360 page_struct_unlock(pp); 2361 return (1); 2362 } 2363 2364 static void 2365 segvn_sunlock_anonpages(page_t *pp, int first) 2366 { 2367 pgcnt_t pages; 2368 pfn_t pfn; 2369 2370 ASSERT(PAGE_LOCKED(pp)); 2371 ASSERT(pp->p_vnode != NULL); 2372 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 2373 2374 /* 2375 * pagesize won't change as long as any constituent page is locked. 2376 */ 2377 pages = page_get_pagecnt(pp->p_szc); 2378 pfn = page_pptonum(pp); 2379 2380 if (!first) { 2381 if (!IS_P2ALIGNED(pfn, pages)) { 2382 return; 2383 } 2384 } else if (!IS_P2ALIGNED(pfn, pages)) { 2385 pp = &pp[-(spgcnt_t)(pfn & (pages - 1))]; 2386 #ifdef DEBUG 2387 pfn = page_pptonum(pp); 2388 ASSERT(IS_P2ALIGNED(pfn, pages)); 2389 #endif /* DEBUG */ 2390 } 2391 ASSERT(pp->p_vnode != NULL); 2392 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 2393 ASSERT(pp->p_slckcnt != 0); 2394 page_struct_lock(pp); 2395 if (--pp->p_slckcnt == 0) { 2396 mutex_enter(&freemem_lock); 2397 availrmem += pages; 2398 mutex_exit(&freemem_lock); 2399 } 2400 page_struct_unlock(pp); 2401 } 2402 2403 /* 2404 * Do a F_SOFTUNLOCK call over the range requested. The range must have 2405 * already been F_SOFTLOCK'ed. 
2406 * Caller must always match addr and len of a softunlock with a previous
2407 * softlock with exactly the same addr and len.
2408 */
2409 static void
2410 segvn_softunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw)
2411 {
2412 struct segvn_data *svd = (struct segvn_data *)seg->s_data;
2413 page_t *pp;
2414 caddr_t adr;
2415 struct vnode *vp;
2416 u_offset_t offset;
2417 ulong_t anon_index;
2418 struct anon_map *amp;
2419 struct anon *ap = NULL;
2420
2421 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
2422 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock));
2423
2424 if ((amp = svd->amp) != NULL)
2425 anon_index = svd->anon_index + seg_page(seg, addr);
2426
2427 if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
2428 ASSERT(svd->tr_state == SEGVN_TR_OFF);
2429 hat_unlock_region(seg->s_as->a_hat, addr, len, svd->rcookie);
2430 } else {
2431 hat_unlock(seg->s_as->a_hat, addr, len);
2432 }
2433 for (adr = addr; adr < addr + len; adr += PAGESIZE) {
2434 if (amp != NULL) {
2435 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2436 if ((ap = anon_get_ptr(amp->ahp, anon_index++))
2437 != NULL) {
2438 swap_xlate(ap, &vp, &offset);
2439 } else {
2440 vp = svd->vp;
2441 offset = svd->offset +
2442 (uintptr_t)(adr - seg->s_base);
2443 }
2444 ANON_LOCK_EXIT(&amp->a_rwlock);
2445 } else {
2446 vp = svd->vp;
2447 offset = svd->offset +
2448 (uintptr_t)(adr - seg->s_base);
2449 }
2450
2451 /*
2452 * Use page_find() instead of page_lookup() to
2453 * find the page since we know that it is locked.
2454 */
2455 pp = page_find(vp, offset);
2456 if (pp == NULL) {
2457 panic(
2458 "segvn_softunlock: addr %p, ap %p, vp %p, off %llx",
2459 (void *)adr, (void *)ap, (void *)vp, offset);
2460 /*NOTREACHED*/
2461 }
2462
2463 if (rw == S_WRITE) {
2464 hat_setrefmod(pp);
2465 if (seg->s_as->a_vbits)
2466 hat_setstat(seg->s_as, adr, PAGESIZE,
2467 P_REF | P_MOD);
2468 } else if (rw != S_OTHER) {
2469 hat_setref(pp);
2470 if (seg->s_as->a_vbits)
2471 hat_setstat(seg->s_as, adr, PAGESIZE, P_REF);
2472 }
2473 TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT,
2474 "segvn_fault:pp %p vp %p offset %llx", pp, vp, offset);
2475 if (svd->vp == NULL) {
2476 segvn_sunlock_anonpages(pp, adr == addr);
2477 }
2478 page_unlock(pp);
2479 }
2480 mutex_enter(&freemem_lock); /* for availrmem */
2481 if (svd->vp != NULL) {
2482 availrmem += btop(len);
2483 }
2484 segvn_pages_locked -= btop(len);
2485 svd->softlockcnt -= btop(len);
2486 mutex_exit(&freemem_lock);
2487 if (svd->softlockcnt == 0) {
2488 /*
2489 * All SOFTLOCKS are gone. Wakeup any waiting
2490 * unmappers so they can try again to unmap.
2491 * Check for waiters first without the mutex
2492 * held so we don't always grab the mutex on
2493 * softunlocks.
2494 */
2495 if (AS_ISUNMAPWAIT(seg->s_as)) {
2496 mutex_enter(&seg->s_as->a_contents);
2497 if (AS_ISUNMAPWAIT(seg->s_as)) {
2498 AS_CLRUNMAPWAIT(seg->s_as);
2499 cv_broadcast(&seg->s_as->a_cv);
2500 }
2501 mutex_exit(&seg->s_as->a_contents);
2502 }
2503 }
2504 }
2505
2506 #define PAGE_HANDLED ((page_t *)-1)
2507
2508 /*
2509 * Release all the pages in the NULL terminated ppp list
2510 * which haven't already been converted to PAGE_HANDLED.
2511 */
2512 static void
2513 segvn_pagelist_rele(page_t **ppp)
2514 {
2515 for (; *ppp != NULL; ppp++) {
2516 if (*ppp != PAGE_HANDLED)
2517 page_unlock(*ppp);
2518 }
2519 }
2520
2521 static int stealcow = 1;
2522
2523 /*
2524 * Workaround for viking chip bug. See bug id 1220902.
2525 * To fix this down in pagefault() would require importing so 2526 * much as and segvn code as to be unmaintainable. 2527 */ 2528 int enable_mbit_wa = 0; 2529 2530 /* 2531 * Handles all the dirty work of getting the right 2532 * anonymous pages and loading up the translations. 2533 * This routine is called only from segvn_fault() 2534 * when looping over the range of addresses requested. 2535 * 2536 * The basic algorithm here is: 2537 * If this is an anon_zero case 2538 * Call anon_zero to allocate page 2539 * Load up translation 2540 * Return 2541 * endif 2542 * If this is an anon page 2543 * Use anon_getpage to get the page 2544 * else 2545 * Find page in pl[] list passed in 2546 * endif 2547 * If not a cow 2548 * Load up the translation to the page 2549 * return 2550 * endif 2551 * Call anon_private to handle cow 2552 * Load up (writable) translation to new page 2553 */ 2554 static faultcode_t 2555 segvn_faultpage( 2556 struct hat *hat, /* the hat to use for mapping */ 2557 struct seg *seg, /* seg_vn of interest */ 2558 caddr_t addr, /* address in as */ 2559 u_offset_t off, /* offset in vp */ 2560 struct vpage *vpage, /* pointer to vpage for vp, off */ 2561 page_t *pl[], /* object source page pointer */ 2562 uint_t vpprot, /* access allowed to object pages */ 2563 enum fault_type type, /* type of fault */ 2564 enum seg_rw rw, /* type of access at fault */ 2565 int brkcow, /* we may need to break cow */ 2566 int first) /* first page for this fault if 1 */ 2567 { 2568 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 2569 page_t *pp, **ppp; 2570 uint_t pageflags = 0; 2571 page_t *anon_pl[1 + 1]; 2572 page_t *opp = NULL; /* original page */ 2573 uint_t prot; 2574 int err; 2575 int cow; 2576 int claim; 2577 int steal = 0; 2578 ulong_t anon_index; 2579 struct anon *ap, *oldap; 2580 struct anon_map *amp; 2581 int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD; 2582 int anon_lock = 0; 2583 anon_sync_obj_t cookie; 2584 2585 if (svd->flags & MAP_TEXT) { 2586 hat_flag |= HAT_LOAD_TEXT; 2587 } 2588 2589 ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock)); 2590 ASSERT(seg->s_szc == 0); 2591 ASSERT(svd->tr_state != SEGVN_TR_INIT); 2592 2593 /* 2594 * Initialize protection value for this page. 2595 * If we have per page protection values check it now. 2596 */ 2597 if (svd->pageprot) { 2598 uint_t protchk; 2599 2600 switch (rw) { 2601 case S_READ: 2602 protchk = PROT_READ; 2603 break; 2604 case S_WRITE: 2605 protchk = PROT_WRITE; 2606 break; 2607 case S_EXEC: 2608 protchk = PROT_EXEC; 2609 break; 2610 case S_OTHER: 2611 default: 2612 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 2613 break; 2614 } 2615 2616 prot = VPP_PROT(vpage); 2617 if ((prot & protchk) == 0) 2618 return (FC_PROT); /* illegal access type */ 2619 } else { 2620 prot = svd->prot; 2621 } 2622 2623 if (type == F_SOFTLOCK && svd->vp != NULL) { 2624 mutex_enter(&freemem_lock); 2625 if (availrmem <= tune.t_minarmem) { 2626 mutex_exit(&freemem_lock); 2627 return (FC_MAKE_ERR(ENOMEM)); /* out of real memory */ 2628 } else { 2629 availrmem--; 2630 svd->softlockcnt++; 2631 segvn_pages_locked++; 2632 } 2633 mutex_exit(&freemem_lock); 2634 } 2635 2636 /* 2637 * Always acquire the anon array lock to prevent 2 threads from 2638 * allocating separate anon slots for the same "addr". 
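 *
 * The pattern used below is roughly:
 *
 *	anon_array_enter(amp, anon_index, &cookie);
 *	ap = anon_get_ptr(amp->ahp, anon_index);
 *	... allocate a page and anon_set_ptr() if ap was NULL ...
 *	anon_array_exit(&cookie);
 *
 * so a second faulter on the same address blocks in anon_array_enter()
 * until the winner has populated the slot.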
2639 */
2640
2641 if ((amp = svd->amp) != NULL) {
2642 ASSERT(RW_READ_HELD(&amp->a_rwlock));
2643 anon_index = svd->anon_index + seg_page(seg, addr);
2644 anon_array_enter(amp, anon_index, &cookie);
2645 anon_lock = 1;
2646 }
2647
2648 if (svd->vp == NULL && amp != NULL) {
2649 if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) {
2650 /*
2651 * Allocate a (normally) writable anonymous page of
2652 * zeroes. If no advance reservations, reserve now.
2653 */
2654 if (svd->flags & MAP_NORESERVE) {
2655 if (anon_resv_zone(ptob(1),
2656 seg->s_as->a_proc->p_zone)) {
2657 atomic_add_long(&svd->swresv, ptob(1));
2658 } else {
2659 err = ENOMEM;
2660 goto out;
2661 }
2662 }
2663 if ((pp = anon_zero(seg, addr, &ap,
2664 svd->cred)) == NULL) {
2665 err = ENOMEM;
2666 goto out; /* out of swap space */
2667 }
2668 /*
2669 * Re-acquire the anon_map lock and
2670 * initialize the anon array entry.
2671 */
2672 (void) anon_set_ptr(amp->ahp, anon_index, ap,
2673 ANON_SLEEP);
2674
2675 ASSERT(pp->p_szc == 0);
2676
2677 /*
2678 * Handle pages that have been marked for migration
2679 */
2680 if (lgrp_optimizations())
2681 page_migrate(seg, addr, &pp, 1);
2682
2683 if (type == F_SOFTLOCK) {
2684 if (!segvn_slock_anonpages(pp, first)) {
2685 page_unlock(pp);
2686 err = ENOMEM;
2687 goto out;
2688 } else {
2689 mutex_enter(&freemem_lock);
2690 svd->softlockcnt++;
2691 segvn_pages_locked++;
2692 mutex_exit(&freemem_lock);
2693 }
2694 }
2695
2696 if (enable_mbit_wa) {
2697 if (rw == S_WRITE)
2698 hat_setmod(pp);
2699 else if (!hat_ismod(pp))
2700 prot &= ~PROT_WRITE;
2701 }
2702 /*
2703 * If AS_PAGLCK is set in a_flags (via memcntl(2)
2704 * with MC_LOCKAS, MCL_FUTURE) and this is a
2705 * MAP_NORESERVE segment, we may need to
2706 * permanently lock the page as it is being faulted
2707 * for the first time. The following text applies
2708 * only to MAP_NORESERVE segments:
2709 *
2710 * As per memcntl(2), if this segment was created
2711 * after MCL_FUTURE was applied (a "future"
2712 * segment), its pages must be locked. If this
2713 * segment existed at MCL_FUTURE application (a
2714 * "past" segment), the interface is unclear.
2715 *
2716 * We decide to lock only if vpage is present:
2717 *
2718 * - "future" segments will have a vpage array (see
2719 * as_map), and so will be locked as required
2720 *
2721 * - "past" segments may not have a vpage array,
2722 * depending on whether events (such as
2723 * mprotect) have occurred. Locking if vpage
2724 * exists will preserve legacy behavior. Not
2725 * locking if vpage is absent, will not break
2726 * the interface or legacy behavior. Note that
2727 * allocating vpage here if it's absent requires
2728 * upgrading the segvn reader lock, the cost of
2729 * which does not seem worthwhile.
2730 *
2731 * Usually testing and setting VPP_ISPPLOCK and
2732 * VPP_SETPPLOCK requires holding the segvn lock as
2733 * writer, but in this case all readers are
2734 * serializing on the anon array lock.
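 *
 * Note that the locked-memory resource control charged below is rolled
 * back if page_pp_lock() fails, so the rctl accounting and
 * VPP_ISPPLOCK stay consistent with each other.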
2735 */ 2736 if (AS_ISPGLCK(seg->s_as) && vpage != NULL && 2737 (svd->flags & MAP_NORESERVE) && 2738 !VPP_ISPPLOCK(vpage)) { 2739 proc_t *p = seg->s_as->a_proc; 2740 ASSERT(svd->type == MAP_PRIVATE); 2741 mutex_enter(&p->p_lock); 2742 if (rctl_incr_locked_mem(p, NULL, PAGESIZE, 2743 1) == 0) { 2744 claim = VPP_PROT(vpage) & PROT_WRITE; 2745 if (page_pp_lock(pp, claim, 0)) { 2746 VPP_SETPPLOCK(vpage); 2747 } else { 2748 rctl_decr_locked_mem(p, NULL, 2749 PAGESIZE, 1); 2750 } 2751 } 2752 mutex_exit(&p->p_lock); 2753 } 2754 2755 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 2756 hat_memload(hat, addr, pp, prot, hat_flag); 2757 2758 if (!(hat_flag & HAT_LOAD_LOCK)) 2759 page_unlock(pp); 2760 2761 anon_array_exit(&cookie); 2762 return (0); 2763 } 2764 } 2765 2766 /* 2767 * Obtain the page structure via anon_getpage() if it is 2768 * a private copy of an object (the result of a previous 2769 * copy-on-write). 2770 */ 2771 if (amp != NULL) { 2772 if ((ap = anon_get_ptr(amp->ahp, anon_index)) != NULL) { 2773 err = anon_getpage(&ap, &vpprot, anon_pl, PAGESIZE, 2774 seg, addr, rw, svd->cred); 2775 if (err) 2776 goto out; 2777 2778 if (svd->type == MAP_SHARED) { 2779 /* 2780 * If this is a shared mapping to an 2781 * anon_map, then ignore the write 2782 * permissions returned by anon_getpage(). 2783 * They apply to the private mappings 2784 * of this anon_map. 2785 */ 2786 vpprot |= PROT_WRITE; 2787 } 2788 opp = anon_pl[0]; 2789 } 2790 } 2791 2792 /* 2793 * Search the pl[] list passed in if it is from the 2794 * original object (i.e., not a private copy). 2795 */ 2796 if (opp == NULL) { 2797 /* 2798 * Find original page. We must be bringing it in 2799 * from the list in pl[]. 2800 */ 2801 for (ppp = pl; (opp = *ppp) != NULL; ppp++) { 2802 if (opp == PAGE_HANDLED) 2803 continue; 2804 ASSERT(opp->p_vnode == svd->vp); /* XXX */ 2805 if (opp->p_offset == off) 2806 break; 2807 } 2808 if (opp == NULL) { 2809 panic("segvn_faultpage not found"); 2810 /*NOTREACHED*/ 2811 } 2812 *ppp = PAGE_HANDLED; 2813 2814 } 2815 2816 ASSERT(PAGE_LOCKED(opp)); 2817 2818 TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT, 2819 "segvn_fault:pp %p vp %p offset %llx", 2820 opp, NULL, 0); 2821 2822 /* 2823 * The fault is treated as a copy-on-write fault if a 2824 * write occurs on a private segment and the object 2825 * page (i.e., mapping) is write protected. We assume 2826 * that fatal protection checks have already been made. 2827 */ 2828 2829 if (brkcow) { 2830 ASSERT(svd->tr_state == SEGVN_TR_OFF); 2831 cow = !(vpprot & PROT_WRITE); 2832 } else if (svd->tr_state == SEGVN_TR_ON) { 2833 /* 2834 * If we are doing text replication COW on first touch. 2835 */ 2836 ASSERT(amp != NULL); 2837 ASSERT(svd->vp != NULL); 2838 ASSERT(rw != S_WRITE); 2839 cow = (ap == NULL); 2840 } else { 2841 cow = 0; 2842 } 2843 2844 /* 2845 * If not a copy-on-write case load the translation 2846 * and return. 
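 * This covers, for instance, read faults, faults on shared mappings,
 * writes where the object page is already writable (vpprot includes
 * PROT_WRITE), and text replication segments that already have a
 * replicated copy for this address.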
2847 */ 2848 if (cow == 0) { 2849 2850 /* 2851 * Handle pages that have been marked for migration 2852 */ 2853 if (lgrp_optimizations()) 2854 page_migrate(seg, addr, &opp, 1); 2855 2856 if (type == F_SOFTLOCK && svd->vp == NULL) { 2857 2858 ASSERT(opp->p_szc == 0 || 2859 (svd->type == MAP_SHARED && 2860 amp != NULL && amp->a_szc != 0)); 2861 2862 if (!segvn_slock_anonpages(opp, first)) { 2863 page_unlock(opp); 2864 err = ENOMEM; 2865 goto out; 2866 } else { 2867 mutex_enter(&freemem_lock); 2868 svd->softlockcnt++; 2869 segvn_pages_locked++; 2870 mutex_exit(&freemem_lock); 2871 } 2872 } 2873 if (IS_VMODSORT(opp->p_vnode) || enable_mbit_wa) { 2874 if (rw == S_WRITE) 2875 hat_setmod(opp); 2876 else if (rw != S_OTHER && !hat_ismod(opp)) 2877 prot &= ~PROT_WRITE; 2878 } 2879 2880 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE || 2881 (!svd->pageprot && svd->prot == (prot & vpprot))); 2882 ASSERT(amp == NULL || 2883 svd->rcookie == HAT_INVALID_REGION_COOKIE); 2884 hat_memload_region(hat, addr, opp, prot & vpprot, hat_flag, 2885 svd->rcookie); 2886 2887 if (!(hat_flag & HAT_LOAD_LOCK)) 2888 page_unlock(opp); 2889 2890 if (anon_lock) { 2891 anon_array_exit(&cookie); 2892 } 2893 return (0); 2894 } 2895 2896 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 2897 2898 hat_setref(opp); 2899 2900 ASSERT(amp != NULL && anon_lock); 2901 2902 /* 2903 * Steal the page only if it isn't a private page 2904 * since stealing a private page is not worth the effort. 2905 */ 2906 if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) 2907 steal = 1; 2908 2909 /* 2910 * Steal the original page if the following conditions are true: 2911 * 2912 * We are low on memory, the page is not private, page is not large, 2913 * not shared, not modified, not `locked' or if we have it `locked' 2914 * (i.e., p_cowcnt == 1 and p_lckcnt == 0, which also implies 2915 * that the page is not shared) and if it doesn't have any 2916 * translations. page_struct_lock isn't needed to look at p_cowcnt 2917 * and p_lckcnt because we first get exclusive lock on page. 2918 */ 2919 (void) hat_pagesync(opp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD); 2920 2921 if (stealcow && freemem < minfree && steal && opp->p_szc == 0 && 2922 page_tryupgrade(opp) && !hat_ismod(opp) && 2923 ((opp->p_lckcnt == 0 && opp->p_cowcnt == 0) || 2924 (opp->p_lckcnt == 0 && opp->p_cowcnt == 1 && 2925 vpage != NULL && VPP_ISPPLOCK(vpage)))) { 2926 /* 2927 * Check if this page has other translations 2928 * after unloading our translation. 2929 */ 2930 if (hat_page_is_mapped(opp)) { 2931 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 2932 hat_unload(seg->s_as->a_hat, addr, PAGESIZE, 2933 HAT_UNLOAD); 2934 } 2935 2936 /* 2937 * hat_unload() might sync back someone else's recent 2938 * modification, so check again. 2939 */ 2940 if (!hat_ismod(opp) && !hat_page_is_mapped(opp)) 2941 pageflags |= STEAL_PAGE; 2942 } 2943 2944 /* 2945 * If we have a vpage pointer, see if it indicates that we have 2946 * ``locked'' the page we map -- if so, tell anon_private to 2947 * transfer the locking resource to the new page. 2948 * 2949 * See Statement at the beginning of segvn_lockop regarding 2950 * the way lockcnts/cowcnts are handled during COW. 2951 * 2952 */ 2953 if (vpage != NULL && VPP_ISPPLOCK(vpage)) 2954 pageflags |= LOCK_PAGE; 2955 2956 /* 2957 * Allocate a private page and perform the copy. 2958 * For MAP_NORESERVE reserve swap space now, unless this 2959 * is a cow fault on an existing anon page in which case 2960 * MAP_NORESERVE will have made advance reservations. 
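 * In other words, for MAP_NORESERVE each anon page ends up backed by a
 * single page worth of swap (ptob(1)) reserved lazily when the page is
 * first created, with svd->swresv tracking the running total of those
 * lazy reservations.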
2961 */ 2962 if ((svd->flags & MAP_NORESERVE) && (ap == NULL)) { 2963 if (anon_resv_zone(ptob(1), seg->s_as->a_proc->p_zone)) { 2964 atomic_add_long(&svd->swresv, ptob(1)); 2965 } else { 2966 page_unlock(opp); 2967 err = ENOMEM; 2968 goto out; 2969 } 2970 } 2971 oldap = ap; 2972 pp = anon_private(&ap, seg, addr, prot, opp, pageflags, svd->cred); 2973 if (pp == NULL) { 2974 err = ENOMEM; /* out of swap space */ 2975 goto out; 2976 } 2977 2978 /* 2979 * If we copied away from an anonymous page, then 2980 * we are one step closer to freeing up an anon slot. 2981 * 2982 * NOTE: The original anon slot must be released while 2983 * holding the "anon_map" lock. This is necessary to prevent 2984 * other threads from obtaining a pointer to the anon slot 2985 * which may be freed if its "refcnt" is 1. 2986 */ 2987 if (oldap != NULL) 2988 anon_decref(oldap); 2989 2990 (void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP); 2991 2992 /* 2993 * Handle pages that have been marked for migration 2994 */ 2995 if (lgrp_optimizations()) 2996 page_migrate(seg, addr, &pp, 1); 2997 2998 ASSERT(pp->p_szc == 0); 2999 if (type == F_SOFTLOCK && svd->vp == NULL) { 3000 if (!segvn_slock_anonpages(pp, first)) { 3001 page_unlock(pp); 3002 err = ENOMEM; 3003 goto out; 3004 } else { 3005 mutex_enter(&freemem_lock); 3006 svd->softlockcnt++; 3007 segvn_pages_locked++; 3008 mutex_exit(&freemem_lock); 3009 } 3010 } 3011 3012 ASSERT(!IS_VMODSORT(pp->p_vnode)); 3013 if (enable_mbit_wa) { 3014 if (rw == S_WRITE) 3015 hat_setmod(pp); 3016 else if (!hat_ismod(pp)) 3017 prot &= ~PROT_WRITE; 3018 } 3019 3020 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 3021 hat_memload(hat, addr, pp, prot, hat_flag); 3022 3023 if (!(hat_flag & HAT_LOAD_LOCK)) 3024 page_unlock(pp); 3025 3026 ASSERT(anon_lock); 3027 anon_array_exit(&cookie); 3028 return (0); 3029 out: 3030 if (anon_lock) 3031 anon_array_exit(&cookie); 3032 3033 if (type == F_SOFTLOCK && svd->vp != NULL) { 3034 mutex_enter(&freemem_lock); 3035 availrmem++; 3036 segvn_pages_locked--; 3037 svd->softlockcnt--; 3038 mutex_exit(&freemem_lock); 3039 } 3040 return (FC_MAKE_ERR(err)); 3041 } 3042 3043 /* 3044 * relocate a bunch of smaller targ pages into one large repl page. all targ 3045 * pages must be complete pages smaller than replacement pages. 3046 * it's assumed that no page's szc can change since they are all PAGESIZE or 3047 * complete large pages locked SHARED. 
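 * For example (sizes illustrative only): with a replacement page of
 * page_get_pagecnt(repl_szc) == 8 constituents, targ[] may hold eight
 * PAGESIZE pages, one complete 4 page large page plus four PAGESIZE
 * pages, etc.; each targ entry is page_relocate()d onto the matching
 * constituents of the replacement page, and on return every targ[i]
 * points at a constituent of the replacement page, downgraded to
 * SE_SHARED.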
3048 */ 3049 static void 3050 segvn_relocate_pages(page_t **targ, page_t *replacement) 3051 { 3052 page_t *pp; 3053 pgcnt_t repl_npgs, curnpgs; 3054 pgcnt_t i; 3055 uint_t repl_szc = replacement->p_szc; 3056 page_t *first_repl = replacement; 3057 page_t *repl; 3058 spgcnt_t npgs; 3059 3060 VM_STAT_ADD(segvnvmstats.relocatepages[0]); 3061 3062 ASSERT(repl_szc != 0); 3063 npgs = repl_npgs = page_get_pagecnt(repl_szc); 3064 3065 i = 0; 3066 while (repl_npgs) { 3067 spgcnt_t nreloc; 3068 int err; 3069 ASSERT(replacement != NULL); 3070 pp = targ[i]; 3071 ASSERT(pp->p_szc < repl_szc); 3072 ASSERT(PAGE_EXCL(pp)); 3073 ASSERT(!PP_ISFREE(pp)); 3074 curnpgs = page_get_pagecnt(pp->p_szc); 3075 if (curnpgs == 1) { 3076 VM_STAT_ADD(segvnvmstats.relocatepages[1]); 3077 repl = replacement; 3078 page_sub(&replacement, repl); 3079 ASSERT(PAGE_EXCL(repl)); 3080 ASSERT(!PP_ISFREE(repl)); 3081 ASSERT(repl->p_szc == repl_szc); 3082 } else { 3083 page_t *repl_savepp; 3084 int j; 3085 VM_STAT_ADD(segvnvmstats.relocatepages[2]); 3086 repl_savepp = replacement; 3087 for (j = 0; j < curnpgs; j++) { 3088 repl = replacement; 3089 page_sub(&replacement, repl); 3090 ASSERT(PAGE_EXCL(repl)); 3091 ASSERT(!PP_ISFREE(repl)); 3092 ASSERT(repl->p_szc == repl_szc); 3093 ASSERT(page_pptonum(targ[i + j]) == 3094 page_pptonum(targ[i]) + j); 3095 } 3096 repl = repl_savepp; 3097 ASSERT(IS_P2ALIGNED(page_pptonum(repl), curnpgs)); 3098 } 3099 err = page_relocate(&pp, &repl, 0, 1, &nreloc, NULL); 3100 if (err || nreloc != curnpgs) { 3101 panic("segvn_relocate_pages: " 3102 "page_relocate failed err=%d curnpgs=%ld " 3103 "nreloc=%ld", err, curnpgs, nreloc); 3104 } 3105 ASSERT(curnpgs <= repl_npgs); 3106 repl_npgs -= curnpgs; 3107 i += curnpgs; 3108 } 3109 ASSERT(replacement == NULL); 3110 3111 repl = first_repl; 3112 repl_npgs = npgs; 3113 for (i = 0; i < repl_npgs; i++) { 3114 ASSERT(PAGE_EXCL(repl)); 3115 ASSERT(!PP_ISFREE(repl)); 3116 targ[i] = repl; 3117 page_downgrade(targ[i]); 3118 repl++; 3119 } 3120 } 3121 3122 /* 3123 * Check if all pages in ppa array are complete smaller than szc pages and 3124 * their roots will still be aligned relative to their current size if the 3125 * entire ppa array is relocated into one szc page. If these conditions are 3126 * not met return 0. 3127 * 3128 * If all pages are properly aligned attempt to upgrade their locks 3129 * to exclusive mode. If it fails set *upgrdfail to 1 and return 0. 3130 * upgrdfail was set to 0 by caller. 3131 * 3132 * Return 1 if all pages are aligned and locked exclusively. 3133 * 3134 * If all pages in ppa array happen to be physically contiguous to make one 3135 * szc page and all exclusive locks are successfully obtained promote the page 3136 * size to szc and set *pszc to szc. Return 1 with pages locked shared. 
3137 */ 3138 static int 3139 segvn_full_szcpages(page_t **ppa, uint_t szc, int *upgrdfail, uint_t *pszc) 3140 { 3141 page_t *pp; 3142 pfn_t pfn; 3143 pgcnt_t totnpgs = page_get_pagecnt(szc); 3144 pfn_t first_pfn; 3145 int contig = 1; 3146 pgcnt_t i; 3147 pgcnt_t j; 3148 uint_t curszc; 3149 pgcnt_t curnpgs; 3150 int root = 0; 3151 3152 ASSERT(szc > 0); 3153 3154 VM_STAT_ADD(segvnvmstats.fullszcpages[0]); 3155 3156 for (i = 0; i < totnpgs; i++) { 3157 pp = ppa[i]; 3158 ASSERT(PAGE_SHARED(pp)); 3159 ASSERT(!PP_ISFREE(pp)); 3160 pfn = page_pptonum(pp); 3161 if (i == 0) { 3162 if (!IS_P2ALIGNED(pfn, totnpgs)) { 3163 contig = 0; 3164 } else { 3165 first_pfn = pfn; 3166 } 3167 } else if (contig && pfn != first_pfn + i) { 3168 contig = 0; 3169 } 3170 if (pp->p_szc == 0) { 3171 if (root) { 3172 VM_STAT_ADD(segvnvmstats.fullszcpages[1]); 3173 return (0); 3174 } 3175 } else if (!root) { 3176 if ((curszc = pp->p_szc) >= szc) { 3177 VM_STAT_ADD(segvnvmstats.fullszcpages[2]); 3178 return (0); 3179 } 3180 if (curszc == 0) { 3181 /* 3182 * p_szc changed means we don't have all pages 3183 * locked. return failure. 3184 */ 3185 VM_STAT_ADD(segvnvmstats.fullszcpages[3]); 3186 return (0); 3187 } 3188 curnpgs = page_get_pagecnt(curszc); 3189 if (!IS_P2ALIGNED(pfn, curnpgs) || 3190 !IS_P2ALIGNED(i, curnpgs)) { 3191 VM_STAT_ADD(segvnvmstats.fullszcpages[4]); 3192 return (0); 3193 } 3194 root = 1; 3195 } else { 3196 ASSERT(i > 0); 3197 VM_STAT_ADD(segvnvmstats.fullszcpages[5]); 3198 if (pp->p_szc != curszc) { 3199 VM_STAT_ADD(segvnvmstats.fullszcpages[6]); 3200 return (0); 3201 } 3202 if (pfn - 1 != page_pptonum(ppa[i - 1])) { 3203 panic("segvn_full_szcpages: " 3204 "large page not physically contiguous"); 3205 } 3206 if (P2PHASE(pfn, curnpgs) == curnpgs - 1) { 3207 root = 0; 3208 } 3209 } 3210 } 3211 3212 for (i = 0; i < totnpgs; i++) { 3213 ASSERT(ppa[i]->p_szc < szc); 3214 if (!page_tryupgrade(ppa[i])) { 3215 for (j = 0; j < i; j++) { 3216 page_downgrade(ppa[j]); 3217 } 3218 *pszc = ppa[i]->p_szc; 3219 *upgrdfail = 1; 3220 VM_STAT_ADD(segvnvmstats.fullszcpages[7]); 3221 return (0); 3222 } 3223 } 3224 3225 /* 3226 * When a page is put a free cachelist its szc is set to 0. if file 3227 * system reclaimed pages from cachelist targ pages will be physically 3228 * contiguous with 0 p_szc. in this case just upgrade szc of targ 3229 * pages without any relocations. 3230 * To avoid any hat issues with previous small mappings 3231 * hat_pageunload() the target pages first. 3232 */ 3233 if (contig) { 3234 VM_STAT_ADD(segvnvmstats.fullszcpages[8]); 3235 for (i = 0; i < totnpgs; i++) { 3236 (void) hat_pageunload(ppa[i], HAT_FORCE_PGUNLOAD); 3237 } 3238 for (i = 0; i < totnpgs; i++) { 3239 ppa[i]->p_szc = szc; 3240 } 3241 for (i = 0; i < totnpgs; i++) { 3242 ASSERT(PAGE_EXCL(ppa[i])); 3243 page_downgrade(ppa[i]); 3244 } 3245 if (pszc != NULL) { 3246 *pszc = szc; 3247 } 3248 } 3249 VM_STAT_ADD(segvnvmstats.fullszcpages[9]); 3250 return (1); 3251 } 3252 3253 /* 3254 * Create physically contiguous pages for [vp, off] - [vp, off + 3255 * page_size(szc)) range and for private segment return them in ppa array. 3256 * Pages are created either via IO or relocations. 3257 * 3258 * Return 1 on success and 0 on failure. 3259 * 3260 * If physically contiguous pages already exist for this range return 1 without 3261 * filling ppa array. Caller initializes ppa[0] as NULL to detect that ppa 3262 * array wasn't filled. In this case caller fills ppa array via VOP_GETPAGE(). 
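 * On failure the *downsize and *ret_pszc output arguments tell the
 * caller whether to retry the fault with a smaller page size; downsize
 * is only set when pages could not be locked (or EOF was crossed), not
 * on alignment failures, so hopeless ranges are simply mapped with
 * small pages rather than retried.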
3263 */ 3264 3265 static int 3266 segvn_fill_vp_pages(struct segvn_data *svd, vnode_t *vp, u_offset_t off, 3267 uint_t szc, page_t **ppa, page_t **ppplist, uint_t *ret_pszc, 3268 int *downsize) 3269 3270 { 3271 page_t *pplist = *ppplist; 3272 size_t pgsz = page_get_pagesize(szc); 3273 pgcnt_t pages = btop(pgsz); 3274 ulong_t start_off = off; 3275 u_offset_t eoff = off + pgsz; 3276 spgcnt_t nreloc; 3277 u_offset_t io_off = off; 3278 size_t io_len; 3279 page_t *io_pplist = NULL; 3280 page_t *done_pplist = NULL; 3281 pgcnt_t pgidx = 0; 3282 page_t *pp; 3283 page_t *newpp; 3284 page_t *targpp; 3285 int io_err = 0; 3286 int i; 3287 pfn_t pfn; 3288 ulong_t ppages; 3289 page_t *targ_pplist = NULL; 3290 page_t *repl_pplist = NULL; 3291 page_t *tmp_pplist; 3292 int nios = 0; 3293 uint_t pszc; 3294 struct vattr va; 3295 3296 VM_STAT_ADD(segvnvmstats.fill_vp_pages[0]); 3297 3298 ASSERT(szc != 0); 3299 ASSERT(pplist->p_szc == szc); 3300 3301 /* 3302 * downsize will be set to 1 only if we fail to lock pages. this will 3303 * allow subsequent faults to try to relocate the page again. If we 3304 * fail due to misalignment don't downsize and let the caller map the 3305 * whole region with small mappings to avoid more faults into the area 3306 * where we can't get large pages anyway. 3307 */ 3308 *downsize = 0; 3309 3310 while (off < eoff) { 3311 newpp = pplist; 3312 ASSERT(newpp != NULL); 3313 ASSERT(PAGE_EXCL(newpp)); 3314 ASSERT(!PP_ISFREE(newpp)); 3315 /* 3316 * we pass NULL for nrelocp to page_lookup_create() 3317 * so that it doesn't relocate. We relocate here 3318 * later only after we make sure we can lock all 3319 * pages in the range we handle and they are all 3320 * aligned. 3321 */ 3322 pp = page_lookup_create(vp, off, SE_SHARED, newpp, NULL, 0); 3323 ASSERT(pp != NULL); 3324 ASSERT(!PP_ISFREE(pp)); 3325 ASSERT(pp->p_vnode == vp); 3326 ASSERT(pp->p_offset == off); 3327 if (pp == newpp) { 3328 VM_STAT_ADD(segvnvmstats.fill_vp_pages[1]); 3329 page_sub(&pplist, pp); 3330 ASSERT(PAGE_EXCL(pp)); 3331 ASSERT(page_iolock_assert(pp)); 3332 page_list_concat(&io_pplist, &pp); 3333 off += PAGESIZE; 3334 continue; 3335 } 3336 VM_STAT_ADD(segvnvmstats.fill_vp_pages[2]); 3337 pfn = page_pptonum(pp); 3338 pszc = pp->p_szc; 3339 if (pszc >= szc && targ_pplist == NULL && io_pplist == NULL && 3340 IS_P2ALIGNED(pfn, pages)) { 3341 ASSERT(repl_pplist == NULL); 3342 ASSERT(done_pplist == NULL); 3343 ASSERT(pplist == *ppplist); 3344 page_unlock(pp); 3345 page_free_replacement_page(pplist); 3346 page_create_putback(pages); 3347 *ppplist = NULL; 3348 VM_STAT_ADD(segvnvmstats.fill_vp_pages[3]); 3349 return (1); 3350 } 3351 if (pszc >= szc) { 3352 page_unlock(pp); 3353 segvn_faultvnmpss_align_err1++; 3354 goto out; 3355 } 3356 ppages = page_get_pagecnt(pszc); 3357 if (!IS_P2ALIGNED(pfn, ppages)) { 3358 ASSERT(pszc > 0); 3359 /* 3360 * sizing down to pszc won't help. 3361 */ 3362 page_unlock(pp); 3363 segvn_faultvnmpss_align_err2++; 3364 goto out; 3365 } 3366 pfn = page_pptonum(newpp); 3367 if (!IS_P2ALIGNED(pfn, ppages)) { 3368 ASSERT(pszc > 0); 3369 /* 3370 * sizing down to pszc won't help. 
3371 */ 3372 page_unlock(pp); 3373 segvn_faultvnmpss_align_err3++; 3374 goto out; 3375 } 3376 if (!PAGE_EXCL(pp)) { 3377 VM_STAT_ADD(segvnvmstats.fill_vp_pages[4]); 3378 page_unlock(pp); 3379 *downsize = 1; 3380 *ret_pszc = pp->p_szc; 3381 goto out; 3382 } 3383 targpp = pp; 3384 if (io_pplist != NULL) { 3385 VM_STAT_ADD(segvnvmstats.fill_vp_pages[5]); 3386 io_len = off - io_off; 3387 /* 3388 * Some file systems like NFS don't check EOF 3389 * conditions in VOP_PAGEIO(). Check it here 3390 * now that pages are locked SE_EXCL. Any file 3391 * truncation will wait until the pages are 3392 * unlocked so no need to worry that file will 3393 * be truncated after we check its size here. 3394 * XXX fix NFS to remove this check. 3395 */ 3396 va.va_mask = AT_SIZE; 3397 if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred, NULL)) { 3398 VM_STAT_ADD(segvnvmstats.fill_vp_pages[6]); 3399 page_unlock(targpp); 3400 goto out; 3401 } 3402 if (btopr(va.va_size) < btopr(io_off + io_len)) { 3403 VM_STAT_ADD(segvnvmstats.fill_vp_pages[7]); 3404 *downsize = 1; 3405 *ret_pszc = 0; 3406 page_unlock(targpp); 3407 goto out; 3408 } 3409 io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len, 3410 B_READ, svd->cred, NULL); 3411 if (io_err) { 3412 VM_STAT_ADD(segvnvmstats.fill_vp_pages[8]); 3413 page_unlock(targpp); 3414 if (io_err == EDEADLK) { 3415 segvn_vmpss_pageio_deadlk_err++; 3416 } 3417 goto out; 3418 } 3419 nios++; 3420 VM_STAT_ADD(segvnvmstats.fill_vp_pages[9]); 3421 while (io_pplist != NULL) { 3422 pp = io_pplist; 3423 page_sub(&io_pplist, pp); 3424 ASSERT(page_iolock_assert(pp)); 3425 page_io_unlock(pp); 3426 pgidx = (pp->p_offset - start_off) >> 3427 PAGESHIFT; 3428 ASSERT(pgidx < pages); 3429 ppa[pgidx] = pp; 3430 page_list_concat(&done_pplist, &pp); 3431 } 3432 } 3433 pp = targpp; 3434 ASSERT(PAGE_EXCL(pp)); 3435 ASSERT(pp->p_szc <= pszc); 3436 if (pszc != 0 && !group_page_trylock(pp, SE_EXCL)) { 3437 VM_STAT_ADD(segvnvmstats.fill_vp_pages[10]); 3438 page_unlock(pp); 3439 *downsize = 1; 3440 *ret_pszc = pp->p_szc; 3441 goto out; 3442 } 3443 VM_STAT_ADD(segvnvmstats.fill_vp_pages[11]); 3444 /* 3445 * page szc chould have changed before the entire group was 3446 * locked. reread page szc. 
3447 */ 3448 pszc = pp->p_szc; 3449 ppages = page_get_pagecnt(pszc); 3450 3451 /* link just the roots */ 3452 page_list_concat(&targ_pplist, &pp); 3453 page_sub(&pplist, newpp); 3454 page_list_concat(&repl_pplist, &newpp); 3455 off += PAGESIZE; 3456 while (--ppages != 0) { 3457 newpp = pplist; 3458 page_sub(&pplist, newpp); 3459 off += PAGESIZE; 3460 } 3461 io_off = off; 3462 } 3463 if (io_pplist != NULL) { 3464 VM_STAT_ADD(segvnvmstats.fill_vp_pages[12]); 3465 io_len = eoff - io_off; 3466 va.va_mask = AT_SIZE; 3467 if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred, NULL) != 0) { 3468 VM_STAT_ADD(segvnvmstats.fill_vp_pages[13]); 3469 goto out; 3470 } 3471 if (btopr(va.va_size) < btopr(io_off + io_len)) { 3472 VM_STAT_ADD(segvnvmstats.fill_vp_pages[14]); 3473 *downsize = 1; 3474 *ret_pszc = 0; 3475 goto out; 3476 } 3477 io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len, 3478 B_READ, svd->cred, NULL); 3479 if (io_err) { 3480 VM_STAT_ADD(segvnvmstats.fill_vp_pages[15]); 3481 if (io_err == EDEADLK) { 3482 segvn_vmpss_pageio_deadlk_err++; 3483 } 3484 goto out; 3485 } 3486 nios++; 3487 while (io_pplist != NULL) { 3488 pp = io_pplist; 3489 page_sub(&io_pplist, pp); 3490 ASSERT(page_iolock_assert(pp)); 3491 page_io_unlock(pp); 3492 pgidx = (pp->p_offset - start_off) >> PAGESHIFT; 3493 ASSERT(pgidx < pages); 3494 ppa[pgidx] = pp; 3495 } 3496 } 3497 /* 3498 * we're now bound to succeed or panic. 3499 * remove pages from done_pplist. it's not needed anymore. 3500 */ 3501 while (done_pplist != NULL) { 3502 pp = done_pplist; 3503 page_sub(&done_pplist, pp); 3504 } 3505 VM_STAT_ADD(segvnvmstats.fill_vp_pages[16]); 3506 ASSERT(pplist == NULL); 3507 *ppplist = NULL; 3508 while (targ_pplist != NULL) { 3509 int ret; 3510 VM_STAT_ADD(segvnvmstats.fill_vp_pages[17]); 3511 ASSERT(repl_pplist); 3512 pp = targ_pplist; 3513 page_sub(&targ_pplist, pp); 3514 pgidx = (pp->p_offset - start_off) >> PAGESHIFT; 3515 newpp = repl_pplist; 3516 page_sub(&repl_pplist, newpp); 3517 #ifdef DEBUG 3518 pfn = page_pptonum(pp); 3519 pszc = pp->p_szc; 3520 ppages = page_get_pagecnt(pszc); 3521 ASSERT(IS_P2ALIGNED(pfn, ppages)); 3522 pfn = page_pptonum(newpp); 3523 ASSERT(IS_P2ALIGNED(pfn, ppages)); 3524 ASSERT(P2PHASE(pfn, pages) == pgidx); 3525 #endif 3526 nreloc = 0; 3527 ret = page_relocate(&pp, &newpp, 0, 1, &nreloc, NULL); 3528 if (ret != 0 || nreloc == 0) { 3529 panic("segvn_fill_vp_pages: " 3530 "page_relocate failed"); 3531 } 3532 pp = newpp; 3533 while (nreloc-- != 0) { 3534 ASSERT(PAGE_EXCL(pp)); 3535 ASSERT(pp->p_vnode == vp); 3536 ASSERT(pgidx == 3537 ((pp->p_offset - start_off) >> PAGESHIFT)); 3538 ppa[pgidx++] = pp; 3539 pp++; 3540 } 3541 } 3542 3543 if (svd->type == MAP_PRIVATE) { 3544 VM_STAT_ADD(segvnvmstats.fill_vp_pages[18]); 3545 for (i = 0; i < pages; i++) { 3546 ASSERT(ppa[i] != NULL); 3547 ASSERT(PAGE_EXCL(ppa[i])); 3548 ASSERT(ppa[i]->p_vnode == vp); 3549 ASSERT(ppa[i]->p_offset == 3550 start_off + (i << PAGESHIFT)); 3551 page_downgrade(ppa[i]); 3552 } 3553 ppa[pages] = NULL; 3554 } else { 3555 VM_STAT_ADD(segvnvmstats.fill_vp_pages[19]); 3556 /* 3557 * the caller will still call VOP_GETPAGE() for shared segments 3558 * to check FS write permissions. For private segments we map 3559 * file read only anyway. so no VOP_GETPAGE is needed. 
3560 */ 3561 for (i = 0; i < pages; i++) { 3562 ASSERT(ppa[i] != NULL); 3563 ASSERT(PAGE_EXCL(ppa[i])); 3564 ASSERT(ppa[i]->p_vnode == vp); 3565 ASSERT(ppa[i]->p_offset == 3566 start_off + (i << PAGESHIFT)); 3567 page_unlock(ppa[i]); 3568 } 3569 ppa[0] = NULL; 3570 } 3571 3572 return (1); 3573 out: 3574 /* 3575 * Do the cleanup. Unlock target pages we didn't relocate. They are 3576 * linked on targ_pplist by root pages. reassemble unused replacement 3577 * and io pages back to pplist. 3578 */ 3579 if (io_pplist != NULL) { 3580 VM_STAT_ADD(segvnvmstats.fill_vp_pages[20]); 3581 pp = io_pplist; 3582 do { 3583 ASSERT(pp->p_vnode == vp); 3584 ASSERT(pp->p_offset == io_off); 3585 ASSERT(page_iolock_assert(pp)); 3586 page_io_unlock(pp); 3587 page_hashout(pp, NULL); 3588 io_off += PAGESIZE; 3589 } while ((pp = pp->p_next) != io_pplist); 3590 page_list_concat(&io_pplist, &pplist); 3591 pplist = io_pplist; 3592 } 3593 tmp_pplist = NULL; 3594 while (targ_pplist != NULL) { 3595 VM_STAT_ADD(segvnvmstats.fill_vp_pages[21]); 3596 pp = targ_pplist; 3597 ASSERT(PAGE_EXCL(pp)); 3598 page_sub(&targ_pplist, pp); 3599 3600 pszc = pp->p_szc; 3601 ppages = page_get_pagecnt(pszc); 3602 ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages)); 3603 3604 if (pszc != 0) { 3605 group_page_unlock(pp); 3606 } 3607 page_unlock(pp); 3608 3609 pp = repl_pplist; 3610 ASSERT(pp != NULL); 3611 ASSERT(PAGE_EXCL(pp)); 3612 ASSERT(pp->p_szc == szc); 3613 page_sub(&repl_pplist, pp); 3614 3615 ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages)); 3616 3617 /* relink replacement page */ 3618 page_list_concat(&tmp_pplist, &pp); 3619 while (--ppages != 0) { 3620 VM_STAT_ADD(segvnvmstats.fill_vp_pages[22]); 3621 pp++; 3622 ASSERT(PAGE_EXCL(pp)); 3623 ASSERT(pp->p_szc == szc); 3624 page_list_concat(&tmp_pplist, &pp); 3625 } 3626 } 3627 if (tmp_pplist != NULL) { 3628 VM_STAT_ADD(segvnvmstats.fill_vp_pages[23]); 3629 page_list_concat(&tmp_pplist, &pplist); 3630 pplist = tmp_pplist; 3631 } 3632 /* 3633 * at this point all pages are either on done_pplist or 3634 * pplist. They can't be all on done_pplist otherwise 3635 * we'd've been done. 3636 */ 3637 ASSERT(pplist != NULL); 3638 if (nios != 0) { 3639 VM_STAT_ADD(segvnvmstats.fill_vp_pages[24]); 3640 pp = pplist; 3641 do { 3642 VM_STAT_ADD(segvnvmstats.fill_vp_pages[25]); 3643 ASSERT(pp->p_szc == szc); 3644 ASSERT(PAGE_EXCL(pp)); 3645 ASSERT(pp->p_vnode != vp); 3646 pp->p_szc = 0; 3647 } while ((pp = pp->p_next) != pplist); 3648 3649 pp = done_pplist; 3650 do { 3651 VM_STAT_ADD(segvnvmstats.fill_vp_pages[26]); 3652 ASSERT(pp->p_szc == szc); 3653 ASSERT(PAGE_EXCL(pp)); 3654 ASSERT(pp->p_vnode == vp); 3655 pp->p_szc = 0; 3656 } while ((pp = pp->p_next) != done_pplist); 3657 3658 while (pplist != NULL) { 3659 VM_STAT_ADD(segvnvmstats.fill_vp_pages[27]); 3660 pp = pplist; 3661 page_sub(&pplist, pp); 3662 page_free(pp, 0); 3663 } 3664 3665 while (done_pplist != NULL) { 3666 VM_STAT_ADD(segvnvmstats.fill_vp_pages[28]); 3667 pp = done_pplist; 3668 page_sub(&done_pplist, pp); 3669 page_unlock(pp); 3670 } 3671 *ppplist = NULL; 3672 return (0); 3673 } 3674 ASSERT(pplist == *ppplist); 3675 if (io_err) { 3676 VM_STAT_ADD(segvnvmstats.fill_vp_pages[29]); 3677 /* 3678 * don't downsize on io error. 3679 * see if vop_getpage succeeds. 3680 * pplist may still be used in this case 3681 * for relocations. 
3682 */ 3683 return (0); 3684 } 3685 VM_STAT_ADD(segvnvmstats.fill_vp_pages[30]); 3686 page_free_replacement_page(pplist); 3687 page_create_putback(pages); 3688 *ppplist = NULL; 3689 return (0); 3690 } 3691 3692 int segvn_anypgsz = 0; 3693 3694 #define SEGVN_RESTORE_SOFTLOCK(type, pages) \ 3695 if ((type) == F_SOFTLOCK) { \ 3696 mutex_enter(&freemem_lock); \ 3697 availrmem += (pages); \ 3698 segvn_pages_locked -= (pages); \ 3699 svd->softlockcnt -= (pages); \ 3700 mutex_exit(&freemem_lock); \ 3701 } 3702 3703 #define SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot) \ 3704 if (IS_VMODSORT((ppa)[0]->p_vnode)) { \ 3705 if ((rw) == S_WRITE) { \ 3706 for (i = 0; i < (pages); i++) { \ 3707 ASSERT((ppa)[i]->p_vnode == \ 3708 (ppa)[0]->p_vnode); \ 3709 hat_setmod((ppa)[i]); \ 3710 } \ 3711 } else if ((rw) != S_OTHER && \ 3712 ((prot) & (vpprot) & PROT_WRITE)) { \ 3713 for (i = 0; i < (pages); i++) { \ 3714 ASSERT((ppa)[i]->p_vnode == \ 3715 (ppa)[0]->p_vnode); \ 3716 if (!hat_ismod((ppa)[i])) { \ 3717 prot &= ~PROT_WRITE; \ 3718 break; \ 3719 } \ 3720 } \ 3721 } \ 3722 } 3723 3724 #ifdef VM_STATS 3725 3726 #define SEGVN_VMSTAT_FLTVNPAGES(idx) \ 3727 VM_STAT_ADD(segvnvmstats.fltvnpages[(idx)]); 3728 3729 #else /* VM_STATS */ 3730 3731 #define SEGVN_VMSTAT_FLTVNPAGES(idx) 3732 3733 #endif 3734 3735 static faultcode_t 3736 segvn_fault_vnodepages(struct hat *hat, struct seg *seg, caddr_t lpgaddr, 3737 caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr, 3738 caddr_t eaddr, int brkcow) 3739 { 3740 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 3741 struct anon_map *amp = svd->amp; 3742 uchar_t segtype = svd->type; 3743 uint_t szc = seg->s_szc; 3744 size_t pgsz = page_get_pagesize(szc); 3745 size_t maxpgsz = pgsz; 3746 pgcnt_t pages = btop(pgsz); 3747 pgcnt_t maxpages = pages; 3748 size_t ppasize = (pages + 1) * sizeof (page_t *); 3749 caddr_t a = lpgaddr; 3750 caddr_t maxlpgeaddr = lpgeaddr; 3751 u_offset_t off = svd->offset + (uintptr_t)(a - seg->s_base); 3752 ulong_t aindx = svd->anon_index + seg_page(seg, a); 3753 struct vpage *vpage = (svd->vpage != NULL) ? 3754 &svd->vpage[seg_page(seg, a)] : NULL; 3755 vnode_t *vp = svd->vp; 3756 page_t **ppa; 3757 uint_t pszc; 3758 size_t ppgsz; 3759 pgcnt_t ppages; 3760 faultcode_t err = 0; 3761 int ierr; 3762 int vop_size_err = 0; 3763 uint_t protchk, prot, vpprot; 3764 ulong_t i; 3765 int hat_flag = (type == F_SOFTLOCK) ? 
HAT_LOAD_LOCK : HAT_LOAD; 3766 anon_sync_obj_t an_cookie; 3767 enum seg_rw arw; 3768 int alloc_failed = 0; 3769 int adjszc_chk; 3770 struct vattr va; 3771 int xhat = 0; 3772 page_t *pplist; 3773 pfn_t pfn; 3774 int physcontig; 3775 int upgrdfail; 3776 int segvn_anypgsz_vnode = 0; /* for now map vnode with 2 page sizes */ 3777 int tron = (svd->tr_state == SEGVN_TR_ON); 3778 3779 ASSERT(szc != 0); 3780 ASSERT(vp != NULL); 3781 ASSERT(brkcow == 0 || amp != NULL); 3782 ASSERT(tron == 0 || amp != NULL); 3783 ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */ 3784 ASSERT(!(svd->flags & MAP_NORESERVE)); 3785 ASSERT(type != F_SOFTUNLOCK); 3786 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 3787 ASSERT(amp == NULL || IS_P2ALIGNED(aindx, maxpages)); 3788 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 3789 ASSERT(seg->s_szc < NBBY * sizeof (int)); 3790 ASSERT(type != F_SOFTLOCK || lpgeaddr - a == maxpgsz); 3791 ASSERT(svd->tr_state != SEGVN_TR_INIT); 3792 3793 VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltvnpages[0]); 3794 VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltvnpages[1]); 3795 3796 if (svd->flags & MAP_TEXT) { 3797 hat_flag |= HAT_LOAD_TEXT; 3798 } 3799 3800 if (svd->pageprot) { 3801 switch (rw) { 3802 case S_READ: 3803 protchk = PROT_READ; 3804 break; 3805 case S_WRITE: 3806 protchk = PROT_WRITE; 3807 break; 3808 case S_EXEC: 3809 protchk = PROT_EXEC; 3810 break; 3811 case S_OTHER: 3812 default: 3813 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 3814 break; 3815 } 3816 } else { 3817 prot = svd->prot; 3818 /* caller has already done segment level protection check. */ 3819 } 3820 3821 if (seg->s_as->a_hat != hat) { 3822 xhat = 1; 3823 } 3824 3825 if (rw == S_WRITE && segtype == MAP_PRIVATE) { 3826 SEGVN_VMSTAT_FLTVNPAGES(2); 3827 arw = S_READ; 3828 } else { 3829 arw = rw; 3830 } 3831 3832 ppa = kmem_alloc(ppasize, KM_SLEEP); 3833 3834 VM_STAT_COND_ADD(amp != NULL, segvnvmstats.fltvnpages[3]); 3835 3836 for (;;) { 3837 adjszc_chk = 0; 3838 for (; a < lpgeaddr; a += pgsz, off += pgsz, aindx += pages) { 3839 if (adjszc_chk) { 3840 while (szc < seg->s_szc) { 3841 uintptr_t e; 3842 uint_t tszc; 3843 tszc = segvn_anypgsz_vnode ? 
szc + 1 : 3844 seg->s_szc; 3845 ppgsz = page_get_pagesize(tszc); 3846 if (!IS_P2ALIGNED(a, ppgsz) || 3847 ((alloc_failed >> tszc) & 3848 0x1)) { 3849 break; 3850 } 3851 SEGVN_VMSTAT_FLTVNPAGES(4); 3852 szc = tszc; 3853 pgsz = ppgsz; 3854 pages = btop(pgsz); 3855 e = P2ROUNDUP((uintptr_t)eaddr, pgsz); 3856 lpgeaddr = (caddr_t)e; 3857 } 3858 } 3859 3860 again: 3861 if (IS_P2ALIGNED(a, maxpgsz) && amp != NULL) { 3862 ASSERT(IS_P2ALIGNED(aindx, maxpages)); 3863 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 3864 anon_array_enter(amp, aindx, &an_cookie); 3865 if (anon_get_ptr(amp->ahp, aindx) != NULL) { 3866 SEGVN_VMSTAT_FLTVNPAGES(5); 3867 ASSERT(anon_pages(amp->ahp, aindx, 3868 maxpages) == maxpages); 3869 anon_array_exit(&an_cookie); 3870 ANON_LOCK_EXIT(&->a_rwlock); 3871 err = segvn_fault_anonpages(hat, seg, 3872 a, a + maxpgsz, type, rw, 3873 MAX(a, addr), 3874 MIN(a + maxpgsz, eaddr), brkcow); 3875 if (err != 0) { 3876 SEGVN_VMSTAT_FLTVNPAGES(6); 3877 goto out; 3878 } 3879 if (szc < seg->s_szc) { 3880 szc = seg->s_szc; 3881 pgsz = maxpgsz; 3882 pages = maxpages; 3883 lpgeaddr = maxlpgeaddr; 3884 } 3885 goto next; 3886 } else { 3887 ASSERT(anon_pages(amp->ahp, aindx, 3888 maxpages) == 0); 3889 SEGVN_VMSTAT_FLTVNPAGES(7); 3890 anon_array_exit(&an_cookie); 3891 ANON_LOCK_EXIT(&->a_rwlock); 3892 } 3893 } 3894 ASSERT(!brkcow || IS_P2ALIGNED(a, maxpgsz)); 3895 ASSERT(!tron || IS_P2ALIGNED(a, maxpgsz)); 3896 3897 if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) { 3898 ASSERT(vpage != NULL); 3899 prot = VPP_PROT(vpage); 3900 ASSERT(sameprot(seg, a, maxpgsz)); 3901 if ((prot & protchk) == 0) { 3902 SEGVN_VMSTAT_FLTVNPAGES(8); 3903 err = FC_PROT; 3904 goto out; 3905 } 3906 } 3907 if (type == F_SOFTLOCK) { 3908 mutex_enter(&freemem_lock); 3909 if (availrmem < tune.t_minarmem + pages) { 3910 mutex_exit(&freemem_lock); 3911 err = FC_MAKE_ERR(ENOMEM); 3912 goto out; 3913 } else { 3914 availrmem -= pages; 3915 segvn_pages_locked += pages; 3916 svd->softlockcnt += pages; 3917 } 3918 mutex_exit(&freemem_lock); 3919 } 3920 3921 pplist = NULL; 3922 physcontig = 0; 3923 ppa[0] = NULL; 3924 if (!brkcow && !tron && szc && 3925 !page_exists_physcontig(vp, off, szc, 3926 segtype == MAP_PRIVATE ? 
ppa : NULL)) { 3927 SEGVN_VMSTAT_FLTVNPAGES(9); 3928 if (page_alloc_pages(vp, seg, a, &pplist, NULL, 3929 szc, 0, 0) && type != F_SOFTLOCK) { 3930 SEGVN_VMSTAT_FLTVNPAGES(10); 3931 pszc = 0; 3932 ierr = -1; 3933 alloc_failed |= (1 << szc); 3934 break; 3935 } 3936 if (pplist != NULL && 3937 vp->v_mpssdata == SEGVN_PAGEIO) { 3938 int downsize; 3939 SEGVN_VMSTAT_FLTVNPAGES(11); 3940 physcontig = segvn_fill_vp_pages(svd, 3941 vp, off, szc, ppa, &pplist, 3942 &pszc, &downsize); 3943 ASSERT(!physcontig || pplist == NULL); 3944 if (!physcontig && downsize && 3945 type != F_SOFTLOCK) { 3946 ASSERT(pplist == NULL); 3947 SEGVN_VMSTAT_FLTVNPAGES(12); 3948 ierr = -1; 3949 break; 3950 } 3951 ASSERT(!physcontig || 3952 segtype == MAP_PRIVATE || 3953 ppa[0] == NULL); 3954 if (physcontig && ppa[0] == NULL) { 3955 physcontig = 0; 3956 } 3957 } 3958 } else if (!brkcow && !tron && szc && ppa[0] != NULL) { 3959 SEGVN_VMSTAT_FLTVNPAGES(13); 3960 ASSERT(segtype == MAP_PRIVATE); 3961 physcontig = 1; 3962 } 3963 3964 if (!physcontig) { 3965 SEGVN_VMSTAT_FLTVNPAGES(14); 3966 ppa[0] = NULL; 3967 ierr = VOP_GETPAGE(vp, (offset_t)off, pgsz, 3968 &vpprot, ppa, pgsz, seg, a, arw, 3969 svd->cred, NULL); 3970 #ifdef DEBUG 3971 if (ierr == 0) { 3972 for (i = 0; i < pages; i++) { 3973 ASSERT(PAGE_LOCKED(ppa[i])); 3974 ASSERT(!PP_ISFREE(ppa[i])); 3975 ASSERT(ppa[i]->p_vnode == vp); 3976 ASSERT(ppa[i]->p_offset == 3977 off + (i << PAGESHIFT)); 3978 } 3979 } 3980 #endif /* DEBUG */ 3981 if (segtype == MAP_PRIVATE) { 3982 SEGVN_VMSTAT_FLTVNPAGES(15); 3983 vpprot &= ~PROT_WRITE; 3984 } 3985 } else { 3986 ASSERT(segtype == MAP_PRIVATE); 3987 SEGVN_VMSTAT_FLTVNPAGES(16); 3988 vpprot = PROT_ALL & ~PROT_WRITE; 3989 ierr = 0; 3990 } 3991 3992 if (ierr != 0) { 3993 SEGVN_VMSTAT_FLTVNPAGES(17); 3994 if (pplist != NULL) { 3995 SEGVN_VMSTAT_FLTVNPAGES(18); 3996 page_free_replacement_page(pplist); 3997 page_create_putback(pages); 3998 } 3999 SEGVN_RESTORE_SOFTLOCK(type, pages); 4000 if (a + pgsz <= eaddr) { 4001 SEGVN_VMSTAT_FLTVNPAGES(19); 4002 err = FC_MAKE_ERR(ierr); 4003 goto out; 4004 } 4005 va.va_mask = AT_SIZE; 4006 if (VOP_GETATTR(vp, &va, 0, svd->cred, NULL)) { 4007 SEGVN_VMSTAT_FLTVNPAGES(20); 4008 err = FC_MAKE_ERR(EIO); 4009 goto out; 4010 } 4011 if (btopr(va.va_size) >= btopr(off + pgsz)) { 4012 SEGVN_VMSTAT_FLTVNPAGES(21); 4013 err = FC_MAKE_ERR(ierr); 4014 goto out; 4015 } 4016 if (btopr(va.va_size) < 4017 btopr(off + (eaddr - a))) { 4018 SEGVN_VMSTAT_FLTVNPAGES(22); 4019 err = FC_MAKE_ERR(ierr); 4020 goto out; 4021 } 4022 if (brkcow || tron || type == F_SOFTLOCK) { 4023 /* can't reduce map area */ 4024 SEGVN_VMSTAT_FLTVNPAGES(23); 4025 vop_size_err = 1; 4026 goto out; 4027 } 4028 SEGVN_VMSTAT_FLTVNPAGES(24); 4029 ASSERT(szc != 0); 4030 pszc = 0; 4031 ierr = -1; 4032 break; 4033 } 4034 4035 if (amp != NULL) { 4036 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4037 anon_array_enter(amp, aindx, &an_cookie); 4038 } 4039 if (amp != NULL && 4040 anon_get_ptr(amp->ahp, aindx) != NULL) { 4041 ulong_t taindx = P2ALIGN(aindx, maxpages); 4042 4043 SEGVN_VMSTAT_FLTVNPAGES(25); 4044 ASSERT(anon_pages(amp->ahp, taindx, 4045 maxpages) == maxpages); 4046 for (i = 0; i < pages; i++) { 4047 page_unlock(ppa[i]); 4048 } 4049 anon_array_exit(&an_cookie); 4050 ANON_LOCK_EXIT(&->a_rwlock); 4051 if (pplist != NULL) { 4052 page_free_replacement_page(pplist); 4053 page_create_putback(pages); 4054 } 4055 SEGVN_RESTORE_SOFTLOCK(type, pages); 4056 if (szc < seg->s_szc) { 4057 SEGVN_VMSTAT_FLTVNPAGES(26); 4058 /* 4059 * For private segments SOFTLOCK 4060 * 
either always breaks cow (any rw 4061 * type except S_READ_NOCOW) or 4062 * address space is locked as writer 4063 * (S_READ_NOCOW case) and anon slots 4064 * can't show up on second check. 4065 * Therefore if we are here for 4066 * SOFTLOCK case it must be a cow 4067 * break but cow break never reduces 4068 * szc. Text replication (tron) in 4069 * this case works as cow break. 4070 * Thus the assert below. 4071 */ 4072 ASSERT(!brkcow && !tron && 4073 type != F_SOFTLOCK); 4074 pszc = seg->s_szc; 4075 ierr = -2; 4076 break; 4077 } 4078 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 4079 goto again; 4080 } 4081 #ifdef DEBUG 4082 if (amp != NULL) { 4083 ulong_t taindx = P2ALIGN(aindx, maxpages); 4084 ASSERT(!anon_pages(amp->ahp, taindx, maxpages)); 4085 } 4086 #endif /* DEBUG */ 4087 4088 if (brkcow || tron) { 4089 ASSERT(amp != NULL); 4090 ASSERT(pplist == NULL); 4091 ASSERT(szc == seg->s_szc); 4092 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 4093 ASSERT(IS_P2ALIGNED(aindx, maxpages)); 4094 SEGVN_VMSTAT_FLTVNPAGES(27); 4095 ierr = anon_map_privatepages(amp, aindx, szc, 4096 seg, a, prot, ppa, vpage, segvn_anypgsz, 4097 tron ? PG_LOCAL : 0, svd->cred); 4098 if (ierr != 0) { 4099 SEGVN_VMSTAT_FLTVNPAGES(28); 4100 anon_array_exit(&an_cookie); 4101 ANON_LOCK_EXIT(&amp->a_rwlock); 4102 SEGVN_RESTORE_SOFTLOCK(type, pages); 4103 err = FC_MAKE_ERR(ierr); 4104 goto out; 4105 } 4106 4107 ASSERT(!IS_VMODSORT(ppa[0]->p_vnode)); 4108 /* 4109 * p_szc can't be changed for locked 4110 * swapfs pages. 4111 */ 4112 ASSERT(svd->rcookie == 4113 HAT_INVALID_REGION_COOKIE); 4114 hat_memload_array(hat, a, pgsz, ppa, prot, 4115 hat_flag); 4116 4117 if (!(hat_flag & HAT_LOAD_LOCK)) { 4118 SEGVN_VMSTAT_FLTVNPAGES(29); 4119 for (i = 0; i < pages; i++) { 4120 page_unlock(ppa[i]); 4121 } 4122 } 4123 anon_array_exit(&an_cookie); 4124 ANON_LOCK_EXIT(&amp->a_rwlock); 4125 goto next; 4126 } 4127 4128 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE || 4129 (!svd->pageprot && svd->prot == (prot & vpprot))); 4130 4131 pfn = page_pptonum(ppa[0]); 4132 /* 4133 * hat_page_demote() needs an SE_EXCL lock on one of the 4134 * constituent page_t's and it decreases root's p_szc 4135 * last. This means that if root's p_szc is equal to szc and 4136 * all its constituent pages are locked, any 4137 * hat_page_demote() that could have changed p_szc to 4138 * szc is already done and no new hat_page_demote() 4139 * can start for this large page. 4140 */ 4141 4142 /* 4143 * We need to make sure the same mapping size is used for 4144 * the same address range if there's a possibility the 4145 * address is already mapped, because the hat layer panics 4146 * when a translation is loaded for a range already 4147 * mapped with a different page size. We achieve this 4148 * by always using the largest page size possible subject 4149 * to the constraints of page size, segment page size 4150 * and page alignment. Since mappings are invalidated 4151 * when those constraints change, making it 4152 * impossible to use a previously used mapping size, no 4153 * mapping size conflicts should happen.
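 * Concretely, the chkszc logic below only loads a large translation
 * when ppa[0]'s p_szc matches the szc being attempted and the starting
 * pfn is aligned to that size; otherwise it sizes up, sizes down or
 * falls back to loading the constituent pages one at a time.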
4154 */ 4155 4156 chkszc: 4157 if ((pszc = ppa[0]->p_szc) == szc && 4158 IS_P2ALIGNED(pfn, pages)) { 4159 4160 SEGVN_VMSTAT_FLTVNPAGES(30); 4161 #ifdef DEBUG 4162 for (i = 0; i < pages; i++) { 4163 ASSERT(PAGE_LOCKED(ppa[i])); 4164 ASSERT(!PP_ISFREE(ppa[i])); 4165 ASSERT(page_pptonum(ppa[i]) == 4166 pfn + i); 4167 ASSERT(ppa[i]->p_szc == szc); 4168 ASSERT(ppa[i]->p_vnode == vp); 4169 ASSERT(ppa[i]->p_offset == 4170 off + (i << PAGESHIFT)); 4171 } 4172 #endif /* DEBUG */ 4173 /* 4174 * All pages are of szc we need and they are 4175 * all locked so they can't change szc. load 4176 * translations. 4177 * 4178 * if page got promoted since last check 4179 * we don't need pplist. 4180 */ 4181 if (pplist != NULL) { 4182 page_free_replacement_page(pplist); 4183 page_create_putback(pages); 4184 } 4185 if (PP_ISMIGRATE(ppa[0])) { 4186 page_migrate(seg, a, ppa, pages); 4187 } 4188 SEGVN_UPDATE_MODBITS(ppa, pages, rw, 4189 prot, vpprot); 4190 if (!xhat) { 4191 hat_memload_array_region(hat, a, pgsz, 4192 ppa, prot & vpprot, hat_flag, 4193 svd->rcookie); 4194 } else { 4195 /* 4196 * avoid large xhat mappings to FS 4197 * pages so that hat_page_demote() 4198 * doesn't need to check for xhat 4199 * large mappings. 4200 * Don't use regions with xhats. 4201 */ 4202 for (i = 0; i < pages; i++) { 4203 hat_memload(hat, 4204 a + (i << PAGESHIFT), 4205 ppa[i], prot & vpprot, 4206 hat_flag); 4207 } 4208 } 4209 4210 if (!(hat_flag & HAT_LOAD_LOCK)) { 4211 for (i = 0; i < pages; i++) { 4212 page_unlock(ppa[i]); 4213 } 4214 } 4215 if (amp != NULL) { 4216 anon_array_exit(&an_cookie); 4217 ANON_LOCK_EXIT(&->a_rwlock); 4218 } 4219 goto next; 4220 } 4221 4222 /* 4223 * See if upsize is possible. 4224 */ 4225 if (pszc > szc && szc < seg->s_szc && 4226 (segvn_anypgsz_vnode || pszc >= seg->s_szc)) { 4227 pgcnt_t aphase; 4228 uint_t pszc1 = MIN(pszc, seg->s_szc); 4229 ppgsz = page_get_pagesize(pszc1); 4230 ppages = btop(ppgsz); 4231 aphase = btop(P2PHASE((uintptr_t)a, ppgsz)); 4232 4233 ASSERT(type != F_SOFTLOCK); 4234 4235 SEGVN_VMSTAT_FLTVNPAGES(31); 4236 if (aphase != P2PHASE(pfn, ppages)) { 4237 segvn_faultvnmpss_align_err4++; 4238 } else { 4239 SEGVN_VMSTAT_FLTVNPAGES(32); 4240 if (pplist != NULL) { 4241 page_t *pl = pplist; 4242 page_free_replacement_page(pl); 4243 page_create_putback(pages); 4244 } 4245 for (i = 0; i < pages; i++) { 4246 page_unlock(ppa[i]); 4247 } 4248 if (amp != NULL) { 4249 anon_array_exit(&an_cookie); 4250 ANON_LOCK_EXIT(&->a_rwlock); 4251 } 4252 pszc = pszc1; 4253 ierr = -2; 4254 break; 4255 } 4256 } 4257 4258 /* 4259 * check if we should use smallest mapping size. 4260 */ 4261 upgrdfail = 0; 4262 if (szc == 0 || xhat || 4263 (pszc >= szc && 4264 !IS_P2ALIGNED(pfn, pages)) || 4265 (pszc < szc && 4266 !segvn_full_szcpages(ppa, szc, &upgrdfail, 4267 &pszc))) { 4268 4269 if (upgrdfail && type != F_SOFTLOCK) { 4270 /* 4271 * segvn_full_szcpages failed to lock 4272 * all pages EXCL. Size down. 
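 * (F_SOFTLOCK faults cannot shrink the fault area, so when upgrdfail
 * is set for a softlock the code further below loads the translations
 * anyway instead of sizing down.)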
4273 */ 4274 ASSERT(pszc < szc); 4275 4276 SEGVN_VMSTAT_FLTVNPAGES(33); 4277 4278 if (pplist != NULL) { 4279 page_t *pl = pplist; 4280 page_free_replacement_page(pl); 4281 page_create_putback(pages); 4282 } 4283 4284 for (i = 0; i < pages; i++) { 4285 page_unlock(ppa[i]); 4286 } 4287 if (amp != NULL) { 4288 anon_array_exit(&an_cookie); 4289 ANON_LOCK_EXIT(&->a_rwlock); 4290 } 4291 ierr = -1; 4292 break; 4293 } 4294 if (szc != 0 && !xhat && !upgrdfail) { 4295 segvn_faultvnmpss_align_err5++; 4296 } 4297 SEGVN_VMSTAT_FLTVNPAGES(34); 4298 if (pplist != NULL) { 4299 page_free_replacement_page(pplist); 4300 page_create_putback(pages); 4301 } 4302 SEGVN_UPDATE_MODBITS(ppa, pages, rw, 4303 prot, vpprot); 4304 if (upgrdfail && segvn_anypgsz_vnode) { 4305 /* SOFTLOCK case */ 4306 hat_memload_array_region(hat, a, pgsz, 4307 ppa, prot & vpprot, hat_flag, 4308 svd->rcookie); 4309 } else { 4310 for (i = 0; i < pages; i++) { 4311 hat_memload_region(hat, 4312 a + (i << PAGESHIFT), 4313 ppa[i], prot & vpprot, 4314 hat_flag, svd->rcookie); 4315 } 4316 } 4317 if (!(hat_flag & HAT_LOAD_LOCK)) { 4318 for (i = 0; i < pages; i++) { 4319 page_unlock(ppa[i]); 4320 } 4321 } 4322 if (amp != NULL) { 4323 anon_array_exit(&an_cookie); 4324 ANON_LOCK_EXIT(&->a_rwlock); 4325 } 4326 goto next; 4327 } 4328 4329 if (pszc == szc) { 4330 /* 4331 * segvn_full_szcpages() upgraded pages szc. 4332 */ 4333 ASSERT(pszc == ppa[0]->p_szc); 4334 ASSERT(IS_P2ALIGNED(pfn, pages)); 4335 goto chkszc; 4336 } 4337 4338 if (pszc > szc) { 4339 kmutex_t *szcmtx; 4340 SEGVN_VMSTAT_FLTVNPAGES(35); 4341 /* 4342 * p_szc of ppa[0] can change since we haven't 4343 * locked all constituent pages. Call 4344 * page_lock_szc() to prevent szc changes. 4345 * This should be a rare case that happens when 4346 * multiple segments use a different page size 4347 * to map the same file offsets. 4348 */ 4349 szcmtx = page_szc_lock(ppa[0]); 4350 pszc = ppa[0]->p_szc; 4351 ASSERT(szcmtx != NULL || pszc == 0); 4352 ASSERT(ppa[0]->p_szc <= pszc); 4353 if (pszc <= szc) { 4354 SEGVN_VMSTAT_FLTVNPAGES(36); 4355 if (szcmtx != NULL) { 4356 mutex_exit(szcmtx); 4357 } 4358 goto chkszc; 4359 } 4360 if (pplist != NULL) { 4361 /* 4362 * page got promoted since last check. 4363 * we don't need preaalocated large 4364 * page. 4365 */ 4366 SEGVN_VMSTAT_FLTVNPAGES(37); 4367 page_free_replacement_page(pplist); 4368 page_create_putback(pages); 4369 } 4370 SEGVN_UPDATE_MODBITS(ppa, pages, rw, 4371 prot, vpprot); 4372 hat_memload_array_region(hat, a, pgsz, ppa, 4373 prot & vpprot, hat_flag, svd->rcookie); 4374 mutex_exit(szcmtx); 4375 if (!(hat_flag & HAT_LOAD_LOCK)) { 4376 for (i = 0; i < pages; i++) { 4377 page_unlock(ppa[i]); 4378 } 4379 } 4380 if (amp != NULL) { 4381 anon_array_exit(&an_cookie); 4382 ANON_LOCK_EXIT(&->a_rwlock); 4383 } 4384 goto next; 4385 } 4386 4387 /* 4388 * if page got demoted since last check 4389 * we could have not allocated larger page. 4390 * allocate now. 
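 * The replacement pages allocated here allow segvn_relocate_pages()
 * below to rebuild a physically contiguous large page; if the
 * allocation fails and this is not a softlock fault we size down
 * instead.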
4391 */ 4392 if (pplist == NULL && 4393 page_alloc_pages(vp, seg, a, &pplist, NULL, 4394 szc, 0, 0) && type != F_SOFTLOCK) { 4395 SEGVN_VMSTAT_FLTVNPAGES(38); 4396 for (i = 0; i < pages; i++) { 4397 page_unlock(ppa[i]); 4398 } 4399 if (amp != NULL) { 4400 anon_array_exit(&an_cookie); 4401 ANON_LOCK_EXIT(&amp->a_rwlock); 4402 } 4403 ierr = -1; 4404 alloc_failed |= (1 << szc); 4405 break; 4406 } 4407 4408 SEGVN_VMSTAT_FLTVNPAGES(39); 4409 4410 if (pplist != NULL) { 4411 segvn_relocate_pages(ppa, pplist); 4412 #ifdef DEBUG 4413 } else { 4414 ASSERT(type == F_SOFTLOCK); 4415 SEGVN_VMSTAT_FLTVNPAGES(40); 4416 #endif /* DEBUG */ 4417 } 4418 4419 SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot); 4420 4421 if (pplist == NULL && segvn_anypgsz_vnode == 0) { 4422 ASSERT(type == F_SOFTLOCK); 4423 for (i = 0; i < pages; i++) { 4424 ASSERT(ppa[i]->p_szc < szc); 4425 hat_memload_region(hat, 4426 a + (i << PAGESHIFT), 4427 ppa[i], prot & vpprot, hat_flag, 4428 svd->rcookie); 4429 } 4430 } else { 4431 ASSERT(pplist != NULL || type == F_SOFTLOCK); 4432 hat_memload_array_region(hat, a, pgsz, ppa, 4433 prot & vpprot, hat_flag, svd->rcookie); 4434 } 4435 if (!(hat_flag & HAT_LOAD_LOCK)) { 4436 for (i = 0; i < pages; i++) { 4437 ASSERT(PAGE_SHARED(ppa[i])); 4438 page_unlock(ppa[i]); 4439 } 4440 } 4441 if (amp != NULL) { 4442 anon_array_exit(&an_cookie); 4443 ANON_LOCK_EXIT(&amp->a_rwlock); 4444 } 4445 4446 next: 4447 if (vpage != NULL) { 4448 vpage += pages; 4449 } 4450 adjszc_chk = 1; 4451 } 4452 if (a == lpgeaddr) 4453 break; 4454 ASSERT(a < lpgeaddr); 4455 4456 ASSERT(!brkcow && !tron && type != F_SOFTLOCK); 4457 4458 /* 4459 * ierr == -1 means we failed to map with a large page 4460 * (either due to allocation/relocation failures or 4461 * misalignment with other mappings to this file). 4462 * 4463 * ierr == -2 means some other thread allocated a large page 4464 * after we gave up to map with a large page. Retry with a 4465 * larger mapping. 4466 */ 4467 ASSERT(ierr == -1 || ierr == -2); 4468 ASSERT(ierr == -2 || szc != 0); 4469 ASSERT(ierr == -1 || szc < seg->s_szc); 4470 if (ierr == -2) { 4471 SEGVN_VMSTAT_FLTVNPAGES(41); 4472 ASSERT(pszc > szc && pszc <= seg->s_szc); 4473 szc = pszc; 4474 } else if (segvn_anypgsz_vnode) { 4475 SEGVN_VMSTAT_FLTVNPAGES(42); 4476 szc--; 4477 } else { 4478 SEGVN_VMSTAT_FLTVNPAGES(43); 4479 ASSERT(pszc < szc); 4480 /* 4481 * Another process created a pszc large page, 4482 * but we still have to drop to szc 0. 4483 */ 4484 szc = 0; 4485 } 4486 4487 pgsz = page_get_pagesize(szc); 4488 pages = btop(pgsz); 4489 if (ierr == -2) { 4490 /* 4491 * Size up case. Note lpgaddr may only be needed for 4492 * softlock case so we don't adjust it here. 4493 */ 4494 a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz); 4495 ASSERT(a >= lpgaddr); 4496 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4497 off = svd->offset + (uintptr_t)(a - seg->s_base); 4498 aindx = svd->anon_index + seg_page(seg, a); 4499 vpage = (svd->vpage != NULL) ? 4500 &svd->vpage[seg_page(seg, a)] : NULL; 4501 } else { 4502 /* 4503 * Size down case. Note lpgaddr may only be needed for 4504 * softlock case so we don't adjust it here. 4505 */ 4506 ASSERT(IS_P2ALIGNED(a, pgsz)); 4507 ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz)); 4508 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4509 ASSERT(a < lpgeaddr); 4510 if (a < addr) { 4511 SEGVN_VMSTAT_FLTVNPAGES(44); 4512 /* 4513 * The beginning of the large page region can 4514 * be pulled to the right to make a smaller 4515 * region. We haven't yet faulted a single 4516 * page.
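 * In that case a is recomputed from the original fault address
 * rounded down to the new smaller pgsz, and off, aindx and vpage are
 * re-derived to match the new starting address.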
4517 */ 4518 a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 4519 ASSERT(a >= lpgaddr); 4520 off = svd->offset + 4521 (uintptr_t)(a - seg->s_base); 4522 aindx = svd->anon_index + seg_page(seg, a); 4523 vpage = (svd->vpage != NULL) ? 4524 &svd->vpage[seg_page(seg, a)] : NULL; 4525 } 4526 } 4527 } 4528 out: 4529 kmem_free(ppa, ppasize); 4530 if (!err && !vop_size_err) { 4531 SEGVN_VMSTAT_FLTVNPAGES(45); 4532 return (0); 4533 } 4534 if (type == F_SOFTLOCK && a > lpgaddr) { 4535 SEGVN_VMSTAT_FLTVNPAGES(46); 4536 segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER); 4537 } 4538 if (!vop_size_err) { 4539 SEGVN_VMSTAT_FLTVNPAGES(47); 4540 return (err); 4541 } 4542 ASSERT(brkcow || tron || type == F_SOFTLOCK); 4543 /* 4544 * Large page end is mapped beyond the end of file and it's a cow 4545 * fault (can be a text replication induced cow) or softlock so we can't 4546 * reduce the map area. For now just demote the segment. This should 4547 * really only happen if the end of the file changed after the mapping 4548 * was established since when large page segments are created we make 4549 * sure they don't extend beyond the end of the file. 4550 */ 4551 SEGVN_VMSTAT_FLTVNPAGES(48); 4552 4553 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4554 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4555 err = 0; 4556 if (seg->s_szc != 0) { 4557 segvn_fltvnpages_clrszc_cnt++; 4558 ASSERT(svd->softlockcnt == 0); 4559 err = segvn_clrszc(seg); 4560 if (err != 0) { 4561 segvn_fltvnpages_clrszc_err++; 4562 } 4563 } 4564 ASSERT(err || seg->s_szc == 0); 4565 SEGVN_LOCK_DOWNGRADE(seg->s_as, &svd->lock); 4566 /* segvn_fault will do its job as if szc had been zero to begin with */ 4567 return (err == 0 ? IE_RETRY : FC_MAKE_ERR(err)); 4568 } 4569 4570 /* 4571 * This routine will attempt to fault in one large page. 4572 * it will use smaller pages if that fails. 4573 * It should only be called for pure anonymous segments. 4574 */ 4575 static faultcode_t 4576 segvn_fault_anonpages(struct hat *hat, struct seg *seg, caddr_t lpgaddr, 4577 caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr, 4578 caddr_t eaddr, int brkcow) 4579 { 4580 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 4581 struct anon_map *amp = svd->amp; 4582 uchar_t segtype = svd->type; 4583 uint_t szc = seg->s_szc; 4584 size_t pgsz = page_get_pagesize(szc); 4585 size_t maxpgsz = pgsz; 4586 pgcnt_t pages = btop(pgsz); 4587 size_t ppasize = pages * sizeof (page_t *); 4588 caddr_t a = lpgaddr; 4589 ulong_t aindx = svd->anon_index + seg_page(seg, a); 4590 struct vpage *vpage = (svd->vpage != NULL) ? 4591 &svd->vpage[seg_page(seg, a)] : NULL; 4592 page_t **ppa; 4593 uint_t ppa_szc; 4594 faultcode_t err; 4595 int ierr; 4596 uint_t protchk, prot, vpprot; 4597 ulong_t i; 4598 int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD; 4599 anon_sync_obj_t cookie; 4600 int first = 1; 4601 int adjszc_chk; 4602 int purged = 0; 4603 int pgflags = (svd->tr_state == SEGVN_TR_ON) ? 
PG_LOCAL : 0; 4604 4605 ASSERT(szc != 0); 4606 ASSERT(amp != NULL); 4607 ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */ 4608 ASSERT(!(svd->flags & MAP_NORESERVE)); 4609 ASSERT(type != F_SOFTUNLOCK); 4610 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 4611 ASSERT(!brkcow || svd->tr_state == SEGVN_TR_OFF); 4612 ASSERT(svd->tr_state != SEGVN_TR_INIT); 4613 4614 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 4615 4616 VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltanpages[0]); 4617 VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltanpages[1]); 4618 4619 if (svd->flags & MAP_TEXT) { 4620 hat_flag |= HAT_LOAD_TEXT; 4621 } 4622 4623 if (svd->pageprot) { 4624 switch (rw) { 4625 case S_READ: 4626 protchk = PROT_READ; 4627 break; 4628 case S_WRITE: 4629 protchk = PROT_WRITE; 4630 break; 4631 case S_EXEC: 4632 protchk = PROT_EXEC; 4633 break; 4634 case S_OTHER: 4635 default: 4636 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 4637 break; 4638 } 4639 VM_STAT_ADD(segvnvmstats.fltanpages[2]); 4640 } else { 4641 prot = svd->prot; 4642 /* caller has already done segment level protection check. */ 4643 } 4644 4645 ppa = kmem_alloc(ppasize, KM_SLEEP); 4646 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4647 for (;;) { 4648 adjszc_chk = 0; 4649 for (; a < lpgeaddr; a += pgsz, aindx += pages) { 4650 if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) { 4651 VM_STAT_ADD(segvnvmstats.fltanpages[3]); 4652 ASSERT(vpage != NULL); 4653 prot = VPP_PROT(vpage); 4654 ASSERT(sameprot(seg, a, maxpgsz)); 4655 if ((prot & protchk) == 0) { 4656 err = FC_PROT; 4657 goto error; 4658 } 4659 } 4660 if (adjszc_chk && IS_P2ALIGNED(a, maxpgsz) && 4661 pgsz < maxpgsz) { 4662 ASSERT(a > lpgaddr); 4663 szc = seg->s_szc; 4664 pgsz = maxpgsz; 4665 pages = btop(pgsz); 4666 ASSERT(IS_P2ALIGNED(aindx, pages)); 4667 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, 4668 pgsz); 4669 } 4670 if (type == F_SOFTLOCK && svd->vp != NULL) { 4671 mutex_enter(&freemem_lock); 4672 if (availrmem < tune.t_minarmem + pages) { 4673 mutex_exit(&freemem_lock); 4674 err = FC_MAKE_ERR(ENOMEM); 4675 goto error; 4676 } else { 4677 availrmem -= pages; 4678 segvn_pages_locked += pages; 4679 svd->softlockcnt += pages; 4680 } 4681 mutex_exit(&freemem_lock); 4682 } 4683 anon_array_enter(amp, aindx, &cookie); 4684 ppa_szc = (uint_t)-1; 4685 ierr = anon_map_getpages(amp, aindx, szc, seg, a, 4686 prot, &vpprot, ppa, &ppa_szc, vpage, rw, brkcow, 4687 segvn_anypgsz, pgflags, svd->cred); 4688 if (ierr != 0) { 4689 anon_array_exit(&cookie); 4690 VM_STAT_ADD(segvnvmstats.fltanpages[4]); 4691 if (type == F_SOFTLOCK && svd->vp != NULL) { 4692 VM_STAT_ADD(segvnvmstats.fltanpages[5]); 4693 mutex_enter(&freemem_lock); 4694 availrmem += pages; 4695 segvn_pages_locked -= pages; 4696 svd->softlockcnt -= pages; 4697 mutex_exit(&freemem_lock); 4698 } 4699 if (ierr > 0) { 4700 VM_STAT_ADD(segvnvmstats.fltanpages[6]); 4701 err = FC_MAKE_ERR(ierr); 4702 goto error; 4703 } 4704 break; 4705 } 4706 4707 ASSERT(!IS_VMODSORT(ppa[0]->p_vnode)); 4708 4709 ASSERT(segtype == MAP_SHARED || 4710 ppa[0]->p_szc <= szc); 4711 ASSERT(segtype == MAP_PRIVATE || 4712 ppa[0]->p_szc >= szc); 4713 4714 /* 4715 * Handle pages that have been marked for migration 4716 */ 4717 if (lgrp_optimizations()) 4718 page_migrate(seg, a, ppa, pages); 4719 4720 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 4721 if (type == F_SOFTLOCK && svd->vp == NULL) { 4722 /* 4723 * If all pages in ppa array belong to the same 4724 * large page call segvn_slock_anonpages() 4725 * just for ppa[0]. 
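 * The loop below therefore locks pages one at a time but breaks out
 * after ppa[0] once its p_szc shows the whole array is one large page;
 * on failure everything locked so far is undone before failing with
 * ENOMEM.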
4726 */ 4727 for (i = 0; i < pages; i++) { 4728 if (!segvn_slock_anonpages(ppa[i], 4729 i == 0 && first)) { 4730 ulong_t j; 4731 for (j = 0; j < i; j++) { 4732 segvn_sunlock_anonpages( 4733 ppa[j], 4734 j == 0 && 4735 first); 4736 page_unlock(ppa[j]); 4737 } 4738 for (j = i; j < pages; j++) { 4739 page_unlock(ppa[j]); 4740 } 4741 anon_array_exit(&cookie); 4742 err = FC_MAKE_ERR(ENOMEM); 4743 goto error; 4744 } 4745 if (i == 0 && ppa[0]->p_szc >= szc) { 4746 ASSERT(!(page_pptonum(ppa[0]) & 4747 (pages - 1))); 4748 break; 4749 } 4750 } 4751 first = 0; 4752 mutex_enter(&freemem_lock); 4753 svd->softlockcnt += pages; 4754 segvn_pages_locked += pages; 4755 mutex_exit(&freemem_lock); 4756 } 4757 4758 if (segtype == MAP_SHARED) { 4759 vpprot |= PROT_WRITE; 4760 } 4761 4762 hat_memload_array(hat, a, pgsz, ppa, 4763 prot & vpprot, hat_flag); 4764 4765 if (hat_flag & HAT_LOAD_LOCK) { 4766 VM_STAT_ADD(segvnvmstats.fltanpages[7]); 4767 } else { 4768 VM_STAT_ADD(segvnvmstats.fltanpages[8]); 4769 for (i = 0; i < pages; i++) 4770 page_unlock(ppa[i]); 4771 } 4772 if (vpage != NULL) 4773 vpage += pages; 4774 4775 anon_array_exit(&cookie); 4776 adjszc_chk = 1; 4777 } 4778 if (a == lpgeaddr) 4779 break; 4780 ASSERT(a < lpgeaddr); 4781 /* 4782 * ierr == -1 means we failed to allocate a large page. 4783 * so do a size down operation. 4784 * 4785 * ierr == -2 means some other process that privately shares 4786 * pages with this process has allocated a larger page and we 4787 * need to retry with larger pages. So do a size up 4788 * operation. This relies on the fact that large pages are 4789 * never partially shared i.e. if we share any constituent 4790 * page of a large page with another process we must share the 4791 * entire large page. Note this cannot happen for SOFTLOCK 4792 * case, unless current address (a) is at the beginning of the 4793 * next page size boundary because the other process couldn't 4794 * have relocated locked pages. 4795 */ 4796 ASSERT(ierr == -1 || ierr == -2); 4797 /* 4798 * For the very first relocation failure try to purge this 4799 * segment's cache so that the relocator can obtain an 4800 * exclusive lock on pages we want to relocate. 4801 */ 4802 if (!purged && ierr == -1 && ppa_szc != (uint_t)-1 && 4803 svd->softlockcnt != 0) { 4804 purged = 1; 4805 segvn_purge(seg); 4806 continue; 4807 } 4808 4809 if (segvn_anypgsz) { 4810 ASSERT(ierr == -2 || szc != 0); 4811 ASSERT(ierr == -1 || szc < seg->s_szc); 4812 szc = (ierr == -1) ? szc - 1 : szc + 1; 4813 } else { 4814 /* 4815 * For non COW faults and segvn_anypgsz == 0 4816 * we need to be careful not to loop forever 4817 * if existing page is found with szc other 4818 * than 0 or seg->s_szc. This could be due 4819 * to page relocations on behalf of DR or 4820 * more likely large page creation. For this 4821 * case simply re-size to existing page's szc 4822 * if returned by anon_map_getpages(). 4823 */ 4824 if (ppa_szc == (uint_t)-1) { 4825 szc = (ierr == -1) ? 
0 : seg->s_szc; 4826 } else { 4827 ASSERT(ppa_szc <= seg->s_szc); 4828 ASSERT(ierr == -2 || ppa_szc < szc); 4829 ASSERT(ierr == -1 || ppa_szc > szc); 4830 szc = ppa_szc; 4831 } 4832 } 4833 4834 pgsz = page_get_pagesize(szc); 4835 pages = btop(pgsz); 4836 ASSERT(type != F_SOFTLOCK || ierr == -1 || 4837 (IS_P2ALIGNED(a, pgsz) && IS_P2ALIGNED(lpgeaddr, pgsz))); 4838 if (type == F_SOFTLOCK) { 4839 /* 4840 * For softlocks we cannot reduce the fault area 4841 * (calculated based on the largest page size for this 4842 * segment) for size down, and a is already next 4843 * page size aligned as asserted above for size 4844 * ups. Therefore just continue in case of softlock. 4845 */ 4846 VM_STAT_ADD(segvnvmstats.fltanpages[9]); 4847 continue; /* keep lint happy */ 4848 } else if (ierr == -2) { 4849 4850 /* 4851 * Size up case. Note lpgaddr may only be needed for 4852 * softlock case so we don't adjust it here. 4853 */ 4854 VM_STAT_ADD(segvnvmstats.fltanpages[10]); 4855 a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz); 4856 ASSERT(a >= lpgaddr); 4857 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4858 aindx = svd->anon_index + seg_page(seg, a); 4859 vpage = (svd->vpage != NULL) ? 4860 &svd->vpage[seg_page(seg, a)] : NULL; 4861 } else { 4862 /* 4863 * Size down case. Note lpgaddr may only be needed for 4864 * softlock case so we don't adjust it here. 4865 */ 4866 VM_STAT_ADD(segvnvmstats.fltanpages[11]); 4867 ASSERT(IS_P2ALIGNED(a, pgsz)); 4868 ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz)); 4869 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4870 ASSERT(a < lpgeaddr); 4871 if (a < addr) { 4872 /* 4873 * The beginning of the large page region can 4874 * be pulled to the right to make a smaller 4875 * region. We haven't yet faulted a single 4876 * page. 4877 */ 4878 VM_STAT_ADD(segvnvmstats.fltanpages[12]); 4879 a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 4880 ASSERT(a >= lpgaddr); 4881 aindx = svd->anon_index + seg_page(seg, a); 4882 vpage = (svd->vpage != NULL) ? 4883 &svd->vpage[seg_page(seg, a)] : NULL; 4884 } 4885 } 4886 } 4887 VM_STAT_ADD(segvnvmstats.fltanpages[13]); 4888 ANON_LOCK_EXIT(&amp->a_rwlock); 4889 kmem_free(ppa, ppasize); 4890 return (0); 4891 error: 4892 VM_STAT_ADD(segvnvmstats.fltanpages[14]); 4893 ANON_LOCK_EXIT(&amp->a_rwlock); 4894 kmem_free(ppa, ppasize); 4895 if (type == F_SOFTLOCK && a > lpgaddr) { 4896 VM_STAT_ADD(segvnvmstats.fltanpages[15]); 4897 segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER); 4898 } 4899 return (err); 4900 } 4901 4902 int fltadvice = 1; /* set to free behind pages for sequential access */ 4903 4904 /* 4905 * This routine is called via a machine specific fault handling routine. 4906 * It is also called by software routines wishing to lock or unlock 4907 * a range of addresses.
4908 * 4909 * Here is the basic algorithm: 4910 * If unlocking 4911 * Call segvn_softunlock 4912 * Return 4913 * endif 4914 * Checking and set up work 4915 * If we will need some non-anonymous pages 4916 * Call VOP_GETPAGE over the range of non-anonymous pages 4917 * endif 4918 * Loop over all addresses requested 4919 * Call segvn_faultpage passing in page list 4920 * to load up translations and handle anonymous pages 4921 * endloop 4922 * Load up translation to any additional pages in page list not 4923 * already handled that fit into this segment 4924 */ 4925 static faultcode_t 4926 segvn_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len, 4927 enum fault_type type, enum seg_rw rw) 4928 { 4929 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 4930 page_t **plp, **ppp, *pp; 4931 u_offset_t off; 4932 caddr_t a; 4933 struct vpage *vpage; 4934 uint_t vpprot, prot; 4935 int err; 4936 page_t *pl[PVN_GETPAGE_NUM + 1]; 4937 size_t plsz, pl_alloc_sz; 4938 size_t page; 4939 ulong_t anon_index; 4940 struct anon_map *amp; 4941 int dogetpage = 0; 4942 caddr_t lpgaddr, lpgeaddr; 4943 size_t pgsz; 4944 anon_sync_obj_t cookie; 4945 int brkcow = BREAK_COW_SHARE(rw, type, svd->type); 4946 4947 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 4948 ASSERT(svd->amp == NULL || svd->rcookie == HAT_INVALID_REGION_COOKIE); 4949 4950 /* 4951 * First handle the easy stuff 4952 */ 4953 if (type == F_SOFTUNLOCK) { 4954 if (rw == S_READ_NOCOW) { 4955 rw = S_READ; 4956 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 4957 } 4958 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 4959 pgsz = (seg->s_szc == 0) ? PAGESIZE : 4960 page_get_pagesize(seg->s_szc); 4961 VM_STAT_COND_ADD(pgsz > PAGESIZE, segvnvmstats.fltanpages[16]); 4962 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 4963 segvn_softunlock(seg, lpgaddr, lpgeaddr - lpgaddr, rw); 4964 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4965 return (0); 4966 } 4967 4968 ASSERT(svd->tr_state == SEGVN_TR_OFF || 4969 !HAT_IS_REGION_COOKIE_VALID(svd->rcookie)); 4970 if (brkcow == 0) { 4971 if (svd->tr_state == SEGVN_TR_INIT) { 4972 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4973 if (svd->tr_state == SEGVN_TR_INIT) { 4974 ASSERT(svd->vp != NULL && svd->amp == NULL); 4975 ASSERT(svd->flags & MAP_TEXT); 4976 ASSERT(svd->type == MAP_PRIVATE); 4977 segvn_textrepl(seg); 4978 ASSERT(svd->tr_state != SEGVN_TR_INIT); 4979 ASSERT(svd->tr_state != SEGVN_TR_ON || 4980 svd->amp != NULL); 4981 } 4982 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4983 } 4984 } else if (svd->tr_state != SEGVN_TR_OFF) { 4985 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4986 4987 if (rw == S_WRITE && svd->tr_state != SEGVN_TR_OFF) { 4988 ASSERT(!svd->pageprot && !(svd->prot & PROT_WRITE)); 4989 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4990 return (FC_PROT); 4991 } 4992 4993 if (svd->tr_state == SEGVN_TR_ON) { 4994 ASSERT(svd->vp != NULL && svd->amp != NULL); 4995 segvn_textunrepl(seg, 0); 4996 ASSERT(svd->amp == NULL && 4997 svd->tr_state == SEGVN_TR_OFF); 4998 } else if (svd->tr_state != SEGVN_TR_OFF) { 4999 svd->tr_state = SEGVN_TR_OFF; 5000 } 5001 ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF); 5002 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5003 } 5004 5005 top: 5006 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 5007 5008 /* 5009 * If we have the same protections for the entire segment, 5010 * insure that the access being attempted is legitimate. 
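 * When per-page protections are in use (svd->pageprot != 0) this
 * check is instead done against the individual vpage protections in
 * the per-page fault paths below.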
5011 */ 5012 5013 if (svd->pageprot == 0) { 5014 uint_t protchk; 5015 5016 switch (rw) { 5017 case S_READ: 5018 case S_READ_NOCOW: 5019 protchk = PROT_READ; 5020 break; 5021 case S_WRITE: 5022 protchk = PROT_WRITE; 5023 break; 5024 case S_EXEC: 5025 protchk = PROT_EXEC; 5026 break; 5027 case S_OTHER: 5028 default: 5029 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 5030 break; 5031 } 5032 5033 if ((svd->prot & protchk) == 0) { 5034 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5035 return (FC_PROT); /* illegal access type */ 5036 } 5037 } 5038 5039 if (brkcow && HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 5040 /* this must be SOFTLOCK S_READ fault */ 5041 ASSERT(svd->amp == NULL); 5042 ASSERT(svd->tr_state == SEGVN_TR_OFF); 5043 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5044 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 5045 if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 5046 /* 5047 * this must be the first ever non S_READ_NOCOW 5048 * softlock for this segment. 5049 */ 5050 ASSERT(svd->softlockcnt == 0); 5051 hat_leave_region(seg->s_as->a_hat, svd->rcookie, 5052 HAT_REGION_TEXT); 5053 svd->rcookie = HAT_INVALID_REGION_COOKIE; 5054 } 5055 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5056 goto top; 5057 } 5058 5059 /* 5060 * We can't allow the long term use of softlocks for vmpss segments, 5061 * because in some file truncation cases we should be able to demote 5062 * the segment, which requires that there are no softlocks. The 5063 * only case where it's ok to allow a SOFTLOCK fault against a vmpss 5064 * segment is S_READ_NOCOW, where the caller holds the address space 5065 * locked as writer and calls softunlock before dropping the as lock. 5066 * S_READ_NOCOW is used by /proc to read memory from another user. 5067 * 5068 * Another deadlock between SOFTLOCK and file truncation can happen 5069 * because segvn_fault_vnodepages() calls the FS one pagesize at 5070 * a time. A second VOP_GETPAGE() call by segvn_fault_vnodepages() 5071 * can cause a deadlock because the first set of page_t's remain 5072 * locked SE_SHARED. To avoid this, we demote segments on a first 5073 * SOFTLOCK if they have a length greater than the segment's 5074 * page size. 5075 * 5076 * So for now, we only avoid demoting a segment on a SOFTLOCK when 5077 * the access type is S_READ_NOCOW and the fault length is less than 5078 * or equal to the segment's page size. While this is quite restrictive, 5079 * it should be the most common case of SOFTLOCK against a vmpss 5080 * segment. 5081 * 5082 * For S_READ_NOCOW, it's safe not to do a copy on write because the 5083 * caller makes sure no COW will be caused by another thread for a 5084 * softlocked page. 
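 * The demote decision below implements this: the segment is demoted
 * on F_SOFTLOCK unless the access is S_READ_NOCOW and the large page
 * region computed for the fault spans no more than one large page.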
5085 */ 5086 if (type == F_SOFTLOCK && svd->vp != NULL && seg->s_szc != 0) { 5087 int demote = 0; 5088 5089 if (rw != S_READ_NOCOW) { 5090 demote = 1; 5091 } 5092 if (!demote && len > PAGESIZE) { 5093 pgsz = page_get_pagesize(seg->s_szc); 5094 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, 5095 lpgeaddr); 5096 if (lpgeaddr - lpgaddr > pgsz) { 5097 demote = 1; 5098 } 5099 } 5100 5101 ASSERT(demote || AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 5102 5103 if (demote) { 5104 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5105 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 5106 if (seg->s_szc != 0) { 5107 segvn_vmpss_clrszc_cnt++; 5108 ASSERT(svd->softlockcnt == 0); 5109 err = segvn_clrszc(seg); 5110 if (err) { 5111 segvn_vmpss_clrszc_err++; 5112 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5113 return (FC_MAKE_ERR(err)); 5114 } 5115 } 5116 ASSERT(seg->s_szc == 0); 5117 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5118 goto top; 5119 } 5120 } 5121 5122 /* 5123 * Check to see if we need to allocate an anon_map structure. 5124 */ 5125 if (svd->amp == NULL && (svd->vp == NULL || brkcow)) { 5126 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 5127 /* 5128 * Drop the "read" lock on the segment and acquire 5129 * the "write" version since we have to allocate the 5130 * anon_map. 5131 */ 5132 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5133 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 5134 5135 if (svd->amp == NULL) { 5136 svd->amp = anonmap_alloc(seg->s_size, 0, ANON_SLEEP); 5137 svd->amp->a_szc = seg->s_szc; 5138 } 5139 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5140 5141 /* 5142 * Start all over again since segment protections 5143 * may have changed after we dropped the "read" lock. 5144 */ 5145 goto top; 5146 } 5147 5148 /* 5149 * S_READ_NOCOW vs S_READ distinction was 5150 * only needed for the code above. After 5151 * that we treat it as S_READ. 5152 */ 5153 if (rw == S_READ_NOCOW) { 5154 ASSERT(type == F_SOFTLOCK); 5155 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 5156 rw = S_READ; 5157 } 5158 5159 amp = svd->amp; 5160 5161 /* 5162 * MADV_SEQUENTIAL work is ignored for large page segments. 5163 */ 5164 if (seg->s_szc != 0) { 5165 pgsz = page_get_pagesize(seg->s_szc); 5166 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 5167 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 5168 if (svd->vp == NULL) { 5169 err = segvn_fault_anonpages(hat, seg, lpgaddr, 5170 lpgeaddr, type, rw, addr, addr + len, brkcow); 5171 } else { 5172 err = segvn_fault_vnodepages(hat, seg, lpgaddr, 5173 lpgeaddr, type, rw, addr, addr + len, brkcow); 5174 if (err == IE_RETRY) { 5175 ASSERT(seg->s_szc == 0); 5176 ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock)); 5177 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5178 goto top; 5179 } 5180 } 5181 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5182 return (err); 5183 } 5184 5185 page = seg_page(seg, addr); 5186 if (amp != NULL) { 5187 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 5188 anon_index = svd->anon_index + page; 5189 5190 if (type == F_PROT && rw == S_READ && 5191 svd->tr_state == SEGVN_TR_OFF && 5192 svd->type == MAP_PRIVATE && svd->pageprot == 0) { 5193 size_t index = anon_index; 5194 struct anon *ap; 5195 5196 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5197 /* 5198 * The fast path could apply to S_WRITE also, except 5199 * that the protection fault could be caused by lazy 5200 * tlb flush when ro->rw. In this case, the pte is 5201 * RW already. But RO in the other cpu's tlb causes 5202 * the fault. 
Since hat_chgprot won't do anything if 5203 * pte doesn't change, we may end up faulting 5204 * indefinitely until the RO tlb entry gets replaced. 5205 */ 5206 for (a = addr; a < addr + len; a += PAGESIZE, index++) { 5207 anon_array_enter(amp, index, &cookie); 5208 ap = anon_get_ptr(amp->ahp, index); 5209 anon_array_exit(&cookie); 5210 if ((ap == NULL) || (ap->an_refcnt != 1)) { 5211 ANON_LOCK_EXIT(&->a_rwlock); 5212 goto slow; 5213 } 5214 } 5215 hat_chgprot(seg->s_as->a_hat, addr, len, svd->prot); 5216 ANON_LOCK_EXIT(&->a_rwlock); 5217 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5218 return (0); 5219 } 5220 } 5221 slow: 5222 5223 if (svd->vpage == NULL) 5224 vpage = NULL; 5225 else 5226 vpage = &svd->vpage[page]; 5227 5228 off = svd->offset + (uintptr_t)(addr - seg->s_base); 5229 5230 /* 5231 * If MADV_SEQUENTIAL has been set for the particular page we 5232 * are faulting on, free behind all pages in the segment and put 5233 * them on the free list. 5234 */ 5235 5236 if ((page != 0) && fltadvice && svd->tr_state != SEGVN_TR_ON) { 5237 struct vpage *vpp; 5238 ulong_t fanon_index; 5239 size_t fpage; 5240 u_offset_t pgoff, fpgoff; 5241 struct vnode *fvp; 5242 struct anon *fap = NULL; 5243 5244 if (svd->advice == MADV_SEQUENTIAL || 5245 (svd->pageadvice && 5246 VPP_ADVICE(vpage) == MADV_SEQUENTIAL)) { 5247 pgoff = off - PAGESIZE; 5248 fpage = page - 1; 5249 if (vpage != NULL) 5250 vpp = &svd->vpage[fpage]; 5251 if (amp != NULL) 5252 fanon_index = svd->anon_index + fpage; 5253 5254 while (pgoff > svd->offset) { 5255 if (svd->advice != MADV_SEQUENTIAL && 5256 (!svd->pageadvice || (vpage && 5257 VPP_ADVICE(vpp) != MADV_SEQUENTIAL))) 5258 break; 5259 5260 /* 5261 * If this is an anon page, we must find the 5262 * correct <vp, offset> for it 5263 */ 5264 fap = NULL; 5265 if (amp != NULL) { 5266 ANON_LOCK_ENTER(&->a_rwlock, 5267 RW_READER); 5268 anon_array_enter(amp, fanon_index, 5269 &cookie); 5270 fap = anon_get_ptr(amp->ahp, 5271 fanon_index); 5272 if (fap != NULL) { 5273 swap_xlate(fap, &fvp, &fpgoff); 5274 } else { 5275 fpgoff = pgoff; 5276 fvp = svd->vp; 5277 } 5278 anon_array_exit(&cookie); 5279 ANON_LOCK_EXIT(&->a_rwlock); 5280 } else { 5281 fpgoff = pgoff; 5282 fvp = svd->vp; 5283 } 5284 if (fvp == NULL) 5285 break; /* XXX */ 5286 /* 5287 * Skip pages that are free or have an 5288 * "exclusive" lock. 5289 */ 5290 pp = page_lookup_nowait(fvp, fpgoff, SE_SHARED); 5291 if (pp == NULL) 5292 break; 5293 /* 5294 * We don't need the page_struct_lock to test 5295 * as this is only advisory; even if we 5296 * acquire it someone might race in and lock 5297 * the page after we unlock and before the 5298 * PUTPAGE, then VOP_PUTPAGE will do nothing. 5299 */ 5300 if (pp->p_lckcnt == 0 && pp->p_cowcnt == 0) { 5301 /* 5302 * Hold the vnode before releasing 5303 * the page lock to prevent it from 5304 * being freed and re-used by some 5305 * other thread. 5306 */ 5307 VN_HOLD(fvp); 5308 page_unlock(pp); 5309 /* 5310 * We should build a page list 5311 * to kluster putpages XXX 5312 */ 5313 (void) VOP_PUTPAGE(fvp, 5314 (offset_t)fpgoff, PAGESIZE, 5315 (B_DONTNEED|B_FREE|B_ASYNC), 5316 svd->cred, NULL); 5317 VN_RELE(fvp); 5318 } else { 5319 /* 5320 * XXX - Should the loop terminate if 5321 * the page is `locked'? 5322 */ 5323 page_unlock(pp); 5324 } 5325 --vpp; 5326 --fanon_index; 5327 pgoff -= PAGESIZE; 5328 } 5329 } 5330 } 5331 5332 plp = pl; 5333 *plp = NULL; 5334 pl_alloc_sz = 0; 5335 5336 /* 5337 * See if we need to call VOP_GETPAGE for 5338 * *any* of the range being faulted on. 
5339 * We can skip all of this work if there 5340 * was no original vnode. 5341 */ 5342 if (svd->vp != NULL) { 5343 u_offset_t vp_off; 5344 size_t vp_len; 5345 struct anon *ap; 5346 vnode_t *vp; 5347 5348 vp_off = off; 5349 vp_len = len; 5350 5351 if (amp == NULL) 5352 dogetpage = 1; 5353 else { 5354 /* 5355 * Only acquire reader lock to prevent amp->ahp 5356 * from being changed. It's ok to miss pages, 5357 * hence we don't do anon_array_enter 5358 */ 5359 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5360 ap = anon_get_ptr(amp->ahp, anon_index); 5361 5362 if (len <= PAGESIZE) 5363 /* inline non_anon() */ 5364 dogetpage = (ap == NULL); 5365 else 5366 dogetpage = non_anon(amp->ahp, anon_index, 5367 &vp_off, &vp_len); 5368 ANON_LOCK_EXIT(&->a_rwlock); 5369 } 5370 5371 if (dogetpage) { 5372 enum seg_rw arw; 5373 struct as *as = seg->s_as; 5374 5375 if (len > ptob((sizeof (pl) / sizeof (pl[0])) - 1)) { 5376 /* 5377 * Page list won't fit in local array, 5378 * allocate one of the needed size. 5379 */ 5380 pl_alloc_sz = 5381 (btop(len) + 1) * sizeof (page_t *); 5382 plp = kmem_alloc(pl_alloc_sz, KM_SLEEP); 5383 plp[0] = NULL; 5384 plsz = len; 5385 } else if (rw == S_WRITE && svd->type == MAP_PRIVATE || 5386 svd->tr_state == SEGVN_TR_ON || rw == S_OTHER || 5387 (((size_t)(addr + PAGESIZE) < 5388 (size_t)(seg->s_base + seg->s_size)) && 5389 hat_probe(as->a_hat, addr + PAGESIZE))) { 5390 /* 5391 * Ask VOP_GETPAGE to return the exact number 5392 * of pages if 5393 * (a) this is a COW fault, or 5394 * (b) this is a software fault, or 5395 * (c) next page is already mapped. 5396 */ 5397 plsz = len; 5398 } else { 5399 /* 5400 * Ask VOP_GETPAGE to return adjacent pages 5401 * within the segment. 5402 */ 5403 plsz = MIN((size_t)PVN_GETPAGE_SZ, (size_t) 5404 ((seg->s_base + seg->s_size) - addr)); 5405 ASSERT((addr + plsz) <= 5406 (seg->s_base + seg->s_size)); 5407 } 5408 5409 /* 5410 * Need to get some non-anonymous pages. 5411 * We need to make only one call to GETPAGE to do 5412 * this to prevent certain deadlocking conditions 5413 * when we are doing locking. In this case 5414 * non_anon() should have picked up the smallest 5415 * range which includes all the non-anonymous 5416 * pages in the requested range. We have to 5417 * be careful regarding which rw flag to pass in 5418 * because on a private mapping, the underlying 5419 * object is never allowed to be written. 5420 */ 5421 if (rw == S_WRITE && svd->type == MAP_PRIVATE) { 5422 arw = S_READ; 5423 } else { 5424 arw = rw; 5425 } 5426 vp = svd->vp; 5427 TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE, 5428 "segvn_getpage:seg %p addr %p vp %p", 5429 seg, addr, vp); 5430 err = VOP_GETPAGE(vp, (offset_t)vp_off, vp_len, 5431 &vpprot, plp, plsz, seg, addr + (vp_off - off), arw, 5432 svd->cred, NULL); 5433 if (err) { 5434 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5435 segvn_pagelist_rele(plp); 5436 if (pl_alloc_sz) 5437 kmem_free(plp, pl_alloc_sz); 5438 return (FC_MAKE_ERR(err)); 5439 } 5440 if (svd->type == MAP_PRIVATE) 5441 vpprot &= ~PROT_WRITE; 5442 } 5443 } 5444 5445 /* 5446 * N.B. at this time the plp array has all the needed non-anon 5447 * pages in addition to (possibly) having some adjacent pages. 5448 */ 5449 5450 /* 5451 * Always acquire the anon_array_lock to prevent 5452 * 2 threads from allocating separate anon slots for 5453 * the same "addr". 
5454 * 5455 * If this is a copy-on-write fault and we don't already 5456 * have the anon_array_lock, acquire it to prevent the 5457 * fault routine from handling multiple copy-on-write faults 5458 * on the same "addr" in the same address space. 5459 * 5460 * Only one thread should deal with the fault since after 5461 * it is handled, the other threads can acquire a translation 5462 * to the newly created private page. This prevents two or 5463 * more threads from creating different private pages for the 5464 * same fault. 5465 * 5466 * We grab "serialization" lock here if this is a MAP_PRIVATE segment 5467 * to prevent deadlock between this thread and another thread 5468 * which has soft-locked this page and wants to acquire serial_lock. 5469 * ( bug 4026339 ) 5470 * 5471 * The fix for bug 4026339 becomes unnecessary when using the 5472 * locking scheme with per amp rwlock and a global set of hash 5473 * lock, anon_array_lock. If we steal a vnode page when low 5474 * on memory and upgrad the page lock through page_rename, 5475 * then the page is PAGE_HANDLED, nothing needs to be done 5476 * for this page after returning from segvn_faultpage. 5477 * 5478 * But really, the page lock should be downgraded after 5479 * the stolen page is page_rename'd. 5480 */ 5481 5482 if (amp != NULL) 5483 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5484 5485 /* 5486 * Ok, now loop over the address range and handle faults 5487 */ 5488 for (a = addr; a < addr + len; a += PAGESIZE, off += PAGESIZE) { 5489 err = segvn_faultpage(hat, seg, a, off, vpage, plp, vpprot, 5490 type, rw, brkcow, a == addr); 5491 if (err) { 5492 if (amp != NULL) 5493 ANON_LOCK_EXIT(&->a_rwlock); 5494 if (type == F_SOFTLOCK && a > addr) { 5495 segvn_softunlock(seg, addr, (a - addr), 5496 S_OTHER); 5497 } 5498 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5499 segvn_pagelist_rele(plp); 5500 if (pl_alloc_sz) 5501 kmem_free(plp, pl_alloc_sz); 5502 return (err); 5503 } 5504 if (vpage) { 5505 vpage++; 5506 } else if (svd->vpage) { 5507 page = seg_page(seg, addr); 5508 vpage = &svd->vpage[++page]; 5509 } 5510 } 5511 5512 /* Didn't get pages from the underlying fs so we're done */ 5513 if (!dogetpage) 5514 goto done; 5515 5516 /* 5517 * Now handle any other pages in the list returned. 5518 * If the page can be used, load up the translations now. 5519 * Note that the for loop will only be entered if "plp" 5520 * is pointing to a non-NULL page pointer which means that 5521 * VOP_GETPAGE() was called and vpprot has been initialized. 5522 */ 5523 if (svd->pageprot == 0) 5524 prot = svd->prot & vpprot; 5525 5526 5527 /* 5528 * Large Files: diff should be unsigned value because we started 5529 * supporting > 2GB segment sizes from 2.5.1 and when a 5530 * large file of size > 2GB gets mapped to address space 5531 * the diff value can be > 2GB. 5532 */ 5533 5534 for (ppp = plp; (pp = *ppp) != NULL; ppp++) { 5535 size_t diff; 5536 struct anon *ap; 5537 int anon_index; 5538 anon_sync_obj_t cookie; 5539 int hat_flag = HAT_LOAD_ADV; 5540 5541 if (svd->flags & MAP_TEXT) { 5542 hat_flag |= HAT_LOAD_TEXT; 5543 } 5544 5545 if (pp == PAGE_HANDLED) 5546 continue; 5547 5548 if (svd->tr_state != SEGVN_TR_ON && 5549 pp->p_offset >= svd->offset && 5550 pp->p_offset < svd->offset + seg->s_size) { 5551 5552 diff = pp->p_offset - svd->offset; 5553 5554 /* 5555 * Large Files: Following is the assertion 5556 * validating the above cast. 
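 * (diff is the byte offset of this page from the start of the
 * segment's mapping; btop(diff) below turns it into the page index
 * used for the vpage and anon lookups.)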
5557 */ 5558 ASSERT(svd->vp == pp->p_vnode); 5559 5560 page = btop(diff); 5561 if (svd->pageprot) 5562 prot = VPP_PROT(&svd->vpage[page]) & vpprot; 5563 5564 /* 5565 * Prevent other threads in the address space from 5566 * creating private pages (i.e., allocating anon slots) 5567 * while we are in the process of loading translations 5568 * to additional pages returned by the underlying 5569 * object. 5570 */ 5571 if (amp != NULL) { 5572 anon_index = svd->anon_index + page; 5573 anon_array_enter(amp, anon_index, &cookie); 5574 ap = anon_get_ptr(amp->ahp, anon_index); 5575 } 5576 if ((amp == NULL) || (ap == NULL)) { 5577 if (IS_VMODSORT(pp->p_vnode) || 5578 enable_mbit_wa) { 5579 if (rw == S_WRITE) 5580 hat_setmod(pp); 5581 else if (rw != S_OTHER && 5582 !hat_ismod(pp)) 5583 prot &= ~PROT_WRITE; 5584 } 5585 /* 5586 * Skip mapping read ahead pages marked 5587 * for migration, so they will get migrated 5588 * properly on fault 5589 */ 5590 ASSERT(amp == NULL || 5591 svd->rcookie == HAT_INVALID_REGION_COOKIE); 5592 if ((prot & PROT_READ) && !PP_ISMIGRATE(pp)) { 5593 hat_memload_region(hat, 5594 seg->s_base + diff, 5595 pp, prot, hat_flag, 5596 svd->rcookie); 5597 } 5598 } 5599 if (amp != NULL) 5600 anon_array_exit(&cookie); 5601 } 5602 page_unlock(pp); 5603 } 5604 done: 5605 if (amp != NULL) 5606 ANON_LOCK_EXIT(&->a_rwlock); 5607 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5608 if (pl_alloc_sz) 5609 kmem_free(plp, pl_alloc_sz); 5610 return (0); 5611 } 5612 5613 /* 5614 * This routine is used to start I/O on pages asynchronously. XXX it will 5615 * only create PAGESIZE pages. At fault time they will be relocated into 5616 * larger pages. 5617 */ 5618 static faultcode_t 5619 segvn_faulta(struct seg *seg, caddr_t addr) 5620 { 5621 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5622 int err; 5623 struct anon_map *amp; 5624 vnode_t *vp; 5625 5626 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5627 5628 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 5629 if ((amp = svd->amp) != NULL) { 5630 struct anon *ap; 5631 5632 /* 5633 * Reader lock to prevent amp->ahp from being changed. 5634 * This is advisory, it's ok to miss a page, so 5635 * we don't do anon_array_enter lock. 
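 * If an anon slot exists the read is started through anon_getpage();
 * otherwise we fall through and issue a VOP_GETPAGE() with a NULL
 * page list purely to get the I/O started.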
5636 */ 5637 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5638 if ((ap = anon_get_ptr(amp->ahp, 5639 svd->anon_index + seg_page(seg, addr))) != NULL) { 5640 5641 err = anon_getpage(&ap, NULL, NULL, 5642 0, seg, addr, S_READ, svd->cred); 5643 5644 ANON_LOCK_EXIT(&->a_rwlock); 5645 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5646 if (err) 5647 return (FC_MAKE_ERR(err)); 5648 return (0); 5649 } 5650 ANON_LOCK_EXIT(&->a_rwlock); 5651 } 5652 5653 if (svd->vp == NULL) { 5654 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5655 return (0); /* zfod page - do nothing now */ 5656 } 5657 5658 vp = svd->vp; 5659 TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE, 5660 "segvn_getpage:seg %p addr %p vp %p", seg, addr, vp); 5661 err = VOP_GETPAGE(vp, 5662 (offset_t)(svd->offset + (uintptr_t)(addr - seg->s_base)), 5663 PAGESIZE, NULL, NULL, 0, seg, addr, 5664 S_OTHER, svd->cred, NULL); 5665 5666 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5667 if (err) 5668 return (FC_MAKE_ERR(err)); 5669 return (0); 5670 } 5671 5672 static int 5673 segvn_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) 5674 { 5675 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5676 struct vpage *svp, *evp; 5677 struct vnode *vp; 5678 size_t pgsz; 5679 pgcnt_t pgcnt; 5680 anon_sync_obj_t cookie; 5681 int unload_done = 0; 5682 5683 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5684 5685 if ((svd->maxprot & prot) != prot) 5686 return (EACCES); /* violated maxprot */ 5687 5688 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 5689 5690 /* return if prot is the same */ 5691 if (!svd->pageprot && svd->prot == prot) { 5692 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5693 return (0); 5694 } 5695 5696 /* 5697 * Since we change protections we first have to flush the cache. 5698 * This makes sure all the pagelock calls have to recheck 5699 * protections. 5700 */ 5701 if (svd->softlockcnt > 0) { 5702 ASSERT(svd->tr_state == SEGVN_TR_OFF); 5703 /* 5704 * Since we do have the segvn writers lock nobody can fill 5705 * the cache with entries belonging to this seg during 5706 * the purge. The flush either succeeds or we still have 5707 * pending I/Os. 5708 */ 5709 segvn_purge(seg); 5710 if (svd->softlockcnt > 0) { 5711 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5712 return (EAGAIN); 5713 } 5714 } 5715 5716 if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 5717 ASSERT(svd->amp == NULL); 5718 ASSERT(svd->tr_state == SEGVN_TR_OFF); 5719 hat_leave_region(seg->s_as->a_hat, svd->rcookie, 5720 HAT_REGION_TEXT); 5721 svd->rcookie = HAT_INVALID_REGION_COOKIE; 5722 unload_done = 1; 5723 } else if (svd->tr_state == SEGVN_TR_INIT) { 5724 svd->tr_state = SEGVN_TR_OFF; 5725 } else if (svd->tr_state == SEGVN_TR_ON) { 5726 ASSERT(svd->amp != NULL); 5727 segvn_textunrepl(seg, 0); 5728 ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF); 5729 unload_done = 1; 5730 } 5731 5732 if ((prot & PROT_WRITE) && svd->type == MAP_SHARED && 5733 svd->vp != NULL && (svd->vp->v_flag & VVMEXEC)) { 5734 ASSERT(vn_is_mapped(svd->vp, V_WRITE)); 5735 segvn_inval_trcache(svd->vp); 5736 } 5737 if (seg->s_szc != 0) { 5738 int err; 5739 pgsz = page_get_pagesize(seg->s_szc); 5740 pgcnt = pgsz >> PAGESHIFT; 5741 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 5742 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { 5743 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5744 ASSERT(seg->s_base != addr || seg->s_size != len); 5745 /* 5746 * If we are holding the as lock as a reader then 5747 * we need to return IE_RETRY and let the as 5748 * layer drop and re-acquire the lock as a writer. 
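 * When the address space lock is held as writer, segvn_demote_range()
 * below splits the affected range into smaller page sizes so the
 * protection change can take effect on a sub-large-page boundary.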
5749 */ 5750 if (AS_READ_HELD(seg->s_as, &seg->s_as->a_lock)) 5751 return (IE_RETRY); 5752 VM_STAT_ADD(segvnvmstats.demoterange[1]); 5753 if (svd->type == MAP_PRIVATE || svd->vp != NULL) { 5754 err = segvn_demote_range(seg, addr, len, 5755 SDR_END, 0); 5756 } else { 5757 uint_t szcvec = map_pgszcvec(seg->s_base, 5758 pgsz, (uintptr_t)seg->s_base, 5759 (svd->flags & MAP_TEXT), MAPPGSZC_SHM, 0); 5760 err = segvn_demote_range(seg, addr, len, 5761 SDR_END, szcvec); 5762 } 5763 if (err == 0) 5764 return (IE_RETRY); 5765 if (err == ENOMEM) 5766 return (IE_NOMEM); 5767 return (err); 5768 } 5769 } 5770 5771 5772 /* 5773 * If it's a private mapping and we're making it writable 5774 * and no swap space has been reserved, have to reserve 5775 * it all now. If it's a private mapping to a file (i.e., vp != NULL) 5776 * and we're removing write permission on the entire segment and 5777 * we haven't modified any pages, we can release the swap space. 5778 */ 5779 if (svd->type == MAP_PRIVATE) { 5780 if (prot & PROT_WRITE) { 5781 size_t sz; 5782 if (svd->swresv == 0 && !(svd->flags & MAP_NORESERVE)) { 5783 if (anon_resv_zone(seg->s_size, 5784 seg->s_as->a_proc->p_zone) == 0) { 5785 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5786 return (IE_NOMEM); 5787 } 5788 sz = svd->swresv = seg->s_size; 5789 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 5790 "anon proc:%p %lu %u", 5791 seg, sz, 1); 5792 } 5793 } else { 5794 /* 5795 * Swap space is released only if this segment 5796 * does not map anonymous memory, since read faults 5797 * on such segments still need an anon slot to read 5798 * in the data. 5799 */ 5800 if (svd->swresv != 0 && svd->vp != NULL && 5801 svd->amp == NULL && addr == seg->s_base && 5802 len == seg->s_size && svd->pageprot == 0) { 5803 anon_unresv_zone(svd->swresv, 5804 seg->s_as->a_proc->p_zone); 5805 svd->swresv = 0; 5806 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 5807 "anon proc:%p %lu %u", 5808 seg, 0, 0); 5809 } 5810 } 5811 } 5812 5813 if (addr == seg->s_base && len == seg->s_size && svd->vpage == NULL) { 5814 if (svd->prot == prot) { 5815 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5816 return (0); /* all done */ 5817 } 5818 svd->prot = (uchar_t)prot; 5819 } else if (svd->type == MAP_PRIVATE) { 5820 struct anon *ap = NULL; 5821 page_t *pp; 5822 u_offset_t offset, off; 5823 struct anon_map *amp; 5824 ulong_t anon_idx = 0; 5825 5826 /* 5827 * A vpage structure exists or else the change does not 5828 * involve the entire segment. Establish a vpage structure 5829 * if none is there. Then, for each page in the range, 5830 * adjust its individual permissions. Note that write- 5831 * enabling a MAP_PRIVATE page can affect the claims for 5832 * locked down memory. Overcommitting memory terminates 5833 * the operation. 5834 */ 5835 segvn_vpage(seg); 5836 svd->pageprot = 1; 5837 if ((amp = svd->amp) != NULL) { 5838 anon_idx = svd->anon_index + seg_page(seg, addr); 5839 ASSERT(seg->s_szc == 0 || 5840 IS_P2ALIGNED(anon_idx, pgcnt)); 5841 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5842 } 5843 5844 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 5845 evp = &svd->vpage[seg_page(seg, addr + len)]; 5846 5847 /* 5848 * See Statement at the beginning of segvn_lockop regarding 5849 * the way cowcnts and lckcnts are handled. 
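 * In short: when PROT_WRITE changes on a page that is currently
 * locked (VPP_ISPPLOCK), page_addclaim()/page_subclaim() below adjust
 * the page's lock/cow claims; a claim that cannot be granted ends the
 * loop early and is treated as memory overcommit.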
5850 */ 5851 for (svp = &svd->vpage[seg_page(seg, addr)]; svp < evp; svp++) { 5852 5853 if (seg->s_szc != 0) { 5854 if (amp != NULL) { 5855 anon_array_enter(amp, anon_idx, 5856 &cookie); 5857 } 5858 if (IS_P2ALIGNED(anon_idx, pgcnt) && 5859 !segvn_claim_pages(seg, svp, offset, 5860 anon_idx, prot)) { 5861 if (amp != NULL) { 5862 anon_array_exit(&cookie); 5863 } 5864 break; 5865 } 5866 if (amp != NULL) { 5867 anon_array_exit(&cookie); 5868 } 5869 anon_idx++; 5870 } else { 5871 if (amp != NULL) { 5872 anon_array_enter(amp, anon_idx, 5873 &cookie); 5874 ap = anon_get_ptr(amp->ahp, anon_idx++); 5875 } 5876 5877 if (VPP_ISPPLOCK(svp) && 5878 VPP_PROT(svp) != prot) { 5879 5880 if (amp == NULL || ap == NULL) { 5881 vp = svd->vp; 5882 off = offset; 5883 } else 5884 swap_xlate(ap, &vp, &off); 5885 if (amp != NULL) 5886 anon_array_exit(&cookie); 5887 5888 if ((pp = page_lookup(vp, off, 5889 SE_SHARED)) == NULL) { 5890 panic("segvn_setprot: no page"); 5891 /*NOTREACHED*/ 5892 } 5893 ASSERT(seg->s_szc == 0); 5894 if ((VPP_PROT(svp) ^ prot) & 5895 PROT_WRITE) { 5896 if (prot & PROT_WRITE) { 5897 if (!page_addclaim(pp)) { 5898 page_unlock(pp); 5899 break; 5900 } 5901 } else { 5902 if (!page_subclaim(pp)) { 5903 page_unlock(pp); 5904 break; 5905 } 5906 } 5907 } 5908 page_unlock(pp); 5909 } else if (amp != NULL) 5910 anon_array_exit(&cookie); 5911 } 5912 VPP_SETPROT(svp, prot); 5913 offset += PAGESIZE; 5914 } 5915 if (amp != NULL) 5916 ANON_LOCK_EXIT(&->a_rwlock); 5917 5918 /* 5919 * Did we terminate prematurely? If so, simply unload 5920 * the translations to the things we've updated so far. 5921 */ 5922 if (svp != evp) { 5923 if (unload_done) { 5924 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5925 return (IE_NOMEM); 5926 } 5927 len = (svp - &svd->vpage[seg_page(seg, addr)]) * 5928 PAGESIZE; 5929 ASSERT(seg->s_szc == 0 || IS_P2ALIGNED(len, pgsz)); 5930 if (len != 0) 5931 hat_unload(seg->s_as->a_hat, addr, 5932 len, HAT_UNLOAD); 5933 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5934 return (IE_NOMEM); 5935 } 5936 } else { 5937 segvn_vpage(seg); 5938 svd->pageprot = 1; 5939 evp = &svd->vpage[seg_page(seg, addr + len)]; 5940 for (svp = &svd->vpage[seg_page(seg, addr)]; svp < evp; svp++) { 5941 VPP_SETPROT(svp, prot); 5942 } 5943 } 5944 5945 if (unload_done) { 5946 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5947 return (0); 5948 } 5949 5950 if (((prot & PROT_WRITE) != 0 && 5951 (svd->vp != NULL || svd->type == MAP_PRIVATE)) || 5952 (prot & ~PROT_USER) == PROT_NONE) { 5953 /* 5954 * Either private or shared data with write access (in 5955 * which case we need to throw out all former translations 5956 * so that we get the right translations set up on fault 5957 * and we don't allow write access to any copy-on-write pages 5958 * that might be around or to prevent write access to pages 5959 * representing holes in a file), or we don't have permission 5960 * to access the memory at all (in which case we have to 5961 * unload any current translations that might exist). 5962 */ 5963 hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD); 5964 } else { 5965 /* 5966 * A shared mapping or a private mapping in which write 5967 * protection is going to be denied - just change all the 5968 * protections over the range of addresses in question. 5969 * segvn does not support any other attributes other 5970 * than prot so we can use hat_chgattr. 
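 * (hat_chgattr() rewrites the protections of any existing translations
 * in place, whereas the hat_unload() case above discards them so they
 * are re-established with the new protections on the next fault.)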
5971 */ 5972 hat_chgattr(seg->s_as->a_hat, addr, len, prot); 5973 } 5974 5975 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5976 5977 return (0); 5978 } 5979 5980 /* 5981 * segvn_setpagesize is called via SEGOP_SETPAGESIZE from as_setpagesize, 5982 * to determine if the seg is capable of mapping the requested szc. 5983 */ 5984 static int 5985 segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc) 5986 { 5987 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5988 struct segvn_data *nsvd; 5989 struct anon_map *amp = svd->amp; 5990 struct seg *nseg; 5991 caddr_t eaddr = addr + len, a; 5992 size_t pgsz = page_get_pagesize(szc); 5993 pgcnt_t pgcnt = page_get_pagecnt(szc); 5994 int err; 5995 u_offset_t off = svd->offset + (uintptr_t)(addr - seg->s_base); 5996 extern struct vnode kvp; 5997 5998 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 5999 ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size); 6000 6001 if (seg->s_szc == szc || segvn_lpg_disable != 0) { 6002 return (0); 6003 } 6004 6005 /* 6006 * addr should always be pgsz aligned but eaddr may be misaligned if 6007 * it's at the end of the segment. 6008 * 6009 * XXX we should assert this condition since as_setpagesize() logic 6010 * guarantees it. 6011 */ 6012 if (!IS_P2ALIGNED(addr, pgsz) || 6013 (!IS_P2ALIGNED(eaddr, pgsz) && 6014 eaddr != seg->s_base + seg->s_size)) { 6015 6016 segvn_setpgsz_align_err++; 6017 return (EINVAL); 6018 } 6019 6020 if (amp != NULL && svd->type == MAP_SHARED) { 6021 ulong_t an_idx = svd->anon_index + seg_page(seg, addr); 6022 if (!IS_P2ALIGNED(an_idx, pgcnt)) { 6023 6024 segvn_setpgsz_anon_align_err++; 6025 return (EINVAL); 6026 } 6027 } 6028 6029 if ((svd->flags & MAP_NORESERVE) || seg->s_as == &kas || 6030 szc > segvn_maxpgszc) { 6031 return (EINVAL); 6032 } 6033 6034 /* paranoid check */ 6035 if (svd->vp != NULL && 6036 (IS_SWAPFSVP(svd->vp) || VN_ISKAS(svd->vp))) { 6037 return (EINVAL); 6038 } 6039 6040 if (seg->s_szc == 0 && svd->vp != NULL && 6041 map_addr_vacalign_check(addr, off)) { 6042 return (EINVAL); 6043 } 6044 6045 /* 6046 * Check that protections are the same within new page 6047 * size boundaries. 6048 */ 6049 if (svd->pageprot) { 6050 for (a = addr; a < eaddr; a += pgsz) { 6051 if ((a + pgsz) > eaddr) { 6052 if (!sameprot(seg, a, eaddr - a)) { 6053 return (EINVAL); 6054 } 6055 } else { 6056 if (!sameprot(seg, a, pgsz)) { 6057 return (EINVAL); 6058 } 6059 } 6060 } 6061 } 6062 6063 /* 6064 * Since we are changing page size we first have to flush 6065 * the cache. This makes sure all the pagelock calls have 6066 * to recheck protections. 6067 */ 6068 if (svd->softlockcnt > 0) { 6069 ASSERT(svd->tr_state == SEGVN_TR_OFF); 6070 /* 6071 * Since we do have the segvn writers lock nobody can fill 6072 * the cache with entries belonging to this seg during 6073 * the purge. The flush either succeeds or we still have 6074 * pending I/Os. 
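 * (If softlockcnt is still non-zero after the purge, there is pending
 * I/O against the segment and we give up with EAGAIN below rather
 * than wait for it to drain.)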
6075 */ 6076 segvn_purge(seg); 6077 if (svd->softlockcnt > 0) { 6078 return (EAGAIN); 6079 } 6080 } 6081 6082 if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 6083 ASSERT(svd->amp == NULL); 6084 ASSERT(svd->tr_state == SEGVN_TR_OFF); 6085 hat_leave_region(seg->s_as->a_hat, svd->rcookie, 6086 HAT_REGION_TEXT); 6087 svd->rcookie = HAT_INVALID_REGION_COOKIE; 6088 } else if (svd->tr_state == SEGVN_TR_INIT) { 6089 svd->tr_state = SEGVN_TR_OFF; 6090 } else if (svd->tr_state == SEGVN_TR_ON) { 6091 ASSERT(svd->amp != NULL); 6092 segvn_textunrepl(seg, 1); 6093 ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF); 6094 amp = NULL; 6095 } 6096 6097 /* 6098 * Operation for sub range of existing segment. 6099 */ 6100 if (addr != seg->s_base || eaddr != (seg->s_base + seg->s_size)) { 6101 if (szc < seg->s_szc) { 6102 VM_STAT_ADD(segvnvmstats.demoterange[2]); 6103 err = segvn_demote_range(seg, addr, len, SDR_RANGE, 0); 6104 if (err == 0) { 6105 return (IE_RETRY); 6106 } 6107 if (err == ENOMEM) { 6108 return (IE_NOMEM); 6109 } 6110 return (err); 6111 } 6112 if (addr != seg->s_base) { 6113 nseg = segvn_split_seg(seg, addr); 6114 if (eaddr != (nseg->s_base + nseg->s_size)) { 6115 /* eaddr is szc aligned */ 6116 (void) segvn_split_seg(nseg, eaddr); 6117 } 6118 return (IE_RETRY); 6119 } 6120 if (eaddr != (seg->s_base + seg->s_size)) { 6121 /* eaddr is szc aligned */ 6122 (void) segvn_split_seg(seg, eaddr); 6123 } 6124 return (IE_RETRY); 6125 } 6126 6127 /* 6128 * Break any low level sharing and reset seg->s_szc to 0. 6129 */ 6130 if ((err = segvn_clrszc(seg)) != 0) { 6131 if (err == ENOMEM) { 6132 err = IE_NOMEM; 6133 } 6134 return (err); 6135 } 6136 ASSERT(seg->s_szc == 0); 6137 6138 /* 6139 * If the end of the current segment is not pgsz aligned 6140 * then attempt to concatenate with the next segment. 6141 */ 6142 if (!IS_P2ALIGNED(eaddr, pgsz)) { 6143 nseg = AS_SEGNEXT(seg->s_as, seg); 6144 if (nseg == NULL || nseg == seg || eaddr != nseg->s_base) { 6145 return (ENOMEM); 6146 } 6147 if (nseg->s_ops != &segvn_ops) { 6148 return (EINVAL); 6149 } 6150 nsvd = (struct segvn_data *)nseg->s_data; 6151 if (nsvd->softlockcnt > 0) { 6152 segvn_purge(nseg); 6153 if (nsvd->softlockcnt > 0) { 6154 return (EAGAIN); 6155 } 6156 } 6157 err = segvn_clrszc(nseg); 6158 if (err == ENOMEM) { 6159 err = IE_NOMEM; 6160 } 6161 if (err != 0) { 6162 return (err); 6163 } 6164 ASSERT(nsvd->rcookie == HAT_INVALID_REGION_COOKIE); 6165 err = segvn_concat(seg, nseg, 1); 6166 if (err == -1) { 6167 return (EINVAL); 6168 } 6169 if (err == -2) { 6170 return (IE_NOMEM); 6171 } 6172 return (IE_RETRY); 6173 } 6174 6175 /* 6176 * May need to re-align anon array to 6177 * new szc. 
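 * (The anon index must be aligned to the page count of the new size;
 * for a misaligned private amp the slots are copied below into a
 * fresh anon header that starts at index 0.)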
6178 */
6179 if (amp != NULL) {
6180 if (!IS_P2ALIGNED(svd->anon_index, pgcnt)) {
6181 struct anon_hdr *nahp;
6182
6183 ASSERT(svd->type == MAP_PRIVATE);
6184
6185 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
6186 ASSERT(amp->refcnt == 1);
6187 nahp = anon_create(btop(amp->size), ANON_NOSLEEP);
6188 if (nahp == NULL) {
6189 ANON_LOCK_EXIT(&amp->a_rwlock);
6190 return (IE_NOMEM);
6191 }
6192 if (anon_copy_ptr(amp->ahp, svd->anon_index,
6193 nahp, 0, btop(seg->s_size), ANON_NOSLEEP)) {
6194 anon_release(nahp, btop(amp->size));
6195 ANON_LOCK_EXIT(&amp->a_rwlock);
6196 return (IE_NOMEM);
6197 }
6198 anon_release(amp->ahp, btop(amp->size));
6199 amp->ahp = nahp;
6200 svd->anon_index = 0;
6201 ANON_LOCK_EXIT(&amp->a_rwlock);
6202 }
6203 }
6204 if (svd->vp != NULL && szc != 0) {
6205 struct vattr va;
6206 u_offset_t eoffpage = svd->offset;
6207 va.va_mask = AT_SIZE;
6208 eoffpage += seg->s_size;
6209 eoffpage = btopr(eoffpage);
6210 if (VOP_GETATTR(svd->vp, &va, 0, svd->cred, NULL) != 0) {
6211 segvn_setpgsz_getattr_err++;
6212 return (EINVAL);
6213 }
6214 if (btopr(va.va_size) < eoffpage) {
6215 segvn_setpgsz_eof_err++;
6216 return (EINVAL);
6217 }
6218 if (amp != NULL) {
6219 /*
6220 * anon_fill_cow_holes() may call VOP_GETPAGE().
6221 * don't take anon map lock here to avoid holding it
6222 * across VOP_GETPAGE() calls that may call back into
6223 * segvn for klustering checks. We don't really need
6224 * anon map lock here since it's a private segment and
6225 * we hold as level lock as writers.
6226 */
6227 if ((err = anon_fill_cow_holes(seg, seg->s_base,
6228 amp->ahp, svd->anon_index, svd->vp, svd->offset,
6229 seg->s_size, szc, svd->prot, svd->vpage,
6230 svd->cred)) != 0) {
6231 return (EINVAL);
6232 }
6233 }
6234 segvn_setvnode_mpss(svd->vp);
6235 }
6236
6237 if (amp != NULL) {
6238 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
6239 if (svd->type == MAP_PRIVATE) {
6240 amp->a_szc = szc;
6241 } else if (szc > amp->a_szc) {
6242 amp->a_szc = szc;
6243 }
6244 ANON_LOCK_EXIT(&amp->a_rwlock);
6245 }
6246
6247 seg->s_szc = szc;
6248
6249 return (0);
6250 }
6251
6252 static int
6253 segvn_clrszc(struct seg *seg)
6254 {
6255 struct segvn_data *svd = (struct segvn_data *)seg->s_data;
6256 struct anon_map *amp = svd->amp;
6257 size_t pgsz;
6258 pgcnt_t pages;
6259 int err = 0;
6260 caddr_t a = seg->s_base;
6261 caddr_t ea = a + seg->s_size;
6262 ulong_t an_idx = svd->anon_index;
6263 vnode_t *vp = svd->vp;
6264 struct vpage *vpage = svd->vpage;
6265 page_t *anon_pl[1 + 1], *pp;
6266 struct anon *ap, *oldap;
6267 uint_t prot = svd->prot, vpprot;
6268 int pageflag = 0;
6269
6270 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) ||
6271 SEGVN_WRITE_HELD(seg->s_as, &svd->lock));
6272 ASSERT(svd->softlockcnt == 0);
6273
6274 if (vp == NULL && amp == NULL) {
6275 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
6276 seg->s_szc = 0;
6277 return (0);
6278 }
6279
6280 if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
6281 ASSERT(svd->amp == NULL);
6282 ASSERT(svd->tr_state == SEGVN_TR_OFF);
6283 hat_leave_region(seg->s_as->a_hat, svd->rcookie,
6284 HAT_REGION_TEXT);
6285 svd->rcookie = HAT_INVALID_REGION_COOKIE;
6286 } else if (svd->tr_state == SEGVN_TR_ON) {
6287 ASSERT(svd->amp != NULL);
6288 segvn_textunrepl(seg, 1);
6289 ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF);
6290 amp = NULL;
6291 } else {
6292 if (svd->tr_state != SEGVN_TR_OFF) {
6293 ASSERT(svd->tr_state == SEGVN_TR_INIT);
6294 svd->tr_state = SEGVN_TR_OFF;
6295 }
6296
6297 /*
6298 * do HAT_UNLOAD_UNMAP since we are changing the pagesize.
6299 * unload argument is 0 when we are freeing the segment
6300 * and unload was already done.
6301 */
6302 hat_unload(seg->s_as->a_hat, seg->s_base, seg->s_size,
6303 HAT_UNLOAD_UNMAP);
6304 }
6305
6306 if (amp == NULL || svd->type == MAP_SHARED) {
6307 seg->s_szc = 0;
6308 return (0);
6309 }
6310
6311 pgsz = page_get_pagesize(seg->s_szc);
6312 pages = btop(pgsz);
6313
6314 /*
6315 * XXX anon rwlock is not really needed because this is a
6316 * private segment and we are writers.
6317 */
6318 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
6319
6320 for (; a < ea; a += pgsz, an_idx += pages) {
6321 if ((oldap = anon_get_ptr(amp->ahp, an_idx)) != NULL) {
6322 ASSERT(vpage != NULL || svd->pageprot == 0);
6323 if (vpage != NULL) {
6324 ASSERT(sameprot(seg, a, pgsz));
6325 prot = VPP_PROT(vpage);
6326 pageflag = VPP_ISPPLOCK(vpage) ? LOCK_PAGE : 0;
6327 }
6328 if (seg->s_szc != 0) {
6329 ASSERT(vp == NULL || anon_pages(amp->ahp,
6330 an_idx, pages) == pages);
6331 if ((err = anon_map_demotepages(amp, an_idx,
6332 seg, a, prot, vpage, svd->cred)) != 0) {
6333 goto out;
6334 }
6335 } else {
6336 if (oldap->an_refcnt == 1) {
6337 continue;
6338 }
6339 if ((err = anon_getpage(&oldap, &vpprot,
6340 anon_pl, PAGESIZE, seg, a, S_READ,
6341 svd->cred))) {
6342 goto out;
6343 }
6344 if ((pp = anon_private(&ap, seg, a, prot,
6345 anon_pl[0], pageflag, svd->cred)) == NULL) {
6346 err = ENOMEM;
6347 goto out;
6348 }
6349 anon_decref(oldap);
6350 (void) anon_set_ptr(amp->ahp, an_idx, ap,
6351 ANON_SLEEP);
6352 page_unlock(pp);
6353 }
6354 }
6355 vpage = (vpage == NULL) ? NULL : vpage + pages;
6356 }
6357
6358 amp->a_szc = 0;
6359 seg->s_szc = 0;
6360 out:
6361 ANON_LOCK_EXIT(&amp->a_rwlock);
6362 return (err);
6363 }
6364
6365 static int
6366 segvn_claim_pages(
6367 struct seg *seg,
6368 struct vpage *svp,
6369 u_offset_t off,
6370 ulong_t anon_idx,
6371 uint_t prot)
6372 {
6373 pgcnt_t pgcnt = page_get_pagecnt(seg->s_szc);
6374 size_t ppasize = (pgcnt + 1) * sizeof (page_t *);
6375 page_t **ppa;
6376 struct segvn_data *svd = (struct segvn_data *)seg->s_data;
6377 struct anon_map *amp = svd->amp;
6378 struct vpage *evp = svp + pgcnt;
6379 caddr_t addr = ((uintptr_t)(svp - svd->vpage) << PAGESHIFT)
6380 + seg->s_base;
6381 struct anon *ap;
6382 struct vnode *vp = svd->vp;
6383 page_t *pp;
6384 pgcnt_t pg_idx, i;
6385 int err = 0;
6386 anoff_t aoff;
6387 int anon = (amp != NULL) ?
1 : 0; 6388 6389 ASSERT(svd->type == MAP_PRIVATE); 6390 ASSERT(svd->vpage != NULL); 6391 ASSERT(seg->s_szc != 0); 6392 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 6393 ASSERT(amp == NULL || IS_P2ALIGNED(anon_idx, pgcnt)); 6394 ASSERT(sameprot(seg, addr, pgcnt << PAGESHIFT)); 6395 6396 if (VPP_PROT(svp) == prot) 6397 return (1); 6398 if (!((VPP_PROT(svp) ^ prot) & PROT_WRITE)) 6399 return (1); 6400 6401 ppa = kmem_alloc(ppasize, KM_SLEEP); 6402 if (anon && vp != NULL) { 6403 if (anon_get_ptr(amp->ahp, anon_idx) == NULL) { 6404 anon = 0; 6405 ASSERT(!anon_pages(amp->ahp, anon_idx, pgcnt)); 6406 } 6407 ASSERT(!anon || 6408 anon_pages(amp->ahp, anon_idx, pgcnt) == pgcnt); 6409 } 6410 6411 for (*ppa = NULL, pg_idx = 0; svp < evp; svp++, anon_idx++) { 6412 if (!VPP_ISPPLOCK(svp)) 6413 continue; 6414 if (anon) { 6415 ap = anon_get_ptr(amp->ahp, anon_idx); 6416 if (ap == NULL) { 6417 panic("segvn_claim_pages: no anon slot"); 6418 } 6419 swap_xlate(ap, &vp, &aoff); 6420 off = (u_offset_t)aoff; 6421 } 6422 ASSERT(vp != NULL); 6423 if ((pp = page_lookup(vp, 6424 (u_offset_t)off, SE_SHARED)) == NULL) { 6425 panic("segvn_claim_pages: no page"); 6426 } 6427 ppa[pg_idx++] = pp; 6428 off += PAGESIZE; 6429 } 6430 6431 if (ppa[0] == NULL) { 6432 kmem_free(ppa, ppasize); 6433 return (1); 6434 } 6435 6436 ASSERT(pg_idx <= pgcnt); 6437 ppa[pg_idx] = NULL; 6438 6439 if (prot & PROT_WRITE) 6440 err = page_addclaim_pages(ppa); 6441 else 6442 err = page_subclaim_pages(ppa); 6443 6444 for (i = 0; i < pg_idx; i++) { 6445 ASSERT(ppa[i] != NULL); 6446 page_unlock(ppa[i]); 6447 } 6448 6449 kmem_free(ppa, ppasize); 6450 return (err); 6451 } 6452 6453 /* 6454 * Returns right (upper address) segment if split occurred. 6455 * If the address is equal to the beginning or end of its segment it returns 6456 * the current segment. 6457 */ 6458 static struct seg * 6459 segvn_split_seg(struct seg *seg, caddr_t addr) 6460 { 6461 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6462 struct seg *nseg; 6463 size_t nsize; 6464 struct segvn_data *nsvd; 6465 6466 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 6467 ASSERT(svd->tr_state == SEGVN_TR_OFF); 6468 6469 ASSERT(addr >= seg->s_base); 6470 ASSERT(addr <= seg->s_base + seg->s_size); 6471 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 6472 6473 if (addr == seg->s_base || addr == seg->s_base + seg->s_size) 6474 return (seg); 6475 6476 nsize = seg->s_base + seg->s_size - addr; 6477 seg->s_size = addr - seg->s_base; 6478 nseg = seg_alloc(seg->s_as, addr, nsize); 6479 ASSERT(nseg != NULL); 6480 nseg->s_ops = seg->s_ops; 6481 nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 6482 nseg->s_data = (void *)nsvd; 6483 nseg->s_szc = seg->s_szc; 6484 *nsvd = *svd; 6485 ASSERT(nsvd->rcookie == HAT_INVALID_REGION_COOKIE); 6486 nsvd->seg = nseg; 6487 rw_init(&nsvd->lock, NULL, RW_DEFAULT, NULL); 6488 6489 if (nsvd->vp != NULL) { 6490 VN_HOLD(nsvd->vp); 6491 nsvd->offset = svd->offset + 6492 (uintptr_t)(nseg->s_base - seg->s_base); 6493 if (nsvd->type == MAP_SHARED) 6494 lgrp_shm_policy_init(NULL, nsvd->vp); 6495 } else { 6496 /* 6497 * The offset for an anonymous segment has no signifigance in 6498 * terms of an offset into a file. 
If we were to use the above 6499 * calculation instead, the structures read out of 6500 * /proc/<pid>/xmap would be more difficult to decipher since 6501 * it would be unclear whether two seemingly contiguous 6502 * prxmap_t structures represented different segments or a 6503 * single segment that had been split up into multiple prxmap_t 6504 * structures (e.g. if some part of the segment had not yet 6505 * been faulted in). 6506 */ 6507 nsvd->offset = 0; 6508 } 6509 6510 ASSERT(svd->softlockcnt == 0); 6511 crhold(svd->cred); 6512 6513 if (svd->vpage != NULL) { 6514 size_t bytes = vpgtob(seg_pages(seg)); 6515 size_t nbytes = vpgtob(seg_pages(nseg)); 6516 struct vpage *ovpage = svd->vpage; 6517 6518 svd->vpage = kmem_alloc(bytes, KM_SLEEP); 6519 bcopy(ovpage, svd->vpage, bytes); 6520 nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP); 6521 bcopy(ovpage + seg_pages(seg), nsvd->vpage, nbytes); 6522 kmem_free(ovpage, bytes + nbytes); 6523 } 6524 if (svd->amp != NULL && svd->type == MAP_PRIVATE) { 6525 struct anon_map *oamp = svd->amp, *namp; 6526 struct anon_hdr *nahp; 6527 6528 ANON_LOCK_ENTER(&oamp->a_rwlock, RW_WRITER); 6529 ASSERT(oamp->refcnt == 1); 6530 nahp = anon_create(btop(seg->s_size), ANON_SLEEP); 6531 (void) anon_copy_ptr(oamp->ahp, svd->anon_index, 6532 nahp, 0, btop(seg->s_size), ANON_SLEEP); 6533 6534 namp = anonmap_alloc(nseg->s_size, 0, ANON_SLEEP); 6535 namp->a_szc = nseg->s_szc; 6536 (void) anon_copy_ptr(oamp->ahp, 6537 svd->anon_index + btop(seg->s_size), 6538 namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP); 6539 anon_release(oamp->ahp, btop(oamp->size)); 6540 oamp->ahp = nahp; 6541 oamp->size = seg->s_size; 6542 svd->anon_index = 0; 6543 nsvd->amp = namp; 6544 nsvd->anon_index = 0; 6545 ANON_LOCK_EXIT(&oamp->a_rwlock); 6546 } else if (svd->amp != NULL) { 6547 pgcnt_t pgcnt = page_get_pagecnt(seg->s_szc); 6548 ASSERT(svd->amp == nsvd->amp); 6549 ASSERT(seg->s_szc <= svd->amp->a_szc); 6550 nsvd->anon_index = svd->anon_index + seg_pages(seg); 6551 ASSERT(IS_P2ALIGNED(nsvd->anon_index, pgcnt)); 6552 ANON_LOCK_ENTER(&svd->amp->a_rwlock, RW_WRITER); 6553 svd->amp->refcnt++; 6554 ANON_LOCK_EXIT(&svd->amp->a_rwlock); 6555 } 6556 6557 /* 6558 * Split amount of swap reserve 6559 */ 6560 if (svd->swresv) { 6561 /* 6562 * For MAP_NORESERVE, only allocate swap reserve for pages 6563 * being used. Other segments get enough to cover whole 6564 * segment. 6565 */ 6566 if (svd->flags & MAP_NORESERVE) { 6567 size_t oswresv; 6568 6569 ASSERT(svd->amp); 6570 oswresv = svd->swresv; 6571 svd->swresv = ptob(anon_pages(svd->amp->ahp, 6572 svd->anon_index, btop(seg->s_size))); 6573 nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp, 6574 nsvd->anon_index, btop(nseg->s_size))); 6575 ASSERT(oswresv >= (svd->swresv + nsvd->swresv)); 6576 } else { 6577 ASSERT(svd->swresv == seg->s_size + nseg->s_size); 6578 svd->swresv = seg->s_size; 6579 nsvd->swresv = nseg->s_size; 6580 } 6581 } 6582 6583 return (nseg); 6584 } 6585 6586 /* 6587 * called on memory operations (unmap, setprot, setpagesize) for a subset 6588 * of a large page segment to either demote the memory range (SDR_RANGE) 6589 * or the ends (SDR_END) by addr/len. 6590 * 6591 * returns 0 on success. returns errno, including ENOMEM, on failure. 
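 * (SDR_RANGE demotes every large page overlapping [addr, addr + len);
 * SDR_END demotes only the large page(s) straddling the ends of the
 * range.)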
6592 */ 6593 static int 6594 segvn_demote_range( 6595 struct seg *seg, 6596 caddr_t addr, 6597 size_t len, 6598 int flag, 6599 uint_t szcvec) 6600 { 6601 caddr_t eaddr = addr + len; 6602 caddr_t lpgaddr, lpgeaddr; 6603 struct seg *nseg; 6604 struct seg *badseg1 = NULL; 6605 struct seg *badseg2 = NULL; 6606 size_t pgsz; 6607 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6608 int err; 6609 uint_t szc = seg->s_szc; 6610 uint_t tszcvec; 6611 6612 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 6613 ASSERT(svd->tr_state == SEGVN_TR_OFF); 6614 ASSERT(szc != 0); 6615 pgsz = page_get_pagesize(szc); 6616 ASSERT(seg->s_base != addr || seg->s_size != len); 6617 ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size); 6618 ASSERT(svd->softlockcnt == 0); 6619 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 6620 ASSERT(szcvec == 0 || (flag == SDR_END && svd->type == MAP_SHARED)); 6621 6622 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 6623 ASSERT(flag == SDR_RANGE || eaddr < lpgeaddr || addr > lpgaddr); 6624 if (flag == SDR_RANGE) { 6625 /* demote entire range */ 6626 badseg1 = nseg = segvn_split_seg(seg, lpgaddr); 6627 (void) segvn_split_seg(nseg, lpgeaddr); 6628 ASSERT(badseg1->s_base == lpgaddr); 6629 ASSERT(badseg1->s_size == lpgeaddr - lpgaddr); 6630 } else if (addr != lpgaddr) { 6631 ASSERT(flag == SDR_END); 6632 badseg1 = nseg = segvn_split_seg(seg, lpgaddr); 6633 if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz && 6634 eaddr < lpgaddr + 2 * pgsz) { 6635 (void) segvn_split_seg(nseg, lpgeaddr); 6636 ASSERT(badseg1->s_base == lpgaddr); 6637 ASSERT(badseg1->s_size == 2 * pgsz); 6638 } else { 6639 nseg = segvn_split_seg(nseg, lpgaddr + pgsz); 6640 ASSERT(badseg1->s_base == lpgaddr); 6641 ASSERT(badseg1->s_size == pgsz); 6642 if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz) { 6643 ASSERT(lpgeaddr - lpgaddr > 2 * pgsz); 6644 nseg = segvn_split_seg(nseg, lpgeaddr - pgsz); 6645 badseg2 = nseg; 6646 (void) segvn_split_seg(nseg, lpgeaddr); 6647 ASSERT(badseg2->s_base == lpgeaddr - pgsz); 6648 ASSERT(badseg2->s_size == pgsz); 6649 } 6650 } 6651 } else { 6652 ASSERT(flag == SDR_END); 6653 ASSERT(eaddr < lpgeaddr); 6654 badseg1 = nseg = segvn_split_seg(seg, lpgeaddr - pgsz); 6655 (void) segvn_split_seg(nseg, lpgeaddr); 6656 ASSERT(badseg1->s_base == lpgeaddr - pgsz); 6657 ASSERT(badseg1->s_size == pgsz); 6658 } 6659 6660 ASSERT(badseg1 != NULL); 6661 ASSERT(badseg1->s_szc == szc); 6662 ASSERT(flag == SDR_RANGE || badseg1->s_size == pgsz || 6663 badseg1->s_size == 2 * pgsz); 6664 ASSERT(sameprot(badseg1, badseg1->s_base, pgsz)); 6665 ASSERT(badseg1->s_size == pgsz || 6666 sameprot(badseg1, badseg1->s_base + pgsz, pgsz)); 6667 if (err = segvn_clrszc(badseg1)) { 6668 return (err); 6669 } 6670 ASSERT(badseg1->s_szc == 0); 6671 6672 if (szc > 1 && (tszcvec = P2PHASE(szcvec, 1 << szc)) > 1) { 6673 uint_t tszc = highbit(tszcvec) - 1; 6674 caddr_t ta = MAX(addr, badseg1->s_base); 6675 caddr_t te; 6676 size_t tpgsz = page_get_pagesize(tszc); 6677 6678 ASSERT(svd->type == MAP_SHARED); 6679 ASSERT(flag == SDR_END); 6680 ASSERT(tszc < szc && tszc > 0); 6681 6682 if (eaddr > badseg1->s_base + badseg1->s_size) { 6683 te = badseg1->s_base + badseg1->s_size; 6684 } else { 6685 te = eaddr; 6686 } 6687 6688 ASSERT(ta <= te); 6689 badseg1->s_szc = tszc; 6690 if (!IS_P2ALIGNED(ta, tpgsz) || !IS_P2ALIGNED(te, tpgsz)) { 6691 if (badseg2 != NULL) { 6692 err = segvn_demote_range(badseg1, ta, te - ta, 6693 SDR_END, tszcvec); 6694 if (err != 0) { 6695 return (err); 6696 } 6697 } else { 6698 
return (segvn_demote_range(badseg1, ta, 6699 te - ta, SDR_END, tszcvec)); 6700 } 6701 } 6702 } 6703 6704 if (badseg2 == NULL) 6705 return (0); 6706 ASSERT(badseg2->s_szc == szc); 6707 ASSERT(badseg2->s_size == pgsz); 6708 ASSERT(sameprot(badseg2, badseg2->s_base, badseg2->s_size)); 6709 if (err = segvn_clrszc(badseg2)) { 6710 return (err); 6711 } 6712 ASSERT(badseg2->s_szc == 0); 6713 6714 if (szc > 1 && (tszcvec = P2PHASE(szcvec, 1 << szc)) > 1) { 6715 uint_t tszc = highbit(tszcvec) - 1; 6716 size_t tpgsz = page_get_pagesize(tszc); 6717 6718 ASSERT(svd->type == MAP_SHARED); 6719 ASSERT(flag == SDR_END); 6720 ASSERT(tszc < szc && tszc > 0); 6721 ASSERT(badseg2->s_base > addr); 6722 ASSERT(eaddr > badseg2->s_base); 6723 ASSERT(eaddr < badseg2->s_base + badseg2->s_size); 6724 6725 badseg2->s_szc = tszc; 6726 if (!IS_P2ALIGNED(eaddr, tpgsz)) { 6727 return (segvn_demote_range(badseg2, badseg2->s_base, 6728 eaddr - badseg2->s_base, SDR_END, tszcvec)); 6729 } 6730 } 6731 6732 return (0); 6733 } 6734 6735 static int 6736 segvn_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) 6737 { 6738 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6739 struct vpage *vp, *evp; 6740 6741 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6742 6743 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6744 /* 6745 * If segment protection can be used, simply check against them. 6746 */ 6747 if (svd->pageprot == 0) { 6748 int err; 6749 6750 err = ((svd->prot & prot) != prot) ? EACCES : 0; 6751 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6752 return (err); 6753 } 6754 6755 /* 6756 * Have to check down to the vpage level. 6757 */ 6758 evp = &svd->vpage[seg_page(seg, addr + len)]; 6759 for (vp = &svd->vpage[seg_page(seg, addr)]; vp < evp; vp++) { 6760 if ((VPP_PROT(vp) & prot) != prot) { 6761 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6762 return (EACCES); 6763 } 6764 } 6765 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6766 return (0); 6767 } 6768 6769 static int 6770 segvn_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv) 6771 { 6772 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6773 size_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1; 6774 6775 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6776 6777 if (pgno != 0) { 6778 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6779 if (svd->pageprot == 0) { 6780 do { 6781 protv[--pgno] = svd->prot; 6782 } while (pgno != 0); 6783 } else { 6784 size_t pgoff = seg_page(seg, addr); 6785 6786 do { 6787 pgno--; 6788 protv[pgno] = VPP_PROT(&svd->vpage[pgno+pgoff]); 6789 } while (pgno != 0); 6790 } 6791 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6792 } 6793 return (0); 6794 } 6795 6796 static u_offset_t 6797 segvn_getoffset(struct seg *seg, caddr_t addr) 6798 { 6799 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6800 6801 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6802 6803 return (svd->offset + (uintptr_t)(addr - seg->s_base)); 6804 } 6805 6806 /*ARGSUSED*/ 6807 static int 6808 segvn_gettype(struct seg *seg, caddr_t addr) 6809 { 6810 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6811 6812 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6813 6814 return (svd->type | (svd->flags & (MAP_NORESERVE | MAP_TEXT | 6815 MAP_INITDATA))); 6816 } 6817 6818 /*ARGSUSED*/ 6819 static int 6820 segvn_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp) 6821 { 6822 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6823 6824 
ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
6825
6826 *vpp = svd->vp;
6827 return (0);
6828 }
6829
6830 /*
6831 * Check to see if it makes sense to do kluster/read ahead to
6832 * addr + delta relative to the mapping at addr. We assume here
6833 * that delta is a signed PAGESIZE'd multiple (which can be negative).
6834 *
6835 * For segvn, we currently "approve" of the action if we are
6836 * still in the segment and it maps from the same vp/off,
6837 * or if the advice stored in segvn_data or vpages allows it.
6838 * Currently, klustering is disallowed when MADV_RANDOM applies, or when MADV_SEQUENTIAL applies and delta is negative.
6839 */
6840 static int
6841 segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta)
6842 {
6843 struct segvn_data *svd = (struct segvn_data *)seg->s_data;
6844 struct anon *oap, *ap;
6845 ssize_t pd;
6846 size_t page;
6847 struct vnode *vp1, *vp2;
6848 u_offset_t off1, off2;
6849 struct anon_map *amp;
6850
6851 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
6852 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) ||
6853 SEGVN_LOCK_HELD(seg->s_as, &svd->lock));
6854
6855 if (addr + delta < seg->s_base ||
6856 addr + delta >= (seg->s_base + seg->s_size))
6857 return (-1); /* exceeded segment bounds */
6858
6859 pd = delta / (ssize_t)PAGESIZE; /* divide to preserve sign bit */
6860 page = seg_page(seg, addr);
6861
6862 /*
6863 * Check to see if either of the pages addr or addr + delta
6864 * have advice set that prevents klustering (if MADV_RANDOM advice
6865 * is set for entire segment, or MADV_SEQUENTIAL is set and delta
6866 * is negative).
6867 */
6868 if (svd->advice == MADV_RANDOM ||
6869 svd->advice == MADV_SEQUENTIAL && delta < 0)
6870 return (-1);
6871 else if (svd->pageadvice && svd->vpage) {
6872 struct vpage *bvpp, *evpp;
6873
6874 bvpp = &svd->vpage[page];
6875 evpp = &svd->vpage[page + pd];
6876 if (VPP_ADVICE(bvpp) == MADV_RANDOM ||
6877 VPP_ADVICE(evpp) == MADV_SEQUENTIAL && delta < 0)
6878 return (-1);
6879 if (VPP_ADVICE(bvpp) != VPP_ADVICE(evpp) &&
6880 VPP_ADVICE(evpp) == MADV_RANDOM)
6881 return (-1);
6882 }
6883
6884 if (svd->type == MAP_SHARED)
6885 return (0); /* shared mapping - all ok */
6886
6887 if ((amp = svd->amp) == NULL)
6888 return (0); /* off original vnode */
6889
6890 page += svd->anon_index;
6891
6892 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
6893
6894 oap = anon_get_ptr(amp->ahp, page);
6895 ap = anon_get_ptr(amp->ahp, page + pd);
6896
6897 ANON_LOCK_EXIT(&amp->a_rwlock);
6898
6899 if ((oap == NULL && ap != NULL) || (oap != NULL && ap == NULL)) {
6900 return (-1); /* one with and one without an anon */
6901 }
6902
6903 if (oap == NULL) { /* implies that ap == NULL */
6904 return (0); /* off original vnode */
6905 }
6906
6907 /*
6908 * Now we know we have two anon pointers - check to
6909 * see if they happen to be properly allocated.
6910 */
6911
6912 /*
6913 * XXX We cheat here and don't lock the anon slots. We can't because
6914 * we may have been called from the anon layer which might already
6915 * have locked them. We are holding a refcnt on the slots so they
6916 * can't disappear. The worst that will happen is we'll get the wrong
6917 * names (vp, off) for the slots and make a poor klustering decision.
6918 */
6919 swap_xlate(ap, &vp1, &off1);
6920 swap_xlate(oap, &vp2, &off2);
6921
6922
6923 if (!VOP_CMP(vp1, vp2, NULL) || off1 - off2 != delta)
6924 return (-1);
6925 return (0);
6926 }
6927
6928 /*
6929 * Swap the pages of seg out to secondary storage, returning the
6930 * number of bytes of storage freed.
6931 *
6932 * The basic idea is first to unload all translations and then to call
6933 * VOP_PUTPAGE() for all newly-unmapped pages, to push them out to the
6934 * swap device. Pages to which other segments have mappings will remain
6935 * mapped and won't be swapped. Our caller (as_swapout) has already
6936 * performed the unloading step.
6937 *
6938 * The value returned is intended to correlate well with the process's
6939 * memory requirements. However, there are some caveats:
6940 * 1) When given a shared segment as argument, this routine will
6941 * only succeed in swapping out pages for the last sharer of the
6942 * segment. (Previous callers will only have decremented mapping
6943 * reference counts.)
6944 * 2) We assume that the hat layer maintains a large enough translation
6945 * cache to capture process reference patterns.
6946 */
6947 static size_t
6948 segvn_swapout(struct seg *seg)
6949 {
6950 struct segvn_data *svd = (struct segvn_data *)seg->s_data;
6951 struct anon_map *amp;
6952 pgcnt_t pgcnt = 0;
6953 pgcnt_t npages;
6954 pgcnt_t page;
6955 ulong_t anon_index;
6956
6957 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
6958
6959 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
6960 /*
6961 * Find pages unmapped by our caller and force them
6962 * out to the virtual swap device.
6963 */
6964 if ((amp = svd->amp) != NULL)
6965 anon_index = svd->anon_index;
6966 npages = seg->s_size >> PAGESHIFT;
6967 for (page = 0; page < npages; page++) {
6968 page_t *pp;
6969 struct anon *ap;
6970 struct vnode *vp;
6971 u_offset_t off;
6972 anon_sync_obj_t cookie;
6973
6974 /*
6975 * Obtain <vp, off> pair for the page, then look it up.
6976 *
6977 * Note that this code is willing to consider regular
6978 * pages as well as anon pages. Is this appropriate here?
6979 */
6980 ap = NULL;
6981 if (amp != NULL) {
6982 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
6983 if (anon_array_try_enter(amp, anon_index + page,
6984 &cookie)) {
6985 ANON_LOCK_EXIT(&amp->a_rwlock);
6986 continue;
6987 }
6988 ap = anon_get_ptr(amp->ahp, anon_index + page);
6989 if (ap != NULL) {
6990 swap_xlate(ap, &vp, &off);
6991 } else {
6992 vp = svd->vp;
6993 off = svd->offset + ptob(page);
6994 }
6995 anon_array_exit(&cookie);
6996 ANON_LOCK_EXIT(&amp->a_rwlock);
6997 } else {
6998 vp = svd->vp;
6999 off = svd->offset + ptob(page);
7000 }
7001 if (vp == NULL) { /* untouched zfod page */
7002 ASSERT(ap == NULL);
7003 continue;
7004 }
7005
7006 pp = page_lookup_nowait(vp, off, SE_SHARED);
7007 if (pp == NULL)
7008 continue;
7009
7010
7011 /*
7012 * Examine the page to see whether it can be tossed out,
7013 * keeping track of how many we've found.
7014 */
7015 if (!page_tryupgrade(pp)) {
7016 /*
7017 * If the page has an i/o lock and no mappings,
7018 * it's very likely that the page is being
7019 * written out as a result of klustering.
7020 * Assume this is so and take credit for it here.
7021 */
7022 if (!page_io_trylock(pp)) {
7023 if (!hat_page_is_mapped(pp))
7024 pgcnt++;
7025 } else {
7026 page_io_unlock(pp);
7027 }
7028 page_unlock(pp);
7029 continue;
7030 }
7031 ASSERT(!page_iolock_assert(pp));
7032
7033
7034 /*
7035 * Skip if page is locked or has mappings.
7036 * We don't need the page_struct_lock to look at lckcnt
7037 * and cowcnt because the page is exclusive locked.
7038 */
7039 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
7040 hat_page_is_mapped(pp)) {
7041 page_unlock(pp);
7042 continue;
7043 }
7044
7045 /*
7046 * dispose skips large pages so try to demote first.
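 * (page_try_demote_pages() breaks the large page into PAGESIZE pages
 * so VN_DISPOSE() can free them individually; if the demotion fails
 * the page is simply skipped.)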
7047 */ 7048 if (pp->p_szc != 0 && !page_try_demote_pages(pp)) { 7049 page_unlock(pp); 7050 /* 7051 * XXX should skip the remaining page_t's of this 7052 * large page. 7053 */ 7054 continue; 7055 } 7056 7057 ASSERT(pp->p_szc == 0); 7058 7059 /* 7060 * No longer mapped -- we can toss it out. How 7061 * we do so depends on whether or not it's dirty. 7062 */ 7063 if (hat_ismod(pp) && pp->p_vnode) { 7064 /* 7065 * We must clean the page before it can be 7066 * freed. Setting B_FREE will cause pvn_done 7067 * to free the page when the i/o completes. 7068 * XXX: This also causes it to be accounted 7069 * as a pageout instead of a swap: need 7070 * B_SWAPOUT bit to use instead of B_FREE. 7071 * 7072 * Hold the vnode before releasing the page lock 7073 * to prevent it from being freed and re-used by 7074 * some other thread. 7075 */ 7076 VN_HOLD(vp); 7077 page_unlock(pp); 7078 7079 /* 7080 * Queue all i/o requests for the pageout thread 7081 * to avoid saturating the pageout devices. 7082 */ 7083 if (!queue_io_request(vp, off)) 7084 VN_RELE(vp); 7085 } else { 7086 /* 7087 * The page was clean, free it. 7088 * 7089 * XXX: Can we ever encounter modified pages 7090 * with no associated vnode here? 7091 */ 7092 ASSERT(pp->p_vnode != NULL); 7093 /*LINTED: constant in conditional context*/ 7094 VN_DISPOSE(pp, B_FREE, 0, kcred); 7095 } 7096 7097 /* 7098 * Credit now even if i/o is in progress. 7099 */ 7100 pgcnt++; 7101 } 7102 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7103 7104 /* 7105 * Wakeup pageout to initiate i/o on all queued requests. 7106 */ 7107 cv_signal_pageout(); 7108 return (ptob(pgcnt)); 7109 } 7110 7111 /* 7112 * Synchronize primary storage cache with real object in virtual memory. 7113 * 7114 * XXX - Anonymous pages should not be sync'ed out at all. 7115 */ 7116 static int 7117 segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) 7118 { 7119 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7120 struct vpage *vpp; 7121 page_t *pp; 7122 u_offset_t offset; 7123 struct vnode *vp; 7124 u_offset_t off; 7125 caddr_t eaddr; 7126 int bflags; 7127 int err = 0; 7128 int segtype; 7129 int pageprot; 7130 int prot; 7131 ulong_t anon_index; 7132 struct anon_map *amp; 7133 struct anon *ap; 7134 anon_sync_obj_t cookie; 7135 7136 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 7137 7138 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7139 7140 if (svd->softlockcnt > 0) { 7141 /* 7142 * flush all pages from seg cache 7143 * otherwise we may deadlock in swap_putpage 7144 * for B_INVAL page (4175402). 7145 * 7146 * Even if we grab segvn WRITER's lock or segp_slock 7147 * here, there might be another thread which could've 7148 * successfully performed lookup/insert just before 7149 * we acquired the lock here. So, grabbing either 7150 * lock here is of not much use. Until we devise 7151 * a strategy at upper layers to solve the 7152 * synchronization issues completely, we expect 7153 * applications to handle this appropriately. 7154 */ 7155 segvn_purge(seg); 7156 if (svd->softlockcnt > 0) { 7157 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7158 return (EAGAIN); 7159 } 7160 } 7161 7162 vpp = svd->vpage; 7163 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 7164 bflags = ((flags & MS_ASYNC) ? B_ASYNC : 0) | 7165 ((flags & MS_INVALIDATE) ? B_INVAL : 0); 7166 7167 if (attr) { 7168 pageprot = attr & ~(SHARED|PRIVATE); 7169 segtype = (attr & SHARED) ? 
MAP_SHARED : MAP_PRIVATE; 7170 7171 /* 7172 * We are done if the segment types don't match 7173 * or if we have segment level protections and 7174 * they don't match. 7175 */ 7176 if (svd->type != segtype) { 7177 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7178 return (0); 7179 } 7180 if (vpp == NULL) { 7181 if (svd->prot != pageprot) { 7182 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7183 return (0); 7184 } 7185 prot = svd->prot; 7186 } else 7187 vpp = &svd->vpage[seg_page(seg, addr)]; 7188 7189 } else if (svd->vp && svd->amp == NULL && 7190 (flags & MS_INVALIDATE) == 0) { 7191 7192 /* 7193 * No attributes, no anonymous pages and MS_INVALIDATE flag 7194 * is not on, just use one big request. 7195 */ 7196 err = VOP_PUTPAGE(svd->vp, (offset_t)offset, len, 7197 bflags, svd->cred, NULL); 7198 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7199 return (err); 7200 } 7201 7202 if ((amp = svd->amp) != NULL) 7203 anon_index = svd->anon_index + seg_page(seg, addr); 7204 7205 for (eaddr = addr + len; addr < eaddr; addr += PAGESIZE) { 7206 ap = NULL; 7207 if (amp != NULL) { 7208 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7209 anon_array_enter(amp, anon_index, &cookie); 7210 ap = anon_get_ptr(amp->ahp, anon_index++); 7211 if (ap != NULL) { 7212 swap_xlate(ap, &vp, &off); 7213 } else { 7214 vp = svd->vp; 7215 off = offset; 7216 } 7217 anon_array_exit(&cookie); 7218 ANON_LOCK_EXIT(&->a_rwlock); 7219 } else { 7220 vp = svd->vp; 7221 off = offset; 7222 } 7223 offset += PAGESIZE; 7224 7225 if (vp == NULL) /* untouched zfod page */ 7226 continue; 7227 7228 if (attr) { 7229 if (vpp) { 7230 prot = VPP_PROT(vpp); 7231 vpp++; 7232 } 7233 if (prot != pageprot) { 7234 continue; 7235 } 7236 } 7237 7238 /* 7239 * See if any of these pages are locked -- if so, then we 7240 * will have to truncate an invalidate request at the first 7241 * locked one. We don't need the page_struct_lock to test 7242 * as this is only advisory; even if we acquire it someone 7243 * might race in and lock the page after we unlock and before 7244 * we do the PUTPAGE, then PUTPAGE simply does nothing. 7245 */ 7246 if (flags & MS_INVALIDATE) { 7247 if ((pp = page_lookup(vp, off, SE_SHARED)) != NULL) { 7248 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 7249 page_unlock(pp); 7250 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7251 return (EBUSY); 7252 } 7253 if (ap != NULL && pp->p_szc != 0 && 7254 page_tryupgrade(pp)) { 7255 if (pp->p_lckcnt == 0 && 7256 pp->p_cowcnt == 0) { 7257 /* 7258 * swapfs VN_DISPOSE() won't 7259 * invalidate large pages. 7260 * Attempt to demote. 7261 * XXX can't help it if it 7262 * fails. But for swapfs 7263 * pages it is no big deal. 7264 */ 7265 (void) page_try_demote_pages( 7266 pp); 7267 } 7268 } 7269 page_unlock(pp); 7270 } 7271 } else if (svd->type == MAP_SHARED && amp != NULL) { 7272 /* 7273 * Avoid writing out to disk ISM's large pages 7274 * because segspt_free_pages() relies on NULL an_pvp 7275 * of anon slots of such pages. 7276 */ 7277 7278 ASSERT(svd->vp == NULL); 7279 /* 7280 * swapfs uses page_lookup_nowait if not freeing or 7281 * invalidating and skips a page if 7282 * page_lookup_nowait returns NULL. 7283 */ 7284 pp = page_lookup_nowait(vp, off, SE_SHARED); 7285 if (pp == NULL) { 7286 continue; 7287 } 7288 if (pp->p_szc != 0) { 7289 page_unlock(pp); 7290 continue; 7291 } 7292 7293 /* 7294 * Note ISM pages are created large so (vp, off)'s 7295 * page cannot suddenly become large after we unlock 7296 * pp. 
7297 */ 7298 page_unlock(pp); 7299 } 7300 /* 7301 * XXX - Should ultimately try to kluster 7302 * calls to VOP_PUTPAGE() for performance. 7303 */ 7304 VN_HOLD(vp); 7305 err = VOP_PUTPAGE(vp, (offset_t)off, PAGESIZE, 7306 bflags, svd->cred, NULL); 7307 VN_RELE(vp); 7308 if (err) 7309 break; 7310 } 7311 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7312 return (err); 7313 } 7314 7315 /* 7316 * Determine if we have data corresponding to pages in the 7317 * primary storage virtual memory cache (i.e., "in core"). 7318 */ 7319 static size_t 7320 segvn_incore(struct seg *seg, caddr_t addr, size_t len, char *vec) 7321 { 7322 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7323 struct vnode *vp, *avp; 7324 u_offset_t offset, aoffset; 7325 size_t p, ep; 7326 int ret; 7327 struct vpage *vpp; 7328 page_t *pp; 7329 uint_t start; 7330 struct anon_map *amp; /* XXX - for locknest */ 7331 struct anon *ap; 7332 uint_t attr; 7333 anon_sync_obj_t cookie; 7334 7335 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 7336 7337 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7338 if (svd->amp == NULL && svd->vp == NULL) { 7339 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7340 bzero(vec, btopr(len)); 7341 return (len); /* no anonymous pages created yet */ 7342 } 7343 7344 p = seg_page(seg, addr); 7345 ep = seg_page(seg, addr + len); 7346 start = svd->vp ? SEG_PAGE_VNODEBACKED : 0; 7347 7348 amp = svd->amp; 7349 for (; p < ep; p++, addr += PAGESIZE) { 7350 vpp = (svd->vpage) ? &svd->vpage[p]: NULL; 7351 ret = start; 7352 ap = NULL; 7353 avp = NULL; 7354 /* Grab the vnode/offset for the anon slot */ 7355 if (amp != NULL) { 7356 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7357 anon_array_enter(amp, svd->anon_index + p, &cookie); 7358 ap = anon_get_ptr(amp->ahp, svd->anon_index + p); 7359 if (ap != NULL) { 7360 swap_xlate(ap, &avp, &aoffset); 7361 } 7362 anon_array_exit(&cookie); 7363 ANON_LOCK_EXIT(&->a_rwlock); 7364 } 7365 if ((avp != NULL) && page_exists(avp, aoffset)) { 7366 /* A page exists for the anon slot */ 7367 ret |= SEG_PAGE_INCORE; 7368 7369 /* 7370 * If page is mapped and writable 7371 */ 7372 attr = (uint_t)0; 7373 if ((hat_getattr(seg->s_as->a_hat, addr, 7374 &attr) != -1) && (attr & PROT_WRITE)) { 7375 ret |= SEG_PAGE_ANON; 7376 } 7377 /* 7378 * Don't get page_struct lock for lckcnt and cowcnt, 7379 * since this is purely advisory. 7380 */ 7381 if ((pp = page_lookup_nowait(avp, aoffset, 7382 SE_SHARED)) != NULL) { 7383 if (pp->p_lckcnt) 7384 ret |= SEG_PAGE_SOFTLOCK; 7385 if (pp->p_cowcnt) 7386 ret |= SEG_PAGE_HASCOW; 7387 page_unlock(pp); 7388 } 7389 } 7390 7391 /* Gather vnode statistics */ 7392 vp = svd->vp; 7393 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 7394 7395 if (vp != NULL) { 7396 /* 7397 * Try to obtain a "shared" lock on the page 7398 * without blocking. If this fails, determine 7399 * if the page is in memory. 7400 */ 7401 pp = page_lookup_nowait(vp, offset, SE_SHARED); 7402 if ((pp == NULL) && (page_exists(vp, offset))) { 7403 /* Page is incore, and is named */ 7404 ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE); 7405 } 7406 /* 7407 * Don't get page_struct lock for lckcnt and cowcnt, 7408 * since this is purely advisory. 
7409 */ 7410 if (pp != NULL) { 7411 ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE); 7412 if (pp->p_lckcnt) 7413 ret |= SEG_PAGE_SOFTLOCK; 7414 if (pp->p_cowcnt) 7415 ret |= SEG_PAGE_HASCOW; 7416 page_unlock(pp); 7417 } 7418 } 7419 7420 /* Gather virtual page information */ 7421 if (vpp) { 7422 if (VPP_ISPPLOCK(vpp)) 7423 ret |= SEG_PAGE_LOCKED; 7424 vpp++; 7425 } 7426 7427 *vec++ = (char)ret; 7428 } 7429 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7430 return (len); 7431 } 7432 7433 /* 7434 * Statement for p_cowcnts/p_lckcnts. 7435 * 7436 * p_cowcnt is updated while mlock/munlocking MAP_PRIVATE and PROT_WRITE region 7437 * irrespective of the following factors or anything else: 7438 * 7439 * (1) anon slots are populated or not 7440 * (2) cow is broken or not 7441 * (3) refcnt on ap is 1 or greater than 1 7442 * 7443 * If it's not MAP_PRIVATE and PROT_WRITE, p_lckcnt is updated during mlock 7444 * and munlock. 7445 * 7446 * 7447 * Handling p_cowcnts/p_lckcnts during copy-on-write fault: 7448 * 7449 * if vpage has PROT_WRITE 7450 * transfer cowcnt on the oldpage -> cowcnt on the newpage 7451 * else 7452 * transfer lckcnt on the oldpage -> lckcnt on the newpage 7453 * 7454 * During copy-on-write, decrement p_cowcnt on the oldpage and increment 7455 * p_cowcnt on the newpage *if* the corresponding vpage has PROT_WRITE. 7456 * 7457 * We may also break COW if softlocking on read access in the physio case. 7458 * In this case, vpage may not have PROT_WRITE. So, we need to decrement 7459 * p_lckcnt on the oldpage and increment p_lckcnt on the newpage *if* the 7460 * vpage doesn't have PROT_WRITE. 7461 * 7462 * 7463 * Handling p_cowcnts/p_lckcnts during mprotect on mlocked region: 7464 * 7465 * If a MAP_PRIVATE region loses PROT_WRITE, we decrement p_cowcnt and 7466 * increment p_lckcnt by calling page_subclaim() which takes care of 7467 * availrmem accounting and p_lckcnt overflow. 7468 * 7469 * If a MAP_PRIVATE region gains PROT_WRITE, we decrement p_lckcnt and 7470 * increment p_cowcnt by calling page_addclaim() which takes care of 7471 * availrmem availability and p_cowcnt overflow. 7472 */ 7473 7474 /* 7475 * Lock down (or unlock) pages mapped by this segment. 7476 * 7477 * XXX only creates PAGESIZE pages if anon slots are not initialized. 7478 * At fault time they will be relocated into larger pages. 
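 * (This entry point is normally reached via SEGOP_LOCKOP() from
 * as_ctl() when servicing the MC_LOCK/MC_UNLOCK family of memcntl(2)
 * requests.)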
7479 */ 7480 static int 7481 segvn_lockop(struct seg *seg, caddr_t addr, size_t len, 7482 int attr, int op, ulong_t *lockmap, size_t pos) 7483 { 7484 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7485 struct vpage *vpp; 7486 struct vpage *evp; 7487 page_t *pp; 7488 u_offset_t offset; 7489 u_offset_t off; 7490 int segtype; 7491 int pageprot; 7492 int claim; 7493 struct vnode *vp; 7494 ulong_t anon_index; 7495 struct anon_map *amp; 7496 struct anon *ap; 7497 struct vattr va; 7498 anon_sync_obj_t cookie; 7499 struct kshmid *sp = NULL; 7500 struct proc *p = curproc; 7501 kproject_t *proj = NULL; 7502 int chargeproc = 1; 7503 size_t locked_bytes = 0; 7504 size_t unlocked_bytes = 0; 7505 int err = 0; 7506 7507 /* 7508 * Hold write lock on address space because may split or concatenate 7509 * segments 7510 */ 7511 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 7512 7513 /* 7514 * If this is a shm, use shm's project and zone, else use 7515 * project and zone of calling process 7516 */ 7517 7518 /* Determine if this segment backs a sysV shm */ 7519 if (svd->amp != NULL && svd->amp->a_sp != NULL) { 7520 ASSERT(svd->type == MAP_SHARED); 7521 ASSERT(svd->tr_state == SEGVN_TR_OFF); 7522 sp = svd->amp->a_sp; 7523 proj = sp->shm_perm.ipc_proj; 7524 chargeproc = 0; 7525 } 7526 7527 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 7528 if (attr) { 7529 pageprot = attr & ~(SHARED|PRIVATE); 7530 segtype = attr & SHARED ? MAP_SHARED : MAP_PRIVATE; 7531 7532 /* 7533 * We are done if the segment types don't match 7534 * or if we have segment level protections and 7535 * they don't match. 7536 */ 7537 if (svd->type != segtype) { 7538 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7539 return (0); 7540 } 7541 if (svd->pageprot == 0 && svd->prot != pageprot) { 7542 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7543 return (0); 7544 } 7545 } 7546 7547 if (op == MC_LOCK) { 7548 if (svd->tr_state == SEGVN_TR_INIT) { 7549 svd->tr_state = SEGVN_TR_OFF; 7550 } else if (svd->tr_state == SEGVN_TR_ON) { 7551 ASSERT(svd->amp != NULL); 7552 segvn_textunrepl(seg, 0); 7553 ASSERT(svd->amp == NULL && 7554 svd->tr_state == SEGVN_TR_OFF); 7555 } 7556 } 7557 7558 /* 7559 * If we're locking, then we must create a vpage structure if 7560 * none exists. If we're unlocking, then check to see if there 7561 * is a vpage -- if not, then we could not have locked anything. 7562 */ 7563 7564 if ((vpp = svd->vpage) == NULL) { 7565 if (op == MC_LOCK) 7566 segvn_vpage(seg); 7567 else { 7568 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7569 return (0); 7570 } 7571 } 7572 7573 /* 7574 * The anonymous data vector (i.e., previously 7575 * unreferenced mapping to swap space) can be allocated 7576 * by lazily testing for its existence. 
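 * (The anon map is created here only when locking a ZFOD segment,
 * i.e. one with neither a vnode nor an existing amp, so that there
 * are anon slots available to hold the locked-down pages.)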
7577 */
7578 if (op == MC_LOCK && svd->amp == NULL && svd->vp == NULL) {
7579 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
7580 svd->amp = anonmap_alloc(seg->s_size, 0, ANON_SLEEP);
7581 svd->amp->a_szc = seg->s_szc;
7582 }
7583
7584 if ((amp = svd->amp) != NULL) {
7585 anon_index = svd->anon_index + seg_page(seg, addr);
7586 }
7587
7588 offset = svd->offset + (uintptr_t)(addr - seg->s_base);
7589 evp = &svd->vpage[seg_page(seg, addr + len)];
7590
7591 if (sp != NULL)
7592 mutex_enter(&sp->shm_mlock);
7593
7594 /* determine number of unlocked bytes in range for lock operation */
7595 if (op == MC_LOCK) {
7596
7597 if (sp == NULL) {
7598 for (vpp = &svd->vpage[seg_page(seg, addr)]; vpp < evp;
7599 vpp++) {
7600 if (!VPP_ISPPLOCK(vpp))
7601 unlocked_bytes += PAGESIZE;
7602 }
7603 } else {
7604 ulong_t i_idx, i_edx;
7605 anon_sync_obj_t i_cookie;
7606 struct anon *i_ap;
7607 struct vnode *i_vp;
7608 u_offset_t i_off;
7609
7610 /* Only count sysV pages once for locked memory */
7611 i_edx = svd->anon_index + seg_page(seg, addr + len);
7612 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
7613 for (i_idx = anon_index; i_idx < i_edx; i_idx++) {
7614 anon_array_enter(amp, i_idx, &i_cookie);
7615 i_ap = anon_get_ptr(amp->ahp, i_idx);
7616 if (i_ap == NULL) {
7617 unlocked_bytes += PAGESIZE;
7618 anon_array_exit(&i_cookie);
7619 continue;
7620 }
7621 swap_xlate(i_ap, &i_vp, &i_off);
7622 anon_array_exit(&i_cookie);
7623 pp = page_lookup(i_vp, i_off, SE_SHARED);
7624 if (pp == NULL) {
7625 unlocked_bytes += PAGESIZE;
7626 continue;
7627 } else if (pp->p_lckcnt == 0)
7628 unlocked_bytes += PAGESIZE;
7629 page_unlock(pp);
7630 }
7631 ANON_LOCK_EXIT(&amp->a_rwlock);
7632 }
7633
7634 mutex_enter(&p->p_lock);
7635 err = rctl_incr_locked_mem(p, proj, unlocked_bytes,
7636 chargeproc);
7637 mutex_exit(&p->p_lock);
7638
7639 if (err) {
7640 if (sp != NULL)
7641 mutex_exit(&sp->shm_mlock);
7642 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7643 return (err);
7644 }
7645 }
7646 /*
7647 * Loop over all pages in the range. Process if we're locking and
7648 * page has not already been locked in this mapping; or if we're
7649 * unlocking and the page has been locked.
7650 */
7651 for (vpp = &svd->vpage[seg_page(seg, addr)]; vpp < evp;
7652 vpp++, pos++, addr += PAGESIZE, offset += PAGESIZE, anon_index++) {
7653 if ((attr == 0 || VPP_PROT(vpp) == pageprot) &&
7654 ((op == MC_LOCK && !VPP_ISPPLOCK(vpp)) ||
7655 (op == MC_UNLOCK && VPP_ISPPLOCK(vpp)))) {
7656
7657 if (amp != NULL)
7658 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
7659 /*
7660 * If this isn't a MAP_NORESERVE segment and
7661 * we're locking, allocate anon slots if they
7662 * don't exist. The page is brought in later on.
7663 */
7664 if (op == MC_LOCK && svd->vp == NULL &&
7665 ((svd->flags & MAP_NORESERVE) == 0) &&
7666 amp != NULL &&
7667 ((ap = anon_get_ptr(amp->ahp, anon_index))
7668 == NULL)) {
7669 anon_array_enter(amp, anon_index, &cookie);
7670
7671 if ((ap = anon_get_ptr(amp->ahp,
7672 anon_index)) == NULL) {
7673 pp = anon_zero(seg, addr, &ap,
7674 svd->cred);
7675 if (pp == NULL) {
7676 anon_array_exit(&cookie);
7677 ANON_LOCK_EXIT(&amp->a_rwlock);
7678 err = ENOMEM;
7679 goto out;
7680 }
7681 ASSERT(anon_get_ptr(amp->ahp,
7682 anon_index) == NULL);
7683 (void) anon_set_ptr(amp->ahp,
7684 anon_index, ap, ANON_SLEEP);
7685 page_unlock(pp);
7686 }
7687 anon_array_exit(&cookie);
7688 }
7689
7690 /*
7691 * Get name for page, accounting for
7692 * existence of private copy.
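 * (If an anon slot exists the page is named by the swap <vp, off>
 * pair returned by swap_xlate(); otherwise it is named by the mapped
 * file's vnode and offset.)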
7693 */
7694 ap = NULL;
7695 if (amp != NULL) {
7696 anon_array_enter(amp, anon_index, &cookie);
7697 ap = anon_get_ptr(amp->ahp, anon_index);
7698 if (ap != NULL) {
7699 swap_xlate(ap, &vp, &off);
7700 } else {
7701 if (svd->vp == NULL &&
7702 (svd->flags & MAP_NORESERVE)) {
7703 anon_array_exit(&cookie);
7704 ANON_LOCK_EXIT(&amp->a_rwlock);
7705 continue;
7706 }
7707 vp = svd->vp;
7708 off = offset;
7709 }
7710 anon_array_exit(&cookie);
7711 ANON_LOCK_EXIT(&amp->a_rwlock);
7712 } else {
7713 vp = svd->vp;
7714 off = offset;
7715 }
7716
7717 /*
7718 * Get page frame. It's ok if the page is
7719 * not available when we're unlocking, as this
7720 * may simply mean that a page we locked got
7721 * truncated out of existence after we locked it.
7722 *
7723 * Invoke VOP_GETPAGE() to obtain the page struct
7724 * since we may need to read it from disk if it's
7725 * been paged out.
7726 */
7727 if (op != MC_LOCK)
7728 pp = page_lookup(vp, off, SE_SHARED);
7729 else {
7730 page_t *pl[1 + 1];
7731 int error;
7732
7733 ASSERT(vp != NULL);
7734
7735 error = VOP_GETPAGE(vp, (offset_t)off, PAGESIZE,
7736 (uint_t *)NULL, pl, PAGESIZE, seg, addr,
7737 S_OTHER, svd->cred, NULL);
7738
7739 /*
7740 * If the error is EDEADLK then we must bounce
7741 * up and drop all vm subsystem locks and then
7742 * retry the operation later.
7743 * This behavior is a temporary measure because
7744 * ufs/sds logging is badly designed and will
7745 * deadlock if we don't allow this bounce to
7746 * happen. The real solution is to re-design
7747 * the logging code to work properly. See bug
7748 * 4125102 for details of the problem.
7749 */
7750 if (error == EDEADLK) {
7751 err = error;
7752 goto out;
7753 }
7754 /*
7755 * Quit if we fail to fault in the page. Treat
7756 * the failure as an error, unless the addr
7757 * is mapped beyond the end of a file.
7758 */
7759 if (error && svd->vp) {
7760 va.va_mask = AT_SIZE;
7761 if (VOP_GETATTR(svd->vp, &va, 0,
7762 svd->cred, NULL) != 0) {
7763 err = EIO;
7764 goto out;
7765 }
7766 if (btopr(va.va_size) >=
7767 btopr(off + 1)) {
7768 err = EIO;
7769 goto out;
7770 }
7771 goto out;
7772
7773 } else if (error) {
7774 err = EIO;
7775 goto out;
7776 }
7777 pp = pl[0];
7778 ASSERT(pp != NULL);
7779 }
7780
7781 /*
7782 * See Statement at the beginning of this routine.
7783 *
7784 * claim is always set if MAP_PRIVATE and PROT_WRITE
7785 * irrespective of following factors:
7786 *
7787 * (1) anon slots are populated or not
7788 * (2) cow is broken or not
7789 * (3) refcnt on ap is 1 or greater than 1
7790 *
7791 * See 4140683 for details
7792 */
7793 claim = ((VPP_PROT(vpp) & PROT_WRITE) &&
7794 (svd->type == MAP_PRIVATE));
7795
7796 /*
7797 * Perform page-level operation appropriate to
7798 * operation. If locking, undo the SOFTLOCK
7799 * performed to bring the page into memory
7800 * after setting the lock. If unlocking,
7801 * and no page was found, account for the claim
7802 * separately.
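 * (page_pp_lock() and page_pp_unlock() below do the per-page lock and
 * claim accounting; a failed page_pp_lock() is reported as EAGAIN.)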
7803 */ 7804 if (op == MC_LOCK) { 7805 int ret = 1; /* Assume success */ 7806 7807 ASSERT(!VPP_ISPPLOCK(vpp)); 7808 7809 ret = page_pp_lock(pp, claim, 0); 7810 if (ret == 0) { 7811 /* locking page failed */ 7812 page_unlock(pp); 7813 err = EAGAIN; 7814 goto out; 7815 } 7816 VPP_SETPPLOCK(vpp); 7817 if (sp != NULL) { 7818 if (pp->p_lckcnt == 1) 7819 locked_bytes += PAGESIZE; 7820 } else 7821 locked_bytes += PAGESIZE; 7822 7823 if (lockmap != (ulong_t *)NULL) 7824 BT_SET(lockmap, pos); 7825 7826 page_unlock(pp); 7827 } else { 7828 ASSERT(VPP_ISPPLOCK(vpp)); 7829 if (pp != NULL) { 7830 /* sysV pages should be locked */ 7831 ASSERT(sp == NULL || pp->p_lckcnt > 0); 7832 page_pp_unlock(pp, claim, 0); 7833 if (sp != NULL) { 7834 if (pp->p_lckcnt == 0) 7835 unlocked_bytes 7836 += PAGESIZE; 7837 } else 7838 unlocked_bytes += PAGESIZE; 7839 page_unlock(pp); 7840 } else { 7841 ASSERT(sp == NULL); 7842 unlocked_bytes += PAGESIZE; 7843 } 7844 VPP_CLRPPLOCK(vpp); 7845 } 7846 } 7847 } 7848 out: 7849 if (op == MC_LOCK) { 7850 /* Credit back bytes that did not get locked */ 7851 if ((unlocked_bytes - locked_bytes) > 0) { 7852 if (proj == NULL) 7853 mutex_enter(&p->p_lock); 7854 rctl_decr_locked_mem(p, proj, 7855 (unlocked_bytes - locked_bytes), chargeproc); 7856 if (proj == NULL) 7857 mutex_exit(&p->p_lock); 7858 } 7859 7860 } else { 7861 /* Account bytes that were unlocked */ 7862 if (unlocked_bytes > 0) { 7863 if (proj == NULL) 7864 mutex_enter(&p->p_lock); 7865 rctl_decr_locked_mem(p, proj, unlocked_bytes, 7866 chargeproc); 7867 if (proj == NULL) 7868 mutex_exit(&p->p_lock); 7869 } 7870 } 7871 if (sp != NULL) 7872 mutex_exit(&sp->shm_mlock); 7873 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7874 7875 return (err); 7876 } 7877 7878 /* 7879 * Set advice from user for specified pages 7880 * There are 5 types of advice: 7881 * MADV_NORMAL - Normal (default) behavior (whatever that is) 7882 * MADV_RANDOM - Random page references 7883 * do not allow readahead or 'klustering' 7884 * MADV_SEQUENTIAL - Sequential page references 7885 * Pages previous to the one currently being 7886 * accessed (determined by fault) are 'not needed' 7887 * and are freed immediately 7888 * MADV_WILLNEED - Pages are likely to be used (fault ahead in mctl) 7889 * MADV_DONTNEED - Pages are not needed (synced out in mctl) 7890 * MADV_FREE - Contents can be discarded 7891 * MADV_ACCESS_DEFAULT- Default access 7892 * MADV_ACCESS_LWP - Next LWP will access heavily 7893 * MADV_ACCESS_MANY- Many LWPs or processes will access heavily 7894 */ 7895 static int 7896 segvn_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav) 7897 { 7898 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7899 size_t page; 7900 int err = 0; 7901 int already_set; 7902 struct anon_map *amp; 7903 ulong_t anon_index; 7904 struct seg *next; 7905 lgrp_mem_policy_t policy; 7906 struct seg *prev; 7907 struct vnode *vp; 7908 7909 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 7910 7911 /* 7912 * In case of MADV_FREE, we won't be modifying any segment private 7913 * data structures; so, we only need to grab READER's lock 7914 */ 7915 if (behav != MADV_FREE) { 7916 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 7917 if (svd->tr_state != SEGVN_TR_OFF) { 7918 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7919 return (0); 7920 } 7921 } else { 7922 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7923 } 7924 7925 /* 7926 * Large pages are assumed to be only turned on when accesses to the 7927 * segment's address range have spatial and temporal 
locality. That
7928 * justifies ignoring MADV_SEQUENTIAL for large page segments.
7929 * Also, ignore advice affecting lgroup memory allocation
7930 * if we don't need to do lgroup optimizations on this system
7931 */
7932 
7933 if ((behav == MADV_SEQUENTIAL &&
7934 (seg->s_szc != 0 || HAT_IS_REGION_COOKIE_VALID(svd->rcookie))) ||
7935 (!lgrp_optimizations() && (behav == MADV_ACCESS_DEFAULT ||
7936 behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY))) {
7937 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7938 return (0);
7939 }
7940 
7941 if (behav == MADV_SEQUENTIAL || behav == MADV_ACCESS_DEFAULT ||
7942 behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY) {
7943 /*
7944 * Since we are going to unload hat mappings
7945 * we first have to flush the cache. Otherwise
7946 * this might lead to system panic if another
7947 * thread is doing physio on the range whose
7948 * mappings are unloaded by madvise(3C).
7949 */
7950 if (svd->softlockcnt > 0) {
7951 /*
7952 * Since we do have the segvn writers lock
7953 * nobody can fill the cache with entries
7954 * belonging to this seg during the purge.
7955 * The flush either succeeds or we still
7956 * have pending I/Os. In the latter case,
7957 * madvise(3C) fails.
7958 */
7959 segvn_purge(seg);
7960 if (svd->softlockcnt > 0) {
7961 /*
7962 * Since madvise(3C) is advisory and
7963 * it's not part of UNIX98, madvise(3C)
7964 * failure here doesn't cause any hardship.
7965 * Note that we don't block in "as" layer.
7966 */
7967 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7968 return (EAGAIN);
7969 }
7970 }
7971 }
7972 
7973 amp = svd->amp;
7974 vp = svd->vp;
7975 if (behav == MADV_FREE) {
7976 /*
7977 * MADV_FREE is not supported for segments with
7978 * underlying object; if anonmap is NULL, anon slots
7979 * are not yet populated and there is nothing for
7980 * us to do. As MADV_FREE is advisory, we don't
7981 * return error in either case.
7982 */
7983 if (vp != NULL || amp == NULL) {
7984 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7985 return (0);
7986 }
7987 
7988 page = seg_page(seg, addr);
7989 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
7990 anon_disclaim(amp, svd->anon_index + page, len);
7991 ANON_LOCK_EXIT(&amp->a_rwlock);
7992 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7993 return (0);
7994 }
7995 
7996 /*
7997 * If advice is to be applied to entire segment,
7998 * use advice field in seg_data structure
7999 * otherwise use appropriate vpage entry.
8000 */
8001 if ((addr == seg->s_base) && (len == seg->s_size)) {
8002 switch (behav) {
8003 case MADV_ACCESS_LWP:
8004 case MADV_ACCESS_MANY:
8005 case MADV_ACCESS_DEFAULT:
8006 /*
8007 * Set memory allocation policy for this segment
8008 */
8009 policy = lgrp_madv_to_policy(behav, len, svd->type);
8010 if (svd->type == MAP_SHARED)
8011 already_set = lgrp_shm_policy_set(policy, amp,
8012 svd->anon_index, vp, svd->offset, len);
8013 else {
8014 /*
8015 * For private memory, need writers lock on
8016 * address space because the segment may be
8017 * split or concatenated when changing policy
8018 */
8019 if (AS_READ_HELD(seg->s_as,
8020 &seg->s_as->a_lock)) {
8021 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
8022 return (IE_RETRY);
8023 }
8024 
8025 already_set = lgrp_privm_policy_set(policy,
8026 &svd->policy_info, len);
8027 }
8028 
8029 /*
8030 * If policy set already and it shouldn't be reapplied,
8031 * don't do anything.
8032 */ 8033 if (already_set && 8034 !LGRP_MEM_POLICY_REAPPLICABLE(policy)) 8035 break; 8036 8037 /* 8038 * Mark any existing pages in given range for 8039 * migration 8040 */ 8041 page_mark_migrate(seg, addr, len, amp, svd->anon_index, 8042 vp, svd->offset, 1); 8043 8044 /* 8045 * If same policy set already or this is a shared 8046 * memory segment, don't need to try to concatenate 8047 * segment with adjacent ones. 8048 */ 8049 if (already_set || svd->type == MAP_SHARED) 8050 break; 8051 8052 /* 8053 * Try to concatenate this segment with previous 8054 * one and next one, since we changed policy for 8055 * this one and it may be compatible with adjacent 8056 * ones now. 8057 */ 8058 prev = AS_SEGPREV(seg->s_as, seg); 8059 next = AS_SEGNEXT(seg->s_as, seg); 8060 8061 if (next && next->s_ops == &segvn_ops && 8062 addr + len == next->s_base) 8063 (void) segvn_concat(seg, next, 1); 8064 8065 if (prev && prev->s_ops == &segvn_ops && 8066 addr == prev->s_base + prev->s_size) { 8067 /* 8068 * Drop lock for private data of current 8069 * segment before concatenating (deleting) it 8070 * and return IE_REATTACH to tell as_ctl() that 8071 * current segment has changed 8072 */ 8073 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8074 if (!segvn_concat(prev, seg, 1)) 8075 err = IE_REATTACH; 8076 8077 return (err); 8078 } 8079 break; 8080 8081 case MADV_SEQUENTIAL: 8082 /* 8083 * unloading mapping guarantees 8084 * detection in segvn_fault 8085 */ 8086 ASSERT(seg->s_szc == 0); 8087 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 8088 hat_unload(seg->s_as->a_hat, addr, len, 8089 HAT_UNLOAD); 8090 /* FALLTHROUGH */ 8091 case MADV_NORMAL: 8092 case MADV_RANDOM: 8093 svd->advice = (uchar_t)behav; 8094 svd->pageadvice = 0; 8095 break; 8096 case MADV_WILLNEED: /* handled in memcntl */ 8097 case MADV_DONTNEED: /* handled in memcntl */ 8098 case MADV_FREE: /* handled above */ 8099 break; 8100 default: 8101 err = EINVAL; 8102 } 8103 } else { 8104 caddr_t eaddr; 8105 struct seg *new_seg; 8106 struct segvn_data *new_svd; 8107 u_offset_t off; 8108 caddr_t oldeaddr; 8109 8110 page = seg_page(seg, addr); 8111 8112 segvn_vpage(seg); 8113 8114 switch (behav) { 8115 struct vpage *bvpp, *evpp; 8116 8117 case MADV_ACCESS_LWP: 8118 case MADV_ACCESS_MANY: 8119 case MADV_ACCESS_DEFAULT: 8120 /* 8121 * Set memory allocation policy for portion of this 8122 * segment 8123 */ 8124 8125 /* 8126 * Align address and length of advice to page 8127 * boundaries for large pages 8128 */ 8129 if (seg->s_szc != 0) { 8130 size_t pgsz; 8131 8132 pgsz = page_get_pagesize(seg->s_szc); 8133 addr = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 8134 len = P2ROUNDUP(len, pgsz); 8135 } 8136 8137 /* 8138 * Check to see whether policy is set already 8139 */ 8140 policy = lgrp_madv_to_policy(behav, len, svd->type); 8141 8142 anon_index = svd->anon_index + page; 8143 off = svd->offset + (uintptr_t)(addr - seg->s_base); 8144 8145 if (svd->type == MAP_SHARED) 8146 already_set = lgrp_shm_policy_set(policy, amp, 8147 anon_index, vp, off, len); 8148 else 8149 already_set = 8150 (policy == svd->policy_info.mem_policy); 8151 8152 /* 8153 * If policy set already and it shouldn't be reapplied, 8154 * don't do anything. 
8155 */ 8156 if (already_set && 8157 !LGRP_MEM_POLICY_REAPPLICABLE(policy)) 8158 break; 8159 8160 /* 8161 * For private memory, need writers lock on 8162 * address space because the segment may be 8163 * split or concatenated when changing policy 8164 */ 8165 if (svd->type == MAP_PRIVATE && 8166 AS_READ_HELD(seg->s_as, &seg->s_as->a_lock)) { 8167 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8168 return (IE_RETRY); 8169 } 8170 8171 /* 8172 * Mark any existing pages in given range for 8173 * migration 8174 */ 8175 page_mark_migrate(seg, addr, len, amp, svd->anon_index, 8176 vp, svd->offset, 1); 8177 8178 /* 8179 * Don't need to try to split or concatenate 8180 * segments, since policy is same or this is a shared 8181 * memory segment 8182 */ 8183 if (already_set || svd->type == MAP_SHARED) 8184 break; 8185 8186 if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 8187 ASSERT(svd->amp == NULL); 8188 ASSERT(svd->tr_state == SEGVN_TR_OFF); 8189 ASSERT(svd->softlockcnt == 0); 8190 hat_leave_region(seg->s_as->a_hat, svd->rcookie, 8191 HAT_REGION_TEXT); 8192 svd->rcookie = HAT_INVALID_REGION_COOKIE; 8193 } 8194 8195 /* 8196 * Split off new segment if advice only applies to a 8197 * portion of existing segment starting in middle 8198 */ 8199 new_seg = NULL; 8200 eaddr = addr + len; 8201 oldeaddr = seg->s_base + seg->s_size; 8202 if (addr > seg->s_base) { 8203 /* 8204 * Must flush I/O page cache 8205 * before splitting segment 8206 */ 8207 if (svd->softlockcnt > 0) 8208 segvn_purge(seg); 8209 8210 /* 8211 * Split segment and return IE_REATTACH to tell 8212 * as_ctl() that current segment changed 8213 */ 8214 new_seg = segvn_split_seg(seg, addr); 8215 new_svd = (struct segvn_data *)new_seg->s_data; 8216 err = IE_REATTACH; 8217 8218 /* 8219 * If new segment ends where old one 8220 * did, try to concatenate the new 8221 * segment with next one. 8222 */ 8223 if (eaddr == oldeaddr) { 8224 /* 8225 * Set policy for new segment 8226 */ 8227 (void) lgrp_privm_policy_set(policy, 8228 &new_svd->policy_info, 8229 new_seg->s_size); 8230 8231 next = AS_SEGNEXT(new_seg->s_as, 8232 new_seg); 8233 8234 if (next && 8235 next->s_ops == &segvn_ops && 8236 eaddr == next->s_base) 8237 (void) segvn_concat(new_seg, 8238 next, 1); 8239 } 8240 } 8241 8242 /* 8243 * Split off end of existing segment if advice only 8244 * applies to a portion of segment ending before 8245 * end of the existing segment 8246 */ 8247 if (eaddr < oldeaddr) { 8248 /* 8249 * Must flush I/O page cache 8250 * before splitting segment 8251 */ 8252 if (svd->softlockcnt > 0) 8253 segvn_purge(seg); 8254 8255 /* 8256 * If beginning of old segment was already 8257 * split off, use new segment to split end off 8258 * from. 8259 */ 8260 if (new_seg != NULL && new_seg != seg) { 8261 /* 8262 * Split segment 8263 */ 8264 (void) segvn_split_seg(new_seg, eaddr); 8265 8266 /* 8267 * Set policy for new segment 8268 */ 8269 (void) lgrp_privm_policy_set(policy, 8270 &new_svd->policy_info, 8271 new_seg->s_size); 8272 } else { 8273 /* 8274 * Split segment and return IE_REATTACH 8275 * to tell as_ctl() that current 8276 * segment changed 8277 */ 8278 (void) segvn_split_seg(seg, eaddr); 8279 err = IE_REATTACH; 8280 8281 (void) lgrp_privm_policy_set(policy, 8282 &svd->policy_info, seg->s_size); 8283 8284 /* 8285 * If new segment starts where old one 8286 * did, try to concatenate it with 8287 * previous segment. 
8288 */
8289 if (addr == seg->s_base) {
8290 prev = AS_SEGPREV(seg->s_as,
8291 seg);
8292 
8293 /*
8294 * Drop lock for private data
8295 * of current segment before
8296 * concatenating (deleting) it
8297 */
8298 if (prev &&
8299 prev->s_ops ==
8300 &segvn_ops &&
8301 addr == prev->s_base +
8302 prev->s_size) {
8303 SEGVN_LOCK_EXIT(
8304 seg->s_as,
8305 &svd->lock);
8306 (void) segvn_concat(
8307 prev, seg, 1);
8308 return (err);
8309 }
8310 }
8311 }
8312 }
8313 break;
8314 case MADV_SEQUENTIAL:
8315 ASSERT(seg->s_szc == 0);
8316 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
8317 hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD);
8318 /* FALLTHROUGH */
8319 case MADV_NORMAL:
8320 case MADV_RANDOM:
8321 bvpp = &svd->vpage[page];
8322 evpp = &svd->vpage[page + (len >> PAGESHIFT)];
8323 for (; bvpp < evpp; bvpp++)
8324 VPP_SETADVICE(bvpp, behav);
8325 svd->advice = MADV_NORMAL;
8326 break;
8327 case MADV_WILLNEED: /* handled in memcntl */
8328 case MADV_DONTNEED: /* handled in memcntl */
8329 case MADV_FREE: /* handled above */
8330 break;
8331 default:
8332 err = EINVAL;
8333 }
8334 }
8335 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
8336 return (err);
8337 }
8338 
8339 /*
8340 * Create a vpage structure for this seg.
8341 */
8342 static void
8343 segvn_vpage(struct seg *seg)
8344 {
8345 struct segvn_data *svd = (struct segvn_data *)seg->s_data;
8346 struct vpage *vp, *evp;
8347 
8348 ASSERT(SEGVN_WRITE_HELD(seg->s_as, &svd->lock));
8349 
8350 /*
8351 * If no vpage structure exists, allocate one. Copy the protections
8352 * and the advice from the segment itself to the individual pages.
8353 */
8354 if (svd->vpage == NULL) {
8355 svd->pageadvice = 1;
8356 svd->vpage = kmem_zalloc(seg_pages(seg) * sizeof (struct vpage),
8357 KM_SLEEP);
8358 evp = &svd->vpage[seg_page(seg, seg->s_base + seg->s_size)];
8359 for (vp = svd->vpage; vp < evp; vp++) {
8360 VPP_SETPROT(vp, svd->prot);
8361 VPP_SETADVICE(vp, svd->advice);
8362 }
8363 }
8364 }
8365 
8366 /*
8367 * Dump the pages belonging to this segvn segment.
8368 */
8369 static void
8370 segvn_dump(struct seg *seg)
8371 {
8372 struct segvn_data *svd;
8373 page_t *pp;
8374 struct anon_map *amp;
8375 ulong_t anon_index;
8376 struct vnode *vp;
8377 u_offset_t off, offset;
8378 pfn_t pfn;
8379 pgcnt_t page, npages;
8380 caddr_t addr;
8381 
8382 npages = seg_pages(seg);
8383 svd = (struct segvn_data *)seg->s_data;
8384 vp = svd->vp;
8385 off = offset = svd->offset;
8386 addr = seg->s_base;
8387 
8388 if ((amp = svd->amp) != NULL) {
8389 anon_index = svd->anon_index;
8390 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
8391 }
8392 
8393 for (page = 0; page < npages; page++, offset += PAGESIZE) {
8394 struct anon *ap;
8395 int we_own_it = 0;
8396 
8397 if (amp && (ap = anon_get_ptr(svd->amp->ahp, anon_index++))) {
8398 swap_xlate_nopanic(ap, &vp, &off);
8399 } else {
8400 vp = svd->vp;
8401 off = offset;
8402 }
8403 
8404 /*
8405 * If pp == NULL, the page either does not exist
8406 * or is exclusively locked. So determine if it
8407 * exists before searching for it.
8408 */
8409 
8410 if ((pp = page_lookup_nowait(vp, off, SE_SHARED)))
8411 we_own_it = 1;
8412 else
8413 pp = page_exists(vp, off);
8414 
8415 if (pp) {
8416 pfn = page_pptonum(pp);
8417 dump_addpage(seg->s_as, addr, pfn);
8418 if (we_own_it)
8419 page_unlock(pp);
8420 }
8421 addr += PAGESIZE;
8422 dump_timeleft = dump_timeout;
8423 }
8424 
8425 if (amp != NULL)
8426 ANON_LOCK_EXIT(&amp->a_rwlock);
8427 }
8428 
8429 /*
8430 * lock/unlock anon pages over a given range.
Return shadow list 8431 */ 8432 static int 8433 segvn_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***ppp, 8434 enum lock_type type, enum seg_rw rw) 8435 { 8436 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 8437 size_t np, adjustpages = 0, npages = (len >> PAGESHIFT); 8438 ulong_t anon_index; 8439 uint_t protchk; 8440 uint_t error; 8441 struct anon_map *amp; 8442 struct page **pplist, **pl, *pp; 8443 caddr_t a; 8444 size_t page; 8445 caddr_t lpgaddr, lpgeaddr; 8446 pgcnt_t szc0_npages = 0; 8447 8448 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_START, 8449 "segvn_pagelock: start seg %p addr %p", seg, addr); 8450 8451 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 8452 if (seg->s_szc != 0 && (type == L_PAGELOCK || type == L_PAGEUNLOCK)) { 8453 /* 8454 * We are adjusting the pagelock region to the large page size 8455 * boundary because the unlocked part of a large page cannot 8456 * be freed anyway unless all constituent pages of a large 8457 * page are locked. Therefore this adjustment allows us to 8458 * decrement availrmem by the right value (note we don't want 8459 * to just decrement availrem by the large page size without 8460 * adjusting addr and len because then we may end up 8461 * decrementing availrmem by large page size for every 8462 * constituent page locked by a new as_pagelock call). 8463 * as_pageunlock caller must always match as_pagelock call's 8464 * addr and len. 8465 * 8466 * Note segment's page size cannot change while we are holding 8467 * as lock. And then it cannot change while softlockcnt is 8468 * not 0. This will allow us to correctly recalculate large 8469 * page size region for the matching pageunlock/reclaim call. 8470 * 8471 * for pageunlock *ppp points to the pointer of page_t that 8472 * corresponds to the real unadjusted start address. Similar 8473 * for pagelock *ppp must point to the pointer of page_t that 8474 * corresponds to the real unadjusted start address. 8475 */ 8476 size_t pgsz = page_get_pagesize(seg->s_szc); 8477 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 8478 adjustpages = ((uintptr_t)(addr - lpgaddr)) >> PAGESHIFT; 8479 } 8480 8481 if (type == L_PAGEUNLOCK) { 8482 8483 /* 8484 * update hat ref bits for /proc. We need to make sure 8485 * that threads tracing the ref and mod bits of the 8486 * address space get the right data. 8487 * Note: page ref and mod bits are updated at reclaim time 8488 */ 8489 if (seg->s_as->a_vbits) { 8490 for (a = addr; a < addr + len; a += PAGESIZE) { 8491 if (rw == S_WRITE) { 8492 hat_setstat(seg->s_as, a, 8493 PAGESIZE, P_REF | P_MOD); 8494 } else { 8495 hat_setstat(seg->s_as, a, 8496 PAGESIZE, P_REF); 8497 } 8498 } 8499 } 8500 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 8501 if (seg->s_szc != 0) { 8502 VM_STAT_ADD(segvnvmstats.pagelock[0]); 8503 seg_pinactive(seg, lpgaddr, lpgeaddr - lpgaddr, 8504 *ppp - adjustpages, rw, segvn_reclaim); 8505 } else { 8506 seg_pinactive(seg, addr, len, *ppp, rw, segvn_reclaim); 8507 } 8508 8509 /* 8510 * If someone is blocked while unmapping, we purge 8511 * segment page cache and thus reclaim pplist synchronously 8512 * without waiting for seg_pasync_thread. This speeds up 8513 * unmapping in cases where munmap(2) is called, while 8514 * raw async i/o is still in progress or where a thread 8515 * exits on data fault in a multithreaded application. 
8516 */ 8517 if (AS_ISUNMAPWAIT(seg->s_as) && (svd->softlockcnt > 0)) { 8518 /* 8519 * Even if we grab segvn WRITER's lock or segp_slock 8520 * here, there might be another thread which could've 8521 * successfully performed lookup/insert just before 8522 * we acquired the lock here. So, grabbing either 8523 * lock here is of not much use. Until we devise 8524 * a strategy at upper layers to solve the 8525 * synchronization issues completely, we expect 8526 * applications to handle this appropriately. 8527 */ 8528 segvn_purge(seg); 8529 } 8530 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8531 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END, 8532 "segvn_pagelock: unlock seg %p addr %p", seg, addr); 8533 return (0); 8534 } else if (type == L_PAGERECLAIM) { 8535 VM_STAT_COND_ADD(seg->s_szc != 0, segvnvmstats.pagelock[1]); 8536 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 8537 (void) segvn_reclaim(seg, addr, len, *ppp, rw); 8538 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8539 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END, 8540 "segvn_pagelock: reclaim seg %p addr %p", seg, addr); 8541 return (0); 8542 } 8543 8544 if (seg->s_szc != 0) { 8545 VM_STAT_ADD(segvnvmstats.pagelock[2]); 8546 addr = lpgaddr; 8547 len = lpgeaddr - lpgaddr; 8548 npages = (len >> PAGESHIFT); 8549 } 8550 8551 /* 8552 * for now we only support pagelock to anon memory. We've to check 8553 * protections for vnode objects and call into the vnode driver. 8554 * That's too much for a fast path. Let the fault entry point handle it. 8555 */ 8556 if (svd->vp != NULL) { 8557 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, 8558 "segvn_pagelock: mapped vnode seg %p addr %p", seg, addr); 8559 *ppp = NULL; 8560 return (ENOTSUP); 8561 } 8562 8563 /* 8564 * if anonmap is not yet created, let the fault entry point populate it 8565 * with anon ptrs. 8566 */ 8567 if ((amp = svd->amp) == NULL) { 8568 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, 8569 "segvn_pagelock: anonmap null seg %p addr %p", seg, addr); 8570 *ppp = NULL; 8571 return (EFAULT); 8572 } 8573 8574 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 8575 8576 /* 8577 * we acquire segp_slock to prevent duplicate entries 8578 * in seg_pcache 8579 */ 8580 mutex_enter(&svd->segp_slock); 8581 8582 /* 8583 * try to find pages in segment page cache 8584 */ 8585 pplist = seg_plookup(seg, addr, len, rw); 8586 if (pplist != NULL) { 8587 mutex_exit(&svd->segp_slock); 8588 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8589 *ppp = pplist + adjustpages; 8590 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_HIT_END, 8591 "segvn_pagelock: cache hit seg %p addr %p", seg, addr); 8592 return (0); 8593 } 8594 8595 if (rw == S_READ) { 8596 protchk = PROT_READ; 8597 } else { 8598 protchk = PROT_WRITE; 8599 } 8600 8601 if (svd->pageprot == 0) { 8602 if ((svd->prot & protchk) == 0) { 8603 mutex_exit(&svd->segp_slock); 8604 error = EFAULT; 8605 goto out; 8606 } 8607 } else { 8608 /* 8609 * check page protections 8610 */ 8611 for (a = addr; a < addr + len; a += PAGESIZE) { 8612 struct vpage *vp; 8613 8614 vp = &svd->vpage[seg_page(seg, a)]; 8615 if ((VPP_PROT(vp) & protchk) == 0) { 8616 mutex_exit(&svd->segp_slock); 8617 error = EFAULT; 8618 goto out; 8619 } 8620 } 8621 } 8622 8623 /* 8624 * Avoid per page overhead of segvn_slock_anonpages() for small 8625 * pages. For large pages segvn_slock_anonpages() only does real 8626 * work once per large page. The tradeoff is that we may decrement 8627 * availrmem more than once for the same page but this is ok 8628 * for small pages. 
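 * (For szc == 0 segments availrmem is charged for the whole request up
 * front below; pages that nevertheless go through segvn_slock_anonpages()
 * have their share credited back once the lookup loop completes.)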
8629 */
8630 if (seg->s_szc == 0) {
8631 mutex_enter(&freemem_lock);
8632 if (availrmem < tune.t_minarmem + npages) {
8633 mutex_exit(&freemem_lock);
8634 mutex_exit(&svd->segp_slock);
8635 error = ENOMEM;
8636 goto out;
8637 }
8638 availrmem -= npages;
8639 mutex_exit(&freemem_lock);
8640 }
8641 
8642 pplist = kmem_alloc(sizeof (page_t *) * npages, KM_SLEEP);
8643 pl = pplist;
8644 *ppp = pplist + adjustpages;
8645 
8646 page = seg_page(seg, addr);
8647 anon_index = svd->anon_index + page;
8648 
8649 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
8650 for (a = addr; a < addr + len; a += PAGESIZE, anon_index++) {
8651 struct anon *ap;
8652 struct vnode *vp;
8653 u_offset_t off;
8654 anon_sync_obj_t cookie;
8655 
8656 anon_array_enter(amp, anon_index, &cookie);
8657 ap = anon_get_ptr(amp->ahp, anon_index);
8658 if (ap == NULL) {
8659 anon_array_exit(&cookie);
8660 break;
8661 } else {
8662 /*
8663 * We must never use seg_pcache for COW pages
8664 * because we might end up with original page still
8665 * lying in seg_pcache even after private page is
8666 * created. This leads to data corruption as
8667 * aio_write refers to the page still in cache
8668 * while all other accesses refer to the private
8669 * page.
8670 */
8671 if (ap->an_refcnt != 1) {
8672 anon_array_exit(&cookie);
8673 break;
8674 }
8675 }
8676 swap_xlate(ap, &vp, &off);
8677 anon_array_exit(&cookie);
8678 
8679 pp = page_lookup_nowait(vp, off, SE_SHARED);
8680 if (pp == NULL) {
8681 break;
8682 }
8683 if (seg->s_szc != 0 || pp->p_szc != 0) {
8684 if (!segvn_slock_anonpages(pp, a == addr)) {
8685 page_unlock(pp);
8686 break;
8687 }
8688 } else {
8689 szc0_npages++;
8690 }
8691 *pplist++ = pp;
8692 }
8693 ANON_LOCK_EXIT(&amp->a_rwlock);
8694 
8695 ASSERT(npages >= szc0_npages);
8696 
8697 if (a >= addr + len) {
8698 mutex_enter(&freemem_lock);
8699 if (seg->s_szc == 0 && npages != szc0_npages) {
8700 ASSERT(svd->type == MAP_SHARED && amp->a_szc > 0);
8701 availrmem += (npages - szc0_npages);
8702 }
8703 svd->softlockcnt += npages;
8704 segvn_pages_locked += npages;
8705 mutex_exit(&freemem_lock);
8706 (void) seg_pinsert(seg, addr, len, pl, rw, SEGP_ASYNC_FLUSH,
8707 segvn_reclaim);
8708 mutex_exit(&svd->segp_slock);
8709 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
8710 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_FILL_END,
8711 "segvn_pagelock: cache fill seg %p addr %p", seg, addr);
8712 return (0);
8713 }
8714 
8715 mutex_exit(&svd->segp_slock);
8716 if (seg->s_szc == 0) {
8717 mutex_enter(&freemem_lock);
8718 availrmem += npages;
8719 mutex_exit(&freemem_lock);
8720 }
8721 error = EFAULT;
8722 pplist = pl;
8723 np = ((uintptr_t)(a - addr)) >> PAGESHIFT;
8724 while (np > (uint_t)0) {
8725 ASSERT(PAGE_LOCKED(*pplist));
8726 if (seg->s_szc != 0 || (*pplist)->p_szc != 0) {
8727 segvn_sunlock_anonpages(*pplist, pplist == pl);
8728 }
8729 page_unlock(*pplist);
8730 np--;
8731 pplist++;
8732 }
8733 kmem_free(pl, sizeof (page_t *) * npages);
8734 out:
8735 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
8736 *ppp = NULL;
8737 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END,
8738 "segvn_pagelock: cache miss seg %p addr %p", seg, addr);
8739 return (error);
8740 }
8741 
8742 /*
8743 * purge any cached pages in the I/O page cache
8744 */
8745 static void
8746 segvn_purge(struct seg *seg)
8747 {
8748 seg_ppurge(seg);
8749 }
8750 
8751 static int
8752 segvn_reclaim(struct seg *seg, caddr_t addr, size_t len, struct page **pplist,
8753 enum seg_rw rw)
8754 {
8755 struct segvn_data *svd = (struct segvn_data *)seg->s_data;
8756 pgcnt_t np, npages;
8757 struct page **pl;
8758 
pgcnt_t szc0_npages = 0;
8759 
8760 #ifdef lint
8761 addr = addr;
8762 #endif
8763 
8764 npages = np = (len >> PAGESHIFT);
8765 ASSERT(npages);
8766 pl = pplist;
8767 if (seg->s_szc != 0) {
8768 size_t pgsz = page_get_pagesize(seg->s_szc);
8769 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) {
8770 panic("segvn_reclaim: unaligned addr or len");
8771 /*NOTREACHED*/
8772 }
8773 }
8774 
8775 ASSERT(svd->vp == NULL && svd->amp != NULL);
8776 
8777 while (np > (uint_t)0) {
8778 if (rw == S_WRITE) {
8779 hat_setrefmod(*pplist);
8780 } else {
8781 hat_setref(*pplist);
8782 }
8783 if (seg->s_szc != 0 || (*pplist)->p_szc != 0) {
8784 segvn_sunlock_anonpages(*pplist, pplist == pl);
8785 } else {
8786 szc0_npages++;
8787 }
8788 page_unlock(*pplist);
8789 np--;
8790 pplist++;
8791 }
8792 kmem_free(pl, sizeof (page_t *) * npages);
8793 
8794 mutex_enter(&freemem_lock);
8795 segvn_pages_locked -= npages;
8796 svd->softlockcnt -= npages;
8797 if (szc0_npages != 0) {
8798 availrmem += szc0_npages;
8799 }
8800 mutex_exit(&freemem_lock);
8801 if (svd->softlockcnt <= 0) {
8802 if (AS_ISUNMAPWAIT(seg->s_as)) {
8803 mutex_enter(&seg->s_as->a_contents);
8804 if (AS_ISUNMAPWAIT(seg->s_as)) {
8805 AS_CLRUNMAPWAIT(seg->s_as);
8806 cv_broadcast(&seg->s_as->a_cv);
8807 }
8808 mutex_exit(&seg->s_as->a_contents);
8809 }
8810 }
8811 return (0);
8812 }
8813 /*
8814 * get a memory ID for an addr in a given segment
8815 *
8816 * XXX only creates PAGESIZE pages if anon slots are not initialized.
8817 * At fault time they will be relocated into larger pages.
8818 */
8819 static int
8820 segvn_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
8821 {
8822 struct segvn_data *svd = (struct segvn_data *)seg->s_data;
8823 struct anon *ap = NULL;
8824 ulong_t anon_index;
8825 struct anon_map *amp;
8826 anon_sync_obj_t cookie;
8827 
8828 if (svd->type == MAP_PRIVATE) {
8829 memidp->val[0] = (uintptr_t)seg->s_as;
8830 memidp->val[1] = (uintptr_t)addr;
8831 return (0);
8832 }
8833 
8834 if (svd->type == MAP_SHARED) {
8835 if (svd->vp) {
8836 memidp->val[0] = (uintptr_t)svd->vp;
8837 memidp->val[1] = (u_longlong_t)svd->offset +
8838 (uintptr_t)(addr - seg->s_base);
8839 return (0);
8840 } else {
8841 
8842 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
8843 if ((amp = svd->amp) != NULL) {
8844 anon_index = svd->anon_index +
8845 seg_page(seg, addr);
8846 }
8847 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
8848 
8849 ASSERT(amp != NULL);
8850 
8851 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
8852 anon_array_enter(amp, anon_index, &cookie);
8853 ap = anon_get_ptr(amp->ahp, anon_index);
8854 if (ap == NULL) {
8855 page_t *pp;
8856 
8857 pp = anon_zero(seg, addr, &ap, svd->cred);
8858 if (pp == NULL) {
8859 anon_array_exit(&cookie);
8860 ANON_LOCK_EXIT(&amp->a_rwlock);
8861 return (ENOMEM);
8862 }
8863 ASSERT(anon_get_ptr(amp->ahp, anon_index)
8864 == NULL);
8865 (void) anon_set_ptr(amp->ahp, anon_index,
8866 ap, ANON_SLEEP);
8867 page_unlock(pp);
8868 }
8869 
8870 anon_array_exit(&cookie);
8871 ANON_LOCK_EXIT(&amp->a_rwlock);
8872 
8873 memidp->val[0] = (uintptr_t)ap;
8874 memidp->val[1] = (uintptr_t)addr & PAGEOFFSET;
8875 return (0);
8876 }
8877 }
8878 return (EINVAL);
8879 }
8880 
8881 static int
8882 sameprot(struct seg *seg, caddr_t a, size_t len)
8883 {
8884 struct segvn_data *svd = (struct segvn_data *)seg->s_data;
8885 struct vpage *vpage;
8886 spgcnt_t pages = btop(len);
8887 uint_t prot;
8888 
8889 if (svd->pageprot == 0)
8890 return (1);
8891 
8892 ASSERT(svd->vpage != NULL);
8893 
8894 vpage = &svd->vpage[seg_page(seg, a)];
8895 prot
= VPP_PROT(vpage); 8896 vpage++; 8897 pages--; 8898 while (pages-- > 0) { 8899 if (prot != VPP_PROT(vpage)) 8900 return (0); 8901 vpage++; 8902 } 8903 return (1); 8904 } 8905 8906 /* 8907 * Get memory allocation policy info for specified address in given segment 8908 */ 8909 static lgrp_mem_policy_info_t * 8910 segvn_getpolicy(struct seg *seg, caddr_t addr) 8911 { 8912 struct anon_map *amp; 8913 ulong_t anon_index; 8914 lgrp_mem_policy_info_t *policy_info; 8915 struct segvn_data *svn_data; 8916 u_offset_t vn_off; 8917 vnode_t *vp; 8918 8919 ASSERT(seg != NULL); 8920 8921 svn_data = (struct segvn_data *)seg->s_data; 8922 if (svn_data == NULL) 8923 return (NULL); 8924 8925 /* 8926 * Get policy info for private or shared memory 8927 */ 8928 if (svn_data->type != MAP_SHARED) { 8929 if (svn_data->tr_state != SEGVN_TR_ON) { 8930 policy_info = &svn_data->policy_info; 8931 } else { 8932 policy_info = &svn_data->tr_policy_info; 8933 ASSERT(policy_info->mem_policy == 8934 LGRP_MEM_POLICY_NEXT_SEG); 8935 } 8936 } else { 8937 amp = svn_data->amp; 8938 anon_index = svn_data->anon_index + seg_page(seg, addr); 8939 vp = svn_data->vp; 8940 vn_off = svn_data->offset + (uintptr_t)(addr - seg->s_base); 8941 policy_info = lgrp_shm_policy_get(amp, anon_index, vp, vn_off); 8942 } 8943 8944 return (policy_info); 8945 } 8946 8947 /*ARGSUSED*/ 8948 static int 8949 segvn_capable(struct seg *seg, segcapability_t capability) 8950 { 8951 return (0); 8952 } 8953 8954 /* 8955 * Bind text vnode segment to an amp. If we bind successfully mappings will be 8956 * established to per vnode mapping per lgroup amp pages instead of to vnode 8957 * pages. There's one amp per vnode text mapping per lgroup. Many processes 8958 * may share the same text replication amp. If a suitable amp doesn't already 8959 * exist in svntr hash table create a new one. We may fail to bind to amp if 8960 * segment is not eligible for text replication. Code below first checks for 8961 * these conditions. If binding is successful segment tr_state is set to on 8962 * and svd->amp points to the amp to use. Otherwise tr_state is set to off and 8963 * svd->amp remains as NULL. 8964 */ 8965 static void 8966 segvn_textrepl(struct seg *seg) 8967 { 8968 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 8969 vnode_t *vp = svd->vp; 8970 u_offset_t off = svd->offset; 8971 size_t size = seg->s_size; 8972 u_offset_t eoff = off + size; 8973 uint_t szc = seg->s_szc; 8974 ulong_t hash = SVNTR_HASH_FUNC(vp); 8975 svntr_t *svntrp; 8976 struct vattr va; 8977 proc_t *p = seg->s_as->a_proc; 8978 lgrp_id_t lgrp_id; 8979 lgrp_id_t olid; 8980 int first; 8981 struct anon_map *amp; 8982 8983 ASSERT(AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 8984 ASSERT(SEGVN_WRITE_HELD(seg->s_as, &svd->lock)); 8985 ASSERT(p != NULL); 8986 ASSERT(svd->tr_state == SEGVN_TR_INIT); 8987 ASSERT(!HAT_IS_REGION_COOKIE_VALID(svd->rcookie)); 8988 ASSERT(svd->flags & MAP_TEXT); 8989 ASSERT(svd->type == MAP_PRIVATE); 8990 ASSERT(vp != NULL && svd->amp == NULL); 8991 ASSERT(!svd->pageprot && !(svd->prot & PROT_WRITE)); 8992 ASSERT(!(svd->flags & MAP_NORESERVE) && svd->swresv == 0); 8993 ASSERT(seg->s_as != &kas); 8994 ASSERT(off < eoff); 8995 ASSERT(svntr_hashtab != NULL); 8996 8997 /* 8998 * If numa optimizations are no longer desired bail out. 8999 */ 9000 if (!lgrp_optimizations()) { 9001 svd->tr_state = SEGVN_TR_OFF; 9002 return; 9003 } 9004 9005 /* 9006 * Avoid creating anon maps with size bigger than the file size. 9007 * If VOP_GETATTR() call fails bail out. 
9008 */ 9009 va.va_mask = AT_SIZE | AT_MTIME | AT_CTIME; 9010 if (VOP_GETATTR(vp, &va, 0, svd->cred, NULL) != 0) { 9011 svd->tr_state = SEGVN_TR_OFF; 9012 SEGVN_TR_ADDSTAT(gaerr); 9013 return; 9014 } 9015 if (btopr(va.va_size) < btopr(eoff)) { 9016 svd->tr_state = SEGVN_TR_OFF; 9017 SEGVN_TR_ADDSTAT(overmap); 9018 return; 9019 } 9020 9021 /* 9022 * VVMEXEC may not be set yet if exec() prefaults text segment. Set 9023 * this flag now before vn_is_mapped(V_WRITE) so that MAP_SHARED 9024 * mapping that checks if trcache for this vnode needs to be 9025 * invalidated can't miss us. 9026 */ 9027 if (!(vp->v_flag & VVMEXEC)) { 9028 mutex_enter(&vp->v_lock); 9029 vp->v_flag |= VVMEXEC; 9030 mutex_exit(&vp->v_lock); 9031 } 9032 mutex_enter(&svntr_hashtab[hash].tr_lock); 9033 /* 9034 * Bail out if potentially MAP_SHARED writable mappings exist to this 9035 * vnode. We don't want to use old file contents from existing 9036 * replicas if this mapping was established after the original file 9037 * was changed. 9038 */ 9039 if (vn_is_mapped(vp, V_WRITE)) { 9040 mutex_exit(&svntr_hashtab[hash].tr_lock); 9041 svd->tr_state = SEGVN_TR_OFF; 9042 SEGVN_TR_ADDSTAT(wrcnt); 9043 return; 9044 } 9045 svntrp = svntr_hashtab[hash].tr_head; 9046 for (; svntrp != NULL; svntrp = svntrp->tr_next) { 9047 ASSERT(svntrp->tr_refcnt != 0); 9048 if (svntrp->tr_vp != vp) { 9049 continue; 9050 } 9051 9052 /* 9053 * Bail out if the file or its attributes were changed after 9054 * this replication entry was created since we need to use the 9055 * latest file contents. Note that mtime test alone is not 9056 * sufficient because a user can explicitly change mtime via 9057 * utimes(2) interfaces back to the old value after modifiying 9058 * the file contents. To detect this case we also have to test 9059 * ctime which among other things records the time of the last 9060 * mtime change by utimes(2). ctime is not changed when the file 9061 * is only read or executed so we expect that typically existing 9062 * replication amp's can be used most of the time. 9063 */ 9064 if (!svntrp->tr_valid || 9065 svntrp->tr_mtime.tv_sec != va.va_mtime.tv_sec || 9066 svntrp->tr_mtime.tv_nsec != va.va_mtime.tv_nsec || 9067 svntrp->tr_ctime.tv_sec != va.va_ctime.tv_sec || 9068 svntrp->tr_ctime.tv_nsec != va.va_ctime.tv_nsec) { 9069 mutex_exit(&svntr_hashtab[hash].tr_lock); 9070 svd->tr_state = SEGVN_TR_OFF; 9071 SEGVN_TR_ADDSTAT(stale); 9072 return; 9073 } 9074 /* 9075 * if off, eoff and szc match current segment we found the 9076 * existing entry we can use. 9077 */ 9078 if (svntrp->tr_off == off && svntrp->tr_eoff == eoff && 9079 svntrp->tr_szc == szc) { 9080 break; 9081 } 9082 /* 9083 * Don't create different but overlapping in file offsets 9084 * entries to avoid replication of the same file pages more 9085 * than once per lgroup. 9086 */ 9087 if ((off >= svntrp->tr_off && off < svntrp->tr_eoff) || 9088 (eoff > svntrp->tr_off && eoff <= svntrp->tr_eoff)) { 9089 mutex_exit(&svntr_hashtab[hash].tr_lock); 9090 svd->tr_state = SEGVN_TR_OFF; 9091 SEGVN_TR_ADDSTAT(overlap); 9092 return; 9093 } 9094 } 9095 /* 9096 * If we didn't find existing entry create a new one. 
9097 */ 9098 if (svntrp == NULL) { 9099 svntrp = kmem_cache_alloc(svntr_cache, KM_NOSLEEP); 9100 if (svntrp == NULL) { 9101 mutex_exit(&svntr_hashtab[hash].tr_lock); 9102 svd->tr_state = SEGVN_TR_OFF; 9103 SEGVN_TR_ADDSTAT(nokmem); 9104 return; 9105 } 9106 #ifdef DEBUG 9107 { 9108 lgrp_id_t i; 9109 for (i = 0; i < NLGRPS_MAX; i++) { 9110 ASSERT(svntrp->tr_amp[i] == NULL); 9111 } 9112 } 9113 #endif /* DEBUG */ 9114 svntrp->tr_vp = vp; 9115 svntrp->tr_off = off; 9116 svntrp->tr_eoff = eoff; 9117 svntrp->tr_szc = szc; 9118 svntrp->tr_valid = 1; 9119 svntrp->tr_mtime = va.va_mtime; 9120 svntrp->tr_ctime = va.va_ctime; 9121 svntrp->tr_refcnt = 0; 9122 svntrp->tr_next = svntr_hashtab[hash].tr_head; 9123 svntr_hashtab[hash].tr_head = svntrp; 9124 } 9125 first = 1; 9126 again: 9127 /* 9128 * We want to pick a replica with pages on main thread's (t_tid = 1, 9129 * aka T1) lgrp. Currently text replication is only optimized for 9130 * workloads that either have all threads of a process on the same 9131 * lgrp or execute their large text primarily on main thread. 9132 */ 9133 lgrp_id = p->p_t1_lgrpid; 9134 if (lgrp_id == LGRP_NONE) { 9135 /* 9136 * In case exec() prefaults text on non main thread use 9137 * current thread lgrpid. It will become main thread anyway 9138 * soon. 9139 */ 9140 lgrp_id = lgrp_home_id(curthread); 9141 } 9142 /* 9143 * Set p_tr_lgrpid to lgrpid if it hasn't been set yet. Otherwise 9144 * just set it to NLGRPS_MAX if it's different from current process T1 9145 * home lgrp. p_tr_lgrpid is used to detect if process uses text 9146 * replication and T1 new home is different from lgrp used for text 9147 * replication. When this happens asyncronous segvn thread rechecks if 9148 * segments should change lgrps used for text replication. If we fail 9149 * to set p_tr_lgrpid with cas32 then set it to NLGRPS_MAX without cas 9150 * if it's not already NLGRPS_MAX and not equal lgrp_id we want to 9151 * use. We don't need to use cas in this case because another thread 9152 * that races in between our non atomic check and set may only change 9153 * p_tr_lgrpid to NLGRPS_MAX at this point. 9154 */ 9155 ASSERT(lgrp_id != LGRP_NONE && lgrp_id < NLGRPS_MAX); 9156 olid = p->p_tr_lgrpid; 9157 if (lgrp_id != olid && olid != NLGRPS_MAX) { 9158 lgrp_id_t nlid = (olid == LGRP_NONE) ? lgrp_id : NLGRPS_MAX; 9159 if (cas32((uint32_t *)&p->p_tr_lgrpid, olid, nlid) != olid) { 9160 olid = p->p_tr_lgrpid; 9161 ASSERT(olid != LGRP_NONE); 9162 if (olid != lgrp_id && olid != NLGRPS_MAX) { 9163 p->p_tr_lgrpid = NLGRPS_MAX; 9164 } 9165 } 9166 ASSERT(p->p_tr_lgrpid != LGRP_NONE); 9167 membar_producer(); 9168 /* 9169 * lgrp_move_thread() won't schedule async recheck after 9170 * p->p_t1_lgrpid update unless p->p_tr_lgrpid is not 9171 * LGRP_NONE. Recheck p_t1_lgrpid once now that p->p_tr_lgrpid 9172 * is not LGRP_NONE. 9173 */ 9174 if (first && p->p_t1_lgrpid != LGRP_NONE && 9175 p->p_t1_lgrpid != lgrp_id) { 9176 first = 0; 9177 goto again; 9178 } 9179 } 9180 /* 9181 * If no amp was created yet for lgrp_id create a new one as long as 9182 * we have enough memory to afford it. 
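 * (segvn_textrepl_bytes is charged for the prospective amp before the
 * check against segvn_textrepl_max_bytes and is credited back on every
 * failure path below.)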
9183 */ 9184 if ((amp = svntrp->tr_amp[lgrp_id]) == NULL) { 9185 size_t trmem = atomic_add_long_nv(&segvn_textrepl_bytes, size); 9186 if (trmem > segvn_textrepl_max_bytes) { 9187 SEGVN_TR_ADDSTAT(normem); 9188 goto fail; 9189 } 9190 if (anon_try_resv_zone(size, NULL) == 0) { 9191 SEGVN_TR_ADDSTAT(noanon); 9192 goto fail; 9193 } 9194 amp = anonmap_alloc(size, size, ANON_NOSLEEP); 9195 if (amp == NULL) { 9196 anon_unresv_zone(size, NULL); 9197 SEGVN_TR_ADDSTAT(nokmem); 9198 goto fail; 9199 } 9200 ASSERT(amp->refcnt == 1); 9201 amp->a_szc = szc; 9202 svntrp->tr_amp[lgrp_id] = amp; 9203 SEGVN_TR_ADDSTAT(newamp); 9204 } 9205 svntrp->tr_refcnt++; 9206 ASSERT(svd->svn_trnext == NULL); 9207 ASSERT(svd->svn_trprev == NULL); 9208 svd->svn_trnext = svntrp->tr_svnhead; 9209 svd->svn_trprev = NULL; 9210 if (svntrp->tr_svnhead != NULL) { 9211 svntrp->tr_svnhead->svn_trprev = svd; 9212 } 9213 svntrp->tr_svnhead = svd; 9214 ASSERT(amp->a_szc == szc && amp->size == size && amp->swresv == size); 9215 ASSERT(amp->refcnt >= 1); 9216 svd->amp = amp; 9217 svd->anon_index = 0; 9218 svd->tr_policy_info.mem_policy = LGRP_MEM_POLICY_NEXT_SEG; 9219 svd->tr_policy_info.mem_lgrpid = lgrp_id; 9220 svd->tr_state = SEGVN_TR_ON; 9221 mutex_exit(&svntr_hashtab[hash].tr_lock); 9222 SEGVN_TR_ADDSTAT(repl); 9223 return; 9224 fail: 9225 ASSERT(segvn_textrepl_bytes >= size); 9226 atomic_add_long(&segvn_textrepl_bytes, -size); 9227 ASSERT(svntrp != NULL); 9228 ASSERT(svntrp->tr_amp[lgrp_id] == NULL); 9229 if (svntrp->tr_refcnt == 0) { 9230 ASSERT(svntrp == svntr_hashtab[hash].tr_head); 9231 svntr_hashtab[hash].tr_head = svntrp->tr_next; 9232 mutex_exit(&svntr_hashtab[hash].tr_lock); 9233 kmem_cache_free(svntr_cache, svntrp); 9234 } else { 9235 mutex_exit(&svntr_hashtab[hash].tr_lock); 9236 } 9237 svd->tr_state = SEGVN_TR_OFF; 9238 } 9239 9240 /* 9241 * Convert seg back to regular vnode mapping seg by unbinding it from its text 9242 * replication amp. This routine is most typically called when segment is 9243 * unmapped but can also be called when segment no longer qualifies for text 9244 * replication (e.g. due to protection changes). If unload_unmap is set use 9245 * HAT_UNLOAD_UNMAP flag in hat_unload_callback(). If we are the last user of 9246 * svntr free all its anon maps and remove it from the hash table. 
9247 */ 9248 static void 9249 segvn_textunrepl(struct seg *seg, int unload_unmap) 9250 { 9251 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 9252 vnode_t *vp = svd->vp; 9253 u_offset_t off = svd->offset; 9254 size_t size = seg->s_size; 9255 u_offset_t eoff = off + size; 9256 uint_t szc = seg->s_szc; 9257 ulong_t hash = SVNTR_HASH_FUNC(vp); 9258 svntr_t *svntrp; 9259 svntr_t **prv_svntrp; 9260 lgrp_id_t lgrp_id = svd->tr_policy_info.mem_lgrpid; 9261 lgrp_id_t i; 9262 9263 ASSERT(AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 9264 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) || 9265 SEGVN_WRITE_HELD(seg->s_as, &svd->lock)); 9266 ASSERT(svd->tr_state == SEGVN_TR_ON); 9267 ASSERT(!HAT_IS_REGION_COOKIE_VALID(svd->rcookie)); 9268 ASSERT(svd->amp != NULL); 9269 ASSERT(svd->amp->refcnt >= 1); 9270 ASSERT(svd->anon_index == 0); 9271 ASSERT(lgrp_id != LGRP_NONE && lgrp_id < NLGRPS_MAX); 9272 ASSERT(svntr_hashtab != NULL); 9273 9274 mutex_enter(&svntr_hashtab[hash].tr_lock); 9275 prv_svntrp = &svntr_hashtab[hash].tr_head; 9276 for (; (svntrp = *prv_svntrp) != NULL; prv_svntrp = &svntrp->tr_next) { 9277 ASSERT(svntrp->tr_refcnt != 0); 9278 if (svntrp->tr_vp == vp && svntrp->tr_off == off && 9279 svntrp->tr_eoff == eoff && svntrp->tr_szc == szc) { 9280 break; 9281 } 9282 } 9283 if (svntrp == NULL) { 9284 panic("segvn_textunrepl: svntr record not found"); 9285 } 9286 if (svntrp->tr_amp[lgrp_id] != svd->amp) { 9287 panic("segvn_textunrepl: amp mismatch"); 9288 } 9289 svd->tr_state = SEGVN_TR_OFF; 9290 svd->amp = NULL; 9291 if (svd->svn_trprev == NULL) { 9292 ASSERT(svntrp->tr_svnhead == svd); 9293 svntrp->tr_svnhead = svd->svn_trnext; 9294 if (svntrp->tr_svnhead != NULL) { 9295 svntrp->tr_svnhead->svn_trprev = NULL; 9296 } 9297 svd->svn_trnext = NULL; 9298 } else { 9299 svd->svn_trprev->svn_trnext = svd->svn_trnext; 9300 if (svd->svn_trnext != NULL) { 9301 svd->svn_trnext->svn_trprev = svd->svn_trprev; 9302 svd->svn_trnext = NULL; 9303 } 9304 svd->svn_trprev = NULL; 9305 } 9306 if (--svntrp->tr_refcnt) { 9307 mutex_exit(&svntr_hashtab[hash].tr_lock); 9308 goto done; 9309 } 9310 *prv_svntrp = svntrp->tr_next; 9311 mutex_exit(&svntr_hashtab[hash].tr_lock); 9312 for (i = 0; i < NLGRPS_MAX; i++) { 9313 struct anon_map *amp = svntrp->tr_amp[i]; 9314 if (amp == NULL) { 9315 continue; 9316 } 9317 ASSERT(amp->refcnt == 1); 9318 ASSERT(amp->swresv == size); 9319 ASSERT(amp->size == size); 9320 ASSERT(amp->a_szc == szc); 9321 if (amp->a_szc != 0) { 9322 anon_free_pages(amp->ahp, 0, size, szc); 9323 } else { 9324 anon_free(amp->ahp, 0, size); 9325 } 9326 svntrp->tr_amp[i] = NULL; 9327 ASSERT(segvn_textrepl_bytes >= size); 9328 atomic_add_long(&segvn_textrepl_bytes, -size); 9329 anon_unresv_zone(amp->swresv, NULL); 9330 amp->refcnt = 0; 9331 anonmap_free(amp); 9332 } 9333 kmem_cache_free(svntr_cache, svntrp); 9334 done: 9335 hat_unload_callback(seg->s_as->a_hat, seg->s_base, size, 9336 unload_unmap ? HAT_UNLOAD_UNMAP : 0, NULL); 9337 } 9338 9339 /* 9340 * This is called when a MAP_SHARED writable mapping is created to a vnode 9341 * that is currently used for execution (VVMEXEC flag is set). In this case we 9342 * need to prevent further use of existing replicas. 
9343 */ 9344 static void 9345 segvn_inval_trcache(vnode_t *vp) 9346 { 9347 ulong_t hash = SVNTR_HASH_FUNC(vp); 9348 svntr_t *svntrp; 9349 9350 ASSERT(vp->v_flag & VVMEXEC); 9351 9352 if (svntr_hashtab == NULL) { 9353 return; 9354 } 9355 9356 mutex_enter(&svntr_hashtab[hash].tr_lock); 9357 svntrp = svntr_hashtab[hash].tr_head; 9358 for (; svntrp != NULL; svntrp = svntrp->tr_next) { 9359 ASSERT(svntrp->tr_refcnt != 0); 9360 if (svntrp->tr_vp == vp && svntrp->tr_valid) { 9361 svntrp->tr_valid = 0; 9362 } 9363 } 9364 mutex_exit(&svntr_hashtab[hash].tr_lock); 9365 } 9366 9367 static void 9368 segvn_trasync_thread(void) 9369 { 9370 callb_cpr_t cpr_info; 9371 kmutex_t cpr_lock; /* just for CPR stuff */ 9372 9373 mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL); 9374 9375 CALLB_CPR_INIT(&cpr_info, &cpr_lock, 9376 callb_generic_cpr, "segvn_async"); 9377 9378 if (segvn_update_textrepl_interval == 0) { 9379 segvn_update_textrepl_interval = segvn_update_tr_time * hz; 9380 } else { 9381 segvn_update_textrepl_interval *= hz; 9382 } 9383 (void) timeout(segvn_trupdate_wakeup, NULL, 9384 segvn_update_textrepl_interval); 9385 9386 for (;;) { 9387 mutex_enter(&cpr_lock); 9388 CALLB_CPR_SAFE_BEGIN(&cpr_info); 9389 mutex_exit(&cpr_lock); 9390 sema_p(&segvn_trasync_sem); 9391 mutex_enter(&cpr_lock); 9392 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock); 9393 mutex_exit(&cpr_lock); 9394 segvn_trupdate(); 9395 } 9396 } 9397 9398 static uint64_t segvn_lgrp_trthr_migrs_snpsht = 0; 9399 9400 static void 9401 segvn_trupdate_wakeup(void *dummy) 9402 { 9403 uint64_t cur_lgrp_trthr_migrs = lgrp_get_trthr_migrations(); 9404 9405 if (cur_lgrp_trthr_migrs != segvn_lgrp_trthr_migrs_snpsht) { 9406 segvn_lgrp_trthr_migrs_snpsht = cur_lgrp_trthr_migrs; 9407 sema_v(&segvn_trasync_sem); 9408 } 9409 9410 if (!segvn_disable_textrepl_update && 9411 segvn_update_textrepl_interval != 0) { 9412 (void) timeout(segvn_trupdate_wakeup, dummy, 9413 segvn_update_textrepl_interval); 9414 } 9415 } 9416 9417 static void 9418 segvn_trupdate(void) 9419 { 9420 ulong_t hash; 9421 svntr_t *svntrp; 9422 segvn_data_t *svd; 9423 9424 ASSERT(svntr_hashtab != NULL); 9425 9426 for (hash = 0; hash < svntr_hashtab_sz; hash++) { 9427 mutex_enter(&svntr_hashtab[hash].tr_lock); 9428 svntrp = svntr_hashtab[hash].tr_head; 9429 for (; svntrp != NULL; svntrp = svntrp->tr_next) { 9430 ASSERT(svntrp->tr_refcnt != 0); 9431 svd = svntrp->tr_svnhead; 9432 for (; svd != NULL; svd = svd->svn_trnext) { 9433 segvn_trupdate_seg(svd->seg, svd, svntrp, 9434 hash); 9435 } 9436 } 9437 mutex_exit(&svntr_hashtab[hash].tr_lock); 9438 } 9439 } 9440 9441 static void 9442 segvn_trupdate_seg(struct seg *seg, 9443 segvn_data_t *svd, 9444 svntr_t *svntrp, 9445 ulong_t hash) 9446 { 9447 proc_t *p; 9448 lgrp_id_t lgrp_id; 9449 struct as *as; 9450 size_t size; 9451 struct anon_map *amp; 9452 9453 ASSERT(svd->vp != NULL); 9454 ASSERT(svd->vp == svntrp->tr_vp); 9455 ASSERT(svd->offset == svntrp->tr_off); 9456 ASSERT(svd->offset + seg->s_size == svntrp->tr_eoff); 9457 ASSERT(seg != NULL); 9458 ASSERT(svd->seg == seg); 9459 ASSERT(seg->s_data == (void *)svd); 9460 ASSERT(seg->s_szc == svntrp->tr_szc); 9461 ASSERT(svd->tr_state == SEGVN_TR_ON); 9462 ASSERT(!HAT_IS_REGION_COOKIE_VALID(svd->rcookie)); 9463 ASSERT(svd->amp != NULL); 9464 ASSERT(svd->tr_policy_info.mem_policy == LGRP_MEM_POLICY_NEXT_SEG); 9465 ASSERT(svd->tr_policy_info.mem_lgrpid != LGRP_NONE); 9466 ASSERT(svd->tr_policy_info.mem_lgrpid < NLGRPS_MAX); 9467 ASSERT(svntrp->tr_amp[svd->tr_policy_info.mem_lgrpid] == svd->amp); 9468 
ASSERT(svntrp->tr_refcnt != 0); 9469 ASSERT(mutex_owned(&svntr_hashtab[hash].tr_lock)); 9470 9471 as = seg->s_as; 9472 ASSERT(as != NULL && as != &kas); 9473 p = as->a_proc; 9474 ASSERT(p != NULL); 9475 ASSERT(p->p_tr_lgrpid != LGRP_NONE); 9476 lgrp_id = p->p_t1_lgrpid; 9477 if (lgrp_id == LGRP_NONE) { 9478 return; 9479 } 9480 ASSERT(lgrp_id < NLGRPS_MAX); 9481 if (svd->tr_policy_info.mem_lgrpid == lgrp_id) { 9482 return; 9483 } 9484 9485 /* 9486 * Use tryenter locking since we are locking as/seg and svntr hash 9487 * lock in reverse from syncrounous thread order. 9488 */ 9489 if (!AS_LOCK_TRYENTER(as, &as->a_lock, RW_READER)) { 9490 SEGVN_TR_ADDSTAT(nolock); 9491 if (segvn_lgrp_trthr_migrs_snpsht) { 9492 segvn_lgrp_trthr_migrs_snpsht = 0; 9493 } 9494 return; 9495 } 9496 if (!SEGVN_LOCK_TRYENTER(seg->s_as, &svd->lock, RW_WRITER)) { 9497 AS_LOCK_EXIT(as, &as->a_lock); 9498 SEGVN_TR_ADDSTAT(nolock); 9499 if (segvn_lgrp_trthr_migrs_snpsht) { 9500 segvn_lgrp_trthr_migrs_snpsht = 0; 9501 } 9502 return; 9503 } 9504 size = seg->s_size; 9505 if (svntrp->tr_amp[lgrp_id] == NULL) { 9506 size_t trmem = atomic_add_long_nv(&segvn_textrepl_bytes, size); 9507 if (trmem > segvn_textrepl_max_bytes) { 9508 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 9509 AS_LOCK_EXIT(as, &as->a_lock); 9510 atomic_add_long(&segvn_textrepl_bytes, -size); 9511 SEGVN_TR_ADDSTAT(normem); 9512 return; 9513 } 9514 if (anon_try_resv_zone(size, NULL) == 0) { 9515 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 9516 AS_LOCK_EXIT(as, &as->a_lock); 9517 atomic_add_long(&segvn_textrepl_bytes, -size); 9518 SEGVN_TR_ADDSTAT(noanon); 9519 return; 9520 } 9521 amp = anonmap_alloc(size, size, KM_NOSLEEP); 9522 if (amp == NULL) { 9523 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 9524 AS_LOCK_EXIT(as, &as->a_lock); 9525 atomic_add_long(&segvn_textrepl_bytes, -size); 9526 anon_unresv_zone(size, NULL); 9527 SEGVN_TR_ADDSTAT(nokmem); 9528 return; 9529 } 9530 ASSERT(amp->refcnt == 1); 9531 amp->a_szc = seg->s_szc; 9532 svntrp->tr_amp[lgrp_id] = amp; 9533 } 9534 /* 9535 * We don't need to drop the bucket lock but here we give other 9536 * threads a chance. svntr and svd can't be unlinked as long as 9537 * segment lock is held as a writer and AS held as well. After we 9538 * retake bucket lock we'll continue from where we left. We'll be able 9539 * to reach the end of either list since new entries are always added 9540 * to the beginning of the lists. 9541 */ 9542 mutex_exit(&svntr_hashtab[hash].tr_lock); 9543 hat_unload_callback(as->a_hat, seg->s_base, size, 0, NULL); 9544 mutex_enter(&svntr_hashtab[hash].tr_lock); 9545 9546 ASSERT(svd->tr_state == SEGVN_TR_ON); 9547 ASSERT(svd->amp != NULL); 9548 ASSERT(svd->tr_policy_info.mem_policy == LGRP_MEM_POLICY_NEXT_SEG); 9549 ASSERT(svd->tr_policy_info.mem_lgrpid != lgrp_id); 9550 ASSERT(svd->amp != svntrp->tr_amp[lgrp_id]); 9551 9552 svd->tr_policy_info.mem_lgrpid = lgrp_id; 9553 svd->amp = svntrp->tr_amp[lgrp_id]; 9554 p->p_tr_lgrpid = NLGRPS_MAX; 9555 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 9556 AS_LOCK_EXIT(as, &as->a_lock); 9557 9558 ASSERT(svntrp->tr_refcnt != 0); 9559 ASSERT(svd->vp == svntrp->tr_vp); 9560 ASSERT(svd->tr_policy_info.mem_lgrpid == lgrp_id); 9561 ASSERT(svd->amp != NULL && svd->amp == svntrp->tr_amp[lgrp_id]); 9562 ASSERT(svd->seg == seg); 9563 ASSERT(svd->tr_state == SEGVN_TR_ON); 9564 9565 SEGVN_TR_ADDSTAT(asyncrepl); 9566 } 9567
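
/*
 * Illustrative sketch, not part of the original driver: the advice values
 * handled by segvn_advise() above normally originate at user level from
 * madvise(3C), which on Solaris is built on memcntl(2) and reaches the
 * segment driver through as_ctl(). A minimal (hypothetical) user program
 * applying one of the lgroup access hints to an anonymous mapping could
 * look roughly like this; the 8MB length is arbitrary.
 *
 *	#include <sys/mman.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		size_t len = 8 * 1024 * 1024;
 *		void *addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		    MAP_PRIVATE | MAP_ANON, -1, 0);
 *
 *		if (addr == MAP_FAILED)
 *			return (1);
 *		if (madvise((caddr_t)addr, len, MADV_ACCESS_MANY) != 0)
 *			perror("madvise");
 *		return (0);
 *	}
 */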